* buffer.h (FETCH_MULTIBYTE_CHAR): Define as inline.
[bpt/emacs.git] / src / coding.c
CommitLineData
9542cb1f 1/* Coding system handler (conversion, detection, etc).
acaf905b 2 Copyright (C) 2001-2012 Free Software Foundation, Inc.
7976eda0 3 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
5df4f04c 4 2005, 2006, 2007, 2008, 2009, 2010, 2011
ce03bf76
KH
5 National Institute of Advanced Industrial Science and Technology (AIST)
6 Registration Number H14PRO021
8f924df7 7 Copyright (C) 2003
df7492f9
KH
8 National Institute of Advanced Industrial Science and Technology (AIST)
9 Registration Number H13PRO009
4ed46869 10
369314dc
KH
11This file is part of GNU Emacs.
12
9ec0b715 13GNU Emacs is free software: you can redistribute it and/or modify
369314dc 14it under the terms of the GNU General Public License as published by
9ec0b715
GM
15the Free Software Foundation, either version 3 of the License, or
16(at your option) any later version.
4ed46869 17
369314dc
KH
18GNU Emacs is distributed in the hope that it will be useful,
19but WITHOUT ANY WARRANTY; without even the implied warranty of
20MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21GNU General Public License for more details.
4ed46869 22
369314dc 23You should have received a copy of the GNU General Public License
9ec0b715 24along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
4ed46869
KH
25
26/*** TABLE OF CONTENTS ***
27
b73bfc1c 28 0. General comments
4ed46869 29 1. Preamble
df7492f9
KH
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
4ed46869
KH
41
42*/
43
df7492f9 44/*** 0. General comments ***
b73bfc1c
KH
45
46
df7492f9 47CODING SYSTEM
4ed46869 48
5bad0796
DL
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
e19c3639
KH
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
0ef69138 56 coding system.
4ed46869 57
34809aa6
EZ
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. On
59 the C level, a coding system is represented by a vector of attributes
5bad0796 60 stored in the hash table Vcoding_system_hash_table. The conversion from
e19c3639
KH
61 coding system symbol to attributes vector is done by looking up
62 Vcoding_system_hash_table by the symbol.
4ed46869 63
e19c3639 64 Coding systems are classified into the following types depending on
5bad0796 65 the encoding mechanism. Here's a brief description of the types.
4ed46869 66
df7492f9
KH
67 o UTF-8
68
69 o UTF-16
70
71 o Charset-base coding system
72
73 A coding system defined by one or more (coded) character sets.
5bad0796 74 Decoding and encoding are done by a code converter defined for each
df7492f9
KH
75 character set.
76
5bad0796 77 o Old Emacs internal format (emacs-mule)
df7492f9 78
5bad0796 79 The coding system adopted by old versions of Emacs (20 and 21).
4ed46869 80
df7492f9 81 o ISO2022-base coding system
4ed46869
KH
82
83 The most famous coding system for multiple character sets. X's
df7492f9
KH
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
4ed46869 87
df7492f9 88 o SJIS (or Shift-JIS or MS-Kanji-Code)
93dec019 89
4ed46869
KH
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
df7492f9 92 section 8.
4ed46869 93
df7492f9 94 o BIG5
4ed46869 95
df7492f9 96 A coding system to encode character sets: ASCII and Big5. Widely
cfb43547 97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
df7492f9
KH
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
4ed46869 101
df7492f9 102 o CCL
27901516 103
5bad0796 104 If a user wants to decode/encode text encoded in a coding system
df7492f9
KH
105 not listed above, they can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
27901516 108
df7492f9 109 o Raw-text
4ed46869 110
5a936b46 111 A coding system for text containing raw eight-bit data. Emacs
5bad0796 112 treats each byte of source text as a character (except for
df7492f9 113 end-of-line conversion).
4ed46869 114
df7492f9
KH
115 o No-conversion
116
117 Like raw text, but don't do end-of-line conversion.
4ed46869 118
4ed46869 119
df7492f9 120END-OF-LINE FORMAT
4ed46869 121
5bad0796 122 How text end-of-line is encoded depends on the operating system. For
df7492f9 123 instance, Unix's format is just one byte of LF (line-feed) code,
f4dee582 124 whereas DOS's format is a two-byte sequence of `carriage-return' and
d46c5b12
KH
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
4ed46869 127
cfb43547 128 Since text character encoding and end-of-line encoding are
df7492f9
KH
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
4ed46869 131
e19c3639
KH
132STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we set up a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
5bad0796 137 conversion (e.g. the location of source and destination data).
4ed46869
KH
138
139*/
140
df7492f9
KH
141/* COMMON MACROS */
142
143
4ed46869
KH
144/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145
df7492f9 146 These functions check if a byte sequence specified as a source in
ff0dacd7
KH
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
df7492f9 149
ff0dacd7 150 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
df7492f9
KH
151
152 Below is the template of these functions. */
153
4ed46869 154#if 0
df7492f9 155static int
cf84bb53
JB
156detect_coding_XXX (struct coding_system *coding,
157 struct coding_detection_info *detect_info)
4ed46869 158{
f1d34bca
MB
159 const unsigned char *src = coding->source;
160 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9 161 int multibytep = coding->src_multibyte;
d311d28c 162 ptrdiff_t consumed_chars = 0;
df7492f9
KH
163 int found = 0;
164 ...;
165
166 while (1)
167 {
ad1746f5 168 /* Get one byte from the source. If the source is exhausted, jump
df7492f9
KH
169 to no_more_source:. */
170 ONE_MORE_BYTE (c);
ff0dacd7
KH
171
172 if (! __C_conforms_to_XXX___ (c))
173 break;
174 if (__C_strongly_suggests_XXX__ (c))
175 found = CATEGORY_MASK_XXX; /* Positive evidence for XXX was seen. */
df7492f9 176 }
ff0dacd7
KH
177 /* The byte sequence is invalid for XXX; record the rejection. */
178 detect_info->rejected |= CATEGORY_MASK_XXX;
df7492f9 179 return 0;
ff0dacd7 180
df7492f9 181 no_more_source:
ad1746f5 182 /* The source was exhausted without meeting a non-conforming byte. */
ff0dacd7 183 detect_info->found |= found;
df7492f9 184 return 1;
4ed46869
KH
185}
186#endif
187
188/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
189
df7492f9
KH
190 These functions decode a byte sequence specified as a source by
191 CODING. The resulting multibyte text goes to a place pointed to by
192 CODING->charbuf, the length of which should not exceed
193 CODING->charbuf_size.
d46c5b12 194
df7492f9
KH
195 These functions set the information of original and decoded texts in
196 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
197 They also set CODING->result to one of CODING_RESULT_XXX indicating
198 how the decoding is finished.
d46c5b12 199
df7492f9 200 Below is the template of these functions. */
d46c5b12 201
4ed46869 202#if 0
b73bfc1c 203static void
cf84bb53 204decode_coding_XXXX (struct coding_system *coding)
4ed46869 205{
f1d34bca
MB
206 const unsigned char *src = coding->source + coding->consumed;
207 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
208 /* SRC_BASE remembers the start position in source in each loop.
209 The loop will be exited when there's not enough source text, or
210 when there's no room in CHARBUF for a decoded character. */
f1d34bca 211 const unsigned char *src_base;
df7492f9 212 /* A buffer to produce decoded characters. */
69a80ea3
KH
213 int *charbuf = coding->charbuf + coding->charbuf_used;
214 int *charbuf_end = coding->charbuf + coding->charbuf_size;
df7492f9
KH
215 int multibytep = coding->src_multibyte;
216
217 while (1)
218 {
219 src_base = src;
220 if (charbuf >= charbuf_end)
221 /* No more room to produce a decoded character. */
222 break;
223 ONE_MORE_BYTE (c);
224 /* Decode it. */
225 }
226
227 no_more_source:
228 if (src_base < src_end
229 && coding->mode & CODING_MODE_LAST_BLOCK)
230 /* If the source ends with a partial byte sequence of a character,
231 treat those bytes as eight-bit raw data. */
232 while (src_base < src_end && charbuf < charbuf_end)
233 *charbuf++ = *src_base++;
234 /* Remember how many bytes and characters we consumed. If the
235 source is multibyte, the bytes and chars are not identical.
 NOTE(review): this template stores the same byte offset in both
 members, which agrees with the sentence above only for a unibyte
 source -- confirm against the real decoders. */
236 coding->consumed = coding->consumed_char = src_base - coding->source;
237 /* Remember how many characters we produced. */
238 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
239}
240#endif
241
242/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
243
df7492f9
KH
244 These functions encode SRC_BYTES length text at SOURCE of Emacs'
245 internal multibyte format by CODING. The resulting byte sequence
b73bfc1c
KH
246 goes to a place pointed to by DESTINATION, the length of which
247 should not exceed DST_BYTES.
d46c5b12 248
df7492f9
KH
249 These functions set the information of original and encoded texts in
250 the members produced, produced_char, consumed, and consumed_char of
251 the structure *CODING. They also set the member result to one of
252 CODING_RESULT_XXX indicating how the encoding finished.
d46c5b12 253
df7492f9
KH
254 DST_BYTES zero means that the source area and the destination area
255 overlap, which means that we can produce encoded text until it
256 reaches the head of the not-yet-encoded source text.
d46c5b12 257
df7492f9 258 Below is a template of these functions. */
4ed46869 259#if 0
b73bfc1c 260static void
cf84bb53 261encode_coding_XXX (struct coding_system *coding)
4ed46869 262{
df7492f9
KH
263 int multibytep = coding->dst_multibyte;
264 int *charbuf = coding->charbuf;
265 int *charbuf_end = coding->charbuf + coding->charbuf_used;
266 unsigned char *dst = coding->destination + coding->produced;
267 unsigned char *dst_end = coding->destination + coding->dst_bytes;
268 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
d311d28c 269 ptrdiff_t produced_chars = 0;
df7492f9
KH
270
271 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
272 {
273 int c = *charbuf;
274 /* Encode C into DST, and increment DST. The loop condition leaves
 _MAX_BYTES_PRODUCED_IN_LOOP_ bytes of headroom before DST_END. */
275 }
276 label_no_more_destination:
277 /* Record how many chars and bytes we produced. */
278 coding->produced_char += produced_chars;
279 coding->produced = dst - coding->destination;
4ed46869
KH
280}
281#endif
282
4ed46869
KH
283\f
284/*** 1. Preamble ***/
285
68c45bf0 286#include <config.h>
4ed46869 287#include <stdio.h>
d7306fe6 288#include <setjmp.h>
4ed46869 289
4ed46869 290#include "lisp.h"
df7492f9 291#include "character.h"
e5560ff7 292#include "buffer.h"
4ed46869
KH
293#include "charset.h"
294#include "ccl.h"
df7492f9 295#include "composite.h"
4ed46869
KH
296#include "coding.h"
297#include "window.h"
b8299c66
KL
298#include "frame.h"
299#include "termhooks.h"
4ed46869 300
df7492f9 301Lisp_Object Vcoding_system_hash_table;
4ed46869 302
955cbe7b
PE
303static Lisp_Object Qcoding_system, Qeol_type;
304static Lisp_Object Qcoding_aliases;
1965cb73 305Lisp_Object Qunix, Qdos;
4ed46869 306Lisp_Object Qbuffer_file_coding_system;
955cbe7b
PE
307static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
308static Lisp_Object Qdefault_char;
27901516 309Lisp_Object Qno_conversion, Qundecided;
955cbe7b
PE
310Lisp_Object Qcharset, Qutf_8;
311static Lisp_Object Qiso_2022;
312static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
313static Lisp_Object Qbig, Qlittle;
314static Lisp_Object Qcoding_system_history;
315static Lisp_Object Qvalid_codes;
316static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
317static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
318static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
319static Lisp_Object QCascii_compatible_p;
4ed46869 320
387f6ba5 321Lisp_Object Qcall_process, Qcall_process_region;
4ed46869 322Lisp_Object Qstart_process, Qopen_network_stream;
955cbe7b 323static Lisp_Object Qtarget_idx;
4ed46869 324
955cbe7b
PE
325static Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
326static Lisp_Object Qinterrupted, Qinsufficient_memory;
065e3595 327
44e8490d
KH
328/* If a symbol has this property, evaluate the value to define the
329 symbol as a coding system. */
330static Lisp_Object Qcoding_system_define_form;
331
fcbcfb64
KH
332/* Format of end-of-line decided by system. This is Qunix on
333 Unix and Mac, Qdos on DOS/Windows.
334 This has an effect only for external encoding (i.e. for output to
335 file and process), not for in-buffer or Lisp string encoding. */
336static Lisp_Object system_eol_type;
337
4ed46869
KH
338#ifdef emacs
339
4608c386 340Lisp_Object Qcoding_system_p, Qcoding_system_error;
4ed46869 341
d46c5b12
KH
342/* Coding system emacs-mule and raw-text are for converting only
343 end-of-line format. */
344Lisp_Object Qemacs_mule, Qraw_text;
8f924df7 345Lisp_Object Qutf_8_emacs;
ecf488bc 346
4ed46869
KH
347/* Coding-systems are handed between Emacs Lisp programs and C internal
348 routines by the following three variables. */
c4825358
KH
349/* Coding system to be used to encode text for terminal display when
350 terminal coding system is nil. */
351struct coding_system safe_terminal_coding;
352
4ed46869
KH
353#endif /* emacs */
354
f967223b
KH
355Lisp_Object Qtranslation_table;
356Lisp_Object Qtranslation_table_id;
955cbe7b
PE
357static Lisp_Object Qtranslation_table_for_decode;
358static Lisp_Object Qtranslation_table_for_encode;
4ed46869 359
df7492f9 360/* Two special coding systems. */
74ab6df5
PE
361static Lisp_Object Vsjis_coding_system;
362static Lisp_Object Vbig5_coding_system;
df7492f9 363
df7492f9
KH
364/* ISO2022 section */
365
366#define CODING_ISO_INITIAL(coding, reg) \
367 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
368 coding_attr_iso_initial), \
369 reg)))
370
371
1b3b981b
AS
/* Evaluate to CODING->safe_charsets[CHARSET_ID], or to -1 when
 CHARSET_ID is greater than CODING->max_charset_id or when that
 entry holds 255. NOTE(review): 255 presumably marks a charset
 with no requested register -- confirm against
 setup_iso_safe_charsets. CHARSET_ID is evaluated more than once. */
372#define CODING_ISO_REQUEST(coding, charset_id) \
373 (((charset_id) <= (coding)->max_charset_id \
374 ? ((coding)->safe_charsets[charset_id] != 255 \
375 ? (coding)->safe_charsets[charset_id] \
376 : -1) \
df7492f9
KH
377 : -1))
378
379
380#define CODING_ISO_FLAGS(coding) \
381 ((coding)->spec.iso_2022.flags)
382#define CODING_ISO_DESIGNATION(coding, reg) \
383 ((coding)->spec.iso_2022.current_designation[reg])
384#define CODING_ISO_INVOCATION(coding, plane) \
385 ((coding)->spec.iso_2022.current_invocation[plane])
386#define CODING_ISO_SINGLE_SHIFTING(coding) \
387 ((coding)->spec.iso_2022.single_shifting)
388#define CODING_ISO_BOL(coding) \
389 ((coding)->spec.iso_2022.bol)
390#define CODING_ISO_INVOKED_CHARSET(coding, plane) \
391 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
e951386e
KH
392#define CODING_ISO_CMP_STATUS(coding) \
393 (&(coding)->spec.iso_2022.cmp_status)
394#define CODING_ISO_EXTSEGMENT_LEN(coding) \
395 ((coding)->spec.iso_2022.ctext_extended_segment_len)
396#define CODING_ISO_EMBEDDED_UTF_8(coding) \
397 ((coding)->spec.iso_2022.embedded_utf_8)
df7492f9
KH
398
399/* Control characters of ISO2022. */
400 /* code */ /* function */
df7492f9
KH
401#define ISO_CODE_SO 0x0E /* shift-out */
402#define ISO_CODE_SI 0x0F /* shift-in */
403#define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
404#define ISO_CODE_ESC 0x1B /* escape */
405#define ISO_CODE_SS2 0x8E /* single-shift-2 */
406#define ISO_CODE_SS3 0x8F /* single-shift-3 */
407#define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
408
409/* All code (1-byte) of ISO2022 is classified into one of the
410 followings. */
411enum iso_code_class_type
412 {
413 ISO_control_0, /* Control codes in the range
414 0x00..0x1F and 0x7F, except for the
415 following 4 codes. */
df7492f9
KH
416 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
417 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
418 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
419 ISO_escape, /* ISO_CODE_ESC (0x1B) */
420 ISO_control_1, /* Control codes in the range
421 0x80..0x9F, except for the
422 following 3 codes. */
423 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
424 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
425 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
426 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
427 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
428 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
429 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
430 };
05e6f5dc 431
df7492f9
KH
432/** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
433 `iso-flags' attribute of an iso2022 coding system. */
05e6f5dc 434
df7492f9
KH
435/* If set, produce long-form designation sequence (e.g. ESC $ ( A)
436 instead of the correct short-form sequence (e.g. ESC $ A). */
437#define CODING_ISO_FLAG_LONG_FORM 0x0001
93dec019 438
df7492f9
KH
439/* If set, reset graphic planes and registers at end-of-line to the
440 initial state. */
441#define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
05e6f5dc 442
df7492f9
KH
443/* If set, reset graphic planes and registers before any control
444 characters to the initial state. */
445#define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
05e6f5dc 446
df7492f9
KH
447/* If set, encode by 7-bit environment. */
448#define CODING_ISO_FLAG_SEVEN_BITS 0x0008
4ed46869 449
df7492f9
KH
450/* If set, use locking-shift function. */
451#define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
b73bfc1c 452
df7492f9
KH
453/* If set, use single-shift function. Overrides
454 CODING_ISO_FLAG_LOCKING_SHIFT. */
455#define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
b73bfc1c 456
df7492f9
KH
457/* If set, use designation escape sequence. */
458#define CODING_ISO_FLAG_DESIGNATION 0x0040
b73bfc1c 459
df7492f9
KH
460/* If set, produce revision number sequence. */
461#define CODING_ISO_FLAG_REVISION 0x0080
b73bfc1c 462
df7492f9
KH
463/* If set, produce ISO6429's direction specifying sequence. */
464#define CODING_ISO_FLAG_DIRECTION 0x0100
f4dee582 465
df7492f9
KH
466/* If set, assume designation states are reset at beginning of line on
467 output. */
468#define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
4ed46869 469
df7492f9
KH
470/* If set, designation sequence should be placed at beginning of line
471 on output. */
472#define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
aa72b389 473
ad1746f5 474/* If set, do not encode unsafe characters on output. */
df7492f9 475#define CODING_ISO_FLAG_SAFE 0x0800
aa72b389 476
df7492f9
KH
477/* If set, extra latin codes (128..159) are accepted as a valid code
478 on input. */
479#define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
aa72b389 480
df7492f9 481#define CODING_ISO_FLAG_COMPOSITION 0x2000
aa72b389 482
5f58e762 483/* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
aa72b389 484
bf16eb23 485#define CODING_ISO_FLAG_USE_ROMAN 0x8000
aa72b389 486
bf16eb23 487#define CODING_ISO_FLAG_USE_OLDJIS 0x10000
aa72b389 488
bf16eb23 489#define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
aa72b389 490
df7492f9
KH
491/* A character to be produced on output if encoding of the original
492 character is prohibited by CODING_ISO_FLAG_SAFE. */
493#define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
aa72b389 494
a470d443
KH
495/* UTF-8 section */
496#define CODING_UTF_8_BOM(coding) \
497 ((coding)->spec.utf_8_bom)
4ed46869 498
df7492f9
KH
499/* UTF-16 section */
500#define CODING_UTF_16_BOM(coding) \
501 ((coding)->spec.utf_16.bom)
4ed46869 502
df7492f9
KH
503#define CODING_UTF_16_ENDIAN(coding) \
504 ((coding)->spec.utf_16.endian)
4ed46869 505
df7492f9
KH
506#define CODING_UTF_16_SURROGATE(coding) \
507 ((coding)->spec.utf_16.surrogate)
4ed46869 508
4ed46869 509
df7492f9
KH
510/* CCL section */
511#define CODING_CCL_DECODER(coding) \
512 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
513#define CODING_CCL_ENCODER(coding) \
514 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
515#define CODING_CCL_VALIDS(coding) \
8f924df7 516 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
4ed46869 517
5a936b46 518/* Index for each coding category in `coding_categories' */
4ed46869 519
df7492f9
KH
520enum coding_category
521 {
522 coding_category_iso_7,
523 coding_category_iso_7_tight,
524 coding_category_iso_8_1,
525 coding_category_iso_8_2,
526 coding_category_iso_7_else,
527 coding_category_iso_8_else,
a470d443
KH
528 coding_category_utf_8_auto,
529 coding_category_utf_8_nosig,
530 coding_category_utf_8_sig,
df7492f9
KH
531 coding_category_utf_16_auto,
532 coding_category_utf_16_be,
533 coding_category_utf_16_le,
534 coding_category_utf_16_be_nosig,
535 coding_category_utf_16_le_nosig,
536 coding_category_charset,
537 coding_category_sjis,
538 coding_category_big5,
539 coding_category_ccl,
540 coding_category_emacs_mule,
541 /* All above are targets of code detection. */
542 coding_category_raw_text,
543 coding_category_undecided,
544 coding_category_max
545 };
546
547/* Definitions of flag bits used in detect_coding_XXXX. */
548#define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
549#define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
550#define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
551#define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
552#define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
553#define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
a470d443
KH
554#define CATEGORY_MASK_UTF_8_AUTO (1 << coding_category_utf_8_auto)
555#define CATEGORY_MASK_UTF_8_NOSIG (1 << coding_category_utf_8_nosig)
556#define CATEGORY_MASK_UTF_8_SIG (1 << coding_category_utf_8_sig)
b49a1807 557#define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
df7492f9
KH
558#define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
559#define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
560#define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
561#define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
562#define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
563#define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
564#define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
565#define CATEGORY_MASK_CCL (1 << coding_category_ccl)
566#define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
ff0dacd7 567#define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
df7492f9
KH
568
569/* This value is returned if detect_coding_mask () finds nothing other
570 than ASCII characters. */
571#define CATEGORY_MASK_ANY \
572 (CATEGORY_MASK_ISO_7 \
573 | CATEGORY_MASK_ISO_7_TIGHT \
574 | CATEGORY_MASK_ISO_8_1 \
575 | CATEGORY_MASK_ISO_8_2 \
576 | CATEGORY_MASK_ISO_7_ELSE \
577 | CATEGORY_MASK_ISO_8_ELSE \
a470d443
KH
578 | CATEGORY_MASK_UTF_8_AUTO \
579 | CATEGORY_MASK_UTF_8_NOSIG \
580 | CATEGORY_MASK_UTF_8_SIG \
2f3cbb32 581 | CATEGORY_MASK_UTF_16_AUTO \
df7492f9
KH
582 | CATEGORY_MASK_UTF_16_BE \
583 | CATEGORY_MASK_UTF_16_LE \
584 | CATEGORY_MASK_UTF_16_BE_NOSIG \
585 | CATEGORY_MASK_UTF_16_LE_NOSIG \
586 | CATEGORY_MASK_CHARSET \
587 | CATEGORY_MASK_SJIS \
588 | CATEGORY_MASK_BIG5 \
589 | CATEGORY_MASK_CCL \
590 | CATEGORY_MASK_EMACS_MULE)
591
592
593#define CATEGORY_MASK_ISO_7BIT \
594 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
595
596#define CATEGORY_MASK_ISO_8BIT \
597 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
598
599#define CATEGORY_MASK_ISO_ELSE \
600 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
601
602#define CATEGORY_MASK_ISO_ESCAPE \
603 (CATEGORY_MASK_ISO_7 \
604 | CATEGORY_MASK_ISO_7_TIGHT \
605 | CATEGORY_MASK_ISO_7_ELSE \
606 | CATEGORY_MASK_ISO_8_ELSE)
607
608#define CATEGORY_MASK_ISO \
609 ( CATEGORY_MASK_ISO_7BIT \
610 | CATEGORY_MASK_ISO_8BIT \
611 | CATEGORY_MASK_ISO_ELSE)
612
613#define CATEGORY_MASK_UTF_16 \
2f3cbb32
KH
614 (CATEGORY_MASK_UTF_16_AUTO \
615 | CATEGORY_MASK_UTF_16_BE \
df7492f9
KH
616 | CATEGORY_MASK_UTF_16_LE \
617 | CATEGORY_MASK_UTF_16_BE_NOSIG \
618 | CATEGORY_MASK_UTF_16_LE_NOSIG)
619
a470d443
KH
620#define CATEGORY_MASK_UTF_8 \
621 (CATEGORY_MASK_UTF_8_AUTO \
622 | CATEGORY_MASK_UTF_8_NOSIG \
623 | CATEGORY_MASK_UTF_8_SIG)
df7492f9 624
df7492f9 625/* Table of coding categories (Lisp symbols). This variable is for
ad1746f5 626 internal use only. */
df7492f9
KH
627static Lisp_Object Vcoding_category_table;
628
629/* Table of coding-categories ordered by priority. */
630static enum coding_category coding_priorities[coding_category_max];
631
632/* Nth element is a coding context for the coding system bound to the
633 Nth coding category. */
634static struct coding_system coding_categories[coding_category_max];
635
df7492f9
KH
636/*** Commonly used macros and functions ***/
637
638#ifndef min
639#define min(a, b) ((a) < (b) ? (a) : (b))
640#endif
641#ifndef max
642#define max(a, b) ((a) > (b) ? (a) : (b))
643#endif
4ed46869 644
24a73b0a
KH
/* Set ATTRS to CODING_ID_ATTRS (CODING->id) and then CHARSET_LIST to
 CODING_ATTR_CHARSET_LIST of those attributes. ATTRS and
 CHARSET_LIST must be assignable lvalues. */
645#define CODING_GET_INFO(coding, attrs, charset_list) \
646 do { \
647 (attrs) = CODING_ID_ATTRS ((coding)->id); \
648 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
df7492f9 649 } while (0)
4ed46869 650
4ed46869 651
df7492f9
KH
652/* Safely get one byte from the source text pointed by SRC which ends
653 at SRC_END, and set C to that byte. If there are not enough bytes
065e3595
KH
654 in the source, it jumps to `no_more_source' (after recording CODING_RESULT_INSUFFICIENT_SRC when SRC_BASE < SRC). If multibytep is
655 nonzero and the byte has its high bit set: a lead byte of 0xC0 or 0xC1 is combined with the next byte into a single byte value in C; otherwise the multibyte character at SRC is read, C is set to the
656 negative value of the character code, and CODING_RESULT_INVALID_SRC is recorded. CONSUMED_CHARS is incremented once per use. The caller should declare
657 and set these variables appropriately in advance:
658 src, src_base, src_end, multibytep, consumed_chars, coding */
aa72b389 659
065e3595
KH
660#define ONE_MORE_BYTE(c) \
661 do { \
662 if (src == src_end) \
663 { \
664 if (src_base < src) \
665 record_conversion_result \
666 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
667 goto no_more_source; \
668 } \
669 c = *src++; \
670 if (multibytep && (c & 0x80)) \
671 { \
672 if ((c & 0xFE) == 0xC0) \
673 c = ((c & 1) << 6) | *src++; \
674 else \
675 { \
35befdaa
KH
676 src--; \
677 c = - string_char (src, &src, NULL); \
065e3595
KH
678 record_conversion_result \
679 (coding, CODING_RESULT_INVALID_SRC); \
680 } \
681 } \
682 consumed_chars++; \
aa72b389
KH
683 } while (0)
684
f56a4450 685/* Safely get two bytes from the source text pointed by SRC which ends
220eeac9
KH
686 at SRC_END, and set C1 and C2 to those bytes while skipping the
687 heading multibyte characters. If there are not enough bytes in the
688 source, it jumps to `no_more_source'. If multibytep is nonzero and
689 a multibyte character (other than one led by 0xC0 or 0xC1, which is folded into a single byte value) is found for C2, set C2 to -1
690 without advancing SRC past that character. The caller should declare and set these
691 variables appropriately in advance:
f56a4450
KH
692 src, src_end, multibytep
693 It is intended that this macro is used in detect_coding_utf_16. */
694
220eeac9
KH
695#define TWO_MORE_BYTES(c1, c2) \
696 do { \
697 do { \
698 if (src == src_end) \
699 goto no_more_source; \
700 c1 = *src++; \
701 if (multibytep && (c1 & 0x80)) \
702 { \
703 if ((c1 & 0xFE) == 0xC0) \
704 c1 = ((c1 & 1) << 6) | *src++; \
705 else \
706 { \
707 src += BYTES_BY_CHAR_HEAD (c1) - 1; \
708 c1 = -1; \
709 } \
710 } \
711 } while (c1 < 0); \
712 if (src == src_end) \
713 goto no_more_source; \
714 c2 = *src++; \
715 if (multibytep && (c2 & 0x80)) \
716 { \
717 if ((c2 & 0xFE) == 0xC0) \
718 c2 = ((c2 & 1) << 6) | *src++; \
719 else \
720 c2 = -1; \
721 } \
f56a4450
KH
722 } while (0)
723
aa72b389 724
df7492f9
KH
725/* Store a byte C in the place pointed by DST and increment DST to the
726 next free point, and increment PRODUCED_CHARS. The caller should
727 assure that C is 0..127, and declare and set the variables `dst' and `produced_chars'
728 appropriately in advance. No bounds check is made on DST.
729*/
aa72b389
KH
730
731
df7492f9
KH
732#define EMIT_ONE_ASCII_BYTE(c) \
733 do { \
734 produced_chars++; \
735 *dst++ = (c); \
b6871cc7 736 } while (0)
aa72b389
KH
737
738
ad1746f5 739/* Like EMIT_ONE_ASCII_BYTE, but store two bytes, C1 then C2, and add 2 to PRODUCED_CHARS. */
aa72b389 740
df7492f9
KH
741#define EMIT_TWO_ASCII_BYTES(c1, c2) \
742 do { \
743 produced_chars += 2; \
744 *dst++ = (c1), *dst++ = (c2); \
745 } while (0)
aa72b389
KH
746
747
df7492f9
KH
748/* Store a byte C in the place pointed by DST and increment DST to the
749 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
750 nonzero, store in an appropriate multibyte form (a value of 0x80 or above is first mapped through BYTE8_TO_CHAR). The caller should
751 declare and set the variables `dst', `multibytep' and `produced_chars' appropriately
752 in advance. */
753
754#define EMIT_ONE_BYTE(c) \
755 do { \
756 produced_chars++; \
757 if (multibytep) \
758 { \
b25d760e 759 unsigned ch = (c); \
df7492f9
KH
760 if (ch >= 0x80) \
761 ch = BYTE8_TO_CHAR (ch); \
762 CHAR_STRING_ADVANCE (ch, dst); \
763 } \
764 else \
765 *dst++ = (c); \
aa72b389 766 } while (0)
aa72b389 767
aa72b389 768
df7492f9 769/* Like EMIT_ONE_BYTE, but emit two bytes, C1 then C2, and add 2 to PRODUCED_CHARS. */
aa72b389 770
e19c3639
KH
771#define EMIT_TWO_BYTES(c1, c2) \
772 do { \
773 produced_chars += 2; \
774 if (multibytep) \
775 { \
b25d760e 776 unsigned ch; \
e19c3639
KH
777 \
778 ch = (c1); \
779 if (ch >= 0x80) \
780 ch = BYTE8_TO_CHAR (ch); \
781 CHAR_STRING_ADVANCE (ch, dst); \
782 ch = (c2); \
783 if (ch >= 0x80) \
784 ch = BYTE8_TO_CHAR (ch); \
785 CHAR_STRING_ADVANCE (ch, dst); \
786 } \
787 else \
788 { \
789 *dst++ = (c1); \
790 *dst++ = (c2); \
791 } \
aa72b389
KH
792 } while (0)
793
794
df7492f9
KH
/* Emit three bytes C1, C2 and C3 in that order, by EMIT_ONE_BYTE
 followed by EMIT_TWO_BYTES. */
795#define EMIT_THREE_BYTES(c1, c2, c3) \
796 do { \
797 EMIT_ONE_BYTE (c1); \
798 EMIT_TWO_BYTES (c2, c3); \
799 } while (0)
aa72b389 800
aa72b389 801
df7492f9
KH
/* Emit four bytes C1, C2, C3 and C4 in that order, by two uses of
 EMIT_TWO_BYTES. */
802#define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
803 do { \
804 EMIT_TWO_BYTES (c1, c2); \
805 EMIT_TWO_BYTES (c3, c4); \
806 } while (0)
aa72b389 807
aa72b389 808
f6cbaf43 809/* Prototypes for static functions. */
f57e2426
J
810static void record_conversion_result (struct coding_system *coding,
811 enum coding_result_code result);
812static int detect_coding_utf_8 (struct coding_system *,
813 struct coding_detection_info *info);
814static void decode_coding_utf_8 (struct coding_system *);
815static int encode_coding_utf_8 (struct coding_system *);
816
817static int detect_coding_utf_16 (struct coding_system *,
818 struct coding_detection_info *info);
819static void decode_coding_utf_16 (struct coding_system *);
820static int encode_coding_utf_16 (struct coding_system *);
821
822static int detect_coding_iso_2022 (struct coding_system *,
823 struct coding_detection_info *info);
824static void decode_coding_iso_2022 (struct coding_system *);
825static int encode_coding_iso_2022 (struct coding_system *);
826
827static int detect_coding_emacs_mule (struct coding_system *,
828 struct coding_detection_info *info);
829static void decode_coding_emacs_mule (struct coding_system *);
830static int encode_coding_emacs_mule (struct coding_system *);
831
832static int detect_coding_sjis (struct coding_system *,
833 struct coding_detection_info *info);
834static void decode_coding_sjis (struct coding_system *);
835static int encode_coding_sjis (struct coding_system *);
836
837static int detect_coding_big5 (struct coding_system *,
838 struct coding_detection_info *info);
839static void decode_coding_big5 (struct coding_system *);
840static int encode_coding_big5 (struct coding_system *);
841
842static int detect_coding_ccl (struct coding_system *,
843 struct coding_detection_info *info);
844static void decode_coding_ccl (struct coding_system *);
845static int encode_coding_ccl (struct coding_system *);
846
847static void decode_coding_raw_text (struct coding_system *);
848static int encode_coding_raw_text (struct coding_system *);
849
c1892f11
PE
850static void coding_set_source (struct coding_system *);
851static ptrdiff_t coding_change_source (struct coding_system *);
852static void coding_set_destination (struct coding_system *);
853static ptrdiff_t coding_change_destination (struct coding_system *);
d311d28c 854static void coding_alloc_by_realloc (struct coding_system *, ptrdiff_t);
f57e2426 855static void coding_alloc_by_making_gap (struct coding_system *,
d311d28c 856 ptrdiff_t, ptrdiff_t);
f57e2426 857static unsigned char *alloc_destination (struct coding_system *,
d311d28c 858 ptrdiff_t, unsigned char *);
f57e2426 859static void setup_iso_safe_charsets (Lisp_Object);
6e6c82a4 860static ptrdiff_t encode_designation_at_bol (struct coding_system *,
5eb05ea3 861 int *, int *, unsigned char *);
f57e2426 862static int detect_eol (const unsigned char *,
d311d28c 863 ptrdiff_t, enum coding_category);
f57e2426
J
864static Lisp_Object adjust_coding_eol_type (struct coding_system *, int);
865static void decode_eol (struct coding_system *);
866static Lisp_Object get_translation_table (Lisp_Object, int, int *);
867static Lisp_Object get_translation (Lisp_Object, int *, int *);
868static int produce_chars (struct coding_system *, Lisp_Object, int);
55d4c1b2 869static inline void produce_charset (struct coding_system *, int *,
d311d28c
PE
870 ptrdiff_t);
871static void produce_annotation (struct coding_system *, ptrdiff_t);
f57e2426 872static int decode_coding (struct coding_system *);
d311d28c 873static inline int *handle_composition_annotation (ptrdiff_t, ptrdiff_t,
f57e2426 874 struct coding_system *,
d311d28c
PE
875 int *, ptrdiff_t *);
876static inline int *handle_charset_annotation (ptrdiff_t, ptrdiff_t,
f57e2426 877 struct coding_system *,
d311d28c 878 int *, ptrdiff_t *);
f57e2426
J
879static void consume_chars (struct coding_system *, Lisp_Object, int);
880static int encode_coding (struct coding_system *);
881static Lisp_Object make_conversion_work_buffer (int);
882static Lisp_Object code_conversion_restore (Lisp_Object);
55d4c1b2 883static inline int char_encodable_p (int, Lisp_Object);
f57e2426 884static Lisp_Object make_subsidiaries (Lisp_Object);
f6cbaf43 885
065e3595
KH
886static void
887record_conversion_result (struct coding_system *coding,
888 enum coding_result_code result)
889{
890 coding->result = result;
891 switch (result)
892 {
893 case CODING_RESULT_INSUFFICIENT_SRC:
894 Vlast_code_conversion_error = Qinsufficient_source;
895 break;
896 case CODING_RESULT_INCONSISTENT_EOL:
897 Vlast_code_conversion_error = Qinconsistent_eol;
898 break;
899 case CODING_RESULT_INVALID_SRC:
900 Vlast_code_conversion_error = Qinvalid_source;
901 break;
902 case CODING_RESULT_INTERRUPT:
903 Vlast_code_conversion_error = Qinterrupted;
904 break;
905 case CODING_RESULT_INSUFFICIENT_MEM:
906 Vlast_code_conversion_error = Qinsufficient_memory;
907 break;
ebaf11b6
KH
908 case CODING_RESULT_INSUFFICIENT_DST:
909 /* Don't record this error in Vlast_code_conversion_error
910 because it happens just temporarily and is resolved when the
911 whole conversion is finished. */
912 break;
409ea3a1
AS
913 case CODING_RESULT_SUCCESS:
914 break;
35befdaa
KH
915 default:
916 Vlast_code_conversion_error = intern ("Unknown error");
065e3595
KH
917 }
918}
919
5eb05ea3
KH
/* The CODING_XXX wrapper macros below keep pointers into buffer text
   valid across calls to decode_char, encode_char, etc.  Such a call
   may load a charset map, and because loading a charset map
   allocates large structures it could relocate buffers.  */

/* Set C to the result of DECODE_CHAR (CHARSET, CODE).  If a charset
   map was loaded meanwhile and coding_change_source reports that
   CODING's source moved, shift SRC, SRC_BASE and SRC_END by the same
   number of bytes.  */

#define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
  do {                                                                  \
    charset_map_loaded = 0;                                             \
    c = DECODE_CHAR (charset, code);                                    \
    if (charset_map_loaded)                                             \
      {                                                                 \
        ptrdiff_t offset = coding_change_source (coding);               \
                                                                        \
        if (offset)                                                     \
          {                                                             \
            src += offset;                                              \
            src_base += offset;                                         \
            src_end += offset;                                          \
          }                                                             \
      }                                                                 \
  } while (0)
939
5eb05ea3
KH
/* Set CODE to the result of ENCODE_CHAR (CHARSET, C).  If a charset
   map was loaded meanwhile and coding_change_destination reports
   that CODING's destination moved, shift DST and DST_END by the same
   number of bytes.  */

#define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
  do {                                                                  \
    charset_map_loaded = 0;                                             \
    code = ENCODE_CHAR (charset, c);                                    \
    if (charset_map_loaded)                                             \
      {                                                                 \
        ptrdiff_t offset = coding_change_destination (coding);          \
                                                                        \
        if (offset)                                                     \
          {                                                             \
            dst += offset;                                              \
            dst_end += offset;                                          \
          }                                                             \
      }                                                                 \
  } while (0)
953
/* Set CHARSET to the result of char_charset (C, CHARSET_LIST,
   CODE_RETURN).  If a charset map was loaded meanwhile and
   coding_change_destination reports that CODING's destination moved,
   shift DST and DST_END by the same number of bytes.  */

#define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
  do {                                                                  \
    charset_map_loaded = 0;                                             \
    charset = char_charset (c, charset_list, code_return);              \
    if (charset_map_loaded)                                             \
      {                                                                 \
        ptrdiff_t offset = coding_change_destination (coding);          \
                                                                        \
        if (offset)                                                     \
          {                                                             \
            dst += offset;                                              \
            dst_end += offset;                                          \
          }                                                             \
      }                                                                 \
  } while (0)
967
/* Set RESULT to the value of CHAR_CHARSET_P (C, CHARSET).  If a
   charset map was loaded meanwhile and coding_change_destination
   reports that CODING's destination moved, shift DST and DST_END by
   the same number of bytes.  */

#define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
  do {                                                                  \
    charset_map_loaded = 0;                                             \
    result = CHAR_CHARSET_P (c, charset);                               \
    if (charset_map_loaded)                                             \
      {                                                                 \
        ptrdiff_t offset = coding_change_destination (coding);          \
                                                                        \
        if (offset)                                                     \
          {                                                             \
            dst += offset;                                              \
            dst_end += offset;                                          \
          }                                                             \
      }                                                                 \
  } while (0)
981
aa72b389 982
119852e7
KH
/* Unless more than BYTES bytes of room remain between dst and
   dst_end, grow coding->destination by calling alloc_destination and
   update dst and dst_end.  The amount requested is BYTES plus the
   number of elements still unprocessed in charbuf.  We don't have to
   take care of coding->source which will be relocated.  It is
   handled by calling coding_set_source in encode_coding.  */

#define ASSURE_DESTINATION(bytes)                                       \
  do {                                                                  \
    if (dst + (bytes) >= dst_end)                                       \
      {                                                                 \
        ptrdiff_t needed = charbuf_end - charbuf + (bytes);             \
                                                                        \
        dst = alloc_destination (coding, needed, dst);                  \
        dst_end = coding->destination + coding->dst_bytes;              \
      }                                                                 \
  } while (0)
aa72b389 998
aa72b389 999
db274c7a
KH
/* Store multibyte form of the character C in P, and advance P to the
   end of the multibyte form.  This is like CHAR_STRING_ADVANCE but it
   never calls MAYBE_UNIFY_CHAR.

   C is evaluated several times, so it must not have side effects.
   Every use of C is parenthesized so that an argument containing a
   low-precedence operator (e.g. `a | b') is encoded as a whole.  */

#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)              \
  do {                                                  \
    if ((c) <= MAX_1_BYTE_CHAR)                         \
      *(p)++ = (c);                                     \
    else if ((c) <= MAX_2_BYTE_CHAR)                    \
      *(p)++ = (0xC0 | ((c) >> 6)),                     \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_3_BYTE_CHAR)                    \
      *(p)++ = (0xE0 | ((c) >> 12)),                    \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_4_BYTE_CHAR)                    \
      *(p)++ = (0xF0 | ((c) >> 18)),                    \
        *(p)++ = (0x80 | (((c) >> 12) & 0x3F)),         \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_5_BYTE_CHAR)                    \
      *(p)++ = 0xF8,                                    \
        *(p)++ = (0x80 | (((c) >> 18) & 0x0F)),         \
        *(p)++ = (0x80 | (((c) >> 12) & 0x3F)),         \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else                                                \
      /* Anything above MAX_5_BYTE_CHAR is stored via   \
         BYTE8_STRING after subtracting 0x3FFF80.  */   \
      (p) += BYTE8_STRING ((c) - 0x3FFF80, p);          \
  } while (0)
1029
1030
/* Return the character code of character whose multibyte form is at
   P, and advance P to the end of the multibyte form.  This is like
   STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR.

   The length of the form (1 to 5 bytes) is chosen from the high bits
   of the leading byte.  A 2-byte form whose leading byte is below
   0xC2 has 0x3FFF80 OR-ed into the result.  P is evaluated several
   times.  */

#define STRING_CHAR_ADVANCE_NO_UNIFY(p)                         \
  (((p)[0] & 0x80) == 0                                         \
   ? *(p)++                                                     \
   : ((p)[0] & 0x20) == 0                                       \
   ? ((p) += 2,                                                 \
      ((((p)[-2] & 0x1F) << 6)                                  \
       | ((p)[-1] & 0x3F)                                       \
       | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)))    \
   : ((p)[0] & 0x10) == 0                                       \
   ? ((p) += 3,                                                 \
      ((((p)[-3] & 0x0F) << 12)                                 \
       | (((p)[-2] & 0x3F) << 6)                                \
       | ((p)[-1] & 0x3F)))                                     \
   : ((p)[0] & 0x08) == 0                                       \
   ? ((p) += 4,                                                 \
      ((((p)[-4] & 0xF) << 18)                                  \
       | (((p)[-3] & 0x3F) << 12)                               \
       | (((p)[-2] & 0x3F) << 6)                                \
       | ((p)[-1] & 0x3F)))                                     \
   : ((p) += 5,                                                 \
      ((((p)[-4] & 0x3F) << 18)                                 \
       | (((p)[-3] & 0x3F) << 12)                               \
       | (((p)[-2] & 0x3F) << 6)                                \
       | ((p)[-1] & 0x3F))))
1059
aa72b389 1060
c1892f11 1061/* Set coding->source from coding->src_object. */
5eb05ea3 1062
c1892f11 1063static void
971de7fb 1064coding_set_source (struct coding_system *coding)
aa72b389 1065{
df7492f9
KH
1066 if (BUFFERP (coding->src_object))
1067 {
2cb26057 1068 struct buffer *buf = XBUFFER (coding->src_object);
aa72b389 1069
df7492f9 1070 if (coding->src_pos < 0)
2cb26057 1071 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
df7492f9 1072 else
2cb26057 1073 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
aa72b389 1074 }
df7492f9 1075 else if (STRINGP (coding->src_object))
aa72b389 1076 {
8f924df7 1077 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
aa72b389 1078 }
df7492f9 1079 else
f38b440c
PE
1080 {
1081 /* Otherwise, the source is C string and is never relocated
1082 automatically. Thus we don't have to update anything. */
1083 }
df7492f9 1084}
aa72b389 1085
5eb05ea3 1086
c1892f11
PE
1087/* Set coding->source from coding->src_object, and return how many
1088 bytes coding->source was changed. */
5eb05ea3 1089
8f50130c 1090static ptrdiff_t
c1892f11 1091coding_change_source (struct coding_system *coding)
df7492f9 1092{
c1892f11
PE
1093 const unsigned char *orig = coding->source;
1094 coding_set_source (coding);
1095 return coding->source - orig;
1096}
1097
5eb05ea3 1098
c1892f11
PE
1099/* Set coding->destination from coding->dst_object. */
1100
1101static void
1102coding_set_destination (struct coding_system *coding)
1103{
df7492f9 1104 if (BUFFERP (coding->dst_object))
aa72b389 1105 {
a0241d01 1106 if (BUFFERP (coding->src_object) && coding->src_pos < 0)
aa72b389 1107 {
13818c30 1108 coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
28f67a95
KH
1109 coding->dst_bytes = (GAP_END_ADDR
1110 - (coding->src_bytes - coding->consumed)
1111 - coding->destination);
aa72b389 1112 }
df7492f9 1113 else
28f67a95
KH
1114 {
1115 /* We are sure that coding->dst_pos_byte is before the gap
1116 of the buffer. */
1117 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
13818c30 1118 + coding->dst_pos_byte - BEG_BYTE);
28f67a95
KH
1119 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1120 - coding->destination);
1121 }
df7492f9
KH
1122 }
1123 else
f38b440c
PE
1124 {
1125 /* Otherwise, the destination is C string and is never relocated
1126 automatically. Thus we don't have to update anything. */
1127 }
c1892f11
PE
1128}
1129
1130
1131/* Set coding->destination from coding->dst_object, and return how
1132 many bytes coding->destination was changed. */
1133
1134static ptrdiff_t
1135coding_change_destination (struct coding_system *coding)
1136{
1137 const unsigned char *orig = coding->destination;
1138 coding_set_destination (coding);
5eb05ea3 1139 return coding->destination - orig;
df7492f9
KH
1140}
1141
1142
1143static void
d311d28c 1144coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
df7492f9 1145{
c9d624c6 1146 if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
d1f3d2af 1147 string_overflow ();
df7492f9
KH
1148 coding->destination = (unsigned char *) xrealloc (coding->destination,
1149 coding->dst_bytes + bytes);
1150 coding->dst_bytes += bytes;
1151}
1152
1153static void
cf84bb53 1154coding_alloc_by_making_gap (struct coding_system *coding,
d311d28c 1155 ptrdiff_t gap_head_used, ptrdiff_t bytes)
df7492f9 1156{
db274c7a 1157 if (EQ (coding->src_object, coding->dst_object))
df7492f9 1158 {
db274c7a
KH
1159 /* The gap may contain the produced data at the head and not-yet
1160 consumed data at the tail. To preserve those data, we at
1161 first make the gap size to zero, then increase the gap
1162 size. */
d311d28c 1163 ptrdiff_t add = GAP_SIZE;
db274c7a
KH
1164
1165 GPT += gap_head_used, GPT_BYTE += gap_head_used;
1166 GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
df7492f9
KH
1167 make_gap (bytes);
1168 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
db274c7a 1169 GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
df7492f9 1170 }
730fff51 1171 else
df7492f9 1172 {
2c78b7e1
KH
1173 Lisp_Object this_buffer;
1174
1175 this_buffer = Fcurrent_buffer ();
df7492f9
KH
1176 set_buffer_internal (XBUFFER (coding->dst_object));
1177 make_gap (bytes);
1178 set_buffer_internal (XBUFFER (this_buffer));
aa72b389 1179 }
df7492f9 1180}
8f924df7 1181
df7492f9
KH
1182
1183static unsigned char *
d311d28c 1184alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
cf84bb53 1185 unsigned char *dst)
df7492f9 1186{
d311d28c 1187 ptrdiff_t offset = dst - coding->destination;
df7492f9
KH
1188
1189 if (BUFFERP (coding->dst_object))
db274c7a
KH
1190 {
1191 struct buffer *buf = XBUFFER (coding->dst_object);
1192
1193 coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1194 }
aa72b389 1195 else
df7492f9 1196 coding_alloc_by_realloc (coding, nbytes);
df7492f9
KH
1197 coding_set_destination (coding);
1198 dst = coding->destination + offset;
1199 return dst;
1200}
aa72b389 1201
ff0dacd7
KH
1202/** Macros for annotations. */
1203
ff0dacd7
KH
1204/* An annotation data is stored in the array coding->charbuf in this
1205 format:
69a80ea3 1206 [ -LENGTH ANNOTATION_MASK NCHARS ... ]
ff0dacd7
KH
1207 LENGTH is the number of elements in the annotation.
1208 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
69a80ea3 1209 NCHARS is the number of characters in the text annotated.
ff0dacd7
KH
1210
1211 The format of the following elements depend on ANNOTATION_MASK.
1212
1213 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1214 follow:
e951386e
KH
1215 ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1216
1217 NBYTES is the number of bytes specified in the header part of
1218 old-style emacs-mule encoding, or 0 for the other kind of
1219 composition.
1220
ff0dacd7 1221 METHOD is one of enum composition_method.
e951386e 1222
ad1746f5 1223 Optional COMPOSITION-COMPONENTS are characters and composition
ff0dacd7
KH
1224 rules.
1225
1226 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
e951386e
KH
1227 follows.
1228
1229 If ANNOTATION_MASK is 0, this annotation is just a space holder to
1230 recover from an invalid annotation, and should be skipped by
1231 produce_annotation. */
1232
/* Maximum length, in charbuf elements, of the header of annotation
   data.  */
#define MAX_ANNOTATION_LENGTH 5
ff0dacd7 1235
/* Store the three-element annotation header [ -LEN MASK NCHARS ] at
   BUF, advance BUF past it, and mark `coding' (a variable that must
   be in scope at the point of use) as annotated.

   The definition deliberately does not end in a semicolon, so that
   `ADD_ANNOTATION_DATA (...);' is exactly one statement and can be
   used as the unbraced body of an `if' that has an `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1243
/* Store a composition annotation at BUF and advance BUF past it:
   the common header (length 5, CODING_ANNOTATE_COMPOSITION_MASK,
   NCHARS) followed by NBYTES and METHOD.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1250
1251
69a80ea3
KH
/* Store a charset annotation at BUF and advance BUF past it: the
   common header (length 4, CODING_ANNOTATE_CHARSET_MASK, NCHARS)
   followed by the charset ID.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1257
df7492f9
KH
1258\f
1259/*** 2. Emacs' internal format (emacs-utf-8) ***/
1260
1261
1262
1263\f
1264/*** 3. UTF-8 ***/
1265
1266/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
1267 Check if a text is encoded in UTF-8. If it is, return 1, else
1268 return 0. */
df7492f9
KH
1269
/* Classification of a UTF-8 octet C by its high bits.  */

/* 0xxxxxxx: a complete one-octet sequence.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* 10xxxxxx: a trailing octet of a multi-octet sequence.  */
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
/* 110xxxxx: the leading octet of a two-octet sequence.  */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
/* 1110xxxx: the leading octet of a three-octet sequence.  */
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
/* 11110xxx: the leading octet of a four-octet sequence.  */
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
/* 111110xx: the leading octet of a five-octet sequence.  */
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)

/* The three octets of the UTF-8 signature (byte order mark).  */
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1280
df7492f9 1281static int
cf84bb53
JB
1282detect_coding_utf_8 (struct coding_system *coding,
1283 struct coding_detection_info *detect_info)
df7492f9 1284{
065e3595 1285 const unsigned char *src = coding->source, *src_base;
8f924df7 1286 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9 1287 int multibytep = coding->src_multibyte;
d311d28c 1288 ptrdiff_t consumed_chars = 0;
a470d443 1289 int bom_found = 0;
df7492f9
KH
1290 int found = 0;
1291
ff0dacd7 1292 detect_info->checked |= CATEGORY_MASK_UTF_8;
df7492f9
KH
1293 /* A coding system of this category is always ASCII compatible. */
1294 src += coding->head_ascii;
1295
1296 while (1)
aa72b389 1297 {
df7492f9 1298 int c, c1, c2, c3, c4;
aa72b389 1299
065e3595 1300 src_base = src;
df7492f9 1301 ONE_MORE_BYTE (c);
065e3595 1302 if (c < 0 || UTF_8_1_OCTET_P (c))
df7492f9
KH
1303 continue;
1304 ONE_MORE_BYTE (c1);
065e3595 1305 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
df7492f9
KH
1306 break;
1307 if (UTF_8_2_OCTET_LEADING_P (c))
aa72b389 1308 {
a470d443 1309 found = 1;
df7492f9 1310 continue;
aa72b389 1311 }
df7492f9 1312 ONE_MORE_BYTE (c2);
065e3595 1313 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
df7492f9
KH
1314 break;
1315 if (UTF_8_3_OCTET_LEADING_P (c))
aa72b389 1316 {
a470d443
KH
1317 found = 1;
1318 if (src_base == coding->source
1319 && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1320 bom_found = 1;
df7492f9 1321 continue;
aa72b389 1322 }
df7492f9 1323 ONE_MORE_BYTE (c3);
065e3595 1324 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
df7492f9
KH
1325 break;
1326 if (UTF_8_4_OCTET_LEADING_P (c))
aa72b389 1327 {
a470d443 1328 found = 1;
df7492f9
KH
1329 continue;
1330 }
1331 ONE_MORE_BYTE (c4);
065e3595 1332 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
df7492f9
KH
1333 break;
1334 if (UTF_8_5_OCTET_LEADING_P (c))
1335 {
a470d443 1336 found = 1;
df7492f9
KH
1337 continue;
1338 }
1339 break;
aa72b389 1340 }
ff0dacd7 1341 detect_info->rejected |= CATEGORY_MASK_UTF_8;
df7492f9 1342 return 0;
aa72b389 1343
df7492f9 1344 no_more_source:
065e3595 1345 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
aa72b389 1346 {
ff0dacd7 1347 detect_info->rejected |= CATEGORY_MASK_UTF_8;
89528eb3 1348 return 0;
aa72b389 1349 }
a470d443
KH
1350 if (bom_found)
1351 {
1352 /* The first character 0xFFFE doesn't necessarily mean a BOM. */
1353 detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1354 }
1355 else
1356 {
1357 detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
0e17387a
KH
1358 if (found)
1359 detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
a470d443 1360 }
ff0dacd7 1361 return 1;
aa72b389
KH
1362}
1363
4ed46869 1364
b73bfc1c 1365static void
971de7fb 1366decode_coding_utf_8 (struct coding_system *coding)
b73bfc1c 1367{
8f924df7
KH
1368 const unsigned char *src = coding->source + coding->consumed;
1369 const unsigned char *src_end = coding->source + coding->src_bytes;
1370 const unsigned char *src_base;
69a80ea3
KH
1371 int *charbuf = coding->charbuf + coding->charbuf_used;
1372 int *charbuf_end = coding->charbuf + coding->charbuf_size;
d311d28c 1373 ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
df7492f9 1374 int multibytep = coding->src_multibyte;
a470d443 1375 enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
2735d060 1376 int eol_dos =
0a9564cb 1377 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 1378 int byte_after_cr = -1;
4ed46869 1379
a470d443
KH
1380 if (bom != utf_without_bom)
1381 {
1382 int c1, c2, c3;
1383
1384 src_base = src;
1385 ONE_MORE_BYTE (c1);
1386 if (! UTF_8_3_OCTET_LEADING_P (c1))
1387 src = src_base;
1388 else
1389 {
159bd5a2 1390 ONE_MORE_BYTE (c2);
a470d443
KH
1391 if (! UTF_8_EXTRA_OCTET_P (c2))
1392 src = src_base;
1393 else
1394 {
159bd5a2 1395 ONE_MORE_BYTE (c3);
a470d443
KH
1396 if (! UTF_8_EXTRA_OCTET_P (c3))
1397 src = src_base;
1398 else
1399 {
1400 if ((c1 != UTF_8_BOM_1)
1401 || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1402 src = src_base;
1403 else
1404 CODING_UTF_8_BOM (coding) = utf_without_bom;
1405 }
1406 }
1407 }
1408 }
1409 CODING_UTF_8_BOM (coding) = utf_without_bom;
1410
df7492f9 1411 while (1)
b73bfc1c 1412 {
df7492f9 1413 int c, c1, c2, c3, c4, c5;
ec6d2bb8 1414
df7492f9
KH
1415 src_base = src;
1416 consumed_chars_base = consumed_chars;
4af310db 1417
df7492f9 1418 if (charbuf >= charbuf_end)
b71f6f73
KH
1419 {
1420 if (byte_after_cr >= 0)
1421 src_base--;
1422 break;
1423 }
df7492f9 1424
119852e7
KH
1425 if (byte_after_cr >= 0)
1426 c1 = byte_after_cr, byte_after_cr = -1;
1427 else
1428 ONE_MORE_BYTE (c1);
065e3595
KH
1429 if (c1 < 0)
1430 {
1431 c = - c1;
1432 }
1a4990fb 1433 else if (UTF_8_1_OCTET_P (c1))
df7492f9 1434 {
2735d060 1435 if (eol_dos && c1 == '\r')
119852e7 1436 ONE_MORE_BYTE (byte_after_cr);
df7492f9 1437 c = c1;
4af310db 1438 }
df7492f9 1439 else
4af310db 1440 {
df7492f9 1441 ONE_MORE_BYTE (c2);
065e3595 1442 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
df7492f9
KH
1443 goto invalid_code;
1444 if (UTF_8_2_OCTET_LEADING_P (c1))
4af310db 1445 {
b0edb2c5
DL
1446 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1447 /* Reject overlong sequences here and below. Encoders
1448 producing them are incorrect, they can be misleading,
1449 and they mess up read/write invariance. */
1450 if (c < 128)
1451 goto invalid_code;
4af310db 1452 }
df7492f9 1453 else
aa72b389 1454 {
df7492f9 1455 ONE_MORE_BYTE (c3);
065e3595 1456 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
df7492f9
KH
1457 goto invalid_code;
1458 if (UTF_8_3_OCTET_LEADING_P (c1))
b0edb2c5
DL
1459 {
1460 c = (((c1 & 0xF) << 12)
1461 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
72fe1301
DL
1462 if (c < 0x800
1463 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
b0edb2c5
DL
1464 goto invalid_code;
1465 }
df7492f9
KH
1466 else
1467 {
1468 ONE_MORE_BYTE (c4);
065e3595 1469 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
df7492f9
KH
1470 goto invalid_code;
1471 if (UTF_8_4_OCTET_LEADING_P (c1))
b0edb2c5 1472 {
df7492f9
KH
1473 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1474 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
b0edb2c5
DL
1475 if (c < 0x10000)
1476 goto invalid_code;
1477 }
df7492f9
KH
1478 else
1479 {
1480 ONE_MORE_BYTE (c5);
065e3595 1481 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
df7492f9
KH
1482 goto invalid_code;
1483 if (UTF_8_5_OCTET_LEADING_P (c1))
1484 {
1485 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1486 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1487 | (c5 & 0x3F));
b0edb2c5 1488 if ((c > MAX_CHAR) || (c < 0x200000))
df7492f9
KH
1489 goto invalid_code;
1490 }
1491 else
1492 goto invalid_code;
1493 }
1494 }
aa72b389 1495 }
b73bfc1c 1496 }
df7492f9
KH
1497
1498 *charbuf++ = c;
1499 continue;
1500
1501 invalid_code:
1502 src = src_base;
1503 consumed_chars = consumed_chars_base;
1504 ONE_MORE_BYTE (c);
1505 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1506 coding->errors++;
aa72b389
KH
1507 }
1508
df7492f9
KH
1509 no_more_source:
1510 coding->consumed_char += consumed_chars_base;
1511 coding->consumed = src_base - coding->source;
1512 coding->charbuf_used = charbuf - coding->charbuf;
1513}
1514
1515
1516static int
971de7fb 1517encode_coding_utf_8 (struct coding_system *coding)
df7492f9
KH
1518{
1519 int multibytep = coding->dst_multibyte;
1520 int *charbuf = coding->charbuf;
1521 int *charbuf_end = charbuf + coding->charbuf_used;
1522 unsigned char *dst = coding->destination + coding->produced;
1523 unsigned char *dst_end = coding->destination + coding->dst_bytes;
d311d28c 1524 ptrdiff_t produced_chars = 0;
df7492f9
KH
1525 int c;
1526
a470d443
KH
1527 if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1528 {
1529 ASSURE_DESTINATION (3);
1530 EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1531 CODING_UTF_8_BOM (coding) = utf_without_bom;
1532 }
1533
df7492f9 1534 if (multibytep)
aa72b389 1535 {
df7492f9
KH
1536 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1537
1538 while (charbuf < charbuf_end)
b73bfc1c 1539 {
df7492f9 1540 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
8f924df7 1541
df7492f9
KH
1542 ASSURE_DESTINATION (safe_room);
1543 c = *charbuf++;
28f67a95
KH
1544 if (CHAR_BYTE8_P (c))
1545 {
1546 c = CHAR_TO_BYTE8 (c);
1547 EMIT_ONE_BYTE (c);
1548 }
1549 else
1550 {
db274c7a 1551 CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
28f67a95
KH
1552 for (p = str; p < pend; p++)
1553 EMIT_ONE_BYTE (*p);
1554 }
b73bfc1c 1555 }
aa72b389 1556 }
df7492f9
KH
1557 else
1558 {
1559 int safe_room = MAX_MULTIBYTE_LENGTH;
1560
1561 while (charbuf < charbuf_end)
b73bfc1c 1562 {
df7492f9
KH
1563 ASSURE_DESTINATION (safe_room);
1564 c = *charbuf++;
f03caae0
KH
1565 if (CHAR_BYTE8_P (c))
1566 *dst++ = CHAR_TO_BYTE8 (c);
1567 else
db274c7a 1568 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
df7492f9 1569 produced_chars++;
4ed46869
KH
1570 }
1571 }
065e3595 1572 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
1573 coding->produced_char += produced_chars;
1574 coding->produced = dst - coding->destination;
1575 return 0;
4ed46869
KH
1576}
1577
b73bfc1c 1578
df7492f9 1579/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
1580 Check if a text is encoded in one of UTF-16 based coding systems.
1581 If it is, return 1, else return 0. */
aa72b389 1582
df7492f9
KH
/* Nonzero if the 16-bit unit VAL is a high (leading) surrogate,
   i.e. lies in 0xD800..0xDBFF.  */
#define UTF_16_HIGH_SURROGATE_P(val)    \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if the 16-bit unit VAL is a low (trailing) surrogate,
   i.e. lies in 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val)     \
  (((val) & 0xFC00) == 0xDC00)
93dec019 1588
aa72b389 1589
df7492f9 1590static int
cf84bb53
JB
1591detect_coding_utf_16 (struct coding_system *coding,
1592 struct coding_detection_info *detect_info)
aa72b389 1593{
ef1b0ba7 1594 const unsigned char *src = coding->source;
8f924df7 1595 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9 1596 int multibytep = coding->src_multibyte;
df7492f9 1597 int c1, c2;
aa72b389 1598
ff0dacd7 1599 detect_info->checked |= CATEGORY_MASK_UTF_16;
ff0dacd7 1600 if (coding->mode & CODING_MODE_LAST_BLOCK
24a73b0a 1601 && (coding->src_chars & 1))
ff0dacd7
KH
1602 {
1603 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1604 return 0;
1605 }
24a73b0a 1606
f56a4450 1607 TWO_MORE_BYTES (c1, c2);
df7492f9 1608 if ((c1 == 0xFF) && (c2 == 0xFE))
aa72b389 1609 {
b49a1807
KH
1610 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1611 | CATEGORY_MASK_UTF_16_AUTO);
24a73b0a
KH
1612 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1613 | CATEGORY_MASK_UTF_16_BE_NOSIG
1614 | CATEGORY_MASK_UTF_16_LE_NOSIG);
aa72b389 1615 }
df7492f9 1616 else if ((c1 == 0xFE) && (c2 == 0xFF))
ff0dacd7 1617 {
b49a1807
KH
1618 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1619 | CATEGORY_MASK_UTF_16_AUTO);
24a73b0a
KH
1620 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1621 | CATEGORY_MASK_UTF_16_BE_NOSIG
1622 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1623 }
220eeac9 1624 else if (c2 < 0)
f56a4450
KH
1625 {
1626 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1627 return 0;
1628 }
2f3cbb32 1629 else
24a73b0a 1630 {
2f3cbb32
KH
1631 /* We check the dispersion of Eth and Oth bytes where E is even and
1632 O is odd. If both are high, we assume binary data.*/
1633 unsigned char e[256], o[256];
1634 unsigned e_num = 1, o_num = 1;
1635
1636 memset (e, 0, 256);
1637 memset (o, 0, 256);
1638 e[c1] = 1;
1639 o[c2] = 1;
1640
cc13543e
KH
1641 detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1642 |CATEGORY_MASK_UTF_16_BE
1643 | CATEGORY_MASK_UTF_16_LE);
2f3cbb32 1644
7f1faf1c
KH
1645 while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1646 != CATEGORY_MASK_UTF_16)
2f3cbb32 1647 {
f56a4450 1648 TWO_MORE_BYTES (c1, c2);
220eeac9 1649 if (c2 < 0)
f56a4450 1650 break;
2f3cbb32
KH
1651 if (! e[c1])
1652 {
1653 e[c1] = 1;
1654 e_num++;
cc13543e
KH
1655 if (e_num >= 128)
1656 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
2f3cbb32
KH
1657 }
1658 if (! o[c2])
1659 {
977b85f4 1660 o[c2] = 1;
2f3cbb32 1661 o_num++;
cc13543e
KH
1662 if (o_num >= 128)
1663 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
2f3cbb32
KH
1664 }
1665 }
2f3cbb32 1666 return 0;
ff0dacd7 1667 }
2f3cbb32 1668
df7492f9 1669 no_more_source:
ff0dacd7 1670 return 1;
df7492f9 1671}
aa72b389 1672
df7492f9 1673static void
971de7fb 1674decode_coding_utf_16 (struct coding_system *coding)
df7492f9 1675{
8f924df7
KH
1676 const unsigned char *src = coding->source + coding->consumed;
1677 const unsigned char *src_end = coding->source + coding->src_bytes;
1678 const unsigned char *src_base;
69a80ea3 1679 int *charbuf = coding->charbuf + coding->charbuf_used;
df80c7f0
KH
1680 /* We may produces at most 3 chars in one loop. */
1681 int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
d311d28c 1682 ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
df7492f9 1683 int multibytep = coding->src_multibyte;
a470d443 1684 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
df7492f9
KH
1685 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1686 int surrogate = CODING_UTF_16_SURROGATE (coding);
2735d060 1687 int eol_dos =
0a9564cb 1688 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 1689 int byte_after_cr1 = -1, byte_after_cr2 = -1;
df7492f9 1690
a470d443 1691 if (bom == utf_with_bom)
aa72b389 1692 {
df7492f9 1693 int c, c1, c2;
4af310db 1694
aa72b389 1695 src_base = src;
df7492f9
KH
1696 ONE_MORE_BYTE (c1);
1697 ONE_MORE_BYTE (c2);
e19c3639 1698 c = (c1 << 8) | c2;
aa72b389 1699
b49a1807
KH
1700 if (endian == utf_16_big_endian
1701 ? c != 0xFEFF : c != 0xFFFE)
aa72b389 1702 {
b49a1807
KH
1703 /* The first two bytes are not BOM. Treat them as bytes
1704 for a normal character. */
1705 src = src_base;
1706 coding->errors++;
aa72b389 1707 }
a470d443 1708 CODING_UTF_16_BOM (coding) = utf_without_bom;
b49a1807 1709 }
a470d443 1710 else if (bom == utf_detect_bom)
b49a1807
KH
1711 {
1712 /* We have already tried to detect BOM and failed in
1713 detect_coding. */
a470d443 1714 CODING_UTF_16_BOM (coding) = utf_without_bom;
df7492f9 1715 }
aa72b389 1716
df7492f9
KH
1717 while (1)
1718 {
1719 int c, c1, c2;
1720
1721 src_base = src;
1722 consumed_chars_base = consumed_chars;
1723
df80c7f0 1724 if (charbuf >= charbuf_end)
b71f6f73
KH
1725 {
1726 if (byte_after_cr1 >= 0)
1727 src_base -= 2;
1728 break;
1729 }
df7492f9 1730
119852e7
KH
1731 if (byte_after_cr1 >= 0)
1732 c1 = byte_after_cr1, byte_after_cr1 = -1;
1733 else
1734 ONE_MORE_BYTE (c1);
065e3595
KH
1735 if (c1 < 0)
1736 {
1737 *charbuf++ = -c1;
1738 continue;
1739 }
119852e7
KH
1740 if (byte_after_cr2 >= 0)
1741 c2 = byte_after_cr2, byte_after_cr2 = -1;
1742 else
1743 ONE_MORE_BYTE (c2);
065e3595
KH
1744 if (c2 < 0)
1745 {
1746 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1747 *charbuf++ = -c2;
1748 continue;
1749 }
df7492f9 1750 c = (endian == utf_16_big_endian
e19c3639 1751 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
119852e7 1752
df7492f9 1753 if (surrogate)
fd3ae0b9 1754 {
df7492f9 1755 if (! UTF_16_LOW_SURROGATE_P (c))
fd3ae0b9 1756 {
df7492f9
KH
1757 if (endian == utf_16_big_endian)
1758 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1759 else
1760 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1761 *charbuf++ = c1;
1762 *charbuf++ = c2;
1763 coding->errors++;
1764 if (UTF_16_HIGH_SURROGATE_P (c))
1765 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
fd3ae0b9 1766 else
df7492f9 1767 *charbuf++ = c;
fd3ae0b9
KH
1768 }
1769 else
df7492f9
KH
1770 {
1771 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1772 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
29f7ffd0 1773 *charbuf++ = 0x10000 + c;
df7492f9 1774 }
fd3ae0b9 1775 }
aa72b389 1776 else
df7492f9
KH
1777 {
1778 if (UTF_16_HIGH_SURROGATE_P (c))
1779 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1780 else
119852e7 1781 {
2735d060 1782 if (eol_dos && c == '\r')
119852e7
KH
1783 {
1784 ONE_MORE_BYTE (byte_after_cr1);
1785 ONE_MORE_BYTE (byte_after_cr2);
1786 }
1787 *charbuf++ = c;
1788 }
8f924df7 1789 }
aa72b389 1790 }
df7492f9
KH
1791
1792 no_more_source:
1793 coding->consumed_char += consumed_chars_base;
1794 coding->consumed = src_base - coding->source;
1795 coding->charbuf_used = charbuf - coding->charbuf;
aa72b389 1796}
b73bfc1c 1797
df7492f9 1798static int
971de7fb 1799encode_coding_utf_16 (struct coding_system *coding)
df7492f9
KH
1800{
1801 int multibytep = coding->dst_multibyte;
1802 int *charbuf = coding->charbuf;
1803 int *charbuf_end = charbuf + coding->charbuf_used;
1804 unsigned char *dst = coding->destination + coding->produced;
1805 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1806 int safe_room = 8;
a470d443 1807 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
df7492f9 1808 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
d311d28c 1809 ptrdiff_t produced_chars = 0;
df7492f9 1810 int c;
4ed46869 1811
a470d443 1812 if (bom != utf_without_bom)
df7492f9
KH
1813 {
1814 ASSURE_DESTINATION (safe_room);
1815 if (big_endian)
df7492f9 1816 EMIT_TWO_BYTES (0xFE, 0xFF);
880cf180
KH
1817 else
1818 EMIT_TWO_BYTES (0xFF, 0xFE);
a470d443 1819 CODING_UTF_16_BOM (coding) = utf_without_bom;
df7492f9
KH
1820 }
1821
1822 while (charbuf < charbuf_end)
1823 {
1824 ASSURE_DESTINATION (safe_room);
1825 c = *charbuf++;
60afa08d 1826 if (c > MAX_UNICODE_CHAR)
e19c3639 1827 c = coding->default_char;
df7492f9
KH
1828
1829 if (c < 0x10000)
1830 {
1831 if (big_endian)
1832 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1833 else
1834 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1835 }
1836 else
1837 {
1838 int c1, c2;
1839
1840 c -= 0x10000;
1841 c1 = (c >> 10) + 0xD800;
1842 c2 = (c & 0x3FF) + 0xDC00;
1843 if (big_endian)
1844 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1845 else
1846 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1847 }
1848 }
065e3595 1849 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
1850 coding->produced = dst - coding->destination;
1851 coding->produced_char += produced_chars;
1852 return 0;
1853}
1854
1855\f
1856/*** 6. Old Emacs' internal format (emacs-mule) ***/
1857
1858/* Emacs' internal format for representation of multiple character
1859 sets is a kind of multi-byte encoding, i.e. characters are
1860 represented by variable-length sequences of one-byte codes.
1861
1862 ASCII characters and control characters (e.g. `tab', `newline') are
1863 represented by one-byte sequences which are their ASCII codes, in
1864 the range 0x00 through 0x7F.
1865
1866 8-bit characters of the range 0x80..0x9F are represented by
1867 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1868 code + 0x20).
1869
1870 8-bit characters of the range 0xA0..0xFF are represented by
1871 one-byte sequences which are their 8-bit code.
1872
1873 The other characters are represented by a sequence of `base
1874 leading-code', optional `extended leading-code', and one or two
1875 `position-code's. The length of the sequence is determined by the
1876 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1877 whereas extended leading-code and position-code take the range 0xA0
1878 through 0xFF. See `charset.h' for more details about leading-code
1879 and position-code.
1880
1881 --- CODE RANGE of Emacs' internal format ---
1882 character set range
1883 ------------- -----
1884 ascii 0x00..0x7F
1885 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1886 eight-bit-graphic 0xA0..0xFF
1887 ELSE 0x81..0x9D + [0xA0..0xFF]+
1888 ---------------------------------------------
1889
1890 As this is the internal character representation, the format is
1891 usually not used externally (i.e. in a file or in a data sent to a
1892 process). But, it is possible to have a text externally in this
1893 format (i.e. by encoding by the coding system `emacs-mule').
1894
1895 In that case, a sequence of one-byte codes has a slightly different
1896 form.
1897
1898 At first, all characters in eight-bit-control are represented by
1899 one-byte sequences which are their 8-bit code.
1900
1901 Next, character composition data are represented by the byte
1902 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1903 where,
e951386e 1904 METHOD is 0xF2 plus one of composition method (enum
df7492f9
KH
1905 composition_method),
1906
1907 BYTES is 0xA0 plus a byte length of this composition data,
1908
e951386e 1909 CHARS is 0xA0 plus a number of characters composed by this
df7492f9
KH
1910 data,
1911
ad1746f5 1912 COMPONENTs are characters of multibyte form or composition
df7492f9
KH
1913 rules encoded by two bytes of ASCII codes.
1914
1915 In addition, for backward compatibility, the following formats are
1916 also recognized as composition data on decoding.
1917
1918 0x80 MSEQ ...
1919 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1920
1921 Here,
1922 MSEQ is a multibyte form but in this special format:
1923 ASCII: 0xA0 ASCII_CODE+0x80,
1924 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1925 RULE is a one byte code of the range 0xA0..0xF0 that
1926 represents a composition rule.
1927 */
1928
/* Table indexed by a byte value.  The emacs-mule handlers below read
   emacs_mule_bytes[B] as the total number of bytes (1 through 4) of
   the sequence whose first byte is B.  No initializer is given here;
   presumably the table is filled in during setup elsewhere -- TODO
   confirm.  */
1929char emacs_mule_bytes[256];
1930
e951386e
KH
1931
1932/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1933 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1934 else return 0. */
1935
1936static int
cf84bb53
JB
1937detect_coding_emacs_mule (struct coding_system *coding,
1938 struct coding_detection_info *detect_info)
e951386e
KH
1939{
1940 const unsigned char *src = coding->source, *src_base;
1941 const unsigned char *src_end = coding->source + coding->src_bytes;
1942 int multibytep = coding->src_multibyte;
d311d28c 1943 ptrdiff_t consumed_chars = 0;
e951386e
KH
1944 int c;
1945 int found = 0;
1946
1947 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1948 /* A coding system of this category is always ASCII compatible. */
1949 src += coding->head_ascii;
1950
1951 while (1)
1952 {
1953 src_base = src;
1954 ONE_MORE_BYTE (c);
1955 if (c < 0)
1956 continue;
1957 if (c == 0x80)
1958 {
1959 /* Perhaps the start of composite character. We simply skip
1960 it because analyzing it is too heavy for detecting. But,
1961 at least, we check that the composite character
1962 constitutes of more than 4 bytes. */
2735d060 1963 const unsigned char *src_start;
e951386e
KH
1964
1965 repeat:
2735d060 1966 src_start = src;
e951386e
KH
1967 do
1968 {
1969 ONE_MORE_BYTE (c);
1970 }
1971 while (c >= 0xA0);
1972
2735d060 1973 if (src - src_start <= 4)
e951386e
KH
1974 break;
1975 found = CATEGORY_MASK_EMACS_MULE;
1976 if (c == 0x80)
1977 goto repeat;
1978 }
1979
1980 if (c < 0x80)
1981 {
1982 if (c < 0x20
1983 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1984 break;
1985 }
1986 else
1987 {
396475b7 1988 int more_bytes = emacs_mule_bytes[c] - 1;
e951386e
KH
1989
1990 while (more_bytes > 0)
1991 {
1992 ONE_MORE_BYTE (c);
1993 if (c < 0xA0)
1994 {
1995 src--; /* Unread the last byte. */
1996 break;
1997 }
1998 more_bytes--;
1999 }
2000 if (more_bytes != 0)
2001 break;
2002 found = CATEGORY_MASK_EMACS_MULE;
2003 }
2004 }
2005 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2006 return 0;
2007
2008 no_more_source:
2009 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
2010 {
2011 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2012 return 0;
2013 }
2014 detect_info->found |= found;
2015 return 1;
2016}
2017
2018
2019/* Parse emacs-mule multibyte sequence at SRC and return the decoded
2020 character. If CMP_STATUS indicates that we must expect MSEQ or
2021 RULE described above, decode it and return the negative value of
685ebdc8 2022 the decoded character or rule. If an invalid byte is found, return
e951386e
KH
2023 -1. If SRC is too short, return -2. */
2024
e2f1bab9 2025static int
cf84bb53
JB
2026emacs_mule_char (struct coding_system *coding, const unsigned char *src,
2027 int *nbytes, int *nchars, int *id,
2028 struct composition_status *cmp_status)
df7492f9 2029{
8f924df7
KH
2030 const unsigned char *src_end = coding->source + coding->src_bytes;
2031 const unsigned char *src_base = src;
df7492f9 2032 int multibytep = coding->src_multibyte;
2735d060 2033 int charset_ID;
df7492f9
KH
2034 unsigned code;
2035 int c;
2036 int consumed_chars = 0;
e951386e 2037 int mseq_found = 0;
df7492f9
KH
2038
2039 ONE_MORE_BYTE (c);
065e3595 2040 if (c < 0)
df7492f9 2041 {
065e3595 2042 c = -c;
2735d060 2043 charset_ID = emacs_mule_charset[0];
065e3595
KH
2044 }
2045 else
2046 {
4d41e8b7
KH
2047 if (c >= 0xA0)
2048 {
e951386e
KH
2049 if (cmp_status->state != COMPOSING_NO
2050 && cmp_status->old_form)
4d41e8b7 2051 {
e951386e
KH
2052 if (cmp_status->state == COMPOSING_CHAR)
2053 {
2054 if (c == 0xA0)
2055 {
2056 ONE_MORE_BYTE (c);
2057 c -= 0x80;
2058 if (c < 0)
2059 goto invalid_code;
2060 }
2061 else
2062 c -= 0x20;
2063 mseq_found = 1;
2064 }
2065 else
2066 {
2067 *nbytes = src - src_base;
2068 *nchars = consumed_chars;
2069 return -c;
2070 }
4d41e8b7
KH
2071 }
2072 else
e951386e 2073 goto invalid_code;
4d41e8b7
KH
2074 }
2075
065e3595 2076 switch (emacs_mule_bytes[c])
b73bfc1c 2077 {
065e3595 2078 case 2:
2735d060 2079 if ((charset_ID = emacs_mule_charset[c]) < 0)
df7492f9
KH
2080 goto invalid_code;
2081 ONE_MORE_BYTE (c);
9ffd559c 2082 if (c < 0xA0)
065e3595 2083 goto invalid_code;
df7492f9 2084 code = c & 0x7F;
065e3595
KH
2085 break;
2086
2087 case 3:
2088 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2089 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2090 {
2091 ONE_MORE_BYTE (c);
2735d060 2092 if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
065e3595
KH
2093 goto invalid_code;
2094 ONE_MORE_BYTE (c);
9ffd559c 2095 if (c < 0xA0)
065e3595
KH
2096 goto invalid_code;
2097 code = c & 0x7F;
2098 }
2099 else
2100 {
2735d060 2101 if ((charset_ID = emacs_mule_charset[c]) < 0)
065e3595
KH
2102 goto invalid_code;
2103 ONE_MORE_BYTE (c);
9ffd559c 2104 if (c < 0xA0)
065e3595
KH
2105 goto invalid_code;
2106 code = (c & 0x7F) << 8;
2107 ONE_MORE_BYTE (c);
9ffd559c 2108 if (c < 0xA0)
065e3595
KH
2109 goto invalid_code;
2110 code |= c & 0x7F;
2111 }
2112 break;
2113
2114 case 4:
2115 ONE_MORE_BYTE (c);
2735d060 2116 if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
df7492f9
KH
2117 goto invalid_code;
2118 ONE_MORE_BYTE (c);
9ffd559c 2119 if (c < 0xA0)
065e3595 2120 goto invalid_code;
781d7a48 2121 code = (c & 0x7F) << 8;
df7492f9 2122 ONE_MORE_BYTE (c);
9ffd559c 2123 if (c < 0xA0)
065e3595 2124 goto invalid_code;
df7492f9 2125 code |= c & 0x7F;
065e3595 2126 break;
df7492f9 2127
065e3595
KH
2128 case 1:
2129 code = c;
2735d060 2130 charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
065e3595 2131 break;
df7492f9 2132
065e3595
KH
2133 default:
2134 abort ();
2135 }
b84ae584 2136 CODING_DECODE_CHAR (coding, src, src_base, src_end,
2735d060 2137 CHARSET_FROM_ID (charset_ID), code, c);
065e3595
KH
2138 if (c < 0)
2139 goto invalid_code;
df7492f9 2140 }
df7492f9
KH
2141 *nbytes = src - src_base;
2142 *nchars = consumed_chars;
ff0dacd7 2143 if (id)
2735d060 2144 *id = charset_ID;
e951386e 2145 return (mseq_found ? -c : c);
df7492f9
KH
2146
2147 no_more_source:
2148 return -2;
2149
2150 invalid_code:
2151 return -1;
2152}
2153
2154
e951386e 2155/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
df7492f9 2156
e951386e
KH
2157/* Handle these composition sequences ('|': the end of header elements,
2158 BYTES and CHARS >= 0xA0):
df7492f9 2159
e951386e
KH
2160 (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2161 (2) altchar composition: 0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2162 (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
df7492f9 2163
e951386e 2164 and these old forms:
1a4990fb 2165
e951386e
KH
2166 (4) relative composition: 0x80 | MSEQ ... MSEQ
2167 (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
df7492f9 2168
e951386e
KH
2169 When the starter 0x80 and the following header elements are found,
2170 this annotation header is produced.
df7492f9 2171
e951386e 2172 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
df7492f9 2173
e951386e
KH
2174 NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2175 NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
df7492f9 2176
e951386e
KH
2177 Then, upon reading the following elements, these codes are produced
2178 until the composition end is found:
df7492f9 2179
e951386e
KH
2180 (1) CHAR ... CHAR
2181 (2) ALT ... ALT CHAR ... CHAR
2182 (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2183 (4) CHAR ... CHAR
2184 (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
4ed46869 2185
e951386e
KH
2186 When the composition end is found, LENGTH and NCHARS in the
2187 annotation header are updated as below:
b73bfc1c 2188
e951386e
KH
2189 (1) LENGTH: unchanged, NCHARS: unchanged
2190 (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2191 (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2192 (4) LENGTH: unchanged, NCHARS: number of CHARs
2193 (5) LENGTH: unchanged, NCHARS: number of CHARs
df7492f9 2194
e951386e
KH
2195 If an error is found while composing, the annotation header is
2196 changed to the original composition header (plus filler -1s) as
2197 below:
2198
2199 (1),(2),(3) [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2200 (5) [ 0x80 0xFF -1 -1 -1 ]
2201
2202 and the sequence [ -2 DECODED-RULE ] is changed to the original
2203 byte sequence as below:
2204 o the original byte sequence is B: [ B -1 ]
2205 o the original byte sequence is B1 B2: [ B1 B2 ]
2206
2207 Most of the routines are implemented by macros because many
2208 variables and labels in the caller decode_coding_emacs_mule must be
2209 accessible, and they are usually called just once (thus doesn't
2210 increase the size of compiled object). */
2211
/* Decode the Emacs 20 style composition rule held in C and store the
   result in RULE.  C is first reduced by 0xA0 (so C itself is
   modified) and must then lie in 0..80, otherwise control jumps to
   the label invalid_code of the caller.  The reduced value is split
   into two base-9 digits; a digit of 4 is replaced by 10 before the
   pair is handed to COMPOSITION_ENCODE_RULE.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)                  \
  do {                                                                  \
    int gref, nref;                                                     \
                                                                        \
    (c) -= 0xA0;                                                        \
    if ((c) < 0 || (c) > 80)                                            \
      goto invalid_code;                                                \
    gref = (c) / 9;                                                     \
    nref = (c) % 9;                                                     \
    if (gref == 4)                                                      \
      gref = 10;                                                        \
    if (nref == 4)                                                      \
      nref = 10;                                                        \
    (rule) = COMPOSITION_ENCODE_RULE (gref, nref);                      \
  } while (0)
2228
2229
e951386e
KH
/* Decode the Emacs 21 style composition rule made of C and the byte
   that follows it in the source, and store the result in RULE.  Each
   of the two bytes, reduced by 0x20, must lie in 0..80, otherwise
   control jumps to the label invalid_code of the caller.  The second
   byte is fetched with ONE_MORE_BYTE, so C holds that byte
   afterwards.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)                  \
  do {                                                                  \
    int gref = (c) - 0x20;                                              \
    int nref;                                                           \
                                                                        \
    if (gref < 0 || gref > 80)                                          \
      goto invalid_code;                                                \
    ONE_MORE_BYTE (c);                                                  \
    nref = (c) - 0x20;                                                  \
    if (nref < 0 || nref > 80)                                          \
      goto invalid_code;                                                \
    (rule) = COMPOSITION_ENCODE_RULE (gref, nref);                      \
  } while (0)
2247
2248
e951386e
KH
/* Start of the Emacs 21 style format.  On entry C holds the method
   byte (0xF2 plus the composition method).  The next two source bytes
   are BYTES + 0xA0 and CHARS + 0xA0, where BYTES is the byte length
   of this composition information and CHARS is the number of
   characters composed by it.  Out-of-range values lead to the label
   invalid_code of the caller.  On success *CMP_STATUS is set up and
   the annotation header is added to CHARBUF.  */

#define DECODE_EMACS_MULE_21_COMPOSITION()                              \
  do {                                                                  \
    enum composition_method method = c - 0xF2;                          \
    int nbytes, nchars;                                                 \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    nbytes = c - 0xA0;                                                  \
    if (nbytes < 3                                                      \
        || (method == COMPOSITION_RELATIVE && nbytes != 4))             \
      goto invalid_code;                                                \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    nchars = c - 0xA0;                                                  \
    if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
      goto invalid_code;                                                \
                                                                        \
    cmp_status->old_form = 0;                                           \
    cmp_status->method = method;                                        \
    cmp_status->state = (method == COMPOSITION_RELATIVE                 \
                         ? COMPOSING_CHAR                               \
                         : COMPOSING_COMPONENT_CHAR);                   \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    cmp_status->nchars = nchars;                                        \
    cmp_status->ncomps = nbytes - 4;                                    \
    ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
  } while (0)
93dec019 2280
aa72b389 2281
e951386e
KH
/* Start of the Emacs 20 style format for relative composition: mark
   *CMP_STATUS as an old-form relative composition with no characters
   or components counted yet, and add an annotation header with zero
   NCHARS and NBYTES to CHARBUF.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()                     \
  do {                                                                  \
    cmp_status->old_form = 1;                                           \
    cmp_status->method = COMPOSITION_RELATIVE;                          \
    cmp_status->state = COMPOSING_CHAR;                                 \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    cmp_status->ncomps = 0;                                             \
    cmp_status->nchars = 0;                                             \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);           \
  } while (0)
2293
2294
/* Start of the Emacs 20 style format for rule-base composition: mark
   *CMP_STATUS as an old-form composition with rules, with no
   characters or components counted yet, and add an annotation header
   with zero NCHARS and NBYTES to CHARBUF.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()                     \
  do {                                                                  \
    cmp_status->old_form = 1;                                           \
    cmp_status->method = COMPOSITION_WITH_RULE;                         \
    cmp_status->state = COMPOSING_CHAR;                                 \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    cmp_status->ncomps = 0;                                             \
    cmp_status->nchars = 0;                                             \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);           \
  } while (0)
2306
2307
e951386e
KH
/* Called right after the starter byte 0x80 has been read.  Read the
   next byte into C and choose the composition format from it:

     0xF2 + a method in COMPOSITION_RELATIVE ..
       COMPOSITION_WITH_RULE_ALTCHARS  -- Emacs 21 style,
     0xA0 .. 0xBF                      -- Emacs 20 relative composition,
     0xFF                              -- Emacs 20 rule-base composition.

   Any other byte leads to the label invalid_code of the caller.  */

#define DECODE_EMACS_MULE_COMPOSITION_START()                           \
  do {                                                                  \
    const unsigned char *current_src = src;                             \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    if (c - 0xF2 >= COMPOSITION_RELATIVE                                \
        && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)                  \
      {                                                                 \
        DECODE_EMACS_MULE_21_COMPOSITION ();                            \
      }                                                                 \
    else if (c < 0xA0)                                                  \
      {                                                                 \
        goto invalid_code;                                              \
      }                                                                 \
    else if (c < 0xC0)                                                  \
      {                                                                 \
        DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();                   \
        /* Re-read C as a composition component.  */                    \
        src = current_src;                                              \
      }                                                                 \
    else if (c == 0xFF)                                                 \
      {                                                                 \
        DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();                   \
      }                                                                 \
    else                                                                \
      {                                                                 \
        goto invalid_code;                                              \
      }                                                                 \
  } while (0)
2331
/* Close the composition described by *CMP_STATUS.  The annotation
   header starts CMP_STATUS->length elements before CHARBUF.  For the
   old form its third element is set to the number of characters
   collected; for a new-form method above COMPOSITION_RELATIVE its
   first element is set to the third element minus the length.  The
   state then becomes COMPOSING_NO.  */

#define EMACS_MULE_COMPOSITION_END()                                    \
  do {                                                                  \
    int idx = - cmp_status->length;                                     \
                                                                        \
    if (cmp_status->old_form)                                           \
      {                                                                 \
        charbuf[idx + 2] = cmp_status->nchars;                          \
      }                                                                 \
    else if (cmp_status->method > COMPOSITION_RELATIVE)                 \
      {                                                                 \
        charbuf[idx] = charbuf[idx + 2] - cmp_status->length;           \
      }                                                                 \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
2342
2343
2344static int
cf84bb53
JB
2345emacs_mule_finish_composition (int *charbuf,
2346 struct composition_status *cmp_status)
e951386e
KH
2347{
2348 int idx = - cmp_status->length;
2349 int new_chars;
2350
2351 if (cmp_status->old_form && cmp_status->nchars > 0)
2352 {
2353 charbuf[idx + 2] = cmp_status->nchars;
2354 new_chars = 0;
2355 if (cmp_status->method == COMPOSITION_WITH_RULE
2356 && cmp_status->state == COMPOSING_CHAR)
2357 {
2358 /* The last rule was invalid. */
2359 int rule = charbuf[-1] + 0xA0;
2360
2361 charbuf[-2] = BYTE8_TO_CHAR (rule);
2362 charbuf[-1] = -1;
2363 new_chars = 1;
2364 }
2365 }
2366 else
2367 {
2368 charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2369
2370 if (cmp_status->method == COMPOSITION_WITH_RULE)
2371 {
2372 charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2373 charbuf[idx++] = -3;
2374 charbuf[idx++] = 0;
2375 new_chars = 1;
2376 }
2377 else
2378 {
2379 int nchars = charbuf[idx + 1] + 0xA0;
2380 int nbytes = charbuf[idx + 2] + 0xA0;
2381
2382 charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2383 charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2384 charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2385 charbuf[idx++] = -1;
2386 new_chars = 4;
2387 }
2388 }
2389 cmp_status->state = COMPOSING_NO;
2390 return new_chars;
2391}
2392
/* If a composition is in progress, finish it with
   emacs_mule_finish_composition and add the returned count to
   char_offset of the caller.  */

#define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                           \
  do {                                                                  \
    if (cmp_status->state != COMPOSING_NO)                              \
      {                                                                 \
        char_offset                                                     \
          += emacs_mule_finish_composition (charbuf, cmp_status);       \
      }                                                                 \
  } while (0)
2398
aa72b389
KH
2399
2400static void
971de7fb 2401decode_coding_emacs_mule (struct coding_system *coding)
aa72b389 2402{
8f924df7
KH
2403 const unsigned char *src = coding->source + coding->consumed;
2404 const unsigned char *src_end = coding->source + coding->src_bytes;
2405 const unsigned char *src_base;
69a80ea3 2406 int *charbuf = coding->charbuf + coding->charbuf_used;
ad1746f5
KH
2407 /* We may produce two annotations (charset and composition) in one
2408 loop and one more charset annotation at the end. */
69a80ea3 2409 int *charbuf_end
15cbd324
EZ
2410 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2411 /* We can produce up to 2 characters in a loop. */
2412 - 1;
d311d28c 2413 ptrdiff_t consumed_chars = 0, consumed_chars_base;
df7492f9 2414 int multibytep = coding->src_multibyte;
d311d28c
PE
2415 ptrdiff_t char_offset = coding->produced_char;
2416 ptrdiff_t last_offset = char_offset;
ff0dacd7 2417 int last_id = charset_ascii;
2735d060 2418 int eol_dos =
0a9564cb 2419 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 2420 int byte_after_cr = -1;
e951386e 2421 struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
aa72b389 2422
e951386e
KH
2423 if (cmp_status->state != COMPOSING_NO)
2424 {
2425 int i;
2426
15cbd324
EZ
2427 if (charbuf_end - charbuf < cmp_status->length)
2428 abort ();
e951386e
KH
2429 for (i = 0; i < cmp_status->length; i++)
2430 *charbuf++ = cmp_status->carryover[i];
2431 coding->annotated = 1;
2432 }
2433
aa72b389
KH
2434 while (1)
2435 {
ee05f961 2436 int c, id IF_LINT (= 0);
df7492f9 2437
aa72b389 2438 src_base = src;
df7492f9
KH
2439 consumed_chars_base = consumed_chars;
2440
2441 if (charbuf >= charbuf_end)
b71f6f73
KH
2442 {
2443 if (byte_after_cr >= 0)
2444 src_base--;
2445 break;
2446 }
aa72b389 2447
119852e7
KH
2448 if (byte_after_cr >= 0)
2449 c = byte_after_cr, byte_after_cr = -1;
2450 else
2451 ONE_MORE_BYTE (c);
e951386e
KH
2452
2453 if (c < 0 || c == 0x80)
065e3595 2454 {
e951386e
KH
2455 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2456 if (c < 0)
2457 {
2458 *charbuf++ = -c;
2459 char_offset++;
2460 }
2461 else
2462 DECODE_EMACS_MULE_COMPOSITION_START ();
2463 continue;
065e3595 2464 }
e951386e
KH
2465
2466 if (c < 0x80)
aa72b389 2467 {
2735d060 2468 if (eol_dos && c == '\r')
119852e7 2469 ONE_MORE_BYTE (byte_after_cr);
e951386e
KH
2470 id = charset_ascii;
2471 if (cmp_status->state != COMPOSING_NO)
2472 {
2473 if (cmp_status->old_form)
2474 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2475 else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2476 cmp_status->ncomps--;
2477 }
2478 }
2479 else
2480 {
ee05f961 2481 int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
75f80e63
EZ
2482 /* emacs_mule_char can load a charset map from a file, which
2483 allocates a large structure and might cause buffer text
2484 to be relocated as result. Thus, we need to remember the
ad1746f5 2485 original pointer to buffer text, and fix up all related
75f80e63
EZ
2486 pointers after the call. */
2487 const unsigned char *orig = coding->source;
d311d28c 2488 ptrdiff_t offset;
e951386e
KH
2489
2490 c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2491 cmp_status);
75f80e63
EZ
2492 offset = coding->source - orig;
2493 if (offset)
2494 {
2495 src += offset;
2496 src_base += offset;
2497 src_end += offset;
2498 }
e951386e
KH
2499 if (c < 0)
2500 {
2501 if (c == -1)
2502 goto invalid_code;
2503 if (c == -2)
2504 break;
2505 }
2506 src = src_base + nbytes;
2507 consumed_chars = consumed_chars_base + nchars;
2508 if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2509 cmp_status->ncomps -= nchars;
2510 }
2511
ad1746f5 2512 /* Now if C >= 0, we found a normally encoded character, if C <
e951386e
KH
2513 0, we found an old-style composition component character or
2514 rule. */
2515
2516 if (cmp_status->state == COMPOSING_NO)
2517 {
2518 if (last_id != id)
2519 {
2520 if (last_id != charset_ascii)
2521 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2522 last_id);
2523 last_id = id;
2524 last_offset = char_offset;
2525 }
df7492f9
KH
2526 *charbuf++ = c;
2527 char_offset++;
aa72b389 2528 }
e951386e 2529 else if (cmp_status->state == COMPOSING_CHAR)
df7492f9 2530 {
e951386e
KH
2531 if (cmp_status->old_form)
2532 {
2533 if (c >= 0)
2534 {
2535 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2536 *charbuf++ = c;
2537 char_offset++;
2538 }
2539 else
2540 {
2541 *charbuf++ = -c;
2542 cmp_status->nchars++;
2543 cmp_status->length++;
2544 if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2545 EMACS_MULE_COMPOSITION_END ();
2546 else if (cmp_status->method == COMPOSITION_WITH_RULE)
2547 cmp_status->state = COMPOSING_RULE;
2548 }
2549 }
df7492f9 2550 else
e951386e
KH
2551 {
2552 *charbuf++ = c;
2553 cmp_status->length++;
2554 cmp_status->nchars--;
2555 if (cmp_status->nchars == 0)
2556 EMACS_MULE_COMPOSITION_END ();
2557 }
df7492f9 2558 }
e951386e 2559 else if (cmp_status->state == COMPOSING_RULE)
df7492f9 2560 {
e951386e 2561 int rule;
ff0dacd7 2562
e951386e 2563 if (c >= 0)
df7492f9 2564 {
e951386e
KH
2565 EMACS_MULE_COMPOSITION_END ();
2566 *charbuf++ = c;
2567 char_offset++;
df7492f9 2568 }
e951386e 2569 else
ff0dacd7 2570 {
e951386e
KH
2571 c = -c;
2572 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2573 if (rule < 0)
2574 goto invalid_code;
2575 *charbuf++ = -2;
2576 *charbuf++ = rule;
2577 cmp_status->length += 2;
2578 cmp_status->state = COMPOSING_CHAR;
ff0dacd7 2579 }
e951386e
KH
2580 }
2581 else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2582 {
df7492f9 2583 *charbuf++ = c;
e951386e
KH
2584 cmp_status->length++;
2585 if (cmp_status->ncomps == 0)
2586 cmp_status->state = COMPOSING_CHAR;
2587 else if (cmp_status->ncomps > 0)
2588 {
2589 if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2590 cmp_status->state = COMPOSING_COMPONENT_RULE;
2591 }
2592 else
2593 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
df7492f9 2594 }
e951386e
KH
2595 else /* COMPOSING_COMPONENT_RULE */
2596 {
2597 int rule;
2598
2599 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2600 if (rule < 0)
2601 goto invalid_code;
2602 *charbuf++ = -2;
2603 *charbuf++ = rule;
2604 cmp_status->length += 2;
2605 cmp_status->ncomps--;
2606 if (cmp_status->ncomps > 0)
2607 cmp_status->state = COMPOSING_COMPONENT_CHAR;
2608 else
2609 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2610 }
2611 continue;
2612
df7492f9 2613 invalid_code:
e951386e 2614 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
df7492f9
KH
2615 src = src_base;
2616 consumed_chars = consumed_chars_base;
2617 ONE_MORE_BYTE (c);
2618 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 2619 char_offset++;
df7492f9
KH
2620 coding->errors++;
2621 }
2622
2623 no_more_source:
e951386e
KH
2624 if (cmp_status->state != COMPOSING_NO)
2625 {
2626 if (coding->mode & CODING_MODE_LAST_BLOCK)
2627 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2628 else
2629 {
2630 int i;
2631
2632 charbuf -= cmp_status->length;
2633 for (i = 0; i < cmp_status->length; i++)
2634 cmp_status->carryover[i] = charbuf[i];
2635 }
2636 }
ff0dacd7 2637 if (last_id != charset_ascii)
69a80ea3 2638 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
2639 coding->consumed_char += consumed_chars_base;
2640 coding->consumed = src_base - coding->source;
2641 coding->charbuf_used = charbuf - coding->charbuf;
2642}
2643
2644
/* Store in CODES[0] and CODES[1] the leading codes for the emacs-mule
   charset id ID.  An ID below 0xA0 is itself the only leading code
   and CODES[1] is set to 0.  Otherwise CODES[0] is 0x9A, 0x9B, 0x9C
   or 0x9D for ID below 0xE0, below 0xF0, below 0xF5, or from 0xF5 up
   respectively, and CODES[1] is ID.

   ID is evaluated more than once, so it must be free of side
   effects.  The expansion deliberately has no trailing semicolon so
   that the macro behaves like one statement, e.g. in an unbraced
   `if ... else'.  */

#define EMACS_MULE_LEADING_CODES(id, codes)     \
  do {                                          \
    if ((id) < 0xA0)                            \
      (codes)[0] = (id), (codes)[1] = 0;        \
    else if ((id) < 0xE0)                       \
      (codes)[0] = 0x9A, (codes)[1] = (id);     \
    else if ((id) < 0xF0)                       \
      (codes)[0] = 0x9B, (codes)[1] = (id);     \
    else if ((id) < 0xF5)                       \
      (codes)[0] = 0x9C, (codes)[1] = (id);     \
    else                                        \
      (codes)[0] = 0x9D, (codes)[1] = (id);     \
  } while (0)
2658
aa72b389 2659
df7492f9 2660static int
971de7fb 2661encode_coding_emacs_mule (struct coding_system *coding)
df7492f9
KH
2662{
2663 int multibytep = coding->dst_multibyte;
2664 int *charbuf = coding->charbuf;
2665 int *charbuf_end = charbuf + coding->charbuf_used;
2666 unsigned char *dst = coding->destination + coding->produced;
2667 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2668 int safe_room = 8;
d311d28c 2669 ptrdiff_t produced_chars = 0;
24a73b0a 2670 Lisp_Object attrs, charset_list;
df7492f9 2671 int c;
ff0dacd7 2672 int preferred_charset_id = -1;
df7492f9 2673
24a73b0a 2674 CODING_GET_INFO (coding, attrs, charset_list);
eccb6815
KH
2675 if (! EQ (charset_list, Vemacs_mule_charset_list))
2676 {
2677 CODING_ATTR_CHARSET_LIST (attrs)
2678 = charset_list = Vemacs_mule_charset_list;
2679 }
df7492f9
KH
2680
2681 while (charbuf < charbuf_end)
2682 {
2683 ASSURE_DESTINATION (safe_room);
2684 c = *charbuf++;
ff0dacd7
KH
2685
2686 if (c < 0)
2687 {
2688 /* Handle an annotation. */
2689 switch (*charbuf)
2690 {
2691 case CODING_ANNOTATE_COMPOSITION_MASK:
2692 /* Not yet implemented. */
2693 break;
2694 case CODING_ANNOTATE_CHARSET_MASK:
2695 preferred_charset_id = charbuf[3];
2696 if (preferred_charset_id >= 0
2697 && NILP (Fmemq (make_number (preferred_charset_id),
2698 charset_list)))
2699 preferred_charset_id = -1;
2700 break;
2701 default:
2702 abort ();
2703 }
2704 charbuf += -c - 1;
2705 continue;
2706 }
2707
df7492f9
KH
2708 if (ASCII_CHAR_P (c))
2709 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
2710 else if (CHAR_BYTE8_P (c))
2711 {
2712 c = CHAR_TO_BYTE8 (c);
2713 EMIT_ONE_BYTE (c);
2714 }
df7492f9 2715 else
aa72b389 2716 {
df7492f9
KH
2717 struct charset *charset;
2718 unsigned code;
2719 int dimension;
2720 int emacs_mule_id;
2721 unsigned char leading_codes[2];
2722
ff0dacd7
KH
2723 if (preferred_charset_id >= 0)
2724 {
5eb05ea3
KH
2725 int result;
2726
ff0dacd7 2727 charset = CHARSET_FROM_ID (preferred_charset_id);
5eb05ea3
KH
2728 CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2729 if (result)
905ca9d2
KH
2730 code = ENCODE_CHAR (charset, c);
2731 else
5eb05ea3
KH
2732 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2733 &code, charset);
ff0dacd7
KH
2734 }
2735 else
5eb05ea3
KH
2736 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2737 &code, charset);
df7492f9
KH
2738 if (! charset)
2739 {
2740 c = coding->default_char;
2741 if (ASCII_CHAR_P (c))
2742 {
2743 EMIT_ONE_ASCII_BYTE (c);
2744 continue;
2745 }
5eb05ea3
KH
2746 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2747 &code, charset);
df7492f9
KH
2748 }
2749 dimension = CHARSET_DIMENSION (charset);
2750 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2751 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2752 EMIT_ONE_BYTE (leading_codes[0]);
2753 if (leading_codes[1])
2754 EMIT_ONE_BYTE (leading_codes[1]);
2755 if (dimension == 1)
1fa663f9 2756 EMIT_ONE_BYTE (code | 0x80);
aa72b389 2757 else
df7492f9 2758 {
1fa663f9 2759 code |= 0x8080;
df7492f9
KH
2760 EMIT_ONE_BYTE (code >> 8);
2761 EMIT_ONE_BYTE (code & 0xFF);
2762 }
aa72b389 2763 }
aa72b389 2764 }
065e3595 2765 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
2766 coding->produced_char += produced_chars;
2767 coding->produced = dst - coding->destination;
2768 return 0;
aa72b389 2769}
b73bfc1c 2770
4ed46869 2771\f
df7492f9 2772/*** 7. ISO2022 handlers ***/
4ed46869
KH
2773
2774/* The following note describes the coding system ISO2022 briefly.
39787efd 2775 Since the intention of this note is to help understand the
5a936b46 2776 functions in this file, some parts are NOT ACCURATE or are OVERLY
39787efd 2777 SIMPLIFIED. For thorough understanding, please refer to the
5a936b46 2778 original document of ISO2022. This is equivalent to the standard
cfb43547 2779 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
4ed46869
KH
2780
2781 ISO2022 provides many mechanisms to encode several character sets
cfb43547 2782 in 7-bit and 8-bit environments. For 7-bit environments, all text
39787efd
KH
2783 is encoded using bytes less than 128. This may make the encoded
2784 text a little bit longer, but the text passes more easily through
cfb43547 2785 several types of gateway, some of which strip off the MSB (Most
8ca3766a 2786 Significant Bit).
b73bfc1c 2787
cfb43547
DL
2788 There are two kinds of character sets: control character sets and
2789 graphic character sets. The former contain control characters such
4ed46869 2790 as `newline' and `escape' to provide control functions (control
39787efd 2791 functions are also provided by escape sequences). The latter
cfb43547 2792 contain graphic characters such as 'A' and '-'. Emacs recognizes
4ed46869
KH
2793 two control character sets and many graphic character sets.
2794
2795 Graphic character sets are classified into one of the following
39787efd
KH
2796 four classes, according to the number of bytes (DIMENSION) and
2797 number of characters in one dimension (CHARS) of the set:
2798 - DIMENSION1_CHARS94
2799 - DIMENSION1_CHARS96
2800 - DIMENSION2_CHARS94
2801 - DIMENSION2_CHARS96
2802
2803 In addition, each character set is assigned an identification tag,
cfb43547 2804 unique for each set, called the "final character" (denoted as <F>
39787efd
KH
2805 hereafter). The <F> of each character set is decided by ECMA(*)
2806 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2807 (0x30..0x3F are for private use only).
4ed46869
KH
2808
2809 Note (*): ECMA = European Computer Manufacturers Association
2810
cfb43547 2811 Here are examples of graphic character sets [NAME(<F>)]:
4ed46869
KH
2812 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2813 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2814 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2815 o DIMENSION2_CHARS96 -- none for the moment
2816
39787efd 2817 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4ed46869
KH
2818 C0 [0x00..0x1F] -- control character plane 0
2819 GL [0x20..0x7F] -- graphic character plane 0
2820 C1 [0x80..0x9F] -- control character plane 1
2821 GR [0xA0..0xFF] -- graphic character plane 1
2822
2823 A control character set is directly designated and invoked to C0 or
39787efd
KH
2824 C1 by an escape sequence. The most common case is that:
2825 - ISO646's control character set is designated/invoked to C0, and
2826 - ISO6429's control character set is designated/invoked to C1,
2827 and usually these designations/invocations are omitted in encoded
2828 text. In a 7-bit environment, only C0 can be used, and a control
2829 character for C1 is encoded by an appropriate escape sequence to
2830 fit into the environment. All control characters for C1 are
2831 defined to have corresponding escape sequences.
4ed46869
KH
2832
2833 A graphic character set is at first designated to one of four
2834 graphic registers (G0 through G3), then these graphic registers are
2835 invoked to GL or GR. These designations and invocations can be
2836 done independently. The most common case is that G0 is invoked to
39787efd
KH
2837 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2838 these invocations and designations are omitted in encoded text.
2839 In a 7-bit environment, only GL can be used.
4ed46869 2840
39787efd
KH
2841 When a graphic character set of CHARS94 is invoked to GL, codes
2842 0x20 and 0x7F of the GL area work as control characters SPACE and
2843 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2844 be used.
4ed46869
KH
2845
2846 There are two ways of invocation: locking-shift and single-shift.
2847 With locking-shift, the invocation lasts until the next different
39787efd
KH
2848 invocation, whereas with single-shift, the invocation affects the
2849 following character only and doesn't affect the locking-shift
2850 state. Invocations are done by the following control characters or
2851 escape sequences:
4ed46869
KH
2852
2853 ----------------------------------------------------------------------
39787efd 2854 abbrev function cntrl escape seq description
4ed46869 2855 ----------------------------------------------------------------------
39787efd
KH
2856 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2857 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2858 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2859 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2860 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2861 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2862 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2863 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2864 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4ed46869 2865 ----------------------------------------------------------------------
39787efd
KH
2866 (*) These are not used by any known coding system.
2867
2868 Control characters for these functions are defined by macros
2869 ISO_CODE_XXX in `coding.h'.
4ed46869 2870
39787efd 2871 Designations are done by the following escape sequences:
4ed46869
KH
2872 ----------------------------------------------------------------------
2873 escape sequence description
2874 ----------------------------------------------------------------------
2875 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2876 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2877 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2878 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2879 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2880 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2881 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2882 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2883 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2884 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2885 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2886 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2887 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2888 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2889 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2890 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2891 ----------------------------------------------------------------------
2892
2893 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
39787efd 2894 of dimension 1, chars 94, and final character <F>, etc...
4ed46869
KH
2895
2896 Note (*): Although these designations are not allowed in ISO2022,
2897 Emacs accepts them on decoding, and produces them on encoding
39787efd 2898 CHARS96 character sets in a coding system which is characterized as
4ed46869
KH
2899 7-bit environment, non-locking-shift, and non-single-shift.
2900
2901 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
df7492f9 2902 '(' must be omitted. We refer to this as "short-form" hereafter.
4ed46869 2903
cfb43547 2904 Now you may notice that there are a lot of ways of encoding the
39787efd
KH
2905 same multilingual text in ISO2022. Actually, there exist many
2906 coding systems such as Compound Text (used in X11's inter client
8ca3766a
DL
2907 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2908 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
4ed46869
KH
2909 localized platforms), and all of these are variants of ISO2022.
2910
2911 In addition to the above, Emacs handles two more kinds of escape
2912 sequences: ISO6429's direction specification and Emacs' private
2913 sequence for specifying character composition.
2914
39787efd 2915 ISO6429's direction specification takes the following form:
4ed46869
KH
2916 o CSI ']' -- end of the current direction
2917 o CSI '0' ']' -- end of the current direction
2918 o CSI '1' ']' -- start of left-to-right text
2919 o CSI '2' ']' -- start of right-to-left text
2920 The control character CSI (0x9B: control sequence introducer) is
39787efd
KH
2921 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2922
2923 Character composition specification takes the following form:
ec6d2bb8
KH
2924 o ESC '0' -- start relative composition
2925 o ESC '1' -- end composition
2926 o ESC '2' -- start rule-base composition (*)
2927 o ESC '3' -- start relative composition with alternate chars (**)
2928 o ESC '4' -- start rule-base composition with alternate chars (**)
b73bfc1c 2929 Since these are not standard escape sequences of any ISO standard,
cfb43547 2930 the use of them with these meanings is restricted to Emacs only.
ec6d2bb8 2931
5a936b46
DL
2932 (*) This form is used only in Emacs 20.7 and older versions,
2933 but newer versions can safely decode it.
cfb43547 2934 (**) This form is used only in Emacs 21.1 and newer versions,
5a936b46 2935 and older versions can't decode it.
ec6d2bb8 2936
cfb43547 2937 Here's a list of example usages of these composition escape
b73bfc1c 2938 sequences (categorized by `enum composition_method').
ec6d2bb8 2939
b73bfc1c 2940 COMPOSITION_RELATIVE:
ec6d2bb8 2941 ESC 0 CHAR [ CHAR ] ESC 1
8ca3766a 2942 COMPOSITION_WITH_RULE:
ec6d2bb8 2943 ESC 2 CHAR [ RULE CHAR ] ESC 1
b73bfc1c 2944 COMPOSITION_WITH_ALTCHARS:
ec6d2bb8 2945 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
b73bfc1c 2946 COMPOSITION_WITH_RULE_ALTCHARS:
ec6d2bb8 2947 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
4ed46869 2948
74ab6df5 2949static enum iso_code_class_type iso_code_class[256];
4ed46869 2950
/* Return nonzero if charset ID is safe for CODING: ID must lie within
   CODING's safe_charsets table (0..max_charset_id) and its entry must
   not be 255, the filler value setup_iso_safe_charsets stores for
   charsets that have no graphic register assigned.

   A negative ID (an undefined charset) is reported as unsafe instead
   of indexing before the start of the table.  ID is evaluated more
   than once, so it must not have side effects.  */
#define SAFE_CHARSET_P(coding, id)		\
  ((id) >= 0					\
   && (id) <= (coding)->max_charset_id		\
   && (coding)->safe_charsets[id] != 255)
df7492f9 2954
df7492f9 2955static void
971de7fb 2956setup_iso_safe_charsets (Lisp_Object attrs)
df7492f9
KH
2957{
2958 Lisp_Object charset_list, safe_charsets;
2959 Lisp_Object request;
2960 Lisp_Object reg_usage;
2961 Lisp_Object tail;
d311d28c 2962 EMACS_INT reg94, reg96;
df7492f9
KH
2963 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2964 int max_charset_id;
2965
2966 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2967 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2968 && ! EQ (charset_list, Viso_2022_charset_list))
2969 {
2970 CODING_ATTR_CHARSET_LIST (attrs)
2971 = charset_list = Viso_2022_charset_list;
2972 ASET (attrs, coding_attr_safe_charsets, Qnil);
2973 }
2974
2975 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2976 return;
2977
2978 max_charset_id = 0;
2979 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2980 {
2981 int id = XINT (XCAR (tail));
2982 if (max_charset_id < id)
2983 max_charset_id = id;
2984 }
d46c5b12 2985
1b3b981b
AS
2986 safe_charsets = make_uninit_string (max_charset_id + 1);
2987 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
df7492f9
KH
2988 request = AREF (attrs, coding_attr_iso_request);
2989 reg_usage = AREF (attrs, coding_attr_iso_usage);
2990 reg94 = XINT (XCAR (reg_usage));
2991 reg96 = XINT (XCDR (reg_usage));
2992
2993 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2994 {
2995 Lisp_Object id;
2996 Lisp_Object reg;
2997 struct charset *charset;
2998
2999 id = XCAR (tail);
3000 charset = CHARSET_FROM_ID (XINT (id));
bf16eb23 3001 reg = Fcdr (Fassq (id, request));
df7492f9 3002 if (! NILP (reg))
8f924df7 3003 SSET (safe_charsets, XINT (id), XINT (reg));
df7492f9
KH
3004 else if (charset->iso_chars_96)
3005 {
3006 if (reg96 < 4)
8f924df7 3007 SSET (safe_charsets, XINT (id), reg96);
df7492f9
KH
3008 }
3009 else
3010 {
3011 if (reg94 < 4)
8f924df7 3012 SSET (safe_charsets, XINT (id), reg94);
df7492f9
KH
3013 }
3014 }
3015 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
3016}
d46c5b12 3017
b6871cc7 3018
4ed46869 3019/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ad1746f5 3020 Check if a text is encoded in one of ISO-2022 based coding systems.
ff0dacd7 3021 If it is, return 1, else return 0. */
4ed46869 3022
0a28aafb 3023static int
cf84bb53
JB
3024detect_coding_iso_2022 (struct coding_system *coding,
3025 struct coding_detection_info *detect_info)
4ed46869 3026{
8f924df7
KH
3027 const unsigned char *src = coding->source, *src_base = src;
3028 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9 3029 int multibytep = coding->src_multibyte;
ff0dacd7 3030 int single_shifting = 0;
0e48bb22 3031 int id;
df7492f9 3032 int c, c1;
d311d28c 3033 ptrdiff_t consumed_chars = 0;
df7492f9 3034 int i;
ff0dacd7
KH
3035 int rejected = 0;
3036 int found = 0;
cee53ed4 3037 int composition_count = -1;
ff0dacd7
KH
3038
3039 detect_info->checked |= CATEGORY_MASK_ISO;
df7492f9
KH
3040
3041 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
3042 {
3043 struct coding_system *this = &(coding_categories[i]);
3044 Lisp_Object attrs, val;
3045
c6b278e7
KH
3046 if (this->id < 0)
3047 continue;
df7492f9
KH
3048 attrs = CODING_ID_ATTRS (this->id);
3049 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
1b3b981b 3050 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
df7492f9
KH
3051 setup_iso_safe_charsets (attrs);
3052 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7 3053 this->max_charset_id = SCHARS (val) - 1;
1b3b981b 3054 this->safe_charsets = SDATA (val);
df7492f9
KH
3055 }
3056
3057 /* A coding system of this category is always ASCII compatible. */
3058 src += coding->head_ascii;
3f003981 3059
ff0dacd7 3060 while (rejected != CATEGORY_MASK_ISO)
4ed46869 3061 {
065e3595 3062 src_base = src;
df7492f9 3063 ONE_MORE_BYTE (c);
4ed46869
KH
3064 switch (c)
3065 {
3066 case ISO_CODE_ESC:
74383408
KH
3067 if (inhibit_iso_escape_detection)
3068 break;
f46869e4 3069 single_shifting = 0;
df7492f9 3070 ONE_MORE_BYTE (c);
0e48bb22 3071 if (c == 'N' || c == 'O')
d46c5b12 3072 {
ae9ff118 3073 /* ESC <Fe> for SS2 or SS3. */
ff0dacd7
KH
3074 single_shifting = 1;
3075 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
4ed46869 3076 }
cee53ed4
KH
3077 else if (c == '1')
3078 {
3079 /* End of composition. */
3080 if (composition_count < 0
3081 || composition_count > MAX_COMPOSITION_COMPONENTS)
3082 /* Invalid */
3083 break;
3084 composition_count = -1;
3085 found |= CATEGORY_MASK_ISO;
3086 }
ec6d2bb8
KH
3087 else if (c >= '0' && c <= '4')
3088 {
3089 /* ESC <Fp> for start/end composition. */
cee53ed4 3090 composition_count = 0;
ec6d2bb8 3091 }
bf9cdd4e 3092 else
df7492f9 3093 {
0e48bb22
AS
3094 if (c >= '(' && c <= '/')
3095 {
3096 /* Designation sequence for a charset of dimension 1. */
3097 ONE_MORE_BYTE (c1);
3098 if (c1 < ' ' || c1 >= 0x80
3099 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3100 /* Invalid designation sequence. Just ignore. */
3101 break;
3102 }
3103 else if (c == '$')
3104 {
3105 /* Designation sequence for a charset of dimension 2. */
3106 ONE_MORE_BYTE (c);
3107 if (c >= '@' && c <= 'B')
3108 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
3109 id = iso_charset_table[1][0][c];
3110 else if (c >= '(' && c <= '/')
3111 {
3112 ONE_MORE_BYTE (c1);
3113 if (c1 < ' ' || c1 >= 0x80
3114 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3115 /* Invalid designation sequence. Just ignore. */
3116 break;
3117 }
3118 else
3119 /* Invalid designation sequence. Just ignore it. */
3120 break;
3121 }
3122 else
3123 {
3124 /* Invalid escape sequence. Just ignore it. */
3125 break;
3126 }
d46c5b12 3127
0e48bb22
AS
3128 /* We found a valid designation sequence for CHARSET. */
3129 rejected |= CATEGORY_MASK_ISO_8BIT;
3130 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3131 id))
3132 found |= CATEGORY_MASK_ISO_7;
3133 else
3134 rejected |= CATEGORY_MASK_ISO_7;
3135 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3136 id))
3137 found |= CATEGORY_MASK_ISO_7_TIGHT;
3138 else
3139 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3140 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3141 id))
3142 found |= CATEGORY_MASK_ISO_7_ELSE;
3143 else
3144 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3145 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3146 id))
3147 found |= CATEGORY_MASK_ISO_8_ELSE;
3148 else
3149 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3150 }
4ed46869
KH
3151 break;
3152
4ed46869 3153 case ISO_CODE_SO:
d46c5b12 3154 case ISO_CODE_SI:
ff0dacd7 3155 /* Locking shift out/in. */
74383408
KH
3156 if (inhibit_iso_escape_detection)
3157 break;
f46869e4 3158 single_shifting = 0;
ff0dacd7 3159 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
d46c5b12
KH
3160 break;
3161
4ed46869 3162 case ISO_CODE_CSI:
ff0dacd7 3163 /* Control sequence introducer. */
f46869e4 3164 single_shifting = 0;
ff0dacd7
KH
3165 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3166 found |= CATEGORY_MASK_ISO_8_ELSE;
3167 goto check_extra_latin;
3168
4ed46869
KH
3169 case ISO_CODE_SS2:
3170 case ISO_CODE_SS3:
ff0dacd7
KH
3171 /* Single shift. */
3172 if (inhibit_iso_escape_detection)
3173 break;
75e2a253 3174 single_shifting = 0;
ff0dacd7
KH
3175 rejected |= CATEGORY_MASK_ISO_7BIT;
3176 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3177 & CODING_ISO_FLAG_SINGLE_SHIFT)
0e48bb22
AS
3178 {
3179 found |= CATEGORY_MASK_ISO_8_1;
3180 single_shifting = 1;
3181 }
ff0dacd7
KH
3182 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3183 & CODING_ISO_FLAG_SINGLE_SHIFT)
0e48bb22
AS
3184 {
3185 found |= CATEGORY_MASK_ISO_8_2;
3186 single_shifting = 1;
3187 }
75e2a253
KH
3188 if (single_shifting)
3189 break;
0e48bb22
AS
3190 check_extra_latin:
3191 if (! VECTORP (Vlatin_extra_code_table)
3192 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3193 {
3194 rejected = CATEGORY_MASK_ISO;
3195 break;
3196 }
3197 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3198 & CODING_ISO_FLAG_LATIN_EXTRA)
3199 found |= CATEGORY_MASK_ISO_8_1;
3200 else
3201 rejected |= CATEGORY_MASK_ISO_8_1;
3202 rejected |= CATEGORY_MASK_ISO_8_2;
3203 break;
4ed46869
KH
3204
3205 default:
065e3595
KH
3206 if (c < 0)
3207 continue;
4ed46869 3208 if (c < 0x80)
f46869e4 3209 {
cee53ed4
KH
3210 if (composition_count >= 0)
3211 composition_count++;
f46869e4
KH
3212 single_shifting = 0;
3213 break;
3214 }
ff0dacd7 3215 if (c >= 0xA0)
c4825358 3216 {
ff0dacd7
KH
3217 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3218 found |= CATEGORY_MASK_ISO_8_1;
f46869e4 3219 /* Check the length of succeeding codes of the range
ff0dacd7
KH
3220 0xA0..0FF. If the byte length is even, we include
3221 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
3222 only when we are not single shifting. */
3223 if (! single_shifting
3224 && ! (rejected & CATEGORY_MASK_ISO_8_2))
f46869e4 3225 {
2735d060 3226 int len = 1;
b73bfc1c
KH
3227 while (src < src_end)
3228 {
d12bd917 3229 src_base = src;
df7492f9 3230 ONE_MORE_BYTE (c);
b73bfc1c 3231 if (c < 0xA0)
d12bd917
KH
3232 {
3233 src = src_base;
3234 break;
3235 }
2735d060 3236 len++;
b73bfc1c
KH
3237 }
3238
2735d060 3239 if (len & 1 && src < src_end)
cee53ed4
KH
3240 {
3241 rejected |= CATEGORY_MASK_ISO_8_2;
3242 if (composition_count >= 0)
2735d060 3243 composition_count += len;
cee53ed4 3244 }
f46869e4 3245 else
cee53ed4
KH
3246 {
3247 found |= CATEGORY_MASK_ISO_8_2;
3248 if (composition_count >= 0)
2735d060 3249 composition_count += len / 2;
cee53ed4 3250 }
f46869e4 3251 }
ff0dacd7 3252 break;
4ed46869 3253 }
4ed46869
KH
3254 }
3255 }
ff0dacd7
KH
3256 detect_info->rejected |= CATEGORY_MASK_ISO;
3257 return 0;
4ed46869 3258
df7492f9 3259 no_more_source:
ff0dacd7
KH
3260 detect_info->rejected |= rejected;
3261 detect_info->found |= (found & ~rejected);
df7492f9 3262 return 1;
4ed46869 3263}
ec6d2bb8 3264
4ed46869 3265
/* Set designation state into CODING.  Set CHARS_96 to -1 if the
   escape sequence should be kept.

   REG is the graphic register being designated, DIM and CHARS_96
   select the charset table, and FINAL is the final byte of the
   designation sequence.  CHARS_96 must be an lvalue.  The macro
   refers to the variable `coding' of the enclosing function.

   If FINAL is outside '0'..127, names no charset, or names a charset
   that is not safe for CODING, the register is marked -2 (invalid).
   Otherwise the charset is recorded for REG, after substituting ASCII
   for JISX0201-Roman or JISX0208 for JISX0208.1978 when the
   corresponding CODING_ISO_FLAG_USE_ROMAN / _USE_OLDJIS flag is set.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)			\
  do {									\
    int id, prev;							\
									\
    if ((final) < '0' || (final) >= 128					\
	|| ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)	\
	|| !SAFE_CHARSET_P (coding, id))				\
      {									\
	CODING_ISO_DESIGNATION (coding, reg) = -2;			\
	chars_96 = -1;							\
	break;								\
      }									\
    prev = CODING_ISO_DESIGNATION (coding, reg);			\
    if (id == charset_jisx0201_roman)					\
      {									\
	if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)	\
	  id = charset_ascii;						\
      }									\
    else if (id == charset_jisx0208_1978)				\
      {									\
	if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)	\
	  id = charset_jisx0208;					\
      }									\
    CODING_ISO_DESIGNATION (coding, reg) = id;				\
    /* If there was an invalid designation to REG previously, and this	\
       designation is ASCII to REG, we should keep this designation	\
       sequence.  */							\
    if (prev == -2 && id == charset_ascii)				\
      chars_96 = -1;							\
  } while (0)
3298
d46c5b12 3299
e951386e
KH
3300/* Handle these composition sequences (ALT: alternate char):
3301
3302 (1) relative composition: ESC 0 CHAR ... ESC 1
3303 (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3304 (3) altchar composition: ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3305 (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3306
3307 When the start sequence (ESC 0/2/3/4) is found, this annotation
3308 header is produced.
3309
3310 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3311
3312 Then, upon reading CHAR or RULE (one or two bytes), these codes are
3313 produced until the end sequence (ESC 1) is found:
3314
3315 (1) CHAR ... CHAR
3316 (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3317 (3) ALT ... ALT -1 -1 CHAR ... CHAR
3318 (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3319
3320 When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3321 annotation header are updated as below:
3322
3323 (1) LENGTH: unchanged, NCHARS: number of CHARs
3324 (2) LENGTH: unchanged, NCHARS: number of CHARs
3325 (3) LENGTH: += number of ALTs + 2, NCHARS: number of CHARs
3326 (4) LENGTH: += number of ALTs * 3, NCHARS: number of CHARs
3327
3328 If an error is found while composing, the annotation header is
3329 changed to:
3330
3331 [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3332
3333 and the sequence [ -2 DECODED-RULE ] is changed to the original
3334 byte sequence as below:
3335 o the original byte sequence is B: [ B -1 ]
3336 o the original byte sequence is B1 B2: [ B1 B2 ]
3337 and the sequence [ -1 -1 ] is changed to the original byte
3338 sequence:
3339 [ ESC '0' ]
3340*/
3341
/* Decode a composition rule C1 and maybe one more byte from the
   source, and set RULE to the encoded composition rule.  If the rule
   is invalid, goto invalid_code.

   RULE must be an int lvalue.  The macro reads the variable `c1' of
   the enclosing function, may consume one more source byte through
   ONE_MORE_BYTE, and requires a label `invalid_code' in scope.

   A value of C1 - 32 below 81 is the old one-byte format (before
   ver.21): it packs two reference points in base 9, where the value 4
   stands for reference point 10.  Larger values are the new two-byte
   format (after ver.21); the result then has 0x100 added so that it
   can be told apart from an old-format rule.  */

#define DECODE_COMPOSITION_RULE(rule)					\
  do {									\
    rule = c1 - 32;							\
    if (rule < 0)							\
      goto invalid_code;						\
    if (rule < 81)		/* old format (before ver.21) */	\
      {									\
	int gref = (rule) / 9;						\
	int nref = (rule) % 9;						\
	if (gref == 4) gref = 10;					\
	if (nref == 4) nref = 10;					\
	rule = COMPOSITION_ENCODE_RULE (gref, nref);			\
      }									\
    else			/* new format (after ver.21) */		\
      {									\
	int b;								\
									\
	ONE_MORE_BYTE (b);						\
	if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32))	\
	  goto invalid_code;						\
	rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);		\
	rule += 0x100;   /* Distinguish it from the old format.  */	\
      }									\
  } while (0)
3370
/* Turn the decoded composition rule RULE back into its original byte
   sequence, stored at charbuf[idx] and charbuf[idx + 1].

   The macro uses the variables `charbuf', `idx' and `new_chars' of
   the enclosing function.  RULE is evaluated several times, all before
   charbuf is written, so it may safely be `charbuf[idx + 1]'.

   A rule below 0x100 is in the old format: one byte is produced
   (32 + gref * 9 + nref, with reference point 10 mapped back to 4),
   the second slot is filled with -1, and new_chars grows by one.
   Otherwise the rule is in the new format: two bytes are produced
   (32 + 81 + gref and 32 + nref) and new_chars grows by two.  */
#define ENCODE_COMPOSITION_RULE(rule)				\
  do {								\
    int gref = ((rule) % 0x100) / 12;				\
    int nref = ((rule) % 0x100) % 12;				\
								\
    if ((rule) < 0x100)		/* old format */		\
      {								\
	if (gref == 10) gref = 4;				\
	if (nref == 10) nref = 4;				\
	charbuf[idx] = 32 + gref * 9 + nref;			\
	charbuf[idx + 1] = -1;					\
	new_chars++;						\
      }								\
    else			/* new format */		\
      {								\
	charbuf[idx] = 32 + 81 + gref;				\
	charbuf[idx + 1] = 32 + nref;				\
	new_chars += 2;						\
      }								\
  } while (0)
3390
e951386e
KH
3391/* Finish the current composition as invalid. */
3392
f57e2426 3393static int finish_composition (int *, struct composition_status *);
e951386e
KH
3394
3395static int
971de7fb 3396finish_composition (int *charbuf, struct composition_status *cmp_status)
e951386e
KH
3397{
3398 int idx = - cmp_status->length;
3399 int new_chars;
3400
3401 /* Recover the original ESC sequence */
3402 charbuf[idx++] = ISO_CODE_ESC;
3403 charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3404 : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3405 : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3406 /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3407 : '4');
3408 charbuf[idx++] = -2;
3409 charbuf[idx++] = 0;
3410 charbuf[idx++] = -1;
3411 new_chars = cmp_status->nchars;
3412 if (cmp_status->method >= COMPOSITION_WITH_RULE)
3413 for (; idx < 0; idx++)
3414 {
3415 int elt = charbuf[idx];
3416
3417 if (elt == -2)
3418 {
3419 ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3420 idx++;
3421 }
3422 else if (elt == -1)
3423 {
3424 charbuf[idx++] = ISO_CODE_ESC;
3425 charbuf[idx] = '0';
3426 new_chars += 2;
3427 }
3428 }
3429 cmp_status->state = COMPOSING_NO;
3430 return new_chars;
3431}
3432
/* If characters are under composition, finish the composition.
   The pending composition is given up as invalid through
   finish_composition and the number of characters it reports is
   added to `char_offset'.  The macro uses the variables `cmp_status',
   `charbuf' and `char_offset' of the enclosing function.  */
#define MAYBE_FINISH_COMPOSITION()				\
  do {								\
    if (cmp_status->state != COMPOSING_NO)			\
      char_offset += finish_composition (charbuf, cmp_status);	\
  } while (0)
d46c5b12 3439
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   C1 is the byte that followed ESC.

   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   If C1 is '0' and the alternate chars of an ESC 3 or ESC 4
   composition are being read, this ESC 0 only separates them from the
   composed chars: store the marker [ -1 -1 ] and switch to
   COMPOSING_CHAR.

   Otherwise give up any pending composition and start a new one by
   producing this annotation sequence now:

   [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

   The macro uses the variables `cmp_status', `charbuf', `char_offset'
   and `coding' of the enclosing function.  */

#define DECODE_COMPOSITION_START(c1)					   \
  do {									   \
    if ((c1) == '0'							   \
	&& ((cmp_status->state == COMPOSING_COMPONENT_CHAR		   \
	     && cmp_status->method == COMPOSITION_WITH_ALTCHARS)	   \
	    || (cmp_status->state == COMPOSING_COMPONENT_RULE		   \
		&& cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
      {									   \
	*charbuf++ = -1;						   \
	*charbuf++ = -1;						   \
	cmp_status->state = COMPOSING_CHAR;				   \
	cmp_status->length += 2;					   \
      }									   \
    else								   \
      {									   \
	MAYBE_FINISH_COMPOSITION ();					   \
	cmp_status->method = ((c1) == '0' ? COMPOSITION_RELATIVE	   \
			      : (c1) == '2' ? COMPOSITION_WITH_RULE	   \
			      : (c1) == '3' ? COMPOSITION_WITH_ALTCHARS	   \
			      : COMPOSITION_WITH_RULE_ALTCHARS);	   \
	cmp_status->state						   \
	  = ((c1) <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);	   \
	ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);	   \
	cmp_status->length = MAX_ANNOTATION_LENGTH;			   \
	cmp_status->nchars = cmp_status->ncomps = 0;			   \
	coding->annotated = 1;						   \
      }									   \
  } while (0)
3480
ec6d2bb8 3481
/* Handle composition end sequence ESC 1.

   The sequence is invalid if no composed char has been read, or if
   the state does not fit the method: a COMPOSITION_WITH_RULE
   composition must not end in state COMPOSING_CHAR, and any other
   method must end in state COMPOSING_CHAR.  In that case the pending
   composition is given up and control goes to `invalid_code'.

   Otherwise the annotation header at charbuf[-length] is completed:
   LENGTH is adjusted for the alternate chars and NCHARS is stored.
   The macro uses the variables `cmp_status', `charbuf' and
   `char_offset' of the enclosing function.  */

#define DECODE_COMPOSITION_END()					\
  do {									\
    if (cmp_status->nchars == 0						\
	|| ((cmp_status->state == COMPOSING_CHAR)			\
	    == (cmp_status->method == COMPOSITION_WITH_RULE)))		\
      {									\
	MAYBE_FINISH_COMPOSITION ();					\
	goto invalid_code;						\
      }									\
    if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)		\
      charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;		\
    else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)	\
      charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;		\
    charbuf[- cmp_status->length + 2] = cmp_status->nchars;		\
    char_offset += cmp_status->nchars;					\
    cmp_status->state = COMPOSING_NO;					\
  } while (0)
3501
/* Store a composition rule RULE in charbuf, and update cmp_status.
   The rule is stored as the pair [ -2 RULE ], the recorded length
   grows by two, and the state is decremented by one, i.e. moved to
   the state that precedes the current one.  The macro uses the
   variables `charbuf' and `cmp_status' of the enclosing function.  */

#define STORE_COMPOSITION_RULE(rule)	\
  do {					\
    *charbuf++ = -2;			\
    *charbuf++ = (rule);		\
    cmp_status->length += 2;		\
    cmp_status->state--;		\
  } while (0)
ec6d2bb8 3511
/* Store a composed char or a component char C in charbuf, and update
   cmp_status.

   The recorded length grows by one.  In state COMPOSING_CHAR the char
   counts as a composed char (nchars), otherwise as a component
   (ncomps).  When a rule must follow the char -- the method is
   COMPOSITION_WITH_RULE, or it is COMPOSITION_WITH_RULE_ALTCHARS and
   a component char was just read -- the state is advanced by one.
   The macro uses the variables `charbuf' and `cmp_status' of the
   enclosing function.  */

#define STORE_COMPOSITION_CHAR(c)					\
  do {									\
    *charbuf++ = (c);							\
    cmp_status->length++;						\
    if (cmp_status->state == COMPOSING_CHAR)				\
      cmp_status->nchars++;						\
    else								\
      cmp_status->ncomps++;						\
    if (cmp_status->method == COMPOSITION_WITH_RULE			\
	|| (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS	\
	    && cmp_status->state == COMPOSING_COMPONENT_CHAR))		\
      cmp_status->state++;						\
  } while (0)
88993dfd 3528
d46c5b12 3529
4ed46869
KH
3530/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3531
b73bfc1c 3532static void
971de7fb 3533decode_coding_iso_2022 (struct coding_system *coding)
4ed46869 3534{
8f924df7
KH
3535 const unsigned char *src = coding->source + coding->consumed;
3536 const unsigned char *src_end = coding->source + coding->src_bytes;
3537 const unsigned char *src_base;
69a80ea3 3538 int *charbuf = coding->charbuf + coding->charbuf_used;
ad1746f5
KH
3539 /* We may produce two annotations (charset and composition) in one
3540 loop and one more charset annotation at the end. */
ff0dacd7 3541 int *charbuf_end
df80c7f0 3542 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
d311d28c 3543 ptrdiff_t consumed_chars = 0, consumed_chars_base;
df7492f9 3544 int multibytep = coding->src_multibyte;
4ed46869 3545 /* Charsets invoked to graphic plane 0 and 1 respectively. */
df7492f9
KH
3546 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3547 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
134b9549 3548 int charset_id_2, charset_id_3;
df7492f9
KH
3549 struct charset *charset;
3550 int c;
e951386e 3551 struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
66ebf983 3552 Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
d311d28c
PE
3553 ptrdiff_t char_offset = coding->produced_char;
3554 ptrdiff_t last_offset = char_offset;
ff0dacd7 3555 int last_id = charset_ascii;
2735d060 3556 int eol_dos =
0a9564cb 3557 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 3558 int byte_after_cr = -1;
e951386e 3559 int i;
df7492f9 3560
df7492f9 3561 setup_iso_safe_charsets (attrs);
1b3b981b 3562 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
b73bfc1c 3563
e951386e
KH
3564 if (cmp_status->state != COMPOSING_NO)
3565 {
15cbd324
EZ
3566 if (charbuf_end - charbuf < cmp_status->length)
3567 abort ();
e951386e
KH
3568 for (i = 0; i < cmp_status->length; i++)
3569 *charbuf++ = cmp_status->carryover[i];
3570 coding->annotated = 1;
3571 }
3572
b73bfc1c 3573 while (1)
4ed46869 3574 {
cf299835 3575 int c1, c2, c3;
b73bfc1c
KH
3576
3577 src_base = src;
df7492f9
KH
3578 consumed_chars_base = consumed_chars;
3579
3580 if (charbuf >= charbuf_end)
b71f6f73
KH
3581 {
3582 if (byte_after_cr >= 0)
3583 src_base--;
3584 break;
3585 }
df7492f9 3586
119852e7
KH
3587 if (byte_after_cr >= 0)
3588 c1 = byte_after_cr, byte_after_cr = -1;
3589 else
3590 ONE_MORE_BYTE (c1);
065e3595
KH
3591 if (c1 < 0)
3592 goto invalid_code;
4ed46869 3593
e951386e 3594 if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
4ed46869 3595 {
e951386e
KH
3596 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3597 char_offset++;
3598 CODING_ISO_EXTSEGMENT_LEN (coding)--;
3599 continue;
3600 }
3601
3602 if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3603 {
3604 if (c1 == ISO_CODE_ESC)
ec6d2bb8 3605 {
e951386e
KH
3606 if (src + 1 >= src_end)
3607 goto no_more_source;
3608 *charbuf++ = ISO_CODE_ESC;
3609 char_offset++;
3610 if (src[0] == '%' && src[1] == '@')
df7492f9 3611 {
e951386e
KH
3612 src += 2;
3613 consumed_chars += 2;
3614 char_offset += 2;
3615 /* We are sure charbuf can contain two more chars. */
3616 *charbuf++ = '%';
3617 *charbuf++ = '@';
3618 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
df7492f9 3619 }
4ed46869 3620 }
e951386e
KH
3621 else
3622 {
3623 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3624 char_offset++;
3625 }
3626 continue;
3627 }
3628
3629 if ((cmp_status->state == COMPOSING_RULE
3630 || cmp_status->state == COMPOSING_COMPONENT_RULE)
3631 && c1 != ISO_CODE_ESC)
3632 {
66ebf983 3633 int rule;
e951386e 3634
66ebf983 3635 DECODE_COMPOSITION_RULE (rule);
e951386e
KH
3636 STORE_COMPOSITION_RULE (rule);
3637 continue;
3638 }
3639
3640 /* We produce at most one character. */
3641 switch (iso_code_class [c1])
3642 {
3643 case ISO_0x20_or_0x7F:
df7492f9
KH
3644 if (charset_id_0 < 0
3645 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
781d7a48
KH
3646 /* This is SPACE or DEL. */
3647 charset = CHARSET_FROM_ID (charset_ascii);
3648 else
3649 charset = CHARSET_FROM_ID (charset_id_0);
3650 break;
4ed46869
KH
3651
3652 case ISO_graphic_plane_0:
134b9549
KH
3653 if (charset_id_0 < 0)
3654 charset = CHARSET_FROM_ID (charset_ascii);
3655 else
3656 charset = CHARSET_FROM_ID (charset_id_0);
4ed46869
KH
3657 break;
3658
3659 case ISO_0xA0_or_0xFF:
df7492f9
KH
3660 if (charset_id_1 < 0
3661 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3662 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3663 goto invalid_code;
4ed46869
KH
3664 /* This is a graphic character, we fall down ... */
3665
3666 case ISO_graphic_plane_1:
df7492f9
KH
3667 if (charset_id_1 < 0)
3668 goto invalid_code;
3669 charset = CHARSET_FROM_ID (charset_id_1);
4ed46869
KH
3670 break;
3671
df7492f9 3672 case ISO_control_0:
2735d060 3673 if (eol_dos && c1 == '\r')
119852e7 3674 ONE_MORE_BYTE (byte_after_cr);
df7492f9
KH
3675 MAYBE_FINISH_COMPOSITION ();
3676 charset = CHARSET_FROM_ID (charset_ascii);
4ed46869
KH
3677 break;
3678
df7492f9 3679 case ISO_control_1:
df7492f9
KH
3680 goto invalid_code;
3681
4ed46869 3682 case ISO_shift_out:
df7492f9
KH
3683 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3684 || CODING_ISO_DESIGNATION (coding, 1) < 0)
3685 goto invalid_code;
3686 CODING_ISO_INVOCATION (coding, 0) = 1;
3687 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3688 continue;
4ed46869
KH
3689
3690 case ISO_shift_in:
df7492f9
KH
3691 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3692 goto invalid_code;
3693 CODING_ISO_INVOCATION (coding, 0) = 0;
3694 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3695 continue;
4ed46869
KH
3696
3697 case ISO_single_shift_2_7:
a63dba42
KH
3698 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3699 goto invalid_code;
4ed46869 3700 case ISO_single_shift_2:
df7492f9
KH
3701 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3702 goto invalid_code;
4ed46869
KH
3703 /* SS2 is handled as an escape sequence of ESC 'N' */
3704 c1 = 'N';
3705 goto label_escape_sequence;
3706
3707 case ISO_single_shift_3:
df7492f9
KH
3708 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3709 goto invalid_code;
4ed46869
KH
3710 /* SS3 is handled as an escape sequence of ESC 'O' */
3711 c1 = 'O';
3712 goto label_escape_sequence;
3713
3714 case ISO_control_sequence_introducer:
3715 /* CSI is handled as an escape sequence of ESC '[' ... */
3716 c1 = '[';
3717 goto label_escape_sequence;
3718
3719 case ISO_escape:
3720 ONE_MORE_BYTE (c1);
3721 label_escape_sequence:
df7492f9 3722 /* Escape sequences handled here are invocation,
4ed46869
KH
3723 designation, direction specification, and character
3724 composition specification. */
3725 switch (c1)
3726 {
3727 case '&': /* revision of following character set */
3728 ONE_MORE_BYTE (c1);
3729 if (!(c1 >= '@' && c1 <= '~'))
df7492f9 3730 goto invalid_code;
4ed46869
KH
3731 ONE_MORE_BYTE (c1);
3732 if (c1 != ISO_CODE_ESC)
df7492f9 3733 goto invalid_code;
4ed46869
KH
3734 ONE_MORE_BYTE (c1);
3735 goto label_escape_sequence;
3736
3737 case '$': /* designation of 2-byte character set */
df7492f9
KH
3738 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3739 goto invalid_code;
134b9549
KH
3740 {
3741 int reg, chars96;
3742
3743 ONE_MORE_BYTE (c1);
3744 if (c1 >= '@' && c1 <= 'B')
3745 { /* designation of JISX0208.1978, GB2312.1980,
88993dfd 3746 or JISX0208.1980 */
134b9549
KH
3747 reg = 0, chars96 = 0;
3748 }
3749 else if (c1 >= 0x28 && c1 <= 0x2B)
3750 { /* designation of DIMENSION2_CHARS94 character set */
3751 reg = c1 - 0x28, chars96 = 0;
3752 ONE_MORE_BYTE (c1);
3753 }
3754 else if (c1 >= 0x2C && c1 <= 0x2F)
3755 { /* designation of DIMENSION2_CHARS96 character set */
3756 reg = c1 - 0x2C, chars96 = 1;
3757 ONE_MORE_BYTE (c1);
3758 }
3759 else
3760 goto invalid_code;
3761 DECODE_DESIGNATION (reg, 2, chars96, c1);
3762 /* We must update these variables now. */
3763 if (reg == 0)
3764 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3765 else if (reg == 1)
3766 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3767 if (chars96 < 0)
3768 goto invalid_code;
3769 }
b73bfc1c 3770 continue;
4ed46869
KH
3771
3772 case 'n': /* invocation of locking-shift-2 */
df7492f9
KH
3773 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3774 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3775 goto invalid_code;
3776 CODING_ISO_INVOCATION (coding, 0) = 2;
3777 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3778 continue;
4ed46869
KH
3779
3780 case 'o': /* invocation of locking-shift-3 */
df7492f9
KH
3781 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3782 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3783 goto invalid_code;
3784 CODING_ISO_INVOCATION (coding, 0) = 3;
3785 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3786 continue;
4ed46869
KH
3787
3788 case 'N': /* invocation of single-shift-2 */
df7492f9
KH
3789 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3790 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3791 goto invalid_code;
134b9549
KH
3792 charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3793 if (charset_id_2 < 0)
3794 charset = CHARSET_FROM_ID (charset_ascii);
3795 else
3796 charset = CHARSET_FROM_ID (charset_id_2);
b73bfc1c 3797 ONE_MORE_BYTE (c1);
e7046a18 3798 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
df7492f9 3799 goto invalid_code;
4ed46869
KH
3800 break;
3801
3802 case 'O': /* invocation of single-shift-3 */
df7492f9
KH
3803 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3804 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3805 goto invalid_code;
134b9549
KH
3806 charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3807 if (charset_id_3 < 0)
3808 charset = CHARSET_FROM_ID (charset_ascii);
3809 else
3810 charset = CHARSET_FROM_ID (charset_id_3);
b73bfc1c 3811 ONE_MORE_BYTE (c1);
e7046a18 3812 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
df7492f9 3813 goto invalid_code;
4ed46869
KH
3814 break;
3815
ec6d2bb8 3816 case '0': case '2': case '3': case '4': /* start composition */
df7492f9
KH
3817 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3818 goto invalid_code;
e951386e
KH
3819 if (last_id != charset_ascii)
3820 {
3821 ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3822 last_id = charset_ascii;
3823 last_offset = char_offset;
3824 }
ec6d2bb8 3825 DECODE_COMPOSITION_START (c1);
b73bfc1c 3826 continue;
4ed46869 3827
ec6d2bb8 3828 case '1': /* end composition */
e951386e 3829 if (cmp_status->state == COMPOSING_NO)
df7492f9
KH
3830 goto invalid_code;
3831 DECODE_COMPOSITION_END ();
b73bfc1c 3832 continue;
4ed46869
KH
3833
3834 case '[': /* specification of direction */
de59072a 3835 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
df7492f9 3836 goto invalid_code;
4ed46869 3837 /* For the moment, nested direction is not supported.
d46c5b12 3838 So, `coding->mode & CODING_MODE_DIRECTION' zero means
ad1746f5 3839 left-to-right, and nonzero means right-to-left. */
4ed46869
KH
3840 ONE_MORE_BYTE (c1);
3841 switch (c1)
3842 {
3843 case ']': /* end of the current direction */
d46c5b12 3844 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869
KH
3845
3846 case '0': /* end of the current direction */
3847 case '1': /* start of left-to-right direction */
3848 ONE_MORE_BYTE (c1);
3849 if (c1 == ']')
d46c5b12 3850 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869 3851 else
df7492f9 3852 goto invalid_code;
4ed46869
KH
3853 break;
3854
3855 case '2': /* start of right-to-left direction */
3856 ONE_MORE_BYTE (c1);
3857 if (c1 == ']')
d46c5b12 3858 coding->mode |= CODING_MODE_DIRECTION;
4ed46869 3859 else
df7492f9 3860 goto invalid_code;
4ed46869
KH
3861 break;
3862
3863 default:
df7492f9 3864 goto invalid_code;
4ed46869 3865 }
b73bfc1c 3866 continue;
4ed46869 3867
103e0180 3868 case '%':
103e0180
KH
3869 ONE_MORE_BYTE (c1);
3870 if (c1 == '/')
3871 {
3872 /* CTEXT extended segment:
3873 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3874 We keep these bytes as is for the moment.
3875 They may be decoded by post-read-conversion. */
3876 int dim, M, L;
4776e638 3877 int size;
8f924df7 3878
103e0180 3879 ONE_MORE_BYTE (dim);
7a84eee5 3880 if (dim < '0' || dim > '4')
e951386e 3881 goto invalid_code;
103e0180 3882 ONE_MORE_BYTE (M);
e951386e
KH
3883 if (M < 128)
3884 goto invalid_code;
103e0180 3885 ONE_MORE_BYTE (L);
e951386e
KH
3886 if (L < 128)
3887 goto invalid_code;
103e0180 3888 size = ((M - 128) * 128) + (L - 128);
e951386e 3889 if (charbuf + 6 > charbuf_end)
4776e638
KH
3890 goto break_loop;
3891 *charbuf++ = ISO_CODE_ESC;
3892 *charbuf++ = '%';
3893 *charbuf++ = '/';
3894 *charbuf++ = dim;
3895 *charbuf++ = BYTE8_TO_CHAR (M);
3896 *charbuf++ = BYTE8_TO_CHAR (L);
e951386e 3897 CODING_ISO_EXTSEGMENT_LEN (coding) = size;
103e0180
KH
3898 }
3899 else if (c1 == 'G')
3900 {
103e0180
KH
3901 /* XFree86 extension for embedding UTF-8 in CTEXT:
3902 ESC % G --UTF-8-BYTES-- ESC % @
3903 We keep these bytes as is for the moment.
3904 They may be decoded by post-read-conversion. */
e951386e 3905 if (charbuf + 3 > charbuf_end)
4776e638 3906 goto break_loop;
e951386e
KH
3907 *charbuf++ = ISO_CODE_ESC;
3908 *charbuf++ = '%';
3909 *charbuf++ = 'G';
3910 CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
103e0180
KH
3911 }
3912 else
4776e638 3913 goto invalid_code;
103e0180 3914 continue;
4776e638 3915 break;
103e0180 3916
4ed46869 3917 default:
df7492f9
KH
3918 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3919 goto invalid_code;
134b9549
KH
3920 {
3921 int reg, chars96;
3922
3923 if (c1 >= 0x28 && c1 <= 0x2B)
3924 { /* designation of DIMENSION1_CHARS94 character set */
3925 reg = c1 - 0x28, chars96 = 0;
3926 ONE_MORE_BYTE (c1);
3927 }
3928 else if (c1 >= 0x2C && c1 <= 0x2F)
3929 { /* designation of DIMENSION1_CHARS96 character set */
3930 reg = c1 - 0x2C, chars96 = 1;
3931 ONE_MORE_BYTE (c1);
3932 }
3933 else
3934 goto invalid_code;
3935 DECODE_DESIGNATION (reg, 1, chars96, c1);
3936 /* We must update these variables now. */
3937 if (reg == 0)
3938 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3939 else if (reg == 1)
3940 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3941 if (chars96 < 0)
3942 goto invalid_code;
3943 }
b73bfc1c 3944 continue;
4ed46869 3945 }
413bb2db
PE
3946 break;
3947
3948 default:
3949 abort ();
b73bfc1c 3950 }
4ed46869 3951
e951386e
KH
3952 if (cmp_status->state == COMPOSING_NO
3953 && charset->id != charset_ascii
ff0dacd7
KH
3954 && last_id != charset->id)
3955 {
3956 if (last_id != charset_ascii)
69a80ea3 3957 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
ff0dacd7
KH
3958 last_id = charset->id;
3959 last_offset = char_offset;
3960 }
3961
b73bfc1c 3962 /* Now we know CHARSET and 1st position code C1 of a character.
cf299835
KH
3963 Produce a decoded character while getting 2nd and 3rd
3964 position codes C2, C3 if necessary. */
df7492f9 3965 if (CHARSET_DIMENSION (charset) > 1)
b73bfc1c
KH
3966 {
3967 ONE_MORE_BYTE (c2);
cf299835
KH
3968 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3969 || ((c1 & 0x80) != (c2 & 0x80)))
b73bfc1c 3970 /* C2 is not in a valid range. */
df7492f9 3971 goto invalid_code;
cf299835
KH
3972 if (CHARSET_DIMENSION (charset) == 2)
3973 c1 = (c1 << 8) | c2;
3974 else
df7492f9 3975 {
cf299835
KH
3976 ONE_MORE_BYTE (c3);
3977 if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3978 || ((c1 & 0x80) != (c3 & 0x80)))
3979 /* C3 is not in a valid range. */
df7492f9 3980 goto invalid_code;
cf299835 3981 c1 = (c1 << 16) | (c2 << 8) | c2;
df7492f9
KH
3982 }
3983 }
cf299835 3984 c1 &= 0x7F7F7F;
df7492f9
KH
3985 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3986 if (c < 0)
3987 {
3988 MAYBE_FINISH_COMPOSITION ();
3989 for (; src_base < src; src_base++, char_offset++)
3990 {
3991 if (ASCII_BYTE_P (*src_base))
3992 *charbuf++ = *src_base;
3993 else
3994 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3995 }
3996 }
e951386e 3997 else if (cmp_status->state == COMPOSING_NO)
df7492f9
KH
3998 {
3999 *charbuf++ = c;
4000 char_offset++;
4ed46869 4001 }
e951386e
KH
4002 else if ((cmp_status->state == COMPOSING_CHAR
4003 ? cmp_status->nchars
4004 : cmp_status->ncomps)
4005 >= MAX_COMPOSITION_COMPONENTS)
781d7a48 4006 {
e951386e
KH
4007 /* Too long composition. */
4008 MAYBE_FINISH_COMPOSITION ();
4009 *charbuf++ = c;
4010 char_offset++;
4ed46869 4011 }
e951386e
KH
4012 else
4013 STORE_COMPOSITION_CHAR (c);
4ed46869
KH
4014 continue;
4015
df7492f9
KH
4016 invalid_code:
4017 MAYBE_FINISH_COMPOSITION ();
4ed46869 4018 src = src_base;
df7492f9
KH
4019 consumed_chars = consumed_chars_base;
4020 ONE_MORE_BYTE (c);
065e3595 4021 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 4022 char_offset++;
df7492f9 4023 coding->errors++;
4776e638
KH
4024 continue;
4025
4026 break_loop:
4027 break;
4ed46869 4028 }
fb88bf2d 4029
df7492f9 4030 no_more_source:
e951386e
KH
4031 if (cmp_status->state != COMPOSING_NO)
4032 {
4033 if (coding->mode & CODING_MODE_LAST_BLOCK)
4034 MAYBE_FINISH_COMPOSITION ();
4035 else
4036 {
4037 charbuf -= cmp_status->length;
4038 for (i = 0; i < cmp_status->length; i++)
4039 cmp_status->carryover[i] = charbuf[i];
4040 }
4041 }
4042 else if (last_id != charset_ascii)
69a80ea3 4043 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
4044 coding->consumed_char += consumed_chars_base;
4045 coding->consumed = src_base - coding->source;
4046 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
4047}
4048
b73bfc1c 4049
f4dee582 4050/* ISO2022 encoding stuff. */
4ed46869
KH
4051
4052/*
f4dee582 4053 It is not enough to say just "ISO2022" on encoding, we have to
df7492f9 4054 specify more details. In Emacs, each coding system of ISO2022
4ed46869 4055 variant has the following specifications:
df7492f9 4056 1. Initial designation to G0 thru G3.
4ed46869
KH
4057 2. Allows short-form designation?
4058 3. ASCII should be designated to G0 before control characters?
4059 4. ASCII should be designated to G0 at end of line?
4060 5. 7-bit environment or 8-bit environment?
4061 6. Use locking-shift?
4062 7. Use Single-shift?
4063 And the following two are only for Japanese:
4064 8. Use ASCII in place of JIS0201-1976-Roman?
4065 9. Use JISX0208-1983 in place of JISX0208-1978?
df7492f9
KH
4066 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4067 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
f4dee582 4068 details.
4ed46869
KH
4069*/
4070
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG (advancing DST through the EMIT_* macros), then record
   the designation in CODING.

   If CODING has CODING_ISO_FLAG_REVISION set and CHARSET has a
   non-negative revision number, `ESC & <'@' + revision>' is emitted
   first.  For a multi-dimensional 94-charset the intermediate
   character is omitted (short form) only when CODING does not have
   CODING_ISO_FLAG_LONG_FORM, REG is 0, and the <final-char> of
   CHARSET is '@', 'A', or 'B'.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    /* Intermediate characters, indexed by graphic register.  */        \
    const char *inter_94 = "()*+";                                      \
    const char *inter_96 = ",-./";                                      \
    unsigned char final_byte = CHARSET_ISO_FINAL (charset);             \
    int revision_no = -1;                                               \
                                                                        \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
      revision_no = CHARSET_ISO_REVISION (charset);                     \
    if (revision_no >= 0)                                               \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + revision_no);                              \
      }                                                                 \
                                                                        \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        if (CHARSET_ISO_CHARS_96 (charset))                             \
          EMIT_ONE_ASCII_BYTE (inter_96[reg]);                          \
        else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM  \
                 || reg != 0                                            \
                 || final_byte < '@' || final_byte > 'B')               \
          EMIT_ONE_ASCII_BYTE (inter_94[reg]);                          \
      }                                                                 \
    else                                                                \
      {                                                                 \
        int b = (CHARSET_ISO_CHARS_96 (charset)                         \
                 ? inter_96[reg] : inter_94[reg]);                      \
        EMIT_ONE_ASCII_BYTE (b);                                        \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (final_byte);                                   \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4118
df7492f9 4119
4ed46869
KH
4120/* The following two macros produce codes (control character or escape
4121 sequence) for ISO2022 single-shift functions (single-shift-2 and
4122 single-shift-3). */
4123
df7492f9
KH
/* Emit single-shift-2: the one-byte ISO_CODE_SS2, or `ESC N' when the
   coding system has CODING_ISO_FLAG_SEVEN_BITS set.  Then mark CODING
   as being in single-shifting state.  */
#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4132
df7492f9
KH
4133
/* Emit single-shift-3: the one-byte ISO_CODE_SS3, or `ESC O' when the
   coding system has CODING_ISO_FLAG_SEVEN_BITS set.  Then mark CODING
   as being in single-shifting state.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4142
df7492f9 4143
4ed46869
KH
4144/* The following four macros produce codes (control character or
4145 escape sequence) for ISO2022 locking-shift functions (shift-in,
4146 shift-out, locking-shift-2, and locking-shift-3). */
4147
df7492f9
KH
/* Emit shift-in (ISO_CODE_SI) and record that graphic register 0 is
   now the one invoked to graphic plane 0.  */
#define ENCODE_SHIFT_IN                         \
  do                                            \
    {                                           \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);        \
      CODING_ISO_INVOCATION (coding, 0) = 0;    \
    }                                           \
  while (0)
4153
df7492f9
KH
4154
/* Emit shift-out (ISO_CODE_SO) and record that graphic register 1 is
   now the one invoked to graphic plane 0.  */
#define ENCODE_SHIFT_OUT                        \
  do                                            \
    {                                           \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);        \
      CODING_ISO_INVOCATION (coding, 0) = 1;    \
    }                                           \
  while (0)
4160
df7492f9
KH
4161
/* Emit locking-shift-2 (`ESC n') and record that graphic register 2
   is now the one invoked to graphic plane 0.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do                                                    \
    {                                                   \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');         \
      CODING_ISO_INVOCATION (coding, 0) = 2;            \
    }                                                   \
  while (0)
4167
df7492f9
KH
4168
/* Emit locking-shift-3 (`ESC o') and record that graphic register 3
   is now the one invoked to graphic plane 0.

   The final byte must be 'o': `ESC n' is locking-shift-2, and the
   ISO-2022 decoder in this file treats `ESC o' as the invocation of
   locking-shift-3.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4174
df7492f9 4175
f4dee582
RS
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be an lvalue: when the coding system has
   CODING_ISO_FLAG_USE_ROMAN set and CHARSET is ASCII, it is replaced
   by the charset whose id is charset_jisx0201_roman.

   C1 may be an arbitrary expression; it is parenthesized at every use
   (as in ENCODE_ISO_CHARACTER_DIMENSION2) so that operator precedence
   in the argument cannot change the emitted byte.

   The body is a `do ... while (1)' loop: each arm that emits the
   character leaves through `break'; the last arm calls
   encode_invocation_designation and the loop is repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single-shift was just emitted; it covers this one          \
           character only, so clear the state afterwards.  */           \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        /* CHARSET is invoked to graphic plane 0: 7-bit code.  */       \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        /* CHARSET is invoked to graphic plane 1: set the 8th bit.  */  \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4218
df7492f9 4219
f4dee582
RS
/* Produce codes for a DIMENSION2 character of CHARSET whose two
   position-codes are C1 and C2, first producing whatever designation
   and invocation codes are still needed.

   CHARSET must be an lvalue: when the coding system has
   CODING_ISO_FLAG_USE_OLDJIS set and CHARSET is charset_jisx0208, it
   is replaced by the charset whose id is charset_jisx0208_1978.

   The body loops until one of the emitting arms is reached; the last
   arm only calls encode_invocation_designation and goes around
   again.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int cs_id = CHARSET_ID (charset);                                   \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
        && cs_id == charset_jisx0208)                                   \
      {                                                                 \
        cs_id = charset_jisx0208_1978;                                  \
        charset = CHARSET_FROM_ID (cs_id);                              \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* The pending single-shift applies to this character.  */      \
        if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)) \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        else                                                            \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 0))           \
      {                                                                 \
        /* Invoked to graphic plane 0: two 7-bit codes.  */             \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
    else if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 1))           \
      {                                                                 \
        /* Invoked to graphic plane 1: two codes with the 8th bit.  */  \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* CHARSET is not invoked to either graphic plane yet.  Invoke    \
         it (designating it to a graphic register first if needed),     \
         then go around the loop again to emit the character.  */       \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4262
05e6f5dc 4263
df7492f9
KH
/* Encode character C of CHARSET: obtain its code with
   CODING_ENCODE_CHAR, then hand it to the DIMENSION1 macro when
   CHARSET has dimension 1, and otherwise to the DIMENSION2 macro with
   the code split into `code >> 8' and its low 8 bits.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                     \
  do {                                                                       \
    unsigned iso_code;                                                       \
                                                                             \
    CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), iso_code);     \
    if (CHARSET_DIMENSION (charset) != 1)                                    \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset),                            \
                                       iso_code >> 8, iso_code & 0xFF);      \
    else                                                                     \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), iso_code);                 \
  } while (0)
bdd9fb48 4274
05e6f5dc 4275
4ed46869 4276/* Produce designation and invocation codes at a place pointed by DST
df7492f9 4277 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
4ed46869
KH
4278 Return new DST. */
4279
e2f1bab9 4280static unsigned char *
cf84bb53
JB
4281encode_invocation_designation (struct charset *charset,
4282 struct coding_system *coding,
d311d28c 4283 unsigned char *dst, ptrdiff_t *p_nchars)
4ed46869 4284{
df7492f9 4285 int multibytep = coding->dst_multibyte;
d311d28c 4286 ptrdiff_t produced_chars = *p_nchars;
4ed46869 4287 int reg; /* graphic register number */
df7492f9 4288 int id = CHARSET_ID (charset);
4ed46869
KH
4289
4290 /* At first, check designations. */
4291 for (reg = 0; reg < 4; reg++)
df7492f9 4292 if (id == CODING_ISO_DESIGNATION (coding, reg))
4ed46869
KH
4293 break;
4294
4295 if (reg >= 4)
4296 {
4297 /* CHARSET is not yet designated to any graphic registers. */
4298 /* At first check the requested designation. */
df7492f9
KH
4299 reg = CODING_ISO_REQUEST (coding, id);
4300 if (reg < 0)
1ba9e4ab
KH
4301 /* Since CHARSET requests no special designation, designate it
4302 to graphic register 0. */
4ed46869
KH
4303 reg = 0;
4304
4305 ENCODE_DESIGNATION (charset, reg, coding);
4306 }
4307
df7492f9
KH
4308 if (CODING_ISO_INVOCATION (coding, 0) != reg
4309 && CODING_ISO_INVOCATION (coding, 1) != reg)
4ed46869
KH
4310 {
4311 /* Since the graphic register REG is not invoked to any graphic
4312 planes, invoke it to graphic plane 0. */
4313 switch (reg)
4314 {
4315 case 0: /* graphic register 0 */
4316 ENCODE_SHIFT_IN;
4317 break;
4318
4319 case 1: /* graphic register 1 */
4320 ENCODE_SHIFT_OUT;
4321 break;
4322
4323 case 2: /* graphic register 2 */
df7492f9 4324 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4ed46869
KH
4325 ENCODE_SINGLE_SHIFT_2;
4326 else
4327 ENCODE_LOCKING_SHIFT_2;
4328 break;
4329
4330 case 3: /* graphic register 3 */
df7492f9 4331 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4ed46869
KH
4332 ENCODE_SINGLE_SHIFT_3;
4333 else
4334 ENCODE_LOCKING_SHIFT_3;
4335 break;
4336 }
4337 }
b73bfc1c 4338
df7492f9 4339 *p_nchars = produced_chars;
4ed46869
KH
4340 return dst;
4341}
4342
4ed46869
KH
4343
/* Produce the codes that bring the graphic planes and registers back
   to their initial state: shift-in if graphic plane 0 does not have
   register 0 invoked, then a designation for every register whose
   initial charset is set (>= 0) and differs from the current
   designation.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                                  \
  do {                                                                     \
    struct charset *initial_charset;                                       \
    int gr;                                                                \
                                                                           \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                            \
      ENCODE_SHIFT_IN;                                                     \
                                                                           \
    for (gr = 0; gr < 4; gr++)                                             \
      {                                                                    \
        if (CODING_ISO_INITIAL (coding, gr) < 0                            \
            || (CODING_ISO_DESIGNATION (coding, gr)                        \
                == CODING_ISO_INITIAL (coding, gr)))                       \
          continue;                                                        \
        initial_charset                                                    \
          = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, gr));             \
        ENCODE_DESIGNATION (initial_charset, gr, coding);                  \
      }                                                                    \
  } while (0)
4362
df7492f9 4363
bdd9fb48 4364/* Produce designation sequences of charsets in the line started from
5eb05ea3
KH
4365 CHARBUF to a place pointed by DST, and return the number of
4366 produced bytes. DST should not directly point a buffer text area
4367 which may be relocated by char_charset call.
bdd9fb48
KH
4368
4369 If the current block ends before any end-of-line, we may fail to
d46c5b12
KH
4370 find all the necessary designations. */
4371
6e6c82a4 4372static ptrdiff_t
5eb05ea3
KH
4373encode_designation_at_bol (struct coding_system *coding,
4374 int *charbuf, int *charbuf_end,
461c2ab9 4375 unsigned char *dst)
e0e989f6 4376{
75a3b399 4377 unsigned char *orig = dst;
df7492f9 4378 struct charset *charset;
bdd9fb48
KH
4379 /* Table of charsets to be designated to each graphic register. */
4380 int r[4];
df7492f9 4381 int c, found = 0, reg;
d311d28c 4382 ptrdiff_t produced_chars = 0;
df7492f9
KH
4383 int multibytep = coding->dst_multibyte;
4384 Lisp_Object attrs;
4385 Lisp_Object charset_list;
4386
4387 attrs = CODING_ID_ATTRS (coding->id);
4388 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4389 if (EQ (charset_list, Qiso_2022))
4390 charset_list = Viso_2022_charset_list;
bdd9fb48
KH
4391
4392 for (reg = 0; reg < 4; reg++)
4393 r[reg] = -1;
4394
5eb05ea3 4395 while (charbuf < charbuf_end && found < 4)
e0e989f6 4396 {
df7492f9
KH
4397 int id;
4398
4399 c = *charbuf++;
b73bfc1c
KH
4400 if (c == '\n')
4401 break;
df7492f9
KH
4402 charset = char_charset (c, charset_list, NULL);
4403 id = CHARSET_ID (charset);
4404 reg = CODING_ISO_REQUEST (coding, id);
4405 if (reg >= 0 && r[reg] < 0)
bdd9fb48
KH
4406 {
4407 found++;
df7492f9 4408 r[reg] = id;
bdd9fb48 4409 }
bdd9fb48
KH
4410 }
4411
4412 if (found)
4413 {
4414 for (reg = 0; reg < 4; reg++)
4415 if (r[reg] >= 0
df7492f9
KH
4416 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4417 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
e0e989f6 4418 }
b73bfc1c 4419
5eb05ea3 4420 return dst - orig;
e0e989f6
KH
4421}
4422
4ed46869
KH
4423/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
4424
df7492f9 4425static int
971de7fb 4426encode_coding_iso_2022 (struct coding_system *coding)
4ed46869 4427{
df7492f9
KH
4428 int multibytep = coding->dst_multibyte;
4429 int *charbuf = coding->charbuf;
4430 int *charbuf_end = charbuf + coding->charbuf_used;
4431 unsigned char *dst = coding->destination + coding->produced;
4432 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4433 int safe_room = 16;
4434 int bol_designation
4435 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4436 && CODING_ISO_BOL (coding));
d311d28c 4437 ptrdiff_t produced_chars = 0;
df7492f9
KH
4438 Lisp_Object attrs, eol_type, charset_list;
4439 int ascii_compatible;
b73bfc1c 4440 int c;
ff0dacd7 4441 int preferred_charset_id = -1;
05e6f5dc 4442
24a73b0a 4443 CODING_GET_INFO (coding, attrs, charset_list);
0a9564cb 4444 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
24a73b0a
KH
4445 if (VECTORP (eol_type))
4446 eol_type = Qunix;
4447
004068e4 4448 setup_iso_safe_charsets (attrs);
ff0dacd7 4449 /* Charset list may have been changed. */
287c57d7 4450 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
1b3b981b 4451 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
0eecad43 4452
a552b35a
KH
4453 ascii_compatible
4454 = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4455 && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4456 | CODING_ISO_FLAG_LOCKING_SHIFT)));
bdd9fb48 4457
df7492f9 4458 while (charbuf < charbuf_end)
4ed46869 4459 {
df7492f9 4460 ASSURE_DESTINATION (safe_room);
b73bfc1c 4461
df7492f9 4462 if (bol_designation)
b73bfc1c 4463 {
bdd9fb48 4464 /* We have to produce designation sequences if any now. */
5eb05ea3
KH
4465 unsigned char desig_buf[16];
4466 int nbytes;
8f50130c 4467 ptrdiff_t offset;
5eb05ea3
KH
4468
4469 charset_map_loaded = 0;
4470 nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4471 desig_buf);
4472 if (charset_map_loaded
c1892f11 4473 && (offset = coding_change_destination (coding)))
5eb05ea3
KH
4474 {
4475 dst += offset;
4476 dst_end += offset;
4477 }
4478 memcpy (dst, desig_buf, nbytes);
4479 dst += nbytes;
df7492f9 4480 /* We are sure that designation sequences are all ASCII bytes. */
5eb05ea3
KH
4481 produced_chars += nbytes;
4482 bol_designation = 0;
4483 ASSURE_DESTINATION (safe_room);
e0e989f6
KH
4484 }
4485
df7492f9 4486 c = *charbuf++;
ec6d2bb8 4487
ff0dacd7
KH
4488 if (c < 0)
4489 {
4490 /* Handle an annotation. */
4491 switch (*charbuf)
ec6d2bb8 4492 {
ff0dacd7
KH
4493 case CODING_ANNOTATE_COMPOSITION_MASK:
4494 /* Not yet implemented. */
4495 break;
4496 case CODING_ANNOTATE_CHARSET_MASK:
cf7dfdf5 4497 preferred_charset_id = charbuf[2];
ff0dacd7
KH
4498 if (preferred_charset_id >= 0
4499 && NILP (Fmemq (make_number (preferred_charset_id),
4500 charset_list)))
4501 preferred_charset_id = -1;
4502 break;
4503 default:
4504 abort ();
4ed46869 4505 }
ff0dacd7
KH
4506 charbuf += -c - 1;
4507 continue;
4ed46869 4508 }
ec6d2bb8 4509
b73bfc1c
KH
4510 /* Now encode the character C. */
4511 if (c < 0x20 || c == 0x7F)
4512 {
df7492f9
KH
4513 if (c == '\n'
4514 || (c == '\r' && EQ (eol_type, Qmac)))
19a8d9e0 4515 {
df7492f9
KH
4516 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4517 ENCODE_RESET_PLANE_AND_REGISTER ();
4518 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
b73bfc1c 4519 {
df7492f9
KH
4520 int i;
4521
4522 for (i = 0; i < 4; i++)
4523 CODING_ISO_DESIGNATION (coding, i)
4524 = CODING_ISO_INITIAL (coding, i);
b73bfc1c 4525 }
df7492f9
KH
4526 bol_designation
4527 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
19a8d9e0 4528 }
df7492f9
KH
4529 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4530 ENCODE_RESET_PLANE_AND_REGISTER ();
4531 EMIT_ONE_ASCII_BYTE (c);
4ed46869 4532 }
df7492f9 4533 else if (ASCII_CHAR_P (c))
88993dfd 4534 {
df7492f9
KH
4535 if (ascii_compatible)
4536 EMIT_ONE_ASCII_BYTE (c);
93dec019 4537 else
19a8d9e0 4538 {
bf16eb23
KH
4539 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4540 ENCODE_ISO_CHARACTER (charset, c);
19a8d9e0 4541 }
4ed46869 4542 }
16eafb5d 4543 else if (CHAR_BYTE8_P (c))
88993dfd 4544 {
16eafb5d
KH
4545 c = CHAR_TO_BYTE8 (c);
4546 EMIT_ONE_BYTE (c);
88993dfd 4547 }
b73bfc1c 4548 else
df7492f9 4549 {
ff0dacd7 4550 struct charset *charset;
b73bfc1c 4551
ff0dacd7
KH
4552 if (preferred_charset_id >= 0)
4553 {
5eb05ea3
KH
4554 int result;
4555
ff0dacd7 4556 charset = CHARSET_FROM_ID (preferred_charset_id);
5eb05ea3
KH
4557 CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4558 if (! result)
4559 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4560 NULL, charset);
ff0dacd7
KH
4561 }
4562 else
5eb05ea3
KH
4563 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4564 NULL, charset);
df7492f9
KH
4565 if (!charset)
4566 {
41cbe562
KH
4567 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4568 {
4569 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4570 charset = CHARSET_FROM_ID (charset_ascii);
4571 }
4572 else
4573 {
4574 c = coding->default_char;
5eb05ea3
KH
4575 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4576 charset_list, NULL, charset);
41cbe562 4577 }
df7492f9
KH
4578 }
4579 ENCODE_ISO_CHARACTER (charset, c);
4580 }
84fbb8a0 4581 }
b73bfc1c 4582
df7492f9
KH
4583 if (coding->mode & CODING_MODE_LAST_BLOCK
4584 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4585 {
4586 ASSURE_DESTINATION (safe_room);
4587 ENCODE_RESET_PLANE_AND_REGISTER ();
4588 }
065e3595 4589 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4590 CODING_ISO_BOL (coding) = bol_designation;
4591 coding->produced_char += produced_chars;
4592 coding->produced = dst - coding->destination;
4593 return 0;
4ed46869
KH
4594}
4595
4596\f
df7492f9 4597/*** 8,9. SJIS and BIG5 handlers ***/
4ed46869 4598
df7492f9 4599/* Although SJIS and BIG5 are not ISO coding systems, they are used
4ed46869
KH
4600 quite widely. So, for the moment, Emacs supports them in the bare
4601 C code. But, in the future, they may be supported only by CCL. */
4602
4603/* SJIS is a coding system encoding three character sets: ASCII, right
4604 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
4605 as is. A character of charset katakana-jisx0201 is encoded by
4606 "position-code + 0x80". A character of charset japanese-jisx0208
4607 is encoded in 2-byte but two position-codes are divided and shifted
df7492f9 4608 so that they fit in the ranges below.
4ed46869
KH
4609
4610 --- CODE RANGE of SJIS ---
4611 (character set) (range)
4612 ASCII 0x00 .. 0x7F
df7492f9 4613 KATAKANA-JISX0201 0xA0 .. 0xDF
c28a9453 4614 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
d14d03ac 4615 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4ed46869
KH
4616 -------------------------------
4617
4618*/
4619
4620/* BIG5 is a coding system encoding two character sets: ASCII and
4621 Big5. An ASCII character is encoded as is. Big5 is a two-byte
df7492f9 4622 character set and is encoded in two bytes.
4ed46869
KH
4623
4624 --- CODE RANGE of BIG5 ---
4625 (character set) (range)
4626 ASCII 0x00 .. 0x7F
4627 Big5 (1st byte) 0xA1 .. 0xFE
4628 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
4629 --------------------------
4630
df7492f9 4631 */
4ed46869
KH
4632
4633/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4634 Check if a text is encoded in SJIS. If it is, return
df7492f9 4635 CATEGORY_MASK_SJIS, else return 0. */
4ed46869 4636
0a28aafb 4637static int
cf84bb53
JB
4638detect_coding_sjis (struct coding_system *coding,
4639 struct coding_detection_info *detect_info)
4ed46869 4640{
065e3595 4641 const unsigned char *src = coding->source, *src_base;
8f924df7 4642 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9 4643 int multibytep = coding->src_multibyte;
d311d28c 4644 ptrdiff_t consumed_chars = 0;
df7492f9 4645 int found = 0;
b73bfc1c 4646 int c;
f07190ca
KH
4647 Lisp_Object attrs, charset_list;
4648 int max_first_byte_of_2_byte_code;
4649
4650 CODING_GET_INFO (coding, attrs, charset_list);
4651 max_first_byte_of_2_byte_code
4652 = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
df7492f9 4653
ff0dacd7 4654 detect_info->checked |= CATEGORY_MASK_SJIS;
df7492f9
KH
4655 /* A coding system of this category is always ASCII compatible. */
4656 src += coding->head_ascii;
4ed46869 4657
b73bfc1c 4658 while (1)
4ed46869 4659 {
065e3595 4660 src_base = src;
df7492f9 4661 ONE_MORE_BYTE (c);
682169fe
KH
4662 if (c < 0x80)
4663 continue;
f07190ca
KH
4664 if ((c >= 0x81 && c <= 0x9F)
4665 || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4ed46869 4666 {
df7492f9 4667 ONE_MORE_BYTE (c);
682169fe 4668 if (c < 0x40 || c == 0x7F || c > 0xFC)
df7492f9 4669 break;
ff0dacd7 4670 found = CATEGORY_MASK_SJIS;
4ed46869 4671 }
df7492f9 4672 else if (c >= 0xA0 && c < 0xE0)
ff0dacd7 4673 found = CATEGORY_MASK_SJIS;
df7492f9
KH
4674 else
4675 break;
4ed46869 4676 }
ff0dacd7 4677 detect_info->rejected |= CATEGORY_MASK_SJIS;
df7492f9
KH
4678 return 0;
4679
4680 no_more_source:
065e3595 4681 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
89528eb3 4682 {
ff0dacd7 4683 detect_info->rejected |= CATEGORY_MASK_SJIS;
89528eb3 4684 return 0;
4ed46869 4685 }
ff0dacd7
KH
4686 detect_info->found |= found;
4687 return 1;
4ed46869
KH
4688}
4689
4690/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4691 Check if a text is encoded in BIG5. If it is, return
df7492f9 4692 CATEGORY_MASK_BIG5, else return 0. */
4ed46869 4693
0a28aafb 4694static int
cf84bb53
JB
4695detect_coding_big5 (struct coding_system *coding,
4696 struct coding_detection_info *detect_info)
4ed46869 4697{
065e3595 4698 const unsigned char *src = coding->source, *src_base;
8f924df7 4699 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9 4700 int multibytep = coding->src_multibyte;
d311d28c 4701 ptrdiff_t consumed_chars = 0;
df7492f9 4702 int found = 0;
b73bfc1c 4703 int c;
fa42c37f 4704
ff0dacd7 4705 detect_info->checked |= CATEGORY_MASK_BIG5;
df7492f9
KH
4706 /* A coding system of this category is always ASCII compatible. */
4707 src += coding->head_ascii;
fa42c37f 4708
b73bfc1c 4709 while (1)
fa42c37f 4710 {
065e3595 4711 src_base = src;
df7492f9
KH
4712 ONE_MORE_BYTE (c);
4713 if (c < 0x80)
fa42c37f 4714 continue;
df7492f9 4715 if (c >= 0xA1)
fa42c37f 4716 {
df7492f9
KH
4717 ONE_MORE_BYTE (c);
4718 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
fa42c37f 4719 return 0;
ff0dacd7 4720 found = CATEGORY_MASK_BIG5;
fa42c37f 4721 }
df7492f9
KH
4722 else
4723 break;
fa42c37f 4724 }
ff0dacd7 4725 detect_info->rejected |= CATEGORY_MASK_BIG5;
fa42c37f 4726 return 0;
fa42c37f 4727
df7492f9 4728 no_more_source:
065e3595 4729 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
89528eb3 4730 {
ff0dacd7 4731 detect_info->rejected |= CATEGORY_MASK_BIG5;
89528eb3
KH
4732 return 0;
4733 }
ff0dacd7
KH
4734 detect_info->found |= found;
4735 return 1;
fa42c37f
KH
4736}
4737
4ed46869
KH
4738/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4739 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
fa42c37f 4740
b73bfc1c 4741static void
971de7fb 4742decode_coding_sjis (struct coding_system *coding)
4ed46869 4743{
8f924df7
KH
4744 const unsigned char *src = coding->source + coding->consumed;
4745 const unsigned char *src_end = coding->source + coding->src_bytes;
4746 const unsigned char *src_base;
69a80ea3 4747 int *charbuf = coding->charbuf + coding->charbuf_used;
ad1746f5 4748 /* We may produce one charset annotation in one loop and one more at
df80c7f0 4749 the end. */
69a80ea3 4750 int *charbuf_end
df80c7f0 4751 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
d311d28c 4752 ptrdiff_t consumed_chars = 0, consumed_chars_base;
df7492f9
KH
4753 int multibytep = coding->src_multibyte;
4754 struct charset *charset_roman, *charset_kanji, *charset_kana;
57a47f8a 4755 struct charset *charset_kanji2;
24a73b0a 4756 Lisp_Object attrs, charset_list, val;
d311d28c
PE
4757 ptrdiff_t char_offset = coding->produced_char;
4758 ptrdiff_t last_offset = char_offset;
ff0dacd7 4759 int last_id = charset_ascii;
2735d060 4760 int eol_dos =
0a9564cb 4761 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 4762 int byte_after_cr = -1;
a5d301df 4763
24a73b0a 4764 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4765
4766 val = charset_list;
4767 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
89528eb3 4768 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
57a47f8a
KH
4769 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4770 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
fa42c37f 4771
b73bfc1c 4772 while (1)
4ed46869 4773 {
df7492f9 4774 int c, c1;
24a73b0a 4775 struct charset *charset;
fa42c37f 4776
b73bfc1c 4777 src_base = src;
df7492f9 4778 consumed_chars_base = consumed_chars;
fa42c37f 4779
df7492f9 4780 if (charbuf >= charbuf_end)
b71f6f73
KH
4781 {
4782 if (byte_after_cr >= 0)
4783 src_base--;
4784 break;
4785 }
df7492f9 4786
119852e7
KH
4787 if (byte_after_cr >= 0)
4788 c = byte_after_cr, byte_after_cr = -1;
4789 else
4790 ONE_MORE_BYTE (c);
065e3595
KH
4791 if (c < 0)
4792 goto invalid_code;
24a73b0a 4793 if (c < 0x80)
119852e7 4794 {
2735d060 4795 if (eol_dos && c == '\r')
119852e7
KH
4796 ONE_MORE_BYTE (byte_after_cr);
4797 charset = charset_roman;
4798 }
57a47f8a 4799 else if (c == 0x80 || c == 0xA0)
8e921c4b 4800 goto invalid_code;
57a47f8a
KH
4801 else if (c >= 0xA1 && c <= 0xDF)
4802 {
4803 /* SJIS -> JISX0201-Kana */
4804 c &= 0x7F;
4805 charset = charset_kana;
4806 }
4807 else if (c <= 0xEF)
df7492f9 4808 {
57a47f8a
KH
4809 /* SJIS -> JISX0208 */
4810 ONE_MORE_BYTE (c1);
4811 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
24a73b0a 4812 goto invalid_code;
57a47f8a
KH
4813 c = (c << 8) | c1;
4814 SJIS_TO_JIS (c);
4815 charset = charset_kanji;
4816 }
4817 else if (c <= 0xFC && charset_kanji2)
4818 {
c6876370 4819 /* SJIS -> JISX0213-2 */
57a47f8a
KH
4820 ONE_MORE_BYTE (c1);
4821 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
24a73b0a 4822 goto invalid_code;
57a47f8a
KH
4823 c = (c << 8) | c1;
4824 SJIS_TO_JIS2 (c);
4825 charset = charset_kanji2;
df7492f9 4826 }
57a47f8a
KH
4827 else
4828 goto invalid_code;
24a73b0a
KH
4829 if (charset->id != charset_ascii
4830 && last_id != charset->id)
4831 {
4832 if (last_id != charset_ascii)
69a80ea3 4833 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
24a73b0a
KH
4834 last_id = charset->id;
4835 last_offset = char_offset;
4836 }
4837 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
df7492f9 4838 *charbuf++ = c;
ff0dacd7 4839 char_offset++;
df7492f9 4840 continue;
b73bfc1c 4841
df7492f9
KH
4842 invalid_code:
4843 src = src_base;
4844 consumed_chars = consumed_chars_base;
4845 ONE_MORE_BYTE (c);
065e3595 4846 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
ff0dacd7 4847 char_offset++;
df7492f9
KH
4848 coding->errors++;
4849 }
fa42c37f 4850
df7492f9 4851 no_more_source:
ff0dacd7 4852 if (last_id != charset_ascii)
69a80ea3 4853 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
4854 coding->consumed_char += consumed_chars_base;
4855 coding->consumed = src_base - coding->source;
4856 coding->charbuf_used = charbuf - coding->charbuf;
fa42c37f
KH
4857}
4858
b73bfc1c 4859static void
971de7fb 4860decode_coding_big5 (struct coding_system *coding)
4ed46869 4861{
8f924df7
KH
4862 const unsigned char *src = coding->source + coding->consumed;
4863 const unsigned char *src_end = coding->source + coding->src_bytes;
4864 const unsigned char *src_base;
69a80ea3 4865 int *charbuf = coding->charbuf + coding->charbuf_used;
ad1746f5 4866 /* We may produce one charset annotation in one loop and one more at
df80c7f0 4867 the end. */
69a80ea3 4868 int *charbuf_end
df80c7f0 4869 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
d311d28c 4870 ptrdiff_t consumed_chars = 0, consumed_chars_base;
df7492f9
KH
4871 int multibytep = coding->src_multibyte;
4872 struct charset *charset_roman, *charset_big5;
24a73b0a 4873 Lisp_Object attrs, charset_list, val;
d311d28c
PE
4874 ptrdiff_t char_offset = coding->produced_char;
4875 ptrdiff_t last_offset = char_offset;
ff0dacd7 4876 int last_id = charset_ascii;
2735d060 4877 int eol_dos =
0a9564cb 4878 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 4879 int byte_after_cr = -1;
df7492f9 4880
24a73b0a 4881 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4882 val = charset_list;
4883 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4884 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4ed46869 4885
b73bfc1c 4886 while (1)
4ed46869 4887 {
df7492f9 4888 int c, c1;
24a73b0a 4889 struct charset *charset;
b73bfc1c
KH
4890
4891 src_base = src;
df7492f9
KH
4892 consumed_chars_base = consumed_chars;
4893
4894 if (charbuf >= charbuf_end)
b71f6f73
KH
4895 {
4896 if (byte_after_cr >= 0)
4897 src_base--;
4898 break;
4899 }
df7492f9 4900
119852e7 4901 if (byte_after_cr >= 0)
14daee73 4902 c = byte_after_cr, byte_after_cr = -1;
119852e7
KH
4903 else
4904 ONE_MORE_BYTE (c);
b73bfc1c 4905
065e3595
KH
4906 if (c < 0)
4907 goto invalid_code;
24a73b0a 4908 if (c < 0x80)
119852e7 4909 {
2735d060 4910 if (eol_dos && c == '\r')
119852e7
KH
4911 ONE_MORE_BYTE (byte_after_cr);
4912 charset = charset_roman;
4913 }
24a73b0a 4914 else
4ed46869 4915 {
24a73b0a
KH
4916 /* BIG5 -> Big5 */
4917 if (c < 0xA1 || c > 0xFE)
4918 goto invalid_code;
4919 ONE_MORE_BYTE (c1);
4920 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4921 goto invalid_code;
4922 c = c << 8 | c1;
4923 charset = charset_big5;
4ed46869 4924 }
24a73b0a
KH
4925 if (charset->id != charset_ascii
4926 && last_id != charset->id)
df7492f9 4927 {
24a73b0a 4928 if (last_id != charset_ascii)
69a80ea3 4929 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
24a73b0a
KH
4930 last_id = charset->id;
4931 last_offset = char_offset;
4ed46869 4932 }
24a73b0a 4933 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
df7492f9 4934 *charbuf++ = c;
ff0dacd7 4935 char_offset++;
fb88bf2d
KH
4936 continue;
4937
df7492f9 4938 invalid_code:
4ed46869 4939 src = src_base;
df7492f9
KH
4940 consumed_chars = consumed_chars_base;
4941 ONE_MORE_BYTE (c);
065e3595 4942 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
ff0dacd7 4943 char_offset++;
df7492f9 4944 coding->errors++;
fb88bf2d 4945 }
d46c5b12 4946
df7492f9 4947 no_more_source:
ff0dacd7 4948 if (last_id != charset_ascii)
69a80ea3 4949 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
4950 coding->consumed_char += consumed_chars_base;
4951 coding->consumed = src_base - coding->source;
4952 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
4953}
4954
4955/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
b73bfc1c
KH
4956 This function can encode charsets `ascii', `katakana-jisx0201',
4957 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4958 are sure that all these charsets are registered as official charset
4ed46869
KH
4959 (i.e. do not have extended leading-codes). Characters of other
4960 charsets are produced without any encoding. If SJIS_P is 1, encode
4961 SJIS text, else encode BIG5 text. */
4962
df7492f9 4963static int
971de7fb 4964encode_coding_sjis (struct coding_system *coding)
4ed46869 4965{
df7492f9
KH
4966 int multibytep = coding->dst_multibyte;
4967 int *charbuf = coding->charbuf;
4968 int *charbuf_end = charbuf + coding->charbuf_used;
4969 unsigned char *dst = coding->destination + coding->produced;
4970 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4971 int safe_room = 4;
d311d28c 4972 ptrdiff_t produced_chars = 0;
24a73b0a 4973 Lisp_Object attrs, charset_list, val;
df7492f9 4974 int ascii_compatible;
66ebf983 4975 struct charset *charset_kanji, *charset_kana;
57a47f8a 4976 struct charset *charset_kanji2;
df7492f9 4977 int c;
a5d301df 4978
24a73b0a 4979 CODING_GET_INFO (coding, attrs, charset_list);
66ebf983 4980 val = XCDR (charset_list);
df7492f9 4981 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
57a47f8a
KH
4982 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4983 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4ed46869 4984
df7492f9 4985 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
93dec019 4986
df7492f9
KH
4987 while (charbuf < charbuf_end)
4988 {
4989 ASSURE_DESTINATION (safe_room);
4990 c = *charbuf++;
b73bfc1c 4991 /* Now encode the character C. */
df7492f9
KH
4992 if (ASCII_CHAR_P (c) && ascii_compatible)
4993 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
4994 else if (CHAR_BYTE8_P (c))
4995 {
4996 c = CHAR_TO_BYTE8 (c);
4997 EMIT_ONE_BYTE (c);
4998 }
df7492f9 4999 else
b73bfc1c 5000 {
df7492f9 5001 unsigned code;
5eb05ea3
KH
5002 struct charset *charset;
5003 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5004 &code, charset);
df7492f9
KH
5005
5006 if (!charset)
4ed46869 5007 {
41cbe562 5008 if (coding->mode & CODING_MODE_SAFE_ENCODING)
b73bfc1c 5009 {
41cbe562
KH
5010 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5011 charset = CHARSET_FROM_ID (charset_ascii);
b73bfc1c 5012 }
41cbe562 5013 else
b73bfc1c 5014 {
41cbe562 5015 c = coding->default_char;
5eb05ea3
KH
5016 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5017 charset_list, &code, charset);
b73bfc1c 5018 }
b73bfc1c 5019 }
df7492f9
KH
5020 if (code == CHARSET_INVALID_CODE (charset))
5021 abort ();
5022 if (charset == charset_kanji)
5023 {
5024 int c1, c2;
5025 JIS_TO_SJIS (code);
5026 c1 = code >> 8, c2 = code & 0xFF;
5027 EMIT_TWO_BYTES (c1, c2);
5028 }
5029 else if (charset == charset_kana)
5030 EMIT_ONE_BYTE (code | 0x80);
57a47f8a
KH
5031 else if (charset_kanji2 && charset == charset_kanji2)
5032 {
5033 int c1, c2;
5034
5035 c1 = code >> 8;
f07190ca
KH
5036 if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
5037 || c1 == 0x28
57a47f8a
KH
5038 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
5039 {
5040 JIS_TO_SJIS2 (code);
5041 c1 = code >> 8, c2 = code & 0xFF;
5042 EMIT_TWO_BYTES (c1, c2);
5043 }
5044 else
5045 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5046 }
df7492f9
KH
5047 else
5048 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5049 }
5050 }
065e3595 5051 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
5052 coding->produced_char += produced_chars;
5053 coding->produced = dst - coding->destination;
5054 return 0;
5055}
5056
5057static int
971de7fb 5058encode_coding_big5 (struct coding_system *coding)
df7492f9
KH
5059{
5060 int multibytep = coding->dst_multibyte;
5061 int *charbuf = coding->charbuf;
5062 int *charbuf_end = charbuf + coding->charbuf_used;
5063 unsigned char *dst = coding->destination + coding->produced;
5064 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5065 int safe_room = 4;
d311d28c 5066 ptrdiff_t produced_chars = 0;
24a73b0a 5067 Lisp_Object attrs, charset_list, val;
df7492f9 5068 int ascii_compatible;
66ebf983 5069 struct charset *charset_big5;
df7492f9
KH
5070 int c;
5071
24a73b0a 5072 CODING_GET_INFO (coding, attrs, charset_list);
66ebf983 5073 val = XCDR (charset_list);
df7492f9
KH
5074 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5075 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5076
5077 while (charbuf < charbuf_end)
5078 {
5079 ASSURE_DESTINATION (safe_room);
5080 c = *charbuf++;
5081 /* Now encode the character C. */
5082 if (ASCII_CHAR_P (c) && ascii_compatible)
5083 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
5084 else if (CHAR_BYTE8_P (c))
5085 {
5086 c = CHAR_TO_BYTE8 (c);
5087 EMIT_ONE_BYTE (c);
b73bfc1c
KH
5088 }
5089 else
5090 {
df7492f9 5091 unsigned code;
5eb05ea3
KH
5092 struct charset *charset;
5093 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5094 &code, charset);
df7492f9
KH
5095
5096 if (! charset)
b73bfc1c 5097 {
41cbe562 5098 if (coding->mode & CODING_MODE_SAFE_ENCODING)
b73bfc1c 5099 {
41cbe562
KH
5100 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5101 charset = CHARSET_FROM_ID (charset_ascii);
b73bfc1c 5102 }
41cbe562 5103 else
0eecad43 5104 {
41cbe562 5105 c = coding->default_char;
5eb05ea3
KH
5106 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5107 charset_list, &code, charset);
0eecad43 5108 }
4ed46869 5109 }
df7492f9
KH
5110 if (code == CHARSET_INVALID_CODE (charset))
5111 abort ();
5112 if (charset == charset_big5)
b73bfc1c 5113 {
df7492f9
KH
5114 int c1, c2;
5115
5116 c1 = code >> 8, c2 = code & 0xFF;
5117 EMIT_TWO_BYTES (c1, c2);
b73bfc1c 5118 }
df7492f9
KH
5119 else
5120 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4ed46869 5121 }
4ed46869 5122 }
065e3595 5123 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
5124 coding->produced_char += produced_chars;
5125 coding->produced = dst - coding->destination;
5126 return 0;
4ed46869
KH
5127}
5128
5129\f
df7492f9 5130/*** 10. CCL handlers ***/
1397dc18
KH
5131
5132/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5133 Check if a text is encoded in a coding system of which
5134 encoder/decoder are written in CCL program. If it is, return
df7492f9 5135 CATEGORY_MASK_CCL, else return 0. */
1397dc18 5136
0a28aafb 5137static int
cf84bb53
JB
5138detect_coding_ccl (struct coding_system *coding,
5139 struct coding_detection_info *detect_info)
1397dc18 5140{
065e3595 5141 const unsigned char *src = coding->source, *src_base;
8f924df7 5142 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9 5143 int multibytep = coding->src_multibyte;
d311d28c 5144 ptrdiff_t consumed_chars = 0;
df7492f9 5145 int found = 0;
0e219d54 5146 unsigned char *valids;
d311d28c 5147 ptrdiff_t head_ascii = coding->head_ascii;
df7492f9
KH
5148 Lisp_Object attrs;
5149
ff0dacd7
KH
5150 detect_info->checked |= CATEGORY_MASK_CCL;
5151
df7492f9 5152 coding = &coding_categories[coding_category_ccl];
0e219d54 5153 valids = CODING_CCL_VALIDS (coding);
df7492f9
KH
5154 attrs = CODING_ID_ATTRS (coding->id);
5155 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5156 src += head_ascii;
1397dc18 5157
b73bfc1c 5158 while (1)
1397dc18 5159 {
df7492f9 5160 int c;
065e3595
KH
5161
5162 src_base = src;
df7492f9 5163 ONE_MORE_BYTE (c);
065e3595 5164 if (c < 0 || ! valids[c])
df7492f9 5165 break;
ff0dacd7
KH
5166 if ((valids[c] > 1))
5167 found = CATEGORY_MASK_CCL;
df7492f9 5168 }
ff0dacd7 5169 detect_info->rejected |= CATEGORY_MASK_CCL;
df7492f9
KH
5170 return 0;
5171
5172 no_more_source:
ff0dacd7
KH
5173 detect_info->found |= found;
5174 return 1;
df7492f9
KH
5175}
5176
5177static void
971de7fb 5178decode_coding_ccl (struct coding_system *coding)
df7492f9 5179{
7c78e542 5180 const unsigned char *src = coding->source + coding->consumed;
8f924df7 5181 const unsigned char *src_end = coding->source + coding->src_bytes;
69a80ea3
KH
5182 int *charbuf = coding->charbuf + coding->charbuf_used;
5183 int *charbuf_end = coding->charbuf + coding->charbuf_size;
d311d28c 5184 ptrdiff_t consumed_chars = 0;
df7492f9 5185 int multibytep = coding->src_multibyte;
d0396581 5186 struct ccl_program *ccl = &coding->spec.ccl->ccl;
df7492f9 5187 int source_charbuf[1024];
fbdc1721 5188 int source_byteidx[1025];
24a73b0a 5189 Lisp_Object attrs, charset_list;
df7492f9 5190
24a73b0a 5191 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9 5192
d0396581 5193 while (1)
df7492f9 5194 {
7c78e542 5195 const unsigned char *p = src;
df7492f9
KH
5196 int i = 0;
5197
5198 if (multibytep)
fbdc1721
KH
5199 {
5200 while (i < 1024 && p < src_end)
5201 {
5202 source_byteidx[i] = p - src;
5203 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5204 }
5205 source_byteidx[i] = p - src;
5206 }
df7492f9
KH
5207 else
5208 while (i < 1024 && p < src_end)
5209 source_charbuf[i++] = *p++;
8f924df7 5210
df7492f9 5211 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
d0396581
KH
5212 ccl->last_block = 1;
5213 ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5214 charset_list);
5215 charbuf += ccl->produced;
fbdc1721 5216 if (multibytep)
d0396581 5217 src += source_byteidx[ccl->consumed];
df7492f9 5218 else
d0396581
KH
5219 src += ccl->consumed;
5220 consumed_chars += ccl->consumed;
5221 if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
df7492f9
KH
5222 break;
5223 }
5224
d0396581 5225 switch (ccl->status)
df7492f9
KH
5226 {
5227 case CCL_STAT_SUSPEND_BY_SRC:
065e3595 5228 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
df7492f9
KH
5229 break;
5230 case CCL_STAT_SUSPEND_BY_DST:
d0396581 5231 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
df7492f9
KH
5232 break;
5233 case CCL_STAT_QUIT:
5234 case CCL_STAT_INVALID_CMD:
065e3595 5235 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
df7492f9
KH
5236 break;
5237 default:
065e3595 5238 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
5239 break;
5240 }
5241 coding->consumed_char += consumed_chars;
5242 coding->consumed = src - coding->source;
5243 coding->charbuf_used = charbuf - coding->charbuf;
5244}
5245
5246static int
971de7fb 5247encode_coding_ccl (struct coding_system *coding)
df7492f9 5248{
fb608df3 5249 struct ccl_program *ccl = &coding->spec.ccl->ccl;
df7492f9
KH
5250 int multibytep = coding->dst_multibyte;
5251 int *charbuf = coding->charbuf;
5252 int *charbuf_end = charbuf + coding->charbuf_used;
5253 unsigned char *dst = coding->destination + coding->produced;
5254 unsigned char *dst_end = coding->destination + coding->dst_bytes;
df7492f9 5255 int destination_charbuf[1024];
d311d28c 5256 ptrdiff_t produced_chars = 0;
a53e2e89 5257 int i;
24a73b0a 5258 Lisp_Object attrs, charset_list;
df7492f9 5259
24a73b0a 5260 CODING_GET_INFO (coding, attrs, charset_list);
fb608df3
KH
5261 if (coding->consumed_char == coding->src_chars
5262 && coding->mode & CODING_MODE_LAST_BLOCK)
5263 ccl->last_block = 1;
df7492f9 5264
76470ad1 5265 do
df7492f9 5266 {
fb608df3 5267 ccl_driver (ccl, charbuf, destination_charbuf,
8cffd3e7 5268 charbuf_end - charbuf, 1024, charset_list);
df7492f9 5269 if (multibytep)
8cffd3e7 5270 {
fb608df3
KH
5271 ASSURE_DESTINATION (ccl->produced * 2);
5272 for (i = 0; i < ccl->produced; i++)
8cffd3e7
KH
5273 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5274 }
df7492f9
KH
5275 else
5276 {
fb608df3
KH
5277 ASSURE_DESTINATION (ccl->produced);
5278 for (i = 0; i < ccl->produced; i++)
df7492f9 5279 *dst++ = destination_charbuf[i] & 0xFF;
fb608df3 5280 produced_chars += ccl->produced;
df7492f9 5281 }
fb608df3
KH
5282 charbuf += ccl->consumed;
5283 if (ccl->status == CCL_STAT_QUIT
5284 || ccl->status == CCL_STAT_INVALID_CMD)
8cffd3e7 5285 break;
df7492f9 5286 }
76470ad1 5287 while (charbuf < charbuf_end);
df7492f9 5288
fb608df3 5289 switch (ccl->status)
df7492f9
KH
5290 {
5291 case CCL_STAT_SUSPEND_BY_SRC:
065e3595 5292 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
df7492f9
KH
5293 break;
5294 case CCL_STAT_SUSPEND_BY_DST:
065e3595 5295 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
df7492f9
KH
5296 break;
5297 case CCL_STAT_QUIT:
5298 case CCL_STAT_INVALID_CMD:
065e3595 5299 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
df7492f9
KH
5300 break;
5301 default:
065e3595 5302 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 5303 break;
1397dc18 5304 }
df7492f9
KH
5305
5306 coding->produced_char += produced_chars;
5307 coding->produced = dst - coding->destination;
5308 return 0;
1397dc18
KH
5309}
5310
df7492f9 5311
1397dc18 5312\f
df7492f9 5313/*** 10, 11. no-conversion handlers ***/
4ed46869 5314
b73bfc1c 5315/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4ed46869 5316
b73bfc1c 5317static void
971de7fb 5318decode_coding_raw_text (struct coding_system *coding)
4ed46869 5319{
2735d060 5320 int eol_dos =
0a9564cb 5321 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 5322
df7492f9 5323 coding->chars_at_source = 1;
119852e7
KH
5324 coding->consumed_char = coding->src_chars;
5325 coding->consumed = coding->src_bytes;
2735d060 5326 if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
119852e7
KH
5327 {
5328 coding->consumed_char--;
5329 coding->consumed--;
5330 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5331 }
5332 else
5333 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 5334}
4ed46869 5335
df7492f9 5336static int
971de7fb 5337encode_coding_raw_text (struct coding_system *coding)
df7492f9
KH
5338{
5339 int multibytep = coding->dst_multibyte;
5340 int *charbuf = coding->charbuf;
5341 int *charbuf_end = coding->charbuf + coding->charbuf_used;
5342 unsigned char *dst = coding->destination + coding->produced;
5343 unsigned char *dst_end = coding->destination + coding->dst_bytes;
d311d28c 5344 ptrdiff_t produced_chars = 0;
b73bfc1c
KH
5345 int c;
5346
df7492f9 5347 if (multibytep)
b73bfc1c 5348 {
df7492f9 5349 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4ed46869 5350
df7492f9
KH
5351 if (coding->src_multibyte)
5352 while (charbuf < charbuf_end)
5353 {
5354 ASSURE_DESTINATION (safe_room);
5355 c = *charbuf++;
5356 if (ASCII_CHAR_P (c))
5357 EMIT_ONE_ASCII_BYTE (c);
5358 else if (CHAR_BYTE8_P (c))
5359 {
5360 c = CHAR_TO_BYTE8 (c);
5361 EMIT_ONE_BYTE (c);
5362 }
5363 else
5364 {
5365 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
93dec019 5366
df7492f9 5367 CHAR_STRING_ADVANCE (c, p1);
8abc3f12 5368 do
9d123124
KH
5369 {
5370 EMIT_ONE_BYTE (*p0);
5371 p0++;
5372 }
8abc3f12 5373 while (p0 < p1);
df7492f9
KH
5374 }
5375 }
b73bfc1c 5376 else
df7492f9
KH
5377 while (charbuf < charbuf_end)
5378 {
5379 ASSURE_DESTINATION (safe_room);
5380 c = *charbuf++;
5381 EMIT_ONE_BYTE (c);
5382 }
5383 }
5384 else
4ed46869 5385 {
df7492f9 5386 if (coding->src_multibyte)
d46c5b12 5387 {
df7492f9
KH
5388 int safe_room = MAX_MULTIBYTE_LENGTH;
5389
5390 while (charbuf < charbuf_end)
d46c5b12 5391 {
df7492f9
KH
5392 ASSURE_DESTINATION (safe_room);
5393 c = *charbuf++;
5394 if (ASCII_CHAR_P (c))
5395 *dst++ = c;
5396 else if (CHAR_BYTE8_P (c))
5397 *dst++ = CHAR_TO_BYTE8 (c);
b73bfc1c 5398 else
df7492f9 5399 CHAR_STRING_ADVANCE (c, dst);
d46c5b12
KH
5400 }
5401 }
df7492f9
KH
5402 else
5403 {
5404 ASSURE_DESTINATION (charbuf_end - charbuf);
5405 while (charbuf < charbuf_end && dst < dst_end)
5406 *dst++ = *charbuf++;
8f924df7 5407 }
319a3947 5408 produced_chars = dst - (coding->destination + coding->produced);
4ed46869 5409 }
065e3595 5410 record_conversion_result (coding, CODING_RESULT_SUCCESS);
a0ed9b27 5411 coding->produced_char += produced_chars;
df7492f9
KH
5412 coding->produced = dst - coding->destination;
5413 return 0;
4ed46869
KH
5414}
5415
ff0dacd7
KH
5416/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5417 Check if a text is encoded in a charset-based coding system. If it
5418 is, return 1, else return 0. */
5419
0a28aafb 5420static int
cf84bb53
JB
5421detect_coding_charset (struct coding_system *coding,
5422 struct coding_detection_info *detect_info)
1397dc18 5423{
065e3595 5424 const unsigned char *src = coding->source, *src_base;
8f924df7 5425 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9 5426 int multibytep = coding->src_multibyte;
d311d28c 5427 ptrdiff_t consumed_chars = 0;
07295713 5428 Lisp_Object attrs, valids, name;
584948ac 5429 int found = 0;
d311d28c 5430 ptrdiff_t head_ascii = coding->head_ascii;
07295713 5431 int check_latin_extra = 0;
1397dc18 5432
ff0dacd7
KH
5433 detect_info->checked |= CATEGORY_MASK_CHARSET;
5434
df7492f9
KH
5435 coding = &coding_categories[coding_category_charset];
5436 attrs = CODING_ID_ATTRS (coding->id);
5437 valids = AREF (attrs, coding_attr_charset_valids);
07295713 5438 name = CODING_ID_NAME (coding->id);
51b59d79 5439 if (strncmp (SSDATA (SYMBOL_NAME (name)),
237aabf4 5440 "iso-8859-", sizeof ("iso-8859-") - 1) == 0
51b59d79 5441 || strncmp (SSDATA (SYMBOL_NAME (name)),
237aabf4 5442 "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
07295713 5443 check_latin_extra = 1;
237aabf4 5444
df7492f9 5445 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
716b3fa0 5446 src += head_ascii;
1397dc18 5447
b73bfc1c 5448 while (1)
1397dc18 5449 {
df7492f9 5450 int c;
716b3fa0
KH
5451 Lisp_Object val;
5452 struct charset *charset;
5453 int dim, idx;
1397dc18 5454
065e3595 5455 src_base = src;
df7492f9 5456 ONE_MORE_BYTE (c);
065e3595
KH
5457 if (c < 0)
5458 continue;
716b3fa0
KH
5459 val = AREF (valids, c);
5460 if (NILP (val))
df7492f9 5461 break;
584948ac 5462 if (c >= 0x80)
07295713
KH
5463 {
5464 if (c < 0xA0
237aabf4
JR
5465 && check_latin_extra
5466 && (!VECTORP (Vlatin_extra_code_table)
9f0526cb 5467 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])))
07295713
KH
5468 break;
5469 found = CATEGORY_MASK_CHARSET;
5470 }
716b3fa0
KH
5471 if (INTEGERP (val))
5472 {
5473 charset = CHARSET_FROM_ID (XFASTINT (val));
5474 dim = CHARSET_DIMENSION (charset);
5475 for (idx = 1; idx < dim; idx++)
5476 {
5477 if (src == src_end)
5478 goto too_short;
5479 ONE_MORE_BYTE (c);
2f9442b8
PE
5480 if (c < charset->code_space[(dim - 1 - idx) * 4]
5481 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
716b3fa0
KH
5482 break;
5483 }
5484 if (idx < dim)
5485 break;
5486 }
5487 else
5488 {
5489 idx = 1;
5490 for (; CONSP (val); val = XCDR (val))
5491 {
5492 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5493 dim = CHARSET_DIMENSION (charset);
5494 while (idx < dim)
5495 {
5496 if (src == src_end)
5497 goto too_short;
5498 ONE_MORE_BYTE (c);
5499 if (c < charset->code_space[(dim - 1 - idx) * 4]
5500 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5501 break;
5502 idx++;
5503 }
5504 if (idx == dim)
5505 {
5506 val = Qnil;
5507 break;
5508 }
5509 }
5510 if (CONSP (val))
5511 break;
5512 }
df7492f9 5513 }
716b3fa0 5514 too_short:
ff0dacd7 5515 detect_info->rejected |= CATEGORY_MASK_CHARSET;
df7492f9 5516 return 0;
4ed46869 5517
df7492f9 5518 no_more_source:
ff0dacd7
KH
5519 detect_info->found |= found;
5520 return 1;
df7492f9 5521}
b73bfc1c 5522
b73bfc1c 5523static void
971de7fb 5524decode_coding_charset (struct coding_system *coding)
4ed46869 5525{
8f924df7
KH
5526 const unsigned char *src = coding->source + coding->consumed;
5527 const unsigned char *src_end = coding->source + coding->src_bytes;
5528 const unsigned char *src_base;
69a80ea3 5529 int *charbuf = coding->charbuf + coding->charbuf_used;
ad1746f5 5530 /* We may produce one charset annotation in one loop and one more at
df80c7f0 5531 the end. */
69a80ea3 5532 int *charbuf_end
df80c7f0 5533 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
d311d28c 5534 ptrdiff_t consumed_chars = 0, consumed_chars_base;
df7492f9 5535 int multibytep = coding->src_multibyte;
66ebf983
PE
5536 Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5537 Lisp_Object valids;
d311d28c
PE
5538 ptrdiff_t char_offset = coding->produced_char;
5539 ptrdiff_t last_offset = char_offset;
ff0dacd7 5540 int last_id = charset_ascii;
2735d060 5541 int eol_dos =
0a9564cb 5542 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 5543 int byte_after_cr = -1;
df7492f9 5544
4eb6d3f1 5545 valids = AREF (attrs, coding_attr_charset_valids);
b73bfc1c 5546
df7492f9 5547 while (1)
4ed46869 5548 {
4eb6d3f1 5549 int c;
24a73b0a
KH
5550 Lisp_Object val;
5551 struct charset *charset;
5552 int dim;
5553 int len = 1;
5554 unsigned code;
df7492f9
KH
5555
5556 src_base = src;
5557 consumed_chars_base = consumed_chars;
b73bfc1c 5558
df7492f9 5559 if (charbuf >= charbuf_end)
b71f6f73
KH
5560 {
5561 if (byte_after_cr >= 0)
5562 src_base--;
5563 break;
5564 }
df7492f9 5565
119852e7
KH
5566 if (byte_after_cr >= 0)
5567 {
5568 c = byte_after_cr;
5569 byte_after_cr = -1;
5570 }
5571 else
5572 {
5573 ONE_MORE_BYTE (c);
2735d060 5574 if (eol_dos && c == '\r')
119852e7
KH
5575 ONE_MORE_BYTE (byte_after_cr);
5576 }
065e3595
KH
5577 if (c < 0)
5578 goto invalid_code;
24a73b0a
KH
5579 code = c;
5580
5581 val = AREF (valids, c);
1b17adfd 5582 if (! INTEGERP (val) && ! CONSP (val))
24a73b0a
KH
5583 goto invalid_code;
5584 if (INTEGERP (val))
d46c5b12 5585 {
24a73b0a
KH
5586 charset = CHARSET_FROM_ID (XFASTINT (val));
5587 dim = CHARSET_DIMENSION (charset);
5588 while (len < dim)
b73bfc1c 5589 {
24a73b0a
KH
5590 ONE_MORE_BYTE (c);
5591 code = (code << 8) | c;
5592 len++;
b73bfc1c 5593 }
24a73b0a
KH
5594 CODING_DECODE_CHAR (coding, src, src_base, src_end,
5595 charset, code, c);
d46c5b12 5596 }
df7492f9 5597 else
d46c5b12 5598 {
24a73b0a
KH
5599 /* VAL is a list of charset IDs. It is assured that the
5600 list is sorted by charset dimensions (smaller one
5601 comes first). */
5602 while (CONSP (val))
4eb6d3f1 5603 {
24a73b0a 5604 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
c7c66a95 5605 dim = CHARSET_DIMENSION (charset);
f9d71dcd 5606 while (len < dim)
4eb6d3f1 5607 {
acb2a965
KH
5608 ONE_MORE_BYTE (c);
5609 code = (code << 8) | c;
f9d71dcd 5610 len++;
4eb6d3f1 5611 }
24a73b0a
KH
5612 CODING_DECODE_CHAR (coding, src, src_base,
5613 src_end, charset, code, c);
5614 if (c >= 0)
5615 break;
5616 val = XCDR (val);
ff0dacd7 5617 }
d46c5b12 5618 }
24a73b0a
KH
5619 if (c < 0)
5620 goto invalid_code;
5621 if (charset->id != charset_ascii
5622 && last_id != charset->id)
5623 {
5624 if (last_id != charset_ascii)
69a80ea3 5625 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
24a73b0a
KH
5626 last_id = charset->id;
5627 last_offset = char_offset;
5628 }
5629
df7492f9 5630 *charbuf++ = c;
ff0dacd7 5631 char_offset++;
df7492f9
KH
5632 continue;
5633
5634 invalid_code:
5635 src = src_base;
5636 consumed_chars = consumed_chars_base;
5637 ONE_MORE_BYTE (c);
065e3595 5638 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 5639 char_offset++;
df7492f9 5640 coding->errors++;
4ed46869
KH
5641 }
5642
df7492f9 5643 no_more_source:
ff0dacd7 5644 if (last_id != charset_ascii)
69a80ea3 5645 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
5646 coding->consumed_char += consumed_chars_base;
5647 coding->consumed = src_base - coding->source;
5648 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
5649}
5650
df7492f9 5651static int
971de7fb 5652encode_coding_charset (struct coding_system *coding)
4ed46869 5653{
df7492f9
KH
5654 int multibytep = coding->dst_multibyte;
5655 int *charbuf = coding->charbuf;
5656 int *charbuf_end = charbuf + coding->charbuf_used;
5657 unsigned char *dst = coding->destination + coding->produced;
5658 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5659 int safe_room = MAX_MULTIBYTE_LENGTH;
d311d28c 5660 ptrdiff_t produced_chars = 0;
24a73b0a 5661 Lisp_Object attrs, charset_list;
df7492f9 5662 int ascii_compatible;
b73bfc1c 5663 int c;
b73bfc1c 5664
24a73b0a 5665 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9 5666 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
fb88bf2d 5667
df7492f9 5668 while (charbuf < charbuf_end)
4ed46869 5669 {
4eb6d3f1 5670 struct charset *charset;
df7492f9 5671 unsigned code;
8f924df7 5672
df7492f9
KH
5673 ASSURE_DESTINATION (safe_room);
5674 c = *charbuf++;
5675 if (ascii_compatible && ASCII_CHAR_P (c))
5676 EMIT_ONE_ASCII_BYTE (c);
16eafb5d 5677 else if (CHAR_BYTE8_P (c))
4ed46869 5678 {
16eafb5d
KH
5679 c = CHAR_TO_BYTE8 (c);
5680 EMIT_ONE_BYTE (c);
d46c5b12 5681 }
d46c5b12 5682 else
b73bfc1c 5683 {
5eb05ea3
KH
5684 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5685 &code, charset);
5686
4eb6d3f1
KH
5687 if (charset)
5688 {
5689 if (CHARSET_DIMENSION (charset) == 1)
5690 EMIT_ONE_BYTE (code);
5691 else if (CHARSET_DIMENSION (charset) == 2)
5692 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5693 else if (CHARSET_DIMENSION (charset) == 3)
5694 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5695 else
5696 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5697 (code >> 8) & 0xFF, code & 0xFF);
5698 }
5699 else
41cbe562
KH
5700 {
5701 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5702 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5703 else
5704 c = coding->default_char;
5705 EMIT_ONE_BYTE (c);
5706 }
4ed46869 5707 }
4ed46869
KH
5708 }
5709
065e3595 5710 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
5711 coding->produced_char += produced_chars;
5712 coding->produced = dst - coding->destination;
5713 return 0;
4ed46869
KH
5714}
5715
5716\f
1397dc18 5717/*** 10. C library functions ***/
4ed46869 5718
df7492f9
KH
5719/* Setup coding context CODING from information about CODING_SYSTEM.
5720 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
5721 CODING_SYSTEM is invalid, signal an error. */
4ed46869 5722
ec6d2bb8 5723void
971de7fb 5724setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
4ed46869 5725{
df7492f9
KH
5726 Lisp_Object attrs;
5727 Lisp_Object eol_type;
5728 Lisp_Object coding_type;
4608c386 5729 Lisp_Object val;
4ed46869 5730
df7492f9 5731 if (NILP (coding_system))
ae6f73fa 5732 coding_system = Qundecided;
c07c8e12 5733
df7492f9 5734 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
1f5dbf34 5735
df7492f9 5736 attrs = CODING_ID_ATTRS (coding->id);
0a9564cb 5737 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
1f5dbf34 5738
df7492f9
KH
5739 coding->mode = 0;
5740 coding->head_ascii = -1;
4a015c45
KH
5741 if (VECTORP (eol_type))
5742 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5743 | CODING_REQUIRE_DETECTION_MASK);
5744 else if (! EQ (eol_type, Qunix))
5745 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5746 | CODING_REQUIRE_ENCODING_MASK);
5747 else
5748 coding->common_flags = 0;
5e5c78be
KH
5749 if (! NILP (CODING_ATTR_POST_READ (attrs)))
5750 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5751 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5752 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
8f924df7
KH
5753 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5754 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
1f5dbf34 5755
df7492f9 5756 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7 5757 coding->max_charset_id = SCHARS (val) - 1;
1b3b981b 5758 coding->safe_charsets = SDATA (val);
df7492f9 5759 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
624bda09 5760 coding->carryover_bytes = 0;
4608c386 5761
df7492f9
KH
5762 coding_type = CODING_ATTR_TYPE (attrs);
5763 if (EQ (coding_type, Qundecided))
d46c5b12 5764 {
df7492f9
KH
5765 coding->detector = NULL;
5766 coding->decoder = decode_coding_raw_text;
5767 coding->encoder = encode_coding_raw_text;
5768 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
d46c5b12 5769 }
df7492f9 5770 else if (EQ (coding_type, Qiso_2022))
d46c5b12 5771 {
df7492f9
KH
5772 int i;
5773 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5774
5775 /* Invoke graphic register 0 to plane 0. */
5776 CODING_ISO_INVOCATION (coding, 0) = 0;
5777 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5778 CODING_ISO_INVOCATION (coding, 1)
5779 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5780 /* Setup the initial status of designation. */
5781 for (i = 0; i < 4; i++)
5782 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5783 /* Not single shifting initially. */
5784 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5785 /* Beginning of buffer should also be regarded as bol. */
5786 CODING_ISO_BOL (coding) = 1;
5787 coding->detector = detect_coding_iso_2022;
5788 coding->decoder = decode_coding_iso_2022;
5789 coding->encoder = encode_coding_iso_2022;
5790 if (flags & CODING_ISO_FLAG_SAFE)
5791 coding->mode |= CODING_MODE_SAFE_ENCODING;
d46c5b12 5792 coding->common_flags
df7492f9
KH
5793 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5794 | CODING_REQUIRE_FLUSHING_MASK);
5795 if (flags & CODING_ISO_FLAG_COMPOSITION)
5796 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
ff0dacd7
KH
5797 if (flags & CODING_ISO_FLAG_DESIGNATION)
5798 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
df7492f9
KH
5799 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5800 {
5801 setup_iso_safe_charsets (attrs);
5802 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7 5803 coding->max_charset_id = SCHARS (val) - 1;
1b3b981b 5804 coding->safe_charsets = SDATA (val);
df7492f9
KH
5805 }
5806 CODING_ISO_FLAGS (coding) = flags;
e951386e
KH
5807 CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5808 CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5809 CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5810 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
d46c5b12 5811 }
df7492f9 5812 else if (EQ (coding_type, Qcharset))
d46c5b12 5813 {
df7492f9
KH
5814 coding->detector = detect_coding_charset;
5815 coding->decoder = decode_coding_charset;
5816 coding->encoder = encode_coding_charset;
d46c5b12 5817 coding->common_flags
df7492f9 5818 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
d46c5b12 5819 }
df7492f9 5820 else if (EQ (coding_type, Qutf_8))
d46c5b12 5821 {
a470d443
KH
5822 val = AREF (attrs, coding_attr_utf_bom);
5823 CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5824 : EQ (val, Qt) ? utf_with_bom
5825 : utf_without_bom);
df7492f9
KH
5826 coding->detector = detect_coding_utf_8;
5827 coding->decoder = decode_coding_utf_8;
5828 coding->encoder = encode_coding_utf_8;
5829 coding->common_flags
5830 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
a470d443
KH
5831 if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5832 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
df7492f9
KH
5833 }
5834 else if (EQ (coding_type, Qutf_16))
5835 {
a470d443
KH
5836 val = AREF (attrs, coding_attr_utf_bom);
5837 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5838 : EQ (val, Qt) ? utf_with_bom
5839 : utf_without_bom);
df7492f9 5840 val = AREF (attrs, coding_attr_utf_16_endian);
b49a1807 5841 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
df7492f9 5842 : utf_16_little_endian);
e19c3639 5843 CODING_UTF_16_SURROGATE (coding) = 0;
df7492f9
KH
5844 coding->detector = detect_coding_utf_16;
5845 coding->decoder = decode_coding_utf_16;
5846 coding->encoder = encode_coding_utf_16;
5847 coding->common_flags
5848 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
a470d443 5849 if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
b49a1807 5850 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
d46c5b12 5851 }
df7492f9 5852 else if (EQ (coding_type, Qccl))
4ed46869 5853 {
df7492f9
KH
5854 coding->detector = detect_coding_ccl;
5855 coding->decoder = decode_coding_ccl;
5856 coding->encoder = encode_coding_ccl;
c952af22 5857 coding->common_flags
df7492f9
KH
5858 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5859 | CODING_REQUIRE_FLUSHING_MASK);
5860 }
5861 else if (EQ (coding_type, Qemacs_mule))
5862 {
5863 coding->detector = detect_coding_emacs_mule;
5864 coding->decoder = decode_coding_emacs_mule;
5865 coding->encoder = encode_coding_emacs_mule;
c952af22 5866 coding->common_flags
df7492f9 5867 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
e951386e 5868 coding->spec.emacs_mule.full_support = 1;
df7492f9
KH
5869 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5870 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5871 {
5872 Lisp_Object tail, safe_charsets;
5873 int max_charset_id = 0;
5874
5875 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5876 tail = XCDR (tail))
5877 if (max_charset_id < XFASTINT (XCAR (tail)))
5878 max_charset_id = XFASTINT (XCAR (tail));
1b3b981b
AS
5879 safe_charsets = make_uninit_string (max_charset_id + 1);
5880 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
df7492f9
KH
5881 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5882 tail = XCDR (tail))
8f924df7 5883 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
df7492f9 5884 coding->max_charset_id = max_charset_id;
1b3b981b 5885 coding->safe_charsets = SDATA (safe_charsets);
e951386e 5886 coding->spec.emacs_mule.full_support = 1;
df7492f9 5887 }
e951386e
KH
5888 coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5889 coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
df7492f9
KH
5890 }
5891 else if (EQ (coding_type, Qshift_jis))
5892 {
5893 coding->detector = detect_coding_sjis;
5894 coding->decoder = decode_coding_sjis;
5895 coding->encoder = encode_coding_sjis;
c952af22 5896 coding->common_flags
df7492f9
KH
5897 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5898 }
5899 else if (EQ (coding_type, Qbig5))
5900 {
5901 coding->detector = detect_coding_big5;
5902 coding->decoder = decode_coding_big5;
5903 coding->encoder = encode_coding_big5;
c952af22 5904 coding->common_flags
df7492f9
KH
5905 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5906 }
5907 else /* EQ (coding_type, Qraw_text) */
ec6d2bb8 5908 {
df7492f9
KH
5909 coding->detector = NULL;
5910 coding->decoder = decode_coding_raw_text;
5911 coding->encoder = encode_coding_raw_text;
ea29edf2
KH
5912 if (! EQ (eol_type, Qunix))
5913 {
5914 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5915 if (! VECTORP (eol_type))
5916 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5917 }
5918
4ed46869 5919 }
4ed46869 5920
df7492f9 5921 return;
4ed46869
KH
5922}
5923
0ff61e78
KH
5924/* Return a list of charsets supported by CODING. */
5925
5926Lisp_Object
971de7fb 5927coding_charset_list (struct coding_system *coding)
0ff61e78 5928{
35befdaa 5929 Lisp_Object attrs, charset_list;
0ff61e78
KH
5930
5931 CODING_GET_INFO (coding, attrs, charset_list);
5932 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5933 {
5934 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5935
5936 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5937 charset_list = Viso_2022_charset_list;
5938 }
5939 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5940 {
5941 charset_list = Vemacs_mule_charset_list;
5942 }
5943 return charset_list;
5944}
5945
5946
e9f91ece
KH
5947/* Return a list of charsets supported by CODING-SYSTEM. */
5948
5949Lisp_Object
971de7fb 5950coding_system_charset_list (Lisp_Object coding_system)
e9f91ece 5951{
d3411f89 5952 ptrdiff_t id;
e9f91ece
KH
5953 Lisp_Object attrs, charset_list;
5954
5955 CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5956 attrs = CODING_ID_ATTRS (id);
5957
5958 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5959 {
5960 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5961
5962 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5963 charset_list = Viso_2022_charset_list;
5964 else
5965 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5966 }
5967 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5968 {
5969 charset_list = Vemacs_mule_charset_list;
5970 }
5971 else
5972 {
5973 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5974 }
5975 return charset_list;
5976}
5977
5978
df7492f9
KH
5979/* Return raw-text or one of its subsidiaries that has the same
5980 eol_type as CODING-SYSTEM. */
ec6d2bb8 5981
df7492f9 5982Lisp_Object
971de7fb 5983raw_text_coding_system (Lisp_Object coding_system)
ec6d2bb8 5984{
0be8721c 5985 Lisp_Object spec, attrs;
df7492f9 5986 Lisp_Object eol_type, raw_text_eol_type;
ec6d2bb8 5987
d3e4cb56
KH
5988 if (NILP (coding_system))
5989 return Qraw_text;
df7492f9
KH
5990 spec = CODING_SYSTEM_SPEC (coding_system);
5991 attrs = AREF (spec, 0);
ec6d2bb8 5992
df7492f9
KH
5993 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5994 return coding_system;
ec6d2bb8 5995
df7492f9
KH
5996 eol_type = AREF (spec, 2);
5997 if (VECTORP (eol_type))
5998 return Qraw_text;
5999 spec = CODING_SYSTEM_SPEC (Qraw_text);
6000 raw_text_eol_type = AREF (spec, 2);
6001 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
6002 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
6003 : AREF (raw_text_eol_type, 2));
ec6d2bb8
KH
6004}
6005
54f78171 6006
1911a33b
KH
6007/* If CODING_SYSTEM doesn't specify end-of-line format, return one of
6008 the subsidiary that has the same eol-spec as PARENT (if it is not
6009 nil and specifies end-of-line format) or the system's setting
fcbcfb64 6010 (system_eol_type). */
df7492f9
KH
6011
6012Lisp_Object
971de7fb 6013coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
54f78171 6014{
3e139625 6015 Lisp_Object spec, eol_type;
54f78171 6016
d3e4cb56
KH
6017 if (NILP (coding_system))
6018 coding_system = Qraw_text;
df7492f9 6019 spec = CODING_SYSTEM_SPEC (coding_system);
df7492f9 6020 eol_type = AREF (spec, 2);
fcbcfb64 6021 if (VECTORP (eol_type))
df7492f9 6022 {
df7492f9
KH
6023 Lisp_Object parent_eol_type;
6024
fcbcfb64
KH
6025 if (! NILP (parent))
6026 {
6027 Lisp_Object parent_spec;
6028
4a015c45 6029 parent_spec = CODING_SYSTEM_SPEC (parent);
fcbcfb64 6030 parent_eol_type = AREF (parent_spec, 2);
1911a33b 6031 if (VECTORP (parent_eol_type))
4628bef1 6032 parent_eol_type = system_eol_type;
fcbcfb64
KH
6033 }
6034 else
6035 parent_eol_type = system_eol_type;
df7492f9
KH
6036 if (EQ (parent_eol_type, Qunix))
6037 coding_system = AREF (eol_type, 0);
6038 else if (EQ (parent_eol_type, Qdos))
6039 coding_system = AREF (eol_type, 1);
6040 else if (EQ (parent_eol_type, Qmac))
6041 coding_system = AREF (eol_type, 2);
54f78171 6042 }
df7492f9 6043 return coding_system;
54f78171
KH
6044}
6045
fcaf8878
KH
6046
6047/* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6048 decided for writing to a process. If not, complement them, and
6049 return a new coding system. */
6050
6051Lisp_Object
4628bef1 6052complement_process_encoding_system (Lisp_Object coding_system)
fcaf8878 6053{
5886ec9c
KH
6054 Lisp_Object coding_base = Qnil, eol_base = Qnil;
6055 Lisp_Object spec, attrs;
93d50df8 6056 int i;
fcaf8878 6057
93d50df8 6058 for (i = 0; i < 3; i++)
fcaf8878 6059 {
93d50df8
KH
6060 if (i == 1)
6061 coding_system = CDR_SAFE (Vdefault_process_coding_system);
6062 else if (i == 2)
6063 coding_system = preferred_coding_system ();
6064 spec = CODING_SYSTEM_SPEC (coding_system);
6065 if (NILP (spec))
6066 continue;
6067 attrs = AREF (spec, 0);
6068 if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6069 coding_base = CODING_ATTR_BASE_NAME (attrs);
6070 if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6071 eol_base = coding_system;
6072 if (! NILP (coding_base) && ! NILP (eol_base))
6073 break;
fcaf8878 6074 }
fcaf8878 6075
93d50df8
KH
6076 if (i > 0)
6077 /* The original CODING_SYSTEM didn't specify text-conversion or
6078 eol-conversion. Be sure that we return a fully complemented
6079 coding system. */
6080 coding_system = coding_inherit_eol_type (coding_base, eol_base);
6081 return coding_system;
fcaf8878
KH
6082}
6083
6084
4ed46869
KH
6085/* Emacs has a mechanism to automatically detect a coding system if it
6086 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
6087 it's impossible to distinguish some coding systems accurately
6088 because they use the same range of codes. So, at first, coding
6089 systems are categorized into the following categories:
6090
0ef69138 6091 o coding-category-emacs-mule
4ed46869
KH
6092
6093 The category for a coding system which has the same code range
6094 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 6095 symbol) `emacs-mule' by default.
4ed46869
KH
6096
6097 o coding-category-sjis
6098
6099 The category for a coding system which has the same code range
6100 as SJIS. Assigned the coding-system (Lisp
7717c392 6101 symbol) `japanese-shift-jis' by default.
4ed46869
KH
6102
6103 o coding-category-iso-7
6104
6105 The category for a coding system which has the same code range
7717c392 6106 as ISO2022 of 7-bit environment. This doesn't use any locking
d46c5b12
KH
6107 shift and single shift functions. This can encode/decode all
6108 charsets. Assigned the coding-system (Lisp symbol)
6109 `iso-2022-7bit' by default.
6110
6111 o coding-category-iso-7-tight
6112
6113 Same as coding-category-iso-7 except that this can
6114 encode/decode only the specified charsets.
4ed46869
KH
6115
6116 o coding-category-iso-8-1
6117
6118 The category for a coding system which has the same code range
6119 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
6120 for DIMENSION1 charset. This doesn't use any locking shift
6121 and single shift functions. Assigned the coding-system (Lisp
6122 symbol) `iso-latin-1' by default.
4ed46869
KH
6123
6124 o coding-category-iso-8-2
6125
6126 The category for a coding system which has the same code range
6127 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
6128 for DIMENSION2 charset. This doesn't use any locking shift
6129 and single shift functions. Assigned the coding-system (Lisp
6130 symbol) `japanese-iso-8bit' by default.
4ed46869 6131
7717c392 6132 o coding-category-iso-7-else
4ed46869
KH
6133
6134 The category for a coding system which has the same code range
ad1746f5 6135 as ISO2022 of 7-bit environment but uses locking shift or
7717c392
KH
6136 single shift functions. Assigned the coding-system (Lisp
6137 symbol) `iso-2022-7bit-lock' by default.
6138
6139 o coding-category-iso-8-else
6140
6141 The category for a coding system which has the same code range
ad1746f5 6142 as ISO2022 of 8-bit environment but uses locking shift or
7717c392
KH
6143 single shift functions. Assigned the coding-system (Lisp
6144 symbol) `iso-2022-8bit-ss2' by default.
4ed46869
KH
6145
6146 o coding-category-big5
6147
6148 The category for a coding system which has the same code range
6149 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 6150 `cn-big5' by default.
4ed46869 6151
fa42c37f
KH
6152 o coding-category-utf-8
6153
6154 The category for a coding system which has the same code range
6e76ae91 6155 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
fa42c37f
KH
6156 symbol) `utf-8' by default.
6157
6158 o coding-category-utf-16-be
6159
6160 The category for a coding system in which a text has an
6161 Unicode signature (cf. Unicode Standard) in the order of BIG
6162 endian at the head. Assigned the coding-system (Lisp symbol)
6163 `utf-16-be' by default.
6164
6165 o coding-category-utf-16-le
6166
6167 The category for a coding system in which a text has an
6168 Unicode signature (cf. Unicode Standard) in the order of
6169 LITTLE endian at the head. Assigned the coding-system (Lisp
6170 symbol) `utf-16-le' by default.
6171
1397dc18
KH
6172 o coding-category-ccl
6173
6174 The category for a coding system of which encoder/decoder is
6175 written in CCL programs. The default value is nil, i.e., no
6176 coding system is assigned.
6177
4ed46869
KH
6178 o coding-category-binary
6179
6180 The category for a coding system not categorized in any of the
6181 above. Assigned the coding-system (Lisp symbol)
e0e989f6 6182 `no-conversion' by default.
4ed46869
KH
6183
6184 Each of them is a Lisp symbol and the value is an actual
df7492f9 6185 `coding-system's (this is also a Lisp symbol) assigned by a user.
4ed46869
KH
6186 What Emacs does actually is to detect a category of coding system.
6187 Then, it uses a `coding-system' assigned to it. If Emacs can't
df7492f9 6188 decide only one possible category, it selects a category of the
4ed46869
KH
6189 highest priority. Priorities of categories are also specified by a
6190 user in a Lisp variable `coding-category-list'.
6191
6192*/
6193
df7492f9
KH
6194#define EOL_SEEN_NONE 0
6195#define EOL_SEEN_LF 1
6196#define EOL_SEEN_CR 2
6197#define EOL_SEEN_CRLF 4
66cfb530 6198
ff0dacd7
KH
6199/* Detect how end-of-line of a text of length SRC_BYTES pointed by
6200 SOURCE is encoded. If CATEGORY is one of
6201 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6202 two-byte, else they are encoded by one-byte.
6203
6204 Return one of EOL_SEEN_XXX. */
4ed46869 6205
bc4bc72a 6206#define MAX_EOL_CHECK_COUNT 3
d46c5b12
KH
6207
6208static int
d311d28c 6209detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
cf84bb53 6210 enum coding_category category)
4ed46869 6211{
f6cbaf43 6212 const unsigned char *src = source, *src_end = src + src_bytes;
4ed46869 6213 unsigned char c;
df7492f9
KH
6214 int total = 0;
6215 int eol_seen = EOL_SEEN_NONE;
4ed46869 6216
89528eb3 6217 if ((1 << category) & CATEGORY_MASK_UTF_16)
4ed46869 6218 {
df7492f9 6219 int msb, lsb;
fa42c37f 6220
89528eb3
KH
6221 msb = category == (coding_category_utf_16_le
6222 | coding_category_utf_16_le_nosig);
df7492f9 6223 lsb = 1 - msb;
fa42c37f 6224
df7492f9 6225 while (src + 1 < src_end)
fa42c37f 6226 {
df7492f9
KH
6227 c = src[lsb];
6228 if (src[msb] == 0 && (c == '\n' || c == '\r'))
fa42c37f 6229 {
df7492f9
KH
6230 int this_eol;
6231
6232 if (c == '\n')
6233 this_eol = EOL_SEEN_LF;
6234 else if (src + 3 >= src_end
6235 || src[msb + 2] != 0
6236 || src[lsb + 2] != '\n')
6237 this_eol = EOL_SEEN_CR;
fa42c37f 6238 else
75f4f1ac
EZ
6239 {
6240 this_eol = EOL_SEEN_CRLF;
6241 src += 2;
6242 }
df7492f9
KH
6243
6244 if (eol_seen == EOL_SEEN_NONE)
6245 /* This is the first end-of-line. */
6246 eol_seen = this_eol;
6247 else if (eol_seen != this_eol)
fa42c37f 6248 {
75f4f1ac
EZ
6249 /* The found type is different from what found before.
6250 Allow for stray ^M characters in DOS EOL files. */
ef1b0ba7
SM
6251 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6252 || (eol_seen == EOL_SEEN_CRLF
6253 && this_eol == EOL_SEEN_CR))
75f4f1ac
EZ
6254 eol_seen = EOL_SEEN_CRLF;
6255 else
6256 {
6257 eol_seen = EOL_SEEN_LF;
6258 break;
6259 }
fa42c37f 6260 }
df7492f9
KH
6261 if (++total == MAX_EOL_CHECK_COUNT)
6262 break;
fa42c37f 6263 }
df7492f9 6264 src += 2;
fa42c37f 6265 }
bcf26d6a 6266 }
d46c5b12 6267 else
ef1b0ba7
SM
6268 while (src < src_end)
6269 {
6270 c = *src++;
6271 if (c == '\n' || c == '\r')
6272 {
6273 int this_eol;
d46c5b12 6274
ef1b0ba7
SM
6275 if (c == '\n')
6276 this_eol = EOL_SEEN_LF;
6277 else if (src >= src_end || *src != '\n')
6278 this_eol = EOL_SEEN_CR;
6279 else
6280 this_eol = EOL_SEEN_CRLF, src++;
d46c5b12 6281
ef1b0ba7
SM
6282 if (eol_seen == EOL_SEEN_NONE)
6283 /* This is the first end-of-line. */
6284 eol_seen = this_eol;
6285 else if (eol_seen != this_eol)
6286 {
6287 /* The found type is different from what found before.
6288 Allow for stray ^M characters in DOS EOL files. */
6289 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6290 || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6291 eol_seen = EOL_SEEN_CRLF;
6292 else
6293 {
6294 eol_seen = EOL_SEEN_LF;
6295 break;
6296 }
6297 }
6298 if (++total == MAX_EOL_CHECK_COUNT)
6299 break;
6300 }
6301 }
df7492f9 6302 return eol_seen;
73be902c
KH
6303}
6304
df7492f9 6305
24a73b0a 6306static Lisp_Object
971de7fb 6307adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
73be902c 6308{
0be8721c 6309 Lisp_Object eol_type;
8f924df7 6310
df7492f9
KH
6311 eol_type = CODING_ID_EOL_TYPE (coding->id);
6312 if (eol_seen & EOL_SEEN_LF)
24a73b0a
KH
6313 {
6314 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6315 eol_type = Qunix;
6316 }
6f197c07 6317 else if (eol_seen & EOL_SEEN_CRLF)
24a73b0a
KH
6318 {
6319 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6320 eol_type = Qdos;
6321 }
6f197c07 6322 else if (eol_seen & EOL_SEEN_CR)
24a73b0a
KH
6323 {
6324 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6325 eol_type = Qmac;
6326 }
6327 return eol_type;
d46c5b12 6328}
4ed46869 6329
df7492f9
KH
6330/* Detect how a text specified in CODING is encoded. If a coding
6331 system is detected, update fields of CODING by the detected coding
6332 system. */
0a28aafb 6333
74ab6df5 6334static void
971de7fb 6335detect_coding (struct coding_system *coding)
d46c5b12 6336{
8f924df7 6337 const unsigned char *src, *src_end;
73cce38d 6338 int saved_mode = coding->mode;
d46c5b12 6339
df7492f9
KH
6340 coding->consumed = coding->consumed_char = 0;
6341 coding->produced = coding->produced_char = 0;
6342 coding_set_source (coding);
1c3478b0 6343
df7492f9 6344 src_end = coding->source + coding->src_bytes;
c0e16b14 6345 coding->head_ascii = 0;
1c3478b0 6346
df7492f9
KH
6347 /* If we have not yet decided the text encoding type, detect it
6348 now. */
6349 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
b73bfc1c 6350 {
df7492f9 6351 int c, i;
6cb21a4f 6352 struct coding_detection_info detect_info;
2f3cbb32 6353 int null_byte_found = 0, eight_bit_found = 0;
df7492f9 6354
6cb21a4f 6355 detect_info.checked = detect_info.found = detect_info.rejected = 0;
2f3cbb32 6356 for (src = coding->source; src < src_end; src++)
d46c5b12 6357 {
df7492f9 6358 c = *src;
6cb21a4f 6359 if (c & 0x80)
6cb21a4f 6360 {
2f3cbb32 6361 eight_bit_found = 1;
2f3cbb32
KH
6362 if (null_byte_found)
6363 break;
6364 }
6365 else if (c < 0x20)
6366 {
6367 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6368 && ! inhibit_iso_escape_detection
6369 && ! detect_info.checked)
6cb21a4f 6370 {
2f3cbb32
KH
6371 if (detect_coding_iso_2022 (coding, &detect_info))
6372 {
6373 /* We have scanned the whole data. */
6374 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
c0e16b14
KH
6375 {
6376 /* We didn't find an 8-bit code. We may
6377 have found a null-byte, but it's very
ce5b453a 6378 rare that a binary file conforms to
c0e16b14
KH
6379 ISO-2022. */
6380 src = src_end;
6381 coding->head_ascii = src - coding->source;
6382 }
6383 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
2f3cbb32
KH
6384 break;
6385 }
6386 }
97b1b294 6387 else if (! c && !inhibit_null_byte_detection)
2f3cbb32
KH
6388 {
6389 null_byte_found = 1;
6390 if (eight_bit_found)
6391 break;
6cb21a4f 6392 }
c006c0c8
KH
6393 if (! eight_bit_found)
6394 coding->head_ascii++;
6cb21a4f 6395 }
c006c0c8 6396 else if (! eight_bit_found)
c0e16b14 6397 coding->head_ascii++;
d46c5b12 6398 }
df7492f9 6399
2f3cbb32
KH
6400 if (null_byte_found || eight_bit_found
6401 || coding->head_ascii < coding->src_bytes
6cb21a4f 6402 || detect_info.found)
d46c5b12 6403 {
ff0dacd7
KH
6404 enum coding_category category;
6405 struct coding_system *this;
df7492f9 6406
6cb21a4f
KH
6407 if (coding->head_ascii == coding->src_bytes)
6408 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
6409 for (i = 0; i < coding_category_raw_text; i++)
6410 {
6411 category = coding_priorities[i];
6412 this = coding_categories + category;
6413 if (detect_info.found & (1 << category))
24a73b0a 6414 break;
6cb21a4f
KH
6415 }
6416 else
2f3cbb32
KH
6417 {
6418 if (null_byte_found)
ff0dacd7 6419 {
2f3cbb32
KH
6420 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6421 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
ff0dacd7 6422 }
2f3cbb32
KH
6423 for (i = 0; i < coding_category_raw_text; i++)
6424 {
6425 category = coding_priorities[i];
6426 this = coding_categories + category;
6427 if (this->id < 0)
6428 {
6429 /* No coding system of this category is defined. */
6430 detect_info.rejected |= (1 << category);
6431 }
6432 else if (category >= coding_category_raw_text)
6433 continue;
6434 else if (detect_info.checked & (1 << category))
6435 {
6436 if (detect_info.found & (1 << category))
6437 break;
6438 }
6439 else if ((*(this->detector)) (coding, &detect_info)
6440 && detect_info.found & (1 << category))
6441 {
6442 if (category == coding_category_utf_16_auto)
6443 {
6444 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6445 category = coding_category_utf_16_le;
6446 else
6447 category = coding_category_utf_16_be;
6448 }
6449 break;
6450 }
6451 }
2f3cbb32 6452 }
c0e16b14
KH
6453
6454 if (i < coding_category_raw_text)
6455 setup_coding_system (CODING_ID_NAME (this->id), coding);
6456 else if (null_byte_found)
6457 setup_coding_system (Qno_conversion, coding);
6458 else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6459 == CATEGORY_MASK_ANY)
6460 setup_coding_system (Qraw_text, coding);
6461 else if (detect_info.rejected)
6462 for (i = 0; i < coding_category_raw_text; i++)
6463 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6464 {
6465 this = coding_categories + coding_priorities[i];
6466 setup_coding_system (CODING_ID_NAME (this->id), coding);
6467 break;
6468 }
d46c5b12 6469 }
b73bfc1c 6470 }
a470d443
KH
6471 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6472 == coding_category_utf_8_auto)
6473 {
6474 Lisp_Object coding_systems;
6475 struct coding_detection_info detect_info;
6476
6477 coding_systems
6478 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6479 detect_info.found = detect_info.rejected = 0;
6480 coding->head_ascii = 0;
6481 if (CONSP (coding_systems)
6482 && detect_coding_utf_8 (coding, &detect_info))
6483 {
6484 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6485 setup_coding_system (XCAR (coding_systems), coding);
6486 else
6487 setup_coding_system (XCDR (coding_systems), coding);
6488 }
6489 }
24a73b0a
KH
6490 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6491 == coding_category_utf_16_auto)
b49a1807
KH
6492 {
6493 Lisp_Object coding_systems;
6494 struct coding_detection_info detect_info;
6495
6496 coding_systems
a470d443 6497 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
b49a1807 6498 detect_info.found = detect_info.rejected = 0;
a470d443 6499 coding->head_ascii = 0;
b49a1807 6500 if (CONSP (coding_systems)
24a73b0a 6501 && detect_coding_utf_16 (coding, &detect_info))
b49a1807
KH
6502 {
6503 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6504 setup_coding_system (XCAR (coding_systems), coding);
24a73b0a 6505 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
b49a1807
KH
6506 setup_coding_system (XCDR (coding_systems), coding);
6507 }
6508 }
73cce38d 6509 coding->mode = saved_mode;
4ed46869 6510}
4ed46869 6511
d46c5b12 6512
aaaf0b1e 6513static void
971de7fb 6514decode_eol (struct coding_system *coding)
aaaf0b1e 6515{
24a73b0a
KH
6516 Lisp_Object eol_type;
6517 unsigned char *p, *pbeg, *pend;
3ed051d4 6518
24a73b0a 6519 eol_type = CODING_ID_EOL_TYPE (coding->id);
0a9564cb 6520 if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
24a73b0a
KH
6521 return;
6522
6523 if (NILP (coding->dst_object))
6524 pbeg = coding->destination;
6525 else
6526 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6527 pend = pbeg + coding->produced;
6528
6529 if (VECTORP (eol_type))
aaaf0b1e 6530 {
df7492f9 6531 int eol_seen = EOL_SEEN_NONE;
4ed46869 6532
24a73b0a 6533 for (p = pbeg; p < pend; p++)
aaaf0b1e 6534 {
df7492f9
KH
6535 if (*p == '\n')
6536 eol_seen |= EOL_SEEN_LF;
6537 else if (*p == '\r')
aaaf0b1e 6538 {
df7492f9 6539 if (p + 1 < pend && *(p + 1) == '\n')
aaaf0b1e 6540 {
df7492f9
KH
6541 eol_seen |= EOL_SEEN_CRLF;
6542 p++;
aaaf0b1e 6543 }
aaaf0b1e 6544 else
df7492f9 6545 eol_seen |= EOL_SEEN_CR;
aaaf0b1e 6546 }
aaaf0b1e 6547 }
75f4f1ac
EZ
6548 /* Handle DOS-style EOLs in a file with stray ^M characters. */
6549 if ((eol_seen & EOL_SEEN_CRLF) != 0
6550 && (eol_seen & EOL_SEEN_CR) != 0
6551 && (eol_seen & EOL_SEEN_LF) == 0)
6552 eol_seen = EOL_SEEN_CRLF;
6553 else if (eol_seen != EOL_SEEN_NONE
24a73b0a
KH
6554 && eol_seen != EOL_SEEN_LF
6555 && eol_seen != EOL_SEEN_CRLF
6556 && eol_seen != EOL_SEEN_CR)
6557 eol_seen = EOL_SEEN_LF;
df7492f9 6558 if (eol_seen != EOL_SEEN_NONE)
24a73b0a 6559 eol_type = adjust_coding_eol_type (coding, eol_seen);
aaaf0b1e 6560 }
d46c5b12 6561
24a73b0a 6562 if (EQ (eol_type, Qmac))
27901516 6563 {
24a73b0a 6564 for (p = pbeg; p < pend; p++)
df7492f9
KH
6565 if (*p == '\r')
6566 *p = '\n';
4ed46869 6567 }
24a73b0a 6568 else if (EQ (eol_type, Qdos))
df7492f9 6569 {
d311d28c 6570 ptrdiff_t n = 0;
b73bfc1c 6571
24a73b0a
KH
6572 if (NILP (coding->dst_object))
6573 {
4347441b
KH
6574 /* Start deleting '\r' from the tail to minimize the memory
6575 movement. */
24a73b0a
KH
6576 for (p = pend - 2; p >= pbeg; p--)
6577 if (*p == '\r')
6578 {
72af86bd 6579 memmove (p, p + 1, pend-- - p - 1);
24a73b0a
KH
6580 n++;
6581 }
6582 }
6583 else
6584 {
d311d28c
PE
6585 ptrdiff_t pos_byte = coding->dst_pos_byte;
6586 ptrdiff_t pos = coding->dst_pos;
6587 ptrdiff_t pos_end = pos + coding->produced_char - 1;
4347441b
KH
6588
6589 while (pos < pos_end)
6590 {
6591 p = BYTE_POS_ADDR (pos_byte);
6592 if (*p == '\r' && p[1] == '\n')
6593 {
6594 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6595 n++;
6596 pos_end--;
6597 }
6598 pos++;
69b8522d
KH
6599 if (coding->dst_multibyte)
6600 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6601 else
6602 pos_byte++;
4347441b 6603 }
24a73b0a
KH
6604 }
6605 coding->produced -= n;
6606 coding->produced_char -= n;
aaaf0b1e 6607 }
4ed46869
KH
6608}
6609
7d64c6ad 6610
a6f87d34
KH
6611/* Return a translation table (or list of them) from coding system
6612 attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6613 decoding (ENCODEP is zero). */
7d64c6ad 6614
e6a54062 6615static Lisp_Object
971de7fb 6616get_translation_table (Lisp_Object attrs, int encodep, int *max_lookup)
7d64c6ad
KH
6617{
6618 Lisp_Object standard, translation_table;
09ee6fdd 6619 Lisp_Object val;
7d64c6ad 6620
4bed5909
CY
6621 if (NILP (Venable_character_translation))
6622 {
6623 if (max_lookup)
6624 *max_lookup = 0;
6625 return Qnil;
6626 }
7d64c6ad
KH
6627 if (encodep)
6628 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6629 standard = Vstandard_translation_table_for_encode;
6630 else
6631 translation_table = CODING_ATTR_DECODE_TBL (attrs),
6632 standard = Vstandard_translation_table_for_decode;
7d64c6ad 6633 if (NILP (translation_table))
09ee6fdd
KH
6634 translation_table = standard;
6635 else
a6f87d34 6636 {
09ee6fdd
KH
6637 if (SYMBOLP (translation_table))
6638 translation_table = Fget (translation_table, Qtranslation_table);
6639 else if (CONSP (translation_table))
6640 {
6641 translation_table = Fcopy_sequence (translation_table);
6642 for (val = translation_table; CONSP (val); val = XCDR (val))
6643 if (SYMBOLP (XCAR (val)))
6644 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6645 }
6646 if (CHAR_TABLE_P (standard))
6647 {
6648 if (CONSP (translation_table))
6649 translation_table = nconc2 (translation_table,
6650 Fcons (standard, Qnil));
6651 else
6652 translation_table = Fcons (translation_table,
6653 Fcons (standard, Qnil));
6654 }
a6f87d34 6655 }
2170c8f0
KH
6656
6657 if (max_lookup)
09ee6fdd 6658 {
2170c8f0
KH
6659 *max_lookup = 1;
6660 if (CHAR_TABLE_P (translation_table)
6661 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6662 {
6663 val = XCHAR_TABLE (translation_table)->extras[1];
6664 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6665 *max_lookup = XFASTINT (val);
6666 }
6667 else if (CONSP (translation_table))
6668 {
2735d060 6669 Lisp_Object tail;
09ee6fdd 6670
2170c8f0
KH
6671 for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6672 if (CHAR_TABLE_P (XCAR (tail))
6673 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6674 {
2735d060
PE
6675 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6676 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6677 *max_lookup = XFASTINT (tailval);
2170c8f0
KH
6678 }
6679 }
a6f87d34 6680 }
7d64c6ad
KH
6681 return translation_table;
6682}
6683
09ee6fdd
KH
/* Look up character C in TABLE, storing the outcome in C and TRANS.

   C and TRANS must be assignable variables: the macro writes to both,
   and it evaluates TABLE and C more than once.

   TRANS is first set to Qnil.  Then:
     - If TABLE is a char-table, TRANS becomes its entry for C.  When
       that entry is a character, C is replaced by it and TRANS is
       reset to Qnil.
     - If TABLE is a list, each element that is a char-table is tried
       in order.  A character entry replaces C (and the search goes on
       with the new C in the following tables); any other non-nil
       entry stops the search and is left in TRANS.
     - Otherwise nothing else happens.

   So on exit TRANS is either Qnil (C holds the final character) or a
   non-character entry that the caller has to interpret.  */
6684#define LOOKUP_TRANSLATION_TABLE(table, c, trans) \
6685 do { \
6686 trans = Qnil; \
6687 if (CHAR_TABLE_P (table)) \
6688 { \
6689 trans = CHAR_TABLE_REF (table, c); \
6690 if (CHARACTERP (trans)) \
6691 c = XFASTINT (trans), trans = Qnil; \
6692 } \
6693 else if (CONSP (table)) \
6694 { \
6695 Lisp_Object tail; \
6696 \
6697 for (tail = table; CONSP (tail); tail = XCDR (tail)) \
6698 if (CHAR_TABLE_P (XCAR (tail))) \
6699 { \
6700 trans = CHAR_TABLE_REF (XCAR (tail), c); \
6701 if (CHARACTERP (trans)) \
6702 c = XFASTINT (trans), trans = Qnil; \
6703 else if (! NILP (trans)) \
6704 break; \
6705 } \
6706 } \
e6a54062
KH
6707 } while (0)
6708
7d64c6ad 6709
e951386e
KH
6710/* Return a translation of character(s) at BUF according to TRANS.
6711 TRANS is TO-CHAR or ((FROM . TO) ...) where
6712 FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6713 The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6714 translation is found, and Qnil if not found..
6715 If BUF is too short to lookup characters in FROM, return Qt. */
6716
69a80ea3 6717static Lisp_Object
971de7fb 6718get_translation (Lisp_Object trans, int *buf, int *buf_end)
69a80ea3 6719{
e951386e
KH
6720
6721 if (INTEGERP (trans))
6722 return trans;
6723 for (; CONSP (trans); trans = XCDR (trans))
69a80ea3 6724 {
e951386e
KH
6725 Lisp_Object val = XCAR (trans);
6726 Lisp_Object from = XCAR (val);
2c6a9faa
PE
6727 ptrdiff_t len = ASIZE (from);
6728 ptrdiff_t i;
69a80ea3 6729
e951386e 6730 for (i = 0; i < len; i++)
69a80ea3 6731 {
e951386e
KH
6732 if (buf + i == buf_end)
6733 return Qt;
6734 if (XINT (AREF (from, i)) != buf[i])
6735 break;
69a80ea3 6736 }
e951386e
KH
6737 if (i == len)
6738 return val;
69a80ea3 6739 }
e951386e 6740 return Qnil;
69a80ea3
KH
6741}
6742
6743
/* Write the characters decoded so far to the destination of CODING,
   starting at CODING->destination + CODING->produced, and return the
   number of CODING->charbuf elements left unprocessed (0 when the
   source is not CODING->charbuf).

   When CODING->chars_at_source is zero, the input is the first
   CODING->charbuf_used elements of CODING->charbuf.  A non-negative
   element is a character; a negative element -N starts an annotation
   datum of N elements, which is skipped here.  Each character goes
   through LOOKUP_TRANSLATION_TABLE with TRANSLATION_TABLE and, if
   that leaves a non-nil entry, through get_translation, which may map
   a run of source characters to one or several characters.  If
   get_translation reports that more input is needed (Qt) and
   LAST_BLOCK is zero, processing stops there and the remaining
   elements are counted in the return value.

   Otherwise the input is the first CODING->consumed bytes at
   CODING->source.  They are converted byte by byte when
   CODING->src_multibyte and CODING->dst_multibyte differ, and copied
   unchanged when they agree.

   The destination is enlarged with alloc_destination as needed.  When
   source and destination are the same object, the end of the usable
   destination is the current source position.  On exit, if the
   destination is a buffer and at least one character was produced,
   insert_from_gap is called; CODING->produced and
   CODING->produced_char are advanced.  */
d46c5b12 6744static int
cf84bb53
JB
6745produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6746 int last_block)
4ed46869 6747{
df7492f9
KH
6748 unsigned char *dst = coding->destination + coding->produced;
6749 unsigned char *dst_end = coding->destination + coding->dst_bytes;
d311d28c
PE
6750 ptrdiff_t produced;
6751 ptrdiff_t produced_chars = 0;
69a80ea3 6752 int carryover = 0;
4ed46869 6753
df7492f9 6754 if (! coding->chars_at_source)
4ed46869 6755 {
119852e7 6756 /* Source characters are in coding->charbuf. */
fba4576f
AS
6757 int *buf = coding->charbuf;
6758 int *buf_end = buf + coding->charbuf_used;
4ed46869 6759
db274c7a
KH
6760 if (EQ (coding->src_object, coding->dst_object))
6761 {
6762 coding_set_source (coding);
6763 dst_end = ((unsigned char *) coding->source) + coding->consumed;
6764 }
4ed46869 6765
df7492f9 6766 while (buf < buf_end)
4ed46869 6767 {
69a80ea3 6768 int c = *buf, i;
bc4bc72a 6769
df7492f9
KH
6770 if (c >= 0)
6771 {
/* By default one source character yields one output character; a
   translation entry may change either count.  */
d311d28c 6772 ptrdiff_t from_nchars = 1, to_nchars = 1;
69a80ea3
KH
6773 Lisp_Object trans = Qnil;
6774
09ee6fdd 6775 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
e6a54062 6776 if (! NILP (trans))
69a80ea3 6777 {
e951386e
KH
6778 trans = get_translation (trans, buf, buf_end);
6779 if (INTEGERP (trans))
6780 c = XINT (trans);
6781 else if (CONSP (trans))
6782 {
6783 from_nchars = ASIZE (XCAR (trans));
6784 trans = XCDR (trans);
6785 if (INTEGERP (trans))
6786 c = XINT (trans);
6787 else
6788 {
6789 to_nchars = ASIZE (trans);
6790 c = XINT (AREF (trans, 0));
6791 }
6792 }
/* Qt from get_translation means the charbuf ended in the middle of a
   possible match; unless this is the last block, stop and leave the
   rest for the next call.  */
6793 else if (EQ (trans, Qt) && ! last_block)
69a80ea3 6794 break;
69a80ea3
KH
6795 }
6796
/* Make sure TO_NCHARS characters of up to MAX_MULTIBYTE_LENGTH bytes
   each fit in the destination, growing it otherwise.  */
5d009b3a 6797 if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
69a80ea3 6798 {
5d009b3a
PE
6799 if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
6800 / MAX_MULTIBYTE_LENGTH)
6801 < to_nchars)
6802 memory_full (SIZE_MAX);
69a80ea3
KH
6803 dst = alloc_destination (coding,
6804 buf_end - buf
6805 + MAX_MULTIBYTE_LENGTH * to_nchars,
6806 dst);
db274c7a
KH
6807 if (EQ (coding->src_object, coding->dst_object))
6808 {
6809 coding_set_source (coding);
e951386e
KH
6810 dst_end = (((unsigned char *) coding->source)
6811 + coding->consumed);
db274c7a
KH
6812 }
6813 else
6814 dst_end = coding->destination + coding->dst_bytes;
69a80ea3
KH
6815 }
6816
/* C already holds the first output character; the following ones, if
   any, come from the TO vector left in TRANS.  */
433f7f87 6817 for (i = 0; i < to_nchars; i++)
69a80ea3 6818 {
433f7f87
KH
6819 if (i > 0)
6820 c = XINT (AREF (trans, i));
69a80ea3
KH
6821 if (coding->dst_multibyte
6822 || ! CHAR_BYTE8_P (c))
db274c7a 6823 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
69a80ea3
KH
6824 else
6825 *dst++ = CHAR_TO_BYTE8 (c);
6826 }
6827 produced_chars += to_nchars;
e951386e 6828 buf += from_nchars;
d46c5b12 6829 }
df7492f9 6830 else
69a80ea3
KH
6831 /* This is an annotation datum. (-C) is the length. */
6832 buf += -c;
4ed46869 6833 }
/* Number of charbuf elements not processed by this call.  */
69a80ea3 6834 carryover = buf_end - buf;
4ed46869 6835 }
fa42c37f 6836 else
fa42c37f 6837 {
119852e7 6838 /* Source characters are at coding->source. */
8f924df7 6839 const unsigned char *src = coding->source;
119852e7 6840 const unsigned char *src_end = src + coding->consumed;
4ed46869 6841
db274c7a
KH
6842 if (EQ (coding->dst_object, coding->src_object))
6843 dst_end = (unsigned char *) src;
df7492f9 6844 if (coding->src_multibyte != coding->dst_multibyte)
fa42c37f 6845 {
df7492f9 6846 if (coding->src_multibyte)
fa42c37f 6847 {
/* src_multibyte is set and dst_multibyte is clear here.  MULTIBYTEP,
   CONSUMED_CHARS and the no_more_source label below are not used
   directly in this function; presumably ONE_MORE_BYTE relies on
   them -- confirm against the macro definition.  */
71c81426 6848 int multibytep = 1;
d311d28c 6849 ptrdiff_t consumed_chars = 0;
d46c5b12 6850
df7492f9
KH
6851 while (1)
6852 {
8f924df7 6853 const unsigned char *src_base = src;
df7492f9 6854 int c;
b73bfc1c 6855
df7492f9 6856 ONE_MORE_BYTE (c);
119852e7 6857 if (dst == dst_end)
df7492f9 6858 {
119852e7
KH
6859 if (EQ (coding->src_object, coding->dst_object))
6860 dst_end = (unsigned char *) src;
6861 if (dst == dst_end)
df7492f9 6862 {
/* alloc_destination and coding_set_source may change coding->source,
   so SRC is rebuilt from its saved offset afterwards.  */
d311d28c 6863 ptrdiff_t offset = src - coding->source;
119852e7
KH
6864
6865 dst = alloc_destination (coding, src_end - src + 1,
6866 dst);
6867 dst_end = coding->destination + coding->dst_bytes;
6868 coding_set_source (coding);
6869 src = coding->source + offset;
5c1ca13d 6870 src_end = coding->source + coding->consumed;
db274c7a
KH
6871 if (EQ (coding->src_object, coding->dst_object))
6872 dst_end = (unsigned char *) src;
df7492f9 6873 }
df7492f9
KH
6874 }
6875 *dst++ = c;
6876 produced_chars++;
6877 }
6878 no_more_source:
6879 ;
fa42c37f
KH
6880 }
6881 else
/* src_multibyte is clear and dst_multibyte is set here.  The checks
   against DST_END - 1 keep two bytes of room per source byte;
   presumably EMIT_ONE_BYTE may store up to two bytes -- confirm
   against the macro definition.  */
df7492f9
KH
6882 while (src < src_end)
6883 {
71c81426 6884 int multibytep = 1;
df7492f9 6885 int c = *src++;
b73bfc1c 6886
df7492f9
KH
6887 if (dst >= dst_end - 1)
6888 {
2c78b7e1 6889 if (EQ (coding->src_object, coding->dst_object))
8f924df7 6890 dst_end = (unsigned char *) src;
2c78b7e1
KH
6891 if (dst >= dst_end - 1)
6892 {
d311d28c
PE
6893 ptrdiff_t offset = src - coding->source;
6894 ptrdiff_t more_bytes;
119852e7 6895
db274c7a
KH
6896 if (EQ (coding->src_object, coding->dst_object))
6897 more_bytes = ((src_end - src) / 2) + 2;
6898 else
6899 more_bytes = src_end - src + 2;
6900 dst = alloc_destination (coding, more_bytes, dst);
2c78b7e1
KH
6901 dst_end = coding->destination + coding->dst_bytes;
6902 coding_set_source (coding);
119852e7 6903 src = coding->source + offset;
5c1ca13d 6904 src_end = coding->source + coding->consumed;
db274c7a
KH
6905 if (EQ (coding->src_object, coding->dst_object))
6906 dst_end = (unsigned char *) src;
2c78b7e1 6907 }
df7492f9
KH
6908 }
6909 EMIT_ONE_BYTE (c);
6910 }
d46c5b12 6911 }
df7492f9
KH
6912 else
6913 {
/* Source and destination agree on multibyteness: the consumed bytes
   are copied unchanged and the character count is taken from
   coding->consumed_char.  */
6914 if (!EQ (coding->src_object, coding->dst_object))
fa42c37f 6915 {
d311d28c 6916 ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
4ed46869 6917
df7492f9 6918 if (require > 0)
fa42c37f 6919 {
d311d28c 6920 ptrdiff_t offset = src - coding->source;
df7492f9
KH
6921
6922 dst = alloc_destination (coding, require, dst);
6923 coding_set_source (coding);
6924 src = coding->source + offset;
5c1ca13d 6925 src_end = coding->source + coding->consumed;
fa42c37f
KH
6926 }
6927 }
119852e7 6928 produced_chars = coding->consumed_char;
df7492f9 6929 while (src < src_end)
14daee73 6930 *dst++ = *src++;
fa42c37f
KH
6931 }
6932 }
6933
/* Bytes written by this call; coding->produced still holds the total
   from before it.  */
df7492f9 6934 produced = dst - (coding->destination + coding->produced);
284201e4 6935 if (BUFFERP (coding->dst_object) && produced_chars > 0)
df7492f9
KH
6936 insert_from_gap (produced_chars, produced);
6937 coding->produced += produced;
6938 coding->produced_char += produced_chars;
69a80ea3 6939 return carryover;
fa42c37f
KH
6940}
6941
ff0dacd7
KH
6942/* Compose text in CODING->object according to the annotation data at
6943 CHARBUF. CHARBUF is an array:
e951386e 6944 [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
df7492f9 6945 */
4ed46869 6946
55d4c1b2 6947static inline void
d311d28c 6948produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
4ed46869 6949{
df7492f9 6950 int len;
d311d28c 6951 ptrdiff_t to;
df7492f9 6952 enum composition_method method;
df7492f9 6953 Lisp_Object components;
fa42c37f 6954
e951386e 6955 len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
69a80ea3 6956 to = pos + charbuf[2];
e951386e 6957 method = (enum composition_method) (charbuf[4]);
d46c5b12 6958
df7492f9
KH
6959 if (method == COMPOSITION_RELATIVE)
6960 components = Qnil;
e951386e 6961 else
d46c5b12 6962 {
df7492f9 6963 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
e951386e 6964 int i, j;
b73bfc1c 6965
e951386e
KH
6966 if (method == COMPOSITION_WITH_RULE)
6967 len = charbuf[2] * 3 - 2;
6968 charbuf += MAX_ANNOTATION_LENGTH;
6969 /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6970 for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
9ffd559c 6971 {
e951386e
KH
6972 if (charbuf[i] >= 0)
6973 args[j] = make_number (charbuf[i]);
6974 else
6975 {
6976 i++;
6977 args[j] = make_number (charbuf[i] % 0x100);
6978 }
9ffd559c 6979 }
e951386e 6980 components = (i == j ? Fstring (j, args) : Fvector (j, args));
d46c5b12 6981 }
69a80ea3 6982 compose_text (pos, to, components, Qnil, coding->dst_object);
d46c5b12
KH
6983}
6984
d46c5b12 6985
ff0dacd7
KH
6986/* Put `charset' property on text in CODING->object according to
6987 the annotation data at CHARBUF. CHARBUF is an array:
69a80ea3 6988 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
ff0dacd7 6989 */
d46c5b12 6990
55d4c1b2 6991static inline void
d311d28c 6992produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
d46c5b12 6993{
d311d28c 6994 ptrdiff_t from = pos - charbuf[2];
69a80ea3 6995 struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
b73bfc1c 6996
69a80ea3 6997 Fput_text_property (make_number (from), make_number (pos),
ff0dacd7
KH
6998 Qcharset, CHARSET_NAME (charset),
6999 coding->dst_object);
d46c5b12
KH
7000}
7001
d46c5b12 7002
df7492f9
KH
/* Initial number of `int' elements requested for the work area.  */
#define CHARBUF_SIZE 0x4000

/* Give CODING a work area: set CODING->charbuf to stack memory
   obtained with alloca and CODING->charbuf_size to its length in
   elements.

   The request starts at CHARBUF_SIZE elements and is halved while
   alloca yields a null pointer and the size is still above 1024.
   NOTE(review): alloca normally has no failure return, so the first
   attempt is presumably the only one in practice -- confirm for the
   supported platforms.

   If no area was obtained, the result CODING_RESULT_INSUFFICIENT_MEM
   is recorded and the macro executes `return CODING->result;' in the
   CALLING function.  It can therefore only be used inside a function
   returning that type.  Because of alloca the area lives until that
   function returns.

   CODING is evaluated several times; every use is parenthesized so
   that any pointer expression may be passed.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int size = CHARBUF_SIZE;                                            \
                                                                        \
    (coding)->charbuf = NULL;                                           \
    while (size > 1024)                                                 \
      {                                                                 \
        (coding)->charbuf = (int *) alloca (sizeof (int) * size);       \
        if ((coding)->charbuf)                                          \
          break;                                                        \
        size >>= 1;                                                     \
      }                                                                 \
    if (! (coding)->charbuf)                                            \
      {                                                                 \
        record_conversion_result ((coding),                             \
                                  CODING_RESULT_INSUFFICIENT_MEM);      \
        return (coding)->result;                                        \
      }                                                                 \
    (coding)->charbuf_size = size;                                      \
  } while (0)
4ed46869 7024
d46c5b12
KH
7025
7026static void
d311d28c 7027produce_annotation (struct coding_system *coding, ptrdiff_t pos)
d46c5b12 7028{
df7492f9
KH
7029 int *charbuf = coding->charbuf;
7030 int *charbuf_end = charbuf + coding->charbuf_used;
d46c5b12 7031
ff0dacd7
KH
7032 if (NILP (coding->dst_object))
7033 return;
d46c5b12 7034
df7492f9 7035 while (charbuf < charbuf_end)
a84f1519 7036 {
df7492f9 7037 if (*charbuf >= 0)
e951386e 7038 pos++, charbuf++;
d46c5b12 7039 else
d46c5b12 7040 {
df7492f9 7041 int len = -*charbuf;
e951386e
KH
7042
7043 if (len > 2)
7044 switch (charbuf[1])
7045 {
7046 case CODING_ANNOTATE_COMPOSITION_MASK:
7047 produce_composition (coding, charbuf, pos);
7048 break;
7049 case CODING_ANNOTATE_CHARSET_MASK:
7050 produce_charset (coding, charbuf, pos);
7051 break;
7052 }
df7492f9 7053 charbuf += len;
d46c5b12 7054 }
a84f1519 7055 }
d46c5b12
KH
7056}
7057
df7492f9
KH
7058/* Decode the data at CODING->src_object into CODING->dst_object.
7059 CODING->src_object is a buffer, a string, or nil.
7060 CODING->dst_object is a buffer.
d46c5b12 7061
df7492f9
KH
7062 If CODING->src_object is a buffer, it must be the current buffer.
7063 In this case, if CODING->src_pos is positive, it is a position of
7064 the source text in the buffer, otherwise, the source text is in the
7065 gap area of the buffer, and CODING->src_pos specifies the offset of
7066 the text from GPT (which must be the same as PT). If this is the
7067 same buffer as CODING->dst_object, CODING->src_pos must be
7068 negative.
d46c5b12 7069
b6828792 7070 If CODING->src_object is a string, CODING->src_pos is an index to
df7492f9 7071 that string.
d46c5b12 7072
df7492f9
KH
7073 If CODING->src_object is nil, CODING->source must already point to
7074 the non-relocatable memory area. In this case, CODING->src_pos is
7075 an offset from CODING->source.
73be902c 7076
df7492f9
KH
7077 The decoded data is inserted at the current point of the buffer
7078 CODING->dst_object.
7079*/
d46c5b12 7080
/* Outline of the steps below:
     1. Move the gap out of the source range, and make the destination
        buffer (if any) current with its gap at point.
     2. Reset the progress counters and allocate the work area.
     3. Repeatedly run CODING->decoder and produce_chars, carrying
        charbuf elements that produce_chars left unprocessed over to
        the next round.
     4. Flush what is still carried over, then deal with source bytes
        the decoder did not consume.
     5. Convert end-of-line sequences with decode_eol if required.
   The return value is CODING->result.  While a destination buffer is
   being filled its undo list is set to Qt; it is restored at the end,
   followed by one record_insert call for the decoded text.  */
df7492f9 7081static int
971de7fb 7082decode_coding (struct coding_system *coding)
d46c5b12 7083{
df7492f9 7084 Lisp_Object attrs;
24a73b0a 7085 Lisp_Object undo_list;
7d64c6ad 7086 Lisp_Object translation_table;
d0396581 7087 struct ccl_spec cclspec;
69a80ea3
KH
7088 int carryover;
7089 int i;
d46c5b12 7090
/* If the source range within the buffer straddles the gap, move the
   gap to the start of the range.  */
df7492f9
KH
7091 if (BUFFERP (coding->src_object)
7092 && coding->src_pos > 0
7093 && coding->src_pos < GPT
7094 && coding->src_pos + coding->src_chars > GPT)
7095 move_gap_both (coding->src_pos, coding->src_pos_byte);
d46c5b12 7096
24a73b0a 7097 undo_list = Qt;
df7492f9 7098 if (BUFFERP (coding->dst_object))
1c3478b0 7099 {
df7492f9
KH
7100 if (current_buffer != XBUFFER (coding->dst_object))
7101 set_buffer_internal (XBUFFER (coding->dst_object));
7102 if (GPT != PT)
7103 move_gap_both (PT, PT_BYTE);
/* Save the undo list and replace it with Qt for the duration of the
   decoding; it is restored near the end of this function.  */
4b4deea2
TT
7104 undo_list = BVAR (current_buffer, undo_list);
7105 BVAR (current_buffer, undo_list) = Qt;
1c3478b0
KH
7106 }
7107
df7492f9
KH
7108 coding->consumed = coding->consumed_char = 0;
7109 coding->produced = coding->produced_char = 0;
7110 coding->chars_at_source = 0;
065e3595 7111 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 7112 coding->errors = 0;
1c3478b0 7113
/* NOTE(review): this macro contains a `return' for its failure case,
   which would leave this function without restoring the undo list
   saved above.  */
df7492f9
KH
7114 ALLOC_CONVERSION_WORK_AREA (coding);
7115
7116 attrs = CODING_ID_ATTRS (coding->id);
2170c8f0 7117 translation_table = get_translation_table (attrs, 0, NULL);
df7492f9 7118
69a80ea3 7119 carryover = 0;
d0396581
KH
7120 if (coding->decoder == decode_coding_ccl)
7121 {
7122 coding->spec.ccl = &cclspec;
7123 setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7124 }
df7492f9 7125 do
b73bfc1c 7126 {
/* Position at which the text produced by this round starts; used
   below for the annotations.  */
d311d28c 7127 ptrdiff_t pos = coding->dst_pos + coding->produced_char;
69a80ea3 7128
df7492f9
KH
7129 coding_set_source (coding);
7130 coding->annotated = 0;
/* The elements carried over from the previous round already sit at
   the front of the charbuf.  */
69a80ea3 7131 coding->charbuf_used = carryover;
df7492f9 7132 (*(coding->decoder)) (coding);
df7492f9 7133 coding_set_destination (coding);
69a80ea3 7134 carryover = produce_chars (coding, translation_table, 0);
df7492f9 7135 if (coding->annotated)
69a80ea3
KH
7136 produce_annotation (coding, pos);
/* Move the unprocessed tail of the charbuf to its front.  */
7137 for (i = 0; i < carryover; i++)
7138 coding->charbuf[i]
7139 = coding->charbuf[coding->charbuf_used - carryover + i];
d46c5b12 7140 }
/* Go on while the destination was too small, or while source bytes
   remain and the decoder reported success or invalid source.  */
d0396581
KH
7141 while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7142 || (coding->consumed < coding->src_bytes
7143 && (coding->result == CODING_RESULT_SUCCESS
7144 || coding->result == CODING_RESULT_INVALID_SRC)));
d46c5b12 7145
/* Flush what is still carried over; LAST_BLOCK is 1 so produce_chars
   does not wait for more input.  */
69a80ea3
KH
7146 if (carryover > 0)
7147 {
7148 coding_set_destination (coding);
7149 coding->charbuf_used = carryover;
7150 produce_chars (coding, translation_table, 1);
7151 }
7152
df7492f9
KH
7153 coding->carryover_bytes = 0;
7154 if (coding->consumed < coding->src_bytes)
d46c5b12 7155 {
df7492f9 7156 int nbytes = coding->src_bytes - coding->consumed;
8f924df7 7157 const unsigned char *src;
df7492f9
KH
7158
7159 coding_set_source (coding);
7160 coding_set_destination (coding);
7161 src = coding->source + coding->consumed;
7162
7163 if (coding->mode & CODING_MODE_LAST_BLOCK)
1c3478b0 7164 {
df7492f9
KH
7165 /* Flush out unprocessed data as binary chars. We are sure
7166 that the number of data is less than the size of
7167 coding->charbuf. */
065e3595 7168 coding->charbuf_used = 0;
b2dab6c8
JR
7169 coding->chars_at_source = 0;
7170
df7492f9 7171 while (nbytes-- > 0)
1c3478b0 7172 {
df7492f9 7173 int c = *src++;
98725083 7174
/* Bytes with the high bit set are mapped with BYTE8_TO_CHAR; other
   bytes are stored as they are.  */
1c91457d
KH
7175 if (c & 0x80)
7176 c = BYTE8_TO_CHAR (c);
7177 coding->charbuf[coding->charbuf_used++] = c;
1c3478b0 7178 }
/* No translation table is applied to these leftover bytes.  */
f6cbaf43 7179 produce_chars (coding, Qnil, 1);
d46c5b12 7180 }
d46c5b12 7181 else
df7492f9
KH
7182 {
7183 /* Record unprocessed bytes in coding->carryover. We are
7184 sure that the number of data is less than the size of
7185 coding->carryover. */
7186 unsigned char *p = coding->carryover;
7187
/* Defensive clamp: never copy more than coding->carryover holds.  */
f289d375
KH
7188 if (nbytes > sizeof coding->carryover)
7189 nbytes = sizeof coding->carryover;
df7492f9
KH
7190 coding->carryover_bytes = nbytes;
7191 while (nbytes-- > 0)
7192 *p++ = *src++;
1c3478b0 7193 }
df7492f9 7194 coding->consumed = coding->src_bytes;
b73bfc1c 7195 }
69f76525 7196
0a9564cb
EZ
7197 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7198 && !inhibit_eol_conversion)
4347441b 7199 decode_eol (coding);
24a73b0a
KH
7200 if (BUFFERP (coding->dst_object))
7201 {
/* Restore the undo list saved on entry and record the decoded text
   as one insertion.  */
4b4deea2 7202 BVAR (current_buffer, undo_list) = undo_list;
24a73b0a
KH
7203 record_insert (coding->dst_pos, coding->produced_char);
7204 }
73be902c 7205 return coding->result;
4ed46869
KH
7206}
7207
aaaf0b1e 7208
e1c23804 7209/* Extract an annotation datum from a composition starting at POS and
ff0dacd7
KH
7210 ending before LIMIT of CODING->src_object (buffer or string), store
7211 the data in BUF, set *STOP to a starting position of the next
7212 composition (if any) or to LIMIT, and return the address of the
7213 next element of BUF.
7214
7215 If such an annotation is not found, set *STOP to a starting
7216 position of a composition after POS (if any) or to LIMIT, and
7217 return BUF. */
7218
55d4c1b2 7219static inline int *
d311d28c 7220handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
cf84bb53 7221 struct coding_system *coding, int *buf,
d311d28c 7222 ptrdiff_t *stop)
aaaf0b1e 7223{
d311d28c 7224 ptrdiff_t start, end;
ff0dacd7 7225 Lisp_Object prop;
aaaf0b1e 7226
ff0dacd7
KH
7227 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7228 || end > limit)
7229 *stop = limit;
7230 else if (start > pos)
7231 *stop = start;
7232 else
aaaf0b1e 7233 {
ff0dacd7 7234 if (start == pos)
aaaf0b1e 7235 {
ff0dacd7
KH
7236 /* We found a composition. Store the corresponding
7237 annotation data in BUF. */
7238 int *head = buf;
7239 enum composition_method method = COMPOSITION_METHOD (prop);
7240 int nchars = COMPOSITION_LENGTH (prop);
7241
e951386e 7242 ADD_COMPOSITION_DATA (buf, nchars, 0, method);
ff0dacd7 7243 if (method != COMPOSITION_RELATIVE)
aaaf0b1e 7244 {
ff0dacd7 7245 Lisp_Object components;
2c6a9faa 7246 ptrdiff_t i, len, i_byte;
ff0dacd7
KH
7247
7248 components = COMPOSITION_COMPONENTS (prop);
7249 if (VECTORP (components))
aaaf0b1e 7250 {
77b37c05 7251 len = ASIZE (components);
ff0dacd7
KH
7252 for (i = 0; i < len; i++)
7253 *buf++ = XINT (AREF (components, i));
aaaf0b1e 7254 }
ff0dacd7 7255 else if (STRINGP (components))
aaaf0b1e 7256 {
8f924df7 7257 len = SCHARS (components);
ff0dacd7
KH
7258 i = i_byte = 0;
7259 while (i < len)
7260 {
7261 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7262 buf++;
7263 }
7264 }
7265 else if (INTEGERP (components))
7266 {
7267 len = 1;
7268 *buf++ = XINT (components);
7269 }
7270 else if (CONSP (components))
7271 {
7272 for (len = 0; CONSP (components);
7273 len++, components = XCDR (components))
7274 *buf++ = XINT (XCAR (components));
aaaf0b1e 7275 }
aaaf0b1e 7276 else
ff0dacd7
KH
7277 abort ();
7278 *head -= len;
aaaf0b1e 7279 }
aaaf0b1e 7280 }
ff0dacd7
KH
7281
7282 if (find_composition (end, limit, &start, &end, &prop,
7283 coding->src_object)
7284 && end <= limit)
7285 *stop = start;
7286 else
7287 *stop = limit;
aaaf0b1e 7288 }
ff0dacd7
KH
7289 return buf;
7290}
7291
7292
e1c23804 7293/* Extract an annotation datum from a text property `charset' at POS of
ff0dacd7
KH
7294 CODING->src_object (buffer of string), store the data in BUF, set
7295 *STOP to the position where the value of `charset' property changes
7296 (limiting by LIMIT), and return the address of the next element of
7297 BUF.
7298
7299 If the property value is nil, set *STOP to the position where the
7300 property value is non-nil (limiting by LIMIT), and return BUF. */
7301
55d4c1b2 7302static inline int *
d311d28c 7303handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
cf84bb53 7304 struct coding_system *coding, int *buf,
d311d28c 7305 ptrdiff_t *stop)
ff0dacd7
KH
7306{
7307 Lisp_Object val, next;
7308 int id;
7309
7310 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7311 if (! NILP (val) && CHARSETP (val))
7312 id = XINT (CHARSET_SYMBOL_ID (val));
7313 else
7314 id = -1;
69a80ea3 7315 ADD_CHARSET_DATA (buf, 0, id);
ff0dacd7
KH
7316 next = Fnext_single_property_change (make_number (pos), Qcharset,
7317 coding->src_object,
7318 make_number (limit));
7319 *stop = XINT (next);
7320 return buf;
7321}
7322
7323
df7492f9 7324static void
cf84bb53
JB
7325consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7326 int max_lookup)
df7492f9
KH
7327{
7328 int *buf = coding->charbuf;
ff0dacd7 7329 int *buf_end = coding->charbuf + coding->charbuf_size;
7c78e542 7330 const unsigned char *src = coding->source + coding->consumed;
4776e638 7331 const unsigned char *src_end = coding->source + coding->src_bytes;
d311d28c
PE
7332 ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7333 ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
df7492f9
KH
7334 int multibytep = coding->src_multibyte;
7335 Lisp_Object eol_type;
7336 int c;
d311d28c 7337 ptrdiff_t stop, stop_composition, stop_charset;
09ee6fdd 7338 int *lookup_buf = NULL;
433f7f87
KH
7339
7340 if (! NILP (translation_table))
09ee6fdd 7341 lookup_buf = alloca (sizeof (int) * max_lookup);
88993dfd 7342
0a9564cb 7343 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
df7492f9
KH
7344 if (VECTORP (eol_type))
7345 eol_type = Qunix;
88993dfd 7346
df7492f9
KH
7347 /* Note: composition handling is not yet implemented. */
7348 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
ec6d2bb8 7349
0b5670c9
KH
7350 if (NILP (coding->src_object))
7351 stop = stop_composition = stop_charset = end_pos;
ff0dacd7 7352 else
0b5670c9
KH
7353 {
7354 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7355 stop = stop_composition = pos;
7356 else
7357 stop = stop_composition = end_pos;
7358 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7359 stop = stop_charset = pos;
7360 else
7361 stop_charset = end_pos;
7362 }
ec6d2bb8 7363
24a73b0a 7364 /* Compensate for CRLF and conversion. */
ff0dacd7 7365 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
df7492f9 7366 while (buf < buf_end)
aaaf0b1e 7367 {
433f7f87
KH
7368 Lisp_Object trans;
7369
df7492f9 7370 if (pos == stop)
ec6d2bb8 7371 {
df7492f9
KH
7372 if (pos == end_pos)
7373 break;
ff0dacd7
KH
7374 if (pos == stop_composition)
7375 buf = handle_composition_annotation (pos, end_pos, coding,
7376 buf, &stop_composition);
7377 if (pos == stop_charset)
7378 buf = handle_charset_annotation (pos, end_pos, coding,
7379 buf, &stop_charset);
7380 stop = (stop_composition < stop_charset
7381 ? stop_composition : stop_charset);
df7492f9
KH
7382 }
7383
7384 if (! multibytep)
4776e638 7385 {
d311d28c 7386 int bytes;
aaaf0b1e 7387
4d1e6632
KH
7388 if (coding->encoder == encode_coding_raw_text
7389 || coding->encoder == encode_coding_ccl)
ea29edf2
KH
7390 c = *src++, pos++;
7391 else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
db274c7a 7392 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
4776e638 7393 else
f03caae0 7394 c = BYTE8_TO_CHAR (*src), src++, pos++;
4776e638 7395 }
df7492f9 7396 else
db274c7a 7397 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
df7492f9
KH
7398 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7399 c = '\n';
7400 if (! EQ (eol_type, Qunix))
aaaf0b1e 7401 {
df7492f9 7402 if (c == '\n')
aaaf0b1e 7403 {
df7492f9
KH
7404 if (EQ (eol_type, Qdos))
7405 *buf++ = '\r';
7406 else
7407 c = '\r';
aaaf0b1e
KH
7408 }
7409 }
433f7f87 7410
e6a54062 7411 trans = Qnil;
09ee6fdd 7412 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
e6a54062 7413 if (NILP (trans))
433f7f87
KH
7414 *buf++ = c;
7415 else
7416 {
2c6a9faa 7417 ptrdiff_t from_nchars = 1, to_nchars = 1;
433f7f87
KH
7418 int *lookup_buf_end;
7419 const unsigned char *p = src;
7420 int i;
7421
7422 lookup_buf[0] = c;
7423 for (i = 1; i < max_lookup && p < src_end; i++)
7424 lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7425 lookup_buf_end = lookup_buf + i;
e951386e
KH
7426 trans = get_translation (trans, lookup_buf, lookup_buf_end);
7427 if (INTEGERP (trans))
7428 c = XINT (trans);
7429 else if (CONSP (trans))
7430 {
7431 from_nchars = ASIZE (XCAR (trans));
7432 trans = XCDR (trans);
7433 if (INTEGERP (trans))
7434 c = XINT (trans);
7435 else
7436 {
7437 to_nchars = ASIZE (trans);
2c6a9faa 7438 if (buf_end - buf < to_nchars)
e951386e
KH
7439 break;
7440 c = XINT (AREF (trans, 0));
7441 }
7442 }
7443 else
433f7f87 7444 break;
e951386e 7445 *buf++ = c;
433f7f87
KH
7446 for (i = 1; i < to_nchars; i++)
7447 *buf++ = XINT (AREF (trans, i));
7448 for (i = 1; i < from_nchars; i++, pos++)
7449 src += MULTIBYTE_LENGTH_NO_CHECK (src);
7450 }
aaaf0b1e 7451 }
ec6d2bb8 7452
df7492f9
KH
7453 coding->consumed = src - coding->source;
7454 coding->consumed_char = pos - coding->src_pos;
7455 coding->charbuf_used = buf - coding->charbuf;
7456 coding->chars_at_source = 0;
aaaf0b1e
KH
7457}
7458
4ed46869 7459
df7492f9
KH
7460/* Encode the text at CODING->src_object into CODING->dst_object.
7461 CODING->src_object is a buffer or a string.
7462 CODING->dst_object is a buffer or nil.
7463
7464 If CODING->src_object is a buffer, it must be the current buffer.
7465 In this case, if CODING->src_pos is positive, it is a position of
7466 the source text in the buffer, otherwise. the source text is in the
7467 gap area of the buffer, and coding->src_pos specifies the offset of
7468 the text from GPT (which must be the same as PT). If this is the
7469 same buffer as CODING->dst_object, CODING->src_pos must be
7470 negative and CODING should not have `pre-write-conversion'.
7471
7472 If CODING->src_object is a string, CODING should not have
7473 `pre-write-conversion'.
7474
7475 If CODING->dst_object is a buffer, the encoded data is inserted at
7476 the current point of that buffer.
7477
7478 If CODING->dst_object is nil, the encoded data is placed at the
7479 memory area specified by CODING->destination. */
7480
7481static int
971de7fb 7482encode_coding (struct coding_system *coding)
4ed46869 7483{
df7492f9 7484 Lisp_Object attrs;
7d64c6ad 7485 Lisp_Object translation_table;
09ee6fdd 7486 int max_lookup;
fb608df3 7487 struct ccl_spec cclspec;
9861e777 7488
df7492f9 7489 attrs = CODING_ID_ATTRS (coding->id);
ea29edf2
KH
7490 if (coding->encoder == encode_coding_raw_text)
7491 translation_table = Qnil, max_lookup = 0;
7492 else
7493 translation_table = get_translation_table (attrs, 1, &max_lookup);
4ed46869 7494
df7492f9 7495 if (BUFFERP (coding->dst_object))
8844fa83 7496 {
df7492f9
KH
7497 set_buffer_internal (XBUFFER (coding->dst_object));
7498 coding->dst_multibyte
4b4deea2 7499 = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
8844fa83 7500 }
4ed46869 7501
b73bfc1c 7502 coding->consumed = coding->consumed_char = 0;
df7492f9 7503 coding->produced = coding->produced_char = 0;
065e3595 7504 record_conversion_result (coding, CODING_RESULT_SUCCESS);
b73bfc1c 7505 coding->errors = 0;
b73bfc1c 7506
df7492f9 7507 ALLOC_CONVERSION_WORK_AREA (coding);
4ed46869 7508
fb608df3
KH
7509 if (coding->encoder == encode_coding_ccl)
7510 {
7511 coding->spec.ccl = &cclspec;
7512 setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7513 }
df7492f9
KH
7514 do {
7515 coding_set_source (coding);
09ee6fdd 7516 consume_chars (coding, translation_table, max_lookup);
df7492f9
KH
7517 coding_set_destination (coding);
7518 (*(coding->encoder)) (coding);
7519 } while (coding->consumed_char < coding->src_chars);
7520
284201e4 7521 if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
df7492f9
KH
7522 insert_from_gap (coding->produced_char, coding->produced);
7523
7524 return (coding->result);
ec6d2bb8
KH
7525}
7526
fb88bf2d 7527
24a73b0a
KH
7528/* Name (or base name) of work buffer for code conversion. */
7529static Lisp_Object Vcode_conversion_workbuf_name;
d46c5b12 7530
24a73b0a
KH
7531/* A working buffer used by the top level conversion. Once it is
7532 created, it is never destroyed. It has the name
7533 Vcode_conversion_workbuf_name. The other working buffers are
7534 destroyed after the use is finished, and their names are modified
7535 versions of Vcode_conversion_workbuf_name. */
7536static Lisp_Object Vcode_conversion_reused_workbuf;
b73bfc1c 7537
24a73b0a
KH
7538/* 1 iff Vcode_conversion_reused_workbuf is already in use. */
7539static int reused_workbuf_in_use;
4ed46869 7540
24a73b0a 7541
ad1746f5 7542/* Return a working buffer of code conversion. MULTIBYTE specifies the
24a73b0a 7543 multibyteness of returning buffer. */
b73bfc1c 7544
f6cbaf43 7545static Lisp_Object
971de7fb 7546make_conversion_work_buffer (int multibyte)
df7492f9 7547{
24a73b0a
KH
7548 Lisp_Object name, workbuf;
7549 struct buffer *current;
4ed46869 7550
24a73b0a 7551 if (reused_workbuf_in_use++)
065e3595
KH
7552 {
7553 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7554 workbuf = Fget_buffer_create (name);
7555 }
df7492f9 7556 else
065e3595 7557 {
159bd5a2 7558 if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
a993c7a1
KH
7559 Vcode_conversion_reused_workbuf
7560 = Fget_buffer_create (Vcode_conversion_workbuf_name);
7561 workbuf = Vcode_conversion_reused_workbuf;
065e3595 7562 }
24a73b0a
KH
7563 current = current_buffer;
7564 set_buffer_internal (XBUFFER (workbuf));
df36ff1f
CY
7565 /* We can't allow modification hooks to run in the work buffer. For
7566 instance, directory_files_internal assumes that file decoding
7567 doesn't compile new regexps. */
7568 Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
3ed051d4 7569 Ferase_buffer ();
4b4deea2
TT
7570 BVAR (current_buffer, undo_list) = Qt;
7571 BVAR (current_buffer, enable_multibyte_characters) = multibyte ? Qt : Qnil;
df7492f9 7572 set_buffer_internal (current);
24a73b0a 7573 return workbuf;
df7492f9 7574}
d46c5b12 7575
24a73b0a 7576
4776e638 7577static Lisp_Object
971de7fb 7578code_conversion_restore (Lisp_Object arg)
4776e638 7579{
24a73b0a 7580 Lisp_Object current, workbuf;
948bdcf3 7581 struct gcpro gcpro1;
24a73b0a 7582
948bdcf3 7583 GCPRO1 (arg);
24a73b0a
KH
7584 current = XCAR (arg);
7585 workbuf = XCDR (arg);
7586 if (! NILP (workbuf))
7587 {
7588 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7589 reused_workbuf_in_use = 0;
7590 else if (! NILP (Fbuffer_live_p (workbuf)))
7591 Fkill_buffer (workbuf);
7592 }
7593 set_buffer_internal (XBUFFER (current));
948bdcf3 7594 UNGCPRO;
4776e638
KH
7595 return Qnil;
7596}
b73bfc1c 7597
24a73b0a 7598Lisp_Object
971de7fb 7599code_conversion_save (int with_work_buf, int multibyte)
df7492f9 7600{
24a73b0a 7601 Lisp_Object workbuf = Qnil;
b73bfc1c 7602
4776e638 7603 if (with_work_buf)
24a73b0a
KH
7604 workbuf = make_conversion_work_buffer (multibyte);
7605 record_unwind_protect (code_conversion_restore,
7606 Fcons (Fcurrent_buffer (), workbuf));
4776e638 7607 return workbuf;
df7492f9 7608}
d46c5b12 7609
df7492f9 7610int
cf84bb53 7611decode_coding_gap (struct coding_system *coding,
d311d28c 7612 ptrdiff_t chars, ptrdiff_t bytes)
df7492f9 7613{
d311d28c 7614 ptrdiff_t count = SPECPDL_INDEX ();
5e5c78be 7615 Lisp_Object attrs;
fb88bf2d 7616
24a73b0a 7617 code_conversion_save (0, 0);
ec6d2bb8 7618
24a73b0a 7619 coding->src_object = Fcurrent_buffer ();
df7492f9
KH
7620 coding->src_chars = chars;
7621 coding->src_bytes = bytes;
7622 coding->src_pos = -chars;
7623 coding->src_pos_byte = -bytes;
7624 coding->src_multibyte = chars < bytes;
24a73b0a 7625 coding->dst_object = coding->src_object;
df7492f9
KH
7626 coding->dst_pos = PT;
7627 coding->dst_pos_byte = PT_BYTE;
4b4deea2 7628 coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
4ed46869 7629
df7492f9
KH
7630 if (CODING_REQUIRE_DETECTION (coding))
7631 detect_coding (coding);
8f924df7 7632
9286b333 7633 coding->mode |= CODING_MODE_LAST_BLOCK;
287c57d7 7634 current_buffer->text->inhibit_shrinking = 1;
df7492f9 7635 decode_coding (coding);
287c57d7 7636 current_buffer->text->inhibit_shrinking = 0;
d46c5b12 7637
5e5c78be
KH
7638 attrs = CODING_ID_ATTRS (coding->id);
7639 if (! NILP (CODING_ATTR_POST_READ (attrs)))
b73bfc1c 7640 {
d311d28c 7641 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
5e5c78be
KH
7642 Lisp_Object val;
7643
7644 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
5e5c78be
KH
7645 val = call1 (CODING_ATTR_POST_READ (attrs),
7646 make_number (coding->produced_char));
5e5c78be
KH
7647 CHECK_NATNUM (val);
7648 coding->produced_char += Z - prev_Z;
7649 coding->produced += Z_BYTE - prev_Z_BYTE;
b73bfc1c 7650 }
4ed46869 7651
df7492f9 7652 unbind_to (count, Qnil);
b73bfc1c
KH
7653 return coding->result;
7654}
52d41803 7655
d46c5b12 7656
df7492f9
KH
7657/* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7658 SRC_OBJECT into DST_OBJECT by coding context CODING.
b73bfc1c 7659
df7492f9 7660 SRC_OBJECT is a buffer, a string, or Qnil.
b73bfc1c 7661
df7492f9
KH
7662 If it is a buffer, the text is at point of the buffer. FROM and TO
7663 are positions in the buffer.
b73bfc1c 7664
df7492f9
KH
7665 If it is a string, the text is at the beginning of the string.
7666 FROM and TO are indices to the string.
4ed46869 7667
df7492f9
KH
7668 If it is nil, the text is at coding->source. FROM and TO are
7669 indices to coding->source.
bb10be8b 7670
df7492f9 7671 DST_OBJECT is a buffer, Qt, or Qnil.
4ed46869 7672
df7492f9
KH
7673 If it is a buffer, the decoded text is inserted at point of the
7674 buffer. If the buffer is the same as SRC_OBJECT, the source text
7675 is deleted.
4ed46869 7676
df7492f9
KH
7677 If it is Qt, a string is made from the decoded text, and
7678 set in CODING->dst_object.
d46c5b12 7679
df7492f9 7680 If it is Qnil, the decoded text is stored at CODING->destination.
2cb26057 7681 The caller must allocate CODING->dst_bytes bytes at
df7492f9
KH
7682 CODING->destination by xmalloc. If the decoded text is longer than
7683 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7684 */
d46c5b12 7685
df7492f9 7686void
cf84bb53
JB
7687decode_coding_object (struct coding_system *coding,
7688 Lisp_Object src_object,
d311d28c
PE
7689 ptrdiff_t from, ptrdiff_t from_byte,
7690 ptrdiff_t to, ptrdiff_t to_byte,
cf84bb53 7691 Lisp_Object dst_object)
d46c5b12 7692{
d311d28c 7693 ptrdiff_t count = SPECPDL_INDEX ();
c4a63b12 7694 unsigned char *destination IF_LINT (= NULL);
d311d28c
PE
7695 ptrdiff_t dst_bytes IF_LINT (= 0);
7696 ptrdiff_t chars = to - from;
7697 ptrdiff_t bytes = to_byte - from_byte;
df7492f9 7698 Lisp_Object attrs;
c4a63b12 7699 int saved_pt = -1, saved_pt_byte IF_LINT (= 0);
64cedb0c 7700 int need_marker_adjustment = 0;
b3bfad50 7701 Lisp_Object old_deactivate_mark;
d46c5b12 7702
b3bfad50 7703 old_deactivate_mark = Vdeactivate_mark;
93dec019 7704
df7492f9 7705 if (NILP (dst_object))
d46c5b12 7706 {
df7492f9
KH
7707 destination = coding->destination;
7708 dst_bytes = coding->dst_bytes;
d46c5b12 7709 }
93dec019 7710
df7492f9
KH
7711 coding->src_object = src_object;
7712 coding->src_chars = chars;
7713 coding->src_bytes = bytes;
7714 coding->src_multibyte = chars < bytes;
70ad9fc4 7715
df7492f9 7716 if (STRINGP (src_object))
d46c5b12 7717 {
df7492f9
KH
7718 coding->src_pos = from;
7719 coding->src_pos_byte = from_byte;
d46c5b12 7720 }
df7492f9 7721 else if (BUFFERP (src_object))
88993dfd 7722 {
df7492f9
KH
7723 set_buffer_internal (XBUFFER (src_object));
7724 if (from != GPT)
7725 move_gap_both (from, from_byte);
7726 if (EQ (src_object, dst_object))
fb88bf2d 7727 {
64cedb0c
KH
7728 struct Lisp_Marker *tail;
7729
7730 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7731 {
7732 tail->need_adjustment
7733 = tail->charpos == (tail->insertion_type ? from : to);
7734 need_marker_adjustment |= tail->need_adjustment;
7735 }
4776e638 7736 saved_pt = PT, saved_pt_byte = PT_BYTE;
df7492f9 7737 TEMP_SET_PT_BOTH (from, from_byte);
f4a3cc44 7738 current_buffer->text->inhibit_shrinking = 1;
df7492f9
KH
7739 del_range_both (from, from_byte, to, to_byte, 1);
7740 coding->src_pos = -chars;
7741 coding->src_pos_byte = -bytes;
fb88bf2d 7742 }
df7492f9 7743 else
fb88bf2d 7744 {
df7492f9
KH
7745 coding->src_pos = from;
7746 coding->src_pos_byte = from_byte;
fb88bf2d 7747 }
88993dfd
KH
7748 }
7749
df7492f9
KH
7750 if (CODING_REQUIRE_DETECTION (coding))
7751 detect_coding (coding);
7752 attrs = CODING_ID_ATTRS (coding->id);
d46c5b12 7753
2cb26057
KH
7754 if (EQ (dst_object, Qt)
7755 || (! NILP (CODING_ATTR_POST_READ (attrs))
7756 && NILP (dst_object)))
b73bfc1c 7757 {
a1567c45
SM
7758 coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7759 coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
df7492f9
KH
7760 coding->dst_pos = BEG;
7761 coding->dst_pos_byte = BEG_BYTE;
b73bfc1c 7762 }
df7492f9 7763 else if (BUFFERP (dst_object))
d46c5b12 7764 {
24a73b0a 7765 code_conversion_save (0, 0);
df7492f9
KH
7766 coding->dst_object = dst_object;
7767 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7768 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7769 coding->dst_multibyte
4b4deea2 7770 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
d46c5b12
KH
7771 }
7772 else
7773 {
24a73b0a 7774 code_conversion_save (0, 0);
df7492f9 7775 coding->dst_object = Qnil;
0154725e
SM
7776 /* Most callers presume this will return a multibyte result, and they
7777 won't use `binary' or `raw-text' anyway, so let's not worry about
7778 CODING_FOR_UNIBYTE. */
bb555731 7779 coding->dst_multibyte = 1;
d46c5b12
KH
7780 }
7781
df7492f9 7782 decode_coding (coding);
fa46990e 7783
df7492f9
KH
7784 if (BUFFERP (coding->dst_object))
7785 set_buffer_internal (XBUFFER (coding->dst_object));
d46c5b12 7786
df7492f9 7787 if (! NILP (CODING_ATTR_POST_READ (attrs)))
d46c5b12 7788 {
b3bfad50 7789 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
d311d28c 7790 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
2b4f9037 7791 Lisp_Object val;
d46c5b12 7792
c0cc7f7f 7793 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
b3bfad50
KH
7794 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7795 old_deactivate_mark);
d4850d67
KH
7796 val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7797 make_number (coding->produced_char));
df7492f9
KH
7798 UNGCPRO;
7799 CHECK_NATNUM (val);
7800 coding->produced_char += Z - prev_Z;
7801 coding->produced += Z_BYTE - prev_Z_BYTE;
d46c5b12 7802 }
de79a6a5 7803
df7492f9 7804 if (EQ (dst_object, Qt))
ec6d2bb8 7805 {
df7492f9
KH
7806 coding->dst_object = Fbuffer_string ();
7807 }
7808 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7809 {
7810 set_buffer_internal (XBUFFER (coding->dst_object));
7811 if (dst_bytes < coding->produced)
7812 {
b3bfad50 7813 destination = xrealloc (destination, coding->produced);
df7492f9
KH
7814 if (! destination)
7815 {
065e3595 7816 record_conversion_result (coding,
ebaf11b6 7817 CODING_RESULT_INSUFFICIENT_MEM);
df7492f9
KH
7818 unbind_to (count, Qnil);
7819 return;
7820 }
7821 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7822 move_gap_both (BEGV, BEGV_BYTE);
72af86bd 7823 memcpy (destination, BEGV_ADDR, coding->produced);
df7492f9 7824 coding->destination = destination;
d46c5b12 7825 }
ec6d2bb8 7826 }
b73bfc1c 7827
4776e638
KH
7828 if (saved_pt >= 0)
7829 {
7830 /* This is the case of:
7831 (BUFFERP (src_object) && EQ (src_object, dst_object))
7832 As we have moved PT while replacing the original buffer
7833 contents, we must recover it now. */
7834 set_buffer_internal (XBUFFER (src_object));
f4a3cc44 7835 current_buffer->text->inhibit_shrinking = 0;
4776e638
KH
7836 if (saved_pt < from)
7837 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7838 else if (saved_pt < from + chars)
7839 TEMP_SET_PT_BOTH (from, from_byte);
4b4deea2 7840 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
4776e638
KH
7841 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7842 saved_pt_byte + (coding->produced - bytes));
7843 else
7844 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7845 saved_pt_byte + (coding->produced - bytes));
64cedb0c
KH
7846
7847 if (need_marker_adjustment)
7848 {
7849 struct Lisp_Marker *tail;
7850
7851 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7852 if (tail->need_adjustment)
7853 {
7854 tail->need_adjustment = 0;
7855 if (tail->insertion_type)
7856 {
7857 tail->bytepos = from_byte;
7858 tail->charpos = from;
7859 }
7860 else
7861 {
7862 tail->bytepos = from_byte + coding->produced;
7863 tail->charpos
4b4deea2 7864 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
64cedb0c
KH
7865 ? tail->bytepos : from + coding->produced_char);
7866 }
7867 }
7868 }
d46c5b12 7869 }
4776e638 7870
b3bfad50 7871 Vdeactivate_mark = old_deactivate_mark;
065e3595 7872 unbind_to (count, coding->dst_object);
d46c5b12
KH
7873}
7874
d46c5b12 7875
df7492f9 7876void
cf84bb53
JB
7877encode_coding_object (struct coding_system *coding,
7878 Lisp_Object src_object,
d311d28c
PE
7879 ptrdiff_t from, ptrdiff_t from_byte,
7880 ptrdiff_t to, ptrdiff_t to_byte,
cf84bb53 7881 Lisp_Object dst_object)
d46c5b12 7882{
d311d28c
PE
7883 ptrdiff_t count = SPECPDL_INDEX ();
7884 ptrdiff_t chars = to - from;
7885 ptrdiff_t bytes = to_byte - from_byte;
df7492f9 7886 Lisp_Object attrs;
c4a63b12 7887 int saved_pt = -1, saved_pt_byte IF_LINT (= 0);
64cedb0c 7888 int need_marker_adjustment = 0;
c02d943b 7889 int kill_src_buffer = 0;
b3bfad50 7890 Lisp_Object old_deactivate_mark;
df7492f9 7891
b3bfad50 7892 old_deactivate_mark = Vdeactivate_mark;
df7492f9
KH
7893
7894 coding->src_object = src_object;
7895 coding->src_chars = chars;
7896 coding->src_bytes = bytes;
7897 coding->src_multibyte = chars < bytes;
7898
7899 attrs = CODING_ID_ATTRS (coding->id);
7900
64cedb0c
KH
7901 if (EQ (src_object, dst_object))
7902 {
7903 struct Lisp_Marker *tail;
7904
7905 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7906 {
7907 tail->need_adjustment
7908 = tail->charpos == (tail->insertion_type ? from : to);
7909 need_marker_adjustment |= tail->need_adjustment;
7910 }
7911 }
7912
df7492f9 7913 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6bac5b12 7914 {
24a73b0a 7915 coding->src_object = code_conversion_save (1, coding->src_multibyte);
df7492f9
KH
7916 set_buffer_internal (XBUFFER (coding->src_object));
7917 if (STRINGP (src_object))
7918 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7919 else if (BUFFERP (src_object))
7920 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7921 else
b68864e5 7922 insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
d46c5b12 7923
df7492f9
KH
7924 if (EQ (src_object, dst_object))
7925 {
7926 set_buffer_internal (XBUFFER (src_object));
4776e638 7927 saved_pt = PT, saved_pt_byte = PT_BYTE;
df7492f9
KH
7928 del_range_both (from, from_byte, to, to_byte, 1);
7929 set_buffer_internal (XBUFFER (coding->src_object));
7930 }
7931
d4850d67
KH
7932 {
7933 Lisp_Object args[3];
b3bfad50 7934 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
d4850d67 7935
b3bfad50
KH
7936 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7937 old_deactivate_mark);
d4850d67
KH
7938 args[0] = CODING_ATTR_PRE_WRITE (attrs);
7939 args[1] = make_number (BEG);
7940 args[2] = make_number (Z);
7941 safe_call (3, args);
b3bfad50 7942 UNGCPRO;
d4850d67 7943 }
c02d943b
KH
7944 if (XBUFFER (coding->src_object) != current_buffer)
7945 kill_src_buffer = 1;
ac87bbef 7946 coding->src_object = Fcurrent_buffer ();
df7492f9
KH
7947 if (BEG != GPT)
7948 move_gap_both (BEG, BEG_BYTE);
7949 coding->src_chars = Z - BEG;
7950 coding->src_bytes = Z_BYTE - BEG_BYTE;
7951 coding->src_pos = BEG;
7952 coding->src_pos_byte = BEG_BYTE;
7953 coding->src_multibyte = Z < Z_BYTE;
7954 }
7955 else if (STRINGP (src_object))
d46c5b12 7956 {
24a73b0a 7957 code_conversion_save (0, 0);
df7492f9
KH
7958 coding->src_pos = from;
7959 coding->src_pos_byte = from_byte;
b73bfc1c 7960 }
df7492f9 7961 else if (BUFFERP (src_object))
b73bfc1c 7962 {
24a73b0a 7963 code_conversion_save (0, 0);
df7492f9 7964 set_buffer_internal (XBUFFER (src_object));
df7492f9 7965 if (EQ (src_object, dst_object))
d46c5b12 7966 {
4776e638 7967 saved_pt = PT, saved_pt_byte = PT_BYTE;
ff0dacd7
KH
7968 coding->src_object = del_range_1 (from, to, 1, 1);
7969 coding->src_pos = 0;
7970 coding->src_pos_byte = 0;
d46c5b12 7971 }
df7492f9 7972 else
d46c5b12 7973 {
ff0dacd7
KH
7974 if (from < GPT && to >= GPT)
7975 move_gap_both (from, from_byte);
df7492f9
KH
7976 coding->src_pos = from;
7977 coding->src_pos_byte = from_byte;
d46c5b12 7978 }
d46c5b12 7979 }
4776e638 7980 else
24a73b0a 7981 code_conversion_save (0, 0);
d46c5b12 7982
df7492f9 7983 if (BUFFERP (dst_object))
88993dfd 7984 {
df7492f9 7985 coding->dst_object = dst_object;
28f67a95
KH
7986 if (EQ (src_object, dst_object))
7987 {
7988 coding->dst_pos = from;
7989 coding->dst_pos_byte = from_byte;
7990 }
7991 else
7992 {
319a3947
KH
7993 struct buffer *current = current_buffer;
7994
7995 set_buffer_temp (XBUFFER (dst_object));
7996 coding->dst_pos = PT;
7997 coding->dst_pos_byte = PT_BYTE;
7998 move_gap_both (coding->dst_pos, coding->dst_pos_byte);
7999 set_buffer_temp (current);
28f67a95 8000 }
df7492f9 8001 coding->dst_multibyte
4b4deea2 8002 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
88993dfd 8003 }
df7492f9 8004 else if (EQ (dst_object, Qt))
d46c5b12 8005 {
5d009b3a 8006 ptrdiff_t dst_bytes = max (1, coding->src_chars);
df7492f9 8007 coding->dst_object = Qnil;
5d009b3a
PE
8008 coding->destination = (unsigned char *) xmalloc (dst_bytes);
8009 coding->dst_bytes = dst_bytes;
df7492f9 8010 coding->dst_multibyte = 0;
d46c5b12
KH
8011 }
8012 else
8013 {
df7492f9
KH
8014 coding->dst_object = Qnil;
8015 coding->dst_multibyte = 0;
d46c5b12
KH
8016 }
8017
df7492f9 8018 encode_coding (coding);
d46c5b12 8019
df7492f9 8020 if (EQ (dst_object, Qt))
d46c5b12 8021 {
df7492f9
KH
8022 if (BUFFERP (coding->dst_object))
8023 coding->dst_object = Fbuffer_string ();
8024 else
d46c5b12 8025 {
df7492f9
KH
8026 coding->dst_object
8027 = make_unibyte_string ((char *) coding->destination,
8028 coding->produced);
8029 xfree (coding->destination);
d46c5b12 8030 }
4ed46869 8031 }
d46c5b12 8032
4776e638
KH
8033 if (saved_pt >= 0)
8034 {
8035 /* This is the case of:
8036 (BUFFERP (src_object) && EQ (src_object, dst_object))
8037 As we have moved PT while replacing the original buffer
8038 contents, we must recover it now. */
8039 set_buffer_internal (XBUFFER (src_object));
8040 if (saved_pt < from)
8041 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8042 else if (saved_pt < from + chars)
8043 TEMP_SET_PT_BOTH (from, from_byte);
4b4deea2 8044 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
4776e638
KH
8045 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8046 saved_pt_byte + (coding->produced - bytes));
d46c5b12 8047 else
4776e638
KH
8048 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8049 saved_pt_byte + (coding->produced - bytes));
64cedb0c
KH
8050
8051 if (need_marker_adjustment)
8052 {
8053 struct Lisp_Marker *tail;
8054
8055 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8056 if (tail->need_adjustment)
8057 {
8058 tail->need_adjustment = 0;
8059 if (tail->insertion_type)
8060 {
8061 tail->bytepos = from_byte;
8062 tail->charpos = from;
8063 }
8064 else
8065 {
8066 tail->bytepos = from_byte + coding->produced;
8067 tail->charpos
4b4deea2 8068 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
64cedb0c
KH
8069 ? tail->bytepos : from + coding->produced_char);
8070 }
8071 }
8072 }
4776e638
KH
8073 }
8074
c02d943b
KH
8075 if (kill_src_buffer)
8076 Fkill_buffer (coding->src_object);
b3bfad50
KH
8077
8078 Vdeactivate_mark = old_deactivate_mark;
df7492f9 8079 unbind_to (count, Qnil);
b73bfc1c
KH
8080}
8081
df7492f9 8082
b73bfc1c 8083Lisp_Object
971de7fb 8084preferred_coding_system (void)
b73bfc1c 8085{
df7492f9 8086 int id = coding_categories[coding_priorities[0]].id;
2391eaa4 8087
df7492f9 8088 return CODING_ID_NAME (id);
4ed46869
KH
8089}
8090
8091\f
8092#ifdef emacs
1397dc18 8093/*** 11. Emacs Lisp library functions ***/
4ed46869 8094
a7ca3326 8095DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
48b0f3ae 8096 doc: /* Return t if OBJECT is nil or a coding-system.
df7492f9 8097See the documentation of `define-coding-system' for information
48b0f3ae 8098about coding-system objects. */)
5842a27b 8099 (Lisp_Object object)
4ed46869 8100{
d4a1d553
JB
8101 if (NILP (object)
8102 || CODING_SYSTEM_ID (object) >= 0)
44e8490d 8103 return Qt;
d4a1d553
JB
8104 if (! SYMBOLP (object)
8105 || NILP (Fget (object, Qcoding_system_define_form)))
44e8490d
KH
8106 return Qnil;
8107 return Qt;
4ed46869
KH
8108}
8109
a7ca3326 8110DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
9d991de8 8111 Sread_non_nil_coding_system, 1, 1, 0,
48b0f3ae 8112 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
5842a27b 8113 (Lisp_Object prompt)
4ed46869 8114{
e0e989f6 8115 Lisp_Object val;
9d991de8
RS
8116 do
8117 {
4608c386
KH
8118 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8119 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
9d991de8 8120 }
8f924df7 8121 while (SCHARS (val) == 0);
e0e989f6 8122 return (Fintern (val, Qnil));
4ed46869
KH
8123}
8124
a7ca3326 8125DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
48b0f3ae 8126 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
c7183fb8
GM
8127If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8128Ignores case when completing coding systems (all Emacs coding systems
8129are lower-case). */)
5842a27b 8130 (Lisp_Object prompt, Lisp_Object default_coding_system)
4ed46869 8131{
f44d27ce 8132 Lisp_Object val;
d311d28c 8133 ptrdiff_t count = SPECPDL_INDEX ();
c7183fb8 8134
9b787f3e 8135 if (SYMBOLP (default_coding_system))
57d25e6f 8136 default_coding_system = SYMBOL_NAME (default_coding_system);
c7183fb8 8137 specbind (Qcompletion_ignore_case, Qt);
4608c386 8138 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
9b787f3e
RS
8139 Qt, Qnil, Qcoding_system_history,
8140 default_coding_system, Qnil);
c7183fb8 8141 unbind_to (count, Qnil);
8f924df7 8142 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
8143}
8144
a7ca3326 8145DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
4ed46869 8146 1, 1, 0,
48b0f3ae 8147 doc: /* Check validity of CODING-SYSTEM.
9ffd559c
KH
8148If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8149It is valid if it is nil or a symbol defined as a coding system by the
8150function `define-coding-system'. */)
5842a27b 8151 (Lisp_Object coding_system)
4ed46869 8152{
44e8490d
KH
8153 Lisp_Object define_form;
8154
8155 define_form = Fget (coding_system, Qcoding_system_define_form);
8156 if (! NILP (define_form))
8157 {
8158 Fput (coding_system, Qcoding_system_define_form, Qnil);
8159 safe_eval (define_form);
8160 }
4ed46869
KH
8161 if (!NILP (Fcoding_system_p (coding_system)))
8162 return coding_system;
fcad4ec4 8163 xsignal1 (Qcoding_system_error, coding_system);
4ed46869 8164}
df7492f9 8165
3a73fa5d 8166\f
89528eb3
KH
8167/* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
8168 HIGHEST is nonzero, return the coding system of the highest
ad1746f5 8169 priority among the detected coding systems. Otherwise return a
89528eb3
KH
8170 list of detected coding systems sorted by their priorities. If
8171 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
8172 multibyte form but contains only ASCII and eight-bit chars.
8173 Otherwise, the bytes are raw bytes.
8174
8175 CODING-SYSTEM controls the detection as below:
8176
8177 If it is nil, detect both text-format and eol-format. If the
8178 text-format part of CODING-SYSTEM is already specified
8179 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
8180 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8181 detect only text-format. */
8182
d46c5b12 8183Lisp_Object
cf84bb53 8184detect_coding_system (const unsigned char *src,
d311d28c 8185 ptrdiff_t src_chars, ptrdiff_t src_bytes,
cf84bb53
JB
8186 int highest, int multibytep,
8187 Lisp_Object coding_system)
4ed46869 8188{
8f924df7 8189 const unsigned char *src_end = src + src_bytes;
df7492f9 8190 Lisp_Object attrs, eol_type;
4533845d 8191 Lisp_Object val = Qnil;
df7492f9 8192 struct coding_system coding;
d3411f89 8193 ptrdiff_t id;
ff0dacd7 8194 struct coding_detection_info detect_info;
24a73b0a 8195 enum coding_category base_category;
2f3cbb32 8196 int null_byte_found = 0, eight_bit_found = 0;
b73bfc1c 8197
df7492f9
KH
8198 if (NILP (coding_system))
8199 coding_system = Qundecided;
8200 setup_coding_system (coding_system, &coding);
8201 attrs = CODING_ID_ATTRS (coding.id);
8202 eol_type = CODING_ID_EOL_TYPE (coding.id);
89528eb3 8203 coding_system = CODING_ATTR_BASE_NAME (attrs);
4ed46869 8204
df7492f9 8205 coding.source = src;
24a73b0a 8206 coding.src_chars = src_chars;
df7492f9
KH
8207 coding.src_bytes = src_bytes;
8208 coding.src_multibyte = multibytep;
8209 coding.consumed = 0;
89528eb3 8210 coding.mode |= CODING_MODE_LAST_BLOCK;
c0e16b14 8211 coding.head_ascii = 0;
d46c5b12 8212
ff0dacd7 8213 detect_info.checked = detect_info.found = detect_info.rejected = 0;
d46c5b12 8214
89528eb3 8215 /* At first, detect text-format if necessary. */
24a73b0a
KH
8216 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8217 if (base_category == coding_category_undecided)
4ed46869 8218 {
c4a63b12
PE
8219 enum coding_category category IF_LINT (= 0);
8220 struct coding_system *this IF_LINT (= NULL);
ff0dacd7 8221 int c, i;
88993dfd 8222
24a73b0a 8223 /* Skip all ASCII bytes except for a few ISO2022 controls. */
2f3cbb32 8224 for (; src < src_end; src++)
4ed46869 8225 {
df7492f9 8226 c = *src;
6cb21a4f 8227 if (c & 0x80)
6cb21a4f 8228 {
2f3cbb32 8229 eight_bit_found = 1;
2f3cbb32
KH
8230 if (null_byte_found)
8231 break;
8232 }
c0e16b14 8233 else if (c < 0x20)
2f3cbb32
KH
8234 {
8235 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8236 && ! inhibit_iso_escape_detection
8237 && ! detect_info.checked)
6cb21a4f 8238 {
2f3cbb32
KH
8239 if (detect_coding_iso_2022 (&coding, &detect_info))
8240 {
8241 /* We have scanned the whole data. */
8242 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
c0e16b14
KH
8243 {
8244 /* We didn't find an 8-bit code. We may
8245 have found a null-byte, but it's very
8246 rare that a binary file confirm to
8247 ISO-2022. */
8248 src = src_end;
8249 coding.head_ascii = src - coding.source;
8250 }
8251 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
2f3cbb32
KH
8252 break;
8253 }
8254 }
97b1b294 8255 else if (! c && !inhibit_null_byte_detection)
2f3cbb32
KH
8256 {
8257 null_byte_found = 1;
8258 if (eight_bit_found)
8259 break;
6cb21a4f 8260 }
c006c0c8
KH
8261 if (! eight_bit_found)
8262 coding.head_ascii++;
6cb21a4f 8263 }
c006c0c8 8264 else if (! eight_bit_found)
c0e16b14 8265 coding.head_ascii++;
4ed46869 8266 }
88993dfd 8267
2f3cbb32
KH
8268 if (null_byte_found || eight_bit_found
8269 || coding.head_ascii < coding.src_bytes
6cb21a4f
KH
8270 || detect_info.found)
8271 {
2f3cbb32 8272 if (coding.head_ascii == coding.src_bytes)
6cb21a4f
KH
8273 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
8274 for (i = 0; i < coding_category_raw_text; i++)
ff0dacd7 8275 {
6cb21a4f 8276 category = coding_priorities[i];
c7266f4a 8277 this = coding_categories + category;
6cb21a4f 8278 if (detect_info.found & (1 << category))
ff0dacd7
KH
8279 break;
8280 }
6cb21a4f 8281 else
2f3cbb32
KH
8282 {
8283 if (null_byte_found)
8284 {
8285 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8286 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8287 }
8288 for (i = 0; i < coding_category_raw_text; i++)
8289 {
8290 category = coding_priorities[i];
8291 this = coding_categories + category;
6cb21a4f 8292
2f3cbb32
KH
8293 if (this->id < 0)
8294 {
8295 /* No coding system of this category is defined. */
8296 detect_info.rejected |= (1 << category);
8297 }
8298 else if (category >= coding_category_raw_text)
8299 continue;
8300 else if (detect_info.checked & (1 << category))
8301 {
8302 if (highest
8303 && (detect_info.found & (1 << category)))
6cb21a4f 8304 break;
2f3cbb32
KH
8305 }
8306 else if ((*(this->detector)) (&coding, &detect_info)
8307 && highest
8308 && (detect_info.found & (1 << category)))
8309 {
8310 if (category == coding_category_utf_16_auto)
8311 {
8312 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8313 category = coding_category_utf_16_le;
8314 else
8315 category = coding_category_utf_16_be;
8316 }
8317 break;
8318 }
8319 }
8320 }
6cb21a4f 8321 }
ec6d2bb8 8322
4cddb209
KH
8323 if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8324 || null_byte_found)
ec6d2bb8 8325 {
ff0dacd7 8326 detect_info.found = CATEGORY_MASK_RAW_TEXT;
4cddb209 8327 id = CODING_SYSTEM_ID (Qno_conversion);
89528eb3
KH
8328 val = Fcons (make_number (id), Qnil);
8329 }
ff0dacd7 8330 else if (! detect_info.rejected && ! detect_info.found)
89528eb3 8331 {
ff0dacd7 8332 detect_info.found = CATEGORY_MASK_ANY;
89528eb3
KH
8333 id = coding_categories[coding_category_undecided].id;
8334 val = Fcons (make_number (id), Qnil);
8335 }
8336 else if (highest)
8337 {
ff0dacd7 8338 if (detect_info.found)
ec6d2bb8 8339 {
ff0dacd7
KH
8340 detect_info.found = 1 << category;
8341 val = Fcons (make_number (this->id), Qnil);
8342 }
8343 else
8344 for (i = 0; i < coding_category_raw_text; i++)
8345 if (! (detect_info.rejected & (1 << coding_priorities[i])))
8346 {
8347 detect_info.found = 1 << coding_priorities[i];
8348 id = coding_categories[coding_priorities[i]].id;
8349 val = Fcons (make_number (id), Qnil);
8350 break;
8351 }
8352 }
89528eb3
KH
8353 else
8354 {
ff0dacd7
KH
8355 int mask = detect_info.rejected | detect_info.found;
8356 int found = 0;
ec6d2bb8 8357
89528eb3 8358 for (i = coding_category_raw_text - 1; i >= 0; i--)
ff0dacd7
KH
8359 {
8360 category = coding_priorities[i];
8361 if (! (mask & (1 << category)))
ec6d2bb8 8362 {
ff0dacd7
KH
8363 found |= 1 << category;
8364 id = coding_categories[category].id;
c7266f4a
KH
8365 if (id >= 0)
8366 val = Fcons (make_number (id), val);
ff0dacd7
KH
8367 }
8368 }
8369 for (i = coding_category_raw_text - 1; i >= 0; i--)
8370 {
8371 category = coding_priorities[i];
8372 if (detect_info.found & (1 << category))
8373 {
8374 id = coding_categories[category].id;
8375 val = Fcons (make_number (id), val);
ec6d2bb8 8376 }
ec6d2bb8 8377 }
ff0dacd7 8378 detect_info.found |= found;
ec6d2bb8 8379 }
ec6d2bb8 8380 }
a470d443
KH
8381 else if (base_category == coding_category_utf_8_auto)
8382 {
8383 if (detect_coding_utf_8 (&coding, &detect_info))
8384 {
8385 struct coding_system *this;
8386
8387 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8388 this = coding_categories + coding_category_utf_8_sig;
8389 else
8390 this = coding_categories + coding_category_utf_8_nosig;
8391 val = Fcons (make_number (this->id), Qnil);
8392 }
8393 }
24a73b0a
KH
8394 else if (base_category == coding_category_utf_16_auto)
8395 {
8396 if (detect_coding_utf_16 (&coding, &detect_info))
8397 {
24a73b0a
KH
8398 struct coding_system *this;
8399
8400 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8401 this = coding_categories + coding_category_utf_16_le;
8402 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8403 this = coding_categories + coding_category_utf_16_be;
8404 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8405 this = coding_categories + coding_category_utf_16_be_nosig;
8406 else
8407 this = coding_categories + coding_category_utf_16_le_nosig;
8408 val = Fcons (make_number (this->id), Qnil);
8409 }
8410 }
df7492f9
KH
8411 else
8412 {
ff0dacd7 8413 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
89528eb3 8414 val = Fcons (make_number (coding.id), Qnil);
4ed46869 8415 }
df7492f9 8416
89528eb3 8417 /* Then, detect eol-format if necessary. */
df7492f9 8418 {
4533845d 8419 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
df7492f9
KH
8420 Lisp_Object tail;
8421
89528eb3
KH
8422 if (VECTORP (eol_type))
8423 {
ff0dacd7 8424 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
2f3cbb32
KH
8425 {
8426 if (null_byte_found)
8427 normal_eol = EOL_SEEN_LF;
8428 else
8429 normal_eol = detect_eol (coding.source, src_bytes,
8430 coding_category_raw_text);
8431 }
ff0dacd7
KH
8432 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8433 | CATEGORY_MASK_UTF_16_BE_NOSIG))
89528eb3
KH
8434 utf_16_be_eol = detect_eol (coding.source, src_bytes,
8435 coding_category_utf_16_be);
ff0dacd7
KH
8436 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8437 | CATEGORY_MASK_UTF_16_LE_NOSIG))
89528eb3
KH
8438 utf_16_le_eol = detect_eol (coding.source, src_bytes,
8439 coding_category_utf_16_le);
8440 }
8441 else
8442 {
8443 if (EQ (eol_type, Qunix))
8444 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8445 else if (EQ (eol_type, Qdos))
8446 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8447 else
8448 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8449 }
8450
df7492f9
KH
8451 for (tail = val; CONSP (tail); tail = XCDR (tail))
8452 {
89528eb3 8453 enum coding_category category;
df7492f9 8454 int this_eol;
89528eb3
KH
8455
8456 id = XINT (XCAR (tail));
8457 attrs = CODING_ID_ATTRS (id);
8458 category = XINT (CODING_ATTR_CATEGORY (attrs));
8459 eol_type = CODING_ID_EOL_TYPE (id);
df7492f9
KH
8460 if (VECTORP (eol_type))
8461 {
89528eb3
KH
8462 if (category == coding_category_utf_16_be
8463 || category == coding_category_utf_16_be_nosig)
8464 this_eol = utf_16_be_eol;
8465 else if (category == coding_category_utf_16_le
8466 || category == coding_category_utf_16_le_nosig)
8467 this_eol = utf_16_le_eol;
df7492f9 8468 else
89528eb3
KH
8469 this_eol = normal_eol;
8470
df7492f9
KH
8471 if (this_eol == EOL_SEEN_LF)
8472 XSETCAR (tail, AREF (eol_type, 0));
8473 else if (this_eol == EOL_SEEN_CRLF)
8474 XSETCAR (tail, AREF (eol_type, 1));
8475 else if (this_eol == EOL_SEEN_CR)
8476 XSETCAR (tail, AREF (eol_type, 2));
89528eb3
KH
8477 else
8478 XSETCAR (tail, CODING_ID_NAME (id));
df7492f9 8479 }
89528eb3
KH
8480 else
8481 XSETCAR (tail, CODING_ID_NAME (id));
df7492f9
KH
8482 }
8483 }
ec6d2bb8 8484
4533845d 8485 return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
ec6d2bb8
KH
8486}
8487
ec6d2bb8 8488
d46c5b12
KH
8489DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8490 2, 3, 0,
48b0f3ae
PJ
8491 doc: /* Detect coding system of the text in the region between START and END.
8492Return a list of possible coding systems ordered by priority.
b811c52b
KH
8493The coding systems to try and their priorities follows what
8494the function `coding-system-priority-list' (which see) returns.
ec6d2bb8 8495
12e0131a 8496If only ASCII characters are found (except for such ISO-2022 control
d4a1d553
JB
8497characters as ESC), it returns a list of single element `undecided'
8498or its subsidiary coding system according to a detected end-of-line
8499format.
ec6d2bb8 8500
48b0f3ae
PJ
8501If optional argument HIGHEST is non-nil, return the coding system of
8502highest priority. */)
5842a27b 8503 (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
d46c5b12 8504{
d311d28c
PE
8505 ptrdiff_t from, to;
8506 ptrdiff_t from_byte, to_byte;
ec6d2bb8 8507
b7826503
PJ
8508 CHECK_NUMBER_COERCE_MARKER (start);
8509 CHECK_NUMBER_COERCE_MARKER (end);
ec6d2bb8 8510
d46c5b12
KH
8511 validate_region (&start, &end);
8512 from = XINT (start), to = XINT (end);
8513 from_byte = CHAR_TO_BYTE (from);
8514 to_byte = CHAR_TO_BYTE (to);
ec6d2bb8 8515
d46c5b12
KH
8516 if (from < GPT && to >= GPT)
8517 move_gap_both (to, to_byte);
c210f766 8518
d46c5b12 8519 return detect_coding_system (BYTE_POS_ADDR (from_byte),
24a73b0a 8520 to - from, to_byte - from_byte,
0a28aafb 8521 !NILP (highest),
4b4deea2 8522 !NILP (BVAR (current_buffer
5d8ea120 8523 , enable_multibyte_characters)),
df7492f9 8524 Qnil);
ec6d2bb8
KH
8525}
8526
d46c5b12
KH
8527DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8528 1, 2, 0,
48b0f3ae
PJ
8529 doc: /* Detect coding system of the text in STRING.
8530Return a list of possible coding systems ordered by priority.
67ceab9d
KH
8531The coding systems to try and their priorities follows what
8532the function `coding-system-priority-list' (which see) returns.
fb88bf2d 8533
12e0131a 8534If only ASCII characters are found (except for such ISO-2022 control
d4a1d553
JB
8535characters as ESC), it returns a list of single element `undecided'
8536or its subsidiary coding system according to a detected end-of-line
8537format.
d46c5b12 8538
48b0f3ae
PJ
8539If optional argument HIGHEST is non-nil, return the coding system of
8540highest priority. */)
5842a27b 8541 (Lisp_Object string, Lisp_Object highest)
d46c5b12 8542{
b7826503 8543 CHECK_STRING (string);
b73bfc1c 8544
24a73b0a
KH
8545 return detect_coding_system (SDATA (string),
8546 SCHARS (string), SBYTES (string),
8f924df7 8547 !NILP (highest), STRING_MULTIBYTE (string),
df7492f9 8548 Qnil);
4ed46869 8549}
4ed46869 8550
b73bfc1c 8551
55d4c1b2 8552static inline int
971de7fb 8553char_encodable_p (int c, Lisp_Object attrs)
05e6f5dc 8554{
df7492f9 8555 Lisp_Object tail;
df7492f9 8556 struct charset *charset;
7d64c6ad 8557 Lisp_Object translation_table;
d46c5b12 8558
7d64c6ad 8559 translation_table = CODING_ATTR_TRANS_TBL (attrs);
a6f87d34 8560 if (! NILP (translation_table))
7d64c6ad 8561 c = translate_char (translation_table, c);
df7492f9
KH
8562 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8563 CONSP (tail); tail = XCDR (tail))
e133c8fa 8564 {
df7492f9
KH
8565 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8566 if (CHAR_CHARSET_P (c, charset))
8567 break;
e133c8fa 8568 }
df7492f9 8569 return (! NILP (tail));
05e6f5dc 8570}
83fa074f 8571
fb88bf2d 8572
df7492f9
KH
8573/* Return a list of coding systems that safely encode the text between
8574 START and END. If EXCLUDE is non-nil, it is a list of coding
8575 systems not to check. The returned list doesn't contain any such
48468dac 8576 coding systems. In any case, if the text contains only ASCII or is
df7492f9 8577 unibyte, return t. */
e077cc80 8578
df7492f9
KH
8579DEFUN ("find-coding-systems-region-internal",
8580 Ffind_coding_systems_region_internal,
8581 Sfind_coding_systems_region_internal, 2, 3, 0,
8582 doc: /* Internal use only. */)
5842a27b 8583 (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
df7492f9
KH
8584{
8585 Lisp_Object coding_attrs_list, safe_codings;
d311d28c 8586 ptrdiff_t start_byte, end_byte;
7c78e542 8587 const unsigned char *p, *pbeg, *pend;
df7492f9 8588 int c;
0e727afa 8589 Lisp_Object tail, elt, work_table;
d46c5b12 8590
df7492f9
KH
8591 if (STRINGP (start))
8592 {
8593 if (!STRING_MULTIBYTE (start)
8f924df7 8594 || SCHARS (start) == SBYTES (start))
df7492f9
KH
8595 return Qt;
8596 start_byte = 0;
8f924df7 8597 end_byte = SBYTES (start);
df7492f9
KH
8598 }
8599 else
d46c5b12 8600 {
df7492f9
KH
8601 CHECK_NUMBER_COERCE_MARKER (start);
8602 CHECK_NUMBER_COERCE_MARKER (end);
8603 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8604 args_out_of_range (start, end);
4b4deea2 8605 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
df7492f9
KH
8606 return Qt;
8607 start_byte = CHAR_TO_BYTE (XINT (start));
8608 end_byte = CHAR_TO_BYTE (XINT (end));
8609 if (XINT (end) - XINT (start) == end_byte - start_byte)
8610 return Qt;
d46c5b12 8611
e1c23804 8612 if (XINT (start) < GPT && XINT (end) > GPT)
d46c5b12 8613 {
e1c23804
DL
8614 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8615 move_gap_both (XINT (start), start_byte);
df7492f9 8616 else
e1c23804 8617 move_gap_both (XINT (end), end_byte);
d46c5b12
KH
8618 }
8619 }
8620
df7492f9
KH
8621 coding_attrs_list = Qnil;
8622 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8623 if (NILP (exclude)
8624 || NILP (Fmemq (XCAR (tail), exclude)))
8625 {
8626 Lisp_Object attrs;
d46c5b12 8627
df7492f9
KH
8628 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8629 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8630 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7d64c6ad
KH
8631 {
8632 ASET (attrs, coding_attr_trans_tbl,
2170c8f0 8633 get_translation_table (attrs, 1, NULL));
7d64c6ad
KH
8634 coding_attrs_list = Fcons (attrs, coding_attrs_list);
8635 }
df7492f9 8636 }
d46c5b12 8637
df7492f9 8638 if (STRINGP (start))
8f924df7 8639 p = pbeg = SDATA (start);
df7492f9
KH
8640 else
8641 p = pbeg = BYTE_POS_ADDR (start_byte);
8642 pend = p + (end_byte - start_byte);
b843d1ae 8643
df7492f9
KH
8644 while (p < pend && ASCII_BYTE_P (*p)) p++;
8645 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
d46c5b12 8646
0e727afa 8647 work_table = Fmake_char_table (Qnil, Qnil);
05e6f5dc 8648 while (p < pend)
72d1a715 8649 {
df7492f9
KH
8650 if (ASCII_BYTE_P (*p))
8651 p++;
72d1a715
RS
8652 else
8653 {
df7492f9 8654 c = STRING_CHAR_ADVANCE (p);
0e727afa
YM
8655 if (!NILP (char_table_ref (work_table, c)))
8656 /* This character was already checked. Ignore it. */
8657 continue;
12410ef1 8658
df7492f9
KH
8659 charset_map_loaded = 0;
8660 for (tail = coding_attrs_list; CONSP (tail);)
8661 {
8662 elt = XCAR (tail);
8663 if (NILP (elt))
8664 tail = XCDR (tail);
8665 else if (char_encodable_p (c, elt))
8666 tail = XCDR (tail);
8667 else if (CONSP (XCDR (tail)))
8668 {
8669 XSETCAR (tail, XCAR (XCDR (tail)));
8670 XSETCDR (tail, XCDR (XCDR (tail)));
8671 }
8672 else
8673 {
8674 XSETCAR (tail, Qnil);
8675 tail = XCDR (tail);
8676 }
8677 }
8678 if (charset_map_loaded)
8679 {
d311d28c 8680 ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
05e6f5dc 8681
df7492f9 8682 if (STRINGP (start))
8f924df7 8683 pbeg = SDATA (start);
df7492f9
KH
8684 else
8685 pbeg = BYTE_POS_ADDR (start_byte);
8686 p = pbeg + p_offset;
8687 pend = pbeg + pend_offset;
8688 }
0e727afa 8689 char_table_set (work_table, c, Qt);
df7492f9 8690 }
ec6d2bb8 8691 }
fb88bf2d 8692
988b3759 8693 safe_codings = list2 (Qraw_text, Qno_conversion);
df7492f9
KH
8694 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8695 if (! NILP (XCAR (tail)))
8696 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
ec6d2bb8 8697
05e6f5dc
KH
8698 return safe_codings;
8699}
4956c225 8700
d46c5b12 8701
8f924df7
KH
8702DEFUN ("unencodable-char-position", Funencodable_char_position,
8703 Sunencodable_char_position, 3, 5, 0,
8704 doc: /*
8705Return position of first un-encodable character in a region.
d4a1d553 8706START and END specify the region and CODING-SYSTEM specifies the
8f924df7 8707encoding to check. Return nil if CODING-SYSTEM does encode the region.
d46c5b12 8708
8f924df7
KH
8709If optional 4th argument COUNT is non-nil, it specifies at most how
8710many un-encodable characters to search. In this case, the value is a
8711list of positions.
d46c5b12 8712
8f924df7
KH
8713If optional 5th argument STRING is non-nil, it is a string to search
8714for un-encodable characters. In that case, START and END are indexes
8715to the string. */)
5842a27b 8716 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8f924df7 8717{
d311d28c 8718 EMACS_INT n;
8f924df7 8719 struct coding_system coding;
7d64c6ad 8720 Lisp_Object attrs, charset_list, translation_table;
8f924df7 8721 Lisp_Object positions;
d311d28c 8722 ptrdiff_t from, to;
8f924df7
KH
8723 const unsigned char *p, *stop, *pend;
8724 int ascii_compatible;
fb88bf2d 8725
8f924df7
KH
8726 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8727 attrs = CODING_ID_ATTRS (coding.id);
8728 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8729 return Qnil;
8730 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8731 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2170c8f0 8732 translation_table = get_translation_table (attrs, 1, NULL);
fb88bf2d 8733
8f924df7
KH
8734 if (NILP (string))
8735 {
8736 validate_region (&start, &end);
8737 from = XINT (start);
8738 to = XINT (end);
4b4deea2 8739 if (NILP (BVAR (current_buffer, enable_multibyte_characters))
8f924df7
KH
8740 || (ascii_compatible
8741 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8742 return Qnil;
8743 p = CHAR_POS_ADDR (from);
8744 pend = CHAR_POS_ADDR (to);
8745 if (from < GPT && to >= GPT)
8746 stop = GPT_ADDR;
8747 else
8748 stop = pend;
8749 }
8750 else
8751 {
8752 CHECK_STRING (string);
8753 CHECK_NATNUM (start);
8754 CHECK_NATNUM (end);
d311d28c
PE
8755 if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
8756 args_out_of_range_3 (string, start, end);
8f924df7
KH
8757 from = XINT (start);
8758 to = XINT (end);
8f924df7
KH
8759 if (! STRING_MULTIBYTE (string))
8760 return Qnil;
8761 p = SDATA (string) + string_char_to_byte (string, from);
8762 stop = pend = SDATA (string) + string_char_to_byte (string, to);
8763 if (ascii_compatible && (to - from) == (pend - p))
8764 return Qnil;
8765 }
f2558efd 8766
8f924df7
KH
8767 if (NILP (count))
8768 n = 1;
8769 else
b73bfc1c 8770 {
8f924df7
KH
8771 CHECK_NATNUM (count);
8772 n = XINT (count);
b73bfc1c
KH
8773 }
8774
8f924df7 8775 positions = Qnil;
3633e3aa 8776 charset_map_loaded = 0;
8f924df7 8777 while (1)
d46c5b12 8778 {
8f924df7 8779 int c;
ec6d2bb8 8780
8f924df7
KH
8781 if (ascii_compatible)
8782 while (p < stop && ASCII_BYTE_P (*p))
8783 p++, from++;
8784 if (p >= stop)
0e79d667 8785 {
8f924df7
KH
8786 if (p >= pend)
8787 break;
8788 stop = pend;
8789 p = GAP_END_ADDR;
0e79d667 8790 }
ec6d2bb8 8791
8f924df7
KH
8792 c = STRING_CHAR_ADVANCE (p);
8793 if (! (ASCII_CHAR_P (c) && ascii_compatible)
7d64c6ad
KH
8794 && ! char_charset (translate_char (translation_table, c),
8795 charset_list, NULL))
ec6d2bb8 8796 {
8f924df7
KH
8797 positions = Fcons (make_number (from), positions);
8798 n--;
8799 if (n == 0)
8800 break;
ec6d2bb8
KH
8801 }
8802
8f924df7 8803 from++;
3633e3aa
KH
8804 if (charset_map_loaded && NILP (string))
8805 {
8806 p = CHAR_POS_ADDR (from);
8807 pend = CHAR_POS_ADDR (to);
8808 if (from < GPT && to >= GPT)
8809 stop = GPT_ADDR;
8810 else
8811 stop = pend;
8812 charset_map_loaded = 0;
8813 }
8f924df7 8814 }
d46c5b12 8815
8f924df7
KH
8816 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8817}
d46c5b12 8818
d46c5b12 8819
df7492f9
KH
8820DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8821 Scheck_coding_systems_region, 3, 3, 0,
8822 doc: /* Check if the region is encodable by coding systems.
d46c5b12 8823
df7492f9
KH
8824START and END are buffer positions specifying the region.
8825CODING-SYSTEM-LIST is a list of coding systems to check.
d46c5b12 8826
df7492f9 8827The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
d4a1d553 8828CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
df7492f9
KH
8829whole region, POS0, POS1, ... are buffer positions where non-encodable
8830characters are found.
93dec019 8831
df7492f9
KH
8832If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8833value is nil.
93dec019 8834
df7492f9
KH
8835START may be a string. In that case, check if the string is
8836encodable, and the value contains indices to the string instead of
5704f39a
KH
8837buffer positions. END is ignored.
8838
4c1958f4 8839If the current buffer (or START if it is a string) is unibyte, the value
5704f39a 8840is nil. */)
5842a27b 8841 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
05e6f5dc 8842{
df7492f9 8843 Lisp_Object list;
d311d28c
PE
8844 ptrdiff_t start_byte, end_byte;
8845 ptrdiff_t pos;
7c78e542 8846 const unsigned char *p, *pbeg, *pend;
df7492f9 8847 int c;
7d64c6ad 8848 Lisp_Object tail, elt, attrs;
70ad9fc4 8849
05e6f5dc
KH
8850 if (STRINGP (start))
8851 {
df7492f9 8852 if (!STRING_MULTIBYTE (start)
4c1958f4 8853 || SCHARS (start) == SBYTES (start))
df7492f9
KH
8854 return Qnil;
8855 start_byte = 0;
8f924df7 8856 end_byte = SBYTES (start);
df7492f9 8857 pos = 0;
d46c5b12 8858 }
05e6f5dc 8859 else
b73bfc1c 8860 {
b7826503
PJ
8861 CHECK_NUMBER_COERCE_MARKER (start);
8862 CHECK_NUMBER_COERCE_MARKER (end);
05e6f5dc
KH
8863 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8864 args_out_of_range (start, end);
4b4deea2 8865 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
df7492f9
KH
8866 return Qnil;
8867 start_byte = CHAR_TO_BYTE (XINT (start));
8868 end_byte = CHAR_TO_BYTE (XINT (end));
8869 if (XINT (end) - XINT (start) == end_byte - start_byte)
5704f39a 8870 return Qnil;
df7492f9 8871
e1c23804 8872 if (XINT (start) < GPT && XINT (end) > GPT)
b73bfc1c 8873 {
e1c23804
DL
8874 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8875 move_gap_both (XINT (start), start_byte);
df7492f9 8876 else
e1c23804 8877 move_gap_both (XINT (end), end_byte);
b73bfc1c 8878 }
e1c23804 8879 pos = XINT (start);
b73bfc1c 8880 }
7553d0e1 8881
df7492f9
KH
8882 list = Qnil;
8883 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
12410ef1 8884 {
df7492f9 8885 elt = XCAR (tail);
7d64c6ad 8886 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
2170c8f0
KH
8887 ASET (attrs, coding_attr_trans_tbl,
8888 get_translation_table (attrs, 1, NULL));
7d64c6ad 8889 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
12410ef1
KH
8890 }
8891
df7492f9 8892 if (STRINGP (start))
8f924df7 8893 p = pbeg = SDATA (start);
72d1a715 8894 else
df7492f9
KH
8895 p = pbeg = BYTE_POS_ADDR (start_byte);
8896 pend = p + (end_byte - start_byte);
4ed46869 8897
df7492f9
KH
8898 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8899 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
ec6d2bb8 8900
df7492f9 8901 while (p < pend)
d46c5b12 8902 {
df7492f9
KH
8903 if (ASCII_BYTE_P (*p))
8904 p++;
e133c8fa 8905 else
05e6f5dc 8906 {
df7492f9
KH
8907 c = STRING_CHAR_ADVANCE (p);
8908
8909 charset_map_loaded = 0;
8910 for (tail = list; CONSP (tail); tail = XCDR (tail))
8911 {
8912 elt = XCDR (XCAR (tail));
8913 if (! char_encodable_p (c, XCAR (elt)))
8914 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8915 }
8916 if (charset_map_loaded)
8917 {
d311d28c 8918 ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
df7492f9
KH
8919
8920 if (STRINGP (start))
8f924df7 8921 pbeg = SDATA (start);
df7492f9
KH
8922 else
8923 pbeg = BYTE_POS_ADDR (start_byte);
8924 p = pbeg + p_offset;
8925 pend = pbeg + pend_offset;
8926 }
05e6f5dc 8927 }
df7492f9 8928 pos++;
d46c5b12 8929 }
4ed46869 8930
df7492f9
KH
8931 tail = list;
8932 list = Qnil;
8933 for (; CONSP (tail); tail = XCDR (tail))
ec6d2bb8 8934 {
df7492f9
KH
8935 elt = XCAR (tail);
8936 if (CONSP (XCDR (XCDR (elt))))
8937 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8938 list);
ec6d2bb8 8939 }
2b4f9037 8940
df7492f9 8941 return list;
d46c5b12
KH
8942}
8943
3fd9494b 8944
74ab6df5 8945static Lisp_Object
cf84bb53
JB
8946code_convert_region (Lisp_Object start, Lisp_Object end,
8947 Lisp_Object coding_system, Lisp_Object dst_object,
8948 int encodep, int norecord)
4ed46869 8949{
3a73fa5d 8950 struct coding_system coding;
d311d28c 8951 ptrdiff_t from, from_byte, to, to_byte;
df7492f9 8952 Lisp_Object src_object;
4ed46869 8953
b7826503
PJ
8954 CHECK_NUMBER_COERCE_MARKER (start);
8955 CHECK_NUMBER_COERCE_MARKER (end);
df7492f9
KH
8956 if (NILP (coding_system))
8957 coding_system = Qno_conversion;
8958 else
8959 CHECK_CODING_SYSTEM (coding_system);
8960 src_object = Fcurrent_buffer ();
8961 if (NILP (dst_object))
8962 dst_object = src_object;
8963 else if (! EQ (dst_object, Qt))
8964 CHECK_BUFFER (dst_object);
3a73fa5d 8965
d46c5b12
KH
8966 validate_region (&start, &end);
8967 from = XFASTINT (start);
df7492f9 8968 from_byte = CHAR_TO_BYTE (from);
d46c5b12 8969 to = XFASTINT (end);
df7492f9 8970 to_byte = CHAR_TO_BYTE (to);
764ca8da 8971
df7492f9
KH
8972 setup_coding_system (coding_system, &coding);
8973 coding.mode |= CODING_MODE_LAST_BLOCK;
ec6d2bb8 8974
df7492f9
KH
8975 if (encodep)
8976 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8977 dst_object);
8978 else
8979 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8980 dst_object);
8981 if (! norecord)
8982 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
ec6d2bb8 8983
df7492f9
KH
8984 return (BUFFERP (dst_object)
8985 ? make_number (coding.produced_char)
8986 : coding.dst_object);
4031e2bf 8987}
78108bcd 8988
4ed46869 8989
4031e2bf 8990DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
df7492f9 8991 3, 4, "r\nzCoding system: ",
48b0f3ae 8992 doc: /* Decode the current region from the specified coding system.
df7492f9
KH
8993When called from a program, takes four arguments:
8994 START, END, CODING-SYSTEM, and DESTINATION.
8995START and END are buffer positions.
8844fa83 8996
df7492f9 8997Optional 4th arguments DESTINATION specifies where the decoded text goes.
2354b80b 8998If nil, the region between START and END is replaced by the decoded text.
1560f91a
EZ
8999If buffer, the decoded text is inserted in that buffer after point (point
9000does not move).
446dcd75 9001In those cases, the length of the decoded text is returned.
319a3947 9002If DESTINATION is t, the decoded text is returned.
8844fa83 9003
48b0f3ae
PJ
9004This function sets `last-coding-system-used' to the precise coding system
9005used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
319a3947 9006not fully specified.) */)
5842a27b 9007 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
4031e2bf 9008{
df7492f9 9009 return code_convert_region (start, end, coding_system, destination, 0, 0);
3a73fa5d 9010}
8844fa83 9011
3a73fa5d 9012DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
df7492f9
KH
9013 3, 4, "r\nzCoding system: ",
9014 doc: /* Encode the current region by specified coding system.
d4a1d553
JB
9015When called from a program, takes four arguments:
9016 START, END, CODING-SYSTEM and DESTINATION.
9017START and END are buffer positions.
d46c5b12 9018
df7492f9
KH
9019Optional 4th arguments DESTINATION specifies where the encoded text goes.
9020If nil, the region between START and END is replace by the encoded text.
1560f91a
EZ
9021If buffer, the encoded text is inserted in that buffer after point (point
9022does not move).
446dcd75 9023In those cases, the length of the encoded text is returned.
319a3947 9024If DESTINATION is t, the encoded text is returned.
2391eaa4 9025
48b0f3ae
PJ
9026This function sets `last-coding-system-used' to the precise coding system
9027used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
319a3947 9028not fully specified.) */)
5842a27b 9029 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
3a73fa5d 9030{
df7492f9 9031 return code_convert_region (start, end, coding_system, destination, 1, 0);
b73bfc1c
KH
9032}
9033
9034Lisp_Object
6f704c76
DN
9035code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9036 Lisp_Object dst_object, int encodep, int nocopy, int norecord)
b73bfc1c 9037{
4031e2bf 9038 struct coding_system coding;
d311d28c 9039 ptrdiff_t chars, bytes;
ec6d2bb8 9040
b7826503 9041 CHECK_STRING (string);
d46c5b12 9042 if (NILP (coding_system))
4956c225 9043 {
df7492f9
KH
9044 if (! norecord)
9045 Vlast_coding_system_used = Qno_conversion;
9046 if (NILP (dst_object))
9047 return (nocopy ? Fcopy_sequence (string) : string);
4956c225 9048 }
b73bfc1c 9049
df7492f9
KH
9050 if (NILP (coding_system))
9051 coding_system = Qno_conversion;
9052 else
9053 CHECK_CODING_SYSTEM (coding_system);
9054 if (NILP (dst_object))
9055 dst_object = Qt;
9056 else if (! EQ (dst_object, Qt))
9057 CHECK_BUFFER (dst_object);
73be902c 9058
df7492f9 9059 setup_coding_system (coding_system, &coding);
d46c5b12 9060 coding.mode |= CODING_MODE_LAST_BLOCK;
8f924df7
KH
9061 chars = SCHARS (string);
9062 bytes = SBYTES (string);
df7492f9
KH
9063 if (encodep)
9064 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9065 else
9066 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9067 if (! norecord)
9068 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
73be902c 9069
df7492f9
KH
9070 return (BUFFERP (dst_object)
9071 ? make_number (coding.produced_char)
9072 : coding.dst_object);
4ed46869 9073}
73be902c 9074
b73bfc1c 9075
ecec61c1 9076/* Encode or decode STRING according to CODING_SYSTEM.
ec6d2bb8 9077 Do not set Vlast_coding_system_used.
4ed46869 9078
ec6d2bb8
KH
9079 This function is called only from macros DECODE_FILE and
9080 ENCODE_FILE, thus we ignore character composition. */
4ed46869 9081
ecec61c1 9082Lisp_Object
cf84bb53
JB
9083code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9084 int encodep)
4ed46869 9085{
0be8721c 9086 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
4ed46869
KH
9087}
9088
4ed46869 9089
a7ca3326 9090DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
df7492f9
KH
9091 2, 4, 0,
9092 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9093
9094Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9095if the decoding operation is trivial.
ecec61c1 9096
d4a1d553 9097Optional fourth arg BUFFER non-nil means that the decoded text is
1560f91a
EZ
9098inserted in that buffer after point (point does not move). In this
9099case, the return value is the length of the decoded text.
ecec61c1 9100
df7492f9
KH
9101This function sets `last-coding-system-used' to the precise coding system
9102used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
d4a1d553 9103not fully specified.) */)
5842a27b 9104 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
4ed46869 9105{
df7492f9
KH
9106 return code_convert_string (string, coding_system, buffer,
9107 0, ! NILP (nocopy), 0);
4ed46869
KH
9108}
9109
df7492f9
KH
9110DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9111 2, 4, 0,
9112 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9113
9114Optional third arg NOCOPY non-nil means it is OK to return STRING
9115itself if the encoding operation is trivial.
9116
d4a1d553 9117Optional fourth arg BUFFER non-nil means that the encoded text is
1560f91a
EZ
9118inserted in that buffer after point (point does not move). In this
9119case, the return value is the length of the encoded text.
df7492f9
KH
9120
9121This function sets `last-coding-system-used' to the precise coding system
9122used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9123not fully specified.) */)
5842a27b 9124 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
4ed46869 9125{
df7492f9 9126 return code_convert_string (string, coding_system, buffer,
4550efdf 9127 1, ! NILP (nocopy), 0);
4ed46869 9128}
df7492f9 9129
3a73fa5d 9130\f
4ed46869 9131DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
48b0f3ae
PJ
9132 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9133Return the corresponding character. */)
5842a27b 9134 (Lisp_Object code)
4ed46869 9135{
df7492f9
KH
9136 Lisp_Object spec, attrs, val;
9137 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
5fdb398c
PE
9138 EMACS_INT ch;
9139 int c;
4ed46869 9140
df7492f9 9141 CHECK_NATNUM (code);
5fdb398c 9142 ch = XFASTINT (code);
df7492f9
KH
9143 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9144 attrs = AREF (spec, 0);
4ed46869 9145
5fdb398c 9146 if (ASCII_BYTE_P (ch)
df7492f9
KH
9147 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9148 return code;
4ed46869 9149
df7492f9
KH
9150 val = CODING_ATTR_CHARSET_LIST (attrs);
9151 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
004068e4
KH
9152 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9153 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
fa42c37f 9154
5fdb398c
PE
9155 if (ch <= 0x7F)
9156 {
9157 c = ch;
9158 charset = charset_roman;
9159 }
9160 else if (ch >= 0xA0 && ch < 0xDF)
55ab7be3 9161 {
5fdb398c 9162 c = ch - 0x80;
df7492f9 9163 charset = charset_kana;
4ed46869 9164 }
55ab7be3 9165 else
4ed46869 9166 {
5fdb398c
PE
9167 EMACS_INT c1 = ch >> 8;
9168 int c2 = ch & 0xFF;
df7492f9 9169
2735d060
PE
9170 if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9171 || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
c2982e87 9172 error ("Invalid code: %"pI"d", ch);
5fdb398c 9173 c = ch;
df7492f9
KH
9174 SJIS_TO_JIS (c);
9175 charset = charset_kanji;
4ed46869 9176 }
df7492f9
KH
9177 c = DECODE_CHAR (charset, c);
9178 if (c < 0)
c2982e87 9179 error ("Invalid code: %"pI"d", ch);
df7492f9 9180 return make_number (c);
93dec019 9181}
4ed46869 9182
48b0f3ae 9183
4ed46869 9184DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
8acf0c0e 9185 doc: /* Encode a Japanese character CH to shift_jis encoding.
48b0f3ae 9186Return the corresponding code in SJIS. */)
5842a27b 9187 (Lisp_Object ch)
4ed46869 9188{
df7492f9
KH
9189 Lisp_Object spec, attrs, charset_list;
9190 int c;
9191 struct charset *charset;
9192 unsigned code;
48b0f3ae 9193
df7492f9
KH
9194 CHECK_CHARACTER (ch);
9195 c = XFASTINT (ch);
9196 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9197 attrs = AREF (spec, 0);
9198
9199 if (ASCII_CHAR_P (c)
9200 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9201 return ch;
9202
9203 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9204 charset = char_charset (c, charset_list, &code);
9205 if (code == CHARSET_INVALID_CODE (charset))
e6c3da20 9206 error ("Can't encode by shift_jis encoding: %c", c);
df7492f9
KH
9207 JIS_TO_SJIS (code);
9208
9209 return make_number (code);
4ed46869
KH
9210}
9211
9212DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
48b0f3ae
PJ
9213 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9214Return the corresponding character. */)
5842a27b 9215 (Lisp_Object code)
d46c5b12 9216{
df7492f9
KH
9217 Lisp_Object spec, attrs, val;
9218 struct charset *charset_roman, *charset_big5, *charset;
5fdb398c 9219 EMACS_INT ch;
df7492f9 9220 int c;
6289dd10 9221
df7492f9 9222 CHECK_NATNUM (code);
5fdb398c 9223 ch = XFASTINT (code);
df7492f9
KH
9224 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9225 attrs = AREF (spec, 0);
4ed46869 9226
5fdb398c 9227 if (ASCII_BYTE_P (ch)
df7492f9
KH
9228 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9229 return code;
6289dd10 9230
df7492f9
KH
9231 val = CODING_ATTR_CHARSET_LIST (attrs);
9232 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9233 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
c210f766 9234
5fdb398c
PE
9235 if (ch <= 0x7F)
9236 {
9237 c = ch;
9238 charset = charset_roman;
9239 }
c28a9453
KH
9240 else
9241 {
5fdb398c
PE
9242 EMACS_INT b1 = ch >> 8;
9243 int b2 = ch & 0x7F;
df7492f9
KH
9244 if (b1 < 0xA1 || b1 > 0xFE
9245 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
c2982e87 9246 error ("Invalid code: %"pI"d", ch);
5fdb398c 9247 c = ch;
df7492f9 9248 charset = charset_big5;
c28a9453 9249 }
5fdb398c 9250 c = DECODE_CHAR (charset, c);
df7492f9 9251 if (c < 0)
c2982e87 9252 error ("Invalid code: %"pI"d", ch);
df7492f9 9253 return make_number (c);
d46c5b12 9254}
6289dd10 9255
4ed46869 9256DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
8acf0c0e 9257 doc: /* Encode the Big5 character CH to BIG5 coding system.
48b0f3ae 9258Return the corresponding character code in Big5. */)
5842a27b 9259 (Lisp_Object ch)
4ed46869 9260{
df7492f9
KH
9261 Lisp_Object spec, attrs, charset_list;
9262 struct charset *charset;
9263 int c;
9264 unsigned code;
9265
9266 CHECK_CHARACTER (ch);
9267 c = XFASTINT (ch);
9268 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9269 attrs = AREF (spec, 0);
9270 if (ASCII_CHAR_P (c)
9271 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9272 return ch;
9273
9274 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9275 charset = char_charset (c, charset_list, &code);
9276 if (code == CHARSET_INVALID_CODE (charset))
e6c3da20 9277 error ("Can't encode by Big5 encoding: %c", c);
df7492f9
KH
9278
9279 return make_number (code);
4ed46869 9280}
48b0f3ae 9281
3a73fa5d 9282\f
002fdb44 9283DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
68bba4e4 9284 Sset_terminal_coding_system_internal, 1, 2, 0,
48b0f3ae 9285 doc: /* Internal use only. */)
5842a27b 9286 (Lisp_Object coding_system, Lisp_Object terminal)
4ed46869 9287{
b18fad6d
KH
9288 struct terminal *term = get_terminal (terminal, 1);
9289 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
b7826503 9290 CHECK_SYMBOL (coding_system);
b8299c66 9291 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
70c22245 9292 /* We had better not send unsafe characters to terminal. */
c73bd236 9293 terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
ad1746f5 9294 /* Character composition should be disabled. */
c73bd236 9295 terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
b8299c66
KL
9296 terminal_coding->src_multibyte = 1;
9297 terminal_coding->dst_multibyte = 0;
b18fad6d
KH
9298 if (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK)
9299 term->charset_list = coding_charset_list (terminal_coding);
9300 else
6b4bb703 9301 term->charset_list = Fcons (make_number (charset_ascii), Qnil);
4ed46869
KH
9302 return Qnil;
9303}
9304
c4825358
KH
9305DEFUN ("set-safe-terminal-coding-system-internal",
9306 Fset_safe_terminal_coding_system_internal,
48b0f3ae 9307 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
ddb67bdc 9308 doc: /* Internal use only. */)
5842a27b 9309 (Lisp_Object coding_system)
d46c5b12 9310{
b7826503 9311 CHECK_SYMBOL (coding_system);
c4825358
KH
9312 setup_coding_system (Fcheck_coding_system (coding_system),
9313 &safe_terminal_coding);
ad1746f5 9314 /* Character composition should be disabled. */
df7492f9 9315 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
b73bfc1c
KH
9316 safe_terminal_coding.src_multibyte = 1;
9317 safe_terminal_coding.dst_multibyte = 0;
c4825358
KH
9318 return Qnil;
9319}
4ed46869 9320
002fdb44 9321DEFUN ("terminal-coding-system", Fterminal_coding_system,
68bba4e4 9322 Sterminal_coding_system, 0, 1, 0,
6ed8eeff 9323 doc: /* Return coding system specified for terminal output on the given terminal.
708e05dc 9324TERMINAL may be a terminal object, a frame, or nil for the selected
6ed8eeff 9325frame's terminal device. */)
5842a27b 9326 (Lisp_Object terminal)
4ed46869 9327{
985773c9
MB
9328 struct coding_system *terminal_coding
9329 = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9330 Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
ae6f73fa 9331
6d5eb5b0 9332 /* For backward compatibility, return nil if it is `undecided'. */
f75c90a9 9333 return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
4ed46869
KH
9334}
9335
002fdb44 9336DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
68bba4e4 9337 Sset_keyboard_coding_system_internal, 1, 2, 0,
48b0f3ae 9338 doc: /* Internal use only. */)
5842a27b 9339 (Lisp_Object coding_system, Lisp_Object terminal)
4ed46869 9340{
6ed8eeff 9341 struct terminal *t = get_terminal (terminal, 1);
b7826503 9342 CHECK_SYMBOL (coding_system);
624bda09
KH
9343 if (NILP (coding_system))
9344 coding_system = Qno_conversion;
9345 else
9346 Fcheck_coding_system (coding_system);
9347 setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
ad1746f5 9348 /* Character composition should be disabled. */
c73bd236
MB
9349 TERMINAL_KEYBOARD_CODING (t)->common_flags
9350 &= ~CODING_ANNOTATE_COMPOSITION_MASK;
4ed46869
KH
9351 return Qnil;
9352}
9353
9354DEFUN ("keyboard-coding-system",
985773c9 9355 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
48b0f3ae 9356 doc: /* Return coding system specified for decoding keyboard input. */)
5842a27b 9357 (Lisp_Object terminal)
4ed46869 9358{
985773c9
MB
9359 return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9360 (get_terminal (terminal, 1))->id);
4ed46869
KH
9361}
9362
4ed46869 9363\f
a7ca3326 9364DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
a5d301df 9365 Sfind_operation_coding_system, 1, MANY, 0,
48b0f3ae
PJ
9366 doc: /* Choose a coding system for an operation based on the target name.
9367The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9368DECODING-SYSTEM is the coding system to use for decoding
9369\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9370for encoding (in case OPERATION does encoding).
05e6f5dc 9371
48b0f3ae
PJ
9372The first argument OPERATION specifies an I/O primitive:
9373 For file I/O, `insert-file-contents' or `write-region'.
9374 For process I/O, `call-process', `call-process-region', or `start-process'.
9375 For network I/O, `open-network-stream'.
05e6f5dc 9376
48b0f3ae
PJ
9377The remaining arguments should be the same arguments that were passed
9378to the primitive. Depending on which primitive, one of those arguments
9379is selected as the TARGET. For example, if OPERATION does file I/O,
9380whichever argument specifies the file name is TARGET.
05e6f5dc 9381
48b0f3ae 9382TARGET has a meaning which depends on OPERATION:
b883cdb2 9383 For file I/O, TARGET is a file name (except for the special case below).
48b0f3ae 9384 For process I/O, TARGET is a process name.
d4a1d553 9385 For network I/O, TARGET is a service name or a port number.
05e6f5dc 9386
d4a1d553 9387This function looks up what is specified for TARGET in
48b0f3ae
PJ
9388`file-coding-system-alist', `process-coding-system-alist',
9389or `network-coding-system-alist' depending on OPERATION.
9390They may specify a coding system, a cons of coding systems,
9391or a function symbol to call.
9392In the last case, we call the function with one argument,
9393which is a list of all the arguments given to this function.
1011c487
MB
9394If the function can't decide a coding system, it can return
9395`undecided' so that the normal code-detection is performed.
48b0f3ae 9396
b883cdb2
MB
9397If OPERATION is `insert-file-contents', the argument corresponding to
9398TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
9399file name to look up, and BUFFER is a buffer that contains the file's
9400contents (not yet decoded). If `file-coding-system-alist' specifies a
9401function to call for FILENAME, that function should examine the
9402contents of BUFFER instead of reading the file.
9403
d918f936 9404usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
f66c7cf8 9405 (ptrdiff_t nargs, Lisp_Object *args)
6b89e3aa 9406{
4ed46869
KH
9407 Lisp_Object operation, target_idx, target, val;
9408 register Lisp_Object chain;
177c0ea7 9409
4ed46869
KH
9410 if (nargs < 2)
9411 error ("Too few arguments");
9412 operation = args[0];
9413 if (!SYMBOLP (operation)
d311d28c 9414 || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
3ed051d4 9415 error ("Invalid first argument");
7b09a37a 9416 if (nargs <= 1 + XFASTINT (target_idx))
94dcfacf 9417 error ("Too few arguments for operation `%s'",
8f924df7 9418 SDATA (SYMBOL_NAME (operation)));
c5101a77 9419 target = args[XFASTINT (target_idx) + 1];
4ed46869 9420 if (!(STRINGP (target)
091a0ff0
KH
9421 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9422 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
4ed46869 9423 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
94dcfacf
EZ
9424 error ("Invalid argument %"pI"d of operation `%s'",
9425 XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
091a0ff0
KH
9426 if (CONSP (target))
9427 target = XCAR (target);
4ed46869 9428
2e34157c
RS
9429 chain = ((EQ (operation, Qinsert_file_contents)
9430 || EQ (operation, Qwrite_region))
02ba4723 9431 ? Vfile_coding_system_alist
2e34157c 9432 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
9433 ? Vnetwork_coding_system_alist
9434 : Vprocess_coding_system_alist));
4ed46869
KH
9435 if (NILP (chain))
9436 return Qnil;
9437
03699b14 9438 for (; CONSP (chain); chain = XCDR (chain))
6b89e3aa 9439 {
f44d27ce 9440 Lisp_Object elt;
6b89e3aa 9441
df7492f9 9442 elt = XCAR (chain);
4ed46869
KH
9443 if (CONSP (elt)
9444 && ((STRINGP (target)
03699b14
KR
9445 && STRINGP (XCAR (elt))
9446 && fast_string_match (XCAR (elt), target) >= 0)
9447 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
6b89e3aa 9448 {
03699b14 9449 val = XCDR (elt);
b19fd4c5
KH
9450 /* Here, if VAL is both a valid coding system and a valid
9451 function symbol, we return VAL as a coding system. */
02ba4723
KH
9452 if (CONSP (val))
9453 return val;
9454 if (! SYMBOLP (val))
9455 return Qnil;
9456 if (! NILP (Fcoding_system_p (val)))
9457 return Fcons (val, val);
b19fd4c5 9458 if (! NILP (Ffboundp (val)))
6b89e3aa 9459 {
e2b97060
MB
9460 /* We use call1 rather than safe_call1
9461 so as to get bug reports about functions called here
9462 which don't handle the current interface. */
9463 val = call1 (val, Flist (nargs, args));
b19fd4c5
KH
9464 if (CONSP (val))
9465 return val;
9466 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9467 return Fcons (val, val);
6b89e3aa 9468 }
02ba4723 9469 return Qnil;
6b89e3aa
KH
9470 }
9471 }
4ed46869 9472 return Qnil;
6b89e3aa
KH
9473}
9474
df7492f9 9475DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
a3f6ee6d 9476 Sset_coding_system_priority, 0, MANY, 0,
da7db224 9477 doc: /* Assign higher priority to the coding systems given as arguments.
d4a1d553 9478If multiple coding systems belong to the same category,
a3181084
DL
9479all but the first one are ignored.
9480
d4a1d553 9481usage: (set-coding-system-priority &rest coding-systems) */)
f66c7cf8 9482 (ptrdiff_t nargs, Lisp_Object *args)
df7492f9 9483{
f66c7cf8 9484 ptrdiff_t i, j;
df7492f9
KH
9485 int changed[coding_category_max];
9486 enum coding_category priorities[coding_category_max];
9487
72af86bd 9488 memset (changed, 0, sizeof changed);
6b89e3aa 9489
df7492f9 9490 for (i = j = 0; i < nargs; i++)
6b89e3aa 9491 {
df7492f9
KH
9492 enum coding_category category;
9493 Lisp_Object spec, attrs;
6b89e3aa 9494
df7492f9
KH
9495 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9496 attrs = AREF (spec, 0);
9497 category = XINT (CODING_ATTR_CATEGORY (attrs));
9498 if (changed[category])
9499 /* Ignore this coding system because a coding system of the
9500 same category already had a higher priority. */
9501 continue;
9502 changed[category] = 1;
9503 priorities[j++] = category;
9504 if (coding_categories[category].id >= 0
9505 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9506 setup_coding_system (args[i], &coding_categories[category]);
ff563fce 9507 Fset (AREF (Vcoding_category_table, category), args[i]);
df7492f9 9508 }
6b89e3aa 9509
df7492f9
KH
9510 /* Now we have decided top J priorities. Reflect the order of the
9511 original priorities to the remaining priorities. */
6b89e3aa 9512
df7492f9 9513 for (i = j, j = 0; i < coding_category_max; i++, j++)
6b89e3aa 9514 {
df7492f9
KH
9515 while (j < coding_category_max
9516 && changed[coding_priorities[j]])
9517 j++;
9518 if (j == coding_category_max)
9519 abort ();
9520 priorities[i] = coding_priorities[j];
9521 }
6b89e3aa 9522
72af86bd 9523 memcpy (coding_priorities, priorities, sizeof priorities);
177c0ea7 9524
ff563fce
KH
9525 /* Update `coding-category-list'. */
9526 Vcoding_category_list = Qnil;
c5101a77 9527 for (i = coding_category_max; i-- > 0; )
ff563fce
KH
9528 Vcoding_category_list
9529 = Fcons (AREF (Vcoding_category_table, priorities[i]),
9530 Vcoding_category_list);
6b89e3aa 9531
df7492f9 9532 return Qnil;
6b89e3aa
KH
9533}
9534
df7492f9
KH
9535DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9536 Scoding_system_priority_list, 0, 1, 0,
da7db224 9537 doc: /* Return a list of coding systems ordered by their priorities.
b811c52b
KH
9538The list contains a subset of coding systems; i.e. coding systems
9539assigned to each coding category (see `coding-category-list').
9540
da7db224 9541HIGHESTP non-nil means just return the highest priority one. */)
5842a27b 9542 (Lisp_Object highestp)
d46c5b12
KH
9543{
9544 int i;
df7492f9 9545 Lisp_Object val;
6b89e3aa 9546
df7492f9 9547 for (i = 0, val = Qnil; i < coding_category_max; i++)
d46c5b12 9548 {
df7492f9
KH
9549 enum coding_category category = coding_priorities[i];
9550 int id = coding_categories[category].id;
9551 Lisp_Object attrs;
068a9dbd 9552
df7492f9
KH
9553 if (id < 0)
9554 continue;
9555 attrs = CODING_ID_ATTRS (id);
9556 if (! NILP (highestp))
9557 return CODING_ATTR_BASE_NAME (attrs);
9558 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9559 }
9560 return Fnreverse (val);
9561}
068a9dbd 9562
91433552 9563static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
068a9dbd
KH
9564
9565static Lisp_Object
971de7fb 9566make_subsidiaries (Lisp_Object base)
068a9dbd 9567{
df7492f9 9568 Lisp_Object subsidiaries;
1bfdaf10 9569 ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
df7492f9
KH
9570 char *buf = (char *) alloca (base_name_len + 6);
9571 int i;
068a9dbd 9572
72af86bd 9573 memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
df7492f9
KH
9574 subsidiaries = Fmake_vector (make_number (3), Qnil);
9575 for (i = 0; i < 3; i++)
068a9dbd 9576 {
1bfdaf10 9577 strcpy (buf + base_name_len, suffixes[i]);
df7492f9 9578 ASET (subsidiaries, i, intern (buf));
068a9dbd 9579 }
df7492f9 9580 return subsidiaries;
068a9dbd
KH
9581}
9582
9583
df7492f9
KH
9584DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9585 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
1fcd6c8b
DL
9586 doc: /* For internal use only.
9587usage: (define-coding-system-internal ...) */)
f66c7cf8 9588 (ptrdiff_t nargs, Lisp_Object *args)
068a9dbd 9589{
df7492f9
KH
9590 Lisp_Object name;
9591 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
9592 Lisp_Object attrs; /* Vector of attributes. */
9593 Lisp_Object eol_type;
9594 Lisp_Object aliases;
9595 Lisp_Object coding_type, charset_list, safe_charsets;
9596 enum coding_category category;
9597 Lisp_Object tail, val;
9598 int max_charset_id = 0;
9599 int i;
068a9dbd 9600
df7492f9
KH
9601 if (nargs < coding_arg_max)
9602 goto short_args;
068a9dbd 9603
df7492f9 9604 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
068a9dbd 9605
df7492f9
KH
9606 name = args[coding_arg_name];
9607 CHECK_SYMBOL (name);
9608 CODING_ATTR_BASE_NAME (attrs) = name;
068a9dbd 9609
df7492f9
KH
9610 val = args[coding_arg_mnemonic];
9611 if (! STRINGP (val))
9612 CHECK_CHARACTER (val);
9613 CODING_ATTR_MNEMONIC (attrs) = val;
068a9dbd 9614
df7492f9
KH
9615 coding_type = args[coding_arg_coding_type];
9616 CHECK_SYMBOL (coding_type);
9617 CODING_ATTR_TYPE (attrs) = coding_type;
068a9dbd 9618
df7492f9
KH
9619 charset_list = args[coding_arg_charset_list];
9620 if (SYMBOLP (charset_list))
9621 {
9622 if (EQ (charset_list, Qiso_2022))
9623 {
9624 if (! EQ (coding_type, Qiso_2022))
9625 error ("Invalid charset-list");
9626 charset_list = Viso_2022_charset_list;
9627 }
9628 else if (EQ (charset_list, Qemacs_mule))
9629 {
9630 if (! EQ (coding_type, Qemacs_mule))
9631 error ("Invalid charset-list");
9632 charset_list = Vemacs_mule_charset_list;
9633 }
9634 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
d311d28c
PE
9635 {
9636 if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
9637 error ("Invalid charset-list");
9638 if (max_charset_id < XFASTINT (XCAR (tail)))
9639 max_charset_id = XFASTINT (XCAR (tail));
9640 }
df7492f9 9641 }
068a9dbd
KH
9642 else
9643 {
df7492f9 9644 charset_list = Fcopy_sequence (charset_list);
985773c9 9645 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
068a9dbd 9646 {
df7492f9
KH
9647 struct charset *charset;
9648
985773c9 9649 val = XCAR (tail);
df7492f9
KH
9650 CHECK_CHARSET_GET_CHARSET (val, charset);
9651 if (EQ (coding_type, Qiso_2022)
9652 ? CHARSET_ISO_FINAL (charset) < 0
9653 : EQ (coding_type, Qemacs_mule)
9654 ? CHARSET_EMACS_MULE_ID (charset) < 0
9655 : 0)
9656 error ("Can't handle charset `%s'",
8f924df7 9657 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
df7492f9 9658
8f924df7 9659 XSETCAR (tail, make_number (charset->id));
df7492f9
KH
9660 if (max_charset_id < charset->id)
9661 max_charset_id = charset->id;
068a9dbd
KH
9662 }
9663 }
df7492f9 9664 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
068a9dbd 9665
1b3b981b
AS
9666 safe_charsets = make_uninit_string (max_charset_id + 1);
9667 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
df7492f9 9668 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8f924df7 9669 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
df7492f9 9670 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
068a9dbd 9671
584948ac 9672 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
3a73fa5d 9673
df7492f9 9674 val = args[coding_arg_decode_translation_table];
a6f87d34 9675 if (! CHAR_TABLE_P (val) && ! CONSP (val))
7d64c6ad 9676 CHECK_SYMBOL (val);
df7492f9 9677 CODING_ATTR_DECODE_TBL (attrs) = val;
3a73fa5d 9678
df7492f9 9679 val = args[coding_arg_encode_translation_table];
a6f87d34 9680 if (! CHAR_TABLE_P (val) && ! CONSP (val))
7d64c6ad 9681 CHECK_SYMBOL (val);
df7492f9 9682 CODING_ATTR_ENCODE_TBL (attrs) = val;
d46c5b12 9683
df7492f9
KH
9684 val = args[coding_arg_post_read_conversion];
9685 CHECK_SYMBOL (val);
9686 CODING_ATTR_POST_READ (attrs) = val;
d46c5b12 9687
df7492f9
KH
9688 val = args[coding_arg_pre_write_conversion];
9689 CHECK_SYMBOL (val);
9690 CODING_ATTR_PRE_WRITE (attrs) = val;
3a73fa5d 9691
df7492f9
KH
9692 val = args[coding_arg_default_char];
9693 if (NILP (val))
9694 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
9695 else
9696 {
8f924df7 9697 CHECK_CHARACTER (val);
df7492f9
KH
9698 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9699 }
4031e2bf 9700
8f924df7
KH
9701 val = args[coding_arg_for_unibyte];
9702 CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
3a73fa5d 9703
df7492f9
KH
9704 val = args[coding_arg_plist];
9705 CHECK_LIST (val);
9706 CODING_ATTR_PLIST (attrs) = val;
3a73fa5d 9707
df7492f9
KH
9708 if (EQ (coding_type, Qcharset))
9709 {
c7c66a95
KH
9710 /* Generate a lisp vector of 256 elements. Each element is nil,
9711 integer, or a list of charset IDs.
3a73fa5d 9712
c7c66a95
KH
9713 If Nth element is nil, the byte code N is invalid in this
9714 coding system.
4ed46869 9715
c7c66a95
KH
9716 If Nth element is a number NUM, N is the first byte of a
9717 charset whose ID is NUM.
4ed46869 9718
c7c66a95
KH
9719 If Nth element is a list of charset IDs, N is the first byte
9720 of one of them. The list is sorted by dimensions of the
ad1746f5 9721 charsets. A charset of smaller dimension comes first. */
df7492f9 9722 val = Fmake_vector (make_number (256), Qnil);
4ed46869 9723
5c99c2e6 9724 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
df7492f9 9725 {
c7c66a95
KH
9726 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9727 int dim = CHARSET_DIMENSION (charset);
9728 int idx = (dim - 1) * 4;
4ed46869 9729
5c99c2e6 9730 if (CHARSET_ASCII_COMPATIBLE_P (charset))
584948ac 9731 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
4031e2bf 9732
15d143f7
KH
9733 for (i = charset->code_space[idx];
9734 i <= charset->code_space[idx + 1]; i++)
9735 {
c7c66a95
KH
9736 Lisp_Object tmp, tmp2;
9737 int dim2;
ec6d2bb8 9738
c7c66a95
KH
9739 tmp = AREF (val, i);
9740 if (NILP (tmp))
9741 tmp = XCAR (tail);
9742 else if (NUMBERP (tmp))
9743 {
9744 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9745 if (dim < dim2)
c7c66a95 9746 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
f9d71dcd
KH
9747 else
9748 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
c7c66a95 9749 }
15d143f7 9750 else
c7c66a95
KH
9751 {
9752 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9753 {
9754 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9755 if (dim < dim2)
9756 break;
9757 }
9758 if (NILP (tmp2))
9759 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9760 else
9761 {
9762 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9763 XSETCAR (tmp2, XCAR (tail));
9764 }
9765 }
9766 ASET (val, i, tmp);
15d143f7 9767 }
df7492f9
KH
9768 }
9769 ASET (attrs, coding_attr_charset_valids, val);
9770 category = coding_category_charset;
9771 }
9772 else if (EQ (coding_type, Qccl))
9773 {
9774 Lisp_Object valids;
ecec61c1 9775
df7492f9
KH
9776 if (nargs < coding_arg_ccl_max)
9777 goto short_args;
ecec61c1 9778
df7492f9
KH
9779 val = args[coding_arg_ccl_decoder];
9780 CHECK_CCL_PROGRAM (val);
9781 if (VECTORP (val))
9782 val = Fcopy_sequence (val);
9783 ASET (attrs, coding_attr_ccl_decoder, val);
ecec61c1 9784
df7492f9
KH
9785 val = args[coding_arg_ccl_encoder];
9786 CHECK_CCL_PROGRAM (val);
9787 if (VECTORP (val))
9788 val = Fcopy_sequence (val);
9789 ASET (attrs, coding_attr_ccl_encoder, val);
ecec61c1 9790
df7492f9
KH
9791 val = args[coding_arg_ccl_valids];
9792 valids = Fmake_string (make_number (256), make_number (0));
9793 for (tail = val; !NILP (tail); tail = Fcdr (tail))
9794 {
8dcbea82 9795 int from, to;
ecec61c1 9796
df7492f9
KH
9797 val = Fcar (tail);
9798 if (INTEGERP (val))
8dcbea82 9799 {
d311d28c 9800 if (! (0 <= XINT (val) && XINT (val) <= 255))
8dcbea82 9801 args_out_of_range_3 (val, make_number (0), make_number (255));
d311d28c 9802 from = to = XINT (val);
8dcbea82 9803 }
df7492f9
KH
9804 else
9805 {
df7492f9 9806 CHECK_CONS (val);
8f924df7 9807 CHECK_NATNUM_CAR (val);
d311d28c
PE
9808 CHECK_NUMBER_CDR (val);
9809 if (XINT (XCAR (val)) > 255)
8dcbea82
KH
9810 args_out_of_range_3 (XCAR (val),
9811 make_number (0), make_number (255));
d311d28c
PE
9812 from = XINT (XCAR (val));
9813 if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
8dcbea82
KH
9814 args_out_of_range_3 (XCDR (val),
9815 XCAR (val), make_number (255));
d311d28c 9816 to = XINT (XCDR (val));
df7492f9 9817 }
8dcbea82 9818 for (i = from; i <= to; i++)
8f924df7 9819 SSET (valids, i, 1);
df7492f9
KH
9820 }
9821 ASET (attrs, coding_attr_ccl_valids, valids);
4ed46869 9822
df7492f9 9823 category = coding_category_ccl;
55ab7be3 9824 }
df7492f9 9825 else if (EQ (coding_type, Qutf_16))
55ab7be3 9826 {
df7492f9 9827 Lisp_Object bom, endian;
4ed46869 9828
584948ac 9829 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
4ed46869 9830
df7492f9
KH
9831 if (nargs < coding_arg_utf16_max)
9832 goto short_args;
4ed46869 9833
df7492f9
KH
9834 bom = args[coding_arg_utf16_bom];
9835 if (! NILP (bom) && ! EQ (bom, Qt))
9836 {
9837 CHECK_CONS (bom);
8f924df7
KH
9838 val = XCAR (bom);
9839 CHECK_CODING_SYSTEM (val);
9840 val = XCDR (bom);
9841 CHECK_CODING_SYSTEM (val);
df7492f9 9842 }
a470d443 9843 ASET (attrs, coding_attr_utf_bom, bom);
df7492f9
KH
9844
9845 endian = args[coding_arg_utf16_endian];
b49a1807
KH
9846 CHECK_SYMBOL (endian);
9847 if (NILP (endian))
9848 endian = Qbig;
9849 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8f924df7 9850 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
df7492f9
KH
9851 ASET (attrs, coding_attr_utf_16_endian, endian);
9852
9853 category = (CONSP (bom)
9854 ? coding_category_utf_16_auto
9855 : NILP (bom)
b49a1807 9856 ? (EQ (endian, Qbig)
df7492f9
KH
9857 ? coding_category_utf_16_be_nosig
9858 : coding_category_utf_16_le_nosig)
b49a1807 9859 : (EQ (endian, Qbig)
df7492f9
KH
9860 ? coding_category_utf_16_be
9861 : coding_category_utf_16_le));
9862 }
9863 else if (EQ (coding_type, Qiso_2022))
9864 {
9865 Lisp_Object initial, reg_usage, request, flags;
1397dc18 9866
df7492f9
KH
9867 if (nargs < coding_arg_iso2022_max)
9868 goto short_args;
9869
9870 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9871 CHECK_VECTOR (initial);
9872 for (i = 0; i < 4; i++)
9873 {
9874 val = Faref (initial, make_number (i));
9875 if (! NILP (val))
9876 {
584948ac
KH
9877 struct charset *charset;
9878
9879 CHECK_CHARSET_GET_CHARSET (val, charset);
9880 ASET (initial, i, make_number (CHARSET_ID (charset)));
9881 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9882 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9
KH
9883 }
9884 else
9885 ASET (initial, i, make_number (-1));
9886 }
9887
9888 reg_usage = args[coding_arg_iso2022_reg_usage];
9889 CHECK_CONS (reg_usage);
8f924df7
KH
9890 CHECK_NUMBER_CAR (reg_usage);
9891 CHECK_NUMBER_CDR (reg_usage);
df7492f9
KH
9892
9893 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9894 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
1397dc18 9895 {
df7492f9 9896 int id;
2735d060 9897 Lisp_Object tmp1;
df7492f9
KH
9898
9899 val = Fcar (tail);
9900 CHECK_CONS (val);
2735d060
PE
9901 tmp1 = XCAR (val);
9902 CHECK_CHARSET_GET_ID (tmp1, id);
8f924df7 9903 CHECK_NATNUM_CDR (val);
df7492f9 9904 if (XINT (XCDR (val)) >= 4)
c2982e87 9905 error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
8f924df7 9906 XSETCAR (val, make_number (id));
1397dc18 9907 }
4ed46869 9908
df7492f9
KH
9909 flags = args[coding_arg_iso2022_flags];
9910 CHECK_NATNUM (flags);
d311d28c 9911 i = XINT (flags) & INT_MAX;
df7492f9 9912 if (EQ (args[coding_arg_charset_list], Qiso_2022))
d311d28c
PE
9913 i |= CODING_ISO_FLAG_FULL_SUPPORT;
9914 flags = make_number (i);
df7492f9
KH
9915
9916 ASET (attrs, coding_attr_iso_initial, initial);
9917 ASET (attrs, coding_attr_iso_usage, reg_usage);
9918 ASET (attrs, coding_attr_iso_request, request);
9919 ASET (attrs, coding_attr_iso_flags, flags);
9920 setup_iso_safe_charsets (attrs);
9921
9922 if (i & CODING_ISO_FLAG_SEVEN_BITS)
9923 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9924 | CODING_ISO_FLAG_SINGLE_SHIFT))
9925 ? coding_category_iso_7_else
9926 : EQ (args[coding_arg_charset_list], Qiso_2022)
9927 ? coding_category_iso_7
9928 : coding_category_iso_7_tight);
9929 else
9930 {
9931 int id = XINT (AREF (initial, 1));
9932
c6fb6e98 9933 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
df7492f9
KH
9934 || EQ (args[coding_arg_charset_list], Qiso_2022)
9935 || id < 0)
9936 ? coding_category_iso_8_else
9937 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9938 ? coding_category_iso_8_1
9939 : coding_category_iso_8_2);
9940 }
0ce7886f
KH
9941 if (category != coding_category_iso_8_1
9942 && category != coding_category_iso_8_2)
9943 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
df7492f9
KH
9944 }
9945 else if (EQ (coding_type, Qemacs_mule))
c28a9453 9946 {
df7492f9
KH
9947 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9948 ASET (attrs, coding_attr_emacs_mule_full, Qt);
584948ac 9949 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9 9950 category = coding_category_emacs_mule;
c28a9453 9951 }
df7492f9 9952 else if (EQ (coding_type, Qshift_jis))
c28a9453 9953 {
df7492f9
KH
9954
9955 struct charset *charset;
9956
7d64c6ad 9957 if (XINT (Flength (charset_list)) != 3
6e07c25f 9958 && XINT (Flength (charset_list)) != 4)
7d64c6ad 9959 error ("There should be three or four charsets");
df7492f9
KH
9960
9961 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9962 if (CHARSET_DIMENSION (charset) != 1)
9963 error ("Dimension of charset %s is not one",
8f924df7 9964 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
584948ac
KH
9965 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9966 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9
KH
9967
9968 charset_list = XCDR (charset_list);
9969 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9970 if (CHARSET_DIMENSION (charset) != 1)
9971 error ("Dimension of charset %s is not one",
8f924df7 9972 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
df7492f9
KH
9973
9974 charset_list = XCDR (charset_list);
9975 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9976 if (CHARSET_DIMENSION (charset) != 2)
7d64c6ad
KH
9977 error ("Dimension of charset %s is not two",
9978 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9979
9980 charset_list = XCDR (charset_list);
2b917a06
KH
9981 if (! NILP (charset_list))
9982 {
9983 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9984 if (CHARSET_DIMENSION (charset) != 2)
9985 error ("Dimension of charset %s is not two",
9986 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9987 }
df7492f9
KH
9988
9989 category = coding_category_sjis;
9990 Vsjis_coding_system = name;
c28a9453 9991 }
df7492f9
KH
9992 else if (EQ (coding_type, Qbig5))
9993 {
9994 struct charset *charset;
4ed46869 9995
df7492f9
KH
9996 if (XINT (Flength (charset_list)) != 2)
9997 error ("There should be just two charsets");
9998
9999 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10000 if (CHARSET_DIMENSION (charset) != 1)
10001 error ("Dimension of charset %s is not one",
8f924df7 10002 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
584948ac
KH
10003 if (CHARSET_ASCII_COMPATIBLE_P (charset))
10004 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9
KH
10005
10006 charset_list = XCDR (charset_list);
10007 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10008 if (CHARSET_DIMENSION (charset) != 2)
10009 error ("Dimension of charset %s is not two",
8f924df7 10010 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
4ed46869 10011
df7492f9
KH
10012 category = coding_category_big5;
10013 Vbig5_coding_system = name;
10014 }
10015 else if (EQ (coding_type, Qraw_text))
c28a9453 10016 {
584948ac
KH
10017 category = coding_category_raw_text;
10018 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
c28a9453 10019 }
df7492f9 10020 else if (EQ (coding_type, Qutf_8))
4ed46869 10021 {
a470d443
KH
10022 Lisp_Object bom;
10023
a470d443
KH
10024 if (nargs < coding_arg_utf8_max)
10025 goto short_args;
10026
10027 bom = args[coding_arg_utf8_bom];
10028 if (! NILP (bom) && ! EQ (bom, Qt))
10029 {
10030 CHECK_CONS (bom);
10031 val = XCAR (bom);
10032 CHECK_CODING_SYSTEM (val);
10033 val = XCDR (bom);
10034 CHECK_CODING_SYSTEM (val);
10035 }
10036 ASET (attrs, coding_attr_utf_bom, bom);
0e5317f7
KH
10037 if (NILP (bom))
10038 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
a470d443
KH
10039
10040 category = (CONSP (bom) ? coding_category_utf_8_auto
10041 : NILP (bom) ? coding_category_utf_8_nosig
10042 : coding_category_utf_8_sig);
4ed46869 10043 }
df7492f9
KH
10044 else if (EQ (coding_type, Qundecided))
10045 category = coding_category_undecided;
4ed46869 10046 else
df7492f9 10047 error ("Invalid coding system type: %s",
8f924df7 10048 SDATA (SYMBOL_NAME (coding_type)));
4ed46869 10049
df7492f9 10050 CODING_ATTR_CATEGORY (attrs) = make_number (category);
01378f49
KH
10051 CODING_ATTR_PLIST (attrs)
10052 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
10053 CODING_ATTR_PLIST (attrs)));
35befdaa 10054 CODING_ATTR_PLIST (attrs)
3ed051d4 10055 = Fcons (QCascii_compatible_p,
35befdaa
KH
10056 Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10057 CODING_ATTR_PLIST (attrs)));
c4825358 10058
df7492f9
KH
10059 eol_type = args[coding_arg_eol_type];
10060 if (! NILP (eol_type)
10061 && ! EQ (eol_type, Qunix)
10062 && ! EQ (eol_type, Qdos)
10063 && ! EQ (eol_type, Qmac))
10064 error ("Invalid eol-type");
4ed46869 10065
df7492f9 10066 aliases = Fcons (name, Qnil);
4ed46869 10067
df7492f9
KH
10068 if (NILP (eol_type))
10069 {
10070 eol_type = make_subsidiaries (name);
10071 for (i = 0; i < 3; i++)
1397dc18 10072 {
df7492f9
KH
10073 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10074
10075 this_name = AREF (eol_type, i);
10076 this_aliases = Fcons (this_name, Qnil);
10077 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10078 this_spec = Fmake_vector (make_number (3), attrs);
10079 ASET (this_spec, 1, this_aliases);
10080 ASET (this_spec, 2, this_eol_type);
10081 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10082 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
583f71ca
KH
10083 val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10084 if (NILP (val))
10085 Vcoding_system_alist
10086 = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10087 Vcoding_system_alist);
1397dc18 10088 }
d46c5b12 10089 }
4ed46869 10090
df7492f9
KH
10091 spec_vec = Fmake_vector (make_number (3), attrs);
10092 ASET (spec_vec, 1, aliases);
10093 ASET (spec_vec, 2, eol_type);
48b0f3ae 10094
df7492f9
KH
10095 Fputhash (name, spec_vec, Vcoding_system_hash_table);
10096 Vcoding_system_list = Fcons (name, Vcoding_system_list);
583f71ca
KH
10097 val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10098 if (NILP (val))
10099 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10100 Vcoding_system_alist);
48b0f3ae 10101
df7492f9
KH
10102 {
10103 int id = coding_categories[category].id;
48b0f3ae 10104
df7492f9
KH
10105 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10106 setup_coding_system (name, &coding_categories[category]);
10107 }
48b0f3ae 10108
d46c5b12 10109 return Qnil;
48b0f3ae 10110
df7492f9
KH
10111 short_args:
10112 return Fsignal (Qwrong_number_of_arguments,
10113 Fcons (intern ("define-coding-system-internal"),
10114 make_number (nargs)));
d46c5b12 10115}
4ed46869 10116
d6925f38 10117
a6f87d34
KH
10118DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10119 3, 3, 0,
10120 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
5842a27b 10121 (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
a6f87d34 10122{
3dbe7859 10123 Lisp_Object spec, attrs;
a6f87d34
KH
10124
10125 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10126 attrs = AREF (spec, 0);
10127 if (EQ (prop, QCmnemonic))
10128 {
10129 if (! STRINGP (val))
10130 CHECK_CHARACTER (val);
10131 CODING_ATTR_MNEMONIC (attrs) = val;
10132 }
2133e2d1 10133 else if (EQ (prop, QCdefault_char))
a6f87d34
KH
10134 {
10135 if (NILP (val))
10136 val = make_number (' ');
10137 else
10138 CHECK_CHARACTER (val);
10139 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
10140 }
10141 else if (EQ (prop, QCdecode_translation_table))
10142 {
10143 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10144 CHECK_SYMBOL (val);
10145 CODING_ATTR_DECODE_TBL (attrs) = val;
10146 }
10147 else if (EQ (prop, QCencode_translation_table))
10148 {
10149 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10150 CHECK_SYMBOL (val);
10151 CODING_ATTR_ENCODE_TBL (attrs) = val;
10152 }
10153 else if (EQ (prop, QCpost_read_conversion))
10154 {
10155 CHECK_SYMBOL (val);
10156 CODING_ATTR_POST_READ (attrs) = val;
10157 }
10158 else if (EQ (prop, QCpre_write_conversion))
10159 {
10160 CHECK_SYMBOL (val);
10161 CODING_ATTR_PRE_WRITE (attrs) = val;
10162 }
35befdaa
KH
10163 else if (EQ (prop, QCascii_compatible_p))
10164 {
10165 CODING_ATTR_ASCII_COMPAT (attrs) = val;
10166 }
a6f87d34
KH
10167
10168 CODING_ATTR_PLIST (attrs)
10169 = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
10170 return val;
10171}
10172
10173
df7492f9
KH
10174DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10175 Sdefine_coding_system_alias, 2, 2, 0,
10176 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
5842a27b 10177 (Lisp_Object alias, Lisp_Object coding_system)
66cfb530 10178{
583f71ca 10179 Lisp_Object spec, aliases, eol_type, val;
4ed46869 10180
df7492f9
KH
10181 CHECK_SYMBOL (alias);
10182 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10183 aliases = AREF (spec, 1);
d4a1d553 10184 /* ALIASES should be a list of length more than zero, and the first
d6925f38
KH
10185 element is a base coding system. Append ALIAS at the tail of the
10186 list. */
df7492f9
KH
10187 while (!NILP (XCDR (aliases)))
10188 aliases = XCDR (aliases);
8f924df7 10189 XSETCDR (aliases, Fcons (alias, Qnil));
4ed46869 10190
df7492f9
KH
10191 eol_type = AREF (spec, 2);
10192 if (VECTORP (eol_type))
4ed46869 10193 {
df7492f9
KH
10194 Lisp_Object subsidiaries;
10195 int i;
4ed46869 10196
df7492f9
KH
10197 subsidiaries = make_subsidiaries (alias);
10198 for (i = 0; i < 3; i++)
10199 Fdefine_coding_system_alias (AREF (subsidiaries, i),
10200 AREF (eol_type, i));
4ed46869 10201 }
df7492f9
KH
10202
10203 Fputhash (alias, spec, Vcoding_system_hash_table);
d6925f38 10204 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
583f71ca
KH
10205 val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10206 if (NILP (val))
10207 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10208 Vcoding_system_alist);
66cfb530 10209
4ed46869
KH
10210 return Qnil;
10211}
10212
a7ca3326 10213DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
df7492f9
KH
10214 1, 1, 0,
10215 doc: /* Return the base of CODING-SYSTEM.
da7db224 10216Any alias or subsidiary coding system is not a base coding system. */)
5842a27b 10217 (Lisp_Object coding_system)
d46c5b12 10218{
df7492f9 10219 Lisp_Object spec, attrs;
d46c5b12 10220
df7492f9
KH
10221 if (NILP (coding_system))
10222 return (Qno_conversion);
10223 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10224 attrs = AREF (spec, 0);
10225 return CODING_ATTR_BASE_NAME (attrs);
10226}
1397dc18 10227
df7492f9
KH
10228DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10229 1, 1, 0,
10230 doc: "Return the property list of CODING-SYSTEM.")
5842a27b 10231 (Lisp_Object coding_system)
df7492f9
KH
10232{
10233 Lisp_Object spec, attrs;
1397dc18 10234
df7492f9
KH
10235 if (NILP (coding_system))
10236 coding_system = Qno_conversion;
10237 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10238 attrs = AREF (spec, 0);
10239 return CODING_ATTR_PLIST (attrs);
d46c5b12
KH
10240}
10241
df7492f9
KH
10242
10243DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10244 1, 1, 0,
da7db224 10245 doc: /* Return the list of aliases of CODING-SYSTEM. */)
5842a27b 10246 (Lisp_Object coding_system)
66cfb530 10247{
df7492f9 10248 Lisp_Object spec;
84d60297 10249
df7492f9
KH
10250 if (NILP (coding_system))
10251 coding_system = Qno_conversion;
10252 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
da7db224 10253 return AREF (spec, 1);
df7492f9 10254}
66cfb530 10255
a7ca3326 10256DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
df7492f9
KH
10257 Scoding_system_eol_type, 1, 1, 0,
10258 doc: /* Return eol-type of CODING-SYSTEM.
d4a1d553 10259An eol-type is an integer 0, 1, 2, or a vector of coding systems.
66cfb530 10260
df7492f9
KH
10261Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10262and CR respectively.
66cfb530 10263
df7492f9
KH
10264A vector value indicates that a format of end-of-line should be
10265detected automatically. Nth element of the vector is the subsidiary
10266coding system whose eol-type is N. */)
5842a27b 10267 (Lisp_Object coding_system)
6b89e3aa 10268{
df7492f9
KH
10269 Lisp_Object spec, eol_type;
10270 int n;
6b89e3aa 10271
df7492f9
KH
10272 if (NILP (coding_system))
10273 coding_system = Qno_conversion;
10274 if (! CODING_SYSTEM_P (coding_system))
10275 return Qnil;
10276 spec = CODING_SYSTEM_SPEC (coding_system);
10277 eol_type = AREF (spec, 2);
10278 if (VECTORP (eol_type))
10279 return Fcopy_sequence (eol_type);
10280 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10281 return make_number (n);
6b89e3aa
KH
10282}
10283
4ed46869
KH
10284#endif /* emacs */
10285
10286\f
1397dc18 10287/*** 12. Postamble ***/
4ed46869 10288
dfcf069d 10289void
971de7fb 10290init_coding_once (void)
4ed46869
KH
10291{
10292 int i;
10293
df7492f9
KH
10294 for (i = 0; i < coding_category_max; i++)
10295 {
10296 coding_categories[i].id = -1;
10297 coding_priorities[i] = i;
10298 }
4ed46869
KH
10299
10300 /* ISO2022 specific initialize routine. */
10301 for (i = 0; i < 0x20; i++)
b73bfc1c 10302 iso_code_class[i] = ISO_control_0;
4ed46869
KH
10303 for (i = 0x21; i < 0x7F; i++)
10304 iso_code_class[i] = ISO_graphic_plane_0;
10305 for (i = 0x80; i < 0xA0; i++)
b73bfc1c 10306 iso_code_class[i] = ISO_control_1;
4ed46869
KH
10307 for (i = 0xA1; i < 0xFF; i++)
10308 iso_code_class[i] = ISO_graphic_plane_1;
10309 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10310 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
4ed46869
KH
10311 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10312 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10313 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10314 iso_code_class[ISO_CODE_ESC] = ISO_escape;
10315 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10316 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10317 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10318
df7492f9
KH
10319 for (i = 0; i < 256; i++)
10320 {
10321 emacs_mule_bytes[i] = 1;
10322 }
7c78e542
KH
10323 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10324 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10325 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10326 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
e0e989f6
KH
10327}
10328
10329#ifdef emacs
10330
dfcf069d 10331void
971de7fb 10332syms_of_coding (void)
e0e989f6 10333{
df7492f9 10334 staticpro (&Vcoding_system_hash_table);
8f924df7
KH
10335 {
10336 Lisp_Object args[2];
10337 args[0] = QCtest;
10338 args[1] = Qeq;
10339 Vcoding_system_hash_table = Fmake_hash_table (2, args);
10340 }
df7492f9
KH
10341
10342 staticpro (&Vsjis_coding_system);
10343 Vsjis_coding_system = Qnil;
e0e989f6 10344
df7492f9
KH
10345 staticpro (&Vbig5_coding_system);
10346 Vbig5_coding_system = Qnil;
10347
24a73b0a
KH
10348 staticpro (&Vcode_conversion_reused_workbuf);
10349 Vcode_conversion_reused_workbuf = Qnil;
10350
10351 staticpro (&Vcode_conversion_workbuf_name);
d67b4f80 10352 Vcode_conversion_workbuf_name = make_pure_c_string (" *code-conversion-work*");
e0e989f6 10353
24a73b0a 10354 reused_workbuf_in_use = 0;
df7492f9
KH
10355
10356 DEFSYM (Qcharset, "charset");
10357 DEFSYM (Qtarget_idx, "target-idx");
10358 DEFSYM (Qcoding_system_history, "coding-system-history");
bb0115a2
RS
10359 Fset (Qcoding_system_history, Qnil);
10360
9ce27fde 10361 /* Target FILENAME is the first argument. */
e0e989f6 10362 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 10363 /* Target FILENAME is the third argument. */
e0e989f6
KH
10364 Fput (Qwrite_region, Qtarget_idx, make_number (2));
10365
df7492f9 10366 DEFSYM (Qcall_process, "call-process");
9ce27fde 10367 /* Target PROGRAM is the first argument. */
e0e989f6
KH
10368 Fput (Qcall_process, Qtarget_idx, make_number (0));
10369
df7492f9 10370 DEFSYM (Qcall_process_region, "call-process-region");
9ce27fde 10371 /* Target PROGRAM is the third argument. */
e0e989f6
KH
10372 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10373
df7492f9 10374 DEFSYM (Qstart_process, "start-process");
9ce27fde 10375 /* Target PROGRAM is the third argument. */
e0e989f6
KH
10376 Fput (Qstart_process, Qtarget_idx, make_number (2));
10377
df7492f9 10378 DEFSYM (Qopen_network_stream, "open-network-stream");
9ce27fde 10379 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
10380 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10381
df7492f9
KH
10382 DEFSYM (Qcoding_system, "coding-system");
10383 DEFSYM (Qcoding_aliases, "coding-aliases");
4ed46869 10384
df7492f9
KH
10385 DEFSYM (Qeol_type, "eol-type");
10386 DEFSYM (Qunix, "unix");
10387 DEFSYM (Qdos, "dos");
4ed46869 10388
df7492f9
KH
10389 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10390 DEFSYM (Qpost_read_conversion, "post-read-conversion");
10391 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10392 DEFSYM (Qdefault_char, "default-char");
10393 DEFSYM (Qundecided, "undecided");
10394 DEFSYM (Qno_conversion, "no-conversion");
10395 DEFSYM (Qraw_text, "raw-text");
4ed46869 10396
df7492f9 10397 DEFSYM (Qiso_2022, "iso-2022");
4ed46869 10398
df7492f9 10399 DEFSYM (Qutf_8, "utf-8");
8f924df7 10400 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
27901516 10401
df7492f9 10402 DEFSYM (Qutf_16, "utf-16");
df7492f9
KH
10403 DEFSYM (Qbig, "big");
10404 DEFSYM (Qlittle, "little");
27901516 10405
df7492f9
KH
10406 DEFSYM (Qshift_jis, "shift-jis");
10407 DEFSYM (Qbig5, "big5");
4ed46869 10408
df7492f9 10409 DEFSYM (Qcoding_system_p, "coding-system-p");
4ed46869 10410
df7492f9 10411 DEFSYM (Qcoding_system_error, "coding-system-error");
4ed46869 10412 Fput (Qcoding_system_error, Qerror_conditions,
d67b4f80 10413 pure_cons (Qcoding_system_error, pure_cons (Qerror, Qnil)));
4ed46869 10414 Fput (Qcoding_system_error, Qerror_message,
d67b4f80 10415 make_pure_c_string ("Invalid coding system"));
4ed46869 10416
05e6f5dc
KH
10417 /* Intern this now in case it isn't already done.
10418 Setting this variable twice is harmless.
10419 But don't staticpro it here--that is done in alloc.c. */
d67b4f80 10420 Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
70c22245 10421
df7492f9 10422 DEFSYM (Qtranslation_table, "translation-table");
433f7f87 10423 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
df7492f9
KH
10424 DEFSYM (Qtranslation_table_id, "translation-table-id");
10425 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10426 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
1397dc18 10427
df7492f9 10428 DEFSYM (Qvalid_codes, "valid-codes");
9ce27fde 10429
df7492f9 10430 DEFSYM (Qemacs_mule, "emacs-mule");
d46c5b12 10431
01378f49 10432 DEFSYM (QCcategory, ":category");
a6f87d34 10433 DEFSYM (QCmnemonic, ":mnemonic");
2133e2d1 10434 DEFSYM (QCdefault_char, ":default-char");
a6f87d34
KH
10435 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10436 DEFSYM (QCencode_translation_table, ":encode-translation-table");
10437 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10438 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
35befdaa 10439 DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
01378f49 10440
df7492f9
KH
10441 Vcoding_category_table
10442 = Fmake_vector (make_number (coding_category_max), Qnil);
10443 staticpro (&Vcoding_category_table);
10444 /* Followings are target of code detection. */
10445 ASET (Vcoding_category_table, coding_category_iso_7,
d67b4f80 10446 intern_c_string ("coding-category-iso-7"));
df7492f9 10447 ASET (Vcoding_category_table, coding_category_iso_7_tight,
d67b4f80 10448 intern_c_string ("coding-category-iso-7-tight"));
df7492f9 10449 ASET (Vcoding_category_table, coding_category_iso_8_1,
d67b4f80 10450 intern_c_string ("coding-category-iso-8-1"));
df7492f9 10451 ASET (Vcoding_category_table, coding_category_iso_8_2,
d67b4f80 10452 intern_c_string ("coding-category-iso-8-2"));
df7492f9 10453 ASET (Vcoding_category_table, coding_category_iso_7_else,
d67b4f80 10454 intern_c_string ("coding-category-iso-7-else"));
df7492f9 10455 ASET (Vcoding_category_table, coding_category_iso_8_else,
d67b4f80 10456 intern_c_string ("coding-category-iso-8-else"));
a470d443 10457 ASET (Vcoding_category_table, coding_category_utf_8_auto,
d67b4f80 10458 intern_c_string ("coding-category-utf-8-auto"));
a470d443 10459 ASET (Vcoding_category_table, coding_category_utf_8_nosig,
d67b4f80 10460 intern_c_string ("coding-category-utf-8"));
a470d443 10461 ASET (Vcoding_category_table, coding_category_utf_8_sig,
d67b4f80 10462 intern_c_string ("coding-category-utf-8-sig"));
df7492f9 10463 ASET (Vcoding_category_table, coding_category_utf_16_be,
d67b4f80 10464 intern_c_string ("coding-category-utf-16-be"));
ff563fce 10465 ASET (Vcoding_category_table, coding_category_utf_16_auto,
d67b4f80 10466 intern_c_string ("coding-category-utf-16-auto"));
df7492f9 10467 ASET (Vcoding_category_table, coding_category_utf_16_le,
d67b4f80 10468 intern_c_string ("coding-category-utf-16-le"));
df7492f9 10469 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
d67b4f80 10470 intern_c_string ("coding-category-utf-16-be-nosig"));
df7492f9 10471 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
d67b4f80 10472 intern_c_string ("coding-category-utf-16-le-nosig"));
df7492f9 10473 ASET (Vcoding_category_table, coding_category_charset,
d67b4f80 10474 intern_c_string ("coding-category-charset"));
df7492f9 10475 ASET (Vcoding_category_table, coding_category_sjis,
d67b4f80 10476 intern_c_string ("coding-category-sjis"));
df7492f9 10477 ASET (Vcoding_category_table, coding_category_big5,
d67b4f80 10478 intern_c_string ("coding-category-big5"));
df7492f9 10479 ASET (Vcoding_category_table, coding_category_ccl,
d67b4f80 10480 intern_c_string ("coding-category-ccl"));
df7492f9 10481 ASET (Vcoding_category_table, coding_category_emacs_mule,
d67b4f80 10482 intern_c_string ("coding-category-emacs-mule"));
df7492f9
KH
10483 /* Followings are NOT target of code detection. */
10484 ASET (Vcoding_category_table, coding_category_raw_text,
d67b4f80 10485 intern_c_string ("coding-category-raw-text"));
df7492f9 10486 ASET (Vcoding_category_table, coding_category_undecided,
d67b4f80 10487 intern_c_string ("coding-category-undecided"));
ecf488bc 10488
065e3595
KH
10489 DEFSYM (Qinsufficient_source, "insufficient-source");
10490 DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10491 DEFSYM (Qinvalid_source, "invalid-source");
10492 DEFSYM (Qinterrupted, "interrupted");
10493 DEFSYM (Qinsufficient_memory, "insufficient-memory");
44e8490d 10494 DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
065e3595 10495
4ed46869
KH
10496 defsubr (&Scoding_system_p);
10497 defsubr (&Sread_coding_system);
10498 defsubr (&Sread_non_nil_coding_system);
10499 defsubr (&Scheck_coding_system);
10500 defsubr (&Sdetect_coding_region);
d46c5b12 10501 defsubr (&Sdetect_coding_string);
05e6f5dc 10502 defsubr (&Sfind_coding_systems_region_internal);
068a9dbd 10503 defsubr (&Sunencodable_char_position);
df7492f9 10504 defsubr (&Scheck_coding_systems_region);
4ed46869
KH
10505 defsubr (&Sdecode_coding_region);
10506 defsubr (&Sencode_coding_region);
10507 defsubr (&Sdecode_coding_string);
10508 defsubr (&Sencode_coding_string);
10509 defsubr (&Sdecode_sjis_char);
10510 defsubr (&Sencode_sjis_char);
10511 defsubr (&Sdecode_big5_char);
10512 defsubr (&Sencode_big5_char);
1ba9e4ab 10513 defsubr (&Sset_terminal_coding_system_internal);
c4825358 10514 defsubr (&Sset_safe_terminal_coding_system_internal);
4ed46869 10515 defsubr (&Sterminal_coding_system);
1ba9e4ab 10516 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 10517 defsubr (&Skeyboard_coding_system);
a5d301df 10518 defsubr (&Sfind_operation_coding_system);
df7492f9 10519 defsubr (&Sset_coding_system_priority);
6b89e3aa 10520 defsubr (&Sdefine_coding_system_internal);
df7492f9 10521 defsubr (&Sdefine_coding_system_alias);
a6f87d34 10522 defsubr (&Scoding_system_put);
df7492f9
KH
10523 defsubr (&Scoding_system_base);
10524 defsubr (&Scoding_system_plist);
10525 defsubr (&Scoding_system_aliases);
10526 defsubr (&Scoding_system_eol_type);
10527 defsubr (&Scoding_system_priority_list);
4ed46869 10528
29208e82 10529 DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
48b0f3ae
PJ
10530 doc: /* List of coding systems.
10531
10532Do not alter the value of this variable manually. This variable should be
df7492f9 10533updated by the functions `define-coding-system' and
48b0f3ae 10534`define-coding-system-alias'. */);
4608c386
KH
10535 Vcoding_system_list = Qnil;
10536
29208e82 10537 DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
48b0f3ae
PJ
10538 doc: /* Alist of coding system names.
10539Each element is one element list of coding system name.
446dcd75 10540This variable is given to `completing-read' as COLLECTION argument.
48b0f3ae
PJ
10541
10542Do not alter the value of this variable manually. This variable should be
10543updated by the functions `make-coding-system' and
10544`define-coding-system-alias'. */);
4608c386
KH
10545 Vcoding_system_alist = Qnil;
10546
29208e82 10547 DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
48b0f3ae
PJ
10548 doc: /* List of coding-categories (symbols) ordered by priority.
10549
10550On detecting a coding system, Emacs tries code detection algorithms
10551associated with each coding-category one by one in this order. When
10552one algorithm agrees with a byte sequence of source text, the coding
0ec31faf
KH
10553system bound to the corresponding coding-category is selected.
10554
448e17d6 10555Don't modify this variable directly, but use `set-coding-system-priority'. */);
4ed46869
KH
10556 {
10557 int i;
10558
10559 Vcoding_category_list = Qnil;
df7492f9 10560 for (i = coding_category_max - 1; i >= 0; i--)
4ed46869 10561 Vcoding_category_list
d46c5b12
KH
10562 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
10563 Vcoding_category_list);
4ed46869
KH
10564 }
10565
29208e82 10566 DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
48b0f3ae
PJ
10567 doc: /* Specify the coding system for read operations.
10568It is useful to bind this variable with `let', but do not set it globally.
10569If the value is a coding system, it is used for decoding on read operation.
446dcd75
JB
10570If not, an appropriate element is used from one of the coding system alists.
10571There are three such tables: `file-coding-system-alist',
48b0f3ae 10572`process-coding-system-alist', and `network-coding-system-alist'. */);
4ed46869
KH
10573 Vcoding_system_for_read = Qnil;
10574
29208e82 10575 DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
48b0f3ae
PJ
10576 doc: /* Specify the coding system for write operations.
10577Programs bind this variable with `let', but you should not set it globally.
10578If the value is a coding system, it is used for encoding of output,
10579when writing it to a file and when sending it to a file or subprocess.
10580
10581If this does not specify a coding system, an appropriate element
446dcd75
JB
10582is used from one of the coding system alists.
10583There are three such tables: `file-coding-system-alist',
48b0f3ae
PJ
10584`process-coding-system-alist', and `network-coding-system-alist'.
10585For output to files, if the above procedure does not specify a coding system,
10586the value of `buffer-file-coding-system' is used. */);
4ed46869
KH
10587 Vcoding_system_for_write = Qnil;
10588
29208e82 10589 DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
df7492f9
KH
10590 doc: /*
10591Coding system used in the latest file or process I/O. */);
4ed46869
KH
10592 Vlast_coding_system_used = Qnil;
10593
29208e82 10594 DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
065e3595
KH
10595 doc: /*
10596Error status of the last code conversion.
10597
10598When an error was detected in the last code conversion, this variable
10599is set to one of the following symbols.
10600 `insufficient-source'
10601 `inconsistent-eol'
10602 `invalid-source'
10603 `interrupted'
10604 `insufficient-memory'
10605When no error was detected, the value doesn't change. So, to check
10606the error status of a code conversion by this variable, you must
10607explicitly set this variable to nil before performing code
10608conversion. */);
10609 Vlast_code_conversion_error = Qnil;
10610
29208e82 10611 DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
df7492f9
KH
10612 doc: /*
10613*Non-nil means always inhibit code conversion of end-of-line format.
48b0f3ae
PJ
10614See info node `Coding Systems' and info node `Text and Binary' concerning
10615such conversion. */);
9ce27fde
KH
10616 inhibit_eol_conversion = 0;
10617
29208e82 10618 DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
df7492f9
KH
10619 doc: /*
10620Non-nil means process buffer inherits coding system of process output.
48b0f3ae
PJ
10621Bind it to t if the process output is to be treated as if it were a file
10622read from some filesystem. */);
ed29121d
EZ
10623 inherit_process_coding_system = 0;
10624
29208e82 10625 DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
df7492f9
KH
10626 doc: /*
10627Alist to decide a coding system to use for a file I/O operation.
48b0f3ae
PJ
10628The format is ((PATTERN . VAL) ...),
10629where PATTERN is a regular expression matching a file name,
10630VAL is a coding system, a cons of coding systems, or a function symbol.
10631If VAL is a coding system, it is used for both decoding and encoding
10632the file contents.
10633If VAL is a cons of coding systems, the car part is used for decoding,
10634and the cdr part is used for encoding.
10635If VAL is a function symbol, the function must return a coding system
2c53e699
KH
10636or a cons of coding systems which are used as above. The function is
10637called with an argument that is a list of the arguments with which
5a0bbd9a
KH
10638`find-operation-coding-system' was called. If the function can't decide
10639a coding system, it can return `undecided' so that the normal
10640code-detection is performed.
48b0f3ae
PJ
10641
10642See also the function `find-operation-coding-system'
10643and the variable `auto-coding-alist'. */);
02ba4723
KH
10644 Vfile_coding_system_alist = Qnil;
10645
29208e82 10646 DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
df7492f9
KH
10647 doc: /*
10648Alist to decide a coding system to use for a process I/O operation.
48b0f3ae
PJ
10649The format is ((PATTERN . VAL) ...),
10650where PATTERN is a regular expression matching a program name,
10651VAL is a coding system, a cons of coding systems, or a function symbol.
10652If VAL is a coding system, it is used for both decoding what received
10653from the program and encoding what sent to the program.
10654If VAL is a cons of coding systems, the car part is used for decoding,
10655and the cdr part is used for encoding.
10656If VAL is a function symbol, the function must return a coding system
10657or a cons of coding systems which are used as above.
10658
10659See also the function `find-operation-coding-system'. */);
02ba4723
KH
10660 Vprocess_coding_system_alist = Qnil;
10661
29208e82 10662 DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
df7492f9
KH
10663 doc: /*
10664Alist to decide a coding system to use for a network I/O operation.
48b0f3ae
PJ
10665The format is ((PATTERN . VAL) ...),
10666where PATTERN is a regular expression matching a network service name
10667or is a port number to connect to,
10668VAL is a coding system, a cons of coding systems, or a function symbol.
10669If VAL is a coding system, it is used for both decoding what received
10670from the network stream and encoding what sent to the network stream.
10671If VAL is a cons of coding systems, the car part is used for decoding,
10672and the cdr part is used for encoding.
10673If VAL is a function symbol, the function must return a coding system
10674or a cons of coding systems which are used as above.
10675
10676See also the function `find-operation-coding-system'. */);
02ba4723 10677 Vnetwork_coding_system_alist = Qnil;
4ed46869 10678
29208e82 10679 DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
75205970
RS
10680 doc: /* Coding system to use with system messages.
10681Also used for decoding keyboard input on X Window system. */);
68c45bf0
PE
10682 Vlocale_coding_system = Qnil;
10683
005f0d35 10684 /* The eol mnemonics are reset in startup.el system-dependently. */
29208e82 10685 DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
df7492f9
KH
10686 doc: /*
10687*String displayed in mode line for UNIX-like (LF) end-of-line format. */);
d67b4f80 10688 eol_mnemonic_unix = make_pure_c_string (":");
4ed46869 10689
29208e82 10690 DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
df7492f9
KH
10691 doc: /*
10692*String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
d67b4f80 10693 eol_mnemonic_dos = make_pure_c_string ("\\");
4ed46869 10694
29208e82 10695 DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
df7492f9
KH
10696 doc: /*
10697*String displayed in mode line for MAC-like (CR) end-of-line format. */);
d67b4f80 10698 eol_mnemonic_mac = make_pure_c_string ("/");
4ed46869 10699
29208e82 10700 DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
df7492f9
KH
10701 doc: /*
10702*String displayed in mode line when end-of-line format is not yet determined. */);
d67b4f80 10703 eol_mnemonic_undecided = make_pure_c_string (":");
4ed46869 10704
29208e82 10705 DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
df7492f9
KH
10706 doc: /*
10707*Non-nil enables character translation while encoding and decoding. */);
84fbb8a0 10708 Venable_character_translation = Qt;
bdd9fb48 10709
f967223b 10710 DEFVAR_LISP ("standard-translation-table-for-decode",
29208e82 10711 Vstandard_translation_table_for_decode,
48b0f3ae 10712 doc: /* Table for translating characters while decoding. */);
f967223b 10713 Vstandard_translation_table_for_decode = Qnil;
bdd9fb48 10714
f967223b 10715 DEFVAR_LISP ("standard-translation-table-for-encode",
29208e82 10716 Vstandard_translation_table_for_encode,
48b0f3ae 10717 doc: /* Table for translating characters while encoding. */);
f967223b 10718 Vstandard_translation_table_for_encode = Qnil;
4ed46869 10719
29208e82 10720 DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
48b0f3ae
PJ
10721 doc: /* Alist of charsets vs revision numbers.
10722While encoding, if a charset (car part of an element) is found,
df7492f9
KH
10723designate it with the escape sequence identifying revision (cdr part
10724of the element). */);
10725 Vcharset_revision_table = Qnil;
02ba4723
KH
10726
10727 DEFVAR_LISP ("default-process-coding-system",
29208e82 10728 Vdefault_process_coding_system,
48b0f3ae
PJ
10729 doc: /* Cons of coding systems used for process I/O by default.
10730The car part is used for decoding a process output,
10731the cdr part is used for encoding a text to be sent to a process. */);
02ba4723 10732 Vdefault_process_coding_system = Qnil;
c4825358 10733
29208e82 10734 DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
df7492f9
KH
10735 doc: /*
10736Table of extra Latin codes in the range 128..159 (inclusive).
48b0f3ae
PJ
10737This is a vector of length 256.
10738If Nth element is non-nil, the existence of code N in a file
10739\(or output of subprocess) doesn't prevent it to be detected as
10740a coding system of ISO 2022 variant which has a flag
10741`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10742or reading output of a subprocess.
446dcd75 10743Only 128th through 159th elements have a meaning. */);
3f003981 10744 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
d46c5b12
KH
10745
10746 DEFVAR_LISP ("select-safe-coding-system-function",
29208e82 10747 Vselect_safe_coding_system_function,
df7492f9
KH
10748 doc: /*
10749Function to call to select safe coding system for encoding a text.
48b0f3ae
PJ
10750
10751If set, this function is called to force a user to select a proper
10752coding system which can encode the text in the case that a default
fdecf907
GM
10753coding system used in each operation can't encode the text. The
10754function should take care that the buffer is not modified while
10755the coding system is being selected.
48b0f3ae
PJ
10756
10757The default value is `select-safe-coding-system' (which see). */);
d46c5b12
KH
10758 Vselect_safe_coding_system_function = Qnil;
10759
5d5bf4d8 10760 DEFVAR_BOOL ("coding-system-require-warning",
29208e82 10761 coding_system_require_warning,
5d5bf4d8 10762 doc: /* Internal use only.
6b89e3aa
KH
10763If non-nil, on writing a file, `select-safe-coding-system-function' is
10764called even if `coding-system-for-write' is non-nil. The command
10765`universal-coding-system-argument' binds this variable to t temporarily. */);
5d5bf4d8
KH
10766 coding_system_require_warning = 0;
10767
10768
22ab2303 10769 DEFVAR_BOOL ("inhibit-iso-escape-detection",
29208e82 10770 inhibit_iso_escape_detection,
df7492f9 10771 doc: /*
97b1b294 10772If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
48b0f3ae 10773
97b1b294
EZ
10774When Emacs reads text, it tries to detect how the text is encoded.
10775This code detection is sensitive to escape sequences. If Emacs sees
10776a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10777of the ISO2022 encodings, and decodes text by the corresponding coding
10778system (e.g. `iso-2022-7bit').
48b0f3ae
PJ
10779
10780However, there may be a case that you want to read escape sequences in
10781a file as is. In such a case, you can set this variable to non-nil.
97b1b294
EZ
10782Then the code detection will ignore any escape sequences, and no text is
10783detected as encoded in some ISO-2022 encoding. The result is that all
48b0f3ae
PJ
10784escape sequences become visible in a buffer.
10785
10786The default value is nil, and it is strongly recommended not to change
10787it. That is because many Emacs Lisp source files that contain
10788non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10789in Emacs's distribution, and they won't be decoded correctly on
10790reading if you suppress escape sequence detection.
10791
10792The other way to read escape sequences in a file without decoding is
97b1b294 10793to explicitly specify some coding system that doesn't use ISO-2022
48b0f3ae 10794escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
74383408 10795 inhibit_iso_escape_detection = 0;
002fdb44 10796
97b1b294 10797 DEFVAR_BOOL ("inhibit-null-byte-detection",
29208e82 10798 inhibit_null_byte_detection,
97b1b294
EZ
10799 doc: /* If non-nil, Emacs ignores null bytes on code detection.
10800By default, Emacs treats it as binary data, and does not attempt to
10801decode it. The effect is as if you specified `no-conversion' for
10802reading that text.
10803
10804Set this to non-nil when a regular text happens to include null bytes.
10805Examples are Index nodes of Info files and null-byte delimited output
10806from GNU Find and GNU Grep. Emacs will then ignore the null bytes and
10807decode text as usual. */);
10808 inhibit_null_byte_detection = 0;
10809
29208e82 10810 DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
15c8f9d1 10811 doc: /* Char table for translating self-inserting characters.
446dcd75 10812This is applied to the result of input methods, not their input.
8434d0b8
EZ
10813See also `keyboard-translate-table'.
10814
10815Use of this variable for character code unification was rendered
10816obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10817internal character representation. */);
002fdb44 10818 Vtranslation_table_for_input = Qnil;
8f924df7 10819
2c78b7e1
KH
10820 {
10821 Lisp_Object args[coding_arg_max];
8f924df7 10822 Lisp_Object plist[16];
2c78b7e1
KH
10823 int i;
10824
10825 for (i = 0; i < coding_arg_max; i++)
10826 args[i] = Qnil;
10827
d67b4f80 10828 plist[0] = intern_c_string (":name");
2c78b7e1 10829 plist[1] = args[coding_arg_name] = Qno_conversion;
d67b4f80 10830 plist[2] = intern_c_string (":mnemonic");
2c78b7e1 10831 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
d67b4f80 10832 plist[4] = intern_c_string (":coding-type");
2c78b7e1 10833 plist[5] = args[coding_arg_coding_type] = Qraw_text;
d67b4f80 10834 plist[6] = intern_c_string (":ascii-compatible-p");
2c78b7e1 10835 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
d67b4f80 10836 plist[8] = intern_c_string (":default-char");
2c78b7e1 10837 plist[9] = args[coding_arg_default_char] = make_number (0);
d67b4f80 10838 plist[10] = intern_c_string (":for-unibyte");
8f924df7 10839 plist[11] = args[coding_arg_for_unibyte] = Qt;
d67b4f80
DN
10840 plist[12] = intern_c_string (":docstring");
10841 plist[13] = make_pure_c_string ("Do no conversion.\n\
2c78b7e1
KH
10842\n\
10843When you visit a file with this coding, the file is read into a\n\
10844unibyte buffer as is, thus each byte of a file is treated as a\n\
10845character.");
d67b4f80 10846 plist[14] = intern_c_string (":eol-type");
8f924df7
KH
10847 plist[15] = args[coding_arg_eol_type] = Qunix;
10848 args[coding_arg_plist] = Flist (16, plist);
2c78b7e1 10849 Fdefine_coding_system_internal (coding_arg_max, args);
ae6f73fa
KH
10850
10851 plist[1] = args[coding_arg_name] = Qundecided;
10852 plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10853 plist[5] = args[coding_arg_coding_type] = Qundecided;
10854 /* This is already set.
35befdaa 10855 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
d67b4f80 10856 plist[8] = intern_c_string (":charset-list");
ae6f73fa
KH
10857 plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10858 plist[11] = args[coding_arg_for_unibyte] = Qnil;
d67b4f80 10859 plist[13] = make_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
ae6f73fa
KH
10860 plist[15] = args[coding_arg_eol_type] = Qnil;
10861 args[coding_arg_plist] = Flist (16, plist);
10862 Fdefine_coding_system_internal (coding_arg_max, args);
2c78b7e1
KH
10863 }
10864
2c78b7e1 10865 setup_coding_system (Qno_conversion, &safe_terminal_coding);
ff563fce
KH
10866
10867 {
10868 int i;
10869
10870 for (i = 0; i < coding_category_max; i++)
10871 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10872 }
1a4990fb 10873#if defined (DOS_NT)
fcbcfb64
KH
10874 system_eol_type = Qdos;
10875#else
10876 system_eol_type = Qunix;
10877#endif
10878 staticpro (&system_eol_type);
4ed46869
KH
10879}
10880
68c45bf0 10881char *
971de7fb 10882emacs_strerror (int error_number)
68c45bf0
PE
10883{
10884 char *str;
10885
ca9c0567 10886 synchronize_system_messages_locale ();
68c45bf0
PE
10887 str = strerror (error_number);
10888
10889 if (! NILP (Vlocale_coding_system))
10890 {
10891 Lisp_Object dec = code_convert_string_norecord (build_string (str),
10892 Vlocale_coding_system,
10893 0);
51b59d79 10894 str = SSDATA (dec);
68c45bf0
PE
10895 }
10896
10897 return str;
10898}
10899
4ed46869 10900#endif /* emacs */