* lisp.h (make_gap_1): New prototype.
[bpt/emacs.git] / src / coding.c
CommitLineData
9542cb1f 1/* Coding system handler (conversion, detection, etc).
ab422c4d 2 Copyright (C) 2001-2013 Free Software Foundation, Inc.
7976eda0 3 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
5df4f04c 4 2005, 2006, 2007, 2008, 2009, 2010, 2011
ce03bf76
KH
5 National Institute of Advanced Industrial Science and Technology (AIST)
6 Registration Number H14PRO021
8f924df7 7 Copyright (C) 2003
df7492f9
KH
8 National Institute of Advanced Industrial Science and Technology (AIST)
9 Registration Number H13PRO009
4ed46869 10
369314dc
KH
11This file is part of GNU Emacs.
12
9ec0b715 13GNU Emacs is free software: you can redistribute it and/or modify
369314dc 14it under the terms of the GNU General Public License as published by
9ec0b715
GM
15the Free Software Foundation, either version 3 of the License, or
16(at your option) any later version.
4ed46869 17
369314dc
KH
18GNU Emacs is distributed in the hope that it will be useful,
19but WITHOUT ANY WARRANTY; without even the implied warranty of
20MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21GNU General Public License for more details.
4ed46869 22
369314dc 23You should have received a copy of the GNU General Public License
9ec0b715 24along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
4ed46869
KH
25
26/*** TABLE OF CONTENTS ***
27
b73bfc1c 28 0. General comments
4ed46869 29 1. Preamble
df7492f9
KH
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
4ed46869
KH
41
42*/
43
df7492f9 44/*** 0. General comments ***
b73bfc1c
KH
45
46
df7492f9 47CODING SYSTEM
4ed46869 48
5bad0796
DL
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
e19c3639
KH
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
0ef69138 56 coding system.
4ed46869 57
34809aa6
EZ
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. On
59 the C level, a coding system is represented by a vector of attributes
5bad0796 60 stored in the hash table Vcoding_system_hash_table. The conversion from
e19c3639
KH
61 coding system symbol to attributes vector is done by looking up
62 Vcoding_system_hash_table by the symbol.
4ed46869 63
e19c3639 64 Coding systems are classified into the following types depending on
5bad0796 65 the encoding mechanism. Here's a brief description of the types.
4ed46869 66
df7492f9
KH
67 o UTF-8
68
69 o UTF-16
70
71 o Charset-base coding system
72
73 A coding system defined by one or more (coded) character sets.
5bad0796 74 Decoding and encoding are done by a code converter defined for each
df7492f9
KH
75 character set.
76
5bad0796 77 o Old Emacs internal format (emacs-mule)
df7492f9 78
5bad0796 79 The coding system adopted by old versions of Emacs (20 and 21).
4ed46869 80
df7492f9 81 o ISO2022-base coding system
4ed46869
KH
82
83 The most famous coding system for multiple character sets. X's
df7492f9
KH
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
4ed46869 87
df7492f9 88 o SJIS (or Shift-JIS or MS-Kanji-Code)
93dec019 89
4ed46869
KH
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
df7492f9 92 section 8.
4ed46869 93
df7492f9 94 o BIG5
4ed46869 95
df7492f9 96 A coding system to encode character sets: ASCII and Big5. Widely
cfb43547 97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
df7492f9
KH
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
4ed46869 101
df7492f9 102 o CCL
27901516 103
5bad0796 104 If a user wants to decode/encode text encoded in a coding system
df7492f9
KH
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
27901516 108
df7492f9 109 o Raw-text
4ed46869 110
5a936b46 111 A coding system for text containing raw eight-bit data. Emacs
5bad0796 112 treats each byte of source text as a character (except for
df7492f9 113 end-of-line conversion).
4ed46869 114
df7492f9
KH
115 o No-conversion
116
117 Like raw text, but don't do end-of-line conversion.
4ed46869 118
4ed46869 119
df7492f9 120END-OF-LINE FORMAT
4ed46869 121
5bad0796 122 How text end-of-line is encoded depends on the operating system. For
df7492f9 123 instance, Unix's format is just one byte of LF (line-feed) code,
f4dee582 124 whereas DOS's format is two-byte sequence of `carriage-return' and
d46c5b12
KH
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
4ed46869 127
cfb43547 128 Since text character encoding and end-of-line encoding are
df7492f9
KH
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
4ed46869 131
e19c3639
KH
132STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
5bad0796 137 conversion (e.g. the location of source and destination data).
4ed46869
KH
138
139*/
140
df7492f9
KH
141/* COMMON MACROS */
142
143
4ed46869
KH
144/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145
df7492f9 146 These functions check if a byte sequence specified as a source in
ff0dacd7
KH
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
df7492f9 149
f10fe38f 150 Return true if the byte sequence conforms to XXX.
df7492f9
KH
151
152 Below is the template of these functions. */
153
#if 0
/* Template for the detect_coding_XXX functions.

   Scan the source bytes of CODING.  If the whole source conforms to
   XXX, merge any evidence gathered into DETECT_INFO->found and return
   1.  Otherwise set CATEGORY_MASK_XXX in DETECT_INFO->rejected and
   return 0.

   ONE_MORE_BYTE also needs C (the fetched byte) and SRC_BASE to be
   declared; they are assumed to be among the declarations elided by
   `...;' below.  */
static bool
detect_coding_XXX (struct coding_system *coding,
                   struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  ptrdiff_t consumed_chars = 0;
  int found = 0;
  ...;

  while (1)
    {
      /* Get one byte from the source.  If the source is exhausted, jump
         to no_more_source:.  */
      ONE_MORE_BYTE (c);

      if (! __C_conforms_to_XXX___ (c))
        break;
      /* Only a byte that positively indicates XXX counts as evidence;
         a merely conforming byte (e.g. plain ASCII) must not set
         FOUND.  */
      if (__C_strongly_suggests_XXX__ (c))
        found = CATEGORY_MASK_XXX;
    }
  /* The byte sequence is invalid for XXX.  */
  detect_info->rejected |= CATEGORY_MASK_XXX;
  return 0;

 no_more_source:
  /* The source exhausted successfully.  */
  detect_info->found |= found;
  return 1;
}
#endif
187
188/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
189
df7492f9
KH
190 These functions decode a byte sequence specified as a source by
191 CODING. The resulting multibyte text goes to a place pointed to by
192 CODING->charbuf, the length of which should not exceed
193 CODING->charbuf_size;
d46c5b12 194
df7492f9
KH
195 These functions set the information of original and decoded texts in
196 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
197 They also set CODING->result to one of CODING_RESULT_XXX indicating
198 how the decoding is finished.
d46c5b12 199
df7492f9 200 Below is the template of these functions. */
d46c5b12 201
#if 0
/* Template for the decode_coding_XXX functions.

   Decode bytes from CODING->source, starting at CODING->consumed,
   into CODING->charbuf, starting at CODING->charbuf_used.  On return,
   CODING->consumed, CODING->consumed_char and CODING->charbuf_used
   are updated.

   ONE_MORE_BYTE also needs C (the fetched byte) and CONSUMED_CHARS to
   be declared; a real decoder must declare them.  */
static void
decode_coding_XXXX (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* SRC_BASE remembers the start position in source in each loop.
     The loop will be exited when there's not enough source code, or
     when there's no room in CHARBUF for a decoded character.  */
  const unsigned char *src_base;
  /* A buffer to produce decoded characters.  */
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  bool multibytep = coding->src_multibyte;

  while (1)
    {
      src_base = src;
      /* Stop once CHARBUF is full, i.e. the write pointer has reached
         the end of the buffer.  */
      if (charbuf >= charbuf_end)
        /* No more room to produce a decoded character.  */
        break;
      ONE_MORE_BYTE (c);
      /* Decode it. */
    }

 no_more_source:
  if (src_base < src_end
      && coding->mode & CODING_MODE_LAST_BLOCK)
    /* If the source ends by partial bytes to construct a character,
       treat them as eight-bit raw data.  */
    while (src_base < src_end && charbuf < charbuf_end)
      *charbuf++ = *src_base++;
  /* Remember how many bytes and characters we consumed.  If the
     source is multibyte, the bytes and chars are not identical.  */
  coding->consumed = coding->consumed_char = src_base - coding->source;
  /* Remember how many characters we produced.  */
  coding->charbuf_used = charbuf - coding->charbuf;
}
#endif
241
242/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
243
df7492f9
KH
244 These functions encode SRC_BYTES length text at SOURCE of Emacs'
245 internal multibyte format by CODING. The resulting byte sequence
b73bfc1c
KH
246 goes to a place pointed to by DESTINATION, the length of which
247 should not exceed DST_BYTES.
d46c5b12 248
df7492f9
KH
249 These functions set the information of original and encoded texts in
250 the members produced, produced_char, consumed, and consumed_char of
251 the structure *CODING. They also set the member result to one of
252 CODING_RESULT_XXX indicating how the encoding finished.
d46c5b12 253
df7492f9
KH
254 DST_BYTES zero means that source area and destination area are
255 overlapped, which means that we can produce encoded text until it
256 reaches the head of the not-yet-encoded source text.
d46c5b12 257
df7492f9 258 Below is a template of these functions. */
#if 0
/* Template for the encode_coding_XXX functions.

   Encode the characters in CODING->charbuf (CODING->charbuf_used of
   them) into CODING->destination, starting at CODING->produced.  On
   return, CODING->produced_char and CODING->produced are updated.  */
static void
encode_coding_XXX (struct coding_system *coding)
{
  bool multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  /* The end of the characters to encode; it is computed from the
     buffer owned by CODING, since CHARBUF itself is a plain int
     pointer.  */
  int *charbuf_end = coding->charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  /* Stop early enough that one loop iteration can never write past
     DST_END.  */
  unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
  ptrdiff_t produced_chars = 0;

  for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
    {
      int c = *charbuf;
      /* Encode C into DST, and increment DST.  */
    }
 label_no_more_destination:
  /* How many chars and bytes we produced.  */
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
}
#endif
282
4ed46869
KH
283\f
284/*** 1. Preamble ***/
285
68c45bf0 286#include <config.h>
4ed46869
KH
287#include <stdio.h>
288
4ed46869 289#include "lisp.h"
df7492f9 290#include "character.h"
e5560ff7 291#include "buffer.h"
4ed46869
KH
292#include "charset.h"
293#include "ccl.h"
df7492f9 294#include "composite.h"
4ed46869
KH
295#include "coding.h"
296#include "window.h"
b8299c66
KL
297#include "frame.h"
298#include "termhooks.h"
4ed46869 299
df7492f9 300Lisp_Object Vcoding_system_hash_table;
4ed46869 301
955cbe7b
PE
302static Lisp_Object Qcoding_system, Qeol_type;
303static Lisp_Object Qcoding_aliases;
84cc1ab6
PE
304Lisp_Object Qunix, Qdos;
305static Lisp_Object Qmac;
4ed46869 306Lisp_Object Qbuffer_file_coding_system;
955cbe7b
PE
307static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
308static Lisp_Object Qdefault_char;
27901516 309Lisp_Object Qno_conversion, Qundecided;
955cbe7b
PE
310Lisp_Object Qcharset, Qutf_8;
311static Lisp_Object Qiso_2022;
312static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
313static Lisp_Object Qbig, Qlittle;
314static Lisp_Object Qcoding_system_history;
315static Lisp_Object Qvalid_codes;
316static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
317static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
318static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
319static Lisp_Object QCascii_compatible_p;
4ed46869 320
387f6ba5 321Lisp_Object Qcall_process, Qcall_process_region;
4ed46869 322Lisp_Object Qstart_process, Qopen_network_stream;
955cbe7b 323static Lisp_Object Qtarget_idx;
4ed46869 324
955cbe7b
PE
325static Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
326static Lisp_Object Qinterrupted, Qinsufficient_memory;
065e3595 327
44e8490d
KH
328/* If a symbol has this property, evaluate the value to define the
329 symbol as a coding system. */
330static Lisp_Object Qcoding_system_define_form;
331
fcbcfb64
KH
332/* Format of end-of-line decided by system. This is Qunix on
333 Unix and Mac, Qdos on DOS/Windows.
334 This has an effect only for external encoding (i.e. for output to
335 file and process), not for in-buffer or Lisp string encoding. */
336static Lisp_Object system_eol_type;
337
4ed46869
KH
338#ifdef emacs
339
4608c386 340Lisp_Object Qcoding_system_p, Qcoding_system_error;
4ed46869 341
d46c5b12
KH
342/* Coding system emacs-mule and raw-text are for converting only
343 end-of-line format. */
344Lisp_Object Qemacs_mule, Qraw_text;
8f924df7 345Lisp_Object Qutf_8_emacs;
ecf488bc 346
53372c27 347#if defined (WINDOWSNT) || defined (CYGWIN)
ba116008
DC
348static Lisp_Object Qutf_16le;
349#endif
350
4ed46869
KH
351/* Coding-systems are handed between Emacs Lisp programs and C internal
352 routines by the following three variables. */
c4825358
KH
353/* Coding system to be used to encode text for terminal display when
354 terminal coding system is nil. */
355struct coding_system safe_terminal_coding;
356
4ed46869
KH
357#endif /* emacs */
358
f967223b
KH
359Lisp_Object Qtranslation_table;
360Lisp_Object Qtranslation_table_id;
955cbe7b
PE
361static Lisp_Object Qtranslation_table_for_decode;
362static Lisp_Object Qtranslation_table_for_encode;
4ed46869 363
df7492f9 364/* Two special coding systems. */
74ab6df5
PE
365static Lisp_Object Vsjis_coding_system;
366static Lisp_Object Vbig5_coding_system;
df7492f9 367
df7492f9
KH
368/* ISO2022 section */
369
370#define CODING_ISO_INITIAL(coding, reg) \
371 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
372 coding_attr_iso_initial), \
373 reg)))
374
375
1b3b981b
AS
376#define CODING_ISO_REQUEST(coding, charset_id) \
377 (((charset_id) <= (coding)->max_charset_id \
378 ? ((coding)->safe_charsets[charset_id] != 255 \
379 ? (coding)->safe_charsets[charset_id] \
380 : -1) \
df7492f9
KH
381 : -1))
382
383
384#define CODING_ISO_FLAGS(coding) \
385 ((coding)->spec.iso_2022.flags)
386#define CODING_ISO_DESIGNATION(coding, reg) \
387 ((coding)->spec.iso_2022.current_designation[reg])
388#define CODING_ISO_INVOCATION(coding, plane) \
389 ((coding)->spec.iso_2022.current_invocation[plane])
390#define CODING_ISO_SINGLE_SHIFTING(coding) \
391 ((coding)->spec.iso_2022.single_shifting)
392#define CODING_ISO_BOL(coding) \
393 ((coding)->spec.iso_2022.bol)
394#define CODING_ISO_INVOKED_CHARSET(coding, plane) \
395 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
e951386e
KH
396#define CODING_ISO_CMP_STATUS(coding) \
397 (&(coding)->spec.iso_2022.cmp_status)
398#define CODING_ISO_EXTSEGMENT_LEN(coding) \
399 ((coding)->spec.iso_2022.ctext_extended_segment_len)
400#define CODING_ISO_EMBEDDED_UTF_8(coding) \
401 ((coding)->spec.iso_2022.embedded_utf_8)
df7492f9
KH
402
403/* Control characters of ISO2022. */
404 /* code */ /* function */
df7492f9
KH
405#define ISO_CODE_SO 0x0E /* shift-out */
406#define ISO_CODE_SI 0x0F /* shift-in */
407#define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
408#define ISO_CODE_ESC 0x1B /* escape */
409#define ISO_CODE_SS2 0x8E /* single-shift-2 */
410#define ISO_CODE_SS3 0x8F /* single-shift-3 */
411#define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
412
413/* Every 1-byte code of ISO2022 is classified into one of the
414 following classes. */
415enum iso_code_class_type
416 {
417 ISO_control_0, /* Control codes in the range
418 0x00..0x1F and 0x7F, except for the
419 following 4 codes.
NOTE(review): 0x7F is also named by ISO_0x20_or_0x7F below; confirm which class it belongs to. */
df7492f9
KH
420 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
421 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
422 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
a0d7415f 423 ISO_escape, /* ISO_CODE_ESC (0x1B) */
df7492f9
KH
424 ISO_control_1, /* Control codes in the range
425 0x80..0x9F, except for the
426 following 3 codes. */
427 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
428 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
429 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
430 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
431 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
432 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
433 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
434 };
05e6f5dc 435
df7492f9
KH
436/** Each CODING_ISO_FLAG_XXX macro defines a flag bit of the
437 `iso-flags' attribute of an iso2022 coding system. */
05e6f5dc 438
df7492f9
KH
439/* If set, produce long-form designation sequence (e.g. ESC $ ( A)
440 instead of the correct short-form sequence (e.g. ESC $ A). */
441#define CODING_ISO_FLAG_LONG_FORM 0x0001
93dec019 442
df7492f9
KH
443/* If set, reset graphic planes and registers at end-of-line to the
444 initial state. */
445#define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
05e6f5dc 446
df7492f9
KH
447/* If set, reset graphic planes and registers before any control
448 characters to the initial state. */
449#define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
05e6f5dc 450
df7492f9
KH
451/* If set, encode by 7-bit environment. */
452#define CODING_ISO_FLAG_SEVEN_BITS 0x0008
4ed46869 453
df7492f9
KH
454/* If set, use locking-shift function. */
455#define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
b73bfc1c 456
df7492f9
KH
457/* If set, use single-shift function. Overwrite
458 CODING_ISO_FLAG_LOCKING_SHIFT. */
459#define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
b73bfc1c 460
df7492f9
KH
461/* If set, use designation escape sequence. */
462#define CODING_ISO_FLAG_DESIGNATION 0x0040
b73bfc1c 463
df7492f9
KH
464/* If set, produce revision number sequence. */
465#define CODING_ISO_FLAG_REVISION 0x0080
b73bfc1c 466
df7492f9
KH
467/* If set, produce ISO6429's direction specifying sequence. */
468#define CODING_ISO_FLAG_DIRECTION 0x0100
f4dee582 469
df7492f9
KH
470/* If set, assume designation states are reset at beginning of line on
471 output. */
472#define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
4ed46869 473
df7492f9
KH
474/* If set, designation sequence should be placed at beginning of line
475 on output. */
476#define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
aa72b389 477
ad1746f5 478/* If set, do not encode unsafe characters on output. */
df7492f9 479#define CODING_ISO_FLAG_SAFE 0x0800
aa72b389 480
df7492f9
KH
481/* If set, extra latin codes (128..159) are accepted as a valid code
482 on input. */
483#define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
aa72b389 484
df7492f9 485#define CODING_ISO_FLAG_COMPOSITION 0x2000
aa72b389 486
5f58e762 487/* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
aa72b389 488
bf16eb23 489#define CODING_ISO_FLAG_USE_ROMAN 0x8000
aa72b389 490
bf16eb23 491#define CODING_ISO_FLAG_USE_OLDJIS 0x10000
aa72b389 492
bf16eb23 493#define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
aa72b389 494
df7492f9
KH
495/* A character to be produced on output if encoding of the original
496 character is prohibited by CODING_ISO_FLAG_SAFE. */
497#define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
aa72b389 498
a470d443
KH
499/* UTF-8 section */
500#define CODING_UTF_8_BOM(coding) \
501 ((coding)->spec.utf_8_bom)
4ed46869 502
df7492f9
KH
503/* UTF-16 section */
504#define CODING_UTF_16_BOM(coding) \
505 ((coding)->spec.utf_16.bom)
4ed46869 506
df7492f9
KH
507#define CODING_UTF_16_ENDIAN(coding) \
508 ((coding)->spec.utf_16.endian)
4ed46869 509
df7492f9
KH
510#define CODING_UTF_16_SURROGATE(coding) \
511 ((coding)->spec.utf_16.surrogate)
4ed46869 512
4ed46869 513
df7492f9
KH
514/* CCL section */
515#define CODING_CCL_DECODER(coding) \
516 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
517#define CODING_CCL_ENCODER(coding) \
518 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
519#define CODING_CCL_VALIDS(coding) \
8f924df7 520 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
4ed46869 521
5a936b46 522/* Index for each coding category in `coding_categories' */
4ed46869 523
df7492f9
KH
524enum coding_category
525 {
526 coding_category_iso_7,
527 coding_category_iso_7_tight,
528 coding_category_iso_8_1,
529 coding_category_iso_8_2,
530 coding_category_iso_7_else,
531 coding_category_iso_8_else,
a470d443
KH
532 coding_category_utf_8_auto,
533 coding_category_utf_8_nosig,
534 coding_category_utf_8_sig,
df7492f9
KH
535 coding_category_utf_16_auto,
536 coding_category_utf_16_be,
537 coding_category_utf_16_le,
538 coding_category_utf_16_be_nosig,
539 coding_category_utf_16_le_nosig,
540 coding_category_charset,
541 coding_category_sjis,
542 coding_category_big5,
543 coding_category_ccl,
544 coding_category_emacs_mule,
545 /* All above are targets of code detection. */
546 coding_category_raw_text,
547 coding_category_undecided,
548 coding_category_max
549 };
550
551/* Definitions of flag bits used in detect_coding_XXXX. */
552#define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
553#define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
554#define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
555#define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
556#define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
557#define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
a470d443
KH
558#define CATEGORY_MASK_UTF_8_AUTO (1 << coding_category_utf_8_auto)
559#define CATEGORY_MASK_UTF_8_NOSIG (1 << coding_category_utf_8_nosig)
560#define CATEGORY_MASK_UTF_8_SIG (1 << coding_category_utf_8_sig)
b49a1807 561#define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
df7492f9
KH
562#define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
563#define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
564#define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
565#define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
566#define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
567#define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
568#define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
569#define CATEGORY_MASK_CCL (1 << coding_category_ccl)
570#define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
ff0dacd7 571#define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
df7492f9
KH
572
573/* This value is returned if detect_coding_mask () finds nothing other
574 than ASCII characters. */
575#define CATEGORY_MASK_ANY \
576 (CATEGORY_MASK_ISO_7 \
577 | CATEGORY_MASK_ISO_7_TIGHT \
578 | CATEGORY_MASK_ISO_8_1 \
579 | CATEGORY_MASK_ISO_8_2 \
580 | CATEGORY_MASK_ISO_7_ELSE \
581 | CATEGORY_MASK_ISO_8_ELSE \
a470d443
KH
582 | CATEGORY_MASK_UTF_8_AUTO \
583 | CATEGORY_MASK_UTF_8_NOSIG \
584 | CATEGORY_MASK_UTF_8_SIG \
2f3cbb32 585 | CATEGORY_MASK_UTF_16_AUTO \
df7492f9
KH
586 | CATEGORY_MASK_UTF_16_BE \
587 | CATEGORY_MASK_UTF_16_LE \
588 | CATEGORY_MASK_UTF_16_BE_NOSIG \
589 | CATEGORY_MASK_UTF_16_LE_NOSIG \
590 | CATEGORY_MASK_CHARSET \
591 | CATEGORY_MASK_SJIS \
592 | CATEGORY_MASK_BIG5 \
593 | CATEGORY_MASK_CCL \
594 | CATEGORY_MASK_EMACS_MULE)
595
596
597#define CATEGORY_MASK_ISO_7BIT \
598 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
599
600#define CATEGORY_MASK_ISO_8BIT \
601 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
602
603#define CATEGORY_MASK_ISO_ELSE \
604 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
605
606#define CATEGORY_MASK_ISO_ESCAPE \
607 (CATEGORY_MASK_ISO_7 \
608 | CATEGORY_MASK_ISO_7_TIGHT \
609 | CATEGORY_MASK_ISO_7_ELSE \
610 | CATEGORY_MASK_ISO_8_ELSE)
611
612#define CATEGORY_MASK_ISO \
613 ( CATEGORY_MASK_ISO_7BIT \
614 | CATEGORY_MASK_ISO_8BIT \
615 | CATEGORY_MASK_ISO_ELSE)
616
617#define CATEGORY_MASK_UTF_16 \
2f3cbb32
KH
618 (CATEGORY_MASK_UTF_16_AUTO \
619 | CATEGORY_MASK_UTF_16_BE \
df7492f9
KH
620 | CATEGORY_MASK_UTF_16_LE \
621 | CATEGORY_MASK_UTF_16_BE_NOSIG \
622 | CATEGORY_MASK_UTF_16_LE_NOSIG)
623
a470d443
KH
624#define CATEGORY_MASK_UTF_8 \
625 (CATEGORY_MASK_UTF_8_AUTO \
626 | CATEGORY_MASK_UTF_8_NOSIG \
627 | CATEGORY_MASK_UTF_8_SIG)
df7492f9 628
df7492f9 629/* Table of coding categories (Lisp symbols). This variable is for
ad1746f5 630 internal use only. */
df7492f9
KH
631static Lisp_Object Vcoding_category_table;
632
633/* Table of coding-categories ordered by priority. */
634static enum coding_category coding_priorities[coding_category_max];
635
636/* Nth element is a coding context for the coding system bound to the
637 Nth coding category. */
638static struct coding_system coding_categories[coding_category_max];
639
df7492f9
KH
640/*** Commonly used macros and functions ***/
641
642#ifndef min
643#define min(a, b) ((a) < (b) ? (a) : (b))
644#endif
645#ifndef max
646#define max(a, b) ((a) > (b) ? (a) : (b))
647#endif
4ed46869 648
24a73b0a
KH
649#define CODING_GET_INFO(coding, attrs, charset_list) \
650 do { \
651 (attrs) = CODING_ID_ATTRS ((coding)->id); \
652 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
df7492f9 653 } while (0)
4ed46869 654
4ed46869 655
df7492f9
KH
656/* Safely get one byte from the source text pointed by SRC which ends
657 at SRC_END, and set C to that byte. If there are not enough bytes
f10fe38f
PE
658 in the source, it jumps to 'no_more_source'. If MULTIBYTEP,
659 and a multibyte character is found at SRC, set C to the
065e3595
KH
660 negative value of the character code. The caller should declare
661 and set these variables appropriately in advance:
662 src, src_end, multibytep */
aa72b389 663
065e3595
KH
664#define ONE_MORE_BYTE(c) \
665 do { \
666 if (src == src_end) \
667 { \
668 if (src_base < src) \
669 record_conversion_result \
670 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
671 goto no_more_source; \
672 } \
673 c = *src++; \
674 if (multibytep && (c & 0x80)) \
675 { \
676 if ((c & 0xFE) == 0xC0) \
677 c = ((c & 1) << 6) | *src++; \
678 else \
679 { \
35befdaa
KH
680 src--; \
681 c = - string_char (src, &src, NULL); \
065e3595
KH
682 record_conversion_result \
683 (coding, CODING_RESULT_INVALID_SRC); \
684 } \
685 } \
686 consumed_chars++; \
aa72b389
KH
687 } while (0)
688
f56a4450 689/* Safely get two bytes from the source text pointed by SRC which ends
220eeac9
KH
690 at SRC_END, and set C1 and C2 to those bytes while skipping the
691 heading multibyte characters. If there are not enough bytes in the
f10fe38f 692 source, it jumps to 'no_more_source'. If MULTIBYTEP and
220eeac9
KH
693 a multibyte character is found for C2, set C2 to the negative value
694 of the character code. The caller should declare and set these
695 variables appropriately in advance:
f56a4450
KH
696 src, src_end, multibytep
697 It is intended that this macro is used in detect_coding_utf_16. */
698
220eeac9
KH
699#define TWO_MORE_BYTES(c1, c2) \
700 do { \
701 do { \
702 if (src == src_end) \
703 goto no_more_source; \
704 c1 = *src++; \
705 if (multibytep && (c1 & 0x80)) \
706 { \
707 if ((c1 & 0xFE) == 0xC0) \
708 c1 = ((c1 & 1) << 6) | *src++; \
709 else \
710 { \
711 src += BYTES_BY_CHAR_HEAD (c1) - 1; \
712 c1 = -1; \
713 } \
714 } \
715 } while (c1 < 0); \
716 if (src == src_end) \
717 goto no_more_source; \
718 c2 = *src++; \
719 if (multibytep && (c2 & 0x80)) \
720 { \
721 if ((c2 & 0xFE) == 0xC0) \
722 c2 = ((c2 & 1) << 6) | *src++; \
723 else \
724 c2 = -1; \
725 } \
f56a4450
KH
726 } while (0)
727
aa72b389 728
df7492f9
KH
729/* Store a byte C in the place pointed by DST and increment DST to the
730 next free point, and increment PRODUCED_CHARS. The caller should
731 assure that C is 0..127, and declare and set the variable `dst'
732 appropriately in advance.
733*/
aa72b389
KH
734
735
df7492f9
KH
736#define EMIT_ONE_ASCII_BYTE(c) \
737 do { \
738 produced_chars++; \
739 *dst++ = (c); \
b6871cc7 740 } while (0)
aa72b389
KH
741
742
ad1746f5 743/* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2. */
aa72b389 744
df7492f9
KH
745#define EMIT_TWO_ASCII_BYTES(c1, c2) \
746 do { \
747 produced_chars += 2; \
748 *dst++ = (c1), *dst++ = (c2); \
749 } while (0)
aa72b389
KH
750
751
df7492f9 752/* Store a byte C in the place pointed by DST and increment DST to the
f10fe38f
PE
753 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP,
754 store in an appropriate multibyte form. The caller should
df7492f9
KH
755 declare and set the variables `dst' and `multibytep' appropriately
756 in advance. */
757
758#define EMIT_ONE_BYTE(c) \
759 do { \
760 produced_chars++; \
761 if (multibytep) \
762 { \
b25d760e 763 unsigned ch = (c); \
df7492f9
KH
764 if (ch >= 0x80) \
765 ch = BYTE8_TO_CHAR (ch); \
766 CHAR_STRING_ADVANCE (ch, dst); \
767 } \
768 else \
769 *dst++ = (c); \
aa72b389 770 } while (0)
aa72b389 771
aa72b389 772
df7492f9 773/* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
aa72b389 774
e19c3639
KH
775#define EMIT_TWO_BYTES(c1, c2) \
776 do { \
777 produced_chars += 2; \
778 if (multibytep) \
779 { \
b25d760e 780 unsigned ch; \
e19c3639
KH
781 \
782 ch = (c1); \
783 if (ch >= 0x80) \
784 ch = BYTE8_TO_CHAR (ch); \
785 CHAR_STRING_ADVANCE (ch, dst); \
786 ch = (c2); \
787 if (ch >= 0x80) \
788 ch = BYTE8_TO_CHAR (ch); \
789 CHAR_STRING_ADVANCE (ch, dst); \
790 } \
791 else \
792 { \
793 *dst++ = (c1); \
794 *dst++ = (c2); \
795 } \
aa72b389
KH
796 } while (0)
797
798
df7492f9
KH
799#define EMIT_THREE_BYTES(c1, c2, c3) \
800 do { \
801 EMIT_ONE_BYTE (c1); \
802 EMIT_TWO_BYTES (c2, c3); \
803 } while (0)
aa72b389 804
aa72b389 805
df7492f9
KH
806#define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
807 do { \
808 EMIT_TWO_BYTES (c1, c2); \
809 EMIT_TWO_BYTES (c3, c4); \
810 } while (0)
aa72b389 811
aa72b389 812
065e3595
KH
813static void
814record_conversion_result (struct coding_system *coding,
815 enum coding_result_code result)
816{
817 coding->result = result;
818 switch (result)
819 {
820 case CODING_RESULT_INSUFFICIENT_SRC:
821 Vlast_code_conversion_error = Qinsufficient_source;
822 break;
823 case CODING_RESULT_INCONSISTENT_EOL:
824 Vlast_code_conversion_error = Qinconsistent_eol;
825 break;
826 case CODING_RESULT_INVALID_SRC:
827 Vlast_code_conversion_error = Qinvalid_source;
828 break;
829 case CODING_RESULT_INTERRUPT:
830 Vlast_code_conversion_error = Qinterrupted;
831 break;
832 case CODING_RESULT_INSUFFICIENT_MEM:
833 Vlast_code_conversion_error = Qinsufficient_memory;
834 break;
ebaf11b6
KH
835 case CODING_RESULT_INSUFFICIENT_DST:
836 /* Don't record this error in Vlast_code_conversion_error
837 because it happens just temporarily and is resolved when the
838 whole conversion is finished. */
839 break;
409ea3a1
AS
840 case CODING_RESULT_SUCCESS:
841 break;
35befdaa
KH
842 default:
843 Vlast_code_conversion_error = intern ("Unknown error");
065e3595
KH
844 }
845}
846
5eb05ea3
KH
847/* These wrapper macros are used to preserve validity of pointers into
848 buffer text across calls to decode_char, encode_char, etc, which
849 could cause relocation of buffers if it loads a charset map,
850 because loading a charset map allocates large structures. */
851
df7492f9
KH
/* Set C to DECODE_CHAR (CHARSET, CODE).  If that call loaded a
   charset map and coding_change_source reports that CODING's source
   moved, shift the pointers SRC, SRC_BASE and SRC_END by the same
   number of bytes.  */
#define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
  do { \
    charset_map_loaded = 0; \
    c = DECODE_CHAR (charset, code); \
    if (charset_map_loaded) \
      { \
        ptrdiff_t offset = coding_change_source (coding); \
 \
        if (offset != 0) \
          { \
            src += offset; \
            src_base += offset; \
            src_end += offset; \
          } \
      } \
  } while (0)
866
5eb05ea3
KH
/* Set CODE to ENCODE_CHAR (CHARSET, C).  If that call loaded a
   charset map and coding_change_destination reports that CODING's
   destination moved, shift the pointers DST and DST_END by the same
   number of bytes.  */
#define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code) \
  do { \
    charset_map_loaded = 0; \
    code = ENCODE_CHAR (charset, c); \
    if (charset_map_loaded) \
      { \
        ptrdiff_t offset = coding_change_destination (coding); \
 \
        if (offset != 0) \
          { \
            dst += offset; \
            dst_end += offset; \
          } \
      } \
  } while (0)
880
/* Set CHARSET to char_charset (C, CHARSET_LIST, CODE_RETURN).  If
   that call loaded a charset map and coding_change_destination
   reports that CODING's destination moved, shift the pointers DST
   and DST_END by the same number of bytes.  */
#define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
  do { \
    charset_map_loaded = 0; \
    charset = char_charset (c, charset_list, code_return); \
    if (charset_map_loaded) \
      { \
        ptrdiff_t offset = coding_change_destination (coding); \
 \
        if (offset != 0) \
          { \
            dst += offset; \
            dst_end += offset; \
          } \
      } \
  } while (0)
894
/* Set RESULT to CHAR_CHARSET_P (C, CHARSET).  If that call loaded a
   charset map and coding_change_destination reports that CODING's
   destination moved, shift the pointers DST and DST_END by the same
   number of bytes.  */
#define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
  do { \
    charset_map_loaded = 0; \
    result = CHAR_CHARSET_P (c, charset); \
    if (charset_map_loaded) \
      { \
        ptrdiff_t offset = coding_change_destination (coding); \
 \
        if (offset != 0) \
          { \
            dst += offset; \
            dst_end += offset; \
          } \
      } \
  } while (0)
908
aa72b389 909
119852e7
KH
910/* Make sure there is room for BYTES more bytes at dst.  If dst +
   BYTES would reach dst_end, grow coding->destination through
   alloc_destination and update dst and dst_end.  The amount asked
   for is BYTES plus one byte for each element still pending between
   charbuf and charbuf_end.  We don't have to take care of
   coding->source, which may be relocated; that is handled by calling
   coding_set_source in encode_coding.

   The expansion refers to the variables `coding', `dst', `dst_end',
   `charbuf' and `charbuf_end' of the function it is used in.  */
914
df7492f9
KH
915#define ASSURE_DESTINATION(bytes) \
916 do { \
917 if (dst + (bytes) >= dst_end) \
918 { \
d311d28c 919 ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
df7492f9
KH
920 \
921 dst = alloc_destination (coding, more_bytes, dst); \
922 dst_end = coding->destination + coding->dst_bytes; \
923 } \
924 } while (0)
aa72b389 925
aa72b389 926
db274c7a 927/* Store the multibyte form of the character C at P, and advance P
   to the end of that multibyte form.  This used to differ from
   CHAR_STRING_ADVANCE by never calling MAYBE_UNIFY_CHAR, but nowadays
   CHAR_STRING_ADVANCE doesn't call MAYBE_UNIFY_CHAR either, so this
   is a plain alias for it.  */
db274c7a 931
eedec3ee 932#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p) CHAR_STRING_ADVANCE(c, p)
db274c7a
KH
933
934/* Return the character code of the character whose multibyte form
   is at P, and advance P to the end of that multibyte form.  This
   used to differ from STRING_CHAR_ADVANCE by never calling
   MAYBE_UNIFY_CHAR, but nowadays STRING_CHAR_ADVANCE doesn't call
   MAYBE_UNIFY_CHAR either, so this is a plain alias for it.  */
db274c7a 938
eedec3ee 939#define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
aa72b389 940
c1892f11 941/* Set coding->source from coding->src_object. */
5eb05ea3 942
c1892f11 943static void
971de7fb 944coding_set_source (struct coding_system *coding)
aa72b389 945{
df7492f9
KH
946 if (BUFFERP (coding->src_object))
947 {
2cb26057 948 struct buffer *buf = XBUFFER (coding->src_object);
aa72b389 949
df7492f9 950 if (coding->src_pos < 0)
2cb26057 951 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
df7492f9 952 else
2cb26057 953 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
aa72b389 954 }
df7492f9 955 else if (STRINGP (coding->src_object))
aa72b389 956 {
8f924df7 957 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
aa72b389 958 }
df7492f9 959 else
f38b440c
PE
960 {
961 /* Otherwise, the source is C string and is never relocated
962 automatically. Thus we don't have to update anything. */
963 }
df7492f9 964}
aa72b389 965
5eb05ea3 966
c1892f11
PE
967/* Set coding->source from coding->src_object, and return how many
968 bytes coding->source was changed. */
5eb05ea3 969
8f50130c 970static ptrdiff_t
c1892f11 971coding_change_source (struct coding_system *coding)
df7492f9 972{
c1892f11
PE
973 const unsigned char *orig = coding->source;
974 coding_set_source (coding);
975 return coding->source - orig;
976}
977
5eb05ea3 978
c1892f11
PE
979/* Set coding->destination from coding->dst_object. */
980
981static void
982coding_set_destination (struct coding_system *coding)
983{
df7492f9 984 if (BUFFERP (coding->dst_object))
aa72b389 985 {
a0241d01 986 if (BUFFERP (coding->src_object) && coding->src_pos < 0)
aa72b389 987 {
13818c30 988 coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
28f67a95
KH
989 coding->dst_bytes = (GAP_END_ADDR
990 - (coding->src_bytes - coding->consumed)
991 - coding->destination);
aa72b389 992 }
df7492f9 993 else
28f67a95
KH
994 {
995 /* We are sure that coding->dst_pos_byte is before the gap
996 of the buffer. */
997 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
13818c30 998 + coding->dst_pos_byte - BEG_BYTE);
28f67a95
KH
999 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1000 - coding->destination);
1001 }
df7492f9
KH
1002 }
1003 else
f38b440c
PE
1004 {
1005 /* Otherwise, the destination is C string and is never relocated
1006 automatically. Thus we don't have to update anything. */
1007 }
c1892f11
PE
1008}
1009
1010
1011/* Set coding->destination from coding->dst_object, and return how
1012 many bytes coding->destination was changed. */
1013
1014static ptrdiff_t
1015coding_change_destination (struct coding_system *coding)
1016{
1017 const unsigned char *orig = coding->destination;
1018 coding_set_destination (coding);
5eb05ea3 1019 return coding->destination - orig;
df7492f9
KH
1020}
1021
1022
1023static void
d311d28c 1024coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
df7492f9 1025{
c9d624c6 1026 if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
d1f3d2af 1027 string_overflow ();
38182d90
PE
1028 coding->destination = xrealloc (coding->destination,
1029 coding->dst_bytes + bytes);
df7492f9
KH
1030 coding->dst_bytes += bytes;
1031}
1032
1033static void
cf84bb53 1034coding_alloc_by_making_gap (struct coding_system *coding,
d311d28c 1035 ptrdiff_t gap_head_used, ptrdiff_t bytes)
df7492f9 1036{
db274c7a 1037 if (EQ (coding->src_object, coding->dst_object))
df7492f9 1038 {
db274c7a
KH
1039 /* The gap may contain the produced data at the head and not-yet
1040 consumed data at the tail. To preserve those data, we at
1041 first make the gap size to zero, then increase the gap
1042 size. */
d311d28c 1043 ptrdiff_t add = GAP_SIZE;
db274c7a
KH
1044
1045 GPT += gap_head_used, GPT_BYTE += gap_head_used;
1046 GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
df7492f9
KH
1047 make_gap (bytes);
1048 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
db274c7a 1049 GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
df7492f9 1050 }
730fff51 1051 else
eefd7278 1052 make_gap_1 (XBUFFER (coding->dst_object), bytes);
df7492f9 1053}
8f924df7 1054
df7492f9
KH
1055
1056static unsigned char *
d311d28c 1057alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
cf84bb53 1058 unsigned char *dst)
df7492f9 1059{
d311d28c 1060 ptrdiff_t offset = dst - coding->destination;
df7492f9
KH
1061
1062 if (BUFFERP (coding->dst_object))
db274c7a
KH
1063 {
1064 struct buffer *buf = XBUFFER (coding->dst_object);
1065
1066 coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1067 }
aa72b389 1068 else
df7492f9 1069 coding_alloc_by_realloc (coding, nbytes);
df7492f9
KH
1070 coding_set_destination (coding);
1071 dst = coding->destination + offset;
1072 return dst;
1073}
aa72b389 1074
ff0dacd7
KH
1075/** Macros for annotations. */
1076
ff0dacd7
KH
1077/* An annotation data is stored in the array coding->charbuf in this
1078 format:
69a80ea3 1079 [ -LENGTH ANNOTATION_MASK NCHARS ... ]
ff0dacd7
KH
1080 LENGTH is the number of elements in the annotation.
1081 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
69a80ea3 1082 NCHARS is the number of characters in the text annotated.
ff0dacd7
KH
1083
1084 The format of the following elements depends on ANNOTATION_MASK.
1085
1086 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1087 follow:
e951386e
KH
1088 ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1089
1090 NBYTES is the number of bytes specified in the header part of
1091 old-style emacs-mule encoding, or 0 for the other kind of
1092 composition.
1093
ff0dacd7 1094 METHOD is one of enum composition_method.
e951386e 1095
ad1746f5 1096 Optional COMPOSITION-COMPONENTS are characters and composition
ff0dacd7
KH
1097 rules.
1098
1099 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
e951386e
KH
1100 follows.
1101
1102 If ANNOTATION_MASK is 0, this annotation is just a space holder to
1103 recover from an invalid annotation, and should be skipped by
1104 produce_annotation. */
1105
1106/* Maximum length of the header of annotation data.  The longest
   header is the five-element one written by ADD_COMPOSITION_DATA:
   -LENGTH, ANNOTATION_MASK, NCHARS, NBYTES and METHOD.  */
1107#define MAX_ANNOTATION_LENGTH 5
ff0dacd7 1108
/* Store the common three-element annotation header
   [ -LEN MASK NCHARS ] at BUF, advance BUF past it, and mark the
   coding system as annotated.

   BUF must be a modifiable pointer lvalue; it is evaluated three
   times.  The expansion refers to the variable `coding' of the
   function it is used in.

   The expansion deliberately does not end in a semicolon, so that
   the macro behaves like a single statement and can be used as the
   body of an `if' that has an `else'.  */
#define ADD_ANNOTATION_DATA(buf, len, mask, nchars) \
  do { \
    *(buf)++ = -(len); \
    *(buf)++ = (mask); \
    *(buf)++ = (nchars); \
    coding->annotated = 1; \
  } while (0)
1116
/* Store the five-element header of a composition annotation at BUF
   and advance BUF past it:
     [ -5 CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]

   BUF must be a modifiable pointer lvalue; it is evaluated several
   times.  The arguments are parenthesized in the expansion so that
   an expression such as `*pp' works for BUF in the same way as it
   does in ADD_ANNOTATION_DATA.  */
#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method) \
  do { \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes); \
    *(buf)++ = (method); \
  } while (0)
1123
1124
69a80ea3
KH
/* Store the four-element charset annotation at BUF and advance BUF
   past it:
     [ -4 CODING_ANNOTATE_CHARSET_MASK NCHARS ID ]

   BUF must be a modifiable pointer lvalue; it is evaluated several
   times.  The arguments are parenthesized in the expansion so that
   an expression such as `*pp' works for BUF in the same way as it
   does in ADD_ANNOTATION_DATA.  */
#define ADD_CHARSET_DATA(buf, nchars, id) \
  do { \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id); \
  } while (0)
1130
df7492f9
KH
1131\f
1132/*** 2. Emacs' internal format (emacs-utf-8) ***/
1133
1134
1135
1136\f
1137/*** 3. UTF-8 ***/
1138
1139/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
f10fe38f 1140 Return true if a text is encoded in UTF-8. */
df7492f9
KH
1141
/* Classify the octet C by its high bits:
     0xxxxxxx  one-octet (ASCII) character; note that the test is a
               plain `< 0x80', so negative values also satisfy it
     10xxxxxx  extra (trailing) octet
     110xxxxx  leading octet of a 2-octet sequence
     1110xxxx  leading octet of a 3-octet sequence
     11110xxx  leading octet of a 4-octet sequence
     111110xx  leading octet of a 5-octet sequence  */
1142#define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1143#define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1144#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1145#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1146#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1147#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1148
a470d443
KH
/* The three octets of the UTF-8 signature (BOM), i.e. the UTF-8
   encoding of U+FEFF.  */
1149#define UTF_8_BOM_1 0xEF
1150#define UTF_8_BOM_2 0xBB
1151#define UTF_8_BOM_3 0xBF
1152
/* Check whether the source of CODING can be UTF-8, recording the
   verdict in DETECT_INFO.

   Return 0, with CATEGORY_MASK_UTF_8 added to DETECT_INFO->rejected,
   if a malformed sequence is seen, or if the source ends in the
   middle of a sequence while CODING_MODE_LAST_BLOCK is set.
   Otherwise return 1; in that case DETECT_INFO->found gets both the
   with-signature and without-signature categories when the source
   starts with the BOM, and only the without-signature category when
   some multi-octet sequence was seen but no BOM.  */
f10fe38f 1153static bool
cf84bb53
JB
1154detect_coding_utf_8 (struct coding_system *coding,
1155 struct coding_detection_info *detect_info)
df7492f9 1156{
065e3595 1157 const unsigned char *src = coding->source, *src_base;
8f924df7 1158 const unsigned char *src_end = coding->source + coding->src_bytes;
f10fe38f 1159 bool multibytep = coding->src_multibyte;
d311d28c 1160 ptrdiff_t consumed_chars = 0;
f10fe38f
PE
1161 bool bom_found = 0;
1162 bool found = 0;
df7492f9 1163
ff0dacd7 1164 detect_info->checked |= CATEGORY_MASK_UTF_8;
df7492f9
KH
1165 /* A coding system of this category is always ASCII compatible. */
1166 src += coding->head_ascii;
1167
1168 while (1)
aa72b389 1169 {
df7492f9 1170 int c, c1, c2, c3, c4;
aa72b389 1171
/* SRC_BASE marks the start of the sequence being examined; it is
   compared with SRC at no_more_source to tell whether the source
   ended in the middle of a sequence.  */
065e3595 1172 src_base = src;
df7492f9 1173 ONE_MORE_BYTE (c);
065e3595 1174 if (c < 0 || UTF_8_1_OCTET_P (c))
df7492f9
KH
1175 continue;
/* From here on, every further octet must be an extra octet; anything
   else leaves the loop and rejects UTF-8.  */
1176 ONE_MORE_BYTE (c1);
065e3595 1177 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
df7492f9
KH
1178 break;
1179 if (UTF_8_2_OCTET_LEADING_P (c))
aa72b389 1180 {
a470d443 1181 found = 1;
df7492f9 1182 continue;
aa72b389 1183 }
df7492f9 1184 ONE_MORE_BYTE (c2);
065e3595 1185 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
df7492f9
KH
1186 break;
1187 if (UTF_8_3_OCTET_LEADING_P (c))
aa72b389 1188 {
a470d443
KH
1189 found = 1;
/* A signature only counts at the very beginning of the source.  */
1190 if (src_base == coding->source
1191 && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1192 bom_found = 1;
df7492f9 1193 continue;
aa72b389 1194 }
df7492f9 1195 ONE_MORE_BYTE (c3);
065e3595 1196 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
df7492f9
KH
1197 break;
1198 if (UTF_8_4_OCTET_LEADING_P (c))
aa72b389 1199 {
a470d443 1200 found = 1;
df7492f9
KH
1201 continue;
1202 }
1203 ONE_MORE_BYTE (c4);
065e3595 1204 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
df7492f9
KH
1205 break;
1206 if (UTF_8_5_OCTET_LEADING_P (c))
1207 {
a470d443 1208 found = 1;
df7492f9
KH
1209 continue;
1210 }
/* C is not a valid leading octet of any supported length.  */
1211 break;
aa72b389 1212 }
/* Reached only by `break': a malformed sequence was found.  */
ff0dacd7 1213 detect_info->rejected |= CATEGORY_MASK_UTF_8;
df7492f9 1214 return 0;
aa72b389 1215
df7492f9 1216 no_more_source:
/* The source ran out.  If that happened in the middle of a sequence
   and no more data will follow, the text is not valid UTF-8.  */
065e3595 1217 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
aa72b389 1218 {
ff0dacd7 1219 detect_info->rejected |= CATEGORY_MASK_UTF_8;
89528eb3 1220 return 0;
aa72b389 1221 }
a470d443
KH
1222 if (bom_found)
1223 {
1224 /* The first character 0xFEFF doesn't necessarily mean a BOM, so the without-signature category stays possible too. */
1225 detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1226 }
1227 else
1228 {
1229 detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
0e17387a
KH
1230 if (found)
1231 detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
a470d443 1232 }
ff0dacd7 1233 return 1;
aa72b389
KH
1234}
1235
4ed46869 1236
b73bfc1c 1237static void
971de7fb 1238decode_coding_utf_8 (struct coding_system *coding)
b73bfc1c 1239{
8f924df7
KH
1240 const unsigned char *src = coding->source + coding->consumed;
1241 const unsigned char *src_end = coding->source + coding->src_bytes;
1242 const unsigned char *src_base;
69a80ea3
KH
1243 int *charbuf = coding->charbuf + coding->charbuf_used;
1244 int *charbuf_end = coding->charbuf + coding->charbuf_size;
d311d28c 1245 ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
f10fe38f 1246 bool multibytep = coding->src_multibyte;
a470d443 1247 enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
f10fe38f
PE
1248 bool eol_dos
1249 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 1250 int byte_after_cr = -1;
4ed46869 1251
a470d443
KH
1252 if (bom != utf_without_bom)
1253 {
1254 int c1, c2, c3;
1255
1256 src_base = src;
1257 ONE_MORE_BYTE (c1);
1258 if (! UTF_8_3_OCTET_LEADING_P (c1))
1259 src = src_base;
1260 else
1261 {
159bd5a2 1262 ONE_MORE_BYTE (c2);
a470d443
KH
1263 if (! UTF_8_EXTRA_OCTET_P (c2))
1264 src = src_base;
1265 else
1266 {
159bd5a2 1267 ONE_MORE_BYTE (c3);
a470d443
KH
1268 if (! UTF_8_EXTRA_OCTET_P (c3))
1269 src = src_base;
1270 else
1271 {
1272 if ((c1 != UTF_8_BOM_1)
1273 || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1274 src = src_base;
1275 else
1276 CODING_UTF_8_BOM (coding) = utf_without_bom;
1277 }
1278 }
1279 }
1280 }
1281 CODING_UTF_8_BOM (coding) = utf_without_bom;
1282
df7492f9 1283 while (1)
b73bfc1c 1284 {
df7492f9 1285 int c, c1, c2, c3, c4, c5;
ec6d2bb8 1286
df7492f9
KH
1287 src_base = src;
1288 consumed_chars_base = consumed_chars;
4af310db 1289
df7492f9 1290 if (charbuf >= charbuf_end)
b71f6f73
KH
1291 {
1292 if (byte_after_cr >= 0)
1293 src_base--;
1294 break;
1295 }
df7492f9 1296
119852e7
KH
1297 if (byte_after_cr >= 0)
1298 c1 = byte_after_cr, byte_after_cr = -1;
1299 else
1300 ONE_MORE_BYTE (c1);
065e3595
KH
1301 if (c1 < 0)
1302 {
1303 c = - c1;
1304 }
1a4990fb 1305 else if (UTF_8_1_OCTET_P (c1))
df7492f9 1306 {
2735d060 1307 if (eol_dos && c1 == '\r')
119852e7 1308 ONE_MORE_BYTE (byte_after_cr);
df7492f9 1309 c = c1;
4af310db 1310 }
df7492f9 1311 else
4af310db 1312 {
df7492f9 1313 ONE_MORE_BYTE (c2);
065e3595 1314 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
df7492f9
KH
1315 goto invalid_code;
1316 if (UTF_8_2_OCTET_LEADING_P (c1))
4af310db 1317 {
b0edb2c5
DL
1318 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1319 /* Reject overlong sequences here and below. Encoders
1320 producing them are incorrect, they can be misleading,
1321 and they mess up read/write invariance. */
1322 if (c < 128)
1323 goto invalid_code;
4af310db 1324 }
df7492f9 1325 else
aa72b389 1326 {
df7492f9 1327 ONE_MORE_BYTE (c3);
065e3595 1328 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
df7492f9
KH
1329 goto invalid_code;
1330 if (UTF_8_3_OCTET_LEADING_P (c1))
b0edb2c5
DL
1331 {
1332 c = (((c1 & 0xF) << 12)
1333 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
72fe1301
DL
1334 if (c < 0x800
1335 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
b0edb2c5
DL
1336 goto invalid_code;
1337 }
df7492f9
KH
1338 else
1339 {
1340 ONE_MORE_BYTE (c4);
065e3595 1341 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
df7492f9
KH
1342 goto invalid_code;
1343 if (UTF_8_4_OCTET_LEADING_P (c1))
b0edb2c5 1344 {
df7492f9
KH
1345 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1346 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
b0edb2c5
DL
1347 if (c < 0x10000)
1348 goto invalid_code;
1349 }
df7492f9
KH
1350 else
1351 {
1352 ONE_MORE_BYTE (c5);
065e3595 1353 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
df7492f9
KH
1354 goto invalid_code;
1355 if (UTF_8_5_OCTET_LEADING_P (c1))
1356 {
1357 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1358 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1359 | (c5 & 0x3F));
b0edb2c5 1360 if ((c > MAX_CHAR) || (c < 0x200000))
df7492f9
KH
1361 goto invalid_code;
1362 }
1363 else
1364 goto invalid_code;
1365 }
1366 }
aa72b389 1367 }
b73bfc1c 1368 }
df7492f9
KH
1369
1370 *charbuf++ = c;
1371 continue;
1372
1373 invalid_code:
1374 src = src_base;
1375 consumed_chars = consumed_chars_base;
1376 ONE_MORE_BYTE (c);
1377 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1378 coding->errors++;
aa72b389
KH
1379 }
1380
df7492f9
KH
1381 no_more_source:
1382 coding->consumed_char += consumed_chars_base;
1383 coding->consumed = src_base - coding->source;
1384 coding->charbuf_used = charbuf - coding->charbuf;
1385}
1386
1387
f10fe38f 1388static bool
971de7fb 1389encode_coding_utf_8 (struct coding_system *coding)
df7492f9 1390{
f10fe38f 1391 bool multibytep = coding->dst_multibyte;
df7492f9
KH
1392 int *charbuf = coding->charbuf;
1393 int *charbuf_end = charbuf + coding->charbuf_used;
1394 unsigned char *dst = coding->destination + coding->produced;
1395 unsigned char *dst_end = coding->destination + coding->dst_bytes;
d311d28c 1396 ptrdiff_t produced_chars = 0;
df7492f9
KH
1397 int c;
1398
a470d443
KH
1399 if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1400 {
1401 ASSURE_DESTINATION (3);
1402 EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1403 CODING_UTF_8_BOM (coding) = utf_without_bom;
1404 }
1405
df7492f9 1406 if (multibytep)
aa72b389 1407 {
df7492f9
KH
1408 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1409
1410 while (charbuf < charbuf_end)
b73bfc1c 1411 {
df7492f9 1412 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
8f924df7 1413
df7492f9
KH
1414 ASSURE_DESTINATION (safe_room);
1415 c = *charbuf++;
28f67a95
KH
1416 if (CHAR_BYTE8_P (c))
1417 {
1418 c = CHAR_TO_BYTE8 (c);
1419 EMIT_ONE_BYTE (c);
1420 }
1421 else
1422 {
db274c7a 1423 CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
28f67a95
KH
1424 for (p = str; p < pend; p++)
1425 EMIT_ONE_BYTE (*p);
1426 }
b73bfc1c 1427 }
aa72b389 1428 }
df7492f9
KH
1429 else
1430 {
1431 int safe_room = MAX_MULTIBYTE_LENGTH;
1432
1433 while (charbuf < charbuf_end)
b73bfc1c 1434 {
df7492f9
KH
1435 ASSURE_DESTINATION (safe_room);
1436 c = *charbuf++;
f03caae0
KH
1437 if (CHAR_BYTE8_P (c))
1438 *dst++ = CHAR_TO_BYTE8 (c);
1439 else
db274c7a 1440 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
df7492f9 1441 produced_chars++;
4ed46869
KH
1442 }
1443 }
065e3595 1444 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
1445 coding->produced_char += produced_chars;
1446 coding->produced = dst - coding->destination;
1447 return 0;
4ed46869
KH
1448}
1449
b73bfc1c 1450
df7492f9 1451/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
f10fe38f 1452 Return true if a text is encoded in one of UTF-16 based coding systems. */
aa72b389 1453
df7492f9
KH
/* True if the 16-bit code unit VAL is in 0xD800..0xDBFF, i.e. is the
   first (high) half of a surrogate pair.  */
1454#define UTF_16_HIGH_SURROGATE_P(val) \
1455 (((val) & 0xFC00) == 0xD800)
1456
/* True if the 16-bit code unit VAL is in 0xDC00..0xDFFF, i.e. is the
   second (low) half of a surrogate pair.  */
1457#define UTF_16_LOW_SURROGATE_P(val) \
1458 (((val) & 0xFC00) == 0xDC00)
93dec019 1459
aa72b389 1460
/* Check whether the source of CODING can be UTF-16, recording the
   verdict in DETECT_INFO.

   A leading FF FE or FE FF marks the little- or big-endian category
   (plus the auto category) as found and rejects the opposite
   endianness and both BOM-less categories; 1 is then returned.
   Without a BOM, the BOM-requiring categories are rejected and the
   BOM-less ones are judged by how many distinct byte values occur at
   even and at odd offsets; that path always returns 0.  0 is also
   returned, with every UTF-16 category rejected, when the last block
   holds an odd number of source characters or when the second byte
   of the first pair is negative.  1 is returned if control reaches
   no_more_source (presumably when TWO_MORE_BYTES runs out of source
   -- TODO confirm against that macro).  */
f10fe38f 1461static bool
cf84bb53
JB
1462detect_coding_utf_16 (struct coding_system *coding,
1463 struct coding_detection_info *detect_info)
aa72b389 1464{
ef1b0ba7 1465 const unsigned char *src = coding->source;
8f924df7 1466 const unsigned char *src_end = coding->source + coding->src_bytes;
f10fe38f 1467 bool multibytep = coding->src_multibyte;
df7492f9 1468 int c1, c2;
aa72b389 1469
ff0dacd7 1470 detect_info->checked |= CATEGORY_MASK_UTF_16;
ff0dacd7 1471 if (coding->mode & CODING_MODE_LAST_BLOCK
24a73b0a 1472 && (coding->src_chars & 1))
ff0dacd7
KH
1473 {
1474 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1475 return 0;
1476 }
24a73b0a 1477
f56a4450 1478 TWO_MORE_BYTES (c1, c2);
/* FF FE: the BOM as stored in little-endian order.  */
df7492f9 1479 if ((c1 == 0xFF) && (c2 == 0xFE))
aa72b389 1480 {
b49a1807
KH
1481 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1482 | CATEGORY_MASK_UTF_16_AUTO);
24a73b0a
KH
1483 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1484 | CATEGORY_MASK_UTF_16_BE_NOSIG
1485 | CATEGORY_MASK_UTF_16_LE_NOSIG);
aa72b389 1486 }
/* FE FF: the BOM as stored in big-endian order.  */
df7492f9 1487 else if ((c1 == 0xFE) && (c2 == 0xFF))
ff0dacd7 1488 {
b49a1807
KH
1489 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1490 | CATEGORY_MASK_UTF_16_AUTO);
24a73b0a
KH
1491 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1492 | CATEGORY_MASK_UTF_16_BE_NOSIG
1493 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1494 }
220eeac9 1495 else if (c2 < 0)
f56a4450
KH
1496 {
1497 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1498 return 0;
1499 }
2f3cbb32 1500 else
24a73b0a 1501 {
2f3cbb32
KH
1502 /* We check the dispersion of Eth and Oth bytes where E is even and
1503 O is odd. If both are high, we assume binary data.*/
/* e[] and o[] flag the byte values already seen at even and odd
   offsets; e_num and o_num count the distinct values seen.
   NOTE(review): c1 is used as an index without a sign check (only
   c2 is tested) -- confirm TWO_MORE_BYTES never yields a negative
   c1 together with a non-negative c2.  */
1504 unsigned char e[256], o[256];
1505 unsigned e_num = 1, o_num = 1;
1506
1507 memset (e, 0, 256);
1508 memset (o, 0, 256);
1509 e[c1] = 1;
1510 o[c2] = 1;
1511
/* No BOM was seen, so the categories that need one are out.  */
cc13543e
KH
1512 detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1513 |CATEGORY_MASK_UTF_16_BE
1514 | CATEGORY_MASK_UTF_16_LE);
2f3cbb32 1515
/* Keep scanning until every UTF-16 category has been rejected.  */
7f1faf1c
KH
1516 while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1517 != CATEGORY_MASK_UTF_16)
2f3cbb32 1518 {
f56a4450 1519 TWO_MORE_BYTES (c1, c2);
220eeac9 1520 if (c2 < 0)
f56a4450 1521 break;
2f3cbb32
KH
1522 if (! e[c1])
1523 {
1524 e[c1] = 1;
1525 e_num++;
/* 128 or more distinct values at even offsets: reject BOM-less
   big-endian.  */
cc13543e
KH
1526 if (e_num >= 128)
1527 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
2f3cbb32
KH
1528 }
1529 if (! o[c2])
1530 {
977b85f4 1531 o[c2] = 1;
2f3cbb32 1532 o_num++;
/* 128 or more distinct values at odd offsets: reject BOM-less
   little-endian.  */
cc13543e
KH
1533 if (o_num >= 128)
1534 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
2f3cbb32
KH
1535 }
1536 }
2f3cbb32 1537 return 0;
ff0dacd7 1538 }
2f3cbb32 1539
df7492f9 1540 no_more_source:
ff0dacd7 1541 return 1;
df7492f9 1542}
aa72b389 1543
df7492f9 1544static void
971de7fb 1545decode_coding_utf_16 (struct coding_system *coding)
df7492f9 1546{
8f924df7
KH
1547 const unsigned char *src = coding->source + coding->consumed;
1548 const unsigned char *src_end = coding->source + coding->src_bytes;
1549 const unsigned char *src_base;
69a80ea3 1550 int *charbuf = coding->charbuf + coding->charbuf_used;
df80c7f0
KH
1551 /* We may produces at most 3 chars in one loop. */
1552 int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
d311d28c 1553 ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
f10fe38f 1554 bool multibytep = coding->src_multibyte;
a470d443 1555 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
df7492f9
KH
1556 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1557 int surrogate = CODING_UTF_16_SURROGATE (coding);
f10fe38f
PE
1558 bool eol_dos
1559 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 1560 int byte_after_cr1 = -1, byte_after_cr2 = -1;
df7492f9 1561
a470d443 1562 if (bom == utf_with_bom)
aa72b389 1563 {
df7492f9 1564 int c, c1, c2;
4af310db 1565
aa72b389 1566 src_base = src;
df7492f9
KH
1567 ONE_MORE_BYTE (c1);
1568 ONE_MORE_BYTE (c2);
e19c3639 1569 c = (c1 << 8) | c2;
aa72b389 1570
b49a1807
KH
1571 if (endian == utf_16_big_endian
1572 ? c != 0xFEFF : c != 0xFFFE)
aa72b389 1573 {
b49a1807
KH
1574 /* The first two bytes are not BOM. Treat them as bytes
1575 for a normal character. */
1576 src = src_base;
1577 coding->errors++;
aa72b389 1578 }
a470d443 1579 CODING_UTF_16_BOM (coding) = utf_without_bom;
b49a1807 1580 }
a470d443 1581 else if (bom == utf_detect_bom)
b49a1807
KH
1582 {
1583 /* We have already tried to detect BOM and failed in
1584 detect_coding. */
a470d443 1585 CODING_UTF_16_BOM (coding) = utf_without_bom;
df7492f9 1586 }
aa72b389 1587
df7492f9
KH
1588 while (1)
1589 {
1590 int c, c1, c2;
1591
1592 src_base = src;
1593 consumed_chars_base = consumed_chars;
1594
df80c7f0 1595 if (charbuf >= charbuf_end)
b71f6f73
KH
1596 {
1597 if (byte_after_cr1 >= 0)
1598 src_base -= 2;
1599 break;
1600 }
df7492f9 1601
119852e7
KH
1602 if (byte_after_cr1 >= 0)
1603 c1 = byte_after_cr1, byte_after_cr1 = -1;
1604 else
1605 ONE_MORE_BYTE (c1);
065e3595
KH
1606 if (c1 < 0)
1607 {
1608 *charbuf++ = -c1;
1609 continue;
1610 }
119852e7
KH
1611 if (byte_after_cr2 >= 0)
1612 c2 = byte_after_cr2, byte_after_cr2 = -1;
1613 else
1614 ONE_MORE_BYTE (c2);
065e3595
KH
1615 if (c2 < 0)
1616 {
1617 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1618 *charbuf++ = -c2;
1619 continue;
1620 }
df7492f9 1621 c = (endian == utf_16_big_endian
e19c3639 1622 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
119852e7 1623
df7492f9 1624 if (surrogate)
fd3ae0b9 1625 {
df7492f9 1626 if (! UTF_16_LOW_SURROGATE_P (c))
fd3ae0b9 1627 {
df7492f9
KH
1628 if (endian == utf_16_big_endian)
1629 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1630 else
1631 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1632 *charbuf++ = c1;
1633 *charbuf++ = c2;
1634 coding->errors++;
1635 if (UTF_16_HIGH_SURROGATE_P (c))
1636 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
fd3ae0b9 1637 else
df7492f9 1638 *charbuf++ = c;
fd3ae0b9
KH
1639 }
1640 else
df7492f9
KH
1641 {
1642 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1643 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
29f7ffd0 1644 *charbuf++ = 0x10000 + c;
df7492f9 1645 }
fd3ae0b9 1646 }
aa72b389 1647 else
df7492f9
KH
1648 {
1649 if (UTF_16_HIGH_SURROGATE_P (c))
1650 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1651 else
119852e7 1652 {
2735d060 1653 if (eol_dos && c == '\r')
119852e7
KH
1654 {
1655 ONE_MORE_BYTE (byte_after_cr1);
1656 ONE_MORE_BYTE (byte_after_cr2);
1657 }
1658 *charbuf++ = c;
1659 }
8f924df7 1660 }
aa72b389 1661 }
df7492f9
KH
1662
1663 no_more_source:
1664 coding->consumed_char += consumed_chars_base;
1665 coding->consumed = src_base - coding->source;
1666 coding->charbuf_used = charbuf - coding->charbuf;
aa72b389 1667}
b73bfc1c 1668
f10fe38f 1669static bool
971de7fb 1670encode_coding_utf_16 (struct coding_system *coding)
df7492f9 1671{
f10fe38f 1672 bool multibytep = coding->dst_multibyte;
df7492f9
KH
1673 int *charbuf = coding->charbuf;
1674 int *charbuf_end = charbuf + coding->charbuf_used;
1675 unsigned char *dst = coding->destination + coding->produced;
1676 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1677 int safe_room = 8;
a470d443 1678 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
f10fe38f 1679 bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
d311d28c 1680 ptrdiff_t produced_chars = 0;
df7492f9 1681 int c;
4ed46869 1682
a470d443 1683 if (bom != utf_without_bom)
df7492f9
KH
1684 {
1685 ASSURE_DESTINATION (safe_room);
1686 if (big_endian)
df7492f9 1687 EMIT_TWO_BYTES (0xFE, 0xFF);
880cf180
KH
1688 else
1689 EMIT_TWO_BYTES (0xFF, 0xFE);
a470d443 1690 CODING_UTF_16_BOM (coding) = utf_without_bom;
df7492f9
KH
1691 }
1692
1693 while (charbuf < charbuf_end)
1694 {
1695 ASSURE_DESTINATION (safe_room);
1696 c = *charbuf++;
60afa08d 1697 if (c > MAX_UNICODE_CHAR)
e19c3639 1698 c = coding->default_char;
df7492f9
KH
1699
1700 if (c < 0x10000)
1701 {
1702 if (big_endian)
1703 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1704 else
1705 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1706 }
1707 else
1708 {
1709 int c1, c2;
1710
1711 c -= 0x10000;
1712 c1 = (c >> 10) + 0xD800;
1713 c2 = (c & 0x3FF) + 0xDC00;
1714 if (big_endian)
1715 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1716 else
1717 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1718 }
1719 }
065e3595 1720 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
1721 coding->produced = dst - coding->destination;
1722 coding->produced_char += produced_chars;
1723 return 0;
1724}
1725
1726\f
1727/*** 6. Old Emacs' internal format (emacs-mule) ***/
1728
1729/* Emacs' internal format for representation of multiple character
1730 sets is a kind of multi-byte encoding, i.e. characters are
1731 represented by variable-length sequences of one-byte codes.
1732
1733 ASCII characters and control characters (e.g. `tab', `newline') are
1734 represented by one-byte sequences which are their ASCII codes, in
1735 the range 0x00 through 0x7F.
1736
1737 8-bit characters of the range 0x80..0x9F are represented by
1738 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1739 code + 0x20).
1740
1741 8-bit characters of the range 0xA0..0xFF are represented by
1742 one-byte sequences which are their 8-bit code.
1743
1744 The other characters are represented by a sequence of `base
1745 leading-code', optional `extended leading-code', and one or two
1746 `position-code's. The length of the sequence is determined by the
1747 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1748 whereas extended leading-code and position-code take the range 0xA0
1749 through 0xFF. See `charset.h' for more details about leading-code
1750 and position-code.
1751
1752 --- CODE RANGE of Emacs' internal format ---
1753 character set range
1754 ------------- -----
1755 ascii 0x00..0x7F
1756 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1757 eight-bit-graphic 0xA0..0xFF
1758 ELSE 0x81..0x9D + [0xA0..0xFF]+
1759 ---------------------------------------------
1760
1761 As this is the internal character representation, the format is
1762 usually not used externally (i.e. in a file or in a data sent to a
1763 process). But, it is possible to have a text externally in this
1764 format (i.e. by encoding by the coding system `emacs-mule').
1765
1766 In that case, a sequence of one-byte codes has a slightly different
1767 form.
1768
1769 At first, all characters in eight-bit-control are represented by
1770 one-byte sequences which are their 8-bit code.
1771
1772 Next, character composition data are represented by the byte
1773 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1774 where,
e951386e 1775 METHOD is 0xF2 plus one of composition method (enum
df7492f9
KH
1776 composition_method),
1777
1778 BYTES is 0xA0 plus a byte length of this composition data,
1779
e951386e 1780 CHARS is 0xA0 plus a number of characters composed by this
df7492f9
KH
1781 data,
1782
ad1746f5 1783 COMPONENTs are characters of multibyte form or composition
df7492f9
KH
1784 rules encoded by two-byte of ASCII codes.
1785
1786 In addition, for backward compatibility, the following formats are
1787 also recognized as composition data on decoding.
1788
1789 0x80 MSEQ ...
1790 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1791
1792 Here,
1793 MSEQ is a multibyte form but in these special format:
1794 ASCII: 0xA0 ASCII_CODE+0x80,
1795 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1796 RULE is a one byte code of the range 0xA0..0xF0 that
1797 represents a composition rule.
1798 */
1799
/* Table indexed by a byte value.  detect_coding_emacs_mule treats
   emacs_mule_bytes[C] as the total number of bytes in the sequence
   that starts with the byte C (it expects emacs_mule_bytes[C] - 1
   further bytes).  The table is not filled in here; presumably it is
   initialized elsewhere -- TODO confirm.  */
1800char emacs_mule_bytes[256];
1801
e951386e
KH
1802
1803/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
f10fe38f 1804 Return true if a text is encoded in 'emacs-mule'. */
/* CATEGORY_MASK_EMACS_MULE is always added to DETECT_INFO->checked.
   When the text is rejected, the same mask is added to
   DETECT_INFO->rejected and 0 is returned.  Otherwise the bits
   collected in `found' are added to DETECT_INFO->found and 1 is
   returned.  Scanning starts after the first CODING->head_ascii
   bytes of CODING->source.  */
e951386e 1805
f10fe38f 1806static bool
cf84bb53
JB
1807detect_coding_emacs_mule (struct coding_system *coding,
1808 struct coding_detection_info *detect_info)
e951386e
KH
1809{
1810 const unsigned char *src = coding->source, *src_base;
1811 const unsigned char *src_end = coding->source + coding->src_bytes;
f10fe38f 1812 bool multibytep = coding->src_multibyte;
d311d28c 1813 ptrdiff_t consumed_chars = 0;
e951386e
KH
1814 int c;
1815 int found = 0;
1816
1817 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1818 /* A coding system of this category is always ASCII compatible. */
1819 src += coding->head_ascii;
1820
1821 while (1)
1822 {
1823 src_base = src;
1824 ONE_MORE_BYTE (c);
1825 if (c < 0)
1826 continue;
1827 if (c == 0x80)
1828 {
1829 /* Perhaps the start of composite character. We simply skip
1830 it because analyzing it is too heavy for detecting. But,
1831 at least, we check that the composite character
1832 consists of more than 4 bytes. */
2735d060 1833 const unsigned char *src_start;
e951386e
KH
1834
1835 repeat:
2735d060 1836 src_start = src;
e951386e
KH
1837 do
1838 {
1839 ONE_MORE_BYTE (c);
1840 }
1841 while (c >= 0xA0);
1842
2735d060 1843 if (src - src_start <= 4)
e951386e
KH
1844 break;
1845 found = CATEGORY_MASK_EMACS_MULE;
1846 if (c == 0x80)
1847 goto repeat;
1848 }
1849
1850 if (c < 0x80)
1851 {
/* ESC, SI and SO end the scan; leaving the loop here leads to the
   rejection code just after it.  */
1852 if (c < 0x20
1853 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1854 break;
1855 }
1856 else
1857 {
/* C is a leading byte: the table gives the total sequence length, and
   every following byte must be >= 0xA0.  */
396475b7 1858 int more_bytes = emacs_mule_bytes[c] - 1;
e951386e
KH
1859
1860 while (more_bytes > 0)
1861 {
1862 ONE_MORE_BYTE (c);
1863 if (c < 0xA0)
1864 {
1865 src--; /* Unread the last byte. */
1866 break;
1867 }
1868 more_bytes--;
1869 }
1870 if (more_bytes != 0)
1871 break;
1872 found = CATEGORY_MASK_EMACS_MULE;
1873 }
1874 }
/* Reached only by `break' from the loop above: the text is not
   emacs-mule.  */
1875 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1876 return 0;
1877
/* Never reached by falling through (the return above precedes it).
   Presumably this is where ONE_MORE_BYTE jumps when SRC reaches
   SRC_END -- that macro is defined outside this section, confirm
   there.  A sequence left incomplete (SRC_BASE < SRC) in the last
   block is rejected.  */
1878 no_more_source:
1879 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1880 {
1881 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1882 return 0;
1883 }
1884 detect_info->found |= found;
1885 return 1;
1886}
1887
1888
1889/* Parse emacs-mule multibyte sequence at SRC and return the decoded
1890 character. If CMP_STATUS indicates that we must expect MSEQ or
1891 RULE described above, decode it and return the negative value of
685ebdc8 1892 the decoded character or rule. If an invalid byte is found, return
e951386e
KH
1893 -1. If SRC is too short, return -2. */
/* On the normal exit path *NBYTES receives the number of bytes
   consumed from SRC, *NCHARS the value of the local `consumed_chars'
   counter, and, when ID is non-null, *ID the charset id that was
   used.  On the early return for a RULE byte (old-form composition
   whose state is not COMPOSING_CHAR) only *NBYTES and *NCHARS are
   set; *ID is left untouched.  On the -1 and -2 returns none of the
   output parameters is written.  */
1894
e2f1bab9 1895static int
cf84bb53
JB
1896emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1897 int *nbytes, int *nchars, int *id,
1898 struct composition_status *cmp_status)
df7492f9 1899{
8f924df7
KH
1900 const unsigned char *src_end = coding->source + coding->src_bytes;
1901 const unsigned char *src_base = src;
f10fe38f 1902 bool multibytep = coding->src_multibyte;
2735d060 1903 int charset_ID;
df7492f9
KH
1904 unsigned code;
1905 int c;
1906 int consumed_chars = 0;
f10fe38f 1907 bool mseq_found = 0;
df7492f9
KH
1908
1909 ONE_MORE_BYTE (c);
/* A negative value from ONE_MORE_BYTE is returned as the character
   -C with the charset taken from emacs_mule_charset[0].
   NOTE(review): presumably this is a character that was already in
   multibyte form -- confirm against the ONE_MORE_BYTE definition.  */
065e3595 1910 if (c < 0)
df7492f9 1911 {
065e3595 1912 c = -c;
2735d060 1913 charset_ID = emacs_mule_charset[0];
065e3595
KH
1914 }
1915 else
1916 {
/* A first byte >= 0xA0 is accepted only inside an old-form
   composition: as the start of an MSEQ when a character is expected,
   otherwise as a RULE byte returned negated right away.  */
4d41e8b7
KH
1917 if (c >= 0xA0)
1918 {
e951386e
KH
1919 if (cmp_status->state != COMPOSING_NO
1920 && cmp_status->old_form)
4d41e8b7 1921 {
e951386e
KH
1922 if (cmp_status->state == COMPOSING_CHAR)
1923 {
1924 if (c == 0xA0)
1925 {
1926 ONE_MORE_BYTE (c);
1927 c -= 0x80;
1928 if (c < 0)
1929 goto invalid_code;
1930 }
1931 else
1932 c -= 0x20;
1933 mseq_found = 1;
1934 }
1935 else
1936 {
1937 *nbytes = src - src_base;
1938 *nchars = consumed_chars;
1939 return -c;
1940 }
4d41e8b7
KH
1941 }
1942 else
e951386e 1943 goto invalid_code;
4d41e8b7
KH
1944 }
1945
/* Dispatch on the total sequence length recorded for leading byte C.
   Every trailing code byte must be >= 0xA0; its low 7 bits go into
   CODE.  */
065e3595 1946 switch (emacs_mule_bytes[c])
b73bfc1c 1947 {
065e3595 1948 case 2:
2735d060 1949 if ((charset_ID = emacs_mule_charset[c]) < 0)
df7492f9
KH
1950 goto invalid_code;
1951 ONE_MORE_BYTE (c);
9ffd559c 1952 if (c < 0xA0)
065e3595 1953 goto invalid_code;
df7492f9 1954 code = c & 0x7F;
065e3595
KH
1955 break;
1956
1957 case 3:
/* Either a private leading code followed by a charset byte and one
   code byte, or a charset byte followed by two code bytes.  */
1958 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1959 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1960 {
1961 ONE_MORE_BYTE (c);
2735d060 1962 if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
065e3595
KH
1963 goto invalid_code;
1964 ONE_MORE_BYTE (c);
9ffd559c 1965 if (c < 0xA0)
065e3595
KH
1966 goto invalid_code;
1967 code = c & 0x7F;
1968 }
1969 else
1970 {
2735d060 1971 if ((charset_ID = emacs_mule_charset[c]) < 0)
065e3595
KH
1972 goto invalid_code;
1973 ONE_MORE_BYTE (c);
9ffd559c 1974 if (c < 0xA0)
065e3595
KH
1975 goto invalid_code;
1976 code = (c & 0x7F) << 8;
1977 ONE_MORE_BYTE (c);
9ffd559c 1978 if (c < 0xA0)
065e3595
KH
1979 goto invalid_code;
1980 code |= c & 0x7F;
1981 }
1982 break;
1983
1984 case 4:
/* The leading byte is followed by a charset byte and two code
   bytes.  */
1985 ONE_MORE_BYTE (c);
2735d060 1986 if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
df7492f9
KH
1987 goto invalid_code;
1988 ONE_MORE_BYTE (c);
9ffd559c 1989 if (c < 0xA0)
065e3595 1990 goto invalid_code;
781d7a48 1991 code = (c & 0x7F) << 8;
df7492f9 1992 ONE_MORE_BYTE (c);
9ffd559c 1993 if (c < 0xA0)
065e3595 1994 goto invalid_code;
df7492f9 1995 code |= c & 0x7F;
065e3595 1996 break;
df7492f9 1997
065e3595
KH
1998 case 1:
1999 code = c;
2735d060 2000 charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
065e3595 2001 break;
df7492f9 2002
/* emacs_mule_bytes[] held a length other than 1, 2, 3 or 4.  */
065e3595 2003 default:
1088b922 2004 emacs_abort ();
065e3595 2005 }
b84ae584 2006 CODING_DECODE_CHAR (coding, src, src_base, src_end,
2735d060 2007 CHARSET_FROM_ID (charset_ID), code, c);
065e3595
KH
2008 if (c < 0)
2009 goto invalid_code;
df7492f9 2010 }
df7492f9
KH
2011 *nbytes = src - src_base;
2012 *nchars = consumed_chars;
ff0dacd7 2013 if (id)
2735d060 2014 *id = charset_ID;
/* A character parsed from an MSEQ is reported negated so that the
   caller can tell it apart from a normally encoded character.  */
e951386e 2015 return (mseq_found ? -c : c);
df7492f9
KH
2016
2017 no_more_source:
2018 return -2;
2019
2020 invalid_code:
2021 return -1;
2022}
2023
2024
e951386e 2025/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
df7492f9 2026
e951386e
KH
2027/* Handle these composition sequence ('|': the end of header elements,
2028 BYTES and CHARS >= 0xA0):
df7492f9 2029
e951386e
KH
2030 (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2031 (2) altchar composition: 0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2032 (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
df7492f9 2033
e951386e 2034 and these old forms:
1a4990fb 2035
e951386e
KH
2036 (4) relative composition: 0x80 | MSEQ ... MSEQ
2037 (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
df7492f9 2038
e951386e
KH
2039 When the starter 0x80 and the following header elements are found,
2040 this annotation header is produced.
df7492f9 2041
e951386e 2042 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
df7492f9 2043
e951386e
KH
2044 NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2045 NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
df7492f9 2046
e951386e
KH
2047 Then, upon reading the following elements, these codes are produced
2048 until the composition end is found:
df7492f9 2049
e951386e
KH
2050 (1) CHAR ... CHAR
2051 (2) ALT ... ALT CHAR ... CHAR
2052 (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2053 (4) CHAR ... CHAR
2054 (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
4ed46869 2055
e951386e
KH
2056 When the composition end is found, LENGTH and NCHARS in the
2057 annotation header are updated as below:
b73bfc1c 2058
e951386e
KH
2059 (1) LENGTH: unchanged, NCHARS: unchanged
2060 (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2061 (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2062 (4) LENGTH: unchanged, NCHARS: number of CHARs
2063 (5) LENGTH: unchanged, NCHARS: number of CHARs
df7492f9 2064
e951386e
KH
2065 If an error is found while composing, the annotation header is
2066 changed to the original composition header (plus filler -1s) as
2067 below:
2068
2069 (1),(2),(3) [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2070 (5) [ 0x80 0xFF -1 -1 -1 ]
2071
2072 and the sequence [ -2 DECODED-RULE ] is changed to the original
2073 byte sequence as below:
2074 o the original byte sequence is B: [ B -1 ]
2075 o the original byte sequence is B1 B2: [ B1 B2 ]
2076
2077 Most of the routines are implemented by macros because many
2078 variables and labels in the caller decode_coding_emacs_mule must be
2079 accessible, and they are usually called just once (thus don't
2080 increase the size of compiled object). */
2081
2082/* Decode a composition rule represented by C as a component of
2083 composition sequence of Emacs 20 style. Set RULE to the decoded
2084 rule. */
/* C is modified in place (0xA0 is subtracted from it).  Control jumps
   to the caller's `invalid_code' label unless the result is in the
   range 0..80.  The result is split into two reference points,
   C / 9 and C % 9, and a reference point of 4 is replaced by 10
   before the rule is encoded.  */
2085
2086#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule) \
df7492f9 2087 do { \
e951386e
KH
2088 int gref, nref; \
2089 \
4d41e8b7 2090 c -= 0xA0; \
df7492f9
KH
2091 if (c < 0 || c >= 81) \
2092 goto invalid_code; \
df7492f9 2093 gref = c / 9, nref = c % 9; \
e951386e
KH
2094 if (gref == 4) gref = 10; \
2095 if (nref == 4) nref = 10; \
2096 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
df7492f9
KH
2097 } while (0)
2098
2099
e951386e
KH
2100/* Decode a composition rule represented by C and the following byte
2101 at SRC as a component of composition sequence of Emacs 21 style.
2102 Set RULE to the decoded rule. */
/* The first reference point is C - 0x20.  The second is read with
   ONE_MORE_BYTE, which overwrites C, and is likewise reduced by 0x20.
   Control jumps to the caller's `invalid_code' label if either value
   is outside the range 0..80.  */
781d7a48 2103
e951386e 2104#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule) \
781d7a48
KH
2105 do { \
2106 int gref, nref; \
e951386e
KH
2107 \
2108 gref = c - 0x20; \
2109 if (gref < 0 || gref >= 81) \
781d7a48 2110 goto invalid_code; \
e951386e
KH
2111 ONE_MORE_BYTE (c); \
2112 nref = c - 0x20; \
2113 if (nref < 0 || nref >= 81) \
781d7a48 2114 goto invalid_code; \
e951386e 2115 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
781d7a48
KH
2116 } while (0)
2117
2118
e951386e
KH
2119/* Start of Emacs 21 style format. The first three bytes at SRC are
2120 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2121 byte length of this composition information, CHARS is the number of
2122 characters composed by this composition. */
/* On entry the caller's C already holds the METHOD byte; the BYTES
   and CHARS bytes are read here with ONE_MORE_BYTE.  The macro uses
   the caller's `c', `cmp_status' and `charbuf' variables and jumps to
   the caller's `invalid_code' label when BYTES - 0xA0 is below 3 (or
   differs from 4 for a relative composition) or when CHARS - 0xA0 is
   not in 1..MAX_COMPOSITION_COMPONENTS - 1.  On success it
   initializes *CMP_STATUS (ncomps is set to BYTES - 0xA0 - 4) and
   appends the annotation header through ADD_COMPOSITION_DATA.  */
2123
2124#define DECODE_EMACS_MULE_21_COMPOSITION() \
aa72b389 2125 do { \
781d7a48 2126 enum composition_method method = c - 0xF2; \
df7492f9 2127 int nbytes, nchars; \
e951386e 2128 \
df7492f9 2129 ONE_MORE_BYTE (c); \
065e3595
KH
2130 if (c < 0) \
2131 goto invalid_code; \
df7492f9 2132 nbytes = c - 0xA0; \
e951386e 2133 if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4)) \
df7492f9
KH
2134 goto invalid_code; \
2135 ONE_MORE_BYTE (c); \
2136 nchars = c - 0xA0; \
e951386e
KH
2137 if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS) \
2138 goto invalid_code; \
2139 cmp_status->old_form = 0; \
2140 cmp_status->method = method; \
2141 if (method == COMPOSITION_RELATIVE) \
2142 cmp_status->state = COMPOSING_CHAR; \
2143 else \
2144 cmp_status->state = COMPOSING_COMPONENT_CHAR; \
2145 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2146 cmp_status->nchars = nchars; \
2147 cmp_status->ncomps = nbytes - 4; \
2148 ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method); \
aa72b389 2149 } while (0)
93dec019 2150
aa72b389 2151
e951386e
KH
2152/* Start of Emacs 20 style format for relative composition. */
/* Marks *CMP_STATUS (a variable of the caller) as an old-form
   relative composition waiting for a character, with zero characters
   and components seen so far, and appends an annotation header whose
   NCHARS and NBYTES slots are 0 to the caller's CHARBUF.  */
2153
2154#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION() \
2155 do { \
2156 cmp_status->old_form = 1; \
2157 cmp_status->method = COMPOSITION_RELATIVE; \
2158 cmp_status->state = COMPOSING_CHAR; \
2159 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2160 cmp_status->nchars = cmp_status->ncomps = 0; \
2161 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2162 } while (0)
2163
2164
2165/* Start of Emacs 20 style format for rule-base composition. */
/* Same set-up as the Emacs 20 relative case except that the method
   recorded in *CMP_STATUS is COMPOSITION_WITH_RULE.  Uses the
   caller's `cmp_status' and `charbuf' variables.  */
2166
2167#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION() \
2168 do { \
2169 cmp_status->old_form = 1; \
2170 cmp_status->method = COMPOSITION_WITH_RULE; \
2171 cmp_status->state = COMPOSING_CHAR; \
2172 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2173 cmp_status->nchars = cmp_status->ncomps = 0; \
2174 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
df7492f9
KH
2175 } while (0)
2176
2177
e951386e
KH
/* Read the byte that follows a composition starter and begin the
   matching kind of composition:
     - a byte B with B - 0xF2 between COMPOSITION_RELATIVE and
       COMPOSITION_WITH_RULE_ALTCHARS starts the Emacs 21 format;
     - any other byte in 0xA0..0xBF starts an Emacs 20 relative
       composition, and SRC is rewound so that the byte is read again
       as the first component;
     - 0xFF starts an Emacs 20 rule-base composition.
   Anything else, including a negative value from ONE_MORE_BYTE, jumps
   to the caller's `invalid_code' label.  Uses the caller's `src' and
   `c' variables.  */
2178#define DECODE_EMACS_MULE_COMPOSITION_START() \
2179 do { \
2180 const unsigned char *current_src = src; \
2181 \
2182 ONE_MORE_BYTE (c); \
2183 if (c < 0) \
2184 goto invalid_code; \
2185 if (c - 0xF2 >= COMPOSITION_RELATIVE \
2186 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS) \
2187 DECODE_EMACS_MULE_21_COMPOSITION (); \
2188 else if (c < 0xA0) \
2189 goto invalid_code; \
2190 else if (c < 0xC0) \
2191 { \
2192 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (); \
2193 /* Re-read C as a composition component. */ \
2194 src = current_src; \
2195 } \
2196 else if (c == 0xFF) \
2197 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (); \
2198 else \
2199 goto invalid_code; \
2200 } while (0)
2201
/* Close a composition that ended normally.  The annotation header
   starts CMP_STATUS->length elements before the caller's CHARBUF.
   For an old-form composition the header's third element is set to
   the number of characters counted so far.  For a new-form
   composition whose method is beyond COMPOSITION_RELATIVE, the
   header's first element is recomputed from its third element and
   CMP_STATUS->length.  In every case the state is reset to
   COMPOSING_NO.  */
2202#define EMACS_MULE_COMPOSITION_END() \
df7492f9 2203 do { \
e951386e 2204 int idx = - cmp_status->length; \
4d41e8b7 2205 \
e951386e
KH
2206 if (cmp_status->old_form) \
2207 charbuf[idx + 2] = cmp_status->nchars; \
2208 else if (cmp_status->method > COMPOSITION_RELATIVE) \
2209 charbuf[idx] = charbuf[idx + 2] - cmp_status->length; \
2210 cmp_status->state = COMPOSING_NO; \
2211 } while (0)
2212
2213
2214static int
cf84bb53
JB
2215emacs_mule_finish_composition (int *charbuf,
2216 struct composition_status *cmp_status)
e951386e
KH
2217{
2218 int idx = - cmp_status->length;
2219 int new_chars;
2220
2221 if (cmp_status->old_form && cmp_status->nchars > 0)
2222 {
2223 charbuf[idx + 2] = cmp_status->nchars;
2224 new_chars = 0;
2225 if (cmp_status->method == COMPOSITION_WITH_RULE
2226 && cmp_status->state == COMPOSING_CHAR)
2227 {
2228 /* The last rule was invalid. */
2229 int rule = charbuf[-1] + 0xA0;
2230
2231 charbuf[-2] = BYTE8_TO_CHAR (rule);
2232 charbuf[-1] = -1;
2233 new_chars = 1;
2234 }
2235 }
2236 else
2237 {
2238 charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2239
2240 if (cmp_status->method == COMPOSITION_WITH_RULE)
2241 {
2242 charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2243 charbuf[idx++] = -3;
2244 charbuf[idx++] = 0;
2245 new_chars = 1;
2246 }
2247 else
2248 {
2249 int nchars = charbuf[idx + 1] + 0xA0;
2250 int nbytes = charbuf[idx + 2] + 0xA0;
2251
2252 charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2253 charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2254 charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2255 charbuf[idx++] = -1;
2256 new_chars = 4;
2257 }
2258 }
2259 cmp_status->state = COMPOSING_NO;
2260 return new_chars;
2261}
2262
/* If a composition is in progress, terminate it with
   emacs_mule_finish_composition and add that function's return value
   to the caller's `char_offset'.  Uses the caller's `cmp_status',
   `charbuf' and `char_offset' variables.  */
2263#define EMACS_MULE_MAYBE_FINISH_COMPOSITION() \
2264 do { \
2265 if (cmp_status->state != COMPOSING_NO) \
2266 char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
df7492f9
KH
2267 } while (0)
2268
aa72b389
KH
2269
/* Decode emacs-mule bytes from CODING->source, starting at offset
   CODING->consumed, into the int buffer CODING->charbuf, starting at
   index CODING->charbuf_used.  Character codes are stored in the
   buffer, interleaved with charset and composition annotations.

   On return CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used reflect what was processed, and CODING->errors
   has been incremented once for every byte that had to be passed
   through as invalid.

   A composition still open when the input runs out is finished at
   once if this is the last block; otherwise its elements are removed
   from the buffer and saved in the composition status' carryover
   array, from which the next call restores them.  */
2270static void
971de7fb 2271decode_coding_emacs_mule (struct coding_system *coding)
aa72b389 2272{
8f924df7
KH
2273 const unsigned char *src = coding->source + coding->consumed;
2274 const unsigned char *src_end = coding->source + coding->src_bytes;
2275 const unsigned char *src_base;
69a80ea3 2276 int *charbuf = coding->charbuf + coding->charbuf_used;
ad1746f5
KH
2277 /* We may produce two annotations (charset and composition) in one
2278 loop and one more charset annotation at the end. */
69a80ea3 2279 int *charbuf_end
15cbd324
EZ
2280 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2281 /* We can produce up to 2 characters in a loop. */
2282 - 1;
d311d28c 2283 ptrdiff_t consumed_chars = 0, consumed_chars_base;
f10fe38f 2284 bool multibytep = coding->src_multibyte;
d311d28c
PE
2285 ptrdiff_t char_offset = coding->produced_char;
2286 ptrdiff_t last_offset = char_offset;
ff0dacd7 2287 int last_id = charset_ascii;
f10fe38f
PE
2288 bool eol_dos
2289 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 2290 int byte_after_cr = -1;
e951386e 2291 struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
aa72b389 2292
/* A composition was left open by the previous call: put the saved
   elements back at the front of the output.  */
e951386e
KH
2293 if (cmp_status->state != COMPOSING_NO)
2294 {
2295 int i;
2296
15cbd324 2297 if (charbuf_end - charbuf < cmp_status->length)
1088b922 2298 emacs_abort ();
e951386e
KH
2299 for (i = 0; i < cmp_status->length; i++)
2300 *charbuf++ = cmp_status->carryover[i];
2301 coding->annotated = 1;
2302 }
2303
aa72b389
KH
2304 while (1)
2305 {
ee05f961 2306 int c, id IF_LINT (= 0);
df7492f9 2307
aa72b389 2308 src_base = src;
df7492f9
KH
2309 consumed_chars_base = consumed_chars;
2310
/* Output buffer full.  If a byte was read ahead after a CR, step
   SRC_BASE back so that byte is not counted as consumed.  */
2311 if (charbuf >= charbuf_end)
b71f6f73
KH
2312 {
2313 if (byte_after_cr >= 0)
2314 src_base--;
2315 break;
2316 }
aa72b389 2317
119852e7
KH
2318 if (byte_after_cr >= 0)
2319 c = byte_after_cr, byte_after_cr = -1;
2320 else
2321 ONE_MORE_BYTE (c);
e951386e
KH
2322
/* Either a negative value from ONE_MORE_BYTE, which is stored as the
   character -C, or the composition starter 0x80.  Both terminate any
   composition that is still open.  */
2323 if (c < 0 || c == 0x80)
065e3595 2324 {
e951386e
KH
2325 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2326 if (c < 0)
2327 {
2328 *charbuf++ = -c;
2329 char_offset++;
2330 }
2331 else
2332 DECODE_EMACS_MULE_COMPOSITION_START ();
2333 continue;
065e3595 2334 }
e951386e
KH
2335
2336 if (c < 0x80)
aa72b389 2337 {
/* With DOS end-of-line handling, read the byte after a CR ahead of
   time; it is consumed at the top of the next iteration.  */
2735d060 2338 if (eol_dos && c == '\r')
119852e7 2339 ONE_MORE_BYTE (byte_after_cr);
e951386e
KH
2340 id = charset_ascii;
2341 if (cmp_status->state != COMPOSING_NO)
2342 {
2343 if (cmp_status->old_form)
2344 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2345 else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2346 cmp_status->ncomps--;
2347 }
2348 }
2349 else
2350 {
ee05f961 2351 int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
75f80e63
EZ
2352 /* emacs_mule_char can load a charset map from a file, which
2353 allocates a large structure and might cause buffer text
2354 to be relocated as result. Thus, we need to remember the
ad1746f5 2355 original pointer to buffer text, and fix up all related
75f80e63
EZ
2356 pointers after the call. */
2357 const unsigned char *orig = coding->source;
d311d28c 2358 ptrdiff_t offset;
e951386e
KH
2359
2360 c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2361 cmp_status);
75f80e63
EZ
2362 offset = coding->source - orig;
2363 if (offset)
2364 {
2365 src += offset;
2366 src_base += offset;
2367 src_end += offset;
2368 }
/* -1 means an invalid byte and -2 means the source ended inside the
   sequence.  Any other negative value is an old-style composition
   component or rule and is handled further down.  */
e951386e
KH
2369 if (c < 0)
2370 {
2371 if (c == -1)
2372 goto invalid_code;
2373 if (c == -2)
2374 break;
2375 }
2376 src = src_base + nbytes;
2377 consumed_chars = consumed_chars_base + nchars;
2378 if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2379 cmp_status->ncomps -= nchars;
2380 }
2381
ad1746f5 2382 /* Now if C >= 0, we found a normally encoded character, if C <
e951386e
KH
2383 0, we found an old-style composition component character or
2384 rule. */
2385
2386 if (cmp_status->state == COMPOSING_NO)
2387 {
/* When the charset changes, close the run of the previous non-ASCII
   charset with a charset annotation.  */
2388 if (last_id != id)
2389 {
2390 if (last_id != charset_ascii)
2391 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2392 last_id);
2393 last_id = id;
2394 last_offset = char_offset;
2395 }
df7492f9
KH
2396 *charbuf++ = c;
2397 char_offset++;
aa72b389 2398 }
e951386e 2399 else if (cmp_status->state == COMPOSING_CHAR)
df7492f9 2400 {
e951386e
KH
2401 if (cmp_status->old_form)
2402 {
/* A normally encoded character ends an old-form composition.  */
2403 if (c >= 0)
2404 {
2405 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2406 *charbuf++ = c;
2407 char_offset++;
2408 }
2409 else
2410 {
2411 *charbuf++ = -c;
2412 cmp_status->nchars++;
2413 cmp_status->length++;
2414 if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2415 EMACS_MULE_COMPOSITION_END ();
2416 else if (cmp_status->method == COMPOSITION_WITH_RULE)
2417 cmp_status->state = COMPOSING_RULE;
2418 }
2419 }
df7492f9 2420 else
e951386e
KH
2421 {
/* New form: NCHARS counts down the characters still expected.  */
2422 *charbuf++ = c;
2423 cmp_status->length++;
2424 cmp_status->nchars--;
2425 if (cmp_status->nchars == 0)
2426 EMACS_MULE_COMPOSITION_END ();
2427 }
df7492f9 2428 }
e951386e 2429 else if (cmp_status->state == COMPOSING_RULE)
df7492f9 2430 {
e951386e 2431 int rule;
ff0dacd7 2432
e951386e 2433 if (c >= 0)
df7492f9 2434 {
e951386e
KH
2435 EMACS_MULE_COMPOSITION_END ();
2436 *charbuf++ = c;
2437 char_offset++;
df7492f9 2438 }
e951386e 2439 else
ff0dacd7 2440 {
e951386e
KH
2441 c = -c;
2442 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2443 if (rule < 0)
2444 goto invalid_code;
2445 *charbuf++ = -2;
2446 *charbuf++ = rule;
2447 cmp_status->length += 2;
2448 cmp_status->state = COMPOSING_CHAR;
ff0dacd7 2449 }
e951386e
KH
2450 }
2451 else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2452 {
df7492f9 2453 *charbuf++ = c;
e951386e
KH
2454 cmp_status->length++;
2455 if (cmp_status->ncomps == 0)
2456 cmp_status->state = COMPOSING_CHAR;
2457 else if (cmp_status->ncomps > 0)
2458 {
2459 if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2460 cmp_status->state = COMPOSING_COMPONENT_RULE;
2461 }
2462 else
2463 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
df7492f9 2464 }
e951386e
KH
2465 else /* COMPOSING_COMPONENT_RULE */
2466 {
2467 int rule;
2468
2469 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2470 if (rule < 0)
2471 goto invalid_code;
2472 *charbuf++ = -2;
2473 *charbuf++ = rule;
2474 cmp_status->length += 2;
2475 cmp_status->ncomps--;
2476 if (cmp_status->ncomps > 0)
2477 cmp_status->state = COMPOSING_COMPONENT_CHAR;
2478 else
2479 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2480 }
2481 continue;
2482
/* Recovery: close any open composition, rewind to the start of the
   offending sequence, and pass its first byte through unchanged
   (ASCII as is, anything else as a raw 8-bit character), counting one
   error.  */
df7492f9 2483 invalid_code:
e951386e 2484 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
df7492f9
KH
2485 src = src_base;
2486 consumed_chars = consumed_chars_base;
2487 ONE_MORE_BYTE (c);
2488 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 2489 char_offset++;
df7492f9
KH
2490 coding->errors++;
2491 }
2492
/* Reached after leaving the loop with `break'.  Presumably also the
   jump target of ONE_MORE_BYTE when the source is exhausted -- that
   macro is defined outside this section, confirm there.  */
2493 no_more_source:
e951386e
KH
2494 if (cmp_status->state != COMPOSING_NO)
2495 {
2496 if (coding->mode & CODING_MODE_LAST_BLOCK)
2497 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2498 else
2499 {
2500 int i;
2501
/* Take the unfinished composition out of the output and keep it for
   the next call.  */
2502 charbuf -= cmp_status->length;
2503 for (i = 0; i < cmp_status->length; i++)
2504 cmp_status->carryover[i] = charbuf[i];
2505 }
2506 }
ff0dacd7 2507 if (last_id != charset_ascii)
69a80ea3 2508 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
2509 coding->consumed_char += consumed_chars_base;
2510 coding->consumed = src_base - coding->source;
2511 coding->charbuf_used = charbuf - coding->charbuf;
2512}
2513
2514
/* Store in CODES (an array of at least two bytes) the leading byte or
   bytes that introduce a character of the charset whose emacs-mule id
   is ID:

     ID < 0xA0         CODES = { ID, 0 }     (ID is the leading byte)
     0xA0 <= ID < 0xE0 CODES = { 0x9A, ID }
     0xE0 <= ID < 0xF0 CODES = { 0x9B, ID }
     0xF0 <= ID < 0xF5 CODES = { 0x9C, ID }
     0xF5 <= ID        CODES = { 0x9D, ID }

   A zero in CODES[1] means that there is no second leading byte.

   ID may be evaluated more than once, so it must not have side
   effects.  The expansion deliberately has no trailing semicolon, so
   the macro can be used as a single statement, including as the body
   of an `if' that is followed by `else'.  */

#define EMACS_MULE_LEADING_CODES(id, codes)	\
  do {						\
    if ((id) < 0xA0)				\
      (codes)[0] = (id), (codes)[1] = 0;	\
    else if ((id) < 0xE0)			\
      (codes)[0] = 0x9A, (codes)[1] = (id);	\
    else if ((id) < 0xF0)			\
      (codes)[0] = 0x9B, (codes)[1] = (id);	\
    else if ((id) < 0xF5)			\
      (codes)[0] = 0x9C, (codes)[1] = (id);	\
    else					\
      (codes)[0] = 0x9D, (codes)[1] = (id);	\
  } while (0)
2528
aa72b389 2529
/* Encode the elements of CODING->charbuf (CODING->charbuf_used of
   them) as emacs-mule bytes written at CODING->destination starting
   at offset CODING->produced.

   A negative element starts an annotation.  A charset annotation
   selects a preferred charset for the characters that follow; a
   composition annotation is skipped.  ASCII characters and raw 8-bit
   characters are emitted as one byte; any other character is emitted
   as its charset's leading code(s) followed by one or two code bytes
   with the high bit set.  A character that no charset in the list can
   encode is replaced by CODING->default_char.

   When the whole buffer has been processed, the conversion result is
   recorded as CODING_RESULT_SUCCESS, CODING->produced_char and
   CODING->produced are updated, and 0 is returned.  */
f10fe38f 2530static bool
971de7fb 2531encode_coding_emacs_mule (struct coding_system *coding)
df7492f9 2532{
f10fe38f 2533 bool multibytep = coding->dst_multibyte;
df7492f9
KH
2534 int *charbuf = coding->charbuf;
2535 int *charbuf_end = charbuf + coding->charbuf_used;
2536 unsigned char *dst = coding->destination + coding->produced;
2537 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2538 int safe_room = 8;
d311d28c 2539 ptrdiff_t produced_chars = 0;
24a73b0a 2540 Lisp_Object attrs, charset_list;
df7492f9 2541 int c;
ff0dacd7 2542 int preferred_charset_id = -1;
df7492f9 2543
/* Always encode against Vemacs_mule_charset_list; if the coding
   system's attributes hold a different list, replace it there too.  */
24a73b0a 2544 CODING_GET_INFO (coding, attrs, charset_list);
eccb6815
KH
2545 if (! EQ (charset_list, Vemacs_mule_charset_list))
2546 {
4939150c
PE
2547 charset_list = Vemacs_mule_charset_list;
2548 ASET (attrs, coding_attr_charset_list, charset_list);
eccb6815 2549 }
df7492f9
KH
2550
2551 while (charbuf < charbuf_end)
2552 {
2553 ASSURE_DESTINATION (safe_room);
2554 c = *charbuf++;
ff0dacd7
KH
2555
2556 if (c < 0)
2557 {
2558 /* Handle an annotation. */
2559 switch (*charbuf)
2560 {
2561 case CODING_ANNOTATE_COMPOSITION_MASK:
2562 /* Not yet implemented. */
2563 break;
2564 case CODING_ANNOTATE_CHARSET_MASK:
/* NOTE(review): after `c = *charbuf++' above, CHARBUF points at the
   mask element, and the skip below advances by only -C - 1 elements.
   If a charset annotation is laid out as [ -4 MASK NCHARS ID ] (the
   layout is defined outside this section -- confirm against
   ADD_CHARSET_DATA), then the id is charbuf[2], and charbuf[3] is the
   element following the annotation.  */
2565 preferred_charset_id = charbuf[3];
2566 if (preferred_charset_id >= 0
2567 && NILP (Fmemq (make_number (preferred_charset_id),
2568 charset_list)))
2569 preferred_charset_id = -1;
2570 break;
2571 default:
1088b922 2572 emacs_abort ();
ff0dacd7
KH
2573 }
/* C is the negated annotation length; skip the remaining elements.  */
2574 charbuf += -c - 1;
2575 continue;
2576 }
2577
df7492f9
KH
2578 if (ASCII_CHAR_P (c))
2579 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
2580 else if (CHAR_BYTE8_P (c))
2581 {
2582 c = CHAR_TO_BYTE8 (c);
2583 EMIT_ONE_BYTE (c);
2584 }
df7492f9 2585 else
aa72b389 2586 {
df7492f9
KH
2587 struct charset *charset;
2588 unsigned code;
2589 int dimension;
2590 int emacs_mule_id;
2591 unsigned char leading_codes[2];
2592
/* Try the preferred charset first, if there is one; otherwise, or if
   it cannot encode C, search the charset list.  */
ff0dacd7
KH
2593 if (preferred_charset_id >= 0)
2594 {
f10fe38f 2595 bool result;
5eb05ea3 2596
ff0dacd7 2597 charset = CHARSET_FROM_ID (preferred_charset_id);
5eb05ea3
KH
2598 CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2599 if (result)
905ca9d2
KH
2600 code = ENCODE_CHAR (charset, c);
2601 else
5eb05ea3
KH
2602 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2603 &code, charset);
ff0dacd7
KH
2604 }
2605 else
5eb05ea3
KH
2606 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2607 &code, charset);
/* No charset can encode C: substitute the default character.  */
df7492f9
KH
2608 if (! charset)
2609 {
2610 c = coding->default_char;
2611 if (ASCII_CHAR_P (c))
2612 {
2613 EMIT_ONE_ASCII_BYTE (c);
2614 continue;
2615 }
5eb05ea3
KH
2616 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2617 &code, charset);
df7492f9
KH
2618 }
2619 dimension = CHARSET_DIMENSION (charset);
2620 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2621 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2622 EMIT_ONE_BYTE (leading_codes[0]);
2623 if (leading_codes[1])
2624 EMIT_ONE_BYTE (leading_codes[1]);
/* Code bytes are written with their high bit set.  */
2625 if (dimension == 1)
1fa663f9 2626 EMIT_ONE_BYTE (code | 0x80);
aa72b389 2627 else
df7492f9 2628 {
1fa663f9 2629 code |= 0x8080;
df7492f9
KH
2630 EMIT_ONE_BYTE (code >> 8);
2631 EMIT_ONE_BYTE (code & 0xFF);
2632 }
aa72b389 2633 }
aa72b389 2634 }
065e3595 2635 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
2636 coding->produced_char += produced_chars;
2637 coding->produced = dst - coding->destination;
2638 return 0;
aa72b389 2639}
b73bfc1c 2640
4ed46869 2641\f
df7492f9 2642/*** 7. ISO2022 handlers ***/
4ed46869
KH
2643
2644/* The following note describes the coding system ISO2022 briefly.
39787efd 2645 Since the intention of this note is to help understand the
5a936b46 2646 functions in this file, some parts are NOT ACCURATE or are OVERLY
39787efd 2647 SIMPLIFIED. For thorough understanding, please refer to the
5a936b46 2648 original document of ISO2022. This is equivalent to the standard
cfb43547 2649 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
4ed46869
KH
2650
2651 ISO2022 provides many mechanisms to encode several character sets
cfb43547 2652 in 7-bit and 8-bit environments. For 7-bit environments, all text
39787efd
KH
2653 is encoded using bytes less than 128. This may make the encoded
2654 text a little bit longer, but the text passes more easily through
cfb43547 2655 several types of gateway, some of which strip off the MSB (Most
8ca3766a 2656 Significant Bit).
b73bfc1c 2657
cfb43547
DL
2658 There are two kinds of character sets: control character sets and
2659 graphic character sets. The former contain control characters such
4ed46869 2660 as `newline' and `escape' to provide control functions (control
39787efd 2661 functions are also provided by escape sequences). The latter
cfb43547 2662 contain graphic characters such as 'A' and '-'. Emacs recognizes
4ed46869
KH
2663 two control character sets and many graphic character sets.
2664
2665 Graphic character sets are classified into one of the following
39787efd
KH
2666 four classes, according to the number of bytes (DIMENSION) and
2667 number of characters in one dimension (CHARS) of the set:
2668 - DIMENSION1_CHARS94
2669 - DIMENSION1_CHARS96
2670 - DIMENSION2_CHARS94
2671 - DIMENSION2_CHARS96
2672
2673 In addition, each character set is assigned an identification tag,
cfb43547 2674 unique for each set, called the "final character" (denoted as <F>
39787efd
KH
2675 hereafter). The <F> of each character set is decided by ECMA(*)
2676 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2677 (0x30..0x3F are for private use only).
4ed46869
KH
2678
2679 Note (*): ECMA = European Computer Manufacturers Association
2680
cfb43547 2681 Here are examples of graphic character sets [NAME(<F>)]:
4ed46869
KH
2682 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2683 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2684 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2685 o DIMENSION2_CHARS96 -- none for the moment
2686
39787efd 2687 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4ed46869
KH
2688 C0 [0x00..0x1F] -- control character plane 0
2689 GL [0x20..0x7F] -- graphic character plane 0
2690 C1 [0x80..0x9F] -- control character plane 1
2691 GR [0xA0..0xFF] -- graphic character plane 1
2692
2693 A control character set is directly designated and invoked to C0 or
39787efd
KH
2694 C1 by an escape sequence. The most common case is that:
2695 - ISO646's control character set is designated/invoked to C0, and
2696 - ISO6429's control character set is designated/invoked to C1,
2697 and usually these designations/invocations are omitted in encoded
2698 text. In a 7-bit environment, only C0 can be used, and a control
2699 character for C1 is encoded by an appropriate escape sequence to
2700 fit into the environment. All control characters for C1 are
2701 defined to have corresponding escape sequences.
4ed46869
KH
2702
2703 A graphic character set is at first designated to one of four
2704 graphic registers (G0 through G3), then these graphic registers are
2705 invoked to GL or GR. These designations and invocations can be
2706 done independently. The most common case is that G0 is invoked to
39787efd
KH
2707 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2708 these invocations and designations are omitted in encoded text.
2709 In a 7-bit environment, only GL can be used.
4ed46869 2710
39787efd
KH
2711 When a graphic character set of CHARS94 is invoked to GL, codes
2712 0x20 and 0x7F of the GL area work as control characters SPACE and
2713 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2714 be used.
4ed46869
KH
2715
2716 There are two ways of invocation: locking-shift and single-shift.
2717 With locking-shift, the invocation lasts until the next different
39787efd
KH
2718 invocation, whereas with single-shift, the invocation affects the
2719 following character only and doesn't affect the locking-shift
2720 state. Invocations are done by the following control characters or
2721 escape sequences:
4ed46869
KH
2722
2723 ----------------------------------------------------------------------
39787efd 2724 abbrev function cntrl escape seq description
4ed46869 2725 ----------------------------------------------------------------------
39787efd
KH
2726 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2727 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2728 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2729 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2730 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2731 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2732 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2733 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2734 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4ed46869 2735 ----------------------------------------------------------------------
39787efd
KH
2736 (*) These are not used by any known coding system.
2737
2738 Control characters for these functions are defined by macros
2739 ISO_CODE_XXX in `coding.h'.
4ed46869 2740
39787efd 2741 Designations are done by the following escape sequences:
4ed46869
KH
2742 ----------------------------------------------------------------------
2743 escape sequence description
2744 ----------------------------------------------------------------------
2745 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2746 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2747 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2748 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2749 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2750 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2751 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2752 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2753 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2754 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2755 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2756 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2757 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2758 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2759 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2760 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2761 ----------------------------------------------------------------------
2762
2763 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
39787efd 2764 of dimension 1, chars 94, and final character <F>, etc...
4ed46869
KH
2765
2766 Note (*): Although these designations are not allowed in ISO2022,
2767 Emacs accepts them on decoding, and produces them on encoding
39787efd 2768 CHARS96 character sets in a coding system which is characterized as
4ed46869
KH
2769 7-bit environment, non-locking-shift, and non-single-shift.
2770
2771 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
df7492f9 2772 '(' must be omitted. We refer to this as "short-form" hereafter.
4ed46869 2773
cfb43547 2774 Now you may notice that there are a lot of ways of encoding the
39787efd
KH
2775 same multilingual text in ISO2022. Actually, there exist many
2776 coding systems such as Compound Text (used in X11's inter client
8ca3766a
DL
2777 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2778 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
4ed46869
KH
2779 localized platforms), and all of these are variants of ISO2022.
2780
2781 In addition to the above, Emacs handles two more kinds of escape
2782 sequences: ISO6429's direction specification and Emacs' private
2783 sequence for specifying character composition.
2784
39787efd 2785 ISO6429's direction specification takes the following form:
4ed46869
KH
2786 o CSI ']' -- end of the current direction
2787 o CSI '0' ']' -- end of the current direction
2788 o CSI '1' ']' -- start of left-to-right text
2789 o CSI '2' ']' -- start of right-to-left text
2790 The control character CSI (0x9B: control sequence introducer) is
39787efd
KH
2791 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2792
2793 Character composition specification takes the following form:
ec6d2bb8
KH
2794 o ESC '0' -- start relative composition
2795 o ESC '1' -- end composition
2796 o ESC '2' -- start rule-base composition (*)
2797 o ESC '3' -- start relative composition with alternate chars (**)
2798 o ESC '4' -- start rule-base composition with alternate chars (**)
b73bfc1c 2799 Since these are not standard escape sequences of any ISO standard,
cfb43547 2800 the use of them with these meanings is restricted to Emacs only.
ec6d2bb8 2801
5a936b46
DL
2802 (*) This form is used only in Emacs 20.7 and older versions,
2803 but newer versions can safely decode it.
cfb43547 2804 (**) This form is used only in Emacs 21.1 and newer versions,
5a936b46 2805 and older versions can't decode it.
ec6d2bb8 2806
cfb43547 2807 Here's a list of example usages of these composition escape
b73bfc1c 2808 sequences (categorized by `enum composition_method').
ec6d2bb8 2809
b73bfc1c 2810 COMPOSITION_RELATIVE:
ec6d2bb8 2811 ESC 0 CHAR [ CHAR ] ESC 1
8ca3766a 2812 COMPOSITION_WITH_RULE:
ec6d2bb8 2813 ESC 2 CHAR [ RULE CHAR ] ESC 1
b73bfc1c 2814 COMPOSITION_WITH_ALTCHARS:
ec6d2bb8 2815 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
b73bfc1c 2816 COMPOSITION_WITH_RULE_ALTCHARS:
ec6d2bb8 2817 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
4ed46869 2818
/* Code class of each of the 256 possible byte values.
   decode_coding_iso_2022 dispatches on iso_code_class[c1] for every
   source byte C1 it handles.  */
74ab6df5 2819static enum iso_code_class_type iso_code_class[256];
4ed46869 2820
/* Return nonzero if the charset whose ID is ID may be used with
   CODING, i.e. ID lies within [0, CODING->max_charset_id] and its
   entry in CODING->safe_charsets is not 255 (the value every entry is
   initialized to by setup_iso_safe_charsets).

   ID may be negative when it is taken straight from a charset lookup
   table whose unused slots are negative; such an ID is reported as
   not safe instead of indexing in front of the safe_charsets array.
   Note that ID is evaluated more than once.  */
#define SAFE_CHARSET_P(coding, id)		\
  ((id) >= 0					\
   && (id) <= (coding)->max_charset_id		\
   && (coding)->safe_charsets[id] != 255)
df7492f9 2824
df7492f9 2825static void
971de7fb 2826setup_iso_safe_charsets (Lisp_Object attrs)
df7492f9
KH
2827{
2828 Lisp_Object charset_list, safe_charsets;
2829 Lisp_Object request;
2830 Lisp_Object reg_usage;
2831 Lisp_Object tail;
d311d28c 2832 EMACS_INT reg94, reg96;
df7492f9
KH
2833 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2834 int max_charset_id;
2835
2836 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2837 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2838 && ! EQ (charset_list, Viso_2022_charset_list))
2839 {
4939150c
PE
2840 charset_list = Viso_2022_charset_list;
2841 ASET (attrs, coding_attr_charset_list, charset_list);
df7492f9
KH
2842 ASET (attrs, coding_attr_safe_charsets, Qnil);
2843 }
2844
2845 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2846 return;
2847
2848 max_charset_id = 0;
2849 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2850 {
2851 int id = XINT (XCAR (tail));
2852 if (max_charset_id < id)
2853 max_charset_id = id;
2854 }
d46c5b12 2855
1b3b981b
AS
2856 safe_charsets = make_uninit_string (max_charset_id + 1);
2857 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
df7492f9
KH
2858 request = AREF (attrs, coding_attr_iso_request);
2859 reg_usage = AREF (attrs, coding_attr_iso_usage);
2860 reg94 = XINT (XCAR (reg_usage));
2861 reg96 = XINT (XCDR (reg_usage));
2862
2863 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2864 {
2865 Lisp_Object id;
2866 Lisp_Object reg;
2867 struct charset *charset;
2868
2869 id = XCAR (tail);
2870 charset = CHARSET_FROM_ID (XINT (id));
bf16eb23 2871 reg = Fcdr (Fassq (id, request));
df7492f9 2872 if (! NILP (reg))
8f924df7 2873 SSET (safe_charsets, XINT (id), XINT (reg));
df7492f9
KH
2874 else if (charset->iso_chars_96)
2875 {
2876 if (reg96 < 4)
8f924df7 2877 SSET (safe_charsets, XINT (id), reg96);
df7492f9
KH
2878 }
2879 else
2880 {
2881 if (reg94 < 4)
8f924df7 2882 SSET (safe_charsets, XINT (id), reg94);
df7492f9
KH
2883 }
2884 }
2885 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2886}
d46c5b12 2887
b6871cc7 2888
4ed46869 2889/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
f10fe38f
PE
2890 Return true if a text is encoded in one of ISO-2022 based coding
2891 systems.

   While scanning, bits of CATEGORY_MASK_ISO are collected in the
   locals `found' and `rejected'.  If every ISO category gets rejected
   the scan stops, all of CATEGORY_MASK_ISO is added to
   DETECT_INFO->rejected and false is returned.  Otherwise, at the
   label no_more_source (presumably reached from ONE_MORE_BYTE when the
   source runs out -- that macro is defined elsewhere),
   `rejected' and the not-rejected part of `found' are merged into
   DETECT_INFO and true is returned. */
4ed46869 2892
f10fe38f 2893static bool
cf84bb53
JB
2894detect_coding_iso_2022 (struct coding_system *coding,
2895 struct coding_detection_info *detect_info)
4ed46869 2896{
8f924df7
KH
2897 const unsigned char *src = coding->source, *src_base = src;
2898 const unsigned char *src_end = coding->source + coding->src_bytes;
f10fe38f
PE
2899 bool multibytep = coding->src_multibyte;
2900 bool single_shifting = 0;
0e48bb22 2901 int id;
df7492f9 2902 int c, c1;
d311d28c 2903 ptrdiff_t consumed_chars = 0;
df7492f9 2904 int i;
ff0dacd7
KH
2905 int rejected = 0;
2906 int found = 0;
/* -1 while outside a composition; set to 0 by a composition start
   sequence and then advanced as characters are seen, so that ESC 1
   can be checked against MAX_COMPOSITION_COMPONENTS. */
cee53ed4 2907 int composition_count = -1;
ff0dacd7
KH
2908
2909 detect_info->checked |= CATEGORY_MASK_ISO;
df7492f9
KH
2910
/* Refresh max_charset_id and safe_charsets of every defined ISO
   category from the safe-charsets string of its attributes. */
2911 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2912 {
2913 struct coding_system *this = &(coding_categories[i]);
2914 Lisp_Object attrs, val;
2915
c6b278e7
KH
2916 if (this->id < 0)
2917 continue;
df7492f9
KH
2918 attrs = CODING_ID_ATTRS (this->id);
2919 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
1b3b981b 2920 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
df7492f9
KH
2921 setup_iso_safe_charsets (attrs);
2922 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7 2923 this->max_charset_id = SCHARS (val) - 1;
1b3b981b 2924 this->safe_charsets = SDATA (val);
df7492f9
KH
2925 }
2926
2927 /* A coding system of this category is always ASCII compatible. */
2928 src += coding->head_ascii;
3f003981 2929
ff0dacd7 2930 while (rejected != CATEGORY_MASK_ISO)
4ed46869 2931 {
065e3595 2932 src_base = src;
df7492f9 2933 ONE_MORE_BYTE (c);
4ed46869
KH
2934 switch (c)
2935 {
2936 case ISO_CODE_ESC:
74383408
KH
2937 if (inhibit_iso_escape_detection)
2938 break;
f46869e4 2939 single_shifting = 0;
df7492f9 2940 ONE_MORE_BYTE (c);
0e48bb22 2941 if (c == 'N' || c == 'O')
d46c5b12 2942 {
ae9ff118 2943 /* ESC <Fe> for SS2 or SS3. */
ff0dacd7
KH
2944 single_shifting = 1;
2945 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
4ed46869 2946 }
cee53ed4
KH
2947 else if (c == '1')
2948 {
2949 /* End of composition. */
2950 if (composition_count < 0
2951 || composition_count > MAX_COMPOSITION_COMPONENTS)
2952 /* Invalid */
2953 break;
2954 composition_count = -1;
2955 found |= CATEGORY_MASK_ISO;
2956 }
ec6d2bb8
KH
2957 else if (c >= '0' && c <= '4')
2958 {
2959 /* ESC <Fp> for start/end composition. */
cee53ed4 2960 composition_count = 0;
ec6d2bb8 2961 }
bf9cdd4e 2962 else
df7492f9 2963 {
0e48bb22
AS
2964 if (c >= '(' && c <= '/')
2965 {
2966 /* Designation sequence for a charset of dimension 1. */
2967 ONE_MORE_BYTE (c1);
2968 if (c1 < ' ' || c1 >= 0x80
2969 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2970 /* Invalid designation sequence. Just ignore. */
2971 break;
2972 }
2973 else if (c == '$')
2974 {
2975 /* Designation sequence for a charset of dimension 2. */
2976 ONE_MORE_BYTE (c);
2977 if (c >= '@' && c <= 'B')
/* NOTE(review): unlike the two long-form branches, ID is not tested
   for < 0 here before it reaches SAFE_CHARSET_P below -- confirm that
   these three table slots are always filled. */
2978 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2979 id = iso_charset_table[1][0][c];
2980 else if (c >= '(' && c <= '/')
2981 {
2982 ONE_MORE_BYTE (c1);
2983 if (c1 < ' ' || c1 >= 0x80
2984 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2985 /* Invalid designation sequence. Just ignore. */
2986 break;
2987 }
2988 else
2989 /* Invalid designation sequence. Just ignore it. */
2990 break;
2991 }
2992 else
2993 {
2994 /* Invalid escape sequence. Just ignore it. */
2995 break;
2996 }
d46c5b12 2997
0e48bb22
AS
2998 /* We found a valid designation sequence for CHARSET. */
2999 rejected |= CATEGORY_MASK_ISO_8BIT;
3000 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3001 id))
3002 found |= CATEGORY_MASK_ISO_7;
3003 else
3004 rejected |= CATEGORY_MASK_ISO_7;
3005 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3006 id))
3007 found |= CATEGORY_MASK_ISO_7_TIGHT;
3008 else
3009 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3010 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3011 id))
3012 found |= CATEGORY_MASK_ISO_7_ELSE;
3013 else
3014 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3015 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3016 id))
3017 found |= CATEGORY_MASK_ISO_8_ELSE;
3018 else
3019 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3020 }
4ed46869
KH
3021 break;
3022
4ed46869 3023 case ISO_CODE_SO:
d46c5b12 3024 case ISO_CODE_SI:
ff0dacd7 3025 /* Locking shift out/in. */
74383408
KH
3026 if (inhibit_iso_escape_detection)
3027 break;
f46869e4 3028 single_shifting = 0;
ff0dacd7 3029 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
d46c5b12
KH
3030 break;
3031
4ed46869 3032 case ISO_CODE_CSI:
ff0dacd7 3033 /* Control sequence introducer. */
f46869e4 3034 single_shifting = 0;
ff0dacd7
KH
3035 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3036 found |= CATEGORY_MASK_ISO_8_ELSE;
3037 goto check_extra_latin;
3038
4ed46869
KH
3039 case ISO_CODE_SS2:
3040 case ISO_CODE_SS3:
ff0dacd7
KH
3041 /* Single shift. */
3042 if (inhibit_iso_escape_detection)
3043 break;
75e2a253 3044 single_shifting = 0;
ff0dacd7
KH
3045 rejected |= CATEGORY_MASK_ISO_7BIT;
3046 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3047 & CODING_ISO_FLAG_SINGLE_SHIFT)
0e48bb22
AS
3048 {
3049 found |= CATEGORY_MASK_ISO_8_1;
3050 single_shifting = 1;
3051 }
ff0dacd7
KH
3052 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3053 & CODING_ISO_FLAG_SINGLE_SHIFT)
0e48bb22
AS
3054 {
3055 found |= CATEGORY_MASK_ISO_8_2;
3056 single_shifting = 1;
3057 }
75e2a253
KH
3058 if (single_shifting)
3059 break;
/* Reached for CSI, and for SS2/SS3 when neither 8-bit category uses
   single shift: the byte is acceptable only if
   Vlatin_extra_code_table has a non-nil entry for it. */
0e48bb22
AS
3060 check_extra_latin:
3061 if (! VECTORP (Vlatin_extra_code_table)
28be1ada 3062 || NILP (AREF (Vlatin_extra_code_table, c)))
0e48bb22
AS
3063 {
3064 rejected = CATEGORY_MASK_ISO;
3065 break;
3066 }
3067 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3068 & CODING_ISO_FLAG_LATIN_EXTRA)
3069 found |= CATEGORY_MASK_ISO_8_1;
3070 else
3071 rejected |= CATEGORY_MASK_ISO_8_1;
3072 rejected |= CATEGORY_MASK_ISO_8_2;
3073 break;
4ed46869
KH
3074
3075 default:
065e3595
KH
3076 if (c < 0)
3077 continue;
4ed46869 3078 if (c < 0x80)
f46869e4 3079 {
cee53ed4
KH
3080 if (composition_count >= 0)
3081 composition_count++;
f46869e4
KH
3082 single_shifting = 0;
3083 break;
3084 }
ff0dacd7 3085 if (c >= 0xA0)
c4825358 3086 {
ff0dacd7
KH
3087 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3088 found |= CATEGORY_MASK_ISO_8_1;
f46869e4 3089 /* Check the length of succeeding codes of the range
ff0dacd7
KH
3090 0xA0..0xFF. If the byte length is even, we include
3091 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
3092 only when we are not single shifting. */
3093 if (! single_shifting
3094 && ! (rejected & CATEGORY_MASK_ISO_8_2))
f46869e4 3095 {
2735d060 3096 int len = 1;
b73bfc1c
KH
3097 while (src < src_end)
3098 {
d12bd917 3099 src_base = src;
df7492f9 3100 ONE_MORE_BYTE (c);
b73bfc1c 3101 if (c < 0xA0)
d12bd917
KH
3102 {
3103 src = src_base;
3104 break;
3105 }
2735d060 3106 len++;
b73bfc1c
KH
3107 }
3108
/* An odd run is held against ISO_8_2 only when it was ended by a
   byte below 0xA0, not when it merely reached the end of the source. */
2735d060 3109 if (len & 1 && src < src_end)
cee53ed4
KH
3110 {
3111 rejected |= CATEGORY_MASK_ISO_8_2;
3112 if (composition_count >= 0)
2735d060 3113 composition_count += len;
cee53ed4 3114 }
f46869e4 3115 else
cee53ed4
KH
3116 {
3117 found |= CATEGORY_MASK_ISO_8_2;
3118 if (composition_count >= 0)
2735d060 3119 composition_count += len / 2;
cee53ed4 3120 }
f46869e4 3121 }
ff0dacd7 3122 break;
4ed46869 3123 }
4ed46869
KH
3124 }
3125 }
/* Every ISO category has been rejected. */
ff0dacd7
KH
3126 detect_info->rejected |= CATEGORY_MASK_ISO;
3127 return 0;
4ed46869 3128
df7492f9 3129 no_more_source:
ff0dacd7
KH
3130 detect_info->rejected |= rejected;
3131 detect_info->found |= (found & ~rejected);
df7492f9 3132 return 1;
4ed46869 3133}
ec6d2bb8 3134
4ed46869 3135
/* Set designation state into CODING: record in graphic register REG
   the charset selected by dimension DIM, 94/96 flag CHARS_96 and final
   byte FINAL.

   CHARS_96 must be a modifiable lvalue.  It is set to -1 if the escape
   sequence should be kept, which happens when
     - FINAL is outside '0'..127, the charset table has no entry for
       it, or the charset is not safe for CODING (the designation of
       REG is then set to -2 and the macro stops), or
     - REG previously held such an invalid designation (-2) and this
       sequence designates ASCII to it.

   JISX0201-Roman and JISX0208.1978 are replaced by ASCII and JISX0208
   respectively when CODING has CODING_ISO_FLAG_USE_ROMAN or
   CODING_ISO_FLAG_USE_OLDJIS.

   Uses `coding' from the expansion site and declares locals named
   `id' and `prev', so the arguments must not refer to variables of
   those names.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)			\
  do {									\
    int id, prev;							\
									\
    if ((final) < '0' || (final) >= 128					\
	|| ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)	\
	|| !SAFE_CHARSET_P (coding, id))				\
      {									\
	CODING_ISO_DESIGNATION (coding, reg) = -2;			\
	(chars_96) = -1;						\
	break;								\
      }									\
    prev = CODING_ISO_DESIGNATION (coding, reg);			\
    if (id == charset_jisx0201_roman)					\
      {									\
	if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)	\
	  id = charset_ascii;						\
      }									\
    else if (id == charset_jisx0208_1978)				\
      {									\
	if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)	\
	  id = charset_jisx0208;					\
      }									\
    CODING_ISO_DESIGNATION (coding, reg) = id;				\
    /* If there was an invalid designation to REG previously, and this	\
       designation is ASCII to REG, we should keep this designation	\
       sequence.  */							\
    if (prev == -2 && id == charset_ascii)				\
      (chars_96) = -1;							\
  } while (0)
3168
d46c5b12 3169
e951386e
KH
3170/* Handle these composition sequences (ALT: alternate char):
3171
3172 (1) relative composition: ESC 0 CHAR ... ESC 1
3173 (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3174 (3) altchar composition: ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3175 (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3176
3177 When the start sequence (ESC 0/2/3/4) is found, this annotation
3178 header is produced.
3179
3180 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3181
3182 Then, upon reading CHAR or RULE (one or two bytes), these codes are
3183 produced until the end sequence (ESC 1) is found:
3184
3185 (1) CHAR ... CHAR
3186 (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3187 (3) ALT ... ALT -1 -1 CHAR ... CHAR
3188 (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3189
3190 When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3191 annotation header are updated as below:
3192
3193 (1) LENGTH: unchanged, NCHARS: number of CHARs
3194 (2) LENGTH: unchanged, NCHARS: number of CHARs
3195 (3) LENGTH: += number of ALTs + 2, NCHARS: number of CHARs
3196 (4) LENGTH: += number of ALTs * 3, NCHARS: number of CHARs
3197
3198 If an error is found while composing, the annotation header is
3199 changed to:
3200
3201 [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3202
3203 and the sequence [ -2 DECODED-RULE ] is changed to the original
3204 byte sequence as below:
3205 o the original byte sequence is B: [ B -1 ]
3206 o the original byte sequence is B1 B2: [ B1 B2 ]
3207 and the sequence [ -1 -1 ] is changed to the original byte
3208 sequence:
3209 [ ESC '0' ]
3210*/
3211
/* Decode the composition rule that starts with the byte in `c1' (a
   variable of the expansion site) and store the encoded rule in RULE,
   which must be an int lvalue.

   A first byte below 32 is invalid.  A first byte of 32..112 is a
   complete rule in the old one-byte format; anything larger starts
   the new two-byte format, whose second byte is fetched with
   ONE_MORE_BYTE.  New-format results have 0x100 added so that they
   can be told apart from old-format ones.

   On an invalid rule, control goes to the label invalid_code.  */

#define DECODE_COMPOSITION_RULE(rule)					\
  do {									\
    rule = c1 - 32;							\
    if (rule < 0)							\
      goto invalid_code;						\
    if (rule >= 81)							\
      {									\
	/* New format (after ver.21): one more byte follows.  */	\
	int second_byte;						\
									\
	ONE_MORE_BYTE (second_byte);					\
	if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81,			\
					     second_byte - 32))		\
	  goto invalid_code;						\
	rule = COMPOSITION_ENCODE_RULE (rule - 81, second_byte - 32);	\
	/* Distinguish it from the old format.  */			\
	rule += 0x100;							\
      }									\
    else								\
      {									\
	/* Old format (before ver.21): two base-9 digits, where the	\
	   digit 4 stands for 10.  */					\
	int first_ref = (rule) / 9;					\
	int second_ref = (rule) % 9;					\
									\
	if (first_ref == 4)						\
	  first_ref = 10;						\
	if (second_ref == 4)						\
	  second_ref = 10;						\
	rule = COMPOSITION_ENCODE_RULE (first_ref, second_ref);		\
      }									\
  } while (0)
3240
/* Turn the encoded composition rule RULE back into the byte form it
   was read from, overwriting charbuf[idx] and charbuf[idx + 1].

   RULE below 0x100 is in the old format: one byte is stored (with the
   reference value 10 written as 4), charbuf[idx + 1] becomes -1 and
   new_chars is incremented by one.  Otherwise RULE is in the new
   format: two bytes are stored and new_chars is incremented by two.

   RULE is evaluated exactly once, before anything is written, so it
   may be an arbitrary expression, including one that reads
   charbuf[idx + 1].  Uses `charbuf', `idx' and `new_chars' from the
   expansion site.  */
#define ENCODE_COMPOSITION_RULE(rule)					\
  do {									\
    int rule_ = (rule);							\
    int gref = (rule_ % 0x100) / 12, nref = (rule_ % 0x100) % 12;	\
									\
    if (rule_ < 0x100)		/* old format */			\
      {									\
	if (gref == 10) gref = 4;					\
	if (nref == 10) nref = 4;					\
	charbuf[idx] = 32 + gref * 9 + nref;				\
	charbuf[idx + 1] = -1;						\
	new_chars++;							\
      }									\
    else			/* new format */			\
      {									\
	charbuf[idx] = 32 + 81 + gref;					\
	charbuf[idx + 1] = 32 + nref;					\
	new_chars += 2;							\
      }									\
  } while (0)
3260
e951386e
KH
3261/* Finish the current composition as invalid. */
3262
e951386e 3263static int
971de7fb 3264finish_composition (int *charbuf, struct composition_status *cmp_status)
e951386e
KH
3265{
3266 int idx = - cmp_status->length;
3267 int new_chars;
3268
3269 /* Recover the original ESC sequence */
3270 charbuf[idx++] = ISO_CODE_ESC;
3271 charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3272 : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3273 : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3274 /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3275 : '4');
3276 charbuf[idx++] = -2;
3277 charbuf[idx++] = 0;
3278 charbuf[idx++] = -1;
3279 new_chars = cmp_status->nchars;
3280 if (cmp_status->method >= COMPOSITION_WITH_RULE)
3281 for (; idx < 0; idx++)
3282 {
3283 int elt = charbuf[idx];
3284
3285 if (elt == -2)
3286 {
3287 ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3288 idx++;
3289 }
3290 else if (elt == -1)
3291 {
3292 charbuf[idx++] = ISO_CODE_ESC;
3293 charbuf[idx] = '0';
3294 new_chars += 2;
3295 }
3296 }
3297 cmp_status->state = COMPOSING_NO;
3298 return new_chars;
3299}
3300
ad1746f5 3301/* If characters are under composition (cmp_status->state is not
   COMPOSING_NO), finish the composition with finish_composition and
   add the count it returns to char_offset.  Expects `cmp_status',
   `charbuf' and `char_offset' to be in scope where it is expanded. */
e951386e
KH
3302#define MAYBE_FINISH_COMPOSITION() \
3303 do { \
3304 if (cmp_status->state != COMPOSING_NO) \
3305 char_offset += finish_composition (charbuf, cmp_status); \
3306 } while (0)
d46c5b12 3307
aa72b389 3308/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
e951386e 3309
aa72b389
KH
3310 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3311 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
df7492f9
KH
3312 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3313 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
ec6d2bb8 3314
e951386e
KH
3315 Produce this annotation sequence now:
3316
3317 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

   NOTE(review): the five-slot layout above follows the five header
   elements that finish_composition rewrites and the two zeros passed
   to ADD_COMPOSITION_DATA below; ADD_COMPOSITION_DATA itself is
   defined elsewhere -- confirm.

   An ESC 0 that arrives while the alternate characters of an ESC 3 or
   ESC 4 composition are being read does not start a new composition:
   the separator [ -1 -1 ] is stored, the state becomes COMPOSING_CHAR
   and the recorded length grows by two.  Any other start sequence
   first finishes a composition that is still open, then records the
   method and initial state, emits the header, resets the character
   and component counts and marks CODING as annotated.

   C1 is the byte following ESC.  Uses `cmp_status', `charbuf',
   `char_offset' and `coding' from the expansion site.
3318*/
3319
3320#define DECODE_COMPOSITION_START(c1) \
3321 do { \
3322 if (c1 == '0' \
3323 && ((cmp_status->state == COMPOSING_COMPONENT_CHAR \
3324 && cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3325 || (cmp_status->state == COMPOSING_COMPONENT_RULE \
3326 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3327 { \
3328 *charbuf++ = -1; \
3329 *charbuf++= -1; \
3330 cmp_status->state = COMPOSING_CHAR; \
3331 cmp_status->length += 2; \
3332 } \
3333 else \
3334 { \
3335 MAYBE_FINISH_COMPOSITION (); \
3336 cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE \
3337 : c1 == '2' ? COMPOSITION_WITH_RULE \
3338 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
3339 : COMPOSITION_WITH_RULE_ALTCHARS); \
3340 cmp_status->state \
3341 = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR); \
3342 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
3343 cmp_status->length = MAX_ANNOTATION_LENGTH; \
3344 cmp_status->nchars = cmp_status->ncomps = 0; \
3345 coding->annotated = 1; \
3346 } \
ec6d2bb8
KH
3347 } while (0)
3348
ec6d2bb8 3349
e951386e 3350/* Handle composition end sequence ESC 1.

   The sequence is rejected when no composed character has been stored
   (nchars is 0), or when the state does not fit the method: for
   COMPOSITION_WITH_RULE the state must not be COMPOSING_CHAR, for
   every other method it must be COMPOSING_CHAR.  On rejection an open
   composition is finished with MAYBE_FINISH_COMPOSITION and control
   goes to the label invalid_code.

   Otherwise the annotation header found at charbuf[-length] is
   completed: its first element (the negated length) is decreased by
   ncomps + 2 for COMPOSITION_WITH_ALTCHARS or by ncomps * 3 for
   COMPOSITION_WITH_RULE_ALTCHARS, its third element is set to nchars,
   char_offset is advanced by nchars and the state becomes
   COMPOSING_NO.

   Uses `cmp_status', `charbuf' and `char_offset' from the expansion
   site. */
df7492f9
KH
3351
3352#define DECODE_COMPOSITION_END() \
ec6d2bb8 3353 do { \
e951386e
KH
3354 if (cmp_status->nchars == 0 \
3355 || ((cmp_status->state == COMPOSING_CHAR) \
3356 == (cmp_status->method == COMPOSITION_WITH_RULE))) \
ec6d2bb8 3357 { \
e951386e
KH
3358 MAYBE_FINISH_COMPOSITION (); \
3359 goto invalid_code; \
ec6d2bb8 3360 } \
e951386e
KH
3361 if (cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
3362 charbuf[- cmp_status->length] -= cmp_status->ncomps + 2; \
3363 else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS) \
3364 charbuf[- cmp_status->length] -= cmp_status->ncomps * 3; \
3365 charbuf[- cmp_status->length + 2] = cmp_status->nchars; \
3366 char_offset += cmp_status->nchars; \
3367 cmp_status->state = COMPOSING_NO; \
ec6d2bb8
KH
3368 } while (0)
3369
/* Append the marker -2 followed by composition rule RULE to charbuf,
   account for the two new elements in cmp_status->length, and step
   cmp_status->state back by one.  Uses `charbuf' and `cmp_status'
   from the expansion site.  */

#define STORE_COMPOSITION_RULE(rule)	\
  do {					\
    charbuf[0] = -2;			\
    charbuf[1] = (rule);		\
    charbuf += 2;			\
    cmp_status->length += 2;		\
    cmp_status->state--;		\
  } while (0)
ec6d2bb8 3379
/* Append C, a composed character or a component character, to charbuf
   and update cmp_status: the length grows by one; nchars is counted
   when the state is COMPOSING_CHAR, ncomps otherwise; and the state is
   advanced by one when the method is COMPOSITION_WITH_RULE, or when it
   is COMPOSITION_WITH_RULE_ALTCHARS and the state was
   COMPOSING_COMPONENT_CHAR.  Uses `charbuf' and `cmp_status' from the
   expansion site.  */

#define STORE_COMPOSITION_CHAR(c)					\
  do {									\
    *charbuf = (c);							\
    charbuf += 1;							\
    cmp_status->length += 1;						\
    if (cmp_status->state != COMPOSING_CHAR)				\
      cmp_status->ncomps += 1;						\
    else								\
      cmp_status->nchars += 1;						\
    if (cmp_status->method == COMPOSITION_WITH_RULE)			\
      cmp_status->state++;						\
    else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS	\
	     && cmp_status->state == COMPOSING_COMPONENT_CHAR)		\
      cmp_status->state++;						\
  } while (0)
88993dfd 3396
d46c5b12 3397
4ed46869
KH
3398/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3399
b73bfc1c 3400static void
971de7fb 3401decode_coding_iso_2022 (struct coding_system *coding)
4ed46869 3402{
8f924df7
KH
3403 const unsigned char *src = coding->source + coding->consumed;
3404 const unsigned char *src_end = coding->source + coding->src_bytes;
3405 const unsigned char *src_base;
69a80ea3 3406 int *charbuf = coding->charbuf + coding->charbuf_used;
ad1746f5
KH
3407 /* We may produce two annotations (charset and composition) in one
3408 loop and one more charset annotation at the end. */
ff0dacd7 3409 int *charbuf_end
df80c7f0 3410 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
d311d28c 3411 ptrdiff_t consumed_chars = 0, consumed_chars_base;
f10fe38f 3412 bool multibytep = coding->src_multibyte;
4ed46869 3413 /* Charsets invoked to graphic plane 0 and 1 respectively. */
df7492f9
KH
3414 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3415 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
134b9549 3416 int charset_id_2, charset_id_3;
df7492f9
KH
3417 struct charset *charset;
3418 int c;
e951386e 3419 struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
66ebf983 3420 Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
d311d28c
PE
3421 ptrdiff_t char_offset = coding->produced_char;
3422 ptrdiff_t last_offset = char_offset;
ff0dacd7 3423 int last_id = charset_ascii;
f10fe38f
PE
3424 bool eol_dos
3425 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 3426 int byte_after_cr = -1;
e951386e 3427 int i;
df7492f9 3428
df7492f9 3429 setup_iso_safe_charsets (attrs);
1b3b981b 3430 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
b73bfc1c 3431
e951386e
KH
3432 if (cmp_status->state != COMPOSING_NO)
3433 {
15cbd324 3434 if (charbuf_end - charbuf < cmp_status->length)
1088b922 3435 emacs_abort ();
e951386e
KH
3436 for (i = 0; i < cmp_status->length; i++)
3437 *charbuf++ = cmp_status->carryover[i];
3438 coding->annotated = 1;
3439 }
3440
b73bfc1c 3441 while (1)
4ed46869 3442 {
cf299835 3443 int c1, c2, c3;
b73bfc1c
KH
3444
3445 src_base = src;
df7492f9
KH
3446 consumed_chars_base = consumed_chars;
3447
3448 if (charbuf >= charbuf_end)
b71f6f73
KH
3449 {
3450 if (byte_after_cr >= 0)
3451 src_base--;
3452 break;
3453 }
df7492f9 3454
119852e7
KH
3455 if (byte_after_cr >= 0)
3456 c1 = byte_after_cr, byte_after_cr = -1;
3457 else
3458 ONE_MORE_BYTE (c1);
065e3595
KH
3459 if (c1 < 0)
3460 goto invalid_code;
4ed46869 3461
e951386e 3462 if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
4ed46869 3463 {
e951386e
KH
3464 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3465 char_offset++;
3466 CODING_ISO_EXTSEGMENT_LEN (coding)--;
3467 continue;
3468 }
3469
3470 if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3471 {
3472 if (c1 == ISO_CODE_ESC)
ec6d2bb8 3473 {
e951386e
KH
3474 if (src + 1 >= src_end)
3475 goto no_more_source;
3476 *charbuf++ = ISO_CODE_ESC;
3477 char_offset++;
3478 if (src[0] == '%' && src[1] == '@')
df7492f9 3479 {
e951386e
KH
3480 src += 2;
3481 consumed_chars += 2;
3482 char_offset += 2;
3483 /* We are sure charbuf can contain two more chars. */
3484 *charbuf++ = '%';
3485 *charbuf++ = '@';
3486 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
df7492f9 3487 }
4ed46869 3488 }
e951386e
KH
3489 else
3490 {
3491 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3492 char_offset++;
3493 }
3494 continue;
3495 }
3496
3497 if ((cmp_status->state == COMPOSING_RULE
3498 || cmp_status->state == COMPOSING_COMPONENT_RULE)
3499 && c1 != ISO_CODE_ESC)
3500 {
66ebf983 3501 int rule;
e951386e 3502
66ebf983 3503 DECODE_COMPOSITION_RULE (rule);
e951386e
KH
3504 STORE_COMPOSITION_RULE (rule);
3505 continue;
3506 }
3507
3508 /* We produce at most one character. */
3509 switch (iso_code_class [c1])
3510 {
3511 case ISO_0x20_or_0x7F:
df7492f9
KH
3512 if (charset_id_0 < 0
3513 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
781d7a48
KH
3514 /* This is SPACE or DEL. */
3515 charset = CHARSET_FROM_ID (charset_ascii);
3516 else
3517 charset = CHARSET_FROM_ID (charset_id_0);
3518 break;
4ed46869
KH
3519
3520 case ISO_graphic_plane_0:
134b9549
KH
3521 if (charset_id_0 < 0)
3522 charset = CHARSET_FROM_ID (charset_ascii);
3523 else
3524 charset = CHARSET_FROM_ID (charset_id_0);
4ed46869
KH
3525 break;
3526
3527 case ISO_0xA0_or_0xFF:
df7492f9
KH
3528 if (charset_id_1 < 0
3529 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3530 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3531 goto invalid_code;
4ed46869
KH
3532 /* This is a graphic character, we fall down ... */
3533
3534 case ISO_graphic_plane_1:
df7492f9
KH
3535 if (charset_id_1 < 0)
3536 goto invalid_code;
3537 charset = CHARSET_FROM_ID (charset_id_1);
4ed46869
KH
3538 break;
3539
df7492f9 3540 case ISO_control_0:
2735d060 3541 if (eol_dos && c1 == '\r')
119852e7 3542 ONE_MORE_BYTE (byte_after_cr);
df7492f9
KH
3543 MAYBE_FINISH_COMPOSITION ();
3544 charset = CHARSET_FROM_ID (charset_ascii);
4ed46869
KH
3545 break;
3546
df7492f9 3547 case ISO_control_1:
df7492f9
KH
3548 goto invalid_code;
3549
4ed46869 3550 case ISO_shift_out:
df7492f9
KH
3551 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3552 || CODING_ISO_DESIGNATION (coding, 1) < 0)
3553 goto invalid_code;
3554 CODING_ISO_INVOCATION (coding, 0) = 1;
3555 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3556 continue;
4ed46869
KH
3557
3558 case ISO_shift_in:
df7492f9
KH
3559 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3560 goto invalid_code;
3561 CODING_ISO_INVOCATION (coding, 0) = 0;
3562 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3563 continue;
4ed46869
KH
3564
3565 case ISO_single_shift_2_7:
a63dba42
KH
3566 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3567 goto invalid_code;
4ed46869 3568 case ISO_single_shift_2:
df7492f9
KH
3569 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3570 goto invalid_code;
4ed46869
KH
3571 /* SS2 is handled as an escape sequence of ESC 'N' */
3572 c1 = 'N';
3573 goto label_escape_sequence;
3574
3575 case ISO_single_shift_3:
df7492f9
KH
3576 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3577 goto invalid_code;
4ed46869
KH
3578 /* SS3 is handled as an escape sequence of ESC 'O' */
3579 c1 = 'O';
3580 goto label_escape_sequence;
3581
3582 case ISO_control_sequence_introducer:
3583 /* CSI is handled as an escape sequence of ESC '[' ... */
3584 c1 = '[';
3585 goto label_escape_sequence;
3586
3587 case ISO_escape:
3588 ONE_MORE_BYTE (c1);
3589 label_escape_sequence:
df7492f9 3590 /* Escape sequences handled here are invocation,
4ed46869
KH
3591 designation, direction specification, and character
3592 composition specification. */
3593 switch (c1)
3594 {
3595 case '&': /* revision of following character set */
3596 ONE_MORE_BYTE (c1);
3597 if (!(c1 >= '@' && c1 <= '~'))
df7492f9 3598 goto invalid_code;
4ed46869
KH
3599 ONE_MORE_BYTE (c1);
3600 if (c1 != ISO_CODE_ESC)
df7492f9 3601 goto invalid_code;
4ed46869
KH
3602 ONE_MORE_BYTE (c1);
3603 goto label_escape_sequence;
3604
3605 case '$': /* designation of 2-byte character set */
df7492f9
KH
3606 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3607 goto invalid_code;
134b9549
KH
3608 {
3609 int reg, chars96;
3610
3611 ONE_MORE_BYTE (c1);
3612 if (c1 >= '@' && c1 <= 'B')
3613 { /* designation of JISX0208.1978, GB2312.1980,
88993dfd 3614 or JISX0208.1980 */
134b9549
KH
3615 reg = 0, chars96 = 0;
3616 }
3617 else if (c1 >= 0x28 && c1 <= 0x2B)
3618 { /* designation of DIMENSION2_CHARS94 character set */
3619 reg = c1 - 0x28, chars96 = 0;
3620 ONE_MORE_BYTE (c1);
3621 }
3622 else if (c1 >= 0x2C && c1 <= 0x2F)
3623 { /* designation of DIMENSION2_CHARS96 character set */
3624 reg = c1 - 0x2C, chars96 = 1;
3625 ONE_MORE_BYTE (c1);
3626 }
3627 else
3628 goto invalid_code;
3629 DECODE_DESIGNATION (reg, 2, chars96, c1);
3630 /* We must update these variables now. */
3631 if (reg == 0)
3632 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3633 else if (reg == 1)
3634 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3635 if (chars96 < 0)
3636 goto invalid_code;
3637 }
b73bfc1c 3638 continue;
4ed46869
KH
3639
3640 case 'n': /* invocation of locking-shift-2 */
df7492f9
KH
3641 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3642 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3643 goto invalid_code;
3644 CODING_ISO_INVOCATION (coding, 0) = 2;
3645 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3646 continue;
4ed46869
KH
3647
3648 case 'o': /* invocation of locking-shift-3 */
df7492f9
KH
3649 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3650 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3651 goto invalid_code;
3652 CODING_ISO_INVOCATION (coding, 0) = 3;
3653 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3654 continue;
4ed46869
KH
3655
3656 case 'N': /* invocation of single-shift-2 */
df7492f9
KH
3657 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3658 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3659 goto invalid_code;
134b9549
KH
3660 charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3661 if (charset_id_2 < 0)
3662 charset = CHARSET_FROM_ID (charset_ascii);
3663 else
3664 charset = CHARSET_FROM_ID (charset_id_2);
b73bfc1c 3665 ONE_MORE_BYTE (c1);
e7046a18 3666 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
df7492f9 3667 goto invalid_code;
4ed46869
KH
3668 break;
3669
3670 case 'O': /* invocation of single-shift-3 */
df7492f9
KH
3671 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3672 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3673 goto invalid_code;
134b9549
KH
3674 charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3675 if (charset_id_3 < 0)
3676 charset = CHARSET_FROM_ID (charset_ascii);
3677 else
3678 charset = CHARSET_FROM_ID (charset_id_3);
b73bfc1c 3679 ONE_MORE_BYTE (c1);
e7046a18 3680 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
df7492f9 3681 goto invalid_code;
4ed46869
KH
3682 break;
3683
ec6d2bb8 3684 case '0': case '2': case '3': case '4': /* start composition */
df7492f9
KH
3685 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3686 goto invalid_code;
e951386e
KH
3687 if (last_id != charset_ascii)
3688 {
3689 ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3690 last_id = charset_ascii;
3691 last_offset = char_offset;
3692 }
ec6d2bb8 3693 DECODE_COMPOSITION_START (c1);
b73bfc1c 3694 continue;
4ed46869 3695
ec6d2bb8 3696 case '1': /* end composition */
e951386e 3697 if (cmp_status->state == COMPOSING_NO)
df7492f9
KH
3698 goto invalid_code;
3699 DECODE_COMPOSITION_END ();
b73bfc1c 3700 continue;
4ed46869
KH
3701
3702 case '[': /* specification of direction */
de59072a 3703 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
df7492f9 3704 goto invalid_code;
4ed46869 3705 /* For the moment, nested direction is not supported.
d46c5b12 3706 So, `coding->mode & CODING_MODE_DIRECTION' zero means
ad1746f5 3707 left-to-right, and nonzero means right-to-left. */
4ed46869
KH
3708 ONE_MORE_BYTE (c1);
3709 switch (c1)
3710 {
3711 case ']': /* end of the current direction */
d46c5b12 3712 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869
KH
3713
3714 case '0': /* end of the current direction */
3715 case '1': /* start of left-to-right direction */
3716 ONE_MORE_BYTE (c1);
3717 if (c1 == ']')
d46c5b12 3718 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869 3719 else
df7492f9 3720 goto invalid_code;
4ed46869
KH
3721 break;
3722
3723 case '2': /* start of right-to-left direction */
3724 ONE_MORE_BYTE (c1);
3725 if (c1 == ']')
d46c5b12 3726 coding->mode |= CODING_MODE_DIRECTION;
4ed46869 3727 else
df7492f9 3728 goto invalid_code;
4ed46869
KH
3729 break;
3730
3731 default:
df7492f9 3732 goto invalid_code;
4ed46869 3733 }
b73bfc1c 3734 continue;
4ed46869 3735
103e0180 3736 case '%':
103e0180
KH
3737 ONE_MORE_BYTE (c1);
3738 if (c1 == '/')
3739 {
3740 /* CTEXT extended segment:
3741 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3742 We keep these bytes as is for the moment.
3743 They may be decoded by post-read-conversion. */
3744 int dim, M, L;
4776e638 3745 int size;
8f924df7 3746
103e0180 3747 ONE_MORE_BYTE (dim);
7a84eee5 3748 if (dim < '0' || dim > '4')
e951386e 3749 goto invalid_code;
103e0180 3750 ONE_MORE_BYTE (M);
e951386e
KH
3751 if (M < 128)
3752 goto invalid_code;
103e0180 3753 ONE_MORE_BYTE (L);
e951386e
KH
3754 if (L < 128)
3755 goto invalid_code;
103e0180 3756 size = ((M - 128) * 128) + (L - 128);
e951386e 3757 if (charbuf + 6 > charbuf_end)
4776e638
KH
3758 goto break_loop;
3759 *charbuf++ = ISO_CODE_ESC;
3760 *charbuf++ = '%';
3761 *charbuf++ = '/';
3762 *charbuf++ = dim;
3763 *charbuf++ = BYTE8_TO_CHAR (M);
3764 *charbuf++ = BYTE8_TO_CHAR (L);
e951386e 3765 CODING_ISO_EXTSEGMENT_LEN (coding) = size;
103e0180
KH
3766 }
3767 else if (c1 == 'G')
3768 {
103e0180
KH
3769 /* XFree86 extension for embedding UTF-8 in CTEXT:
3770 ESC % G --UTF-8-BYTES-- ESC % @
3771 We keep these bytes as is for the moment.
3772 They may be decoded by post-read-conversion. */
e951386e 3773 if (charbuf + 3 > charbuf_end)
4776e638 3774 goto break_loop;
e951386e
KH
3775 *charbuf++ = ISO_CODE_ESC;
3776 *charbuf++ = '%';
3777 *charbuf++ = 'G';
3778 CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
103e0180
KH
3779 }
3780 else
4776e638 3781 goto invalid_code;
103e0180 3782 continue;
4776e638 3783 break;
103e0180 3784
4ed46869 3785 default:
df7492f9
KH
3786 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3787 goto invalid_code;
134b9549
KH
3788 {
3789 int reg, chars96;
3790
3791 if (c1 >= 0x28 && c1 <= 0x2B)
3792 { /* designation of DIMENSION1_CHARS94 character set */
3793 reg = c1 - 0x28, chars96 = 0;
3794 ONE_MORE_BYTE (c1);
3795 }
3796 else if (c1 >= 0x2C && c1 <= 0x2F)
3797 { /* designation of DIMENSION1_CHARS96 character set */
3798 reg = c1 - 0x2C, chars96 = 1;
3799 ONE_MORE_BYTE (c1);
3800 }
3801 else
3802 goto invalid_code;
3803 DECODE_DESIGNATION (reg, 1, chars96, c1);
3804 /* We must update these variables now. */
3805 if (reg == 0)
3806 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3807 else if (reg == 1)
3808 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3809 if (chars96 < 0)
3810 goto invalid_code;
3811 }
b73bfc1c 3812 continue;
4ed46869 3813 }
413bb2db
PE
3814 break;
3815
3816 default:
1088b922 3817 emacs_abort ();
b73bfc1c 3818 }
4ed46869 3819
e951386e
KH
3820 if (cmp_status->state == COMPOSING_NO
3821 && charset->id != charset_ascii
ff0dacd7
KH
3822 && last_id != charset->id)
3823 {
3824 if (last_id != charset_ascii)
69a80ea3 3825 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
ff0dacd7
KH
3826 last_id = charset->id;
3827 last_offset = char_offset;
3828 }
3829
b73bfc1c 3830 /* Now we know CHARSET and 1st position code C1 of a character.
cf299835
KH
3831 Produce a decoded character while getting 2nd and 3rd
3832 position codes C2, C3 if necessary. */
df7492f9 3833 if (CHARSET_DIMENSION (charset) > 1)
b73bfc1c
KH
3834 {
3835 ONE_MORE_BYTE (c2);
cf299835
KH
3836 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3837 || ((c1 & 0x80) != (c2 & 0x80)))
b73bfc1c 3838 /* C2 is not in a valid range. */
df7492f9 3839 goto invalid_code;
cf299835
KH
3840 if (CHARSET_DIMENSION (charset) == 2)
3841 c1 = (c1 << 8) | c2;
3842 else
df7492f9 3843 {
cf299835
KH
3844 ONE_MORE_BYTE (c3);
3845 if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3846 || ((c1 & 0x80) != (c3 & 0x80)))
3847 /* C3 is not in a valid range. */
df7492f9 3848 goto invalid_code;
cf299835 3849 c1 = (c1 << 16) | (c2 << 8) | c2;
df7492f9
KH
3850 }
3851 }
cf299835 3852 c1 &= 0x7F7F7F;
df7492f9
KH
3853 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3854 if (c < 0)
3855 {
3856 MAYBE_FINISH_COMPOSITION ();
3857 for (; src_base < src; src_base++, char_offset++)
3858 {
3859 if (ASCII_BYTE_P (*src_base))
3860 *charbuf++ = *src_base;
3861 else
3862 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3863 }
3864 }
e951386e 3865 else if (cmp_status->state == COMPOSING_NO)
df7492f9
KH
3866 {
3867 *charbuf++ = c;
3868 char_offset++;
4ed46869 3869 }
e951386e
KH
3870 else if ((cmp_status->state == COMPOSING_CHAR
3871 ? cmp_status->nchars
3872 : cmp_status->ncomps)
3873 >= MAX_COMPOSITION_COMPONENTS)
781d7a48 3874 {
e951386e
KH
3875 /* Too long composition. */
3876 MAYBE_FINISH_COMPOSITION ();
3877 *charbuf++ = c;
3878 char_offset++;
4ed46869 3879 }
e951386e
KH
3880 else
3881 STORE_COMPOSITION_CHAR (c);
4ed46869
KH
3882 continue;
3883
df7492f9
KH
3884 invalid_code:
3885 MAYBE_FINISH_COMPOSITION ();
4ed46869 3886 src = src_base;
df7492f9
KH
3887 consumed_chars = consumed_chars_base;
3888 ONE_MORE_BYTE (c);
065e3595 3889 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 3890 char_offset++;
df7492f9 3891 coding->errors++;
4776e638
KH
3892 continue;
3893
3894 break_loop:
3895 break;
4ed46869 3896 }
fb88bf2d 3897
df7492f9 3898 no_more_source:
e951386e
KH
3899 if (cmp_status->state != COMPOSING_NO)
3900 {
3901 if (coding->mode & CODING_MODE_LAST_BLOCK)
3902 MAYBE_FINISH_COMPOSITION ();
3903 else
3904 {
3905 charbuf -= cmp_status->length;
3906 for (i = 0; i < cmp_status->length; i++)
3907 cmp_status->carryover[i] = charbuf[i];
3908 }
3909 }
3910 else if (last_id != charset_ascii)
69a80ea3 3911 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
3912 coding->consumed_char += consumed_chars_base;
3913 coding->consumed = src_base - coding->source;
3914 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
3915}
3916
b73bfc1c 3917
f4dee582 3918/* ISO2022 encoding stuff. */
4ed46869
KH
3919
3920/*
f4dee582 3921 It is not enough to say just "ISO2022" on encoding, we have to
df7492f9 3922 specify more details. In Emacs, each coding system of ISO2022
4ed46869 3923 variant has the following specifications:
df7492f9 3924 1. Initial designation to G0 thru G3.
4ed46869
KH
3925 2. Allows short-form designation?
3926 3. ASCII should be designated to G0 before control characters?
3927 4. ASCII should be designated to G0 at end of line?
3928 5. 7-bit environment or 8-bit environment?
3929 6. Use locking-shift?
3930 7. Use Single-shift?
3931 And the following two are only for Japanese:
3932 8. Use ASCII in place of JIS0201-1976-Roman?
3933 9. Use JISX0208-1983 in place of JISX0208-1978?
df7492f9
KH
3934 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3935 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
f4dee582 3936 details.
4ed46869
KH
3937*/
3938
/* Emit at DST (advancing DST) the escape sequence that designates
   CHARSET to graphic register REG, then record that designation in
   CODING.

   When CODING has CODING_ISO_FLAG_REVISION set and CHARSET has a
   non-negative ISO revision, the sequence is preceded by
   "ESC & <'@' + revision>".  For a charset of dimension other than 1
   the sequence contains '$'; in that case, if CHARSET is a 94-charset
   designated to register 0 whose <final-char> is '@', 'A', or 'B',
   the intermediate byte is omitted (short form) unless CODING has
   CODING_ISO_FLAG_LONG_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    /* Intermediate bytes, indexed by graphic register number.  */      \
    const char *inter_94 = "()*+";                                      \
    const char *inter_96 = ",-./";                                      \
    const char *inter                                                   \
      = CHARSET_ISO_CHARS_96 (charset) ? inter_96 : inter_94;           \
    unsigned char final_byte = CHARSET_ISO_FINAL (charset);             \
    int iso_rev                                                         \
      = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)         \
         ? CHARSET_ISO_REVISION (charset) : -1);                        \
                                                                        \
    if (iso_rev >= 0)                                                   \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + iso_rev);                                  \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        /* Only the short form drops the intermediate byte.  */         \
        if (CHARSET_ISO_CHARS_96 (charset)                              \
            || (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM)  \
            || (reg) != 0                                               \
            || final_byte < '@' || final_byte > 'B')                    \
          EMIT_ONE_ASCII_BYTE (inter[reg]);                             \
      }                                                                 \
    else                                                                \
      EMIT_ONE_ASCII_BYTE (inter[reg]);                                 \
    EMIT_ONE_ASCII_BYTE (final_byte);                                   \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
3986
df7492f9 3987
4ed46869
KH
/* ISO2022 single-shift functions (single-shift-2 and single-shift-3).
   Each macro emits either the two-byte escape sequence ("ESC N" or
   "ESC O"), when CODING has CODING_ISO_FLAG_SEVEN_BITS set, or the
   single control code ISO_CODE_SS2 / ISO_CODE_SS3 otherwise, and then
   marks CODING as single-shifting.  */

#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)


#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4010
df7492f9 4011
4ed46869
KH
/* The following four macros produce codes (control character or
   escape sequence) for ISO2022 locking-shift functions (shift-in,
   shift-out, locking-shift-2, and locking-shift-3).  Each one also
   records in CODING the graphic register (0..3) that is now invoked
   to graphic plane 0.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
  } while (0)


#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
  } while (0)


/* Locking-shift-2 is "ESC n".  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
  } while (0)


/* Locking-shift-3 is "ESC o".  It must differ from locking-shift-2
   ("ESC n"); the ISO2022 decoder in this file invokes graphic
   register 3 on "ESC o" and register 2 on "ESC n".  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4042
df7492f9 4043
f4dee582
RS
/* Emit the code for a DIMENSION1 character of charset CHARSET whose
   position-code is C1.  If CHARSET is not currently invoked to graphic
   plane 0 or 1, encode_invocation_designation is called to produce
   the needed designation and invocation sequences, and the loop is
   repeated until the character itself can be emitted.

   When CODING has CODING_ISO_FLAG_USE_ROMAN set and CHARSET is ASCII,
   CHARSET is reassigned to charset_jisx0201_roman, so CHARSET must be
   an lvalue.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int cs_id = CHARSET_ID (charset);                                   \
                                                                        \
    if (cs_id == charset_ascii                                          \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN))     \
      {                                                                 \
        cs_id = charset_jisx0201_roman;                                 \
        charset = CHARSET_FROM_ID (cs_id);                              \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* This character consumes the pending single shift.  */        \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 0))                \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 1))                \
      {                                                                 \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to either graphic plane yet.  Invoke     \
       it (designating it to some graphic register first if needed)    \
       and go round again to emit the character.  */                   \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
4086
df7492f9 4087
f4dee582
RS
/* Emit the codes for a DIMENSION2 character of charset CHARSET whose
   position-codes are C1 and C2.  If CHARSET is not currently invoked
   to graphic plane 0 or 1, encode_invocation_designation is called to
   produce the needed designation and invocation sequences, and the
   loop is repeated until the character itself can be emitted.

   When CODING has CODING_ISO_FLAG_USE_OLDJIS set and CHARSET is
   charset_jisx0208, CHARSET is reassigned to charset_jisx0208_1978,
   so CHARSET must be an lvalue.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int cs_id = CHARSET_ID (charset);                                   \
                                                                        \
    if (cs_id == charset_jisx0208                                       \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        cs_id = charset_jisx0208_1978;                                  \
        charset = CHARSET_FROM_ID (cs_id);                              \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* This character consumes the pending single shift.  */        \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        else                                                            \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 0))                \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
    if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 1))                \
      {                                                                 \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to either graphic plane yet.  Invoke     \
       it (designating it to some graphic register first if needed)    \
       and go round again to emit the character.  */                   \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
4130
05e6f5dc 4131
df7492f9
KH
/* Encode character C as a member of charset CHARSET.  The code point
   is obtained with CODING_ENCODE_CHAR and emitted through the
   DIMENSION1 macro when CHARSET has dimension 1; otherwise it is
   passed to the DIMENSION2 macro as `code >> 8' and `code & 0xFF'.  */

#define ENCODE_ISO_CHARACTER(charset, c)                                \
  do {                                                                  \
    unsigned int code;                                                  \
                                                                        \
    CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);    \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8,            \
                                       code & 0xFF);                    \
    else                                                                \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                \
  } while (0)
bdd9fb48 4142
05e6f5dc 4143
4ed46869 4144/* Produce designation and invocation codes at a place pointed by DST
df7492f9 4145 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
4ed46869
KH
4146 Return new DST. */
4147
e2f1bab9 4148static unsigned char *
cf84bb53
JB
4149encode_invocation_designation (struct charset *charset,
4150 struct coding_system *coding,
d311d28c 4151 unsigned char *dst, ptrdiff_t *p_nchars)
4ed46869 4152{
f10fe38f 4153 bool multibytep = coding->dst_multibyte;
d311d28c 4154 ptrdiff_t produced_chars = *p_nchars;
4ed46869 4155 int reg; /* graphic register number */
df7492f9 4156 int id = CHARSET_ID (charset);
4ed46869
KH
4157
4158 /* At first, check designations. */
4159 for (reg = 0; reg < 4; reg++)
df7492f9 4160 if (id == CODING_ISO_DESIGNATION (coding, reg))
4ed46869
KH
4161 break;
4162
4163 if (reg >= 4)
4164 {
4165 /* CHARSET is not yet designated to any graphic registers. */
4166 /* At first check the requested designation. */
df7492f9
KH
4167 reg = CODING_ISO_REQUEST (coding, id);
4168 if (reg < 0)
1ba9e4ab
KH
4169 /* Since CHARSET requests no special designation, designate it
4170 to graphic register 0. */
4ed46869
KH
4171 reg = 0;
4172
4173 ENCODE_DESIGNATION (charset, reg, coding);
4174 }
4175
df7492f9
KH
4176 if (CODING_ISO_INVOCATION (coding, 0) != reg
4177 && CODING_ISO_INVOCATION (coding, 1) != reg)
4ed46869
KH
4178 {
4179 /* Since the graphic register REG is not invoked to any graphic
4180 planes, invoke it to graphic plane 0. */
4181 switch (reg)
4182 {
4183 case 0: /* graphic register 0 */
4184 ENCODE_SHIFT_IN;
4185 break;
4186
4187 case 1: /* graphic register 1 */
4188 ENCODE_SHIFT_OUT;
4189 break;
4190
4191 case 2: /* graphic register 2 */
df7492f9 4192 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4ed46869
KH
4193 ENCODE_SINGLE_SHIFT_2;
4194 else
4195 ENCODE_LOCKING_SHIFT_2;
4196 break;
4197
4198 case 3: /* graphic register 3 */
df7492f9 4199 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4ed46869
KH
4200 ENCODE_SINGLE_SHIFT_3;
4201 else
4202 ENCODE_LOCKING_SHIFT_3;
4203 break;
4204 }
4205 }
b73bfc1c 4206
df7492f9 4207 *p_nchars = produced_chars;
4ed46869
KH
4208 return dst;
4209}
4210
4ed46869
KH
4211
/* Produce the codes that bring the graphic planes and registers back
   to their initial state: a shift-in if graphic plane 0 does not
   currently invoke register 0, followed by a designation for every
   register whose current designation differs from its (non-negative)
   initial one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reg;                                                            \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    for (reg = 0; reg < 4; reg++)                                       \
      {                                                                 \
        int initial_id = CODING_ISO_INITIAL (coding, reg);              \
        struct charset *charset;                                        \
                                                                        \
        /* Nothing to do when the register has no initial charset     \
           or already holds it.  */                                    \
        if (initial_id < 0                                              \
            || CODING_ISO_DESIGNATION (coding, reg) == initial_id)      \
          continue;                                                     \
        charset = CHARSET_FROM_ID (initial_id);                         \
        ENCODE_DESIGNATION (charset, reg, coding);                      \
      }                                                                 \
  } while (0)
4230
df7492f9 4231
bdd9fb48 4232/* Produce designation sequences of charsets in the line started from
5eb05ea3
KH
4233 CHARBUF to a place pointed by DST, and return the number of
4234 produced bytes. DST should not directly point a buffer text area
4235 which may be relocated by char_charset call.
bdd9fb48
KH
4236
4237 If the current block ends before any end-of-line, we may fail to
d46c5b12
KH
4238 find all the necessary designations. */
4239
6e6c82a4 4240static ptrdiff_t
5eb05ea3
KH
4241encode_designation_at_bol (struct coding_system *coding,
4242 int *charbuf, int *charbuf_end,
461c2ab9 4243 unsigned char *dst)
e0e989f6 4244{
75a3b399 4245 unsigned char *orig = dst;
df7492f9 4246 struct charset *charset;
bdd9fb48
KH
4247 /* Table of charsets to be designated to each graphic register. */
4248 int r[4];
df7492f9 4249 int c, found = 0, reg;
d311d28c 4250 ptrdiff_t produced_chars = 0;
f10fe38f 4251 bool multibytep = coding->dst_multibyte;
df7492f9
KH
4252 Lisp_Object attrs;
4253 Lisp_Object charset_list;
4254
4255 attrs = CODING_ID_ATTRS (coding->id);
4256 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4257 if (EQ (charset_list, Qiso_2022))
4258 charset_list = Viso_2022_charset_list;
bdd9fb48
KH
4259
4260 for (reg = 0; reg < 4; reg++)
4261 r[reg] = -1;
4262
5eb05ea3 4263 while (charbuf < charbuf_end && found < 4)
e0e989f6 4264 {
df7492f9
KH
4265 int id;
4266
4267 c = *charbuf++;
b73bfc1c
KH
4268 if (c == '\n')
4269 break;
df7492f9
KH
4270 charset = char_charset (c, charset_list, NULL);
4271 id = CHARSET_ID (charset);
4272 reg = CODING_ISO_REQUEST (coding, id);
4273 if (reg >= 0 && r[reg] < 0)
bdd9fb48
KH
4274 {
4275 found++;
df7492f9 4276 r[reg] = id;
bdd9fb48 4277 }
bdd9fb48
KH
4278 }
4279
4280 if (found)
4281 {
4282 for (reg = 0; reg < 4; reg++)
4283 if (r[reg] >= 0
df7492f9
KH
4284 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4285 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
e0e989f6 4286 }
b73bfc1c 4287
5eb05ea3 4288 return dst - orig;
e0e989f6
KH
4289}
4290
4ed46869
KH
4291/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
4292
f10fe38f 4293static bool
971de7fb 4294encode_coding_iso_2022 (struct coding_system *coding)
4ed46869 4295{
f10fe38f 4296 bool multibytep = coding->dst_multibyte;
df7492f9
KH
4297 int *charbuf = coding->charbuf;
4298 int *charbuf_end = charbuf + coding->charbuf_used;
4299 unsigned char *dst = coding->destination + coding->produced;
4300 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4301 int safe_room = 16;
f10fe38f 4302 bool bol_designation
df7492f9
KH
4303 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4304 && CODING_ISO_BOL (coding));
d311d28c 4305 ptrdiff_t produced_chars = 0;
df7492f9 4306 Lisp_Object attrs, eol_type, charset_list;
f10fe38f 4307 bool ascii_compatible;
b73bfc1c 4308 int c;
ff0dacd7 4309 int preferred_charset_id = -1;
05e6f5dc 4310
24a73b0a 4311 CODING_GET_INFO (coding, attrs, charset_list);
0a9564cb 4312 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
24a73b0a
KH
4313 if (VECTORP (eol_type))
4314 eol_type = Qunix;
4315
004068e4 4316 setup_iso_safe_charsets (attrs);
ff0dacd7 4317 /* Charset list may have been changed. */
287c57d7 4318 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
1b3b981b 4319 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
0eecad43 4320
a552b35a
KH
4321 ascii_compatible
4322 = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4323 && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4324 | CODING_ISO_FLAG_LOCKING_SHIFT)));
bdd9fb48 4325
df7492f9 4326 while (charbuf < charbuf_end)
4ed46869 4327 {
df7492f9 4328 ASSURE_DESTINATION (safe_room);
b73bfc1c 4329
df7492f9 4330 if (bol_designation)
b73bfc1c 4331 {
bdd9fb48 4332 /* We have to produce designation sequences if any now. */
5eb05ea3
KH
4333 unsigned char desig_buf[16];
4334 int nbytes;
8f50130c 4335 ptrdiff_t offset;
5eb05ea3
KH
4336
4337 charset_map_loaded = 0;
4338 nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4339 desig_buf);
4340 if (charset_map_loaded
c1892f11 4341 && (offset = coding_change_destination (coding)))
5eb05ea3
KH
4342 {
4343 dst += offset;
4344 dst_end += offset;
4345 }
4346 memcpy (dst, desig_buf, nbytes);
4347 dst += nbytes;
df7492f9 4348 /* We are sure that designation sequences are all ASCII bytes. */
5eb05ea3
KH
4349 produced_chars += nbytes;
4350 bol_designation = 0;
4351 ASSURE_DESTINATION (safe_room);
e0e989f6
KH
4352 }
4353
df7492f9 4354 c = *charbuf++;
ec6d2bb8 4355
ff0dacd7
KH
4356 if (c < 0)
4357 {
4358 /* Handle an annotation. */
4359 switch (*charbuf)
ec6d2bb8 4360 {
ff0dacd7
KH
4361 case CODING_ANNOTATE_COMPOSITION_MASK:
4362 /* Not yet implemented. */
4363 break;
4364 case CODING_ANNOTATE_CHARSET_MASK:
cf7dfdf5 4365 preferred_charset_id = charbuf[2];
ff0dacd7
KH
4366 if (preferred_charset_id >= 0
4367 && NILP (Fmemq (make_number (preferred_charset_id),
4368 charset_list)))
4369 preferred_charset_id = -1;
4370 break;
4371 default:
1088b922 4372 emacs_abort ();
4ed46869 4373 }
ff0dacd7
KH
4374 charbuf += -c - 1;
4375 continue;
4ed46869 4376 }
ec6d2bb8 4377
b73bfc1c
KH
4378 /* Now encode the character C. */
4379 if (c < 0x20 || c == 0x7F)
4380 {
df7492f9
KH
4381 if (c == '\n'
4382 || (c == '\r' && EQ (eol_type, Qmac)))
19a8d9e0 4383 {
df7492f9
KH
4384 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4385 ENCODE_RESET_PLANE_AND_REGISTER ();
4386 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
b73bfc1c 4387 {
df7492f9
KH
4388 int i;
4389
4390 for (i = 0; i < 4; i++)
4391 CODING_ISO_DESIGNATION (coding, i)
4392 = CODING_ISO_INITIAL (coding, i);
b73bfc1c 4393 }
f10fe38f
PE
4394 bol_designation = ((CODING_ISO_FLAGS (coding)
4395 & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4396 != 0);
19a8d9e0 4397 }
df7492f9
KH
4398 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4399 ENCODE_RESET_PLANE_AND_REGISTER ();
4400 EMIT_ONE_ASCII_BYTE (c);
4ed46869 4401 }
df7492f9 4402 else if (ASCII_CHAR_P (c))
88993dfd 4403 {
df7492f9
KH
4404 if (ascii_compatible)
4405 EMIT_ONE_ASCII_BYTE (c);
93dec019 4406 else
19a8d9e0 4407 {
bf16eb23
KH
4408 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4409 ENCODE_ISO_CHARACTER (charset, c);
19a8d9e0 4410 }
4ed46869 4411 }
16eafb5d 4412 else if (CHAR_BYTE8_P (c))
88993dfd 4413 {
16eafb5d
KH
4414 c = CHAR_TO_BYTE8 (c);
4415 EMIT_ONE_BYTE (c);
88993dfd 4416 }
b73bfc1c 4417 else
df7492f9 4418 {
ff0dacd7 4419 struct charset *charset;
b73bfc1c 4420
ff0dacd7
KH
4421 if (preferred_charset_id >= 0)
4422 {
f10fe38f 4423 bool result;
5eb05ea3 4424
ff0dacd7 4425 charset = CHARSET_FROM_ID (preferred_charset_id);
5eb05ea3
KH
4426 CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4427 if (! result)
4428 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4429 NULL, charset);
ff0dacd7
KH
4430 }
4431 else
5eb05ea3
KH
4432 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4433 NULL, charset);
df7492f9
KH
4434 if (!charset)
4435 {
41cbe562
KH
4436 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4437 {
4438 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4439 charset = CHARSET_FROM_ID (charset_ascii);
4440 }
4441 else
4442 {
4443 c = coding->default_char;
5eb05ea3
KH
4444 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4445 charset_list, NULL, charset);
41cbe562 4446 }
df7492f9
KH
4447 }
4448 ENCODE_ISO_CHARACTER (charset, c);
4449 }
84fbb8a0 4450 }
b73bfc1c 4451
df7492f9
KH
4452 if (coding->mode & CODING_MODE_LAST_BLOCK
4453 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4454 {
4455 ASSURE_DESTINATION (safe_room);
4456 ENCODE_RESET_PLANE_AND_REGISTER ();
4457 }
065e3595 4458 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4459 CODING_ISO_BOL (coding) = bol_designation;
4460 coding->produced_char += produced_chars;
4461 coding->produced = dst - coding->destination;
4462 return 0;
4ed46869
KH
4463}
4464
4465\f
df7492f9 4466/*** 8,9. SJIS and BIG5 handlers ***/
4ed46869 4467
df7492f9 4468/* Although SJIS and BIG5 are not ISO's coding system, they are used
4ed46869
KH
4469 quite widely. So, for the moment, Emacs supports them in the bare
4470 C code. But, in the future, they may be supported only by CCL. */
4471
4472/* SJIS is a coding system encoding three character sets: ASCII, right
4473 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
4474 as is. A character of charset katakana-jisx0201 is encoded by
4475 "position-code + 0x80". A character of charset japanese-jisx0208
4476 is encoded in 2-byte but two position-codes are divided and shifted
df7492f9 4477 so that they fit in the range below.
4ed46869
KH
4478
4479 --- CODE RANGE of SJIS ---
4480 (character set) (range)
4481 ASCII 0x00 .. 0x7F
df7492f9 4482 KATAKANA-JISX0201 0xA0 .. 0xDF
c28a9453 4483 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
d14d03ac 4484 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4ed46869
KH
4485 -------------------------------
4486
4487*/
4488
4489/* BIG5 is a coding system encoding two character sets: ASCII and
4490 Big5. An ASCII character is encoded as is. Big5 is a two-byte
df7492f9 4491 character set and is encoded in two-byte.
4ed46869
KH
4492
4493 --- CODE RANGE of BIG5 ---
4494 (character set) (range)
4495 ASCII 0x00 .. 0x7F
4496 Big5 (1st byte) 0xA1 .. 0xFE
4497 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
4498 --------------------------
4499
df7492f9 4500 */
4ed46869
KH
4501
4502/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
f10fe38f 4503 Return true if a text is encoded in SJIS. */
4ed46869 4504
f10fe38f 4505static bool
cf84bb53
JB
4506detect_coding_sjis (struct coding_system *coding,
4507 struct coding_detection_info *detect_info)
4ed46869 4508{
065e3595 4509 const unsigned char *src = coding->source, *src_base;
8f924df7 4510 const unsigned char *src_end = coding->source + coding->src_bytes;
f10fe38f 4511 bool multibytep = coding->src_multibyte;
d311d28c 4512 ptrdiff_t consumed_chars = 0;
df7492f9 4513 int found = 0;
b73bfc1c 4514 int c;
f07190ca
KH
4515 Lisp_Object attrs, charset_list;
4516 int max_first_byte_of_2_byte_code;
4517
4518 CODING_GET_INFO (coding, attrs, charset_list);
4519 max_first_byte_of_2_byte_code
4520 = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
df7492f9 4521
ff0dacd7 4522 detect_info->checked |= CATEGORY_MASK_SJIS;
df7492f9
KH
4523 /* A coding system of this category is always ASCII compatible. */
4524 src += coding->head_ascii;
4ed46869 4525
b73bfc1c 4526 while (1)
4ed46869 4527 {
065e3595 4528 src_base = src;
df7492f9 4529 ONE_MORE_BYTE (c);
682169fe
KH
4530 if (c < 0x80)
4531 continue;
f07190ca
KH
4532 if ((c >= 0x81 && c <= 0x9F)
4533 || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4ed46869 4534 {
df7492f9 4535 ONE_MORE_BYTE (c);
682169fe 4536 if (c < 0x40 || c == 0x7F || c > 0xFC)
df7492f9 4537 break;
ff0dacd7 4538 found = CATEGORY_MASK_SJIS;
4ed46869 4539 }
df7492f9 4540 else if (c >= 0xA0 && c < 0xE0)
ff0dacd7 4541 found = CATEGORY_MASK_SJIS;
df7492f9
KH
4542 else
4543 break;
4ed46869 4544 }
ff0dacd7 4545 detect_info->rejected |= CATEGORY_MASK_SJIS;
df7492f9
KH
4546 return 0;
4547
4548 no_more_source:
065e3595 4549 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
89528eb3 4550 {
ff0dacd7 4551 detect_info->rejected |= CATEGORY_MASK_SJIS;
89528eb3 4552 return 0;
4ed46869 4553 }
ff0dacd7
KH
4554 detect_info->found |= found;
4555 return 1;
4ed46869
KH
4556}
4557
4558/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
f10fe38f 4559 Return true if a text is encoded in BIG5. */
4ed46869 4560
f10fe38f 4561static bool
cf84bb53
JB
4562detect_coding_big5 (struct coding_system *coding,
4563 struct coding_detection_info *detect_info)
4ed46869 4564{
065e3595 4565 const unsigned char *src = coding->source, *src_base;
8f924df7 4566 const unsigned char *src_end = coding->source + coding->src_bytes;
f10fe38f 4567 bool multibytep = coding->src_multibyte;
d311d28c 4568 ptrdiff_t consumed_chars = 0;
df7492f9 4569 int found = 0;
b73bfc1c 4570 int c;
fa42c37f 4571
ff0dacd7 4572 detect_info->checked |= CATEGORY_MASK_BIG5;
df7492f9
KH
4573 /* A coding system of this category is always ASCII compatible. */
4574 src += coding->head_ascii;
fa42c37f 4575
b73bfc1c 4576 while (1)
fa42c37f 4577 {
065e3595 4578 src_base = src;
df7492f9
KH
4579 ONE_MORE_BYTE (c);
4580 if (c < 0x80)
fa42c37f 4581 continue;
df7492f9 4582 if (c >= 0xA1)
fa42c37f 4583 {
df7492f9
KH
4584 ONE_MORE_BYTE (c);
4585 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
fa42c37f 4586 return 0;
ff0dacd7 4587 found = CATEGORY_MASK_BIG5;
fa42c37f 4588 }
df7492f9
KH
4589 else
4590 break;
fa42c37f 4591 }
ff0dacd7 4592 detect_info->rejected |= CATEGORY_MASK_BIG5;
fa42c37f 4593 return 0;
fa42c37f 4594
df7492f9 4595 no_more_source:
065e3595 4596 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
89528eb3 4597 {
ff0dacd7 4598 detect_info->rejected |= CATEGORY_MASK_BIG5;
89528eb3
KH
4599 return 0;
4600 }
ff0dacd7
KH
4601 detect_info->found |= found;
4602 return 1;
fa42c37f
KH
4603}
4604
f10fe38f 4605/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
fa42c37f 4606
b73bfc1c 4607static void
971de7fb 4608decode_coding_sjis (struct coding_system *coding)
4ed46869 4609{
8f924df7
KH
4610 const unsigned char *src = coding->source + coding->consumed;
4611 const unsigned char *src_end = coding->source + coding->src_bytes;
4612 const unsigned char *src_base;
69a80ea3 4613 int *charbuf = coding->charbuf + coding->charbuf_used;
ad1746f5 4614 /* We may produce one charset annotation in one loop and one more at
df80c7f0 4615 the end. */
69a80ea3 4616 int *charbuf_end
df80c7f0 4617 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
d311d28c 4618 ptrdiff_t consumed_chars = 0, consumed_chars_base;
f10fe38f 4619 bool multibytep = coding->src_multibyte;
df7492f9 4620 struct charset *charset_roman, *charset_kanji, *charset_kana;
57a47f8a 4621 struct charset *charset_kanji2;
24a73b0a 4622 Lisp_Object attrs, charset_list, val;
d311d28c
PE
4623 ptrdiff_t char_offset = coding->produced_char;
4624 ptrdiff_t last_offset = char_offset;
ff0dacd7 4625 int last_id = charset_ascii;
f10fe38f
PE
4626 bool eol_dos
4627 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 4628 int byte_after_cr = -1;
a5d301df 4629
24a73b0a 4630 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4631
4632 val = charset_list;
4633 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
89528eb3 4634 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
57a47f8a
KH
4635 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4636 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
fa42c37f 4637
b73bfc1c 4638 while (1)
4ed46869 4639 {
df7492f9 4640 int c, c1;
24a73b0a 4641 struct charset *charset;
fa42c37f 4642
b73bfc1c 4643 src_base = src;
df7492f9 4644 consumed_chars_base = consumed_chars;
fa42c37f 4645
df7492f9 4646 if (charbuf >= charbuf_end)
b71f6f73
KH
4647 {
4648 if (byte_after_cr >= 0)
4649 src_base--;
4650 break;
4651 }
df7492f9 4652
119852e7
KH
4653 if (byte_after_cr >= 0)
4654 c = byte_after_cr, byte_after_cr = -1;
4655 else
4656 ONE_MORE_BYTE (c);
065e3595
KH
4657 if (c < 0)
4658 goto invalid_code;
24a73b0a 4659 if (c < 0x80)
119852e7 4660 {
2735d060 4661 if (eol_dos && c == '\r')
119852e7
KH
4662 ONE_MORE_BYTE (byte_after_cr);
4663 charset = charset_roman;
4664 }
57a47f8a 4665 else if (c == 0x80 || c == 0xA0)
8e921c4b 4666 goto invalid_code;
57a47f8a
KH
4667 else if (c >= 0xA1 && c <= 0xDF)
4668 {
4669 /* SJIS -> JISX0201-Kana */
4670 c &= 0x7F;
4671 charset = charset_kana;
4672 }
4673 else if (c <= 0xEF)
df7492f9 4674 {
57a47f8a
KH
4675 /* SJIS -> JISX0208 */
4676 ONE_MORE_BYTE (c1);
4677 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
24a73b0a 4678 goto invalid_code;
57a47f8a
KH
4679 c = (c << 8) | c1;
4680 SJIS_TO_JIS (c);
4681 charset = charset_kanji;
4682 }
4683 else if (c <= 0xFC && charset_kanji2)
4684 {
c6876370 4685 /* SJIS -> JISX0213-2 */
57a47f8a
KH
4686 ONE_MORE_BYTE (c1);
4687 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
24a73b0a 4688 goto invalid_code;
57a47f8a
KH
4689 c = (c << 8) | c1;
4690 SJIS_TO_JIS2 (c);
4691 charset = charset_kanji2;
df7492f9 4692 }
57a47f8a
KH
4693 else
4694 goto invalid_code;
24a73b0a
KH
4695 if (charset->id != charset_ascii
4696 && last_id != charset->id)
4697 {
4698 if (last_id != charset_ascii)
69a80ea3 4699 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
24a73b0a
KH
4700 last_id = charset->id;
4701 last_offset = char_offset;
4702 }
4703 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
df7492f9 4704 *charbuf++ = c;
ff0dacd7 4705 char_offset++;
df7492f9 4706 continue;
b73bfc1c 4707
df7492f9
KH
4708 invalid_code:
4709 src = src_base;
4710 consumed_chars = consumed_chars_base;
4711 ONE_MORE_BYTE (c);
065e3595 4712 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
ff0dacd7 4713 char_offset++;
df7492f9
KH
4714 coding->errors++;
4715 }
fa42c37f 4716
df7492f9 4717 no_more_source:
ff0dacd7 4718 if (last_id != charset_ascii)
69a80ea3 4719 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
4720 coding->consumed_char += consumed_chars_base;
4721 coding->consumed = src_base - coding->source;
4722 coding->charbuf_used = charbuf - coding->charbuf;
fa42c37f
KH
4723}
4724
b73bfc1c 4725static void
971de7fb 4726decode_coding_big5 (struct coding_system *coding)
4ed46869 4727{
8f924df7
KH
4728 const unsigned char *src = coding->source + coding->consumed;
4729 const unsigned char *src_end = coding->source + coding->src_bytes;
4730 const unsigned char *src_base;
69a80ea3 4731 int *charbuf = coding->charbuf + coding->charbuf_used;
ad1746f5 4732 /* We may produce one charset annotation in one loop and one more at
df80c7f0 4733 the end. */
69a80ea3 4734 int *charbuf_end
df80c7f0 4735 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
d311d28c 4736 ptrdiff_t consumed_chars = 0, consumed_chars_base;
f10fe38f 4737 bool multibytep = coding->src_multibyte;
df7492f9 4738 struct charset *charset_roman, *charset_big5;
24a73b0a 4739 Lisp_Object attrs, charset_list, val;
d311d28c
PE
4740 ptrdiff_t char_offset = coding->produced_char;
4741 ptrdiff_t last_offset = char_offset;
ff0dacd7 4742 int last_id = charset_ascii;
f10fe38f
PE
4743 bool eol_dos
4744 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 4745 int byte_after_cr = -1;
df7492f9 4746
24a73b0a 4747 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4748 val = charset_list;
4749 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4750 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4ed46869 4751
b73bfc1c 4752 while (1)
4ed46869 4753 {
df7492f9 4754 int c, c1;
24a73b0a 4755 struct charset *charset;
b73bfc1c
KH
4756
4757 src_base = src;
df7492f9
KH
4758 consumed_chars_base = consumed_chars;
4759
4760 if (charbuf >= charbuf_end)
b71f6f73
KH
4761 {
4762 if (byte_after_cr >= 0)
4763 src_base--;
4764 break;
4765 }
df7492f9 4766
119852e7 4767 if (byte_after_cr >= 0)
14daee73 4768 c = byte_after_cr, byte_after_cr = -1;
119852e7
KH
4769 else
4770 ONE_MORE_BYTE (c);
b73bfc1c 4771
065e3595
KH
4772 if (c < 0)
4773 goto invalid_code;
24a73b0a 4774 if (c < 0x80)
119852e7 4775 {
2735d060 4776 if (eol_dos && c == '\r')
119852e7
KH
4777 ONE_MORE_BYTE (byte_after_cr);
4778 charset = charset_roman;
4779 }
24a73b0a 4780 else
4ed46869 4781 {
24a73b0a
KH
4782 /* BIG5 -> Big5 */
4783 if (c < 0xA1 || c > 0xFE)
4784 goto invalid_code;
4785 ONE_MORE_BYTE (c1);
4786 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4787 goto invalid_code;
4788 c = c << 8 | c1;
4789 charset = charset_big5;
4ed46869 4790 }
24a73b0a
KH
4791 if (charset->id != charset_ascii
4792 && last_id != charset->id)
df7492f9 4793 {
24a73b0a 4794 if (last_id != charset_ascii)
69a80ea3 4795 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
24a73b0a
KH
4796 last_id = charset->id;
4797 last_offset = char_offset;
4ed46869 4798 }
24a73b0a 4799 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
df7492f9 4800 *charbuf++ = c;
ff0dacd7 4801 char_offset++;
fb88bf2d
KH
4802 continue;
4803
df7492f9 4804 invalid_code:
4ed46869 4805 src = src_base;
df7492f9
KH
4806 consumed_chars = consumed_chars_base;
4807 ONE_MORE_BYTE (c);
065e3595 4808 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
ff0dacd7 4809 char_offset++;
df7492f9 4810 coding->errors++;
fb88bf2d 4811 }
d46c5b12 4812
df7492f9 4813 no_more_source:
ff0dacd7 4814 if (last_id != charset_ascii)
69a80ea3 4815 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
4816 coding->consumed_char += consumed_chars_base;
4817 coding->consumed = src_base - coding->source;
4818 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
4819}
4820
4821/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
b73bfc1c
KH
4822 This function can encode charsets `ascii', `katakana-jisx0201',
4823 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4824 are sure that all these charsets are registered as official charset
4ed46869 4825 (i.e. do not have extended leading-codes). Characters of other
f10fe38f 4826 charsets are produced without any encoding. */
4ed46869 4827
f10fe38f 4828static bool
971de7fb 4829encode_coding_sjis (struct coding_system *coding)
4ed46869 4830{
f10fe38f 4831 bool multibytep = coding->dst_multibyte;
df7492f9
KH
4832 int *charbuf = coding->charbuf;
4833 int *charbuf_end = charbuf + coding->charbuf_used;
4834 unsigned char *dst = coding->destination + coding->produced;
4835 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4836 int safe_room = 4;
d311d28c 4837 ptrdiff_t produced_chars = 0;
24a73b0a 4838 Lisp_Object attrs, charset_list, val;
f10fe38f 4839 bool ascii_compatible;
66ebf983 4840 struct charset *charset_kanji, *charset_kana;
57a47f8a 4841 struct charset *charset_kanji2;
df7492f9 4842 int c;
a5d301df 4843
24a73b0a 4844 CODING_GET_INFO (coding, attrs, charset_list);
66ebf983 4845 val = XCDR (charset_list);
df7492f9 4846 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
57a47f8a
KH
4847 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4848 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4ed46869 4849
df7492f9 4850 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
93dec019 4851
df7492f9
KH
4852 while (charbuf < charbuf_end)
4853 {
4854 ASSURE_DESTINATION (safe_room);
4855 c = *charbuf++;
b73bfc1c 4856 /* Now encode the character C. */
df7492f9
KH
4857 if (ASCII_CHAR_P (c) && ascii_compatible)
4858 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
4859 else if (CHAR_BYTE8_P (c))
4860 {
4861 c = CHAR_TO_BYTE8 (c);
4862 EMIT_ONE_BYTE (c);
4863 }
df7492f9 4864 else
b73bfc1c 4865 {
df7492f9 4866 unsigned code;
5eb05ea3
KH
4867 struct charset *charset;
4868 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4869 &code, charset);
df7492f9
KH
4870
4871 if (!charset)
4ed46869 4872 {
41cbe562 4873 if (coding->mode & CODING_MODE_SAFE_ENCODING)
b73bfc1c 4874 {
41cbe562
KH
4875 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4876 charset = CHARSET_FROM_ID (charset_ascii);
b73bfc1c 4877 }
41cbe562 4878 else
b73bfc1c 4879 {
41cbe562 4880 c = coding->default_char;
5eb05ea3
KH
4881 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4882 charset_list, &code, charset);
b73bfc1c 4883 }
b73bfc1c 4884 }
df7492f9 4885 if (code == CHARSET_INVALID_CODE (charset))
1088b922 4886 emacs_abort ();
df7492f9
KH
4887 if (charset == charset_kanji)
4888 {
4889 int c1, c2;
4890 JIS_TO_SJIS (code);
4891 c1 = code >> 8, c2 = code & 0xFF;
4892 EMIT_TWO_BYTES (c1, c2);
4893 }
4894 else if (charset == charset_kana)
4895 EMIT_ONE_BYTE (code | 0x80);
57a47f8a
KH
4896 else if (charset_kanji2 && charset == charset_kanji2)
4897 {
4898 int c1, c2;
4899
4900 c1 = code >> 8;
f07190ca
KH
4901 if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4902 || c1 == 0x28
57a47f8a
KH
4903 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4904 {
4905 JIS_TO_SJIS2 (code);
4906 c1 = code >> 8, c2 = code & 0xFF;
4907 EMIT_TWO_BYTES (c1, c2);
4908 }
4909 else
4910 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4911 }
df7492f9
KH
4912 else
4913 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4914 }
4915 }
065e3595 4916 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4917 coding->produced_char += produced_chars;
4918 coding->produced = dst - coding->destination;
4919 return 0;
4920}
4921
f10fe38f 4922static bool
971de7fb 4923encode_coding_big5 (struct coding_system *coding)
df7492f9 4924{
f10fe38f 4925 bool multibytep = coding->dst_multibyte;
df7492f9
KH
4926 int *charbuf = coding->charbuf;
4927 int *charbuf_end = charbuf + coding->charbuf_used;
4928 unsigned char *dst = coding->destination + coding->produced;
4929 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4930 int safe_room = 4;
d311d28c 4931 ptrdiff_t produced_chars = 0;
24a73b0a 4932 Lisp_Object attrs, charset_list, val;
f10fe38f 4933 bool ascii_compatible;
66ebf983 4934 struct charset *charset_big5;
df7492f9
KH
4935 int c;
4936
24a73b0a 4937 CODING_GET_INFO (coding, attrs, charset_list);
66ebf983 4938 val = XCDR (charset_list);
df7492f9
KH
4939 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4940 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4941
4942 while (charbuf < charbuf_end)
4943 {
4944 ASSURE_DESTINATION (safe_room);
4945 c = *charbuf++;
4946 /* Now encode the character C. */
4947 if (ASCII_CHAR_P (c) && ascii_compatible)
4948 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
4949 else if (CHAR_BYTE8_P (c))
4950 {
4951 c = CHAR_TO_BYTE8 (c);
4952 EMIT_ONE_BYTE (c);
b73bfc1c
KH
4953 }
4954 else
4955 {
df7492f9 4956 unsigned code;
5eb05ea3
KH
4957 struct charset *charset;
4958 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4959 &code, charset);
df7492f9
KH
4960
4961 if (! charset)
b73bfc1c 4962 {
41cbe562 4963 if (coding->mode & CODING_MODE_SAFE_ENCODING)
b73bfc1c 4964 {
41cbe562
KH
4965 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4966 charset = CHARSET_FROM_ID (charset_ascii);
b73bfc1c 4967 }
41cbe562 4968 else
0eecad43 4969 {
41cbe562 4970 c = coding->default_char;
5eb05ea3
KH
4971 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4972 charset_list, &code, charset);
0eecad43 4973 }
4ed46869 4974 }
df7492f9 4975 if (code == CHARSET_INVALID_CODE (charset))
1088b922 4976 emacs_abort ();
df7492f9 4977 if (charset == charset_big5)
b73bfc1c 4978 {
df7492f9
KH
4979 int c1, c2;
4980
4981 c1 = code >> 8, c2 = code & 0xFF;
4982 EMIT_TWO_BYTES (c1, c2);
b73bfc1c 4983 }
df7492f9
KH
4984 else
4985 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4ed46869 4986 }
4ed46869 4987 }
065e3595 4988 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4989 coding->produced_char += produced_chars;
4990 coding->produced = dst - coding->destination;
4991 return 0;
4ed46869
KH
4992}
4993
4994\f
df7492f9 4995/*** 9. CCL handlers ***/
1397dc18
KH
4996
4997/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
f10fe38f
PE
4998 Return true if a text is encoded in a coding system of which
4999 encoder/decoder are written in CCL program. */
1397dc18 5000
f10fe38f 5001static bool
cf84bb53
JB
5002detect_coding_ccl (struct coding_system *coding,
5003 struct coding_detection_info *detect_info)
1397dc18 5004{
065e3595 5005 const unsigned char *src = coding->source, *src_base;
8f924df7 5006 const unsigned char *src_end = coding->source + coding->src_bytes;
f10fe38f 5007 bool multibytep = coding->src_multibyte;
d311d28c 5008 ptrdiff_t consumed_chars = 0;
df7492f9 5009 int found = 0;
0e219d54 5010 unsigned char *valids;
d311d28c 5011 ptrdiff_t head_ascii = coding->head_ascii;
df7492f9
KH
5012 Lisp_Object attrs;
5013
ff0dacd7
KH
5014 detect_info->checked |= CATEGORY_MASK_CCL;
5015
df7492f9 5016 coding = &coding_categories[coding_category_ccl];
0e219d54 5017 valids = CODING_CCL_VALIDS (coding);
df7492f9
KH
5018 attrs = CODING_ID_ATTRS (coding->id);
5019 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5020 src += head_ascii;
1397dc18 5021
b73bfc1c 5022 while (1)
1397dc18 5023 {
df7492f9 5024 int c;
065e3595
KH
5025
5026 src_base = src;
df7492f9 5027 ONE_MORE_BYTE (c);
065e3595 5028 if (c < 0 || ! valids[c])
df7492f9 5029 break;
ff0dacd7
KH
5030 if ((valids[c] > 1))
5031 found = CATEGORY_MASK_CCL;
df7492f9 5032 }
ff0dacd7 5033 detect_info->rejected |= CATEGORY_MASK_CCL;
df7492f9
KH
5034 return 0;
5035
5036 no_more_source:
ff0dacd7
KH
5037 detect_info->found |= found;
5038 return 1;
df7492f9
KH
5039}
5040
5041static void
971de7fb 5042decode_coding_ccl (struct coding_system *coding)
df7492f9 5043{
7c78e542 5044 const unsigned char *src = coding->source + coding->consumed;
8f924df7 5045 const unsigned char *src_end = coding->source + coding->src_bytes;
69a80ea3
KH
5046 int *charbuf = coding->charbuf + coding->charbuf_used;
5047 int *charbuf_end = coding->charbuf + coding->charbuf_size;
d311d28c 5048 ptrdiff_t consumed_chars = 0;
f10fe38f 5049 bool multibytep = coding->src_multibyte;
d0396581 5050 struct ccl_program *ccl = &coding->spec.ccl->ccl;
df7492f9 5051 int source_charbuf[1024];
fbdc1721 5052 int source_byteidx[1025];
24a73b0a 5053 Lisp_Object attrs, charset_list;
df7492f9 5054
24a73b0a 5055 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9 5056
d0396581 5057 while (1)
df7492f9 5058 {
7c78e542 5059 const unsigned char *p = src;
95402d5f 5060 ptrdiff_t offset;
df7492f9
KH
5061 int i = 0;
5062
5063 if (multibytep)
fbdc1721
KH
5064 {
5065 while (i < 1024 && p < src_end)
5066 {
5067 source_byteidx[i] = p - src;
5068 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5069 }
5070 source_byteidx[i] = p - src;
5071 }
df7492f9
KH
5072 else
5073 while (i < 1024 && p < src_end)
5074 source_charbuf[i++] = *p++;
8f924df7 5075
df7492f9 5076 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
d0396581 5077 ccl->last_block = 1;
95402d5f
KH
5078 /* As ccl_driver calls DECODE_CHAR, buffer may be relocated. */
5079 charset_map_loaded = 0;
d0396581
KH
5080 ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5081 charset_list);
95402d5f
KH
5082 if (charset_map_loaded
5083 && (offset = coding_change_source (coding)))
5084 {
5085 p += offset;
5086 src += offset;
5087 src_end += offset;
5088 }
d0396581 5089 charbuf += ccl->produced;
fbdc1721 5090 if (multibytep)
d0396581 5091 src += source_byteidx[ccl->consumed];
df7492f9 5092 else
d0396581
KH
5093 src += ccl->consumed;
5094 consumed_chars += ccl->consumed;
5095 if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
df7492f9
KH
5096 break;
5097 }
5098
d0396581 5099 switch (ccl->status)
df7492f9
KH
5100 {
5101 case CCL_STAT_SUSPEND_BY_SRC:
065e3595 5102 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
df7492f9
KH
5103 break;
5104 case CCL_STAT_SUSPEND_BY_DST:
d0396581 5105 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
df7492f9
KH
5106 break;
5107 case CCL_STAT_QUIT:
5108 case CCL_STAT_INVALID_CMD:
065e3595 5109 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
df7492f9
KH
5110 break;
5111 default:
065e3595 5112 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
5113 break;
5114 }
5115 coding->consumed_char += consumed_chars;
5116 coding->consumed = src - coding->source;
5117 coding->charbuf_used = charbuf - coding->charbuf;
5118}
5119
f10fe38f 5120static bool
971de7fb 5121encode_coding_ccl (struct coding_system *coding)
df7492f9 5122{
fb608df3 5123 struct ccl_program *ccl = &coding->spec.ccl->ccl;
f10fe38f 5124 bool multibytep = coding->dst_multibyte;
df7492f9
KH
5125 int *charbuf = coding->charbuf;
5126 int *charbuf_end = charbuf + coding->charbuf_used;
5127 unsigned char *dst = coding->destination + coding->produced;
5128 unsigned char *dst_end = coding->destination + coding->dst_bytes;
df7492f9 5129 int destination_charbuf[1024];
d311d28c 5130 ptrdiff_t produced_chars = 0;
a53e2e89 5131 int i;
24a73b0a 5132 Lisp_Object attrs, charset_list;
df7492f9 5133
24a73b0a 5134 CODING_GET_INFO (coding, attrs, charset_list);
fb608df3
KH
5135 if (coding->consumed_char == coding->src_chars
5136 && coding->mode & CODING_MODE_LAST_BLOCK)
5137 ccl->last_block = 1;
df7492f9 5138
76470ad1 5139 do
df7492f9 5140 {
95402d5f
KH
5141 ptrdiff_t offset;
5142
5143 /* As ccl_driver calls DECODE_CHAR, buffer may be relocated. */
5144 charset_map_loaded = 0;
fb608df3 5145 ccl_driver (ccl, charbuf, destination_charbuf,
8cffd3e7 5146 charbuf_end - charbuf, 1024, charset_list);
95402d5f
KH
5147 if (charset_map_loaded
5148 && (offset = coding_change_destination (coding)))
5149 dst += offset;
df7492f9 5150 if (multibytep)
8cffd3e7 5151 {
fb608df3
KH
5152 ASSURE_DESTINATION (ccl->produced * 2);
5153 for (i = 0; i < ccl->produced; i++)
8cffd3e7
KH
5154 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5155 }
df7492f9
KH
5156 else
5157 {
fb608df3
KH
5158 ASSURE_DESTINATION (ccl->produced);
5159 for (i = 0; i < ccl->produced; i++)
df7492f9 5160 *dst++ = destination_charbuf[i] & 0xFF;
fb608df3 5161 produced_chars += ccl->produced;
df7492f9 5162 }
fb608df3
KH
5163 charbuf += ccl->consumed;
5164 if (ccl->status == CCL_STAT_QUIT
5165 || ccl->status == CCL_STAT_INVALID_CMD)
8cffd3e7 5166 break;
df7492f9 5167 }
76470ad1 5168 while (charbuf < charbuf_end);
df7492f9 5169
fb608df3 5170 switch (ccl->status)
df7492f9
KH
5171 {
5172 case CCL_STAT_SUSPEND_BY_SRC:
065e3595 5173 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
df7492f9
KH
5174 break;
5175 case CCL_STAT_SUSPEND_BY_DST:
065e3595 5176 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
df7492f9
KH
5177 break;
5178 case CCL_STAT_QUIT:
5179 case CCL_STAT_INVALID_CMD:
065e3595 5180 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
df7492f9
KH
5181 break;
5182 default:
065e3595 5183 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 5184 break;
1397dc18 5185 }
df7492f9
KH
5186
5187 coding->produced_char += produced_chars;
5188 coding->produced = dst - coding->destination;
5189 return 0;
1397dc18
KH
5190}
5191
5192\f
df7492f9 5193/*** 10, 11. no-conversion handlers ***/
4ed46869 5194
b73bfc1c 5195/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4ed46869 5196
b73bfc1c 5197static void
971de7fb 5198decode_coding_raw_text (struct coding_system *coding)
4ed46869 5199{
f10fe38f
PE
5200 bool eol_dos
5201 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 5202
df7492f9 5203 coding->chars_at_source = 1;
119852e7
KH
5204 coding->consumed_char = coding->src_chars;
5205 coding->consumed = coding->src_bytes;
2735d060 5206 if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
119852e7
KH
5207 {
5208 coding->consumed_char--;
5209 coding->consumed--;
5210 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5211 }
5212 else
5213 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 5214}
4ed46869 5215
f10fe38f 5216static bool
971de7fb 5217encode_coding_raw_text (struct coding_system *coding)
df7492f9 5218{
f10fe38f 5219 bool multibytep = coding->dst_multibyte;
df7492f9
KH
5220 int *charbuf = coding->charbuf;
5221 int *charbuf_end = coding->charbuf + coding->charbuf_used;
5222 unsigned char *dst = coding->destination + coding->produced;
5223 unsigned char *dst_end = coding->destination + coding->dst_bytes;
d311d28c 5224 ptrdiff_t produced_chars = 0;
b73bfc1c
KH
5225 int c;
5226
df7492f9 5227 if (multibytep)
b73bfc1c 5228 {
df7492f9 5229 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4ed46869 5230
df7492f9
KH
5231 if (coding->src_multibyte)
5232 while (charbuf < charbuf_end)
5233 {
5234 ASSURE_DESTINATION (safe_room);
5235 c = *charbuf++;
5236 if (ASCII_CHAR_P (c))
5237 EMIT_ONE_ASCII_BYTE (c);
5238 else if (CHAR_BYTE8_P (c))
5239 {
5240 c = CHAR_TO_BYTE8 (c);
5241 EMIT_ONE_BYTE (c);
5242 }
5243 else
5244 {
5245 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
93dec019 5246
df7492f9 5247 CHAR_STRING_ADVANCE (c, p1);
8abc3f12 5248 do
9d123124
KH
5249 {
5250 EMIT_ONE_BYTE (*p0);
5251 p0++;
5252 }
8abc3f12 5253 while (p0 < p1);
df7492f9
KH
5254 }
5255 }
b73bfc1c 5256 else
df7492f9
KH
5257 while (charbuf < charbuf_end)
5258 {
5259 ASSURE_DESTINATION (safe_room);
5260 c = *charbuf++;
5261 EMIT_ONE_BYTE (c);
5262 }
5263 }
5264 else
4ed46869 5265 {
df7492f9 5266 if (coding->src_multibyte)
d46c5b12 5267 {
df7492f9
KH
5268 int safe_room = MAX_MULTIBYTE_LENGTH;
5269
5270 while (charbuf < charbuf_end)
d46c5b12 5271 {
df7492f9
KH
5272 ASSURE_DESTINATION (safe_room);
5273 c = *charbuf++;
5274 if (ASCII_CHAR_P (c))
5275 *dst++ = c;
5276 else if (CHAR_BYTE8_P (c))
5277 *dst++ = CHAR_TO_BYTE8 (c);
b73bfc1c 5278 else
df7492f9 5279 CHAR_STRING_ADVANCE (c, dst);
d46c5b12
KH
5280 }
5281 }
df7492f9
KH
5282 else
5283 {
5284 ASSURE_DESTINATION (charbuf_end - charbuf);
5285 while (charbuf < charbuf_end && dst < dst_end)
5286 *dst++ = *charbuf++;
8f924df7 5287 }
319a3947 5288 produced_chars = dst - (coding->destination + coding->produced);
4ed46869 5289 }
065e3595 5290 record_conversion_result (coding, CODING_RESULT_SUCCESS);
a0ed9b27 5291 coding->produced_char += produced_chars;
df7492f9
KH
5292 coding->produced = dst - coding->destination;
5293 return 0;
4ed46869
KH
5294}
5295
ff0dacd7 5296/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
f10fe38f 5297 Return true if a text is encoded in a charset-based coding system. */
ff0dacd7 5298
f10fe38f 5299static bool
cf84bb53
JB
5300detect_coding_charset (struct coding_system *coding,
5301 struct coding_detection_info *detect_info)
1397dc18 5302{
065e3595 5303 const unsigned char *src = coding->source, *src_base;
8f924df7 5304 const unsigned char *src_end = coding->source + coding->src_bytes;
f10fe38f 5305 bool multibytep = coding->src_multibyte;
d311d28c 5306 ptrdiff_t consumed_chars = 0;
07295713 5307 Lisp_Object attrs, valids, name;
584948ac 5308 int found = 0;
d311d28c 5309 ptrdiff_t head_ascii = coding->head_ascii;
f10fe38f 5310 bool check_latin_extra = 0;
1397dc18 5311
ff0dacd7
KH
5312 detect_info->checked |= CATEGORY_MASK_CHARSET;
5313
df7492f9
KH
5314 coding = &coding_categories[coding_category_charset];
5315 attrs = CODING_ID_ATTRS (coding->id);
5316 valids = AREF (attrs, coding_attr_charset_valids);
07295713 5317 name = CODING_ID_NAME (coding->id);
51b59d79 5318 if (strncmp (SSDATA (SYMBOL_NAME (name)),
237aabf4 5319 "iso-8859-", sizeof ("iso-8859-") - 1) == 0
51b59d79 5320 || strncmp (SSDATA (SYMBOL_NAME (name)),
237aabf4 5321 "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
07295713 5322 check_latin_extra = 1;
237aabf4 5323
df7492f9 5324 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
716b3fa0 5325 src += head_ascii;
1397dc18 5326
b73bfc1c 5327 while (1)
1397dc18 5328 {
df7492f9 5329 int c;
716b3fa0
KH
5330 Lisp_Object val;
5331 struct charset *charset;
5332 int dim, idx;
1397dc18 5333
065e3595 5334 src_base = src;
df7492f9 5335 ONE_MORE_BYTE (c);
065e3595
KH
5336 if (c < 0)
5337 continue;
716b3fa0
KH
5338 val = AREF (valids, c);
5339 if (NILP (val))
df7492f9 5340 break;
584948ac 5341 if (c >= 0x80)
07295713
KH
5342 {
5343 if (c < 0xA0
237aabf4
JR
5344 && check_latin_extra
5345 && (!VECTORP (Vlatin_extra_code_table)
28be1ada 5346 || NILP (AREF (Vlatin_extra_code_table, c))))
07295713
KH
5347 break;
5348 found = CATEGORY_MASK_CHARSET;
5349 }
716b3fa0
KH
5350 if (INTEGERP (val))
5351 {
5352 charset = CHARSET_FROM_ID (XFASTINT (val));
5353 dim = CHARSET_DIMENSION (charset);
5354 for (idx = 1; idx < dim; idx++)
5355 {
5356 if (src == src_end)
5357 goto too_short;
5358 ONE_MORE_BYTE (c);
2f9442b8
PE
5359 if (c < charset->code_space[(dim - 1 - idx) * 4]
5360 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
716b3fa0
KH
5361 break;
5362 }
5363 if (idx < dim)
5364 break;
5365 }
5366 else
5367 {
5368 idx = 1;
5369 for (; CONSP (val); val = XCDR (val))
5370 {
5371 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5372 dim = CHARSET_DIMENSION (charset);
5373 while (idx < dim)
5374 {
5375 if (src == src_end)
5376 goto too_short;
5377 ONE_MORE_BYTE (c);
5378 if (c < charset->code_space[(dim - 1 - idx) * 4]
5379 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5380 break;
5381 idx++;
5382 }
5383 if (idx == dim)
5384 {
5385 val = Qnil;
5386 break;
5387 }
5388 }
5389 if (CONSP (val))
5390 break;
5391 }
df7492f9 5392 }
716b3fa0 5393 too_short:
ff0dacd7 5394 detect_info->rejected |= CATEGORY_MASK_CHARSET;
df7492f9 5395 return 0;
4ed46869 5396
df7492f9 5397 no_more_source:
ff0dacd7
KH
5398 detect_info->found |= found;
5399 return 1;
df7492f9 5400}
b73bfc1c 5401
b73bfc1c 5402static void
971de7fb 5403decode_coding_charset (struct coding_system *coding)
4ed46869 5404{
8f924df7
KH
5405 const unsigned char *src = coding->source + coding->consumed;
5406 const unsigned char *src_end = coding->source + coding->src_bytes;
5407 const unsigned char *src_base;
69a80ea3 5408 int *charbuf = coding->charbuf + coding->charbuf_used;
ad1746f5 5409 /* We may produce one charset annotation in one loop and one more at
df80c7f0 5410 the end. */
69a80ea3 5411 int *charbuf_end
df80c7f0 5412 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
d311d28c 5413 ptrdiff_t consumed_chars = 0, consumed_chars_base;
f10fe38f 5414 bool multibytep = coding->src_multibyte;
66ebf983
PE
5415 Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5416 Lisp_Object valids;
d311d28c
PE
5417 ptrdiff_t char_offset = coding->produced_char;
5418 ptrdiff_t last_offset = char_offset;
ff0dacd7 5419 int last_id = charset_ascii;
f10fe38f
PE
5420 bool eol_dos
5421 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 5422 int byte_after_cr = -1;
df7492f9 5423
4eb6d3f1 5424 valids = AREF (attrs, coding_attr_charset_valids);
b73bfc1c 5425
df7492f9 5426 while (1)
4ed46869 5427 {
4eb6d3f1 5428 int c;
24a73b0a
KH
5429 Lisp_Object val;
5430 struct charset *charset;
5431 int dim;
5432 int len = 1;
5433 unsigned code;
df7492f9
KH
5434
5435 src_base = src;
5436 consumed_chars_base = consumed_chars;
b73bfc1c 5437
df7492f9 5438 if (charbuf >= charbuf_end)
b71f6f73
KH
5439 {
5440 if (byte_after_cr >= 0)
5441 src_base--;
5442 break;
5443 }
df7492f9 5444
119852e7
KH
5445 if (byte_after_cr >= 0)
5446 {
5447 c = byte_after_cr;
5448 byte_after_cr = -1;
5449 }
5450 else
5451 {
5452 ONE_MORE_BYTE (c);
2735d060 5453 if (eol_dos && c == '\r')
119852e7
KH
5454 ONE_MORE_BYTE (byte_after_cr);
5455 }
065e3595
KH
5456 if (c < 0)
5457 goto invalid_code;
24a73b0a
KH
5458 code = c;
5459
5460 val = AREF (valids, c);
1b17adfd 5461 if (! INTEGERP (val) && ! CONSP (val))
24a73b0a
KH
5462 goto invalid_code;
5463 if (INTEGERP (val))
d46c5b12 5464 {
24a73b0a
KH
5465 charset = CHARSET_FROM_ID (XFASTINT (val));
5466 dim = CHARSET_DIMENSION (charset);
5467 while (len < dim)
b73bfc1c 5468 {
24a73b0a
KH
5469 ONE_MORE_BYTE (c);
5470 code = (code << 8) | c;
5471 len++;
b73bfc1c 5472 }
24a73b0a
KH
5473 CODING_DECODE_CHAR (coding, src, src_base, src_end,
5474 charset, code, c);
d46c5b12 5475 }
df7492f9 5476 else
d46c5b12 5477 {
24a73b0a
KH
5478 /* VAL is a list of charset IDs. It is assured that the
5479 list is sorted by charset dimensions (smaller one
5480 comes first). */
5481 while (CONSP (val))
4eb6d3f1 5482 {
24a73b0a 5483 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
c7c66a95 5484 dim = CHARSET_DIMENSION (charset);
f9d71dcd 5485 while (len < dim)
4eb6d3f1 5486 {
acb2a965
KH
5487 ONE_MORE_BYTE (c);
5488 code = (code << 8) | c;
f9d71dcd 5489 len++;
4eb6d3f1 5490 }
24a73b0a
KH
5491 CODING_DECODE_CHAR (coding, src, src_base,
5492 src_end, charset, code, c);
5493 if (c >= 0)
5494 break;
5495 val = XCDR (val);
ff0dacd7 5496 }
d46c5b12 5497 }
24a73b0a
KH
5498 if (c < 0)
5499 goto invalid_code;
5500 if (charset->id != charset_ascii
5501 && last_id != charset->id)
5502 {
5503 if (last_id != charset_ascii)
69a80ea3 5504 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
24a73b0a
KH
5505 last_id = charset->id;
5506 last_offset = char_offset;
5507 }
5508
df7492f9 5509 *charbuf++ = c;
ff0dacd7 5510 char_offset++;
df7492f9
KH
5511 continue;
5512
5513 invalid_code:
5514 src = src_base;
5515 consumed_chars = consumed_chars_base;
5516 ONE_MORE_BYTE (c);
065e3595 5517 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 5518 char_offset++;
df7492f9 5519 coding->errors++;
4ed46869
KH
5520 }
5521
df7492f9 5522 no_more_source:
ff0dacd7 5523 if (last_id != charset_ascii)
69a80ea3 5524 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
5525 coding->consumed_char += consumed_chars_base;
5526 coding->consumed = src_base - coding->source;
5527 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
5528}
5529
f10fe38f 5530static bool
971de7fb 5531encode_coding_charset (struct coding_system *coding)
4ed46869 5532{
f10fe38f 5533 bool multibytep = coding->dst_multibyte;
df7492f9
KH
5534 int *charbuf = coding->charbuf;
5535 int *charbuf_end = charbuf + coding->charbuf_used;
5536 unsigned char *dst = coding->destination + coding->produced;
5537 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5538 int safe_room = MAX_MULTIBYTE_LENGTH;
d311d28c 5539 ptrdiff_t produced_chars = 0;
24a73b0a 5540 Lisp_Object attrs, charset_list;
f10fe38f 5541 bool ascii_compatible;
b73bfc1c 5542 int c;
b73bfc1c 5543
24a73b0a 5544 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9 5545 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
fb88bf2d 5546
df7492f9 5547 while (charbuf < charbuf_end)
4ed46869 5548 {
4eb6d3f1 5549 struct charset *charset;
df7492f9 5550 unsigned code;
8f924df7 5551
df7492f9
KH
5552 ASSURE_DESTINATION (safe_room);
5553 c = *charbuf++;
5554 if (ascii_compatible && ASCII_CHAR_P (c))
5555 EMIT_ONE_ASCII_BYTE (c);
16eafb5d 5556 else if (CHAR_BYTE8_P (c))
4ed46869 5557 {
16eafb5d
KH
5558 c = CHAR_TO_BYTE8 (c);
5559 EMIT_ONE_BYTE (c);
d46c5b12 5560 }
d46c5b12 5561 else
b73bfc1c 5562 {
5eb05ea3
KH
5563 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5564 &code, charset);
5565
4eb6d3f1
KH
5566 if (charset)
5567 {
5568 if (CHARSET_DIMENSION (charset) == 1)
5569 EMIT_ONE_BYTE (code);
5570 else if (CHARSET_DIMENSION (charset) == 2)
5571 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5572 else if (CHARSET_DIMENSION (charset) == 3)
5573 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5574 else
5575 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5576 (code >> 8) & 0xFF, code & 0xFF);
5577 }
5578 else
41cbe562
KH
5579 {
5580 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5581 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5582 else
5583 c = coding->default_char;
5584 EMIT_ONE_BYTE (c);
5585 }
4ed46869 5586 }
4ed46869
KH
5587 }
5588
065e3595 5589 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
5590 coding->produced_char += produced_chars;
5591 coding->produced = dst - coding->destination;
5592 return 0;
4ed46869
KH
5593}
5594
5595\f
1397dc18 5596/*** 10. C library functions ***/
4ed46869 5597
df7492f9
KH
5598/* Setup coding context CODING from information about CODING_SYSTEM.
5599 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
5600 CODING_SYSTEM is invalid, signal an error. */
4ed46869 5601
ec6d2bb8 5602void
971de7fb 5603setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
4ed46869 5604{
df7492f9
KH
5605 Lisp_Object attrs;
5606 Lisp_Object eol_type;
5607 Lisp_Object coding_type;
4608c386 5608 Lisp_Object val;
4ed46869 5609
df7492f9 5610 if (NILP (coding_system))
ae6f73fa 5611 coding_system = Qundecided;
c07c8e12 5612
df7492f9 5613 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
1f5dbf34 5614
df7492f9 5615 attrs = CODING_ID_ATTRS (coding->id);
0a9564cb 5616 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
1f5dbf34 5617
df7492f9
KH
5618 coding->mode = 0;
5619 coding->head_ascii = -1;
4a015c45
KH
5620 if (VECTORP (eol_type))
5621 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5622 | CODING_REQUIRE_DETECTION_MASK);
5623 else if (! EQ (eol_type, Qunix))
5624 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5625 | CODING_REQUIRE_ENCODING_MASK);
5626 else
5627 coding->common_flags = 0;
5e5c78be
KH
5628 if (! NILP (CODING_ATTR_POST_READ (attrs)))
5629 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5630 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5631 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
8f924df7
KH
5632 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5633 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
1f5dbf34 5634
df7492f9 5635 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7 5636 coding->max_charset_id = SCHARS (val) - 1;
1b3b981b 5637 coding->safe_charsets = SDATA (val);
df7492f9 5638 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
624bda09 5639 coding->carryover_bytes = 0;
4608c386 5640
df7492f9
KH
5641 coding_type = CODING_ATTR_TYPE (attrs);
5642 if (EQ (coding_type, Qundecided))
d46c5b12 5643 {
df7492f9
KH
5644 coding->detector = NULL;
5645 coding->decoder = decode_coding_raw_text;
5646 coding->encoder = encode_coding_raw_text;
5647 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
d46c5b12 5648 }
df7492f9 5649 else if (EQ (coding_type, Qiso_2022))
d46c5b12 5650 {
df7492f9
KH
5651 int i;
5652 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5653
5654 /* Invoke graphic register 0 to plane 0. */
5655 CODING_ISO_INVOCATION (coding, 0) = 0;
5656 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5657 CODING_ISO_INVOCATION (coding, 1)
5658 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5659 /* Setup the initial status of designation. */
5660 for (i = 0; i < 4; i++)
5661 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5662 /* Not single shifting initially. */
5663 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5664 /* Beginning of buffer should also be regarded as bol. */
5665 CODING_ISO_BOL (coding) = 1;
5666 coding->detector = detect_coding_iso_2022;
5667 coding->decoder = decode_coding_iso_2022;
5668 coding->encoder = encode_coding_iso_2022;
5669 if (flags & CODING_ISO_FLAG_SAFE)
5670 coding->mode |= CODING_MODE_SAFE_ENCODING;
d46c5b12 5671 coding->common_flags
df7492f9
KH
5672 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5673 | CODING_REQUIRE_FLUSHING_MASK);
5674 if (flags & CODING_ISO_FLAG_COMPOSITION)
5675 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
ff0dacd7
KH
5676 if (flags & CODING_ISO_FLAG_DESIGNATION)
5677 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
df7492f9
KH
5678 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5679 {
5680 setup_iso_safe_charsets (attrs);
5681 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7 5682 coding->max_charset_id = SCHARS (val) - 1;
1b3b981b 5683 coding->safe_charsets = SDATA (val);
df7492f9
KH
5684 }
5685 CODING_ISO_FLAGS (coding) = flags;
e951386e
KH
5686 CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5687 CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5688 CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5689 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
d46c5b12 5690 }
df7492f9 5691 else if (EQ (coding_type, Qcharset))
d46c5b12 5692 {
df7492f9
KH
5693 coding->detector = detect_coding_charset;
5694 coding->decoder = decode_coding_charset;
5695 coding->encoder = encode_coding_charset;
d46c5b12 5696 coding->common_flags
df7492f9 5697 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
d46c5b12 5698 }
df7492f9 5699 else if (EQ (coding_type, Qutf_8))
d46c5b12 5700 {
a470d443
KH
5701 val = AREF (attrs, coding_attr_utf_bom);
5702 CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5703 : EQ (val, Qt) ? utf_with_bom
5704 : utf_without_bom);
df7492f9
KH
5705 coding->detector = detect_coding_utf_8;
5706 coding->decoder = decode_coding_utf_8;
5707 coding->encoder = encode_coding_utf_8;
5708 coding->common_flags
5709 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
a470d443
KH
5710 if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5711 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
df7492f9
KH
5712 }
5713 else if (EQ (coding_type, Qutf_16))
5714 {
a470d443
KH
5715 val = AREF (attrs, coding_attr_utf_bom);
5716 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5717 : EQ (val, Qt) ? utf_with_bom
5718 : utf_without_bom);
df7492f9 5719 val = AREF (attrs, coding_attr_utf_16_endian);
b49a1807 5720 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
df7492f9 5721 : utf_16_little_endian);
e19c3639 5722 CODING_UTF_16_SURROGATE (coding) = 0;
df7492f9
KH
5723 coding->detector = detect_coding_utf_16;
5724 coding->decoder = decode_coding_utf_16;
5725 coding->encoder = encode_coding_utf_16;
5726 coding->common_flags
5727 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
a470d443 5728 if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
b49a1807 5729 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
d46c5b12 5730 }
df7492f9 5731 else if (EQ (coding_type, Qccl))
4ed46869 5732 {
df7492f9
KH
5733 coding->detector = detect_coding_ccl;
5734 coding->decoder = decode_coding_ccl;
5735 coding->encoder = encode_coding_ccl;
c952af22 5736 coding->common_flags
df7492f9
KH
5737 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5738 | CODING_REQUIRE_FLUSHING_MASK);
5739 }
5740 else if (EQ (coding_type, Qemacs_mule))
5741 {
5742 coding->detector = detect_coding_emacs_mule;
5743 coding->decoder = decode_coding_emacs_mule;
5744 coding->encoder = encode_coding_emacs_mule;
c952af22 5745 coding->common_flags
df7492f9
KH
5746 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5747 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5748 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5749 {
5750 Lisp_Object tail, safe_charsets;
5751 int max_charset_id = 0;
5752
5753 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5754 tail = XCDR (tail))
5755 if (max_charset_id < XFASTINT (XCAR (tail)))
5756 max_charset_id = XFASTINT (XCAR (tail));
1b3b981b
AS
5757 safe_charsets = make_uninit_string (max_charset_id + 1);
5758 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
df7492f9
KH
5759 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5760 tail = XCDR (tail))
8f924df7 5761 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
df7492f9 5762 coding->max_charset_id = max_charset_id;
1b3b981b 5763 coding->safe_charsets = SDATA (safe_charsets);
df7492f9 5764 }
e951386e
KH
5765 coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5766 coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
df7492f9
KH
5767 }
5768 else if (EQ (coding_type, Qshift_jis))
5769 {
5770 coding->detector = detect_coding_sjis;
5771 coding->decoder = decode_coding_sjis;
5772 coding->encoder = encode_coding_sjis;
c952af22 5773 coding->common_flags
df7492f9
KH
5774 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5775 }
5776 else if (EQ (coding_type, Qbig5))
5777 {
5778 coding->detector = detect_coding_big5;
5779 coding->decoder = decode_coding_big5;
5780 coding->encoder = encode_coding_big5;
c952af22 5781 coding->common_flags
df7492f9
KH
5782 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5783 }
5784 else /* EQ (coding_type, Qraw_text) */
ec6d2bb8 5785 {
df7492f9
KH
5786 coding->detector = NULL;
5787 coding->decoder = decode_coding_raw_text;
5788 coding->encoder = encode_coding_raw_text;
ea29edf2
KH
5789 if (! EQ (eol_type, Qunix))
5790 {
5791 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5792 if (! VECTORP (eol_type))
5793 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5794 }
5795
4ed46869 5796 }
4ed46869 5797
df7492f9 5798 return;
4ed46869
KH
5799}
5800
0ff61e78
KH
5801/* Return a list of charsets supported by CODING. */
5802
5803Lisp_Object
971de7fb 5804coding_charset_list (struct coding_system *coding)
0ff61e78 5805{
35befdaa 5806 Lisp_Object attrs, charset_list;
0ff61e78
KH
5807
5808 CODING_GET_INFO (coding, attrs, charset_list);
5809 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5810 {
5811 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5812
5813 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5814 charset_list = Viso_2022_charset_list;
5815 }
5816 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5817 {
5818 charset_list = Vemacs_mule_charset_list;
5819 }
5820 return charset_list;
5821}
5822
5823
e9f91ece
KH
5824/* Return a list of charsets supported by CODING-SYSTEM. */
5825
5826Lisp_Object
971de7fb 5827coding_system_charset_list (Lisp_Object coding_system)
e9f91ece 5828{
d3411f89 5829 ptrdiff_t id;
e9f91ece
KH
5830 Lisp_Object attrs, charset_list;
5831
5832 CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5833 attrs = CODING_ID_ATTRS (id);
5834
5835 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5836 {
5837 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5838
5839 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5840 charset_list = Viso_2022_charset_list;
5841 else
5842 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5843 }
5844 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5845 {
5846 charset_list = Vemacs_mule_charset_list;
5847 }
5848 else
5849 {
5850 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5851 }
5852 return charset_list;
5853}
5854
5855
df7492f9
KH
5856/* Return raw-text or one of its subsidiaries that has the same
5857 eol_type as CODING-SYSTEM. */
ec6d2bb8 5858
df7492f9 5859Lisp_Object
971de7fb 5860raw_text_coding_system (Lisp_Object coding_system)
ec6d2bb8 5861{
0be8721c 5862 Lisp_Object spec, attrs;
df7492f9 5863 Lisp_Object eol_type, raw_text_eol_type;
ec6d2bb8 5864
d3e4cb56
KH
5865 if (NILP (coding_system))
5866 return Qraw_text;
df7492f9
KH
5867 spec = CODING_SYSTEM_SPEC (coding_system);
5868 attrs = AREF (spec, 0);
ec6d2bb8 5869
df7492f9
KH
5870 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5871 return coding_system;
ec6d2bb8 5872
df7492f9
KH
5873 eol_type = AREF (spec, 2);
5874 if (VECTORP (eol_type))
5875 return Qraw_text;
5876 spec = CODING_SYSTEM_SPEC (Qraw_text);
5877 raw_text_eol_type = AREF (spec, 2);
5878 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5879 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5880 : AREF (raw_text_eol_type, 2));
ec6d2bb8
KH
5881}
5882
54f78171 5883
1911a33b
KH
5884/* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5885 the subsidiary that has the same eol-spec as PARENT (if it is not
5886 nil and specifies end-of-line format) or the system's setting
fcbcfb64 5887 (system_eol_type). */
df7492f9
KH
5888
5889Lisp_Object
971de7fb 5890coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
54f78171 5891{
3e139625 5892 Lisp_Object spec, eol_type;
54f78171 5893
d3e4cb56
KH
5894 if (NILP (coding_system))
5895 coding_system = Qraw_text;
df7492f9 5896 spec = CODING_SYSTEM_SPEC (coding_system);
df7492f9 5897 eol_type = AREF (spec, 2);
fcbcfb64 5898 if (VECTORP (eol_type))
df7492f9 5899 {
df7492f9
KH
5900 Lisp_Object parent_eol_type;
5901
fcbcfb64
KH
5902 if (! NILP (parent))
5903 {
5904 Lisp_Object parent_spec;
5905
4a015c45 5906 parent_spec = CODING_SYSTEM_SPEC (parent);
fcbcfb64 5907 parent_eol_type = AREF (parent_spec, 2);
1911a33b 5908 if (VECTORP (parent_eol_type))
4628bef1 5909 parent_eol_type = system_eol_type;
fcbcfb64
KH
5910 }
5911 else
5912 parent_eol_type = system_eol_type;
df7492f9
KH
5913 if (EQ (parent_eol_type, Qunix))
5914 coding_system = AREF (eol_type, 0);
5915 else if (EQ (parent_eol_type, Qdos))
5916 coding_system = AREF (eol_type, 1);
5917 else if (EQ (parent_eol_type, Qmac))
5918 coding_system = AREF (eol_type, 2);
54f78171 5919 }
df7492f9 5920 return coding_system;
54f78171
KH
5921}
5922
fcaf8878
KH
5923
5924/* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5925 decided for writing to a process. If not, complement them, and
5926 return a new coding system. */
5927
5928Lisp_Object
4628bef1 5929complement_process_encoding_system (Lisp_Object coding_system)
fcaf8878 5930{
5886ec9c
KH
5931 Lisp_Object coding_base = Qnil, eol_base = Qnil;
5932 Lisp_Object spec, attrs;
93d50df8 5933 int i;
fcaf8878 5934
93d50df8 5935 for (i = 0; i < 3; i++)
fcaf8878 5936 {
93d50df8
KH
5937 if (i == 1)
5938 coding_system = CDR_SAFE (Vdefault_process_coding_system);
5939 else if (i == 2)
5940 coding_system = preferred_coding_system ();
5941 spec = CODING_SYSTEM_SPEC (coding_system);
5942 if (NILP (spec))
5943 continue;
5944 attrs = AREF (spec, 0);
5945 if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
5946 coding_base = CODING_ATTR_BASE_NAME (attrs);
5947 if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
5948 eol_base = coding_system;
5949 if (! NILP (coding_base) && ! NILP (eol_base))
5950 break;
fcaf8878 5951 }
fcaf8878 5952
93d50df8
KH
5953 if (i > 0)
5954 /* The original CODING_SYSTEM didn't specify text-conversion or
5955 eol-conversion. Be sure that we return a fully complemented
5956 coding system. */
5957 coding_system = coding_inherit_eol_type (coding_base, eol_base);
5958 return coding_system;
fcaf8878
KH
5959}
5960
5961
4ed46869
KH
5962/* Emacs has a mechanism to automatically detect a coding system if it
5963 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5964 it's impossible to distinguish some coding systems accurately
5965 because they use the same range of codes. So, at first, coding
5966 systems are categorized into the following categories:
5967
0ef69138 5968 o coding-category-emacs-mule
4ed46869
KH
5969
5970 The category for a coding system which has the same code range
5971 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 5972 symbol) `emacs-mule' by default.
4ed46869
KH
5973
5974 o coding-category-sjis
5975
5976 The category for a coding system which has the same code range
5977 as SJIS. Assigned the coding-system (Lisp
7717c392 5978 symbol) `japanese-shift-jis' by default.
4ed46869
KH
5979
5980 o coding-category-iso-7
5981
5982 The category for a coding system which has the same code range
7717c392 5983 as ISO2022 of 7-bit environment. This doesn't use any locking
d46c5b12
KH
5984 shift and single shift functions. This can encode/decode all
5985 charsets. Assigned the coding-system (Lisp symbol)
5986 `iso-2022-7bit' by default.
5987
5988 o coding-category-iso-7-tight
5989
5990 Same as coding-category-iso-7 except that this can
5991 encode/decode only the specified charsets.
4ed46869
KH
5992
5993 o coding-category-iso-8-1
5994
5995 The category for a coding system which has the same code range
5996 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
5997 for DIMENSION1 charset. This doesn't use any locking shift
5998 and single shift functions. Assigned the coding-system (Lisp
5999 symbol) `iso-latin-1' by default.
4ed46869
KH
6000
6001 o coding-category-iso-8-2
6002
6003 The category for a coding system which has the same code range
6004 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
6005 for DIMENSION2 charset. This doesn't use any locking shift
6006 and single shift functions. Assigned the coding-system (Lisp
6007 symbol) `japanese-iso-8bit' by default.
4ed46869 6008
7717c392 6009 o coding-category-iso-7-else
4ed46869
KH
6010
6011 The category for a coding system which has the same code range
ad1746f5 6012 as ISO2022 of 7-bit environment but uses locking shift or
7717c392
KH
6013 single shift functions. Assigned the coding-system (Lisp
6014 symbol) `iso-2022-7bit-lock' by default.
6015
6016 o coding-category-iso-8-else
6017
6018 The category for a coding system which has the same code range
ad1746f5 6019 as ISO2022 of 8-bit environment but uses locking shift or
7717c392
KH
6020 single shift functions. Assigned the coding-system (Lisp
6021 symbol) `iso-2022-8bit-ss2' by default.
4ed46869
KH
6022
6023 o coding-category-big5
6024
6025 The category for a coding system which has the same code range
6026 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 6027 `cn-big5' by default.
4ed46869 6028
fa42c37f
KH
6029 o coding-category-utf-8
6030
6031 The category for a coding system which has the same code range
6e76ae91 6032 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
fa42c37f
KH
6033 symbol) `utf-8' by default.
6034
6035 o coding-category-utf-16-be
6036
6037 The category for a coding system in which a text has a
6038 Unicode signature (cf. Unicode Standard) in the order of BIG
6039 endian at the head. Assigned the coding-system (Lisp symbol)
6040 `utf-16-be' by default.
6041
6042 o coding-category-utf-16-le
6043
6044 The category for a coding system in which a text has a
6045 Unicode signature (cf. Unicode Standard) in the order of
6046 LITTLE endian at the head. Assigned the coding-system (Lisp
6047 symbol) `utf-16-le' by default.
6048
1397dc18
KH
6049 o coding-category-ccl
6050
6051 The category for a coding system of which encoder/decoder is
6052 written in CCL programs. The default value is nil, i.e., no
6053 coding system is assigned.
6054
4ed46869
KH
6055 o coding-category-binary
6056
6057 The category for a coding system not categorized in any of the
6058 above. Assigned the coding-system (Lisp symbol)
e0e989f6 6059 `no-conversion' by default.
4ed46869
KH
6060
6061 Each of them is a Lisp symbol and the value is an actual
df7492f9 6062 `coding-system's (this is also a Lisp symbol) assigned by a user.
4ed46869
KH
6063 What Emacs does actually is to detect a category of coding system.
6064 Then, it uses a `coding-system' assigned to it. If Emacs can't
df7492f9 6065 decide only one possible category, it selects a category of the
4ed46869
KH
6066 highest priority. Priorities of categories are also specified by a
6067 user in a Lisp variable `coding-category-list'.
6068
6069*/
6070
df7492f9
KH
6071#define EOL_SEEN_NONE 0
6072#define EOL_SEEN_LF 1
6073#define EOL_SEEN_CR 2
6074#define EOL_SEEN_CRLF 4
66cfb530 6075
ff0dacd7
KH
6076/* Detect how end-of-line of a text of length SRC_BYTES pointed by
6077 SOURCE is encoded. If CATEGORY is one of
6078 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6079 two-byte, else they are encoded by one-byte.
6080
6081 Return one of EOL_SEEN_XXX. */
4ed46869 6082
bc4bc72a 6083#define MAX_EOL_CHECK_COUNT 3
d46c5b12
KH
6084
6085static int
d311d28c 6086detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
cf84bb53 6087 enum coding_category category)
4ed46869 6088{
f6cbaf43 6089 const unsigned char *src = source, *src_end = src + src_bytes;
4ed46869 6090 unsigned char c;
df7492f9
KH
6091 int total = 0;
6092 int eol_seen = EOL_SEEN_NONE;
4ed46869 6093
89528eb3 6094 if ((1 << category) & CATEGORY_MASK_UTF_16)
4ed46869 6095 {
f10fe38f
PE
6096 bool msb = category == (coding_category_utf_16_le
6097 | coding_category_utf_16_le_nosig);
6098 bool lsb = !msb;
fa42c37f 6099
df7492f9 6100 while (src + 1 < src_end)
fa42c37f 6101 {
df7492f9
KH
6102 c = src[lsb];
6103 if (src[msb] == 0 && (c == '\n' || c == '\r'))
fa42c37f 6104 {
df7492f9
KH
6105 int this_eol;
6106
6107 if (c == '\n')
6108 this_eol = EOL_SEEN_LF;
6109 else if (src + 3 >= src_end
6110 || src[msb + 2] != 0
6111 || src[lsb + 2] != '\n')
6112 this_eol = EOL_SEEN_CR;
fa42c37f 6113 else
75f4f1ac
EZ
6114 {
6115 this_eol = EOL_SEEN_CRLF;
6116 src += 2;
6117 }
df7492f9
KH
6118
6119 if (eol_seen == EOL_SEEN_NONE)
6120 /* This is the first end-of-line. */
6121 eol_seen = this_eol;
6122 else if (eol_seen != this_eol)
fa42c37f 6123 {
75f4f1ac
EZ
6124 /* The found type is different from what found before.
6125 Allow for stray ^M characters in DOS EOL files. */
ef1b0ba7
SM
6126 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6127 || (eol_seen == EOL_SEEN_CRLF
6128 && this_eol == EOL_SEEN_CR))
75f4f1ac
EZ
6129 eol_seen = EOL_SEEN_CRLF;
6130 else
6131 {
6132 eol_seen = EOL_SEEN_LF;
6133 break;
6134 }
fa42c37f 6135 }
df7492f9
KH
6136 if (++total == MAX_EOL_CHECK_COUNT)
6137 break;
fa42c37f 6138 }
df7492f9 6139 src += 2;
fa42c37f 6140 }
bcf26d6a 6141 }
d46c5b12 6142 else
ef1b0ba7
SM
6143 while (src < src_end)
6144 {
6145 c = *src++;
6146 if (c == '\n' || c == '\r')
6147 {
6148 int this_eol;
d46c5b12 6149
ef1b0ba7
SM
6150 if (c == '\n')
6151 this_eol = EOL_SEEN_LF;
6152 else if (src >= src_end || *src != '\n')
6153 this_eol = EOL_SEEN_CR;
6154 else
6155 this_eol = EOL_SEEN_CRLF, src++;
d46c5b12 6156
ef1b0ba7
SM
6157 if (eol_seen == EOL_SEEN_NONE)
6158 /* This is the first end-of-line. */
6159 eol_seen = this_eol;
6160 else if (eol_seen != this_eol)
6161 {
6162 /* The found type is different from what found before.
6163 Allow for stray ^M characters in DOS EOL files. */
6164 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6165 || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6166 eol_seen = EOL_SEEN_CRLF;
6167 else
6168 {
6169 eol_seen = EOL_SEEN_LF;
6170 break;
6171 }
6172 }
6173 if (++total == MAX_EOL_CHECK_COUNT)
6174 break;
6175 }
6176 }
df7492f9 6177 return eol_seen;
73be902c
KH
6178}
6179
df7492f9 6180
24a73b0a 6181static Lisp_Object
971de7fb 6182adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
73be902c 6183{
0be8721c 6184 Lisp_Object eol_type;
8f924df7 6185
df7492f9
KH
6186 eol_type = CODING_ID_EOL_TYPE (coding->id);
6187 if (eol_seen & EOL_SEEN_LF)
24a73b0a
KH
6188 {
6189 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6190 eol_type = Qunix;
6191 }
6f197c07 6192 else if (eol_seen & EOL_SEEN_CRLF)
24a73b0a
KH
6193 {
6194 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6195 eol_type = Qdos;
6196 }
6f197c07 6197 else if (eol_seen & EOL_SEEN_CR)
24a73b0a
KH
6198 {
6199 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6200 eol_type = Qmac;
6201 }
6202 return eol_type;
d46c5b12 6203}
4ed46869 6204
df7492f9
KH
6205/* Detect how a text specified in CODING is encoded. If a coding
6206 system is detected, update fields of CODING by the detected coding
6207 system. */
0a28aafb 6208
74ab6df5 6209static void
971de7fb 6210detect_coding (struct coding_system *coding)
d46c5b12 6211{
8f924df7 6212 const unsigned char *src, *src_end;
f10fe38f 6213 unsigned int saved_mode = coding->mode;
d46c5b12 6214
df7492f9
KH
6215 coding->consumed = coding->consumed_char = 0;
6216 coding->produced = coding->produced_char = 0;
6217 coding_set_source (coding);
1c3478b0 6218
df7492f9 6219 src_end = coding->source + coding->src_bytes;
c0e16b14 6220 coding->head_ascii = 0;
1c3478b0 6221
df7492f9
KH
6222 /* If we have not yet decided the text encoding type, detect it
6223 now. */
6224 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
b73bfc1c 6225 {
df7492f9 6226 int c, i;
6cb21a4f 6227 struct coding_detection_info detect_info;
f10fe38f 6228 bool null_byte_found = 0, eight_bit_found = 0;
df7492f9 6229
6cb21a4f 6230 detect_info.checked = detect_info.found = detect_info.rejected = 0;
2f3cbb32 6231 for (src = coding->source; src < src_end; src++)
d46c5b12 6232 {
df7492f9 6233 c = *src;
6cb21a4f 6234 if (c & 0x80)
6cb21a4f 6235 {
2f3cbb32 6236 eight_bit_found = 1;
2f3cbb32
KH
6237 if (null_byte_found)
6238 break;
6239 }
6240 else if (c < 0x20)
6241 {
6242 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6243 && ! inhibit_iso_escape_detection
6244 && ! detect_info.checked)
6cb21a4f 6245 {
2f3cbb32
KH
6246 if (detect_coding_iso_2022 (coding, &detect_info))
6247 {
6248 /* We have scanned the whole data. */
6249 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
c0e16b14
KH
6250 {
6251 /* We didn't find an 8-bit code. We may
6252 have found a null-byte, but it's very
ce5b453a 6253 rare that a binary file conforms to
c0e16b14
KH
6254 ISO-2022. */
6255 src = src_end;
6256 coding->head_ascii = src - coding->source;
6257 }
6258 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
2f3cbb32
KH
6259 break;
6260 }
6261 }
97b1b294 6262 else if (! c && !inhibit_null_byte_detection)
2f3cbb32
KH
6263 {
6264 null_byte_found = 1;
6265 if (eight_bit_found)
6266 break;
6cb21a4f 6267 }
c006c0c8
KH
6268 if (! eight_bit_found)
6269 coding->head_ascii++;
6cb21a4f 6270 }
c006c0c8 6271 else if (! eight_bit_found)
c0e16b14 6272 coding->head_ascii++;
d46c5b12 6273 }
df7492f9 6274
2f3cbb32
KH
6275 if (null_byte_found || eight_bit_found
6276 || coding->head_ascii < coding->src_bytes
6cb21a4f 6277 || detect_info.found)
d46c5b12 6278 {
ff0dacd7
KH
6279 enum coding_category category;
6280 struct coding_system *this;
df7492f9 6281
6cb21a4f
KH
6282 if (coding->head_ascii == coding->src_bytes)
6283 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
6284 for (i = 0; i < coding_category_raw_text; i++)
6285 {
6286 category = coding_priorities[i];
6287 this = coding_categories + category;
6288 if (detect_info.found & (1 << category))
24a73b0a 6289 break;
6cb21a4f
KH
6290 }
6291 else
2f3cbb32
KH
6292 {
6293 if (null_byte_found)
ff0dacd7 6294 {
2f3cbb32
KH
6295 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6296 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
ff0dacd7 6297 }
2f3cbb32
KH
6298 for (i = 0; i < coding_category_raw_text; i++)
6299 {
6300 category = coding_priorities[i];
6301 this = coding_categories + category;
0ba06a77
KH
6302 /* Some of this->detector (e.g. detect_coding_sjis)
6303 require this information. */
6304 coding->id = this->id;
2f3cbb32
KH
6305 if (this->id < 0)
6306 {
6307 /* No coding system of this category is defined. */
6308 detect_info.rejected |= (1 << category);
6309 }
6310 else if (category >= coding_category_raw_text)
6311 continue;
6312 else if (detect_info.checked & (1 << category))
6313 {
6314 if (detect_info.found & (1 << category))
6315 break;
6316 }
6317 else if ((*(this->detector)) (coding, &detect_info)
6318 && detect_info.found & (1 << category))
6319 {
6320 if (category == coding_category_utf_16_auto)
6321 {
6322 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6323 category = coding_category_utf_16_le;
6324 else
6325 category = coding_category_utf_16_be;
6326 }
6327 break;
6328 }
6329 }
2f3cbb32 6330 }
c0e16b14
KH
6331
6332 if (i < coding_category_raw_text)
6333 setup_coding_system (CODING_ID_NAME (this->id), coding);
6334 else if (null_byte_found)
6335 setup_coding_system (Qno_conversion, coding);
6336 else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6337 == CATEGORY_MASK_ANY)
6338 setup_coding_system (Qraw_text, coding);
6339 else if (detect_info.rejected)
6340 for (i = 0; i < coding_category_raw_text; i++)
6341 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6342 {
6343 this = coding_categories + coding_priorities[i];
6344 setup_coding_system (CODING_ID_NAME (this->id), coding);
6345 break;
6346 }
d46c5b12 6347 }
b73bfc1c 6348 }
a470d443
KH
6349 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6350 == coding_category_utf_8_auto)
6351 {
6352 Lisp_Object coding_systems;
6353 struct coding_detection_info detect_info;
6354
6355 coding_systems
6356 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6357 detect_info.found = detect_info.rejected = 0;
6358 coding->head_ascii = 0;
6359 if (CONSP (coding_systems)
6360 && detect_coding_utf_8 (coding, &detect_info))
6361 {
6362 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6363 setup_coding_system (XCAR (coding_systems), coding);
6364 else
6365 setup_coding_system (XCDR (coding_systems), coding);
6366 }
6367 }
24a73b0a
KH
6368 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6369 == coding_category_utf_16_auto)
b49a1807
KH
6370 {
6371 Lisp_Object coding_systems;
6372 struct coding_detection_info detect_info;
6373
6374 coding_systems
a470d443 6375 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
b49a1807 6376 detect_info.found = detect_info.rejected = 0;
a470d443 6377 coding->head_ascii = 0;
b49a1807 6378 if (CONSP (coding_systems)
24a73b0a 6379 && detect_coding_utf_16 (coding, &detect_info))
b49a1807
KH
6380 {
6381 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6382 setup_coding_system (XCAR (coding_systems), coding);
24a73b0a 6383 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
b49a1807
KH
6384 setup_coding_system (XCDR (coding_systems), coding);
6385 }
6386 }
73cce38d 6387 coding->mode = saved_mode;
4ed46869 6388}
4ed46869 6389
d46c5b12 6390
aaaf0b1e 6391static void
971de7fb 6392decode_eol (struct coding_system *coding)
aaaf0b1e 6393{
24a73b0a
KH
6394 Lisp_Object eol_type;
6395 unsigned char *p, *pbeg, *pend;
3ed051d4 6396
24a73b0a 6397 eol_type = CODING_ID_EOL_TYPE (coding->id);
0a9564cb 6398 if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
24a73b0a
KH
6399 return;
6400
6401 if (NILP (coding->dst_object))
6402 pbeg = coding->destination;
6403 else
6404 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6405 pend = pbeg + coding->produced;
6406
6407 if (VECTORP (eol_type))
aaaf0b1e 6408 {
df7492f9 6409 int eol_seen = EOL_SEEN_NONE;
4ed46869 6410
24a73b0a 6411 for (p = pbeg; p < pend; p++)
aaaf0b1e 6412 {
df7492f9
KH
6413 if (*p == '\n')
6414 eol_seen |= EOL_SEEN_LF;
6415 else if (*p == '\r')
aaaf0b1e 6416 {
df7492f9 6417 if (p + 1 < pend && *(p + 1) == '\n')
aaaf0b1e 6418 {
df7492f9
KH
6419 eol_seen |= EOL_SEEN_CRLF;
6420 p++;
aaaf0b1e 6421 }
aaaf0b1e 6422 else
df7492f9 6423 eol_seen |= EOL_SEEN_CR;
aaaf0b1e 6424 }
aaaf0b1e 6425 }
75f4f1ac
EZ
6426 /* Handle DOS-style EOLs in a file with stray ^M characters. */
6427 if ((eol_seen & EOL_SEEN_CRLF) != 0
6428 && (eol_seen & EOL_SEEN_CR) != 0
6429 && (eol_seen & EOL_SEEN_LF) == 0)
6430 eol_seen = EOL_SEEN_CRLF;
6431 else if (eol_seen != EOL_SEEN_NONE
24a73b0a
KH
6432 && eol_seen != EOL_SEEN_LF
6433 && eol_seen != EOL_SEEN_CRLF
6434 && eol_seen != EOL_SEEN_CR)
6435 eol_seen = EOL_SEEN_LF;
df7492f9 6436 if (eol_seen != EOL_SEEN_NONE)
24a73b0a 6437 eol_type = adjust_coding_eol_type (coding, eol_seen);
aaaf0b1e 6438 }
d46c5b12 6439
24a73b0a 6440 if (EQ (eol_type, Qmac))
27901516 6441 {
24a73b0a 6442 for (p = pbeg; p < pend; p++)
df7492f9
KH
6443 if (*p == '\r')
6444 *p = '\n';
4ed46869 6445 }
24a73b0a 6446 else if (EQ (eol_type, Qdos))
df7492f9 6447 {
d311d28c 6448 ptrdiff_t n = 0;
b73bfc1c 6449
24a73b0a
KH
6450 if (NILP (coding->dst_object))
6451 {
4347441b
KH
6452 /* Start deleting '\r' from the tail to minimize the memory
6453 movement. */
24a73b0a
KH
6454 for (p = pend - 2; p >= pbeg; p--)
6455 if (*p == '\r')
6456 {
72af86bd 6457 memmove (p, p + 1, pend-- - p - 1);
24a73b0a
KH
6458 n++;
6459 }
6460 }
6461 else
6462 {
d311d28c
PE
6463 ptrdiff_t pos_byte = coding->dst_pos_byte;
6464 ptrdiff_t pos = coding->dst_pos;
6465 ptrdiff_t pos_end = pos + coding->produced_char - 1;
4347441b
KH
6466
6467 while (pos < pos_end)
6468 {
6469 p = BYTE_POS_ADDR (pos_byte);
6470 if (*p == '\r' && p[1] == '\n')
6471 {
6472 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6473 n++;
6474 pos_end--;
6475 }
6476 pos++;
69b8522d
KH
6477 if (coding->dst_multibyte)
6478 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6479 else
6480 pos_byte++;
4347441b 6481 }
24a73b0a
KH
6482 }
6483 coding->produced -= n;
6484 coding->produced_char -= n;
aaaf0b1e 6485 }
4ed46869
KH
6486}
6487
7d64c6ad 6488
a6f87d34 6489/* Return a translation table (or list of them) from coding system
f10fe38f
PE
6490 attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6491 not ENCODEP). */
7d64c6ad 6492
e6a54062 6493static Lisp_Object
f10fe38f 6494get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
7d64c6ad
KH
6495{
6496 Lisp_Object standard, translation_table;
09ee6fdd 6497 Lisp_Object val;
7d64c6ad 6498
4bed5909
CY
6499 if (NILP (Venable_character_translation))
6500 {
6501 if (max_lookup)
6502 *max_lookup = 0;
6503 return Qnil;
6504 }
7d64c6ad
KH
6505 if (encodep)
6506 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6507 standard = Vstandard_translation_table_for_encode;
6508 else
6509 translation_table = CODING_ATTR_DECODE_TBL (attrs),
6510 standard = Vstandard_translation_table_for_decode;
7d64c6ad 6511 if (NILP (translation_table))
09ee6fdd
KH
6512 translation_table = standard;
6513 else
a6f87d34 6514 {
09ee6fdd
KH
6515 if (SYMBOLP (translation_table))
6516 translation_table = Fget (translation_table, Qtranslation_table);
6517 else if (CONSP (translation_table))
6518 {
6519 translation_table = Fcopy_sequence (translation_table);
6520 for (val = translation_table; CONSP (val); val = XCDR (val))
6521 if (SYMBOLP (XCAR (val)))
6522 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6523 }
6524 if (CHAR_TABLE_P (standard))
6525 {
6526 if (CONSP (translation_table))
6527 translation_table = nconc2 (translation_table,
6528 Fcons (standard, Qnil));
6529 else
6530 translation_table = Fcons (translation_table,
6531 Fcons (standard, Qnil));
6532 }
a6f87d34 6533 }
2170c8f0
KH
6534
6535 if (max_lookup)
09ee6fdd 6536 {
2170c8f0
KH
6537 *max_lookup = 1;
6538 if (CHAR_TABLE_P (translation_table)
6539 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6540 {
6541 val = XCHAR_TABLE (translation_table)->extras[1];
6542 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6543 *max_lookup = XFASTINT (val);
6544 }
6545 else if (CONSP (translation_table))
6546 {
2735d060 6547 Lisp_Object tail;
09ee6fdd 6548
2170c8f0
KH
6549 for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6550 if (CHAR_TABLE_P (XCAR (tail))
6551 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6552 {
2735d060
PE
6553 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6554 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6555 *max_lookup = XFASTINT (tailval);
2170c8f0
KH
6556 }
6557 }
a6f87d34 6558 }
7d64c6ad
KH
6559 return translation_table;
6560}
6561
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables, and leave the outcome in C and TRANS.

   When a table maps C to a character, C is replaced by that character
   and TRANS is reset to Qnil; with a list, the lookup then continues
   in the following tables using the new C.  When a table yields some
   other non-nil value, that value is left in TRANS and the lookup
   stops.  TRANS is Qnil when TABLE is neither a char-table nor a
   list.

   C and TRANS must be lvalues.  Every argument is evaluated more than
   once.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    trans = Qnil;                                               \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        trans = CHAR_TABLE_REF (table, c);                      \
        if (CHARACTERP (trans))                                 \
          {                                                     \
            c = XFASTINT (trans);                               \
            trans = Qnil;                                       \
          }                                                     \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        Lisp_Object tail;                                       \
                                                                \
        for (tail = table; CONSP (tail); tail = XCDR (tail))    \
          {                                                     \
            if (! CHAR_TABLE_P (XCAR (tail)))                   \
              continue;                                         \
            trans = CHAR_TABLE_REF (XCAR (tail), c);            \
            if (CHARACTERP (trans))                             \
              {                                                 \
                c = XFASTINT (trans);                           \
                trans = Qnil;                                   \
              }                                                 \
            else if (! NILP (trans))                            \
              break;                                            \
          }                                                     \
      }                                                         \
  } while (0)
6586
7d64c6ad 6587
e951386e
KH
6588/* Return a translation of character(s) at BUF according to TRANS.
6589 TRANS is TO-CHAR or ((FROM . TO) ...) where
6590 FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6591 The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6592 translation is found, and Qnil if not found..
6593 If BUF is too short to lookup characters in FROM, return Qt. */
6594
69a80ea3 6595static Lisp_Object
971de7fb 6596get_translation (Lisp_Object trans, int *buf, int *buf_end)
69a80ea3 6597{
e951386e
KH
6598
6599 if (INTEGERP (trans))
6600 return trans;
6601 for (; CONSP (trans); trans = XCDR (trans))
69a80ea3 6602 {
e951386e
KH
6603 Lisp_Object val = XCAR (trans);
6604 Lisp_Object from = XCAR (val);
2c6a9faa
PE
6605 ptrdiff_t len = ASIZE (from);
6606 ptrdiff_t i;
69a80ea3 6607
e951386e 6608 for (i = 0; i < len; i++)
69a80ea3 6609 {
e951386e
KH
6610 if (buf + i == buf_end)
6611 return Qt;
6612 if (XINT (AREF (from, i)) != buf[i])
6613 break;
69a80ea3 6614 }
e951386e
KH
6615 if (i == len)
6616 return val;
69a80ea3 6617 }
e951386e 6618 return Qnil;
69a80ea3
KH
6619}
6620
6621
d46c5b12 6622static int
cf84bb53 6623produce_chars (struct coding_system *coding, Lisp_Object translation_table,
f10fe38f 6624 bool last_block)
4ed46869 6625{
df7492f9
KH
6626 unsigned char *dst = coding->destination + coding->produced;
6627 unsigned char *dst_end = coding->destination + coding->dst_bytes;
d311d28c
PE
6628 ptrdiff_t produced;
6629 ptrdiff_t produced_chars = 0;
69a80ea3 6630 int carryover = 0;
4ed46869 6631
df7492f9 6632 if (! coding->chars_at_source)
4ed46869 6633 {
119852e7 6634 /* Source characters are in coding->charbuf. */
fba4576f
AS
6635 int *buf = coding->charbuf;
6636 int *buf_end = buf + coding->charbuf_used;
4ed46869 6637
db274c7a
KH
6638 if (EQ (coding->src_object, coding->dst_object))
6639 {
6640 coding_set_source (coding);
6641 dst_end = ((unsigned char *) coding->source) + coding->consumed;
6642 }
4ed46869 6643
df7492f9 6644 while (buf < buf_end)
4ed46869 6645 {
27bb1ca4
PE
6646 int c = *buf;
6647 ptrdiff_t i;
bc4bc72a 6648
df7492f9
KH
6649 if (c >= 0)
6650 {
d311d28c 6651 ptrdiff_t from_nchars = 1, to_nchars = 1;
69a80ea3
KH
6652 Lisp_Object trans = Qnil;
6653
09ee6fdd 6654 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
e6a54062 6655 if (! NILP (trans))
69a80ea3 6656 {
e951386e
KH
6657 trans = get_translation (trans, buf, buf_end);
6658 if (INTEGERP (trans))
6659 c = XINT (trans);
6660 else if (CONSP (trans))
6661 {
6662 from_nchars = ASIZE (XCAR (trans));
6663 trans = XCDR (trans);
6664 if (INTEGERP (trans))
6665 c = XINT (trans);
6666 else
6667 {
6668 to_nchars = ASIZE (trans);
6669 c = XINT (AREF (trans, 0));
6670 }
6671 }
6672 else if (EQ (trans, Qt) && ! last_block)
69a80ea3 6673 break;
69a80ea3
KH
6674 }
6675
5d009b3a 6676 if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
69a80ea3 6677 {
5d009b3a
PE
6678 if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
6679 / MAX_MULTIBYTE_LENGTH)
6680 < to_nchars)
6681 memory_full (SIZE_MAX);
69a80ea3
KH
6682 dst = alloc_destination (coding,
6683 buf_end - buf
6684 + MAX_MULTIBYTE_LENGTH * to_nchars,
6685 dst);
db274c7a
KH
6686 if (EQ (coding->src_object, coding->dst_object))
6687 {
6688 coding_set_source (coding);
e951386e
KH
6689 dst_end = (((unsigned char *) coding->source)
6690 + coding->consumed);
db274c7a
KH
6691 }
6692 else
6693 dst_end = coding->destination + coding->dst_bytes;
69a80ea3
KH
6694 }
6695
433f7f87 6696 for (i = 0; i < to_nchars; i++)
69a80ea3 6697 {
433f7f87
KH
6698 if (i > 0)
6699 c = XINT (AREF (trans, i));
69a80ea3
KH
6700 if (coding->dst_multibyte
6701 || ! CHAR_BYTE8_P (c))
db274c7a 6702 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
69a80ea3
KH
6703 else
6704 *dst++ = CHAR_TO_BYTE8 (c);
6705 }
6706 produced_chars += to_nchars;
e951386e 6707 buf += from_nchars;
d46c5b12 6708 }
df7492f9 6709 else
69a80ea3
KH
6710 /* This is an annotation datum. (-C) is the length. */
6711 buf += -c;
4ed46869 6712 }
69a80ea3 6713 carryover = buf_end - buf;
4ed46869 6714 }
fa42c37f 6715 else
fa42c37f 6716 {
119852e7 6717 /* Source characters are at coding->source. */
8f924df7 6718 const unsigned char *src = coding->source;
119852e7 6719 const unsigned char *src_end = src + coding->consumed;
4ed46869 6720
db274c7a
KH
6721 if (EQ (coding->dst_object, coding->src_object))
6722 dst_end = (unsigned char *) src;
df7492f9 6723 if (coding->src_multibyte != coding->dst_multibyte)
fa42c37f 6724 {
df7492f9 6725 if (coding->src_multibyte)
fa42c37f 6726 {
f10fe38f 6727 bool multibytep = 1;
d311d28c 6728 ptrdiff_t consumed_chars = 0;
d46c5b12 6729
df7492f9
KH
6730 while (1)
6731 {
8f924df7 6732 const unsigned char *src_base = src;
df7492f9 6733 int c;
b73bfc1c 6734
df7492f9 6735 ONE_MORE_BYTE (c);
119852e7 6736 if (dst == dst_end)
df7492f9 6737 {
119852e7
KH
6738 if (EQ (coding->src_object, coding->dst_object))
6739 dst_end = (unsigned char *) src;
6740 if (dst == dst_end)
df7492f9 6741 {
d311d28c 6742 ptrdiff_t offset = src - coding->source;
119852e7
KH
6743
6744 dst = alloc_destination (coding, src_end - src + 1,
6745 dst);
6746 dst_end = coding->destination + coding->dst_bytes;
6747 coding_set_source (coding);
6748 src = coding->source + offset;
5c1ca13d 6749 src_end = coding->source + coding->consumed;
db274c7a
KH
6750 if (EQ (coding->src_object, coding->dst_object))
6751 dst_end = (unsigned char *) src;
df7492f9 6752 }
df7492f9
KH
6753 }
6754 *dst++ = c;
6755 produced_chars++;
6756 }
6757 no_more_source:
6758 ;
fa42c37f
KH
6759 }
6760 else
df7492f9
KH
6761 while (src < src_end)
6762 {
f10fe38f 6763 bool multibytep = 1;
df7492f9 6764 int c = *src++;
b73bfc1c 6765
df7492f9
KH
6766 if (dst >= dst_end - 1)
6767 {
2c78b7e1 6768 if (EQ (coding->src_object, coding->dst_object))
8f924df7 6769 dst_end = (unsigned char *) src;
2c78b7e1
KH
6770 if (dst >= dst_end - 1)
6771 {
d311d28c
PE
6772 ptrdiff_t offset = src - coding->source;
6773 ptrdiff_t more_bytes;
119852e7 6774
db274c7a
KH
6775 if (EQ (coding->src_object, coding->dst_object))
6776 more_bytes = ((src_end - src) / 2) + 2;
6777 else
6778 more_bytes = src_end - src + 2;
6779 dst = alloc_destination (coding, more_bytes, dst);
2c78b7e1
KH
6780 dst_end = coding->destination + coding->dst_bytes;
6781 coding_set_source (coding);
119852e7 6782 src = coding->source + offset;
5c1ca13d 6783 src_end = coding->source + coding->consumed;
db274c7a
KH
6784 if (EQ (coding->src_object, coding->dst_object))
6785 dst_end = (unsigned char *) src;
2c78b7e1 6786 }
df7492f9
KH
6787 }
6788 EMIT_ONE_BYTE (c);
6789 }
d46c5b12 6790 }
df7492f9
KH
6791 else
6792 {
6793 if (!EQ (coding->src_object, coding->dst_object))
fa42c37f 6794 {
d311d28c 6795 ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
4ed46869 6796
df7492f9 6797 if (require > 0)
fa42c37f 6798 {
d311d28c 6799 ptrdiff_t offset = src - coding->source;
df7492f9
KH
6800
6801 dst = alloc_destination (coding, require, dst);
6802 coding_set_source (coding);
6803 src = coding->source + offset;
5c1ca13d 6804 src_end = coding->source + coding->consumed;
fa42c37f
KH
6805 }
6806 }
119852e7 6807 produced_chars = coding->consumed_char;
df7492f9 6808 while (src < src_end)
14daee73 6809 *dst++ = *src++;
fa42c37f
KH
6810 }
6811 }
6812
df7492f9 6813 produced = dst - (coding->destination + coding->produced);
284201e4 6814 if (BUFFERP (coding->dst_object) && produced_chars > 0)
df7492f9
KH
6815 insert_from_gap (produced_chars, produced);
6816 coding->produced += produced;
6817 coding->produced_char += produced_chars;
69a80ea3 6818 return carryover;
fa42c37f
KH
6819}
6820
ff0dacd7
KH
6821/* Compose text in CODING->object according to the annotation data at
6822 CHARBUF. CHARBUF is an array:
e951386e 6823 [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
df7492f9 6824 */
4ed46869 6825
b0ab8123 6826static void
d311d28c 6827produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
4ed46869 6828{
df7492f9 6829 int len;
d311d28c 6830 ptrdiff_t to;
df7492f9 6831 enum composition_method method;
df7492f9 6832 Lisp_Object components;
fa42c37f 6833
e951386e 6834 len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
69a80ea3 6835 to = pos + charbuf[2];
e951386e 6836 method = (enum composition_method) (charbuf[4]);
d46c5b12 6837
df7492f9
KH
6838 if (method == COMPOSITION_RELATIVE)
6839 components = Qnil;
e951386e 6840 else
d46c5b12 6841 {
df7492f9 6842 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
e951386e 6843 int i, j;
b73bfc1c 6844
e951386e
KH
6845 if (method == COMPOSITION_WITH_RULE)
6846 len = charbuf[2] * 3 - 2;
6847 charbuf += MAX_ANNOTATION_LENGTH;
6848 /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6849 for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
9ffd559c 6850 {
e951386e
KH
6851 if (charbuf[i] >= 0)
6852 args[j] = make_number (charbuf[i]);
6853 else
6854 {
6855 i++;
6856 args[j] = make_number (charbuf[i] % 0x100);
6857 }
9ffd559c 6858 }
e951386e 6859 components = (i == j ? Fstring (j, args) : Fvector (j, args));
d46c5b12 6860 }
69a80ea3 6861 compose_text (pos, to, components, Qnil, coding->dst_object);
d46c5b12
KH
6862}
6863
d46c5b12 6864
ff0dacd7
KH
6865/* Put `charset' property on text in CODING->object according to
6866 the annotation data at CHARBUF. CHARBUF is an array:
69a80ea3 6867 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
ff0dacd7 6868 */
d46c5b12 6869
b0ab8123 6870static void
d311d28c 6871produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
d46c5b12 6872{
d311d28c 6873 ptrdiff_t from = pos - charbuf[2];
69a80ea3 6874 struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
b73bfc1c 6875
69a80ea3 6876 Fput_text_property (make_number (from), make_number (pos),
ff0dacd7
KH
6877 Qcharset, CHARSET_NAME (charset),
6878 coding->dst_object);
d46c5b12
KH
6879}
6880
d46c5b12 6881
/* Number of `int' elements in the work area used for conversion.  */
#define CHARBUF_SIZE 0x4000

/* Give CODING a work area of CHARBUF_SIZE elements, allocated with
   alloca in the frame of the calling function, and record its size in
   CODING->charbuf_size.

   alloca does not report failure by returning a null pointer, so
   there is nothing to check and no smaller size to fall back on.  */

#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    (coding)->charbuf = alloca (sizeof (int) * CHARBUF_SIZE);           \
    (coding)->charbuf_size = CHARBUF_SIZE;                              \
  } while (0)
4ed46869 6903
d46c5b12
KH
6904
6905static void
d311d28c 6906produce_annotation (struct coding_system *coding, ptrdiff_t pos)
d46c5b12 6907{
df7492f9
KH
6908 int *charbuf = coding->charbuf;
6909 int *charbuf_end = charbuf + coding->charbuf_used;
d46c5b12 6910
ff0dacd7
KH
6911 if (NILP (coding->dst_object))
6912 return;
d46c5b12 6913
df7492f9 6914 while (charbuf < charbuf_end)
a84f1519 6915 {
df7492f9 6916 if (*charbuf >= 0)
e951386e 6917 pos++, charbuf++;
d46c5b12 6918 else
d46c5b12 6919 {
df7492f9 6920 int len = -*charbuf;
e951386e
KH
6921
6922 if (len > 2)
6923 switch (charbuf[1])
6924 {
6925 case CODING_ANNOTATE_COMPOSITION_MASK:
6926 produce_composition (coding, charbuf, pos);
6927 break;
6928 case CODING_ANNOTATE_CHARSET_MASK:
6929 produce_charset (coding, charbuf, pos);
6930 break;
6931 }
df7492f9 6932 charbuf += len;
d46c5b12 6933 }
a84f1519 6934 }
d46c5b12
KH
6935}
6936
df7492f9
KH
6937/* Decode the data at CODING->src_object into CODING->dst_object.
6938 CODING->src_object is a buffer, a string, or nil.
6939 CODING->dst_object is a buffer.
d46c5b12 6940
df7492f9
KH
6941 If CODING->src_object is a buffer, it must be the current buffer.
6942 In this case, if CODING->src_pos is positive, it is a position of
6943 the source text in the buffer, otherwise, the source text is in the
6944 gap area of the buffer, and CODING->src_pos specifies the offset of
6945 the text from GPT (which must be the same as PT). If this is the
6946 same buffer as CODING->dst_object, CODING->src_pos must be
6947 negative.
d46c5b12 6948
b6828792 6949 If CODING->src_object is a string, CODING->src_pos is an index to
df7492f9 6950 that string.
d46c5b12 6951
df7492f9
KH
6952 If CODING->src_object is nil, CODING->source must already point to
6953 the non-relocatable memory area. In this case, CODING->src_pos is
6954 an offset from CODING->source.
73be902c 6955
df7492f9
KH
6956 The decoded data is inserted at the current point of the buffer
6957 CODING->dst_object.
6958*/
d46c5b12 6959
f10fe38f 6960static void
971de7fb 6961decode_coding (struct coding_system *coding)
d46c5b12 6962{
df7492f9 6963 Lisp_Object attrs;
24a73b0a 6964 Lisp_Object undo_list;
7d64c6ad 6965 Lisp_Object translation_table;
d0396581 6966 struct ccl_spec cclspec;
69a80ea3
KH
6967 int carryover;
6968 int i;
d46c5b12 6969
df7492f9
KH
6970 if (BUFFERP (coding->src_object)
6971 && coding->src_pos > 0
6972 && coding->src_pos < GPT
6973 && coding->src_pos + coding->src_chars > GPT)
6974 move_gap_both (coding->src_pos, coding->src_pos_byte);
d46c5b12 6975
24a73b0a 6976 undo_list = Qt;
df7492f9 6977 if (BUFFERP (coding->dst_object))
1c3478b0 6978 {
a3d794a1 6979 set_buffer_internal (XBUFFER (coding->dst_object));
df7492f9
KH
6980 if (GPT != PT)
6981 move_gap_both (PT, PT_BYTE);
f48b82fd
GR
6982
6983 /* We must disable undo_list in order to record the whole insert
6984 transaction via record_insert at the end. But doing so also
6985 disables the recording of the first change to the undo_list.
6986 Therefore we check for first change here and record it via
6987 record_first_change if needed. */
6988 if (MODIFF <= SAVE_MODIFF)
6989 record_first_change ();
6990
4b4deea2 6991 undo_list = BVAR (current_buffer, undo_list);
39eb03f1 6992 bset_undo_list (current_buffer, Qt);
1c3478b0
KH
6993 }
6994
df7492f9
KH
6995 coding->consumed = coding->consumed_char = 0;
6996 coding->produced = coding->produced_char = 0;
6997 coding->chars_at_source = 0;
065e3595 6998 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 6999 coding->errors = 0;
1c3478b0 7000
df7492f9
KH
7001 ALLOC_CONVERSION_WORK_AREA (coding);
7002
7003 attrs = CODING_ID_ATTRS (coding->id);
2170c8f0 7004 translation_table = get_translation_table (attrs, 0, NULL);
df7492f9 7005
69a80ea3 7006 carryover = 0;
d0396581
KH
7007 if (coding->decoder == decode_coding_ccl)
7008 {
7009 coding->spec.ccl = &cclspec;
7010 setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7011 }
df7492f9 7012 do
b73bfc1c 7013 {
d311d28c 7014 ptrdiff_t pos = coding->dst_pos + coding->produced_char;
69a80ea3 7015
df7492f9
KH
7016 coding_set_source (coding);
7017 coding->annotated = 0;
69a80ea3 7018 coding->charbuf_used = carryover;
df7492f9 7019 (*(coding->decoder)) (coding);
df7492f9 7020 coding_set_destination (coding);
69a80ea3 7021 carryover = produce_chars (coding, translation_table, 0);
df7492f9 7022 if (coding->annotated)
69a80ea3
KH
7023 produce_annotation (coding, pos);
7024 for (i = 0; i < carryover; i++)
7025 coding->charbuf[i]
7026 = coding->charbuf[coding->charbuf_used - carryover + i];
d46c5b12 7027 }
d0396581
KH
7028 while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7029 || (coding->consumed < coding->src_bytes
7030 && (coding->result == CODING_RESULT_SUCCESS
7031 || coding->result == CODING_RESULT_INVALID_SRC)));
d46c5b12 7032
69a80ea3
KH
7033 if (carryover > 0)
7034 {
7035 coding_set_destination (coding);
7036 coding->charbuf_used = carryover;
7037 produce_chars (coding, translation_table, 1);
7038 }
7039
df7492f9
KH
7040 coding->carryover_bytes = 0;
7041 if (coding->consumed < coding->src_bytes)
d46c5b12 7042 {
df7492f9 7043 int nbytes = coding->src_bytes - coding->consumed;
8f924df7 7044 const unsigned char *src;
df7492f9
KH
7045
7046 coding_set_source (coding);
7047 coding_set_destination (coding);
7048 src = coding->source + coding->consumed;
7049
7050 if (coding->mode & CODING_MODE_LAST_BLOCK)
1c3478b0 7051 {
df7492f9
KH
7052 /* Flush out unprocessed data as binary chars. We are sure
7053 that the number of data is less than the size of
7054 coding->charbuf. */
065e3595 7055 coding->charbuf_used = 0;
b2dab6c8
JR
7056 coding->chars_at_source = 0;
7057
df7492f9 7058 while (nbytes-- > 0)
1c3478b0 7059 {
df7492f9 7060 int c = *src++;
98725083 7061
1c91457d
KH
7062 if (c & 0x80)
7063 c = BYTE8_TO_CHAR (c);
7064 coding->charbuf[coding->charbuf_used++] = c;
1c3478b0 7065 }
f6cbaf43 7066 produce_chars (coding, Qnil, 1);
d46c5b12 7067 }
d46c5b12 7068 else
df7492f9
KH
7069 {
7070 /* Record unprocessed bytes in coding->carryover. We are
7071 sure that the number of data is less than the size of
7072 coding->carryover. */
7073 unsigned char *p = coding->carryover;
7074
f289d375
KH
7075 if (nbytes > sizeof coding->carryover)
7076 nbytes = sizeof coding->carryover;
df7492f9
KH
7077 coding->carryover_bytes = nbytes;
7078 while (nbytes-- > 0)
7079 *p++ = *src++;
1c3478b0 7080 }
df7492f9 7081 coding->consumed = coding->src_bytes;
b73bfc1c 7082 }
69f76525 7083
0a9564cb
EZ
7084 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7085 && !inhibit_eol_conversion)
4347441b 7086 decode_eol (coding);
24a73b0a
KH
7087 if (BUFFERP (coding->dst_object))
7088 {
39eb03f1 7089 bset_undo_list (current_buffer, undo_list);
24a73b0a
KH
7090 record_insert (coding->dst_pos, coding->produced_char);
7091 }
4ed46869
KH
7092}
7093
aaaf0b1e 7094
e1c23804 7095/* Extract an annotation datum from a composition starting at POS and
ff0dacd7
KH
7096 ending before LIMIT of CODING->src_object (buffer or string), store
7097 the data in BUF, set *STOP to a starting position of the next
7098 composition (if any) or to LIMIT, and return the address of the
7099 next element of BUF.
7100
7101 If such an annotation is not found, set *STOP to a starting
7102 position of a composition after POS (if any) or to LIMIT, and
7103 return BUF. */
7104
b0ab8123 7105static int *
d311d28c 7106handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
cf84bb53 7107 struct coding_system *coding, int *buf,
d311d28c 7108 ptrdiff_t *stop)
aaaf0b1e 7109{
d311d28c 7110 ptrdiff_t start, end;
ff0dacd7 7111 Lisp_Object prop;
aaaf0b1e 7112
ff0dacd7
KH
7113 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7114 || end > limit)
7115 *stop = limit;
7116 else if (start > pos)
7117 *stop = start;
7118 else
aaaf0b1e 7119 {
ff0dacd7 7120 if (start == pos)
aaaf0b1e 7121 {
ff0dacd7
KH
7122 /* We found a composition. Store the corresponding
7123 annotation data in BUF. */
7124 int *head = buf;
7125 enum composition_method method = COMPOSITION_METHOD (prop);
7126 int nchars = COMPOSITION_LENGTH (prop);
7127
e951386e 7128 ADD_COMPOSITION_DATA (buf, nchars, 0, method);
ff0dacd7 7129 if (method != COMPOSITION_RELATIVE)
aaaf0b1e 7130 {
ff0dacd7 7131 Lisp_Object components;
2c6a9faa 7132 ptrdiff_t i, len, i_byte;
ff0dacd7
KH
7133
7134 components = COMPOSITION_COMPONENTS (prop);
7135 if (VECTORP (components))
aaaf0b1e 7136 {
77b37c05 7137 len = ASIZE (components);
ff0dacd7
KH
7138 for (i = 0; i < len; i++)
7139 *buf++ = XINT (AREF (components, i));
aaaf0b1e 7140 }
ff0dacd7 7141 else if (STRINGP (components))
aaaf0b1e 7142 {
8f924df7 7143 len = SCHARS (components);
ff0dacd7
KH
7144 i = i_byte = 0;
7145 while (i < len)
7146 {
7147 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7148 buf++;
7149 }
7150 }
7151 else if (INTEGERP (components))
7152 {
7153 len = 1;
7154 *buf++ = XINT (components);
7155 }
7156 else if (CONSP (components))
7157 {
7158 for (len = 0; CONSP (components);
7159 len++, components = XCDR (components))
7160 *buf++ = XINT (XCAR (components));
aaaf0b1e 7161 }
aaaf0b1e 7162 else
1088b922 7163 emacs_abort ();
ff0dacd7 7164 *head -= len;
aaaf0b1e 7165 }
aaaf0b1e 7166 }
ff0dacd7
KH
7167
7168 if (find_composition (end, limit, &start, &end, &prop,
7169 coding->src_object)
7170 && end <= limit)
7171 *stop = start;
7172 else
7173 *stop = limit;
aaaf0b1e 7174 }
ff0dacd7
KH
7175 return buf;
7176}
7177
7178
e1c23804 7179/* Extract an annotation datum from a text property `charset' at POS of
ff0dacd7
KH
7180 CODING->src_object (buffer of string), store the data in BUF, set
7181 *STOP to the position where the value of `charset' property changes
7182 (limiting by LIMIT), and return the address of the next element of
7183 BUF.
7184
7185 If the property value is nil, set *STOP to the position where the
7186 property value is non-nil (limiting by LIMIT), and return BUF. */
7187
b0ab8123 7188static int *
d311d28c 7189handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
cf84bb53 7190 struct coding_system *coding, int *buf,
d311d28c 7191 ptrdiff_t *stop)
ff0dacd7
KH
7192{
7193 Lisp_Object val, next;
7194 int id;
7195
7196 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7197 if (! NILP (val) && CHARSETP (val))
7198 id = XINT (CHARSET_SYMBOL_ID (val));
7199 else
7200 id = -1;
69a80ea3 7201 ADD_CHARSET_DATA (buf, 0, id);
ff0dacd7
KH
7202 next = Fnext_single_property_change (make_number (pos), Qcharset,
7203 coding->src_object,
7204 make_number (limit));
7205 *stop = XINT (next);
7206 return buf;
7207}
7208
7209
df7492f9 7210static void
cf84bb53
JB
7211consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7212 int max_lookup)
df7492f9
KH
7213{
7214 int *buf = coding->charbuf;
ff0dacd7 7215 int *buf_end = coding->charbuf + coding->charbuf_size;
7c78e542 7216 const unsigned char *src = coding->source + coding->consumed;
4776e638 7217 const unsigned char *src_end = coding->source + coding->src_bytes;
d311d28c
PE
7218 ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7219 ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
f10fe38f 7220 bool multibytep = coding->src_multibyte;
df7492f9
KH
7221 Lisp_Object eol_type;
7222 int c;
d311d28c 7223 ptrdiff_t stop, stop_composition, stop_charset;
09ee6fdd 7224 int *lookup_buf = NULL;
433f7f87
KH
7225
7226 if (! NILP (translation_table))
09ee6fdd 7227 lookup_buf = alloca (sizeof (int) * max_lookup);
88993dfd 7228
0a9564cb 7229 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
df7492f9
KH
7230 if (VECTORP (eol_type))
7231 eol_type = Qunix;
88993dfd 7232
df7492f9
KH
7233 /* Note: composition handling is not yet implemented. */
7234 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
ec6d2bb8 7235
0b5670c9
KH
7236 if (NILP (coding->src_object))
7237 stop = stop_composition = stop_charset = end_pos;
ff0dacd7 7238 else
0b5670c9
KH
7239 {
7240 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7241 stop = stop_composition = pos;
7242 else
7243 stop = stop_composition = end_pos;
7244 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7245 stop = stop_charset = pos;
7246 else
7247 stop_charset = end_pos;
7248 }
ec6d2bb8 7249
24a73b0a 7250 /* Compensate for CRLF and conversion. */
ff0dacd7 7251 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
df7492f9 7252 while (buf < buf_end)
aaaf0b1e 7253 {
433f7f87
KH
7254 Lisp_Object trans;
7255
df7492f9 7256 if (pos == stop)
ec6d2bb8 7257 {
df7492f9
KH
7258 if (pos == end_pos)
7259 break;
ff0dacd7
KH
7260 if (pos == stop_composition)
7261 buf = handle_composition_annotation (pos, end_pos, coding,
7262 buf, &stop_composition);
7263 if (pos == stop_charset)
7264 buf = handle_charset_annotation (pos, end_pos, coding,
7265 buf, &stop_charset);
7266 stop = (stop_composition < stop_charset
7267 ? stop_composition : stop_charset);
df7492f9
KH
7268 }
7269
7270 if (! multibytep)
4776e638 7271 {
d311d28c 7272 int bytes;
aaaf0b1e 7273
4d1e6632
KH
7274 if (coding->encoder == encode_coding_raw_text
7275 || coding->encoder == encode_coding_ccl)
ea29edf2
KH
7276 c = *src++, pos++;
7277 else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
db274c7a 7278 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
4776e638 7279 else
f03caae0 7280 c = BYTE8_TO_CHAR (*src), src++, pos++;
4776e638 7281 }
df7492f9 7282 else
db274c7a 7283 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
df7492f9
KH
7284 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7285 c = '\n';
7286 if (! EQ (eol_type, Qunix))
aaaf0b1e 7287 {
df7492f9 7288 if (c == '\n')
aaaf0b1e 7289 {
df7492f9
KH
7290 if (EQ (eol_type, Qdos))
7291 *buf++ = '\r';
7292 else
7293 c = '\r';
aaaf0b1e
KH
7294 }
7295 }
433f7f87 7296
e6a54062 7297 trans = Qnil;
09ee6fdd 7298 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
e6a54062 7299 if (NILP (trans))
433f7f87
KH
7300 *buf++ = c;
7301 else
7302 {
2c6a9faa 7303 ptrdiff_t from_nchars = 1, to_nchars = 1;
433f7f87
KH
7304 int *lookup_buf_end;
7305 const unsigned char *p = src;
7306 int i;
7307
7308 lookup_buf[0] = c;
7309 for (i = 1; i < max_lookup && p < src_end; i++)
7310 lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7311 lookup_buf_end = lookup_buf + i;
e951386e
KH
7312 trans = get_translation (trans, lookup_buf, lookup_buf_end);
7313 if (INTEGERP (trans))
7314 c = XINT (trans);
7315 else if (CONSP (trans))
7316 {
7317 from_nchars = ASIZE (XCAR (trans));
7318 trans = XCDR (trans);
7319 if (INTEGERP (trans))
7320 c = XINT (trans);
7321 else
7322 {
7323 to_nchars = ASIZE (trans);
2c6a9faa 7324 if (buf_end - buf < to_nchars)
e951386e
KH
7325 break;
7326 c = XINT (AREF (trans, 0));
7327 }
7328 }
7329 else
433f7f87 7330 break;
e951386e 7331 *buf++ = c;
433f7f87
KH
7332 for (i = 1; i < to_nchars; i++)
7333 *buf++ = XINT (AREF (trans, i));
7334 for (i = 1; i < from_nchars; i++, pos++)
7335 src += MULTIBYTE_LENGTH_NO_CHECK (src);
7336 }
aaaf0b1e 7337 }
ec6d2bb8 7338
df7492f9
KH
7339 coding->consumed = src - coding->source;
7340 coding->consumed_char = pos - coding->src_pos;
7341 coding->charbuf_used = buf - coding->charbuf;
7342 coding->chars_at_source = 0;
aaaf0b1e
KH
7343}
7344
4ed46869 7345
df7492f9
KH
7346/* Encode the text at CODING->src_object into CODING->dst_object.
7347 CODING->src_object is a buffer or a string.
7348 CODING->dst_object is a buffer or nil.
7349
7350 If CODING->src_object is a buffer, it must be the current buffer.
7351 In this case, if CODING->src_pos is positive, it is a position of
7352 the source text in the buffer; otherwise, the source text is in the
7353 gap area of the buffer, and coding->src_pos specifies the offset of
7354 the text from GPT (which must be the same as PT). If this is the
7355 same buffer as CODING->dst_object, CODING->src_pos must be
7356 negative and CODING should not have `pre-write-conversion'.
7357
7358 If CODING->src_object is a string, CODING should not have
7359 `pre-write-conversion'.
7360
7361 If CODING->dst_object is a buffer, the encoded data is inserted at
7362 the current point of that buffer.
7363
7364 If CODING->dst_object is nil, the encoded data is placed at the
7365 memory area specified by CODING->destination. */
7366
f10fe38f 7367static void
971de7fb 7368encode_coding (struct coding_system *coding)
4ed46869 7369{
df7492f9 7370 Lisp_Object attrs;
7d64c6ad 7371 Lisp_Object translation_table;
09ee6fdd 7372 int max_lookup;
fb608df3 7373 struct ccl_spec cclspec;
9861e777 7374
df7492f9 7375 attrs = CODING_ID_ATTRS (coding->id);
ea29edf2
KH
7376 if (coding->encoder == encode_coding_raw_text)
7377 translation_table = Qnil, max_lookup = 0;
7378 else
7379 translation_table = get_translation_table (attrs, 1, &max_lookup);
4ed46869 7380
df7492f9 7381 if (BUFFERP (coding->dst_object))
8844fa83 7382 {
df7492f9
KH
7383 set_buffer_internal (XBUFFER (coding->dst_object));
7384 coding->dst_multibyte
4b4deea2 7385 = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
8844fa83 7386 }
4ed46869 7387
b73bfc1c 7388 coding->consumed = coding->consumed_char = 0;
df7492f9 7389 coding->produced = coding->produced_char = 0;
065e3595 7390 record_conversion_result (coding, CODING_RESULT_SUCCESS);
b73bfc1c 7391 coding->errors = 0;
b73bfc1c 7392
df7492f9 7393 ALLOC_CONVERSION_WORK_AREA (coding);
4ed46869 7394
fb608df3
KH
7395 if (coding->encoder == encode_coding_ccl)
7396 {
7397 coding->spec.ccl = &cclspec;
7398 setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7399 }
df7492f9
KH
7400 do {
7401 coding_set_source (coding);
09ee6fdd 7402 consume_chars (coding, translation_table, max_lookup);
df7492f9
KH
7403 coding_set_destination (coding);
7404 (*(coding->encoder)) (coding);
7405 } while (coding->consumed_char < coding->src_chars);
7406
284201e4 7407 if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
df7492f9 7408 insert_from_gap (coding->produced_char, coding->produced);
ec6d2bb8
KH
7409}
7410
fb88bf2d 7411
24a73b0a
KH
7412/* Name (or base name) of work buffer for code conversion. */
7413static Lisp_Object Vcode_conversion_workbuf_name;
d46c5b12 7414
24a73b0a
KH
7415/* A working buffer used by the top level conversion. Once it is
7416 created, it is never destroyed. It has the name
7417 Vcode_conversion_workbuf_name. The other working buffers are
7418 destroyed after the use is finished, and their names are modified
7419 versions of Vcode_conversion_workbuf_name. */
7420static Lisp_Object Vcode_conversion_reused_workbuf;
b73bfc1c 7421
f10fe38f
PE
7422/* True iff Vcode_conversion_reused_workbuf is already in use. */
7423static bool reused_workbuf_in_use;
4ed46869 7424
24a73b0a 7425
ad1746f5 7426/* Return a working buffer for code conversion. MULTIBYTE specifies the
24a73b0a 7427 multibyteness of the returned buffer. */
b73bfc1c 7428
f6cbaf43 7429static Lisp_Object
f10fe38f 7430make_conversion_work_buffer (bool multibyte)
df7492f9 7431{
24a73b0a
KH
7432 Lisp_Object name, workbuf;
7433 struct buffer *current;
4ed46869 7434
f10fe38f 7435 if (reused_workbuf_in_use)
065e3595
KH
7436 {
7437 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7438 workbuf = Fget_buffer_create (name);
7439 }
df7492f9 7440 else
065e3595 7441 {
f10fe38f 7442 reused_workbuf_in_use = 1;
159bd5a2 7443 if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
a993c7a1
KH
7444 Vcode_conversion_reused_workbuf
7445 = Fget_buffer_create (Vcode_conversion_workbuf_name);
7446 workbuf = Vcode_conversion_reused_workbuf;
065e3595 7447 }
24a73b0a
KH
7448 current = current_buffer;
7449 set_buffer_internal (XBUFFER (workbuf));
df36ff1f
CY
7450 /* We can't allow modification hooks to run in the work buffer. For
7451 instance, directory_files_internal assumes that file decoding
7452 doesn't compile new regexps. */
7453 Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
3ed051d4 7454 Ferase_buffer ();
39eb03f1
PE
7455 bset_undo_list (current_buffer, Qt);
7456 bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
df7492f9 7457 set_buffer_internal (current);
24a73b0a 7458 return workbuf;
df7492f9 7459}
d46c5b12 7460
24a73b0a 7461
4776e638 7462static Lisp_Object
971de7fb 7463code_conversion_restore (Lisp_Object arg)
4776e638 7464{
24a73b0a 7465 Lisp_Object current, workbuf;
948bdcf3 7466 struct gcpro gcpro1;
24a73b0a 7467
948bdcf3 7468 GCPRO1 (arg);
24a73b0a
KH
7469 current = XCAR (arg);
7470 workbuf = XCDR (arg);
7471 if (! NILP (workbuf))
7472 {
7473 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7474 reused_workbuf_in_use = 0;
d17337e5 7475 else
24a73b0a
KH
7476 Fkill_buffer (workbuf);
7477 }
7478 set_buffer_internal (XBUFFER (current));
948bdcf3 7479 UNGCPRO;
4776e638
KH
7480 return Qnil;
7481}
b73bfc1c 7482
24a73b0a 7483Lisp_Object
f10fe38f 7484code_conversion_save (bool with_work_buf, bool multibyte)
df7492f9 7485{
24a73b0a 7486 Lisp_Object workbuf = Qnil;
b73bfc1c 7487
4776e638 7488 if (with_work_buf)
24a73b0a
KH
7489 workbuf = make_conversion_work_buffer (multibyte);
7490 record_unwind_protect (code_conversion_restore,
7491 Fcons (Fcurrent_buffer (), workbuf));
4776e638 7492 return workbuf;
df7492f9 7493}
d46c5b12 7494
f10fe38f 7495void
cf84bb53 7496decode_coding_gap (struct coding_system *coding,
d311d28c 7497 ptrdiff_t chars, ptrdiff_t bytes)
df7492f9 7498{
d311d28c 7499 ptrdiff_t count = SPECPDL_INDEX ();
5e5c78be 7500 Lisp_Object attrs;
fb88bf2d 7501
24a73b0a 7502 code_conversion_save (0, 0);
ec6d2bb8 7503
24a73b0a 7504 coding->src_object = Fcurrent_buffer ();
df7492f9
KH
7505 coding->src_chars = chars;
7506 coding->src_bytes = bytes;
7507 coding->src_pos = -chars;
7508 coding->src_pos_byte = -bytes;
7509 coding->src_multibyte = chars < bytes;
24a73b0a 7510 coding->dst_object = coding->src_object;
df7492f9
KH
7511 coding->dst_pos = PT;
7512 coding->dst_pos_byte = PT_BYTE;
4b4deea2 7513 coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
4ed46869 7514
df7492f9
KH
7515 if (CODING_REQUIRE_DETECTION (coding))
7516 detect_coding (coding);
8f924df7 7517
9286b333 7518 coding->mode |= CODING_MODE_LAST_BLOCK;
287c57d7 7519 current_buffer->text->inhibit_shrinking = 1;
df7492f9 7520 decode_coding (coding);
287c57d7 7521 current_buffer->text->inhibit_shrinking = 0;
d46c5b12 7522
5e5c78be
KH
7523 attrs = CODING_ID_ATTRS (coding->id);
7524 if (! NILP (CODING_ATTR_POST_READ (attrs)))
b73bfc1c 7525 {
d311d28c 7526 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
5e5c78be
KH
7527 Lisp_Object val;
7528
7529 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
5e5c78be
KH
7530 val = call1 (CODING_ATTR_POST_READ (attrs),
7531 make_number (coding->produced_char));
5e5c78be
KH
7532 CHECK_NATNUM (val);
7533 coding->produced_char += Z - prev_Z;
7534 coding->produced += Z_BYTE - prev_Z_BYTE;
b73bfc1c 7535 }
4ed46869 7536
df7492f9 7537 unbind_to (count, Qnil);
b73bfc1c 7538}
52d41803 7539
d46c5b12 7540
df7492f9
KH
7541/* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7542 SRC_OBJECT into DST_OBJECT by coding context CODING.
b73bfc1c 7543
df7492f9 7544 SRC_OBJECT is a buffer, a string, or Qnil.
b73bfc1c 7545
df7492f9
KH
7546 If it is a buffer, the text is at point of the buffer. FROM and TO
7547 are positions in the buffer.
b73bfc1c 7548
df7492f9
KH
7549 If it is a string, the text is at the beginning of the string.
7550 FROM and TO are indices to the string.
4ed46869 7551
df7492f9
KH
7552 If it is nil, the text is at coding->source. FROM and TO are
7553 indices to coding->source.
bb10be8b 7554
df7492f9 7555 DST_OBJECT is a buffer, Qt, or Qnil.
4ed46869 7556
df7492f9
KH
7557 If it is a buffer, the decoded text is inserted at point of the
7558 buffer. If the buffer is the same as SRC_OBJECT, the source text
7559 is deleted.
4ed46869 7560
df7492f9
KH
7561 If it is Qt, a string is made from the decoded text, and
7562 set in CODING->dst_object.
d46c5b12 7563
df7492f9 7564 If it is Qnil, the decoded text is stored at CODING->destination.
2cb26057 7565 The caller must allocate CODING->dst_bytes bytes at
df7492f9
KH
7566 CODING->destination by xmalloc. If the decoded text is longer than
7567 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7568 */
d46c5b12 7569
df7492f9 7570void
cf84bb53
JB
7571decode_coding_object (struct coding_system *coding,
7572 Lisp_Object src_object,
d311d28c
PE
7573 ptrdiff_t from, ptrdiff_t from_byte,
7574 ptrdiff_t to, ptrdiff_t to_byte,
cf84bb53 7575 Lisp_Object dst_object)
d46c5b12 7576{
d311d28c 7577 ptrdiff_t count = SPECPDL_INDEX ();
c4a63b12 7578 unsigned char *destination IF_LINT (= NULL);
d311d28c
PE
7579 ptrdiff_t dst_bytes IF_LINT (= 0);
7580 ptrdiff_t chars = to - from;
7581 ptrdiff_t bytes = to_byte - from_byte;
df7492f9 7582 Lisp_Object attrs;
  /* SAVED_PT stays negative unless the source buffer is also the
     destination; it then holds the original point so that it can be
     restored at the end.  */
f10fe38f
PE
7583 ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7584 bool need_marker_adjustment = 0;
b3bfad50 7585 Lisp_Object old_deactivate_mark;
d46c5b12 7586
  /* Remember Vdeactivate_mark; it is put back just before the normal
     return at the bottom of this function.  */
b3bfad50 7587 old_deactivate_mark = Vdeactivate_mark;
93dec019 7588
  /* With a nil DST_OBJECT the target is the caller-supplied memory
     area; keep its address and size for the copy-out step below.  */
df7492f9 7589 if (NILP (dst_object))
d46c5b12 7590 {
df7492f9
KH
7591 destination = coding->destination;
7592 dst_bytes = coding->dst_bytes;
d46c5b12 7593 }
93dec019 7594
df7492f9
KH
7595 coding->src_object = src_object;
7596 coding->src_chars = chars;
7597 coding->src_bytes = bytes;
  /* The source is treated as multibyte when it has more bytes than
     characters.  */
7598 coding->src_multibyte = chars < bytes;
70ad9fc4 7599
df7492f9 7600 if (STRINGP (src_object))
d46c5b12 7601 {
df7492f9
KH
7602 coding->src_pos = from;
7603 coding->src_pos_byte = from_byte;
d46c5b12 7604 }
df7492f9 7605 else if (BUFFERP (src_object))
88993dfd 7606 {
df7492f9
KH
7607 set_buffer_internal (XBUFFER (src_object));
7608 if (from != GPT)
7609 move_gap_both (from, from_byte);
7610 if (EQ (src_object, dst_object))
fb88bf2d 7611 {
64cedb0c
KH
7612 struct Lisp_Marker *tail;
7613
          /* Flag the markers located at FROM (insertion-type ones)
             or at TO (all others); they are repositioned explicitly
             once decoding is finished.  */
7614 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7615 {
7616 tail->need_adjustment
7617 = tail->charpos == (tail->insertion_type ? from : to);
7618 need_marker_adjustment |= tail->need_adjustment;
7619 }
4776e638 7620 saved_pt = PT, saved_pt_byte = PT_BYTE;
df7492f9 7621 TEMP_SET_PT_BOTH (from, from_byte);
f4a3cc44 7622 current_buffer->text->inhibit_shrinking = 1;
          /* Delete the source text from the buffer and describe the
             source by negative offsets instead.
             NOTE(review): presumably the deleted bytes are then read
             from the gap, matching the negative-src_pos convention
             documented for encode_coding -- confirm.  */
df7492f9
KH
7623 del_range_both (from, from_byte, to, to_byte, 1);
7624 coding->src_pos = -chars;
7625 coding->src_pos_byte = -bytes;
fb88bf2d 7626 }
df7492f9 7627 else
fb88bf2d 7628 {
df7492f9
KH
7629 coding->src_pos = from;
7630 coding->src_pos_byte = from_byte;
fb88bf2d 7631 }
88993dfd
KH
7632 }
7633
df7492f9
KH
7634 if (CODING_REQUIRE_DETECTION (coding))
7635 detect_coding (coding);
7636 attrs = CODING_ID_ATTRS (coding->id);
d46c5b12 7637
  /* Decode into a work buffer when a string result is requested (Qt),
     or when the destination is nil but a post-read-conversion
     function has to run on the decoded text.  */
2cb26057
KH
7638 if (EQ (dst_object, Qt)
7639 || (! NILP (CODING_ATTR_POST_READ (attrs))
7640 && NILP (dst_object)))
b73bfc1c 7641 {
a1567c45
SM
7642 coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7643 coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
df7492f9
KH
7644 coding->dst_pos = BEG;
7645 coding->dst_pos_byte = BEG_BYTE;
b73bfc1c 7646 }
df7492f9 7647 else if (BUFFERP (dst_object))
d46c5b12 7648 {
24a73b0a 7649 code_conversion_save (0, 0);
df7492f9
KH
7650 coding->dst_object = dst_object;
7651 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7652 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7653 coding->dst_multibyte
4b4deea2 7654 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
d46c5b12
KH
7655 }
7656 else
7657 {
24a73b0a 7658 code_conversion_save (0, 0);
df7492f9 7659 coding->dst_object = Qnil;
0154725e
SM
7660 /* Most callers presume this will return a multibyte result, and they
7661 won't use `binary' or `raw-text' anyway, so let's not worry about
7662 CODING_FOR_UNIBYTE. */
bb555731 7663 coding->dst_multibyte = 1;
d46c5b12
KH
7664 }
7665
df7492f9 7666 decode_coding (coding);
fa46990e 7667
df7492f9
KH
7668 if (BUFFERP (coding->dst_object))
7669 set_buffer_internal (XBUFFER (coding->dst_object));
d46c5b12 7670
df7492f9 7671 if (! NILP (CODING_ATTR_POST_READ (attrs)))
d46c5b12 7672 {
b3bfad50 7673 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
d311d28c 7674 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
2b4f9037 7675 Lisp_Object val;
d46c5b12 7676
      /* Call the post-read-conversion function with point at the
         start of the decoded text and the number of decoded
         characters as its argument.  Its value must be a natural
         number; the produced counts are then corrected by the change
         in buffer size.  */
c0cc7f7f 7677 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
b3bfad50
KH
7678 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7679 old_deactivate_mark);
d4850d67
KH
7680 val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7681 make_number (coding->produced_char));
df7492f9
KH
7682 UNGCPRO;
7683 CHECK_NATNUM (val);
7684 coding->produced_char += Z - prev_Z;
7685 coding->produced += Z_BYTE - prev_Z_BYTE;
d46c5b12 7686 }
de79a6a5 7687
df7492f9 7688 if (EQ (dst_object, Qt))
ec6d2bb8 7689 {
df7492f9
KH
7690 coding->dst_object = Fbuffer_string ();
7691 }
7692 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7693 {
      /* The text was decoded into a work buffer although the caller
         asked for a memory area.
         NOTE(review): the copy below happens only when the caller's
         area is too small; when DST_BYTES already suffices nothing is
         copied here -- confirm that this is intended.  */
7694 set_buffer_internal (XBUFFER (coding->dst_object));
7695 if (dst_bytes < coding->produced)
7696 {
b3bfad50 7697 destination = xrealloc (destination, coding->produced);
df7492f9
KH
7698 if (! destination)
7699 {
              /* NOTE(review): this early return does not restore
                 Vdeactivate_mark, unlike the normal exit path.  */
065e3595 7700 record_conversion_result (coding,
ebaf11b6 7701 CODING_RESULT_INSUFFICIENT_MEM);
df7492f9
KH
7702 unbind_to (count, Qnil);
7703 return;
7704 }
7705 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7706 move_gap_both (BEGV, BEGV_BYTE);
72af86bd 7707 memcpy (destination, BEGV_ADDR, coding->produced);
df7492f9 7708 coding->destination = destination;
d46c5b12 7709 }
ec6d2bb8 7710 }
b73bfc1c 7711
4776e638
KH
7712 if (saved_pt >= 0)
7713 {
7714 /* This is the case of:
7715 (BUFFERP (src_object) && EQ (src_object, dst_object))
7716 As we have moved PT while replacing the original buffer
7717 contents, we must recover it now. */
7718 set_buffer_internal (XBUFFER (src_object));
f4a3cc44 7719 current_buffer->text->inhibit_shrinking = 0;
      /* Point before the replaced region is kept, point inside it
         goes to FROM, and point after it is shifted by the change in
         size (counted in bytes for a unibyte buffer).  */
4776e638
KH
7720 if (saved_pt < from)
7721 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7722 else if (saved_pt < from + chars)
7723 TEMP_SET_PT_BOTH (from, from_byte);
4b4deea2 7724 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
4776e638
KH
7725 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7726 saved_pt_byte + (coding->produced - bytes));
7727 else
7728 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7729 saved_pt_byte + (coding->produced - bytes));
64cedb0c
KH
7730
7731 if (need_marker_adjustment)
7732 {
7733 struct Lisp_Marker *tail;
7734
          /* Reposition the markers flagged before decoding:
             insertion-type markers go to the start of the new text,
             the others to its end.  */
7735 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7736 if (tail->need_adjustment)
7737 {
7738 tail->need_adjustment = 0;
7739 if (tail->insertion_type)
7740 {
7741 tail->bytepos = from_byte;
7742 tail->charpos = from;
7743 }
7744 else
7745 {
7746 tail->bytepos = from_byte + coding->produced;
7747 tail->charpos
4b4deea2 7748 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
64cedb0c
KH
7749 ? tail->bytepos : from + coding->produced_char);
7750 }
7751 }
7752 }
d46c5b12 7753 }
4776e638 7754
b3bfad50 7755 Vdeactivate_mark = old_deactivate_mark;
065e3595 7756 unbind_to (count, coding->dst_object);
d46c5b12
KH
7757}
7758
d46c5b12 7759
df7492f9 7760void
cf84bb53
JB
7761encode_coding_object (struct coding_system *coding,
7762 Lisp_Object src_object,
d311d28c
PE
7763 ptrdiff_t from, ptrdiff_t from_byte,
7764 ptrdiff_t to, ptrdiff_t to_byte,
cf84bb53 7765 Lisp_Object dst_object)
d46c5b12 7766{
d311d28c
PE
7767 ptrdiff_t count = SPECPDL_INDEX ();
7768 ptrdiff_t chars = to - from;
7769 ptrdiff_t bytes = to_byte - from_byte;
df7492f9 7770 Lisp_Object attrs;
f10fe38f
PE
7771 ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7772 bool need_marker_adjustment = 0;
7773 bool kill_src_buffer = 0;
b3bfad50 7774 Lisp_Object old_deactivate_mark;
df7492f9 7775
b3bfad50 7776 old_deactivate_mark = Vdeactivate_mark;
df7492f9
KH
7777
7778 coding->src_object = src_object;
7779 coding->src_chars = chars;
7780 coding->src_bytes = bytes;
7781 coding->src_multibyte = chars < bytes;
7782
7783 attrs = CODING_ID_ATTRS (coding->id);
7784
64cedb0c
KH
7785 if (EQ (src_object, dst_object))
7786 {
7787 struct Lisp_Marker *tail;
7788
7789 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7790 {
7791 tail->need_adjustment
7792 = tail->charpos == (tail->insertion_type ? from : to);
7793 need_marker_adjustment |= tail->need_adjustment;
7794 }
7795 }
7796
df7492f9 7797 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6bac5b12 7798 {
24a73b0a 7799 coding->src_object = code_conversion_save (1, coding->src_multibyte);
df7492f9
KH
7800 set_buffer_internal (XBUFFER (coding->src_object));
7801 if (STRINGP (src_object))
7802 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7803 else if (BUFFERP (src_object))
7804 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7805 else
b68864e5 7806 insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
d46c5b12 7807
df7492f9
KH
7808 if (EQ (src_object, dst_object))
7809 {
7810 set_buffer_internal (XBUFFER (src_object));
4776e638 7811 saved_pt = PT, saved_pt_byte = PT_BYTE;
df7492f9
KH
7812 del_range_both (from, from_byte, to, to_byte, 1);
7813 set_buffer_internal (XBUFFER (coding->src_object));
7814 }
7815
d4850d67 7816 {
b3bfad50 7817 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
d4850d67 7818
b3bfad50
KH
7819 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7820 old_deactivate_mark);
6cd7a139
DA
7821 safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
7822 make_number (BEG), make_number (Z));
b3bfad50 7823 UNGCPRO;
d4850d67 7824 }
c02d943b
KH
7825 if (XBUFFER (coding->src_object) != current_buffer)
7826 kill_src_buffer = 1;
ac87bbef 7827 coding->src_object = Fcurrent_buffer ();
df7492f9
KH
7828 if (BEG != GPT)
7829 move_gap_both (BEG, BEG_BYTE);
7830 coding->src_chars = Z - BEG;
7831 coding->src_bytes = Z_BYTE - BEG_BYTE;
7832 coding->src_pos = BEG;
7833 coding->src_pos_byte = BEG_BYTE;
7834 coding->src_multibyte = Z < Z_BYTE;
7835 }
7836 else if (STRINGP (src_object))
d46c5b12 7837 {
24a73b0a 7838 code_conversion_save (0, 0);
df7492f9
KH
7839 coding->src_pos = from;
7840 coding->src_pos_byte = from_byte;
b73bfc1c 7841 }
df7492f9 7842 else if (BUFFERP (src_object))
b73bfc1c 7843 {
24a73b0a 7844 code_conversion_save (0, 0);
df7492f9 7845 set_buffer_internal (XBUFFER (src_object));
df7492f9 7846 if (EQ (src_object, dst_object))
d46c5b12 7847 {
4776e638 7848 saved_pt = PT, saved_pt_byte = PT_BYTE;
ff0dacd7
KH
7849 coding->src_object = del_range_1 (from, to, 1, 1);
7850 coding->src_pos = 0;
7851 coding->src_pos_byte = 0;
d46c5b12 7852 }
df7492f9 7853 else
d46c5b12 7854 {
ff0dacd7
KH
7855 if (from < GPT && to >= GPT)
7856 move_gap_both (from, from_byte);
df7492f9
KH
7857 coding->src_pos = from;
7858 coding->src_pos_byte = from_byte;
d46c5b12 7859 }
d46c5b12 7860 }
4776e638 7861 else
24a73b0a 7862 code_conversion_save (0, 0);
d46c5b12 7863
df7492f9 7864 if (BUFFERP (dst_object))
88993dfd 7865 {
df7492f9 7866 coding->dst_object = dst_object;
28f67a95
KH
7867 if (EQ (src_object, dst_object))
7868 {
7869 coding->dst_pos = from;
7870 coding->dst_pos_byte = from_byte;
7871 }
7872 else
7873 {
319a3947
KH
7874 struct buffer *current = current_buffer;
7875
7876 set_buffer_temp (XBUFFER (dst_object));
7877 coding->dst_pos = PT;
7878 coding->dst_pos_byte = PT_BYTE;
7879 move_gap_both (coding->dst_pos, coding->dst_pos_byte);
7880 set_buffer_temp (current);
28f67a95 7881 }
df7492f9 7882 coding->dst_multibyte
4b4deea2 7883 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
88993dfd 7884 }
df7492f9 7885 else if (EQ (dst_object, Qt))
d46c5b12 7886 {
5d009b3a 7887 ptrdiff_t dst_bytes = max (1, coding->src_chars);
df7492f9 7888 coding->dst_object = Qnil;
23f86fce 7889 coding->destination = xmalloc (dst_bytes);
5d009b3a 7890 coding->dst_bytes = dst_bytes;
df7492f9 7891 coding->dst_multibyte = 0;
d46c5b12
KH
7892 }
7893 else
7894 {
df7492f9
KH
7895 coding->dst_object = Qnil;
7896 coding->dst_multibyte = 0;
d46c5b12
KH
7897 }
7898
df7492f9 7899 encode_coding (coding);
d46c5b12 7900
df7492f9 7901 if (EQ (dst_object, Qt))
d46c5b12 7902 {
df7492f9
KH
7903 if (BUFFERP (coding->dst_object))
7904 coding->dst_object = Fbuffer_string ();
7905 else
d46c5b12 7906 {
df7492f9
KH
7907 coding->dst_object
7908 = make_unibyte_string ((char *) coding->destination,
7909 coding->produced);
7910 xfree (coding->destination);
d46c5b12 7911 }
4ed46869 7912 }
d46c5b12 7913
4776e638
KH
7914 if (saved_pt >= 0)
7915 {
7916 /* This is the case of:
7917 (BUFFERP (src_object) && EQ (src_object, dst_object))
7918 As we have moved PT while replacing the original buffer
7919 contents, we must recover it now. */
7920 set_buffer_internal (XBUFFER (src_object));
7921 if (saved_pt < from)
7922 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7923 else if (saved_pt < from + chars)
7924 TEMP_SET_PT_BOTH (from, from_byte);
4b4deea2 7925 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
4776e638
KH
7926 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7927 saved_pt_byte + (coding->produced - bytes));
d46c5b12 7928 else
4776e638
KH
7929 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7930 saved_pt_byte + (coding->produced - bytes));
64cedb0c
KH
7931
7932 if (need_marker_adjustment)
7933 {
7934 struct Lisp_Marker *tail;
7935
7936 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7937 if (tail->need_adjustment)
7938 {
7939 tail->need_adjustment = 0;
7940 if (tail->insertion_type)
7941 {
7942 tail->bytepos = from_byte;
7943 tail->charpos = from;
7944 }
7945 else
7946 {
7947 tail->bytepos = from_byte + coding->produced;
7948 tail->charpos
4b4deea2 7949 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
64cedb0c
KH
7950 ? tail->bytepos : from + coding->produced_char);
7951 }
7952 }
7953 }
4776e638
KH
7954 }
7955
c02d943b
KH
7956 if (kill_src_buffer)
7957 Fkill_buffer (coding->src_object);
b3bfad50
KH
7958
7959 Vdeactivate_mark = old_deactivate_mark;
df7492f9 7960 unbind_to (count, Qnil);
b73bfc1c
KH
7961}
7962
df7492f9 7963
b73bfc1c 7964Lisp_Object
971de7fb 7965preferred_coding_system (void)
b73bfc1c 7966{
df7492f9 7967 int id = coding_categories[coding_priorities[0]].id;
2391eaa4 7968
df7492f9 7969 return CODING_ID_NAME (id);
4ed46869
KH
7970}
7971
7f590b0c 7972#if defined (WINDOWSNT) || defined (CYGWIN)
ba116008
DC
7973
7974Lisp_Object
7975from_unicode (Lisp_Object str)
7976{
7977 CHECK_STRING (str);
7978 if (!STRING_MULTIBYTE (str) &&
7979 SBYTES (str) & 1)
7980 {
7981 str = Fsubstring (str, make_number (0), make_number (-1));
7982 }
7983
7984 return code_convert_string_norecord (str, Qutf_16le, 0);
7985}
7986
7987wchar_t *
7988to_unicode (Lisp_Object str, Lisp_Object *buf)
7989{
7990 *buf = code_convert_string_norecord (str, Qutf_16le, 1);
7991 /* We need to make a another copy (in addition to the one made by
7992 code_convert_string_norecord) to ensure that the final string is
7993 _doubly_ zero terminated --- that is, that the string is
7994 terminated by two zero bytes and one utf-16le null character.
7995 Because strings are already terminated with a single zero byte,
7996 we just add one additional zero. */
7997 str = make_uninit_string (SBYTES (*buf) + 1);
7998 memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
7999 SDATA (str) [SBYTES (*buf)] = '\0';
8000 *buf = str;
8001 return WCSDATA (*buf);
8002}
7f590b0c
DC
8003
8004#endif /* WINDOWSNT || CYGWIN */
ba116008 8005
4ed46869
KH
8006\f
8007#ifdef emacs
1397dc18 8008/*** 11. Emacs Lisp library functions ***/
4ed46869 8009
a7ca3326 8010DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
48b0f3ae 8011 doc: /* Return t if OBJECT is nil or a coding-system.
df7492f9 8012See the documentation of `define-coding-system' for information
48b0f3ae 8013about coding-system objects. */)
5842a27b 8014 (Lisp_Object object)
4ed46869 8015{
d4a1d553
JB
8016 if (NILP (object)
8017 || CODING_SYSTEM_ID (object) >= 0)
44e8490d 8018 return Qt;
d4a1d553
JB
8019 if (! SYMBOLP (object)
8020 || NILP (Fget (object, Qcoding_system_define_form)))
44e8490d
KH
8021 return Qnil;
8022 return Qt;
4ed46869
KH
8023}
8024
a7ca3326 8025DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
9d991de8 8026 Sread_non_nil_coding_system, 1, 1, 0,
48b0f3ae 8027 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
5842a27b 8028 (Lisp_Object prompt)
4ed46869 8029{
e0e989f6 8030 Lisp_Object val;
9d991de8
RS
8031 do
8032 {
4608c386
KH
8033 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8034 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
9d991de8 8035 }
8f924df7 8036 while (SCHARS (val) == 0);
e0e989f6 8037 return (Fintern (val, Qnil));
4ed46869
KH
8038}
8039
a7ca3326 8040DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
48b0f3ae 8041 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
c7183fb8
GM
8042If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8043Ignores case when completing coding systems (all Emacs coding systems
8044are lower-case). */)
5842a27b 8045 (Lisp_Object prompt, Lisp_Object default_coding_system)
4ed46869 8046{
f44d27ce 8047 Lisp_Object val;
d311d28c 8048 ptrdiff_t count = SPECPDL_INDEX ();
c7183fb8 8049
9b787f3e 8050 if (SYMBOLP (default_coding_system))
57d25e6f 8051 default_coding_system = SYMBOL_NAME (default_coding_system);
c7183fb8 8052 specbind (Qcompletion_ignore_case, Qt);
4608c386 8053 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
9b787f3e
RS
8054 Qt, Qnil, Qcoding_system_history,
8055 default_coding_system, Qnil);
c7183fb8 8056 unbind_to (count, Qnil);
8f924df7 8057 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
8058}
8059
a7ca3326 8060DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
4ed46869 8061 1, 1, 0,
48b0f3ae 8062 doc: /* Check validity of CODING-SYSTEM.
9ffd559c
KH
8063If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8064It is valid if it is nil or a symbol defined as a coding system by the
8065function `define-coding-system'. */)
5842a27b 8066 (Lisp_Object coding_system)
4ed46869 8067{
44e8490d
KH
8068 Lisp_Object define_form;
8069
8070 define_form = Fget (coding_system, Qcoding_system_define_form);
8071 if (! NILP (define_form))
8072 {
8073 Fput (coding_system, Qcoding_system_define_form, Qnil);
8074 safe_eval (define_form);
8075 }
4ed46869
KH
8076 if (!NILP (Fcoding_system_p (coding_system)))
8077 return coding_system;
fcad4ec4 8078 xsignal1 (Qcoding_system_error, coding_system);
4ed46869 8079}
df7492f9 8080
3a73fa5d 8081\f
89528eb3 8082/* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
f10fe38f 8083 HIGHEST, return the coding system of the highest
ad1746f5 8084 priority among the detected coding systems. Otherwise return a
89528eb3 8085 list of detected coding systems sorted by their priorities. If
f10fe38f 8086 MULTIBYTEP, it is assumed that the bytes are in correct
89528eb3
KH
8087 multibyte form but contain only ASCII and eight-bit chars.
8088 Otherwise, the bytes are raw bytes.
8089
8090 CODING-SYSTEM controls the detection as below:
8091
8092 If it is nil, detect both text-format and eol-format. If the
8093 text-format part of CODING-SYSTEM is already specified
8094 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
8095 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8096 detect only text-format. */
8097
d46c5b12 8098Lisp_Object
cf84bb53 8099detect_coding_system (const unsigned char *src,
d311d28c 8100 ptrdiff_t src_chars, ptrdiff_t src_bytes,
f10fe38f 8101 bool highest, bool multibytep,
cf84bb53 8102 Lisp_Object coding_system)
4ed46869 8103{
8f924df7 8104 const unsigned char *src_end = src + src_bytes;
df7492f9 8105 Lisp_Object attrs, eol_type;
4533845d 8106 Lisp_Object val = Qnil;
df7492f9 8107 struct coding_system coding;
d3411f89 8108 ptrdiff_t id;
ff0dacd7 8109 struct coding_detection_info detect_info;
24a73b0a 8110 enum coding_category base_category;
f10fe38f 8111 bool null_byte_found = 0, eight_bit_found = 0;
b73bfc1c 8112
df7492f9
KH
8113 if (NILP (coding_system))
8114 coding_system = Qundecided;
8115 setup_coding_system (coding_system, &coding);
8116 attrs = CODING_ID_ATTRS (coding.id);
8117 eol_type = CODING_ID_EOL_TYPE (coding.id);
89528eb3 8118 coding_system = CODING_ATTR_BASE_NAME (attrs);
4ed46869 8119
df7492f9 8120 coding.source = src;
24a73b0a 8121 coding.src_chars = src_chars;
df7492f9
KH
8122 coding.src_bytes = src_bytes;
8123 coding.src_multibyte = multibytep;
8124 coding.consumed = 0;
89528eb3 8125 coding.mode |= CODING_MODE_LAST_BLOCK;
c0e16b14 8126 coding.head_ascii = 0;
d46c5b12 8127
ff0dacd7 8128 detect_info.checked = detect_info.found = detect_info.rejected = 0;
d46c5b12 8129
89528eb3 8130 /* At first, detect text-format if necessary. */
24a73b0a
KH
8131 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8132 if (base_category == coding_category_undecided)
4ed46869 8133 {
c4a63b12
PE
8134 enum coding_category category IF_LINT (= 0);
8135 struct coding_system *this IF_LINT (= NULL);
ff0dacd7 8136 int c, i;
88993dfd 8137
24a73b0a 8138 /* Skip all ASCII bytes except for a few ISO2022 controls. */
2f3cbb32 8139 for (; src < src_end; src++)
4ed46869 8140 {
df7492f9 8141 c = *src;
6cb21a4f 8142 if (c & 0x80)
6cb21a4f 8143 {
2f3cbb32 8144 eight_bit_found = 1;
2f3cbb32
KH
8145 if (null_byte_found)
8146 break;
8147 }
c0e16b14 8148 else if (c < 0x20)
2f3cbb32
KH
8149 {
8150 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8151 && ! inhibit_iso_escape_detection
8152 && ! detect_info.checked)
6cb21a4f 8153 {
2f3cbb32
KH
8154 if (detect_coding_iso_2022 (&coding, &detect_info))
8155 {
8156 /* We have scanned the whole data. */
8157 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
c0e16b14
KH
8158 {
8159 /* We didn't find an 8-bit code. We may
8160 have found a null-byte, but it's very
8161 rare that a binary file conforms to
8162 ISO-2022. */
8163 src = src_end;
8164 coding.head_ascii = src - coding.source;
8165 }
8166 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
2f3cbb32
KH
8167 break;
8168 }
8169 }
97b1b294 8170 else if (! c && !inhibit_null_byte_detection)
2f3cbb32
KH
8171 {
8172 null_byte_found = 1;
8173 if (eight_bit_found)
8174 break;
6cb21a4f 8175 }
c006c0c8
KH
8176 if (! eight_bit_found)
8177 coding.head_ascii++;
6cb21a4f 8178 }
c006c0c8 8179 else if (! eight_bit_found)
c0e16b14 8180 coding.head_ascii++;
4ed46869 8181 }
88993dfd 8182
2f3cbb32
KH
8183 if (null_byte_found || eight_bit_found
8184 || coding.head_ascii < coding.src_bytes
6cb21a4f
KH
8185 || detect_info.found)
8186 {
2f3cbb32 8187 if (coding.head_ascii == coding.src_bytes)
6cb21a4f
KH
8188 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
8189 for (i = 0; i < coding_category_raw_text; i++)
ff0dacd7 8190 {
6cb21a4f 8191 category = coding_priorities[i];
c7266f4a 8192 this = coding_categories + category;
6cb21a4f 8193 if (detect_info.found & (1 << category))
ff0dacd7
KH
8194 break;
8195 }
6cb21a4f 8196 else
2f3cbb32
KH
8197 {
8198 if (null_byte_found)
8199 {
8200 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8201 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8202 }
8203 for (i = 0; i < coding_category_raw_text; i++)
8204 {
8205 category = coding_priorities[i];
8206 this = coding_categories + category;
6cb21a4f 8207
2f3cbb32
KH
8208 if (this->id < 0)
8209 {
8210 /* No coding system of this category is defined. */
8211 detect_info.rejected |= (1 << category);
8212 }
8213 else if (category >= coding_category_raw_text)
8214 continue;
8215 else if (detect_info.checked & (1 << category))
8216 {
8217 if (highest
8218 && (detect_info.found & (1 << category)))
6cb21a4f 8219 break;
2f3cbb32
KH
8220 }
8221 else if ((*(this->detector)) (&coding, &detect_info)
8222 && highest
8223 && (detect_info.found & (1 << category)))
8224 {
8225 if (category == coding_category_utf_16_auto)
8226 {
8227 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8228 category = coding_category_utf_16_le;
8229 else
8230 category = coding_category_utf_16_be;
8231 }
8232 break;
8233 }
8234 }
8235 }
6cb21a4f 8236 }
ec6d2bb8 8237
4cddb209
KH
8238 if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8239 || null_byte_found)
ec6d2bb8 8240 {
ff0dacd7 8241 detect_info.found = CATEGORY_MASK_RAW_TEXT;
4cddb209 8242 id = CODING_SYSTEM_ID (Qno_conversion);
89528eb3
KH
8243 val = Fcons (make_number (id), Qnil);
8244 }
ff0dacd7 8245 else if (! detect_info.rejected && ! detect_info.found)
89528eb3 8246 {
ff0dacd7 8247 detect_info.found = CATEGORY_MASK_ANY;
89528eb3
KH
8248 id = coding_categories[coding_category_undecided].id;
8249 val = Fcons (make_number (id), Qnil);
8250 }
8251 else if (highest)
8252 {
ff0dacd7 8253 if (detect_info.found)
ec6d2bb8 8254 {
ff0dacd7
KH
8255 detect_info.found = 1 << category;
8256 val = Fcons (make_number (this->id), Qnil);
8257 }
8258 else
8259 for (i = 0; i < coding_category_raw_text; i++)
8260 if (! (detect_info.rejected & (1 << coding_priorities[i])))
8261 {
8262 detect_info.found = 1 << coding_priorities[i];
8263 id = coding_categories[coding_priorities[i]].id;
8264 val = Fcons (make_number (id), Qnil);
8265 break;
8266 }
8267 }
89528eb3
KH
8268 else
8269 {
ff0dacd7
KH
8270 int mask = detect_info.rejected | detect_info.found;
8271 int found = 0;
ec6d2bb8 8272
89528eb3 8273 for (i = coding_category_raw_text - 1; i >= 0; i--)
ff0dacd7
KH
8274 {
8275 category = coding_priorities[i];
8276 if (! (mask & (1 << category)))
ec6d2bb8 8277 {
ff0dacd7
KH
8278 found |= 1 << category;
8279 id = coding_categories[category].id;
c7266f4a
KH
8280 if (id >= 0)
8281 val = Fcons (make_number (id), val);
ff0dacd7
KH
8282 }
8283 }
8284 for (i = coding_category_raw_text - 1; i >= 0; i--)
8285 {
8286 category = coding_priorities[i];
8287 if (detect_info.found & (1 << category))
8288 {
8289 id = coding_categories[category].id;
8290 val = Fcons (make_number (id), val);
ec6d2bb8 8291 }
ec6d2bb8 8292 }
ff0dacd7 8293 detect_info.found |= found;
ec6d2bb8 8294 }
ec6d2bb8 8295 }
a470d443
KH
8296 else if (base_category == coding_category_utf_8_auto)
8297 {
8298 if (detect_coding_utf_8 (&coding, &detect_info))
8299 {
8300 struct coding_system *this;
8301
8302 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8303 this = coding_categories + coding_category_utf_8_sig;
8304 else
8305 this = coding_categories + coding_category_utf_8_nosig;
8306 val = Fcons (make_number (this->id), Qnil);
8307 }
8308 }
24a73b0a
KH
8309 else if (base_category == coding_category_utf_16_auto)
8310 {
8311 if (detect_coding_utf_16 (&coding, &detect_info))
8312 {
24a73b0a
KH
8313 struct coding_system *this;
8314
8315 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8316 this = coding_categories + coding_category_utf_16_le;
8317 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8318 this = coding_categories + coding_category_utf_16_be;
8319 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8320 this = coding_categories + coding_category_utf_16_be_nosig;
8321 else
8322 this = coding_categories + coding_category_utf_16_le_nosig;
8323 val = Fcons (make_number (this->id), Qnil);
8324 }
8325 }
df7492f9
KH
8326 else
8327 {
ff0dacd7 8328 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
89528eb3 8329 val = Fcons (make_number (coding.id), Qnil);
4ed46869 8330 }
df7492f9 8331
89528eb3 8332 /* Then, detect eol-format if necessary. */
df7492f9 8333 {
4533845d 8334 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
df7492f9
KH
8335 Lisp_Object tail;
8336
89528eb3
KH
8337 if (VECTORP (eol_type))
8338 {
ff0dacd7 8339 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
2f3cbb32
KH
8340 {
8341 if (null_byte_found)
8342 normal_eol = EOL_SEEN_LF;
8343 else
8344 normal_eol = detect_eol (coding.source, src_bytes,
8345 coding_category_raw_text);
8346 }
ff0dacd7
KH
8347 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8348 | CATEGORY_MASK_UTF_16_BE_NOSIG))
89528eb3
KH
8349 utf_16_be_eol = detect_eol (coding.source, src_bytes,
8350 coding_category_utf_16_be);
ff0dacd7
KH
8351 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8352 | CATEGORY_MASK_UTF_16_LE_NOSIG))
89528eb3
KH
8353 utf_16_le_eol = detect_eol (coding.source, src_bytes,
8354 coding_category_utf_16_le);
8355 }
8356 else
8357 {
8358 if (EQ (eol_type, Qunix))
8359 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8360 else if (EQ (eol_type, Qdos))
8361 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8362 else
8363 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8364 }
8365
df7492f9
KH
8366 for (tail = val; CONSP (tail); tail = XCDR (tail))
8367 {
89528eb3 8368 enum coding_category category;
df7492f9 8369 int this_eol;
89528eb3
KH
8370
8371 id = XINT (XCAR (tail));
8372 attrs = CODING_ID_ATTRS (id);
8373 category = XINT (CODING_ATTR_CATEGORY (attrs));
8374 eol_type = CODING_ID_EOL_TYPE (id);
df7492f9
KH
8375 if (VECTORP (eol_type))
8376 {
89528eb3
KH
8377 if (category == coding_category_utf_16_be
8378 || category == coding_category_utf_16_be_nosig)
8379 this_eol = utf_16_be_eol;
8380 else if (category == coding_category_utf_16_le
8381 || category == coding_category_utf_16_le_nosig)
8382 this_eol = utf_16_le_eol;
df7492f9 8383 else
89528eb3
KH
8384 this_eol = normal_eol;
8385
df7492f9
KH
8386 if (this_eol == EOL_SEEN_LF)
8387 XSETCAR (tail, AREF (eol_type, 0));
8388 else if (this_eol == EOL_SEEN_CRLF)
8389 XSETCAR (tail, AREF (eol_type, 1));
8390 else if (this_eol == EOL_SEEN_CR)
8391 XSETCAR (tail, AREF (eol_type, 2));
89528eb3
KH
8392 else
8393 XSETCAR (tail, CODING_ID_NAME (id));
df7492f9 8394 }
89528eb3
KH
8395 else
8396 XSETCAR (tail, CODING_ID_NAME (id));
df7492f9
KH
8397 }
8398 }
ec6d2bb8 8399
4533845d 8400 return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
ec6d2bb8
KH
8401}
8402
ec6d2bb8 8403
d46c5b12
KH
8404DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8405 2, 3, 0,
48b0f3ae
PJ
8406 doc: /* Detect coding system of the text in the region between START and END.
8407Return a list of possible coding systems ordered by priority.
b811c52b
KH
8408The coding systems to try and their priorities follows what
8409the function `coding-system-priority-list' (which see) returns.
ec6d2bb8 8410
12e0131a 8411If only ASCII characters are found (except for such ISO-2022 control
d4a1d553
JB
8412characters as ESC), it returns a list of single element `undecided'
8413or its subsidiary coding system according to a detected end-of-line
8414format.
ec6d2bb8 8415
48b0f3ae
PJ
8416If optional argument HIGHEST is non-nil, return the coding system of
8417highest priority. */)
5842a27b 8418 (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
d46c5b12 8419{
d311d28c
PE
8420 ptrdiff_t from, to;
8421 ptrdiff_t from_byte, to_byte;
ec6d2bb8 8422
d46c5b12
KH
8423 validate_region (&start, &end);
8424 from = XINT (start), to = XINT (end);
8425 from_byte = CHAR_TO_BYTE (from);
8426 to_byte = CHAR_TO_BYTE (to);
ec6d2bb8 8427
d46c5b12
KH
8428 if (from < GPT && to >= GPT)
8429 move_gap_both (to, to_byte);
c210f766 8430
d46c5b12 8431 return detect_coding_system (BYTE_POS_ADDR (from_byte),
24a73b0a 8432 to - from, to_byte - from_byte,
0a28aafb 8433 !NILP (highest),
4b4deea2 8434 !NILP (BVAR (current_buffer
5d8ea120 8435 , enable_multibyte_characters)),
df7492f9 8436 Qnil);
ec6d2bb8
KH
8437}
8438
d46c5b12
KH
8439DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8440 1, 2, 0,
48b0f3ae
PJ
8441 doc: /* Detect coding system of the text in STRING.
8442Return a list of possible coding systems ordered by priority.
67ceab9d
KH
8443The coding systems to try and their priorities follows what
8444the function `coding-system-priority-list' (which see) returns.
fb88bf2d 8445
12e0131a 8446If only ASCII characters are found (except for such ISO-2022 control
d4a1d553
JB
8447characters as ESC), it returns a list of single element `undecided'
8448or its subsidiary coding system according to a detected end-of-line
8449format.
d46c5b12 8450
48b0f3ae
PJ
8451If optional argument HIGHEST is non-nil, return the coding system of
8452highest priority. */)
5842a27b 8453 (Lisp_Object string, Lisp_Object highest)
d46c5b12 8454{
b7826503 8455 CHECK_STRING (string);
b73bfc1c 8456
24a73b0a
KH
8457 return detect_coding_system (SDATA (string),
8458 SCHARS (string), SBYTES (string),
8f924df7 8459 !NILP (highest), STRING_MULTIBYTE (string),
df7492f9 8460 Qnil);
4ed46869 8461}
4ed46869 8462
b73bfc1c 8463
b0ab8123 8464static bool
971de7fb 8465char_encodable_p (int c, Lisp_Object attrs)
05e6f5dc 8466{
df7492f9 8467 Lisp_Object tail;
df7492f9 8468 struct charset *charset;
7d64c6ad 8469 Lisp_Object translation_table;
d46c5b12 8470
7d64c6ad 8471 translation_table = CODING_ATTR_TRANS_TBL (attrs);
a6f87d34 8472 if (! NILP (translation_table))
7d64c6ad 8473 c = translate_char (translation_table, c);
df7492f9
KH
8474 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8475 CONSP (tail); tail = XCDR (tail))
e133c8fa 8476 {
df7492f9
KH
8477 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8478 if (CHAR_CHARSET_P (c, charset))
8479 break;
e133c8fa 8480 }
df7492f9 8481 return (! NILP (tail));
05e6f5dc 8482}
83fa074f 8483
fb88bf2d 8484
df7492f9
KH
8485/* Return a list of coding systems that safely encode the text between
8486 START and END. If EXCLUDE is non-nil, it is a list of coding
8487 systems not to check. The returned list doesn't contain any such
48468dac 8488 coding systems. In any case, if the text contains only ASCII or is
df7492f9 8489 unibyte, return t. */
e077cc80 8490
df7492f9
KH
8491DEFUN ("find-coding-systems-region-internal",
8492 Ffind_coding_systems_region_internal,
8493 Sfind_coding_systems_region_internal, 2, 3, 0,
8494 doc: /* Internal use only. */)
5842a27b 8495 (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
df7492f9
KH
8496{
8497 Lisp_Object coding_attrs_list, safe_codings;
d311d28c 8498 ptrdiff_t start_byte, end_byte;
7c78e542 8499 const unsigned char *p, *pbeg, *pend;
df7492f9 8500 int c;
0e727afa 8501 Lisp_Object tail, elt, work_table;
d46c5b12 8502
df7492f9
KH
8503 if (STRINGP (start))
8504 {
8505 if (!STRING_MULTIBYTE (start)
8f924df7 8506 || SCHARS (start) == SBYTES (start))
df7492f9
KH
8507 return Qt;
8508 start_byte = 0;
8f924df7 8509 end_byte = SBYTES (start);
df7492f9
KH
8510 }
8511 else
d46c5b12 8512 {
df7492f9
KH
8513 CHECK_NUMBER_COERCE_MARKER (start);
8514 CHECK_NUMBER_COERCE_MARKER (end);
8515 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8516 args_out_of_range (start, end);
4b4deea2 8517 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
df7492f9
KH
8518 return Qt;
8519 start_byte = CHAR_TO_BYTE (XINT (start));
8520 end_byte = CHAR_TO_BYTE (XINT (end));
8521 if (XINT (end) - XINT (start) == end_byte - start_byte)
8522 return Qt;
d46c5b12 8523
e1c23804 8524 if (XINT (start) < GPT && XINT (end) > GPT)
d46c5b12 8525 {
e1c23804
DL
8526 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8527 move_gap_both (XINT (start), start_byte);
df7492f9 8528 else
e1c23804 8529 move_gap_both (XINT (end), end_byte);
d46c5b12
KH
8530 }
8531 }
8532
df7492f9
KH
8533 coding_attrs_list = Qnil;
8534 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8535 if (NILP (exclude)
8536 || NILP (Fmemq (XCAR (tail), exclude)))
8537 {
8538 Lisp_Object attrs;
d46c5b12 8539
df7492f9
KH
8540 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8541 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8542 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7d64c6ad
KH
8543 {
8544 ASET (attrs, coding_attr_trans_tbl,
2170c8f0 8545 get_translation_table (attrs, 1, NULL));
7d64c6ad
KH
8546 coding_attrs_list = Fcons (attrs, coding_attrs_list);
8547 }
df7492f9 8548 }
d46c5b12 8549
df7492f9 8550 if (STRINGP (start))
8f924df7 8551 p = pbeg = SDATA (start);
df7492f9
KH
8552 else
8553 p = pbeg = BYTE_POS_ADDR (start_byte);
8554 pend = p + (end_byte - start_byte);
b843d1ae 8555
df7492f9
KH
8556 while (p < pend && ASCII_BYTE_P (*p)) p++;
8557 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
d46c5b12 8558
0e727afa 8559 work_table = Fmake_char_table (Qnil, Qnil);
05e6f5dc 8560 while (p < pend)
72d1a715 8561 {
df7492f9
KH
8562 if (ASCII_BYTE_P (*p))
8563 p++;
72d1a715
RS
8564 else
8565 {
df7492f9 8566 c = STRING_CHAR_ADVANCE (p);
0e727afa
YM
8567 if (!NILP (char_table_ref (work_table, c)))
8568 /* This character was already checked. Ignore it. */
8569 continue;
12410ef1 8570
df7492f9
KH
8571 charset_map_loaded = 0;
8572 for (tail = coding_attrs_list; CONSP (tail);)
8573 {
8574 elt = XCAR (tail);
8575 if (NILP (elt))
8576 tail = XCDR (tail);
8577 else if (char_encodable_p (c, elt))
8578 tail = XCDR (tail);
8579 else if (CONSP (XCDR (tail)))
8580 {
8581 XSETCAR (tail, XCAR (XCDR (tail)));
8582 XSETCDR (tail, XCDR (XCDR (tail)));
8583 }
8584 else
8585 {
8586 XSETCAR (tail, Qnil);
8587 tail = XCDR (tail);
8588 }
8589 }
8590 if (charset_map_loaded)
8591 {
d311d28c 8592 ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
05e6f5dc 8593
df7492f9 8594 if (STRINGP (start))
8f924df7 8595 pbeg = SDATA (start);
df7492f9
KH
8596 else
8597 pbeg = BYTE_POS_ADDR (start_byte);
8598 p = pbeg + p_offset;
8599 pend = pbeg + pend_offset;
8600 }
0e727afa 8601 char_table_set (work_table, c, Qt);
df7492f9 8602 }
ec6d2bb8 8603 }
fb88bf2d 8604
988b3759 8605 safe_codings = list2 (Qraw_text, Qno_conversion);
df7492f9
KH
8606 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8607 if (! NILP (XCAR (tail)))
8608 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
ec6d2bb8 8609
05e6f5dc
KH
8610 return safe_codings;
8611}
4956c225 8612
d46c5b12 8613
8f924df7
KH
8614DEFUN ("unencodable-char-position", Funencodable_char_position,
8615 Sunencodable_char_position, 3, 5, 0,
8616 doc: /*
8617Return position of first un-encodable character in a region.
d4a1d553 8618START and END specify the region and CODING-SYSTEM specifies the
8f924df7 8619encoding to check. Return nil if CODING-SYSTEM does encode the region.
d46c5b12 8620
8f924df7
KH
8621If optional 4th argument COUNT is non-nil, it specifies at most how
8622many un-encodable characters to search. In this case, the value is a
8623list of positions.
d46c5b12 8624
8f924df7
KH
8625If optional 5th argument STRING is non-nil, it is a string to search
8626for un-encodable characters. In that case, START and END are indexes
8627to the string. */)
5842a27b 8628 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8f924df7 8629{
d311d28c 8630 EMACS_INT n;
8f924df7 8631 struct coding_system coding;
7d64c6ad 8632 Lisp_Object attrs, charset_list, translation_table;
8f924df7 8633 Lisp_Object positions;
d311d28c 8634 ptrdiff_t from, to;
8f924df7 8635 const unsigned char *p, *stop, *pend;
f10fe38f 8636 bool ascii_compatible;
fb88bf2d 8637
8f924df7
KH
8638 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8639 attrs = CODING_ID_ATTRS (coding.id);
8640 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8641 return Qnil;
8642 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8643 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2170c8f0 8644 translation_table = get_translation_table (attrs, 1, NULL);
fb88bf2d 8645
8f924df7
KH
8646 if (NILP (string))
8647 {
8648 validate_region (&start, &end);
8649 from = XINT (start);
8650 to = XINT (end);
4b4deea2 8651 if (NILP (BVAR (current_buffer, enable_multibyte_characters))
8f924df7
KH
8652 || (ascii_compatible
8653 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8654 return Qnil;
8655 p = CHAR_POS_ADDR (from);
8656 pend = CHAR_POS_ADDR (to);
8657 if (from < GPT && to >= GPT)
8658 stop = GPT_ADDR;
8659 else
8660 stop = pend;
8661 }
8662 else
8663 {
8664 CHECK_STRING (string);
8665 CHECK_NATNUM (start);
8666 CHECK_NATNUM (end);
d311d28c
PE
8667 if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
8668 args_out_of_range_3 (string, start, end);
8f924df7
KH
8669 from = XINT (start);
8670 to = XINT (end);
8f924df7
KH
8671 if (! STRING_MULTIBYTE (string))
8672 return Qnil;
8673 p = SDATA (string) + string_char_to_byte (string, from);
8674 stop = pend = SDATA (string) + string_char_to_byte (string, to);
8675 if (ascii_compatible && (to - from) == (pend - p))
8676 return Qnil;
8677 }
f2558efd 8678
8f924df7
KH
8679 if (NILP (count))
8680 n = 1;
8681 else
b73bfc1c 8682 {
8f924df7
KH
8683 CHECK_NATNUM (count);
8684 n = XINT (count);
b73bfc1c
KH
8685 }
8686
8f924df7 8687 positions = Qnil;
3633e3aa 8688 charset_map_loaded = 0;
8f924df7 8689 while (1)
d46c5b12 8690 {
8f924df7 8691 int c;
ec6d2bb8 8692
8f924df7
KH
8693 if (ascii_compatible)
8694 while (p < stop && ASCII_BYTE_P (*p))
8695 p++, from++;
8696 if (p >= stop)
0e79d667 8697 {
8f924df7
KH
8698 if (p >= pend)
8699 break;
8700 stop = pend;
8701 p = GAP_END_ADDR;
0e79d667 8702 }
ec6d2bb8 8703
8f924df7
KH
8704 c = STRING_CHAR_ADVANCE (p);
8705 if (! (ASCII_CHAR_P (c) && ascii_compatible)
7d64c6ad
KH
8706 && ! char_charset (translate_char (translation_table, c),
8707 charset_list, NULL))
ec6d2bb8 8708 {
8f924df7
KH
8709 positions = Fcons (make_number (from), positions);
8710 n--;
8711 if (n == 0)
8712 break;
ec6d2bb8
KH
8713 }
8714
8f924df7 8715 from++;
3633e3aa
KH
8716 if (charset_map_loaded && NILP (string))
8717 {
8718 p = CHAR_POS_ADDR (from);
8719 pend = CHAR_POS_ADDR (to);
8720 if (from < GPT && to >= GPT)
8721 stop = GPT_ADDR;
8722 else
8723 stop = pend;
8724 charset_map_loaded = 0;
8725 }
8f924df7 8726 }
d46c5b12 8727
8f924df7
KH
8728 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8729}
d46c5b12 8730
d46c5b12 8731
df7492f9
KH
8732DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8733 Scheck_coding_systems_region, 3, 3, 0,
8734 doc: /* Check if the region is encodable by coding systems.
d46c5b12 8735
df7492f9
KH
8736START and END are buffer positions specifying the region.
8737CODING-SYSTEM-LIST is a list of coding systems to check.
d46c5b12 8738
df7492f9 8739The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
d4a1d553 8740CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
df7492f9
KH
8741whole region, POS0, POS1, ... are buffer positions where non-encodable
8742characters are found.
93dec019 8743
df7492f9
KH
8744If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8745value is nil.
93dec019 8746
df7492f9
KH
8747START may be a string. In that case, check if the string is
8748encodable, and the value contains indices to the string instead of
5704f39a
KH
8749buffer positions. END is ignored.
8750
4c1958f4 8751If the current buffer (or START if it is a string) is unibyte, the value
5704f39a 8752is nil. */)
5842a27b 8753 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
05e6f5dc 8754{
df7492f9 8755 Lisp_Object list;
d311d28c
PE
8756 ptrdiff_t start_byte, end_byte;
8757 ptrdiff_t pos;
7c78e542 8758 const unsigned char *p, *pbeg, *pend;
df7492f9 8759 int c;
7d64c6ad 8760 Lisp_Object tail, elt, attrs;
70ad9fc4 8761
05e6f5dc
KH
8762 if (STRINGP (start))
8763 {
df7492f9 8764 if (!STRING_MULTIBYTE (start)
4c1958f4 8765 || SCHARS (start) == SBYTES (start))
df7492f9
KH
8766 return Qnil;
8767 start_byte = 0;
8f924df7 8768 end_byte = SBYTES (start);
df7492f9 8769 pos = 0;
d46c5b12 8770 }
05e6f5dc 8771 else
b73bfc1c 8772 {
b7826503
PJ
8773 CHECK_NUMBER_COERCE_MARKER (start);
8774 CHECK_NUMBER_COERCE_MARKER (end);
05e6f5dc
KH
8775 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8776 args_out_of_range (start, end);
4b4deea2 8777 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
df7492f9
KH
8778 return Qnil;
8779 start_byte = CHAR_TO_BYTE (XINT (start));
8780 end_byte = CHAR_TO_BYTE (XINT (end));
8781 if (XINT (end) - XINT (start) == end_byte - start_byte)
5704f39a 8782 return Qnil;
df7492f9 8783
e1c23804 8784 if (XINT (start) < GPT && XINT (end) > GPT)
b73bfc1c 8785 {
e1c23804
DL
8786 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8787 move_gap_both (XINT (start), start_byte);
df7492f9 8788 else
e1c23804 8789 move_gap_both (XINT (end), end_byte);
b73bfc1c 8790 }
e1c23804 8791 pos = XINT (start);
b73bfc1c 8792 }
7553d0e1 8793
df7492f9
KH
8794 list = Qnil;
8795 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
12410ef1 8796 {
df7492f9 8797 elt = XCAR (tail);
7d64c6ad 8798 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
2170c8f0
KH
8799 ASET (attrs, coding_attr_trans_tbl,
8800 get_translation_table (attrs, 1, NULL));
7d64c6ad 8801 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
12410ef1
KH
8802 }
8803
df7492f9 8804 if (STRINGP (start))
8f924df7 8805 p = pbeg = SDATA (start);
72d1a715 8806 else
df7492f9
KH
8807 p = pbeg = BYTE_POS_ADDR (start_byte);
8808 pend = p + (end_byte - start_byte);
4ed46869 8809
df7492f9
KH
8810 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8811 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
ec6d2bb8 8812
df7492f9 8813 while (p < pend)
d46c5b12 8814 {
df7492f9
KH
8815 if (ASCII_BYTE_P (*p))
8816 p++;
e133c8fa 8817 else
05e6f5dc 8818 {
df7492f9
KH
8819 c = STRING_CHAR_ADVANCE (p);
8820
8821 charset_map_loaded = 0;
8822 for (tail = list; CONSP (tail); tail = XCDR (tail))
8823 {
8824 elt = XCDR (XCAR (tail));
8825 if (! char_encodable_p (c, XCAR (elt)))
8826 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8827 }
8828 if (charset_map_loaded)
8829 {
d311d28c 8830 ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
df7492f9
KH
8831
8832 if (STRINGP (start))
8f924df7 8833 pbeg = SDATA (start);
df7492f9
KH
8834 else
8835 pbeg = BYTE_POS_ADDR (start_byte);
8836 p = pbeg + p_offset;
8837 pend = pbeg + pend_offset;
8838 }
05e6f5dc 8839 }
df7492f9 8840 pos++;
d46c5b12 8841 }
4ed46869 8842
df7492f9
KH
8843 tail = list;
8844 list = Qnil;
8845 for (; CONSP (tail); tail = XCDR (tail))
ec6d2bb8 8846 {
df7492f9
KH
8847 elt = XCAR (tail);
8848 if (CONSP (XCDR (XCDR (elt))))
8849 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8850 list);
ec6d2bb8 8851 }
2b4f9037 8852
df7492f9 8853 return list;
d46c5b12
KH
8854}
8855
3fd9494b 8856
74ab6df5 8857static Lisp_Object
cf84bb53
JB
8858code_convert_region (Lisp_Object start, Lisp_Object end,
8859 Lisp_Object coding_system, Lisp_Object dst_object,
f10fe38f 8860 bool encodep, bool norecord)
4ed46869 8861{
3a73fa5d 8862 struct coding_system coding;
d311d28c 8863 ptrdiff_t from, from_byte, to, to_byte;
df7492f9 8864 Lisp_Object src_object;
4ed46869 8865
df7492f9
KH
8866 if (NILP (coding_system))
8867 coding_system = Qno_conversion;
8868 else
8869 CHECK_CODING_SYSTEM (coding_system);
8870 src_object = Fcurrent_buffer ();
8871 if (NILP (dst_object))
8872 dst_object = src_object;
8873 else if (! EQ (dst_object, Qt))
8874 CHECK_BUFFER (dst_object);
3a73fa5d 8875
d46c5b12
KH
8876 validate_region (&start, &end);
8877 from = XFASTINT (start);
df7492f9 8878 from_byte = CHAR_TO_BYTE (from);
d46c5b12 8879 to = XFASTINT (end);
df7492f9 8880 to_byte = CHAR_TO_BYTE (to);
764ca8da 8881
df7492f9
KH
8882 setup_coding_system (coding_system, &coding);
8883 coding.mode |= CODING_MODE_LAST_BLOCK;
ec6d2bb8 8884
df7492f9
KH
8885 if (encodep)
8886 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8887 dst_object);
8888 else
8889 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8890 dst_object);
8891 if (! norecord)
8892 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
ec6d2bb8 8893
df7492f9
KH
8894 return (BUFFERP (dst_object)
8895 ? make_number (coding.produced_char)
8896 : coding.dst_object);
4031e2bf 8897}
78108bcd 8898
4ed46869 8899
4031e2bf 8900DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
df7492f9 8901 3, 4, "r\nzCoding system: ",
48b0f3ae 8902 doc: /* Decode the current region from the specified coding system.
df7492f9
KH
8903When called from a program, takes four arguments:
8904 START, END, CODING-SYSTEM, and DESTINATION.
8905START and END are buffer positions.
8844fa83 8906
df7492f9 8907Optional 4th arguments DESTINATION specifies where the decoded text goes.
2354b80b 8908If nil, the region between START and END is replaced by the decoded text.
1560f91a
EZ
8909If buffer, the decoded text is inserted in that buffer after point (point
8910does not move).
446dcd75 8911In those cases, the length of the decoded text is returned.
319a3947 8912If DESTINATION is t, the decoded text is returned.
8844fa83 8913
48b0f3ae
PJ
8914This function sets `last-coding-system-used' to the precise coding system
8915used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
319a3947 8916not fully specified.) */)
5842a27b 8917 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
4031e2bf 8918{
df7492f9 8919 return code_convert_region (start, end, coding_system, destination, 0, 0);
3a73fa5d 8920}
8844fa83 8921
3a73fa5d 8922DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
df7492f9
KH
8923 3, 4, "r\nzCoding system: ",
8924 doc: /* Encode the current region by specified coding system.
d4a1d553
JB
8925When called from a program, takes four arguments:
8926 START, END, CODING-SYSTEM and DESTINATION.
8927START and END are buffer positions.
d46c5b12 8928
df7492f9
KH
8929Optional 4th arguments DESTINATION specifies where the encoded text goes.
8930If nil, the region between START and END is replace by the encoded text.
1560f91a
EZ
8931If buffer, the encoded text is inserted in that buffer after point (point
8932does not move).
446dcd75 8933In those cases, the length of the encoded text is returned.
319a3947 8934If DESTINATION is t, the encoded text is returned.
2391eaa4 8935
48b0f3ae
PJ
8936This function sets `last-coding-system-used' to the precise coding system
8937used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
319a3947 8938not fully specified.) */)
5842a27b 8939 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
3a73fa5d 8940{
df7492f9 8941 return code_convert_region (start, end, coding_system, destination, 1, 0);
b73bfc1c
KH
8942}
8943
8944Lisp_Object
6f704c76 8945code_convert_string (Lisp_Object string, Lisp_Object coding_system,
f10fe38f
PE
8946 Lisp_Object dst_object, bool encodep, bool nocopy,
8947 bool norecord)
b73bfc1c 8948{
4031e2bf 8949 struct coding_system coding;
d311d28c 8950 ptrdiff_t chars, bytes;
ec6d2bb8 8951
b7826503 8952 CHECK_STRING (string);
d46c5b12 8953 if (NILP (coding_system))
4956c225 8954 {
df7492f9
KH
8955 if (! norecord)
8956 Vlast_coding_system_used = Qno_conversion;
8957 if (NILP (dst_object))
8958 return (nocopy ? Fcopy_sequence (string) : string);
4956c225 8959 }
b73bfc1c 8960
df7492f9
KH
8961 if (NILP (coding_system))
8962 coding_system = Qno_conversion;
8963 else
8964 CHECK_CODING_SYSTEM (coding_system);
8965 if (NILP (dst_object))
8966 dst_object = Qt;
8967 else if (! EQ (dst_object, Qt))
8968 CHECK_BUFFER (dst_object);
73be902c 8969
df7492f9 8970 setup_coding_system (coding_system, &coding);
d46c5b12 8971 coding.mode |= CODING_MODE_LAST_BLOCK;
8f924df7
KH
8972 chars = SCHARS (string);
8973 bytes = SBYTES (string);
df7492f9
KH
8974 if (encodep)
8975 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8976 else
8977 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8978 if (! norecord)
8979 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
73be902c 8980
df7492f9
KH
8981 return (BUFFERP (dst_object)
8982 ? make_number (coding.produced_char)
8983 : coding.dst_object);
4ed46869 8984}
73be902c 8985
b73bfc1c 8986
ecec61c1 8987/* Encode or decode STRING according to CODING_SYSTEM.
ec6d2bb8 8988 Do not set Vlast_coding_system_used.
4ed46869 8989
ec6d2bb8
KH
8990 This function is called only from macros DECODE_FILE and
8991 ENCODE_FILE, thus we ignore character composition. */
4ed46869 8992
ecec61c1 8993Lisp_Object
cf84bb53 8994code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
f10fe38f 8995 bool encodep)
4ed46869 8996{
0be8721c 8997 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
4ed46869
KH
8998}
8999
4ed46869 9000
a7ca3326 9001DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
df7492f9
KH
9002 2, 4, 0,
9003 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9004
9005Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9006if the decoding operation is trivial.
ecec61c1 9007
d4a1d553 9008Optional fourth arg BUFFER non-nil means that the decoded text is
1560f91a
EZ
9009inserted in that buffer after point (point does not move). In this
9010case, the return value is the length of the decoded text.
ecec61c1 9011
df7492f9
KH
9012This function sets `last-coding-system-used' to the precise coding system
9013used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
d4a1d553 9014not fully specified.) */)
5842a27b 9015 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
4ed46869 9016{
df7492f9
KH
9017 return code_convert_string (string, coding_system, buffer,
9018 0, ! NILP (nocopy), 0);
4ed46869
KH
9019}
9020
df7492f9
KH
9021DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9022 2, 4, 0,
9023 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9024
9025Optional third arg NOCOPY non-nil means it is OK to return STRING
9026itself if the encoding operation is trivial.
9027
d4a1d553 9028Optional fourth arg BUFFER non-nil means that the encoded text is
1560f91a
EZ
9029inserted in that buffer after point (point does not move). In this
9030case, the return value is the length of the encoded text.
df7492f9
KH
9031
9032This function sets `last-coding-system-used' to the precise coding system
9033used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9034not fully specified.) */)
5842a27b 9035 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
4ed46869 9036{
df7492f9 9037 return code_convert_string (string, coding_system, buffer,
4550efdf 9038 1, ! NILP (nocopy), 0);
4ed46869 9039}
df7492f9 9040
3a73fa5d 9041\f
4ed46869 9042DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
48b0f3ae
PJ
9043 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9044Return the corresponding character. */)
5842a27b 9045 (Lisp_Object code)
4ed46869 9046{
df7492f9
KH
9047 Lisp_Object spec, attrs, val;
9048 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
5fdb398c
PE
9049 EMACS_INT ch;
9050 int c;
4ed46869 9051
df7492f9 9052 CHECK_NATNUM (code);
5fdb398c 9053 ch = XFASTINT (code);
df7492f9
KH
9054 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9055 attrs = AREF (spec, 0);
4ed46869 9056
5fdb398c 9057 if (ASCII_BYTE_P (ch)
df7492f9
KH
9058 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9059 return code;
4ed46869 9060
df7492f9
KH
9061 val = CODING_ATTR_CHARSET_LIST (attrs);
9062 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
004068e4
KH
9063 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9064 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
fa42c37f 9065
5fdb398c
PE
9066 if (ch <= 0x7F)
9067 {
9068 c = ch;
9069 charset = charset_roman;
9070 }
9071 else if (ch >= 0xA0 && ch < 0xDF)
55ab7be3 9072 {
5fdb398c 9073 c = ch - 0x80;
df7492f9 9074 charset = charset_kana;
4ed46869 9075 }
55ab7be3 9076 else
4ed46869 9077 {
5fdb398c
PE
9078 EMACS_INT c1 = ch >> 8;
9079 int c2 = ch & 0xFF;
df7492f9 9080
2735d060
PE
9081 if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9082 || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
c2982e87 9083 error ("Invalid code: %"pI"d", ch);
5fdb398c 9084 c = ch;
df7492f9
KH
9085 SJIS_TO_JIS (c);
9086 charset = charset_kanji;
4ed46869 9087 }
df7492f9
KH
9088 c = DECODE_CHAR (charset, c);
9089 if (c < 0)
c2982e87 9090 error ("Invalid code: %"pI"d", ch);
df7492f9 9091 return make_number (c);
93dec019 9092}
4ed46869 9093
48b0f3ae 9094
4ed46869 9095DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
8acf0c0e 9096 doc: /* Encode a Japanese character CH to shift_jis encoding.
48b0f3ae 9097Return the corresponding code in SJIS. */)
5842a27b 9098 (Lisp_Object ch)
4ed46869 9099{
df7492f9
KH
9100 Lisp_Object spec, attrs, charset_list;
9101 int c;
9102 struct charset *charset;
9103 unsigned code;
48b0f3ae 9104
df7492f9
KH
9105 CHECK_CHARACTER (ch);
9106 c = XFASTINT (ch);
9107 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9108 attrs = AREF (spec, 0);
9109
9110 if (ASCII_CHAR_P (c)
9111 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9112 return ch;
9113
9114 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9115 charset = char_charset (c, charset_list, &code);
9116 if (code == CHARSET_INVALID_CODE (charset))
e6c3da20 9117 error ("Can't encode by shift_jis encoding: %c", c);
df7492f9
KH
9118 JIS_TO_SJIS (code);
9119
9120 return make_number (code);
4ed46869
KH
9121}
9122
9123DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
48b0f3ae
PJ
9124 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9125Return the corresponding character. */)
5842a27b 9126 (Lisp_Object code)
d46c5b12 9127{
df7492f9
KH
9128 Lisp_Object spec, attrs, val;
9129 struct charset *charset_roman, *charset_big5, *charset;
5fdb398c 9130 EMACS_INT ch;
df7492f9 9131 int c;
6289dd10 9132
df7492f9 9133 CHECK_NATNUM (code);
5fdb398c 9134 ch = XFASTINT (code);
df7492f9
KH
9135 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9136 attrs = AREF (spec, 0);
4ed46869 9137
5fdb398c 9138 if (ASCII_BYTE_P (ch)
df7492f9
KH
9139 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9140 return code;
6289dd10 9141
df7492f9
KH
9142 val = CODING_ATTR_CHARSET_LIST (attrs);
9143 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9144 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
c210f766 9145
5fdb398c
PE
9146 if (ch <= 0x7F)
9147 {
9148 c = ch;
9149 charset = charset_roman;
9150 }
c28a9453
KH
9151 else
9152 {
5fdb398c
PE
9153 EMACS_INT b1 = ch >> 8;
9154 int b2 = ch & 0x7F;
df7492f9
KH
9155 if (b1 < 0xA1 || b1 > 0xFE
9156 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
c2982e87 9157 error ("Invalid code: %"pI"d", ch);
5fdb398c 9158 c = ch;
df7492f9 9159 charset = charset_big5;
c28a9453 9160 }
5fdb398c 9161 c = DECODE_CHAR (charset, c);
df7492f9 9162 if (c < 0)
c2982e87 9163 error ("Invalid code: %"pI"d", ch);
df7492f9 9164 return make_number (c);
d46c5b12 9165}
6289dd10 9166
4ed46869 9167DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
8acf0c0e 9168 doc: /* Encode the Big5 character CH to BIG5 coding system.
48b0f3ae 9169Return the corresponding character code in Big5. */)
5842a27b 9170 (Lisp_Object ch)
4ed46869 9171{
df7492f9
KH
9172 Lisp_Object spec, attrs, charset_list;
9173 struct charset *charset;
9174 int c;
9175 unsigned code;
9176
9177 CHECK_CHARACTER (ch);
9178 c = XFASTINT (ch);
9179 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9180 attrs = AREF (spec, 0);
9181 if (ASCII_CHAR_P (c)
9182 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9183 return ch;
9184
9185 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9186 charset = char_charset (c, charset_list, &code);
9187 if (code == CHARSET_INVALID_CODE (charset))
e6c3da20 9188 error ("Can't encode by Big5 encoding: %c", c);
df7492f9
KH
9189
9190 return make_number (code);
4ed46869 9191}
48b0f3ae 9192
3a73fa5d 9193\f
002fdb44 9194DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
68bba4e4 9195 Sset_terminal_coding_system_internal, 1, 2, 0,
48b0f3ae 9196 doc: /* Internal use only. */)
5842a27b 9197 (Lisp_Object coding_system, Lisp_Object terminal)
4ed46869 9198{
b18fad6d
KH
9199 struct terminal *term = get_terminal (terminal, 1);
9200 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
b7826503 9201 CHECK_SYMBOL (coding_system);
b8299c66 9202 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
70c22245 9203 /* We had better not send unsafe characters to terminal. */
c73bd236 9204 terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
ad1746f5 9205 /* Character composition should be disabled. */
c73bd236 9206 terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
b8299c66
KL
9207 terminal_coding->src_multibyte = 1;
9208 terminal_coding->dst_multibyte = 0;
3f22b86f
PE
9209 tset_charset_list
9210 (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9211 ? coding_charset_list (terminal_coding)
9212 : Fcons (make_number (charset_ascii), Qnil)));
4ed46869
KH
9213 return Qnil;
9214}
9215
c4825358
KH
9216DEFUN ("set-safe-terminal-coding-system-internal",
9217 Fset_safe_terminal_coding_system_internal,
48b0f3ae 9218 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
ddb67bdc 9219 doc: /* Internal use only. */)
5842a27b 9220 (Lisp_Object coding_system)
d46c5b12 9221{
b7826503 9222 CHECK_SYMBOL (coding_system);
c4825358
KH
9223 setup_coding_system (Fcheck_coding_system (coding_system),
9224 &safe_terminal_coding);
ad1746f5 9225 /* Character composition should be disabled. */
df7492f9 9226 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
b73bfc1c
KH
9227 safe_terminal_coding.src_multibyte = 1;
9228 safe_terminal_coding.dst_multibyte = 0;
c4825358
KH
9229 return Qnil;
9230}
4ed46869 9231
002fdb44 9232DEFUN ("terminal-coding-system", Fterminal_coding_system,
68bba4e4 9233 Sterminal_coding_system, 0, 1, 0,
6ed8eeff 9234 doc: /* Return coding system specified for terminal output on the given terminal.
708e05dc 9235TERMINAL may be a terminal object, a frame, or nil for the selected
6ed8eeff 9236frame's terminal device. */)
5842a27b 9237 (Lisp_Object terminal)
4ed46869 9238{
985773c9
MB
9239 struct coding_system *terminal_coding
9240 = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9241 Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
ae6f73fa 9242
6d5eb5b0 9243 /* For backward compatibility, return nil if it is `undecided'. */
f75c90a9 9244 return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
4ed46869
KH
9245}
9246
002fdb44 9247DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
68bba4e4 9248 Sset_keyboard_coding_system_internal, 1, 2, 0,
48b0f3ae 9249 doc: /* Internal use only. */)
5842a27b 9250 (Lisp_Object coding_system, Lisp_Object terminal)
4ed46869 9251{
6ed8eeff 9252 struct terminal *t = get_terminal (terminal, 1);
b7826503 9253 CHECK_SYMBOL (coding_system);
624bda09
KH
9254 if (NILP (coding_system))
9255 coding_system = Qno_conversion;
9256 else
9257 Fcheck_coding_system (coding_system);
9258 setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
ad1746f5 9259 /* Character composition should be disabled. */
c73bd236
MB
9260 TERMINAL_KEYBOARD_CODING (t)->common_flags
9261 &= ~CODING_ANNOTATE_COMPOSITION_MASK;
4ed46869
KH
9262 return Qnil;
9263}
9264
9265DEFUN ("keyboard-coding-system",
985773c9 9266 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
48b0f3ae 9267 doc: /* Return coding system specified for decoding keyboard input. */)
5842a27b 9268 (Lisp_Object terminal)
4ed46869 9269{
985773c9
MB
9270 return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9271 (get_terminal (terminal, 1))->id);
4ed46869
KH
9272}
9273
4ed46869 9274\f
a7ca3326 9275DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
a5d301df 9276 Sfind_operation_coding_system, 1, MANY, 0,
48b0f3ae
PJ
9277 doc: /* Choose a coding system for an operation based on the target name.
9278The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9279DECODING-SYSTEM is the coding system to use for decoding
9280\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9281for encoding (in case OPERATION does encoding).
05e6f5dc 9282
48b0f3ae
PJ
9283The first argument OPERATION specifies an I/O primitive:
9284 For file I/O, `insert-file-contents' or `write-region'.
9285 For process I/O, `call-process', `call-process-region', or `start-process'.
9286 For network I/O, `open-network-stream'.
05e6f5dc 9287
48b0f3ae
PJ
9288The remaining arguments should be the same arguments that were passed
9289to the primitive. Depending on which primitive, one of those arguments
9290is selected as the TARGET. For example, if OPERATION does file I/O,
9291whichever argument specifies the file name is TARGET.
05e6f5dc 9292
48b0f3ae 9293TARGET has a meaning which depends on OPERATION:
b883cdb2 9294 For file I/O, TARGET is a file name (except for the special case below).
48b0f3ae 9295 For process I/O, TARGET is a process name.
d4a1d553 9296 For network I/O, TARGET is a service name or a port number.
05e6f5dc 9297
d4a1d553 9298This function looks up what is specified for TARGET in
48b0f3ae
PJ
9299`file-coding-system-alist', `process-coding-system-alist',
9300or `network-coding-system-alist' depending on OPERATION.
9301They may specify a coding system, a cons of coding systems,
9302or a function symbol to call.
9303In the last case, we call the function with one argument,
9304which is a list of all the arguments given to this function.
1011c487
MB
9305If the function can't decide a coding system, it can return
9306`undecided' so that the normal code-detection is performed.
48b0f3ae 9307
b883cdb2
MB
9308If OPERATION is `insert-file-contents', the argument corresponding to
9309TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
9310file name to look up, and BUFFER is a buffer that contains the file's
9311contents (not yet decoded). If `file-coding-system-alist' specifies a
9312function to call for FILENAME, that function should examine the
9313contents of BUFFER instead of reading the file.
9314
d918f936 9315usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
f66c7cf8 9316 (ptrdiff_t nargs, Lisp_Object *args)
6b89e3aa 9317{
4ed46869
KH
9318 Lisp_Object operation, target_idx, target, val;
9319 register Lisp_Object chain;
177c0ea7 9320
4ed46869
KH
9321 if (nargs < 2)
9322 error ("Too few arguments");
9323 operation = args[0];
9324 if (!SYMBOLP (operation)
d311d28c 9325 || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
3ed051d4 9326 error ("Invalid first argument");
7b09a37a 9327 if (nargs <= 1 + XFASTINT (target_idx))
94dcfacf 9328 error ("Too few arguments for operation `%s'",
8f924df7 9329 SDATA (SYMBOL_NAME (operation)));
c5101a77 9330 target = args[XFASTINT (target_idx) + 1];
4ed46869 9331 if (!(STRINGP (target)
091a0ff0
KH
9332 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9333 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
4ed46869 9334 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
94dcfacf
EZ
9335 error ("Invalid argument %"pI"d of operation `%s'",
9336 XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
091a0ff0
KH
9337 if (CONSP (target))
9338 target = XCAR (target);
4ed46869 9339
2e34157c
RS
9340 chain = ((EQ (operation, Qinsert_file_contents)
9341 || EQ (operation, Qwrite_region))
02ba4723 9342 ? Vfile_coding_system_alist
2e34157c 9343 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
9344 ? Vnetwork_coding_system_alist
9345 : Vprocess_coding_system_alist));
4ed46869
KH
9346 if (NILP (chain))
9347 return Qnil;
9348
03699b14 9349 for (; CONSP (chain); chain = XCDR (chain))
6b89e3aa 9350 {
f44d27ce 9351 Lisp_Object elt;
6b89e3aa 9352
df7492f9 9353 elt = XCAR (chain);
4ed46869
KH
9354 if (CONSP (elt)
9355 && ((STRINGP (target)
03699b14
KR
9356 && STRINGP (XCAR (elt))
9357 && fast_string_match (XCAR (elt), target) >= 0)
9358 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
6b89e3aa 9359 {
03699b14 9360 val = XCDR (elt);
b19fd4c5
KH
9361 /* Here, if VAL is both a valid coding system and a valid
9362 function symbol, we return VAL as a coding system. */
02ba4723
KH
9363 if (CONSP (val))
9364 return val;
9365 if (! SYMBOLP (val))
9366 return Qnil;
9367 if (! NILP (Fcoding_system_p (val)))
9368 return Fcons (val, val);
b19fd4c5 9369 if (! NILP (Ffboundp (val)))
6b89e3aa 9370 {
e2b97060
MB
9371 /* We use call1 rather than safe_call1
9372 so as to get bug reports about functions called here
9373 which don't handle the current interface. */
9374 val = call1 (val, Flist (nargs, args));
b19fd4c5
KH
9375 if (CONSP (val))
9376 return val;
9377 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9378 return Fcons (val, val);
6b89e3aa 9379 }
02ba4723 9380 return Qnil;
6b89e3aa
KH
9381 }
9382 }
4ed46869 9383 return Qnil;
6b89e3aa
KH
9384}
9385
df7492f9 9386DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
a3f6ee6d 9387 Sset_coding_system_priority, 0, MANY, 0,
da7db224 9388 doc: /* Assign higher priority to the coding systems given as arguments.
d4a1d553 9389If multiple coding systems belong to the same category,
a3181084
DL
9390all but the first one are ignored.
9391
d4a1d553 9392usage: (set-coding-system-priority &rest coding-systems) */)
f66c7cf8 9393 (ptrdiff_t nargs, Lisp_Object *args)
df7492f9 9394{
f66c7cf8 9395 ptrdiff_t i, j;
f10fe38f 9396 bool changed[coding_category_max];
df7492f9
KH
9397 enum coding_category priorities[coding_category_max];
9398
72af86bd 9399 memset (changed, 0, sizeof changed);
6b89e3aa 9400
df7492f9 9401 for (i = j = 0; i < nargs; i++)
6b89e3aa 9402 {
df7492f9
KH
9403 enum coding_category category;
9404 Lisp_Object spec, attrs;
6b89e3aa 9405
df7492f9
KH
9406 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9407 attrs = AREF (spec, 0);
9408 category = XINT (CODING_ATTR_CATEGORY (attrs));
9409 if (changed[category])
9410 /* Ignore this coding system because a coding system of the
9411 same category already had a higher priority. */
9412 continue;
9413 changed[category] = 1;
9414 priorities[j++] = category;
9415 if (coding_categories[category].id >= 0
9416 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9417 setup_coding_system (args[i], &coding_categories[category]);
ff563fce 9418 Fset (AREF (Vcoding_category_table, category), args[i]);
df7492f9 9419 }
6b89e3aa 9420
df7492f9
KH
9421 /* Now we have decided top J priorities. Reflect the order of the
9422 original priorities to the remaining priorities. */
6b89e3aa 9423
df7492f9 9424 for (i = j, j = 0; i < coding_category_max; i++, j++)
6b89e3aa 9425 {
df7492f9
KH
9426 while (j < coding_category_max
9427 && changed[coding_priorities[j]])
9428 j++;
9429 if (j == coding_category_max)
1088b922 9430 emacs_abort ();
df7492f9
KH
9431 priorities[i] = coding_priorities[j];
9432 }
6b89e3aa 9433
72af86bd 9434 memcpy (coding_priorities, priorities, sizeof priorities);
177c0ea7 9435
ff563fce
KH
9436 /* Update `coding-category-list'. */
9437 Vcoding_category_list = Qnil;
c5101a77 9438 for (i = coding_category_max; i-- > 0; )
ff563fce
KH
9439 Vcoding_category_list
9440 = Fcons (AREF (Vcoding_category_table, priorities[i]),
9441 Vcoding_category_list);
6b89e3aa 9442
df7492f9 9443 return Qnil;
6b89e3aa
KH
9444}
9445
df7492f9
KH
9446DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9447 Scoding_system_priority_list, 0, 1, 0,
da7db224 9448 doc: /* Return a list of coding systems ordered by their priorities.
b811c52b
KH
9449The list contains a subset of coding systems; i.e. coding systems
9450assigned to each coding category (see `coding-category-list').
9451
da7db224 9452HIGHESTP non-nil means just return the highest priority one. */)
5842a27b 9453 (Lisp_Object highestp)
d46c5b12
KH
9454{
9455 int i;
df7492f9 9456 Lisp_Object val;
6b89e3aa 9457
df7492f9 9458 for (i = 0, val = Qnil; i < coding_category_max; i++)
d46c5b12 9459 {
df7492f9
KH
9460 enum coding_category category = coding_priorities[i];
9461 int id = coding_categories[category].id;
9462 Lisp_Object attrs;
068a9dbd 9463
df7492f9
KH
9464 if (id < 0)
9465 continue;
9466 attrs = CODING_ID_ATTRS (id);
9467 if (! NILP (highestp))
9468 return CODING_ATTR_BASE_NAME (attrs);
9469 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9470 }
9471 return Fnreverse (val);
9472}
068a9dbd 9473
91433552 9474static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
068a9dbd
KH
9475
9476static Lisp_Object
971de7fb 9477make_subsidiaries (Lisp_Object base)
068a9dbd 9478{
df7492f9 9479 Lisp_Object subsidiaries;
1bfdaf10 9480 ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
38182d90 9481 char *buf = alloca (base_name_len + 6);
df7492f9 9482 int i;
068a9dbd 9483
72af86bd 9484 memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
df7492f9
KH
9485 subsidiaries = Fmake_vector (make_number (3), Qnil);
9486 for (i = 0; i < 3; i++)
068a9dbd 9487 {
1bfdaf10 9488 strcpy (buf + base_name_len, suffixes[i]);
df7492f9 9489 ASET (subsidiaries, i, intern (buf));
068a9dbd 9490 }
df7492f9 9491 return subsidiaries;
068a9dbd
KH
9492}
9493
9494
df7492f9
KH
9495DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9496 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
1fcd6c8b
DL
9497 doc: /* For internal use only.
9498usage: (define-coding-system-internal ...) */)
f66c7cf8 9499 (ptrdiff_t nargs, Lisp_Object *args)
068a9dbd 9500{
df7492f9
KH
9501 Lisp_Object name;
9502 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
9503 Lisp_Object attrs; /* Vector of attributes. */
9504 Lisp_Object eol_type;
9505 Lisp_Object aliases;
9506 Lisp_Object coding_type, charset_list, safe_charsets;
9507 enum coding_category category;
9508 Lisp_Object tail, val;
9509 int max_charset_id = 0;
9510 int i;
068a9dbd 9511
df7492f9
KH
9512 if (nargs < coding_arg_max)
9513 goto short_args;
068a9dbd 9514
df7492f9 9515 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
068a9dbd 9516
df7492f9
KH
9517 name = args[coding_arg_name];
9518 CHECK_SYMBOL (name);
4939150c 9519 ASET (attrs, coding_attr_base_name, name);
068a9dbd 9520
df7492f9
KH
9521 val = args[coding_arg_mnemonic];
9522 if (! STRINGP (val))
9523 CHECK_CHARACTER (val);
4939150c 9524 ASET (attrs, coding_attr_mnemonic, val);
068a9dbd 9525
df7492f9
KH
9526 coding_type = args[coding_arg_coding_type];
9527 CHECK_SYMBOL (coding_type);
4939150c 9528 ASET (attrs, coding_attr_type, coding_type);
068a9dbd 9529
df7492f9
KH
9530 charset_list = args[coding_arg_charset_list];
9531 if (SYMBOLP (charset_list))
9532 {
9533 if (EQ (charset_list, Qiso_2022))
9534 {
9535 if (! EQ (coding_type, Qiso_2022))
9536 error ("Invalid charset-list");
9537 charset_list = Viso_2022_charset_list;
9538 }
9539 else if (EQ (charset_list, Qemacs_mule))
9540 {
9541 if (! EQ (coding_type, Qemacs_mule))
9542 error ("Invalid charset-list");
9543 charset_list = Vemacs_mule_charset_list;
9544 }
9545 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
d311d28c
PE
9546 {
9547 if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
9548 error ("Invalid charset-list");
9549 if (max_charset_id < XFASTINT (XCAR (tail)))
9550 max_charset_id = XFASTINT (XCAR (tail));
9551 }
df7492f9 9552 }
068a9dbd
KH
9553 else
9554 {
df7492f9 9555 charset_list = Fcopy_sequence (charset_list);
985773c9 9556 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
068a9dbd 9557 {
df7492f9
KH
9558 struct charset *charset;
9559
985773c9 9560 val = XCAR (tail);
df7492f9
KH
9561 CHECK_CHARSET_GET_CHARSET (val, charset);
9562 if (EQ (coding_type, Qiso_2022)
9563 ? CHARSET_ISO_FINAL (charset) < 0
9564 : EQ (coding_type, Qemacs_mule)
9565 ? CHARSET_EMACS_MULE_ID (charset) < 0
9566 : 0)
9567 error ("Can't handle charset `%s'",
8f924df7 9568 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
df7492f9 9569
8f924df7 9570 XSETCAR (tail, make_number (charset->id));
df7492f9
KH
9571 if (max_charset_id < charset->id)
9572 max_charset_id = charset->id;
068a9dbd
KH
9573 }
9574 }
4939150c 9575 ASET (attrs, coding_attr_charset_list, charset_list);
068a9dbd 9576
1b3b981b
AS
9577 safe_charsets = make_uninit_string (max_charset_id + 1);
9578 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
df7492f9 9579 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8f924df7 9580 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
4939150c 9581 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
068a9dbd 9582
4939150c 9583 ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
3a73fa5d 9584
df7492f9 9585 val = args[coding_arg_decode_translation_table];
a6f87d34 9586 if (! CHAR_TABLE_P (val) && ! CONSP (val))
7d64c6ad 9587 CHECK_SYMBOL (val);
4939150c 9588 ASET (attrs, coding_attr_decode_tbl, val);
3a73fa5d 9589
df7492f9 9590 val = args[coding_arg_encode_translation_table];
a6f87d34 9591 if (! CHAR_TABLE_P (val) && ! CONSP (val))
7d64c6ad 9592 CHECK_SYMBOL (val);
4939150c 9593 ASET (attrs, coding_attr_encode_tbl, val);
d46c5b12 9594
df7492f9
KH
9595 val = args[coding_arg_post_read_conversion];
9596 CHECK_SYMBOL (val);
4939150c 9597 ASET (attrs, coding_attr_post_read, val);
d46c5b12 9598
df7492f9
KH
9599 val = args[coding_arg_pre_write_conversion];
9600 CHECK_SYMBOL (val);
4939150c 9601 ASET (attrs, coding_attr_pre_write, val);
3a73fa5d 9602
df7492f9
KH
9603 val = args[coding_arg_default_char];
9604 if (NILP (val))
4939150c 9605 ASET (attrs, coding_attr_default_char, make_number (' '));
df7492f9
KH
9606 else
9607 {
8f924df7 9608 CHECK_CHARACTER (val);
4939150c 9609 ASET (attrs, coding_attr_default_char, val);
df7492f9 9610 }
4031e2bf 9611
8f924df7 9612 val = args[coding_arg_for_unibyte];
4939150c 9613 ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
3a73fa5d 9614
df7492f9
KH
9615 val = args[coding_arg_plist];
9616 CHECK_LIST (val);
4939150c 9617 ASET (attrs, coding_attr_plist, val);
3a73fa5d 9618
df7492f9
KH
9619 if (EQ (coding_type, Qcharset))
9620 {
c7c66a95
KH
9621 /* Generate a lisp vector of 256 elements. Each element is nil,
9622 integer, or a list of charset IDs.
3a73fa5d 9623
c7c66a95
KH
9624 If Nth element is nil, the byte code N is invalid in this
9625 coding system.
4ed46869 9626
c7c66a95
KH
9627 If Nth element is a number NUM, N is the first byte of a
9628 charset whose ID is NUM.
4ed46869 9629
c7c66a95
KH
9630 If Nth element is a list of charset IDs, N is the first byte
9631 of one of them. The list is sorted by dimensions of the
ad1746f5 9632 charsets. A charset of smaller dimension comes first. */
df7492f9 9633 val = Fmake_vector (make_number (256), Qnil);
4ed46869 9634
5c99c2e6 9635 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
df7492f9 9636 {
c7c66a95
KH
9637 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9638 int dim = CHARSET_DIMENSION (charset);
9639 int idx = (dim - 1) * 4;
4ed46869 9640
5c99c2e6 9641 if (CHARSET_ASCII_COMPATIBLE_P (charset))
4939150c 9642 ASET (attrs, coding_attr_ascii_compat, Qt);
4031e2bf 9643
15d143f7
KH
9644 for (i = charset->code_space[idx];
9645 i <= charset->code_space[idx + 1]; i++)
9646 {
c7c66a95
KH
9647 Lisp_Object tmp, tmp2;
9648 int dim2;
ec6d2bb8 9649
c7c66a95
KH
9650 tmp = AREF (val, i);
9651 if (NILP (tmp))
9652 tmp = XCAR (tail);
9653 else if (NUMBERP (tmp))
9654 {
9655 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9656 if (dim < dim2)
c7c66a95 9657 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
f9d71dcd
KH
9658 else
9659 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
c7c66a95 9660 }
15d143f7 9661 else
c7c66a95
KH
9662 {
9663 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9664 {
9665 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9666 if (dim < dim2)
9667 break;
9668 }
9669 if (NILP (tmp2))
9670 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9671 else
9672 {
9673 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9674 XSETCAR (tmp2, XCAR (tail));
9675 }
9676 }
9677 ASET (val, i, tmp);
15d143f7 9678 }
df7492f9
KH
9679 }
9680 ASET (attrs, coding_attr_charset_valids, val);
9681 category = coding_category_charset;
9682 }
9683 else if (EQ (coding_type, Qccl))
9684 {
9685 Lisp_Object valids;
ecec61c1 9686
df7492f9
KH
9687 if (nargs < coding_arg_ccl_max)
9688 goto short_args;
ecec61c1 9689
df7492f9
KH
9690 val = args[coding_arg_ccl_decoder];
9691 CHECK_CCL_PROGRAM (val);
9692 if (VECTORP (val))
9693 val = Fcopy_sequence (val);
9694 ASET (attrs, coding_attr_ccl_decoder, val);
ecec61c1 9695
df7492f9
KH
9696 val = args[coding_arg_ccl_encoder];
9697 CHECK_CCL_PROGRAM (val);
9698 if (VECTORP (val))
9699 val = Fcopy_sequence (val);
9700 ASET (attrs, coding_attr_ccl_encoder, val);
ecec61c1 9701
df7492f9
KH
9702 val = args[coding_arg_ccl_valids];
9703 valids = Fmake_string (make_number (256), make_number (0));
7d7bbefd 9704 for (tail = val; CONSP (tail); tail = XCDR (tail))
df7492f9 9705 {
8dcbea82 9706 int from, to;
ecec61c1 9707
34348bd4 9708 val = XCAR (tail);
df7492f9 9709 if (INTEGERP (val))
8dcbea82 9710 {
d311d28c 9711 if (! (0 <= XINT (val) && XINT (val) <= 255))
8dcbea82 9712 args_out_of_range_3 (val, make_number (0), make_number (255));
d311d28c 9713 from = to = XINT (val);
8dcbea82 9714 }
df7492f9
KH
9715 else
9716 {
df7492f9 9717 CHECK_CONS (val);
8f924df7 9718 CHECK_NATNUM_CAR (val);
d311d28c
PE
9719 CHECK_NUMBER_CDR (val);
9720 if (XINT (XCAR (val)) > 255)
8dcbea82
KH
9721 args_out_of_range_3 (XCAR (val),
9722 make_number (0), make_number (255));
d311d28c
PE
9723 from = XINT (XCAR (val));
9724 if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
8dcbea82
KH
9725 args_out_of_range_3 (XCDR (val),
9726 XCAR (val), make_number (255));
d311d28c 9727 to = XINT (XCDR (val));
df7492f9 9728 }
8dcbea82 9729 for (i = from; i <= to; i++)
8f924df7 9730 SSET (valids, i, 1);
df7492f9
KH
9731 }
9732 ASET (attrs, coding_attr_ccl_valids, valids);
4ed46869 9733
df7492f9 9734 category = coding_category_ccl;
55ab7be3 9735 }
df7492f9 9736 else if (EQ (coding_type, Qutf_16))
55ab7be3 9737 {
df7492f9 9738 Lisp_Object bom, endian;
4ed46869 9739
4939150c 9740 ASET (attrs, coding_attr_ascii_compat, Qnil);
4ed46869 9741
df7492f9
KH
9742 if (nargs < coding_arg_utf16_max)
9743 goto short_args;
4ed46869 9744
df7492f9
KH
9745 bom = args[coding_arg_utf16_bom];
9746 if (! NILP (bom) && ! EQ (bom, Qt))
9747 {
9748 CHECK_CONS (bom);
8f924df7
KH
9749 val = XCAR (bom);
9750 CHECK_CODING_SYSTEM (val);
9751 val = XCDR (bom);
9752 CHECK_CODING_SYSTEM (val);
df7492f9 9753 }
a470d443 9754 ASET (attrs, coding_attr_utf_bom, bom);
df7492f9
KH
9755
9756 endian = args[coding_arg_utf16_endian];
b49a1807
KH
9757 CHECK_SYMBOL (endian);
9758 if (NILP (endian))
9759 endian = Qbig;
9760 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8f924df7 9761 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
df7492f9
KH
9762 ASET (attrs, coding_attr_utf_16_endian, endian);
9763
9764 category = (CONSP (bom)
9765 ? coding_category_utf_16_auto
9766 : NILP (bom)
b49a1807 9767 ? (EQ (endian, Qbig)
df7492f9
KH
9768 ? coding_category_utf_16_be_nosig
9769 : coding_category_utf_16_le_nosig)
b49a1807 9770 : (EQ (endian, Qbig)
df7492f9
KH
9771 ? coding_category_utf_16_be
9772 : coding_category_utf_16_le));
9773 }
9774 else if (EQ (coding_type, Qiso_2022))
9775 {
9776 Lisp_Object initial, reg_usage, request, flags;
1397dc18 9777
df7492f9
KH
9778 if (nargs < coding_arg_iso2022_max)
9779 goto short_args;
9780
9781 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9782 CHECK_VECTOR (initial);
9783 for (i = 0; i < 4; i++)
9784 {
9785 val = Faref (initial, make_number (i));
9786 if (! NILP (val))
9787 {
584948ac
KH
9788 struct charset *charset;
9789
9790 CHECK_CHARSET_GET_CHARSET (val, charset);
9791 ASET (initial, i, make_number (CHARSET_ID (charset)));
9792 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
4939150c 9793 ASET (attrs, coding_attr_ascii_compat, Qt);
df7492f9
KH
9794 }
9795 else
9796 ASET (initial, i, make_number (-1));
9797 }
9798
9799 reg_usage = args[coding_arg_iso2022_reg_usage];
9800 CHECK_CONS (reg_usage);
8f924df7
KH
9801 CHECK_NUMBER_CAR (reg_usage);
9802 CHECK_NUMBER_CDR (reg_usage);
df7492f9
KH
9803
9804 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
7d7bbefd 9805 for (tail = request; CONSP (tail); tail = XCDR (tail))
1397dc18 9806 {
df7492f9 9807 int id;
2735d060 9808 Lisp_Object tmp1;
df7492f9 9809
34348bd4 9810 val = XCAR (tail);
df7492f9 9811 CHECK_CONS (val);
2735d060
PE
9812 tmp1 = XCAR (val);
9813 CHECK_CHARSET_GET_ID (tmp1, id);
8f924df7 9814 CHECK_NATNUM_CDR (val);
df7492f9 9815 if (XINT (XCDR (val)) >= 4)
c2982e87 9816 error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
8f924df7 9817 XSETCAR (val, make_number (id));
1397dc18 9818 }
4ed46869 9819
df7492f9
KH
9820 flags = args[coding_arg_iso2022_flags];
9821 CHECK_NATNUM (flags);
d311d28c 9822 i = XINT (flags) & INT_MAX;
df7492f9 9823 if (EQ (args[coding_arg_charset_list], Qiso_2022))
d311d28c
PE
9824 i |= CODING_ISO_FLAG_FULL_SUPPORT;
9825 flags = make_number (i);
df7492f9
KH
9826
9827 ASET (attrs, coding_attr_iso_initial, initial);
9828 ASET (attrs, coding_attr_iso_usage, reg_usage);
9829 ASET (attrs, coding_attr_iso_request, request);
9830 ASET (attrs, coding_attr_iso_flags, flags);
9831 setup_iso_safe_charsets (attrs);
9832
9833 if (i & CODING_ISO_FLAG_SEVEN_BITS)
9834 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9835 | CODING_ISO_FLAG_SINGLE_SHIFT))
9836 ? coding_category_iso_7_else
9837 : EQ (args[coding_arg_charset_list], Qiso_2022)
9838 ? coding_category_iso_7
9839 : coding_category_iso_7_tight);
9840 else
9841 {
9842 int id = XINT (AREF (initial, 1));
9843
c6fb6e98 9844 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
df7492f9
KH
9845 || EQ (args[coding_arg_charset_list], Qiso_2022)
9846 || id < 0)
9847 ? coding_category_iso_8_else
9848 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9849 ? coding_category_iso_8_1
9850 : coding_category_iso_8_2);
9851 }
0ce7886f
KH
9852 if (category != coding_category_iso_8_1
9853 && category != coding_category_iso_8_2)
4939150c 9854 ASET (attrs, coding_attr_ascii_compat, Qnil);
df7492f9
KH
9855 }
9856 else if (EQ (coding_type, Qemacs_mule))
c28a9453 9857 {
df7492f9
KH
9858 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9859 ASET (attrs, coding_attr_emacs_mule_full, Qt);
4939150c 9860 ASET (attrs, coding_attr_ascii_compat, Qt);
df7492f9 9861 category = coding_category_emacs_mule;
c28a9453 9862 }
df7492f9 9863 else if (EQ (coding_type, Qshift_jis))
c28a9453 9864 {
df7492f9
KH
9865
9866 struct charset *charset;
9867
7d64c6ad 9868 if (XINT (Flength (charset_list)) != 3
6e07c25f 9869 && XINT (Flength (charset_list)) != 4)
7d64c6ad 9870 error ("There should be three or four charsets");
df7492f9
KH
9871
9872 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9873 if (CHARSET_DIMENSION (charset) != 1)
9874 error ("Dimension of charset %s is not one",
8f924df7 9875 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
584948ac 9876 if (CHARSET_ASCII_COMPATIBLE_P (charset))
4939150c 9877 ASET (attrs, coding_attr_ascii_compat, Qt);
df7492f9
KH
9878
9879 charset_list = XCDR (charset_list);
9880 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9881 if (CHARSET_DIMENSION (charset) != 1)
9882 error ("Dimension of charset %s is not one",
8f924df7 9883 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
df7492f9
KH
9884
9885 charset_list = XCDR (charset_list);
9886 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9887 if (CHARSET_DIMENSION (charset) != 2)
7d64c6ad
KH
9888 error ("Dimension of charset %s is not two",
9889 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9890
9891 charset_list = XCDR (charset_list);
2b917a06
KH
9892 if (! NILP (charset_list))
9893 {
9894 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9895 if (CHARSET_DIMENSION (charset) != 2)
9896 error ("Dimension of charset %s is not two",
9897 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9898 }
df7492f9
KH
9899
9900 category = coding_category_sjis;
9901 Vsjis_coding_system = name;
c28a9453 9902 }
df7492f9
KH
9903 else if (EQ (coding_type, Qbig5))
9904 {
9905 struct charset *charset;
4ed46869 9906
df7492f9
KH
9907 if (XINT (Flength (charset_list)) != 2)
9908 error ("There should be just two charsets");
9909
9910 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9911 if (CHARSET_DIMENSION (charset) != 1)
9912 error ("Dimension of charset %s is not one",
8f924df7 9913 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
584948ac 9914 if (CHARSET_ASCII_COMPATIBLE_P (charset))
4939150c 9915 ASET (attrs, coding_attr_ascii_compat, Qt);
df7492f9
KH
9916
9917 charset_list = XCDR (charset_list);
9918 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9919 if (CHARSET_DIMENSION (charset) != 2)
9920 error ("Dimension of charset %s is not two",
8f924df7 9921 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
4ed46869 9922
df7492f9
KH
9923 category = coding_category_big5;
9924 Vbig5_coding_system = name;
9925 }
9926 else if (EQ (coding_type, Qraw_text))
c28a9453 9927 {
584948ac 9928 category = coding_category_raw_text;
4939150c 9929 ASET (attrs, coding_attr_ascii_compat, Qt);
c28a9453 9930 }
df7492f9 9931 else if (EQ (coding_type, Qutf_8))
4ed46869 9932 {
a470d443
KH
9933 Lisp_Object bom;
9934
a470d443
KH
9935 if (nargs < coding_arg_utf8_max)
9936 goto short_args;
9937
9938 bom = args[coding_arg_utf8_bom];
9939 if (! NILP (bom) && ! EQ (bom, Qt))
9940 {
9941 CHECK_CONS (bom);
9942 val = XCAR (bom);
9943 CHECK_CODING_SYSTEM (val);
9944 val = XCDR (bom);
9945 CHECK_CODING_SYSTEM (val);
9946 }
9947 ASET (attrs, coding_attr_utf_bom, bom);
0e5317f7 9948 if (NILP (bom))
4939150c 9949 ASET (attrs, coding_attr_ascii_compat, Qt);
a470d443
KH
9950
9951 category = (CONSP (bom) ? coding_category_utf_8_auto
9952 : NILP (bom) ? coding_category_utf_8_nosig
9953 : coding_category_utf_8_sig);
4ed46869 9954 }
df7492f9
KH
9955 else if (EQ (coding_type, Qundecided))
9956 category = coding_category_undecided;
4ed46869 9957 else
df7492f9 9958 error ("Invalid coding system type: %s",
8f924df7 9959 SDATA (SYMBOL_NAME (coding_type)));
4ed46869 9960
4939150c
PE
9961 ASET (attrs, coding_attr_category, make_number (category));
9962 ASET (attrs, coding_attr_plist,
9963 Fcons (QCcategory,
9964 Fcons (AREF (Vcoding_category_table, category),
9965 CODING_ATTR_PLIST (attrs))));
9966 ASET (attrs, coding_attr_plist,
9967 Fcons (QCascii_compatible_p,
9968 Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9969 CODING_ATTR_PLIST (attrs))));
c4825358 9970
df7492f9
KH
9971 eol_type = args[coding_arg_eol_type];
9972 if (! NILP (eol_type)
9973 && ! EQ (eol_type, Qunix)
9974 && ! EQ (eol_type, Qdos)
9975 && ! EQ (eol_type, Qmac))
9976 error ("Invalid eol-type");
4ed46869 9977
df7492f9 9978 aliases = Fcons (name, Qnil);
4ed46869 9979
df7492f9
KH
9980 if (NILP (eol_type))
9981 {
9982 eol_type = make_subsidiaries (name);
9983 for (i = 0; i < 3; i++)
1397dc18 9984 {
df7492f9
KH
9985 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
9986
9987 this_name = AREF (eol_type, i);
9988 this_aliases = Fcons (this_name, Qnil);
9989 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
9990 this_spec = Fmake_vector (make_number (3), attrs);
9991 ASET (this_spec, 1, this_aliases);
9992 ASET (this_spec, 2, this_eol_type);
9993 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
9994 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
583f71ca
KH
9995 val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
9996 if (NILP (val))
9997 Vcoding_system_alist
9998 = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
9999 Vcoding_system_alist);
1397dc18 10000 }
d46c5b12 10001 }
4ed46869 10002
df7492f9
KH
10003 spec_vec = Fmake_vector (make_number (3), attrs);
10004 ASET (spec_vec, 1, aliases);
10005 ASET (spec_vec, 2, eol_type);
48b0f3ae 10006
df7492f9
KH
10007 Fputhash (name, spec_vec, Vcoding_system_hash_table);
10008 Vcoding_system_list = Fcons (name, Vcoding_system_list);
583f71ca
KH
10009 val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10010 if (NILP (val))
10011 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10012 Vcoding_system_alist);
48b0f3ae 10013
df7492f9
KH
10014 {
10015 int id = coding_categories[category].id;
48b0f3ae 10016
df7492f9
KH
10017 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10018 setup_coding_system (name, &coding_categories[category]);
10019 }
48b0f3ae 10020
d46c5b12 10021 return Qnil;
48b0f3ae 10022
df7492f9
KH
10023 short_args:
10024 return Fsignal (Qwrong_number_of_arguments,
10025 Fcons (intern ("define-coding-system-internal"),
10026 make_number (nargs)));
d46c5b12 10027}
4ed46869 10028
d6925f38 10029
a6f87d34
KH
10030DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10031 3, 3, 0,
10032 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
5842a27b 10033 (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
a6f87d34 10034{
3dbe7859 10035 Lisp_Object spec, attrs;
a6f87d34
KH
10036
10037 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10038 attrs = AREF (spec, 0);
10039 if (EQ (prop, QCmnemonic))
10040 {
10041 if (! STRINGP (val))
10042 CHECK_CHARACTER (val);
4939150c 10043 ASET (attrs, coding_attr_mnemonic, val);
a6f87d34 10044 }
2133e2d1 10045 else if (EQ (prop, QCdefault_char))
a6f87d34
KH
10046 {
10047 if (NILP (val))
10048 val = make_number (' ');
10049 else
10050 CHECK_CHARACTER (val);
4939150c 10051 ASET (attrs, coding_attr_default_char, val);
a6f87d34
KH
10052 }
10053 else if (EQ (prop, QCdecode_translation_table))
10054 {
10055 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10056 CHECK_SYMBOL (val);
4939150c 10057 ASET (attrs, coding_attr_decode_tbl, val);
a6f87d34
KH
10058 }
10059 else if (EQ (prop, QCencode_translation_table))
10060 {
10061 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10062 CHECK_SYMBOL (val);
4939150c 10063 ASET (attrs, coding_attr_encode_tbl, val);
a6f87d34
KH
10064 }
10065 else if (EQ (prop, QCpost_read_conversion))
10066 {
10067 CHECK_SYMBOL (val);
4939150c 10068 ASET (attrs, coding_attr_post_read, val);
a6f87d34
KH
10069 }
10070 else if (EQ (prop, QCpre_write_conversion))
10071 {
10072 CHECK_SYMBOL (val);
4939150c 10073 ASET (attrs, coding_attr_pre_write, val);
a6f87d34 10074 }
35befdaa
KH
10075 else if (EQ (prop, QCascii_compatible_p))
10076 {
4939150c 10077 ASET (attrs, coding_attr_ascii_compat, val);
35befdaa 10078 }
a6f87d34 10079
4939150c
PE
10080 ASET (attrs, coding_attr_plist,
10081 Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
a6f87d34
KH
10082 return val;
10083}
10084
10085
df7492f9
KH
10086DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10087 Sdefine_coding_system_alias, 2, 2, 0,
10088 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
5842a27b 10089 (Lisp_Object alias, Lisp_Object coding_system)
66cfb530 10090{
583f71ca 10091 Lisp_Object spec, aliases, eol_type, val;
4ed46869 10092
df7492f9
KH
10093 CHECK_SYMBOL (alias);
10094 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10095 aliases = AREF (spec, 1);
d4a1d553 10096 /* ALIASES should be a list of length more than zero, and the first
d6925f38
KH
10097 element is a base coding system. Append ALIAS at the tail of the
10098 list. */
df7492f9
KH
10099 while (!NILP (XCDR (aliases)))
10100 aliases = XCDR (aliases);
8f924df7 10101 XSETCDR (aliases, Fcons (alias, Qnil));
4ed46869 10102
df7492f9
KH
10103 eol_type = AREF (spec, 2);
10104 if (VECTORP (eol_type))
4ed46869 10105 {
df7492f9
KH
10106 Lisp_Object subsidiaries;
10107 int i;
4ed46869 10108
df7492f9
KH
10109 subsidiaries = make_subsidiaries (alias);
10110 for (i = 0; i < 3; i++)
10111 Fdefine_coding_system_alias (AREF (subsidiaries, i),
10112 AREF (eol_type, i));
4ed46869 10113 }
df7492f9
KH
10114
10115 Fputhash (alias, spec, Vcoding_system_hash_table);
d6925f38 10116 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
583f71ca
KH
10117 val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10118 if (NILP (val))
10119 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10120 Vcoding_system_alist);
66cfb530 10121
4ed46869
KH
10122 return Qnil;
10123}
10124
a7ca3326 10125DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
df7492f9
KH
10126 1, 1, 0,
10127 doc: /* Return the base of CODING-SYSTEM.
da7db224 10128Any alias or subsidiary coding system is not a base coding system. */)
5842a27b 10129 (Lisp_Object coding_system)
d46c5b12 10130{
df7492f9 10131 Lisp_Object spec, attrs;
d46c5b12 10132
df7492f9
KH
10133 if (NILP (coding_system))
10134 return (Qno_conversion);
10135 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10136 attrs = AREF (spec, 0);
10137 return CODING_ATTR_BASE_NAME (attrs);
10138}
1397dc18 10139
df7492f9
KH
10140DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10141 1, 1, 0,
10142 doc: "Return the property list of CODING-SYSTEM.")
5842a27b 10143 (Lisp_Object coding_system)
df7492f9
KH
10144{
10145 Lisp_Object spec, attrs;
1397dc18 10146
df7492f9
KH
10147 if (NILP (coding_system))
10148 coding_system = Qno_conversion;
10149 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10150 attrs = AREF (spec, 0);
10151 return CODING_ATTR_PLIST (attrs);
d46c5b12
KH
10152}
10153
df7492f9
KH
10154
10155DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10156 1, 1, 0,
da7db224 10157 doc: /* Return the list of aliases of CODING-SYSTEM. */)
5842a27b 10158 (Lisp_Object coding_system)
66cfb530 10159{
df7492f9 10160 Lisp_Object spec;
84d60297 10161
df7492f9
KH
10162 if (NILP (coding_system))
10163 coding_system = Qno_conversion;
10164 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
da7db224 10165 return AREF (spec, 1);
df7492f9 10166}
66cfb530 10167
a7ca3326 10168DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
df7492f9
KH
10169 Scoding_system_eol_type, 1, 1, 0,
10170 doc: /* Return eol-type of CODING-SYSTEM.
d4a1d553 10171An eol-type is an integer 0, 1, 2, or a vector of coding systems.
66cfb530 10172
df7492f9
KH
10173Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10174and CR respectively.
66cfb530 10175
df7492f9
KH
10176A vector value indicates that a format of end-of-line should be
10177detected automatically. Nth element of the vector is the subsidiary
10178coding system whose eol-type is N. */)
5842a27b 10179 (Lisp_Object coding_system)
6b89e3aa 10180{
df7492f9
KH
10181 Lisp_Object spec, eol_type;
10182 int n;
6b89e3aa 10183
df7492f9
KH
10184 if (NILP (coding_system))
10185 coding_system = Qno_conversion;
10186 if (! CODING_SYSTEM_P (coding_system))
10187 return Qnil;
10188 spec = CODING_SYSTEM_SPEC (coding_system);
10189 eol_type = AREF (spec, 2);
10190 if (VECTORP (eol_type))
10191 return Fcopy_sequence (eol_type);
10192 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10193 return make_number (n);
6b89e3aa
KH
10194}
10195
4ed46869
KH
10196#endif /* emacs */
10197
10198\f
1397dc18 10199/*** 12. Postamble ***/
4ed46869 10200
dfcf069d 10201void
971de7fb 10202init_coding_once (void)
4ed46869
KH
10203{
10204 int i;
10205
df7492f9
KH
10206 for (i = 0; i < coding_category_max; i++)
10207 {
10208 coding_categories[i].id = -1;
10209 coding_priorities[i] = i;
10210 }
4ed46869
KH
10211
10212 /* ISO2022 specific initialize routine. */
10213 for (i = 0; i < 0x20; i++)
b73bfc1c 10214 iso_code_class[i] = ISO_control_0;
4ed46869
KH
10215 for (i = 0x21; i < 0x7F; i++)
10216 iso_code_class[i] = ISO_graphic_plane_0;
10217 for (i = 0x80; i < 0xA0; i++)
b73bfc1c 10218 iso_code_class[i] = ISO_control_1;
4ed46869
KH
10219 for (i = 0xA1; i < 0xFF; i++)
10220 iso_code_class[i] = ISO_graphic_plane_1;
10221 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10222 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
4ed46869
KH
10223 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10224 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10225 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10226 iso_code_class[ISO_CODE_ESC] = ISO_escape;
10227 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10228 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10229 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10230
df7492f9
KH
10231 for (i = 0; i < 256; i++)
10232 {
10233 emacs_mule_bytes[i] = 1;
10234 }
7c78e542
KH
10235 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10236 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10237 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10238 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
e0e989f6
KH
10239}
10240
10241#ifdef emacs
10242
dfcf069d 10243void
971de7fb 10244syms_of_coding (void)
e0e989f6 10245{
df7492f9 10246 staticpro (&Vcoding_system_hash_table);
8f924df7
KH
10247 {
10248 Lisp_Object args[2];
10249 args[0] = QCtest;
10250 args[1] = Qeq;
10251 Vcoding_system_hash_table = Fmake_hash_table (2, args);
10252 }
df7492f9
KH
10253
10254 staticpro (&Vsjis_coding_system);
10255 Vsjis_coding_system = Qnil;
e0e989f6 10256
df7492f9
KH
10257 staticpro (&Vbig5_coding_system);
10258 Vbig5_coding_system = Qnil;
10259
24a73b0a
KH
10260 staticpro (&Vcode_conversion_reused_workbuf);
10261 Vcode_conversion_reused_workbuf = Qnil;
10262
10263 staticpro (&Vcode_conversion_workbuf_name);
2a0213a6 10264 Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
e0e989f6 10265
24a73b0a 10266 reused_workbuf_in_use = 0;
df7492f9
KH
10267
10268 DEFSYM (Qcharset, "charset");
10269 DEFSYM (Qtarget_idx, "target-idx");
10270 DEFSYM (Qcoding_system_history, "coding-system-history");
bb0115a2
RS
10271 Fset (Qcoding_system_history, Qnil);
10272
9ce27fde 10273 /* Target FILENAME is the first argument. */
e0e989f6 10274 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 10275 /* Target FILENAME is the third argument. */
e0e989f6
KH
10276 Fput (Qwrite_region, Qtarget_idx, make_number (2));
10277
df7492f9 10278 DEFSYM (Qcall_process, "call-process");
9ce27fde 10279 /* Target PROGRAM is the first argument. */
e0e989f6
KH
10280 Fput (Qcall_process, Qtarget_idx, make_number (0));
10281
df7492f9 10282 DEFSYM (Qcall_process_region, "call-process-region");
9ce27fde 10283 /* Target PROGRAM is the third argument. */
e0e989f6
KH
10284 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10285
df7492f9 10286 DEFSYM (Qstart_process, "start-process");
9ce27fde 10287 /* Target PROGRAM is the third argument. */
e0e989f6
KH
10288 Fput (Qstart_process, Qtarget_idx, make_number (2));
10289
df7492f9 10290 DEFSYM (Qopen_network_stream, "open-network-stream");
9ce27fde 10291 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
10292 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10293
df7492f9
KH
10294 DEFSYM (Qcoding_system, "coding-system");
10295 DEFSYM (Qcoding_aliases, "coding-aliases");
4ed46869 10296
df7492f9
KH
10297 DEFSYM (Qeol_type, "eol-type");
10298 DEFSYM (Qunix, "unix");
10299 DEFSYM (Qdos, "dos");
4b298d5a 10300 DEFSYM (Qmac, "mac");
4ed46869 10301
df7492f9
KH
10302 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10303 DEFSYM (Qpost_read_conversion, "post-read-conversion");
10304 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10305 DEFSYM (Qdefault_char, "default-char");
10306 DEFSYM (Qundecided, "undecided");
10307 DEFSYM (Qno_conversion, "no-conversion");
10308 DEFSYM (Qraw_text, "raw-text");
4ed46869 10309
df7492f9 10310 DEFSYM (Qiso_2022, "iso-2022");
4ed46869 10311
df7492f9 10312 DEFSYM (Qutf_8, "utf-8");
8f924df7 10313 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
27901516 10314
7f590b0c 10315#if defined (WINDOWSNT) || defined (CYGWIN)
ba116008
DC
10316 /* No, not utf-16-le: that one has a BOM. */
10317 DEFSYM (Qutf_16le, "utf-16le");
10318#endif
10319
df7492f9 10320 DEFSYM (Qutf_16, "utf-16");
df7492f9
KH
10321 DEFSYM (Qbig, "big");
10322 DEFSYM (Qlittle, "little");
27901516 10323
df7492f9
KH
10324 DEFSYM (Qshift_jis, "shift-jis");
10325 DEFSYM (Qbig5, "big5");
4ed46869 10326
df7492f9 10327 DEFSYM (Qcoding_system_p, "coding-system-p");
4ed46869 10328
df7492f9 10329 DEFSYM (Qcoding_system_error, "coding-system-error");
4ed46869 10330 Fput (Qcoding_system_error, Qerror_conditions,
3438fe21 10331 listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
4ed46869 10332 Fput (Qcoding_system_error, Qerror_message,
2a0213a6 10333 build_pure_c_string ("Invalid coding system"));
4ed46869 10334
05e6f5dc
KH
10335 /* Intern this now in case it isn't already done.
10336 Setting this variable twice is harmless.
10337 But don't staticpro it here--that is done in alloc.c. */
d67b4f80 10338 Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
70c22245 10339
df7492f9 10340 DEFSYM (Qtranslation_table, "translation-table");
433f7f87 10341 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
df7492f9
KH
10342 DEFSYM (Qtranslation_table_id, "translation-table-id");
10343 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10344 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
1397dc18 10345
df7492f9 10346 DEFSYM (Qvalid_codes, "valid-codes");
9ce27fde 10347
df7492f9 10348 DEFSYM (Qemacs_mule, "emacs-mule");
d46c5b12 10349
01378f49 10350 DEFSYM (QCcategory, ":category");
a6f87d34 10351 DEFSYM (QCmnemonic, ":mnemonic");
2133e2d1 10352 DEFSYM (QCdefault_char, ":default-char");
a6f87d34
KH
10353 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10354 DEFSYM (QCencode_translation_table, ":encode-translation-table");
10355 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10356 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
35befdaa 10357 DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
01378f49 10358
df7492f9
KH
10359 Vcoding_category_table
10360 = Fmake_vector (make_number (coding_category_max), Qnil);
10361 staticpro (&Vcoding_category_table);
10362 /* Followings are target of code detection. */
10363 ASET (Vcoding_category_table, coding_category_iso_7,
d67b4f80 10364 intern_c_string ("coding-category-iso-7"));
df7492f9 10365 ASET (Vcoding_category_table, coding_category_iso_7_tight,
d67b4f80 10366 intern_c_string ("coding-category-iso-7-tight"));
df7492f9 10367 ASET (Vcoding_category_table, coding_category_iso_8_1,
d67b4f80 10368 intern_c_string ("coding-category-iso-8-1"));
df7492f9 10369 ASET (Vcoding_category_table, coding_category_iso_8_2,
d67b4f80 10370 intern_c_string ("coding-category-iso-8-2"));
df7492f9 10371 ASET (Vcoding_category_table, coding_category_iso_7_else,
d67b4f80 10372 intern_c_string ("coding-category-iso-7-else"));
df7492f9 10373 ASET (Vcoding_category_table, coding_category_iso_8_else,
d67b4f80 10374 intern_c_string ("coding-category-iso-8-else"));
a470d443 10375 ASET (Vcoding_category_table, coding_category_utf_8_auto,
d67b4f80 10376 intern_c_string ("coding-category-utf-8-auto"));
a470d443 10377 ASET (Vcoding_category_table, coding_category_utf_8_nosig,
d67b4f80 10378 intern_c_string ("coding-category-utf-8"));
a470d443 10379 ASET (Vcoding_category_table, coding_category_utf_8_sig,
d67b4f80 10380 intern_c_string ("coding-category-utf-8-sig"));
df7492f9 10381 ASET (Vcoding_category_table, coding_category_utf_16_be,
d67b4f80 10382 intern_c_string ("coding-category-utf-16-be"));
ff563fce 10383 ASET (Vcoding_category_table, coding_category_utf_16_auto,
d67b4f80 10384 intern_c_string ("coding-category-utf-16-auto"));
df7492f9 10385 ASET (Vcoding_category_table, coding_category_utf_16_le,
d67b4f80 10386 intern_c_string ("coding-category-utf-16-le"));
df7492f9 10387 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
d67b4f80 10388 intern_c_string ("coding-category-utf-16-be-nosig"));
df7492f9 10389 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
d67b4f80 10390 intern_c_string ("coding-category-utf-16-le-nosig"));
df7492f9 10391 ASET (Vcoding_category_table, coding_category_charset,
d67b4f80 10392 intern_c_string ("coding-category-charset"));
df7492f9 10393 ASET (Vcoding_category_table, coding_category_sjis,
d67b4f80 10394 intern_c_string ("coding-category-sjis"));
df7492f9 10395 ASET (Vcoding_category_table, coding_category_big5,
d67b4f80 10396 intern_c_string ("coding-category-big5"));
df7492f9 10397 ASET (Vcoding_category_table, coding_category_ccl,
d67b4f80 10398 intern_c_string ("coding-category-ccl"));
df7492f9 10399 ASET (Vcoding_category_table, coding_category_emacs_mule,
d67b4f80 10400 intern_c_string ("coding-category-emacs-mule"));
df7492f9
KH
10401 /* Followings are NOT target of code detection. */
10402 ASET (Vcoding_category_table, coding_category_raw_text,
d67b4f80 10403 intern_c_string ("coding-category-raw-text"));
df7492f9 10404 ASET (Vcoding_category_table, coding_category_undecided,
d67b4f80 10405 intern_c_string ("coding-category-undecided"));
ecf488bc 10406
065e3595
KH
10407 DEFSYM (Qinsufficient_source, "insufficient-source");
10408 DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10409 DEFSYM (Qinvalid_source, "invalid-source");
10410 DEFSYM (Qinterrupted, "interrupted");
10411 DEFSYM (Qinsufficient_memory, "insufficient-memory");
44e8490d 10412 DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
065e3595 10413
4ed46869
KH
10414 defsubr (&Scoding_system_p);
10415 defsubr (&Sread_coding_system);
10416 defsubr (&Sread_non_nil_coding_system);
10417 defsubr (&Scheck_coding_system);
10418 defsubr (&Sdetect_coding_region);
d46c5b12 10419 defsubr (&Sdetect_coding_string);
05e6f5dc 10420 defsubr (&Sfind_coding_systems_region_internal);
068a9dbd 10421 defsubr (&Sunencodable_char_position);
df7492f9 10422 defsubr (&Scheck_coding_systems_region);
4ed46869
KH
10423 defsubr (&Sdecode_coding_region);
10424 defsubr (&Sencode_coding_region);
10425 defsubr (&Sdecode_coding_string);
10426 defsubr (&Sencode_coding_string);
10427 defsubr (&Sdecode_sjis_char);
10428 defsubr (&Sencode_sjis_char);
10429 defsubr (&Sdecode_big5_char);
10430 defsubr (&Sencode_big5_char);
1ba9e4ab 10431 defsubr (&Sset_terminal_coding_system_internal);
c4825358 10432 defsubr (&Sset_safe_terminal_coding_system_internal);
4ed46869 10433 defsubr (&Sterminal_coding_system);
1ba9e4ab 10434 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 10435 defsubr (&Skeyboard_coding_system);
a5d301df 10436 defsubr (&Sfind_operation_coding_system);
df7492f9 10437 defsubr (&Sset_coding_system_priority);
6b89e3aa 10438 defsubr (&Sdefine_coding_system_internal);
df7492f9 10439 defsubr (&Sdefine_coding_system_alias);
a6f87d34 10440 defsubr (&Scoding_system_put);
df7492f9
KH
10441 defsubr (&Scoding_system_base);
10442 defsubr (&Scoding_system_plist);
10443 defsubr (&Scoding_system_aliases);
10444 defsubr (&Scoding_system_eol_type);
10445 defsubr (&Scoding_system_priority_list);
4ed46869 10446
29208e82 10447 DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
48b0f3ae
PJ
10448 doc: /* List of coding systems.
10449
10450Do not alter the value of this variable manually. This variable should be
df7492f9 10451updated by the functions `define-coding-system' and
48b0f3ae 10452`define-coding-system-alias'. */);
4608c386
KH
10453 Vcoding_system_list = Qnil;
10454
29208e82 10455 DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
48b0f3ae
PJ
10456 doc: /* Alist of coding system names.
10457Each element is a one-element list containing a coding system name.
446dcd75 10458This variable is given to `completing-read' as COLLECTION argument.
48b0f3ae
PJ
10459
10460Do not alter the value of this variable manually. This variable should be
10461updated by the functions `define-coding-system' and
10462`define-coding-system-alias'. */);
4608c386
KH
10463 Vcoding_system_alist = Qnil;
10464
29208e82 10465 DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
48b0f3ae
PJ
10466 doc: /* List of coding-categories (symbols) ordered by priority.
10467
10468On detecting a coding system, Emacs tries code detection algorithms
10469associated with each coding-category one by one in this order. When
10470one algorithm agrees with a byte sequence of source text, the coding
0ec31faf
KH
10471system bound to the corresponding coding-category is selected.
10472
448e17d6 10473Don't modify this variable directly, but use `set-coding-system-priority'. */);
4ed46869
KH
10474 {
10475 int i;
10476
10477 Vcoding_category_list = Qnil;
df7492f9 10478 for (i = coding_category_max - 1; i >= 0; i--)
4ed46869 10479 Vcoding_category_list
28be1ada 10480 = Fcons (AREF (Vcoding_category_table, i),
d46c5b12 10481 Vcoding_category_list);
4ed46869
KH
10482 }
10483
29208e82 10484 DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
48b0f3ae
PJ
10485 doc: /* Specify the coding system for read operations.
10486It is useful to bind this variable with `let', but do not set it globally.
10487If the value is a coding system, it is used for decoding on read operation.
446dcd75
JB
10488If not, an appropriate element is used from one of the coding system alists.
10489There are three such tables: `file-coding-system-alist',
48b0f3ae 10490`process-coding-system-alist', and `network-coding-system-alist'. */);
4ed46869
KH
10491 Vcoding_system_for_read = Qnil;
10492
29208e82 10493 DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
48b0f3ae
PJ
10494 doc: /* Specify the coding system for write operations.
10495Programs bind this variable with `let', but you should not set it globally.
10496If the value is a coding system, it is used for encoding of output,
10497when writing it to a file and when sending it to a file or subprocess.
10498
10499If this does not specify a coding system, an appropriate element
446dcd75
JB
10500is used from one of the coding system alists.
10501There are three such tables: `file-coding-system-alist',
48b0f3ae
PJ
10502`process-coding-system-alist', and `network-coding-system-alist'.
10503For output to files, if the above procedure does not specify a coding system,
10504the value of `buffer-file-coding-system' is used. */);
4ed46869
KH
10505 Vcoding_system_for_write = Qnil;
10506
29208e82 10507 DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
df7492f9
KH
10508 doc: /*
10509Coding system used in the latest file or process I/O. */);
4ed46869
KH
10510 Vlast_coding_system_used = Qnil;
10511
29208e82 10512 DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
065e3595
KH
10513 doc: /*
10514Error status of the last code conversion.
10515
10516When an error was detected in the last code conversion, this variable
10517is set to one of the following symbols.
10518 `insufficient-source'
10519 `inconsistent-eol'
10520 `invalid-source'
10521 `interrupted'
10522 `insufficient-memory'
10523When no error was detected, the value doesn't change. So, to check
10524the error status of a code conversion by this variable, you must
10525explicitly set this variable to nil before performing code
10526conversion. */);
10527 Vlast_code_conversion_error = Qnil;
10528
29208e82 10529 DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
df7492f9
KH
10530 doc: /*
10531*Non-nil means always inhibit code conversion of end-of-line format.
48b0f3ae
PJ
10532See info node `Coding Systems' and info node `Text and Binary' concerning
10533such conversion. */);
9ce27fde
KH
10534 inhibit_eol_conversion = 0;
10535
29208e82 10536 DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
df7492f9
KH
10537 doc: /*
10538Non-nil means process buffer inherits coding system of process output.
48b0f3ae
PJ
10539Bind it to t if the process output is to be treated as if it were a file
10540read from some filesystem. */);
ed29121d
EZ
10541 inherit_process_coding_system = 0;
10542
29208e82 10543 DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
df7492f9
KH
10544 doc: /*
10545Alist to decide a coding system to use for a file I/O operation.
48b0f3ae
PJ
10546The format is ((PATTERN . VAL) ...),
10547where PATTERN is a regular expression matching a file name,
10548VAL is a coding system, a cons of coding systems, or a function symbol.
10549If VAL is a coding system, it is used for both decoding and encoding
10550the file contents.
10551If VAL is a cons of coding systems, the car part is used for decoding,
10552and the cdr part is used for encoding.
10553If VAL is a function symbol, the function must return a coding system
2c53e699
KH
10554or a cons of coding systems which are used as above. The function is
10555called with an argument that is a list of the arguments with which
5a0bbd9a
KH
10556`find-operation-coding-system' was called. If the function can't decide
10557a coding system, it can return `undecided' so that the normal
10558code-detection is performed.
48b0f3ae
PJ
10559
10560See also the function `find-operation-coding-system'
10561and the variable `auto-coding-alist'. */);
02ba4723
KH
10562 Vfile_coding_system_alist = Qnil;
10563
29208e82 10564 DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
df7492f9
KH
10565 doc: /*
10566Alist to decide a coding system to use for a process I/O operation.
48b0f3ae
PJ
10567The format is ((PATTERN . VAL) ...),
10568where PATTERN is a regular expression matching a program name,
10569VAL is a coding system, a cons of coding systems, or a function symbol.
10570If VAL is a coding system, it is used for both decoding what received
10571from the program and encoding what sent to the program.
10572If VAL is a cons of coding systems, the car part is used for decoding,
10573and the cdr part is used for encoding.
10574If VAL is a function symbol, the function must return a coding system
10575or a cons of coding systems which are used as above.
10576
10577See also the function `find-operation-coding-system'. */);
02ba4723
KH
10578 Vprocess_coding_system_alist = Qnil;
10579
29208e82 10580 DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
df7492f9
KH
10581 doc: /*
10582Alist to decide a coding system to use for a network I/O operation.
48b0f3ae
PJ
10583The format is ((PATTERN . VAL) ...),
10584where PATTERN is a regular expression matching a network service name
10585or is a port number to connect to,
10586VAL is a coding system, a cons of coding systems, or a function symbol.
10587If VAL is a coding system, it is used for both decoding what received
10588from the network stream and encoding what sent to the network stream.
10589If VAL is a cons of coding systems, the car part is used for decoding,
10590and the cdr part is used for encoding.
10591If VAL is a function symbol, the function must return a coding system
10592or a cons of coding systems which are used as above.
10593
10594See also the function `find-operation-coding-system'. */);
02ba4723 10595 Vnetwork_coding_system_alist = Qnil;
4ed46869 10596
29208e82 10597 DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
75205970
RS
10598 doc: /* Coding system to use with system messages.
10599Also used for decoding keyboard input on X Window system. */);
68c45bf0
PE
10600 Vlocale_coding_system = Qnil;
10601
005f0d35 10602 /* The eol mnemonics are reset in startup.el system-dependently. */
29208e82 10603 DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
df7492f9
KH
10604 doc: /*
10605*String displayed in mode line for UNIX-like (LF) end-of-line format. */);
2a0213a6 10606 eol_mnemonic_unix = build_pure_c_string (":");
4ed46869 10607
29208e82 10608 DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
df7492f9
KH
10609 doc: /*
10610*String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
2a0213a6 10611 eol_mnemonic_dos = build_pure_c_string ("\\");
4ed46869 10612
29208e82 10613 DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
df7492f9
KH
10614 doc: /*
10615*String displayed in mode line for MAC-like (CR) end-of-line format. */);
2a0213a6 10616 eol_mnemonic_mac = build_pure_c_string ("/");
4ed46869 10617
29208e82 10618 DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
df7492f9
KH
10619 doc: /*
10620*String displayed in mode line when end-of-line format is not yet determined. */);
2a0213a6 10621 eol_mnemonic_undecided = build_pure_c_string (":");
4ed46869 10622
29208e82 10623 DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
df7492f9
KH
10624 doc: /*
10625*Non-nil enables character translation while encoding and decoding. */);
84fbb8a0 10626 Venable_character_translation = Qt;
bdd9fb48 10627
f967223b 10628 DEFVAR_LISP ("standard-translation-table-for-decode",
29208e82 10629 Vstandard_translation_table_for_decode,
48b0f3ae 10630 doc: /* Table for translating characters while decoding. */);
f967223b 10631 Vstandard_translation_table_for_decode = Qnil;
bdd9fb48 10632
f967223b 10633 DEFVAR_LISP ("standard-translation-table-for-encode",
29208e82 10634 Vstandard_translation_table_for_encode,
48b0f3ae 10635 doc: /* Table for translating characters while encoding. */);
f967223b 10636 Vstandard_translation_table_for_encode = Qnil;
4ed46869 10637
29208e82 10638 DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
48b0f3ae
PJ
10639 doc: /* Alist of charsets vs revision numbers.
10640While encoding, if a charset (car part of an element) is found,
df7492f9
KH
10641designate it with the escape sequence identifying revision (cdr part
10642of the element). */);
10643 Vcharset_revision_table = Qnil;
02ba4723
KH
10644
10645 DEFVAR_LISP ("default-process-coding-system",
29208e82 10646 Vdefault_process_coding_system,
48b0f3ae
PJ
10647 doc: /* Cons of coding systems used for process I/O by default.
10648The car part is used for decoding a process output,
10649the cdr part is used for encoding a text to be sent to a process. */);
02ba4723 10650 Vdefault_process_coding_system = Qnil;
c4825358 10651
29208e82 10652 DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
df7492f9
KH
10653 doc: /*
10654Table of extra Latin codes in the range 128..159 (inclusive).
48b0f3ae
PJ
10655This is a vector of length 256.
10656If Nth element is non-nil, the existence of code N in a file
10657\(or output of subprocess) doesn't prevent it to be detected as
10658a coding system of ISO 2022 variant which has a flag
10659`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10660or reading output of a subprocess.
446dcd75 10661Only 128th through 159th elements have a meaning. */);
3f003981 10662 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
d46c5b12
KH
10663
10664 DEFVAR_LISP ("select-safe-coding-system-function",
29208e82 10665 Vselect_safe_coding_system_function,
df7492f9
KH
10666 doc: /*
10667Function to call to select safe coding system for encoding a text.
48b0f3ae
PJ
10668
10669If set, this function is called to force a user to select a proper
10670coding system which can encode the text in the case that a default
fdecf907
GM
10671coding system used in each operation can't encode the text. The
10672function should take care that the buffer is not modified while
10673the coding system is being selected.
48b0f3ae
PJ
10674
10675The default value is `select-safe-coding-system' (which see). */);
d46c5b12
KH
10676 Vselect_safe_coding_system_function = Qnil;
10677
5d5bf4d8 10678 DEFVAR_BOOL ("coding-system-require-warning",
29208e82 10679 coding_system_require_warning,
5d5bf4d8 10680 doc: /* Internal use only.
6b89e3aa
KH
10681If non-nil, on writing a file, `select-safe-coding-system-function' is
10682called even if `coding-system-for-write' is non-nil. The command
10683`universal-coding-system-argument' binds this variable to t temporarily. */);
5d5bf4d8
KH
10684 coding_system_require_warning = 0;
10685
10686
22ab2303 10687 DEFVAR_BOOL ("inhibit-iso-escape-detection",
29208e82 10688 inhibit_iso_escape_detection,
df7492f9 10689 doc: /*
97b1b294 10690If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
48b0f3ae 10691
97b1b294
EZ
10692When Emacs reads text, it tries to detect how the text is encoded.
10693This code detection is sensitive to escape sequences. If Emacs sees
10694a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10695of the ISO2022 encodings, and decodes text by the corresponding coding
10696system (e.g. `iso-2022-7bit').
48b0f3ae
PJ
10697
10698However, there may be a case that you want to read escape sequences in
10699a file as is. In such a case, you can set this variable to non-nil.
97b1b294
EZ
10700Then the code detection will ignore any escape sequences, and no text is
10701detected as encoded in some ISO-2022 encoding. The result is that all
48b0f3ae
PJ
10702escape sequences become visible in a buffer.
10703
10704The default value is nil, and it is strongly recommended not to change
10705it. That is because many Emacs Lisp source files that contain
10706non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10707in Emacs's distribution, and they won't be decoded correctly on
10708reading if you suppress escape sequence detection.
10709
10710The other way to read escape sequences in a file without decoding is
97b1b294 10711to explicitly specify some coding system that doesn't use ISO-2022
48b0f3ae 10712escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
74383408 10713 inhibit_iso_escape_detection = 0;
002fdb44 10714
97b1b294 10715 DEFVAR_BOOL ("inhibit-null-byte-detection",
29208e82 10716 inhibit_null_byte_detection,
97b1b294
EZ
10717 doc: /* If non-nil, Emacs ignores null bytes on code detection.
10718By default, Emacs treats it as binary data, and does not attempt to
10719decode it. The effect is as if you specified `no-conversion' for
10720reading that text.
10721
10722Set this to non-nil when a regular text happens to include null bytes.
10723Examples are Index nodes of Info files and null-byte delimited output
10724from GNU Find and GNU Grep. Emacs will then ignore the null bytes and
10725decode text as usual. */);
10726 inhibit_null_byte_detection = 0;
10727
29208e82 10728 DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
15c8f9d1 10729 doc: /* Char table for translating self-inserting characters.
446dcd75 10730This is applied to the result of input methods, not their input.
8434d0b8
EZ
10731See also `keyboard-translate-table'.
10732
10733Use of this variable for character code unification was rendered
10734obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10735internal character representation. */);
002fdb44 10736 Vtranslation_table_for_input = Qnil;
8f924df7 10737
2c78b7e1
KH
10738 {
10739 Lisp_Object args[coding_arg_max];
8f924df7 10740 Lisp_Object plist[16];
2c78b7e1
KH
10741 int i;
10742
10743 for (i = 0; i < coding_arg_max; i++)
10744 args[i] = Qnil;
10745
d67b4f80 10746 plist[0] = intern_c_string (":name");
2c78b7e1 10747 plist[1] = args[coding_arg_name] = Qno_conversion;
d67b4f80 10748 plist[2] = intern_c_string (":mnemonic");
2c78b7e1 10749 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
d67b4f80 10750 plist[4] = intern_c_string (":coding-type");
2c78b7e1 10751 plist[5] = args[coding_arg_coding_type] = Qraw_text;
d67b4f80 10752 plist[6] = intern_c_string (":ascii-compatible-p");
2c78b7e1 10753 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
d67b4f80 10754 plist[8] = intern_c_string (":default-char");
2c78b7e1 10755 plist[9] = args[coding_arg_default_char] = make_number (0);
d67b4f80 10756 plist[10] = intern_c_string (":for-unibyte");
8f924df7 10757 plist[11] = args[coding_arg_for_unibyte] = Qt;
d67b4f80 10758 plist[12] = intern_c_string (":docstring");
2a0213a6 10759 plist[13] = build_pure_c_string ("Do no conversion.\n\
2c78b7e1
KH
10760\n\
10761When you visit a file with this coding, the file is read into a\n\
10762unibyte buffer as is, thus each byte of a file is treated as a\n\
10763character.");
d67b4f80 10764 plist[14] = intern_c_string (":eol-type");
8f924df7
KH
10765 plist[15] = args[coding_arg_eol_type] = Qunix;
10766 args[coding_arg_plist] = Flist (16, plist);
2c78b7e1 10767 Fdefine_coding_system_internal (coding_arg_max, args);
ae6f73fa
KH
10768
10769 plist[1] = args[coding_arg_name] = Qundecided;
10770 plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10771 plist[5] = args[coding_arg_coding_type] = Qundecided;
10772 /* This is already set.
35befdaa 10773 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
d67b4f80 10774 plist[8] = intern_c_string (":charset-list");
ae6f73fa
KH
10775 plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10776 plist[11] = args[coding_arg_for_unibyte] = Qnil;
2a0213a6 10777 plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
ae6f73fa
KH
10778 plist[15] = args[coding_arg_eol_type] = Qnil;
10779 args[coding_arg_plist] = Flist (16, plist);
10780 Fdefine_coding_system_internal (coding_arg_max, args);
2c78b7e1
KH
10781 }
10782
2c78b7e1 10783 setup_coding_system (Qno_conversion, &safe_terminal_coding);
ff563fce
KH
10784
10785 {
10786 int i;
10787
10788 for (i = 0; i < coding_category_max; i++)
10789 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10790 }
1a4990fb 10791#if defined (DOS_NT)
fcbcfb64
KH
10792 system_eol_type = Qdos;
10793#else
10794 system_eol_type = Qunix;
10795#endif
10796 staticpro (&system_eol_type);
4ed46869
KH
10797}
10798
68c45bf0 10799char *
971de7fb 10800emacs_strerror (int error_number)
68c45bf0
PE
10801{
10802 char *str;
10803
ca9c0567 10804 synchronize_system_messages_locale ();
68c45bf0
PE
10805 str = strerror (error_number);
10806
10807 if (! NILP (Vlocale_coding_system))
10808 {
10809 Lisp_Object dec = code_convert_string_norecord (build_string (str),
10810 Vlocale_coding_system,
10811 0);
51b59d79 10812 str = SSDATA (dec);
68c45bf0
PE
10813 }
10814
10815 return str;
10816}
10817
4ed46869 10818#endif /* emacs */