* emacs-lisp/byte-run.el (defmacro): Use same argument parsing as
[bpt/emacs.git] / src / coding.c
CommitLineData
9542cb1f 1/* Coding system handler (conversion, detection, etc).
acaf905b 2 Copyright (C) 2001-2012 Free Software Foundation, Inc.
7976eda0 3 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
5df4f04c 4 2005, 2006, 2007, 2008, 2009, 2010, 2011
ce03bf76
KH
5 National Institute of Advanced Industrial Science and Technology (AIST)
6 Registration Number H14PRO021
8f924df7 7 Copyright (C) 2003
df7492f9
KH
8 National Institute of Advanced Industrial Science and Technology (AIST)
9 Registration Number H13PRO009
4ed46869 10
369314dc
KH
11This file is part of GNU Emacs.
12
9ec0b715 13GNU Emacs is free software: you can redistribute it and/or modify
369314dc 14it under the terms of the GNU General Public License as published by
9ec0b715
GM
15the Free Software Foundation, either version 3 of the License, or
16(at your option) any later version.
4ed46869 17
369314dc
KH
18GNU Emacs is distributed in the hope that it will be useful,
19but WITHOUT ANY WARRANTY; without even the implied warranty of
20MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21GNU General Public License for more details.
4ed46869 22
369314dc 23You should have received a copy of the GNU General Public License
9ec0b715 24along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
4ed46869
KH
25
26/*** TABLE OF CONTENTS ***
27
b73bfc1c 28 0. General comments
4ed46869 29 1. Preamble
df7492f9
KH
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
4ed46869
KH
41
42*/
43
df7492f9 44/*** 0. General comments ***
b73bfc1c
KH
45
46
df7492f9 47CODING SYSTEM
4ed46869 48
5bad0796
DL
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
e19c3639
KH
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
0ef69138 56 coding system.
4ed46869 57
34809aa6
EZ
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. On
59 the C level, a coding system is represented by a vector of attributes
5bad0796 60 stored in the hash table Vcoding_system_hash_table. The conversion from
e19c3639
KH
61 coding system symbol to attributes vector is done by looking up
62 Vcoding_system_hash_table by the symbol.
4ed46869 63
e19c3639 64 Coding systems are classified into the following types depending on
5bad0796 65 the encoding mechanism. Here's a brief description of the types.
4ed46869 66
df7492f9
KH
67 o UTF-8
68
69 o UTF-16
70
71 o Charset-base coding system
72
73 A coding system defined by one or more (coded) character sets.
5bad0796 74 Decoding and encoding are done by a code converter defined for each
df7492f9
KH
75 character set.
76
5bad0796 77 o Old Emacs internal format (emacs-mule)
df7492f9 78
5bad0796 79 The coding system adopted by old versions of Emacs (20 and 21).
4ed46869 80
df7492f9 81 o ISO2022-base coding system
4ed46869
KH
82
83 The most famous coding system for multiple character sets. X's
df7492f9
KH
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
4ed46869 87
df7492f9 88 o SJIS (or Shift-JIS or MS-Kanji-Code)
93dec019 89
4ed46869
KH
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
df7492f9 92 section 8.
4ed46869 93
df7492f9 94 o BIG5
4ed46869 95
df7492f9 96 A coding system to encode character sets: ASCII and Big5. Widely
cfb43547 97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
df7492f9
KH
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
4ed46869 101
df7492f9 102 o CCL
27901516 103
5bad0796 104 If a user wants to decode/encode text encoded in a coding system
df7492f9
KH
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
27901516 108
df7492f9 109 o Raw-text
4ed46869 110
5a936b46 111 A coding system for text containing raw eight-bit data. Emacs
5bad0796 112 treats each byte of source text as a character (except for
df7492f9 113 end-of-line conversion).
4ed46869 114
df7492f9
KH
115 o No-conversion
116
117 Like raw text, but don't do end-of-line conversion.
4ed46869 118
4ed46869 119
df7492f9 120END-OF-LINE FORMAT
4ed46869 121
5bad0796 122 How text end-of-line is encoded depends on operating system. For
df7492f9 123 instance, Unix's format is just one byte of LF (line-feed) code,
f4dee582 124 whereas DOS's format is two-byte sequence of `carriage-return' and
d46c5b12
KH
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
4ed46869 127
cfb43547 128 Since text character encoding and end-of-line encoding are
df7492f9
KH
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
4ed46869 131
e19c3639
KH
132STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
5bad0796 137 conversion (e.g. the location of source and destination data).
4ed46869
KH
138
139*/
140
df7492f9
KH
141/* COMMON MACROS */
142
143
4ed46869
KH
144/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145
df7492f9 146 These functions check if a byte sequence specified as a source in
ff0dacd7
KH
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
df7492f9 149
f10fe38f 150 Return true if the byte sequence conforms to XXX.
df7492f9
KH
151
152 Below is the template of these functions. */
153
#if 0
/* Template for a detector.  Scan the source bytes of CODING and
   update DETECT_INFO: on an invalid byte, add CATEGORY_MASK_XXX to
   DETECT_INFO->rejected and return 0; when the source is exhausted
   without an invalid byte, add the accumulated FOUND mask to
   DETECT_INFO->found and return 1.  */
static bool
detect_coding_XXX (struct coding_system *coding,
                   struct coding_detection_info *detect_info)
{
  /* ONE_MORE_BYTE expects src, src_base, src_end, multibytep,
     consumed_chars and coding to be in scope.  */
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  ptrdiff_t consumed_chars = 0;
  int found = 0;
  ...;

  while (1)
    {
      int c;

      src_base = src;
      /* Get one byte from the source.  If the source is exhausted, jump
         to no_more_source:.  */
      ONE_MORE_BYTE (c);

      if (! __C_conforms_to_XXX___ (c))
        break;
      /* Only a byte that strongly suggests XXX counts as positive
         evidence; merely conforming bytes leave FOUND unchanged.  */
      if (__C_strongly_suggests_XXX__ (c))
        found = CATEGORY_MASK_XXX;
    }
  /* The byte sequence is invalid for XXX.  */
  detect_info->rejected |= CATEGORY_MASK_XXX;
  return 0;

 no_more_source:
  /* The source exhausted successfully.  */
  detect_info->found |= found;
  return 1;
}
#endif
187
188/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
189
df7492f9
KH
190 These functions decode a byte sequence specified as a source by
191 CODING. The resulting multibyte text goes to a place pointed to by
192 CODING->charbuf, the length of which should not exceed
193 CODING->charbuf_size;
d46c5b12 194
df7492f9
KH
195 These functions set the information of original and decoded texts in
196 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
197 They also set CODING->result to one of CODING_RESULT_XXX indicating
198 how the decoding is finished.
d46c5b12 199
df7492f9 200 Below is the template of these functions. */
d46c5b12 201
#if 0
/* Template for a decoder.  Decode the not-yet-consumed source bytes of
   CODING into CODING->charbuf, then record how much was consumed and
   produced in CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used.  */
static void
decode_coding_XXXX (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* SRC_BASE remembers the start position in source in each loop.
     The loop will be exited when there's not enough source code, or
     when there's no room in CHARBUF for a decoded character.  */
  const unsigned char *src_base;
  /* A buffer to produce decoded characters.  */
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  bool multibytep = coding->src_multibyte;
  /* Incremented by ONE_MORE_BYTE.  */
  ptrdiff_t consumed_chars = 0;

  while (1)
    {
      int c;

      src_base = src;
      if (charbuf >= charbuf_end)
        /* No more room to produce a decoded character.  */
        break;
      ONE_MORE_BYTE (c);
      /* Decode it.  */
    }

 no_more_source:
  if (src_base < src_end
      && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      /* If the source ends by partial bytes to construct a character,
         treat them as eight-bit raw data.  */
      while (src_base < src_end && charbuf < charbuf_end)
        *charbuf++ = *src_base++;
    }
  /* Remember how many bytes and characters we consumed.  If the
     source is multibyte, the bytes and chars are not identical.  */
  coding->consumed = coding->consumed_char = src_base - coding->source;
  /* Remember how many characters we produced.  */
  coding->charbuf_used = charbuf - coding->charbuf;
}
#endif
241
242/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
243
df7492f9
KH
244 These functions encode SRC_BYTES length text at SOURCE of Emacs'
245 internal multibyte format by CODING. The resulting byte sequence
b73bfc1c
KH
246 goes to a place pointed to by DESTINATION, the length of which
247 should not exceed DST_BYTES.
d46c5b12 248
df7492f9
KH
249 These functions set the information of original and encoded texts in
250 the members produced, produced_char, consumed, and consumed_char of
251 the structure *CODING. They also set the member result to one of
252 CODING_RESULT_XXX indicating how the encoding finished.
d46c5b12 253
df7492f9
KH
254 DST_BYTES zero means that source area and destination area are
255 overlapped, which means that we can produce an encoded text until it
256 reaches the head of the not-yet-encoded source text.
d46c5b12 257
df7492f9 258 Below is a template of these functions. */
#if 0
/* Template for an encoder.  Encode the characters held in
   CODING->charbuf into CODING->destination, then record the amount
   produced in CODING->produced_char and CODING->produced.  */
static void
encode_coding_XXX (struct coding_system *coding)
{
  bool multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  /* End of the valid characters in CODING->charbuf.  */
  int *charbuf_end = coding->charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  /* Stop early enough that one more loop iteration cannot run past
     DST_END.  */
  unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
  ptrdiff_t produced_chars = 0;

  for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
    {
      int c = *charbuf;
      /* Encode C into DST, and increment DST.  */
    }
 label_no_more_destination:
  /* How many chars and bytes we produced.  */
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
}
#endif
282
4ed46869
KH
283\f
284/*** 1. Preamble ***/
285
68c45bf0 286#include <config.h>
4ed46869
KH
287#include <stdio.h>
288
4ed46869 289#include "lisp.h"
df7492f9 290#include "character.h"
e5560ff7 291#include "buffer.h"
4ed46869
KH
292#include "charset.h"
293#include "ccl.h"
df7492f9 294#include "composite.h"
4ed46869
KH
295#include "coding.h"
296#include "window.h"
b8299c66
KL
297#include "frame.h"
298#include "termhooks.h"
4ed46869 299
df7492f9 300Lisp_Object Vcoding_system_hash_table;
4ed46869 301
955cbe7b
PE
302static Lisp_Object Qcoding_system, Qeol_type;
303static Lisp_Object Qcoding_aliases;
1965cb73 304Lisp_Object Qunix, Qdos;
4ed46869 305Lisp_Object Qbuffer_file_coding_system;
955cbe7b
PE
306static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
307static Lisp_Object Qdefault_char;
27901516 308Lisp_Object Qno_conversion, Qundecided;
955cbe7b
PE
309Lisp_Object Qcharset, Qutf_8;
310static Lisp_Object Qiso_2022;
311static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
312static Lisp_Object Qbig, Qlittle;
313static Lisp_Object Qcoding_system_history;
314static Lisp_Object Qvalid_codes;
315static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
316static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
317static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
318static Lisp_Object QCascii_compatible_p;
4ed46869 319
387f6ba5 320Lisp_Object Qcall_process, Qcall_process_region;
4ed46869 321Lisp_Object Qstart_process, Qopen_network_stream;
955cbe7b 322static Lisp_Object Qtarget_idx;
4ed46869 323
955cbe7b
PE
324static Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
325static Lisp_Object Qinterrupted, Qinsufficient_memory;
065e3595 326
44e8490d
KH
327/* If a symbol has this property, evaluate the value to define the
328 symbol as a coding system. */
329static Lisp_Object Qcoding_system_define_form;
330
fcbcfb64
KH
331/* Format of end-of-line decided by system. This is Qunix on
332 Unix and Mac, Qdos on DOS/Windows.
333 This has an effect only for external encoding (i.e. for output to
334 file and process), not for in-buffer or Lisp string encoding. */
335static Lisp_Object system_eol_type;
336
4ed46869
KH
337#ifdef emacs
338
4608c386 339Lisp_Object Qcoding_system_p, Qcoding_system_error;
4ed46869 340
d46c5b12
KH
341/* Coding system emacs-mule and raw-text are for converting only
342 end-of-line format. */
343Lisp_Object Qemacs_mule, Qraw_text;
8f924df7 344Lisp_Object Qutf_8_emacs;
ecf488bc 345
53372c27 346#if defined (WINDOWSNT) || defined (CYGWIN)
ba116008
DC
347static Lisp_Object Qutf_16le;
348#endif
349
4ed46869
KH
350/* Coding-systems are handed between Emacs Lisp programs and C internal
351 routines by the following three variables. */
c4825358
KH
352/* Coding system to be used to encode text for terminal display when
353 terminal coding system is nil. */
354struct coding_system safe_terminal_coding;
355
4ed46869
KH
356#endif /* emacs */
357
f967223b
KH
358Lisp_Object Qtranslation_table;
359Lisp_Object Qtranslation_table_id;
955cbe7b
PE
360static Lisp_Object Qtranslation_table_for_decode;
361static Lisp_Object Qtranslation_table_for_encode;
4ed46869 362
df7492f9 363/* Two special coding systems. */
74ab6df5
PE
364static Lisp_Object Vsjis_coding_system;
365static Lisp_Object Vbig5_coding_system;
df7492f9 366
df7492f9
KH
367/* ISO2022 section */
368
369#define CODING_ISO_INITIAL(coding, reg) \
370 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
371 coding_attr_iso_initial), \
372 reg)))
373
374
1b3b981b
AS
375#define CODING_ISO_REQUEST(coding, charset_id) \
376 (((charset_id) <= (coding)->max_charset_id \
377 ? ((coding)->safe_charsets[charset_id] != 255 \
378 ? (coding)->safe_charsets[charset_id] \
379 : -1) \
df7492f9
KH
380 : -1))
381
382
383#define CODING_ISO_FLAGS(coding) \
384 ((coding)->spec.iso_2022.flags)
385#define CODING_ISO_DESIGNATION(coding, reg) \
386 ((coding)->spec.iso_2022.current_designation[reg])
387#define CODING_ISO_INVOCATION(coding, plane) \
388 ((coding)->spec.iso_2022.current_invocation[plane])
389#define CODING_ISO_SINGLE_SHIFTING(coding) \
390 ((coding)->spec.iso_2022.single_shifting)
391#define CODING_ISO_BOL(coding) \
392 ((coding)->spec.iso_2022.bol)
393#define CODING_ISO_INVOKED_CHARSET(coding, plane) \
394 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
e951386e
KH
395#define CODING_ISO_CMP_STATUS(coding) \
396 (&(coding)->spec.iso_2022.cmp_status)
397#define CODING_ISO_EXTSEGMENT_LEN(coding) \
398 ((coding)->spec.iso_2022.ctext_extended_segment_len)
399#define CODING_ISO_EMBEDDED_UTF_8(coding) \
400 ((coding)->spec.iso_2022.embedded_utf_8)
df7492f9
KH
401
402/* Control characters of ISO2022. */
403 /* code */ /* function */
df7492f9
KH
404#define ISO_CODE_SO 0x0E /* shift-out */
405#define ISO_CODE_SI 0x0F /* shift-in */
406#define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
407#define ISO_CODE_ESC 0x1B /* escape */
408#define ISO_CODE_SS2 0x8E /* single-shift-2 */
409#define ISO_CODE_SS3 0x8F /* single-shift-3 */
410#define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
411
412/* Every (1-byte) code of ISO2022 is classified into one of the
413 following classes. */
414enum iso_code_class_type
415 {
416 ISO_control_0, /* Control codes in the range
417 0x00..0x1F and 0x7F, except for the
418 following 4 codes. */
df7492f9
KH
419 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
420 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
421 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
a0d7415f 422 ISO_escape, /* ISO_CODE_ESC (0x1B) */
df7492f9
KH
423 ISO_control_1, /* Control codes in the range
424 0x80..0x9F, except for the
425 following 3 codes. */
426 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
427 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
428 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
429 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
430 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
431 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
432 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
433 };
05e6f5dc 434
df7492f9
KH
435/** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
436 `iso-flags' attribute of an iso2022 coding system. */
05e6f5dc 437
df7492f9
KH
438/* If set, produce long-form designation sequence (e.g. ESC $ ( A)
439 instead of the correct short-form sequence (e.g. ESC $ A). */
440#define CODING_ISO_FLAG_LONG_FORM 0x0001
93dec019 441
df7492f9
KH
442/* If set, reset graphic planes and registers at end-of-line to the
443 initial state. */
444#define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
05e6f5dc 445
df7492f9
KH
446/* If set, reset graphic planes and registers before any control
447 characters to the initial state. */
448#define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
05e6f5dc 449
df7492f9
KH
450/* If set, encode by 7-bit environment. */
451#define CODING_ISO_FLAG_SEVEN_BITS 0x0008
4ed46869 452
df7492f9
KH
453/* If set, use locking-shift function. */
454#define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
b73bfc1c 455
df7492f9
KH
456/* If set, use single-shift function. Overwrite
457 CODING_ISO_FLAG_LOCKING_SHIFT. */
458#define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
b73bfc1c 459
df7492f9
KH
460/* If set, use designation escape sequence. */
461#define CODING_ISO_FLAG_DESIGNATION 0x0040
b73bfc1c 462
df7492f9
KH
463/* If set, produce revision number sequence. */
464#define CODING_ISO_FLAG_REVISION 0x0080
b73bfc1c 465
df7492f9
KH
466/* If set, produce ISO6429's direction specifying sequence. */
467#define CODING_ISO_FLAG_DIRECTION 0x0100
f4dee582 468
df7492f9
KH
469/* If set, assume designation states are reset at beginning of line on
470 output. */
471#define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
4ed46869 472
df7492f9
KH
473/* If set, designation sequence should be placed at beginning of line
474 on output. */
475#define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
aa72b389 476
ad1746f5 477/* If set, do not encode unsafe characters on output. */
df7492f9 478#define CODING_ISO_FLAG_SAFE 0x0800
aa72b389 479
df7492f9
KH
480/* If set, extra latin codes (128..159) are accepted as a valid code
481 on input. */
482#define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
aa72b389 483
df7492f9 484#define CODING_ISO_FLAG_COMPOSITION 0x2000
aa72b389 485
5f58e762 486/* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
aa72b389 487
bf16eb23 488#define CODING_ISO_FLAG_USE_ROMAN 0x8000
aa72b389 489
bf16eb23 490#define CODING_ISO_FLAG_USE_OLDJIS 0x10000
aa72b389 491
bf16eb23 492#define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
aa72b389 493
df7492f9
KH
494/* A character to be produced on output if encoding of the original
495 character is prohibited by CODING_ISO_FLAG_SAFE. */
496#define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
aa72b389 497
a470d443
KH
498/* UTF-8 section */
499#define CODING_UTF_8_BOM(coding) \
500 ((coding)->spec.utf_8_bom)
4ed46869 501
df7492f9
KH
502/* UTF-16 section */
503#define CODING_UTF_16_BOM(coding) \
504 ((coding)->spec.utf_16.bom)
4ed46869 505
df7492f9
KH
506#define CODING_UTF_16_ENDIAN(coding) \
507 ((coding)->spec.utf_16.endian)
4ed46869 508
df7492f9
KH
509#define CODING_UTF_16_SURROGATE(coding) \
510 ((coding)->spec.utf_16.surrogate)
4ed46869 511
4ed46869 512
df7492f9
KH
513/* CCL section */
514#define CODING_CCL_DECODER(coding) \
515 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
516#define CODING_CCL_ENCODER(coding) \
517 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
518#define CODING_CCL_VALIDS(coding) \
8f924df7 519 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
4ed46869 520
5a936b46 521/* Index for each coding category in `coding_categories' */
4ed46869 522
df7492f9
KH
523enum coding_category
524 {
525 coding_category_iso_7,
526 coding_category_iso_7_tight,
527 coding_category_iso_8_1,
528 coding_category_iso_8_2,
529 coding_category_iso_7_else,
530 coding_category_iso_8_else,
a470d443
KH
531 coding_category_utf_8_auto,
532 coding_category_utf_8_nosig,
533 coding_category_utf_8_sig,
df7492f9
KH
534 coding_category_utf_16_auto,
535 coding_category_utf_16_be,
536 coding_category_utf_16_le,
537 coding_category_utf_16_be_nosig,
538 coding_category_utf_16_le_nosig,
539 coding_category_charset,
540 coding_category_sjis,
541 coding_category_big5,
542 coding_category_ccl,
543 coding_category_emacs_mule,
544 /* All above are targets of code detection. */
545 coding_category_raw_text,
546 coding_category_undecided,
547 coding_category_max
548 };
549
550/* Definitions of flag bits used in detect_coding_XXXX. */
551#define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
552#define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
553#define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
554#define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
555#define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
556#define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
a470d443
KH
557#define CATEGORY_MASK_UTF_8_AUTO (1 << coding_category_utf_8_auto)
558#define CATEGORY_MASK_UTF_8_NOSIG (1 << coding_category_utf_8_nosig)
559#define CATEGORY_MASK_UTF_8_SIG (1 << coding_category_utf_8_sig)
b49a1807 560#define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
df7492f9
KH
561#define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
562#define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
563#define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
564#define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
565#define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
566#define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
567#define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
568#define CATEGORY_MASK_CCL (1 << coding_category_ccl)
569#define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
ff0dacd7 570#define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
df7492f9
KH
571
572/* This value is returned if detect_coding_mask () finds nothing other
573 than ASCII characters. */
574#define CATEGORY_MASK_ANY \
575 (CATEGORY_MASK_ISO_7 \
576 | CATEGORY_MASK_ISO_7_TIGHT \
577 | CATEGORY_MASK_ISO_8_1 \
578 | CATEGORY_MASK_ISO_8_2 \
579 | CATEGORY_MASK_ISO_7_ELSE \
580 | CATEGORY_MASK_ISO_8_ELSE \
a470d443
KH
581 | CATEGORY_MASK_UTF_8_AUTO \
582 | CATEGORY_MASK_UTF_8_NOSIG \
583 | CATEGORY_MASK_UTF_8_SIG \
2f3cbb32 584 | CATEGORY_MASK_UTF_16_AUTO \
df7492f9
KH
585 | CATEGORY_MASK_UTF_16_BE \
586 | CATEGORY_MASK_UTF_16_LE \
587 | CATEGORY_MASK_UTF_16_BE_NOSIG \
588 | CATEGORY_MASK_UTF_16_LE_NOSIG \
589 | CATEGORY_MASK_CHARSET \
590 | CATEGORY_MASK_SJIS \
591 | CATEGORY_MASK_BIG5 \
592 | CATEGORY_MASK_CCL \
593 | CATEGORY_MASK_EMACS_MULE)
594
595
596#define CATEGORY_MASK_ISO_7BIT \
597 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
598
599#define CATEGORY_MASK_ISO_8BIT \
600 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
601
602#define CATEGORY_MASK_ISO_ELSE \
603 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
604
605#define CATEGORY_MASK_ISO_ESCAPE \
606 (CATEGORY_MASK_ISO_7 \
607 | CATEGORY_MASK_ISO_7_TIGHT \
608 | CATEGORY_MASK_ISO_7_ELSE \
609 | CATEGORY_MASK_ISO_8_ELSE)
610
611#define CATEGORY_MASK_ISO \
612 ( CATEGORY_MASK_ISO_7BIT \
613 | CATEGORY_MASK_ISO_8BIT \
614 | CATEGORY_MASK_ISO_ELSE)
615
616#define CATEGORY_MASK_UTF_16 \
2f3cbb32
KH
617 (CATEGORY_MASK_UTF_16_AUTO \
618 | CATEGORY_MASK_UTF_16_BE \
df7492f9
KH
619 | CATEGORY_MASK_UTF_16_LE \
620 | CATEGORY_MASK_UTF_16_BE_NOSIG \
621 | CATEGORY_MASK_UTF_16_LE_NOSIG)
622
a470d443
KH
623#define CATEGORY_MASK_UTF_8 \
624 (CATEGORY_MASK_UTF_8_AUTO \
625 | CATEGORY_MASK_UTF_8_NOSIG \
626 | CATEGORY_MASK_UTF_8_SIG)
df7492f9 627
df7492f9 628/* Table of coding categories (Lisp symbols). This variable is for
ad1746f5 629 internal use only. */
df7492f9
KH
630static Lisp_Object Vcoding_category_table;
631
632/* Table of coding-categories ordered by priority. */
633static enum coding_category coding_priorities[coding_category_max];
634
635/* Nth element is a coding context for the coding system bound to the
636 Nth coding category. */
637static struct coding_system coding_categories[coding_category_max];
638
df7492f9
KH
639/*** Commonly used macros and functions ***/
640
641#ifndef min
642#define min(a, b) ((a) < (b) ? (a) : (b))
643#endif
644#ifndef max
645#define max(a, b) ((a) > (b) ? (a) : (b))
646#endif
4ed46869 647
24a73b0a
KH
648#define CODING_GET_INFO(coding, attrs, charset_list) \
649 do { \
650 (attrs) = CODING_ID_ATTRS ((coding)->id); \
651 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
df7492f9 652 } while (0)
4ed46869 653
4ed46869 654
df7492f9
KH
655/* Safely get one byte from the source text pointed by SRC which ends
656 at SRC_END, and set C to that byte. If there are not enough bytes
f10fe38f
PE
657 in the source, it jumps to 'no_more_source'. If MULTIBYTEP,
658 and a multibyte character is found at SRC, set C to the
065e3595
KH
659 negative value of the character code. The caller should declare
660 and set these variables appropriately in advance:
661 src, src_base, src_end, multibytep, consumed_chars, coding */
aa72b389 662
065e3595
KH
663#define ONE_MORE_BYTE(c) \
664 do { \
665 if (src == src_end) \
666 { \
667 if (src_base < src) \
668 record_conversion_result \
669 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
670 goto no_more_source; \
671 } \
672 c = *src++; \
673 if (multibytep && (c & 0x80)) \
674 { \
675 if ((c & 0xFE) == 0xC0) \
676 c = ((c & 1) << 6) | *src++; \
677 else \
678 { \
35befdaa
KH
679 src--; \
680 c = - string_char (src, &src, NULL); \
065e3595
KH
681 record_conversion_result \
682 (coding, CODING_RESULT_INVALID_SRC); \
683 } \
684 } \
685 consumed_chars++; \
aa72b389
KH
686 } while (0)
687
f56a4450 688/* Safely get two bytes from the source text pointed by SRC which ends
220eeac9
KH
689 at SRC_END, and set C1 and C2 to those bytes while skipping the
690 heading multibyte characters. If there are not enough bytes in the
f10fe38f 691 source, it jumps to 'no_more_source'. If MULTIBYTEP and
220eeac9
KH
692 a multibyte character is found for C2, set C2 to the negative value
693 of the character code. The caller should declare and set these
694 variables appropriately in advance:
f56a4450
KH
695 src, src_end, multibytep
696 It is intended that this macro is used in detect_coding_utf_16. */
697
220eeac9
KH
698#define TWO_MORE_BYTES(c1, c2) \
699 do { \
700 do { \
701 if (src == src_end) \
702 goto no_more_source; \
703 c1 = *src++; \
704 if (multibytep && (c1 & 0x80)) \
705 { \
706 if ((c1 & 0xFE) == 0xC0) \
707 c1 = ((c1 & 1) << 6) | *src++; \
708 else \
709 { \
710 src += BYTES_BY_CHAR_HEAD (c1) - 1; \
711 c1 = -1; \
712 } \
713 } \
714 } while (c1 < 0); \
715 if (src == src_end) \
716 goto no_more_source; \
717 c2 = *src++; \
718 if (multibytep && (c2 & 0x80)) \
719 { \
720 if ((c2 & 0xFE) == 0xC0) \
721 c2 = ((c2 & 1) << 6) | *src++; \
722 else \
723 c2 = -1; \
724 } \
f56a4450
KH
725 } while (0)
726
aa72b389 727
df7492f9
KH
728/* Store a byte C in the place pointed by DST and increment DST to the
729 next free point, and increment PRODUCED_CHARS. The caller should
730 assure that C is 0..127, and declare and set the variable `dst'
731 appropriately in advance.
732*/
aa72b389
KH
733
734
df7492f9
KH
735#define EMIT_ONE_ASCII_BYTE(c) \
736 do { \
737 produced_chars++; \
738 *dst++ = (c); \
b6871cc7 739 } while (0)
aa72b389
KH
740
741
ad1746f5 742/* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2. */
aa72b389 743
df7492f9
KH
744#define EMIT_TWO_ASCII_BYTES(c1, c2) \
745 do { \
746 produced_chars += 2; \
747 *dst++ = (c1), *dst++ = (c2); \
748 } while (0)
aa72b389
KH
749
750
df7492f9 751/* Store a byte C in the place pointed by DST and increment DST to the
f10fe38f
PE
752 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP,
753 store in an appropriate multibyte form. The caller should
df7492f9
KH
754 declare and set the variables `dst' and `multibytep' appropriately
755 in advance. */
756
757#define EMIT_ONE_BYTE(c) \
758 do { \
759 produced_chars++; \
760 if (multibytep) \
761 { \
b25d760e 762 unsigned ch = (c); \
df7492f9
KH
763 if (ch >= 0x80) \
764 ch = BYTE8_TO_CHAR (ch); \
765 CHAR_STRING_ADVANCE (ch, dst); \
766 } \
767 else \
768 *dst++ = (c); \
aa72b389 769 } while (0)
aa72b389 770
aa72b389 771
df7492f9 772/* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
aa72b389 773
e19c3639
KH
774#define EMIT_TWO_BYTES(c1, c2) \
775 do { \
776 produced_chars += 2; \
777 if (multibytep) \
778 { \
b25d760e 779 unsigned ch; \
e19c3639
KH
780 \
781 ch = (c1); \
782 if (ch >= 0x80) \
783 ch = BYTE8_TO_CHAR (ch); \
784 CHAR_STRING_ADVANCE (ch, dst); \
785 ch = (c2); \
786 if (ch >= 0x80) \
787 ch = BYTE8_TO_CHAR (ch); \
788 CHAR_STRING_ADVANCE (ch, dst); \
789 } \
790 else \
791 { \
792 *dst++ = (c1); \
793 *dst++ = (c2); \
794 } \
aa72b389
KH
795 } while (0)
796
797
df7492f9
KH
798#define EMIT_THREE_BYTES(c1, c2, c3) \
799 do { \
800 EMIT_ONE_BYTE (c1); \
801 EMIT_TWO_BYTES (c2, c3); \
802 } while (0)
aa72b389 803
aa72b389 804
df7492f9
KH
805#define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
806 do { \
807 EMIT_TWO_BYTES (c1, c2); \
808 EMIT_TWO_BYTES (c3, c4); \
809 } while (0)
aa72b389 810
aa72b389 811
065e3595
KH
812static void
813record_conversion_result (struct coding_system *coding,
814 enum coding_result_code result)
815{
816 coding->result = result;
817 switch (result)
818 {
819 case CODING_RESULT_INSUFFICIENT_SRC:
820 Vlast_code_conversion_error = Qinsufficient_source;
821 break;
822 case CODING_RESULT_INCONSISTENT_EOL:
823 Vlast_code_conversion_error = Qinconsistent_eol;
824 break;
825 case CODING_RESULT_INVALID_SRC:
826 Vlast_code_conversion_error = Qinvalid_source;
827 break;
828 case CODING_RESULT_INTERRUPT:
829 Vlast_code_conversion_error = Qinterrupted;
830 break;
831 case CODING_RESULT_INSUFFICIENT_MEM:
832 Vlast_code_conversion_error = Qinsufficient_memory;
833 break;
ebaf11b6
KH
834 case CODING_RESULT_INSUFFICIENT_DST:
835 /* Don't record this error in Vlast_code_conversion_error
836 because it happens just temporarily and is resolved when the
837 whole conversion is finished. */
838 break;
409ea3a1
AS
839 case CODING_RESULT_SUCCESS:
840 break;
35befdaa
KH
841 default:
842 Vlast_code_conversion_error = intern ("Unknown error");
065e3595
KH
843 }
844}
845
5eb05ea3
KH
846/* These wrapper macros are used to preserve validity of pointers into
847 buffer text across calls to decode_char, encode_char, etc, which
848 could cause relocation of buffers if it loads a charset map,
849 because loading a charset map allocates large structures. */
850
df7492f9
KH
/* Set C to DECODE_CHAR (CHARSET, CODE).  If that call loaded a
   charset map and CODING's source moved as a result, shift SRC,
   SRC_BASE and SRC_END by the same distance.  */

#define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
  do {                                                                  \
    charset_map_loaded = 0;                                             \
    c = DECODE_CHAR (charset, code);                                    \
    if (charset_map_loaded)                                             \
      {                                                                 \
        ptrdiff_t offset = coding_change_source (coding);               \
                                                                        \
        if (offset)                                                     \
          {                                                             \
            src += offset;                                              \
            src_base += offset;                                         \
            src_end += offset;                                          \
          }                                                             \
      }                                                                 \
  } while (0)
865
5eb05ea3
KH
/* Set CODE to ENCODE_CHAR (CHARSET, C).  If that call loaded a
   charset map and CODING's destination moved as a result, shift DST
   and DST_END by the same distance.  */

#define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
  do {                                                                  \
    charset_map_loaded = 0;                                             \
    code = ENCODE_CHAR (charset, c);                                    \
    if (charset_map_loaded)                                             \
      {                                                                 \
        ptrdiff_t offset = coding_change_destination (coding);          \
                                                                        \
        if (offset)                                                     \
          {                                                             \
            dst += offset;                                              \
            dst_end += offset;                                          \
          }                                                             \
      }                                                                 \
  } while (0)
879
/* Set CHARSET to char_charset (C, CHARSET_LIST, CODE_RETURN).  If
   that call loaded a charset map and CODING's destination moved as a
   result, shift DST and DST_END by the same distance.  */

#define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
  do {                                                                  \
    charset_map_loaded = 0;                                             \
    charset = char_charset (c, charset_list, code_return);              \
    if (charset_map_loaded)                                             \
      {                                                                 \
        ptrdiff_t offset = coding_change_destination (coding);          \
                                                                        \
        if (offset)                                                     \
          {                                                             \
            dst += offset;                                              \
            dst_end += offset;                                          \
          }                                                             \
      }                                                                 \
  } while (0)
893
/* Set RESULT to CHAR_CHARSET_P (C, CHARSET).  If that call loaded a
   charset map and CODING's destination moved as a result, shift DST
   and DST_END by the same distance.  */

#define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
  do {                                                                  \
    charset_map_loaded = 0;                                             \
    result = CHAR_CHARSET_P (c, charset);                               \
    if (charset_map_loaded)                                             \
      {                                                                 \
        ptrdiff_t offset = coding_change_destination (coding);          \
                                                                        \
        if (offset)                                                     \
          {                                                             \
            dst += offset;                                              \
            dst_end += offset;                                          \
          }                                                             \
      }                                                                 \
  } while (0)
907
aa72b389 908
119852e7
KH
/* If the room left at dst is not larger than BYTES, allocate more
   memory for coding->destination and update dst and dst_end.  We
   don't have to take care of coding->source which will be relocated.
   It is handled by calling coding_set_source in encode_coding.  */
913
df7492f9
KH
#define ASSURE_DESTINATION(bytes)                                       \
  do {                                                                  \
    if (dst_end <= dst + (bytes))                                       \
      {                                                                 \
        /* Ask for BYTES plus one byte per remaining charbuf slot.  */  \
        ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes);         \
                                                                        \
        dst = alloc_destination (coding, more_bytes, dst);              \
        dst_end = coding->destination + coding->dst_bytes;              \
      }                                                                 \
  } while (0)
aa72b389 924
aa72b389 925
/* Store the multibyte form of character C at P and advance P past
   it.  Historically this differed from CHAR_STRING_ADVANCE by never
   calling MAYBE_UNIFY_CHAR; CHAR_STRING_ADVANCE no longer calls it
   either, so this is now a plain alias.  */

#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p) CHAR_STRING_ADVANCE(c, p)

/* Return the character whose multibyte form starts at P and advance
   P past it.  Historically this differed from STRING_CHAR_ADVANCE by
   never calling MAYBE_UNIFY_CHAR; STRING_CHAR_ADVANCE no longer calls
   it either, so this is now a plain alias.  */

#define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
aa72b389 939
c1892f11 940/* Set coding->source from coding->src_object. */
5eb05ea3 941
c1892f11 942static void
971de7fb 943coding_set_source (struct coding_system *coding)
aa72b389 944{
df7492f9
KH
945 if (BUFFERP (coding->src_object))
946 {
2cb26057 947 struct buffer *buf = XBUFFER (coding->src_object);
aa72b389 948
df7492f9 949 if (coding->src_pos < 0)
2cb26057 950 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
df7492f9 951 else
2cb26057 952 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
aa72b389 953 }
df7492f9 954 else if (STRINGP (coding->src_object))
aa72b389 955 {
8f924df7 956 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
aa72b389 957 }
df7492f9 958 else
f38b440c
PE
959 {
960 /* Otherwise, the source is C string and is never relocated
961 automatically. Thus we don't have to update anything. */
962 }
df7492f9 963}
aa72b389 964
5eb05ea3 965
c1892f11
PE
966/* Set coding->source from coding->src_object, and return how many
967 bytes coding->source was changed. */
5eb05ea3 968
8f50130c 969static ptrdiff_t
c1892f11 970coding_change_source (struct coding_system *coding)
df7492f9 971{
c1892f11
PE
972 const unsigned char *orig = coding->source;
973 coding_set_source (coding);
974 return coding->source - orig;
975}
976
5eb05ea3 977
c1892f11
PE
978/* Set coding->destination from coding->dst_object. */
979
980static void
981coding_set_destination (struct coding_system *coding)
982{
df7492f9 983 if (BUFFERP (coding->dst_object))
aa72b389 984 {
a0241d01 985 if (BUFFERP (coding->src_object) && coding->src_pos < 0)
aa72b389 986 {
13818c30 987 coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
28f67a95
KH
988 coding->dst_bytes = (GAP_END_ADDR
989 - (coding->src_bytes - coding->consumed)
990 - coding->destination);
aa72b389 991 }
df7492f9 992 else
28f67a95
KH
993 {
994 /* We are sure that coding->dst_pos_byte is before the gap
995 of the buffer. */
996 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
13818c30 997 + coding->dst_pos_byte - BEG_BYTE);
28f67a95
KH
998 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
999 - coding->destination);
1000 }
df7492f9
KH
1001 }
1002 else
f38b440c
PE
1003 {
1004 /* Otherwise, the destination is C string and is never relocated
1005 automatically. Thus we don't have to update anything. */
1006 }
c1892f11
PE
1007}
1008
1009
1010/* Set coding->destination from coding->dst_object, and return how
1011 many bytes coding->destination was changed. */
1012
1013static ptrdiff_t
1014coding_change_destination (struct coding_system *coding)
1015{
1016 const unsigned char *orig = coding->destination;
1017 coding_set_destination (coding);
5eb05ea3 1018 return coding->destination - orig;
df7492f9
KH
1019}
1020
1021
1022static void
d311d28c 1023coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
df7492f9 1024{
c9d624c6 1025 if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
d1f3d2af 1026 string_overflow ();
38182d90
PE
1027 coding->destination = xrealloc (coding->destination,
1028 coding->dst_bytes + bytes);
df7492f9
KH
1029 coding->dst_bytes += bytes;
1030}
1031
1032static void
cf84bb53 1033coding_alloc_by_making_gap (struct coding_system *coding,
d311d28c 1034 ptrdiff_t gap_head_used, ptrdiff_t bytes)
df7492f9 1035{
db274c7a 1036 if (EQ (coding->src_object, coding->dst_object))
df7492f9 1037 {
db274c7a
KH
1038 /* The gap may contain the produced data at the head and not-yet
1039 consumed data at the tail. To preserve those data, we at
1040 first make the gap size to zero, then increase the gap
1041 size. */
d311d28c 1042 ptrdiff_t add = GAP_SIZE;
db274c7a
KH
1043
1044 GPT += gap_head_used, GPT_BYTE += gap_head_used;
1045 GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
df7492f9
KH
1046 make_gap (bytes);
1047 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
db274c7a 1048 GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
df7492f9 1049 }
730fff51 1050 else
df7492f9 1051 {
2c78b7e1
KH
1052 Lisp_Object this_buffer;
1053
1054 this_buffer = Fcurrent_buffer ();
df7492f9
KH
1055 set_buffer_internal (XBUFFER (coding->dst_object));
1056 make_gap (bytes);
1057 set_buffer_internal (XBUFFER (this_buffer));
aa72b389 1058 }
df7492f9 1059}
8f924df7 1060
df7492f9
KH
1061
1062static unsigned char *
d311d28c 1063alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
cf84bb53 1064 unsigned char *dst)
df7492f9 1065{
d311d28c 1066 ptrdiff_t offset = dst - coding->destination;
df7492f9
KH
1067
1068 if (BUFFERP (coding->dst_object))
db274c7a
KH
1069 {
1070 struct buffer *buf = XBUFFER (coding->dst_object);
1071
1072 coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1073 }
aa72b389 1074 else
df7492f9 1075 coding_alloc_by_realloc (coding, nbytes);
df7492f9
KH
1076 coding_set_destination (coding);
1077 dst = coding->destination + offset;
1078 return dst;
1079}
aa72b389 1080
ff0dacd7
KH
1081/** Macros for annotations. */
1082
ff0dacd7
KH
/* Annotation data is stored in the array coding->charbuf in this
   format:
69a80ea3 1085 [ -LENGTH ANNOTATION_MASK NCHARS ... ]
ff0dacd7
KH
1086 LENGTH is the number of elements in the annotation.
1087 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
69a80ea3 1088 NCHARS is the number of characters in the text annotated.
ff0dacd7
KH
1089
   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
e951386e
KH
1094 ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1095
1096 NBYTES is the number of bytes specified in the header part of
1097 old-style emacs-mule encoding, or 0 for the other kind of
1098 composition.
1099
ff0dacd7 1100 METHOD is one of enum composition_method.
e951386e 1101
ad1746f5 1102 Optional COMPOSITION-COMPONENTS are characters and composition
ff0dacd7
KH
1103 rules.
1104
1105 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
e951386e
KH
1106 follows.
1107
1108 If ANNOTATION_MASK is 0, this annotation is just a space holder to
1109 recover from an invalid annotation, and should be skipped by
1110 produce_annotation. */
1111
/* Maximum length of the header of annotation data.  */
#define MAX_ANNOTATION_LENGTH 5

/* Append the three-element annotation header [ -LEN MASK NCHARS ] at
   BUF, advancing BUF past it, and set coding->annotated.  BUF must
   be an lvalue; a variable named `coding' must be in scope at the
   point of use.

   The expansion deliberately does not end with a semicolon, so that
   `ADD_ANNOTATION_DATA (...);' is exactly one statement and can be
   used as the body of an unbraced `if ... else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars) \
  do {                                              \
    *(buf)++ = -(len);                              \
    *(buf)++ = (mask);                              \
    *(buf)++ = (nchars);                            \
    coding->annotated = 1;                          \
  } while (0)
1122
/* Append a composition annotation at BUF, advancing BUF past it: the
   annotation header for CODING_ANNOTATE_COMPOSITION_MASK with length
   5 and NCHARS, followed by NBYTES and METHOD.  BUF must be an
   lvalue.  Every argument is parenthesized in the expansion, as in
   ADD_ANNOTATION_DATA.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1129
1130
69a80ea3
KH
/* Append a charset annotation at BUF, advancing BUF past it: the
   annotation header for CODING_ANNOTATE_CHARSET_MASK with length 4
   and NCHARS, followed by the charset ID.  BUF must be an lvalue.
   Every argument is parenthesized in the expansion, as in
   ADD_ANNOTATION_DATA.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1136
df7492f9
KH
1137\f
1138/*** 2. Emacs' internal format (emacs-utf-8) ***/
1139
1140
1141
1142\f
1143/*** 3. UTF-8 ***/
1144
1145/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
f10fe38f 1146 Return true if a text is encoded in UTF-8. */
df7492f9
KH
1147
/* Classification of a single octet C of a UTF-8 sequence by its high
   bits.  */

/* 0xxxxxxx (any value below 0x80): a complete one-octet sequence.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* 10xxxxxx: a trailing octet.  */
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
/* 110xxxxx: leading octet of a two-octet sequence.  */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
/* 1110xxxx: leading octet of a three-octet sequence.  */
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
/* 11110xxx: leading octet of a four-octet sequence.  */
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
/* 111110xx: leading octet of a five-octet sequence.  */
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)

/* The three octets of the UTF-8 signature (BOM).  */
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1158
f10fe38f 1159static bool
cf84bb53
JB
1160detect_coding_utf_8 (struct coding_system *coding,
1161 struct coding_detection_info *detect_info)
df7492f9 1162{
065e3595 1163 const unsigned char *src = coding->source, *src_base;
8f924df7 1164 const unsigned char *src_end = coding->source + coding->src_bytes;
f10fe38f 1165 bool multibytep = coding->src_multibyte;
d311d28c 1166 ptrdiff_t consumed_chars = 0;
f10fe38f
PE
1167 bool bom_found = 0;
1168 bool found = 0;
df7492f9 1169
ff0dacd7 1170 detect_info->checked |= CATEGORY_MASK_UTF_8;
df7492f9
KH
1171 /* A coding system of this category is always ASCII compatible. */
1172 src += coding->head_ascii;
1173
1174 while (1)
aa72b389 1175 {
df7492f9 1176 int c, c1, c2, c3, c4;
aa72b389 1177
065e3595 1178 src_base = src;
df7492f9 1179 ONE_MORE_BYTE (c);
065e3595 1180 if (c < 0 || UTF_8_1_OCTET_P (c))
df7492f9
KH
1181 continue;
1182 ONE_MORE_BYTE (c1);
065e3595 1183 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
df7492f9
KH
1184 break;
1185 if (UTF_8_2_OCTET_LEADING_P (c))
aa72b389 1186 {
a470d443 1187 found = 1;
df7492f9 1188 continue;
aa72b389 1189 }
df7492f9 1190 ONE_MORE_BYTE (c2);
065e3595 1191 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
df7492f9
KH
1192 break;
1193 if (UTF_8_3_OCTET_LEADING_P (c))
aa72b389 1194 {
a470d443
KH
1195 found = 1;
1196 if (src_base == coding->source
1197 && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1198 bom_found = 1;
df7492f9 1199 continue;
aa72b389 1200 }
df7492f9 1201 ONE_MORE_BYTE (c3);
065e3595 1202 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
df7492f9
KH
1203 break;
1204 if (UTF_8_4_OCTET_LEADING_P (c))
aa72b389 1205 {
a470d443 1206 found = 1;
df7492f9
KH
1207 continue;
1208 }
1209 ONE_MORE_BYTE (c4);
065e3595 1210 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
df7492f9
KH
1211 break;
1212 if (UTF_8_5_OCTET_LEADING_P (c))
1213 {
a470d443 1214 found = 1;
df7492f9
KH
1215 continue;
1216 }
1217 break;
aa72b389 1218 }
ff0dacd7 1219 detect_info->rejected |= CATEGORY_MASK_UTF_8;
df7492f9 1220 return 0;
aa72b389 1221
df7492f9 1222 no_more_source:
065e3595 1223 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
aa72b389 1224 {
ff0dacd7 1225 detect_info->rejected |= CATEGORY_MASK_UTF_8;
89528eb3 1226 return 0;
aa72b389 1227 }
a470d443
KH
1228 if (bom_found)
1229 {
1230 /* The first character 0xFFFE doesn't necessarily mean a BOM. */
1231 detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1232 }
1233 else
1234 {
1235 detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
0e17387a
KH
1236 if (found)
1237 detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
a470d443 1238 }
ff0dacd7 1239 return 1;
aa72b389
KH
1240}
1241
4ed46869 1242
b73bfc1c 1243static void
971de7fb 1244decode_coding_utf_8 (struct coding_system *coding)
b73bfc1c 1245{
8f924df7
KH
1246 const unsigned char *src = coding->source + coding->consumed;
1247 const unsigned char *src_end = coding->source + coding->src_bytes;
1248 const unsigned char *src_base;
69a80ea3
KH
1249 int *charbuf = coding->charbuf + coding->charbuf_used;
1250 int *charbuf_end = coding->charbuf + coding->charbuf_size;
d311d28c 1251 ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
f10fe38f 1252 bool multibytep = coding->src_multibyte;
a470d443 1253 enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
f10fe38f
PE
1254 bool eol_dos
1255 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 1256 int byte_after_cr = -1;
4ed46869 1257
a470d443
KH
1258 if (bom != utf_without_bom)
1259 {
1260 int c1, c2, c3;
1261
1262 src_base = src;
1263 ONE_MORE_BYTE (c1);
1264 if (! UTF_8_3_OCTET_LEADING_P (c1))
1265 src = src_base;
1266 else
1267 {
159bd5a2 1268 ONE_MORE_BYTE (c2);
a470d443
KH
1269 if (! UTF_8_EXTRA_OCTET_P (c2))
1270 src = src_base;
1271 else
1272 {
159bd5a2 1273 ONE_MORE_BYTE (c3);
a470d443
KH
1274 if (! UTF_8_EXTRA_OCTET_P (c3))
1275 src = src_base;
1276 else
1277 {
1278 if ((c1 != UTF_8_BOM_1)
1279 || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1280 src = src_base;
1281 else
1282 CODING_UTF_8_BOM (coding) = utf_without_bom;
1283 }
1284 }
1285 }
1286 }
1287 CODING_UTF_8_BOM (coding) = utf_without_bom;
1288
df7492f9 1289 while (1)
b73bfc1c 1290 {
df7492f9 1291 int c, c1, c2, c3, c4, c5;
ec6d2bb8 1292
df7492f9
KH
1293 src_base = src;
1294 consumed_chars_base = consumed_chars;
4af310db 1295
df7492f9 1296 if (charbuf >= charbuf_end)
b71f6f73
KH
1297 {
1298 if (byte_after_cr >= 0)
1299 src_base--;
1300 break;
1301 }
df7492f9 1302
119852e7
KH
1303 if (byte_after_cr >= 0)
1304 c1 = byte_after_cr, byte_after_cr = -1;
1305 else
1306 ONE_MORE_BYTE (c1);
065e3595
KH
1307 if (c1 < 0)
1308 {
1309 c = - c1;
1310 }
1a4990fb 1311 else if (UTF_8_1_OCTET_P (c1))
df7492f9 1312 {
2735d060 1313 if (eol_dos && c1 == '\r')
119852e7 1314 ONE_MORE_BYTE (byte_after_cr);
df7492f9 1315 c = c1;
4af310db 1316 }
df7492f9 1317 else
4af310db 1318 {
df7492f9 1319 ONE_MORE_BYTE (c2);
065e3595 1320 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
df7492f9
KH
1321 goto invalid_code;
1322 if (UTF_8_2_OCTET_LEADING_P (c1))
4af310db 1323 {
b0edb2c5
DL
1324 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1325 /* Reject overlong sequences here and below. Encoders
1326 producing them are incorrect, they can be misleading,
1327 and they mess up read/write invariance. */
1328 if (c < 128)
1329 goto invalid_code;
4af310db 1330 }
df7492f9 1331 else
aa72b389 1332 {
df7492f9 1333 ONE_MORE_BYTE (c3);
065e3595 1334 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
df7492f9
KH
1335 goto invalid_code;
1336 if (UTF_8_3_OCTET_LEADING_P (c1))
b0edb2c5
DL
1337 {
1338 c = (((c1 & 0xF) << 12)
1339 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
72fe1301
DL
1340 if (c < 0x800
1341 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
b0edb2c5
DL
1342 goto invalid_code;
1343 }
df7492f9
KH
1344 else
1345 {
1346 ONE_MORE_BYTE (c4);
065e3595 1347 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
df7492f9
KH
1348 goto invalid_code;
1349 if (UTF_8_4_OCTET_LEADING_P (c1))
b0edb2c5 1350 {
df7492f9
KH
1351 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1352 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
b0edb2c5
DL
1353 if (c < 0x10000)
1354 goto invalid_code;
1355 }
df7492f9
KH
1356 else
1357 {
1358 ONE_MORE_BYTE (c5);
065e3595 1359 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
df7492f9
KH
1360 goto invalid_code;
1361 if (UTF_8_5_OCTET_LEADING_P (c1))
1362 {
1363 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1364 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1365 | (c5 & 0x3F));
b0edb2c5 1366 if ((c > MAX_CHAR) || (c < 0x200000))
df7492f9
KH
1367 goto invalid_code;
1368 }
1369 else
1370 goto invalid_code;
1371 }
1372 }
aa72b389 1373 }
b73bfc1c 1374 }
df7492f9
KH
1375
1376 *charbuf++ = c;
1377 continue;
1378
1379 invalid_code:
1380 src = src_base;
1381 consumed_chars = consumed_chars_base;
1382 ONE_MORE_BYTE (c);
1383 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1384 coding->errors++;
aa72b389
KH
1385 }
1386
df7492f9
KH
1387 no_more_source:
1388 coding->consumed_char += consumed_chars_base;
1389 coding->consumed = src_base - coding->source;
1390 coding->charbuf_used = charbuf - coding->charbuf;
1391}
1392
1393
f10fe38f 1394static bool
971de7fb 1395encode_coding_utf_8 (struct coding_system *coding)
df7492f9 1396{
f10fe38f 1397 bool multibytep = coding->dst_multibyte;
df7492f9
KH
1398 int *charbuf = coding->charbuf;
1399 int *charbuf_end = charbuf + coding->charbuf_used;
1400 unsigned char *dst = coding->destination + coding->produced;
1401 unsigned char *dst_end = coding->destination + coding->dst_bytes;
d311d28c 1402 ptrdiff_t produced_chars = 0;
df7492f9
KH
1403 int c;
1404
a470d443
KH
1405 if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1406 {
1407 ASSURE_DESTINATION (3);
1408 EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1409 CODING_UTF_8_BOM (coding) = utf_without_bom;
1410 }
1411
df7492f9 1412 if (multibytep)
aa72b389 1413 {
df7492f9
KH
1414 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1415
1416 while (charbuf < charbuf_end)
b73bfc1c 1417 {
df7492f9 1418 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
8f924df7 1419
df7492f9
KH
1420 ASSURE_DESTINATION (safe_room);
1421 c = *charbuf++;
28f67a95
KH
1422 if (CHAR_BYTE8_P (c))
1423 {
1424 c = CHAR_TO_BYTE8 (c);
1425 EMIT_ONE_BYTE (c);
1426 }
1427 else
1428 {
db274c7a 1429 CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
28f67a95
KH
1430 for (p = str; p < pend; p++)
1431 EMIT_ONE_BYTE (*p);
1432 }
b73bfc1c 1433 }
aa72b389 1434 }
df7492f9
KH
1435 else
1436 {
1437 int safe_room = MAX_MULTIBYTE_LENGTH;
1438
1439 while (charbuf < charbuf_end)
b73bfc1c 1440 {
df7492f9
KH
1441 ASSURE_DESTINATION (safe_room);
1442 c = *charbuf++;
f03caae0
KH
1443 if (CHAR_BYTE8_P (c))
1444 *dst++ = CHAR_TO_BYTE8 (c);
1445 else
db274c7a 1446 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
df7492f9 1447 produced_chars++;
4ed46869
KH
1448 }
1449 }
065e3595 1450 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
1451 coding->produced_char += produced_chars;
1452 coding->produced = dst - coding->destination;
1453 return 0;
4ed46869
KH
1454}
1455
b73bfc1c 1456
df7492f9 1457/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
f10fe38f 1458 Return true if a text is encoded in one of UTF-16 based coding systems. */
aa72b389 1459
df7492f9
KH
/* True if the 16-bit code unit VAL is in 0xD800..0xDBFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) (((val) & 0xFC00) == 0xD800)

/* True if the 16-bit code unit VAL is in 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) (((val) & 0xFC00) == 0xDC00)
93dec019 1465
aa72b389 1466
f10fe38f 1467static bool
cf84bb53
JB
1468detect_coding_utf_16 (struct coding_system *coding,
1469 struct coding_detection_info *detect_info)
aa72b389 1470{
ef1b0ba7 1471 const unsigned char *src = coding->source;
8f924df7 1472 const unsigned char *src_end = coding->source + coding->src_bytes;
f10fe38f 1473 bool multibytep = coding->src_multibyte;
df7492f9 1474 int c1, c2;
aa72b389 1475
ff0dacd7 1476 detect_info->checked |= CATEGORY_MASK_UTF_16;
ff0dacd7 1477 if (coding->mode & CODING_MODE_LAST_BLOCK
24a73b0a 1478 && (coding->src_chars & 1))
ff0dacd7
KH
1479 {
1480 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1481 return 0;
1482 }
24a73b0a 1483
f56a4450 1484 TWO_MORE_BYTES (c1, c2);
df7492f9 1485 if ((c1 == 0xFF) && (c2 == 0xFE))
aa72b389 1486 {
b49a1807
KH
1487 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1488 | CATEGORY_MASK_UTF_16_AUTO);
24a73b0a
KH
1489 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1490 | CATEGORY_MASK_UTF_16_BE_NOSIG
1491 | CATEGORY_MASK_UTF_16_LE_NOSIG);
aa72b389 1492 }
df7492f9 1493 else if ((c1 == 0xFE) && (c2 == 0xFF))
ff0dacd7 1494 {
b49a1807
KH
1495 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1496 | CATEGORY_MASK_UTF_16_AUTO);
24a73b0a
KH
1497 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1498 | CATEGORY_MASK_UTF_16_BE_NOSIG
1499 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1500 }
220eeac9 1501 else if (c2 < 0)
f56a4450
KH
1502 {
1503 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1504 return 0;
1505 }
2f3cbb32 1506 else
24a73b0a 1507 {
2f3cbb32
KH
1508 /* We check the dispersion of Eth and Oth bytes where E is even and
1509 O is odd. If both are high, we assume binary data.*/
1510 unsigned char e[256], o[256];
1511 unsigned e_num = 1, o_num = 1;
1512
1513 memset (e, 0, 256);
1514 memset (o, 0, 256);
1515 e[c1] = 1;
1516 o[c2] = 1;
1517
cc13543e
KH
1518 detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1519 |CATEGORY_MASK_UTF_16_BE
1520 | CATEGORY_MASK_UTF_16_LE);
2f3cbb32 1521
7f1faf1c
KH
1522 while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1523 != CATEGORY_MASK_UTF_16)
2f3cbb32 1524 {
f56a4450 1525 TWO_MORE_BYTES (c1, c2);
220eeac9 1526 if (c2 < 0)
f56a4450 1527 break;
2f3cbb32
KH
1528 if (! e[c1])
1529 {
1530 e[c1] = 1;
1531 e_num++;
cc13543e
KH
1532 if (e_num >= 128)
1533 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
2f3cbb32
KH
1534 }
1535 if (! o[c2])
1536 {
977b85f4 1537 o[c2] = 1;
2f3cbb32 1538 o_num++;
cc13543e
KH
1539 if (o_num >= 128)
1540 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
2f3cbb32
KH
1541 }
1542 }
2f3cbb32 1543 return 0;
ff0dacd7 1544 }
2f3cbb32 1545
df7492f9 1546 no_more_source:
ff0dacd7 1547 return 1;
df7492f9 1548}
aa72b389 1549
df7492f9 1550static void
971de7fb 1551decode_coding_utf_16 (struct coding_system *coding)
df7492f9 1552{
8f924df7
KH
1553 const unsigned char *src = coding->source + coding->consumed;
1554 const unsigned char *src_end = coding->source + coding->src_bytes;
1555 const unsigned char *src_base;
69a80ea3 1556 int *charbuf = coding->charbuf + coding->charbuf_used;
df80c7f0
KH
1557 /* We may produces at most 3 chars in one loop. */
1558 int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
d311d28c 1559 ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
f10fe38f 1560 bool multibytep = coding->src_multibyte;
a470d443 1561 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
df7492f9
KH
1562 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1563 int surrogate = CODING_UTF_16_SURROGATE (coding);
f10fe38f
PE
1564 bool eol_dos
1565 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 1566 int byte_after_cr1 = -1, byte_after_cr2 = -1;
df7492f9 1567
a470d443 1568 if (bom == utf_with_bom)
aa72b389 1569 {
df7492f9 1570 int c, c1, c2;
4af310db 1571
aa72b389 1572 src_base = src;
df7492f9
KH
1573 ONE_MORE_BYTE (c1);
1574 ONE_MORE_BYTE (c2);
e19c3639 1575 c = (c1 << 8) | c2;
aa72b389 1576
b49a1807
KH
1577 if (endian == utf_16_big_endian
1578 ? c != 0xFEFF : c != 0xFFFE)
aa72b389 1579 {
b49a1807
KH
1580 /* The first two bytes are not BOM. Treat them as bytes
1581 for a normal character. */
1582 src = src_base;
1583 coding->errors++;
aa72b389 1584 }
a470d443 1585 CODING_UTF_16_BOM (coding) = utf_without_bom;
b49a1807 1586 }
a470d443 1587 else if (bom == utf_detect_bom)
b49a1807
KH
1588 {
1589 /* We have already tried to detect BOM and failed in
1590 detect_coding. */
a470d443 1591 CODING_UTF_16_BOM (coding) = utf_without_bom;
df7492f9 1592 }
aa72b389 1593
df7492f9
KH
1594 while (1)
1595 {
1596 int c, c1, c2;
1597
1598 src_base = src;
1599 consumed_chars_base = consumed_chars;
1600
df80c7f0 1601 if (charbuf >= charbuf_end)
b71f6f73
KH
1602 {
1603 if (byte_after_cr1 >= 0)
1604 src_base -= 2;
1605 break;
1606 }
df7492f9 1607
119852e7
KH
1608 if (byte_after_cr1 >= 0)
1609 c1 = byte_after_cr1, byte_after_cr1 = -1;
1610 else
1611 ONE_MORE_BYTE (c1);
065e3595
KH
1612 if (c1 < 0)
1613 {
1614 *charbuf++ = -c1;
1615 continue;
1616 }
119852e7
KH
1617 if (byte_after_cr2 >= 0)
1618 c2 = byte_after_cr2, byte_after_cr2 = -1;
1619 else
1620 ONE_MORE_BYTE (c2);
065e3595
KH
1621 if (c2 < 0)
1622 {
1623 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1624 *charbuf++ = -c2;
1625 continue;
1626 }
df7492f9 1627 c = (endian == utf_16_big_endian
e19c3639 1628 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
119852e7 1629
df7492f9 1630 if (surrogate)
fd3ae0b9 1631 {
df7492f9 1632 if (! UTF_16_LOW_SURROGATE_P (c))
fd3ae0b9 1633 {
df7492f9
KH
1634 if (endian == utf_16_big_endian)
1635 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1636 else
1637 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1638 *charbuf++ = c1;
1639 *charbuf++ = c2;
1640 coding->errors++;
1641 if (UTF_16_HIGH_SURROGATE_P (c))
1642 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
fd3ae0b9 1643 else
df7492f9 1644 *charbuf++ = c;
fd3ae0b9
KH
1645 }
1646 else
df7492f9
KH
1647 {
1648 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1649 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
29f7ffd0 1650 *charbuf++ = 0x10000 + c;
df7492f9 1651 }
fd3ae0b9 1652 }
aa72b389 1653 else
df7492f9
KH
1654 {
1655 if (UTF_16_HIGH_SURROGATE_P (c))
1656 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1657 else
119852e7 1658 {
2735d060 1659 if (eol_dos && c == '\r')
119852e7
KH
1660 {
1661 ONE_MORE_BYTE (byte_after_cr1);
1662 ONE_MORE_BYTE (byte_after_cr2);
1663 }
1664 *charbuf++ = c;
1665 }
8f924df7 1666 }
aa72b389 1667 }
df7492f9
KH
1668
1669 no_more_source:
1670 coding->consumed_char += consumed_chars_base;
1671 coding->consumed = src_base - coding->source;
1672 coding->charbuf_used = charbuf - coding->charbuf;
aa72b389 1673}
b73bfc1c 1674
f10fe38f 1675static bool
971de7fb 1676encode_coding_utf_16 (struct coding_system *coding)
df7492f9 1677{
f10fe38f 1678 bool multibytep = coding->dst_multibyte;
df7492f9
KH
1679 int *charbuf = coding->charbuf;
1680 int *charbuf_end = charbuf + coding->charbuf_used;
1681 unsigned char *dst = coding->destination + coding->produced;
1682 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1683 int safe_room = 8;
a470d443 1684 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
f10fe38f 1685 bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
d311d28c 1686 ptrdiff_t produced_chars = 0;
df7492f9 1687 int c;
4ed46869 1688
a470d443 1689 if (bom != utf_without_bom)
df7492f9
KH
1690 {
1691 ASSURE_DESTINATION (safe_room);
1692 if (big_endian)
df7492f9 1693 EMIT_TWO_BYTES (0xFE, 0xFF);
880cf180
KH
1694 else
1695 EMIT_TWO_BYTES (0xFF, 0xFE);
a470d443 1696 CODING_UTF_16_BOM (coding) = utf_without_bom;
df7492f9
KH
1697 }
1698
1699 while (charbuf < charbuf_end)
1700 {
1701 ASSURE_DESTINATION (safe_room);
1702 c = *charbuf++;
60afa08d 1703 if (c > MAX_UNICODE_CHAR)
e19c3639 1704 c = coding->default_char;
df7492f9
KH
1705
1706 if (c < 0x10000)
1707 {
1708 if (big_endian)
1709 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1710 else
1711 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1712 }
1713 else
1714 {
1715 int c1, c2;
1716
1717 c -= 0x10000;
1718 c1 = (c >> 10) + 0xD800;
1719 c2 = (c & 0x3FF) + 0xDC00;
1720 if (big_endian)
1721 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1722 else
1723 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1724 }
1725 }
065e3595 1726 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
1727 coding->produced = dst - coding->destination;
1728 coding->produced_char += produced_chars;
1729 return 0;
1730}
1731
1732\f
1733/*** 6. Old Emacs' internal format (emacs-mule) ***/
1734
1735/* Emacs' internal format for representation of multiple character
1736 sets is a kind of multi-byte encoding, i.e. characters are
1737 represented by variable-length sequences of one-byte codes.
1738
1739 ASCII characters and control characters (e.g. `tab', `newline') are
1740 represented by one-byte sequences which are their ASCII codes, in
1741 the range 0x00 through 0x7F.
1742
1743 8-bit characters of the range 0x80..0x9F are represented by
1744 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1745 code + 0x20).
1746
1747 8-bit characters of the range 0xA0..0xFF are represented by
1748 one-byte sequences which are their 8-bit code.
1749
1750 The other characters are represented by a sequence of `base
1751 leading-code', optional `extended leading-code', and one or two
1752 `position-code's. The length of the sequence is determined by the
1753 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1754 whereas extended leading-code and position-code take the range 0xA0
1755 through 0xFF. See `charset.h' for more details about leading-code
1756 and position-code.
1757
1758 --- CODE RANGE of Emacs' internal format ---
1759 character set range
1760 ------------- -----
1761 ascii 0x00..0x7F
1762 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
   eight-bit-graphic	0xA0..0xFF
1764 ELSE 0x81..0x9D + [0xA0..0xFF]+
1765 ---------------------------------------------
1766
1767 As this is the internal character representation, the format is
1768 usually not used externally (i.e. in a file or in a data sent to a
1769 process). But, it is possible to have a text externally in this
1770 format (i.e. by encoding by the coding system `emacs-mule').
1771
1772 In that case, a sequence of one-byte codes has a slightly different
1773 form.
1774
1775 At first, all characters in eight-bit-control are represented by
1776 one-byte sequences which are their 8-bit code.
1777
1778 Next, character composition data are represented by the byte
1779 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1780 where,
e951386e 1781 METHOD is 0xF2 plus one of composition method (enum
df7492f9
KH
1782 composition_method),
1783
1784 BYTES is 0xA0 plus a byte length of this composition data,
1785
e951386e 1786 CHARS is 0xA0 plus a number of characters composed by this
df7492f9
KH
1787 data,
1788
ad1746f5 1789 COMPONENTs are characters of multibyte form or composition
df7492f9
KH
1790 rules encoded by two-byte of ASCII codes.
1791
1792 In addition, for backward compatibility, the following formats are
1793 also recognized as composition data on decoding.
1794
1795 0x80 MSEQ ...
1796 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1797
1798 Here,
	MSEQ is a multibyte form but in this special format:
1800 ASCII: 0xA0 ASCII_CODE+0x80,
1801 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1802 RULE is a one byte code of the range 0xA0..0xF0 that
1803 represents a composition rule.
1804 */
1805
/* Table indexed by a byte value.  detect_coding_emacs_mule uses
   emacs_mule_bytes[C] - 1 as the number of bytes it expects after a
   byte C that is not below 0x80.  */
char emacs_mule_bytes[256];
1807
e951386e
KH
1808
1809/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Return true if a text is encoded in 'emacs-mule'.

   The scan covers CODING->source from just past its leading ASCII
   run (CODING->head_ascii) up to CODING->src_bytes.
   CATEGORY_MASK_EMACS_MULE is always added to DETECT_INFO->checked.
   When a byte sequence that cannot be emacs-mule is met, the same
   bit is added to DETECT_INFO->rejected and 0 is returned.  When the
   source is exhausted without such a sequence, the bits collected in
   FOUND are added to DETECT_INFO->found and 1 is returned.

   NOTE(review): ONE_MORE_BYTE is defined outside this chunk.  It is
   assumed to store the next source unit in its argument, to yield a
   negative value for a unit that is not a raw byte, and to jump to
   `no_more_source' when SRC reaches SRC_END -- confirm against its
   definition.  MULTIBYTEP and CONSUMED_CHARS are not used directly
   here; presumably ONE_MORE_BYTE refers to them.  */
e951386e 1811
f10fe38f 1812static bool
cf84bb53
JB
1813detect_coding_emacs_mule (struct coding_system *coding,
1814 struct coding_detection_info *detect_info)
e951386e
KH
1815{
1816 const unsigned char *src = coding->source, *src_base;
1817 const unsigned char *src_end = coding->source + coding->src_bytes;
f10fe38f 1818 bool multibytep = coding->src_multibyte;
d311d28c 1819 ptrdiff_t consumed_chars = 0;
e951386e
KH
1820 int c;
1821 int found = 0;
1822
1823 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1824 /* A coding system of this category is always ASCII compatible. */
1825 src += coding->head_ascii;
1826
1827 while (1)
1828 {
/* SRC_BASE marks the start of the sequence being examined; it is
   compared with SRC at `no_more_source' to tell whether the source
   ended in the middle of a sequence.  */
1829 src_base = src;
1830 ONE_MORE_BYTE (c);
1831 if (c < 0)
1832 continue;
1833 if (c == 0x80)
1834 {
1835 /* Perhaps the start of composite character. We simply skip
1836 it because analyzing it is too heavy for detecting. But,
1837 at least, we check that the composite character
1838 consists of more than 4 bytes. */
2735d060 1839 const unsigned char *src_start;
e951386e
KH
1840
1841 repeat:
2735d060 1842 src_start = src;
e951386e
KH
/* Swallow every following byte >= 0xA0; C ends up holding the first
   byte that is below 0xA0.  */
1843 do
1844 {
1845 ONE_MORE_BYTE (c);
1846 }
1847 while (c >= 0xA0);
1848
/* Too short to be composition data: leave the loop, which rejects.  */
2735d060 1849 if (src - src_start <= 4)
e951386e
KH
1850 break;
1851 found = CATEGORY_MASK_EMACS_MULE;
/* Another composition starts immediately: skip it the same way.  */
1852 if (c == 0x80)
1853 goto repeat;
1854 }
1855
1856 if (c < 0x80)
1857 {
/* ESC, SI and SO make the text be rejected as emacs-mule.  */
1858 if (c < 0x20
1859 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1860 break;
1861 }
1862 else
1863 {
/* C is treated as a leading byte: the table gives the total length
   of its sequence, so that many minus one bytes, all >= 0xA0, must
   follow.  */
396475b7 1864 int more_bytes = emacs_mule_bytes[c] - 1;
e951386e
KH
1865
1866 while (more_bytes > 0)
1867 {
1868 ONE_MORE_BYTE (c);
1869 if (c < 0xA0)
1870 {
1871 src--; /* Unread the last byte. */
1872 break;
1873 }
1874 more_bytes--;
1875 }
/* A sequence cut short by a byte below 0xA0 (or a table entry of 0,
   which leaves MORE_BYTES at -1) rejects the text.  */
1876 if (more_bytes != 0)
1877 break;
1878 found = CATEGORY_MASK_EMACS_MULE;
1879 }
1880 }
/* Reached only by `break' from the loop above: not emacs-mule.  */
1881 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1882 return 0;
1883
1884 no_more_source:
/* The source ran out.  If this is the last block and at least one
   byte of an unfinished sequence had been read, reject.  */
1885 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1886 {
1887 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1888 return 0;
1889 }
1890 detect_info->found |= found;
1891 return 1;
1892}
1893
1894
1895/* Parse emacs-mule multibyte sequence at SRC and return the decoded
   character.  If CMP_STATUS indicates that we must expect MSEQ or
   RULE described above, decode it and return the negative value of
   the decoded character or rule.  If an invalid byte is found, return
   -1.  If SRC is too short, return -2.

   On a successful return, *NBYTES is set to the number of bytes read
   from SRC and *NCHARS to the local counter CONSUMED_CHARS.  When a
   character (plain or MSEQ) is returned and ID is non-null, *ID is
   set to the id of its charset; when an old-form RULE byte is
   returned, *ID is left untouched.  Nothing is stored on the -1 and
   -2 returns.  The end of the source is taken from CODING->source
   and CODING->src_bytes, not from SRC.

   NOTE(review): ONE_MORE_BYTE and CODING_DECODE_CHAR are defined
   outside this chunk.  ONE_MORE_BYTE is assumed to jump to
   `no_more_source' at the end of the source, to yield a negative
   value for a unit that is not a raw byte, and to maintain
   CONSUMED_CHARS -- confirm.

   NOTE(review): an MSEQ is reported as -C, so a decoded character
   with code 1 or 2 would be returned as -1 or -2 and look like one
   of the error codes to the caller -- confirm whether that input
   can occur.  */
1900
e2f1bab9 1901static int
cf84bb53
JB
1902emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1903 int *nbytes, int *nchars, int *id,
1904 struct composition_status *cmp_status)
df7492f9 1905{
8f924df7
KH
1906 const unsigned char *src_end = coding->source + coding->src_bytes;
1907 const unsigned char *src_base = src;
f10fe38f 1908 bool multibytep = coding->src_multibyte;
2735d060 1909 int charset_ID;
df7492f9
KH
1910 unsigned code;
1911 int c;
1912 int consumed_chars = 0;
f10fe38f 1913 bool mseq_found = 0;
df7492f9
KH
1914
1915 ONE_MORE_BYTE (c);
/* A negative C is returned negated as the character itself, with the
   charset id taken from entry 0 of emacs_mule_charset.  */
065e3595 1916 if (c < 0)
df7492f9 1917 {
065e3595 1918 c = -c;
2735d060 1919 charset_ID = emacs_mule_charset[0];
065e3595
KH
1920 }
1921 else
1922 {
4d41e8b7
KH
/* A first byte >= 0xA0 is accepted only while an old-form
   composition is in progress; otherwise it is invalid.  */
1923 if (c >= 0xA0)
1924 {
e951386e
KH
1925 if (cmp_status->state != COMPOSING_NO
1926 && cmp_status->old_form)
4d41e8b7 1927 {
e951386e
KH
1928 if (cmp_status->state == COMPOSING_CHAR)
1929 {
/* An MSEQ is expected.  0xA0 introduces an ASCII code stored with
   0x80 added; any other byte is a leading code stored with 0x20
   added.  Undo the offset and parse the result as usual below.  */
1930 if (c == 0xA0)
1931 {
1932 ONE_MORE_BYTE (c);
1933 c -= 0x80;
1934 if (c < 0)
1935 goto invalid_code;
1936 }
1937 else
1938 c -= 0x20;
1939 mseq_found = 1;
1940 }
1941 else
1942 {
/* A character is not what is expected in this state, so C is handed
   back negated, as a RULE byte, without further decoding.  */
1943 *nbytes = src - src_base;
1944 *nchars = consumed_chars;
1945 return -c;
1946 }
4d41e8b7
KH
1947 }
1948 else
e951386e 1949 goto invalid_code;
4d41e8b7
KH
1950 }
1951
/* Dispatch on the total sequence length recorded for leading byte C.
   Each case fixes the charset id and assembles CODE from the low
   seven bits of the position bytes, every one of which must be
   >= 0xA0.  */
065e3595 1952 switch (emacs_mule_bytes[c])
b73bfc1c 1953 {
065e3595 1954 case 2:
2735d060 1955 if ((charset_ID = emacs_mule_charset[c]) < 0)
df7492f9
KH
1956 goto invalid_code;
1957 ONE_MORE_BYTE (c);
9ffd559c 1958 if (c < 0xA0)
065e3595 1959 goto invalid_code;
df7492f9 1960 code = c & 0x7F;
065e3595
KH
1961 break;
1962
1963 case 3:
/* After a private leading code the next byte selects the charset
   and one position byte follows; otherwise C itself selects the
   charset and two position bytes follow.  */
1964 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1965 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1966 {
1967 ONE_MORE_BYTE (c);
2735d060 1968 if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
065e3595
KH
1969 goto invalid_code;
1970 ONE_MORE_BYTE (c);
9ffd559c 1971 if (c < 0xA0)
065e3595
KH
1972 goto invalid_code;
1973 code = c & 0x7F;
1974 }
1975 else
1976 {
2735d060 1977 if ((charset_ID = emacs_mule_charset[c]) < 0)
065e3595
KH
1978 goto invalid_code;
1979 ONE_MORE_BYTE (c);
9ffd559c 1980 if (c < 0xA0)
065e3595
KH
1981 goto invalid_code;
1982 code = (c & 0x7F) << 8;
1983 ONE_MORE_BYTE (c);
9ffd559c 1984 if (c < 0xA0)
065e3595
KH
1985 goto invalid_code;
1986 code |= c & 0x7F;
1987 }
1988 break;
1989
1990 case 4:
/* The second byte selects the charset; two position bytes follow.  */
1991 ONE_MORE_BYTE (c);
2735d060 1992 if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
df7492f9
KH
1993 goto invalid_code;
1994 ONE_MORE_BYTE (c);
9ffd559c 1995 if (c < 0xA0)
065e3595 1996 goto invalid_code;
781d7a48 1997 code = (c & 0x7F) << 8;
df7492f9 1998 ONE_MORE_BYTE (c);
9ffd559c 1999 if (c < 0xA0)
065e3595 2000 goto invalid_code;
df7492f9 2001 code |= c & 0x7F;
065e3595 2002 break;
df7492f9 2003
065e3595
KH
2004 case 1:
/* A single byte is its own code, in ASCII or in eight-bit.  */
2005 code = c;
2735d060 2006 charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
065e3595 2007 break;
df7492f9 2008
065e3595 2009 default:
1088b922 2010 emacs_abort ();
065e3595 2011 }
/* Map (charset, CODE) to a character in C; a negative result means
   the code point is not valid.  */
b84ae584 2012 CODING_DECODE_CHAR (coding, src, src_base, src_end,
2735d060 2013 CHARSET_FROM_ID (charset_ID), code, c);
065e3595
KH
2014 if (c < 0)
2015 goto invalid_code;
df7492f9 2016 }
df7492f9
KH
2017 *nbytes = src - src_base;
2018 *nchars = consumed_chars;
ff0dacd7 2019 if (id)
2735d060 2020 *id = charset_ID;
e951386e 2021 return (mseq_found ? -c : c);
df7492f9
KH
2022
2023 no_more_source:
2024 return -2;
2025
2026 invalid_code:
2027 return -1;
2028}
2029
2030
e951386e 2031/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
df7492f9 2032
e951386e
KH
2033/* Handle these composition sequence ('|': the end of header elements,
2034 BYTES and CHARS >= 0xA0):
df7492f9 2035
e951386e
KH
2036 (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2037 (2) altchar composition: 0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2038 (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
df7492f9 2039
e951386e 2040 and these old form:
1a4990fb 2041
e951386e
KH
2042 (4) relative composition: 0x80 | MSEQ ... MSEQ
2043 (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
df7492f9 2044
e951386e
KH
2045 When the starter 0x80 and the following header elements are found,
2046 this annotation header is produced.
df7492f9 2047
e951386e 2048 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
df7492f9 2049
e951386e
KH
2050 NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2051 NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
df7492f9 2052
e951386e
KH
2053 Then, upon reading the following elements, these codes are produced
2054 until the composition end is found:
df7492f9 2055
e951386e
KH
2056 (1) CHAR ... CHAR
2057 (2) ALT ... ALT CHAR ... CHAR
2058 (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2059 (4) CHAR ... CHAR
2060 (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
4ed46869 2061
e951386e
KH
2062 When the composition end is found, LENGTH and NCHARS in the
2063 annotation header are updated as below:
b73bfc1c 2064
e951386e
KH
2065 (1) LENGTH: unchanged, NCHARS: unchanged
2066 (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2067 (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2068 (4) LENGTH: unchanged, NCHARS: number of CHARs
2069 (5) LENGTH: unchanged, NCHARS: number of CHARs
df7492f9 2070
e951386e
KH
2071 If an error is found while composing, the annotation header is
2072 changed to the original composition header (plus filler -1s) as
2073 below:
2074
2075 (1),(2),(3) [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2076 (5) [ 0x80 0xFF -1 -1 -1 ]
2077
2078 and the sequence [ -2 DECODED-RULE ] is changed to the original
2079 byte sequence as below:
2080 o the original byte sequence is B: [ B -1 ]
2081 o the original byte sequence is B1 B2: [ B1 B2 ]
2082
2083 Most of the routines are implemented by macros because many
2084 variables and labels in the caller decode_coding_emacs_mule must be
2085 accessible, and they are usually called just once (thus doesn't
2086 increase the size of compiled object). */
2087
/* Decode C, one byte of an Emacs 20 style composition sequence, as a
   composition rule and store the result in RULE.  C itself is
   rebased by 0xA0 in place.  Control jumps to the `invalid_code'
   label of the expanding function when the rebased value is outside
   0..80.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)			\
  do {									\
    int rule20_gref, rule20_nref;					\
									\
    c -= 0xA0;								\
    if (! (c >= 0 && c < 81))						\
      goto invalid_code;						\
    /* The byte packs two base-9 digits; digit 4 stands for 10.  */	\
    rule20_gref = c / 9;						\
    rule20_nref = c % 9;						\
    rule20_gref = (rule20_gref == 4 ? 10 : rule20_gref);		\
    rule20_nref = (rule20_nref == 4 ? 10 : rule20_nref);		\
    rule = COMPOSITION_ENCODE_RULE (rule20_gref, rule20_nref);		\
  } while (0)
2104
2105
/* Decode C together with the next source byte as a composition rule
   of an Emacs 21 style sequence, and store the result in RULE.  Each
   of the two bytes carries one value offset by 0x20; a value outside
   0..80 jumps to the `invalid_code' label of the expanding function.
   The second byte is fetched into C with ONE_MORE_BYTE.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)			\
  do {									\
    int rule21_gref, rule21_nref;					\
									\
    rule21_gref = c - 0x20;						\
    if (! (rule21_gref >= 0 && rule21_gref < 81))			\
      goto invalid_code;						\
    ONE_MORE_BYTE (c);							\
    rule21_nref = c - 0x20;						\
    if (! (rule21_nref >= 0 && rule21_nref < 81))			\
      goto invalid_code;						\
    rule = COMPOSITION_ENCODE_RULE (rule21_gref, rule21_nref);		\
  } while (0)
2123
2124
/* Start of Emacs 21 style format.  C already holds the METHOD byte
   (0xF2 plus the composition method); the next two source bytes are
   BYTES and CHARS, each offset by 0xA0, where BYTES is the byte
   length of this composition information and CHARS is the number of
   characters composed by this composition.

   BYTES - 0xA0 must be at least 3, and exactly 4 for a relative
   composition; CHARS - 0xA0 must be positive and below
   MAX_COMPOSITION_COMPONENTS.  Otherwise control jumps to the
   `invalid_code' label of the expanding function.  On success
   CMP_STATUS is set up and a composition annotation is appended to
   CHARBUF.  */

#define DECODE_EMACS_MULE_21_COMPOSITION()				\
  do {									\
    enum composition_method method = c - 0xF2;				\
    int nbytes, nchars;							\
									\
    ONE_MORE_BYTE (c);							\
    if (c < 0)								\
      goto invalid_code;						\
    nbytes = c - 0xA0;							\
    if (nbytes < 3)							\
      goto invalid_code;						\
    if (method == COMPOSITION_RELATIVE && nbytes != 4)			\
      goto invalid_code;						\
    ONE_MORE_BYTE (c);							\
    nchars = c - 0xA0;							\
    if (! (nchars > 0 && nchars < MAX_COMPOSITION_COMPONENTS))		\
      goto invalid_code;						\
    cmp_status->method = method;					\
    cmp_status->old_form = 0;						\
    cmp_status->state = (method == COMPOSITION_RELATIVE			\
			 ? COMPOSING_CHAR				\
			 : COMPOSING_COMPONENT_CHAR);			\
    cmp_status->nchars = nchars;					\
    cmp_status->ncomps = nbytes - 4;					\
    cmp_status->length = MAX_ANNOTATION_LENGTH;				\
    ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);		\
  } while (0)
93dec019 2156
aa72b389 2157
/* Start of Emacs 20 style format for relative composition: mark
   CMP_STATUS as an old-form relative composition that expects a
   character next, with zero characters and components so far, and
   append a composition annotation with zero NCHARS and NBYTES to
   CHARBUF.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()			\
  do {									\
    cmp_status->method = COMPOSITION_RELATIVE;				\
    cmp_status->old_form = 1;						\
    cmp_status->state = COMPOSING_CHAR;					\
    cmp_status->ncomps = 0;						\
    cmp_status->nchars = 0;						\
    cmp_status->length = MAX_ANNOTATION_LENGTH;				\
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);		\
  } while (0)
2169
2170
/* Start of Emacs 20 style format for rule-base composition: mark
   CMP_STATUS as an old-form COMPOSITION_WITH_RULE composition that
   expects a character next, with zero characters and components so
   far, and append a composition annotation with zero NCHARS and
   NBYTES to CHARBUF.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()			\
  do {									\
    cmp_status->method = COMPOSITION_WITH_RULE;				\
    cmp_status->old_form = 1;						\
    cmp_status->state = COMPOSING_CHAR;					\
    cmp_status->ncomps = 0;						\
    cmp_status->nchars = 0;						\
    cmp_status->length = MAX_ANNOTATION_LENGTH;				\
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);		\
  } while (0)
2182
2183
/* Read the byte that follows a composition starter into C and begin
   the matching kind of composition:

     0xF2 + a method in COMPOSITION_RELATIVE ..
       COMPOSITION_WITH_RULE_ALTCHARS	Emacs 21 style
     0xA0 .. 0xBF			Emacs 20 style, relative
     0xFF				Emacs 20 style, rule-base

   In the Emacs 20 relative case SRC is moved back so that the byte
   just read is read again as the first composition component.  Any
   other byte, or a negative C, jumps to the `invalid_code' label of
   the expanding function.  */

#define DECODE_EMACS_MULE_COMPOSITION_START()				\
  do {									\
    const unsigned char *starter_end = src;				\
									\
    ONE_MORE_BYTE (c);							\
    if (c < 0)								\
      goto invalid_code;						\
    if (COMPOSITION_RELATIVE <= c - 0xF2				\
	&& c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)			\
      DECODE_EMACS_MULE_21_COMPOSITION ();				\
    else if (c >= 0xA0 && c < 0xC0)					\
      {									\
	DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();			\
	/* Hand the byte back; it is a component, not a header.  */	\
	src = starter_end;						\
      }									\
    else if (c == 0xFF)							\
      DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();			\
    else								\
      goto invalid_code;						\
  } while (0)
2207
/* Close a composition that ended normally.  The annotation header
   starts CMP_STATUS->length ints before CHARBUF.  For an old-form
   composition its third slot receives the number of characters
   collected; for a new-form composition whose method is above
   COMPOSITION_RELATIVE its first slot becomes the third slot minus
   CMP_STATUS->length.  In every case the state returns to
   COMPOSING_NO.  */

#define EMACS_MULE_COMPOSITION_END()					\
  do {									\
    int *cmp_header = charbuf - cmp_status->length;			\
									\
    if (cmp_status->old_form)						\
      cmp_header[2] = cmp_status->nchars;				\
    else if (cmp_status->method > COMPOSITION_RELATIVE)			\
      cmp_header[0] = cmp_header[2] - cmp_status->length;		\
    cmp_status->state = COMPOSING_NO;					\
  } while (0)
2218
2219
2220static int
cf84bb53
JB
2221emacs_mule_finish_composition (int *charbuf,
2222 struct composition_status *cmp_status)
e951386e
KH
2223{
2224 int idx = - cmp_status->length;
2225 int new_chars;
2226
2227 if (cmp_status->old_form && cmp_status->nchars > 0)
2228 {
2229 charbuf[idx + 2] = cmp_status->nchars;
2230 new_chars = 0;
2231 if (cmp_status->method == COMPOSITION_WITH_RULE
2232 && cmp_status->state == COMPOSING_CHAR)
2233 {
2234 /* The last rule was invalid. */
2235 int rule = charbuf[-1] + 0xA0;
2236
2237 charbuf[-2] = BYTE8_TO_CHAR (rule);
2238 charbuf[-1] = -1;
2239 new_chars = 1;
2240 }
2241 }
2242 else
2243 {
2244 charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2245
2246 if (cmp_status->method == COMPOSITION_WITH_RULE)
2247 {
2248 charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2249 charbuf[idx++] = -3;
2250 charbuf[idx++] = 0;
2251 new_chars = 1;
2252 }
2253 else
2254 {
2255 int nchars = charbuf[idx + 1] + 0xA0;
2256 int nbytes = charbuf[idx + 2] + 0xA0;
2257
2258 charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2259 charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2260 charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2261 charbuf[idx++] = -1;
2262 new_chars = 4;
2263 }
2264 }
2265 cmp_status->state = COMPOSING_NO;
2266 return new_chars;
2267}
2268
/* If a composition is in progress, terminate it with
   emacs_mule_finish_composition and add the value it returns to
   CHAR_OFFSET.  Does nothing when the state is COMPOSING_NO.  */

#define EMACS_MULE_MAYBE_FINISH_COMPOSITION()				\
  do {									\
    if (! (cmp_status->state == COMPOSING_NO))				\
      {									\
	char_offset							\
	  += emacs_mule_finish_composition (charbuf, cmp_status);	\
      }									\
  } while (0)
2274
aa72b389
KH
2275
/* Decode the emacs-mule bytes of CODING->source, starting at offset
   CODING->consumed, into CODING->charbuf, starting at index
   CODING->charbuf_used.  Decoded characters are stored as ints,
   interleaved with charset and composition annotations.

   A byte sequence that cannot be decoded is not dropped: its first
   byte is stored as itself if ASCII, or as BYTE8_TO_CHAR of it
   otherwise, CODING->errors is incremented, and decoding resumes at
   the following byte.

   If the source runs out while a composition is in progress, the
   composition is terminated when this is the last block; otherwise
   the ints produced for it so far are saved in
   CMP_STATUS->carryover and put back into CHARBUF at the start of
   the next call.

   On return CODING->consumed_char, CODING->consumed and
   CODING->charbuf_used describe how far decoding got.

   NOTE(review): ONE_MORE_BYTE, ADD_CHARSET_DATA and
   ADD_COMPOSITION_DATA are defined outside this chunk.
   ONE_MORE_BYTE is assumed to jump to `no_more_source' at the end
   of the source and to yield a negative value for a unit that is
   not a raw byte -- confirm.  */
2276static void
971de7fb 2277decode_coding_emacs_mule (struct coding_system *coding)
aa72b389 2278{
8f924df7
KH
2279 const unsigned char *src = coding->source + coding->consumed;
2280 const unsigned char *src_end = coding->source + coding->src_bytes;
2281 const unsigned char *src_base;
69a80ea3 2282 int *charbuf = coding->charbuf + coding->charbuf_used;
ad1746f5
KH
2283 /* We may produce two annotations (charset and composition) in one
2284 loop and one more charset annotation at the end. */
69a80ea3 2285 int *charbuf_end
15cbd324
EZ
2286 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2287 /* We can produce up to 2 characters in a loop. */
2288 - 1;
d311d28c 2289 ptrdiff_t consumed_chars = 0, consumed_chars_base;
f10fe38f 2290 bool multibytep = coding->src_multibyte;
d311d28c
PE
2291 ptrdiff_t char_offset = coding->produced_char;
2292 ptrdiff_t last_offset = char_offset;
ff0dacd7 2293 int last_id = charset_ascii;
f10fe38f
PE
2294 bool eol_dos
2295 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 2296 int byte_after_cr = -1;
e951386e 2297 struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
aa72b389 2298
e951386e
KH
/* A composition was left unfinished by the previous call: put the
   saved ints back in front of what this call produces.  */
2299 if (cmp_status->state != COMPOSING_NO)
2300 {
2301 int i;
2302
15cbd324 2303 if (charbuf_end - charbuf < cmp_status->length)
1088b922 2304 emacs_abort ();
e951386e
KH
2305 for (i = 0; i < cmp_status->length; i++)
2306 *charbuf++ = cmp_status->carryover[i];
2307 coding->annotated = 1;
2308 }
2309
aa72b389
KH
2310 while (1)
2311 {
ee05f961 2312 int c, id IF_LINT (= 0);
df7492f9 2313
/* Remember where this iteration starts; these two values are what
   is reported as consumed, and what `invalid_code' rewinds to.  */
aa72b389 2314 src_base = src;
df7492f9
KH
2315 consumed_chars_base = consumed_chars;
2316
/* Out of room.  A byte already read ahead after a CR has not been
   processed yet, so it is given back by moving SRC_BASE.  */
2317 if (charbuf >= charbuf_end)
b71f6f73
KH
2318 {
2319 if (byte_after_cr >= 0)
2320 src_base--;
2321 break;
2322 }
aa72b389 2323
119852e7
KH
2324 if (byte_after_cr >= 0)
2325 c = byte_after_cr, byte_after_cr = -1;
2326 else
2327 ONE_MORE_BYTE (c);
e951386e
KH
2328
/* Either a negative C, which is stored negated as one character, or
   the composition starter 0x80.  Both terminate any composition in
   progress first.  */
2329 if (c < 0 || c == 0x80)
065e3595 2330 {
e951386e
KH
2331 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2332 if (c < 0)
2333 {
2334 *charbuf++ = -c;
2335 char_offset++;
2336 }
2337 else
2338 DECODE_EMACS_MULE_COMPOSITION_START ();
2339 continue;
065e3595 2340 }
e951386e
KH
2341
2342 if (c < 0x80)
aa72b389 2343 {
/* With DOS end-of-line conversion the byte after a CR is read ahead
   and kept in BYTE_AFTER_CR for the next iteration.  */
2735d060 2344 if (eol_dos && c == '\r')
119852e7 2345 ONE_MORE_BYTE (byte_after_cr);
e951386e
KH
2346 id = charset_ascii;
/* A plain ASCII byte terminates an old-form composition; in a
   new-form composition that is reading components it counts as one
   component byte.  */
2347 if (cmp_status->state != COMPOSING_NO)
2348 {
2349 if (cmp_status->old_form)
2350 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2351 else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2352 cmp_status->ncomps--;
2353 }
2354 }
2355 else
2356 {
ee05f961 2357 int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
75f80e63
EZ
2358 /* emacs_mule_char can load a charset map from a file, which
2359 allocates a large structure and might cause buffer text
2360 to be relocated as result. Thus, we need to remember the
ad1746f5 2361 original pointer to buffer text, and fix up all related
75f80e63
EZ
2362 pointers after the call. */
2363 const unsigned char *orig = coding->source;
d311d28c 2364 ptrdiff_t offset;
e951386e
KH
2365
/* Re-parse the whole sequence from SRC_BASE.  */
2366 c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2367 cmp_status);
75f80e63
EZ
2368 offset = coding->source - orig;
2369 if (offset)
2370 {
2371 src += offset;
2372 src_base += offset;
2373 src_end += offset;
2374 }
e951386e
KH
/* -1 is an invalid sequence and -2 a truncated one; any other
   negative value is an old-form component or rule, handled below.  */
2375 if (c < 0)
2376 {
2377 if (c == -1)
2378 goto invalid_code;
2379 if (c == -2)
2380 break;
2381 }
2382 src = src_base + nbytes;
2383 consumed_chars = consumed_chars_base + nchars;
2384 if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2385 cmp_status->ncomps -= nchars;
2386 }
2387
ad1746f5 2388 /* Now if C >= 0, we found a normally encoded character, if C <
e951386e
KH
2389 0, we found an old-style composition component character or
2390 rule. */
2391
2392 if (cmp_status->state == COMPOSING_NO)
2393 {
/* Outside a composition: when the charset changes, close the run of
   the previous non-ASCII charset with a charset annotation.  */
2394 if (last_id != id)
2395 {
2396 if (last_id != charset_ascii)
2397 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2398 last_id);
2399 last_id = id;
2400 last_offset = char_offset;
2401 }
df7492f9
KH
2402 *charbuf++ = c;
2403 char_offset++;
aa72b389 2404 }
e951386e 2405 else if (cmp_status->state == COMPOSING_CHAR)
df7492f9 2406 {
e951386e
KH
2407 if (cmp_status->old_form)
2408 {
/* Old form: a normally encoded character terminates the
   composition; an MSEQ (negative C) is one more composed character,
   followed by a rule in a rule-base composition.  */
2409 if (c >= 0)
2410 {
2411 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2412 *charbuf++ = c;
2413 char_offset++;
2414 }
2415 else
2416 {
2417 *charbuf++ = -c;
2418 cmp_status->nchars++;
2419 cmp_status->length++;
2420 if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2421 EMACS_MULE_COMPOSITION_END ();
2422 else if (cmp_status->method == COMPOSITION_WITH_RULE)
2423 cmp_status->state = COMPOSING_RULE;
2424 }
2425 }
df7492f9 2426 else
e951386e
KH
2427 {
/* New form: count down the characters announced in the header.  */
2428 *charbuf++ = c;
2429 cmp_status->length++;
2430 cmp_status->nchars--;
2431 if (cmp_status->nchars == 0)
2432 EMACS_MULE_COMPOSITION_END ();
2433 }
df7492f9 2434 }
e951386e 2435 else if (cmp_status->state == COMPOSING_RULE)
df7492f9 2436 {
e951386e 2437 int rule;
ff0dacd7 2438
/* Old-form rule-base composition expecting a rule: a normally
   encoded character ends the composition instead; a rule is stored
   as the pair -2, RULE.  */
e951386e 2439 if (c >= 0)
df7492f9 2440 {
e951386e
KH
2441 EMACS_MULE_COMPOSITION_END ();
2442 *charbuf++ = c;
2443 char_offset++;
df7492f9 2444 }
e951386e 2445 else
ff0dacd7 2446 {
e951386e
KH
2447 c = -c;
2448 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2449 if (rule < 0)
2450 goto invalid_code;
2451 *charbuf++ = -2;
2452 *charbuf++ = rule;
2453 cmp_status->length += 2;
2454 cmp_status->state = COMPOSING_CHAR;
ff0dacd7 2455 }
e951386e
KH
2456 }
2457 else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2458 {
/* New form, reading components: NCOMPS reaching 0 switches to
   reading the composed characters; a negative NCOMPS terminates the
   composition.  */
df7492f9 2459 *charbuf++ = c;
e951386e
KH
2460 cmp_status->length++;
2461 if (cmp_status->ncomps == 0)
2462 cmp_status->state = COMPOSING_CHAR;
2463 else if (cmp_status->ncomps > 0)
2464 {
2465 if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2466 cmp_status->state = COMPOSING_COMPONENT_RULE;
2467 }
2468 else
2469 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
df7492f9 2470 }
e951386e
KH
2471 else /* COMPOSING_COMPONENT_RULE */
2472 {
2473 int rule;
2474
2475 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2476 if (rule < 0)
2477 goto invalid_code;
2478 *charbuf++ = -2;
2479 *charbuf++ = rule;
2480 cmp_status->length += 2;
2481 cmp_status->ncomps--;
2482 if (cmp_status->ncomps > 0)
2483 cmp_status->state = COMPOSING_COMPONENT_CHAR;
2484 else
2485 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2486 }
2487 continue;
2488
df7492f9 2489 invalid_code:
/* Rewind to the start of this iteration and emit just its first
   byte as a character, counting one error.  */
e951386e 2490 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
df7492f9
KH
2491 src = src_base;
2492 consumed_chars = consumed_chars_base;
2493 ONE_MORE_BYTE (c);
2494 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 2495 char_offset++;
df7492f9
KH
2496 coding->errors++;
2497 }
2498
2499 no_more_source:
e951386e
KH
/* Also reached by `break' from the loop.  An unfinished composition
   is terminated on the last block, otherwise its ints are removed
   from CHARBUF and saved for the next call.  */
2500 if (cmp_status->state != COMPOSING_NO)
2501 {
2502 if (coding->mode & CODING_MODE_LAST_BLOCK)
2503 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2504 else
2505 {
2506 int i;
2507
2508 charbuf -= cmp_status->length;
2509 for (i = 0; i < cmp_status->length; i++)
2510 cmp_status->carryover[i] = charbuf[i];
2511 }
2512 }
ff0dacd7 2513 if (last_id != charset_ascii)
69a80ea3 2514 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
/* Report progress up to the start of the last, unfinished iteration.  */
2515 coding->consumed_char += consumed_chars_base;
2516 coding->consumed = src_base - coding->source;
2517 coding->charbuf_used = charbuf - coding->charbuf;
2518}
2519
2520
/* Store in CODES (an array of at least two bytes) the emacs-mule
   leading code(s) for the charset whose emacs-mule id is ID:

     ID < 0xA0		CODES = { ID, 0 }	(ID is the leading code)
     0xA0 <= ID < 0xE0	CODES = { 0x9A, ID }
     0xE0 <= ID < 0xF0	CODES = { 0x9B, ID }
     0xF0 <= ID < 0xF5	CODES = { 0x9C, ID }
     otherwise		CODES = { 0x9D, ID }

   ID is evaluated exactly once and both arguments are parenthesized.
   The expansion deliberately has no trailing semicolon, so that
   `EMACS_MULE_LEADING_CODES (id, codes);' is a single statement and
   can be the body of an `if' that is followed by `else'.  */

#define EMACS_MULE_LEADING_CODES(id, codes)				\
  do {									\
    const int leading_codes_id_ = (id);					\
									\
    if (leading_codes_id_ < 0xA0)					\
      (codes)[0] = leading_codes_id_, (codes)[1] = 0;			\
    else if (leading_codes_id_ < 0xE0)					\
      (codes)[0] = 0x9A, (codes)[1] = leading_codes_id_;		\
    else if (leading_codes_id_ < 0xF0)					\
      (codes)[0] = 0x9B, (codes)[1] = leading_codes_id_;		\
    else if (leading_codes_id_ < 0xF5)					\
      (codes)[0] = 0x9C, (codes)[1] = leading_codes_id_;		\
    else								\
      (codes)[0] = 0x9D, (codes)[1] = leading_codes_id_;		\
  } while (0)
2534
aa72b389 2535
/* Encode the CODING->charbuf_used ints of CODING->charbuf as
   emacs-mule bytes, written to CODING->destination starting at
   offset CODING->produced.

   Negative ints in CHARBUF introduce annotations.  A charset
   annotation selects a preferred charset for the characters that
   follow, provided that charset is in the charset list; a
   composition annotation is skipped (not yet implemented).  ASCII
   characters and eight-bit characters are emitted as one byte.  Any
   other character is emitted as the leading code(s) of its charset
   followed by one or two position bytes with the high bit set.  A
   character that belongs to no charset of the list is replaced by
   CODING->default_char.

   As a side effect, the charset list in the coding system's
   attribute vector is replaced by Vemacs_mule_charset_list if it is
   not already that list.  On completion CODING->produced_char and
   CODING->produced are updated and the conversion result is
   recorded as CODING_RESULT_SUCCESS.

   The only return statement written here returns 0.
   NOTE(review): ASSURE_DESTINATION and the EMIT_* macros are
   defined outside this chunk; presumably they use DST, DST_END,
   MULTIBYTEP and PRODUCED_CHARS, and may leave the function on
   their own -- confirm.  */
f10fe38f 2536static bool
971de7fb 2537encode_coding_emacs_mule (struct coding_system *coding)
df7492f9 2538{
f10fe38f 2539 bool multibytep = coding->dst_multibyte;
df7492f9
KH
2540 int *charbuf = coding->charbuf;
2541 int *charbuf_end = charbuf + coding->charbuf_used;
2542 unsigned char *dst = coding->destination + coding->produced;
2543 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2544 int safe_room = 8;
d311d28c 2545 ptrdiff_t produced_chars = 0;
24a73b0a 2546 Lisp_Object attrs, charset_list;
df7492f9 2547 int c;
ff0dacd7 2548 int preferred_charset_id = -1;
df7492f9 2549
24a73b0a 2550 CODING_GET_INFO (coding, attrs, charset_list);
eccb6815
KH
/* Always encode against Vemacs_mule_charset_list, and record that in
   the coding system's attributes.  */
2551 if (! EQ (charset_list, Vemacs_mule_charset_list))
2552 {
4939150c
PE
2553 charset_list = Vemacs_mule_charset_list;
2554 ASET (attrs, coding_attr_charset_list, charset_list);
eccb6815 2555 }
df7492f9
KH
2556
2557 while (charbuf < charbuf_end)
2558 {
2559 ASSURE_DESTINATION (safe_room);
2560 c = *charbuf++;
ff0dacd7
KH
2561
2562 if (c < 0)
2563 {
2564 /* Handle an annotation. */
2565 switch (*charbuf)
2566 {
2567 case CODING_ANNOTATE_COMPOSITION_MASK:
2568 /* Not yet implemented. */
2569 break;
2570 case CODING_ANNOTATE_CHARSET_MASK:
/* Keep the annotated charset as the preferred one only if it is a
   member of the charset list.  */
2571 preferred_charset_id = charbuf[3];
2572 if (preferred_charset_id >= 0
2573 && NILP (Fmemq (make_number (preferred_charset_id),
2574 charset_list)))
2575 preferred_charset_id = -1;
2576 break;
2577 default:
1088b922 2578 emacs_abort ();
ff0dacd7
KH
2579 }
/* -C is the length of the annotation; its first element has already
   been consumed, so skip the remaining ones.  */
2580 charbuf += -c - 1;
2581 continue;
2582 }
2583
df7492f9
KH
2584 if (ASCII_CHAR_P (c))
2585 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
2586 else if (CHAR_BYTE8_P (c))
2587 {
2588 c = CHAR_TO_BYTE8 (c);
2589 EMIT_ONE_BYTE (c);
2590 }
df7492f9 2591 else
aa72b389 2592 {
df7492f9
KH
2593 struct charset *charset;
2594 unsigned code;
2595 int dimension;
2596 int emacs_mule_id;
2597 unsigned char leading_codes[2];
2598
ff0dacd7
KH
/* Try the preferred charset first; fall back to searching the
   charset list when it cannot encode C.  */
2599 if (preferred_charset_id >= 0)
2600 {
f10fe38f 2601 bool result;
5eb05ea3 2602
ff0dacd7 2603 charset = CHARSET_FROM_ID (preferred_charset_id);
5eb05ea3
KH
2604 CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2605 if (result)
905ca9d2
KH
2606 code = ENCODE_CHAR (charset, c);
2607 else
5eb05ea3
KH
2608 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2609 &code, charset);
ff0dacd7
KH
2610 }
2611 else
5eb05ea3
KH
2612 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2613 &code, charset);
df7492f9
KH
/* No charset of the list contains C: substitute the default
   character.
   NOTE(review): if the second lookup also leaves CHARSET null,
   CHARSET_DIMENSION below is applied to a null pointer -- confirm
   that the default character is always encodable.  */
2614 if (! charset)
2615 {
2616 c = coding->default_char;
2617 if (ASCII_CHAR_P (c))
2618 {
2619 EMIT_ONE_ASCII_BYTE (c);
2620 continue;
2621 }
5eb05ea3
KH
2622 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2623 &code, charset);
df7492f9
KH
2624 }
2625 dimension = CHARSET_DIMENSION (charset);
2626 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
/* One or two leading codes, then the position code(s) with the high
   bit of every byte set.  */
2627 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2628 EMIT_ONE_BYTE (leading_codes[0]);
2629 if (leading_codes[1])
2630 EMIT_ONE_BYTE (leading_codes[1]);
2631 if (dimension == 1)
1fa663f9 2632 EMIT_ONE_BYTE (code | 0x80);
aa72b389 2633 else
df7492f9 2634 {
1fa663f9 2635 code |= 0x8080;
df7492f9
KH
2636 EMIT_ONE_BYTE (code >> 8);
2637 EMIT_ONE_BYTE (code & 0xFF);
2638 }
aa72b389 2639 }
aa72b389 2640 }
065e3595 2641 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
2642 coding->produced_char += produced_chars;
2643 coding->produced = dst - coding->destination;
2644 return 0;
aa72b389 2645}
b73bfc1c 2646
4ed46869 2647\f
df7492f9 2648/*** 7. ISO2022 handlers ***/
4ed46869
KH
2649
2650/* The following note describes the coding system ISO2022 briefly.
39787efd 2651 Since the intention of this note is to help understand the
5a936b46 2652 functions in this file, some parts are NOT ACCURATE or are OVERLY
39787efd 2653 SIMPLIFIED. For thorough understanding, please refer to the
5a936b46 2654 original document of ISO2022. This is equivalent to the standard
cfb43547 2655 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
4ed46869
KH
2656
2657 ISO2022 provides many mechanisms to encode several character sets
cfb43547 2658 in 7-bit and 8-bit environments. For 7-bit environments, all text
39787efd
KH
2659 is encoded using bytes less than 128. This may make the encoded
2660 text a little bit longer, but the text passes more easily through
cfb43547 2661 several types of gateway, some of which strip off the MSB (Most
8ca3766a 2662 Significant Bit).
b73bfc1c 2663
cfb43547
DL
2664 There are two kinds of character sets: control character sets and
2665 graphic character sets. The former contain control characters such
4ed46869 2666 as `newline' and `escape' to provide control functions (control
39787efd 2667 functions are also provided by escape sequences). The latter
cfb43547 2668 contain graphic characters such as 'A' and '-'. Emacs recognizes
4ed46869
KH
2669 two control character sets and many graphic character sets.
2670
2671 Graphic character sets are classified into one of the following
39787efd
KH
2672 four classes, according to the number of bytes (DIMENSION) and
2673 number of characters in one dimension (CHARS) of the set:
2674 - DIMENSION1_CHARS94
2675 - DIMENSION1_CHARS96
2676 - DIMENSION2_CHARS94
2677 - DIMENSION2_CHARS96
2678
2679 In addition, each character set is assigned an identification tag,
cfb43547 2680 unique for each set, called the "final character" (denoted as <F>
39787efd
KH
2681 hereafter). The <F> of each character set is decided by ECMA(*)
2682 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2683 (0x30..0x3F are for private use only).
4ed46869
KH
2684
2685 Note (*): ECMA = European Computer Manufacturers Association
2686
cfb43547 2687 Here are examples of graphic character sets [NAME(<F>)]:
4ed46869
KH
2688 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2689 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2690 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2691 o DIMENSION2_CHARS96 -- none for the moment
2692
39787efd 2693 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4ed46869
KH
2694 C0 [0x00..0x1F] -- control character plane 0
2695 GL [0x20..0x7F] -- graphic character plane 0
2696 C1 [0x80..0x9F] -- control character plane 1
2697 GR [0xA0..0xFF] -- graphic character plane 1
2698
2699 A control character set is directly designated and invoked to C0 or
39787efd
KH
2700 C1 by an escape sequence. The most common case is that:
2701 - ISO646's control character set is designated/invoked to C0, and
2702 - ISO6429's control character set is designated/invoked to C1,
2703 and usually these designations/invocations are omitted in encoded
2704 text. In a 7-bit environment, only C0 can be used, and a control
2705 character for C1 is encoded by an appropriate escape sequence to
2706 fit into the environment. All control characters for C1 are
2707 defined to have corresponding escape sequences.
4ed46869
KH
2708
2709 A graphic character set is at first designated to one of four
2710 graphic registers (G0 through G3), then these graphic registers are
2711 invoked to GL or GR. These designations and invocations can be
2712 done independently. The most common case is that G0 is invoked to
39787efd
KH
2713 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2714 these invocations and designations are omitted in encoded text.
2715 In a 7-bit environment, only GL can be used.
4ed46869 2716
39787efd
KH
2717 When a graphic character set of CHARS94 is invoked to GL, codes
2718 0x20 and 0x7F of the GL area work as control characters SPACE and
2719 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2720 be used.
4ed46869
KH
2721
2722 There are two ways of invocation: locking-shift and single-shift.
2723 With locking-shift, the invocation lasts until the next different
39787efd
KH
2724 invocation, whereas with single-shift, the invocation affects the
2725 following character only and doesn't affect the locking-shift
2726 state. Invocations are done by the following control characters or
2727 escape sequences:
4ed46869
KH
2728
2729 ----------------------------------------------------------------------
39787efd 2730 abbrev function cntrl escape seq description
4ed46869 2731 ----------------------------------------------------------------------
39787efd
KH
2732 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2733 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2734 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2735 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2736 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2737 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2738 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2739 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2740 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4ed46869 2741 ----------------------------------------------------------------------
39787efd
KH
2742 (*) These are not used by any known coding system.
2743
2744 Control characters for these functions are defined by macros
2745 ISO_CODE_XXX in `coding.h'.
4ed46869 2746
39787efd 2747 Designations are done by the following escape sequences:
4ed46869
KH
2748 ----------------------------------------------------------------------
2749 escape sequence description
2750 ----------------------------------------------------------------------
2751 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2752 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2753 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2754 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2755 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2756 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2757 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2758 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2759 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2760 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2761 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2762 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2763 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2764 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2765 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2766 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2767 ----------------------------------------------------------------------
2768
2769 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
39787efd 2770 of dimension 1, chars 94, and final character <F>, etc...
4ed46869
KH
2771
2772 Note (*): Although these designations are not allowed in ISO2022,
2773 Emacs accepts them on decoding, and produces them on encoding
39787efd 2774 CHARS96 character sets in a coding system which is characterized as
4ed46869
KH
2775 7-bit environment, non-locking-shift, and non-single-shift.
2776
2777 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
df7492f9 2778 '(' must be omitted. We refer to this as "short-form" hereafter.
4ed46869 2779
cfb43547 2780 Now you may notice that there are a lot of ways of encoding the
39787efd
KH
2781 same multilingual text in ISO2022. Actually, there exist many
2782 coding systems such as Compound Text (used in X11's inter client
8ca3766a
DL
2783 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2784 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
4ed46869
KH
2785 localized platforms), and all of these are variants of ISO2022.
2786
2787 In addition to the above, Emacs handles two more kinds of escape
2788 sequences: ISO6429's direction specification and Emacs' private
2789 sequence for specifying character composition.
2790
39787efd 2791 ISO6429's direction specification takes the following form:
4ed46869
KH
2792 o CSI ']' -- end of the current direction
2793 o CSI '0' ']' -- end of the current direction
2794 o CSI '1' ']' -- start of left-to-right text
2795 o CSI '2' ']' -- start of right-to-left text
2796 The control character CSI (0x9B: control sequence introducer) is
39787efd
KH
2797 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2798
2799 Character composition specification takes the following form:
ec6d2bb8
KH
2800 o ESC '0' -- start relative composition
2801 o ESC '1' -- end composition
2802 o ESC '2' -- start rule-base composition (*)
2803 o ESC '3' -- start relative composition with alternate chars (**)
2804 o ESC '4' -- start rule-base composition with alternate chars (**)
b73bfc1c 2805 Since these are not standard escape sequences of any ISO standard,
cfb43547 2806 the use of them with these meanings is restricted to Emacs only.
ec6d2bb8 2807
5a936b46
DL
2808 (*) This form is used only in Emacs 20.7 and older versions,
2809 but newer versions can safely decode it.
cfb43547 2810 (**) This form is used only in Emacs 21.1 and newer versions,
5a936b46 2811 and older versions can't decode it.
ec6d2bb8 2812
cfb43547 2813 Here's a list of example usages of these composition escape
b73bfc1c 2814 sequences (categorized by `enum composition_method').
ec6d2bb8 2815
b73bfc1c 2816 COMPOSITION_RELATIVE:
ec6d2bb8 2817 ESC 0 CHAR [ CHAR ] ESC 1
8ca3766a 2818 COMPOSITION_WITH_RULE:
ec6d2bb8 2819 ESC 2 CHAR [ RULE CHAR ] ESC 1
b73bfc1c 2820 COMPOSITION_WITH_ALTCHARS:
ec6d2bb8 2821 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
b73bfc1c 2822 COMPOSITION_WITH_RULE_ALTCHARS:
ec6d2bb8 2823 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
4ed46869 2824
74ab6df5 2825static enum iso_code_class_type iso_code_class[256];
4ed46869 2826
df7492f9
KH
/* Nonzero if charset ID may be used with CODING: ID must be a valid
   index into CODING's safe_charsets table (0 <= ID <= max_charset_id)
   and the table entry must not be 255, the value that marks a charset
   as unsafe.  Negative IDs (the "no such charset" result of an
   iso_charset_table lookup) are rejected instead of being used as an
   index.  ID is evaluated more than once.  */
#define SAFE_CHARSET_P(coding, id)              \
  (0 <= (id)                                    \
   && (id) <= (coding)->max_charset_id          \
   && (coding)->safe_charsets[id] != 255)
df7492f9 2830
df7492f9 2831static void
971de7fb 2832setup_iso_safe_charsets (Lisp_Object attrs)
df7492f9
KH
2833{
2834 Lisp_Object charset_list, safe_charsets;
2835 Lisp_Object request;
2836 Lisp_Object reg_usage;
2837 Lisp_Object tail;
d311d28c 2838 EMACS_INT reg94, reg96;
df7492f9
KH
2839 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2840 int max_charset_id;
2841
2842 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2843 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2844 && ! EQ (charset_list, Viso_2022_charset_list))
2845 {
4939150c
PE
2846 charset_list = Viso_2022_charset_list;
2847 ASET (attrs, coding_attr_charset_list, charset_list);
df7492f9
KH
2848 ASET (attrs, coding_attr_safe_charsets, Qnil);
2849 }
2850
2851 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2852 return;
2853
2854 max_charset_id = 0;
2855 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2856 {
2857 int id = XINT (XCAR (tail));
2858 if (max_charset_id < id)
2859 max_charset_id = id;
2860 }
d46c5b12 2861
1b3b981b
AS
2862 safe_charsets = make_uninit_string (max_charset_id + 1);
2863 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
df7492f9
KH
2864 request = AREF (attrs, coding_attr_iso_request);
2865 reg_usage = AREF (attrs, coding_attr_iso_usage);
2866 reg94 = XINT (XCAR (reg_usage));
2867 reg96 = XINT (XCDR (reg_usage));
2868
2869 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2870 {
2871 Lisp_Object id;
2872 Lisp_Object reg;
2873 struct charset *charset;
2874
2875 id = XCAR (tail);
2876 charset = CHARSET_FROM_ID (XINT (id));
bf16eb23 2877 reg = Fcdr (Fassq (id, request));
df7492f9 2878 if (! NILP (reg))
8f924df7 2879 SSET (safe_charsets, XINT (id), XINT (reg));
df7492f9
KH
2880 else if (charset->iso_chars_96)
2881 {
2882 if (reg96 < 4)
8f924df7 2883 SSET (safe_charsets, XINT (id), reg96);
df7492f9
KH
2884 }
2885 else
2886 {
2887 if (reg94 < 4)
8f924df7 2888 SSET (safe_charsets, XINT (id), reg94);
df7492f9
KH
2889 }
2890 }
2891 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2892}
d46c5b12 2893
b6871cc7 2894
4ed46869 2895/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
f10fe38f
PE
2896 Return true if a text is encoded in one of ISO-2022 based coding
2897 systems. */
4ed46869 2898
f10fe38f 2899static bool
cf84bb53
JB
2900detect_coding_iso_2022 (struct coding_system *coding,
2901 struct coding_detection_info *detect_info)
4ed46869 2902{
8f924df7
KH
2903 const unsigned char *src = coding->source, *src_base = src;
2904 const unsigned char *src_end = coding->source + coding->src_bytes;
f10fe38f
PE
2905 bool multibytep = coding->src_multibyte;
2906 bool single_shifting = 0;
0e48bb22 2907 int id;
df7492f9 2908 int c, c1;
d311d28c 2909 ptrdiff_t consumed_chars = 0;
df7492f9 2910 int i;
ff0dacd7
KH
2911 int rejected = 0;
2912 int found = 0;
cee53ed4 2913 int composition_count = -1;
ff0dacd7
KH
2914
2915 detect_info->checked |= CATEGORY_MASK_ISO;
df7492f9
KH
2916
2917 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2918 {
2919 struct coding_system *this = &(coding_categories[i]);
2920 Lisp_Object attrs, val;
2921
c6b278e7
KH
2922 if (this->id < 0)
2923 continue;
df7492f9
KH
2924 attrs = CODING_ID_ATTRS (this->id);
2925 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
1b3b981b 2926 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
df7492f9
KH
2927 setup_iso_safe_charsets (attrs);
2928 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7 2929 this->max_charset_id = SCHARS (val) - 1;
1b3b981b 2930 this->safe_charsets = SDATA (val);
df7492f9
KH
2931 }
2932
2933 /* A coding system of this category is always ASCII compatible. */
2934 src += coding->head_ascii;
3f003981 2935
ff0dacd7 2936 while (rejected != CATEGORY_MASK_ISO)
4ed46869 2937 {
065e3595 2938 src_base = src;
df7492f9 2939 ONE_MORE_BYTE (c);
4ed46869
KH
2940 switch (c)
2941 {
2942 case ISO_CODE_ESC:
74383408
KH
2943 if (inhibit_iso_escape_detection)
2944 break;
f46869e4 2945 single_shifting = 0;
df7492f9 2946 ONE_MORE_BYTE (c);
0e48bb22 2947 if (c == 'N' || c == 'O')
d46c5b12 2948 {
ae9ff118 2949 /* ESC <Fe> for SS2 or SS3. */
ff0dacd7
KH
2950 single_shifting = 1;
2951 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
4ed46869 2952 }
cee53ed4
KH
2953 else if (c == '1')
2954 {
2955 /* End of composition. */
2956 if (composition_count < 0
2957 || composition_count > MAX_COMPOSITION_COMPONENTS)
2958 /* Invalid */
2959 break;
2960 composition_count = -1;
2961 found |= CATEGORY_MASK_ISO;
2962 }
ec6d2bb8
KH
2963 else if (c >= '0' && c <= '4')
2964 {
2965 /* ESC <Fp> for start/end composition. */
cee53ed4 2966 composition_count = 0;
ec6d2bb8 2967 }
bf9cdd4e 2968 else
df7492f9 2969 {
0e48bb22
AS
2970 if (c >= '(' && c <= '/')
2971 {
2972 /* Designation sequence for a charset of dimension 1. */
2973 ONE_MORE_BYTE (c1);
2974 if (c1 < ' ' || c1 >= 0x80
2975 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2976 /* Invalid designation sequence. Just ignore. */
2977 break;
2978 }
2979 else if (c == '$')
2980 {
2981 /* Designation sequence for a charset of dimension 2. */
2982 ONE_MORE_BYTE (c);
2983 if (c >= '@' && c <= 'B')
2984 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2985 id = iso_charset_table[1][0][c];
2986 else if (c >= '(' && c <= '/')
2987 {
2988 ONE_MORE_BYTE (c1);
2989 if (c1 < ' ' || c1 >= 0x80
2990 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2991 /* Invalid designation sequence. Just ignore. */
2992 break;
2993 }
2994 else
2995 /* Invalid designation sequence. Just ignore it. */
2996 break;
2997 }
2998 else
2999 {
3000 /* Invalid escape sequence. Just ignore it. */
3001 break;
3002 }
d46c5b12 3003
0e48bb22
AS
3004 /* We found a valid designation sequence for CHARSET. */
3005 rejected |= CATEGORY_MASK_ISO_8BIT;
3006 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3007 id))
3008 found |= CATEGORY_MASK_ISO_7;
3009 else
3010 rejected |= CATEGORY_MASK_ISO_7;
3011 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3012 id))
3013 found |= CATEGORY_MASK_ISO_7_TIGHT;
3014 else
3015 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3016 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3017 id))
3018 found |= CATEGORY_MASK_ISO_7_ELSE;
3019 else
3020 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3021 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3022 id))
3023 found |= CATEGORY_MASK_ISO_8_ELSE;
3024 else
3025 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3026 }
4ed46869
KH
3027 break;
3028
4ed46869 3029 case ISO_CODE_SO:
d46c5b12 3030 case ISO_CODE_SI:
ff0dacd7 3031 /* Locking shift out/in. */
74383408
KH
3032 if (inhibit_iso_escape_detection)
3033 break;
f46869e4 3034 single_shifting = 0;
ff0dacd7 3035 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
d46c5b12
KH
3036 break;
3037
4ed46869 3038 case ISO_CODE_CSI:
ff0dacd7 3039 /* Control sequence introducer. */
f46869e4 3040 single_shifting = 0;
ff0dacd7
KH
3041 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3042 found |= CATEGORY_MASK_ISO_8_ELSE;
3043 goto check_extra_latin;
3044
4ed46869
KH
3045 case ISO_CODE_SS2:
3046 case ISO_CODE_SS3:
ff0dacd7
KH
3047 /* Single shift. */
3048 if (inhibit_iso_escape_detection)
3049 break;
75e2a253 3050 single_shifting = 0;
ff0dacd7
KH
3051 rejected |= CATEGORY_MASK_ISO_7BIT;
3052 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3053 & CODING_ISO_FLAG_SINGLE_SHIFT)
0e48bb22
AS
3054 {
3055 found |= CATEGORY_MASK_ISO_8_1;
3056 single_shifting = 1;
3057 }
ff0dacd7
KH
3058 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3059 & CODING_ISO_FLAG_SINGLE_SHIFT)
0e48bb22
AS
3060 {
3061 found |= CATEGORY_MASK_ISO_8_2;
3062 single_shifting = 1;
3063 }
75e2a253
KH
3064 if (single_shifting)
3065 break;
0e48bb22
AS
3066 check_extra_latin:
3067 if (! VECTORP (Vlatin_extra_code_table)
28be1ada 3068 || NILP (AREF (Vlatin_extra_code_table, c)))
0e48bb22
AS
3069 {
3070 rejected = CATEGORY_MASK_ISO;
3071 break;
3072 }
3073 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3074 & CODING_ISO_FLAG_LATIN_EXTRA)
3075 found |= CATEGORY_MASK_ISO_8_1;
3076 else
3077 rejected |= CATEGORY_MASK_ISO_8_1;
3078 rejected |= CATEGORY_MASK_ISO_8_2;
3079 break;
4ed46869
KH
3080
3081 default:
065e3595
KH
3082 if (c < 0)
3083 continue;
4ed46869 3084 if (c < 0x80)
f46869e4 3085 {
cee53ed4
KH
3086 if (composition_count >= 0)
3087 composition_count++;
f46869e4
KH
3088 single_shifting = 0;
3089 break;
3090 }
ff0dacd7 3091 if (c >= 0xA0)
c4825358 3092 {
ff0dacd7
KH
3093 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3094 found |= CATEGORY_MASK_ISO_8_1;
f46869e4 3095 /* Check the length of succeeding codes of the range
ff0dacd7
KH
3096 0xA0..0FF. If the byte length is even, we include
3097 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
3098 only when we are not single shifting. */
3099 if (! single_shifting
3100 && ! (rejected & CATEGORY_MASK_ISO_8_2))
f46869e4 3101 {
2735d060 3102 int len = 1;
b73bfc1c
KH
3103 while (src < src_end)
3104 {
d12bd917 3105 src_base = src;
df7492f9 3106 ONE_MORE_BYTE (c);
b73bfc1c 3107 if (c < 0xA0)
d12bd917
KH
3108 {
3109 src = src_base;
3110 break;
3111 }
2735d060 3112 len++;
b73bfc1c
KH
3113 }
3114
2735d060 3115 if (len & 1 && src < src_end)
cee53ed4
KH
3116 {
3117 rejected |= CATEGORY_MASK_ISO_8_2;
3118 if (composition_count >= 0)
2735d060 3119 composition_count += len;
cee53ed4 3120 }
f46869e4 3121 else
cee53ed4
KH
3122 {
3123 found |= CATEGORY_MASK_ISO_8_2;
3124 if (composition_count >= 0)
2735d060 3125 composition_count += len / 2;
cee53ed4 3126 }
f46869e4 3127 }
ff0dacd7 3128 break;
4ed46869 3129 }
4ed46869
KH
3130 }
3131 }
ff0dacd7
KH
3132 detect_info->rejected |= CATEGORY_MASK_ISO;
3133 return 0;
4ed46869 3134
df7492f9 3135 no_more_source:
ff0dacd7
KH
3136 detect_info->rejected |= rejected;
3137 detect_info->found |= (found & ~rejected);
df7492f9 3138 return 1;
4ed46869 3139}
ec6d2bb8 3140
4ed46869 3141
134b9549
KH
/* Record in CODING (a variable of the expansion context) that the
   charset selected by dimension DIM, CHARS_96 and final byte FINAL is
   designated to graphic register REG.

   If FINAL is outside '0'..127, or names no charset, or the charset
   is not safe for CODING, the register's designation is set to -2
   and CHARS_96 is set to -1.  CHARS_96 is also set to -1 when the
   register previously held such an invalid designation and the new
   charset is ASCII.  A CHARS_96 of -1 tells the caller that the
   escape sequence should be kept.  CHARS_96 must be an lvalue.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
  do {                                                                  \
    int id = -1, prev;                                                  \
                                                                        \
    if ((final) >= '0' && (final) < 128)                                \
      id = ISO_CHARSET_TABLE (dim, chars_96, final);                    \
    if (id < 0 || ! SAFE_CHARSET_P (coding, id))                        \
      {                                                                 \
        CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
        (chars_96) = -1;                                                \
      }                                                                 \
    else                                                                \
      {                                                                 \
        prev = CODING_ISO_DESIGNATION (coding, reg);                    \
        /* Apply the coding system's substitutions: JISX0201-Roman      \
           may stand for ASCII, and JISX0208.1978 for JISX0208.  */     \
        if (id == charset_jisx0201_roman                                \
            && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)) \
          id = charset_ascii;                                           \
        else if (id == charset_jisx0208_1978                            \
                 && (CODING_ISO_FLAGS (coding)                          \
                     & CODING_ISO_FLAG_USE_OLDJIS))                     \
          id = charset_jisx0208;                                        \
        CODING_ISO_DESIGNATION (coding, reg) = id;                      \
        /* If there was an invalid designation to REG previously,       \
           and this designation is ASCII to REG, we should keep         \
           this designation sequence.  */                               \
        if (prev == -2 && id == charset_ascii)                          \
          (chars_96) = -1;                                              \
      }                                                                 \
  } while (0)
3174
d46c5b12 3175
e951386e
KH
3176/* Handle these composition sequences (ALT: alternate char):
3177
3178 (1) relative composition: ESC 0 CHAR ... ESC 1
3179 (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3180 (3) altchar composition: ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3181 (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3182
3183 When the start sequence (ESC 0/2/3/4) is found, this annotation
3184 header is produced.
3185
3186 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3187
3188 Then, upon reading CHAR or RULE (one or two bytes), these codes are
3189 produced until the end sequence (ESC 1) is found:
3190
3191 (1) CHAR ... CHAR
3192 (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3193 (3) ALT ... ALT -1 -1 CHAR ... CHAR
3194 (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3195
3196 When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3197 annotation header are updated as below:
3198
3199 (1) LENGTH: unchanged, NCHARS: number of CHARs
3200 (2) LENGTH: unchanged, NCHARS: number of CHARs
3201 (3) LENGTH: += number of ALTs + 2, NCHARS: number of CHARs
3202 (4) LENGTH: += number of ALTs * 3, NCHARS: number of CHARs
3203
3204 If an error is found while composing, the annotation header is
3205 changed to:
3206
3207 [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3208
3209 and the sequence [ -2 DECODED-RULE ] is changed to the original
3210 byte sequence as below:
3211 o the original byte sequence is B: [ B -1 ]
3212 o the original byte sequence is B1 B2: [ B1 B2 ]
3213 and the sequence [ -1 -1 ] is changed to the original byte
3214 sequence:
3215 [ ESC '0' ]
3216*/
3217
/* Decode the composition rule that starts with the byte C1 (a
   variable of the expansion context) and store the encoded rule in
   RULE, which must be an lvalue.

   C1 - 32 below 81 is the one-byte old format (before ver.21): the
   value is GREF * 9 + NREF, where a reference point of 4 is mapped to
   10.  Otherwise it is the new format (after ver.21): one more byte
   is read from the source, and 0x100 is added to the encoded rule so
   that it can be told apart from an old-format one.  If the rule is
   invalid, goto invalid_code.  */

#define DECODE_COMPOSITION_RULE(rule)                                   \
  do {                                                                  \
    (rule) = c1 - 32;                                                   \
    if ((rule) < 0)                                                     \
      goto invalid_code;                                                \
    if ((rule) >= 81)           /* new format (after ver.21) */         \
      {                                                                 \
        int b;                                                          \
                                                                        \
        ONE_MORE_BYTE (b);                                              \
        if (! COMPOSITION_ENCODE_RULE_VALID ((rule) - 81, b - 32))      \
          goto invalid_code;                                            \
        (rule) = COMPOSITION_ENCODE_RULE ((rule) - 81, b - 32);         \
        (rule) += 0x100;   /* Distinguish it from the old format.  */   \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int gref = (rule) / 9;                                          \
        int nref = (rule) % 9;                                          \
                                                                        \
        if (gref == 4)                                                  \
          gref = 10;                                                    \
        if (nref == 4)                                                  \
          nref = 10;                                                    \
        (rule) = COMPOSITION_ENCODE_RULE (gref, nref);                  \
      }                                                                 \
  } while (0)
3246
/* Turn the decoded composition rule RULE back into the bytes it was
   read from, storing them at charbuf[idx] and charbuf[idx + 1], and
   add the number of bytes produced to new_chars (charbuf, idx and
   new_chars are variables of the expansion context).

   A rule below 0x100 is in the old format and gives one byte followed
   by -1; otherwise it is in the new format and gives two bytes.  */
#define ENCODE_COMPOSITION_RULE(rule)                                   \
  do {                                                                  \
    int enc_rule = (rule);                                              \
    int gref = (enc_rule % 0x100) / 12;                                 \
    int nref = (enc_rule % 0x100) % 12;                                 \
                                                                        \
    if (enc_rule >= 0x100)      /* new format */                        \
      {                                                                 \
        charbuf[idx] = 32 + 81 + gref;                                  \
        charbuf[idx + 1] = 32 + nref;                                   \
        new_chars += 2;                                                 \
      }                                                                 \
    else                        /* old format */                        \
      {                                                                 \
        /* Reference point 10 is written as 4 in the old format.  */    \
        if (gref == 10)                                                 \
          gref = 4;                                                     \
        if (nref == 10)                                                 \
          nref = 4;                                                     \
        charbuf[idx] = 32 + gref * 9 + nref;                            \
        charbuf[idx + 1] = -1;                                          \
        new_chars++;                                                    \
      }                                                                 \
  } while (0)
3266
e951386e
KH
3267/* Finish the current composition as invalid. */
3268
e951386e 3269static int
971de7fb 3270finish_composition (int *charbuf, struct composition_status *cmp_status)
e951386e
KH
3271{
3272 int idx = - cmp_status->length;
3273 int new_chars;
3274
3275 /* Recover the original ESC sequence */
3276 charbuf[idx++] = ISO_CODE_ESC;
3277 charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3278 : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3279 : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3280 /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3281 : '4');
3282 charbuf[idx++] = -2;
3283 charbuf[idx++] = 0;
3284 charbuf[idx++] = -1;
3285 new_chars = cmp_status->nchars;
3286 if (cmp_status->method >= COMPOSITION_WITH_RULE)
3287 for (; idx < 0; idx++)
3288 {
3289 int elt = charbuf[idx];
3290
3291 if (elt == -2)
3292 {
3293 ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3294 idx++;
3295 }
3296 else if (elt == -1)
3297 {
3298 charbuf[idx++] = ISO_CODE_ESC;
3299 charbuf[idx] = '0';
3300 new_chars += 2;
3301 }
3302 }
3303 cmp_status->state = COMPOSING_NO;
3304 return new_chars;
3305}
3306
/* If characters are under composition, finish the composition as
   invalid and advance char_offset by the value finish_composition
   returns.  Uses cmp_status, charbuf and char_offset from the
   expansion context.  */
#define MAYBE_FINISH_COMPOSITION()                              \
  do {                                                          \
    char_offset += (cmp_status->state != COMPOSING_NO           \
                    ? finish_composition (charbuf, cmp_status)  \
                    : 0);                                       \
  } while (0)
d46c5b12 3313
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   C1 is the character that follows ESC.

   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   An ESC 0 that arrives while the alternate chars of an ESC 3 or
   ESC 4 composition are being read only marks the end of those
   alternate chars: [ -1 -1 ] is stored and the composed chars
   follow.  In all other cases any composition in progress is finished
   as invalid and a new one is started by producing this annotation
   header:

        [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
*/

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    if ((c1) == '0'                                                        \
        && ((cmp_status->method == COMPOSITION_WITH_ALTCHARS               \
             && cmp_status->state == COMPOSING_COMPONENT_CHAR)             \
            || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS       \
                && cmp_status->state == COMPOSING_COMPONENT_RULE)))        \
      {                                                                    \
        /* End of the alternate chars.  */                                 \
        charbuf[0] = -1;                                                   \
        charbuf[1] = -1;                                                   \
        charbuf += 2;                                                      \
        cmp_status->length += 2;                                           \
        cmp_status->state = COMPOSING_CHAR;                                \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        MAYBE_FINISH_COMPOSITION ();                                       \
        if ((c1) == '0')                                                   \
          cmp_status->method = COMPOSITION_RELATIVE;                       \
        else if ((c1) == '2')                                              \
          cmp_status->method = COMPOSITION_WITH_RULE;                      \
        else if ((c1) == '3')                                              \
          cmp_status->method = COMPOSITION_WITH_ALTCHARS;                  \
        else                                                               \
          cmp_status->method = COMPOSITION_WITH_RULE_ALTCHARS;             \
        /* ESC 0 and ESC 2 start with composed chars; ESC 3 and ESC 4     \
           start with alternate (component) chars.  */                     \
        if ((c1) <= '2')                                                   \
          cmp_status->state = COMPOSING_CHAR;                              \
        else                                                               \
          cmp_status->state = COMPOSING_COMPONENT_CHAR;                    \
        ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
        cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
        cmp_status->nchars = cmp_status->ncomps = 0;                       \
        coding->annotated = 1;                                             \
      }                                                                    \
  } while (0)
3354
ec6d2bb8 3355
/* Handle composition end sequence ESC 1.

   The sequence is invalid when no composed char has been read, or
   when the state does not fit the method (a rulebase composition must
   not end in the COMPOSING_CHAR state, any other method must).  In
   that case the composition is finished as invalid and control goes
   to invalid_code.  Otherwise the annotation header found
   cmp_status->length elements before charbuf is completed: its LENGTH
   is adjusted for the alternate chars and its NCHARS is set; then
   char_offset is advanced and the composition state reset.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    int *header;                                                        \
                                                                        \
    if (cmp_status->nchars == 0                                         \
        || ((cmp_status->state == COMPOSING_CHAR)                       \
            == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
      {                                                                 \
        MAYBE_FINISH_COMPOSITION ();                                    \
        goto invalid_code;                                              \
      }                                                                 \
    header = charbuf - cmp_status->length;                              \
    if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
      header[0] -= cmp_status->ncomps + 2;                              \
    else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
      header[0] -= cmp_status->ncomps * 3;                              \
    header[2] = cmp_status->nchars;                                     \
    char_offset += cmp_status->nchars;                                  \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
3375
/* Store the pair [ -2 RULE ] for a decoded composition rule in
   charbuf, and update cmp_status: its length grows by two and its
   state is stepped back by one.  */

#define STORE_COMPOSITION_RULE(rule)    \
  do {                                  \
    charbuf[0] = -2;                    \
    charbuf[1] = (rule);                \
    charbuf += 2;                       \
    cmp_status->length += 2;            \
    cmp_status->state -= 1;             \
  } while (0)
ec6d2bb8 3385
e951386e
KH
/* Store a composed char or a component char C in charbuf, and update
   cmp_status: the length grows by one; nchars is counted up in the
   COMPOSING_CHAR state and ncomps otherwise; and, when a rule is
   expected to follow this char, the state is stepped forward by
   one.  */

#define STORE_COMPOSITION_CHAR(c)                                       \
  do {                                                                  \
    *charbuf++ = (c);                                                   \
    cmp_status->length += 1;                                            \
    if (cmp_status->state != COMPOSING_CHAR)                            \
      cmp_status->ncomps += 1;                                          \
    else                                                                \
      cmp_status->nchars += 1;                                          \
    /* A rule follows each char of a rulebase composition, and each     \
       alternate char of an alt&rule composition.  */                   \
    if (cmp_status->method == COMPOSITION_WITH_RULE)                    \
      cmp_status->state += 1;                                           \
    else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS       \
             && cmp_status->state == COMPOSING_COMPONENT_CHAR)          \
      cmp_status->state += 1;                                           \
  } while (0)
88993dfd 3402
d46c5b12 3403
4ed46869
KH
3404/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3405
b73bfc1c 3406static void
971de7fb 3407decode_coding_iso_2022 (struct coding_system *coding)
4ed46869 3408{
8f924df7
KH
3409 const unsigned char *src = coding->source + coding->consumed;
3410 const unsigned char *src_end = coding->source + coding->src_bytes;
3411 const unsigned char *src_base;
69a80ea3 3412 int *charbuf = coding->charbuf + coding->charbuf_used;
ad1746f5
KH
3413 /* We may produce two annotations (charset and composition) in one
3414 loop and one more charset annotation at the end. */
ff0dacd7 3415 int *charbuf_end
df80c7f0 3416 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
d311d28c 3417 ptrdiff_t consumed_chars = 0, consumed_chars_base;
f10fe38f 3418 bool multibytep = coding->src_multibyte;
4ed46869 3419 /* Charsets invoked to graphic plane 0 and 1 respectively. */
df7492f9
KH
3420 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3421 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
134b9549 3422 int charset_id_2, charset_id_3;
df7492f9
KH
3423 struct charset *charset;
3424 int c;
e951386e 3425 struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
66ebf983 3426 Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
d311d28c
PE
3427 ptrdiff_t char_offset = coding->produced_char;
3428 ptrdiff_t last_offset = char_offset;
ff0dacd7 3429 int last_id = charset_ascii;
f10fe38f
PE
3430 bool eol_dos
3431 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 3432 int byte_after_cr = -1;
e951386e 3433 int i;
df7492f9 3434
df7492f9 3435 setup_iso_safe_charsets (attrs);
1b3b981b 3436 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
b73bfc1c 3437
e951386e
KH
3438 if (cmp_status->state != COMPOSING_NO)
3439 {
15cbd324 3440 if (charbuf_end - charbuf < cmp_status->length)
1088b922 3441 emacs_abort ();
e951386e
KH
3442 for (i = 0; i < cmp_status->length; i++)
3443 *charbuf++ = cmp_status->carryover[i];
3444 coding->annotated = 1;
3445 }
3446
b73bfc1c 3447 while (1)
4ed46869 3448 {
cf299835 3449 int c1, c2, c3;
b73bfc1c
KH
3450
3451 src_base = src;
df7492f9
KH
3452 consumed_chars_base = consumed_chars;
3453
3454 if (charbuf >= charbuf_end)
b71f6f73
KH
3455 {
3456 if (byte_after_cr >= 0)
3457 src_base--;
3458 break;
3459 }
df7492f9 3460
119852e7
KH
3461 if (byte_after_cr >= 0)
3462 c1 = byte_after_cr, byte_after_cr = -1;
3463 else
3464 ONE_MORE_BYTE (c1);
065e3595
KH
3465 if (c1 < 0)
3466 goto invalid_code;
4ed46869 3467
e951386e 3468 if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
4ed46869 3469 {
e951386e
KH
3470 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3471 char_offset++;
3472 CODING_ISO_EXTSEGMENT_LEN (coding)--;
3473 continue;
3474 }
3475
3476 if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3477 {
3478 if (c1 == ISO_CODE_ESC)
ec6d2bb8 3479 {
e951386e
KH
3480 if (src + 1 >= src_end)
3481 goto no_more_source;
3482 *charbuf++ = ISO_CODE_ESC;
3483 char_offset++;
3484 if (src[0] == '%' && src[1] == '@')
df7492f9 3485 {
e951386e
KH
3486 src += 2;
3487 consumed_chars += 2;
3488 char_offset += 2;
3489 /* We are sure charbuf can contain two more chars. */
3490 *charbuf++ = '%';
3491 *charbuf++ = '@';
3492 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
df7492f9 3493 }
4ed46869 3494 }
e951386e
KH
3495 else
3496 {
3497 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3498 char_offset++;
3499 }
3500 continue;
3501 }
3502
3503 if ((cmp_status->state == COMPOSING_RULE
3504 || cmp_status->state == COMPOSING_COMPONENT_RULE)
3505 && c1 != ISO_CODE_ESC)
3506 {
66ebf983 3507 int rule;
e951386e 3508
66ebf983 3509 DECODE_COMPOSITION_RULE (rule);
e951386e
KH
3510 STORE_COMPOSITION_RULE (rule);
3511 continue;
3512 }
3513
3514 /* We produce at most one character. */
3515 switch (iso_code_class [c1])
3516 {
3517 case ISO_0x20_or_0x7F:
df7492f9
KH
3518 if (charset_id_0 < 0
3519 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
781d7a48
KH
3520 /* This is SPACE or DEL. */
3521 charset = CHARSET_FROM_ID (charset_ascii);
3522 else
3523 charset = CHARSET_FROM_ID (charset_id_0);
3524 break;
4ed46869
KH
3525
3526 case ISO_graphic_plane_0:
134b9549
KH
3527 if (charset_id_0 < 0)
3528 charset = CHARSET_FROM_ID (charset_ascii);
3529 else
3530 charset = CHARSET_FROM_ID (charset_id_0);
4ed46869
KH
3531 break;
3532
3533 case ISO_0xA0_or_0xFF:
df7492f9
KH
3534 if (charset_id_1 < 0
3535 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3536 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3537 goto invalid_code;
4ed46869
KH
3538 /* This is a graphic character, we fall down ... */
3539
3540 case ISO_graphic_plane_1:
df7492f9
KH
3541 if (charset_id_1 < 0)
3542 goto invalid_code;
3543 charset = CHARSET_FROM_ID (charset_id_1);
4ed46869
KH
3544 break;
3545
df7492f9 3546 case ISO_control_0:
2735d060 3547 if (eol_dos && c1 == '\r')
119852e7 3548 ONE_MORE_BYTE (byte_after_cr);
df7492f9
KH
3549 MAYBE_FINISH_COMPOSITION ();
3550 charset = CHARSET_FROM_ID (charset_ascii);
4ed46869
KH
3551 break;
3552
df7492f9 3553 case ISO_control_1:
df7492f9
KH
3554 goto invalid_code;
3555
4ed46869 3556 case ISO_shift_out:
df7492f9
KH
3557 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3558 || CODING_ISO_DESIGNATION (coding, 1) < 0)
3559 goto invalid_code;
3560 CODING_ISO_INVOCATION (coding, 0) = 1;
3561 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3562 continue;
4ed46869
KH
3563
3564 case ISO_shift_in:
df7492f9
KH
3565 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3566 goto invalid_code;
3567 CODING_ISO_INVOCATION (coding, 0) = 0;
3568 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3569 continue;
4ed46869
KH
3570
3571 case ISO_single_shift_2_7:
a63dba42
KH
3572 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3573 goto invalid_code;
4ed46869 3574 case ISO_single_shift_2:
df7492f9
KH
3575 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3576 goto invalid_code;
4ed46869
KH
3577 /* SS2 is handled as an escape sequence of ESC 'N' */
3578 c1 = 'N';
3579 goto label_escape_sequence;
3580
3581 case ISO_single_shift_3:
df7492f9
KH
3582 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3583 goto invalid_code;
4ed46869
KH
3584 /* SS3 is handled as an escape sequence of ESC 'O' */
3585 c1 = 'O';
3586 goto label_escape_sequence;
3587
3588 case ISO_control_sequence_introducer:
3589 /* CSI is handled as an escape sequence of ESC '[' ... */
3590 c1 = '[';
3591 goto label_escape_sequence;
3592
3593 case ISO_escape:
3594 ONE_MORE_BYTE (c1);
3595 label_escape_sequence:
df7492f9 3596 /* Escape sequences handled here are invocation,
4ed46869
KH
3597 designation, direction specification, and character
3598 composition specification. */
3599 switch (c1)
3600 {
3601 case '&': /* revision of following character set */
3602 ONE_MORE_BYTE (c1);
3603 if (!(c1 >= '@' && c1 <= '~'))
df7492f9 3604 goto invalid_code;
4ed46869
KH
3605 ONE_MORE_BYTE (c1);
3606 if (c1 != ISO_CODE_ESC)
df7492f9 3607 goto invalid_code;
4ed46869
KH
3608 ONE_MORE_BYTE (c1);
3609 goto label_escape_sequence;
3610
3611 case '$': /* designation of 2-byte character set */
df7492f9
KH
3612 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3613 goto invalid_code;
134b9549
KH
3614 {
3615 int reg, chars96;
3616
3617 ONE_MORE_BYTE (c1);
3618 if (c1 >= '@' && c1 <= 'B')
3619 { /* designation of JISX0208.1978, GB2312.1980,
88993dfd 3620 or JISX0208.1980 */
134b9549
KH
3621 reg = 0, chars96 = 0;
3622 }
3623 else if (c1 >= 0x28 && c1 <= 0x2B)
3624 { /* designation of DIMENSION2_CHARS94 character set */
3625 reg = c1 - 0x28, chars96 = 0;
3626 ONE_MORE_BYTE (c1);
3627 }
3628 else if (c1 >= 0x2C && c1 <= 0x2F)
3629 { /* designation of DIMENSION2_CHARS96 character set */
3630 reg = c1 - 0x2C, chars96 = 1;
3631 ONE_MORE_BYTE (c1);
3632 }
3633 else
3634 goto invalid_code;
3635 DECODE_DESIGNATION (reg, 2, chars96, c1);
3636 /* We must update these variables now. */
3637 if (reg == 0)
3638 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3639 else if (reg == 1)
3640 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3641 if (chars96 < 0)
3642 goto invalid_code;
3643 }
b73bfc1c 3644 continue;
4ed46869
KH
3645
3646 case 'n': /* invocation of locking-shift-2 */
df7492f9
KH
3647 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3648 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3649 goto invalid_code;
3650 CODING_ISO_INVOCATION (coding, 0) = 2;
3651 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3652 continue;
4ed46869
KH
3653
3654 case 'o': /* invocation of locking-shift-3 */
df7492f9
KH
3655 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3656 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3657 goto invalid_code;
3658 CODING_ISO_INVOCATION (coding, 0) = 3;
3659 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3660 continue;
4ed46869
KH
3661
3662 case 'N': /* invocation of single-shift-2 */
df7492f9
KH
3663 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3664 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3665 goto invalid_code;
134b9549
KH
3666 charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3667 if (charset_id_2 < 0)
3668 charset = CHARSET_FROM_ID (charset_ascii);
3669 else
3670 charset = CHARSET_FROM_ID (charset_id_2);
b73bfc1c 3671 ONE_MORE_BYTE (c1);
e7046a18 3672 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
df7492f9 3673 goto invalid_code;
4ed46869
KH
3674 break;
3675
3676 case 'O': /* invocation of single-shift-3 */
df7492f9
KH
3677 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3678 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3679 goto invalid_code;
134b9549
KH
3680 charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3681 if (charset_id_3 < 0)
3682 charset = CHARSET_FROM_ID (charset_ascii);
3683 else
3684 charset = CHARSET_FROM_ID (charset_id_3);
b73bfc1c 3685 ONE_MORE_BYTE (c1);
e7046a18 3686 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
df7492f9 3687 goto invalid_code;
4ed46869
KH
3688 break;
3689
ec6d2bb8 3690 case '0': case '2': case '3': case '4': /* start composition */
df7492f9
KH
3691 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3692 goto invalid_code;
e951386e
KH
3693 if (last_id != charset_ascii)
3694 {
3695 ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3696 last_id = charset_ascii;
3697 last_offset = char_offset;
3698 }
ec6d2bb8 3699 DECODE_COMPOSITION_START (c1);
b73bfc1c 3700 continue;
4ed46869 3701
ec6d2bb8 3702 case '1': /* end composition */
e951386e 3703 if (cmp_status->state == COMPOSING_NO)
df7492f9
KH
3704 goto invalid_code;
3705 DECODE_COMPOSITION_END ();
b73bfc1c 3706 continue;
4ed46869
KH
3707
3708 case '[': /* specification of direction */
de59072a 3709 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
df7492f9 3710 goto invalid_code;
4ed46869 3711 /* For the moment, nested direction is not supported.
d46c5b12 3712 So, `coding->mode & CODING_MODE_DIRECTION' zero means
ad1746f5 3713 left-to-right, and nonzero means right-to-left. */
4ed46869
KH
3714 ONE_MORE_BYTE (c1);
3715 switch (c1)
3716 {
3717 case ']': /* end of the current direction */
d46c5b12 3718 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869
KH
3719
3720 case '0': /* end of the current direction */
3721 case '1': /* start of left-to-right direction */
3722 ONE_MORE_BYTE (c1);
3723 if (c1 == ']')
d46c5b12 3724 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869 3725 else
df7492f9 3726 goto invalid_code;
4ed46869
KH
3727 break;
3728
3729 case '2': /* start of right-to-left direction */
3730 ONE_MORE_BYTE (c1);
3731 if (c1 == ']')
d46c5b12 3732 coding->mode |= CODING_MODE_DIRECTION;
4ed46869 3733 else
df7492f9 3734 goto invalid_code;
4ed46869
KH
3735 break;
3736
3737 default:
df7492f9 3738 goto invalid_code;
4ed46869 3739 }
b73bfc1c 3740 continue;
4ed46869 3741
103e0180 3742 case '%':
103e0180
KH
3743 ONE_MORE_BYTE (c1);
3744 if (c1 == '/')
3745 {
3746 /* CTEXT extended segment:
3747 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3748 We keep these bytes as is for the moment.
3749 They may be decoded by post-read-conversion. */
3750 int dim, M, L;
4776e638 3751 int size;
8f924df7 3752
103e0180 3753 ONE_MORE_BYTE (dim);
7a84eee5 3754 if (dim < '0' || dim > '4')
e951386e 3755 goto invalid_code;
103e0180 3756 ONE_MORE_BYTE (M);
e951386e
KH
3757 if (M < 128)
3758 goto invalid_code;
103e0180 3759 ONE_MORE_BYTE (L);
e951386e
KH
3760 if (L < 128)
3761 goto invalid_code;
103e0180 3762 size = ((M - 128) * 128) + (L - 128);
e951386e 3763 if (charbuf + 6 > charbuf_end)
4776e638
KH
3764 goto break_loop;
3765 *charbuf++ = ISO_CODE_ESC;
3766 *charbuf++ = '%';
3767 *charbuf++ = '/';
3768 *charbuf++ = dim;
3769 *charbuf++ = BYTE8_TO_CHAR (M);
3770 *charbuf++ = BYTE8_TO_CHAR (L);
e951386e 3771 CODING_ISO_EXTSEGMENT_LEN (coding) = size;
103e0180
KH
3772 }
3773 else if (c1 == 'G')
3774 {
103e0180
KH
3775 /* XFree86 extension for embedding UTF-8 in CTEXT:
3776 ESC % G --UTF-8-BYTES-- ESC % @
3777 We keep these bytes as is for the moment.
3778 They may be decoded by post-read-conversion. */
e951386e 3779 if (charbuf + 3 > charbuf_end)
4776e638 3780 goto break_loop;
e951386e
KH
3781 *charbuf++ = ISO_CODE_ESC;
3782 *charbuf++ = '%';
3783 *charbuf++ = 'G';
3784 CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
103e0180
KH
3785 }
3786 else
4776e638 3787 goto invalid_code;
103e0180 3788 continue;
4776e638 3789 break;
103e0180 3790
4ed46869 3791 default:
df7492f9
KH
3792 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3793 goto invalid_code;
134b9549
KH
3794 {
3795 int reg, chars96;
3796
3797 if (c1 >= 0x28 && c1 <= 0x2B)
3798 { /* designation of DIMENSION1_CHARS94 character set */
3799 reg = c1 - 0x28, chars96 = 0;
3800 ONE_MORE_BYTE (c1);
3801 }
3802 else if (c1 >= 0x2C && c1 <= 0x2F)
3803 { /* designation of DIMENSION1_CHARS96 character set */
3804 reg = c1 - 0x2C, chars96 = 1;
3805 ONE_MORE_BYTE (c1);
3806 }
3807 else
3808 goto invalid_code;
3809 DECODE_DESIGNATION (reg, 1, chars96, c1);
3810 /* We must update these variables now. */
3811 if (reg == 0)
3812 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3813 else if (reg == 1)
3814 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3815 if (chars96 < 0)
3816 goto invalid_code;
3817 }
b73bfc1c 3818 continue;
4ed46869 3819 }
413bb2db
PE
3820 break;
3821
3822 default:
1088b922 3823 emacs_abort ();
b73bfc1c 3824 }
4ed46869 3825
e951386e
KH
3826 if (cmp_status->state == COMPOSING_NO
3827 && charset->id != charset_ascii
ff0dacd7
KH
3828 && last_id != charset->id)
3829 {
3830 if (last_id != charset_ascii)
69a80ea3 3831 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
ff0dacd7
KH
3832 last_id = charset->id;
3833 last_offset = char_offset;
3834 }
3835
b73bfc1c 3836 /* Now we know CHARSET and 1st position code C1 of a character.
cf299835
KH
3837 Produce a decoded character while getting 2nd and 3rd
3838 position codes C2, C3 if necessary. */
df7492f9 3839 if (CHARSET_DIMENSION (charset) > 1)
b73bfc1c
KH
3840 {
3841 ONE_MORE_BYTE (c2);
cf299835
KH
3842 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3843 || ((c1 & 0x80) != (c2 & 0x80)))
b73bfc1c 3844 /* C2 is not in a valid range. */
df7492f9 3845 goto invalid_code;
cf299835
KH
3846 if (CHARSET_DIMENSION (charset) == 2)
3847 c1 = (c1 << 8) | c2;
3848 else
df7492f9 3849 {
cf299835
KH
3850 ONE_MORE_BYTE (c3);
3851 if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3852 || ((c1 & 0x80) != (c3 & 0x80)))
3853 /* C3 is not in a valid range. */
df7492f9 3854 goto invalid_code;
cf299835 3855 c1 = (c1 << 16) | (c2 << 8) | c2;
df7492f9
KH
3856 }
3857 }
cf299835 3858 c1 &= 0x7F7F7F;
df7492f9
KH
3859 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3860 if (c < 0)
3861 {
3862 MAYBE_FINISH_COMPOSITION ();
3863 for (; src_base < src; src_base++, char_offset++)
3864 {
3865 if (ASCII_BYTE_P (*src_base))
3866 *charbuf++ = *src_base;
3867 else
3868 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3869 }
3870 }
e951386e 3871 else if (cmp_status->state == COMPOSING_NO)
df7492f9
KH
3872 {
3873 *charbuf++ = c;
3874 char_offset++;
4ed46869 3875 }
e951386e
KH
3876 else if ((cmp_status->state == COMPOSING_CHAR
3877 ? cmp_status->nchars
3878 : cmp_status->ncomps)
3879 >= MAX_COMPOSITION_COMPONENTS)
781d7a48 3880 {
e951386e
KH
3881 /* Too long composition. */
3882 MAYBE_FINISH_COMPOSITION ();
3883 *charbuf++ = c;
3884 char_offset++;
4ed46869 3885 }
e951386e
KH
3886 else
3887 STORE_COMPOSITION_CHAR (c);
4ed46869
KH
3888 continue;
3889
df7492f9
KH
3890 invalid_code:
3891 MAYBE_FINISH_COMPOSITION ();
4ed46869 3892 src = src_base;
df7492f9
KH
3893 consumed_chars = consumed_chars_base;
3894 ONE_MORE_BYTE (c);
065e3595 3895 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 3896 char_offset++;
df7492f9 3897 coding->errors++;
4776e638
KH
3898 continue;
3899
3900 break_loop:
3901 break;
4ed46869 3902 }
fb88bf2d 3903
df7492f9 3904 no_more_source:
e951386e
KH
3905 if (cmp_status->state != COMPOSING_NO)
3906 {
3907 if (coding->mode & CODING_MODE_LAST_BLOCK)
3908 MAYBE_FINISH_COMPOSITION ();
3909 else
3910 {
3911 charbuf -= cmp_status->length;
3912 for (i = 0; i < cmp_status->length; i++)
3913 cmp_status->carryover[i] = charbuf[i];
3914 }
3915 }
3916 else if (last_id != charset_ascii)
69a80ea3 3917 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
3918 coding->consumed_char += consumed_chars_base;
3919 coding->consumed = src_base - coding->source;
3920 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
3921}
3922
b73bfc1c 3923
f4dee582 3924/* ISO2022 encoding stuff. */
4ed46869
KH
3925
3926/*
f4dee582 3927 It is not enough to say just "ISO2022" on encoding, we have to
df7492f9 3928 specify more details. In Emacs, each coding system of ISO2022
4ed46869 3929 variant has the following specifications:
df7492f9 3930 1. Initial designation to G0 thru G3.
4ed46869
KH
3931 2. Allows short-form designation?
3932 3. ASCII should be designated to G0 before control characters?
3933 4. ASCII should be designated to G0 at end of line?
3934 5. 7-bit environment or 8-bit environment?
3935 6. Use locking-shift?
3936 7. Use Single-shift?
3937 And the following two are only for Japanese:
3938 8. Use ASCII in place of JIS0201-1976-Roman?
3939 9. Use JISX0208-1983 in place of JISX0208-1978?
df7492f9
KH
3940 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3941 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
f4dee582 3942 details.
4ed46869
KH
3943*/
3944
/* Emit, at DST, the escape sequence that designates CHARSET to graphic
   register REG, advance DST, and record the designation in CODING.

   When CODING has CODING_ISO_FLAG_REVISION set and CHARSET has a
   non-negative revision number, the sequence is preceded by
   ESC '&' <'@' + revision>.  A one-dimensional set is designated by
   ESC <intermediate> <final>; any other set by
   ESC '$' [<intermediate>] <final>.  The intermediate byte comes from
   "()*+" for 94-character sets and from ",-./" for 96-character sets,
   indexed by REG.  It is left out (short form) only for a
   multi-dimensional 94-character set going to register 0 whose final
   byte is '@', 'A', or 'B', and only when CODING_ISO_FLAG_LONG_FORM is
   not set.

   CHARSET and REG are evaluated more than once.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char desig_final = CHARSET_ISO_FINAL (charset);            \
    bool desig_is96 = CHARSET_ISO_CHARS_96 (charset) != 0;              \
    const char *desig_inter = desig_is96 ? ",-./" : "()*+";             \
    int desig_rev = -1;                                                 \
                                                                        \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
      desig_rev = CHARSET_ISO_REVISION (charset);                       \
    if (desig_rev >= 0)                                                 \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + desig_rev);                                \
      }                                                                 \
                                                                        \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        bool desig_short                                                \
          = (! desig_is96                                               \
             && ! (CODING_ISO_FLAGS (coding)                            \
                   & CODING_ISO_FLAG_LONG_FORM)                         \
             && reg == 0                                                \
             && desig_final >= '@' && desig_final <= 'B');              \
                                                                        \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        if (! desig_short)                                              \
          {                                                             \
            EMIT_ONE_ASCII_BYTE (desig_inter[reg]);                     \
          }                                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE (desig_inter[reg]);                         \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (desig_final);                                  \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
3992
df7492f9 3993
4ed46869
KH
3994/* The following two macros produce codes (control character or escape
3995 sequence) for ISO2022 single-shift functions (single-shift-2 and
3996 single-shift-3). */
3997
df7492f9
KH
/* Emit single-shift-2 and mark CODING as single-shifting.  With
   CODING_ISO_FLAG_SEVEN_BITS set the shift is written as ESC 'N',
   otherwise as the single byte ISO_CODE_SS2.  */
#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      {                                                                 \
        EMIT_ONE_BYTE (ISO_CODE_SS2);                                   \
      }                                                                 \
    else                                                                \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                       \
      }                                                                 \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4006
df7492f9
KH
4007
/* Emit single-shift-3 and mark CODING as single-shifting.  With
   CODING_ISO_FLAG_SEVEN_BITS set the shift is written as ESC 'O',
   otherwise as the single byte ISO_CODE_SS3.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      {                                                                 \
        EMIT_ONE_BYTE (ISO_CODE_SS3);                                   \
      }                                                                 \
    else                                                                \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                       \
      }                                                                 \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4016
df7492f9 4017
4ed46869
KH
4018/* The following four macros produce codes (control character or
4019 escape sequence) for ISO2022 locking-shift functions (shift-in,
4020 shift-out, locking-shift-2, and locking-shift-3). */
4021
df7492f9
KH
/* Shift-in: record graphic register 0 as the one invoked to graphic
   plane 0 and emit the ISO_CODE_SI byte.  */
#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
  } while (0)
4027
df7492f9
KH
4028
/* Shift-out: record graphic register 1 as the one invoked to graphic
   plane 0 and emit the ISO_CODE_SO byte.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
  } while (0)
4034
df7492f9
KH
4035
/* Locking-shift-2: record graphic register 2 as the one invoked to
   graphic plane 0 and emit ESC 'n'.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
  } while (0)
4041
df7492f9
KH
4042
/* Locking-shift-3: emit ESC 'o' and record graphic register 3 as the
   one invoked to graphic plane 0.

   The final byte must be 'o', not 'n': ESC 'n' is locking-shift-2, and
   the ISO 2022 decoder in this file invokes register 3 only on
   ESC 'o'.  Emitting 'n' here would leave the encoder believing G3 is
   invoked while every reader of the output invokes G2.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4048
df7492f9 4049
f4dee582
RS
/* Produce the byte for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are produced first if necessary.

   CHARSET must be a modifiable lvalue: when CODING_ISO_FLAG_USE_ROMAN
   is set and CHARSET is ASCII, it is replaced by the charset whose id
   is charset_jisx0201_roman.

   The body is a loop.  Each pass emits the byte and leaves the loop
   if a single shift is pending or CHARSET is invoked to graphic plane
   0 (7-bit form, C1 & 0x7F) or plane 1 (8-bit form, C1 | 0x80).
   Otherwise it calls encode_invocation_designation and tries again.

   C1 is parenthesized at every use so that an argument containing
   low-precedence operators is masked as a whole.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single shift covers exactly one character.  */             \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4092
df7492f9 4093
f4dee582
RS
/* Produce the two bytes for a DIMENSION2 character whose character
   set is CHARSET and whose position-codes are C1 and C2, producing
   designation and invocation codes first if necessary.

   CHARSET must be a modifiable lvalue: when CODING_ISO_FLAG_USE_OLDJIS
   is set and CHARSET's id is charset_jisx0208, it is replaced by the
   charset whose id is charset_jisx0208_1978.

   The body is a loop.  Each pass emits the bytes and leaves the loop
   if a single shift is pending or CHARSET is invoked to graphic plane
   0 (both bytes masked with 0x7F) or plane 1 (both bytes or-ed with
   0x80).  Otherwise it calls encode_invocation_designation and tries
   again.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int cs_id = CHARSET_ID (charset);                                   \
                                                                        \
    if (cs_id == charset_jisx0208                                       \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        cs_id = charset_jisx0208_1978;                                  \
        charset = CHARSET_FROM_ID (cs_id);                              \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)) \
          {                                                             \
            EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                  \
          }                                                             \
        else                                                            \
          {                                                             \
            EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);            \
          }                                                             \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 0))                \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
    if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 1))                \
      {                                                                 \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to either graphic plane yet: invoke it    \
       (designating it to a register first if need be), then go round   \
       again to emit the character itself.  */                          \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
4136
05e6f5dc 4137
df7492f9
KH
/* Encode character C of CHARSET: obtain its code point with
   CODING_ENCODE_CHAR, then emit it through the DIMENSION1 macro when
   CHARSET is one-dimensional, or through the DIMENSION2 macro with the
   code split into its high part (code >> 8) and low byte otherwise.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                \
  do {                                                                  \
    unsigned code;                                                      \
                                                                        \
    CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);    \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        ENCODE_ISO_CHARACTER_DIMENSION2 ((charset),                     \
                                         code >> 8, code & 0xFF);       \
      }                                                                 \
    else                                                                \
      {                                                                 \
        ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);              \
      }                                                                 \
  } while (0)
bdd9fb48 4148
05e6f5dc 4149
4ed46869 4150/* Produce designation and invocation codes at a place pointed by DST
df7492f9 4151 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
4ed46869
KH
4152 Return new DST. */
4153
e2f1bab9 4154static unsigned char *
cf84bb53
JB
4155encode_invocation_designation (struct charset *charset,
4156 struct coding_system *coding,
d311d28c 4157 unsigned char *dst, ptrdiff_t *p_nchars)
4ed46869 4158{
f10fe38f 4159 bool multibytep = coding->dst_multibyte;
d311d28c 4160 ptrdiff_t produced_chars = *p_nchars;
4ed46869 4161 int reg; /* graphic register number */
df7492f9 4162 int id = CHARSET_ID (charset);
4ed46869
KH
4163
4164 /* At first, check designations. */
4165 for (reg = 0; reg < 4; reg++)
df7492f9 4166 if (id == CODING_ISO_DESIGNATION (coding, reg))
4ed46869
KH
4167 break;
4168
4169 if (reg >= 4)
4170 {
4171 /* CHARSET is not yet designated to any graphic registers. */
4172 /* At first check the requested designation. */
df7492f9
KH
4173 reg = CODING_ISO_REQUEST (coding, id);
4174 if (reg < 0)
1ba9e4ab
KH
4175 /* Since CHARSET requests no special designation, designate it
4176 to graphic register 0. */
4ed46869
KH
4177 reg = 0;
4178
4179 ENCODE_DESIGNATION (charset, reg, coding);
4180 }
4181
df7492f9
KH
4182 if (CODING_ISO_INVOCATION (coding, 0) != reg
4183 && CODING_ISO_INVOCATION (coding, 1) != reg)
4ed46869
KH
4184 {
4185 /* Since the graphic register REG is not invoked to any graphic
4186 planes, invoke it to graphic plane 0. */
4187 switch (reg)
4188 {
4189 case 0: /* graphic register 0 */
4190 ENCODE_SHIFT_IN;
4191 break;
4192
4193 case 1: /* graphic register 1 */
4194 ENCODE_SHIFT_OUT;
4195 break;
4196
4197 case 2: /* graphic register 2 */
df7492f9 4198 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4ed46869
KH
4199 ENCODE_SINGLE_SHIFT_2;
4200 else
4201 ENCODE_LOCKING_SHIFT_2;
4202 break;
4203
4204 case 3: /* graphic register 3 */
df7492f9 4205 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4ed46869
KH
4206 ENCODE_SINGLE_SHIFT_3;
4207 else
4208 ENCODE_LOCKING_SHIFT_3;
4209 break;
4210 }
4211 }
b73bfc1c 4212
df7492f9 4213 *p_nchars = produced_chars;
4ed46869
KH
4214 return dst;
4215}
4216
4ed46869
KH
4217
/* Produce the codes that bring the graphic plane and registers back
   to their initial state: a shift-in if plane 0 does not currently
   invoke register 0, then, for each register that has an initial
   charset different from its current designation, a designation of
   that initial charset.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reset_gr;                                                       \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      {                                                                 \
        ENCODE_SHIFT_IN;                                                \
      }                                                                 \
    for (reset_gr = 0; reset_gr < 4; reset_gr++)                        \
      {                                                                 \
        int reset_initial = CODING_ISO_INITIAL (coding, reset_gr);      \
        struct charset *reset_cs;                                       \
                                                                        \
        if (reset_initial < 0                                           \
            || CODING_ISO_DESIGNATION (coding, reset_gr)                \
               == reset_initial)                                        \
          continue;                                                     \
        reset_cs = CHARSET_FROM_ID (reset_initial);                     \
        ENCODE_DESIGNATION (reset_cs, reset_gr, coding);                \
      }                                                                 \
  } while (0)
4236
df7492f9 4237
bdd9fb48 4238/* Produce designation sequences of charsets in the line started from
5eb05ea3
KH
4239 CHARBUF to a place pointed by DST, and return the number of
4240 produced bytes. DST should not directly point a buffer text area
4241 which may be relocated by char_charset call.
bdd9fb48
KH
4242
4243 If the current block ends before any end-of-line, we may fail to
d46c5b12
KH
4244 find all the necessary designations. */
4245
6e6c82a4 4246static ptrdiff_t
5eb05ea3
KH
4247encode_designation_at_bol (struct coding_system *coding,
4248 int *charbuf, int *charbuf_end,
461c2ab9 4249 unsigned char *dst)
e0e989f6 4250{
75a3b399 4251 unsigned char *orig = dst;
df7492f9 4252 struct charset *charset;
bdd9fb48
KH
4253 /* Table of charsets to be designated to each graphic register. */
4254 int r[4];
df7492f9 4255 int c, found = 0, reg;
d311d28c 4256 ptrdiff_t produced_chars = 0;
f10fe38f 4257 bool multibytep = coding->dst_multibyte;
df7492f9
KH
4258 Lisp_Object attrs;
4259 Lisp_Object charset_list;
4260
4261 attrs = CODING_ID_ATTRS (coding->id);
4262 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4263 if (EQ (charset_list, Qiso_2022))
4264 charset_list = Viso_2022_charset_list;
bdd9fb48
KH
4265
4266 for (reg = 0; reg < 4; reg++)
4267 r[reg] = -1;
4268
5eb05ea3 4269 while (charbuf < charbuf_end && found < 4)
e0e989f6 4270 {
df7492f9
KH
4271 int id;
4272
4273 c = *charbuf++;
b73bfc1c
KH
4274 if (c == '\n')
4275 break;
df7492f9
KH
4276 charset = char_charset (c, charset_list, NULL);
4277 id = CHARSET_ID (charset);
4278 reg = CODING_ISO_REQUEST (coding, id);
4279 if (reg >= 0 && r[reg] < 0)
bdd9fb48
KH
4280 {
4281 found++;
df7492f9 4282 r[reg] = id;
bdd9fb48 4283 }
bdd9fb48
KH
4284 }
4285
4286 if (found)
4287 {
4288 for (reg = 0; reg < 4; reg++)
4289 if (r[reg] >= 0
df7492f9
KH
4290 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4291 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
e0e989f6 4292 }
b73bfc1c 4293
5eb05ea3 4294 return dst - orig;
e0e989f6
KH
4295}
4296
4ed46869
KH
4297/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
4298
f10fe38f 4299static bool
971de7fb 4300encode_coding_iso_2022 (struct coding_system *coding)
4ed46869 4301{
f10fe38f 4302 bool multibytep = coding->dst_multibyte;
df7492f9
KH
4303 int *charbuf = coding->charbuf;
4304 int *charbuf_end = charbuf + coding->charbuf_used;
4305 unsigned char *dst = coding->destination + coding->produced;
4306 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4307 int safe_room = 16;
f10fe38f 4308 bool bol_designation
df7492f9
KH
4309 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4310 && CODING_ISO_BOL (coding));
d311d28c 4311 ptrdiff_t produced_chars = 0;
df7492f9 4312 Lisp_Object attrs, eol_type, charset_list;
f10fe38f 4313 bool ascii_compatible;
b73bfc1c 4314 int c;
ff0dacd7 4315 int preferred_charset_id = -1;
05e6f5dc 4316
24a73b0a 4317 CODING_GET_INFO (coding, attrs, charset_list);
0a9564cb 4318 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
24a73b0a
KH
4319 if (VECTORP (eol_type))
4320 eol_type = Qunix;
4321
004068e4 4322 setup_iso_safe_charsets (attrs);
ff0dacd7 4323 /* Charset list may have been changed. */
287c57d7 4324 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
1b3b981b 4325 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
0eecad43 4326
a552b35a
KH
4327 ascii_compatible
4328 = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4329 && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4330 | CODING_ISO_FLAG_LOCKING_SHIFT)));
bdd9fb48 4331
df7492f9 4332 while (charbuf < charbuf_end)
4ed46869 4333 {
df7492f9 4334 ASSURE_DESTINATION (safe_room);
b73bfc1c 4335
df7492f9 4336 if (bol_designation)
b73bfc1c 4337 {
bdd9fb48 4338 /* We have to produce designation sequences if any now. */
5eb05ea3
KH
4339 unsigned char desig_buf[16];
4340 int nbytes;
8f50130c 4341 ptrdiff_t offset;
5eb05ea3
KH
4342
4343 charset_map_loaded = 0;
4344 nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4345 desig_buf);
4346 if (charset_map_loaded
c1892f11 4347 && (offset = coding_change_destination (coding)))
5eb05ea3
KH
4348 {
4349 dst += offset;
4350 dst_end += offset;
4351 }
4352 memcpy (dst, desig_buf, nbytes);
4353 dst += nbytes;
df7492f9 4354 /* We are sure that designation sequences are all ASCII bytes. */
5eb05ea3
KH
4355 produced_chars += nbytes;
4356 bol_designation = 0;
4357 ASSURE_DESTINATION (safe_room);
e0e989f6
KH
4358 }
4359
df7492f9 4360 c = *charbuf++;
ec6d2bb8 4361
ff0dacd7
KH
4362 if (c < 0)
4363 {
4364 /* Handle an annotation. */
4365 switch (*charbuf)
ec6d2bb8 4366 {
ff0dacd7
KH
4367 case CODING_ANNOTATE_COMPOSITION_MASK:
4368 /* Not yet implemented. */
4369 break;
4370 case CODING_ANNOTATE_CHARSET_MASK:
cf7dfdf5 4371 preferred_charset_id = charbuf[2];
ff0dacd7
KH
4372 if (preferred_charset_id >= 0
4373 && NILP (Fmemq (make_number (preferred_charset_id),
4374 charset_list)))
4375 preferred_charset_id = -1;
4376 break;
4377 default:
1088b922 4378 emacs_abort ();
4ed46869 4379 }
ff0dacd7
KH
4380 charbuf += -c - 1;
4381 continue;
4ed46869 4382 }
ec6d2bb8 4383
b73bfc1c
KH
4384 /* Now encode the character C. */
4385 if (c < 0x20 || c == 0x7F)
4386 {
df7492f9
KH
4387 if (c == '\n'
4388 || (c == '\r' && EQ (eol_type, Qmac)))
19a8d9e0 4389 {
df7492f9
KH
4390 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4391 ENCODE_RESET_PLANE_AND_REGISTER ();
4392 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
b73bfc1c 4393 {
df7492f9
KH
4394 int i;
4395
4396 for (i = 0; i < 4; i++)
4397 CODING_ISO_DESIGNATION (coding, i)
4398 = CODING_ISO_INITIAL (coding, i);
b73bfc1c 4399 }
f10fe38f
PE
4400 bol_designation = ((CODING_ISO_FLAGS (coding)
4401 & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4402 != 0);
19a8d9e0 4403 }
df7492f9
KH
4404 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4405 ENCODE_RESET_PLANE_AND_REGISTER ();
4406 EMIT_ONE_ASCII_BYTE (c);
4ed46869 4407 }
df7492f9 4408 else if (ASCII_CHAR_P (c))
88993dfd 4409 {
df7492f9
KH
4410 if (ascii_compatible)
4411 EMIT_ONE_ASCII_BYTE (c);
93dec019 4412 else
19a8d9e0 4413 {
bf16eb23
KH
4414 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4415 ENCODE_ISO_CHARACTER (charset, c);
19a8d9e0 4416 }
4ed46869 4417 }
16eafb5d 4418 else if (CHAR_BYTE8_P (c))
88993dfd 4419 {
16eafb5d
KH
4420 c = CHAR_TO_BYTE8 (c);
4421 EMIT_ONE_BYTE (c);
88993dfd 4422 }
b73bfc1c 4423 else
df7492f9 4424 {
ff0dacd7 4425 struct charset *charset;
b73bfc1c 4426
ff0dacd7
KH
4427 if (preferred_charset_id >= 0)
4428 {
f10fe38f 4429 bool result;
5eb05ea3 4430
ff0dacd7 4431 charset = CHARSET_FROM_ID (preferred_charset_id);
5eb05ea3
KH
4432 CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4433 if (! result)
4434 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4435 NULL, charset);
ff0dacd7
KH
4436 }
4437 else
5eb05ea3
KH
4438 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4439 NULL, charset);
df7492f9
KH
4440 if (!charset)
4441 {
41cbe562
KH
4442 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4443 {
4444 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4445 charset = CHARSET_FROM_ID (charset_ascii);
4446 }
4447 else
4448 {
4449 c = coding->default_char;
5eb05ea3
KH
4450 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4451 charset_list, NULL, charset);
41cbe562 4452 }
df7492f9
KH
4453 }
4454 ENCODE_ISO_CHARACTER (charset, c);
4455 }
84fbb8a0 4456 }
b73bfc1c 4457
df7492f9
KH
4458 if (coding->mode & CODING_MODE_LAST_BLOCK
4459 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4460 {
4461 ASSURE_DESTINATION (safe_room);
4462 ENCODE_RESET_PLANE_AND_REGISTER ();
4463 }
065e3595 4464 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4465 CODING_ISO_BOL (coding) = bol_designation;
4466 coding->produced_char += produced_chars;
4467 coding->produced = dst - coding->destination;
4468 return 0;
4ed46869
KH
4469}
4470
4471\f
/*** 8. SJIS and BIG5 handlers ***/
4ed46869 4473
df7492f9 4474/* Although SJIS and BIG5 are not ISO's coding system, they are used
4ed46869
KH
4475 quite widely. So, for the moment, Emacs supports them in the bare
4476 C code. But, in the future, they may be supported only by CCL. */
4477
4478/* SJIS is a coding system encoding three character sets: ASCII, right
4479 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
4480 as is. A character of charset katakana-jisx0201 is encoded by
4481 "position-code + 0x80". A character of charset japanese-jisx0208
4482 is encoded in 2-byte but two position-codes are divided and shifted
   so that it fits in the range below.
4ed46869
KH
4484
4485 --- CODE RANGE of SJIS ---
4486 (character set) (range)
4487 ASCII 0x00 .. 0x7F
df7492f9 4488 KATAKANA-JISX0201 0xA0 .. 0xDF
c28a9453 4489 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
d14d03ac 4490 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4ed46869
KH
4491 -------------------------------
4492
4493*/
4494
4495/* BIG5 is a coding system encoding two character sets: ASCII and
4496 Big5. An ASCII character is encoded as is. Big5 is a two-byte
df7492f9 4497 character set and is encoded in two-byte.
4ed46869
KH
4498
4499 --- CODE RANGE of BIG5 ---
4500 (character set) (range)
4501 ASCII 0x00 .. 0x7F
4502 Big5 (1st byte) 0xA1 .. 0xFE
4503 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
4504 --------------------------
4505
df7492f9 4506 */
4ed46869
KH
4507
4508/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
f10fe38f 4509 Return true if a text is encoded in SJIS. */
4ed46869 4510
f10fe38f 4511static bool
cf84bb53
JB
4512detect_coding_sjis (struct coding_system *coding,
4513 struct coding_detection_info *detect_info)
4ed46869 4514{
065e3595 4515 const unsigned char *src = coding->source, *src_base;
8f924df7 4516 const unsigned char *src_end = coding->source + coding->src_bytes;
f10fe38f 4517 bool multibytep = coding->src_multibyte;
d311d28c 4518 ptrdiff_t consumed_chars = 0;
df7492f9 4519 int found = 0;
b73bfc1c 4520 int c;
f07190ca
KH
4521 Lisp_Object attrs, charset_list;
4522 int max_first_byte_of_2_byte_code;
4523
4524 CODING_GET_INFO (coding, attrs, charset_list);
4525 max_first_byte_of_2_byte_code
4526 = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
df7492f9 4527
ff0dacd7 4528 detect_info->checked |= CATEGORY_MASK_SJIS;
df7492f9
KH
4529 /* A coding system of this category is always ASCII compatible. */
4530 src += coding->head_ascii;
4ed46869 4531
b73bfc1c 4532 while (1)
4ed46869 4533 {
065e3595 4534 src_base = src;
df7492f9 4535 ONE_MORE_BYTE (c);
682169fe
KH
4536 if (c < 0x80)
4537 continue;
f07190ca
KH
4538 if ((c >= 0x81 && c <= 0x9F)
4539 || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4ed46869 4540 {
df7492f9 4541 ONE_MORE_BYTE (c);
682169fe 4542 if (c < 0x40 || c == 0x7F || c > 0xFC)
df7492f9 4543 break;
ff0dacd7 4544 found = CATEGORY_MASK_SJIS;
4ed46869 4545 }
df7492f9 4546 else if (c >= 0xA0 && c < 0xE0)
ff0dacd7 4547 found = CATEGORY_MASK_SJIS;
df7492f9
KH
4548 else
4549 break;
4ed46869 4550 }
ff0dacd7 4551 detect_info->rejected |= CATEGORY_MASK_SJIS;
df7492f9
KH
4552 return 0;
4553
4554 no_more_source:
065e3595 4555 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
89528eb3 4556 {
ff0dacd7 4557 detect_info->rejected |= CATEGORY_MASK_SJIS;
89528eb3 4558 return 0;
4ed46869 4559 }
ff0dacd7
KH
4560 detect_info->found |= found;
4561 return 1;
4ed46869
KH
4562}
4563
4564/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
f10fe38f 4565 Return true if a text is encoded in BIG5. */
4ed46869 4566
f10fe38f 4567static bool
cf84bb53
JB
4568detect_coding_big5 (struct coding_system *coding,
4569 struct coding_detection_info *detect_info)
4ed46869 4570{
065e3595 4571 const unsigned char *src = coding->source, *src_base;
8f924df7 4572 const unsigned char *src_end = coding->source + coding->src_bytes;
f10fe38f 4573 bool multibytep = coding->src_multibyte;
d311d28c 4574 ptrdiff_t consumed_chars = 0;
df7492f9 4575 int found = 0;
b73bfc1c 4576 int c;
fa42c37f 4577
ff0dacd7 4578 detect_info->checked |= CATEGORY_MASK_BIG5;
df7492f9
KH
4579 /* A coding system of this category is always ASCII compatible. */
4580 src += coding->head_ascii;
fa42c37f 4581
b73bfc1c 4582 while (1)
fa42c37f 4583 {
065e3595 4584 src_base = src;
df7492f9
KH
4585 ONE_MORE_BYTE (c);
4586 if (c < 0x80)
fa42c37f 4587 continue;
df7492f9 4588 if (c >= 0xA1)
fa42c37f 4589 {
df7492f9
KH
4590 ONE_MORE_BYTE (c);
4591 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
fa42c37f 4592 return 0;
ff0dacd7 4593 found = CATEGORY_MASK_BIG5;
fa42c37f 4594 }
df7492f9
KH
4595 else
4596 break;
fa42c37f 4597 }
ff0dacd7 4598 detect_info->rejected |= CATEGORY_MASK_BIG5;
fa42c37f 4599 return 0;
fa42c37f 4600
df7492f9 4601 no_more_source:
065e3595 4602 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
89528eb3 4603 {
ff0dacd7 4604 detect_info->rejected |= CATEGORY_MASK_BIG5;
89528eb3
KH
4605 return 0;
4606 }
ff0dacd7
KH
4607 detect_info->found |= found;
4608 return 1;
fa42c37f
KH
4609}
4610
f10fe38f 4611/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
fa42c37f 4612
b73bfc1c 4613static void
971de7fb 4614decode_coding_sjis (struct coding_system *coding)
4ed46869 4615{
8f924df7
KH
4616 const unsigned char *src = coding->source + coding->consumed;
4617 const unsigned char *src_end = coding->source + coding->src_bytes;
4618 const unsigned char *src_base;
69a80ea3 4619 int *charbuf = coding->charbuf + coding->charbuf_used;
ad1746f5 4620 /* We may produce one charset annotation in one loop and one more at
df80c7f0 4621 the end. */
69a80ea3 4622 int *charbuf_end
df80c7f0 4623 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
d311d28c 4624 ptrdiff_t consumed_chars = 0, consumed_chars_base;
f10fe38f 4625 bool multibytep = coding->src_multibyte;
df7492f9 4626 struct charset *charset_roman, *charset_kanji, *charset_kana;
57a47f8a 4627 struct charset *charset_kanji2;
24a73b0a 4628 Lisp_Object attrs, charset_list, val;
d311d28c
PE
4629 ptrdiff_t char_offset = coding->produced_char;
4630 ptrdiff_t last_offset = char_offset;
ff0dacd7 4631 int last_id = charset_ascii;
f10fe38f
PE
4632 bool eol_dos
4633 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 4634 int byte_after_cr = -1;
a5d301df 4635
24a73b0a 4636 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4637
4638 val = charset_list;
4639 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
89528eb3 4640 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
57a47f8a
KH
4641 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4642 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
fa42c37f 4643
b73bfc1c 4644 while (1)
4ed46869 4645 {
df7492f9 4646 int c, c1;
24a73b0a 4647 struct charset *charset;
fa42c37f 4648
b73bfc1c 4649 src_base = src;
df7492f9 4650 consumed_chars_base = consumed_chars;
fa42c37f 4651
df7492f9 4652 if (charbuf >= charbuf_end)
b71f6f73
KH
4653 {
4654 if (byte_after_cr >= 0)
4655 src_base--;
4656 break;
4657 }
df7492f9 4658
119852e7
KH
4659 if (byte_after_cr >= 0)
4660 c = byte_after_cr, byte_after_cr = -1;
4661 else
4662 ONE_MORE_BYTE (c);
065e3595
KH
4663 if (c < 0)
4664 goto invalid_code;
24a73b0a 4665 if (c < 0x80)
119852e7 4666 {
2735d060 4667 if (eol_dos && c == '\r')
119852e7
KH
4668 ONE_MORE_BYTE (byte_after_cr);
4669 charset = charset_roman;
4670 }
57a47f8a 4671 else if (c == 0x80 || c == 0xA0)
8e921c4b 4672 goto invalid_code;
57a47f8a
KH
4673 else if (c >= 0xA1 && c <= 0xDF)
4674 {
4675 /* SJIS -> JISX0201-Kana */
4676 c &= 0x7F;
4677 charset = charset_kana;
4678 }
4679 else if (c <= 0xEF)
df7492f9 4680 {
57a47f8a
KH
4681 /* SJIS -> JISX0208 */
4682 ONE_MORE_BYTE (c1);
4683 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
24a73b0a 4684 goto invalid_code;
57a47f8a
KH
4685 c = (c << 8) | c1;
4686 SJIS_TO_JIS (c);
4687 charset = charset_kanji;
4688 }
4689 else if (c <= 0xFC && charset_kanji2)
4690 {
c6876370 4691 /* SJIS -> JISX0213-2 */
57a47f8a
KH
4692 ONE_MORE_BYTE (c1);
4693 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
24a73b0a 4694 goto invalid_code;
57a47f8a
KH
4695 c = (c << 8) | c1;
4696 SJIS_TO_JIS2 (c);
4697 charset = charset_kanji2;
df7492f9 4698 }
57a47f8a
KH
4699 else
4700 goto invalid_code;
24a73b0a
KH
4701 if (charset->id != charset_ascii
4702 && last_id != charset->id)
4703 {
4704 if (last_id != charset_ascii)
69a80ea3 4705 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
24a73b0a
KH
4706 last_id = charset->id;
4707 last_offset = char_offset;
4708 }
4709 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
df7492f9 4710 *charbuf++ = c;
ff0dacd7 4711 char_offset++;
df7492f9 4712 continue;
b73bfc1c 4713
df7492f9
KH
4714 invalid_code:
4715 src = src_base;
4716 consumed_chars = consumed_chars_base;
4717 ONE_MORE_BYTE (c);
065e3595 4718 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
ff0dacd7 4719 char_offset++;
df7492f9
KH
4720 coding->errors++;
4721 }
fa42c37f 4722
df7492f9 4723 no_more_source:
ff0dacd7 4724 if (last_id != charset_ascii)
69a80ea3 4725 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
4726 coding->consumed_char += consumed_chars_base;
4727 coding->consumed = src_base - coding->source;
4728 coding->charbuf_used = charbuf - coding->charbuf;
fa42c37f
KH
4729}
4730
b73bfc1c 4731static void
971de7fb 4732decode_coding_big5 (struct coding_system *coding)
4ed46869 4733{
8f924df7
KH
4734 const unsigned char *src = coding->source + coding->consumed;
4735 const unsigned char *src_end = coding->source + coding->src_bytes;
4736 const unsigned char *src_base;
69a80ea3 4737 int *charbuf = coding->charbuf + coding->charbuf_used;
ad1746f5 4738 /* We may produce one charset annotation in one loop and one more at
df80c7f0 4739 the end. */
69a80ea3 4740 int *charbuf_end
df80c7f0 4741 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
d311d28c 4742 ptrdiff_t consumed_chars = 0, consumed_chars_base;
f10fe38f 4743 bool multibytep = coding->src_multibyte;
df7492f9 4744 struct charset *charset_roman, *charset_big5;
24a73b0a 4745 Lisp_Object attrs, charset_list, val;
d311d28c
PE
4746 ptrdiff_t char_offset = coding->produced_char;
4747 ptrdiff_t last_offset = char_offset;
ff0dacd7 4748 int last_id = charset_ascii;
f10fe38f
PE
4749 bool eol_dos
4750 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 4751 int byte_after_cr = -1;
df7492f9 4752
24a73b0a 4753 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4754 val = charset_list;
4755 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4756 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4ed46869 4757
b73bfc1c 4758 while (1)
4ed46869 4759 {
df7492f9 4760 int c, c1;
24a73b0a 4761 struct charset *charset;
b73bfc1c
KH
4762
4763 src_base = src;
df7492f9
KH
4764 consumed_chars_base = consumed_chars;
4765
4766 if (charbuf >= charbuf_end)
b71f6f73
KH
4767 {
4768 if (byte_after_cr >= 0)
4769 src_base--;
4770 break;
4771 }
df7492f9 4772
119852e7 4773 if (byte_after_cr >= 0)
14daee73 4774 c = byte_after_cr, byte_after_cr = -1;
119852e7
KH
4775 else
4776 ONE_MORE_BYTE (c);
b73bfc1c 4777
065e3595
KH
4778 if (c < 0)
4779 goto invalid_code;
24a73b0a 4780 if (c < 0x80)
119852e7 4781 {
2735d060 4782 if (eol_dos && c == '\r')
119852e7
KH
4783 ONE_MORE_BYTE (byte_after_cr);
4784 charset = charset_roman;
4785 }
24a73b0a 4786 else
4ed46869 4787 {
24a73b0a
KH
4788 /* BIG5 -> Big5 */
4789 if (c < 0xA1 || c > 0xFE)
4790 goto invalid_code;
4791 ONE_MORE_BYTE (c1);
4792 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4793 goto invalid_code;
4794 c = c << 8 | c1;
4795 charset = charset_big5;
4ed46869 4796 }
24a73b0a
KH
4797 if (charset->id != charset_ascii
4798 && last_id != charset->id)
df7492f9 4799 {
24a73b0a 4800 if (last_id != charset_ascii)
69a80ea3 4801 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
24a73b0a
KH
4802 last_id = charset->id;
4803 last_offset = char_offset;
4ed46869 4804 }
24a73b0a 4805 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
df7492f9 4806 *charbuf++ = c;
ff0dacd7 4807 char_offset++;
fb88bf2d
KH
4808 continue;
4809
df7492f9 4810 invalid_code:
4ed46869 4811 src = src_base;
df7492f9
KH
4812 consumed_chars = consumed_chars_base;
4813 ONE_MORE_BYTE (c);
065e3595 4814 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
ff0dacd7 4815 char_offset++;
df7492f9 4816 coding->errors++;
fb88bf2d 4817 }
d46c5b12 4818
df7492f9 4819 no_more_source:
ff0dacd7 4820 if (last_id != charset_ascii)
69a80ea3 4821 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
4822 coding->consumed_char += consumed_chars_base;
4823 coding->consumed = src_base - coding->source;
4824 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
4825}
4826
4827/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
b73bfc1c
KH
4828 This function can encode charsets `ascii', `katakana-jisx0201',
4829 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4830 are sure that all these charsets are registered as official charset
4ed46869 4831 (i.e. do not have extended leading-codes). Characters of other
f10fe38f 4832 charsets are produced without any encoding. */
4ed46869 4833
f10fe38f 4834static bool
971de7fb 4835encode_coding_sjis (struct coding_system *coding)
4ed46869 4836{
f10fe38f 4837 bool multibytep = coding->dst_multibyte;
df7492f9
KH
4838 int *charbuf = coding->charbuf;
4839 int *charbuf_end = charbuf + coding->charbuf_used;
4840 unsigned char *dst = coding->destination + coding->produced;
4841 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4842 int safe_room = 4;
d311d28c 4843 ptrdiff_t produced_chars = 0;
24a73b0a 4844 Lisp_Object attrs, charset_list, val;
f10fe38f 4845 bool ascii_compatible;
66ebf983 4846 struct charset *charset_kanji, *charset_kana;
57a47f8a 4847 struct charset *charset_kanji2;
df7492f9 4848 int c;
a5d301df 4849
24a73b0a 4850 CODING_GET_INFO (coding, attrs, charset_list);
66ebf983 4851 val = XCDR (charset_list);
df7492f9 4852 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
57a47f8a
KH
4853 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4854 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4ed46869 4855
df7492f9 4856 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
93dec019 4857
df7492f9
KH
4858 while (charbuf < charbuf_end)
4859 {
4860 ASSURE_DESTINATION (safe_room);
4861 c = *charbuf++;
b73bfc1c 4862 /* Now encode the character C. */
df7492f9
KH
4863 if (ASCII_CHAR_P (c) && ascii_compatible)
4864 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
4865 else if (CHAR_BYTE8_P (c))
4866 {
4867 c = CHAR_TO_BYTE8 (c);
4868 EMIT_ONE_BYTE (c);
4869 }
df7492f9 4870 else
b73bfc1c 4871 {
df7492f9 4872 unsigned code;
5eb05ea3
KH
4873 struct charset *charset;
4874 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4875 &code, charset);
df7492f9
KH
4876
4877 if (!charset)
4ed46869 4878 {
41cbe562 4879 if (coding->mode & CODING_MODE_SAFE_ENCODING)
b73bfc1c 4880 {
41cbe562
KH
4881 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4882 charset = CHARSET_FROM_ID (charset_ascii);
b73bfc1c 4883 }
41cbe562 4884 else
b73bfc1c 4885 {
41cbe562 4886 c = coding->default_char;
5eb05ea3
KH
4887 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4888 charset_list, &code, charset);
b73bfc1c 4889 }
b73bfc1c 4890 }
df7492f9 4891 if (code == CHARSET_INVALID_CODE (charset))
1088b922 4892 emacs_abort ();
df7492f9
KH
4893 if (charset == charset_kanji)
4894 {
4895 int c1, c2;
4896 JIS_TO_SJIS (code);
4897 c1 = code >> 8, c2 = code & 0xFF;
4898 EMIT_TWO_BYTES (c1, c2);
4899 }
4900 else if (charset == charset_kana)
4901 EMIT_ONE_BYTE (code | 0x80);
57a47f8a
KH
4902 else if (charset_kanji2 && charset == charset_kanji2)
4903 {
4904 int c1, c2;
4905
4906 c1 = code >> 8;
f07190ca
KH
4907 if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4908 || c1 == 0x28
57a47f8a
KH
4909 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4910 {
4911 JIS_TO_SJIS2 (code);
4912 c1 = code >> 8, c2 = code & 0xFF;
4913 EMIT_TWO_BYTES (c1, c2);
4914 }
4915 else
4916 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4917 }
df7492f9
KH
4918 else
4919 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4920 }
4921 }
065e3595 4922 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4923 coding->produced_char += produced_chars;
4924 coding->produced = dst - coding->destination;
4925 return 0;
4926}
4927
f10fe38f 4928static bool
971de7fb 4929encode_coding_big5 (struct coding_system *coding)
df7492f9 4930{
f10fe38f 4931 bool multibytep = coding->dst_multibyte;
df7492f9
KH
4932 int *charbuf = coding->charbuf;
4933 int *charbuf_end = charbuf + coding->charbuf_used;
4934 unsigned char *dst = coding->destination + coding->produced;
4935 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4936 int safe_room = 4;
d311d28c 4937 ptrdiff_t produced_chars = 0;
24a73b0a 4938 Lisp_Object attrs, charset_list, val;
f10fe38f 4939 bool ascii_compatible;
66ebf983 4940 struct charset *charset_big5;
df7492f9
KH
4941 int c;
4942
24a73b0a 4943 CODING_GET_INFO (coding, attrs, charset_list);
66ebf983 4944 val = XCDR (charset_list);
df7492f9
KH
4945 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4946 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4947
4948 while (charbuf < charbuf_end)
4949 {
4950 ASSURE_DESTINATION (safe_room);
4951 c = *charbuf++;
4952 /* Now encode the character C. */
4953 if (ASCII_CHAR_P (c) && ascii_compatible)
4954 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
4955 else if (CHAR_BYTE8_P (c))
4956 {
4957 c = CHAR_TO_BYTE8 (c);
4958 EMIT_ONE_BYTE (c);
b73bfc1c
KH
4959 }
4960 else
4961 {
df7492f9 4962 unsigned code;
5eb05ea3
KH
4963 struct charset *charset;
4964 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4965 &code, charset);
df7492f9
KH
4966
4967 if (! charset)
b73bfc1c 4968 {
41cbe562 4969 if (coding->mode & CODING_MODE_SAFE_ENCODING)
b73bfc1c 4970 {
41cbe562
KH
4971 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4972 charset = CHARSET_FROM_ID (charset_ascii);
b73bfc1c 4973 }
41cbe562 4974 else
0eecad43 4975 {
41cbe562 4976 c = coding->default_char;
5eb05ea3
KH
4977 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4978 charset_list, &code, charset);
0eecad43 4979 }
4ed46869 4980 }
df7492f9 4981 if (code == CHARSET_INVALID_CODE (charset))
1088b922 4982 emacs_abort ();
df7492f9 4983 if (charset == charset_big5)
b73bfc1c 4984 {
df7492f9
KH
4985 int c1, c2;
4986
4987 c1 = code >> 8, c2 = code & 0xFF;
4988 EMIT_TWO_BYTES (c1, c2);
b73bfc1c 4989 }
df7492f9
KH
4990 else
4991 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4ed46869 4992 }
4ed46869 4993 }
065e3595 4994 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4995 coding->produced_char += produced_chars;
4996 coding->produced = dst - coding->destination;
4997 return 0;
4ed46869
KH
4998}
4999
5000\f
df7492f9 5001/*** 10. CCL handlers ***/
1397dc18
KH
5002
5003/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
f10fe38f
PE
5004 Return true if a text is encoded in a coding system of which
5005 encoder/decoder are written in CCL program. */
1397dc18 5006
f10fe38f 5007static bool
cf84bb53
JB
5008detect_coding_ccl (struct coding_system *coding,
5009 struct coding_detection_info *detect_info)
1397dc18 5010{
065e3595 5011 const unsigned char *src = coding->source, *src_base;
8f924df7 5012 const unsigned char *src_end = coding->source + coding->src_bytes;
f10fe38f 5013 bool multibytep = coding->src_multibyte;
d311d28c 5014 ptrdiff_t consumed_chars = 0;
df7492f9 5015 int found = 0;
0e219d54 5016 unsigned char *valids;
d311d28c 5017 ptrdiff_t head_ascii = coding->head_ascii;
df7492f9
KH
5018 Lisp_Object attrs;
5019
ff0dacd7
KH
5020 detect_info->checked |= CATEGORY_MASK_CCL;
5021
df7492f9 5022 coding = &coding_categories[coding_category_ccl];
0e219d54 5023 valids = CODING_CCL_VALIDS (coding);
df7492f9
KH
5024 attrs = CODING_ID_ATTRS (coding->id);
5025 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5026 src += head_ascii;
1397dc18 5027
b73bfc1c 5028 while (1)
1397dc18 5029 {
df7492f9 5030 int c;
065e3595
KH
5031
5032 src_base = src;
df7492f9 5033 ONE_MORE_BYTE (c);
065e3595 5034 if (c < 0 || ! valids[c])
df7492f9 5035 break;
ff0dacd7
KH
5036 if ((valids[c] > 1))
5037 found = CATEGORY_MASK_CCL;
df7492f9 5038 }
ff0dacd7 5039 detect_info->rejected |= CATEGORY_MASK_CCL;
df7492f9
KH
5040 return 0;
5041
5042 no_more_source:
ff0dacd7
KH
5043 detect_info->found |= found;
5044 return 1;
df7492f9
KH
5045}
5046
5047static void
971de7fb 5048decode_coding_ccl (struct coding_system *coding)
df7492f9 5049{
7c78e542 5050 const unsigned char *src = coding->source + coding->consumed;
8f924df7 5051 const unsigned char *src_end = coding->source + coding->src_bytes;
69a80ea3
KH
5052 int *charbuf = coding->charbuf + coding->charbuf_used;
5053 int *charbuf_end = coding->charbuf + coding->charbuf_size;
d311d28c 5054 ptrdiff_t consumed_chars = 0;
f10fe38f 5055 bool multibytep = coding->src_multibyte;
d0396581 5056 struct ccl_program *ccl = &coding->spec.ccl->ccl;
df7492f9 5057 int source_charbuf[1024];
fbdc1721 5058 int source_byteidx[1025];
24a73b0a 5059 Lisp_Object attrs, charset_list;
df7492f9 5060
24a73b0a 5061 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9 5062
d0396581 5063 while (1)
df7492f9 5064 {
7c78e542 5065 const unsigned char *p = src;
95402d5f 5066 ptrdiff_t offset;
df7492f9
KH
5067 int i = 0;
5068
5069 if (multibytep)
fbdc1721
KH
5070 {
5071 while (i < 1024 && p < src_end)
5072 {
5073 source_byteidx[i] = p - src;
5074 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5075 }
5076 source_byteidx[i] = p - src;
5077 }
df7492f9
KH
5078 else
5079 while (i < 1024 && p < src_end)
5080 source_charbuf[i++] = *p++;
8f924df7 5081
df7492f9 5082 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
d0396581 5083 ccl->last_block = 1;
95402d5f
KH
5084 /* As ccl_driver calls DECODE_CHAR, buffer may be relocated. */
5085 charset_map_loaded = 0;
d0396581
KH
5086 ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5087 charset_list);
95402d5f
KH
5088 if (charset_map_loaded
5089 && (offset = coding_change_source (coding)))
5090 {
5091 p += offset;
5092 src += offset;
5093 src_end += offset;
5094 }
d0396581 5095 charbuf += ccl->produced;
fbdc1721 5096 if (multibytep)
d0396581 5097 src += source_byteidx[ccl->consumed];
df7492f9 5098 else
d0396581
KH
5099 src += ccl->consumed;
5100 consumed_chars += ccl->consumed;
5101 if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
df7492f9
KH
5102 break;
5103 }
5104
d0396581 5105 switch (ccl->status)
df7492f9
KH
5106 {
5107 case CCL_STAT_SUSPEND_BY_SRC:
065e3595 5108 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
df7492f9
KH
5109 break;
5110 case CCL_STAT_SUSPEND_BY_DST:
d0396581 5111 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
df7492f9
KH
5112 break;
5113 case CCL_STAT_QUIT:
5114 case CCL_STAT_INVALID_CMD:
065e3595 5115 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
df7492f9
KH
5116 break;
5117 default:
065e3595 5118 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
5119 break;
5120 }
5121 coding->consumed_char += consumed_chars;
5122 coding->consumed = src - coding->source;
5123 coding->charbuf_used = charbuf - coding->charbuf;
5124}
5125
f10fe38f 5126static bool
971de7fb 5127encode_coding_ccl (struct coding_system *coding)
df7492f9 5128{
fb608df3 5129 struct ccl_program *ccl = &coding->spec.ccl->ccl;
f10fe38f 5130 bool multibytep = coding->dst_multibyte;
df7492f9
KH
5131 int *charbuf = coding->charbuf;
5132 int *charbuf_end = charbuf + coding->charbuf_used;
5133 unsigned char *dst = coding->destination + coding->produced;
5134 unsigned char *dst_end = coding->destination + coding->dst_bytes;
df7492f9 5135 int destination_charbuf[1024];
d311d28c 5136 ptrdiff_t produced_chars = 0;
a53e2e89 5137 int i;
24a73b0a 5138 Lisp_Object attrs, charset_list;
df7492f9 5139
24a73b0a 5140 CODING_GET_INFO (coding, attrs, charset_list);
fb608df3
KH
5141 if (coding->consumed_char == coding->src_chars
5142 && coding->mode & CODING_MODE_LAST_BLOCK)
5143 ccl->last_block = 1;
df7492f9 5144
76470ad1 5145 do
df7492f9 5146 {
95402d5f
KH
5147 ptrdiff_t offset;
5148
5149 /* As ccl_driver calls DECODE_CHAR, buffer may be relocated. */
5150 charset_map_loaded = 0;
fb608df3 5151 ccl_driver (ccl, charbuf, destination_charbuf,
8cffd3e7 5152 charbuf_end - charbuf, 1024, charset_list);
95402d5f
KH
5153 if (charset_map_loaded
5154 && (offset = coding_change_destination (coding)))
5155 dst += offset;
df7492f9 5156 if (multibytep)
8cffd3e7 5157 {
fb608df3
KH
5158 ASSURE_DESTINATION (ccl->produced * 2);
5159 for (i = 0; i < ccl->produced; i++)
8cffd3e7
KH
5160 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5161 }
df7492f9
KH
5162 else
5163 {
fb608df3
KH
5164 ASSURE_DESTINATION (ccl->produced);
5165 for (i = 0; i < ccl->produced; i++)
df7492f9 5166 *dst++ = destination_charbuf[i] & 0xFF;
fb608df3 5167 produced_chars += ccl->produced;
df7492f9 5168 }
fb608df3
KH
5169 charbuf += ccl->consumed;
5170 if (ccl->status == CCL_STAT_QUIT
5171 || ccl->status == CCL_STAT_INVALID_CMD)
8cffd3e7 5172 break;
df7492f9 5173 }
76470ad1 5174 while (charbuf < charbuf_end);
df7492f9 5175
fb608df3 5176 switch (ccl->status)
df7492f9
KH
5177 {
5178 case CCL_STAT_SUSPEND_BY_SRC:
065e3595 5179 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
df7492f9
KH
5180 break;
5181 case CCL_STAT_SUSPEND_BY_DST:
065e3595 5182 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
df7492f9
KH
5183 break;
5184 case CCL_STAT_QUIT:
5185 case CCL_STAT_INVALID_CMD:
065e3595 5186 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
df7492f9
KH
5187 break;
5188 default:
065e3595 5189 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 5190 break;
1397dc18 5191 }
df7492f9
KH
5192
5193 coding->produced_char += produced_chars;
5194 coding->produced = dst - coding->destination;
5195 return 0;
1397dc18
KH
5196}
5197
5198\f
df7492f9 5199/*** 10, 11. no-conversion handlers ***/
4ed46869 5200
b73bfc1c 5201/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4ed46869 5202
b73bfc1c 5203static void
971de7fb 5204decode_coding_raw_text (struct coding_system *coding)
4ed46869 5205{
f10fe38f
PE
5206 bool eol_dos
5207 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 5208
df7492f9 5209 coding->chars_at_source = 1;
119852e7
KH
5210 coding->consumed_char = coding->src_chars;
5211 coding->consumed = coding->src_bytes;
2735d060 5212 if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
119852e7
KH
5213 {
5214 coding->consumed_char--;
5215 coding->consumed--;
5216 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5217 }
5218 else
5219 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 5220}
4ed46869 5221
f10fe38f 5222static bool
971de7fb 5223encode_coding_raw_text (struct coding_system *coding)
df7492f9 5224{
f10fe38f 5225 bool multibytep = coding->dst_multibyte;
df7492f9
KH
5226 int *charbuf = coding->charbuf;
5227 int *charbuf_end = coding->charbuf + coding->charbuf_used;
5228 unsigned char *dst = coding->destination + coding->produced;
5229 unsigned char *dst_end = coding->destination + coding->dst_bytes;
d311d28c 5230 ptrdiff_t produced_chars = 0;
b73bfc1c
KH
5231 int c;
5232
df7492f9 5233 if (multibytep)
b73bfc1c 5234 {
df7492f9 5235 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4ed46869 5236
df7492f9
KH
5237 if (coding->src_multibyte)
5238 while (charbuf < charbuf_end)
5239 {
5240 ASSURE_DESTINATION (safe_room);
5241 c = *charbuf++;
5242 if (ASCII_CHAR_P (c))
5243 EMIT_ONE_ASCII_BYTE (c);
5244 else if (CHAR_BYTE8_P (c))
5245 {
5246 c = CHAR_TO_BYTE8 (c);
5247 EMIT_ONE_BYTE (c);
5248 }
5249 else
5250 {
5251 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
93dec019 5252
df7492f9 5253 CHAR_STRING_ADVANCE (c, p1);
8abc3f12 5254 do
9d123124
KH
5255 {
5256 EMIT_ONE_BYTE (*p0);
5257 p0++;
5258 }
8abc3f12 5259 while (p0 < p1);
df7492f9
KH
5260 }
5261 }
b73bfc1c 5262 else
df7492f9
KH
5263 while (charbuf < charbuf_end)
5264 {
5265 ASSURE_DESTINATION (safe_room);
5266 c = *charbuf++;
5267 EMIT_ONE_BYTE (c);
5268 }
5269 }
5270 else
4ed46869 5271 {
df7492f9 5272 if (coding->src_multibyte)
d46c5b12 5273 {
df7492f9
KH
5274 int safe_room = MAX_MULTIBYTE_LENGTH;
5275
5276 while (charbuf < charbuf_end)
d46c5b12 5277 {
df7492f9
KH
5278 ASSURE_DESTINATION (safe_room);
5279 c = *charbuf++;
5280 if (ASCII_CHAR_P (c))
5281 *dst++ = c;
5282 else if (CHAR_BYTE8_P (c))
5283 *dst++ = CHAR_TO_BYTE8 (c);
b73bfc1c 5284 else
df7492f9 5285 CHAR_STRING_ADVANCE (c, dst);
d46c5b12
KH
5286 }
5287 }
df7492f9
KH
5288 else
5289 {
5290 ASSURE_DESTINATION (charbuf_end - charbuf);
5291 while (charbuf < charbuf_end && dst < dst_end)
5292 *dst++ = *charbuf++;
8f924df7 5293 }
319a3947 5294 produced_chars = dst - (coding->destination + coding->produced);
4ed46869 5295 }
065e3595 5296 record_conversion_result (coding, CODING_RESULT_SUCCESS);
a0ed9b27 5297 coding->produced_char += produced_chars;
df7492f9
KH
5298 coding->produced = dst - coding->destination;
5299 return 0;
4ed46869
KH
5300}
5301
ff0dacd7 5302/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
f10fe38f 5303 Return true if a text is encoded in a charset-based coding system. */
ff0dacd7 5304
f10fe38f 5305static bool
cf84bb53
JB
5306detect_coding_charset (struct coding_system *coding,
5307 struct coding_detection_info *detect_info)
1397dc18 5308{
065e3595 5309 const unsigned char *src = coding->source, *src_base;
8f924df7 5310 const unsigned char *src_end = coding->source + coding->src_bytes;
f10fe38f 5311 bool multibytep = coding->src_multibyte;
d311d28c 5312 ptrdiff_t consumed_chars = 0;
07295713 5313 Lisp_Object attrs, valids, name;
584948ac 5314 int found = 0;
d311d28c 5315 ptrdiff_t head_ascii = coding->head_ascii;
f10fe38f 5316 bool check_latin_extra = 0;
1397dc18 5317
ff0dacd7
KH
5318 detect_info->checked |= CATEGORY_MASK_CHARSET;
5319
df7492f9
KH
5320 coding = &coding_categories[coding_category_charset];
5321 attrs = CODING_ID_ATTRS (coding->id);
5322 valids = AREF (attrs, coding_attr_charset_valids);
07295713 5323 name = CODING_ID_NAME (coding->id);
51b59d79 5324 if (strncmp (SSDATA (SYMBOL_NAME (name)),
237aabf4 5325 "iso-8859-", sizeof ("iso-8859-") - 1) == 0
51b59d79 5326 || strncmp (SSDATA (SYMBOL_NAME (name)),
237aabf4 5327 "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
07295713 5328 check_latin_extra = 1;
237aabf4 5329
df7492f9 5330 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
716b3fa0 5331 src += head_ascii;
1397dc18 5332
b73bfc1c 5333 while (1)
1397dc18 5334 {
df7492f9 5335 int c;
716b3fa0
KH
5336 Lisp_Object val;
5337 struct charset *charset;
5338 int dim, idx;
1397dc18 5339
065e3595 5340 src_base = src;
df7492f9 5341 ONE_MORE_BYTE (c);
065e3595
KH
5342 if (c < 0)
5343 continue;
716b3fa0
KH
5344 val = AREF (valids, c);
5345 if (NILP (val))
df7492f9 5346 break;
584948ac 5347 if (c >= 0x80)
07295713
KH
5348 {
5349 if (c < 0xA0
237aabf4
JR
5350 && check_latin_extra
5351 && (!VECTORP (Vlatin_extra_code_table)
28be1ada 5352 || NILP (AREF (Vlatin_extra_code_table, c))))
07295713
KH
5353 break;
5354 found = CATEGORY_MASK_CHARSET;
5355 }
716b3fa0
KH
5356 if (INTEGERP (val))
5357 {
5358 charset = CHARSET_FROM_ID (XFASTINT (val));
5359 dim = CHARSET_DIMENSION (charset);
5360 for (idx = 1; idx < dim; idx++)
5361 {
5362 if (src == src_end)
5363 goto too_short;
5364 ONE_MORE_BYTE (c);
2f9442b8
PE
5365 if (c < charset->code_space[(dim - 1 - idx) * 4]
5366 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
716b3fa0
KH
5367 break;
5368 }
5369 if (idx < dim)
5370 break;
5371 }
5372 else
5373 {
5374 idx = 1;
5375 for (; CONSP (val); val = XCDR (val))
5376 {
5377 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5378 dim = CHARSET_DIMENSION (charset);
5379 while (idx < dim)
5380 {
5381 if (src == src_end)
5382 goto too_short;
5383 ONE_MORE_BYTE (c);
5384 if (c < charset->code_space[(dim - 1 - idx) * 4]
5385 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5386 break;
5387 idx++;
5388 }
5389 if (idx == dim)
5390 {
5391 val = Qnil;
5392 break;
5393 }
5394 }
5395 if (CONSP (val))
5396 break;
5397 }
df7492f9 5398 }
716b3fa0 5399 too_short:
ff0dacd7 5400 detect_info->rejected |= CATEGORY_MASK_CHARSET;
df7492f9 5401 return 0;
4ed46869 5402
df7492f9 5403 no_more_source:
ff0dacd7
KH
5404 detect_info->found |= found;
5405 return 1;
df7492f9 5406}
b73bfc1c 5407
b73bfc1c 5408static void
971de7fb 5409decode_coding_charset (struct coding_system *coding)
4ed46869 5410{
8f924df7
KH
5411 const unsigned char *src = coding->source + coding->consumed;
5412 const unsigned char *src_end = coding->source + coding->src_bytes;
5413 const unsigned char *src_base;
69a80ea3 5414 int *charbuf = coding->charbuf + coding->charbuf_used;
ad1746f5 5415 /* We may produce one charset annotation in one loop and one more at
df80c7f0 5416 the end. */
69a80ea3 5417 int *charbuf_end
df80c7f0 5418 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
d311d28c 5419 ptrdiff_t consumed_chars = 0, consumed_chars_base;
f10fe38f 5420 bool multibytep = coding->src_multibyte;
66ebf983
PE
5421 Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5422 Lisp_Object valids;
d311d28c
PE
5423 ptrdiff_t char_offset = coding->produced_char;
5424 ptrdiff_t last_offset = char_offset;
ff0dacd7 5425 int last_id = charset_ascii;
f10fe38f
PE
5426 bool eol_dos
5427 = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 5428 int byte_after_cr = -1;
df7492f9 5429
4eb6d3f1 5430 valids = AREF (attrs, coding_attr_charset_valids);
b73bfc1c 5431
df7492f9 5432 while (1)
4ed46869 5433 {
4eb6d3f1 5434 int c;
24a73b0a
KH
5435 Lisp_Object val;
5436 struct charset *charset;
5437 int dim;
5438 int len = 1;
5439 unsigned code;
df7492f9
KH
5440
5441 src_base = src;
5442 consumed_chars_base = consumed_chars;
b73bfc1c 5443
df7492f9 5444 if (charbuf >= charbuf_end)
b71f6f73
KH
5445 {
5446 if (byte_after_cr >= 0)
5447 src_base--;
5448 break;
5449 }
df7492f9 5450
119852e7
KH
5451 if (byte_after_cr >= 0)
5452 {
5453 c = byte_after_cr;
5454 byte_after_cr = -1;
5455 }
5456 else
5457 {
5458 ONE_MORE_BYTE (c);
2735d060 5459 if (eol_dos && c == '\r')
119852e7
KH
5460 ONE_MORE_BYTE (byte_after_cr);
5461 }
065e3595
KH
5462 if (c < 0)
5463 goto invalid_code;
24a73b0a
KH
5464 code = c;
5465
5466 val = AREF (valids, c);
1b17adfd 5467 if (! INTEGERP (val) && ! CONSP (val))
24a73b0a
KH
5468 goto invalid_code;
5469 if (INTEGERP (val))
d46c5b12 5470 {
24a73b0a
KH
5471 charset = CHARSET_FROM_ID (XFASTINT (val));
5472 dim = CHARSET_DIMENSION (charset);
5473 while (len < dim)
b73bfc1c 5474 {
24a73b0a
KH
5475 ONE_MORE_BYTE (c);
5476 code = (code << 8) | c;
5477 len++;
b73bfc1c 5478 }
24a73b0a
KH
5479 CODING_DECODE_CHAR (coding, src, src_base, src_end,
5480 charset, code, c);
d46c5b12 5481 }
df7492f9 5482 else
d46c5b12 5483 {
24a73b0a
KH
5484 /* VAL is a list of charset IDs. It is assured that the
5485 list is sorted by charset dimensions (smaller one
5486 comes first). */
5487 while (CONSP (val))
4eb6d3f1 5488 {
24a73b0a 5489 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
c7c66a95 5490 dim = CHARSET_DIMENSION (charset);
f9d71dcd 5491 while (len < dim)
4eb6d3f1 5492 {
acb2a965
KH
5493 ONE_MORE_BYTE (c);
5494 code = (code << 8) | c;
f9d71dcd 5495 len++;
4eb6d3f1 5496 }
24a73b0a
KH
5497 CODING_DECODE_CHAR (coding, src, src_base,
5498 src_end, charset, code, c);
5499 if (c >= 0)
5500 break;
5501 val = XCDR (val);
ff0dacd7 5502 }
d46c5b12 5503 }
24a73b0a
KH
5504 if (c < 0)
5505 goto invalid_code;
5506 if (charset->id != charset_ascii
5507 && last_id != charset->id)
5508 {
5509 if (last_id != charset_ascii)
69a80ea3 5510 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
24a73b0a
KH
5511 last_id = charset->id;
5512 last_offset = char_offset;
5513 }
5514
df7492f9 5515 *charbuf++ = c;
ff0dacd7 5516 char_offset++;
df7492f9
KH
5517 continue;
5518
5519 invalid_code:
5520 src = src_base;
5521 consumed_chars = consumed_chars_base;
5522 ONE_MORE_BYTE (c);
065e3595 5523 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 5524 char_offset++;
df7492f9 5525 coding->errors++;
4ed46869
KH
5526 }
5527
df7492f9 5528 no_more_source:
ff0dacd7 5529 if (last_id != charset_ascii)
69a80ea3 5530 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
5531 coding->consumed_char += consumed_chars_base;
5532 coding->consumed = src_base - coding->source;
5533 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
5534}
5535
f10fe38f 5536static bool
971de7fb 5537encode_coding_charset (struct coding_system *coding)
4ed46869 5538{
f10fe38f 5539 bool multibytep = coding->dst_multibyte;
df7492f9
KH
5540 int *charbuf = coding->charbuf;
5541 int *charbuf_end = charbuf + coding->charbuf_used;
5542 unsigned char *dst = coding->destination + coding->produced;
5543 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5544 int safe_room = MAX_MULTIBYTE_LENGTH;
d311d28c 5545 ptrdiff_t produced_chars = 0;
24a73b0a 5546 Lisp_Object attrs, charset_list;
f10fe38f 5547 bool ascii_compatible;
b73bfc1c 5548 int c;
b73bfc1c 5549
24a73b0a 5550 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9 5551 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
fb88bf2d 5552
df7492f9 5553 while (charbuf < charbuf_end)
4ed46869 5554 {
4eb6d3f1 5555 struct charset *charset;
df7492f9 5556 unsigned code;
8f924df7 5557
df7492f9
KH
5558 ASSURE_DESTINATION (safe_room);
5559 c = *charbuf++;
5560 if (ascii_compatible && ASCII_CHAR_P (c))
5561 EMIT_ONE_ASCII_BYTE (c);
16eafb5d 5562 else if (CHAR_BYTE8_P (c))
4ed46869 5563 {
16eafb5d
KH
5564 c = CHAR_TO_BYTE8 (c);
5565 EMIT_ONE_BYTE (c);
d46c5b12 5566 }
d46c5b12 5567 else
b73bfc1c 5568 {
5eb05ea3
KH
5569 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5570 &code, charset);
5571
4eb6d3f1
KH
5572 if (charset)
5573 {
5574 if (CHARSET_DIMENSION (charset) == 1)
5575 EMIT_ONE_BYTE (code);
5576 else if (CHARSET_DIMENSION (charset) == 2)
5577 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5578 else if (CHARSET_DIMENSION (charset) == 3)
5579 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5580 else
5581 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5582 (code >> 8) & 0xFF, code & 0xFF);
5583 }
5584 else
41cbe562
KH
5585 {
5586 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5587 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5588 else
5589 c = coding->default_char;
5590 EMIT_ONE_BYTE (c);
5591 }
4ed46869 5592 }
4ed46869
KH
5593 }
5594
065e3595 5595 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
5596 coding->produced_char += produced_chars;
5597 coding->produced = dst - coding->destination;
5598 return 0;
4ed46869
KH
5599}
5600
5601\f
/*** 10. C library functions ***/
4ed46869 5603
df7492f9
KH
5604/* Setup coding context CODING from information about CODING_SYSTEM.
5605 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
5606 CODING_SYSTEM is invalid, signal an error. */
4ed46869 5607
ec6d2bb8 5608void
971de7fb 5609setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
4ed46869 5610{
df7492f9
KH
5611 Lisp_Object attrs;
5612 Lisp_Object eol_type;
5613 Lisp_Object coding_type;
4608c386 5614 Lisp_Object val;
4ed46869 5615
df7492f9 5616 if (NILP (coding_system))
ae6f73fa 5617 coding_system = Qundecided;
c07c8e12 5618
df7492f9 5619 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
1f5dbf34 5620
df7492f9 5621 attrs = CODING_ID_ATTRS (coding->id);
0a9564cb 5622 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
1f5dbf34 5623
df7492f9
KH
5624 coding->mode = 0;
5625 coding->head_ascii = -1;
4a015c45
KH
5626 if (VECTORP (eol_type))
5627 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5628 | CODING_REQUIRE_DETECTION_MASK);
5629 else if (! EQ (eol_type, Qunix))
5630 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5631 | CODING_REQUIRE_ENCODING_MASK);
5632 else
5633 coding->common_flags = 0;
5e5c78be
KH
5634 if (! NILP (CODING_ATTR_POST_READ (attrs)))
5635 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5636 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5637 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
8f924df7
KH
5638 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5639 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
1f5dbf34 5640
df7492f9 5641 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7 5642 coding->max_charset_id = SCHARS (val) - 1;
1b3b981b 5643 coding->safe_charsets = SDATA (val);
df7492f9 5644 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
624bda09 5645 coding->carryover_bytes = 0;
4608c386 5646
df7492f9
KH
5647 coding_type = CODING_ATTR_TYPE (attrs);
5648 if (EQ (coding_type, Qundecided))
d46c5b12 5649 {
df7492f9
KH
5650 coding->detector = NULL;
5651 coding->decoder = decode_coding_raw_text;
5652 coding->encoder = encode_coding_raw_text;
5653 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
d46c5b12 5654 }
df7492f9 5655 else if (EQ (coding_type, Qiso_2022))
d46c5b12 5656 {
df7492f9
KH
5657 int i;
5658 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5659
5660 /* Invoke graphic register 0 to plane 0. */
5661 CODING_ISO_INVOCATION (coding, 0) = 0;
5662 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5663 CODING_ISO_INVOCATION (coding, 1)
5664 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5665 /* Setup the initial status of designation. */
5666 for (i = 0; i < 4; i++)
5667 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5668 /* Not single shifting initially. */
5669 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5670 /* Beginning of buffer should also be regarded as bol. */
5671 CODING_ISO_BOL (coding) = 1;
5672 coding->detector = detect_coding_iso_2022;
5673 coding->decoder = decode_coding_iso_2022;
5674 coding->encoder = encode_coding_iso_2022;
5675 if (flags & CODING_ISO_FLAG_SAFE)
5676 coding->mode |= CODING_MODE_SAFE_ENCODING;
d46c5b12 5677 coding->common_flags
df7492f9
KH
5678 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5679 | CODING_REQUIRE_FLUSHING_MASK);
5680 if (flags & CODING_ISO_FLAG_COMPOSITION)
5681 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
ff0dacd7
KH
5682 if (flags & CODING_ISO_FLAG_DESIGNATION)
5683 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
df7492f9
KH
5684 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5685 {
5686 setup_iso_safe_charsets (attrs);
5687 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7 5688 coding->max_charset_id = SCHARS (val) - 1;
1b3b981b 5689 coding->safe_charsets = SDATA (val);
df7492f9
KH
5690 }
5691 CODING_ISO_FLAGS (coding) = flags;
e951386e
KH
5692 CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5693 CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5694 CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5695 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
d46c5b12 5696 }
df7492f9 5697 else if (EQ (coding_type, Qcharset))
d46c5b12 5698 {
df7492f9
KH
5699 coding->detector = detect_coding_charset;
5700 coding->decoder = decode_coding_charset;
5701 coding->encoder = encode_coding_charset;
d46c5b12 5702 coding->common_flags
df7492f9 5703 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
d46c5b12 5704 }
df7492f9 5705 else if (EQ (coding_type, Qutf_8))
d46c5b12 5706 {
a470d443
KH
5707 val = AREF (attrs, coding_attr_utf_bom);
5708 CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5709 : EQ (val, Qt) ? utf_with_bom
5710 : utf_without_bom);
df7492f9
KH
5711 coding->detector = detect_coding_utf_8;
5712 coding->decoder = decode_coding_utf_8;
5713 coding->encoder = encode_coding_utf_8;
5714 coding->common_flags
5715 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
a470d443
KH
5716 if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5717 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
df7492f9
KH
5718 }
5719 else if (EQ (coding_type, Qutf_16))
5720 {
a470d443
KH
5721 val = AREF (attrs, coding_attr_utf_bom);
5722 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5723 : EQ (val, Qt) ? utf_with_bom
5724 : utf_without_bom);
df7492f9 5725 val = AREF (attrs, coding_attr_utf_16_endian);
b49a1807 5726 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
df7492f9 5727 : utf_16_little_endian);
e19c3639 5728 CODING_UTF_16_SURROGATE (coding) = 0;
df7492f9
KH
5729 coding->detector = detect_coding_utf_16;
5730 coding->decoder = decode_coding_utf_16;
5731 coding->encoder = encode_coding_utf_16;
5732 coding->common_flags
5733 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
a470d443 5734 if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
b49a1807 5735 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
d46c5b12 5736 }
df7492f9 5737 else if (EQ (coding_type, Qccl))
4ed46869 5738 {
df7492f9
KH
5739 coding->detector = detect_coding_ccl;
5740 coding->decoder = decode_coding_ccl;
5741 coding->encoder = encode_coding_ccl;
c952af22 5742 coding->common_flags
df7492f9
KH
5743 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5744 | CODING_REQUIRE_FLUSHING_MASK);
5745 }
5746 else if (EQ (coding_type, Qemacs_mule))
5747 {
5748 coding->detector = detect_coding_emacs_mule;
5749 coding->decoder = decode_coding_emacs_mule;
5750 coding->encoder = encode_coding_emacs_mule;
c952af22 5751 coding->common_flags
df7492f9
KH
5752 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5753 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5754 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5755 {
5756 Lisp_Object tail, safe_charsets;
5757 int max_charset_id = 0;
5758
5759 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5760 tail = XCDR (tail))
5761 if (max_charset_id < XFASTINT (XCAR (tail)))
5762 max_charset_id = XFASTINT (XCAR (tail));
1b3b981b
AS
5763 safe_charsets = make_uninit_string (max_charset_id + 1);
5764 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
df7492f9
KH
5765 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5766 tail = XCDR (tail))
8f924df7 5767 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
df7492f9 5768 coding->max_charset_id = max_charset_id;
1b3b981b 5769 coding->safe_charsets = SDATA (safe_charsets);
df7492f9 5770 }
e951386e
KH
5771 coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5772 coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
df7492f9
KH
5773 }
5774 else if (EQ (coding_type, Qshift_jis))
5775 {
5776 coding->detector = detect_coding_sjis;
5777 coding->decoder = decode_coding_sjis;
5778 coding->encoder = encode_coding_sjis;
c952af22 5779 coding->common_flags
df7492f9
KH
5780 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5781 }
5782 else if (EQ (coding_type, Qbig5))
5783 {
5784 coding->detector = detect_coding_big5;
5785 coding->decoder = decode_coding_big5;
5786 coding->encoder = encode_coding_big5;
c952af22 5787 coding->common_flags
df7492f9
KH
5788 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5789 }
5790 else /* EQ (coding_type, Qraw_text) */
ec6d2bb8 5791 {
df7492f9
KH
5792 coding->detector = NULL;
5793 coding->decoder = decode_coding_raw_text;
5794 coding->encoder = encode_coding_raw_text;
ea29edf2
KH
5795 if (! EQ (eol_type, Qunix))
5796 {
5797 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5798 if (! VECTORP (eol_type))
5799 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5800 }
5801
4ed46869 5802 }
4ed46869 5803
df7492f9 5804 return;
4ed46869
KH
5805}
5806
0ff61e78
KH
5807/* Return a list of charsets supported by CODING. */
5808
5809Lisp_Object
971de7fb 5810coding_charset_list (struct coding_system *coding)
0ff61e78 5811{
35befdaa 5812 Lisp_Object attrs, charset_list;
0ff61e78
KH
5813
5814 CODING_GET_INFO (coding, attrs, charset_list);
5815 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5816 {
5817 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5818
5819 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5820 charset_list = Viso_2022_charset_list;
5821 }
5822 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5823 {
5824 charset_list = Vemacs_mule_charset_list;
5825 }
5826 return charset_list;
5827}
5828
5829
e9f91ece
KH
5830/* Return a list of charsets supported by CODING-SYSTEM. */
5831
5832Lisp_Object
971de7fb 5833coding_system_charset_list (Lisp_Object coding_system)
e9f91ece 5834{
d3411f89 5835 ptrdiff_t id;
e9f91ece
KH
5836 Lisp_Object attrs, charset_list;
5837
5838 CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5839 attrs = CODING_ID_ATTRS (id);
5840
5841 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5842 {
5843 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5844
5845 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5846 charset_list = Viso_2022_charset_list;
5847 else
5848 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5849 }
5850 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5851 {
5852 charset_list = Vemacs_mule_charset_list;
5853 }
5854 else
5855 {
5856 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5857 }
5858 return charset_list;
5859}
5860
5861
df7492f9
KH
5862/* Return raw-text or one of its subsidiaries that has the same
5863 eol_type as CODING-SYSTEM. */
ec6d2bb8 5864
df7492f9 5865Lisp_Object
971de7fb 5866raw_text_coding_system (Lisp_Object coding_system)
ec6d2bb8 5867{
0be8721c 5868 Lisp_Object spec, attrs;
df7492f9 5869 Lisp_Object eol_type, raw_text_eol_type;
ec6d2bb8 5870
d3e4cb56
KH
5871 if (NILP (coding_system))
5872 return Qraw_text;
df7492f9
KH
5873 spec = CODING_SYSTEM_SPEC (coding_system);
5874 attrs = AREF (spec, 0);
ec6d2bb8 5875
df7492f9
KH
5876 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5877 return coding_system;
ec6d2bb8 5878
df7492f9
KH
5879 eol_type = AREF (spec, 2);
5880 if (VECTORP (eol_type))
5881 return Qraw_text;
5882 spec = CODING_SYSTEM_SPEC (Qraw_text);
5883 raw_text_eol_type = AREF (spec, 2);
5884 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5885 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5886 : AREF (raw_text_eol_type, 2));
ec6d2bb8
KH
5887}
5888
54f78171 5889
1911a33b
KH
5890/* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5891 the subsidiary that has the same eol-spec as PARENT (if it is not
5892 nil and specifies end-of-line format) or the system's setting
fcbcfb64 5893 (system_eol_type). */
df7492f9
KH
5894
5895Lisp_Object
971de7fb 5896coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
54f78171 5897{
3e139625 5898 Lisp_Object spec, eol_type;
54f78171 5899
d3e4cb56
KH
5900 if (NILP (coding_system))
5901 coding_system = Qraw_text;
df7492f9 5902 spec = CODING_SYSTEM_SPEC (coding_system);
df7492f9 5903 eol_type = AREF (spec, 2);
fcbcfb64 5904 if (VECTORP (eol_type))
df7492f9 5905 {
df7492f9
KH
5906 Lisp_Object parent_eol_type;
5907
fcbcfb64
KH
5908 if (! NILP (parent))
5909 {
5910 Lisp_Object parent_spec;
5911
4a015c45 5912 parent_spec = CODING_SYSTEM_SPEC (parent);
fcbcfb64 5913 parent_eol_type = AREF (parent_spec, 2);
1911a33b 5914 if (VECTORP (parent_eol_type))
4628bef1 5915 parent_eol_type = system_eol_type;
fcbcfb64
KH
5916 }
5917 else
5918 parent_eol_type = system_eol_type;
df7492f9
KH
5919 if (EQ (parent_eol_type, Qunix))
5920 coding_system = AREF (eol_type, 0);
5921 else if (EQ (parent_eol_type, Qdos))
5922 coding_system = AREF (eol_type, 1);
5923 else if (EQ (parent_eol_type, Qmac))
5924 coding_system = AREF (eol_type, 2);
54f78171 5925 }
df7492f9 5926 return coding_system;
54f78171
KH
5927}
5928
fcaf8878
KH
5929
5930/* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5931 decided for writing to a process. If not, complement them, and
5932 return a new coding system. */
5933
5934Lisp_Object
4628bef1 5935complement_process_encoding_system (Lisp_Object coding_system)
fcaf8878 5936{
5886ec9c
KH
5937 Lisp_Object coding_base = Qnil, eol_base = Qnil;
5938 Lisp_Object spec, attrs;
93d50df8 5939 int i;
fcaf8878 5940
93d50df8 5941 for (i = 0; i < 3; i++)
fcaf8878 5942 {
93d50df8
KH
5943 if (i == 1)
5944 coding_system = CDR_SAFE (Vdefault_process_coding_system);
5945 else if (i == 2)
5946 coding_system = preferred_coding_system ();
5947 spec = CODING_SYSTEM_SPEC (coding_system);
5948 if (NILP (spec))
5949 continue;
5950 attrs = AREF (spec, 0);
5951 if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
5952 coding_base = CODING_ATTR_BASE_NAME (attrs);
5953 if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
5954 eol_base = coding_system;
5955 if (! NILP (coding_base) && ! NILP (eol_base))
5956 break;
fcaf8878 5957 }
fcaf8878 5958
93d50df8
KH
5959 if (i > 0)
5960 /* The original CODING_SYSTEM didn't specify text-conversion or
5961 eol-conversion. Be sure that we return a fully complemented
5962 coding system. */
5963 coding_system = coding_inherit_eol_type (coding_base, eol_base);
5964 return coding_system;
fcaf8878
KH
5965}
5966
5967
4ed46869
KH
/* Emacs can automatically detect the coding system of a text if it
   is one of Emacs' internal format, ISO2022, SJIS, BIG5, UTF-8, or
   UTF-16.  However, some coding systems cannot be distinguished
   accurately because they use the same range of codes.  So, coding
   systems are first grouped into the following categories:
5973
0ef69138 5974 o coding-category-emacs-mule
4ed46869
KH
5975
5976 The category for a coding system which has the same code range
5977 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 5978 symbol) `emacs-mule' by default.
4ed46869
KH
5979
5980 o coding-category-sjis
5981
5982 The category for a coding system which has the same code range
5983 as SJIS. Assigned the coding-system (Lisp
7717c392 5984 symbol) `japanese-shift-jis' by default.
4ed46869
KH
5985
5986 o coding-category-iso-7
5987
5988 The category for a coding system which has the same code range
7717c392 5989 as ISO2022 of 7-bit environment. This doesn't use any locking
d46c5b12
KH
5990 shift and single shift functions. This can encode/decode all
5991 charsets. Assigned the coding-system (Lisp symbol)
5992 `iso-2022-7bit' by default.
5993
5994 o coding-category-iso-7-tight
5995
5996 Same as coding-category-iso-7 except that this can
5997 encode/decode only the specified charsets.
4ed46869
KH
5998
5999 o coding-category-iso-8-1
6000
6001 The category for a coding system which has the same code range
6002 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
6003 for DIMENSION1 charset. This doesn't use any locking shift
6004 and single shift functions. Assigned the coding-system (Lisp
6005 symbol) `iso-latin-1' by default.
4ed46869
KH
6006
6007 o coding-category-iso-8-2
6008
6009 The category for a coding system which has the same code range
6010 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
6011 for DIMENSION2 charset. This doesn't use any locking shift
6012 and single shift functions. Assigned the coding-system (Lisp
6013 symbol) `japanese-iso-8bit' by default.
4ed46869 6014
7717c392 6015 o coding-category-iso-7-else
4ed46869
KH
6016
6017 The category for a coding system which has the same code range
ad1746f5 6018 as ISO2022 of 7-bit environment but uses locking shift or
7717c392
KH
6019 single shift functions. Assigned the coding-system (Lisp
6020 symbol) `iso-2022-7bit-lock' by default.
6021
6022 o coding-category-iso-8-else
6023
6024 The category for a coding system which has the same code range
ad1746f5 6025 as ISO2022 of 8-bit environment but uses locking shift or
7717c392
KH
6026 single shift functions. Assigned the coding-system (Lisp
6027 symbol) `iso-2022-8bit-ss2' by default.
4ed46869
KH
6028
6029 o coding-category-big5
6030
6031 The category for a coding system which has the same code range
6032 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 6033 `cn-big5' by default.
4ed46869 6034
fa42c37f
KH
6035 o coding-category-utf-8
6036
6037 The category for a coding system which has the same code range
6e76ae91 6038 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
fa42c37f
KH
6039 symbol) `utf-8' by default.
6040
6041 o coding-category-utf-16-be
6042
6043 The category for a coding system in which a text has an
6044 Unicode signature (cf. Unicode Standard) in the order of BIG
6045 endian at the head. Assigned the coding-system (Lisp symbol)
6046 `utf-16-be' by default.
6047
6048 o coding-category-utf-16-le
6049
6050 The category for a coding system in which a text has an
6051 Unicode signature (cf. Unicode Standard) in the order of
6052 LITTLE endian at the head. Assigned the coding-system (Lisp
6053 symbol) `utf-16-le' by default.
6054
1397dc18
KH
6055 o coding-category-ccl
6056
6057 The category for a coding system of which encoder/decoder is
6058 written in CCL programs. The default value is nil, i.e., no
6059 coding system is assigned.
6060
4ed46869
KH
6061 o coding-category-binary
6062
6063 The category for a coding system not categorized in any of the
6064 above. Assigned the coding-system (Lisp symbol)
e0e989f6 6065 `no-conversion' by default.
4ed46869
KH
6066
  Each of them is a Lisp symbol whose value is the actual
  `coding-system' (also a Lisp symbol) assigned by a user.  What
  Emacs actually does is detect the category of a coding system and
  then use the `coding-system' assigned to that category.  If Emacs
  cannot narrow the result down to a single category, it selects the
  category of the highest priority.  Priorities of categories are
  also specified by a user in the Lisp variable
  `coding-category-list'.
6074
6075*/
6076
df7492f9
KH
6077#define EOL_SEEN_NONE 0
6078#define EOL_SEEN_LF 1
6079#define EOL_SEEN_CR 2
6080#define EOL_SEEN_CRLF 4
66cfb530 6081
ff0dacd7
KH
6082/* Detect how end-of-line of a text of length SRC_BYTES pointed by
6083 SOURCE is encoded. If CATEGORY is one of
6084 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6085 two-byte, else they are encoded by one-byte.
6086
6087 Return one of EOL_SEEN_XXX. */
4ed46869 6088
bc4bc72a 6089#define MAX_EOL_CHECK_COUNT 3
d46c5b12
KH
6090
6091static int
d311d28c 6092detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
cf84bb53 6093 enum coding_category category)
4ed46869 6094{
f6cbaf43 6095 const unsigned char *src = source, *src_end = src + src_bytes;
4ed46869 6096 unsigned char c;
df7492f9
KH
6097 int total = 0;
6098 int eol_seen = EOL_SEEN_NONE;
4ed46869 6099
89528eb3 6100 if ((1 << category) & CATEGORY_MASK_UTF_16)
4ed46869 6101 {
f10fe38f
PE
6102 bool msb = category == (coding_category_utf_16_le
6103 | coding_category_utf_16_le_nosig);
6104 bool lsb = !msb;
fa42c37f 6105
df7492f9 6106 while (src + 1 < src_end)
fa42c37f 6107 {
df7492f9
KH
6108 c = src[lsb];
6109 if (src[msb] == 0 && (c == '\n' || c == '\r'))
fa42c37f 6110 {
df7492f9
KH
6111 int this_eol;
6112
6113 if (c == '\n')
6114 this_eol = EOL_SEEN_LF;
6115 else if (src + 3 >= src_end
6116 || src[msb + 2] != 0
6117 || src[lsb + 2] != '\n')
6118 this_eol = EOL_SEEN_CR;
fa42c37f 6119 else
75f4f1ac
EZ
6120 {
6121 this_eol = EOL_SEEN_CRLF;
6122 src += 2;
6123 }
df7492f9
KH
6124
6125 if (eol_seen == EOL_SEEN_NONE)
6126 /* This is the first end-of-line. */
6127 eol_seen = this_eol;
6128 else if (eol_seen != this_eol)
fa42c37f 6129 {
75f4f1ac
EZ
6130 /* The found type is different from what found before.
6131 Allow for stray ^M characters in DOS EOL files. */
ef1b0ba7
SM
6132 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6133 || (eol_seen == EOL_SEEN_CRLF
6134 && this_eol == EOL_SEEN_CR))
75f4f1ac
EZ
6135 eol_seen = EOL_SEEN_CRLF;
6136 else
6137 {
6138 eol_seen = EOL_SEEN_LF;
6139 break;
6140 }
fa42c37f 6141 }
df7492f9
KH
6142 if (++total == MAX_EOL_CHECK_COUNT)
6143 break;
fa42c37f 6144 }
df7492f9 6145 src += 2;
fa42c37f 6146 }
bcf26d6a 6147 }
d46c5b12 6148 else
ef1b0ba7
SM
6149 while (src < src_end)
6150 {
6151 c = *src++;
6152 if (c == '\n' || c == '\r')
6153 {
6154 int this_eol;
d46c5b12 6155
ef1b0ba7
SM
6156 if (c == '\n')
6157 this_eol = EOL_SEEN_LF;
6158 else if (src >= src_end || *src != '\n')
6159 this_eol = EOL_SEEN_CR;
6160 else
6161 this_eol = EOL_SEEN_CRLF, src++;
d46c5b12 6162
ef1b0ba7
SM
6163 if (eol_seen == EOL_SEEN_NONE)
6164 /* This is the first end-of-line. */
6165 eol_seen = this_eol;
6166 else if (eol_seen != this_eol)
6167 {
6168 /* The found type is different from what found before.
6169 Allow for stray ^M characters in DOS EOL files. */
6170 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6171 || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6172 eol_seen = EOL_SEEN_CRLF;
6173 else
6174 {
6175 eol_seen = EOL_SEEN_LF;
6176 break;
6177 }
6178 }
6179 if (++total == MAX_EOL_CHECK_COUNT)
6180 break;
6181 }
6182 }
df7492f9 6183 return eol_seen;
73be902c
KH
6184}
6185
df7492f9 6186
24a73b0a 6187static Lisp_Object
971de7fb 6188adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
73be902c 6189{
0be8721c 6190 Lisp_Object eol_type;
8f924df7 6191
df7492f9
KH
6192 eol_type = CODING_ID_EOL_TYPE (coding->id);
6193 if (eol_seen & EOL_SEEN_LF)
24a73b0a
KH
6194 {
6195 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6196 eol_type = Qunix;
6197 }
6f197c07 6198 else if (eol_seen & EOL_SEEN_CRLF)
24a73b0a
KH
6199 {
6200 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6201 eol_type = Qdos;
6202 }
6f197c07 6203 else if (eol_seen & EOL_SEEN_CR)
24a73b0a
KH
6204 {
6205 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6206 eol_type = Qmac;
6207 }
6208 return eol_type;
d46c5b12 6209}
4ed46869 6210
df7492f9
KH
6211/* Detect how a text specified in CODING is encoded. If a coding
6212 system is detected, update fields of CODING by the detected coding
6213 system. */
0a28aafb 6214
74ab6df5 6215static void
971de7fb 6216detect_coding (struct coding_system *coding)
d46c5b12 6217{
8f924df7 6218 const unsigned char *src, *src_end;
f10fe38f 6219 unsigned int saved_mode = coding->mode;
d46c5b12 6220
df7492f9
KH
6221 coding->consumed = coding->consumed_char = 0;
6222 coding->produced = coding->produced_char = 0;
6223 coding_set_source (coding);
1c3478b0 6224
df7492f9 6225 src_end = coding->source + coding->src_bytes;
c0e16b14 6226 coding->head_ascii = 0;
1c3478b0 6227
df7492f9
KH
6228 /* If we have not yet decided the text encoding type, detect it
6229 now. */
6230 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
b73bfc1c 6231 {
df7492f9 6232 int c, i;
6cb21a4f 6233 struct coding_detection_info detect_info;
f10fe38f 6234 bool null_byte_found = 0, eight_bit_found = 0;
df7492f9 6235
6cb21a4f 6236 detect_info.checked = detect_info.found = detect_info.rejected = 0;
2f3cbb32 6237 for (src = coding->source; src < src_end; src++)
d46c5b12 6238 {
df7492f9 6239 c = *src;
6cb21a4f 6240 if (c & 0x80)
6cb21a4f 6241 {
2f3cbb32 6242 eight_bit_found = 1;
2f3cbb32
KH
6243 if (null_byte_found)
6244 break;
6245 }
6246 else if (c < 0x20)
6247 {
6248 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6249 && ! inhibit_iso_escape_detection
6250 && ! detect_info.checked)
6cb21a4f 6251 {
2f3cbb32
KH
6252 if (detect_coding_iso_2022 (coding, &detect_info))
6253 {
6254 /* We have scanned the whole data. */
6255 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
c0e16b14
KH
6256 {
6257 /* We didn't find an 8-bit code. We may
6258 have found a null-byte, but it's very
ce5b453a 6259 rare that a binary file conforms to
c0e16b14
KH
6260 ISO-2022. */
6261 src = src_end;
6262 coding->head_ascii = src - coding->source;
6263 }
6264 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
2f3cbb32
KH
6265 break;
6266 }
6267 }
97b1b294 6268 else if (! c && !inhibit_null_byte_detection)
2f3cbb32
KH
6269 {
6270 null_byte_found = 1;
6271 if (eight_bit_found)
6272 break;
6cb21a4f 6273 }
c006c0c8
KH
6274 if (! eight_bit_found)
6275 coding->head_ascii++;
6cb21a4f 6276 }
c006c0c8 6277 else if (! eight_bit_found)
c0e16b14 6278 coding->head_ascii++;
d46c5b12 6279 }
df7492f9 6280
2f3cbb32
KH
6281 if (null_byte_found || eight_bit_found
6282 || coding->head_ascii < coding->src_bytes
6cb21a4f 6283 || detect_info.found)
d46c5b12 6284 {
ff0dacd7
KH
6285 enum coding_category category;
6286 struct coding_system *this;
df7492f9 6287
6cb21a4f
KH
6288 if (coding->head_ascii == coding->src_bytes)
6289 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
6290 for (i = 0; i < coding_category_raw_text; i++)
6291 {
6292 category = coding_priorities[i];
6293 this = coding_categories + category;
6294 if (detect_info.found & (1 << category))
24a73b0a 6295 break;
6cb21a4f
KH
6296 }
6297 else
2f3cbb32
KH
6298 {
6299 if (null_byte_found)
ff0dacd7 6300 {
2f3cbb32
KH
6301 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6302 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
ff0dacd7 6303 }
2f3cbb32
KH
6304 for (i = 0; i < coding_category_raw_text; i++)
6305 {
6306 category = coding_priorities[i];
6307 this = coding_categories + category;
0ba06a77
KH
6308 /* Some of this->detector (e.g. detect_coding_sjis)
6309 require this information. */
6310 coding->id = this->id;
2f3cbb32
KH
6311 if (this->id < 0)
6312 {
6313 /* No coding system of this category is defined. */
6314 detect_info.rejected |= (1 << category);
6315 }
6316 else if (category >= coding_category_raw_text)
6317 continue;
6318 else if (detect_info.checked & (1 << category))
6319 {
6320 if (detect_info.found & (1 << category))
6321 break;
6322 }
6323 else if ((*(this->detector)) (coding, &detect_info)
6324 && detect_info.found & (1 << category))
6325 {
6326 if (category == coding_category_utf_16_auto)
6327 {
6328 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6329 category = coding_category_utf_16_le;
6330 else
6331 category = coding_category_utf_16_be;
6332 }
6333 break;
6334 }
6335 }
2f3cbb32 6336 }
c0e16b14
KH
6337
6338 if (i < coding_category_raw_text)
6339 setup_coding_system (CODING_ID_NAME (this->id), coding);
6340 else if (null_byte_found)
6341 setup_coding_system (Qno_conversion, coding);
6342 else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6343 == CATEGORY_MASK_ANY)
6344 setup_coding_system (Qraw_text, coding);
6345 else if (detect_info.rejected)
6346 for (i = 0; i < coding_category_raw_text; i++)
6347 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6348 {
6349 this = coding_categories + coding_priorities[i];
6350 setup_coding_system (CODING_ID_NAME (this->id), coding);
6351 break;
6352 }
d46c5b12 6353 }
b73bfc1c 6354 }
a470d443
KH
6355 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6356 == coding_category_utf_8_auto)
6357 {
6358 Lisp_Object coding_systems;
6359 struct coding_detection_info detect_info;
6360
6361 coding_systems
6362 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6363 detect_info.found = detect_info.rejected = 0;
6364 coding->head_ascii = 0;
6365 if (CONSP (coding_systems)
6366 && detect_coding_utf_8 (coding, &detect_info))
6367 {
6368 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6369 setup_coding_system (XCAR (coding_systems), coding);
6370 else
6371 setup_coding_system (XCDR (coding_systems), coding);
6372 }
6373 }
24a73b0a
KH
6374 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6375 == coding_category_utf_16_auto)
b49a1807
KH
6376 {
6377 Lisp_Object coding_systems;
6378 struct coding_detection_info detect_info;
6379
6380 coding_systems
a470d443 6381 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
b49a1807 6382 detect_info.found = detect_info.rejected = 0;
a470d443 6383 coding->head_ascii = 0;
b49a1807 6384 if (CONSP (coding_systems)
24a73b0a 6385 && detect_coding_utf_16 (coding, &detect_info))
b49a1807
KH
6386 {
6387 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6388 setup_coding_system (XCAR (coding_systems), coding);
24a73b0a 6389 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
b49a1807
KH
6390 setup_coding_system (XCDR (coding_systems), coding);
6391 }
6392 }
73cce38d 6393 coding->mode = saved_mode;
4ed46869 6394}
4ed46869 6395
d46c5b12 6396
aaaf0b1e 6397static void
971de7fb 6398decode_eol (struct coding_system *coding)
aaaf0b1e 6399{
24a73b0a
KH
6400 Lisp_Object eol_type;
6401 unsigned char *p, *pbeg, *pend;
3ed051d4 6402
24a73b0a 6403 eol_type = CODING_ID_EOL_TYPE (coding->id);
0a9564cb 6404 if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
24a73b0a
KH
6405 return;
6406
6407 if (NILP (coding->dst_object))
6408 pbeg = coding->destination;
6409 else
6410 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6411 pend = pbeg + coding->produced;
6412
6413 if (VECTORP (eol_type))
aaaf0b1e 6414 {
df7492f9 6415 int eol_seen = EOL_SEEN_NONE;
4ed46869 6416
24a73b0a 6417 for (p = pbeg; p < pend; p++)
aaaf0b1e 6418 {
df7492f9
KH
6419 if (*p == '\n')
6420 eol_seen |= EOL_SEEN_LF;
6421 else if (*p == '\r')
aaaf0b1e 6422 {
df7492f9 6423 if (p + 1 < pend && *(p + 1) == '\n')
aaaf0b1e 6424 {
df7492f9
KH
6425 eol_seen |= EOL_SEEN_CRLF;
6426 p++;
aaaf0b1e 6427 }
aaaf0b1e 6428 else
df7492f9 6429 eol_seen |= EOL_SEEN_CR;
aaaf0b1e 6430 }
aaaf0b1e 6431 }
75f4f1ac
EZ
6432 /* Handle DOS-style EOLs in a file with stray ^M characters. */
6433 if ((eol_seen & EOL_SEEN_CRLF) != 0
6434 && (eol_seen & EOL_SEEN_CR) != 0
6435 && (eol_seen & EOL_SEEN_LF) == 0)
6436 eol_seen = EOL_SEEN_CRLF;
6437 else if (eol_seen != EOL_SEEN_NONE
24a73b0a
KH
6438 && eol_seen != EOL_SEEN_LF
6439 && eol_seen != EOL_SEEN_CRLF
6440 && eol_seen != EOL_SEEN_CR)
6441 eol_seen = EOL_SEEN_LF;
df7492f9 6442 if (eol_seen != EOL_SEEN_NONE)
24a73b0a 6443 eol_type = adjust_coding_eol_type (coding, eol_seen);
aaaf0b1e 6444 }
d46c5b12 6445
24a73b0a 6446 if (EQ (eol_type, Qmac))
27901516 6447 {
24a73b0a 6448 for (p = pbeg; p < pend; p++)
df7492f9
KH
6449 if (*p == '\r')
6450 *p = '\n';
4ed46869 6451 }
24a73b0a 6452 else if (EQ (eol_type, Qdos))
df7492f9 6453 {
d311d28c 6454 ptrdiff_t n = 0;
b73bfc1c 6455
24a73b0a
KH
6456 if (NILP (coding->dst_object))
6457 {
4347441b
KH
6458 /* Start deleting '\r' from the tail to minimize the memory
6459 movement. */
24a73b0a
KH
6460 for (p = pend - 2; p >= pbeg; p--)
6461 if (*p == '\r')
6462 {
72af86bd 6463 memmove (p, p + 1, pend-- - p - 1);
24a73b0a
KH
6464 n++;
6465 }
6466 }
6467 else
6468 {
d311d28c
PE
6469 ptrdiff_t pos_byte = coding->dst_pos_byte;
6470 ptrdiff_t pos = coding->dst_pos;
6471 ptrdiff_t pos_end = pos + coding->produced_char - 1;
4347441b
KH
6472
6473 while (pos < pos_end)
6474 {
6475 p = BYTE_POS_ADDR (pos_byte);
6476 if (*p == '\r' && p[1] == '\n')
6477 {
6478 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6479 n++;
6480 pos_end--;
6481 }
6482 pos++;
69b8522d
KH
6483 if (coding->dst_multibyte)
6484 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6485 else
6486 pos_byte++;
4347441b 6487 }
24a73b0a
KH
6488 }
6489 coding->produced -= n;
6490 coding->produced_char -= n;
aaaf0b1e 6491 }
4ed46869
KH
6492}
6493
7d64c6ad 6494
a6f87d34 6495/* Return a translation table (or list of them) from coding system
f10fe38f
PE
6496 attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6497 not ENCODEP). */
7d64c6ad 6498
e6a54062 6499static Lisp_Object
f10fe38f 6500get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
7d64c6ad
KH
6501{
6502 Lisp_Object standard, translation_table;
09ee6fdd 6503 Lisp_Object val;
7d64c6ad 6504
4bed5909
CY
6505 if (NILP (Venable_character_translation))
6506 {
6507 if (max_lookup)
6508 *max_lookup = 0;
6509 return Qnil;
6510 }
7d64c6ad
KH
6511 if (encodep)
6512 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6513 standard = Vstandard_translation_table_for_encode;
6514 else
6515 translation_table = CODING_ATTR_DECODE_TBL (attrs),
6516 standard = Vstandard_translation_table_for_decode;
7d64c6ad 6517 if (NILP (translation_table))
09ee6fdd
KH
6518 translation_table = standard;
6519 else
a6f87d34 6520 {
09ee6fdd
KH
6521 if (SYMBOLP (translation_table))
6522 translation_table = Fget (translation_table, Qtranslation_table);
6523 else if (CONSP (translation_table))
6524 {
6525 translation_table = Fcopy_sequence (translation_table);
6526 for (val = translation_table; CONSP (val); val = XCDR (val))
6527 if (SYMBOLP (XCAR (val)))
6528 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6529 }
6530 if (CHAR_TABLE_P (standard))
6531 {
6532 if (CONSP (translation_table))
6533 translation_table = nconc2 (translation_table,
6534 Fcons (standard, Qnil));
6535 else
6536 translation_table = Fcons (translation_table,
6537 Fcons (standard, Qnil));
6538 }
a6f87d34 6539 }
2170c8f0
KH
6540
6541 if (max_lookup)
09ee6fdd 6542 {
2170c8f0
KH
6543 *max_lookup = 1;
6544 if (CHAR_TABLE_P (translation_table)
6545 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6546 {
6547 val = XCHAR_TABLE (translation_table)->extras[1];
6548 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6549 *max_lookup = XFASTINT (val);
6550 }
6551 else if (CONSP (translation_table))
6552 {
2735d060 6553 Lisp_Object tail;
09ee6fdd 6554
2170c8f0
KH
6555 for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6556 if (CHAR_TABLE_P (XCAR (tail))
6557 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6558 {
2735d060
PE
6559 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6560 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6561 *max_lookup = XFASTINT (tailval);
2170c8f0
KH
6562 }
6563 }
a6f87d34 6564 }
7d64c6ad
KH
6565 return translation_table;
6566}
6567
09ee6fdd
KH
/* Look up character C in TABLE, which is a char-table or a list whose
   char-table elements are tried in order.  C and TRANS must be
   lvalues.  Whenever a table maps C to a character, C is replaced by
   that character and TRANS is reset to Qnil (in a list, the lookup
   then continues with the next table).  A non-nil, non-character
   table entry is left in TRANS, and ends the search of a list.  */
#define LOOKUP_TRANSLATION_TABLE(table, c, trans)                       \
  do {                                                                  \
    trans = Qnil;                                                       \
    if (CHAR_TABLE_P (table))                                           \
      {                                                                 \
        trans = CHAR_TABLE_REF (table, c);                              \
        if (CHARACTERP (trans))                                         \
          {                                                             \
            c = XFASTINT (trans);                                       \
            trans = Qnil;                                               \
          }                                                             \
      }                                                                 \
    else if (CONSP (table))                                             \
      {                                                                 \
        Lisp_Object tail = (table);                                     \
                                                                        \
        while (CONSP (tail))                                            \
          {                                                             \
            if (CHAR_TABLE_P (XCAR (tail)))                             \
              {                                                         \
                trans = CHAR_TABLE_REF (XCAR (tail), c);                \
                if (CHARACTERP (trans))                                 \
                  {                                                     \
                    c = XFASTINT (trans);                               \
                    trans = Qnil;                                       \
                  }                                                     \
                else if (! NILP (trans))                                \
                  break;                                                \
              }                                                         \
            tail = XCDR (tail);                                         \
          }                                                             \
      }                                                                 \
  } while (0)
6592
7d64c6ad 6593
e951386e
KH
6594/* Return a translation of character(s) at BUF according to TRANS.
6595 TRANS is TO-CHAR or ((FROM . TO) ...) where
6596 FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6597 The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6598 translation is found, and Qnil if not found..
6599 If BUF is too short to lookup characters in FROM, return Qt. */
6600
69a80ea3 6601static Lisp_Object
971de7fb 6602get_translation (Lisp_Object trans, int *buf, int *buf_end)
69a80ea3 6603{
e951386e
KH
6604
6605 if (INTEGERP (trans))
6606 return trans;
6607 for (; CONSP (trans); trans = XCDR (trans))
69a80ea3 6608 {
e951386e
KH
6609 Lisp_Object val = XCAR (trans);
6610 Lisp_Object from = XCAR (val);
2c6a9faa
PE
6611 ptrdiff_t len = ASIZE (from);
6612 ptrdiff_t i;
69a80ea3 6613
e951386e 6614 for (i = 0; i < len; i++)
69a80ea3 6615 {
e951386e
KH
6616 if (buf + i == buf_end)
6617 return Qt;
6618 if (XINT (AREF (from, i)) != buf[i])
6619 break;
69a80ea3 6620 }
e951386e
KH
6621 if (i == len)
6622 return val;
69a80ea3 6623 }
e951386e 6624 return Qnil;
69a80ea3
KH
6625}
6626
6627
d46c5b12 6628static int
cf84bb53 6629produce_chars (struct coding_system *coding, Lisp_Object translation_table,
f10fe38f 6630 bool last_block)
4ed46869 6631{
df7492f9
KH
6632 unsigned char *dst = coding->destination + coding->produced;
6633 unsigned char *dst_end = coding->destination + coding->dst_bytes;
d311d28c
PE
6634 ptrdiff_t produced;
6635 ptrdiff_t produced_chars = 0;
69a80ea3 6636 int carryover = 0;
4ed46869 6637
df7492f9 6638 if (! coding->chars_at_source)
4ed46869 6639 {
119852e7 6640 /* Source characters are in coding->charbuf. */
fba4576f
AS
6641 int *buf = coding->charbuf;
6642 int *buf_end = buf + coding->charbuf_used;
4ed46869 6643
db274c7a
KH
6644 if (EQ (coding->src_object, coding->dst_object))
6645 {
6646 coding_set_source (coding);
6647 dst_end = ((unsigned char *) coding->source) + coding->consumed;
6648 }
4ed46869 6649
df7492f9 6650 while (buf < buf_end)
4ed46869 6651 {
27bb1ca4
PE
6652 int c = *buf;
6653 ptrdiff_t i;
bc4bc72a 6654
df7492f9
KH
6655 if (c >= 0)
6656 {
d311d28c 6657 ptrdiff_t from_nchars = 1, to_nchars = 1;
69a80ea3
KH
6658 Lisp_Object trans = Qnil;
6659
09ee6fdd 6660 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
e6a54062 6661 if (! NILP (trans))
69a80ea3 6662 {
e951386e
KH
6663 trans = get_translation (trans, buf, buf_end);
6664 if (INTEGERP (trans))
6665 c = XINT (trans);
6666 else if (CONSP (trans))
6667 {
6668 from_nchars = ASIZE (XCAR (trans));
6669 trans = XCDR (trans);
6670 if (INTEGERP (trans))
6671 c = XINT (trans);
6672 else
6673 {
6674 to_nchars = ASIZE (trans);
6675 c = XINT (AREF (trans, 0));
6676 }
6677 }
6678 else if (EQ (trans, Qt) && ! last_block)
69a80ea3 6679 break;
69a80ea3
KH
6680 }
6681
5d009b3a 6682 if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
69a80ea3 6683 {
5d009b3a
PE
6684 if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
6685 / MAX_MULTIBYTE_LENGTH)
6686 < to_nchars)
6687 memory_full (SIZE_MAX);
69a80ea3
KH
6688 dst = alloc_destination (coding,
6689 buf_end - buf
6690 + MAX_MULTIBYTE_LENGTH * to_nchars,
6691 dst);
db274c7a
KH
6692 if (EQ (coding->src_object, coding->dst_object))
6693 {
6694 coding_set_source (coding);
e951386e
KH
6695 dst_end = (((unsigned char *) coding->source)
6696 + coding->consumed);
db274c7a
KH
6697 }
6698 else
6699 dst_end = coding->destination + coding->dst_bytes;
69a80ea3
KH
6700 }
6701
433f7f87 6702 for (i = 0; i < to_nchars; i++)
69a80ea3 6703 {
433f7f87
KH
6704 if (i > 0)
6705 c = XINT (AREF (trans, i));
69a80ea3
KH
6706 if (coding->dst_multibyte
6707 || ! CHAR_BYTE8_P (c))
db274c7a 6708 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
69a80ea3
KH
6709 else
6710 *dst++ = CHAR_TO_BYTE8 (c);
6711 }
6712 produced_chars += to_nchars;
e951386e 6713 buf += from_nchars;
d46c5b12 6714 }
df7492f9 6715 else
69a80ea3
KH
6716 /* This is an annotation datum. (-C) is the length. */
6717 buf += -c;
4ed46869 6718 }
69a80ea3 6719 carryover = buf_end - buf;
4ed46869 6720 }
fa42c37f 6721 else
fa42c37f 6722 {
119852e7 6723 /* Source characters are at coding->source. */
8f924df7 6724 const unsigned char *src = coding->source;
119852e7 6725 const unsigned char *src_end = src + coding->consumed;
4ed46869 6726
db274c7a
KH
6727 if (EQ (coding->dst_object, coding->src_object))
6728 dst_end = (unsigned char *) src;
df7492f9 6729 if (coding->src_multibyte != coding->dst_multibyte)
fa42c37f 6730 {
df7492f9 6731 if (coding->src_multibyte)
fa42c37f 6732 {
f10fe38f 6733 bool multibytep = 1;
d311d28c 6734 ptrdiff_t consumed_chars = 0;
d46c5b12 6735
df7492f9
KH
6736 while (1)
6737 {
8f924df7 6738 const unsigned char *src_base = src;
df7492f9 6739 int c;
b73bfc1c 6740
df7492f9 6741 ONE_MORE_BYTE (c);
119852e7 6742 if (dst == dst_end)
df7492f9 6743 {
119852e7
KH
6744 if (EQ (coding->src_object, coding->dst_object))
6745 dst_end = (unsigned char *) src;
6746 if (dst == dst_end)
df7492f9 6747 {
d311d28c 6748 ptrdiff_t offset = src - coding->source;
119852e7
KH
6749
6750 dst = alloc_destination (coding, src_end - src + 1,
6751 dst);
6752 dst_end = coding->destination + coding->dst_bytes;
6753 coding_set_source (coding);
6754 src = coding->source + offset;
5c1ca13d 6755 src_end = coding->source + coding->consumed;
db274c7a
KH
6756 if (EQ (coding->src_object, coding->dst_object))
6757 dst_end = (unsigned char *) src;
df7492f9 6758 }
df7492f9
KH
6759 }
6760 *dst++ = c;
6761 produced_chars++;
6762 }
6763 no_more_source:
6764 ;
fa42c37f
KH
6765 }
6766 else
df7492f9
KH
6767 while (src < src_end)
6768 {
f10fe38f 6769 bool multibytep = 1;
df7492f9 6770 int c = *src++;
b73bfc1c 6771
df7492f9
KH
6772 if (dst >= dst_end - 1)
6773 {
2c78b7e1 6774 if (EQ (coding->src_object, coding->dst_object))
8f924df7 6775 dst_end = (unsigned char *) src;
2c78b7e1
KH
6776 if (dst >= dst_end - 1)
6777 {
d311d28c
PE
6778 ptrdiff_t offset = src - coding->source;
6779 ptrdiff_t more_bytes;
119852e7 6780
db274c7a
KH
6781 if (EQ (coding->src_object, coding->dst_object))
6782 more_bytes = ((src_end - src) / 2) + 2;
6783 else
6784 more_bytes = src_end - src + 2;
6785 dst = alloc_destination (coding, more_bytes, dst);
2c78b7e1
KH
6786 dst_end = coding->destination + coding->dst_bytes;
6787 coding_set_source (coding);
119852e7 6788 src = coding->source + offset;
5c1ca13d 6789 src_end = coding->source + coding->consumed;
db274c7a
KH
6790 if (EQ (coding->src_object, coding->dst_object))
6791 dst_end = (unsigned char *) src;
2c78b7e1 6792 }
df7492f9
KH
6793 }
6794 EMIT_ONE_BYTE (c);
6795 }
d46c5b12 6796 }
df7492f9
KH
6797 else
6798 {
6799 if (!EQ (coding->src_object, coding->dst_object))
fa42c37f 6800 {
d311d28c 6801 ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
4ed46869 6802
df7492f9 6803 if (require > 0)
fa42c37f 6804 {
d311d28c 6805 ptrdiff_t offset = src - coding->source;
df7492f9
KH
6806
6807 dst = alloc_destination (coding, require, dst);
6808 coding_set_source (coding);
6809 src = coding->source + offset;
5c1ca13d 6810 src_end = coding->source + coding->consumed;
fa42c37f
KH
6811 }
6812 }
119852e7 6813 produced_chars = coding->consumed_char;
df7492f9 6814 while (src < src_end)
14daee73 6815 *dst++ = *src++;
fa42c37f
KH
6816 }
6817 }
6818
df7492f9 6819 produced = dst - (coding->destination + coding->produced);
284201e4 6820 if (BUFFERP (coding->dst_object) && produced_chars > 0)
df7492f9
KH
6821 insert_from_gap (produced_chars, produced);
6822 coding->produced += produced;
6823 coding->produced_char += produced_chars;
69a80ea3 6824 return carryover;
fa42c37f
KH
6825}
6826
ff0dacd7
KH
6827/* Compose text in CODING->object according to the annotation data at
6828 CHARBUF. CHARBUF is an array:
e951386e 6829 [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
df7492f9 6830 */
4ed46869 6831
b0ab8123 6832static void
d311d28c 6833produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
4ed46869 6834{
df7492f9 6835 int len;
d311d28c 6836 ptrdiff_t to;
df7492f9 6837 enum composition_method method;
df7492f9 6838 Lisp_Object components;
fa42c37f 6839
e951386e 6840 len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
69a80ea3 6841 to = pos + charbuf[2];
e951386e 6842 method = (enum composition_method) (charbuf[4]);
d46c5b12 6843
df7492f9
KH
6844 if (method == COMPOSITION_RELATIVE)
6845 components = Qnil;
e951386e 6846 else
d46c5b12 6847 {
df7492f9 6848 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
e951386e 6849 int i, j;
b73bfc1c 6850
e951386e
KH
6851 if (method == COMPOSITION_WITH_RULE)
6852 len = charbuf[2] * 3 - 2;
6853 charbuf += MAX_ANNOTATION_LENGTH;
6854 /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6855 for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
9ffd559c 6856 {
e951386e
KH
6857 if (charbuf[i] >= 0)
6858 args[j] = make_number (charbuf[i]);
6859 else
6860 {
6861 i++;
6862 args[j] = make_number (charbuf[i] % 0x100);
6863 }
9ffd559c 6864 }
e951386e 6865 components = (i == j ? Fstring (j, args) : Fvector (j, args));
d46c5b12 6866 }
69a80ea3 6867 compose_text (pos, to, components, Qnil, coding->dst_object);
d46c5b12
KH
6868}
6869
d46c5b12 6870
ff0dacd7
KH
6871/* Put `charset' property on text in CODING->object according to
6872 the annotation data at CHARBUF. CHARBUF is an array:
69a80ea3 6873 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
ff0dacd7 6874 */
d46c5b12 6875
b0ab8123 6876static void
d311d28c 6877produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
d46c5b12 6878{
d311d28c 6879 ptrdiff_t from = pos - charbuf[2];
69a80ea3 6880 struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
b73bfc1c 6881
69a80ea3 6882 Fput_text_property (make_number (from), make_number (pos),
ff0dacd7
KH
6883 Qcharset, CHARSET_NAME (charset),
6884 coding->dst_object);
d46c5b12
KH
6885}
6886
d46c5b12 6887
df7492f9
KH
/* Number of ints first requested for a conversion work area.  */
#define CHARBUF_SIZE 0x4000

/* Give CODING a work area of ints obtained with alloca, and record
   its length in CODING->charbuf_size.  CHARBUF_SIZE elements are
   tried first, then half as many, and so on while the size is above
   1024.  If no attempt yields a non-null pointer, record
   CODING_RESULT_INSUFFICIENT_MEM and return from the calling
   function, which must therefore return void.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int size;                                                           \
                                                                        \
    (coding)->charbuf = NULL;                                           \
    for (size = CHARBUF_SIZE; size > 1024; size >>= 1)                  \
      {                                                                 \
        (coding)->charbuf = alloca (sizeof (int) * size);               \
        if ((coding)->charbuf)                                          \
          break;                                                        \
      }                                                                 \
    if (! (coding)->charbuf)                                            \
      {                                                                 \
        record_conversion_result ((coding),                             \
                                  CODING_RESULT_INSUFFICIENT_MEM);      \
        return;                                                         \
      }                                                                 \
    (coding)->charbuf_size = size;                                      \
  } while (0)
4ed46869 6909
d46c5b12
KH
6910
6911static void
d311d28c 6912produce_annotation (struct coding_system *coding, ptrdiff_t pos)
d46c5b12 6913{
df7492f9
KH
6914 int *charbuf = coding->charbuf;
6915 int *charbuf_end = charbuf + coding->charbuf_used;
d46c5b12 6916
ff0dacd7
KH
6917 if (NILP (coding->dst_object))
6918 return;
d46c5b12 6919
df7492f9 6920 while (charbuf < charbuf_end)
a84f1519 6921 {
df7492f9 6922 if (*charbuf >= 0)
e951386e 6923 pos++, charbuf++;
d46c5b12 6924 else
d46c5b12 6925 {
df7492f9 6926 int len = -*charbuf;
e951386e
KH
6927
6928 if (len > 2)
6929 switch (charbuf[1])
6930 {
6931 case CODING_ANNOTATE_COMPOSITION_MASK:
6932 produce_composition (coding, charbuf, pos);
6933 break;
6934 case CODING_ANNOTATE_CHARSET_MASK:
6935 produce_charset (coding, charbuf, pos);
6936 break;
6937 }
df7492f9 6938 charbuf += len;
d46c5b12 6939 }
a84f1519 6940 }
d46c5b12
KH
6941}
6942
df7492f9
KH
6943/* Decode the data at CODING->src_object into CODING->dst_object.
6944 CODING->src_object is a buffer, a string, or nil.
6945 CODING->dst_object is a buffer.
d46c5b12 6946
df7492f9
KH
6947 If CODING->src_object is a buffer, it must be the current buffer.
6948 In this case, if CODING->src_pos is positive, it is a position of
6949 the source text in the buffer, otherwise, the source text is in the
6950 gap area of the buffer, and CODING->src_pos specifies the offset of
6951 the text from GPT (which must be the same as PT). If this is the
6952 same buffer as CODING->dst_object, CODING->src_pos must be
6953 negative.
d46c5b12 6954
b6828792 6955 If CODING->src_object is a string, CODING->src_pos is an index to
df7492f9 6956 that string.
d46c5b12 6957
df7492f9
KH
6958 If CODING->src_object is nil, CODING->source must already point to
6959 the non-relocatable memory area. In this case, CODING->src_pos is
6960 an offset from CODING->source.
73be902c 6961
df7492f9
KH
6962 The decoded data is inserted at the current point of the buffer
6963 CODING->dst_object.
6964*/
d46c5b12 6965
f10fe38f 6966static void
971de7fb 6967decode_coding (struct coding_system *coding)
d46c5b12 6968{
df7492f9 6969 Lisp_Object attrs;
24a73b0a 6970 Lisp_Object undo_list;
7d64c6ad 6971 Lisp_Object translation_table;
d0396581 6972 struct ccl_spec cclspec;
69a80ea3
KH
6973 int carryover;
6974 int i;
d46c5b12 6975
df7492f9
KH
6976 if (BUFFERP (coding->src_object)
6977 && coding->src_pos > 0
6978 && coding->src_pos < GPT
6979 && coding->src_pos + coding->src_chars > GPT)
6980 move_gap_both (coding->src_pos, coding->src_pos_byte);
d46c5b12 6981
24a73b0a 6982 undo_list = Qt;
df7492f9 6983 if (BUFFERP (coding->dst_object))
1c3478b0 6984 {
a3d794a1 6985 set_buffer_internal (XBUFFER (coding->dst_object));
df7492f9
KH
6986 if (GPT != PT)
6987 move_gap_both (PT, PT_BYTE);
f48b82fd
GR
6988
6989 /* We must disable undo_list in order to record the whole insert
6990 transaction via record_insert at the end. But doing so also
6991 disables the recording of the first change to the undo_list.
6992 Therefore we check for first change here and record it via
6993 record_first_change if needed. */
6994 if (MODIFF <= SAVE_MODIFF)
6995 record_first_change ();
6996
4b4deea2 6997 undo_list = BVAR (current_buffer, undo_list);
39eb03f1 6998 bset_undo_list (current_buffer, Qt);
1c3478b0
KH
6999 }
7000
df7492f9
KH
7001 coding->consumed = coding->consumed_char = 0;
7002 coding->produced = coding->produced_char = 0;
7003 coding->chars_at_source = 0;
065e3595 7004 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 7005 coding->errors = 0;
1c3478b0 7006
df7492f9
KH
7007 ALLOC_CONVERSION_WORK_AREA (coding);
7008
7009 attrs = CODING_ID_ATTRS (coding->id);
2170c8f0 7010 translation_table = get_translation_table (attrs, 0, NULL);
df7492f9 7011
69a80ea3 7012 carryover = 0;
d0396581
KH
7013 if (coding->decoder == decode_coding_ccl)
7014 {
7015 coding->spec.ccl = &cclspec;
7016 setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7017 }
df7492f9 7018 do
b73bfc1c 7019 {
d311d28c 7020 ptrdiff_t pos = coding->dst_pos + coding->produced_char;
69a80ea3 7021
df7492f9
KH
7022 coding_set_source (coding);
7023 coding->annotated = 0;
69a80ea3 7024 coding->charbuf_used = carryover;
df7492f9 7025 (*(coding->decoder)) (coding);
df7492f9 7026 coding_set_destination (coding);
69a80ea3 7027 carryover = produce_chars (coding, translation_table, 0);
df7492f9 7028 if (coding->annotated)
69a80ea3
KH
7029 produce_annotation (coding, pos);
7030 for (i = 0; i < carryover; i++)
7031 coding->charbuf[i]
7032 = coding->charbuf[coding->charbuf_used - carryover + i];
d46c5b12 7033 }
d0396581
KH
7034 while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7035 || (coding->consumed < coding->src_bytes
7036 && (coding->result == CODING_RESULT_SUCCESS
7037 || coding->result == CODING_RESULT_INVALID_SRC)));
d46c5b12 7038
69a80ea3
KH
7039 if (carryover > 0)
7040 {
7041 coding_set_destination (coding);
7042 coding->charbuf_used = carryover;
7043 produce_chars (coding, translation_table, 1);
7044 }
7045
df7492f9
KH
7046 coding->carryover_bytes = 0;
7047 if (coding->consumed < coding->src_bytes)
d46c5b12 7048 {
df7492f9 7049 int nbytes = coding->src_bytes - coding->consumed;
8f924df7 7050 const unsigned char *src;
df7492f9
KH
7051
7052 coding_set_source (coding);
7053 coding_set_destination (coding);
7054 src = coding->source + coding->consumed;
7055
7056 if (coding->mode & CODING_MODE_LAST_BLOCK)
1c3478b0 7057 {
df7492f9
KH
7058 /* Flush out unprocessed data as binary chars. We are sure
7059 that the number of data is less than the size of
7060 coding->charbuf. */
065e3595 7061 coding->charbuf_used = 0;
b2dab6c8
JR
7062 coding->chars_at_source = 0;
7063
df7492f9 7064 while (nbytes-- > 0)
1c3478b0 7065 {
df7492f9 7066 int c = *src++;
98725083 7067
1c91457d
KH
7068 if (c & 0x80)
7069 c = BYTE8_TO_CHAR (c);
7070 coding->charbuf[coding->charbuf_used++] = c;
1c3478b0 7071 }
f6cbaf43 7072 produce_chars (coding, Qnil, 1);
d46c5b12 7073 }
d46c5b12 7074 else
df7492f9
KH
7075 {
7076 /* Record unprocessed bytes in coding->carryover. We are
7077 sure that the number of data is less than the size of
7078 coding->carryover. */
7079 unsigned char *p = coding->carryover;
7080
f289d375
KH
7081 if (nbytes > sizeof coding->carryover)
7082 nbytes = sizeof coding->carryover;
df7492f9
KH
7083 coding->carryover_bytes = nbytes;
7084 while (nbytes-- > 0)
7085 *p++ = *src++;
1c3478b0 7086 }
df7492f9 7087 coding->consumed = coding->src_bytes;
b73bfc1c 7088 }
69f76525 7089
0a9564cb
EZ
7090 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7091 && !inhibit_eol_conversion)
4347441b 7092 decode_eol (coding);
24a73b0a
KH
7093 if (BUFFERP (coding->dst_object))
7094 {
39eb03f1 7095 bset_undo_list (current_buffer, undo_list);
24a73b0a
KH
7096 record_insert (coding->dst_pos, coding->produced_char);
7097 }
4ed46869
KH
7098}
7099
aaaf0b1e 7100
e1c23804 7101/* Extract an annotation datum from a composition starting at POS and
ff0dacd7
KH
7102 ending before LIMIT of CODING->src_object (buffer or string), store
7103 the data in BUF, set *STOP to a starting position of the next
7104 composition (if any) or to LIMIT, and return the address of the
7105 next element of BUF.
7106
7107 If such an annotation is not found, set *STOP to a starting
7108 position of a composition after POS (if any) or to LIMIT, and
7109 return BUF. */
7110
b0ab8123 7111static int *
d311d28c 7112handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
cf84bb53 7113 struct coding_system *coding, int *buf,
d311d28c 7114 ptrdiff_t *stop)
aaaf0b1e 7115{
d311d28c 7116 ptrdiff_t start, end;
ff0dacd7 7117 Lisp_Object prop;
aaaf0b1e 7118
ff0dacd7
KH
7119 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7120 || end > limit)
7121 *stop = limit;
7122 else if (start > pos)
7123 *stop = start;
7124 else
aaaf0b1e 7125 {
ff0dacd7 7126 if (start == pos)
aaaf0b1e 7127 {
ff0dacd7
KH
7128 /* We found a composition. Store the corresponding
7129 annotation data in BUF. */
7130 int *head = buf;
7131 enum composition_method method = COMPOSITION_METHOD (prop);
7132 int nchars = COMPOSITION_LENGTH (prop);
7133
e951386e 7134 ADD_COMPOSITION_DATA (buf, nchars, 0, method);
ff0dacd7 7135 if (method != COMPOSITION_RELATIVE)
aaaf0b1e 7136 {
ff0dacd7 7137 Lisp_Object components;
2c6a9faa 7138 ptrdiff_t i, len, i_byte;
ff0dacd7
KH
7139
7140 components = COMPOSITION_COMPONENTS (prop);
7141 if (VECTORP (components))
aaaf0b1e 7142 {
77b37c05 7143 len = ASIZE (components);
ff0dacd7
KH
7144 for (i = 0; i < len; i++)
7145 *buf++ = XINT (AREF (components, i));
aaaf0b1e 7146 }
ff0dacd7 7147 else if (STRINGP (components))
aaaf0b1e 7148 {
8f924df7 7149 len = SCHARS (components);
ff0dacd7
KH
7150 i = i_byte = 0;
7151 while (i < len)
7152 {
7153 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7154 buf++;
7155 }
7156 }
7157 else if (INTEGERP (components))
7158 {
7159 len = 1;
7160 *buf++ = XINT (components);
7161 }
7162 else if (CONSP (components))
7163 {
7164 for (len = 0; CONSP (components);
7165 len++, components = XCDR (components))
7166 *buf++ = XINT (XCAR (components));
aaaf0b1e 7167 }
aaaf0b1e 7168 else
1088b922 7169 emacs_abort ();
ff0dacd7 7170 *head -= len;
aaaf0b1e 7171 }
aaaf0b1e 7172 }
ff0dacd7
KH
7173
7174 if (find_composition (end, limit, &start, &end, &prop,
7175 coding->src_object)
7176 && end <= limit)
7177 *stop = start;
7178 else
7179 *stop = limit;
aaaf0b1e 7180 }
ff0dacd7
KH
7181 return buf;
7182}
7183
7184
e1c23804 7185/* Extract an annotation datum from a text property `charset' at POS of
ff0dacd7
KH
7186 CODING->src_object (buffer of string), store the data in BUF, set
7187 *STOP to the position where the value of `charset' property changes
7188 (limiting by LIMIT), and return the address of the next element of
7189 BUF.
7190
7191 If the property value is nil, set *STOP to the position where the
7192 property value is non-nil (limiting by LIMIT), and return BUF. */
7193
b0ab8123 7194static int *
d311d28c 7195handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
cf84bb53 7196 struct coding_system *coding, int *buf,
d311d28c 7197 ptrdiff_t *stop)
ff0dacd7
KH
7198{
7199 Lisp_Object val, next;
7200 int id;
7201
7202 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7203 if (! NILP (val) && CHARSETP (val))
7204 id = XINT (CHARSET_SYMBOL_ID (val));
7205 else
7206 id = -1;
69a80ea3 7207 ADD_CHARSET_DATA (buf, 0, id);
ff0dacd7
KH
7208 next = Fnext_single_property_change (make_number (pos), Qcharset,
7209 coding->src_object,
7210 make_number (limit));
7211 *stop = XINT (next);
7212 return buf;
7213}
7214
7215
df7492f9 7216static void
cf84bb53
JB
7217consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7218 int max_lookup)
df7492f9
KH
7219{
7220 int *buf = coding->charbuf;
ff0dacd7 7221 int *buf_end = coding->charbuf + coding->charbuf_size;
7c78e542 7222 const unsigned char *src = coding->source + coding->consumed;
4776e638 7223 const unsigned char *src_end = coding->source + coding->src_bytes;
d311d28c
PE
7224 ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7225 ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
f10fe38f 7226 bool multibytep = coding->src_multibyte;
df7492f9
KH
7227 Lisp_Object eol_type;
7228 int c;
d311d28c 7229 ptrdiff_t stop, stop_composition, stop_charset;
09ee6fdd 7230 int *lookup_buf = NULL;
433f7f87
KH
7231
7232 if (! NILP (translation_table))
09ee6fdd 7233 lookup_buf = alloca (sizeof (int) * max_lookup);
88993dfd 7234
0a9564cb 7235 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
df7492f9
KH
7236 if (VECTORP (eol_type))
7237 eol_type = Qunix;
88993dfd 7238
df7492f9
KH
7239 /* Note: composition handling is not yet implemented. */
7240 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
ec6d2bb8 7241
0b5670c9
KH
7242 if (NILP (coding->src_object))
7243 stop = stop_composition = stop_charset = end_pos;
ff0dacd7 7244 else
0b5670c9
KH
7245 {
7246 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7247 stop = stop_composition = pos;
7248 else
7249 stop = stop_composition = end_pos;
7250 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7251 stop = stop_charset = pos;
7252 else
7253 stop_charset = end_pos;
7254 }
ec6d2bb8 7255
24a73b0a 7256 /* Compensate for CRLF and conversion. */
ff0dacd7 7257 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
df7492f9 7258 while (buf < buf_end)
aaaf0b1e 7259 {
433f7f87
KH
7260 Lisp_Object trans;
7261
df7492f9 7262 if (pos == stop)
ec6d2bb8 7263 {
df7492f9
KH
7264 if (pos == end_pos)
7265 break;
ff0dacd7
KH
7266 if (pos == stop_composition)
7267 buf = handle_composition_annotation (pos, end_pos, coding,
7268 buf, &stop_composition);
7269 if (pos == stop_charset)
7270 buf = handle_charset_annotation (pos, end_pos, coding,
7271 buf, &stop_charset);
7272 stop = (stop_composition < stop_charset
7273 ? stop_composition : stop_charset);
df7492f9
KH
7274 }
7275
7276 if (! multibytep)
4776e638 7277 {
d311d28c 7278 int bytes;
aaaf0b1e 7279
4d1e6632
KH
7280 if (coding->encoder == encode_coding_raw_text
7281 || coding->encoder == encode_coding_ccl)
ea29edf2
KH
7282 c = *src++, pos++;
7283 else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
db274c7a 7284 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
4776e638 7285 else
f03caae0 7286 c = BYTE8_TO_CHAR (*src), src++, pos++;
4776e638 7287 }
df7492f9 7288 else
db274c7a 7289 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
df7492f9
KH
7290 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7291 c = '\n';
7292 if (! EQ (eol_type, Qunix))
aaaf0b1e 7293 {
df7492f9 7294 if (c == '\n')
aaaf0b1e 7295 {
df7492f9
KH
7296 if (EQ (eol_type, Qdos))
7297 *buf++ = '\r';
7298 else
7299 c = '\r';
aaaf0b1e
KH
7300 }
7301 }
433f7f87 7302
e6a54062 7303 trans = Qnil;
09ee6fdd 7304 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
e6a54062 7305 if (NILP (trans))
433f7f87
KH
7306 *buf++ = c;
7307 else
7308 {
2c6a9faa 7309 ptrdiff_t from_nchars = 1, to_nchars = 1;
433f7f87
KH
7310 int *lookup_buf_end;
7311 const unsigned char *p = src;
7312 int i;
7313
7314 lookup_buf[0] = c;
7315 for (i = 1; i < max_lookup && p < src_end; i++)
7316 lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7317 lookup_buf_end = lookup_buf + i;
e951386e
KH
7318 trans = get_translation (trans, lookup_buf, lookup_buf_end);
7319 if (INTEGERP (trans))
7320 c = XINT (trans);
7321 else if (CONSP (trans))
7322 {
7323 from_nchars = ASIZE (XCAR (trans));
7324 trans = XCDR (trans);
7325 if (INTEGERP (trans))
7326 c = XINT (trans);
7327 else
7328 {
7329 to_nchars = ASIZE (trans);
2c6a9faa 7330 if (buf_end - buf < to_nchars)
e951386e
KH
7331 break;
7332 c = XINT (AREF (trans, 0));
7333 }
7334 }
7335 else
433f7f87 7336 break;
e951386e 7337 *buf++ = c;
433f7f87
KH
7338 for (i = 1; i < to_nchars; i++)
7339 *buf++ = XINT (AREF (trans, i));
7340 for (i = 1; i < from_nchars; i++, pos++)
7341 src += MULTIBYTE_LENGTH_NO_CHECK (src);
7342 }
aaaf0b1e 7343 }
ec6d2bb8 7344
df7492f9
KH
7345 coding->consumed = src - coding->source;
7346 coding->consumed_char = pos - coding->src_pos;
7347 coding->charbuf_used = buf - coding->charbuf;
7348 coding->chars_at_source = 0;
aaaf0b1e
KH
7349}
7350
4ed46869 7351
df7492f9
KH
7352/* Encode the text at CODING->src_object into CODING->dst_object.
7353 CODING->src_object is a buffer or a string.
7354 CODING->dst_object is a buffer or nil.
7355
7356 If CODING->src_object is a buffer, it must be the current buffer.
7357 In this case, if CODING->src_pos is positive, it is a position of
7358 the source text in the buffer, otherwise. the source text is in the
7359 gap area of the buffer, and coding->src_pos specifies the offset of
7360 the text from GPT (which must be the same as PT). If this is the
7361 same buffer as CODING->dst_object, CODING->src_pos must be
7362 negative and CODING should not have `pre-write-conversion'.
7363
7364 If CODING->src_object is a string, CODING should not have
7365 `pre-write-conversion'.
7366
7367 If CODING->dst_object is a buffer, the encoded data is inserted at
7368 the current point of that buffer.
7369
7370 If CODING->dst_object is nil, the encoded data is placed at the
7371 memory area specified by CODING->destination. */
7372
f10fe38f 7373static void
971de7fb 7374encode_coding (struct coding_system *coding)
4ed46869 7375{
df7492f9 7376 Lisp_Object attrs;
7d64c6ad 7377 Lisp_Object translation_table;
09ee6fdd 7378 int max_lookup;
fb608df3 7379 struct ccl_spec cclspec;
9861e777 7380
df7492f9 7381 attrs = CODING_ID_ATTRS (coding->id);
ea29edf2
KH
7382 if (coding->encoder == encode_coding_raw_text)
7383 translation_table = Qnil, max_lookup = 0;
7384 else
7385 translation_table = get_translation_table (attrs, 1, &max_lookup);
4ed46869 7386
df7492f9 7387 if (BUFFERP (coding->dst_object))
8844fa83 7388 {
df7492f9
KH
7389 set_buffer_internal (XBUFFER (coding->dst_object));
7390 coding->dst_multibyte
4b4deea2 7391 = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
8844fa83 7392 }
4ed46869 7393
b73bfc1c 7394 coding->consumed = coding->consumed_char = 0;
df7492f9 7395 coding->produced = coding->produced_char = 0;
065e3595 7396 record_conversion_result (coding, CODING_RESULT_SUCCESS);
b73bfc1c 7397 coding->errors = 0;
b73bfc1c 7398
df7492f9 7399 ALLOC_CONVERSION_WORK_AREA (coding);
4ed46869 7400
fb608df3
KH
7401 if (coding->encoder == encode_coding_ccl)
7402 {
7403 coding->spec.ccl = &cclspec;
7404 setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7405 }
df7492f9
KH
7406 do {
7407 coding_set_source (coding);
09ee6fdd 7408 consume_chars (coding, translation_table, max_lookup);
df7492f9
KH
7409 coding_set_destination (coding);
7410 (*(coding->encoder)) (coding);
7411 } while (coding->consumed_char < coding->src_chars);
7412
284201e4 7413 if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
df7492f9 7414 insert_from_gap (coding->produced_char, coding->produced);
ec6d2bb8
KH
7415}
7416
fb88bf2d 7417
24a73b0a
KH
7418/* Name (or base name) of work buffer for code conversion. */
7419static Lisp_Object Vcode_conversion_workbuf_name;
d46c5b12 7420
24a73b0a
KH
7421/* A working buffer used by the top level conversion. Once it is
7422 created, it is never destroyed. It has the name
7423 Vcode_conversion_workbuf_name. The other working buffers are
7424 destroyed after the use is finished, and their names are modified
7425 versions of Vcode_conversion_workbuf_name. */
7426static Lisp_Object Vcode_conversion_reused_workbuf;
b73bfc1c 7427
f10fe38f
PE
7428/* True iff Vcode_conversion_reused_workbuf is already in use. */
7429static bool reused_workbuf_in_use;
4ed46869 7430
24a73b0a 7431
ad1746f5 7432/* Return a working buffer of code conversion. MULTIBYTE specifies the
24a73b0a 7433 multibyteness of returning buffer. */
b73bfc1c 7434
f6cbaf43 7435static Lisp_Object
f10fe38f 7436make_conversion_work_buffer (bool multibyte)
df7492f9 7437{
24a73b0a
KH
7438 Lisp_Object name, workbuf;
7439 struct buffer *current;
4ed46869 7440
f10fe38f 7441 if (reused_workbuf_in_use)
065e3595
KH
7442 {
7443 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7444 workbuf = Fget_buffer_create (name);
7445 }
df7492f9 7446 else
065e3595 7447 {
f10fe38f 7448 reused_workbuf_in_use = 1;
159bd5a2 7449 if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
a993c7a1
KH
7450 Vcode_conversion_reused_workbuf
7451 = Fget_buffer_create (Vcode_conversion_workbuf_name);
7452 workbuf = Vcode_conversion_reused_workbuf;
065e3595 7453 }
24a73b0a
KH
7454 current = current_buffer;
7455 set_buffer_internal (XBUFFER (workbuf));
df36ff1f
CY
7456 /* We can't allow modification hooks to run in the work buffer. For
7457 instance, directory_files_internal assumes that file decoding
7458 doesn't compile new regexps. */
7459 Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
3ed051d4 7460 Ferase_buffer ();
39eb03f1
PE
7461 bset_undo_list (current_buffer, Qt);
7462 bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
df7492f9 7463 set_buffer_internal (current);
24a73b0a 7464 return workbuf;
df7492f9 7465}
d46c5b12 7466
24a73b0a 7467
4776e638 7468static Lisp_Object
971de7fb 7469code_conversion_restore (Lisp_Object arg)
4776e638 7470{
24a73b0a 7471 Lisp_Object current, workbuf;
948bdcf3 7472 struct gcpro gcpro1;
24a73b0a 7473
948bdcf3 7474 GCPRO1 (arg);
24a73b0a
KH
7475 current = XCAR (arg);
7476 workbuf = XCDR (arg);
7477 if (! NILP (workbuf))
7478 {
7479 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7480 reused_workbuf_in_use = 0;
d17337e5 7481 else
24a73b0a
KH
7482 Fkill_buffer (workbuf);
7483 }
7484 set_buffer_internal (XBUFFER (current));
948bdcf3 7485 UNGCPRO;
4776e638
KH
7486 return Qnil;
7487}
b73bfc1c 7488
24a73b0a 7489Lisp_Object
f10fe38f 7490code_conversion_save (bool with_work_buf, bool multibyte)
df7492f9 7491{
24a73b0a 7492 Lisp_Object workbuf = Qnil;
b73bfc1c 7493
4776e638 7494 if (with_work_buf)
24a73b0a
KH
7495 workbuf = make_conversion_work_buffer (multibyte);
7496 record_unwind_protect (code_conversion_restore,
7497 Fcons (Fcurrent_buffer (), workbuf));
4776e638 7498 return workbuf;
df7492f9 7499}
d46c5b12 7500
f10fe38f 7501void
cf84bb53 7502decode_coding_gap (struct coding_system *coding,
d311d28c 7503 ptrdiff_t chars, ptrdiff_t bytes)
df7492f9 7504{
d311d28c 7505 ptrdiff_t count = SPECPDL_INDEX ();
5e5c78be 7506 Lisp_Object attrs;
fb88bf2d 7507
24a73b0a 7508 code_conversion_save (0, 0);
ec6d2bb8 7509
24a73b0a 7510 coding->src_object = Fcurrent_buffer ();
df7492f9
KH
7511 coding->src_chars = chars;
7512 coding->src_bytes = bytes;
7513 coding->src_pos = -chars;
7514 coding->src_pos_byte = -bytes;
7515 coding->src_multibyte = chars < bytes;
24a73b0a 7516 coding->dst_object = coding->src_object;
df7492f9
KH
7517 coding->dst_pos = PT;
7518 coding->dst_pos_byte = PT_BYTE;
4b4deea2 7519 coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
4ed46869 7520
df7492f9
KH
7521 if (CODING_REQUIRE_DETECTION (coding))
7522 detect_coding (coding);
8f924df7 7523
9286b333 7524 coding->mode |= CODING_MODE_LAST_BLOCK;
287c57d7 7525 current_buffer->text->inhibit_shrinking = 1;
df7492f9 7526 decode_coding (coding);
287c57d7 7527 current_buffer->text->inhibit_shrinking = 0;
d46c5b12 7528
5e5c78be
KH
7529 attrs = CODING_ID_ATTRS (coding->id);
7530 if (! NILP (CODING_ATTR_POST_READ (attrs)))
b73bfc1c 7531 {
d311d28c 7532 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
5e5c78be
KH
7533 Lisp_Object val;
7534
7535 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
5e5c78be
KH
7536 val = call1 (CODING_ATTR_POST_READ (attrs),
7537 make_number (coding->produced_char));
5e5c78be
KH
7538 CHECK_NATNUM (val);
7539 coding->produced_char += Z - prev_Z;
7540 coding->produced += Z_BYTE - prev_Z_BYTE;
b73bfc1c 7541 }
4ed46869 7542
df7492f9 7543 unbind_to (count, Qnil);
b73bfc1c 7544}
52d41803 7545
d46c5b12 7546
df7492f9
KH
7547/* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7548 SRC_OBJECT into DST_OBJECT by coding context CODING.
b73bfc1c 7549
df7492f9 7550 SRC_OBJECT is a buffer, a string, or Qnil.
b73bfc1c 7551
df7492f9
KH
7552 If it is a buffer, the text is at point of the buffer. FROM and TO
7553 are positions in the buffer.
b73bfc1c 7554
df7492f9
KH
7555 If it is a string, the text is at the beginning of the string.
7556 FROM and TO are indices to the string.
4ed46869 7557
df7492f9
KH
7558 If it is nil, the text is at coding->source. FROM and TO are
7559 indices to coding->source.
bb10be8b 7560
df7492f9 7561 DST_OBJECT is a buffer, Qt, or Qnil.
4ed46869 7562
df7492f9
KH
7563 If it is a buffer, the decoded text is inserted at point of the
7564 buffer. If the buffer is the same as SRC_OBJECT, the source text
7565 is deleted.
4ed46869 7566
df7492f9
KH
7567 If it is Qt, a string is made from the decoded text, and
7568 set in CODING->dst_object.
d46c5b12 7569
df7492f9 7570 If it is Qnil, the decoded text is stored at CODING->destination.
2cb26057 7571 The caller must allocate CODING->dst_bytes bytes at
df7492f9
KH
7572 CODING->destination by xmalloc. If the decoded text is longer than
7573 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7574 */
d46c5b12 7575
df7492f9 7576void
cf84bb53
JB
7577decode_coding_object (struct coding_system *coding,
7578 Lisp_Object src_object,
d311d28c
PE
7579 ptrdiff_t from, ptrdiff_t from_byte,
7580 ptrdiff_t to, ptrdiff_t to_byte,
cf84bb53 7581 Lisp_Object dst_object)
d46c5b12 7582{
d311d28c 7583 ptrdiff_t count = SPECPDL_INDEX ();
c4a63b12 7584 unsigned char *destination IF_LINT (= NULL);
d311d28c
PE
7585 ptrdiff_t dst_bytes IF_LINT (= 0);
7586 ptrdiff_t chars = to - from;
7587 ptrdiff_t bytes = to_byte - from_byte;
df7492f9 7588 Lisp_Object attrs;
f10fe38f
PE
7589 ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7590 bool need_marker_adjustment = 0;
b3bfad50 7591 Lisp_Object old_deactivate_mark;
d46c5b12 7592
b3bfad50 7593 old_deactivate_mark = Vdeactivate_mark;
93dec019 7594
df7492f9 7595 if (NILP (dst_object))
d46c5b12 7596 {
df7492f9
KH
7597 destination = coding->destination;
7598 dst_bytes = coding->dst_bytes;
d46c5b12 7599 }
93dec019 7600
df7492f9
KH
7601 coding->src_object = src_object;
7602 coding->src_chars = chars;
7603 coding->src_bytes = bytes;
7604 coding->src_multibyte = chars < bytes;
70ad9fc4 7605
df7492f9 7606 if (STRINGP (src_object))
d46c5b12 7607 {
df7492f9
KH
7608 coding->src_pos = from;
7609 coding->src_pos_byte = from_byte;
d46c5b12 7610 }
df7492f9 7611 else if (BUFFERP (src_object))
88993dfd 7612 {
df7492f9
KH
7613 set_buffer_internal (XBUFFER (src_object));
7614 if (from != GPT)
7615 move_gap_both (from, from_byte);
7616 if (EQ (src_object, dst_object))
fb88bf2d 7617 {
64cedb0c
KH
7618 struct Lisp_Marker *tail;
7619
7620 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7621 {
7622 tail->need_adjustment
7623 = tail->charpos == (tail->insertion_type ? from : to);
7624 need_marker_adjustment |= tail->need_adjustment;
7625 }
4776e638 7626 saved_pt = PT, saved_pt_byte = PT_BYTE;
df7492f9 7627 TEMP_SET_PT_BOTH (from, from_byte);
f4a3cc44 7628 current_buffer->text->inhibit_shrinking = 1;
df7492f9
KH
7629 del_range_both (from, from_byte, to, to_byte, 1);
7630 coding->src_pos = -chars;
7631 coding->src_pos_byte = -bytes;
fb88bf2d 7632 }
df7492f9 7633 else
fb88bf2d 7634 {
df7492f9
KH
7635 coding->src_pos = from;
7636 coding->src_pos_byte = from_byte;
fb88bf2d 7637 }
88993dfd
KH
7638 }
7639
df7492f9
KH
7640 if (CODING_REQUIRE_DETECTION (coding))
7641 detect_coding (coding);
7642 attrs = CODING_ID_ATTRS (coding->id);
d46c5b12 7643
2cb26057
KH
7644 if (EQ (dst_object, Qt)
7645 || (! NILP (CODING_ATTR_POST_READ (attrs))
7646 && NILP (dst_object)))
b73bfc1c 7647 {
a1567c45
SM
7648 coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7649 coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
df7492f9
KH
7650 coding->dst_pos = BEG;
7651 coding->dst_pos_byte = BEG_BYTE;
b73bfc1c 7652 }
df7492f9 7653 else if (BUFFERP (dst_object))
d46c5b12 7654 {
24a73b0a 7655 code_conversion_save (0, 0);
df7492f9
KH
7656 coding->dst_object = dst_object;
7657 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7658 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7659 coding->dst_multibyte
4b4deea2 7660 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
d46c5b12
KH
7661 }
7662 else
7663 {
24a73b0a 7664 code_conversion_save (0, 0);
df7492f9 7665 coding->dst_object = Qnil;
0154725e
SM
7666 /* Most callers presume this will return a multibyte result, and they
7667 won't use `binary' or `raw-text' anyway, so let's not worry about
7668 CODING_FOR_UNIBYTE. */
bb555731 7669 coding->dst_multibyte = 1;
d46c5b12
KH
7670 }
7671
df7492f9 7672 decode_coding (coding);
fa46990e 7673
df7492f9
KH
7674 if (BUFFERP (coding->dst_object))
7675 set_buffer_internal (XBUFFER (coding->dst_object));
d46c5b12 7676
df7492f9 7677 if (! NILP (CODING_ATTR_POST_READ (attrs)))
d46c5b12 7678 {
b3bfad50 7679 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
d311d28c 7680 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
2b4f9037 7681 Lisp_Object val;
d46c5b12 7682
c0cc7f7f 7683 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
b3bfad50
KH
7684 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7685 old_deactivate_mark);
d4850d67
KH
7686 val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7687 make_number (coding->produced_char));
df7492f9
KH
7688 UNGCPRO;
7689 CHECK_NATNUM (val);
7690 coding->produced_char += Z - prev_Z;
7691 coding->produced += Z_BYTE - prev_Z_BYTE;
d46c5b12 7692 }
de79a6a5 7693
df7492f9 7694 if (EQ (dst_object, Qt))
ec6d2bb8 7695 {
df7492f9
KH
7696 coding->dst_object = Fbuffer_string ();
7697 }
7698 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7699 {
7700 set_buffer_internal (XBUFFER (coding->dst_object));
7701 if (dst_bytes < coding->produced)
7702 {
b3bfad50 7703 destination = xrealloc (destination, coding->produced);
df7492f9
KH
7704 if (! destination)
7705 {
065e3595 7706 record_conversion_result (coding,
ebaf11b6 7707 CODING_RESULT_INSUFFICIENT_MEM);
df7492f9
KH
7708 unbind_to (count, Qnil);
7709 return;
7710 }
7711 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7712 move_gap_both (BEGV, BEGV_BYTE);
72af86bd 7713 memcpy (destination, BEGV_ADDR, coding->produced);
df7492f9 7714 coding->destination = destination;
d46c5b12 7715 }
ec6d2bb8 7716 }
b73bfc1c 7717
4776e638
KH
7718 if (saved_pt >= 0)
7719 {
7720 /* This is the case of:
7721 (BUFFERP (src_object) && EQ (src_object, dst_object))
7722 As we have moved PT while replacing the original buffer
7723 contents, we must recover it now. */
7724 set_buffer_internal (XBUFFER (src_object));
f4a3cc44 7725 current_buffer->text->inhibit_shrinking = 0;
4776e638
KH
7726 if (saved_pt < from)
7727 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7728 else if (saved_pt < from + chars)
7729 TEMP_SET_PT_BOTH (from, from_byte);
4b4deea2 7730 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
4776e638
KH
7731 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7732 saved_pt_byte + (coding->produced - bytes));
7733 else
7734 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7735 saved_pt_byte + (coding->produced - bytes));
64cedb0c
KH
7736
7737 if (need_marker_adjustment)
7738 {
7739 struct Lisp_Marker *tail;
7740
7741 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7742 if (tail->need_adjustment)
7743 {
7744 tail->need_adjustment = 0;
7745 if (tail->insertion_type)
7746 {
7747 tail->bytepos = from_byte;
7748 tail->charpos = from;
7749 }
7750 else
7751 {
7752 tail->bytepos = from_byte + coding->produced;
7753 tail->charpos
4b4deea2 7754 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
64cedb0c
KH
7755 ? tail->bytepos : from + coding->produced_char);
7756 }
7757 }
7758 }
d46c5b12 7759 }
4776e638 7760
b3bfad50 7761 Vdeactivate_mark = old_deactivate_mark;
065e3595 7762 unbind_to (count, coding->dst_object);
d46c5b12
KH
7763}
7764
d46c5b12 7765
df7492f9 7766void
cf84bb53
JB
7767encode_coding_object (struct coding_system *coding,
7768 Lisp_Object src_object,
d311d28c
PE
7769 ptrdiff_t from, ptrdiff_t from_byte,
7770 ptrdiff_t to, ptrdiff_t to_byte,
cf84bb53 7771 Lisp_Object dst_object)
d46c5b12 7772{
d311d28c
PE
7773 ptrdiff_t count = SPECPDL_INDEX ();
7774 ptrdiff_t chars = to - from;
7775 ptrdiff_t bytes = to_byte - from_byte;
df7492f9 7776 Lisp_Object attrs;
f10fe38f
PE
7777 ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7778 bool need_marker_adjustment = 0;
7779 bool kill_src_buffer = 0;
b3bfad50 7780 Lisp_Object old_deactivate_mark;
df7492f9 7781
b3bfad50 7782 old_deactivate_mark = Vdeactivate_mark;
df7492f9
KH
7783
7784 coding->src_object = src_object;
7785 coding->src_chars = chars;
7786 coding->src_bytes = bytes;
7787 coding->src_multibyte = chars < bytes;
7788
7789 attrs = CODING_ID_ATTRS (coding->id);
7790
64cedb0c
KH
7791 if (EQ (src_object, dst_object))
7792 {
7793 struct Lisp_Marker *tail;
7794
7795 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7796 {
7797 tail->need_adjustment
7798 = tail->charpos == (tail->insertion_type ? from : to);
7799 need_marker_adjustment |= tail->need_adjustment;
7800 }
7801 }
7802
df7492f9 7803 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6bac5b12 7804 {
24a73b0a 7805 coding->src_object = code_conversion_save (1, coding->src_multibyte);
df7492f9
KH
7806 set_buffer_internal (XBUFFER (coding->src_object));
7807 if (STRINGP (src_object))
7808 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7809 else if (BUFFERP (src_object))
7810 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7811 else
b68864e5 7812 insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
d46c5b12 7813
df7492f9
KH
7814 if (EQ (src_object, dst_object))
7815 {
7816 set_buffer_internal (XBUFFER (src_object));
4776e638 7817 saved_pt = PT, saved_pt_byte = PT_BYTE;
df7492f9
KH
7818 del_range_both (from, from_byte, to, to_byte, 1);
7819 set_buffer_internal (XBUFFER (coding->src_object));
7820 }
7821
d4850d67 7822 {
b3bfad50 7823 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
d4850d67 7824
b3bfad50
KH
7825 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7826 old_deactivate_mark);
6cd7a139
DA
7827 safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
7828 make_number (BEG), make_number (Z));
b3bfad50 7829 UNGCPRO;
d4850d67 7830 }
c02d943b
KH
7831 if (XBUFFER (coding->src_object) != current_buffer)
7832 kill_src_buffer = 1;
ac87bbef 7833 coding->src_object = Fcurrent_buffer ();
df7492f9
KH
7834 if (BEG != GPT)
7835 move_gap_both (BEG, BEG_BYTE);
7836 coding->src_chars = Z - BEG;
7837 coding->src_bytes = Z_BYTE - BEG_BYTE;
7838 coding->src_pos = BEG;
7839 coding->src_pos_byte = BEG_BYTE;
7840 coding->src_multibyte = Z < Z_BYTE;
7841 }
7842 else if (STRINGP (src_object))
d46c5b12 7843 {
24a73b0a 7844 code_conversion_save (0, 0);
df7492f9
KH
7845 coding->src_pos = from;
7846 coding->src_pos_byte = from_byte;
b73bfc1c 7847 }
df7492f9 7848 else if (BUFFERP (src_object))
b73bfc1c 7849 {
24a73b0a 7850 code_conversion_save (0, 0);
df7492f9 7851 set_buffer_internal (XBUFFER (src_object));
df7492f9 7852 if (EQ (src_object, dst_object))
d46c5b12 7853 {
4776e638 7854 saved_pt = PT, saved_pt_byte = PT_BYTE;
ff0dacd7
KH
7855 coding->src_object = del_range_1 (from, to, 1, 1);
7856 coding->src_pos = 0;
7857 coding->src_pos_byte = 0;
d46c5b12 7858 }
df7492f9 7859 else
d46c5b12 7860 {
ff0dacd7
KH
7861 if (from < GPT && to >= GPT)
7862 move_gap_both (from, from_byte);
df7492f9
KH
7863 coding->src_pos = from;
7864 coding->src_pos_byte = from_byte;
d46c5b12 7865 }
d46c5b12 7866 }
4776e638 7867 else
24a73b0a 7868 code_conversion_save (0, 0);
d46c5b12 7869
df7492f9 7870 if (BUFFERP (dst_object))
88993dfd 7871 {
df7492f9 7872 coding->dst_object = dst_object;
28f67a95
KH
7873 if (EQ (src_object, dst_object))
7874 {
7875 coding->dst_pos = from;
7876 coding->dst_pos_byte = from_byte;
7877 }
7878 else
7879 {
319a3947
KH
7880 struct buffer *current = current_buffer;
7881
7882 set_buffer_temp (XBUFFER (dst_object));
7883 coding->dst_pos = PT;
7884 coding->dst_pos_byte = PT_BYTE;
7885 move_gap_both (coding->dst_pos, coding->dst_pos_byte);
7886 set_buffer_temp (current);
28f67a95 7887 }
df7492f9 7888 coding->dst_multibyte
4b4deea2 7889 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
88993dfd 7890 }
df7492f9 7891 else if (EQ (dst_object, Qt))
d46c5b12 7892 {
5d009b3a 7893 ptrdiff_t dst_bytes = max (1, coding->src_chars);
df7492f9 7894 coding->dst_object = Qnil;
23f86fce 7895 coding->destination = xmalloc (dst_bytes);
5d009b3a 7896 coding->dst_bytes = dst_bytes;
df7492f9 7897 coding->dst_multibyte = 0;
d46c5b12
KH
7898 }
7899 else
7900 {
df7492f9
KH
7901 coding->dst_object = Qnil;
7902 coding->dst_multibyte = 0;
d46c5b12
KH
7903 }
7904
df7492f9 7905 encode_coding (coding);
d46c5b12 7906
df7492f9 7907 if (EQ (dst_object, Qt))
d46c5b12 7908 {
df7492f9
KH
7909 if (BUFFERP (coding->dst_object))
7910 coding->dst_object = Fbuffer_string ();
7911 else
d46c5b12 7912 {
df7492f9
KH
7913 coding->dst_object
7914 = make_unibyte_string ((char *) coding->destination,
7915 coding->produced);
7916 xfree (coding->destination);
d46c5b12 7917 }
4ed46869 7918 }
d46c5b12 7919
4776e638
KH
7920 if (saved_pt >= 0)
7921 {
7922 /* This is the case of:
7923 (BUFFERP (src_object) && EQ (src_object, dst_object))
7924 As we have moved PT while replacing the original buffer
7925 contents, we must recover it now. */
7926 set_buffer_internal (XBUFFER (src_object));
7927 if (saved_pt < from)
7928 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7929 else if (saved_pt < from + chars)
7930 TEMP_SET_PT_BOTH (from, from_byte);
4b4deea2 7931 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
4776e638
KH
7932 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7933 saved_pt_byte + (coding->produced - bytes));
d46c5b12 7934 else
4776e638
KH
7935 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7936 saved_pt_byte + (coding->produced - bytes));
64cedb0c
KH
7937
7938 if (need_marker_adjustment)
7939 {
7940 struct Lisp_Marker *tail;
7941
7942 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7943 if (tail->need_adjustment)
7944 {
7945 tail->need_adjustment = 0;
7946 if (tail->insertion_type)
7947 {
7948 tail->bytepos = from_byte;
7949 tail->charpos = from;
7950 }
7951 else
7952 {
7953 tail->bytepos = from_byte + coding->produced;
7954 tail->charpos
4b4deea2 7955 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
64cedb0c
KH
7956 ? tail->bytepos : from + coding->produced_char);
7957 }
7958 }
7959 }
4776e638
KH
7960 }
7961
c02d943b
KH
7962 if (kill_src_buffer)
7963 Fkill_buffer (coding->src_object);
b3bfad50
KH
7964
7965 Vdeactivate_mark = old_deactivate_mark;
df7492f9 7966 unbind_to (count, Qnil);
b73bfc1c
KH
7967}
7968
df7492f9 7969
b73bfc1c 7970Lisp_Object
971de7fb 7971preferred_coding_system (void)
b73bfc1c 7972{
df7492f9 7973 int id = coding_categories[coding_priorities[0]].id;
2391eaa4 7974
df7492f9 7975 return CODING_ID_NAME (id);
4ed46869
KH
7976}
7977
7f590b0c 7978#if defined (WINDOWSNT) || defined (CYGWIN)
ba116008
DC
7979
7980Lisp_Object
7981from_unicode (Lisp_Object str)
7982{
7983 CHECK_STRING (str);
7984 if (!STRING_MULTIBYTE (str) &&
7985 SBYTES (str) & 1)
7986 {
7987 str = Fsubstring (str, make_number (0), make_number (-1));
7988 }
7989
7990 return code_convert_string_norecord (str, Qutf_16le, 0);
7991}
7992
7993wchar_t *
7994to_unicode (Lisp_Object str, Lisp_Object *buf)
7995{
7996 *buf = code_convert_string_norecord (str, Qutf_16le, 1);
7997 /* We need to make a another copy (in addition to the one made by
7998 code_convert_string_norecord) to ensure that the final string is
7999 _doubly_ zero terminated --- that is, that the string is
8000 terminated by two zero bytes and one utf-16le null character.
8001 Because strings are already terminated with a single zero byte,
8002 we just add one additional zero. */
8003 str = make_uninit_string (SBYTES (*buf) + 1);
8004 memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8005 SDATA (str) [SBYTES (*buf)] = '\0';
8006 *buf = str;
8007 return WCSDATA (*buf);
8008}
7f590b0c
DC
8009
8010#endif /* WINDOWSNT || CYGWIN */
ba116008 8011
4ed46869
KH
8012\f
8013#ifdef emacs
1397dc18 8014/*** 11. Emacs Lisp library functions ***/
4ed46869 8015
a7ca3326 8016DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
48b0f3ae 8017 doc: /* Return t if OBJECT is nil or a coding-system.
df7492f9 8018See the documentation of `define-coding-system' for information
48b0f3ae 8019about coding-system objects. */)
5842a27b 8020 (Lisp_Object object)
4ed46869 8021{
d4a1d553
JB
8022 if (NILP (object)
8023 || CODING_SYSTEM_ID (object) >= 0)
44e8490d 8024 return Qt;
d4a1d553
JB
8025 if (! SYMBOLP (object)
8026 || NILP (Fget (object, Qcoding_system_define_form)))
44e8490d
KH
8027 return Qnil;
8028 return Qt;
4ed46869
KH
8029}
8030
a7ca3326 8031DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
9d991de8 8032 Sread_non_nil_coding_system, 1, 1, 0,
48b0f3ae 8033 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
5842a27b 8034 (Lisp_Object prompt)
4ed46869 8035{
e0e989f6 8036 Lisp_Object val;
9d991de8
RS
8037 do
8038 {
4608c386
KH
8039 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8040 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
9d991de8 8041 }
8f924df7 8042 while (SCHARS (val) == 0);
e0e989f6 8043 return (Fintern (val, Qnil));
4ed46869
KH
8044}
8045
a7ca3326 8046DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
48b0f3ae 8047 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
c7183fb8
GM
8048If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8049Ignores case when completing coding systems (all Emacs coding systems
8050are lower-case). */)
5842a27b 8051 (Lisp_Object prompt, Lisp_Object default_coding_system)
4ed46869 8052{
f44d27ce 8053 Lisp_Object val;
d311d28c 8054 ptrdiff_t count = SPECPDL_INDEX ();
c7183fb8 8055
9b787f3e 8056 if (SYMBOLP (default_coding_system))
57d25e6f 8057 default_coding_system = SYMBOL_NAME (default_coding_system);
c7183fb8 8058 specbind (Qcompletion_ignore_case, Qt);
4608c386 8059 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
9b787f3e
RS
8060 Qt, Qnil, Qcoding_system_history,
8061 default_coding_system, Qnil);
c7183fb8 8062 unbind_to (count, Qnil);
8f924df7 8063 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
8064}
8065
a7ca3326 8066DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
4ed46869 8067 1, 1, 0,
48b0f3ae 8068 doc: /* Check validity of CODING-SYSTEM.
9ffd559c
KH
8069If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8070It is valid if it is nil or a symbol defined as a coding system by the
8071function `define-coding-system'. */)
5842a27b 8072 (Lisp_Object coding_system)
4ed46869 8073{
44e8490d
KH
8074 Lisp_Object define_form;
8075
8076 define_form = Fget (coding_system, Qcoding_system_define_form);
8077 if (! NILP (define_form))
8078 {
8079 Fput (coding_system, Qcoding_system_define_form, Qnil);
8080 safe_eval (define_form);
8081 }
4ed46869
KH
8082 if (!NILP (Fcoding_system_p (coding_system)))
8083 return coding_system;
fcad4ec4 8084 xsignal1 (Qcoding_system_error, coding_system);
4ed46869 8085}
df7492f9 8086
3a73fa5d 8087\f
89528eb3 8088/* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
f10fe38f 8089 HIGHEST, return the coding system of the highest
ad1746f5 8090 priority among the detected coding systems. Otherwise return a
89528eb3 8091 list of detected coding systems sorted by their priorities. If
f10fe38f 8092 MULTIBYTEP, it is assumed that the bytes are in correct
89528eb3
KH
8093 multibyte form but contains only ASCII and eight-bit chars.
8094 Otherwise, the bytes are raw bytes.
8095
8096 CODING-SYSTEM controls the detection as below:
8097
8098 If it is nil, detect both text-format and eol-format. If the
8099 text-format part of CODING-SYSTEM is already specified
8100 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
8101 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8102 detect only text-format. */
8103
d46c5b12 8104Lisp_Object
cf84bb53 8105detect_coding_system (const unsigned char *src,
d311d28c 8106 ptrdiff_t src_chars, ptrdiff_t src_bytes,
f10fe38f 8107 bool highest, bool multibytep,
cf84bb53 8108 Lisp_Object coding_system)
4ed46869 8109{
8f924df7 8110 const unsigned char *src_end = src + src_bytes;
df7492f9 8111 Lisp_Object attrs, eol_type;
4533845d 8112 Lisp_Object val = Qnil;
df7492f9 8113 struct coding_system coding;
d3411f89 8114 ptrdiff_t id;
ff0dacd7 8115 struct coding_detection_info detect_info;
24a73b0a 8116 enum coding_category base_category;
f10fe38f 8117 bool null_byte_found = 0, eight_bit_found = 0;
b73bfc1c 8118
df7492f9
KH
8119 if (NILP (coding_system))
8120 coding_system = Qundecided;
8121 setup_coding_system (coding_system, &coding);
8122 attrs = CODING_ID_ATTRS (coding.id);
8123 eol_type = CODING_ID_EOL_TYPE (coding.id);
89528eb3 8124 coding_system = CODING_ATTR_BASE_NAME (attrs);
4ed46869 8125
df7492f9 8126 coding.source = src;
24a73b0a 8127 coding.src_chars = src_chars;
df7492f9
KH
8128 coding.src_bytes = src_bytes;
8129 coding.src_multibyte = multibytep;
8130 coding.consumed = 0;
89528eb3 8131 coding.mode |= CODING_MODE_LAST_BLOCK;
c0e16b14 8132 coding.head_ascii = 0;
d46c5b12 8133
ff0dacd7 8134 detect_info.checked = detect_info.found = detect_info.rejected = 0;
d46c5b12 8135
89528eb3 8136 /* At first, detect text-format if necessary. */
24a73b0a
KH
8137 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8138 if (base_category == coding_category_undecided)
4ed46869 8139 {
c4a63b12
PE
8140 enum coding_category category IF_LINT (= 0);
8141 struct coding_system *this IF_LINT (= NULL);
ff0dacd7 8142 int c, i;
88993dfd 8143
24a73b0a 8144 /* Skip all ASCII bytes except for a few ISO2022 controls. */
2f3cbb32 8145 for (; src < src_end; src++)
4ed46869 8146 {
df7492f9 8147 c = *src;
6cb21a4f 8148 if (c & 0x80)
6cb21a4f 8149 {
2f3cbb32 8150 eight_bit_found = 1;
2f3cbb32
KH
8151 if (null_byte_found)
8152 break;
8153 }
c0e16b14 8154 else if (c < 0x20)
2f3cbb32
KH
8155 {
8156 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8157 && ! inhibit_iso_escape_detection
8158 && ! detect_info.checked)
6cb21a4f 8159 {
2f3cbb32
KH
8160 if (detect_coding_iso_2022 (&coding, &detect_info))
8161 {
8162 /* We have scanned the whole data. */
8163 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
c0e16b14
KH
8164 {
8165 /* We didn't find an 8-bit code. We may
8166 have found a null-byte, but it's very
8167 rare that a binary file conforms to
8168 ISO-2022. */
8169 src = src_end;
8170 coding.head_ascii = src - coding.source;
8171 }
8172 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
2f3cbb32
KH
8173 break;
8174 }
8175 }
97b1b294 8176 else if (! c && !inhibit_null_byte_detection)
2f3cbb32
KH
8177 {
8178 null_byte_found = 1;
8179 if (eight_bit_found)
8180 break;
6cb21a4f 8181 }
c006c0c8
KH
8182 if (! eight_bit_found)
8183 coding.head_ascii++;
6cb21a4f 8184 }
c006c0c8 8185 else if (! eight_bit_found)
c0e16b14 8186 coding.head_ascii++;
4ed46869 8187 }
88993dfd 8188
2f3cbb32
KH
8189 if (null_byte_found || eight_bit_found
8190 || coding.head_ascii < coding.src_bytes
6cb21a4f
KH
8191 || detect_info.found)
8192 {
2f3cbb32 8193 if (coding.head_ascii == coding.src_bytes)
6cb21a4f
KH
8194 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
8195 for (i = 0; i < coding_category_raw_text; i++)
ff0dacd7 8196 {
6cb21a4f 8197 category = coding_priorities[i];
c7266f4a 8198 this = coding_categories + category;
6cb21a4f 8199 if (detect_info.found & (1 << category))
ff0dacd7
KH
8200 break;
8201 }
6cb21a4f 8202 else
2f3cbb32
KH
8203 {
8204 if (null_byte_found)
8205 {
8206 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8207 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8208 }
8209 for (i = 0; i < coding_category_raw_text; i++)
8210 {
8211 category = coding_priorities[i];
8212 this = coding_categories + category;
6cb21a4f 8213
2f3cbb32
KH
8214 if (this->id < 0)
8215 {
8216 /* No coding system of this category is defined. */
8217 detect_info.rejected |= (1 << category);
8218 }
8219 else if (category >= coding_category_raw_text)
8220 continue;
8221 else if (detect_info.checked & (1 << category))
8222 {
8223 if (highest
8224 && (detect_info.found & (1 << category)))
6cb21a4f 8225 break;
2f3cbb32
KH
8226 }
8227 else if ((*(this->detector)) (&coding, &detect_info)
8228 && highest
8229 && (detect_info.found & (1 << category)))
8230 {
8231 if (category == coding_category_utf_16_auto)
8232 {
8233 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8234 category = coding_category_utf_16_le;
8235 else
8236 category = coding_category_utf_16_be;
8237 }
8238 break;
8239 }
8240 }
8241 }
6cb21a4f 8242 }
ec6d2bb8 8243
4cddb209
KH
8244 if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8245 || null_byte_found)
ec6d2bb8 8246 {
ff0dacd7 8247 detect_info.found = CATEGORY_MASK_RAW_TEXT;
4cddb209 8248 id = CODING_SYSTEM_ID (Qno_conversion);
89528eb3
KH
8249 val = Fcons (make_number (id), Qnil);
8250 }
ff0dacd7 8251 else if (! detect_info.rejected && ! detect_info.found)
89528eb3 8252 {
ff0dacd7 8253 detect_info.found = CATEGORY_MASK_ANY;
89528eb3
KH
8254 id = coding_categories[coding_category_undecided].id;
8255 val = Fcons (make_number (id), Qnil);
8256 }
8257 else if (highest)
8258 {
ff0dacd7 8259 if (detect_info.found)
ec6d2bb8 8260 {
ff0dacd7
KH
8261 detect_info.found = 1 << category;
8262 val = Fcons (make_number (this->id), Qnil);
8263 }
8264 else
8265 for (i = 0; i < coding_category_raw_text; i++)
8266 if (! (detect_info.rejected & (1 << coding_priorities[i])))
8267 {
8268 detect_info.found = 1 << coding_priorities[i];
8269 id = coding_categories[coding_priorities[i]].id;
8270 val = Fcons (make_number (id), Qnil);
8271 break;
8272 }
8273 }
89528eb3
KH
8274 else
8275 {
ff0dacd7
KH
8276 int mask = detect_info.rejected | detect_info.found;
8277 int found = 0;
ec6d2bb8 8278
89528eb3 8279 for (i = coding_category_raw_text - 1; i >= 0; i--)
ff0dacd7
KH
8280 {
8281 category = coding_priorities[i];
8282 if (! (mask & (1 << category)))
ec6d2bb8 8283 {
ff0dacd7
KH
8284 found |= 1 << category;
8285 id = coding_categories[category].id;
c7266f4a
KH
8286 if (id >= 0)
8287 val = Fcons (make_number (id), val);
ff0dacd7
KH
8288 }
8289 }
8290 for (i = coding_category_raw_text - 1; i >= 0; i--)
8291 {
8292 category = coding_priorities[i];
8293 if (detect_info.found & (1 << category))
8294 {
8295 id = coding_categories[category].id;
8296 val = Fcons (make_number (id), val);
ec6d2bb8 8297 }
ec6d2bb8 8298 }
ff0dacd7 8299 detect_info.found |= found;
ec6d2bb8 8300 }
ec6d2bb8 8301 }
a470d443
KH
8302 else if (base_category == coding_category_utf_8_auto)
8303 {
8304 if (detect_coding_utf_8 (&coding, &detect_info))
8305 {
8306 struct coding_system *this;
8307
8308 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8309 this = coding_categories + coding_category_utf_8_sig;
8310 else
8311 this = coding_categories + coding_category_utf_8_nosig;
8312 val = Fcons (make_number (this->id), Qnil);
8313 }
8314 }
24a73b0a
KH
8315 else if (base_category == coding_category_utf_16_auto)
8316 {
8317 if (detect_coding_utf_16 (&coding, &detect_info))
8318 {
24a73b0a
KH
8319 struct coding_system *this;
8320
8321 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8322 this = coding_categories + coding_category_utf_16_le;
8323 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8324 this = coding_categories + coding_category_utf_16_be;
8325 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8326 this = coding_categories + coding_category_utf_16_be_nosig;
8327 else
8328 this = coding_categories + coding_category_utf_16_le_nosig;
8329 val = Fcons (make_number (this->id), Qnil);
8330 }
8331 }
df7492f9
KH
8332 else
8333 {
ff0dacd7 8334 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
89528eb3 8335 val = Fcons (make_number (coding.id), Qnil);
4ed46869 8336 }
df7492f9 8337
89528eb3 8338 /* Then, detect eol-format if necessary. */
df7492f9 8339 {
4533845d 8340 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
df7492f9
KH
8341 Lisp_Object tail;
8342
89528eb3
KH
8343 if (VECTORP (eol_type))
8344 {
ff0dacd7 8345 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
2f3cbb32
KH
8346 {
8347 if (null_byte_found)
8348 normal_eol = EOL_SEEN_LF;
8349 else
8350 normal_eol = detect_eol (coding.source, src_bytes,
8351 coding_category_raw_text);
8352 }
ff0dacd7
KH
8353 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8354 | CATEGORY_MASK_UTF_16_BE_NOSIG))
89528eb3
KH
8355 utf_16_be_eol = detect_eol (coding.source, src_bytes,
8356 coding_category_utf_16_be);
ff0dacd7
KH
8357 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8358 | CATEGORY_MASK_UTF_16_LE_NOSIG))
89528eb3
KH
8359 utf_16_le_eol = detect_eol (coding.source, src_bytes,
8360 coding_category_utf_16_le);
8361 }
8362 else
8363 {
8364 if (EQ (eol_type, Qunix))
8365 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8366 else if (EQ (eol_type, Qdos))
8367 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8368 else
8369 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8370 }
8371
df7492f9
KH
8372 for (tail = val; CONSP (tail); tail = XCDR (tail))
8373 {
89528eb3 8374 enum coding_category category;
df7492f9 8375 int this_eol;
89528eb3
KH
8376
8377 id = XINT (XCAR (tail));
8378 attrs = CODING_ID_ATTRS (id);
8379 category = XINT (CODING_ATTR_CATEGORY (attrs));
8380 eol_type = CODING_ID_EOL_TYPE (id);
df7492f9
KH
8381 if (VECTORP (eol_type))
8382 {
89528eb3
KH
8383 if (category == coding_category_utf_16_be
8384 || category == coding_category_utf_16_be_nosig)
8385 this_eol = utf_16_be_eol;
8386 else if (category == coding_category_utf_16_le
8387 || category == coding_category_utf_16_le_nosig)
8388 this_eol = utf_16_le_eol;
df7492f9 8389 else
89528eb3
KH
8390 this_eol = normal_eol;
8391
df7492f9
KH
8392 if (this_eol == EOL_SEEN_LF)
8393 XSETCAR (tail, AREF (eol_type, 0));
8394 else if (this_eol == EOL_SEEN_CRLF)
8395 XSETCAR (tail, AREF (eol_type, 1));
8396 else if (this_eol == EOL_SEEN_CR)
8397 XSETCAR (tail, AREF (eol_type, 2));
89528eb3
KH
8398 else
8399 XSETCAR (tail, CODING_ID_NAME (id));
df7492f9 8400 }
89528eb3
KH
8401 else
8402 XSETCAR (tail, CODING_ID_NAME (id));
df7492f9
KH
8403 }
8404 }
ec6d2bb8 8405
4533845d 8406 return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
ec6d2bb8
KH
8407}
8408
ec6d2bb8 8409
d46c5b12
KH
8410DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8411 2, 3, 0,
48b0f3ae
PJ
8412 doc: /* Detect coding system of the text in the region between START and END.
8413Return a list of possible coding systems ordered by priority.
b811c52b
KH
8414The coding systems to try and their priorities follows what
8415the function `coding-system-priority-list' (which see) returns.
ec6d2bb8 8416
12e0131a 8417If only ASCII characters are found (except for such ISO-2022 control
d4a1d553
JB
8418characters as ESC), it returns a list of single element `undecided'
8419or its subsidiary coding system according to a detected end-of-line
8420format.
ec6d2bb8 8421
48b0f3ae
PJ
8422If optional argument HIGHEST is non-nil, return the coding system of
8423highest priority. */)
5842a27b 8424 (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
d46c5b12 8425{
d311d28c
PE
8426 ptrdiff_t from, to;
8427 ptrdiff_t from_byte, to_byte;
ec6d2bb8 8428
b7826503
PJ
8429 CHECK_NUMBER_COERCE_MARKER (start);
8430 CHECK_NUMBER_COERCE_MARKER (end);
ec6d2bb8 8431
d46c5b12
KH
8432 validate_region (&start, &end);
8433 from = XINT (start), to = XINT (end);
8434 from_byte = CHAR_TO_BYTE (from);
8435 to_byte = CHAR_TO_BYTE (to);
ec6d2bb8 8436
d46c5b12
KH
8437 if (from < GPT && to >= GPT)
8438 move_gap_both (to, to_byte);
c210f766 8439
d46c5b12 8440 return detect_coding_system (BYTE_POS_ADDR (from_byte),
24a73b0a 8441 to - from, to_byte - from_byte,
0a28aafb 8442 !NILP (highest),
4b4deea2 8443 !NILP (BVAR (current_buffer
5d8ea120 8444 , enable_multibyte_characters)),
df7492f9 8445 Qnil);
ec6d2bb8
KH
8446}
8447
d46c5b12
KH
8448DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8449 1, 2, 0,
48b0f3ae
PJ
8450 doc: /* Detect coding system of the text in STRING.
8451Return a list of possible coding systems ordered by priority.
67ceab9d
KH
8452The coding systems to try and their priorities follows what
8453the function `coding-system-priority-list' (which see) returns.
fb88bf2d 8454
12e0131a 8455If only ASCII characters are found (except for such ISO-2022 control
d4a1d553
JB
8456characters as ESC), it returns a list of single element `undecided'
8457or its subsidiary coding system according to a detected end-of-line
8458format.
d46c5b12 8459
48b0f3ae
PJ
8460If optional argument HIGHEST is non-nil, return the coding system of
8461highest priority. */)
5842a27b 8462 (Lisp_Object string, Lisp_Object highest)
d46c5b12 8463{
b7826503 8464 CHECK_STRING (string);
b73bfc1c 8465
24a73b0a
KH
8466 return detect_coding_system (SDATA (string),
8467 SCHARS (string), SBYTES (string),
8f924df7 8468 !NILP (highest), STRING_MULTIBYTE (string),
df7492f9 8469 Qnil);
4ed46869 8470}
4ed46869 8471
b73bfc1c 8472
b0ab8123 8473static bool
971de7fb 8474char_encodable_p (int c, Lisp_Object attrs)
05e6f5dc 8475{
df7492f9 8476 Lisp_Object tail;
df7492f9 8477 struct charset *charset;
7d64c6ad 8478 Lisp_Object translation_table;
d46c5b12 8479
7d64c6ad 8480 translation_table = CODING_ATTR_TRANS_TBL (attrs);
a6f87d34 8481 if (! NILP (translation_table))
7d64c6ad 8482 c = translate_char (translation_table, c);
df7492f9
KH
8483 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8484 CONSP (tail); tail = XCDR (tail))
e133c8fa 8485 {
df7492f9
KH
8486 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8487 if (CHAR_CHARSET_P (c, charset))
8488 break;
e133c8fa 8489 }
df7492f9 8490 return (! NILP (tail));
05e6f5dc 8491}
83fa074f 8492
fb88bf2d 8493
df7492f9
KH
8494/* Return a list of coding systems that safely encode the text between
8495 START and END. If EXCLUDE is non-nil, it is a list of coding
8496 systems not to check. The returned list doesn't contain any such
48468dac 8497 coding systems. In any case, if the text contains only ASCII or is
df7492f9 8498 unibyte, return t. */
e077cc80 8499
df7492f9
KH
8500DEFUN ("find-coding-systems-region-internal",
8501 Ffind_coding_systems_region_internal,
8502 Sfind_coding_systems_region_internal, 2, 3, 0,
8503 doc: /* Internal use only. */)
5842a27b 8504 (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
df7492f9
KH
8505{
8506 Lisp_Object coding_attrs_list, safe_codings;
d311d28c 8507 ptrdiff_t start_byte, end_byte;
7c78e542 8508 const unsigned char *p, *pbeg, *pend;
df7492f9 8509 int c;
0e727afa 8510 Lisp_Object tail, elt, work_table;
d46c5b12 8511
df7492f9
KH
8512 if (STRINGP (start))
8513 {
8514 if (!STRING_MULTIBYTE (start)
8f924df7 8515 || SCHARS (start) == SBYTES (start))
df7492f9
KH
8516 return Qt;
8517 start_byte = 0;
8f924df7 8518 end_byte = SBYTES (start);
df7492f9
KH
8519 }
8520 else
d46c5b12 8521 {
df7492f9
KH
8522 CHECK_NUMBER_COERCE_MARKER (start);
8523 CHECK_NUMBER_COERCE_MARKER (end);
8524 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8525 args_out_of_range (start, end);
4b4deea2 8526 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
df7492f9
KH
8527 return Qt;
8528 start_byte = CHAR_TO_BYTE (XINT (start));
8529 end_byte = CHAR_TO_BYTE (XINT (end));
8530 if (XINT (end) - XINT (start) == end_byte - start_byte)
8531 return Qt;
d46c5b12 8532
e1c23804 8533 if (XINT (start) < GPT && XINT (end) > GPT)
d46c5b12 8534 {
e1c23804
DL
8535 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8536 move_gap_both (XINT (start), start_byte);
df7492f9 8537 else
e1c23804 8538 move_gap_both (XINT (end), end_byte);
d46c5b12
KH
8539 }
8540 }
8541
df7492f9
KH
8542 coding_attrs_list = Qnil;
8543 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8544 if (NILP (exclude)
8545 || NILP (Fmemq (XCAR (tail), exclude)))
8546 {
8547 Lisp_Object attrs;
d46c5b12 8548
df7492f9
KH
8549 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8550 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8551 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7d64c6ad
KH
8552 {
8553 ASET (attrs, coding_attr_trans_tbl,
2170c8f0 8554 get_translation_table (attrs, 1, NULL));
7d64c6ad
KH
8555 coding_attrs_list = Fcons (attrs, coding_attrs_list);
8556 }
df7492f9 8557 }
d46c5b12 8558
df7492f9 8559 if (STRINGP (start))
8f924df7 8560 p = pbeg = SDATA (start);
df7492f9
KH
8561 else
8562 p = pbeg = BYTE_POS_ADDR (start_byte);
8563 pend = p + (end_byte - start_byte);
b843d1ae 8564
df7492f9
KH
8565 while (p < pend && ASCII_BYTE_P (*p)) p++;
8566 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
d46c5b12 8567
0e727afa 8568 work_table = Fmake_char_table (Qnil, Qnil);
05e6f5dc 8569 while (p < pend)
72d1a715 8570 {
df7492f9
KH
8571 if (ASCII_BYTE_P (*p))
8572 p++;
72d1a715
RS
8573 else
8574 {
df7492f9 8575 c = STRING_CHAR_ADVANCE (p);
0e727afa
YM
8576 if (!NILP (char_table_ref (work_table, c)))
8577 /* This character was already checked. Ignore it. */
8578 continue;
12410ef1 8579
df7492f9
KH
8580 charset_map_loaded = 0;
8581 for (tail = coding_attrs_list; CONSP (tail);)
8582 {
8583 elt = XCAR (tail);
8584 if (NILP (elt))
8585 tail = XCDR (tail);
8586 else if (char_encodable_p (c, elt))
8587 tail = XCDR (tail);
8588 else if (CONSP (XCDR (tail)))
8589 {
8590 XSETCAR (tail, XCAR (XCDR (tail)));
8591 XSETCDR (tail, XCDR (XCDR (tail)));
8592 }
8593 else
8594 {
8595 XSETCAR (tail, Qnil);
8596 tail = XCDR (tail);
8597 }
8598 }
8599 if (charset_map_loaded)
8600 {
d311d28c 8601 ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
05e6f5dc 8602
df7492f9 8603 if (STRINGP (start))
8f924df7 8604 pbeg = SDATA (start);
df7492f9
KH
8605 else
8606 pbeg = BYTE_POS_ADDR (start_byte);
8607 p = pbeg + p_offset;
8608 pend = pbeg + pend_offset;
8609 }
0e727afa 8610 char_table_set (work_table, c, Qt);
df7492f9 8611 }
ec6d2bb8 8612 }
fb88bf2d 8613
988b3759 8614 safe_codings = list2 (Qraw_text, Qno_conversion);
df7492f9
KH
8615 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8616 if (! NILP (XCAR (tail)))
8617 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
ec6d2bb8 8618
05e6f5dc
KH
8619 return safe_codings;
8620}
4956c225 8621
d46c5b12 8622
8f924df7
KH
8623DEFUN ("unencodable-char-position", Funencodable_char_position,
8624 Sunencodable_char_position, 3, 5, 0,
8625 doc: /*
8626Return position of first un-encodable character in a region.
d4a1d553 8627START and END specify the region and CODING-SYSTEM specifies the
8f924df7 8628encoding to check. Return nil if CODING-SYSTEM does encode the region.
d46c5b12 8629
8f924df7
KH
8630If optional 4th argument COUNT is non-nil, it specifies at most how
8631many un-encodable characters to search. In this case, the value is a
8632list of positions.
d46c5b12 8633
8f924df7
KH
8634If optional 5th argument STRING is non-nil, it is a string to search
8635for un-encodable characters. In that case, START and END are indexes
8636to the string. */)
5842a27b 8637 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8f924df7 8638{
d311d28c 8639 EMACS_INT n;
8f924df7 8640 struct coding_system coding;
7d64c6ad 8641 Lisp_Object attrs, charset_list, translation_table;
8f924df7 8642 Lisp_Object positions;
d311d28c 8643 ptrdiff_t from, to;
8f924df7 8644 const unsigned char *p, *stop, *pend;
f10fe38f 8645 bool ascii_compatible;
fb88bf2d 8646
8f924df7
KH
8647 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8648 attrs = CODING_ID_ATTRS (coding.id);
8649 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8650 return Qnil;
8651 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8652 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2170c8f0 8653 translation_table = get_translation_table (attrs, 1, NULL);
fb88bf2d 8654
8f924df7
KH
8655 if (NILP (string))
8656 {
8657 validate_region (&start, &end);
8658 from = XINT (start);
8659 to = XINT (end);
4b4deea2 8660 if (NILP (BVAR (current_buffer, enable_multibyte_characters))
8f924df7
KH
8661 || (ascii_compatible
8662 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8663 return Qnil;
8664 p = CHAR_POS_ADDR (from);
8665 pend = CHAR_POS_ADDR (to);
8666 if (from < GPT && to >= GPT)
8667 stop = GPT_ADDR;
8668 else
8669 stop = pend;
8670 }
8671 else
8672 {
8673 CHECK_STRING (string);
8674 CHECK_NATNUM (start);
8675 CHECK_NATNUM (end);
d311d28c
PE
8676 if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
8677 args_out_of_range_3 (string, start, end);
8f924df7
KH
8678 from = XINT (start);
8679 to = XINT (end);
8f924df7
KH
8680 if (! STRING_MULTIBYTE (string))
8681 return Qnil;
8682 p = SDATA (string) + string_char_to_byte (string, from);
8683 stop = pend = SDATA (string) + string_char_to_byte (string, to);
8684 if (ascii_compatible && (to - from) == (pend - p))
8685 return Qnil;
8686 }
f2558efd 8687
8f924df7
KH
8688 if (NILP (count))
8689 n = 1;
8690 else
b73bfc1c 8691 {
8f924df7
KH
8692 CHECK_NATNUM (count);
8693 n = XINT (count);
b73bfc1c
KH
8694 }
8695
8f924df7 8696 positions = Qnil;
3633e3aa 8697 charset_map_loaded = 0;
8f924df7 8698 while (1)
d46c5b12 8699 {
8f924df7 8700 int c;
ec6d2bb8 8701
8f924df7
KH
8702 if (ascii_compatible)
8703 while (p < stop && ASCII_BYTE_P (*p))
8704 p++, from++;
8705 if (p >= stop)
0e79d667 8706 {
8f924df7
KH
8707 if (p >= pend)
8708 break;
8709 stop = pend;
8710 p = GAP_END_ADDR;
0e79d667 8711 }
ec6d2bb8 8712
8f924df7
KH
8713 c = STRING_CHAR_ADVANCE (p);
8714 if (! (ASCII_CHAR_P (c) && ascii_compatible)
7d64c6ad
KH
8715 && ! char_charset (translate_char (translation_table, c),
8716 charset_list, NULL))
ec6d2bb8 8717 {
8f924df7
KH
8718 positions = Fcons (make_number (from), positions);
8719 n--;
8720 if (n == 0)
8721 break;
ec6d2bb8
KH
8722 }
8723
8f924df7 8724 from++;
3633e3aa
KH
8725 if (charset_map_loaded && NILP (string))
8726 {
8727 p = CHAR_POS_ADDR (from);
8728 pend = CHAR_POS_ADDR (to);
8729 if (from < GPT && to >= GPT)
8730 stop = GPT_ADDR;
8731 else
8732 stop = pend;
8733 charset_map_loaded = 0;
8734 }
8f924df7 8735 }
d46c5b12 8736
8f924df7
KH
8737 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8738}
d46c5b12 8739
d46c5b12 8740
df7492f9
KH
8741DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8742 Scheck_coding_systems_region, 3, 3, 0,
8743 doc: /* Check if the region is encodable by coding systems.
d46c5b12 8744
df7492f9
KH
8745START and END are buffer positions specifying the region.
8746CODING-SYSTEM-LIST is a list of coding systems to check.
d46c5b12 8747
df7492f9 8748The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
d4a1d553 8749CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
df7492f9
KH
8750whole region, POS0, POS1, ... are buffer positions where non-encodable
8751characters are found.
93dec019 8752
df7492f9
KH
8753If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8754value is nil.
93dec019 8755
df7492f9
KH
8756START may be a string. In that case, check if the string is
8757encodable, and the value contains indices to the string instead of
5704f39a
KH
8758buffer positions. END is ignored.
8759
4c1958f4 8760If the current buffer (or START if it is a string) is unibyte, the value
5704f39a 8761is nil. */)
5842a27b 8762 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
05e6f5dc 8763{
df7492f9 8764 Lisp_Object list;
d311d28c
PE
8765 ptrdiff_t start_byte, end_byte;
8766 ptrdiff_t pos;
7c78e542 8767 const unsigned char *p, *pbeg, *pend;
df7492f9 8768 int c;
7d64c6ad 8769 Lisp_Object tail, elt, attrs;
70ad9fc4 8770
05e6f5dc
KH
8771 if (STRINGP (start))
8772 {
df7492f9 8773 if (!STRING_MULTIBYTE (start)
4c1958f4 8774 || SCHARS (start) == SBYTES (start))
df7492f9
KH
8775 return Qnil;
8776 start_byte = 0;
8f924df7 8777 end_byte = SBYTES (start);
df7492f9 8778 pos = 0;
d46c5b12 8779 }
05e6f5dc 8780 else
b73bfc1c 8781 {
b7826503
PJ
8782 CHECK_NUMBER_COERCE_MARKER (start);
8783 CHECK_NUMBER_COERCE_MARKER (end);
05e6f5dc
KH
8784 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8785 args_out_of_range (start, end);
4b4deea2 8786 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
df7492f9
KH
8787 return Qnil;
8788 start_byte = CHAR_TO_BYTE (XINT (start));
8789 end_byte = CHAR_TO_BYTE (XINT (end));
8790 if (XINT (end) - XINT (start) == end_byte - start_byte)
5704f39a 8791 return Qnil;
df7492f9 8792
e1c23804 8793 if (XINT (start) < GPT && XINT (end) > GPT)
b73bfc1c 8794 {
e1c23804
DL
8795 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8796 move_gap_both (XINT (start), start_byte);
df7492f9 8797 else
e1c23804 8798 move_gap_both (XINT (end), end_byte);
b73bfc1c 8799 }
e1c23804 8800 pos = XINT (start);
b73bfc1c 8801 }
7553d0e1 8802
df7492f9
KH
8803 list = Qnil;
8804 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
12410ef1 8805 {
df7492f9 8806 elt = XCAR (tail);
7d64c6ad 8807 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
2170c8f0
KH
8808 ASET (attrs, coding_attr_trans_tbl,
8809 get_translation_table (attrs, 1, NULL));
7d64c6ad 8810 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
12410ef1
KH
8811 }
8812
df7492f9 8813 if (STRINGP (start))
8f924df7 8814 p = pbeg = SDATA (start);
72d1a715 8815 else
df7492f9
KH
8816 p = pbeg = BYTE_POS_ADDR (start_byte);
8817 pend = p + (end_byte - start_byte);
4ed46869 8818
df7492f9
KH
8819 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8820 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
ec6d2bb8 8821
df7492f9 8822 while (p < pend)
d46c5b12 8823 {
df7492f9
KH
8824 if (ASCII_BYTE_P (*p))
8825 p++;
e133c8fa 8826 else
05e6f5dc 8827 {
df7492f9
KH
8828 c = STRING_CHAR_ADVANCE (p);
8829
8830 charset_map_loaded = 0;
8831 for (tail = list; CONSP (tail); tail = XCDR (tail))
8832 {
8833 elt = XCDR (XCAR (tail));
8834 if (! char_encodable_p (c, XCAR (elt)))
8835 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8836 }
8837 if (charset_map_loaded)
8838 {
d311d28c 8839 ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
df7492f9
KH
8840
8841 if (STRINGP (start))
8f924df7 8842 pbeg = SDATA (start);
df7492f9
KH
8843 else
8844 pbeg = BYTE_POS_ADDR (start_byte);
8845 p = pbeg + p_offset;
8846 pend = pbeg + pend_offset;
8847 }
05e6f5dc 8848 }
df7492f9 8849 pos++;
d46c5b12 8850 }
4ed46869 8851
df7492f9
KH
8852 tail = list;
8853 list = Qnil;
8854 for (; CONSP (tail); tail = XCDR (tail))
ec6d2bb8 8855 {
df7492f9
KH
8856 elt = XCAR (tail);
8857 if (CONSP (XCDR (XCDR (elt))))
8858 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8859 list);
ec6d2bb8 8860 }
2b4f9037 8861
df7492f9 8862 return list;
d46c5b12
KH
8863}
8864
3fd9494b 8865
74ab6df5 8866static Lisp_Object
cf84bb53
JB
8867code_convert_region (Lisp_Object start, Lisp_Object end,
8868 Lisp_Object coding_system, Lisp_Object dst_object,
f10fe38f 8869 bool encodep, bool norecord)
4ed46869 8870{
3a73fa5d 8871 struct coding_system coding;
d311d28c 8872 ptrdiff_t from, from_byte, to, to_byte;
df7492f9 8873 Lisp_Object src_object;
4ed46869 8874
b7826503
PJ
8875 CHECK_NUMBER_COERCE_MARKER (start);
8876 CHECK_NUMBER_COERCE_MARKER (end);
df7492f9
KH
8877 if (NILP (coding_system))
8878 coding_system = Qno_conversion;
8879 else
8880 CHECK_CODING_SYSTEM (coding_system);
8881 src_object = Fcurrent_buffer ();
8882 if (NILP (dst_object))
8883 dst_object = src_object;
8884 else if (! EQ (dst_object, Qt))
8885 CHECK_BUFFER (dst_object);
3a73fa5d 8886
d46c5b12
KH
8887 validate_region (&start, &end);
8888 from = XFASTINT (start);
df7492f9 8889 from_byte = CHAR_TO_BYTE (from);
d46c5b12 8890 to = XFASTINT (end);
df7492f9 8891 to_byte = CHAR_TO_BYTE (to);
764ca8da 8892
df7492f9
KH
8893 setup_coding_system (coding_system, &coding);
8894 coding.mode |= CODING_MODE_LAST_BLOCK;
ec6d2bb8 8895
df7492f9
KH
8896 if (encodep)
8897 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8898 dst_object);
8899 else
8900 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8901 dst_object);
8902 if (! norecord)
8903 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
ec6d2bb8 8904
df7492f9
KH
8905 return (BUFFERP (dst_object)
8906 ? make_number (coding.produced_char)
8907 : coding.dst_object);
4031e2bf 8908}
78108bcd 8909
4ed46869 8910
4031e2bf 8911DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
df7492f9 8912 3, 4, "r\nzCoding system: ",
48b0f3ae 8913 doc: /* Decode the current region from the specified coding system.
df7492f9
KH
8914When called from a program, takes four arguments:
8915 START, END, CODING-SYSTEM, and DESTINATION.
8916START and END are buffer positions.
8844fa83 8917
df7492f9 8918Optional 4th arguments DESTINATION specifies where the decoded text goes.
2354b80b 8919If nil, the region between START and END is replaced by the decoded text.
1560f91a
EZ
8920If buffer, the decoded text is inserted in that buffer after point (point
8921does not move).
446dcd75 8922In those cases, the length of the decoded text is returned.
319a3947 8923If DESTINATION is t, the decoded text is returned.
8844fa83 8924
48b0f3ae
PJ
8925This function sets `last-coding-system-used' to the precise coding system
8926used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
319a3947 8927not fully specified.) */)
5842a27b 8928 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
4031e2bf 8929{
df7492f9 8930 return code_convert_region (start, end, coding_system, destination, 0, 0);
3a73fa5d 8931}
8844fa83 8932
3a73fa5d 8933DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
df7492f9
KH
8934 3, 4, "r\nzCoding system: ",
8935 doc: /* Encode the current region by specified coding system.
d4a1d553
JB
8936When called from a program, takes four arguments:
8937 START, END, CODING-SYSTEM and DESTINATION.
8938START and END are buffer positions.
d46c5b12 8939
df7492f9
KH
8940Optional 4th arguments DESTINATION specifies where the encoded text goes.
8941If nil, the region between START and END is replace by the encoded text.
1560f91a
EZ
8942If buffer, the encoded text is inserted in that buffer after point (point
8943does not move).
446dcd75 8944In those cases, the length of the encoded text is returned.
319a3947 8945If DESTINATION is t, the encoded text is returned.
2391eaa4 8946
48b0f3ae
PJ
8947This function sets `last-coding-system-used' to the precise coding system
8948used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
319a3947 8949not fully specified.) */)
5842a27b 8950 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
3a73fa5d 8951{
df7492f9 8952 return code_convert_region (start, end, coding_system, destination, 1, 0);
b73bfc1c
KH
8953}
8954
8955Lisp_Object
6f704c76 8956code_convert_string (Lisp_Object string, Lisp_Object coding_system,
f10fe38f
PE
8957 Lisp_Object dst_object, bool encodep, bool nocopy,
8958 bool norecord)
b73bfc1c 8959{
4031e2bf 8960 struct coding_system coding;
d311d28c 8961 ptrdiff_t chars, bytes;
ec6d2bb8 8962
b7826503 8963 CHECK_STRING (string);
d46c5b12 8964 if (NILP (coding_system))
4956c225 8965 {
df7492f9
KH
8966 if (! norecord)
8967 Vlast_coding_system_used = Qno_conversion;
8968 if (NILP (dst_object))
8969 return (nocopy ? Fcopy_sequence (string) : string);
4956c225 8970 }
b73bfc1c 8971
df7492f9
KH
8972 if (NILP (coding_system))
8973 coding_system = Qno_conversion;
8974 else
8975 CHECK_CODING_SYSTEM (coding_system);
8976 if (NILP (dst_object))
8977 dst_object = Qt;
8978 else if (! EQ (dst_object, Qt))
8979 CHECK_BUFFER (dst_object);
73be902c 8980
df7492f9 8981 setup_coding_system (coding_system, &coding);
d46c5b12 8982 coding.mode |= CODING_MODE_LAST_BLOCK;
8f924df7
KH
8983 chars = SCHARS (string);
8984 bytes = SBYTES (string);
df7492f9
KH
8985 if (encodep)
8986 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8987 else
8988 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8989 if (! norecord)
8990 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
73be902c 8991
df7492f9
KH
8992 return (BUFFERP (dst_object)
8993 ? make_number (coding.produced_char)
8994 : coding.dst_object);
4ed46869 8995}
73be902c 8996
b73bfc1c 8997
ecec61c1 8998/* Encode or decode STRING according to CODING_SYSTEM.
ec6d2bb8 8999 Do not set Vlast_coding_system_used.
4ed46869 9000
ec6d2bb8
KH
9001 This function is called only from macros DECODE_FILE and
9002 ENCODE_FILE, thus we ignore character composition. */
4ed46869 9003
ecec61c1 9004Lisp_Object
cf84bb53 9005code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
f10fe38f 9006 bool encodep)
4ed46869 9007{
0be8721c 9008 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
4ed46869
KH
9009}
9010
4ed46869 9011
a7ca3326 9012DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
df7492f9
KH
9013 2, 4, 0,
9014 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9015
9016Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9017if the decoding operation is trivial.
ecec61c1 9018
d4a1d553 9019Optional fourth arg BUFFER non-nil means that the decoded text is
1560f91a
EZ
9020inserted in that buffer after point (point does not move). In this
9021case, the return value is the length of the decoded text.
ecec61c1 9022
df7492f9
KH
9023This function sets `last-coding-system-used' to the precise coding system
9024used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
d4a1d553 9025not fully specified.) */)
5842a27b 9026 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
4ed46869 9027{
df7492f9
KH
9028 return code_convert_string (string, coding_system, buffer,
9029 0, ! NILP (nocopy), 0);
4ed46869
KH
9030}
9031
df7492f9
KH
9032DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9033 2, 4, 0,
9034 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9035
9036Optional third arg NOCOPY non-nil means it is OK to return STRING
9037itself if the encoding operation is trivial.
9038
d4a1d553 9039Optional fourth arg BUFFER non-nil means that the encoded text is
1560f91a
EZ
9040inserted in that buffer after point (point does not move). In this
9041case, the return value is the length of the encoded text.
df7492f9
KH
9042
9043This function sets `last-coding-system-used' to the precise coding system
9044used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9045not fully specified.) */)
5842a27b 9046 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
4ed46869 9047{
df7492f9 9048 return code_convert_string (string, coding_system, buffer,
4550efdf 9049 1, ! NILP (nocopy), 0);
4ed46869 9050}
df7492f9 9051
3a73fa5d 9052\f
4ed46869 9053DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
48b0f3ae
PJ
9054 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9055Return the corresponding character. */)
5842a27b 9056 (Lisp_Object code)
4ed46869 9057{
df7492f9
KH
9058 Lisp_Object spec, attrs, val;
9059 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
5fdb398c
PE
9060 EMACS_INT ch;
9061 int c;
4ed46869 9062
df7492f9 9063 CHECK_NATNUM (code);
5fdb398c 9064 ch = XFASTINT (code);
df7492f9
KH
9065 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9066 attrs = AREF (spec, 0);
4ed46869 9067
5fdb398c 9068 if (ASCII_BYTE_P (ch)
df7492f9
KH
9069 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9070 return code;
4ed46869 9071
df7492f9
KH
9072 val = CODING_ATTR_CHARSET_LIST (attrs);
9073 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
004068e4
KH
9074 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9075 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
fa42c37f 9076
5fdb398c
PE
9077 if (ch <= 0x7F)
9078 {
9079 c = ch;
9080 charset = charset_roman;
9081 }
9082 else if (ch >= 0xA0 && ch < 0xDF)
55ab7be3 9083 {
5fdb398c 9084 c = ch - 0x80;
df7492f9 9085 charset = charset_kana;
4ed46869 9086 }
55ab7be3 9087 else
4ed46869 9088 {
5fdb398c
PE
9089 EMACS_INT c1 = ch >> 8;
9090 int c2 = ch & 0xFF;
df7492f9 9091
2735d060
PE
9092 if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9093 || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
c2982e87 9094 error ("Invalid code: %"pI"d", ch);
5fdb398c 9095 c = ch;
df7492f9
KH
9096 SJIS_TO_JIS (c);
9097 charset = charset_kanji;
4ed46869 9098 }
df7492f9
KH
9099 c = DECODE_CHAR (charset, c);
9100 if (c < 0)
c2982e87 9101 error ("Invalid code: %"pI"d", ch);
df7492f9 9102 return make_number (c);
93dec019 9103}
4ed46869 9104
48b0f3ae 9105
4ed46869 9106DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
8acf0c0e 9107 doc: /* Encode a Japanese character CH to shift_jis encoding.
48b0f3ae 9108Return the corresponding code in SJIS. */)
5842a27b 9109 (Lisp_Object ch)
4ed46869 9110{
df7492f9
KH
9111 Lisp_Object spec, attrs, charset_list;
9112 int c;
9113 struct charset *charset;
9114 unsigned code;
48b0f3ae 9115
df7492f9
KH
9116 CHECK_CHARACTER (ch);
9117 c = XFASTINT (ch);
9118 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9119 attrs = AREF (spec, 0);
9120
9121 if (ASCII_CHAR_P (c)
9122 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9123 return ch;
9124
9125 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9126 charset = char_charset (c, charset_list, &code);
9127 if (code == CHARSET_INVALID_CODE (charset))
e6c3da20 9128 error ("Can't encode by shift_jis encoding: %c", c);
df7492f9
KH
9129 JIS_TO_SJIS (code);
9130
9131 return make_number (code);
4ed46869
KH
9132}
9133
9134DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
48b0f3ae
PJ
9135 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9136Return the corresponding character. */)
5842a27b 9137 (Lisp_Object code)
d46c5b12 9138{
df7492f9
KH
9139 Lisp_Object spec, attrs, val;
9140 struct charset *charset_roman, *charset_big5, *charset;
5fdb398c 9141 EMACS_INT ch;
df7492f9 9142 int c;
6289dd10 9143
df7492f9 9144 CHECK_NATNUM (code);
5fdb398c 9145 ch = XFASTINT (code);
df7492f9
KH
9146 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9147 attrs = AREF (spec, 0);
4ed46869 9148
5fdb398c 9149 if (ASCII_BYTE_P (ch)
df7492f9
KH
9150 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9151 return code;
6289dd10 9152
df7492f9
KH
9153 val = CODING_ATTR_CHARSET_LIST (attrs);
9154 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9155 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
c210f766 9156
5fdb398c
PE
9157 if (ch <= 0x7F)
9158 {
9159 c = ch;
9160 charset = charset_roman;
9161 }
c28a9453
KH
9162 else
9163 {
5fdb398c
PE
9164 EMACS_INT b1 = ch >> 8;
9165 int b2 = ch & 0x7F;
df7492f9
KH
9166 if (b1 < 0xA1 || b1 > 0xFE
9167 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
c2982e87 9168 error ("Invalid code: %"pI"d", ch);
5fdb398c 9169 c = ch;
df7492f9 9170 charset = charset_big5;
c28a9453 9171 }
5fdb398c 9172 c = DECODE_CHAR (charset, c);
df7492f9 9173 if (c < 0)
c2982e87 9174 error ("Invalid code: %"pI"d", ch);
df7492f9 9175 return make_number (c);
d46c5b12 9176}
6289dd10 9177
4ed46869 9178DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
8acf0c0e 9179 doc: /* Encode the Big5 character CH to BIG5 coding system.
48b0f3ae 9180Return the corresponding character code in Big5. */)
5842a27b 9181 (Lisp_Object ch)
4ed46869 9182{
df7492f9
KH
9183 Lisp_Object spec, attrs, charset_list;
9184 struct charset *charset;
9185 int c;
9186 unsigned code;
9187
9188 CHECK_CHARACTER (ch);
9189 c = XFASTINT (ch);
9190 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9191 attrs = AREF (spec, 0);
9192 if (ASCII_CHAR_P (c)
9193 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9194 return ch;
9195
9196 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9197 charset = char_charset (c, charset_list, &code);
9198 if (code == CHARSET_INVALID_CODE (charset))
e6c3da20 9199 error ("Can't encode by Big5 encoding: %c", c);
df7492f9
KH
9200
9201 return make_number (code);
4ed46869 9202}
48b0f3ae 9203
3a73fa5d 9204\f
002fdb44 9205DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
68bba4e4 9206 Sset_terminal_coding_system_internal, 1, 2, 0,
48b0f3ae 9207 doc: /* Internal use only. */)
5842a27b 9208 (Lisp_Object coding_system, Lisp_Object terminal)
4ed46869 9209{
b18fad6d
KH
9210 struct terminal *term = get_terminal (terminal, 1);
9211 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
b7826503 9212 CHECK_SYMBOL (coding_system);
b8299c66 9213 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
70c22245 9214 /* We had better not send unsafe characters to terminal. */
c73bd236 9215 terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
ad1746f5 9216 /* Character composition should be disabled. */
c73bd236 9217 terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
b8299c66
KL
9218 terminal_coding->src_multibyte = 1;
9219 terminal_coding->dst_multibyte = 0;
3f22b86f
PE
9220 tset_charset_list
9221 (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9222 ? coding_charset_list (terminal_coding)
9223 : Fcons (make_number (charset_ascii), Qnil)));
4ed46869
KH
9224 return Qnil;
9225}
9226
c4825358
KH
9227DEFUN ("set-safe-terminal-coding-system-internal",
9228 Fset_safe_terminal_coding_system_internal,
48b0f3ae 9229 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
ddb67bdc 9230 doc: /* Internal use only. */)
5842a27b 9231 (Lisp_Object coding_system)
d46c5b12 9232{
b7826503 9233 CHECK_SYMBOL (coding_system);
c4825358
KH
9234 setup_coding_system (Fcheck_coding_system (coding_system),
9235 &safe_terminal_coding);
ad1746f5 9236 /* Character composition should be disabled. */
df7492f9 9237 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
b73bfc1c
KH
9238 safe_terminal_coding.src_multibyte = 1;
9239 safe_terminal_coding.dst_multibyte = 0;
c4825358
KH
9240 return Qnil;
9241}
4ed46869 9242
002fdb44 9243DEFUN ("terminal-coding-system", Fterminal_coding_system,
68bba4e4 9244 Sterminal_coding_system, 0, 1, 0,
6ed8eeff 9245 doc: /* Return coding system specified for terminal output on the given terminal.
708e05dc 9246TERMINAL may be a terminal object, a frame, or nil for the selected
6ed8eeff 9247frame's terminal device. */)
5842a27b 9248 (Lisp_Object terminal)
4ed46869 9249{
985773c9
MB
9250 struct coding_system *terminal_coding
9251 = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9252 Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
ae6f73fa 9253
6d5eb5b0 9254 /* For backward compatibility, return nil if it is `undecided'. */
f75c90a9 9255 return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
4ed46869
KH
9256}
9257
002fdb44 9258DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
68bba4e4 9259 Sset_keyboard_coding_system_internal, 1, 2, 0,
48b0f3ae 9260 doc: /* Internal use only. */)
5842a27b 9261 (Lisp_Object coding_system, Lisp_Object terminal)
4ed46869 9262{
6ed8eeff 9263 struct terminal *t = get_terminal (terminal, 1);
b7826503 9264 CHECK_SYMBOL (coding_system);
624bda09
KH
9265 if (NILP (coding_system))
9266 coding_system = Qno_conversion;
9267 else
9268 Fcheck_coding_system (coding_system);
9269 setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
ad1746f5 9270 /* Character composition should be disabled. */
c73bd236
MB
9271 TERMINAL_KEYBOARD_CODING (t)->common_flags
9272 &= ~CODING_ANNOTATE_COMPOSITION_MASK;
4ed46869
KH
9273 return Qnil;
9274}
9275
9276DEFUN ("keyboard-coding-system",
985773c9 9277 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
48b0f3ae 9278 doc: /* Return coding system specified for decoding keyboard input. */)
5842a27b 9279 (Lisp_Object terminal)
4ed46869 9280{
985773c9
MB
9281 return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9282 (get_terminal (terminal, 1))->id);
4ed46869
KH
9283}
9284
4ed46869 9285\f
a7ca3326 9286DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
a5d301df 9287 Sfind_operation_coding_system, 1, MANY, 0,
48b0f3ae
PJ
9288 doc: /* Choose a coding system for an operation based on the target name.
9289The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9290DECODING-SYSTEM is the coding system to use for decoding
9291\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9292for encoding (in case OPERATION does encoding).
05e6f5dc 9293
48b0f3ae
PJ
9294The first argument OPERATION specifies an I/O primitive:
9295 For file I/O, `insert-file-contents' or `write-region'.
9296 For process I/O, `call-process', `call-process-region', or `start-process'.
9297 For network I/O, `open-network-stream'.
05e6f5dc 9298
48b0f3ae
PJ
9299The remaining arguments should be the same arguments that were passed
9300to the primitive. Depending on which primitive, one of those arguments
9301is selected as the TARGET. For example, if OPERATION does file I/O,
9302whichever argument specifies the file name is TARGET.
05e6f5dc 9303
48b0f3ae 9304TARGET has a meaning which depends on OPERATION:
b883cdb2 9305 For file I/O, TARGET is a file name (except for the special case below).
48b0f3ae 9306 For process I/O, TARGET is a process name.
d4a1d553 9307 For network I/O, TARGET is a service name or a port number.
05e6f5dc 9308
d4a1d553 9309This function looks up what is specified for TARGET in
48b0f3ae
PJ
9310`file-coding-system-alist', `process-coding-system-alist',
9311or `network-coding-system-alist' depending on OPERATION.
9312They may specify a coding system, a cons of coding systems,
9313or a function symbol to call.
9314In the last case, we call the function with one argument,
9315which is a list of all the arguments given to this function.
1011c487
MB
9316If the function can't decide a coding system, it can return
9317`undecided' so that the normal code-detection is performed.
48b0f3ae 9318
b883cdb2
MB
9319If OPERATION is `insert-file-contents', the argument corresponding to
9320TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
9321file name to look up, and BUFFER is a buffer that contains the file's
9322contents (not yet decoded). If `file-coding-system-alist' specifies a
9323function to call for FILENAME, that function should examine the
9324contents of BUFFER instead of reading the file.
9325
d918f936 9326usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
f66c7cf8 9327 (ptrdiff_t nargs, Lisp_Object *args)
6b89e3aa 9328{
4ed46869
KH
9329 Lisp_Object operation, target_idx, target, val;
9330 register Lisp_Object chain;
177c0ea7 9331
4ed46869
KH
9332 if (nargs < 2)
9333 error ("Too few arguments");
9334 operation = args[0];
9335 if (!SYMBOLP (operation)
d311d28c 9336 || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
3ed051d4 9337 error ("Invalid first argument");
7b09a37a 9338 if (nargs <= 1 + XFASTINT (target_idx))
94dcfacf 9339 error ("Too few arguments for operation `%s'",
8f924df7 9340 SDATA (SYMBOL_NAME (operation)));
c5101a77 9341 target = args[XFASTINT (target_idx) + 1];
4ed46869 9342 if (!(STRINGP (target)
091a0ff0
KH
9343 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9344 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
4ed46869 9345 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
94dcfacf
EZ
9346 error ("Invalid argument %"pI"d of operation `%s'",
9347 XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
091a0ff0
KH
9348 if (CONSP (target))
9349 target = XCAR (target);
4ed46869 9350
2e34157c
RS
9351 chain = ((EQ (operation, Qinsert_file_contents)
9352 || EQ (operation, Qwrite_region))
02ba4723 9353 ? Vfile_coding_system_alist
2e34157c 9354 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
9355 ? Vnetwork_coding_system_alist
9356 : Vprocess_coding_system_alist));
4ed46869
KH
9357 if (NILP (chain))
9358 return Qnil;
9359
03699b14 9360 for (; CONSP (chain); chain = XCDR (chain))
6b89e3aa 9361 {
f44d27ce 9362 Lisp_Object elt;
6b89e3aa 9363
df7492f9 9364 elt = XCAR (chain);
4ed46869
KH
9365 if (CONSP (elt)
9366 && ((STRINGP (target)
03699b14
KR
9367 && STRINGP (XCAR (elt))
9368 && fast_string_match (XCAR (elt), target) >= 0)
9369 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
6b89e3aa 9370 {
03699b14 9371 val = XCDR (elt);
b19fd4c5
KH
9372 /* Here, if VAL is both a valid coding system and a valid
9373 function symbol, we return VAL as a coding system. */
02ba4723
KH
9374 if (CONSP (val))
9375 return val;
9376 if (! SYMBOLP (val))
9377 return Qnil;
9378 if (! NILP (Fcoding_system_p (val)))
9379 return Fcons (val, val);
b19fd4c5 9380 if (! NILP (Ffboundp (val)))
6b89e3aa 9381 {
e2b97060
MB
9382 /* We use call1 rather than safe_call1
9383 so as to get bug reports about functions called here
9384 which don't handle the current interface. */
9385 val = call1 (val, Flist (nargs, args));
b19fd4c5
KH
9386 if (CONSP (val))
9387 return val;
9388 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9389 return Fcons (val, val);
6b89e3aa 9390 }
02ba4723 9391 return Qnil;
6b89e3aa
KH
9392 }
9393 }
4ed46869 9394 return Qnil;
6b89e3aa
KH
9395}
9396
df7492f9 9397DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
a3f6ee6d 9398 Sset_coding_system_priority, 0, MANY, 0,
da7db224 9399 doc: /* Assign higher priority to the coding systems given as arguments.
d4a1d553 9400If multiple coding systems belong to the same category,
a3181084
DL
9401all but the first one are ignored.
9402
d4a1d553 9403usage: (set-coding-system-priority &rest coding-systems) */)
f66c7cf8 9404 (ptrdiff_t nargs, Lisp_Object *args)
df7492f9 9405{
f66c7cf8 9406 ptrdiff_t i, j;
f10fe38f 9407 bool changed[coding_category_max];
df7492f9
KH
9408 enum coding_category priorities[coding_category_max];
9409
72af86bd 9410 memset (changed, 0, sizeof changed);
6b89e3aa 9411
df7492f9 9412 for (i = j = 0; i < nargs; i++)
6b89e3aa 9413 {
df7492f9
KH
9414 enum coding_category category;
9415 Lisp_Object spec, attrs;
6b89e3aa 9416
df7492f9
KH
9417 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9418 attrs = AREF (spec, 0);
9419 category = XINT (CODING_ATTR_CATEGORY (attrs));
9420 if (changed[category])
9421 /* Ignore this coding system because a coding system of the
9422 same category already had a higher priority. */
9423 continue;
9424 changed[category] = 1;
9425 priorities[j++] = category;
9426 if (coding_categories[category].id >= 0
9427 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9428 setup_coding_system (args[i], &coding_categories[category]);
ff563fce 9429 Fset (AREF (Vcoding_category_table, category), args[i]);
df7492f9 9430 }
6b89e3aa 9431
df7492f9
KH
9432 /* Now we have decided top J priorities. Reflect the order of the
9433 original priorities to the remaining priorities. */
6b89e3aa 9434
df7492f9 9435 for (i = j, j = 0; i < coding_category_max; i++, j++)
6b89e3aa 9436 {
df7492f9
KH
9437 while (j < coding_category_max
9438 && changed[coding_priorities[j]])
9439 j++;
9440 if (j == coding_category_max)
1088b922 9441 emacs_abort ();
df7492f9
KH
9442 priorities[i] = coding_priorities[j];
9443 }
6b89e3aa 9444
72af86bd 9445 memcpy (coding_priorities, priorities, sizeof priorities);
177c0ea7 9446
ff563fce
KH
9447 /* Update `coding-category-list'. */
9448 Vcoding_category_list = Qnil;
c5101a77 9449 for (i = coding_category_max; i-- > 0; )
ff563fce
KH
9450 Vcoding_category_list
9451 = Fcons (AREF (Vcoding_category_table, priorities[i]),
9452 Vcoding_category_list);
6b89e3aa 9453
df7492f9 9454 return Qnil;
6b89e3aa
KH
9455}
9456
df7492f9
KH
9457DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9458 Scoding_system_priority_list, 0, 1, 0,
da7db224 9459 doc: /* Return a list of coding systems ordered by their priorities.
b811c52b
KH
9460The list contains a subset of coding systems; i.e. coding systems
9461assigned to each coding category (see `coding-category-list').
9462
da7db224 9463HIGHESTP non-nil means just return the highest priority one. */)
5842a27b 9464 (Lisp_Object highestp)
d46c5b12
KH
9465{
9466 int i;
df7492f9 9467 Lisp_Object val;
6b89e3aa 9468
df7492f9 9469 for (i = 0, val = Qnil; i < coding_category_max; i++)
d46c5b12 9470 {
df7492f9
KH
9471 enum coding_category category = coding_priorities[i];
9472 int id = coding_categories[category].id;
9473 Lisp_Object attrs;
068a9dbd 9474
df7492f9
KH
9475 if (id < 0)
9476 continue;
9477 attrs = CODING_ID_ATTRS (id);
9478 if (! NILP (highestp))
9479 return CODING_ATTR_BASE_NAME (attrs);
9480 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9481 }
9482 return Fnreverse (val);
9483}
068a9dbd 9484
91433552 9485static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
068a9dbd
KH
9486
9487static Lisp_Object
971de7fb 9488make_subsidiaries (Lisp_Object base)
068a9dbd 9489{
df7492f9 9490 Lisp_Object subsidiaries;
1bfdaf10 9491 ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
38182d90 9492 char *buf = alloca (base_name_len + 6);
df7492f9 9493 int i;
068a9dbd 9494
72af86bd 9495 memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
df7492f9
KH
9496 subsidiaries = Fmake_vector (make_number (3), Qnil);
9497 for (i = 0; i < 3; i++)
068a9dbd 9498 {
1bfdaf10 9499 strcpy (buf + base_name_len, suffixes[i]);
df7492f9 9500 ASET (subsidiaries, i, intern (buf));
068a9dbd 9501 }
df7492f9 9502 return subsidiaries;
068a9dbd
KH
9503}
9504
9505
df7492f9
KH
9506DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9507 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
1fcd6c8b
DL
9508 doc: /* For internal use only.
9509usage: (define-coding-system-internal ...) */)
f66c7cf8 9510 (ptrdiff_t nargs, Lisp_Object *args)
068a9dbd 9511{
df7492f9
KH
9512 Lisp_Object name;
9513 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
9514 Lisp_Object attrs; /* Vector of attributes. */
9515 Lisp_Object eol_type;
9516 Lisp_Object aliases;
9517 Lisp_Object coding_type, charset_list, safe_charsets;
9518 enum coding_category category;
9519 Lisp_Object tail, val;
9520 int max_charset_id = 0;
9521 int i;
068a9dbd 9522
df7492f9
KH
9523 if (nargs < coding_arg_max)
9524 goto short_args;
068a9dbd 9525
df7492f9 9526 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
068a9dbd 9527
df7492f9
KH
9528 name = args[coding_arg_name];
9529 CHECK_SYMBOL (name);
4939150c 9530 ASET (attrs, coding_attr_base_name, name);
068a9dbd 9531
df7492f9
KH
9532 val = args[coding_arg_mnemonic];
9533 if (! STRINGP (val))
9534 CHECK_CHARACTER (val);
4939150c 9535 ASET (attrs, coding_attr_mnemonic, val);
068a9dbd 9536
df7492f9
KH
9537 coding_type = args[coding_arg_coding_type];
9538 CHECK_SYMBOL (coding_type);
4939150c 9539 ASET (attrs, coding_attr_type, coding_type);
068a9dbd 9540
df7492f9
KH
9541 charset_list = args[coding_arg_charset_list];
9542 if (SYMBOLP (charset_list))
9543 {
9544 if (EQ (charset_list, Qiso_2022))
9545 {
9546 if (! EQ (coding_type, Qiso_2022))
9547 error ("Invalid charset-list");
9548 charset_list = Viso_2022_charset_list;
9549 }
9550 else if (EQ (charset_list, Qemacs_mule))
9551 {
9552 if (! EQ (coding_type, Qemacs_mule))
9553 error ("Invalid charset-list");
9554 charset_list = Vemacs_mule_charset_list;
9555 }
9556 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
d311d28c
PE
9557 {
9558 if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
9559 error ("Invalid charset-list");
9560 if (max_charset_id < XFASTINT (XCAR (tail)))
9561 max_charset_id = XFASTINT (XCAR (tail));
9562 }
df7492f9 9563 }
068a9dbd
KH
9564 else
9565 {
df7492f9 9566 charset_list = Fcopy_sequence (charset_list);
985773c9 9567 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
068a9dbd 9568 {
df7492f9
KH
9569 struct charset *charset;
9570
985773c9 9571 val = XCAR (tail);
df7492f9
KH
9572 CHECK_CHARSET_GET_CHARSET (val, charset);
9573 if (EQ (coding_type, Qiso_2022)
9574 ? CHARSET_ISO_FINAL (charset) < 0
9575 : EQ (coding_type, Qemacs_mule)
9576 ? CHARSET_EMACS_MULE_ID (charset) < 0
9577 : 0)
9578 error ("Can't handle charset `%s'",
8f924df7 9579 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
df7492f9 9580
8f924df7 9581 XSETCAR (tail, make_number (charset->id));
df7492f9
KH
9582 if (max_charset_id < charset->id)
9583 max_charset_id = charset->id;
068a9dbd
KH
9584 }
9585 }
4939150c 9586 ASET (attrs, coding_attr_charset_list, charset_list);
068a9dbd 9587
1b3b981b
AS
9588 safe_charsets = make_uninit_string (max_charset_id + 1);
9589 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
df7492f9 9590 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8f924df7 9591 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
4939150c 9592 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
068a9dbd 9593
4939150c 9594 ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
3a73fa5d 9595
df7492f9 9596 val = args[coding_arg_decode_translation_table];
a6f87d34 9597 if (! CHAR_TABLE_P (val) && ! CONSP (val))
7d64c6ad 9598 CHECK_SYMBOL (val);
4939150c 9599 ASET (attrs, coding_attr_decode_tbl, val);
3a73fa5d 9600
df7492f9 9601 val = args[coding_arg_encode_translation_table];
a6f87d34 9602 if (! CHAR_TABLE_P (val) && ! CONSP (val))
7d64c6ad 9603 CHECK_SYMBOL (val);
4939150c 9604 ASET (attrs, coding_attr_encode_tbl, val);
d46c5b12 9605
df7492f9
KH
9606 val = args[coding_arg_post_read_conversion];
9607 CHECK_SYMBOL (val);
4939150c 9608 ASET (attrs, coding_attr_post_read, val);
d46c5b12 9609
df7492f9
KH
9610 val = args[coding_arg_pre_write_conversion];
9611 CHECK_SYMBOL (val);
4939150c 9612 ASET (attrs, coding_attr_pre_write, val);
3a73fa5d 9613
df7492f9
KH
9614 val = args[coding_arg_default_char];
9615 if (NILP (val))
4939150c 9616 ASET (attrs, coding_attr_default_char, make_number (' '));
df7492f9
KH
9617 else
9618 {
8f924df7 9619 CHECK_CHARACTER (val);
4939150c 9620 ASET (attrs, coding_attr_default_char, val);
df7492f9 9621 }
4031e2bf 9622
8f924df7 9623 val = args[coding_arg_for_unibyte];
4939150c 9624 ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
3a73fa5d 9625
df7492f9
KH
9626 val = args[coding_arg_plist];
9627 CHECK_LIST (val);
4939150c 9628 ASET (attrs, coding_attr_plist, val);
3a73fa5d 9629
df7492f9
KH
9630 if (EQ (coding_type, Qcharset))
9631 {
c7c66a95
KH
9632 /* Generate a lisp vector of 256 elements. Each element is nil,
9633 integer, or a list of charset IDs.
3a73fa5d 9634
c7c66a95
KH
9635 If Nth element is nil, the byte code N is invalid in this
9636 coding system.
4ed46869 9637
c7c66a95
KH
9638 If Nth element is a number NUM, N is the first byte of a
9639 charset whose ID is NUM.
4ed46869 9640
c7c66a95
KH
9641 If Nth element is a list of charset IDs, N is the first byte
9642 of one of them. The list is sorted by dimensions of the
ad1746f5 9643 charsets. A charset of smaller dimension comes first. */
df7492f9 9644 val = Fmake_vector (make_number (256), Qnil);
4ed46869 9645
5c99c2e6 9646 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
df7492f9 9647 {
c7c66a95
KH
9648 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9649 int dim = CHARSET_DIMENSION (charset);
9650 int idx = (dim - 1) * 4;
4ed46869 9651
5c99c2e6 9652 if (CHARSET_ASCII_COMPATIBLE_P (charset))
4939150c 9653 ASET (attrs, coding_attr_ascii_compat, Qt);
4031e2bf 9654
15d143f7
KH
9655 for (i = charset->code_space[idx];
9656 i <= charset->code_space[idx + 1]; i++)
9657 {
c7c66a95
KH
9658 Lisp_Object tmp, tmp2;
9659 int dim2;
ec6d2bb8 9660
c7c66a95
KH
9661 tmp = AREF (val, i);
9662 if (NILP (tmp))
9663 tmp = XCAR (tail);
9664 else if (NUMBERP (tmp))
9665 {
9666 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9667 if (dim < dim2)
c7c66a95 9668 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
f9d71dcd
KH
9669 else
9670 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
c7c66a95 9671 }
15d143f7 9672 else
c7c66a95
KH
9673 {
9674 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9675 {
9676 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9677 if (dim < dim2)
9678 break;
9679 }
9680 if (NILP (tmp2))
9681 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9682 else
9683 {
9684 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9685 XSETCAR (tmp2, XCAR (tail));
9686 }
9687 }
9688 ASET (val, i, tmp);
15d143f7 9689 }
df7492f9
KH
9690 }
9691 ASET (attrs, coding_attr_charset_valids, val);
9692 category = coding_category_charset;
9693 }
9694 else if (EQ (coding_type, Qccl))
9695 {
9696 Lisp_Object valids;
ecec61c1 9697
df7492f9
KH
9698 if (nargs < coding_arg_ccl_max)
9699 goto short_args;
ecec61c1 9700
df7492f9
KH
9701 val = args[coding_arg_ccl_decoder];
9702 CHECK_CCL_PROGRAM (val);
9703 if (VECTORP (val))
9704 val = Fcopy_sequence (val);
9705 ASET (attrs, coding_attr_ccl_decoder, val);
ecec61c1 9706
df7492f9
KH
9707 val = args[coding_arg_ccl_encoder];
9708 CHECK_CCL_PROGRAM (val);
9709 if (VECTORP (val))
9710 val = Fcopy_sequence (val);
9711 ASET (attrs, coding_attr_ccl_encoder, val);
ecec61c1 9712
df7492f9
KH
9713 val = args[coding_arg_ccl_valids];
9714 valids = Fmake_string (make_number (256), make_number (0));
7d7bbefd 9715 for (tail = val; CONSP (tail); tail = XCDR (tail))
df7492f9 9716 {
8dcbea82 9717 int from, to;
ecec61c1 9718
34348bd4 9719 val = XCAR (tail);
df7492f9 9720 if (INTEGERP (val))
8dcbea82 9721 {
d311d28c 9722 if (! (0 <= XINT (val) && XINT (val) <= 255))
8dcbea82 9723 args_out_of_range_3 (val, make_number (0), make_number (255));
d311d28c 9724 from = to = XINT (val);
8dcbea82 9725 }
df7492f9
KH
9726 else
9727 {
df7492f9 9728 CHECK_CONS (val);
8f924df7 9729 CHECK_NATNUM_CAR (val);
d311d28c
PE
9730 CHECK_NUMBER_CDR (val);
9731 if (XINT (XCAR (val)) > 255)
8dcbea82
KH
9732 args_out_of_range_3 (XCAR (val),
9733 make_number (0), make_number (255));
d311d28c
PE
9734 from = XINT (XCAR (val));
9735 if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
8dcbea82
KH
9736 args_out_of_range_3 (XCDR (val),
9737 XCAR (val), make_number (255));
d311d28c 9738 to = XINT (XCDR (val));
df7492f9 9739 }
8dcbea82 9740 for (i = from; i <= to; i++)
8f924df7 9741 SSET (valids, i, 1);
df7492f9
KH
9742 }
9743 ASET (attrs, coding_attr_ccl_valids, valids);
4ed46869 9744
df7492f9 9745 category = coding_category_ccl;
55ab7be3 9746 }
df7492f9 9747 else if (EQ (coding_type, Qutf_16))
55ab7be3 9748 {
df7492f9 9749 Lisp_Object bom, endian;
4ed46869 9750
4939150c 9751 ASET (attrs, coding_attr_ascii_compat, Qnil);
4ed46869 9752
df7492f9
KH
9753 if (nargs < coding_arg_utf16_max)
9754 goto short_args;
4ed46869 9755
df7492f9
KH
9756 bom = args[coding_arg_utf16_bom];
9757 if (! NILP (bom) && ! EQ (bom, Qt))
9758 {
9759 CHECK_CONS (bom);
8f924df7
KH
9760 val = XCAR (bom);
9761 CHECK_CODING_SYSTEM (val);
9762 val = XCDR (bom);
9763 CHECK_CODING_SYSTEM (val);
df7492f9 9764 }
a470d443 9765 ASET (attrs, coding_attr_utf_bom, bom);
df7492f9
KH
9766
9767 endian = args[coding_arg_utf16_endian];
b49a1807
KH
9768 CHECK_SYMBOL (endian);
9769 if (NILP (endian))
9770 endian = Qbig;
9771 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8f924df7 9772 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
df7492f9
KH
9773 ASET (attrs, coding_attr_utf_16_endian, endian);
9774
9775 category = (CONSP (bom)
9776 ? coding_category_utf_16_auto
9777 : NILP (bom)
b49a1807 9778 ? (EQ (endian, Qbig)
df7492f9
KH
9779 ? coding_category_utf_16_be_nosig
9780 : coding_category_utf_16_le_nosig)
b49a1807 9781 : (EQ (endian, Qbig)
df7492f9
KH
9782 ? coding_category_utf_16_be
9783 : coding_category_utf_16_le));
9784 }
9785 else if (EQ (coding_type, Qiso_2022))
9786 {
9787 Lisp_Object initial, reg_usage, request, flags;
1397dc18 9788
df7492f9
KH
9789 if (nargs < coding_arg_iso2022_max)
9790 goto short_args;
9791
9792 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9793 CHECK_VECTOR (initial);
9794 for (i = 0; i < 4; i++)
9795 {
9796 val = Faref (initial, make_number (i));
9797 if (! NILP (val))
9798 {
584948ac
KH
9799 struct charset *charset;
9800
9801 CHECK_CHARSET_GET_CHARSET (val, charset);
9802 ASET (initial, i, make_number (CHARSET_ID (charset)));
9803 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
4939150c 9804 ASET (attrs, coding_attr_ascii_compat, Qt);
df7492f9
KH
9805 }
9806 else
9807 ASET (initial, i, make_number (-1));
9808 }
9809
9810 reg_usage = args[coding_arg_iso2022_reg_usage];
9811 CHECK_CONS (reg_usage);
8f924df7
KH
9812 CHECK_NUMBER_CAR (reg_usage);
9813 CHECK_NUMBER_CDR (reg_usage);
df7492f9
KH
9814
9815 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
7d7bbefd 9816 for (tail = request; CONSP (tail); tail = XCDR (tail))
1397dc18 9817 {
df7492f9 9818 int id;
2735d060 9819 Lisp_Object tmp1;
df7492f9 9820
34348bd4 9821 val = XCAR (tail);
df7492f9 9822 CHECK_CONS (val);
2735d060
PE
9823 tmp1 = XCAR (val);
9824 CHECK_CHARSET_GET_ID (tmp1, id);
8f924df7 9825 CHECK_NATNUM_CDR (val);
df7492f9 9826 if (XINT (XCDR (val)) >= 4)
c2982e87 9827 error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
8f924df7 9828 XSETCAR (val, make_number (id));
1397dc18 9829 }
4ed46869 9830
df7492f9
KH
9831 flags = args[coding_arg_iso2022_flags];
9832 CHECK_NATNUM (flags);
d311d28c 9833 i = XINT (flags) & INT_MAX;
df7492f9 9834 if (EQ (args[coding_arg_charset_list], Qiso_2022))
d311d28c
PE
9835 i |= CODING_ISO_FLAG_FULL_SUPPORT;
9836 flags = make_number (i);
df7492f9
KH
9837
9838 ASET (attrs, coding_attr_iso_initial, initial);
9839 ASET (attrs, coding_attr_iso_usage, reg_usage);
9840 ASET (attrs, coding_attr_iso_request, request);
9841 ASET (attrs, coding_attr_iso_flags, flags);
9842 setup_iso_safe_charsets (attrs);
9843
9844 if (i & CODING_ISO_FLAG_SEVEN_BITS)
9845 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9846 | CODING_ISO_FLAG_SINGLE_SHIFT))
9847 ? coding_category_iso_7_else
9848 : EQ (args[coding_arg_charset_list], Qiso_2022)
9849 ? coding_category_iso_7
9850 : coding_category_iso_7_tight);
9851 else
9852 {
9853 int id = XINT (AREF (initial, 1));
9854
c6fb6e98 9855 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
df7492f9
KH
9856 || EQ (args[coding_arg_charset_list], Qiso_2022)
9857 || id < 0)
9858 ? coding_category_iso_8_else
9859 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9860 ? coding_category_iso_8_1
9861 : coding_category_iso_8_2);
9862 }
0ce7886f
KH
9863 if (category != coding_category_iso_8_1
9864 && category != coding_category_iso_8_2)
4939150c 9865 ASET (attrs, coding_attr_ascii_compat, Qnil);
df7492f9
KH
9866 }
9867 else if (EQ (coding_type, Qemacs_mule))
c28a9453 9868 {
df7492f9
KH
9869 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9870 ASET (attrs, coding_attr_emacs_mule_full, Qt);
4939150c 9871 ASET (attrs, coding_attr_ascii_compat, Qt);
df7492f9 9872 category = coding_category_emacs_mule;
c28a9453 9873 }
df7492f9 9874 else if (EQ (coding_type, Qshift_jis))
c28a9453 9875 {
df7492f9
KH
9876
9877 struct charset *charset;
9878
7d64c6ad 9879 if (XINT (Flength (charset_list)) != 3
6e07c25f 9880 && XINT (Flength (charset_list)) != 4)
7d64c6ad 9881 error ("There should be three or four charsets");
df7492f9
KH
9882
9883 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9884 if (CHARSET_DIMENSION (charset) != 1)
9885 error ("Dimension of charset %s is not one",
8f924df7 9886 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
584948ac 9887 if (CHARSET_ASCII_COMPATIBLE_P (charset))
4939150c 9888 ASET (attrs, coding_attr_ascii_compat, Qt);
df7492f9
KH
9889
9890 charset_list = XCDR (charset_list);
9891 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9892 if (CHARSET_DIMENSION (charset) != 1)
9893 error ("Dimension of charset %s is not one",
8f924df7 9894 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
df7492f9
KH
9895
9896 charset_list = XCDR (charset_list);
9897 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9898 if (CHARSET_DIMENSION (charset) != 2)
7d64c6ad
KH
9899 error ("Dimension of charset %s is not two",
9900 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9901
9902 charset_list = XCDR (charset_list);
2b917a06
KH
9903 if (! NILP (charset_list))
9904 {
9905 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9906 if (CHARSET_DIMENSION (charset) != 2)
9907 error ("Dimension of charset %s is not two",
9908 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9909 }
df7492f9
KH
9910
9911 category = coding_category_sjis;
9912 Vsjis_coding_system = name;
c28a9453 9913 }
df7492f9
KH
9914 else if (EQ (coding_type, Qbig5))
9915 {
9916 struct charset *charset;
4ed46869 9917
df7492f9
KH
9918 if (XINT (Flength (charset_list)) != 2)
9919 error ("There should be just two charsets");
9920
9921 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9922 if (CHARSET_DIMENSION (charset) != 1)
9923 error ("Dimension of charset %s is not one",
8f924df7 9924 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
584948ac 9925 if (CHARSET_ASCII_COMPATIBLE_P (charset))
4939150c 9926 ASET (attrs, coding_attr_ascii_compat, Qt);
df7492f9
KH
9927
9928 charset_list = XCDR (charset_list);
9929 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9930 if (CHARSET_DIMENSION (charset) != 2)
9931 error ("Dimension of charset %s is not two",
8f924df7 9932 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
4ed46869 9933
df7492f9
KH
9934 category = coding_category_big5;
9935 Vbig5_coding_system = name;
9936 }
9937 else if (EQ (coding_type, Qraw_text))
c28a9453 9938 {
584948ac 9939 category = coding_category_raw_text;
4939150c 9940 ASET (attrs, coding_attr_ascii_compat, Qt);
c28a9453 9941 }
df7492f9 9942 else if (EQ (coding_type, Qutf_8))
4ed46869 9943 {
a470d443
KH
9944 Lisp_Object bom;
9945
a470d443
KH
9946 if (nargs < coding_arg_utf8_max)
9947 goto short_args;
9948
9949 bom = args[coding_arg_utf8_bom];
9950 if (! NILP (bom) && ! EQ (bom, Qt))
9951 {
9952 CHECK_CONS (bom);
9953 val = XCAR (bom);
9954 CHECK_CODING_SYSTEM (val);
9955 val = XCDR (bom);
9956 CHECK_CODING_SYSTEM (val);
9957 }
9958 ASET (attrs, coding_attr_utf_bom, bom);
0e5317f7 9959 if (NILP (bom))
4939150c 9960 ASET (attrs, coding_attr_ascii_compat, Qt);
a470d443
KH
9961
9962 category = (CONSP (bom) ? coding_category_utf_8_auto
9963 : NILP (bom) ? coding_category_utf_8_nosig
9964 : coding_category_utf_8_sig);
4ed46869 9965 }
df7492f9
KH
9966 else if (EQ (coding_type, Qundecided))
9967 category = coding_category_undecided;
4ed46869 9968 else
df7492f9 9969 error ("Invalid coding system type: %s",
8f924df7 9970 SDATA (SYMBOL_NAME (coding_type)));
4ed46869 9971
4939150c
PE
9972 ASET (attrs, coding_attr_category, make_number (category));
9973 ASET (attrs, coding_attr_plist,
9974 Fcons (QCcategory,
9975 Fcons (AREF (Vcoding_category_table, category),
9976 CODING_ATTR_PLIST (attrs))));
9977 ASET (attrs, coding_attr_plist,
9978 Fcons (QCascii_compatible_p,
9979 Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9980 CODING_ATTR_PLIST (attrs))));
c4825358 9981
df7492f9
KH
9982 eol_type = args[coding_arg_eol_type];
9983 if (! NILP (eol_type)
9984 && ! EQ (eol_type, Qunix)
9985 && ! EQ (eol_type, Qdos)
9986 && ! EQ (eol_type, Qmac))
9987 error ("Invalid eol-type");
4ed46869 9988
df7492f9 9989 aliases = Fcons (name, Qnil);
4ed46869 9990
df7492f9
KH
9991 if (NILP (eol_type))
9992 {
9993 eol_type = make_subsidiaries (name);
9994 for (i = 0; i < 3; i++)
1397dc18 9995 {
df7492f9
KH
9996 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
9997
9998 this_name = AREF (eol_type, i);
9999 this_aliases = Fcons (this_name, Qnil);
10000 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10001 this_spec = Fmake_vector (make_number (3), attrs);
10002 ASET (this_spec, 1, this_aliases);
10003 ASET (this_spec, 2, this_eol_type);
10004 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10005 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
583f71ca
KH
10006 val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10007 if (NILP (val))
10008 Vcoding_system_alist
10009 = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10010 Vcoding_system_alist);
1397dc18 10011 }
d46c5b12 10012 }
4ed46869 10013
df7492f9
KH
10014 spec_vec = Fmake_vector (make_number (3), attrs);
10015 ASET (spec_vec, 1, aliases);
10016 ASET (spec_vec, 2, eol_type);
48b0f3ae 10017
df7492f9
KH
10018 Fputhash (name, spec_vec, Vcoding_system_hash_table);
10019 Vcoding_system_list = Fcons (name, Vcoding_system_list);
583f71ca
KH
10020 val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10021 if (NILP (val))
10022 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10023 Vcoding_system_alist);
48b0f3ae 10024
df7492f9
KH
10025 {
10026 int id = coding_categories[category].id;
48b0f3ae 10027
df7492f9
KH
10028 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10029 setup_coding_system (name, &coding_categories[category]);
10030 }
48b0f3ae 10031
d46c5b12 10032 return Qnil;
48b0f3ae 10033
df7492f9
KH
10034 short_args:
10035 return Fsignal (Qwrong_number_of_arguments,
10036 Fcons (intern ("define-coding-system-internal"),
10037 make_number (nargs)));
d46c5b12 10038}
4ed46869 10039
d6925f38 10040
a6f87d34
KH
10041DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10042 3, 3, 0,
10043 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
5842a27b 10044 (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
a6f87d34 10045{
3dbe7859 10046 Lisp_Object spec, attrs;
a6f87d34
KH
10047
10048 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10049 attrs = AREF (spec, 0);
10050 if (EQ (prop, QCmnemonic))
10051 {
10052 if (! STRINGP (val))
10053 CHECK_CHARACTER (val);
4939150c 10054 ASET (attrs, coding_attr_mnemonic, val);
a6f87d34 10055 }
2133e2d1 10056 else if (EQ (prop, QCdefault_char))
a6f87d34
KH
10057 {
10058 if (NILP (val))
10059 val = make_number (' ');
10060 else
10061 CHECK_CHARACTER (val);
4939150c 10062 ASET (attrs, coding_attr_default_char, val);
a6f87d34
KH
10063 }
10064 else if (EQ (prop, QCdecode_translation_table))
10065 {
10066 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10067 CHECK_SYMBOL (val);
4939150c 10068 ASET (attrs, coding_attr_decode_tbl, val);
a6f87d34
KH
10069 }
10070 else if (EQ (prop, QCencode_translation_table))
10071 {
10072 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10073 CHECK_SYMBOL (val);
4939150c 10074 ASET (attrs, coding_attr_encode_tbl, val);
a6f87d34
KH
10075 }
10076 else if (EQ (prop, QCpost_read_conversion))
10077 {
10078 CHECK_SYMBOL (val);
4939150c 10079 ASET (attrs, coding_attr_post_read, val);
a6f87d34
KH
10080 }
10081 else if (EQ (prop, QCpre_write_conversion))
10082 {
10083 CHECK_SYMBOL (val);
4939150c 10084 ASET (attrs, coding_attr_pre_write, val);
a6f87d34 10085 }
35befdaa
KH
10086 else if (EQ (prop, QCascii_compatible_p))
10087 {
4939150c 10088 ASET (attrs, coding_attr_ascii_compat, val);
35befdaa 10089 }
a6f87d34 10090
4939150c
PE
10091 ASET (attrs, coding_attr_plist,
10092 Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
a6f87d34
KH
10093 return val;
10094}
10095
10096
df7492f9
KH
10097DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10098 Sdefine_coding_system_alias, 2, 2, 0,
10099 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
5842a27b 10100 (Lisp_Object alias, Lisp_Object coding_system)
66cfb530 10101{
583f71ca 10102 Lisp_Object spec, aliases, eol_type, val;
4ed46869 10103
df7492f9
KH
10104 CHECK_SYMBOL (alias);
10105 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10106 aliases = AREF (spec, 1);
d4a1d553 10107 /* ALIASES should be a list of length more than zero, and the first
d6925f38
KH
10108 element is a base coding system. Append ALIAS at the tail of the
10109 list. */
df7492f9
KH
10110 while (!NILP (XCDR (aliases)))
10111 aliases = XCDR (aliases);
8f924df7 10112 XSETCDR (aliases, Fcons (alias, Qnil));
4ed46869 10113
df7492f9
KH
10114 eol_type = AREF (spec, 2);
10115 if (VECTORP (eol_type))
4ed46869 10116 {
df7492f9
KH
10117 Lisp_Object subsidiaries;
10118 int i;
4ed46869 10119
df7492f9
KH
10120 subsidiaries = make_subsidiaries (alias);
10121 for (i = 0; i < 3; i++)
10122 Fdefine_coding_system_alias (AREF (subsidiaries, i),
10123 AREF (eol_type, i));
4ed46869 10124 }
df7492f9
KH
10125
10126 Fputhash (alias, spec, Vcoding_system_hash_table);
d6925f38 10127 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
583f71ca
KH
10128 val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10129 if (NILP (val))
10130 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10131 Vcoding_system_alist);
66cfb530 10132
4ed46869
KH
10133 return Qnil;
10134}
10135
a7ca3326 10136DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
df7492f9
KH
10137 1, 1, 0,
10138 doc: /* Return the base of CODING-SYSTEM.
da7db224 10139Any alias or subsidiary coding system is not a base coding system. */)
5842a27b 10140 (Lisp_Object coding_system)
d46c5b12 10141{
df7492f9 10142 Lisp_Object spec, attrs;
d46c5b12 10143
df7492f9
KH
10144 if (NILP (coding_system))
10145 return (Qno_conversion);
10146 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10147 attrs = AREF (spec, 0);
10148 return CODING_ATTR_BASE_NAME (attrs);
10149}
1397dc18 10150
df7492f9
KH
10151DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10152 1, 1, 0,
10153 doc: "Return the property list of CODING-SYSTEM.")
5842a27b 10154 (Lisp_Object coding_system)
df7492f9
KH
10155{
10156 Lisp_Object spec, attrs;
1397dc18 10157
df7492f9
KH
10158 if (NILP (coding_system))
10159 coding_system = Qno_conversion;
10160 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10161 attrs = AREF (spec, 0);
10162 return CODING_ATTR_PLIST (attrs);
d46c5b12
KH
10163}
10164
df7492f9
KH
10165
10166DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10167 1, 1, 0,
da7db224 10168 doc: /* Return the list of aliases of CODING-SYSTEM. */)
5842a27b 10169 (Lisp_Object coding_system)
66cfb530 10170{
df7492f9 10171 Lisp_Object spec;
84d60297 10172
df7492f9
KH
10173 if (NILP (coding_system))
10174 coding_system = Qno_conversion;
10175 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
da7db224 10176 return AREF (spec, 1);
df7492f9 10177}
66cfb530 10178
a7ca3326 10179DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
df7492f9
KH
10180 Scoding_system_eol_type, 1, 1, 0,
10181 doc: /* Return eol-type of CODING-SYSTEM.
d4a1d553 10182An eol-type is an integer 0, 1, 2, or a vector of coding systems.
66cfb530 10183
df7492f9
KH
10184Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10185and CR respectively.
66cfb530 10186
df7492f9
KH
10187A vector value indicates that a format of end-of-line should be
10188detected automatically. Nth element of the vector is the subsidiary
10189coding system whose eol-type is N. */)
5842a27b 10190 (Lisp_Object coding_system)
6b89e3aa 10191{
df7492f9
KH
10192 Lisp_Object spec, eol_type;
10193 int n;
6b89e3aa 10194
df7492f9
KH
10195 if (NILP (coding_system))
10196 coding_system = Qno_conversion;
10197 if (! CODING_SYSTEM_P (coding_system))
10198 return Qnil;
10199 spec = CODING_SYSTEM_SPEC (coding_system);
10200 eol_type = AREF (spec, 2);
10201 if (VECTORP (eol_type))
10202 return Fcopy_sequence (eol_type);
10203 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10204 return make_number (n);
6b89e3aa
KH
10205}
10206
4ed46869
KH
10207#endif /* emacs */
10208
10209\f
/*** 12. Postamble ***/
4ed46869 10211
dfcf069d 10212void
971de7fb 10213init_coding_once (void)
4ed46869
KH
10214{
10215 int i;
10216
df7492f9
KH
10217 for (i = 0; i < coding_category_max; i++)
10218 {
10219 coding_categories[i].id = -1;
10220 coding_priorities[i] = i;
10221 }
4ed46869
KH
10222
10223 /* ISO2022 specific initialize routine. */
10224 for (i = 0; i < 0x20; i++)
b73bfc1c 10225 iso_code_class[i] = ISO_control_0;
4ed46869
KH
10226 for (i = 0x21; i < 0x7F; i++)
10227 iso_code_class[i] = ISO_graphic_plane_0;
10228 for (i = 0x80; i < 0xA0; i++)
b73bfc1c 10229 iso_code_class[i] = ISO_control_1;
4ed46869
KH
10230 for (i = 0xA1; i < 0xFF; i++)
10231 iso_code_class[i] = ISO_graphic_plane_1;
10232 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10233 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
4ed46869
KH
10234 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10235 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10236 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10237 iso_code_class[ISO_CODE_ESC] = ISO_escape;
10238 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10239 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10240 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10241
df7492f9
KH
10242 for (i = 0; i < 256; i++)
10243 {
10244 emacs_mule_bytes[i] = 1;
10245 }
7c78e542
KH
10246 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10247 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10248 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10249 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
e0e989f6
KH
10250}
10251
10252#ifdef emacs
10253
dfcf069d 10254void
971de7fb 10255syms_of_coding (void)
e0e989f6 10256{
df7492f9 10257 staticpro (&Vcoding_system_hash_table);
8f924df7
KH
10258 {
10259 Lisp_Object args[2];
10260 args[0] = QCtest;
10261 args[1] = Qeq;
10262 Vcoding_system_hash_table = Fmake_hash_table (2, args);
10263 }
df7492f9
KH
10264
10265 staticpro (&Vsjis_coding_system);
10266 Vsjis_coding_system = Qnil;
e0e989f6 10267
df7492f9
KH
10268 staticpro (&Vbig5_coding_system);
10269 Vbig5_coding_system = Qnil;
10270
24a73b0a
KH
10271 staticpro (&Vcode_conversion_reused_workbuf);
10272 Vcode_conversion_reused_workbuf = Qnil;
10273
10274 staticpro (&Vcode_conversion_workbuf_name);
2a0213a6 10275 Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
e0e989f6 10276
24a73b0a 10277 reused_workbuf_in_use = 0;
df7492f9
KH
10278
10279 DEFSYM (Qcharset, "charset");
10280 DEFSYM (Qtarget_idx, "target-idx");
10281 DEFSYM (Qcoding_system_history, "coding-system-history");
bb0115a2
RS
10282 Fset (Qcoding_system_history, Qnil);
10283
9ce27fde 10284 /* Target FILENAME is the first argument. */
e0e989f6 10285 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 10286 /* Target FILENAME is the third argument. */
e0e989f6
KH
10287 Fput (Qwrite_region, Qtarget_idx, make_number (2));
10288
df7492f9 10289 DEFSYM (Qcall_process, "call-process");
9ce27fde 10290 /* Target PROGRAM is the first argument. */
e0e989f6
KH
10291 Fput (Qcall_process, Qtarget_idx, make_number (0));
10292
df7492f9 10293 DEFSYM (Qcall_process_region, "call-process-region");
9ce27fde 10294 /* Target PROGRAM is the third argument. */
e0e989f6
KH
10295 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10296
df7492f9 10297 DEFSYM (Qstart_process, "start-process");
9ce27fde 10298 /* Target PROGRAM is the third argument. */
e0e989f6
KH
10299 Fput (Qstart_process, Qtarget_idx, make_number (2));
10300
df7492f9 10301 DEFSYM (Qopen_network_stream, "open-network-stream");
9ce27fde 10302 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
10303 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10304
df7492f9
KH
10305 DEFSYM (Qcoding_system, "coding-system");
10306 DEFSYM (Qcoding_aliases, "coding-aliases");
4ed46869 10307
df7492f9
KH
10308 DEFSYM (Qeol_type, "eol-type");
10309 DEFSYM (Qunix, "unix");
10310 DEFSYM (Qdos, "dos");
4ed46869 10311
df7492f9
KH
10312 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10313 DEFSYM (Qpost_read_conversion, "post-read-conversion");
10314 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10315 DEFSYM (Qdefault_char, "default-char");
10316 DEFSYM (Qundecided, "undecided");
10317 DEFSYM (Qno_conversion, "no-conversion");
10318 DEFSYM (Qraw_text, "raw-text");
4ed46869 10319
df7492f9 10320 DEFSYM (Qiso_2022, "iso-2022");
4ed46869 10321
df7492f9 10322 DEFSYM (Qutf_8, "utf-8");
8f924df7 10323 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
27901516 10324
7f590b0c 10325#if defined (WINDOWSNT) || defined (CYGWIN)
ba116008
DC
10326 /* No, not utf-16-le: that one has a BOM. */
10327 DEFSYM (Qutf_16le, "utf-16le");
10328#endif
10329
df7492f9 10330 DEFSYM (Qutf_16, "utf-16");
df7492f9
KH
10331 DEFSYM (Qbig, "big");
10332 DEFSYM (Qlittle, "little");
27901516 10333
df7492f9
KH
10334 DEFSYM (Qshift_jis, "shift-jis");
10335 DEFSYM (Qbig5, "big5");
4ed46869 10336
df7492f9 10337 DEFSYM (Qcoding_system_p, "coding-system-p");
4ed46869 10338
df7492f9 10339 DEFSYM (Qcoding_system_error, "coding-system-error");
4ed46869 10340 Fput (Qcoding_system_error, Qerror_conditions,
3438fe21 10341 listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
4ed46869 10342 Fput (Qcoding_system_error, Qerror_message,
2a0213a6 10343 build_pure_c_string ("Invalid coding system"));
4ed46869 10344
05e6f5dc
KH
10345 /* Intern this now in case it isn't already done.
10346 Setting this variable twice is harmless.
10347 But don't staticpro it here--that is done in alloc.c. */
d67b4f80 10348 Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
70c22245 10349
df7492f9 10350 DEFSYM (Qtranslation_table, "translation-table");
433f7f87 10351 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
df7492f9
KH
10352 DEFSYM (Qtranslation_table_id, "translation-table-id");
10353 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10354 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
1397dc18 10355
df7492f9 10356 DEFSYM (Qvalid_codes, "valid-codes");
9ce27fde 10357
df7492f9 10358 DEFSYM (Qemacs_mule, "emacs-mule");
d46c5b12 10359
01378f49 10360 DEFSYM (QCcategory, ":category");
a6f87d34 10361 DEFSYM (QCmnemonic, ":mnemonic");
2133e2d1 10362 DEFSYM (QCdefault_char, ":default-char");
a6f87d34
KH
10363 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10364 DEFSYM (QCencode_translation_table, ":encode-translation-table");
10365 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10366 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
35befdaa 10367 DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
01378f49 10368
df7492f9
KH
10369 Vcoding_category_table
10370 = Fmake_vector (make_number (coding_category_max), Qnil);
10371 staticpro (&Vcoding_category_table);
10372 /* Followings are target of code detection. */
10373 ASET (Vcoding_category_table, coding_category_iso_7,
d67b4f80 10374 intern_c_string ("coding-category-iso-7"));
df7492f9 10375 ASET (Vcoding_category_table, coding_category_iso_7_tight,
d67b4f80 10376 intern_c_string ("coding-category-iso-7-tight"));
df7492f9 10377 ASET (Vcoding_category_table, coding_category_iso_8_1,
d67b4f80 10378 intern_c_string ("coding-category-iso-8-1"));
df7492f9 10379 ASET (Vcoding_category_table, coding_category_iso_8_2,
d67b4f80 10380 intern_c_string ("coding-category-iso-8-2"));
df7492f9 10381 ASET (Vcoding_category_table, coding_category_iso_7_else,
d67b4f80 10382 intern_c_string ("coding-category-iso-7-else"));
df7492f9 10383 ASET (Vcoding_category_table, coding_category_iso_8_else,
d67b4f80 10384 intern_c_string ("coding-category-iso-8-else"));
a470d443 10385 ASET (Vcoding_category_table, coding_category_utf_8_auto,
d67b4f80 10386 intern_c_string ("coding-category-utf-8-auto"));
a470d443 10387 ASET (Vcoding_category_table, coding_category_utf_8_nosig,
d67b4f80 10388 intern_c_string ("coding-category-utf-8"));
a470d443 10389 ASET (Vcoding_category_table, coding_category_utf_8_sig,
d67b4f80 10390 intern_c_string ("coding-category-utf-8-sig"));
df7492f9 10391 ASET (Vcoding_category_table, coding_category_utf_16_be,
d67b4f80 10392 intern_c_string ("coding-category-utf-16-be"));
ff563fce 10393 ASET (Vcoding_category_table, coding_category_utf_16_auto,
d67b4f80 10394 intern_c_string ("coding-category-utf-16-auto"));
df7492f9 10395 ASET (Vcoding_category_table, coding_category_utf_16_le,
d67b4f80 10396 intern_c_string ("coding-category-utf-16-le"));
df7492f9 10397 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
d67b4f80 10398 intern_c_string ("coding-category-utf-16-be-nosig"));
df7492f9 10399 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
d67b4f80 10400 intern_c_string ("coding-category-utf-16-le-nosig"));
df7492f9 10401 ASET (Vcoding_category_table, coding_category_charset,
d67b4f80 10402 intern_c_string ("coding-category-charset"));
df7492f9 10403 ASET (Vcoding_category_table, coding_category_sjis,
d67b4f80 10404 intern_c_string ("coding-category-sjis"));
df7492f9 10405 ASET (Vcoding_category_table, coding_category_big5,
d67b4f80 10406 intern_c_string ("coding-category-big5"));
df7492f9 10407 ASET (Vcoding_category_table, coding_category_ccl,
d67b4f80 10408 intern_c_string ("coding-category-ccl"));
df7492f9 10409 ASET (Vcoding_category_table, coding_category_emacs_mule,
d67b4f80 10410 intern_c_string ("coding-category-emacs-mule"));
df7492f9
KH
10411 /* Followings are NOT target of code detection. */
10412 ASET (Vcoding_category_table, coding_category_raw_text,
d67b4f80 10413 intern_c_string ("coding-category-raw-text"));
df7492f9 10414 ASET (Vcoding_category_table, coding_category_undecided,
d67b4f80 10415 intern_c_string ("coding-category-undecided"));
ecf488bc 10416
065e3595
KH
10417 DEFSYM (Qinsufficient_source, "insufficient-source");
10418 DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10419 DEFSYM (Qinvalid_source, "invalid-source");
10420 DEFSYM (Qinterrupted, "interrupted");
10421 DEFSYM (Qinsufficient_memory, "insufficient-memory");
44e8490d 10422 DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
065e3595 10423
4ed46869
KH
10424 defsubr (&Scoding_system_p);
10425 defsubr (&Sread_coding_system);
10426 defsubr (&Sread_non_nil_coding_system);
10427 defsubr (&Scheck_coding_system);
10428 defsubr (&Sdetect_coding_region);
d46c5b12 10429 defsubr (&Sdetect_coding_string);
05e6f5dc 10430 defsubr (&Sfind_coding_systems_region_internal);
068a9dbd 10431 defsubr (&Sunencodable_char_position);
df7492f9 10432 defsubr (&Scheck_coding_systems_region);
4ed46869
KH
10433 defsubr (&Sdecode_coding_region);
10434 defsubr (&Sencode_coding_region);
10435 defsubr (&Sdecode_coding_string);
10436 defsubr (&Sencode_coding_string);
10437 defsubr (&Sdecode_sjis_char);
10438 defsubr (&Sencode_sjis_char);
10439 defsubr (&Sdecode_big5_char);
10440 defsubr (&Sencode_big5_char);
1ba9e4ab 10441 defsubr (&Sset_terminal_coding_system_internal);
c4825358 10442 defsubr (&Sset_safe_terminal_coding_system_internal);
4ed46869 10443 defsubr (&Sterminal_coding_system);
1ba9e4ab 10444 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 10445 defsubr (&Skeyboard_coding_system);
a5d301df 10446 defsubr (&Sfind_operation_coding_system);
df7492f9 10447 defsubr (&Sset_coding_system_priority);
6b89e3aa 10448 defsubr (&Sdefine_coding_system_internal);
df7492f9 10449 defsubr (&Sdefine_coding_system_alias);
a6f87d34 10450 defsubr (&Scoding_system_put);
df7492f9
KH
10451 defsubr (&Scoding_system_base);
10452 defsubr (&Scoding_system_plist);
10453 defsubr (&Scoding_system_aliases);
10454 defsubr (&Scoding_system_eol_type);
10455 defsubr (&Scoding_system_priority_list);
4ed46869 10456
29208e82 10457 DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
48b0f3ae
PJ
10458 doc: /* List of coding systems.
10459
10460Do not alter the value of this variable manually. This variable should be
df7492f9 10461updated by the functions `define-coding-system' and
48b0f3ae 10462`define-coding-system-alias'. */);
4608c386
KH
10463 Vcoding_system_list = Qnil;
10464
29208e82 10465 DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
48b0f3ae
PJ
10466 doc: /* Alist of coding system names.
10467Each element is one element list of coding system name.
446dcd75 10468This variable is given to `completing-read' as COLLECTION argument.
48b0f3ae
PJ
10469
10470Do not alter the value of this variable manually. This variable should be
10471updated by the functions `make-coding-system' and
10472`define-coding-system-alias'. */);
4608c386
KH
10473 Vcoding_system_alist = Qnil;
10474
29208e82 10475 DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
48b0f3ae
PJ
10476 doc: /* List of coding-categories (symbols) ordered by priority.
10477
10478On detecting a coding system, Emacs tries code detection algorithms
10479associated with each coding-category one by one in this order. When
10480one algorithm agrees with a byte sequence of source text, the coding
0ec31faf
KH
10481system bound to the corresponding coding-category is selected.
10482
448e17d6 10483Don't modify this variable directly, but use `set-coding-system-priority'. */);
4ed46869
KH
10484 {
10485 int i;
10486
10487 Vcoding_category_list = Qnil;
df7492f9 10488 for (i = coding_category_max - 1; i >= 0; i--)
4ed46869 10489 Vcoding_category_list
28be1ada 10490 = Fcons (AREF (Vcoding_category_table, i),
d46c5b12 10491 Vcoding_category_list);
4ed46869
KH
10492 }
10493
29208e82 10494 DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
48b0f3ae
PJ
10495 doc: /* Specify the coding system for read operations.
10496It is useful to bind this variable with `let', but do not set it globally.
10497If the value is a coding system, it is used for decoding on read operation.
446dcd75
JB
10498If not, an appropriate element is used from one of the coding system alists.
10499There are three such tables: `file-coding-system-alist',
48b0f3ae 10500`process-coding-system-alist', and `network-coding-system-alist'. */);
4ed46869
KH
10501 Vcoding_system_for_read = Qnil;
10502
29208e82 10503 DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
48b0f3ae
PJ
10504 doc: /* Specify the coding system for write operations.
10505Programs bind this variable with `let', but you should not set it globally.
10506If the value is a coding system, it is used for encoding of output,
10507when writing it to a file and when sending it to a file or subprocess.
10508
10509If this does not specify a coding system, an appropriate element
446dcd75
JB
10510is used from one of the coding system alists.
10511There are three such tables: `file-coding-system-alist',
48b0f3ae
PJ
10512`process-coding-system-alist', and `network-coding-system-alist'.
10513For output to files, if the above procedure does not specify a coding system,
10514the value of `buffer-file-coding-system' is used. */);
4ed46869
KH
10515 Vcoding_system_for_write = Qnil;
10516
29208e82 10517 DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
df7492f9
KH
10518 doc: /*
10519Coding system used in the latest file or process I/O. */);
4ed46869
KH
10520 Vlast_coding_system_used = Qnil;
10521
29208e82 10522 DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
065e3595
KH
10523 doc: /*
10524Error status of the last code conversion.
10525
10526When an error was detected in the last code conversion, this variable
10527is set to one of the following symbols.
10528 `insufficient-source'
10529 `inconsistent-eol'
10530 `invalid-source'
10531 `interrupted'
10532 `insufficient-memory'
10533When no error was detected, the value doesn't change. So, to check
10534the error status of a code conversion by this variable, you must
10535explicitly set this variable to nil before performing code
10536conversion. */);
10537 Vlast_code_conversion_error = Qnil;
10538
29208e82 10539 DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
df7492f9
KH
10540 doc: /*
10541*Non-nil means always inhibit code conversion of end-of-line format.
48b0f3ae
PJ
10542See info node `Coding Systems' and info node `Text and Binary' concerning
10543such conversion. */);
9ce27fde
KH
10544 inhibit_eol_conversion = 0;
10545
29208e82 10546 DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
df7492f9
KH
10547 doc: /*
10548Non-nil means process buffer inherits coding system of process output.
48b0f3ae
PJ
10549Bind it to t if the process output is to be treated as if it were a file
10550read from some filesystem. */);
ed29121d
EZ
10551 inherit_process_coding_system = 0;
10552
29208e82 10553 DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
df7492f9
KH
10554 doc: /*
10555Alist to decide a coding system to use for a file I/O operation.
48b0f3ae
PJ
10556The format is ((PATTERN . VAL) ...),
10557where PATTERN is a regular expression matching a file name,
10558VAL is a coding system, a cons of coding systems, or a function symbol.
10559If VAL is a coding system, it is used for both decoding and encoding
10560the file contents.
10561If VAL is a cons of coding systems, the car part is used for decoding,
10562and the cdr part is used for encoding.
10563If VAL is a function symbol, the function must return a coding system
2c53e699
KH
10564or a cons of coding systems which are used as above. The function is
10565called with an argument that is a list of the arguments with which
5a0bbd9a
KH
10566`find-operation-coding-system' was called. If the function can't decide
10567a coding system, it can return `undecided' so that the normal
10568code-detection is performed.
48b0f3ae
PJ
10569
10570See also the function `find-operation-coding-system'
10571and the variable `auto-coding-alist'. */);
02ba4723
KH
10572 Vfile_coding_system_alist = Qnil;
10573
29208e82 10574 DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
df7492f9
KH
10575 doc: /*
10576Alist to decide a coding system to use for a process I/O operation.
48b0f3ae
PJ
10577The format is ((PATTERN . VAL) ...),
10578where PATTERN is a regular expression matching a program name,
10579VAL is a coding system, a cons of coding systems, or a function symbol.
10580If VAL is a coding system, it is used for both decoding what received
10581from the program and encoding what sent to the program.
10582If VAL is a cons of coding systems, the car part is used for decoding,
10583and the cdr part is used for encoding.
10584If VAL is a function symbol, the function must return a coding system
10585or a cons of coding systems which are used as above.
10586
10587See also the function `find-operation-coding-system'. */);
02ba4723
KH
10588 Vprocess_coding_system_alist = Qnil;
10589
29208e82 10590 DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
df7492f9
KH
10591 doc: /*
10592Alist to decide a coding system to use for a network I/O operation.
48b0f3ae
PJ
10593The format is ((PATTERN . VAL) ...),
10594where PATTERN is a regular expression matching a network service name
10595or is a port number to connect to,
10596VAL is a coding system, a cons of coding systems, or a function symbol.
10597If VAL is a coding system, it is used for both decoding what received
10598from the network stream and encoding what sent to the network stream.
10599If VAL is a cons of coding systems, the car part is used for decoding,
10600and the cdr part is used for encoding.
10601If VAL is a function symbol, the function must return a coding system
10602or a cons of coding systems which are used as above.
10603
10604See also the function `find-operation-coding-system'. */);
02ba4723 10605 Vnetwork_coding_system_alist = Qnil;
4ed46869 10606
29208e82 10607 DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
75205970
RS
10608 doc: /* Coding system to use with system messages.
10609Also used for decoding keyboard input on X Window system. */);
68c45bf0
PE
10610 Vlocale_coding_system = Qnil;
10611
005f0d35 10612 /* The eol mnemonics are reset in startup.el system-dependently. */
29208e82 10613 DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
df7492f9
KH
10614 doc: /*
10615*String displayed in mode line for UNIX-like (LF) end-of-line format. */);
2a0213a6 10616 eol_mnemonic_unix = build_pure_c_string (":");
4ed46869 10617
29208e82 10618 DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
df7492f9
KH
10619 doc: /*
10620*String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
2a0213a6 10621 eol_mnemonic_dos = build_pure_c_string ("\\");
4ed46869 10622
29208e82 10623 DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
df7492f9
KH
10624 doc: /*
10625*String displayed in mode line for MAC-like (CR) end-of-line format. */);
2a0213a6 10626 eol_mnemonic_mac = build_pure_c_string ("/");
4ed46869 10627
29208e82 10628 DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
df7492f9
KH
10629 doc: /*
10630*String displayed in mode line when end-of-line format is not yet determined. */);
2a0213a6 10631 eol_mnemonic_undecided = build_pure_c_string (":");
4ed46869 10632
29208e82 10633 DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
df7492f9
KH
10634 doc: /*
10635*Non-nil enables character translation while encoding and decoding. */);
84fbb8a0 10636 Venable_character_translation = Qt;
bdd9fb48 10637
f967223b 10638 DEFVAR_LISP ("standard-translation-table-for-decode",
29208e82 10639 Vstandard_translation_table_for_decode,
48b0f3ae 10640 doc: /* Table for translating characters while decoding. */);
f967223b 10641 Vstandard_translation_table_for_decode = Qnil;
bdd9fb48 10642
f967223b 10643 DEFVAR_LISP ("standard-translation-table-for-encode",
29208e82 10644 Vstandard_translation_table_for_encode,
48b0f3ae 10645 doc: /* Table for translating characters while encoding. */);
f967223b 10646 Vstandard_translation_table_for_encode = Qnil;
4ed46869 10647
29208e82 10648 DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
48b0f3ae
PJ
10649 doc: /* Alist of charsets vs revision numbers.
10650While encoding, if a charset (car part of an element) is found,
df7492f9
KH
10651designate it with the escape sequence identifying revision (cdr part
10652of the element). */);
10653 Vcharset_revision_table = Qnil;
02ba4723
KH
10654
10655 DEFVAR_LISP ("default-process-coding-system",
29208e82 10656 Vdefault_process_coding_system,
48b0f3ae
PJ
10657 doc: /* Cons of coding systems used for process I/O by default.
10658The car part is used for decoding a process output,
10659the cdr part is used for encoding a text to be sent to a process. */);
02ba4723 10660 Vdefault_process_coding_system = Qnil;
c4825358 10661
29208e82 10662 DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
df7492f9
KH
10663 doc: /*
10664Table of extra Latin codes in the range 128..159 (inclusive).
48b0f3ae
PJ
10665This is a vector of length 256.
10666If Nth element is non-nil, the existence of code N in a file
10667\(or output of subprocess) doesn't prevent it to be detected as
10668a coding system of ISO 2022 variant which has a flag
10669`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10670or reading output of a subprocess.
446dcd75 10671Only 128th through 159th elements have a meaning. */);
3f003981 10672 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
d46c5b12
KH
10673
10674 DEFVAR_LISP ("select-safe-coding-system-function",
29208e82 10675 Vselect_safe_coding_system_function,
df7492f9
KH
10676 doc: /*
10677Function to call to select safe coding system for encoding a text.
48b0f3ae
PJ
10678
10679If set, this function is called to force a user to select a proper
10680coding system which can encode the text in the case that a default
fdecf907
GM
10681coding system used in each operation can't encode the text. The
10682function should take care that the buffer is not modified while
10683the coding system is being selected.
48b0f3ae
PJ
10684
10685The default value is `select-safe-coding-system' (which see). */);
d46c5b12
KH
10686 Vselect_safe_coding_system_function = Qnil;
10687
5d5bf4d8 10688 DEFVAR_BOOL ("coding-system-require-warning",
29208e82 10689 coding_system_require_warning,
5d5bf4d8 10690 doc: /* Internal use only.
6b89e3aa
KH
10691If non-nil, on writing a file, `select-safe-coding-system-function' is
10692called even if `coding-system-for-write' is non-nil. The command
10693`universal-coding-system-argument' binds this variable to t temporarily. */);
5d5bf4d8
KH
10694 coding_system_require_warning = 0;
10695
10696
22ab2303 10697 DEFVAR_BOOL ("inhibit-iso-escape-detection",
29208e82 10698 inhibit_iso_escape_detection,
df7492f9 10699 doc: /*
97b1b294 10700If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
48b0f3ae 10701
97b1b294
EZ
10702When Emacs reads text, it tries to detect how the text is encoded.
10703This code detection is sensitive to escape sequences. If Emacs sees
10704a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10705of the ISO2022 encodings, and decodes text by the corresponding coding
10706system (e.g. `iso-2022-7bit').
48b0f3ae
PJ
10707
10708However, there may be a case that you want to read escape sequences in
10709a file as is. In such a case, you can set this variable to non-nil.
97b1b294
EZ
10710Then the code detection will ignore any escape sequences, and no text is
10711detected as encoded in some ISO-2022 encoding. The result is that all
48b0f3ae
PJ
10712escape sequences become visible in a buffer.
10713
10714The default value is nil, and it is strongly recommended not to change
10715it. That is because many Emacs Lisp source files that contain
10716non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10717in Emacs's distribution, and they won't be decoded correctly on
10718reading if you suppress escape sequence detection.
10719
10720The other way to read escape sequences in a file without decoding is
97b1b294 10721to explicitly specify some coding system that doesn't use ISO-2022
48b0f3ae 10722escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
74383408 10723 inhibit_iso_escape_detection = 0;
002fdb44 10724
97b1b294 10725 DEFVAR_BOOL ("inhibit-null-byte-detection",
29208e82 10726 inhibit_null_byte_detection,
97b1b294
EZ
10727 doc: /* If non-nil, Emacs ignores null bytes on code detection.
10728By default, Emacs treats it as binary data, and does not attempt to
10729decode it. The effect is as if you specified `no-conversion' for
10730reading that text.
10731
10732Set this to non-nil when a regular text happens to include null bytes.
10733Examples are Index nodes of Info files and null-byte delimited output
10734from GNU Find and GNU Grep. Emacs will then ignore the null bytes and
10735decode text as usual. */);
10736 inhibit_null_byte_detection = 0;
10737
29208e82 10738 DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
15c8f9d1 10739 doc: /* Char table for translating self-inserting characters.
446dcd75 10740This is applied to the result of input methods, not their input.
8434d0b8
EZ
10741See also `keyboard-translate-table'.
10742
10743Use of this variable for character code unification was rendered
10744obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10745internal character representation. */);
002fdb44 10746 Vtranslation_table_for_input = Qnil;
8f924df7 10747
2c78b7e1
KH
10748 {
10749 Lisp_Object args[coding_arg_max];
8f924df7 10750 Lisp_Object plist[16];
2c78b7e1
KH
10751 int i;
10752
10753 for (i = 0; i < coding_arg_max; i++)
10754 args[i] = Qnil;
10755
d67b4f80 10756 plist[0] = intern_c_string (":name");
2c78b7e1 10757 plist[1] = args[coding_arg_name] = Qno_conversion;
d67b4f80 10758 plist[2] = intern_c_string (":mnemonic");
2c78b7e1 10759 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
d67b4f80 10760 plist[4] = intern_c_string (":coding-type");
2c78b7e1 10761 plist[5] = args[coding_arg_coding_type] = Qraw_text;
d67b4f80 10762 plist[6] = intern_c_string (":ascii-compatible-p");
2c78b7e1 10763 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
d67b4f80 10764 plist[8] = intern_c_string (":default-char");
2c78b7e1 10765 plist[9] = args[coding_arg_default_char] = make_number (0);
d67b4f80 10766 plist[10] = intern_c_string (":for-unibyte");
8f924df7 10767 plist[11] = args[coding_arg_for_unibyte] = Qt;
d67b4f80 10768 plist[12] = intern_c_string (":docstring");
2a0213a6 10769 plist[13] = build_pure_c_string ("Do no conversion.\n\
2c78b7e1
KH
10770\n\
10771When you visit a file with this coding, the file is read into a\n\
10772unibyte buffer as is, thus each byte of a file is treated as a\n\
10773character.");
d67b4f80 10774 plist[14] = intern_c_string (":eol-type");
8f924df7
KH
10775 plist[15] = args[coding_arg_eol_type] = Qunix;
10776 args[coding_arg_plist] = Flist (16, plist);
2c78b7e1 10777 Fdefine_coding_system_internal (coding_arg_max, args);
ae6f73fa
KH
10778
10779 plist[1] = args[coding_arg_name] = Qundecided;
10780 plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10781 plist[5] = args[coding_arg_coding_type] = Qundecided;
10782 /* This is already set.
35befdaa 10783 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
d67b4f80 10784 plist[8] = intern_c_string (":charset-list");
ae6f73fa
KH
10785 plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10786 plist[11] = args[coding_arg_for_unibyte] = Qnil;
2a0213a6 10787 plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
ae6f73fa
KH
10788 plist[15] = args[coding_arg_eol_type] = Qnil;
10789 args[coding_arg_plist] = Flist (16, plist);
10790 Fdefine_coding_system_internal (coding_arg_max, args);
2c78b7e1
KH
10791 }
10792
2c78b7e1 10793 setup_coding_system (Qno_conversion, &safe_terminal_coding);
ff563fce
KH
10794
10795 {
10796 int i;
10797
10798 for (i = 0; i < coding_category_max; i++)
10799 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10800 }
1a4990fb 10801#if defined (DOS_NT)
fcbcfb64
KH
10802 system_eol_type = Qdos;
10803#else
10804 system_eol_type = Qunix;
10805#endif
10806 staticpro (&system_eol_type);
4ed46869
KH
10807}
10808
68c45bf0 10809char *
971de7fb 10810emacs_strerror (int error_number)
68c45bf0
PE
10811{
10812 char *str;
10813
ca9c0567 10814 synchronize_system_messages_locale ();
68c45bf0
PE
10815 str = strerror (error_number);
10816
10817 if (! NILP (Vlocale_coding_system))
10818 {
10819 Lisp_Object dec = code_convert_string_norecord (build_string (str),
10820 Vlocale_coding_system,
10821 0);
51b59d79 10822 str = SSDATA (dec);
68c45bf0
PE
10823 }
10824
10825 return str;
10826}
10827
4ed46869 10828#endif /* emacs */