* xsettings.c, xsettings.h (xsettings_get_system_normal_font):
[bpt/emacs.git] / src / coding.c
CommitLineData
9542cb1f 1/* Coding system handler (conversion, detection, etc).
73b0cd50 2 Copyright (C) 2001-2011 Free Software Foundation, Inc.
7976eda0 3 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
5df4f04c 4 2005, 2006, 2007, 2008, 2009, 2010, 2011
ce03bf76
KH
5 National Institute of Advanced Industrial Science and Technology (AIST)
6 Registration Number H14PRO021
8f924df7 7 Copyright (C) 2003
df7492f9
KH
8 National Institute of Advanced Industrial Science and Technology (AIST)
9 Registration Number H13PRO009
4ed46869 10
369314dc
KH
11This file is part of GNU Emacs.
12
9ec0b715 13GNU Emacs is free software: you can redistribute it and/or modify
369314dc 14it under the terms of the GNU General Public License as published by
9ec0b715
GM
15the Free Software Foundation, either version 3 of the License, or
16(at your option) any later version.
4ed46869 17
369314dc
KH
18GNU Emacs is distributed in the hope that it will be useful,
19but WITHOUT ANY WARRANTY; without even the implied warranty of
20MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21GNU General Public License for more details.
4ed46869 22
369314dc 23You should have received a copy of the GNU General Public License
9ec0b715 24along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
4ed46869
KH
25
26/*** TABLE OF CONTENTS ***
27
b73bfc1c 28 0. General comments
4ed46869 29 1. Preamble
df7492f9
KH
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
4ed46869
KH
41
42*/
43
df7492f9 44/*** 0. General comments ***
b73bfc1c
KH
45
46
df7492f9 47CODING SYSTEM
4ed46869 48
5bad0796
DL
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
e19c3639
KH
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
0ef69138 56 coding system.
4ed46869 57
e19c3639
KH
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
5bad0796 60 stored in the hash table Vcoding_system_hash_table. The conversion from
e19c3639
KH
61 coding system symbol to attributes vector is done by looking up
62 Vcoding_system_hash_table by the symbol.
4ed46869 63
e19c3639 64 Coding systems are classified into the following types depending on
5bad0796 65 the encoding mechanism. Here's a brief description of the types.
4ed46869 66
df7492f9
KH
67 o UTF-8
68
69 o UTF-16
70
71 o Charset-base coding system
72
73 A coding system defined by one or more (coded) character sets.
5bad0796 74 Decoding and encoding are done by a code converter defined for each
df7492f9
KH
75 character set.
76
5bad0796 77 o Old Emacs internal format (emacs-mule)
df7492f9 78
5bad0796 79 The coding system adopted by old versions of Emacs (20 and 21).
4ed46869 80
df7492f9 81 o ISO2022-base coding system
4ed46869
KH
82
83 The most famous coding system for multiple character sets. X's
df7492f9
KH
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
4ed46869 87
df7492f9 88 o SJIS (or Shift-JIS or MS-Kanji-Code)
93dec019 89
4ed46869
KH
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
df7492f9 92 section 8.
4ed46869 93
df7492f9 94 o BIG5
4ed46869 95
df7492f9 96 A coding system to encode character sets: ASCII and Big5. Widely
cfb43547 97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
df7492f9
KH
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
4ed46869 101
df7492f9 102 o CCL
27901516 103
5bad0796 104 If a user wants to decode/encode text encoded in a coding system
df7492f9
KH
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
27901516 108
df7492f9 109 o Raw-text
4ed46869 110
5a936b46 111 A coding system for text containing raw eight-bit data. Emacs
5bad0796 112 treats each byte of source text as a character (except for
df7492f9 113 end-of-line conversion).
4ed46869 114
df7492f9
KH
115 o No-conversion
116
117 Like raw text, but don't do end-of-line conversion.
4ed46869 118
4ed46869 119
df7492f9 120END-OF-LINE FORMAT
4ed46869 121
5bad0796 122 How text end-of-line is encoded depends on operating system. For
df7492f9 123 instance, Unix's format is just one byte of LF (line-feed) code,
f4dee582 124 whereas DOS's format is two-byte sequence of `carriage-return' and
d46c5b12
KH
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
4ed46869 127
cfb43547 128 Since text character encoding and end-of-line encoding are
df7492f9
KH
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
4ed46869 131
e19c3639
KH
132STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
5bad0796 137 conversion (e.g. the location of source and destination data).
4ed46869
KH
138
139*/
140
df7492f9
KH
141/* COMMON MACROS */
142
143
4ed46869
KH
144/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145
df7492f9 146 These functions check if a byte sequence specified as a source in
ff0dacd7
KH
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
df7492f9 149
ff0dacd7 150 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
df7492f9
KH
151
152 Below is the template of these functions. */
153
4ed46869 154#if 0
df7492f9 155static int
cf84bb53
JB
156detect_coding_XXX (struct coding_system *coding,
157 struct coding_detection_info *detect_info)
4ed46869 158{
f1d34bca
MB
159 const unsigned char *src = coding->source;
160 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9 161 int multibytep = coding->src_multibyte;
a53e2e89 162 EMACS_INT consumed_chars = 0;
df7492f9
KH
163 int found = 0;
164 ...;
165
166 while (1)
167 {
ad1746f5 168 /* Get one byte from the source. If the source is exhausted, jump
df7492f9
KH
169 to no_more_source:. */
170 ONE_MORE_BYTE (c);
ff0dacd7
KH
171
172 if (! __C_conforms_to_XXX___ (c))
173 break;
174 if (! __C_strongly_suggests_XXX__ (c))
175 found = CATEGORY_MASK_XXX;
df7492f9 176 }
ff0dacd7
KH
177 /* The byte sequence is invalid for XXX. */
178 detect_info->rejected |= CATEGORY_MASK_XXX;
df7492f9 179 return 0;
ff0dacd7 180
df7492f9 181 no_more_source:
ad1746f5 182 /* The source exhausted successfully. */
ff0dacd7 183 detect_info->found |= found;
df7492f9 184 return 1;
4ed46869
KH
185}
186#endif
187
188/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
189
df7492f9
KH
190 These functions decode a byte sequence specified as a source by
191 CODING. The resulting multibyte text goes to a place pointed to by
192 CODING->charbuf, the length of which should not exceed
193 CODING->charbuf_size;
d46c5b12 194
df7492f9
KH
195 These functions set the information of original and decoded texts in
196 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
197 They also set CODING->result to one of CODING_RESULT_XXX indicating
198 how the decoding is finished.
d46c5b12 199
df7492f9 200 Below is the template of these functions. */
d46c5b12 201
4ed46869 202#if 0
b73bfc1c 203static void
cf84bb53 204decode_coding_XXXX (struct coding_system *coding)
4ed46869 205{
f1d34bca
MB
206 const unsigned char *src = coding->source + coding->consumed;
207 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
208 /* SRC_BASE remembers the start position in source in each loop.
209 The loop will be exited when there's not enough source code, or
210 when there's no room in CHARBUF for a decoded character. */
f1d34bca 211 const unsigned char *src_base;
df7492f9 212 /* A buffer to produce decoded characters. */
69a80ea3
KH
213 int *charbuf = coding->charbuf + coding->charbuf_used;
214 int *charbuf_end = coding->charbuf + coding->charbuf_size;
df7492f9
KH
215 int multibytep = coding->src_multibyte;
216
217 while (1)
218 {
219 src_base = src;
220 if (charbuf < charbuf_end)
221 /* No more room to produce a decoded character. */
222 break;
223 ONE_MORE_BYTE (c);
224 /* Decode it. */
225 }
226
227 no_more_source:
228 if (src_base < src_end
229 && coding->mode & CODING_MODE_LAST_BLOCK)
230 /* If the source ends by partial bytes to construct a character,
231 treat them as eight-bit raw data. */
232 while (src_base < src_end && charbuf < charbuf_end)
233 *charbuf++ = *src_base++;
234 /* Remember how many bytes and characters we consumed. If the
235 source is multibyte, the bytes and chars are not identical. */
236 coding->consumed = coding->consumed_char = src_base - coding->source;
237 /* Remember how many characters we produced. */
238 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
239}
240#endif
241
242/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
243
df7492f9
KH
244 These functions encode SRC_BYTES length text at SOURCE of Emacs'
245 internal multibyte format by CODING. The resulting byte sequence
b73bfc1c
KH
246 goes to a place pointed to by DESTINATION, the length of which
247 should not exceed DST_BYTES.
d46c5b12 248
df7492f9
KH
249 These functions set the information of original and encoded texts in
250 the members produced, produced_char, consumed, and consumed_char of
251 the structure *CODING. They also set the member result to one of
252 CODING_RESULT_XXX indicating how the encoding finished.
d46c5b12 253
df7492f9
KH
254 DST_BYTES zero means that source area and destination area are
255 overlapped, which means that we can produce an encoded text until it
256 reaches the head of not-yet-encoded source text.
d46c5b12 257
df7492f9 258 Below is a template of these functions. */
4ed46869 259#if 0
b73bfc1c 260static void
cf84bb53 261encode_coding_XXX (struct coding_system *coding)
4ed46869 262{
df7492f9
KH
263 int multibytep = coding->dst_multibyte;
264 int *charbuf = coding->charbuf;
265 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
266 unsigned char *dst = coding->destination + coding->produced;
267 unsigned char *dst_end = coding->destination + coding->dst_bytes;
268 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
a53e2e89 269 EMACS_INT produced_chars = 0;
df7492f9
KH
270
271 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
272 {
273 int c = *charbuf;
274 /* Encode C into DST, and increment DST. */
275 }
276 label_no_more_destination:
277 /* How many chars and bytes we produced. */
278 coding->produced_char += produced_chars;
279 coding->produced = dst - coding->destination;
4ed46869
KH
280}
281#endif
282
4ed46869
KH
283\f
284/*** 1. Preamble ***/
285
68c45bf0 286#include <config.h>
4ed46869 287#include <stdio.h>
d7306fe6 288#include <setjmp.h>
4ed46869 289
4ed46869
KH
290#include "lisp.h"
291#include "buffer.h"
df7492f9 292#include "character.h"
4ed46869
KH
293#include "charset.h"
294#include "ccl.h"
df7492f9 295#include "composite.h"
4ed46869
KH
296#include "coding.h"
297#include "window.h"
b8299c66
KL
298#include "frame.h"
299#include "termhooks.h"
4ed46869 300
df7492f9 301Lisp_Object Vcoding_system_hash_table;
4ed46869 302
955cbe7b
PE
303static Lisp_Object Qcoding_system, Qeol_type;
304static Lisp_Object Qcoding_aliases;
1965cb73 305Lisp_Object Qunix, Qdos;
4ed46869 306Lisp_Object Qbuffer_file_coding_system;
955cbe7b
PE
307static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
308static Lisp_Object Qdefault_char;
27901516 309Lisp_Object Qno_conversion, Qundecided;
955cbe7b
PE
310Lisp_Object Qcharset, Qutf_8;
311static Lisp_Object Qiso_2022;
312static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
313static Lisp_Object Qbig, Qlittle;
314static Lisp_Object Qcoding_system_history;
315static Lisp_Object Qvalid_codes;
316static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
317static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
318static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
319static Lisp_Object QCascii_compatible_p;
4ed46869 320
387f6ba5 321Lisp_Object Qcall_process, Qcall_process_region;
4ed46869 322Lisp_Object Qstart_process, Qopen_network_stream;
955cbe7b 323static Lisp_Object Qtarget_idx;
4ed46869 324
955cbe7b
PE
325static Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
326static Lisp_Object Qinterrupted, Qinsufficient_memory;
065e3595 327
44e8490d
KH
328/* If a symbol has this property, evaluate the value to define the
329 symbol as a coding system. */
330static Lisp_Object Qcoding_system_define_form;
331
fcbcfb64
KH
332/* Format of end-of-line decided by system. This is Qunix on
333 Unix and Mac, Qdos on DOS/Windows.
334 This has an effect only for external encoding (i.e. for output to
335 file and process), not for in-buffer or Lisp string encoding. */
336static Lisp_Object system_eol_type;
337
4ed46869
KH
338#ifdef emacs
339
4608c386 340Lisp_Object Qcoding_system_p, Qcoding_system_error;
4ed46869 341
d46c5b12
KH
342/* Coding system emacs-mule and raw-text are for converting only
343 end-of-line format. */
344Lisp_Object Qemacs_mule, Qraw_text;
8f924df7 345Lisp_Object Qutf_8_emacs;
ecf488bc 346
4ed46869
KH
347/* Coding-systems are handed between Emacs Lisp programs and C internal
348 routines by the following three variables. */
c4825358
KH
349/* Coding system to be used to encode text for terminal display when
350 terminal coding system is nil. */
351struct coding_system safe_terminal_coding;
352
4ed46869
KH
353#endif /* emacs */
354
f967223b
KH
355Lisp_Object Qtranslation_table;
356Lisp_Object Qtranslation_table_id;
955cbe7b
PE
357static Lisp_Object Qtranslation_table_for_decode;
358static Lisp_Object Qtranslation_table_for_encode;
4ed46869 359
df7492f9 360/* Two special coding systems. */
74ab6df5
PE
361static Lisp_Object Vsjis_coding_system;
362static Lisp_Object Vbig5_coding_system;
df7492f9 363
df7492f9
KH
364/* ISO2022 section */
365
366#define CODING_ISO_INITIAL(coding, reg) \
367 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
368 coding_attr_iso_initial), \
369 reg)))
370
371
1b3b981b
AS
/* Evaluate to the entry of CODING->safe_charsets for CHARSET_ID, or to
   -1 when CHARSET_ID is greater than CODING->max_charset_id or when
   that entry is 255.  CHARSET_ID is evaluated more than once, so it
   must not have side effects.  */
372#define CODING_ISO_REQUEST(coding, charset_id) \
373 (((charset_id) <= (coding)->max_charset_id \
374 ? ((coding)->safe_charsets[charset_id] != 255 \
375 ? (coding)->safe_charsets[charset_id] \
376 : -1) \
df7492f9
KH
377 : -1))
378
379
380#define CODING_ISO_FLAGS(coding) \
381 ((coding)->spec.iso_2022.flags)
382#define CODING_ISO_DESIGNATION(coding, reg) \
383 ((coding)->spec.iso_2022.current_designation[reg])
384#define CODING_ISO_INVOCATION(coding, plane) \
385 ((coding)->spec.iso_2022.current_invocation[plane])
386#define CODING_ISO_SINGLE_SHIFTING(coding) \
387 ((coding)->spec.iso_2022.single_shifting)
388#define CODING_ISO_BOL(coding) \
389 ((coding)->spec.iso_2022.bol)
390#define CODING_ISO_INVOKED_CHARSET(coding, plane) \
391 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
e951386e
KH
392#define CODING_ISO_CMP_STATUS(coding) \
393 (&(coding)->spec.iso_2022.cmp_status)
394#define CODING_ISO_EXTSEGMENT_LEN(coding) \
395 ((coding)->spec.iso_2022.ctext_extended_segment_len)
396#define CODING_ISO_EMBEDDED_UTF_8(coding) \
397 ((coding)->spec.iso_2022.embedded_utf_8)
df7492f9
KH
398
399/* Control characters of ISO2022. */
400 /* code */ /* function */
df7492f9
KH
401#define ISO_CODE_SO 0x0E /* shift-out */
402#define ISO_CODE_SI 0x0F /* shift-in */
403#define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
404#define ISO_CODE_ESC 0x1B /* escape */
405#define ISO_CODE_SS2 0x8E /* single-shift-2 */
406#define ISO_CODE_SS3 0x8F /* single-shift-3 */
407#define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
408
409/* Every 1-byte code of ISO2022 is classified into one of the
410 following classes. */
411enum iso_code_class_type
412 {
413 ISO_control_0, /* Control codes in the range
414 0x00..0x1F and 0x7F, except for the
415 following 4 codes. */
df7492f9
KH
416 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
417 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
418 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
419 ISO_escape, /* ISO_CODE_ESC (0x1B) */
420 ISO_control_1, /* Control codes in the range
421 0x80..0x9F, except for the
422 following 3 codes. */
423 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
424 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
425 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
426 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
427 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
428 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
429 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
430 };
05e6f5dc 431
df7492f9
KH
432/** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
433 `iso-flags' attribute of an iso2022 coding system. */
05e6f5dc 434
df7492f9
KH
435/* If set, produce long-form designation sequence (e.g. ESC $ ( A)
436 instead of the correct short-form sequence (e.g. ESC $ A). */
437#define CODING_ISO_FLAG_LONG_FORM 0x0001
93dec019 438
df7492f9
KH
439/* If set, reset graphic planes and registers at end-of-line to the
440 initial state. */
441#define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
05e6f5dc 442
df7492f9
KH
443/* If set, reset graphic planes and registers before any control
444 characters to the initial state. */
445#define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
05e6f5dc 446
df7492f9
KH
447/* If set, encode by 7-bit environment. */
448#define CODING_ISO_FLAG_SEVEN_BITS 0x0008
4ed46869 449
df7492f9
KH
450/* If set, use locking-shift function. */
451#define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
b73bfc1c 452
df7492f9
KH
453/* If set, use single-shift function. Overwrite
454 CODING_ISO_FLAG_LOCKING_SHIFT. */
455#define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
b73bfc1c 456
df7492f9
KH
457/* If set, use designation escape sequence. */
458#define CODING_ISO_FLAG_DESIGNATION 0x0040
b73bfc1c 459
df7492f9
KH
460/* If set, produce revision number sequence. */
461#define CODING_ISO_FLAG_REVISION 0x0080
b73bfc1c 462
df7492f9
KH
463/* If set, produce ISO6429's direction specifying sequence. */
464#define CODING_ISO_FLAG_DIRECTION 0x0100
f4dee582 465
df7492f9
KH
466/* If set, assume designation states are reset at beginning of line on
467 output. */
468#define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
4ed46869 469
df7492f9
KH
470/* If set, designation sequence should be placed at beginning of line
471 on output. */
472#define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
aa72b389 473
ad1746f5 474/* If set, do not encode unsafe characters on output. */
df7492f9 475#define CODING_ISO_FLAG_SAFE 0x0800
aa72b389 476
df7492f9
KH
477/* If set, extra latin codes (128..159) are accepted as a valid code
478 on input. */
479#define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
aa72b389 480
df7492f9 481#define CODING_ISO_FLAG_COMPOSITION 0x2000
aa72b389 482
5f58e762 483/* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
aa72b389 484
bf16eb23 485#define CODING_ISO_FLAG_USE_ROMAN 0x8000
aa72b389 486
bf16eb23 487#define CODING_ISO_FLAG_USE_OLDJIS 0x10000
aa72b389 488
bf16eb23 489#define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
aa72b389 490
df7492f9
KH
491/* A character to be produced on output if encoding of the original
492 character is prohibited by CODING_ISO_FLAG_SAFE. */
493#define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
aa72b389 494
a470d443
KH
495/* UTF-8 section */
496#define CODING_UTF_8_BOM(coding) \
497 ((coding)->spec.utf_8_bom)
4ed46869 498
df7492f9
KH
499/* UTF-16 section */
500#define CODING_UTF_16_BOM(coding) \
501 ((coding)->spec.utf_16.bom)
4ed46869 502
df7492f9
KH
503#define CODING_UTF_16_ENDIAN(coding) \
504 ((coding)->spec.utf_16.endian)
4ed46869 505
df7492f9
KH
506#define CODING_UTF_16_SURROGATE(coding) \
507 ((coding)->spec.utf_16.surrogate)
4ed46869 508
4ed46869 509
df7492f9
KH
510/* CCL section */
511#define CODING_CCL_DECODER(coding) \
512 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
513#define CODING_CCL_ENCODER(coding) \
514 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
515#define CODING_CCL_VALIDS(coding) \
8f924df7 516 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
4ed46869 517
5a936b46 518/* Index for each coding category in `coding_categories' */
4ed46869 519
df7492f9
KH
520enum coding_category
521 {
522 coding_category_iso_7,
523 coding_category_iso_7_tight,
524 coding_category_iso_8_1,
525 coding_category_iso_8_2,
526 coding_category_iso_7_else,
527 coding_category_iso_8_else,
a470d443
KH
528 coding_category_utf_8_auto,
529 coding_category_utf_8_nosig,
530 coding_category_utf_8_sig,
df7492f9
KH
531 coding_category_utf_16_auto,
532 coding_category_utf_16_be,
533 coding_category_utf_16_le,
534 coding_category_utf_16_be_nosig,
535 coding_category_utf_16_le_nosig,
536 coding_category_charset,
537 coding_category_sjis,
538 coding_category_big5,
539 coding_category_ccl,
540 coding_category_emacs_mule,
541 /* All above are targets of code detection. */
542 coding_category_raw_text,
543 coding_category_undecided,
544 coding_category_max
545 };
546
547/* Definitions of flag bits used in detect_coding_XXXX. */
548#define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
549#define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
550#define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
551#define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
552#define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
553#define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
a470d443
KH
554#define CATEGORY_MASK_UTF_8_AUTO (1 << coding_category_utf_8_auto)
555#define CATEGORY_MASK_UTF_8_NOSIG (1 << coding_category_utf_8_nosig)
556#define CATEGORY_MASK_UTF_8_SIG (1 << coding_category_utf_8_sig)
b49a1807 557#define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
df7492f9
KH
558#define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
559#define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
560#define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
561#define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
562#define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
563#define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
564#define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
565#define CATEGORY_MASK_CCL (1 << coding_category_ccl)
566#define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
ff0dacd7 567#define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
df7492f9
KH
568
569/* This value is returned if detect_coding_mask () finds nothing other
570 than ASCII characters. */
571#define CATEGORY_MASK_ANY \
572 (CATEGORY_MASK_ISO_7 \
573 | CATEGORY_MASK_ISO_7_TIGHT \
574 | CATEGORY_MASK_ISO_8_1 \
575 | CATEGORY_MASK_ISO_8_2 \
576 | CATEGORY_MASK_ISO_7_ELSE \
577 | CATEGORY_MASK_ISO_8_ELSE \
a470d443
KH
578 | CATEGORY_MASK_UTF_8_AUTO \
579 | CATEGORY_MASK_UTF_8_NOSIG \
580 | CATEGORY_MASK_UTF_8_SIG \
2f3cbb32 581 | CATEGORY_MASK_UTF_16_AUTO \
df7492f9
KH
582 | CATEGORY_MASK_UTF_16_BE \
583 | CATEGORY_MASK_UTF_16_LE \
584 | CATEGORY_MASK_UTF_16_BE_NOSIG \
585 | CATEGORY_MASK_UTF_16_LE_NOSIG \
586 | CATEGORY_MASK_CHARSET \
587 | CATEGORY_MASK_SJIS \
588 | CATEGORY_MASK_BIG5 \
589 | CATEGORY_MASK_CCL \
590 | CATEGORY_MASK_EMACS_MULE)
591
592
593#define CATEGORY_MASK_ISO_7BIT \
594 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
595
596#define CATEGORY_MASK_ISO_8BIT \
597 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
598
599#define CATEGORY_MASK_ISO_ELSE \
600 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
601
602#define CATEGORY_MASK_ISO_ESCAPE \
603 (CATEGORY_MASK_ISO_7 \
604 | CATEGORY_MASK_ISO_7_TIGHT \
605 | CATEGORY_MASK_ISO_7_ELSE \
606 | CATEGORY_MASK_ISO_8_ELSE)
607
608#define CATEGORY_MASK_ISO \
609 ( CATEGORY_MASK_ISO_7BIT \
610 | CATEGORY_MASK_ISO_8BIT \
611 | CATEGORY_MASK_ISO_ELSE)
612
613#define CATEGORY_MASK_UTF_16 \
2f3cbb32
KH
614 (CATEGORY_MASK_UTF_16_AUTO \
615 | CATEGORY_MASK_UTF_16_BE \
df7492f9
KH
616 | CATEGORY_MASK_UTF_16_LE \
617 | CATEGORY_MASK_UTF_16_BE_NOSIG \
618 | CATEGORY_MASK_UTF_16_LE_NOSIG)
619
a470d443
KH
620#define CATEGORY_MASK_UTF_8 \
621 (CATEGORY_MASK_UTF_8_AUTO \
622 | CATEGORY_MASK_UTF_8_NOSIG \
623 | CATEGORY_MASK_UTF_8_SIG)
df7492f9 624
df7492f9 625/* Table of coding categories (Lisp symbols). This variable is for
ad1746f5 626 internal use only. */
df7492f9
KH
627static Lisp_Object Vcoding_category_table;
628
629/* Table of coding-categories ordered by priority. */
630static enum coding_category coding_priorities[coding_category_max];
631
632/* Nth element is a coding context for the coding system bound to the
633 Nth coding category. */
634static struct coding_system coding_categories[coding_category_max];
635
df7492f9
KH
636/*** Commonly used macros and functions ***/
637
638#ifndef min
639#define min(a, b) ((a) < (b) ? (a) : (b))
640#endif
641#ifndef max
642#define max(a, b) ((a) > (b) ? (a) : (b))
643#endif
4ed46869 644
24a73b0a
KH
/* Set ATTRS to the result of CODING_ID_ATTRS for CODING->id, and
   CHARSET_LIST to the result of CODING_ATTR_CHARSET_LIST for those
   attributes.  ATTRS and CHARSET_LIST must be assignable lvalues.  */
645#define CODING_GET_INFO(coding, attrs, charset_list) \
646 do { \
647 (attrs) = CODING_ID_ATTRS ((coding)->id); \
648 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
df7492f9 649 } while (0)
4ed46869 650
4ed46869 651
df7492f9
KH
652/* Safely get one byte from the source text pointed by SRC which ends
653 at SRC_END, and set C to that byte. If there are not enough bytes
065e3595
KH
654 in the source, it jumps to `no_more_source'. If multibytep is
655 nonzero, and a multibyte character is found at SRC, set C to the
656 negative value of the character code. The caller should declare
657 and set these variables appropriately in advance:
658 src, src_base, src_end, multibytep, consumed_chars, coding */
aa72b389 659
065e3595
KH
660#define ONE_MORE_BYTE(c) \
661 do { \
662 if (src == src_end) \
663 { \
664 if (src_base < src) \
665 record_conversion_result \
666 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
667 goto no_more_source; \
668 } \
669 c = *src++; \
670 if (multibytep && (c & 0x80)) \
671 { \
672 if ((c & 0xFE) == 0xC0) \
673 c = ((c & 1) << 6) | *src++; \
674 else \
675 { \
35befdaa
KH
676 src--; \
677 c = - string_char (src, &src, NULL); \
065e3595
KH
678 record_conversion_result \
679 (coding, CODING_RESULT_INVALID_SRC); \
680 } \
681 } \
682 consumed_chars++; \
aa72b389
KH
683 } while (0)
684
f56a4450 685/* Safely get two bytes from the source text pointed by SRC which ends
220eeac9
KH
686 at SRC_END, and set C1 and C2 to those bytes while skipping the
687 heading multibyte characters. If there are not enough bytes in the
688 source, it jumps to `no_more_source'. If multibytep is nonzero and
689 a multibyte character is found for C2, set C2 to -1 (the character
690 code itself is not decoded). The caller should declare and set these
691 variables appropriately in advance:
f56a4450
KH
692 src, src_end, multibytep
693 It is intended that this macro is used in detect_coding_utf_16. */
694
220eeac9
KH
695#define TWO_MORE_BYTES(c1, c2) \
696 do { \
697 do { \
698 if (src == src_end) \
699 goto no_more_source; \
700 c1 = *src++; \
701 if (multibytep && (c1 & 0x80)) \
702 { \
703 if ((c1 & 0xFE) == 0xC0) \
704 c1 = ((c1 & 1) << 6) | *src++; \
705 else \
706 { \
707 src += BYTES_BY_CHAR_HEAD (c1) - 1; \
708 c1 = -1; \
709 } \
710 } \
711 } while (c1 < 0); \
712 if (src == src_end) \
713 goto no_more_source; \
714 c2 = *src++; \
715 if (multibytep && (c2 & 0x80)) \
716 { \
717 if ((c2 & 0xFE) == 0xC0) \
718 c2 = ((c2 & 1) << 6) | *src++; \
719 else \
720 c2 = -1; \
721 } \
f56a4450
KH
722 } while (0)
723
aa72b389 724
df7492f9
KH
725/* Store a byte C in the place pointed by DST and increment DST to the
726 next free point, and increment PRODUCED_CHARS. The caller should
727 assure that C is 0..127, and declare and set the variable `dst'
728 appropriately in advance.
729*/
aa72b389
KH
730
731
df7492f9
KH
732#define EMIT_ONE_ASCII_BYTE(c) \
733 do { \
734 produced_chars++; \
735 *dst++ = (c); \
b6871cc7 736 } while (0)
aa72b389
KH
737
738
ad1746f5 739/* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2. */
aa72b389 740
df7492f9
KH
741#define EMIT_TWO_ASCII_BYTES(c1, c2) \
742 do { \
743 produced_chars += 2; \
744 *dst++ = (c1), *dst++ = (c2); \
745 } while (0)
aa72b389
KH
746
747
df7492f9
KH
748/* Store a byte C in the place pointed by DST and increment DST to the
749 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
750 nonzero, store in an appropriate multibyte form. The caller should
751 declare and set the variables `dst' and `multibytep' appropriately
752 in advance. */
753
754#define EMIT_ONE_BYTE(c) \
755 do { \
756 produced_chars++; \
757 if (multibytep) \
758 { \
b25d760e 759 unsigned ch = (c); \
df7492f9
KH
760 if (ch >= 0x80) \
761 ch = BYTE8_TO_CHAR (ch); \
762 CHAR_STRING_ADVANCE (ch, dst); \
763 } \
764 else \
765 *dst++ = (c); \
aa72b389 766 } while (0)
aa72b389 767
aa72b389 768
df7492f9 769/* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
aa72b389 770
e19c3639
KH
771#define EMIT_TWO_BYTES(c1, c2) \
772 do { \
773 produced_chars += 2; \
774 if (multibytep) \
775 { \
b25d760e 776 unsigned ch; \
e19c3639
KH
777 \
778 ch = (c1); \
779 if (ch >= 0x80) \
780 ch = BYTE8_TO_CHAR (ch); \
781 CHAR_STRING_ADVANCE (ch, dst); \
782 ch = (c2); \
783 if (ch >= 0x80) \
784 ch = BYTE8_TO_CHAR (ch); \
785 CHAR_STRING_ADVANCE (ch, dst); \
786 } \
787 else \
788 { \
789 *dst++ = (c1); \
790 *dst++ = (c2); \
791 } \
aa72b389
KH
792 } while (0)
793
794
df7492f9
KH
795#define EMIT_THREE_BYTES(c1, c2, c3) \
796 do { \
797 EMIT_ONE_BYTE (c1); \
798 EMIT_TWO_BYTES (c2, c3); \
799 } while (0)
aa72b389 800
aa72b389 801
df7492f9
KH
802#define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
803 do { \
804 EMIT_TWO_BYTES (c1, c2); \
805 EMIT_TWO_BYTES (c3, c4); \
806 } while (0)
aa72b389 807
aa72b389 808
f6cbaf43 809/* Prototypes for static functions. */
f57e2426
J
810static void record_conversion_result (struct coding_system *coding,
811 enum coding_result_code result);
812static int detect_coding_utf_8 (struct coding_system *,
813 struct coding_detection_info *info);
814static void decode_coding_utf_8 (struct coding_system *);
815static int encode_coding_utf_8 (struct coding_system *);
816
817static int detect_coding_utf_16 (struct coding_system *,
818 struct coding_detection_info *info);
819static void decode_coding_utf_16 (struct coding_system *);
820static int encode_coding_utf_16 (struct coding_system *);
821
822static int detect_coding_iso_2022 (struct coding_system *,
823 struct coding_detection_info *info);
824static void decode_coding_iso_2022 (struct coding_system *);
825static int encode_coding_iso_2022 (struct coding_system *);
826
827static int detect_coding_emacs_mule (struct coding_system *,
828 struct coding_detection_info *info);
829static void decode_coding_emacs_mule (struct coding_system *);
830static int encode_coding_emacs_mule (struct coding_system *);
831
832static int detect_coding_sjis (struct coding_system *,
833 struct coding_detection_info *info);
834static void decode_coding_sjis (struct coding_system *);
835static int encode_coding_sjis (struct coding_system *);
836
837static int detect_coding_big5 (struct coding_system *,
838 struct coding_detection_info *info);
839static void decode_coding_big5 (struct coding_system *);
840static int encode_coding_big5 (struct coding_system *);
841
842static int detect_coding_ccl (struct coding_system *,
843 struct coding_detection_info *info);
844static void decode_coding_ccl (struct coding_system *);
845static int encode_coding_ccl (struct coding_system *);
846
847static void decode_coding_raw_text (struct coding_system *);
848static int encode_coding_raw_text (struct coding_system *);
849
850static void coding_set_source (struct coding_system *);
851static void coding_set_destination (struct coding_system *);
852static void coding_alloc_by_realloc (struct coding_system *, EMACS_INT);
853static void coding_alloc_by_making_gap (struct coding_system *,
854 EMACS_INT, EMACS_INT);
855static unsigned char *alloc_destination (struct coding_system *,
856 EMACS_INT, unsigned char *);
857static void setup_iso_safe_charsets (Lisp_Object);
858static unsigned char *encode_designation_at_bol (struct coding_system *,
461c2ab9 859 int *, unsigned char *);
f57e2426
J
860static int detect_eol (const unsigned char *,
861 EMACS_INT, enum coding_category);
862static Lisp_Object adjust_coding_eol_type (struct coding_system *, int);
863static void decode_eol (struct coding_system *);
864static Lisp_Object get_translation_table (Lisp_Object, int, int *);
865static Lisp_Object get_translation (Lisp_Object, int *, int *);
866static int produce_chars (struct coding_system *, Lisp_Object, int);
867static INLINE void produce_charset (struct coding_system *, int *,
868 EMACS_INT);
869static void produce_annotation (struct coding_system *, EMACS_INT);
870static int decode_coding (struct coding_system *);
871static INLINE int *handle_composition_annotation (EMACS_INT, EMACS_INT,
872 struct coding_system *,
873 int *, EMACS_INT *);
874static INLINE int *handle_charset_annotation (EMACS_INT, EMACS_INT,
875 struct coding_system *,
876 int *, EMACS_INT *);
877static void consume_chars (struct coding_system *, Lisp_Object, int);
878static int encode_coding (struct coding_system *);
879static Lisp_Object make_conversion_work_buffer (int);
880static Lisp_Object code_conversion_restore (Lisp_Object);
881static INLINE int char_encodable_p (int, Lisp_Object);
882static Lisp_Object make_subsidiaries (Lisp_Object);
f6cbaf43 883
065e3595
KH
884static void
885record_conversion_result (struct coding_system *coding,
886 enum coding_result_code result)
887{
888 coding->result = result;
889 switch (result)
890 {
891 case CODING_RESULT_INSUFFICIENT_SRC:
892 Vlast_code_conversion_error = Qinsufficient_source;
893 break;
894 case CODING_RESULT_INCONSISTENT_EOL:
895 Vlast_code_conversion_error = Qinconsistent_eol;
896 break;
897 case CODING_RESULT_INVALID_SRC:
898 Vlast_code_conversion_error = Qinvalid_source;
899 break;
900 case CODING_RESULT_INTERRUPT:
901 Vlast_code_conversion_error = Qinterrupted;
902 break;
903 case CODING_RESULT_INSUFFICIENT_MEM:
904 Vlast_code_conversion_error = Qinsufficient_memory;
905 break;
ebaf11b6
KH
906 case CODING_RESULT_INSUFFICIENT_DST:
907 /* Don't record this error in Vlast_code_conversion_error
908 because it happens just temporarily and is resolved when the
909 whole conversion is finished. */
910 break;
409ea3a1
AS
911 case CODING_RESULT_SUCCESS:
912 break;
35befdaa
KH
913 default:
914 Vlast_code_conversion_error = intern ("Unknown error");
065e3595
KH
915 }
916}
917
75f80e63
EZ
/* This wrapper macro is used to preserve validity of pointers into
   buffer text across calls to decode_char, which could cause
   relocation of buffers if it loads a charset map, because loading a
   charset map allocates large structures.

   C is set to the result of decoding CODE in CHARSET.  If a charset
   map was loaded meanwhile, CODING->source is recomputed with
   coding_set_source, and SRC, SRC_BASE and SRC_END are shifted by the
   distance CODING->source moved.

   CODING, SRC, SRC_BASE, SRC_END and C must be lvalues or side-effect
   free expressions; CODING is evaluated more than once.  The locals
   declared here carry a long prefix so that they cannot capture a
   name used in one of the arguments.  */
#define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
  do {                                                                       \
    charset_map_loaded = 0;                                                  \
    (c) = DECODE_CHAR (charset, code);                                       \
    if (charset_map_loaded)                                                  \
      {                                                                      \
        const unsigned char *coding_decode_char_old = (coding)->source;      \
        EMACS_INT coding_decode_char_shift;                                  \
                                                                             \
        coding_set_source (coding);                                          \
        coding_decode_char_shift                                             \
          = (coding)->source - coding_decode_char_old;                       \
        (src) += coding_decode_char_shift;                                   \
        (src_base) += coding_decode_char_shift;                              \
        (src_end) += coding_decode_char_shift;                               \
      }                                                                      \
  } while (0)
938
939
119852e7
KH
/* Unless more than BYTES bytes of room remain between dst and
   dst_end, allocate more memory for coding->destination and update
   dst and dst_end.  The amount requested is BYTES plus the number of
   elements still unprocessed in charbuf.  We don't have to take care
   of coding->source which will be relocated.  It is handled by
   calling coding_set_source in encode_coding.

   The remaining room is computed as dst_end - dst rather than by
   forming dst + BYTES, so that no pointer beyond the end of the
   destination is ever created.

   Uses the caller's locals coding, dst, dst_end, charbuf and
   charbuf_end.  */

#define ASSURE_DESTINATION(bytes)                                       \
  do {                                                                  \
    if (dst_end - dst <= (bytes))                                       \
      {                                                                 \
        EMACS_INT more_bytes = charbuf_end - charbuf + (bytes);         \
                                                                        \
        dst = alloc_destination (coding, more_bytes, dst);              \
        dst_end = coding->destination + coding->dst_bytes;              \
      }                                                                 \
  } while (0)
aa72b389 955
aa72b389 956
db274c7a
KH
/* Store multibyte form of the character C in P, and advance P to the
   end of the multibyte form.  This is like CHAR_STRING_ADVANCE but it
   never calls MAYBE_UNIFY_CHAR.

   C is evaluated several times, so it must be free of side effects.
   Every use of C is parenthesized, so C may be any expression (for
   instance `hi | lo'), not just a plain variable.  */

#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)              \
  do {                                                  \
    if ((c) <= MAX_1_BYTE_CHAR)                         \
      *(p)++ = (c);                                     \
    else if ((c) <= MAX_2_BYTE_CHAR)                    \
      *(p)++ = (0xC0 | ((c) >> 6)),                     \
      *(p)++ = (0x80 | ((c) & 0x3F));                   \
    else if ((c) <= MAX_3_BYTE_CHAR)                    \
      *(p)++ = (0xE0 | ((c) >> 12)),                    \
      *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),            \
      *(p)++ = (0x80 | ((c) & 0x3F));                   \
    else if ((c) <= MAX_4_BYTE_CHAR)                    \
      *(p)++ = (0xF0 | ((c) >> 18)),                    \
      *(p)++ = (0x80 | (((c) >> 12) & 0x3F)),           \
      *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),            \
      *(p)++ = (0x80 | ((c) & 0x3F));                   \
    else if ((c) <= MAX_5_BYTE_CHAR)                    \
      *(p)++ = 0xF8,                                    \
      *(p)++ = (0x80 | (((c) >> 18) & 0x0F)),           \
      *(p)++ = (0x80 | (((c) >> 12) & 0x3F)),           \
      *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),            \
      *(p)++ = (0x80 | ((c) & 0x3F));                   \
    else                                                \
      (p) += BYTE8_STRING ((c) - 0x3FFF80, p);          \
  } while (0)
986
987
/* Return the character code of character whose multibyte form is at
   P, and advance P to the end of the multibyte form.  This is like
   STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR.

   The length of the form (1 to 5 bytes) is chosen from the high bits
   of the first byte.  A two-byte form whose first byte is below 0xC2
   yields a code with 0x3FFF80 or'ed in.  P is evaluated several
   times and the bytes are not validated.  */

#define STRING_CHAR_ADVANCE_NO_UNIFY(p)                                 \
  ((((p)[0] & 0x80) == 0)                                               \
   ? *(p)++                                                             \
   : ((((p)[0] & 0x20) == 0)                                            \
      ? ((p) += 2,                                                      \
         (((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)             \
          | (((p)[-2] & 0x1F) << 6)                                     \
          | ((p)[-1] & 0x3F)))                                          \
      : ((((p)[0] & 0x10) == 0)                                         \
         ? ((p) += 3,                                                   \
            ((((p)[-3] & 0x0F) << 12)                                   \
             | (((p)[-2] & 0x3F) << 6)                                  \
             | ((p)[-1] & 0x3F)))                                       \
         : ((((p)[0] & 0x08) == 0)                                      \
            ? ((p) += 4,                                                \
               ((((p)[-4] & 0x0F) << 18)                                \
                | (((p)[-3] & 0x3F) << 12)                              \
                | (((p)[-2] & 0x3F) << 6)                               \
                | ((p)[-1] & 0x3F)))                                    \
            : ((p) += 5,                                                \
               ((((p)[-4] & 0x3F) << 18)                                \
                | (((p)[-3] & 0x3F) << 12)                              \
                | (((p)[-2] & 0x3F) << 6)                               \
                | ((p)[-1] & 0x3F)))))))
1016
aa72b389 1017
df7492f9 1018static void
971de7fb 1019coding_set_source (struct coding_system *coding)
aa72b389 1020{
df7492f9
KH
1021 if (BUFFERP (coding->src_object))
1022 {
2cb26057 1023 struct buffer *buf = XBUFFER (coding->src_object);
aa72b389 1024
df7492f9 1025 if (coding->src_pos < 0)
2cb26057 1026 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
df7492f9 1027 else
2cb26057 1028 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
aa72b389 1029 }
df7492f9 1030 else if (STRINGP (coding->src_object))
aa72b389 1031 {
8f924df7 1032 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
aa72b389 1033 }
df7492f9 1034 else
f38b440c
PE
1035 {
1036 /* Otherwise, the source is C string and is never relocated
1037 automatically. Thus we don't have to update anything. */
1038 }
df7492f9 1039}
aa72b389 1040
df7492f9 1041static void
971de7fb 1042coding_set_destination (struct coding_system *coding)
df7492f9
KH
1043{
1044 if (BUFFERP (coding->dst_object))
aa72b389 1045 {
df7492f9 1046 if (coding->src_pos < 0)
aa72b389 1047 {
13818c30 1048 coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
28f67a95
KH
1049 coding->dst_bytes = (GAP_END_ADDR
1050 - (coding->src_bytes - coding->consumed)
1051 - coding->destination);
aa72b389 1052 }
df7492f9 1053 else
28f67a95
KH
1054 {
1055 /* We are sure that coding->dst_pos_byte is before the gap
1056 of the buffer. */
1057 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
13818c30 1058 + coding->dst_pos_byte - BEG_BYTE);
28f67a95
KH
1059 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1060 - coding->destination);
1061 }
df7492f9
KH
1062 }
1063 else
f38b440c
PE
1064 {
1065 /* Otherwise, the destination is C string and is never relocated
1066 automatically. Thus we don't have to update anything. */
1067 }
df7492f9
KH
1068}
1069
1070
1071static void
971de7fb 1072coding_alloc_by_realloc (struct coding_system *coding, EMACS_INT bytes)
df7492f9
KH
1073{
1074 coding->destination = (unsigned char *) xrealloc (coding->destination,
1075 coding->dst_bytes + bytes);
1076 coding->dst_bytes += bytes;
1077}
1078
1079static void
cf84bb53
JB
1080coding_alloc_by_making_gap (struct coding_system *coding,
1081 EMACS_INT gap_head_used, EMACS_INT bytes)
df7492f9 1082{
db274c7a 1083 if (EQ (coding->src_object, coding->dst_object))
df7492f9 1084 {
db274c7a
KH
1085 /* The gap may contain the produced data at the head and not-yet
1086 consumed data at the tail. To preserve those data, we at
1087 first make the gap size to zero, then increase the gap
1088 size. */
1089 EMACS_INT add = GAP_SIZE;
1090
1091 GPT += gap_head_used, GPT_BYTE += gap_head_used;
1092 GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
df7492f9
KH
1093 make_gap (bytes);
1094 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
db274c7a 1095 GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
df7492f9 1096 }
730fff51 1097 else
df7492f9 1098 {
2c78b7e1
KH
1099 Lisp_Object this_buffer;
1100
1101 this_buffer = Fcurrent_buffer ();
df7492f9
KH
1102 set_buffer_internal (XBUFFER (coding->dst_object));
1103 make_gap (bytes);
1104 set_buffer_internal (XBUFFER (this_buffer));
aa72b389 1105 }
df7492f9 1106}
8f924df7 1107
df7492f9
KH
1108
1109static unsigned char *
cf84bb53
JB
1110alloc_destination (struct coding_system *coding, EMACS_INT nbytes,
1111 unsigned char *dst)
df7492f9
KH
1112{
1113 EMACS_INT offset = dst - coding->destination;
1114
1115 if (BUFFERP (coding->dst_object))
db274c7a
KH
1116 {
1117 struct buffer *buf = XBUFFER (coding->dst_object);
1118
1119 coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1120 }
aa72b389 1121 else
df7492f9 1122 coding_alloc_by_realloc (coding, nbytes);
df7492f9
KH
1123 coding_set_destination (coding);
1124 dst = coding->destination + offset;
1125 return dst;
1126}
aa72b389 1127
ff0dacd7
KH
/** Macros for annotations.  */

/* An annotation data is stored in the array coding->charbuf in this
   format:
     [ -LENGTH ANNOTATION_MASK NCHARS ... ]
   LENGTH is the number of elements in the annotation.
   ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
   NCHARS is the number of characters in the text annotated.

   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
     ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]

   NBYTES is the number of bytes specified in the header part of
   old-style emacs-mule encoding, or 0 for the other kind of
   composition.

   METHOD is one of enum composition_method.

   Optional COMPOSITION-COMPONENTS are characters and composition
   rules.

   In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
   follows.

   If ANNOTATION_MASK is 0, this annotation is just a space holder to
   recover from an invalid annotation, and should be skipped by
   produce_annotation.  */

/* Maximum length of the header of annotation data.  */
#define MAX_ANNOTATION_LENGTH 5

/* Store the three common header elements at BUF, advance BUF past
   them, and mark the caller's `coding' as annotated.  The expansion
   deliberately has no trailing semicolon, so that the macro can be
   used as a single statement in front of `else'.  */
#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)

/* Store a composition annotation header (five elements) at BUF.  */
#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)


/* Store a charset annotation (four elements) at BUF.  */
#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
df7492f9
KH
1184\f
1185/*** 2. Emacs' internal format (emacs-utf-8) ***/
1186
1187
1188
1189\f
1190/*** 3. UTF-8 ***/
1191
1192/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
1193 Check if a text is encoded in UTF-8. If it is, return 1, else
1194 return 0. */
df7492f9
KH
1195
1196#define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1197#define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1198#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1199#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1200#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1201#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1202
a470d443
KH
1203#define UTF_8_BOM_1 0xEF
1204#define UTF_8_BOM_2 0xBB
1205#define UTF_8_BOM_3 0xBF
1206
df7492f9 1207static int
cf84bb53
JB
1208detect_coding_utf_8 (struct coding_system *coding,
1209 struct coding_detection_info *detect_info)
df7492f9 1210{
065e3595 1211 const unsigned char *src = coding->source, *src_base;
8f924df7 1212 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9 1213 int multibytep = coding->src_multibyte;
a53e2e89 1214 EMACS_INT consumed_chars = 0;
a470d443 1215 int bom_found = 0;
df7492f9
KH
1216 int found = 0;
1217
ff0dacd7 1218 detect_info->checked |= CATEGORY_MASK_UTF_8;
df7492f9
KH
1219 /* A coding system of this category is always ASCII compatible. */
1220 src += coding->head_ascii;
1221
1222 while (1)
aa72b389 1223 {
df7492f9 1224 int c, c1, c2, c3, c4;
aa72b389 1225
065e3595 1226 src_base = src;
df7492f9 1227 ONE_MORE_BYTE (c);
065e3595 1228 if (c < 0 || UTF_8_1_OCTET_P (c))
df7492f9
KH
1229 continue;
1230 ONE_MORE_BYTE (c1);
065e3595 1231 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
df7492f9
KH
1232 break;
1233 if (UTF_8_2_OCTET_LEADING_P (c))
aa72b389 1234 {
a470d443 1235 found = 1;
df7492f9 1236 continue;
aa72b389 1237 }
df7492f9 1238 ONE_MORE_BYTE (c2);
065e3595 1239 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
df7492f9
KH
1240 break;
1241 if (UTF_8_3_OCTET_LEADING_P (c))
aa72b389 1242 {
a470d443
KH
1243 found = 1;
1244 if (src_base == coding->source
1245 && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1246 bom_found = 1;
df7492f9 1247 continue;
aa72b389 1248 }
df7492f9 1249 ONE_MORE_BYTE (c3);
065e3595 1250 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
df7492f9
KH
1251 break;
1252 if (UTF_8_4_OCTET_LEADING_P (c))
aa72b389 1253 {
a470d443 1254 found = 1;
df7492f9
KH
1255 continue;
1256 }
1257 ONE_MORE_BYTE (c4);
065e3595 1258 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
df7492f9
KH
1259 break;
1260 if (UTF_8_5_OCTET_LEADING_P (c))
1261 {
a470d443 1262 found = 1;
df7492f9
KH
1263 continue;
1264 }
1265 break;
aa72b389 1266 }
ff0dacd7 1267 detect_info->rejected |= CATEGORY_MASK_UTF_8;
df7492f9 1268 return 0;
aa72b389 1269
df7492f9 1270 no_more_source:
065e3595 1271 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
aa72b389 1272 {
ff0dacd7 1273 detect_info->rejected |= CATEGORY_MASK_UTF_8;
89528eb3 1274 return 0;
aa72b389 1275 }
a470d443
KH
1276 if (bom_found)
1277 {
1278 /* The first character 0xFFFE doesn't necessarily mean a BOM. */
1279 detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1280 }
1281 else
1282 {
1283 detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
0e17387a
KH
1284 if (found)
1285 detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
a470d443 1286 }
ff0dacd7 1287 return 1;
aa72b389
KH
1288}
1289
4ed46869 1290
b73bfc1c 1291static void
971de7fb 1292decode_coding_utf_8 (struct coding_system *coding)
b73bfc1c 1293{
8f924df7
KH
1294 const unsigned char *src = coding->source + coding->consumed;
1295 const unsigned char *src_end = coding->source + coding->src_bytes;
1296 const unsigned char *src_base;
69a80ea3
KH
1297 int *charbuf = coding->charbuf + coding->charbuf_used;
1298 int *charbuf_end = coding->charbuf + coding->charbuf_size;
a53e2e89 1299 EMACS_INT consumed_chars = 0, consumed_chars_base = 0;
df7492f9 1300 int multibytep = coding->src_multibyte;
a470d443 1301 enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
2735d060 1302 int eol_dos =
0a9564cb 1303 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 1304 int byte_after_cr = -1;
4ed46869 1305
a470d443
KH
1306 if (bom != utf_without_bom)
1307 {
1308 int c1, c2, c3;
1309
1310 src_base = src;
1311 ONE_MORE_BYTE (c1);
1312 if (! UTF_8_3_OCTET_LEADING_P (c1))
1313 src = src_base;
1314 else
1315 {
159bd5a2 1316 ONE_MORE_BYTE (c2);
a470d443
KH
1317 if (! UTF_8_EXTRA_OCTET_P (c2))
1318 src = src_base;
1319 else
1320 {
159bd5a2 1321 ONE_MORE_BYTE (c3);
a470d443
KH
1322 if (! UTF_8_EXTRA_OCTET_P (c3))
1323 src = src_base;
1324 else
1325 {
1326 if ((c1 != UTF_8_BOM_1)
1327 || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1328 src = src_base;
1329 else
1330 CODING_UTF_8_BOM (coding) = utf_without_bom;
1331 }
1332 }
1333 }
1334 }
1335 CODING_UTF_8_BOM (coding) = utf_without_bom;
1336
df7492f9 1337 while (1)
b73bfc1c 1338 {
df7492f9 1339 int c, c1, c2, c3, c4, c5;
ec6d2bb8 1340
df7492f9
KH
1341 src_base = src;
1342 consumed_chars_base = consumed_chars;
4af310db 1343
df7492f9 1344 if (charbuf >= charbuf_end)
b71f6f73
KH
1345 {
1346 if (byte_after_cr >= 0)
1347 src_base--;
1348 break;
1349 }
df7492f9 1350
119852e7
KH
1351 if (byte_after_cr >= 0)
1352 c1 = byte_after_cr, byte_after_cr = -1;
1353 else
1354 ONE_MORE_BYTE (c1);
065e3595
KH
1355 if (c1 < 0)
1356 {
1357 c = - c1;
1358 }
1a4990fb 1359 else if (UTF_8_1_OCTET_P (c1))
df7492f9 1360 {
2735d060 1361 if (eol_dos && c1 == '\r')
119852e7 1362 ONE_MORE_BYTE (byte_after_cr);
df7492f9 1363 c = c1;
4af310db 1364 }
df7492f9 1365 else
4af310db 1366 {
df7492f9 1367 ONE_MORE_BYTE (c2);
065e3595 1368 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
df7492f9
KH
1369 goto invalid_code;
1370 if (UTF_8_2_OCTET_LEADING_P (c1))
4af310db 1371 {
b0edb2c5
DL
1372 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1373 /* Reject overlong sequences here and below. Encoders
1374 producing them are incorrect, they can be misleading,
1375 and they mess up read/write invariance. */
1376 if (c < 128)
1377 goto invalid_code;
4af310db 1378 }
df7492f9 1379 else
aa72b389 1380 {
df7492f9 1381 ONE_MORE_BYTE (c3);
065e3595 1382 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
df7492f9
KH
1383 goto invalid_code;
1384 if (UTF_8_3_OCTET_LEADING_P (c1))
b0edb2c5
DL
1385 {
1386 c = (((c1 & 0xF) << 12)
1387 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
72fe1301
DL
1388 if (c < 0x800
1389 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
b0edb2c5
DL
1390 goto invalid_code;
1391 }
df7492f9
KH
1392 else
1393 {
1394 ONE_MORE_BYTE (c4);
065e3595 1395 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
df7492f9
KH
1396 goto invalid_code;
1397 if (UTF_8_4_OCTET_LEADING_P (c1))
b0edb2c5 1398 {
df7492f9
KH
1399 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1400 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
b0edb2c5
DL
1401 if (c < 0x10000)
1402 goto invalid_code;
1403 }
df7492f9
KH
1404 else
1405 {
1406 ONE_MORE_BYTE (c5);
065e3595 1407 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
df7492f9
KH
1408 goto invalid_code;
1409 if (UTF_8_5_OCTET_LEADING_P (c1))
1410 {
1411 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1412 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1413 | (c5 & 0x3F));
b0edb2c5 1414 if ((c > MAX_CHAR) || (c < 0x200000))
df7492f9
KH
1415 goto invalid_code;
1416 }
1417 else
1418 goto invalid_code;
1419 }
1420 }
aa72b389 1421 }
b73bfc1c 1422 }
df7492f9
KH
1423
1424 *charbuf++ = c;
1425 continue;
1426
1427 invalid_code:
1428 src = src_base;
1429 consumed_chars = consumed_chars_base;
1430 ONE_MORE_BYTE (c);
1431 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1432 coding->errors++;
aa72b389
KH
1433 }
1434
df7492f9
KH
1435 no_more_source:
1436 coding->consumed_char += consumed_chars_base;
1437 coding->consumed = src_base - coding->source;
1438 coding->charbuf_used = charbuf - coding->charbuf;
1439}
1440
1441
1442static int
971de7fb 1443encode_coding_utf_8 (struct coding_system *coding)
df7492f9
KH
1444{
1445 int multibytep = coding->dst_multibyte;
1446 int *charbuf = coding->charbuf;
1447 int *charbuf_end = charbuf + coding->charbuf_used;
1448 unsigned char *dst = coding->destination + coding->produced;
1449 unsigned char *dst_end = coding->destination + coding->dst_bytes;
a53e2e89 1450 EMACS_INT produced_chars = 0;
df7492f9
KH
1451 int c;
1452
a470d443
KH
1453 if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1454 {
1455 ASSURE_DESTINATION (3);
1456 EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1457 CODING_UTF_8_BOM (coding) = utf_without_bom;
1458 }
1459
df7492f9 1460 if (multibytep)
aa72b389 1461 {
df7492f9
KH
1462 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1463
1464 while (charbuf < charbuf_end)
b73bfc1c 1465 {
df7492f9 1466 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
8f924df7 1467
df7492f9
KH
1468 ASSURE_DESTINATION (safe_room);
1469 c = *charbuf++;
28f67a95
KH
1470 if (CHAR_BYTE8_P (c))
1471 {
1472 c = CHAR_TO_BYTE8 (c);
1473 EMIT_ONE_BYTE (c);
1474 }
1475 else
1476 {
db274c7a 1477 CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
28f67a95
KH
1478 for (p = str; p < pend; p++)
1479 EMIT_ONE_BYTE (*p);
1480 }
b73bfc1c 1481 }
aa72b389 1482 }
df7492f9
KH
1483 else
1484 {
1485 int safe_room = MAX_MULTIBYTE_LENGTH;
1486
1487 while (charbuf < charbuf_end)
b73bfc1c 1488 {
df7492f9
KH
1489 ASSURE_DESTINATION (safe_room);
1490 c = *charbuf++;
f03caae0
KH
1491 if (CHAR_BYTE8_P (c))
1492 *dst++ = CHAR_TO_BYTE8 (c);
1493 else
db274c7a 1494 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
df7492f9 1495 produced_chars++;
4ed46869
KH
1496 }
1497 }
065e3595 1498 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
1499 coding->produced_char += produced_chars;
1500 coding->produced = dst - coding->destination;
1501 return 0;
4ed46869
KH
1502}
1503
b73bfc1c 1504
df7492f9 1505/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
1506 Check if a text is encoded in one of UTF-16 based coding systems.
1507 If it is, return 1, else return 0. */
aa72b389 1508
df7492f9
KH
1509#define UTF_16_HIGH_SURROGATE_P(val) \
1510 (((val) & 0xFC00) == 0xD800)
1511
1512#define UTF_16_LOW_SURROGATE_P(val) \
1513 (((val) & 0xFC00) == 0xDC00)
93dec019 1514
aa72b389 1515
df7492f9 1516static int
cf84bb53
JB
1517detect_coding_utf_16 (struct coding_system *coding,
1518 struct coding_detection_info *detect_info)
aa72b389 1519{
ef1b0ba7 1520 const unsigned char *src = coding->source;
8f924df7 1521 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9 1522 int multibytep = coding->src_multibyte;
df7492f9 1523 int c1, c2;
aa72b389 1524
ff0dacd7 1525 detect_info->checked |= CATEGORY_MASK_UTF_16;
ff0dacd7 1526 if (coding->mode & CODING_MODE_LAST_BLOCK
24a73b0a 1527 && (coding->src_chars & 1))
ff0dacd7
KH
1528 {
1529 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1530 return 0;
1531 }
24a73b0a 1532
f56a4450 1533 TWO_MORE_BYTES (c1, c2);
df7492f9 1534 if ((c1 == 0xFF) && (c2 == 0xFE))
aa72b389 1535 {
b49a1807
KH
1536 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1537 | CATEGORY_MASK_UTF_16_AUTO);
24a73b0a
KH
1538 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1539 | CATEGORY_MASK_UTF_16_BE_NOSIG
1540 | CATEGORY_MASK_UTF_16_LE_NOSIG);
aa72b389 1541 }
df7492f9 1542 else if ((c1 == 0xFE) && (c2 == 0xFF))
ff0dacd7 1543 {
b49a1807
KH
1544 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1545 | CATEGORY_MASK_UTF_16_AUTO);
24a73b0a
KH
1546 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1547 | CATEGORY_MASK_UTF_16_BE_NOSIG
1548 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1549 }
220eeac9 1550 else if (c2 < 0)
f56a4450
KH
1551 {
1552 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1553 return 0;
1554 }
2f3cbb32 1555 else
24a73b0a 1556 {
2f3cbb32
KH
1557 /* We check the dispersion of Eth and Oth bytes where E is even and
1558 O is odd. If both are high, we assume binary data.*/
1559 unsigned char e[256], o[256];
1560 unsigned e_num = 1, o_num = 1;
1561
1562 memset (e, 0, 256);
1563 memset (o, 0, 256);
1564 e[c1] = 1;
1565 o[c2] = 1;
1566
cc13543e
KH
1567 detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1568 |CATEGORY_MASK_UTF_16_BE
1569 | CATEGORY_MASK_UTF_16_LE);
2f3cbb32 1570
7f1faf1c
KH
1571 while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1572 != CATEGORY_MASK_UTF_16)
2f3cbb32 1573 {
f56a4450 1574 TWO_MORE_BYTES (c1, c2);
220eeac9 1575 if (c2 < 0)
f56a4450 1576 break;
2f3cbb32
KH
1577 if (! e[c1])
1578 {
1579 e[c1] = 1;
1580 e_num++;
cc13543e
KH
1581 if (e_num >= 128)
1582 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
2f3cbb32
KH
1583 }
1584 if (! o[c2])
1585 {
977b85f4 1586 o[c2] = 1;
2f3cbb32 1587 o_num++;
cc13543e
KH
1588 if (o_num >= 128)
1589 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
2f3cbb32
KH
1590 }
1591 }
2f3cbb32 1592 return 0;
ff0dacd7 1593 }
2f3cbb32 1594
df7492f9 1595 no_more_source:
ff0dacd7 1596 return 1;
df7492f9 1597}
aa72b389 1598
df7492f9 1599static void
971de7fb 1600decode_coding_utf_16 (struct coding_system *coding)
df7492f9 1601{
8f924df7
KH
1602 const unsigned char *src = coding->source + coding->consumed;
1603 const unsigned char *src_end = coding->source + coding->src_bytes;
1604 const unsigned char *src_base;
69a80ea3 1605 int *charbuf = coding->charbuf + coding->charbuf_used;
df80c7f0
KH
1606 /* We may produces at most 3 chars in one loop. */
1607 int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
a53e2e89 1608 EMACS_INT consumed_chars = 0, consumed_chars_base = 0;
df7492f9 1609 int multibytep = coding->src_multibyte;
a470d443 1610 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
df7492f9
KH
1611 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1612 int surrogate = CODING_UTF_16_SURROGATE (coding);
2735d060 1613 int eol_dos =
0a9564cb 1614 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 1615 int byte_after_cr1 = -1, byte_after_cr2 = -1;
df7492f9 1616
a470d443 1617 if (bom == utf_with_bom)
aa72b389 1618 {
df7492f9 1619 int c, c1, c2;
4af310db 1620
aa72b389 1621 src_base = src;
df7492f9
KH
1622 ONE_MORE_BYTE (c1);
1623 ONE_MORE_BYTE (c2);
e19c3639 1624 c = (c1 << 8) | c2;
aa72b389 1625
b49a1807
KH
1626 if (endian == utf_16_big_endian
1627 ? c != 0xFEFF : c != 0xFFFE)
aa72b389 1628 {
b49a1807
KH
1629 /* The first two bytes are not BOM. Treat them as bytes
1630 for a normal character. */
1631 src = src_base;
1632 coding->errors++;
aa72b389 1633 }
a470d443 1634 CODING_UTF_16_BOM (coding) = utf_without_bom;
b49a1807 1635 }
a470d443 1636 else if (bom == utf_detect_bom)
b49a1807
KH
1637 {
1638 /* We have already tried to detect BOM and failed in
1639 detect_coding. */
a470d443 1640 CODING_UTF_16_BOM (coding) = utf_without_bom;
df7492f9 1641 }
aa72b389 1642
df7492f9
KH
1643 while (1)
1644 {
1645 int c, c1, c2;
1646
1647 src_base = src;
1648 consumed_chars_base = consumed_chars;
1649
df80c7f0 1650 if (charbuf >= charbuf_end)
b71f6f73
KH
1651 {
1652 if (byte_after_cr1 >= 0)
1653 src_base -= 2;
1654 break;
1655 }
df7492f9 1656
119852e7
KH
1657 if (byte_after_cr1 >= 0)
1658 c1 = byte_after_cr1, byte_after_cr1 = -1;
1659 else
1660 ONE_MORE_BYTE (c1);
065e3595
KH
1661 if (c1 < 0)
1662 {
1663 *charbuf++ = -c1;
1664 continue;
1665 }
119852e7
KH
1666 if (byte_after_cr2 >= 0)
1667 c2 = byte_after_cr2, byte_after_cr2 = -1;
1668 else
1669 ONE_MORE_BYTE (c2);
065e3595
KH
1670 if (c2 < 0)
1671 {
1672 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1673 *charbuf++ = -c2;
1674 continue;
1675 }
df7492f9 1676 c = (endian == utf_16_big_endian
e19c3639 1677 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
119852e7 1678
df7492f9 1679 if (surrogate)
fd3ae0b9 1680 {
df7492f9 1681 if (! UTF_16_LOW_SURROGATE_P (c))
fd3ae0b9 1682 {
df7492f9
KH
1683 if (endian == utf_16_big_endian)
1684 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1685 else
1686 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1687 *charbuf++ = c1;
1688 *charbuf++ = c2;
1689 coding->errors++;
1690 if (UTF_16_HIGH_SURROGATE_P (c))
1691 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
fd3ae0b9 1692 else
df7492f9 1693 *charbuf++ = c;
fd3ae0b9
KH
1694 }
1695 else
df7492f9
KH
1696 {
1697 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1698 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
29f7ffd0 1699 *charbuf++ = 0x10000 + c;
df7492f9 1700 }
fd3ae0b9 1701 }
aa72b389 1702 else
df7492f9
KH
1703 {
1704 if (UTF_16_HIGH_SURROGATE_P (c))
1705 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1706 else
119852e7 1707 {
2735d060 1708 if (eol_dos && c == '\r')
119852e7
KH
1709 {
1710 ONE_MORE_BYTE (byte_after_cr1);
1711 ONE_MORE_BYTE (byte_after_cr2);
1712 }
1713 *charbuf++ = c;
1714 }
8f924df7 1715 }
aa72b389 1716 }
df7492f9
KH
1717
1718 no_more_source:
1719 coding->consumed_char += consumed_chars_base;
1720 coding->consumed = src_base - coding->source;
1721 coding->charbuf_used = charbuf - coding->charbuf;
aa72b389 1722}
b73bfc1c 1723
df7492f9 1724static int
971de7fb 1725encode_coding_utf_16 (struct coding_system *coding)
df7492f9
KH
1726{
1727 int multibytep = coding->dst_multibyte;
1728 int *charbuf = coding->charbuf;
1729 int *charbuf_end = charbuf + coding->charbuf_used;
1730 unsigned char *dst = coding->destination + coding->produced;
1731 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1732 int safe_room = 8;
a470d443 1733 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
df7492f9 1734 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
a53e2e89 1735 EMACS_INT produced_chars = 0;
df7492f9 1736 int c;
4ed46869 1737
a470d443 1738 if (bom != utf_without_bom)
df7492f9
KH
1739 {
1740 ASSURE_DESTINATION (safe_room);
1741 if (big_endian)
df7492f9 1742 EMIT_TWO_BYTES (0xFE, 0xFF);
880cf180
KH
1743 else
1744 EMIT_TWO_BYTES (0xFF, 0xFE);
a470d443 1745 CODING_UTF_16_BOM (coding) = utf_without_bom;
df7492f9
KH
1746 }
1747
1748 while (charbuf < charbuf_end)
1749 {
1750 ASSURE_DESTINATION (safe_room);
1751 c = *charbuf++;
60afa08d 1752 if (c > MAX_UNICODE_CHAR)
e19c3639 1753 c = coding->default_char;
df7492f9
KH
1754
1755 if (c < 0x10000)
1756 {
1757 if (big_endian)
1758 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1759 else
1760 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1761 }
1762 else
1763 {
1764 int c1, c2;
1765
1766 c -= 0x10000;
1767 c1 = (c >> 10) + 0xD800;
1768 c2 = (c & 0x3FF) + 0xDC00;
1769 if (big_endian)
1770 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1771 else
1772 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1773 }
1774 }
065e3595 1775 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
1776 coding->produced = dst - coding->destination;
1777 coding->produced_char += produced_chars;
1778 return 0;
1779}
1780
1781\f
1782/*** 6. Old Emacs' internal format (emacs-mule) ***/
1783
1784/* Emacs' internal format for representation of multiple character
1785 sets is a kind of multi-byte encoding, i.e. characters are
1786 represented by variable-length sequences of one-byte codes.
1787
1788 ASCII characters and control characters (e.g. `tab', `newline') are
1789 represented by one-byte sequences which are their ASCII codes, in
1790 the range 0x00 through 0x7F.
1791
1792 8-bit characters of the range 0x80..0x9F are represented by
1793 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1794 code + 0x20).
1795
1796 8-bit characters of the range 0xA0..0xFF are represented by
1797 one-byte sequences which are their 8-bit code.
1798
1799 The other characters are represented by a sequence of `base
1800 leading-code', optional `extended leading-code', and one or two
1801 `position-code's. The length of the sequence is determined by the
1802 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1803 whereas extended leading-code and position-code take the range 0xA0
1804 through 0xFF. See `charset.h' for more details about leading-code
1805 and position-code.
1806
1807 --- CODE RANGE of Emacs' internal format ---
1808 character set range
1809 ------------- -----
1810 ascii 0x00..0x7F
1811 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1812 eight-bit-graphic 0xA0..0xFF
1813 ELSE 0x81..0x9D + [0xA0..0xFF]+
1814 ---------------------------------------------
1815
1816 As this is the internal character representation, the format is
1817 usually not used externally (i.e. in a file or in a data sent to a
1818 process). But, it is possible to have a text externally in this
1819 format (i.e. by encoding by the coding system `emacs-mule').
1820
1821 In that case, a sequence of one-byte codes has a slightly different
1822 form.
1823
1824 At first, all characters in eight-bit-control are represented by
1825 one-byte sequences which are their 8-bit code.
1826
1827 Next, character composition data are represented by the byte
1828 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1829 where,
e951386e 1830 METHOD is 0xF2 plus one of composition method (enum
df7492f9
KH
1831 composition_method),
1832
1833 BYTES is 0xA0 plus a byte length of this composition data,
1834
e951386e 1835 CHARS is 0xA0 plus a number of characters composed by this
df7492f9
KH
1836 data,
1837
ad1746f5 1838 COMPONENTs are characters of multibyte form or composition
df7492f9
KH
1839 rules encoded by two-byte of ASCII codes.
1840
1841 In addition, for backward compatibility, the following formats are
1842 also recognized as composition data on decoding.
1843
1844 0x80 MSEQ ...
1845 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1846
1847 Here,
1848 MSEQ is a multibyte form but in this special format:
1849 ASCII: 0xA0 ASCII_CODE+0x80,
1850 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1851 RULE is a one byte code of the range 0xA0..0xF0 that
1852 represents a composition rule.
1853 */
1854
/* Table indexed by a byte value.  The code in this section reads
   emacs_mule_bytes[C] as the total number of bytes in the emacs-mule
   sequence whose leading byte is C: detect_coding_emacs_mule expects
   emacs_mule_bytes[C] - 1 further bytes, and emacs_mule_char handles
   the values 1 through 4 and aborts on anything else.  There is no
   initializer here, so the table must be filled in by code outside
   this section.  */
1855char emacs_mule_bytes[256];
1856
e951386e
KH
1857
1858/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1859 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1860 else return 0. */
/* Effects on DETECT_INFO: CATEGORY_MASK_EMACS_MULE is always added to
   `checked'.  It is added to `rejected' on every path that returns 0.
   When 1 is returned, `found' is OR-ed with the local FOUND, which is
   non-zero only after a composition or multibyte sequence has passed
   the checks below.

   The label `no_more_source' is never the target of an explicit goto
   in this function; presumably ONE_MORE_BYTE (defined outside this
   section) jumps there when the source is exhausted -- confirm
   against its definition.  */
1861
1862static int
cf84bb53
JB
1863detect_coding_emacs_mule (struct coding_system *coding,
1864 struct coding_detection_info *detect_info)
e951386e
KH
1865{
1866 const unsigned char *src = coding->source, *src_base;
1867 const unsigned char *src_end = coding->source + coding->src_bytes;
1868 int multibytep = coding->src_multibyte;
a53e2e89 1869 EMACS_INT consumed_chars = 0;
e951386e
KH
1870 int c;
1871 int found = 0;
1872
1873 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1874 /* A coding system of this category is always ASCII compatible. */
1875 src += coding->head_ascii;
1876
1877 while (1)
1878 {
/* SRC_BASE marks where the current sequence starts; it is compared
   with SRC at `no_more_source' to tell whether the input ended in
   the middle of a sequence.  */
1879 src_base = src;
1880 ONE_MORE_BYTE (c);
/* A negative value from ONE_MORE_BYTE is skipped without further
   checks.  */
1881 if (c < 0)
1882 continue;
1883 if (c == 0x80)
1884 {
1885 /* Perhaps the start of composite character. We simply skip
1886 it because analyzing it is too heavy for detecting. But,
1887 at least, we check that the composite character
1888 consists of more than 4 bytes. */
2735d060 1889 const unsigned char *src_start;
e951386e
KH
1890
1891 repeat:
2735d060 1892 src_start = src;
e951386e
KH
1893 do
1894 {
1895 ONE_MORE_BYTE (c);
1896 }
1897 while (c >= 0xA0);
1898
/* SRC now points just past the first byte below 0xA0 that ended the
   run, so that terminating byte is included in the count.  */
2735d060 1899 if (src - src_start <= 4)
e951386e
KH
1900 break;
1901 found = CATEGORY_MASK_EMACS_MULE;
/* The terminating byte may itself start another composition.  */
1902 if (c == 0x80)
1903 goto repeat;
1904 }
1905
1906 if (c < 0x80)
1907 {
/* ESC, SI and SO reject the emacs-mule category.  */
1908 if (c < 0x20
1909 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1910 break;
1911 }
1912 else
1913 {
/* C is a leading byte: the remaining emacs_mule_bytes[c] - 1 bytes
   of the sequence must all be 0xA0 or above.  */
396475b7 1914 int more_bytes = emacs_mule_bytes[c] - 1;
e951386e
KH
1915
1916 while (more_bytes > 0)
1917 {
1918 ONE_MORE_BYTE (c);
1919 if (c < 0xA0)
1920 {
1921 src--; /* Unread the last byte. */
1922 break;
1923 }
1924 more_bytes--;
1925 }
/* A non-zero count means the sequence was cut short by a byte below
   0xA0 (or the table gave a length below 1): reject.  */
1926 if (more_bytes != 0)
1927 break;
1928 found = CATEGORY_MASK_EMACS_MULE;
1929 }
1930 }
/* Reached only through a `break' above: the text is not emacs-mule.  */
1931 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1932 return 0;
1933
1934 no_more_source:
/* Input ran out.  If it ended inside a sequence and this is the last
   block, the sequence can never be completed, so reject.  */
1935 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1936 {
1937 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1938 return 0;
1939 }
1940 detect_info->found |= found;
1941 return 1;
1942}
1943
1944
1945/* Parse emacs-mule multibyte sequence at SRC and return the decoded
1946 character. If CMP_STATUS indicates that we must expect MSEQ or
1947 RULE described above, decode it and return the negative value of
685ebdc8 1948 the decoded character or rule. If an invalid byte is found, return
e951386e
KH
1949 -1. If SRC is too short, return -2. */
/* On every return other than -1 and -2, *NBYTES receives the number of
   bytes read from SRC and *NCHARS the value of the local
   CONSUMED_CHARS counter (which this function never modifies directly;
   presumably ONE_MORE_BYTE maintains it -- confirm).  When a character
   is returned and ID is non-null, *ID receives the ID of the charset
   the character was decoded with; the early RULE return does not
   touch *ID.

   A RULE byte is returned negated without further decoding; this
   happens only when CMP_STATUS is in old form and in a composing
   state other than COMPOSING_CHAR.

   NOTE(review): an MSEQ result is returned as -C.  If C could ever be
   1 or 2, that would be indistinguishable from the -1 / -2 error
   codes; confirm that callers cannot hit this case.  */
1950
e2f1bab9 1951static int
cf84bb53
JB
1952emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1953 int *nbytes, int *nchars, int *id,
1954 struct composition_status *cmp_status)
df7492f9 1955{
8f924df7
KH
1956 const unsigned char *src_end = coding->source + coding->src_bytes;
1957 const unsigned char *src_base = src;
df7492f9 1958 int multibytep = coding->src_multibyte;
2735d060 1959 int charset_ID;
df7492f9
KH
1960 unsigned code;
1961 int c;
1962 int consumed_chars = 0;
e951386e 1963 int mseq_found = 0;
df7492f9
KH
1964
1965 ONE_MORE_BYTE (c);
/* A negative value from ONE_MORE_BYTE is taken as an already decoded
   character: return its absolute value, attributed to the charset in
   slot 0 of emacs_mule_charset.  */
065e3595 1966 if (c < 0)
df7492f9 1967 {
065e3595 1968 c = -c;
2735d060 1969 charset_ID = emacs_mule_charset[0];
065e3595
KH
1970 }
1971 else
1972 {
/* A first byte of 0xA0 or above is valid only inside an old-form
   composition, where it is either an MSEQ lead (converted back to a
   normal leading byte or ASCII code here) or a RULE byte.  */
4d41e8b7
KH
1973 if (c >= 0xA0)
1974 {
e951386e
KH
1975 if (cmp_status->state != COMPOSING_NO
1976 && cmp_status->old_form)
4d41e8b7 1977 {
e951386e
KH
1978 if (cmp_status->state == COMPOSING_CHAR)
1979 {
1980 if (c == 0xA0)
1981 {
/* MSEQ for ASCII: 0xA0 followed by ASCII_CODE + 0x80.  */
1982 ONE_MORE_BYTE (c);
1983 c -= 0x80;
1984 if (c < 0)
1985 goto invalid_code;
1986 }
1987 else
/* MSEQ for other characters: the leading code was stored + 0x20.  */
1988 c -= 0x20;
1989 mseq_found = 1;
1990 }
1991 else
1992 {
/* Not expecting a character, so this byte is a RULE: hand it back
   negated and undecoded.  */
1993 *nbytes = src - src_base;
1994 *nchars = consumed_chars;
1995 return -c;
1996 }
4d41e8b7
KH
1997 }
1998 else
e951386e 1999 goto invalid_code;
4d41e8b7
KH
2000 }
2001
/* Dispatch on the total sequence length recorded for leading byte C.
   Each case picks the charset from emacs_mule_charset and builds CODE
   from the low 7 bits of the position bytes, all of which must be
   0xA0 or above.  */
065e3595 2002 switch (emacs_mule_bytes[c])
b73bfc1c 2003 {
065e3595 2004 case 2:
2735d060 2005 if ((charset_ID = emacs_mule_charset[c]) < 0)
df7492f9
KH
2006 goto invalid_code;
2007 ONE_MORE_BYTE (c);
9ffd559c 2008 if (c < 0xA0)
065e3595 2009 goto invalid_code;
df7492f9 2010 code = c & 0x7F;
065e3595
KH
2011 break;
2012
2013 case 3:
/* Either a private leading code followed by the charset byte and one
   position byte, or a normal leading code followed by two position
   bytes.  */
2014 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2015 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2016 {
2017 ONE_MORE_BYTE (c);
2735d060 2018 if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
065e3595
KH
2019 goto invalid_code;
2020 ONE_MORE_BYTE (c);
9ffd559c 2021 if (c < 0xA0)
065e3595
KH
2022 goto invalid_code;
2023 code = c & 0x7F;
2024 }
2025 else
2026 {
2735d060 2027 if ((charset_ID = emacs_mule_charset[c]) < 0)
065e3595
KH
2028 goto invalid_code;
2029 ONE_MORE_BYTE (c);
9ffd559c 2030 if (c < 0xA0)
065e3595
KH
2031 goto invalid_code;
2032 code = (c & 0x7F) << 8;
2033 ONE_MORE_BYTE (c);
9ffd559c 2034 if (c < 0xA0)
065e3595
KH
2035 goto invalid_code;
2036 code |= c & 0x7F;
2037 }
2038 break;
2039
2040 case 4:
/* Leading code, charset byte, two position bytes.  NOTE(review): the
   charset byte is only checked for `c < 0' here, whereas the 3-byte
   private case above checks `c < 0xA0' -- confirm this difference is
   intended.  */
2041 ONE_MORE_BYTE (c);
2735d060 2042 if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
df7492f9
KH
2043 goto invalid_code;
2044 ONE_MORE_BYTE (c);
9ffd559c 2045 if (c < 0xA0)
065e3595 2046 goto invalid_code;
781d7a48 2047 code = (c & 0x7F) << 8;
df7492f9 2048 ONE_MORE_BYTE (c);
9ffd559c 2049 if (c < 0xA0)
065e3595 2050 goto invalid_code;
df7492f9 2051 code |= c & 0x7F;
065e3595 2052 break;
df7492f9 2053
065e3595
KH
2054 case 1:
/* A single byte stands for itself, as ASCII or as an eight-bit
   byte.  */
2055 code = c;
2735d060 2056 charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
065e3595 2057 break;
df7492f9 2058
065e3595
KH
2059 default:
2060 abort ();
2061 }
/* Convert (charset, code) to a character, stored in C.  A negative
   result is treated as invalid.  */
b84ae584 2062 CODING_DECODE_CHAR (coding, src, src_base, src_end,
2735d060 2063 CHARSET_FROM_ID (charset_ID), code, c);
065e3595
KH
2064 if (c < 0)
2065 goto invalid_code;
df7492f9 2066 }
df7492f9
KH
2067 *nbytes = src - src_base;
2068 *nchars = consumed_chars;
ff0dacd7 2069 if (id)
2735d060 2070 *id = charset_ID;
e951386e 2071 return (mseq_found ? -c : c);
df7492f9
KH
2072
/* Not reached by any explicit goto in this function; presumably
   ONE_MORE_BYTE jumps here when SRC reaches SRC_END -- confirm.  */
2073 no_more_source:
2074 return -2;
2075
2076 invalid_code:
2077 return -1;
2078}
2079
2080
e951386e 2081/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
df7492f9 2082
e951386e
KH
2083/* Handle these composition sequence ('|': the end of header elements,
2084 BYTES and CHARS >= 0xA0):
df7492f9 2085
e951386e
KH
2086 (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2087 (2) altchar composition: 0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2088 (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
df7492f9 2089
e951386e 2090 and these old form:
1a4990fb 2091
e951386e
KH
2092 (4) relative composition: 0x80 | MSEQ ... MSEQ
2093 (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
df7492f9 2094
e951386e
KH
2095 When the starter 0x80 and the following header elements are found,
2096 this annotation header is produced.
df7492f9 2097
e951386e 2098 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
df7492f9 2099
e951386e
KH
2100 NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2101 NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
df7492f9 2102
e951386e
KH
2103 Then, upon reading the following elements, these codes are produced
2104 until the composition end is found:
df7492f9 2105
e951386e
KH
2106 (1) CHAR ... CHAR
2107 (2) ALT ... ALT CHAR ... CHAR
2108 (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2109 (4) CHAR ... CHAR
2110 (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
4ed46869 2111
e951386e
KH
2112 When the composition end is found, LENGTH and NCHARS in the
2113 annotation header are updated as below:
b73bfc1c 2114
e951386e
KH
2115 (1) LENGTH: unchanged, NCHARS: unchanged
2116 (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2117 (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2118 (4) LENGTH: unchanged, NCHARS: number of CHARs
2119 (5) LENGTH: unchanged, NCHARS: number of CHARs
df7492f9 2120
e951386e
KH
2121 If an error is found while composing, the annotation header is
2122 changed to the original composition header (plus filler -1s) as
2123 below:
2124
2125 (1),(2),(3) [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2126 (5) [ 0x80 0xFF -1 -1 -1 ]
2127
2128 and the sequence [ -2 DECODED-RULE ] is changed to the original
2129 byte sequence as below:
2130 o the original byte sequence is B: [ B -1 ]
2131 o the original byte sequence is B1 B2: [ B1 B2 ]
2132
2133 Most of the routines are implemented by macros because many
2134 variables and labels in the caller decode_coding_emacs_mule must be
2135 accessible, and they are usually called just once (and thus don't
2136 increase the size of the compiled object). */
2137
/* Decode an Emacs 20 style composition rule byte.

   C must be a modifiable int holding the rule byte; it is reduced by
   0xA0 in place.  If the result lies outside 0..80, control jumps to
   the `invalid_code' label of the enclosing function.  Otherwise the
   value is split into a quotient and a remainder by 9, a 4 in either
   half is replaced by 10, and RULE is set to COMPOSITION_ENCODE_RULE
   of the two halves.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)			\
  do {									\
    int ref_g, ref_n;							\
									\
    c -= 0xA0;								\
    if (! (c >= 0 && c < 81))						\
      goto invalid_code;						\
    ref_g = c / 9;							\
    ref_n = c % 9;							\
    if (ref_g == 4)							\
      ref_g = 10;							\
    if (ref_n == 4)							\
      ref_n = 10;							\
    rule = COMPOSITION_ENCODE_RULE (ref_g, ref_n);			\
  } while (0)
2154
2155
e951386e
KH
/* Decode an Emacs 21 style composition rule, which occupies two
   bytes.

   C holds the first byte on entry; the second is fetched with
   ONE_MORE_BYTE, which overwrites C.  Each byte minus 0x20 must lie
   in 0..80, otherwise control jumps to the `invalid_code' label of
   the enclosing function.  On success RULE is set to
   COMPOSITION_ENCODE_RULE of the two adjusted values.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)			\
  do {									\
    int ref_g = c - 0x20;						\
    int ref_n;								\
									\
    if (! (ref_g >= 0 && ref_g < 81))					\
      goto invalid_code;						\
    ONE_MORE_BYTE (c);							\
    ref_n = c - 0x20;							\
    if (! (ref_n >= 0 && ref_n < 81))					\
      goto invalid_code;						\
    rule = COMPOSITION_ENCODE_RULE (ref_g, ref_n);			\
  } while (0)
2173
2174
e951386e
KH
/* Start of an Emacs 21 style composition.

   On entry C holds the METHOD byte (0xF2 plus the composition
   method).  The next two source bytes are read with ONE_MORE_BYTE:
   BYTES (0xA0 plus the byte length of the composition data) and CHARS
   (0xA0 plus the number of composed characters).

   Control jumps to the caller's `invalid_code' label when the byte
   length is below 3, when a relative composition has a byte length
   other than 4, or when the character count is not in
   1..MAX_COMPOSITION_COMPONENTS-1.

   Otherwise CMP_STATUS is set up for the new composition and the
   composition annotation header is appended to CHARBUF with
   ADD_COMPOSITION_DATA.  */

#define DECODE_EMACS_MULE_21_COMPOSITION()				\
  do {									\
    enum composition_method cmp_method = c - 0xF2;			\
    int hdr_nbytes, hdr_nchars;						\
									\
    ONE_MORE_BYTE (c);							\
    if (c < 0)								\
      goto invalid_code;						\
    hdr_nbytes = c - 0xA0;						\
    if (hdr_nbytes < 3)							\
      goto invalid_code;						\
    if (cmp_method == COMPOSITION_RELATIVE && hdr_nbytes != 4)		\
      goto invalid_code;						\
    ONE_MORE_BYTE (c);							\
    hdr_nchars = c - 0xA0;						\
    if (! (hdr_nchars > 0 && hdr_nchars < MAX_COMPOSITION_COMPONENTS))	\
      goto invalid_code;						\
    cmp_status->old_form = 0;						\
    cmp_status->method = cmp_method;					\
    /* A relative composition has no component list to read.  */	\
    cmp_status->state = (cmp_method == COMPOSITION_RELATIVE		\
			 ? COMPOSING_CHAR				\
			 : COMPOSING_COMPONENT_CHAR);			\
    cmp_status->length = MAX_ANNOTATION_LENGTH;				\
    cmp_status->nchars = hdr_nchars;					\
    cmp_status->ncomps = hdr_nbytes - 4;				\
    ADD_COMPOSITION_DATA (charbuf, hdr_nchars, hdr_nbytes, cmp_method);	\
  } while (0)
93dec019 2206
aa72b389 2207
e951386e
KH
/* Start of an Emacs 20 style (old form) relative composition.
   Reset CMP_STATUS for it, with zero characters and components seen
   so far, and append a composition annotation header with zero NCHARS
   and NBYTES to CHARBUF.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()			\
  do {									\
    cmp_status->ncomps = 0;						\
    cmp_status->nchars = 0;						\
    cmp_status->length = MAX_ANNOTATION_LENGTH;				\
    cmp_status->state = COMPOSING_CHAR;					\
    cmp_status->method = COMPOSITION_RELATIVE;				\
    cmp_status->old_form = 1;						\
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);		\
  } while (0)
2219
2220
/* Start of an Emacs 20 style (old form) rule-base composition.
   Reset CMP_STATUS for it, with zero characters and components seen
   so far, and append a composition annotation header with zero NCHARS
   and NBYTES to CHARBUF.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()			\
  do {									\
    cmp_status->ncomps = 0;						\
    cmp_status->nchars = 0;						\
    cmp_status->length = MAX_ANNOTATION_LENGTH;				\
    cmp_status->state = COMPOSING_CHAR;					\
    cmp_status->method = COMPOSITION_WITH_RULE;				\
    cmp_status->old_form = 1;						\
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);		\
  } while (0)
2232
2233
e951386e
KH
/* Handle the byte that follows a composition starter (0x80) and begin
   the matching kind of composition:

     0xF2 + a method between COMPOSITION_RELATIVE and
       COMPOSITION_WITH_RULE_ALTCHARS	Emacs 21 style header
     0xA0..0xBF				old form relative composition;
					the byte is un-read so that it
					is parsed again as a component
     0xFF				old form rule-base composition

   Any other byte, or a negative value from ONE_MORE_BYTE, makes
   control jump to the caller's `invalid_code' label.  */

#define DECODE_EMACS_MULE_COMPOSITION_START()				\
  do {									\
    const unsigned char *after_starter = src;				\
									\
    ONE_MORE_BYTE (c);							\
    if (c >= 0								\
	&& c - 0xF2 >= COMPOSITION_RELATIVE				\
	&& c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)			\
      DECODE_EMACS_MULE_21_COMPOSITION ();				\
    else if (c >= 0xA0 && c < 0xC0)					\
      {									\
	DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();			\
	/* Re-read C as a composition component.  */			\
	src = after_starter;						\
      }									\
    else if (c == 0xFF)							\
      DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();			\
    else								\
      goto invalid_code;						\
  } while (0)
2257
/* Close the composition in progress after it ended normally.  The
   annotation header lies CMP_STATUS->length elements before CHARBUF.
   For an old-form composition its NCHARS slot (element 2) receives
   the number of characters actually read.  Otherwise, for methods
   beyond COMPOSITION_RELATIVE, its LENGTH slot (element 0) is set to
   NCHARS minus the accumulated length.  The state then returns to
   COMPOSING_NO.  */

#define EMACS_MULE_COMPOSITION_END()					\
  do {									\
    int *cmp_header = charbuf - cmp_status->length;			\
									\
    if (cmp_status->old_form)						\
      cmp_header[2] = cmp_status->nchars;				\
    else if (cmp_status->method > COMPOSITION_RELATIVE)			\
      cmp_header[0] = cmp_header[2] - cmp_status->length;		\
    cmp_status->state = COMPOSING_NO;					\
  } while (0)
2268
2269
2270static int
cf84bb53
JB
2271emacs_mule_finish_composition (int *charbuf,
2272 struct composition_status *cmp_status)
e951386e
KH
2273{
2274 int idx = - cmp_status->length;
2275 int new_chars;
2276
2277 if (cmp_status->old_form && cmp_status->nchars > 0)
2278 {
2279 charbuf[idx + 2] = cmp_status->nchars;
2280 new_chars = 0;
2281 if (cmp_status->method == COMPOSITION_WITH_RULE
2282 && cmp_status->state == COMPOSING_CHAR)
2283 {
2284 /* The last rule was invalid. */
2285 int rule = charbuf[-1] + 0xA0;
2286
2287 charbuf[-2] = BYTE8_TO_CHAR (rule);
2288 charbuf[-1] = -1;
2289 new_chars = 1;
2290 }
2291 }
2292 else
2293 {
2294 charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2295
2296 if (cmp_status->method == COMPOSITION_WITH_RULE)
2297 {
2298 charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2299 charbuf[idx++] = -3;
2300 charbuf[idx++] = 0;
2301 new_chars = 1;
2302 }
2303 else
2304 {
2305 int nchars = charbuf[idx + 1] + 0xA0;
2306 int nbytes = charbuf[idx + 2] + 0xA0;
2307
2308 charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2309 charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2310 charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2311 charbuf[idx++] = -1;
2312 new_chars = 4;
2313 }
2314 }
2315 cmp_status->state = COMPOSING_NO;
2316 return new_chars;
2317}
2318
/* If a composition is in progress, abandon it with
   emacs_mule_finish_composition and advance the caller's CHAR_OFFSET
   by the count that function returns.  Does nothing when the state is
   COMPOSING_NO.  */

#define EMACS_MULE_MAYBE_FINISH_COMPOSITION()				\
  do {									\
    if (cmp_status->state == COMPOSING_NO)				\
      break;								\
    char_offset += emacs_mule_finish_composition (charbuf, cmp_status);	\
  } while (0)
2324
aa72b389
KH
2325
/* Decode emacs-mule bytes from CODING->source, starting at offset
   CODING->consumed, into CODING->charbuf, starting at index
   CODING->charbuf_used.  The output consists of characters together
   with charset and composition annotations.

   The loop stops when the output position reaches CHARBUF_END (which
   leaves room for three annotations), when emacs_mule_char reports a
   truncated sequence (-2), or when control arrives at
   `no_more_source' (presumably via ONE_MORE_BYTE, defined outside
   this section -- confirm).

   On return CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used are updated, and CODING->errors has been
   incremented once per invalid byte.  A composition still open at the
   end is either terminated (last block) or saved in
   CMP_STATUS->carryover for the next call.  */
2326static void
971de7fb 2327decode_coding_emacs_mule (struct coding_system *coding)
aa72b389 2328{
8f924df7
KH
2329 const unsigned char *src = coding->source + coding->consumed;
2330 const unsigned char *src_end = coding->source + coding->src_bytes;
2331 const unsigned char *src_base;
69a80ea3 2332 int *charbuf = coding->charbuf + coding->charbuf_used;
ad1746f5
KH
2333 /* We may produce two annotations (charset and composition) in one
2334 loop and one more charset annotation at the end. */
69a80ea3 2335 int *charbuf_end
df80c7f0 2336 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
a53e2e89 2337 EMACS_INT consumed_chars = 0, consumed_chars_base;
df7492f9 2338 int multibytep = coding->src_multibyte;
a53e2e89
EZ
2339 EMACS_INT char_offset = coding->produced_char;
2340 EMACS_INT last_offset = char_offset;
ff0dacd7 2341 int last_id = charset_ascii;
2735d060 2342 int eol_dos =
0a9564cb 2343 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 2344 int byte_after_cr = -1;
e951386e 2345 struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
aa72b389 2346
e951386e
KH
/* A composition is still open (see the carryover saved at
   `no_more_source' below): put its saved elements back at the head of
   this call's output.  */
2347 if (cmp_status->state != COMPOSING_NO)
2348 {
2349 int i;
2350
2351 for (i = 0; i < cmp_status->length; i++)
2352 *charbuf++ = cmp_status->carryover[i];
2353 coding->annotated = 1;
2354 }
2355
aa72b389
KH
2356 while (1)
2357 {
ee05f961 2358 int c, id IF_LINT (= 0);
df7492f9 2359
/* Remember where this iteration started so that it can be undone on
   an invalid code or when the input runs out.  */
aa72b389 2360 src_base = src;
df7492f9
KH
2361 consumed_chars_base = consumed_chars;
2362
2363 if (charbuf >= charbuf_end)
b71f6f73
KH
2364 {
/* The byte pre-read after a CR has not been processed yet; step
   SRC_BASE back so that it is read again on the next call.  */
2365 if (byte_after_cr >= 0)
2366 src_base--;
2367 break;
2368 }
aa72b389 2369
119852e7
KH
2370 if (byte_after_cr >= 0)
2371 c = byte_after_cr, byte_after_cr = -1;
2372 else
2373 ONE_MORE_BYTE (c);
e951386e
KH
2374
/* A negative C or the composition starter 0x80 ends any composition
   in progress.  A negative C is stored negated as one character;
   0x80 starts parsing a new composition header.  */
2375 if (c < 0 || c == 0x80)
065e3595 2376 {
e951386e
KH
2377 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2378 if (c < 0)
2379 {
2380 *charbuf++ = -c;
2381 char_offset++;
2382 }
2383 else
2384 DECODE_EMACS_MULE_COMPOSITION_START ();
2385 continue;
065e3595 2386 }
e951386e
KH
2387
2388 if (c < 0x80)
aa72b389 2389 {
/* With DOS end-of-line, read the byte following a CR now; it becomes
   C on the next iteration.  */
2735d060 2390 if (eol_dos && c == '\r')
119852e7 2391 ONE_MORE_BYTE (byte_after_cr);
e951386e
KH
2392 id = charset_ascii;
2393 if (cmp_status->state != COMPOSING_NO)
2394 {
/* An ASCII byte terminates an old-form composition; in the new form
   it counts as one byte of the component list.  */
2395 if (cmp_status->old_form)
2396 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2397 else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2398 cmp_status->ncomps--;
2399 }
2400 }
2401 else
2402 {
ee05f961 2403 int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
75f80e63
EZ
2404 /* emacs_mule_char can load a charset map from a file, which
2405 allocates a large structure and might cause buffer text
2406 to be relocated as result. Thus, we need to remember the
ad1746f5 2407 original pointer to buffer text, and fix up all related
75f80e63
EZ
2408 pointers after the call. */
2409 const unsigned char *orig = coding->source;
2410 EMACS_INT offset;
e951386e
KH
2411
/* Parse the whole sequence again from SRC_BASE, including the byte
   already read into C.  */
2412 c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2413 cmp_status);
75f80e63
EZ
2414 offset = coding->source - orig;
2415 if (offset)
2416 {
2417 src += offset;
2418 src_base += offset;
2419 src_end += offset;
2420 }
e951386e
KH
2421 if (c < 0)
2422 {
/* -1 is an invalid byte and -2 a truncated sequence; any other
   negative value is an old-style component or rule, handled below.  */
2423 if (c == -1)
2424 goto invalid_code;
2425 if (c == -2)
2426 break;
2427 }
2428 src = src_base + nbytes;
2429 consumed_chars = consumed_chars_base + nchars;
2430 if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2431 cmp_status->ncomps -= nchars;
2432 }
2433
ad1746f5 2434 /* Now if C >= 0, we found a normally encoded character, if C <
e951386e
KH
2435 0, we found an old-style composition component character or
2436 rule. */
2437
2438 if (cmp_status->state == COMPOSING_NO)
2439 {
/* Outside a composition: when the charset changes, emit a charset
   annotation for the run that just ended (unless that run was
   ASCII).  */
2440 if (last_id != id)
2441 {
2442 if (last_id != charset_ascii)
2443 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2444 last_id);
2445 last_id = id;
2446 last_offset = char_offset;
2447 }
df7492f9
KH
2448 *charbuf++ = c;
2449 char_offset++;
aa72b389 2450 }
e951386e 2451 else if (cmp_status->state == COMPOSING_CHAR)
df7492f9 2452 {
e951386e
KH
2453 if (cmp_status->old_form)
2454 {
/* In the old form, a normally encoded character (C >= 0) means the
   composition is over.  */
2455 if (c >= 0)
2456 {
2457 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2458 *charbuf++ = c;
2459 char_offset++;
2460 }
2461 else
2462 {
2463 *charbuf++ = -c;
2464 cmp_status->nchars++;
2465 cmp_status->length++;
2466 if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2467 EMACS_MULE_COMPOSITION_END ();
2468 else if (cmp_status->method == COMPOSITION_WITH_RULE)
2469 cmp_status->state = COMPOSING_RULE;
2470 }
2471 }
df7492f9 2472 else
e951386e
KH
2473 {
/* New form: NCHARS counts down the characters still expected.  */
2474 *charbuf++ = c;
2475 cmp_status->length++;
2476 cmp_status->nchars--;
2477 if (cmp_status->nchars == 0)
2478 EMACS_MULE_COMPOSITION_END ();
2479 }
df7492f9 2480 }
e951386e 2481 else if (cmp_status->state == COMPOSING_RULE)
df7492f9 2482 {
e951386e 2483 int rule;
ff0dacd7 2484
e951386e 2485 if (c >= 0)
df7492f9 2486 {
e951386e
KH
2487 EMACS_MULE_COMPOSITION_END ();
2488 *charbuf++ = c;
2489 char_offset++;
df7492f9 2490 }
e951386e 2491 else
ff0dacd7 2492 {
e951386e
KH
/* The rule is stored as the pair [ -2 RULE ].  */
2493 c = -c;
2494 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2495 if (rule < 0)
2496 goto invalid_code;
2497 *charbuf++ = -2;
2498 *charbuf++ = rule;
2499 cmp_status->length += 2;
2500 cmp_status->state = COMPOSING_CHAR;
ff0dacd7 2501 }
e951386e
KH
2502 }
2503 else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2504 {
df7492f9 2505 *charbuf++ = c;
e951386e
KH
2506 cmp_status->length++;
/* NCOMPS is the number of component bytes still expected: zero means
   the component list is complete, and a negative value means the
   data overran the declared byte length.  */
2507 if (cmp_status->ncomps == 0)
2508 cmp_status->state = COMPOSING_CHAR;
2509 else if (cmp_status->ncomps > 0)
2510 {
2511 if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2512 cmp_status->state = COMPOSING_COMPONENT_RULE;
2513 }
2514 else
2515 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
df7492f9 2516 }
e951386e
KH
2517 else /* COMPOSING_COMPONENT_RULE */
2518 {
2519 int rule;
2520
2521 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2522 if (rule < 0)
2523 goto invalid_code;
2524 *charbuf++ = -2;
2525 *charbuf++ = rule;
2526 cmp_status->length += 2;
2527 cmp_status->ncomps--;
2528 if (cmp_status->ncomps > 0)
2529 cmp_status->state = COMPOSING_COMPONENT_CHAR;
2530 else
2531 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2532 }
2533 continue;
2534
/* Error recovery: abandon any composition, rewind to the start of
   this iteration, and emit just its first byte -- ASCII as is, any
   other byte as an eight-bit character -- counting one error.  */
df7492f9 2535 invalid_code:
e951386e 2536 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
df7492f9
KH
2537 src = src_base;
2538 consumed_chars = consumed_chars_base;
2539 ONE_MORE_BYTE (c);
2540 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 2541 char_offset++;
df7492f9
KH
2542 coding->errors++;
2543 }
2544
2545 no_more_source:
e951386e
KH
/* A composition is still open.  On the last block it is terminated
   here; otherwise its elements are removed from the output and saved
   in the carryover array, to be restored at the top of this
   function.  */
2546 if (cmp_status->state != COMPOSING_NO)
2547 {
2548 if (coding->mode & CODING_MODE_LAST_BLOCK)
2549 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2550 else
2551 {
2552 int i;
2553
2554 charbuf -= cmp_status->length;
2555 for (i = 0; i < cmp_status->length; i++)
2556 cmp_status->carryover[i] = charbuf[i];
2557 }
2558 }
/* Emit the charset annotation for the final non-ASCII run.  */
ff0dacd7 2559 if (last_id != charset_ascii)
69a80ea3 2560 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
/* Only input up to the start of the last, unfinished iteration counts
   as consumed.  */
2561 coding->consumed_char += consumed_chars_base;
2562 coding->consumed = src_base - coding->source;
2563 coding->charbuf_used = charbuf - coding->charbuf;
2564}
2565
2566
/* Store in CODES, an array of at least two elements, the leading
   code(s) used for the emacs-mule charset ID:

     ID < 0xA0		CODES[0] = ID,   CODES[1] = 0 (no second code)
     0xA0 <= ID < 0xE0	CODES[0] = 0x9A, CODES[1] = ID
     0xE0 <= ID < 0xF0	CODES[0] = 0x9B, CODES[1] = ID
     0xF0 <= ID < 0xF5	CODES[0] = 0x9C, CODES[1] = ID
     0xF5 <= ID		CODES[0] = 0x9D, CODES[1] = ID

   ID is evaluated more than once, so it must not have side effects.

   The definition deliberately does not end in a semicolon: the caller
   supplies it, which keeps the macro usable as the body of an `if'
   that has an `else'.  */

#define EMACS_MULE_LEADING_CODES(id, codes)		\
  do {							\
    if ((id) < 0xA0)					\
      (codes)[0] = (id), (codes)[1] = 0;		\
    else if ((id) < 0xE0)				\
      (codes)[0] = 0x9A, (codes)[1] = (id);		\
    else if ((id) < 0xF0)				\
      (codes)[0] = 0x9B, (codes)[1] = (id);		\
    else if ((id) < 0xF5)				\
      (codes)[0] = 0x9C, (codes)[1] = (id);		\
    else						\
      (codes)[0] = 0x9D, (codes)[1] = (id);		\
  } while (0)
2580
aa72b389 2581
/* Encode the elements of CODING->charbuf (the first
   CODING->charbuf_used of them) as emacs-mule bytes, written to
   CODING->destination starting at offset CODING->produced.

   Negative elements start an annotation.  A charset annotation sets
   the charset preferred for the characters that follow; composition
   annotations are skipped.  ASCII characters and eight-bit bytes are
   emitted as single bytes.  Any other character is emitted as one or
   two leading codes followed by one or two position bytes with the
   high bit set.  A character that no charset in the list can encode
   is replaced by CODING->default_char.

   Updates CODING->produced and CODING->produced_char, records
   CODING_RESULT_SUCCESS and returns 0.  NOTE(review): how a full
   destination is handled depends on ASSURE_DESTINATION, which is
   defined outside this section.  */
df7492f9 2582static int
971de7fb 2583encode_coding_emacs_mule (struct coding_system *coding)
df7492f9
KH
2584{
2585 int multibytep = coding->dst_multibyte;
2586 int *charbuf = coding->charbuf;
2587 int *charbuf_end = charbuf + coding->charbuf_used;
2588 unsigned char *dst = coding->destination + coding->produced;
2589 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2590 int safe_room = 8;
a53e2e89 2591 EMACS_INT produced_chars = 0;
24a73b0a 2592 Lisp_Object attrs, charset_list;
df7492f9 2593 int c;
ff0dacd7 2594 int preferred_charset_id = -1;
df7492f9 2595
24a73b0a 2596 CODING_GET_INFO (coding, attrs, charset_list);
eccb6815
KH
/* Always encode against Vemacs_mule_charset_list; if the coding
   system's attributes hold a different list, overwrite it there
   too.  */
2597 if (! EQ (charset_list, Vemacs_mule_charset_list))
2598 {
2599 CODING_ATTR_CHARSET_LIST (attrs)
2600 = charset_list = Vemacs_mule_charset_list;
2601 }
df7492f9
KH
2602
2603 while (charbuf < charbuf_end)
2604 {
2605 ASSURE_DESTINATION (safe_room);
2606 c = *charbuf++;
ff0dacd7
KH
2607
2608 if (c < 0)
2609 {
2610 /* Handle an annotation. */
/* C is the negated annotation length, and CHARBUF now points at the
   element after it, which is the annotation mask.  */
2611 switch (*charbuf)
2612 {
2613 case CODING_ANNOTATE_COMPOSITION_MASK:
2614 /* Not yet implemented. */
2615 break;
2616 case CODING_ANNOTATE_CHARSET_MASK:
/* NOTE(review): CHARBUF already points past the length element here,
   so index 0 is the mask.  Confirm that index 3 (rather than 2) is
   really where the charset ID of a charset annotation is stored.  */
2617 preferred_charset_id = charbuf[3];
2618 if (preferred_charset_id >= 0
2619 && NILP (Fmemq (make_number (preferred_charset_id),
2620 charset_list)))
2621 preferred_charset_id = -1;
2622 break;
2623 default:
2624 abort ();
2625 }
/* Skip the rest of the annotation; the length element itself was
   consumed by the `*charbuf++' above.  */
2626 charbuf += -c - 1;
2627 continue;
2628 }
2629
df7492f9
KH
2630 if (ASCII_CHAR_P (c))
2631 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
2632 else if (CHAR_BYTE8_P (c))
2633 {
2634 c = CHAR_TO_BYTE8 (c);
2635 EMIT_ONE_BYTE (c);
2636 }
df7492f9 2637 else
aa72b389 2638 {
df7492f9
KH
2639 struct charset *charset;
2640 unsigned code;
2641 int dimension;
2642 int emacs_mule_id;
2643 unsigned char leading_codes[2];
2644
ff0dacd7
KH
/* Use the preferred charset when it contains C; otherwise search the
   charset list.  */
2645 if (preferred_charset_id >= 0)
2646 {
2647 charset = CHARSET_FROM_ID (preferred_charset_id);
905ca9d2
KH
2648 if (CHAR_CHARSET_P (c, charset))
2649 code = ENCODE_CHAR (charset, c);
2650 else
2651 charset = char_charset (c, charset_list, &code);
ff0dacd7
KH
2652 }
2653 else
2654 charset = char_charset (c, charset_list, &code);
df7492f9
KH
2655 if (! charset)
2656 {
/* No charset can encode C: substitute the default character.  */
2657 c = coding->default_char;
2658 if (ASCII_CHAR_P (c))
2659 {
2660 EMIT_ONE_ASCII_BYTE (c);
2661 continue;
2662 }
/* NOTE(review): the result of this second lookup is not checked; if
   char_charset could fail for the default character too, CHARSET
   would be null in the code below -- confirm that cannot happen.  */
2663 charset = char_charset (c, charset_list, &code);
2664 }
2665 dimension = CHARSET_DIMENSION (charset);
2666 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2667 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2668 EMIT_ONE_BYTE (leading_codes[0]);
/* A zero second element means there is no second leading code.  */
2669 if (leading_codes[1])
2670 EMIT_ONE_BYTE (leading_codes[1]);
/* Position bytes carry the code with the high bit of each byte set.  */
2671 if (dimension == 1)
1fa663f9 2672 EMIT_ONE_BYTE (code | 0x80);
aa72b389 2673 else
df7492f9 2674 {
1fa663f9 2675 code |= 0x8080;
df7492f9
KH
2676 EMIT_ONE_BYTE (code >> 8);
2677 EMIT_ONE_BYTE (code & 0xFF);
2678 }
aa72b389 2679 }
aa72b389 2680 }
065e3595 2681 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
2682 coding->produced_char += produced_chars;
2683 coding->produced = dst - coding->destination;
2684 return 0;
aa72b389 2685}
b73bfc1c 2686
4ed46869 2687\f
df7492f9 2688/*** 7. ISO2022 handlers ***/
4ed46869
KH
2689
2690/* The following note describes the coding system ISO2022 briefly.
39787efd 2691 Since the intention of this note is to help understand the
5a936b46 2692 functions in this file, some parts are NOT ACCURATE or are OVERLY
39787efd 2693 SIMPLIFIED. For thorough understanding, please refer to the
5a936b46 2694 original document of ISO2022. This is equivalent to the standard
cfb43547 2695 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
4ed46869
KH
2696
2697 ISO2022 provides many mechanisms to encode several character sets
cfb43547 2698 in 7-bit and 8-bit environments. For 7-bit environments, all text
39787efd
KH
2699 is encoded using bytes less than 128. This may make the encoded
2700 text a little bit longer, but the text passes more easily through
cfb43547 2701 several types of gateway, some of which strip off the MSB (Most
8ca3766a 2702 Significant Bit).
b73bfc1c 2703
cfb43547
DL
2704 There are two kinds of character sets: control character sets and
2705 graphic character sets. The former contain control characters such
4ed46869 2706 as `newline' and `escape' to provide control functions (control
39787efd 2707 functions are also provided by escape sequences). The latter
cfb43547 2708 contain graphic characters such as 'A' and '-'. Emacs recognizes
4ed46869
KH
2709 two control character sets and many graphic character sets.
2710
2711 Graphic character sets are classified into one of the following
39787efd
KH
2712 four classes, according to the number of bytes (DIMENSION) and
2713 number of characters in one dimension (CHARS) of the set:
2714 - DIMENSION1_CHARS94
2715 - DIMENSION1_CHARS96
2716 - DIMENSION2_CHARS94
2717 - DIMENSION2_CHARS96
2718
2719 In addition, each character set is assigned an identification tag,
cfb43547 2720 unique for each set, called the "final character" (denoted as <F>
39787efd
KH
2721 hereafter). The <F> of each character set is decided by ECMA(*)
2722 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2723 (0x30..0x3F are for private use only).
4ed46869
KH
2724
2725 Note (*): ECMA = European Computer Manufacturers Association
2726
cfb43547 2727 Here are examples of graphic character sets [NAME(<F>)]:
4ed46869
KH
2728 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2729 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2730 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2731 o DIMENSION2_CHARS96 -- none for the moment
2732
39787efd 2733 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4ed46869
KH
2734 C0 [0x00..0x1F] -- control character plane 0
2735 GL [0x20..0x7F] -- graphic character plane 0
2736 C1 [0x80..0x9F] -- control character plane 1
2737 GR [0xA0..0xFF] -- graphic character plane 1
2738
2739 A control character set is directly designated and invoked to C0 or
39787efd
KH
2740 C1 by an escape sequence. The most common case is that:
2741 - ISO646's control character set is designated/invoked to C0, and
2742 - ISO6429's control character set is designated/invoked to C1,
2743 and usually these designations/invocations are omitted in encoded
2744 text. In a 7-bit environment, only C0 can be used, and a control
2745 character for C1 is encoded by an appropriate escape sequence to
2746 fit into the environment. All control characters for C1 are
2747 defined to have corresponding escape sequences.
4ed46869
KH
2748
2749 A graphic character set is at first designated to one of four
2750 graphic registers (G0 through G3), then these graphic registers are
2751 invoked to GL or GR. These designations and invocations can be
2752 done independently. The most common case is that G0 is invoked to
39787efd
KH
2753 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2754 these invocations and designations are omitted in encoded text.
2755 In a 7-bit environment, only GL can be used.
4ed46869 2756
39787efd
KH
2757 When a graphic character set of CHARS94 is invoked to GL, codes
2758 0x20 and 0x7F of the GL area work as control characters SPACE and
2759 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2760 be used.
4ed46869
KH
2761
2762 There are two ways of invocation: locking-shift and single-shift.
2763 With locking-shift, the invocation lasts until the next different
39787efd
KH
2764 invocation, whereas with single-shift, the invocation affects the
2765 following character only and doesn't affect the locking-shift
2766 state. Invocations are done by the following control characters or
2767 escape sequences:
4ed46869
KH
2768
2769 ----------------------------------------------------------------------
39787efd 2770 abbrev function cntrl escape seq description
4ed46869 2771 ----------------------------------------------------------------------
39787efd
KH
2772 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2773 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2774 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2775 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2776 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2777 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2778 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2779 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2780 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4ed46869 2781 ----------------------------------------------------------------------
39787efd
KH
2782 (*) These are not used by any known coding system.
2783
2784 Control characters for these functions are defined by macros
2785 ISO_CODE_XXX in `coding.h'.
4ed46869 2786
39787efd 2787 Designations are done by the following escape sequences:
4ed46869
KH
2788 ----------------------------------------------------------------------
2789 escape sequence description
2790 ----------------------------------------------------------------------
2791 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2792 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2793 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2794 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2795 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2796 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2797 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2798 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2799 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2800 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2801 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2802 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2803 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2804 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2805 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2806 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2807 ----------------------------------------------------------------------
2808
2809 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
39787efd 2810 of dimension 1, chars 94, and final character <F>, etc...
4ed46869
KH
2811
2812 Note (*): Although these designations are not allowed in ISO2022,
2813 Emacs accepts them on decoding, and produces them on encoding
39787efd 2814 CHARS96 character sets in a coding system which is characterized as
4ed46869
KH
2815 7-bit environment, non-locking-shift, and non-single-shift.
2816
2817 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
df7492f9 2818 '(' must be omitted. We refer to this as "short-form" hereafter.
4ed46869 2819
cfb43547 2820 Now you may notice that there are a lot of ways of encoding the
39787efd
KH
2821 same multilingual text in ISO2022. Actually, there exist many
2822 coding systems such as Compound Text (used in X11's inter client
8ca3766a
DL
2823 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2824 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
4ed46869
KH
2825 localized platforms), and all of these are variants of ISO2022.
2826
2827 In addition to the above, Emacs handles two more kinds of escape
2828 sequences: ISO6429's direction specification and Emacs' private
2829 sequence for specifying character composition.
2830
39787efd 2831 ISO6429's direction specification takes the following form:
4ed46869
KH
2832 o CSI ']' -- end of the current direction
2833 o CSI '0' ']' -- end of the current direction
2834 o CSI '1' ']' -- start of left-to-right text
2835 o CSI '2' ']' -- start of right-to-left text
2836 The control character CSI (0x9B: control sequence introducer) is
39787efd
KH
2837 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2838
2839 Character composition specification takes the following form:
ec6d2bb8
KH
2840 o ESC '0' -- start relative composition
2841 o ESC '1' -- end composition
2842 o ESC '2' -- start rule-base composition (*)
2843 o ESC '3' -- start relative composition with alternate chars (**)
2844 o ESC '4' -- start rule-base composition with alternate chars (**)
b73bfc1c 2845 Since these are not standard escape sequences of any ISO standard,
cfb43547 2846 the use of them with these meanings is restricted to Emacs only.
ec6d2bb8 2847
5a936b46
DL
2848 (*) This form is used only in Emacs 20.7 and older versions,
2849 but newer versions can safely decode it.
cfb43547 2850 (**) This form is used only in Emacs 21.1 and newer versions,
5a936b46 2851 and older versions can't decode it.
ec6d2bb8 2852
cfb43547 2853 Here's a list of example usages of these composition escape
b73bfc1c 2854 sequences (categorized by `enum composition_method').
ec6d2bb8 2855
b73bfc1c 2856 COMPOSITION_RELATIVE:
ec6d2bb8 2857 ESC 0 CHAR [ CHAR ] ESC 1
8ca3766a 2858 COMPOSITION_WITH_RULE:
ec6d2bb8 2859 ESC 2 CHAR [ RULE CHAR ] ESC 1
b73bfc1c 2860 COMPOSITION_WITH_ALTCHARS:
ec6d2bb8 2861 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
b73bfc1c 2862 COMPOSITION_WITH_RULE_ALTCHARS:
ec6d2bb8 2863 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
4ed46869 2864
74ab6df5 2865static enum iso_code_class_type iso_code_class[256];
4ed46869 2866
df7492f9
KH
/* Return nonzero if the charset whose id is ID is safe for CODING,
   i.e. ID is within 0..CODING->max_charset_id and the byte for ID in
   CODING->safe_charsets is not 255 (the value that
   setup_iso_safe_charsets leaves for a charset with no register).

   A negative ID is never safe.  Checking that first keeps the table
   lookup inside the array even when a charset-table lookup yielded a
   negative "no such charset" value.  ID is evaluated more than once.  */
#define SAFE_CHARSET_P(coding, id)                      \
  ((id) >= 0                                            \
   && (id) <= (coding)->max_charset_id                  \
   && (coding)->safe_charsets[id] != 255)
df7492f9 2870
df7492f9 2871static void
971de7fb 2872setup_iso_safe_charsets (Lisp_Object attrs)
df7492f9
KH
2873{
2874 Lisp_Object charset_list, safe_charsets;
2875 Lisp_Object request;
2876 Lisp_Object reg_usage;
2877 Lisp_Object tail;
2878 int reg94, reg96;
2879 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2880 int max_charset_id;
2881
2882 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2883 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2884 && ! EQ (charset_list, Viso_2022_charset_list))
2885 {
2886 CODING_ATTR_CHARSET_LIST (attrs)
2887 = charset_list = Viso_2022_charset_list;
2888 ASET (attrs, coding_attr_safe_charsets, Qnil);
2889 }
2890
2891 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2892 return;
2893
2894 max_charset_id = 0;
2895 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2896 {
2897 int id = XINT (XCAR (tail));
2898 if (max_charset_id < id)
2899 max_charset_id = id;
2900 }
d46c5b12 2901
1b3b981b
AS
2902 safe_charsets = make_uninit_string (max_charset_id + 1);
2903 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
df7492f9
KH
2904 request = AREF (attrs, coding_attr_iso_request);
2905 reg_usage = AREF (attrs, coding_attr_iso_usage);
2906 reg94 = XINT (XCAR (reg_usage));
2907 reg96 = XINT (XCDR (reg_usage));
2908
2909 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2910 {
2911 Lisp_Object id;
2912 Lisp_Object reg;
2913 struct charset *charset;
2914
2915 id = XCAR (tail);
2916 charset = CHARSET_FROM_ID (XINT (id));
bf16eb23 2917 reg = Fcdr (Fassq (id, request));
df7492f9 2918 if (! NILP (reg))
8f924df7 2919 SSET (safe_charsets, XINT (id), XINT (reg));
df7492f9
KH
2920 else if (charset->iso_chars_96)
2921 {
2922 if (reg96 < 4)
8f924df7 2923 SSET (safe_charsets, XINT (id), reg96);
df7492f9
KH
2924 }
2925 else
2926 {
2927 if (reg94 < 4)
8f924df7 2928 SSET (safe_charsets, XINT (id), reg94);
df7492f9
KH
2929 }
2930 }
2931 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2932}
d46c5b12 2933
b6871cc7 2934
4ed46869 2935/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ad1746f5 2936 Check if a text is encoded in one of ISO-2022 based coding systems.
ff0dacd7 2937 If it is, return 1, else return 0. */
4ed46869 2938
0a28aafb 2939static int
cf84bb53
JB
2940detect_coding_iso_2022 (struct coding_system *coding,
2941 struct coding_detection_info *detect_info)
4ed46869 2942{
8f924df7
KH
2943 const unsigned char *src = coding->source, *src_base = src;
2944 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9 2945 int multibytep = coding->src_multibyte;
ff0dacd7 2946 int single_shifting = 0;
0e48bb22 2947 int id;
df7492f9 2948 int c, c1;
a53e2e89 2949 EMACS_INT consumed_chars = 0;
df7492f9 2950 int i;
ff0dacd7
KH
2951 int rejected = 0;
2952 int found = 0;
cee53ed4 2953 int composition_count = -1;
ff0dacd7
KH
2954
2955 detect_info->checked |= CATEGORY_MASK_ISO;
df7492f9
KH
2956
2957 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2958 {
2959 struct coding_system *this = &(coding_categories[i]);
2960 Lisp_Object attrs, val;
2961
c6b278e7
KH
2962 if (this->id < 0)
2963 continue;
df7492f9
KH
2964 attrs = CODING_ID_ATTRS (this->id);
2965 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
1b3b981b 2966 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
df7492f9
KH
2967 setup_iso_safe_charsets (attrs);
2968 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7 2969 this->max_charset_id = SCHARS (val) - 1;
1b3b981b 2970 this->safe_charsets = SDATA (val);
df7492f9
KH
2971 }
2972
2973 /* A coding system of this category is always ASCII compatible. */
2974 src += coding->head_ascii;
3f003981 2975
ff0dacd7 2976 while (rejected != CATEGORY_MASK_ISO)
4ed46869 2977 {
065e3595 2978 src_base = src;
df7492f9 2979 ONE_MORE_BYTE (c);
4ed46869
KH
2980 switch (c)
2981 {
2982 case ISO_CODE_ESC:
74383408
KH
2983 if (inhibit_iso_escape_detection)
2984 break;
f46869e4 2985 single_shifting = 0;
df7492f9 2986 ONE_MORE_BYTE (c);
0e48bb22 2987 if (c == 'N' || c == 'O')
d46c5b12 2988 {
ae9ff118 2989 /* ESC <Fe> for SS2 or SS3. */
ff0dacd7
KH
2990 single_shifting = 1;
2991 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
4ed46869 2992 }
cee53ed4
KH
2993 else if (c == '1')
2994 {
2995 /* End of composition. */
2996 if (composition_count < 0
2997 || composition_count > MAX_COMPOSITION_COMPONENTS)
2998 /* Invalid */
2999 break;
3000 composition_count = -1;
3001 found |= CATEGORY_MASK_ISO;
3002 }
ec6d2bb8
KH
3003 else if (c >= '0' && c <= '4')
3004 {
3005 /* ESC <Fp> for start/end composition. */
cee53ed4 3006 composition_count = 0;
ec6d2bb8 3007 }
bf9cdd4e 3008 else
df7492f9 3009 {
0e48bb22
AS
3010 if (c >= '(' && c <= '/')
3011 {
3012 /* Designation sequence for a charset of dimension 1. */
3013 ONE_MORE_BYTE (c1);
3014 if (c1 < ' ' || c1 >= 0x80
3015 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3016 /* Invalid designation sequence. Just ignore. */
3017 break;
3018 }
3019 else if (c == '$')
3020 {
3021 /* Designation sequence for a charset of dimension 2. */
3022 ONE_MORE_BYTE (c);
3023 if (c >= '@' && c <= 'B')
3024 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
3025 id = iso_charset_table[1][0][c];
3026 else if (c >= '(' && c <= '/')
3027 {
3028 ONE_MORE_BYTE (c1);
3029 if (c1 < ' ' || c1 >= 0x80
3030 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3031 /* Invalid designation sequence. Just ignore. */
3032 break;
3033 }
3034 else
3035 /* Invalid designation sequence. Just ignore it. */
3036 break;
3037 }
3038 else
3039 {
3040 /* Invalid escape sequence. Just ignore it. */
3041 break;
3042 }
d46c5b12 3043
0e48bb22
AS
3044 /* We found a valid designation sequence for CHARSET. */
3045 rejected |= CATEGORY_MASK_ISO_8BIT;
3046 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3047 id))
3048 found |= CATEGORY_MASK_ISO_7;
3049 else
3050 rejected |= CATEGORY_MASK_ISO_7;
3051 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3052 id))
3053 found |= CATEGORY_MASK_ISO_7_TIGHT;
3054 else
3055 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3056 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3057 id))
3058 found |= CATEGORY_MASK_ISO_7_ELSE;
3059 else
3060 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3061 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3062 id))
3063 found |= CATEGORY_MASK_ISO_8_ELSE;
3064 else
3065 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3066 }
4ed46869
KH
3067 break;
3068
4ed46869 3069 case ISO_CODE_SO:
d46c5b12 3070 case ISO_CODE_SI:
ff0dacd7 3071 /* Locking shift out/in. */
74383408
KH
3072 if (inhibit_iso_escape_detection)
3073 break;
f46869e4 3074 single_shifting = 0;
ff0dacd7 3075 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
d46c5b12
KH
3076 break;
3077
4ed46869 3078 case ISO_CODE_CSI:
ff0dacd7 3079 /* Control sequence introducer. */
f46869e4 3080 single_shifting = 0;
ff0dacd7
KH
3081 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3082 found |= CATEGORY_MASK_ISO_8_ELSE;
3083 goto check_extra_latin;
3084
4ed46869
KH
3085 case ISO_CODE_SS2:
3086 case ISO_CODE_SS3:
ff0dacd7
KH
3087 /* Single shift. */
3088 if (inhibit_iso_escape_detection)
3089 break;
75e2a253 3090 single_shifting = 0;
ff0dacd7
KH
3091 rejected |= CATEGORY_MASK_ISO_7BIT;
3092 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3093 & CODING_ISO_FLAG_SINGLE_SHIFT)
0e48bb22
AS
3094 {
3095 found |= CATEGORY_MASK_ISO_8_1;
3096 single_shifting = 1;
3097 }
ff0dacd7
KH
3098 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3099 & CODING_ISO_FLAG_SINGLE_SHIFT)
0e48bb22
AS
3100 {
3101 found |= CATEGORY_MASK_ISO_8_2;
3102 single_shifting = 1;
3103 }
75e2a253
KH
3104 if (single_shifting)
3105 break;
0e48bb22
AS
3106 check_extra_latin:
3107 if (! VECTORP (Vlatin_extra_code_table)
3108 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3109 {
3110 rejected = CATEGORY_MASK_ISO;
3111 break;
3112 }
3113 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3114 & CODING_ISO_FLAG_LATIN_EXTRA)
3115 found |= CATEGORY_MASK_ISO_8_1;
3116 else
3117 rejected |= CATEGORY_MASK_ISO_8_1;
3118 rejected |= CATEGORY_MASK_ISO_8_2;
3119 break;
4ed46869
KH
3120
3121 default:
065e3595
KH
3122 if (c < 0)
3123 continue;
4ed46869 3124 if (c < 0x80)
f46869e4 3125 {
cee53ed4
KH
3126 if (composition_count >= 0)
3127 composition_count++;
f46869e4
KH
3128 single_shifting = 0;
3129 break;
3130 }
ff0dacd7 3131 if (c >= 0xA0)
c4825358 3132 {
ff0dacd7
KH
3133 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3134 found |= CATEGORY_MASK_ISO_8_1;
f46869e4 3135 /* Check the length of succeeding codes of the range
ff0dacd7
KH
3136 0xA0..0FF. If the byte length is even, we include
3137 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
3138 only when we are not single shifting. */
3139 if (! single_shifting
3140 && ! (rejected & CATEGORY_MASK_ISO_8_2))
f46869e4 3141 {
2735d060 3142 int len = 1;
b73bfc1c
KH
3143 while (src < src_end)
3144 {
d12bd917 3145 src_base = src;
df7492f9 3146 ONE_MORE_BYTE (c);
b73bfc1c 3147 if (c < 0xA0)
d12bd917
KH
3148 {
3149 src = src_base;
3150 break;
3151 }
2735d060 3152 len++;
b73bfc1c
KH
3153 }
3154
2735d060 3155 if (len & 1 && src < src_end)
cee53ed4
KH
3156 {
3157 rejected |= CATEGORY_MASK_ISO_8_2;
3158 if (composition_count >= 0)
2735d060 3159 composition_count += len;
cee53ed4 3160 }
f46869e4 3161 else
cee53ed4
KH
3162 {
3163 found |= CATEGORY_MASK_ISO_8_2;
3164 if (composition_count >= 0)
2735d060 3165 composition_count += len / 2;
cee53ed4 3166 }
f46869e4 3167 }
ff0dacd7 3168 break;
4ed46869 3169 }
4ed46869
KH
3170 }
3171 }
ff0dacd7
KH
3172 detect_info->rejected |= CATEGORY_MASK_ISO;
3173 return 0;
4ed46869 3174
df7492f9 3175 no_more_source:
ff0dacd7
KH
3176 detect_info->rejected |= rejected;
3177 detect_info->found |= (found & ~rejected);
df7492f9 3178 return 1;
4ed46869 3179}
ec6d2bb8 3180
4ed46869 3181
134b9549
KH
/* Record in CODING the designation to register REG of the charset
   looked up by ISO_CHARSET_TABLE (DIM, CHARS_96, FINAL).

   If FINAL is outside '0'..127, no charset is found, or the charset
   is not safe for CODING, the designation of REG is set to -2
   instead.  CHARS_96 must be an lvalue; it is set to -1 if the escape
   sequence should be kept.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
  do {                                                                  \
    int designated, previous;                                           \
                                                                        \
    if ((final) < '0' || (final) >= 128                                 \
        || (designated = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0  \
        || ! SAFE_CHARSET_P (coding, designated))                       \
      {                                                                 \
        /* Invalid designation: remember that and keep the sequence.  */ \
        CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
        (chars_96) = -1;                                                \
      }                                                                 \
    else                                                                \
      {                                                                 \
        previous = CODING_ISO_DESIGNATION (coding, reg);                \
        if (designated == charset_jisx0201_roman)                       \
          {                                                             \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)  \
              designated = charset_ascii;                               \
          }                                                             \
        else if (designated == charset_jisx0208_1978)                   \
          {                                                             \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
              designated = charset_jisx0208;                            \
          }                                                             \
        CODING_ISO_DESIGNATION (coding, reg) = designated;              \
        /* If the previous designation to REG was invalid and this     \
           one designates ASCII, keep this designation sequence        \
           too.  */                                                     \
        if (previous == -2 && designated == charset_ascii)              \
          (chars_96) = -1;                                              \
      }                                                                 \
  } while (0)
3214
d46c5b12 3215
e951386e
KH
3216/* Handle these composition sequences (ALT: alternate char):
3217
3218 (1) relative composition: ESC 0 CHAR ... ESC 1
3219 (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3220 (3) altchar composition: ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3221 (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3222
3223 When the start sequence (ESC 0/2/3/4) is found, this annotation
3224 header is produced.
3225
3226 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3227
3228 Then, upon reading CHAR or RULE (one or two bytes), these codes are
3229 produced until the end sequence (ESC 1) is found:
3230
3231 (1) CHAR ... CHAR
3232 (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3233 (3) ALT ... ALT -1 -1 CHAR ... CHAR
3234 (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3235
3236 When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3237 annotation header are updated as below:
3238
3239 (1) LENGTH: unchanged, NCHARS: number of CHARs
3240 (2) LENGTH: unchanged, NCHARS: number of CHARs
3241 (3) LENGTH: += number of ALTs + 2, NCHARS: number of CHARs
3242 (4) LENGTH: += number of ALTs * 3, NCHARS: number of CHARs
3243
3244 If an error is found while composing, the annotation header is
3245 changed to:
3246
3247 [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3248
3249 and the sequence [ -2 DECODED-RULE ] is changed to the original
3250 byte sequence as below:
3251 o the original byte sequence is B: [ B -1 ]
3252 o the original byte sequence is B1 B2: [ B1 B2 ]
3253 and the sequence [ -1 -1 ] is changed to the original byte
3254 sequence:
3255 [ ESC '0' ]
3256*/
3257
/* Decode the composition rule that starts with the byte in C1 and
   store the encoded rule in RULE (an lvalue).

   A first byte below 32 + 81 is a complete old-format rule.
   Otherwise one more byte is read from the source with ONE_MORE_BYTE,
   and 0x100 is added to the result to mark the new format.  If the
   rule is invalid, goto invalid_code.  */

#define DECODE_COMPOSITION_RULE(rule)                                   \
  do {                                                                  \
    (rule) = c1 - 32;                                                   \
    if ((rule) < 0)                                                     \
      goto invalid_code;                                                \
    if ((rule) >= 81)                                                   \
      {                                                                 \
        /* New format (after ver.21).  */                               \
        int second;                                                     \
                                                                        \
        ONE_MORE_BYTE (second);                                         \
        if (! COMPOSITION_ENCODE_RULE_VALID ((rule) - 81, second - 32)) \
          goto invalid_code;                                            \
        (rule) = COMPOSITION_ENCODE_RULE ((rule) - 81, second - 32);    \
        /* Distinguish it from the old format.  */                      \
        (rule) += 0x100;                                                \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* Old format (before ver.21): both reference codes are        \
           packed base 9 in one byte, with 4 standing for 10.  */       \
        int ref_g = (rule) / 9;                                         \
        int ref_n = (rule) % 9;                                         \
                                                                        \
        if (ref_g == 4)                                                 \
          ref_g = 10;                                                   \
        if (ref_n == 4)                                                 \
          ref_n = 10;                                                   \
        (rule) = COMPOSITION_ENCODE_RULE (ref_g, ref_n);                \
      }                                                                 \
  } while (0)
3286
/* Turn the decoded composition rule RULE back into the values it was
   decoded from, storing them in charbuf[idx] and charbuf[idx + 1].

   This macro uses the caller's `charbuf', `idx' and `new_chars'.  An
   old-format rule (RULE < 0x100) yields one value followed by -1 and
   adds 1 to new_chars; a new-format rule yields two values and adds 2.
   RULE is read once, before anything is stored.  */
#define ENCODE_COMPOSITION_RULE(rule)                                   \
  do {                                                                  \
    int encoded_ = (rule);                                              \
    int low_ = encoded_ % 0x100;                                        \
    int gref_ = low_ / 12;                                              \
    int nref_ = low_ % 12;                                              \
                                                                        \
    if (encoded_ >= 0x100)      /* new format */                        \
      {                                                                 \
        charbuf[idx] = 32 + 81 + gref_;                                 \
        charbuf[idx + 1] = 32 + nref_;                                  \
        new_chars += 2;                                                 \
      }                                                                 \
    else                        /* old format */                        \
      {                                                                 \
        if (gref_ == 10)                                                \
          gref_ = 4;                                                    \
        if (nref_ == 10)                                                \
          nref_ = 4;                                                    \
        charbuf[idx] = 32 + gref_ * 9 + nref_;                          \
        charbuf[idx + 1] = -1;                                          \
        new_chars++;                                                    \
      }                                                                 \
  } while (0)
3306
e951386e
KH
3307/* Finish the current composition as invalid. */
3308
f57e2426 3309static int finish_composition (int *, struct composition_status *);
e951386e
KH
3310
3311static int
971de7fb 3312finish_composition (int *charbuf, struct composition_status *cmp_status)
e951386e
KH
3313{
3314 int idx = - cmp_status->length;
3315 int new_chars;
3316
3317 /* Recover the original ESC sequence */
3318 charbuf[idx++] = ISO_CODE_ESC;
3319 charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3320 : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3321 : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3322 /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3323 : '4');
3324 charbuf[idx++] = -2;
3325 charbuf[idx++] = 0;
3326 charbuf[idx++] = -1;
3327 new_chars = cmp_status->nchars;
3328 if (cmp_status->method >= COMPOSITION_WITH_RULE)
3329 for (; idx < 0; idx++)
3330 {
3331 int elt = charbuf[idx];
3332
3333 if (elt == -2)
3334 {
3335 ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3336 idx++;
3337 }
3338 else if (elt == -1)
3339 {
3340 charbuf[idx++] = ISO_CODE_ESC;
3341 charbuf[idx] = '0';
3342 new_chars += 2;
3343 }
3344 }
3345 cmp_status->state = COMPOSING_NO;
3346 return new_chars;
3347}
3348
/* If a composition is in progress (cmp_status->state is not
   COMPOSING_NO), finish it with finish_composition and add the value
   it returns to char_offset.  Uses the caller's `cmp_status',
   `charbuf' and `char_offset'.  */
#define MAYBE_FINISH_COMPOSITION()                                      \
  do {                                                                  \
    if (cmp_status->state == COMPOSING_NO)                              \
      break;                                                            \
    char_offset += finish_composition (charbuf, cmp_status);            \
  } while (0)
d46c5b12 3355
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   C1 is the byte that followed ESC.

   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   An ESC 0 that arrives while the alternate chars of an ESC 3 or
   ESC 4 composition are being read only ends the alternate chars:
   the marker -1 -1 is stored and the state becomes COMPOSING_CHAR.

   Any other start sequence finishes a composition still in progress
   and begins a new one.  The annotation header is produced with
   ADD_COMPOSITION_DATA; according to the composition notes earlier in
   this file it has this form:

   [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
*/

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    int end_of_altchars_                                                   \
      = ((c1) == '0'                                                       \
         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                \
              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)          \
             || (cmp_status->state == COMPOSING_COMPONENT_RULE             \
                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))); \
                                                                           \
    if (! end_of_altchars_)                                                \
      {                                                                    \
        MAYBE_FINISH_COMPOSITION ();                                       \
        if ((c1) == '0')                                                   \
          cmp_status->method = COMPOSITION_RELATIVE;                       \
        else if ((c1) == '2')                                              \
          cmp_status->method = COMPOSITION_WITH_RULE;                      \
        else if ((c1) == '3')                                              \
          cmp_status->method = COMPOSITION_WITH_ALTCHARS;                  \
        else                                                               \
          cmp_status->method = COMPOSITION_WITH_RULE_ALTCHARS;             \
        if ((c1) <= '2')                                                   \
          cmp_status->state = COMPOSING_CHAR;                              \
        else                                                               \
          cmp_status->state = COMPOSING_COMPONENT_CHAR;                    \
        ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
        cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
        cmp_status->nchars = cmp_status->ncomps = 0;                       \
        coding->annotated = 1;                                             \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        charbuf[0] = -1;                                                   \
        charbuf[1] = -1;                                                   \
        charbuf += 2;                                                      \
        cmp_status->state = COMPOSING_CHAR;                                \
        cmp_status->length += 2;                                           \
      }                                                                    \
  } while (0)
3396
ec6d2bb8 3397
/* Handle composition end sequence ESC 1.

   The end is invalid if no composed character has been stored, or if
   "state is COMPOSING_CHAR" and "method is COMPOSITION_WITH_RULE" are
   both true or both false.  In that case any composition in progress
   is finished as invalid and control goes to invalid_code.

   Otherwise the annotation header, which starts cmp_status->length
   elements before charbuf, is completed: its first element is lowered
   by NCOMPS + 2 (altchars) or NCOMPS * 3 (altchars with rules), and
   its third element receives NCHARS.  Then char_offset advances by
   NCHARS and the state returns to COMPOSING_NO.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    int *header_;                                                       \
                                                                        \
    if (cmp_status->nchars == 0                                         \
        || ((cmp_status->state == COMPOSING_CHAR)                       \
            == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
      {                                                                 \
        MAYBE_FINISH_COMPOSITION ();                                    \
        goto invalid_code;                                              \
      }                                                                 \
    header_ = charbuf - cmp_status->length;                             \
    if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
      header_[0] -= cmp_status->ncomps + 2;                             \
    else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
      header_[0] -= cmp_status->ncomps * 3;                             \
    header_[2] = cmp_status->nchars;                                    \
    char_offset += cmp_status->nchars;                                  \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
3417
/* Store the composition rule RULE in charbuf as the pair [ -2 RULE ],
   advance charbuf past it, add 2 to cmp_status->length and step
   cmp_status->state back by one.  */

#define STORE_COMPOSITION_RULE(rule)            \
  do {                                          \
    charbuf[0] = -2;                            \
    charbuf[1] = (rule);                        \
    charbuf += 2;                               \
    cmp_status->length += 2;                    \
    cmp_status->state--;                        \
  } while (0)
ec6d2bb8 3427
e951386e
KH
/* Store C, a composed char or a component char, in charbuf and update
   cmp_status: the length grows by one, and NCHARS is counted up when
   the state is COMPOSING_CHAR, NCOMPS otherwise.

   The state then moves on by one when a rule is expected next, i.e.
   for COMPOSITION_WITH_RULE always, and for
   COMPOSITION_WITH_RULE_ALTCHARS while in COMPOSING_COMPONENT_CHAR.  */

#define STORE_COMPOSITION_CHAR(c)                                       \
  do {                                                                  \
    *charbuf = (c);                                                     \
    charbuf++;                                                          \
    cmp_status->length += 1;                                            \
    if (cmp_status->state != COMPOSING_CHAR)                            \
      cmp_status->ncomps++;                                             \
    else                                                                \
      cmp_status->nchars++;                                             \
    if (cmp_status->method == COMPOSITION_WITH_RULE                     \
        || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
            && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
      cmp_status->state++;                                              \
  } while (0)
88993dfd 3444
d46c5b12 3445
4ed46869
KH
3446/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3447
b73bfc1c 3448static void
971de7fb 3449decode_coding_iso_2022 (struct coding_system *coding)
4ed46869 3450{
8f924df7
KH
3451 const unsigned char *src = coding->source + coding->consumed;
3452 const unsigned char *src_end = coding->source + coding->src_bytes;
3453 const unsigned char *src_base;
69a80ea3 3454 int *charbuf = coding->charbuf + coding->charbuf_used;
ad1746f5
KH
3455 /* We may produce two annotations (charset and composition) in one
3456 loop and one more charset annotation at the end. */
ff0dacd7 3457 int *charbuf_end
df80c7f0 3458 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
a53e2e89 3459 EMACS_INT consumed_chars = 0, consumed_chars_base;
df7492f9 3460 int multibytep = coding->src_multibyte;
4ed46869 3461 /* Charsets invoked to graphic plane 0 and 1 respectively. */
df7492f9
KH
3462 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3463 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
134b9549 3464 int charset_id_2, charset_id_3;
df7492f9
KH
3465 struct charset *charset;
3466 int c;
e951386e 3467 struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
66ebf983 3468 Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
a53e2e89
EZ
3469 EMACS_INT char_offset = coding->produced_char;
3470 EMACS_INT last_offset = char_offset;
ff0dacd7 3471 int last_id = charset_ascii;
2735d060 3472 int eol_dos =
0a9564cb 3473 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 3474 int byte_after_cr = -1;
e951386e 3475 int i;
df7492f9 3476
df7492f9 3477 setup_iso_safe_charsets (attrs);
1b3b981b 3478 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
b73bfc1c 3479
e951386e
KH
3480 if (cmp_status->state != COMPOSING_NO)
3481 {
3482 for (i = 0; i < cmp_status->length; i++)
3483 *charbuf++ = cmp_status->carryover[i];
3484 coding->annotated = 1;
3485 }
3486
b73bfc1c 3487 while (1)
4ed46869 3488 {
cf299835 3489 int c1, c2, c3;
b73bfc1c
KH
3490
3491 src_base = src;
df7492f9
KH
3492 consumed_chars_base = consumed_chars;
3493
3494 if (charbuf >= charbuf_end)
b71f6f73
KH
3495 {
3496 if (byte_after_cr >= 0)
3497 src_base--;
3498 break;
3499 }
df7492f9 3500
119852e7
KH
3501 if (byte_after_cr >= 0)
3502 c1 = byte_after_cr, byte_after_cr = -1;
3503 else
3504 ONE_MORE_BYTE (c1);
065e3595
KH
3505 if (c1 < 0)
3506 goto invalid_code;
4ed46869 3507
e951386e 3508 if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
4ed46869 3509 {
e951386e
KH
3510 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3511 char_offset++;
3512 CODING_ISO_EXTSEGMENT_LEN (coding)--;
3513 continue;
3514 }
3515
3516 if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3517 {
3518 if (c1 == ISO_CODE_ESC)
ec6d2bb8 3519 {
e951386e
KH
3520 if (src + 1 >= src_end)
3521 goto no_more_source;
3522 *charbuf++ = ISO_CODE_ESC;
3523 char_offset++;
3524 if (src[0] == '%' && src[1] == '@')
df7492f9 3525 {
e951386e
KH
3526 src += 2;
3527 consumed_chars += 2;
3528 char_offset += 2;
3529 /* We are sure charbuf can contain two more chars. */
3530 *charbuf++ = '%';
3531 *charbuf++ = '@';
3532 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
df7492f9 3533 }
4ed46869 3534 }
e951386e
KH
3535 else
3536 {
3537 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3538 char_offset++;
3539 }
3540 continue;
3541 }
3542
3543 if ((cmp_status->state == COMPOSING_RULE
3544 || cmp_status->state == COMPOSING_COMPONENT_RULE)
3545 && c1 != ISO_CODE_ESC)
3546 {
66ebf983 3547 int rule;
e951386e 3548
66ebf983 3549 DECODE_COMPOSITION_RULE (rule);
e951386e
KH
3550 STORE_COMPOSITION_RULE (rule);
3551 continue;
3552 }
3553
3554 /* We produce at most one character. */
3555 switch (iso_code_class [c1])
3556 {
3557 case ISO_0x20_or_0x7F:
df7492f9
KH
3558 if (charset_id_0 < 0
3559 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
781d7a48
KH
3560 /* This is SPACE or DEL. */
3561 charset = CHARSET_FROM_ID (charset_ascii);
3562 else
3563 charset = CHARSET_FROM_ID (charset_id_0);
3564 break;
4ed46869
KH
3565
3566 case ISO_graphic_plane_0:
134b9549
KH
3567 if (charset_id_0 < 0)
3568 charset = CHARSET_FROM_ID (charset_ascii);
3569 else
3570 charset = CHARSET_FROM_ID (charset_id_0);
4ed46869
KH
3571 break;
3572
3573 case ISO_0xA0_or_0xFF:
df7492f9
KH
3574 if (charset_id_1 < 0
3575 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3576 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3577 goto invalid_code;
4ed46869
KH
3578 /* This is a graphic character, we fall down ... */
3579
3580 case ISO_graphic_plane_1:
df7492f9
KH
3581 if (charset_id_1 < 0)
3582 goto invalid_code;
3583 charset = CHARSET_FROM_ID (charset_id_1);
4ed46869
KH
3584 break;
3585
df7492f9 3586 case ISO_control_0:
2735d060 3587 if (eol_dos && c1 == '\r')
119852e7 3588 ONE_MORE_BYTE (byte_after_cr);
df7492f9
KH
3589 MAYBE_FINISH_COMPOSITION ();
3590 charset = CHARSET_FROM_ID (charset_ascii);
4ed46869
KH
3591 break;
3592
df7492f9 3593 case ISO_control_1:
df7492f9
KH
3594 goto invalid_code;
3595
4ed46869 3596 case ISO_shift_out:
df7492f9
KH
3597 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3598 || CODING_ISO_DESIGNATION (coding, 1) < 0)
3599 goto invalid_code;
3600 CODING_ISO_INVOCATION (coding, 0) = 1;
3601 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3602 continue;
4ed46869
KH
3603
3604 case ISO_shift_in:
df7492f9
KH
3605 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3606 goto invalid_code;
3607 CODING_ISO_INVOCATION (coding, 0) = 0;
3608 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3609 continue;
4ed46869
KH
3610
3611 case ISO_single_shift_2_7:
a63dba42
KH
3612 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3613 goto invalid_code;
4ed46869 3614 case ISO_single_shift_2:
df7492f9
KH
3615 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3616 goto invalid_code;
4ed46869
KH
3617 /* SS2 is handled as an escape sequence of ESC 'N' */
3618 c1 = 'N';
3619 goto label_escape_sequence;
3620
3621 case ISO_single_shift_3:
df7492f9
KH
3622 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3623 goto invalid_code;
4ed46869
KH
3624 /* SS3 is handled as an escape sequence of ESC 'O' */
3625 c1 = 'O';
3626 goto label_escape_sequence;
3627
3628 case ISO_control_sequence_introducer:
3629 /* CSI is handled as an escape sequence of ESC '[' ... */
3630 c1 = '[';
3631 goto label_escape_sequence;
3632
3633 case ISO_escape:
3634 ONE_MORE_BYTE (c1);
3635 label_escape_sequence:
df7492f9 3636 /* Escape sequences handled here are invocation,
4ed46869
KH
3637 designation, direction specification, and character
3638 composition specification. */
3639 switch (c1)
3640 {
3641 case '&': /* revision of following character set */
3642 ONE_MORE_BYTE (c1);
3643 if (!(c1 >= '@' && c1 <= '~'))
df7492f9 3644 goto invalid_code;
4ed46869
KH
3645 ONE_MORE_BYTE (c1);
3646 if (c1 != ISO_CODE_ESC)
df7492f9 3647 goto invalid_code;
4ed46869
KH
3648 ONE_MORE_BYTE (c1);
3649 goto label_escape_sequence;
3650
3651 case '$': /* designation of 2-byte character set */
df7492f9
KH
3652 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3653 goto invalid_code;
134b9549
KH
3654 {
3655 int reg, chars96;
3656
3657 ONE_MORE_BYTE (c1);
3658 if (c1 >= '@' && c1 <= 'B')
3659 { /* designation of JISX0208.1978, GB2312.1980,
88993dfd 3660 or JISX0208.1980 */
134b9549
KH
3661 reg = 0, chars96 = 0;
3662 }
3663 else if (c1 >= 0x28 && c1 <= 0x2B)
3664 { /* designation of DIMENSION2_CHARS94 character set */
3665 reg = c1 - 0x28, chars96 = 0;
3666 ONE_MORE_BYTE (c1);
3667 }
3668 else if (c1 >= 0x2C && c1 <= 0x2F)
3669 { /* designation of DIMENSION2_CHARS96 character set */
3670 reg = c1 - 0x2C, chars96 = 1;
3671 ONE_MORE_BYTE (c1);
3672 }
3673 else
3674 goto invalid_code;
3675 DECODE_DESIGNATION (reg, 2, chars96, c1);
3676 /* We must update these variables now. */
3677 if (reg == 0)
3678 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3679 else if (reg == 1)
3680 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3681 if (chars96 < 0)
3682 goto invalid_code;
3683 }
b73bfc1c 3684 continue;
4ed46869
KH
3685
3686 case 'n': /* invocation of locking-shift-2 */
df7492f9
KH
3687 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3688 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3689 goto invalid_code;
3690 CODING_ISO_INVOCATION (coding, 0) = 2;
3691 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3692 continue;
4ed46869
KH
3693
3694 case 'o': /* invocation of locking-shift-3 */
df7492f9
KH
3695 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3696 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3697 goto invalid_code;
3698 CODING_ISO_INVOCATION (coding, 0) = 3;
3699 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3700 continue;
4ed46869
KH
3701
3702 case 'N': /* invocation of single-shift-2 */
df7492f9
KH
3703 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3704 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3705 goto invalid_code;
134b9549
KH
3706 charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3707 if (charset_id_2 < 0)
3708 charset = CHARSET_FROM_ID (charset_ascii);
3709 else
3710 charset = CHARSET_FROM_ID (charset_id_2);
b73bfc1c 3711 ONE_MORE_BYTE (c1);
e7046a18 3712 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
df7492f9 3713 goto invalid_code;
4ed46869
KH
3714 break;
3715
3716 case 'O': /* invocation of single-shift-3 */
df7492f9
KH
3717 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3718 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3719 goto invalid_code;
134b9549
KH
3720 charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3721 if (charset_id_3 < 0)
3722 charset = CHARSET_FROM_ID (charset_ascii);
3723 else
3724 charset = CHARSET_FROM_ID (charset_id_3);
b73bfc1c 3725 ONE_MORE_BYTE (c1);
e7046a18 3726 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
df7492f9 3727 goto invalid_code;
4ed46869
KH
3728 break;
3729
ec6d2bb8 3730 case '0': case '2': case '3': case '4': /* start composition */
df7492f9
KH
3731 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3732 goto invalid_code;
e951386e
KH
3733 if (last_id != charset_ascii)
3734 {
3735 ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3736 last_id = charset_ascii;
3737 last_offset = char_offset;
3738 }
ec6d2bb8 3739 DECODE_COMPOSITION_START (c1);
b73bfc1c 3740 continue;
4ed46869 3741
ec6d2bb8 3742 case '1': /* end composition */
e951386e 3743 if (cmp_status->state == COMPOSING_NO)
df7492f9
KH
3744 goto invalid_code;
3745 DECODE_COMPOSITION_END ();
b73bfc1c 3746 continue;
4ed46869
KH
3747
3748 case '[': /* specification of direction */
de59072a 3749 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
df7492f9 3750 goto invalid_code;
4ed46869 3751 /* For the moment, nested direction is not supported.
d46c5b12 3752 So, `coding->mode & CODING_MODE_DIRECTION' zero means
ad1746f5 3753 left-to-right, and nonzero means right-to-left. */
4ed46869
KH
3754 ONE_MORE_BYTE (c1);
3755 switch (c1)
3756 {
3757 case ']': /* end of the current direction */
d46c5b12 3758 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869
KH
3759
3760 case '0': /* end of the current direction */
3761 case '1': /* start of left-to-right direction */
3762 ONE_MORE_BYTE (c1);
3763 if (c1 == ']')
d46c5b12 3764 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869 3765 else
df7492f9 3766 goto invalid_code;
4ed46869
KH
3767 break;
3768
3769 case '2': /* start of right-to-left direction */
3770 ONE_MORE_BYTE (c1);
3771 if (c1 == ']')
d46c5b12 3772 coding->mode |= CODING_MODE_DIRECTION;
4ed46869 3773 else
df7492f9 3774 goto invalid_code;
4ed46869
KH
3775 break;
3776
3777 default:
df7492f9 3778 goto invalid_code;
4ed46869 3779 }
b73bfc1c 3780 continue;
4ed46869 3781
103e0180 3782 case '%':
103e0180
KH
3783 ONE_MORE_BYTE (c1);
3784 if (c1 == '/')
3785 {
3786 /* CTEXT extended segment:
3787 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3788 We keep these bytes as is for the moment.
3789 They may be decoded by post-read-conversion. */
3790 int dim, M, L;
4776e638 3791 int size;
8f924df7 3792
103e0180 3793 ONE_MORE_BYTE (dim);
7a84eee5 3794 if (dim < '0' || dim > '4')
e951386e 3795 goto invalid_code;
103e0180 3796 ONE_MORE_BYTE (M);
e951386e
KH
3797 if (M < 128)
3798 goto invalid_code;
103e0180 3799 ONE_MORE_BYTE (L);
e951386e
KH
3800 if (L < 128)
3801 goto invalid_code;
103e0180 3802 size = ((M - 128) * 128) + (L - 128);
e951386e 3803 if (charbuf + 6 > charbuf_end)
4776e638
KH
3804 goto break_loop;
3805 *charbuf++ = ISO_CODE_ESC;
3806 *charbuf++ = '%';
3807 *charbuf++ = '/';
3808 *charbuf++ = dim;
3809 *charbuf++ = BYTE8_TO_CHAR (M);
3810 *charbuf++ = BYTE8_TO_CHAR (L);
e951386e 3811 CODING_ISO_EXTSEGMENT_LEN (coding) = size;
103e0180
KH
3812 }
3813 else if (c1 == 'G')
3814 {
103e0180
KH
3815 /* XFree86 extension for embedding UTF-8 in CTEXT:
3816 ESC % G --UTF-8-BYTES-- ESC % @
3817 We keep these bytes as is for the moment.
3818 They may be decoded by post-read-conversion. */
e951386e 3819 if (charbuf + 3 > charbuf_end)
4776e638 3820 goto break_loop;
e951386e
KH
3821 *charbuf++ = ISO_CODE_ESC;
3822 *charbuf++ = '%';
3823 *charbuf++ = 'G';
3824 CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
103e0180
KH
3825 }
3826 else
4776e638 3827 goto invalid_code;
103e0180 3828 continue;
4776e638 3829 break;
103e0180 3830
4ed46869 3831 default:
df7492f9
KH
3832 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3833 goto invalid_code;
134b9549
KH
3834 {
3835 int reg, chars96;
3836
3837 if (c1 >= 0x28 && c1 <= 0x2B)
3838 { /* designation of DIMENSION1_CHARS94 character set */
3839 reg = c1 - 0x28, chars96 = 0;
3840 ONE_MORE_BYTE (c1);
3841 }
3842 else if (c1 >= 0x2C && c1 <= 0x2F)
3843 { /* designation of DIMENSION1_CHARS96 character set */
3844 reg = c1 - 0x2C, chars96 = 1;
3845 ONE_MORE_BYTE (c1);
3846 }
3847 else
3848 goto invalid_code;
3849 DECODE_DESIGNATION (reg, 1, chars96, c1);
3850 /* We must update these variables now. */
3851 if (reg == 0)
3852 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3853 else if (reg == 1)
3854 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3855 if (chars96 < 0)
3856 goto invalid_code;
3857 }
b73bfc1c 3858 continue;
4ed46869 3859 }
413bb2db
PE
3860 break;
3861
3862 default:
3863 abort ();
b73bfc1c 3864 }
4ed46869 3865
e951386e
KH
3866 if (cmp_status->state == COMPOSING_NO
3867 && charset->id != charset_ascii
ff0dacd7
KH
3868 && last_id != charset->id)
3869 {
3870 if (last_id != charset_ascii)
69a80ea3 3871 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
ff0dacd7
KH
3872 last_id = charset->id;
3873 last_offset = char_offset;
3874 }
3875
b73bfc1c 3876 /* Now we know CHARSET and 1st position code C1 of a character.
cf299835
KH
3877 Produce a decoded character while getting 2nd and 3rd
3878 position codes C2, C3 if necessary. */
df7492f9 3879 if (CHARSET_DIMENSION (charset) > 1)
b73bfc1c
KH
3880 {
3881 ONE_MORE_BYTE (c2);
cf299835
KH
3882 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3883 || ((c1 & 0x80) != (c2 & 0x80)))
b73bfc1c 3884 /* C2 is not in a valid range. */
df7492f9 3885 goto invalid_code;
cf299835
KH
3886 if (CHARSET_DIMENSION (charset) == 2)
3887 c1 = (c1 << 8) | c2;
3888 else
df7492f9 3889 {
cf299835
KH
3890 ONE_MORE_BYTE (c3);
3891 if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3892 || ((c1 & 0x80) != (c3 & 0x80)))
3893 /* C3 is not in a valid range. */
df7492f9 3894 goto invalid_code;
cf299835 3895 c1 = (c1 << 16) | (c2 << 8) | c2;
df7492f9
KH
3896 }
3897 }
cf299835 3898 c1 &= 0x7F7F7F;
df7492f9
KH
3899 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3900 if (c < 0)
3901 {
3902 MAYBE_FINISH_COMPOSITION ();
3903 for (; src_base < src; src_base++, char_offset++)
3904 {
3905 if (ASCII_BYTE_P (*src_base))
3906 *charbuf++ = *src_base;
3907 else
3908 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3909 }
3910 }
e951386e 3911 else if (cmp_status->state == COMPOSING_NO)
df7492f9
KH
3912 {
3913 *charbuf++ = c;
3914 char_offset++;
4ed46869 3915 }
e951386e
KH
3916 else if ((cmp_status->state == COMPOSING_CHAR
3917 ? cmp_status->nchars
3918 : cmp_status->ncomps)
3919 >= MAX_COMPOSITION_COMPONENTS)
781d7a48 3920 {
e951386e
KH
3921 /* Too long composition. */
3922 MAYBE_FINISH_COMPOSITION ();
3923 *charbuf++ = c;
3924 char_offset++;
4ed46869 3925 }
e951386e
KH
3926 else
3927 STORE_COMPOSITION_CHAR (c);
4ed46869
KH
3928 continue;
3929
df7492f9
KH
3930 invalid_code:
3931 MAYBE_FINISH_COMPOSITION ();
4ed46869 3932 src = src_base;
df7492f9
KH
3933 consumed_chars = consumed_chars_base;
3934 ONE_MORE_BYTE (c);
065e3595 3935 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 3936 char_offset++;
df7492f9 3937 coding->errors++;
4776e638
KH
3938 continue;
3939
3940 break_loop:
3941 break;
4ed46869 3942 }
fb88bf2d 3943
df7492f9 3944 no_more_source:
e951386e
KH
3945 if (cmp_status->state != COMPOSING_NO)
3946 {
3947 if (coding->mode & CODING_MODE_LAST_BLOCK)
3948 MAYBE_FINISH_COMPOSITION ();
3949 else
3950 {
3951 charbuf -= cmp_status->length;
3952 for (i = 0; i < cmp_status->length; i++)
3953 cmp_status->carryover[i] = charbuf[i];
3954 }
3955 }
3956 else if (last_id != charset_ascii)
69a80ea3 3957 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
3958 coding->consumed_char += consumed_chars_base;
3959 coding->consumed = src_base - coding->source;
3960 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
3961}
3962
b73bfc1c 3963
f4dee582 3964/* ISO2022 encoding stuff. */
4ed46869
KH
3965
3966/*
f4dee582 3967 It is not enough to say just "ISO2022" on encoding, we have to
df7492f9 3968 specify more details. In Emacs, each coding system of ISO2022
4ed46869 3969 variant has the following specifications:
df7492f9 3970 1. Initial designation to G0 thru G3.
4ed46869
KH
3971 2. Allows short-form designation?
3972 3. ASCII should be designated to G0 before control characters?
3973 4. ASCII should be designated to G0 at end of line?
3974 5. 7-bit environment or 8-bit environment?
3975 6. Use locking-shift?
3976 7. Use Single-shift?
3977 And the following two are only for Japanese:
3978 8. Use ASCII in place of JIS0201-1976-Roman?
3979 9. Use JISX0208-1983 in place of JISX0208-1978?
df7492f9
KH
3980 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3981 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
f4dee582 3982 details.
4ed46869
KH
3983*/
3984
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, advancing DST, and record the new designation in
   CODING.

   When CODING has CODING_ISO_FLAG_REVISION set and CHARSET has a
   non-negative revision, the sequence is preceded by ESC '&' and a
   revision byte ('@' + revision).

   For a charset of dimension other than 1 with 94 chars, the
   intermediate byte is left out (short form) when REG is 0, the
   <final-char> of CHARSET is '@', 'A', or 'B', and CODING does not
   have CODING_ISO_FLAG_LONG_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)			\
  do {									\
    /* Intermediate bytes, indexed by graphic register number.  */	\
    const char *inter94 = "()*+";					\
    const char *inter96 = ",-./";					\
    unsigned char iso_final = CHARSET_ISO_FINAL (charset);		\
    int rev = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)	\
	       ? CHARSET_ISO_REVISION (charset)				\
	       : -1);							\
									\
    if (rev >= 0)							\
      {									\
	EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');			\
	EMIT_ONE_BYTE ('@' + rev);					\
      }									\
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);					\
    if (CHARSET_DIMENSION (charset) != 1)				\
      {									\
	EMIT_ONE_ASCII_BYTE ('$');					\
	if (CHARSET_ISO_CHARS_96 (charset))				\
	  EMIT_ONE_ASCII_BYTE (inter96[reg]);				\
	else if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM) \
		 || reg != 0						\
		 || ! (iso_final >= '@' && iso_final <= 'B'))		\
	  EMIT_ONE_ASCII_BYTE (inter94[reg]);				\
      }									\
    else								\
      {									\
	int inter = (CHARSET_ISO_CHARS_96 (charset)			\
		     ? inter96[reg]					\
		     : inter94[reg]);					\
	EMIT_ONE_ASCII_BYTE (inter);					\
      }									\
    EMIT_ONE_ASCII_BYTE (iso_final);					\
									\
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);	\
  } while (0)
4032
df7492f9 4033
4ed46869
KH
4034/* The following two macros produce codes (control character or escape
4035 sequence) for ISO2022 single-shift functions (single-shift-2 and
4036 single-shift-3). */
4037
df7492f9
KH
/* Emit single-shift-2: the one-byte control ISO_CODE_SS2, or ESC 'N'
   when CODING has CODING_ISO_FLAG_SEVEN_BITS set.  Afterwards mark
   CODING as being in the single-shifting state.  */
#define ENCODE_SINGLE_SHIFT_2						\
  do {									\
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))	\
      EMIT_ONE_BYTE (ISO_CODE_SS2);					\
    else								\
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');				\
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;				\
  } while (0)
4046
df7492f9
KH
4047
/* Emit single-shift-3: the one-byte control ISO_CODE_SS3, or ESC 'O'
   when CODING has CODING_ISO_FLAG_SEVEN_BITS set.  Afterwards mark
   CODING as being in the single-shifting state.  */
#define ENCODE_SINGLE_SHIFT_3						\
  do {									\
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))	\
      EMIT_ONE_BYTE (ISO_CODE_SS3);					\
    else								\
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');				\
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;				\
  } while (0)
4056
df7492f9 4057
4ed46869
KH
4058/* The following four macros produce codes (control character or
4059 escape sequence) for ISO2022 locking-shift functions (shift-in,
4060 shift-out, locking-shift-2, and locking-shift-3). */
4061
df7492f9
KH
/* Shift-in: emit ISO_CODE_SI, then note in CODING that graphic
   register 0 is the one invoked to graphic plane 0.  */
#define ENCODE_SHIFT_IN						\
  do								\
    {								\
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);			\
      CODING_ISO_INVOCATION (coding, 0) = 0;			\
    }								\
  while (0)
4067
df7492f9
KH
4068
/* Shift-out: emit ISO_CODE_SO, then note in CODING that graphic
   register 1 is the one invoked to graphic plane 0.  */
#define ENCODE_SHIFT_OUT					\
  do								\
    {								\
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);			\
      CODING_ISO_INVOCATION (coding, 0) = 1;			\
    }								\
  while (0)
4074
df7492f9
KH
4075
/* Locking-shift-2: emit ESC 'n', then note in CODING that graphic
   register 2 is the one invoked to graphic plane 0.  */
#define ENCODE_LOCKING_SHIFT_2					\
  do								\
    {								\
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');			\
      CODING_ISO_INVOCATION (coding, 0) = 2;			\
    }								\
  while (0)
4081
df7492f9
KH
4082
/* Locking-shift-3: emit ESC 'o', then note in CODING that graphic
   register 3 is the one invoked to graphic plane 0.

   The final byte must be 'o'.  ESC 'n' is locking-shift-2, and the
   ISO-2022 decoder in this file maps 'n' to register 2 and 'o' to
   register 3, so emitting 'n' here would make the decoder invoke the
   wrong register.  */
#define ENCODE_LOCKING_SHIFT_3					\
  do {								\
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');			\
    CODING_ISO_INVOCATION (coding, 0) = 3;			\
  } while (0)
4088
df7492f9 4089
f4dee582
RS
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be a modifiable lvalue: when CODING has
   CODING_ISO_FLAG_USE_ROMAN set and CHARSET is ASCII, it is replaced
   by the charset whose id is charset_jisx0201_roman.

   The body is a retry loop.  Each pass either emits the byte and
   leaves the loop, or calls encode_invocation_designation to make
   CHARSET available and tries again.  C1 is parenthesized at every
   use so that an expression argument is masked as a whole.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)			\
  do {									\
    int id = CHARSET_ID (charset);					\
									\
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)		\
	&& id == charset_ascii)						\
      {									\
	id = charset_jisx0201_roman;					\
	charset = CHARSET_FROM_ID (id);					\
      }									\
									\
    if (CODING_ISO_SINGLE_SHIFTING (coding))				\
      {									\
	/* A single-shift was just emitted; this byte completes it.  */ \
	if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)	\
	  EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);				\
	else								\
	  EMIT_ONE_BYTE ((c1) | 0x80);					\
	CODING_ISO_SINGLE_SHIFTING (coding) = 0;			\
	break;								\
      }									\
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))		\
      {									\
	EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);				\
	break;								\
      }									\
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))		\
      {									\
	EMIT_ONE_BYTE ((c1) | 0x80);					\
	break;								\
      }									\
    else								\
      /* Since CHARSET is not yet invoked to any graphic planes, we	\
	 must invoke it, or, at first, designate it to some graphic	\
	 register.  Then repeat the loop to actually produce the	\
	 character.  */							\
      dst = encode_invocation_designation (charset, coding, dst,	\
					   &produced_chars);		\
  } while (1)
4132
df7492f9 4133
f4dee582
RS
/* Emit the two bytes of a DIMENSION2 character of CHARSET whose
   position-codes are C1 and C2, first producing whatever designation
   and invocation codes are needed to make CHARSET usable.

   CHARSET must be a modifiable lvalue: when CODING has
   CODING_ISO_FLAG_USE_OLDJIS set and CHARSET is charset_jisx0208, it
   is replaced by the charset whose id is charset_jisx0208_1978.

   The body loops until one of the emitting branches is taken.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)		\
  do {									\
    int cs_id = CHARSET_ID (charset);					\
									\
    if (cs_id == charset_jisx0208					\
	&& (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))	\
      {									\
	cs_id = charset_jisx0208_1978;					\
	charset = CHARSET_FROM_ID (cs_id);				\
      }									\
									\
    if (CODING_ISO_SINGLE_SHIFTING (coding))				\
      {									\
	/* Complete a pending single-shift.  */				\
	if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))	\
	  EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);			\
	else								\
	  EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);		\
	CODING_ISO_SINGLE_SHIFTING (coding) = 0;			\
	break;								\
      }									\
    else if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 0))		\
      {									\
	/* Invoked to graphic plane 0: bytes with the high bit clear.  */ \
	EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);		\
	break;								\
      }									\
    else if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 1))		\
      {									\
	/* Invoked to graphic plane 1: bytes with the high bit set.  */	\
	EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);			\
	break;								\
      }									\
    else								\
      /* CHARSET is not invoked to any graphic plane yet.  Invoke it	\
	 (designating it to a register first if needed) and go round	\
	 the loop again to emit the character itself.  */		\
      dst = encode_invocation_designation (charset, coding, dst,	\
					   &produced_chars);		\
  } while (1)
4176
05e6f5dc 4177
df7492f9
KH
/* Encode character C in CHARSET: obtain its code with ENCODE_CHAR and
   hand it to the DIMENSION1 emitter when CHARSET has dimension 1, or
   otherwise to the DIMENSION2 emitter as (code >> 8, code & 0xFF).  */
#define ENCODE_ISO_CHARACTER(charset, c)				\
  do {									\
    int position_code = ENCODE_CHAR ((charset), (c));			\
									\
    if (CHARSET_DIMENSION (charset) != 1)				\
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset),			\
				       position_code >> 8,		\
				       position_code & 0xFF);		\
    else								\
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), position_code);	\
  } while (0)
bdd9fb48 4187
05e6f5dc 4188
4ed46869 4189/* Produce designation and invocation codes at a place pointed by DST
df7492f9 4190 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
4ed46869
KH
4191 Return new DST. */
4192
e2f1bab9 4193static unsigned char *
cf84bb53
JB
4194encode_invocation_designation (struct charset *charset,
4195 struct coding_system *coding,
a53e2e89 4196 unsigned char *dst, EMACS_INT *p_nchars)
4ed46869 4197{
df7492f9 4198 int multibytep = coding->dst_multibyte;
a53e2e89 4199 EMACS_INT produced_chars = *p_nchars;
4ed46869 4200 int reg; /* graphic register number */
df7492f9 4201 int id = CHARSET_ID (charset);
4ed46869
KH
4202
4203 /* At first, check designations. */
4204 for (reg = 0; reg < 4; reg++)
df7492f9 4205 if (id == CODING_ISO_DESIGNATION (coding, reg))
4ed46869
KH
4206 break;
4207
4208 if (reg >= 4)
4209 {
4210 /* CHARSET is not yet designated to any graphic registers. */
4211 /* At first check the requested designation. */
df7492f9
KH
4212 reg = CODING_ISO_REQUEST (coding, id);
4213 if (reg < 0)
1ba9e4ab
KH
4214 /* Since CHARSET requests no special designation, designate it
4215 to graphic register 0. */
4ed46869
KH
4216 reg = 0;
4217
4218 ENCODE_DESIGNATION (charset, reg, coding);
4219 }
4220
df7492f9
KH
4221 if (CODING_ISO_INVOCATION (coding, 0) != reg
4222 && CODING_ISO_INVOCATION (coding, 1) != reg)
4ed46869
KH
4223 {
4224 /* Since the graphic register REG is not invoked to any graphic
4225 planes, invoke it to graphic plane 0. */
4226 switch (reg)
4227 {
4228 case 0: /* graphic register 0 */
4229 ENCODE_SHIFT_IN;
4230 break;
4231
4232 case 1: /* graphic register 1 */
4233 ENCODE_SHIFT_OUT;
4234 break;
4235
4236 case 2: /* graphic register 2 */
df7492f9 4237 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4ed46869
KH
4238 ENCODE_SINGLE_SHIFT_2;
4239 else
4240 ENCODE_LOCKING_SHIFT_2;
4241 break;
4242
4243 case 3: /* graphic register 3 */
df7492f9 4244 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4ed46869
KH
4245 ENCODE_SINGLE_SHIFT_3;
4246 else
4247 ENCODE_LOCKING_SHIFT_3;
4248 break;
4249 }
4250 }
b73bfc1c 4251
df7492f9 4252 *p_nchars = produced_chars;
4ed46869
KH
4253 return dst;
4254}
4255
4ed46869
KH
4256
/* Bring the graphic planes and registers back to their initial state:
   shift in if graphic plane 0 does not currently have register 0
   invoked, then re-designate every register whose current designation
   differs from its (non-negative) initial one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()				\
  do {									\
    int init_reg;							\
									\
    if (CODING_ISO_INVOCATION (coding, 0) != 0)				\
      ENCODE_SHIFT_IN;							\
    for (init_reg = 0; init_reg < 4; init_reg++)			\
      {									\
	int init_id = CODING_ISO_INITIAL (coding, init_reg);		\
	struct charset *init_charset;					\
									\
	/* Nothing to do: no initial charset, or already in place.  */	\
	if (init_id < 0							\
	    || init_id == CODING_ISO_DESIGNATION (coding, init_reg))	\
	  continue;							\
									\
	init_charset = CHARSET_FROM_ID (init_id);			\
	ENCODE_DESIGNATION (init_charset, init_reg, coding);		\
      }									\
  } while (0)
4275
df7492f9 4276
bdd9fb48 4277/* Produce designation sequences of charsets in the line started from
b73bfc1c 4278 SRC to a place pointed by DST, and return updated DST.
bdd9fb48
KH
4279
4280 If the current block ends before any end-of-line, we may fail to
d46c5b12
KH
4281 find all the necessary designations. */
4282
b73bfc1c 4283static unsigned char *
cf84bb53 4284encode_designation_at_bol (struct coding_system *coding, int *charbuf,
461c2ab9 4285 unsigned char *dst)
e0e989f6 4286{
df7492f9 4287 struct charset *charset;
bdd9fb48
KH
4288 /* Table of charsets to be designated to each graphic register. */
4289 int r[4];
df7492f9 4290 int c, found = 0, reg;
a53e2e89 4291 EMACS_INT produced_chars = 0;
df7492f9
KH
4292 int multibytep = coding->dst_multibyte;
4293 Lisp_Object attrs;
4294 Lisp_Object charset_list;
4295
4296 attrs = CODING_ID_ATTRS (coding->id);
4297 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4298 if (EQ (charset_list, Qiso_2022))
4299 charset_list = Viso_2022_charset_list;
bdd9fb48
KH
4300
4301 for (reg = 0; reg < 4; reg++)
4302 r[reg] = -1;
4303
b73bfc1c 4304 while (found < 4)
e0e989f6 4305 {
df7492f9
KH
4306 int id;
4307
4308 c = *charbuf++;
b73bfc1c
KH
4309 if (c == '\n')
4310 break;
df7492f9
KH
4311 charset = char_charset (c, charset_list, NULL);
4312 id = CHARSET_ID (charset);
4313 reg = CODING_ISO_REQUEST (coding, id);
4314 if (reg >= 0 && r[reg] < 0)
bdd9fb48
KH
4315 {
4316 found++;
df7492f9 4317 r[reg] = id;
bdd9fb48 4318 }
bdd9fb48
KH
4319 }
4320
4321 if (found)
4322 {
4323 for (reg = 0; reg < 4; reg++)
4324 if (r[reg] >= 0
df7492f9
KH
4325 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4326 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
e0e989f6 4327 }
b73bfc1c
KH
4328
4329 return dst;
e0e989f6
KH
4330}
4331
4ed46869
KH
4332/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
4333
df7492f9 4334static int
971de7fb 4335encode_coding_iso_2022 (struct coding_system *coding)
4ed46869 4336{
df7492f9
KH
4337 int multibytep = coding->dst_multibyte;
4338 int *charbuf = coding->charbuf;
4339 int *charbuf_end = charbuf + coding->charbuf_used;
4340 unsigned char *dst = coding->destination + coding->produced;
4341 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4342 int safe_room = 16;
4343 int bol_designation
4344 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4345 && CODING_ISO_BOL (coding));
a53e2e89 4346 EMACS_INT produced_chars = 0;
df7492f9
KH
4347 Lisp_Object attrs, eol_type, charset_list;
4348 int ascii_compatible;
b73bfc1c 4349 int c;
ff0dacd7 4350 int preferred_charset_id = -1;
05e6f5dc 4351
24a73b0a 4352 CODING_GET_INFO (coding, attrs, charset_list);
0a9564cb 4353 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
24a73b0a
KH
4354 if (VECTORP (eol_type))
4355 eol_type = Qunix;
4356
004068e4 4357 setup_iso_safe_charsets (attrs);
ff0dacd7 4358 /* Charset list may have been changed. */
287c57d7 4359 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
1b3b981b 4360 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
0eecad43 4361
a552b35a
KH
4362 ascii_compatible
4363 = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4364 && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4365 | CODING_ISO_FLAG_LOCKING_SHIFT)));
bdd9fb48 4366
df7492f9 4367 while (charbuf < charbuf_end)
4ed46869 4368 {
df7492f9 4369 ASSURE_DESTINATION (safe_room);
b73bfc1c 4370
df7492f9 4371 if (bol_designation)
b73bfc1c 4372 {
df7492f9 4373 unsigned char *dst_prev = dst;
4ed46869 4374
bdd9fb48 4375 /* We have to produce designation sequences if any now. */
461c2ab9 4376 dst = encode_designation_at_bol (coding, charbuf, dst);
df7492f9
KH
4377 bol_designation = 0;
4378 /* We are sure that designation sequences are all ASCII bytes. */
4379 produced_chars += dst - dst_prev;
e0e989f6
KH
4380 }
4381
df7492f9 4382 c = *charbuf++;
ec6d2bb8 4383
ff0dacd7
KH
4384 if (c < 0)
4385 {
4386 /* Handle an annotation. */
4387 switch (*charbuf)
ec6d2bb8 4388 {
ff0dacd7
KH
4389 case CODING_ANNOTATE_COMPOSITION_MASK:
4390 /* Not yet implemented. */
4391 break;
4392 case CODING_ANNOTATE_CHARSET_MASK:
cf7dfdf5 4393 preferred_charset_id = charbuf[2];
ff0dacd7
KH
4394 if (preferred_charset_id >= 0
4395 && NILP (Fmemq (make_number (preferred_charset_id),
4396 charset_list)))
4397 preferred_charset_id = -1;
4398 break;
4399 default:
4400 abort ();
4ed46869 4401 }
ff0dacd7
KH
4402 charbuf += -c - 1;
4403 continue;
4ed46869 4404 }
ec6d2bb8 4405
b73bfc1c
KH
4406 /* Now encode the character C. */
4407 if (c < 0x20 || c == 0x7F)
4408 {
df7492f9
KH
4409 if (c == '\n'
4410 || (c == '\r' && EQ (eol_type, Qmac)))
19a8d9e0 4411 {
df7492f9
KH
4412 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4413 ENCODE_RESET_PLANE_AND_REGISTER ();
4414 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
b73bfc1c 4415 {
df7492f9
KH
4416 int i;
4417
4418 for (i = 0; i < 4; i++)
4419 CODING_ISO_DESIGNATION (coding, i)
4420 = CODING_ISO_INITIAL (coding, i);
b73bfc1c 4421 }
df7492f9
KH
4422 bol_designation
4423 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
19a8d9e0 4424 }
df7492f9
KH
4425 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4426 ENCODE_RESET_PLANE_AND_REGISTER ();
4427 EMIT_ONE_ASCII_BYTE (c);
4ed46869 4428 }
df7492f9 4429 else if (ASCII_CHAR_P (c))
88993dfd 4430 {
df7492f9
KH
4431 if (ascii_compatible)
4432 EMIT_ONE_ASCII_BYTE (c);
93dec019 4433 else
19a8d9e0 4434 {
bf16eb23
KH
4435 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4436 ENCODE_ISO_CHARACTER (charset, c);
19a8d9e0 4437 }
4ed46869 4438 }
16eafb5d 4439 else if (CHAR_BYTE8_P (c))
88993dfd 4440 {
16eafb5d
KH
4441 c = CHAR_TO_BYTE8 (c);
4442 EMIT_ONE_BYTE (c);
88993dfd 4443 }
b73bfc1c 4444 else
df7492f9 4445 {
ff0dacd7 4446 struct charset *charset;
b73bfc1c 4447
ff0dacd7
KH
4448 if (preferred_charset_id >= 0)
4449 {
4450 charset = CHARSET_FROM_ID (preferred_charset_id);
4451 if (! CHAR_CHARSET_P (c, charset))
4452 charset = char_charset (c, charset_list, NULL);
4453 }
4454 else
4455 charset = char_charset (c, charset_list, NULL);
df7492f9
KH
4456 if (!charset)
4457 {
41cbe562
KH
4458 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4459 {
4460 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4461 charset = CHARSET_FROM_ID (charset_ascii);
4462 }
4463 else
4464 {
4465 c = coding->default_char;
4466 charset = char_charset (c, charset_list, NULL);
4467 }
df7492f9
KH
4468 }
4469 ENCODE_ISO_CHARACTER (charset, c);
4470 }
84fbb8a0 4471 }
b73bfc1c 4472
df7492f9
KH
4473 if (coding->mode & CODING_MODE_LAST_BLOCK
4474 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4475 {
4476 ASSURE_DESTINATION (safe_room);
4477 ENCODE_RESET_PLANE_AND_REGISTER ();
4478 }
065e3595 4479 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4480 CODING_ISO_BOL (coding) = bol_designation;
4481 coding->produced_char += produced_chars;
4482 coding->produced = dst - coding->destination;
4483 return 0;
4ed46869
KH
4484}
4485
4486\f
df7492f9 4487/*** 8,9. SJIS and BIG5 handlers ***/
4ed46869 4488
df7492f9 4489/* Although SJIS and BIG5 are not ISO's coding system, they are used
4ed46869
KH
4490 quite widely. So, for the moment, Emacs supports them in the bare
4491 C code. But, in the future, they may be supported only by CCL. */
4492
4493/* SJIS is a coding system encoding three character sets: ASCII, right
4494 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
4495 as is. A character of charset katakana-jisx0201 is encoded by
4496 "position-code + 0x80". A character of charset japanese-jisx0208
4497 is encoded in 2-byte but two position-codes are divided and shifted
df7492f9 4498 so that it fit in the range below.
4ed46869
KH
4499
4500 --- CODE RANGE of SJIS ---
4501 (character set) (range)
4502 ASCII 0x00 .. 0x7F
df7492f9 4503 KATAKANA-JISX0201 0xA0 .. 0xDF
c28a9453 4504 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
d14d03ac 4505 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4ed46869
KH
4506 -------------------------------
4507
4508*/
4509
4510/* BIG5 is a coding system encoding two character sets: ASCII and
4511 Big5. An ASCII character is encoded as is. Big5 is a two-byte
df7492f9 4512 character set and is encoded in two-byte.
4ed46869
KH
4513
4514 --- CODE RANGE of BIG5 ---
4515 (character set) (range)
4516 ASCII 0x00 .. 0x7F
4517 Big5 (1st byte) 0xA1 .. 0xFE
4518 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
4519 --------------------------
4520
df7492f9 4521 */
4ed46869
KH
4522
4523/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4524 Check if a text is encoded in SJIS. If it is, return
df7492f9 4525 CATEGORY_MASK_SJIS, else return 0. */
4ed46869 4526
0a28aafb 4527static int
cf84bb53
JB
4528detect_coding_sjis (struct coding_system *coding,
4529 struct coding_detection_info *detect_info)
4ed46869 4530{
065e3595 4531 const unsigned char *src = coding->source, *src_base;
8f924df7 4532 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9 4533 int multibytep = coding->src_multibyte;
a53e2e89 4534 EMACS_INT consumed_chars = 0;
df7492f9 4535 int found = 0;
b73bfc1c 4536 int c;
f07190ca
KH
4537 Lisp_Object attrs, charset_list;
4538 int max_first_byte_of_2_byte_code;
4539
4540 CODING_GET_INFO (coding, attrs, charset_list);
4541 max_first_byte_of_2_byte_code
4542 = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
df7492f9 4543
ff0dacd7 4544 detect_info->checked |= CATEGORY_MASK_SJIS;
df7492f9
KH
4545 /* A coding system of this category is always ASCII compatible. */
4546 src += coding->head_ascii;
4ed46869 4547
b73bfc1c 4548 while (1)
4ed46869 4549 {
065e3595 4550 src_base = src;
df7492f9 4551 ONE_MORE_BYTE (c);
682169fe
KH
4552 if (c < 0x80)
4553 continue;
f07190ca
KH
4554 if ((c >= 0x81 && c <= 0x9F)
4555 || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4ed46869 4556 {
df7492f9 4557 ONE_MORE_BYTE (c);
682169fe 4558 if (c < 0x40 || c == 0x7F || c > 0xFC)
df7492f9 4559 break;
ff0dacd7 4560 found = CATEGORY_MASK_SJIS;
4ed46869 4561 }
df7492f9 4562 else if (c >= 0xA0 && c < 0xE0)
ff0dacd7 4563 found = CATEGORY_MASK_SJIS;
df7492f9
KH
4564 else
4565 break;
4ed46869 4566 }
ff0dacd7 4567 detect_info->rejected |= CATEGORY_MASK_SJIS;
df7492f9
KH
4568 return 0;
4569
4570 no_more_source:
065e3595 4571 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
89528eb3 4572 {
ff0dacd7 4573 detect_info->rejected |= CATEGORY_MASK_SJIS;
89528eb3 4574 return 0;
4ed46869 4575 }
ff0dacd7
KH
4576 detect_info->found |= found;
4577 return 1;
4ed46869
KH
4578}
4579
4580/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4581 Check if a text is encoded in BIG5. If it is, return
df7492f9 4582 CATEGORY_MASK_BIG5, else return 0. */
4ed46869 4583
0a28aafb 4584static int
cf84bb53
JB
4585detect_coding_big5 (struct coding_system *coding,
4586 struct coding_detection_info *detect_info)
4ed46869 4587{
065e3595 4588 const unsigned char *src = coding->source, *src_base;
8f924df7 4589 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9 4590 int multibytep = coding->src_multibyte;
a53e2e89 4591 EMACS_INT consumed_chars = 0;
df7492f9 4592 int found = 0;
b73bfc1c 4593 int c;
fa42c37f 4594
ff0dacd7 4595 detect_info->checked |= CATEGORY_MASK_BIG5;
df7492f9
KH
4596 /* A coding system of this category is always ASCII compatible. */
4597 src += coding->head_ascii;
fa42c37f 4598
b73bfc1c 4599 while (1)
fa42c37f 4600 {
065e3595 4601 src_base = src;
df7492f9
KH
4602 ONE_MORE_BYTE (c);
4603 if (c < 0x80)
fa42c37f 4604 continue;
df7492f9 4605 if (c >= 0xA1)
fa42c37f 4606 {
df7492f9
KH
4607 ONE_MORE_BYTE (c);
4608 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
fa42c37f 4609 return 0;
ff0dacd7 4610 found = CATEGORY_MASK_BIG5;
fa42c37f 4611 }
df7492f9
KH
4612 else
4613 break;
fa42c37f 4614 }
ff0dacd7 4615 detect_info->rejected |= CATEGORY_MASK_BIG5;
fa42c37f 4616 return 0;
fa42c37f 4617
df7492f9 4618 no_more_source:
065e3595 4619 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
89528eb3 4620 {
ff0dacd7 4621 detect_info->rejected |= CATEGORY_MASK_BIG5;
89528eb3
KH
4622 return 0;
4623 }
ff0dacd7
KH
4624 detect_info->found |= found;
4625 return 1;
fa42c37f
KH
4626}
4627
4ed46869
KH
4628/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4629 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
fa42c37f 4630
b73bfc1c 4631static void
971de7fb 4632decode_coding_sjis (struct coding_system *coding)
4ed46869 4633{
8f924df7
KH
4634 const unsigned char *src = coding->source + coding->consumed;
4635 const unsigned char *src_end = coding->source + coding->src_bytes;
4636 const unsigned char *src_base;
69a80ea3 4637 int *charbuf = coding->charbuf + coding->charbuf_used;
ad1746f5 4638 /* We may produce one charset annotation in one loop and one more at
df80c7f0 4639 the end. */
69a80ea3 4640 int *charbuf_end
df80c7f0 4641 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
a53e2e89 4642 EMACS_INT consumed_chars = 0, consumed_chars_base;
df7492f9
KH
4643 int multibytep = coding->src_multibyte;
4644 struct charset *charset_roman, *charset_kanji, *charset_kana;
57a47f8a 4645 struct charset *charset_kanji2;
24a73b0a 4646 Lisp_Object attrs, charset_list, val;
a53e2e89
EZ
4647 EMACS_INT char_offset = coding->produced_char;
4648 EMACS_INT last_offset = char_offset;
ff0dacd7 4649 int last_id = charset_ascii;
2735d060 4650 int eol_dos =
0a9564cb 4651 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 4652 int byte_after_cr = -1;
a5d301df 4653
24a73b0a 4654 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4655
4656 val = charset_list;
4657 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
89528eb3 4658 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
57a47f8a
KH
4659 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4660 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
fa42c37f 4661
b73bfc1c 4662 while (1)
4ed46869 4663 {
df7492f9 4664 int c, c1;
24a73b0a 4665 struct charset *charset;
fa42c37f 4666
b73bfc1c 4667 src_base = src;
df7492f9 4668 consumed_chars_base = consumed_chars;
fa42c37f 4669
df7492f9 4670 if (charbuf >= charbuf_end)
b71f6f73
KH
4671 {
4672 if (byte_after_cr >= 0)
4673 src_base--;
4674 break;
4675 }
df7492f9 4676
119852e7
KH
4677 if (byte_after_cr >= 0)
4678 c = byte_after_cr, byte_after_cr = -1;
4679 else
4680 ONE_MORE_BYTE (c);
065e3595
KH
4681 if (c < 0)
4682 goto invalid_code;
24a73b0a 4683 if (c < 0x80)
119852e7 4684 {
2735d060 4685 if (eol_dos && c == '\r')
119852e7
KH
4686 ONE_MORE_BYTE (byte_after_cr);
4687 charset = charset_roman;
4688 }
57a47f8a 4689 else if (c == 0x80 || c == 0xA0)
8e921c4b 4690 goto invalid_code;
57a47f8a
KH
4691 else if (c >= 0xA1 && c <= 0xDF)
4692 {
4693 /* SJIS -> JISX0201-Kana */
4694 c &= 0x7F;
4695 charset = charset_kana;
4696 }
4697 else if (c <= 0xEF)
df7492f9 4698 {
57a47f8a
KH
4699 /* SJIS -> JISX0208 */
4700 ONE_MORE_BYTE (c1);
4701 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
24a73b0a 4702 goto invalid_code;
57a47f8a
KH
4703 c = (c << 8) | c1;
4704 SJIS_TO_JIS (c);
4705 charset = charset_kanji;
4706 }
4707 else if (c <= 0xFC && charset_kanji2)
4708 {
c6876370 4709 /* SJIS -> JISX0213-2 */
57a47f8a
KH
4710 ONE_MORE_BYTE (c1);
4711 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
24a73b0a 4712 goto invalid_code;
57a47f8a
KH
4713 c = (c << 8) | c1;
4714 SJIS_TO_JIS2 (c);
4715 charset = charset_kanji2;
df7492f9 4716 }
57a47f8a
KH
4717 else
4718 goto invalid_code;
24a73b0a
KH
4719 if (charset->id != charset_ascii
4720 && last_id != charset->id)
4721 {
4722 if (last_id != charset_ascii)
69a80ea3 4723 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
24a73b0a
KH
4724 last_id = charset->id;
4725 last_offset = char_offset;
4726 }
4727 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
df7492f9 4728 *charbuf++ = c;
ff0dacd7 4729 char_offset++;
df7492f9 4730 continue;
b73bfc1c 4731
df7492f9
KH
4732 invalid_code:
4733 src = src_base;
4734 consumed_chars = consumed_chars_base;
4735 ONE_MORE_BYTE (c);
065e3595 4736 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
ff0dacd7 4737 char_offset++;
df7492f9
KH
4738 coding->errors++;
4739 }
fa42c37f 4740
df7492f9 4741 no_more_source:
ff0dacd7 4742 if (last_id != charset_ascii)
69a80ea3 4743 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
4744 coding->consumed_char += consumed_chars_base;
4745 coding->consumed = src_base - coding->source;
4746 coding->charbuf_used = charbuf - coding->charbuf;
fa42c37f
KH
4747}
4748
b73bfc1c 4749static void
971de7fb 4750decode_coding_big5 (struct coding_system *coding)
4ed46869 4751{
8f924df7
KH
4752 const unsigned char *src = coding->source + coding->consumed;
4753 const unsigned char *src_end = coding->source + coding->src_bytes;
4754 const unsigned char *src_base;
69a80ea3 4755 int *charbuf = coding->charbuf + coding->charbuf_used;
ad1746f5 4756 /* We may produce one charset annotation in one loop and one more at
df80c7f0 4757 the end. */
69a80ea3 4758 int *charbuf_end
df80c7f0 4759 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
a53e2e89 4760 EMACS_INT consumed_chars = 0, consumed_chars_base;
df7492f9
KH
4761 int multibytep = coding->src_multibyte;
4762 struct charset *charset_roman, *charset_big5;
24a73b0a 4763 Lisp_Object attrs, charset_list, val;
a53e2e89
EZ
4764 EMACS_INT char_offset = coding->produced_char;
4765 EMACS_INT last_offset = char_offset;
ff0dacd7 4766 int last_id = charset_ascii;
2735d060 4767 int eol_dos =
0a9564cb 4768 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 4769 int byte_after_cr = -1;
df7492f9 4770
24a73b0a 4771 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4772 val = charset_list;
4773 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4774 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4ed46869 4775
b73bfc1c 4776 while (1)
4ed46869 4777 {
df7492f9 4778 int c, c1;
24a73b0a 4779 struct charset *charset;
b73bfc1c
KH
4780
4781 src_base = src;
df7492f9
KH
4782 consumed_chars_base = consumed_chars;
4783
4784 if (charbuf >= charbuf_end)
b71f6f73
KH
4785 {
4786 if (byte_after_cr >= 0)
4787 src_base--;
4788 break;
4789 }
df7492f9 4790
119852e7 4791 if (byte_after_cr >= 0)
14daee73 4792 c = byte_after_cr, byte_after_cr = -1;
119852e7
KH
4793 else
4794 ONE_MORE_BYTE (c);
b73bfc1c 4795
065e3595
KH
4796 if (c < 0)
4797 goto invalid_code;
24a73b0a 4798 if (c < 0x80)
119852e7 4799 {
2735d060 4800 if (eol_dos && c == '\r')
119852e7
KH
4801 ONE_MORE_BYTE (byte_after_cr);
4802 charset = charset_roman;
4803 }
24a73b0a 4804 else
4ed46869 4805 {
24a73b0a
KH
4806 /* BIG5 -> Big5 */
4807 if (c < 0xA1 || c > 0xFE)
4808 goto invalid_code;
4809 ONE_MORE_BYTE (c1);
4810 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4811 goto invalid_code;
4812 c = c << 8 | c1;
4813 charset = charset_big5;
4ed46869 4814 }
24a73b0a
KH
4815 if (charset->id != charset_ascii
4816 && last_id != charset->id)
df7492f9 4817 {
24a73b0a 4818 if (last_id != charset_ascii)
69a80ea3 4819 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
24a73b0a
KH
4820 last_id = charset->id;
4821 last_offset = char_offset;
4ed46869 4822 }
24a73b0a 4823 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
df7492f9 4824 *charbuf++ = c;
ff0dacd7 4825 char_offset++;
fb88bf2d
KH
4826 continue;
4827
df7492f9 4828 invalid_code:
4ed46869 4829 src = src_base;
df7492f9
KH
4830 consumed_chars = consumed_chars_base;
4831 ONE_MORE_BYTE (c);
065e3595 4832 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
ff0dacd7 4833 char_offset++;
df7492f9 4834 coding->errors++;
fb88bf2d 4835 }
d46c5b12 4836
df7492f9 4837 no_more_source:
ff0dacd7 4838 if (last_id != charset_ascii)
69a80ea3 4839 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
4840 coding->consumed_char += consumed_chars_base;
4841 coding->consumed = src_base - coding->source;
4842 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
4843}
4844
4845/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
b73bfc1c
KH
4846 This function can encode charsets `ascii', `katakana-jisx0201',
4847 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4848 are sure that all these charsets are registered as official charset
4ed46869
KH
4849 (i.e. do not have extended leading-codes). Characters of other
4850 charsets are produced without any encoding. If SJIS_P is 1, encode
4851 SJIS text, else encode BIG5 text. */
4852
df7492f9 4853static int
971de7fb 4854encode_coding_sjis (struct coding_system *coding)
4ed46869 4855{
df7492f9
KH
4856 int multibytep = coding->dst_multibyte;
4857 int *charbuf = coding->charbuf;
4858 int *charbuf_end = charbuf + coding->charbuf_used;
4859 unsigned char *dst = coding->destination + coding->produced;
4860 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4861 int safe_room = 4;
a53e2e89 4862 EMACS_INT produced_chars = 0;
24a73b0a 4863 Lisp_Object attrs, charset_list, val;
df7492f9 4864 int ascii_compatible;
66ebf983 4865 struct charset *charset_kanji, *charset_kana;
57a47f8a 4866 struct charset *charset_kanji2;
df7492f9 4867 int c;
a5d301df 4868
24a73b0a 4869 CODING_GET_INFO (coding, attrs, charset_list);
66ebf983 4870 val = XCDR (charset_list);
df7492f9 4871 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
57a47f8a
KH
4872 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4873 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4ed46869 4874
df7492f9 4875 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
93dec019 4876
df7492f9
KH
4877 while (charbuf < charbuf_end)
4878 {
4879 ASSURE_DESTINATION (safe_room);
4880 c = *charbuf++;
b73bfc1c 4881 /* Now encode the character C. */
df7492f9
KH
4882 if (ASCII_CHAR_P (c) && ascii_compatible)
4883 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
4884 else if (CHAR_BYTE8_P (c))
4885 {
4886 c = CHAR_TO_BYTE8 (c);
4887 EMIT_ONE_BYTE (c);
4888 }
df7492f9 4889 else
b73bfc1c 4890 {
df7492f9
KH
4891 unsigned code;
4892 struct charset *charset = char_charset (c, charset_list, &code);
4893
4894 if (!charset)
4ed46869 4895 {
41cbe562 4896 if (coding->mode & CODING_MODE_SAFE_ENCODING)
b73bfc1c 4897 {
41cbe562
KH
4898 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4899 charset = CHARSET_FROM_ID (charset_ascii);
b73bfc1c 4900 }
41cbe562 4901 else
b73bfc1c 4902 {
41cbe562
KH
4903 c = coding->default_char;
4904 charset = char_charset (c, charset_list, &code);
b73bfc1c 4905 }
b73bfc1c 4906 }
df7492f9
KH
4907 if (code == CHARSET_INVALID_CODE (charset))
4908 abort ();
4909 if (charset == charset_kanji)
4910 {
4911 int c1, c2;
4912 JIS_TO_SJIS (code);
4913 c1 = code >> 8, c2 = code & 0xFF;
4914 EMIT_TWO_BYTES (c1, c2);
4915 }
4916 else if (charset == charset_kana)
4917 EMIT_ONE_BYTE (code | 0x80);
57a47f8a
KH
4918 else if (charset_kanji2 && charset == charset_kanji2)
4919 {
4920 int c1, c2;
4921
4922 c1 = code >> 8;
f07190ca
KH
4923 if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4924 || c1 == 0x28
57a47f8a
KH
4925 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4926 {
4927 JIS_TO_SJIS2 (code);
4928 c1 = code >> 8, c2 = code & 0xFF;
4929 EMIT_TWO_BYTES (c1, c2);
4930 }
4931 else
4932 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4933 }
df7492f9
KH
4934 else
4935 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4936 }
4937 }
065e3595 4938 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4939 coding->produced_char += produced_chars;
4940 coding->produced = dst - coding->destination;
4941 return 0;
4942}
4943
4944static int
971de7fb 4945encode_coding_big5 (struct coding_system *coding)
df7492f9
KH
4946{
4947 int multibytep = coding->dst_multibyte;
4948 int *charbuf = coding->charbuf;
4949 int *charbuf_end = charbuf + coding->charbuf_used;
4950 unsigned char *dst = coding->destination + coding->produced;
4951 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4952 int safe_room = 4;
a53e2e89 4953 EMACS_INT produced_chars = 0;
24a73b0a 4954 Lisp_Object attrs, charset_list, val;
df7492f9 4955 int ascii_compatible;
66ebf983 4956 struct charset *charset_big5;
df7492f9
KH
4957 int c;
4958
24a73b0a 4959 CODING_GET_INFO (coding, attrs, charset_list);
66ebf983 4960 val = XCDR (charset_list);
df7492f9
KH
4961 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4962 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4963
4964 while (charbuf < charbuf_end)
4965 {
4966 ASSURE_DESTINATION (safe_room);
4967 c = *charbuf++;
4968 /* Now encode the character C. */
4969 if (ASCII_CHAR_P (c) && ascii_compatible)
4970 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
4971 else if (CHAR_BYTE8_P (c))
4972 {
4973 c = CHAR_TO_BYTE8 (c);
4974 EMIT_ONE_BYTE (c);
b73bfc1c
KH
4975 }
4976 else
4977 {
df7492f9
KH
4978 unsigned code;
4979 struct charset *charset = char_charset (c, charset_list, &code);
4980
4981 if (! charset)
b73bfc1c 4982 {
41cbe562 4983 if (coding->mode & CODING_MODE_SAFE_ENCODING)
b73bfc1c 4984 {
41cbe562
KH
4985 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4986 charset = CHARSET_FROM_ID (charset_ascii);
b73bfc1c 4987 }
41cbe562 4988 else
0eecad43 4989 {
41cbe562
KH
4990 c = coding->default_char;
4991 charset = char_charset (c, charset_list, &code);
0eecad43 4992 }
4ed46869 4993 }
df7492f9
KH
4994 if (code == CHARSET_INVALID_CODE (charset))
4995 abort ();
4996 if (charset == charset_big5)
b73bfc1c 4997 {
df7492f9
KH
4998 int c1, c2;
4999
5000 c1 = code >> 8, c2 = code & 0xFF;
5001 EMIT_TWO_BYTES (c1, c2);
b73bfc1c 5002 }
df7492f9
KH
5003 else
5004 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4ed46869 5005 }
4ed46869 5006 }
065e3595 5007 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
5008 coding->produced_char += produced_chars;
5009 coding->produced = dst - coding->destination;
5010 return 0;
4ed46869
KH
5011}
5012
5013\f
df7492f9 5014/*** 9. CCL handlers ***/
1397dc18
KH
5015
5016/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5017 Check if a text is encoded in a coding system of which
5018 encoder/decoder are written in CCL program. If it is, return
df7492f9 5019 CATEGORY_MASK_CCL, else return 0. */
1397dc18 5020
0a28aafb 5021static int
cf84bb53
JB
5022detect_coding_ccl (struct coding_system *coding,
5023 struct coding_detection_info *detect_info)
1397dc18 5024{
065e3595 5025 const unsigned char *src = coding->source, *src_base;
8f924df7 5026 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9 5027 int multibytep = coding->src_multibyte;
a53e2e89 5028 EMACS_INT consumed_chars = 0;
df7492f9 5029 int found = 0;
0e219d54 5030 unsigned char *valids;
a53e2e89 5031 EMACS_INT head_ascii = coding->head_ascii;
df7492f9
KH
5032 Lisp_Object attrs;
5033
ff0dacd7
KH
5034 detect_info->checked |= CATEGORY_MASK_CCL;
5035
df7492f9 5036 coding = &coding_categories[coding_category_ccl];
0e219d54 5037 valids = CODING_CCL_VALIDS (coding);
df7492f9
KH
5038 attrs = CODING_ID_ATTRS (coding->id);
5039 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5040 src += head_ascii;
1397dc18 5041
b73bfc1c 5042 while (1)
1397dc18 5043 {
df7492f9 5044 int c;
065e3595
KH
5045
5046 src_base = src;
df7492f9 5047 ONE_MORE_BYTE (c);
065e3595 5048 if (c < 0 || ! valids[c])
df7492f9 5049 break;
ff0dacd7
KH
5050 if ((valids[c] > 1))
5051 found = CATEGORY_MASK_CCL;
df7492f9 5052 }
ff0dacd7 5053 detect_info->rejected |= CATEGORY_MASK_CCL;
df7492f9
KH
5054 return 0;
5055
5056 no_more_source:
ff0dacd7
KH
5057 detect_info->found |= found;
5058 return 1;
df7492f9
KH
5059}
5060
5061static void
971de7fb 5062decode_coding_ccl (struct coding_system *coding)
df7492f9 5063{
7c78e542 5064 const unsigned char *src = coding->source + coding->consumed;
8f924df7 5065 const unsigned char *src_end = coding->source + coding->src_bytes;
69a80ea3
KH
5066 int *charbuf = coding->charbuf + coding->charbuf_used;
5067 int *charbuf_end = coding->charbuf + coding->charbuf_size;
a53e2e89 5068 EMACS_INT consumed_chars = 0;
df7492f9 5069 int multibytep = coding->src_multibyte;
d0396581 5070 struct ccl_program *ccl = &coding->spec.ccl->ccl;
df7492f9 5071 int source_charbuf[1024];
fbdc1721 5072 int source_byteidx[1025];
24a73b0a 5073 Lisp_Object attrs, charset_list;
df7492f9 5074
24a73b0a 5075 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9 5076
d0396581 5077 while (1)
df7492f9 5078 {
7c78e542 5079 const unsigned char *p = src;
df7492f9
KH
5080 int i = 0;
5081
5082 if (multibytep)
fbdc1721
KH
5083 {
5084 while (i < 1024 && p < src_end)
5085 {
5086 source_byteidx[i] = p - src;
5087 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5088 }
5089 source_byteidx[i] = p - src;
5090 }
df7492f9
KH
5091 else
5092 while (i < 1024 && p < src_end)
5093 source_charbuf[i++] = *p++;
8f924df7 5094
df7492f9 5095 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
d0396581
KH
5096 ccl->last_block = 1;
5097 ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5098 charset_list);
5099 charbuf += ccl->produced;
fbdc1721 5100 if (multibytep)
d0396581 5101 src += source_byteidx[ccl->consumed];
df7492f9 5102 else
d0396581
KH
5103 src += ccl->consumed;
5104 consumed_chars += ccl->consumed;
5105 if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
df7492f9
KH
5106 break;
5107 }
5108
d0396581 5109 switch (ccl->status)
df7492f9
KH
5110 {
5111 case CCL_STAT_SUSPEND_BY_SRC:
065e3595 5112 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
df7492f9
KH
5113 break;
5114 case CCL_STAT_SUSPEND_BY_DST:
d0396581 5115 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
df7492f9
KH
5116 break;
5117 case CCL_STAT_QUIT:
5118 case CCL_STAT_INVALID_CMD:
065e3595 5119 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
df7492f9
KH
5120 break;
5121 default:
065e3595 5122 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
5123 break;
5124 }
5125 coding->consumed_char += consumed_chars;
5126 coding->consumed = src - coding->source;
5127 coding->charbuf_used = charbuf - coding->charbuf;
5128}
5129
5130static int
971de7fb 5131encode_coding_ccl (struct coding_system *coding)
df7492f9 5132{
fb608df3 5133 struct ccl_program *ccl = &coding->spec.ccl->ccl;
df7492f9
KH
5134 int multibytep = coding->dst_multibyte;
5135 int *charbuf = coding->charbuf;
5136 int *charbuf_end = charbuf + coding->charbuf_used;
5137 unsigned char *dst = coding->destination + coding->produced;
5138 unsigned char *dst_end = coding->destination + coding->dst_bytes;
df7492f9 5139 int destination_charbuf[1024];
a53e2e89
EZ
5140 EMACS_INT produced_chars = 0;
5141 int i;
24a73b0a 5142 Lisp_Object attrs, charset_list;
df7492f9 5143
24a73b0a 5144 CODING_GET_INFO (coding, attrs, charset_list);
fb608df3
KH
5145 if (coding->consumed_char == coding->src_chars
5146 && coding->mode & CODING_MODE_LAST_BLOCK)
5147 ccl->last_block = 1;
df7492f9 5148
8cffd3e7 5149 while (charbuf < charbuf_end)
df7492f9 5150 {
fb608df3 5151 ccl_driver (ccl, charbuf, destination_charbuf,
8cffd3e7 5152 charbuf_end - charbuf, 1024, charset_list);
df7492f9 5153 if (multibytep)
8cffd3e7 5154 {
fb608df3
KH
5155 ASSURE_DESTINATION (ccl->produced * 2);
5156 for (i = 0; i < ccl->produced; i++)
8cffd3e7
KH
5157 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5158 }
df7492f9
KH
5159 else
5160 {
fb608df3
KH
5161 ASSURE_DESTINATION (ccl->produced);
5162 for (i = 0; i < ccl->produced; i++)
df7492f9 5163 *dst++ = destination_charbuf[i] & 0xFF;
fb608df3 5164 produced_chars += ccl->produced;
df7492f9 5165 }
fb608df3
KH
5166 charbuf += ccl->consumed;
5167 if (ccl->status == CCL_STAT_QUIT
5168 || ccl->status == CCL_STAT_INVALID_CMD)
8cffd3e7 5169 break;
df7492f9
KH
5170 }
5171
fb608df3 5172 switch (ccl->status)
df7492f9
KH
5173 {
5174 case CCL_STAT_SUSPEND_BY_SRC:
065e3595 5175 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
df7492f9
KH
5176 break;
5177 case CCL_STAT_SUSPEND_BY_DST:
065e3595 5178 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
df7492f9
KH
5179 break;
5180 case CCL_STAT_QUIT:
5181 case CCL_STAT_INVALID_CMD:
065e3595 5182 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
df7492f9
KH
5183 break;
5184 default:
065e3595 5185 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 5186 break;
1397dc18 5187 }
df7492f9
KH
5188
5189 coding->produced_char += produced_chars;
5190 coding->produced = dst - coding->destination;
5191 return 0;
1397dc18
KH
5192}
5193
df7492f9 5194
1397dc18 5195\f
df7492f9 5196/*** 10, 11. no-conversion handlers ***/
4ed46869 5197
b73bfc1c 5198/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4ed46869 5199
b73bfc1c 5200static void
971de7fb 5201decode_coding_raw_text (struct coding_system *coding)
4ed46869 5202{
2735d060 5203 int eol_dos =
0a9564cb 5204 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 5205
df7492f9 5206 coding->chars_at_source = 1;
119852e7
KH
5207 coding->consumed_char = coding->src_chars;
5208 coding->consumed = coding->src_bytes;
2735d060 5209 if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
119852e7
KH
5210 {
5211 coding->consumed_char--;
5212 coding->consumed--;
5213 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5214 }
5215 else
5216 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 5217}
4ed46869 5218
df7492f9 5219static int
971de7fb 5220encode_coding_raw_text (struct coding_system *coding)
df7492f9
KH
5221{
5222 int multibytep = coding->dst_multibyte;
5223 int *charbuf = coding->charbuf;
5224 int *charbuf_end = coding->charbuf + coding->charbuf_used;
5225 unsigned char *dst = coding->destination + coding->produced;
5226 unsigned char *dst_end = coding->destination + coding->dst_bytes;
a53e2e89 5227 EMACS_INT produced_chars = 0;
b73bfc1c
KH
5228 int c;
5229
df7492f9 5230 if (multibytep)
b73bfc1c 5231 {
df7492f9 5232 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4ed46869 5233
df7492f9
KH
5234 if (coding->src_multibyte)
5235 while (charbuf < charbuf_end)
5236 {
5237 ASSURE_DESTINATION (safe_room);
5238 c = *charbuf++;
5239 if (ASCII_CHAR_P (c))
5240 EMIT_ONE_ASCII_BYTE (c);
5241 else if (CHAR_BYTE8_P (c))
5242 {
5243 c = CHAR_TO_BYTE8 (c);
5244 EMIT_ONE_BYTE (c);
5245 }
5246 else
5247 {
5248 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
93dec019 5249
df7492f9 5250 CHAR_STRING_ADVANCE (c, p1);
8abc3f12 5251 do
9d123124
KH
5252 {
5253 EMIT_ONE_BYTE (*p0);
5254 p0++;
5255 }
8abc3f12 5256 while (p0 < p1);
df7492f9
KH
5257 }
5258 }
b73bfc1c 5259 else
df7492f9
KH
5260 while (charbuf < charbuf_end)
5261 {
5262 ASSURE_DESTINATION (safe_room);
5263 c = *charbuf++;
5264 EMIT_ONE_BYTE (c);
5265 }
5266 }
5267 else
4ed46869 5268 {
df7492f9 5269 if (coding->src_multibyte)
d46c5b12 5270 {
df7492f9
KH
5271 int safe_room = MAX_MULTIBYTE_LENGTH;
5272
5273 while (charbuf < charbuf_end)
d46c5b12 5274 {
df7492f9
KH
5275 ASSURE_DESTINATION (safe_room);
5276 c = *charbuf++;
5277 if (ASCII_CHAR_P (c))
5278 *dst++ = c;
5279 else if (CHAR_BYTE8_P (c))
5280 *dst++ = CHAR_TO_BYTE8 (c);
b73bfc1c 5281 else
df7492f9 5282 CHAR_STRING_ADVANCE (c, dst);
d46c5b12
KH
5283 }
5284 }
df7492f9
KH
5285 else
5286 {
5287 ASSURE_DESTINATION (charbuf_end - charbuf);
5288 while (charbuf < charbuf_end && dst < dst_end)
5289 *dst++ = *charbuf++;
8f924df7 5290 }
319a3947 5291 produced_chars = dst - (coding->destination + coding->produced);
4ed46869 5292 }
065e3595 5293 record_conversion_result (coding, CODING_RESULT_SUCCESS);
a0ed9b27 5294 coding->produced_char += produced_chars;
df7492f9
KH
5295 coding->produced = dst - coding->destination;
5296 return 0;
4ed46869
KH
5297}
5298
ff0dacd7
KH
5299/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5300 Check if a text is encoded in a charset-based coding system. If it
5301 is, return 1, else return 0. */
5302
0a28aafb 5303static int
cf84bb53
JB
5304detect_coding_charset (struct coding_system *coding,
5305 struct coding_detection_info *detect_info)
1397dc18 5306{
065e3595 5307 const unsigned char *src = coding->source, *src_base;
8f924df7 5308 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9 5309 int multibytep = coding->src_multibyte;
a53e2e89 5310 EMACS_INT consumed_chars = 0;
07295713 5311 Lisp_Object attrs, valids, name;
584948ac 5312 int found = 0;
a53e2e89 5313 EMACS_INT head_ascii = coding->head_ascii;
07295713 5314 int check_latin_extra = 0;
1397dc18 5315
ff0dacd7
KH
5316 detect_info->checked |= CATEGORY_MASK_CHARSET;
5317
df7492f9
KH
5318 coding = &coding_categories[coding_category_charset];
5319 attrs = CODING_ID_ATTRS (coding->id);
5320 valids = AREF (attrs, coding_attr_charset_valids);
07295713 5321 name = CODING_ID_NAME (coding->id);
51b59d79 5322 if (strncmp (SSDATA (SYMBOL_NAME (name)),
237aabf4 5323 "iso-8859-", sizeof ("iso-8859-") - 1) == 0
51b59d79 5324 || strncmp (SSDATA (SYMBOL_NAME (name)),
237aabf4 5325 "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
07295713 5326 check_latin_extra = 1;
237aabf4 5327
df7492f9 5328 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
716b3fa0 5329 src += head_ascii;
1397dc18 5330
b73bfc1c 5331 while (1)
1397dc18 5332 {
df7492f9 5333 int c;
716b3fa0
KH
5334 Lisp_Object val;
5335 struct charset *charset;
5336 int dim, idx;
1397dc18 5337
065e3595 5338 src_base = src;
df7492f9 5339 ONE_MORE_BYTE (c);
065e3595
KH
5340 if (c < 0)
5341 continue;
716b3fa0
KH
5342 val = AREF (valids, c);
5343 if (NILP (val))
df7492f9 5344 break;
584948ac 5345 if (c >= 0x80)
07295713
KH
5346 {
5347 if (c < 0xA0
237aabf4
JR
5348 && check_latin_extra
5349 && (!VECTORP (Vlatin_extra_code_table)
9f0526cb 5350 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])))
07295713
KH
5351 break;
5352 found = CATEGORY_MASK_CHARSET;
5353 }
716b3fa0
KH
5354 if (INTEGERP (val))
5355 {
5356 charset = CHARSET_FROM_ID (XFASTINT (val));
5357 dim = CHARSET_DIMENSION (charset);
5358 for (idx = 1; idx < dim; idx++)
5359 {
5360 if (src == src_end)
5361 goto too_short;
5362 ONE_MORE_BYTE (c);
3ed051d4 5363 if (c < charset->code_space[(dim - 1 - idx) * 2]
716b3fa0
KH
5364 || c > charset->code_space[(dim - 1 - idx) * 2 + 1])
5365 break;
5366 }
5367 if (idx < dim)
5368 break;
5369 }
5370 else
5371 {
5372 idx = 1;
5373 for (; CONSP (val); val = XCDR (val))
5374 {
5375 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5376 dim = CHARSET_DIMENSION (charset);
5377 while (idx < dim)
5378 {
5379 if (src == src_end)
5380 goto too_short;
5381 ONE_MORE_BYTE (c);
5382 if (c < charset->code_space[(dim - 1 - idx) * 4]
5383 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5384 break;
5385 idx++;
5386 }
5387 if (idx == dim)
5388 {
5389 val = Qnil;
5390 break;
5391 }
5392 }
5393 if (CONSP (val))
5394 break;
5395 }
df7492f9 5396 }
716b3fa0 5397 too_short:
ff0dacd7 5398 detect_info->rejected |= CATEGORY_MASK_CHARSET;
df7492f9 5399 return 0;
4ed46869 5400
df7492f9 5401 no_more_source:
ff0dacd7
KH
5402 detect_info->found |= found;
5403 return 1;
df7492f9 5404}
b73bfc1c 5405
b73bfc1c 5406static void
971de7fb 5407decode_coding_charset (struct coding_system *coding)
4ed46869 5408{
8f924df7
KH
5409 const unsigned char *src = coding->source + coding->consumed;
5410 const unsigned char *src_end = coding->source + coding->src_bytes;
5411 const unsigned char *src_base;
69a80ea3 5412 int *charbuf = coding->charbuf + coding->charbuf_used;
ad1746f5 5413 /* We may produce one charset annotation in one loop and one more at
df80c7f0 5414 the end. */
69a80ea3 5415 int *charbuf_end
df80c7f0 5416 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
a53e2e89 5417 EMACS_INT consumed_chars = 0, consumed_chars_base;
df7492f9 5418 int multibytep = coding->src_multibyte;
66ebf983
PE
5419 Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5420 Lisp_Object valids;
a53e2e89
EZ
5421 EMACS_INT char_offset = coding->produced_char;
5422 EMACS_INT last_offset = char_offset;
ff0dacd7 5423 int last_id = charset_ascii;
2735d060 5424 int eol_dos =
0a9564cb 5425 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 5426 int byte_after_cr = -1;
df7492f9 5427
4eb6d3f1 5428 valids = AREF (attrs, coding_attr_charset_valids);
b73bfc1c 5429
df7492f9 5430 while (1)
4ed46869 5431 {
4eb6d3f1 5432 int c;
24a73b0a
KH
5433 Lisp_Object val;
5434 struct charset *charset;
5435 int dim;
5436 int len = 1;
5437 unsigned code;
df7492f9
KH
5438
5439 src_base = src;
5440 consumed_chars_base = consumed_chars;
b73bfc1c 5441
df7492f9 5442 if (charbuf >= charbuf_end)
b71f6f73
KH
5443 {
5444 if (byte_after_cr >= 0)
5445 src_base--;
5446 break;
5447 }
df7492f9 5448
119852e7
KH
5449 if (byte_after_cr >= 0)
5450 {
5451 c = byte_after_cr;
5452 byte_after_cr = -1;
5453 }
5454 else
5455 {
5456 ONE_MORE_BYTE (c);
2735d060 5457 if (eol_dos && c == '\r')
119852e7
KH
5458 ONE_MORE_BYTE (byte_after_cr);
5459 }
065e3595
KH
5460 if (c < 0)
5461 goto invalid_code;
24a73b0a
KH
5462 code = c;
5463
5464 val = AREF (valids, c);
1b17adfd 5465 if (! INTEGERP (val) && ! CONSP (val))
24a73b0a
KH
5466 goto invalid_code;
5467 if (INTEGERP (val))
d46c5b12 5468 {
24a73b0a
KH
5469 charset = CHARSET_FROM_ID (XFASTINT (val));
5470 dim = CHARSET_DIMENSION (charset);
5471 while (len < dim)
b73bfc1c 5472 {
24a73b0a
KH
5473 ONE_MORE_BYTE (c);
5474 code = (code << 8) | c;
5475 len++;
b73bfc1c 5476 }
24a73b0a
KH
5477 CODING_DECODE_CHAR (coding, src, src_base, src_end,
5478 charset, code, c);
d46c5b12 5479 }
df7492f9 5480 else
d46c5b12 5481 {
24a73b0a
KH
5482 /* VAL is a list of charset IDs. It is assured that the
5483 list is sorted by charset dimensions (smaller one
5484 comes first). */
5485 while (CONSP (val))
4eb6d3f1 5486 {
24a73b0a 5487 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
c7c66a95 5488 dim = CHARSET_DIMENSION (charset);
f9d71dcd 5489 while (len < dim)
4eb6d3f1 5490 {
acb2a965
KH
5491 ONE_MORE_BYTE (c);
5492 code = (code << 8) | c;
f9d71dcd 5493 len++;
4eb6d3f1 5494 }
24a73b0a
KH
5495 CODING_DECODE_CHAR (coding, src, src_base,
5496 src_end, charset, code, c);
5497 if (c >= 0)
5498 break;
5499 val = XCDR (val);
ff0dacd7 5500 }
d46c5b12 5501 }
24a73b0a
KH
5502 if (c < 0)
5503 goto invalid_code;
5504 if (charset->id != charset_ascii
5505 && last_id != charset->id)
5506 {
5507 if (last_id != charset_ascii)
69a80ea3 5508 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
24a73b0a
KH
5509 last_id = charset->id;
5510 last_offset = char_offset;
5511 }
5512
df7492f9 5513 *charbuf++ = c;
ff0dacd7 5514 char_offset++;
df7492f9
KH
5515 continue;
5516
5517 invalid_code:
5518 src = src_base;
5519 consumed_chars = consumed_chars_base;
5520 ONE_MORE_BYTE (c);
065e3595 5521 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 5522 char_offset++;
df7492f9 5523 coding->errors++;
4ed46869
KH
5524 }
5525
df7492f9 5526 no_more_source:
ff0dacd7 5527 if (last_id != charset_ascii)
69a80ea3 5528 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
5529 coding->consumed_char += consumed_chars_base;
5530 coding->consumed = src_base - coding->source;
5531 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
5532}
5533
df7492f9 5534static int
971de7fb 5535encode_coding_charset (struct coding_system *coding)
4ed46869 5536{
df7492f9
KH
5537 int multibytep = coding->dst_multibyte;
5538 int *charbuf = coding->charbuf;
5539 int *charbuf_end = charbuf + coding->charbuf_used;
5540 unsigned char *dst = coding->destination + coding->produced;
5541 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5542 int safe_room = MAX_MULTIBYTE_LENGTH;
a53e2e89 5543 EMACS_INT produced_chars = 0;
24a73b0a 5544 Lisp_Object attrs, charset_list;
df7492f9 5545 int ascii_compatible;
b73bfc1c 5546 int c;
b73bfc1c 5547
24a73b0a 5548 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9 5549 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
fb88bf2d 5550
df7492f9 5551 while (charbuf < charbuf_end)
4ed46869 5552 {
4eb6d3f1 5553 struct charset *charset;
df7492f9 5554 unsigned code;
8f924df7 5555
df7492f9
KH
5556 ASSURE_DESTINATION (safe_room);
5557 c = *charbuf++;
5558 if (ascii_compatible && ASCII_CHAR_P (c))
5559 EMIT_ONE_ASCII_BYTE (c);
16eafb5d 5560 else if (CHAR_BYTE8_P (c))
4ed46869 5561 {
16eafb5d
KH
5562 c = CHAR_TO_BYTE8 (c);
5563 EMIT_ONE_BYTE (c);
d46c5b12 5564 }
d46c5b12 5565 else
b73bfc1c 5566 {
4eb6d3f1
KH
5567 charset = char_charset (c, charset_list, &code);
5568 if (charset)
5569 {
5570 if (CHARSET_DIMENSION (charset) == 1)
5571 EMIT_ONE_BYTE (code);
5572 else if (CHARSET_DIMENSION (charset) == 2)
5573 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5574 else if (CHARSET_DIMENSION (charset) == 3)
5575 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5576 else
5577 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5578 (code >> 8) & 0xFF, code & 0xFF);
5579 }
5580 else
41cbe562
KH
5581 {
5582 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5583 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5584 else
5585 c = coding->default_char;
5586 EMIT_ONE_BYTE (c);
5587 }
4ed46869 5588 }
4ed46869
KH
5589 }
5590
065e3595 5591 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
5592 coding->produced_char += produced_chars;
5593 coding->produced = dst - coding->destination;
5594 return 0;
4ed46869
KH
5595}
5596
5597\f
1397dc18 5598/*** 10. C library functions ***/
4ed46869 5599
df7492f9
KH
5600/* Setup coding context CODING from information about CODING_SYSTEM.
5601 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
5602 CODING_SYSTEM is invalid, signal an error. */
4ed46869 5603
ec6d2bb8 5604void
971de7fb 5605setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
4ed46869 5606{
df7492f9
KH
5607 Lisp_Object attrs;
5608 Lisp_Object eol_type;
5609 Lisp_Object coding_type;
4608c386 5610 Lisp_Object val;
4ed46869 5611
df7492f9 5612 if (NILP (coding_system))
ae6f73fa 5613 coding_system = Qundecided;
c07c8e12 5614
df7492f9 5615 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
1f5dbf34 5616
df7492f9 5617 attrs = CODING_ID_ATTRS (coding->id);
0a9564cb 5618 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
1f5dbf34 5619
df7492f9
KH
5620 coding->mode = 0;
5621 coding->head_ascii = -1;
4a015c45
KH
5622 if (VECTORP (eol_type))
5623 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5624 | CODING_REQUIRE_DETECTION_MASK);
5625 else if (! EQ (eol_type, Qunix))
5626 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5627 | CODING_REQUIRE_ENCODING_MASK);
5628 else
5629 coding->common_flags = 0;
5e5c78be
KH
5630 if (! NILP (CODING_ATTR_POST_READ (attrs)))
5631 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5632 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5633 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
8f924df7
KH
5634 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5635 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
1f5dbf34 5636
df7492f9 5637 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7 5638 coding->max_charset_id = SCHARS (val) - 1;
1b3b981b 5639 coding->safe_charsets = SDATA (val);
df7492f9 5640 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
624bda09 5641 coding->carryover_bytes = 0;
4608c386 5642
df7492f9
KH
5643 coding_type = CODING_ATTR_TYPE (attrs);
5644 if (EQ (coding_type, Qundecided))
d46c5b12 5645 {
df7492f9
KH
5646 coding->detector = NULL;
5647 coding->decoder = decode_coding_raw_text;
5648 coding->encoder = encode_coding_raw_text;
5649 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
d46c5b12 5650 }
df7492f9 5651 else if (EQ (coding_type, Qiso_2022))
d46c5b12 5652 {
df7492f9
KH
5653 int i;
5654 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5655
5656 /* Invoke graphic register 0 to plane 0. */
5657 CODING_ISO_INVOCATION (coding, 0) = 0;
5658 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5659 CODING_ISO_INVOCATION (coding, 1)
5660 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5661 /* Setup the initial status of designation. */
5662 for (i = 0; i < 4; i++)
5663 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5664 /* Not single shifting initially. */
5665 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5666 /* Beginning of buffer should also be regarded as bol. */
5667 CODING_ISO_BOL (coding) = 1;
5668 coding->detector = detect_coding_iso_2022;
5669 coding->decoder = decode_coding_iso_2022;
5670 coding->encoder = encode_coding_iso_2022;
5671 if (flags & CODING_ISO_FLAG_SAFE)
5672 coding->mode |= CODING_MODE_SAFE_ENCODING;
d46c5b12 5673 coding->common_flags
df7492f9
KH
5674 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5675 | CODING_REQUIRE_FLUSHING_MASK);
5676 if (flags & CODING_ISO_FLAG_COMPOSITION)
5677 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
ff0dacd7
KH
5678 if (flags & CODING_ISO_FLAG_DESIGNATION)
5679 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
df7492f9
KH
5680 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5681 {
5682 setup_iso_safe_charsets (attrs);
5683 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7 5684 coding->max_charset_id = SCHARS (val) - 1;
1b3b981b 5685 coding->safe_charsets = SDATA (val);
df7492f9
KH
5686 }
5687 CODING_ISO_FLAGS (coding) = flags;
e951386e
KH
5688 CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5689 CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5690 CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5691 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
d46c5b12 5692 }
df7492f9 5693 else if (EQ (coding_type, Qcharset))
d46c5b12 5694 {
df7492f9
KH
5695 coding->detector = detect_coding_charset;
5696 coding->decoder = decode_coding_charset;
5697 coding->encoder = encode_coding_charset;
d46c5b12 5698 coding->common_flags
df7492f9 5699 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
d46c5b12 5700 }
df7492f9 5701 else if (EQ (coding_type, Qutf_8))
d46c5b12 5702 {
a470d443
KH
5703 val = AREF (attrs, coding_attr_utf_bom);
5704 CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5705 : EQ (val, Qt) ? utf_with_bom
5706 : utf_without_bom);
df7492f9
KH
5707 coding->detector = detect_coding_utf_8;
5708 coding->decoder = decode_coding_utf_8;
5709 coding->encoder = encode_coding_utf_8;
5710 coding->common_flags
5711 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
a470d443
KH
5712 if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5713 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
df7492f9
KH
5714 }
5715 else if (EQ (coding_type, Qutf_16))
5716 {
a470d443
KH
5717 val = AREF (attrs, coding_attr_utf_bom);
5718 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5719 : EQ (val, Qt) ? utf_with_bom
5720 : utf_without_bom);
df7492f9 5721 val = AREF (attrs, coding_attr_utf_16_endian);
b49a1807 5722 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
df7492f9 5723 : utf_16_little_endian);
e19c3639 5724 CODING_UTF_16_SURROGATE (coding) = 0;
df7492f9
KH
5725 coding->detector = detect_coding_utf_16;
5726 coding->decoder = decode_coding_utf_16;
5727 coding->encoder = encode_coding_utf_16;
5728 coding->common_flags
5729 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
a470d443 5730 if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
b49a1807 5731 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
d46c5b12 5732 }
df7492f9 5733 else if (EQ (coding_type, Qccl))
4ed46869 5734 {
df7492f9
KH
5735 coding->detector = detect_coding_ccl;
5736 coding->decoder = decode_coding_ccl;
5737 coding->encoder = encode_coding_ccl;
c952af22 5738 coding->common_flags
df7492f9
KH
5739 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5740 | CODING_REQUIRE_FLUSHING_MASK);
5741 }
5742 else if (EQ (coding_type, Qemacs_mule))
5743 {
5744 coding->detector = detect_coding_emacs_mule;
5745 coding->decoder = decode_coding_emacs_mule;
5746 coding->encoder = encode_coding_emacs_mule;
c952af22 5747 coding->common_flags
df7492f9 5748 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
e951386e 5749 coding->spec.emacs_mule.full_support = 1;
df7492f9
KH
5750 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5751 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5752 {
5753 Lisp_Object tail, safe_charsets;
5754 int max_charset_id = 0;
5755
5756 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5757 tail = XCDR (tail))
5758 if (max_charset_id < XFASTINT (XCAR (tail)))
5759 max_charset_id = XFASTINT (XCAR (tail));
1b3b981b
AS
5760 safe_charsets = make_uninit_string (max_charset_id + 1);
5761 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
df7492f9
KH
5762 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5763 tail = XCDR (tail))
8f924df7 5764 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
df7492f9 5765 coding->max_charset_id = max_charset_id;
1b3b981b 5766 coding->safe_charsets = SDATA (safe_charsets);
e951386e 5767 coding->spec.emacs_mule.full_support = 1;
df7492f9 5768 }
e951386e
KH
5769 coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5770 coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
df7492f9
KH
5771 }
5772 else if (EQ (coding_type, Qshift_jis))
5773 {
5774 coding->detector = detect_coding_sjis;
5775 coding->decoder = decode_coding_sjis;
5776 coding->encoder = encode_coding_sjis;
c952af22 5777 coding->common_flags
df7492f9
KH
5778 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5779 }
5780 else if (EQ (coding_type, Qbig5))
5781 {
5782 coding->detector = detect_coding_big5;
5783 coding->decoder = decode_coding_big5;
5784 coding->encoder = encode_coding_big5;
c952af22 5785 coding->common_flags
df7492f9
KH
5786 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5787 }
5788 else /* EQ (coding_type, Qraw_text) */
ec6d2bb8 5789 {
df7492f9
KH
5790 coding->detector = NULL;
5791 coding->decoder = decode_coding_raw_text;
5792 coding->encoder = encode_coding_raw_text;
ea29edf2
KH
5793 if (! EQ (eol_type, Qunix))
5794 {
5795 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5796 if (! VECTORP (eol_type))
5797 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5798 }
5799
4ed46869 5800 }
4ed46869 5801
df7492f9 5802 return;
4ed46869
KH
5803}
5804
0ff61e78
KH
5805/* Return a list of charsets supported by CODING. */
5806
5807Lisp_Object
971de7fb 5808coding_charset_list (struct coding_system *coding)
0ff61e78 5809{
35befdaa 5810 Lisp_Object attrs, charset_list;
0ff61e78
KH
5811
5812 CODING_GET_INFO (coding, attrs, charset_list);
5813 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5814 {
5815 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5816
5817 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5818 charset_list = Viso_2022_charset_list;
5819 }
5820 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5821 {
5822 charset_list = Vemacs_mule_charset_list;
5823 }
5824 return charset_list;
5825}
5826
5827
e9f91ece
KH
5828/* Return a list of charsets supported by CODING-SYSTEM. */
5829
5830Lisp_Object
971de7fb 5831coding_system_charset_list (Lisp_Object coding_system)
e9f91ece
KH
5832{
5833 int id;
5834 Lisp_Object attrs, charset_list;
5835
5836 CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5837 attrs = CODING_ID_ATTRS (id);
5838
5839 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5840 {
5841 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5842
5843 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5844 charset_list = Viso_2022_charset_list;
5845 else
5846 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5847 }
5848 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5849 {
5850 charset_list = Vemacs_mule_charset_list;
5851 }
5852 else
5853 {
5854 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5855 }
5856 return charset_list;
5857}
5858
5859
df7492f9
KH
5860/* Return raw-text or one of its subsidiaries that has the same
5861 eol_type as CODING-SYSTEM. */
ec6d2bb8 5862
df7492f9 5863Lisp_Object
971de7fb 5864raw_text_coding_system (Lisp_Object coding_system)
ec6d2bb8 5865{
0be8721c 5866 Lisp_Object spec, attrs;
df7492f9 5867 Lisp_Object eol_type, raw_text_eol_type;
ec6d2bb8 5868
d3e4cb56
KH
5869 if (NILP (coding_system))
5870 return Qraw_text;
df7492f9
KH
5871 spec = CODING_SYSTEM_SPEC (coding_system);
5872 attrs = AREF (spec, 0);
ec6d2bb8 5873
df7492f9
KH
5874 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5875 return coding_system;
ec6d2bb8 5876
df7492f9
KH
5877 eol_type = AREF (spec, 2);
5878 if (VECTORP (eol_type))
5879 return Qraw_text;
5880 spec = CODING_SYSTEM_SPEC (Qraw_text);
5881 raw_text_eol_type = AREF (spec, 2);
5882 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5883 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5884 : AREF (raw_text_eol_type, 2));
ec6d2bb8
KH
5885}
5886
54f78171 5887
1911a33b
KH
5888/* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5889 the subsidiary that has the same eol-spec as PARENT (if it is not
5890 nil and specifies end-of-line format) or the system's setting
fcbcfb64 5891 (system_eol_type). */
df7492f9
KH
5892
5893Lisp_Object
971de7fb 5894coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
54f78171 5895{
3e139625 5896 Lisp_Object spec, eol_type;
54f78171 5897
d3e4cb56
KH
5898 if (NILP (coding_system))
5899 coding_system = Qraw_text;
df7492f9 5900 spec = CODING_SYSTEM_SPEC (coding_system);
df7492f9 5901 eol_type = AREF (spec, 2);
fcbcfb64 5902 if (VECTORP (eol_type))
df7492f9 5903 {
df7492f9
KH
5904 Lisp_Object parent_eol_type;
5905
fcbcfb64
KH
5906 if (! NILP (parent))
5907 {
5908 Lisp_Object parent_spec;
5909
4a015c45 5910 parent_spec = CODING_SYSTEM_SPEC (parent);
fcbcfb64 5911 parent_eol_type = AREF (parent_spec, 2);
1911a33b 5912 if (VECTORP (parent_eol_type))
4628bef1 5913 parent_eol_type = system_eol_type;
fcbcfb64
KH
5914 }
5915 else
5916 parent_eol_type = system_eol_type;
df7492f9
KH
5917 if (EQ (parent_eol_type, Qunix))
5918 coding_system = AREF (eol_type, 0);
5919 else if (EQ (parent_eol_type, Qdos))
5920 coding_system = AREF (eol_type, 1);
5921 else if (EQ (parent_eol_type, Qmac))
5922 coding_system = AREF (eol_type, 2);
54f78171 5923 }
df7492f9 5924 return coding_system;
54f78171
KH
5925}
5926
fcaf8878
KH
5927
5928/* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5929 decided for writing to a process. If not, complement them, and
5930 return a new coding system. */
5931
5932Lisp_Object
4628bef1 5933complement_process_encoding_system (Lisp_Object coding_system)
fcaf8878 5934{
5886ec9c
KH
5935 Lisp_Object coding_base = Qnil, eol_base = Qnil;
5936 Lisp_Object spec, attrs;
93d50df8 5937 int i;
fcaf8878 5938
93d50df8 5939 for (i = 0; i < 3; i++)
fcaf8878 5940 {
93d50df8
KH
5941 if (i == 1)
5942 coding_system = CDR_SAFE (Vdefault_process_coding_system);
5943 else if (i == 2)
5944 coding_system = preferred_coding_system ();
5945 spec = CODING_SYSTEM_SPEC (coding_system);
5946 if (NILP (spec))
5947 continue;
5948 attrs = AREF (spec, 0);
5949 if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
5950 coding_base = CODING_ATTR_BASE_NAME (attrs);
5951 if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
5952 eol_base = coding_system;
5953 if (! NILP (coding_base) && ! NILP (eol_base))
5954 break;
fcaf8878 5955 }
fcaf8878 5956
93d50df8
KH
5957 if (i > 0)
5958 /* The original CODING_SYSTEM didn't specify text-conversion or
5959 eol-conversion. Be sure that we return a fully complemented
5960 coding system. */
5961 coding_system = coding_inherit_eol_type (coding_base, eol_base);
5962 return coding_system;
fcaf8878
KH
5963}
5964
5965
4ed46869
KH
5966/* Emacs has a mechanism to automatically detect a coding system if it
5967 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5968 it's impossible to distinguish some coding systems accurately
5969 because they use the same range of codes. So, at first, coding
5970 systems are grouped into the following categories:
5971
0ef69138 5972 o coding-category-emacs-mule
4ed46869
KH
5973
5974 The category for a coding system which has the same code range
5975 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 5976 symbol) `emacs-mule' by default.
4ed46869
KH
5977
5978 o coding-category-sjis
5979
5980 The category for a coding system which has the same code range
5981 as SJIS. Assigned the coding-system (Lisp
7717c392 5982 symbol) `japanese-shift-jis' by default.
4ed46869
KH
5983
5984 o coding-category-iso-7
5985
5986 The category for a coding system which has the same code range
7717c392 5987 as ISO2022 of 7-bit environment. This doesn't use any locking
d46c5b12
KH
5988 shift and single shift functions. This can encode/decode all
5989 charsets. Assigned the coding-system (Lisp symbol)
5990 `iso-2022-7bit' by default.
5991
5992 o coding-category-iso-7-tight
5993
5994 Same as coding-category-iso-7 except that this can
5995 encode/decode only the specified charsets.
4ed46869
KH
5996
5997 o coding-category-iso-8-1
5998
5999 The category for a coding system which has the same code range
6000 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
6001 for DIMENSION1 charset. This doesn't use any locking shift
6002 and single shift functions. Assigned the coding-system (Lisp
6003 symbol) `iso-latin-1' by default.
4ed46869
KH
6004
6005 o coding-category-iso-8-2
6006
6007 The category for a coding system which has the same code range
6008 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
6009 for DIMENSION2 charset. This doesn't use any locking shift
6010 and single shift functions. Assigned the coding-system (Lisp
6011 symbol) `japanese-iso-8bit' by default.
4ed46869 6012
7717c392 6013 o coding-category-iso-7-else
4ed46869
KH
6014
6015 The category for a coding system which has the same code range
ad1746f5 6016 as ISO2022 of 7-bit environment but uses locking shift or
7717c392
KH
6017 single shift functions. Assigned the coding-system (Lisp
6018 symbol) `iso-2022-7bit-lock' by default.
6019
6020 o coding-category-iso-8-else
6021
6022 The category for a coding system which has the same code range
ad1746f5 6023 as ISO2022 of 8-bit environment but uses locking shift or
7717c392
KH
6024 single shift functions. Assigned the coding-system (Lisp
6025 symbol) `iso-2022-8bit-ss2' by default.
4ed46869
KH
6026
6027 o coding-category-big5
6028
6029 The category for a coding system which has the same code range
6030 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 6031 `cn-big5' by default.
4ed46869 6032
fa42c37f
KH
6033 o coding-category-utf-8
6034
6035 The category for a coding system which has the same code range
6e76ae91 6036 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
fa42c37f
KH
6037 symbol) `utf-8' by default.
6038
6039 o coding-category-utf-16-be
6040
6041 The category for a coding system in which a text has a
6042 Unicode signature (cf. Unicode Standard) in the order of BIG
6043 endian at the head. Assigned the coding-system (Lisp symbol)
6044 `utf-16-be' by default.
6045
6046 o coding-category-utf-16-le
6047
6048 The category for a coding system in which a text has a
6049 Unicode signature (cf. Unicode Standard) in the order of
6050 LITTLE endian at the head. Assigned the coding-system (Lisp
6051 symbol) `utf-16-le' by default.
6052
1397dc18
KH
6053 o coding-category-ccl
6054
6055 The category for a coding system of which encoder/decoder is
6056 written in CCL programs. The default value is nil, i.e., no
6057 coding system is assigned.
6058
4ed46869
KH
6059 o coding-category-binary
6060
6061 The category for a coding system not categorized in any of the
6062 above. Assigned the coding-system (Lisp symbol)
e0e989f6 6063 `no-conversion' by default.
4ed46869
KH
6064
6065 Each of them is a Lisp symbol whose value is an actual
df7492f9 6066 `coding-system' (also a Lisp symbol) assigned by the user.
4ed46869
KH
6067 What Emacs does actually is to detect a category of coding system.
6068 Then, it uses a `coding-system' assigned to it. If Emacs can't
df7492f9 6069 decide only one possible category, it selects a category of the
4ed46869
KH
6070 highest priority. Priorities of categories are also specified by a
6071 user in a Lisp variable `coding-category-list'.
6072
6073*/
6074
df7492f9
KH
6075#define EOL_SEEN_NONE 0
6076#define EOL_SEEN_LF 1
6077#define EOL_SEEN_CR 2
6078#define EOL_SEEN_CRLF 4
66cfb530 6079
ff0dacd7
KH
6080/* Detect how end-of-line of a text of length SRC_BYTES pointed by
6081 SOURCE is encoded. If CATEGORY is one of
6082 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6083 two-byte, else they are encoded by one-byte.
6084
6085 Return one of EOL_SEEN_XXX. */
4ed46869 6086
bc4bc72a 6087#define MAX_EOL_CHECK_COUNT 3
d46c5b12
KH
6088
6089static int
cf84bb53
JB
6090detect_eol (const unsigned char *source, EMACS_INT src_bytes,
6091 enum coding_category category)
4ed46869 6092{
f6cbaf43 6093 const unsigned char *src = source, *src_end = src + src_bytes;
4ed46869 6094 unsigned char c;
df7492f9
KH
6095 int total = 0;
6096 int eol_seen = EOL_SEEN_NONE;
4ed46869 6097
89528eb3 6098 if ((1 << category) & CATEGORY_MASK_UTF_16)
4ed46869 6099 {
df7492f9 6100 int msb, lsb;
fa42c37f 6101
89528eb3
KH
6102 msb = category == (coding_category_utf_16_le
6103 | coding_category_utf_16_le_nosig);
df7492f9 6104 lsb = 1 - msb;
fa42c37f 6105
df7492f9 6106 while (src + 1 < src_end)
fa42c37f 6107 {
df7492f9
KH
6108 c = src[lsb];
6109 if (src[msb] == 0 && (c == '\n' || c == '\r'))
fa42c37f 6110 {
df7492f9
KH
6111 int this_eol;
6112
6113 if (c == '\n')
6114 this_eol = EOL_SEEN_LF;
6115 else if (src + 3 >= src_end
6116 || src[msb + 2] != 0
6117 || src[lsb + 2] != '\n')
6118 this_eol = EOL_SEEN_CR;
fa42c37f 6119 else
75f4f1ac
EZ
6120 {
6121 this_eol = EOL_SEEN_CRLF;
6122 src += 2;
6123 }
df7492f9
KH
6124
6125 if (eol_seen == EOL_SEEN_NONE)
6126 /* This is the first end-of-line. */
6127 eol_seen = this_eol;
6128 else if (eol_seen != this_eol)
fa42c37f 6129 {
75f4f1ac
EZ
6130 /* The found type is different from what found before.
6131 Allow for stray ^M characters in DOS EOL files. */
ef1b0ba7
SM
6132 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6133 || (eol_seen == EOL_SEEN_CRLF
6134 && this_eol == EOL_SEEN_CR))
75f4f1ac
EZ
6135 eol_seen = EOL_SEEN_CRLF;
6136 else
6137 {
6138 eol_seen = EOL_SEEN_LF;
6139 break;
6140 }
fa42c37f 6141 }
df7492f9
KH
6142 if (++total == MAX_EOL_CHECK_COUNT)
6143 break;
fa42c37f 6144 }
df7492f9 6145 src += 2;
fa42c37f 6146 }
bcf26d6a 6147 }
d46c5b12 6148 else
ef1b0ba7
SM
6149 while (src < src_end)
6150 {
6151 c = *src++;
6152 if (c == '\n' || c == '\r')
6153 {
6154 int this_eol;
d46c5b12 6155
ef1b0ba7
SM
6156 if (c == '\n')
6157 this_eol = EOL_SEEN_LF;
6158 else if (src >= src_end || *src != '\n')
6159 this_eol = EOL_SEEN_CR;
6160 else
6161 this_eol = EOL_SEEN_CRLF, src++;
d46c5b12 6162
ef1b0ba7
SM
6163 if (eol_seen == EOL_SEEN_NONE)
6164 /* This is the first end-of-line. */
6165 eol_seen = this_eol;
6166 else if (eol_seen != this_eol)
6167 {
6168 /* The found type is different from what found before.
6169 Allow for stray ^M characters in DOS EOL files. */
6170 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6171 || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6172 eol_seen = EOL_SEEN_CRLF;
6173 else
6174 {
6175 eol_seen = EOL_SEEN_LF;
6176 break;
6177 }
6178 }
6179 if (++total == MAX_EOL_CHECK_COUNT)
6180 break;
6181 }
6182 }
df7492f9 6183 return eol_seen;
73be902c
KH
6184}
6185
df7492f9 6186
24a73b0a 6187static Lisp_Object
971de7fb 6188adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
73be902c 6189{
0be8721c 6190 Lisp_Object eol_type;
8f924df7 6191
df7492f9
KH
6192 eol_type = CODING_ID_EOL_TYPE (coding->id);
6193 if (eol_seen & EOL_SEEN_LF)
24a73b0a
KH
6194 {
6195 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6196 eol_type = Qunix;
6197 }
6f197c07 6198 else if (eol_seen & EOL_SEEN_CRLF)
24a73b0a
KH
6199 {
6200 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6201 eol_type = Qdos;
6202 }
6f197c07 6203 else if (eol_seen & EOL_SEEN_CR)
24a73b0a
KH
6204 {
6205 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6206 eol_type = Qmac;
6207 }
6208 return eol_type;
d46c5b12 6209}
4ed46869 6210
df7492f9
KH
6211/* Detect how a text specified in CODING is encoded. If a coding
6212 system is detected, update fields of CODING by the detected coding
6213 system. */
0a28aafb 6214
74ab6df5 6215static void
971de7fb 6216detect_coding (struct coding_system *coding)
d46c5b12 6217{
8f924df7 6218 const unsigned char *src, *src_end;
73cce38d 6219 int saved_mode = coding->mode;
d46c5b12 6220
df7492f9
KH
6221 coding->consumed = coding->consumed_char = 0;
6222 coding->produced = coding->produced_char = 0;
6223 coding_set_source (coding);
1c3478b0 6224
df7492f9 6225 src_end = coding->source + coding->src_bytes;
c0e16b14 6226 coding->head_ascii = 0;
1c3478b0 6227
df7492f9
KH
6228 /* If we have not yet decided the text encoding type, detect it
6229 now. */
6230 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
b73bfc1c 6231 {
df7492f9 6232 int c, i;
6cb21a4f 6233 struct coding_detection_info detect_info;
2f3cbb32 6234 int null_byte_found = 0, eight_bit_found = 0;
df7492f9 6235
6cb21a4f 6236 detect_info.checked = detect_info.found = detect_info.rejected = 0;
2f3cbb32 6237 for (src = coding->source; src < src_end; src++)
d46c5b12 6238 {
df7492f9 6239 c = *src;
6cb21a4f 6240 if (c & 0x80)
6cb21a4f 6241 {
2f3cbb32 6242 eight_bit_found = 1;
2f3cbb32
KH
6243 if (null_byte_found)
6244 break;
6245 }
6246 else if (c < 0x20)
6247 {
6248 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6249 && ! inhibit_iso_escape_detection
6250 && ! detect_info.checked)
6cb21a4f 6251 {
2f3cbb32
KH
6252 if (detect_coding_iso_2022 (coding, &detect_info))
6253 {
6254 /* We have scanned the whole data. */
6255 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
c0e16b14
KH
6256 {
6257 /* We didn't find an 8-bit code. We may
6258 have found a null-byte, but it's very
ce5b453a 6259 rare that a binary file conforms to
c0e16b14
KH
6260 ISO-2022. */
6261 src = src_end;
6262 coding->head_ascii = src - coding->source;
6263 }
6264 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
2f3cbb32
KH
6265 break;
6266 }
6267 }
97b1b294 6268 else if (! c && !inhibit_null_byte_detection)
2f3cbb32
KH
6269 {
6270 null_byte_found = 1;
6271 if (eight_bit_found)
6272 break;
6cb21a4f 6273 }
c006c0c8
KH
6274 if (! eight_bit_found)
6275 coding->head_ascii++;
6cb21a4f 6276 }
c006c0c8 6277 else if (! eight_bit_found)
c0e16b14 6278 coding->head_ascii++;
d46c5b12 6279 }
df7492f9 6280
2f3cbb32
KH
6281 if (null_byte_found || eight_bit_found
6282 || coding->head_ascii < coding->src_bytes
6cb21a4f 6283 || detect_info.found)
d46c5b12 6284 {
ff0dacd7
KH
6285 enum coding_category category;
6286 struct coding_system *this;
df7492f9 6287
6cb21a4f
KH
6288 if (coding->head_ascii == coding->src_bytes)
6289 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
6290 for (i = 0; i < coding_category_raw_text; i++)
6291 {
6292 category = coding_priorities[i];
6293 this = coding_categories + category;
6294 if (detect_info.found & (1 << category))
24a73b0a 6295 break;
6cb21a4f
KH
6296 }
6297 else
2f3cbb32
KH
6298 {
6299 if (null_byte_found)
ff0dacd7 6300 {
2f3cbb32
KH
6301 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6302 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
ff0dacd7 6303 }
2f3cbb32
KH
6304 for (i = 0; i < coding_category_raw_text; i++)
6305 {
6306 category = coding_priorities[i];
6307 this = coding_categories + category;
6308 if (this->id < 0)
6309 {
6310 /* No coding system of this category is defined. */
6311 detect_info.rejected |= (1 << category);
6312 }
6313 else if (category >= coding_category_raw_text)
6314 continue;
6315 else if (detect_info.checked & (1 << category))
6316 {
6317 if (detect_info.found & (1 << category))
6318 break;
6319 }
6320 else if ((*(this->detector)) (coding, &detect_info)
6321 && detect_info.found & (1 << category))
6322 {
6323 if (category == coding_category_utf_16_auto)
6324 {
6325 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6326 category = coding_category_utf_16_le;
6327 else
6328 category = coding_category_utf_16_be;
6329 }
6330 break;
6331 }
6332 }
2f3cbb32 6333 }
c0e16b14
KH
6334
6335 if (i < coding_category_raw_text)
6336 setup_coding_system (CODING_ID_NAME (this->id), coding);
6337 else if (null_byte_found)
6338 setup_coding_system (Qno_conversion, coding);
6339 else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6340 == CATEGORY_MASK_ANY)
6341 setup_coding_system (Qraw_text, coding);
6342 else if (detect_info.rejected)
6343 for (i = 0; i < coding_category_raw_text; i++)
6344 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6345 {
6346 this = coding_categories + coding_priorities[i];
6347 setup_coding_system (CODING_ID_NAME (this->id), coding);
6348 break;
6349 }
d46c5b12 6350 }
b73bfc1c 6351 }
a470d443
KH
6352 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6353 == coding_category_utf_8_auto)
6354 {
6355 Lisp_Object coding_systems;
6356 struct coding_detection_info detect_info;
6357
6358 coding_systems
6359 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6360 detect_info.found = detect_info.rejected = 0;
6361 coding->head_ascii = 0;
6362 if (CONSP (coding_systems)
6363 && detect_coding_utf_8 (coding, &detect_info))
6364 {
6365 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6366 setup_coding_system (XCAR (coding_systems), coding);
6367 else
6368 setup_coding_system (XCDR (coding_systems), coding);
6369 }
6370 }
24a73b0a
KH
6371 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6372 == coding_category_utf_16_auto)
b49a1807
KH
6373 {
6374 Lisp_Object coding_systems;
6375 struct coding_detection_info detect_info;
6376
6377 coding_systems
a470d443 6378 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
b49a1807 6379 detect_info.found = detect_info.rejected = 0;
a470d443 6380 coding->head_ascii = 0;
b49a1807 6381 if (CONSP (coding_systems)
24a73b0a 6382 && detect_coding_utf_16 (coding, &detect_info))
b49a1807
KH
6383 {
6384 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6385 setup_coding_system (XCAR (coding_systems), coding);
24a73b0a 6386 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
b49a1807
KH
6387 setup_coding_system (XCDR (coding_systems), coding);
6388 }
6389 }
73cce38d 6390 coding->mode = saved_mode;
4ed46869 6391}
4ed46869 6392
d46c5b12 6393
aaaf0b1e 6394static void
971de7fb 6395decode_eol (struct coding_system *coding)
aaaf0b1e 6396{
24a73b0a
KH
6397 Lisp_Object eol_type;
6398 unsigned char *p, *pbeg, *pend;
3ed051d4 6399
24a73b0a 6400 eol_type = CODING_ID_EOL_TYPE (coding->id);
0a9564cb 6401 if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
24a73b0a
KH
6402 return;
6403
6404 if (NILP (coding->dst_object))
6405 pbeg = coding->destination;
6406 else
6407 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6408 pend = pbeg + coding->produced;
6409
6410 if (VECTORP (eol_type))
aaaf0b1e 6411 {
df7492f9 6412 int eol_seen = EOL_SEEN_NONE;
4ed46869 6413
24a73b0a 6414 for (p = pbeg; p < pend; p++)
aaaf0b1e 6415 {
df7492f9
KH
6416 if (*p == '\n')
6417 eol_seen |= EOL_SEEN_LF;
6418 else if (*p == '\r')
aaaf0b1e 6419 {
df7492f9 6420 if (p + 1 < pend && *(p + 1) == '\n')
aaaf0b1e 6421 {
df7492f9
KH
6422 eol_seen |= EOL_SEEN_CRLF;
6423 p++;
aaaf0b1e 6424 }
aaaf0b1e 6425 else
df7492f9 6426 eol_seen |= EOL_SEEN_CR;
aaaf0b1e 6427 }
aaaf0b1e 6428 }
75f4f1ac
EZ
6429 /* Handle DOS-style EOLs in a file with stray ^M characters. */
6430 if ((eol_seen & EOL_SEEN_CRLF) != 0
6431 && (eol_seen & EOL_SEEN_CR) != 0
6432 && (eol_seen & EOL_SEEN_LF) == 0)
6433 eol_seen = EOL_SEEN_CRLF;
6434 else if (eol_seen != EOL_SEEN_NONE
24a73b0a
KH
6435 && eol_seen != EOL_SEEN_LF
6436 && eol_seen != EOL_SEEN_CRLF
6437 && eol_seen != EOL_SEEN_CR)
6438 eol_seen = EOL_SEEN_LF;
df7492f9 6439 if (eol_seen != EOL_SEEN_NONE)
24a73b0a 6440 eol_type = adjust_coding_eol_type (coding, eol_seen);
aaaf0b1e 6441 }
d46c5b12 6442
24a73b0a 6443 if (EQ (eol_type, Qmac))
27901516 6444 {
24a73b0a 6445 for (p = pbeg; p < pend; p++)
df7492f9
KH
6446 if (*p == '\r')
6447 *p = '\n';
4ed46869 6448 }
24a73b0a 6449 else if (EQ (eol_type, Qdos))
df7492f9 6450 {
a53e2e89 6451 EMACS_INT n = 0;
b73bfc1c 6452
24a73b0a
KH
6453 if (NILP (coding->dst_object))
6454 {
4347441b
KH
6455 /* Start deleting '\r' from the tail to minimize the memory
6456 movement. */
24a73b0a
KH
6457 for (p = pend - 2; p >= pbeg; p--)
6458 if (*p == '\r')
6459 {
72af86bd 6460 memmove (p, p + 1, pend-- - p - 1);
24a73b0a
KH
6461 n++;
6462 }
6463 }
6464 else
6465 {
a53e2e89
EZ
6466 EMACS_INT pos_byte = coding->dst_pos_byte;
6467 EMACS_INT pos = coding->dst_pos;
6468 EMACS_INT pos_end = pos + coding->produced_char - 1;
4347441b
KH
6469
6470 while (pos < pos_end)
6471 {
6472 p = BYTE_POS_ADDR (pos_byte);
6473 if (*p == '\r' && p[1] == '\n')
6474 {
6475 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6476 n++;
6477 pos_end--;
6478 }
6479 pos++;
69b8522d
KH
6480 if (coding->dst_multibyte)
6481 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6482 else
6483 pos_byte++;
4347441b 6484 }
24a73b0a
KH
6485 }
6486 coding->produced -= n;
6487 coding->produced_char -= n;
aaaf0b1e 6488 }
4ed46869
KH
6489}
6490
7d64c6ad 6491
a6f87d34
KH
6492/* Return a translation table (or list of them) from coding system
6493 attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6494 decoding (ENCODEP is zero). */
7d64c6ad 6495
e6a54062 6496static Lisp_Object
971de7fb 6497get_translation_table (Lisp_Object attrs, int encodep, int *max_lookup)
7d64c6ad
KH
6498{
6499 Lisp_Object standard, translation_table;
09ee6fdd 6500 Lisp_Object val;
7d64c6ad 6501
4bed5909
CY
6502 if (NILP (Venable_character_translation))
6503 {
6504 if (max_lookup)
6505 *max_lookup = 0;
6506 return Qnil;
6507 }
7d64c6ad
KH
6508 if (encodep)
6509 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6510 standard = Vstandard_translation_table_for_encode;
6511 else
6512 translation_table = CODING_ATTR_DECODE_TBL (attrs),
6513 standard = Vstandard_translation_table_for_decode;
7d64c6ad 6514 if (NILP (translation_table))
09ee6fdd
KH
6515 translation_table = standard;
6516 else
a6f87d34 6517 {
09ee6fdd
KH
6518 if (SYMBOLP (translation_table))
6519 translation_table = Fget (translation_table, Qtranslation_table);
6520 else if (CONSP (translation_table))
6521 {
6522 translation_table = Fcopy_sequence (translation_table);
6523 for (val = translation_table; CONSP (val); val = XCDR (val))
6524 if (SYMBOLP (XCAR (val)))
6525 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6526 }
6527 if (CHAR_TABLE_P (standard))
6528 {
6529 if (CONSP (translation_table))
6530 translation_table = nconc2 (translation_table,
6531 Fcons (standard, Qnil));
6532 else
6533 translation_table = Fcons (translation_table,
6534 Fcons (standard, Qnil));
6535 }
a6f87d34 6536 }
2170c8f0
KH
6537
6538 if (max_lookup)
09ee6fdd 6539 {
2170c8f0
KH
6540 *max_lookup = 1;
6541 if (CHAR_TABLE_P (translation_table)
6542 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6543 {
6544 val = XCHAR_TABLE (translation_table)->extras[1];
6545 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6546 *max_lookup = XFASTINT (val);
6547 }
6548 else if (CONSP (translation_table))
6549 {
2735d060 6550 Lisp_Object tail;
09ee6fdd 6551
2170c8f0
KH
6552 for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6553 if (CHAR_TABLE_P (XCAR (tail))
6554 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6555 {
2735d060
PE
6556 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6557 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6558 *max_lookup = XFASTINT (tailval);
2170c8f0
KH
6559 }
6560 }
a6f87d34 6561 }
7d64c6ad
KH
6562 return translation_table;
6563}
6564
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables (anything else leaves C alone and sets TRANS to Qnil).

   Whenever a table maps C to a single character, C is replaced by
   that character and, for a list, the lookup goes on with the next
   table.  If a table yields some other non-nil value, TRANS is set
   to it and, for a list, the lookup stops there.  Otherwise TRANS is
   Qnil on exit.

   C and TRANS must be lvalues; TABLE is evaluated more than once.  */
#define LOOKUP_TRANSLATION_TABLE(table, c, trans) \
  do { \
    trans = Qnil; \
    if (CHAR_TABLE_P (table)) \
      { \
        trans = CHAR_TABLE_REF (table, c); \
        if (CHARACTERP (trans)) \
          { \
            c = XFASTINT (trans); \
            trans = Qnil; \
          } \
      } \
    else if (CONSP (table)) \
      { \
        Lisp_Object tail; \
 \
        for (tail = table; CONSP (tail); tail = XCDR (tail)) \
          { \
            if (! CHAR_TABLE_P (XCAR (tail))) \
              continue; \
            trans = CHAR_TABLE_REF (XCAR (tail), c); \
            if (CHARACTERP (trans)) \
              { \
                c = XFASTINT (trans); \
                trans = Qnil; \
              } \
            else if (! NILP (trans)) \
              break; \
          } \
      } \
  } while (0)
6589
7d64c6ad 6590
e951386e
KH
6591/* Return a translation of character(s) at BUF according to TRANS.
6592 TRANS is TO-CHAR or ((FROM . TO) ...) where
6593 FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6594 The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6595 translation is found, and Qnil if not found..
6596 If BUF is too short to lookup characters in FROM, return Qt. */
6597
69a80ea3 6598static Lisp_Object
971de7fb 6599get_translation (Lisp_Object trans, int *buf, int *buf_end)
69a80ea3 6600{
e951386e
KH
6601
6602 if (INTEGERP (trans))
6603 return trans;
6604 for (; CONSP (trans); trans = XCDR (trans))
69a80ea3 6605 {
e951386e
KH
6606 Lisp_Object val = XCAR (trans);
6607 Lisp_Object from = XCAR (val);
6608 int len = ASIZE (from);
6609 int i;
69a80ea3 6610
e951386e 6611 for (i = 0; i < len; i++)
69a80ea3 6612 {
e951386e
KH
6613 if (buf + i == buf_end)
6614 return Qt;
6615 if (XINT (AREF (from, i)) != buf[i])
6616 break;
69a80ea3 6617 }
e951386e
KH
6618 if (i == len)
6619 return val;
69a80ea3 6620 }
e951386e 6621 return Qnil;
69a80ea3
KH
6622}
6623
6624
/* Write decoded characters to the destination of CODING, starting at
   CODING->destination + CODING->produced, enlarging the destination
   with alloc_destination when it is too small.

   If CODING->chars_at_source is zero, the characters are taken from
   CODING->charbuf (CODING->charbuf_used elements) and translated by
   TRANSLATION_TABLE on the way; negative elements are annotation data
   and are skipped.  Otherwise the bytes at CODING->source
   (CODING->consumed of them) are copied, converting between unibyte
   and multibyte representation if source and destination differ.

   LAST_BLOCK nonzero means no more characters will follow, so a
   multi-character translation that would need more input is not
   waited for.

   CODING->produced and CODING->produced_char are updated, and if the
   destination is a buffer the new text is made part of it with
   insert_from_gap.

   The return value is the number of trailing elements of
   CODING->charbuf that were left unprocessed (always 0 when
   CODING->chars_at_source is nonzero).  */

static int
produce_chars (struct coding_system *coding, Lisp_Object translation_table,
               int last_block)
{
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  EMACS_INT produced;
  EMACS_INT produced_chars = 0;
  int carryover = 0;

  if (! coding->chars_at_source)
    {
      /* Source characters are in coding->charbuf.  */
      int *buf = coding->charbuf;
      int *buf_end = buf + coding->charbuf_used;

      /* When source and destination are the same object, the
         destination may only grow up to the bytes consumed so far.  */
      if (EQ (coding->src_object, coding->dst_object))
        {
          coding_set_source (coding);
          dst_end = ((unsigned char *) coding->source) + coding->consumed;
        }

      while (buf < buf_end)
        {
          int c = *buf, i;

          if (c >= 0)
            {
              /* FROM_NCHARS source characters are replaced by
                 TO_NCHARS destination characters.  */
              EMACS_INT from_nchars = 1, to_nchars = 1;
              Lisp_Object trans = Qnil;

              LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
              if (! NILP (trans))
                {
                  trans = get_translation (trans, buf, buf_end);
                  if (INTEGERP (trans))
                    c = XINT (trans);
                  else if (CONSP (trans))
                    {
                      from_nchars = ASIZE (XCAR (trans));
                      trans = XCDR (trans);
                      if (INTEGERP (trans))
                        c = XINT (trans);
                      else
                        {
                          to_nchars = ASIZE (trans);
                          c = XINT (AREF (trans, 0));
                        }
                    }
                  /* Qt: charbuf ended in the middle of a possible
                     match.  Leave the rest as carryover unless this
                     is the last block.  */
                  else if (EQ (trans, Qt) && ! last_block)
                    break;
                }

              if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
                {
                  dst = alloc_destination (coding,
                                           buf_end - buf
                                           + MAX_MULTIBYTE_LENGTH * to_nchars,
                                           dst);
                  /* Recompute DST_END for the new destination.  */
                  if (EQ (coding->src_object, coding->dst_object))
                    {
                      coding_set_source (coding);
                      dst_end = (((unsigned char *) coding->source)
                                 + coding->consumed);
                    }
                  else
                    dst_end = coding->destination + coding->dst_bytes;
                }

              for (i = 0; i < to_nchars; i++)
                {
                  /* The first character is already in C; the others
                     come from the TO vector.  */
                  if (i > 0)
                    c = XINT (AREF (trans, i));
                  if (coding->dst_multibyte
                      || ! CHAR_BYTE8_P (c))
                    CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
                  else
                    *dst++ = CHAR_TO_BYTE8 (c);
                }
              produced_chars += to_nchars;
              buf += from_nchars;
            }
          else
            /* This is an annotation datum.  (-C) is the length.  */
            buf += -c;
        }
      carryover = buf_end - buf;
    }
  else
    {
      /* Source characters are at coding->source.  */
      const unsigned char *src = coding->source;
      const unsigned char *src_end = src + coding->consumed;

      if (EQ (coding->dst_object, coding->src_object))
        dst_end = (unsigned char *) src;
      if (coding->src_multibyte != coding->dst_multibyte)
        {
          if (coding->src_multibyte)
            {
              /* Multibyte source, unibyte destination: one byte is
                 written per value fetched by ONE_MORE_BYTE.
                 MULTIBYTEP, CONSUMED_CHARS, SRC_BASE and the label
                 no_more_source are presumably what the
                 ONE_MORE_BYTE macro (defined elsewhere) expects
                 -- verify against its definition.  */
              int multibytep = 1;
              EMACS_INT consumed_chars = 0;

              while (1)
                {
                  const unsigned char *src_base = src;
                  int c;

                  ONE_MORE_BYTE (c);
                  if (dst == dst_end)
                    {
                      /* In-place conversion: the bytes read so far
                         have become available for output.  */
                      if (EQ (coding->src_object, coding->dst_object))
                        dst_end = (unsigned char *) src;
                      if (dst == dst_end)
                        {
                          /* SRC is kept as an offset because
                             CODING->source is re-established by
                             coding_set_source below.  */
                          EMACS_INT offset = src - coding->source;

                          dst = alloc_destination (coding, src_end - src + 1,
                                                   dst);
                          dst_end = coding->destination + coding->dst_bytes;
                          coding_set_source (coding);
                          src = coding->source + offset;
                          /* NOTE(review): SRC_END was initialized
                             from CODING->consumed but is recomputed
                             here from CODING->src_bytes -- confirm
                             that this is intended.  */
                          src_end = coding->source + coding->src_bytes;
                          if (EQ (coding->src_object, coding->dst_object))
                            dst_end = (unsigned char *) src;
                        }
                    }
                  *dst++ = c;
                  produced_chars++;
                }
            no_more_source:
              ;
            }
          else
            /* Unibyte source, multibyte destination.  The test
               against DST_END - 1 keeps room for two bytes per
               source byte.  */
            while (src < src_end)
              {
                int multibytep = 1;
                int c = *src++;

                if (dst >= dst_end - 1)
                  {
                    if (EQ (coding->src_object, coding->dst_object))
                      dst_end = (unsigned char *) src;
                    if (dst >= dst_end - 1)
                      {
                        EMACS_INT offset = src - coding->source;
                        EMACS_INT more_bytes;

                        if (EQ (coding->src_object, coding->dst_object))
                          more_bytes = ((src_end - src) / 2) + 2;
                        else
                          more_bytes = src_end - src + 2;
                        dst = alloc_destination (coding, more_bytes, dst);
                        dst_end = coding->destination + coding->dst_bytes;
                        coding_set_source (coding);
                        src = coding->source + offset;
                        /* NOTE(review): same remark as above about
                           CODING->src_bytes vs. CODING->consumed.  */
                        src_end = coding->source + coding->src_bytes;
                        if (EQ (coding->src_object, coding->dst_object))
                          dst_end = (unsigned char *) src;
                      }
                  }
                EMIT_ONE_BYTE (c);
              }
        }
      else
        {
          /* Same representation on both sides: a plain byte copy.  */
          if (!EQ (coding->src_object, coding->dst_object))
            {
              EMACS_INT require = coding->src_bytes - coding->dst_bytes;

              if (require > 0)
                {
                  EMACS_INT offset = src - coding->source;

                  dst = alloc_destination (coding, require, dst);
                  coding_set_source (coding);
                  src = coding->source + offset;
                  src_end = coding->source + coding->src_bytes;
                }
            }
          produced_chars = coding->consumed_char;
          while (src < src_end)
            *dst++ = *src++;
        }
    }

  /* Number of bytes written by this call.  */
  produced = dst - (coding->destination + coding->produced);
  if (BUFFERP (coding->dst_object) && produced_chars > 0)
    insert_from_gap (produced_chars, produced);
  coding->produced += produced;
  coding->produced_char += produced_chars;
  return carryover;
}
6818
ff0dacd7
KH
6819/* Compose text in CODING->object according to the annotation data at
6820 CHARBUF. CHARBUF is an array:
e951386e 6821 [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
df7492f9 6822 */
4ed46869 6823
df7492f9 6824static INLINE void
971de7fb 6825produce_composition (struct coding_system *coding, int *charbuf, EMACS_INT pos)
4ed46869 6826{
df7492f9 6827 int len;
69a80ea3 6828 EMACS_INT to;
df7492f9 6829 enum composition_method method;
df7492f9 6830 Lisp_Object components;
fa42c37f 6831
e951386e 6832 len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
69a80ea3 6833 to = pos + charbuf[2];
e951386e 6834 method = (enum composition_method) (charbuf[4]);
d46c5b12 6835
df7492f9
KH
6836 if (method == COMPOSITION_RELATIVE)
6837 components = Qnil;
e951386e 6838 else
d46c5b12 6839 {
df7492f9 6840 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
e951386e 6841 int i, j;
b73bfc1c 6842
e951386e
KH
6843 if (method == COMPOSITION_WITH_RULE)
6844 len = charbuf[2] * 3 - 2;
6845 charbuf += MAX_ANNOTATION_LENGTH;
6846 /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6847 for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
9ffd559c 6848 {
e951386e
KH
6849 if (charbuf[i] >= 0)
6850 args[j] = make_number (charbuf[i]);
6851 else
6852 {
6853 i++;
6854 args[j] = make_number (charbuf[i] % 0x100);
6855 }
9ffd559c 6856 }
e951386e 6857 components = (i == j ? Fstring (j, args) : Fvector (j, args));
d46c5b12 6858 }
69a80ea3 6859 compose_text (pos, to, components, Qnil, coding->dst_object);
d46c5b12
KH
6860}
6861
d46c5b12 6862
ff0dacd7
KH
6863/* Put `charset' property on text in CODING->object according to
6864 the annotation data at CHARBUF. CHARBUF is an array:
69a80ea3 6865 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
ff0dacd7 6866 */
d46c5b12 6867
ff0dacd7 6868static INLINE void
971de7fb 6869produce_charset (struct coding_system *coding, int *charbuf, EMACS_INT pos)
d46c5b12 6870{
69a80ea3
KH
6871 EMACS_INT from = pos - charbuf[2];
6872 struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
b73bfc1c 6873
69a80ea3 6874 Fput_text_property (make_number (from), make_number (pos),
ff0dacd7
KH
6875 Qcharset, CHARSET_NAME (charset),
6876 coding->dst_object);
d46c5b12
KH
6877}
6878
d46c5b12 6879
/* Number of elements first tried for the conversion work area.  */
#define CHARBUF_SIZE 0x4000

/* Allocate CODING->charbuf with alloca and record its length in
   CODING->charbuf_size.  Sizes from CHARBUF_SIZE downwards are tried,
   halving each time, as long as the size is above 1024.  If no
   attempt yields a buffer, CODING_RESULT_INSUFFICIENT_MEM is recorded
   and the *calling function* returns CODING->result, so this macro
   may only be used directly in the body of a function returning
   int.  */
#define ALLOC_CONVERSION_WORK_AREA(coding) \
  do { \
    int size; \
 \
    coding->charbuf = NULL; \
    for (size = CHARBUF_SIZE; size > 1024; size >>= 1) \
      { \
        coding->charbuf = (int *) alloca (sizeof (int) * size); \
        if (coding->charbuf) \
          break; \
      } \
    if (! coding->charbuf) \
      { \
        record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
        return coding->result; \
      } \
    coding->charbuf_size = size; \
  } while (0)
4ed46869 6901
d46c5b12
KH
6902
6903static void
971de7fb 6904produce_annotation (struct coding_system *coding, EMACS_INT pos)
d46c5b12 6905{
df7492f9
KH
6906 int *charbuf = coding->charbuf;
6907 int *charbuf_end = charbuf + coding->charbuf_used;
d46c5b12 6908
ff0dacd7
KH
6909 if (NILP (coding->dst_object))
6910 return;
d46c5b12 6911
df7492f9 6912 while (charbuf < charbuf_end)
a84f1519 6913 {
df7492f9 6914 if (*charbuf >= 0)
e951386e 6915 pos++, charbuf++;
d46c5b12 6916 else
d46c5b12 6917 {
df7492f9 6918 int len = -*charbuf;
e951386e
KH
6919
6920 if (len > 2)
6921 switch (charbuf[1])
6922 {
6923 case CODING_ANNOTATE_COMPOSITION_MASK:
6924 produce_composition (coding, charbuf, pos);
6925 break;
6926 case CODING_ANNOTATE_CHARSET_MASK:
6927 produce_charset (coding, charbuf, pos);
6928 break;
6929 }
df7492f9 6930 charbuf += len;
d46c5b12 6931 }
a84f1519 6932 }
d46c5b12
KH
6933}
6934
/* Decode the data at CODING->src_object into CODING->dst_object.
   CODING->src_object is a buffer, a string, or nil.
   CODING->dst_object is a buffer.

   If CODING->src_object is a buffer, it must be the current buffer.
   In this case, if CODING->src_pos is positive, it is a position of
   the source text in the buffer, otherwise, the source text is in the
   gap area of the buffer, and CODING->src_pos specifies the offset of
   the text from GPT (which must be the same as PT).  If this is the
   same buffer as CODING->dst_object, CODING->src_pos must be
   negative.

   If CODING->src_object is a string, CODING->src_pos is an index to
   that string.

   If CODING->src_object is nil, CODING->source must already point to
   the non-relocatable memory area.  In this case, CODING->src_pos is
   an offset from CODING->source.

   The decoded data is inserted at the current point of the buffer
   CODING->dst_object.

   The return value is CODING->result.  */

static int
decode_coding (struct coding_system *coding)
{
  Lisp_Object attrs;
  Lisp_Object undo_list;
  Lisp_Object translation_table;
  struct ccl_spec cclspec;
  int carryover;
  int i;

  /* If the gap lies inside the source text, move it to the start of
     that text.  */
  if (BUFFERP (coding->src_object)
      && coding->src_pos > 0
      && coding->src_pos < GPT
      && coding->src_pos + coding->src_chars > GPT)
    move_gap_both (coding->src_pos, coding->src_pos_byte);

  undo_list = Qt;
  if (BUFFERP (coding->dst_object))
    {
      if (current_buffer != XBUFFER (coding->dst_object))
        set_buffer_internal (XBUFFER (coding->dst_object));
      if (GPT != PT)
        move_gap_both (PT, PT_BYTE);
      /* Save the undo list and set it to t while decoding; it is
         put back at the end, followed by a single record_insert.  */
      undo_list = BVAR (current_buffer, undo_list);
      BVAR (current_buffer, undo_list) = Qt;
    }

  coding->consumed = coding->consumed_char = 0;
  coding->produced = coding->produced_char = 0;
  coding->chars_at_source = 0;
  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  coding->errors = 0;

  /* This may return from this function on allocation failure.  */
  ALLOC_CONVERSION_WORK_AREA (coding);

  attrs = CODING_ID_ATTRS (coding->id);
  translation_table = get_translation_table (attrs, 0, NULL);

  carryover = 0;
  if (coding->decoder == decode_coding_ccl)
    {
      coding->spec.ccl = &cclspec;
      setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
    }
  do
    {
      /* Position of the first character produced in this round.  */
      EMACS_INT pos = coding->dst_pos + coding->produced_char;

      coding_set_source (coding);
      coding->annotated = 0;
      /* The characters carried over from the previous round are
         already at the head of the work area.  */
      coding->charbuf_used = carryover;
      (*(coding->decoder)) (coding);
      coding_set_destination (coding);
      carryover = produce_chars (coding, translation_table, 0);
      if (coding->annotated)
        produce_annotation (coding, pos);
      /* Move the unprocessed tail of the work area to its head.  */
      for (i = 0; i < carryover; i++)
        coding->charbuf[i]
          = coding->charbuf[coding->charbuf_used - carryover + i];
    }
  while (coding->result == CODING_RESULT_INSUFFICIENT_DST
         || (coding->consumed < coding->src_bytes
             && (coding->result == CODING_RESULT_SUCCESS
                 || coding->result == CODING_RESULT_INVALID_SRC)));

  /* Produce what is still carried over, this time as the last
     block.  */
  if (carryover > 0)
    {
      coding_set_destination (coding);
      coding->charbuf_used = carryover;
      produce_chars (coding, translation_table, 1);
    }

  coding->carryover_bytes = 0;
  if (coding->consumed < coding->src_bytes)
    {
      /* Some source bytes were left undecoded.  */
      int nbytes = coding->src_bytes - coding->consumed;
      const unsigned char *src;

      coding_set_source (coding);
      coding_set_destination (coding);
      src = coding->source + coding->consumed;

      if (coding->mode & CODING_MODE_LAST_BLOCK)
        {
          /* Flush out unprocessed data as binary chars.  We are sure
             that the number of data is less than the size of
             coding->charbuf.  */
          coding->charbuf_used = 0;
          coding->chars_at_source = 0;

          while (nbytes-- > 0)
            {
              int c = *src++;

              if (c & 0x80)
                c = BYTE8_TO_CHAR (c);
              coding->charbuf[coding->charbuf_used++] = c;
            }
          /* No translation table is applied to these bytes.  */
          produce_chars (coding, Qnil, 1);
        }
      else
        {
          /* Record unprocessed bytes in coding->carryover.  We are
             sure that the number of data is less than the size of
             coding->carryover.  */
          unsigned char *p = coding->carryover;

          if (nbytes > sizeof coding->carryover)
            nbytes = sizeof coding->carryover;
          coding->carryover_bytes = nbytes;
          while (nbytes-- > 0)
            *p++ = *src++;
        }
      coding->consumed = coding->src_bytes;
    }

  if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
      && !inhibit_eol_conversion)
    decode_eol (coding);
  if (BUFFERP (coding->dst_object))
    {
      BVAR (current_buffer, undo_list) = undo_list;
      record_insert (coding->dst_pos, coding->produced_char);
    }
  return coding->result;
}
7084
aaaf0b1e 7085
e1c23804 7086/* Extract an annotation datum from a composition starting at POS and
ff0dacd7
KH
7087 ending before LIMIT of CODING->src_object (buffer or string), store
7088 the data in BUF, set *STOP to a starting position of the next
7089 composition (if any) or to LIMIT, and return the address of the
7090 next element of BUF.
7091
7092 If such an annotation is not found, set *STOP to a starting
7093 position of a composition after POS (if any) or to LIMIT, and
7094 return BUF. */
7095
7096static INLINE int *
cf84bb53
JB
7097handle_composition_annotation (EMACS_INT pos, EMACS_INT limit,
7098 struct coding_system *coding, int *buf,
7099 EMACS_INT *stop)
aaaf0b1e 7100{
ff0dacd7
KH
7101 EMACS_INT start, end;
7102 Lisp_Object prop;
aaaf0b1e 7103
ff0dacd7
KH
7104 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7105 || end > limit)
7106 *stop = limit;
7107 else if (start > pos)
7108 *stop = start;
7109 else
aaaf0b1e 7110 {
ff0dacd7 7111 if (start == pos)
aaaf0b1e 7112 {
ff0dacd7
KH
7113 /* We found a composition. Store the corresponding
7114 annotation data in BUF. */
7115 int *head = buf;
7116 enum composition_method method = COMPOSITION_METHOD (prop);
7117 int nchars = COMPOSITION_LENGTH (prop);
7118
e951386e 7119 ADD_COMPOSITION_DATA (buf, nchars, 0, method);
ff0dacd7 7120 if (method != COMPOSITION_RELATIVE)
aaaf0b1e 7121 {
ff0dacd7
KH
7122 Lisp_Object components;
7123 int len, i, i_byte;
7124
7125 components = COMPOSITION_COMPONENTS (prop);
7126 if (VECTORP (components))
aaaf0b1e 7127 {
ff0dacd7
KH
7128 len = XVECTOR (components)->size;
7129 for (i = 0; i < len; i++)
7130 *buf++ = XINT (AREF (components, i));
aaaf0b1e 7131 }
ff0dacd7 7132 else if (STRINGP (components))
aaaf0b1e 7133 {
8f924df7 7134 len = SCHARS (components);
ff0dacd7
KH
7135 i = i_byte = 0;
7136 while (i < len)
7137 {
7138 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7139 buf++;
7140 }
7141 }
7142 else if (INTEGERP (components))
7143 {
7144 len = 1;
7145 *buf++ = XINT (components);
7146 }
7147 else if (CONSP (components))
7148 {
7149 for (len = 0; CONSP (components);
7150 len++, components = XCDR (components))
7151 *buf++ = XINT (XCAR (components));
aaaf0b1e 7152 }
aaaf0b1e 7153 else
ff0dacd7
KH
7154 abort ();
7155 *head -= len;
aaaf0b1e 7156 }
aaaf0b1e 7157 }
ff0dacd7
KH
7158
7159 if (find_composition (end, limit, &start, &end, &prop,
7160 coding->src_object)
7161 && end <= limit)
7162 *stop = start;
7163 else
7164 *stop = limit;
aaaf0b1e 7165 }
ff0dacd7
KH
7166 return buf;
7167}
7168
7169
e1c23804 7170/* Extract an annotation datum from a text property `charset' at POS of
ff0dacd7
KH
7171 CODING->src_object (buffer of string), store the data in BUF, set
7172 *STOP to the position where the value of `charset' property changes
7173 (limiting by LIMIT), and return the address of the next element of
7174 BUF.
7175
7176 If the property value is nil, set *STOP to the position where the
7177 property value is non-nil (limiting by LIMIT), and return BUF. */
7178
7179static INLINE int *
cf84bb53
JB
7180handle_charset_annotation (EMACS_INT pos, EMACS_INT limit,
7181 struct coding_system *coding, int *buf,
7182 EMACS_INT *stop)
ff0dacd7
KH
7183{
7184 Lisp_Object val, next;
7185 int id;
7186
7187 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7188 if (! NILP (val) && CHARSETP (val))
7189 id = XINT (CHARSET_SYMBOL_ID (val));
7190 else
7191 id = -1;
69a80ea3 7192 ADD_CHARSET_DATA (buf, 0, id);
ff0dacd7
KH
7193 next = Fnext_single_property_change (make_number (pos), Qcharset,
7194 coding->src_object,
7195 make_number (limit));
7196 *stop = XINT (next);
7197 return buf;
7198}
7199
7200
/* Read characters from the source of CODING, starting after the
   CODING->consumed bytes already read, and store them in
   CODING->charbuf until the work area is nearly full or the source
   text ends.

   On the way, LF is converted according to the EOL type of CODING
   (unless `inhibit_eol_conversion'), characters are translated by
   TRANSLATION_TABLE, and annotation data for `charset' text
   properties are inserted if CODING->common_flags asks for them.
   MAX_LOOKUP is the number of characters to look ahead for
   multi-character translations; it is used only if
   TRANSLATION_TABLE is non-nil.

   CODING->consumed, CODING->consumed_char and CODING->charbuf_used
   are updated, and CODING->chars_at_source is cleared.  */

static void
consume_chars (struct coding_system *coding, Lisp_Object translation_table,
               int max_lookup)
{
  int *buf = coding->charbuf;
  int *buf_end = coding->charbuf + coding->charbuf_size;
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  EMACS_INT pos = coding->src_pos + coding->consumed_char;
  EMACS_INT end_pos = coding->src_pos + coding->src_chars;
  int multibytep = coding->src_multibyte;
  Lisp_Object eol_type;
  int c;
  /* STOP is the next position at which annotations must be looked
     for; it is the smaller of the two specific stop positions.  */
  EMACS_INT stop, stop_composition, stop_charset;
  int *lookup_buf = NULL;

  if (! NILP (translation_table))
    lookup_buf = alloca (sizeof (int) * max_lookup);

  eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
  /* An undecided EOL type (a vector) is encoded as `unix'.  */
  if (VECTORP (eol_type))
    eol_type = Qunix;

  /* Note: composition handling is not yet implemented.  */
  coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;

  if (NILP (coding->src_object))
    stop = stop_composition = stop_charset = end_pos;
  else
    {
      if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
        stop = stop_composition = pos;
      else
        stop = stop_composition = end_pos;
      if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
        stop = stop_charset = pos;
      else
        stop_charset = end_pos;
    }

  /* Compensate for CRLF and conversion.  */
  buf_end -= 1 + MAX_ANNOTATION_LENGTH;
  while (buf < buf_end)
    {
      Lisp_Object trans;

      if (pos == stop)
        {
          if (pos == end_pos)
            break;
          if (pos == stop_composition)
            buf = handle_composition_annotation (pos, end_pos, coding,
                                                 buf, &stop_composition);
          if (pos == stop_charset)
            buf = handle_charset_annotation (pos, end_pos, coding,
                                             buf, &stop_charset);
          stop = (stop_composition < stop_charset
                  ? stop_composition : stop_charset);
        }

      /* Fetch the next character into C, advancing SRC and POS.  */
      if (! multibytep)
        {
          EMACS_INT bytes;

          if (coding->encoder == encode_coding_raw_text
              || coding->encoder == encode_coding_ccl)
            c = *src++, pos++;
          /* A valid multibyte sequence in a unibyte source is read as
             one character; POS advances by its byte length.  */
          else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
            c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
          else
            c = BYTE8_TO_CHAR (*src), src++, pos++;
        }
      else
        c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
      if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
        c = '\n';
      if (! EQ (eol_type, Qunix))
        {
          if (c == '\n')
            {
              /* `dos': emit CR here, the LF follows below.
                 Otherwise the LF itself becomes CR.  */
              if (EQ (eol_type, Qdos))
                *buf++ = '\r';
              else
                c = '\r';
            }
        }

      trans = Qnil;
      LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
      if (NILP (trans))
        *buf++ = c;
      else
        {
          int from_nchars = 1, to_nchars = 1;
          int *lookup_buf_end;
          const unsigned char *p = src;
          int i;

          /* Collect C and up to MAX_LOOKUP - 1 following characters,
             without consuming them, to match against TRANS.  */
          lookup_buf[0] = c;
          for (i = 1; i < max_lookup && p < src_end; i++)
            lookup_buf[i] = STRING_CHAR_ADVANCE (p);
          lookup_buf_end = lookup_buf + i;
          trans = get_translation (trans, lookup_buf, lookup_buf_end);
          if (INTEGERP (trans))
            c = XINT (trans);
          else if (CONSP (trans))
            {
              from_nchars = ASIZE (XCAR (trans));
              trans = XCDR (trans);
              if (INTEGERP (trans))
                c = XINT (trans);
              else
                {
                  to_nchars = ASIZE (trans);
                  if (buf + to_nchars > buf_end)
                    break;
                  c = XINT (AREF (trans, 0));
                }
            }
          else
            /* NOTE(review): SRC and POS have already been advanced
               past C when the loop is left here (and at the break
               above) -- confirm that C is not lost.  */
            break;
          *buf++ = c;
          for (i = 1; i < to_nchars; i++)
            *buf++ = XINT (AREF (trans, i));
          /* Skip the remaining source characters that were part of
             the matched FROM sequence.  */
          for (i = 1; i < from_nchars; i++, pos++)
            src += MULTIBYTE_LENGTH_NO_CHECK (src);
        }
    }

  coding->consumed = src - coding->source;
  coding->consumed_char = pos - coding->src_pos;
  coding->charbuf_used = buf - coding->charbuf;
  coding->chars_at_source = 0;
}
7335
4ed46869 7336
df7492f9
KH
7337/* Encode the text at CODING->src_object into CODING->dst_object.
7338 CODING->src_object is a buffer or a string.
7339 CODING->dst_object is a buffer or nil.
7340
7341 If CODING->src_object is a buffer, it must be the current buffer.
7342 In this case, if CODING->src_pos is positive, it is a position of
7343 the source text in the buffer, otherwise. the source text is in the
7344 gap area of the buffer, and coding->src_pos specifies the offset of
7345 the text from GPT (which must be the same as PT). If this is the
7346 same buffer as CODING->dst_object, CODING->src_pos must be
7347 negative and CODING should not have `pre-write-conversion'.
7348
7349 If CODING->src_object is a string, CODING should not have
7350 `pre-write-conversion'.
7351
7352 If CODING->dst_object is a buffer, the encoded data is inserted at
7353 the current point of that buffer.
7354
7355 If CODING->dst_object is nil, the encoded data is placed at the
7356 memory area specified by CODING->destination. */
7357
7358static int
971de7fb 7359encode_coding (struct coding_system *coding)
4ed46869 7360{
df7492f9 7361 Lisp_Object attrs;
7d64c6ad 7362 Lisp_Object translation_table;
09ee6fdd 7363 int max_lookup;
fb608df3 7364 struct ccl_spec cclspec;
9861e777 7365
df7492f9 7366 attrs = CODING_ID_ATTRS (coding->id);
ea29edf2
KH
7367 if (coding->encoder == encode_coding_raw_text)
7368 translation_table = Qnil, max_lookup = 0;
7369 else
7370 translation_table = get_translation_table (attrs, 1, &max_lookup);
4ed46869 7371
df7492f9 7372 if (BUFFERP (coding->dst_object))
8844fa83 7373 {
df7492f9
KH
7374 set_buffer_internal (XBUFFER (coding->dst_object));
7375 coding->dst_multibyte
4b4deea2 7376 = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
8844fa83 7377 }
4ed46869 7378
b73bfc1c 7379 coding->consumed = coding->consumed_char = 0;
df7492f9 7380 coding->produced = coding->produced_char = 0;
065e3595 7381 record_conversion_result (coding, CODING_RESULT_SUCCESS);
b73bfc1c 7382 coding->errors = 0;
b73bfc1c 7383
df7492f9 7384 ALLOC_CONVERSION_WORK_AREA (coding);
4ed46869 7385
fb608df3
KH
7386 if (coding->encoder == encode_coding_ccl)
7387 {
7388 coding->spec.ccl = &cclspec;
7389 setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7390 }
df7492f9
KH
7391 do {
7392 coding_set_source (coding);
09ee6fdd 7393 consume_chars (coding, translation_table, max_lookup);
df7492f9
KH
7394 coding_set_destination (coding);
7395 (*(coding->encoder)) (coding);
7396 } while (coding->consumed_char < coding->src_chars);
7397
284201e4 7398 if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
df7492f9
KH
7399 insert_from_gap (coding->produced_char, coding->produced);
7400
7401 return (coding->result);
ec6d2bb8
KH
7402}
7403
fb88bf2d 7404
24a73b0a
KH
7405/* Name (or base name) of work buffer for code conversion. */
7406static Lisp_Object Vcode_conversion_workbuf_name;
d46c5b12 7407
24a73b0a
KH
7408/* A working buffer used by the top level conversion. Once it is
7409 created, it is never destroyed. It has the name
7410 Vcode_conversion_workbuf_name. The other working buffers are
7411 destroyed after the use is finished, and their names are modified
7412 versions of Vcode_conversion_workbuf_name. */
7413static Lisp_Object Vcode_conversion_reused_workbuf;
b73bfc1c 7414
24a73b0a
KH
7415/* 1 iff Vcode_conversion_reused_workbuf is already in use. */
7416static int reused_workbuf_in_use;
4ed46869 7417
24a73b0a 7418
ad1746f5 7419/* Return a working buffer of code conversion. MULTIBYTE specifies the
24a73b0a 7420 multibyteness of returning buffer. */
b73bfc1c 7421
f6cbaf43 7422static Lisp_Object
971de7fb 7423make_conversion_work_buffer (int multibyte)
df7492f9 7424{
24a73b0a
KH
7425 Lisp_Object name, workbuf;
7426 struct buffer *current;
4ed46869 7427
24a73b0a 7428 if (reused_workbuf_in_use++)
065e3595
KH
7429 {
7430 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7431 workbuf = Fget_buffer_create (name);
7432 }
df7492f9 7433 else
065e3595 7434 {
159bd5a2 7435 if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
a993c7a1
KH
7436 Vcode_conversion_reused_workbuf
7437 = Fget_buffer_create (Vcode_conversion_workbuf_name);
7438 workbuf = Vcode_conversion_reused_workbuf;
065e3595 7439 }
24a73b0a
KH
7440 current = current_buffer;
7441 set_buffer_internal (XBUFFER (workbuf));
df36ff1f
CY
7442 /* We can't allow modification hooks to run in the work buffer. For
7443 instance, directory_files_internal assumes that file decoding
7444 doesn't compile new regexps. */
7445 Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
3ed051d4 7446 Ferase_buffer ();
4b4deea2
TT
7447 BVAR (current_buffer, undo_list) = Qt;
7448 BVAR (current_buffer, enable_multibyte_characters) = multibyte ? Qt : Qnil;
df7492f9 7449 set_buffer_internal (current);
24a73b0a 7450 return workbuf;
df7492f9 7451}
d46c5b12 7452
24a73b0a 7453
4776e638 7454static Lisp_Object
971de7fb 7455code_conversion_restore (Lisp_Object arg)
4776e638 7456{
24a73b0a 7457 Lisp_Object current, workbuf;
948bdcf3 7458 struct gcpro gcpro1;
24a73b0a 7459
948bdcf3 7460 GCPRO1 (arg);
24a73b0a
KH
7461 current = XCAR (arg);
7462 workbuf = XCDR (arg);
7463 if (! NILP (workbuf))
7464 {
7465 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7466 reused_workbuf_in_use = 0;
7467 else if (! NILP (Fbuffer_live_p (workbuf)))
7468 Fkill_buffer (workbuf);
7469 }
7470 set_buffer_internal (XBUFFER (current));
948bdcf3 7471 UNGCPRO;
4776e638
KH
7472 return Qnil;
7473}
b73bfc1c 7474
24a73b0a 7475Lisp_Object
971de7fb 7476code_conversion_save (int with_work_buf, int multibyte)
df7492f9 7477{
24a73b0a 7478 Lisp_Object workbuf = Qnil;
b73bfc1c 7479
4776e638 7480 if (with_work_buf)
24a73b0a
KH
7481 workbuf = make_conversion_work_buffer (multibyte);
7482 record_unwind_protect (code_conversion_restore,
7483 Fcons (Fcurrent_buffer (), workbuf));
4776e638 7484 return workbuf;
df7492f9 7485}
d46c5b12 7486
df7492f9 7487int
cf84bb53
JB
7488decode_coding_gap (struct coding_system *coding,
7489 EMACS_INT chars, EMACS_INT bytes)
df7492f9 7490{
1a4990fb 7491 int count = SPECPDL_INDEX ();
5e5c78be 7492 Lisp_Object attrs;
fb88bf2d 7493
24a73b0a 7494 code_conversion_save (0, 0);
ec6d2bb8 7495
24a73b0a 7496 coding->src_object = Fcurrent_buffer ();
df7492f9
KH
7497 coding->src_chars = chars;
7498 coding->src_bytes = bytes;
7499 coding->src_pos = -chars;
7500 coding->src_pos_byte = -bytes;
7501 coding->src_multibyte = chars < bytes;
24a73b0a 7502 coding->dst_object = coding->src_object;
df7492f9
KH
7503 coding->dst_pos = PT;
7504 coding->dst_pos_byte = PT_BYTE;
4b4deea2 7505 coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
4ed46869 7506
df7492f9
KH
7507 if (CODING_REQUIRE_DETECTION (coding))
7508 detect_coding (coding);
8f924df7 7509
9286b333 7510 coding->mode |= CODING_MODE_LAST_BLOCK;
287c57d7 7511 current_buffer->text->inhibit_shrinking = 1;
df7492f9 7512 decode_coding (coding);
287c57d7 7513 current_buffer->text->inhibit_shrinking = 0;
d46c5b12 7514
5e5c78be
KH
7515 attrs = CODING_ID_ATTRS (coding->id);
7516 if (! NILP (CODING_ATTR_POST_READ (attrs)))
b73bfc1c 7517 {
5e5c78be
KH
7518 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7519 Lisp_Object val;
7520
7521 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
5e5c78be
KH
7522 val = call1 (CODING_ATTR_POST_READ (attrs),
7523 make_number (coding->produced_char));
5e5c78be
KH
7524 CHECK_NATNUM (val);
7525 coding->produced_char += Z - prev_Z;
7526 coding->produced += Z_BYTE - prev_Z_BYTE;
b73bfc1c 7527 }
4ed46869 7528
df7492f9 7529 unbind_to (count, Qnil);
b73bfc1c
KH
7530 return coding->result;
7531}
52d41803 7532
d46c5b12 7533
df7492f9
KH
7534/* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7535 SRC_OBJECT into DST_OBJECT by coding context CODING.
b73bfc1c 7536
df7492f9 7537 SRC_OBJECT is a buffer, a string, or Qnil.
b73bfc1c 7538
df7492f9
KH
7539 If it is a buffer, the text is at point of the buffer. FROM and TO
7540 are positions in the buffer.
b73bfc1c 7541
df7492f9
KH
7542 If it is a string, the text is at the beginning of the string.
7543 FROM and TO are indices to the string.
4ed46869 7544
df7492f9
KH
7545 If it is nil, the text is at coding->source. FROM and TO are
7546 indices to coding->source.
bb10be8b 7547
df7492f9 7548 DST_OBJECT is a buffer, Qt, or Qnil.
4ed46869 7549
df7492f9
KH
7550 If it is a buffer, the decoded text is inserted at point of the
7551 buffer. If the buffer is the same as SRC_OBJECT, the source text
7552 is deleted.
4ed46869 7553
df7492f9
KH
7554 If it is Qt, a string is made from the decoded text, and
7555 set in CODING->dst_object.
d46c5b12 7556
df7492f9 7557 If it is Qnil, the decoded text is stored at CODING->destination.
2cb26057 7558 The caller must allocate CODING->dst_bytes bytes at
df7492f9
KH
7559 CODING->destination by xmalloc. If the decoded text is longer than
7560 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7561 */
d46c5b12 7562
df7492f9 7563void
cf84bb53
JB
7564decode_coding_object (struct coding_system *coding,
7565 Lisp_Object src_object,
7566 EMACS_INT from, EMACS_INT from_byte,
7567 EMACS_INT to, EMACS_INT to_byte,
7568 Lisp_Object dst_object)
d46c5b12 7569{
1a4990fb 7570 int count = SPECPDL_INDEX ();
c4a63b12
PE
7571 unsigned char *destination IF_LINT (= NULL);
7572 EMACS_INT dst_bytes IF_LINT (= 0);
df7492f9
KH
7573 EMACS_INT chars = to - from;
7574 EMACS_INT bytes = to_byte - from_byte;
7575 Lisp_Object attrs;
c4a63b12 7576 int saved_pt = -1, saved_pt_byte IF_LINT (= 0);
64cedb0c 7577 int need_marker_adjustment = 0;
b3bfad50 7578 Lisp_Object old_deactivate_mark;
d46c5b12 7579
b3bfad50 7580 old_deactivate_mark = Vdeactivate_mark;
93dec019 7581
df7492f9 7582 if (NILP (dst_object))
d46c5b12 7583 {
df7492f9
KH
7584 destination = coding->destination;
7585 dst_bytes = coding->dst_bytes;
d46c5b12 7586 }
93dec019 7587
df7492f9
KH
7588 coding->src_object = src_object;
7589 coding->src_chars = chars;
7590 coding->src_bytes = bytes;
7591 coding->src_multibyte = chars < bytes;
70ad9fc4 7592
df7492f9 7593 if (STRINGP (src_object))
d46c5b12 7594 {
df7492f9
KH
7595 coding->src_pos = from;
7596 coding->src_pos_byte = from_byte;
d46c5b12 7597 }
df7492f9 7598 else if (BUFFERP (src_object))
88993dfd 7599 {
df7492f9
KH
7600 set_buffer_internal (XBUFFER (src_object));
7601 if (from != GPT)
7602 move_gap_both (from, from_byte);
7603 if (EQ (src_object, dst_object))
fb88bf2d 7604 {
64cedb0c
KH
7605 struct Lisp_Marker *tail;
7606
7607 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7608 {
7609 tail->need_adjustment
7610 = tail->charpos == (tail->insertion_type ? from : to);
7611 need_marker_adjustment |= tail->need_adjustment;
7612 }
4776e638 7613 saved_pt = PT, saved_pt_byte = PT_BYTE;
df7492f9 7614 TEMP_SET_PT_BOTH (from, from_byte);
f4a3cc44 7615 current_buffer->text->inhibit_shrinking = 1;
df7492f9
KH
7616 del_range_both (from, from_byte, to, to_byte, 1);
7617 coding->src_pos = -chars;
7618 coding->src_pos_byte = -bytes;
fb88bf2d 7619 }
df7492f9 7620 else
fb88bf2d 7621 {
df7492f9
KH
7622 coding->src_pos = from;
7623 coding->src_pos_byte = from_byte;
fb88bf2d 7624 }
88993dfd
KH
7625 }
7626
df7492f9
KH
7627 if (CODING_REQUIRE_DETECTION (coding))
7628 detect_coding (coding);
7629 attrs = CODING_ID_ATTRS (coding->id);
d46c5b12 7630
2cb26057
KH
7631 if (EQ (dst_object, Qt)
7632 || (! NILP (CODING_ATTR_POST_READ (attrs))
7633 && NILP (dst_object)))
b73bfc1c 7634 {
a1567c45
SM
7635 coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7636 coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
df7492f9
KH
7637 coding->dst_pos = BEG;
7638 coding->dst_pos_byte = BEG_BYTE;
b73bfc1c 7639 }
df7492f9 7640 else if (BUFFERP (dst_object))
d46c5b12 7641 {
24a73b0a 7642 code_conversion_save (0, 0);
df7492f9
KH
7643 coding->dst_object = dst_object;
7644 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7645 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7646 coding->dst_multibyte
4b4deea2 7647 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
d46c5b12
KH
7648 }
7649 else
7650 {
24a73b0a 7651 code_conversion_save (0, 0);
df7492f9 7652 coding->dst_object = Qnil;
0154725e
SM
7653 /* Most callers presume this will return a multibyte result, and they
7654 won't use `binary' or `raw-text' anyway, so let's not worry about
7655 CODING_FOR_UNIBYTE. */
bb555731 7656 coding->dst_multibyte = 1;
d46c5b12
KH
7657 }
7658
df7492f9 7659 decode_coding (coding);
fa46990e 7660
df7492f9
KH
7661 if (BUFFERP (coding->dst_object))
7662 set_buffer_internal (XBUFFER (coding->dst_object));
d46c5b12 7663
df7492f9 7664 if (! NILP (CODING_ATTR_POST_READ (attrs)))
d46c5b12 7665 {
b3bfad50 7666 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
df7492f9 7667 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
2b4f9037 7668 Lisp_Object val;
d46c5b12 7669
c0cc7f7f 7670 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
b3bfad50
KH
7671 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7672 old_deactivate_mark);
d4850d67
KH
7673 val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7674 make_number (coding->produced_char));
df7492f9
KH
7675 UNGCPRO;
7676 CHECK_NATNUM (val);
7677 coding->produced_char += Z - prev_Z;
7678 coding->produced += Z_BYTE - prev_Z_BYTE;
d46c5b12 7679 }
de79a6a5 7680
df7492f9 7681 if (EQ (dst_object, Qt))
ec6d2bb8 7682 {
df7492f9
KH
7683 coding->dst_object = Fbuffer_string ();
7684 }
7685 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7686 {
7687 set_buffer_internal (XBUFFER (coding->dst_object));
7688 if (dst_bytes < coding->produced)
7689 {
b3bfad50 7690 destination = xrealloc (destination, coding->produced);
df7492f9
KH
7691 if (! destination)
7692 {
065e3595 7693 record_conversion_result (coding,
ebaf11b6 7694 CODING_RESULT_INSUFFICIENT_MEM);
df7492f9
KH
7695 unbind_to (count, Qnil);
7696 return;
7697 }
7698 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7699 move_gap_both (BEGV, BEGV_BYTE);
72af86bd 7700 memcpy (destination, BEGV_ADDR, coding->produced);
df7492f9 7701 coding->destination = destination;
d46c5b12 7702 }
ec6d2bb8 7703 }
b73bfc1c 7704
4776e638
KH
7705 if (saved_pt >= 0)
7706 {
7707 /* This is the case of:
7708 (BUFFERP (src_object) && EQ (src_object, dst_object))
7709 As we have moved PT while replacing the original buffer
7710 contents, we must recover it now. */
7711 set_buffer_internal (XBUFFER (src_object));
f4a3cc44 7712 current_buffer->text->inhibit_shrinking = 0;
4776e638
KH
7713 if (saved_pt < from)
7714 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7715 else if (saved_pt < from + chars)
7716 TEMP_SET_PT_BOTH (from, from_byte);
4b4deea2 7717 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
4776e638
KH
7718 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7719 saved_pt_byte + (coding->produced - bytes));
7720 else
7721 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7722 saved_pt_byte + (coding->produced - bytes));
64cedb0c
KH
7723
7724 if (need_marker_adjustment)
7725 {
7726 struct Lisp_Marker *tail;
7727
7728 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7729 if (tail->need_adjustment)
7730 {
7731 tail->need_adjustment = 0;
7732 if (tail->insertion_type)
7733 {
7734 tail->bytepos = from_byte;
7735 tail->charpos = from;
7736 }
7737 else
7738 {
7739 tail->bytepos = from_byte + coding->produced;
7740 tail->charpos
4b4deea2 7741 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
64cedb0c
KH
7742 ? tail->bytepos : from + coding->produced_char);
7743 }
7744 }
7745 }
d46c5b12 7746 }
4776e638 7747
b3bfad50 7748 Vdeactivate_mark = old_deactivate_mark;
065e3595 7749 unbind_to (count, coding->dst_object);
d46c5b12
KH
7750}
7751
d46c5b12 7752
df7492f9 7753void
cf84bb53
JB
7754encode_coding_object (struct coding_system *coding,
7755 Lisp_Object src_object,
7756 EMACS_INT from, EMACS_INT from_byte,
7757 EMACS_INT to, EMACS_INT to_byte,
7758 Lisp_Object dst_object)
d46c5b12 7759{
1a4990fb 7760 int count = SPECPDL_INDEX ();
df7492f9
KH
7761 EMACS_INT chars = to - from;
7762 EMACS_INT bytes = to_byte - from_byte;
7763 Lisp_Object attrs;
c4a63b12 7764 int saved_pt = -1, saved_pt_byte IF_LINT (= 0);
64cedb0c 7765 int need_marker_adjustment = 0;
c02d943b 7766 int kill_src_buffer = 0;
b3bfad50 7767 Lisp_Object old_deactivate_mark;
df7492f9 7768
b3bfad50 7769 old_deactivate_mark = Vdeactivate_mark;
df7492f9
KH
7770
7771 coding->src_object = src_object;
7772 coding->src_chars = chars;
7773 coding->src_bytes = bytes;
7774 coding->src_multibyte = chars < bytes;
7775
7776 attrs = CODING_ID_ATTRS (coding->id);
7777
64cedb0c
KH
7778 if (EQ (src_object, dst_object))
7779 {
7780 struct Lisp_Marker *tail;
7781
7782 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7783 {
7784 tail->need_adjustment
7785 = tail->charpos == (tail->insertion_type ? from : to);
7786 need_marker_adjustment |= tail->need_adjustment;
7787 }
7788 }
7789
df7492f9 7790 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6bac5b12 7791 {
24a73b0a 7792 coding->src_object = code_conversion_save (1, coding->src_multibyte);
df7492f9
KH
7793 set_buffer_internal (XBUFFER (coding->src_object));
7794 if (STRINGP (src_object))
7795 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7796 else if (BUFFERP (src_object))
7797 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7798 else
b68864e5 7799 insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
d46c5b12 7800
df7492f9
KH
7801 if (EQ (src_object, dst_object))
7802 {
7803 set_buffer_internal (XBUFFER (src_object));
4776e638 7804 saved_pt = PT, saved_pt_byte = PT_BYTE;
df7492f9
KH
7805 del_range_both (from, from_byte, to, to_byte, 1);
7806 set_buffer_internal (XBUFFER (coding->src_object));
7807 }
7808
d4850d67
KH
7809 {
7810 Lisp_Object args[3];
b3bfad50 7811 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
d4850d67 7812
b3bfad50
KH
7813 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7814 old_deactivate_mark);
d4850d67
KH
7815 args[0] = CODING_ATTR_PRE_WRITE (attrs);
7816 args[1] = make_number (BEG);
7817 args[2] = make_number (Z);
7818 safe_call (3, args);
b3bfad50 7819 UNGCPRO;
d4850d67 7820 }
c02d943b
KH
7821 if (XBUFFER (coding->src_object) != current_buffer)
7822 kill_src_buffer = 1;
ac87bbef 7823 coding->src_object = Fcurrent_buffer ();
df7492f9
KH
7824 if (BEG != GPT)
7825 move_gap_both (BEG, BEG_BYTE);
7826 coding->src_chars = Z - BEG;
7827 coding->src_bytes = Z_BYTE - BEG_BYTE;
7828 coding->src_pos = BEG;
7829 coding->src_pos_byte = BEG_BYTE;
7830 coding->src_multibyte = Z < Z_BYTE;
7831 }
7832 else if (STRINGP (src_object))
d46c5b12 7833 {
24a73b0a 7834 code_conversion_save (0, 0);
df7492f9
KH
7835 coding->src_pos = from;
7836 coding->src_pos_byte = from_byte;
b73bfc1c 7837 }
df7492f9 7838 else if (BUFFERP (src_object))
b73bfc1c 7839 {
24a73b0a 7840 code_conversion_save (0, 0);
df7492f9 7841 set_buffer_internal (XBUFFER (src_object));
df7492f9 7842 if (EQ (src_object, dst_object))
d46c5b12 7843 {
4776e638 7844 saved_pt = PT, saved_pt_byte = PT_BYTE;
ff0dacd7
KH
7845 coding->src_object = del_range_1 (from, to, 1, 1);
7846 coding->src_pos = 0;
7847 coding->src_pos_byte = 0;
d46c5b12 7848 }
df7492f9 7849 else
d46c5b12 7850 {
ff0dacd7
KH
7851 if (from < GPT && to >= GPT)
7852 move_gap_both (from, from_byte);
df7492f9
KH
7853 coding->src_pos = from;
7854 coding->src_pos_byte = from_byte;
d46c5b12 7855 }
d46c5b12 7856 }
4776e638 7857 else
24a73b0a 7858 code_conversion_save (0, 0);
d46c5b12 7859
df7492f9 7860 if (BUFFERP (dst_object))
88993dfd 7861 {
df7492f9 7862 coding->dst_object = dst_object;
28f67a95
KH
7863 if (EQ (src_object, dst_object))
7864 {
7865 coding->dst_pos = from;
7866 coding->dst_pos_byte = from_byte;
7867 }
7868 else
7869 {
319a3947
KH
7870 struct buffer *current = current_buffer;
7871
7872 set_buffer_temp (XBUFFER (dst_object));
7873 coding->dst_pos = PT;
7874 coding->dst_pos_byte = PT_BYTE;
7875 move_gap_both (coding->dst_pos, coding->dst_pos_byte);
7876 set_buffer_temp (current);
28f67a95 7877 }
df7492f9 7878 coding->dst_multibyte
4b4deea2 7879 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
88993dfd 7880 }
df7492f9 7881 else if (EQ (dst_object, Qt))
d46c5b12 7882 {
df7492f9 7883 coding->dst_object = Qnil;
df7492f9 7884 coding->dst_bytes = coding->src_chars;
ac87bbef
KH
7885 if (coding->dst_bytes == 0)
7886 coding->dst_bytes = 1;
7887 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
df7492f9 7888 coding->dst_multibyte = 0;
d46c5b12
KH
7889 }
7890 else
7891 {
df7492f9
KH
7892 coding->dst_object = Qnil;
7893 coding->dst_multibyte = 0;
d46c5b12
KH
7894 }
7895
df7492f9 7896 encode_coding (coding);
d46c5b12 7897
df7492f9 7898 if (EQ (dst_object, Qt))
d46c5b12 7899 {
df7492f9
KH
7900 if (BUFFERP (coding->dst_object))
7901 coding->dst_object = Fbuffer_string ();
7902 else
d46c5b12 7903 {
df7492f9
KH
7904 coding->dst_object
7905 = make_unibyte_string ((char *) coding->destination,
7906 coding->produced);
7907 xfree (coding->destination);
d46c5b12 7908 }
4ed46869 7909 }
d46c5b12 7910
4776e638
KH
7911 if (saved_pt >= 0)
7912 {
7913 /* This is the case of:
7914 (BUFFERP (src_object) && EQ (src_object, dst_object))
7915 As we have moved PT while replacing the original buffer
7916 contents, we must recover it now. */
7917 set_buffer_internal (XBUFFER (src_object));
7918 if (saved_pt < from)
7919 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7920 else if (saved_pt < from + chars)
7921 TEMP_SET_PT_BOTH (from, from_byte);
4b4deea2 7922 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
4776e638
KH
7923 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7924 saved_pt_byte + (coding->produced - bytes));
d46c5b12 7925 else
4776e638
KH
7926 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7927 saved_pt_byte + (coding->produced - bytes));
64cedb0c
KH
7928
7929 if (need_marker_adjustment)
7930 {
7931 struct Lisp_Marker *tail;
7932
7933 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7934 if (tail->need_adjustment)
7935 {
7936 tail->need_adjustment = 0;
7937 if (tail->insertion_type)
7938 {
7939 tail->bytepos = from_byte;
7940 tail->charpos = from;
7941 }
7942 else
7943 {
7944 tail->bytepos = from_byte + coding->produced;
7945 tail->charpos
4b4deea2 7946 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
64cedb0c
KH
7947 ? tail->bytepos : from + coding->produced_char);
7948 }
7949 }
7950 }
4776e638
KH
7951 }
7952
c02d943b
KH
7953 if (kill_src_buffer)
7954 Fkill_buffer (coding->src_object);
b3bfad50
KH
7955
7956 Vdeactivate_mark = old_deactivate_mark;
df7492f9 7957 unbind_to (count, Qnil);
b73bfc1c
KH
7958}
7959
df7492f9 7960
b73bfc1c 7961Lisp_Object
971de7fb 7962preferred_coding_system (void)
b73bfc1c 7963{
df7492f9 7964 int id = coding_categories[coding_priorities[0]].id;
2391eaa4 7965
df7492f9 7966 return CODING_ID_NAME (id);
4ed46869
KH
7967}
7968
7969\f
7970#ifdef emacs
1397dc18 7971/*** 11. Emacs Lisp library functions ***/
4ed46869 7972
16a97296 7973DEFUE ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
48b0f3ae 7974 doc: /* Return t if OBJECT is nil or a coding-system.
df7492f9 7975See the documentation of `define-coding-system' for information
48b0f3ae 7976about coding-system objects. */)
5842a27b 7977 (Lisp_Object object)
4ed46869 7978{
d4a1d553
JB
7979 if (NILP (object)
7980 || CODING_SYSTEM_ID (object) >= 0)
44e8490d 7981 return Qt;
d4a1d553
JB
7982 if (! SYMBOLP (object)
7983 || NILP (Fget (object, Qcoding_system_define_form)))
44e8490d
KH
7984 return Qnil;
7985 return Qt;
4ed46869
KH
7986}
7987
16a97296 7988DEFUE ("read-non-nil-coding-system", Fread_non_nil_coding_system,
9d991de8 7989 Sread_non_nil_coding_system, 1, 1, 0,
48b0f3ae 7990 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
5842a27b 7991 (Lisp_Object prompt)
4ed46869 7992{
e0e989f6 7993 Lisp_Object val;
9d991de8
RS
7994 do
7995 {
4608c386
KH
7996 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7997 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
9d991de8 7998 }
8f924df7 7999 while (SCHARS (val) == 0);
e0e989f6 8000 return (Fintern (val, Qnil));
4ed46869
KH
8001}
8002
16a97296 8003DEFUE ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
48b0f3ae 8004 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
c7183fb8
GM
8005If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8006Ignores case when completing coding systems (all Emacs coding systems
8007are lower-case). */)
5842a27b 8008 (Lisp_Object prompt, Lisp_Object default_coding_system)
4ed46869 8009{
f44d27ce 8010 Lisp_Object val;
c7183fb8
GM
8011 int count = SPECPDL_INDEX ();
8012
9b787f3e 8013 if (SYMBOLP (default_coding_system))
57d25e6f 8014 default_coding_system = SYMBOL_NAME (default_coding_system);
c7183fb8 8015 specbind (Qcompletion_ignore_case, Qt);
4608c386 8016 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
9b787f3e
RS
8017 Qt, Qnil, Qcoding_system_history,
8018 default_coding_system, Qnil);
c7183fb8 8019 unbind_to (count, Qnil);
8f924df7 8020 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
8021}
8022
16a97296 8023DEFUE ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
4ed46869 8024 1, 1, 0,
48b0f3ae 8025 doc: /* Check validity of CODING-SYSTEM.
9ffd559c
KH
8026If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8027It is valid if it is nil or a symbol defined as a coding system by the
8028function `define-coding-system'. */)
5842a27b 8029 (Lisp_Object coding_system)
4ed46869 8030{
44e8490d
KH
8031 Lisp_Object define_form;
8032
8033 define_form = Fget (coding_system, Qcoding_system_define_form);
8034 if (! NILP (define_form))
8035 {
8036 Fput (coding_system, Qcoding_system_define_form, Qnil);
8037 safe_eval (define_form);
8038 }
4ed46869
KH
8039 if (!NILP (Fcoding_system_p (coding_system)))
8040 return coding_system;
fcad4ec4 8041 xsignal1 (Qcoding_system_error, coding_system);
4ed46869 8042}
df7492f9 8043
3a73fa5d 8044\f
89528eb3
KH
8045/* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
8046 HIGHEST is nonzero, return the coding system of the highest
ad1746f5 8047 priority among the detected coding systems. Otherwise return a
89528eb3
KH
8048 list of detected coding systems sorted by their priorities. If
8049 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
8050 multibyte form but contains only ASCII and eight-bit chars.
8051 Otherwise, the bytes are raw bytes.
8052
8053 CODING-SYSTEM controls the detection as below:
8054
8055 If it is nil, detect both text-format and eol-format. If the
8056 text-format part of CODING-SYSTEM is already specified
8057 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
8058 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8059 detect only text-format. */
8060
d46c5b12 8061Lisp_Object
cf84bb53
JB
8062detect_coding_system (const unsigned char *src,
8063 EMACS_INT src_chars, EMACS_INT src_bytes,
8064 int highest, int multibytep,
8065 Lisp_Object coding_system)
4ed46869 8066{
8f924df7 8067 const unsigned char *src_end = src + src_bytes;
df7492f9 8068 Lisp_Object attrs, eol_type;
4533845d 8069 Lisp_Object val = Qnil;
df7492f9 8070 struct coding_system coding;
89528eb3 8071 int id;
ff0dacd7 8072 struct coding_detection_info detect_info;
24a73b0a 8073 enum coding_category base_category;
2f3cbb32 8074 int null_byte_found = 0, eight_bit_found = 0;
b73bfc1c 8075
df7492f9
KH
8076 if (NILP (coding_system))
8077 coding_system = Qundecided;
8078 setup_coding_system (coding_system, &coding);
8079 attrs = CODING_ID_ATTRS (coding.id);
8080 eol_type = CODING_ID_EOL_TYPE (coding.id);
89528eb3 8081 coding_system = CODING_ATTR_BASE_NAME (attrs);
4ed46869 8082
df7492f9 8083 coding.source = src;
24a73b0a 8084 coding.src_chars = src_chars;
df7492f9
KH
8085 coding.src_bytes = src_bytes;
8086 coding.src_multibyte = multibytep;
8087 coding.consumed = 0;
89528eb3 8088 coding.mode |= CODING_MODE_LAST_BLOCK;
c0e16b14 8089 coding.head_ascii = 0;
d46c5b12 8090
ff0dacd7 8091 detect_info.checked = detect_info.found = detect_info.rejected = 0;
d46c5b12 8092
89528eb3 8093 /* At first, detect text-format if necessary. */
24a73b0a
KH
8094 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8095 if (base_category == coding_category_undecided)
4ed46869 8096 {
c4a63b12
PE
8097 enum coding_category category IF_LINT (= 0);
8098 struct coding_system *this IF_LINT (= NULL);
ff0dacd7 8099 int c, i;
88993dfd 8100
24a73b0a 8101 /* Skip all ASCII bytes except for a few ISO2022 controls. */
2f3cbb32 8102 for (; src < src_end; src++)
4ed46869 8103 {
df7492f9 8104 c = *src;
6cb21a4f 8105 if (c & 0x80)
6cb21a4f 8106 {
2f3cbb32 8107 eight_bit_found = 1;
2f3cbb32
KH
8108 if (null_byte_found)
8109 break;
8110 }
c0e16b14 8111 else if (c < 0x20)
2f3cbb32
KH
8112 {
8113 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8114 && ! inhibit_iso_escape_detection
8115 && ! detect_info.checked)
6cb21a4f 8116 {
2f3cbb32
KH
8117 if (detect_coding_iso_2022 (&coding, &detect_info))
8118 {
8119 /* We have scanned the whole data. */
8120 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
c0e16b14
KH
8121 {
8122 /* We didn't find an 8-bit code. We may
8123 have found a null-byte, but it's very
8124 rare that a binary file confirm to
8125 ISO-2022. */
8126 src = src_end;
8127 coding.head_ascii = src - coding.source;
8128 }
8129 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
2f3cbb32
KH
8130 break;
8131 }
8132 }
97b1b294 8133 else if (! c && !inhibit_null_byte_detection)
2f3cbb32
KH
8134 {
8135 null_byte_found = 1;
8136 if (eight_bit_found)
8137 break;
6cb21a4f 8138 }
c006c0c8
KH
8139 if (! eight_bit_found)
8140 coding.head_ascii++;
6cb21a4f 8141 }
c006c0c8 8142 else if (! eight_bit_found)
c0e16b14 8143 coding.head_ascii++;
4ed46869 8144 }
88993dfd 8145
2f3cbb32
KH
8146 if (null_byte_found || eight_bit_found
8147 || coding.head_ascii < coding.src_bytes
6cb21a4f
KH
8148 || detect_info.found)
8149 {
2f3cbb32 8150 if (coding.head_ascii == coding.src_bytes)
6cb21a4f
KH
8151 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
8152 for (i = 0; i < coding_category_raw_text; i++)
ff0dacd7 8153 {
6cb21a4f 8154 category = coding_priorities[i];
c7266f4a 8155 this = coding_categories + category;
6cb21a4f 8156 if (detect_info.found & (1 << category))
ff0dacd7
KH
8157 break;
8158 }
6cb21a4f 8159 else
2f3cbb32
KH
8160 {
8161 if (null_byte_found)
8162 {
8163 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8164 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8165 }
8166 for (i = 0; i < coding_category_raw_text; i++)
8167 {
8168 category = coding_priorities[i];
8169 this = coding_categories + category;
6cb21a4f 8170
2f3cbb32
KH
8171 if (this->id < 0)
8172 {
8173 /* No coding system of this category is defined. */
8174 detect_info.rejected |= (1 << category);
8175 }
8176 else if (category >= coding_category_raw_text)
8177 continue;
8178 else if (detect_info.checked & (1 << category))
8179 {
8180 if (highest
8181 && (detect_info.found & (1 << category)))
6cb21a4f 8182 break;
2f3cbb32
KH
8183 }
8184 else if ((*(this->detector)) (&coding, &detect_info)
8185 && highest
8186 && (detect_info.found & (1 << category)))
8187 {
8188 if (category == coding_category_utf_16_auto)
8189 {
8190 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8191 category = coding_category_utf_16_le;
8192 else
8193 category = coding_category_utf_16_be;
8194 }
8195 break;
8196 }
8197 }
8198 }
6cb21a4f 8199 }
ec6d2bb8 8200
4cddb209
KH
8201 if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8202 || null_byte_found)
ec6d2bb8 8203 {
ff0dacd7 8204 detect_info.found = CATEGORY_MASK_RAW_TEXT;
4cddb209 8205 id = CODING_SYSTEM_ID (Qno_conversion);
89528eb3
KH
8206 val = Fcons (make_number (id), Qnil);
8207 }
ff0dacd7 8208 else if (! detect_info.rejected && ! detect_info.found)
89528eb3 8209 {
ff0dacd7 8210 detect_info.found = CATEGORY_MASK_ANY;
89528eb3
KH
8211 id = coding_categories[coding_category_undecided].id;
8212 val = Fcons (make_number (id), Qnil);
8213 }
8214 else if (highest)
8215 {
ff0dacd7 8216 if (detect_info.found)
ec6d2bb8 8217 {
ff0dacd7
KH
8218 detect_info.found = 1 << category;
8219 val = Fcons (make_number (this->id), Qnil);
8220 }
8221 else
8222 for (i = 0; i < coding_category_raw_text; i++)
8223 if (! (detect_info.rejected & (1 << coding_priorities[i])))
8224 {
8225 detect_info.found = 1 << coding_priorities[i];
8226 id = coding_categories[coding_priorities[i]].id;
8227 val = Fcons (make_number (id), Qnil);
8228 break;
8229 }
8230 }
89528eb3
KH
8231 else
8232 {
ff0dacd7
KH
8233 int mask = detect_info.rejected | detect_info.found;
8234 int found = 0;
ec6d2bb8 8235
89528eb3 8236 for (i = coding_category_raw_text - 1; i >= 0; i--)
ff0dacd7
KH
8237 {
8238 category = coding_priorities[i];
8239 if (! (mask & (1 << category)))
ec6d2bb8 8240 {
ff0dacd7
KH
8241 found |= 1 << category;
8242 id = coding_categories[category].id;
c7266f4a
KH
8243 if (id >= 0)
8244 val = Fcons (make_number (id), val);
ff0dacd7
KH
8245 }
8246 }
8247 for (i = coding_category_raw_text - 1; i >= 0; i--)
8248 {
8249 category = coding_priorities[i];
8250 if (detect_info.found & (1 << category))
8251 {
8252 id = coding_categories[category].id;
8253 val = Fcons (make_number (id), val);
ec6d2bb8 8254 }
ec6d2bb8 8255 }
ff0dacd7 8256 detect_info.found |= found;
ec6d2bb8 8257 }
ec6d2bb8 8258 }
a470d443
KH
8259 else if (base_category == coding_category_utf_8_auto)
8260 {
8261 if (detect_coding_utf_8 (&coding, &detect_info))
8262 {
8263 struct coding_system *this;
8264
8265 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8266 this = coding_categories + coding_category_utf_8_sig;
8267 else
8268 this = coding_categories + coding_category_utf_8_nosig;
8269 val = Fcons (make_number (this->id), Qnil);
8270 }
8271 }
24a73b0a
KH
8272 else if (base_category == coding_category_utf_16_auto)
8273 {
8274 if (detect_coding_utf_16 (&coding, &detect_info))
8275 {
24a73b0a
KH
8276 struct coding_system *this;
8277
8278 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8279 this = coding_categories + coding_category_utf_16_le;
8280 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8281 this = coding_categories + coding_category_utf_16_be;
8282 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8283 this = coding_categories + coding_category_utf_16_be_nosig;
8284 else
8285 this = coding_categories + coding_category_utf_16_le_nosig;
8286 val = Fcons (make_number (this->id), Qnil);
8287 }
8288 }
df7492f9
KH
8289 else
8290 {
ff0dacd7 8291 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
89528eb3 8292 val = Fcons (make_number (coding.id), Qnil);
4ed46869 8293 }
df7492f9 8294
89528eb3 8295 /* Then, detect eol-format if necessary. */
df7492f9 8296 {
4533845d 8297 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
df7492f9
KH
8298 Lisp_Object tail;
8299
89528eb3
KH
8300 if (VECTORP (eol_type))
8301 {
ff0dacd7 8302 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
2f3cbb32
KH
8303 {
8304 if (null_byte_found)
8305 normal_eol = EOL_SEEN_LF;
8306 else
8307 normal_eol = detect_eol (coding.source, src_bytes,
8308 coding_category_raw_text);
8309 }
ff0dacd7
KH
8310 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8311 | CATEGORY_MASK_UTF_16_BE_NOSIG))
89528eb3
KH
8312 utf_16_be_eol = detect_eol (coding.source, src_bytes,
8313 coding_category_utf_16_be);
ff0dacd7
KH
8314 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8315 | CATEGORY_MASK_UTF_16_LE_NOSIG))
89528eb3
KH
8316 utf_16_le_eol = detect_eol (coding.source, src_bytes,
8317 coding_category_utf_16_le);
8318 }
8319 else
8320 {
8321 if (EQ (eol_type, Qunix))
8322 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8323 else if (EQ (eol_type, Qdos))
8324 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8325 else
8326 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8327 }
8328
df7492f9
KH
8329 for (tail = val; CONSP (tail); tail = XCDR (tail))
8330 {
89528eb3 8331 enum coding_category category;
df7492f9 8332 int this_eol;
89528eb3
KH
8333
8334 id = XINT (XCAR (tail));
8335 attrs = CODING_ID_ATTRS (id);
8336 category = XINT (CODING_ATTR_CATEGORY (attrs));
8337 eol_type = CODING_ID_EOL_TYPE (id);
df7492f9
KH
8338 if (VECTORP (eol_type))
8339 {
89528eb3
KH
8340 if (category == coding_category_utf_16_be
8341 || category == coding_category_utf_16_be_nosig)
8342 this_eol = utf_16_be_eol;
8343 else if (category == coding_category_utf_16_le
8344 || category == coding_category_utf_16_le_nosig)
8345 this_eol = utf_16_le_eol;
df7492f9 8346 else
89528eb3
KH
8347 this_eol = normal_eol;
8348
df7492f9
KH
8349 if (this_eol == EOL_SEEN_LF)
8350 XSETCAR (tail, AREF (eol_type, 0));
8351 else if (this_eol == EOL_SEEN_CRLF)
8352 XSETCAR (tail, AREF (eol_type, 1));
8353 else if (this_eol == EOL_SEEN_CR)
8354 XSETCAR (tail, AREF (eol_type, 2));
89528eb3
KH
8355 else
8356 XSETCAR (tail, CODING_ID_NAME (id));
df7492f9 8357 }
89528eb3
KH
8358 else
8359 XSETCAR (tail, CODING_ID_NAME (id));
df7492f9
KH
8360 }
8361 }
ec6d2bb8 8362
4533845d 8363 return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
ec6d2bb8
KH
8364}
8365
ec6d2bb8 8366
d46c5b12
KH
8367DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8368 2, 3, 0,
48b0f3ae
PJ
8369 doc: /* Detect coding system of the text in the region between START and END.
8370Return a list of possible coding systems ordered by priority.
b811c52b
KH
8371The coding systems to try and their priorities follows what
8372the function `coding-system-priority-list' (which see) returns.
ec6d2bb8 8373
12e0131a 8374If only ASCII characters are found (except for such ISO-2022 control
d4a1d553
JB
8375characters as ESC), it returns a list of single element `undecided'
8376or its subsidiary coding system according to a detected end-of-line
8377format.
ec6d2bb8 8378
48b0f3ae
PJ
8379If optional argument HIGHEST is non-nil, return the coding system of
8380highest priority. */)
5842a27b 8381 (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
d46c5b12
KH
8382{
8383 int from, to;
8384 int from_byte, to_byte;
ec6d2bb8 8385
b7826503
PJ
8386 CHECK_NUMBER_COERCE_MARKER (start);
8387 CHECK_NUMBER_COERCE_MARKER (end);
ec6d2bb8 8388
d46c5b12
KH
8389 validate_region (&start, &end);
8390 from = XINT (start), to = XINT (end);
8391 from_byte = CHAR_TO_BYTE (from);
8392 to_byte = CHAR_TO_BYTE (to);
ec6d2bb8 8393
d46c5b12
KH
8394 if (from < GPT && to >= GPT)
8395 move_gap_both (to, to_byte);
c210f766 8396
d46c5b12 8397 return detect_coding_system (BYTE_POS_ADDR (from_byte),
24a73b0a 8398 to - from, to_byte - from_byte,
0a28aafb 8399 !NILP (highest),
4b4deea2 8400 !NILP (BVAR (current_buffer
5d8ea120 8401 , enable_multibyte_characters)),
df7492f9 8402 Qnil);
ec6d2bb8
KH
8403}
8404
d46c5b12
KH
8405DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8406 1, 2, 0,
48b0f3ae
PJ
8407 doc: /* Detect coding system of the text in STRING.
8408Return a list of possible coding systems ordered by priority.
67ceab9d
KH
8409The coding systems to try and their priorities follows what
8410the function `coding-system-priority-list' (which see) returns.
fb88bf2d 8411
12e0131a 8412If only ASCII characters are found (except for such ISO-2022 control
d4a1d553
JB
8413characters as ESC), it returns a list of single element `undecided'
8414or its subsidiary coding system according to a detected end-of-line
8415format.
d46c5b12 8416
48b0f3ae
PJ
8417If optional argument HIGHEST is non-nil, return the coding system of
8418highest priority. */)
5842a27b 8419 (Lisp_Object string, Lisp_Object highest)
d46c5b12 8420{
b7826503 8421 CHECK_STRING (string);
b73bfc1c 8422
24a73b0a
KH
8423 return detect_coding_system (SDATA (string),
8424 SCHARS (string), SBYTES (string),
8f924df7 8425 !NILP (highest), STRING_MULTIBYTE (string),
df7492f9 8426 Qnil);
4ed46869 8427}
4ed46869 8428
b73bfc1c 8429
df7492f9 8430static INLINE int
971de7fb 8431char_encodable_p (int c, Lisp_Object attrs)
05e6f5dc 8432{
df7492f9 8433 Lisp_Object tail;
df7492f9 8434 struct charset *charset;
7d64c6ad 8435 Lisp_Object translation_table;
d46c5b12 8436
7d64c6ad 8437 translation_table = CODING_ATTR_TRANS_TBL (attrs);
a6f87d34 8438 if (! NILP (translation_table))
7d64c6ad 8439 c = translate_char (translation_table, c);
df7492f9
KH
8440 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8441 CONSP (tail); tail = XCDR (tail))
e133c8fa 8442 {
df7492f9
KH
8443 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8444 if (CHAR_CHARSET_P (c, charset))
8445 break;
e133c8fa 8446 }
df7492f9 8447 return (! NILP (tail));
05e6f5dc 8448}
83fa074f 8449
fb88bf2d 8450
df7492f9
KH
8451/* Return a list of coding systems that safely encode the text between
8452 START and END. If EXCLUDE is non-nil, it is a list of coding
8453 systems not to check. The returned list doesn't contain any such
48468dac 8454 coding systems. In any case, if the text contains only ASCII or is
df7492f9 8455 unibyte, return t. */
e077cc80 8456
df7492f9
KH
8457DEFUN ("find-coding-systems-region-internal",
8458 Ffind_coding_systems_region_internal,
8459 Sfind_coding_systems_region_internal, 2, 3, 0,
8460 doc: /* Internal use only. */)
5842a27b 8461 (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
df7492f9
KH
8462{
8463 Lisp_Object coding_attrs_list, safe_codings;
8464 EMACS_INT start_byte, end_byte;
7c78e542 8465 const unsigned char *p, *pbeg, *pend;
df7492f9 8466 int c;
0e727afa 8467 Lisp_Object tail, elt, work_table;
d46c5b12 8468
df7492f9
KH
8469 if (STRINGP (start))
8470 {
8471 if (!STRING_MULTIBYTE (start)
8f924df7 8472 || SCHARS (start) == SBYTES (start))
df7492f9
KH
8473 return Qt;
8474 start_byte = 0;
8f924df7 8475 end_byte = SBYTES (start);
df7492f9
KH
8476 }
8477 else
d46c5b12 8478 {
df7492f9
KH
8479 CHECK_NUMBER_COERCE_MARKER (start);
8480 CHECK_NUMBER_COERCE_MARKER (end);
8481 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8482 args_out_of_range (start, end);
4b4deea2 8483 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
df7492f9
KH
8484 return Qt;
8485 start_byte = CHAR_TO_BYTE (XINT (start));
8486 end_byte = CHAR_TO_BYTE (XINT (end));
8487 if (XINT (end) - XINT (start) == end_byte - start_byte)
8488 return Qt;
d46c5b12 8489
e1c23804 8490 if (XINT (start) < GPT && XINT (end) > GPT)
d46c5b12 8491 {
e1c23804
DL
8492 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8493 move_gap_both (XINT (start), start_byte);
df7492f9 8494 else
e1c23804 8495 move_gap_both (XINT (end), end_byte);
d46c5b12
KH
8496 }
8497 }
8498
df7492f9
KH
8499 coding_attrs_list = Qnil;
8500 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8501 if (NILP (exclude)
8502 || NILP (Fmemq (XCAR (tail), exclude)))
8503 {
8504 Lisp_Object attrs;
d46c5b12 8505
df7492f9
KH
8506 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8507 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8508 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7d64c6ad
KH
8509 {
8510 ASET (attrs, coding_attr_trans_tbl,
2170c8f0 8511 get_translation_table (attrs, 1, NULL));
7d64c6ad
KH
8512 coding_attrs_list = Fcons (attrs, coding_attrs_list);
8513 }
df7492f9 8514 }
d46c5b12 8515
df7492f9 8516 if (STRINGP (start))
8f924df7 8517 p = pbeg = SDATA (start);
df7492f9
KH
8518 else
8519 p = pbeg = BYTE_POS_ADDR (start_byte);
8520 pend = p + (end_byte - start_byte);
b843d1ae 8521
df7492f9
KH
8522 while (p < pend && ASCII_BYTE_P (*p)) p++;
8523 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
d46c5b12 8524
0e727afa 8525 work_table = Fmake_char_table (Qnil, Qnil);
05e6f5dc 8526 while (p < pend)
72d1a715 8527 {
df7492f9
KH
8528 if (ASCII_BYTE_P (*p))
8529 p++;
72d1a715
RS
8530 else
8531 {
df7492f9 8532 c = STRING_CHAR_ADVANCE (p);
0e727afa
YM
8533 if (!NILP (char_table_ref (work_table, c)))
8534 /* This character was already checked. Ignore it. */
8535 continue;
12410ef1 8536
df7492f9
KH
8537 charset_map_loaded = 0;
8538 for (tail = coding_attrs_list; CONSP (tail);)
8539 {
8540 elt = XCAR (tail);
8541 if (NILP (elt))
8542 tail = XCDR (tail);
8543 else if (char_encodable_p (c, elt))
8544 tail = XCDR (tail);
8545 else if (CONSP (XCDR (tail)))
8546 {
8547 XSETCAR (tail, XCAR (XCDR (tail)));
8548 XSETCDR (tail, XCDR (XCDR (tail)));
8549 }
8550 else
8551 {
8552 XSETCAR (tail, Qnil);
8553 tail = XCDR (tail);
8554 }
8555 }
8556 if (charset_map_loaded)
8557 {
8558 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
05e6f5dc 8559
df7492f9 8560 if (STRINGP (start))
8f924df7 8561 pbeg = SDATA (start);
df7492f9
KH
8562 else
8563 pbeg = BYTE_POS_ADDR (start_byte);
8564 p = pbeg + p_offset;
8565 pend = pbeg + pend_offset;
8566 }
0e727afa 8567 char_table_set (work_table, c, Qt);
df7492f9 8568 }
ec6d2bb8 8569 }
fb88bf2d 8570
988b3759 8571 safe_codings = list2 (Qraw_text, Qno_conversion);
df7492f9
KH
8572 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8573 if (! NILP (XCAR (tail)))
8574 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
ec6d2bb8 8575
05e6f5dc
KH
8576 return safe_codings;
8577}
4956c225 8578
d46c5b12 8579
8f924df7
KH
8580DEFUN ("unencodable-char-position", Funencodable_char_position,
8581 Sunencodable_char_position, 3, 5, 0,
8582 doc: /*
8583Return position of first un-encodable character in a region.
d4a1d553 8584START and END specify the region and CODING-SYSTEM specifies the
8f924df7 8585encoding to check. Return nil if CODING-SYSTEM does encode the region.
d46c5b12 8586
8f924df7
KH
8587If optional 4th argument COUNT is non-nil, it specifies at most how
8588many un-encodable characters to search. In this case, the value is a
8589list of positions.
d46c5b12 8590
8f924df7
KH
8591If optional 5th argument STRING is non-nil, it is a string to search
8592for un-encodable characters. In that case, START and END are indexes
8593to the string. */)
5842a27b 8594 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8f924df7
KH
8595{
8596 int n;
8597 struct coding_system coding;
7d64c6ad 8598 Lisp_Object attrs, charset_list, translation_table;
8f924df7
KH
8599 Lisp_Object positions;
8600 int from, to;
8601 const unsigned char *p, *stop, *pend;
8602 int ascii_compatible;
fb88bf2d 8603
8f924df7
KH
8604 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8605 attrs = CODING_ID_ATTRS (coding.id);
8606 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8607 return Qnil;
8608 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8609 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2170c8f0 8610 translation_table = get_translation_table (attrs, 1, NULL);
fb88bf2d 8611
8f924df7
KH
8612 if (NILP (string))
8613 {
8614 validate_region (&start, &end);
8615 from = XINT (start);
8616 to = XINT (end);
4b4deea2 8617 if (NILP (BVAR (current_buffer, enable_multibyte_characters))
8f924df7
KH
8618 || (ascii_compatible
8619 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8620 return Qnil;
8621 p = CHAR_POS_ADDR (from);
8622 pend = CHAR_POS_ADDR (to);
8623 if (from < GPT && to >= GPT)
8624 stop = GPT_ADDR;
8625 else
8626 stop = pend;
8627 }
8628 else
8629 {
8630 CHECK_STRING (string);
8631 CHECK_NATNUM (start);
8632 CHECK_NATNUM (end);
8633 from = XINT (start);
8634 to = XINT (end);
8635 if (from > to
8636 || to > SCHARS (string))
8637 args_out_of_range_3 (string, start, end);
8638 if (! STRING_MULTIBYTE (string))
8639 return Qnil;
8640 p = SDATA (string) + string_char_to_byte (string, from);
8641 stop = pend = SDATA (string) + string_char_to_byte (string, to);
8642 if (ascii_compatible && (to - from) == (pend - p))
8643 return Qnil;
8644 }
f2558efd 8645
8f924df7
KH
8646 if (NILP (count))
8647 n = 1;
8648 else
b73bfc1c 8649 {
8f924df7
KH
8650 CHECK_NATNUM (count);
8651 n = XINT (count);
b73bfc1c
KH
8652 }
8653
8f924df7
KH
8654 positions = Qnil;
8655 while (1)
d46c5b12 8656 {
8f924df7 8657 int c;
ec6d2bb8 8658
8f924df7
KH
8659 if (ascii_compatible)
8660 while (p < stop && ASCII_BYTE_P (*p))
8661 p++, from++;
8662 if (p >= stop)
0e79d667 8663 {
8f924df7
KH
8664 if (p >= pend)
8665 break;
8666 stop = pend;
8667 p = GAP_END_ADDR;
0e79d667 8668 }
ec6d2bb8 8669
8f924df7
KH
8670 c = STRING_CHAR_ADVANCE (p);
8671 if (! (ASCII_CHAR_P (c) && ascii_compatible)
7d64c6ad
KH
8672 && ! char_charset (translate_char (translation_table, c),
8673 charset_list, NULL))
ec6d2bb8 8674 {
8f924df7
KH
8675 positions = Fcons (make_number (from), positions);
8676 n--;
8677 if (n == 0)
8678 break;
ec6d2bb8
KH
8679 }
8680
8f924df7
KH
8681 from++;
8682 }
d46c5b12 8683
8f924df7
KH
8684 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8685}
d46c5b12 8686
d46c5b12 8687
df7492f9
KH
8688DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8689 Scheck_coding_systems_region, 3, 3, 0,
8690 doc: /* Check if the region is encodable by coding systems.
d46c5b12 8691
df7492f9
KH
8692START and END are buffer positions specifying the region.
8693CODING-SYSTEM-LIST is a list of coding systems to check.
d46c5b12 8694
df7492f9 8695The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
d4a1d553 8696CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
df7492f9
KH
8697whole region, POS0, POS1, ... are buffer positions where non-encodable
8698characters are found.
93dec019 8699
df7492f9
KH
8700If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8701value is nil.
93dec019 8702
df7492f9
KH
8703START may be a string. In that case, check if the string is
8704encodable, and the value contains indices to the string instead of
5704f39a
KH
8705buffer positions. END is ignored.
8706
4c1958f4 8707If the current buffer (or START if it is a string) is unibyte, the value
5704f39a 8708is nil. */)
5842a27b 8709 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
05e6f5dc 8710{
df7492f9
KH
8711 Lisp_Object list;
8712 EMACS_INT start_byte, end_byte;
8713 int pos;
7c78e542 8714 const unsigned char *p, *pbeg, *pend;
df7492f9 8715 int c;
7d64c6ad 8716 Lisp_Object tail, elt, attrs;
70ad9fc4 8717
05e6f5dc
KH
8718 if (STRINGP (start))
8719 {
df7492f9 8720 if (!STRING_MULTIBYTE (start)
4c1958f4 8721 || SCHARS (start) == SBYTES (start))
df7492f9
KH
8722 return Qnil;
8723 start_byte = 0;
8f924df7 8724 end_byte = SBYTES (start);
df7492f9 8725 pos = 0;
d46c5b12 8726 }
05e6f5dc 8727 else
b73bfc1c 8728 {
b7826503
PJ
8729 CHECK_NUMBER_COERCE_MARKER (start);
8730 CHECK_NUMBER_COERCE_MARKER (end);
05e6f5dc
KH
8731 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8732 args_out_of_range (start, end);
4b4deea2 8733 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
df7492f9
KH
8734 return Qnil;
8735 start_byte = CHAR_TO_BYTE (XINT (start));
8736 end_byte = CHAR_TO_BYTE (XINT (end));
8737 if (XINT (end) - XINT (start) == end_byte - start_byte)
5704f39a 8738 return Qnil;
df7492f9 8739
e1c23804 8740 if (XINT (start) < GPT && XINT (end) > GPT)
b73bfc1c 8741 {
e1c23804
DL
8742 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8743 move_gap_both (XINT (start), start_byte);
df7492f9 8744 else
e1c23804 8745 move_gap_both (XINT (end), end_byte);
b73bfc1c 8746 }
e1c23804 8747 pos = XINT (start);
b73bfc1c 8748 }
7553d0e1 8749
df7492f9
KH
8750 list = Qnil;
8751 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
12410ef1 8752 {
df7492f9 8753 elt = XCAR (tail);
7d64c6ad 8754 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
2170c8f0
KH
8755 ASET (attrs, coding_attr_trans_tbl,
8756 get_translation_table (attrs, 1, NULL));
7d64c6ad 8757 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
12410ef1
KH
8758 }
8759
df7492f9 8760 if (STRINGP (start))
8f924df7 8761 p = pbeg = SDATA (start);
72d1a715 8762 else
df7492f9
KH
8763 p = pbeg = BYTE_POS_ADDR (start_byte);
8764 pend = p + (end_byte - start_byte);
4ed46869 8765
df7492f9
KH
8766 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8767 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
ec6d2bb8 8768
df7492f9 8769 while (p < pend)
d46c5b12 8770 {
df7492f9
KH
8771 if (ASCII_BYTE_P (*p))
8772 p++;
e133c8fa 8773 else
05e6f5dc 8774 {
df7492f9
KH
8775 c = STRING_CHAR_ADVANCE (p);
8776
8777 charset_map_loaded = 0;
8778 for (tail = list; CONSP (tail); tail = XCDR (tail))
8779 {
8780 elt = XCDR (XCAR (tail));
8781 if (! char_encodable_p (c, XCAR (elt)))
8782 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8783 }
8784 if (charset_map_loaded)
8785 {
8786 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8787
8788 if (STRINGP (start))
8f924df7 8789 pbeg = SDATA (start);
df7492f9
KH
8790 else
8791 pbeg = BYTE_POS_ADDR (start_byte);
8792 p = pbeg + p_offset;
8793 pend = pbeg + pend_offset;
8794 }
05e6f5dc 8795 }
df7492f9 8796 pos++;
d46c5b12 8797 }
4ed46869 8798
df7492f9
KH
8799 tail = list;
8800 list = Qnil;
8801 for (; CONSP (tail); tail = XCDR (tail))
ec6d2bb8 8802 {
df7492f9
KH
8803 elt = XCAR (tail);
8804 if (CONSP (XCDR (XCDR (elt))))
8805 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8806 list);
ec6d2bb8 8807 }
2b4f9037 8808
df7492f9 8809 return list;
d46c5b12
KH
8810}
8811
3fd9494b 8812
74ab6df5 8813static Lisp_Object
cf84bb53
JB
8814code_convert_region (Lisp_Object start, Lisp_Object end,
8815 Lisp_Object coding_system, Lisp_Object dst_object,
8816 int encodep, int norecord)
4ed46869 8817{
3a73fa5d 8818 struct coding_system coding;
df7492f9
KH
8819 EMACS_INT from, from_byte, to, to_byte;
8820 Lisp_Object src_object;
4ed46869 8821
b7826503
PJ
8822 CHECK_NUMBER_COERCE_MARKER (start);
8823 CHECK_NUMBER_COERCE_MARKER (end);
df7492f9
KH
8824 if (NILP (coding_system))
8825 coding_system = Qno_conversion;
8826 else
8827 CHECK_CODING_SYSTEM (coding_system);
8828 src_object = Fcurrent_buffer ();
8829 if (NILP (dst_object))
8830 dst_object = src_object;
8831 else if (! EQ (dst_object, Qt))
8832 CHECK_BUFFER (dst_object);
3a73fa5d 8833
d46c5b12
KH
8834 validate_region (&start, &end);
8835 from = XFASTINT (start);
df7492f9 8836 from_byte = CHAR_TO_BYTE (from);
d46c5b12 8837 to = XFASTINT (end);
df7492f9 8838 to_byte = CHAR_TO_BYTE (to);
764ca8da 8839
df7492f9
KH
8840 setup_coding_system (coding_system, &coding);
8841 coding.mode |= CODING_MODE_LAST_BLOCK;
ec6d2bb8 8842
df7492f9
KH
8843 if (encodep)
8844 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8845 dst_object);
8846 else
8847 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8848 dst_object);
8849 if (! norecord)
8850 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
ec6d2bb8 8851
df7492f9
KH
8852 return (BUFFERP (dst_object)
8853 ? make_number (coding.produced_char)
8854 : coding.dst_object);
4031e2bf 8855}
78108bcd 8856
4ed46869 8857
4031e2bf 8858DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
df7492f9 8859 3, 4, "r\nzCoding system: ",
48b0f3ae 8860 doc: /* Decode the current region from the specified coding system.
df7492f9
KH
8861When called from a program, takes four arguments:
8862 START, END, CODING-SYSTEM, and DESTINATION.
8863START and END are buffer positions.
8844fa83 8864
df7492f9 8865Optional 4th arguments DESTINATION specifies where the decoded text goes.
2354b80b 8866If nil, the region between START and END is replaced by the decoded text.
1560f91a
EZ
8867If buffer, the decoded text is inserted in that buffer after point (point
8868does not move).
446dcd75 8869In those cases, the length of the decoded text is returned.
319a3947 8870If DESTINATION is t, the decoded text is returned.
8844fa83 8871
48b0f3ae
PJ
8872This function sets `last-coding-system-used' to the precise coding system
8873used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
319a3947 8874not fully specified.) */)
5842a27b 8875 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
4031e2bf 8876{
df7492f9 8877 return code_convert_region (start, end, coding_system, destination, 0, 0);
3a73fa5d 8878}
8844fa83 8879
3a73fa5d 8880DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
df7492f9
KH
8881 3, 4, "r\nzCoding system: ",
8882 doc: /* Encode the current region by specified coding system.
d4a1d553
JB
8883When called from a program, takes four arguments:
8884 START, END, CODING-SYSTEM and DESTINATION.
8885START and END are buffer positions.
d46c5b12 8886
df7492f9
KH
8887Optional 4th arguments DESTINATION specifies where the encoded text goes.
8888If nil, the region between START and END is replace by the encoded text.
1560f91a
EZ
8889If buffer, the encoded text is inserted in that buffer after point (point
8890does not move).
446dcd75 8891In those cases, the length of the encoded text is returned.
319a3947 8892If DESTINATION is t, the encoded text is returned.
2391eaa4 8893
48b0f3ae
PJ
8894This function sets `last-coding-system-used' to the precise coding system
8895used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
319a3947 8896not fully specified.) */)
5842a27b 8897 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
3a73fa5d 8898{
df7492f9 8899 return code_convert_region (start, end, coding_system, destination, 1, 0);
b73bfc1c
KH
8900}
8901
8902Lisp_Object
6f704c76
DN
8903code_convert_string (Lisp_Object string, Lisp_Object coding_system,
8904 Lisp_Object dst_object, int encodep, int nocopy, int norecord)
b73bfc1c 8905{
4031e2bf 8906 struct coding_system coding;
df7492f9 8907 EMACS_INT chars, bytes;
ec6d2bb8 8908
b7826503 8909 CHECK_STRING (string);
d46c5b12 8910 if (NILP (coding_system))
4956c225 8911 {
df7492f9
KH
8912 if (! norecord)
8913 Vlast_coding_system_used = Qno_conversion;
8914 if (NILP (dst_object))
8915 return (nocopy ? Fcopy_sequence (string) : string);
4956c225 8916 }
b73bfc1c 8917
df7492f9
KH
8918 if (NILP (coding_system))
8919 coding_system = Qno_conversion;
8920 else
8921 CHECK_CODING_SYSTEM (coding_system);
8922 if (NILP (dst_object))
8923 dst_object = Qt;
8924 else if (! EQ (dst_object, Qt))
8925 CHECK_BUFFER (dst_object);
73be902c 8926
df7492f9 8927 setup_coding_system (coding_system, &coding);
d46c5b12 8928 coding.mode |= CODING_MODE_LAST_BLOCK;
8f924df7
KH
8929 chars = SCHARS (string);
8930 bytes = SBYTES (string);
df7492f9
KH
8931 if (encodep)
8932 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8933 else
8934 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8935 if (! norecord)
8936 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
73be902c 8937
df7492f9
KH
8938 return (BUFFERP (dst_object)
8939 ? make_number (coding.produced_char)
8940 : coding.dst_object);
4ed46869 8941}
73be902c 8942
b73bfc1c 8943
ecec61c1 8944/* Encode or decode STRING according to CODING_SYSTEM.
ec6d2bb8 8945 Do not set Vlast_coding_system_used.
4ed46869 8946
ec6d2bb8
KH
8947 This function is called only from macros DECODE_FILE and
8948 ENCODE_FILE, thus we ignore character composition. */
4ed46869 8949
ecec61c1 8950Lisp_Object
cf84bb53
JB
8951code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
8952 int encodep)
4ed46869 8953{
0be8721c 8954 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
4ed46869
KH
8955}
8956
4ed46869 8957
16a97296 8958DEFUE ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
df7492f9
KH
8959 2, 4, 0,
8960 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
8961
8962Optional third arg NOCOPY non-nil means it is OK to return STRING itself
8963if the decoding operation is trivial.
ecec61c1 8964
d4a1d553 8965Optional fourth arg BUFFER non-nil means that the decoded text is
1560f91a
EZ
8966inserted in that buffer after point (point does not move). In this
8967case, the return value is the length of the decoded text.
ecec61c1 8968
df7492f9
KH
8969This function sets `last-coding-system-used' to the precise coding system
8970used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
d4a1d553 8971not fully specified.) */)
5842a27b 8972 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
4ed46869 8973{
df7492f9
KH
8974 return code_convert_string (string, coding_system, buffer,
8975 0, ! NILP (nocopy), 0);
4ed46869
KH
8976}
8977
df7492f9
KH
8978DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
8979 2, 4, 0,
8980 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
8981
8982Optional third arg NOCOPY non-nil means it is OK to return STRING
8983itself if the encoding operation is trivial.
8984
d4a1d553 8985Optional fourth arg BUFFER non-nil means that the encoded text is
1560f91a
EZ
8986inserted in that buffer after point (point does not move). In this
8987case, the return value is the length of the encoded text.
df7492f9
KH
8988
8989This function sets `last-coding-system-used' to the precise coding system
8990used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8991not fully specified.) */)
5842a27b 8992 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
4ed46869 8993{
df7492f9 8994 return code_convert_string (string, coding_system, buffer,
c197f191 8995 1, ! NILP (nocopy), 1);
4ed46869 8996}
df7492f9 8997
3a73fa5d 8998\f
4ed46869 8999DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
48b0f3ae
PJ
9000 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9001Return the corresponding character. */)
5842a27b 9002 (Lisp_Object code)
4ed46869 9003{
df7492f9
KH
9004 Lisp_Object spec, attrs, val;
9005 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
5fdb398c
PE
9006 EMACS_INT ch;
9007 int c;
4ed46869 9008
df7492f9 9009 CHECK_NATNUM (code);
5fdb398c 9010 ch = XFASTINT (code);
df7492f9
KH
9011 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9012 attrs = AREF (spec, 0);
4ed46869 9013
5fdb398c 9014 if (ASCII_BYTE_P (ch)
df7492f9
KH
9015 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9016 return code;
4ed46869 9017
df7492f9
KH
9018 val = CODING_ATTR_CHARSET_LIST (attrs);
9019 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
004068e4
KH
9020 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9021 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
fa42c37f 9022
5fdb398c
PE
9023 if (ch <= 0x7F)
9024 {
9025 c = ch;
9026 charset = charset_roman;
9027 }
9028 else if (ch >= 0xA0 && ch < 0xDF)
55ab7be3 9029 {
5fdb398c 9030 c = ch - 0x80;
df7492f9 9031 charset = charset_kana;
4ed46869 9032 }
55ab7be3 9033 else
4ed46869 9034 {
5fdb398c
PE
9035 EMACS_INT c1 = ch >> 8;
9036 int c2 = ch & 0xFF;
df7492f9 9037
2735d060
PE
9038 if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9039 || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
5fdb398c
PE
9040 error ("Invalid code: %"pEd, ch);
9041 c = ch;
df7492f9
KH
9042 SJIS_TO_JIS (c);
9043 charset = charset_kanji;
4ed46869 9044 }
df7492f9
KH
9045 c = DECODE_CHAR (charset, c);
9046 if (c < 0)
5fdb398c 9047 error ("Invalid code: %"pEd, ch);
df7492f9 9048 return make_number (c);
93dec019 9049}
4ed46869 9050
48b0f3ae 9051
4ed46869 9052DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
8acf0c0e 9053 doc: /* Encode a Japanese character CH to shift_jis encoding.
48b0f3ae 9054Return the corresponding code in SJIS. */)
5842a27b 9055 (Lisp_Object ch)
4ed46869 9056{
df7492f9
KH
9057 Lisp_Object spec, attrs, charset_list;
9058 int c;
9059 struct charset *charset;
9060 unsigned code;
48b0f3ae 9061
df7492f9
KH
9062 CHECK_CHARACTER (ch);
9063 c = XFASTINT (ch);
9064 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9065 attrs = AREF (spec, 0);
9066
9067 if (ASCII_CHAR_P (c)
9068 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9069 return ch;
9070
9071 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9072 charset = char_charset (c, charset_list, &code);
9073 if (code == CHARSET_INVALID_CODE (charset))
9074 error ("Can't encode by shift_jis encoding: %d", c);
9075 JIS_TO_SJIS (code);
9076
9077 return make_number (code);
4ed46869
KH
9078}
9079
9080DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
48b0f3ae
PJ
9081 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9082Return the corresponding character. */)
5842a27b 9083 (Lisp_Object code)
d46c5b12 9084{
df7492f9
KH
9085 Lisp_Object spec, attrs, val;
9086 struct charset *charset_roman, *charset_big5, *charset;
5fdb398c 9087 EMACS_INT ch;
df7492f9 9088 int c;
6289dd10 9089
df7492f9 9090 CHECK_NATNUM (code);
5fdb398c 9091 ch = XFASTINT (code);
df7492f9
KH
9092 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9093 attrs = AREF (spec, 0);
4ed46869 9094
5fdb398c 9095 if (ASCII_BYTE_P (ch)
df7492f9
KH
9096 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9097 return code;
6289dd10 9098
df7492f9
KH
9099 val = CODING_ATTR_CHARSET_LIST (attrs);
9100 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9101 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
c210f766 9102
5fdb398c
PE
9103 if (ch <= 0x7F)
9104 {
9105 c = ch;
9106 charset = charset_roman;
9107 }
c28a9453
KH
9108 else
9109 {
5fdb398c
PE
9110 EMACS_INT b1 = ch >> 8;
9111 int b2 = ch & 0x7F;
df7492f9
KH
9112 if (b1 < 0xA1 || b1 > 0xFE
9113 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
5fdb398c
PE
9114 error ("Invalid code: %"pEd, ch);
9115 c = ch;
df7492f9 9116 charset = charset_big5;
c28a9453 9117 }
5fdb398c 9118 c = DECODE_CHAR (charset, c);
df7492f9 9119 if (c < 0)
5fdb398c 9120 error ("Invalid code: %"pEd, ch);
df7492f9 9121 return make_number (c);
d46c5b12 9122}
6289dd10 9123
4ed46869 9124DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
8acf0c0e 9125 doc: /* Encode the Big5 character CH to BIG5 coding system.
48b0f3ae 9126Return the corresponding character code in Big5. */)
5842a27b 9127 (Lisp_Object ch)
4ed46869 9128{
df7492f9
KH
9129 Lisp_Object spec, attrs, charset_list;
9130 struct charset *charset;
9131 int c;
9132 unsigned code;
9133
9134 CHECK_CHARACTER (ch);
9135 c = XFASTINT (ch);
9136 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9137 attrs = AREF (spec, 0);
9138 if (ASCII_CHAR_P (c)
9139 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9140 return ch;
9141
9142 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9143 charset = char_charset (c, charset_list, &code);
9144 if (code == CHARSET_INVALID_CODE (charset))
9145 error ("Can't encode by Big5 encoding: %d", c);
9146
9147 return make_number (code);
4ed46869 9148}
48b0f3ae 9149
3a73fa5d 9150\f
002fdb44 9151DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
68bba4e4 9152 Sset_terminal_coding_system_internal, 1, 2, 0,
48b0f3ae 9153 doc: /* Internal use only. */)
5842a27b 9154 (Lisp_Object coding_system, Lisp_Object terminal)
4ed46869 9155{
b18fad6d
KH
9156 struct terminal *term = get_terminal (terminal, 1);
9157 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
b7826503 9158 CHECK_SYMBOL (coding_system);
b8299c66 9159 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
70c22245 9160 /* We had better not send unsafe characters to terminal. */
c73bd236 9161 terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
ad1746f5 9162 /* Character composition should be disabled. */
c73bd236 9163 terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
b8299c66
KL
9164 terminal_coding->src_multibyte = 1;
9165 terminal_coding->dst_multibyte = 0;
b18fad6d
KH
9166 if (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK)
9167 term->charset_list = coding_charset_list (terminal_coding);
9168 else
6b4bb703 9169 term->charset_list = Fcons (make_number (charset_ascii), Qnil);
4ed46869
KH
9170 return Qnil;
9171}
9172
c4825358
KH
9173DEFUN ("set-safe-terminal-coding-system-internal",
9174 Fset_safe_terminal_coding_system_internal,
48b0f3ae 9175 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
ddb67bdc 9176 doc: /* Internal use only. */)
5842a27b 9177 (Lisp_Object coding_system)
d46c5b12 9178{
b7826503 9179 CHECK_SYMBOL (coding_system);
c4825358
KH
9180 setup_coding_system (Fcheck_coding_system (coding_system),
9181 &safe_terminal_coding);
ad1746f5 9182 /* Character composition should be disabled. */
df7492f9 9183 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
b73bfc1c
KH
9184 safe_terminal_coding.src_multibyte = 1;
9185 safe_terminal_coding.dst_multibyte = 0;
c4825358
KH
9186 return Qnil;
9187}
4ed46869 9188
002fdb44 9189DEFUN ("terminal-coding-system", Fterminal_coding_system,
68bba4e4 9190 Sterminal_coding_system, 0, 1, 0,
6ed8eeff 9191 doc: /* Return coding system specified for terminal output on the given terminal.
708e05dc 9192TERMINAL may be a terminal object, a frame, or nil for the selected
6ed8eeff 9193frame's terminal device. */)
5842a27b 9194 (Lisp_Object terminal)
4ed46869 9195{
985773c9
MB
9196 struct coding_system *terminal_coding
9197 = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9198 Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
ae6f73fa 9199
ae6f73fa 9200 /* For backward compatibility, return nil if it is `undecided'. */
f75c90a9 9201 return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
4ed46869
KH
9202}
9203
002fdb44 9204DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
68bba4e4 9205 Sset_keyboard_coding_system_internal, 1, 2, 0,
48b0f3ae 9206 doc: /* Internal use only. */)
5842a27b 9207 (Lisp_Object coding_system, Lisp_Object terminal)
4ed46869 9208{
6ed8eeff 9209 struct terminal *t = get_terminal (terminal, 1);
b7826503 9210 CHECK_SYMBOL (coding_system);
624bda09
KH
9211 if (NILP (coding_system))
9212 coding_system = Qno_conversion;
9213 else
9214 Fcheck_coding_system (coding_system);
9215 setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
ad1746f5 9216 /* Character composition should be disabled. */
c73bd236
MB
9217 TERMINAL_KEYBOARD_CODING (t)->common_flags
9218 &= ~CODING_ANNOTATE_COMPOSITION_MASK;
4ed46869
KH
9219 return Qnil;
9220}
9221
9222DEFUN ("keyboard-coding-system",
985773c9 9223 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
48b0f3ae 9224 doc: /* Return coding system specified for decoding keyboard input. */)
5842a27b 9225 (Lisp_Object terminal)
4ed46869 9226{
985773c9
MB
9227 return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9228 (get_terminal (terminal, 1))->id);
4ed46869
KH
9229}
9230
4ed46869 9231\f
16a97296 9232DEFUE ("find-operation-coding-system", Ffind_operation_coding_system,
a5d301df 9233 Sfind_operation_coding_system, 1, MANY, 0,
48b0f3ae
PJ
9234 doc: /* Choose a coding system for an operation based on the target name.
9235The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9236DECODING-SYSTEM is the coding system to use for decoding
9237\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9238for encoding (in case OPERATION does encoding).
05e6f5dc 9239
48b0f3ae
PJ
9240The first argument OPERATION specifies an I/O primitive:
9241 For file I/O, `insert-file-contents' or `write-region'.
9242 For process I/O, `call-process', `call-process-region', or `start-process'.
9243 For network I/O, `open-network-stream'.
05e6f5dc 9244
48b0f3ae
PJ
9245The remaining arguments should be the same arguments that were passed
9246to the primitive. Depending on which primitive, one of those arguments
9247is selected as the TARGET. For example, if OPERATION does file I/O,
9248whichever argument specifies the file name is TARGET.
05e6f5dc 9249
48b0f3ae 9250TARGET has a meaning which depends on OPERATION:
b883cdb2 9251 For file I/O, TARGET is a file name (except for the special case below).
48b0f3ae 9252 For process I/O, TARGET is a process name.
d4a1d553 9253 For network I/O, TARGET is a service name or a port number.
05e6f5dc 9254
d4a1d553 9255This function looks up what is specified for TARGET in
48b0f3ae
PJ
9256`file-coding-system-alist', `process-coding-system-alist',
9257or `network-coding-system-alist' depending on OPERATION.
9258They may specify a coding system, a cons of coding systems,
9259or a function symbol to call.
9260In the last case, we call the function with one argument,
9261which is a list of all the arguments given to this function.
1011c487
MB
9262If the function can't decide a coding system, it can return
9263`undecided' so that the normal code-detection is performed.
48b0f3ae 9264
b883cdb2
MB
9265If OPERATION is `insert-file-contents', the argument corresponding to
9266TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
9267file name to look up, and BUFFER is a buffer that contains the file's
9268contents (not yet decoded). If `file-coding-system-alist' specifies a
9269function to call for FILENAME, that function should examine the
9270contents of BUFFER instead of reading the file.
9271
d918f936 9272usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
c5101a77 9273 (size_t nargs, Lisp_Object *args)
6b89e3aa 9274{
4ed46869
KH
9275 Lisp_Object operation, target_idx, target, val;
9276 register Lisp_Object chain;
177c0ea7 9277
4ed46869
KH
9278 if (nargs < 2)
9279 error ("Too few arguments");
9280 operation = args[0];
9281 if (!SYMBOLP (operation)
c5101a77 9282 || !NATNUMP (target_idx = Fget (operation, Qtarget_idx)))
3ed051d4 9283 error ("Invalid first argument");
c5101a77 9284 if (nargs < 1 + XFASTINT (target_idx))
4ed46869 9285 error ("Too few arguments for operation: %s",
8f924df7 9286 SDATA (SYMBOL_NAME (operation)));
c5101a77 9287 target = args[XFASTINT (target_idx) + 1];
4ed46869 9288 if (!(STRINGP (target)
091a0ff0
KH
9289 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9290 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
4ed46869 9291 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
5fdb398c 9292 error ("Invalid %"pEd"th argument", XFASTINT (target_idx) + 1);
091a0ff0
KH
9293 if (CONSP (target))
9294 target = XCAR (target);
4ed46869 9295
2e34157c
RS
9296 chain = ((EQ (operation, Qinsert_file_contents)
9297 || EQ (operation, Qwrite_region))
02ba4723 9298 ? Vfile_coding_system_alist
2e34157c 9299 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
9300 ? Vnetwork_coding_system_alist
9301 : Vprocess_coding_system_alist));
4ed46869
KH
9302 if (NILP (chain))
9303 return Qnil;
9304
03699b14 9305 for (; CONSP (chain); chain = XCDR (chain))
6b89e3aa 9306 {
f44d27ce 9307 Lisp_Object elt;
6b89e3aa 9308
df7492f9 9309 elt = XCAR (chain);
4ed46869
KH
9310 if (CONSP (elt)
9311 && ((STRINGP (target)
03699b14
KR
9312 && STRINGP (XCAR (elt))
9313 && fast_string_match (XCAR (elt), target) >= 0)
9314 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
6b89e3aa 9315 {
03699b14 9316 val = XCDR (elt);
b19fd4c5
KH
9317 /* Here, if VAL is both a valid coding system and a valid
9318 function symbol, we return VAL as a coding system. */
02ba4723
KH
9319 if (CONSP (val))
9320 return val;
9321 if (! SYMBOLP (val))
9322 return Qnil;
9323 if (! NILP (Fcoding_system_p (val)))
9324 return Fcons (val, val);
b19fd4c5 9325 if (! NILP (Ffboundp (val)))
6b89e3aa 9326 {
e2b97060
MB
9327 /* We use call1 rather than safe_call1
9328 so as to get bug reports about functions called here
9329 which don't handle the current interface. */
9330 val = call1 (val, Flist (nargs, args));
b19fd4c5
KH
9331 if (CONSP (val))
9332 return val;
9333 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9334 return Fcons (val, val);
6b89e3aa 9335 }
02ba4723 9336 return Qnil;
6b89e3aa
KH
9337 }
9338 }
4ed46869 9339 return Qnil;
6b89e3aa
KH
9340}
9341
df7492f9 9342DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
a3f6ee6d 9343 Sset_coding_system_priority, 0, MANY, 0,
da7db224 9344 doc: /* Assign higher priority to the coding systems given as arguments.
d4a1d553 9345If multiple coding systems belong to the same category,
a3181084
DL
9346all but the first one are ignored.
9347
d4a1d553 9348usage: (set-coding-system-priority &rest coding-systems) */)
c5101a77 9349 (size_t nargs, Lisp_Object *args)
df7492f9 9350{
c5101a77 9351 size_t i, j;
df7492f9
KH
9352 int changed[coding_category_max];
9353 enum coding_category priorities[coding_category_max];
9354
72af86bd 9355 memset (changed, 0, sizeof changed);
6b89e3aa 9356
df7492f9 9357 for (i = j = 0; i < nargs; i++)
6b89e3aa 9358 {
df7492f9
KH
9359 enum coding_category category;
9360 Lisp_Object spec, attrs;
6b89e3aa 9361
df7492f9
KH
9362 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9363 attrs = AREF (spec, 0);
9364 category = XINT (CODING_ATTR_CATEGORY (attrs));
9365 if (changed[category])
9366 /* Ignore this coding system because a coding system of the
9367 same category already had a higher priority. */
9368 continue;
9369 changed[category] = 1;
9370 priorities[j++] = category;
9371 if (coding_categories[category].id >= 0
9372 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9373 setup_coding_system (args[i], &coding_categories[category]);
ff563fce 9374 Fset (AREF (Vcoding_category_table, category), args[i]);
df7492f9 9375 }
6b89e3aa 9376
df7492f9
KH
9377 /* Now we have decided top J priorities. Reflect the order of the
9378 original priorities to the remaining priorities. */
6b89e3aa 9379
df7492f9 9380 for (i = j, j = 0; i < coding_category_max; i++, j++)
6b89e3aa 9381 {
df7492f9
KH
9382 while (j < coding_category_max
9383 && changed[coding_priorities[j]])
9384 j++;
9385 if (j == coding_category_max)
9386 abort ();
9387 priorities[i] = coding_priorities[j];
9388 }
6b89e3aa 9389
72af86bd 9390 memcpy (coding_priorities, priorities, sizeof priorities);
177c0ea7 9391
ff563fce
KH
9392 /* Update `coding-category-list'. */
9393 Vcoding_category_list = Qnil;
c5101a77 9394 for (i = coding_category_max; i-- > 0; )
ff563fce
KH
9395 Vcoding_category_list
9396 = Fcons (AREF (Vcoding_category_table, priorities[i]),
9397 Vcoding_category_list);
6b89e3aa 9398
df7492f9 9399 return Qnil;
6b89e3aa
KH
9400}
9401
df7492f9
KH
9402DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9403 Scoding_system_priority_list, 0, 1, 0,
da7db224 9404 doc: /* Return a list of coding systems ordered by their priorities.
b811c52b
KH
9405The list contains a subset of coding systems; i.e. coding systems
9406assigned to each coding category (see `coding-category-list').
9407
da7db224 9408HIGHESTP non-nil means just return the highest priority one. */)
5842a27b 9409 (Lisp_Object highestp)
d46c5b12
KH
9410{
9411 int i;
df7492f9 9412 Lisp_Object val;
6b89e3aa 9413
df7492f9 9414 for (i = 0, val = Qnil; i < coding_category_max; i++)
d46c5b12 9415 {
df7492f9
KH
9416 enum coding_category category = coding_priorities[i];
9417 int id = coding_categories[category].id;
9418 Lisp_Object attrs;
068a9dbd 9419
df7492f9
KH
9420 if (id < 0)
9421 continue;
9422 attrs = CODING_ID_ATTRS (id);
9423 if (! NILP (highestp))
9424 return CODING_ATTR_BASE_NAME (attrs);
9425 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9426 }
9427 return Fnreverse (val);
9428}
068a9dbd 9429
91433552 9430static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
068a9dbd
KH
9431
9432static Lisp_Object
971de7fb 9433make_subsidiaries (Lisp_Object base)
068a9dbd 9434{
df7492f9 9435 Lisp_Object subsidiaries;
8f924df7 9436 int base_name_len = SBYTES (SYMBOL_NAME (base));
df7492f9
KH
9437 char *buf = (char *) alloca (base_name_len + 6);
9438 int i;
068a9dbd 9439
72af86bd 9440 memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
df7492f9
KH
9441 subsidiaries = Fmake_vector (make_number (3), Qnil);
9442 for (i = 0; i < 3; i++)
068a9dbd 9443 {
72af86bd 9444 memcpy (buf + base_name_len, suffixes[i], strlen (suffixes[i]) + 1);
df7492f9 9445 ASET (subsidiaries, i, intern (buf));
068a9dbd 9446 }
df7492f9 9447 return subsidiaries;
068a9dbd
KH
9448}
9449
9450
df7492f9
KH
9451DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9452 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
1fcd6c8b
DL
9453 doc: /* For internal use only.
9454usage: (define-coding-system-internal ...) */)
c5101a77 9455 (size_t nargs, Lisp_Object *args)
068a9dbd 9456{
df7492f9
KH
9457 Lisp_Object name;
9458 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
9459 Lisp_Object attrs; /* Vector of attributes. */
9460 Lisp_Object eol_type;
9461 Lisp_Object aliases;
9462 Lisp_Object coding_type, charset_list, safe_charsets;
9463 enum coding_category category;
9464 Lisp_Object tail, val;
9465 int max_charset_id = 0;
9466 int i;
068a9dbd 9467
df7492f9
KH
9468 if (nargs < coding_arg_max)
9469 goto short_args;
068a9dbd 9470
df7492f9 9471 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
068a9dbd 9472
df7492f9
KH
9473 name = args[coding_arg_name];
9474 CHECK_SYMBOL (name);
9475 CODING_ATTR_BASE_NAME (attrs) = name;
068a9dbd 9476
df7492f9
KH
9477 val = args[coding_arg_mnemonic];
9478 if (! STRINGP (val))
9479 CHECK_CHARACTER (val);
9480 CODING_ATTR_MNEMONIC (attrs) = val;
068a9dbd 9481
df7492f9
KH
9482 coding_type = args[coding_arg_coding_type];
9483 CHECK_SYMBOL (coding_type);
9484 CODING_ATTR_TYPE (attrs) = coding_type;
068a9dbd 9485
df7492f9
KH
9486 charset_list = args[coding_arg_charset_list];
9487 if (SYMBOLP (charset_list))
9488 {
9489 if (EQ (charset_list, Qiso_2022))
9490 {
9491 if (! EQ (coding_type, Qiso_2022))
9492 error ("Invalid charset-list");
9493 charset_list = Viso_2022_charset_list;
9494 }
9495 else if (EQ (charset_list, Qemacs_mule))
9496 {
9497 if (! EQ (coding_type, Qemacs_mule))
9498 error ("Invalid charset-list");
9499 charset_list = Vemacs_mule_charset_list;
9500 }
9501 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9502 if (max_charset_id < XFASTINT (XCAR (tail)))
9503 max_charset_id = XFASTINT (XCAR (tail));
9504 }
068a9dbd
KH
9505 else
9506 {
df7492f9 9507 charset_list = Fcopy_sequence (charset_list);
985773c9 9508 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
068a9dbd 9509 {
df7492f9
KH
9510 struct charset *charset;
9511
985773c9 9512 val = XCAR (tail);
df7492f9
KH
9513 CHECK_CHARSET_GET_CHARSET (val, charset);
9514 if (EQ (coding_type, Qiso_2022)
9515 ? CHARSET_ISO_FINAL (charset) < 0
9516 : EQ (coding_type, Qemacs_mule)
9517 ? CHARSET_EMACS_MULE_ID (charset) < 0
9518 : 0)
9519 error ("Can't handle charset `%s'",
8f924df7 9520 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
df7492f9 9521
8f924df7 9522 XSETCAR (tail, make_number (charset->id));
df7492f9
KH
9523 if (max_charset_id < charset->id)
9524 max_charset_id = charset->id;
068a9dbd
KH
9525 }
9526 }
df7492f9 9527 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
068a9dbd 9528
1b3b981b
AS
9529 safe_charsets = make_uninit_string (max_charset_id + 1);
9530 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
df7492f9 9531 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8f924df7 9532 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
df7492f9 9533 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
068a9dbd 9534
584948ac 9535 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
3a73fa5d 9536
df7492f9 9537 val = args[coding_arg_decode_translation_table];
a6f87d34 9538 if (! CHAR_TABLE_P (val) && ! CONSP (val))
7d64c6ad 9539 CHECK_SYMBOL (val);
df7492f9 9540 CODING_ATTR_DECODE_TBL (attrs) = val;
3a73fa5d 9541
df7492f9 9542 val = args[coding_arg_encode_translation_table];
a6f87d34 9543 if (! CHAR_TABLE_P (val) && ! CONSP (val))
7d64c6ad 9544 CHECK_SYMBOL (val);
df7492f9 9545 CODING_ATTR_ENCODE_TBL (attrs) = val;
d46c5b12 9546
df7492f9
KH
9547 val = args[coding_arg_post_read_conversion];
9548 CHECK_SYMBOL (val);
9549 CODING_ATTR_POST_READ (attrs) = val;
d46c5b12 9550
df7492f9
KH
9551 val = args[coding_arg_pre_write_conversion];
9552 CHECK_SYMBOL (val);
9553 CODING_ATTR_PRE_WRITE (attrs) = val;
3a73fa5d 9554
df7492f9
KH
9555 val = args[coding_arg_default_char];
9556 if (NILP (val))
9557 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
9558 else
9559 {
8f924df7 9560 CHECK_CHARACTER (val);
df7492f9
KH
9561 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9562 }
4031e2bf 9563
8f924df7
KH
9564 val = args[coding_arg_for_unibyte];
9565 CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
3a73fa5d 9566
df7492f9
KH
9567 val = args[coding_arg_plist];
9568 CHECK_LIST (val);
9569 CODING_ATTR_PLIST (attrs) = val;
3a73fa5d 9570
df7492f9
KH
9571 if (EQ (coding_type, Qcharset))
9572 {
c7c66a95
KH
9573 /* Generate a lisp vector of 256 elements. Each element is nil,
9574 integer, or a list of charset IDs.
3a73fa5d 9575
c7c66a95
KH
9576 If Nth element is nil, the byte code N is invalid in this
9577 coding system.
4ed46869 9578
c7c66a95
KH
9579 If Nth element is a number NUM, N is the first byte of a
9580 charset whose ID is NUM.
4ed46869 9581
c7c66a95
KH
9582 If Nth element is a list of charset IDs, N is the first byte
9583 of one of them. The list is sorted by dimensions of the
ad1746f5 9584 charsets. A charset of smaller dimension comes first. */
df7492f9 9585 val = Fmake_vector (make_number (256), Qnil);
4ed46869 9586
5c99c2e6 9587 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
df7492f9 9588 {
c7c66a95
KH
9589 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9590 int dim = CHARSET_DIMENSION (charset);
9591 int idx = (dim - 1) * 4;
4ed46869 9592
5c99c2e6 9593 if (CHARSET_ASCII_COMPATIBLE_P (charset))
584948ac 9594 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
4031e2bf 9595
15d143f7
KH
9596 for (i = charset->code_space[idx];
9597 i <= charset->code_space[idx + 1]; i++)
9598 {
c7c66a95
KH
9599 Lisp_Object tmp, tmp2;
9600 int dim2;
ec6d2bb8 9601
c7c66a95
KH
9602 tmp = AREF (val, i);
9603 if (NILP (tmp))
9604 tmp = XCAR (tail);
9605 else if (NUMBERP (tmp))
9606 {
9607 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9608 if (dim < dim2)
c7c66a95 9609 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
f9d71dcd
KH
9610 else
9611 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
c7c66a95 9612 }
15d143f7 9613 else
c7c66a95
KH
9614 {
9615 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9616 {
9617 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9618 if (dim < dim2)
9619 break;
9620 }
9621 if (NILP (tmp2))
9622 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9623 else
9624 {
9625 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9626 XSETCAR (tmp2, XCAR (tail));
9627 }
9628 }
9629 ASET (val, i, tmp);
15d143f7 9630 }
df7492f9
KH
9631 }
9632 ASET (attrs, coding_attr_charset_valids, val);
9633 category = coding_category_charset;
9634 }
9635 else if (EQ (coding_type, Qccl))
9636 {
9637 Lisp_Object valids;
ecec61c1 9638
df7492f9
KH
9639 if (nargs < coding_arg_ccl_max)
9640 goto short_args;
ecec61c1 9641
df7492f9
KH
9642 val = args[coding_arg_ccl_decoder];
9643 CHECK_CCL_PROGRAM (val);
9644 if (VECTORP (val))
9645 val = Fcopy_sequence (val);
9646 ASET (attrs, coding_attr_ccl_decoder, val);
ecec61c1 9647
df7492f9
KH
9648 val = args[coding_arg_ccl_encoder];
9649 CHECK_CCL_PROGRAM (val);
9650 if (VECTORP (val))
9651 val = Fcopy_sequence (val);
9652 ASET (attrs, coding_attr_ccl_encoder, val);
ecec61c1 9653
df7492f9
KH
9654 val = args[coding_arg_ccl_valids];
9655 valids = Fmake_string (make_number (256), make_number (0));
9656 for (tail = val; !NILP (tail); tail = Fcdr (tail))
9657 {
8dcbea82 9658 int from, to;
ecec61c1 9659
df7492f9
KH
9660 val = Fcar (tail);
9661 if (INTEGERP (val))
8dcbea82
KH
9662 {
9663 from = to = XINT (val);
9664 if (from < 0 || from > 255)
9665 args_out_of_range_3 (val, make_number (0), make_number (255));
9666 }
df7492f9
KH
9667 else
9668 {
df7492f9 9669 CHECK_CONS (val);
8f924df7
KH
9670 CHECK_NATNUM_CAR (val);
9671 CHECK_NATNUM_CDR (val);
df7492f9 9672 from = XINT (XCAR (val));
8f924df7 9673 if (from > 255)
8dcbea82
KH
9674 args_out_of_range_3 (XCAR (val),
9675 make_number (0), make_number (255));
df7492f9 9676 to = XINT (XCDR (val));
8dcbea82
KH
9677 if (to < from || to > 255)
9678 args_out_of_range_3 (XCDR (val),
9679 XCAR (val), make_number (255));
df7492f9 9680 }
8dcbea82 9681 for (i = from; i <= to; i++)
8f924df7 9682 SSET (valids, i, 1);
df7492f9
KH
9683 }
9684 ASET (attrs, coding_attr_ccl_valids, valids);
4ed46869 9685
df7492f9 9686 category = coding_category_ccl;
55ab7be3 9687 }
df7492f9 9688 else if (EQ (coding_type, Qutf_16))
55ab7be3 9689 {
df7492f9 9690 Lisp_Object bom, endian;
4ed46869 9691
584948ac 9692 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
4ed46869 9693
df7492f9
KH
9694 if (nargs < coding_arg_utf16_max)
9695 goto short_args;
4ed46869 9696
df7492f9
KH
9697 bom = args[coding_arg_utf16_bom];
9698 if (! NILP (bom) && ! EQ (bom, Qt))
9699 {
9700 CHECK_CONS (bom);
8f924df7
KH
9701 val = XCAR (bom);
9702 CHECK_CODING_SYSTEM (val);
9703 val = XCDR (bom);
9704 CHECK_CODING_SYSTEM (val);
df7492f9 9705 }
a470d443 9706 ASET (attrs, coding_attr_utf_bom, bom);
df7492f9
KH
9707
9708 endian = args[coding_arg_utf16_endian];
b49a1807
KH
9709 CHECK_SYMBOL (endian);
9710 if (NILP (endian))
9711 endian = Qbig;
9712 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8f924df7 9713 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
df7492f9
KH
9714 ASET (attrs, coding_attr_utf_16_endian, endian);
9715
9716 category = (CONSP (bom)
9717 ? coding_category_utf_16_auto
9718 : NILP (bom)
b49a1807 9719 ? (EQ (endian, Qbig)
df7492f9
KH
9720 ? coding_category_utf_16_be_nosig
9721 : coding_category_utf_16_le_nosig)
b49a1807 9722 : (EQ (endian, Qbig)
df7492f9
KH
9723 ? coding_category_utf_16_be
9724 : coding_category_utf_16_le));
9725 }
9726 else if (EQ (coding_type, Qiso_2022))
9727 {
9728 Lisp_Object initial, reg_usage, request, flags;
1397dc18 9729
df7492f9
KH
9730 if (nargs < coding_arg_iso2022_max)
9731 goto short_args;
9732
9733 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9734 CHECK_VECTOR (initial);
9735 for (i = 0; i < 4; i++)
9736 {
9737 val = Faref (initial, make_number (i));
9738 if (! NILP (val))
9739 {
584948ac
KH
9740 struct charset *charset;
9741
9742 CHECK_CHARSET_GET_CHARSET (val, charset);
9743 ASET (initial, i, make_number (CHARSET_ID (charset)));
9744 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9745 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9
KH
9746 }
9747 else
9748 ASET (initial, i, make_number (-1));
9749 }
9750
9751 reg_usage = args[coding_arg_iso2022_reg_usage];
9752 CHECK_CONS (reg_usage);
8f924df7
KH
9753 CHECK_NUMBER_CAR (reg_usage);
9754 CHECK_NUMBER_CDR (reg_usage);
df7492f9
KH
9755
9756 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9757 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
1397dc18 9758 {
df7492f9 9759 int id;
2735d060 9760 Lisp_Object tmp1;
df7492f9
KH
9761
9762 val = Fcar (tail);
9763 CHECK_CONS (val);
2735d060
PE
9764 tmp1 = XCAR (val);
9765 CHECK_CHARSET_GET_ID (tmp1, id);
8f924df7 9766 CHECK_NATNUM_CDR (val);
df7492f9 9767 if (XINT (XCDR (val)) >= 4)
5fdb398c 9768 error ("Invalid graphic register number: %"pEd, XINT (XCDR (val)));
8f924df7 9769 XSETCAR (val, make_number (id));
1397dc18 9770 }
4ed46869 9771
df7492f9
KH
9772 flags = args[coding_arg_iso2022_flags];
9773 CHECK_NATNUM (flags);
9774 i = XINT (flags);
9775 if (EQ (args[coding_arg_charset_list], Qiso_2022))
9776 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
9777
9778 ASET (attrs, coding_attr_iso_initial, initial);
9779 ASET (attrs, coding_attr_iso_usage, reg_usage);
9780 ASET (attrs, coding_attr_iso_request, request);
9781 ASET (attrs, coding_attr_iso_flags, flags);
9782 setup_iso_safe_charsets (attrs);
9783
9784 if (i & CODING_ISO_FLAG_SEVEN_BITS)
9785 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9786 | CODING_ISO_FLAG_SINGLE_SHIFT))
9787 ? coding_category_iso_7_else
9788 : EQ (args[coding_arg_charset_list], Qiso_2022)
9789 ? coding_category_iso_7
9790 : coding_category_iso_7_tight);
9791 else
9792 {
9793 int id = XINT (AREF (initial, 1));
9794
c6fb6e98 9795 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
df7492f9
KH
9796 || EQ (args[coding_arg_charset_list], Qiso_2022)
9797 || id < 0)
9798 ? coding_category_iso_8_else
9799 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9800 ? coding_category_iso_8_1
9801 : coding_category_iso_8_2);
9802 }
0ce7886f
KH
9803 if (category != coding_category_iso_8_1
9804 && category != coding_category_iso_8_2)
9805 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
df7492f9
KH
9806 }
9807 else if (EQ (coding_type, Qemacs_mule))
c28a9453 9808 {
df7492f9
KH
9809 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9810 ASET (attrs, coding_attr_emacs_mule_full, Qt);
584948ac 9811 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9 9812 category = coding_category_emacs_mule;
c28a9453 9813 }
df7492f9 9814 else if (EQ (coding_type, Qshift_jis))
c28a9453 9815 {
df7492f9
KH
9816
9817 struct charset *charset;
9818
7d64c6ad 9819 if (XINT (Flength (charset_list)) != 3
6e07c25f 9820 && XINT (Flength (charset_list)) != 4)
7d64c6ad 9821 error ("There should be three or four charsets");
df7492f9
KH
9822
9823 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9824 if (CHARSET_DIMENSION (charset) != 1)
9825 error ("Dimension of charset %s is not one",
8f924df7 9826 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
584948ac
KH
9827 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9828 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9
KH
9829
9830 charset_list = XCDR (charset_list);
9831 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9832 if (CHARSET_DIMENSION (charset) != 1)
9833 error ("Dimension of charset %s is not one",
8f924df7 9834 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
df7492f9
KH
9835
9836 charset_list = XCDR (charset_list);
9837 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9838 if (CHARSET_DIMENSION (charset) != 2)
7d64c6ad
KH
9839 error ("Dimension of charset %s is not two",
9840 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9841
9842 charset_list = XCDR (charset_list);
2b917a06
KH
9843 if (! NILP (charset_list))
9844 {
9845 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9846 if (CHARSET_DIMENSION (charset) != 2)
9847 error ("Dimension of charset %s is not two",
9848 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9849 }
df7492f9
KH
9850
9851 category = coding_category_sjis;
9852 Vsjis_coding_system = name;
c28a9453 9853 }
df7492f9
KH
9854 else if (EQ (coding_type, Qbig5))
9855 {
9856 struct charset *charset;
4ed46869 9857
df7492f9
KH
9858 if (XINT (Flength (charset_list)) != 2)
9859 error ("There should be just two charsets");
9860
9861 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9862 if (CHARSET_DIMENSION (charset) != 1)
9863 error ("Dimension of charset %s is not one",
8f924df7 9864 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
584948ac
KH
9865 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9866 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9
KH
9867
9868 charset_list = XCDR (charset_list);
9869 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9870 if (CHARSET_DIMENSION (charset) != 2)
9871 error ("Dimension of charset %s is not two",
8f924df7 9872 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
4ed46869 9873
df7492f9
KH
9874 category = coding_category_big5;
9875 Vbig5_coding_system = name;
9876 }
9877 else if (EQ (coding_type, Qraw_text))
c28a9453 9878 {
584948ac
KH
9879 category = coding_category_raw_text;
9880 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
c28a9453 9881 }
df7492f9 9882 else if (EQ (coding_type, Qutf_8))
4ed46869 9883 {
a470d443
KH
9884 Lisp_Object bom;
9885
584948ac 9886 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
a470d443
KH
9887
9888 if (nargs < coding_arg_utf8_max)
9889 goto short_args;
9890
9891 bom = args[coding_arg_utf8_bom];
9892 if (! NILP (bom) && ! EQ (bom, Qt))
9893 {
9894 CHECK_CONS (bom);
9895 val = XCAR (bom);
9896 CHECK_CODING_SYSTEM (val);
9897 val = XCDR (bom);
9898 CHECK_CODING_SYSTEM (val);
9899 }
9900 ASET (attrs, coding_attr_utf_bom, bom);
9901
9902 category = (CONSP (bom) ? coding_category_utf_8_auto
9903 : NILP (bom) ? coding_category_utf_8_nosig
9904 : coding_category_utf_8_sig);
4ed46869 9905 }
df7492f9
KH
9906 else if (EQ (coding_type, Qundecided))
9907 category = coding_category_undecided;
4ed46869 9908 else
df7492f9 9909 error ("Invalid coding system type: %s",
8f924df7 9910 SDATA (SYMBOL_NAME (coding_type)));
4ed46869 9911
df7492f9 9912 CODING_ATTR_CATEGORY (attrs) = make_number (category);
01378f49
KH
9913 CODING_ATTR_PLIST (attrs)
9914 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
9915 CODING_ATTR_PLIST (attrs)));
35befdaa 9916 CODING_ATTR_PLIST (attrs)
3ed051d4 9917 = Fcons (QCascii_compatible_p,
35befdaa
KH
9918 Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9919 CODING_ATTR_PLIST (attrs)));
c4825358 9920
df7492f9
KH
9921 eol_type = args[coding_arg_eol_type];
9922 if (! NILP (eol_type)
9923 && ! EQ (eol_type, Qunix)
9924 && ! EQ (eol_type, Qdos)
9925 && ! EQ (eol_type, Qmac))
9926 error ("Invalid eol-type");
4ed46869 9927
df7492f9 9928 aliases = Fcons (name, Qnil);
4ed46869 9929
df7492f9
KH
9930 if (NILP (eol_type))
9931 {
9932 eol_type = make_subsidiaries (name);
9933 for (i = 0; i < 3; i++)
1397dc18 9934 {
df7492f9
KH
9935 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
9936
9937 this_name = AREF (eol_type, i);
9938 this_aliases = Fcons (this_name, Qnil);
9939 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
9940 this_spec = Fmake_vector (make_number (3), attrs);
9941 ASET (this_spec, 1, this_aliases);
9942 ASET (this_spec, 2, this_eol_type);
9943 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
9944 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
583f71ca
KH
9945 val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
9946 if (NILP (val))
9947 Vcoding_system_alist
9948 = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
9949 Vcoding_system_alist);
1397dc18 9950 }
d46c5b12 9951 }
4ed46869 9952
df7492f9
KH
9953 spec_vec = Fmake_vector (make_number (3), attrs);
9954 ASET (spec_vec, 1, aliases);
9955 ASET (spec_vec, 2, eol_type);
48b0f3ae 9956
df7492f9
KH
9957 Fputhash (name, spec_vec, Vcoding_system_hash_table);
9958 Vcoding_system_list = Fcons (name, Vcoding_system_list);
583f71ca
KH
9959 val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
9960 if (NILP (val))
9961 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
9962 Vcoding_system_alist);
48b0f3ae 9963
df7492f9
KH
9964 {
9965 int id = coding_categories[category].id;
48b0f3ae 9966
df7492f9
KH
9967 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
9968 setup_coding_system (name, &coding_categories[category]);
9969 }
48b0f3ae 9970
d46c5b12 9971 return Qnil;
48b0f3ae 9972
df7492f9
KH
9973 short_args:
9974 return Fsignal (Qwrong_number_of_arguments,
9975 Fcons (intern ("define-coding-system-internal"),
9976 make_number (nargs)));
d46c5b12 9977}
4ed46869 9978
d6925f38 9979
a6f87d34
KH
9980DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
9981 3, 3, 0,
9982 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
5842a27b 9983 (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
a6f87d34 9984{
3dbe7859 9985 Lisp_Object spec, attrs;
a6f87d34
KH
9986
9987 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9988 attrs = AREF (spec, 0);
9989 if (EQ (prop, QCmnemonic))
9990 {
9991 if (! STRINGP (val))
9992 CHECK_CHARACTER (val);
9993 CODING_ATTR_MNEMONIC (attrs) = val;
9994 }
2133e2d1 9995 else if (EQ (prop, QCdefault_char))
a6f87d34
KH
9996 {
9997 if (NILP (val))
9998 val = make_number (' ');
9999 else
10000 CHECK_CHARACTER (val);
10001 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
10002 }
10003 else if (EQ (prop, QCdecode_translation_table))
10004 {
10005 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10006 CHECK_SYMBOL (val);
10007 CODING_ATTR_DECODE_TBL (attrs) = val;
10008 }
10009 else if (EQ (prop, QCencode_translation_table))
10010 {
10011 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10012 CHECK_SYMBOL (val);
10013 CODING_ATTR_ENCODE_TBL (attrs) = val;
10014 }
10015 else if (EQ (prop, QCpost_read_conversion))
10016 {
10017 CHECK_SYMBOL (val);
10018 CODING_ATTR_POST_READ (attrs) = val;
10019 }
10020 else if (EQ (prop, QCpre_write_conversion))
10021 {
10022 CHECK_SYMBOL (val);
10023 CODING_ATTR_PRE_WRITE (attrs) = val;
10024 }
35befdaa
KH
10025 else if (EQ (prop, QCascii_compatible_p))
10026 {
10027 CODING_ATTR_ASCII_COMPAT (attrs) = val;
10028 }
a6f87d34
KH
10029
10030 CODING_ATTR_PLIST (attrs)
10031 = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
10032 return val;
10033}
10034
10035
df7492f9
KH
10036DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10037 Sdefine_coding_system_alias, 2, 2, 0,
10038 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
5842a27b 10039 (Lisp_Object alias, Lisp_Object coding_system)
66cfb530 10040{
583f71ca 10041 Lisp_Object spec, aliases, eol_type, val;
4ed46869 10042
df7492f9
KH
10043 CHECK_SYMBOL (alias);
10044 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10045 aliases = AREF (spec, 1);
d4a1d553 10046 /* ALIASES should be a list of length more than zero, and the first
d6925f38
KH
10047 element is a base coding system. Append ALIAS at the tail of the
10048 list. */
df7492f9
KH
10049 while (!NILP (XCDR (aliases)))
10050 aliases = XCDR (aliases);
8f924df7 10051 XSETCDR (aliases, Fcons (alias, Qnil));
4ed46869 10052
df7492f9
KH
10053 eol_type = AREF (spec, 2);
10054 if (VECTORP (eol_type))
4ed46869 10055 {
df7492f9
KH
10056 Lisp_Object subsidiaries;
10057 int i;
4ed46869 10058
df7492f9
KH
10059 subsidiaries = make_subsidiaries (alias);
10060 for (i = 0; i < 3; i++)
10061 Fdefine_coding_system_alias (AREF (subsidiaries, i),
10062 AREF (eol_type, i));
4ed46869 10063 }
df7492f9
KH
10064
10065 Fputhash (alias, spec, Vcoding_system_hash_table);
d6925f38 10066 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
583f71ca
KH
10067 val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10068 if (NILP (val))
10069 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10070 Vcoding_system_alist);
66cfb530 10071
4ed46869
KH
10072 return Qnil;
10073}
10074
16a97296 10075DEFUE ("coding-system-base", Fcoding_system_base, Scoding_system_base,
df7492f9
KH
10076 1, 1, 0,
10077 doc: /* Return the base of CODING-SYSTEM.
da7db224 10078Any alias or subsidiary coding system is not a base coding system. */)
5842a27b 10079 (Lisp_Object coding_system)
d46c5b12 10080{
df7492f9 10081 Lisp_Object spec, attrs;
d46c5b12 10082
df7492f9
KH
10083 if (NILP (coding_system))
10084 return (Qno_conversion);
10085 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10086 attrs = AREF (spec, 0);
10087 return CODING_ATTR_BASE_NAME (attrs);
10088}
1397dc18 10089
df7492f9
KH
10090DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10091 1, 1, 0,
10092 doc: "Return the property list of CODING-SYSTEM.")
5842a27b 10093 (Lisp_Object coding_system)
df7492f9
KH
10094{
10095 Lisp_Object spec, attrs;
1397dc18 10096
df7492f9
KH
10097 if (NILP (coding_system))
10098 coding_system = Qno_conversion;
10099 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10100 attrs = AREF (spec, 0);
10101 return CODING_ATTR_PLIST (attrs);
d46c5b12
KH
10102}
10103
df7492f9
KH
10104
10105DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10106 1, 1, 0,
da7db224 10107 doc: /* Return the list of aliases of CODING-SYSTEM. */)
5842a27b 10108 (Lisp_Object coding_system)
66cfb530 10109{
df7492f9 10110 Lisp_Object spec;
84d60297 10111
df7492f9
KH
10112 if (NILP (coding_system))
10113 coding_system = Qno_conversion;
10114 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
da7db224 10115 return AREF (spec, 1);
df7492f9 10116}
66cfb530 10117
16a97296 10118DEFUE ("coding-system-eol-type", Fcoding_system_eol_type,
df7492f9
KH
10119 Scoding_system_eol_type, 1, 1, 0,
10120 doc: /* Return eol-type of CODING-SYSTEM.
d4a1d553 10121An eol-type is an integer 0, 1, 2, or a vector of coding systems.
66cfb530 10122
df7492f9
KH
10123Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10124and CR respectively.
66cfb530 10125
df7492f9
KH
10126A vector value indicates that a format of end-of-line should be
10127detected automatically. Nth element of the vector is the subsidiary
10128coding system whose eol-type is N. */)
5842a27b 10129 (Lisp_Object coding_system)
6b89e3aa 10130{
df7492f9
KH
10131 Lisp_Object spec, eol_type;
10132 int n;
6b89e3aa 10133
df7492f9
KH
10134 if (NILP (coding_system))
10135 coding_system = Qno_conversion;
10136 if (! CODING_SYSTEM_P (coding_system))
10137 return Qnil;
10138 spec = CODING_SYSTEM_SPEC (coding_system);
10139 eol_type = AREF (spec, 2);
10140 if (VECTORP (eol_type))
10141 return Fcopy_sequence (eol_type);
10142 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10143 return make_number (n);
6b89e3aa
KH
10144}
10145
4ed46869
KH
10146#endif /* emacs */
10147
10148\f
1397dc18 10149/*** 12. Postamble ***/
4ed46869 10150
dfcf069d 10151void
971de7fb 10152init_coding_once (void)
4ed46869
KH
10153{
10154 int i;
10155
df7492f9
KH
10156 for (i = 0; i < coding_category_max; i++)
10157 {
10158 coding_categories[i].id = -1;
10159 coding_priorities[i] = i;
10160 }
4ed46869
KH
10161
10162 /* ISO2022 specific initialize routine. */
10163 for (i = 0; i < 0x20; i++)
b73bfc1c 10164 iso_code_class[i] = ISO_control_0;
4ed46869
KH
10165 for (i = 0x21; i < 0x7F; i++)
10166 iso_code_class[i] = ISO_graphic_plane_0;
10167 for (i = 0x80; i < 0xA0; i++)
b73bfc1c 10168 iso_code_class[i] = ISO_control_1;
4ed46869
KH
10169 for (i = 0xA1; i < 0xFF; i++)
10170 iso_code_class[i] = ISO_graphic_plane_1;
10171 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10172 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
4ed46869
KH
10173 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10174 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10175 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10176 iso_code_class[ISO_CODE_ESC] = ISO_escape;
10177 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10178 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10179 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10180
df7492f9
KH
10181 for (i = 0; i < 256; i++)
10182 {
10183 emacs_mule_bytes[i] = 1;
10184 }
7c78e542
KH
10185 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10186 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10187 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10188 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
e0e989f6
KH
10189}
10190
10191#ifdef emacs
10192
dfcf069d 10193void
971de7fb 10194syms_of_coding (void)
e0e989f6 10195{
df7492f9 10196 staticpro (&Vcoding_system_hash_table);
8f924df7
KH
10197 {
10198 Lisp_Object args[2];
10199 args[0] = QCtest;
10200 args[1] = Qeq;
10201 Vcoding_system_hash_table = Fmake_hash_table (2, args);
10202 }
df7492f9
KH
10203
10204 staticpro (&Vsjis_coding_system);
10205 Vsjis_coding_system = Qnil;
e0e989f6 10206
df7492f9
KH
10207 staticpro (&Vbig5_coding_system);
10208 Vbig5_coding_system = Qnil;
10209
24a73b0a
KH
10210 staticpro (&Vcode_conversion_reused_workbuf);
10211 Vcode_conversion_reused_workbuf = Qnil;
10212
10213 staticpro (&Vcode_conversion_workbuf_name);
d67b4f80 10214 Vcode_conversion_workbuf_name = make_pure_c_string (" *code-conversion-work*");
e0e989f6 10215
24a73b0a 10216 reused_workbuf_in_use = 0;
df7492f9
KH
10217
10218 DEFSYM (Qcharset, "charset");
10219 DEFSYM (Qtarget_idx, "target-idx");
10220 DEFSYM (Qcoding_system_history, "coding-system-history");
bb0115a2
RS
10221 Fset (Qcoding_system_history, Qnil);
10222
9ce27fde 10223 /* Target FILENAME is the first argument. */
e0e989f6 10224 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 10225 /* Target FILENAME is the third argument. */
e0e989f6
KH
10226 Fput (Qwrite_region, Qtarget_idx, make_number (2));
10227
df7492f9 10228 DEFSYM (Qcall_process, "call-process");
9ce27fde 10229 /* Target PROGRAM is the first argument. */
e0e989f6
KH
10230 Fput (Qcall_process, Qtarget_idx, make_number (0));
10231
df7492f9 10232 DEFSYM (Qcall_process_region, "call-process-region");
9ce27fde 10233 /* Target PROGRAM is the third argument. */
e0e989f6
KH
10234 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10235
df7492f9 10236 DEFSYM (Qstart_process, "start-process");
9ce27fde 10237 /* Target PROGRAM is the third argument. */
e0e989f6
KH
10238 Fput (Qstart_process, Qtarget_idx, make_number (2));
10239
df7492f9 10240 DEFSYM (Qopen_network_stream, "open-network-stream");
9ce27fde 10241 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
10242 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10243
df7492f9
KH
10244 DEFSYM (Qcoding_system, "coding-system");
10245 DEFSYM (Qcoding_aliases, "coding-aliases");
4ed46869 10246
df7492f9
KH
10247 DEFSYM (Qeol_type, "eol-type");
10248 DEFSYM (Qunix, "unix");
10249 DEFSYM (Qdos, "dos");
4ed46869 10250
df7492f9
KH
10251 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10252 DEFSYM (Qpost_read_conversion, "post-read-conversion");
10253 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10254 DEFSYM (Qdefault_char, "default-char");
10255 DEFSYM (Qundecided, "undecided");
10256 DEFSYM (Qno_conversion, "no-conversion");
10257 DEFSYM (Qraw_text, "raw-text");
4ed46869 10258
df7492f9 10259 DEFSYM (Qiso_2022, "iso-2022");
4ed46869 10260
df7492f9 10261 DEFSYM (Qutf_8, "utf-8");
8f924df7 10262 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
27901516 10263
df7492f9 10264 DEFSYM (Qutf_16, "utf-16");
df7492f9
KH
10265 DEFSYM (Qbig, "big");
10266 DEFSYM (Qlittle, "little");
27901516 10267
df7492f9
KH
10268 DEFSYM (Qshift_jis, "shift-jis");
10269 DEFSYM (Qbig5, "big5");
4ed46869 10270
df7492f9 10271 DEFSYM (Qcoding_system_p, "coding-system-p");
4ed46869 10272
df7492f9 10273 DEFSYM (Qcoding_system_error, "coding-system-error");
4ed46869 10274 Fput (Qcoding_system_error, Qerror_conditions,
d67b4f80 10275 pure_cons (Qcoding_system_error, pure_cons (Qerror, Qnil)));
4ed46869 10276 Fput (Qcoding_system_error, Qerror_message,
d67b4f80 10277 make_pure_c_string ("Invalid coding system"));
4ed46869 10278
05e6f5dc
KH
10279 /* Intern this now in case it isn't already done.
10280 Setting this variable twice is harmless.
10281 But don't staticpro it here--that is done in alloc.c. */
d67b4f80 10282 Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
70c22245 10283
df7492f9 10284 DEFSYM (Qtranslation_table, "translation-table");
433f7f87 10285 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
df7492f9
KH
10286 DEFSYM (Qtranslation_table_id, "translation-table-id");
10287 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10288 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
1397dc18 10289
df7492f9 10290 DEFSYM (Qvalid_codes, "valid-codes");
9ce27fde 10291
df7492f9 10292 DEFSYM (Qemacs_mule, "emacs-mule");
d46c5b12 10293
01378f49 10294 DEFSYM (QCcategory, ":category");
a6f87d34 10295 DEFSYM (QCmnemonic, ":mnemonic");
2133e2d1 10296 DEFSYM (QCdefault_char, ":default-char");
a6f87d34
KH
10297 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10298 DEFSYM (QCencode_translation_table, ":encode-translation-table");
10299 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10300 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
35befdaa 10301 DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
01378f49 10302
df7492f9
KH
10303 Vcoding_category_table
10304 = Fmake_vector (make_number (coding_category_max), Qnil);
10305 staticpro (&Vcoding_category_table);
10306 /* Followings are target of code detection. */
10307 ASET (Vcoding_category_table, coding_category_iso_7,
d67b4f80 10308 intern_c_string ("coding-category-iso-7"));
df7492f9 10309 ASET (Vcoding_category_table, coding_category_iso_7_tight,
d67b4f80 10310 intern_c_string ("coding-category-iso-7-tight"));
df7492f9 10311 ASET (Vcoding_category_table, coding_category_iso_8_1,
d67b4f80 10312 intern_c_string ("coding-category-iso-8-1"));
df7492f9 10313 ASET (Vcoding_category_table, coding_category_iso_8_2,
d67b4f80 10314 intern_c_string ("coding-category-iso-8-2"));
df7492f9 10315 ASET (Vcoding_category_table, coding_category_iso_7_else,
d67b4f80 10316 intern_c_string ("coding-category-iso-7-else"));
df7492f9 10317 ASET (Vcoding_category_table, coding_category_iso_8_else,
d67b4f80 10318 intern_c_string ("coding-category-iso-8-else"));
a470d443 10319 ASET (Vcoding_category_table, coding_category_utf_8_auto,
d67b4f80 10320 intern_c_string ("coding-category-utf-8-auto"));
a470d443 10321 ASET (Vcoding_category_table, coding_category_utf_8_nosig,
d67b4f80 10322 intern_c_string ("coding-category-utf-8"));
a470d443 10323 ASET (Vcoding_category_table, coding_category_utf_8_sig,
d67b4f80 10324 intern_c_string ("coding-category-utf-8-sig"));
df7492f9 10325 ASET (Vcoding_category_table, coding_category_utf_16_be,
d67b4f80 10326 intern_c_string ("coding-category-utf-16-be"));
ff563fce 10327 ASET (Vcoding_category_table, coding_category_utf_16_auto,
d67b4f80 10328 intern_c_string ("coding-category-utf-16-auto"));
df7492f9 10329 ASET (Vcoding_category_table, coding_category_utf_16_le,
d67b4f80 10330 intern_c_string ("coding-category-utf-16-le"));
df7492f9 10331 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
d67b4f80 10332 intern_c_string ("coding-category-utf-16-be-nosig"));
df7492f9 10333 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
d67b4f80 10334 intern_c_string ("coding-category-utf-16-le-nosig"));
df7492f9 10335 ASET (Vcoding_category_table, coding_category_charset,
d67b4f80 10336 intern_c_string ("coding-category-charset"));
df7492f9 10337 ASET (Vcoding_category_table, coding_category_sjis,
d67b4f80 10338 intern_c_string ("coding-category-sjis"));
df7492f9 10339 ASET (Vcoding_category_table, coding_category_big5,
d67b4f80 10340 intern_c_string ("coding-category-big5"));
df7492f9 10341 ASET (Vcoding_category_table, coding_category_ccl,
d67b4f80 10342 intern_c_string ("coding-category-ccl"));
df7492f9 10343 ASET (Vcoding_category_table, coding_category_emacs_mule,
d67b4f80 10344 intern_c_string ("coding-category-emacs-mule"));
df7492f9
KH
10345 /* Followings are NOT target of code detection. */
10346 ASET (Vcoding_category_table, coding_category_raw_text,
d67b4f80 10347 intern_c_string ("coding-category-raw-text"));
df7492f9 10348 ASET (Vcoding_category_table, coding_category_undecided,
d67b4f80 10349 intern_c_string ("coding-category-undecided"));
ecf488bc 10350
065e3595
KH
10351 DEFSYM (Qinsufficient_source, "insufficient-source");
10352 DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10353 DEFSYM (Qinvalid_source, "invalid-source");
10354 DEFSYM (Qinterrupted, "interrupted");
10355 DEFSYM (Qinsufficient_memory, "insufficient-memory");
44e8490d 10356 DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
065e3595 10357
4ed46869
KH
10358 defsubr (&Scoding_system_p);
10359 defsubr (&Sread_coding_system);
10360 defsubr (&Sread_non_nil_coding_system);
10361 defsubr (&Scheck_coding_system);
10362 defsubr (&Sdetect_coding_region);
d46c5b12 10363 defsubr (&Sdetect_coding_string);
05e6f5dc 10364 defsubr (&Sfind_coding_systems_region_internal);
068a9dbd 10365 defsubr (&Sunencodable_char_position);
df7492f9 10366 defsubr (&Scheck_coding_systems_region);
4ed46869
KH
10367 defsubr (&Sdecode_coding_region);
10368 defsubr (&Sencode_coding_region);
10369 defsubr (&Sdecode_coding_string);
10370 defsubr (&Sencode_coding_string);
10371 defsubr (&Sdecode_sjis_char);
10372 defsubr (&Sencode_sjis_char);
10373 defsubr (&Sdecode_big5_char);
10374 defsubr (&Sencode_big5_char);
1ba9e4ab 10375 defsubr (&Sset_terminal_coding_system_internal);
c4825358 10376 defsubr (&Sset_safe_terminal_coding_system_internal);
4ed46869 10377 defsubr (&Sterminal_coding_system);
1ba9e4ab 10378 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 10379 defsubr (&Skeyboard_coding_system);
a5d301df 10380 defsubr (&Sfind_operation_coding_system);
df7492f9 10381 defsubr (&Sset_coding_system_priority);
6b89e3aa 10382 defsubr (&Sdefine_coding_system_internal);
df7492f9 10383 defsubr (&Sdefine_coding_system_alias);
a6f87d34 10384 defsubr (&Scoding_system_put);
df7492f9
KH
10385 defsubr (&Scoding_system_base);
10386 defsubr (&Scoding_system_plist);
10387 defsubr (&Scoding_system_aliases);
10388 defsubr (&Scoding_system_eol_type);
10389 defsubr (&Scoding_system_priority_list);
4ed46869 10390
29208e82 10391 DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
48b0f3ae
PJ
10392 doc: /* List of coding systems.
10393
10394Do not alter the value of this variable manually. This variable should be
df7492f9 10395updated by the functions `define-coding-system' and
48b0f3ae 10396`define-coding-system-alias'. */);
4608c386
KH
10397 Vcoding_system_list = Qnil;
10398
29208e82 10399 DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
48b0f3ae
PJ
10400 doc: /* Alist of coding system names.
10401Each element is one element list of coding system name.
446dcd75 10402This variable is given to `completing-read' as COLLECTION argument.
48b0f3ae
PJ
10403
10404Do not alter the value of this variable manually. This variable should be
10405updated by the functions `make-coding-system' and
10406`define-coding-system-alias'. */);
4608c386
KH
10407 Vcoding_system_alist = Qnil;
10408
29208e82 10409 DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
48b0f3ae
PJ
10410 doc: /* List of coding-categories (symbols) ordered by priority.
10411
10412On detecting a coding system, Emacs tries code detection algorithms
10413associated with each coding-category one by one in this order. When
10414one algorithm agrees with a byte sequence of source text, the coding
0ec31faf
KH
10415system bound to the corresponding coding-category is selected.
10416
448e17d6 10417Don't modify this variable directly, but use `set-coding-system-priority'. */);
4ed46869
KH
10418 {
10419 int i;
10420
10421 Vcoding_category_list = Qnil;
df7492f9 10422 for (i = coding_category_max - 1; i >= 0; i--)
4ed46869 10423 Vcoding_category_list
d46c5b12
KH
10424 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
10425 Vcoding_category_list);
4ed46869
KH
10426 }
10427
29208e82 10428 DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
48b0f3ae
PJ
10429 doc: /* Specify the coding system for read operations.
10430It is useful to bind this variable with `let', but do not set it globally.
10431If the value is a coding system, it is used for decoding on read operation.
446dcd75
JB
10432If not, an appropriate element is used from one of the coding system alists.
10433There are three such tables: `file-coding-system-alist',
48b0f3ae 10434`process-coding-system-alist', and `network-coding-system-alist'. */);
4ed46869
KH
10435 Vcoding_system_for_read = Qnil;
10436
29208e82 10437 DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
48b0f3ae
PJ
10438 doc: /* Specify the coding system for write operations.
10439Programs bind this variable with `let', but you should not set it globally.
10440If the value is a coding system, it is used for encoding of output,
10441when writing it to a file and when sending it to a file or subprocess.
10442
10443If this does not specify a coding system, an appropriate element
446dcd75
JB
10444is used from one of the coding system alists.
10445There are three such tables: `file-coding-system-alist',
48b0f3ae
PJ
10446`process-coding-system-alist', and `network-coding-system-alist'.
10447For output to files, if the above procedure does not specify a coding system,
10448the value of `buffer-file-coding-system' is used. */);
4ed46869
KH
10449 Vcoding_system_for_write = Qnil;
10450
29208e82 10451 DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
df7492f9
KH
10452 doc: /*
10453Coding system used in the latest file or process I/O. */);
4ed46869
KH
10454 Vlast_coding_system_used = Qnil;
10455
29208e82 10456 DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
065e3595
KH
10457 doc: /*
10458Error status of the last code conversion.
10459
10460When an error was detected in the last code conversion, this variable
10461is set to one of the following symbols.
10462 `insufficient-source'
10463 `inconsistent-eol'
10464 `invalid-source'
10465 `interrupted'
10466 `insufficient-memory'
10467When no error was detected, the value doesn't change. So, to check
10468the error status of a code conversion by this variable, you must
10469explicitly set this variable to nil before performing code
10470conversion. */);
10471 Vlast_code_conversion_error = Qnil;
10472
29208e82 10473 DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
df7492f9
KH
10474 doc: /*
10475*Non-nil means always inhibit code conversion of end-of-line format.
48b0f3ae
PJ
10476See info node `Coding Systems' and info node `Text and Binary' concerning
10477such conversion. */);
9ce27fde
KH
10478 inhibit_eol_conversion = 0;
10479
29208e82 10480 DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
df7492f9
KH
10481 doc: /*
10482Non-nil means process buffer inherits coding system of process output.
48b0f3ae
PJ
10483Bind it to t if the process output is to be treated as if it were a file
10484read from some filesystem. */);
ed29121d
EZ
10485 inherit_process_coding_system = 0;
10486
29208e82 10487 DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
df7492f9
KH
10488 doc: /*
10489Alist to decide a coding system to use for a file I/O operation.
48b0f3ae
PJ
10490The format is ((PATTERN . VAL) ...),
10491where PATTERN is a regular expression matching a file name,
10492VAL is a coding system, a cons of coding systems, or a function symbol.
10493If VAL is a coding system, it is used for both decoding and encoding
10494the file contents.
10495If VAL is a cons of coding systems, the car part is used for decoding,
10496and the cdr part is used for encoding.
10497If VAL is a function symbol, the function must return a coding system
2c53e699
KH
10498or a cons of coding systems which are used as above. The function is
10499called with an argument that is a list of the arguments with which
5a0bbd9a
KH
10500`find-operation-coding-system' was called. If the function can't decide
10501a coding system, it can return `undecided' so that the normal
10502code-detection is performed.
48b0f3ae
PJ
10503
10504See also the function `find-operation-coding-system'
10505and the variable `auto-coding-alist'. */);
02ba4723
KH
10506 Vfile_coding_system_alist = Qnil;
10507
29208e82 10508 DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
df7492f9
KH
10509 doc: /*
10510Alist to decide a coding system to use for a process I/O operation.
48b0f3ae
PJ
10511The format is ((PATTERN . VAL) ...),
10512where PATTERN is a regular expression matching a program name,
10513VAL is a coding system, a cons of coding systems, or a function symbol.
10514If VAL is a coding system, it is used for both decoding what received
10515from the program and encoding what sent to the program.
10516If VAL is a cons of coding systems, the car part is used for decoding,
10517and the cdr part is used for encoding.
10518If VAL is a function symbol, the function must return a coding system
10519or a cons of coding systems which are used as above.
10520
10521See also the function `find-operation-coding-system'. */);
02ba4723
KH
10522 Vprocess_coding_system_alist = Qnil;
10523
29208e82 10524 DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
df7492f9
KH
10525 doc: /*
10526Alist to decide a coding system to use for a network I/O operation.
48b0f3ae
PJ
10527The format is ((PATTERN . VAL) ...),
10528where PATTERN is a regular expression matching a network service name
10529or is a port number to connect to,
10530VAL is a coding system, a cons of coding systems, or a function symbol.
10531If VAL is a coding system, it is used for both decoding what received
10532from the network stream and encoding what sent to the network stream.
10533If VAL is a cons of coding systems, the car part is used for decoding,
10534and the cdr part is used for encoding.
10535If VAL is a function symbol, the function must return a coding system
10536or a cons of coding systems which are used as above.
10537
10538See also the function `find-operation-coding-system'. */);
02ba4723 10539 Vnetwork_coding_system_alist = Qnil;
4ed46869 10540
29208e82 10541 DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
75205970
RS
10542 doc: /* Coding system to use with system messages.
10543Also used for decoding keyboard input on X Window system. */);
68c45bf0
PE
10544 Vlocale_coding_system = Qnil;
10545
005f0d35 10546 /* The eol mnemonics are reset in startup.el system-dependently. */
29208e82 10547 DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
df7492f9
KH
10548 doc: /*
10549*String displayed in mode line for UNIX-like (LF) end-of-line format. */);
d67b4f80 10550 eol_mnemonic_unix = make_pure_c_string (":");
4ed46869 10551
29208e82 10552 DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
df7492f9
KH
10553 doc: /*
10554*String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
d67b4f80 10555 eol_mnemonic_dos = make_pure_c_string ("\\");
4ed46869 10556
29208e82 10557 DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
df7492f9
KH
10558 doc: /*
10559*String displayed in mode line for MAC-like (CR) end-of-line format. */);
d67b4f80 10560 eol_mnemonic_mac = make_pure_c_string ("/");
4ed46869 10561
29208e82 10562 DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
df7492f9
KH
10563 doc: /*
10564*String displayed in mode line when end-of-line format is not yet determined. */);
d67b4f80 10565 eol_mnemonic_undecided = make_pure_c_string (":");
4ed46869 10566
29208e82 10567 DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
df7492f9
KH
10568 doc: /*
10569*Non-nil enables character translation while encoding and decoding. */);
84fbb8a0 10570 Venable_character_translation = Qt;
bdd9fb48 10571
f967223b 10572 DEFVAR_LISP ("standard-translation-table-for-decode",
29208e82 10573 Vstandard_translation_table_for_decode,
48b0f3ae 10574 doc: /* Table for translating characters while decoding. */);
f967223b 10575 Vstandard_translation_table_for_decode = Qnil;
bdd9fb48 10576
f967223b 10577 DEFVAR_LISP ("standard-translation-table-for-encode",
29208e82 10578 Vstandard_translation_table_for_encode,
48b0f3ae 10579 doc: /* Table for translating characters while encoding. */);
f967223b 10580 Vstandard_translation_table_for_encode = Qnil;
4ed46869 10581
29208e82 10582 DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
48b0f3ae
PJ
10583 doc: /* Alist of charsets vs revision numbers.
10584While encoding, if a charset (car part of an element) is found,
df7492f9
KH
10585designate it with the escape sequence identifying revision (cdr part
10586of the element). */);
10587 Vcharset_revision_table = Qnil;
02ba4723
KH
10588
10589 DEFVAR_LISP ("default-process-coding-system",
29208e82 10590 Vdefault_process_coding_system,
48b0f3ae
PJ
10591 doc: /* Cons of coding systems used for process I/O by default.
10592The car part is used for decoding a process output,
10593the cdr part is used for encoding a text to be sent to a process. */);
02ba4723 10594 Vdefault_process_coding_system = Qnil;
c4825358 10595
29208e82 10596 DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
df7492f9
KH
10597 doc: /*
10598Table of extra Latin codes in the range 128..159 (inclusive).
48b0f3ae
PJ
10599This is a vector of length 256.
10600If Nth element is non-nil, the existence of code N in a file
10601\(or output of subprocess) doesn't prevent it to be detected as
10602a coding system of ISO 2022 variant which has a flag
10603`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10604or reading output of a subprocess.
446dcd75 10605Only 128th through 159th elements have a meaning. */);
3f003981 10606 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
d46c5b12
KH
10607
10608 DEFVAR_LISP ("select-safe-coding-system-function",
29208e82 10609 Vselect_safe_coding_system_function,
df7492f9
KH
10610 doc: /*
10611Function to call to select safe coding system for encoding a text.
48b0f3ae
PJ
10612
10613If set, this function is called to force a user to select a proper
10614coding system which can encode the text in the case that a default
fdecf907
GM
10615coding system used in each operation can't encode the text. The
10616function should take care that the buffer is not modified while
10617the coding system is being selected.
48b0f3ae
PJ
10618
10619The default value is `select-safe-coding-system' (which see). */);
d46c5b12
KH
10620 Vselect_safe_coding_system_function = Qnil;
10621
5d5bf4d8 10622 DEFVAR_BOOL ("coding-system-require-warning",
29208e82 10623 coding_system_require_warning,
5d5bf4d8 10624 doc: /* Internal use only.
6b89e3aa
KH
10625If non-nil, on writing a file, `select-safe-coding-system-function' is
10626called even if `coding-system-for-write' is non-nil. The command
10627`universal-coding-system-argument' binds this variable to t temporarily. */);
5d5bf4d8
KH
10628 coding_system_require_warning = 0;
10629
10630
22ab2303 10631 DEFVAR_BOOL ("inhibit-iso-escape-detection",
29208e82 10632 inhibit_iso_escape_detection,
df7492f9 10633 doc: /*
97b1b294 10634If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
48b0f3ae 10635
97b1b294
EZ
10636When Emacs reads text, it tries to detect how the text is encoded.
10637This code detection is sensitive to escape sequences. If Emacs sees
10638a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10639of the ISO2022 encodings, and decodes text by the corresponding coding
10640system (e.g. `iso-2022-7bit').
48b0f3ae
PJ
10641
10642However, there may be a case that you want to read escape sequences in
10643a file as is. In such a case, you can set this variable to non-nil.
97b1b294
EZ
10644Then the code detection will ignore any escape sequences, and no text is
10645detected as encoded in some ISO-2022 encoding. The result is that all
48b0f3ae
PJ
10646escape sequences become visible in a buffer.
10647
10648The default value is nil, and it is strongly recommended not to change
10649it. That is because many Emacs Lisp source files that contain
10650non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10651in Emacs's distribution, and they won't be decoded correctly on
10652reading if you suppress escape sequence detection.
10653
10654The other way to read escape sequences in a file without decoding is
97b1b294 10655to explicitly specify some coding system that doesn't use ISO-2022
48b0f3ae 10656escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
74383408 10657 inhibit_iso_escape_detection = 0;
002fdb44 10658
97b1b294 10659 DEFVAR_BOOL ("inhibit-null-byte-detection",
29208e82 10660 inhibit_null_byte_detection,
97b1b294
EZ
10661 doc: /* If non-nil, Emacs ignores null bytes on code detection.
10662By default, Emacs treats it as binary data, and does not attempt to
10663decode it. The effect is as if you specified `no-conversion' for
10664reading that text.
10665
10666Set this to non-nil when a regular text happens to include null bytes.
10667Examples are Index nodes of Info files and null-byte delimited output
10668from GNU Find and GNU Grep. Emacs will then ignore the null bytes and
10669decode text as usual. */);
10670 inhibit_null_byte_detection = 0;
10671
29208e82 10672 DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
15c8f9d1 10673 doc: /* Char table for translating self-inserting characters.
446dcd75 10674This is applied to the result of input methods, not their input.
8434d0b8
EZ
10675See also `keyboard-translate-table'.
10676
10677Use of this variable for character code unification was rendered
10678obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10679internal character representation. */);
002fdb44 10680 Vtranslation_table_for_input = Qnil;
8f924df7 10681
2c78b7e1
KH
10682 {
10683 Lisp_Object args[coding_arg_max];
8f924df7 10684 Lisp_Object plist[16];
2c78b7e1
KH
10685 int i;
10686
10687 for (i = 0; i < coding_arg_max; i++)
10688 args[i] = Qnil;
10689
d67b4f80 10690 plist[0] = intern_c_string (":name");
2c78b7e1 10691 plist[1] = args[coding_arg_name] = Qno_conversion;
d67b4f80 10692 plist[2] = intern_c_string (":mnemonic");
2c78b7e1 10693 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
d67b4f80 10694 plist[4] = intern_c_string (":coding-type");
2c78b7e1 10695 plist[5] = args[coding_arg_coding_type] = Qraw_text;
d67b4f80 10696 plist[6] = intern_c_string (":ascii-compatible-p");
2c78b7e1 10697 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
d67b4f80 10698 plist[8] = intern_c_string (":default-char");
2c78b7e1 10699 plist[9] = args[coding_arg_default_char] = make_number (0);
d67b4f80 10700 plist[10] = intern_c_string (":for-unibyte");
8f924df7 10701 plist[11] = args[coding_arg_for_unibyte] = Qt;
d67b4f80
DN
10702 plist[12] = intern_c_string (":docstring");
10703 plist[13] = make_pure_c_string ("Do no conversion.\n\
2c78b7e1
KH
10704\n\
10705When you visit a file with this coding, the file is read into a\n\
10706unibyte buffer as is, thus each byte of a file is treated as a\n\
10707character.");
d67b4f80 10708 plist[14] = intern_c_string (":eol-type");
8f924df7
KH
10709 plist[15] = args[coding_arg_eol_type] = Qunix;
10710 args[coding_arg_plist] = Flist (16, plist);
2c78b7e1 10711 Fdefine_coding_system_internal (coding_arg_max, args);
ae6f73fa
KH
10712
10713 plist[1] = args[coding_arg_name] = Qundecided;
10714 plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10715 plist[5] = args[coding_arg_coding_type] = Qundecided;
10716 /* This is already set.
35befdaa 10717 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
d67b4f80 10718 plist[8] = intern_c_string (":charset-list");
ae6f73fa
KH
10719 plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10720 plist[11] = args[coding_arg_for_unibyte] = Qnil;
d67b4f80 10721 plist[13] = make_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
ae6f73fa
KH
10722 plist[15] = args[coding_arg_eol_type] = Qnil;
10723 args[coding_arg_plist] = Flist (16, plist);
10724 Fdefine_coding_system_internal (coding_arg_max, args);
2c78b7e1
KH
10725 }
10726
2c78b7e1 10727 setup_coding_system (Qno_conversion, &safe_terminal_coding);
ff563fce
KH
10728
10729 {
10730 int i;
10731
10732 for (i = 0; i < coding_category_max; i++)
10733 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10734 }
1a4990fb 10735#if defined (DOS_NT)
fcbcfb64
KH
10736 system_eol_type = Qdos;
10737#else
10738 system_eol_type = Qunix;
10739#endif
10740 staticpro (&system_eol_type);
4ed46869
KH
10741}
10742
68c45bf0 10743char *
971de7fb 10744emacs_strerror (int error_number)
68c45bf0
PE
10745{
10746 char *str;
10747
ca9c0567 10748 synchronize_system_messages_locale ();
68c45bf0
PE
10749 str = strerror (error_number);
10750
10751 if (! NILP (Vlocale_coding_system))
10752 {
10753 Lisp_Object dec = code_convert_string_norecord (build_string (str),
10754 Vlocale_coding_system,
10755 0);
51b59d79 10756 str = SSDATA (dec);
68c45bf0
PE
10757 }
10758
10759 return str;
10760}
10761
4ed46869 10762#endif /* emacs */