Merge from trunk.
[bpt/emacs.git] / src / coding.c
CommitLineData
9542cb1f 1/* Coding system handler (conversion, detection, etc).
acaf905b 2 Copyright (C) 2001-2012 Free Software Foundation, Inc.
7976eda0 3 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
5df4f04c 4 2005, 2006, 2007, 2008, 2009, 2010, 2011
ce03bf76
KH
5 National Institute of Advanced Industrial Science and Technology (AIST)
6 Registration Number H14PRO021
8f924df7 7 Copyright (C) 2003
df7492f9
KH
8 National Institute of Advanced Industrial Science and Technology (AIST)
9 Registration Number H13PRO009
4ed46869 10
369314dc
KH
11This file is part of GNU Emacs.
12
9ec0b715 13GNU Emacs is free software: you can redistribute it and/or modify
369314dc 14it under the terms of the GNU General Public License as published by
9ec0b715
GM
15the Free Software Foundation, either version 3 of the License, or
16(at your option) any later version.
4ed46869 17
369314dc
KH
18GNU Emacs is distributed in the hope that it will be useful,
19but WITHOUT ANY WARRANTY; without even the implied warranty of
20MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21GNU General Public License for more details.
4ed46869 22
369314dc 23You should have received a copy of the GNU General Public License
9ec0b715 24along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
4ed46869
KH
25
26/*** TABLE OF CONTENTS ***
27
b73bfc1c 28 0. General comments
4ed46869 29 1. Preamble
df7492f9
KH
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
4ed46869
KH
41
42*/
43
df7492f9 44/*** 0. General comments ***
b73bfc1c
KH
45
46
df7492f9 47CODING SYSTEM
4ed46869 48
5bad0796
DL
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
e19c3639
KH
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
0ef69138 56 coding system.
4ed46869 57
34809aa6
EZ
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. On
59 the C level, a coding system is represented by a vector of attributes
5bad0796 60 stored in the hash table Vcoding_system_hash_table. The conversion from
e19c3639
KH
61 coding system symbol to attributes vector is done by looking up
62 Vcoding_system_hash_table by the symbol.
4ed46869 63
e19c3639 64 Coding systems are classified into the following types depending on
5bad0796 65 the encoding mechanism. Here's a brief description of the types.
4ed46869 66
df7492f9
KH
67 o UTF-8
68
69 o UTF-16
70
71 o Charset-base coding system
72
73 A coding system defined by one or more (coded) character sets.
5bad0796 74 Decoding and encoding are done by a code converter defined for each
df7492f9
KH
75 character set.
76
5bad0796 77 o Old Emacs internal format (emacs-mule)
df7492f9 78
5bad0796 79 The coding system adopted by old versions of Emacs (20 and 21).
4ed46869 80
df7492f9 81 o ISO2022-base coding system
4ed46869
KH
82
83 The most famous coding system for multiple character sets. X's
df7492f9
KH
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
4ed46869 87
df7492f9 88 o SJIS (or Shift-JIS or MS-Kanji-Code)
93dec019 89
4ed46869
KH
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
df7492f9 92 section 8.
4ed46869 93
df7492f9 94 o BIG5
4ed46869 95
df7492f9 96 A coding system to encode character sets: ASCII and Big5. Widely
cfb43547 97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
df7492f9
KH
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
4ed46869 101
df7492f9 102 o CCL
27901516 103
5bad0796 104 If a user wants to decode/encode text encoded in a coding system
df7492f9
KH
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
27901516 108
df7492f9 109 o Raw-text
4ed46869 110
5a936b46 111 A coding system for text containing raw eight-bit data. Emacs
5bad0796 112 treats each byte of source text as a character (except for
df7492f9 113 end-of-line conversion).
4ed46869 114
df7492f9
KH
115 o No-conversion
116
117 Like raw text, but don't do end-of-line conversion.
4ed46869 118
4ed46869 119
df7492f9 120END-OF-LINE FORMAT
4ed46869 121
5bad0796 122 How text end-of-line is encoded depends on operating system. For
df7492f9 123 instance, Unix's format is just one byte of LF (line-feed) code,
f4dee582 124 whereas DOS's format is two-byte sequence of `carriage-return' and
d46c5b12
KH
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
4ed46869 127
cfb43547 128 Since text character encoding and end-of-line encoding are
df7492f9
KH
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
4ed46869 131
e19c3639
KH
132STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
5bad0796 137 conversion (e.g. the location of source and destination data).
4ed46869
KH
138
139*/
140
df7492f9
KH
141/* COMMON MACROS */
142
143
4ed46869
KH
144/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145
df7492f9 146 These functions check if a byte sequence specified as a source in
ff0dacd7
KH
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
df7492f9 149
ff0dacd7 150 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
df7492f9
KH
151
152 Below is the template of these functions. */
153
#if 0
/* Template of a detector for coding category XXX.

   Read the bytes of CODING's source one at a time.  If a byte that is
   invalid for XXX is found, add CATEGORY_MASK_XXX to
   DETECT_INFO->rejected and return 0.  If the source is exhausted
   without finding such a byte, add the evidence collected in FOUND to
   DETECT_INFO->found and return 1.  */
static int
detect_coding_XXX (struct coding_system *coding,
                   struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int multibytep = coding->src_multibyte;
  ptrdiff_t consumed_chars = 0;
  int found = 0;
  /* NOTE: ONE_MORE_BYTE also refers to C and SRC_BASE; a real detector
     must declare them among the declarations elided here.  */
  ...;

  while (1)
    {
      /* Get one byte from the source.  If the source is exhausted, jump
         to no_more_source:.  */
      ONE_MORE_BYTE (c);

      if (! __C_conforms_to_XXX___ (c))
        break;
      /* Record positive evidence only for a byte that really
         suggests XXX.  */
      if (__C_strongly_suggests_XXX__ (c))
        found = CATEGORY_MASK_XXX;
    }
  /* The byte sequence is invalid for XXX.  */
  detect_info->rejected |= CATEGORY_MASK_XXX;
  return 0;

 no_more_source:
  /* The source exhausted successfully.  */
  detect_info->found |= found;
  return 1;
}
#endif
187
188/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
189
df7492f9
KH
190 These functions decode a byte sequence specified as a source by
191 CODING. The resulting multibyte text goes to a place pointed to by
192 CODING->charbuf, the length of which should not exceed
193 CODING->charbuf_size.
d46c5b12 194
df7492f9
KH
195 These functions set the information of original and decoded texts in
196 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
197 They also set CODING->result to one of CODING_RESULT_XXX indicating
198 how the decoding is finished.
d46c5b12 199
df7492f9 200 Below is the template of these functions. */
d46c5b12 201
#if 0
/* Template of a decoder.

   Decode bytes from CODING->source, starting at offset
   CODING->consumed, into character codes appended to CODING->charbuf.
   On exit, CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used are updated.

   NOTE: ONE_MORE_BYTE also refers to C and CONSUMED_CHARS; a real
   decoder must declare them.  */
static void
decode_coding_XXXX (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* SRC_BASE remembers the start position in source in each loop.
     The loop will be exited when there's not enough source code, or
     when there's no room in CHARBUF for a decoded character.  */
  const unsigned char *src_base;
  /* A buffer to produce decoded characters.  */
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  int multibytep = coding->src_multibyte;

  while (1)
    {
      src_base = src;
      if (charbuf >= charbuf_end)
        /* No more room to produce a decoded character.  */
        break;
      ONE_MORE_BYTE (c);
      /* Decode it.  */
    }

 no_more_source:
  if (src_base < src_end
      && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      /* If the source ends by partial bytes to construct a character,
         treat them as eight-bit raw data.  */
      while (src_base < src_end && charbuf < charbuf_end)
        *charbuf++ = *src_base++;
    }
  /* Remember how many bytes and characters we consumed.  If the
     source is multibyte, the bytes and chars are not identical.  */
  coding->consumed = coding->consumed_char = src_base - coding->source;
  /* Remember how many characters we produced.  */
  coding->charbuf_used = charbuf - coding->charbuf;
}
#endif
241
242/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
243
df7492f9
KH
244 These functions encode SRC_BYTES length text at SOURCE of Emacs'
245 internal multibyte format by CODING. The resulting byte sequence
b73bfc1c
KH
246 goes to a place pointed to by DESTINATION, the length of which
247 should not exceed DST_BYTES.
d46c5b12 248
df7492f9
KH
249 These functions set the information of original and encoded texts in
250 the members produced, produced_char, consumed, and consumed_char of
251 the structure *CODING. They also set the member result to one of
252 CODING_RESULT_XXX indicating how the encoding finished.
d46c5b12 253
df7492f9
KH
254 DST_BYTES zero means that source area and destination area are
255 overlapped, which means that we can produce encoded text until it
256 reaches the head of the not-yet-encoded source text.
d46c5b12 257
df7492f9 258 Below is a template of these functions. */
#if 0
/* Template of an encoder.

   Encode the CODING->charbuf_used character codes held in
   CODING->charbuf into bytes stored from CODING->destination +
   CODING->produced, then update CODING->produced_char and
   CODING->produced.

   NOTE(review): the concrete encode_coding_* functions declared in
   this file return int, while this template is declared void; confirm
   the intended return value before copying it.  */
static void
encode_coding_XXX (struct coding_system *coding)
{
  int multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  /* One past the last character code to encode.  */
  int *charbuf_end = coding->charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  /* Leave room for the bytes one loop iteration may produce.  */
  unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
  ptrdiff_t produced_chars = 0;

  for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
    {
      int c = *charbuf;
      /* Encode C into DST, and increment DST.  */
    }
 label_no_more_destination:
  /* How many chars and bytes we produced.  */
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
}
#endif
282
4ed46869
KH
283\f
284/*** 1. Preamble ***/
285
68c45bf0 286#include <config.h>
4ed46869 287#include <stdio.h>
d7306fe6 288#include <setjmp.h>
4ed46869 289
4ed46869
KH
290#include "lisp.h"
291#include "buffer.h"
df7492f9 292#include "character.h"
4ed46869
KH
293#include "charset.h"
294#include "ccl.h"
df7492f9 295#include "composite.h"
4ed46869
KH
296#include "coding.h"
297#include "window.h"
b8299c66
KL
298#include "frame.h"
299#include "termhooks.h"
4ed46869 300
df7492f9 301Lisp_Object Vcoding_system_hash_table;
4ed46869 302
955cbe7b
PE
303static Lisp_Object Qcoding_system, Qeol_type;
304static Lisp_Object Qcoding_aliases;
1965cb73 305Lisp_Object Qunix, Qdos;
4ed46869 306Lisp_Object Qbuffer_file_coding_system;
955cbe7b
PE
307static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
308static Lisp_Object Qdefault_char;
27901516 309Lisp_Object Qno_conversion, Qundecided;
955cbe7b
PE
310Lisp_Object Qcharset, Qutf_8;
311static Lisp_Object Qiso_2022;
312static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
313static Lisp_Object Qbig, Qlittle;
314static Lisp_Object Qcoding_system_history;
315static Lisp_Object Qvalid_codes;
316static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
317static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
318static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
319static Lisp_Object QCascii_compatible_p;
4ed46869 320
387f6ba5 321Lisp_Object Qcall_process, Qcall_process_region;
4ed46869 322Lisp_Object Qstart_process, Qopen_network_stream;
955cbe7b 323static Lisp_Object Qtarget_idx;
4ed46869 324
955cbe7b
PE
325static Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
326static Lisp_Object Qinterrupted, Qinsufficient_memory;
065e3595 327
44e8490d
KH
328/* If a symbol has this property, evaluate the value to define the
329 symbol as a coding system. */
330static Lisp_Object Qcoding_system_define_form;
331
fcbcfb64
KH
332/* Format of end-of-line decided by system. This is Qunix on
333 Unix and Mac, Qdos on DOS/Windows.
334 This has an effect only for external encoding (i.e. for output to
335 file and process), not for in-buffer or Lisp string encoding. */
336static Lisp_Object system_eol_type;
337
4ed46869
KH
338#ifdef emacs
339
4608c386 340Lisp_Object Qcoding_system_p, Qcoding_system_error;
4ed46869 341
d46c5b12
KH
342/* Coding system emacs-mule and raw-text are for converting only
343 end-of-line format. */
344Lisp_Object Qemacs_mule, Qraw_text;
8f924df7 345Lisp_Object Qutf_8_emacs;
ecf488bc 346
4ed46869
KH
347/* Coding-systems are handed between Emacs Lisp programs and C internal
348 routines by the following three variables. */
c4825358
KH
349/* Coding system to be used to encode text for terminal display when
350 terminal coding system is nil. */
351struct coding_system safe_terminal_coding;
352
4ed46869
KH
353#endif /* emacs */
354
f967223b
KH
355Lisp_Object Qtranslation_table;
356Lisp_Object Qtranslation_table_id;
955cbe7b
PE
357static Lisp_Object Qtranslation_table_for_decode;
358static Lisp_Object Qtranslation_table_for_encode;
4ed46869 359
df7492f9 360/* Two special coding systems. */
74ab6df5
PE
361static Lisp_Object Vsjis_coding_system;
362static Lisp_Object Vbig5_coding_system;
df7492f9 363
df7492f9
KH
364/* ISO2022 section */
365
366#define CODING_ISO_INITIAL(coding, reg) \
367 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
368 coding_attr_iso_initial), \
369 reg)))
370
371
1b3b981b
AS
/* Return the value stored for CHARSET_ID in CODING's safe_charsets
   table.  Return -1 instead if CHARSET_ID is greater than
   CODING->max_charset_id, or if the stored value is 255.
   CHARSET_ID may be evaluated more than once.  */
#define CODING_ISO_REQUEST(coding, charset_id)                  \
  (((charset_id) > (coding)->max_charset_id                     \
    || (coding)->safe_charsets[charset_id] == 255)              \
   ? -1                                                         \
   : (coding)->safe_charsets[charset_id])
378
379
380#define CODING_ISO_FLAGS(coding) \
381 ((coding)->spec.iso_2022.flags)
382#define CODING_ISO_DESIGNATION(coding, reg) \
383 ((coding)->spec.iso_2022.current_designation[reg])
384#define CODING_ISO_INVOCATION(coding, plane) \
385 ((coding)->spec.iso_2022.current_invocation[plane])
386#define CODING_ISO_SINGLE_SHIFTING(coding) \
387 ((coding)->spec.iso_2022.single_shifting)
388#define CODING_ISO_BOL(coding) \
389 ((coding)->spec.iso_2022.bol)
390#define CODING_ISO_INVOKED_CHARSET(coding, plane) \
391 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
e951386e
KH
392#define CODING_ISO_CMP_STATUS(coding) \
393 (&(coding)->spec.iso_2022.cmp_status)
394#define CODING_ISO_EXTSEGMENT_LEN(coding) \
395 ((coding)->spec.iso_2022.ctext_extended_segment_len)
396#define CODING_ISO_EMBEDDED_UTF_8(coding) \
397 ((coding)->spec.iso_2022.embedded_utf_8)
df7492f9
KH
398
399/* Control characters of ISO2022. */
400 /* code */ /* function */
df7492f9
KH
401#define ISO_CODE_SO 0x0E /* shift-out */
402#define ISO_CODE_SI 0x0F /* shift-in */
403#define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
404#define ISO_CODE_ESC 0x1B /* escape */
405#define ISO_CODE_SS2 0x8E /* single-shift-2 */
406#define ISO_CODE_SS3 0x8F /* single-shift-3 */
407#define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
408
409/* Every 1-byte code of ISO2022 is classified into one of the
410 following classes. */
411enum iso_code_class_type
412 {
413 ISO_control_0, /* Control codes in the range
414 0x00..0x1F and 0x7F, except for the
415 following 4 codes.
 NOTE(review): 0x7F is also named by
 ISO_0x20_or_0x7F below; confirm which
 class it is actually given. */
df7492f9
KH
416 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
417 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
418 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
419 ISO_escape, /* ISO_CODE_ESC (0x1B) */
420 ISO_control_1, /* Control codes in the range
421 0x80..0x9F, except for the
422 following 3 codes. */
423 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
424 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
425 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
426 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
427 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
428 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
429 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
430 };
05e6f5dc 431
df7492f9
KH
432/** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
433 `iso-flags' attribute of an iso2022 coding system. */
05e6f5dc 434
df7492f9
KH
435/* If set, produce long-form designation sequence (e.g. ESC $ ( A)
436 instead of the correct short-form sequence (e.g. ESC $ A). */
437#define CODING_ISO_FLAG_LONG_FORM 0x0001
93dec019 438
df7492f9
KH
439/* If set, reset graphic planes and registers at end-of-line to the
440 initial state. */
441#define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
05e6f5dc 442
df7492f9
KH
443/* If set, reset graphic planes and registers before any control
444 characters to the initial state. */
445#define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
05e6f5dc 446
df7492f9
KH
447/* If set, encode by 7-bit environment. */
448#define CODING_ISO_FLAG_SEVEN_BITS 0x0008
4ed46869 449
df7492f9
KH
450/* If set, use locking-shift function. */
451#define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
b73bfc1c 452
df7492f9
KH
453/* If set, use single-shift function. Overwrite
454 CODING_ISO_FLAG_LOCKING_SHIFT. */
455#define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
b73bfc1c 456
df7492f9
KH
457/* If set, use designation escape sequence. */
458#define CODING_ISO_FLAG_DESIGNATION 0x0040
b73bfc1c 459
df7492f9
KH
460/* If set, produce revision number sequence. */
461#define CODING_ISO_FLAG_REVISION 0x0080
b73bfc1c 462
df7492f9
KH
463/* If set, produce ISO6429's direction specifying sequence. */
464#define CODING_ISO_FLAG_DIRECTION 0x0100
f4dee582 465
df7492f9
KH
466/* If set, assume designation states are reset at beginning of line on
467 output. */
468#define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
4ed46869 469
df7492f9
KH
470/* If set, designation sequence should be placed at beginning of line
471 on output. */
472#define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
aa72b389 473
ad1746f5 474/* If set, do not encode unsafe characters on output. */
df7492f9 475#define CODING_ISO_FLAG_SAFE 0x0800
aa72b389 476
df7492f9
KH
477/* If set, extra latin codes (128..159) are accepted as a valid code
478 on input. */
479#define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
aa72b389 480
df7492f9 481#define CODING_ISO_FLAG_COMPOSITION 0x2000
aa72b389 482
5f58e762 483/* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
aa72b389 484
bf16eb23 485#define CODING_ISO_FLAG_USE_ROMAN 0x8000
aa72b389 486
bf16eb23 487#define CODING_ISO_FLAG_USE_OLDJIS 0x10000
aa72b389 488
bf16eb23 489#define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
aa72b389 490
df7492f9
KH
491/* A character to be produced on output if encoding of the original
492 character is prohibited by CODING_ISO_FLAG_SAFE. */
493#define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
aa72b389 494
a470d443
KH
495/* UTF-8 section */
496#define CODING_UTF_8_BOM(coding) \
497 ((coding)->spec.utf_8_bom)
4ed46869 498
df7492f9
KH
499/* UTF-16 section */
500#define CODING_UTF_16_BOM(coding) \
501 ((coding)->spec.utf_16.bom)
4ed46869 502
df7492f9
KH
503#define CODING_UTF_16_ENDIAN(coding) \
504 ((coding)->spec.utf_16.endian)
4ed46869 505
df7492f9
KH
506#define CODING_UTF_16_SURROGATE(coding) \
507 ((coding)->spec.utf_16.surrogate)
4ed46869 508
4ed46869 509
df7492f9
KH
510/* CCL section */
511#define CODING_CCL_DECODER(coding) \
512 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
513#define CODING_CCL_ENCODER(coding) \
514 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
515#define CODING_CCL_VALIDS(coding) \
8f924df7 516 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
4ed46869 517
5a936b46 518/* Index for each coding category in `coding_categories' */
4ed46869 519
df7492f9
KH
520enum coding_category
521 {
522 coding_category_iso_7,
523 coding_category_iso_7_tight,
524 coding_category_iso_8_1,
525 coding_category_iso_8_2,
526 coding_category_iso_7_else,
527 coding_category_iso_8_else,
a470d443
KH
528 coding_category_utf_8_auto,
529 coding_category_utf_8_nosig,
530 coding_category_utf_8_sig,
df7492f9
KH
531 coding_category_utf_16_auto,
532 coding_category_utf_16_be,
533 coding_category_utf_16_le,
534 coding_category_utf_16_be_nosig,
535 coding_category_utf_16_le_nosig,
536 coding_category_charset,
537 coding_category_sjis,
538 coding_category_big5,
539 coding_category_ccl,
540 coding_category_emacs_mule,
541 /* All above are targets of code detection. */
542 coding_category_raw_text,
543 coding_category_undecided,
544 coding_category_max
545 };
546
547/* Definitions of flag bits used in detect_coding_XXXX. */
548#define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
549#define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
550#define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
551#define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
552#define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
553#define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
a470d443
KH
554#define CATEGORY_MASK_UTF_8_AUTO (1 << coding_category_utf_8_auto)
555#define CATEGORY_MASK_UTF_8_NOSIG (1 << coding_category_utf_8_nosig)
556#define CATEGORY_MASK_UTF_8_SIG (1 << coding_category_utf_8_sig)
b49a1807 557#define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
df7492f9
KH
558#define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
559#define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
560#define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
561#define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
562#define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
563#define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
564#define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
565#define CATEGORY_MASK_CCL (1 << coding_category_ccl)
566#define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
ff0dacd7 567#define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
df7492f9
KH
568
569/* This value is returned if detect_coding_mask () finds nothing other
570 than ASCII characters. */
571#define CATEGORY_MASK_ANY \
572 (CATEGORY_MASK_ISO_7 \
573 | CATEGORY_MASK_ISO_7_TIGHT \
574 | CATEGORY_MASK_ISO_8_1 \
575 | CATEGORY_MASK_ISO_8_2 \
576 | CATEGORY_MASK_ISO_7_ELSE \
577 | CATEGORY_MASK_ISO_8_ELSE \
a470d443
KH
578 | CATEGORY_MASK_UTF_8_AUTO \
579 | CATEGORY_MASK_UTF_8_NOSIG \
580 | CATEGORY_MASK_UTF_8_SIG \
2f3cbb32 581 | CATEGORY_MASK_UTF_16_AUTO \
df7492f9
KH
582 | CATEGORY_MASK_UTF_16_BE \
583 | CATEGORY_MASK_UTF_16_LE \
584 | CATEGORY_MASK_UTF_16_BE_NOSIG \
585 | CATEGORY_MASK_UTF_16_LE_NOSIG \
586 | CATEGORY_MASK_CHARSET \
587 | CATEGORY_MASK_SJIS \
588 | CATEGORY_MASK_BIG5 \
589 | CATEGORY_MASK_CCL \
590 | CATEGORY_MASK_EMACS_MULE)
591
592
593#define CATEGORY_MASK_ISO_7BIT \
594 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
595
596#define CATEGORY_MASK_ISO_8BIT \
597 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
598
599#define CATEGORY_MASK_ISO_ELSE \
600 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
601
602#define CATEGORY_MASK_ISO_ESCAPE \
603 (CATEGORY_MASK_ISO_7 \
604 | CATEGORY_MASK_ISO_7_TIGHT \
605 | CATEGORY_MASK_ISO_7_ELSE \
606 | CATEGORY_MASK_ISO_8_ELSE)
607
608#define CATEGORY_MASK_ISO \
609 ( CATEGORY_MASK_ISO_7BIT \
610 | CATEGORY_MASK_ISO_8BIT \
611 | CATEGORY_MASK_ISO_ELSE)
612
613#define CATEGORY_MASK_UTF_16 \
2f3cbb32
KH
614 (CATEGORY_MASK_UTF_16_AUTO \
615 | CATEGORY_MASK_UTF_16_BE \
df7492f9
KH
616 | CATEGORY_MASK_UTF_16_LE \
617 | CATEGORY_MASK_UTF_16_BE_NOSIG \
618 | CATEGORY_MASK_UTF_16_LE_NOSIG)
619
a470d443
KH
620#define CATEGORY_MASK_UTF_8 \
621 (CATEGORY_MASK_UTF_8_AUTO \
622 | CATEGORY_MASK_UTF_8_NOSIG \
623 | CATEGORY_MASK_UTF_8_SIG)
df7492f9 624
df7492f9 625/* Table of coding categories (Lisp symbols). This variable is for
ad1746f5 626 internal use only. */
df7492f9
KH
627static Lisp_Object Vcoding_category_table;
628
629/* Table of coding-categories ordered by priority. */
630static enum coding_category coding_priorities[coding_category_max];
631
632/* Nth element is a coding context for the coding system bound to the
633 Nth coding category. */
634static struct coding_system coding_categories[coding_category_max];
635
df7492f9
KH
636/*** Commonly used macros and functions ***/
637
/* Fallback definitions of min and max, supplied only when no earlier
   header has defined them.  Each argument may be evaluated twice, so
   do not pass expressions with side effects.  */
#if !defined min
# define min(x, y) (((x) < (y)) ? (x) : (y))
#endif

#if !defined max
# define max(x, y) (((x) > (y)) ? (x) : (y))
#endif
4ed46869 644
24a73b0a
KH
645#define CODING_GET_INFO(coding, attrs, charset_list) \
646 do { \
647 (attrs) = CODING_ID_ATTRS ((coding)->id); \
648 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
df7492f9 649 } while (0)
4ed46869 650
4ed46869 651
df7492f9
KH
652/* Safely get one byte from the source text pointed by SRC which ends
653 at SRC_END, and set C to that byte. If there are not enough bytes
065e3595
KH
654 in the source, it jumps to `no_more_source'; before jumping it
 records CODING_RESULT_INSUFFICIENT_SRC in CODING if SRC_BASE < SRC.
 If multibytep is nonzero and the byte has its high bit set, then:
 when (byte & 0xFE) == 0xC0, C is set to ((byte & 1) << 6) | the
 next byte, and that next byte is consumed too; otherwise
655 a multibyte character is read at SRC, C is set to the
656 negative value of the character code, and
 CODING_RESULT_INVALID_SRC is recorded in CODING.
 CONSUMED_CHARS is incremented once per successful use.
 The caller should declare
657 and set these variables appropriately in advance:
658 src, src_end, src_base, multibytep, coding, consumed_chars
 and must provide the label `no_more_source'. */
aa72b389 659
065e3595
KH
660#define ONE_MORE_BYTE(c) \
661 do { \
662 if (src == src_end) \
663 { \
664 if (src_base < src) \
665 record_conversion_result \
666 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
667 goto no_more_source; \
668 } \
669 c = *src++; \
670 if (multibytep && (c & 0x80)) \
671 { \
672 if ((c & 0xFE) == 0xC0) \
673 c = ((c & 1) << 6) | *src++; \
674 else \
675 { \
35befdaa
KH
676 src--; \
677 c = - string_char (src, &src, NULL); \
065e3595
KH
678 record_conversion_result \
679 (coding, CODING_RESULT_INVALID_SRC); \
680 } \
681 } \
682 consumed_chars++; \
aa72b389
KH
683 } while (0)
684
f56a4450 685/* Safely get two bytes from the source text pointed by SRC which ends
220eeac9
KH
686 at SRC_END, and set C1 and C2 to those bytes while skipping the
687 leading multibyte characters. If there are not enough bytes in the
688 source, it jumps to `no_more_source'. If multibytep is nonzero and
689 a multibyte character is found for C2, set C2 to -1 (the character
690 code itself is not returned). In multibyte mode a byte B with
 (B & 0xFE) == 0xC0 is combined with the following byte as
 ((B & 1) << 6) | next, for both C1 and C2. The caller should
 declare and set these
691 variables appropriately in advance:
f56a4450
KH
692 src, src_end, multibytep
693 It is intended that this macro is used in detect_coding_utf_16. */
694
220eeac9
KH
695#define TWO_MORE_BYTES(c1, c2) \
696 do { \
697 do { \
698 if (src == src_end) \
699 goto no_more_source; \
700 c1 = *src++; \
701 if (multibytep && (c1 & 0x80)) \
702 { \
703 if ((c1 & 0xFE) == 0xC0) \
704 c1 = ((c1 & 1) << 6) | *src++; \
705 else \
706 { \
707 src += BYTES_BY_CHAR_HEAD (c1) - 1; \
708 c1 = -1; \
709 } \
710 } \
711 } while (c1 < 0); \
712 if (src == src_end) \
713 goto no_more_source; \
714 c2 = *src++; \
715 if (multibytep && (c2 & 0x80)) \
716 { \
717 if ((c2 & 0xFE) == 0xC0) \
718 c2 = ((c2 & 1) << 6) | *src++; \
719 else \
720 c2 = -1; \
721 } \
f56a4450
KH
722 } while (0)
723
aa72b389 724
df7492f9
KH
/* Write the single byte C at DST, step DST forward by one, and count
   one more produced character in PRODUCED_CHARS.  The caller must
   guarantee that C is in the range 0..127 and must have `dst' and
   `produced_chars' declared and set up beforehand.  */

#define EMIT_ONE_ASCII_BYTE(c)  \
  do {                          \
    produced_chars += 1;        \
    dst[0] = (c);               \
    dst += 1;                   \
  } while (0)
aa72b389
KH
737
738
/* Two-byte variant of EMIT_ONE_ASCII_BYTE: write C1 and then C2 at
   DST, step DST forward by two, and add 2 to PRODUCED_CHARS.  */

#define EMIT_TWO_ASCII_BYTES(c1, c2)    \
  do {                                  \
    produced_chars += 2;                \
    *dst++ = (c1);                      \
    *dst++ = (c2);                      \
  } while (0)
aa72b389
KH
746
747
df7492f9
KH
748/* Store a byte C in the place pointed by DST and increment DST to the
749 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
750 nonzero, store in an appropriate multibyte form. The caller should
751 declare and set the variables `dst' and `multibytep' appropriately
752 in advance. */
753
754#define EMIT_ONE_BYTE(c) \
755 do { \
756 produced_chars++; \
757 if (multibytep) \
758 { \
b25d760e 759 unsigned ch = (c); \
df7492f9
KH
760 if (ch >= 0x80) \
761 ch = BYTE8_TO_CHAR (ch); \
762 CHAR_STRING_ADVANCE (ch, dst); \
763 } \
764 else \
765 *dst++ = (c); \
aa72b389 766 } while (0)
aa72b389 767
aa72b389 768
df7492f9 769/* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
aa72b389 770
e19c3639
KH
771#define EMIT_TWO_BYTES(c1, c2) \
772 do { \
773 produced_chars += 2; \
774 if (multibytep) \
775 { \
b25d760e 776 unsigned ch; \
e19c3639
KH
777 \
778 ch = (c1); \
779 if (ch >= 0x80) \
780 ch = BYTE8_TO_CHAR (ch); \
781 CHAR_STRING_ADVANCE (ch, dst); \
782 ch = (c2); \
783 if (ch >= 0x80) \
784 ch = BYTE8_TO_CHAR (ch); \
785 CHAR_STRING_ADVANCE (ch, dst); \
786 } \
787 else \
788 { \
789 *dst++ = (c1); \
790 *dst++ = (c2); \
791 } \
aa72b389
KH
792 } while (0)
793
794
df7492f9
KH
795#define EMIT_THREE_BYTES(c1, c2, c3) \
796 do { \
797 EMIT_ONE_BYTE (c1); \
798 EMIT_TWO_BYTES (c2, c3); \
799 } while (0)
aa72b389 800
aa72b389 801
df7492f9
KH
802#define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
803 do { \
804 EMIT_TWO_BYTES (c1, c2); \
805 EMIT_TWO_BYTES (c3, c4); \
806 } while (0)
aa72b389 807
aa72b389 808
f6cbaf43 809/* Prototypes for static functions. */
f57e2426
J
810static void record_conversion_result (struct coding_system *coding,
811 enum coding_result_code result);
812static int detect_coding_utf_8 (struct coding_system *,
813 struct coding_detection_info *info);
814static void decode_coding_utf_8 (struct coding_system *);
815static int encode_coding_utf_8 (struct coding_system *);
816
817static int detect_coding_utf_16 (struct coding_system *,
818 struct coding_detection_info *info);
819static void decode_coding_utf_16 (struct coding_system *);
820static int encode_coding_utf_16 (struct coding_system *);
821
822static int detect_coding_iso_2022 (struct coding_system *,
823 struct coding_detection_info *info);
824static void decode_coding_iso_2022 (struct coding_system *);
825static int encode_coding_iso_2022 (struct coding_system *);
826
827static int detect_coding_emacs_mule (struct coding_system *,
828 struct coding_detection_info *info);
829static void decode_coding_emacs_mule (struct coding_system *);
830static int encode_coding_emacs_mule (struct coding_system *);
831
832static int detect_coding_sjis (struct coding_system *,
833 struct coding_detection_info *info);
834static void decode_coding_sjis (struct coding_system *);
835static int encode_coding_sjis (struct coding_system *);
836
837static int detect_coding_big5 (struct coding_system *,
838 struct coding_detection_info *info);
839static void decode_coding_big5 (struct coding_system *);
840static int encode_coding_big5 (struct coding_system *);
841
842static int detect_coding_ccl (struct coding_system *,
843 struct coding_detection_info *info);
844static void decode_coding_ccl (struct coding_system *);
845static int encode_coding_ccl (struct coding_system *);
846
847static void decode_coding_raw_text (struct coding_system *);
848static int encode_coding_raw_text (struct coding_system *);
849
8f50130c
PE
850static ptrdiff_t coding_set_source (struct coding_system *);
851static ptrdiff_t coding_set_destination (struct coding_system *);
d311d28c 852static void coding_alloc_by_realloc (struct coding_system *, ptrdiff_t);
f57e2426 853static void coding_alloc_by_making_gap (struct coding_system *,
d311d28c 854 ptrdiff_t, ptrdiff_t);
f57e2426 855static unsigned char *alloc_destination (struct coding_system *,
d311d28c 856 ptrdiff_t, unsigned char *);
f57e2426 857static void setup_iso_safe_charsets (Lisp_Object);
6e6c82a4 858static ptrdiff_t encode_designation_at_bol (struct coding_system *,
5eb05ea3 859 int *, int *, unsigned char *);
f57e2426 860static int detect_eol (const unsigned char *,
d311d28c 861 ptrdiff_t, enum coding_category);
f57e2426
J
862static Lisp_Object adjust_coding_eol_type (struct coding_system *, int);
863static void decode_eol (struct coding_system *);
864static Lisp_Object get_translation_table (Lisp_Object, int, int *);
865static Lisp_Object get_translation (Lisp_Object, int *, int *);
866static int produce_chars (struct coding_system *, Lisp_Object, int);
55d4c1b2 867static inline void produce_charset (struct coding_system *, int *,
d311d28c
PE
868 ptrdiff_t);
869static void produce_annotation (struct coding_system *, ptrdiff_t);
f57e2426 870static int decode_coding (struct coding_system *);
d311d28c 871static inline int *handle_composition_annotation (ptrdiff_t, ptrdiff_t,
f57e2426 872 struct coding_system *,
d311d28c
PE
873 int *, ptrdiff_t *);
874static inline int *handle_charset_annotation (ptrdiff_t, ptrdiff_t,
f57e2426 875 struct coding_system *,
d311d28c 876 int *, ptrdiff_t *);
f57e2426
J
877static void consume_chars (struct coding_system *, Lisp_Object, int);
878static int encode_coding (struct coding_system *);
879static Lisp_Object make_conversion_work_buffer (int);
880static Lisp_Object code_conversion_restore (Lisp_Object);
55d4c1b2 881static inline int char_encodable_p (int, Lisp_Object);
f57e2426 882static Lisp_Object make_subsidiaries (Lisp_Object);
f6cbaf43 883
065e3595
KH
884static void
885record_conversion_result (struct coding_system *coding,
886 enum coding_result_code result)
887{
888 coding->result = result;
889 switch (result)
890 {
891 case CODING_RESULT_INSUFFICIENT_SRC:
892 Vlast_code_conversion_error = Qinsufficient_source;
893 break;
894 case CODING_RESULT_INCONSISTENT_EOL:
895 Vlast_code_conversion_error = Qinconsistent_eol;
896 break;
897 case CODING_RESULT_INVALID_SRC:
898 Vlast_code_conversion_error = Qinvalid_source;
899 break;
900 case CODING_RESULT_INTERRUPT:
901 Vlast_code_conversion_error = Qinterrupted;
902 break;
903 case CODING_RESULT_INSUFFICIENT_MEM:
904 Vlast_code_conversion_error = Qinsufficient_memory;
905 break;
ebaf11b6
KH
906 case CODING_RESULT_INSUFFICIENT_DST:
907 /* Don't record this error in Vlast_code_conversion_error
908 because it happens just temporarily and is resolved when the
909 whole conversion is finished. */
910 break;
409ea3a1
AS
911 case CODING_RESULT_SUCCESS:
912 break;
35befdaa
KH
913 default:
914 Vlast_code_conversion_error = intern ("Unknown error");
065e3595
KH
915 }
916}
917
5eb05ea3
KH
918/* These wrapper macros are used to preserve validity of pointers into
919 buffer text across calls to decode_char, encode_char, etc, which
920 could cause relocation of buffers if it loads a charset map,
921 because loading a charset map allocates large structures. */
922
df7492f9
KH
923#define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
924 do { \
8f50130c 925 ptrdiff_t offset; \
5eb05ea3 926 \
df7492f9
KH
927 charset_map_loaded = 0; \
928 c = DECODE_CHAR (charset, code); \
5eb05ea3
KH
929 if (charset_map_loaded \
930 && (offset = coding_set_source (coding))) \
df7492f9 931 { \
df7492f9
KH
932 src += offset; \
933 src_base += offset; \
934 src_end += offset; \
935 } \
aa72b389
KH
936 } while (0)
937
5eb05ea3
KH
938#define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code) \
939 do { \
8f50130c 940 ptrdiff_t offset; \
5eb05ea3
KH
941 \
942 charset_map_loaded = 0; \
943 code = ENCODE_CHAR (charset, c); \
944 if (charset_map_loaded \
945 && (offset = coding_set_destination (coding))) \
946 { \
947 dst += offset; \
948 dst_end += offset; \
949 } \
950 } while (0)
951
952#define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
953 do { \
8f50130c 954 ptrdiff_t offset; \
5eb05ea3
KH
955 \
956 charset_map_loaded = 0; \
957 charset = char_charset (c, charset_list, code_return); \
958 if (charset_map_loaded \
959 && (offset = coding_set_destination (coding))) \
960 { \
961 dst += offset; \
962 dst_end += offset; \
963 } \
964 } while (0)
965
966#define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
967 do { \
8f50130c 968 ptrdiff_t offset; \
5eb05ea3
KH
969 \
970 charset_map_loaded = 0; \
971 result = CHAR_CHARSET_P (c, charset); \
972 if (charset_map_loaded \
973 && (offset = coding_set_destination (coding))) \
974 { \
975 dst += offset; \
976 dst_end += offset; \
977 } \
978 } while (0)
979
aa72b389 980
119852e7
KH
981/* If there are not more than BYTES bytes of room left at dst,
982 allocate more memory for coding->destination and update dst and
983 dst_end. We don't have to take care of coding->source, which may be
984 relocated; that is handled by calling coding_set_source in encode_coding. */
985
df7492f9
KH
986#define ASSURE_DESTINATION(bytes) \
987 do { \
988 if (dst + (bytes) >= dst_end) \
989 { \
d311d28c 990 ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
df7492f9
KH
991 \
992 dst = alloc_destination (coding, more_bytes, dst); \
993 dst_end = coding->destination + coding->dst_bytes; \
994 } \
995 } while (0)
aa72b389 996
aa72b389 997
db274c7a
KH
/* Store multibyte form of the character C in P, and advance P to the
   end of the multibyte form.  This is like CHAR_STRING_ADVANCE but it
   never calls MAYBE_UNIFY_CHAR.

   C is evaluated several times, so it must not have side effects.
   Every use of C is parenthesized so that an argument containing a
   low-precedence operator (e.g. `hi | lo') is shifted and masked as a
   whole.  P must be a modifiable pointer to byte storage.  */

#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)		\
  do {							\
    if ((c) <= MAX_1_BYTE_CHAR)				\
      *(p)++ = (c);					\
    else if ((c) <= MAX_2_BYTE_CHAR)			\
      *(p)++ = (0xC0 | ((c) >> 6)),			\
	*(p)++ = (0x80 | ((c) & 0x3F));			\
    else if ((c) <= MAX_3_BYTE_CHAR)			\
      *(p)++ = (0xE0 | ((c) >> 12)),			\
	*(p)++ = (0x80 | (((c) >> 6) & 0x3F)),		\
	*(p)++ = (0x80 | ((c) & 0x3F));			\
    else if ((c) <= MAX_4_BYTE_CHAR)			\
      *(p)++ = (0xF0 | ((c) >> 18)),			\
	*(p)++ = (0x80 | (((c) >> 12) & 0x3F)),		\
	*(p)++ = (0x80 | (((c) >> 6) & 0x3F)),		\
	*(p)++ = (0x80 | ((c) & 0x3F));			\
    else if ((c) <= MAX_5_BYTE_CHAR)			\
      *(p)++ = 0xF8,					\
	*(p)++ = (0x80 | (((c) >> 18) & 0x0F)),		\
	*(p)++ = (0x80 | (((c) >> 12) & 0x3F)),		\
	*(p)++ = (0x80 | (((c) >> 6) & 0x3F)),		\
	*(p)++ = (0x80 | ((c) & 0x3F));			\
    else						\
      (p) += BYTE8_STRING ((c) - 0x3FFF80, p);		\
  } while (0)
1027
1028
1029/* Return the character code of character whose multibyte form is at
1030 P, and advance P to the end of the multibyte form. This is like
1031 STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR. */
1032
1033#define STRING_CHAR_ADVANCE_NO_UNIFY(p) \
1034 (!((p)[0] & 0x80) \
1035 ? *(p)++ \
1036 : ! ((p)[0] & 0x20) \
1037 ? ((p) += 2, \
1038 ((((p)[-2] & 0x1F) << 6) \
1039 | ((p)[-1] & 0x3F) \
1040 | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0))) \
1041 : ! ((p)[0] & 0x10) \
1042 ? ((p) += 3, \
1043 ((((p)[-3] & 0x0F) << 12) \
1044 | (((p)[-2] & 0x3F) << 6) \
1045 | ((p)[-1] & 0x3F))) \
1046 : ! ((p)[0] & 0x08) \
1047 ? ((p) += 4, \
1048 ((((p)[-4] & 0xF) << 18) \
1049 | (((p)[-3] & 0x3F) << 12) \
1050 | (((p)[-2] & 0x3F) << 6) \
1051 | ((p)[-1] & 0x3F))) \
1052 : ((p) += 5, \
1053 ((((p)[-4] & 0x3F) << 18) \
1054 | (((p)[-3] & 0x3F) << 12) \
1055 | (((p)[-2] & 0x3F) << 6) \
1056 | ((p)[-1] & 0x3F))))
1057
aa72b389 1058
5eb05ea3
KH
1059/* Update coding->source from coding->src_object, and return how many
1060 bytes coding->source was changed. */
1061
8f50130c 1062static ptrdiff_t
971de7fb 1063coding_set_source (struct coding_system *coding)
aa72b389 1064{
5eb05ea3
KH
1065 const unsigned char *orig = coding->source;
1066
df7492f9
KH
1067 if (BUFFERP (coding->src_object))
1068 {
2cb26057 1069 struct buffer *buf = XBUFFER (coding->src_object);
aa72b389 1070
df7492f9 1071 if (coding->src_pos < 0)
2cb26057 1072 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
df7492f9 1073 else
2cb26057 1074 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
aa72b389 1075 }
df7492f9 1076 else if (STRINGP (coding->src_object))
aa72b389 1077 {
8f924df7 1078 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
aa72b389 1079 }
df7492f9 1080 else
f38b440c
PE
1081 {
1082 /* Otherwise, the source is C string and is never relocated
1083 automatically. Thus we don't have to update anything. */
1084 }
5eb05ea3 1085 return coding->source - orig;
df7492f9 1086}
aa72b389 1087
5eb05ea3
KH
1088
1089/* Update coding->destination from coding->dst_object, and return how
1090 many bytes coding->destination was changed. */
1091
8f50130c 1092static ptrdiff_t
971de7fb 1093coding_set_destination (struct coding_system *coding)
df7492f9 1094{
5eb05ea3
KH
1095 const unsigned char *orig = coding->destination;
1096
df7492f9 1097 if (BUFFERP (coding->dst_object))
aa72b389 1098 {
a0241d01 1099 if (BUFFERP (coding->src_object) && coding->src_pos < 0)
aa72b389 1100 {
13818c30 1101 coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
28f67a95
KH
1102 coding->dst_bytes = (GAP_END_ADDR
1103 - (coding->src_bytes - coding->consumed)
1104 - coding->destination);
aa72b389 1105 }
df7492f9 1106 else
28f67a95
KH
1107 {
1108 /* We are sure that coding->dst_pos_byte is before the gap
1109 of the buffer. */
1110 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
13818c30 1111 + coding->dst_pos_byte - BEG_BYTE);
28f67a95
KH
1112 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1113 - coding->destination);
1114 }
df7492f9
KH
1115 }
1116 else
f38b440c
PE
1117 {
1118 /* Otherwise, the destination is C string and is never relocated
1119 automatically. Thus we don't have to update anything. */
1120 }
5eb05ea3 1121 return coding->destination - orig;
df7492f9
KH
1122}
1123
1124
1125static void
d311d28c 1126coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
df7492f9 1127{
c9d624c6 1128 if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
d1f3d2af 1129 string_overflow ();
df7492f9
KH
1130 coding->destination = (unsigned char *) xrealloc (coding->destination,
1131 coding->dst_bytes + bytes);
1132 coding->dst_bytes += bytes;
1133}
1134
1135static void
cf84bb53 1136coding_alloc_by_making_gap (struct coding_system *coding,
d311d28c 1137 ptrdiff_t gap_head_used, ptrdiff_t bytes)
df7492f9 1138{
db274c7a 1139 if (EQ (coding->src_object, coding->dst_object))
df7492f9 1140 {
db274c7a
KH
1141 /* The gap may contain the produced data at the head and not-yet
1142 consumed data at the tail. To preserve those data, we at
1143 first make the gap size to zero, then increase the gap
1144 size. */
d311d28c 1145 ptrdiff_t add = GAP_SIZE;
db274c7a
KH
1146
1147 GPT += gap_head_used, GPT_BYTE += gap_head_used;
1148 GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
df7492f9
KH
1149 make_gap (bytes);
1150 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
db274c7a 1151 GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
df7492f9 1152 }
730fff51 1153 else
df7492f9 1154 {
2c78b7e1
KH
1155 Lisp_Object this_buffer;
1156
1157 this_buffer = Fcurrent_buffer ();
df7492f9
KH
1158 set_buffer_internal (XBUFFER (coding->dst_object));
1159 make_gap (bytes);
1160 set_buffer_internal (XBUFFER (this_buffer));
aa72b389 1161 }
df7492f9 1162}
8f924df7 1163
df7492f9
KH
1164
1165static unsigned char *
d311d28c 1166alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
cf84bb53 1167 unsigned char *dst)
df7492f9 1168{
d311d28c 1169 ptrdiff_t offset = dst - coding->destination;
df7492f9
KH
1170
1171 if (BUFFERP (coding->dst_object))
db274c7a
KH
1172 {
1173 struct buffer *buf = XBUFFER (coding->dst_object);
1174
1175 coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1176 }
aa72b389 1177 else
df7492f9 1178 coding_alloc_by_realloc (coding, nbytes);
df7492f9
KH
1179 coding_set_destination (coding);
1180 dst = coding->destination + offset;
1181 return dst;
1182}
aa72b389 1183
ff0dacd7
KH
1184/** Macros for annotations. */
1185
ff0dacd7
KH
1186/* An annotation data is stored in the array coding->charbuf in this
1187 format:
69a80ea3 1188 [ -LENGTH ANNOTATION_MASK NCHARS ... ]
ff0dacd7
KH
1189 LENGTH is the number of elements in the annotation.
1190 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
69a80ea3 1191 NCHARS is the number of characters in the text annotated.
ff0dacd7
KH
1192
1193 The format of the following elements depends on ANNOTATION_MASK.
1194
1195 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1196 follow:
e951386e
KH
1197 ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1198
1199 NBYTES is the number of bytes specified in the header part of
1200 old-style emacs-mule encoding, or 0 for the other kind of
1201 composition.
1202
ff0dacd7 1203 METHOD is one of enum composition_method.
e951386e 1204
ad1746f5 1205 Optional COMPOSITION-COMPONENTS are characters and composition
ff0dacd7
KH
1206 rules.
1207
1208 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
e951386e
KH
1209 follows.
1210
1211 If ANNOTATION_MASK is 0, this annotation is just a space holder to
1212 recover from an invalid annotation, and should be skipped by
1213 produce_annotation. */
1214
1215/* Maximum length of the header of annotation data. */
1216#define MAX_ANNOTATION_LENGTH 5
ff0dacd7 1217
/* Store the three-element annotation header [ -LEN MASK NCHARS ] at
   BUF, advance BUF past it, and mark the coding system as annotated.
   BUF is evaluated three times and must be a modifiable pointer.  A
   variable named `coding' must be in scope at the point of use.

   The expansion deliberately does not end in a semicolon, so that the
   macro behaves as a single statement and can be used in an unbraced
   `if ... else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)	\
  do {							\
    *(buf)++ = -(len);					\
    *(buf)++ = (mask);					\
    *(buf)++ = (nchars);				\
    coding->annotated = 1;				\
  } while (0)
1225
/* Store a composition annotation at BUF and advance BUF past it: the
   annotation header (length 5, CODING_ANNOTATE_COMPOSITION_MASK,
   NCHARS) followed by NBYTES and METHOD.  BUF is parenthesized at
   every use, matching ADD_ANNOTATION_DATA, so that an lvalue
   expression such as `*pp' advances the intended pointer.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)		    \
  do {									    \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);						    \
    *(buf)++ = (method);						    \
  } while (0)
1232
1233
69a80ea3
KH
/* Store a charset annotation at BUF and advance BUF past it: the
   annotation header (length 4, CODING_ANNOTATE_CHARSET_MASK, NCHARS)
   followed by the charset ID.  BUF is parenthesized at every use,
   matching ADD_ANNOTATION_DATA, so that an lvalue expression such as
   `*pp' advances the intended pointer.  */

#define ADD_CHARSET_DATA(buf, nchars, id)				\
  do {									\
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars);	\
    *(buf)++ = (id);							\
  } while (0)
1239
df7492f9
KH
1240\f
1241/*** 2. Emacs' internal format (emacs-utf-8) ***/
1242
1243
1244
1245\f
1246/*** 3. UTF-8 ***/
1247
1248/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
1249 Check if a text is encoded in UTF-8. If it is, return 1, else
1250 return 0. */
df7492f9
KH
1251
1252#define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1253#define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1254#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1255#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1256#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1257#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1258
a470d443
KH
1259#define UTF_8_BOM_1 0xEF
1260#define UTF_8_BOM_2 0xBB
1261#define UTF_8_BOM_3 0xBF
1262
df7492f9 1263static int
cf84bb53
JB
1264detect_coding_utf_8 (struct coding_system *coding,
1265 struct coding_detection_info *detect_info)
df7492f9 1266{
065e3595 1267 const unsigned char *src = coding->source, *src_base;
8f924df7 1268 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9 1269 int multibytep = coding->src_multibyte;
d311d28c 1270 ptrdiff_t consumed_chars = 0;
a470d443 1271 int bom_found = 0;
df7492f9
KH
1272 int found = 0;
1273
ff0dacd7 1274 detect_info->checked |= CATEGORY_MASK_UTF_8;
df7492f9
KH
1275 /* A coding system of this category is always ASCII compatible. */
1276 src += coding->head_ascii;
1277
1278 while (1)
aa72b389 1279 {
df7492f9 1280 int c, c1, c2, c3, c4;
aa72b389 1281
065e3595 1282 src_base = src;
df7492f9 1283 ONE_MORE_BYTE (c);
065e3595 1284 if (c < 0 || UTF_8_1_OCTET_P (c))
df7492f9
KH
1285 continue;
1286 ONE_MORE_BYTE (c1);
065e3595 1287 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
df7492f9
KH
1288 break;
1289 if (UTF_8_2_OCTET_LEADING_P (c))
aa72b389 1290 {
a470d443 1291 found = 1;
df7492f9 1292 continue;
aa72b389 1293 }
df7492f9 1294 ONE_MORE_BYTE (c2);
065e3595 1295 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
df7492f9
KH
1296 break;
1297 if (UTF_8_3_OCTET_LEADING_P (c))
aa72b389 1298 {
a470d443
KH
1299 found = 1;
1300 if (src_base == coding->source
1301 && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1302 bom_found = 1;
df7492f9 1303 continue;
aa72b389 1304 }
df7492f9 1305 ONE_MORE_BYTE (c3);
065e3595 1306 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
df7492f9
KH
1307 break;
1308 if (UTF_8_4_OCTET_LEADING_P (c))
aa72b389 1309 {
a470d443 1310 found = 1;
df7492f9
KH
1311 continue;
1312 }
1313 ONE_MORE_BYTE (c4);
065e3595 1314 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
df7492f9
KH
1315 break;
1316 if (UTF_8_5_OCTET_LEADING_P (c))
1317 {
a470d443 1318 found = 1;
df7492f9
KH
1319 continue;
1320 }
1321 break;
aa72b389 1322 }
ff0dacd7 1323 detect_info->rejected |= CATEGORY_MASK_UTF_8;
df7492f9 1324 return 0;
aa72b389 1325
df7492f9 1326 no_more_source:
065e3595 1327 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
aa72b389 1328 {
ff0dacd7 1329 detect_info->rejected |= CATEGORY_MASK_UTF_8;
89528eb3 1330 return 0;
aa72b389 1331 }
a470d443
KH
1332 if (bom_found)
1333 {
1334 /* The first character 0xFFFE doesn't necessarily mean a BOM. */
1335 detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1336 }
1337 else
1338 {
1339 detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
0e17387a
KH
1340 if (found)
1341 detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
a470d443 1342 }
ff0dacd7 1343 return 1;
aa72b389
KH
1344}
1345
4ed46869 1346
b73bfc1c 1347static void
971de7fb 1348decode_coding_utf_8 (struct coding_system *coding)
b73bfc1c 1349{
8f924df7
KH
1350 const unsigned char *src = coding->source + coding->consumed;
1351 const unsigned char *src_end = coding->source + coding->src_bytes;
1352 const unsigned char *src_base;
69a80ea3
KH
1353 int *charbuf = coding->charbuf + coding->charbuf_used;
1354 int *charbuf_end = coding->charbuf + coding->charbuf_size;
d311d28c 1355 ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
df7492f9 1356 int multibytep = coding->src_multibyte;
a470d443 1357 enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
2735d060 1358 int eol_dos =
0a9564cb 1359 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 1360 int byte_after_cr = -1;
4ed46869 1361
a470d443
KH
1362 if (bom != utf_without_bom)
1363 {
1364 int c1, c2, c3;
1365
1366 src_base = src;
1367 ONE_MORE_BYTE (c1);
1368 if (! UTF_8_3_OCTET_LEADING_P (c1))
1369 src = src_base;
1370 else
1371 {
159bd5a2 1372 ONE_MORE_BYTE (c2);
a470d443
KH
1373 if (! UTF_8_EXTRA_OCTET_P (c2))
1374 src = src_base;
1375 else
1376 {
159bd5a2 1377 ONE_MORE_BYTE (c3);
a470d443
KH
1378 if (! UTF_8_EXTRA_OCTET_P (c3))
1379 src = src_base;
1380 else
1381 {
1382 if ((c1 != UTF_8_BOM_1)
1383 || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1384 src = src_base;
1385 else
1386 CODING_UTF_8_BOM (coding) = utf_without_bom;
1387 }
1388 }
1389 }
1390 }
1391 CODING_UTF_8_BOM (coding) = utf_without_bom;
1392
df7492f9 1393 while (1)
b73bfc1c 1394 {
df7492f9 1395 int c, c1, c2, c3, c4, c5;
ec6d2bb8 1396
df7492f9
KH
1397 src_base = src;
1398 consumed_chars_base = consumed_chars;
4af310db 1399
df7492f9 1400 if (charbuf >= charbuf_end)
b71f6f73
KH
1401 {
1402 if (byte_after_cr >= 0)
1403 src_base--;
1404 break;
1405 }
df7492f9 1406
119852e7
KH
1407 if (byte_after_cr >= 0)
1408 c1 = byte_after_cr, byte_after_cr = -1;
1409 else
1410 ONE_MORE_BYTE (c1);
065e3595
KH
1411 if (c1 < 0)
1412 {
1413 c = - c1;
1414 }
1a4990fb 1415 else if (UTF_8_1_OCTET_P (c1))
df7492f9 1416 {
2735d060 1417 if (eol_dos && c1 == '\r')
119852e7 1418 ONE_MORE_BYTE (byte_after_cr);
df7492f9 1419 c = c1;
4af310db 1420 }
df7492f9 1421 else
4af310db 1422 {
df7492f9 1423 ONE_MORE_BYTE (c2);
065e3595 1424 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
df7492f9
KH
1425 goto invalid_code;
1426 if (UTF_8_2_OCTET_LEADING_P (c1))
4af310db 1427 {
b0edb2c5
DL
1428 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1429 /* Reject overlong sequences here and below. Encoders
1430 producing them are incorrect, they can be misleading,
1431 and they mess up read/write invariance. */
1432 if (c < 128)
1433 goto invalid_code;
4af310db 1434 }
df7492f9 1435 else
aa72b389 1436 {
df7492f9 1437 ONE_MORE_BYTE (c3);
065e3595 1438 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
df7492f9
KH
1439 goto invalid_code;
1440 if (UTF_8_3_OCTET_LEADING_P (c1))
b0edb2c5
DL
1441 {
1442 c = (((c1 & 0xF) << 12)
1443 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
72fe1301
DL
1444 if (c < 0x800
1445 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
b0edb2c5
DL
1446 goto invalid_code;
1447 }
df7492f9
KH
1448 else
1449 {
1450 ONE_MORE_BYTE (c4);
065e3595 1451 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
df7492f9
KH
1452 goto invalid_code;
1453 if (UTF_8_4_OCTET_LEADING_P (c1))
b0edb2c5 1454 {
df7492f9
KH
1455 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1456 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
b0edb2c5
DL
1457 if (c < 0x10000)
1458 goto invalid_code;
1459 }
df7492f9
KH
1460 else
1461 {
1462 ONE_MORE_BYTE (c5);
065e3595 1463 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
df7492f9
KH
1464 goto invalid_code;
1465 if (UTF_8_5_OCTET_LEADING_P (c1))
1466 {
1467 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1468 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1469 | (c5 & 0x3F));
b0edb2c5 1470 if ((c > MAX_CHAR) || (c < 0x200000))
df7492f9
KH
1471 goto invalid_code;
1472 }
1473 else
1474 goto invalid_code;
1475 }
1476 }
aa72b389 1477 }
b73bfc1c 1478 }
df7492f9
KH
1479
1480 *charbuf++ = c;
1481 continue;
1482
1483 invalid_code:
1484 src = src_base;
1485 consumed_chars = consumed_chars_base;
1486 ONE_MORE_BYTE (c);
1487 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1488 coding->errors++;
aa72b389
KH
1489 }
1490
df7492f9
KH
1491 no_more_source:
1492 coding->consumed_char += consumed_chars_base;
1493 coding->consumed = src_base - coding->source;
1494 coding->charbuf_used = charbuf - coding->charbuf;
1495}
1496
1497
1498static int
971de7fb 1499encode_coding_utf_8 (struct coding_system *coding)
df7492f9
KH
1500{
1501 int multibytep = coding->dst_multibyte;
1502 int *charbuf = coding->charbuf;
1503 int *charbuf_end = charbuf + coding->charbuf_used;
1504 unsigned char *dst = coding->destination + coding->produced;
1505 unsigned char *dst_end = coding->destination + coding->dst_bytes;
d311d28c 1506 ptrdiff_t produced_chars = 0;
df7492f9
KH
1507 int c;
1508
a470d443
KH
1509 if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1510 {
1511 ASSURE_DESTINATION (3);
1512 EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1513 CODING_UTF_8_BOM (coding) = utf_without_bom;
1514 }
1515
df7492f9 1516 if (multibytep)
aa72b389 1517 {
df7492f9
KH
1518 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1519
1520 while (charbuf < charbuf_end)
b73bfc1c 1521 {
df7492f9 1522 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
8f924df7 1523
df7492f9
KH
1524 ASSURE_DESTINATION (safe_room);
1525 c = *charbuf++;
28f67a95
KH
1526 if (CHAR_BYTE8_P (c))
1527 {
1528 c = CHAR_TO_BYTE8 (c);
1529 EMIT_ONE_BYTE (c);
1530 }
1531 else
1532 {
db274c7a 1533 CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
28f67a95
KH
1534 for (p = str; p < pend; p++)
1535 EMIT_ONE_BYTE (*p);
1536 }
b73bfc1c 1537 }
aa72b389 1538 }
df7492f9
KH
1539 else
1540 {
1541 int safe_room = MAX_MULTIBYTE_LENGTH;
1542
1543 while (charbuf < charbuf_end)
b73bfc1c 1544 {
df7492f9
KH
1545 ASSURE_DESTINATION (safe_room);
1546 c = *charbuf++;
f03caae0
KH
1547 if (CHAR_BYTE8_P (c))
1548 *dst++ = CHAR_TO_BYTE8 (c);
1549 else
db274c7a 1550 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
df7492f9 1551 produced_chars++;
4ed46869
KH
1552 }
1553 }
065e3595 1554 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
1555 coding->produced_char += produced_chars;
1556 coding->produced = dst - coding->destination;
1557 return 0;
4ed46869
KH
1558}
1559
b73bfc1c 1560
df7492f9 1561/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
1562 Check if a text is encoded in one of UTF-16 based coding systems.
1563 If it is, return 1, else return 0. */
aa72b389 1564
df7492f9
KH
1565#define UTF_16_HIGH_SURROGATE_P(val) \
1566 (((val) & 0xFC00) == 0xD800)
1567
1568#define UTF_16_LOW_SURROGATE_P(val) \
1569 (((val) & 0xFC00) == 0xDC00)
93dec019 1570
aa72b389 1571
df7492f9 1572static int
cf84bb53
JB
1573detect_coding_utf_16 (struct coding_system *coding,
1574 struct coding_detection_info *detect_info)
aa72b389 1575{
ef1b0ba7 1576 const unsigned char *src = coding->source;
8f924df7 1577 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9 1578 int multibytep = coding->src_multibyte;
df7492f9 1579 int c1, c2;
aa72b389 1580
ff0dacd7 1581 detect_info->checked |= CATEGORY_MASK_UTF_16;
ff0dacd7 1582 if (coding->mode & CODING_MODE_LAST_BLOCK
24a73b0a 1583 && (coding->src_chars & 1))
ff0dacd7
KH
1584 {
1585 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1586 return 0;
1587 }
24a73b0a 1588
f56a4450 1589 TWO_MORE_BYTES (c1, c2);
df7492f9 1590 if ((c1 == 0xFF) && (c2 == 0xFE))
aa72b389 1591 {
b49a1807
KH
1592 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1593 | CATEGORY_MASK_UTF_16_AUTO);
24a73b0a
KH
1594 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1595 | CATEGORY_MASK_UTF_16_BE_NOSIG
1596 | CATEGORY_MASK_UTF_16_LE_NOSIG);
aa72b389 1597 }
df7492f9 1598 else if ((c1 == 0xFE) && (c2 == 0xFF))
ff0dacd7 1599 {
b49a1807
KH
1600 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1601 | CATEGORY_MASK_UTF_16_AUTO);
24a73b0a
KH
1602 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1603 | CATEGORY_MASK_UTF_16_BE_NOSIG
1604 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1605 }
220eeac9 1606 else if (c2 < 0)
f56a4450
KH
1607 {
1608 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1609 return 0;
1610 }
2f3cbb32 1611 else
24a73b0a 1612 {
2f3cbb32
KH
1613 /* We check the dispersion of Eth and Oth bytes where E is even and
1614 O is odd. If both are high, we assume binary data.*/
1615 unsigned char e[256], o[256];
1616 unsigned e_num = 1, o_num = 1;
1617
1618 memset (e, 0, 256);
1619 memset (o, 0, 256);
1620 e[c1] = 1;
1621 o[c2] = 1;
1622
cc13543e
KH
1623 detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1624 |CATEGORY_MASK_UTF_16_BE
1625 | CATEGORY_MASK_UTF_16_LE);
2f3cbb32 1626
7f1faf1c
KH
1627 while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1628 != CATEGORY_MASK_UTF_16)
2f3cbb32 1629 {
f56a4450 1630 TWO_MORE_BYTES (c1, c2);
220eeac9 1631 if (c2 < 0)
f56a4450 1632 break;
2f3cbb32
KH
1633 if (! e[c1])
1634 {
1635 e[c1] = 1;
1636 e_num++;
cc13543e
KH
1637 if (e_num >= 128)
1638 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
2f3cbb32
KH
1639 }
1640 if (! o[c2])
1641 {
977b85f4 1642 o[c2] = 1;
2f3cbb32 1643 o_num++;
cc13543e
KH
1644 if (o_num >= 128)
1645 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
2f3cbb32
KH
1646 }
1647 }
2f3cbb32 1648 return 0;
ff0dacd7 1649 }
2f3cbb32 1650
df7492f9 1651 no_more_source:
ff0dacd7 1652 return 1;
df7492f9 1653}
aa72b389 1654
df7492f9 1655static void
971de7fb 1656decode_coding_utf_16 (struct coding_system *coding)
df7492f9 1657{
8f924df7
KH
1658 const unsigned char *src = coding->source + coding->consumed;
1659 const unsigned char *src_end = coding->source + coding->src_bytes;
1660 const unsigned char *src_base;
69a80ea3 1661 int *charbuf = coding->charbuf + coding->charbuf_used;
df80c7f0
KH
1662 /* We may produce at most 3 chars in one loop iteration. */
1663 int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
d311d28c 1664 ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
df7492f9 1665 int multibytep = coding->src_multibyte;
a470d443 1666 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
df7492f9
KH
1667 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1668 int surrogate = CODING_UTF_16_SURROGATE (coding);
2735d060 1669 int eol_dos =
0a9564cb 1670 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 1671 int byte_after_cr1 = -1, byte_after_cr2 = -1;
df7492f9 1672
a470d443 1673 if (bom == utf_with_bom)
aa72b389 1674 {
df7492f9 1675 int c, c1, c2;
4af310db 1676
aa72b389 1677 src_base = src;
df7492f9
KH
1678 ONE_MORE_BYTE (c1);
1679 ONE_MORE_BYTE (c2);
e19c3639 1680 c = (c1 << 8) | c2;
aa72b389 1681
b49a1807
KH
1682 if (endian == utf_16_big_endian
1683 ? c != 0xFEFF : c != 0xFFFE)
aa72b389 1684 {
b49a1807
KH
1685 /* The first two bytes are not BOM. Treat them as bytes
1686 for a normal character. */
1687 src = src_base;
1688 coding->errors++;
aa72b389 1689 }
a470d443 1690 CODING_UTF_16_BOM (coding) = utf_without_bom;
b49a1807 1691 }
a470d443 1692 else if (bom == utf_detect_bom)
b49a1807
KH
1693 {
1694 /* We have already tried to detect BOM and failed in
1695 detect_coding. */
a470d443 1696 CODING_UTF_16_BOM (coding) = utf_without_bom;
df7492f9 1697 }
aa72b389 1698
df7492f9
KH
1699 while (1)
1700 {
1701 int c, c1, c2;
1702
1703 src_base = src;
1704 consumed_chars_base = consumed_chars;
1705
df80c7f0 1706 if (charbuf >= charbuf_end)
b71f6f73
KH
1707 {
1708 if (byte_after_cr1 >= 0)
1709 src_base -= 2;
1710 break;
1711 }
df7492f9 1712
119852e7
KH
1713 if (byte_after_cr1 >= 0)
1714 c1 = byte_after_cr1, byte_after_cr1 = -1;
1715 else
1716 ONE_MORE_BYTE (c1);
065e3595
KH
1717 if (c1 < 0)
1718 {
1719 *charbuf++ = -c1;
1720 continue;
1721 }
119852e7
KH
1722 if (byte_after_cr2 >= 0)
1723 c2 = byte_after_cr2, byte_after_cr2 = -1;
1724 else
1725 ONE_MORE_BYTE (c2);
065e3595
KH
1726 if (c2 < 0)
1727 {
1728 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1729 *charbuf++ = -c2;
1730 continue;
1731 }
df7492f9 1732 c = (endian == utf_16_big_endian
e19c3639 1733 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
119852e7 1734
df7492f9 1735 if (surrogate)
fd3ae0b9 1736 {
df7492f9 1737 if (! UTF_16_LOW_SURROGATE_P (c))
fd3ae0b9 1738 {
df7492f9
KH
1739 if (endian == utf_16_big_endian)
1740 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1741 else
1742 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1743 *charbuf++ = c1;
1744 *charbuf++ = c2;
1745 coding->errors++;
1746 if (UTF_16_HIGH_SURROGATE_P (c))
1747 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
fd3ae0b9 1748 else
df7492f9 1749 *charbuf++ = c;
fd3ae0b9
KH
1750 }
1751 else
df7492f9
KH
1752 {
1753 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1754 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
29f7ffd0 1755 *charbuf++ = 0x10000 + c;
df7492f9 1756 }
fd3ae0b9 1757 }
aa72b389 1758 else
df7492f9
KH
1759 {
1760 if (UTF_16_HIGH_SURROGATE_P (c))
1761 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1762 else
119852e7 1763 {
2735d060 1764 if (eol_dos && c == '\r')
119852e7
KH
1765 {
1766 ONE_MORE_BYTE (byte_after_cr1);
1767 ONE_MORE_BYTE (byte_after_cr2);
1768 }
1769 *charbuf++ = c;
1770 }
8f924df7 1771 }
aa72b389 1772 }
df7492f9
KH
1773
1774 no_more_source:
1775 coding->consumed_char += consumed_chars_base;
1776 coding->consumed = src_base - coding->source;
1777 coding->charbuf_used = charbuf - coding->charbuf;
aa72b389 1778}
b73bfc1c 1779
/* Encode the code points stored in CODING->charbuf as UTF-16 and
   append the bytes to CODING->destination.

   A byte order mark is written first unless the coding system's BOM
   setting is utf_without_bom; the setting is then cleared so that the
   mark is written only once.  Code points above MAX_UNICODE_CHAR are
   replaced by CODING->default_char.  Code points below 0x10000 are
   written as one 16-bit unit, all others as a surrogate pair.

   Updates CODING->produced and CODING->produced_char and always
   returns 0.

   NOTE(review): multibytep, dst_end and produced_chars are never
   referenced directly in this body; presumably the ASSURE_DESTINATION
   and EMIT_* macros use them (and dst) by name -- confirm before
   renaming any local.  */
df7492f9 1780static int
971de7fb 1781encode_coding_utf_16 (struct coding_system *coding)
df7492f9
KH
1782{
1783 int multibytep = coding->dst_multibyte;
1784 int *charbuf = coding->charbuf;
1785 int *charbuf_end = charbuf + coding->charbuf_used;
1786 unsigned char *dst = coding->destination + coding->produced;
1787 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1788 int safe_room = 8;
a470d443 1789 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
df7492f9 1790 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
d311d28c 1791 ptrdiff_t produced_chars = 0;
df7492f9 1792 int c;
4ed46869 1793
/* Emit the byte order mark, then remember that it has been written.  */
a470d443 1794 if (bom != utf_without_bom)
df7492f9
KH
1795 {
1796 ASSURE_DESTINATION (safe_room);
1797 if (big_endian)
df7492f9 1798 EMIT_TWO_BYTES (0xFE, 0xFF);
880cf180
KH
1799 else
1800 EMIT_TWO_BYTES (0xFF, 0xFE);
a470d443 1801 CODING_UTF_16_BOM (coding) = utf_without_bom;
df7492f9
KH
1802 }
1803
1804 while (charbuf < charbuf_end)
1805 {
1806 ASSURE_DESTINATION (safe_room);
1807 c = *charbuf++;
60afa08d 1808 if (c > MAX_UNICODE_CHAR)
e19c3639 1809 c = coding->default_char;
df7492f9
KH
1810
/* One 16-bit unit suffices below 0x10000.  */
1811 if (c < 0x10000)
1812 {
1813 if (big_endian)
1814 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1815 else
1816 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1817 }
1818 else
1819 {
/* Split into a high (0xD800 based) and a low (0xDC00 based)
   surrogate carrying 10 bits each.  */
1820 int c1, c2;
1821
1822 c -= 0x10000;
1823 c1 = (c >> 10) + 0xD800;
1824 c2 = (c & 0x3FF) + 0xDC00;
1825 if (big_endian)
1826 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1827 else
1828 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1829 }
1830 }
065e3595 1831 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
1832 coding->produced = dst - coding->destination;
1833 coding->produced_char += produced_chars;
1834 return 0;
1835}
1836
1837\f
1838/*** 6. Old Emacs' internal format (emacs-mule) ***/
1839
1840/* Emacs' internal format for representation of multiple character
1841 sets is a kind of multi-byte encoding, i.e. characters are
1842 represented by variable-length sequences of one-byte codes.
1843
1844 ASCII characters and control characters (e.g. `tab', `newline') are
1845 represented by one-byte sequences which are their ASCII codes, in
1846 the range 0x00 through 0x7F.
1847
1848 8-bit characters of the range 0x80..0x9F are represented by
1849 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1850 code + 0x20).
1851
1852 8-bit characters of the range 0xA0..0xFF are represented by
1853 one-byte sequences which are their 8-bit code.
1854
1855 The other characters are represented by a sequence of `base
1856 leading-code', optional `extended leading-code', and one or two
1857 `position-code's. The length of the sequence is determined by the
1858 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1859 whereas extended leading-code and position-code take the range 0xA0
1860 through 0xFF. See `charset.h' for more details about leading-code
1861 and position-code.
1862
1863 --- CODE RANGE of Emacs' internal format ---
1864 character set range
1865 ------------- -----
1866 ascii 0x00..0x7F
1867 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1868 eight-bit-graphic 0xA0..0xFF
1869 ELSE 0x81..0x9D + [0xA0..0xFF]+
1870 ---------------------------------------------
1871
1872 As this is the internal character representation, the format is
1873 usually not used externally (i.e. in a file or in a data sent to a
1874 process). But, it is possible to have a text externally in this
1875 format (i.e. by encoding by the coding system `emacs-mule').
1876
1877 In that case, a sequence of one-byte codes has a slightly different
1878 form.
1879
1880 At first, all characters in eight-bit-control are represented by
1881 one-byte sequences which are their 8-bit code.
1882
1883 Next, character composition data are represented by the byte
1884 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1885 where,
e951386e 1886 METHOD is 0xF2 plus one of the composition methods (enum
df7492f9
KH
1887 composition_method),
1888
1889 BYTES is 0xA0 plus a byte length of this composition data,
1890
e951386e 1891 CHARS is 0xA0 plus a number of characters composed by this
df7492f9
KH
1892 data,
1893
ad1746f5 1894 COMPONENTs are characters of multibyte form or composition
df7492f9
KH
1895 rules encoded by two-byte of ASCII codes.
1896
1897 In addition, for backward compatibility, the following formats are
1898 also recognized as composition data on decoding.
1899
1900 0x80 MSEQ ...
1901 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1902
1903 Here,
1904 MSEQ is a multibyte form, but in this special format:
1905 ASCII: 0xA0 ASCII_CODE+0x80,
1906 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1907 RULE is a one byte code of the range 0xA0..0xF0 that
1908 represents a composition rule.
1909 */
1910
/* Table indexed by a byte value.  The decoders in this file read
   emacs_mule_bytes[C] as the total byte length (1 to 4) of the
   emacs-mule sequence that starts with byte C.
   NOTE(review): the table is filled in elsewhere -- confirm where.  */
1911char emacs_mule_bytes[256];
1912
e951386e
KH
1913
1914/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1915 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1916 else return 0. */
1917
/* Scan CODING's source, starting after the leading ASCII part
   (CODING->head_ascii bytes), and decide whether it can be emacs-mule
   text.

   DETECT_INFO->checked always gets CATEGORY_MASK_EMACS_MULE.  When an
   inconsistency is found the category is added to
   DETECT_INFO->rejected and 0 is returned.  When the source runs out
   without an inconsistency, FOUND (non-zero only if a multibyte or
   composition sequence was seen) is merged into DETECT_INFO->found
   and 1 is returned.

   NOTE(review): the label no_more_source is not referenced explicitly
   here; presumably ONE_MORE_BYTE jumps to it when the source is
   exhausted, and uses src, src_end, multibytep and consumed_chars by
   name -- confirm.  */
1918static int
cf84bb53
JB
1919detect_coding_emacs_mule (struct coding_system *coding,
1920 struct coding_detection_info *detect_info)
e951386e
KH
1921{
1922 const unsigned char *src = coding->source, *src_base;
1923 const unsigned char *src_end = coding->source + coding->src_bytes;
1924 int multibytep = coding->src_multibyte;
d311d28c 1925 ptrdiff_t consumed_chars = 0;
e951386e
KH
1926 int c;
1927 int found = 0;
1928
1929 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1930 /* A coding system of this category is always ASCII compatible. */
1931 src += coding->head_ascii;
1932
1933 while (1)
1934 {
/* Remember where this sequence starts so that a sequence cut off
   by the end of the source can be recognized at no_more_source.  */
1935 src_base = src;
1936 ONE_MORE_BYTE (c);
1937 if (c < 0)
1938 continue;
1939 if (c == 0x80)
1940 {
1941 /* Perhaps the start of composite character. We simply skip
1942 it because analyzing it is too heavy for detecting. But,
1943 at least, we check that the composite character
1944 consists of more than 4 bytes. */
2735d060 1945 const unsigned char *src_start;
e951386e
KH
1946
1947 repeat:
2735d060 1948 src_start = src;
e951386e
KH
1949 do
1950 {
1951 ONE_MORE_BYTE (c);
1952 }
1953 while (c >= 0xA0);
1954
2735d060 1955 if (src - src_start <= 4)
e951386e
KH
1956 break;
1957 found = CATEGORY_MASK_EMACS_MULE;
/* The byte that ended the run may itself start another
   composition.  */
1958 if (c == 0x80)
1959 goto repeat;
1960 }
1961
1962 if (c < 0x80)
1963 {
/* ESC, SI and SO reject the emacs-mule category.  */
1964 if (c < 0x20
1965 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1966 break;
1967 }
1968 else
1969 {
/* C is a leading byte; every following byte of the sequence
   must be 0xA0 or greater.  */
396475b7 1970 int more_bytes = emacs_mule_bytes[c] - 1;
e951386e
KH
1971
1972 while (more_bytes > 0)
1973 {
1974 ONE_MORE_BYTE (c);
1975 if (c < 0xA0)
1976 {
1977 src--; /* Unread the last byte. */
1978 break;
1979 }
1980 more_bytes--;
1981 }
1982 if (more_bytes != 0)
1983 break;
1984 found = CATEGORY_MASK_EMACS_MULE;
1985 }
1986 }
/* Reached only by `break': the text is not emacs-mule.  */
1987 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1988 return 0;
1989
1990 no_more_source:
/* A sequence left incomplete in the last block is an error.  */
1991 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1992 {
1993 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1994 return 0;
1995 }
1996 detect_info->found |= found;
1997 return 1;
1998}
1999
2000
2001/* Parse emacs-mule multibyte sequence at SRC and return the decoded
2002 character. If CMP_STATUS indicates that we must expect MSEQ or
2003 RULE described above, decode it and return the negative value of
685ebdc8 2004 the decoded character or rule. If an invalid byte is found, return
e951386e
KH
2005 -1. If SRC is too short, return -2. */
2006
/* Decode one emacs-mule sequence starting at SRC.

   On success, store the number of bytes read in *NBYTES and the
   consumed character count in *NCHARS, store the charset id in *ID
   when ID is non-null, and return the decoded character.  The
   character is returned negated when it was read as an old-form
   composition component (MSEQ).  When CMP_STATUS is in an old-form
   composition but not in state COMPOSING_CHAR, a byte >= 0xA0 is
   returned negated as is, with *NBYTES and *NCHARS set but *ID left
   untouched.

   Return -1 for an invalid sequence and -2 when the source ends in
   the middle of a sequence.

   NOTE(review): no_more_source is never targeted explicitly below;
   presumably ONE_MORE_BYTE jumps there and uses src, src_end,
   multibytep and consumed_chars by name -- confirm.  */
e2f1bab9 2007static int
cf84bb53
JB
2008emacs_mule_char (struct coding_system *coding, const unsigned char *src,
2009 int *nbytes, int *nchars, int *id,
2010 struct composition_status *cmp_status)
df7492f9 2011{
8f924df7
KH
2012 const unsigned char *src_end = coding->source + coding->src_bytes;
2013 const unsigned char *src_base = src;
df7492f9 2014 int multibytep = coding->src_multibyte;
2735d060 2015 int charset_ID;
df7492f9
KH
2016 unsigned code;
2017 int c;
2018 int consumed_chars = 0;
e951386e 2019 int mseq_found = 0;
df7492f9
KH
2020
2021 ONE_MORE_BYTE (c);
065e3595 2022 if (c < 0)
df7492f9 2023 {
/* A negative value from ONE_MORE_BYTE is returned negated back,
   tagged with the charset id emacs_mule_charset[0].  */
065e3595 2024 c = -c;
2735d060 2025 charset_ID = emacs_mule_charset[0];
065e3595
KH
2026 }
2027 else
2028 {
4d41e8b7
KH
2029 if (c >= 0xA0)
2030 {
/* A first byte >= 0xA0 is valid only inside an old-form
   composition.  */
e951386e
KH
2031 if (cmp_status->state != COMPOSING_NO
2032 && cmp_status->old_form)
4d41e8b7 2033 {
e951386e
KH
2034 if (cmp_status->state == COMPOSING_CHAR)
2035 {
/* MSEQ: 0xA0 followed by ASCII_CODE+0x80, or a
   leading code shifted up by 0x20.  */
2036 if (c == 0xA0)
2037 {
2038 ONE_MORE_BYTE (c);
2039 c -= 0x80;
2040 if (c < 0)
2041 goto invalid_code;
2042 }
2043 else
2044 c -= 0x20;
2045 mseq_found = 1;
2046 }
2047 else
2048 {
/* Not expecting a character: hand the raw byte
   back negated.  */
2049 *nbytes = src - src_base;
2050 *nchars = consumed_chars;
2051 return -c;
2052 }
4d41e8b7
KH
2053 }
2054 else
e951386e 2055 goto invalid_code;
4d41e8b7
KH
2056 }
2057
/* Dispatch on the total byte length of the sequence led by C.  */
065e3595 2058 switch (emacs_mule_bytes[c])
b73bfc1c 2059 {
065e3595 2060 case 2:
/* Leading code + one position code.  */
2735d060 2061 if ((charset_ID = emacs_mule_charset[c]) < 0)
df7492f9
KH
2062 goto invalid_code;
2063 ONE_MORE_BYTE (c);
9ffd559c 2064 if (c < 0xA0)
065e3595 2065 goto invalid_code;
df7492f9 2066 code = c & 0x7F;
065e3595
KH
2067 break;
2068
2069 case 3:
/* Either private leading code + extended leading code + one
   position code, or leading code + two position codes.  */
2070 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2071 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2072 {
2073 ONE_MORE_BYTE (c);
2735d060 2074 if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
065e3595
KH
2075 goto invalid_code;
2076 ONE_MORE_BYTE (c);
9ffd559c 2077 if (c < 0xA0)
065e3595
KH
2078 goto invalid_code;
2079 code = c & 0x7F;
2080 }
2081 else
2082 {
2735d060 2083 if ((charset_ID = emacs_mule_charset[c]) < 0)
065e3595
KH
2084 goto invalid_code;
2085 ONE_MORE_BYTE (c);
9ffd559c 2086 if (c < 0xA0)
065e3595
KH
2087 goto invalid_code;
2088 code = (c & 0x7F) << 8;
2089 ONE_MORE_BYTE (c);
9ffd559c 2090 if (c < 0xA0)
065e3595
KH
2091 goto invalid_code;
2092 code |= c & 0x7F;
2093 }
2094 break;
2095
2096 case 4:
/* The second byte selects the charset; two position codes
   follow.  */
2097 ONE_MORE_BYTE (c);
2735d060 2098 if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
df7492f9
KH
2099 goto invalid_code;
2100 ONE_MORE_BYTE (c);
9ffd559c 2101 if (c < 0xA0)
065e3595 2102 goto invalid_code;
781d7a48 2103 code = (c & 0x7F) << 8;
df7492f9 2104 ONE_MORE_BYTE (c);
9ffd559c 2105 if (c < 0xA0)
065e3595 2106 goto invalid_code;
df7492f9 2107 code |= c & 0x7F;
065e3595 2108 break;
df7492f9 2109
065e3595
KH
2110 case 1:
/* A single byte: ASCII or eight-bit.  */
2111 code = c;
2735d060 2112 charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
065e3595 2113 break;
df7492f9 2114
065e3595
KH
2115 default:
2116 abort ();
2117 }
b84ae584 2118 CODING_DECODE_CHAR (coding, src, src_base, src_end,
2735d060 2119 CHARSET_FROM_ID (charset_ID), code, c);
065e3595
KH
2120 if (c < 0)
2121 goto invalid_code;
df7492f9 2122 }
df7492f9
KH
2123 *nbytes = src - src_base;
2124 *nchars = consumed_chars;
ff0dacd7 2125 if (id)
2735d060 2126 *id = charset_ID;
e951386e 2127 return (mseq_found ? -c : c);
df7492f9
KH
2128
2129 no_more_source:
2130 return -2;
2131
2132 invalid_code:
2133 return -1;
2134}
2135
2136
e951386e 2137/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
df7492f9 2138
e951386e
KH
2139/* Handle these composition sequences ('|': the end of header elements,
2140 BYTES and CHARS >= 0xA0):
df7492f9 2141
e951386e
KH
2142 (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2143 (2) altchar composition: 0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2144 (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
df7492f9 2145
e951386e 2146 and these old forms:
1a4990fb 2147
e951386e
KH
2148 (4) relative composition: 0x80 | MSEQ ... MSEQ
2149 (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
df7492f9 2150
e951386e
KH
2151 When the starter 0x80 and the following header elements are found,
2152 this annotation header is produced.
df7492f9 2153
e951386e 2154 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
df7492f9 2155
e951386e
KH
2156 NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2157 NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
df7492f9 2158
e951386e
KH
2159 Then, upon reading the following elements, these codes are produced
2160 until the composition end is found:
df7492f9 2161
e951386e
KH
2162 (1) CHAR ... CHAR
2163 (2) ALT ... ALT CHAR ... CHAR
2164 (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2165 (4) CHAR ... CHAR
2166 (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
4ed46869 2167
e951386e
KH
2168 When the composition end is found, LENGTH and NCHARS in the
2169 annotation header are updated as below:
b73bfc1c 2170
e951386e
KH
2171 (1) LENGTH: unchanged, NCHARS: unchanged
2172 (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2173 (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2174 (4) LENGTH: unchanged, NCHARS: number of CHARs
2175 (5) LENGTH: unchanged, NCHARS: number of CHARs
df7492f9 2176
e951386e
KH
2177 If an error is found while composing, the annotation header is
2178 changed to the original composition header (plus filler -1s) as
2179 below:
2180
2181 (1),(2),(3) [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2182 (5) [ 0x80 0xFF -1 -1 -1 ]
2183
2184 and the sequence [ -2 DECODED-RULE ] is changed to the original
2185 byte sequence as below:
2186 o the original byte sequence is B: [ B -1 ]
2187 o the original byte sequence is B1 B2: [ B1 B2 ]
2188
2189 Most of the routines are implemented by macros because many
2190 variables and labels in the caller decode_coding_emacs_mule must be
2191 accessible, and they are usually called just once (thus they don't
2192 increase the size of the compiled object). */
2193
2194/* Decode a composition rule represented by C as a component of
2195 composition sequence of Emacs 20 style. Set RULE to the decoded
2196 rule. */
2197
/* Side effects: C itself is modified (0xA0 is subtracted from it), and
   control jumps to the caller's label `invalid_code' when the result
   is outside 0..80.  The value is split into two base-9 digits
   (C / 9 and C % 9); a digit equal to 4 is replaced by 10 before both
   are passed to COMPOSITION_ENCODE_RULE.  */
2198#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule) \
df7492f9 2199 do { \
e951386e
KH
2200 int gref, nref; \
2201 \
4d41e8b7 2202 c -= 0xA0; \
df7492f9
KH
2203 if (c < 0 || c >= 81) \
2204 goto invalid_code; \
df7492f9 2205 gref = c / 9, nref = c % 9; \
e951386e
KH
2206 if (gref == 4) gref = 10; \
2207 if (nref == 4) nref = 10; \
2208 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
df7492f9
KH
2209 } while (0)
2210
2211
e951386e
KH
2212/* Decode a composition rule represented by C and the following byte
2213 at SRC as a component of composition sequence of Emacs 21 style.
2214 Set RULE to the decoded rule. */
781d7a48 2215
/* C supplies the first rule byte; the second one is fetched with
   ONE_MORE_BYTE, which overwrites C.  Each byte minus 0x20 must lie in
   0..80, otherwise control jumps to the caller's label
   `invalid_code'.  The two values are combined with
   COMPOSITION_ENCODE_RULE.  */
e951386e 2216#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule) \
781d7a48
KH
2217 do { \
2218 int gref, nref; \
e951386e
KH
2219 \
2220 gref = c - 0x20; \
2221 if (gref < 0 || gref >= 81) \
781d7a48 2222 goto invalid_code; \
e951386e
KH
2223 ONE_MORE_BYTE (c); \
2224 nref = c - 0x20; \
2225 if (nref < 0 || nref >= 81) \
781d7a48 2226 goto invalid_code; \
e951386e 2227 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
781d7a48
KH
2228 } while (0)
2229
2230
e951386e
KH
2231/* Start of Emacs 21 style format. The first three bytes at SRC are
2232 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2233 byte length of this composition information, CHARS is the number of
2234 characters composed by this composition. */
2235
/* Expects the caller's variables c (holding the METHOD byte), src,
   charbuf and cmp_status, plus the label `invalid_code'.  Reads the
   BYTES and CHARS bytes and jumps to invalid_code unless BYTES - 0xA0
   is at least 3 (exactly 4 for COMPOSITION_RELATIVE) and CHARS - 0xA0
   is in 1..MAX_COMPOSITION_COMPONENTS-1.  Then initializes
   *cmp_status (ncomps is set to nbytes - 4) and emits the annotation
   header through ADD_COMPOSITION_DATA.  */
2236#define DECODE_EMACS_MULE_21_COMPOSITION() \
aa72b389 2237 do { \
781d7a48 2238 enum composition_method method = c - 0xF2; \
df7492f9 2239 int nbytes, nchars; \
e951386e 2240 \
df7492f9 2241 ONE_MORE_BYTE (c); \
065e3595
KH
2242 if (c < 0) \
2243 goto invalid_code; \
df7492f9 2244 nbytes = c - 0xA0; \
e951386e 2245 if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4)) \
df7492f9
KH
2246 goto invalid_code; \
2247 ONE_MORE_BYTE (c); \
2248 nchars = c - 0xA0; \
e951386e
KH
2249 if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS) \
2250 goto invalid_code; \
2251 cmp_status->old_form = 0; \
2252 cmp_status->method = method; \
2253 if (method == COMPOSITION_RELATIVE) \
2254 cmp_status->state = COMPOSING_CHAR; \
2255 else \
2256 cmp_status->state = COMPOSING_COMPONENT_CHAR; \
2257 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2258 cmp_status->nchars = nchars; \
2259 cmp_status->ncomps = nbytes - 4; \
2260 ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method); \
aa72b389 2261 } while (0)
93dec019 2262
aa72b389 2263
e951386e
KH
2264/* Start of Emacs 20 style format for relative composition. */
2265
/* Uses the caller's variables cmp_status and charbuf.  Marks the
   composition as old form, relative, in state COMPOSING_CHAR with
   zero characters and components so far, and emits an annotation
   header whose NCHARS and NBYTES are 0.  */
2266#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION() \
2267 do { \
2268 cmp_status->old_form = 1; \
2269 cmp_status->method = COMPOSITION_RELATIVE; \
2270 cmp_status->state = COMPOSING_CHAR; \
2271 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2272 cmp_status->nchars = cmp_status->ncomps = 0; \
2273 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
2274 } while (0)
2275
2276
2277/* Start of Emacs 20 style format for rule-base composition. */
2278
/* Uses the caller's variables cmp_status and charbuf.  Same as the
   relative variant except that the method is COMPOSITION_WITH_RULE.  */
2279#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION() \
2280 do { \
2281 cmp_status->old_form = 1; \
2282 cmp_status->method = COMPOSITION_WITH_RULE; \
2283 cmp_status->state = COMPOSING_CHAR; \
2284 cmp_status->length = MAX_ANNOTATION_LENGTH; \
2285 cmp_status->nchars = cmp_status->ncomps = 0; \
2286 ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
df7492f9
KH
2287 } while (0)
2288
2289
e951386e
KH
/* Called after the starter byte 0x80 has been read.  Uses the caller's
   variables c and src and the label `invalid_code'.  Reads the next
   byte and selects the composition form:
     - 0xF2 + a method between COMPOSITION_RELATIVE and
       COMPOSITION_WITH_RULE_ALTCHARS: Emacs 21 form;
     - 0xA0..0xBF: Emacs 20 relative composition; SRC is rewound so
       that the byte is read again as the first component;
     - 0xFF: Emacs 20 rule-base composition;
     - anything else, or a negative value: invalid.  */
2290#define DECODE_EMACS_MULE_COMPOSITION_START() \
2291 do { \
2292 const unsigned char *current_src = src; \
2293 \
2294 ONE_MORE_BYTE (c); \
2295 if (c < 0) \
2296 goto invalid_code; \
2297 if (c - 0xF2 >= COMPOSITION_RELATIVE \
2298 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS) \
2299 DECODE_EMACS_MULE_21_COMPOSITION (); \
2300 else if (c < 0xA0) \
2301 goto invalid_code; \
2302 else if (c < 0xC0) \
2303 { \
2304 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (); \
2305 /* Re-read C as a composition component. */ \
2306 src = current_src; \
2307 } \
2308 else if (c == 0xFF) \
2309 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (); \
2310 else \
2311 goto invalid_code; \
2312 } while (0)
2313
/* Close a successfully decoded composition.  Uses the caller's
   variables cmp_status and charbuf; charbuf[-cmp_status->length] is
   the first element of the annotation header.  For the old form the
   NCHARS slot (header element 2) is set to the number of characters
   read.  For the new form with a method above COMPOSITION_RELATIVE the
   LENGTH slot (element 0) is set to NCHARS minus the sequence length.
   Finally the state returns to COMPOSING_NO.  */
2314#define EMACS_MULE_COMPOSITION_END() \
df7492f9 2315 do { \
e951386e 2316 int idx = - cmp_status->length; \
4d41e8b7 2317 \
e951386e
KH
2318 if (cmp_status->old_form) \
2319 charbuf[idx + 2] = cmp_status->nchars; \
2320 else if (cmp_status->method > COMPOSITION_RELATIVE) \
2321 charbuf[idx] = charbuf[idx + 2] - cmp_status->length; \
2322 cmp_status->state = COMPOSING_NO; \
2323 } while (0)
2324
2325
2326static int
cf84bb53
JB
2327emacs_mule_finish_composition (int *charbuf,
2328 struct composition_status *cmp_status)
e951386e
KH
2329{
2330 int idx = - cmp_status->length;
2331 int new_chars;
2332
2333 if (cmp_status->old_form && cmp_status->nchars > 0)
2334 {
2335 charbuf[idx + 2] = cmp_status->nchars;
2336 new_chars = 0;
2337 if (cmp_status->method == COMPOSITION_WITH_RULE
2338 && cmp_status->state == COMPOSING_CHAR)
2339 {
2340 /* The last rule was invalid. */
2341 int rule = charbuf[-1] + 0xA0;
2342
2343 charbuf[-2] = BYTE8_TO_CHAR (rule);
2344 charbuf[-1] = -1;
2345 new_chars = 1;
2346 }
2347 }
2348 else
2349 {
2350 charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2351
2352 if (cmp_status->method == COMPOSITION_WITH_RULE)
2353 {
2354 charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2355 charbuf[idx++] = -3;
2356 charbuf[idx++] = 0;
2357 new_chars = 1;
2358 }
2359 else
2360 {
2361 int nchars = charbuf[idx + 1] + 0xA0;
2362 int nbytes = charbuf[idx + 2] + 0xA0;
2363
2364 charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2365 charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2366 charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2367 charbuf[idx++] = -1;
2368 new_chars = 4;
2369 }
2370 }
2371 cmp_status->state = COMPOSING_NO;
2372 return new_chars;
2373}
2374
/* If a composition is in progress, finish it with
   emacs_mule_finish_composition and add the returned character count
   to the caller's char_offset.  Uses the caller's variables
   cmp_status, charbuf and char_offset.  */
2375#define EMACS_MULE_MAYBE_FINISH_COMPOSITION() \
2376 do { \
2377 if (cmp_status->state != COMPOSING_NO) \
2378 char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
df7492f9
KH
2379 } while (0)
2380
aa72b389
KH
2381
/* Decode emacs-mule bytes from CODING->source, starting at
   CODING->consumed, into CODING->charbuf.

   Besides plain characters, the output contains charset annotations
   (ADD_CHARSET_DATA) whenever a run of non-ASCII charset characters
   ends, and composition annotations driven by the state machine in
   CODING->spec.emacs_mule.cmp_status.  A composition left unfinished
   at the end of a block that is not the last one is moved into
   cmp_status->carryover and restored at the start of the next call.

   An invalid sequence is handled by emitting its first byte as is
   (BYTE8_TO_CHAR for non-ASCII) and counting it in CODING->errors.

   On return CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used are updated.

   NOTE(review): no_more_source is not targeted explicitly;
   presumably ONE_MORE_BYTE jumps there at the end of the source and
   uses src, src_end, multibytep and consumed_chars by name --
   confirm.  */
2382static void
971de7fb 2383decode_coding_emacs_mule (struct coding_system *coding)
aa72b389 2384{
8f924df7
KH
2385 const unsigned char *src = coding->source + coding->consumed;
2386 const unsigned char *src_end = coding->source + coding->src_bytes;
2387 const unsigned char *src_base;
69a80ea3 2388 int *charbuf = coding->charbuf + coding->charbuf_used;
ad1746f5
KH
2389 /* We may produce two annotations (charset and composition) in one
2390 loop and one more charset annotation at the end. */
69a80ea3 2391 int *charbuf_end
15cbd324
EZ
2392 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2393 /* We can produce up to 2 characters in a loop. */
2394 - 1;
d311d28c 2395 ptrdiff_t consumed_chars = 0, consumed_chars_base;
df7492f9 2396 int multibytep = coding->src_multibyte;
d311d28c
PE
2397 ptrdiff_t char_offset = coding->produced_char;
2398 ptrdiff_t last_offset = char_offset;
ff0dacd7 2399 int last_id = charset_ascii;
2735d060 2400 int eol_dos =
0a9564cb 2401 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 2402 int byte_after_cr = -1;
e951386e 2403 struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
aa72b389 2404
/* Restore the partial composition saved by the previous call.  */
e951386e
KH
2405 if (cmp_status->state != COMPOSING_NO)
2406 {
2407 int i;
2408
15cbd324
EZ
2409 if (charbuf_end - charbuf < cmp_status->length)
2410 abort ();
e951386e
KH
2411 for (i = 0; i < cmp_status->length; i++)
2412 *charbuf++ = cmp_status->carryover[i];
2413 coding->annotated = 1;
2414 }
2415
aa72b389
KH
2416 while (1)
2417 {
ee05f961 2418 int c, id IF_LINT (= 0);
df7492f9 2419
/* Checkpoint: the position to resume from if this iteration
   cannot be completed.  */
aa72b389 2420 src_base = src;
df7492f9
KH
2421 consumed_chars_base = consumed_chars;
2422
2423 if (charbuf >= charbuf_end)
b71f6f73
KH
2424 {
/* The byte read ahead after a CR has not been used yet; give
   it back.  */
2425 if (byte_after_cr >= 0)
2426 src_base--;
2427 break;
2428 }
aa72b389 2429
119852e7
KH
2430 if (byte_after_cr >= 0)
2431 c = byte_after_cr, byte_after_cr = -1;
2432 else
2433 ONE_MORE_BYTE (c);
e951386e
KH
2434
/* A negative value from ONE_MORE_BYTE and the composition starter
   0x80 both terminate any composition in progress.  */
2435 if (c < 0 || c == 0x80)
065e3595 2436 {
e951386e
KH
2437 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2438 if (c < 0)
2439 {
2440 *charbuf++ = -c;
2441 char_offset++;
2442 }
2443 else
2444 DECODE_EMACS_MULE_COMPOSITION_START ();
2445 continue;
065e3595 2446 }
e951386e
KH
2447
2448 if (c < 0x80)
aa72b389 2449 {
/* With DOS end-of-line conversion, read one byte ahead after
   a CR; it is consumed by the next iteration.  */
2735d060 2450 if (eol_dos && c == '\r')
119852e7 2451 ONE_MORE_BYTE (byte_after_cr);
e951386e
KH
2452 id = charset_ascii;
2453 if (cmp_status->state != COMPOSING_NO)
2454 {
2455 if (cmp_status->old_form)
2456 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2457 else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2458 cmp_status->ncomps--;
2459 }
2460 }
2461 else
2462 {
ee05f961 2463 int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
75f80e63
EZ
2464 /* emacs_mule_char can load a charset map from a file, which
2465 allocates a large structure and might cause buffer text
2466 to be relocated as result. Thus, we need to remember the
ad1746f5 2467 original pointer to buffer text, and fix up all related
75f80e63
EZ
2468 pointers after the call. */
2469 const unsigned char *orig = coding->source;
d311d28c 2470 ptrdiff_t offset;
e951386e
KH
2471
2472 c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2473 cmp_status);
75f80e63
EZ
2474 offset = coding->source - orig;
2475 if (offset)
2476 {
2477 src += offset;
2478 src_base += offset;
2479 src_end += offset;
2480 }
e951386e
KH
2481 if (c < 0)
2482 {
/* -1: invalid sequence.  -2: source too short.  Other
   negative values are old-form composition elements.  */
2483 if (c == -1)
2484 goto invalid_code;
2485 if (c == -2)
2486 break;
2487 }
2488 src = src_base + nbytes;
2489 consumed_chars = consumed_chars_base + nchars;
2490 if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2491 cmp_status->ncomps -= nchars;
2492 }
2493
ad1746f5 2494 /* Now if C >= 0, we found a normally encoded character, if C <
e951386e
KH
2495 0, we found an old-style composition component character or
2496 rule. */
2497
2498 if (cmp_status->state == COMPOSING_NO)
2499 {
/* The charset changed: close the annotation of the previous
   non-ASCII run.  */
2500 if (last_id != id)
2501 {
2502 if (last_id != charset_ascii)
2503 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2504 last_id);
2505 last_id = id;
2506 last_offset = char_offset;
2507 }
df7492f9
KH
2508 *charbuf++ = c;
2509 char_offset++;
aa72b389 2510 }
e951386e 2511 else if (cmp_status->state == COMPOSING_CHAR)
df7492f9 2512 {
e951386e
KH
2513 if (cmp_status->old_form)
2514 {
2515 if (c >= 0)
2516 {
/* A normally encoded character ends an old-form
   composition.  */
2517 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2518 *charbuf++ = c;
2519 char_offset++;
2520 }
2521 else
2522 {
2523 *charbuf++ = -c;
2524 cmp_status->nchars++;
2525 cmp_status->length++;
2526 if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2527 EMACS_MULE_COMPOSITION_END ();
2528 else if (cmp_status->method == COMPOSITION_WITH_RULE)
2529 cmp_status->state = COMPOSING_RULE;
2530 }
2531 }
df7492f9 2532 else
e951386e
KH
2533 {
/* New form: count down the announced number of
   characters.  */
2534 *charbuf++ = c;
2535 cmp_status->length++;
2536 cmp_status->nchars--;
2537 if (cmp_status->nchars == 0)
2538 EMACS_MULE_COMPOSITION_END ();
2539 }
df7492f9 2540 }
e951386e 2541 else if (cmp_status->state == COMPOSING_RULE)
df7492f9 2542 {
e951386e 2543 int rule;
ff0dacd7 2544
e951386e 2545 if (c >= 0)
df7492f9 2546 {
e951386e
KH
2547 EMACS_MULE_COMPOSITION_END ();
2548 *charbuf++ = c;
2549 char_offset++;
df7492f9 2550 }
e951386e 2551 else
ff0dacd7 2552 {
e951386e
KH
2553 c = -c;
2554 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2555 if (rule < 0)
2556 goto invalid_code;
/* A decoded rule is stored as the pair -2, RULE.  */
2557 *charbuf++ = -2;
2558 *charbuf++ = rule;
2559 cmp_status->length += 2;
2560 cmp_status->state = COMPOSING_CHAR;
ff0dacd7 2561 }
e951386e
KH
2562 }
2563 else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2564 {
df7492f9 2565 *charbuf++ = c;
e951386e
KH
2566 cmp_status->length++;
2567 if (cmp_status->ncomps == 0)
2568 cmp_status->state = COMPOSING_CHAR;
2569 else if (cmp_status->ncomps > 0)
2570 {
2571 if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2572 cmp_status->state = COMPOSING_COMPONENT_RULE;
2573 }
2574 else
2575 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
df7492f9 2576 }
e951386e
KH
2577 else /* COMPOSING_COMPONENT_RULE */
2578 {
2579 int rule;
2580
2581 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2582 if (rule < 0)
2583 goto invalid_code;
2584 *charbuf++ = -2;
2585 *charbuf++ = rule;
2586 cmp_status->length += 2;
2587 cmp_status->ncomps--;
2588 if (cmp_status->ncomps > 0)
2589 cmp_status->state = COMPOSING_COMPONENT_CHAR;
2590 else
2591 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2592 }
2593 continue;
2594
df7492f9 2595 invalid_code:
/* Rewind to the checkpoint and pass the first byte through
   unchanged.  */
e951386e 2596 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
df7492f9
KH
2597 src = src_base;
2598 consumed_chars = consumed_chars_base;
2599 ONE_MORE_BYTE (c);
2600 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 2601 char_offset++;
df7492f9
KH
2602 coding->errors++;
2603 }
2604
2605 no_more_source:
e951386e
KH
2606 if (cmp_status->state != COMPOSING_NO)
2607 {
2608 if (coding->mode & CODING_MODE_LAST_BLOCK)
2609 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2610 else
2611 {
/* Take the unfinished composition out of charbuf and keep it
   for the next call.  */
2612 int i;
2613
2614 charbuf -= cmp_status->length;
2615 for (i = 0; i < cmp_status->length; i++)
2616 cmp_status->carryover[i] = charbuf[i];
2617 }
2618 }
ff0dacd7 2619 if (last_id != charset_ascii)
69a80ea3 2620 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
2621 coding->consumed_char += consumed_chars_base;
2622 coding->consumed = src_base - coding->source;
2623 coding->charbuf_used = charbuf - coding->charbuf;
2624}
2625
2626
/* Store in CODES[0] and CODES[1] the leading code(s) for the
   emacs-mule charset id ID.  An ID below 0xA0 is itself the single
   leading code and CODES[1] is set to 0.  Otherwise CODES[0] is one of
   0x9A, 0x9B, 0x9C or 0x9D, chosen by the range ID falls in, and
   CODES[1] is ID.

   The expansion deliberately has no trailing semicolon, so that
   `EMACS_MULE_LEADING_CODES (id, codes);' is exactly one statement and
   can be used as the body of an unbraced `if ... else'.  ID is
   evaluated more than once.  */
#define EMACS_MULE_LEADING_CODES(id, codes)		\
  do {							\
    if ((id) < 0xA0)					\
      (codes)[0] = (id), (codes)[1] = 0;		\
    else if ((id) < 0xE0)				\
      (codes)[0] = 0x9A, (codes)[1] = (id);		\
    else if ((id) < 0xF0)				\
      (codes)[0] = 0x9B, (codes)[1] = (id);		\
    else if ((id) < 0xF5)				\
      (codes)[0] = 0x9C, (codes)[1] = (id);		\
    else						\
      (codes)[0] = 0x9D, (codes)[1] = (id);		\
  } while (0)
2640
aa72b389 2641
/* Encode the characters and annotations in CODING->charbuf into the
   emacs-mule format and append the bytes to CODING->destination.

   A negative element starts an annotation whose total length is its
   absolute value.  A charset annotation selects a preferred charset
   (ignored unless it is a member of the coding system's charset
   list); a composition annotation is skipped.  ASCII characters and
   eight-bit bytes are written as single bytes.  Any other character
   is written as one or two leading codes followed by its code point
   with the high bit of each byte set.  A character that belongs to no
   charset in the list is replaced by CODING->default_char.

   Updates CODING->produced and CODING->produced_char and always
   returns 0.

   NOTE(review): multibytep, dst_end and produced_chars are never
   referenced directly in this body; presumably the ASSURE_DESTINATION
   and EMIT_* macros use them (and dst) by name -- confirm before
   renaming any local.  */
df7492f9 2642static int
971de7fb 2643encode_coding_emacs_mule (struct coding_system *coding)
df7492f9
KH
2644{
2645 int multibytep = coding->dst_multibyte;
2646 int *charbuf = coding->charbuf;
2647 int *charbuf_end = charbuf + coding->charbuf_used;
2648 unsigned char *dst = coding->destination + coding->produced;
2649 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2650 int safe_room = 8;
d311d28c 2651 ptrdiff_t produced_chars = 0;
24a73b0a 2652 Lisp_Object attrs, charset_list;
df7492f9 2653 int c;
ff0dacd7 2654 int preferred_charset_id = -1;
df7492f9 2655
24a73b0a 2656 CODING_GET_INFO (coding, attrs, charset_list);
/* Make the coding system use Vemacs_mule_charset_list as its
   charset list.  */
eccb6815
KH
2657 if (! EQ (charset_list, Vemacs_mule_charset_list))
2658 {
2659 CODING_ATTR_CHARSET_LIST (attrs)
2660 = charset_list = Vemacs_mule_charset_list;
2661 }
df7492f9
KH
2662
2663 while (charbuf < charbuf_end)
2664 {
2665 ASSURE_DESTINATION (safe_room);
2666 c = *charbuf++;
ff0dacd7
KH
2667
2668 if (c < 0)
2669 {
2670 /* Handle an annotation. */
/* CHARBUF now points just past the length element, so *charbuf
   is the annotation type and charbuf[3] the charset id.  */
2671 switch (*charbuf)
2672 {
2673 case CODING_ANNOTATE_COMPOSITION_MASK:
2674 /* Not yet implemented. */
2675 break;
2676 case CODING_ANNOTATE_CHARSET_MASK:
2677 preferred_charset_id = charbuf[3];
2678 if (preferred_charset_id >= 0
2679 && NILP (Fmemq (make_number (preferred_charset_id),
2680 charset_list)))
2681 preferred_charset_id = -1;
2682 break;
2683 default:
2684 abort ();
2685 }
/* Skip the rest of the annotation; one of its -C elements has
   already been consumed.  */
2686 charbuf += -c - 1;
2687 continue;
2688 }
2689
df7492f9
KH
2690 if (ASCII_CHAR_P (c))
2691 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
2692 else if (CHAR_BYTE8_P (c))
2693 {
2694 c = CHAR_TO_BYTE8 (c);
2695 EMIT_ONE_BYTE (c);
2696 }
df7492f9 2697 else
aa72b389 2698 {
df7492f9
KH
2699 struct charset *charset;
2700 unsigned code;
2701 int dimension;
2702 int emacs_mule_id;
2703 unsigned char leading_codes[2];
2704
/* Try the preferred charset first, then fall back to searching
   the whole charset list.  */
ff0dacd7
KH
2705 if (preferred_charset_id >= 0)
2706 {
5eb05ea3
KH
2707 int result;
2708
ff0dacd7 2709 charset = CHARSET_FROM_ID (preferred_charset_id);
5eb05ea3
KH
2710 CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2711 if (result)
905ca9d2
KH
2712 code = ENCODE_CHAR (charset, c);
2713 else
5eb05ea3
KH
2714 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2715 &code, charset);
ff0dacd7
KH
2716 }
2717 else
5eb05ea3
KH
2718 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2719 &code, charset);
df7492f9
KH
2720 if (! charset)
2721 {
/* No charset can encode C: substitute the default
   character.  */
2722 c = coding->default_char;
2723 if (ASCII_CHAR_P (c))
2724 {
2725 EMIT_ONE_ASCII_BYTE (c);
2726 continue;
2727 }
5eb05ea3
KH
2728 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2729 &code, charset);
df7492f9
KH
2730 }
2731 dimension = CHARSET_DIMENSION (charset);
2732 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2733 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2734 EMIT_ONE_BYTE (leading_codes[0]);
/* leading_codes[1] is 0 when one leading code suffices.  */
2735 if (leading_codes[1])
2736 EMIT_ONE_BYTE (leading_codes[1]);
2737 if (dimension == 1)
1fa663f9 2738 EMIT_ONE_BYTE (code | 0x80);
aa72b389 2739 else
df7492f9 2740 {
1fa663f9 2741 code |= 0x8080;
df7492f9
KH
2742 EMIT_ONE_BYTE (code >> 8);
2743 EMIT_ONE_BYTE (code & 0xFF);
2744 }
aa72b389 2745 }
aa72b389 2746 }
065e3595 2747 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
2748 coding->produced_char += produced_chars;
2749 coding->produced = dst - coding->destination;
2750 return 0;
aa72b389 2751}
b73bfc1c 2752
4ed46869 2753\f
df7492f9 2754/*** 7. ISO2022 handlers ***/
4ed46869
KH
2755
2756/* The following note describes the coding system ISO2022 briefly.
39787efd 2757 Since the intention of this note is to help understand the
5a936b46 2758 functions in this file, some parts are NOT ACCURATE or are OVERLY
39787efd 2759 SIMPLIFIED. For thorough understanding, please refer to the
5a936b46 2760 original document of ISO2022. This is equivalent to the standard
cfb43547 2761 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
4ed46869
KH
2762
2763 ISO2022 provides many mechanisms to encode several character sets
cfb43547 2764 in 7-bit and 8-bit environments. For 7-bit environments, all text
39787efd
KH
2765 is encoded using bytes less than 128. This may make the encoded
2766 text a little bit longer, but the text passes more easily through
cfb43547 2767 several types of gateway, some of which strip off the MSB (Most
8ca3766a 2768 Significant Bit).
b73bfc1c 2769
cfb43547
DL
2770 There are two kinds of character sets: control character sets and
2771 graphic character sets. The former contain control characters such
4ed46869 2772 as `newline' and `escape' to provide control functions (control
39787efd 2773 functions are also provided by escape sequences). The latter
cfb43547 2774 contain graphic characters such as 'A' and '-'. Emacs recognizes
4ed46869
KH
2775 two control character sets and many graphic character sets.
2776
2777 Graphic character sets are classified into one of the following
39787efd
KH
2778 four classes, according to the number of bytes (DIMENSION) and
2779 number of characters in one dimension (CHARS) of the set:
2780 - DIMENSION1_CHARS94
2781 - DIMENSION1_CHARS96
2782 - DIMENSION2_CHARS94
2783 - DIMENSION2_CHARS96
2784
2785 In addition, each character set is assigned an identification tag,
cfb43547 2786 unique for each set, called the "final character" (denoted as <F>
39787efd
KH
2787 hereafter). The <F> of each character set is decided by ECMA(*)
2788 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2789 (0x30..0x3F are for private use only).
4ed46869
KH
2790
2791 Note (*): ECMA = European Computer Manufacturers Association
2792
cfb43547 2793 Here are examples of graphic character sets [NAME(<F>)]:
4ed46869
KH
2794 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2795 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2796 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2797 o DIMENSION2_CHARS96 -- none for the moment
2798
39787efd 2799 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4ed46869
KH
2800 C0 [0x00..0x1F] -- control character plane 0
2801 GL [0x20..0x7F] -- graphic character plane 0
2802 C1 [0x80..0x9F] -- control character plane 1
2803 GR [0xA0..0xFF] -- graphic character plane 1
2804
2805 A control character set is directly designated and invoked to C0 or
39787efd
KH
2806 C1 by an escape sequence. The most common case is that:
2807 - ISO646's control character set is designated/invoked to C0, and
2808 - ISO6429's control character set is designated/invoked to C1,
2809 and usually these designations/invocations are omitted in encoded
2810 text. In a 7-bit environment, only C0 can be used, and a control
2811 character for C1 is encoded by an appropriate escape sequence to
2812 fit into the environment. All control characters for C1 are
2813 defined to have corresponding escape sequences.
4ed46869
KH
2814
2815 A graphic character set is at first designated to one of four
2816 graphic registers (G0 through G3), then these graphic registers are
2817 invoked to GL or GR. These designations and invocations can be
2818 done independently. The most common case is that G0 is invoked to
39787efd
KH
2819 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2820 these invocations and designations are omitted in encoded text.
2821 In a 7-bit environment, only GL can be used.
4ed46869 2822
39787efd
KH
2823 When a graphic character set of CHARS94 is invoked to GL, codes
2824 0x20 and 0x7F of the GL area work as control characters SPACE and
2825 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2826 be used.
4ed46869
KH
2827
2828 There are two ways of invocation: locking-shift and single-shift.
2829 With locking-shift, the invocation lasts until the next different
39787efd
KH
2830 invocation, whereas with single-shift, the invocation affects the
2831 following character only and doesn't affect the locking-shift
2832 state. Invocations are done by the following control characters or
2833 escape sequences:
4ed46869
KH
2834
2835 ----------------------------------------------------------------------
39787efd 2836 abbrev function cntrl escape seq description
4ed46869 2837 ----------------------------------------------------------------------
39787efd
KH
2838 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2839 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2840 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2841 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2842 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2843 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2844 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2845 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2846 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4ed46869 2847 ----------------------------------------------------------------------
39787efd
KH
2848 (*) These are not used by any known coding system.
2849
2850 Control characters for these functions are defined by macros
2851 ISO_CODE_XXX in `coding.h'.
4ed46869 2852
39787efd 2853 Designations are done by the following escape sequences:
4ed46869
KH
2854 ----------------------------------------------------------------------
2855 escape sequence description
2856 ----------------------------------------------------------------------
2857 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2858 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2859 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2860 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2861 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2862 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2863 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2864 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2865 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2866 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2867 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2868 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2869 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2870 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2871 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2872 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2873 ----------------------------------------------------------------------
2874
2875 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
39787efd 2876 of dimension 1, chars 94, and final character <F>, etc...
4ed46869
KH
2877
2878 Note (*): Although these designations are not allowed in ISO2022,
2879 Emacs accepts them on decoding, and produces them on encoding
39787efd 2880 CHARS96 character sets in a coding system which is characterized as
4ed46869
KH
2881 7-bit environment, non-locking-shift, and non-single-shift.
2882
2883 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
df7492f9 2884 '(' must be omitted. We refer to this as "short-form" hereafter.
4ed46869 2885
cfb43547 2886 Now you may notice that there are a lot of ways of encoding the
39787efd
KH
2887 same multilingual text in ISO2022. Actually, there exist many
2888 coding systems such as Compound Text (used in X11's inter client
8ca3766a
DL
2889 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2890 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
4ed46869
KH
2891 localized platforms), and all of these are variants of ISO2022.
2892
2893 In addition to the above, Emacs handles two more kinds of escape
2894 sequences: ISO6429's direction specification and Emacs' private
2895 sequence for specifying character composition.
2896
39787efd 2897 ISO6429's direction specification takes the following form:
4ed46869
KH
2898 o CSI ']' -- end of the current direction
2899 o CSI '0' ']' -- end of the current direction
2900 o CSI '1' ']' -- start of left-to-right text
2901 o CSI '2' ']' -- start of right-to-left text
2902 The control character CSI (0x9B: control sequence introducer) is
39787efd
KH
2903 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2904
2905 Character composition specification takes the following form:
ec6d2bb8
KH
2906 o ESC '0' -- start relative composition
2907 o ESC '1' -- end composition
2908 o ESC '2' -- start rule-base composition (*)
2909 o ESC '3' -- start relative composition with alternate chars (**)
2910 o ESC '4' -- start rule-base composition with alternate chars (**)
b73bfc1c 2911 Since these are not standard escape sequences of any ISO standard,
cfb43547 2912 the use of them with these meanings is restricted to Emacs only.
ec6d2bb8 2913
5a936b46
DL
2914 (*) This form is used only in Emacs 20.7 and older versions,
2915 but newer versions can safely decode it.
cfb43547 2916 (**) This form is used only in Emacs 21.1 and newer versions,
5a936b46 2917 and older versions can't decode it.
ec6d2bb8 2918
cfb43547 2919 Here's a list of example usages of these composition escape
b73bfc1c 2920 sequences (categorized by `enum composition_method').
ec6d2bb8 2921
b73bfc1c 2922 COMPOSITION_RELATIVE:
ec6d2bb8 2923 ESC 0 CHAR [ CHAR ] ESC 1
8ca3766a 2924 COMPOSITION_WITH_RULE:
ec6d2bb8 2925 ESC 2 CHAR [ RULE CHAR ] ESC 1
b73bfc1c 2926 COMPOSITION_WITH_ALTCHARS:
ec6d2bb8 2927 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
b73bfc1c 2928 COMPOSITION_WITH_RULE_ALTCHARS:
ec6d2bb8 2929 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
4ed46869 2930
/* Table giving the ISO-2022 code class of each of the 256 byte values.
   The ISO-2022 decoder in this file indexes it with the byte it has
   just read and dispatches on the resulting class.  */
74ab6df5 2931static enum iso_code_class_type iso_code_class[256];
4ed46869 2932
df7492f9
KH
/* Return nonzero if the charset whose ID is ID is marked safe for
   CODING, i.e. ID lies inside CODING's safe_charsets table and the
   entry for it is not 255 (the value the table is filled with before
   registers are assigned).

   A negative ID is never safe.  Callers obtain ID from
   iso_charset_table, whose entries are tested for < 0 elsewhere in
   this file, so a negative value can reach this macro; it is rejected
   here before the table is indexed.

   ID is evaluated more than once.  */
#define SAFE_CHARSET_P(coding, id)              \
  ((id) >= 0                                    \
   && (id) <= (coding)->max_charset_id          \
   && (coding)->safe_charsets[id] != 255)
df7492f9 2936
df7492f9 2937static void
971de7fb 2938setup_iso_safe_charsets (Lisp_Object attrs)
df7492f9
KH
2939{
2940 Lisp_Object charset_list, safe_charsets;
2941 Lisp_Object request;
2942 Lisp_Object reg_usage;
2943 Lisp_Object tail;
d311d28c 2944 EMACS_INT reg94, reg96;
df7492f9
KH
2945 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2946 int max_charset_id;
2947
2948 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2949 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2950 && ! EQ (charset_list, Viso_2022_charset_list))
2951 {
2952 CODING_ATTR_CHARSET_LIST (attrs)
2953 = charset_list = Viso_2022_charset_list;
2954 ASET (attrs, coding_attr_safe_charsets, Qnil);
2955 }
2956
2957 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2958 return;
2959
2960 max_charset_id = 0;
2961 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2962 {
2963 int id = XINT (XCAR (tail));
2964 if (max_charset_id < id)
2965 max_charset_id = id;
2966 }
d46c5b12 2967
1b3b981b
AS
2968 safe_charsets = make_uninit_string (max_charset_id + 1);
2969 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
df7492f9
KH
2970 request = AREF (attrs, coding_attr_iso_request);
2971 reg_usage = AREF (attrs, coding_attr_iso_usage);
2972 reg94 = XINT (XCAR (reg_usage));
2973 reg96 = XINT (XCDR (reg_usage));
2974
2975 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2976 {
2977 Lisp_Object id;
2978 Lisp_Object reg;
2979 struct charset *charset;
2980
2981 id = XCAR (tail);
2982 charset = CHARSET_FROM_ID (XINT (id));
bf16eb23 2983 reg = Fcdr (Fassq (id, request));
df7492f9 2984 if (! NILP (reg))
8f924df7 2985 SSET (safe_charsets, XINT (id), XINT (reg));
df7492f9
KH
2986 else if (charset->iso_chars_96)
2987 {
2988 if (reg96 < 4)
8f924df7 2989 SSET (safe_charsets, XINT (id), reg96);
df7492f9
KH
2990 }
2991 else
2992 {
2993 if (reg94 < 4)
8f924df7 2994 SSET (safe_charsets, XINT (id), reg94);
df7492f9
KH
2995 }
2996 }
2997 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2998}
d46c5b12 2999
b6871cc7 3000
4ed46869 3001/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ad1746f5 3002 Check if a text is encoded in one of ISO-2022 based coding systems.
ff0dacd7 3003 If it is, return 1, else return 0.

   CODING supplies the source bytes (coding->source, coding->src_bytes);
   the first coding->head_ascii bytes are skipped.  DETECT_INFO is
   updated: CATEGORY_MASK_ISO is always added to its `checked' mask.

   While scanning, the categories that the text is compatible with are
   collected in the local mask `found' and those it contradicts in
   `rejected'.  The scan stops early, returning 0 with all of
   CATEGORY_MASK_ISO added to DETECT_INFO->rejected, as soon as every
   ISO category has been rejected.  When control reaches the
   no_more_source label instead, `rejected' is merged into
   DETECT_INFO->rejected, the found-and-not-rejected categories are
   merged into DETECT_INFO->found, and 1 is returned.

   ESC, SO, SI, SS2 and SS3 are not interpreted when
   inhibit_iso_escape_detection is nonzero.  */
4ed46869 3004
0a28aafb 3005static int
cf84bb53
JB
3006detect_coding_iso_2022 (struct coding_system *coding,
3007 struct coding_detection_info *detect_info)
4ed46869 3008{
8f924df7
KH
3009 const unsigned char *src = coding->source, *src_base = src;
3010 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9 3011 int multibytep = coding->src_multibyte;
ff0dacd7 3012 int single_shifting = 0;
0e48bb22 3013 int id;
df7492f9 3014 int c, c1;
d311d28c 3015 ptrdiff_t consumed_chars = 0;
df7492f9 3016 int i;
ff0dacd7
KH
3017 int rejected = 0;
3018 int found = 0;
/* -1 while outside a composition; otherwise the number of components
   counted since the last composition start sequence.  */
cee53ed4 3019 int composition_count = -1;
ff0dacd7
KH
3020
3021 detect_info->checked |= CATEGORY_MASK_ISO;
df7492f9
KH
3022
/* For every ISO category that has a coding system assigned, point its
   max_charset_id and safe_charsets at the safe charsets string of
   that coding system's attributes.  */
3023 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
3024 {
3025 struct coding_system *this = &(coding_categories[i]);
3026 Lisp_Object attrs, val;
3027
c6b278e7
KH
3028 if (this->id < 0)
3029 continue;
df7492f9
KH
3030 attrs = CODING_ID_ATTRS (this->id);
3031 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
1b3b981b 3032 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
df7492f9
KH
3033 setup_iso_safe_charsets (attrs);
3034 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7 3035 this->max_charset_id = SCHARS (val) - 1;
1b3b981b 3036 this->safe_charsets = SDATA (val);
df7492f9
KH
3037 }
3038
3039 /* A coding system of this category is always ASCII compatible. */
3040 src += coding->head_ascii;
3f003981 3041
/* Scan until every ISO category is rejected.  NOTE(review): the
   no_more_source label below is presumably reached from
   ONE_MORE_BYTE when the source is exhausted -- confirm against the
   macro's definition.  */
ff0dacd7 3042 while (rejected != CATEGORY_MASK_ISO)
4ed46869 3043 {
065e3595 3044 src_base = src;
df7492f9 3045 ONE_MORE_BYTE (c);
4ed46869
KH
3046 switch (c)
3047 {
3048 case ISO_CODE_ESC:
74383408
KH
3049 if (inhibit_iso_escape_detection)
3050 break;
f46869e4 3051 single_shifting = 0;
df7492f9 3052 ONE_MORE_BYTE (c);
0e48bb22 3053 if (c == 'N' || c == 'O')
d46c5b12 3054 {
ae9ff118 3055 /* ESC <Fe> for SS2 or SS3. */
ff0dacd7
KH
3056 single_shifting = 1;
3057 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
4ed46869 3058 }
cee53ed4
KH
3059 else if (c == '1')
3060 {
3061 /* End of composition. */
3062 if (composition_count < 0
3063 || composition_count > MAX_COMPOSITION_COMPONENTS)
3064 /* Invalid */
3065 break;
3066 composition_count = -1;
3067 found |= CATEGORY_MASK_ISO;
3068 }
ec6d2bb8
KH
3069 else if (c >= '0' && c <= '4')
3070 {
3071 /* ESC <Fp> for start/end composition. */
cee53ed4 3072 composition_count = 0;
ec6d2bb8 3073 }
bf9cdd4e 3074 else
df7492f9 3075 {
0e48bb22
AS
3076 if (c >= '(' && c <= '/')
3077 {
3078 /* Designation sequence for a charset of dimension 1. */
3079 ONE_MORE_BYTE (c1);
3080 if (c1 < ' ' || c1 >= 0x80
3081 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3082 /* Invalid designation sequence. Just ignore. */
3083 break;
3084 }
3085 else if (c == '$')
3086 {
3087 /* Designation sequence for a charset of dimension 2. */
3088 ONE_MORE_BYTE (c);
3089 if (c >= '@' && c <= 'B')
3090 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
3091 id = iso_charset_table[1][0][c];
3092 else if (c >= '(' && c <= '/')
3093 {
3094 ONE_MORE_BYTE (c1);
3095 if (c1 < ' ' || c1 >= 0x80
3096 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3097 /* Invalid designation sequence. Just ignore. */
3098 break;
3099 }
3100 else
3101 /* Invalid designation sequence. Just ignore it. */
3102 break;
3103 }
3104 else
3105 {
3106 /* Invalid escape sequence. Just ignore it. */
3107 break;
3108 }
d46c5b12 3109
0e48bb22
AS
3110 /* We found a valid designation sequence for CHARSET. */
3111 rejected |= CATEGORY_MASK_ISO_8BIT;
3112 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3113 id))
3114 found |= CATEGORY_MASK_ISO_7;
3115 else
3116 rejected |= CATEGORY_MASK_ISO_7;
3117 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3118 id))
3119 found |= CATEGORY_MASK_ISO_7_TIGHT;
3120 else
3121 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3122 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3123 id))
3124 found |= CATEGORY_MASK_ISO_7_ELSE;
3125 else
3126 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3127 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3128 id))
3129 found |= CATEGORY_MASK_ISO_8_ELSE;
3130 else
3131 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3132 }
4ed46869
KH
3133 break;
3134
4ed46869 3135 case ISO_CODE_SO:
d46c5b12 3136 case ISO_CODE_SI:
ff0dacd7 3137 /* Locking shift out/in. */
74383408
KH
3138 if (inhibit_iso_escape_detection)
3139 break;
f46869e4 3140 single_shifting = 0;
ff0dacd7 3141 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
d46c5b12
KH
3142 break;
3143
4ed46869 3144 case ISO_CODE_CSI:
ff0dacd7 3145 /* Control sequence introducer. */
f46869e4 3146 single_shifting = 0;
ff0dacd7
KH
3147 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3148 found |= CATEGORY_MASK_ISO_8_ELSE;
3149 goto check_extra_latin;
3150
4ed46869
KH
3151 case ISO_CODE_SS2:
3152 case ISO_CODE_SS3:
ff0dacd7
KH
3153 /* Single shift. */
3154 if (inhibit_iso_escape_detection)
3155 break;
75e2a253 3156 single_shifting = 0;
ff0dacd7
KH
3157 rejected |= CATEGORY_MASK_ISO_7BIT;
3158 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3159 & CODING_ISO_FLAG_SINGLE_SHIFT)
0e48bb22
AS
3160 {
3161 found |= CATEGORY_MASK_ISO_8_1;
3162 single_shifting = 1;
3163 }
ff0dacd7
KH
3164 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3165 & CODING_ISO_FLAG_SINGLE_SHIFT)
0e48bb22
AS
3166 {
3167 found |= CATEGORY_MASK_ISO_8_2;
3168 single_shifting = 1;
3169 }
75e2a253
KH
3170 if (single_shifting)
3171 break;
/* Reached by falling through from the single-shift case when neither
   8-bit category uses single shifts, and by the goto in the CSI case:
   the byte C is acceptable only as an extra Latin code.  */
0e48bb22
AS
3172 check_extra_latin:
3173 if (! VECTORP (Vlatin_extra_code_table)
3174 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3175 {
3176 rejected = CATEGORY_MASK_ISO;
3177 break;
3178 }
3179 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3180 & CODING_ISO_FLAG_LATIN_EXTRA)
3181 found |= CATEGORY_MASK_ISO_8_1;
3182 else
3183 rejected |= CATEGORY_MASK_ISO_8_1;
3184 rejected |= CATEGORY_MASK_ISO_8_2;
3185 break;
4ed46869
KH
3186
3187 default:
065e3595
KH
3188 if (c < 0)
3189 continue;
4ed46869 3190 if (c < 0x80)
f46869e4 3191 {
cee53ed4
KH
3192 if (composition_count >= 0)
3193 composition_count++;
f46869e4
KH
3194 single_shifting = 0;
3195 break;
3196 }
ff0dacd7 3197 if (c >= 0xA0)
c4825358 3198 {
ff0dacd7
KH
3199 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3200 found |= CATEGORY_MASK_ISO_8_1;
f46869e4 3201 /* Check the length of succeeding codes of the range
ff0dacd7
KH
3202 0xA0..0xFF. If the byte length is even, we include
3203 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
3204 only when we are not single shifting. */
3205 if (! single_shifting
3206 && ! (rejected & CATEGORY_MASK_ISO_8_2))
f46869e4 3207 {
2735d060 3208 int len = 1;
b73bfc1c
KH
3209 while (src < src_end)
3210 {
d12bd917 3211 src_base = src;
df7492f9 3212 ONE_MORE_BYTE (c);
b73bfc1c 3213 if (c < 0xA0)
d12bd917
KH
3214 {
3215 src = src_base;
3216 break;
3217 }
2735d060 3218 len++;
b73bfc1c
KH
3219 }
3220
2735d060 3221 if (len & 1 && src < src_end)
cee53ed4
KH
3222 {
3223 rejected |= CATEGORY_MASK_ISO_8_2;
3224 if (composition_count >= 0)
2735d060 3225 composition_count += len;
cee53ed4 3226 }
f46869e4 3227 else
cee53ed4
KH
3228 {
3229 found |= CATEGORY_MASK_ISO_8_2;
3230 if (composition_count >= 0)
2735d060 3231 composition_count += len / 2;
cee53ed4 3232 }
f46869e4 3233 }
ff0dacd7 3234 break;
4ed46869 3235 }
4ed46869
KH
3236 }
3237 }
/* The loop ended because every ISO category was rejected.  */
ff0dacd7
KH
3238 detect_info->rejected |= CATEGORY_MASK_ISO;
3239 return 0;
4ed46869 3240
df7492f9 3241 no_more_source:
/* Report the rejections, and only those categories that were found
   and not rejected afterwards.  */
ff0dacd7
KH
3242 detect_info->rejected |= rejected;
3243 detect_info->found |= (found & ~rejected);
df7492f9 3244 return 1;
4ed46869 3245}
ec6d2bb8 3246
4ed46869 3247
134b9549
KH
/* Record in CODING (taken from the enclosing scope) that the charset
   selected by dimension DIM, CHARS_96 and final byte FINAL is
   designated to graphic register REG.

   If FINAL is below '0' or not below 128, or ISO_CHARSET_TABLE has no
   charset for the triple, or that charset is not safe for CODING, the
   designation of REG is set to -2, CHARS_96 is set to -1, and the
   macro ends there.

   Otherwise JISX0201-Roman is replaced by ASCII when CODING has
   CODING_ISO_FLAG_USE_ROMAN, and JISX0208-1978 by JISX0208 when it has
   CODING_ISO_FLAG_USE_OLDJIS, and the resulting charset ID becomes
   the designation of REG.  If the previous designation of REG was the
   invalid marker -2 and the new one is ASCII, CHARS_96 is set to -1
   as well.

   CHARS_96 == -1 tells the caller that the escape sequence should be
   kept.  CHARS_96 must therefore be a modifiable lvalue.  The
   arguments are evaluated more than once.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
  do {                                                                  \
    int id, prev;                                                       \
                                                                        \
    if ((final) < '0' || (final) >= 128                                 \
        || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
        || !SAFE_CHARSET_P (coding, id))                                \
      {                                                                 \
        CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
        chars_96 = -1;                                                  \
        break;                                                          \
      }                                                                 \
    prev = CODING_ISO_DESIGNATION (coding, reg);                        \
    if (id == charset_jisx0201_roman)                                   \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
          id = charset_ascii;                                           \
      }                                                                 \
    else if (id == charset_jisx0208_1978)                               \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
          id = charset_jisx0208;                                        \
      }                                                                 \
    CODING_ISO_DESIGNATION (coding, reg) = id;                          \
    /* If there was an invalid designation to REG previously, and this  \
       designation is ASCII to REG, we should keep this designation     \
       sequence.  */                                                    \
    if (prev == -2 && id == charset_ascii)                              \
      chars_96 = -1;                                                    \
  } while (0)
3280
d46c5b12 3281
e951386e
KH
3282/* Handle these composition sequences (ALT: alternate char):
3283
3284 (1) relative composition: ESC 0 CHAR ... ESC 1
3285 (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3286 (3) altchar composition: ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3287 (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3288
3289 When the start sequence (ESC 0/2/3/4) is found, this annotation
3290 header is produced.
3291
3292 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3293
3294 Then, upon reading CHAR or RULE (one or two bytes), these codes are
3295 produced until the end sequence (ESC 1) is found:
3296
3297 (1) CHAR ... CHAR
3298 (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3299 (3) ALT ... ALT -1 -1 CHAR ... CHAR
3300 (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3301
3302 When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3303 annotation header are updated as below:
3304
3305 (1) LENGTH: unchanged, NCHARS: number of CHARs
3306 (2) LENGTH: unchanged, NCHARS: number of CHARs
3307 (3) LENGTH: += number of ALTs + 2, NCHARS: number of CHARs
3308 (4) LENGTH: += number of ALTs * 3, NCHARS: number of CHARs
3309
3310 If an error is found while composing, the annotation header is
3311 changed to:
3312
3313 [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3314
3315 and the sequence [ -2 DECODED-RULE ] is changed to the original
3316 byte sequence as below:
3317 o the original byte sequence is B: [ B -1 ]
3318 o the original byte sequence is B1 B2: [ B1 B2 ]
3319 and the sequence [ -1 -1 ] is changed to the original byte
3320 sequence:
3321 [ ESC '0' ]
3322*/
3323
/* Decode the composition rule whose first byte is C1 (a variable of
   the enclosing scope) and store the encoded rule in RULE, which must
   be a modifiable int lvalue.

   A first byte below 32 + 81 is a complete rule in the one-byte
   format.  Otherwise the rule is in the two-byte format: one more
   source byte is read with ONE_MORE_BYTE, and 0x100 is added to the
   encoded rule so that it can be told from the one-byte format.

   If the rule is invalid, goto invalid_code.  */

#define DECODE_COMPOSITION_RULE(rule)                                   \
  do {                                                                  \
    rule = c1 - 32;                                                     \
    if (rule < 0)                                                       \
      goto invalid_code;                                                \
    if (rule >= 81)             /* new format (after ver.21) */         \
      {                                                                 \
        int b;                                                          \
                                                                        \
        ONE_MORE_BYTE (b);                                              \
        if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32))        \
          goto invalid_code;                                            \
        rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);             \
        rule += 0x100;   /* Distinguish it from the old format.  */     \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        /* Reference point 4 of the old format is stored as 10.  */     \
        int gref = (rule) / 9 == 4 ? 10 : (rule) / 9;                   \
        int nref = (rule) % 9 == 4 ? 10 : (rule) % 9;                   \
                                                                        \
        rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
      }                                                                 \
  } while (0)
3352
/* Turn the encoded composition rule RULE back into the byte sequence
   it was decoded from, and store that sequence in charbuf[idx] and
   charbuf[idx + 1].  `charbuf', `idx' and `new_chars' are variables
   of the enclosing scope.

   A rule below 0x100 is in the old one-byte format: the byte goes to
   charbuf[idx], charbuf[idx + 1] is set to -1, and new_chars is
   incremented by 1.  Any other rule is in the new two-byte format:
   both elements receive a byte and new_chars is incremented by 2.

   RULE is evaluated exactly once, before charbuf is written, so it
   may be any int expression, including one that reads
   charbuf[idx + 1].  */
#define ENCODE_COMPOSITION_RULE(rule)                                   \
  do {                                                                  \
    int rule_value_ = (rule);                                           \
    int gref = (rule_value_ % 0x100) / 12;                              \
    int nref = (rule_value_ % 0x100) % 12;                              \
                                                                        \
    if (rule_value_ < 0x100)            /* old format */                \
      {                                                                 \
        if (gref == 10) gref = 4;                                       \
        if (nref == 10) nref = 4;                                       \
        charbuf[idx] = 32 + gref * 9 + nref;                            \
        charbuf[idx + 1] = -1;                                          \
        new_chars++;                                                    \
      }                                                                 \
    else                                /* new format */                \
      {                                                                 \
        charbuf[idx] = 32 + 81 + gref;                                  \
        charbuf[idx + 1] = 32 + nref;                                   \
        new_chars += 2;                                                 \
      }                                                                 \
  } while (0)
3372
e951386e
KH
3373/* Finish the current composition as invalid. */
3374
f57e2426 3375static int finish_composition (int *, struct composition_status *);
e951386e
KH
3376
3377static int
971de7fb 3378finish_composition (int *charbuf, struct composition_status *cmp_status)
e951386e
KH
3379{
3380 int idx = - cmp_status->length;
3381 int new_chars;
3382
3383 /* Recover the original ESC sequence */
3384 charbuf[idx++] = ISO_CODE_ESC;
3385 charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3386 : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3387 : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3388 /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3389 : '4');
3390 charbuf[idx++] = -2;
3391 charbuf[idx++] = 0;
3392 charbuf[idx++] = -1;
3393 new_chars = cmp_status->nchars;
3394 if (cmp_status->method >= COMPOSITION_WITH_RULE)
3395 for (; idx < 0; idx++)
3396 {
3397 int elt = charbuf[idx];
3398
3399 if (elt == -2)
3400 {
3401 ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3402 idx++;
3403 }
3404 else if (elt == -1)
3405 {
3406 charbuf[idx++] = ISO_CODE_ESC;
3407 charbuf[idx] = '0';
3408 new_chars += 2;
3409 }
3410 }
3411 cmp_status->state = COMPOSING_NO;
3412 return new_chars;
3413}
3414
/* Do nothing unless a composition is in progress, i.e. unless
   cmp_status->state differs from COMPOSING_NO.  In that case call
   finish_composition on it and add the value returned to
   char_offset.  `cmp_status', `charbuf' and `char_offset' are
   variables of the enclosing scope.  */
#define MAYBE_FINISH_COMPOSITION()                                      \
  do {                                                                  \
    if (cmp_status->state == COMPOSING_NO)                              \
      break;                                                            \
    char_offset += finish_composition (charbuf, cmp_status);            \
  } while (0)
d46c5b12 3421
/* Handle the sequence ESC C1 where C1 is '0', '2', '3' or '4'.

     ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
     ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
     ESC 3 : altchar composition  : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
     ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   There are two cases.

   ESC 0 seen while the alternate characters of an altchar or alt&rule
   composition are being read ends that part: the pair [ -1 -1 ] is
   stored in charbuf, cmp_status->length grows by 2, and the state
   becomes COMPOSING_CHAR.

   Anything else starts a new composition.  A composition still in
   progress is finished first.  Then the method and state are set from
   C1, the annotation header is stored with ADD_COMPOSITION_DATA
   (with 0 for both of its middle arguments), the length is set to
   MAX_ANNOTATION_LENGTH, the character and component counts are
   cleared, and coding->annotated is set.  */

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    if (! (c1 == '0'                                                       \
           && ((cmp_status->state == COMPOSING_COMPONENT_CHAR              \
                && cmp_status->method == COMPOSITION_WITH_ALTCHARS)        \
               || (cmp_status->state == COMPOSING_COMPONENT_RULE           \
                   && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)))) \
      {                                                                    \
        /* Start of a new composition.  */                                 \
        MAYBE_FINISH_COMPOSITION ();                                       \
        switch (c1)                                                        \
          {                                                                \
          case '0':                                                        \
            cmp_status->method = COMPOSITION_RELATIVE;                     \
            break;                                                         \
          case '2':                                                        \
            cmp_status->method = COMPOSITION_WITH_RULE;                    \
            break;                                                         \
          case '3':                                                        \
            cmp_status->method = COMPOSITION_WITH_ALTCHARS;                \
            break;                                                         \
          default:                                                         \
            cmp_status->method = COMPOSITION_WITH_RULE_ALTCHARS;           \
            break;                                                         \
          }                                                                \
        if (c1 <= '2')                                                     \
          cmp_status->state = COMPOSING_CHAR;                              \
        else                                                               \
          cmp_status->state = COMPOSING_COMPONENT_CHAR;                    \
        ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
        cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
        cmp_status->ncomps = 0;                                            \
        cmp_status->nchars = 0;                                            \
        coding->annotated = 1;                                             \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        /* End of the alternate characters.  */                            \
        charbuf[0] = -1;                                                   \
        charbuf[1] = -1;                                                   \
        charbuf += 2;                                                      \
        cmp_status->length += 2;                                           \
        cmp_status->state = COMPOSING_CHAR;                                \
      }                                                                    \
  } while (0)
3462
ec6d2bb8 3463
/* Handle the composition end sequence ESC 1.

   The sequence is rejected when no composed character has been
   stored (cmp_status->nchars is 0), or when "the state is
   COMPOSING_CHAR" and "the method is COMPOSITION_WITH_RULE" are both
   true or both false.  In that case a composition in progress is
   finished as invalid and control goes to invalid_code.

   Otherwise the annotation header, which starts cmp_status->length
   elements before charbuf, is completed: its first element (the
   negated length) is decreased by the number of components plus 2
   for an altchar composition, or by three times the number of
   components for an alt&rule composition, and its third element
   receives the number of characters.  That number is also added to
   char_offset, and the state becomes COMPOSING_NO.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    if (cmp_status->nchars == 0                                         \
        || ((cmp_status->state == COMPOSING_CHAR)                       \
            == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
      {                                                                 \
        MAYBE_FINISH_COMPOSITION ();                                    \
        goto invalid_code;                                              \
      }                                                                 \
    switch (cmp_status->method)                                         \
      {                                                                 \
      case COMPOSITION_WITH_ALTCHARS:                                   \
        charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;        \
        break;                                                          \
      case COMPOSITION_WITH_RULE_ALTCHARS:                              \
        charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;        \
        break;                                                          \
      default:                                                          \
        break;                                                          \
      }                                                                 \
    charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
    char_offset += cmp_status->nchars;                                  \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
3483
/* Append the pair [ -2 RULE ] to charbuf, add 2 to
   cmp_status->length, and step cmp_status->state back by one.
   `charbuf' and `cmp_status' are variables of the enclosing scope.  */

#define STORE_COMPOSITION_RULE(rule)    \
  do {                                  \
    *charbuf++ = -2;                    \
    *charbuf++ = rule;                  \
    cmp_status->state--;                \
    cmp_status->length += 2;            \
  } while (0)
ec6d2bb8 3493
e951386e
KH
/* Append C, a composed character or a component character, to
   charbuf and add 1 to cmp_status->length.  The character is counted
   in cmp_status->nchars when the state is COMPOSING_CHAR and in
   cmp_status->ncomps otherwise.

   The state is then advanced by one when the method is
   COMPOSITION_WITH_RULE, or when the method is
   COMPOSITION_WITH_RULE_ALTCHARS and the state is
   COMPOSING_COMPONENT_CHAR.  */

#define STORE_COMPOSITION_CHAR(c)                                       \
  do {                                                                  \
    *charbuf++ = (c);                                                   \
    cmp_status->length++;                                               \
    if (cmp_status->state != COMPOSING_CHAR)                            \
      cmp_status->ncomps++;                                             \
    else                                                                \
      cmp_status->nchars++;                                             \
    if (cmp_status->method == COMPOSITION_WITH_RULE                     \
        || (cmp_status->state == COMPOSING_COMPONENT_CHAR               \
            && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))   \
      cmp_status->state++;                                              \
  } while (0)
88993dfd 3510
d46c5b12 3511
4ed46869
KH
3512/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3513
b73bfc1c 3514static void
971de7fb 3515decode_coding_iso_2022 (struct coding_system *coding)
4ed46869 3516{
8f924df7
KH
3517 const unsigned char *src = coding->source + coding->consumed;
3518 const unsigned char *src_end = coding->source + coding->src_bytes;
3519 const unsigned char *src_base;
69a80ea3 3520 int *charbuf = coding->charbuf + coding->charbuf_used;
ad1746f5
KH
3521 /* We may produce two annotations (charset and composition) in one
3522 loop and one more charset annotation at the end. */
ff0dacd7 3523 int *charbuf_end
df80c7f0 3524 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
d311d28c 3525 ptrdiff_t consumed_chars = 0, consumed_chars_base;
df7492f9 3526 int multibytep = coding->src_multibyte;
4ed46869 3527 /* Charsets invoked to graphic plane 0 and 1 respectively. */
df7492f9
KH
3528 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3529 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
134b9549 3530 int charset_id_2, charset_id_3;
df7492f9
KH
3531 struct charset *charset;
3532 int c;
e951386e 3533 struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
66ebf983 3534 Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
d311d28c
PE
3535 ptrdiff_t char_offset = coding->produced_char;
3536 ptrdiff_t last_offset = char_offset;
ff0dacd7 3537 int last_id = charset_ascii;
2735d060 3538 int eol_dos =
0a9564cb 3539 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 3540 int byte_after_cr = -1;
e951386e 3541 int i;
df7492f9 3542
df7492f9 3543 setup_iso_safe_charsets (attrs);
1b3b981b 3544 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
b73bfc1c 3545
e951386e
KH
3546 if (cmp_status->state != COMPOSING_NO)
3547 {
15cbd324
EZ
3548 if (charbuf_end - charbuf < cmp_status->length)
3549 abort ();
e951386e
KH
3550 for (i = 0; i < cmp_status->length; i++)
3551 *charbuf++ = cmp_status->carryover[i];
3552 coding->annotated = 1;
3553 }
3554
b73bfc1c 3555 while (1)
4ed46869 3556 {
cf299835 3557 int c1, c2, c3;
b73bfc1c
KH
3558
3559 src_base = src;
df7492f9
KH
3560 consumed_chars_base = consumed_chars;
3561
3562 if (charbuf >= charbuf_end)
b71f6f73
KH
3563 {
3564 if (byte_after_cr >= 0)
3565 src_base--;
3566 break;
3567 }
df7492f9 3568
119852e7
KH
3569 if (byte_after_cr >= 0)
3570 c1 = byte_after_cr, byte_after_cr = -1;
3571 else
3572 ONE_MORE_BYTE (c1);
065e3595
KH
3573 if (c1 < 0)
3574 goto invalid_code;
4ed46869 3575
e951386e 3576 if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
4ed46869 3577 {
e951386e
KH
3578 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3579 char_offset++;
3580 CODING_ISO_EXTSEGMENT_LEN (coding)--;
3581 continue;
3582 }
3583
3584 if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3585 {
3586 if (c1 == ISO_CODE_ESC)
ec6d2bb8 3587 {
e951386e
KH
3588 if (src + 1 >= src_end)
3589 goto no_more_source;
3590 *charbuf++ = ISO_CODE_ESC;
3591 char_offset++;
3592 if (src[0] == '%' && src[1] == '@')
df7492f9 3593 {
e951386e
KH
3594 src += 2;
3595 consumed_chars += 2;
3596 char_offset += 2;
3597 /* We are sure charbuf can contain two more chars. */
3598 *charbuf++ = '%';
3599 *charbuf++ = '@';
3600 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
df7492f9 3601 }
4ed46869 3602 }
e951386e
KH
3603 else
3604 {
3605 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3606 char_offset++;
3607 }
3608 continue;
3609 }
3610
3611 if ((cmp_status->state == COMPOSING_RULE
3612 || cmp_status->state == COMPOSING_COMPONENT_RULE)
3613 && c1 != ISO_CODE_ESC)
3614 {
66ebf983 3615 int rule;
e951386e 3616
66ebf983 3617 DECODE_COMPOSITION_RULE (rule);
e951386e
KH
3618 STORE_COMPOSITION_RULE (rule);
3619 continue;
3620 }
3621
3622 /* We produce at most one character. */
3623 switch (iso_code_class [c1])
3624 {
3625 case ISO_0x20_or_0x7F:
df7492f9
KH
3626 if (charset_id_0 < 0
3627 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
781d7a48
KH
3628 /* This is SPACE or DEL. */
3629 charset = CHARSET_FROM_ID (charset_ascii);
3630 else
3631 charset = CHARSET_FROM_ID (charset_id_0);
3632 break;
4ed46869
KH
3633
3634 case ISO_graphic_plane_0:
134b9549
KH
3635 if (charset_id_0 < 0)
3636 charset = CHARSET_FROM_ID (charset_ascii);
3637 else
3638 charset = CHARSET_FROM_ID (charset_id_0);
4ed46869
KH
3639 break;
3640
3641 case ISO_0xA0_or_0xFF:
df7492f9
KH
3642 if (charset_id_1 < 0
3643 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3644 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3645 goto invalid_code;
4ed46869
KH
3646 /* This is a graphic character, we fall down ... */
3647
3648 case ISO_graphic_plane_1:
df7492f9
KH
3649 if (charset_id_1 < 0)
3650 goto invalid_code;
3651 charset = CHARSET_FROM_ID (charset_id_1);
4ed46869
KH
3652 break;
3653
df7492f9 3654 case ISO_control_0:
2735d060 3655 if (eol_dos && c1 == '\r')
119852e7 3656 ONE_MORE_BYTE (byte_after_cr);
df7492f9
KH
3657 MAYBE_FINISH_COMPOSITION ();
3658 charset = CHARSET_FROM_ID (charset_ascii);
4ed46869
KH
3659 break;
3660
df7492f9 3661 case ISO_control_1:
df7492f9
KH
3662 goto invalid_code;
3663
4ed46869 3664 case ISO_shift_out:
df7492f9
KH
3665 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3666 || CODING_ISO_DESIGNATION (coding, 1) < 0)
3667 goto invalid_code;
3668 CODING_ISO_INVOCATION (coding, 0) = 1;
3669 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3670 continue;
4ed46869
KH
3671
3672 case ISO_shift_in:
df7492f9
KH
3673 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3674 goto invalid_code;
3675 CODING_ISO_INVOCATION (coding, 0) = 0;
3676 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3677 continue;
4ed46869
KH
3678
3679 case ISO_single_shift_2_7:
a63dba42
KH
3680 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3681 goto invalid_code;
4ed46869 3682 case ISO_single_shift_2:
df7492f9
KH
3683 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3684 goto invalid_code;
4ed46869
KH
3685 /* SS2 is handled as an escape sequence of ESC 'N' */
3686 c1 = 'N';
3687 goto label_escape_sequence;
3688
3689 case ISO_single_shift_3:
df7492f9
KH
3690 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3691 goto invalid_code;
4ed46869
KH
3692 /* SS3 is handled as an escape sequence of ESC 'O' */
3693 c1 = 'O';
3694 goto label_escape_sequence;
3695
3696 case ISO_control_sequence_introducer:
3697 /* CSI is handled as an escape sequence of ESC '[' ... */
3698 c1 = '[';
3699 goto label_escape_sequence;
3700
3701 case ISO_escape:
3702 ONE_MORE_BYTE (c1);
3703 label_escape_sequence:
df7492f9 3704 /* Escape sequences handled here are invocation,
4ed46869
KH
3705 designation, direction specification, and character
3706 composition specification. */
3707 switch (c1)
3708 {
3709 case '&': /* revision of following character set */
3710 ONE_MORE_BYTE (c1);
3711 if (!(c1 >= '@' && c1 <= '~'))
df7492f9 3712 goto invalid_code;
4ed46869
KH
3713 ONE_MORE_BYTE (c1);
3714 if (c1 != ISO_CODE_ESC)
df7492f9 3715 goto invalid_code;
4ed46869
KH
3716 ONE_MORE_BYTE (c1);
3717 goto label_escape_sequence;
3718
3719 case '$': /* designation of 2-byte character set */
df7492f9
KH
3720 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3721 goto invalid_code;
134b9549
KH
3722 {
3723 int reg, chars96;
3724
3725 ONE_MORE_BYTE (c1);
3726 if (c1 >= '@' && c1 <= 'B')
3727 { /* designation of JISX0208.1978, GB2312.1980,
88993dfd 3728 or JISX0208.1980 */
134b9549
KH
3729 reg = 0, chars96 = 0;
3730 }
3731 else if (c1 >= 0x28 && c1 <= 0x2B)
3732 { /* designation of DIMENSION2_CHARS94 character set */
3733 reg = c1 - 0x28, chars96 = 0;
3734 ONE_MORE_BYTE (c1);
3735 }
3736 else if (c1 >= 0x2C && c1 <= 0x2F)
3737 { /* designation of DIMENSION2_CHARS96 character set */
3738 reg = c1 - 0x2C, chars96 = 1;
3739 ONE_MORE_BYTE (c1);
3740 }
3741 else
3742 goto invalid_code;
3743 DECODE_DESIGNATION (reg, 2, chars96, c1);
3744 /* We must update these variables now. */
3745 if (reg == 0)
3746 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3747 else if (reg == 1)
3748 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3749 if (chars96 < 0)
3750 goto invalid_code;
3751 }
b73bfc1c 3752 continue;
4ed46869
KH
3753
3754 case 'n': /* invocation of locking-shift-2 */
df7492f9
KH
3755 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3756 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3757 goto invalid_code;
3758 CODING_ISO_INVOCATION (coding, 0) = 2;
3759 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3760 continue;
4ed46869
KH
3761
3762 case 'o': /* invocation of locking-shift-3 */
df7492f9
KH
3763 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3764 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3765 goto invalid_code;
3766 CODING_ISO_INVOCATION (coding, 0) = 3;
3767 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3768 continue;
4ed46869
KH
3769
3770 case 'N': /* invocation of single-shift-2 */
df7492f9
KH
3771 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3772 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3773 goto invalid_code;
134b9549
KH
3774 charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3775 if (charset_id_2 < 0)
3776 charset = CHARSET_FROM_ID (charset_ascii);
3777 else
3778 charset = CHARSET_FROM_ID (charset_id_2);
b73bfc1c 3779 ONE_MORE_BYTE (c1);
e7046a18 3780 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
df7492f9 3781 goto invalid_code;
4ed46869
KH
3782 break;
3783
3784 case 'O': /* invocation of single-shift-3 */
df7492f9
KH
3785 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3786 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3787 goto invalid_code;
134b9549
KH
3788 charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3789 if (charset_id_3 < 0)
3790 charset = CHARSET_FROM_ID (charset_ascii);
3791 else
3792 charset = CHARSET_FROM_ID (charset_id_3);
b73bfc1c 3793 ONE_MORE_BYTE (c1);
e7046a18 3794 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
df7492f9 3795 goto invalid_code;
4ed46869
KH
3796 break;
3797
ec6d2bb8 3798 case '0': case '2': case '3': case '4': /* start composition */
df7492f9
KH
3799 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3800 goto invalid_code;
e951386e
KH
3801 if (last_id != charset_ascii)
3802 {
3803 ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3804 last_id = charset_ascii;
3805 last_offset = char_offset;
3806 }
ec6d2bb8 3807 DECODE_COMPOSITION_START (c1);
b73bfc1c 3808 continue;
4ed46869 3809
ec6d2bb8 3810 case '1': /* end composition */
e951386e 3811 if (cmp_status->state == COMPOSING_NO)
df7492f9
KH
3812 goto invalid_code;
3813 DECODE_COMPOSITION_END ();
b73bfc1c 3814 continue;
4ed46869
KH
3815
3816 case '[': /* specification of direction */
de59072a 3817 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
df7492f9 3818 goto invalid_code;
4ed46869 3819 /* For the moment, nested direction is not supported.
d46c5b12 3820 So, `coding->mode & CODING_MODE_DIRECTION' zero means
ad1746f5 3821 left-to-right, and nonzero means right-to-left. */
4ed46869
KH
3822 ONE_MORE_BYTE (c1);
3823 switch (c1)
3824 {
3825 case ']': /* end of the current direction */
d46c5b12 3826 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869
KH
3827
3828 case '0': /* end of the current direction */
3829 case '1': /* start of left-to-right direction */
3830 ONE_MORE_BYTE (c1);
3831 if (c1 == ']')
d46c5b12 3832 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869 3833 else
df7492f9 3834 goto invalid_code;
4ed46869
KH
3835 break;
3836
3837 case '2': /* start of right-to-left direction */
3838 ONE_MORE_BYTE (c1);
3839 if (c1 == ']')
d46c5b12 3840 coding->mode |= CODING_MODE_DIRECTION;
4ed46869 3841 else
df7492f9 3842 goto invalid_code;
4ed46869
KH
3843 break;
3844
3845 default:
df7492f9 3846 goto invalid_code;
4ed46869 3847 }
b73bfc1c 3848 continue;
4ed46869 3849
103e0180 3850 case '%':
103e0180
KH
3851 ONE_MORE_BYTE (c1);
3852 if (c1 == '/')
3853 {
3854 /* CTEXT extended segment:
3855 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3856 We keep these bytes as is for the moment.
3857 They may be decoded by post-read-conversion. */
3858 int dim, M, L;
4776e638 3859 int size;
8f924df7 3860
103e0180 3861 ONE_MORE_BYTE (dim);
7a84eee5 3862 if (dim < '0' || dim > '4')
e951386e 3863 goto invalid_code;
103e0180 3864 ONE_MORE_BYTE (M);
e951386e
KH
3865 if (M < 128)
3866 goto invalid_code;
103e0180 3867 ONE_MORE_BYTE (L);
e951386e
KH
3868 if (L < 128)
3869 goto invalid_code;
103e0180 3870 size = ((M - 128) * 128) + (L - 128);
e951386e 3871 if (charbuf + 6 > charbuf_end)
4776e638
KH
3872 goto break_loop;
3873 *charbuf++ = ISO_CODE_ESC;
3874 *charbuf++ = '%';
3875 *charbuf++ = '/';
3876 *charbuf++ = dim;
3877 *charbuf++ = BYTE8_TO_CHAR (M);
3878 *charbuf++ = BYTE8_TO_CHAR (L);
e951386e 3879 CODING_ISO_EXTSEGMENT_LEN (coding) = size;
103e0180
KH
3880 }
3881 else if (c1 == 'G')
3882 {
103e0180
KH
3883 /* XFree86 extension for embedding UTF-8 in CTEXT:
3884 ESC % G --UTF-8-BYTES-- ESC % @
3885 We keep these bytes as is for the moment.
3886 They may be decoded by post-read-conversion. */
e951386e 3887 if (charbuf + 3 > charbuf_end)
4776e638 3888 goto break_loop;
e951386e
KH
3889 *charbuf++ = ISO_CODE_ESC;
3890 *charbuf++ = '%';
3891 *charbuf++ = 'G';
3892 CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
103e0180
KH
3893 }
3894 else
4776e638 3895 goto invalid_code;
103e0180 3896 continue;
4776e638 3897 break;
103e0180 3898
4ed46869 3899 default:
df7492f9
KH
3900 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3901 goto invalid_code;
134b9549
KH
3902 {
3903 int reg, chars96;
3904
3905 if (c1 >= 0x28 && c1 <= 0x2B)
3906 { /* designation of DIMENSION1_CHARS94 character set */
3907 reg = c1 - 0x28, chars96 = 0;
3908 ONE_MORE_BYTE (c1);
3909 }
3910 else if (c1 >= 0x2C && c1 <= 0x2F)
3911 { /* designation of DIMENSION1_CHARS96 character set */
3912 reg = c1 - 0x2C, chars96 = 1;
3913 ONE_MORE_BYTE (c1);
3914 }
3915 else
3916 goto invalid_code;
3917 DECODE_DESIGNATION (reg, 1, chars96, c1);
3918 /* We must update these variables now. */
3919 if (reg == 0)
3920 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3921 else if (reg == 1)
3922 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3923 if (chars96 < 0)
3924 goto invalid_code;
3925 }
b73bfc1c 3926 continue;
4ed46869 3927 }
413bb2db
PE
3928 break;
3929
3930 default:
3931 abort ();
b73bfc1c 3932 }
4ed46869 3933
e951386e
KH
3934 if (cmp_status->state == COMPOSING_NO
3935 && charset->id != charset_ascii
ff0dacd7
KH
3936 && last_id != charset->id)
3937 {
3938 if (last_id != charset_ascii)
69a80ea3 3939 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
ff0dacd7
KH
3940 last_id = charset->id;
3941 last_offset = char_offset;
3942 }
3943
b73bfc1c 3944 /* Now we know CHARSET and 1st position code C1 of a character.
cf299835
KH
3945 Produce a decoded character while getting 2nd and 3rd
3946 position codes C2, C3 if necessary. */
df7492f9 3947 if (CHARSET_DIMENSION (charset) > 1)
b73bfc1c
KH
3948 {
3949 ONE_MORE_BYTE (c2);
cf299835
KH
3950 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3951 || ((c1 & 0x80) != (c2 & 0x80)))
b73bfc1c 3952 /* C2 is not in a valid range. */
df7492f9 3953 goto invalid_code;
cf299835
KH
3954 if (CHARSET_DIMENSION (charset) == 2)
3955 c1 = (c1 << 8) | c2;
3956 else
df7492f9 3957 {
cf299835
KH
3958 ONE_MORE_BYTE (c3);
3959 if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3960 || ((c1 & 0x80) != (c3 & 0x80)))
3961 /* C3 is not in a valid range. */
df7492f9 3962 goto invalid_code;
cf299835 3963 c1 = (c1 << 16) | (c2 << 8) | c2;
df7492f9
KH
3964 }
3965 }
cf299835 3966 c1 &= 0x7F7F7F;
df7492f9
KH
3967 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3968 if (c < 0)
3969 {
3970 MAYBE_FINISH_COMPOSITION ();
3971 for (; src_base < src; src_base++, char_offset++)
3972 {
3973 if (ASCII_BYTE_P (*src_base))
3974 *charbuf++ = *src_base;
3975 else
3976 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3977 }
3978 }
e951386e 3979 else if (cmp_status->state == COMPOSING_NO)
df7492f9
KH
3980 {
3981 *charbuf++ = c;
3982 char_offset++;
4ed46869 3983 }
e951386e
KH
3984 else if ((cmp_status->state == COMPOSING_CHAR
3985 ? cmp_status->nchars
3986 : cmp_status->ncomps)
3987 >= MAX_COMPOSITION_COMPONENTS)
781d7a48 3988 {
e951386e
KH
3989 /* Too long composition. */
3990 MAYBE_FINISH_COMPOSITION ();
3991 *charbuf++ = c;
3992 char_offset++;
4ed46869 3993 }
e951386e
KH
3994 else
3995 STORE_COMPOSITION_CHAR (c);
4ed46869
KH
3996 continue;
3997
df7492f9
KH
3998 invalid_code:
3999 MAYBE_FINISH_COMPOSITION ();
4ed46869 4000 src = src_base;
df7492f9
KH
4001 consumed_chars = consumed_chars_base;
4002 ONE_MORE_BYTE (c);
065e3595 4003 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 4004 char_offset++;
df7492f9 4005 coding->errors++;
4776e638
KH
4006 continue;
4007
4008 break_loop:
4009 break;
4ed46869 4010 }
fb88bf2d 4011
df7492f9 4012 no_more_source:
e951386e
KH
4013 if (cmp_status->state != COMPOSING_NO)
4014 {
4015 if (coding->mode & CODING_MODE_LAST_BLOCK)
4016 MAYBE_FINISH_COMPOSITION ();
4017 else
4018 {
4019 charbuf -= cmp_status->length;
4020 for (i = 0; i < cmp_status->length; i++)
4021 cmp_status->carryover[i] = charbuf[i];
4022 }
4023 }
4024 else if (last_id != charset_ascii)
69a80ea3 4025 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
4026 coding->consumed_char += consumed_chars_base;
4027 coding->consumed = src_base - coding->source;
4028 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
4029}
4030
b73bfc1c 4031
f4dee582 4032/* ISO2022 encoding stuff. */
4ed46869
KH
4033
4034/*
f4dee582 4035 It is not enough to say just "ISO2022" on encoding, we have to
df7492f9 4036 specify more details. In Emacs, each coding system of ISO2022
4ed46869 4037 variant has the following specifications:
df7492f9 4038 1. Initial designation to G0 thru G3.
4ed46869
KH
4039 2. Allows short-form designation?
4040 3. ASCII should be designated to G0 before control characters?
4041 4. ASCII should be designated to G0 at end of line?
4042 5. 7-bit environment or 8-bit environment?
4043 6. Use locking-shift?
4044 7. Use Single-shift?
4045 And the following two are only for Japanese:
4046 8. Use ASCII in place of JIS0201-1976-Roman?
4047 9. Use JISX0208-1983 in place of JISX0208-1978?
df7492f9
KH
4048 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4049 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
f4dee582 4050 details.
4ed46869
KH
4051*/
4052
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, advance DST, and record the designation in CODING.

   If CODING has CODING_ISO_FLAG_REVISION set and CHARSET has a
   non-negative revision number, `ESC & <'@' + revision>' is emitted
   first.  For a multi-dimensional 94-charset designated to register
   0 whose <final-char> is '@', 'A', or 'B', the intermediate
   character is omitted (short form) unless CODING has
   CODING_ISO_FLAG_LONG_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    /* Intermediate characters, indexed by graphic register.  */        \
    const char *inter_94 = "()*+";                                      \
    const char *inter_96 = ",-./";                                      \
    unsigned char final_byte = CHARSET_ISO_FINAL (charset);             \
    int is_94 = ! CHARSET_ISO_CHARS_96 (charset);                       \
    int rev = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)   \
               ? CHARSET_ISO_REVISION (charset)                         \
               : -1);                                                   \
                                                                        \
    if (rev >= 0)                                                       \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + rev);                                      \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        if (! is_94)                                                    \
          EMIT_ONE_ASCII_BYTE (inter_96[reg]);                          \
        else if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM) \
                 || reg != 0                                            \
                 || final_byte < '@' || final_byte > 'B')               \
          /* The short form is not applicable; emit the intermediate    \
             character explicitly.  */                                  \
          EMIT_ONE_ASCII_BYTE (inter_94[reg]);                          \
      }                                                                 \
    else                                                                \
      {                                                                 \
        int b = is_94 ? inter_94[reg] : inter_96[reg];                  \
        EMIT_ONE_ASCII_BYTE (b);                                        \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (final_byte);                                   \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4100
df7492f9 4101
4ed46869
KH
/* The following two macros emit the ISO2022 single-shift functions
   (single-shift-2 and single-shift-3) and set the single-shifting
   state of CODING.  A coding system with CODING_ISO_FLAG_SEVEN_BITS
   gets the two-byte escape sequence; otherwise the one-byte control
   code is emitted.  */

/* Single-shift-2: `ESC N' or ISO_CODE_SS2.  */
#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)


/* Single-shift-3: `ESC O' or ISO_CODE_SS3.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4124
df7492f9 4125
4ed46869
KH
/* The following four macros produce codes (control character or
   escape sequence) for the ISO2022 locking-shift functions (shift-in,
   shift-out, locking-shift-2, and locking-shift-3), and record in
   CODING which graphic register is now invoked to graphic plane 0.  */

/* Shift-in: emit ISO_CODE_SI and invoke G0.  */
#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
  } while (0)


/* Shift-out: emit ISO_CODE_SO and invoke G1.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
  } while (0)


/* Locking-shift-2: emit `ESC n' and invoke G2.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
  } while (0)


/* Locking-shift-3: emit `ESC o' and invoke G3.  The final byte must
   be 'o', not 'n': `ESC n' is locking-shift-2, and the ISO2022
   decoder in this file accepts `ESC o' as locking-shift-3.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4156
df7492f9 4157
f4dee582
RS
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be an lvalue: when CODING has
   CODING_ISO_FLAG_USE_ROMAN set and CHARSET is ASCII, CHARSET is
   replaced by the charset whose id is charset_jisx0201_roman.

   The body loops until the character has been emitted: if CHARSET is
   invoked to neither graphic plane (and no single shift is pending),
   encode_invocation_designation is called to emit the needed
   sequences, and the checks are repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single-shift function was just emitted; it applies to      \
           this character only.  */                                     \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        /* CHARSET is invoked to graphic plane 0.  */                   \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        /* CHARSET is invoked to graphic plane 1.  */                   \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4200
df7492f9 4201
f4dee582
RS
/* Produce codes for a DIMENSION2 character whose character set is
   CHARSET and whose position-codes are C1 and C2.  Designation and
   invocation codes are also produced in advance if necessary.

   CHARSET must be an lvalue: when CODING has
   CODING_ISO_FLAG_USE_OLDJIS set and CHARSET is charset_jisx0208,
   CHARSET is replaced by the charset whose id is
   charset_jisx0208_1978.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int cs_id = CHARSET_ID (charset);                                   \
                                                                        \
    if (cs_id == charset_jisx0208                                       \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        cs_id = charset_jisx0208_1978;                                  \
        charset = CHARSET_FROM_ID (cs_id);                              \
      }                                                                 \
                                                                        \
    /* A pending single shift applies to this character only.  */       \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)) \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        else                                                            \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* CHARSET is invoked to graphic plane 0.  */                       \
    if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 0))                \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* CHARSET is invoked to graphic plane 1.  */                       \
    if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 1))                \
      {                                                                 \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* CHARSET is not yet invoked to any graphic plane.  Emit the       \
       invocation (and, if needed, the designation) first, then go      \
       around the loop again to produce the character itself.  */       \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
4244
05e6f5dc 4245
df7492f9
KH
/* Produce codes for character C of CHARSET.  The code point of C in
   CHARSET is obtained with CODING_ENCODE_CHAR and then handed to the
   DIMENSION1 or DIMENSION2 macro; for the latter the code is split
   into its upper part (CODE >> 8) and its low byte.  */

#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    unsigned code;                                                         \
                                                                           \
    CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
  } while (0)
bdd9fb48 4256
05e6f5dc 4257
4ed46869 4258/* Produce designation and invocation codes at a place pointed by DST
df7492f9 4259 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
4ed46869
KH
4260 Return new DST. */
4261
e2f1bab9 4262static unsigned char *
cf84bb53
JB
4263encode_invocation_designation (struct charset *charset,
4264 struct coding_system *coding,
d311d28c 4265 unsigned char *dst, ptrdiff_t *p_nchars)
4ed46869 4266{
df7492f9 4267 int multibytep = coding->dst_multibyte;
d311d28c 4268 ptrdiff_t produced_chars = *p_nchars;
4ed46869 4269 int reg; /* graphic register number */
df7492f9 4270 int id = CHARSET_ID (charset);
4ed46869
KH
4271
4272 /* At first, check designations. */
4273 for (reg = 0; reg < 4; reg++)
df7492f9 4274 if (id == CODING_ISO_DESIGNATION (coding, reg))
4ed46869
KH
4275 break;
4276
4277 if (reg >= 4)
4278 {
4279 /* CHARSET is not yet designated to any graphic registers. */
4280 /* At first check the requested designation. */
df7492f9
KH
4281 reg = CODING_ISO_REQUEST (coding, id);
4282 if (reg < 0)
1ba9e4ab
KH
4283 /* Since CHARSET requests no special designation, designate it
4284 to graphic register 0. */
4ed46869
KH
4285 reg = 0;
4286
4287 ENCODE_DESIGNATION (charset, reg, coding);
4288 }
4289
df7492f9
KH
4290 if (CODING_ISO_INVOCATION (coding, 0) != reg
4291 && CODING_ISO_INVOCATION (coding, 1) != reg)
4ed46869
KH
4292 {
4293 /* Since the graphic register REG is not invoked to any graphic
4294 planes, invoke it to graphic plane 0. */
4295 switch (reg)
4296 {
4297 case 0: /* graphic register 0 */
4298 ENCODE_SHIFT_IN;
4299 break;
4300
4301 case 1: /* graphic register 1 */
4302 ENCODE_SHIFT_OUT;
4303 break;
4304
4305 case 2: /* graphic register 2 */
df7492f9 4306 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4ed46869
KH
4307 ENCODE_SINGLE_SHIFT_2;
4308 else
4309 ENCODE_LOCKING_SHIFT_2;
4310 break;
4311
4312 case 3: /* graphic register 3 */
df7492f9 4313 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4ed46869
KH
4314 ENCODE_SINGLE_SHIFT_3;
4315 else
4316 ENCODE_LOCKING_SHIFT_3;
4317 break;
4318 }
4319 }
b73bfc1c 4320
df7492f9 4321 *p_nchars = produced_chars;
4ed46869
KH
4322 return dst;
4323}
4324
4ed46869
KH
4325
/* Produce codes for designation and invocation to reset the graphic
   planes and registers to initial state: shift in if graphic plane 0
   does not have register 0 invoked, then re-designate every register
   that has an initial charset but currently holds a different one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reg;                                                            \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    for (reg = 0; reg < 4; reg++)                                       \
      {                                                                 \
        struct charset *charset;                                        \
                                                                        \
        /* Nothing to do if REG has no initial charset or already       \
           holds it.  */                                                \
        if (CODING_ISO_INITIAL (coding, reg) < 0                        \
            || (CODING_ISO_DESIGNATION (coding, reg)                    \
                == CODING_ISO_INITIAL (coding, reg)))                   \
          continue;                                                     \
        charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg));   \
        ENCODE_DESIGNATION (charset, reg, coding);                      \
      }                                                                 \
  } while (0)
4344
df7492f9 4345
bdd9fb48 4346/* Produce designation sequences of charsets in the line started from
5eb05ea3
KH
4347 CHARBUF to a place pointed by DST, and return the number of
4348 produced bytes. DST should not directly point a buffer text area
4349 which may be relocated by char_charset call.
bdd9fb48
KH
4350
4351 If the current block ends before any end-of-line, we may fail to
d46c5b12
KH
4352 find all the necessary designations. */
4353
6e6c82a4 4354static ptrdiff_t
5eb05ea3
KH
4355encode_designation_at_bol (struct coding_system *coding,
4356 int *charbuf, int *charbuf_end,
461c2ab9 4357 unsigned char *dst)
e0e989f6 4358{
75a3b399 4359 unsigned char *orig = dst;
df7492f9 4360 struct charset *charset;
bdd9fb48
KH
4361 /* Table of charsets to be designated to each graphic register. */
4362 int r[4];
df7492f9 4363 int c, found = 0, reg;
d311d28c 4364 ptrdiff_t produced_chars = 0;
df7492f9
KH
4365 int multibytep = coding->dst_multibyte;
4366 Lisp_Object attrs;
4367 Lisp_Object charset_list;
4368
4369 attrs = CODING_ID_ATTRS (coding->id);
4370 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4371 if (EQ (charset_list, Qiso_2022))
4372 charset_list = Viso_2022_charset_list;
bdd9fb48
KH
4373
4374 for (reg = 0; reg < 4; reg++)
4375 r[reg] = -1;
4376
5eb05ea3 4377 while (charbuf < charbuf_end && found < 4)
e0e989f6 4378 {
df7492f9
KH
4379 int id;
4380
4381 c = *charbuf++;
b73bfc1c
KH
4382 if (c == '\n')
4383 break;
df7492f9
KH
4384 charset = char_charset (c, charset_list, NULL);
4385 id = CHARSET_ID (charset);
4386 reg = CODING_ISO_REQUEST (coding, id);
4387 if (reg >= 0 && r[reg] < 0)
bdd9fb48
KH
4388 {
4389 found++;
df7492f9 4390 r[reg] = id;
bdd9fb48 4391 }
bdd9fb48
KH
4392 }
4393
4394 if (found)
4395 {
4396 for (reg = 0; reg < 4; reg++)
4397 if (r[reg] >= 0
df7492f9
KH
4398 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4399 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
e0e989f6 4400 }
b73bfc1c 4401
5eb05ea3 4402 return dst - orig;
e0e989f6
KH
4403}
4404
4ed46869
KH
4405/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
4406
df7492f9 4407static int
971de7fb 4408encode_coding_iso_2022 (struct coding_system *coding)
4ed46869 4409{
df7492f9
KH
4410 int multibytep = coding->dst_multibyte;
4411 int *charbuf = coding->charbuf;
4412 int *charbuf_end = charbuf + coding->charbuf_used;
4413 unsigned char *dst = coding->destination + coding->produced;
4414 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4415 int safe_room = 16;
4416 int bol_designation
4417 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4418 && CODING_ISO_BOL (coding));
d311d28c 4419 ptrdiff_t produced_chars = 0;
df7492f9
KH
4420 Lisp_Object attrs, eol_type, charset_list;
4421 int ascii_compatible;
b73bfc1c 4422 int c;
ff0dacd7 4423 int preferred_charset_id = -1;
05e6f5dc 4424
24a73b0a 4425 CODING_GET_INFO (coding, attrs, charset_list);
0a9564cb 4426 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
24a73b0a
KH
4427 if (VECTORP (eol_type))
4428 eol_type = Qunix;
4429
004068e4 4430 setup_iso_safe_charsets (attrs);
ff0dacd7 4431 /* Charset list may have been changed. */
287c57d7 4432 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
1b3b981b 4433 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
0eecad43 4434
a552b35a
KH
4435 ascii_compatible
4436 = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4437 && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4438 | CODING_ISO_FLAG_LOCKING_SHIFT)));
bdd9fb48 4439
df7492f9 4440 while (charbuf < charbuf_end)
4ed46869 4441 {
df7492f9 4442 ASSURE_DESTINATION (safe_room);
b73bfc1c 4443
df7492f9 4444 if (bol_designation)
b73bfc1c 4445 {
bdd9fb48 4446 /* We have to produce designation sequences if any now. */
5eb05ea3
KH
4447 unsigned char desig_buf[16];
4448 int nbytes;
8f50130c 4449 ptrdiff_t offset;
5eb05ea3
KH
4450
4451 charset_map_loaded = 0;
4452 nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4453 desig_buf);
4454 if (charset_map_loaded
4455 && (offset = coding_set_destination (coding)))
4456 {
4457 dst += offset;
4458 dst_end += offset;
4459 }
4460 memcpy (dst, desig_buf, nbytes);
4461 dst += nbytes;
df7492f9 4462 /* We are sure that designation sequences are all ASCII bytes. */
5eb05ea3
KH
4463 produced_chars += nbytes;
4464 bol_designation = 0;
4465 ASSURE_DESTINATION (safe_room);
e0e989f6
KH
4466 }
4467
df7492f9 4468 c = *charbuf++;
ec6d2bb8 4469
ff0dacd7
KH
4470 if (c < 0)
4471 {
4472 /* Handle an annotation. */
4473 switch (*charbuf)
ec6d2bb8 4474 {
ff0dacd7
KH
4475 case CODING_ANNOTATE_COMPOSITION_MASK:
4476 /* Not yet implemented. */
4477 break;
4478 case CODING_ANNOTATE_CHARSET_MASK:
cf7dfdf5 4479 preferred_charset_id = charbuf[2];
ff0dacd7
KH
4480 if (preferred_charset_id >= 0
4481 && NILP (Fmemq (make_number (preferred_charset_id),
4482 charset_list)))
4483 preferred_charset_id = -1;
4484 break;
4485 default:
4486 abort ();
4ed46869 4487 }
ff0dacd7
KH
4488 charbuf += -c - 1;
4489 continue;
4ed46869 4490 }
ec6d2bb8 4491
b73bfc1c
KH
4492 /* Now encode the character C. */
4493 if (c < 0x20 || c == 0x7F)
4494 {
df7492f9
KH
4495 if (c == '\n'
4496 || (c == '\r' && EQ (eol_type, Qmac)))
19a8d9e0 4497 {
df7492f9
KH
4498 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4499 ENCODE_RESET_PLANE_AND_REGISTER ();
4500 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
b73bfc1c 4501 {
df7492f9
KH
4502 int i;
4503
4504 for (i = 0; i < 4; i++)
4505 CODING_ISO_DESIGNATION (coding, i)
4506 = CODING_ISO_INITIAL (coding, i);
b73bfc1c 4507 }
df7492f9
KH
4508 bol_designation
4509 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
19a8d9e0 4510 }
df7492f9
KH
4511 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4512 ENCODE_RESET_PLANE_AND_REGISTER ();
4513 EMIT_ONE_ASCII_BYTE (c);
4ed46869 4514 }
df7492f9 4515 else if (ASCII_CHAR_P (c))
88993dfd 4516 {
df7492f9
KH
4517 if (ascii_compatible)
4518 EMIT_ONE_ASCII_BYTE (c);
93dec019 4519 else
19a8d9e0 4520 {
bf16eb23
KH
4521 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4522 ENCODE_ISO_CHARACTER (charset, c);
19a8d9e0 4523 }
4ed46869 4524 }
16eafb5d 4525 else if (CHAR_BYTE8_P (c))
88993dfd 4526 {
16eafb5d
KH
4527 c = CHAR_TO_BYTE8 (c);
4528 EMIT_ONE_BYTE (c);
88993dfd 4529 }
b73bfc1c 4530 else
df7492f9 4531 {
ff0dacd7 4532 struct charset *charset;
b73bfc1c 4533
ff0dacd7
KH
4534 if (preferred_charset_id >= 0)
4535 {
5eb05ea3
KH
4536 int result;
4537
ff0dacd7 4538 charset = CHARSET_FROM_ID (preferred_charset_id);
5eb05ea3
KH
4539 CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4540 if (! result)
4541 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4542 NULL, charset);
ff0dacd7
KH
4543 }
4544 else
5eb05ea3
KH
4545 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4546 NULL, charset);
df7492f9
KH
4547 if (!charset)
4548 {
41cbe562
KH
4549 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4550 {
4551 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4552 charset = CHARSET_FROM_ID (charset_ascii);
4553 }
4554 else
4555 {
4556 c = coding->default_char;
5eb05ea3
KH
4557 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4558 charset_list, NULL, charset);
41cbe562 4559 }
df7492f9
KH
4560 }
4561 ENCODE_ISO_CHARACTER (charset, c);
4562 }
84fbb8a0 4563 }
b73bfc1c 4564
df7492f9
KH
4565 if (coding->mode & CODING_MODE_LAST_BLOCK
4566 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4567 {
4568 ASSURE_DESTINATION (safe_room);
4569 ENCODE_RESET_PLANE_AND_REGISTER ();
4570 }
065e3595 4571 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4572 CODING_ISO_BOL (coding) = bol_designation;
4573 coding->produced_char += produced_chars;
4574 coding->produced = dst - coding->destination;
4575 return 0;
4ed46869
KH
4576}
4577
4578\f
df7492f9 4579/*** 8. SJIS and BIG5 handlers ***/
4ed46869 4580
df7492f9 4581/* Although SJIS and BIG5 are not ISO's coding system, they are used
4ed46869
KH
4582 quite widely. So, for the moment, Emacs supports them in the bare
4583 C code. But, in the future, they may be supported only by CCL. */
4584
4585/* SJIS is a coding system encoding three character sets: ASCII, right
4586 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
4587 as is. A character of charset katakana-jisx0201 is encoded by
4588 "position-code + 0x80". A character of charset japanese-jisx0208
4589 is encoded in 2-byte but two position-codes are divided and shifted
df7492f9 4590 so that they fit in the ranges below.
4ed46869
KH
4591
4592 --- CODE RANGE of SJIS ---
4593 (character set) (range)
4594 ASCII 0x00 .. 0x7F
df7492f9 4595 KATAKANA-JISX0201 0xA1 .. 0xDF
c28a9453 4596 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
d14d03ac 4597 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4ed46869
KH
4598 -------------------------------
4599
4600*/
4601
4602/* BIG5 is a coding system encoding two character sets: ASCII and
4603 Big5. An ASCII character is encoded as is. Big5 is a two-byte
df7492f9 4604 character set and is encoded in two bytes.
4ed46869
KH
4605
4606 --- CODE RANGE of BIG5 ---
4607 (character set) (range)
4608 ASCII 0x00 .. 0x7F
4609 Big5 (1st byte) 0xA1 .. 0xFE
4610 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
4611 --------------------------
4612
df7492f9 4613 */
4ed46869
KH
4614
4615/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4616 Check if a text is encoded in SJIS. If it is, return
df7492f9 4617 CATEGORY_MASK_SJIS, else return 0. */
4ed46869 4618
0a28aafb 4619static int
cf84bb53
JB
4620detect_coding_sjis (struct coding_system *coding,
4621 struct coding_detection_info *detect_info)
4ed46869 4622{
065e3595 4623 const unsigned char *src = coding->source, *src_base;
8f924df7 4624 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9 4625 int multibytep = coding->src_multibyte;
d311d28c 4626 ptrdiff_t consumed_chars = 0;
df7492f9 4627 int found = 0;
b73bfc1c 4628 int c;
f07190ca
KH
4629 Lisp_Object attrs, charset_list;
4630 int max_first_byte_of_2_byte_code;
4631
4632 CODING_GET_INFO (coding, attrs, charset_list);
4633 max_first_byte_of_2_byte_code
4634 = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
df7492f9 4635
ff0dacd7 4636 detect_info->checked |= CATEGORY_MASK_SJIS;
df7492f9
KH
4637 /* A coding system of this category is always ASCII compatible. */
4638 src += coding->head_ascii;
4ed46869 4639
b73bfc1c 4640 while (1)
4ed46869 4641 {
065e3595 4642 src_base = src;
df7492f9 4643 ONE_MORE_BYTE (c);
682169fe
KH
4644 if (c < 0x80)
4645 continue;
f07190ca
KH
4646 if ((c >= 0x81 && c <= 0x9F)
4647 || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4ed46869 4648 {
df7492f9 4649 ONE_MORE_BYTE (c);
682169fe 4650 if (c < 0x40 || c == 0x7F || c > 0xFC)
df7492f9 4651 break;
ff0dacd7 4652 found = CATEGORY_MASK_SJIS;
4ed46869 4653 }
df7492f9 4654 else if (c >= 0xA0 && c < 0xE0)
ff0dacd7 4655 found = CATEGORY_MASK_SJIS;
df7492f9
KH
4656 else
4657 break;
4ed46869 4658 }
ff0dacd7 4659 detect_info->rejected |= CATEGORY_MASK_SJIS;
df7492f9
KH
4660 return 0;
4661
4662 no_more_source:
065e3595 4663 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
89528eb3 4664 {
ff0dacd7 4665 detect_info->rejected |= CATEGORY_MASK_SJIS;
89528eb3 4666 return 0;
4ed46869 4667 }
ff0dacd7
KH
4668 detect_info->found |= found;
4669 return 1;
4ed46869
KH
4670}
4671
4672/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4673 Check if a text is encoded in BIG5. If it is, return
df7492f9 4674 CATEGORY_MASK_BIG5, else return 0. */
4ed46869 4675
0a28aafb 4676static int
cf84bb53
JB
4677detect_coding_big5 (struct coding_system *coding,
4678 struct coding_detection_info *detect_info)
4ed46869 4679{
065e3595 4680 const unsigned char *src = coding->source, *src_base;
8f924df7 4681 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9 4682 int multibytep = coding->src_multibyte;
d311d28c 4683 ptrdiff_t consumed_chars = 0;
df7492f9 4684 int found = 0;
b73bfc1c 4685 int c;
fa42c37f 4686
ff0dacd7 4687 detect_info->checked |= CATEGORY_MASK_BIG5;
df7492f9
KH
4688 /* A coding system of this category is always ASCII compatible. */
4689 src += coding->head_ascii;
fa42c37f 4690
b73bfc1c 4691 while (1)
fa42c37f 4692 {
065e3595 4693 src_base = src;
df7492f9
KH
4694 ONE_MORE_BYTE (c);
4695 if (c < 0x80)
fa42c37f 4696 continue;
df7492f9 4697 if (c >= 0xA1)
fa42c37f 4698 {
df7492f9
KH
4699 ONE_MORE_BYTE (c);
4700 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
fa42c37f 4701 return 0;
ff0dacd7 4702 found = CATEGORY_MASK_BIG5;
fa42c37f 4703 }
df7492f9
KH
4704 else
4705 break;
fa42c37f 4706 }
ff0dacd7 4707 detect_info->rejected |= CATEGORY_MASK_BIG5;
fa42c37f 4708 return 0;
fa42c37f 4709
df7492f9 4710 no_more_source:
065e3595 4711 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
89528eb3 4712 {
ff0dacd7 4713 detect_info->rejected |= CATEGORY_MASK_BIG5;
89528eb3
KH
4714 return 0;
4715 }
ff0dacd7
KH
4716 detect_info->found |= found;
4717 return 1;
fa42c37f
KH
4718}
4719
4ed46869
KH
4720/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4721 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
fa42c37f 4722
b73bfc1c 4723static void
971de7fb 4724decode_coding_sjis (struct coding_system *coding)
4ed46869 4725{
8f924df7
KH
4726 const unsigned char *src = coding->source + coding->consumed;
4727 const unsigned char *src_end = coding->source + coding->src_bytes;
4728 const unsigned char *src_base;
69a80ea3 4729 int *charbuf = coding->charbuf + coding->charbuf_used;
ad1746f5 4730 /* We may produce one charset annotation in one loop and one more at
df80c7f0 4731 the end. */
69a80ea3 4732 int *charbuf_end
df80c7f0 4733 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
d311d28c 4734 ptrdiff_t consumed_chars = 0, consumed_chars_base;
df7492f9
KH
4735 int multibytep = coding->src_multibyte;
4736 struct charset *charset_roman, *charset_kanji, *charset_kana;
57a47f8a 4737 struct charset *charset_kanji2;
24a73b0a 4738 Lisp_Object attrs, charset_list, val;
d311d28c
PE
4739 ptrdiff_t char_offset = coding->produced_char;
4740 ptrdiff_t last_offset = char_offset;
ff0dacd7 4741 int last_id = charset_ascii;
2735d060 4742 int eol_dos =
0a9564cb 4743 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 4744 int byte_after_cr = -1;
a5d301df 4745
24a73b0a 4746 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4747
4748 val = charset_list;
4749 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
89528eb3 4750 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
57a47f8a
KH
4751 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4752 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
fa42c37f 4753
b73bfc1c 4754 while (1)
4ed46869 4755 {
df7492f9 4756 int c, c1;
24a73b0a 4757 struct charset *charset;
fa42c37f 4758
b73bfc1c 4759 src_base = src;
df7492f9 4760 consumed_chars_base = consumed_chars;
fa42c37f 4761
df7492f9 4762 if (charbuf >= charbuf_end)
b71f6f73
KH
4763 {
4764 if (byte_after_cr >= 0)
4765 src_base--;
4766 break;
4767 }
df7492f9 4768
119852e7
KH
4769 if (byte_after_cr >= 0)
4770 c = byte_after_cr, byte_after_cr = -1;
4771 else
4772 ONE_MORE_BYTE (c);
065e3595
KH
4773 if (c < 0)
4774 goto invalid_code;
24a73b0a 4775 if (c < 0x80)
119852e7 4776 {
2735d060 4777 if (eol_dos && c == '\r')
119852e7
KH
4778 ONE_MORE_BYTE (byte_after_cr);
4779 charset = charset_roman;
4780 }
57a47f8a 4781 else if (c == 0x80 || c == 0xA0)
8e921c4b 4782 goto invalid_code;
57a47f8a
KH
4783 else if (c >= 0xA1 && c <= 0xDF)
4784 {
4785 /* SJIS -> JISX0201-Kana */
4786 c &= 0x7F;
4787 charset = charset_kana;
4788 }
4789 else if (c <= 0xEF)
df7492f9 4790 {
57a47f8a
KH
4791 /* SJIS -> JISX0208 */
4792 ONE_MORE_BYTE (c1);
4793 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
24a73b0a 4794 goto invalid_code;
57a47f8a
KH
4795 c = (c << 8) | c1;
4796 SJIS_TO_JIS (c);
4797 charset = charset_kanji;
4798 }
4799 else if (c <= 0xFC && charset_kanji2)
4800 {
c6876370 4801 /* SJIS -> JISX0213-2 */
57a47f8a
KH
4802 ONE_MORE_BYTE (c1);
4803 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
24a73b0a 4804 goto invalid_code;
57a47f8a
KH
4805 c = (c << 8) | c1;
4806 SJIS_TO_JIS2 (c);
4807 charset = charset_kanji2;
df7492f9 4808 }
57a47f8a
KH
4809 else
4810 goto invalid_code;
24a73b0a
KH
4811 if (charset->id != charset_ascii
4812 && last_id != charset->id)
4813 {
4814 if (last_id != charset_ascii)
69a80ea3 4815 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
24a73b0a
KH
4816 last_id = charset->id;
4817 last_offset = char_offset;
4818 }
4819 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
df7492f9 4820 *charbuf++ = c;
ff0dacd7 4821 char_offset++;
df7492f9 4822 continue;
b73bfc1c 4823
df7492f9
KH
4824 invalid_code:
4825 src = src_base;
4826 consumed_chars = consumed_chars_base;
4827 ONE_MORE_BYTE (c);
065e3595 4828 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
ff0dacd7 4829 char_offset++;
df7492f9
KH
4830 coding->errors++;
4831 }
fa42c37f 4832
df7492f9 4833 no_more_source:
ff0dacd7 4834 if (last_id != charset_ascii)
69a80ea3 4835 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
4836 coding->consumed_char += consumed_chars_base;
4837 coding->consumed = src_base - coding->source;
4838 coding->charbuf_used = charbuf - coding->charbuf;
fa42c37f
KH
4839}
4840
b73bfc1c 4841static void
971de7fb 4842decode_coding_big5 (struct coding_system *coding)
4ed46869 4843{
8f924df7
KH
4844 const unsigned char *src = coding->source + coding->consumed;
4845 const unsigned char *src_end = coding->source + coding->src_bytes;
4846 const unsigned char *src_base;
69a80ea3 4847 int *charbuf = coding->charbuf + coding->charbuf_used;
ad1746f5 4848 /* We may produce one charset annotation in one loop and one more at
df80c7f0 4849 the end. */
69a80ea3 4850 int *charbuf_end
df80c7f0 4851 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
d311d28c 4852 ptrdiff_t consumed_chars = 0, consumed_chars_base;
df7492f9
KH
4853 int multibytep = coding->src_multibyte;
4854 struct charset *charset_roman, *charset_big5;
24a73b0a 4855 Lisp_Object attrs, charset_list, val;
d311d28c
PE
4856 ptrdiff_t char_offset = coding->produced_char;
4857 ptrdiff_t last_offset = char_offset;
ff0dacd7 4858 int last_id = charset_ascii;
2735d060 4859 int eol_dos =
0a9564cb 4860 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 4861 int byte_after_cr = -1;
df7492f9 4862
24a73b0a 4863 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4864 val = charset_list;
4865 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4866 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4ed46869 4867
b73bfc1c 4868 while (1)
4ed46869 4869 {
df7492f9 4870 int c, c1;
24a73b0a 4871 struct charset *charset;
b73bfc1c
KH
4872
4873 src_base = src;
df7492f9
KH
4874 consumed_chars_base = consumed_chars;
4875
4876 if (charbuf >= charbuf_end)
b71f6f73
KH
4877 {
4878 if (byte_after_cr >= 0)
4879 src_base--;
4880 break;
4881 }
df7492f9 4882
119852e7 4883 if (byte_after_cr >= 0)
14daee73 4884 c = byte_after_cr, byte_after_cr = -1;
119852e7
KH
4885 else
4886 ONE_MORE_BYTE (c);
b73bfc1c 4887
065e3595
KH
4888 if (c < 0)
4889 goto invalid_code;
24a73b0a 4890 if (c < 0x80)
119852e7 4891 {
2735d060 4892 if (eol_dos && c == '\r')
119852e7
KH
4893 ONE_MORE_BYTE (byte_after_cr);
4894 charset = charset_roman;
4895 }
24a73b0a 4896 else
4ed46869 4897 {
24a73b0a
KH
4898 /* BIG5 -> Big5 */
4899 if (c < 0xA1 || c > 0xFE)
4900 goto invalid_code;
4901 ONE_MORE_BYTE (c1);
4902 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4903 goto invalid_code;
4904 c = c << 8 | c1;
4905 charset = charset_big5;
4ed46869 4906 }
24a73b0a
KH
4907 if (charset->id != charset_ascii
4908 && last_id != charset->id)
df7492f9 4909 {
24a73b0a 4910 if (last_id != charset_ascii)
69a80ea3 4911 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
24a73b0a
KH
4912 last_id = charset->id;
4913 last_offset = char_offset;
4ed46869 4914 }
24a73b0a 4915 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
df7492f9 4916 *charbuf++ = c;
ff0dacd7 4917 char_offset++;
fb88bf2d
KH
4918 continue;
4919
df7492f9 4920 invalid_code:
4ed46869 4921 src = src_base;
df7492f9
KH
4922 consumed_chars = consumed_chars_base;
4923 ONE_MORE_BYTE (c);
065e3595 4924 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
ff0dacd7 4925 char_offset++;
df7492f9 4926 coding->errors++;
fb88bf2d 4927 }
d46c5b12 4928
df7492f9 4929 no_more_source:
ff0dacd7 4930 if (last_id != charset_ascii)
69a80ea3 4931 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
4932 coding->consumed_char += consumed_chars_base;
4933 coding->consumed = src_base - coding->source;
4934 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
4935}
4936
4937/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
b73bfc1c
KH
4938 This function can encode charsets `ascii', `katakana-jisx0201',
4939 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4940 are sure that all these charsets are registered as official charset
4ed46869
KH
4941 (i.e. do not have extended leading-codes). Characters of other
4942 charsets are produced without any encoding. If SJIS_P is 1, encode
4943 SJIS text, else encode BIG5 text. */
4944
df7492f9 4945static int
971de7fb 4946encode_coding_sjis (struct coding_system *coding)
4ed46869 4947{
df7492f9
KH
4948 int multibytep = coding->dst_multibyte;
4949 int *charbuf = coding->charbuf;
4950 int *charbuf_end = charbuf + coding->charbuf_used;
4951 unsigned char *dst = coding->destination + coding->produced;
4952 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4953 int safe_room = 4;
d311d28c 4954 ptrdiff_t produced_chars = 0;
24a73b0a 4955 Lisp_Object attrs, charset_list, val;
df7492f9 4956 int ascii_compatible;
66ebf983 4957 struct charset *charset_kanji, *charset_kana;
57a47f8a 4958 struct charset *charset_kanji2;
df7492f9 4959 int c;
a5d301df 4960
24a73b0a 4961 CODING_GET_INFO (coding, attrs, charset_list);
66ebf983 4962 val = XCDR (charset_list);
df7492f9 4963 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
57a47f8a
KH
4964 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4965 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4ed46869 4966
df7492f9 4967 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
93dec019 4968
df7492f9
KH
4969 while (charbuf < charbuf_end)
4970 {
4971 ASSURE_DESTINATION (safe_room);
4972 c = *charbuf++;
b73bfc1c 4973 /* Now encode the character C. */
df7492f9
KH
4974 if (ASCII_CHAR_P (c) && ascii_compatible)
4975 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
4976 else if (CHAR_BYTE8_P (c))
4977 {
4978 c = CHAR_TO_BYTE8 (c);
4979 EMIT_ONE_BYTE (c);
4980 }
df7492f9 4981 else
b73bfc1c 4982 {
df7492f9 4983 unsigned code;
5eb05ea3
KH
4984 struct charset *charset;
4985 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4986 &code, charset);
df7492f9
KH
4987
4988 if (!charset)
4ed46869 4989 {
41cbe562 4990 if (coding->mode & CODING_MODE_SAFE_ENCODING)
b73bfc1c 4991 {
41cbe562
KH
4992 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4993 charset = CHARSET_FROM_ID (charset_ascii);
b73bfc1c 4994 }
41cbe562 4995 else
b73bfc1c 4996 {
41cbe562 4997 c = coding->default_char;
5eb05ea3
KH
4998 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4999 charset_list, &code, charset);
b73bfc1c 5000 }
b73bfc1c 5001 }
df7492f9
KH
5002 if (code == CHARSET_INVALID_CODE (charset))
5003 abort ();
5004 if (charset == charset_kanji)
5005 {
5006 int c1, c2;
5007 JIS_TO_SJIS (code);
5008 c1 = code >> 8, c2 = code & 0xFF;
5009 EMIT_TWO_BYTES (c1, c2);
5010 }
5011 else if (charset == charset_kana)
5012 EMIT_ONE_BYTE (code | 0x80);
57a47f8a
KH
5013 else if (charset_kanji2 && charset == charset_kanji2)
5014 {
5015 int c1, c2;
5016
5017 c1 = code >> 8;
f07190ca
KH
5018 if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
5019 || c1 == 0x28
57a47f8a
KH
5020 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
5021 {
5022 JIS_TO_SJIS2 (code);
5023 c1 = code >> 8, c2 = code & 0xFF;
5024 EMIT_TWO_BYTES (c1, c2);
5025 }
5026 else
5027 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5028 }
df7492f9
KH
5029 else
5030 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5031 }
5032 }
065e3595 5033 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
5034 coding->produced_char += produced_chars;
5035 coding->produced = dst - coding->destination;
5036 return 0;
5037}
5038
5039static int
971de7fb 5040encode_coding_big5 (struct coding_system *coding)
df7492f9
KH
5041{
5042 int multibytep = coding->dst_multibyte;
5043 int *charbuf = coding->charbuf;
5044 int *charbuf_end = charbuf + coding->charbuf_used;
5045 unsigned char *dst = coding->destination + coding->produced;
5046 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5047 int safe_room = 4;
d311d28c 5048 ptrdiff_t produced_chars = 0;
24a73b0a 5049 Lisp_Object attrs, charset_list, val;
df7492f9 5050 int ascii_compatible;
66ebf983 5051 struct charset *charset_big5;
df7492f9
KH
5052 int c;
5053
24a73b0a 5054 CODING_GET_INFO (coding, attrs, charset_list);
66ebf983 5055 val = XCDR (charset_list);
df7492f9
KH
5056 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5057 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5058
5059 while (charbuf < charbuf_end)
5060 {
5061 ASSURE_DESTINATION (safe_room);
5062 c = *charbuf++;
5063 /* Now encode the character C. */
5064 if (ASCII_CHAR_P (c) && ascii_compatible)
5065 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
5066 else if (CHAR_BYTE8_P (c))
5067 {
5068 c = CHAR_TO_BYTE8 (c);
5069 EMIT_ONE_BYTE (c);
b73bfc1c
KH
5070 }
5071 else
5072 {
df7492f9 5073 unsigned code;
5eb05ea3
KH
5074 struct charset *charset;
5075 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5076 &code, charset);
df7492f9
KH
5077
5078 if (! charset)
b73bfc1c 5079 {
41cbe562 5080 if (coding->mode & CODING_MODE_SAFE_ENCODING)
b73bfc1c 5081 {
41cbe562
KH
5082 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5083 charset = CHARSET_FROM_ID (charset_ascii);
b73bfc1c 5084 }
41cbe562 5085 else
0eecad43 5086 {
41cbe562 5087 c = coding->default_char;
5eb05ea3
KH
5088 CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5089 charset_list, &code, charset);
0eecad43 5090 }
4ed46869 5091 }
df7492f9
KH
5092 if (code == CHARSET_INVALID_CODE (charset))
5093 abort ();
5094 if (charset == charset_big5)
b73bfc1c 5095 {
df7492f9
KH
5096 int c1, c2;
5097
5098 c1 = code >> 8, c2 = code & 0xFF;
5099 EMIT_TWO_BYTES (c1, c2);
b73bfc1c 5100 }
df7492f9
KH
5101 else
5102 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4ed46869 5103 }
4ed46869 5104 }
065e3595 5105 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
5106 coding->produced_char += produced_chars;
5107 coding->produced = dst - coding->destination;
5108 return 0;
4ed46869
KH
5109}
5110
5111\f
df7492f9 5112/*** 9. CCL handlers ***/
1397dc18
KH
5113
5114/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5115 Check if a text is encoded in a coding system of which
5116 encoder/decoder are written in CCL program. If it is, return
df7492f9 5117 CATEGORY_MASK_CCL, else return 0. */
1397dc18 5118
0a28aafb 5119static int
cf84bb53
JB
5120detect_coding_ccl (struct coding_system *coding,
5121 struct coding_detection_info *detect_info)
1397dc18 5122{
065e3595 5123 const unsigned char *src = coding->source, *src_base;
8f924df7 5124 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9 5125 int multibytep = coding->src_multibyte;
d311d28c 5126 ptrdiff_t consumed_chars = 0;
df7492f9 5127 int found = 0;
0e219d54 5128 unsigned char *valids;
d311d28c 5129 ptrdiff_t head_ascii = coding->head_ascii;
df7492f9
KH
5130 Lisp_Object attrs;
5131
ff0dacd7
KH
5132 detect_info->checked |= CATEGORY_MASK_CCL;
5133
df7492f9 5134 coding = &coding_categories[coding_category_ccl];
0e219d54 5135 valids = CODING_CCL_VALIDS (coding);
df7492f9
KH
5136 attrs = CODING_ID_ATTRS (coding->id);
5137 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5138 src += head_ascii;
1397dc18 5139
b73bfc1c 5140 while (1)
1397dc18 5141 {
df7492f9 5142 int c;
065e3595
KH
5143
5144 src_base = src;
df7492f9 5145 ONE_MORE_BYTE (c);
065e3595 5146 if (c < 0 || ! valids[c])
df7492f9 5147 break;
ff0dacd7
KH
5148 if ((valids[c] > 1))
5149 found = CATEGORY_MASK_CCL;
df7492f9 5150 }
ff0dacd7 5151 detect_info->rejected |= CATEGORY_MASK_CCL;
df7492f9
KH
5152 return 0;
5153
5154 no_more_source:
ff0dacd7
KH
5155 detect_info->found |= found;
5156 return 1;
df7492f9
KH
5157}
5158
5159static void
971de7fb 5160decode_coding_ccl (struct coding_system *coding)
df7492f9 5161{
7c78e542 5162 const unsigned char *src = coding->source + coding->consumed;
8f924df7 5163 const unsigned char *src_end = coding->source + coding->src_bytes;
69a80ea3
KH
5164 int *charbuf = coding->charbuf + coding->charbuf_used;
5165 int *charbuf_end = coding->charbuf + coding->charbuf_size;
d311d28c 5166 ptrdiff_t consumed_chars = 0;
df7492f9 5167 int multibytep = coding->src_multibyte;
d0396581 5168 struct ccl_program *ccl = &coding->spec.ccl->ccl;
df7492f9 5169 int source_charbuf[1024];
fbdc1721 5170 int source_byteidx[1025];
24a73b0a 5171 Lisp_Object attrs, charset_list;
df7492f9 5172
24a73b0a 5173 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9 5174
d0396581 5175 while (1)
df7492f9 5176 {
7c78e542 5177 const unsigned char *p = src;
df7492f9
KH
5178 int i = 0;
5179
5180 if (multibytep)
fbdc1721
KH
5181 {
5182 while (i < 1024 && p < src_end)
5183 {
5184 source_byteidx[i] = p - src;
5185 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5186 }
5187 source_byteidx[i] = p - src;
5188 }
df7492f9
KH
5189 else
5190 while (i < 1024 && p < src_end)
5191 source_charbuf[i++] = *p++;
8f924df7 5192
df7492f9 5193 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
d0396581
KH
5194 ccl->last_block = 1;
5195 ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5196 charset_list);
5197 charbuf += ccl->produced;
fbdc1721 5198 if (multibytep)
d0396581 5199 src += source_byteidx[ccl->consumed];
df7492f9 5200 else
d0396581
KH
5201 src += ccl->consumed;
5202 consumed_chars += ccl->consumed;
5203 if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
df7492f9
KH
5204 break;
5205 }
5206
d0396581 5207 switch (ccl->status)
df7492f9
KH
5208 {
5209 case CCL_STAT_SUSPEND_BY_SRC:
065e3595 5210 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
df7492f9
KH
5211 break;
5212 case CCL_STAT_SUSPEND_BY_DST:
d0396581 5213 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
df7492f9
KH
5214 break;
5215 case CCL_STAT_QUIT:
5216 case CCL_STAT_INVALID_CMD:
065e3595 5217 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
df7492f9
KH
5218 break;
5219 default:
065e3595 5220 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
5221 break;
5222 }
5223 coding->consumed_char += consumed_chars;
5224 coding->consumed = src - coding->source;
5225 coding->charbuf_used = charbuf - coding->charbuf;
5226}
5227
5228static int
971de7fb 5229encode_coding_ccl (struct coding_system *coding)
df7492f9 5230{
fb608df3 5231 struct ccl_program *ccl = &coding->spec.ccl->ccl;
df7492f9
KH
5232 int multibytep = coding->dst_multibyte;
5233 int *charbuf = coding->charbuf;
5234 int *charbuf_end = charbuf + coding->charbuf_used;
5235 unsigned char *dst = coding->destination + coding->produced;
5236 unsigned char *dst_end = coding->destination + coding->dst_bytes;
df7492f9 5237 int destination_charbuf[1024];
d311d28c 5238 ptrdiff_t produced_chars = 0;
a53e2e89 5239 int i;
24a73b0a 5240 Lisp_Object attrs, charset_list;
df7492f9 5241
24a73b0a 5242 CODING_GET_INFO (coding, attrs, charset_list);
fb608df3
KH
5243 if (coding->consumed_char == coding->src_chars
5244 && coding->mode & CODING_MODE_LAST_BLOCK)
5245 ccl->last_block = 1;
df7492f9 5246
76470ad1 5247 do
df7492f9 5248 {
fb608df3 5249 ccl_driver (ccl, charbuf, destination_charbuf,
8cffd3e7 5250 charbuf_end - charbuf, 1024, charset_list);
df7492f9 5251 if (multibytep)
8cffd3e7 5252 {
fb608df3
KH
5253 ASSURE_DESTINATION (ccl->produced * 2);
5254 for (i = 0; i < ccl->produced; i++)
8cffd3e7
KH
5255 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5256 }
df7492f9
KH
5257 else
5258 {
fb608df3
KH
5259 ASSURE_DESTINATION (ccl->produced);
5260 for (i = 0; i < ccl->produced; i++)
df7492f9 5261 *dst++ = destination_charbuf[i] & 0xFF;
fb608df3 5262 produced_chars += ccl->produced;
df7492f9 5263 }
fb608df3
KH
5264 charbuf += ccl->consumed;
5265 if (ccl->status == CCL_STAT_QUIT
5266 || ccl->status == CCL_STAT_INVALID_CMD)
8cffd3e7 5267 break;
df7492f9 5268 }
76470ad1 5269 while (charbuf < charbuf_end);
df7492f9 5270
fb608df3 5271 switch (ccl->status)
df7492f9
KH
5272 {
5273 case CCL_STAT_SUSPEND_BY_SRC:
065e3595 5274 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
df7492f9
KH
5275 break;
5276 case CCL_STAT_SUSPEND_BY_DST:
065e3595 5277 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
df7492f9
KH
5278 break;
5279 case CCL_STAT_QUIT:
5280 case CCL_STAT_INVALID_CMD:
065e3595 5281 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
df7492f9
KH
5282 break;
5283 default:
065e3595 5284 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 5285 break;
1397dc18 5286 }
df7492f9
KH
5287
5288 coding->produced_char += produced_chars;
5289 coding->produced = dst - coding->destination;
5290 return 0;
1397dc18
KH
5291}
5292
df7492f9 5293
1397dc18 5294\f
df7492f9 5295/*** 10, 11. no-conversion handlers ***/
4ed46869 5296
b73bfc1c 5297/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4ed46869 5298
b73bfc1c 5299static void
971de7fb 5300decode_coding_raw_text (struct coding_system *coding)
4ed46869 5301{
2735d060 5302 int eol_dos =
0a9564cb 5303 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 5304
df7492f9 5305 coding->chars_at_source = 1;
119852e7
KH
5306 coding->consumed_char = coding->src_chars;
5307 coding->consumed = coding->src_bytes;
2735d060 5308 if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
119852e7
KH
5309 {
5310 coding->consumed_char--;
5311 coding->consumed--;
5312 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5313 }
5314 else
5315 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 5316}
4ed46869 5317
df7492f9 5318static int
971de7fb 5319encode_coding_raw_text (struct coding_system *coding)
df7492f9
KH
5320{
5321 int multibytep = coding->dst_multibyte;
5322 int *charbuf = coding->charbuf;
5323 int *charbuf_end = coding->charbuf + coding->charbuf_used;
5324 unsigned char *dst = coding->destination + coding->produced;
5325 unsigned char *dst_end = coding->destination + coding->dst_bytes;
d311d28c 5326 ptrdiff_t produced_chars = 0;
b73bfc1c
KH
5327 int c;
5328
df7492f9 5329 if (multibytep)
b73bfc1c 5330 {
df7492f9 5331 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4ed46869 5332
df7492f9
KH
5333 if (coding->src_multibyte)
5334 while (charbuf < charbuf_end)
5335 {
5336 ASSURE_DESTINATION (safe_room);
5337 c = *charbuf++;
5338 if (ASCII_CHAR_P (c))
5339 EMIT_ONE_ASCII_BYTE (c);
5340 else if (CHAR_BYTE8_P (c))
5341 {
5342 c = CHAR_TO_BYTE8 (c);
5343 EMIT_ONE_BYTE (c);
5344 }
5345 else
5346 {
5347 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
93dec019 5348
df7492f9 5349 CHAR_STRING_ADVANCE (c, p1);
8abc3f12 5350 do
9d123124
KH
5351 {
5352 EMIT_ONE_BYTE (*p0);
5353 p0++;
5354 }
8abc3f12 5355 while (p0 < p1);
df7492f9
KH
5356 }
5357 }
b73bfc1c 5358 else
df7492f9
KH
5359 while (charbuf < charbuf_end)
5360 {
5361 ASSURE_DESTINATION (safe_room);
5362 c = *charbuf++;
5363 EMIT_ONE_BYTE (c);
5364 }
5365 }
5366 else
4ed46869 5367 {
df7492f9 5368 if (coding->src_multibyte)
d46c5b12 5369 {
df7492f9
KH
5370 int safe_room = MAX_MULTIBYTE_LENGTH;
5371
5372 while (charbuf < charbuf_end)
d46c5b12 5373 {
df7492f9
KH
5374 ASSURE_DESTINATION (safe_room);
5375 c = *charbuf++;
5376 if (ASCII_CHAR_P (c))
5377 *dst++ = c;
5378 else if (CHAR_BYTE8_P (c))
5379 *dst++ = CHAR_TO_BYTE8 (c);
b73bfc1c 5380 else
df7492f9 5381 CHAR_STRING_ADVANCE (c, dst);
d46c5b12
KH
5382 }
5383 }
df7492f9
KH
5384 else
5385 {
5386 ASSURE_DESTINATION (charbuf_end - charbuf);
5387 while (charbuf < charbuf_end && dst < dst_end)
5388 *dst++ = *charbuf++;
8f924df7 5389 }
319a3947 5390 produced_chars = dst - (coding->destination + coding->produced);
4ed46869 5391 }
065e3595 5392 record_conversion_result (coding, CODING_RESULT_SUCCESS);
a0ed9b27 5393 coding->produced_char += produced_chars;
df7492f9
KH
5394 coding->produced = dst - coding->destination;
5395 return 0;
4ed46869
KH
5396}
5397
ff0dacd7
KH
5398/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5399 Check if a text is encoded in a charset-based coding system. If it
5400 is, return 1, else return 0. */
5401
0a28aafb 5402static int
cf84bb53
JB
5403detect_coding_charset (struct coding_system *coding,
5404 struct coding_detection_info *detect_info)
1397dc18 5405{
065e3595 5406 const unsigned char *src = coding->source, *src_base;
8f924df7 5407 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9 5408 int multibytep = coding->src_multibyte;
d311d28c 5409 ptrdiff_t consumed_chars = 0;
07295713 5410 Lisp_Object attrs, valids, name;
584948ac 5411 int found = 0;
d311d28c 5412 ptrdiff_t head_ascii = coding->head_ascii;
07295713 5413 int check_latin_extra = 0;
1397dc18 5414
ff0dacd7
KH
5415 detect_info->checked |= CATEGORY_MASK_CHARSET;
5416
df7492f9
KH
5417 coding = &coding_categories[coding_category_charset];
5418 attrs = CODING_ID_ATTRS (coding->id);
5419 valids = AREF (attrs, coding_attr_charset_valids);
07295713 5420 name = CODING_ID_NAME (coding->id);
51b59d79 5421 if (strncmp (SSDATA (SYMBOL_NAME (name)),
237aabf4 5422 "iso-8859-", sizeof ("iso-8859-") - 1) == 0
51b59d79 5423 || strncmp (SSDATA (SYMBOL_NAME (name)),
237aabf4 5424 "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
07295713 5425 check_latin_extra = 1;
237aabf4 5426
df7492f9 5427 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
716b3fa0 5428 src += head_ascii;
1397dc18 5429
b73bfc1c 5430 while (1)
1397dc18 5431 {
df7492f9 5432 int c;
716b3fa0
KH
5433 Lisp_Object val;
5434 struct charset *charset;
5435 int dim, idx;
1397dc18 5436
065e3595 5437 src_base = src;
df7492f9 5438 ONE_MORE_BYTE (c);
065e3595
KH
5439 if (c < 0)
5440 continue;
716b3fa0
KH
5441 val = AREF (valids, c);
5442 if (NILP (val))
df7492f9 5443 break;
584948ac 5444 if (c >= 0x80)
07295713
KH
5445 {
5446 if (c < 0xA0
237aabf4
JR
5447 && check_latin_extra
5448 && (!VECTORP (Vlatin_extra_code_table)
9f0526cb 5449 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])))
07295713
KH
5450 break;
5451 found = CATEGORY_MASK_CHARSET;
5452 }
716b3fa0
KH
5453 if (INTEGERP (val))
5454 {
5455 charset = CHARSET_FROM_ID (XFASTINT (val));
5456 dim = CHARSET_DIMENSION (charset);
5457 for (idx = 1; idx < dim; idx++)
5458 {
5459 if (src == src_end)
5460 goto too_short;
5461 ONE_MORE_BYTE (c);
2f9442b8
PE
5462 if (c < charset->code_space[(dim - 1 - idx) * 4]
5463 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
716b3fa0
KH
5464 break;
5465 }
5466 if (idx < dim)
5467 break;
5468 }
5469 else
5470 {
5471 idx = 1;
5472 for (; CONSP (val); val = XCDR (val))
5473 {
5474 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5475 dim = CHARSET_DIMENSION (charset);
5476 while (idx < dim)
5477 {
5478 if (src == src_end)
5479 goto too_short;
5480 ONE_MORE_BYTE (c);
5481 if (c < charset->code_space[(dim - 1 - idx) * 4]
5482 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5483 break;
5484 idx++;
5485 }
5486 if (idx == dim)
5487 {
5488 val = Qnil;
5489 break;
5490 }
5491 }
5492 if (CONSP (val))
5493 break;
5494 }
df7492f9 5495 }
716b3fa0 5496 too_short:
ff0dacd7 5497 detect_info->rejected |= CATEGORY_MASK_CHARSET;
df7492f9 5498 return 0;
4ed46869 5499
df7492f9 5500 no_more_source:
ff0dacd7
KH
5501 detect_info->found |= found;
5502 return 1;
df7492f9 5503}
b73bfc1c 5504
b73bfc1c 5505static void
971de7fb 5506decode_coding_charset (struct coding_system *coding)
4ed46869 5507{
8f924df7
KH
5508 const unsigned char *src = coding->source + coding->consumed;
5509 const unsigned char *src_end = coding->source + coding->src_bytes;
5510 const unsigned char *src_base;
69a80ea3 5511 int *charbuf = coding->charbuf + coding->charbuf_used;
ad1746f5 5512 /* We may produce one charset annotation in one loop and one more at
df80c7f0 5513 the end. */
69a80ea3 5514 int *charbuf_end
df80c7f0 5515 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
d311d28c 5516 ptrdiff_t consumed_chars = 0, consumed_chars_base;
df7492f9 5517 int multibytep = coding->src_multibyte;
66ebf983
PE
5518 Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5519 Lisp_Object valids;
d311d28c
PE
5520 ptrdiff_t char_offset = coding->produced_char;
5521 ptrdiff_t last_offset = char_offset;
ff0dacd7 5522 int last_id = charset_ascii;
2735d060 5523 int eol_dos =
0a9564cb 5524 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 5525 int byte_after_cr = -1;
df7492f9 5526
4eb6d3f1 5527 valids = AREF (attrs, coding_attr_charset_valids);
b73bfc1c 5528
df7492f9 5529 while (1)
4ed46869 5530 {
4eb6d3f1 5531 int c;
24a73b0a
KH
5532 Lisp_Object val;
5533 struct charset *charset;
5534 int dim;
5535 int len = 1;
5536 unsigned code;
df7492f9
KH
5537
5538 src_base = src;
5539 consumed_chars_base = consumed_chars;
b73bfc1c 5540
df7492f9 5541 if (charbuf >= charbuf_end)
b71f6f73
KH
5542 {
5543 if (byte_after_cr >= 0)
5544 src_base--;
5545 break;
5546 }
df7492f9 5547
119852e7
KH
5548 if (byte_after_cr >= 0)
5549 {
5550 c = byte_after_cr;
5551 byte_after_cr = -1;
5552 }
5553 else
5554 {
5555 ONE_MORE_BYTE (c);
2735d060 5556 if (eol_dos && c == '\r')
119852e7
KH
5557 ONE_MORE_BYTE (byte_after_cr);
5558 }
065e3595
KH
5559 if (c < 0)
5560 goto invalid_code;
24a73b0a
KH
5561 code = c;
5562
5563 val = AREF (valids, c);
1b17adfd 5564 if (! INTEGERP (val) && ! CONSP (val))
24a73b0a
KH
5565 goto invalid_code;
5566 if (INTEGERP (val))
d46c5b12 5567 {
24a73b0a
KH
5568 charset = CHARSET_FROM_ID (XFASTINT (val));
5569 dim = CHARSET_DIMENSION (charset);
5570 while (len < dim)
b73bfc1c 5571 {
24a73b0a
KH
5572 ONE_MORE_BYTE (c);
5573 code = (code << 8) | c;
5574 len++;
b73bfc1c 5575 }
24a73b0a
KH
5576 CODING_DECODE_CHAR (coding, src, src_base, src_end,
5577 charset, code, c);
d46c5b12 5578 }
df7492f9 5579 else
d46c5b12 5580 {
24a73b0a
KH
5581 /* VAL is a list of charset IDs. It is assured that the
5582 list is sorted by charset dimensions (smaller one
5583 comes first). */
5584 while (CONSP (val))
4eb6d3f1 5585 {
24a73b0a 5586 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
c7c66a95 5587 dim = CHARSET_DIMENSION (charset);
f9d71dcd 5588 while (len < dim)
4eb6d3f1 5589 {
acb2a965
KH
5590 ONE_MORE_BYTE (c);
5591 code = (code << 8) | c;
f9d71dcd 5592 len++;
4eb6d3f1 5593 }
24a73b0a
KH
5594 CODING_DECODE_CHAR (coding, src, src_base,
5595 src_end, charset, code, c);
5596 if (c >= 0)
5597 break;
5598 val = XCDR (val);
ff0dacd7 5599 }
d46c5b12 5600 }
24a73b0a
KH
5601 if (c < 0)
5602 goto invalid_code;
5603 if (charset->id != charset_ascii
5604 && last_id != charset->id)
5605 {
5606 if (last_id != charset_ascii)
69a80ea3 5607 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
24a73b0a
KH
5608 last_id = charset->id;
5609 last_offset = char_offset;
5610 }
5611
df7492f9 5612 *charbuf++ = c;
ff0dacd7 5613 char_offset++;
df7492f9
KH
5614 continue;
5615
5616 invalid_code:
5617 src = src_base;
5618 consumed_chars = consumed_chars_base;
5619 ONE_MORE_BYTE (c);
065e3595 5620 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 5621 char_offset++;
df7492f9 5622 coding->errors++;
4ed46869
KH
5623 }
5624
df7492f9 5625 no_more_source:
ff0dacd7 5626 if (last_id != charset_ascii)
69a80ea3 5627 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
5628 coding->consumed_char += consumed_chars_base;
5629 coding->consumed = src_base - coding->source;
5630 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
5631}
5632
df7492f9 5633static int
971de7fb 5634encode_coding_charset (struct coding_system *coding)
4ed46869 5635{
df7492f9
KH
5636 int multibytep = coding->dst_multibyte;
5637 int *charbuf = coding->charbuf;
5638 int *charbuf_end = charbuf + coding->charbuf_used;
5639 unsigned char *dst = coding->destination + coding->produced;
5640 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5641 int safe_room = MAX_MULTIBYTE_LENGTH;
d311d28c 5642 ptrdiff_t produced_chars = 0;
24a73b0a 5643 Lisp_Object attrs, charset_list;
df7492f9 5644 int ascii_compatible;
b73bfc1c 5645 int c;
b73bfc1c 5646
24a73b0a 5647 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9 5648 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
fb88bf2d 5649
df7492f9 5650 while (charbuf < charbuf_end)
4ed46869 5651 {
4eb6d3f1 5652 struct charset *charset;
df7492f9 5653 unsigned code;
8f924df7 5654
df7492f9
KH
5655 ASSURE_DESTINATION (safe_room);
5656 c = *charbuf++;
5657 if (ascii_compatible && ASCII_CHAR_P (c))
5658 EMIT_ONE_ASCII_BYTE (c);
16eafb5d 5659 else if (CHAR_BYTE8_P (c))
4ed46869 5660 {
16eafb5d
KH
5661 c = CHAR_TO_BYTE8 (c);
5662 EMIT_ONE_BYTE (c);
d46c5b12 5663 }
d46c5b12 5664 else
b73bfc1c 5665 {
5eb05ea3
KH
5666 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5667 &code, charset);
5668
4eb6d3f1
KH
5669 if (charset)
5670 {
5671 if (CHARSET_DIMENSION (charset) == 1)
5672 EMIT_ONE_BYTE (code);
5673 else if (CHARSET_DIMENSION (charset) == 2)
5674 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5675 else if (CHARSET_DIMENSION (charset) == 3)
5676 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5677 else
5678 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5679 (code >> 8) & 0xFF, code & 0xFF);
5680 }
5681 else
41cbe562
KH
5682 {
5683 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5684 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5685 else
5686 c = coding->default_char;
5687 EMIT_ONE_BYTE (c);
5688 }
4ed46869 5689 }
4ed46869
KH
5690 }
5691
065e3595 5692 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
5693 coding->produced_char += produced_chars;
5694 coding->produced = dst - coding->destination;
5695 return 0;
4ed46869
KH
5696}
5697
5698\f
1397dc18 5699/*** 10. C library functions ***/
4ed46869 5700
df7492f9
KH
5701/* Setup coding context CODING from information about CODING_SYSTEM.
5702 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
5703 CODING_SYSTEM is invalid, signal an error. */
4ed46869 5704
ec6d2bb8 5705void
971de7fb 5706setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
4ed46869 5707{
df7492f9
KH
5708 Lisp_Object attrs;
5709 Lisp_Object eol_type;
5710 Lisp_Object coding_type;
4608c386 5711 Lisp_Object val;
4ed46869 5712
df7492f9 5713 if (NILP (coding_system))
ae6f73fa 5714 coding_system = Qundecided;
c07c8e12 5715
df7492f9 5716 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
1f5dbf34 5717
df7492f9 5718 attrs = CODING_ID_ATTRS (coding->id);
0a9564cb 5719 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
1f5dbf34 5720
df7492f9
KH
5721 coding->mode = 0;
5722 coding->head_ascii = -1;
4a015c45
KH
5723 if (VECTORP (eol_type))
5724 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5725 | CODING_REQUIRE_DETECTION_MASK);
5726 else if (! EQ (eol_type, Qunix))
5727 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5728 | CODING_REQUIRE_ENCODING_MASK);
5729 else
5730 coding->common_flags = 0;
5e5c78be
KH
5731 if (! NILP (CODING_ATTR_POST_READ (attrs)))
5732 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5733 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5734 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
8f924df7
KH
5735 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5736 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
1f5dbf34 5737
df7492f9 5738 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7 5739 coding->max_charset_id = SCHARS (val) - 1;
1b3b981b 5740 coding->safe_charsets = SDATA (val);
df7492f9 5741 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
624bda09 5742 coding->carryover_bytes = 0;
4608c386 5743
df7492f9
KH
5744 coding_type = CODING_ATTR_TYPE (attrs);
5745 if (EQ (coding_type, Qundecided))
d46c5b12 5746 {
df7492f9
KH
5747 coding->detector = NULL;
5748 coding->decoder = decode_coding_raw_text;
5749 coding->encoder = encode_coding_raw_text;
5750 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
d46c5b12 5751 }
df7492f9 5752 else if (EQ (coding_type, Qiso_2022))
d46c5b12 5753 {
df7492f9
KH
5754 int i;
5755 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5756
5757 /* Invoke graphic register 0 to plane 0. */
5758 CODING_ISO_INVOCATION (coding, 0) = 0;
5759 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5760 CODING_ISO_INVOCATION (coding, 1)
5761 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5762 /* Setup the initial status of designation. */
5763 for (i = 0; i < 4; i++)
5764 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5765 /* Not single shifting initially. */
5766 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5767 /* Beginning of buffer should also be regarded as bol. */
5768 CODING_ISO_BOL (coding) = 1;
5769 coding->detector = detect_coding_iso_2022;
5770 coding->decoder = decode_coding_iso_2022;
5771 coding->encoder = encode_coding_iso_2022;
5772 if (flags & CODING_ISO_FLAG_SAFE)
5773 coding->mode |= CODING_MODE_SAFE_ENCODING;
d46c5b12 5774 coding->common_flags
df7492f9
KH
5775 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5776 | CODING_REQUIRE_FLUSHING_MASK);
5777 if (flags & CODING_ISO_FLAG_COMPOSITION)
5778 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
ff0dacd7
KH
5779 if (flags & CODING_ISO_FLAG_DESIGNATION)
5780 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
df7492f9
KH
5781 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5782 {
5783 setup_iso_safe_charsets (attrs);
5784 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7 5785 coding->max_charset_id = SCHARS (val) - 1;
1b3b981b 5786 coding->safe_charsets = SDATA (val);
df7492f9
KH
5787 }
5788 CODING_ISO_FLAGS (coding) = flags;
e951386e
KH
5789 CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5790 CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5791 CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5792 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
d46c5b12 5793 }
df7492f9 5794 else if (EQ (coding_type, Qcharset))
d46c5b12 5795 {
df7492f9
KH
5796 coding->detector = detect_coding_charset;
5797 coding->decoder = decode_coding_charset;
5798 coding->encoder = encode_coding_charset;
d46c5b12 5799 coding->common_flags
df7492f9 5800 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
d46c5b12 5801 }
df7492f9 5802 else if (EQ (coding_type, Qutf_8))
d46c5b12 5803 {
a470d443
KH
5804 val = AREF (attrs, coding_attr_utf_bom);
5805 CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5806 : EQ (val, Qt) ? utf_with_bom
5807 : utf_without_bom);
df7492f9
KH
5808 coding->detector = detect_coding_utf_8;
5809 coding->decoder = decode_coding_utf_8;
5810 coding->encoder = encode_coding_utf_8;
5811 coding->common_flags
5812 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
a470d443
KH
5813 if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5814 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
df7492f9
KH
5815 }
5816 else if (EQ (coding_type, Qutf_16))
5817 {
a470d443
KH
5818 val = AREF (attrs, coding_attr_utf_bom);
5819 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5820 : EQ (val, Qt) ? utf_with_bom
5821 : utf_without_bom);
df7492f9 5822 val = AREF (attrs, coding_attr_utf_16_endian);
b49a1807 5823 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
df7492f9 5824 : utf_16_little_endian);
e19c3639 5825 CODING_UTF_16_SURROGATE (coding) = 0;
df7492f9
KH
5826 coding->detector = detect_coding_utf_16;
5827 coding->decoder = decode_coding_utf_16;
5828 coding->encoder = encode_coding_utf_16;
5829 coding->common_flags
5830 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
a470d443 5831 if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
b49a1807 5832 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
d46c5b12 5833 }
df7492f9 5834 else if (EQ (coding_type, Qccl))
4ed46869 5835 {
df7492f9
KH
5836 coding->detector = detect_coding_ccl;
5837 coding->decoder = decode_coding_ccl;
5838 coding->encoder = encode_coding_ccl;
c952af22 5839 coding->common_flags
df7492f9
KH
5840 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5841 | CODING_REQUIRE_FLUSHING_MASK);
5842 }
5843 else if (EQ (coding_type, Qemacs_mule))
5844 {
5845 coding->detector = detect_coding_emacs_mule;
5846 coding->decoder = decode_coding_emacs_mule;
5847 coding->encoder = encode_coding_emacs_mule;
c952af22 5848 coding->common_flags
df7492f9 5849 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
e951386e 5850 coding->spec.emacs_mule.full_support = 1;
df7492f9
KH
5851 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5852 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5853 {
5854 Lisp_Object tail, safe_charsets;
5855 int max_charset_id = 0;
5856
5857 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5858 tail = XCDR (tail))
5859 if (max_charset_id < XFASTINT (XCAR (tail)))
5860 max_charset_id = XFASTINT (XCAR (tail));
1b3b981b
AS
5861 safe_charsets = make_uninit_string (max_charset_id + 1);
5862 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
df7492f9
KH
5863 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5864 tail = XCDR (tail))
8f924df7 5865 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
df7492f9 5866 coding->max_charset_id = max_charset_id;
1b3b981b 5867 coding->safe_charsets = SDATA (safe_charsets);
e951386e 5868 coding->spec.emacs_mule.full_support = 1;
df7492f9 5869 }
e951386e
KH
5870 coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5871 coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
df7492f9
KH
5872 }
5873 else if (EQ (coding_type, Qshift_jis))
5874 {
5875 coding->detector = detect_coding_sjis;
5876 coding->decoder = decode_coding_sjis;
5877 coding->encoder = encode_coding_sjis;
c952af22 5878 coding->common_flags
df7492f9
KH
5879 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5880 }
5881 else if (EQ (coding_type, Qbig5))
5882 {
5883 coding->detector = detect_coding_big5;
5884 coding->decoder = decode_coding_big5;
5885 coding->encoder = encode_coding_big5;
c952af22 5886 coding->common_flags
df7492f9
KH
5887 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5888 }
5889 else /* EQ (coding_type, Qraw_text) */
ec6d2bb8 5890 {
df7492f9
KH
5891 coding->detector = NULL;
5892 coding->decoder = decode_coding_raw_text;
5893 coding->encoder = encode_coding_raw_text;
ea29edf2
KH
5894 if (! EQ (eol_type, Qunix))
5895 {
5896 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5897 if (! VECTORP (eol_type))
5898 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5899 }
5900
4ed46869 5901 }
4ed46869 5902
df7492f9 5903 return;
4ed46869
KH
5904}
5905
0ff61e78
KH
5906/* Return a list of charsets supported by CODING. */
5907
5908Lisp_Object
971de7fb 5909coding_charset_list (struct coding_system *coding)
0ff61e78 5910{
35befdaa 5911 Lisp_Object attrs, charset_list;
0ff61e78
KH
5912
5913 CODING_GET_INFO (coding, attrs, charset_list);
5914 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5915 {
5916 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5917
5918 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5919 charset_list = Viso_2022_charset_list;
5920 }
5921 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5922 {
5923 charset_list = Vemacs_mule_charset_list;
5924 }
5925 return charset_list;
5926}
5927
5928
e9f91ece
KH
5929/* Return a list of charsets supported by CODING-SYSTEM. */
5930
5931Lisp_Object
971de7fb 5932coding_system_charset_list (Lisp_Object coding_system)
e9f91ece 5933{
d3411f89 5934 ptrdiff_t id;
e9f91ece
KH
5935 Lisp_Object attrs, charset_list;
5936
5937 CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5938 attrs = CODING_ID_ATTRS (id);
5939
5940 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5941 {
5942 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5943
5944 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5945 charset_list = Viso_2022_charset_list;
5946 else
5947 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5948 }
5949 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5950 {
5951 charset_list = Vemacs_mule_charset_list;
5952 }
5953 else
5954 {
5955 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5956 }
5957 return charset_list;
5958}
5959
5960
df7492f9
KH
5961/* Return raw-text or one of its subsidiaries that has the same
5962 eol_type as CODING-SYSTEM. */
ec6d2bb8 5963
df7492f9 5964Lisp_Object
971de7fb 5965raw_text_coding_system (Lisp_Object coding_system)
ec6d2bb8 5966{
0be8721c 5967 Lisp_Object spec, attrs;
df7492f9 5968 Lisp_Object eol_type, raw_text_eol_type;
ec6d2bb8 5969
d3e4cb56
KH
5970 if (NILP (coding_system))
5971 return Qraw_text;
df7492f9
KH
5972 spec = CODING_SYSTEM_SPEC (coding_system);
5973 attrs = AREF (spec, 0);
ec6d2bb8 5974
df7492f9
KH
5975 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5976 return coding_system;
ec6d2bb8 5977
df7492f9
KH
5978 eol_type = AREF (spec, 2);
5979 if (VECTORP (eol_type))
5980 return Qraw_text;
5981 spec = CODING_SYSTEM_SPEC (Qraw_text);
5982 raw_text_eol_type = AREF (spec, 2);
5983 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5984 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5985 : AREF (raw_text_eol_type, 2));
ec6d2bb8
KH
5986}
5987
54f78171 5988
1911a33b
KH
5989/* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5990 the subsidiary that has the same eol-spec as PARENT (if it is not
5991 nil and specifies end-of-line format) or the system's setting
fcbcfb64 5992 (system_eol_type). */
df7492f9
KH
5993
5994Lisp_Object
971de7fb 5995coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
54f78171 5996{
3e139625 5997 Lisp_Object spec, eol_type;
54f78171 5998
d3e4cb56
KH
5999 if (NILP (coding_system))
6000 coding_system = Qraw_text;
df7492f9 6001 spec = CODING_SYSTEM_SPEC (coding_system);
df7492f9 6002 eol_type = AREF (spec, 2);
fcbcfb64 6003 if (VECTORP (eol_type))
df7492f9 6004 {
df7492f9
KH
6005 Lisp_Object parent_eol_type;
6006
fcbcfb64
KH
6007 if (! NILP (parent))
6008 {
6009 Lisp_Object parent_spec;
6010
4a015c45 6011 parent_spec = CODING_SYSTEM_SPEC (parent);
fcbcfb64 6012 parent_eol_type = AREF (parent_spec, 2);
1911a33b 6013 if (VECTORP (parent_eol_type))
4628bef1 6014 parent_eol_type = system_eol_type;
fcbcfb64
KH
6015 }
6016 else
6017 parent_eol_type = system_eol_type;
df7492f9
KH
6018 if (EQ (parent_eol_type, Qunix))
6019 coding_system = AREF (eol_type, 0);
6020 else if (EQ (parent_eol_type, Qdos))
6021 coding_system = AREF (eol_type, 1);
6022 else if (EQ (parent_eol_type, Qmac))
6023 coding_system = AREF (eol_type, 2);
54f78171 6024 }
df7492f9 6025 return coding_system;
54f78171
KH
6026}
6027
fcaf8878
KH
6028
6029/* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6030 decided for writing to a process. If not, complement them, and
6031 return a new coding system. */
6032
6033Lisp_Object
4628bef1 6034complement_process_encoding_system (Lisp_Object coding_system)
fcaf8878 6035{
5886ec9c
KH
6036 Lisp_Object coding_base = Qnil, eol_base = Qnil;
6037 Lisp_Object spec, attrs;
93d50df8 6038 int i;
fcaf8878 6039
93d50df8 6040 for (i = 0; i < 3; i++)
fcaf8878 6041 {
93d50df8
KH
6042 if (i == 1)
6043 coding_system = CDR_SAFE (Vdefault_process_coding_system);
6044 else if (i == 2)
6045 coding_system = preferred_coding_system ();
6046 spec = CODING_SYSTEM_SPEC (coding_system);
6047 if (NILP (spec))
6048 continue;
6049 attrs = AREF (spec, 0);
6050 if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6051 coding_base = CODING_ATTR_BASE_NAME (attrs);
6052 if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6053 eol_base = coding_system;
6054 if (! NILP (coding_base) && ! NILP (eol_base))
6055 break;
fcaf8878 6056 }
fcaf8878 6057
93d50df8
KH
6058 if (i > 0)
6059 /* The original CODING_SYSTEM didn't specify text-conversion or
6060 eol-conversion. Be sure that we return a fully complemented
6061 coding system. */
6062 coding_system = coding_inherit_eol_type (coding_base, eol_base);
6063 return coding_system;
fcaf8878
KH
6064}
6065
6066
4ed46869
KH
6067/* Emacs has a mechanism to automatically detect a coding system if it
6068 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
6069 it's impossible to distinguish some coding systems accurately
6070 because they use the same range of codes. So, at first, coding
6071 systems are categorized into 7, those are:
6072
0ef69138 6073 o coding-category-emacs-mule
4ed46869
KH
6074
6075 The category for a coding system which has the same code range
6076 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 6077 symbol) `emacs-mule' by default.
4ed46869
KH
6078
6079 o coding-category-sjis
6080
6081 The category for a coding system which has the same code range
6082 as SJIS. Assigned the coding-system (Lisp
7717c392 6083 symbol) `japanese-shift-jis' by default.
4ed46869
KH
6084
6085 o coding-category-iso-7
6086
6087 The category for a coding system which has the same code range
7717c392 6088 as ISO2022 of 7-bit environment. This doesn't use any locking
d46c5b12
KH
6089 shift and single shift functions. This can encode/decode all
6090 charsets. Assigned the coding-system (Lisp symbol)
6091 `iso-2022-7bit' by default.
6092
6093 o coding-category-iso-7-tight
6094
6095 Same as coding-category-iso-7 except that this can
6096 encode/decode only the specified charsets.
4ed46869
KH
6097
6098 o coding-category-iso-8-1
6099
6100 The category for a coding system which has the same code range
6101 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
6102 for DIMENSION1 charset. This doesn't use any locking shift
6103 and single shift functions. Assigned the coding-system (Lisp
6104 symbol) `iso-latin-1' by default.
4ed46869
KH
6105
6106 o coding-category-iso-8-2
6107
6108 The category for a coding system which has the same code range
6109 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
6110 for DIMENSION2 charset. This doesn't use any locking shift
6111 and single shift functions. Assigned the coding-system (Lisp
6112 symbol) `japanese-iso-8bit' by default.
4ed46869 6113
7717c392 6114 o coding-category-iso-7-else
4ed46869
KH
6115
6116 The category for a coding system which has the same code range
ad1746f5 6117 as ISO2022 of 7-bit environment but uses locking shift or
7717c392
KH
6118 single shift functions. Assigned the coding-system (Lisp
6119 symbol) `iso-2022-7bit-lock' by default.
6120
6121 o coding-category-iso-8-else
6122
6123 The category for a coding system which has the same code range
ad1746f5 6124 as ISO2022 of 8-bit environment but uses locking shift or
7717c392
KH
6125 single shift functions. Assigned the coding-system (Lisp
6126 symbol) `iso-2022-8bit-ss2' by default.
4ed46869
KH
6127
6128 o coding-category-big5
6129
6130 The category for a coding system which has the same code range
6131 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 6132 `cn-big5' by default.
4ed46869 6133
fa42c37f
KH
6134 o coding-category-utf-8
6135
6136 The category for a coding system which has the same code range
6e76ae91 6137 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
fa42c37f
KH
6138 symbol) `utf-8' by default.
6139
6140 o coding-category-utf-16-be
6141
6142 The category for a coding system in which a text has a
6143 Unicode signature (cf. Unicode Standard) in the order of BIG
6144 endian at the head. Assigned the coding-system (Lisp symbol)
6145 `utf-16-be' by default.
6146
6147 o coding-category-utf-16-le
6148
6149 The category for a coding system in which a text has a
6150 Unicode signature (cf. Unicode Standard) in the order of
6151 LITTLE endian at the head. Assigned the coding-system (Lisp
6152 symbol) `utf-16-le' by default.
6153
1397dc18
KH
6154 o coding-category-ccl
6155
6156 The category for a coding system of which encoder/decoder is
6157 written in CCL programs. The default value is nil, i.e., no
6158 coding system is assigned.
6159
4ed46869
KH
6160 o coding-category-binary
6161
6162 The category for a coding system not categorized in any of the
6163 above. Assigned the coding-system (Lisp symbol)
e0e989f6 6164 `no-conversion' by default.
4ed46869
KH
6165
6166 Each of them is a Lisp symbol and the value is an actual
df7492f9 6167 `coding-system's (this is also a Lisp symbol) assigned by a user.
4ed46869
KH
6168 What Emacs does actually is to detect a category of coding system.
6169 Then, it uses a `coding-system' assigned to it. If Emacs can't
df7492f9 6170 decide only one possible category, it selects a category of the
4ed46869
KH
6171 highest priority. Priorities of categories are also specified by a
6172 user in a Lisp variable `coding-category-list'.
6173
6174*/
6175
df7492f9
KH
6176#define EOL_SEEN_NONE 0
6177#define EOL_SEEN_LF 1
6178#define EOL_SEEN_CR 2
6179#define EOL_SEEN_CRLF 4
66cfb530 6180
ff0dacd7
KH
6181/* Detect how end-of-line of a text of length SRC_BYTES pointed by
6182 SOURCE is encoded. If CATEGORY is one of
6183 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6184 two-byte, else they are encoded by one-byte.
6185
6186 Return one of EOL_SEEN_XXX. */
4ed46869 6187
bc4bc72a 6188#define MAX_EOL_CHECK_COUNT 3
d46c5b12
KH
6189
6190static int
d311d28c 6191detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
cf84bb53 6192 enum coding_category category)
4ed46869 6193{
f6cbaf43 6194 const unsigned char *src = source, *src_end = src + src_bytes;
4ed46869 6195 unsigned char c;
df7492f9
KH
6196 int total = 0;
6197 int eol_seen = EOL_SEEN_NONE;
4ed46869 6198
89528eb3 6199 if ((1 << category) & CATEGORY_MASK_UTF_16)
4ed46869 6200 {
df7492f9 6201 int msb, lsb;
fa42c37f 6202
89528eb3
KH
6203 msb = category == (coding_category_utf_16_le
6204 | coding_category_utf_16_le_nosig);
df7492f9 6205 lsb = 1 - msb;
fa42c37f 6206
df7492f9 6207 while (src + 1 < src_end)
fa42c37f 6208 {
df7492f9
KH
6209 c = src[lsb];
6210 if (src[msb] == 0 && (c == '\n' || c == '\r'))
fa42c37f 6211 {
df7492f9
KH
6212 int this_eol;
6213
6214 if (c == '\n')
6215 this_eol = EOL_SEEN_LF;
6216 else if (src + 3 >= src_end
6217 || src[msb + 2] != 0
6218 || src[lsb + 2] != '\n')
6219 this_eol = EOL_SEEN_CR;
fa42c37f 6220 else
75f4f1ac
EZ
6221 {
6222 this_eol = EOL_SEEN_CRLF;
6223 src += 2;
6224 }
df7492f9
KH
6225
6226 if (eol_seen == EOL_SEEN_NONE)
6227 /* This is the first end-of-line. */
6228 eol_seen = this_eol;
6229 else if (eol_seen != this_eol)
fa42c37f 6230 {
75f4f1ac
EZ
6231 /* The found type is different from what found before.
6232 Allow for stray ^M characters in DOS EOL files. */
ef1b0ba7
SM
6233 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6234 || (eol_seen == EOL_SEEN_CRLF
6235 && this_eol == EOL_SEEN_CR))
75f4f1ac
EZ
6236 eol_seen = EOL_SEEN_CRLF;
6237 else
6238 {
6239 eol_seen = EOL_SEEN_LF;
6240 break;
6241 }
fa42c37f 6242 }
df7492f9
KH
6243 if (++total == MAX_EOL_CHECK_COUNT)
6244 break;
fa42c37f 6245 }
df7492f9 6246 src += 2;
fa42c37f 6247 }
bcf26d6a 6248 }
d46c5b12 6249 else
ef1b0ba7
SM
6250 while (src < src_end)
6251 {
6252 c = *src++;
6253 if (c == '\n' || c == '\r')
6254 {
6255 int this_eol;
d46c5b12 6256
ef1b0ba7
SM
6257 if (c == '\n')
6258 this_eol = EOL_SEEN_LF;
6259 else if (src >= src_end || *src != '\n')
6260 this_eol = EOL_SEEN_CR;
6261 else
6262 this_eol = EOL_SEEN_CRLF, src++;
d46c5b12 6263
ef1b0ba7
SM
6264 if (eol_seen == EOL_SEEN_NONE)
6265 /* This is the first end-of-line. */
6266 eol_seen = this_eol;
6267 else if (eol_seen != this_eol)
6268 {
6269 /* The found type is different from what found before.
6270 Allow for stray ^M characters in DOS EOL files. */
6271 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6272 || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6273 eol_seen = EOL_SEEN_CRLF;
6274 else
6275 {
6276 eol_seen = EOL_SEEN_LF;
6277 break;
6278 }
6279 }
6280 if (++total == MAX_EOL_CHECK_COUNT)
6281 break;
6282 }
6283 }
df7492f9 6284 return eol_seen;
73be902c
KH
6285}
6286
df7492f9 6287
24a73b0a 6288static Lisp_Object
971de7fb 6289adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
73be902c 6290{
0be8721c 6291 Lisp_Object eol_type;
8f924df7 6292
df7492f9
KH
6293 eol_type = CODING_ID_EOL_TYPE (coding->id);
6294 if (eol_seen & EOL_SEEN_LF)
24a73b0a
KH
6295 {
6296 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6297 eol_type = Qunix;
6298 }
6f197c07 6299 else if (eol_seen & EOL_SEEN_CRLF)
24a73b0a
KH
6300 {
6301 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6302 eol_type = Qdos;
6303 }
6f197c07 6304 else if (eol_seen & EOL_SEEN_CR)
24a73b0a
KH
6305 {
6306 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6307 eol_type = Qmac;
6308 }
6309 return eol_type;
d46c5b12 6310}
4ed46869 6311
df7492f9
KH
6312/* Detect how a text specified in CODING is encoded. If a coding
6313 system is detected, update fields of CODING by the detected coding
6314 system. */
0a28aafb 6315
74ab6df5 6316static void
971de7fb 6317detect_coding (struct coding_system *coding)
d46c5b12 6318{
8f924df7 6319 const unsigned char *src, *src_end;
73cce38d 6320 int saved_mode = coding->mode;
d46c5b12 6321
df7492f9
KH
6322 coding->consumed = coding->consumed_char = 0;
6323 coding->produced = coding->produced_char = 0;
6324 coding_set_source (coding);
1c3478b0 6325
df7492f9 6326 src_end = coding->source + coding->src_bytes;
c0e16b14 6327 coding->head_ascii = 0;
1c3478b0 6328
df7492f9
KH
6329 /* If we have not yet decided the text encoding type, detect it
6330 now. */
6331 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
b73bfc1c 6332 {
df7492f9 6333 int c, i;
6cb21a4f 6334 struct coding_detection_info detect_info;
2f3cbb32 6335 int null_byte_found = 0, eight_bit_found = 0;
df7492f9 6336
6cb21a4f 6337 detect_info.checked = detect_info.found = detect_info.rejected = 0;
2f3cbb32 6338 for (src = coding->source; src < src_end; src++)
d46c5b12 6339 {
df7492f9 6340 c = *src;
6cb21a4f 6341 if (c & 0x80)
6cb21a4f 6342 {
2f3cbb32 6343 eight_bit_found = 1;
2f3cbb32
KH
6344 if (null_byte_found)
6345 break;
6346 }
6347 else if (c < 0x20)
6348 {
6349 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6350 && ! inhibit_iso_escape_detection
6351 && ! detect_info.checked)
6cb21a4f 6352 {
2f3cbb32
KH
6353 if (detect_coding_iso_2022 (coding, &detect_info))
6354 {
6355 /* We have scanned the whole data. */
6356 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
c0e16b14
KH
6357 {
6358 /* We didn't find an 8-bit code. We may
6359 have found a null-byte, but it's very
ce5b453a 6360 rare that a binary file conforms to
c0e16b14
KH
6361 ISO-2022. */
6362 src = src_end;
6363 coding->head_ascii = src - coding->source;
6364 }
6365 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
2f3cbb32
KH
6366 break;
6367 }
6368 }
97b1b294 6369 else if (! c && !inhibit_null_byte_detection)
2f3cbb32
KH
6370 {
6371 null_byte_found = 1;
6372 if (eight_bit_found)
6373 break;
6cb21a4f 6374 }
c006c0c8
KH
6375 if (! eight_bit_found)
6376 coding->head_ascii++;
6cb21a4f 6377 }
c006c0c8 6378 else if (! eight_bit_found)
c0e16b14 6379 coding->head_ascii++;
d46c5b12 6380 }
df7492f9 6381
2f3cbb32
KH
6382 if (null_byte_found || eight_bit_found
6383 || coding->head_ascii < coding->src_bytes
6cb21a4f 6384 || detect_info.found)
d46c5b12 6385 {
ff0dacd7
KH
6386 enum coding_category category;
6387 struct coding_system *this;
df7492f9 6388
6cb21a4f
KH
6389 if (coding->head_ascii == coding->src_bytes)
6390 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
6391 for (i = 0; i < coding_category_raw_text; i++)
6392 {
6393 category = coding_priorities[i];
6394 this = coding_categories + category;
6395 if (detect_info.found & (1 << category))
24a73b0a 6396 break;
6cb21a4f
KH
6397 }
6398 else
2f3cbb32
KH
6399 {
6400 if (null_byte_found)
ff0dacd7 6401 {
2f3cbb32
KH
6402 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6403 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
ff0dacd7 6404 }
2f3cbb32
KH
6405 for (i = 0; i < coding_category_raw_text; i++)
6406 {
6407 category = coding_priorities[i];
6408 this = coding_categories + category;
6409 if (this->id < 0)
6410 {
6411 /* No coding system of this category is defined. */
6412 detect_info.rejected |= (1 << category);
6413 }
6414 else if (category >= coding_category_raw_text)
6415 continue;
6416 else if (detect_info.checked & (1 << category))
6417 {
6418 if (detect_info.found & (1 << category))
6419 break;
6420 }
6421 else if ((*(this->detector)) (coding, &detect_info)
6422 && detect_info.found & (1 << category))
6423 {
6424 if (category == coding_category_utf_16_auto)
6425 {
6426 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6427 category = coding_category_utf_16_le;
6428 else
6429 category = coding_category_utf_16_be;
6430 }
6431 break;
6432 }
6433 }
2f3cbb32 6434 }
c0e16b14
KH
6435
6436 if (i < coding_category_raw_text)
6437 setup_coding_system (CODING_ID_NAME (this->id), coding);
6438 else if (null_byte_found)
6439 setup_coding_system (Qno_conversion, coding);
6440 else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6441 == CATEGORY_MASK_ANY)
6442 setup_coding_system (Qraw_text, coding);
6443 else if (detect_info.rejected)
6444 for (i = 0; i < coding_category_raw_text; i++)
6445 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6446 {
6447 this = coding_categories + coding_priorities[i];
6448 setup_coding_system (CODING_ID_NAME (this->id), coding);
6449 break;
6450 }
d46c5b12 6451 }
b73bfc1c 6452 }
a470d443
KH
6453 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6454 == coding_category_utf_8_auto)
6455 {
6456 Lisp_Object coding_systems;
6457 struct coding_detection_info detect_info;
6458
6459 coding_systems
6460 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6461 detect_info.found = detect_info.rejected = 0;
6462 coding->head_ascii = 0;
6463 if (CONSP (coding_systems)
6464 && detect_coding_utf_8 (coding, &detect_info))
6465 {
6466 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6467 setup_coding_system (XCAR (coding_systems), coding);
6468 else
6469 setup_coding_system (XCDR (coding_systems), coding);
6470 }
6471 }
24a73b0a
KH
6472 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6473 == coding_category_utf_16_auto)
b49a1807
KH
6474 {
6475 Lisp_Object coding_systems;
6476 struct coding_detection_info detect_info;
6477
6478 coding_systems
a470d443 6479 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
b49a1807 6480 detect_info.found = detect_info.rejected = 0;
a470d443 6481 coding->head_ascii = 0;
b49a1807 6482 if (CONSP (coding_systems)
24a73b0a 6483 && detect_coding_utf_16 (coding, &detect_info))
b49a1807
KH
6484 {
6485 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6486 setup_coding_system (XCAR (coding_systems), coding);
24a73b0a 6487 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
b49a1807
KH
6488 setup_coding_system (XCDR (coding_systems), coding);
6489 }
6490 }
73cce38d 6491 coding->mode = saved_mode;
4ed46869 6492}
4ed46869 6493
d46c5b12 6494
aaaf0b1e 6495static void
971de7fb 6496decode_eol (struct coding_system *coding)
aaaf0b1e 6497{
24a73b0a
KH
6498 Lisp_Object eol_type;
6499 unsigned char *p, *pbeg, *pend;
3ed051d4 6500
24a73b0a 6501 eol_type = CODING_ID_EOL_TYPE (coding->id);
0a9564cb 6502 if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
24a73b0a
KH
6503 return;
6504
6505 if (NILP (coding->dst_object))
6506 pbeg = coding->destination;
6507 else
6508 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6509 pend = pbeg + coding->produced;
6510
6511 if (VECTORP (eol_type))
aaaf0b1e 6512 {
df7492f9 6513 int eol_seen = EOL_SEEN_NONE;
4ed46869 6514
24a73b0a 6515 for (p = pbeg; p < pend; p++)
aaaf0b1e 6516 {
df7492f9
KH
6517 if (*p == '\n')
6518 eol_seen |= EOL_SEEN_LF;
6519 else if (*p == '\r')
aaaf0b1e 6520 {
df7492f9 6521 if (p + 1 < pend && *(p + 1) == '\n')
aaaf0b1e 6522 {
df7492f9
KH
6523 eol_seen |= EOL_SEEN_CRLF;
6524 p++;
aaaf0b1e 6525 }
aaaf0b1e 6526 else
df7492f9 6527 eol_seen |= EOL_SEEN_CR;
aaaf0b1e 6528 }
aaaf0b1e 6529 }
75f4f1ac
EZ
6530 /* Handle DOS-style EOLs in a file with stray ^M characters. */
6531 if ((eol_seen & EOL_SEEN_CRLF) != 0
6532 && (eol_seen & EOL_SEEN_CR) != 0
6533 && (eol_seen & EOL_SEEN_LF) == 0)
6534 eol_seen = EOL_SEEN_CRLF;
6535 else if (eol_seen != EOL_SEEN_NONE
24a73b0a
KH
6536 && eol_seen != EOL_SEEN_LF
6537 && eol_seen != EOL_SEEN_CRLF
6538 && eol_seen != EOL_SEEN_CR)
6539 eol_seen = EOL_SEEN_LF;
df7492f9 6540 if (eol_seen != EOL_SEEN_NONE)
24a73b0a 6541 eol_type = adjust_coding_eol_type (coding, eol_seen);
aaaf0b1e 6542 }
d46c5b12 6543
24a73b0a 6544 if (EQ (eol_type, Qmac))
27901516 6545 {
24a73b0a 6546 for (p = pbeg; p < pend; p++)
df7492f9
KH
6547 if (*p == '\r')
6548 *p = '\n';
4ed46869 6549 }
24a73b0a 6550 else if (EQ (eol_type, Qdos))
df7492f9 6551 {
d311d28c 6552 ptrdiff_t n = 0;
b73bfc1c 6553
24a73b0a
KH
6554 if (NILP (coding->dst_object))
6555 {
4347441b
KH
6556 /* Start deleting '\r' from the tail to minimize the memory
6557 movement. */
24a73b0a
KH
6558 for (p = pend - 2; p >= pbeg; p--)
6559 if (*p == '\r')
6560 {
72af86bd 6561 memmove (p, p + 1, pend-- - p - 1);
24a73b0a
KH
6562 n++;
6563 }
6564 }
6565 else
6566 {
d311d28c
PE
6567 ptrdiff_t pos_byte = coding->dst_pos_byte;
6568 ptrdiff_t pos = coding->dst_pos;
6569 ptrdiff_t pos_end = pos + coding->produced_char - 1;
4347441b
KH
6570
6571 while (pos < pos_end)
6572 {
6573 p = BYTE_POS_ADDR (pos_byte);
6574 if (*p == '\r' && p[1] == '\n')
6575 {
6576 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6577 n++;
6578 pos_end--;
6579 }
6580 pos++;
69b8522d
KH
6581 if (coding->dst_multibyte)
6582 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6583 else
6584 pos_byte++;
4347441b 6585 }
24a73b0a
KH
6586 }
6587 coding->produced -= n;
6588 coding->produced_char -= n;
aaaf0b1e 6589 }
4ed46869
KH
6590}
6591
7d64c6ad 6592
a6f87d34
KH
6593/* Return a translation table (or list of them) from coding system
6594 attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6595 decoding (ENCODEP is zero). */
7d64c6ad 6596
e6a54062 6597static Lisp_Object
971de7fb 6598get_translation_table (Lisp_Object attrs, int encodep, int *max_lookup)
7d64c6ad
KH
6599{
6600 Lisp_Object standard, translation_table;
09ee6fdd 6601 Lisp_Object val;
7d64c6ad 6602
4bed5909
CY
6603 if (NILP (Venable_character_translation))
6604 {
6605 if (max_lookup)
6606 *max_lookup = 0;
6607 return Qnil;
6608 }
7d64c6ad
KH
6609 if (encodep)
6610 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6611 standard = Vstandard_translation_table_for_encode;
6612 else
6613 translation_table = CODING_ATTR_DECODE_TBL (attrs),
6614 standard = Vstandard_translation_table_for_decode;
7d64c6ad 6615 if (NILP (translation_table))
09ee6fdd
KH
6616 translation_table = standard;
6617 else
a6f87d34 6618 {
09ee6fdd
KH
6619 if (SYMBOLP (translation_table))
6620 translation_table = Fget (translation_table, Qtranslation_table);
6621 else if (CONSP (translation_table))
6622 {
6623 translation_table = Fcopy_sequence (translation_table);
6624 for (val = translation_table; CONSP (val); val = XCDR (val))
6625 if (SYMBOLP (XCAR (val)))
6626 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6627 }
6628 if (CHAR_TABLE_P (standard))
6629 {
6630 if (CONSP (translation_table))
6631 translation_table = nconc2 (translation_table,
6632 Fcons (standard, Qnil));
6633 else
6634 translation_table = Fcons (translation_table,
6635 Fcons (standard, Qnil));
6636 }
a6f87d34 6637 }
2170c8f0
KH
6638
6639 if (max_lookup)
09ee6fdd 6640 {
2170c8f0
KH
6641 *max_lookup = 1;
6642 if (CHAR_TABLE_P (translation_table)
6643 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6644 {
6645 val = XCHAR_TABLE (translation_table)->extras[1];
6646 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6647 *max_lookup = XFASTINT (val);
6648 }
6649 else if (CONSP (translation_table))
6650 {
2735d060 6651 Lisp_Object tail;
09ee6fdd 6652
2170c8f0
KH
6653 for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6654 if (CHAR_TABLE_P (XCAR (tail))
6655 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6656 {
2735d060
PE
6657 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6658 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6659 *max_lookup = XFASTINT (tailval);
2170c8f0
KH
6660 }
6661 }
a6f87d34 6662 }
7d64c6ad
KH
6663 return translation_table;
6664}
6665
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables.  C and TRANS must be lvalues.

   When the table entry is a character, C is replaced by it and TRANS
   is left as Qnil; with a list, the lookup then continues in the
   following tables using the new C.  When the entry is some other
   non-nil value, TRANS is set to it and the lookup stops.  When there
   is no entry, TRANS ends up Qnil and C is unchanged.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)                        \
  do {                                                                   \
    trans = Qnil;                                                        \
    if (CHAR_TABLE_P (table))                                            \
      {                                                                  \
        trans = CHAR_TABLE_REF (table, c);                               \
        if (CHARACTERP (trans))                                          \
          {                                                              \
            c = XFASTINT (trans);                                        \
            trans = Qnil;                                                \
          }                                                              \
      }                                                                  \
    else if (CONSP (table))                                              \
      {                                                                  \
        Lisp_Object lookup_tail_;                                        \
                                                                         \
        for (lookup_tail_ = table; CONSP (lookup_tail_);                 \
             lookup_tail_ = XCDR (lookup_tail_))                         \
          {                                                              \
            if (! CHAR_TABLE_P (XCAR (lookup_tail_)))                    \
              continue;                                                  \
            trans = CHAR_TABLE_REF (XCAR (lookup_tail_), c);             \
            if (CHARACTERP (trans))                                      \
              {                                                          \
                c = XFASTINT (trans);                                    \
                trans = Qnil;                                            \
              }                                                          \
            else if (! NILP (trans))                                     \
              break;                                                     \
          }                                                              \
      }                                                                  \
  } while (0)
6690
7d64c6ad 6691
e951386e
KH
6692/* Return a translation of character(s) at BUF according to TRANS.
6693 TRANS is TO-CHAR or ((FROM . TO) ...) where
6694 FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6695 The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6696 translation is found, and Qnil if not found..
6697 If BUF is too short to lookup characters in FROM, return Qt. */
6698
69a80ea3 6699static Lisp_Object
971de7fb 6700get_translation (Lisp_Object trans, int *buf, int *buf_end)
69a80ea3 6701{
e951386e
KH
6702
6703 if (INTEGERP (trans))
6704 return trans;
6705 for (; CONSP (trans); trans = XCDR (trans))
69a80ea3 6706 {
e951386e
KH
6707 Lisp_Object val = XCAR (trans);
6708 Lisp_Object from = XCAR (val);
2c6a9faa
PE
6709 ptrdiff_t len = ASIZE (from);
6710 ptrdiff_t i;
69a80ea3 6711
e951386e 6712 for (i = 0; i < len; i++)
69a80ea3 6713 {
e951386e
KH
6714 if (buf + i == buf_end)
6715 return Qt;
6716 if (XINT (AREF (from, i)) != buf[i])
6717 break;
69a80ea3 6718 }
e951386e
KH
6719 if (i == len)
6720 return val;
69a80ea3 6721 }
e951386e 6722 return Qnil;
69a80ea3
KH
6723}
6724
6725
d46c5b12 6726static int
cf84bb53
JB
6727produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6728 int last_block)
4ed46869 6729{
df7492f9
KH
6730 unsigned char *dst = coding->destination + coding->produced;
6731 unsigned char *dst_end = coding->destination + coding->dst_bytes;
d311d28c
PE
6732 ptrdiff_t produced;
6733 ptrdiff_t produced_chars = 0;
69a80ea3 6734 int carryover = 0;
4ed46869 6735
df7492f9 6736 if (! coding->chars_at_source)
4ed46869 6737 {
119852e7 6738 /* Source characters are in coding->charbuf. */
fba4576f
AS
6739 int *buf = coding->charbuf;
6740 int *buf_end = buf + coding->charbuf_used;
4ed46869 6741
db274c7a
KH
6742 if (EQ (coding->src_object, coding->dst_object))
6743 {
6744 coding_set_source (coding);
6745 dst_end = ((unsigned char *) coding->source) + coding->consumed;
6746 }
4ed46869 6747
df7492f9 6748 while (buf < buf_end)
4ed46869 6749 {
69a80ea3 6750 int c = *buf, i;
bc4bc72a 6751
df7492f9
KH
6752 if (c >= 0)
6753 {
d311d28c 6754 ptrdiff_t from_nchars = 1, to_nchars = 1;
69a80ea3
KH
6755 Lisp_Object trans = Qnil;
6756
09ee6fdd 6757 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
e6a54062 6758 if (! NILP (trans))
69a80ea3 6759 {
e951386e
KH
6760 trans = get_translation (trans, buf, buf_end);
6761 if (INTEGERP (trans))
6762 c = XINT (trans);
6763 else if (CONSP (trans))
6764 {
6765 from_nchars = ASIZE (XCAR (trans));
6766 trans = XCDR (trans);
6767 if (INTEGERP (trans))
6768 c = XINT (trans);
6769 else
6770 {
6771 to_nchars = ASIZE (trans);
6772 c = XINT (AREF (trans, 0));
6773 }
6774 }
6775 else if (EQ (trans, Qt) && ! last_block)
69a80ea3 6776 break;
69a80ea3
KH
6777 }
6778
5d009b3a 6779 if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
69a80ea3 6780 {
5d009b3a
PE
6781 if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
6782 / MAX_MULTIBYTE_LENGTH)
6783 < to_nchars)
6784 memory_full (SIZE_MAX);
69a80ea3
KH
6785 dst = alloc_destination (coding,
6786 buf_end - buf
6787 + MAX_MULTIBYTE_LENGTH * to_nchars,
6788 dst);
db274c7a
KH
6789 if (EQ (coding->src_object, coding->dst_object))
6790 {
6791 coding_set_source (coding);
e951386e
KH
6792 dst_end = (((unsigned char *) coding->source)
6793 + coding->consumed);
db274c7a
KH
6794 }
6795 else
6796 dst_end = coding->destination + coding->dst_bytes;
69a80ea3
KH
6797 }
6798
433f7f87 6799 for (i = 0; i < to_nchars; i++)
69a80ea3 6800 {
433f7f87
KH
6801 if (i > 0)
6802 c = XINT (AREF (trans, i));
69a80ea3
KH
6803 if (coding->dst_multibyte
6804 || ! CHAR_BYTE8_P (c))
db274c7a 6805 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
69a80ea3
KH
6806 else
6807 *dst++ = CHAR_TO_BYTE8 (c);
6808 }
6809 produced_chars += to_nchars;
e951386e 6810 buf += from_nchars;
d46c5b12 6811 }
df7492f9 6812 else
69a80ea3
KH
6813 /* This is an annotation datum. (-C) is the length. */
6814 buf += -c;
4ed46869 6815 }
69a80ea3 6816 carryover = buf_end - buf;
4ed46869 6817 }
fa42c37f 6818 else
fa42c37f 6819 {
119852e7 6820 /* Source characters are at coding->source. */
8f924df7 6821 const unsigned char *src = coding->source;
119852e7 6822 const unsigned char *src_end = src + coding->consumed;
4ed46869 6823
db274c7a
KH
6824 if (EQ (coding->dst_object, coding->src_object))
6825 dst_end = (unsigned char *) src;
df7492f9 6826 if (coding->src_multibyte != coding->dst_multibyte)
fa42c37f 6827 {
df7492f9 6828 if (coding->src_multibyte)
fa42c37f 6829 {
71c81426 6830 int multibytep = 1;
d311d28c 6831 ptrdiff_t consumed_chars = 0;
d46c5b12 6832
df7492f9
KH
6833 while (1)
6834 {
8f924df7 6835 const unsigned char *src_base = src;
df7492f9 6836 int c;
b73bfc1c 6837
df7492f9 6838 ONE_MORE_BYTE (c);
119852e7 6839 if (dst == dst_end)
df7492f9 6840 {
119852e7
KH
6841 if (EQ (coding->src_object, coding->dst_object))
6842 dst_end = (unsigned char *) src;
6843 if (dst == dst_end)
df7492f9 6844 {
d311d28c 6845 ptrdiff_t offset = src - coding->source;
119852e7
KH
6846
6847 dst = alloc_destination (coding, src_end - src + 1,
6848 dst);
6849 dst_end = coding->destination + coding->dst_bytes;
6850 coding_set_source (coding);
6851 src = coding->source + offset;
5c1ca13d 6852 src_end = coding->source + coding->consumed;
db274c7a
KH
6853 if (EQ (coding->src_object, coding->dst_object))
6854 dst_end = (unsigned char *) src;
df7492f9 6855 }
df7492f9
KH
6856 }
6857 *dst++ = c;
6858 produced_chars++;
6859 }
6860 no_more_source:
6861 ;
fa42c37f
KH
6862 }
6863 else
df7492f9
KH
6864 while (src < src_end)
6865 {
71c81426 6866 int multibytep = 1;
df7492f9 6867 int c = *src++;
b73bfc1c 6868
df7492f9
KH
6869 if (dst >= dst_end - 1)
6870 {
2c78b7e1 6871 if (EQ (coding->src_object, coding->dst_object))
8f924df7 6872 dst_end = (unsigned char *) src;
2c78b7e1
KH
6873 if (dst >= dst_end - 1)
6874 {
d311d28c
PE
6875 ptrdiff_t offset = src - coding->source;
6876 ptrdiff_t more_bytes;
119852e7 6877
db274c7a
KH
6878 if (EQ (coding->src_object, coding->dst_object))
6879 more_bytes = ((src_end - src) / 2) + 2;
6880 else
6881 more_bytes = src_end - src + 2;
6882 dst = alloc_destination (coding, more_bytes, dst);
2c78b7e1
KH
6883 dst_end = coding->destination + coding->dst_bytes;
6884 coding_set_source (coding);
119852e7 6885 src = coding->source + offset;
5c1ca13d 6886 src_end = coding->source + coding->consumed;
db274c7a
KH
6887 if (EQ (coding->src_object, coding->dst_object))
6888 dst_end = (unsigned char *) src;
2c78b7e1 6889 }
df7492f9
KH
6890 }
6891 EMIT_ONE_BYTE (c);
6892 }
d46c5b12 6893 }
df7492f9
KH
6894 else
6895 {
6896 if (!EQ (coding->src_object, coding->dst_object))
fa42c37f 6897 {
d311d28c 6898 ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
4ed46869 6899
df7492f9 6900 if (require > 0)
fa42c37f 6901 {
d311d28c 6902 ptrdiff_t offset = src - coding->source;
df7492f9
KH
6903
6904 dst = alloc_destination (coding, require, dst);
6905 coding_set_source (coding);
6906 src = coding->source + offset;
5c1ca13d 6907 src_end = coding->source + coding->consumed;
fa42c37f
KH
6908 }
6909 }
119852e7 6910 produced_chars = coding->consumed_char;
df7492f9 6911 while (src < src_end)
14daee73 6912 *dst++ = *src++;
fa42c37f
KH
6913 }
6914 }
6915
df7492f9 6916 produced = dst - (coding->destination + coding->produced);
284201e4 6917 if (BUFFERP (coding->dst_object) && produced_chars > 0)
df7492f9
KH
6918 insert_from_gap (produced_chars, produced);
6919 coding->produced += produced;
6920 coding->produced_char += produced_chars;
69a80ea3 6921 return carryover;
fa42c37f
KH
6922}
6923
ff0dacd7
KH
6924/* Compose text in CODING->object according to the annotation data at
6925 CHARBUF. CHARBUF is an array:
e951386e 6926 [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
df7492f9 6927 */
4ed46869 6928
55d4c1b2 6929static inline void
d311d28c 6930produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
4ed46869 6931{
df7492f9 6932 int len;
d311d28c 6933 ptrdiff_t to;
df7492f9 6934 enum composition_method method;
df7492f9 6935 Lisp_Object components;
fa42c37f 6936
e951386e 6937 len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
69a80ea3 6938 to = pos + charbuf[2];
e951386e 6939 method = (enum composition_method) (charbuf[4]);
d46c5b12 6940
df7492f9
KH
6941 if (method == COMPOSITION_RELATIVE)
6942 components = Qnil;
e951386e 6943 else
d46c5b12 6944 {
df7492f9 6945 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
e951386e 6946 int i, j;
b73bfc1c 6947
e951386e
KH
6948 if (method == COMPOSITION_WITH_RULE)
6949 len = charbuf[2] * 3 - 2;
6950 charbuf += MAX_ANNOTATION_LENGTH;
6951 /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6952 for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
9ffd559c 6953 {
e951386e
KH
6954 if (charbuf[i] >= 0)
6955 args[j] = make_number (charbuf[i]);
6956 else
6957 {
6958 i++;
6959 args[j] = make_number (charbuf[i] % 0x100);
6960 }
9ffd559c 6961 }
e951386e 6962 components = (i == j ? Fstring (j, args) : Fvector (j, args));
d46c5b12 6963 }
69a80ea3 6964 compose_text (pos, to, components, Qnil, coding->dst_object);
d46c5b12
KH
6965}
6966
d46c5b12 6967
ff0dacd7
KH
6968/* Put `charset' property on text in CODING->object according to
6969 the annotation data at CHARBUF. CHARBUF is an array:
69a80ea3 6970 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
ff0dacd7 6971 */
d46c5b12 6972
55d4c1b2 6973static inline void
d311d28c 6974produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
d46c5b12 6975{
d311d28c 6976 ptrdiff_t from = pos - charbuf[2];
69a80ea3 6977 struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
b73bfc1c 6978
69a80ea3 6979 Fput_text_property (make_number (from), make_number (pos),
ff0dacd7
KH
6980 Qcharset, CHARSET_NAME (charset),
6981 coding->dst_object);
d46c5b12
KH
6982}
6983
d46c5b12 6984
/* Number of `int' elements initially requested for the conversion
   work buffer.  */
#define CHARBUF_SIZE 0x4000

/* Allocate CODING->charbuf with alloca and record its element count
   in CODING->charbuf_size.

   Because the buffer lives on the stack, this macro must be expanded
   directly in the function that uses the buffer, and that function
   must return an `int' result code: if no buffer could be obtained,
   the macro records CODING_RESULT_INSUFFICIENT_MEM and returns
   CODING->result from the enclosing function.

   The request is halved and retried while it yields a null pointer
   and the size is still above 1024 elements.
   NOTE(review): alloca normally has no failure return, so the retry
   and the failure branch look purely defensive -- confirm.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)                               \
  do {                                                                   \
    int size = CHARBUF_SIZE;                                             \
                                                                         \
    (coding)->charbuf = NULL;                                            \
    while (size > 1024)                                                  \
      {                                                                  \
        (coding)->charbuf = (int *) alloca (sizeof (int) * size);        \
        if ((coding)->charbuf)                                           \
          break;                                                         \
        size >>= 1;                                                      \
      }                                                                  \
    if (! (coding)->charbuf)                                             \
      {                                                                  \
        record_conversion_result ((coding),                              \
                                  CODING_RESULT_INSUFFICIENT_MEM);       \
        return (coding)->result;                                         \
      }                                                                  \
    (coding)->charbuf_size = size;                                       \
  } while (0)
4ed46869 7006
d46c5b12
KH
7007
7008static void
d311d28c 7009produce_annotation (struct coding_system *coding, ptrdiff_t pos)
d46c5b12 7010{
df7492f9
KH
7011 int *charbuf = coding->charbuf;
7012 int *charbuf_end = charbuf + coding->charbuf_used;
d46c5b12 7013
ff0dacd7
KH
7014 if (NILP (coding->dst_object))
7015 return;
d46c5b12 7016
df7492f9 7017 while (charbuf < charbuf_end)
a84f1519 7018 {
df7492f9 7019 if (*charbuf >= 0)
e951386e 7020 pos++, charbuf++;
d46c5b12 7021 else
d46c5b12 7022 {
df7492f9 7023 int len = -*charbuf;
e951386e
KH
7024
7025 if (len > 2)
7026 switch (charbuf[1])
7027 {
7028 case CODING_ANNOTATE_COMPOSITION_MASK:
7029 produce_composition (coding, charbuf, pos);
7030 break;
7031 case CODING_ANNOTATE_CHARSET_MASK:
7032 produce_charset (coding, charbuf, pos);
7033 break;
7034 }
df7492f9 7035 charbuf += len;
d46c5b12 7036 }
a84f1519 7037 }
d46c5b12
KH
7038}
7039
df7492f9
KH
7040/* Decode the data at CODING->src_object into CODING->dst_object.
7041 CODING->src_object is a buffer, a string, or nil.
7042 CODING->dst_object is a buffer.
d46c5b12 7043
df7492f9
KH
7044 If CODING->src_object is a buffer, it must be the current buffer.
7045 In this case, if CODING->src_pos is positive, it is a position of
7046 the source text in the buffer, otherwise, the source text is in the
7047 gap area of the buffer, and CODING->src_pos specifies the offset of
7048 the text from GPT (which must be the same as PT). If this is the
7049 same buffer as CODING->dst_object, CODING->src_pos must be
7050 negative.
d46c5b12 7051
b6828792 7052 If CODING->src_object is a string, CODING->src_pos is an index to
df7492f9 7053 that string.
d46c5b12 7054
df7492f9
KH
7055 If CODING->src_object is nil, CODING->source must already point to
7056 the non-relocatable memory area. In this case, CODING->src_pos is
7057 an offset from CODING->source.
73be902c 7058
df7492f9
KH
7059 The decoded data is inserted at the current point of the buffer
7060 CODING->dst_object.
7061*/
d46c5b12 7062
df7492f9 7063static int
971de7fb 7064decode_coding (struct coding_system *coding)
d46c5b12 7065{
df7492f9 7066 Lisp_Object attrs;
24a73b0a 7067 Lisp_Object undo_list;
7d64c6ad 7068 Lisp_Object translation_table;
d0396581 7069 struct ccl_spec cclspec;
69a80ea3
KH
7070 int carryover;
7071 int i;
d46c5b12 7072
df7492f9
KH
7073 if (BUFFERP (coding->src_object)
7074 && coding->src_pos > 0
7075 && coding->src_pos < GPT
7076 && coding->src_pos + coding->src_chars > GPT)
7077 move_gap_both (coding->src_pos, coding->src_pos_byte);
d46c5b12 7078
24a73b0a 7079 undo_list = Qt;
df7492f9 7080 if (BUFFERP (coding->dst_object))
1c3478b0 7081 {
df7492f9
KH
7082 if (current_buffer != XBUFFER (coding->dst_object))
7083 set_buffer_internal (XBUFFER (coding->dst_object));
7084 if (GPT != PT)
7085 move_gap_both (PT, PT_BYTE);
4b4deea2
TT
7086 undo_list = BVAR (current_buffer, undo_list);
7087 BVAR (current_buffer, undo_list) = Qt;
1c3478b0
KH
7088 }
7089
df7492f9
KH
7090 coding->consumed = coding->consumed_char = 0;
7091 coding->produced = coding->produced_char = 0;
7092 coding->chars_at_source = 0;
065e3595 7093 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 7094 coding->errors = 0;
1c3478b0 7095
df7492f9
KH
7096 ALLOC_CONVERSION_WORK_AREA (coding);
7097
7098 attrs = CODING_ID_ATTRS (coding->id);
2170c8f0 7099 translation_table = get_translation_table (attrs, 0, NULL);
df7492f9 7100
69a80ea3 7101 carryover = 0;
d0396581
KH
7102 if (coding->decoder == decode_coding_ccl)
7103 {
7104 coding->spec.ccl = &cclspec;
7105 setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7106 }
df7492f9 7107 do
b73bfc1c 7108 {
d311d28c 7109 ptrdiff_t pos = coding->dst_pos + coding->produced_char;
69a80ea3 7110
df7492f9
KH
7111 coding_set_source (coding);
7112 coding->annotated = 0;
69a80ea3 7113 coding->charbuf_used = carryover;
df7492f9 7114 (*(coding->decoder)) (coding);
df7492f9 7115 coding_set_destination (coding);
69a80ea3 7116 carryover = produce_chars (coding, translation_table, 0);
df7492f9 7117 if (coding->annotated)
69a80ea3
KH
7118 produce_annotation (coding, pos);
7119 for (i = 0; i < carryover; i++)
7120 coding->charbuf[i]
7121 = coding->charbuf[coding->charbuf_used - carryover + i];
d46c5b12 7122 }
d0396581
KH
7123 while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7124 || (coding->consumed < coding->src_bytes
7125 && (coding->result == CODING_RESULT_SUCCESS
7126 || coding->result == CODING_RESULT_INVALID_SRC)));
d46c5b12 7127
69a80ea3
KH
7128 if (carryover > 0)
7129 {
7130 coding_set_destination (coding);
7131 coding->charbuf_used = carryover;
7132 produce_chars (coding, translation_table, 1);
7133 }
7134
df7492f9
KH
7135 coding->carryover_bytes = 0;
7136 if (coding->consumed < coding->src_bytes)
d46c5b12 7137 {
df7492f9 7138 int nbytes = coding->src_bytes - coding->consumed;
8f924df7 7139 const unsigned char *src;
df7492f9
KH
7140
7141 coding_set_source (coding);
7142 coding_set_destination (coding);
7143 src = coding->source + coding->consumed;
7144
7145 if (coding->mode & CODING_MODE_LAST_BLOCK)
1c3478b0 7146 {
df7492f9
KH
7147 /* Flush out unprocessed data as binary chars. We are sure
7148 that the number of data is less than the size of
7149 coding->charbuf. */
065e3595 7150 coding->charbuf_used = 0;
b2dab6c8
JR
7151 coding->chars_at_source = 0;
7152
df7492f9 7153 while (nbytes-- > 0)
1c3478b0 7154 {
df7492f9 7155 int c = *src++;
98725083 7156
1c91457d
KH
7157 if (c & 0x80)
7158 c = BYTE8_TO_CHAR (c);
7159 coding->charbuf[coding->charbuf_used++] = c;
1c3478b0 7160 }
f6cbaf43 7161 produce_chars (coding, Qnil, 1);
d46c5b12 7162 }
d46c5b12 7163 else
df7492f9
KH
7164 {
7165 /* Record unprocessed bytes in coding->carryover. We are
7166 sure that the number of data is less than the size of
7167 coding->carryover. */
7168 unsigned char *p = coding->carryover;
7169
f289d375
KH
7170 if (nbytes > sizeof coding->carryover)
7171 nbytes = sizeof coding->carryover;
df7492f9
KH
7172 coding->carryover_bytes = nbytes;
7173 while (nbytes-- > 0)
7174 *p++ = *src++;
1c3478b0 7175 }
df7492f9 7176 coding->consumed = coding->src_bytes;
b73bfc1c 7177 }
69f76525 7178
0a9564cb
EZ
7179 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7180 && !inhibit_eol_conversion)
4347441b 7181 decode_eol (coding);
24a73b0a
KH
7182 if (BUFFERP (coding->dst_object))
7183 {
4b4deea2 7184 BVAR (current_buffer, undo_list) = undo_list;
24a73b0a
KH
7185 record_insert (coding->dst_pos, coding->produced_char);
7186 }
73be902c 7187 return coding->result;
4ed46869
KH
7188}
7189
aaaf0b1e 7190
e1c23804 7191/* Extract an annotation datum from a composition starting at POS and
ff0dacd7
KH
7192 ending before LIMIT of CODING->src_object (buffer or string), store
7193 the data in BUF, set *STOP to a starting position of the next
7194 composition (if any) or to LIMIT, and return the address of the
7195 next element of BUF.
7196
7197 If such an annotation is not found, set *STOP to a starting
7198 position of a composition after POS (if any) or to LIMIT, and
7199 return BUF. */
7200
55d4c1b2 7201static inline int *
d311d28c 7202handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
cf84bb53 7203 struct coding_system *coding, int *buf,
d311d28c 7204 ptrdiff_t *stop)
aaaf0b1e 7205{
d311d28c 7206 ptrdiff_t start, end;
ff0dacd7 7207 Lisp_Object prop;
aaaf0b1e 7208
ff0dacd7
KH
7209 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7210 || end > limit)
7211 *stop = limit;
7212 else if (start > pos)
7213 *stop = start;
7214 else
aaaf0b1e 7215 {
ff0dacd7 7216 if (start == pos)
aaaf0b1e 7217 {
ff0dacd7
KH
7218 /* We found a composition. Store the corresponding
7219 annotation data in BUF. */
7220 int *head = buf;
7221 enum composition_method method = COMPOSITION_METHOD (prop);
7222 int nchars = COMPOSITION_LENGTH (prop);
7223
e951386e 7224 ADD_COMPOSITION_DATA (buf, nchars, 0, method);
ff0dacd7 7225 if (method != COMPOSITION_RELATIVE)
aaaf0b1e 7226 {
ff0dacd7 7227 Lisp_Object components;
2c6a9faa 7228 ptrdiff_t i, len, i_byte;
ff0dacd7
KH
7229
7230 components = COMPOSITION_COMPONENTS (prop);
7231 if (VECTORP (components))
aaaf0b1e 7232 {
77b37c05 7233 len = ASIZE (components);
ff0dacd7
KH
7234 for (i = 0; i < len; i++)
7235 *buf++ = XINT (AREF (components, i));
aaaf0b1e 7236 }
ff0dacd7 7237 else if (STRINGP (components))
aaaf0b1e 7238 {
8f924df7 7239 len = SCHARS (components);
ff0dacd7
KH
7240 i = i_byte = 0;
7241 while (i < len)
7242 {
7243 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7244 buf++;
7245 }
7246 }
7247 else if (INTEGERP (components))
7248 {
7249 len = 1;
7250 *buf++ = XINT (components);
7251 }
7252 else if (CONSP (components))
7253 {
7254 for (len = 0; CONSP (components);
7255 len++, components = XCDR (components))
7256 *buf++ = XINT (XCAR (components));
aaaf0b1e 7257 }
aaaf0b1e 7258 else
ff0dacd7
KH
7259 abort ();
7260 *head -= len;
aaaf0b1e 7261 }
aaaf0b1e 7262 }
ff0dacd7
KH
7263
7264 if (find_composition (end, limit, &start, &end, &prop,
7265 coding->src_object)
7266 && end <= limit)
7267 *stop = start;
7268 else
7269 *stop = limit;
aaaf0b1e 7270 }
ff0dacd7
KH
7271 return buf;
7272}
7273
7274
e1c23804 7275/* Extract an annotation datum from a text property `charset' at POS of
ff0dacd7
KH
7276 CODING->src_object (buffer of string), store the data in BUF, set
7277 *STOP to the position where the value of `charset' property changes
7278 (limiting by LIMIT), and return the address of the next element of
7279 BUF.
7280
7281 If the property value is nil, set *STOP to the position where the
7282 property value is non-nil (limiting by LIMIT), and return BUF. */
7283
55d4c1b2 7284static inline int *
d311d28c 7285handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
cf84bb53 7286 struct coding_system *coding, int *buf,
d311d28c 7287 ptrdiff_t *stop)
ff0dacd7
KH
7288{
7289 Lisp_Object val, next;
7290 int id;
7291
7292 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7293 if (! NILP (val) && CHARSETP (val))
7294 id = XINT (CHARSET_SYMBOL_ID (val));
7295 else
7296 id = -1;
69a80ea3 7297 ADD_CHARSET_DATA (buf, 0, id);
ff0dacd7
KH
7298 next = Fnext_single_property_change (make_number (pos), Qcharset,
7299 coding->src_object,
7300 make_number (limit));
7301 *stop = XINT (next);
7302 return buf;
7303}
7304
7305
/* Fill CODING->charbuf with characters read from CODING->source,
   starting right after the CODING->consumed bytes that were already
   processed.  End-of-line conversion and, when TRANSLATION_TABLE is
   non-nil, character translation are applied while copying.
   MAX_LOOKUP is the number of ints allocated for the look-ahead
   buffer used during translation.

   On return CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used describe how much source was read and how
   many charbuf elements were written, and CODING->chars_at_source is
   reset to 0.  */
df7492f9 7306static void
cf84bb53
JB
7307consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7308 int max_lookup)
df7492f9
KH
7309{
7310 int *buf = coding->charbuf;
ff0dacd7 7311 int *buf_end = coding->charbuf + coding->charbuf_size;
7c78e542 7312 const unsigned char *src = coding->source + coding->consumed;
4776e638 7313 const unsigned char *src_end = coding->source + coding->src_bytes;
d311d28c
PE
7314 ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7315 ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
df7492f9
KH
7316 int multibytep = coding->src_multibyte;
7317 Lisp_Object eol_type;
7318 int c;
d311d28c 7319 ptrdiff_t stop, stop_composition, stop_charset;
09ee6fdd 7320 int *lookup_buf = NULL;
433f7f87
KH
7321
/* The look-ahead buffer is needed only when a translation table is
   in effect; it lives on the stack for the duration of this call.  */
7322 if (! NILP (translation_table))
09ee6fdd 7323 lookup_buf = alloca (sizeof (int) * max_lookup);
88993dfd 7324
/* Treat the EOL type as `unix' when EOL conversion is inhibited or
   when the coding system's EOL type is a vector (presumably meaning
   "not yet decided" -- confirm).  */
0a9564cb 7325 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
df7492f9
KH
7326 if (VECTORP (eol_type))
7327 eol_type = Qunix;
88993dfd 7328
df7492f9
KH
7329 /* Note: composition handling is not yet implemented. */
7330 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
ec6d2bb8 7331
/* STOP is the next position at which an annotation handler must run.
   Without a source object there are no text properties, so nothing
   stops the scan before END_POS.  */
0b5670c9
KH
7332 if (NILP (coding->src_object))
7333 stop = stop_composition = stop_charset = end_pos;
ff0dacd7 7334 else
0b5670c9
KH
7335 {
7336 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7337 stop = stop_composition = pos;
7338 else
7339 stop = stop_composition = end_pos;
7340 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7341 stop = stop_charset = pos;
7342 else
7343 stop_charset = end_pos;
7344 }
ec6d2bb8 7345
24a73b0a 7346 /* Compensate for CRLF and conversion. */
ff0dacd7 7347 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
df7492f9 7348 while (buf < buf_end)
aaaf0b1e 7349 {
433f7f87
KH
7350 Lisp_Object trans;
7351
df7492f9 7352 if (pos == stop)
ec6d2bb8 7353 {
df7492f9
KH
7354 if (pos == end_pos)
7355 break;
/* Run whichever handlers are due at POS, then recompute STOP as the
   nearer of the two next stop positions.  */
ff0dacd7
KH
7356 if (pos == stop_composition)
7357 buf = handle_composition_annotation (pos, end_pos, coding,
7358 buf, &stop_composition);
7359 if (pos == stop_charset)
7360 buf = handle_charset_annotation (pos, end_pos, coding,
7361 buf, &stop_charset);
7362 stop = (stop_composition < stop_charset
7363 ? stop_composition : stop_charset);
df7492f9
KH
7364 }
7365
/* Fetch the next character C.  For a unibyte source the raw-text and
   CCL encoders receive each byte unchanged; otherwise a valid
   multibyte sequence is read as one character (POS advances by its
   byte length) and any other byte is converted with BYTE8_TO_CHAR.  */
7366 if (! multibytep)
4776e638 7367 {
d311d28c 7368 int bytes;
aaaf0b1e 7369
4d1e6632
KH
7370 if (coding->encoder == encode_coding_raw_text
7371 || coding->encoder == encode_coding_ccl)
ea29edf2
KH
7372 c = *src++, pos++;
7373 else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
db274c7a 7374 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
4776e638 7375 else
f03caae0 7376 c = BYTE8_TO_CHAR (*src), src++, pos++;
4776e638 7377 }
df7492f9 7378 else
db274c7a 7379 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
/* EOL conversion: under selective display a CR counts as a newline.
   For a non-unix EOL type a newline becomes CR LF when the type is
   `dos' (the CR is stored here, the LF below) and a bare CR for any
   other type.  */
df7492f9
KH
7380 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7381 c = '\n';
7382 if (! EQ (eol_type, Qunix))
aaaf0b1e 7383 {
df7492f9 7384 if (c == '\n')
aaaf0b1e 7385 {
df7492f9
KH
7386 if (EQ (eol_type, Qdos))
7387 *buf++ = '\r';
7388 else
7389 c = '\r';
aaaf0b1e
KH
7390 }
7391 }
433f7f87 7392
/* TRANS stays nil when C has no translation.
   LOOKUP_TRANSLATION_TABLE is a macro defined elsewhere; presumably
   it sets TRANS from TRANSLATION_TABLE -- confirm.  */
e6a54062 7393 trans = Qnil;
09ee6fdd 7394 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
e6a54062 7395 if (NILP (trans))
433f7f87
KH
7396 *buf++ = c;
7397 else
7398 {
2c6a9faa 7399 ptrdiff_t from_nchars = 1, to_nchars = 1;
433f7f87
KH
7400 int *lookup_buf_end;
7401 const unsigned char *p = src;
7402 int i;
7403
/* Collect C plus up to MAX_LOOKUP - 1 following characters (without
   advancing SRC) so that get_translation can match a sequence.  */
7404 lookup_buf[0] = c;
7405 for (i = 1; i < max_lookup && p < src_end; i++)
7406 lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7407 lookup_buf_end = lookup_buf + i;
e951386e
KH
/* The result is handled in three shapes: an integer (a single
   replacement character), a cons whose car is a vector giving the
   number of source characters matched and whose cdr is either an
   integer or a vector of replacement characters, or anything else,
   in which case the loop stops.  */
7408 trans = get_translation (trans, lookup_buf, lookup_buf_end);
7409 if (INTEGERP (trans))
7410 c = XINT (trans);
7411 else if (CONSP (trans))
7412 {
7413 from_nchars = ASIZE (XCAR (trans));
7414 trans = XCDR (trans);
7415 if (INTEGERP (trans))
7416 c = XINT (trans);
7417 else
7418 {
7419 to_nchars = ASIZE (trans);
/* Stop if the whole replacement does not fit in what is left of the
   character buffer.  */
2c6a9faa 7420 if (buf_end - buf < to_nchars)
e951386e
KH
7421 break;
7422 c = XINT (AREF (trans, 0));
7423 }
7424 }
7425 else
433f7f87 7426 break;
e951386e 7427 *buf++ = c;
433f7f87
KH
/* Store the remaining replacement characters, then skip the extra
   source characters that were part of the matched sequence.  */
7428 for (i = 1; i < to_nchars; i++)
7429 *buf++ = XINT (AREF (trans, i));
7430 for (i = 1; i < from_nchars; i++, pos++)
7431 src += MULTIBYTE_LENGTH_NO_CHECK (src);
7432 }
aaaf0b1e 7433 }
ec6d2bb8 7434
df7492f9
KH
7435 coding->consumed = src - coding->source;
7436 coding->consumed_char = pos - coding->src_pos;
7437 coding->charbuf_used = buf - coding->charbuf;
7438 coding->chars_at_source = 0;
aaaf0b1e
KH
7439}
7440
4ed46869 7441
df7492f9
KH
7442/* Encode the text at CODING->src_object into CODING->dst_object.
7443 CODING->src_object is a buffer or a string.
7444 CODING->dst_object is a buffer or nil.
7445
7446 If CODING->src_object is a buffer, it must be the current buffer.
7447 In this case, if CODING->src_pos is positive, it is a position of
7448 the source text in the buffer, otherwise, the source text is in the
7449 gap area of the buffer, and coding->src_pos specifies the offset of
7450 the text from GPT (which must be the same as PT). If this is the
7451 same buffer as CODING->dst_object, CODING->src_pos must be
7452 negative and CODING should not have `pre-write-conversion'.
7453
7454 If CODING->src_object is a string, CODING should not have
7455 `pre-write-conversion'.
7456
7457 If CODING->dst_object is a buffer, the encoded data is inserted at
7458 the current point of that buffer.
7459
7460 If CODING->dst_object is nil, the encoded data is placed at the
7461 memory area specified by CODING->destination. */
7462
7463static int
971de7fb 7464encode_coding (struct coding_system *coding)
4ed46869 7465{
df7492f9 7466 Lisp_Object attrs;
7d64c6ad 7467 Lisp_Object translation_table;
09ee6fdd 7468 int max_lookup;
fb608df3 7469 struct ccl_spec cclspec;
9861e777 7470
/* The raw-text encoder never translates characters; every other
   encoder gets the translation table from the coding system's
   attributes (get_translation_table also fills in MAX_LOOKUP).  */
df7492f9 7471 attrs = CODING_ID_ATTRS (coding->id);
ea29edf2
KH
7472 if (coding->encoder == encode_coding_raw_text)
7473 translation_table = Qnil, max_lookup = 0;
7474 else
7475 translation_table = get_translation_table (attrs, 1, &max_lookup);
4ed46869 7476
/* When encoding into a buffer, make it current and adopt its
   multibyteness for the destination.  */
df7492f9 7477 if (BUFFERP (coding->dst_object))
8844fa83 7478 {
df7492f9
KH
7479 set_buffer_internal (XBUFFER (coding->dst_object));
7480 coding->dst_multibyte
4b4deea2 7481 = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
8844fa83 7482 }
4ed46869 7483
/* Reset the progress counters and the result/error state.  */
b73bfc1c 7484 coding->consumed = coding->consumed_char = 0;
df7492f9 7485 coding->produced = coding->produced_char = 0;
065e3595 7486 record_conversion_result (coding, CODING_RESULT_SUCCESS);
b73bfc1c 7487 coding->errors = 0;
b73bfc1c 7488
df7492f9 7489 ALLOC_CONVERSION_WORK_AREA (coding);
4ed46869 7490
/* CCLSPEC is a local of this function, so CODING->spec.ccl points
   into this stack frame and is meaningful only until we return.  */
fb608df3
KH
7491 if (coding->encoder == encode_coding_ccl)
7492 {
7493 coding->spec.ccl = &cclspec;
7494 setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7495 }
/* Alternate between loading characters from the source into the work
   area and running the encoder on them, until every source character
   has been consumed.  */
df7492f9
KH
7496 do {
7497 coding_set_source (coding);
09ee6fdd 7498 consume_chars (coding, translation_table, max_lookup);
df7492f9
KH
7499 coding_set_destination (coding);
7500 (*(coding->encoder)) (coding);
7501 } while (coding->consumed_char < coding->src_chars);
7502
/* Presumably the encoder wrote its output into the gap of the
   destination buffer; insert_from_gap makes it buffer text -- confirm
   against insert_from_gap.  */
284201e4 7503 if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
df7492f9
KH
7504 insert_from_gap (coding->produced_char, coding->produced);
7505
7506 return (coding->result);
ec6d2bb8
KH
7507}
7508
fb88bf2d 7509
24a73b0a
KH
7510/* Name (or base name) of work buffer for code conversion. */
7511static Lisp_Object Vcode_conversion_workbuf_name;
d46c5b12 7512
24a73b0a
KH
7513/* A working buffer used by the top level conversion. Once it is
7514 created, it is never destroyed. It has the name
7515 Vcode_conversion_workbuf_name. The other working buffers are
7516 destroyed after the use is finished, and their names are modified
7517 versions of Vcode_conversion_workbuf_name. */
7518static Lisp_Object Vcode_conversion_reused_workbuf;
b73bfc1c 7519
24a73b0a
KH
7520/* Nonzero iff Vcode_conversion_reused_workbuf is already in use.
   make_conversion_work_buffer post-increments this on every request,
   so it may exceed 1; code_conversion_restore resets it to 0 when the
   reused buffer is released. */
7521static int reused_workbuf_in_use;
4ed46869 7522
24a73b0a 7523
ad1746f5 7524/* Return a working buffer for code conversion. MULTIBYTE specifies the
24a73b0a 7525 multibyteness of the returned buffer. */
b73bfc1c 7526
f6cbaf43 7527static Lisp_Object
971de7fb 7528make_conversion_work_buffer (int multibyte)
df7492f9 7529{
24a73b0a
KH
7530 Lisp_Object name, workbuf;
7531 struct buffer *current;
4ed46869 7532
/* The counter is bumped on every call.  If it was already nonzero the
   shared buffer is taken, so create a separate buffer under a freshly
   generated name.  */
24a73b0a 7533 if (reused_workbuf_in_use++)
065e3595
KH
7534 {
7535 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7536 workbuf = Fget_buffer_create (name);
7537 }
df7492f9 7538 else
065e3595 7539 {
/* First user: hand out the shared buffer, creating it (again) if it
   is not a live buffer.  */
159bd5a2 7540 if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
a993c7a1
KH
7541 Vcode_conversion_reused_workbuf
7542 = Fget_buffer_create (Vcode_conversion_workbuf_name);
7543 workbuf = Vcode_conversion_reused_workbuf;
065e3595 7544 }
/* Temporarily switch to the work buffer to initialise it: inhibit
   modification hooks, empty it, disable undo recording and set its
   multibyteness; then switch back to the caller's buffer.  */
24a73b0a
KH
7545 current = current_buffer;
7546 set_buffer_internal (XBUFFER (workbuf));
df36ff1f
CY
7547 /* We can't allow modification hooks to run in the work buffer. For
7548 instance, directory_files_internal assumes that file decoding
7549 doesn't compile new regexps. */
7550 Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
3ed051d4 7551 Ferase_buffer ();
4b4deea2
TT
7552 BVAR (current_buffer, undo_list) = Qt;
7553 BVAR (current_buffer, enable_multibyte_characters) = multibyte ? Qt : Qnil;
df7492f9 7554 set_buffer_internal (current);
24a73b0a 7555 return workbuf;
df7492f9 7556}
d46c5b12 7557
24a73b0a 7558
/* Unwind handler registered by code_conversion_save.  ARG is a cons
   (BUFFER . WORKBUF).  If WORKBUF is non-nil it is released: the
   shared work buffer is merely marked as free again, while any other
   work buffer is killed if it is still live.  BUFFER is then made
   current again.  Always return Qnil.  */
4776e638 7559static Lisp_Object
971de7fb 7560code_conversion_restore (Lisp_Object arg)
4776e638 7561{
24a73b0a 7562 Lisp_Object current, workbuf;
948bdcf3 7563 struct gcpro gcpro1;
24a73b0a 7564
/* Protect ARG across the calls below, one of which (Fkill_buffer)
   may run Lisp.  */
948bdcf3 7565 GCPRO1 (arg);
24a73b0a
KH
7566 current = XCAR (arg);
7567 workbuf = XCDR (arg);
7568 if (! NILP (workbuf))
7569 {
7570 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7571 reused_workbuf_in_use = 0;
7572 else if (! NILP (Fbuffer_live_p (workbuf)))
7573 Fkill_buffer (workbuf);
7574 }
7575 set_buffer_internal (XBUFFER (current));
948bdcf3 7576 UNGCPRO;
4776e638
KH
7577 return Qnil;
7578}
b73bfc1c 7579
/* Arrange, via record_unwind_protect, for the current buffer to be
   restored by code_conversion_restore when the specpdl is unwound.
   If WITH_WORK_BUF is nonzero, also obtain a conversion work buffer
   whose multibyteness is given by MULTIBYTE; the same unwind handler
   releases it.  Return the work buffer, or Qnil if none was
   requested.  */
24a73b0a 7580Lisp_Object
971de7fb 7581code_conversion_save (int with_work_buf, int multibyte)
df7492f9 7582{
24a73b0a 7583 Lisp_Object workbuf = Qnil;
b73bfc1c 7584
4776e638 7585 if (with_work_buf)
24a73b0a
KH
7586 workbuf = make_conversion_work_buffer (multibyte);
7587 record_unwind_protect (code_conversion_restore,
7588 Fcons (Fcurrent_buffer (), workbuf));
4776e638 7589 return workbuf;
df7492f9 7590}
d46c5b12 7591
/* Decode, by CODING, a source text of CHARS characters and BYTES
   bytes that belongs to the current buffer, with the current buffer
   also being the destination (the destination position is point).
   The source position is recorded as the negative offsets -CHARS and
   -BYTES; presumably this means the text sits in the buffer's gap
   area -- confirm against decode_coding.  If the coding system has a
   post-read-conversion function it is called afterwards.  Return
   CODING->result.  */
df7492f9 7592int
cf84bb53 7593decode_coding_gap (struct coding_system *coding,
d311d28c 7594 ptrdiff_t chars, ptrdiff_t bytes)
df7492f9 7595{
d311d28c 7596 ptrdiff_t count = SPECPDL_INDEX ();
5e5c78be 7597 Lisp_Object attrs;
fb88bf2d 7598
/* No work buffer is needed; this only records the current buffer so
   that unbind_to below restores it.  */
24a73b0a 7599 code_conversion_save (0, 0);
ec6d2bb8 7600
24a73b0a 7601 coding->src_object = Fcurrent_buffer ();
df7492f9
KH
7602 coding->src_chars = chars;
7603 coding->src_bytes = bytes;
7604 coding->src_pos = -chars;
7605 coding->src_pos_byte = -bytes;
/* The source is considered multibyte when it has more bytes than
   characters.  */
7606 coding->src_multibyte = chars < bytes;
24a73b0a 7607 coding->dst_object = coding->src_object;
df7492f9
KH
7608 coding->dst_pos = PT;
7609 coding->dst_pos_byte = PT_BYTE;
4b4deea2 7610 coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
4ed46869 7611
df7492f9
KH
7612 if (CODING_REQUIRE_DETECTION (coding))
7613 detect_coding (coding);
8f924df7 7614
/* The whole source is supplied at once, so this is the last block.
   Shrinking of the buffer text is inhibited while decoding runs.  */
9286b333 7615 coding->mode |= CODING_MODE_LAST_BLOCK;
287c57d7 7616 current_buffer->text->inhibit_shrinking = 1;
df7492f9 7617 decode_coding (coding);
287c57d7 7618 current_buffer->text->inhibit_shrinking = 0;
d46c5b12 7619
5e5c78be
KH
7620 attrs = CODING_ID_ATTRS (coding->id);
7621 if (! NILP (CODING_ATTR_POST_READ (attrs)))
b73bfc1c 7622 {
d311d28c 7623 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
5e5c78be
KH
7624 Lisp_Object val;
7625
/* Call the post-read function with point at the start of the decoded
   text and the number of decoded characters as argument.  Its value
   must be a natural number (CHECK_NATNUM signals otherwise); the
   produced counts are then adjusted by however much the buffer size
   changed during the call.  */
7626 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
5e5c78be
KH
7627 val = call1 (CODING_ATTR_POST_READ (attrs),
7628 make_number (coding->produced_char));
5e5c78be
KH
7629 CHECK_NATNUM (val);
7630 coding->produced_char += Z - prev_Z;
7631 coding->produced += Z_BYTE - prev_Z_BYTE;
b73bfc1c 7632 }
4ed46869 7633
df7492f9 7634 unbind_to (count, Qnil);
b73bfc1c
KH
7635 return coding->result;
7636}
52d41803 7637
d46c5b12 7638
df7492f9
KH
7639/* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7640 SRC_OBJECT into DST_OBJECT by coding context CODING.
b73bfc1c 7641
df7492f9 7642 SRC_OBJECT is a buffer, a string, or Qnil.
b73bfc1c 7643
df7492f9
KH
7644 If it is a buffer, the text is at point of the buffer. FROM and TO
7645 are positions in the buffer.
b73bfc1c 7646
df7492f9
KH
7647 If it is a string, the text is at the beginning of the string.
7648 FROM and TO are indices to the string.
4ed46869 7649
df7492f9
KH
7650 If it is nil, the text is at coding->source. FROM and TO are
7651 indices to coding->source.
bb10be8b 7652
df7492f9 7653 DST_OBJECT is a buffer, Qt, or Qnil.
4ed46869 7654
df7492f9
KH
7655 If it is a buffer, the decoded text is inserted at point of the
7656 buffer. If the buffer is the same as SRC_OBJECT, the source text
7657 is deleted.
4ed46869 7658
df7492f9
KH
7659 If it is Qt, a string is made from the decoded text, and
7660 set in CODING->dst_object.
d46c5b12 7661
df7492f9 7662 If it is Qnil, the decoded text is stored at CODING->destination.
2cb26057 7663 The caller must allocate CODING->dst_bytes bytes at
df7492f9
KH
7664 CODING->destination by xmalloc. If the decoded text is longer than
7665 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7666 */
d46c5b12 7667
/* Decode the text FROM..TO (bytes FROM_BYTE..TO_BYTE) of SRC_OBJECT
   by CODING, delivering the result according to DST_OBJECT (a buffer,
   Qt for a string stored in CODING->dst_object, or Qnil for the
   memory area CODING->destination).  When SRC_OBJECT and DST_OBJECT
   are the same buffer the source text is replaced in place and point
   and the markers at the edges of the region are repositioned.  */
df7492f9 7668void
cf84bb53
JB
7669decode_coding_object (struct coding_system *coding,
7670 Lisp_Object src_object,
d311d28c
PE
7671 ptrdiff_t from, ptrdiff_t from_byte,
7672 ptrdiff_t to, ptrdiff_t to_byte,
cf84bb53 7673 Lisp_Object dst_object)
d46c5b12 7674{
d311d28c 7675 ptrdiff_t count = SPECPDL_INDEX ();
c4a63b12 7676 unsigned char *destination IF_LINT (= NULL);
d311d28c
PE
7677 ptrdiff_t dst_bytes IF_LINT (= 0);
7678 ptrdiff_t chars = to - from;
7679 ptrdiff_t bytes = to_byte - from_byte;
df7492f9 7680 Lisp_Object attrs;
/* Saved point.  These hold PT and PT_BYTE, so they use ptrdiff_t like
   every other position in this function; a plain int would truncate
   positions that do not fit in int.  SAVED_PT stays -1 unless the
   text is replaced in place.  */
c4a63b12 7681 ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
64cedb0c 7682 int need_marker_adjustment = 0;
b3bfad50 7683 Lisp_Object old_deactivate_mark;
d46c5b12 7684
/* Vdeactivate_mark is restored before returning so that the buffer
   modifications made here do not leak into it.  */
b3bfad50 7685 old_deactivate_mark = Vdeactivate_mark;
93dec019 7686
/* Remember the caller-supplied output area; it is compared against
   the produced size and possibly reallocated after decoding.  */
df7492f9 7687 if (NILP (dst_object))
d46c5b12 7688 {
df7492f9
KH
7689 destination = coding->destination;
7690 dst_bytes = coding->dst_bytes;
d46c5b12 7691 }
93dec019 7692
df7492f9
KH
7693 coding->src_object = src_object;
7694 coding->src_chars = chars;
7695 coding->src_bytes = bytes;
/* The source is considered multibyte when it has more bytes than
   characters.  */
7696 coding->src_multibyte = chars < bytes;
70ad9fc4 7697
df7492f9 7698 if (STRINGP (src_object))
d46c5b12 7699 {
df7492f9
KH
7700 coding->src_pos = from;
7701 coding->src_pos_byte = from_byte;
d46c5b12 7702 }
df7492f9 7703 else if (BUFFERP (src_object))
88993dfd 7704 {
df7492f9
KH
7705 set_buffer_internal (XBUFFER (src_object));
7706 if (from != GPT)
7707 move_gap_both (from, from_byte);
7708 if (EQ (src_object, dst_object))
fb88bf2d 7709 {
64cedb0c
KH
7710 struct Lisp_Marker *tail;
7711
/* Flag the markers that sit exactly on an edge of the region (FROM
   for insertion-type markers, TO for the others) so that they can be
   repositioned once the decoded text is in place.  */
7712 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7713 {
7714 tail->need_adjustment
7715 = tail->charpos == (tail->insertion_type ? from : to);
7716 need_marker_adjustment |= tail->need_adjustment;
7717 }
/* Replace in place: remember point, delete the source text and record
   the source position as negative offsets.  Presumably the deleted
   bytes remain readable in the gap, which would be why shrinking is
   inhibited first -- confirm against del_range_both/decode_coding.  */
4776e638 7718 saved_pt = PT, saved_pt_byte = PT_BYTE;
df7492f9 7719 TEMP_SET_PT_BOTH (from, from_byte);
f4a3cc44 7720 current_buffer->text->inhibit_shrinking = 1;
df7492f9
KH
7721 del_range_both (from, from_byte, to, to_byte, 1);
7722 coding->src_pos = -chars;
7723 coding->src_pos_byte = -bytes;
fb88bf2d 7724 }
df7492f9 7725 else
fb88bf2d 7726 {
df7492f9
KH
7727 coding->src_pos = from;
7728 coding->src_pos_byte = from_byte;
fb88bf2d 7729 }
88993dfd
KH
7730 }
7731
df7492f9
KH
7732 if (CODING_REQUIRE_DETECTION (coding))
7733 detect_coding (coding);
7734 attrs = CODING_ID_ATTRS (coding->id);
d46c5b12 7735
/* Decode into a work buffer when a string is wanted, or when the
   result goes to memory but a post-read function must first run on
   it.  */
2cb26057
KH
7736 if (EQ (dst_object, Qt)
7737 || (! NILP (CODING_ATTR_POST_READ (attrs))
7738 && NILP (dst_object)))
b73bfc1c 7739 {
a1567c45
SM
7740 coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7741 coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
df7492f9
KH
7742 coding->dst_pos = BEG;
7743 coding->dst_pos_byte = BEG_BYTE;
b73bfc1c 7744 }
df7492f9 7745 else if (BUFFERP (dst_object))
d46c5b12 7746 {
24a73b0a 7747 code_conversion_save (0, 0);
df7492f9
KH
7748 coding->dst_object = dst_object;
7749 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7750 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7751 coding->dst_multibyte
4b4deea2 7752 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
d46c5b12
KH
7753 }
7754 else
7755 {
24a73b0a 7756 code_conversion_save (0, 0);
df7492f9 7757 coding->dst_object = Qnil;
0154725e
SM
7758 /* Most callers presume this will return a multibyte result, and they
7759 won't use `binary' or `raw-text' anyway, so let's not worry about
7760 CODING_FOR_UNIBYTE. */
bb555731 7761 coding->dst_multibyte = 1;
d46c5b12
KH
7762 }
7763
df7492f9 7764 decode_coding (coding);
fa46990e 7765
df7492f9
KH
7766 if (BUFFERP (coding->dst_object))
7767 set_buffer_internal (XBUFFER (coding->dst_object));
d46c5b12 7768
df7492f9 7769 if (! NILP (CODING_ATTR_POST_READ (attrs)))
d46c5b12 7770 {
b3bfad50 7771 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
d311d28c 7772 ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
2b4f9037 7773 Lisp_Object val;
d46c5b12 7774
/* Call the post-read function with point at the start of the decoded
   text.  Its value must be a natural number; the produced counts are
   adjusted by the change in buffer size it caused.  */
c0cc7f7f 7775 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
b3bfad50
KH
7776 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7777 old_deactivate_mark);
d4850d67
KH
7778 val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7779 make_number (coding->produced_char));
df7492f9
KH
7780 UNGCPRO;
7781 CHECK_NATNUM (val);
7782 coding->produced_char += Z - prev_Z;
7783 coding->produced += Z_BYTE - prev_Z_BYTE;
d46c5b12 7784 }
de79a6a5 7785
df7492f9 7786 if (EQ (dst_object, Qt))
ec6d2bb8 7787 {
df7492f9
KH
7788 coding->dst_object = Fbuffer_string ();
7789 }
7790 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7791 {
/* The result sits in a work buffer but the caller wants it in memory.
   It is copied out only when it is larger than the area the caller
   supplied.  NOTE(review): if xrealloc signals on failure instead of
   returning NULL, the error branch below is unreachable -- confirm.  */
7792 set_buffer_internal (XBUFFER (coding->dst_object));
7793 if (dst_bytes < coding->produced)
7794 {
b3bfad50 7795 destination = xrealloc (destination, coding->produced);
df7492f9
KH
7796 if (! destination)
7797 {
065e3595 7798 record_conversion_result (coding,
ebaf11b6 7799 CODING_RESULT_INSUFFICIENT_MEM);
df7492f9
KH
7800 unbind_to (count, Qnil);
7801 return;
7802 }
/* Move the gap out of the produced text so that it is contiguous for
   memcpy.  */
7803 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7804 move_gap_both (BEGV, BEGV_BYTE);
72af86bd 7805 memcpy (destination, BEGV_ADDR, coding->produced);
df7492f9 7806 coding->destination = destination;
d46c5b12 7807 }
ec6d2bb8 7808 }
b73bfc1c 7809
4776e638
KH
7810 if (saved_pt >= 0)
7811 {
7812 /* This is the case of:
7813 (BUFFERP (src_object) && EQ (src_object, dst_object))
7814 As we have moved PT while replacing the original buffer
7815 contents, we must recover it now. */
7816 set_buffer_internal (XBUFFER (src_object));
f4a3cc44 7817 current_buffer->text->inhibit_shrinking = 0;
4776e638
KH
/* Point before the region is unchanged; point inside the region goes
   to its start; point after it is shifted by the size difference
   between the decoded and the original text (counted in bytes for
   both fields when the buffer is unibyte).  */
7818 if (saved_pt < from)
7819 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7820 else if (saved_pt < from + chars)
7821 TEMP_SET_PT_BOTH (from, from_byte);
4b4deea2 7822 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
4776e638
KH
7823 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7824 saved_pt_byte + (coding->produced - bytes));
7825 else
7826 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7827 saved_pt_byte + (coding->produced - bytes));
64cedb0c
KH
7828
7829 if (need_marker_adjustment)
7830 {
7831 struct Lisp_Marker *tail;
7832
/* Put the flagged markers at the start of the new text
   (insertion-type markers) or at its end (all others).  */
7833 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7834 if (tail->need_adjustment)
7835 {
7836 tail->need_adjustment = 0;
7837 if (tail->insertion_type)
7838 {
7839 tail->bytepos = from_byte;
7840 tail->charpos = from;
7841 }
7842 else
7843 {
7844 tail->bytepos = from_byte + coding->produced;
7845 tail->charpos
4b4deea2 7846 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
64cedb0c
KH
7847 ? tail->bytepos : from + coding->produced_char);
7848 }
7849 }
7850 }
d46c5b12 7851 }
4776e638 7852
b3bfad50 7853 Vdeactivate_mark = old_deactivate_mark;
065e3595 7854 unbind_to (count, coding->dst_object);
d46c5b12
KH
7855}
7856
d46c5b12 7857
/* Encode the text FROM..TO (bytes FROM_BYTE..TO_BYTE) of SRC_OBJECT
   by CODING.

   SRC_OBJECT is a string, a buffer, or anything else, in which case
   the text is taken from CODING->source.  If the coding system has a
   pre-write-conversion function, the text is first copied into a work
   buffer and the function is called there.

   If DST_OBJECT is a buffer, the encoded text goes into it: at FROM,
   replacing the source text, when it is the same object as
   SRC_OBJECT, otherwise at that buffer's point.  If DST_OBJECT is Qt,
   the result is returned as a unibyte string in CODING->dst_object.
   Otherwise CODING->dst_object is set to Qnil and the destination
   fields already present in CODING are used as they are.  */
df7492f9 7858void
cf84bb53
JB
7859encode_coding_object (struct coding_system *coding,
7860 Lisp_Object src_object,
d311d28c
PE
7861 ptrdiff_t from, ptrdiff_t from_byte,
7862 ptrdiff_t to, ptrdiff_t to_byte,
cf84bb53 7863 Lisp_Object dst_object)
d46c5b12 7864{
d311d28c
PE
7865 ptrdiff_t count = SPECPDL_INDEX ();
7866 ptrdiff_t chars = to - from;
7867 ptrdiff_t bytes = to_byte - from_byte;
df7492f9 7868 Lisp_Object attrs;
/* Saved point.  These hold PT and PT_BYTE, so they use ptrdiff_t like
   every other position in this function; a plain int would truncate
   positions that do not fit in int.  SAVED_PT stays -1 unless the
   text is replaced in place.  */
c4a63b12 7869 ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
64cedb0c 7870 int need_marker_adjustment = 0;
c02d943b 7871 int kill_src_buffer = 0;
b3bfad50 7872 Lisp_Object old_deactivate_mark;
df7492f9 7873
/* Vdeactivate_mark is restored before returning so that the buffer
   modifications made here do not leak into it.  */
b3bfad50 7874 old_deactivate_mark = Vdeactivate_mark;
df7492f9
KH
7875
7876 coding->src_object = src_object;
7877 coding->src_chars = chars;
7878 coding->src_bytes = bytes;
/* The source is considered multibyte when it has more bytes than
   characters.  */
7879 coding->src_multibyte = chars < bytes;
7880
7881 attrs = CODING_ID_ATTRS (coding->id);
7882
64cedb0c
KH
7883 if (EQ (src_object, dst_object))
7884 {
7885 struct Lisp_Marker *tail;
7886
/* Flag the markers that sit exactly on an edge of the region (FROM
   for insertion-type markers, TO for the others) so that they can be
   repositioned once the encoded text is in place.  This walks the
   markers of the current buffer, so it presumably relies on
   SRC_OBJECT being the current buffer -- confirm.  */
7887 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7888 {
7889 tail->need_adjustment
7890 = tail->charpos == (tail->insertion_type ? from : to);
7891 need_marker_adjustment |= tail->need_adjustment;
7892 }
7893 }
7894
df7492f9 7895 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6bac5b12 7896 {
/* Copy the source text into a work buffer and let the pre-write
   function transform it there.  */
24a73b0a 7897 coding->src_object = code_conversion_save (1, coding->src_multibyte);
df7492f9
KH
7898 set_buffer_internal (XBUFFER (coding->src_object));
7899 if (STRINGP (src_object))
7900 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7901 else if (BUFFERP (src_object))
7902 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7903 else
b68864e5 7904 insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
d46c5b12 7905
df7492f9
KH
7906 if (EQ (src_object, dst_object))
7907 {
7908 set_buffer_internal (XBUFFER (src_object));
4776e638 7909 saved_pt = PT, saved_pt_byte = PT_BYTE;
df7492f9
KH
7910 del_range_both (from, from_byte, to, to_byte, 1);
7911 set_buffer_internal (XBUFFER (coding->src_object));
7912 }
7913
d4850d67
KH
7914 {
7915 Lisp_Object args[3];
b3bfad50 7916 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
d4850d67 7917
b3bfad50
KH
7918 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7919 old_deactivate_mark);
d4850d67
KH
7920 args[0] = CODING_ATTR_PRE_WRITE (attrs);
7921 args[1] = make_number (BEG);
7922 args[2] = make_number (Z);
7923 safe_call (3, args);
b3bfad50 7924 UNGCPRO;
d4850d67 7925 }
/* The pre-write function may have left a different buffer current.
   That buffer becomes the source, and it is killed at the end of this
   function because the unwind handler only knows about the original
   work buffer.  */
c02d943b
KH
7926 if (XBUFFER (coding->src_object) != current_buffer)
7927 kill_src_buffer = 1;
ac87bbef 7928 coding->src_object = Fcurrent_buffer ();
df7492f9
KH
7929 if (BEG != GPT)
7930 move_gap_both (BEG, BEG_BYTE);
7931 coding->src_chars = Z - BEG;
7932 coding->src_bytes = Z_BYTE - BEG_BYTE;
7933 coding->src_pos = BEG;
7934 coding->src_pos_byte = BEG_BYTE;
7935 coding->src_multibyte = Z < Z_BYTE;
7936 }
7937 else if (STRINGP (src_object))
d46c5b12 7938 {
24a73b0a 7939 code_conversion_save (0, 0);
df7492f9
KH
7940 coding->src_pos = from;
7941 coding->src_pos_byte = from_byte;
b73bfc1c 7942 }
df7492f9 7943 else if (BUFFERP (src_object))
b73bfc1c 7944 {
24a73b0a 7945 code_conversion_save (0, 0);
df7492f9 7946 set_buffer_internal (XBUFFER (src_object));
df7492f9 7947 if (EQ (src_object, dst_object))
d46c5b12 7948 {
/* Replace in place: the value returned by del_range_1 for the removed
   text becomes the source, read from position 0.  */
4776e638 7949 saved_pt = PT, saved_pt_byte = PT_BYTE;
ff0dacd7
KH
7950 coding->src_object = del_range_1 (from, to, 1, 1);
7951 coding->src_pos = 0;
7952 coding->src_pos_byte = 0;
d46c5b12 7953 }
df7492f9 7954 else
d46c5b12 7955 {
/* Make the source region contiguous if the gap falls inside it.  */
ff0dacd7
KH
7956 if (from < GPT && to >= GPT)
7957 move_gap_both (from, from_byte);
df7492f9
KH
7958 coding->src_pos = from;
7959 coding->src_pos_byte = from_byte;
d46c5b12 7960 }
d46c5b12 7961 }
4776e638 7962 else
24a73b0a 7963 code_conversion_save (0, 0);
d46c5b12 7964
df7492f9 7965 if (BUFFERP (dst_object))
88993dfd 7966 {
df7492f9 7967 coding->dst_object = dst_object;
28f67a95
KH
7968 if (EQ (src_object, dst_object))
7969 {
7970 coding->dst_pos = from;
7971 coding->dst_pos_byte = from_byte;
7972 }
7973 else
7974 {
319a3947
KH
7975 struct buffer *current = current_buffer;
7976
/* Insert at the destination buffer's point, moving its gap there
   first.  */
7977 set_buffer_temp (XBUFFER (dst_object));
7978 coding->dst_pos = PT;
7979 coding->dst_pos_byte = PT_BYTE;
7980 move_gap_both (coding->dst_pos, coding->dst_pos_byte);
7981 set_buffer_temp (current);
28f67a95 7982 }
df7492f9 7983 coding->dst_multibyte
4b4deea2 7984 = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
88993dfd 7985 }
df7492f9 7986 else if (EQ (dst_object, Qt))
d46c5b12 7987 {
/* Encode into a freshly allocated memory area (at least one byte);
   it is turned into a string and freed after encoding.  */
5d009b3a 7988 ptrdiff_t dst_bytes = max (1, coding->src_chars);
df7492f9 7989 coding->dst_object = Qnil;
5d009b3a
PE
7990 coding->destination = (unsigned char *) xmalloc (dst_bytes);
7991 coding->dst_bytes = dst_bytes;
df7492f9 7992 coding->dst_multibyte = 0;
d46c5b12
KH
7993 }
7994 else
7995 {
df7492f9
KH
7996 coding->dst_object = Qnil;
7997 coding->dst_multibyte = 0;
d46c5b12
KH
7998 }
7999
df7492f9 8000 encode_coding (coding);
d46c5b12 8001
df7492f9 8002 if (EQ (dst_object, Qt))
d46c5b12 8003 {
df7492f9
KH
8004 if (BUFFERP (coding->dst_object))
8005 coding->dst_object = Fbuffer_string ();
8006 else
d46c5b12 8007 {
df7492f9
KH
8008 coding->dst_object
8009 = make_unibyte_string ((char *) coding->destination,
8010 coding->produced);
8011 xfree (coding->destination);
d46c5b12 8012 }
4ed46869 8013 }
d46c5b12 8014
4776e638
KH
8015 if (saved_pt >= 0)
8016 {
8017 /* This is the case of:
8018 (BUFFERP (src_object) && EQ (src_object, dst_object))
8019 As we have moved PT while replacing the original buffer
8020 contents, we must recover it now. */
8021 set_buffer_internal (XBUFFER (src_object));
/* Point before the region is unchanged; point inside the region goes
   to its start; point after it is shifted by the size difference
   between the encoded and the original text (counted in bytes for
   both fields when the buffer is unibyte).  */
8022 if (saved_pt < from)
8023 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8024 else if (saved_pt < from + chars)
8025 TEMP_SET_PT_BOTH (from, from_byte);
4b4deea2 8026 else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
4776e638
KH
8027 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8028 saved_pt_byte + (coding->produced - bytes));
d46c5b12 8029 else
4776e638
KH
8030 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8031 saved_pt_byte + (coding->produced - bytes));
64cedb0c
KH
8032
8033 if (need_marker_adjustment)
8034 {
8035 struct Lisp_Marker *tail;
8036
/* Put the flagged markers at the start of the new text
   (insertion-type markers) or at its end (all others).  */
8037 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8038 if (tail->need_adjustment)
8039 {
8040 tail->need_adjustment = 0;
8041 if (tail->insertion_type)
8042 {
8043 tail->bytepos = from_byte;
8044 tail->charpos = from;
8045 }
8046 else
8047 {
8048 tail->bytepos = from_byte + coding->produced;
8049 tail->charpos
4b4deea2 8050 = (NILP (BVAR (current_buffer, enable_multibyte_characters))
64cedb0c
KH
8051 ? tail->bytepos : from + coding->produced_char);
8052 }
8053 }
8054 }
4776e638
KH
8055 }
8056
c02d943b
KH
8057 if (kill_src_buffer)
8058 Fkill_buffer (coding->src_object);
b3bfad50
KH
8059
8060 Vdeactivate_mark = old_deactivate_mark;
df7492f9 8061 unbind_to (count, Qnil);
b73bfc1c
KH
8062}
8063
df7492f9 8064
/* Return the name of the coding system registered for the coding
   category that comes first in coding_priorities.  */
b73bfc1c 8065Lisp_Object
971de7fb 8066preferred_coding_system (void)
b73bfc1c 8067{
df7492f9 8068 int id = coding_categories[coding_priorities[0]].id;
2391eaa4 8069
df7492f9 8070 return CODING_ID_NAME (id);
4ed46869
KH
8071}
8072
8073\f
8074#ifdef emacs
1397dc18 8075/*** 11. Emacs Lisp library functions ***/
4ed46869 8076
a7ca3326 8077DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
48b0f3ae 8078 doc: /* Return t if OBJECT is nil or a coding-system.
df7492f9 8079See the documentation of `define-coding-system' for information
48b0f3ae 8080about coding-system objects. */)
5842a27b 8081 (Lisp_Object object)
4ed46869 8082{
/* nil and anything that already has a coding system id qualify.  */
d4a1d553
JB
8083 if (NILP (object)
8084 || CODING_SYSTEM_ID (object) >= 0)
44e8490d 8085 return Qt;
/* Otherwise OBJECT qualifies only if it is a symbol that carries a
   non-nil `coding-system-define-form' property (presumably a coding
   system whose definition has not been evaluated yet -- see
   Fcheck_coding_system, which evaluates that form).  */
d4a1d553
JB
8086 if (! SYMBOLP (object)
8087 || NILP (Fget (object, Qcoding_system_define_form)))
44e8490d
KH
8088 return Qnil;
8089 return Qt;
4ed46869
KH
8090}
8091
a7ca3326 8092DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
9d991de8 8093 Sread_non_nil_coding_system, 1, 1, 0,
48b0f3ae 8094 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
5842a27b 8095 (Lisp_Object prompt)
4ed46869 8096{
e0e989f6 8097 Lisp_Object val;
/* Keep prompting until the user enters a non-empty name, then return
   it as an interned symbol.  */
9d991de8
RS
8098 do
8099 {
4608c386
KH
8100 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8101 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
9d991de8 8102 }
8f924df7 8103 while (SCHARS (val) == 0);
e0e989f6 8104 return (Fintern (val, Qnil));
4ed46869
KH
8105}
8106
a7ca3326 8107DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
48b0f3ae 8108 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
c7183fb8
GM
8109If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8110Ignores case when completing coding systems (all Emacs coding systems
8111are lower-case). */)
5842a27b 8112 (Lisp_Object prompt, Lisp_Object default_coding_system)
4ed46869 8113{
f44d27ce 8114 Lisp_Object val;
d311d28c 8115 ptrdiff_t count = SPECPDL_INDEX ();
c7183fb8 8116
/* The default is handed to Fcompleting_read as a string, so a symbol
   is replaced by its name.  */
9b787f3e 8117 if (SYMBOLP (default_coding_system))
57d25e6f 8118 default_coding_system = SYMBOL_NAME (default_coding_system);
/* Bind completion-ignore-case to t only for the duration of the
   read; unbind_to undoes the binding.  */
c7183fb8 8119 specbind (Qcompletion_ignore_case, Qt);
4608c386 8120 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
9b787f3e
RS
8121 Qt, Qnil, Qcoding_system_history,
8122 default_coding_system, Qnil);
c7183fb8 8123 unbind_to (count, Qnil);
/* An empty result yields nil; anything else is interned.  */
8f924df7 8124 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
8125}
8126
a7ca3326 8127DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
4ed46869 8128 1, 1, 0,
48b0f3ae 8129 doc: /* Check validity of CODING-SYSTEM.
9ffd559c
KH
8130If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8131It is valid if it is nil or a symbol defined as a coding system by the
8132function `define-coding-system'. */)
5842a27b 8133 (Lisp_Object coding_system)
4ed46869 8134{
44e8490d
KH
8135 Lisp_Object define_form;
8136
/* If the symbol carries a `coding-system-define-form' property, clear
   the property and evaluate the form once (with safe_eval) before
   testing validity.  */
8137 define_form = Fget (coding_system, Qcoding_system_define_form);
8138 if (! NILP (define_form))
8139 {
8140 Fput (coding_system, Qcoding_system_define_form, Qnil);
8141 safe_eval (define_form);
8142 }
4ed46869
KH
8143 if (!NILP (Fcoding_system_p (coding_system)))
8144 return coding_system;
/* No return statement follows: xsignal1 is presumably declared as
   not returning -- confirm.  */
fcad4ec4 8145 xsignal1 (Qcoding_system_error, coding_system);
4ed46869 8146}
df7492f9 8147
3a73fa5d 8148\f
89528eb3
KH
8149/* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
8150 HIGHEST is nonzero, return the coding system of the highest
ad1746f5 8151 priority among the detected coding systems. Otherwise return a
89528eb3
KH
8152 list of detected coding systems sorted by their priorities. If
8153 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
8154 multibyte form but contains only ASCII and eight-bit chars.
8155 Otherwise, the bytes are raw bytes.
8156
8157 CODING-SYSTEM controls the detection as below:
8158
8159 If it is nil, detect both text-format and eol-format. If the
8160 text-format part of CODING-SYSTEM is already specified
8161 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
8162 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8163 detect only text-format. */
8164
d46c5b12 8165Lisp_Object
cf84bb53 8166detect_coding_system (const unsigned char *src,
d311d28c 8167 ptrdiff_t src_chars, ptrdiff_t src_bytes,
cf84bb53
JB
8168 int highest, int multibytep,
8169 Lisp_Object coding_system)
4ed46869 8170{
8f924df7 8171 const unsigned char *src_end = src + src_bytes;
df7492f9 8172 Lisp_Object attrs, eol_type;
4533845d 8173 Lisp_Object val = Qnil;
df7492f9 8174 struct coding_system coding;
d3411f89 8175 ptrdiff_t id;
ff0dacd7 8176 struct coding_detection_info detect_info;
24a73b0a 8177 enum coding_category base_category;
2f3cbb32 8178 int null_byte_found = 0, eight_bit_found = 0;
b73bfc1c 8179
df7492f9
KH
8180 if (NILP (coding_system))
8181 coding_system = Qundecided;
8182 setup_coding_system (coding_system, &coding);
8183 attrs = CODING_ID_ATTRS (coding.id);
8184 eol_type = CODING_ID_EOL_TYPE (coding.id);
89528eb3 8185 coding_system = CODING_ATTR_BASE_NAME (attrs);
4ed46869 8186
df7492f9 8187 coding.source = src;
24a73b0a 8188 coding.src_chars = src_chars;
df7492f9
KH
8189 coding.src_bytes = src_bytes;
8190 coding.src_multibyte = multibytep;
8191 coding.consumed = 0;
89528eb3 8192 coding.mode |= CODING_MODE_LAST_BLOCK;
c0e16b14 8193 coding.head_ascii = 0;
d46c5b12 8194
ff0dacd7 8195 detect_info.checked = detect_info.found = detect_info.rejected = 0;
d46c5b12 8196
89528eb3 8197 /* At first, detect text-format if necessary. */
24a73b0a
KH
8198 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8199 if (base_category == coding_category_undecided)
4ed46869 8200 {
c4a63b12
PE
8201 enum coding_category category IF_LINT (= 0);
8202 struct coding_system *this IF_LINT (= NULL);
ff0dacd7 8203 int c, i;
88993dfd 8204
24a73b0a 8205 /* Skip all ASCII bytes except for a few ISO2022 controls. */
2f3cbb32 8206 for (; src < src_end; src++)
4ed46869 8207 {
df7492f9 8208 c = *src;
6cb21a4f 8209 if (c & 0x80)
6cb21a4f 8210 {
2f3cbb32 8211 eight_bit_found = 1;
2f3cbb32
KH
8212 if (null_byte_found)
8213 break;
8214 }
c0e16b14 8215 else if (c < 0x20)
2f3cbb32
KH
8216 {
8217 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8218 && ! inhibit_iso_escape_detection
8219 && ! detect_info.checked)
6cb21a4f 8220 {
2f3cbb32
KH
8221 if (detect_coding_iso_2022 (&coding, &detect_info))
8222 {
8223 /* We have scanned the whole data. */
8224 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
c0e16b14
KH
8225 {
8226 /* We didn't find an 8-bit code. We may
8227 have found a null-byte, but it's very
8228 rare that a binary file confirm to
8229 ISO-2022. */
8230 src = src_end;
8231 coding.head_ascii = src - coding.source;
8232 }
8233 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
2f3cbb32
KH
8234 break;
8235 }
8236 }
97b1b294 8237 else if (! c && !inhibit_null_byte_detection)
2f3cbb32
KH
8238 {
8239 null_byte_found = 1;
8240 if (eight_bit_found)
8241 break;
6cb21a4f 8242 }
c006c0c8
KH
8243 if (! eight_bit_found)
8244 coding.head_ascii++;
6cb21a4f 8245 }
c006c0c8 8246 else if (! eight_bit_found)
c0e16b14 8247 coding.head_ascii++;
4ed46869 8248 }
88993dfd 8249
2f3cbb32
KH
8250 if (null_byte_found || eight_bit_found
8251 || coding.head_ascii < coding.src_bytes
6cb21a4f
KH
8252 || detect_info.found)
8253 {
2f3cbb32 8254 if (coding.head_ascii == coding.src_bytes)
6cb21a4f
KH
8255 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
8256 for (i = 0; i < coding_category_raw_text; i++)
ff0dacd7 8257 {
6cb21a4f 8258 category = coding_priorities[i];
c7266f4a 8259 this = coding_categories + category;
6cb21a4f 8260 if (detect_info.found & (1 << category))
ff0dacd7
KH
8261 break;
8262 }
6cb21a4f 8263 else
2f3cbb32
KH
8264 {
8265 if (null_byte_found)
8266 {
8267 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8268 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8269 }
8270 for (i = 0; i < coding_category_raw_text; i++)
8271 {
8272 category = coding_priorities[i];
8273 this = coding_categories + category;
6cb21a4f 8274
2f3cbb32
KH
8275 if (this->id < 0)
8276 {
8277 /* No coding system of this category is defined. */
8278 detect_info.rejected |= (1 << category);
8279 }
8280 else if (category >= coding_category_raw_text)
8281 continue;
8282 else if (detect_info.checked & (1 << category))
8283 {
8284 if (highest
8285 && (detect_info.found & (1 << category)))
6cb21a4f 8286 break;
2f3cbb32
KH
8287 }
8288 else if ((*(this->detector)) (&coding, &detect_info)
8289 && highest
8290 && (detect_info.found & (1 << category)))
8291 {
8292 if (category == coding_category_utf_16_auto)
8293 {
8294 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8295 category = coding_category_utf_16_le;
8296 else
8297 category = coding_category_utf_16_be;
8298 }
8299 break;
8300 }
8301 }
8302 }
6cb21a4f 8303 }
ec6d2bb8 8304
4cddb209
KH
8305 if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8306 || null_byte_found)
ec6d2bb8 8307 {
ff0dacd7 8308 detect_info.found = CATEGORY_MASK_RAW_TEXT;
4cddb209 8309 id = CODING_SYSTEM_ID (Qno_conversion);
89528eb3
KH
8310 val = Fcons (make_number (id), Qnil);
8311 }
ff0dacd7 8312 else if (! detect_info.rejected && ! detect_info.found)
89528eb3 8313 {
ff0dacd7 8314 detect_info.found = CATEGORY_MASK_ANY;
89528eb3
KH
8315 id = coding_categories[coding_category_undecided].id;
8316 val = Fcons (make_number (id), Qnil);
8317 }
8318 else if (highest)
8319 {
ff0dacd7 8320 if (detect_info.found)
ec6d2bb8 8321 {
ff0dacd7
KH
8322 detect_info.found = 1 << category;
8323 val = Fcons (make_number (this->id), Qnil);
8324 }
8325 else
8326 for (i = 0; i < coding_category_raw_text; i++)
8327 if (! (detect_info.rejected & (1 << coding_priorities[i])))
8328 {
8329 detect_info.found = 1 << coding_priorities[i];
8330 id = coding_categories[coding_priorities[i]].id;
8331 val = Fcons (make_number (id), Qnil);
8332 break;
8333 }
8334 }
89528eb3
KH
8335 else
8336 {
ff0dacd7
KH
8337 int mask = detect_info.rejected | detect_info.found;
8338 int found = 0;
ec6d2bb8 8339
89528eb3 8340 for (i = coding_category_raw_text - 1; i >= 0; i--)
ff0dacd7
KH
8341 {
8342 category = coding_priorities[i];
8343 if (! (mask & (1 << category)))
ec6d2bb8 8344 {
ff0dacd7
KH
8345 found |= 1 << category;
8346 id = coding_categories[category].id;
c7266f4a
KH
8347 if (id >= 0)
8348 val = Fcons (make_number (id), val);
ff0dacd7
KH
8349 }
8350 }
8351 for (i = coding_category_raw_text - 1; i >= 0; i--)
8352 {
8353 category = coding_priorities[i];
8354 if (detect_info.found & (1 << category))
8355 {
8356 id = coding_categories[category].id;
8357 val = Fcons (make_number (id), val);
ec6d2bb8 8358 }
ec6d2bb8 8359 }
ff0dacd7 8360 detect_info.found |= found;
ec6d2bb8 8361 }
ec6d2bb8 8362 }
a470d443
KH
8363 else if (base_category == coding_category_utf_8_auto)
8364 {
8365 if (detect_coding_utf_8 (&coding, &detect_info))
8366 {
8367 struct coding_system *this;
8368
8369 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8370 this = coding_categories + coding_category_utf_8_sig;
8371 else
8372 this = coding_categories + coding_category_utf_8_nosig;
8373 val = Fcons (make_number (this->id), Qnil);
8374 }
8375 }
24a73b0a
KH
8376 else if (base_category == coding_category_utf_16_auto)
8377 {
8378 if (detect_coding_utf_16 (&coding, &detect_info))
8379 {
24a73b0a
KH
8380 struct coding_system *this;
8381
8382 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8383 this = coding_categories + coding_category_utf_16_le;
8384 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8385 this = coding_categories + coding_category_utf_16_be;
8386 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8387 this = coding_categories + coding_category_utf_16_be_nosig;
8388 else
8389 this = coding_categories + coding_category_utf_16_le_nosig;
8390 val = Fcons (make_number (this->id), Qnil);
8391 }
8392 }
df7492f9
KH
8393 else
8394 {
ff0dacd7 8395 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
89528eb3 8396 val = Fcons (make_number (coding.id), Qnil);
4ed46869 8397 }
df7492f9 8398
89528eb3 8399 /* Then, detect eol-format if necessary. */
df7492f9 8400 {
4533845d 8401 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
df7492f9
KH
8402 Lisp_Object tail;
8403
89528eb3
KH
8404 if (VECTORP (eol_type))
8405 {
ff0dacd7 8406 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
2f3cbb32
KH
8407 {
8408 if (null_byte_found)
8409 normal_eol = EOL_SEEN_LF;
8410 else
8411 normal_eol = detect_eol (coding.source, src_bytes,
8412 coding_category_raw_text);
8413 }
ff0dacd7
KH
8414 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8415 | CATEGORY_MASK_UTF_16_BE_NOSIG))
89528eb3
KH
8416 utf_16_be_eol = detect_eol (coding.source, src_bytes,
8417 coding_category_utf_16_be);
ff0dacd7
KH
8418 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8419 | CATEGORY_MASK_UTF_16_LE_NOSIG))
89528eb3
KH
8420 utf_16_le_eol = detect_eol (coding.source, src_bytes,
8421 coding_category_utf_16_le);
8422 }
8423 else
8424 {
8425 if (EQ (eol_type, Qunix))
8426 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8427 else if (EQ (eol_type, Qdos))
8428 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8429 else
8430 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8431 }
8432
df7492f9
KH
8433 for (tail = val; CONSP (tail); tail = XCDR (tail))
8434 {
89528eb3 8435 enum coding_category category;
df7492f9 8436 int this_eol;
89528eb3
KH
8437
8438 id = XINT (XCAR (tail));
8439 attrs = CODING_ID_ATTRS (id);
8440 category = XINT (CODING_ATTR_CATEGORY (attrs));
8441 eol_type = CODING_ID_EOL_TYPE (id);
df7492f9
KH
8442 if (VECTORP (eol_type))
8443 {
89528eb3
KH
8444 if (category == coding_category_utf_16_be
8445 || category == coding_category_utf_16_be_nosig)
8446 this_eol = utf_16_be_eol;
8447 else if (category == coding_category_utf_16_le
8448 || category == coding_category_utf_16_le_nosig)
8449 this_eol = utf_16_le_eol;
df7492f9 8450 else
89528eb3
KH
8451 this_eol = normal_eol;
8452
df7492f9
KH
8453 if (this_eol == EOL_SEEN_LF)
8454 XSETCAR (tail, AREF (eol_type, 0));
8455 else if (this_eol == EOL_SEEN_CRLF)
8456 XSETCAR (tail, AREF (eol_type, 1));
8457 else if (this_eol == EOL_SEEN_CR)
8458 XSETCAR (tail, AREF (eol_type, 2));
89528eb3
KH
8459 else
8460 XSETCAR (tail, CODING_ID_NAME (id));
df7492f9 8461 }
89528eb3
KH
8462 else
8463 XSETCAR (tail, CODING_ID_NAME (id));
df7492f9
KH
8464 }
8465 }
ec6d2bb8 8466
4533845d 8467 return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
ec6d2bb8
KH
8468}
8469
ec6d2bb8 8470
d46c5b12
KH
8471DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8472 2, 3, 0,
48b0f3ae
PJ
8473 doc: /* Detect coding system of the text in the region between START and END.
8474Return a list of possible coding systems ordered by priority.
b811c52b
KH
8475The coding systems to try and their priorities follows what
8476the function `coding-system-priority-list' (which see) returns.
ec6d2bb8 8477
12e0131a 8478If only ASCII characters are found (except for such ISO-2022 control
d4a1d553
JB
8479characters as ESC), it returns a list of single element `undecided'
8480or its subsidiary coding system according to a detected end-of-line
8481format.
ec6d2bb8 8482
48b0f3ae
PJ
8483If optional argument HIGHEST is non-nil, return the coding system of
8484highest priority. */)
5842a27b 8485 (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
d46c5b12 8486{
d311d28c
PE
8487 ptrdiff_t from, to;
8488 ptrdiff_t from_byte, to_byte;
ec6d2bb8 8489
b7826503
PJ
8490 CHECK_NUMBER_COERCE_MARKER (start);
8491 CHECK_NUMBER_COERCE_MARKER (end);
ec6d2bb8 8492
d46c5b12
KH
8493 validate_region (&start, &end);
8494 from = XINT (start), to = XINT (end);
8495 from_byte = CHAR_TO_BYTE (from);
8496 to_byte = CHAR_TO_BYTE (to);
ec6d2bb8 8497
d46c5b12
KH
8498 if (from < GPT && to >= GPT)
8499 move_gap_both (to, to_byte);
c210f766 8500
d46c5b12 8501 return detect_coding_system (BYTE_POS_ADDR (from_byte),
24a73b0a 8502 to - from, to_byte - from_byte,
0a28aafb 8503 !NILP (highest),
4b4deea2 8504 !NILP (BVAR (current_buffer
5d8ea120 8505 , enable_multibyte_characters)),
df7492f9 8506 Qnil);
ec6d2bb8
KH
8507}
8508
d46c5b12
KH
8509DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8510 1, 2, 0,
48b0f3ae
PJ
8511 doc: /* Detect coding system of the text in STRING.
8512Return a list of possible coding systems ordered by priority.
67ceab9d
KH
8513The coding systems to try and their priorities follows what
8514the function `coding-system-priority-list' (which see) returns.
fb88bf2d 8515
12e0131a 8516If only ASCII characters are found (except for such ISO-2022 control
d4a1d553
JB
8517characters as ESC), it returns a list of single element `undecided'
8518or its subsidiary coding system according to a detected end-of-line
8519format.
d46c5b12 8520
48b0f3ae
PJ
8521If optional argument HIGHEST is non-nil, return the coding system of
8522highest priority. */)
5842a27b 8523 (Lisp_Object string, Lisp_Object highest)
d46c5b12 8524{
b7826503 8525 CHECK_STRING (string);
b73bfc1c 8526
24a73b0a
KH
8527 return detect_coding_system (SDATA (string),
8528 SCHARS (string), SBYTES (string),
8f924df7 8529 !NILP (highest), STRING_MULTIBYTE (string),
df7492f9 8530 Qnil);
4ed46869 8531}
4ed46869 8532
b73bfc1c 8533
55d4c1b2 8534static inline int
971de7fb 8535char_encodable_p (int c, Lisp_Object attrs)
05e6f5dc 8536{
df7492f9 8537 Lisp_Object tail;
df7492f9 8538 struct charset *charset;
7d64c6ad 8539 Lisp_Object translation_table;
d46c5b12 8540
7d64c6ad 8541 translation_table = CODING_ATTR_TRANS_TBL (attrs);
a6f87d34 8542 if (! NILP (translation_table))
7d64c6ad 8543 c = translate_char (translation_table, c);
df7492f9
KH
8544 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8545 CONSP (tail); tail = XCDR (tail))
e133c8fa 8546 {
df7492f9
KH
8547 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8548 if (CHAR_CHARSET_P (c, charset))
8549 break;
e133c8fa 8550 }
df7492f9 8551 return (! NILP (tail));
05e6f5dc 8552}
83fa074f 8553
fb88bf2d 8554
df7492f9
KH
8555/* Return a list of coding systems that safely encode the text between
8556 START and END. If EXCLUDE is non-nil, it is a list of coding
8557 systems not to check. The returned list doesn't contain any such
48468dac 8558 coding systems. In any case, if the text contains only ASCII or is
df7492f9 8559 unibyte, return t. */
e077cc80 8560
df7492f9
KH
8561DEFUN ("find-coding-systems-region-internal",
8562 Ffind_coding_systems_region_internal,
8563 Sfind_coding_systems_region_internal, 2, 3, 0,
8564 doc: /* Internal use only. */)
5842a27b 8565 (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
df7492f9
KH
8566{
8567 Lisp_Object coding_attrs_list, safe_codings;
d311d28c 8568 ptrdiff_t start_byte, end_byte;
7c78e542 8569 const unsigned char *p, *pbeg, *pend;
df7492f9 8570 int c;
0e727afa 8571 Lisp_Object tail, elt, work_table;
d46c5b12 8572
df7492f9
KH
8573 if (STRINGP (start))
8574 {
8575 if (!STRING_MULTIBYTE (start)
8f924df7 8576 || SCHARS (start) == SBYTES (start))
df7492f9
KH
8577 return Qt;
8578 start_byte = 0;
8f924df7 8579 end_byte = SBYTES (start);
df7492f9
KH
8580 }
8581 else
d46c5b12 8582 {
df7492f9
KH
8583 CHECK_NUMBER_COERCE_MARKER (start);
8584 CHECK_NUMBER_COERCE_MARKER (end);
8585 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8586 args_out_of_range (start, end);
4b4deea2 8587 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
df7492f9
KH
8588 return Qt;
8589 start_byte = CHAR_TO_BYTE (XINT (start));
8590 end_byte = CHAR_TO_BYTE (XINT (end));
8591 if (XINT (end) - XINT (start) == end_byte - start_byte)
8592 return Qt;
d46c5b12 8593
e1c23804 8594 if (XINT (start) < GPT && XINT (end) > GPT)
d46c5b12 8595 {
e1c23804
DL
8596 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8597 move_gap_both (XINT (start), start_byte);
df7492f9 8598 else
e1c23804 8599 move_gap_both (XINT (end), end_byte);
d46c5b12
KH
8600 }
8601 }
8602
df7492f9
KH
8603 coding_attrs_list = Qnil;
8604 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8605 if (NILP (exclude)
8606 || NILP (Fmemq (XCAR (tail), exclude)))
8607 {
8608 Lisp_Object attrs;
d46c5b12 8609
df7492f9
KH
8610 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8611 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8612 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7d64c6ad
KH
8613 {
8614 ASET (attrs, coding_attr_trans_tbl,
2170c8f0 8615 get_translation_table (attrs, 1, NULL));
7d64c6ad
KH
8616 coding_attrs_list = Fcons (attrs, coding_attrs_list);
8617 }
df7492f9 8618 }
d46c5b12 8619
df7492f9 8620 if (STRINGP (start))
8f924df7 8621 p = pbeg = SDATA (start);
df7492f9
KH
8622 else
8623 p = pbeg = BYTE_POS_ADDR (start_byte);
8624 pend = p + (end_byte - start_byte);
b843d1ae 8625
df7492f9
KH
8626 while (p < pend && ASCII_BYTE_P (*p)) p++;
8627 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
d46c5b12 8628
0e727afa 8629 work_table = Fmake_char_table (Qnil, Qnil);
05e6f5dc 8630 while (p < pend)
72d1a715 8631 {
df7492f9
KH
8632 if (ASCII_BYTE_P (*p))
8633 p++;
72d1a715
RS
8634 else
8635 {
df7492f9 8636 c = STRING_CHAR_ADVANCE (p);
0e727afa
YM
8637 if (!NILP (char_table_ref (work_table, c)))
8638 /* This character was already checked. Ignore it. */
8639 continue;
12410ef1 8640
df7492f9
KH
8641 charset_map_loaded = 0;
8642 for (tail = coding_attrs_list; CONSP (tail);)
8643 {
8644 elt = XCAR (tail);
8645 if (NILP (elt))
8646 tail = XCDR (tail);
8647 else if (char_encodable_p (c, elt))
8648 tail = XCDR (tail);
8649 else if (CONSP (XCDR (tail)))
8650 {
8651 XSETCAR (tail, XCAR (XCDR (tail)));
8652 XSETCDR (tail, XCDR (XCDR (tail)));
8653 }
8654 else
8655 {
8656 XSETCAR (tail, Qnil);
8657 tail = XCDR (tail);
8658 }
8659 }
8660 if (charset_map_loaded)
8661 {
d311d28c 8662 ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
05e6f5dc 8663
df7492f9 8664 if (STRINGP (start))
8f924df7 8665 pbeg = SDATA (start);
df7492f9
KH
8666 else
8667 pbeg = BYTE_POS_ADDR (start_byte);
8668 p = pbeg + p_offset;
8669 pend = pbeg + pend_offset;
8670 }
0e727afa 8671 char_table_set (work_table, c, Qt);
df7492f9 8672 }
ec6d2bb8 8673 }
fb88bf2d 8674
988b3759 8675 safe_codings = list2 (Qraw_text, Qno_conversion);
df7492f9
KH
8676 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8677 if (! NILP (XCAR (tail)))
8678 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
ec6d2bb8 8679
05e6f5dc
KH
8680 return safe_codings;
8681}
4956c225 8682
d46c5b12 8683
8f924df7
KH
8684DEFUN ("unencodable-char-position", Funencodable_char_position,
8685 Sunencodable_char_position, 3, 5, 0,
8686 doc: /*
8687Return position of first un-encodable character in a region.
d4a1d553 8688START and END specify the region and CODING-SYSTEM specifies the
8f924df7 8689encoding to check. Return nil if CODING-SYSTEM does encode the region.
d46c5b12 8690
8f924df7
KH
8691If optional 4th argument COUNT is non-nil, it specifies at most how
8692many un-encodable characters to search. In this case, the value is a
8693list of positions.
d46c5b12 8694
8f924df7
KH
8695If optional 5th argument STRING is non-nil, it is a string to search
8696for un-encodable characters. In that case, START and END are indexes
8697to the string. */)
5842a27b 8698 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8f924df7 8699{
d311d28c 8700 EMACS_INT n;
8f924df7 8701 struct coding_system coding;
7d64c6ad 8702 Lisp_Object attrs, charset_list, translation_table;
8f924df7 8703 Lisp_Object positions;
d311d28c 8704 ptrdiff_t from, to;
8f924df7
KH
8705 const unsigned char *p, *stop, *pend;
8706 int ascii_compatible;
fb88bf2d 8707
8f924df7
KH
8708 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8709 attrs = CODING_ID_ATTRS (coding.id);
8710 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8711 return Qnil;
8712 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8713 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2170c8f0 8714 translation_table = get_translation_table (attrs, 1, NULL);
fb88bf2d 8715
8f924df7
KH
8716 if (NILP (string))
8717 {
8718 validate_region (&start, &end);
8719 from = XINT (start);
8720 to = XINT (end);
4b4deea2 8721 if (NILP (BVAR (current_buffer, enable_multibyte_characters))
8f924df7
KH
8722 || (ascii_compatible
8723 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8724 return Qnil;
8725 p = CHAR_POS_ADDR (from);
8726 pend = CHAR_POS_ADDR (to);
8727 if (from < GPT && to >= GPT)
8728 stop = GPT_ADDR;
8729 else
8730 stop = pend;
8731 }
8732 else
8733 {
8734 CHECK_STRING (string);
8735 CHECK_NATNUM (start);
8736 CHECK_NATNUM (end);
d311d28c
PE
8737 if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
8738 args_out_of_range_3 (string, start, end);
8f924df7
KH
8739 from = XINT (start);
8740 to = XINT (end);
8f924df7
KH
8741 if (! STRING_MULTIBYTE (string))
8742 return Qnil;
8743 p = SDATA (string) + string_char_to_byte (string, from);
8744 stop = pend = SDATA (string) + string_char_to_byte (string, to);
8745 if (ascii_compatible && (to - from) == (pend - p))
8746 return Qnil;
8747 }
f2558efd 8748
8f924df7
KH
8749 if (NILP (count))
8750 n = 1;
8751 else
b73bfc1c 8752 {
8f924df7
KH
8753 CHECK_NATNUM (count);
8754 n = XINT (count);
b73bfc1c
KH
8755 }
8756
8f924df7 8757 positions = Qnil;
3633e3aa 8758 charset_map_loaded = 0;
8f924df7 8759 while (1)
d46c5b12 8760 {
8f924df7 8761 int c;
ec6d2bb8 8762
8f924df7
KH
8763 if (ascii_compatible)
8764 while (p < stop && ASCII_BYTE_P (*p))
8765 p++, from++;
8766 if (p >= stop)
0e79d667 8767 {
8f924df7
KH
8768 if (p >= pend)
8769 break;
8770 stop = pend;
8771 p = GAP_END_ADDR;
0e79d667 8772 }
ec6d2bb8 8773
8f924df7
KH
8774 c = STRING_CHAR_ADVANCE (p);
8775 if (! (ASCII_CHAR_P (c) && ascii_compatible)
7d64c6ad
KH
8776 && ! char_charset (translate_char (translation_table, c),
8777 charset_list, NULL))
ec6d2bb8 8778 {
8f924df7
KH
8779 positions = Fcons (make_number (from), positions);
8780 n--;
8781 if (n == 0)
8782 break;
ec6d2bb8
KH
8783 }
8784
8f924df7 8785 from++;
3633e3aa
KH
8786 if (charset_map_loaded && NILP (string))
8787 {
8788 p = CHAR_POS_ADDR (from);
8789 pend = CHAR_POS_ADDR (to);
8790 if (from < GPT && to >= GPT)
8791 stop = GPT_ADDR;
8792 else
8793 stop = pend;
8794 charset_map_loaded = 0;
8795 }
8f924df7 8796 }
d46c5b12 8797
8f924df7
KH
8798 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8799}
d46c5b12 8800
d46c5b12 8801
df7492f9
KH
8802DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8803 Scheck_coding_systems_region, 3, 3, 0,
8804 doc: /* Check if the region is encodable by coding systems.
d46c5b12 8805
df7492f9
KH
8806START and END are buffer positions specifying the region.
8807CODING-SYSTEM-LIST is a list of coding systems to check.
d46c5b12 8808
df7492f9 8809The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
d4a1d553 8810CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
df7492f9
KH
8811whole region, POS0, POS1, ... are buffer positions where non-encodable
8812characters are found.
93dec019 8813
df7492f9
KH
8814If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8815value is nil.
93dec019 8816
df7492f9
KH
8817START may be a string. In that case, check if the string is
8818encodable, and the value contains indices to the string instead of
5704f39a
KH
8819buffer positions. END is ignored.
8820
4c1958f4 8821If the current buffer (or START if it is a string) is unibyte, the value
5704f39a 8822is nil. */)
5842a27b 8823 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
05e6f5dc 8824{
df7492f9 8825 Lisp_Object list;
d311d28c
PE
8826 ptrdiff_t start_byte, end_byte;
8827 ptrdiff_t pos;
7c78e542 8828 const unsigned char *p, *pbeg, *pend;
df7492f9 8829 int c;
7d64c6ad 8830 Lisp_Object tail, elt, attrs;
70ad9fc4 8831
05e6f5dc
KH
8832 if (STRINGP (start))
8833 {
df7492f9 8834 if (!STRING_MULTIBYTE (start)
4c1958f4 8835 || SCHARS (start) == SBYTES (start))
df7492f9
KH
8836 return Qnil;
8837 start_byte = 0;
8f924df7 8838 end_byte = SBYTES (start);
df7492f9 8839 pos = 0;
d46c5b12 8840 }
05e6f5dc 8841 else
b73bfc1c 8842 {
b7826503
PJ
8843 CHECK_NUMBER_COERCE_MARKER (start);
8844 CHECK_NUMBER_COERCE_MARKER (end);
05e6f5dc
KH
8845 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8846 args_out_of_range (start, end);
4b4deea2 8847 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
df7492f9
KH
8848 return Qnil;
8849 start_byte = CHAR_TO_BYTE (XINT (start));
8850 end_byte = CHAR_TO_BYTE (XINT (end));
8851 if (XINT (end) - XINT (start) == end_byte - start_byte)
5704f39a 8852 return Qnil;
df7492f9 8853
e1c23804 8854 if (XINT (start) < GPT && XINT (end) > GPT)
b73bfc1c 8855 {
e1c23804
DL
8856 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8857 move_gap_both (XINT (start), start_byte);
df7492f9 8858 else
e1c23804 8859 move_gap_both (XINT (end), end_byte);
b73bfc1c 8860 }
e1c23804 8861 pos = XINT (start);
b73bfc1c 8862 }
7553d0e1 8863
df7492f9
KH
8864 list = Qnil;
8865 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
12410ef1 8866 {
df7492f9 8867 elt = XCAR (tail);
7d64c6ad 8868 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
2170c8f0
KH
8869 ASET (attrs, coding_attr_trans_tbl,
8870 get_translation_table (attrs, 1, NULL));
7d64c6ad 8871 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
12410ef1
KH
8872 }
8873
df7492f9 8874 if (STRINGP (start))
8f924df7 8875 p = pbeg = SDATA (start);
72d1a715 8876 else
df7492f9
KH
8877 p = pbeg = BYTE_POS_ADDR (start_byte);
8878 pend = p + (end_byte - start_byte);
4ed46869 8879
df7492f9
KH
8880 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8881 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
ec6d2bb8 8882
df7492f9 8883 while (p < pend)
d46c5b12 8884 {
df7492f9
KH
8885 if (ASCII_BYTE_P (*p))
8886 p++;
e133c8fa 8887 else
05e6f5dc 8888 {
df7492f9
KH
8889 c = STRING_CHAR_ADVANCE (p);
8890
8891 charset_map_loaded = 0;
8892 for (tail = list; CONSP (tail); tail = XCDR (tail))
8893 {
8894 elt = XCDR (XCAR (tail));
8895 if (! char_encodable_p (c, XCAR (elt)))
8896 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8897 }
8898 if (charset_map_loaded)
8899 {
d311d28c 8900 ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
df7492f9
KH
8901
8902 if (STRINGP (start))
8f924df7 8903 pbeg = SDATA (start);
df7492f9
KH
8904 else
8905 pbeg = BYTE_POS_ADDR (start_byte);
8906 p = pbeg + p_offset;
8907 pend = pbeg + pend_offset;
8908 }
05e6f5dc 8909 }
df7492f9 8910 pos++;
d46c5b12 8911 }
4ed46869 8912
df7492f9
KH
8913 tail = list;
8914 list = Qnil;
8915 for (; CONSP (tail); tail = XCDR (tail))
ec6d2bb8 8916 {
df7492f9
KH
8917 elt = XCAR (tail);
8918 if (CONSP (XCDR (XCDR (elt))))
8919 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8920 list);
ec6d2bb8 8921 }
2b4f9037 8922
df7492f9 8923 return list;
d46c5b12
KH
8924}
8925
3fd9494b 8926
74ab6df5 8927static Lisp_Object
cf84bb53
JB
8928code_convert_region (Lisp_Object start, Lisp_Object end,
8929 Lisp_Object coding_system, Lisp_Object dst_object,
8930 int encodep, int norecord)
4ed46869 8931{
3a73fa5d 8932 struct coding_system coding;
d311d28c 8933 ptrdiff_t from, from_byte, to, to_byte;
df7492f9 8934 Lisp_Object src_object;
4ed46869 8935
b7826503
PJ
8936 CHECK_NUMBER_COERCE_MARKER (start);
8937 CHECK_NUMBER_COERCE_MARKER (end);
df7492f9
KH
8938 if (NILP (coding_system))
8939 coding_system = Qno_conversion;
8940 else
8941 CHECK_CODING_SYSTEM (coding_system);
8942 src_object = Fcurrent_buffer ();
8943 if (NILP (dst_object))
8944 dst_object = src_object;
8945 else if (! EQ (dst_object, Qt))
8946 CHECK_BUFFER (dst_object);
3a73fa5d 8947
d46c5b12
KH
8948 validate_region (&start, &end);
8949 from = XFASTINT (start);
df7492f9 8950 from_byte = CHAR_TO_BYTE (from);
d46c5b12 8951 to = XFASTINT (end);
df7492f9 8952 to_byte = CHAR_TO_BYTE (to);
764ca8da 8953
df7492f9
KH
8954 setup_coding_system (coding_system, &coding);
8955 coding.mode |= CODING_MODE_LAST_BLOCK;
ec6d2bb8 8956
df7492f9
KH
8957 if (encodep)
8958 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8959 dst_object);
8960 else
8961 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8962 dst_object);
8963 if (! norecord)
8964 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
ec6d2bb8 8965
df7492f9
KH
8966 return (BUFFERP (dst_object)
8967 ? make_number (coding.produced_char)
8968 : coding.dst_object);
4031e2bf 8969}
78108bcd 8970
4ed46869 8971
4031e2bf 8972DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
df7492f9 8973 3, 4, "r\nzCoding system: ",
48b0f3ae 8974 doc: /* Decode the current region from the specified coding system.
df7492f9
KH
8975When called from a program, takes four arguments:
8976 START, END, CODING-SYSTEM, and DESTINATION.
8977START and END are buffer positions.
8844fa83 8978
df7492f9 8979Optional 4th arguments DESTINATION specifies where the decoded text goes.
2354b80b 8980If nil, the region between START and END is replaced by the decoded text.
1560f91a
EZ
8981If buffer, the decoded text is inserted in that buffer after point (point
8982does not move).
446dcd75 8983In those cases, the length of the decoded text is returned.
319a3947 8984If DESTINATION is t, the decoded text is returned.
8844fa83 8985
48b0f3ae
PJ
8986This function sets `last-coding-system-used' to the precise coding system
8987used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
319a3947 8988not fully specified.) */)
5842a27b 8989 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
4031e2bf 8990{
df7492f9 8991 return code_convert_region (start, end, coding_system, destination, 0, 0);
3a73fa5d 8992}
8844fa83 8993
3a73fa5d 8994DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
df7492f9
KH
8995 3, 4, "r\nzCoding system: ",
8996 doc: /* Encode the current region by specified coding system.
d4a1d553
JB
8997When called from a program, takes four arguments:
8998 START, END, CODING-SYSTEM and DESTINATION.
8999START and END are buffer positions.
d46c5b12 9000
df7492f9
KH
9001Optional 4th arguments DESTINATION specifies where the encoded text goes.
9002If nil, the region between START and END is replace by the encoded text.
1560f91a
EZ
9003If buffer, the encoded text is inserted in that buffer after point (point
9004does not move).
446dcd75 9005In those cases, the length of the encoded text is returned.
319a3947 9006If DESTINATION is t, the encoded text is returned.
2391eaa4 9007
48b0f3ae
PJ
9008This function sets `last-coding-system-used' to the precise coding system
9009used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
319a3947 9010not fully specified.) */)
5842a27b 9011 (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
3a73fa5d 9012{
df7492f9 9013 return code_convert_region (start, end, coding_system, destination, 1, 0);
b73bfc1c
KH
9014}
9015
9016Lisp_Object
6f704c76
DN
9017code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9018 Lisp_Object dst_object, int encodep, int nocopy, int norecord)
b73bfc1c 9019{
4031e2bf 9020 struct coding_system coding;
d311d28c 9021 ptrdiff_t chars, bytes;
ec6d2bb8 9022
b7826503 9023 CHECK_STRING (string);
d46c5b12 9024 if (NILP (coding_system))
4956c225 9025 {
df7492f9
KH
9026 if (! norecord)
9027 Vlast_coding_system_used = Qno_conversion;
9028 if (NILP (dst_object))
9029 return (nocopy ? Fcopy_sequence (string) : string);
4956c225 9030 }
b73bfc1c 9031
df7492f9
KH
9032 if (NILP (coding_system))
9033 coding_system = Qno_conversion;
9034 else
9035 CHECK_CODING_SYSTEM (coding_system);
9036 if (NILP (dst_object))
9037 dst_object = Qt;
9038 else if (! EQ (dst_object, Qt))
9039 CHECK_BUFFER (dst_object);
73be902c 9040
df7492f9 9041 setup_coding_system (coding_system, &coding);
d46c5b12 9042 coding.mode |= CODING_MODE_LAST_BLOCK;
8f924df7
KH
9043 chars = SCHARS (string);
9044 bytes = SBYTES (string);
df7492f9
KH
9045 if (encodep)
9046 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9047 else
9048 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9049 if (! norecord)
9050 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
73be902c 9051
df7492f9
KH
9052 return (BUFFERP (dst_object)
9053 ? make_number (coding.produced_char)
9054 : coding.dst_object);
4ed46869 9055}
73be902c 9056
b73bfc1c 9057
ecec61c1 9058/* Encode or decode STRING according to CODING_SYSTEM.
ec6d2bb8 9059 Do not set Vlast_coding_system_used.
4ed46869 9060
ec6d2bb8
KH
9061 This function is called only from macros DECODE_FILE and
9062 ENCODE_FILE, thus we ignore character composition. */
4ed46869 9063
ecec61c1 9064Lisp_Object
cf84bb53
JB
9065code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9066 int encodep)
4ed46869 9067{
0be8721c 9068 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
4ed46869
KH
9069}
9070
4ed46869 9071
a7ca3326 9072DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
df7492f9
KH
9073 2, 4, 0,
9074 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9075
9076Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9077if the decoding operation is trivial.
ecec61c1 9078
d4a1d553 9079Optional fourth arg BUFFER non-nil means that the decoded text is
1560f91a
EZ
9080inserted in that buffer after point (point does not move). In this
9081case, the return value is the length of the decoded text.
ecec61c1 9082
df7492f9
KH
9083This function sets `last-coding-system-used' to the precise coding system
9084used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
d4a1d553 9085not fully specified.) */)
5842a27b 9086 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
4ed46869 9087{
df7492f9
KH
9088 return code_convert_string (string, coding_system, buffer,
9089 0, ! NILP (nocopy), 0);
4ed46869
KH
9090}
9091
df7492f9
KH
9092DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9093 2, 4, 0,
9094 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9095
9096Optional third arg NOCOPY non-nil means it is OK to return STRING
9097itself if the encoding operation is trivial.
9098
d4a1d553 9099Optional fourth arg BUFFER non-nil means that the encoded text is
1560f91a
EZ
9100inserted in that buffer after point (point does not move). In this
9101case, the return value is the length of the encoded text.
df7492f9
KH
9102
9103This function sets `last-coding-system-used' to the precise coding system
9104used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9105not fully specified.) */)
5842a27b 9106 (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
4ed46869 9107{
df7492f9 9108 return code_convert_string (string, coding_system, buffer,
4550efdf 9109 1, ! NILP (nocopy), 0);
4ed46869 9110}
df7492f9 9111
3a73fa5d 9112\f
4ed46869 9113DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
48b0f3ae
PJ
9114 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9115Return the corresponding character. */)
5842a27b 9116 (Lisp_Object code)
4ed46869 9117{
df7492f9
KH
9118 Lisp_Object spec, attrs, val;
9119 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
5fdb398c
PE
9120 EMACS_INT ch;
9121 int c;
4ed46869 9122
df7492f9 9123 CHECK_NATNUM (code);
5fdb398c 9124 ch = XFASTINT (code);
df7492f9
KH
9125 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9126 attrs = AREF (spec, 0);
4ed46869 9127
5fdb398c 9128 if (ASCII_BYTE_P (ch)
df7492f9
KH
9129 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9130 return code;
4ed46869 9131
df7492f9
KH
9132 val = CODING_ATTR_CHARSET_LIST (attrs);
9133 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
004068e4
KH
9134 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9135 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
fa42c37f 9136
5fdb398c
PE
9137 if (ch <= 0x7F)
9138 {
9139 c = ch;
9140 charset = charset_roman;
9141 }
9142 else if (ch >= 0xA0 && ch < 0xDF)
55ab7be3 9143 {
5fdb398c 9144 c = ch - 0x80;
df7492f9 9145 charset = charset_kana;
4ed46869 9146 }
55ab7be3 9147 else
4ed46869 9148 {
5fdb398c
PE
9149 EMACS_INT c1 = ch >> 8;
9150 int c2 = ch & 0xFF;
df7492f9 9151
2735d060
PE
9152 if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9153 || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
c2982e87 9154 error ("Invalid code: %"pI"d", ch);
5fdb398c 9155 c = ch;
df7492f9
KH
9156 SJIS_TO_JIS (c);
9157 charset = charset_kanji;
4ed46869 9158 }
df7492f9
KH
9159 c = DECODE_CHAR (charset, c);
9160 if (c < 0)
c2982e87 9161 error ("Invalid code: %"pI"d", ch);
df7492f9 9162 return make_number (c);
93dec019 9163}
4ed46869 9164
48b0f3ae 9165
4ed46869 9166DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
8acf0c0e 9167 doc: /* Encode a Japanese character CH to shift_jis encoding.
48b0f3ae 9168Return the corresponding code in SJIS. */)
5842a27b 9169 (Lisp_Object ch)
4ed46869 9170{
df7492f9
KH
9171 Lisp_Object spec, attrs, charset_list;
9172 int c;
9173 struct charset *charset;
9174 unsigned code;
48b0f3ae 9175
df7492f9
KH
9176 CHECK_CHARACTER (ch);
9177 c = XFASTINT (ch);
9178 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9179 attrs = AREF (spec, 0);
9180
9181 if (ASCII_CHAR_P (c)
9182 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9183 return ch;
9184
9185 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9186 charset = char_charset (c, charset_list, &code);
9187 if (code == CHARSET_INVALID_CODE (charset))
e6c3da20 9188 error ("Can't encode by shift_jis encoding: %c", c);
df7492f9
KH
9189 JIS_TO_SJIS (code);
9190
9191 return make_number (code);
4ed46869
KH
9192}
9193
9194DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
48b0f3ae
PJ
9195 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9196Return the corresponding character. */)
5842a27b 9197 (Lisp_Object code)
d46c5b12 9198{
df7492f9
KH
9199 Lisp_Object spec, attrs, val;
9200 struct charset *charset_roman, *charset_big5, *charset;
5fdb398c 9201 EMACS_INT ch;
df7492f9 9202 int c;
6289dd10 9203
df7492f9 9204 CHECK_NATNUM (code);
5fdb398c 9205 ch = XFASTINT (code);
df7492f9
KH
9206 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9207 attrs = AREF (spec, 0);
4ed46869 9208
5fdb398c 9209 if (ASCII_BYTE_P (ch)
df7492f9
KH
9210 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9211 return code;
6289dd10 9212
df7492f9
KH
9213 val = CODING_ATTR_CHARSET_LIST (attrs);
9214 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9215 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
c210f766 9216
5fdb398c
PE
9217 if (ch <= 0x7F)
9218 {
9219 c = ch;
9220 charset = charset_roman;
9221 }
c28a9453
KH
9222 else
9223 {
5fdb398c
PE
9224 EMACS_INT b1 = ch >> 8;
9225 int b2 = ch & 0x7F;
df7492f9
KH
9226 if (b1 < 0xA1 || b1 > 0xFE
9227 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
c2982e87 9228 error ("Invalid code: %"pI"d", ch);
5fdb398c 9229 c = ch;
df7492f9 9230 charset = charset_big5;
c28a9453 9231 }
5fdb398c 9232 c = DECODE_CHAR (charset, c);
df7492f9 9233 if (c < 0)
c2982e87 9234 error ("Invalid code: %"pI"d", ch);
df7492f9 9235 return make_number (c);
d46c5b12 9236}
6289dd10 9237
4ed46869 9238DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
8acf0c0e 9239 doc: /* Encode the Big5 character CH to BIG5 coding system.
48b0f3ae 9240Return the corresponding character code in Big5. */)
5842a27b 9241 (Lisp_Object ch)
4ed46869 9242{
df7492f9
KH
9243 Lisp_Object spec, attrs, charset_list;
9244 struct charset *charset;
9245 int c;
9246 unsigned code;
9247
9248 CHECK_CHARACTER (ch);
9249 c = XFASTINT (ch);
9250 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9251 attrs = AREF (spec, 0);
9252 if (ASCII_CHAR_P (c)
9253 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9254 return ch;
9255
9256 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9257 charset = char_charset (c, charset_list, &code);
9258 if (code == CHARSET_INVALID_CODE (charset))
e6c3da20 9259 error ("Can't encode by Big5 encoding: %c", c);
df7492f9
KH
9260
9261 return make_number (code);
4ed46869 9262}
48b0f3ae 9263
3a73fa5d 9264\f
002fdb44 9265DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
68bba4e4 9266 Sset_terminal_coding_system_internal, 1, 2, 0,
48b0f3ae 9267 doc: /* Internal use only. */)
5842a27b 9268 (Lisp_Object coding_system, Lisp_Object terminal)
4ed46869 9269{
b18fad6d
KH
9270 struct terminal *term = get_terminal (terminal, 1);
9271 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
b7826503 9272 CHECK_SYMBOL (coding_system);
b8299c66 9273 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
70c22245 9274 /* We had better not send unsafe characters to terminal. */
c73bd236 9275 terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
ad1746f5 9276 /* Character composition should be disabled. */
c73bd236 9277 terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
b8299c66
KL
9278 terminal_coding->src_multibyte = 1;
9279 terminal_coding->dst_multibyte = 0;
b18fad6d
KH
9280 if (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK)
9281 term->charset_list = coding_charset_list (terminal_coding);
9282 else
6b4bb703 9283 term->charset_list = Fcons (make_number (charset_ascii), Qnil);
4ed46869
KH
9284 return Qnil;
9285}
9286
c4825358
KH
9287DEFUN ("set-safe-terminal-coding-system-internal",
9288 Fset_safe_terminal_coding_system_internal,
48b0f3ae 9289 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
ddb67bdc 9290 doc: /* Internal use only. */)
5842a27b 9291 (Lisp_Object coding_system)
d46c5b12 9292{
b7826503 9293 CHECK_SYMBOL (coding_system);
c4825358
KH
9294 setup_coding_system (Fcheck_coding_system (coding_system),
9295 &safe_terminal_coding);
ad1746f5 9296 /* Character composition should be disabled. */
df7492f9 9297 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
b73bfc1c
KH
9298 safe_terminal_coding.src_multibyte = 1;
9299 safe_terminal_coding.dst_multibyte = 0;
c4825358
KH
9300 return Qnil;
9301}
4ed46869 9302
002fdb44 9303DEFUN ("terminal-coding-system", Fterminal_coding_system,
68bba4e4 9304 Sterminal_coding_system, 0, 1, 0,
6ed8eeff 9305 doc: /* Return coding system specified for terminal output on the given terminal.
708e05dc 9306TERMINAL may be a terminal object, a frame, or nil for the selected
6ed8eeff 9307frame's terminal device. */)
5842a27b 9308 (Lisp_Object terminal)
4ed46869 9309{
985773c9
MB
9310 struct coding_system *terminal_coding
9311 = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9312 Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
ae6f73fa 9313
6d5eb5b0 9314 /* For backward compatibility, return nil if it is `undecided'. */
f75c90a9 9315 return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
4ed46869
KH
9316}
9317
002fdb44 9318DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
68bba4e4 9319 Sset_keyboard_coding_system_internal, 1, 2, 0,
48b0f3ae 9320 doc: /* Internal use only. */)
5842a27b 9321 (Lisp_Object coding_system, Lisp_Object terminal)
4ed46869 9322{
6ed8eeff 9323 struct terminal *t = get_terminal (terminal, 1);
b7826503 9324 CHECK_SYMBOL (coding_system);
624bda09
KH
9325 if (NILP (coding_system))
9326 coding_system = Qno_conversion;
9327 else
9328 Fcheck_coding_system (coding_system);
9329 setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
ad1746f5 9330 /* Character composition should be disabled. */
c73bd236
MB
9331 TERMINAL_KEYBOARD_CODING (t)->common_flags
9332 &= ~CODING_ANNOTATE_COMPOSITION_MASK;
4ed46869
KH
9333 return Qnil;
9334}
9335
9336DEFUN ("keyboard-coding-system",
985773c9 9337 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
48b0f3ae 9338 doc: /* Return coding system specified for decoding keyboard input. */)
5842a27b 9339 (Lisp_Object terminal)
4ed46869 9340{
985773c9
MB
9341 return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9342 (get_terminal (terminal, 1))->id);
4ed46869
KH
9343}
9344
4ed46869 9345\f
a7ca3326 9346DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
a5d301df 9347 Sfind_operation_coding_system, 1, MANY, 0,
48b0f3ae
PJ
9348 doc: /* Choose a coding system for an operation based on the target name.
9349The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9350DECODING-SYSTEM is the coding system to use for decoding
9351\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9352for encoding (in case OPERATION does encoding).
05e6f5dc 9353
48b0f3ae
PJ
9354The first argument OPERATION specifies an I/O primitive:
9355 For file I/O, `insert-file-contents' or `write-region'.
9356 For process I/O, `call-process', `call-process-region', or `start-process'.
9357 For network I/O, `open-network-stream'.
05e6f5dc 9358
48b0f3ae
PJ
9359The remaining arguments should be the same arguments that were passed
9360to the primitive. Depending on which primitive, one of those arguments
9361is selected as the TARGET. For example, if OPERATION does file I/O,
9362whichever argument specifies the file name is TARGET.
05e6f5dc 9363
48b0f3ae 9364TARGET has a meaning which depends on OPERATION:
b883cdb2 9365 For file I/O, TARGET is a file name (except for the special case below).
48b0f3ae 9366 For process I/O, TARGET is a process name.
d4a1d553 9367 For network I/O, TARGET is a service name or a port number.
05e6f5dc 9368
d4a1d553 9369This function looks up what is specified for TARGET in
48b0f3ae
PJ
9370`file-coding-system-alist', `process-coding-system-alist',
9371or `network-coding-system-alist' depending on OPERATION.
9372They may specify a coding system, a cons of coding systems,
9373or a function symbol to call.
9374In the last case, we call the function with one argument,
9375which is a list of all the arguments given to this function.
1011c487
MB
9376If the function can't decide a coding system, it can return
9377`undecided' so that the normal code-detection is performed.
48b0f3ae 9378
b883cdb2
MB
9379If OPERATION is `insert-file-contents', the argument corresponding to
9380TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
9381file name to look up, and BUFFER is a buffer that contains the file's
9382contents (not yet decoded). If `file-coding-system-alist' specifies a
9383function to call for FILENAME, that function should examine the
9384contents of BUFFER instead of reading the file.
9385
d918f936 9386usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
f66c7cf8 9387 (ptrdiff_t nargs, Lisp_Object *args)
6b89e3aa 9388{
4ed46869
KH
9389 Lisp_Object operation, target_idx, target, val;
9390 register Lisp_Object chain;
177c0ea7 9391
4ed46869
KH
9392 if (nargs < 2)
9393 error ("Too few arguments");
9394 operation = args[0];
9395 if (!SYMBOLP (operation)
d311d28c 9396 || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
3ed051d4 9397 error ("Invalid first argument");
7b09a37a 9398 if (nargs <= 1 + XFASTINT (target_idx))
94dcfacf 9399 error ("Too few arguments for operation `%s'",
8f924df7 9400 SDATA (SYMBOL_NAME (operation)));
c5101a77 9401 target = args[XFASTINT (target_idx) + 1];
4ed46869 9402 if (!(STRINGP (target)
091a0ff0
KH
9403 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9404 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
4ed46869 9405 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
94dcfacf
EZ
9406 error ("Invalid argument %"pI"d of operation `%s'",
9407 XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
091a0ff0
KH
9408 if (CONSP (target))
9409 target = XCAR (target);
4ed46869 9410
2e34157c
RS
9411 chain = ((EQ (operation, Qinsert_file_contents)
9412 || EQ (operation, Qwrite_region))
02ba4723 9413 ? Vfile_coding_system_alist
2e34157c 9414 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
9415 ? Vnetwork_coding_system_alist
9416 : Vprocess_coding_system_alist));
4ed46869
KH
9417 if (NILP (chain))
9418 return Qnil;
9419
03699b14 9420 for (; CONSP (chain); chain = XCDR (chain))
6b89e3aa 9421 {
f44d27ce 9422 Lisp_Object elt;
6b89e3aa 9423
df7492f9 9424 elt = XCAR (chain);
4ed46869
KH
9425 if (CONSP (elt)
9426 && ((STRINGP (target)
03699b14
KR
9427 && STRINGP (XCAR (elt))
9428 && fast_string_match (XCAR (elt), target) >= 0)
9429 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
6b89e3aa 9430 {
03699b14 9431 val = XCDR (elt);
b19fd4c5
KH
9432 /* Here, if VAL is both a valid coding system and a valid
9433 function symbol, we return VAL as a coding system. */
02ba4723
KH
9434 if (CONSP (val))
9435 return val;
9436 if (! SYMBOLP (val))
9437 return Qnil;
9438 if (! NILP (Fcoding_system_p (val)))
9439 return Fcons (val, val);
b19fd4c5 9440 if (! NILP (Ffboundp (val)))
6b89e3aa 9441 {
e2b97060
MB
9442 /* We use call1 rather than safe_call1
9443 so as to get bug reports about functions called here
9444 which don't handle the current interface. */
9445 val = call1 (val, Flist (nargs, args));
b19fd4c5
KH
9446 if (CONSP (val))
9447 return val;
9448 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9449 return Fcons (val, val);
6b89e3aa 9450 }
02ba4723 9451 return Qnil;
6b89e3aa
KH
9452 }
9453 }
4ed46869 9454 return Qnil;
6b89e3aa
KH
9455}
9456
df7492f9 9457DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
a3f6ee6d 9458 Sset_coding_system_priority, 0, MANY, 0,
da7db224 9459 doc: /* Assign higher priority to the coding systems given as arguments.
d4a1d553 9460If multiple coding systems belong to the same category,
a3181084
DL
9461all but the first one are ignored.
9462
d4a1d553 9463usage: (set-coding-system-priority &rest coding-systems) */)
f66c7cf8 9464 (ptrdiff_t nargs, Lisp_Object *args)
df7492f9 9465{
f66c7cf8 9466 ptrdiff_t i, j;
df7492f9
KH
9467 int changed[coding_category_max];
9468 enum coding_category priorities[coding_category_max];
9469
72af86bd 9470 memset (changed, 0, sizeof changed);
6b89e3aa 9471
df7492f9 9472 for (i = j = 0; i < nargs; i++)
6b89e3aa 9473 {
df7492f9
KH
9474 enum coding_category category;
9475 Lisp_Object spec, attrs;
6b89e3aa 9476
df7492f9
KH
9477 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9478 attrs = AREF (spec, 0);
9479 category = XINT (CODING_ATTR_CATEGORY (attrs));
9480 if (changed[category])
9481 /* Ignore this coding system because a coding system of the
9482 same category already had a higher priority. */
9483 continue;
9484 changed[category] = 1;
9485 priorities[j++] = category;
9486 if (coding_categories[category].id >= 0
9487 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9488 setup_coding_system (args[i], &coding_categories[category]);
ff563fce 9489 Fset (AREF (Vcoding_category_table, category), args[i]);
df7492f9 9490 }
6b89e3aa 9491
df7492f9
KH
9492 /* Now we have decided top J priorities. Reflect the order of the
9493 original priorities to the remaining priorities. */
6b89e3aa 9494
df7492f9 9495 for (i = j, j = 0; i < coding_category_max; i++, j++)
6b89e3aa 9496 {
df7492f9
KH
9497 while (j < coding_category_max
9498 && changed[coding_priorities[j]])
9499 j++;
9500 if (j == coding_category_max)
9501 abort ();
9502 priorities[i] = coding_priorities[j];
9503 }
6b89e3aa 9504
72af86bd 9505 memcpy (coding_priorities, priorities, sizeof priorities);
177c0ea7 9506
ff563fce
KH
9507 /* Update `coding-category-list'. */
9508 Vcoding_category_list = Qnil;
c5101a77 9509 for (i = coding_category_max; i-- > 0; )
ff563fce
KH
9510 Vcoding_category_list
9511 = Fcons (AREF (Vcoding_category_table, priorities[i]),
9512 Vcoding_category_list);
6b89e3aa 9513
df7492f9 9514 return Qnil;
6b89e3aa
KH
9515}
9516
df7492f9
KH
9517DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9518 Scoding_system_priority_list, 0, 1, 0,
da7db224 9519 doc: /* Return a list of coding systems ordered by their priorities.
b811c52b
KH
9520The list contains a subset of coding systems; i.e. coding systems
9521assigned to each coding category (see `coding-category-list').
9522
da7db224 9523HIGHESTP non-nil means just return the highest priority one. */)
5842a27b 9524 (Lisp_Object highestp)
d46c5b12
KH
9525{
9526 int i;
df7492f9 9527 Lisp_Object val;
6b89e3aa 9528
df7492f9 9529 for (i = 0, val = Qnil; i < coding_category_max; i++)
d46c5b12 9530 {
df7492f9
KH
9531 enum coding_category category = coding_priorities[i];
9532 int id = coding_categories[category].id;
9533 Lisp_Object attrs;
068a9dbd 9534
df7492f9
KH
9535 if (id < 0)
9536 continue;
9537 attrs = CODING_ID_ATTRS (id);
9538 if (! NILP (highestp))
9539 return CODING_ATTR_BASE_NAME (attrs);
9540 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9541 }
9542 return Fnreverse (val);
9543}
068a9dbd 9544
91433552 9545static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
068a9dbd
KH
9546
9547static Lisp_Object
971de7fb 9548make_subsidiaries (Lisp_Object base)
068a9dbd 9549{
df7492f9 9550 Lisp_Object subsidiaries;
1bfdaf10 9551 ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
df7492f9
KH
9552 char *buf = (char *) alloca (base_name_len + 6);
9553 int i;
068a9dbd 9554
72af86bd 9555 memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
df7492f9
KH
9556 subsidiaries = Fmake_vector (make_number (3), Qnil);
9557 for (i = 0; i < 3; i++)
068a9dbd 9558 {
1bfdaf10 9559 strcpy (buf + base_name_len, suffixes[i]);
df7492f9 9560 ASET (subsidiaries, i, intern (buf));
068a9dbd 9561 }
df7492f9 9562 return subsidiaries;
068a9dbd
KH
9563}
9564
9565
df7492f9
KH
9566DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9567 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
1fcd6c8b
DL
9568 doc: /* For internal use only.
9569usage: (define-coding-system-internal ...) */)
f66c7cf8 9570 (ptrdiff_t nargs, Lisp_Object *args)
068a9dbd 9571{
df7492f9
KH
9572 Lisp_Object name;
9573 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
9574 Lisp_Object attrs; /* Vector of attributes. */
9575 Lisp_Object eol_type;
9576 Lisp_Object aliases;
9577 Lisp_Object coding_type, charset_list, safe_charsets;
9578 enum coding_category category;
9579 Lisp_Object tail, val;
9580 int max_charset_id = 0;
9581 int i;
068a9dbd 9582
df7492f9
KH
9583 if (nargs < coding_arg_max)
9584 goto short_args;
068a9dbd 9585
df7492f9 9586 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
068a9dbd 9587
df7492f9
KH
9588 name = args[coding_arg_name];
9589 CHECK_SYMBOL (name);
9590 CODING_ATTR_BASE_NAME (attrs) = name;
068a9dbd 9591
df7492f9
KH
9592 val = args[coding_arg_mnemonic];
9593 if (! STRINGP (val))
9594 CHECK_CHARACTER (val);
9595 CODING_ATTR_MNEMONIC (attrs) = val;
068a9dbd 9596
df7492f9
KH
9597 coding_type = args[coding_arg_coding_type];
9598 CHECK_SYMBOL (coding_type);
9599 CODING_ATTR_TYPE (attrs) = coding_type;
068a9dbd 9600
df7492f9
KH
9601 charset_list = args[coding_arg_charset_list];
9602 if (SYMBOLP (charset_list))
9603 {
9604 if (EQ (charset_list, Qiso_2022))
9605 {
9606 if (! EQ (coding_type, Qiso_2022))
9607 error ("Invalid charset-list");
9608 charset_list = Viso_2022_charset_list;
9609 }
9610 else if (EQ (charset_list, Qemacs_mule))
9611 {
9612 if (! EQ (coding_type, Qemacs_mule))
9613 error ("Invalid charset-list");
9614 charset_list = Vemacs_mule_charset_list;
9615 }
9616 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
d311d28c
PE
9617 {
9618 if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
9619 error ("Invalid charset-list");
9620 if (max_charset_id < XFASTINT (XCAR (tail)))
9621 max_charset_id = XFASTINT (XCAR (tail));
9622 }
df7492f9 9623 }
068a9dbd
KH
9624 else
9625 {
df7492f9 9626 charset_list = Fcopy_sequence (charset_list);
985773c9 9627 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
068a9dbd 9628 {
df7492f9
KH
9629 struct charset *charset;
9630
985773c9 9631 val = XCAR (tail);
df7492f9
KH
9632 CHECK_CHARSET_GET_CHARSET (val, charset);
9633 if (EQ (coding_type, Qiso_2022)
9634 ? CHARSET_ISO_FINAL (charset) < 0
9635 : EQ (coding_type, Qemacs_mule)
9636 ? CHARSET_EMACS_MULE_ID (charset) < 0
9637 : 0)
9638 error ("Can't handle charset `%s'",
8f924df7 9639 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
df7492f9 9640
8f924df7 9641 XSETCAR (tail, make_number (charset->id));
df7492f9
KH
9642 if (max_charset_id < charset->id)
9643 max_charset_id = charset->id;
068a9dbd
KH
9644 }
9645 }
df7492f9 9646 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
068a9dbd 9647
1b3b981b
AS
9648 safe_charsets = make_uninit_string (max_charset_id + 1);
9649 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
df7492f9 9650 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8f924df7 9651 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
df7492f9 9652 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
068a9dbd 9653
584948ac 9654 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
3a73fa5d 9655
df7492f9 9656 val = args[coding_arg_decode_translation_table];
a6f87d34 9657 if (! CHAR_TABLE_P (val) && ! CONSP (val))
7d64c6ad 9658 CHECK_SYMBOL (val);
df7492f9 9659 CODING_ATTR_DECODE_TBL (attrs) = val;
3a73fa5d 9660
df7492f9 9661 val = args[coding_arg_encode_translation_table];
a6f87d34 9662 if (! CHAR_TABLE_P (val) && ! CONSP (val))
7d64c6ad 9663 CHECK_SYMBOL (val);
df7492f9 9664 CODING_ATTR_ENCODE_TBL (attrs) = val;
d46c5b12 9665
df7492f9
KH
9666 val = args[coding_arg_post_read_conversion];
9667 CHECK_SYMBOL (val);
9668 CODING_ATTR_POST_READ (attrs) = val;
d46c5b12 9669
df7492f9
KH
9670 val = args[coding_arg_pre_write_conversion];
9671 CHECK_SYMBOL (val);
9672 CODING_ATTR_PRE_WRITE (attrs) = val;
3a73fa5d 9673
df7492f9
KH
9674 val = args[coding_arg_default_char];
9675 if (NILP (val))
9676 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
9677 else
9678 {
8f924df7 9679 CHECK_CHARACTER (val);
df7492f9
KH
9680 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9681 }
4031e2bf 9682
8f924df7
KH
9683 val = args[coding_arg_for_unibyte];
9684 CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
3a73fa5d 9685
df7492f9
KH
9686 val = args[coding_arg_plist];
9687 CHECK_LIST (val);
9688 CODING_ATTR_PLIST (attrs) = val;
3a73fa5d 9689
df7492f9
KH
9690 if (EQ (coding_type, Qcharset))
9691 {
c7c66a95
KH
9692 /* Generate a lisp vector of 256 elements. Each element is nil,
9693 integer, or a list of charset IDs.
3a73fa5d 9694
c7c66a95
KH
9695 If Nth element is nil, the byte code N is invalid in this
9696 coding system.
4ed46869 9697
c7c66a95
KH
9698 If Nth element is a number NUM, N is the first byte of a
9699 charset whose ID is NUM.
4ed46869 9700
c7c66a95
KH
9701 If Nth element is a list of charset IDs, N is the first byte
9702 of one of them. The list is sorted by dimensions of the
ad1746f5 9703 charsets. A charset of smaller dimension comes first. */
df7492f9 9704 val = Fmake_vector (make_number (256), Qnil);
4ed46869 9705
5c99c2e6 9706 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
df7492f9 9707 {
c7c66a95
KH
9708 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9709 int dim = CHARSET_DIMENSION (charset);
9710 int idx = (dim - 1) * 4;
4ed46869 9711
5c99c2e6 9712 if (CHARSET_ASCII_COMPATIBLE_P (charset))
584948ac 9713 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
4031e2bf 9714
15d143f7
KH
9715 for (i = charset->code_space[idx];
9716 i <= charset->code_space[idx + 1]; i++)
9717 {
c7c66a95
KH
9718 Lisp_Object tmp, tmp2;
9719 int dim2;
ec6d2bb8 9720
c7c66a95
KH
9721 tmp = AREF (val, i);
9722 if (NILP (tmp))
9723 tmp = XCAR (tail);
9724 else if (NUMBERP (tmp))
9725 {
9726 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9727 if (dim < dim2)
c7c66a95 9728 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
f9d71dcd
KH
9729 else
9730 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
c7c66a95 9731 }
15d143f7 9732 else
c7c66a95
KH
9733 {
9734 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9735 {
9736 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9737 if (dim < dim2)
9738 break;
9739 }
9740 if (NILP (tmp2))
9741 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9742 else
9743 {
9744 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9745 XSETCAR (tmp2, XCAR (tail));
9746 }
9747 }
9748 ASET (val, i, tmp);
15d143f7 9749 }
df7492f9
KH
9750 }
9751 ASET (attrs, coding_attr_charset_valids, val);
9752 category = coding_category_charset;
9753 }
9754 else if (EQ (coding_type, Qccl))
9755 {
9756 Lisp_Object valids;
ecec61c1 9757
df7492f9
KH
9758 if (nargs < coding_arg_ccl_max)
9759 goto short_args;
ecec61c1 9760
df7492f9
KH
9761 val = args[coding_arg_ccl_decoder];
9762 CHECK_CCL_PROGRAM (val);
9763 if (VECTORP (val))
9764 val = Fcopy_sequence (val);
9765 ASET (attrs, coding_attr_ccl_decoder, val);
ecec61c1 9766
df7492f9
KH
9767 val = args[coding_arg_ccl_encoder];
9768 CHECK_CCL_PROGRAM (val);
9769 if (VECTORP (val))
9770 val = Fcopy_sequence (val);
9771 ASET (attrs, coding_attr_ccl_encoder, val);
ecec61c1 9772
df7492f9
KH
9773 val = args[coding_arg_ccl_valids];
9774 valids = Fmake_string (make_number (256), make_number (0));
9775 for (tail = val; !NILP (tail); tail = Fcdr (tail))
9776 {
8dcbea82 9777 int from, to;
ecec61c1 9778
df7492f9
KH
9779 val = Fcar (tail);
9780 if (INTEGERP (val))
8dcbea82 9781 {
d311d28c 9782 if (! (0 <= XINT (val) && XINT (val) <= 255))
8dcbea82 9783 args_out_of_range_3 (val, make_number (0), make_number (255));
d311d28c 9784 from = to = XINT (val);
8dcbea82 9785 }
df7492f9
KH
9786 else
9787 {
df7492f9 9788 CHECK_CONS (val);
8f924df7 9789 CHECK_NATNUM_CAR (val);
d311d28c
PE
9790 CHECK_NUMBER_CDR (val);
9791 if (XINT (XCAR (val)) > 255)
8dcbea82
KH
9792 args_out_of_range_3 (XCAR (val),
9793 make_number (0), make_number (255));
d311d28c
PE
9794 from = XINT (XCAR (val));
9795 if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
8dcbea82
KH
9796 args_out_of_range_3 (XCDR (val),
9797 XCAR (val), make_number (255));
d311d28c 9798 to = XINT (XCDR (val));
df7492f9 9799 }
8dcbea82 9800 for (i = from; i <= to; i++)
8f924df7 9801 SSET (valids, i, 1);
df7492f9
KH
9802 }
9803 ASET (attrs, coding_attr_ccl_valids, valids);
4ed46869 9804
df7492f9 9805 category = coding_category_ccl;
55ab7be3 9806 }
df7492f9 9807 else if (EQ (coding_type, Qutf_16))
55ab7be3 9808 {
df7492f9 9809 Lisp_Object bom, endian;
4ed46869 9810
584948ac 9811 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
4ed46869 9812
df7492f9
KH
9813 if (nargs < coding_arg_utf16_max)
9814 goto short_args;
4ed46869 9815
df7492f9
KH
9816 bom = args[coding_arg_utf16_bom];
9817 if (! NILP (bom) && ! EQ (bom, Qt))
9818 {
9819 CHECK_CONS (bom);
8f924df7
KH
9820 val = XCAR (bom);
9821 CHECK_CODING_SYSTEM (val);
9822 val = XCDR (bom);
9823 CHECK_CODING_SYSTEM (val);
df7492f9 9824 }
a470d443 9825 ASET (attrs, coding_attr_utf_bom, bom);
df7492f9
KH
9826
9827 endian = args[coding_arg_utf16_endian];
b49a1807
KH
9828 CHECK_SYMBOL (endian);
9829 if (NILP (endian))
9830 endian = Qbig;
9831 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8f924df7 9832 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
df7492f9
KH
9833 ASET (attrs, coding_attr_utf_16_endian, endian);
9834
9835 category = (CONSP (bom)
9836 ? coding_category_utf_16_auto
9837 : NILP (bom)
b49a1807 9838 ? (EQ (endian, Qbig)
df7492f9
KH
9839 ? coding_category_utf_16_be_nosig
9840 : coding_category_utf_16_le_nosig)
b49a1807 9841 : (EQ (endian, Qbig)
df7492f9
KH
9842 ? coding_category_utf_16_be
9843 : coding_category_utf_16_le));
9844 }
9845 else if (EQ (coding_type, Qiso_2022))
9846 {
9847 Lisp_Object initial, reg_usage, request, flags;
1397dc18 9848
df7492f9
KH
9849 if (nargs < coding_arg_iso2022_max)
9850 goto short_args;
9851
9852 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9853 CHECK_VECTOR (initial);
9854 for (i = 0; i < 4; i++)
9855 {
9856 val = Faref (initial, make_number (i));
9857 if (! NILP (val))
9858 {
584948ac
KH
9859 struct charset *charset;
9860
9861 CHECK_CHARSET_GET_CHARSET (val, charset);
9862 ASET (initial, i, make_number (CHARSET_ID (charset)));
9863 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9864 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9
KH
9865 }
9866 else
9867 ASET (initial, i, make_number (-1));
9868 }
9869
9870 reg_usage = args[coding_arg_iso2022_reg_usage];
9871 CHECK_CONS (reg_usage);
8f924df7
KH
9872 CHECK_NUMBER_CAR (reg_usage);
9873 CHECK_NUMBER_CDR (reg_usage);
df7492f9
KH
9874
9875 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9876 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
1397dc18 9877 {
df7492f9 9878 int id;
2735d060 9879 Lisp_Object tmp1;
df7492f9
KH
9880
9881 val = Fcar (tail);
9882 CHECK_CONS (val);
2735d060
PE
9883 tmp1 = XCAR (val);
9884 CHECK_CHARSET_GET_ID (tmp1, id);
8f924df7 9885 CHECK_NATNUM_CDR (val);
df7492f9 9886 if (XINT (XCDR (val)) >= 4)
c2982e87 9887 error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
8f924df7 9888 XSETCAR (val, make_number (id));
1397dc18 9889 }
4ed46869 9890
df7492f9
KH
9891 flags = args[coding_arg_iso2022_flags];
9892 CHECK_NATNUM (flags);
d311d28c 9893 i = XINT (flags) & INT_MAX;
df7492f9 9894 if (EQ (args[coding_arg_charset_list], Qiso_2022))
d311d28c
PE
9895 i |= CODING_ISO_FLAG_FULL_SUPPORT;
9896 flags = make_number (i);
df7492f9
KH
9897
9898 ASET (attrs, coding_attr_iso_initial, initial);
9899 ASET (attrs, coding_attr_iso_usage, reg_usage);
9900 ASET (attrs, coding_attr_iso_request, request);
9901 ASET (attrs, coding_attr_iso_flags, flags);
9902 setup_iso_safe_charsets (attrs);
9903
9904 if (i & CODING_ISO_FLAG_SEVEN_BITS)
9905 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9906 | CODING_ISO_FLAG_SINGLE_SHIFT))
9907 ? coding_category_iso_7_else
9908 : EQ (args[coding_arg_charset_list], Qiso_2022)
9909 ? coding_category_iso_7
9910 : coding_category_iso_7_tight);
9911 else
9912 {
9913 int id = XINT (AREF (initial, 1));
9914
c6fb6e98 9915 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
df7492f9
KH
9916 || EQ (args[coding_arg_charset_list], Qiso_2022)
9917 || id < 0)
9918 ? coding_category_iso_8_else
9919 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9920 ? coding_category_iso_8_1
9921 : coding_category_iso_8_2);
9922 }
0ce7886f
KH
9923 if (category != coding_category_iso_8_1
9924 && category != coding_category_iso_8_2)
9925 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
df7492f9
KH
9926 }
9927 else if (EQ (coding_type, Qemacs_mule))
c28a9453 9928 {
df7492f9
KH
9929 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9930 ASET (attrs, coding_attr_emacs_mule_full, Qt);
584948ac 9931 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9 9932 category = coding_category_emacs_mule;
c28a9453 9933 }
df7492f9 9934 else if (EQ (coding_type, Qshift_jis))
c28a9453 9935 {
df7492f9
KH
9936
9937 struct charset *charset;
9938
7d64c6ad 9939 if (XINT (Flength (charset_list)) != 3
6e07c25f 9940 && XINT (Flength (charset_list)) != 4)
7d64c6ad 9941 error ("There should be three or four charsets");
df7492f9
KH
9942
9943 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9944 if (CHARSET_DIMENSION (charset) != 1)
9945 error ("Dimension of charset %s is not one",
8f924df7 9946 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
584948ac
KH
9947 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9948 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9
KH
9949
9950 charset_list = XCDR (charset_list);
9951 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9952 if (CHARSET_DIMENSION (charset) != 1)
9953 error ("Dimension of charset %s is not one",
8f924df7 9954 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
df7492f9
KH
9955
9956 charset_list = XCDR (charset_list);
9957 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9958 if (CHARSET_DIMENSION (charset) != 2)
7d64c6ad
KH
9959 error ("Dimension of charset %s is not two",
9960 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9961
9962 charset_list = XCDR (charset_list);
2b917a06
KH
9963 if (! NILP (charset_list))
9964 {
9965 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9966 if (CHARSET_DIMENSION (charset) != 2)
9967 error ("Dimension of charset %s is not two",
9968 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9969 }
df7492f9
KH
9970
9971 category = coding_category_sjis;
9972 Vsjis_coding_system = name;
c28a9453 9973 }
df7492f9
KH
9974 else if (EQ (coding_type, Qbig5))
9975 {
9976 struct charset *charset;
4ed46869 9977
df7492f9
KH
9978 if (XINT (Flength (charset_list)) != 2)
9979 error ("There should be just two charsets");
9980
9981 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9982 if (CHARSET_DIMENSION (charset) != 1)
9983 error ("Dimension of charset %s is not one",
8f924df7 9984 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
584948ac
KH
9985 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9986 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9
KH
9987
9988 charset_list = XCDR (charset_list);
9989 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9990 if (CHARSET_DIMENSION (charset) != 2)
9991 error ("Dimension of charset %s is not two",
8f924df7 9992 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
4ed46869 9993
df7492f9
KH
9994 category = coding_category_big5;
9995 Vbig5_coding_system = name;
9996 }
9997 else if (EQ (coding_type, Qraw_text))
c28a9453 9998 {
584948ac
KH
9999 category = coding_category_raw_text;
10000 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
c28a9453 10001 }
df7492f9 10002 else if (EQ (coding_type, Qutf_8))
4ed46869 10003 {
a470d443
KH
10004 Lisp_Object bom;
10005
a470d443
KH
10006 if (nargs < coding_arg_utf8_max)
10007 goto short_args;
10008
10009 bom = args[coding_arg_utf8_bom];
10010 if (! NILP (bom) && ! EQ (bom, Qt))
10011 {
10012 CHECK_CONS (bom);
10013 val = XCAR (bom);
10014 CHECK_CODING_SYSTEM (val);
10015 val = XCDR (bom);
10016 CHECK_CODING_SYSTEM (val);
10017 }
10018 ASET (attrs, coding_attr_utf_bom, bom);
0e5317f7
KH
10019 if (NILP (bom))
10020 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
a470d443
KH
10021
10022 category = (CONSP (bom) ? coding_category_utf_8_auto
10023 : NILP (bom) ? coding_category_utf_8_nosig
10024 : coding_category_utf_8_sig);
4ed46869 10025 }
df7492f9
KH
10026 else if (EQ (coding_type, Qundecided))
10027 category = coding_category_undecided;
4ed46869 10028 else
df7492f9 10029 error ("Invalid coding system type: %s",
8f924df7 10030 SDATA (SYMBOL_NAME (coding_type)));
4ed46869 10031
df7492f9 10032 CODING_ATTR_CATEGORY (attrs) = make_number (category);
01378f49
KH
10033 CODING_ATTR_PLIST (attrs)
10034 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
10035 CODING_ATTR_PLIST (attrs)));
35befdaa 10036 CODING_ATTR_PLIST (attrs)
3ed051d4 10037 = Fcons (QCascii_compatible_p,
35befdaa
KH
10038 Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10039 CODING_ATTR_PLIST (attrs)));
c4825358 10040
df7492f9
KH
10041 eol_type = args[coding_arg_eol_type];
10042 if (! NILP (eol_type)
10043 && ! EQ (eol_type, Qunix)
10044 && ! EQ (eol_type, Qdos)
10045 && ! EQ (eol_type, Qmac))
10046 error ("Invalid eol-type");
4ed46869 10047
df7492f9 10048 aliases = Fcons (name, Qnil);
4ed46869 10049
df7492f9
KH
10050 if (NILP (eol_type))
10051 {
10052 eol_type = make_subsidiaries (name);
10053 for (i = 0; i < 3; i++)
1397dc18 10054 {
df7492f9
KH
10055 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10056
10057 this_name = AREF (eol_type, i);
10058 this_aliases = Fcons (this_name, Qnil);
10059 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10060 this_spec = Fmake_vector (make_number (3), attrs);
10061 ASET (this_spec, 1, this_aliases);
10062 ASET (this_spec, 2, this_eol_type);
10063 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10064 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
583f71ca
KH
10065 val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10066 if (NILP (val))
10067 Vcoding_system_alist
10068 = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10069 Vcoding_system_alist);
1397dc18 10070 }
d46c5b12 10071 }
4ed46869 10072
df7492f9
KH
10073 spec_vec = Fmake_vector (make_number (3), attrs);
10074 ASET (spec_vec, 1, aliases);
10075 ASET (spec_vec, 2, eol_type);
48b0f3ae 10076
df7492f9
KH
10077 Fputhash (name, spec_vec, Vcoding_system_hash_table);
10078 Vcoding_system_list = Fcons (name, Vcoding_system_list);
583f71ca
KH
10079 val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10080 if (NILP (val))
10081 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10082 Vcoding_system_alist);
48b0f3ae 10083
df7492f9
KH
10084 {
10085 int id = coding_categories[category].id;
48b0f3ae 10086
df7492f9
KH
10087 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10088 setup_coding_system (name, &coding_categories[category]);
10089 }
48b0f3ae 10090
d46c5b12 10091 return Qnil;
48b0f3ae 10092
df7492f9
KH
10093 short_args:
10094 return Fsignal (Qwrong_number_of_arguments,
10095 Fcons (intern ("define-coding-system-internal"),
10096 make_number (nargs)));
d46c5b12 10097}
4ed46869 10098
d6925f38 10099
a6f87d34
KH
10100DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10101 3, 3, 0,
10102 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
5842a27b 10103 (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
a6f87d34 10104{
3dbe7859 10105 Lisp_Object spec, attrs;
a6f87d34
KH
10106
10107 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10108 attrs = AREF (spec, 0);
10109 if (EQ (prop, QCmnemonic))
10110 {
10111 if (! STRINGP (val))
10112 CHECK_CHARACTER (val);
10113 CODING_ATTR_MNEMONIC (attrs) = val;
10114 }
2133e2d1 10115 else if (EQ (prop, QCdefault_char))
a6f87d34
KH
10116 {
10117 if (NILP (val))
10118 val = make_number (' ');
10119 else
10120 CHECK_CHARACTER (val);
10121 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
10122 }
10123 else if (EQ (prop, QCdecode_translation_table))
10124 {
10125 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10126 CHECK_SYMBOL (val);
10127 CODING_ATTR_DECODE_TBL (attrs) = val;
10128 }
10129 else if (EQ (prop, QCencode_translation_table))
10130 {
10131 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10132 CHECK_SYMBOL (val);
10133 CODING_ATTR_ENCODE_TBL (attrs) = val;
10134 }
10135 else if (EQ (prop, QCpost_read_conversion))
10136 {
10137 CHECK_SYMBOL (val);
10138 CODING_ATTR_POST_READ (attrs) = val;
10139 }
10140 else if (EQ (prop, QCpre_write_conversion))
10141 {
10142 CHECK_SYMBOL (val);
10143 CODING_ATTR_PRE_WRITE (attrs) = val;
10144 }
35befdaa
KH
10145 else if (EQ (prop, QCascii_compatible_p))
10146 {
10147 CODING_ATTR_ASCII_COMPAT (attrs) = val;
10148 }
a6f87d34
KH
10149
10150 CODING_ATTR_PLIST (attrs)
10151 = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
10152 return val;
10153}
10154
10155
df7492f9
KH
10156DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10157 Sdefine_coding_system_alias, 2, 2, 0,
10158 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
5842a27b 10159 (Lisp_Object alias, Lisp_Object coding_system)
66cfb530 10160{
583f71ca 10161 Lisp_Object spec, aliases, eol_type, val;
4ed46869 10162
df7492f9
KH
10163 CHECK_SYMBOL (alias);
10164 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10165 aliases = AREF (spec, 1);
d4a1d553 10166 /* ALIASES should be a list of length more than zero, and the first
d6925f38
KH
10167 element is a base coding system. Append ALIAS at the tail of the
10168 list. */
df7492f9
KH
10169 while (!NILP (XCDR (aliases)))
10170 aliases = XCDR (aliases);
8f924df7 10171 XSETCDR (aliases, Fcons (alias, Qnil));
4ed46869 10172
df7492f9
KH
10173 eol_type = AREF (spec, 2);
10174 if (VECTORP (eol_type))
4ed46869 10175 {
df7492f9
KH
10176 Lisp_Object subsidiaries;
10177 int i;
4ed46869 10178
df7492f9
KH
10179 subsidiaries = make_subsidiaries (alias);
10180 for (i = 0; i < 3; i++)
10181 Fdefine_coding_system_alias (AREF (subsidiaries, i),
10182 AREF (eol_type, i));
4ed46869 10183 }
df7492f9
KH
10184
10185 Fputhash (alias, spec, Vcoding_system_hash_table);
d6925f38 10186 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
583f71ca
KH
10187 val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10188 if (NILP (val))
10189 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10190 Vcoding_system_alist);
66cfb530 10191
4ed46869
KH
10192 return Qnil;
10193}
10194
a7ca3326 10195DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
df7492f9
KH
10196 1, 1, 0,
10197 doc: /* Return the base of CODING-SYSTEM.
da7db224 10198Any alias or subsidiary coding system is not a base coding system. */)
5842a27b 10199 (Lisp_Object coding_system)
d46c5b12 10200{
df7492f9 10201 Lisp_Object spec, attrs;
d46c5b12 10202
df7492f9
KH
10203 if (NILP (coding_system))
10204 return (Qno_conversion);
10205 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10206 attrs = AREF (spec, 0);
10207 return CODING_ATTR_BASE_NAME (attrs);
10208}
1397dc18 10209
df7492f9
KH
10210DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10211 1, 1, 0,
10212 doc: "Return the property list of CODING-SYSTEM.")
5842a27b 10213 (Lisp_Object coding_system)
df7492f9
KH
10214{
10215 Lisp_Object spec, attrs;
1397dc18 10216
df7492f9
KH
10217 if (NILP (coding_system))
10218 coding_system = Qno_conversion;
10219 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10220 attrs = AREF (spec, 0);
10221 return CODING_ATTR_PLIST (attrs);
d46c5b12
KH
10222}
10223
df7492f9
KH
10224
10225DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10226 1, 1, 0,
da7db224 10227 doc: /* Return the list of aliases of CODING-SYSTEM. */)
5842a27b 10228 (Lisp_Object coding_system)
66cfb530 10229{
df7492f9 10230 Lisp_Object spec;
84d60297 10231
df7492f9
KH
10232 if (NILP (coding_system))
10233 coding_system = Qno_conversion;
10234 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
da7db224 10235 return AREF (spec, 1);
df7492f9 10236}
66cfb530 10237
a7ca3326 10238DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
df7492f9
KH
10239 Scoding_system_eol_type, 1, 1, 0,
10240 doc: /* Return eol-type of CODING-SYSTEM.
d4a1d553 10241An eol-type is an integer 0, 1, 2, or a vector of coding systems.
66cfb530 10242
df7492f9
KH
10243Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10244and CR respectively.
66cfb530 10245
df7492f9
KH
10246A vector value indicates that a format of end-of-line should be
10247detected automatically. Nth element of the vector is the subsidiary
10248coding system whose eol-type is N. */)
5842a27b 10249 (Lisp_Object coding_system)
6b89e3aa 10250{
df7492f9
KH
10251 Lisp_Object spec, eol_type;
10252 int n;
6b89e3aa 10253
df7492f9
KH
10254 if (NILP (coding_system))
10255 coding_system = Qno_conversion;
10256 if (! CODING_SYSTEM_P (coding_system))
10257 return Qnil;
10258 spec = CODING_SYSTEM_SPEC (coding_system);
10259 eol_type = AREF (spec, 2);
10260 if (VECTORP (eol_type))
10261 return Fcopy_sequence (eol_type);
10262 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10263 return make_number (n);
6b89e3aa
KH
10264}
10265
4ed46869
KH
10266#endif /* emacs */
10267
10268\f
1397dc18 10269/*** 12. Postamble ***/
4ed46869 10270
dfcf069d 10271void
971de7fb 10272init_coding_once (void)
4ed46869
KH
10273{
10274 int i;
10275
df7492f9
KH
10276 for (i = 0; i < coding_category_max; i++)
10277 {
10278 coding_categories[i].id = -1;
10279 coding_priorities[i] = i;
10280 }
4ed46869
KH
10281
10282 /* ISO2022 specific initialize routine. */
10283 for (i = 0; i < 0x20; i++)
b73bfc1c 10284 iso_code_class[i] = ISO_control_0;
4ed46869
KH
10285 for (i = 0x21; i < 0x7F; i++)
10286 iso_code_class[i] = ISO_graphic_plane_0;
10287 for (i = 0x80; i < 0xA0; i++)
b73bfc1c 10288 iso_code_class[i] = ISO_control_1;
4ed46869
KH
10289 for (i = 0xA1; i < 0xFF; i++)
10290 iso_code_class[i] = ISO_graphic_plane_1;
10291 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10292 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
4ed46869
KH
10293 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10294 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10295 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10296 iso_code_class[ISO_CODE_ESC] = ISO_escape;
10297 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10298 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10299 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10300
df7492f9
KH
10301 for (i = 0; i < 256; i++)
10302 {
10303 emacs_mule_bytes[i] = 1;
10304 }
7c78e542
KH
10305 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10306 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10307 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10308 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
e0e989f6
KH
10309}
10310
10311#ifdef emacs
10312
dfcf069d 10313void
971de7fb 10314syms_of_coding (void)
e0e989f6 10315{
df7492f9 10316 staticpro (&Vcoding_system_hash_table);
8f924df7
KH
10317 {
10318 Lisp_Object args[2];
10319 args[0] = QCtest;
10320 args[1] = Qeq;
10321 Vcoding_system_hash_table = Fmake_hash_table (2, args);
10322 }
df7492f9
KH
10323
10324 staticpro (&Vsjis_coding_system);
10325 Vsjis_coding_system = Qnil;
e0e989f6 10326
df7492f9
KH
10327 staticpro (&Vbig5_coding_system);
10328 Vbig5_coding_system = Qnil;
10329
24a73b0a
KH
10330 staticpro (&Vcode_conversion_reused_workbuf);
10331 Vcode_conversion_reused_workbuf = Qnil;
10332
10333 staticpro (&Vcode_conversion_workbuf_name);
d67b4f80 10334 Vcode_conversion_workbuf_name = make_pure_c_string (" *code-conversion-work*");
e0e989f6 10335
24a73b0a 10336 reused_workbuf_in_use = 0;
df7492f9
KH
10337
10338 DEFSYM (Qcharset, "charset");
10339 DEFSYM (Qtarget_idx, "target-idx");
10340 DEFSYM (Qcoding_system_history, "coding-system-history");
bb0115a2
RS
10341 Fset (Qcoding_system_history, Qnil);
10342
9ce27fde 10343 /* Target FILENAME is the first argument. */
e0e989f6 10344 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 10345 /* Target FILENAME is the third argument. */
e0e989f6
KH
10346 Fput (Qwrite_region, Qtarget_idx, make_number (2));
10347
df7492f9 10348 DEFSYM (Qcall_process, "call-process");
9ce27fde 10349 /* Target PROGRAM is the first argument. */
e0e989f6
KH
10350 Fput (Qcall_process, Qtarget_idx, make_number (0));
10351
df7492f9 10352 DEFSYM (Qcall_process_region, "call-process-region");
9ce27fde 10353 /* Target PROGRAM is the third argument. */
e0e989f6
KH
10354 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10355
df7492f9 10356 DEFSYM (Qstart_process, "start-process");
9ce27fde 10357 /* Target PROGRAM is the third argument. */
e0e989f6
KH
10358 Fput (Qstart_process, Qtarget_idx, make_number (2));
10359
df7492f9 10360 DEFSYM (Qopen_network_stream, "open-network-stream");
9ce27fde 10361 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
10362 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10363
df7492f9
KH
10364 DEFSYM (Qcoding_system, "coding-system");
10365 DEFSYM (Qcoding_aliases, "coding-aliases");
4ed46869 10366
df7492f9
KH
10367 DEFSYM (Qeol_type, "eol-type");
10368 DEFSYM (Qunix, "unix");
10369 DEFSYM (Qdos, "dos");
4ed46869 10370
df7492f9
KH
10371 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10372 DEFSYM (Qpost_read_conversion, "post-read-conversion");
10373 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10374 DEFSYM (Qdefault_char, "default-char");
10375 DEFSYM (Qundecided, "undecided");
10376 DEFSYM (Qno_conversion, "no-conversion");
10377 DEFSYM (Qraw_text, "raw-text");
4ed46869 10378
df7492f9 10379 DEFSYM (Qiso_2022, "iso-2022");
4ed46869 10380
df7492f9 10381 DEFSYM (Qutf_8, "utf-8");
8f924df7 10382 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
27901516 10383
df7492f9 10384 DEFSYM (Qutf_16, "utf-16");
df7492f9
KH
10385 DEFSYM (Qbig, "big");
10386 DEFSYM (Qlittle, "little");
27901516 10387
df7492f9
KH
10388 DEFSYM (Qshift_jis, "shift-jis");
10389 DEFSYM (Qbig5, "big5");
4ed46869 10390
df7492f9 10391 DEFSYM (Qcoding_system_p, "coding-system-p");
4ed46869 10392
df7492f9 10393 DEFSYM (Qcoding_system_error, "coding-system-error");
4ed46869 10394 Fput (Qcoding_system_error, Qerror_conditions,
d67b4f80 10395 pure_cons (Qcoding_system_error, pure_cons (Qerror, Qnil)));
4ed46869 10396 Fput (Qcoding_system_error, Qerror_message,
d67b4f80 10397 make_pure_c_string ("Invalid coding system"));
4ed46869 10398
05e6f5dc
KH
10399 /* Intern this now in case it isn't already done.
10400 Setting this variable twice is harmless.
10401 But don't staticpro it here--that is done in alloc.c. */
d67b4f80 10402 Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
70c22245 10403
df7492f9 10404 DEFSYM (Qtranslation_table, "translation-table");
433f7f87 10405 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
df7492f9
KH
10406 DEFSYM (Qtranslation_table_id, "translation-table-id");
10407 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10408 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
1397dc18 10409
df7492f9 10410 DEFSYM (Qvalid_codes, "valid-codes");
9ce27fde 10411
df7492f9 10412 DEFSYM (Qemacs_mule, "emacs-mule");
d46c5b12 10413
01378f49 10414 DEFSYM (QCcategory, ":category");
a6f87d34 10415 DEFSYM (QCmnemonic, ":mnemonic");
2133e2d1 10416 DEFSYM (QCdefault_char, ":default-char");
a6f87d34
KH
10417 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10418 DEFSYM (QCencode_translation_table, ":encode-translation-table");
10419 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10420 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
35befdaa 10421 DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
01378f49 10422
df7492f9
KH
10423 Vcoding_category_table
10424 = Fmake_vector (make_number (coding_category_max), Qnil);
10425 staticpro (&Vcoding_category_table);
10426 /* Followings are target of code detection. */
10427 ASET (Vcoding_category_table, coding_category_iso_7,
d67b4f80 10428 intern_c_string ("coding-category-iso-7"));
df7492f9 10429 ASET (Vcoding_category_table, coding_category_iso_7_tight,
d67b4f80 10430 intern_c_string ("coding-category-iso-7-tight"));
df7492f9 10431 ASET (Vcoding_category_table, coding_category_iso_8_1,
d67b4f80 10432 intern_c_string ("coding-category-iso-8-1"));
df7492f9 10433 ASET (Vcoding_category_table, coding_category_iso_8_2,
d67b4f80 10434 intern_c_string ("coding-category-iso-8-2"));
df7492f9 10435 ASET (Vcoding_category_table, coding_category_iso_7_else,
d67b4f80 10436 intern_c_string ("coding-category-iso-7-else"));
df7492f9 10437 ASET (Vcoding_category_table, coding_category_iso_8_else,
d67b4f80 10438 intern_c_string ("coding-category-iso-8-else"));
a470d443 10439 ASET (Vcoding_category_table, coding_category_utf_8_auto,
d67b4f80 10440 intern_c_string ("coding-category-utf-8-auto"));
a470d443 10441 ASET (Vcoding_category_table, coding_category_utf_8_nosig,
d67b4f80 10442 intern_c_string ("coding-category-utf-8"));
a470d443 10443 ASET (Vcoding_category_table, coding_category_utf_8_sig,
d67b4f80 10444 intern_c_string ("coding-category-utf-8-sig"));
df7492f9 10445 ASET (Vcoding_category_table, coding_category_utf_16_be,
d67b4f80 10446 intern_c_string ("coding-category-utf-16-be"));
ff563fce 10447 ASET (Vcoding_category_table, coding_category_utf_16_auto,
d67b4f80 10448 intern_c_string ("coding-category-utf-16-auto"));
df7492f9 10449 ASET (Vcoding_category_table, coding_category_utf_16_le,
d67b4f80 10450 intern_c_string ("coding-category-utf-16-le"));
df7492f9 10451 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
d67b4f80 10452 intern_c_string ("coding-category-utf-16-be-nosig"));
df7492f9 10453 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
d67b4f80 10454 intern_c_string ("coding-category-utf-16-le-nosig"));
df7492f9 10455 ASET (Vcoding_category_table, coding_category_charset,
d67b4f80 10456 intern_c_string ("coding-category-charset"));
df7492f9 10457 ASET (Vcoding_category_table, coding_category_sjis,
d67b4f80 10458 intern_c_string ("coding-category-sjis"));
df7492f9 10459 ASET (Vcoding_category_table, coding_category_big5,
d67b4f80 10460 intern_c_string ("coding-category-big5"));
df7492f9 10461 ASET (Vcoding_category_table, coding_category_ccl,
d67b4f80 10462 intern_c_string ("coding-category-ccl"));
df7492f9 10463 ASET (Vcoding_category_table, coding_category_emacs_mule,
d67b4f80 10464 intern_c_string ("coding-category-emacs-mule"));
df7492f9
KH
10465 /* Followings are NOT target of code detection. */
10466 ASET (Vcoding_category_table, coding_category_raw_text,
d67b4f80 10467 intern_c_string ("coding-category-raw-text"));
df7492f9 10468 ASET (Vcoding_category_table, coding_category_undecided,
d67b4f80 10469 intern_c_string ("coding-category-undecided"));
ecf488bc 10470
065e3595
KH
10471 DEFSYM (Qinsufficient_source, "insufficient-source");
10472 DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10473 DEFSYM (Qinvalid_source, "invalid-source");
10474 DEFSYM (Qinterrupted, "interrupted");
10475 DEFSYM (Qinsufficient_memory, "insufficient-memory");
44e8490d 10476 DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
065e3595 10477
4ed46869
KH
10478 defsubr (&Scoding_system_p);
10479 defsubr (&Sread_coding_system);
10480 defsubr (&Sread_non_nil_coding_system);
10481 defsubr (&Scheck_coding_system);
10482 defsubr (&Sdetect_coding_region);
d46c5b12 10483 defsubr (&Sdetect_coding_string);
05e6f5dc 10484 defsubr (&Sfind_coding_systems_region_internal);
068a9dbd 10485 defsubr (&Sunencodable_char_position);
df7492f9 10486 defsubr (&Scheck_coding_systems_region);
4ed46869
KH
10487 defsubr (&Sdecode_coding_region);
10488 defsubr (&Sencode_coding_region);
10489 defsubr (&Sdecode_coding_string);
10490 defsubr (&Sencode_coding_string);
10491 defsubr (&Sdecode_sjis_char);
10492 defsubr (&Sencode_sjis_char);
10493 defsubr (&Sdecode_big5_char);
10494 defsubr (&Sencode_big5_char);
1ba9e4ab 10495 defsubr (&Sset_terminal_coding_system_internal);
c4825358 10496 defsubr (&Sset_safe_terminal_coding_system_internal);
4ed46869 10497 defsubr (&Sterminal_coding_system);
1ba9e4ab 10498 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 10499 defsubr (&Skeyboard_coding_system);
a5d301df 10500 defsubr (&Sfind_operation_coding_system);
df7492f9 10501 defsubr (&Sset_coding_system_priority);
6b89e3aa 10502 defsubr (&Sdefine_coding_system_internal);
df7492f9 10503 defsubr (&Sdefine_coding_system_alias);
a6f87d34 10504 defsubr (&Scoding_system_put);
df7492f9
KH
10505 defsubr (&Scoding_system_base);
10506 defsubr (&Scoding_system_plist);
10507 defsubr (&Scoding_system_aliases);
10508 defsubr (&Scoding_system_eol_type);
10509 defsubr (&Scoding_system_priority_list);
4ed46869 10510
29208e82 10511 DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
48b0f3ae
PJ
10512 doc: /* List of coding systems.
10513
10514Do not alter the value of this variable manually. This variable should be
df7492f9 10515updated by the functions `define-coding-system' and
48b0f3ae 10516`define-coding-system-alias'. */);
4608c386
KH
10517 Vcoding_system_list = Qnil;
10518
29208e82 10519 DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
48b0f3ae
PJ
10520 doc: /* Alist of coding system names.
10521Each element is one element list of coding system name.
446dcd75 10522This variable is given to `completing-read' as COLLECTION argument.
48b0f3ae
PJ
10523
10524Do not alter the value of this variable manually. This variable should be
10525updated by the functions `make-coding-system' and
10526`define-coding-system-alias'. */);
4608c386
KH
10527 Vcoding_system_alist = Qnil;
10528
29208e82 10529 DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
48b0f3ae
PJ
10530 doc: /* List of coding-categories (symbols) ordered by priority.
10531
10532On detecting a coding system, Emacs tries code detection algorithms
10533associated with each coding-category one by one in this order. When
10534one algorithm agrees with a byte sequence of source text, the coding
0ec31faf
KH
10535system bound to the corresponding coding-category is selected.
10536
448e17d6 10537Don't modify this variable directly, but use `set-coding-system-priority'. */);
4ed46869
KH
10538 {
10539 int i;
10540
10541 Vcoding_category_list = Qnil;
df7492f9 10542 for (i = coding_category_max - 1; i >= 0; i--)
4ed46869 10543 Vcoding_category_list
d46c5b12
KH
10544 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
10545 Vcoding_category_list);
4ed46869
KH
10546 }
10547
29208e82 10548 DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
48b0f3ae
PJ
10549 doc: /* Specify the coding system for read operations.
10550It is useful to bind this variable with `let', but do not set it globally.
10551If the value is a coding system, it is used for decoding on read operation.
446dcd75
JB
10552If not, an appropriate element is used from one of the coding system alists.
10553There are three such tables: `file-coding-system-alist',
48b0f3ae 10554`process-coding-system-alist', and `network-coding-system-alist'. */);
4ed46869
KH
10555 Vcoding_system_for_read = Qnil;
10556
29208e82 10557 DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
48b0f3ae
PJ
10558 doc: /* Specify the coding system for write operations.
10559Programs bind this variable with `let', but you should not set it globally.
10560If the value is a coding system, it is used for encoding of output,
10561when writing it to a file and when sending it to a file or subprocess.
10562
10563If this does not specify a coding system, an appropriate element
446dcd75
JB
10564is used from one of the coding system alists.
10565There are three such tables: `file-coding-system-alist',
48b0f3ae
PJ
10566`process-coding-system-alist', and `network-coding-system-alist'.
10567For output to files, if the above procedure does not specify a coding system,
10568the value of `buffer-file-coding-system' is used. */);
4ed46869
KH
10569 Vcoding_system_for_write = Qnil;
10570
29208e82 10571 DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
df7492f9
KH
10572 doc: /*
10573Coding system used in the latest file or process I/O. */);
4ed46869
KH
10574 Vlast_coding_system_used = Qnil;
10575
29208e82 10576 DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
065e3595
KH
10577 doc: /*
10578Error status of the last code conversion.
10579
10580When an error was detected in the last code conversion, this variable
10581is set to one of the following symbols.
10582 `insufficient-source'
10583 `inconsistent-eol'
10584 `invalid-source'
10585 `interrupted'
10586 `insufficient-memory'
10587When no error was detected, the value doesn't change. So, to check
10588the error status of a code conversion by this variable, you must
10589explicitly set this variable to nil before performing code
10590conversion. */);
10591 Vlast_code_conversion_error = Qnil;
10592
29208e82 10593 DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
df7492f9
KH
10594 doc: /*
10595*Non-nil means always inhibit code conversion of end-of-line format.
48b0f3ae
PJ
10596See info node `Coding Systems' and info node `Text and Binary' concerning
10597such conversion. */);
9ce27fde
KH
10598 inhibit_eol_conversion = 0;
10599
29208e82 10600 DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
df7492f9
KH
10601 doc: /*
10602Non-nil means process buffer inherits coding system of process output.
48b0f3ae
PJ
10603Bind it to t if the process output is to be treated as if it were a file
10604read from some filesystem. */);
ed29121d
EZ
10605 inherit_process_coding_system = 0;
10606
29208e82 10607 DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
df7492f9
KH
10608 doc: /*
10609Alist to decide a coding system to use for a file I/O operation.
48b0f3ae
PJ
10610The format is ((PATTERN . VAL) ...),
10611where PATTERN is a regular expression matching a file name,
10612VAL is a coding system, a cons of coding systems, or a function symbol.
10613If VAL is a coding system, it is used for both decoding and encoding
10614the file contents.
10615If VAL is a cons of coding systems, the car part is used for decoding,
10616and the cdr part is used for encoding.
10617If VAL is a function symbol, the function must return a coding system
2c53e699
KH
10618or a cons of coding systems which are used as above. The function is
10619called with an argument that is a list of the arguments with which
5a0bbd9a
KH
10620`find-operation-coding-system' was called. If the function can't decide
10621a coding system, it can return `undecided' so that the normal
10622code-detection is performed.
48b0f3ae
PJ
10623
10624See also the function `find-operation-coding-system'
10625and the variable `auto-coding-alist'. */);
02ba4723
KH
10626 Vfile_coding_system_alist = Qnil;
10627
29208e82 10628 DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
df7492f9
KH
10629 doc: /*
10630Alist to decide a coding system to use for a process I/O operation.
48b0f3ae
PJ
10631The format is ((PATTERN . VAL) ...),
10632where PATTERN is a regular expression matching a program name,
10633VAL is a coding system, a cons of coding systems, or a function symbol.
10634If VAL is a coding system, it is used for both decoding what received
10635from the program and encoding what sent to the program.
10636If VAL is a cons of coding systems, the car part is used for decoding,
10637and the cdr part is used for encoding.
10638If VAL is a function symbol, the function must return a coding system
10639or a cons of coding systems which are used as above.
10640
10641See also the function `find-operation-coding-system'. */);
02ba4723
KH
10642 Vprocess_coding_system_alist = Qnil;
10643
29208e82 10644 DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
df7492f9
KH
10645 doc: /*
10646Alist to decide a coding system to use for a network I/O operation.
48b0f3ae
PJ
10647The format is ((PATTERN . VAL) ...),
10648where PATTERN is a regular expression matching a network service name
10649or is a port number to connect to,
10650VAL is a coding system, a cons of coding systems, or a function symbol.
10651If VAL is a coding system, it is used for both decoding what is received
10652from the network stream and encoding what is sent to the network stream.
10653If VAL is a cons of coding systems, the car part is used for decoding,
10654and the cdr part is used for encoding.
10655If VAL is a function symbol, the function must return a coding system
10656or a cons of coding systems which are used as above.
10657
10658See also the function `find-operation-coding-system'. */);
02ba4723 10659 Vnetwork_coding_system_alist = Qnil;
4ed46869 10660
29208e82 10661 DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
75205970
RS
10662 doc: /* Coding system to use with system messages.
10663Also used for decoding keyboard input on X Window system. */);
68c45bf0
PE
10664 Vlocale_coding_system = Qnil;
10665
005f0d35 10666 /* The eol mnemonics are reset in startup.el system-dependently. */
29208e82 10667 DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
df7492f9
KH
10668 doc: /*
10669*String displayed in mode line for UNIX-like (LF) end-of-line format. */);
d67b4f80 10670 eol_mnemonic_unix = make_pure_c_string (":");
4ed46869 10671
29208e82 10672 DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
df7492f9
KH
10673 doc: /*
10674*String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
d67b4f80 10675 eol_mnemonic_dos = make_pure_c_string ("\\");
4ed46869 10676
29208e82 10677 DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
df7492f9
KH
10678 doc: /*
10679*String displayed in mode line for MAC-like (CR) end-of-line format. */);
d67b4f80 10680 eol_mnemonic_mac = make_pure_c_string ("/");
4ed46869 10681
29208e82 10682 DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
df7492f9
KH
10683 doc: /*
10684*String displayed in mode line when end-of-line format is not yet determined. */);
d67b4f80 10685 eol_mnemonic_undecided = make_pure_c_string (":");
4ed46869 10686
29208e82 10687 DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
df7492f9
KH
10688 doc: /*
10689*Non-nil enables character translation while encoding and decoding. */);
84fbb8a0 10690 Venable_character_translation = Qt;
bdd9fb48 10691
f967223b 10692 DEFVAR_LISP ("standard-translation-table-for-decode",
29208e82 10693 Vstandard_translation_table_for_decode,
48b0f3ae 10694 doc: /* Table for translating characters while decoding. */);
f967223b 10695 Vstandard_translation_table_for_decode = Qnil;
bdd9fb48 10696
f967223b 10697 DEFVAR_LISP ("standard-translation-table-for-encode",
29208e82 10698 Vstandard_translation_table_for_encode,
48b0f3ae 10699 doc: /* Table for translating characters while encoding. */);
f967223b 10700 Vstandard_translation_table_for_encode = Qnil;
4ed46869 10701
29208e82 10702 DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
48b0f3ae
PJ
10703 doc: /* Alist of charsets vs revision numbers.
10704While encoding, if a charset (car part of an element) is found,
df7492f9
KH
10705designate it with the escape sequence identifying revision (cdr part
10706of the element). */);
10707 Vcharset_revision_table = Qnil;
02ba4723
KH
10708
10709 DEFVAR_LISP ("default-process-coding-system",
29208e82 10710 Vdefault_process_coding_system,
48b0f3ae
PJ
10711 doc: /* Cons of coding systems used for process I/O by default.
10712The car part is used for decoding a process output,
10713the cdr part is used for encoding a text to be sent to a process. */);
02ba4723 10714 Vdefault_process_coding_system = Qnil;
c4825358 10715
29208e82 10716 DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
df7492f9
KH
10717 doc: /*
10718Table of extra Latin codes in the range 128..159 (inclusive).
48b0f3ae
PJ
10719This is a vector of length 256.
10720If Nth element is non-nil, the existence of code N in a file
10721\(or output of subprocess) doesn't prevent it from being detected as
10722a coding system of ISO 2022 variant which has a flag
10723`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10724or reading output of a subprocess.
446dcd75 10725Only 128th through 159th elements have a meaning. */);
3f003981 10726 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
d46c5b12
KH
10727
10728 DEFVAR_LISP ("select-safe-coding-system-function",
29208e82 10729 Vselect_safe_coding_system_function,
df7492f9
KH
10730 doc: /*
10731Function to call to select safe coding system for encoding a text.
48b0f3ae
PJ
10732
10733If set, this function is called to force a user to select a proper
10734coding system which can encode the text in the case that a default
fdecf907
GM
10735coding system used in each operation can't encode the text. The
10736function should take care that the buffer is not modified while
10737the coding system is being selected.
48b0f3ae
PJ
10738
10739The default value is `select-safe-coding-system' (which see). */);
d46c5b12
KH
10740 Vselect_safe_coding_system_function = Qnil;
10741
5d5bf4d8 10742 DEFVAR_BOOL ("coding-system-require-warning",
29208e82 10743 coding_system_require_warning,
5d5bf4d8 10744 doc: /* Internal use only.
6b89e3aa
KH
10745If non-nil, on writing a file, `select-safe-coding-system-function' is
10746called even if `coding-system-for-write' is non-nil. The command
10747`universal-coding-system-argument' binds this variable to t temporarily. */);
5d5bf4d8
KH
10748 coding_system_require_warning = 0;
10749
10750
22ab2303 10751 DEFVAR_BOOL ("inhibit-iso-escape-detection",
29208e82 10752 inhibit_iso_escape_detection,
df7492f9 10753 doc: /*
97b1b294 10754If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
48b0f3ae 10755
97b1b294
EZ
10756When Emacs reads text, it tries to detect how the text is encoded.
10757This code detection is sensitive to escape sequences. If Emacs sees
10758a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10759of the ISO2022 encodings, and decodes text by the corresponding coding
10760system (e.g. `iso-2022-7bit').
48b0f3ae
PJ
10761
10762However, there may be a case that you want to read escape sequences in
10763a file as is. In such a case, you can set this variable to non-nil.
97b1b294
EZ
10764Then the code detection will ignore any escape sequences, and no text is
10765detected as encoded in some ISO-2022 encoding. The result is that all
48b0f3ae
PJ
10766escape sequences become visible in a buffer.
10767
10768The default value is nil, and it is strongly recommended not to change
10769it. That is because many Emacs Lisp source files that contain
10770non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10771in Emacs's distribution, and they won't be decoded correctly on
10772reading if you suppress escape sequence detection.
10773
10774The other way to read escape sequences in a file without decoding is
97b1b294 10775to explicitly specify some coding system that doesn't use ISO-2022
48b0f3ae 10776escape sequence (e.g. `latin-1') on reading by \\[universal-coding-system-argument]. */);
74383408 10777 inhibit_iso_escape_detection = 0;
002fdb44 10778
97b1b294 10779 DEFVAR_BOOL ("inhibit-null-byte-detection",
29208e82 10780 inhibit_null_byte_detection,
97b1b294
EZ
10781 doc: /* If non-nil, Emacs ignores null bytes on code detection.
10782By default, Emacs treats it as binary data, and does not attempt to
10783decode it. The effect is as if you specified `no-conversion' for
10784reading that text.
10785
10786Set this to non-nil when a regular text happens to include null bytes.
10787Examples are Index nodes of Info files and null-byte delimited output
10788from GNU Find and GNU Grep. Emacs will then ignore the null bytes and
10789decode text as usual. */);
10790 inhibit_null_byte_detection = 0;
10791
29208e82 10792 DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
15c8f9d1 10793 doc: /* Char table for translating self-inserting characters.
446dcd75 10794This is applied to the result of input methods, not their input.
8434d0b8
EZ
10795See also `keyboard-translate-table'.
10796
10797Use of this variable for character code unification was rendered
10798obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10799internal character representation. */);
002fdb44 10800 Vtranslation_table_for_input = Qnil;
8f924df7 10801
2c78b7e1
KH
10802 {
10803 Lisp_Object args[coding_arg_max];
8f924df7 10804 Lisp_Object plist[16];
2c78b7e1
KH
10805 int i;
10806
10807 for (i = 0; i < coding_arg_max; i++)
10808 args[i] = Qnil;
10809
d67b4f80 10810 plist[0] = intern_c_string (":name");
2c78b7e1 10811 plist[1] = args[coding_arg_name] = Qno_conversion;
d67b4f80 10812 plist[2] = intern_c_string (":mnemonic");
2c78b7e1 10813 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
d67b4f80 10814 plist[4] = intern_c_string (":coding-type");
2c78b7e1 10815 plist[5] = args[coding_arg_coding_type] = Qraw_text;
d67b4f80 10816 plist[6] = intern_c_string (":ascii-compatible-p");
2c78b7e1 10817 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
d67b4f80 10818 plist[8] = intern_c_string (":default-char");
2c78b7e1 10819 plist[9] = args[coding_arg_default_char] = make_number (0);
d67b4f80 10820 plist[10] = intern_c_string (":for-unibyte");
8f924df7 10821 plist[11] = args[coding_arg_for_unibyte] = Qt;
d67b4f80
DN
10822 plist[12] = intern_c_string (":docstring");
10823 plist[13] = make_pure_c_string ("Do no conversion.\n\
2c78b7e1
KH
10824\n\
10825When you visit a file with this coding, the file is read into a\n\
10826unibyte buffer as is, thus each byte of a file is treated as a\n\
10827character.");
d67b4f80 10828 plist[14] = intern_c_string (":eol-type");
8f924df7
KH
10829 plist[15] = args[coding_arg_eol_type] = Qunix;
10830 args[coding_arg_plist] = Flist (16, plist);
2c78b7e1 10831 Fdefine_coding_system_internal (coding_arg_max, args);
ae6f73fa
KH
10832
10833 plist[1] = args[coding_arg_name] = Qundecided;
10834 plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10835 plist[5] = args[coding_arg_coding_type] = Qundecided;
10836 /* This is already set.
35befdaa 10837 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
d67b4f80 10838 plist[8] = intern_c_string (":charset-list");
ae6f73fa
KH
10839 plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10840 plist[11] = args[coding_arg_for_unibyte] = Qnil;
d67b4f80 10841 plist[13] = make_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
ae6f73fa
KH
10842 plist[15] = args[coding_arg_eol_type] = Qnil;
10843 args[coding_arg_plist] = Flist (16, plist);
10844 Fdefine_coding_system_internal (coding_arg_max, args);
2c78b7e1
KH
10845 }
10846
2c78b7e1 10847 setup_coding_system (Qno_conversion, &safe_terminal_coding);
ff563fce
KH
10848
10849 {
10850 int i;
10851
10852 for (i = 0; i < coding_category_max; i++)
10853 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10854 }
1a4990fb 10855#if defined (DOS_NT)
fcbcfb64
KH
10856 system_eol_type = Qdos;
10857#else
10858 system_eol_type = Qunix;
10859#endif
10860 staticpro (&system_eol_type);
4ed46869
KH
10861}
10862
68c45bf0 10863char *
971de7fb 10864emacs_strerror (int error_number)
68c45bf0
PE
10865{
10866 char *str;
10867
ca9c0567 10868 synchronize_system_messages_locale ();
68c45bf0
PE
10869 str = strerror (error_number);
10870
10871 if (! NILP (Vlocale_coding_system))
10872 {
10873 Lisp_Object dec = code_convert_string_norecord (build_string (str),
10874 Vlocale_coding_system,
10875 0);
51b59d79 10876 str = SSDATA (dec);
68c45bf0
PE
10877 }
10878
10879 return str;
10880}
10881
4ed46869 10882#endif /* emacs */