(x_set_font): Always store a font to the font parameter,
[bpt/emacs.git] / src / coding.c
CommitLineData
9542cb1f 1/* Coding system handler (conversion, detection, etc).
aaef169d 2 Copyright (C) 2001, 2002, 2003, 2004, 2005,
76b6f707 3 2006, 2007, 2008, 2009 Free Software Foundation, Inc.
7976eda0 4 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
76b6f707 5 2005, 2006, 2007, 2008, 2009
ce03bf76
KH
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H14PRO021
8f924df7 8 Copyright (C) 2003
df7492f9
KH
9 National Institute of Advanced Industrial Science and Technology (AIST)
10 Registration Number H13PRO009
4ed46869 11
369314dc
KH
12This file is part of GNU Emacs.
13
9ec0b715 14GNU Emacs is free software: you can redistribute it and/or modify
369314dc 15it under the terms of the GNU General Public License as published by
9ec0b715
GM
16the Free Software Foundation, either version 3 of the License, or
17(at your option) any later version.
4ed46869 18
369314dc
KH
19GNU Emacs is distributed in the hope that it will be useful,
20but WITHOUT ANY WARRANTY; without even the implied warranty of
21MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22GNU General Public License for more details.
4ed46869 23
369314dc 24You should have received a copy of the GNU General Public License
9ec0b715 25along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
4ed46869
KH
26
27/*** TABLE OF CONTENTS ***
28
b73bfc1c 29 0. General comments
4ed46869 30 1. Preamble
df7492f9
KH
31 2. Emacs' internal format (emacs-utf-8) handlers
32 3. UTF-8 handlers
33 4. UTF-16 handlers
34 5. Charset-base coding systems handlers
35 6. emacs-mule (old Emacs' internal format) handlers
36 7. ISO2022 handlers
37 8. Shift-JIS and BIG5 handlers
38 9. CCL handlers
39 10. C library functions
40 11. Emacs Lisp library functions
41 12. Postamble
4ed46869
KH
42
43*/
44
df7492f9 45/*** 0. General comments ***
b73bfc1c
KH
46
47
df7492f9 48CODING SYSTEM
4ed46869 49
5bad0796
DL
50 A coding system is an object for an encoding mechanism that contains
51 information about how to convert byte sequences to character
e19c3639
KH
52 sequences and vice versa. When we say "decode", it means converting
53 a byte sequence of a specific coding system into a character
54 sequence that is represented by Emacs' internal coding system
55 `emacs-utf-8', and when we say "encode", it means converting a
56 character sequence of emacs-utf-8 to a byte sequence of a specific
0ef69138 57 coding system.
4ed46869 58
e19c3639
KH
  In Emacs Lisp, a coding system is represented by a Lisp symbol.  At
  the C level, a coding system is represented by a vector of
  attributes stored in the hash table Vcoding_system_hash_table.  The
  conversion from a coding system symbol to its attributes vector is
  done by looking up Vcoding_system_hash_table by the symbol.
4ed46869 64
e19c3639 65 Coding systems are classified into the following types depending on
5bad0796 66 the encoding mechanism. Here's a brief description of the types.
4ed46869 67
df7492f9
KH
68 o UTF-8
69
70 o UTF-16
71
72 o Charset-base coding system
73
74 A coding system defined by one or more (coded) character sets.
5bad0796 75 Decoding and encoding are done by a code converter defined for each
df7492f9
KH
76 character set.
77
5bad0796 78 o Old Emacs internal format (emacs-mule)
df7492f9 79
5bad0796 80 The coding system adopted by old versions of Emacs (20 and 21).
4ed46869 81
df7492f9 82 o ISO2022-base coding system
4ed46869
KH
83
84 The most famous coding system for multiple character sets. X's
df7492f9
KH
85 Compound Text, various EUCs (Extended Unix Code), and coding systems
86 used in the Internet communication such as ISO-2022-JP are all
87 variants of ISO2022.
4ed46869 88
df7492f9 89 o SJIS (or Shift-JIS or MS-Kanji-Code)
93dec019 90
4ed46869
KH
91 A coding system to encode character sets: ASCII, JISX0201, and
92 JISX0208. Widely used for PC's in Japan. Details are described in
df7492f9 93 section 8.
4ed46869 94
df7492f9 95 o BIG5
4ed46869 96
df7492f9 97 A coding system to encode character sets: ASCII and Big5. Widely
cfb43547 98 used for Chinese (mainly in Taiwan and Hong Kong). Details are
df7492f9
KH
99 described in section 8. In this file, when we write "big5" (all
100 lowercase), we mean the coding system, and when we write "Big5"
101 (capitalized), we mean the character set.
4ed46869 102
df7492f9 103 o CCL
27901516 104
5bad0796 105 If a user wants to decode/encode text encoded in a coding system
df7492f9
KH
106 not listed above, he can supply a decoder and an encoder for it in
107 CCL (Code Conversion Language) programs. Emacs executes the CCL
108 program while decoding/encoding.
27901516 109
df7492f9 110 o Raw-text
4ed46869 111
5a936b46 112 A coding system for text containing raw eight-bit data. Emacs
5bad0796 113 treats each byte of source text as a character (except for
df7492f9 114 end-of-line conversion).
4ed46869 115
df7492f9
KH
116 o No-conversion
117
118 Like raw text, but don't do end-of-line conversion.
4ed46869 119
4ed46869 120
df7492f9 121END-OF-LINE FORMAT
4ed46869 122
  How text end-of-line is encoded depends on the operating system.
  For instance, Unix's format is just one byte of LF (line-feed) code,
  whereas DOS's format is a two-byte sequence of `carriage-return' and
  `line-feed' codes.  MacOS's format is usually one byte of
  `carriage-return'.
4ed46869 128
cfb43547 129 Since text character encoding and end-of-line encoding are
df7492f9
KH
130 independent, any coding system described above can take any format
131 of end-of-line (except for no-conversion).
4ed46869 132
e19c3639
KH
133STRUCT CODING_SYSTEM
134
135 Before using a coding system for code conversion (i.e. decoding and
136 encoding), we setup a structure of type `struct coding_system'.
137 This structure keeps various information about a specific code
5bad0796 138 conversion (e.g. the location of source and destination data).
4ed46869
KH
139
140*/
141
df7492f9
KH
142/* COMMON MACROS */
143
144
4ed46869
KH
145/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
146
df7492f9 147 These functions check if a byte sequence specified as a source in
ff0dacd7
KH
148 CODING conforms to the format of XXX, and update the members of
149 DETECT_INFO.
df7492f9 150
ff0dacd7 151 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
df7492f9
KH
152
153 Below is the template of these functions. */
154
#if 0
/* Template of a detector.  Scan the source bytes of CODING and update
   DETECT_INFO: set the XXX bit in DETECT_INFO->rejected and return 0
   as soon as a byte that does not conform to XXX is seen; otherwise,
   when the source is exhausted, merge FOUND into DETECT_INFO->found
   and return 1.  */
static int
detect_coding_XXX (coding, detect_info)
     struct coding_system *coding;
     struct coding_detection_info *detect_info;
{
  const unsigned char *src = coding->source;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int multibytep = coding->src_multibyte;
  int consumed_chars = 0;
  int found = 0;
  ...;

  while (1)
    {
      /* Get one byte from the source.  If the source is exhausted, jump
         to no_more_source:.  */
      ONE_MORE_BYTE (c);

      if (! __C_conforms_to_XXX___ (c))
        break;
      /* Only a byte that positively indicates XXX counts as evidence;
         bytes that merely conform leave FOUND untouched.  */
      if (__C_strongly_suggests_XXX__ (c))
        found = CATEGORY_MASK_XXX;
    }
  /* The byte sequence is invalid for XXX.  */
  detect_info->rejected |= CATEGORY_MASK_XXX;
  return 0;

 no_more_source:
  /* The source was exhausted successfully.  */
  detect_info->found |= found;
  return 1;
}
#endif
189
190/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
191
df7492f9
KH
192 These functions decode a byte sequence specified as a source by
193 CODING. The resulting multibyte text goes to a place pointed to by
194 CODING->charbuf, the length of which should not exceed
195 CODING->charbuf_size;
d46c5b12 196
df7492f9
KH
197 These functions set the information of original and decoded texts in
198 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
199 They also set CODING->result to one of CODING_RESULT_XXX indicating
200 how the decoding is finished.
d46c5b12 201
df7492f9 202 Below is the template of these functions. */
d46c5b12 203
#if 0
/* Template of a decoder.  Decode bytes of CODING->source, starting at
   offset CODING->consumed, into CODING->charbuf, and record how much
   was consumed and produced in CODING.  */
static void
decode_coding_XXXX (coding)
     struct coding_system *coding;
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* SRC_BASE remembers the start position in source in each loop.
     The loop will be exited when there's not enough source code, or
     when there's no room in CHARBUF for a decoded character.  */
  const unsigned char *src_base;
  /* A buffer to produce decoded characters.  */
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  int multibytep = coding->src_multibyte;

  while (1)
    {
      src_base = src;
      if (charbuf >= charbuf_end)
        /* No more room to produce a decoded character.  */
        break;
      ONE_MORE_BYTE (c);
      /* Decode it.  */
    }

 no_more_source:
  if (src_base < src_end
      && coding->mode & CODING_MODE_LAST_BLOCK)
    /* If the source ends by partial bytes to construct a character,
       treat them as eight-bit raw data.  */
    while (src_base < src_end && charbuf < charbuf_end)
      *charbuf++ = *src_base++;
  /* Remember how many bytes and characters we consumed.  If the
     source is multibyte, the bytes and chars are not identical.  */
  coding->consumed = coding->consumed_char = src_base - coding->source;
  /* Remember how many characters we produced.  */
  coding->charbuf_used = charbuf - coding->charbuf;
}
#endif
244
245/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
246
df7492f9
KH
247 These functions encode SRC_BYTES length text at SOURCE of Emacs'
248 internal multibyte format by CODING. The resulting byte sequence
b73bfc1c
KH
249 goes to a place pointed to by DESTINATION, the length of which
250 should not exceed DST_BYTES.
d46c5b12 251
df7492f9
KH
252 These functions set the information of original and encoded texts in
253 the members produced, produced_char, consumed, and consumed_char of
254 the structure *CODING. They also set the member result to one of
255 CODING_RESULT_XXX indicating how the encoding finished.
d46c5b12 256
df7492f9
KH
  DST_BYTES zero means that the source area and the destination area
  overlap, which means that we can produce encoded text only until it
  reaches the head of the not-yet-encoded source text.
d46c5b12 260
df7492f9 261 Below is a template of these functions. */
#if 0
/* Template of an encoder.  Encode the characters held in
   CODING->charbuf (CODING->charbuf_used of them) into
   CODING->destination, starting at offset CODING->produced, and
   record how much was produced in CODING.  */
static void
encode_coding_XXX (coding)
     struct coding_system *coding;
{
  int multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  int *charbuf_end = coding->charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  /* Stop early enough that one more iteration can never run past
     DST_END.  */
  unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
  int produced_chars = 0;

  for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
    {
      int c = *charbuf;
      /* Encode C into DST, and increment DST.  */
    }
 label_no_more_destination:
  /* How many chars and bytes we produced.  */
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
}
#endif
4ed46869
KH
287\f
288/*** 1. Preamble ***/
289
68c45bf0 290#include <config.h>
4ed46869
KH
291#include <stdio.h>
292
4ed46869
KH
293#include "lisp.h"
294#include "buffer.h"
df7492f9 295#include "character.h"
4ed46869
KH
296#include "charset.h"
297#include "ccl.h"
df7492f9 298#include "composite.h"
4ed46869
KH
299#include "coding.h"
300#include "window.h"
b8299c66
KL
301#include "frame.h"
302#include "termhooks.h"
4ed46869 303
df7492f9 304Lisp_Object Vcoding_system_hash_table;
4ed46869 305
df7492f9 306Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
1965cb73
DL
307Lisp_Object Qunix, Qdos;
308extern Lisp_Object Qmac; /* frame.c */
4ed46869
KH
309Lisp_Object Qbuffer_file_coding_system;
310Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
df7492f9 311Lisp_Object Qdefault_char;
27901516 312Lisp_Object Qno_conversion, Qundecided;
df7492f9 313Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
b49a1807 314Lisp_Object Qbig, Qlittle;
bb0115a2 315Lisp_Object Qcoding_system_history;
1397dc18 316Lisp_Object Qvalid_codes;
2133e2d1 317Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
a6f87d34
KH
318Lisp_Object QCdecode_translation_table, QCencode_translation_table;
319Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
35befdaa 320Lisp_Object QCascii_compatible_p;
4ed46869
KH
321
322extern Lisp_Object Qinsert_file_contents, Qwrite_region;
387f6ba5 323Lisp_Object Qcall_process, Qcall_process_region;
4ed46869
KH
324Lisp_Object Qstart_process, Qopen_network_stream;
325Lisp_Object Qtarget_idx;
326
065e3595
KH
327Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
328Lisp_Object Qinterrupted, Qinsufficient_memory;
329
c7183fb8
GM
330extern Lisp_Object Qcompletion_ignore_case;
331
44e8490d
KH
332/* If a symbol has this property, evaluate the value to define the
333 symbol as a coding system. */
334static Lisp_Object Qcoding_system_define_form;
335
5d5bf4d8
KH
336int coding_system_require_warning;
337
d46c5b12
KH
338Lisp_Object Vselect_safe_coding_system_function;
339
7722baf9
EZ
340/* Mnemonic string for each format of end-of-line. */
341Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
342/* Mnemonic string to indicate format of end-of-line is not yet
4ed46869 343 decided. */
7722baf9 344Lisp_Object eol_mnemonic_undecided;
4ed46869 345
fcbcfb64
KH
346/* Format of end-of-line decided by system. This is Qunix on
347 Unix and Mac, Qdos on DOS/Windows.
348 This has an effect only for external encoding (i.e. for output to
349 file and process), not for in-buffer or Lisp string encoding. */
350static Lisp_Object system_eol_type;
351
4ed46869
KH
#ifdef emacs

Lisp_Object Vcoding_system_list, Vcoding_system_alist;

Lisp_Object Qcoding_system_p, Qcoding_system_error;

/* Coding system emacs-mule and raw-text are for converting only
   end-of-line format.  */
Lisp_Object Qemacs_mule, Qraw_text;
Lisp_Object Qutf_8_emacs;

/* Coding-systems are handed between Emacs Lisp programs and C internal
   routines by the following three variables.  */
/* Coding-system for reading files and receiving data from process.  */
Lisp_Object Vcoding_system_for_read;
/* Coding-system for writing files and sending data to process.  */
Lisp_Object Vcoding_system_for_write;
/* Coding-system actually used in the latest I/O.  */
Lisp_Object Vlast_coding_system_used;
/* Set to non-nil when an error is detected during code conversion.  */
Lisp_Object Vlast_code_conversion_error;
/* A vector of length 256 which contains information about special
   Latin codes (especially for dealing with Microsoft codes).  */
Lisp_Object Vlatin_extra_code_table;

/* Flag to inhibit code conversion of end-of-line format.  */
int inhibit_eol_conversion;

/* Flag to inhibit ISO2022 escape sequence detection.  */
int inhibit_iso_escape_detection;

/* Flag to inhibit detection of binary files through null bytes.  */
int inhibit_null_byte_detection;

/* Flag to make buffer-file-coding-system inherit from process-coding.  */
int inherit_process_coding_system;

/* Coding system to be used to encode text for terminal display when
   terminal coding system is nil.  */
struct coding_system safe_terminal_coding;

Lisp_Object Vfile_coding_system_alist;
Lisp_Object Vprocess_coding_system_alist;
Lisp_Object Vnetwork_coding_system_alist;

Lisp_Object Vlocale_coding_system;

#endif /* emacs */
400
f967223b
KH
401/* Flag to tell if we look up translation table on character code
402 conversion. */
84fbb8a0 403Lisp_Object Venable_character_translation;
f967223b
KH
404/* Standard translation table to look up on decoding (reading). */
405Lisp_Object Vstandard_translation_table_for_decode;
406/* Standard translation table to look up on encoding (writing). */
407Lisp_Object Vstandard_translation_table_for_encode;
84fbb8a0 408
f967223b
KH
409Lisp_Object Qtranslation_table;
410Lisp_Object Qtranslation_table_id;
411Lisp_Object Qtranslation_table_for_decode;
412Lisp_Object Qtranslation_table_for_encode;
4ed46869
KH
413
414/* Alist of charsets vs revision number. */
df7492f9 415static Lisp_Object Vcharset_revision_table;
4ed46869 416
02ba4723
KH
417/* Default coding systems used for process I/O. */
418Lisp_Object Vdefault_process_coding_system;
419
002fdb44
DL
420/* Char table for translating Quail and self-inserting input. */
421Lisp_Object Vtranslation_table_for_input;
422
df7492f9
KH
423/* Two special coding systems. */
424Lisp_Object Vsjis_coding_system;
425Lisp_Object Vbig5_coding_system;
426
df7492f9
KH
427/* ISO2022 section */
428
/* Return, as a C integer, element REG of the `coding_attr_iso_initial'
   attribute in the attribute vector of CODING.  */
#define CODING_ISO_INITIAL(coding, reg)                 \
  (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
                     coding_attr_iso_initial),          \
               (reg))))
433
434
/* Return CODING->safe_charsets[CHARSET_ID] if CHARSET_ID does not
   exceed CODING->max_charset_id, otherwise -1.  CHARSET_ID is
   evaluated twice, so it must not have side effects.  */
#define CODING_ISO_REQUEST(coding, charset_id)          \
  (((charset_id) <= (coding)->max_charset_id            \
    ? (coding)->safe_charsets[(charset_id)]             \
    : -1))
439
440
/* Accessors for the ISO2022 state kept in CODING->spec.iso_2022.
   Each one expands to an lvalue naming the corresponding member.  */
#define CODING_ISO_FLAGS(coding)        \
  ((coding)->spec.iso_2022.flags)
#define CODING_ISO_DESIGNATION(coding, reg)     \
  ((coding)->spec.iso_2022.current_designation[(reg)])
#define CODING_ISO_INVOCATION(coding, plane)    \
  ((coding)->spec.iso_2022.current_invocation[(plane)])
#define CODING_ISO_SINGLE_SHIFTING(coding)      \
  ((coding)->spec.iso_2022.single_shifting)
#define CODING_ISO_BOL(coding)  \
  ((coding)->spec.iso_2022.bol)
/* The designation of the register currently invoked to PLANE.  */
#define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
  (CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane))))
453
/* Control characters of ISO2022.  */
                        /* code */      /* function */
#define ISO_CODE_LF     0x0A            /* line-feed */
#define ISO_CODE_CR     0x0D            /* carriage-return */
#define ISO_CODE_SO     0x0E            /* shift-out */
#define ISO_CODE_SI     0x0F            /* shift-in */
#define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
#define ISO_CODE_ESC    0x1B            /* escape */
#define ISO_CODE_SS2    0x8E            /* single-shift-2 */
#define ISO_CODE_SS3    0x8F            /* single-shift-3 */
#define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
465
/* Every (1-byte) code of ISO2022 is classified into one of the
   following classes.  */
enum iso_code_class_type
  {
    ISO_control_0,              /* Control codes in the range
                                   0x00..0x1F and 0x7F, except for the
                                   following 4 codes.  */
    ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
    ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
    ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
    ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
    ISO_control_1,              /* Control codes in the range
                                   0x80..0x9F, except for the
                                   following 3 codes.  */
    ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
    ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
    ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
    ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
    ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
    ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
    ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
  };
05e6f5dc 488
df7492f9
KH
/** The macros CODING_ISO_FLAG_XXX define the flag bits of the
    `iso-flags' attribute of an iso2022 coding system.  */

/* If set, produce long-form designation sequence (e.g. ESC $ ( A)
   instead of the correct short-form sequence (e.g. ESC $ A).  */
#define CODING_ISO_FLAG_LONG_FORM       0x0001

/* If set, reset graphic planes and registers at end-of-line to the
   initial state.  */
#define CODING_ISO_FLAG_RESET_AT_EOL    0x0002

/* If set, reset graphic planes and registers before any control
   characters to the initial state.  */
#define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004

/* If set, encode by 7-bit environment.  */
#define CODING_ISO_FLAG_SEVEN_BITS      0x0008

/* If set, use locking-shift function.  */
#define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010

/* If set, use single-shift function.  Overwrite
   CODING_ISO_FLAG_LOCKING_SHIFT.  */
#define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020

/* If set, use designation escape sequence.  */
#define CODING_ISO_FLAG_DESIGNATION     0x0040

/* If set, produce revision number sequence.  */
#define CODING_ISO_FLAG_REVISION        0x0080

/* If set, produce ISO6429's direction specifying sequence.  */
#define CODING_ISO_FLAG_DIRECTION       0x0100

/* If set, assume designation states are reset at beginning of line on
   output.  */
#define CODING_ISO_FLAG_INIT_AT_BOL     0x0200

/* If set, designation sequence should be placed at beginning of line
   on output.  */
#define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400

/* If set, do not encode unsafe characters on output.  */
#define CODING_ISO_FLAG_SAFE            0x0800

/* If set, extra latin codes (128..159) are accepted as a valid code
   on input.  */
#define CODING_ISO_FLAG_LATIN_EXTRA     0x1000

/* NOTE(review): the five flags below carry no description in this
   section; consult their uses before relying on their meaning.  */

#define CODING_ISO_FLAG_COMPOSITION     0x2000

#define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000

#define CODING_ISO_FLAG_USE_ROMAN       0x8000

#define CODING_ISO_FLAG_USE_OLDJIS      0x10000

#define CODING_ISO_FLAG_FULL_SUPPORT    0x100000

/* A character to be produced on output if encoding of the original
   character is prohibited by CODING_ISO_FLAG_SAFE.  */
#define CODING_INHIBIT_CHARACTER_SUBSTITUTION   '?'
aa72b389 551
a470d443
KH
/* UTF-8 section */
/* The `utf_8_bom' member of CODING->spec.  */
#define CODING_UTF_8_BOM(coding)        \
  ((coding)->spec.utf_8_bom)

/* UTF-16 section */
/* Accessors for the members of CODING->spec.utf_16.  */
#define CODING_UTF_16_BOM(coding)       \
  ((coding)->spec.utf_16.bom)

#define CODING_UTF_16_ENDIAN(coding)    \
  ((coding)->spec.utf_16.endian)

#define CODING_UTF_16_SURROGATE(coding) \
  ((coding)->spec.utf_16.surrogate)


/* CCL section */
/* Accessors for the CCL-related entries in the attribute vector of
   CODING.  CODING_CCL_VALIDS yields the string data (SDATA) of the
   `coding_attr_ccl_valids' entry.  */
#define CODING_CCL_DECODER(coding)      \
  AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
#define CODING_CCL_ENCODER(coding)      \
  AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
#define CODING_CCL_VALIDS(coding)                                          \
  (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
4ed46869 574
/* Index for each coding category in `coding_categories'.  */

enum coding_category
  {
    coding_category_iso_7,
    coding_category_iso_7_tight,
    coding_category_iso_8_1,
    coding_category_iso_8_2,
    coding_category_iso_7_else,
    coding_category_iso_8_else,
    coding_category_utf_8_auto,
    coding_category_utf_8_nosig,
    coding_category_utf_8_sig,
    coding_category_utf_16_auto,
    coding_category_utf_16_be,
    coding_category_utf_16_le,
    coding_category_utf_16_be_nosig,
    coding_category_utf_16_le_nosig,
    coding_category_charset,
    coding_category_sjis,
    coding_category_big5,
    coding_category_ccl,
    coding_category_emacs_mule,
    /* All above are targets of code detection.  */
    coding_category_raw_text,
    coding_category_undecided,
    coding_category_max
  };

/* Definitions of flag bits used in detect_coding_XXXX.  Each mask is
   the bit whose position is the corresponding enum coding_category
   value.  */
#define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
#define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
#define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
#define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
#define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
#define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
#define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
#define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
#define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
#define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
#define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
#define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
#define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
#define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
#define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
#define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
#define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
#define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
#define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
#define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)

/* This value is returned if detect_coding_mask () finds nothing other
   than ASCII characters.  It is the union of all categories that are
   targets of code detection.  */
#define CATEGORY_MASK_ANY               \
  (CATEGORY_MASK_ISO_7                  \
   | CATEGORY_MASK_ISO_7_TIGHT          \
   | CATEGORY_MASK_ISO_8_1              \
   | CATEGORY_MASK_ISO_8_2              \
   | CATEGORY_MASK_ISO_7_ELSE           \
   | CATEGORY_MASK_ISO_8_ELSE           \
   | CATEGORY_MASK_UTF_8_AUTO           \
   | CATEGORY_MASK_UTF_8_NOSIG          \
   | CATEGORY_MASK_UTF_8_SIG            \
   | CATEGORY_MASK_UTF_16_AUTO          \
   | CATEGORY_MASK_UTF_16_BE            \
   | CATEGORY_MASK_UTF_16_LE            \
   | CATEGORY_MASK_UTF_16_BE_NOSIG      \
   | CATEGORY_MASK_UTF_16_LE_NOSIG      \
   | CATEGORY_MASK_CHARSET              \
   | CATEGORY_MASK_SJIS                 \
   | CATEGORY_MASK_BIG5                 \
   | CATEGORY_MASK_CCL                  \
   | CATEGORY_MASK_EMACS_MULE)


#define CATEGORY_MASK_ISO_7BIT \
  (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)

#define CATEGORY_MASK_ISO_8BIT \
  (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)

#define CATEGORY_MASK_ISO_ELSE \
  (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)

#define CATEGORY_MASK_ISO_ESCAPE        \
  (CATEGORY_MASK_ISO_7                  \
   | CATEGORY_MASK_ISO_7_TIGHT          \
   | CATEGORY_MASK_ISO_7_ELSE           \
   | CATEGORY_MASK_ISO_8_ELSE)

#define CATEGORY_MASK_ISO               \
  (CATEGORY_MASK_ISO_7BIT               \
   | CATEGORY_MASK_ISO_8BIT             \
   | CATEGORY_MASK_ISO_ELSE)

#define CATEGORY_MASK_UTF_16            \
  (CATEGORY_MASK_UTF_16_AUTO            \
   | CATEGORY_MASK_UTF_16_BE            \
   | CATEGORY_MASK_UTF_16_LE            \
   | CATEGORY_MASK_UTF_16_BE_NOSIG      \
   | CATEGORY_MASK_UTF_16_LE_NOSIG)

#define CATEGORY_MASK_UTF_8             \
  (CATEGORY_MASK_UTF_8_AUTO             \
   | CATEGORY_MASK_UTF_8_NOSIG          \
   | CATEGORY_MASK_UTF_8_SIG)
df7492f9
KH
681
682/* List of symbols `coding-category-xxx' ordered by priority. This
683 variable is exposed to Emacs Lisp. */
684static Lisp_Object Vcoding_category_list;
685
686/* Table of coding categories (Lisp symbols). This variable is for
687 internal use oly. */
688static Lisp_Object Vcoding_category_table;
689
690/* Table of coding-categories ordered by priority. */
691static enum coding_category coding_priorities[coding_category_max];
692
693/* Nth element is a coding context for the coding system bound to the
694 Nth coding category. */
695static struct coding_system coding_categories[coding_category_max];
696
df7492f9
KH
697/*** Commonly used macros and functions ***/
698
/* Fallback definitions of min and max.  Each argument may be
   evaluated twice, so arguments must not have side effects.  */
#ifndef min
#define min(a, b) ((a) < (b) ? (a) : (b))
#endif
#ifndef max
#define max(a, b) ((a) > (b) ? (a) : (b))
#endif
4ed46869 705
24a73b0a
KH
/* Set ATTRS to the attribute vector of CODING (looked up by
   CODING->id), and CHARSET_LIST to the charset list stored in that
   vector.  Both ATTRS and CHARSET_LIST must be assignable.  */
#define CODING_GET_INFO(coding, attrs, charset_list)    \
  do {                                                  \
    (attrs) = CODING_ID_ATTRS ((coding)->id);           \
    (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
  } while (0)
4ed46869 711
4ed46869 712
df7492f9
KH
/* Safely get one byte from the source text pointed by SRC which ends
   at SRC_END, and set C to that byte.  If there are no more bytes in
   the source, jump to `no_more_source'; before jumping, if SRC has
   advanced past SRC_BASE, record CODING_RESULT_INSUFFICIENT_SRC in
   CODING.

   If MULTIBYTEP is nonzero and the fetched byte has its high bit set,
   a multibyte sequence is consumed instead.  When the leading byte is
   0xC0 or 0xC1, the next byte is consumed as well and C is set to
   ((C & 1) << 6) | next-byte.  Otherwise C is set to the negative
   value of the character code returned by string_char, and
   CODING_RESULT_INVALID_SRC is recorded in CODING.

   CONSUMED_CHARS is incremented once per invocation.  The caller
   should declare and set these variables appropriately in advance:
        src, src_base, src_end, multibytep, consumed_chars, coding
   and must provide the label `no_more_source'.

   NOTE(review): in the multibyte case the bytes following the leading
   byte are read without comparing SRC against SRC_END; this presumably
   relies on the source holding only complete multibyte sequences --
   confirm against the callers.  */

#define ONE_MORE_BYTE(c)                                \
  do {                                                  \
    if (src == src_end)                                 \
      {                                                 \
        if (src_base < src)                             \
          record_conversion_result                      \
            (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
        goto no_more_source;                            \
      }                                                 \
    c = *src++;                                         \
    if (multibytep && (c & 0x80))                       \
      {                                                 \
        if ((c & 0xFE) == 0xC0)                         \
          c = ((c & 1) << 6) | *src++;                  \
        else                                            \
          {                                             \
            src--;                                      \
            c = - string_char (src, &src, NULL);        \
            record_conversion_result                    \
              (coding, CODING_RESULT_INVALID_SRC);      \
          }                                             \
      }                                                 \
    consumed_chars++;                                   \
  } while (0)
745
aa72b389 746
065e3595
KH
/* Like ONE_MORE_BYTE, but do not check whether SRC has reached
   SRC_END; the byte at SRC is fetched unconditionally, so there is no
   jump to `no_more_source'.  The multibyte handling and the increment
   of CONSUMED_CHARS are the same.  The caller should declare and set
   these variables appropriately in advance:
        src, multibytep, consumed_chars, coding  */

#define ONE_MORE_BYTE_NO_CHECK(c)                       \
  do {                                                  \
    c = *src++;                                         \
    if (multibytep && (c & 0x80))                       \
      {                                                 \
        if ((c & 0xFE) == 0xC0)                         \
          c = ((c & 1) << 6) | *src++;                  \
        else                                            \
          {                                             \
            src--;                                      \
            c = - string_char (src, &src, NULL);        \
            record_conversion_result                    \
              (coding, CODING_RESULT_INVALID_SRC);      \
          }                                             \
      }                                                 \
    consumed_chars++;                                   \
  } while (0)
764
aa72b389 765
df7492f9
KH
/* Store a byte C in the place pointed by DST and increment DST to the
   next free point, and increment PRODUCED_CHARS.  The caller should
   assure that C is 0..127, and declare and set the variables `dst'
   and `produced_chars' appropriately in advance.  */

#define EMIT_ONE_ASCII_BYTE(c)  \
  do {                          \
    produced_chars++;           \
    *dst++ = (c);               \
  } while (0)
aa72b389
KH
778
779
/* Like EMIT_ONE_ASCII_BYTE, but store two bytes: C1 and then C2.
   PRODUCED_CHARS is incremented by two.  */

#define EMIT_TWO_ASCII_BYTES(c1, c2)    \
  do {                                  \
    produced_chars += 2;                \
    *dst++ = (c1);                      \
    *dst++ = (c2);                      \
  } while (0)
aa72b389
KH
787
788
df7492f9
KH
/* Store a byte C in the place pointed by DST and increment DST to the
   next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
   nonzero, store in an appropriate multibyte form: a byte of 0x80 or
   above is first converted with BYTE8_TO_CHAR, and the resulting
   character is stored with CHAR_STRING_ADVANCE.  The caller should
   declare and set the variables `dst', `multibytep' and
   `produced_chars' appropriately in advance.  */

#define EMIT_ONE_BYTE(c)                                \
  do {                                                  \
    produced_chars++;                                   \
    if (multibytep)                                     \
      {                                                 \
        /* Named so that an argument mentioning a       \
           caller's `ch' is not captured.  */           \
        int emit_byte_ch_ = (c);                        \
        if (emit_byte_ch_ >= 0x80)                      \
          emit_byte_ch_ = BYTE8_TO_CHAR (emit_byte_ch_); \
        CHAR_STRING_ADVANCE (emit_byte_ch_, dst);       \
      }                                                 \
    else                                                \
      *dst++ = (c);                                     \
  } while (0)
aa72b389 808
aa72b389 809
/* Like EMIT_ONE_BYTE, but emit two bytes: C1 and then C2.
   PRODUCED_CHARS is incremented by two.  */

#define EMIT_TWO_BYTES(c1, c2)                          \
  do {                                                  \
    produced_chars += 2;                                \
    if (multibytep)                                     \
      {                                                 \
        /* Named so that an argument mentioning a       \
           caller's `ch' is not captured.  */           \
        int emit_byte_ch_;                              \
                                                        \
        emit_byte_ch_ = (c1);                           \
        if (emit_byte_ch_ >= 0x80)                      \
          emit_byte_ch_ = BYTE8_TO_CHAR (emit_byte_ch_); \
        CHAR_STRING_ADVANCE (emit_byte_ch_, dst);       \
        emit_byte_ch_ = (c2);                           \
        if (emit_byte_ch_ >= 0x80)                      \
          emit_byte_ch_ = BYTE8_TO_CHAR (emit_byte_ch_); \
        CHAR_STRING_ADVANCE (emit_byte_ch_, dst);       \
      }                                                 \
    else                                                \
      {                                                 \
        *dst++ = (c1);                                  \
        *dst++ = (c2);                                  \
      }                                                 \
  } while (0)
834
835
df7492f9
KH
/* Emit the three bytes C1, C2 and C3, in that order, by means of
   EMIT_ONE_BYTE and EMIT_TWO_BYTES.  */
#define EMIT_THREE_BYTES(c1, c2, c3)    \
  do {                                  \
    EMIT_ONE_BYTE (c1);                 \
    EMIT_TWO_BYTES (c2, c3);            \
  } while (0)
aa72b389 841
aa72b389 842
df7492f9
KH
/* Emit the four bytes C1, C2, C3 and C4, in that order, with the same
   per-byte treatment as EMIT_ONE_BYTE.  */

#define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
  do {                                          \
    EMIT_ONE_BYTE (c1);                         \
    EMIT_THREE_BYTES (c2, c3, c4);              \
  } while (0)
aa72b389 848
aa72b389 849
f6cbaf43
KH
850/* Prototypes for static functions. */
851static void record_conversion_result P_ ((struct coding_system *coding,
852 enum coding_result_code result));
853static int detect_coding_utf_8 P_ ((struct coding_system *,
854 struct coding_detection_info *info));
855static void decode_coding_utf_8 P_ ((struct coding_system *));
856static int encode_coding_utf_8 P_ ((struct coding_system *));
857
858static int detect_coding_utf_16 P_ ((struct coding_system *,
859 struct coding_detection_info *info));
860static void decode_coding_utf_16 P_ ((struct coding_system *));
861static int encode_coding_utf_16 P_ ((struct coding_system *));
862
863static int detect_coding_iso_2022 P_ ((struct coding_system *,
864 struct coding_detection_info *info));
865static void decode_coding_iso_2022 P_ ((struct coding_system *));
866static int encode_coding_iso_2022 P_ ((struct coding_system *));
867
868static int detect_coding_emacs_mule P_ ((struct coding_system *,
869 struct coding_detection_info *info));
870static void decode_coding_emacs_mule P_ ((struct coding_system *));
871static int encode_coding_emacs_mule P_ ((struct coding_system *));
872
873static int detect_coding_sjis P_ ((struct coding_system *,
874 struct coding_detection_info *info));
875static void decode_coding_sjis P_ ((struct coding_system *));
876static int encode_coding_sjis P_ ((struct coding_system *));
877
878static int detect_coding_big5 P_ ((struct coding_system *,
879 struct coding_detection_info *info));
880static void decode_coding_big5 P_ ((struct coding_system *));
881static int encode_coding_big5 P_ ((struct coding_system *));
882
883static int detect_coding_ccl P_ ((struct coding_system *,
884 struct coding_detection_info *info));
885static void decode_coding_ccl P_ ((struct coding_system *));
886static int encode_coding_ccl P_ ((struct coding_system *));
887
888static void decode_coding_raw_text P_ ((struct coding_system *));
889static int encode_coding_raw_text P_ ((struct coding_system *));
890
891static void coding_set_source P_ ((struct coding_system *));
892static void coding_set_destination P_ ((struct coding_system *));
893static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
894static void coding_alloc_by_making_gap P_ ((struct coding_system *,
287c57d7 895 EMACS_INT, EMACS_INT));
f6cbaf43
KH
896static unsigned char *alloc_destination P_ ((struct coding_system *,
897 EMACS_INT, unsigned char *));
898static void setup_iso_safe_charsets P_ ((Lisp_Object));
899static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
900 int *, int *,
901 unsigned char *));
902static int detect_eol P_ ((const unsigned char *,
903 EMACS_INT, enum coding_category));
904static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
905static void decode_eol P_ ((struct coding_system *));
906static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
907static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *,
908 int, int *, int *));
909static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
910static INLINE void produce_composition P_ ((struct coding_system *, int *,
911 EMACS_INT));
912static INLINE void produce_charset P_ ((struct coding_system *, int *,
913 EMACS_INT));
914static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
915static int decode_coding P_ ((struct coding_system *));
916static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
3ed051d4 917 struct coding_system *,
f6cbaf43
KH
918 int *, EMACS_INT *));
919static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
920 struct coding_system *,
921 int *, EMACS_INT *));
922static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
923static int encode_coding P_ ((struct coding_system *));
924static Lisp_Object make_conversion_work_buffer P_ ((int));
925static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
926static INLINE int char_encodable_p P_ ((int, Lisp_Object));
927static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
928
065e3595
KH
929static void
930record_conversion_result (struct coding_system *coding,
931 enum coding_result_code result)
932{
933 coding->result = result;
934 switch (result)
935 {
936 case CODING_RESULT_INSUFFICIENT_SRC:
937 Vlast_code_conversion_error = Qinsufficient_source;
938 break;
939 case CODING_RESULT_INCONSISTENT_EOL:
940 Vlast_code_conversion_error = Qinconsistent_eol;
941 break;
942 case CODING_RESULT_INVALID_SRC:
943 Vlast_code_conversion_error = Qinvalid_source;
944 break;
945 case CODING_RESULT_INTERRUPT:
946 Vlast_code_conversion_error = Qinterrupted;
947 break;
948 case CODING_RESULT_INSUFFICIENT_MEM:
949 Vlast_code_conversion_error = Qinsufficient_memory;
950 break;
35befdaa
KH
951 default:
952 Vlast_code_conversion_error = intern ("Unknown error");
065e3595
KH
953 }
954}
955
df7492f9
KH
/* Decode CODE of CHARSET into C with DECODE_CHAR.  If that sets
   `charset_map_loaded', refresh CODING's source pointer with
   coding_set_source and shift SRC, SRC_BASE and SRC_END by however far
   coding->source moved, so that they keep pointing at the same text.  */

#define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
  do {                                                                  \
    charset_map_loaded = 0;                                             \
    c = DECODE_CHAR (charset, code);                                    \
    if (charset_map_loaded)                                             \
      {                                                                 \
        const unsigned char *old_source = coding->source;               \
        EMACS_INT delta;                                                \
                                                                        \
        coding_set_source (coding);                                     \
        delta = coding->source - old_source;                            \
        src += delta;                                                   \
        src_base += delta;                                              \
        src_end += delta;                                               \
      }                                                                 \
  } while (0)
972
973
119852e7
KH
/* Unless more than BYTES bytes of room remain between DST and
   DST_END, enlarge coding->destination with alloc_destination and
   update DST and DST_END to match.  The amount requested is BYTES plus
   the number of elements still pending in the charbuf.  We don't have
   to take care of coding->source, which may be relocated as well; that
   is handled by calling coding_set_source in encode_coding.  */

#define ASSURE_DESTINATION(bytes)                               \
  do {                                                          \
    if (dst_end <= dst + (bytes))                               \
      {                                                         \
        int extra_bytes = charbuf_end - charbuf + (bytes);      \
                                                                \
        dst = alloc_destination (coding, extra_bytes, dst);     \
        dst_end = coding->destination + coding->dst_bytes;      \
      }                                                         \
  } while (0)
aa72b389 989
aa72b389 990
db274c7a
KH
/* Store the multibyte form of the character C at P, and advance P to
   the end of that multibyte form.  This is like CHAR_STRING_ADVANCE
   but it never calls MAYBE_UNIFY_CHAR.

   The form is chosen by comparing C against MAX_1_BYTE_CHAR ...
   MAX_5_BYTE_CHAR; anything larger is written by BYTE8_STRING after
   subtracting 0x3FFF80.  C and P are evaluated several times, so
   neither may have side effects.  Every use of C is parenthesized so
   that an expression argument is shifted and masked as a whole.  */

#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)              \
  do {                                                  \
    if ((c) <= MAX_1_BYTE_CHAR)                         \
      *(p)++ = (c);                                     \
    else if ((c) <= MAX_2_BYTE_CHAR)                    \
      *(p)++ = (0xC0 | ((c) >> 6)),                     \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_3_BYTE_CHAR)                    \
      *(p)++ = (0xE0 | ((c) >> 12)),                    \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_4_BYTE_CHAR)                    \
      *(p)++ = (0xF0 | ((c) >> 18)),                    \
        *(p)++ = (0x80 | (((c) >> 12) & 0x3F)),         \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_5_BYTE_CHAR)                    \
      *(p)++ = 0xF8,                                    \
        *(p)++ = (0x80 | (((c) >> 18) & 0x0F)),         \
        *(p)++ = (0x80 | (((c) >> 12) & 0x3F)),         \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else                                                \
      (p) += BYTE8_STRING ((c) - 0x3FFF80, p);          \
  } while (0)
1020
1021
/* Yield the character code of the multibyte form found at P and
   advance P past that form.  This is like STRING_CHAR_ADVANCE, but it
   never calls MAYBE_UNIFY_CHAR.

   The length of the form (1 to 5 bytes) is read from the high bits of
   the first byte.  A two-byte form whose leading byte is below 0xC2
   gets 0x3FFF80 OR-ed into the result.  P is evaluated several times,
   so it must be a side-effect-free lvalue.  */

#define STRING_CHAR_ADVANCE_NO_UNIFY(p)                         \
  (((p)[0] & 0x80) == 0                                         \
   ? *(p)++                                                     \
   : ((p)[0] & 0x20) == 0                                       \
   ? ((p) += 2,                                                 \
      (((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)        \
       | (((p)[-2] & 0x1F) << 6)                                \
       | ((p)[-1] & 0x3F)))                                     \
   : ((p)[0] & 0x10) == 0                                       \
   ? ((p) += 3,                                                 \
      ((((p)[-3] & 0x0F) << 12)                                 \
       | (((p)[-2] & 0x3F) << 6)                                \
       | ((p)[-1] & 0x3F)))                                     \
   : ((p)[0] & 0x08) == 0                                       \
   ? ((p) += 4,                                                 \
      ((((p)[-4] & 0x0F) << 18)                                 \
       | (((p)[-3] & 0x3F) << 12)                               \
       | (((p)[-2] & 0x3F) << 6)                                \
       | ((p)[-1] & 0x3F)))                                     \
   : ((p) += 5,                                                 \
      ((((p)[-4] & 0x3F) << 18)                                 \
       | (((p)[-3] & 0x3F) << 12)                               \
       | (((p)[-2] & 0x3F) << 6)                                \
       | ((p)[-1] & 0x3F))))
1050
aa72b389 1051
df7492f9
KH
1052static void
1053coding_set_source (coding)
aa72b389 1054 struct coding_system *coding;
aa72b389 1055{
df7492f9
KH
1056 if (BUFFERP (coding->src_object))
1057 {
2cb26057 1058 struct buffer *buf = XBUFFER (coding->src_object);
aa72b389 1059
df7492f9 1060 if (coding->src_pos < 0)
2cb26057 1061 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
df7492f9 1062 else
2cb26057 1063 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
aa72b389 1064 }
df7492f9 1065 else if (STRINGP (coding->src_object))
aa72b389 1066 {
8f924df7 1067 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
aa72b389 1068 }
df7492f9
KH
1069 else
1070 /* Otherwise, the source is C string and is never relocated
1071 automatically. Thus we don't have to update anything. */
1072 ;
1073}
aa72b389 1074
df7492f9
KH
1075static void
1076coding_set_destination (coding)
1077 struct coding_system *coding;
1078{
1079 if (BUFFERP (coding->dst_object))
aa72b389 1080 {
df7492f9 1081 if (coding->src_pos < 0)
aa72b389 1082 {
13818c30 1083 coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
28f67a95
KH
1084 coding->dst_bytes = (GAP_END_ADDR
1085 - (coding->src_bytes - coding->consumed)
1086 - coding->destination);
aa72b389 1087 }
df7492f9 1088 else
28f67a95
KH
1089 {
1090 /* We are sure that coding->dst_pos_byte is before the gap
1091 of the buffer. */
1092 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
13818c30 1093 + coding->dst_pos_byte - BEG_BYTE);
28f67a95
KH
1094 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1095 - coding->destination);
1096 }
df7492f9
KH
1097 }
1098 else
1099 /* Otherwise, the destination is C string and is never relocated
1100 automatically. Thus we don't have to update anything. */
1101 ;
1102}
1103
1104
1105static void
1106coding_alloc_by_realloc (coding, bytes)
1107 struct coding_system *coding;
1108 EMACS_INT bytes;
1109{
1110 coding->destination = (unsigned char *) xrealloc (coding->destination,
1111 coding->dst_bytes + bytes);
1112 coding->dst_bytes += bytes;
1113}
1114
1115static void
db274c7a 1116coding_alloc_by_making_gap (coding, gap_head_used, bytes)
df7492f9 1117 struct coding_system *coding;
db274c7a 1118 EMACS_INT gap_head_used, bytes;
df7492f9 1119{
db274c7a 1120 if (EQ (coding->src_object, coding->dst_object))
df7492f9 1121 {
db274c7a
KH
1122 /* The gap may contain the produced data at the head and not-yet
1123 consumed data at the tail. To preserve those data, we at
1124 first make the gap size to zero, then increase the gap
1125 size. */
1126 EMACS_INT add = GAP_SIZE;
1127
1128 GPT += gap_head_used, GPT_BYTE += gap_head_used;
1129 GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
df7492f9
KH
1130 make_gap (bytes);
1131 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
db274c7a 1132 GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
df7492f9 1133 }
730fff51 1134 else
df7492f9 1135 {
2c78b7e1
KH
1136 Lisp_Object this_buffer;
1137
1138 this_buffer = Fcurrent_buffer ();
df7492f9
KH
1139 set_buffer_internal (XBUFFER (coding->dst_object));
1140 make_gap (bytes);
1141 set_buffer_internal (XBUFFER (this_buffer));
aa72b389 1142 }
df7492f9 1143}
8f924df7 1144
df7492f9
KH
1145
1146static unsigned char *
1147alloc_destination (coding, nbytes, dst)
1148 struct coding_system *coding;
3e139625 1149 EMACS_INT nbytes;
df7492f9
KH
1150 unsigned char *dst;
1151{
1152 EMACS_INT offset = dst - coding->destination;
1153
1154 if (BUFFERP (coding->dst_object))
db274c7a
KH
1155 {
1156 struct buffer *buf = XBUFFER (coding->dst_object);
1157
1158 coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1159 }
aa72b389 1160 else
df7492f9 1161 coding_alloc_by_realloc (coding, nbytes);
065e3595 1162 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
1163 coding_set_destination (coding);
1164 dst = coding->destination + offset;
1165 return dst;
1166}
aa72b389 1167
ff0dacd7
KH
1168/** Macros for annotations. */
1169
/* Upper bound on the number of charbuf elements taken by annotation
   data, i.e. the sum of one composition annotation and one charset
   annotation.  */
#define MAX_ANNOTATION_LENGTH \
  (4 + ((MAX_COMPOSITION_COMPONENTS * 2) - 1) + 4)
ff0dacd7
KH
1173
1174/* An annotation data is stored in the array coding->charbuf in this
1175 format:
69a80ea3 1176 [ -LENGTH ANNOTATION_MASK NCHARS ... ]
ff0dacd7
KH
1177 LENGTH is the number of elements in the annotation.
1178 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
69a80ea3 1179 NCHARS is the number of characters in the text annotated.
ff0dacd7
KH
1180
1181 The format of the following elements depends on ANNOTATION_MASK.
1182
1183 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1184 follow:
1185 ... METHOD [ COMPOSITION-COMPONENTS ... ]
1186 METHOD is one of enum composition_method.
1187 Optional COMPOSITION-COMPONENTS are characters and composition
1188 rules.
1189
1190 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1191 follows. */
1192
/* Append the three-element annotation header [ -LEN MASK NCHARS ] at
   BUF, advance BUF past it, and flag `coding' as annotated.  BUF is
   evaluated three times and must be a modifiable pointer lvalue; a
   variable named `coding' must be in scope.

   The expansion deliberately ends without a semicolon so that
   `ADD_ANNOTATION_DATA (...);' is exactly one statement and can be
   used as the body of an `if' that has an `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1200
69a80ea3
KH
/* Append a composition annotation at BUF: the common header (length
   4, CODING_ANNOTATE_COMPOSITION_MASK, NCHARS) followed by METHOD.
   BUF is advanced past the four elements.  */

#define ADD_COMPOSITION_DATA(buf, nchars, method)                       \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_COMPOSITION_MASK,      \
                         nchars);                                       \
    *(buf)++ = (method);                                                \
  } while (0)
1206
1207
69a80ea3
KH
/* Append a charset annotation at BUF: the common header (length 4,
   CODING_ANNOTATE_CHARSET_MASK, NCHARS) followed by the charset ID.
   BUF is advanced past the four elements.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK,          \
                         nchars);                                       \
    *(buf)++ = (id);                                                    \
  } while (0)
1213
df7492f9
KH
1214\f
1215/*** 2. Emacs' internal format (emacs-utf-8) ***/
1216
1217
1218
1219\f
1220/*** 3. UTF-8 ***/
1221
1222/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
1223 Check if a text is encoded in UTF-8. If it is, return 1, else
1224 return 0. */
df7492f9
KH
1225
/* Classification of a single UTF-8 octet C by its high bits.  */

/* Below 0x80: a complete one-octet sequence.  */
#define UTF_8_1_OCTET_P(c)              ((c) < 0x80)
/* 10xxxxxx: a trailing octet of a longer sequence.  */
#define UTF_8_EXTRA_OCTET_P(c)          (((c) & 0xC0) == 0x80)
/* 110xxxxx: leading octet of a two-octet sequence.  */
#define UTF_8_2_OCTET_LEADING_P(c)      (((c) & 0xE0) == 0xC0)
/* 1110xxxx: leading octet of a three-octet sequence.  */
#define UTF_8_3_OCTET_LEADING_P(c)      (((c) & 0xF0) == 0xE0)
/* 11110xxx: leading octet of a four-octet sequence.  */
#define UTF_8_4_OCTET_LEADING_P(c)      (((c) & 0xF8) == 0xF0)
/* 111110xx: leading octet of a five-octet sequence.  */
#define UTF_8_5_OCTET_LEADING_P(c)      (((c) & 0xFC) == 0xF8)

/* The byte order mark character, and the three octets that encode it
   in UTF-8.  */
#define UTF_BOM         0xFEFF
#define UTF_8_BOM_1     0xEF
#define UTF_8_BOM_2     0xBB
#define UTF_8_BOM_3     0xBF
1237
df7492f9 1238static int
ff0dacd7 1239detect_coding_utf_8 (coding, detect_info)
df7492f9 1240 struct coding_system *coding;
ff0dacd7 1241 struct coding_detection_info *detect_info;
df7492f9 1242{
065e3595 1243 const unsigned char *src = coding->source, *src_base;
8f924df7 1244 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
1245 int multibytep = coding->src_multibyte;
1246 int consumed_chars = 0;
a470d443 1247 int bom_found = 0;
df7492f9
KH
1248 int found = 0;
1249
ff0dacd7 1250 detect_info->checked |= CATEGORY_MASK_UTF_8;
df7492f9
KH
1251 /* A coding system of this category is always ASCII compatible. */
1252 src += coding->head_ascii;
1253
1254 while (1)
aa72b389 1255 {
df7492f9 1256 int c, c1, c2, c3, c4;
aa72b389 1257
065e3595 1258 src_base = src;
df7492f9 1259 ONE_MORE_BYTE (c);
065e3595 1260 if (c < 0 || UTF_8_1_OCTET_P (c))
df7492f9
KH
1261 continue;
1262 ONE_MORE_BYTE (c1);
065e3595 1263 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
df7492f9
KH
1264 break;
1265 if (UTF_8_2_OCTET_LEADING_P (c))
aa72b389 1266 {
a470d443 1267 found = 1;
df7492f9 1268 continue;
aa72b389 1269 }
df7492f9 1270 ONE_MORE_BYTE (c2);
065e3595 1271 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
df7492f9
KH
1272 break;
1273 if (UTF_8_3_OCTET_LEADING_P (c))
aa72b389 1274 {
a470d443
KH
1275 found = 1;
1276 if (src_base == coding->source
1277 && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1278 bom_found = 1;
df7492f9 1279 continue;
aa72b389 1280 }
df7492f9 1281 ONE_MORE_BYTE (c3);
065e3595 1282 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
df7492f9
KH
1283 break;
1284 if (UTF_8_4_OCTET_LEADING_P (c))
aa72b389 1285 {
a470d443 1286 found = 1;
df7492f9
KH
1287 continue;
1288 }
1289 ONE_MORE_BYTE (c4);
065e3595 1290 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
df7492f9
KH
1291 break;
1292 if (UTF_8_5_OCTET_LEADING_P (c))
1293 {
a470d443 1294 found = 1;
df7492f9
KH
1295 continue;
1296 }
1297 break;
aa72b389 1298 }
ff0dacd7 1299 detect_info->rejected |= CATEGORY_MASK_UTF_8;
df7492f9 1300 return 0;
aa72b389 1301
df7492f9 1302 no_more_source:
065e3595 1303 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
aa72b389 1304 {
ff0dacd7 1305 detect_info->rejected |= CATEGORY_MASK_UTF_8;
89528eb3 1306 return 0;
aa72b389 1307 }
a470d443
KH
1308 if (bom_found)
1309 {
1310 /* The first character 0xFFFE doesn't necessarily mean a BOM. */
1311 detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1312 }
1313 else
1314 {
1315 detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
0e17387a
KH
1316 if (found)
1317 detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
a470d443 1318 }
ff0dacd7 1319 return 1;
aa72b389
KH
1320}
1321
4ed46869 1322
b73bfc1c 1323static void
df7492f9 1324decode_coding_utf_8 (coding)
b73bfc1c 1325 struct coding_system *coding;
b73bfc1c 1326{
8f924df7
KH
1327 const unsigned char *src = coding->source + coding->consumed;
1328 const unsigned char *src_end = coding->source + coding->src_bytes;
1329 const unsigned char *src_base;
69a80ea3
KH
1330 int *charbuf = coding->charbuf + coding->charbuf_used;
1331 int *charbuf_end = coding->charbuf + coding->charbuf_size;
453b38f0 1332 int consumed_chars = 0, consumed_chars_base = 0;
df7492f9 1333 int multibytep = coding->src_multibyte;
a470d443 1334 enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
24a73b0a 1335 Lisp_Object attr, charset_list;
119852e7
KH
1336 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1337 int byte_after_cr = -1;
4ed46869 1338
24a73b0a 1339 CODING_GET_INFO (coding, attr, charset_list);
df7492f9 1340
a470d443
KH
1341 if (bom != utf_without_bom)
1342 {
1343 int c1, c2, c3;
1344
1345 src_base = src;
1346 ONE_MORE_BYTE (c1);
1347 if (! UTF_8_3_OCTET_LEADING_P (c1))
1348 src = src_base;
1349 else
1350 {
159bd5a2 1351 ONE_MORE_BYTE (c2);
a470d443
KH
1352 if (! UTF_8_EXTRA_OCTET_P (c2))
1353 src = src_base;
1354 else
1355 {
159bd5a2 1356 ONE_MORE_BYTE (c3);
a470d443
KH
1357 if (! UTF_8_EXTRA_OCTET_P (c3))
1358 src = src_base;
1359 else
1360 {
1361 if ((c1 != UTF_8_BOM_1)
1362 || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1363 src = src_base;
1364 else
1365 CODING_UTF_8_BOM (coding) = utf_without_bom;
1366 }
1367 }
1368 }
1369 }
1370 CODING_UTF_8_BOM (coding) = utf_without_bom;
1371
1372
1373
df7492f9 1374 while (1)
b73bfc1c 1375 {
df7492f9 1376 int c, c1, c2, c3, c4, c5;
ec6d2bb8 1377
df7492f9
KH
1378 src_base = src;
1379 consumed_chars_base = consumed_chars;
4af310db 1380
df7492f9 1381 if (charbuf >= charbuf_end)
b71f6f73
KH
1382 {
1383 if (byte_after_cr >= 0)
1384 src_base--;
1385 break;
1386 }
df7492f9 1387
119852e7
KH
1388 if (byte_after_cr >= 0)
1389 c1 = byte_after_cr, byte_after_cr = -1;
1390 else
1391 ONE_MORE_BYTE (c1);
065e3595
KH
1392 if (c1 < 0)
1393 {
1394 c = - c1;
1395 }
1396 else if (UTF_8_1_OCTET_P(c1))
df7492f9 1397 {
119852e7
KH
1398 if (eol_crlf && c1 == '\r')
1399 ONE_MORE_BYTE (byte_after_cr);
df7492f9 1400 c = c1;
4af310db 1401 }
df7492f9 1402 else
4af310db 1403 {
df7492f9 1404 ONE_MORE_BYTE (c2);
065e3595 1405 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
df7492f9
KH
1406 goto invalid_code;
1407 if (UTF_8_2_OCTET_LEADING_P (c1))
4af310db 1408 {
b0edb2c5
DL
1409 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1410 /* Reject overlong sequences here and below. Encoders
1411 producing them are incorrect, they can be misleading,
1412 and they mess up read/write invariance. */
1413 if (c < 128)
1414 goto invalid_code;
4af310db 1415 }
df7492f9 1416 else
aa72b389 1417 {
df7492f9 1418 ONE_MORE_BYTE (c3);
065e3595 1419 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
df7492f9
KH
1420 goto invalid_code;
1421 if (UTF_8_3_OCTET_LEADING_P (c1))
b0edb2c5
DL
1422 {
1423 c = (((c1 & 0xF) << 12)
1424 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
72fe1301
DL
1425 if (c < 0x800
1426 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
b0edb2c5
DL
1427 goto invalid_code;
1428 }
df7492f9
KH
1429 else
1430 {
1431 ONE_MORE_BYTE (c4);
065e3595 1432 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
df7492f9
KH
1433 goto invalid_code;
1434 if (UTF_8_4_OCTET_LEADING_P (c1))
b0edb2c5 1435 {
df7492f9
KH
1436 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1437 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
b0edb2c5
DL
1438 if (c < 0x10000)
1439 goto invalid_code;
1440 }
df7492f9
KH
1441 else
1442 {
1443 ONE_MORE_BYTE (c5);
065e3595 1444 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
df7492f9
KH
1445 goto invalid_code;
1446 if (UTF_8_5_OCTET_LEADING_P (c1))
1447 {
1448 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1449 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1450 | (c5 & 0x3F));
b0edb2c5 1451 if ((c > MAX_CHAR) || (c < 0x200000))
df7492f9
KH
1452 goto invalid_code;
1453 }
1454 else
1455 goto invalid_code;
1456 }
1457 }
aa72b389 1458 }
b73bfc1c 1459 }
df7492f9
KH
1460
1461 *charbuf++ = c;
1462 continue;
1463
1464 invalid_code:
1465 src = src_base;
1466 consumed_chars = consumed_chars_base;
1467 ONE_MORE_BYTE (c);
1468 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1469 coding->errors++;
aa72b389
KH
1470 }
1471
df7492f9
KH
1472 no_more_source:
1473 coding->consumed_char += consumed_chars_base;
1474 coding->consumed = src_base - coding->source;
1475 coding->charbuf_used = charbuf - coding->charbuf;
1476}
1477
1478
1479static int
1480encode_coding_utf_8 (coding)
1481 struct coding_system *coding;
1482{
1483 int multibytep = coding->dst_multibyte;
1484 int *charbuf = coding->charbuf;
1485 int *charbuf_end = charbuf + coding->charbuf_used;
1486 unsigned char *dst = coding->destination + coding->produced;
1487 unsigned char *dst_end = coding->destination + coding->dst_bytes;
e19c3639 1488 int produced_chars = 0;
df7492f9
KH
1489 int c;
1490
a470d443
KH
1491 if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1492 {
1493 ASSURE_DESTINATION (3);
1494 EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1495 CODING_UTF_8_BOM (coding) = utf_without_bom;
1496 }
1497
df7492f9 1498 if (multibytep)
aa72b389 1499 {
df7492f9
KH
1500 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1501
1502 while (charbuf < charbuf_end)
b73bfc1c 1503 {
df7492f9 1504 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
8f924df7 1505
df7492f9
KH
1506 ASSURE_DESTINATION (safe_room);
1507 c = *charbuf++;
28f67a95
KH
1508 if (CHAR_BYTE8_P (c))
1509 {
1510 c = CHAR_TO_BYTE8 (c);
1511 EMIT_ONE_BYTE (c);
1512 }
1513 else
1514 {
db274c7a 1515 CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
28f67a95
KH
1516 for (p = str; p < pend; p++)
1517 EMIT_ONE_BYTE (*p);
1518 }
b73bfc1c 1519 }
aa72b389 1520 }
df7492f9
KH
1521 else
1522 {
1523 int safe_room = MAX_MULTIBYTE_LENGTH;
1524
1525 while (charbuf < charbuf_end)
b73bfc1c 1526 {
df7492f9
KH
1527 ASSURE_DESTINATION (safe_room);
1528 c = *charbuf++;
f03caae0
KH
1529 if (CHAR_BYTE8_P (c))
1530 *dst++ = CHAR_TO_BYTE8 (c);
1531 else
db274c7a 1532 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
df7492f9 1533 produced_chars++;
4ed46869
KH
1534 }
1535 }
065e3595 1536 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
1537 coding->produced_char += produced_chars;
1538 coding->produced = dst - coding->destination;
1539 return 0;
4ed46869
KH
1540}
1541
b73bfc1c 1542
df7492f9 1543/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
1544 Check if a text is encoded in one of UTF-16 based coding systems.
1545 If it is, return 1, else return 0. */
aa72b389 1546
df7492f9
KH
/* Non-zero if the low 16 bits of VAL lie in 0xD800..0xDBFF, i.e. VAL
   is the first unit of a surrogate pair.  */
#define UTF_16_HIGH_SURROGATE_P(val)    (((val) & 0xFC00) == 0xD800)

/* Non-zero if the low 16 bits of VAL lie in 0xDC00..0xDFFF, i.e. VAL
   is the second unit of a surrogate pair.  */
#define UTF_16_LOW_SURROGATE_P(val)     (((val) & 0xFC00) == 0xDC00)

/* Non-zero if VAL is 0xFFFE, 0xFFFF or a low surrogate.  */
#define UTF_16_INVALID_P(val)           \
  (((val) == 0xFFFE)                    \
   || ((val) == 0xFFFF)                 \
   || UTF_16_LOW_SURROGATE_P (val))
aa72b389 1557
aa72b389 1558
df7492f9 1559static int
ff0dacd7 1560detect_coding_utf_16 (coding, detect_info)
aa72b389 1561 struct coding_system *coding;
ff0dacd7 1562 struct coding_detection_info *detect_info;
aa72b389 1563{
8f924df7
KH
1564 const unsigned char *src = coding->source, *src_base = src;
1565 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
1566 int multibytep = coding->src_multibyte;
1567 int consumed_chars = 0;
1568 int c1, c2;
aa72b389 1569
ff0dacd7 1570 detect_info->checked |= CATEGORY_MASK_UTF_16;
ff0dacd7 1571 if (coding->mode & CODING_MODE_LAST_BLOCK
24a73b0a 1572 && (coding->src_chars & 1))
ff0dacd7
KH
1573 {
1574 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1575 return 0;
1576 }
24a73b0a 1577
df7492f9
KH
1578 ONE_MORE_BYTE (c1);
1579 ONE_MORE_BYTE (c2);
df7492f9 1580 if ((c1 == 0xFF) && (c2 == 0xFE))
aa72b389 1581 {
b49a1807
KH
1582 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1583 | CATEGORY_MASK_UTF_16_AUTO);
24a73b0a
KH
1584 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1585 | CATEGORY_MASK_UTF_16_BE_NOSIG
1586 | CATEGORY_MASK_UTF_16_LE_NOSIG);
aa72b389 1587 }
df7492f9 1588 else if ((c1 == 0xFE) && (c2 == 0xFF))
ff0dacd7 1589 {
b49a1807
KH
1590 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1591 | CATEGORY_MASK_UTF_16_AUTO);
24a73b0a
KH
1592 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1593 | CATEGORY_MASK_UTF_16_BE_NOSIG
1594 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1595 }
2f3cbb32 1596 else
24a73b0a 1597 {
2f3cbb32
KH
1598 /* We check the dispersion of Eth and Oth bytes where E is even and
1599 O is odd. If both are high, we assume binary data.*/
1600 unsigned char e[256], o[256];
1601 unsigned e_num = 1, o_num = 1;
1602
1603 memset (e, 0, 256);
1604 memset (o, 0, 256);
1605 e[c1] = 1;
1606 o[c2] = 1;
1607
24a73b0a
KH
1608 detect_info->rejected
1609 |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
2f3cbb32
KH
1610
1611 while (1)
1612 {
1613 ONE_MORE_BYTE (c1);
1614 ONE_MORE_BYTE (c2);
1615 if (! e[c1])
1616 {
1617 e[c1] = 1;
1618 e_num++;
1619 if (e_num >= 128)
1620 break;
1621 }
1622 if (! o[c2])
1623 {
1624 o[c1] = 1;
1625 o_num++;
1626 if (o_num >= 128)
1627 break;
1628 }
1629 }
1630 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1631 return 0;
ff0dacd7 1632 }
2f3cbb32 1633
df7492f9 1634 no_more_source:
ff0dacd7 1635 return 1;
df7492f9 1636}
aa72b389 1637
df7492f9
KH
1638static void
1639decode_coding_utf_16 (coding)
1640 struct coding_system *coding;
1641{
8f924df7
KH
1642 const unsigned char *src = coding->source + coding->consumed;
1643 const unsigned char *src_end = coding->source + coding->src_bytes;
1644 const unsigned char *src_base;
69a80ea3
KH
1645 int *charbuf = coding->charbuf + coding->charbuf_used;
1646 int *charbuf_end = coding->charbuf + coding->charbuf_size;
3a8406e1 1647 int consumed_chars = 0, consumed_chars_base = 0;
df7492f9 1648 int multibytep = coding->src_multibyte;
a470d443 1649 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
df7492f9
KH
1650 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1651 int surrogate = CODING_UTF_16_SURROGATE (coding);
24a73b0a 1652 Lisp_Object attr, charset_list;
119852e7
KH
1653 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1654 int byte_after_cr1 = -1, byte_after_cr2 = -1;
df7492f9 1655
24a73b0a 1656 CODING_GET_INFO (coding, attr, charset_list);
df7492f9 1657
a470d443 1658 if (bom == utf_with_bom)
aa72b389 1659 {
df7492f9 1660 int c, c1, c2;
4af310db 1661
aa72b389 1662 src_base = src;
df7492f9
KH
1663 ONE_MORE_BYTE (c1);
1664 ONE_MORE_BYTE (c2);
e19c3639 1665 c = (c1 << 8) | c2;
aa72b389 1666
b49a1807
KH
1667 if (endian == utf_16_big_endian
1668 ? c != 0xFEFF : c != 0xFFFE)
aa72b389 1669 {
b49a1807
KH
1670 /* The first two bytes are not BOM. Treat them as bytes
1671 for a normal character. */
1672 src = src_base;
1673 coding->errors++;
aa72b389 1674 }
a470d443 1675 CODING_UTF_16_BOM (coding) = utf_without_bom;
b49a1807 1676 }
a470d443 1677 else if (bom == utf_detect_bom)
b49a1807
KH
1678 {
1679 /* We have already tried to detect BOM and failed in
1680 detect_coding. */
a470d443 1681 CODING_UTF_16_BOM (coding) = utf_without_bom;
df7492f9 1682 }
aa72b389 1683
df7492f9
KH
1684 while (1)
1685 {
1686 int c, c1, c2;
1687
1688 src_base = src;
1689 consumed_chars_base = consumed_chars;
1690
1691 if (charbuf + 2 >= charbuf_end)
b71f6f73
KH
1692 {
1693 if (byte_after_cr1 >= 0)
1694 src_base -= 2;
1695 break;
1696 }
df7492f9 1697
119852e7
KH
1698 if (byte_after_cr1 >= 0)
1699 c1 = byte_after_cr1, byte_after_cr1 = -1;
1700 else
1701 ONE_MORE_BYTE (c1);
065e3595
KH
1702 if (c1 < 0)
1703 {
1704 *charbuf++ = -c1;
1705 continue;
1706 }
119852e7
KH
1707 if (byte_after_cr2 >= 0)
1708 c2 = byte_after_cr2, byte_after_cr2 = -1;
1709 else
1710 ONE_MORE_BYTE (c2);
065e3595
KH
1711 if (c2 < 0)
1712 {
1713 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1714 *charbuf++ = -c2;
1715 continue;
1716 }
df7492f9 1717 c = (endian == utf_16_big_endian
e19c3639 1718 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
119852e7 1719
df7492f9 1720 if (surrogate)
fd3ae0b9 1721 {
df7492f9 1722 if (! UTF_16_LOW_SURROGATE_P (c))
fd3ae0b9 1723 {
df7492f9
KH
1724 if (endian == utf_16_big_endian)
1725 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1726 else
1727 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1728 *charbuf++ = c1;
1729 *charbuf++ = c2;
1730 coding->errors++;
1731 if (UTF_16_HIGH_SURROGATE_P (c))
1732 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
fd3ae0b9 1733 else
df7492f9 1734 *charbuf++ = c;
fd3ae0b9
KH
1735 }
1736 else
df7492f9
KH
1737 {
1738 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1739 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
29f7ffd0 1740 *charbuf++ = 0x10000 + c;
df7492f9 1741 }
fd3ae0b9 1742 }
aa72b389 1743 else
df7492f9
KH
1744 {
1745 if (UTF_16_HIGH_SURROGATE_P (c))
1746 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1747 else
119852e7
KH
1748 {
1749 if (eol_crlf && c == '\r')
1750 {
1751 ONE_MORE_BYTE (byte_after_cr1);
1752 ONE_MORE_BYTE (byte_after_cr2);
1753 }
1754 *charbuf++ = c;
1755 }
8f924df7 1756 }
aa72b389 1757 }
df7492f9
KH
1758
1759 no_more_source:
1760 coding->consumed_char += consumed_chars_base;
1761 coding->consumed = src_base - coding->source;
1762 coding->charbuf_used = charbuf - coding->charbuf;
aa72b389 1763}
b73bfc1c 1764
df7492f9
KH
1765static int
1766encode_coding_utf_16 (coding)
1767 struct coding_system *coding;
1768{
1769 int multibytep = coding->dst_multibyte;
1770 int *charbuf = coding->charbuf;
1771 int *charbuf_end = charbuf + coding->charbuf_used;
1772 unsigned char *dst = coding->destination + coding->produced;
1773 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1774 int safe_room = 8;
a470d443 1775 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
df7492f9
KH
1776 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1777 int produced_chars = 0;
24a73b0a 1778 Lisp_Object attrs, charset_list;
df7492f9 1779 int c;
4ed46869 1780
24a73b0a 1781 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9 1782
a470d443 1783 if (bom != utf_without_bom)
df7492f9
KH
1784 {
1785 ASSURE_DESTINATION (safe_room);
1786 if (big_endian)
df7492f9 1787 EMIT_TWO_BYTES (0xFE, 0xFF);
880cf180
KH
1788 else
1789 EMIT_TWO_BYTES (0xFF, 0xFE);
a470d443 1790 CODING_UTF_16_BOM (coding) = utf_without_bom;
df7492f9
KH
1791 }
1792
1793 while (charbuf < charbuf_end)
1794 {
1795 ASSURE_DESTINATION (safe_room);
1796 c = *charbuf++;
e19c3639
KH
1797 if (c >= MAX_UNICODE_CHAR)
1798 c = coding->default_char;
df7492f9
KH
1799
1800 if (c < 0x10000)
1801 {
1802 if (big_endian)
1803 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1804 else
1805 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1806 }
1807 else
1808 {
1809 int c1, c2;
1810
1811 c -= 0x10000;
1812 c1 = (c >> 10) + 0xD800;
1813 c2 = (c & 0x3FF) + 0xDC00;
1814 if (big_endian)
1815 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1816 else
1817 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1818 }
1819 }
065e3595 1820 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
1821 coding->produced = dst - coding->destination;
1822 coding->produced_char += produced_chars;
1823 return 0;
1824}
1825
1826\f
1827/*** 6. Old Emacs' internal format (emacs-mule) ***/
1828
1829/* Emacs' internal format for representation of multiple character
1830 sets is a kind of multi-byte encoding, i.e. characters are
1831 represented by variable-length sequences of one-byte codes.
1832
1833 ASCII characters and control characters (e.g. `tab', `newline') are
1834 represented by one-byte sequences which are their ASCII codes, in
1835 the range 0x00 through 0x7F.
1836
1837 8-bit characters of the range 0x80..0x9F are represented by
1838 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1839 code + 0x20).
1840
1841 8-bit characters of the range 0xA0..0xFF are represented by
1842 one-byte sequences which are their 8-bit code.
1843
1844 The other characters are represented by a sequence of `base
1845 leading-code', optional `extended leading-code', and one or two
1846 `position-code's. The length of the sequence is determined by the
1847 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1848 whereas extended leading-code and position-code take the range 0xA0
1849 through 0xFF. See `charset.h' for more details about leading-code
1850 and position-code.
1851
1852 --- CODE RANGE of Emacs' internal format ---
1853 character set range
1854 ------------- -----
1855 ascii 0x00..0x7F
1856 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1857 eight-bit-graphic 0xA0..0xBF
1858 ELSE 0x81..0x9D + [0xA0..0xFF]+
1859 ---------------------------------------------
1860
1861 As this is the internal character representation, the format is
1862 usually not used externally (i.e. in a file or in a data sent to a
1863 process). But, it is possible to have a text externally in this
1864 format (i.e. by encoding by the coding system `emacs-mule').
1865
1866 In that case, a sequence of one-byte codes has a slightly different
1867 form.
1868
1869 At first, all characters in eight-bit-control are represented by
1870 one-byte sequences which are their 8-bit code.
1871
1872 Next, character composition data are represented by the byte
1873 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1874 where,
1875 METHOD is 0xF2 plus one of composition method (enum
1876 composition_method),
1877
1878 BYTES is 0xA0 plus a byte length of this composition data,
1879
1880 CHARS is 0xA0 plus a number of characters composed by this
1881 data,
1882
1883 COMPONENTs are characters of multibyte form or composition
1884 rules encoded as two bytes of ASCII codes.
1885
1886 In addition, for backward compatibility, the following formats are
1887 also recognized as composition data on decoding.
1888
1889 0x80 MSEQ ...
1890 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1891
1892 Here,
1893 MSEQ is a multibyte form but in these special format:
1894 ASCII: 0xA0 ASCII_CODE+0x80,
1895 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1896 RULE is a one byte code of the range 0xA0..0xF0 that
1897 represents a composition rule.
1898 */
1899
1900char emacs_mule_bytes[256];
1901
df7492f9 1902int
ff0dacd7 1903emacs_mule_char (coding, src, nbytes, nchars, id)
df7492f9 1904 struct coding_system *coding;
065e3595 1905 const unsigned char *src;
ff0dacd7 1906 int *nbytes, *nchars, *id;
df7492f9 1907{
8f924df7
KH
1908 const unsigned char *src_end = coding->source + coding->src_bytes;
1909 const unsigned char *src_base = src;
df7492f9 1910 int multibytep = coding->src_multibyte;
df7492f9
KH
1911 struct charset *charset;
1912 unsigned code;
1913 int c;
1914 int consumed_chars = 0;
1915
1916 ONE_MORE_BYTE (c);
065e3595 1917 if (c < 0)
df7492f9 1918 {
065e3595
KH
1919 c = -c;
1920 charset = emacs_mule_charset[0];
1921 }
1922 else
1923 {
4d41e8b7
KH
1924 if (c >= 0xA0)
1925 {
b3af4b28 1926 /* Old style component character of a composition. */
4d41e8b7
KH
1927 if (c == 0xA0)
1928 {
1929 ONE_MORE_BYTE (c);
1930 c -= 0x80;
1931 }
1932 else
1933 c -= 0x20;
1934 }
1935
065e3595 1936 switch (emacs_mule_bytes[c])
b73bfc1c 1937 {
065e3595 1938 case 2:
df7492f9
KH
1939 if (! (charset = emacs_mule_charset[c]))
1940 goto invalid_code;
1941 ONE_MORE_BYTE (c);
9ffd559c 1942 if (c < 0xA0)
065e3595 1943 goto invalid_code;
df7492f9 1944 code = c & 0x7F;
065e3595
KH
1945 break;
1946
1947 case 3:
1948 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1949 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1950 {
1951 ONE_MORE_BYTE (c);
9ffd559c 1952 if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
065e3595
KH
1953 goto invalid_code;
1954 ONE_MORE_BYTE (c);
9ffd559c 1955 if (c < 0xA0)
065e3595
KH
1956 goto invalid_code;
1957 code = c & 0x7F;
1958 }
1959 else
1960 {
1961 if (! (charset = emacs_mule_charset[c]))
1962 goto invalid_code;
1963 ONE_MORE_BYTE (c);
9ffd559c 1964 if (c < 0xA0)
065e3595
KH
1965 goto invalid_code;
1966 code = (c & 0x7F) << 8;
1967 ONE_MORE_BYTE (c);
9ffd559c 1968 if (c < 0xA0)
065e3595
KH
1969 goto invalid_code;
1970 code |= c & 0x7F;
1971 }
1972 break;
1973
1974 case 4:
1975 ONE_MORE_BYTE (c);
1976 if (c < 0 || ! (charset = emacs_mule_charset[c]))
df7492f9
KH
1977 goto invalid_code;
1978 ONE_MORE_BYTE (c);
9ffd559c 1979 if (c < 0xA0)
065e3595 1980 goto invalid_code;
781d7a48 1981 code = (c & 0x7F) << 8;
df7492f9 1982 ONE_MORE_BYTE (c);
9ffd559c 1983 if (c < 0xA0)
065e3595 1984 goto invalid_code;
df7492f9 1985 code |= c & 0x7F;
065e3595 1986 break;
df7492f9 1987
065e3595
KH
1988 case 1:
1989 code = c;
1990 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1991 ? charset_ascii : charset_eight_bit);
1992 break;
df7492f9 1993
065e3595
KH
1994 default:
1995 abort ();
1996 }
1997 c = DECODE_CHAR (charset, code);
1998 if (c < 0)
1999 goto invalid_code;
df7492f9 2000 }
df7492f9
KH
2001 *nbytes = src - src_base;
2002 *nchars = consumed_chars;
ff0dacd7
KH
2003 if (id)
2004 *id = charset->id;
df7492f9
KH
2005 return c;
2006
2007 no_more_source:
2008 return -2;
2009
2010 invalid_code:
2011 return -1;
2012}
2013
2014
2015/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
2016 Check if a text is encoded in `emacs-mule'. If it is, return 1,
2017 else return 0.

 CATEGORY_MASK_EMACS_MULE is always added to DETECT_INFO->checked.
 When 0 is returned the same mask is added to DETECT_INFO->rejected.
 When 1 is returned, DETECT_INFO->found receives the mask only if at
 least one composition sequence or multi-byte sequence was actually
 seen; otherwise nothing is added to it. */
df7492f9
KH
2018
2019static int
ff0dacd7 2020detect_coding_emacs_mule (coding, detect_info)
df7492f9 2021 struct coding_system *coding;
ff0dacd7 2022 struct coding_detection_info *detect_info;
df7492f9 2023{
065e3595 2024 const unsigned char *src = coding->source, *src_base;
8f924df7 2025 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
2026 int multibytep = coding->src_multibyte;
2027 int consumed_chars = 0;
2028 int c;
2029 int found = 0;
2030
ff0dacd7 2031 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
df7492f9
KH
2032 /* A coding system of this category is always ASCII compatible. */
2033 src += coding->head_ascii;
2034
2035 while (1)
2036 {
/* Remember where the current sequence starts; this is used below to
 look up its length and, at no_more_source, to tell whether a
 sequence was cut short. */
065e3595 2037 src_base = src;
df7492f9 2038 ONE_MORE_BYTE (c);
065e3595
KH
2039 if (c < 0)
2040 continue;
df7492f9
KH
2041 if (c == 0x80)
2042 {
2043 /* Perhaps the start of composite character. We simply skip
2044 it because analyzing it is too heavy for detecting. But,
2045 at least, we check that the composite character
3ed051d4 2046 consists of more than 4 bytes. */
/* NOTE(review): this declaration shadows the function-level SRC_BASE
 inside this block only; the outer SRC_BASE still points at the 0x80
 byte when the block is left. */
8f924df7 2047 const unsigned char *src_base;
df7492f9
KH
2048
2049 repeat:
2050 src_base = src;
2051 do
2052 {
2053 ONE_MORE_BYTE (c);
2054 }
2055 while (c >= 0xA0);
2056
2057 if (src - src_base <= 4)
2058 break;
ff0dacd7 2059 found = CATEGORY_MASK_EMACS_MULE;
df7492f9
KH
2060 if (c == 0x80)
2061 goto repeat;
b73bfc1c 2062 }
df7492f9
KH
2063
2064 if (c < 0x80)
b73bfc1c 2065 {
/* ESC, SI and SO make the text be rejected as emacs-mule. */
df7492f9
KH
2066 if (c < 0x20
2067 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2068 break;
2069 }
2070 else
2071 {
/* The byte at SRC_BASE determines how many bytes must follow; each
 of them has to be 0xA0 or greater. */
0e219d54 2072 int more_bytes = emacs_mule_bytes[*src_base] - 1;
df7492f9 2073
0e219d54 2074 while (more_bytes > 0)
df7492f9
KH
2075 {
2076 ONE_MORE_BYTE (c);
0e219d54
KH
2077 if (c < 0xA0)
2078 {
2079 src--; /* Unread the last byte. */
2080 break;
2081 }
2082 more_bytes--;
df7492f9 2083 }
0e219d54 2084 if (more_bytes != 0)
df7492f9 2085 break;
ff0dacd7 2086 found = CATEGORY_MASK_EMACS_MULE;
4ed46869
KH
2087 }
2088 }
/* Reached only through a `break' above: the text is not emacs-mule. */
ff0dacd7 2089 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
df7492f9
KH
2090 return 0;
2091
2092 no_more_source:
/* The source ended. If that happened in the middle of a sequence and
 this is the last block, the sequence can never be completed. */
065e3595 2093 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
89528eb3 2094 {
ff0dacd7 2095 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
89528eb3
KH
2096 return 0;
2097 }
ff0dacd7
KH
2098 detect_info->found |= found;
2099 return 1;
4ed46869
KH
2100}
2101
b73bfc1c 2102
df7492f9
KH
2103/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2104
2105/* Decode one character that is a component of a composition
2106 sequence of Emacs 20/21 style at SRC, by calling emacs_mule_char.
2107 On success, store the character in *BUF, increment BUF, advance SRC
2108 by the number of bytes read, and add the number of characters read
2109 to consumed_chars. If SRC is already at SRC_END, or emacs_mule_char
2110 reports that the source is exhausted (-2), do nothing. If
 emacs_mule_char reports an invalid byte sequence, jump to the label
 `invalid_code' of the function using this macro.

 Note that the `break' statements leave only this macro's own
 do-while block, not a loop of the caller. */
2111
2112#define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
f937a7db 2113 do \
df7492f9
KH
2114 { \
2115 int c; \
2116 int nbytes, nchars; \
2117 \
2118 if (src == src_end) \
2119 break; \
ff0dacd7 2120 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
df7492f9
KH
2121 if (c < 0) \
2122 { \
2123 if (c == -2) \
2124 break; \
2125 goto invalid_code; \
2126 } \
2127 *buf++ = c; \
2128 src += nbytes; \
2129 consumed_chars += nchars; \
2130 } \
f937a7db 2131 while (0)
df7492f9
KH
2132
2133
2134/* Decode a composition rule represented as a component of composition
781d7a48
KH
2135 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
2136 and increment BUF. The rule is one byte holding 0xA0 + GREF * 9 +
2137 NREF. If SRC is at SRC_END or the byte is outside 0xA0..0xF0, jump
 to the label `invalid_code' of the function using this macro. */
df7492f9 2138
781d7a48 2139#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
df7492f9
KH
2140 do { \
2141 int c, gref, nref; \
2142 \
781d7a48 2143 if (src >= src_end) \
df7492f9
KH
2144 goto invalid_code; \
2145 ONE_MORE_BYTE_NO_CHECK (c); \
4d41e8b7 2146 c -= 0xA0; \
df7492f9
KH
2147 if (c < 0 || c >= 81) \
2148 goto invalid_code; \
2149 \
2150 gref = c / 9, nref = c % 9; \
2151 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
2152 } while (0)
2153
2154
781d7a48
KH
2155/* Decode a composition rule represented as a component of composition
2156 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
2157 and increment BUF. The rule is two bytes, GREF + 0x20 followed by
2158 NREF + 0x20. If fewer than two bytes remain before SRC_END, or
 either value is outside 0..80, jump to the label `invalid_code' of
 the function using this macro. */
2159
2160#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
2161 do { \
2162 int gref, nref; \
2163 \
2164 if (src + 1>= src_end) \
2165 goto invalid_code; \
2166 ONE_MORE_BYTE_NO_CHECK (gref); \
2167 gref -= 0x20; \
2168 ONE_MORE_BYTE_NO_CHECK (nref); \
2169 nref -= 0x20; \
2170 if (gref < 0 || gref >= 81 \
2171 || nref < 0 || nref >= 81) \
2172 goto invalid_code; \
2173 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
2174 } while (0)
2175
2176
/* Decode the header of an Emacs 21 style composition sequence. C is
 the METHOD byte that has already been read. Read the BYTES and CHARS
 bytes, record the composition with ADD_COMPOSITION_DATA, and, unless
 the method is COMPOSITION_RELATIVE, decode the component characters
 (and, for rule-based methods, the rules between them) into CHARBUF.
 Invalid data jumps to the label `invalid_code' of the function using
 this macro; that function must also provide `no_more_source' for
 ONE_MORE_BYTE.

 NOTE(review): DECODE_EMACS_MULE_COMPOSITION_CHAR leaves only its own
 do-while block when SRC reaches SRC_END, so for
 COMPOSITION_WITH_ALTCHARS the `while' loop below does not appear to
 make progress on truncated input -- confirm. */
df7492f9 2177#define DECODE_EMACS_MULE_21_COMPOSITION(c) \
aa72b389 2178 do { \
df7492f9 2179 /* Emacs 21 style format. The first three bytes at SRC are \
781d7a48 2180 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
df7492f9
KH
2181 the byte length of this composition information, CHARS is the \
2182 number of characters composed by this composition. */ \
781d7a48
KH
2183 enum composition_method method = c - 0xF2; \
2184 int *charbuf_base = charbuf; \
df7492f9
KH
2185 int consumed_chars_limit; \
2186 int nbytes, nchars; \
2187 \
2188 ONE_MORE_BYTE (c); \
065e3595
KH
2189 if (c < 0) \
2190 goto invalid_code; \
df7492f9
KH
2191 nbytes = c - 0xA0; \
2192 if (nbytes < 3) \
2193 goto invalid_code; \
2194 ONE_MORE_BYTE (c); \
065e3595
KH
2195 if (c < 0) \
2196 goto invalid_code; \
df7492f9 2197 nchars = c - 0xA0; \
69a80ea3 2198 ADD_COMPOSITION_DATA (charbuf, nchars, method); \
df7492f9
KH
2199 consumed_chars_limit = consumed_chars_base + nbytes; \
2200 if (method != COMPOSITION_RELATIVE) \
aa72b389 2201 { \
df7492f9
KH
2202 int i = 0; \
2203 while (consumed_chars < consumed_chars_limit) \
aa72b389 2204 { \
df7492f9 2205 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
781d7a48 2206 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
df7492f9
KH
2207 else \
2208 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
781d7a48 2209 i++; \
aa72b389 2210 } \
df7492f9
KH
2211 if (consumed_chars < consumed_chars_limit) \
2212 goto invalid_code; \
781d7a48 2213 charbuf_base[0] -= i; \
aa72b389
KH
2214 } \
2215 } while (0)
93dec019 2216
aa72b389 2217
d959f512
KH
/* Decode an Emacs 20 style relative composition. Rewind SRC to
 SRC_BASE, skip the 0x80 byte, then decode component characters into
 a local array for as long as the next byte is 0xA0 or greater, up to
 MAX_COMPOSITION_COMPONENTS iterations. Fewer than two iterations
 jumps to `invalid_code'; otherwise the composition is recorded with
 ADD_COMPOSITION_DATA and the components are appended to CHARBUF.

 NOTE(review): the loop condition reads *SRC without first comparing
 SRC with SRC_END -- confirm that the caller guarantees a readable
 byte there. */
2218#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
2219 do { \
2220 /* Emacs 20 style format for relative composition. */ \
2221 /* Store multibyte form of characters to be composed. */ \
2222 enum composition_method method = COMPOSITION_RELATIVE; \
2223 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
2224 int *buf = components; \
2225 int i, j; \
2226 \
2227 src = src_base; \
2228 ONE_MORE_BYTE (c); /* skip 0x80 */ \
2229 for (i = 0; *src >= 0xA0 && i < MAX_COMPOSITION_COMPONENTS; i++) \
2230 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
2231 if (i < 2) \
2232 goto invalid_code; \
2233 ADD_COMPOSITION_DATA (charbuf, i, method); \
2234 for (j = 0; j < i; j++) \
2235 *charbuf++ = components[j]; \
df7492f9
KH
2236 } while (0)
2237
2238
/* Decode an Emacs 20 style rule-base composition (0x80 0xFF MSEQ RULE
 MSEQ ...). Decode one character, then RULE/character pairs for as
 long as the next byte is 0xA0 or greater, into a local array. The
 sequence is rejected (`invalid_code') unless more than one loop
 iteration ran and an odd number of elements was collected. If
 CHARBUF lacks room, jump to `no_more_source'. Otherwise record the
 composition with ADD_COMPOSITION_DATA, append all collected elements
 to CHARBUF, and then append the elements at even indexes (the
 characters) once more.

 NOTE(review): *SRC is read without first comparing SRC with SRC_END
 -- confirm that the caller guarantees a readable byte there. */
2239#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
2240 do { \
2241 /* Emacs 20 style format for rule-base composition. */ \
2242 /* Store multibyte form of characters to be composed. */ \
ff0dacd7 2243 enum composition_method method = COMPOSITION_WITH_RULE; \
4d41e8b7 2244 int *charbuf_base = charbuf; \
df7492f9
KH
2245 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
2246 int *buf = components; \
2247 int i, j; \
4d41e8b7 2248 \
df7492f9 2249 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
4d41e8b7 2250 for (i = 1; i < MAX_COMPOSITION_COMPONENTS; i++) \
df7492f9 2251 { \
4d41e8b7
KH
2252 if (*src < 0xA0) \
2253 break; \
781d7a48 2254 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
df7492f9
KH
2255 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
2256 } \
4d41e8b7 2257 if (i <= 1 || (buf - components) % 2 == 0) \
df7492f9 2258 goto invalid_code; \
4d41e8b7 2259 if (charbuf + i + (i / 2) + 1 >= charbuf_end) \
df7492f9 2260 goto no_more_source; \
4d41e8b7
KH
2261 ADD_COMPOSITION_DATA (charbuf, i, method); \
2262 i = i * 2 - 1; \
df7492f9
KH
2263 for (j = 0; j < i; j++) \
2264 *charbuf++ = components[j]; \
4d41e8b7 2265 charbuf_base[0] -= i; \
df7492f9
KH
2266 for (j = 0; j < i; j += 2) \
2267 *charbuf++ = components[j]; \
2268 } while (0)
2269
aa72b389
KH
2270
/* Decode the emacs-mule byte sequence in CODING->source, starting at
 offset CODING->consumed, and append the resulting character codes
 (plus charset and composition annotations) to CODING->charbuf.

 Decoding stops when CHARBUF reaches CODING->charbuf_size minus
 MAX_ANNOTATION_LENGTH or when the source is exhausted. A byte that
 does not start a valid sequence is stored by itself (an ASCII byte
 as is, any other byte through BYTE8_TO_CHAR) and counted in
 CODING->errors.

 On return, CODING->consumed_char, CODING->consumed and
 CODING->charbuf_used reflect the last completely decoded sequence. */
2271static void
df7492f9 2272decode_coding_emacs_mule (coding)
aa72b389 2273 struct coding_system *coding;
aa72b389 2274{
8f924df7
KH
2275 const unsigned char *src = coding->source + coding->consumed;
2276 const unsigned char *src_end = coding->source + coding->src_bytes;
2277 const unsigned char *src_base;
69a80ea3
KH
2278 int *charbuf = coding->charbuf + coding->charbuf_used;
2279 int *charbuf_end
2280 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
df7492f9 2281 int consumed_chars = 0, consumed_chars_base;
df7492f9 2282 int multibytep = coding->src_multibyte;
24a73b0a 2283 Lisp_Object attrs, charset_list;
ff0dacd7
KH
2284 int char_offset = coding->produced_char;
2285 int last_offset = char_offset;
2286 int last_id = charset_ascii;
119852e7
KH
2287 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2288 int byte_after_cr = -1;
aa72b389 2289
24a73b0a 2290 CODING_GET_INFO (coding, attrs, charset_list);
aa72b389 2291
aa72b389
KH
2292 while (1)
2293 {
df7492f9
KH
2294 int c;
2295
/* SRC_BASE and CONSUMED_CHARS_BASE mark the start of the sequence
 being decoded; they are what gets reported as consumed on exit. */
aa72b389 2296 src_base = src;
df7492f9
KH
2297 consumed_chars_base = consumed_chars;
2298
2299 if (charbuf >= charbuf_end)
b71f6f73
KH
2300 {
/* A byte read ahead after a CR has not been decoded yet; step back
 so that it is not reported as consumed. */
2301 if (byte_after_cr >= 0)
2302 src_base--;
2303 break;
2304 }
aa72b389 2305
119852e7
KH
2306 if (byte_after_cr >= 0)
2307 c = byte_after_cr, byte_after_cr = -1;
2308 else
2309 ONE_MORE_BYTE (c);
065e3595
KH
2310 if (c < 0)
2311 {
2312 *charbuf++ = -c;
2313 char_offset++;
2314 }
2315 else if (c < 0x80)
aa72b389 2316 {
/* With DOS end-of-line, read the byte following a CR right away;
 it is decoded on the next iteration. */
119852e7
KH
2317 if (eol_crlf && c == '\r')
2318 ONE_MORE_BYTE (byte_after_cr);
df7492f9
KH
2319 *charbuf++ = c;
2320 char_offset++;
aa72b389 2321 }
df7492f9
KH
2322 else if (c == 0x80)
2323 {
/* 0x80 introduces composition data; the next byte selects the
 format. */
df7492f9 2324 ONE_MORE_BYTE (c);
065e3595
KH
2325 if (c < 0)
2326 goto invalid_code;
781d7a48
KH
2327 if (c - 0xF2 >= COMPOSITION_RELATIVE
2328 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
df7492f9
KH
2329 DECODE_EMACS_MULE_21_COMPOSITION (c);
2330 else if (c < 0xC0)
2331 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
2332 else if (c == 0xFF)
2333 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
2334 else
2335 goto invalid_code;
2336 }
2337 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
2338 {
2339 int nbytes, nchars;
ff0dacd7
KH
2340 int id;
2341
/* Rewind to the leading code and let emacs_mule_char decode the
 whole sequence. */
781d7a48
KH
2342 src = src_base;
2343 consumed_chars = consumed_chars_base;
ff0dacd7 2344 c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
df7492f9
KH
2345 if (c < 0)
2346 {
2347 if (c == -2)
2348 break;
2349 goto invalid_code;
2350 }
/* When the charset changes, record a charset annotation for the run
 of characters that just ended (unless that run was ASCII). */
ff0dacd7
KH
2351 if (last_id != id)
2352 {
2353 if (last_id != charset_ascii)
69a80ea3 2354 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
ff0dacd7
KH
2355 last_id = id;
2356 last_offset = char_offset;
2357 }
df7492f9 2358 *charbuf++ = c;
781d7a48
KH
2359 src += nbytes;
2360 consumed_chars += nchars;
df7492f9
KH
2361 char_offset++;
2362 }
4d41e8b7
KH
2363 else
2364 goto invalid_code;
df7492f9
KH
2365 continue;
2366
2367 invalid_code:
/* Rewind and store just the first byte of the bad sequence; the
 following bytes are decoded afresh on the next iterations. */
2368 src = src_base;
2369 consumed_chars = consumed_chars_base;
2370 ONE_MORE_BYTE (c);
2371 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 2372 char_offset++;
df7492f9
KH
2373 coding->errors++;
2374 }
2375
2376 no_more_source:
ff0dacd7 2377 if (last_id != charset_ascii)
69a80ea3 2378 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
2379 coding->consumed_char += consumed_chars_base;
2380 coding->consumed = src_base - coding->source;
2381 coding->charbuf_used = charbuf - coding->charbuf;
2382}
2383
2384
/* Store in CODES[0] and CODES[1] the leading codes of the emacs-mule
   charset ID.  An ID below 0xA0 is its own single leading code and
   CODES[1] is set to 0.  Any other ID is stored in CODES[1] as the
   extended leading code, with a base leading code in CODES[0] chosen
   by range: 0x9A for 0xA0..0xDF, 0x9B for 0xE0..0xEF, 0x9C for
   0xF0..0xF4, and 0x9D for 0xF5 and above.

   ID is evaluated more than once.  The expansion deliberately has no
   trailing semicolon so that the macro behaves as a single statement,
   also in an unbraced `if ... else'.  */

#define EMACS_MULE_LEADING_CODES(id, codes)             \
  do {                                                  \
    if ((id) < 0xA0)                                    \
      (codes)[0] = (id), (codes)[1] = 0;                \
    else if ((id) < 0xE0)                               \
      (codes)[0] = 0x9A, (codes)[1] = (id);             \
    else if ((id) < 0xF0)                               \
      (codes)[0] = 0x9B, (codes)[1] = (id);             \
    else if ((id) < 0xF5)                               \
      (codes)[0] = 0x9C, (codes)[1] = (id);             \
    else                                                \
      (codes)[0] = 0x9D, (codes)[1] = (id);             \
  } while (0)
2398
aa72b389 2399
df7492f9
KH
/* Encode the character codes in CODING->charbuf (the first
 CODING->charbuf_used elements) in emacs-mule format and store the
 bytes at CODING->destination + CODING->produced.

 A negative element starts an annotation. A charset annotation
 selects the charset to try first for the following characters; a
 composition annotation is skipped. A character that no charset in
 the charset list can encode is replaced by CODING->default_char.

 On normal completion CODING->produced and CODING->produced_char are
 updated and 0 is returned. */
2400static int
2401encode_coding_emacs_mule (coding)
2402 struct coding_system *coding;
2403{
2404 int multibytep = coding->dst_multibyte;
2405 int *charbuf = coding->charbuf;
2406 int *charbuf_end = charbuf + coding->charbuf_used;
2407 unsigned char *dst = coding->destination + coding->produced;
2408 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2409 int safe_room = 8;
df7492f9 2410 int produced_chars = 0;
24a73b0a 2411 Lisp_Object attrs, charset_list;
df7492f9 2412 int c;
ff0dacd7 2413 int preferred_charset_id = -1;
df7492f9 2414
24a73b0a 2415 CODING_GET_INFO (coding, attrs, charset_list);
/* Make the coding system use the current Vemacs_mule_charset_list,
 updating its attributes if the list differs. */
eccb6815
KH
2416 if (! EQ (charset_list, Vemacs_mule_charset_list))
2417 {
2418 CODING_ATTR_CHARSET_LIST (attrs)
2419 = charset_list = Vemacs_mule_charset_list;
2420 }
df7492f9
KH
2421
2422 while (charbuf < charbuf_end)
2423 {
2424 ASSURE_DESTINATION (safe_room);
2425 c = *charbuf++;
ff0dacd7
KH
2426
2427 if (c < 0)
2428 {
2429 /* Handle an annotation. */
/* C is the negated annotation length; *CHARBUF is the annotation
 type. */
2430 switch (*charbuf)
2431 {
2432 case CODING_ANNOTATE_COMPOSITION_MASK:
2433 /* Not yet implemented. */
2434 break;
2435 case CODING_ANNOTATE_CHARSET_MASK:
/* NOTE(review): the index 3 depends on the element layout written by
 ADD_CHARSET_DATA, which is defined elsewhere -- confirm. */
2436 preferred_charset_id = charbuf[3];
2437 if (preferred_charset_id >= 0
2438 && NILP (Fmemq (make_number (preferred_charset_id),
2439 charset_list)))
2440 preferred_charset_id = -1;
2441 break;
2442 default:
2443 abort ();
2444 }
/* Skip the remaining elements of the annotation. */
2445 charbuf += -c - 1;
2446 continue;
2447 }
2448
df7492f9
KH
2449 if (ASCII_CHAR_P (c))
2450 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
2451 else if (CHAR_BYTE8_P (c))
2452 {
2453 c = CHAR_TO_BYTE8 (c);
2454 EMIT_ONE_BYTE (c);
2455 }
df7492f9 2456 else
aa72b389 2457 {
df7492f9
KH
2458 struct charset *charset;
2459 unsigned code;
2460 int dimension;
2461 int emacs_mule_id;
2462 unsigned char leading_codes[2];
2463
/* Try the charset named by the last charset annotation first, and
 fall back to searching the charset list. */
ff0dacd7
KH
2464 if (preferred_charset_id >= 0)
2465 {
2466 charset = CHARSET_FROM_ID (preferred_charset_id);
905ca9d2
KH
2467 if (CHAR_CHARSET_P (c, charset))
2468 code = ENCODE_CHAR (charset, c);
2469 else
2470 charset = char_charset (c, charset_list, &code);
ff0dacd7
KH
2471 }
2472 else
2473 charset = char_charset (c, charset_list, &code);
df7492f9
KH
2474 if (! charset)
2475 {
2476 c = coding->default_char;
2477 if (ASCII_CHAR_P (c))
2478 {
2479 EMIT_ONE_ASCII_BYTE (c);
2480 continue;
2481 }
/* NOTE(review): the result of this second lookup is used below
 without a null check -- confirm that the default character is
 always encodable. */
2482 charset = char_charset (c, charset_list, &code);
2483 }
2484 dimension = CHARSET_DIMENSION (charset);
2485 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2486 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2487 EMIT_ONE_BYTE (leading_codes[0]);
2488 if (leading_codes[1])
2489 EMIT_ONE_BYTE (leading_codes[1]);
/* Position codes are the code point bytes with the high bit set. */
2490 if (dimension == 1)
1fa663f9 2491 EMIT_ONE_BYTE (code | 0x80);
aa72b389 2492 else
df7492f9 2493 {
1fa663f9 2494 code |= 0x8080;
df7492f9
KH
2495 EMIT_ONE_BYTE (code >> 8);
2496 EMIT_ONE_BYTE (code & 0xFF);
2497 }
aa72b389 2498 }
aa72b389 2499 }
065e3595 2500 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
2501 coding->produced_char += produced_chars;
2502 coding->produced = dst - coding->destination;
2503 return 0;
aa72b389 2504}
b73bfc1c 2505
4ed46869 2506\f
df7492f9 2507/*** 7. ISO2022 handlers ***/
4ed46869
KH
2508
2509/* The following note describes the coding system ISO2022 briefly.
39787efd 2510 Since the intention of this note is to help understand the
5a936b46 2511 functions in this file, some parts are NOT ACCURATE or are OVERLY
39787efd 2512 SIMPLIFIED. For thorough understanding, please refer to the
5a936b46 2513 original document of ISO2022. This is equivalent to the standard
cfb43547 2514 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
4ed46869
KH
2515
2516 ISO2022 provides many mechanisms to encode several character sets
cfb43547 2517 in 7-bit and 8-bit environments. For 7-bit environments, all text
39787efd
KH
2518 is encoded using bytes less than 128. This may make the encoded
2519 text a little bit longer, but the text passes more easily through
cfb43547 2520 several types of gateway, some of which strip off the MSB (Most
8ca3766a 2521 Significant Bit).
b73bfc1c 2522
cfb43547
DL
2523 There are two kinds of character sets: control character sets and
2524 graphic character sets. The former contain control characters such
4ed46869 2525 as `newline' and `escape' to provide control functions (control
39787efd 2526 functions are also provided by escape sequences). The latter
cfb43547 2527 contain graphic characters such as 'A' and '-'. Emacs recognizes
4ed46869
KH
2528 two control character sets and many graphic character sets.
2529
2530 Graphic character sets are classified into one of the following
39787efd
KH
2531 four classes, according to the number of bytes (DIMENSION) and
2532 number of characters in one dimension (CHARS) of the set:
2533 - DIMENSION1_CHARS94
2534 - DIMENSION1_CHARS96
2535 - DIMENSION2_CHARS94
2536 - DIMENSION2_CHARS96
2537
2538 In addition, each character set is assigned an identification tag,
cfb43547 2539 unique for each set, called the "final character" (denoted as <F>
39787efd
KH
2540 hereafter). The <F> of each character set is decided by ECMA(*)
2541 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2542 (0x30..0x3F are for private use only).
4ed46869
KH
2543
2544 Note (*): ECMA = European Computer Manufacturers Association
2545
cfb43547 2546 Here are examples of graphic character sets [NAME(<F>)]:
4ed46869
KH
2547 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2548 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2549 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2550 o DIMENSION2_CHARS96 -- none for the moment
2551
39787efd 2552 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4ed46869
KH
2553 C0 [0x00..0x1F] -- control character plane 0
2554 GL [0x20..0x7F] -- graphic character plane 0
2555 C1 [0x80..0x9F] -- control character plane 1
2556 GR [0xA0..0xFF] -- graphic character plane 1
2557
2558 A control character set is directly designated and invoked to C0 or
39787efd
KH
2559 C1 by an escape sequence. The most common case is that:
2560 - ISO646's control character set is designated/invoked to C0, and
2561 - ISO6429's control character set is designated/invoked to C1,
2562 and usually these designations/invocations are omitted in encoded
2563 text. In a 7-bit environment, only C0 can be used, and a control
2564 character for C1 is encoded by an appropriate escape sequence to
2565 fit into the environment. All control characters for C1 are
2566 defined to have corresponding escape sequences.
4ed46869
KH
2567
2568 A graphic character set is at first designated to one of four
2569 graphic registers (G0 through G3), then these graphic registers are
2570 invoked to GL or GR. These designations and invocations can be
2571 done independently. The most common case is that G0 is invoked to
39787efd
KH
2572 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2573 these invocations and designations are omitted in encoded text.
2574 In a 7-bit environment, only GL can be used.
4ed46869 2575
39787efd
KH
2576 When a graphic character set of CHARS94 is invoked to GL, codes
2577 0x20 and 0x7F of the GL area work as control characters SPACE and
2578 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2579 be used.
4ed46869
KH
2580
2581 There are two ways of invocation: locking-shift and single-shift.
2582 With locking-shift, the invocation lasts until the next different
39787efd
KH
2583 invocation, whereas with single-shift, the invocation affects the
2584 following character only and doesn't affect the locking-shift
2585 state. Invocations are done by the following control characters or
2586 escape sequences:
4ed46869
KH
2587
2588 ----------------------------------------------------------------------
39787efd 2589 abbrev function cntrl escape seq description
4ed46869 2590 ----------------------------------------------------------------------
39787efd
KH
2591 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2592 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2593 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2594 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2595 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2596 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2597 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2598 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2599 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4ed46869 2600 ----------------------------------------------------------------------
39787efd
KH
2601 (*) These are not used by any known coding system.
2602
2603 Control characters for these functions are defined by macros
2604 ISO_CODE_XXX in `coding.h'.
4ed46869 2605
39787efd 2606 Designations are done by the following escape sequences:
4ed46869
KH
2607 ----------------------------------------------------------------------
2608 escape sequence description
2609 ----------------------------------------------------------------------
2610 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2611 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2612 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2613 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2614 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2615 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2616 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2617 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2618 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2619 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2620 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2621 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2622 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2623 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2624 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2625 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2626 ----------------------------------------------------------------------
2627
2628 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
39787efd 2629 of dimension 1, chars 94, and final character <F>, etc...
4ed46869
KH
2630
2631 Note (*): Although these designations are not allowed in ISO2022,
2632 Emacs accepts them on decoding, and produces them on encoding
39787efd 2633 CHARS96 character sets in a coding system which is characterized as
4ed46869
KH
2634 7-bit environment, non-locking-shift, and non-single-shift.
2635
2636 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
df7492f9 2637 '(' must be omitted. We refer to this as "short-form" hereafter.
4ed46869 2638
cfb43547 2639 Now you may notice that there are a lot of ways of encoding the
39787efd
KH
2640 same multilingual text in ISO2022. Actually, there exist many
2641 coding systems such as Compound Text (used in X11's inter-client
8ca3766a
DL
2642 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2643 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
4ed46869
KH
2644 localized platforms), and all of these are variants of ISO2022.
2645
2646 In addition to the above, Emacs handles two more kinds of escape
2647 sequences: ISO6429's direction specification and Emacs' private
2648 sequence for specifying character composition.
2649
39787efd 2650 ISO6429's direction specification takes the following form:
4ed46869
KH
2651 o CSI ']' -- end of the current direction
2652 o CSI '0' ']' -- end of the current direction
2653 o CSI '1' ']' -- start of left-to-right text
2654 o CSI '2' ']' -- start of right-to-left text
2655 The control character CSI (0x9B: control sequence introducer) is
39787efd
KH
2656 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2657
2658 Character composition specification takes the following form:
ec6d2bb8
KH
2659 o ESC '0' -- start relative composition
2660 o ESC '1' -- end composition
2661 o ESC '2' -- start rule-base composition (*)
2662 o ESC '3' -- start relative composition with alternate chars (**)
2663 o ESC '4' -- start rule-base composition with alternate chars (**)
b73bfc1c 2664 Since these are not standard escape sequences of any ISO standard,
cfb43547 2665 the use of them with these meanings is restricted to Emacs only.
ec6d2bb8 2666
5a936b46
DL
2667 (*) This form is used only in Emacs 20.7 and older versions,
2668 but newer versions can safely decode it.
cfb43547 2669 (**) This form is used only in Emacs 21.1 and newer versions,
5a936b46 2670 and older versions can't decode it.
ec6d2bb8 2671
cfb43547 2672 Here's a list of example usages of these composition escape
b73bfc1c 2673 sequences (categorized by `enum composition_method').
ec6d2bb8 2674
b73bfc1c 2675 COMPOSITION_RELATIVE:
ec6d2bb8 2676 ESC 0 CHAR [ CHAR ] ESC 1
8ca3766a 2677 COMPOSITION_WITH_RULE:
ec6d2bb8 2678 ESC 2 CHAR [ RULE CHAR ] ESC 1
b73bfc1c 2679 COMPOSITION_WITH_ALTCHARS:
ec6d2bb8 2680 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
b73bfc1c 2681 COMPOSITION_WITH_RULE_ALTCHARS:
ec6d2bb8 2682 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
4ed46869
KH
2683
2684enum iso_code_class_type iso_code_class[256];
2685
df7492f9
KH
/* Return nonzero if charset ID is usable with CODING, i.e. ID indexes
   a slot of CODING->safe_charsets (0 <= ID <= CODING->max_charset_id)
   and that slot does not hold the filler byte 255 with which
   setup_iso_safe_charsets initializes the table.

   The slot is compared as an unsigned byte: safe_charsets is accessed
   through a plain `char *', so testing `>= 0' would accept the filler
   on platforms where `char' is unsigned.  ID is evaluated more than
   once.  */
#define SAFE_CHARSET_P(coding, id)                              \
  ((id) >= 0                                                    \
   && (id) <= (coding)->max_charset_id                          \
   && (unsigned char) (coding)->safe_charsets[id] != 255)


/* Nonzero if CODING_ISO_INITIAL for register 1 of the coding system
   recorded for CATEGORY is non-negative (presumably: some charset is
   initially designated to G1, so a shift-out has something to
   invoke -- confirm against the definition of CODING_ISO_INITIAL).  */
#define SHIFT_OUT_OK(category) \
  (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2693
2694static void
f0064e1f
DL
2695setup_iso_safe_charsets (attrs)
2696 Lisp_Object attrs;
df7492f9
KH
2697{
2698 Lisp_Object charset_list, safe_charsets;
2699 Lisp_Object request;
2700 Lisp_Object reg_usage;
2701 Lisp_Object tail;
2702 int reg94, reg96;
2703 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2704 int max_charset_id;
2705
2706 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2707 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2708 && ! EQ (charset_list, Viso_2022_charset_list))
2709 {
2710 CODING_ATTR_CHARSET_LIST (attrs)
2711 = charset_list = Viso_2022_charset_list;
2712 ASET (attrs, coding_attr_safe_charsets, Qnil);
2713 }
2714
2715 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2716 return;
2717
2718 max_charset_id = 0;
2719 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2720 {
2721 int id = XINT (XCAR (tail));
2722 if (max_charset_id < id)
2723 max_charset_id = id;
2724 }
d46c5b12 2725
df7492f9
KH
2726 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2727 make_number (255));
2728 request = AREF (attrs, coding_attr_iso_request);
2729 reg_usage = AREF (attrs, coding_attr_iso_usage);
2730 reg94 = XINT (XCAR (reg_usage));
2731 reg96 = XINT (XCDR (reg_usage));
2732
2733 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2734 {
2735 Lisp_Object id;
2736 Lisp_Object reg;
2737 struct charset *charset;
2738
2739 id = XCAR (tail);
2740 charset = CHARSET_FROM_ID (XINT (id));
bf16eb23 2741 reg = Fcdr (Fassq (id, request));
df7492f9 2742 if (! NILP (reg))
8f924df7 2743 SSET (safe_charsets, XINT (id), XINT (reg));
df7492f9
KH
2744 else if (charset->iso_chars_96)
2745 {
2746 if (reg96 < 4)
8f924df7 2747 SSET (safe_charsets, XINT (id), reg96);
df7492f9
KH
2748 }
2749 else
2750 {
2751 if (reg94 < 4)
8f924df7 2752 SSET (safe_charsets, XINT (id), reg94);
df7492f9
KH
2753 }
2754 }
2755 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2756}
d46c5b12 2757
b6871cc7 2758
4ed46869 2759/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
2760 Check if a text is encoded in one of ISO-2022 based codig systems.
2761 If it is, return 1, else return 0. */
4ed46869 2762
0a28aafb 2763static int
ff0dacd7 2764detect_coding_iso_2022 (coding, detect_info)
df7492f9 2765 struct coding_system *coding;
ff0dacd7 2766 struct coding_detection_info *detect_info;
4ed46869 2767{
8f924df7
KH
2768 const unsigned char *src = coding->source, *src_base = src;
2769 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9 2770 int multibytep = coding->src_multibyte;
ff0dacd7 2771 int single_shifting = 0;
df7492f9
KH
2772 int id;
2773 int c, c1;
2774 int consumed_chars = 0;
2775 int i;
ff0dacd7
KH
2776 int rejected = 0;
2777 int found = 0;
cee53ed4 2778 int composition_count = -1;
ff0dacd7
KH
2779
2780 detect_info->checked |= CATEGORY_MASK_ISO;
df7492f9
KH
2781
2782 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2783 {
2784 struct coding_system *this = &(coding_categories[i]);
2785 Lisp_Object attrs, val;
2786
c6b278e7
KH
2787 if (this->id < 0)
2788 continue;
df7492f9
KH
2789 attrs = CODING_ID_ATTRS (this->id);
2790 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2791 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2792 setup_iso_safe_charsets (attrs);
2793 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7
KH
2794 this->max_charset_id = SCHARS (val) - 1;
2795 this->safe_charsets = (char *) SDATA (val);
df7492f9
KH
2796 }
2797
2798 /* A coding system of this category is always ASCII compatible. */
2799 src += coding->head_ascii;
3f003981 2800
ff0dacd7 2801 while (rejected != CATEGORY_MASK_ISO)
4ed46869 2802 {
065e3595 2803 src_base = src;
df7492f9 2804 ONE_MORE_BYTE (c);
4ed46869
KH
2805 switch (c)
2806 {
2807 case ISO_CODE_ESC:
74383408
KH
2808 if (inhibit_iso_escape_detection)
2809 break;
f46869e4 2810 single_shifting = 0;
df7492f9 2811 ONE_MORE_BYTE (c);
d46c5b12 2812 if (c >= '(' && c <= '/')
4ed46869 2813 {
bf9cdd4e 2814 /* Designation sequence for a charset of dimension 1. */
df7492f9 2815 ONE_MORE_BYTE (c1);
d46c5b12 2816 if (c1 < ' ' || c1 >= 0x80
df7492f9 2817 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
d46c5b12
KH
2818 /* Invalid designation sequence. Just ignore. */
2819 break;
bf9cdd4e
KH
2820 }
2821 else if (c == '$')
2822 {
2823 /* Designation sequence for a charset of dimension 2. */
df7492f9 2824 ONE_MORE_BYTE (c);
bf9cdd4e
KH
2825 if (c >= '@' && c <= 'B')
2826 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
ff0dacd7 2827 id = iso_charset_table[1][0][c];
bf9cdd4e 2828 else if (c >= '(' && c <= '/')
bcf26d6a 2829 {
df7492f9 2830 ONE_MORE_BYTE (c1);
d46c5b12 2831 if (c1 < ' ' || c1 >= 0x80
df7492f9 2832 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
d46c5b12
KH
2833 /* Invalid designation sequence. Just ignore. */
2834 break;
bcf26d6a 2835 }
bf9cdd4e 2836 else
ff0dacd7 2837 /* Invalid designation sequence. Just ignore it. */
d46c5b12
KH
2838 break;
2839 }
ae9ff118 2840 else if (c == 'N' || c == 'O')
d46c5b12 2841 {
ae9ff118 2842 /* ESC <Fe> for SS2 or SS3. */
ff0dacd7
KH
2843 single_shifting = 1;
2844 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
d46c5b12 2845 break;
4ed46869 2846 }
cee53ed4
KH
2847 else if (c == '1')
2848 {
2849 /* End of composition. */
2850 if (composition_count < 0
2851 || composition_count > MAX_COMPOSITION_COMPONENTS)
2852 /* Invalid */
2853 break;
2854 composition_count = -1;
2855 found |= CATEGORY_MASK_ISO;
2856 }
ec6d2bb8
KH
2857 else if (c >= '0' && c <= '4')
2858 {
2859 /* ESC <Fp> for start/end composition. */
cee53ed4 2860 composition_count = 0;
ec6d2bb8
KH
2861 break;
2862 }
bf9cdd4e 2863 else
df7492f9 2864 {
ff0dacd7 2865 /* Invalid escape sequence. Just ignore it. */
df7492f9
KH
2866 break;
2867 }
d46c5b12
KH
2868
2869 /* We found a valid designation sequence for CHARSET. */
ff0dacd7 2870 rejected |= CATEGORY_MASK_ISO_8BIT;
df7492f9
KH
2871 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2872 id))
ff0dacd7 2873 found |= CATEGORY_MASK_ISO_7;
d46c5b12 2874 else
ff0dacd7 2875 rejected |= CATEGORY_MASK_ISO_7;
df7492f9
KH
2876 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2877 id))
ff0dacd7 2878 found |= CATEGORY_MASK_ISO_7_TIGHT;
d46c5b12 2879 else
ff0dacd7 2880 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
df7492f9
KH
2881 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2882 id))
ff0dacd7 2883 found |= CATEGORY_MASK_ISO_7_ELSE;
ae9ff118 2884 else
ff0dacd7 2885 rejected |= CATEGORY_MASK_ISO_7_ELSE;
df7492f9
KH
2886 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2887 id))
ff0dacd7 2888 found |= CATEGORY_MASK_ISO_8_ELSE;
ae9ff118 2889 else
ff0dacd7 2890 rejected |= CATEGORY_MASK_ISO_8_ELSE;
4ed46869
KH
2891 break;
2892
4ed46869 2893 case ISO_CODE_SO:
d46c5b12 2894 case ISO_CODE_SI:
ff0dacd7 2895 /* Locking shift out/in. */
74383408
KH
2896 if (inhibit_iso_escape_detection)
2897 break;
f46869e4 2898 single_shifting = 0;
ff0dacd7 2899 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
d46c5b12
KH
2900 break;
2901
4ed46869 2902 case ISO_CODE_CSI:
ff0dacd7 2903 /* Control sequence introducer. */
f46869e4 2904 single_shifting = 0;
ff0dacd7
KH
2905 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2906 found |= CATEGORY_MASK_ISO_8_ELSE;
2907 goto check_extra_latin;
2908
4ed46869
KH
2909 case ISO_CODE_SS2:
2910 case ISO_CODE_SS3:
ff0dacd7
KH
2911 /* Single shift. */
2912 if (inhibit_iso_escape_detection)
2913 break;
75e2a253 2914 single_shifting = 0;
ff0dacd7
KH
2915 rejected |= CATEGORY_MASK_ISO_7BIT;
2916 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2917 & CODING_ISO_FLAG_SINGLE_SHIFT)
75e2a253 2918 found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
ff0dacd7
KH
2919 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2920 & CODING_ISO_FLAG_SINGLE_SHIFT)
75e2a253
KH
2921 found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
2922 if (single_shifting)
2923 break;
ff0dacd7 2924 goto check_extra_latin;
4ed46869
KH
2925
2926 default:
065e3595
KH
2927 if (c < 0)
2928 continue;
4ed46869 2929 if (c < 0x80)
f46869e4 2930 {
cee53ed4
KH
2931 if (composition_count >= 0)
2932 composition_count++;
f46869e4
KH
2933 single_shifting = 0;
2934 break;
2935 }
ff0dacd7 2936 if (c >= 0xA0)
c4825358 2937 {
ff0dacd7
KH
2938 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2939 found |= CATEGORY_MASK_ISO_8_1;
f46869e4 2940 /* Check the length of succeeding codes of the range
ff0dacd7
KH
2941 0xA0..0FF. If the byte length is even, we include
2942 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2943 only when we are not single shifting. */
2944 if (! single_shifting
2945 && ! (rejected & CATEGORY_MASK_ISO_8_2))
f46869e4 2946 {
e17de821 2947 int i = 1;
b73bfc1c
KH
2948 while (src < src_end)
2949 {
df7492f9 2950 ONE_MORE_BYTE (c);
b73bfc1c
KH
2951 if (c < 0xA0)
2952 break;
2953 i++;
2954 }
2955
2956 if (i & 1 && src < src_end)
cee53ed4
KH
2957 {
2958 rejected |= CATEGORY_MASK_ISO_8_2;
2959 if (composition_count >= 0)
2960 composition_count += i;
2961 }
f46869e4 2962 else
cee53ed4
KH
2963 {
2964 found |= CATEGORY_MASK_ISO_8_2;
2965 if (composition_count >= 0)
2966 composition_count += i / 2;
2967 }
f46869e4 2968 }
ff0dacd7 2969 break;
4ed46869 2970 }
ff0dacd7
KH
2971 check_extra_latin:
2972 single_shifting = 0;
2973 if (! VECTORP (Vlatin_extra_code_table)
2974 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2975 {
2976 rejected = CATEGORY_MASK_ISO;
2977 break;
2978 }
2979 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2980 & CODING_ISO_FLAG_LATIN_EXTRA)
2981 found |= CATEGORY_MASK_ISO_8_1;
2982 else
2983 rejected |= CATEGORY_MASK_ISO_8_1;
75e2a253 2984 rejected |= CATEGORY_MASK_ISO_8_2;
4ed46869
KH
2985 }
2986 }
ff0dacd7
KH
2987 detect_info->rejected |= CATEGORY_MASK_ISO;
2988 return 0;
4ed46869 2989
df7492f9 2990 no_more_source:
ff0dacd7
KH
2991 detect_info->rejected |= rejected;
2992 detect_info->found |= (found & ~rejected);
df7492f9 2993 return 1;
4ed46869 2994}
ec6d2bb8 2995
4ed46869 2996
134b9549
KH
/* Record in CODING that the charset selected by dimension DIM, the
   CHARS_96 flag and final byte FINAL is designated to register REG.

   If FINAL is outside '0'..127, or no charset is registered for the
   combination, or the charset is not safe for CODING, the register is
   marked with -2 and CHARS_96 is set to -1.  Otherwise the charset id
   is stored (after the JISX0201-Roman -> ASCII and JISX0208.1978 ->
   JISX0208 substitutions when the corresponding flags of CODING are
   set); CHARS_96 is also set to -1 when this designates ASCII to a
   register previously marked -2.  CHARS_96 == -1 tells the caller
   that the escape sequence should be kept.

   CHARS_96 must be an lvalue.  Uses `coding' from the expansion
   site.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
  do {                                                                  \
    int new_id, old_id;                                                 \
                                                                        \
    if ((final) >= '0' && (final) < 128                                 \
        && (new_id = ISO_CHARSET_TABLE (dim, chars_96, final)) >= 0     \
        && SAFE_CHARSET_P (coding, new_id))                             \
      {                                                                 \
        old_id = CODING_ISO_DESIGNATION (coding, reg);                  \
        if (new_id == charset_jisx0201_roman)                           \
          {                                                             \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)  \
              new_id = charset_ascii;                                   \
          }                                                             \
        else if (new_id == charset_jisx0208_1978)                       \
          {                                                             \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
              new_id = charset_jisx0208;                                \
          }                                                             \
        CODING_ISO_DESIGNATION (coding, reg) = new_id;                  \
        /* If there was an invalid designation to REG previously,      \
           and this designation is ASCII to REG, we should keep this    \
           designation sequence.  */                                    \
        if (old_id == -2 && new_id == charset_ascii)                    \
          chars_96 = -1;                                                \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* Invalid or unsafe designation.  */                           \
        CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
        chars_96 = -1;                                                  \
      }                                                                 \
  } while (0)
d46c5b12 3030
df7492f9
KH
/* If a composition is in progress, abandon it: emit the characters
   collected so far in `components' to `charbuf' as ordinary
   characters, advance `char_offset' by the number of characters
   emitted, and reset `composition_state' to COMPOSING_NO.  Jump to
   `no_more_source' when `charbuf' may lack room.

   For COMPOSITION_RELATIVE and COMPOSITION_WITH_ALTCHARS every
   component is emitted.  For the other methods only the even-indexed
   components are emitted (the odd-indexed ones being rules), i.e.
   (component_idx + 1) / 2 characters.

   Uses the locals `composition_state', `method', `components',
   `component_idx', `charbuf', `charbuf_end' and `char_offset' of the
   expansion site.  */
#define MAYBE_FINISH_COMPOSITION()                              \
  do {                                                          \
    int i;                                                      \
    if (composition_state == COMPOSING_NO)                      \
      break;                                                    \
    /* It is assured that we have enough room for producing     \
       characters stored in the table `components'.  */         \
    if (charbuf + component_idx > charbuf_end)                  \
      goto no_more_source;                                      \
    composition_state = COMPOSING_NO;                           \
    if (method == COMPOSITION_RELATIVE                          \
        || method == COMPOSITION_WITH_ALTCHARS)                 \
      {                                                         \
        for (i = 0; i < component_idx; i++)                     \
          *charbuf++ = components[i];                           \
        char_offset += component_idx;                           \
      }                                                         \
    else                                                        \
      {                                                         \
        for (i = 0; i < component_idx; i += 2)                  \
          *charbuf++ = components[i];                           \
        /* Exactly the number of iterations of the loop above,  \
           whether COMPONENT_IDX is odd or even.  */            \
        char_offset += (component_idx + 1) / 2;                 \
      }                                                         \
  } while (0)
3055
d46c5b12 3056
aa72b389
KH
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar  composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   C1 is the byte that followed ESC.

   When C1 is '0' and the alternate-character part of an ESC 3 or
   ESC 4 composition is being collected, the sequence only marks the
   switch to the character part: `component_len' records how many
   components were collected and the state becomes COMPOSING_CHAR.
   For ESC 3 no rules are read, so the state at that point is still
   COMPOSING_COMPONENT_CHAR; for ESC 4 it is COMPOSING_COMPONENT_RULE.

   Otherwise any composition in progress is finished and a new one is
   started, provided the terminating ESC 1 is found in the remaining
   source.  If it is not (including when no source bytes remain at
   all), jump to `invalid_code' on the last block, or else record
   CODING_RESULT_INSUFFICIENT_SRC and jump to `no_more_source'.

   Uses locals of the expansion site (`coding', `src', `src_end',
   `charbuf', `charbuf_end', `method', `composition_state',
   `component_idx', `component_len').  */

#define DECODE_COMPOSITION_START(c1)                                    \
  do {                                                                  \
    if ((c1) == '0'                                                     \
        && (composition_state == COMPOSING_COMPONENT_RULE               \
            || (composition_state == COMPOSING_COMPONENT_CHAR           \
                && method == COMPOSITION_WITH_ALTCHARS)))               \
      {                                                                 \
        component_len = component_idx;                                  \
        composition_state = COMPOSING_CHAR;                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        const unsigned char *p;                                         \
                                                                        \
        MAYBE_FINISH_COMPOSITION ();                                    \
        if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end)         \
          goto no_more_source;                                          \
        for (p = src; p < src_end - 1; p++)                             \
          if (*p == ISO_CODE_ESC && p[1] == '1')                        \
            break;                                                      \
        /* `>=' rather than `==': when SRC is already at SRC_END the    \
           loop above does not run and P is past SRC_END - 1.  */       \
        if (p >= src_end - 1)                                           \
          {                                                             \
            if (coding->mode & CODING_MODE_LAST_BLOCK)                  \
              goto invalid_code;                                        \
            /* The current composition doesn't end in the current       \
               source.  */                                              \
            record_conversion_result                                    \
              (coding, CODING_RESULT_INSUFFICIENT_SRC);                 \
            goto no_more_source;                                        \
          }                                                             \
                                                                        \
        /* This is surely the start of a composition.  */               \
        method = ((c1) == '0' ? COMPOSITION_RELATIVE                    \
                  : (c1) == '2' ? COMPOSITION_WITH_RULE                 \
                  : (c1) == '3' ? COMPOSITION_WITH_ALTCHARS             \
                  : COMPOSITION_WITH_RULE_ALTCHARS);                    \
        composition_state = ((c1) <= '2' ? COMPOSING_CHAR               \
                             : COMPOSING_COMPONENT_CHAR);               \
        component_idx = component_len = 0;                              \
      }                                                                 \
  } while (0)
3103
ec6d2bb8 3104
df7492f9
KH
/* Handle composition end sequence ESC 1.

   Write a composition annotation for the collected components to
   `charbuf', followed by the composed characters themselves, bump
   `char_offset' once per composed character, mark CODING as
   annotated and leave the composing state.

   For every method except COMPOSITION_RELATIVE the components (all of
   them when no ESC 0 separator was seen, i.e. component_len == 0,
   otherwise the first component_len) are appended to the annotation,
   and the annotation's first slot is then overwritten with the
   negated total length.

   Uses `charbuf', `components', `component_idx', `component_len',
   `method', `char_offset', `composition_state' and `coding' from the
   expansion site.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    int *annotation_head = charbuf;                                     \
    int nchars;                                                         \
    int n;                                                              \
                                                                        \
    if (component_len > 0)                                              \
      nchars = component_idx - component_len;                           \
    else if (method == COMPOSITION_RELATIVE)                            \
      nchars = component_idx;                                           \
    else                                                                \
      nchars = (component_idx + 1) / 2;                                 \
                                                                        \
    ADD_COMPOSITION_DATA (charbuf, nchars, method);                     \
    if (method != COMPOSITION_RELATIVE)                                 \
      {                                                                 \
        int ncomponents                                                 \
          = component_len == 0 ? component_idx : component_len;         \
                                                                        \
        for (n = 0; n < ncomponents; n++)                               \
          *charbuf++ = components[n];                                   \
        /* Negative distance from the current end back to the head.  */ \
        *annotation_head = annotation_head - charbuf;                   \
      }                                                                 \
    if (method == COMPOSITION_WITH_RULE)                                \
      {                                                                 \
        /* Characters sit at the even indices; rules in between.  */    \
        for (n = 0; n < component_idx; n += 2, char_offset++)           \
          *charbuf++ = components[n];                                   \
      }                                                                 \
    else                                                                \
      {                                                                 \
        for (n = component_len; n < component_idx; n++, char_offset++)  \
          *charbuf++ = components[n];                                   \
      }                                                                 \
    coding->annotated = 1;                                              \
    composition_state = COMPOSING_NO;                                   \
  } while (0)
3135
df7492f9 3136
ec6d2bb8
KH
/* Decode a composition rule from the byte C1 (and maybe one more byte
   read from the source into `c2') and store the encoded rule back
   into C1.  C1 is set to 0 when the byte is outside both known
   formats.

   C1 must be an lvalue.  `c2' and the variables needed by
   ONE_MORE_BYTE come from the expansion site.  */

#define DECODE_COMPOSITION_RULE(c1)                                     \
  do {                                                                  \
    (c1) -= 32;                                                         \
    if ((c1) >= 93)                                                     \
      (c1) = 0;                                                         \
    else if ((c1) >= 81)        /* new format (after ver.21) */         \
      {                                                                 \
        ONE_MORE_BYTE (c2);                                             \
        (c1) = COMPOSITION_ENCODE_RULE ((c1) - 81, c2 - 32);            \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int gref = (c1) / 9;                                            \
        int nref = (c1) % 9;                                            \
                                                                        \
        if (gref == 4)                                                  \
          gref = 10;                                                    \
        if (nref == 4)                                                  \
          nref = 10;                                                    \
        (c1) = COMPOSITION_ENCODE_RULE (gref, nref);                    \
      }                                                                 \
  } while (0)
88993dfd 3160
d46c5b12 3161
4ed46869
KH
3162/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3163
b73bfc1c 3164static void
df7492f9 3165decode_coding_iso_2022 (coding)
4ed46869 3166 struct coding_system *coding;
4ed46869 3167{
8f924df7
KH
3168 const unsigned char *src = coding->source + coding->consumed;
3169 const unsigned char *src_end = coding->source + coding->src_bytes;
3170 const unsigned char *src_base;
69a80ea3 3171 int *charbuf = coding->charbuf + coding->charbuf_used;
ff0dacd7 3172 int *charbuf_end
69a80ea3 3173 = coding->charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
df7492f9 3174 int consumed_chars = 0, consumed_chars_base;
df7492f9 3175 int multibytep = coding->src_multibyte;
4ed46869 3176 /* Charsets invoked to graphic plane 0 and 1 respectively. */
df7492f9
KH
3177 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3178 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
134b9549 3179 int charset_id_2, charset_id_3;
df7492f9
KH
3180 struct charset *charset;
3181 int c;
3182 /* For handling composition sequence. */
3183#define COMPOSING_NO 0
3184#define COMPOSING_CHAR 1
3185#define COMPOSING_RULE 2
3186#define COMPOSING_COMPONENT_CHAR 3
3187#define COMPOSING_COMPONENT_RULE 4
3188
3189 int composition_state = COMPOSING_NO;
3190 enum composition_method method;
3191 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
3192 int component_idx;
3193 int component_len;
24a73b0a 3194 Lisp_Object attrs, charset_list;
ff0dacd7
KH
3195 int char_offset = coding->produced_char;
3196 int last_offset = char_offset;
3197 int last_id = charset_ascii;
119852e7
KH
3198 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3199 int byte_after_cr = -1;
df7492f9 3200
24a73b0a 3201 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9 3202 setup_iso_safe_charsets (attrs);
287c57d7
KH
3203 /* Charset list may have been changed. */
3204 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3205 coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
b73bfc1c
KH
3206
3207 while (1)
4ed46869 3208 {
463f5630 3209 int c1, c2;
b73bfc1c
KH
3210
3211 src_base = src;
df7492f9
KH
3212 consumed_chars_base = consumed_chars;
3213
3214 if (charbuf >= charbuf_end)
b71f6f73
KH
3215 {
3216 if (byte_after_cr >= 0)
3217 src_base--;
3218 break;
3219 }
df7492f9 3220
119852e7
KH
3221 if (byte_after_cr >= 0)
3222 c1 = byte_after_cr, byte_after_cr = -1;
3223 else
3224 ONE_MORE_BYTE (c1);
065e3595
KH
3225 if (c1 < 0)
3226 goto invalid_code;
4ed46869 3227
98725083 3228 /* We produce at most one character. */
4ed46869
KH
3229 switch (iso_code_class [c1])
3230 {
3231 case ISO_0x20_or_0x7F:
df7492f9 3232 if (composition_state != COMPOSING_NO)
ec6d2bb8 3233 {
df7492f9
KH
3234 if (composition_state == COMPOSING_RULE
3235 || composition_state == COMPOSING_COMPONENT_RULE)
3236 {
cee53ed4
KH
3237 if (component_idx < MAX_COMPOSITION_COMPONENTS * 2 + 1)
3238 {
3239 DECODE_COMPOSITION_RULE (c1);
3240 components[component_idx++] = c1;
3241 composition_state--;
3242 continue;
3243 }
3244 /* Too long composition. */
3245 MAYBE_FINISH_COMPOSITION ();
df7492f9 3246 }
4ed46869 3247 }
df7492f9
KH
3248 if (charset_id_0 < 0
3249 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
781d7a48
KH
3250 /* This is SPACE or DEL. */
3251 charset = CHARSET_FROM_ID (charset_ascii);
3252 else
3253 charset = CHARSET_FROM_ID (charset_id_0);
3254 break;
4ed46869
KH
3255
3256 case ISO_graphic_plane_0:
781d7a48 3257 if (composition_state != COMPOSING_NO)
b73bfc1c 3258 {
781d7a48
KH
3259 if (composition_state == COMPOSING_RULE
3260 || composition_state == COMPOSING_COMPONENT_RULE)
3261 {
cee53ed4
KH
3262 if (component_idx < MAX_COMPOSITION_COMPONENTS * 2 + 1)
3263 {
3264 DECODE_COMPOSITION_RULE (c1);
3265 components[component_idx++] = c1;
3266 composition_state--;
3267 continue;
3268 }
3269 MAYBE_FINISH_COMPOSITION ();
781d7a48 3270 }
b73bfc1c 3271 }
134b9549
KH
3272 if (charset_id_0 < 0)
3273 charset = CHARSET_FROM_ID (charset_ascii);
3274 else
3275 charset = CHARSET_FROM_ID (charset_id_0);
4ed46869
KH
3276 break;
3277
3278 case ISO_0xA0_or_0xFF:
df7492f9
KH
3279 if (charset_id_1 < 0
3280 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3281 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3282 goto invalid_code;
4ed46869
KH
3283 /* This is a graphic character, we fall down ... */
3284
3285 case ISO_graphic_plane_1:
df7492f9
KH
3286 if (charset_id_1 < 0)
3287 goto invalid_code;
3288 charset = CHARSET_FROM_ID (charset_id_1);
4ed46869
KH
3289 break;
3290
df7492f9 3291 case ISO_control_0:
119852e7
KH
3292 if (eol_crlf && c1 == '\r')
3293 ONE_MORE_BYTE (byte_after_cr);
df7492f9
KH
3294 MAYBE_FINISH_COMPOSITION ();
3295 charset = CHARSET_FROM_ID (charset_ascii);
4ed46869
KH
3296 break;
3297
df7492f9
KH
3298 case ISO_control_1:
3299 MAYBE_FINISH_COMPOSITION ();
3300 goto invalid_code;
3301
4ed46869 3302 case ISO_shift_out:
df7492f9
KH
3303 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3304 || CODING_ISO_DESIGNATION (coding, 1) < 0)
3305 goto invalid_code;
3306 CODING_ISO_INVOCATION (coding, 0) = 1;
3307 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3308 continue;
4ed46869
KH
3309
3310 case ISO_shift_in:
df7492f9
KH
3311 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3312 goto invalid_code;
3313 CODING_ISO_INVOCATION (coding, 0) = 0;
3314 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3315 continue;
4ed46869
KH
3316
3317 case ISO_single_shift_2_7:
3318 case ISO_single_shift_2:
df7492f9
KH
3319 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3320 goto invalid_code;
4ed46869
KH
3321 /* SS2 is handled as an escape sequence of ESC 'N' */
3322 c1 = 'N';
3323 goto label_escape_sequence;
3324
3325 case ISO_single_shift_3:
df7492f9
KH
3326 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3327 goto invalid_code;
4ed46869
KH
3328 /* SS2 is handled as an escape sequence of ESC 'O' */
3329 c1 = 'O';
3330 goto label_escape_sequence;
3331
3332 case ISO_control_sequence_introducer:
3333 /* CSI is handled as an escape sequence of ESC '[' ... */
3334 c1 = '[';
3335 goto label_escape_sequence;
3336
3337 case ISO_escape:
3338 ONE_MORE_BYTE (c1);
3339 label_escape_sequence:
df7492f9 3340 /* Escape sequences handled here are invocation,
4ed46869
KH
3341 designation, direction specification, and character
3342 composition specification. */
3343 switch (c1)
3344 {
3345 case '&': /* revision of following character set */
3346 ONE_MORE_BYTE (c1);
3347 if (!(c1 >= '@' && c1 <= '~'))
df7492f9 3348 goto invalid_code;
4ed46869
KH
3349 ONE_MORE_BYTE (c1);
3350 if (c1 != ISO_CODE_ESC)
df7492f9 3351 goto invalid_code;
4ed46869
KH
3352 ONE_MORE_BYTE (c1);
3353 goto label_escape_sequence;
3354
3355 case '$': /* designation of 2-byte character set */
df7492f9
KH
3356 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3357 goto invalid_code;
134b9549
KH
3358 {
3359 int reg, chars96;
3360
3361 ONE_MORE_BYTE (c1);
3362 if (c1 >= '@' && c1 <= 'B')
3363 { /* designation of JISX0208.1978, GB2312.1980,
88993dfd 3364 or JISX0208.1980 */
134b9549
KH
3365 reg = 0, chars96 = 0;
3366 }
3367 else if (c1 >= 0x28 && c1 <= 0x2B)
3368 { /* designation of DIMENSION2_CHARS94 character set */
3369 reg = c1 - 0x28, chars96 = 0;
3370 ONE_MORE_BYTE (c1);
3371 }
3372 else if (c1 >= 0x2C && c1 <= 0x2F)
3373 { /* designation of DIMENSION2_CHARS96 character set */
3374 reg = c1 - 0x2C, chars96 = 1;
3375 ONE_MORE_BYTE (c1);
3376 }
3377 else
3378 goto invalid_code;
3379 DECODE_DESIGNATION (reg, 2, chars96, c1);
3380 /* We must update these variables now. */
3381 if (reg == 0)
3382 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3383 else if (reg == 1)
3384 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3385 if (chars96 < 0)
3386 goto invalid_code;
3387 }
b73bfc1c 3388 continue;
4ed46869
KH
3389
3390 case 'n': /* invocation of locking-shift-2 */
df7492f9
KH
3391 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3392 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3393 goto invalid_code;
3394 CODING_ISO_INVOCATION (coding, 0) = 2;
3395 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3396 continue;
4ed46869
KH
3397
3398 case 'o': /* invocation of locking-shift-3 */
df7492f9
KH
3399 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3400 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3401 goto invalid_code;
3402 CODING_ISO_INVOCATION (coding, 0) = 3;
3403 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3404 continue;
4ed46869
KH
3405
3406 case 'N': /* invocation of single-shift-2 */
df7492f9
KH
3407 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3408 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3409 goto invalid_code;
134b9549
KH
3410 charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3411 if (charset_id_2 < 0)
3412 charset = CHARSET_FROM_ID (charset_ascii);
3413 else
3414 charset = CHARSET_FROM_ID (charset_id_2);
b73bfc1c 3415 ONE_MORE_BYTE (c1);
e7046a18 3416 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
df7492f9 3417 goto invalid_code;
4ed46869
KH
3418 break;
3419
3420 case 'O': /* invocation of single-shift-3 */
df7492f9
KH
3421 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3422 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3423 goto invalid_code;
134b9549
KH
3424 charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3425 if (charset_id_3 < 0)
3426 charset = CHARSET_FROM_ID (charset_ascii);
3427 else
3428 charset = CHARSET_FROM_ID (charset_id_3);
b73bfc1c 3429 ONE_MORE_BYTE (c1);
e7046a18 3430 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
df7492f9 3431 goto invalid_code;
4ed46869
KH
3432 break;
3433
ec6d2bb8 3434 case '0': case '2': case '3': case '4': /* start composition */
df7492f9
KH
3435 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3436 goto invalid_code;
ec6d2bb8 3437 DECODE_COMPOSITION_START (c1);
b73bfc1c 3438 continue;
4ed46869 3439
ec6d2bb8 3440 case '1': /* end composition */
df7492f9
KH
3441 if (composition_state == COMPOSING_NO)
3442 goto invalid_code;
3443 DECODE_COMPOSITION_END ();
b73bfc1c 3444 continue;
4ed46869
KH
3445
3446 case '[': /* specification of direction */
df7492f9
KH
3447 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3448 goto invalid_code;
4ed46869 3449 /* For the moment, nested direction is not supported.
d46c5b12 3450 So, `coding->mode & CODING_MODE_DIRECTION' zero means
df7492f9 3451 left-to-right, and nozero means right-to-left. */
4ed46869
KH
3452 ONE_MORE_BYTE (c1);
3453 switch (c1)
3454 {
3455 case ']': /* end of the current direction */
d46c5b12 3456 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869
KH
3457
3458 case '0': /* end of the current direction */
3459 case '1': /* start of left-to-right direction */
3460 ONE_MORE_BYTE (c1);
3461 if (c1 == ']')
d46c5b12 3462 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869 3463 else
df7492f9 3464 goto invalid_code;
4ed46869
KH
3465 break;
3466
3467 case '2': /* start of right-to-left direction */
3468 ONE_MORE_BYTE (c1);
3469 if (c1 == ']')
d46c5b12 3470 coding->mode |= CODING_MODE_DIRECTION;
4ed46869 3471 else
df7492f9 3472 goto invalid_code;
4ed46869
KH
3473 break;
3474
3475 default:
df7492f9 3476 goto invalid_code;
4ed46869 3477 }
b73bfc1c 3478 continue;
4ed46869 3479
103e0180 3480 case '%':
103e0180
KH
3481 ONE_MORE_BYTE (c1);
3482 if (c1 == '/')
3483 {
3484 /* CTEXT extended segment:
3485 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3486 We keep these bytes as is for the moment.
3487 They may be decoded by post-read-conversion. */
3488 int dim, M, L;
4776e638 3489 int size;
8f924df7 3490
103e0180
KH
3491 ONE_MORE_BYTE (dim);
3492 ONE_MORE_BYTE (M);
3493 ONE_MORE_BYTE (L);
3494 size = ((M - 128) * 128) + (L - 128);
4776e638
KH
3495 if (charbuf + 8 + size > charbuf_end)
3496 goto break_loop;
3497 *charbuf++ = ISO_CODE_ESC;
3498 *charbuf++ = '%';
3499 *charbuf++ = '/';
3500 *charbuf++ = dim;
3501 *charbuf++ = BYTE8_TO_CHAR (M);
3502 *charbuf++ = BYTE8_TO_CHAR (L);
103e0180
KH
3503 while (size-- > 0)
3504 {
3505 ONE_MORE_BYTE (c1);
4776e638 3506 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
103e0180 3507 }
103e0180
KH
3508 }
3509 else if (c1 == 'G')
3510 {
103e0180
KH
3511 /* XFree86 extension for embedding UTF-8 in CTEXT:
3512 ESC % G --UTF-8-BYTES-- ESC % @
3513 We keep these bytes as is for the moment.
3514 They may be decoded by post-read-conversion. */
4776e638
KH
3515 int *p = charbuf;
3516
3517 if (p + 6 > charbuf_end)
3518 goto break_loop;
3519 *p++ = ISO_CODE_ESC;
3520 *p++ = '%';
3521 *p++ = 'G';
3522 while (p < charbuf_end)
103e0180
KH
3523 {
3524 ONE_MORE_BYTE (c1);
3525 if (c1 == ISO_CODE_ESC
3526 && src + 1 < src_end
3527 && src[0] == '%'
3528 && src[1] == '@')
9ffd559c
KH
3529 {
3530 src += 2;
3531 break;
3532 }
4776e638 3533 *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
103e0180 3534 }
4776e638
KH
3535 if (p + 3 > charbuf_end)
3536 goto break_loop;
3537 *p++ = ISO_CODE_ESC;
3538 *p++ = '%';
3539 *p++ = '@';
3540 charbuf = p;
103e0180
KH
3541 }
3542 else
4776e638 3543 goto invalid_code;
103e0180 3544 continue;
4776e638 3545 break;
103e0180 3546
4ed46869 3547 default:
df7492f9
KH
3548 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3549 goto invalid_code;
134b9549
KH
3550 {
3551 int reg, chars96;
3552
3553 if (c1 >= 0x28 && c1 <= 0x2B)
3554 { /* designation of DIMENSION1_CHARS94 character set */
3555 reg = c1 - 0x28, chars96 = 0;
3556 ONE_MORE_BYTE (c1);
3557 }
3558 else if (c1 >= 0x2C && c1 <= 0x2F)
3559 { /* designation of DIMENSION1_CHARS96 character set */
3560 reg = c1 - 0x2C, chars96 = 1;
3561 ONE_MORE_BYTE (c1);
3562 }
3563 else
3564 goto invalid_code;
3565 DECODE_DESIGNATION (reg, 1, chars96, c1);
3566 /* We must update these variables now. */
3567 if (reg == 0)
3568 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3569 else if (reg == 1)
3570 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3571 if (chars96 < 0)
3572 goto invalid_code;
3573 }
b73bfc1c 3574 continue;
4ed46869 3575 }
b73bfc1c 3576 }
4ed46869 3577
ff0dacd7
KH
3578 if (charset->id != charset_ascii
3579 && last_id != charset->id)
3580 {
3581 if (last_id != charset_ascii)
69a80ea3 3582 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
ff0dacd7
KH
3583 last_id = charset->id;
3584 last_offset = char_offset;
3585 }
3586
b73bfc1c 3587 /* Now we know CHARSET and 1st position code C1 of a character.
df7492f9
KH
3588 Produce a decoded character while getting 2nd position code
3589 C2 if necessary. */
3590 c1 &= 0x7F;
3591 if (CHARSET_DIMENSION (charset) > 1)
b73bfc1c
KH
3592 {
3593 ONE_MORE_BYTE (c2);
df7492f9 3594 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
b73bfc1c 3595 /* C2 is not in a valid range. */
df7492f9
KH
3596 goto invalid_code;
3597 c1 = (c1 << 8) | (c2 & 0x7F);
3598 if (CHARSET_DIMENSION (charset) > 2)
3599 {
3600 ONE_MORE_BYTE (c2);
3601 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3602 /* C2 is not in a valid range. */
3603 goto invalid_code;
3604 c1 = (c1 << 8) | (c2 & 0x7F);
3605 }
3606 }
3607
3608 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3609 if (c < 0)
3610 {
3611 MAYBE_FINISH_COMPOSITION ();
3612 for (; src_base < src; src_base++, char_offset++)
3613 {
3614 if (ASCII_BYTE_P (*src_base))
3615 *charbuf++ = *src_base;
3616 else
3617 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3618 }
3619 }
3620 else if (composition_state == COMPOSING_NO)
3621 {
3622 *charbuf++ = c;
3623 char_offset++;
4ed46869 3624 }
df7492f9 3625 else
781d7a48 3626 {
cee53ed4
KH
3627 if (component_idx < MAX_COMPOSITION_COMPONENTS * 2 + 1)
3628 {
3629 components[component_idx++] = c;
3630 if (method == COMPOSITION_WITH_RULE
3631 || (method == COMPOSITION_WITH_RULE_ALTCHARS
3632 && composition_state == COMPOSING_COMPONENT_CHAR))
3633 composition_state++;
3634 }
3635 else
3636 {
3637 MAYBE_FINISH_COMPOSITION ();
3638 *charbuf++ = c;
3639 char_offset++;
3640 }
4ed46869
KH
3641 }
3642 continue;
3643
df7492f9
KH
3644 invalid_code:
3645 MAYBE_FINISH_COMPOSITION ();
4ed46869 3646 src = src_base;
df7492f9
KH
3647 consumed_chars = consumed_chars_base;
3648 ONE_MORE_BYTE (c);
065e3595 3649 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 3650 char_offset++;
df7492f9 3651 coding->errors++;
4776e638
KH
3652 continue;
3653
3654 break_loop:
3655 break;
4ed46869 3656 }
fb88bf2d 3657
df7492f9 3658 no_more_source:
ff0dacd7 3659 if (last_id != charset_ascii)
69a80ea3 3660 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
3661 coding->consumed_char += consumed_chars_base;
3662 coding->consumed = src_base - coding->source;
3663 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
3664}
3665
b73bfc1c 3666
f4dee582 3667/* ISO2022 encoding stuff. */
4ed46869
KH
3668
3669/*
f4dee582 3670 It is not enough to say just "ISO2022" on encoding, we have to
df7492f9 3671 specify more details. In Emacs, each coding system of ISO2022
4ed46869 3672 variant has the following specifications:
df7492f9 3673 1. Initial designation to G0 thru G3.
4ed46869
KH
3674 2. Allows short-form designation?
3675 3. ASCII should be designated to G0 before control characters?
3676 4. ASCII should be designated to G0 at end of line?
3677 5. 7-bit environment or 8-bit environment?
3678 6. Use locking-shift?
3679 7. Use Single-shift?
3680 And the following two are only for Japanese:
3681 8. Use ASCII in place of JIS0201-1976-Roman?
3682 9. Use JISX0208-1983 in place of JISX0208-1978?
df7492f9
KH
3683 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3684 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
f4dee582 3685 details.
4ed46869
KH
3686*/
3687
/* Emit (through the EMIT_* macros) the escape sequence that designates
   CHARSET to graphic register REG, then record that designation in
   CODING.

   When CODING has CODING_ISO_FLAG_REVISION set and CHARSET has a
   non-negative revision number, the sequence is preceded by
   ESC & and the byte '@' + revision.

   For a charset of dimension 1 the sequence is ESC, an intermediate
   byte selected by REG, and the final byte.  For other charsets it is
   ESC $, optionally the intermediate byte, and the final byte; the
   intermediate byte is left out (short form) only for a 94-chars set
   designated to register 0 whose final byte is '@', 'A', or 'B', and
   only when CODING does not have CODING_ISO_FLAG_LONG_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char desig_final = CHARSET_ISO_FINAL (charset);            \
    /* Intermediate bytes, indexed by graphic register number.  */      \
    char *desig_imd_94 = "()*+";                                        \
    char *desig_imd_96 = ",-./";                                        \
    char *desig_imd = (CHARSET_ISO_CHARS_96 (charset)                   \
                       ? desig_imd_96 : desig_imd_94);                  \
    int desig_revision                                                  \
      = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)         \
         ? CHARSET_ISO_REVISION (charset) : -1);                        \
                                                                        \
    if (desig_revision >= 0)                                            \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + desig_revision);                           \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        /* Only the short form drops the intermediate byte.  */         \
        if (CHARSET_ISO_CHARS_96 (charset)                              \
            || (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM)  \
            || reg != 0                                                 \
            || desig_final < '@' || desig_final > 'B')                  \
          EMIT_ONE_ASCII_BYTE (desig_imd[reg]);                         \
      }                                                                 \
    else                                                                \
      {                                                                 \
        int desig_c = desig_imd[reg];                                   \
                                                                        \
        EMIT_ONE_ASCII_BYTE (desig_c);                                  \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (desig_final);                                  \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
3735
df7492f9 3736
/* Emit the ISO2022 single-shift-2 (resp. single-shift-3) function and
   mark the coding system as being in single-shift state.

   When the variable `coding' in scope at the point of use has
   CODING_ISO_FLAG_SEVEN_BITS set, the function is written as the
   escape sequence ESC N (resp. ESC O); otherwise it is written as the
   single byte ISO_CODE_SS2 (resp. ISO_CODE_SS3).  */

#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)


#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
3759
df7492f9 3760
/* The following four macros produce codes (control character or
   escape sequence) for ISO2022 locking-shift functions (shift-in,
   shift-out, locking-shift-2, and locking-shift-3).  Each of them
   also records in the variable `coding' in scope at the point of use
   which graphic register (0..3) is now invoked to graphic plane 0.  */

/* Shift-in: the single byte ISO_CODE_SI; register 0 is invoked.  */
#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
  } while (0)


/* Shift-out: the single byte ISO_CODE_SO; register 1 is invoked.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
  } while (0)


/* Locking-shift-2: ESC n; register 2 is invoked.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
  } while (0)


/* Locking-shift-3: ESC o; register 3 is invoked.  The final byte must
   be 'o' -- 'n' is locking-shift-2, and emitting it here would make a
   reader invoke register 2 while we record register 3.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
3791
df7492f9 3792
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be an lvalue: when the variable `coding' in scope has
   CODING_ISO_FLAG_USE_ROMAN set and CHARSET is ASCII, CHARSET is
   replaced by the charset whose id is charset_jisx0201_roman.

   The body is a loop.  Each pass either emits the byte and leaves
   (pending single-shift, CHARSET invoked to plane 0, or CHARSET
   invoked to plane 1), or calls encode_invocation_designation and
   tries again.  C1 is parenthesized wherever it is used so that an
   expression argument is masked as a whole.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single-shift is pending; it covers exactly this byte.  */  \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
3835
df7492f9 3836
/* Emit the two bytes of a DIMENSION2 character of CHARSET whose
   position-codes are C1 and C2, producing designation and invocation
   codes beforehand when needed.

   CHARSET must be an lvalue: when the variable `coding' in scope has
   CODING_ISO_FLAG_USE_OLDJIS set and CHARSET is charset_jisx0208,
   CHARSET is replaced by the charset whose id is
   charset_jisx0208_1978.

   The body is a loop that is left as soon as the bytes have been
   emitted; until then each pass calls encode_invocation_designation
   and tries again.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int dim2_id = CHARSET_ID (charset);                                 \
                                                                        \
    if (dim2_id == charset_jisx0208                                     \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        dim2_id = charset_jisx0208_1978;                                \
        charset = CHARSET_FROM_ID (dim2_id);                            \
      }                                                                 \
                                                                        \
    /* A pending single-shift takes precedence over invocations.  */    \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)) \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        else                                                            \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* CHARSET invoked to graphic plane 0: 7-bit form.  */              \
    if (dim2_id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* CHARSET invoked to graphic plane 1: 8-bit form.  */              \
    if (dim2_id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* CHARSET is not invoked to either plane yet.  Have it invoked     \
       (designating it first if needed), then go around again to        \
       actually produce the character.  */                              \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
3879
05e6f5dc 3880
/* Encode character C in CHARSET: obtain its code with ENCODE_CHAR,
   then hand it to ENCODE_ISO_CHARACTER_DIMENSION1 when CHARSET has
   dimension 1, or otherwise to ENCODE_ISO_CHARACTER_DIMENSION2 with
   the code split into (code >> 8) and (code & 0xFF).  */

#define ENCODE_ISO_CHARACTER(charset, c)                                \
  do {                                                                  \
    int iso_code = ENCODE_CHAR ((charset), (c));                        \
                                                                        \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), iso_code >> 8,        \
                                       iso_code & 0xFF);                \
    else                                                                \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), iso_code);            \
  } while (0)
bdd9fb48 3890
05e6f5dc 3891
4ed46869 3892/* Produce designation and invocation codes at a place pointed by DST
df7492f9 3893 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
4ed46869
KH
3894 Return new DST. */
3895
3896unsigned char *
df7492f9
KH
3897encode_invocation_designation (charset, coding, dst, p_nchars)
3898 struct charset *charset;
4ed46869
KH
3899 struct coding_system *coding;
3900 unsigned char *dst;
df7492f9 3901 int *p_nchars;
4ed46869 3902{
df7492f9
KH
3903 int multibytep = coding->dst_multibyte;
3904 int produced_chars = *p_nchars;
4ed46869 3905 int reg; /* graphic register number */
df7492f9 3906 int id = CHARSET_ID (charset);
4ed46869
KH
3907
3908 /* At first, check designations. */
3909 for (reg = 0; reg < 4; reg++)
df7492f9 3910 if (id == CODING_ISO_DESIGNATION (coding, reg))
4ed46869
KH
3911 break;
3912
3913 if (reg >= 4)
3914 {
3915 /* CHARSET is not yet designated to any graphic registers. */
3916 /* At first check the requested designation. */
df7492f9
KH
3917 reg = CODING_ISO_REQUEST (coding, id);
3918 if (reg < 0)
1ba9e4ab
KH
3919 /* Since CHARSET requests no special designation, designate it
3920 to graphic register 0. */
4ed46869
KH
3921 reg = 0;
3922
3923 ENCODE_DESIGNATION (charset, reg, coding);
3924 }
3925
df7492f9
KH
3926 if (CODING_ISO_INVOCATION (coding, 0) != reg
3927 && CODING_ISO_INVOCATION (coding, 1) != reg)
4ed46869
KH
3928 {
3929 /* Since the graphic register REG is not invoked to any graphic
3930 planes, invoke it to graphic plane 0. */
3931 switch (reg)
3932 {
3933 case 0: /* graphic register 0 */
3934 ENCODE_SHIFT_IN;
3935 break;
3936
3937 case 1: /* graphic register 1 */
3938 ENCODE_SHIFT_OUT;
3939 break;
3940
3941 case 2: /* graphic register 2 */
df7492f9 3942 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4ed46869
KH
3943 ENCODE_SINGLE_SHIFT_2;
3944 else
3945 ENCODE_LOCKING_SHIFT_2;
3946 break;
3947
3948 case 3: /* graphic register 3 */
df7492f9 3949 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4ed46869
KH
3950 ENCODE_SINGLE_SHIFT_3;
3951 else
3952 ENCODE_LOCKING_SHIFT_3;
3953 break;
3954 }
3955 }
b73bfc1c 3956
df7492f9 3957 *p_nchars = produced_chars;
4ed46869
KH
3958 return dst;
3959}
3960
/* The following three macros produce codes for indicating direction
   of text.  They use the variable `coding' in scope at the point of
   use.  */

/* Emit the control sequence introducer: ESC [ when CODING has
   CODING_ISO_FLAG_SEVEN_BITS set, the single byte ISO_CODE_CSI
   otherwise.  The flag is tested with `&', as everywhere else in this
   file; comparing the whole flag word with `==' would take the 7-bit
   branch only when no other flag happens to be set.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
  do {                                                                  \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
    else                                                                \
      EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
  } while (0)


/* Emit the introducer followed by "2]".
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it
   is used here without an argument list.  */
#define ENCODE_DIRECTION_R2L()                          \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    EMIT_TWO_ASCII_BYTES ('2', ']');                    \
  } while (0)


/* Emit the introducer followed by "0]".  */
#define ENCODE_DIRECTION_L2R()                          \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    EMIT_TWO_ASCII_BYTES ('0', ']');                    \
  } while (0)
4ed46869 3984
4ed46869
KH
3985
/* Produce the codes that bring the graphic planes and registers of
   the variable `coding' in scope back to their initial state: a
   shift-in when something other than register 0 is invoked to plane
   0, then, for each of the four registers that has an initial charset
   (id >= 0) different from its current designation, a designation of
   that initial charset.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reset_reg;                                                      \
    struct charset *reset_cs;                                           \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    for (reset_reg = 0; reset_reg < 4; reset_reg++)                     \
      {                                                                 \
        /* No initial charset for this register.  */                    \
        if (CODING_ISO_INITIAL (coding, reset_reg) < 0)                 \
          continue;                                                     \
        /* Already holds its initial charset.  */                       \
        if (CODING_ISO_DESIGNATION (coding, reset_reg)                  \
            == CODING_ISO_INITIAL (coding, reset_reg))                  \
          continue;                                                     \
        reset_cs                                                        \
          = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reset_reg));   \
        ENCODE_DESIGNATION (reset_cs, reset_reg, coding);               \
      }                                                                 \
  } while (0)
4004
df7492f9 4005
bdd9fb48 4006/* Produce designation sequences of charsets in the line started from
b73bfc1c 4007 SRC to a place pointed by DST, and return updated DST.
bdd9fb48
KH
4008
4009 If the current block ends before any end-of-line, we may fail to
d46c5b12
KH
4010 find all the necessary designations. */
4011
b73bfc1c 4012static unsigned char *
df7492f9 4013encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
e0e989f6 4014 struct coding_system *coding;
df7492f9
KH
4015 int *charbuf, *charbuf_end;
4016 unsigned char *dst;
e0e989f6 4017{
df7492f9 4018 struct charset *charset;
bdd9fb48
KH
4019 /* Table of charsets to be designated to each graphic register. */
4020 int r[4];
df7492f9
KH
4021 int c, found = 0, reg;
4022 int produced_chars = 0;
4023 int multibytep = coding->dst_multibyte;
4024 Lisp_Object attrs;
4025 Lisp_Object charset_list;
4026
4027 attrs = CODING_ID_ATTRS (coding->id);
4028 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4029 if (EQ (charset_list, Qiso_2022))
4030 charset_list = Viso_2022_charset_list;
bdd9fb48
KH
4031
4032 for (reg = 0; reg < 4; reg++)
4033 r[reg] = -1;
4034
b73bfc1c 4035 while (found < 4)
e0e989f6 4036 {
df7492f9
KH
4037 int id;
4038
4039 c = *charbuf++;
b73bfc1c
KH
4040 if (c == '\n')
4041 break;
df7492f9
KH
4042 charset = char_charset (c, charset_list, NULL);
4043 id = CHARSET_ID (charset);
4044 reg = CODING_ISO_REQUEST (coding, id);
4045 if (reg >= 0 && r[reg] < 0)
bdd9fb48
KH
4046 {
4047 found++;
df7492f9 4048 r[reg] = id;
bdd9fb48 4049 }
bdd9fb48
KH
4050 }
4051
4052 if (found)
4053 {
4054 for (reg = 0; reg < 4; reg++)
4055 if (r[reg] >= 0
df7492f9
KH
4056 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4057 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
e0e989f6 4058 }
b73bfc1c
KH
4059
4060 return dst;
e0e989f6
KH
4061}
4062
4ed46869
KH
4063/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
4064
df7492f9
KH
4065static int
4066encode_coding_iso_2022 (coding)
4ed46869 4067 struct coding_system *coding;
4ed46869 4068{
df7492f9
KH
4069 int multibytep = coding->dst_multibyte;
4070 int *charbuf = coding->charbuf;
4071 int *charbuf_end = charbuf + coding->charbuf_used;
4072 unsigned char *dst = coding->destination + coding->produced;
4073 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4074 int safe_room = 16;
4075 int bol_designation
4076 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4077 && CODING_ISO_BOL (coding));
4078 int produced_chars = 0;
4079 Lisp_Object attrs, eol_type, charset_list;
4080 int ascii_compatible;
b73bfc1c 4081 int c;
ff0dacd7 4082 int preferred_charset_id = -1;
05e6f5dc 4083
24a73b0a
KH
4084 CODING_GET_INFO (coding, attrs, charset_list);
4085 eol_type = CODING_ID_EOL_TYPE (coding->id);
4086 if (VECTORP (eol_type))
4087 eol_type = Qunix;
4088
004068e4 4089 setup_iso_safe_charsets (attrs);
ff0dacd7 4090 /* Charset list may have been changed. */
287c57d7 4091 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8f924df7 4092 coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
0eecad43 4093
df7492f9 4094 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
bdd9fb48 4095
df7492f9 4096 while (charbuf < charbuf_end)
4ed46869 4097 {
df7492f9 4098 ASSURE_DESTINATION (safe_room);
b73bfc1c 4099
df7492f9 4100 if (bol_designation)
b73bfc1c 4101 {
df7492f9 4102 unsigned char *dst_prev = dst;
4ed46869 4103
bdd9fb48 4104 /* We have to produce designation sequences if any now. */
df7492f9
KH
4105 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
4106 bol_designation = 0;
4107 /* We are sure that designation sequences are all ASCII bytes. */
4108 produced_chars += dst - dst_prev;
e0e989f6
KH
4109 }
4110
df7492f9 4111 c = *charbuf++;
ec6d2bb8 4112
ff0dacd7
KH
4113 if (c < 0)
4114 {
4115 /* Handle an annotation. */
4116 switch (*charbuf)
ec6d2bb8 4117 {
ff0dacd7
KH
4118 case CODING_ANNOTATE_COMPOSITION_MASK:
4119 /* Not yet implemented. */
4120 break;
4121 case CODING_ANNOTATE_CHARSET_MASK:
cf7dfdf5 4122 preferred_charset_id = charbuf[2];
ff0dacd7
KH
4123 if (preferred_charset_id >= 0
4124 && NILP (Fmemq (make_number (preferred_charset_id),
4125 charset_list)))
4126 preferred_charset_id = -1;
4127 break;
4128 default:
4129 abort ();
4ed46869 4130 }
ff0dacd7
KH
4131 charbuf += -c - 1;
4132 continue;
4ed46869 4133 }
ec6d2bb8 4134
b73bfc1c
KH
4135 /* Now encode the character C. */
4136 if (c < 0x20 || c == 0x7F)
4137 {
df7492f9
KH
4138 if (c == '\n'
4139 || (c == '\r' && EQ (eol_type, Qmac)))
19a8d9e0 4140 {
df7492f9
KH
4141 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4142 ENCODE_RESET_PLANE_AND_REGISTER ();
4143 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
b73bfc1c 4144 {
df7492f9
KH
4145 int i;
4146
4147 for (i = 0; i < 4; i++)
4148 CODING_ISO_DESIGNATION (coding, i)
4149 = CODING_ISO_INITIAL (coding, i);
b73bfc1c 4150 }
df7492f9
KH
4151 bol_designation
4152 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
19a8d9e0 4153 }
df7492f9
KH
4154 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4155 ENCODE_RESET_PLANE_AND_REGISTER ();
4156 EMIT_ONE_ASCII_BYTE (c);
4ed46869 4157 }
df7492f9 4158 else if (ASCII_CHAR_P (c))
88993dfd 4159 {
df7492f9
KH
4160 if (ascii_compatible)
4161 EMIT_ONE_ASCII_BYTE (c);
93dec019 4162 else
19a8d9e0 4163 {
bf16eb23
KH
4164 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4165 ENCODE_ISO_CHARACTER (charset, c);
19a8d9e0 4166 }
4ed46869 4167 }
16eafb5d 4168 else if (CHAR_BYTE8_P (c))
88993dfd 4169 {
16eafb5d
KH
4170 c = CHAR_TO_BYTE8 (c);
4171 EMIT_ONE_BYTE (c);
88993dfd 4172 }
b73bfc1c 4173 else
df7492f9 4174 {
ff0dacd7 4175 struct charset *charset;
b73bfc1c 4176
ff0dacd7
KH
4177 if (preferred_charset_id >= 0)
4178 {
4179 charset = CHARSET_FROM_ID (preferred_charset_id);
4180 if (! CHAR_CHARSET_P (c, charset))
4181 charset = char_charset (c, charset_list, NULL);
4182 }
4183 else
4184 charset = char_charset (c, charset_list, NULL);
df7492f9
KH
4185 if (!charset)
4186 {
41cbe562
KH
4187 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4188 {
4189 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4190 charset = CHARSET_FROM_ID (charset_ascii);
4191 }
4192 else
4193 {
4194 c = coding->default_char;
4195 charset = char_charset (c, charset_list, NULL);
4196 }
df7492f9
KH
4197 }
4198 ENCODE_ISO_CHARACTER (charset, c);
4199 }
84fbb8a0 4200 }
b73bfc1c 4201
df7492f9
KH
4202 if (coding->mode & CODING_MODE_LAST_BLOCK
4203 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4204 {
4205 ASSURE_DESTINATION (safe_room);
4206 ENCODE_RESET_PLANE_AND_REGISTER ();
4207 }
065e3595 4208 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4209 CODING_ISO_BOL (coding) = bol_designation;
4210 coding->produced_char += produced_chars;
4211 coding->produced = dst - coding->destination;
4212 return 0;
4ed46869
KH
4213}
4214
4215\f
/*** 8. SJIS and BIG5 handlers ***/
4ed46869 4217
df7492f9 4218/* Although SJIS and BIG5 are not ISO's coding system, they are used
4ed46869
KH
4219 quite widely. So, for the moment, Emacs supports them in the bare
4220 C code. But, in the future, they may be supported only by CCL. */
4221
4222/* SJIS is a coding system encoding three character sets: ASCII, right
4223 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
4224 as is. A character of charset katakana-jisx0201 is encoded by
4225 "position-code + 0x80". A character of charset japanese-jisx0208
4226 is encoded in 2-byte but two position-codes are divided and shifted
df7492f9 4227 so that it fit in the range below.
4ed46869
KH
4228
4229 --- CODE RANGE of SJIS ---
4230 (character set) (range)
4231 ASCII 0x00 .. 0x7F
df7492f9 4232 KATAKANA-JISX0201 0xA0 .. 0xDF
c28a9453 4233 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
d14d03ac 4234 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4ed46869
KH
4235 -------------------------------
4236
4237*/
4238
4239/* BIG5 is a coding system encoding two character sets: ASCII and
4240 Big5. An ASCII character is encoded as is. Big5 is a two-byte
df7492f9 4241 character set and is encoded in two-byte.
4ed46869
KH
4242
4243 --- CODE RANGE of BIG5 ---
4244 (character set) (range)
4245 ASCII 0x00 .. 0x7F
4246 Big5 (1st byte) 0xA1 .. 0xFE
4247 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
4248 --------------------------
4249
df7492f9 4250 */
4ed46869
KH
4251
4252/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4253 Check if a text is encoded in SJIS. If it is, return
df7492f9 4254 CATEGORY_MASK_SJIS, else return 0. */
4ed46869 4255
0a28aafb 4256static int
ff0dacd7 4257detect_coding_sjis (coding, detect_info)
df7492f9 4258 struct coding_system *coding;
ff0dacd7 4259 struct coding_detection_info *detect_info;
4ed46869 4260{
065e3595 4261 const unsigned char *src = coding->source, *src_base;
8f924df7 4262 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
4263 int multibytep = coding->src_multibyte;
4264 int consumed_chars = 0;
4265 int found = 0;
b73bfc1c 4266 int c;
df7492f9 4267
ff0dacd7 4268 detect_info->checked |= CATEGORY_MASK_SJIS;
df7492f9
KH
4269 /* A coding system of this category is always ASCII compatible. */
4270 src += coding->head_ascii;
4ed46869 4271
b73bfc1c 4272 while (1)
4ed46869 4273 {
065e3595 4274 src_base = src;
df7492f9 4275 ONE_MORE_BYTE (c);
682169fe
KH
4276 if (c < 0x80)
4277 continue;
df7492f9 4278 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
4ed46869 4279 {
df7492f9 4280 ONE_MORE_BYTE (c);
682169fe 4281 if (c < 0x40 || c == 0x7F || c > 0xFC)
df7492f9 4282 break;
ff0dacd7 4283 found = CATEGORY_MASK_SJIS;
4ed46869 4284 }
df7492f9 4285 else if (c >= 0xA0 && c < 0xE0)
ff0dacd7 4286 found = CATEGORY_MASK_SJIS;
df7492f9
KH
4287 else
4288 break;
4ed46869 4289 }
ff0dacd7 4290 detect_info->rejected |= CATEGORY_MASK_SJIS;
df7492f9
KH
4291 return 0;
4292
4293 no_more_source:
065e3595 4294 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
89528eb3 4295 {
ff0dacd7 4296 detect_info->rejected |= CATEGORY_MASK_SJIS;
89528eb3 4297 return 0;
4ed46869 4298 }
ff0dacd7
KH
4299 detect_info->found |= found;
4300 return 1;
4ed46869
KH
4301}
4302
4303/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4304 Check if a text is encoded in BIG5. If it is, return
df7492f9 4305 CATEGORY_MASK_BIG5, else return 0. */
4ed46869 4306
0a28aafb 4307static int
ff0dacd7 4308detect_coding_big5 (coding, detect_info)
df7492f9 4309 struct coding_system *coding;
ff0dacd7 4310 struct coding_detection_info *detect_info;
4ed46869 4311{
065e3595 4312 const unsigned char *src = coding->source, *src_base;
8f924df7 4313 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
4314 int multibytep = coding->src_multibyte;
4315 int consumed_chars = 0;
4316 int found = 0;
b73bfc1c 4317 int c;
fa42c37f 4318
ff0dacd7 4319 detect_info->checked |= CATEGORY_MASK_BIG5;
df7492f9
KH
4320 /* A coding system of this category is always ASCII compatible. */
4321 src += coding->head_ascii;
fa42c37f 4322
b73bfc1c 4323 while (1)
fa42c37f 4324 {
065e3595 4325 src_base = src;
df7492f9
KH
4326 ONE_MORE_BYTE (c);
4327 if (c < 0x80)
fa42c37f 4328 continue;
df7492f9 4329 if (c >= 0xA1)
fa42c37f 4330 {
df7492f9
KH
4331 ONE_MORE_BYTE (c);
4332 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
fa42c37f 4333 return 0;
ff0dacd7 4334 found = CATEGORY_MASK_BIG5;
fa42c37f 4335 }
df7492f9
KH
4336 else
4337 break;
fa42c37f 4338 }
ff0dacd7 4339 detect_info->rejected |= CATEGORY_MASK_BIG5;
fa42c37f 4340 return 0;
fa42c37f 4341
df7492f9 4342 no_more_source:
065e3595 4343 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
89528eb3 4344 {
ff0dacd7 4345 detect_info->rejected |= CATEGORY_MASK_BIG5;
89528eb3
KH
4346 return 0;
4347 }
ff0dacd7
KH
4348 detect_info->found |= found;
4349 return 1;
fa42c37f
KH
4350}
4351
4ed46869
KH
4352/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4353 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
fa42c37f 4354
b73bfc1c 4355static void
df7492f9 4356decode_coding_sjis (coding)
4ed46869 4357 struct coding_system *coding;
4ed46869 4358{
8f924df7
KH
4359 const unsigned char *src = coding->source + coding->consumed;
4360 const unsigned char *src_end = coding->source + coding->src_bytes;
4361 const unsigned char *src_base;
69a80ea3
KH
4362 int *charbuf = coding->charbuf + coding->charbuf_used;
4363 int *charbuf_end
4364 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
df7492f9
KH
4365 int consumed_chars = 0, consumed_chars_base;
4366 int multibytep = coding->src_multibyte;
4367 struct charset *charset_roman, *charset_kanji, *charset_kana;
57a47f8a 4368 struct charset *charset_kanji2;
24a73b0a 4369 Lisp_Object attrs, charset_list, val;
ff0dacd7
KH
4370 int char_offset = coding->produced_char;
4371 int last_offset = char_offset;
4372 int last_id = charset_ascii;
119852e7
KH
4373 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4374 int byte_after_cr = -1;
a5d301df 4375
24a73b0a 4376 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4377
4378 val = charset_list;
4379 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
89528eb3 4380 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
57a47f8a
KH
4381 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4382 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
fa42c37f 4383
b73bfc1c 4384 while (1)
4ed46869 4385 {
df7492f9 4386 int c, c1;
24a73b0a 4387 struct charset *charset;
fa42c37f 4388
b73bfc1c 4389 src_base = src;
df7492f9 4390 consumed_chars_base = consumed_chars;
fa42c37f 4391
df7492f9 4392 if (charbuf >= charbuf_end)
b71f6f73
KH
4393 {
4394 if (byte_after_cr >= 0)
4395 src_base--;
4396 break;
4397 }
df7492f9 4398
119852e7
KH
4399 if (byte_after_cr >= 0)
4400 c = byte_after_cr, byte_after_cr = -1;
4401 else
4402 ONE_MORE_BYTE (c);
065e3595
KH
4403 if (c < 0)
4404 goto invalid_code;
24a73b0a 4405 if (c < 0x80)
119852e7
KH
4406 {
4407 if (eol_crlf && c == '\r')
4408 ONE_MORE_BYTE (byte_after_cr);
4409 charset = charset_roman;
4410 }
57a47f8a 4411 else if (c == 0x80 || c == 0xA0)
8e921c4b 4412 goto invalid_code;
57a47f8a
KH
4413 else if (c >= 0xA1 && c <= 0xDF)
4414 {
4415 /* SJIS -> JISX0201-Kana */
4416 c &= 0x7F;
4417 charset = charset_kana;
4418 }
4419 else if (c <= 0xEF)
df7492f9 4420 {
57a47f8a
KH
4421 /* SJIS -> JISX0208 */
4422 ONE_MORE_BYTE (c1);
4423 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
24a73b0a 4424 goto invalid_code;
57a47f8a
KH
4425 c = (c << 8) | c1;
4426 SJIS_TO_JIS (c);
4427 charset = charset_kanji;
4428 }
4429 else if (c <= 0xFC && charset_kanji2)
4430 {
c6876370 4431 /* SJIS -> JISX0213-2 */
57a47f8a
KH
4432 ONE_MORE_BYTE (c1);
4433 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
24a73b0a 4434 goto invalid_code;
57a47f8a
KH
4435 c = (c << 8) | c1;
4436 SJIS_TO_JIS2 (c);
4437 charset = charset_kanji2;
df7492f9 4438 }
57a47f8a
KH
4439 else
4440 goto invalid_code;
24a73b0a
KH
4441 if (charset->id != charset_ascii
4442 && last_id != charset->id)
4443 {
4444 if (last_id != charset_ascii)
69a80ea3 4445 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
24a73b0a
KH
4446 last_id = charset->id;
4447 last_offset = char_offset;
4448 }
4449 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
df7492f9 4450 *charbuf++ = c;
ff0dacd7 4451 char_offset++;
df7492f9 4452 continue;
b73bfc1c 4453
df7492f9
KH
4454 invalid_code:
4455 src = src_base;
4456 consumed_chars = consumed_chars_base;
4457 ONE_MORE_BYTE (c);
065e3595 4458 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
ff0dacd7 4459 char_offset++;
df7492f9
KH
4460 coding->errors++;
4461 }
fa42c37f 4462
df7492f9 4463 no_more_source:
ff0dacd7 4464 if (last_id != charset_ascii)
69a80ea3 4465 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
4466 coding->consumed_char += consumed_chars_base;
4467 coding->consumed = src_base - coding->source;
4468 coding->charbuf_used = charbuf - coding->charbuf;
fa42c37f
KH
4469}
4470
b73bfc1c 4471static void
df7492f9 4472decode_coding_big5 (coding)
4ed46869 4473 struct coding_system *coding;
4ed46869 4474{
8f924df7
KH
4475 const unsigned char *src = coding->source + coding->consumed;
4476 const unsigned char *src_end = coding->source + coding->src_bytes;
4477 const unsigned char *src_base;
69a80ea3
KH
4478 int *charbuf = coding->charbuf + coding->charbuf_used;
4479 int *charbuf_end
4480 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
df7492f9
KH
4481 int consumed_chars = 0, consumed_chars_base;
4482 int multibytep = coding->src_multibyte;
4483 struct charset *charset_roman, *charset_big5;
24a73b0a 4484 Lisp_Object attrs, charset_list, val;
ff0dacd7
KH
4485 int char_offset = coding->produced_char;
4486 int last_offset = char_offset;
4487 int last_id = charset_ascii;
119852e7
KH
4488 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4489 int byte_after_cr = -1;
df7492f9 4490
24a73b0a 4491 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4492 val = charset_list;
4493 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4494 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4ed46869 4495
b73bfc1c 4496 while (1)
4ed46869 4497 {
df7492f9 4498 int c, c1;
24a73b0a 4499 struct charset *charset;
b73bfc1c
KH
4500
4501 src_base = src;
df7492f9
KH
4502 consumed_chars_base = consumed_chars;
4503
4504 if (charbuf >= charbuf_end)
b71f6f73
KH
4505 {
4506 if (byte_after_cr >= 0)
4507 src_base--;
4508 break;
4509 }
df7492f9 4510
119852e7 4511 if (byte_after_cr >= 0)
14daee73 4512 c = byte_after_cr, byte_after_cr = -1;
119852e7
KH
4513 else
4514 ONE_MORE_BYTE (c);
b73bfc1c 4515
065e3595
KH
4516 if (c < 0)
4517 goto invalid_code;
24a73b0a 4518 if (c < 0x80)
119852e7 4519 {
14daee73 4520 if (eol_crlf && c == '\r')
119852e7
KH
4521 ONE_MORE_BYTE (byte_after_cr);
4522 charset = charset_roman;
4523 }
24a73b0a 4524 else
4ed46869 4525 {
24a73b0a
KH
4526 /* BIG5 -> Big5 */
4527 if (c < 0xA1 || c > 0xFE)
4528 goto invalid_code;
4529 ONE_MORE_BYTE (c1);
4530 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4531 goto invalid_code;
4532 c = c << 8 | c1;
4533 charset = charset_big5;
4ed46869 4534 }
24a73b0a
KH
4535 if (charset->id != charset_ascii
4536 && last_id != charset->id)
df7492f9 4537 {
24a73b0a 4538 if (last_id != charset_ascii)
69a80ea3 4539 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
24a73b0a
KH
4540 last_id = charset->id;
4541 last_offset = char_offset;
4ed46869 4542 }
24a73b0a 4543 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
df7492f9 4544 *charbuf++ = c;
ff0dacd7 4545 char_offset++;
fb88bf2d
KH
4546 continue;
4547
df7492f9 4548 invalid_code:
4ed46869 4549 src = src_base;
df7492f9
KH
4550 consumed_chars = consumed_chars_base;
4551 ONE_MORE_BYTE (c);
065e3595 4552 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
ff0dacd7 4553 char_offset++;
df7492f9 4554 coding->errors++;
fb88bf2d 4555 }
d46c5b12 4556
df7492f9 4557 no_more_source:
ff0dacd7 4558 if (last_id != charset_ascii)
69a80ea3 4559 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
4560 coding->consumed_char += consumed_chars_base;
4561 coding->consumed = src_base - coding->source;
4562 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
4563}
4564
4565/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
b73bfc1c
KH
4566 This function can encode charsets `ascii', `katakana-jisx0201',
4567 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4568 are sure that all these charsets are registered as official charset
4ed46869
KH
4569 (i.e. do not have extended leading-codes). Characters of other
4570 charsets are produced without any encoding. If SJIS_P is 1, encode
4571 SJIS text, else encode BIG5 text. */
4572
df7492f9
KH
4573static int
4574encode_coding_sjis (coding)
4ed46869 4575 struct coding_system *coding;
4ed46869 4576{
df7492f9
KH
4577 int multibytep = coding->dst_multibyte;
4578 int *charbuf = coding->charbuf;
4579 int *charbuf_end = charbuf + coding->charbuf_used;
4580 unsigned char *dst = coding->destination + coding->produced;
4581 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4582 int safe_room = 4;
4583 int produced_chars = 0;
24a73b0a 4584 Lisp_Object attrs, charset_list, val;
df7492f9
KH
4585 int ascii_compatible;
4586 struct charset *charset_roman, *charset_kanji, *charset_kana;
57a47f8a 4587 struct charset *charset_kanji2;
df7492f9 4588 int c;
a5d301df 4589
24a73b0a 4590 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4591 val = charset_list;
4592 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4593 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
57a47f8a
KH
4594 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4595 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4ed46869 4596
df7492f9 4597 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
93dec019 4598
df7492f9
KH
4599 while (charbuf < charbuf_end)
4600 {
4601 ASSURE_DESTINATION (safe_room);
4602 c = *charbuf++;
b73bfc1c 4603 /* Now encode the character C. */
df7492f9
KH
4604 if (ASCII_CHAR_P (c) && ascii_compatible)
4605 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
4606 else if (CHAR_BYTE8_P (c))
4607 {
4608 c = CHAR_TO_BYTE8 (c);
4609 EMIT_ONE_BYTE (c);
4610 }
df7492f9 4611 else
b73bfc1c 4612 {
df7492f9
KH
4613 unsigned code;
4614 struct charset *charset = char_charset (c, charset_list, &code);
4615
4616 if (!charset)
4ed46869 4617 {
41cbe562 4618 if (coding->mode & CODING_MODE_SAFE_ENCODING)
b73bfc1c 4619 {
41cbe562
KH
4620 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4621 charset = CHARSET_FROM_ID (charset_ascii);
b73bfc1c 4622 }
41cbe562 4623 else
b73bfc1c 4624 {
41cbe562
KH
4625 c = coding->default_char;
4626 charset = char_charset (c, charset_list, &code);
b73bfc1c 4627 }
b73bfc1c 4628 }
df7492f9
KH
4629 if (code == CHARSET_INVALID_CODE (charset))
4630 abort ();
4631 if (charset == charset_kanji)
4632 {
4633 int c1, c2;
4634 JIS_TO_SJIS (code);
4635 c1 = code >> 8, c2 = code & 0xFF;
4636 EMIT_TWO_BYTES (c1, c2);
4637 }
4638 else if (charset == charset_kana)
4639 EMIT_ONE_BYTE (code | 0x80);
57a47f8a
KH
4640 else if (charset_kanji2 && charset == charset_kanji2)
4641 {
4642 int c1, c2;
4643
4644 c1 = code >> 8;
4645 if (c1 == 0x21 || (c1 >= 0x23 && c1 < 0x25)
4646 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4647 {
4648 JIS_TO_SJIS2 (code);
4649 c1 = code >> 8, c2 = code & 0xFF;
4650 EMIT_TWO_BYTES (c1, c2);
4651 }
4652 else
4653 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4654 }
df7492f9
KH
4655 else
4656 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4657 }
4658 }
065e3595 4659 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4660 coding->produced_char += produced_chars;
4661 coding->produced = dst - coding->destination;
4662 return 0;
4663}
4664
4665static int
4666encode_coding_big5 (coding)
4667 struct coding_system *coding;
4668{
4669 int multibytep = coding->dst_multibyte;
4670 int *charbuf = coding->charbuf;
4671 int *charbuf_end = charbuf + coding->charbuf_used;
4672 unsigned char *dst = coding->destination + coding->produced;
4673 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4674 int safe_room = 4;
4675 int produced_chars = 0;
24a73b0a 4676 Lisp_Object attrs, charset_list, val;
df7492f9
KH
4677 int ascii_compatible;
4678 struct charset *charset_roman, *charset_big5;
4679 int c;
4680
24a73b0a 4681 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4682 val = charset_list;
4683 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4684 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4685 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4686
4687 while (charbuf < charbuf_end)
4688 {
4689 ASSURE_DESTINATION (safe_room);
4690 c = *charbuf++;
4691 /* Now encode the character C. */
4692 if (ASCII_CHAR_P (c) && ascii_compatible)
4693 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
4694 else if (CHAR_BYTE8_P (c))
4695 {
4696 c = CHAR_TO_BYTE8 (c);
4697 EMIT_ONE_BYTE (c);
b73bfc1c
KH
4698 }
4699 else
4700 {
df7492f9
KH
4701 unsigned code;
4702 struct charset *charset = char_charset (c, charset_list, &code);
4703
4704 if (! charset)
b73bfc1c 4705 {
41cbe562 4706 if (coding->mode & CODING_MODE_SAFE_ENCODING)
b73bfc1c 4707 {
41cbe562
KH
4708 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4709 charset = CHARSET_FROM_ID (charset_ascii);
b73bfc1c 4710 }
41cbe562 4711 else
0eecad43 4712 {
41cbe562
KH
4713 c = coding->default_char;
4714 charset = char_charset (c, charset_list, &code);
0eecad43 4715 }
4ed46869 4716 }
df7492f9
KH
4717 if (code == CHARSET_INVALID_CODE (charset))
4718 abort ();
4719 if (charset == charset_big5)
b73bfc1c 4720 {
df7492f9
KH
4721 int c1, c2;
4722
4723 c1 = code >> 8, c2 = code & 0xFF;
4724 EMIT_TWO_BYTES (c1, c2);
b73bfc1c 4725 }
df7492f9
KH
4726 else
4727 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4ed46869 4728 }
4ed46869 4729 }
065e3595 4730 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4731 coding->produced_char += produced_chars;
4732 coding->produced = dst - coding->destination;
4733 return 0;
4ed46869
KH
4734}
4735
4736\f
df7492f9 4737/*** 9. CCL handlers ***/
1397dc18
KH
4738
4739/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4740 Check if a text is encoded in a coding system of which
4741 encoder/decoder are written in CCL program. If it is, return
df7492f9 4742 CATEGORY_MASK_CCL, else return 0. */
1397dc18 4743
0a28aafb 4744static int
ff0dacd7 4745detect_coding_ccl (coding, detect_info)
df7492f9 4746 struct coding_system *coding;
ff0dacd7 4747 struct coding_detection_info *detect_info;
1397dc18 4748{
065e3595 4749 const unsigned char *src = coding->source, *src_base;
8f924df7 4750 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
4751 int multibytep = coding->src_multibyte;
4752 int consumed_chars = 0;
4753 int found = 0;
0e219d54 4754 unsigned char *valids;
df7492f9
KH
4755 int head_ascii = coding->head_ascii;
4756 Lisp_Object attrs;
4757
ff0dacd7
KH
4758 detect_info->checked |= CATEGORY_MASK_CCL;
4759
df7492f9 4760 coding = &coding_categories[coding_category_ccl];
0e219d54 4761 valids = CODING_CCL_VALIDS (coding);
df7492f9
KH
4762 attrs = CODING_ID_ATTRS (coding->id);
4763 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4764 src += head_ascii;
1397dc18 4765
b73bfc1c 4766 while (1)
1397dc18 4767 {
df7492f9 4768 int c;
065e3595
KH
4769
4770 src_base = src;
df7492f9 4771 ONE_MORE_BYTE (c);
065e3595 4772 if (c < 0 || ! valids[c])
df7492f9 4773 break;
ff0dacd7
KH
4774 if ((valids[c] > 1))
4775 found = CATEGORY_MASK_CCL;
df7492f9 4776 }
ff0dacd7 4777 detect_info->rejected |= CATEGORY_MASK_CCL;
df7492f9
KH
4778 return 0;
4779
4780 no_more_source:
ff0dacd7
KH
4781 detect_info->found |= found;
4782 return 1;
df7492f9
KH
4783}
4784
4785static void
4786decode_coding_ccl (coding)
4787 struct coding_system *coding;
4788{
7c78e542 4789 const unsigned char *src = coding->source + coding->consumed;
8f924df7 4790 const unsigned char *src_end = coding->source + coding->src_bytes;
69a80ea3
KH
4791 int *charbuf = coding->charbuf + coding->charbuf_used;
4792 int *charbuf_end = coding->charbuf + coding->charbuf_size;
df7492f9
KH
4793 int consumed_chars = 0;
4794 int multibytep = coding->src_multibyte;
4795 struct ccl_program ccl;
4796 int source_charbuf[1024];
4797 int source_byteidx[1024];
24a73b0a 4798 Lisp_Object attrs, charset_list;
df7492f9 4799
24a73b0a 4800 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4801 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4802
4803 while (src < src_end)
4804 {
7c78e542 4805 const unsigned char *p = src;
df7492f9
KH
4806 int *source, *source_end;
4807 int i = 0;
4808
4809 if (multibytep)
4810 while (i < 1024 && p < src_end)
4811 {
4812 source_byteidx[i] = p - src;
4813 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4814 }
4815 else
4816 while (i < 1024 && p < src_end)
4817 source_charbuf[i++] = *p++;
8f924df7 4818
df7492f9
KH
4819 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4820 ccl.last_block = 1;
4821
4822 source = source_charbuf;
4823 source_end = source + i;
4824 while (source < source_end)
4825 {
4826 ccl_driver (&ccl, source, charbuf,
8dcbea82
KH
4827 source_end - source, charbuf_end - charbuf,
4828 charset_list);
df7492f9
KH
4829 source += ccl.consumed;
4830 charbuf += ccl.produced;
4831 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4832 break;
4833 }
4834 if (source < source_end)
4835 src += source_byteidx[source - source_charbuf];
4836 else
4837 src = p;
4838 consumed_chars += source - source_charbuf;
4839
4840 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4841 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4842 break;
4843 }
4844
4845 switch (ccl.status)
4846 {
4847 case CCL_STAT_SUSPEND_BY_SRC:
065e3595 4848 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
df7492f9
KH
4849 break;
4850 case CCL_STAT_SUSPEND_BY_DST:
4851 break;
4852 case CCL_STAT_QUIT:
4853 case CCL_STAT_INVALID_CMD:
065e3595 4854 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
df7492f9
KH
4855 break;
4856 default:
065e3595 4857 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4858 break;
4859 }
4860 coding->consumed_char += consumed_chars;
4861 coding->consumed = src - coding->source;
4862 coding->charbuf_used = charbuf - coding->charbuf;
4863}
4864
4865static int
4866encode_coding_ccl (coding)
4867 struct coding_system *coding;
4868{
4869 struct ccl_program ccl;
4870 int multibytep = coding->dst_multibyte;
4871 int *charbuf = coding->charbuf;
4872 int *charbuf_end = charbuf + coding->charbuf_used;
4873 unsigned char *dst = coding->destination + coding->produced;
4874 unsigned char *dst_end = coding->destination + coding->dst_bytes;
df7492f9
KH
4875 int destination_charbuf[1024];
4876 int i, produced_chars = 0;
24a73b0a 4877 Lisp_Object attrs, charset_list;
df7492f9 4878
24a73b0a 4879 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4880 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4881
4882 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4883 ccl.dst_multibyte = coding->dst_multibyte;
4884
8cffd3e7 4885 while (charbuf < charbuf_end)
df7492f9 4886 {
df7492f9 4887 ccl_driver (&ccl, charbuf, destination_charbuf,
8cffd3e7 4888 charbuf_end - charbuf, 1024, charset_list);
df7492f9 4889 if (multibytep)
8cffd3e7
KH
4890 {
4891 ASSURE_DESTINATION (ccl.produced * 2);
4892 for (i = 0; i < ccl.produced; i++)
4893 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4894 }
df7492f9
KH
4895 else
4896 {
8cffd3e7 4897 ASSURE_DESTINATION (ccl.produced);
3ed051d4 4898 for (i = 0; i < ccl.produced; i++)
df7492f9
KH
4899 *dst++ = destination_charbuf[i] & 0xFF;
4900 produced_chars += ccl.produced;
4901 }
8cffd3e7
KH
4902 charbuf += ccl.consumed;
4903 if (ccl.status == CCL_STAT_QUIT
4904 || ccl.status == CCL_STAT_INVALID_CMD)
4905 break;
df7492f9
KH
4906 }
4907
4908 switch (ccl.status)
4909 {
4910 case CCL_STAT_SUSPEND_BY_SRC:
065e3595 4911 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
df7492f9
KH
4912 break;
4913 case CCL_STAT_SUSPEND_BY_DST:
065e3595 4914 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
df7492f9
KH
4915 break;
4916 case CCL_STAT_QUIT:
4917 case CCL_STAT_INVALID_CMD:
065e3595 4918 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
df7492f9
KH
4919 break;
4920 default:
065e3595 4921 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 4922 break;
1397dc18 4923 }
df7492f9
KH
4924
4925 coding->produced_char += produced_chars;
4926 coding->produced = dst - coding->destination;
4927 return 0;
1397dc18
KH
4928}
4929
df7492f9 4930
1397dc18 4931\f
df7492f9 4932/*** 10, 11. no-conversion handlers ***/
4ed46869 4933
b73bfc1c 4934/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4ed46869 4935
b73bfc1c 4936static void
df7492f9 4937decode_coding_raw_text (coding)
4ed46869 4938 struct coding_system *coding;
4ed46869 4939{
119852e7
KH
4940 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4941
df7492f9 4942 coding->chars_at_source = 1;
119852e7
KH
4943 coding->consumed_char = coding->src_chars;
4944 coding->consumed = coding->src_bytes;
4945 if (eol_crlf && coding->source[coding->src_bytes - 1] == '\r')
4946 {
4947 coding->consumed_char--;
4948 coding->consumed--;
4949 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4950 }
4951 else
4952 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 4953}
4ed46869 4954
df7492f9
KH
4955static int
4956encode_coding_raw_text (coding)
4957 struct coding_system *coding;
4958{
4959 int multibytep = coding->dst_multibyte;
4960 int *charbuf = coding->charbuf;
4961 int *charbuf_end = coding->charbuf + coding->charbuf_used;
4962 unsigned char *dst = coding->destination + coding->produced;
4963 unsigned char *dst_end = coding->destination + coding->dst_bytes;
a0ed9b27 4964 int produced_chars = 0;
b73bfc1c
KH
4965 int c;
4966
df7492f9 4967 if (multibytep)
b73bfc1c 4968 {
df7492f9 4969 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4ed46869 4970
df7492f9
KH
4971 if (coding->src_multibyte)
4972 while (charbuf < charbuf_end)
4973 {
4974 ASSURE_DESTINATION (safe_room);
4975 c = *charbuf++;
4976 if (ASCII_CHAR_P (c))
4977 EMIT_ONE_ASCII_BYTE (c);
4978 else if (CHAR_BYTE8_P (c))
4979 {
4980 c = CHAR_TO_BYTE8 (c);
4981 EMIT_ONE_BYTE (c);
4982 }
4983 else
4984 {
4985 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
93dec019 4986
df7492f9
KH
4987 CHAR_STRING_ADVANCE (c, p1);
4988 while (p0 < p1)
9d123124
KH
4989 {
4990 EMIT_ONE_BYTE (*p0);
4991 p0++;
4992 }
df7492f9
KH
4993 }
4994 }
b73bfc1c 4995 else
df7492f9
KH
4996 while (charbuf < charbuf_end)
4997 {
4998 ASSURE_DESTINATION (safe_room);
4999 c = *charbuf++;
5000 EMIT_ONE_BYTE (c);
5001 }
5002 }
5003 else
4ed46869 5004 {
df7492f9 5005 if (coding->src_multibyte)
d46c5b12 5006 {
df7492f9
KH
5007 int safe_room = MAX_MULTIBYTE_LENGTH;
5008
5009 while (charbuf < charbuf_end)
d46c5b12 5010 {
df7492f9
KH
5011 ASSURE_DESTINATION (safe_room);
5012 c = *charbuf++;
5013 if (ASCII_CHAR_P (c))
5014 *dst++ = c;
5015 else if (CHAR_BYTE8_P (c))
5016 *dst++ = CHAR_TO_BYTE8 (c);
b73bfc1c 5017 else
df7492f9 5018 CHAR_STRING_ADVANCE (c, dst);
d46c5b12
KH
5019 }
5020 }
df7492f9
KH
5021 else
5022 {
5023 ASSURE_DESTINATION (charbuf_end - charbuf);
5024 while (charbuf < charbuf_end && dst < dst_end)
5025 *dst++ = *charbuf++;
8f924df7 5026 }
319a3947 5027 produced_chars = dst - (coding->destination + coding->produced);
4ed46869 5028 }
065e3595 5029 record_conversion_result (coding, CODING_RESULT_SUCCESS);
a0ed9b27 5030 coding->produced_char += produced_chars;
df7492f9
KH
5031 coding->produced = dst - coding->destination;
5032 return 0;
4ed46869
KH
5033}
5034
ff0dacd7
KH
5035/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5036 Check if a text is encoded in a charset-based coding system. If it
5037 is, return 1, else return 0. */
5038
0a28aafb 5039static int
ff0dacd7 5040detect_coding_charset (coding, detect_info)
df7492f9 5041 struct coding_system *coding;
ff0dacd7 5042 struct coding_detection_info *detect_info;
1397dc18 5043{
065e3595 5044 const unsigned char *src = coding->source, *src_base;
8f924df7 5045 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
5046 int multibytep = coding->src_multibyte;
5047 int consumed_chars = 0;
07295713 5048 Lisp_Object attrs, valids, name;
584948ac 5049 int found = 0;
716b3fa0 5050 int head_ascii = coding->head_ascii;
07295713 5051 int check_latin_extra = 0;
1397dc18 5052
ff0dacd7
KH
5053 detect_info->checked |= CATEGORY_MASK_CHARSET;
5054
df7492f9
KH
5055 coding = &coding_categories[coding_category_charset];
5056 attrs = CODING_ID_ATTRS (coding->id);
5057 valids = AREF (attrs, coding_attr_charset_valids);
07295713
KH
5058 name = CODING_ID_NAME (coding->id);
5059 if (VECTORP (Vlatin_extra_code_table)
5060 && strcmp ((char *) SDATA (SYMBOL_NAME (name)), "iso-8859-"))
5061 check_latin_extra = 1;
df7492f9 5062 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
716b3fa0 5063 src += head_ascii;
1397dc18 5064
b73bfc1c 5065 while (1)
1397dc18 5066 {
df7492f9 5067 int c;
716b3fa0
KH
5068 Lisp_Object val;
5069 struct charset *charset;
5070 int dim, idx;
1397dc18 5071
065e3595 5072 src_base = src;
df7492f9 5073 ONE_MORE_BYTE (c);
065e3595
KH
5074 if (c < 0)
5075 continue;
716b3fa0
KH
5076 val = AREF (valids, c);
5077 if (NILP (val))
df7492f9 5078 break;
584948ac 5079 if (c >= 0x80)
07295713
KH
5080 {
5081 if (c < 0xA0
5082 && check_latin_extra
5083 && NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
5084 break;
5085 found = CATEGORY_MASK_CHARSET;
5086 }
716b3fa0
KH
5087 if (INTEGERP (val))
5088 {
5089 charset = CHARSET_FROM_ID (XFASTINT (val));
5090 dim = CHARSET_DIMENSION (charset);
5091 for (idx = 1; idx < dim; idx++)
5092 {
5093 if (src == src_end)
5094 goto too_short;
5095 ONE_MORE_BYTE (c);
3ed051d4 5096 if (c < charset->code_space[(dim - 1 - idx) * 2]
716b3fa0
KH
5097 || c > charset->code_space[(dim - 1 - idx) * 2 + 1])
5098 break;
5099 }
5100 if (idx < dim)
5101 break;
5102 }
5103 else
5104 {
5105 idx = 1;
5106 for (; CONSP (val); val = XCDR (val))
5107 {
5108 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5109 dim = CHARSET_DIMENSION (charset);
5110 while (idx < dim)
5111 {
5112 if (src == src_end)
5113 goto too_short;
5114 ONE_MORE_BYTE (c);
5115 if (c < charset->code_space[(dim - 1 - idx) * 4]
5116 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5117 break;
5118 idx++;
5119 }
5120 if (idx == dim)
5121 {
5122 val = Qnil;
5123 break;
5124 }
5125 }
5126 if (CONSP (val))
5127 break;
5128 }
df7492f9 5129 }
716b3fa0 5130 too_short:
ff0dacd7 5131 detect_info->rejected |= CATEGORY_MASK_CHARSET;
df7492f9 5132 return 0;
4ed46869 5133
df7492f9 5134 no_more_source:
ff0dacd7
KH
5135 detect_info->found |= found;
5136 return 1;
df7492f9 5137}
b73bfc1c 5138
b73bfc1c 5139static void
df7492f9 5140decode_coding_charset (coding)
4ed46869 5141 struct coding_system *coding;
4ed46869 5142{
8f924df7
KH
5143 const unsigned char *src = coding->source + coding->consumed;
5144 const unsigned char *src_end = coding->source + coding->src_bytes;
5145 const unsigned char *src_base;
69a80ea3
KH
5146 int *charbuf = coding->charbuf + coding->charbuf_used;
5147 int *charbuf_end
5148 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
df7492f9
KH
5149 int consumed_chars = 0, consumed_chars_base;
5150 int multibytep = coding->src_multibyte;
24a73b0a 5151 Lisp_Object attrs, charset_list, valids;
ff0dacd7
KH
5152 int char_offset = coding->produced_char;
5153 int last_offset = char_offset;
5154 int last_id = charset_ascii;
119852e7
KH
5155 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5156 int byte_after_cr = -1;
df7492f9 5157
24a73b0a 5158 CODING_GET_INFO (coding, attrs, charset_list);
4eb6d3f1 5159 valids = AREF (attrs, coding_attr_charset_valids);
b73bfc1c 5160
df7492f9 5161 while (1)
4ed46869 5162 {
4eb6d3f1 5163 int c;
24a73b0a
KH
5164 Lisp_Object val;
5165 struct charset *charset;
5166 int dim;
5167 int len = 1;
5168 unsigned code;
df7492f9
KH
5169
5170 src_base = src;
5171 consumed_chars_base = consumed_chars;
b73bfc1c 5172
df7492f9 5173 if (charbuf >= charbuf_end)
b71f6f73
KH
5174 {
5175 if (byte_after_cr >= 0)
5176 src_base--;
5177 break;
5178 }
df7492f9 5179
119852e7
KH
5180 if (byte_after_cr >= 0)
5181 {
5182 c = byte_after_cr;
5183 byte_after_cr = -1;
5184 }
5185 else
5186 {
5187 ONE_MORE_BYTE (c);
5188 if (eol_crlf && c == '\r')
5189 ONE_MORE_BYTE (byte_after_cr);
5190 }
065e3595
KH
5191 if (c < 0)
5192 goto invalid_code;
24a73b0a
KH
5193 code = c;
5194
5195 val = AREF (valids, c);
1b17adfd 5196 if (! INTEGERP (val) && ! CONSP (val))
24a73b0a
KH
5197 goto invalid_code;
5198 if (INTEGERP (val))
d46c5b12 5199 {
24a73b0a
KH
5200 charset = CHARSET_FROM_ID (XFASTINT (val));
5201 dim = CHARSET_DIMENSION (charset);
5202 while (len < dim)
b73bfc1c 5203 {
24a73b0a
KH
5204 ONE_MORE_BYTE (c);
5205 code = (code << 8) | c;
5206 len++;
b73bfc1c 5207 }
24a73b0a
KH
5208 CODING_DECODE_CHAR (coding, src, src_base, src_end,
5209 charset, code, c);
d46c5b12 5210 }
df7492f9 5211 else
d46c5b12 5212 {
24a73b0a
KH
5213 /* VAL is a list of charset IDs. It is assured that the
5214 list is sorted by charset dimensions (smaller one
5215 comes first). */
5216 while (CONSP (val))
4eb6d3f1 5217 {
24a73b0a 5218 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
c7c66a95 5219 dim = CHARSET_DIMENSION (charset);
f9d71dcd 5220 while (len < dim)
4eb6d3f1 5221 {
acb2a965
KH
5222 ONE_MORE_BYTE (c);
5223 code = (code << 8) | c;
f9d71dcd 5224 len++;
4eb6d3f1 5225 }
24a73b0a
KH
5226 CODING_DECODE_CHAR (coding, src, src_base,
5227 src_end, charset, code, c);
5228 if (c >= 0)
5229 break;
5230 val = XCDR (val);
ff0dacd7 5231 }
d46c5b12 5232 }
24a73b0a
KH
5233 if (c < 0)
5234 goto invalid_code;
5235 if (charset->id != charset_ascii
5236 && last_id != charset->id)
5237 {
5238 if (last_id != charset_ascii)
69a80ea3 5239 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
24a73b0a
KH
5240 last_id = charset->id;
5241 last_offset = char_offset;
5242 }
5243
df7492f9 5244 *charbuf++ = c;
ff0dacd7 5245 char_offset++;
df7492f9
KH
5246 continue;
5247
5248 invalid_code:
5249 src = src_base;
5250 consumed_chars = consumed_chars_base;
5251 ONE_MORE_BYTE (c);
065e3595 5252 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 5253 char_offset++;
df7492f9 5254 coding->errors++;
4ed46869
KH
5255 }
5256
df7492f9 5257 no_more_source:
ff0dacd7 5258 if (last_id != charset_ascii)
69a80ea3 5259 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
5260 coding->consumed_char += consumed_chars_base;
5261 coding->consumed = src_base - coding->source;
5262 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
5263}
5264
df7492f9
KH
5265static int
5266encode_coding_charset (coding)
4ed46869 5267 struct coding_system *coding;
4ed46869 5268{
df7492f9
KH
5269 int multibytep = coding->dst_multibyte;
5270 int *charbuf = coding->charbuf;
5271 int *charbuf_end = charbuf + coding->charbuf_used;
5272 unsigned char *dst = coding->destination + coding->produced;
5273 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5274 int safe_room = MAX_MULTIBYTE_LENGTH;
5275 int produced_chars = 0;
24a73b0a 5276 Lisp_Object attrs, charset_list;
df7492f9 5277 int ascii_compatible;
b73bfc1c 5278 int c;
b73bfc1c 5279
24a73b0a 5280 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9 5281 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
fb88bf2d 5282
df7492f9 5283 while (charbuf < charbuf_end)
4ed46869 5284 {
4eb6d3f1 5285 struct charset *charset;
df7492f9 5286 unsigned code;
8f924df7 5287
df7492f9
KH
5288 ASSURE_DESTINATION (safe_room);
5289 c = *charbuf++;
5290 if (ascii_compatible && ASCII_CHAR_P (c))
5291 EMIT_ONE_ASCII_BYTE (c);
16eafb5d 5292 else if (CHAR_BYTE8_P (c))
4ed46869 5293 {
16eafb5d
KH
5294 c = CHAR_TO_BYTE8 (c);
5295 EMIT_ONE_BYTE (c);
d46c5b12 5296 }
d46c5b12 5297 else
b73bfc1c 5298 {
4eb6d3f1
KH
5299 charset = char_charset (c, charset_list, &code);
5300 if (charset)
5301 {
5302 if (CHARSET_DIMENSION (charset) == 1)
5303 EMIT_ONE_BYTE (code);
5304 else if (CHARSET_DIMENSION (charset) == 2)
5305 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5306 else if (CHARSET_DIMENSION (charset) == 3)
5307 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5308 else
5309 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5310 (code >> 8) & 0xFF, code & 0xFF);
5311 }
5312 else
41cbe562
KH
5313 {
5314 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5315 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5316 else
5317 c = coding->default_char;
5318 EMIT_ONE_BYTE (c);
5319 }
4ed46869 5320 }
4ed46869
KH
5321 }
5322
065e3595 5323 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
5324 coding->produced_char += produced_chars;
5325 coding->produced = dst - coding->destination;
5326 return 0;
4ed46869
KH
5327}
5328
5329\f
1397dc18 5330/*** 10. C library functions ***/
4ed46869 5331
df7492f9
KH
5332/* Setup coding context CODING from information about CODING_SYSTEM.
5333 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
5334 CODING_SYSTEM is invalid, signal an error. */
4ed46869 5335
ec6d2bb8 5336void
e0e989f6
KH
5337setup_coding_system (coding_system, coding)
5338 Lisp_Object coding_system;
4ed46869
KH
5339 struct coding_system *coding;
5340{
df7492f9
KH
5341 Lisp_Object attrs;
5342 Lisp_Object eol_type;
5343 Lisp_Object coding_type;
4608c386 5344 Lisp_Object val;
4ed46869 5345
df7492f9 5346 if (NILP (coding_system))
ae6f73fa 5347 coding_system = Qundecided;
c07c8e12 5348
df7492f9 5349 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
1f5dbf34 5350
df7492f9
KH
5351 attrs = CODING_ID_ATTRS (coding->id);
5352 eol_type = CODING_ID_EOL_TYPE (coding->id);
1f5dbf34 5353
df7492f9
KH
5354 coding->mode = 0;
5355 coding->head_ascii = -1;
4a015c45
KH
5356 if (VECTORP (eol_type))
5357 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5358 | CODING_REQUIRE_DETECTION_MASK);
5359 else if (! EQ (eol_type, Qunix))
5360 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5361 | CODING_REQUIRE_ENCODING_MASK);
5362 else
5363 coding->common_flags = 0;
5e5c78be
KH
5364 if (! NILP (CODING_ATTR_POST_READ (attrs)))
5365 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5366 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5367 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
8f924df7
KH
5368 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5369 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
1f5dbf34 5370
df7492f9 5371 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7
KH
5372 coding->max_charset_id = SCHARS (val) - 1;
5373 coding->safe_charsets = (char *) SDATA (val);
df7492f9 5374 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4608c386 5375
df7492f9
KH
5376 coding_type = CODING_ATTR_TYPE (attrs);
5377 if (EQ (coding_type, Qundecided))
d46c5b12 5378 {
df7492f9
KH
5379 coding->detector = NULL;
5380 coding->decoder = decode_coding_raw_text;
5381 coding->encoder = encode_coding_raw_text;
5382 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
d46c5b12 5383 }
df7492f9 5384 else if (EQ (coding_type, Qiso_2022))
d46c5b12 5385 {
df7492f9
KH
5386 int i;
5387 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5388
5389 /* Invoke graphic register 0 to plane 0. */
5390 CODING_ISO_INVOCATION (coding, 0) = 0;
5391 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5392 CODING_ISO_INVOCATION (coding, 1)
5393 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5394 /* Setup the initial status of designation. */
5395 for (i = 0; i < 4; i++)
5396 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5397 /* Not single shifting initially. */
5398 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5399 /* Beginning of buffer should also be regarded as bol. */
5400 CODING_ISO_BOL (coding) = 1;
5401 coding->detector = detect_coding_iso_2022;
5402 coding->decoder = decode_coding_iso_2022;
5403 coding->encoder = encode_coding_iso_2022;
5404 if (flags & CODING_ISO_FLAG_SAFE)
5405 coding->mode |= CODING_MODE_SAFE_ENCODING;
d46c5b12 5406 coding->common_flags
df7492f9
KH
5407 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5408 | CODING_REQUIRE_FLUSHING_MASK);
5409 if (flags & CODING_ISO_FLAG_COMPOSITION)
5410 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
ff0dacd7
KH
5411 if (flags & CODING_ISO_FLAG_DESIGNATION)
5412 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
df7492f9
KH
5413 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5414 {
5415 setup_iso_safe_charsets (attrs);
5416 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7
KH
5417 coding->max_charset_id = SCHARS (val) - 1;
5418 coding->safe_charsets = (char *) SDATA (val);
df7492f9
KH
5419 }
5420 CODING_ISO_FLAGS (coding) = flags;
d46c5b12 5421 }
df7492f9 5422 else if (EQ (coding_type, Qcharset))
d46c5b12 5423 {
df7492f9
KH
5424 coding->detector = detect_coding_charset;
5425 coding->decoder = decode_coding_charset;
5426 coding->encoder = encode_coding_charset;
d46c5b12 5427 coding->common_flags
df7492f9 5428 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
d46c5b12 5429 }
df7492f9 5430 else if (EQ (coding_type, Qutf_8))
d46c5b12 5431 {
a470d443
KH
5432 val = AREF (attrs, coding_attr_utf_bom);
5433 CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5434 : EQ (val, Qt) ? utf_with_bom
5435 : utf_without_bom);
df7492f9
KH
5436 coding->detector = detect_coding_utf_8;
5437 coding->decoder = decode_coding_utf_8;
5438 coding->encoder = encode_coding_utf_8;
5439 coding->common_flags
5440 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
a470d443
KH
5441 if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5442 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
df7492f9
KH
5443 }
5444 else if (EQ (coding_type, Qutf_16))
5445 {
a470d443
KH
5446 val = AREF (attrs, coding_attr_utf_bom);
5447 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5448 : EQ (val, Qt) ? utf_with_bom
5449 : utf_without_bom);
df7492f9 5450 val = AREF (attrs, coding_attr_utf_16_endian);
b49a1807 5451 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
df7492f9 5452 : utf_16_little_endian);
e19c3639 5453 CODING_UTF_16_SURROGATE (coding) = 0;
df7492f9
KH
5454 coding->detector = detect_coding_utf_16;
5455 coding->decoder = decode_coding_utf_16;
5456 coding->encoder = encode_coding_utf_16;
5457 coding->common_flags
5458 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
a470d443 5459 if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
b49a1807 5460 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
d46c5b12 5461 }
df7492f9 5462 else if (EQ (coding_type, Qccl))
4ed46869 5463 {
df7492f9
KH
5464 coding->detector = detect_coding_ccl;
5465 coding->decoder = decode_coding_ccl;
5466 coding->encoder = encode_coding_ccl;
c952af22 5467 coding->common_flags
df7492f9
KH
5468 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5469 | CODING_REQUIRE_FLUSHING_MASK);
5470 }
5471 else if (EQ (coding_type, Qemacs_mule))
5472 {
5473 coding->detector = detect_coding_emacs_mule;
5474 coding->decoder = decode_coding_emacs_mule;
5475 coding->encoder = encode_coding_emacs_mule;
c952af22 5476 coding->common_flags
df7492f9
KH
5477 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5478 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5479 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5480 {
5481 Lisp_Object tail, safe_charsets;
5482 int max_charset_id = 0;
5483
5484 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5485 tail = XCDR (tail))
5486 if (max_charset_id < XFASTINT (XCAR (tail)))
5487 max_charset_id = XFASTINT (XCAR (tail));
5488 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
5489 make_number (255));
5490 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5491 tail = XCDR (tail))
8f924df7 5492 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
df7492f9 5493 coding->max_charset_id = max_charset_id;
8f924df7 5494 coding->safe_charsets = (char *) SDATA (safe_charsets);
df7492f9
KH
5495 }
5496 }
5497 else if (EQ (coding_type, Qshift_jis))
5498 {
5499 coding->detector = detect_coding_sjis;
5500 coding->decoder = decode_coding_sjis;
5501 coding->encoder = encode_coding_sjis;
c952af22 5502 coding->common_flags
df7492f9
KH
5503 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5504 }
5505 else if (EQ (coding_type, Qbig5))
5506 {
5507 coding->detector = detect_coding_big5;
5508 coding->decoder = decode_coding_big5;
5509 coding->encoder = encode_coding_big5;
c952af22 5510 coding->common_flags
df7492f9
KH
5511 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5512 }
5513 else /* EQ (coding_type, Qraw_text) */
ec6d2bb8 5514 {
df7492f9
KH
5515 coding->detector = NULL;
5516 coding->decoder = decode_coding_raw_text;
5517 coding->encoder = encode_coding_raw_text;
ea29edf2
KH
5518 if (! EQ (eol_type, Qunix))
5519 {
5520 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5521 if (! VECTORP (eol_type))
5522 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5523 }
5524
4ed46869 5525 }
4ed46869 5526
df7492f9 5527 return;
4ed46869
KH
5528}
5529
0ff61e78
KH
5530/* Return a list of charsets supported by CODING. */
5531
5532Lisp_Object
5533coding_charset_list (coding)
5534 struct coding_system *coding;
5535{
35befdaa 5536 Lisp_Object attrs, charset_list;
0ff61e78
KH
5537
5538 CODING_GET_INFO (coding, attrs, charset_list);
5539 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5540 {
5541 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5542
5543 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5544 charset_list = Viso_2022_charset_list;
5545 }
5546 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5547 {
5548 charset_list = Vemacs_mule_charset_list;
5549 }
5550 return charset_list;
5551}
5552
5553
df7492f9
KH
5554/* Return raw-text or one of its subsidiaries that has the same
5555 eol_type as CODING-SYSTEM. */
ec6d2bb8 5556
df7492f9
KH
5557Lisp_Object
5558raw_text_coding_system (coding_system)
5559 Lisp_Object coding_system;
ec6d2bb8 5560{
0be8721c 5561 Lisp_Object spec, attrs;
df7492f9 5562 Lisp_Object eol_type, raw_text_eol_type;
ec6d2bb8 5563
d3e4cb56
KH
5564 if (NILP (coding_system))
5565 return Qraw_text;
df7492f9
KH
5566 spec = CODING_SYSTEM_SPEC (coding_system);
5567 attrs = AREF (spec, 0);
ec6d2bb8 5568
df7492f9
KH
5569 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5570 return coding_system;
ec6d2bb8 5571
df7492f9
KH
5572 eol_type = AREF (spec, 2);
5573 if (VECTORP (eol_type))
5574 return Qraw_text;
5575 spec = CODING_SYSTEM_SPEC (Qraw_text);
5576 raw_text_eol_type = AREF (spec, 2);
5577 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5578 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5579 : AREF (raw_text_eol_type, 2));
ec6d2bb8
KH
5580}
5581
54f78171 5582
df7492f9
KH
5583/* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5584 does, return one of the subsidiary that has the same eol-spec as
fcbcfb64
KH
5585 PARENT. Otherwise, return CODING_SYSTEM. If PARENT is nil,
5586 inherit end-of-line format from the system's setting
5587 (system_eol_type). */
df7492f9
KH
5588
5589Lisp_Object
5590coding_inherit_eol_type (coding_system, parent)
b74e4686 5591 Lisp_Object coding_system, parent;
54f78171 5592{
3e139625 5593 Lisp_Object spec, eol_type;
54f78171 5594
d3e4cb56
KH
5595 if (NILP (coding_system))
5596 coding_system = Qraw_text;
df7492f9 5597 spec = CODING_SYSTEM_SPEC (coding_system);
df7492f9 5598 eol_type = AREF (spec, 2);
fcbcfb64 5599 if (VECTORP (eol_type))
df7492f9 5600 {
df7492f9
KH
5601 Lisp_Object parent_eol_type;
5602
fcbcfb64
KH
5603 if (! NILP (parent))
5604 {
5605 Lisp_Object parent_spec;
5606
4a015c45 5607 parent_spec = CODING_SYSTEM_SPEC (parent);
fcbcfb64
KH
5608 parent_eol_type = AREF (parent_spec, 2);
5609 }
5610 else
5611 parent_eol_type = system_eol_type;
df7492f9
KH
5612 if (EQ (parent_eol_type, Qunix))
5613 coding_system = AREF (eol_type, 0);
5614 else if (EQ (parent_eol_type, Qdos))
5615 coding_system = AREF (eol_type, 1);
5616 else if (EQ (parent_eol_type, Qmac))
5617 coding_system = AREF (eol_type, 2);
54f78171 5618 }
df7492f9 5619 return coding_system;
54f78171
KH
5620}
5621
4ed46869
KH
5622/* Emacs has a mechanism to automatically detect a coding system if it
5623 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5624 it's impossible to distinguish some coding systems accurately
5625 because they use the same range of codes. So, at first, coding
5626 systems are categorized into the following categories:
5627
0ef69138 5628 o coding-category-emacs-mule
4ed46869
KH
5629
5630 The category for a coding system which has the same code range
5631 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 5632 symbol) `emacs-mule' by default.
4ed46869
KH
5633
5634 o coding-category-sjis
5635
5636 The category for a coding system which has the same code range
5637 as SJIS. Assigned the coding-system (Lisp
7717c392 5638 symbol) `japanese-shift-jis' by default.
4ed46869
KH
5639
5640 o coding-category-iso-7
5641
5642 The category for a coding system which has the same code range
7717c392 5643 as ISO2022 of 7-bit environment. This doesn't use any locking
d46c5b12
KH
5644 shift and single shift functions. This can encode/decode all
5645 charsets. Assigned the coding-system (Lisp symbol)
5646 `iso-2022-7bit' by default.
5647
5648 o coding-category-iso-7-tight
5649
5650 Same as coding-category-iso-7 except that this can
5651 encode/decode only the specified charsets.
4ed46869
KH
5652
5653 o coding-category-iso-8-1
5654
5655 The category for a coding system which has the same code range
5656 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
5657 for DIMENSION1 charset. This doesn't use any locking shift
5658 and single shift functions. Assigned the coding-system (Lisp
5659 symbol) `iso-latin-1' by default.
4ed46869
KH
5660
5661 o coding-category-iso-8-2
5662
5663 The category for a coding system which has the same code range
5664 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
5665 for DIMENSION2 charset. This doesn't use any locking shift
5666 and single shift functions. Assigned the coding-system (Lisp
5667 symbol) `japanese-iso-8bit' by default.
4ed46869 5668
7717c392 5669 o coding-category-iso-7-else
4ed46869
KH
5670
5671 The category for a coding system which has the same code range
df7492f9 5672 as ISO2022 of 7-bit environment but uses locking shift or
7717c392
KH
5673 single shift functions. Assigned the coding-system (Lisp
5674 symbol) `iso-2022-7bit-lock' by default.
5675
5676 o coding-category-iso-8-else
5677
5678 The category for a coding system which has the same code range
df7492f9 5679 as ISO2022 of 8-bit environment but uses locking shift or
7717c392
KH
5680 single shift functions. Assigned the coding-system (Lisp
5681 symbol) `iso-2022-8bit-ss2' by default.
4ed46869
KH
5682
5683 o coding-category-big5
5684
5685 The category for a coding system which has the same code range
5686 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 5687 `cn-big5' by default.
4ed46869 5688
fa42c37f
KH
5689 o coding-category-utf-8
5690
5691 The category for a coding system which has the same code range
6e76ae91 5692 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
fa42c37f
KH
5693 symbol) `utf-8' by default.
5694
5695 o coding-category-utf-16-be
5696
5697 The category for a coding system in which a text has an
5698 Unicode signature (cf. Unicode Standard) in the order of BIG
5699 endian at the head. Assigned the coding-system (Lisp symbol)
5700 `utf-16-be' by default.
5701
5702 o coding-category-utf-16-le
5703
5704 The category for a coding system in which a text has an
5705 Unicode signature (cf. Unicode Standard) in the order of
5706 LITTLE endian at the head. Assigned the coding-system (Lisp
5707 symbol) `utf-16-le' by default.
5708
1397dc18
KH
5709 o coding-category-ccl
5710
5711 The category for a coding system of which encoder/decoder is
5712 written in CCL programs. The default value is nil, i.e., no
5713 coding system is assigned.
5714
4ed46869
KH
5715 o coding-category-binary
5716
5717 The category for a coding system not categorized in any of the
5718 above. Assigned the coding-system (Lisp symbol)
e0e989f6 5719 `no-conversion' by default.
4ed46869
KH
5720
5721 Each of them is a Lisp symbol and the value is an actual
df7492f9 5722 `coding-system's (this is also a Lisp symbol) assigned by a user.
4ed46869
KH
5723 What Emacs does actually is to detect a category of coding system.
5724 Then, it uses a `coding-system' assigned to it. If Emacs can't
df7492f9 5725 decide only one possible category, it selects a category of the
4ed46869
KH
5726 highest priority. Priorities of categories are also specified by a
5727 user in a Lisp variable `coding-category-list'.
5728
5729*/
5730
/* Bit flags recording which end-of-line formats were seen in a text.
   They are distinct bits so that several of them can be ORed
   together.  */
#define EOL_SEEN_NONE   0x0
#define EOL_SEEN_LF     0x1
#define EOL_SEEN_CR     0x2
#define EOL_SEEN_CRLF   0x4
66cfb530 5735
ff0dacd7
KH
5736/* Detect how end-of-line of a text of length SRC_BYTES pointed by
5737 SOURCE is encoded. If CATEGORY is one of
5738 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5739 two-byte, else they are encoded by one-byte.
5740
5741 Return one of EOL_SEEN_XXX. */
4ed46869 5742
bc4bc72a 5743#define MAX_EOL_CHECK_COUNT 3
d46c5b12
KH
5744
5745static int
89528eb3 5746detect_eol (source, src_bytes, category)
f6cbaf43 5747 const unsigned char *source;
df7492f9 5748 EMACS_INT src_bytes;
89528eb3 5749 enum coding_category category;
4ed46869 5750{
f6cbaf43 5751 const unsigned char *src = source, *src_end = src + src_bytes;
4ed46869 5752 unsigned char c;
df7492f9
KH
5753 int total = 0;
5754 int eol_seen = EOL_SEEN_NONE;
4ed46869 5755
89528eb3 5756 if ((1 << category) & CATEGORY_MASK_UTF_16)
4ed46869 5757 {
df7492f9 5758 int msb, lsb;
fa42c37f 5759
89528eb3
KH
5760 msb = category == (coding_category_utf_16_le
5761 | coding_category_utf_16_le_nosig);
df7492f9 5762 lsb = 1 - msb;
fa42c37f 5763
df7492f9 5764 while (src + 1 < src_end)
fa42c37f 5765 {
df7492f9
KH
5766 c = src[lsb];
5767 if (src[msb] == 0 && (c == '\n' || c == '\r'))
fa42c37f 5768 {
df7492f9
KH
5769 int this_eol;
5770
5771 if (c == '\n')
5772 this_eol = EOL_SEEN_LF;
5773 else if (src + 3 >= src_end
5774 || src[msb + 2] != 0
5775 || src[lsb + 2] != '\n')
5776 this_eol = EOL_SEEN_CR;
fa42c37f 5777 else
8f924df7 5778 this_eol = EOL_SEEN_CRLF;
df7492f9
KH
5779
5780 if (eol_seen == EOL_SEEN_NONE)
5781 /* This is the first end-of-line. */
5782 eol_seen = this_eol;
5783 else if (eol_seen != this_eol)
fa42c37f 5784 {
df7492f9
KH
5785 /* The found type is different from what found before. */
5786 eol_seen = EOL_SEEN_LF;
5787 break;
fa42c37f 5788 }
df7492f9
KH
5789 if (++total == MAX_EOL_CHECK_COUNT)
5790 break;
fa42c37f 5791 }
df7492f9 5792 src += 2;
fa42c37f 5793 }
bcf26d6a 5794 }
d46c5b12 5795 else
c4825358 5796 {
df7492f9 5797 while (src < src_end)
27901516 5798 {
df7492f9
KH
5799 c = *src++;
5800 if (c == '\n' || c == '\r')
5801 {
5802 int this_eol;
d46c5b12 5803
df7492f9
KH
5804 if (c == '\n')
5805 this_eol = EOL_SEEN_LF;
5806 else if (src >= src_end || *src != '\n')
5807 this_eol = EOL_SEEN_CR;
5808 else
5809 this_eol = EOL_SEEN_CRLF, src++;
d46c5b12 5810
df7492f9
KH
5811 if (eol_seen == EOL_SEEN_NONE)
5812 /* This is the first end-of-line. */
5813 eol_seen = this_eol;
5814 else if (eol_seen != this_eol)
5815 {
5816 /* The found type is different from what found before. */
5817 eol_seen = EOL_SEEN_LF;
5818 break;
5819 }
5820 if (++total == MAX_EOL_CHECK_COUNT)
5821 break;
5822 }
5823 }
73be902c 5824 }
df7492f9 5825 return eol_seen;
73be902c
KH
5826}
5827
df7492f9 5828
24a73b0a 5829static Lisp_Object
df7492f9
KH
5830adjust_coding_eol_type (coding, eol_seen)
5831 struct coding_system *coding;
5832 int eol_seen;
73be902c 5833{
0be8721c 5834 Lisp_Object eol_type;
8f924df7 5835
df7492f9
KH
5836 eol_type = CODING_ID_EOL_TYPE (coding->id);
5837 if (eol_seen & EOL_SEEN_LF)
24a73b0a
KH
5838 {
5839 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5840 eol_type = Qunix;
5841 }
6f197c07 5842 else if (eol_seen & EOL_SEEN_CRLF)
24a73b0a
KH
5843 {
5844 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5845 eol_type = Qdos;
5846 }
6f197c07 5847 else if (eol_seen & EOL_SEEN_CR)
24a73b0a
KH
5848 {
5849 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5850 eol_type = Qmac;
5851 }
5852 return eol_type;
d46c5b12 5853}
4ed46869 5854
df7492f9
KH
5855 /* Detect how a text specified in CODING is encoded. If a coding
5856 system is detected, update fields of CODING by the detected coding
5857 system. */
/* Three cases are handled, selected by the coding system currently
   set in CODING:
   - its type is `undecided': the source is scanned and the detectors
     of the categories listed in coding_priorities are tried;
   - its category is utf-8-auto or utf-16-auto: one of the two coding
     systems stored as a cons in the coding_attr_utf_bom attribute is
     chosen according to what the UTF-8 / UTF-16 detector found.
   For any other coding system only the consumed/produced counters
   and head_ascii are reset.  */
0a28aafb 5858
df7492f9
KH
5859void
5860detect_coding (coding)
d46c5b12 5861 struct coding_system *coding;
d46c5b12 5862{
8f924df7 5863 const unsigned char *src, *src_end;
d46c5b12 5864
df7492f9
KH
5865 coding->consumed = coding->consumed_char = 0;
5866 coding->produced = coding->produced_char = 0;
5867 coding_set_source (coding);
1c3478b0 5868
df7492f9 5869 src_end = coding->source + coding->src_bytes;
c0e16b14 5870 coding->head_ascii = 0;
1c3478b0 5871
df7492f9
KH
5872 /* If we have not yet decided the text encoding type, detect it
5873 now. */
5874 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
b73bfc1c 5875 {
df7492f9 5876 int c, i;
6cb21a4f 5877 struct coding_detection_info detect_info;
2f3cbb32 5878 int null_byte_found = 0, eight_bit_found = 0;
df7492f9
6cb21a4f 5880 detect_info.checked = detect_info.found = detect_info.rejected = 0;
/* Scan the source.  head_ascii counts the bytes below 0x80 that
   precede the first byte with the 8th bit set.  The scan stops early
   once both a null byte and an 8-bit byte have been seen, or when
   the ISO-2022 detector reports success.  */
2f3cbb32 5881 for (src = coding->source; src < src_end; src++)
d46c5b12 5882 {
df7492f9 5883 c = *src;
6cb21a4f 5884 if (c & 0x80)
6cb21a4f 5885 {
2f3cbb32 5886 eight_bit_found = 1;
2f3cbb32
KH
5887 if (null_byte_found)
5888 break;
5889 }
5890 else if (c < 0x20)
5891 {
/* The ISO-2022 detector is run at most once: only while
   detect_info.checked is still zero.  */
5892 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
5893 && ! inhibit_iso_escape_detection
5894 && ! detect_info.checked)
6cb21a4f 5895 {
2f3cbb32
KH
5896 if (detect_coding_iso_2022 (coding, &detect_info))
5897 {
5898 /* We have scanned the whole data. */
5899 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
c0e16b14
KH
5900 {
5901 /* We didn't find an 8-bit code. We may
5902 have found a null-byte, but it's very
5903 rare that a binary file conform to
5904 ISO-2022. */
5905 src = src_end;
5906 coding->head_ascii = src - coding->source;
5907 }
5908 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
2f3cbb32
KH
5909 break;
5910 }
5911 }
97b1b294 5912 else if (! c && !inhibit_null_byte_detection)
2f3cbb32
KH
5913 {
5914 null_byte_found = 1;
5915 if (eight_bit_found)
5916 break;
6cb21a4f 5917 }
c006c0c8
KH
5918 if (! eight_bit_found)
5919 coding->head_ascii++;
6cb21a4f 5920 }
c006c0c8 5921 else if (! eight_bit_found)
c0e16b14 5922 coding->head_ascii++;
d46c5b12 5923 }
df7492f9
/* Nothing below runs for a source made only of 7-bit bytes in which
   no null byte was noted and no category was found; CODING is then
   left as it was.  */
2f3cbb32
KH
5925 if (null_byte_found || eight_bit_found
5926 || coding->head_ascii < coding->src_bytes
6cb21a4f 5927 || detect_info.found)
d46c5b12 5928 {
ff0dacd7
KH
5929 enum coding_category category;
5930 struct coding_system *this;
df7492f9
6cb21a4f
KH
5932 if (coding->head_ascii == coding->src_bytes)
5933 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
5934 for (i = 0; i < coding_category_raw_text; i++)
5935 {
5936 category = coding_priorities[i];
5937 this = coding_categories + category;
5938 if (detect_info.found & (1 << category))
24a73b0a 5939 break;
6cb21a4f
KH
5940 }
5941 else
2f3cbb32
KH
5942 {
5943 if (null_byte_found)
ff0dacd7 5944 {
/* Mark every category outside CATEGORY_MASK_UTF_16 as already
   checked and rejected, so only the UTF-16 detectors can still
   run in the loop below.  */
2f3cbb32
KH
5945 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
5946 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
ff0dacd7 5947 }
/* Walk the categories in priority order and stop at the first one
   that is found, running its detector if it was not checked yet.  */
2f3cbb32
KH
5948 for (i = 0; i < coding_category_raw_text; i++)
5949 {
5950 category = coding_priorities[i];
5951 this = coding_categories + category;
5952 if (this->id < 0)
5953 {
5954 /* No coding system of this category is defined. */
5955 detect_info.rejected |= (1 << category);
5956 }
5957 else if (category >= coding_category_raw_text)
5958 continue;
5959 else if (detect_info.checked & (1 << category))
5960 {
5961 if (detect_info.found & (1 << category))
5962 break;
5963 }
5964 else if ((*(this->detector)) (coding, &detect_info)
5965 && detect_info.found & (1 << category))
5966 {
/* NOTE(review): CATEGORY is narrowed to the LE or BE category here,
   but THIS is not updated and CATEGORY is not read again before the
   setup_coding_system call below -- confirm this is intended.  */
5967 if (category == coding_category_utf_16_auto)
5968 {
5969 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5970 category = coding_category_utf_16_le;
5971 else
5972 category = coding_category_utf_16_be;
5973 }
5974 break;
5975 }
5976 }
2f3cbb32 5977 }
c0e16b14
KH
5978
/* I is below coding_category_raw_text only if one of the loops above
   stopped on a found category; THIS then points at that category.
   Otherwise fall back, in order, to: no-conversion when a null byte
   was seen, raw-text when every category was rejected, or the
   highest-priority category that was not rejected.  */
5979 if (i < coding_category_raw_text)
5980 setup_coding_system (CODING_ID_NAME (this->id), coding);
5981 else if (null_byte_found)
5982 setup_coding_system (Qno_conversion, coding);
5983 else if ((detect_info.rejected & CATEGORY_MASK_ANY)
5984 == CATEGORY_MASK_ANY)
5985 setup_coding_system (Qraw_text, coding);
5986 else if (detect_info.rejected)
5987 for (i = 0; i < coding_category_raw_text; i++)
5988 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5989 {
5990 this = coding_categories + coding_priorities[i];
5991 setup_coding_system (CODING_ID_NAME (this->id), coding);
5992 break;
5993 }
d46c5b12 5994 }
b73bfc1c 5995 }
a470d443
KH
5996 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5997 == coding_category_utf_8_auto)
5998 {
5999 Lisp_Object coding_systems;
6000 struct coding_detection_info detect_info;
6001
/* When the attribute is a cons, its car is used if the detector
   reports CATEGORY_MASK_UTF_8_SIG and its cdr otherwise.  */
6002 coding_systems
6003 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6004 detect_info.found = detect_info.rejected = 0;
6005 coding->head_ascii = 0;
6006 if (CONSP (coding_systems)
6007 && detect_coding_utf_8 (coding, &detect_info))
6008 {
6009 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6010 setup_coding_system (XCAR (coding_systems), coding);
6011 else
6012 setup_coding_system (XCDR (coding_systems), coding);
6013 }
6014 }
24a73b0a
KH
6015 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6016 == coding_category_utf_16_auto)
b49a1807
KH
6017 {
6018 Lisp_Object coding_systems;
6019 struct coding_detection_info detect_info;
6020
/* When the attribute is a cons, its car is used for
   CATEGORY_MASK_UTF_16_LE and its cdr for CATEGORY_MASK_UTF_16_BE;
   if neither is found CODING is left unchanged.  */
6021 coding_systems
a470d443 6022 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
b49a1807 6023 detect_info.found = detect_info.rejected = 0;
a470d443 6024 coding->head_ascii = 0;
b49a1807 6025 if (CONSP (coding_systems)
24a73b0a 6026 && detect_coding_utf_16 (coding, &detect_info))
b49a1807
KH
6027 {
6028 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6029 setup_coding_system (XCAR (coding_systems), coding);
24a73b0a 6030 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
b49a1807
KH
6031 setup_coding_system (XCDR (coding_systems), coding);
6032 }
6033 }
4ed46869 6034}
4ed46869 6035
d46c5b12 6036
/* Convert the end-of-lines of the text just produced by CODING to
   '\n', in place.

   The text is the CODING->produced bytes starting at
   CODING->destination, or at CODING->dst_pos_byte of the current
   buffer when CODING->dst_object is non-nil.  Nothing is done when
   the eol-type of CODING is Qunix.  When the eol-type is a vector
   (not decided yet), the text is scanned first and the format is
   fixed with adjust_coding_eol_type; a text using more than one
   format is treated as LF.  For Qmac each '\r' is overwritten with
   '\n'.  For Qdos '\r' bytes are deleted and CODING->produced and
   CODING->produced_char are reduced by the number deleted.  */
aaaf0b1e 6037 static void
df7492f9 6038decode_eol (coding)
aaaf0b1e 6039 struct coding_system *coding;
aaaf0b1e 6040{
24a73b0a
KH
6041 Lisp_Object eol_type;
6042 unsigned char *p, *pbeg, *pend;
3ed051d4 6043
24a73b0a
KH
6044 eol_type = CODING_ID_EOL_TYPE (coding->id);
6045 if (EQ (eol_type, Qunix))
6046 return;
6047
6048 if (NILP (coding->dst_object))
6049 pbeg = coding->destination;
6050 else
6051 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6052 pend = pbeg + coding->produced;
6053
6054 if (VECTORP (eol_type))
aaaf0b1e 6055 {
df7492f9 6056 int eol_seen = EOL_SEEN_NONE;
4ed46869 6057
/* Collect, as ORed EOL_SEEN_XXX flags, every end-of-line format
   occurring in the produced text.  */
24a73b0a 6058 for (p = pbeg; p < pend; p++)
aaaf0b1e 6059 {
df7492f9
KH
6060 if (*p == '\n')
6061 eol_seen |= EOL_SEEN_LF;
6062 else if (*p == '\r')
aaaf0b1e 6063 {
df7492f9 6064 if (p + 1 < pend && *(p + 1) == '\n')
aaaf0b1e 6065 {
df7492f9
KH
6066 eol_seen |= EOL_SEEN_CRLF;
6067 p++;
aaaf0b1e 6068 }
aaaf0b1e 6069 else
df7492f9 6070 eol_seen |= EOL_SEEN_CR;
aaaf0b1e 6071 }
aaaf0b1e 6072 }
/* More than one flag set means mixed formats; fall back to LF.  */
24a73b0a
KH
6073 if (eol_seen != EOL_SEEN_NONE
6074 && eol_seen != EOL_SEEN_LF
6075 && eol_seen != EOL_SEEN_CRLF
6076 && eol_seen != EOL_SEEN_CR)
6077 eol_seen = EOL_SEEN_LF;
df7492f9 6078 if (eol_seen != EOL_SEEN_NONE)
24a73b0a 6079 eol_type = adjust_coding_eol_type (coding, eol_seen);
aaaf0b1e 6080 }
d46c5b12 6081
24a73b0a 6082 if (EQ (eol_type, Qmac))
27901516 6083 {
24a73b0a 6084 for (p = pbeg; p < pend; p++)
df7492f9
KH
6085 if (*p == '\r')
6086 *p = '\n';
4ed46869 6087 }
24a73b0a 6088 else if (EQ (eol_type, Qdos))
df7492f9 6089 {
/* N counts the deleted '\r' bytes.  */
24a73b0a 6090 int n = 0;
b73bfc1c 6091
24a73b0a
KH
6092 if (NILP (coding->dst_object))
6093 {
4347441b
KH
6094 /* Start deleting '\r' from the tail to minimize the memory
6095 movement. */
/* NOTE(review): this branch deletes every '\r' it meets, whereas the
   buffer branch below deletes a '\r' only when '\n' follows it --
   confirm the difference is intended.  */
24a73b0a
KH
6096 for (p = pend - 2; p >= pbeg; p--)
6097 if (*p == '\r')
6098 {
6099 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
6100 n++;
6101 }
6102 }
6103 else
6104 {
4347441b
KH
6105 int pos_byte = coding->dst_pos_byte;
6106 int pos = coding->dst_pos;
6107 int pos_end = pos + coding->produced_char - 1;
6108
/* Walk the produced text by character position; POS_END shrinks by
   one for each character deleted.  */
6109 while (pos < pos_end)
6110 {
6111 p = BYTE_POS_ADDR (pos_byte);
6112 if (*p == '\r' && p[1] == '\n')
6113 {
6114 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6115 n++;
6116 pos_end--;
6117 }
6118 pos++;
/* NOTE(review): P was computed before del_range_2 may have run;
   verify that *P is still the intended byte here.  */
69b8522d
KH
6119 if (coding->dst_multibyte)
6120 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6121 else
6122 pos_byte++;
4347441b 6123 }
24a73b0a
KH
6124 }
6125 coding->produced -= n;
6126 coding->produced_char -= n;
aaaf0b1e 6127 }
4ed46869
KH
6128}
6129
7d64c6ad 6130
a6f87d34
KH
6131/* Return a translation table (or list of them) from coding system
6132 attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6133 decoding (ENCODEP is zero). */
7d64c6ad 6134
e6a54062 6135static Lisp_Object
09ee6fdd
KH
6136get_translation_table (attrs, encodep, max_lookup)
6137 Lisp_Object attrs;
6138 int encodep, *max_lookup;
7d64c6ad
KH
6139{
6140 Lisp_Object standard, translation_table;
09ee6fdd 6141 Lisp_Object val;
7d64c6ad
KH
6142
6143 if (encodep)
6144 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6145 standard = Vstandard_translation_table_for_encode;
6146 else
6147 translation_table = CODING_ATTR_DECODE_TBL (attrs),
6148 standard = Vstandard_translation_table_for_decode;
7d64c6ad 6149 if (NILP (translation_table))
09ee6fdd
KH
6150 translation_table = standard;
6151 else
a6f87d34 6152 {
09ee6fdd
KH
6153 if (SYMBOLP (translation_table))
6154 translation_table = Fget (translation_table, Qtranslation_table);
6155 else if (CONSP (translation_table))
6156 {
6157 translation_table = Fcopy_sequence (translation_table);
6158 for (val = translation_table; CONSP (val); val = XCDR (val))
6159 if (SYMBOLP (XCAR (val)))
6160 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6161 }
6162 if (CHAR_TABLE_P (standard))
6163 {
6164 if (CONSP (translation_table))
6165 translation_table = nconc2 (translation_table,
6166 Fcons (standard, Qnil));
6167 else
6168 translation_table = Fcons (translation_table,
6169 Fcons (standard, Qnil));
6170 }
a6f87d34 6171 }
2170c8f0
KH
6172
6173 if (max_lookup)
09ee6fdd 6174 {
2170c8f0
KH
6175 *max_lookup = 1;
6176 if (CHAR_TABLE_P (translation_table)
6177 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6178 {
6179 val = XCHAR_TABLE (translation_table)->extras[1];
6180 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6181 *max_lookup = XFASTINT (val);
6182 }
6183 else if (CONSP (translation_table))
6184 {
6185 Lisp_Object tail, val;
09ee6fdd 6186
2170c8f0
KH
6187 for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6188 if (CHAR_TABLE_P (XCAR (tail))
6189 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6190 {
6191 val = XCHAR_TABLE (XCAR (tail))->extras[1];
6192 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6193 *max_lookup = XFASTINT (val);
6194 }
6195 }
a6f87d34 6196 }
7d64c6ad
KH
6197 return translation_table;
6198}
6199
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables, and store the outcome in C and TRANS (both must be
   lvalues).

   When a table maps C to a character, C is replaced by that
   character and TRANS becomes Qnil; with a list, the lookup then
   continues in the following tables using the new C.  When a table
   yields a non-nil value that is not a character, TRANS is set to
   that value and, with a list, the lookup stops there.  If TABLE is
   neither a char-table nor a cons, TRANS is Qnil and C is
   unchanged.  */
#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    trans = Qnil;                                               \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        trans = CHAR_TABLE_REF (table, c);                      \
        if (CHARACTERP (trans))                                 \
          {                                                     \
            c = XFASTINT (trans);                               \
            trans = Qnil;                                       \
          }                                                     \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        Lisp_Object tail;                                       \
                                                                \
        for (tail = table; CONSP (tail); tail = XCDR (tail))    \
          {                                                     \
            if (! CHAR_TABLE_P (XCAR (tail)))                   \
              continue;                                         \
            trans = CHAR_TABLE_REF (XCAR (tail), c);            \
            if (CHARACTERP (trans))                             \
              {                                                 \
                c = XFASTINT (trans);                           \
                trans = Qnil;                                   \
              }                                                 \
            else if (! NILP (trans))                            \
              break;                                            \
          }                                                     \
      }                                                         \
  } while (0)
6224
7d64c6ad 6225
69a80ea3
KH
6226static Lisp_Object
6227get_translation (val, buf, buf_end, last_block, from_nchars, to_nchars)
6228 Lisp_Object val;
6229 int *buf, *buf_end;
6230 int last_block;
6231 int *from_nchars, *to_nchars;
6232{
433f7f87
KH
6233 /* VAL is TO or (([FROM-CHAR ...] . TO) ...) where TO is TO-CHAR or
6234 [TO-CHAR ...]. */
69a80ea3
KH
6235 if (CONSP (val))
6236 {
433f7f87 6237 Lisp_Object from, tail;
69a80ea3
KH
6238 int i, len;
6239
433f7f87 6240 for (tail = val; CONSP (tail); tail = XCDR (tail))
69a80ea3 6241 {
433f7f87
KH
6242 val = XCAR (tail);
6243 from = XCAR (val);
6244 len = ASIZE (from);
6245 for (i = 0; i < len; i++)
6246 {
6247 if (buf + i == buf_end)
6248 {
6249 if (! last_block)
6250 return Qt;
6251 break;
6252 }
6253 if (XINT (AREF (from, i)) != buf[i])
6254 break;
6255 }
6256 if (i == len)
6257 {
6258 val = XCDR (val);
6259 *from_nchars = len;
6260 break;
6261 }
69a80ea3 6262 }
433f7f87
KH
6263 if (! CONSP (tail))
6264 return Qnil;
69a80ea3
KH
6265 }
6266 if (VECTORP (val))
6267 *buf = XINT (AREF (val, 0)), *to_nchars = ASIZE (val);
6268 else
6269 *buf = XINT (val);
6270 return val;
6271}
6272
6273
/* Write characters of CODING to its destination and return the
   number of elements of CODING->charbuf left unprocessed.

   If CODING->chars_at_source is zero, the characters are taken from
   CODING->charbuf and translated through TRANSLATION_TABLE on the
   way; negative elements of charbuf are annotation data and are
   skipped.  LAST_BLOCK is passed on to get_translation; when that
   function returns Qt the loop stops and the remaining elements are
   counted in the return value.  Otherwise the bytes at
   CODING->source are copied, converting between multibyte and
   unibyte form as required, and the return value is 0.

   CODING->produced and CODING->produced_char are advanced by what
   was written.  If the destination is a buffer and at least one
   character was produced, insert_from_gap is called for the new
   text.  */
d46c5b12 6274 static int
69a80ea3 6275produce_chars (coding, translation_table, last_block)
df7492f9 6276 struct coding_system *coding;
69a80ea3
KH
6277 Lisp_Object translation_table;
6278 int last_block;
4ed46869 6279{
df7492f9
KH
6280 unsigned char *dst = coding->destination + coding->produced;
6281 unsigned char *dst_end = coding->destination + coding->dst_bytes;
119852e7
KH
6282 EMACS_INT produced;
6283 EMACS_INT produced_chars = 0;
69a80ea3 6284 int carryover = 0;
4ed46869 6285
df7492f9 6286 if (! coding->chars_at_source)
4ed46869 6287 {
119852e7 6288 /* Source characters are in coding->charbuf. */
fba4576f
AS
6289 int *buf = coding->charbuf;
6290 int *buf_end = buf + coding->charbuf_used;
4ed46869 6291
/* When source and destination are the same object, the write limit
   is coding->source + coding->consumed instead of the end of the
   destination area.  */
db274c7a
KH
6292 if (EQ (coding->src_object, coding->dst_object))
6293 {
6294 coding_set_source (coding);
6295 dst_end = ((unsigned char *) coding->source) + coding->consumed;
6296 }
4ed46869 6297
df7492f9 6298 while (buf < buf_end)
4ed46869 6299 {
69a80ea3 6300 int c = *buf, i;
bc4bc72a 6301
df7492f9
KH
6302 if (c >= 0)
6303 {
69a80ea3
KH
6304 int from_nchars = 1, to_nchars = 1;
6305 Lisp_Object trans = Qnil;
6306
09ee6fdd 6307 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
e6a54062 6308 if (! NILP (trans))
69a80ea3
KH
6309 {
6310 trans = get_translation (trans, buf, buf_end, last_block,
6311 &from_nchars, &to_nchars);
6312 if (EQ (trans, Qt))
6313 break;
6314 c = *buf;
6315 }
6316
/* Make sure there is room for TO_NCHARS characters of maximum
   length, then recompute the write limit as the destination may
   have been reallocated.  */
6317 if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
6318 {
6319 dst = alloc_destination (coding,
6320 buf_end - buf
6321 + MAX_MULTIBYTE_LENGTH * to_nchars,
6322 dst);
db274c7a
KH
6323 if (EQ (coding->src_object, coding->dst_object))
6324 {
6325 coding_set_source (coding);
6326 dst_end = ((unsigned char *) coding->source) + coding->consumed;
6327 }
6328 else
6329 dst_end = coding->destination + coding->dst_bytes;
69a80ea3
KH
6330 }
6331
/* The first character to write is already in C; the following ones
   are read from the vector TRANS.  */
433f7f87 6332 for (i = 0; i < to_nchars; i++)
69a80ea3 6333 {
433f7f87
KH
6334 if (i > 0)
6335 c = XINT (AREF (trans, i));
69a80ea3
KH
6336 if (coding->dst_multibyte
6337 || ! CHAR_BYTE8_P (c))
db274c7a 6338 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
69a80ea3
KH
6339 else
6340 *dst++ = CHAR_TO_BYTE8 (c);
6341 }
6342 produced_chars += to_nchars;
/* Overwrite the FROM_NCHARS consumed elements of charbuf with the
   number of characters produced followed by zeros.  */
6343 *buf++ = to_nchars;
6344 while (--from_nchars > 0)
6345 *buf++ = 0;
d46c5b12 6346 }
df7492f9 6347 else
69a80ea3
KH
6348 /* This is an annotation datum. (-C) is the length. */
6349 buf += -c;
4ed46869 6350 }
69a80ea3 6351 carryover = buf_end - buf;
4ed46869 6352 }
fa42c37f 6353 else
fa42c37f 6354 {
119852e7 6355 /* Source characters are at coding->source. */
8f924df7 6356 const unsigned char *src = coding->source;
119852e7 6357 const unsigned char *src_end = src + coding->consumed;
4ed46869 6358
db274c7a
KH
6359 if (EQ (coding->dst_object, coding->src_object))
6360 dst_end = (unsigned char *) src;
df7492f9 6361 if (coding->src_multibyte != coding->dst_multibyte)
fa42c37f 6362 {
df7492f9 6363 if (coding->src_multibyte)
fa42c37f 6364 {
/* Multibyte source, unibyte destination: each value read with
   ONE_MORE_BYTE is written as a single byte.  The locals below are
   presumably referenced by the ONE_MORE_BYTE macro, which is defined
   outside this function -- TODO confirm.  */
71c81426 6365 int multibytep = 1;
4533845d 6366 EMACS_INT consumed_chars = 0;
d46c5b12 6367
df7492f9
KH
6368 while (1)
6369 {
8f924df7 6370 const unsigned char *src_base = src;
df7492f9 6371 int c;
b73bfc1c 6372
df7492f9 6373 ONE_MORE_BYTE (c);
119852e7 6374 if (dst == dst_end)
df7492f9 6375 {
119852e7
KH
6376 if (EQ (coding->src_object, coding->dst_object))
6377 dst_end = (unsigned char *) src;
6378 if (dst == dst_end)
df7492f9 6379 {
119852e7
KH
6380 EMACS_INT offset = src - coding->source;
6381
6382 dst = alloc_destination (coding, src_end - src + 1,
6383 dst);
6384 dst_end = coding->destination + coding->dst_bytes;
6385 coding_set_source (coding);
6386 src = coding->source + offset;
/* NOTE(review): SRC_END was initialized from coding->consumed but is
   recomputed from coding->src_bytes here -- confirm the two agree.  */
6387 src_end = coding->source + coding->src_bytes;
db274c7a
KH
6388 if (EQ (coding->src_object, coding->dst_object))
6389 dst_end = (unsigned char *) src;
df7492f9 6390 }
df7492f9
KH
6391 }
6392 *dst++ = c;
6393 produced_chars++;
6394 }
6395 no_more_source:
6396 ;
fa42c37f
KH
6397 }
6398 else
/* Unibyte source, multibyte destination: each source byte is written
   with EMIT_ONE_BYTE.  The DST >= DST_END - 1 tests keep at least two
   bytes of room per source byte.  */
df7492f9
KH
6399 while (src < src_end)
6400 {
71c81426 6401 int multibytep = 1;
df7492f9 6402 int c = *src++;
b73bfc1c 6403
df7492f9
KH
6404 if (dst >= dst_end - 1)
6405 {
2c78b7e1 6406 if (EQ (coding->src_object, coding->dst_object))
8f924df7 6407 dst_end = (unsigned char *) src;
2c78b7e1
KH
6408 if (dst >= dst_end - 1)
6409 {
119852e7 6410 EMACS_INT offset = src - coding->source;
db274c7a 6411 EMACS_INT more_bytes;
119852e7 6412
db274c7a
KH
6413 if (EQ (coding->src_object, coding->dst_object))
6414 more_bytes = ((src_end - src) / 2) + 2;
6415 else
6416 more_bytes = src_end - src + 2;
6417 dst = alloc_destination (coding, more_bytes, dst);
2c78b7e1
KH
6418 dst_end = coding->destination + coding->dst_bytes;
6419 coding_set_source (coding);
119852e7 6420 src = coding->source + offset;
2c78b7e1 6421 src_end = coding->source + coding->src_bytes;
db274c7a
KH
6422 if (EQ (coding->src_object, coding->dst_object))
6423 dst_end = (unsigned char *) src;
2c78b7e1 6424 }
df7492f9
KH
6425 }
6426 EMIT_ONE_BYTE (c);
6427 }
d46c5b12 6428 }
df7492f9
KH
6429 else
6430 {
/* Source and destination have the same multibyteness: copy the
   bytes unchanged.  The destination is grown beforehand only when it
   is a different object and is smaller than the source.  */
6431 if (!EQ (coding->src_object, coding->dst_object))
fa42c37f 6432 {
119852e7 6433 EMACS_INT require = coding->src_bytes - coding->dst_bytes;
4ed46869 6434
df7492f9 6435 if (require > 0)
fa42c37f 6436 {
df7492f9
KH
6437 EMACS_INT offset = src - coding->source;
6438
6439 dst = alloc_destination (coding, require, dst);
6440 coding_set_source (coding);
6441 src = coding->source + offset;
6442 src_end = coding->source + coding->src_bytes;
fa42c37f
KH
6443 }
6444 }
119852e7 6445 produced_chars = coding->consumed_char;
df7492f9 6446 while (src < src_end)
14daee73 6447 *dst++ = *src++;
fa42c37f
KH
6448 }
6449 }
6450
/* PRODUCED is the number of bytes written by this call.  */
df7492f9 6451 produced = dst - (coding->destination + coding->produced);
284201e4 6452 if (BUFFERP (coding->dst_object) && produced_chars > 0)
df7492f9
KH
6453 insert_from_gap (produced_chars, produced);
6454 coding->produced += produced;
6455 coding->produced_char += produced_chars;
69a80ea3 6456 return carryover;
fa42c37f
KH
6457}
6458
ff0dacd7
KH
6459/* Compose text in CODING->object according to the annotation data at
6460 CHARBUF. CHARBUF is an array:
6461 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
df7492f9 6462 */
4ed46869 6463
df7492f9 6464static INLINE void
69a80ea3 6465produce_composition (coding, charbuf, pos)
4ed46869 6466 struct coding_system *coding;
df7492f9 6467 int *charbuf;
69a80ea3 6468 EMACS_INT pos;
4ed46869 6469{
df7492f9 6470 int len;
69a80ea3 6471 EMACS_INT to;
df7492f9 6472 enum composition_method method;
df7492f9 6473 Lisp_Object components;
fa42c37f 6474
df7492f9 6475 len = -charbuf[0];
69a80ea3 6476 to = pos + charbuf[2];
9ffd559c
KH
6477 if (to <= pos)
6478 return;
69a80ea3 6479 method = (enum composition_method) (charbuf[3]);
d46c5b12 6480
df7492f9
KH
6481 if (method == COMPOSITION_RELATIVE)
6482 components = Qnil;
9ffd559c
KH
6483 else if (method >= COMPOSITION_WITH_RULE
6484 && method <= COMPOSITION_WITH_RULE_ALTCHARS)
d46c5b12 6485 {
df7492f9
KH
6486 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6487 int i;
b73bfc1c 6488
69a80ea3
KH
6489 len -= 4;
6490 charbuf += 4;
df7492f9 6491 for (i = 0; i < len; i++)
9ffd559c
KH
6492 {
6493 args[i] = make_number (charbuf[i]);
f75c90a9 6494 if (charbuf[i] < 0)
9ffd559c
KH
6495 return;
6496 }
df7492f9
KH
6497 components = (method == COMPOSITION_WITH_ALTCHARS
6498 ? Fstring (len, args) : Fvector (len, args));
d46c5b12 6499 }
9ffd559c
KH
6500 else
6501 return;
69a80ea3 6502 compose_text (pos, to, components, Qnil, coding->dst_object);
d46c5b12
KH
6503}
6504
d46c5b12 6505
ff0dacd7
KH
6506/* Put `charset' property on text in CODING->object according to
6507 the annotation data at CHARBUF. CHARBUF is an array:
69a80ea3 6508 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
ff0dacd7 6509 */
d46c5b12 6510
ff0dacd7 6511static INLINE void
69a80ea3 6512produce_charset (coding, charbuf, pos)
d46c5b12 6513 struct coding_system *coding;
ff0dacd7 6514 int *charbuf;
69a80ea3 6515 EMACS_INT pos;
d46c5b12 6516{
69a80ea3
KH
6517 EMACS_INT from = pos - charbuf[2];
6518 struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
b73bfc1c 6519
69a80ea3 6520 Fput_text_property (make_number (from), make_number (pos),
ff0dacd7
KH
6521 Qcharset, CHARSET_NAME (charset),
6522 coding->dst_object);
d46c5b12
KH
6523}
6524
d46c5b12 6525
df7492f9
KH
/* Number of `int' elements first requested for the conversion work
   area.  */
#define CHARBUF_SIZE 0x4000

/* Allocate the work area CODING->charbuf with alloca and record its
   size (in elements) in CODING->charbuf_size.  Start by asking for
   CHARBUF_SIZE elements and halve the request while alloca yields a
   null pointer and the size is still above 1024.

   If no area could be obtained, record CODING_RESULT_INSUFFICIENT_MEM
   and RETURN CODING->result FROM THE FUNCTION THAT USES THIS MACRO.
   The macro can therefore be used only inside a function whose return
   type accepts that value.  Because the area comes from alloca, it
   lives only until that function returns.

   CODING may be any expression of type `struct coding_system *'; it
   is evaluated several times, so it must not have side effects.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int size = CHARBUF_SIZE;                                            \
                                                                        \
    (coding)->charbuf = NULL;                                           \
    while (size > 1024)                                                 \
      {                                                                 \
        (coding)->charbuf = (int *) alloca (sizeof (int) * size);       \
        if ((coding)->charbuf)                                          \
          break;                                                        \
        size >>= 1;                                                     \
      }                                                                 \
    if (! (coding)->charbuf)                                            \
      {                                                                 \
        record_conversion_result ((coding),                             \
                                  CODING_RESULT_INSUFFICIENT_MEM);      \
        return (coding)->result;                                        \
      }                                                                 \
    (coding)->charbuf_size = size;                                      \
  } while (0)
4ed46869 6547
d46c5b12
KH
6548
6549static void
69a80ea3 6550produce_annotation (coding, pos)
d46c5b12 6551 struct coding_system *coding;
69a80ea3 6552 EMACS_INT pos;
d46c5b12 6553{
df7492f9
KH
6554 int *charbuf = coding->charbuf;
6555 int *charbuf_end = charbuf + coding->charbuf_used;
d46c5b12 6556
ff0dacd7
KH
6557 if (NILP (coding->dst_object))
6558 return;
d46c5b12 6559
df7492f9 6560 while (charbuf < charbuf_end)
a84f1519 6561 {
df7492f9 6562 if (*charbuf >= 0)
69a80ea3 6563 pos += *charbuf++;
d46c5b12 6564 else
d46c5b12 6565 {
df7492f9 6566 int len = -*charbuf;
ff0dacd7 6567 switch (charbuf[1])
df7492f9
KH
6568 {
6569 case CODING_ANNOTATE_COMPOSITION_MASK:
69a80ea3 6570 produce_composition (coding, charbuf, pos);
df7492f9 6571 break;
ff0dacd7 6572 case CODING_ANNOTATE_CHARSET_MASK:
69a80ea3 6573 produce_charset (coding, charbuf, pos);
ff0dacd7 6574 break;
df7492f9
KH
6575 default:
6576 abort ();
6577 }
6578 charbuf += len;
d46c5b12 6579 }
a84f1519 6580 }
d46c5b12
KH
6581}
6582
df7492f9
KH
6583/* Decode the data at CODING->src_object into CODING->dst_object.
6584 CODING->src_object is a buffer, a string, or nil.
6585 CODING->dst_object is a buffer.
d46c5b12 6586
df7492f9
KH
6587 If CODING->src_object is a buffer, it must be the current buffer.
6588 In this case, if CODING->src_pos is positive, it is a position of
6589 the source text in the buffer, otherwise, the source text is in the
6590 gap area of the buffer, and CODING->src_pos specifies the offset of
6591 the text from GPT (which must be the same as PT). If this is the
6592 same buffer as CODING->dst_object, CODING->src_pos must be
6593 negative.
d46c5b12 6594
b6828792 6595 If CODING->src_object is a string, CODING->src_pos is an index to
df7492f9 6596 that string.
d46c5b12 6597
df7492f9
KH
6598 If CODING->src_object is nil, CODING->source must already point to
6599 the non-relocatable memory area. In this case, CODING->src_pos is
6600 an offset from CODING->source.
73be902c 6601
df7492f9
KH
6602 The decoded data is inserted at the current point of the buffer
6603 CODING->dst_object.
6604*/
d46c5b12 6605
df7492f9
KH
6606static int
6607decode_coding (coding)
d46c5b12 6608 struct coding_system *coding;
d46c5b12 6609{
df7492f9 6610 Lisp_Object attrs;
24a73b0a 6611 Lisp_Object undo_list;
7d64c6ad 6612 Lisp_Object translation_table;
69a80ea3
KH
6613 int carryover;
6614 int i;
d46c5b12 6615
df7492f9
KH
6616 if (BUFFERP (coding->src_object)
6617 && coding->src_pos > 0
6618 && coding->src_pos < GPT
6619 && coding->src_pos + coding->src_chars > GPT)
6620 move_gap_both (coding->src_pos, coding->src_pos_byte);
d46c5b12 6621
24a73b0a 6622 undo_list = Qt;
df7492f9 6623 if (BUFFERP (coding->dst_object))
1c3478b0 6624 {
df7492f9
KH
6625 if (current_buffer != XBUFFER (coding->dst_object))
6626 set_buffer_internal (XBUFFER (coding->dst_object));
6627 if (GPT != PT)
6628 move_gap_both (PT, PT_BYTE);
24a73b0a
KH
6629 undo_list = current_buffer->undo_list;
6630 current_buffer->undo_list = Qt;
1c3478b0
KH
6631 }
6632
df7492f9
KH
6633 coding->consumed = coding->consumed_char = 0;
6634 coding->produced = coding->produced_char = 0;
6635 coding->chars_at_source = 0;
065e3595 6636 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 6637 coding->errors = 0;
1c3478b0 6638
df7492f9
KH
6639 ALLOC_CONVERSION_WORK_AREA (coding);
6640
6641 attrs = CODING_ID_ATTRS (coding->id);
2170c8f0 6642 translation_table = get_translation_table (attrs, 0, NULL);
df7492f9 6643
69a80ea3 6644 carryover = 0;
df7492f9 6645 do
b73bfc1c 6646 {
69a80ea3
KH
6647 EMACS_INT pos = coding->dst_pos + coding->produced_char;
6648
df7492f9
KH
6649 coding_set_source (coding);
6650 coding->annotated = 0;
69a80ea3 6651 coding->charbuf_used = carryover;
df7492f9 6652 (*(coding->decoder)) (coding);
df7492f9 6653 coding_set_destination (coding);
69a80ea3 6654 carryover = produce_chars (coding, translation_table, 0);
df7492f9 6655 if (coding->annotated)
69a80ea3
KH
6656 produce_annotation (coding, pos);
6657 for (i = 0; i < carryover; i++)
6658 coding->charbuf[i]
6659 = coding->charbuf[coding->charbuf_used - carryover + i];
d46c5b12 6660 }
df7492f9 6661 while (coding->consumed < coding->src_bytes
54b367bb
KH
6662 && (coding->result == CODING_RESULT_SUCCESS
6663 || coding->result == CODING_RESULT_INVALID_SRC));
d46c5b12 6664
69a80ea3
KH
6665 if (carryover > 0)
6666 {
6667 coding_set_destination (coding);
6668 coding->charbuf_used = carryover;
6669 produce_chars (coding, translation_table, 1);
6670 }
6671
df7492f9
KH
6672 coding->carryover_bytes = 0;
6673 if (coding->consumed < coding->src_bytes)
d46c5b12 6674 {
df7492f9 6675 int nbytes = coding->src_bytes - coding->consumed;
8f924df7 6676 const unsigned char *src;
df7492f9
KH
6677
6678 coding_set_source (coding);
6679 coding_set_destination (coding);
6680 src = coding->source + coding->consumed;
6681
6682 if (coding->mode & CODING_MODE_LAST_BLOCK)
1c3478b0 6683 {
df7492f9
KH
6684 /* Flush out unprocessed data as binary chars. We are sure
6685 that the number of data is less than the size of
6686 coding->charbuf. */
065e3595 6687 coding->charbuf_used = 0;
b2dab6c8
JR
6688 coding->chars_at_source = 0;
6689
df7492f9 6690 while (nbytes-- > 0)
1c3478b0 6691 {
df7492f9 6692 int c = *src++;
98725083 6693
1c91457d
KH
6694 if (c & 0x80)
6695 c = BYTE8_TO_CHAR (c);
6696 coding->charbuf[coding->charbuf_used++] = c;
1c3478b0 6697 }
f6cbaf43 6698 produce_chars (coding, Qnil, 1);
d46c5b12 6699 }
d46c5b12 6700 else
df7492f9
KH
6701 {
6702 /* Record unprocessed bytes in coding->carryover. We are
6703 sure that the number of data is less than the size of
6704 coding->carryover. */
6705 unsigned char *p = coding->carryover;
6706
6707 coding->carryover_bytes = nbytes;
6708 while (nbytes-- > 0)
6709 *p++ = *src++;
1c3478b0 6710 }
df7492f9 6711 coding->consumed = coding->src_bytes;
b73bfc1c 6712 }
69f76525 6713
4347441b
KH
6714 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
6715 decode_eol (coding);
24a73b0a
KH
6716 if (BUFFERP (coding->dst_object))
6717 {
6718 current_buffer->undo_list = undo_list;
6719 record_insert (coding->dst_pos, coding->produced_char);
6720 }
73be902c 6721 return coding->result;
4ed46869
KH
6722}
6723
aaaf0b1e 6724
e1c23804 6725/* Extract an annotation datum from a composition starting at POS and
ff0dacd7
KH
6726 ending before LIMIT of CODING->src_object (buffer or string), store
6727 the data in BUF, set *STOP to a starting position of the next
6728 composition (if any) or to LIMIT, and return the address of the
6729 next element of BUF.
6730
6731 If such an annotation is not found, set *STOP to a starting
6732 position of a composition after POS (if any) or to LIMIT, and
6733 return BUF. */
6734
6735static INLINE int *
6736handle_composition_annotation (pos, limit, coding, buf, stop)
6737 EMACS_INT pos, limit;
aaaf0b1e 6738 struct coding_system *coding;
ff0dacd7
KH
6739 int *buf;
6740 EMACS_INT *stop;
aaaf0b1e 6741{
ff0dacd7
KH
6742 EMACS_INT start, end;
6743 Lisp_Object prop;
aaaf0b1e 6744
ff0dacd7
KH
6745 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
6746 || end > limit)
6747 *stop = limit;
6748 else if (start > pos)
6749 *stop = start;
6750 else
aaaf0b1e 6751 {
ff0dacd7 6752 if (start == pos)
aaaf0b1e 6753 {
ff0dacd7
KH
6754 /* We found a composition. Store the corresponding
6755 annotation data in BUF. */
6756 int *head = buf;
6757 enum composition_method method = COMPOSITION_METHOD (prop);
6758 int nchars = COMPOSITION_LENGTH (prop);
6759
69a80ea3 6760 ADD_COMPOSITION_DATA (buf, nchars, method);
ff0dacd7 6761 if (method != COMPOSITION_RELATIVE)
aaaf0b1e 6762 {
ff0dacd7
KH
6763 Lisp_Object components;
6764 int len, i, i_byte;
6765
6766 components = COMPOSITION_COMPONENTS (prop);
6767 if (VECTORP (components))
aaaf0b1e 6768 {
ff0dacd7
KH
6769 len = XVECTOR (components)->size;
6770 for (i = 0; i < len; i++)
6771 *buf++ = XINT (AREF (components, i));
aaaf0b1e 6772 }
ff0dacd7 6773 else if (STRINGP (components))
aaaf0b1e 6774 {
8f924df7 6775 len = SCHARS (components);
ff0dacd7
KH
6776 i = i_byte = 0;
6777 while (i < len)
6778 {
6779 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
6780 buf++;
6781 }
6782 }
6783 else if (INTEGERP (components))
6784 {
6785 len = 1;
6786 *buf++ = XINT (components);
6787 }
6788 else if (CONSP (components))
6789 {
6790 for (len = 0; CONSP (components);
6791 len++, components = XCDR (components))
6792 *buf++ = XINT (XCAR (components));
aaaf0b1e 6793 }
aaaf0b1e 6794 else
ff0dacd7
KH
6795 abort ();
6796 *head -= len;
aaaf0b1e 6797 }
aaaf0b1e 6798 }
ff0dacd7
KH
6799
6800 if (find_composition (end, limit, &start, &end, &prop,
6801 coding->src_object)
6802 && end <= limit)
6803 *stop = start;
6804 else
6805 *stop = limit;
aaaf0b1e 6806 }
ff0dacd7
KH
6807 return buf;
6808}
6809
6810
e1c23804 6811/* Extract an annotation datum from a text property `charset' at POS of
ff0dacd7
KH
6812 CODING->src_object (buffer of string), store the data in BUF, set
6813 *STOP to the position where the value of `charset' property changes
6814 (limiting by LIMIT), and return the address of the next element of
6815 BUF.
6816
6817 If the property value is nil, set *STOP to the position where the
6818 property value is non-nil (limiting by LIMIT), and return BUF. */
6819
6820static INLINE int *
6821handle_charset_annotation (pos, limit, coding, buf, stop)
6822 EMACS_INT pos, limit;
6823 struct coding_system *coding;
6824 int *buf;
6825 EMACS_INT *stop;
6826{
6827 Lisp_Object val, next;
6828 int id;
6829
6830 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
6831 if (! NILP (val) && CHARSETP (val))
6832 id = XINT (CHARSET_SYMBOL_ID (val));
6833 else
6834 id = -1;
69a80ea3 6835 ADD_CHARSET_DATA (buf, 0, id);
ff0dacd7
KH
6836 next = Fnext_single_property_change (make_number (pos), Qcharset,
6837 coding->src_object,
6838 make_number (limit));
6839 *stop = XINT (next);
6840 return buf;
6841}
6842
6843
df7492f9 6844static void
09ee6fdd 6845consume_chars (coding, translation_table, max_lookup)
df7492f9 6846 struct coding_system *coding;
433f7f87 6847 Lisp_Object translation_table;
09ee6fdd 6848 int max_lookup;
df7492f9
KH
6849{
6850 int *buf = coding->charbuf;
ff0dacd7 6851 int *buf_end = coding->charbuf + coding->charbuf_size;
7c78e542 6852 const unsigned char *src = coding->source + coding->consumed;
4776e638 6853 const unsigned char *src_end = coding->source + coding->src_bytes;
ff0dacd7
KH
6854 EMACS_INT pos = coding->src_pos + coding->consumed_char;
6855 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
df7492f9
KH
6856 int multibytep = coding->src_multibyte;
6857 Lisp_Object eol_type;
6858 int c;
ff0dacd7 6859 EMACS_INT stop, stop_composition, stop_charset;
09ee6fdd 6860 int *lookup_buf = NULL;
433f7f87
KH
6861
6862 if (! NILP (translation_table))
09ee6fdd 6863 lookup_buf = alloca (sizeof (int) * max_lookup);
88993dfd 6864
df7492f9
KH
6865 eol_type = CODING_ID_EOL_TYPE (coding->id);
6866 if (VECTORP (eol_type))
6867 eol_type = Qunix;
88993dfd 6868
df7492f9
KH
6869 /* Note: composition handling is not yet implemented. */
6870 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
ec6d2bb8 6871
0b5670c9
KH
6872 if (NILP (coding->src_object))
6873 stop = stop_composition = stop_charset = end_pos;
ff0dacd7 6874 else
0b5670c9
KH
6875 {
6876 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6877 stop = stop_composition = pos;
6878 else
6879 stop = stop_composition = end_pos;
6880 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6881 stop = stop_charset = pos;
6882 else
6883 stop_charset = end_pos;
6884 }
ec6d2bb8 6885
24a73b0a 6886 /* Compensate for CRLF and conversion. */
ff0dacd7 6887 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
df7492f9 6888 while (buf < buf_end)
aaaf0b1e 6889 {
433f7f87
KH
6890 Lisp_Object trans;
6891
df7492f9 6892 if (pos == stop)
ec6d2bb8 6893 {
df7492f9
KH
6894 if (pos == end_pos)
6895 break;
ff0dacd7
KH
6896 if (pos == stop_composition)
6897 buf = handle_composition_annotation (pos, end_pos, coding,
6898 buf, &stop_composition);
6899 if (pos == stop_charset)
6900 buf = handle_charset_annotation (pos, end_pos, coding,
6901 buf, &stop_charset);
6902 stop = (stop_composition < stop_charset
6903 ? stop_composition : stop_charset);
df7492f9
KH
6904 }
6905
6906 if (! multibytep)
4776e638 6907 {
d3e4cb56 6908 EMACS_INT bytes;
aaaf0b1e 6909
ea29edf2
KH
6910 if (coding->encoder == encode_coding_raw_text)
6911 c = *src++, pos++;
6912 else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
db274c7a 6913 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
4776e638 6914 else
f03caae0 6915 c = BYTE8_TO_CHAR (*src), src++, pos++;
4776e638 6916 }
df7492f9 6917 else
db274c7a 6918 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
df7492f9
KH
6919 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6920 c = '\n';
6921 if (! EQ (eol_type, Qunix))
aaaf0b1e 6922 {
df7492f9 6923 if (c == '\n')
aaaf0b1e 6924 {
df7492f9
KH
6925 if (EQ (eol_type, Qdos))
6926 *buf++ = '\r';
6927 else
6928 c = '\r';
aaaf0b1e
KH
6929 }
6930 }
433f7f87 6931
e6a54062 6932 trans = Qnil;
09ee6fdd 6933 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
e6a54062 6934 if (NILP (trans))
433f7f87
KH
6935 *buf++ = c;
6936 else
6937 {
6938 int from_nchars = 1, to_nchars = 1;
6939 int *lookup_buf_end;
6940 const unsigned char *p = src;
6941 int i;
6942
6943 lookup_buf[0] = c;
6944 for (i = 1; i < max_lookup && p < src_end; i++)
6945 lookup_buf[i] = STRING_CHAR_ADVANCE (p);
6946 lookup_buf_end = lookup_buf + i;
6947 trans = get_translation (trans, lookup_buf, lookup_buf_end, 1,
6948 &from_nchars, &to_nchars);
6949 if (EQ (trans, Qt)
6950 || buf + to_nchars > buf_end)
6951 break;
6952 *buf++ = *lookup_buf;
6953 for (i = 1; i < to_nchars; i++)
6954 *buf++ = XINT (AREF (trans, i));
6955 for (i = 1; i < from_nchars; i++, pos++)
6956 src += MULTIBYTE_LENGTH_NO_CHECK (src);
6957 }
aaaf0b1e 6958 }
ec6d2bb8 6959
df7492f9
KH
6960 coding->consumed = src - coding->source;
6961 coding->consumed_char = pos - coding->src_pos;
6962 coding->charbuf_used = buf - coding->charbuf;
6963 coding->chars_at_source = 0;
aaaf0b1e
KH
6964}
6965
4ed46869 6966
df7492f9
KH
6967/* Encode the text at CODING->src_object into CODING->dst_object.
6968 CODING->src_object is a buffer or a string.
6969 CODING->dst_object is a buffer or nil.
6970
6971 If CODING->src_object is a buffer, it must be the current buffer.
6972 In this case, if CODING->src_pos is positive, it is a position of
6973 the source text in the buffer, otherwise. the source text is in the
6974 gap area of the buffer, and coding->src_pos specifies the offset of
6975 the text from GPT (which must be the same as PT). If this is the
6976 same buffer as CODING->dst_object, CODING->src_pos must be
6977 negative and CODING should not have `pre-write-conversion'.
6978
6979 If CODING->src_object is a string, CODING should not have
6980 `pre-write-conversion'.
6981
6982 If CODING->dst_object is a buffer, the encoded data is inserted at
6983 the current point of that buffer.
6984
6985 If CODING->dst_object is nil, the encoded data is placed at the
6986 memory area specified by CODING->destination. */
6987
6988static int
6989encode_coding (coding)
4ed46869 6990 struct coding_system *coding;
4ed46869 6991{
df7492f9 6992 Lisp_Object attrs;
7d64c6ad 6993 Lisp_Object translation_table;
09ee6fdd 6994 int max_lookup;
9861e777 6995
df7492f9 6996 attrs = CODING_ID_ATTRS (coding->id);
ea29edf2
KH
6997 if (coding->encoder == encode_coding_raw_text)
6998 translation_table = Qnil, max_lookup = 0;
6999 else
7000 translation_table = get_translation_table (attrs, 1, &max_lookup);
4ed46869 7001
df7492f9 7002 if (BUFFERP (coding->dst_object))
8844fa83 7003 {
df7492f9
KH
7004 set_buffer_internal (XBUFFER (coding->dst_object));
7005 coding->dst_multibyte
7006 = ! NILP (current_buffer->enable_multibyte_characters);
8844fa83 7007 }
4ed46869 7008
b73bfc1c 7009 coding->consumed = coding->consumed_char = 0;
df7492f9 7010 coding->produced = coding->produced_char = 0;
065e3595 7011 record_conversion_result (coding, CODING_RESULT_SUCCESS);
b73bfc1c 7012 coding->errors = 0;
b73bfc1c 7013
df7492f9 7014 ALLOC_CONVERSION_WORK_AREA (coding);
4ed46869 7015
df7492f9
KH
7016 do {
7017 coding_set_source (coding);
09ee6fdd 7018 consume_chars (coding, translation_table, max_lookup);
df7492f9
KH
7019 coding_set_destination (coding);
7020 (*(coding->encoder)) (coding);
7021 } while (coding->consumed_char < coding->src_chars);
7022
284201e4 7023 if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
df7492f9
KH
7024 insert_from_gap (coding->produced_char, coding->produced);
7025
7026 return (coding->result);
ec6d2bb8
KH
7027}
7028
fb88bf2d 7029
24a73b0a
KH
7030/* Name (or base name) of work buffer for code conversion. */
7031static Lisp_Object Vcode_conversion_workbuf_name;
d46c5b12 7032
24a73b0a
KH
7033/* A working buffer used by the top level conversion. Once it is
7034 created, it is never destroyed. It has the name
7035 Vcode_conversion_workbuf_name. The other working buffers are
7036 destroyed after the use is finished, and their names are modified
7037 versions of Vcode_conversion_workbuf_name. */
7038static Lisp_Object Vcode_conversion_reused_workbuf;
b73bfc1c 7039
24a73b0a
KH
7040/* 1 iff Vcode_conversion_reused_workbuf is already in use. */
7041static int reused_workbuf_in_use;
4ed46869 7042
24a73b0a
KH
7043
7044/* Return a working buffer of code convesion. MULTIBYTE specifies the
7045 multibyteness of returning buffer. */
b73bfc1c 7046
f6cbaf43 7047static Lisp_Object
24a73b0a 7048make_conversion_work_buffer (multibyte)
f6cbaf43 7049 int multibyte;
df7492f9 7050{
24a73b0a
KH
7051 Lisp_Object name, workbuf;
7052 struct buffer *current;
4ed46869 7053
24a73b0a 7054 if (reused_workbuf_in_use++)
065e3595
KH
7055 {
7056 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7057 workbuf = Fget_buffer_create (name);
7058 }
df7492f9 7059 else
065e3595 7060 {
159bd5a2 7061 if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
a993c7a1
KH
7062 Vcode_conversion_reused_workbuf
7063 = Fget_buffer_create (Vcode_conversion_workbuf_name);
7064 workbuf = Vcode_conversion_reused_workbuf;
065e3595 7065 }
24a73b0a
KH
7066 current = current_buffer;
7067 set_buffer_internal (XBUFFER (workbuf));
df36ff1f
CY
7068 /* We can't allow modification hooks to run in the work buffer. For
7069 instance, directory_files_internal assumes that file decoding
7070 doesn't compile new regexps. */
7071 Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
3ed051d4 7072 Ferase_buffer ();
df7492f9 7073 current_buffer->undo_list = Qt;
24a73b0a 7074 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
df7492f9 7075 set_buffer_internal (current);
24a73b0a 7076 return workbuf;
df7492f9 7077}
d46c5b12 7078
24a73b0a 7079
4776e638 7080static Lisp_Object
24a73b0a
KH
7081code_conversion_restore (arg)
7082 Lisp_Object arg;
4776e638 7083{
24a73b0a 7084 Lisp_Object current, workbuf;
948bdcf3 7085 struct gcpro gcpro1;
24a73b0a 7086
948bdcf3 7087 GCPRO1 (arg);
24a73b0a
KH
7088 current = XCAR (arg);
7089 workbuf = XCDR (arg);
7090 if (! NILP (workbuf))
7091 {
7092 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7093 reused_workbuf_in_use = 0;
7094 else if (! NILP (Fbuffer_live_p (workbuf)))
7095 Fkill_buffer (workbuf);
7096 }
7097 set_buffer_internal (XBUFFER (current));
948bdcf3 7098 UNGCPRO;
4776e638
KH
7099 return Qnil;
7100}
b73bfc1c 7101
24a73b0a
KH
7102Lisp_Object
7103code_conversion_save (with_work_buf, multibyte)
4776e638 7104 int with_work_buf, multibyte;
df7492f9 7105{
24a73b0a 7106 Lisp_Object workbuf = Qnil;
b73bfc1c 7107
4776e638 7108 if (with_work_buf)
24a73b0a
KH
7109 workbuf = make_conversion_work_buffer (multibyte);
7110 record_unwind_protect (code_conversion_restore,
7111 Fcons (Fcurrent_buffer (), workbuf));
4776e638 7112 return workbuf;
df7492f9 7113}
d46c5b12 7114
df7492f9
KH
7115int
7116decode_coding_gap (coding, chars, bytes)
7117 struct coding_system *coding;
7118 EMACS_INT chars, bytes;
7119{
7120 int count = specpdl_ptr - specpdl;
5e5c78be 7121 Lisp_Object attrs;
fb88bf2d 7122
24a73b0a 7123 code_conversion_save (0, 0);
ec6d2bb8 7124
24a73b0a 7125 coding->src_object = Fcurrent_buffer ();
df7492f9
KH
7126 coding->src_chars = chars;
7127 coding->src_bytes = bytes;
7128 coding->src_pos = -chars;
7129 coding->src_pos_byte = -bytes;
7130 coding->src_multibyte = chars < bytes;
24a73b0a 7131 coding->dst_object = coding->src_object;
df7492f9
KH
7132 coding->dst_pos = PT;
7133 coding->dst_pos_byte = PT_BYTE;
71c81426 7134 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
4ed46869 7135
df7492f9
KH
7136 if (CODING_REQUIRE_DETECTION (coding))
7137 detect_coding (coding);
8f924df7 7138
9286b333 7139 coding->mode |= CODING_MODE_LAST_BLOCK;
287c57d7 7140 current_buffer->text->inhibit_shrinking = 1;
df7492f9 7141 decode_coding (coding);
287c57d7 7142 current_buffer->text->inhibit_shrinking = 0;
d46c5b12 7143
5e5c78be
KH
7144 attrs = CODING_ID_ATTRS (coding->id);
7145 if (! NILP (CODING_ATTR_POST_READ (attrs)))
b73bfc1c 7146 {
5e5c78be
KH
7147 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7148 Lisp_Object val;
7149
7150 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
5e5c78be
KH
7151 val = call1 (CODING_ATTR_POST_READ (attrs),
7152 make_number (coding->produced_char));
5e5c78be
KH
7153 CHECK_NATNUM (val);
7154 coding->produced_char += Z - prev_Z;
7155 coding->produced += Z_BYTE - prev_Z_BYTE;
b73bfc1c 7156 }
4ed46869 7157
df7492f9 7158 unbind_to (count, Qnil);
b73bfc1c
KH
7159 return coding->result;
7160}
52d41803 7161
4ed46869 7162int
df7492f9 7163encode_coding_gap (coding, chars, bytes)
4ed46869 7164 struct coding_system *coding;
df7492f9 7165 EMACS_INT chars, bytes;
4ed46869 7166{
df7492f9 7167 int count = specpdl_ptr - specpdl;
4ed46869 7168
24a73b0a 7169 code_conversion_save (0, 0);
4ed46869 7170
24a73b0a 7171 coding->src_object = Fcurrent_buffer ();
df7492f9
KH
7172 coding->src_chars = chars;
7173 coding->src_bytes = bytes;
7174 coding->src_pos = -chars;
7175 coding->src_pos_byte = -bytes;
7176 coding->src_multibyte = chars < bytes;
7177 coding->dst_object = coding->src_object;
7178 coding->dst_pos = PT;
7179 coding->dst_pos_byte = PT_BYTE;
4ed46869 7180
df7492f9 7181 encode_coding (coding);
b73bfc1c 7182
df7492f9
KH
7183 unbind_to (count, Qnil);
7184 return coding->result;
7185}
4ed46869 7186
d46c5b12 7187
df7492f9
KH
7188/* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7189 SRC_OBJECT into DST_OBJECT by coding context CODING.
b73bfc1c 7190
df7492f9 7191 SRC_OBJECT is a buffer, a string, or Qnil.
b73bfc1c 7192
df7492f9
KH
7193 If it is a buffer, the text is at point of the buffer. FROM and TO
7194 are positions in the buffer.
b73bfc1c 7195
df7492f9
KH
7196 If it is a string, the text is at the beginning of the string.
7197 FROM and TO are indices to the string.
4ed46869 7198
df7492f9
KH
7199 If it is nil, the text is at coding->source. FROM and TO are
7200 indices to coding->source.
bb10be8b 7201
df7492f9 7202 DST_OBJECT is a buffer, Qt, or Qnil.
4ed46869 7203
df7492f9
KH
7204 If it is a buffer, the decoded text is inserted at point of the
7205 buffer. If the buffer is the same as SRC_OBJECT, the source text
7206 is deleted.
4ed46869 7207
df7492f9
KH
7208 If it is Qt, a string is made from the decoded text, and
7209 set in CODING->dst_object.
d46c5b12 7210
df7492f9 7211 If it is Qnil, the decoded text is stored at CODING->destination.
2cb26057 7212 The caller must allocate CODING->dst_bytes bytes at
df7492f9
KH
7213 CODING->destination by xmalloc. If the decoded text is longer than
7214 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7215 */
d46c5b12 7216
df7492f9
KH
7217void
7218decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7219 dst_object)
d46c5b12 7220 struct coding_system *coding;
df7492f9
KH
7221 Lisp_Object src_object;
7222 EMACS_INT from, from_byte, to, to_byte;
7223 Lisp_Object dst_object;
d46c5b12 7224{
df7492f9
KH
7225 int count = specpdl_ptr - specpdl;
7226 unsigned char *destination;
7227 EMACS_INT dst_bytes;
7228 EMACS_INT chars = to - from;
7229 EMACS_INT bytes = to_byte - from_byte;
7230 Lisp_Object attrs;
4776e638 7231 int saved_pt = -1, saved_pt_byte;
64cedb0c 7232 int need_marker_adjustment = 0;
b3bfad50 7233 Lisp_Object old_deactivate_mark;
d46c5b12 7234
b3bfad50 7235 old_deactivate_mark = Vdeactivate_mark;
93dec019 7236
df7492f9 7237 if (NILP (dst_object))
d46c5b12 7238 {
df7492f9
KH
7239 destination = coding->destination;
7240 dst_bytes = coding->dst_bytes;
d46c5b12 7241 }
93dec019 7242
df7492f9
KH
7243 coding->src_object = src_object;
7244 coding->src_chars = chars;
7245 coding->src_bytes = bytes;
7246 coding->src_multibyte = chars < bytes;
70ad9fc4 7247
df7492f9 7248 if (STRINGP (src_object))
d46c5b12 7249 {
df7492f9
KH
7250 coding->src_pos = from;
7251 coding->src_pos_byte = from_byte;
d46c5b12 7252 }
df7492f9 7253 else if (BUFFERP (src_object))
88993dfd 7254 {
df7492f9
KH
7255 set_buffer_internal (XBUFFER (src_object));
7256 if (from != GPT)
7257 move_gap_both (from, from_byte);
7258 if (EQ (src_object, dst_object))
fb88bf2d 7259 {
64cedb0c
KH
7260 struct Lisp_Marker *tail;
7261
7262 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7263 {
7264 tail->need_adjustment
7265 = tail->charpos == (tail->insertion_type ? from : to);
7266 need_marker_adjustment |= tail->need_adjustment;
7267 }
4776e638 7268 saved_pt = PT, saved_pt_byte = PT_BYTE;
df7492f9 7269 TEMP_SET_PT_BOTH (from, from_byte);
f4a3cc44 7270 current_buffer->text->inhibit_shrinking = 1;
df7492f9
KH
7271 del_range_both (from, from_byte, to, to_byte, 1);
7272 coding->src_pos = -chars;
7273 coding->src_pos_byte = -bytes;
fb88bf2d 7274 }
df7492f9 7275 else
fb88bf2d 7276 {
df7492f9
KH
7277 coding->src_pos = from;
7278 coding->src_pos_byte = from_byte;
fb88bf2d 7279 }
88993dfd
KH
7280 }
7281
df7492f9
KH
7282 if (CODING_REQUIRE_DETECTION (coding))
7283 detect_coding (coding);
7284 attrs = CODING_ID_ATTRS (coding->id);
d46c5b12 7285
2cb26057
KH
7286 if (EQ (dst_object, Qt)
7287 || (! NILP (CODING_ATTR_POST_READ (attrs))
7288 && NILP (dst_object)))
b73bfc1c 7289 {
a1567c45
SM
7290 coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7291 coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
df7492f9
KH
7292 coding->dst_pos = BEG;
7293 coding->dst_pos_byte = BEG_BYTE;
b73bfc1c 7294 }
df7492f9 7295 else if (BUFFERP (dst_object))
d46c5b12 7296 {
24a73b0a 7297 code_conversion_save (0, 0);
df7492f9
KH
7298 coding->dst_object = dst_object;
7299 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7300 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7301 coding->dst_multibyte
7302 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
d46c5b12
KH
7303 }
7304 else
7305 {
24a73b0a 7306 code_conversion_save (0, 0);
df7492f9 7307 coding->dst_object = Qnil;
0154725e
SM
7308 /* Most callers presume this will return a multibyte result, and they
7309 won't use `binary' or `raw-text' anyway, so let's not worry about
7310 CODING_FOR_UNIBYTE. */
bb555731 7311 coding->dst_multibyte = 1;
d46c5b12
KH
7312 }
7313
df7492f9 7314 decode_coding (coding);
fa46990e 7315
df7492f9
KH
7316 if (BUFFERP (coding->dst_object))
7317 set_buffer_internal (XBUFFER (coding->dst_object));
d46c5b12 7318
df7492f9 7319 if (! NILP (CODING_ATTR_POST_READ (attrs)))
d46c5b12 7320 {
b3bfad50 7321 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
df7492f9 7322 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
2b4f9037 7323 Lisp_Object val;
d46c5b12 7324
c0cc7f7f 7325 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
b3bfad50
KH
7326 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7327 old_deactivate_mark);
d4850d67
KH
7328 val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7329 make_number (coding->produced_char));
df7492f9
KH
7330 UNGCPRO;
7331 CHECK_NATNUM (val);
7332 coding->produced_char += Z - prev_Z;
7333 coding->produced += Z_BYTE - prev_Z_BYTE;
d46c5b12 7334 }
de79a6a5 7335
df7492f9 7336 if (EQ (dst_object, Qt))
ec6d2bb8 7337 {
df7492f9
KH
7338 coding->dst_object = Fbuffer_string ();
7339 }
7340 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7341 {
7342 set_buffer_internal (XBUFFER (coding->dst_object));
7343 if (dst_bytes < coding->produced)
7344 {
b3bfad50 7345 destination = xrealloc (destination, coding->produced);
df7492f9
KH
7346 if (! destination)
7347 {
065e3595
KH
7348 record_conversion_result (coding,
7349 CODING_RESULT_INSUFFICIENT_DST);
df7492f9
KH
7350 unbind_to (count, Qnil);
7351 return;
7352 }
7353 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7354 move_gap_both (BEGV, BEGV_BYTE);
7355 bcopy (BEGV_ADDR, destination, coding->produced);
7356 coding->destination = destination;
d46c5b12 7357 }
ec6d2bb8 7358 }
b73bfc1c 7359
4776e638
KH
7360 if (saved_pt >= 0)
7361 {
7362 /* This is the case of:
7363 (BUFFERP (src_object) && EQ (src_object, dst_object))
7364 As we have moved PT while replacing the original buffer
7365 contents, we must recover it now. */
7366 set_buffer_internal (XBUFFER (src_object));
f4a3cc44 7367 current_buffer->text->inhibit_shrinking = 0;
4776e638
KH
7368 if (saved_pt < from)
7369 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7370 else if (saved_pt < from + chars)
7371 TEMP_SET_PT_BOTH (from, from_byte);
7372 else if (! NILP (current_buffer->enable_multibyte_characters))
7373 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7374 saved_pt_byte + (coding->produced - bytes));
7375 else
7376 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7377 saved_pt_byte + (coding->produced - bytes));
64cedb0c
KH
7378
7379 if (need_marker_adjustment)
7380 {
7381 struct Lisp_Marker *tail;
7382
7383 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7384 if (tail->need_adjustment)
7385 {
7386 tail->need_adjustment = 0;
7387 if (tail->insertion_type)
7388 {
7389 tail->bytepos = from_byte;
7390 tail->charpos = from;
7391 }
7392 else
7393 {
7394 tail->bytepos = from_byte + coding->produced;
7395 tail->charpos
7396 = (NILP (current_buffer->enable_multibyte_characters)
7397 ? tail->bytepos : from + coding->produced_char);
7398 }
7399 }
7400 }
d46c5b12 7401 }
4776e638 7402
b3bfad50 7403 Vdeactivate_mark = old_deactivate_mark;
065e3595 7404 unbind_to (count, coding->dst_object);
d46c5b12
KH
7405}
7406
d46c5b12 7407
df7492f9
KH
7408void
7409encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7410 dst_object)
d46c5b12 7411 struct coding_system *coding;
df7492f9
KH
7412 Lisp_Object src_object;
7413 EMACS_INT from, from_byte, to, to_byte;
7414 Lisp_Object dst_object;
d46c5b12 7415{
b73bfc1c 7416 int count = specpdl_ptr - specpdl;
df7492f9
KH
7417 EMACS_INT chars = to - from;
7418 EMACS_INT bytes = to_byte - from_byte;
7419 Lisp_Object attrs;
4776e638 7420 int saved_pt = -1, saved_pt_byte;
64cedb0c 7421 int need_marker_adjustment = 0;
c02d943b 7422 int kill_src_buffer = 0;
b3bfad50 7423 Lisp_Object old_deactivate_mark;
df7492f9 7424
b3bfad50 7425 old_deactivate_mark = Vdeactivate_mark;
df7492f9
KH
7426
7427 coding->src_object = src_object;
7428 coding->src_chars = chars;
7429 coding->src_bytes = bytes;
7430 coding->src_multibyte = chars < bytes;
7431
7432 attrs = CODING_ID_ATTRS (coding->id);
7433
64cedb0c
KH
7434 if (EQ (src_object, dst_object))
7435 {
7436 struct Lisp_Marker *tail;
7437
7438 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7439 {
7440 tail->need_adjustment
7441 = tail->charpos == (tail->insertion_type ? from : to);
7442 need_marker_adjustment |= tail->need_adjustment;
7443 }
7444 }
7445
df7492f9 7446 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6bac5b12 7447 {
24a73b0a 7448 coding->src_object = code_conversion_save (1, coding->src_multibyte);
df7492f9
KH
7449 set_buffer_internal (XBUFFER (coding->src_object));
7450 if (STRINGP (src_object))
7451 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7452 else if (BUFFERP (src_object))
7453 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7454 else
7455 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
d46c5b12 7456
df7492f9
KH
7457 if (EQ (src_object, dst_object))
7458 {
7459 set_buffer_internal (XBUFFER (src_object));
4776e638 7460 saved_pt = PT, saved_pt_byte = PT_BYTE;
df7492f9
KH
7461 del_range_both (from, from_byte, to, to_byte, 1);
7462 set_buffer_internal (XBUFFER (coding->src_object));
7463 }
7464
d4850d67
KH
7465 {
7466 Lisp_Object args[3];
b3bfad50 7467 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
d4850d67 7468
b3bfad50
KH
7469 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7470 old_deactivate_mark);
d4850d67
KH
7471 args[0] = CODING_ATTR_PRE_WRITE (attrs);
7472 args[1] = make_number (BEG);
7473 args[2] = make_number (Z);
7474 safe_call (3, args);
b3bfad50 7475 UNGCPRO;
d4850d67 7476 }
c02d943b
KH
7477 if (XBUFFER (coding->src_object) != current_buffer)
7478 kill_src_buffer = 1;
ac87bbef 7479 coding->src_object = Fcurrent_buffer ();
df7492f9
KH
7480 if (BEG != GPT)
7481 move_gap_both (BEG, BEG_BYTE);
7482 coding->src_chars = Z - BEG;
7483 coding->src_bytes = Z_BYTE - BEG_BYTE;
7484 coding->src_pos = BEG;
7485 coding->src_pos_byte = BEG_BYTE;
7486 coding->src_multibyte = Z < Z_BYTE;
7487 }
7488 else if (STRINGP (src_object))
d46c5b12 7489 {
24a73b0a 7490 code_conversion_save (0, 0);
df7492f9
KH
7491 coding->src_pos = from;
7492 coding->src_pos_byte = from_byte;
b73bfc1c 7493 }
df7492f9 7494 else if (BUFFERP (src_object))
b73bfc1c 7495 {
24a73b0a 7496 code_conversion_save (0, 0);
df7492f9 7497 set_buffer_internal (XBUFFER (src_object));
df7492f9 7498 if (EQ (src_object, dst_object))
d46c5b12 7499 {
4776e638 7500 saved_pt = PT, saved_pt_byte = PT_BYTE;
ff0dacd7
KH
7501 coding->src_object = del_range_1 (from, to, 1, 1);
7502 coding->src_pos = 0;
7503 coding->src_pos_byte = 0;
d46c5b12 7504 }
df7492f9 7505 else
d46c5b12 7506 {
ff0dacd7
KH
7507 if (from < GPT && to >= GPT)
7508 move_gap_both (from, from_byte);
df7492f9
KH
7509 coding->src_pos = from;
7510 coding->src_pos_byte = from_byte;
d46c5b12 7511 }
d46c5b12 7512 }
4776e638 7513 else
24a73b0a 7514 code_conversion_save (0, 0);
d46c5b12 7515
df7492f9 7516 if (BUFFERP (dst_object))
88993dfd 7517 {
df7492f9 7518 coding->dst_object = dst_object;
28f67a95
KH
7519 if (EQ (src_object, dst_object))
7520 {
7521 coding->dst_pos = from;
7522 coding->dst_pos_byte = from_byte;
7523 }
7524 else
7525 {
319a3947
KH
7526 struct buffer *current = current_buffer;
7527
7528 set_buffer_temp (XBUFFER (dst_object));
7529 coding->dst_pos = PT;
7530 coding->dst_pos_byte = PT_BYTE;
7531 move_gap_both (coding->dst_pos, coding->dst_pos_byte);
7532 set_buffer_temp (current);
28f67a95 7533 }
df7492f9
KH
7534 coding->dst_multibyte
7535 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
88993dfd 7536 }
df7492f9 7537 else if (EQ (dst_object, Qt))
d46c5b12 7538 {
df7492f9 7539 coding->dst_object = Qnil;
df7492f9 7540 coding->dst_bytes = coding->src_chars;
ac87bbef
KH
7541 if (coding->dst_bytes == 0)
7542 coding->dst_bytes = 1;
7543 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
df7492f9 7544 coding->dst_multibyte = 0;
d46c5b12
KH
7545 }
7546 else
7547 {
df7492f9
KH
7548 coding->dst_object = Qnil;
7549 coding->dst_multibyte = 0;
d46c5b12
KH
7550 }
7551
df7492f9 7552 encode_coding (coding);
d46c5b12 7553
df7492f9 7554 if (EQ (dst_object, Qt))
d46c5b12 7555 {
df7492f9
KH
7556 if (BUFFERP (coding->dst_object))
7557 coding->dst_object = Fbuffer_string ();
7558 else
d46c5b12 7559 {
df7492f9
KH
7560 coding->dst_object
7561 = make_unibyte_string ((char *) coding->destination,
7562 coding->produced);
7563 xfree (coding->destination);
d46c5b12 7564 }
4ed46869 7565 }
d46c5b12 7566
4776e638
KH
7567 if (saved_pt >= 0)
7568 {
7569 /* This is the case of:
7570 (BUFFERP (src_object) && EQ (src_object, dst_object))
7571 As we have moved PT while replacing the original buffer
7572 contents, we must recover it now. */
7573 set_buffer_internal (XBUFFER (src_object));
7574 if (saved_pt < from)
7575 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7576 else if (saved_pt < from + chars)
7577 TEMP_SET_PT_BOTH (from, from_byte);
7578 else if (! NILP (current_buffer->enable_multibyte_characters))
7579 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7580 saved_pt_byte + (coding->produced - bytes));
d46c5b12 7581 else
4776e638
KH
7582 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7583 saved_pt_byte + (coding->produced - bytes));
64cedb0c
KH
7584
7585 if (need_marker_adjustment)
7586 {
7587 struct Lisp_Marker *tail;
7588
7589 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7590 if (tail->need_adjustment)
7591 {
7592 tail->need_adjustment = 0;
7593 if (tail->insertion_type)
7594 {
7595 tail->bytepos = from_byte;
7596 tail->charpos = from;
7597 }
7598 else
7599 {
7600 tail->bytepos = from_byte + coding->produced;
7601 tail->charpos
7602 = (NILP (current_buffer->enable_multibyte_characters)
7603 ? tail->bytepos : from + coding->produced_char);
7604 }
7605 }
7606 }
4776e638
KH
7607 }
7608
c02d943b
KH
7609 if (kill_src_buffer)
7610 Fkill_buffer (coding->src_object);
b3bfad50
KH
7611
7612 Vdeactivate_mark = old_deactivate_mark;
df7492f9 7613 unbind_to (count, Qnil);
b73bfc1c
KH
7614}
7615
df7492f9 7616
b73bfc1c 7617Lisp_Object
df7492f9 7618preferred_coding_system ()
b73bfc1c 7619{
df7492f9 7620 int id = coding_categories[coding_priorities[0]].id;
2391eaa4 7621
df7492f9 7622 return CODING_ID_NAME (id);
4ed46869
KH
7623}
7624
7625\f
7626#ifdef emacs
1397dc18 7627/*** 8. Emacs Lisp library functions ***/
4ed46869 7628
4ed46869 7629DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
48b0f3ae 7630 doc: /* Return t if OBJECT is nil or a coding-system.
df7492f9 7631See the documentation of `define-coding-system' for information
48b0f3ae 7632about coding-system objects. */)
d4a1d553
JB
7633 (object)
7634 Lisp_Object object;
4ed46869 7635{
d4a1d553
JB
7636 if (NILP (object)
7637 || CODING_SYSTEM_ID (object) >= 0)
44e8490d 7638 return Qt;
d4a1d553
JB
7639 if (! SYMBOLP (object)
7640 || NILP (Fget (object, Qcoding_system_define_form)))
44e8490d
KH
7641 return Qnil;
7642 return Qt;
4ed46869
KH
7643}
7644
9d991de8
RS
7645DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
7646 Sread_non_nil_coding_system, 1, 1, 0,
48b0f3ae
PJ
7647 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
7648 (prompt)
4ed46869
KH
7649 Lisp_Object prompt;
7650{
e0e989f6 7651 Lisp_Object val;
9d991de8
RS
7652 do
7653 {
4608c386
KH
7654 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7655 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
9d991de8 7656 }
8f924df7 7657 while (SCHARS (val) == 0);
e0e989f6 7658 return (Fintern (val, Qnil));
4ed46869
KH
7659}
7660
9b787f3e 7661DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
48b0f3ae 7662 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
c7183fb8
GM
7663If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
7664Ignores case when completing coding systems (all Emacs coding systems
7665are lower-case). */)
48b0f3ae 7666 (prompt, default_coding_system)
9b787f3e 7667 Lisp_Object prompt, default_coding_system;
4ed46869 7668{
f44d27ce 7669 Lisp_Object val;
c7183fb8
GM
7670 int count = SPECPDL_INDEX ();
7671
9b787f3e 7672 if (SYMBOLP (default_coding_system))
57d25e6f 7673 default_coding_system = SYMBOL_NAME (default_coding_system);
c7183fb8 7674 specbind (Qcompletion_ignore_case, Qt);
4608c386 7675 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
9b787f3e
RS
7676 Qt, Qnil, Qcoding_system_history,
7677 default_coding_system, Qnil);
c7183fb8 7678 unbind_to (count, Qnil);
8f924df7 7679 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
7680}
7681
7682DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
7683 1, 1, 0,
48b0f3ae 7684 doc: /* Check validity of CODING-SYSTEM.
9ffd559c
KH
7685If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
7686It is valid if it is nil or a symbol defined as a coding system by the
7687function `define-coding-system'. */)
df7492f9 7688 (coding_system)
4ed46869
KH
7689 Lisp_Object coding_system;
7690{
44e8490d
KH
7691 Lisp_Object define_form;
7692
7693 define_form = Fget (coding_system, Qcoding_system_define_form);
7694 if (! NILP (define_form))
7695 {
7696 Fput (coding_system, Qcoding_system_define_form, Qnil);
7697 safe_eval (define_form);
7698 }
4ed46869
KH
7699 if (!NILP (Fcoding_system_p (coding_system)))
7700 return coding_system;
fcad4ec4 7701 xsignal1 (Qcoding_system_error, coding_system);
4ed46869 7702}
df7492f9 7703
3a73fa5d 7704\f
89528eb3
KH
7705/* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
7706 HIGHEST is nonzero, return the coding system of the highest
7707 priority among the detected coding systems. Otherwise return a
7708 list of detected coding systems sorted by their priorities. If
7709 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
7710 multibyte form but contains only ASCII and eight-bit chars.
7711 Otherwise, the bytes are raw bytes.
7712
7713 CODING-SYSTEM controls the detection as below:
7714
7715 If it is nil, detect both text-format and eol-format. If the
7716 text-format part of CODING-SYSTEM is already specified
7717 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
7718 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
7719 detect only text-format. */
7720
d46c5b12 7721Lisp_Object
24a73b0a
KH
7722detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
7723 coding_system)
8f924df7 7724 const unsigned char *src;
13818c30
SM
7725 EMACS_INT src_chars, src_bytes;
7726 int highest;
0a28aafb 7727 int multibytep;
df7492f9 7728 Lisp_Object coding_system;
4ed46869 7729{
8f924df7 7730 const unsigned char *src_end = src + src_bytes;
df7492f9 7731 Lisp_Object attrs, eol_type;
4533845d 7732 Lisp_Object val = Qnil;
df7492f9 7733 struct coding_system coding;
89528eb3 7734 int id;
ff0dacd7 7735 struct coding_detection_info detect_info;
24a73b0a 7736 enum coding_category base_category;
2f3cbb32 7737 int null_byte_found = 0, eight_bit_found = 0;
b73bfc1c 7738
/* A nil CODING_SYSTEM is treated as `undecided'.  From here on
   CODING_SYSTEM holds the base name of the coding system.  */
df7492f9
KH
7739 if (NILP (coding_system))
7740 coding_system = Qundecided;
7741 setup_coding_system (coding_system, &coding);
7742 attrs = CODING_ID_ATTRS (coding.id);
7743 eol_type = CODING_ID_EOL_TYPE (coding.id);
89528eb3 7744 coding_system = CODING_ATTR_BASE_NAME (attrs);
4ed46869 7745
df7492f9 7746 coding.source = src;
24a73b0a 7747 coding.src_chars = src_chars;
df7492f9
KH
7748 coding.src_bytes = src_bytes;
7749 coding.src_multibyte = multibytep;
7750 coding.consumed = 0;
89528eb3 7751 coding.mode |= CODING_MODE_LAST_BLOCK;
c0e16b14 7752 coding.head_ascii = 0;
d46c5b12 7753
/* No category has been checked, found or rejected yet.  */
ff0dacd7 7754 detect_info.checked = detect_info.found = detect_info.rejected = 0;
d46c5b12 7755
89528eb3 7756 /* At first, detect text-format if necessary. */
24a73b0a
KH
7757 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
7758 if (base_category == coding_category_undecided)
4ed46869 7759 {
ff0dacd7
KH
7760 enum coding_category category;
7761 struct coding_system *this;
7762 int c, i;
88993dfd 7763
24a73b0a 7764 /* Skip all ASCII bytes except for a few ISO2022 controls. */
2f3cbb32 7765 for (; src < src_end; src++)
4ed46869 7766 {
df7492f9 7767 c = *src;
6cb21a4f 7768 if (c & 0x80)
6cb21a4f 7769 {
2f3cbb32 7770 eight_bit_found = 1;
2f3cbb32
KH
7771 if (null_byte_found)
7772 break;
7773 }
c0e16b14 7774 else if (c < 0x20)
2f3cbb32
KH
7775 {
7776 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
7777 && ! inhibit_iso_escape_detection
7778 && ! detect_info.checked)
6cb21a4f 7779 {
2f3cbb32
KH
7780 if (detect_coding_iso_2022 (&coding, &detect_info))
7781 {
7782 /* We have scanned the whole data. */
7783 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
c0e16b14
KH
7784 {
7785 /* We didn't find an 8-bit code. We may
7786 have found a null-byte, but it's very
7787 rare that a binary file conform to
7788 ISO-2022. */
7789 src = src_end;
7790 coding.head_ascii = src - coding.source;
7791 }
7792 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
2f3cbb32
KH
7793 break;
7794 }
7795 }
97b1b294 7796 else if (! c && !inhibit_null_byte_detection)
2f3cbb32
KH
7797 {
7798 null_byte_found = 1;
7799 if (eight_bit_found)
7800 break;
6cb21a4f 7801 }
c006c0c8
KH
7802 if (! eight_bit_found)
7803 coding.head_ascii++;
6cb21a4f 7804 }
c006c0c8 7805 else if (! eight_bit_found)
c0e16b14 7806 coding.head_ascii++;
4ed46869 7807 }
88993dfd 7808
/* Consult the per-category detectors only when the scan above saw a
   null byte or an 8-bit byte, did not count every byte as leading
   ASCII, or the ISO-2022 check already found a category.  */
2f3cbb32
KH
7809 if (null_byte_found || eight_bit_found
7810 || coding.head_ascii < coding.src_bytes
6cb21a4f
KH
7811 || detect_info.found)
7812 {
2f3cbb32 7813 if (coding.head_ascii == coding.src_bytes)
6cb21a4f
KH
7814 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
7815 for (i = 0; i < coding_category_raw_text; i++)
ff0dacd7 7816 {
6cb21a4f 7817 category = coding_priorities[i];
c7266f4a 7818 this = coding_categories + category;
6cb21a4f 7819 if (detect_info.found & (1 << category))
ff0dacd7
KH
7820 break;
7821 }
6cb21a4f 7822 else
2f3cbb32
KH
7823 {
/* Once a null byte has been seen, every category except the UTF-16
   ones is marked as both checked and rejected.  */
7824 if (null_byte_found)
7825 {
7826 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
7827 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
7828 }
/* Walk the categories in priority order, running the detector of
   each one that has not been checked yet.  With HIGHEST the walk
   stops at the first category that is found.  */
7829 for (i = 0; i < coding_category_raw_text; i++)
7830 {
7831 category = coding_priorities[i];
7832 this = coding_categories + category;
6cb21a4f 7833
2f3cbb32
KH
7834 if (this->id < 0)
7835 {
7836 /* No coding system of this category is defined. */
7837 detect_info.rejected |= (1 << category);
7838 }
7839 else if (category >= coding_category_raw_text)
7840 continue;
7841 else if (detect_info.checked & (1 << category))
7842 {
7843 if (highest
7844 && (detect_info.found & (1 << category)))
6cb21a4f 7845 break;
2f3cbb32
KH
7846 }
7847 else if ((*(this->detector)) (&coding, &detect_info)
7848 && highest
7849 && (detect_info.found & (1 << category)))
7850 {
7851 if (category == coding_category_utf_16_auto)
7852 {
7853 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7854 category = coding_category_utf_16_le;
7855 else
7856 category = coding_category_utf_16_be;
7857 }
7858 break;
7859 }
7860 }
7861 }
6cb21a4f 7862 }
ec6d2bb8 7863
/* Turn the detection result into VAL, a list of coding-system ids.  */
2f3cbb32 7864 if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY)
ec6d2bb8 7865 {
/* Every category was rejected: answer with the raw-text category.  */
ff0dacd7 7866 detect_info.found = CATEGORY_MASK_RAW_TEXT;
89528eb3
KH
7867 id = coding_categories[coding_category_raw_text].id;
7868 val = Fcons (make_number (id), Qnil);
7869 }
ff0dacd7 7870 else if (! detect_info.rejected && ! detect_info.found)
89528eb3 7871 {
/* Nothing was rejected and nothing was found: answer with the
   undecided category.  */
ff0dacd7 7872 detect_info.found = CATEGORY_MASK_ANY;
89528eb3
KH
7873 id = coding_categories[coding_category_undecided].id;
7874 val = Fcons (make_number (id), Qnil);
7875 }
7876 else if (highest)
7877 {
/* A single answer is wanted: the category the loops above stopped
   at if something was found, otherwise the first category in
   priority order that has not been rejected.  */
ff0dacd7 7878 if (detect_info.found)
ec6d2bb8 7879 {
ff0dacd7
KH
7880 detect_info.found = 1 << category;
7881 val = Fcons (make_number (this->id), Qnil);
7882 }
7883 else
7884 for (i = 0; i < coding_category_raw_text; i++)
7885 if (! (detect_info.rejected & (1 << coding_priorities[i])))
7886 {
7887 detect_info.found = 1 << coding_priorities[i];
7888 id = coding_categories[coding_priorities[i]].id;
7889 val = Fcons (make_number (id), Qnil);
7890 break;
7891 }
7892 }
89528eb3
KH
7893 else
7894 {
/* Build the full list.  Both loops run from the lowest priority to
   the highest and push onto the front of VAL, so each group ends up
   in priority order; the categories that were found are pushed last
   and therefore precede those that were merely not rejected.  */
ff0dacd7
KH
7895 int mask = detect_info.rejected | detect_info.found;
7896 int found = 0;
ec6d2bb8 7897
89528eb3 7898 for (i = coding_category_raw_text - 1; i >= 0; i--)
ff0dacd7
KH
7899 {
7900 category = coding_priorities[i];
7901 if (! (mask & (1 << category)))
ec6d2bb8 7902 {
ff0dacd7
KH
7903 found |= 1 << category;
7904 id = coding_categories[category].id;
c7266f4a
KH
7905 if (id >= 0)
7906 val = Fcons (make_number (id), val);
ff0dacd7
KH
7907 }
7908 }
7909 for (i = coding_category_raw_text - 1; i >= 0; i--)
7910 {
7911 category = coding_priorities[i];
7912 if (detect_info.found & (1 << category))
7913 {
7914 id = coding_categories[category].id;
7915 val = Fcons (make_number (id), val);
ec6d2bb8 7916 }
ec6d2bb8 7917 }
ff0dacd7 7918 detect_info.found |= found;
ec6d2bb8 7919 }
ec6d2bb8 7920 }
a470d443
KH
7921 else if (base_category == coding_category_utf_8_auto)
7922 {
/* Only choose between the utf_8_sig and utf_8_nosig categories.  VAL
   stays nil when detect_coding_utf_8 returns zero.  */
7923 if (detect_coding_utf_8 (&coding, &detect_info))
7924 {
7925 struct coding_system *this;
7926
7927 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
7928 this = coding_categories + coding_category_utf_8_sig;
7929 else
7930 this = coding_categories + coding_category_utf_8_nosig;
7931 val = Fcons (make_number (this->id), Qnil);
7932 }
7933 }
24a73b0a
KH
7934 else if (base_category == coding_category_utf_16_auto)
7935 {
/* Only choose among the four UTF-16 categories (le, be, be_nosig,
   le_nosig).  VAL stays nil when detect_coding_utf_16 returns zero.  */
7936 if (detect_coding_utf_16 (&coding, &detect_info))
7937 {
24a73b0a
KH
7938 struct coding_system *this;
7939
7940 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7941 this = coding_categories + coding_category_utf_16_le;
7942 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
7943 this = coding_categories + coding_category_utf_16_be;
7944 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
7945 this = coding_categories + coding_category_utf_16_be_nosig;
7946 else
7947 this = coding_categories + coding_category_utf_16_le_nosig;
7948 val = Fcons (make_number (this->id), Qnil);
7949 }
7950 }
df7492f9
KH
7951 else
7952 {
/* The text-format is already fixed by CODING_SYSTEM; keep it.  */
ff0dacd7 7953 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
89528eb3 7954 val = Fcons (make_number (coding.id), Qnil);
4ed46869 7955 }
df7492f9 7956
89528eb3 7957 /* Then, detect eol-format if necessary. */
df7492f9 7958 {
4533845d 7959 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
df7492f9
KH
7960 Lisp_Object tail;
7961
89528eb3
KH
7962 if (VECTORP (eol_type))
7963 {
ff0dacd7 7964 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
2f3cbb32
KH
7965 {
7966 if (null_byte_found)
7967 normal_eol = EOL_SEEN_LF;
7968 else
7969 normal_eol = detect_eol (coding.source, src_bytes,
7970 coding_category_raw_text);
7971 }
ff0dacd7
KH
7972 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
7973 | CATEGORY_MASK_UTF_16_BE_NOSIG))
89528eb3
KH
7974 utf_16_be_eol = detect_eol (coding.source, src_bytes,
7975 coding_category_utf_16_be);
ff0dacd7
KH
7976 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
7977 | CATEGORY_MASK_UTF_16_LE_NOSIG))
89528eb3
KH
7978 utf_16_le_eol = detect_eol (coding.source, src_bytes,
7979 coding_category_utf_16_le);
7980 }
7981 else
7982 {
7983 if (EQ (eol_type, Qunix))
7984 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
7985 else if (EQ (eol_type, Qdos))
7986 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
7987 else
7988 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
7989 }
7990
/* Replace each coding-system id in VAL: by the element of its
   eol-type vector selected by the eol format seen for its category,
   or by the coding system's own name when the eol-type is not a
   vector or no eol format was seen.  */
df7492f9
KH
7991 for (tail = val; CONSP (tail); tail = XCDR (tail))
7992 {
89528eb3 7993 enum coding_category category;
df7492f9 7994 int this_eol;
89528eb3
KH
7995
7996 id = XINT (XCAR (tail));
7997 attrs = CODING_ID_ATTRS (id);
7998 category = XINT (CODING_ATTR_CATEGORY (attrs));
7999 eol_type = CODING_ID_EOL_TYPE (id);
df7492f9
KH
8000 if (VECTORP (eol_type))
8001 {
89528eb3
KH
8002 if (category == coding_category_utf_16_be
8003 || category == coding_category_utf_16_be_nosig)
8004 this_eol = utf_16_be_eol;
8005 else if (category == coding_category_utf_16_le
8006 || category == coding_category_utf_16_le_nosig)
8007 this_eol = utf_16_le_eol;
df7492f9 8008 else
89528eb3
KH
8009 this_eol = normal_eol;
8010
df7492f9
KH
8011 if (this_eol == EOL_SEEN_LF)
8012 XSETCAR (tail, AREF (eol_type, 0));
8013 else if (this_eol == EOL_SEEN_CRLF)
8014 XSETCAR (tail, AREF (eol_type, 1));
8015 else if (this_eol == EOL_SEEN_CR)
8016 XSETCAR (tail, AREF (eol_type, 2));
89528eb3
KH
8017 else
8018 XSETCAR (tail, CODING_ID_NAME (id));
df7492f9 8019 }
89528eb3
KH
8020 else
8021 XSETCAR (tail, CODING_ID_NAME (id));
df7492f9
KH
8022 }
8023 }
ec6d2bb8 8024
/* With HIGHEST, return only the first element of VAL, or nil when
   VAL is empty.  */
4533845d 8025 return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
ec6d2bb8
KH
8026}
8027
ec6d2bb8 8028
d46c5b12
KH
8029DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8030 2, 3, 0,
48b0f3ae
PJ
8031 doc: /* Detect coding system of the text in the region between START and END.
8032Return a list of possible coding systems ordered by priority.
ec6d2bb8 8033
12e0131a 8034If only ASCII characters are found (except for such ISO-2022 control
d4a1d553
JB
8035characters as ESC), it returns a list of single element `undecided'
8036or its subsidiary coding system according to a detected end-of-line
8037format.
ec6d2bb8 8038
48b0f3ae
PJ
8039If optional argument HIGHEST is non-nil, return the coding system of
8040highest priority. */)
8041 (start, end, highest)
d46c5b12
KH
8042 Lisp_Object start, end, highest;
8043{
8044 int from, to;
8045 int from_byte, to_byte;
ec6d2bb8 8046
b7826503
PJ
8047 CHECK_NUMBER_COERCE_MARKER (start);
8048 CHECK_NUMBER_COERCE_MARKER (end);
ec6d2bb8 8049
d46c5b12
KH
8050 validate_region (&start, &end);
8051 from = XINT (start), to = XINT (end);
8052 from_byte = CHAR_TO_BYTE (from);
8053 to_byte = CHAR_TO_BYTE (to);
ec6d2bb8 8054
d46c5b12
KH
8055 if (from < GPT && to >= GPT)
8056 move_gap_both (to, to_byte);
c210f766 8057
d46c5b12 8058 return detect_coding_system (BYTE_POS_ADDR (from_byte),
24a73b0a 8059 to - from, to_byte - from_byte,
0a28aafb
KH
8060 !NILP (highest),
8061 !NILP (current_buffer
df7492f9
KH
8062 ->enable_multibyte_characters),
8063 Qnil);
ec6d2bb8
KH
8064}
8065
d46c5b12
KH
8066DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8067 1, 2, 0,
48b0f3ae
PJ
8068 doc: /* Detect coding system of the text in STRING.
8069Return a list of possible coding systems ordered by priority.
fb88bf2d 8070
12e0131a 8071If only ASCII characters are found (except for such ISO-2022 control
d4a1d553
JB
8072characters as ESC), it returns a list of single element `undecided'
8073or its subsidiary coding system according to a detected end-of-line
8074format.
d46c5b12 8075
48b0f3ae
PJ
8076If optional argument HIGHEST is non-nil, return the coding system of
8077highest priority. */)
8078 (string, highest)
d46c5b12
KH
8079 Lisp_Object string, highest;
8080{
b7826503 8081 CHECK_STRING (string);
b73bfc1c 8082
24a73b0a
KH
8083 return detect_coding_system (SDATA (string),
8084 SCHARS (string), SBYTES (string),
8f924df7 8085 !NILP (highest), STRING_MULTIBYTE (string),
df7492f9 8086 Qnil);
4ed46869 8087}
4ed46869 8088
b73bfc1c 8089
df7492f9
KH
8090static INLINE int
8091char_encodable_p (c, attrs)
8092 int c;
8093 Lisp_Object attrs;
05e6f5dc 8094{
df7492f9 8095 Lisp_Object tail;
df7492f9 8096 struct charset *charset;
7d64c6ad 8097 Lisp_Object translation_table;
d46c5b12 8098
7d64c6ad 8099 translation_table = CODING_ATTR_TRANS_TBL (attrs);
a6f87d34 8100 if (! NILP (translation_table))
7d64c6ad 8101 c = translate_char (translation_table, c);
df7492f9
KH
8102 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8103 CONSP (tail); tail = XCDR (tail))
e133c8fa 8104 {
df7492f9
KH
8105 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8106 if (CHAR_CHARSET_P (c, charset))
8107 break;
e133c8fa 8108 }
df7492f9 8109 return (! NILP (tail));
05e6f5dc 8110}
83fa074f 8111
fb88bf2d 8112
df7492f9
KH
8113/* Return a list of coding systems that safely encode the text between
8114 START and END. If EXCLUDE is non-nil, it is a list of coding
8115 systems not to check. The returned list doesn't contain any such
48468dac 8116 coding systems. In any case, if the text contains only ASCII or is
df7492f9 8117 unibyte, return t. */
e077cc80 8118
df7492f9
KH
8119DEFUN ("find-coding-systems-region-internal",
8120 Ffind_coding_systems_region_internal,
8121 Sfind_coding_systems_region_internal, 2, 3, 0,
8122 doc: /* Internal use only. */)
8123 (start, end, exclude)
8124 Lisp_Object start, end, exclude;
8125{
8126 Lisp_Object coding_attrs_list, safe_codings;
8127 EMACS_INT start_byte, end_byte;
7c78e542 8128 const unsigned char *p, *pbeg, *pend;
df7492f9
KH
8129 int c;
8130 Lisp_Object tail, elt;
d46c5b12 8131
df7492f9
KH
8132 if (STRINGP (start))
8133 {
8134 if (!STRING_MULTIBYTE (start)
8f924df7 8135 || SCHARS (start) == SBYTES (start))
df7492f9
KH
8136 return Qt;
8137 start_byte = 0;
8f924df7 8138 end_byte = SBYTES (start);
df7492f9
KH
8139 }
8140 else
d46c5b12 8141 {
df7492f9
KH
8142 CHECK_NUMBER_COERCE_MARKER (start);
8143 CHECK_NUMBER_COERCE_MARKER (end);
8144 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8145 args_out_of_range (start, end);
8146 if (NILP (current_buffer->enable_multibyte_characters))
8147 return Qt;
8148 start_byte = CHAR_TO_BYTE (XINT (start));
8149 end_byte = CHAR_TO_BYTE (XINT (end));
8150 if (XINT (end) - XINT (start) == end_byte - start_byte)
8151 return Qt;
d46c5b12 8152
e1c23804 8153 if (XINT (start) < GPT && XINT (end) > GPT)
d46c5b12 8154 {
e1c23804
DL
8155 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8156 move_gap_both (XINT (start), start_byte);
df7492f9 8157 else
e1c23804 8158 move_gap_both (XINT (end), end_byte);
d46c5b12
KH
8159 }
8160 }
8161
df7492f9
KH
8162 coding_attrs_list = Qnil;
8163 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8164 if (NILP (exclude)
8165 || NILP (Fmemq (XCAR (tail), exclude)))
8166 {
8167 Lisp_Object attrs;
d46c5b12 8168
df7492f9
KH
8169 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8170 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8171 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7d64c6ad
KH
8172 {
8173 ASET (attrs, coding_attr_trans_tbl,
2170c8f0 8174 get_translation_table (attrs, 1, NULL));
7d64c6ad
KH
8175 coding_attrs_list = Fcons (attrs, coding_attrs_list);
8176 }
df7492f9 8177 }
d46c5b12 8178
df7492f9 8179 if (STRINGP (start))
8f924df7 8180 p = pbeg = SDATA (start);
df7492f9
KH
8181 else
8182 p = pbeg = BYTE_POS_ADDR (start_byte);
8183 pend = p + (end_byte - start_byte);
b843d1ae 8184
df7492f9
KH
8185 while (p < pend && ASCII_BYTE_P (*p)) p++;
8186 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
d46c5b12 8187
05e6f5dc 8188 while (p < pend)
72d1a715 8189 {
df7492f9
KH
8190 if (ASCII_BYTE_P (*p))
8191 p++;
72d1a715
RS
8192 else
8193 {
df7492f9 8194 c = STRING_CHAR_ADVANCE (p);
12410ef1 8195
df7492f9
KH
8196 charset_map_loaded = 0;
8197 for (tail = coding_attrs_list; CONSP (tail);)
8198 {
8199 elt = XCAR (tail);
8200 if (NILP (elt))
8201 tail = XCDR (tail);
8202 else if (char_encodable_p (c, elt))
8203 tail = XCDR (tail);
8204 else if (CONSP (XCDR (tail)))
8205 {
8206 XSETCAR (tail, XCAR (XCDR (tail)));
8207 XSETCDR (tail, XCDR (XCDR (tail)));
8208 }
8209 else
8210 {
8211 XSETCAR (tail, Qnil);
8212 tail = XCDR (tail);
8213 }
8214 }
8215 if (charset_map_loaded)
8216 {
8217 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
05e6f5dc 8218
df7492f9 8219 if (STRINGP (start))
8f924df7 8220 pbeg = SDATA (start);
df7492f9
KH
8221 else
8222 pbeg = BYTE_POS_ADDR (start_byte);
8223 p = pbeg + p_offset;
8224 pend = pbeg + pend_offset;
8225 }
8226 }
ec6d2bb8 8227 }
fb88bf2d 8228
988b3759 8229 safe_codings = list2 (Qraw_text, Qno_conversion);
df7492f9
KH
8230 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8231 if (! NILP (XCAR (tail)))
8232 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
ec6d2bb8 8233
05e6f5dc
KH
8234 return safe_codings;
8235}
4956c225 8236
d46c5b12 8237
8f924df7
KH
8238DEFUN ("unencodable-char-position", Funencodable_char_position,
8239 Sunencodable_char_position, 3, 5, 0,
8240 doc: /*
8241Return position of first un-encodable character in a region.
d4a1d553 8242START and END specify the region and CODING-SYSTEM specifies the
8f924df7 8243encoding to check. Return nil if CODING-SYSTEM does encode the region.
d46c5b12 8244
8f924df7
KH
8245If optional 4th argument COUNT is non-nil, it specifies at most how
8246many un-encodable characters to search. In this case, the value is a
8247list of positions.
d46c5b12 8248
8f924df7
KH
8249If optional 5th argument STRING is non-nil, it is a string to search
8250for un-encodable characters. In that case, START and END are indexes
8251to the string. */)
8252 (start, end, coding_system, count, string)
8253 Lisp_Object start, end, coding_system, count, string;
8254{
8255 int n;
8256 struct coding_system coding;
7d64c6ad 8257 Lisp_Object attrs, charset_list, translation_table;
8f924df7
KH
8258 Lisp_Object positions;
8259 int from, to;
8260 const unsigned char *p, *stop, *pend;
8261 int ascii_compatible;
fb88bf2d 8262
8f924df7
KH
8263 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8264 attrs = CODING_ID_ATTRS (coding.id);
8265 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8266 return Qnil;
8267 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8268 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2170c8f0 8269 translation_table = get_translation_table (attrs, 1, NULL);
fb88bf2d 8270
8f924df7
KH
8271 if (NILP (string))
8272 {
8273 validate_region (&start, &end);
8274 from = XINT (start);
8275 to = XINT (end);
8276 if (NILP (current_buffer->enable_multibyte_characters)
8277 || (ascii_compatible
8278 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8279 return Qnil;
8280 p = CHAR_POS_ADDR (from);
8281 pend = CHAR_POS_ADDR (to);
8282 if (from < GPT && to >= GPT)
8283 stop = GPT_ADDR;
8284 else
8285 stop = pend;
8286 }
8287 else
8288 {
8289 CHECK_STRING (string);
8290 CHECK_NATNUM (start);
8291 CHECK_NATNUM (end);
8292 from = XINT (start);
8293 to = XINT (end);
8294 if (from > to
8295 || to > SCHARS (string))
8296 args_out_of_range_3 (string, start, end);
8297 if (! STRING_MULTIBYTE (string))
8298 return Qnil;
8299 p = SDATA (string) + string_char_to_byte (string, from);
8300 stop = pend = SDATA (string) + string_char_to_byte (string, to);
8301 if (ascii_compatible && (to - from) == (pend - p))
8302 return Qnil;
8303 }
f2558efd 8304
8f924df7
KH
8305 if (NILP (count))
8306 n = 1;
8307 else
b73bfc1c 8308 {
8f924df7
KH
8309 CHECK_NATNUM (count);
8310 n = XINT (count);
b73bfc1c
KH
8311 }
8312
8f924df7
KH
8313 positions = Qnil;
8314 while (1)
d46c5b12 8315 {
8f924df7 8316 int c;
ec6d2bb8 8317
8f924df7
KH
8318 if (ascii_compatible)
8319 while (p < stop && ASCII_BYTE_P (*p))
8320 p++, from++;
8321 if (p >= stop)
0e79d667 8322 {
8f924df7
KH
8323 if (p >= pend)
8324 break;
8325 stop = pend;
8326 p = GAP_END_ADDR;
0e79d667 8327 }
ec6d2bb8 8328
8f924df7
KH
8329 c = STRING_CHAR_ADVANCE (p);
8330 if (! (ASCII_CHAR_P (c) && ascii_compatible)
7d64c6ad
KH
8331 && ! char_charset (translate_char (translation_table, c),
8332 charset_list, NULL))
ec6d2bb8 8333 {
8f924df7
KH
8334 positions = Fcons (make_number (from), positions);
8335 n--;
8336 if (n == 0)
8337 break;
ec6d2bb8
KH
8338 }
8339
8f924df7
KH
8340 from++;
8341 }
d46c5b12 8342
8f924df7
KH
8343 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8344}
d46c5b12 8345
d46c5b12 8346
df7492f9
KH
8347DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8348 Scheck_coding_systems_region, 3, 3, 0,
8349 doc: /* Check if the region is encodable by coding systems.
d46c5b12 8350
df7492f9
KH
8351START and END are buffer positions specifying the region.
8352CODING-SYSTEM-LIST is a list of coding systems to check.
d46c5b12 8353
df7492f9 8354The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
d4a1d553 8355CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
df7492f9
KH
8356whole region, POS0, POS1, ... are buffer positions where non-encodable
8357characters are found.
93dec019 8358
df7492f9
KH
8359If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8360value is nil.
93dec019 8361
df7492f9
KH
8362START may be a string. In that case, check if the string is
8363encodable, and the value contains indices to the string instead of
8364buffer positions. END is ignored. */)
8365 (start, end, coding_system_list)
8366 Lisp_Object start, end, coding_system_list;
05e6f5dc 8367{
df7492f9
KH
8368 Lisp_Object list;
8369 EMACS_INT start_byte, end_byte;
8370 int pos;
7c78e542 8371 const unsigned char *p, *pbeg, *pend;
df7492f9 8372 int c;
7d64c6ad 8373 Lisp_Object tail, elt, attrs;
70ad9fc4 8374
05e6f5dc
KH
8375 if (STRINGP (start))
8376 {
df7492f9 8377 if (!STRING_MULTIBYTE (start)
8f924df7 8378 && SCHARS (start) != SBYTES (start))
df7492f9
KH
8379 return Qnil;
8380 start_byte = 0;
8f924df7 8381 end_byte = SBYTES (start);
df7492f9 8382 pos = 0;
d46c5b12 8383 }
05e6f5dc 8384 else
b73bfc1c 8385 {
b7826503
PJ
8386 CHECK_NUMBER_COERCE_MARKER (start);
8387 CHECK_NUMBER_COERCE_MARKER (end);
05e6f5dc
KH
8388 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8389 args_out_of_range (start, end);
8390 if (NILP (current_buffer->enable_multibyte_characters))
df7492f9
KH
8391 return Qnil;
8392 start_byte = CHAR_TO_BYTE (XINT (start));
8393 end_byte = CHAR_TO_BYTE (XINT (end));
8394 if (XINT (end) - XINT (start) == end_byte - start_byte)
05e6f5dc 8395 return Qt;
df7492f9 8396
e1c23804 8397 if (XINT (start) < GPT && XINT (end) > GPT)
b73bfc1c 8398 {
e1c23804
DL
8399 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8400 move_gap_both (XINT (start), start_byte);
df7492f9 8401 else
e1c23804 8402 move_gap_both (XINT (end), end_byte);
b73bfc1c 8403 }
e1c23804 8404 pos = XINT (start);
b73bfc1c 8405 }
7553d0e1 8406
df7492f9
KH
8407 list = Qnil;
8408 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
12410ef1 8409 {
df7492f9 8410 elt = XCAR (tail);
7d64c6ad 8411 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
2170c8f0
KH
8412 ASET (attrs, coding_attr_trans_tbl,
8413 get_translation_table (attrs, 1, NULL));
7d64c6ad 8414 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
12410ef1
KH
8415 }
8416
df7492f9 8417 if (STRINGP (start))
8f924df7 8418 p = pbeg = SDATA (start);
72d1a715 8419 else
df7492f9
KH
8420 p = pbeg = BYTE_POS_ADDR (start_byte);
8421 pend = p + (end_byte - start_byte);
4ed46869 8422
df7492f9
KH
8423 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8424 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
ec6d2bb8 8425
df7492f9 8426 while (p < pend)
d46c5b12 8427 {
df7492f9
KH
8428 if (ASCII_BYTE_P (*p))
8429 p++;
e133c8fa 8430 else
05e6f5dc 8431 {
df7492f9
KH
8432 c = STRING_CHAR_ADVANCE (p);
8433
8434 charset_map_loaded = 0;
8435 for (tail = list; CONSP (tail); tail = XCDR (tail))
8436 {
8437 elt = XCDR (XCAR (tail));
8438 if (! char_encodable_p (c, XCAR (elt)))
8439 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8440 }
8441 if (charset_map_loaded)
8442 {
8443 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8444
8445 if (STRINGP (start))
8f924df7 8446 pbeg = SDATA (start);
df7492f9
KH
8447 else
8448 pbeg = BYTE_POS_ADDR (start_byte);
8449 p = pbeg + p_offset;
8450 pend = pbeg + pend_offset;
8451 }
05e6f5dc 8452 }
df7492f9 8453 pos++;
d46c5b12 8454 }
4ed46869 8455
df7492f9
KH
8456 tail = list;
8457 list = Qnil;
8458 for (; CONSP (tail); tail = XCDR (tail))
ec6d2bb8 8459 {
df7492f9
KH
8460 elt = XCAR (tail);
8461 if (CONSP (XCDR (XCDR (elt))))
8462 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8463 list);
ec6d2bb8 8464 }
2b4f9037 8465
df7492f9 8466 return list;
d46c5b12
KH
8467}
8468
3fd9494b 8469
b73bfc1c 8470Lisp_Object
df7492f9
KH
8471code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
8472 Lisp_Object start, end, coding_system, dst_object;
8473 int encodep, norecord;
4ed46869 8474{
3a73fa5d 8475 struct coding_system coding;
df7492f9
KH
8476 EMACS_INT from, from_byte, to, to_byte;
8477 Lisp_Object src_object;
4ed46869 8478
b7826503
PJ
8479 CHECK_NUMBER_COERCE_MARKER (start);
8480 CHECK_NUMBER_COERCE_MARKER (end);
df7492f9
KH
8481 if (NILP (coding_system))
8482 coding_system = Qno_conversion;
8483 else
8484 CHECK_CODING_SYSTEM (coding_system);
8485 src_object = Fcurrent_buffer ();
8486 if (NILP (dst_object))
8487 dst_object = src_object;
8488 else if (! EQ (dst_object, Qt))
8489 CHECK_BUFFER (dst_object);
3a73fa5d 8490
d46c5b12
KH
8491 validate_region (&start, &end);
8492 from = XFASTINT (start);
df7492f9 8493 from_byte = CHAR_TO_BYTE (from);
d46c5b12 8494 to = XFASTINT (end);
df7492f9 8495 to_byte = CHAR_TO_BYTE (to);
764ca8da 8496
df7492f9
KH
8497 setup_coding_system (coding_system, &coding);
8498 coding.mode |= CODING_MODE_LAST_BLOCK;
ec6d2bb8 8499
df7492f9
KH
8500 if (encodep)
8501 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8502 dst_object);
8503 else
8504 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8505 dst_object);
8506 if (! norecord)
8507 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
ec6d2bb8 8508
df7492f9
KH
8509 return (BUFFERP (dst_object)
8510 ? make_number (coding.produced_char)
8511 : coding.dst_object);
4031e2bf 8512}
78108bcd 8513
4ed46869 8514
4031e2bf 8515DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
df7492f9 8516 3, 4, "r\nzCoding system: ",
48b0f3ae 8517 doc: /* Decode the current region from the specified coding system.
df7492f9
KH
8518When called from a program, takes four arguments:
8519 START, END, CODING-SYSTEM, and DESTINATION.
8520START and END are buffer positions.
8844fa83 8521
df7492f9 8522Optional 4th arguments DESTINATION specifies where the decoded text goes.
2354b80b 8523If nil, the region between START and END is replaced by the decoded text.
1560f91a
EZ
8524If buffer, the decoded text is inserted in that buffer after point (point
8525does not move).
446dcd75 8526In those cases, the length of the decoded text is returned.
319a3947 8527If DESTINATION is t, the decoded text is returned.
8844fa83 8528
48b0f3ae
PJ
8529This function sets `last-coding-system-used' to the precise coding system
8530used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
319a3947 8531not fully specified.) */)
df7492f9
KH
8532 (start, end, coding_system, destination)
8533 Lisp_Object start, end, coding_system, destination;
4031e2bf 8534{
df7492f9 8535 return code_convert_region (start, end, coding_system, destination, 0, 0);
3a73fa5d 8536}
8844fa83 8537
3a73fa5d 8538DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
df7492f9
KH
8539 3, 4, "r\nzCoding system: ",
8540 doc: /* Encode the current region by specified coding system.
d4a1d553
JB
8541When called from a program, takes four arguments:
8542 START, END, CODING-SYSTEM and DESTINATION.
8543START and END are buffer positions.
d46c5b12 8544
df7492f9
KH
8545Optional 4th arguments DESTINATION specifies where the encoded text goes.
8546If nil, the region between START and END is replace by the encoded text.
1560f91a
EZ
8547If buffer, the encoded text is inserted in that buffer after point (point
8548does not move).
446dcd75 8549In those cases, the length of the encoded text is returned.
319a3947 8550If DESTINATION is t, the encoded text is returned.
2391eaa4 8551
48b0f3ae
PJ
8552This function sets `last-coding-system-used' to the precise coding system
8553used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
319a3947 8554not fully specified.) */)
df7492f9
KH
8555 (start, end, coding_system, destination)
8556 Lisp_Object start, end, coding_system, destination;
3a73fa5d 8557{
df7492f9 8558 return code_convert_region (start, end, coding_system, destination, 1, 0);
b73bfc1c
KH
8559}
8560
8561Lisp_Object
df7492f9
KH
8562code_convert_string (string, coding_system, dst_object,
8563 encodep, nocopy, norecord)
8564 Lisp_Object string, coding_system, dst_object;
8565 int encodep, nocopy, norecord;
b73bfc1c 8566{
4031e2bf 8567 struct coding_system coding;
df7492f9 8568 EMACS_INT chars, bytes;
ec6d2bb8 8569
b7826503 8570 CHECK_STRING (string);
d46c5b12 8571 if (NILP (coding_system))
4956c225 8572 {
df7492f9
KH
8573 if (! norecord)
8574 Vlast_coding_system_used = Qno_conversion;
8575 if (NILP (dst_object))
8576 return (nocopy ? Fcopy_sequence (string) : string);
4956c225 8577 }
b73bfc1c 8578
df7492f9
KH
8579 if (NILP (coding_system))
8580 coding_system = Qno_conversion;
8581 else
8582 CHECK_CODING_SYSTEM (coding_system);
8583 if (NILP (dst_object))
8584 dst_object = Qt;
8585 else if (! EQ (dst_object, Qt))
8586 CHECK_BUFFER (dst_object);
73be902c 8587
df7492f9 8588 setup_coding_system (coding_system, &coding);
d46c5b12 8589 coding.mode |= CODING_MODE_LAST_BLOCK;
8f924df7
KH
8590 chars = SCHARS (string);
8591 bytes = SBYTES (string);
df7492f9
KH
8592 if (encodep)
8593 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8594 else
8595 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8596 if (! norecord)
8597 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
73be902c 8598
df7492f9
KH
8599 return (BUFFERP (dst_object)
8600 ? make_number (coding.produced_char)
8601 : coding.dst_object);
4ed46869 8602}
73be902c 8603
b73bfc1c 8604
ecec61c1 8605/* Encode or decode STRING according to CODING_SYSTEM.
ec6d2bb8 8606 Do not set Vlast_coding_system_used.
4ed46869 8607
ec6d2bb8
KH
8608 This function is called only from macros DECODE_FILE and
8609 ENCODE_FILE, thus we ignore character composition. */
4ed46869 8610
ecec61c1
KH
8611Lisp_Object
8612code_convert_string_norecord (string, coding_system, encodep)
8613 Lisp_Object string, coding_system;
8614 int encodep;
4ed46869 8615{
0be8721c 8616 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
4ed46869
KH
8617}
8618
4ed46869 8619
df7492f9
KH
8620DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
8621 2, 4, 0,
8622 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
8623
8624Optional third arg NOCOPY non-nil means it is OK to return STRING itself
8625if the decoding operation is trivial.
ecec61c1 8626
d4a1d553 8627Optional fourth arg BUFFER non-nil means that the decoded text is
1560f91a
EZ
8628inserted in that buffer after point (point does not move). In this
8629case, the return value is the length of the decoded text.
ecec61c1 8630
df7492f9
KH
8631This function sets `last-coding-system-used' to the precise coding system
8632used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
d4a1d553 8633not fully specified.) */)
df7492f9
KH
8634 (string, coding_system, nocopy, buffer)
8635 Lisp_Object string, coding_system, nocopy, buffer;
4ed46869 8636{
df7492f9
KH
8637 return code_convert_string (string, coding_system, buffer,
8638 0, ! NILP (nocopy), 0);
4ed46869
KH
8639}
8640
df7492f9
KH
8641DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
8642 2, 4, 0,
8643 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
8644
8645Optional third arg NOCOPY non-nil means it is OK to return STRING
8646itself if the encoding operation is trivial.
8647
d4a1d553 8648Optional fourth arg BUFFER non-nil means that the encoded text is
1560f91a
EZ
8649inserted in that buffer after point (point does not move). In this
8650case, the return value is the length of the encoded text.
df7492f9
KH
8651
8652This function sets `last-coding-system-used' to the precise coding system
8653used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8654not fully specified.) */)
8655 (string, coding_system, nocopy, buffer)
8656 Lisp_Object string, coding_system, nocopy, buffer;
4ed46869 8657{
df7492f9 8658 return code_convert_string (string, coding_system, buffer,
c197f191 8659 1, ! NILP (nocopy), 1);
4ed46869 8660}
df7492f9 8661
3a73fa5d 8662\f
4ed46869 8663DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
48b0f3ae
PJ
8664 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
8665Return the corresponding character. */)
8666 (code)
4ed46869 8667 Lisp_Object code;
4ed46869 8668{
df7492f9
KH
8669 Lisp_Object spec, attrs, val;
8670 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
8671 int c;
4ed46869 8672
df7492f9
KH
8673 CHECK_NATNUM (code);
8674 c = XFASTINT (code);
8675 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8676 attrs = AREF (spec, 0);
4ed46869 8677
df7492f9
KH
8678 if (ASCII_BYTE_P (c)
8679 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8680 return code;
4ed46869 8681
df7492f9
KH
8682 val = CODING_ATTR_CHARSET_LIST (attrs);
8683 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
004068e4
KH
8684 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8685 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
fa42c37f 8686
df7492f9
KH
8687 if (c <= 0x7F)
8688 charset = charset_roman;
8689 else if (c >= 0xA0 && c < 0xDF)
55ab7be3 8690 {
df7492f9
KH
8691 charset = charset_kana;
8692 c -= 0x80;
4ed46869 8693 }
55ab7be3 8694 else
4ed46869 8695 {
004068e4 8696 int s1 = c >> 8, s2 = c & 0xFF;
df7492f9
KH
8697
8698 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
8699 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
8700 error ("Invalid code: %d", code);
8701 SJIS_TO_JIS (c);
8702 charset = charset_kanji;
4ed46869 8703 }
df7492f9
KH
8704 c = DECODE_CHAR (charset, c);
8705 if (c < 0)
8706 error ("Invalid code: %d", code);
8707 return make_number (c);
93dec019 8708}
4ed46869 8709
48b0f3ae 8710
4ed46869 8711DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
8acf0c0e 8712 doc: /* Encode a Japanese character CH to shift_jis encoding.
48b0f3ae
PJ
8713Return the corresponding code in SJIS. */)
8714 (ch)
df7492f9 8715 Lisp_Object ch;
4ed46869 8716{
df7492f9
KH
8717 Lisp_Object spec, attrs, charset_list;
8718 int c;
8719 struct charset *charset;
8720 unsigned code;
48b0f3ae 8721
df7492f9
KH
8722 CHECK_CHARACTER (ch);
8723 c = XFASTINT (ch);
8724 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8725 attrs = AREF (spec, 0);
8726
8727 if (ASCII_CHAR_P (c)
8728 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8729 return ch;
8730
8731 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8732 charset = char_charset (c, charset_list, &code);
8733 if (code == CHARSET_INVALID_CODE (charset))
8734 error ("Can't encode by shift_jis encoding: %d", c);
8735 JIS_TO_SJIS (code);
8736
8737 return make_number (code);
4ed46869
KH
8738}
8739
8740DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
48b0f3ae
PJ
8741 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
8742Return the corresponding character. */)
8743 (code)
4ed46869 8744 Lisp_Object code;
d46c5b12 8745{
df7492f9
KH
8746 Lisp_Object spec, attrs, val;
8747 struct charset *charset_roman, *charset_big5, *charset;
8748 int c;
6289dd10 8749
df7492f9
KH
8750 CHECK_NATNUM (code);
8751 c = XFASTINT (code);
8752 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8753 attrs = AREF (spec, 0);
4ed46869 8754
df7492f9
KH
8755 if (ASCII_BYTE_P (c)
8756 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8757 return code;
6289dd10 8758
df7492f9
KH
8759 val = CODING_ATTR_CHARSET_LIST (attrs);
8760 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8761 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
c210f766 8762
df7492f9
KH
8763 if (c <= 0x7F)
8764 charset = charset_roman;
c28a9453
KH
8765 else
8766 {
df7492f9
KH
8767 int b1 = c >> 8, b2 = c & 0x7F;
8768 if (b1 < 0xA1 || b1 > 0xFE
8769 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
8770 error ("Invalid code: %d", code);
8771 charset = charset_big5;
c28a9453 8772 }
df7492f9
KH
8773 c = DECODE_CHAR (charset, (unsigned )c);
8774 if (c < 0)
8775 error ("Invalid code: %d", code);
8776 return make_number (c);
d46c5b12 8777}
6289dd10 8778
4ed46869 8779DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
8acf0c0e 8780 doc: /* Encode the Big5 character CH to BIG5 coding system.
48b0f3ae
PJ
8781Return the corresponding character code in Big5. */)
8782 (ch)
4ed46869
KH
8783 Lisp_Object ch;
8784{
df7492f9
KH
8785 Lisp_Object spec, attrs, charset_list;
8786 struct charset *charset;
8787 int c;
8788 unsigned code;
8789
8790 CHECK_CHARACTER (ch);
8791 c = XFASTINT (ch);
8792 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8793 attrs = AREF (spec, 0);
8794 if (ASCII_CHAR_P (c)
8795 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8796 return ch;
8797
8798 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8799 charset = char_charset (c, charset_list, &code);
8800 if (code == CHARSET_INVALID_CODE (charset))
8801 error ("Can't encode by Big5 encoding: %d", c);
8802
8803 return make_number (code);
4ed46869 8804}
48b0f3ae 8805
3a73fa5d 8806\f
002fdb44 8807DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
68bba4e4 8808 Sset_terminal_coding_system_internal, 1, 2, 0,
48b0f3ae 8809 doc: /* Internal use only. */)
6ed8eeff 8810 (coding_system, terminal)
b74e4686 8811 Lisp_Object coding_system;
6ed8eeff 8812 Lisp_Object terminal;
4ed46869 8813{
6ed8eeff 8814 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
b7826503 8815 CHECK_SYMBOL (coding_system);
b8299c66 8816 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
70c22245 8817 /* We had better not send unsafe characters to terminal. */
c73bd236 8818 terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
df7492f9 8819 /* Characer composition should be disabled. */
c73bd236 8820 terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
b8299c66
KL
8821 terminal_coding->src_multibyte = 1;
8822 terminal_coding->dst_multibyte = 0;
4ed46869
KH
8823 return Qnil;
8824}
8825
c4825358
KH
8826DEFUN ("set-safe-terminal-coding-system-internal",
8827 Fset_safe_terminal_coding_system_internal,
48b0f3ae 8828 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
ddb67bdc 8829 doc: /* Internal use only. */)
48b0f3ae 8830 (coding_system)
b74e4686 8831 Lisp_Object coding_system;
d46c5b12 8832{
b7826503 8833 CHECK_SYMBOL (coding_system);
c4825358
KH
8834 setup_coding_system (Fcheck_coding_system (coding_system),
8835 &safe_terminal_coding);
df7492f9
KH
8836 /* Characer composition should be disabled. */
8837 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
b73bfc1c
KH
8838 safe_terminal_coding.src_multibyte = 1;
8839 safe_terminal_coding.dst_multibyte = 0;
c4825358
KH
8840 return Qnil;
8841}
4ed46869 8842
002fdb44 8843DEFUN ("terminal-coding-system", Fterminal_coding_system,
68bba4e4 8844 Sterminal_coding_system, 0, 1, 0,
6ed8eeff
KL
8845 doc: /* Return coding system specified for terminal output on the given terminal.
8846TERMINAL may be a terminal id, a frame, or nil for the selected
8847frame's terminal device. */)
8848 (terminal)
8849 Lisp_Object terminal;
4ed46869 8850{
985773c9
MB
8851 struct coding_system *terminal_coding
8852 = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
8853 Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
ae6f73fa 8854
ae6f73fa 8855 /* For backward compatibility, return nil if it is `undecided'. */
f75c90a9 8856 return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
4ed46869
KH
8857}
8858
002fdb44 8859DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
68bba4e4 8860 Sset_keyboard_coding_system_internal, 1, 2, 0,
48b0f3ae 8861 doc: /* Internal use only. */)
6ed8eeff 8862 (coding_system, terminal)
4ed46869 8863 Lisp_Object coding_system;
6ed8eeff 8864 Lisp_Object terminal;
4ed46869 8865{
6ed8eeff 8866 struct terminal *t = get_terminal (terminal, 1);
b7826503 8867 CHECK_SYMBOL (coding_system);
df7492f9 8868 setup_coding_system (Fcheck_coding_system (coding_system),
c73bd236 8869 TERMINAL_KEYBOARD_CODING (t));
df7492f9 8870 /* Characer composition should be disabled. */
c73bd236
MB
8871 TERMINAL_KEYBOARD_CODING (t)->common_flags
8872 &= ~CODING_ANNOTATE_COMPOSITION_MASK;
4ed46869
KH
8873 return Qnil;
8874}
8875
8876DEFUN ("keyboard-coding-system",
985773c9 8877 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
48b0f3ae 8878 doc: /* Return coding system specified for decoding keyboard input. */)
985773c9
MB
8879 (terminal)
8880 Lisp_Object terminal;
4ed46869 8881{
985773c9
MB
8882 return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
8883 (get_terminal (terminal, 1))->id);
4ed46869
KH
8884}
8885
4ed46869 8886\f
a5d301df
KH
8887DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
8888 Sfind_operation_coding_system, 1, MANY, 0,
48b0f3ae
PJ
8889 doc: /* Choose a coding system for an operation based on the target name.
8890The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
8891DECODING-SYSTEM is the coding system to use for decoding
8892\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
8893for encoding (in case OPERATION does encoding).
05e6f5dc 8894
48b0f3ae
PJ
8895The first argument OPERATION specifies an I/O primitive:
8896 For file I/O, `insert-file-contents' or `write-region'.
8897 For process I/O, `call-process', `call-process-region', or `start-process'.
8898 For network I/O, `open-network-stream'.
05e6f5dc 8899
48b0f3ae
PJ
8900The remaining arguments should be the same arguments that were passed
8901to the primitive. Depending on which primitive, one of those arguments
8902is selected as the TARGET. For example, if OPERATION does file I/O,
8903whichever argument specifies the file name is TARGET.
05e6f5dc 8904
48b0f3ae 8905TARGET has a meaning which depends on OPERATION:
b883cdb2 8906 For file I/O, TARGET is a file name (except for the special case below).
48b0f3ae 8907 For process I/O, TARGET is a process name.
d4a1d553 8908 For network I/O, TARGET is a service name or a port number.
05e6f5dc 8909
d4a1d553 8910This function looks up what is specified for TARGET in
48b0f3ae
PJ
8911`file-coding-system-alist', `process-coding-system-alist',
8912or `network-coding-system-alist' depending on OPERATION.
8913They may specify a coding system, a cons of coding systems,
8914or a function symbol to call.
8915In the last case, we call the function with one argument,
8916which is a list of all the arguments given to this function.
1011c487
MB
8917If the function can't decide a coding system, it can return
8918`undecided' so that the normal code-detection is performed.
48b0f3ae 8919
b883cdb2
MB
8920If OPERATION is `insert-file-contents', the argument corresponding to
8921TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
8922file name to look up, and BUFFER is a buffer that contains the file's
8923contents (not yet decoded). If `file-coding-system-alist' specifies a
8924function to call for FILENAME, that function should examine the
8925contents of BUFFER instead of reading the file.
8926
d918f936 8927usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
48b0f3ae 8928 (nargs, args)
4ed46869
KH
8929 int nargs;
8930 Lisp_Object *args;
6b89e3aa 8931{
4ed46869
KH
8932 Lisp_Object operation, target_idx, target, val;
8933 register Lisp_Object chain;
177c0ea7 8934
4ed46869
KH
8935 if (nargs < 2)
8936 error ("Too few arguments");
8937 operation = args[0];
8938 if (!SYMBOLP (operation)
8939 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3ed051d4 8940 error ("Invalid first argument");
4ed46869
KH
8941 if (nargs < 1 + XINT (target_idx))
8942 error ("Too few arguments for operation: %s",
8f924df7 8943 SDATA (SYMBOL_NAME (operation)));
4ed46869
KH
8944 target = args[XINT (target_idx) + 1];
8945 if (!(STRINGP (target)
091a0ff0
KH
8946 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
8947 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
4ed46869 8948 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
df7492f9 8949 error ("Invalid %dth argument", XINT (target_idx) + 1);
091a0ff0
KH
8950 if (CONSP (target))
8951 target = XCAR (target);
4ed46869 8952
2e34157c
RS
8953 chain = ((EQ (operation, Qinsert_file_contents)
8954 || EQ (operation, Qwrite_region))
02ba4723 8955 ? Vfile_coding_system_alist
2e34157c 8956 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
8957 ? Vnetwork_coding_system_alist
8958 : Vprocess_coding_system_alist));
4ed46869
KH
8959 if (NILP (chain))
8960 return Qnil;
8961
03699b14 8962 for (; CONSP (chain); chain = XCDR (chain))
6b89e3aa 8963 {
f44d27ce 8964 Lisp_Object elt;
6b89e3aa 8965
df7492f9 8966 elt = XCAR (chain);
4ed46869
KH
8967 if (CONSP (elt)
8968 && ((STRINGP (target)
03699b14
KR
8969 && STRINGP (XCAR (elt))
8970 && fast_string_match (XCAR (elt), target) >= 0)
8971 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
6b89e3aa 8972 {
03699b14 8973 val = XCDR (elt);
b19fd4c5
KH
8974 /* Here, if VAL is both a valid coding system and a valid
8975 function symbol, we return VAL as a coding system. */
02ba4723
KH
8976 if (CONSP (val))
8977 return val;
8978 if (! SYMBOLP (val))
8979 return Qnil;
8980 if (! NILP (Fcoding_system_p (val)))
8981 return Fcons (val, val);
b19fd4c5 8982 if (! NILP (Ffboundp (val)))
6b89e3aa 8983 {
e2b97060
MB
8984 /* We use call1 rather than safe_call1
8985 so as to get bug reports about functions called here
8986 which don't handle the current interface. */
8987 val = call1 (val, Flist (nargs, args));
b19fd4c5
KH
8988 if (CONSP (val))
8989 return val;
8990 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
8991 return Fcons (val, val);
6b89e3aa 8992 }
02ba4723 8993 return Qnil;
6b89e3aa
KH
8994 }
8995 }
4ed46869 8996 return Qnil;
6b89e3aa
KH
8997}
8998
df7492f9 8999DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
a3f6ee6d 9000 Sset_coding_system_priority, 0, MANY, 0,
da7db224 9001 doc: /* Assign higher priority to the coding systems given as arguments.
d4a1d553 9002If multiple coding systems belong to the same category,
a3181084
DL
9003all but the first one are ignored.
9004
d4a1d553 9005usage: (set-coding-system-priority &rest coding-systems) */)
df7492f9
KH
9006 (nargs, args)
9007 int nargs;
9008 Lisp_Object *args;
9009{
9010 int i, j;
9011 int changed[coding_category_max];
9012 enum coding_category priorities[coding_category_max];
9013
9014 bzero (changed, sizeof changed);
6b89e3aa 9015
df7492f9 9016 for (i = j = 0; i < nargs; i++)
6b89e3aa 9017 {
df7492f9
KH
9018 enum coding_category category;
9019 Lisp_Object spec, attrs;
6b89e3aa 9020
df7492f9
KH
9021 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9022 attrs = AREF (spec, 0);
9023 category = XINT (CODING_ATTR_CATEGORY (attrs));
9024 if (changed[category])
9025 /* Ignore this coding system because a coding system of the
9026 same category already had a higher priority. */
9027 continue;
9028 changed[category] = 1;
9029 priorities[j++] = category;
9030 if (coding_categories[category].id >= 0
9031 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9032 setup_coding_system (args[i], &coding_categories[category]);
ff563fce 9033 Fset (AREF (Vcoding_category_table, category), args[i]);
df7492f9 9034 }
6b89e3aa 9035
df7492f9
KH
9036 /* Now we have decided top J priorities. Reflect the order of the
9037 original priorities to the remaining priorities. */
6b89e3aa 9038
df7492f9 9039 for (i = j, j = 0; i < coding_category_max; i++, j++)
6b89e3aa 9040 {
df7492f9
KH
9041 while (j < coding_category_max
9042 && changed[coding_priorities[j]])
9043 j++;
9044 if (j == coding_category_max)
9045 abort ();
9046 priorities[i] = coding_priorities[j];
9047 }
6b89e3aa 9048
df7492f9 9049 bcopy (priorities, coding_priorities, sizeof priorities);
177c0ea7 9050
ff563fce
KH
9051 /* Update `coding-category-list'. */
9052 Vcoding_category_list = Qnil;
9053 for (i = coding_category_max - 1; i >= 0; i--)
9054 Vcoding_category_list
9055 = Fcons (AREF (Vcoding_category_table, priorities[i]),
9056 Vcoding_category_list);
6b89e3aa 9057
df7492f9 9058 return Qnil;
6b89e3aa
KH
9059}
9060
df7492f9
KH
9061DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9062 Scoding_system_priority_list, 0, 1, 0,
da7db224
DL
9063 doc: /* Return a list of coding systems ordered by their priorities.
9064HIGHESTP non-nil means just return the highest priority one. */)
df7492f9
KH
9065 (highestp)
9066 Lisp_Object highestp;
d46c5b12
KH
9067{
9068 int i;
df7492f9 9069 Lisp_Object val;
6b89e3aa 9070
df7492f9 9071 for (i = 0, val = Qnil; i < coding_category_max; i++)
d46c5b12 9072 {
df7492f9
KH
9073 enum coding_category category = coding_priorities[i];
9074 int id = coding_categories[category].id;
9075 Lisp_Object attrs;
068a9dbd 9076
df7492f9
KH
9077 if (id < 0)
9078 continue;
9079 attrs = CODING_ID_ATTRS (id);
9080 if (! NILP (highestp))
9081 return CODING_ATTR_BASE_NAME (attrs);
9082 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9083 }
9084 return Fnreverse (val);
9085}
068a9dbd 9086
f0064e1f 9087static char *suffixes[] = { "-unix", "-dos", "-mac" };
068a9dbd
KH
9088
9089static Lisp_Object
df7492f9
KH
9090make_subsidiaries (base)
9091 Lisp_Object base;
068a9dbd 9092{
df7492f9 9093 Lisp_Object subsidiaries;
8f924df7 9094 int base_name_len = SBYTES (SYMBOL_NAME (base));
df7492f9
KH
9095 char *buf = (char *) alloca (base_name_len + 6);
9096 int i;
068a9dbd 9097
8f924df7 9098 bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
df7492f9
KH
9099 subsidiaries = Fmake_vector (make_number (3), Qnil);
9100 for (i = 0; i < 3; i++)
068a9dbd 9101 {
df7492f9
KH
9102 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
9103 ASET (subsidiaries, i, intern (buf));
068a9dbd 9104 }
df7492f9 9105 return subsidiaries;
068a9dbd
KH
9106}
9107
9108
df7492f9
KH
9109DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9110 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
1fcd6c8b
DL
9111 doc: /* For internal use only.
9112usage: (define-coding-system-internal ...) */)
df7492f9
KH
9113 (nargs, args)
9114 int nargs;
9115 Lisp_Object *args;
068a9dbd 9116{
df7492f9
KH
9117 Lisp_Object name;
9118 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
9119 Lisp_Object attrs; /* Vector of attributes. */
9120 Lisp_Object eol_type;
9121 Lisp_Object aliases;
9122 Lisp_Object coding_type, charset_list, safe_charsets;
9123 enum coding_category category;
9124 Lisp_Object tail, val;
9125 int max_charset_id = 0;
9126 int i;
068a9dbd 9127
df7492f9
KH
9128 if (nargs < coding_arg_max)
9129 goto short_args;
068a9dbd 9130
df7492f9 9131 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
068a9dbd 9132
df7492f9
KH
9133 name = args[coding_arg_name];
9134 CHECK_SYMBOL (name);
9135 CODING_ATTR_BASE_NAME (attrs) = name;
068a9dbd 9136
df7492f9
KH
9137 val = args[coding_arg_mnemonic];
9138 if (! STRINGP (val))
9139 CHECK_CHARACTER (val);
9140 CODING_ATTR_MNEMONIC (attrs) = val;
068a9dbd 9141
df7492f9
KH
9142 coding_type = args[coding_arg_coding_type];
9143 CHECK_SYMBOL (coding_type);
9144 CODING_ATTR_TYPE (attrs) = coding_type;
068a9dbd 9145
df7492f9
KH
9146 charset_list = args[coding_arg_charset_list];
9147 if (SYMBOLP (charset_list))
9148 {
9149 if (EQ (charset_list, Qiso_2022))
9150 {
9151 if (! EQ (coding_type, Qiso_2022))
9152 error ("Invalid charset-list");
9153 charset_list = Viso_2022_charset_list;
9154 }
9155 else if (EQ (charset_list, Qemacs_mule))
9156 {
9157 if (! EQ (coding_type, Qemacs_mule))
9158 error ("Invalid charset-list");
9159 charset_list = Vemacs_mule_charset_list;
9160 }
9161 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9162 if (max_charset_id < XFASTINT (XCAR (tail)))
9163 max_charset_id = XFASTINT (XCAR (tail));
9164 }
068a9dbd
KH
9165 else
9166 {
df7492f9 9167 charset_list = Fcopy_sequence (charset_list);
985773c9 9168 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
068a9dbd 9169 {
df7492f9
KH
9170 struct charset *charset;
9171
985773c9 9172 val = XCAR (tail);
df7492f9
KH
9173 CHECK_CHARSET_GET_CHARSET (val, charset);
9174 if (EQ (coding_type, Qiso_2022)
9175 ? CHARSET_ISO_FINAL (charset) < 0
9176 : EQ (coding_type, Qemacs_mule)
9177 ? CHARSET_EMACS_MULE_ID (charset) < 0
9178 : 0)
9179 error ("Can't handle charset `%s'",
8f924df7 9180 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
df7492f9 9181
8f924df7 9182 XSETCAR (tail, make_number (charset->id));
df7492f9
KH
9183 if (max_charset_id < charset->id)
9184 max_charset_id = charset->id;
068a9dbd
KH
9185 }
9186 }
df7492f9 9187 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
068a9dbd 9188
df7492f9
KH
9189 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
9190 make_number (255));
9191 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8f924df7 9192 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
df7492f9 9193 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
068a9dbd 9194
584948ac 9195 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
3a73fa5d 9196
df7492f9 9197 val = args[coding_arg_decode_translation_table];
a6f87d34 9198 if (! CHAR_TABLE_P (val) && ! CONSP (val))
7d64c6ad 9199 CHECK_SYMBOL (val);
df7492f9 9200 CODING_ATTR_DECODE_TBL (attrs) = val;
3a73fa5d 9201
df7492f9 9202 val = args[coding_arg_encode_translation_table];
a6f87d34 9203 if (! CHAR_TABLE_P (val) && ! CONSP (val))
7d64c6ad 9204 CHECK_SYMBOL (val);
df7492f9 9205 CODING_ATTR_ENCODE_TBL (attrs) = val;
d46c5b12 9206
df7492f9
KH
9207 val = args[coding_arg_post_read_conversion];
9208 CHECK_SYMBOL (val);
9209 CODING_ATTR_POST_READ (attrs) = val;
d46c5b12 9210
df7492f9
KH
9211 val = args[coding_arg_pre_write_conversion];
9212 CHECK_SYMBOL (val);
9213 CODING_ATTR_PRE_WRITE (attrs) = val;
3a73fa5d 9214
df7492f9
KH
9215 val = args[coding_arg_default_char];
9216 if (NILP (val))
9217 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
9218 else
9219 {
8f924df7 9220 CHECK_CHARACTER (val);
df7492f9
KH
9221 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9222 }
4031e2bf 9223
8f924df7
KH
9224 val = args[coding_arg_for_unibyte];
9225 CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
3a73fa5d 9226
df7492f9
KH
9227 val = args[coding_arg_plist];
9228 CHECK_LIST (val);
9229 CODING_ATTR_PLIST (attrs) = val;
3a73fa5d 9230
df7492f9
KH
9231 if (EQ (coding_type, Qcharset))
9232 {
c7c66a95
KH
9233 /* Generate a lisp vector of 256 elements. Each element is nil,
9234 integer, or a list of charset IDs.
3a73fa5d 9235
c7c66a95
KH
9236 If Nth element is nil, the byte code N is invalid in this
9237 coding system.
4ed46869 9238
c7c66a95
KH
9239 If Nth element is a number NUM, N is the first byte of a
9240 charset whose ID is NUM.
4ed46869 9241
c7c66a95
KH
9242 If Nth element is a list of charset IDs, N is the first byte
9243 of one of them. The list is sorted by dimensions of the
2bc515e4 9244 charsets. A charset of smaller dimension comes firtst. */
df7492f9 9245 val = Fmake_vector (make_number (256), Qnil);
4ed46869 9246
5c99c2e6 9247 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
df7492f9 9248 {
c7c66a95
KH
9249 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9250 int dim = CHARSET_DIMENSION (charset);
9251 int idx = (dim - 1) * 4;
4ed46869 9252
5c99c2e6 9253 if (CHARSET_ASCII_COMPATIBLE_P (charset))
584948ac 9254 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
4031e2bf 9255
15d143f7
KH
9256 for (i = charset->code_space[idx];
9257 i <= charset->code_space[idx + 1]; i++)
9258 {
c7c66a95
KH
9259 Lisp_Object tmp, tmp2;
9260 int dim2;
ec6d2bb8 9261
c7c66a95
KH
9262 tmp = AREF (val, i);
9263 if (NILP (tmp))
9264 tmp = XCAR (tail);
9265 else if (NUMBERP (tmp))
9266 {
9267 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9268 if (dim < dim2)
c7c66a95 9269 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
f9d71dcd
KH
9270 else
9271 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
c7c66a95 9272 }
15d143f7 9273 else
c7c66a95
KH
9274 {
9275 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9276 {
9277 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9278 if (dim < dim2)
9279 break;
9280 }
9281 if (NILP (tmp2))
9282 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9283 else
9284 {
9285 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9286 XSETCAR (tmp2, XCAR (tail));
9287 }
9288 }
9289 ASET (val, i, tmp);
15d143f7 9290 }
df7492f9
KH
9291 }
9292 ASET (attrs, coding_attr_charset_valids, val);
9293 category = coding_category_charset;
9294 }
9295 else if (EQ (coding_type, Qccl))
9296 {
9297 Lisp_Object valids;
ecec61c1 9298
df7492f9
KH
9299 if (nargs < coding_arg_ccl_max)
9300 goto short_args;
ecec61c1 9301
df7492f9
KH
9302 val = args[coding_arg_ccl_decoder];
9303 CHECK_CCL_PROGRAM (val);
9304 if (VECTORP (val))
9305 val = Fcopy_sequence (val);
9306 ASET (attrs, coding_attr_ccl_decoder, val);
ecec61c1 9307
df7492f9
KH
9308 val = args[coding_arg_ccl_encoder];
9309 CHECK_CCL_PROGRAM (val);
9310 if (VECTORP (val))
9311 val = Fcopy_sequence (val);
9312 ASET (attrs, coding_attr_ccl_encoder, val);
ecec61c1 9313
df7492f9
KH
9314 val = args[coding_arg_ccl_valids];
9315 valids = Fmake_string (make_number (256), make_number (0));
9316 for (tail = val; !NILP (tail); tail = Fcdr (tail))
9317 {
8dcbea82 9318 int from, to;
ecec61c1 9319
df7492f9
KH
9320 val = Fcar (tail);
9321 if (INTEGERP (val))
8dcbea82
KH
9322 {
9323 from = to = XINT (val);
9324 if (from < 0 || from > 255)
9325 args_out_of_range_3 (val, make_number (0), make_number (255));
9326 }
df7492f9
KH
9327 else
9328 {
df7492f9 9329 CHECK_CONS (val);
8f924df7
KH
9330 CHECK_NATNUM_CAR (val);
9331 CHECK_NATNUM_CDR (val);
df7492f9 9332 from = XINT (XCAR (val));
8f924df7 9333 if (from > 255)
8dcbea82
KH
9334 args_out_of_range_3 (XCAR (val),
9335 make_number (0), make_number (255));
df7492f9 9336 to = XINT (XCDR (val));
8dcbea82
KH
9337 if (to < from || to > 255)
9338 args_out_of_range_3 (XCDR (val),
9339 XCAR (val), make_number (255));
df7492f9 9340 }
8dcbea82 9341 for (i = from; i <= to; i++)
8f924df7 9342 SSET (valids, i, 1);
df7492f9
KH
9343 }
9344 ASET (attrs, coding_attr_ccl_valids, valids);
4ed46869 9345
df7492f9 9346 category = coding_category_ccl;
55ab7be3 9347 }
df7492f9 9348 else if (EQ (coding_type, Qutf_16))
55ab7be3 9349 {
df7492f9 9350 Lisp_Object bom, endian;
4ed46869 9351
584948ac 9352 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
4ed46869 9353
df7492f9
KH
9354 if (nargs < coding_arg_utf16_max)
9355 goto short_args;
4ed46869 9356
df7492f9
KH
9357 bom = args[coding_arg_utf16_bom];
9358 if (! NILP (bom) && ! EQ (bom, Qt))
9359 {
9360 CHECK_CONS (bom);
8f924df7
KH
9361 val = XCAR (bom);
9362 CHECK_CODING_SYSTEM (val);
9363 val = XCDR (bom);
9364 CHECK_CODING_SYSTEM (val);
df7492f9 9365 }
a470d443 9366 ASET (attrs, coding_attr_utf_bom, bom);
df7492f9
KH
9367
9368 endian = args[coding_arg_utf16_endian];
b49a1807
KH
9369 CHECK_SYMBOL (endian);
9370 if (NILP (endian))
9371 endian = Qbig;
9372 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8f924df7 9373 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
df7492f9
KH
9374 ASET (attrs, coding_attr_utf_16_endian, endian);
9375
9376 category = (CONSP (bom)
9377 ? coding_category_utf_16_auto
9378 : NILP (bom)
b49a1807 9379 ? (EQ (endian, Qbig)
df7492f9
KH
9380 ? coding_category_utf_16_be_nosig
9381 : coding_category_utf_16_le_nosig)
b49a1807 9382 : (EQ (endian, Qbig)
df7492f9
KH
9383 ? coding_category_utf_16_be
9384 : coding_category_utf_16_le));
9385 }
9386 else if (EQ (coding_type, Qiso_2022))
9387 {
9388 Lisp_Object initial, reg_usage, request, flags;
4776e638 9389 int i;
1397dc18 9390
df7492f9
KH
9391 if (nargs < coding_arg_iso2022_max)
9392 goto short_args;
9393
9394 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9395 CHECK_VECTOR (initial);
9396 for (i = 0; i < 4; i++)
9397 {
9398 val = Faref (initial, make_number (i));
9399 if (! NILP (val))
9400 {
584948ac
KH
9401 struct charset *charset;
9402
9403 CHECK_CHARSET_GET_CHARSET (val, charset);
9404 ASET (initial, i, make_number (CHARSET_ID (charset)));
9405 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9406 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9
KH
9407 }
9408 else
9409 ASET (initial, i, make_number (-1));
9410 }
9411
9412 reg_usage = args[coding_arg_iso2022_reg_usage];
9413 CHECK_CONS (reg_usage);
8f924df7
KH
9414 CHECK_NUMBER_CAR (reg_usage);
9415 CHECK_NUMBER_CDR (reg_usage);
df7492f9
KH
9416
9417 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9418 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
1397dc18 9419 {
df7492f9 9420 int id;
8f924df7 9421 Lisp_Object tmp;
df7492f9
KH
9422
9423 val = Fcar (tail);
9424 CHECK_CONS (val);
8f924df7
KH
9425 tmp = XCAR (val);
9426 CHECK_CHARSET_GET_ID (tmp, id);
9427 CHECK_NATNUM_CDR (val);
df7492f9
KH
9428 if (XINT (XCDR (val)) >= 4)
9429 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8f924df7 9430 XSETCAR (val, make_number (id));
1397dc18 9431 }
4ed46869 9432
df7492f9
KH
9433 flags = args[coding_arg_iso2022_flags];
9434 CHECK_NATNUM (flags);
9435 i = XINT (flags);
9436 if (EQ (args[coding_arg_charset_list], Qiso_2022))
9437 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
9438
9439 ASET (attrs, coding_attr_iso_initial, initial);
9440 ASET (attrs, coding_attr_iso_usage, reg_usage);
9441 ASET (attrs, coding_attr_iso_request, request);
9442 ASET (attrs, coding_attr_iso_flags, flags);
9443 setup_iso_safe_charsets (attrs);
9444
9445 if (i & CODING_ISO_FLAG_SEVEN_BITS)
9446 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9447 | CODING_ISO_FLAG_SINGLE_SHIFT))
9448 ? coding_category_iso_7_else
9449 : EQ (args[coding_arg_charset_list], Qiso_2022)
9450 ? coding_category_iso_7
9451 : coding_category_iso_7_tight);
9452 else
9453 {
9454 int id = XINT (AREF (initial, 1));
9455
c6fb6e98 9456 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
df7492f9
KH
9457 || EQ (args[coding_arg_charset_list], Qiso_2022)
9458 || id < 0)
9459 ? coding_category_iso_8_else
9460 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9461 ? coding_category_iso_8_1
9462 : coding_category_iso_8_2);
9463 }
0ce7886f
KH
9464 if (category != coding_category_iso_8_1
9465 && category != coding_category_iso_8_2)
9466 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
df7492f9
KH
9467 }
9468 else if (EQ (coding_type, Qemacs_mule))
c28a9453 9469 {
df7492f9
KH
9470 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9471 ASET (attrs, coding_attr_emacs_mule_full, Qt);
584948ac 9472 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9 9473 category = coding_category_emacs_mule;
c28a9453 9474 }
df7492f9 9475 else if (EQ (coding_type, Qshift_jis))
c28a9453 9476 {
df7492f9
KH
9477
9478 struct charset *charset;
9479
7d64c6ad 9480 if (XINT (Flength (charset_list)) != 3
6e07c25f 9481 && XINT (Flength (charset_list)) != 4)
7d64c6ad 9482 error ("There should be three or four charsets");
df7492f9
KH
9483
9484 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9485 if (CHARSET_DIMENSION (charset) != 1)
9486 error ("Dimension of charset %s is not one",
8f924df7 9487 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
584948ac
KH
9488 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9489 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9
KH
9490
9491 charset_list = XCDR (charset_list);
9492 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9493 if (CHARSET_DIMENSION (charset) != 1)
9494 error ("Dimension of charset %s is not one",
8f924df7 9495 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
df7492f9
KH
9496
9497 charset_list = XCDR (charset_list);
9498 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9499 if (CHARSET_DIMENSION (charset) != 2)
7d64c6ad
KH
9500 error ("Dimension of charset %s is not two",
9501 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9502
9503 charset_list = XCDR (charset_list);
2b917a06
KH
9504 if (! NILP (charset_list))
9505 {
9506 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9507 if (CHARSET_DIMENSION (charset) != 2)
9508 error ("Dimension of charset %s is not two",
9509 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9510 }
df7492f9
KH
9511
9512 category = coding_category_sjis;
9513 Vsjis_coding_system = name;
c28a9453 9514 }
df7492f9
KH
9515 else if (EQ (coding_type, Qbig5))
9516 {
9517 struct charset *charset;
4ed46869 9518
df7492f9
KH
9519 if (XINT (Flength (charset_list)) != 2)
9520 error ("There should be just two charsets");
9521
9522 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9523 if (CHARSET_DIMENSION (charset) != 1)
9524 error ("Dimension of charset %s is not one",
8f924df7 9525 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
584948ac
KH
9526 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9527 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9
KH
9528
9529 charset_list = XCDR (charset_list);
9530 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9531 if (CHARSET_DIMENSION (charset) != 2)
9532 error ("Dimension of charset %s is not two",
8f924df7 9533 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
4ed46869 9534
df7492f9
KH
9535 category = coding_category_big5;
9536 Vbig5_coding_system = name;
9537 }
9538 else if (EQ (coding_type, Qraw_text))
c28a9453 9539 {
584948ac
KH
9540 category = coding_category_raw_text;
9541 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
c28a9453 9542 }
df7492f9 9543 else if (EQ (coding_type, Qutf_8))
4ed46869 9544 {
a470d443
KH
9545 Lisp_Object bom;
9546
584948ac 9547 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
a470d443
KH
9548
9549 if (nargs < coding_arg_utf8_max)
9550 goto short_args;
9551
9552 bom = args[coding_arg_utf8_bom];
9553 if (! NILP (bom) && ! EQ (bom, Qt))
9554 {
9555 CHECK_CONS (bom);
9556 val = XCAR (bom);
9557 CHECK_CODING_SYSTEM (val);
9558 val = XCDR (bom);
9559 CHECK_CODING_SYSTEM (val);
9560 }
9561 ASET (attrs, coding_attr_utf_bom, bom);
9562
9563 category = (CONSP (bom) ? coding_category_utf_8_auto
9564 : NILP (bom) ? coding_category_utf_8_nosig
9565 : coding_category_utf_8_sig);
4ed46869 9566 }
df7492f9
KH
9567 else if (EQ (coding_type, Qundecided))
9568 category = coding_category_undecided;
4ed46869 9569 else
df7492f9 9570 error ("Invalid coding system type: %s",
8f924df7 9571 SDATA (SYMBOL_NAME (coding_type)));
4ed46869 9572
df7492f9 9573 CODING_ATTR_CATEGORY (attrs) = make_number (category);
01378f49
KH
9574 CODING_ATTR_PLIST (attrs)
9575 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
9576 CODING_ATTR_PLIST (attrs)));
35befdaa 9577 CODING_ATTR_PLIST (attrs)
3ed051d4 9578 = Fcons (QCascii_compatible_p,
35befdaa
KH
9579 Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9580 CODING_ATTR_PLIST (attrs)));
c4825358 9581
df7492f9
KH
9582 eol_type = args[coding_arg_eol_type];
9583 if (! NILP (eol_type)
9584 && ! EQ (eol_type, Qunix)
9585 && ! EQ (eol_type, Qdos)
9586 && ! EQ (eol_type, Qmac))
9587 error ("Invalid eol-type");
4ed46869 9588
df7492f9 9589 aliases = Fcons (name, Qnil);
4ed46869 9590
df7492f9
KH
9591 if (NILP (eol_type))
9592 {
9593 eol_type = make_subsidiaries (name);
9594 for (i = 0; i < 3; i++)
1397dc18 9595 {
df7492f9
KH
9596 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
9597
9598 this_name = AREF (eol_type, i);
9599 this_aliases = Fcons (this_name, Qnil);
9600 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
9601 this_spec = Fmake_vector (make_number (3), attrs);
9602 ASET (this_spec, 1, this_aliases);
9603 ASET (this_spec, 2, this_eol_type);
9604 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
9605 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
583f71ca
KH
9606 val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
9607 if (NILP (val))
9608 Vcoding_system_alist
9609 = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
9610 Vcoding_system_alist);
1397dc18 9611 }
d46c5b12 9612 }
4ed46869 9613
df7492f9
KH
9614 spec_vec = Fmake_vector (make_number (3), attrs);
9615 ASET (spec_vec, 1, aliases);
9616 ASET (spec_vec, 2, eol_type);
48b0f3ae 9617
df7492f9
KH
9618 Fputhash (name, spec_vec, Vcoding_system_hash_table);
9619 Vcoding_system_list = Fcons (name, Vcoding_system_list);
583f71ca
KH
9620 val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
9621 if (NILP (val))
9622 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
9623 Vcoding_system_alist);
48b0f3ae 9624
df7492f9
KH
9625 {
9626 int id = coding_categories[category].id;
48b0f3ae 9627
df7492f9
KH
9628 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
9629 setup_coding_system (name, &coding_categories[category]);
9630 }
48b0f3ae 9631
d46c5b12 9632 return Qnil;
48b0f3ae 9633
df7492f9
KH
9634 short_args:
9635 return Fsignal (Qwrong_number_of_arguments,
9636 Fcons (intern ("define-coding-system-internal"),
9637 make_number (nargs)));
d46c5b12 9638}
4ed46869 9639
d6925f38 9640
a6f87d34
KH
9641DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
9642 3, 3, 0,
9643 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
9644 (coding_system, prop, val)
9645 Lisp_Object coding_system, prop, val;
9646{
3dbe7859 9647 Lisp_Object spec, attrs;
a6f87d34
KH
9648
9649 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9650 attrs = AREF (spec, 0);
9651 if (EQ (prop, QCmnemonic))
9652 {
9653 if (! STRINGP (val))
9654 CHECK_CHARACTER (val);
9655 CODING_ATTR_MNEMONIC (attrs) = val;
9656 }
2133e2d1 9657 else if (EQ (prop, QCdefault_char))
a6f87d34
KH
9658 {
9659 if (NILP (val))
9660 val = make_number (' ');
9661 else
9662 CHECK_CHARACTER (val);
9663 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9664 }
9665 else if (EQ (prop, QCdecode_translation_table))
9666 {
9667 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9668 CHECK_SYMBOL (val);
9669 CODING_ATTR_DECODE_TBL (attrs) = val;
9670 }
9671 else if (EQ (prop, QCencode_translation_table))
9672 {
9673 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9674 CHECK_SYMBOL (val);
9675 CODING_ATTR_ENCODE_TBL (attrs) = val;
9676 }
9677 else if (EQ (prop, QCpost_read_conversion))
9678 {
9679 CHECK_SYMBOL (val);
9680 CODING_ATTR_POST_READ (attrs) = val;
9681 }
9682 else if (EQ (prop, QCpre_write_conversion))
9683 {
9684 CHECK_SYMBOL (val);
9685 CODING_ATTR_PRE_WRITE (attrs) = val;
9686 }
35befdaa
KH
9687 else if (EQ (prop, QCascii_compatible_p))
9688 {
9689 CODING_ATTR_ASCII_COMPAT (attrs) = val;
9690 }
a6f87d34
KH
9691
9692 CODING_ATTR_PLIST (attrs)
9693 = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
9694 return val;
9695}
9696
9697
df7492f9
KH
9698DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
9699 Sdefine_coding_system_alias, 2, 2, 0,
9700 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
9701 (alias, coding_system)
9702 Lisp_Object alias, coding_system;
66cfb530 9703{
583f71ca 9704 Lisp_Object spec, aliases, eol_type, val;
4ed46869 9705
df7492f9
KH
9706 CHECK_SYMBOL (alias);
9707 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9708 aliases = AREF (spec, 1);
d4a1d553 9709 /* ALIASES should be a list of length more than zero, and the first
d6925f38
KH
9710 element is a base coding system. Append ALIAS at the tail of the
9711 list. */
df7492f9
KH
9712 while (!NILP (XCDR (aliases)))
9713 aliases = XCDR (aliases);
8f924df7 9714 XSETCDR (aliases, Fcons (alias, Qnil));
4ed46869 9715
df7492f9
KH
9716 eol_type = AREF (spec, 2);
9717 if (VECTORP (eol_type))
4ed46869 9718 {
df7492f9
KH
9719 Lisp_Object subsidiaries;
9720 int i;
4ed46869 9721
df7492f9
KH
9722 subsidiaries = make_subsidiaries (alias);
9723 for (i = 0; i < 3; i++)
9724 Fdefine_coding_system_alias (AREF (subsidiaries, i),
9725 AREF (eol_type, i));
4ed46869 9726 }
df7492f9
KH
9727
9728 Fputhash (alias, spec, Vcoding_system_hash_table);
d6925f38 9729 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
583f71ca
KH
9730 val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
9731 if (NILP (val))
9732 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
9733 Vcoding_system_alist);
66cfb530 9734
4ed46869
KH
9735 return Qnil;
9736}
9737
df7492f9
KH
9738DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
9739 1, 1, 0,
9740 doc: /* Return the base of CODING-SYSTEM.
da7db224 9741Any alias or subsidiary coding system is not a base coding system. */)
df7492f9
KH
9742 (coding_system)
9743 Lisp_Object coding_system;
d46c5b12 9744{
df7492f9 9745 Lisp_Object spec, attrs;
d46c5b12 9746
df7492f9
KH
9747 if (NILP (coding_system))
9748 return (Qno_conversion);
9749 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9750 attrs = AREF (spec, 0);
9751 return CODING_ATTR_BASE_NAME (attrs);
9752}
1397dc18 9753
df7492f9
KH
9754DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
9755 1, 1, 0,
9756 doc: "Return the property list of CODING-SYSTEM.")
9757 (coding_system)
9758 Lisp_Object coding_system;
9759{
9760 Lisp_Object spec, attrs;
1397dc18 9761
df7492f9
KH
9762 if (NILP (coding_system))
9763 coding_system = Qno_conversion;
9764 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9765 attrs = AREF (spec, 0);
9766 return CODING_ATTR_PLIST (attrs);
d46c5b12
KH
9767}
9768
df7492f9
KH
9769
9770DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
9771 1, 1, 0,
da7db224 9772 doc: /* Return the list of aliases of CODING-SYSTEM. */)
df7492f9
KH
9773 (coding_system)
9774 Lisp_Object coding_system;
66cfb530 9775{
df7492f9 9776 Lisp_Object spec;
84d60297 9777
df7492f9
KH
9778 if (NILP (coding_system))
9779 coding_system = Qno_conversion;
9780 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
da7db224 9781 return AREF (spec, 1);
df7492f9 9782}
66cfb530 9783
df7492f9
KH
9784DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
9785 Scoding_system_eol_type, 1, 1, 0,
9786 doc: /* Return eol-type of CODING-SYSTEM.
d4a1d553 9787An eol-type is an integer 0, 1, 2, or a vector of coding systems.
66cfb530 9788
df7492f9
KH
9789Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
9790and CR respectively.
66cfb530 9791
df7492f9
KH
9792A vector value indicates that a format of end-of-line should be
9793detected automatically. Nth element of the vector is the subsidiary
9794coding system whose eol-type is N. */)
6b89e3aa
KH
9795 (coding_system)
9796 Lisp_Object coding_system;
9797{
df7492f9
KH
9798 Lisp_Object spec, eol_type;
9799 int n;
6b89e3aa 9800
df7492f9
KH
9801 if (NILP (coding_system))
9802 coding_system = Qno_conversion;
9803 if (! CODING_SYSTEM_P (coding_system))
9804 return Qnil;
9805 spec = CODING_SYSTEM_SPEC (coding_system);
9806 eol_type = AREF (spec, 2);
9807 if (VECTORP (eol_type))
9808 return Fcopy_sequence (eol_type);
9809 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
9810 return make_number (n);
6b89e3aa
KH
9811}
9812
4ed46869
KH
9813#endif /* emacs */
9814
9815\f
1397dc18 9816/*** 12. Postamble ***/
4ed46869 9817
dfcf069d 9818void
4ed46869
KH
9819init_coding_once ()
9820{
9821 int i;
9822
df7492f9
KH
9823 for (i = 0; i < coding_category_max; i++)
9824 {
9825 coding_categories[i].id = -1;
9826 coding_priorities[i] = i;
9827 }
4ed46869
KH
9828
9829 /* ISO2022 specific initialize routine. */
9830 for (i = 0; i < 0x20; i++)
b73bfc1c 9831 iso_code_class[i] = ISO_control_0;
4ed46869
KH
9832 for (i = 0x21; i < 0x7F; i++)
9833 iso_code_class[i] = ISO_graphic_plane_0;
9834 for (i = 0x80; i < 0xA0; i++)
b73bfc1c 9835 iso_code_class[i] = ISO_control_1;
4ed46869
KH
9836 for (i = 0xA1; i < 0xFF; i++)
9837 iso_code_class[i] = ISO_graphic_plane_1;
9838 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
9839 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
4ed46869
KH
9840 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
9841 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
9842 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
9843 iso_code_class[ISO_CODE_ESC] = ISO_escape;
9844 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
9845 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
9846 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
9847
df7492f9
KH
9848 for (i = 0; i < 256; i++)
9849 {
9850 emacs_mule_bytes[i] = 1;
9851 }
7c78e542
KH
9852 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
9853 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
9854 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
9855 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
e0e989f6
KH
9856}
9857
9858#ifdef emacs
9859
dfcf069d 9860void
e0e989f6
KH
9861syms_of_coding ()
9862{
df7492f9 9863 staticpro (&Vcoding_system_hash_table);
8f924df7
KH
9864 {
9865 Lisp_Object args[2];
9866 args[0] = QCtest;
9867 args[1] = Qeq;
9868 Vcoding_system_hash_table = Fmake_hash_table (2, args);
9869 }
df7492f9
KH
9870
9871 staticpro (&Vsjis_coding_system);
9872 Vsjis_coding_system = Qnil;
e0e989f6 9873
df7492f9
KH
9874 staticpro (&Vbig5_coding_system);
9875 Vbig5_coding_system = Qnil;
9876
24a73b0a
KH
9877 staticpro (&Vcode_conversion_reused_workbuf);
9878 Vcode_conversion_reused_workbuf = Qnil;
9879
9880 staticpro (&Vcode_conversion_workbuf_name);
9881 Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
e0e989f6 9882
24a73b0a 9883 reused_workbuf_in_use = 0;
df7492f9
KH
9884
9885 DEFSYM (Qcharset, "charset");
9886 DEFSYM (Qtarget_idx, "target-idx");
9887 DEFSYM (Qcoding_system_history, "coding-system-history");
bb0115a2
RS
9888 Fset (Qcoding_system_history, Qnil);
9889
9ce27fde 9890 /* Target FILENAME is the first argument. */
e0e989f6 9891 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 9892 /* Target FILENAME is the third argument. */
e0e989f6
KH
9893 Fput (Qwrite_region, Qtarget_idx, make_number (2));
9894
df7492f9 9895 DEFSYM (Qcall_process, "call-process");
9ce27fde 9896 /* Target PROGRAM is the first argument. */
e0e989f6
KH
9897 Fput (Qcall_process, Qtarget_idx, make_number (0));
9898
df7492f9 9899 DEFSYM (Qcall_process_region, "call-process-region");
9ce27fde 9900 /* Target PROGRAM is the third argument. */
e0e989f6
KH
9901 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
9902
df7492f9 9903 DEFSYM (Qstart_process, "start-process");
9ce27fde 9904 /* Target PROGRAM is the third argument. */
e0e989f6
KH
9905 Fput (Qstart_process, Qtarget_idx, make_number (2));
9906
df7492f9 9907 DEFSYM (Qopen_network_stream, "open-network-stream");
9ce27fde 9908 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
9909 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
9910
df7492f9
KH
9911 DEFSYM (Qcoding_system, "coding-system");
9912 DEFSYM (Qcoding_aliases, "coding-aliases");
4ed46869 9913
df7492f9
KH
9914 DEFSYM (Qeol_type, "eol-type");
9915 DEFSYM (Qunix, "unix");
9916 DEFSYM (Qdos, "dos");
4ed46869 9917
df7492f9
KH
9918 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
9919 DEFSYM (Qpost_read_conversion, "post-read-conversion");
9920 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
9921 DEFSYM (Qdefault_char, "default-char");
9922 DEFSYM (Qundecided, "undecided");
9923 DEFSYM (Qno_conversion, "no-conversion");
9924 DEFSYM (Qraw_text, "raw-text");
4ed46869 9925
df7492f9 9926 DEFSYM (Qiso_2022, "iso-2022");
4ed46869 9927
df7492f9 9928 DEFSYM (Qutf_8, "utf-8");
8f924df7 9929 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
27901516 9930
df7492f9 9931 DEFSYM (Qutf_16, "utf-16");
df7492f9
KH
9932 DEFSYM (Qbig, "big");
9933 DEFSYM (Qlittle, "little");
27901516 9934
df7492f9
KH
9935 DEFSYM (Qshift_jis, "shift-jis");
9936 DEFSYM (Qbig5, "big5");
4ed46869 9937
df7492f9 9938 DEFSYM (Qcoding_system_p, "coding-system-p");
4ed46869 9939
df7492f9 9940 DEFSYM (Qcoding_system_error, "coding-system-error");
4ed46869
KH
9941 Fput (Qcoding_system_error, Qerror_conditions,
9942 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
9943 Fput (Qcoding_system_error, Qerror_message,
9ce27fde 9944 build_string ("Invalid coding system"));
4ed46869 9945
05e6f5dc
KH
9946 /* Intern this now in case it isn't already done.
9947 Setting this variable twice is harmless.
9948 But don't staticpro it here--that is done in alloc.c. */
9949 Qchar_table_extra_slots = intern ("char-table-extra-slots");
70c22245 9950
df7492f9 9951 DEFSYM (Qtranslation_table, "translation-table");
433f7f87 9952 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
df7492f9
KH
9953 DEFSYM (Qtranslation_table_id, "translation-table-id");
9954 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
9955 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
1397dc18 9956
df7492f9 9957 DEFSYM (Qvalid_codes, "valid-codes");
9ce27fde 9958
df7492f9 9959 DEFSYM (Qemacs_mule, "emacs-mule");
d46c5b12 9960
01378f49 9961 DEFSYM (QCcategory, ":category");
a6f87d34 9962 DEFSYM (QCmnemonic, ":mnemonic");
2133e2d1 9963 DEFSYM (QCdefault_char, ":default-char");
a6f87d34
KH
9964 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
9965 DEFSYM (QCencode_translation_table, ":encode-translation-table");
9966 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
9967 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
35befdaa 9968 DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
01378f49 9969
df7492f9
KH
9970 Vcoding_category_table
9971 = Fmake_vector (make_number (coding_category_max), Qnil);
9972 staticpro (&Vcoding_category_table);
9973 /* Followings are target of code detection. */
9974 ASET (Vcoding_category_table, coding_category_iso_7,
9975 intern ("coding-category-iso-7"));
9976 ASET (Vcoding_category_table, coding_category_iso_7_tight,
9977 intern ("coding-category-iso-7-tight"));
9978 ASET (Vcoding_category_table, coding_category_iso_8_1,
9979 intern ("coding-category-iso-8-1"));
9980 ASET (Vcoding_category_table, coding_category_iso_8_2,
9981 intern ("coding-category-iso-8-2"));
9982 ASET (Vcoding_category_table, coding_category_iso_7_else,
9983 intern ("coding-category-iso-7-else"));
9984 ASET (Vcoding_category_table, coding_category_iso_8_else,
9985 intern ("coding-category-iso-8-else"));
a470d443
KH
9986 ASET (Vcoding_category_table, coding_category_utf_8_auto,
9987 intern ("coding-category-utf-8-auto"));
9988 ASET (Vcoding_category_table, coding_category_utf_8_nosig,
df7492f9 9989 intern ("coding-category-utf-8"));
a470d443
KH
9990 ASET (Vcoding_category_table, coding_category_utf_8_sig,
9991 intern ("coding-category-utf-8-sig"));
df7492f9
KH
9992 ASET (Vcoding_category_table, coding_category_utf_16_be,
9993 intern ("coding-category-utf-16-be"));
ff563fce
KH
9994 ASET (Vcoding_category_table, coding_category_utf_16_auto,
9995 intern ("coding-category-utf-16-auto"));
df7492f9
KH
9996 ASET (Vcoding_category_table, coding_category_utf_16_le,
9997 intern ("coding-category-utf-16-le"));
9998 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
9999 intern ("coding-category-utf-16-be-nosig"));
10000 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10001 intern ("coding-category-utf-16-le-nosig"));
10002 ASET (Vcoding_category_table, coding_category_charset,
10003 intern ("coding-category-charset"));
10004 ASET (Vcoding_category_table, coding_category_sjis,
10005 intern ("coding-category-sjis"));
10006 ASET (Vcoding_category_table, coding_category_big5,
10007 intern ("coding-category-big5"));
10008 ASET (Vcoding_category_table, coding_category_ccl,
10009 intern ("coding-category-ccl"));
10010 ASET (Vcoding_category_table, coding_category_emacs_mule,
10011 intern ("coding-category-emacs-mule"));
10012 /* The following are NOT targets of code detection. */
10013 ASET (Vcoding_category_table, coding_category_raw_text,
10014 intern ("coding-category-raw-text"));
10015 ASET (Vcoding_category_table, coding_category_undecided,
10016 intern ("coding-category-undecided"));
ecf488bc 10017
065e3595
KH
10018 DEFSYM (Qinsufficient_source, "insufficient-source");
10019 DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10020 DEFSYM (Qinvalid_source, "invalid-source");
10021 DEFSYM (Qinterrupted, "interrupted");
10022 DEFSYM (Qinsufficient_memory, "insufficient-memory");
44e8490d 10023 DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
065e3595 10024
4ed46869
KH
10025 defsubr (&Scoding_system_p);
10026 defsubr (&Sread_coding_system);
10027 defsubr (&Sread_non_nil_coding_system);
10028 defsubr (&Scheck_coding_system);
10029 defsubr (&Sdetect_coding_region);
d46c5b12 10030 defsubr (&Sdetect_coding_string);
05e6f5dc 10031 defsubr (&Sfind_coding_systems_region_internal);
068a9dbd 10032 defsubr (&Sunencodable_char_position);
df7492f9 10033 defsubr (&Scheck_coding_systems_region);
4ed46869
KH
10034 defsubr (&Sdecode_coding_region);
10035 defsubr (&Sencode_coding_region);
10036 defsubr (&Sdecode_coding_string);
10037 defsubr (&Sencode_coding_string);
10038 defsubr (&Sdecode_sjis_char);
10039 defsubr (&Sencode_sjis_char);
10040 defsubr (&Sdecode_big5_char);
10041 defsubr (&Sencode_big5_char);
1ba9e4ab 10042 defsubr (&Sset_terminal_coding_system_internal);
c4825358 10043 defsubr (&Sset_safe_terminal_coding_system_internal);
4ed46869 10044 defsubr (&Sterminal_coding_system);
1ba9e4ab 10045 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 10046 defsubr (&Skeyboard_coding_system);
a5d301df 10047 defsubr (&Sfind_operation_coding_system);
df7492f9 10048 defsubr (&Sset_coding_system_priority);
6b89e3aa 10049 defsubr (&Sdefine_coding_system_internal);
df7492f9 10050 defsubr (&Sdefine_coding_system_alias);
a6f87d34 10051 defsubr (&Scoding_system_put);
df7492f9
KH
10052 defsubr (&Scoding_system_base);
10053 defsubr (&Scoding_system_plist);
10054 defsubr (&Scoding_system_aliases);
10055 defsubr (&Scoding_system_eol_type);
10056 defsubr (&Scoding_system_priority_list);
4ed46869 10057
4608c386 10058 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
48b0f3ae
PJ
10059 doc: /* List of coding systems.
10060
10061Do not alter the value of this variable manually. This variable should be
df7492f9 10062updated by the functions `define-coding-system' and
48b0f3ae 10063`define-coding-system-alias'. */);
4608c386
KH
10064 Vcoding_system_list = Qnil;
10065
10066 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
48b0f3ae
PJ
10067 doc: /* Alist of coding system names.
10068Each element is a one-element list containing a coding system name.
446dcd75 10069This variable is given to `completing-read' as COLLECTION argument.
48b0f3ae
PJ
10070
10071Do not alter the value of this variable manually. This variable should be
10072updated by the functions `make-coding-system' and
10073`define-coding-system-alias'. */);
4608c386
KH
10074 Vcoding_system_alist = Qnil;
10075
4ed46869 10076 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
48b0f3ae
PJ
10077 doc: /* List of coding-categories (symbols) ordered by priority.
10078
10079On detecting a coding system, Emacs tries code detection algorithms
10080associated with each coding-category one by one in this order. When
10081one algorithm agrees with a byte sequence of source text, the coding
0ec31faf
KH
10082system bound to the corresponding coding-category is selected.
10083
42205607 10084Don't modify this variable directly, but use `set-coding-system-priority'. */);
4ed46869
KH
10085 {
10086 int i;
10087
10088 Vcoding_category_list = Qnil;
df7492f9 10089 for (i = coding_category_max - 1; i >= 0; i--)
4ed46869 10090 Vcoding_category_list
d46c5b12
KH
10091 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
10092 Vcoding_category_list);
4ed46869
KH
10093 }
10094
10095 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
48b0f3ae
PJ
10096 doc: /* Specify the coding system for read operations.
10097It is useful to bind this variable with `let', but do not set it globally.
10098If the value is a coding system, it is used for decoding on read operation.
446dcd75
JB
10099If not, an appropriate element is used from one of the coding system alists.
10100There are three such tables: `file-coding-system-alist',
48b0f3ae 10101`process-coding-system-alist', and `network-coding-system-alist'. */);
4ed46869
KH
10102 Vcoding_system_for_read = Qnil;
10103
10104 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
48b0f3ae
PJ
10105 doc: /* Specify the coding system for write operations.
10106Programs bind this variable with `let', but you should not set it globally.
10107If the value is a coding system, it is used for encoding of output,
10108when writing it to a file and when sending it to a file or subprocess.
10109
10110If this does not specify a coding system, an appropriate element
446dcd75
JB
10111is used from one of the coding system alists.
10112There are three such tables: `file-coding-system-alist',
48b0f3ae
PJ
10113`process-coding-system-alist', and `network-coding-system-alist'.
10114For output to files, if the above procedure does not specify a coding system,
10115the value of `buffer-file-coding-system' is used. */);
4ed46869
KH
10116 Vcoding_system_for_write = Qnil;
10117
10118 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
df7492f9
KH
10119 doc: /*
10120Coding system used in the latest file or process I/O. */);
4ed46869
KH
10121 Vlast_coding_system_used = Qnil;
10122
065e3595
KH
10123 DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
10124 doc: /*
10125Error status of the last code conversion.
10126
10127When an error was detected in the last code conversion, this variable
10128is set to one of the following symbols.
10129 `insufficient-source'
10130 `inconsistent-eol'
10131 `invalid-source'
10132 `interrupted'
10133 `insufficient-memory'
10134When no error was detected, the value doesn't change. So, to check
10135the error status of a code conversion by this variable, you must
10136explicitly set this variable to nil before performing code
10137conversion. */);
10138 Vlast_code_conversion_error = Qnil;
10139
9ce27fde 10140 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
df7492f9
KH
10141 doc: /*
10142*Non-nil means always inhibit code conversion of end-of-line format.
48b0f3ae
PJ
10143See info node `Coding Systems' and info node `Text and Binary' concerning
10144such conversion. */);
9ce27fde
KH
10145 inhibit_eol_conversion = 0;
10146
ed29121d 10147 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
df7492f9
KH
10148 doc: /*
10149Non-nil means process buffer inherits coding system of process output.
48b0f3ae
PJ
10150Bind it to t if the process output is to be treated as if it were a file
10151read from some filesystem. */);
ed29121d
EZ
10152 inherit_process_coding_system = 0;
10153
02ba4723 10154 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
df7492f9
KH
10155 doc: /*
10156Alist to decide a coding system to use for a file I/O operation.
48b0f3ae
PJ
10157The format is ((PATTERN . VAL) ...),
10158where PATTERN is a regular expression matching a file name,
10159VAL is a coding system, a cons of coding systems, or a function symbol.
10160If VAL is a coding system, it is used for both decoding and encoding
10161the file contents.
10162If VAL is a cons of coding systems, the car part is used for decoding,
10163and the cdr part is used for encoding.
10164If VAL is a function symbol, the function must return a coding system
2c53e699
KH
10165or a cons of coding systems which are used as above. The function is
10166called with an argument that is a list of the arguments with which
5a0bbd9a
KH
10167`find-operation-coding-system' was called. If the function can't decide
10168a coding system, it can return `undecided' so that the normal
10169code-detection is performed.
48b0f3ae
PJ
10170
10171See also the function `find-operation-coding-system'
10172and the variable `auto-coding-alist'. */);
02ba4723
KH
10173 Vfile_coding_system_alist = Qnil;
10174
10175 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
df7492f9
KH
10176 doc: /*
10177Alist to decide a coding system to use for a process I/O operation.
48b0f3ae
PJ
10178The format is ((PATTERN . VAL) ...),
10179where PATTERN is a regular expression matching a program name,
10180VAL is a coding system, a cons of coding systems, or a function symbol.
10181If VAL is a coding system, it is used for both decoding what is received
10182from the program and encoding what is sent to the program.
10183If VAL is a cons of coding systems, the car part is used for decoding,
10184and the cdr part is used for encoding.
10185If VAL is a function symbol, the function must return a coding system
10186or a cons of coding systems which are used as above.
10187
10188See also the function `find-operation-coding-system'. */);
02ba4723
KH
10189 Vprocess_coding_system_alist = Qnil;
10190
10191 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
df7492f9
KH
10192 doc: /*
10193Alist to decide a coding system to use for a network I/O operation.
48b0f3ae
PJ
10194The format is ((PATTERN . VAL) ...),
10195where PATTERN is a regular expression matching a network service name
10196or is a port number to connect to,
10197VAL is a coding system, a cons of coding systems, or a function symbol.
10198If VAL is a coding system, it is used for both decoding what is received
10199from the network stream and encoding what is sent to the network stream.
10200If VAL is a cons of coding systems, the car part is used for decoding,
10201and the cdr part is used for encoding.
10202If VAL is a function symbol, the function must return a coding system
10203or a cons of coding systems which are used as above.
10204
10205See also the function `find-operation-coding-system'. */);
02ba4723 10206 Vnetwork_coding_system_alist = Qnil;
4ed46869 10207
68c45bf0 10208 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
75205970
RS
10209 doc: /* Coding system to use with system messages.
10210Also used for decoding keyboard input on X Window system. */);
68c45bf0
PE
10211 Vlocale_coding_system = Qnil;
10212
005f0d35 10213 /* The eol mnemonics are reset in startup.el system-dependently. */
7722baf9 10214 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
df7492f9
KH
10215 doc: /*
10216*String displayed in mode line for UNIX-like (LF) end-of-line format. */);
7722baf9 10217 eol_mnemonic_unix = build_string (":");
4ed46869 10218
7722baf9 10219 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
df7492f9
KH
10220 doc: /*
10221*String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
7722baf9 10222 eol_mnemonic_dos = build_string ("\\");
4ed46869 10223
7722baf9 10224 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
df7492f9
KH
10225 doc: /*
10226*String displayed in mode line for MAC-like (CR) end-of-line format. */);
7722baf9 10227 eol_mnemonic_mac = build_string ("/");
4ed46869 10228
7722baf9 10229 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
df7492f9
KH
10230 doc: /*
10231*String displayed in mode line when end-of-line format is not yet determined. */);
7722baf9 10232 eol_mnemonic_undecided = build_string (":");
4ed46869 10233
84fbb8a0 10234 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
df7492f9
KH
10235 doc: /*
10236*Non-nil enables character translation while encoding and decoding. */);
84fbb8a0 10237 Venable_character_translation = Qt;
bdd9fb48 10238
f967223b 10239 DEFVAR_LISP ("standard-translation-table-for-decode",
48b0f3ae
PJ
10240 &Vstandard_translation_table_for_decode,
10241 doc: /* Table for translating characters while decoding. */);
f967223b 10242 Vstandard_translation_table_for_decode = Qnil;
bdd9fb48 10243
f967223b 10244 DEFVAR_LISP ("standard-translation-table-for-encode",
48b0f3ae
PJ
10245 &Vstandard_translation_table_for_encode,
10246 doc: /* Table for translating characters while encoding. */);
f967223b 10247 Vstandard_translation_table_for_encode = Qnil;
4ed46869 10248
df7492f9 10249 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
48b0f3ae
PJ
10250 doc: /* Alist of charsets vs revision numbers.
10251While encoding, if a charset (car part of an element) is found,
df7492f9
KH
10252designate it with the escape sequence identifying revision (cdr part
10253of the element). */);
10254 Vcharset_revision_table = Qnil;
02ba4723
KH
10255
10256 DEFVAR_LISP ("default-process-coding-system",
10257 &Vdefault_process_coding_system,
48b0f3ae
PJ
10258 doc: /* Cons of coding systems used for process I/O by default.
10259The car part is used for decoding a process output,
10260the cdr part is used for encoding a text to be sent to a process. */);
02ba4723 10261 Vdefault_process_coding_system = Qnil;
c4825358 10262
3f003981 10263 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
df7492f9
KH
10264 doc: /*
10265Table of extra Latin codes in the range 128..159 (inclusive).
48b0f3ae
PJ
10266This is a vector of length 256.
10267If Nth element is non-nil, the existence of code N in a file
10268\(or output of subprocess) doesn't prevent it from being detected as
10269a coding system of ISO 2022 variant which has a flag
10270`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10271or reading output of a subprocess.
446dcd75 10272Only 128th through 159th elements have a meaning. */);
3f003981 10273 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
d46c5b12
KH
10274
10275 DEFVAR_LISP ("select-safe-coding-system-function",
10276 &Vselect_safe_coding_system_function,
df7492f9
KH
10277 doc: /*
10278Function to call to select safe coding system for encoding a text.
48b0f3ae
PJ
10279
10280If set, this function is called to force a user to select a proper
10281coding system which can encode the text in the case that a default
fdecf907
GM
10282coding system used in each operation can't encode the text. The
10283function should take care that the buffer is not modified while
10284the coding system is being selected.
48b0f3ae
PJ
10285
10286The default value is `select-safe-coding-system' (which see). */);
d46c5b12
KH
10287 Vselect_safe_coding_system_function = Qnil;
10288
5d5bf4d8
KH
10289 DEFVAR_BOOL ("coding-system-require-warning",
10290 &coding_system_require_warning,
10291 doc: /* Internal use only.
6b89e3aa
KH
10292If non-nil, on writing a file, `select-safe-coding-system-function' is
10293called even if `coding-system-for-write' is non-nil. The command
10294`universal-coding-system-argument' binds this variable to t temporarily. */);
5d5bf4d8
KH
10295 coding_system_require_warning = 0;
10296
10297
22ab2303 10298 DEFVAR_BOOL ("inhibit-iso-escape-detection",
74383408 10299 &inhibit_iso_escape_detection,
df7492f9 10300 doc: /*
97b1b294 10301If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
48b0f3ae 10302
97b1b294
EZ
10303When Emacs reads text, it tries to detect how the text is encoded.
10304This code detection is sensitive to escape sequences. If Emacs sees
10305a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10306of the ISO2022 encodings, and decodes text by the corresponding coding
10307system (e.g. `iso-2022-7bit').
48b0f3ae
PJ
10308
10309However, there may be a case that you want to read escape sequences in
10310a file as is. In such a case, you can set this variable to non-nil.
97b1b294
EZ
10311Then the code detection will ignore any escape sequences, and no text is
10312detected as encoded in some ISO-2022 encoding. The result is that all
48b0f3ae
PJ
10313escape sequences become visible in a buffer.
10314
10315The default value is nil, and it is strongly recommended not to change
10316it. That is because many Emacs Lisp source files that contain
10317non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10318in Emacs's distribution, and they won't be decoded correctly on
10319reading if you suppress escape sequence detection.
10320
10321The other way to read escape sequences in a file without decoding is
97b1b294 10322to explicitly specify some coding system that doesn't use ISO-2022
48b0f3ae 10323escape sequence (e.g. `latin-1') on reading by \\[universal-coding-system-argument]. */);
74383408 10324 inhibit_iso_escape_detection = 0;
002fdb44 10325
97b1b294
EZ
10326 DEFVAR_BOOL ("inhibit-null-byte-detection",
10327 &inhibit_null_byte_detection,
10328 doc: /* If non-nil, Emacs ignores null bytes on code detection.
10329By default, Emacs treats text that contains null bytes as binary data,
10330and does not attempt to decode it. The effect is as if you specified
10331`no-conversion' for reading that text.
10332
10333Set this to non-nil when a regular text happens to include null bytes.
10334Examples are Index nodes of Info files and null-byte delimited output
10335from GNU Find and GNU Grep. Emacs will then ignore the null bytes and
10336decode text as usual. */);
10337 inhibit_null_byte_detection = 0;
10338
002fdb44 10339 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
15c8f9d1 10340 doc: /* Char table for translating self-inserting characters.
446dcd75
JB
10341This is applied to the result of input methods, not their input.
10342See also `keyboard-translate-table'. */);
002fdb44 10343 Vtranslation_table_for_input = Qnil;
8f924df7 10344
2c78b7e1
KH
10345 {
10346 Lisp_Object args[coding_arg_max];
8f924df7 10347 Lisp_Object plist[16];
2c78b7e1
KH
10348 int i;
10349
10350 for (i = 0; i < coding_arg_max; i++)
10351 args[i] = Qnil;
10352
10353 plist[0] = intern (":name");
10354 plist[1] = args[coding_arg_name] = Qno_conversion;
10355 plist[2] = intern (":mnemonic");
10356 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10357 plist[4] = intern (":coding-type");
10358 plist[5] = args[coding_arg_coding_type] = Qraw_text;
10359 plist[6] = intern (":ascii-compatible-p");
10360 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10361 plist[8] = intern (":default-char");
10362 plist[9] = args[coding_arg_default_char] = make_number (0);
8f924df7
KH
10363 plist[10] = intern (":for-unibyte");
10364 plist[11] = args[coding_arg_for_unibyte] = Qt;
10365 plist[12] = intern (":docstring");
10366 plist[13] = build_string ("Do no conversion.\n\
2c78b7e1
KH
10367\n\
10368When you visit a file with this coding, the file is read into a\n\
10369unibyte buffer as is, thus each byte of a file is treated as a\n\
10370character.");
8f924df7
KH
10371 plist[14] = intern (":eol-type");
10372 plist[15] = args[coding_arg_eol_type] = Qunix;
10373 args[coding_arg_plist] = Flist (16, plist);
2c78b7e1 10374 Fdefine_coding_system_internal (coding_arg_max, args);
ae6f73fa
KH
10375
10376 plist[1] = args[coding_arg_name] = Qundecided;
10377 plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10378 plist[5] = args[coding_arg_coding_type] = Qundecided;
10379 /* This is already set.
35befdaa 10380 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
ae6f73fa
KH
10381 plist[8] = intern (":charset-list");
10382 plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10383 plist[11] = args[coding_arg_for_unibyte] = Qnil;
10384 plist[13] = build_string ("No conversion on encoding, automatic conversion on decoding.");
10385 plist[15] = args[coding_arg_eol_type] = Qnil;
10386 args[coding_arg_plist] = Flist (16, plist);
10387 Fdefine_coding_system_internal (coding_arg_max, args);
2c78b7e1
KH
10388 }
10389
2c78b7e1 10390 setup_coding_system (Qno_conversion, &safe_terminal_coding);
ff563fce
KH
10391
10392 {
10393 int i;
10394
10395 for (i = 0; i < coding_category_max; i++)
10396 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10397 }
fcbcfb64
KH
10398#if defined (MSDOS) || defined (WINDOWSNT)
10399 system_eol_type = Qdos;
10400#else
10401 system_eol_type = Qunix;
10402#endif
10403 staticpro (&system_eol_type);
4ed46869
KH
10404}
10405
68c45bf0
PE
10406char *
10407emacs_strerror (error_number)
10408 int error_number;
10409{
10410 char *str;
10411
ca9c0567 10412 synchronize_system_messages_locale ();
68c45bf0
PE
10413 str = strerror (error_number);
10414
10415 if (! NILP (Vlocale_coding_system))
10416 {
10417 Lisp_Object dec = code_convert_string_norecord (build_string (str),
10418 Vlocale_coding_system,
10419 0);
d5db4077 10420 str = (char *) SDATA (dec);
68c45bf0
PE
10421 }
10422
10423 return str;
10424}
10425
4ed46869 10426#endif /* emacs */
9ffd559c
KH
10427
10428/* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
10429 (do not change this comment) */