(detect_coding_charset): For iso-8859-* coding systems,
[bpt/emacs.git] / src / coding.c
CommitLineData
9542cb1f 1/* Coding system handler (conversion, detection, etc).
aaef169d 2 Copyright (C) 2001, 2002, 2003, 2004, 2005,
8cabe764 3 2006, 2007, 2008 Free Software Foundation, Inc.
7976eda0 4 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
8cabe764 5 2005, 2006, 2007, 2008
ce03bf76
KH
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H14PRO021
8f924df7 8 Copyright (C) 2003
df7492f9
KH
9 National Institute of Advanced Industrial Science and Technology (AIST)
10 Registration Number H13PRO009
4ed46869 11
369314dc
KH
12This file is part of GNU Emacs.
13
9ec0b715 14GNU Emacs is free software: you can redistribute it and/or modify
369314dc 15it under the terms of the GNU General Public License as published by
9ec0b715
GM
16the Free Software Foundation, either version 3 of the License, or
17(at your option) any later version.
4ed46869 18
369314dc
KH
19GNU Emacs is distributed in the hope that it will be useful,
20but WITHOUT ANY WARRANTY; without even the implied warranty of
21MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22GNU General Public License for more details.
4ed46869 23
369314dc 24You should have received a copy of the GNU General Public License
9ec0b715 25along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
4ed46869
KH
26
27/*** TABLE OF CONTENTS ***
28
b73bfc1c 29 0. General comments
4ed46869 30 1. Preamble
df7492f9
KH
31 2. Emacs' internal format (emacs-utf-8) handlers
32 3. UTF-8 handlers
33 4. UTF-16 handlers
34 5. Charset-base coding systems handlers
35 6. emacs-mule (old Emacs' internal format) handlers
36 7. ISO2022 handlers
37 8. Shift-JIS and BIG5 handlers
38 9. CCL handlers
39 10. C library functions
40 11. Emacs Lisp library functions
41 12. Postamble
4ed46869
KH
42
43*/
44
df7492f9 45/*** 0. General comments ***
b73bfc1c
KH
46
47
df7492f9 48CODING SYSTEM
4ed46869 49
5bad0796
DL
50 A coding system is an object for an encoding mechanism that contains
51 information about how to convert byte sequences to character
e19c3639
KH
52 sequences and vice versa. When we say "decode", it means converting
53 a byte sequence of a specific coding system into a character
54 sequence that is represented by Emacs' internal coding system
55 `emacs-utf-8', and when we say "encode", it means converting a
56 character sequence of emacs-utf-8 to a byte sequence of a specific
0ef69138 57 coding system.
4ed46869 58
e19c3639
KH
59 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
60 C level, a coding system is represented by a vector of attributes
5bad0796 61 stored in the hash table Vcoding_system_hash_table. The conversion from
e19c3639
KH
62 coding system symbol to attributes vector is done by looking up
63 Vcoding_system_hash_table by the symbol.
4ed46869 64
e19c3639 65 Coding systems are classified into the following types depending on
5bad0796 66 the encoding mechanism. Here's a brief description of the types.
4ed46869 67
df7492f9
KH
68 o UTF-8
69
70 o UTF-16
71
72 o Charset-base coding system
73
74 A coding system defined by one or more (coded) character sets.
5bad0796 75 Decoding and encoding are done by a code converter defined for each
df7492f9
KH
76 character set.
77
5bad0796 78 o Old Emacs internal format (emacs-mule)
df7492f9 79
5bad0796 80 The coding system adopted by old versions of Emacs (20 and 21).
4ed46869 81
df7492f9 82 o ISO2022-base coding system
4ed46869
KH
83
84 The most famous coding system for multiple character sets. X's
df7492f9
KH
85 Compound Text, various EUCs (Extended Unix Code), and coding systems
86 used in the Internet communication such as ISO-2022-JP are all
87 variants of ISO2022.
4ed46869 88
df7492f9 89 o SJIS (or Shift-JIS or MS-Kanji-Code)
93dec019 90
4ed46869
KH
91 A coding system to encode character sets: ASCII, JISX0201, and
92 JISX0208. Widely used for PC's in Japan. Details are described in
df7492f9 93 section 8.
4ed46869 94
df7492f9 95 o BIG5
4ed46869 96
df7492f9 97 A coding system to encode character sets: ASCII and Big5. Widely
cfb43547 98 used for Chinese (mainly in Taiwan and Hong Kong). Details are
df7492f9
KH
99 described in section 8. In this file, when we write "big5" (all
100 lowercase), we mean the coding system, and when we write "Big5"
101 (capitalized), we mean the character set.
4ed46869 102
df7492f9 103 o CCL
27901516 104
5bad0796 105 If a user wants to decode/encode text encoded in a coding system
df7492f9
KH
106 not listed above, he can supply a decoder and an encoder for it in
107 CCL (Code Conversion Language) programs. Emacs executes the CCL
108 program while decoding/encoding.
27901516 109
df7492f9 110 o Raw-text
4ed46869 111
5a936b46 112 A coding system for text containing raw eight-bit data. Emacs
5bad0796 113 treats each byte of source text as a character (except for
df7492f9 114 end-of-line conversion).
4ed46869 115
df7492f9
KH
116 o No-conversion
117
118 Like raw text, but don't do end-of-line conversion.
4ed46869 119
4ed46869 120
df7492f9 121END-OF-LINE FORMAT
4ed46869 122
5bad0796 123 How text end-of-line is encoded depends on the operating system. For
df7492f9 124 instance, Unix's format is just one byte of LF (line-feed) code,
f4dee582 125 whereas DOS's format is a two-byte sequence of `carriage-return' and
d46c5b12
KH
126 `line-feed' codes. MacOS's format is usually one byte of
127 `carriage-return'.
4ed46869 128
cfb43547 129 Since text character encoding and end-of-line encoding are
df7492f9
KH
130 independent, any coding system described above can take any format
131 of end-of-line (except for no-conversion).
4ed46869 132
e19c3639
KH
133STRUCT CODING_SYSTEM
134
135 Before using a coding system for code conversion (i.e. decoding and
136 encoding), we setup a structure of type `struct coding_system'.
137 This structure keeps various information about a specific code
5bad0796 138 conversion (e.g. the location of source and destination data).
4ed46869
KH
139
140*/
141
df7492f9
KH
142/* COMMON MACROS */
143
144
4ed46869
KH
145/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
146
df7492f9 147 These functions check if a byte sequence specified as a source in
ff0dacd7
KH
148 CODING conforms to the format of XXX, and update the members of
149 DETECT_INFO.
df7492f9 150
ff0dacd7 151 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
df7492f9
KH
152
153 Below is the template of these functions. */
154
4ed46869 155#if 0
df7492f9 156static int
ff0dacd7 157detect_coding_XXX (coding, detect_info)
df7492f9 158 struct coding_system *coding;
ff0dacd7 159 struct coding_detection_info *detect_info;
4ed46869 160{
f1d34bca
MB
161 const unsigned char *src = coding->source;
162 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9 163 int multibytep = coding->src_multibyte;
ff0dacd7 164 int consumed_chars = 0;
df7492f9
KH
165 int found = 0;
166 ...;
167
168 while (1)
169 {
170 /* Get one byte from the source. If the souce is exausted, jump
171 to no_more_source:. */
172 ONE_MORE_BYTE (c);
ff0dacd7
KH
173
174 if (! __C_conforms_to_XXX___ (c))
175 break;
176 if (! __C_strongly_suggests_XXX__ (c))
177 found = CATEGORY_MASK_XXX;
df7492f9 178 }
ff0dacd7
KH
179 /* The byte sequence is invalid for XXX. */
180 detect_info->rejected |= CATEGORY_MASK_XXX;
df7492f9 181 return 0;
ff0dacd7 182
df7492f9 183 no_more_source:
ff0dacd7
KH
184 /* The source exausted successfully. */
185 detect_info->found |= found;
df7492f9 186 return 1;
4ed46869
KH
187}
188#endif
189
190/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
191
df7492f9
KH
192 These functions decode a byte sequence specified as a source by
193 CODING. The resulting multibyte text goes to a place pointed to by
194 CODING->charbuf, the length of which should not exceed
195 CODING->charbuf_size.
d46c5b12 196
df7492f9
KH
197 These functions set the information of original and decoded texts in
198 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
199 They also set CODING->result to one of CODING_RESULT_XXX indicating
200 how the decoding is finished.
d46c5b12 201
df7492f9 202 Below is the template of these functions. */
d46c5b12 203
4ed46869 204#if 0
b73bfc1c 205static void
df7492f9 206decode_coding_XXXX (coding)
4ed46869 207 struct coding_system *coding;
4ed46869 208{
f1d34bca
MB
209 const unsigned char *src = coding->source + coding->consumed;
210 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
211 /* SRC_BASE remembers the start position in source in each loop.
212 The loop will be exited when there's not enough source code, or
213 when there's no room in CHARBUF for a decoded character. */
f1d34bca 214 const unsigned char *src_base;
df7492f9 215 /* A buffer to produce decoded characters. */
69a80ea3
KH
216 int *charbuf = coding->charbuf + coding->charbuf_used;
217 int *charbuf_end = coding->charbuf + coding->charbuf_size;
df7492f9
KH
218 int multibytep = coding->src_multibyte;
219
220 while (1)
221 {
222 src_base = src;
223 if (charbuf < charbuf_end)
224 /* No more room to produce a decoded character. */
225 break;
226 ONE_MORE_BYTE (c);
227 /* Decode it. */
228 }
229
230 no_more_source:
231 if (src_base < src_end
232 && coding->mode & CODING_MODE_LAST_BLOCK)
233 /* If the source ends by partial bytes to construct a character,
234 treat them as eight-bit raw data. */
235 while (src_base < src_end && charbuf < charbuf_end)
236 *charbuf++ = *src_base++;
237 /* Remember how many bytes and characters we consumed. If the
238 source is multibyte, the bytes and chars are not identical. */
239 coding->consumed = coding->consumed_char = src_base - coding->source;
240 /* Remember how many characters we produced. */
241 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
242}
243#endif
244
245/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
246
df7492f9
KH
247 These functions encode SRC_BYTES length text at SOURCE of Emacs'
248 internal multibyte format by CODING. The resulting byte sequence
b73bfc1c
KH
249 goes to a place pointed to by DESTINATION, the length of which
250 should not exceed DST_BYTES.
d46c5b12 251
df7492f9
KH
252 These functions set the information of original and encoded texts in
253 the members produced, produced_char, consumed, and consumed_char of
254 the structure *CODING. They also set the member result to one of
255 CODING_RESULT_XXX indicating how the encoding finished.
d46c5b12 256
df7492f9
KH
257 DST_BYTES zero means that the source area and destination area
258 overlap, which means that we can produce encoded text until it
259 reaches the head of the not-yet-encoded source text.
d46c5b12 260
df7492f9 261 Below is a template of these functions. */
4ed46869 262#if 0
b73bfc1c 263static void
df7492f9 264encode_coding_XXX (coding)
4ed46869 265 struct coding_system *coding;
4ed46869 266{
df7492f9
KH
267 int multibytep = coding->dst_multibyte;
268 int *charbuf = coding->charbuf;
269 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
270 unsigned char *dst = coding->destination + coding->produced;
271 unsigned char *dst_end = coding->destination + coding->dst_bytes;
272 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
273 int produced_chars = 0;
274
275 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
276 {
277 int c = *charbuf;
278 /* Encode C into DST, and increment DST. */
279 }
280 label_no_more_destination:
281 /* How many chars and bytes we produced. */
282 coding->produced_char += produced_chars;
283 coding->produced = dst - coding->destination;
4ed46869
KH
284}
285#endif
286
4ed46869
KH
287\f
288/*** 1. Preamble ***/
289
68c45bf0 290#include <config.h>
4ed46869
KH
291#include <stdio.h>
292
4ed46869
KH
293#include "lisp.h"
294#include "buffer.h"
df7492f9 295#include "character.h"
4ed46869
KH
296#include "charset.h"
297#include "ccl.h"
df7492f9 298#include "composite.h"
4ed46869
KH
299#include "coding.h"
300#include "window.h"
b8299c66
KL
301#include "frame.h"
302#include "termhooks.h"
4ed46869 303
df7492f9 304Lisp_Object Vcoding_system_hash_table;
4ed46869 305
df7492f9 306Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
1965cb73
DL
307Lisp_Object Qunix, Qdos;
308extern Lisp_Object Qmac; /* frame.c */
4ed46869
KH
309Lisp_Object Qbuffer_file_coding_system;
310Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
df7492f9 311Lisp_Object Qdefault_char;
27901516 312Lisp_Object Qno_conversion, Qundecided;
df7492f9 313Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
b49a1807 314Lisp_Object Qbig, Qlittle;
bb0115a2 315Lisp_Object Qcoding_system_history;
1397dc18 316Lisp_Object Qvalid_codes;
a6f87d34
KH
317Lisp_Object QCcategory, QCmnemonic, QCdefalut_char;
318Lisp_Object QCdecode_translation_table, QCencode_translation_table;
319Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
35befdaa 320Lisp_Object QCascii_compatible_p;
4ed46869
KH
321
322extern Lisp_Object Qinsert_file_contents, Qwrite_region;
387f6ba5 323Lisp_Object Qcall_process, Qcall_process_region;
4ed46869
KH
324Lisp_Object Qstart_process, Qopen_network_stream;
325Lisp_Object Qtarget_idx;
326
065e3595
KH
327Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
328Lisp_Object Qinterrupted, Qinsufficient_memory;
329
c7183fb8
GM
330extern Lisp_Object Qcompletion_ignore_case;
331
44e8490d
KH
332/* If a symbol has this property, evaluate the value to define the
333 symbol as a coding system. */
334static Lisp_Object Qcoding_system_define_form;
335
5d5bf4d8
KH
336int coding_system_require_warning;
337
d46c5b12
KH
338Lisp_Object Vselect_safe_coding_system_function;
339
7722baf9
EZ
340/* Mnemonic string for each format of end-of-line. */
341Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
342/* Mnemonic string to indicate format of end-of-line is not yet
4ed46869 343 decided. */
7722baf9 344Lisp_Object eol_mnemonic_undecided;
4ed46869 345
fcbcfb64
KH
346/* Format of end-of-line decided by system. This is Qunix on
347 Unix and Mac, Qdos on DOS/Windows.
348 This has an effect only for external encoding (i.e. for output to
349 file and process), not for in-buffer or Lisp string encoding. */
350static Lisp_Object system_eol_type;
351
4ed46869
KH
352#ifdef emacs
353
4608c386
KH
354Lisp_Object Vcoding_system_list, Vcoding_system_alist;
355
356Lisp_Object Qcoding_system_p, Qcoding_system_error;
4ed46869 357
d46c5b12
KH
358/* Coding system emacs-mule and raw-text are for converting only
359 end-of-line format. */
360Lisp_Object Qemacs_mule, Qraw_text;
8f924df7 361Lisp_Object Qutf_8_emacs;
ecf488bc 362
4ed46869
KH
363/* Coding-systems are handed between Emacs Lisp programs and C internal
364 routines by the following three variables. */
365/* Coding-system for reading files and receiving data from process. */
366Lisp_Object Vcoding_system_for_read;
367/* Coding-system for writing files and sending data to process. */
368Lisp_Object Vcoding_system_for_write;
369/* Coding-system actually used in the latest I/O. */
370Lisp_Object Vlast_coding_system_used;
065e3595
KH
371/* Set to non-nil when an error is detected during code conversion. */
372Lisp_Object Vlast_code_conversion_error;
c4825358 373/* A vector of length 256 which contains information about special
94487c4e 374 Latin codes (especially for dealing with Microsoft codes). */
3f003981 375Lisp_Object Vlatin_extra_code_table;
c4825358 376
9ce27fde
KH
377/* Flag to inhibit code conversion of end-of-line format. */
378int inhibit_eol_conversion;
379
74383408
KH
380/* Flag to inhibit ISO2022 escape sequence detection. */
381int inhibit_iso_escape_detection;
382
ed29121d
EZ
383/* Flag to make buffer-file-coding-system inherit from process-coding. */
384int inherit_process_coding_system;
385
c4825358
KH
386/* Coding system to be used to encode text for terminal display when
387 terminal coding system is nil. */
388struct coding_system safe_terminal_coding;
389
02ba4723
KH
390Lisp_Object Vfile_coding_system_alist;
391Lisp_Object Vprocess_coding_system_alist;
392Lisp_Object Vnetwork_coding_system_alist;
4ed46869 393
68c45bf0
PE
394Lisp_Object Vlocale_coding_system;
395
4ed46869
KH
396#endif /* emacs */
397
f967223b
KH
398/* Flag to tell if we look up translation table on character code
399 conversion. */
84fbb8a0 400Lisp_Object Venable_character_translation;
f967223b
KH
401/* Standard translation table to look up on decoding (reading). */
402Lisp_Object Vstandard_translation_table_for_decode;
403/* Standard translation table to look up on encoding (writing). */
404Lisp_Object Vstandard_translation_table_for_encode;
84fbb8a0 405
f967223b
KH
406Lisp_Object Qtranslation_table;
407Lisp_Object Qtranslation_table_id;
408Lisp_Object Qtranslation_table_for_decode;
409Lisp_Object Qtranslation_table_for_encode;
4ed46869
KH
410
411/* Alist of charsets vs revision number. */
df7492f9 412static Lisp_Object Vcharset_revision_table;
4ed46869 413
02ba4723
KH
414/* Default coding systems used for process I/O. */
415Lisp_Object Vdefault_process_coding_system;
416
002fdb44
DL
417/* Char table for translating Quail and self-inserting input. */
418Lisp_Object Vtranslation_table_for_input;
419
df7492f9
KH
420/* Two special coding systems. */
421Lisp_Object Vsjis_coding_system;
422Lisp_Object Vbig5_coding_system;
423
df7492f9
KH
424/* ISO2022 section */
425
426#define CODING_ISO_INITIAL(coding, reg) \
427 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
428 coding_attr_iso_initial), \
429 reg)))
430
431
/* Return the `safe_charsets' entry of CODING for CHARSET_ID, or -1 if
   CHARSET_ID is greater than CODING->max_charset_id.  CHARSET_ID is
   evaluated twice, so it must not have side effects.  */
#define CODING_ISO_REQUEST(coding, charset_id)          \
  (((charset_id) <= (coding)->max_charset_id            \
    ? (coding)->safe_charsets[(charset_id)]             \
    : -1))
436
437
438#define CODING_ISO_FLAGS(coding) \
439 ((coding)->spec.iso_2022.flags)
440#define CODING_ISO_DESIGNATION(coding, reg) \
441 ((coding)->spec.iso_2022.current_designation[reg])
442#define CODING_ISO_INVOCATION(coding, plane) \
443 ((coding)->spec.iso_2022.current_invocation[plane])
444#define CODING_ISO_SINGLE_SHIFTING(coding) \
445 ((coding)->spec.iso_2022.single_shifting)
446#define CODING_ISO_BOL(coding) \
447 ((coding)->spec.iso_2022.bol)
448#define CODING_ISO_INVOKED_CHARSET(coding, plane) \
449 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
450
451/* Control characters of ISO2022. */
452 /* code */ /* function */
453#define ISO_CODE_LF 0x0A /* line-feed */
454#define ISO_CODE_CR 0x0D /* carriage-return */
455#define ISO_CODE_SO 0x0E /* shift-out */
456#define ISO_CODE_SI 0x0F /* shift-in */
457#define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
458#define ISO_CODE_ESC 0x1B /* escape */
459#define ISO_CODE_SS2 0x8E /* single-shift-2 */
460#define ISO_CODE_SS3 0x8F /* single-shift-3 */
461#define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
462
463/* Every 1-byte code of ISO2022 is classified into one of the
464 following classes. */
465enum iso_code_class_type
466 {
467 ISO_control_0, /* Control codes in the range
468 0x00..0x1F and 0x7F, except for the
469 following 4 codes. */
df7492f9
KH
470 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
471 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
472 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
473 ISO_escape, /* ISO_CODE_ESC (0x1B) */
474 ISO_control_1, /* Control codes in the range
475 0x80..0x9F, except for the
476 following 3 codes. */
477 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
478 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
479 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
480 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
481 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
482 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
483 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
484 };
05e6f5dc 485
df7492f9
KH
486/** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
487 `iso-flags' attribute of an iso2022 coding system. */
05e6f5dc 488
df7492f9
KH
489/* If set, produce long-form designation sequence (e.g. ESC $ ( A)
490 instead of the correct short-form sequence (e.g. ESC $ A). */
491#define CODING_ISO_FLAG_LONG_FORM 0x0001
93dec019 492
df7492f9
KH
493/* If set, reset graphic planes and registers at end-of-line to the
494 initial state. */
495#define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
05e6f5dc 496
df7492f9
KH
497/* If set, reset graphic planes and registers before any control
498 characters to the initial state. */
499#define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
05e6f5dc 500
df7492f9
KH
501/* If set, encode by 7-bit environment. */
502#define CODING_ISO_FLAG_SEVEN_BITS 0x0008
4ed46869 503
df7492f9
KH
504/* If set, use locking-shift function. */
505#define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
b73bfc1c 506
df7492f9
KH
507/* If set, use single-shift function. Overwrite
508 CODING_ISO_FLAG_LOCKING_SHIFT. */
509#define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
b73bfc1c 510
df7492f9
KH
511/* If set, use designation escape sequence. */
512#define CODING_ISO_FLAG_DESIGNATION 0x0040
b73bfc1c 513
df7492f9
KH
514/* If set, produce revision number sequence. */
515#define CODING_ISO_FLAG_REVISION 0x0080
b73bfc1c 516
df7492f9
KH
517/* If set, produce ISO6429's direction specifying sequence. */
518#define CODING_ISO_FLAG_DIRECTION 0x0100
f4dee582 519
df7492f9
KH
520/* If set, assume designation states are reset at beginning of line on
521 output. */
522#define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
4ed46869 523
df7492f9
KH
524/* If set, designation sequence should be placed at beginning of line
525 on output. */
526#define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
aa72b389 527
df7492f9
KH
528/* If set, do not encode unsafe characters on output. */
529#define CODING_ISO_FLAG_SAFE 0x0800
aa72b389 530
df7492f9
KH
531/* If set, extra latin codes (128..159) are accepted as a valid code
532 on input. */
533#define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
aa72b389 534
df7492f9 535#define CODING_ISO_FLAG_COMPOSITION 0x2000
aa72b389 536
df7492f9 537#define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
aa72b389 538
bf16eb23 539#define CODING_ISO_FLAG_USE_ROMAN 0x8000
aa72b389 540
bf16eb23 541#define CODING_ISO_FLAG_USE_OLDJIS 0x10000
aa72b389 542
bf16eb23 543#define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
aa72b389 544
df7492f9
KH
545/* A character to be produced on output if encoding of the original
546 character is prohibited by CODING_ISO_FLAG_SAFE. */
547#define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
aa72b389 548
a470d443
KH
549/* UTF-8 section */
550#define CODING_UTF_8_BOM(coding) \
551 ((coding)->spec.utf_8_bom)
4ed46869 552
df7492f9
KH
553/* UTF-16 section */
554#define CODING_UTF_16_BOM(coding) \
555 ((coding)->spec.utf_16.bom)
4ed46869 556
df7492f9
KH
557#define CODING_UTF_16_ENDIAN(coding) \
558 ((coding)->spec.utf_16.endian)
4ed46869 559
df7492f9
KH
560#define CODING_UTF_16_SURROGATE(coding) \
561 ((coding)->spec.utf_16.surrogate)
4ed46869 562
4ed46869 563
df7492f9
KH
564/* CCL section */
565#define CODING_CCL_DECODER(coding) \
566 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
567#define CODING_CCL_ENCODER(coding) \
568 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
569#define CODING_CCL_VALIDS(coding) \
8f924df7 570 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
4ed46869 571
5a936b46 572/* Index for each coding category in `coding_categories' */
4ed46869 573
df7492f9
KH
574enum coding_category
575 {
576 coding_category_iso_7,
577 coding_category_iso_7_tight,
578 coding_category_iso_8_1,
579 coding_category_iso_8_2,
580 coding_category_iso_7_else,
581 coding_category_iso_8_else,
a470d443
KH
582 coding_category_utf_8_auto,
583 coding_category_utf_8_nosig,
584 coding_category_utf_8_sig,
df7492f9
KH
585 coding_category_utf_16_auto,
586 coding_category_utf_16_be,
587 coding_category_utf_16_le,
588 coding_category_utf_16_be_nosig,
589 coding_category_utf_16_le_nosig,
590 coding_category_charset,
591 coding_category_sjis,
592 coding_category_big5,
593 coding_category_ccl,
594 coding_category_emacs_mule,
595 /* All above are targets of code detection. */
596 coding_category_raw_text,
597 coding_category_undecided,
598 coding_category_max
599 };
600
601/* Definitions of flag bits used in detect_coding_XXXX. */
602#define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
603#define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
604#define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
605#define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
606#define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
607#define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
a470d443
KH
608#define CATEGORY_MASK_UTF_8_AUTO (1 << coding_category_utf_8_auto)
609#define CATEGORY_MASK_UTF_8_NOSIG (1 << coding_category_utf_8_nosig)
610#define CATEGORY_MASK_UTF_8_SIG (1 << coding_category_utf_8_sig)
b49a1807 611#define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
df7492f9
KH
612#define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
613#define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
614#define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
615#define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
616#define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
617#define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
618#define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
619#define CATEGORY_MASK_CCL (1 << coding_category_ccl)
620#define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
ff0dacd7 621#define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
df7492f9
KH
622
623/* This value is returned if detect_coding_mask () finds nothing other
624 than ASCII characters. */
625#define CATEGORY_MASK_ANY \
626 (CATEGORY_MASK_ISO_7 \
627 | CATEGORY_MASK_ISO_7_TIGHT \
628 | CATEGORY_MASK_ISO_8_1 \
629 | CATEGORY_MASK_ISO_8_2 \
630 | CATEGORY_MASK_ISO_7_ELSE \
631 | CATEGORY_MASK_ISO_8_ELSE \
a470d443
KH
632 | CATEGORY_MASK_UTF_8_AUTO \
633 | CATEGORY_MASK_UTF_8_NOSIG \
634 | CATEGORY_MASK_UTF_8_SIG \
2f3cbb32 635 | CATEGORY_MASK_UTF_16_AUTO \
df7492f9
KH
636 | CATEGORY_MASK_UTF_16_BE \
637 | CATEGORY_MASK_UTF_16_LE \
638 | CATEGORY_MASK_UTF_16_BE_NOSIG \
639 | CATEGORY_MASK_UTF_16_LE_NOSIG \
640 | CATEGORY_MASK_CHARSET \
641 | CATEGORY_MASK_SJIS \
642 | CATEGORY_MASK_BIG5 \
643 | CATEGORY_MASK_CCL \
644 | CATEGORY_MASK_EMACS_MULE)
645
646
647#define CATEGORY_MASK_ISO_7BIT \
648 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
649
650#define CATEGORY_MASK_ISO_8BIT \
651 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
652
653#define CATEGORY_MASK_ISO_ELSE \
654 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
655
656#define CATEGORY_MASK_ISO_ESCAPE \
657 (CATEGORY_MASK_ISO_7 \
658 | CATEGORY_MASK_ISO_7_TIGHT \
659 | CATEGORY_MASK_ISO_7_ELSE \
660 | CATEGORY_MASK_ISO_8_ELSE)
661
662#define CATEGORY_MASK_ISO \
663 ( CATEGORY_MASK_ISO_7BIT \
664 | CATEGORY_MASK_ISO_8BIT \
665 | CATEGORY_MASK_ISO_ELSE)
666
667#define CATEGORY_MASK_UTF_16 \
2f3cbb32
KH
668 (CATEGORY_MASK_UTF_16_AUTO \
669 | CATEGORY_MASK_UTF_16_BE \
df7492f9
KH
670 | CATEGORY_MASK_UTF_16_LE \
671 | CATEGORY_MASK_UTF_16_BE_NOSIG \
672 | CATEGORY_MASK_UTF_16_LE_NOSIG)
673
a470d443
KH
674#define CATEGORY_MASK_UTF_8 \
675 (CATEGORY_MASK_UTF_8_AUTO \
676 | CATEGORY_MASK_UTF_8_NOSIG \
677 | CATEGORY_MASK_UTF_8_SIG)
df7492f9
KH
678
679/* List of symbols `coding-category-xxx' ordered by priority. This
680 variable is exposed to Emacs Lisp. */
681static Lisp_Object Vcoding_category_list;
682
683/* Table of coding categories (Lisp symbols). This variable is for
684 internal use only. */
685static Lisp_Object Vcoding_category_table;
686
687/* Table of coding-categories ordered by priority. */
688static enum coding_category coding_priorities[coding_category_max];
689
690/* Nth element is a coding context for the coding system bound to the
691 Nth coding category. */
692static struct coding_system coding_categories[coding_category_max];
693
df7492f9
KH
694/*** Commonly used macros and functions ***/
695
696#ifndef min
697#define min(a, b) ((a) < (b) ? (a) : (b))
698#endif
699#ifndef max
700#define max(a, b) ((a) > (b) ? (a) : (b))
701#endif
4ed46869 702
24a73b0a
KH
703#define CODING_GET_INFO(coding, attrs, charset_list) \
704 do { \
705 (attrs) = CODING_ID_ATTRS ((coding)->id); \
706 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
df7492f9 707 } while (0)
4ed46869 708
4ed46869 709
df7492f9
KH
710/* Safely get one byte from the source text pointed to by SRC which ends
711 at SRC_END, and set C to that byte. If there are not enough bytes
065e3595
KH
712 in the source, it jumps to `no_more_source'. If multibytep is
713 nonzero, and a multibyte character is found at SRC, set C to the
714 negative value of the character code. The caller should declare
715 and set these variables appropriately in advance:
716 src, src_base, src_end, multibytep, consumed_chars, coding */
aa72b389 717
065e3595
KH
718#define ONE_MORE_BYTE(c) \
719 do { \
720 if (src == src_end) \
721 { \
722 if (src_base < src) \
723 record_conversion_result \
724 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
725 goto no_more_source; \
726 } \
727 c = *src++; \
728 if (multibytep && (c & 0x80)) \
729 { \
730 if ((c & 0xFE) == 0xC0) \
731 c = ((c & 1) << 6) | *src++; \
732 else \
733 { \
35befdaa
KH
734 src--; \
735 c = - string_char (src, &src, NULL); \
065e3595
KH
736 record_conversion_result \
737 (coding, CODING_RESULT_INVALID_SRC); \
738 } \
739 } \
740 consumed_chars++; \
aa72b389
KH
741 } while (0)
742
aa72b389 743
065e3595
KH
744#define ONE_MORE_BYTE_NO_CHECK(c) \
745 do { \
746 c = *src++; \
747 if (multibytep && (c & 0x80)) \
748 { \
749 if ((c & 0xFE) == 0xC0) \
750 c = ((c & 1) << 6) | *src++; \
751 else \
752 { \
35befdaa
KH
753 src--; \
754 c = - string_char (src, &src, NULL); \
065e3595
KH
755 record_conversion_result \
756 (coding, CODING_RESULT_INVALID_SRC); \
757 } \
758 } \
759 consumed_chars++; \
aa72b389
KH
760 } while (0)
761
aa72b389 762
df7492f9
KH
763/* Store a byte C in the place pointed by DST and increment DST to the
764 next free point, and increment PRODUCED_CHARS. The caller should
765 assure that C is 0..127, and declare and set the variable `dst'
766 appropriately in advance.
767*/
aa72b389
KH
768
769
df7492f9
KH
770#define EMIT_ONE_ASCII_BYTE(c) \
771 do { \
772 produced_chars++; \
773 *dst++ = (c); \
b6871cc7 774 } while (0)
aa72b389
KH
775
776
df7492f9 777/* Like EMIT_ONE_ASCII_BYTE but store two bytes: C1 and C2. */
aa72b389 778
df7492f9
KH
779#define EMIT_TWO_ASCII_BYTES(c1, c2) \
780 do { \
781 produced_chars += 2; \
782 *dst++ = (c1), *dst++ = (c2); \
783 } while (0)
aa72b389
KH
784
785
df7492f9
KH
/* Store a byte C in the place pointed to by DST, advance DST to the
   next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
   nonzero, store C in multibyte form: a value of 0x80 or above is
   first converted with BYTE8_TO_CHAR, and the result is written with
   CHAR_STRING_ADVANCE.  The caller should declare and set the
   variables `dst', `multibytep' and `produced_chars' appropriately
   in advance.

   The temporary has a macro-specific name so that an argument
   expression mentioning a caller variable called `ch' is not
   captured by it.  */

#define EMIT_ONE_BYTE(c)                                \
  do {                                                  \
    produced_chars++;                                   \
    if (multibytep)                                     \
      {                                                 \
        int emit_one_byte_ch_ = (c);                    \
        if (emit_one_byte_ch_ >= 0x80)                  \
          emit_one_byte_ch_                             \
            = BYTE8_TO_CHAR (emit_one_byte_ch_);        \
        CHAR_STRING_ADVANCE (emit_one_byte_ch_, dst);   \
      }                                                 \
    else                                                \
      *dst++ = (c);                                     \
  } while (0)
aa72b389 805
aa72b389 806
df7492f9 807/* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
aa72b389 808
e19c3639
KH
/* Store the two bytes C1 and C2, in that order, at DST, advance DST
   past them, and add 2 to PRODUCED_CHARS.  If MULTIBYTEP is nonzero,
   each byte is stored in multibyte form: a value of 0x80 or above is
   first converted with BYTE8_TO_CHAR, and the result is written with
   CHAR_STRING_ADVANCE.  The caller should declare and set the
   variables `dst', `multibytep' and `produced_chars' in advance.

   The temporary has a macro-specific name so that an argument
   expression mentioning a caller variable called `ch' is not
   captured by it.  */
#define EMIT_TWO_BYTES(c1, c2)                          \
  do {                                                  \
    produced_chars += 2;                                \
    if (multibytep)                                     \
      {                                                 \
        int emit_two_bytes_ch_;                         \
                                                        \
        emit_two_bytes_ch_ = (c1);                      \
        if (emit_two_bytes_ch_ >= 0x80)                 \
          emit_two_bytes_ch_                            \
            = BYTE8_TO_CHAR (emit_two_bytes_ch_);       \
        CHAR_STRING_ADVANCE (emit_two_bytes_ch_, dst);  \
        emit_two_bytes_ch_ = (c2);                      \
        if (emit_two_bytes_ch_ >= 0x80)                 \
          emit_two_bytes_ch_                            \
            = BYTE8_TO_CHAR (emit_two_bytes_ch_);       \
        CHAR_STRING_ADVANCE (emit_two_bytes_ch_, dst);  \
      }                                                 \
    else                                                \
      {                                                 \
        *dst++ = (c1);                                  \
        *dst++ = (c2);                                  \
      }                                                 \
  } while (0)
831
832
df7492f9
KH
833#define EMIT_THREE_BYTES(c1, c2, c3) \
834 do { \
835 EMIT_ONE_BYTE (c1); \
836 EMIT_TWO_BYTES (c2, c3); \
837 } while (0)
aa72b389 838
aa72b389 839
df7492f9
KH
/* Emit the four bytes C1, C2, C3 and C4 in that order, with the same
   multibyte handling and counting as EMIT_ONE_BYTE.  */

#define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
  do {                                          \
    EMIT_ONE_BYTE (c1);                         \
    EMIT_THREE_BYTES (c2, c3, c4);              \
  } while (0)
aa72b389 845
aa72b389 846
f6cbaf43
KH
847/* Prototypes for static functions. */
848static void record_conversion_result P_ ((struct coding_system *coding,
849 enum coding_result_code result));
850static int detect_coding_utf_8 P_ ((struct coding_system *,
851 struct coding_detection_info *info));
852static void decode_coding_utf_8 P_ ((struct coding_system *));
853static int encode_coding_utf_8 P_ ((struct coding_system *));
854
855static int detect_coding_utf_16 P_ ((struct coding_system *,
856 struct coding_detection_info *info));
857static void decode_coding_utf_16 P_ ((struct coding_system *));
858static int encode_coding_utf_16 P_ ((struct coding_system *));
859
860static int detect_coding_iso_2022 P_ ((struct coding_system *,
861 struct coding_detection_info *info));
862static void decode_coding_iso_2022 P_ ((struct coding_system *));
863static int encode_coding_iso_2022 P_ ((struct coding_system *));
864
865static int detect_coding_emacs_mule P_ ((struct coding_system *,
866 struct coding_detection_info *info));
867static void decode_coding_emacs_mule P_ ((struct coding_system *));
868static int encode_coding_emacs_mule P_ ((struct coding_system *));
869
870static int detect_coding_sjis P_ ((struct coding_system *,
871 struct coding_detection_info *info));
872static void decode_coding_sjis P_ ((struct coding_system *));
873static int encode_coding_sjis P_ ((struct coding_system *));
874
875static int detect_coding_big5 P_ ((struct coding_system *,
876 struct coding_detection_info *info));
877static void decode_coding_big5 P_ ((struct coding_system *));
878static int encode_coding_big5 P_ ((struct coding_system *));
879
880static int detect_coding_ccl P_ ((struct coding_system *,
881 struct coding_detection_info *info));
882static void decode_coding_ccl P_ ((struct coding_system *));
883static int encode_coding_ccl P_ ((struct coding_system *));
884
885static void decode_coding_raw_text P_ ((struct coding_system *));
886static int encode_coding_raw_text P_ ((struct coding_system *));
887
888static void coding_set_source P_ ((struct coding_system *));
889static void coding_set_destination P_ ((struct coding_system *));
890static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
891static void coding_alloc_by_making_gap P_ ((struct coding_system *,
287c57d7 892 EMACS_INT, EMACS_INT));
f6cbaf43
KH
893static unsigned char *alloc_destination P_ ((struct coding_system *,
894 EMACS_INT, unsigned char *));
895static void setup_iso_safe_charsets P_ ((Lisp_Object));
896static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
897 int *, int *,
898 unsigned char *));
899static int detect_eol P_ ((const unsigned char *,
900 EMACS_INT, enum coding_category));
901static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
902static void decode_eol P_ ((struct coding_system *));
903static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
904static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *,
905 int, int *, int *));
906static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
907static INLINE void produce_composition P_ ((struct coding_system *, int *,
908 EMACS_INT));
909static INLINE void produce_charset P_ ((struct coding_system *, int *,
910 EMACS_INT));
911static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
912static int decode_coding P_ ((struct coding_system *));
913static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
3ed051d4 914 struct coding_system *,
f6cbaf43
KH
915 int *, EMACS_INT *));
916static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
917 struct coding_system *,
918 int *, EMACS_INT *));
919static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
920static int encode_coding P_ ((struct coding_system *));
921static Lisp_Object make_conversion_work_buffer P_ ((int));
922static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
923static INLINE int char_encodable_p P_ ((int, Lisp_Object));
924static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
925
065e3595
KH
926static void
927record_conversion_result (struct coding_system *coding,
928 enum coding_result_code result)
929{
930 coding->result = result;
931 switch (result)
932 {
933 case CODING_RESULT_INSUFFICIENT_SRC:
934 Vlast_code_conversion_error = Qinsufficient_source;
935 break;
936 case CODING_RESULT_INCONSISTENT_EOL:
937 Vlast_code_conversion_error = Qinconsistent_eol;
938 break;
939 case CODING_RESULT_INVALID_SRC:
940 Vlast_code_conversion_error = Qinvalid_source;
941 break;
942 case CODING_RESULT_INTERRUPT:
943 Vlast_code_conversion_error = Qinterrupted;
944 break;
945 case CODING_RESULT_INSUFFICIENT_MEM:
946 Vlast_code_conversion_error = Qinsufficient_memory;
947 break;
35befdaa
KH
948 default:
949 Vlast_code_conversion_error = intern ("Unknown error");
065e3595
KH
950 }
951}
952
df7492f9
KH
/* Decode CODE of CHARSET into C with DECODE_CHAR.  The flag
   charset_map_loaded is cleared first; if it is set afterwards,
   coding_set_source is called to recompute CODING->source, and SRC,
   SRC_BASE and SRC_END are shifted by the distance CODING->source
   moved, so that they keep addressing the same source positions.  */

#define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
  do {                                                                       \
    charset_map_loaded = 0;                                                  \
    (c) = DECODE_CHAR (charset, code);                                       \
    if (charset_map_loaded)                                                  \
      {                                                                      \
        const unsigned char *orig = (coding)->source;                        \
        EMACS_INT offset;                                                    \
                                                                             \
        coding_set_source (coding);                                          \
        offset = (coding)->source - orig;                                    \
        (src) += offset;                                                     \
        (src_base) += offset;                                                \
        (src_end) += offset;                                                 \
      }                                                                      \
  } while (0)
969
970
119852e7
KH
/* Make sure that more than BYTES bytes of room remain at DST.  If
   DST + BYTES reaches or passes DST_END, call alloc_destination to
   enlarge CODING->destination by BYTES plus the number of elements
   still pending in the charbuf, then update DST and DST_END.  We
   don't have to take care of coding->source, which may be relocated;
   that is handled by calling coding_set_source in encode_coding.
   The caller must have `coding', `dst', `dst_end', `charbuf' and
   `charbuf_end' in scope.  */

#define ASSURE_DESTINATION(bytes)                               \
  do {                                                          \
    if (dst_end <= dst + (bytes))                               \
      {                                                         \
        int wanted = (charbuf_end - charbuf) + (bytes);         \
                                                                \
        dst = alloc_destination (coding, wanted, dst);          \
        dst_end = coding->destination + coding->dst_bytes;      \
      }                                                         \
  } while (0)
aa72b389 986
aa72b389 987
db274c7a
KH
/* Store the multibyte form of the character C in P, and advance P to
   the end of the multibyte form.  This is like CHAR_STRING_ADVANCE
   but it never calls MAYBE_UNIFY_CHAR.

   The length of the form is selected by comparing C against
   MAX_1_BYTE_CHAR ... MAX_5_BYTE_CHAR; anything larger is written by
   BYTE8_STRING after subtracting 0x3FFF80.

   Every use of C is parenthesized, so an expression argument such as
   `base + offset' is shifted and masked as a whole.  C is still
   evaluated several times, so it must not have side effects.  */

#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)              \
  do {                                                  \
    if ((c) <= MAX_1_BYTE_CHAR)                         \
      *(p)++ = (c);                                     \
    else if ((c) <= MAX_2_BYTE_CHAR)                    \
      *(p)++ = (0xC0 | ((c) >> 6)),                     \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_3_BYTE_CHAR)                    \
      *(p)++ = (0xE0 | ((c) >> 12)),                    \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_4_BYTE_CHAR)                    \
      *(p)++ = (0xF0 | ((c) >> 18)),                    \
        *(p)++ = (0x80 | (((c) >> 12) & 0x3F)),         \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_5_BYTE_CHAR)                    \
      *(p)++ = 0xF8,                                    \
        *(p)++ = (0x80 | (((c) >> 18) & 0x0F)),         \
        *(p)++ = (0x80 | (((c) >> 12) & 0x3F)),         \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else                                                \
      (p) += BYTE8_STRING ((c) - 0x3FFF80, p);          \
  } while (0)
1017
1018
/* Return the character code of the character whose multibyte form is
   at P, and advance P to the end of the multibyte form.  This is
   like STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR.

   The length (1 to 5 bytes) is chosen from the high bits of the
   first byte.  A two-byte form whose first byte is below 0xC2 gets
   0x3FFF80 or'ed into the result.  P is evaluated several times.  */

#define STRING_CHAR_ADVANCE_NO_UNIFY(p)                                 \
  (((p)[0] & 0x80) == 0                                                 \
   ? *(p)++                                                             \
   : ((p)[0] & 0x20) == 0                                               \
   ? ((p) += 2,                                                         \
      ((((p)[-2] & 0x1F) << 6)                                          \
       | ((p)[-1] & 0x3F)                                               \
       | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)))            \
   : ((p)[0] & 0x10) == 0                                               \
   ? ((p) += 3,                                                         \
      ((((p)[-3] & 0x0F) << 12)                                         \
       | (((p)[-2] & 0x3F) << 6)                                        \
       | ((p)[-1] & 0x3F)))                                             \
   : ((p)[0] & 0x08) == 0                                               \
   ? ((p) += 4,                                                         \
      ((((p)[-4] & 0xF) << 18)                                          \
       | (((p)[-3] & 0x3F) << 12)                                       \
       | (((p)[-2] & 0x3F) << 6)                                        \
       | ((p)[-1] & 0x3F)))                                             \
   : ((p) += 5,                                                         \
      ((((p)[-4] & 0x3F) << 18)                                         \
       | (((p)[-3] & 0x3F) << 12)                                       \
       | (((p)[-2] & 0x3F) << 6)                                        \
       | ((p)[-1] & 0x3F))))
1047
aa72b389 1048
df7492f9
KH
1049static void
1050coding_set_source (coding)
aa72b389 1051 struct coding_system *coding;
aa72b389 1052{
df7492f9
KH
1053 if (BUFFERP (coding->src_object))
1054 {
2cb26057 1055 struct buffer *buf = XBUFFER (coding->src_object);
aa72b389 1056
df7492f9 1057 if (coding->src_pos < 0)
2cb26057 1058 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
df7492f9 1059 else
2cb26057 1060 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
aa72b389 1061 }
df7492f9 1062 else if (STRINGP (coding->src_object))
aa72b389 1063 {
8f924df7 1064 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
aa72b389 1065 }
df7492f9
KH
1066 else
1067 /* Otherwise, the source is C string and is never relocated
1068 automatically. Thus we don't have to update anything. */
1069 ;
1070}
aa72b389 1071
df7492f9
KH
1072static void
1073coding_set_destination (coding)
1074 struct coding_system *coding;
1075{
1076 if (BUFFERP (coding->dst_object))
aa72b389 1077 {
df7492f9 1078 if (coding->src_pos < 0)
aa72b389 1079 {
13818c30 1080 coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
28f67a95
KH
1081 coding->dst_bytes = (GAP_END_ADDR
1082 - (coding->src_bytes - coding->consumed)
1083 - coding->destination);
aa72b389 1084 }
df7492f9 1085 else
28f67a95
KH
1086 {
1087 /* We are sure that coding->dst_pos_byte is before the gap
1088 of the buffer. */
1089 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
13818c30 1090 + coding->dst_pos_byte - BEG_BYTE);
28f67a95
KH
1091 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1092 - coding->destination);
1093 }
df7492f9
KH
1094 }
1095 else
1096 /* Otherwise, the destination is C string and is never relocated
1097 automatically. Thus we don't have to update anything. */
1098 ;
1099}
1100
1101
1102static void
1103coding_alloc_by_realloc (coding, bytes)
1104 struct coding_system *coding;
1105 EMACS_INT bytes;
1106{
1107 coding->destination = (unsigned char *) xrealloc (coding->destination,
1108 coding->dst_bytes + bytes);
1109 coding->dst_bytes += bytes;
1110}
1111
1112static void
db274c7a 1113coding_alloc_by_making_gap (coding, gap_head_used, bytes)
df7492f9 1114 struct coding_system *coding;
db274c7a 1115 EMACS_INT gap_head_used, bytes;
df7492f9 1116{
db274c7a 1117 if (EQ (coding->src_object, coding->dst_object))
df7492f9 1118 {
db274c7a
KH
1119 /* The gap may contain the produced data at the head and not-yet
1120 consumed data at the tail. To preserve those data, we at
1121 first make the gap size to zero, then increase the gap
1122 size. */
1123 EMACS_INT add = GAP_SIZE;
1124
1125 GPT += gap_head_used, GPT_BYTE += gap_head_used;
1126 GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
df7492f9
KH
1127 make_gap (bytes);
1128 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
db274c7a 1129 GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
df7492f9 1130 }
730fff51 1131 else
df7492f9 1132 {
2c78b7e1
KH
1133 Lisp_Object this_buffer;
1134
1135 this_buffer = Fcurrent_buffer ();
df7492f9
KH
1136 set_buffer_internal (XBUFFER (coding->dst_object));
1137 make_gap (bytes);
1138 set_buffer_internal (XBUFFER (this_buffer));
aa72b389 1139 }
df7492f9 1140}
8f924df7 1141
df7492f9
KH
1142
1143static unsigned char *
1144alloc_destination (coding, nbytes, dst)
1145 struct coding_system *coding;
3e139625 1146 EMACS_INT nbytes;
df7492f9
KH
1147 unsigned char *dst;
1148{
1149 EMACS_INT offset = dst - coding->destination;
1150
1151 if (BUFFERP (coding->dst_object))
db274c7a
KH
1152 {
1153 struct buffer *buf = XBUFFER (coding->dst_object);
1154
1155 coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1156 }
aa72b389 1157 else
df7492f9 1158 coding_alloc_by_realloc (coding, nbytes);
065e3595 1159 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
1160 coding_set_destination (coding);
1161 dst = coding->destination + offset;
1162 return dst;
1163}
aa72b389 1164
ff0dacd7
KH
1165/** Macros for annotations. */
1166
/* Maximum length of annotation data: the sum of the longest
   composition annotation (4 + MAX_COMPOSITION_COMPONENTS * 2 - 1
   elements) and of a charset annotation (4 elements).  */
#define MAX_ANNOTATION_LENGTH \
  ((4 + (MAX_COMPOSITION_COMPONENTS * 2) - 1) + 4)
ff0dacd7
KH
1170
/* An annotation data is stored in the array coding->charbuf in this
   format:
     [ -LENGTH ANNOTATION_MASK NCHARS ... ]
   LENGTH is the number of elements in the annotation.
   ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
   NCHARS is the number of characters in the text annotated.

   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
     ... METHOD [ COMPOSITION-COMPONENTS ... ]
   METHOD is one of enum composition_method.
   Optional COMPOSITION-COMPONENTS are characters and composition
   rules.

   In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
   follows.  */
1189
/* Store the three-element annotation header [ -LEN MASK NCHARS ] at
   BUF, advance BUF past it, and set coding->annotated.  The caller
   must have `coding' in scope.

   The expansion does not end in a semicolon, so that the macro
   behaves as a single statement, e.g. between `if' and `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1197
69a80ea3
KH
/* Store at BUF a composition annotation covering NCHARS characters:
   the header written by ADD_ANNOTATION_DATA with length 4 and
   CODING_ANNOTATE_COMPOSITION_MASK, followed by METHOD.  BUF is
   advanced past the four elements.  */

#define ADD_COMPOSITION_DATA(buf, nchars, method)                            \
  do {                                                                       \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_COMPOSITION_MASK, nchars);  \
    *(buf)++ = (method);                                                     \
  } while (0)
1203
1204
69a80ea3
KH
/* Store at BUF a charset annotation covering NCHARS characters: the
   header written by ADD_ANNOTATION_DATA with length 4 and
   CODING_ANNOTATE_CHARSET_MASK, followed by the charset ID.  BUF is
   advanced past the four elements.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                                \
  do {                                                                   \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars);  \
    *(buf)++ = (id);                                                     \
  } while (0)
1210
df7492f9
KH
1211\f
1212/*** 2. Emacs' internal format (emacs-utf-8) ***/
1213
1214
1215
1216\f
1217/*** 3. UTF-8 ***/
1218
1219/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
1220 Check if a text is encoded in UTF-8. If it is, return 1, else
1221 return 0. */
df7492f9
KH
1222
/* Nonzero if the bits of C selected by MASK are equal to BITS.  */
#define UTF_8_BITS_MATCH_P(c, mask, bits) (((c) & (mask)) == (bits))

/* Classification of an octet C of a UTF-8 sequence.  */
#define UTF_8_1_OCTET_P(c)          ((c) < 0x80)
#define UTF_8_EXTRA_OCTET_P(c)      UTF_8_BITS_MATCH_P (c, 0xC0, 0x80)
#define UTF_8_2_OCTET_LEADING_P(c)  UTF_8_BITS_MATCH_P (c, 0xE0, 0xC0)
#define UTF_8_3_OCTET_LEADING_P(c)  UTF_8_BITS_MATCH_P (c, 0xF0, 0xE0)
#define UTF_8_4_OCTET_LEADING_P(c)  UTF_8_BITS_MATCH_P (c, 0xF8, 0xF0)
#define UTF_8_5_OCTET_LEADING_P(c)  UTF_8_BITS_MATCH_P (c, 0xFC, 0xF8)

/* The byte order mark as a character code, and the three octets of
   its UTF-8 form.  */
#define UTF_BOM      0xFEFF
#define UTF_8_BOM_1  0xEF
#define UTF_8_BOM_2  0xBB
#define UTF_8_BOM_3  0xBF
1234
df7492f9 1235static int
ff0dacd7 1236detect_coding_utf_8 (coding, detect_info)
df7492f9 1237 struct coding_system *coding;
ff0dacd7 1238 struct coding_detection_info *detect_info;
df7492f9 1239{
065e3595 1240 const unsigned char *src = coding->source, *src_base;
8f924df7 1241 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
1242 int multibytep = coding->src_multibyte;
1243 int consumed_chars = 0;
a470d443 1244 int bom_found = 0;
df7492f9
KH
1245 int found = 0;
1246
ff0dacd7 1247 detect_info->checked |= CATEGORY_MASK_UTF_8;
df7492f9
KH
1248 /* A coding system of this category is always ASCII compatible. */
1249 src += coding->head_ascii;
1250
1251 while (1)
aa72b389 1252 {
df7492f9 1253 int c, c1, c2, c3, c4;
aa72b389 1254
065e3595 1255 src_base = src;
df7492f9 1256 ONE_MORE_BYTE (c);
065e3595 1257 if (c < 0 || UTF_8_1_OCTET_P (c))
df7492f9
KH
1258 continue;
1259 ONE_MORE_BYTE (c1);
065e3595 1260 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
df7492f9
KH
1261 break;
1262 if (UTF_8_2_OCTET_LEADING_P (c))
aa72b389 1263 {
a470d443 1264 found = 1;
df7492f9 1265 continue;
aa72b389 1266 }
df7492f9 1267 ONE_MORE_BYTE (c2);
065e3595 1268 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
df7492f9
KH
1269 break;
1270 if (UTF_8_3_OCTET_LEADING_P (c))
aa72b389 1271 {
a470d443
KH
1272 found = 1;
1273 if (src_base == coding->source
1274 && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1275 bom_found = 1;
df7492f9 1276 continue;
aa72b389 1277 }
df7492f9 1278 ONE_MORE_BYTE (c3);
065e3595 1279 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
df7492f9
KH
1280 break;
1281 if (UTF_8_4_OCTET_LEADING_P (c))
aa72b389 1282 {
a470d443 1283 found = 1;
df7492f9
KH
1284 continue;
1285 }
1286 ONE_MORE_BYTE (c4);
065e3595 1287 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
df7492f9
KH
1288 break;
1289 if (UTF_8_5_OCTET_LEADING_P (c))
1290 {
a470d443 1291 found = 1;
df7492f9
KH
1292 continue;
1293 }
1294 break;
aa72b389 1295 }
ff0dacd7 1296 detect_info->rejected |= CATEGORY_MASK_UTF_8;
df7492f9 1297 return 0;
aa72b389 1298
df7492f9 1299 no_more_source:
065e3595 1300 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
aa72b389 1301 {
ff0dacd7 1302 detect_info->rejected |= CATEGORY_MASK_UTF_8;
89528eb3 1303 return 0;
aa72b389 1304 }
a470d443
KH
1305 if (bom_found)
1306 {
1307 /* The first character 0xFFFE doesn't necessarily mean a BOM. */
1308 detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1309 }
1310 else
1311 {
1312 detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
0e17387a
KH
1313 if (found)
1314 detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
a470d443 1315 }
ff0dacd7 1316 return 1;
aa72b389
KH
1317}
1318
4ed46869 1319
b73bfc1c 1320static void
df7492f9 1321decode_coding_utf_8 (coding)
b73bfc1c 1322 struct coding_system *coding;
b73bfc1c 1323{
8f924df7
KH
1324 const unsigned char *src = coding->source + coding->consumed;
1325 const unsigned char *src_end = coding->source + coding->src_bytes;
1326 const unsigned char *src_base;
69a80ea3
KH
1327 int *charbuf = coding->charbuf + coding->charbuf_used;
1328 int *charbuf_end = coding->charbuf + coding->charbuf_size;
df7492f9
KH
1329 int consumed_chars = 0, consumed_chars_base;
1330 int multibytep = coding->src_multibyte;
a470d443 1331 enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
24a73b0a 1332 Lisp_Object attr, charset_list;
119852e7
KH
1333 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1334 int byte_after_cr = -1;
4ed46869 1335
24a73b0a 1336 CODING_GET_INFO (coding, attr, charset_list);
df7492f9 1337
a470d443
KH
1338 if (bom != utf_without_bom)
1339 {
1340 int c1, c2, c3;
1341
1342 src_base = src;
1343 ONE_MORE_BYTE (c1);
1344 if (! UTF_8_3_OCTET_LEADING_P (c1))
1345 src = src_base;
1346 else
1347 {
1348 ONE_MORE_BYTE (c2);
1349 if (! UTF_8_EXTRA_OCTET_P (c2))
1350 src = src_base;
1351 else
1352 {
1353 ONE_MORE_BYTE (c3);
1354 if (! UTF_8_EXTRA_OCTET_P (c3))
1355 src = src_base;
1356 else
1357 {
1358 if ((c1 != UTF_8_BOM_1)
1359 || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1360 src = src_base;
1361 else
1362 CODING_UTF_8_BOM (coding) = utf_without_bom;
1363 }
1364 }
1365 }
1366 }
1367 CODING_UTF_8_BOM (coding) = utf_without_bom;
1368
1369
1370
df7492f9 1371 while (1)
b73bfc1c 1372 {
df7492f9 1373 int c, c1, c2, c3, c4, c5;
ec6d2bb8 1374
df7492f9
KH
1375 src_base = src;
1376 consumed_chars_base = consumed_chars;
4af310db 1377
df7492f9
KH
1378 if (charbuf >= charbuf_end)
1379 break;
1380
119852e7
KH
1381 if (byte_after_cr >= 0)
1382 c1 = byte_after_cr, byte_after_cr = -1;
1383 else
1384 ONE_MORE_BYTE (c1);
065e3595
KH
1385 if (c1 < 0)
1386 {
1387 c = - c1;
1388 }
1389 else if (UTF_8_1_OCTET_P(c1))
df7492f9 1390 {
119852e7
KH
1391 if (eol_crlf && c1 == '\r')
1392 ONE_MORE_BYTE (byte_after_cr);
df7492f9 1393 c = c1;
4af310db 1394 }
df7492f9 1395 else
4af310db 1396 {
df7492f9 1397 ONE_MORE_BYTE (c2);
065e3595 1398 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
df7492f9
KH
1399 goto invalid_code;
1400 if (UTF_8_2_OCTET_LEADING_P (c1))
4af310db 1401 {
b0edb2c5
DL
1402 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1403 /* Reject overlong sequences here and below. Encoders
1404 producing them are incorrect, they can be misleading,
1405 and they mess up read/write invariance. */
1406 if (c < 128)
1407 goto invalid_code;
4af310db 1408 }
df7492f9 1409 else
aa72b389 1410 {
df7492f9 1411 ONE_MORE_BYTE (c3);
065e3595 1412 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
df7492f9
KH
1413 goto invalid_code;
1414 if (UTF_8_3_OCTET_LEADING_P (c1))
b0edb2c5
DL
1415 {
1416 c = (((c1 & 0xF) << 12)
1417 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
72fe1301
DL
1418 if (c < 0x800
1419 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
b0edb2c5
DL
1420 goto invalid_code;
1421 }
df7492f9
KH
1422 else
1423 {
1424 ONE_MORE_BYTE (c4);
065e3595 1425 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
df7492f9
KH
1426 goto invalid_code;
1427 if (UTF_8_4_OCTET_LEADING_P (c1))
b0edb2c5 1428 {
df7492f9
KH
1429 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1430 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
b0edb2c5
DL
1431 if (c < 0x10000)
1432 goto invalid_code;
1433 }
df7492f9
KH
1434 else
1435 {
1436 ONE_MORE_BYTE (c5);
065e3595 1437 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
df7492f9
KH
1438 goto invalid_code;
1439 if (UTF_8_5_OCTET_LEADING_P (c1))
1440 {
1441 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1442 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1443 | (c5 & 0x3F));
b0edb2c5 1444 if ((c > MAX_CHAR) || (c < 0x200000))
df7492f9
KH
1445 goto invalid_code;
1446 }
1447 else
1448 goto invalid_code;
1449 }
1450 }
aa72b389 1451 }
b73bfc1c 1452 }
df7492f9
KH
1453
1454 *charbuf++ = c;
1455 continue;
1456
1457 invalid_code:
1458 src = src_base;
1459 consumed_chars = consumed_chars_base;
1460 ONE_MORE_BYTE (c);
1461 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1462 coding->errors++;
aa72b389
KH
1463 }
1464
df7492f9
KH
1465 no_more_source:
1466 coding->consumed_char += consumed_chars_base;
1467 coding->consumed = src_base - coding->source;
1468 coding->charbuf_used = charbuf - coding->charbuf;
1469}
1470
1471
1472static int
1473encode_coding_utf_8 (coding)
1474 struct coding_system *coding;
1475{
1476 int multibytep = coding->dst_multibyte;
1477 int *charbuf = coding->charbuf;
1478 int *charbuf_end = charbuf + coding->charbuf_used;
1479 unsigned char *dst = coding->destination + coding->produced;
1480 unsigned char *dst_end = coding->destination + coding->dst_bytes;
e19c3639 1481 int produced_chars = 0;
df7492f9
KH
1482 int c;
1483
a470d443
KH
1484 if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1485 {
1486 ASSURE_DESTINATION (3);
1487 EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1488 CODING_UTF_8_BOM (coding) = utf_without_bom;
1489 }
1490
df7492f9 1491 if (multibytep)
aa72b389 1492 {
df7492f9
KH
1493 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1494
1495 while (charbuf < charbuf_end)
b73bfc1c 1496 {
df7492f9 1497 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
8f924df7 1498
df7492f9
KH
1499 ASSURE_DESTINATION (safe_room);
1500 c = *charbuf++;
28f67a95
KH
1501 if (CHAR_BYTE8_P (c))
1502 {
1503 c = CHAR_TO_BYTE8 (c);
1504 EMIT_ONE_BYTE (c);
1505 }
1506 else
1507 {
db274c7a 1508 CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
28f67a95
KH
1509 for (p = str; p < pend; p++)
1510 EMIT_ONE_BYTE (*p);
1511 }
b73bfc1c 1512 }
aa72b389 1513 }
df7492f9
KH
1514 else
1515 {
1516 int safe_room = MAX_MULTIBYTE_LENGTH;
1517
1518 while (charbuf < charbuf_end)
b73bfc1c 1519 {
df7492f9
KH
1520 ASSURE_DESTINATION (safe_room);
1521 c = *charbuf++;
f03caae0
KH
1522 if (CHAR_BYTE8_P (c))
1523 *dst++ = CHAR_TO_BYTE8 (c);
1524 else
db274c7a 1525 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
df7492f9 1526 produced_chars++;
4ed46869
KH
1527 }
1528 }
065e3595 1529 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
1530 coding->produced_char += produced_chars;
1531 coding->produced = dst - coding->destination;
1532 return 0;
4ed46869
KH
1533}
1534
b73bfc1c 1535
df7492f9 1536/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
1537 Check if a text is encoded in one of UTF-16 based coding systems.
1538 If it is, return 1, else return 0. */
aa72b389 1539
df7492f9
KH
/* Nonzero if the 16-bit unit VAL is in 0xD800..0xDBFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) (((val) & 0xFC00) == 0xD800)

/* Nonzero if the 16-bit unit VAL is in 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) (((val) & 0xFC00) == 0xDC00)

/* Nonzero if VAL is 0xFFFE, 0xFFFF, or a low surrogate.  */
#define UTF_16_INVALID_P(val)           \
  ((val) == 0xFFFE                      \
   || (val) == 0xFFFF                   \
   || UTF_16_LOW_SURROGATE_P (val))
aa72b389 1550
aa72b389 1551
df7492f9 1552static int
ff0dacd7 1553detect_coding_utf_16 (coding, detect_info)
aa72b389 1554 struct coding_system *coding;
ff0dacd7 1555 struct coding_detection_info *detect_info;
aa72b389 1556{
8f924df7
KH
1557 const unsigned char *src = coding->source, *src_base = src;
1558 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
1559 int multibytep = coding->src_multibyte;
1560 int consumed_chars = 0;
1561 int c1, c2;
aa72b389 1562
ff0dacd7 1563 detect_info->checked |= CATEGORY_MASK_UTF_16;
ff0dacd7 1564 if (coding->mode & CODING_MODE_LAST_BLOCK
24a73b0a 1565 && (coding->src_chars & 1))
ff0dacd7
KH
1566 {
1567 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1568 return 0;
1569 }
24a73b0a 1570
df7492f9
KH
1571 ONE_MORE_BYTE (c1);
1572 ONE_MORE_BYTE (c2);
df7492f9 1573 if ((c1 == 0xFF) && (c2 == 0xFE))
aa72b389 1574 {
b49a1807
KH
1575 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1576 | CATEGORY_MASK_UTF_16_AUTO);
24a73b0a
KH
1577 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1578 | CATEGORY_MASK_UTF_16_BE_NOSIG
1579 | CATEGORY_MASK_UTF_16_LE_NOSIG);
aa72b389 1580 }
df7492f9 1581 else if ((c1 == 0xFE) && (c2 == 0xFF))
ff0dacd7 1582 {
b49a1807
KH
1583 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1584 | CATEGORY_MASK_UTF_16_AUTO);
24a73b0a
KH
1585 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1586 | CATEGORY_MASK_UTF_16_BE_NOSIG
1587 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1588 }
2f3cbb32 1589 else
24a73b0a 1590 {
2f3cbb32
KH
1591 /* We check the dispersion of Eth and Oth bytes where E is even and
1592 O is odd. If both are high, we assume binary data.*/
1593 unsigned char e[256], o[256];
1594 unsigned e_num = 1, o_num = 1;
1595
1596 memset (e, 0, 256);
1597 memset (o, 0, 256);
1598 e[c1] = 1;
1599 o[c2] = 1;
1600
24a73b0a
KH
1601 detect_info->rejected
1602 |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
2f3cbb32
KH
1603
1604 while (1)
1605 {
1606 ONE_MORE_BYTE (c1);
1607 ONE_MORE_BYTE (c2);
1608 if (! e[c1])
1609 {
1610 e[c1] = 1;
1611 e_num++;
1612 if (e_num >= 128)
1613 break;
1614 }
1615 if (! o[c2])
1616 {
1617 o[c1] = 1;
1618 o_num++;
1619 if (o_num >= 128)
1620 break;
1621 }
1622 }
1623 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1624 return 0;
ff0dacd7 1625 }
2f3cbb32 1626
df7492f9 1627 no_more_source:
ff0dacd7 1628 return 1;
df7492f9 1629}
aa72b389 1630
df7492f9
KH
1631static void
1632decode_coding_utf_16 (coding)
1633 struct coding_system *coding;
1634{
8f924df7
KH
1635 const unsigned char *src = coding->source + coding->consumed;
1636 const unsigned char *src_end = coding->source + coding->src_bytes;
1637 const unsigned char *src_base;
69a80ea3
KH
1638 int *charbuf = coding->charbuf + coding->charbuf_used;
1639 int *charbuf_end = coding->charbuf + coding->charbuf_size;
df7492f9
KH
1640 int consumed_chars = 0, consumed_chars_base;
1641 int multibytep = coding->src_multibyte;
a470d443 1642 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
df7492f9
KH
1643 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1644 int surrogate = CODING_UTF_16_SURROGATE (coding);
24a73b0a 1645 Lisp_Object attr, charset_list;
119852e7
KH
1646 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1647 int byte_after_cr1 = -1, byte_after_cr2 = -1;
df7492f9 1648
24a73b0a 1649 CODING_GET_INFO (coding, attr, charset_list);
df7492f9 1650
a470d443 1651 if (bom == utf_with_bom)
aa72b389 1652 {
df7492f9 1653 int c, c1, c2;
4af310db 1654
aa72b389 1655 src_base = src;
df7492f9
KH
1656 ONE_MORE_BYTE (c1);
1657 ONE_MORE_BYTE (c2);
e19c3639 1658 c = (c1 << 8) | c2;
aa72b389 1659
b49a1807
KH
1660 if (endian == utf_16_big_endian
1661 ? c != 0xFEFF : c != 0xFFFE)
aa72b389 1662 {
b49a1807
KH
1663 /* The first two bytes are not BOM. Treat them as bytes
1664 for a normal character. */
1665 src = src_base;
1666 coding->errors++;
aa72b389 1667 }
a470d443 1668 CODING_UTF_16_BOM (coding) = utf_without_bom;
b49a1807 1669 }
a470d443 1670 else if (bom == utf_detect_bom)
b49a1807
KH
1671 {
1672 /* We have already tried to detect BOM and failed in
1673 detect_coding. */
a470d443 1674 CODING_UTF_16_BOM (coding) = utf_without_bom;
df7492f9 1675 }
aa72b389 1676
df7492f9
KH
1677 while (1)
1678 {
1679 int c, c1, c2;
1680
1681 src_base = src;
1682 consumed_chars_base = consumed_chars;
1683
1684 if (charbuf + 2 >= charbuf_end)
1685 break;
1686
119852e7
KH
1687 if (byte_after_cr1 >= 0)
1688 c1 = byte_after_cr1, byte_after_cr1 = -1;
1689 else
1690 ONE_MORE_BYTE (c1);
065e3595
KH
1691 if (c1 < 0)
1692 {
1693 *charbuf++ = -c1;
1694 continue;
1695 }
119852e7
KH
1696 if (byte_after_cr2 >= 0)
1697 c2 = byte_after_cr2, byte_after_cr2 = -1;
1698 else
1699 ONE_MORE_BYTE (c2);
065e3595
KH
1700 if (c2 < 0)
1701 {
1702 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1703 *charbuf++ = -c2;
1704 continue;
1705 }
df7492f9 1706 c = (endian == utf_16_big_endian
e19c3639 1707 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
119852e7 1708
df7492f9 1709 if (surrogate)
fd3ae0b9 1710 {
df7492f9 1711 if (! UTF_16_LOW_SURROGATE_P (c))
fd3ae0b9 1712 {
df7492f9
KH
1713 if (endian == utf_16_big_endian)
1714 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1715 else
1716 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1717 *charbuf++ = c1;
1718 *charbuf++ = c2;
1719 coding->errors++;
1720 if (UTF_16_HIGH_SURROGATE_P (c))
1721 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
fd3ae0b9 1722 else
df7492f9 1723 *charbuf++ = c;
fd3ae0b9
KH
1724 }
1725 else
df7492f9
KH
1726 {
1727 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1728 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
29f7ffd0 1729 *charbuf++ = 0x10000 + c;
df7492f9 1730 }
fd3ae0b9 1731 }
aa72b389 1732 else
df7492f9
KH
1733 {
1734 if (UTF_16_HIGH_SURROGATE_P (c))
1735 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1736 else
119852e7
KH
1737 {
1738 if (eol_crlf && c == '\r')
1739 {
1740 ONE_MORE_BYTE (byte_after_cr1);
1741 ONE_MORE_BYTE (byte_after_cr2);
1742 }
1743 *charbuf++ = c;
1744 }
8f924df7 1745 }
aa72b389 1746 }
df7492f9
KH
1747
1748 no_more_source:
1749 coding->consumed_char += consumed_chars_base;
1750 coding->consumed = src_base - coding->source;
1751 coding->charbuf_used = charbuf - coding->charbuf;
aa72b389 1752}
b73bfc1c 1753
df7492f9
KH
1754static int
1755encode_coding_utf_16 (coding)
1756 struct coding_system *coding;
1757{
1758 int multibytep = coding->dst_multibyte;
1759 int *charbuf = coding->charbuf;
1760 int *charbuf_end = charbuf + coding->charbuf_used;
1761 unsigned char *dst = coding->destination + coding->produced;
1762 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1763 int safe_room = 8;
a470d443 1764 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
df7492f9
KH
1765 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1766 int produced_chars = 0;
24a73b0a 1767 Lisp_Object attrs, charset_list;
df7492f9 1768 int c;
4ed46869 1769
24a73b0a 1770 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9 1771
a470d443 1772 if (bom != utf_without_bom)
df7492f9
KH
1773 {
1774 ASSURE_DESTINATION (safe_room);
1775 if (big_endian)
df7492f9 1776 EMIT_TWO_BYTES (0xFE, 0xFF);
880cf180
KH
1777 else
1778 EMIT_TWO_BYTES (0xFF, 0xFE);
a470d443 1779 CODING_UTF_16_BOM (coding) = utf_without_bom;
df7492f9
KH
1780 }
1781
1782 while (charbuf < charbuf_end)
1783 {
1784 ASSURE_DESTINATION (safe_room);
1785 c = *charbuf++;
e19c3639
KH
1786 if (c >= MAX_UNICODE_CHAR)
1787 c = coding->default_char;
df7492f9
KH
1788
1789 if (c < 0x10000)
1790 {
1791 if (big_endian)
1792 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1793 else
1794 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1795 }
1796 else
1797 {
1798 int c1, c2;
1799
1800 c -= 0x10000;
1801 c1 = (c >> 10) + 0xD800;
1802 c2 = (c & 0x3FF) + 0xDC00;
1803 if (big_endian)
1804 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1805 else
1806 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1807 }
1808 }
065e3595 1809 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
1810 coding->produced = dst - coding->destination;
1811 coding->produced_char += produced_chars;
1812 return 0;
1813}
1814
1815\f
1816/*** 6. Old Emacs' internal format (emacs-mule) ***/
1817
1818/* Emacs' internal format for representation of multiple character
1819 sets is a kind of multi-byte encoding, i.e. characters are
1820 represented by variable-length sequences of one-byte codes.
1821
1822 ASCII characters and control characters (e.g. `tab', `newline') are
1823 represented by one-byte sequences which are their ASCII codes, in
1824 the range 0x00 through 0x7F.
1825
1826 8-bit characters of the range 0x80..0x9F are represented by
1827 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1828 code + 0x20).
1829
1830 8-bit characters of the range 0xA0..0xFF are represented by
1831 one-byte sequences which are their 8-bit code.
1832
1833 The other characters are represented by a sequence of `base
1834 leading-code', optional `extended leading-code', and one or two
1835 `position-code's. The length of the sequence is determined by the
1836 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1837 whereas extended leading-code and position-code take the range 0xA0
1838 through 0xFF. See `charset.h' for more details about leading-code
1839 and position-code.
1840
1841 --- CODE RANGE of Emacs' internal format ---
1842 character set range
1843 ------------- -----
1844 ascii 0x00..0x7F
1845 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1846 eight-bit-graphic 0xA0..0xFF
1847 ELSE 0x81..0x9D + [0xA0..0xFF]+
1848 ---------------------------------------------
1849
1850 As this is the internal character representation, the format is
1851 usually not used externally (i.e. in a file or in a data sent to a
1852 process). But, it is possible to have a text externally in this
1853 format (i.e. by encoding by the coding system `emacs-mule').
1854
1855 In that case, a sequence of one-byte codes has a slightly different
1856 form.
1857
1858 At first, all characters in eight-bit-control are represented by
1859 one-byte sequences which are their 8-bit code.
1860
1861 Next, character composition data are represented by the byte
1862 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1863 where,
1864 METHOD is 0xF2 plus one of composition method (enum
1865 composition_method),
1866
1867 BYTES is 0xA0 plus a byte length of this composition data,
1868
1869 CHARS is 0xA0 plus a number of characters composed by this
1870 data,
1871
1872 COMPONENTs are characters of multibyte form or composition
1873 rules encoded by two-byte of ASCII codes.
1874
1875 In addition, for backward compatibility, the following formats are
1876 also recognized as composition data on decoding.
1877
1878 0x80 MSEQ ...
1879 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1880
1881 Here,
1882 MSEQ is a multibyte form but in these special format:
1883 ASCII: 0xA0 ASCII_CODE+0x80,
1884 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1885 RULE is a one byte code of the range 0xA0..0xF0 that
1886 represents a composition rule.
1887 */
1888
1889char emacs_mule_bytes[256];
1890
df7492f9 1891int
ff0dacd7 1892emacs_mule_char (coding, src, nbytes, nchars, id)
df7492f9 1893 struct coding_system *coding;
065e3595 1894 const unsigned char *src;
ff0dacd7 1895 int *nbytes, *nchars, *id;
df7492f9 1896{
8f924df7
KH
1897 const unsigned char *src_end = coding->source + coding->src_bytes;
1898 const unsigned char *src_base = src;
df7492f9 1899 int multibytep = coding->src_multibyte;
df7492f9
KH
1900 struct charset *charset;
1901 unsigned code;
1902 int c;
1903 int consumed_chars = 0;
1904
1905 ONE_MORE_BYTE (c);
065e3595 1906 if (c < 0)
df7492f9 1907 {
065e3595
KH
1908 c = -c;
1909 charset = emacs_mule_charset[0];
1910 }
1911 else
1912 {
4d41e8b7
KH
1913 if (c >= 0xA0)
1914 {
b3af4b28 1915 /* Old style component character of a composition. */
4d41e8b7
KH
1916 if (c == 0xA0)
1917 {
1918 ONE_MORE_BYTE (c);
1919 c -= 0x80;
1920 }
1921 else
1922 c -= 0x20;
1923 }
1924
065e3595 1925 switch (emacs_mule_bytes[c])
b73bfc1c 1926 {
065e3595 1927 case 2:
df7492f9
KH
1928 if (! (charset = emacs_mule_charset[c]))
1929 goto invalid_code;
1930 ONE_MORE_BYTE (c);
9ffd559c 1931 if (c < 0xA0)
065e3595 1932 goto invalid_code;
df7492f9 1933 code = c & 0x7F;
065e3595
KH
1934 break;
1935
1936 case 3:
1937 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1938 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1939 {
1940 ONE_MORE_BYTE (c);
9ffd559c 1941 if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
065e3595
KH
1942 goto invalid_code;
1943 ONE_MORE_BYTE (c);
9ffd559c 1944 if (c < 0xA0)
065e3595
KH
1945 goto invalid_code;
1946 code = c & 0x7F;
1947 }
1948 else
1949 {
1950 if (! (charset = emacs_mule_charset[c]))
1951 goto invalid_code;
1952 ONE_MORE_BYTE (c);
9ffd559c 1953 if (c < 0xA0)
065e3595
KH
1954 goto invalid_code;
1955 code = (c & 0x7F) << 8;
1956 ONE_MORE_BYTE (c);
9ffd559c 1957 if (c < 0xA0)
065e3595
KH
1958 goto invalid_code;
1959 code |= c & 0x7F;
1960 }
1961 break;
1962
1963 case 4:
1964 ONE_MORE_BYTE (c);
1965 if (c < 0 || ! (charset = emacs_mule_charset[c]))
df7492f9
KH
1966 goto invalid_code;
1967 ONE_MORE_BYTE (c);
9ffd559c 1968 if (c < 0xA0)
065e3595 1969 goto invalid_code;
781d7a48 1970 code = (c & 0x7F) << 8;
df7492f9 1971 ONE_MORE_BYTE (c);
9ffd559c 1972 if (c < 0xA0)
065e3595 1973 goto invalid_code;
df7492f9 1974 code |= c & 0x7F;
065e3595 1975 break;
df7492f9 1976
065e3595
KH
1977 case 1:
1978 code = c;
1979 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1980 ? charset_ascii : charset_eight_bit);
1981 break;
df7492f9 1982
065e3595
KH
1983 default:
1984 abort ();
1985 }
1986 c = DECODE_CHAR (charset, code);
1987 if (c < 0)
1988 goto invalid_code;
df7492f9 1989 }
df7492f9
KH
1990 *nbytes = src - src_base;
1991 *nchars = consumed_chars;
ff0dacd7
KH
1992 if (id)
1993 *id = charset->id;
df7492f9
KH
1994 return c;
1995
1996 no_more_source:
1997 return -2;
1998
1999 invalid_code:
2000 return -1;
2001}
2002
2003
2004/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
2005 Check if a text is encoded in `emacs-mule'. If it is, return 1,
2006 else return 0. */
df7492f9
KH
2007
2008static int
ff0dacd7 2009detect_coding_emacs_mule (coding, detect_info)
df7492f9 2010 struct coding_system *coding;
ff0dacd7 2011 struct coding_detection_info *detect_info;
df7492f9 2012{
065e3595 2013 const unsigned char *src = coding->source, *src_base;
8f924df7 2014 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
2015 int multibytep = coding->src_multibyte;
2016 int consumed_chars = 0;
2017 int c;
2018 int found = 0;
2019
ff0dacd7 2020 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
df7492f9
KH
2021 /* A coding system of this category is always ASCII compatible. */
2022 src += coding->head_ascii;
2023
2024 while (1)
2025 {
065e3595 2026 src_base = src;
df7492f9 2027 ONE_MORE_BYTE (c);
065e3595
KH
2028 if (c < 0)
2029 continue;
df7492f9
KH
2030 if (c == 0x80)
2031 {
2032 /* Perhaps the start of composite character. We simple skip
2033 it because analyzing it is too heavy for detecting. But,
2034 at least, we check that the composite character
3ed051d4 2035 constitutes of more than 4 bytes. */
8f924df7 2036 const unsigned char *src_base;
df7492f9
KH
2037
2038 repeat:
2039 src_base = src;
2040 do
2041 {
2042 ONE_MORE_BYTE (c);
2043 }
2044 while (c >= 0xA0);
2045
2046 if (src - src_base <= 4)
2047 break;
ff0dacd7 2048 found = CATEGORY_MASK_EMACS_MULE;
df7492f9
KH
2049 if (c == 0x80)
2050 goto repeat;
b73bfc1c 2051 }
df7492f9
KH
2052
2053 if (c < 0x80)
b73bfc1c 2054 {
df7492f9
KH
2055 if (c < 0x20
2056 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2057 break;
2058 }
2059 else
2060 {
0e219d54 2061 int more_bytes = emacs_mule_bytes[*src_base] - 1;
df7492f9 2062
0e219d54 2063 while (more_bytes > 0)
df7492f9
KH
2064 {
2065 ONE_MORE_BYTE (c);
0e219d54
KH
2066 if (c < 0xA0)
2067 {
2068 src--; /* Unread the last byte. */
2069 break;
2070 }
2071 more_bytes--;
df7492f9 2072 }
0e219d54 2073 if (more_bytes != 0)
df7492f9 2074 break;
ff0dacd7 2075 found = CATEGORY_MASK_EMACS_MULE;
4ed46869
KH
2076 }
2077 }
ff0dacd7 2078 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
df7492f9
KH
2079 return 0;
2080
2081 no_more_source:
065e3595 2082 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
89528eb3 2083 {
ff0dacd7 2084 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
89528eb3
KH
2085 return 0;
2086 }
ff0dacd7
KH
2087 detect_info->found |= found;
2088 return 1;
4ed46869
KH
2089}
2090
b73bfc1c 2091
df7492f9
KH
2092/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2093
2094/* Decode a character represented as a component of composition
2095 sequence of Emacs 20/21 style at SRC. Store that character in
2096 *BUF, increment BUF, and advance SRC to the head of the next
2097 character (or an encoded composition rule). If SRC is at the end
2098 of the source, or the sequence at SRC is incomplete, do nothing.
2099 If SRC points to an invalid byte sequence, jump to `invalid_code'. */
2100
2101#define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
f937a7db 2102 do \
df7492f9
KH
2103 { \
2104 int c; \
2105 int nbytes, nchars; \
2106 \
2107 if (src == src_end) \
2108 break; \
ff0dacd7 2109 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
df7492f9
KH
2110 if (c < 0) \
2111 { \
2112 if (c == -2) \
2113 break; \
2114 goto invalid_code; \
2115 } \
2116 *buf++ = c; \
2117 src += nbytes; \
2118 consumed_chars += nchars; \
2119 } \
f937a7db 2120 while (0)
df7492f9
KH
2121
2122
2123/* Decode a composition rule represented as a component of composition
781d7a48
KH
2124 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
2125 and increment BUF. If SRC points to an invalid byte sequence, jump
2126 to `invalid_code'. */
df7492f9 2127
781d7a48 2128#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
df7492f9
KH
2129 do { \
2130 int c, gref, nref; \
2131 \
781d7a48 2132 if (src >= src_end) \
df7492f9
KH
2133 goto invalid_code; \
2134 ONE_MORE_BYTE_NO_CHECK (c); \
4d41e8b7 2135 c -= 0xA0; \
df7492f9
KH
2136 if (c < 0 || c >= 81) \
2137 goto invalid_code; \
2138 \
2139 gref = c / 9, nref = c % 9; \
2140 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
2141 } while (0)
2142
2143
781d7a48
KH
2144/* Decode a composition rule represented as a component of composition
2145 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
2146 and increment BUF. If SRC points to an invalid byte sequence, jump
2147 to `invalid_code'. */
2148
2149#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
2150 do { \
2151 int gref, nref; \
2152 \
2153 if (src + 1>= src_end) \
2154 goto invalid_code; \
2155 ONE_MORE_BYTE_NO_CHECK (gref); \
2156 gref -= 0x20; \
2157 ONE_MORE_BYTE_NO_CHECK (nref); \
2158 nref -= 0x20; \
2159 if (gref < 0 || gref >= 81 \
2160 || nref < 0 || nref >= 81) \
2161 goto invalid_code; \
2162 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
2163 } while (0)
2164
2165
df7492f9 2166#define DECODE_EMACS_MULE_21_COMPOSITION(c) \
aa72b389 2167 do { \
df7492f9 2168 /* Emacs 21 style format. The first three bytes at SRC are \
781d7a48 2169 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
df7492f9
KH
2170 the byte length of this composition information, CHARS is the \
2171 number of characters composed by this composition. */ \
781d7a48
KH
2172 enum composition_method method = c - 0xF2; \
2173 int *charbuf_base = charbuf; \
df7492f9
KH
2174 int consumed_chars_limit; \
2175 int nbytes, nchars; \
2176 \
2177 ONE_MORE_BYTE (c); \
065e3595
KH
2178 if (c < 0) \
2179 goto invalid_code; \
df7492f9
KH
2180 nbytes = c - 0xA0; \
2181 if (nbytes < 3) \
2182 goto invalid_code; \
2183 ONE_MORE_BYTE (c); \
065e3595
KH
2184 if (c < 0) \
2185 goto invalid_code; \
df7492f9 2186 nchars = c - 0xA0; \
69a80ea3 2187 ADD_COMPOSITION_DATA (charbuf, nchars, method); \
df7492f9
KH
2188 consumed_chars_limit = consumed_chars_base + nbytes; \
2189 if (method != COMPOSITION_RELATIVE) \
aa72b389 2190 { \
df7492f9
KH
2191 int i = 0; \
2192 while (consumed_chars < consumed_chars_limit) \
aa72b389 2193 { \
df7492f9 2194 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
781d7a48 2195 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
df7492f9
KH
2196 else \
2197 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
781d7a48 2198 i++; \
aa72b389 2199 } \
df7492f9
KH
2200 if (consumed_chars < consumed_chars_limit) \
2201 goto invalid_code; \
781d7a48 2202 charbuf_base[0] -= i; \
aa72b389
KH
2203 } \
2204 } while (0)
93dec019 2205
aa72b389 2206
d959f512
KH
2207#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
2208 do { \
2209 /* Emacs 20 style format for relative composition. */ \
2210 /* Store multibyte form of characters to be composed. */ \
2211 enum composition_method method = COMPOSITION_RELATIVE; \
2212 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
2213 int *buf = components; \
2214 int i, j; \
2215 \
2216 src = src_base; \
2217 ONE_MORE_BYTE (c); /* skip 0x80 */ \
2218 for (i = 0; *src >= 0xA0 && i < MAX_COMPOSITION_COMPONENTS; i++) \
2219 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
2220 if (i < 2) \
2221 goto invalid_code; \
2222 ADD_COMPOSITION_DATA (charbuf, i, method); \
2223 for (j = 0; j < i; j++) \
2224 *charbuf++ = components[j]; \
df7492f9
KH
2225 } while (0)
2226
2227
2228#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
2229 do { \
2230 /* Emacs 20 style format for rule-base composition. */ \
2231 /* Store multibyte form of characters to be composed. */ \
ff0dacd7 2232 enum composition_method method = COMPOSITION_WITH_RULE; \
4d41e8b7 2233 int *charbuf_base = charbuf; \
df7492f9
KH
2234 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
2235 int *buf = components; \
2236 int i, j; \
4d41e8b7 2237 \
df7492f9 2238 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
4d41e8b7 2239 for (i = 1; i < MAX_COMPOSITION_COMPONENTS; i++) \
df7492f9 2240 { \
4d41e8b7
KH
2241 if (*src < 0xA0) \
2242 break; \
781d7a48 2243 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
df7492f9
KH
2244 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
2245 } \
4d41e8b7 2246 if (i <= 1 || (buf - components) % 2 == 0) \
df7492f9 2247 goto invalid_code; \
4d41e8b7 2248 if (charbuf + i + (i / 2) + 1 >= charbuf_end) \
df7492f9 2249 goto no_more_source; \
4d41e8b7
KH
2250 ADD_COMPOSITION_DATA (charbuf, i, method); \
2251 i = i * 2 - 1; \
df7492f9
KH
2252 for (j = 0; j < i; j++) \
2253 *charbuf++ = components[j]; \
4d41e8b7 2254 charbuf_base[0] -= i; \
df7492f9
KH
2255 for (j = 0; j < i; j += 2) \
2256 *charbuf++ = components[j]; \
2257 } while (0)
2258
aa72b389
KH
2259
2260static void
df7492f9 2261decode_coding_emacs_mule (coding)
aa72b389 2262 struct coding_system *coding;
aa72b389 2263{
8f924df7
KH
2264 const unsigned char *src = coding->source + coding->consumed;
2265 const unsigned char *src_end = coding->source + coding->src_bytes;
2266 const unsigned char *src_base;
69a80ea3
KH
2267 int *charbuf = coding->charbuf + coding->charbuf_used;
2268 int *charbuf_end
2269 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
df7492f9 2270 int consumed_chars = 0, consumed_chars_base;
df7492f9 2271 int multibytep = coding->src_multibyte;
24a73b0a 2272 Lisp_Object attrs, charset_list;
ff0dacd7
KH
2273 int char_offset = coding->produced_char;
2274 int last_offset = char_offset;
2275 int last_id = charset_ascii;
119852e7
KH
2276 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2277 int byte_after_cr = -1;
aa72b389 2278
24a73b0a 2279 CODING_GET_INFO (coding, attrs, charset_list);
aa72b389 2280
aa72b389
KH
2281 while (1)
2282 {
df7492f9
KH
2283 int c;
2284
aa72b389 2285 src_base = src;
df7492f9
KH
2286 consumed_chars_base = consumed_chars;
2287
2288 if (charbuf >= charbuf_end)
2289 break;
aa72b389 2290
119852e7
KH
2291 if (byte_after_cr >= 0)
2292 c = byte_after_cr, byte_after_cr = -1;
2293 else
2294 ONE_MORE_BYTE (c);
065e3595
KH
2295 if (c < 0)
2296 {
2297 *charbuf++ = -c;
2298 char_offset++;
2299 }
2300 else if (c < 0x80)
aa72b389 2301 {
119852e7
KH
2302 if (eol_crlf && c == '\r')
2303 ONE_MORE_BYTE (byte_after_cr);
df7492f9
KH
2304 *charbuf++ = c;
2305 char_offset++;
aa72b389 2306 }
df7492f9
KH
2307 else if (c == 0x80)
2308 {
df7492f9 2309 ONE_MORE_BYTE (c);
065e3595
KH
2310 if (c < 0)
2311 goto invalid_code;
781d7a48
KH
2312 if (c - 0xF2 >= COMPOSITION_RELATIVE
2313 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
df7492f9
KH
2314 DECODE_EMACS_MULE_21_COMPOSITION (c);
2315 else if (c < 0xC0)
2316 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
2317 else if (c == 0xFF)
2318 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
2319 else
2320 goto invalid_code;
2321 }
2322 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
2323 {
2324 int nbytes, nchars;
ff0dacd7
KH
2325 int id;
2326
781d7a48
KH
2327 src = src_base;
2328 consumed_chars = consumed_chars_base;
ff0dacd7 2329 c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
df7492f9
KH
2330 if (c < 0)
2331 {
2332 if (c == -2)
2333 break;
2334 goto invalid_code;
2335 }
ff0dacd7
KH
2336 if (last_id != id)
2337 {
2338 if (last_id != charset_ascii)
69a80ea3 2339 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
ff0dacd7
KH
2340 last_id = id;
2341 last_offset = char_offset;
2342 }
df7492f9 2343 *charbuf++ = c;
781d7a48
KH
2344 src += nbytes;
2345 consumed_chars += nchars;
df7492f9
KH
2346 char_offset++;
2347 }
4d41e8b7
KH
2348 else
2349 goto invalid_code;
df7492f9
KH
2350 continue;
2351
2352 invalid_code:
2353 src = src_base;
2354 consumed_chars = consumed_chars_base;
2355 ONE_MORE_BYTE (c);
2356 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 2357 char_offset++;
df7492f9
KH
2358 coding->errors++;
2359 }
2360
2361 no_more_source:
ff0dacd7 2362 if (last_id != charset_ascii)
69a80ea3 2363 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
2364 coding->consumed_char += consumed_chars_base;
2365 coding->consumed = src_base - coding->source;
2366 coding->charbuf_used = charbuf - coding->charbuf;
2367}
2368
2369
/* Store in CODES[0] and CODES[1] the leading codes that introduce a
   character of the charset whose emacs-mule id is ID.

   An id below 0xA0 is itself the only leading code, and CODES[1] is
   set to 0.  Any other id is stored in CODES[1], preceded in CODES[0]
   by 0x9A, 0x9B, 0x9C or 0x9D according to the range ID falls in.

   ID is evaluated exactly once.  The expansion has no trailing
   semicolon, so the macro can be used as an ordinary statement,
   including as the body of an `if' that has an `else'.  */

#define EMACS_MULE_LEADING_CODES(id, codes)             \
  do {                                                  \
    int emlc_id = (id);                                 \
                                                        \
    if (emlc_id < 0xA0)                                 \
      (codes)[0] = emlc_id, (codes)[1] = 0;             \
    else if (emlc_id < 0xE0)                            \
      (codes)[0] = 0x9A, (codes)[1] = emlc_id;          \
    else if (emlc_id < 0xF0)                            \
      (codes)[0] = 0x9B, (codes)[1] = emlc_id;          \
    else if (emlc_id < 0xF5)                            \
      (codes)[0] = 0x9C, (codes)[1] = emlc_id;          \
    else                                                \
      (codes)[0] = 0x9D, (codes)[1] = emlc_id;          \
  } while (0)
2383
aa72b389 2384
df7492f9
KH
2385static int
2386encode_coding_emacs_mule (coding)
2387 struct coding_system *coding;
2388{
2389 int multibytep = coding->dst_multibyte;
2390 int *charbuf = coding->charbuf;
2391 int *charbuf_end = charbuf + coding->charbuf_used;
2392 unsigned char *dst = coding->destination + coding->produced;
2393 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2394 int safe_room = 8;
df7492f9 2395 int produced_chars = 0;
24a73b0a 2396 Lisp_Object attrs, charset_list;
df7492f9 2397 int c;
ff0dacd7 2398 int preferred_charset_id = -1;
df7492f9 2399
24a73b0a 2400 CODING_GET_INFO (coding, attrs, charset_list);
eccb6815
KH
2401 if (! EQ (charset_list, Vemacs_mule_charset_list))
2402 {
2403 CODING_ATTR_CHARSET_LIST (attrs)
2404 = charset_list = Vemacs_mule_charset_list;
2405 }
df7492f9
KH
2406
2407 while (charbuf < charbuf_end)
2408 {
2409 ASSURE_DESTINATION (safe_room);
2410 c = *charbuf++;
ff0dacd7
KH
2411
2412 if (c < 0)
2413 {
2414 /* Handle an annotation. */
2415 switch (*charbuf)
2416 {
2417 case CODING_ANNOTATE_COMPOSITION_MASK:
2418 /* Not yet implemented. */
2419 break;
2420 case CODING_ANNOTATE_CHARSET_MASK:
2421 preferred_charset_id = charbuf[3];
2422 if (preferred_charset_id >= 0
2423 && NILP (Fmemq (make_number (preferred_charset_id),
2424 charset_list)))
2425 preferred_charset_id = -1;
2426 break;
2427 default:
2428 abort ();
2429 }
2430 charbuf += -c - 1;
2431 continue;
2432 }
2433
df7492f9
KH
2434 if (ASCII_CHAR_P (c))
2435 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
2436 else if (CHAR_BYTE8_P (c))
2437 {
2438 c = CHAR_TO_BYTE8 (c);
2439 EMIT_ONE_BYTE (c);
2440 }
df7492f9 2441 else
aa72b389 2442 {
df7492f9
KH
2443 struct charset *charset;
2444 unsigned code;
2445 int dimension;
2446 int emacs_mule_id;
2447 unsigned char leading_codes[2];
2448
ff0dacd7
KH
2449 if (preferred_charset_id >= 0)
2450 {
2451 charset = CHARSET_FROM_ID (preferred_charset_id);
2452 if (! CHAR_CHARSET_P (c, charset))
2453 charset = char_charset (c, charset_list, NULL);
2454 }
2455 else
2456 charset = char_charset (c, charset_list, &code);
df7492f9
KH
2457 if (! charset)
2458 {
2459 c = coding->default_char;
2460 if (ASCII_CHAR_P (c))
2461 {
2462 EMIT_ONE_ASCII_BYTE (c);
2463 continue;
2464 }
2465 charset = char_charset (c, charset_list, &code);
2466 }
2467 dimension = CHARSET_DIMENSION (charset);
2468 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2469 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2470 EMIT_ONE_BYTE (leading_codes[0]);
2471 if (leading_codes[1])
2472 EMIT_ONE_BYTE (leading_codes[1]);
2473 if (dimension == 1)
1fa663f9 2474 EMIT_ONE_BYTE (code | 0x80);
aa72b389 2475 else
df7492f9 2476 {
1fa663f9 2477 code |= 0x8080;
df7492f9
KH
2478 EMIT_ONE_BYTE (code >> 8);
2479 EMIT_ONE_BYTE (code & 0xFF);
2480 }
aa72b389 2481 }
aa72b389 2482 }
065e3595 2483 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
2484 coding->produced_char += produced_chars;
2485 coding->produced = dst - coding->destination;
2486 return 0;
aa72b389 2487}
b73bfc1c 2488
4ed46869 2489\f
df7492f9 2490/*** 7. ISO2022 handlers ***/
4ed46869
KH
2491
2492/* The following note describes the coding system ISO2022 briefly.
39787efd 2493 Since the intention of this note is to help understand the
5a936b46 2494 functions in this file, some parts are NOT ACCURATE or are OVERLY
39787efd 2495 SIMPLIFIED. For thorough understanding, please refer to the
5a936b46 2496 original document of ISO2022. This is equivalent to the standard
cfb43547 2497 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
4ed46869
KH
2498
2499 ISO2022 provides many mechanisms to encode several character sets
cfb43547 2500 in 7-bit and 8-bit environments. For 7-bit environments, all text
39787efd
KH
2501 is encoded using bytes less than 128. This may make the encoded
2502 text a little bit longer, but the text passes more easily through
cfb43547 2503 several types of gateway, some of which strip off the MSB (Most
8ca3766a 2504 Significant Bit).
b73bfc1c 2505
cfb43547
DL
2506 There are two kinds of character sets: control character sets and
2507 graphic character sets. The former contain control characters such
4ed46869 2508 as `newline' and `escape' to provide control functions (control
39787efd 2509 functions are also provided by escape sequences). The latter
cfb43547 2510 contain graphic characters such as 'A' and '-'. Emacs recognizes
4ed46869
KH
2511 two control character sets and many graphic character sets.
2512
2513 Graphic character sets are classified into one of the following
39787efd
KH
2514 four classes, according to the number of bytes (DIMENSION) and
2515 number of characters in one dimension (CHARS) of the set:
2516 - DIMENSION1_CHARS94
2517 - DIMENSION1_CHARS96
2518 - DIMENSION2_CHARS94
2519 - DIMENSION2_CHARS96
2520
2521 In addition, each character set is assigned an identification tag,
cfb43547 2522 unique for each set, called the "final character" (denoted as <F>
39787efd
KH
2523 hereafter). The <F> of each character set is decided by ECMA(*)
2524 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2525 (0x30..0x3F are for private use only).
4ed46869
KH
2526
2527 Note (*): ECMA = European Computer Manufacturers Association
2528
cfb43547 2529 Here are examples of graphic character sets [NAME(<F>)]:
4ed46869
KH
2530 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2531 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2532 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2533 o DIMENSION2_CHARS96 -- none for the moment
2534
39787efd 2535 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4ed46869
KH
2536 C0 [0x00..0x1F] -- control character plane 0
2537 GL [0x20..0x7F] -- graphic character plane 0
2538 C1 [0x80..0x9F] -- control character plane 1
2539 GR [0xA0..0xFF] -- graphic character plane 1
2540
2541 A control character set is directly designated and invoked to C0 or
39787efd
KH
2542 C1 by an escape sequence. The most common case is that:
2543 - ISO646's control character set is designated/invoked to C0, and
2544 - ISO6429's control character set is designated/invoked to C1,
2545 and usually these designations/invocations are omitted in encoded
2546 text. In a 7-bit environment, only C0 can be used, and a control
2547 character for C1 is encoded by an appropriate escape sequence to
2548 fit into the environment. All control characters for C1 are
2549 defined to have corresponding escape sequences.
4ed46869
KH
2550
2551 A graphic character set is at first designated to one of four
2552 graphic registers (G0 through G3), then these graphic registers are
2553 invoked to GL or GR. These designations and invocations can be
2554 done independently. The most common case is that G0 is invoked to
39787efd
KH
2555 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2556 these invocations and designations are omitted in encoded text.
2557 In a 7-bit environment, only GL can be used.
4ed46869 2558
39787efd
KH
2559 When a graphic character set of CHARS94 is invoked to GL, codes
2560 0x20 and 0x7F of the GL area work as control characters SPACE and
2561 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2562 be used.
4ed46869
KH
2563
2564 There are two ways of invocation: locking-shift and single-shift.
2565 With locking-shift, the invocation lasts until the next different
39787efd
KH
2566 invocation, whereas with single-shift, the invocation affects the
2567 following character only and doesn't affect the locking-shift
2568 state. Invocations are done by the following control characters or
2569 escape sequences:
4ed46869
KH
2570
2571 ----------------------------------------------------------------------
39787efd 2572 abbrev function cntrl escape seq description
4ed46869 2573 ----------------------------------------------------------------------
39787efd
KH
2574 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2575 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2576 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2577 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2578 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2579 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2580 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2581 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2582 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4ed46869 2583 ----------------------------------------------------------------------
39787efd
KH
2584 (*) These are not used by any known coding system.
2585
2586 Control characters for these functions are defined by macros
2587 ISO_CODE_XXX in `coding.h'.
4ed46869 2588
39787efd 2589 Designations are done by the following escape sequences:
4ed46869
KH
2590 ----------------------------------------------------------------------
2591 escape sequence description
2592 ----------------------------------------------------------------------
2593 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2594 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2595 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2596 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2597 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2598 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2599 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2600 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2601 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2602 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2603 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2604 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2605 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2606 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2607 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2608 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2609 ----------------------------------------------------------------------
2610
2611 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
39787efd 2612 of dimension 1, chars 94, and final character <F>, etc...
4ed46869
KH
2613
2614 Note (*): Although these designations are not allowed in ISO2022,
2615 Emacs accepts them on decoding, and produces them on encoding
39787efd 2616 CHARS96 character sets in a coding system which is characterized as
4ed46869
KH
2617 7-bit environment, non-locking-shift, and non-single-shift.
2618
2619 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
df7492f9 2620 '(' must be omitted. We refer to this as "short-form" hereafter.
4ed46869 2621
cfb43547 2622 Now you may notice that there are a lot of ways of encoding the
39787efd
KH
2623 same multilingual text in ISO2022. Actually, there exist many
2624 coding systems such as Compound Text (used in X11's inter client
8ca3766a
DL
2625 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2626 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
4ed46869
KH
2627 localized platforms), and all of these are variants of ISO2022.
2628
2629 In addition to the above, Emacs handles two more kinds of escape
2630 sequences: ISO6429's direction specification and Emacs' private
2631 sequence for specifying character composition.
2632
39787efd 2633 ISO6429's direction specification takes the following form:
4ed46869
KH
2634 o CSI ']' -- end of the current direction
2635 o CSI '0' ']' -- end of the current direction
2636 o CSI '1' ']' -- start of left-to-right text
2637 o CSI '2' ']' -- start of right-to-left text
2638 The control character CSI (0x9B: control sequence introducer) is
39787efd
KH
2639 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2640
2641 Character composition specification takes the following form:
ec6d2bb8
KH
2642 o ESC '0' -- start relative composition
2643 o ESC '1' -- end composition
2644 o ESC '2' -- start rule-base composition (*)
2645 o ESC '3' -- start relative composition with alternate chars (**)
2646 o ESC '4' -- start rule-base composition with alternate chars (**)
b73bfc1c 2647 Since these are not standard escape sequences of any ISO standard,
cfb43547 2648 the use of them with these meanings is restricted to Emacs only.
ec6d2bb8 2649
5a936b46
DL
2650 (*) This form is used only in Emacs 20.7 and older versions,
2651 but newer versions can safely decode it.
cfb43547 2652 (**) This form is used only in Emacs 21.1 and newer versions,
5a936b46 2653 and older versions can't decode it.
ec6d2bb8 2654
cfb43547 2655 Here's a list of example usages of these composition escape
b73bfc1c 2656 sequences (categorized by `enum composition_method').
ec6d2bb8 2657
b73bfc1c 2658 COMPOSITION_RELATIVE:
ec6d2bb8 2659 ESC 0 CHAR [ CHAR ] ESC 1
8ca3766a 2660 COMPOSITION_WITH_RULE:
ec6d2bb8 2661 ESC 2 CHAR [ RULE CHAR ] ESC 1
b73bfc1c 2662 COMPOSITION_WITH_ALTCHARS:
ec6d2bb8 2663 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
b73bfc1c 2664 COMPOSITION_WITH_RULE_ALTCHARS:
ec6d2bb8 2665 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
4ed46869
KH
2666
2667enum iso_code_class_type iso_code_class[256];
2668
df7492f9
KH
/* Return nonzero if the charset whose id is ID is safe in CODING,
   i.e. ID is within CODING's safe_charsets table and its entry is
   not 255.  255 is the value the table is filled with before the
   entries of supported charsets are set.

   The entry is compared as an unsigned byte, so the test gives the
   same result whether the table's element type is signed or
   unsigned.  */

#define SAFE_CHARSET_P(coding, id)                              \
  ((id) <= (coding)->max_charset_id                             \
   && (unsigned char) (coding)->safe_charsets[(id)] != 255)


/* Return nonzero if the coding system of coding category CATEGORY
   has an initial designation (a non-negative value) for graphic
   register 1.  */

#define SHIFT_OUT_OK(category)  \
  (CODING_ISO_INITIAL (&coding_categories[(category)], 1) >= 0)
2676
2677static void
f0064e1f
DL
2678setup_iso_safe_charsets (attrs)
2679 Lisp_Object attrs;
df7492f9
KH
2680{
2681 Lisp_Object charset_list, safe_charsets;
2682 Lisp_Object request;
2683 Lisp_Object reg_usage;
2684 Lisp_Object tail;
2685 int reg94, reg96;
2686 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2687 int max_charset_id;
2688
2689 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2690 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2691 && ! EQ (charset_list, Viso_2022_charset_list))
2692 {
2693 CODING_ATTR_CHARSET_LIST (attrs)
2694 = charset_list = Viso_2022_charset_list;
2695 ASET (attrs, coding_attr_safe_charsets, Qnil);
2696 }
2697
2698 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2699 return;
2700
2701 max_charset_id = 0;
2702 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2703 {
2704 int id = XINT (XCAR (tail));
2705 if (max_charset_id < id)
2706 max_charset_id = id;
2707 }
d46c5b12 2708
df7492f9
KH
2709 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2710 make_number (255));
2711 request = AREF (attrs, coding_attr_iso_request);
2712 reg_usage = AREF (attrs, coding_attr_iso_usage);
2713 reg94 = XINT (XCAR (reg_usage));
2714 reg96 = XINT (XCDR (reg_usage));
2715
2716 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2717 {
2718 Lisp_Object id;
2719 Lisp_Object reg;
2720 struct charset *charset;
2721
2722 id = XCAR (tail);
2723 charset = CHARSET_FROM_ID (XINT (id));
bf16eb23 2724 reg = Fcdr (Fassq (id, request));
df7492f9 2725 if (! NILP (reg))
8f924df7 2726 SSET (safe_charsets, XINT (id), XINT (reg));
df7492f9
KH
2727 else if (charset->iso_chars_96)
2728 {
2729 if (reg96 < 4)
8f924df7 2730 SSET (safe_charsets, XINT (id), reg96);
df7492f9
KH
2731 }
2732 else
2733 {
2734 if (reg94 < 4)
8f924df7 2735 SSET (safe_charsets, XINT (id), reg94);
df7492f9
KH
2736 }
2737 }
2738 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2739}
d46c5b12 2740
b6871cc7 2741
4ed46869 2742/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
2743 Check if a text is encoded in one of ISO-2022 based codig systems.
2744 If it is, return 1, else return 0. */
4ed46869 2745
0a28aafb 2746static int
ff0dacd7 2747detect_coding_iso_2022 (coding, detect_info)
df7492f9 2748 struct coding_system *coding;
ff0dacd7 2749 struct coding_detection_info *detect_info;
4ed46869 2750{
8f924df7
KH
2751 const unsigned char *src = coding->source, *src_base = src;
2752 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9 2753 int multibytep = coding->src_multibyte;
ff0dacd7 2754 int single_shifting = 0;
df7492f9
KH
2755 int id;
2756 int c, c1;
2757 int consumed_chars = 0;
2758 int i;
ff0dacd7
KH
2759 int rejected = 0;
2760 int found = 0;
2761
2762 detect_info->checked |= CATEGORY_MASK_ISO;
df7492f9
KH
2763
2764 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2765 {
2766 struct coding_system *this = &(coding_categories[i]);
2767 Lisp_Object attrs, val;
2768
c6b278e7
KH
2769 if (this->id < 0)
2770 continue;
df7492f9
KH
2771 attrs = CODING_ID_ATTRS (this->id);
2772 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2773 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2774 setup_iso_safe_charsets (attrs);
2775 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7
KH
2776 this->max_charset_id = SCHARS (val) - 1;
2777 this->safe_charsets = (char *) SDATA (val);
df7492f9
KH
2778 }
2779
2780 /* A coding system of this category is always ASCII compatible. */
2781 src += coding->head_ascii;
3f003981 2782
ff0dacd7 2783 while (rejected != CATEGORY_MASK_ISO)
4ed46869 2784 {
065e3595 2785 src_base = src;
df7492f9 2786 ONE_MORE_BYTE (c);
4ed46869
KH
2787 switch (c)
2788 {
2789 case ISO_CODE_ESC:
74383408
KH
2790 if (inhibit_iso_escape_detection)
2791 break;
f46869e4 2792 single_shifting = 0;
df7492f9 2793 ONE_MORE_BYTE (c);
d46c5b12 2794 if (c >= '(' && c <= '/')
4ed46869 2795 {
bf9cdd4e 2796 /* Designation sequence for a charset of dimension 1. */
df7492f9 2797 ONE_MORE_BYTE (c1);
d46c5b12 2798 if (c1 < ' ' || c1 >= 0x80
df7492f9 2799 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
d46c5b12
KH
2800 /* Invalid designation sequence. Just ignore. */
2801 break;
bf9cdd4e
KH
2802 }
2803 else if (c == '$')
2804 {
2805 /* Designation sequence for a charset of dimension 2. */
df7492f9 2806 ONE_MORE_BYTE (c);
bf9cdd4e
KH
2807 if (c >= '@' && c <= 'B')
2808 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
ff0dacd7 2809 id = iso_charset_table[1][0][c];
bf9cdd4e 2810 else if (c >= '(' && c <= '/')
bcf26d6a 2811 {
df7492f9 2812 ONE_MORE_BYTE (c1);
d46c5b12 2813 if (c1 < ' ' || c1 >= 0x80
df7492f9 2814 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
d46c5b12
KH
2815 /* Invalid designation sequence. Just ignore. */
2816 break;
bcf26d6a 2817 }
bf9cdd4e 2818 else
ff0dacd7 2819 /* Invalid designation sequence. Just ignore it. */
d46c5b12
KH
2820 break;
2821 }
ae9ff118 2822 else if (c == 'N' || c == 'O')
d46c5b12 2823 {
ae9ff118 2824 /* ESC <Fe> for SS2 or SS3. */
ff0dacd7
KH
2825 single_shifting = 1;
2826 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
d46c5b12 2827 break;
4ed46869 2828 }
ec6d2bb8
KH
2829 else if (c >= '0' && c <= '4')
2830 {
2831 /* ESC <Fp> for start/end composition. */
ff0dacd7 2832 found |= CATEGORY_MASK_ISO;
ec6d2bb8
KH
2833 break;
2834 }
bf9cdd4e 2835 else
df7492f9 2836 {
ff0dacd7 2837 /* Invalid escape sequence. Just ignore it. */
df7492f9
KH
2838 break;
2839 }
d46c5b12
KH
2840
2841 /* We found a valid designation sequence for CHARSET. */
ff0dacd7 2842 rejected |= CATEGORY_MASK_ISO_8BIT;
df7492f9
KH
2843 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2844 id))
ff0dacd7 2845 found |= CATEGORY_MASK_ISO_7;
d46c5b12 2846 else
ff0dacd7 2847 rejected |= CATEGORY_MASK_ISO_7;
df7492f9
KH
2848 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2849 id))
ff0dacd7 2850 found |= CATEGORY_MASK_ISO_7_TIGHT;
d46c5b12 2851 else
ff0dacd7 2852 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
df7492f9
KH
2853 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2854 id))
ff0dacd7 2855 found |= CATEGORY_MASK_ISO_7_ELSE;
ae9ff118 2856 else
ff0dacd7 2857 rejected |= CATEGORY_MASK_ISO_7_ELSE;
df7492f9
KH
2858 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2859 id))
ff0dacd7 2860 found |= CATEGORY_MASK_ISO_8_ELSE;
ae9ff118 2861 else
ff0dacd7 2862 rejected |= CATEGORY_MASK_ISO_8_ELSE;
4ed46869
KH
2863 break;
2864
4ed46869 2865 case ISO_CODE_SO:
d46c5b12 2866 case ISO_CODE_SI:
ff0dacd7 2867 /* Locking shift out/in. */
74383408
KH
2868 if (inhibit_iso_escape_detection)
2869 break;
f46869e4 2870 single_shifting = 0;
ff0dacd7 2871 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
d46c5b12
KH
2872 break;
2873
4ed46869 2874 case ISO_CODE_CSI:
ff0dacd7 2875 /* Control sequence introducer. */
f46869e4 2876 single_shifting = 0;
ff0dacd7
KH
2877 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2878 found |= CATEGORY_MASK_ISO_8_ELSE;
2879 goto check_extra_latin;
2880
4ed46869
KH
2881 case ISO_CODE_SS2:
2882 case ISO_CODE_SS3:
ff0dacd7
KH
2883 /* Single shift. */
2884 if (inhibit_iso_escape_detection)
2885 break;
75e2a253 2886 single_shifting = 0;
ff0dacd7
KH
2887 rejected |= CATEGORY_MASK_ISO_7BIT;
2888 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2889 & CODING_ISO_FLAG_SINGLE_SHIFT)
75e2a253 2890 found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
ff0dacd7
KH
2891 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2892 & CODING_ISO_FLAG_SINGLE_SHIFT)
75e2a253
KH
2893 found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
2894 if (single_shifting)
2895 break;
ff0dacd7 2896 goto check_extra_latin;
4ed46869
KH
2897
2898 default:
065e3595
KH
2899 if (c < 0)
2900 continue;
4ed46869 2901 if (c < 0x80)
f46869e4
KH
2902 {
2903 single_shifting = 0;
2904 break;
2905 }
ff0dacd7 2906 if (c >= 0xA0)
c4825358 2907 {
ff0dacd7
KH
2908 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2909 found |= CATEGORY_MASK_ISO_8_1;
f46869e4 2910 /* Check the length of succeeding codes of the range
ff0dacd7
KH
2911 0xA0..0FF. If the byte length is even, we include
2912 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2913 only when we are not single shifting. */
2914 if (! single_shifting
2915 && ! (rejected & CATEGORY_MASK_ISO_8_2))
f46869e4 2916 {
e17de821 2917 int i = 1;
b73bfc1c
KH
2918 while (src < src_end)
2919 {
df7492f9 2920 ONE_MORE_BYTE (c);
b73bfc1c
KH
2921 if (c < 0xA0)
2922 break;
2923 i++;
2924 }
2925
2926 if (i & 1 && src < src_end)
ff0dacd7 2927 rejected |= CATEGORY_MASK_ISO_8_2;
f46869e4 2928 else
ff0dacd7 2929 found |= CATEGORY_MASK_ISO_8_2;
f46869e4 2930 }
ff0dacd7 2931 break;
4ed46869 2932 }
ff0dacd7
KH
2933 check_extra_latin:
2934 single_shifting = 0;
2935 if (! VECTORP (Vlatin_extra_code_table)
2936 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2937 {
2938 rejected = CATEGORY_MASK_ISO;
2939 break;
2940 }
2941 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2942 & CODING_ISO_FLAG_LATIN_EXTRA)
2943 found |= CATEGORY_MASK_ISO_8_1;
2944 else
2945 rejected |= CATEGORY_MASK_ISO_8_1;
75e2a253 2946 rejected |= CATEGORY_MASK_ISO_8_2;
4ed46869
KH
2947 }
2948 }
ff0dacd7
KH
2949 detect_info->rejected |= CATEGORY_MASK_ISO;
2950 return 0;
4ed46869 2951
df7492f9 2952 no_more_source:
ff0dacd7
KH
2953 detect_info->rejected |= rejected;
2954 detect_info->found |= (found & ~rejected);
df7492f9 2955 return 1;
4ed46869 2956}
ec6d2bb8 2957
4ed46869 2958
134b9549
KH
/* Record in CODING that the charset selected by dimension DIM, the
   94/96 flag CHARS_96 and the final byte FINAL is designated to
   graphic register REG.

   If FINAL is out of range, no charset is registered for the
   combination, or the charset is not safe for CODING, the
   designation of REG is set to -2 and CHARS_96 is set to -1.
   CHARS_96 is also set to -1 when a previous invalid designation
   (-2) of REG is now replaced by ASCII.  The caller treats
   CHARS_96 < 0 as "keep the escape sequence as is".

   JISX0201-Roman is replaced by ASCII, and JISX0208.1978 by
   JISX0208, when the corresponding CODING_ISO_FLAG_USE_* flag is
   set.  CHARS_96 must be an lvalue; `coding' is taken from the
   enclosing scope.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final) \
  do { \
    int desig_id, old_desig; \
 \
    if ((final) >= '0' && (final) < 128 \
        && (desig_id = ISO_CHARSET_TABLE (dim, chars_96, final)) >= 0 \
        && SAFE_CHARSET_P (coding, desig_id)) \
      { \
        old_desig = CODING_ISO_DESIGNATION (coding, reg); \
        if (desig_id == charset_jisx0201_roman) \
          { \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
              desig_id = charset_ascii; \
          } \
        else if (desig_id == charset_jisx0208_1978) \
          { \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
              desig_id = charset_jisx0208; \
          } \
        CODING_ISO_DESIGNATION (coding, reg) = desig_id; \
        /* If there was an invalid designation to REG previously, \
           and this designation is ASCII to REG, we should keep \
           this designation sequence.  */ \
        if (old_desig == -2 && desig_id == charset_ascii) \
          chars_96 = -1; \
      } \
    else \
      { \
        /* Unknown or unsafe charset: remember the failure.  */ \
        CODING_ISO_DESIGNATION (coding, reg) = -2; \
        chars_96 = -1; \
      } \
  } while (0)
2991
d46c5b12 2992
df7492f9
KH
/* If a composition is in progress, abandon it: copy the pending
   entries of `components' to CHARBUF as ordinary characters, advance
   `char_offset', and reset `composition_state' to COMPOSING_NO.
   Does nothing when no composition is in progress.

   For COMPOSITION_RELATIVE and COMPOSITION_WITH_ALTCHARS every entry
   is copied; for the other methods only every second entry is copied
   (the entries in between are skipped).  Jumps to `no_more_source'
   when CHARBUF does not have room for `component_idx' more
   elements.  */
#define MAYBE_FINISH_COMPOSITION() \
  do { \
    if (composition_state != COMPOSING_NO) \
      { \
        int stride, k; \
 \
        if (charbuf + component_idx > charbuf_end) \
          goto no_more_source; \
        composition_state = COMPOSING_NO; \
        stride = ((method == COMPOSITION_RELATIVE \
                   || method == COMPOSITION_WITH_ALTCHARS) \
                  ? 1 : 2); \
        for (k = 0; k < component_idx; k += stride) \
          *charbuf++ = components[k]; \
        if (stride == 1) \
          char_offset += component_idx; \
        else \
          char_offset += (component_idx / 2) + 1; \
      } \
  } while (0)
3017
d46c5b12 3018
aa72b389
KH
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   C1 is the byte following ESC.  An ESC 0 seen in state
   COMPOSING_COMPONENT_RULE only marks the end of the component part:
   `component_len' is set and the state becomes COMPOSING_CHAR.
   Otherwise any composition in progress is finished and a new one is
   started, provided that CHARBUF has room for
   MAX_COMPOSITION_COMPONENTS elements and that the terminating
   ESC 1 is present in the remaining source.  If not, control jumps
   to `no_more_source'; in the latter case
   CODING_RESULT_INSUFFICIENT_SRC is recorded first.  */

#define DECODE_COMPOSITION_START(c1) \
  do { \
    if (c1 == '0' \
        && composition_state == COMPOSING_COMPONENT_RULE) \
      { \
        component_len = component_idx; \
        composition_state = COMPOSING_CHAR; \
      } \
    else \
      { \
        const unsigned char *p; \
 \
        MAYBE_FINISH_COMPOSITION (); \
        if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
          goto no_more_source; \
        /* Look for the terminating ESC 1.  */ \
        for (p = src; p < src_end - 1; p++) \
          if (*p == ISO_CODE_ESC && p[1] == '1') \
            break; \
        /* `>=' rather than `==': when SRC is already at SRC_END the \
           loop above never runs and P starts beyond SRC_END - 1; \
           that too means no terminator was found.  */ \
        if (p >= src_end - 1) \
          { \
            /* The current composition doesn't end in the current \
               source.  */ \
            record_conversion_result \
              (coding, CODING_RESULT_INSUFFICIENT_SRC); \
            goto no_more_source; \
          } \
 \
        /* This is surely the start of a composition.  */ \
        method = (c1 == '0' ? COMPOSITION_RELATIVE \
                  : c1 == '2' ? COMPOSITION_WITH_RULE \
                  : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
                  : COMPOSITION_WITH_RULE_ALTCHARS); \
        composition_state = (c1 <= '2' ? COMPOSING_CHAR \
                             : COMPOSING_COMPONENT_CHAR); \
        component_idx = component_len = 0; \
      } \
  } while (0)
3063
ec6d2bb8 3064
df7492f9
KH
/* Handle composition end sequence ESC 1.

   Emit a composition annotation into CHARBUF via
   ADD_COMPOSITION_DATA, then, for methods other than
   COMPOSITION_RELATIVE, the component entries, and store the
   (negative) difference between the annotation start and the
   current position at the annotation start.  Finally emit the
   composed characters themselves, incrementing `char_offset' for
   each, mark CODING as annotated and leave composition state.  */

#define DECODE_COMPOSITION_END() \
  do { \
    int *annotation_head = charbuf; \
    int composed_chars; \
    int first, stride, k; \
 \
    if (component_len > 0) \
      composed_chars = component_idx - component_len; \
    else if (method == COMPOSITION_RELATIVE) \
      composed_chars = component_idx; \
    else \
      composed_chars = (component_idx + 1) / 2; \
 \
    ADD_COMPOSITION_DATA (charbuf, composed_chars, method); \
    if (method != COMPOSITION_RELATIVE) \
      { \
        /* Without a separate component part, all entries are \
           components.  */ \
        int ncomponents = (component_len == 0 \
                           ? component_idx : component_len); \
 \
        for (k = 0; k < ncomponents; k++) \
          *charbuf++ = components[k]; \
        *annotation_head = annotation_head - charbuf; \
      } \
 \
    if (method == COMPOSITION_WITH_RULE) \
      first = 0, stride = 2; \
    else \
      first = component_len, stride = 1; \
    for (k = first; k < component_idx; k += stride, char_offset++) \
      *charbuf++ = components[k]; \
 \
    coding->annotated = 1; \
    composition_state = COMPOSING_NO; \
  } while (0)
3095
df7492f9 3096
ec6d2bb8
KH
/* Decode a composition rule from the byte C1 (and maybe one more
   byte read from SRC into C2) and replace C1 by the encoded
   composition rule, or by 0 when C1 is out of range.  C1 must be an
   lvalue; C2 is taken from the enclosing scope.  */

#define DECODE_COMPOSITION_RULE(c1) \
  do { \
    (c1) -= 32; \
    if (c1 < 81)  /* old format (before ver.21) */ \
      { \
        int global_ref = (c1) / 9; \
        int new_ref = (c1) % 9; \
 \
        /* In either position the value 4 is mapped to 10.  */ \
        global_ref = (global_ref == 4 ? 10 : global_ref); \
        new_ref = (new_ref == 4 ? 10 : new_ref); \
        c1 = COMPOSITION_ENCODE_RULE (global_ref, new_ref); \
      } \
    else if (c1 < 93)  /* new format (after ver.21) */ \
      { \
        ONE_MORE_BYTE (c2); \
        c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
      } \
    else \
      c1 = 0; \
  } while (0)
88993dfd 3120
d46c5b12 3121
4ed46869
KH
3122/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3123
b73bfc1c 3124static void
df7492f9 3125decode_coding_iso_2022 (coding)
4ed46869 3126 struct coding_system *coding;
4ed46869 3127{
8f924df7
KH
3128 const unsigned char *src = coding->source + coding->consumed;
3129 const unsigned char *src_end = coding->source + coding->src_bytes;
3130 const unsigned char *src_base;
69a80ea3 3131 int *charbuf = coding->charbuf + coding->charbuf_used;
ff0dacd7 3132 int *charbuf_end
69a80ea3 3133 = coding->charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
df7492f9 3134 int consumed_chars = 0, consumed_chars_base;
df7492f9 3135 int multibytep = coding->src_multibyte;
4ed46869 3136 /* Charsets invoked to graphic plane 0 and 1 respectively. */
df7492f9
KH
3137 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3138 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
134b9549 3139 int charset_id_2, charset_id_3;
df7492f9
KH
3140 struct charset *charset;
3141 int c;
3142 /* For handling composition sequence. */
3143#define COMPOSING_NO 0
3144#define COMPOSING_CHAR 1
3145#define COMPOSING_RULE 2
3146#define COMPOSING_COMPONENT_CHAR 3
3147#define COMPOSING_COMPONENT_RULE 4
3148
3149 int composition_state = COMPOSING_NO;
3150 enum composition_method method;
3151 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
3152 int component_idx;
3153 int component_len;
24a73b0a 3154 Lisp_Object attrs, charset_list;
ff0dacd7
KH
3155 int char_offset = coding->produced_char;
3156 int last_offset = char_offset;
3157 int last_id = charset_ascii;
119852e7
KH
3158 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3159 int byte_after_cr = -1;
df7492f9 3160
24a73b0a 3161 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9 3162 setup_iso_safe_charsets (attrs);
287c57d7
KH
3163 /* Charset list may have been changed. */
3164 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3165 coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
b73bfc1c
KH
3166
3167 while (1)
4ed46869 3168 {
463f5630 3169 int c1, c2;
b73bfc1c
KH
3170
3171 src_base = src;
df7492f9
KH
3172 consumed_chars_base = consumed_chars;
3173
3174 if (charbuf >= charbuf_end)
3175 break;
3176
119852e7
KH
3177 if (byte_after_cr >= 0)
3178 c1 = byte_after_cr, byte_after_cr = -1;
3179 else
3180 ONE_MORE_BYTE (c1);
065e3595
KH
3181 if (c1 < 0)
3182 goto invalid_code;
4ed46869 3183
98725083 3184 /* We produce at most one character. */
4ed46869
KH
3185 switch (iso_code_class [c1])
3186 {
3187 case ISO_0x20_or_0x7F:
df7492f9 3188 if (composition_state != COMPOSING_NO)
ec6d2bb8 3189 {
df7492f9
KH
3190 if (composition_state == COMPOSING_RULE
3191 || composition_state == COMPOSING_COMPONENT_RULE)
3192 {
3193 DECODE_COMPOSITION_RULE (c1);
3194 components[component_idx++] = c1;
3195 composition_state--;
3196 continue;
3197 }
4ed46869 3198 }
df7492f9
KH
3199 if (charset_id_0 < 0
3200 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
781d7a48
KH
3201 /* This is SPACE or DEL. */
3202 charset = CHARSET_FROM_ID (charset_ascii);
3203 else
3204 charset = CHARSET_FROM_ID (charset_id_0);
3205 break;
4ed46869
KH
3206
3207 case ISO_graphic_plane_0:
781d7a48 3208 if (composition_state != COMPOSING_NO)
b73bfc1c 3209 {
781d7a48
KH
3210 if (composition_state == COMPOSING_RULE
3211 || composition_state == COMPOSING_COMPONENT_RULE)
3212 {
3213 DECODE_COMPOSITION_RULE (c1);
3214 components[component_idx++] = c1;
3215 composition_state--;
3216 continue;
3217 }
b73bfc1c 3218 }
134b9549
KH
3219 if (charset_id_0 < 0)
3220 charset = CHARSET_FROM_ID (charset_ascii);
3221 else
3222 charset = CHARSET_FROM_ID (charset_id_0);
4ed46869
KH
3223 break;
3224
3225 case ISO_0xA0_or_0xFF:
df7492f9
KH
3226 if (charset_id_1 < 0
3227 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3228 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3229 goto invalid_code;
4ed46869
KH
3230 /* This is a graphic character, we fall down ... */
3231
3232 case ISO_graphic_plane_1:
df7492f9
KH
3233 if (charset_id_1 < 0)
3234 goto invalid_code;
3235 charset = CHARSET_FROM_ID (charset_id_1);
4ed46869
KH
3236 break;
3237
df7492f9 3238 case ISO_control_0:
119852e7
KH
3239 if (eol_crlf && c1 == '\r')
3240 ONE_MORE_BYTE (byte_after_cr);
df7492f9
KH
3241 MAYBE_FINISH_COMPOSITION ();
3242 charset = CHARSET_FROM_ID (charset_ascii);
4ed46869
KH
3243 break;
3244
df7492f9
KH
3245 case ISO_control_1:
3246 MAYBE_FINISH_COMPOSITION ();
3247 goto invalid_code;
3248
4ed46869 3249 case ISO_shift_out:
df7492f9
KH
3250 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3251 || CODING_ISO_DESIGNATION (coding, 1) < 0)
3252 goto invalid_code;
3253 CODING_ISO_INVOCATION (coding, 0) = 1;
3254 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3255 continue;
4ed46869
KH
3256
3257 case ISO_shift_in:
df7492f9
KH
3258 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3259 goto invalid_code;
3260 CODING_ISO_INVOCATION (coding, 0) = 0;
3261 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3262 continue;
4ed46869
KH
3263
3264 case ISO_single_shift_2_7:
3265 case ISO_single_shift_2:
df7492f9
KH
3266 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3267 goto invalid_code;
4ed46869
KH
3268 /* SS2 is handled as an escape sequence of ESC 'N' */
3269 c1 = 'N';
3270 goto label_escape_sequence;
3271
3272 case ISO_single_shift_3:
df7492f9
KH
3273 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3274 goto invalid_code;
4ed46869
KH
3275 /* SS2 is handled as an escape sequence of ESC 'O' */
3276 c1 = 'O';
3277 goto label_escape_sequence;
3278
3279 case ISO_control_sequence_introducer:
3280 /* CSI is handled as an escape sequence of ESC '[' ... */
3281 c1 = '[';
3282 goto label_escape_sequence;
3283
3284 case ISO_escape:
3285 ONE_MORE_BYTE (c1);
3286 label_escape_sequence:
df7492f9 3287 /* Escape sequences handled here are invocation,
4ed46869
KH
3288 designation, direction specification, and character
3289 composition specification. */
3290 switch (c1)
3291 {
3292 case '&': /* revision of following character set */
3293 ONE_MORE_BYTE (c1);
3294 if (!(c1 >= '@' && c1 <= '~'))
df7492f9 3295 goto invalid_code;
4ed46869
KH
3296 ONE_MORE_BYTE (c1);
3297 if (c1 != ISO_CODE_ESC)
df7492f9 3298 goto invalid_code;
4ed46869
KH
3299 ONE_MORE_BYTE (c1);
3300 goto label_escape_sequence;
3301
3302 case '$': /* designation of 2-byte character set */
df7492f9
KH
3303 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3304 goto invalid_code;
134b9549
KH
3305 {
3306 int reg, chars96;
3307
3308 ONE_MORE_BYTE (c1);
3309 if (c1 >= '@' && c1 <= 'B')
3310 { /* designation of JISX0208.1978, GB2312.1980,
88993dfd 3311 or JISX0208.1980 */
134b9549
KH
3312 reg = 0, chars96 = 0;
3313 }
3314 else if (c1 >= 0x28 && c1 <= 0x2B)
3315 { /* designation of DIMENSION2_CHARS94 character set */
3316 reg = c1 - 0x28, chars96 = 0;
3317 ONE_MORE_BYTE (c1);
3318 }
3319 else if (c1 >= 0x2C && c1 <= 0x2F)
3320 { /* designation of DIMENSION2_CHARS96 character set */
3321 reg = c1 - 0x2C, chars96 = 1;
3322 ONE_MORE_BYTE (c1);
3323 }
3324 else
3325 goto invalid_code;
3326 DECODE_DESIGNATION (reg, 2, chars96, c1);
3327 /* We must update these variables now. */
3328 if (reg == 0)
3329 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3330 else if (reg == 1)
3331 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3332 if (chars96 < 0)
3333 goto invalid_code;
3334 }
b73bfc1c 3335 continue;
4ed46869
KH
3336
3337 case 'n': /* invocation of locking-shift-2 */
df7492f9
KH
3338 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3339 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3340 goto invalid_code;
3341 CODING_ISO_INVOCATION (coding, 0) = 2;
3342 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3343 continue;
4ed46869
KH
3344
3345 case 'o': /* invocation of locking-shift-3 */
df7492f9
KH
3346 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3347 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3348 goto invalid_code;
3349 CODING_ISO_INVOCATION (coding, 0) = 3;
3350 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3351 continue;
4ed46869
KH
3352
3353 case 'N': /* invocation of single-shift-2 */
df7492f9
KH
3354 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3355 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3356 goto invalid_code;
134b9549
KH
3357 charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3358 if (charset_id_2 < 0)
3359 charset = CHARSET_FROM_ID (charset_ascii);
3360 else
3361 charset = CHARSET_FROM_ID (charset_id_2);
b73bfc1c 3362 ONE_MORE_BYTE (c1);
e7046a18 3363 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
df7492f9 3364 goto invalid_code;
4ed46869
KH
3365 break;
3366
3367 case 'O': /* invocation of single-shift-3 */
df7492f9
KH
3368 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3369 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3370 goto invalid_code;
134b9549
KH
3371 charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3372 if (charset_id_3 < 0)
3373 charset = CHARSET_FROM_ID (charset_ascii);
3374 else
3375 charset = CHARSET_FROM_ID (charset_id_3);
b73bfc1c 3376 ONE_MORE_BYTE (c1);
e7046a18 3377 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
df7492f9 3378 goto invalid_code;
4ed46869
KH
3379 break;
3380
ec6d2bb8 3381 case '0': case '2': case '3': case '4': /* start composition */
df7492f9
KH
3382 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3383 goto invalid_code;
ec6d2bb8 3384 DECODE_COMPOSITION_START (c1);
b73bfc1c 3385 continue;
4ed46869 3386
ec6d2bb8 3387 case '1': /* end composition */
df7492f9
KH
3388 if (composition_state == COMPOSING_NO)
3389 goto invalid_code;
3390 DECODE_COMPOSITION_END ();
b73bfc1c 3391 continue;
4ed46869
KH
3392
3393 case '[': /* specification of direction */
df7492f9
KH
3394 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3395 goto invalid_code;
4ed46869 3396 /* For the moment, nested direction is not supported.
d46c5b12 3397 So, `coding->mode & CODING_MODE_DIRECTION' zero means
df7492f9 3398 left-to-right, and nozero means right-to-left. */
4ed46869
KH
3399 ONE_MORE_BYTE (c1);
3400 switch (c1)
3401 {
3402 case ']': /* end of the current direction */
d46c5b12 3403 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869
KH
3404
3405 case '0': /* end of the current direction */
3406 case '1': /* start of left-to-right direction */
3407 ONE_MORE_BYTE (c1);
3408 if (c1 == ']')
d46c5b12 3409 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869 3410 else
df7492f9 3411 goto invalid_code;
4ed46869
KH
3412 break;
3413
3414 case '2': /* start of right-to-left direction */
3415 ONE_MORE_BYTE (c1);
3416 if (c1 == ']')
d46c5b12 3417 coding->mode |= CODING_MODE_DIRECTION;
4ed46869 3418 else
df7492f9 3419 goto invalid_code;
4ed46869
KH
3420 break;
3421
3422 default:
df7492f9 3423 goto invalid_code;
4ed46869 3424 }
b73bfc1c 3425 continue;
4ed46869 3426
103e0180 3427 case '%':
103e0180
KH
3428 ONE_MORE_BYTE (c1);
3429 if (c1 == '/')
3430 {
3431 /* CTEXT extended segment:
3432 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3433 We keep these bytes as is for the moment.
3434 They may be decoded by post-read-conversion. */
3435 int dim, M, L;
4776e638 3436 int size;
8f924df7 3437
103e0180
KH
3438 ONE_MORE_BYTE (dim);
3439 ONE_MORE_BYTE (M);
3440 ONE_MORE_BYTE (L);
3441 size = ((M - 128) * 128) + (L - 128);
4776e638
KH
3442 if (charbuf + 8 + size > charbuf_end)
3443 goto break_loop;
3444 *charbuf++ = ISO_CODE_ESC;
3445 *charbuf++ = '%';
3446 *charbuf++ = '/';
3447 *charbuf++ = dim;
3448 *charbuf++ = BYTE8_TO_CHAR (M);
3449 *charbuf++ = BYTE8_TO_CHAR (L);
103e0180
KH
3450 while (size-- > 0)
3451 {
3452 ONE_MORE_BYTE (c1);
4776e638 3453 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
103e0180 3454 }
103e0180
KH
3455 }
3456 else if (c1 == 'G')
3457 {
103e0180
KH
3458 /* XFree86 extension for embedding UTF-8 in CTEXT:
3459 ESC % G --UTF-8-BYTES-- ESC % @
3460 We keep these bytes as is for the moment.
3461 They may be decoded by post-read-conversion. */
4776e638
KH
3462 int *p = charbuf;
3463
3464 if (p + 6 > charbuf_end)
3465 goto break_loop;
3466 *p++ = ISO_CODE_ESC;
3467 *p++ = '%';
3468 *p++ = 'G';
3469 while (p < charbuf_end)
103e0180
KH
3470 {
3471 ONE_MORE_BYTE (c1);
3472 if (c1 == ISO_CODE_ESC
3473 && src + 1 < src_end
3474 && src[0] == '%'
3475 && src[1] == '@')
9ffd559c
KH
3476 {
3477 src += 2;
3478 break;
3479 }
4776e638 3480 *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
103e0180 3481 }
4776e638
KH
3482 if (p + 3 > charbuf_end)
3483 goto break_loop;
3484 *p++ = ISO_CODE_ESC;
3485 *p++ = '%';
3486 *p++ = '@';
3487 charbuf = p;
103e0180
KH
3488 }
3489 else
4776e638 3490 goto invalid_code;
103e0180 3491 continue;
4776e638 3492 break;
103e0180 3493
4ed46869 3494 default:
df7492f9
KH
3495 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3496 goto invalid_code;
134b9549
KH
3497 {
3498 int reg, chars96;
3499
3500 if (c1 >= 0x28 && c1 <= 0x2B)
3501 { /* designation of DIMENSION1_CHARS94 character set */
3502 reg = c1 - 0x28, chars96 = 0;
3503 ONE_MORE_BYTE (c1);
3504 }
3505 else if (c1 >= 0x2C && c1 <= 0x2F)
3506 { /* designation of DIMENSION1_CHARS96 character set */
3507 reg = c1 - 0x2C, chars96 = 1;
3508 ONE_MORE_BYTE (c1);
3509 }
3510 else
3511 goto invalid_code;
3512 DECODE_DESIGNATION (reg, 1, chars96, c1);
3513 /* We must update these variables now. */
3514 if (reg == 0)
3515 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3516 else if (reg == 1)
3517 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3518 if (chars96 < 0)
3519 goto invalid_code;
3520 }
b73bfc1c 3521 continue;
4ed46869 3522 }
b73bfc1c 3523 }
4ed46869 3524
ff0dacd7
KH
3525 if (charset->id != charset_ascii
3526 && last_id != charset->id)
3527 {
3528 if (last_id != charset_ascii)
69a80ea3 3529 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
ff0dacd7
KH
3530 last_id = charset->id;
3531 last_offset = char_offset;
3532 }
3533
b73bfc1c 3534 /* Now we know CHARSET and 1st position code C1 of a character.
df7492f9
KH
3535 Produce a decoded character while getting 2nd position code
3536 C2 if necessary. */
3537 c1 &= 0x7F;
3538 if (CHARSET_DIMENSION (charset) > 1)
b73bfc1c
KH
3539 {
3540 ONE_MORE_BYTE (c2);
df7492f9 3541 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
b73bfc1c 3542 /* C2 is not in a valid range. */
df7492f9
KH
3543 goto invalid_code;
3544 c1 = (c1 << 8) | (c2 & 0x7F);
3545 if (CHARSET_DIMENSION (charset) > 2)
3546 {
3547 ONE_MORE_BYTE (c2);
3548 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3549 /* C2 is not in a valid range. */
3550 goto invalid_code;
3551 c1 = (c1 << 8) | (c2 & 0x7F);
3552 }
3553 }
3554
3555 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3556 if (c < 0)
3557 {
3558 MAYBE_FINISH_COMPOSITION ();
3559 for (; src_base < src; src_base++, char_offset++)
3560 {
3561 if (ASCII_BYTE_P (*src_base))
3562 *charbuf++ = *src_base;
3563 else
3564 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3565 }
3566 }
3567 else if (composition_state == COMPOSING_NO)
3568 {
3569 *charbuf++ = c;
3570 char_offset++;
4ed46869 3571 }
df7492f9 3572 else
781d7a48
KH
3573 {
3574 components[component_idx++] = c;
3575 if (method == COMPOSITION_WITH_RULE
3576 || (method == COMPOSITION_WITH_RULE_ALTCHARS
3577 && composition_state == COMPOSING_COMPONENT_CHAR))
3578 composition_state++;
4ed46869
KH
3579 }
3580 continue;
3581
df7492f9
KH
3582 invalid_code:
3583 MAYBE_FINISH_COMPOSITION ();
4ed46869 3584 src = src_base;
df7492f9
KH
3585 consumed_chars = consumed_chars_base;
3586 ONE_MORE_BYTE (c);
065e3595 3587 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 3588 char_offset++;
df7492f9 3589 coding->errors++;
4776e638
KH
3590 continue;
3591
3592 break_loop:
3593 break;
4ed46869 3594 }
fb88bf2d 3595
df7492f9 3596 no_more_source:
ff0dacd7 3597 if (last_id != charset_ascii)
69a80ea3 3598 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
3599 coding->consumed_char += consumed_chars_base;
3600 coding->consumed = src_base - coding->source;
3601 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
3602}
3603
b73bfc1c 3604
f4dee582 3605/* ISO2022 encoding stuff. */
4ed46869
KH
3606
3607/*
f4dee582 3608 It is not enough to say just "ISO2022" on encoding, we have to
df7492f9 3609 specify more details. In Emacs, each coding system of ISO2022
4ed46869 3610 variant has the following specifications:
df7492f9 3611 1. Initial designation to G0 thru G3.
4ed46869
KH
3612 2. Allows short-form designation?
3613 3. ASCII should be designated to G0 before control characters?
3614 4. ASCII should be designated to G0 at end of line?
3615 5. 7-bit environment or 8-bit environment?
3616 6. Use locking-shift?
3617 7. Use Single-shift?
3618 And the following two are only for Japanese:
3619 8. Use ASCII in place of JIS0201-1976-Roman?
3620 9. Use JISX0208-1983 in place of JISX0208-1978?
df7492f9
KH
3621 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3622 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
f4dee582 3623 details.
4ed46869
KH
3624*/
3625
3626/* Produce codes (escape sequence) for designating CHARSET to graphic
b73bfc1c
KH
3627 register REG at DST, and increment DST. If <final-char> of CHARSET is
3628 '@', 'A', or 'B' and the coding system CODING allows, produce
3629 designation sequence of short-form. */
4ed46869
KH
3630
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, then record the designation in CODING.

   When the coding system has CODING_ISO_FLAG_REVISION set and CHARSET
   has a non-negative revision number, the sequence is preceded by
   ESC '&' ('@' + revision).  A one-dimension charset uses one of the
   intermediate characters "()*+" (94-set) or ",-./" (96-set) selected
   by REG.  A multi-dimension charset uses '$' followed by the same
   kind of intermediate character; for a 94-set that character is
   omitted (short form) when the long form is not requested, REG is 0
   and the final character is '@', 'A' or 'B'.  */
#define ENCODE_DESIGNATION(charset, reg, coding)                            \
  do {                                                                      \
    unsigned char final_char = CHARSET_ISO_FINAL (charset);                 \
    char *intermediate_char_94 = "()*+";                                    \
    char *intermediate_char_96 = ",-./";                                    \
    int revision                                                            \
      = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)             \
         ? CHARSET_ISO_REVISION (charset)                                   \
         : -1);                                                             \
    int c;                                                                  \
                                                                            \
    if (revision >= 0)                                                      \
      {                                                                     \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                           \
        EMIT_ONE_BYTE ('@' + revision);                                     \
      }                                                                     \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                     \
    if (CHARSET_DIMENSION (charset) != 1)                                   \
      {                                                                     \
        EMIT_ONE_ASCII_BYTE ('$');                                          \
        if (CHARSET_ISO_CHARS_96 (charset))                                 \
          EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);                  \
        else if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM)    \
                 || reg != 0                                                \
                 || final_char < '@' || final_char > 'B')                   \
          EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);                  \
      }                                                                     \
    else                                                                    \
      {                                                                     \
        if (CHARSET_ISO_CHARS_96 (charset))                                 \
          c = intermediate_char_96[reg];                                    \
        else                                                                \
          c = intermediate_char_94[reg];                                    \
        EMIT_ONE_ASCII_BYTE (c);                                            \
      }                                                                     \
    EMIT_ONE_ASCII_BYTE (final_char);                                       \
                                                                            \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);            \
  } while (0)
3673
df7492f9 3674
4ed46869
KH
3675/* The following two macros produce codes (control character or escape
3676 sequence) for ISO2022 single-shift functions (single-shift-2 and
3677 single-shift-3). */
3678
df7492f9
KH
/* Emit single-shift-2: the 8-bit control ISO_CODE_SS2, or ESC 'N' when
   the coding system is flagged as a 7-bit environment.  Afterwards mark
   CODING as being in the middle of a single shift.  */
#define ENCODE_SINGLE_SHIFT_2                                               \
  do {                                                                      \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))         \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                         \
    else                                                                    \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                             \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                                \
  } while (0)
3687
df7492f9
KH
3688
/* Emit single-shift-3: the 8-bit control ISO_CODE_SS3, or ESC 'O' when
   the coding system is flagged as a 7-bit environment.  Afterwards mark
   CODING as being in the middle of a single shift.  */
#define ENCODE_SINGLE_SHIFT_3                                               \
  do {                                                                      \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))         \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                         \
    else                                                                    \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                             \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                                \
  } while (0)
3697
df7492f9 3698
4ed46869
KH
3699/* The following four macros produce codes (control character or
3700 escape sequence) for ISO2022 locking-shift functions (shift-in,
3701 shift-out, locking-shift-2, and locking-shift-3). */
3702
df7492f9
KH
/* Emit shift-in (ISO_CODE_SI) and record that graphic register 0 is
   now invoked to graphic plane 0.  */
#define ENCODE_SHIFT_IN                                 \
  do                                                    \
    {                                                   \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                \
      CODING_ISO_INVOCATION (coding, 0) = 0;            \
    }                                                   \
  while (0)
3708
df7492f9
KH
3709
/* Emit shift-out (ISO_CODE_SO) and record that graphic register 1 is
   now invoked to graphic plane 0.  */
#define ENCODE_SHIFT_OUT                                \
  do                                                    \
    {                                                   \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                \
      CODING_ISO_INVOCATION (coding, 0) = 1;            \
    }                                                   \
  while (0)
3715
df7492f9
KH
3716
/* Emit locking-shift-2 (ESC 'n') and record that graphic register 2 is
   now invoked to graphic plane 0.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do                                                    \
    {                                                   \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');         \
      CODING_ISO_INVOCATION (coding, 0) = 2;            \
    }                                                   \
  while (0)
3722
df7492f9
KH
3723
/* Emit locking-shift-3 and record that graphic register 3 is now
   invoked to graphic plane 0.

   ISO 2022 encodes LS3 as ESC 'o'; ESC 'n' is LS2.  Emitting 'n' here
   would make a decoder invoke register 2 while CODING records
   register 3, so the two sides would disagree about the active
   charset.  */
#define ENCODE_LOCKING_SHIFT_3                  \
  do {                                          \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');   \
    CODING_ISO_INVOCATION (coding, 0) = 3;      \
  } while (0)
3729
df7492f9 3730
f4dee582
RS
3731/* Produce codes for a DIMENSION1 character whose character set is
3732 CHARSET and whose position-code is C1. Designation and invocation
4ed46869
KH
3733 sequences are also produced in advance if necessary. */
3734
6e85d753
KH
/* Emit the single position code C1 of a one-dimension CHARSET.

   CHARSET must be an lvalue: when the coding system has
   CODING_ISO_FLAG_USE_ROMAN set and CHARSET is ASCII, it is replaced
   by the JISX0201-Roman charset.

   The body loops until the byte has been emitted: if a single shift is
   pending the byte is emitted for that shift (7-bit or 8-bit form
   according to CODING_ISO_FLAG_SEVEN_BITS); if CHARSET is invoked to
   plane 0 the byte is emitted with the high bit clear; if it is invoked
   to plane 1, with the high bit set.  Otherwise
   encode_invocation_designation is called to designate/invoke CHARSET
   and the loop repeats.

   C1 is parenthesized at every use so that an expression argument
   (e.g. `a | b') is masked as a whole.  */
#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                        \
  do {                                                                      \
    int id = CHARSET_ID (charset);                                          \
                                                                            \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)             \
        && id == charset_ascii)                                             \
      {                                                                     \
        id = charset_jisx0201_roman;                                        \
        charset = CHARSET_FROM_ID (id);                                     \
      }                                                                     \
                                                                            \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                                \
      {                                                                     \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                                \
        else                                                                \
          EMIT_ONE_BYTE ((c1) | 0x80);                                      \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                            \
        break;                                                              \
      }                                                                     \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))                  \
      {                                                                     \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                                  \
        break;                                                              \
      }                                                                     \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))                  \
      {                                                                     \
        EMIT_ONE_BYTE ((c1) | 0x80);                                        \
        break;                                                              \
      }                                                                     \
    else                                                                    \
      /* Since CHARSET is not yet invoked to any graphic planes, we         \
         must invoke it, or, at first, designate it to some graphic         \
         register.  Then repeat the loop to actually produce the            \
         character.  */                                                     \
      dst = encode_invocation_designation (charset, coding, dst,            \
                                           &produced_chars);                \
  } while (1)
3773
df7492f9 3774
f4dee582
RS
3775/* Produce codes for a DIMENSION2 character whose character set is
3776 CHARSET and whose position-codes are C1 and C2. Designation and
4ed46869
KH
3777 invocation codes are also produced in advance if necessary. */
3778
6e85d753
KH
/* Emit the two position codes C1 and C2 of a two-dimension CHARSET.

   CHARSET must be an lvalue: when the coding system has
   CODING_ISO_FLAG_USE_OLDJIS set and CHARSET is JISX0208, it is
   replaced by the JISX0208-1978 charset.

   The body loops until the bytes have been emitted; every emitting
   case leaves the loop with `break'.  When CHARSET is neither pending
   a single shift nor invoked to plane 0 or 1,
   encode_invocation_designation is called and the loop repeats.  */
#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                    \
  do {                                                                      \
    int id = CHARSET_ID (charset);                                          \
                                                                            \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)            \
        && id == charset_jisx0208)                                          \
      {                                                                     \
        id = charset_jisx0208_1978;                                         \
        charset = CHARSET_FROM_ID (id);                                     \
      }                                                                     \
                                                                            \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                                \
      {                                                                     \
        if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                        \
        else                                                                \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                            \
        break;                                                              \
      }                                                                     \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))                       \
      {                                                                     \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                    \
        break;                                                              \
      }                                                                     \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))                       \
      {                                                                     \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                          \
        break;                                                              \
      }                                                                     \
    /* CHARSET is not invoked to any graphic plane yet: designate           \
       and/or invoke it, then go round again to emit the bytes.  */         \
    dst = encode_invocation_designation (charset, coding, dst,              \
                                         &produced_chars);                  \
  } while (1)
3817
05e6f5dc 3818
df7492f9
KH
/* Encode character C of CHARSET: compute its code point with
   ENCODE_CHAR and hand it to the dimension-1 macro, or split it into a
   high and a low byte for the dimension-2 macro.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                    \
  do {                                                                      \
    int code = ENCODE_CHAR ((charset), (c));                                \
                                                                            \
    if (CHARSET_DIMENSION (charset) != 1)                                   \
      {                                                                     \
        ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8,              \
                                         code & 0xFF);                      \
      }                                                                     \
    else                                                                    \
      {                                                                     \
        ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                  \
      }                                                                     \
  } while (0)
bdd9fb48 3828
05e6f5dc 3829
4ed46869 3830/* Produce designation and invocation codes at a place pointed by DST
df7492f9 3831 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
4ed46869
KH
3832 Return new DST. */
3833
3834unsigned char *
df7492f9
KH
3835encode_invocation_designation (charset, coding, dst, p_nchars)
3836 struct charset *charset;
4ed46869
KH
3837 struct coding_system *coding;
3838 unsigned char *dst;
df7492f9 3839 int *p_nchars;
4ed46869 3840{
df7492f9
KH
3841 int multibytep = coding->dst_multibyte;
3842 int produced_chars = *p_nchars;
4ed46869 3843 int reg; /* graphic register number */
df7492f9 3844 int id = CHARSET_ID (charset);
4ed46869
KH
3845
3846 /* At first, check designations. */
3847 for (reg = 0; reg < 4; reg++)
df7492f9 3848 if (id == CODING_ISO_DESIGNATION (coding, reg))
4ed46869
KH
3849 break;
3850
3851 if (reg >= 4)
3852 {
3853 /* CHARSET is not yet designated to any graphic registers. */
3854 /* At first check the requested designation. */
df7492f9
KH
3855 reg = CODING_ISO_REQUEST (coding, id);
3856 if (reg < 0)
1ba9e4ab
KH
3857 /* Since CHARSET requests no special designation, designate it
3858 to graphic register 0. */
4ed46869
KH
3859 reg = 0;
3860
3861 ENCODE_DESIGNATION (charset, reg, coding);
3862 }
3863
df7492f9
KH
3864 if (CODING_ISO_INVOCATION (coding, 0) != reg
3865 && CODING_ISO_INVOCATION (coding, 1) != reg)
4ed46869
KH
3866 {
3867 /* Since the graphic register REG is not invoked to any graphic
3868 planes, invoke it to graphic plane 0. */
3869 switch (reg)
3870 {
3871 case 0: /* graphic register 0 */
3872 ENCODE_SHIFT_IN;
3873 break;
3874
3875 case 1: /* graphic register 1 */
3876 ENCODE_SHIFT_OUT;
3877 break;
3878
3879 case 2: /* graphic register 2 */
df7492f9 3880 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4ed46869
KH
3881 ENCODE_SINGLE_SHIFT_2;
3882 else
3883 ENCODE_LOCKING_SHIFT_2;
3884 break;
3885
3886 case 3: /* graphic register 3 */
df7492f9 3887 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4ed46869
KH
3888 ENCODE_SINGLE_SHIFT_3;
3889 else
3890 ENCODE_LOCKING_SHIFT_3;
3891 break;
3892 }
3893 }
b73bfc1c 3894
df7492f9 3895 *p_nchars = produced_chars;
4ed46869
KH
3896 return dst;
3897}
3898
df7492f9
KH
3899/* The following three macros produce codes for indicating direction
3900 of text. */
/* Emit a control sequence introducer: ESC '[' when the coding system
   is flagged as a 7-bit environment, the 8-bit control ISO_CODE_CSI
   otherwise.

   The flag word is tested with `&', as everywhere else in this file.
   Comparing the whole word with `==' would select the 7-bit form only
   when CODING_ISO_FLAG_SEVEN_BITS happened to be the sole flag set.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
  do {                                                                  \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
    else                                                                \
      EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
  } while (0)
3908
ec6d2bb8 3909
df7492f9
KH
/* Emit the control sequence announcing right-to-left text: a control
   sequence introducer followed by "2]".

   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it is
   used without an argument list; a trailing `(dst)' would expand to
   `do { ... } while (0) (dst);', which is not valid C.  */
#define ENCODE_DIRECTION_R2L()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('2', ']');            \
  } while (0)
3915
ec6d2bb8 3916
/* Emit the control sequence announcing left-to-right text: a control
   sequence introducer followed by "0]".

   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it is
   used without an argument list; a trailing `(dst)' would expand to
   `do { ... } while (0) (dst);', which is not valid C.  */
#define ENCODE_DIRECTION_L2R()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('0', ']');            \
  } while (0)
4ed46869 3922
4ed46869
KH
3923
3924/* Produce codes for designation and invocation to reset the graphic
3925 planes and registers to initial state. */
df7492f9
KH
/* Bring the encoder back to its initial state: shift in if graphic
   plane 0 does not currently hold register 0, then re-designate every
   register whose current designation differs from the coding system's
   initial designation.  Registers with a negative initial designation
   are left alone.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reg;                                                            \
    struct charset *charset;                                            \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    for (reg = 0; reg < 4; reg++)                                       \
      {                                                                 \
        if (CODING_ISO_INITIAL (coding, reg) < 0)                       \
          continue;                                                     \
        if (CODING_ISO_DESIGNATION (coding, reg)                        \
            == CODING_ISO_INITIAL (coding, reg))                        \
          continue;                                                     \
        charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg));   \
        ENCODE_DESIGNATION (charset, reg, coding);                      \
      }                                                                 \
  } while (0)
3942
df7492f9 3943
bdd9fb48 3944/* Produce designation sequences of charsets in the line started from
b73bfc1c 3945 SRC to a place pointed by DST, and return updated DST.
bdd9fb48
KH
3946
3947 If the current block ends before any end-of-line, we may fail to
d46c5b12
KH
3948 find all the necessary designations. */
3949
b73bfc1c 3950static unsigned char *
df7492f9 3951encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
e0e989f6 3952 struct coding_system *coding;
df7492f9
KH
3953 int *charbuf, *charbuf_end;
3954 unsigned char *dst;
e0e989f6 3955{
df7492f9 3956 struct charset *charset;
bdd9fb48
KH
3957 /* Table of charsets to be designated to each graphic register. */
3958 int r[4];
df7492f9
KH
3959 int c, found = 0, reg;
3960 int produced_chars = 0;
3961 int multibytep = coding->dst_multibyte;
3962 Lisp_Object attrs;
3963 Lisp_Object charset_list;
3964
3965 attrs = CODING_ID_ATTRS (coding->id);
3966 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3967 if (EQ (charset_list, Qiso_2022))
3968 charset_list = Viso_2022_charset_list;
bdd9fb48
KH
3969
3970 for (reg = 0; reg < 4; reg++)
3971 r[reg] = -1;
3972
b73bfc1c 3973 while (found < 4)
e0e989f6 3974 {
df7492f9
KH
3975 int id;
3976
3977 c = *charbuf++;
b73bfc1c
KH
3978 if (c == '\n')
3979 break;
df7492f9
KH
3980 charset = char_charset (c, charset_list, NULL);
3981 id = CHARSET_ID (charset);
3982 reg = CODING_ISO_REQUEST (coding, id);
3983 if (reg >= 0 && r[reg] < 0)
bdd9fb48
KH
3984 {
3985 found++;
df7492f9 3986 r[reg] = id;
bdd9fb48 3987 }
bdd9fb48
KH
3988 }
3989
3990 if (found)
3991 {
3992 for (reg = 0; reg < 4; reg++)
3993 if (r[reg] >= 0
df7492f9
KH
3994 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3995 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
e0e989f6 3996 }
b73bfc1c
KH
3997
3998 return dst;
e0e989f6
KH
3999}
4000
4ed46869
KH
4001/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
4002
df7492f9
KH
4003static int
4004encode_coding_iso_2022 (coding)
4ed46869 4005 struct coding_system *coding;
4ed46869 4006{
df7492f9
KH
4007 int multibytep = coding->dst_multibyte;
4008 int *charbuf = coding->charbuf;
4009 int *charbuf_end = charbuf + coding->charbuf_used;
4010 unsigned char *dst = coding->destination + coding->produced;
4011 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4012 int safe_room = 16;
4013 int bol_designation
4014 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4015 && CODING_ISO_BOL (coding));
4016 int produced_chars = 0;
4017 Lisp_Object attrs, eol_type, charset_list;
4018 int ascii_compatible;
b73bfc1c 4019 int c;
ff0dacd7 4020 int preferred_charset_id = -1;
05e6f5dc 4021
24a73b0a
KH
4022 CODING_GET_INFO (coding, attrs, charset_list);
4023 eol_type = CODING_ID_EOL_TYPE (coding->id);
4024 if (VECTORP (eol_type))
4025 eol_type = Qunix;
4026
004068e4 4027 setup_iso_safe_charsets (attrs);
ff0dacd7 4028 /* Charset list may have been changed. */
287c57d7 4029 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8f924df7 4030 coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
0eecad43 4031
df7492f9 4032 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
bdd9fb48 4033
df7492f9 4034 while (charbuf < charbuf_end)
4ed46869 4035 {
df7492f9 4036 ASSURE_DESTINATION (safe_room);
b73bfc1c 4037
df7492f9 4038 if (bol_designation)
b73bfc1c 4039 {
df7492f9 4040 unsigned char *dst_prev = dst;
4ed46869 4041
bdd9fb48 4042 /* We have to produce designation sequences if any now. */
df7492f9
KH
4043 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
4044 bol_designation = 0;
4045 /* We are sure that designation sequences are all ASCII bytes. */
4046 produced_chars += dst - dst_prev;
e0e989f6
KH
4047 }
4048
df7492f9 4049 c = *charbuf++;
ec6d2bb8 4050
ff0dacd7
KH
4051 if (c < 0)
4052 {
4053 /* Handle an annotation. */
4054 switch (*charbuf)
ec6d2bb8 4055 {
ff0dacd7
KH
4056 case CODING_ANNOTATE_COMPOSITION_MASK:
4057 /* Not yet implemented. */
4058 break;
4059 case CODING_ANNOTATE_CHARSET_MASK:
cf7dfdf5 4060 preferred_charset_id = charbuf[2];
ff0dacd7
KH
4061 if (preferred_charset_id >= 0
4062 && NILP (Fmemq (make_number (preferred_charset_id),
4063 charset_list)))
4064 preferred_charset_id = -1;
4065 break;
4066 default:
4067 abort ();
4ed46869 4068 }
ff0dacd7
KH
4069 charbuf += -c - 1;
4070 continue;
4ed46869 4071 }
ec6d2bb8 4072
b73bfc1c
KH
4073 /* Now encode the character C. */
4074 if (c < 0x20 || c == 0x7F)
4075 {
df7492f9
KH
4076 if (c == '\n'
4077 || (c == '\r' && EQ (eol_type, Qmac)))
19a8d9e0 4078 {
df7492f9
KH
4079 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4080 ENCODE_RESET_PLANE_AND_REGISTER ();
4081 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
b73bfc1c 4082 {
df7492f9
KH
4083 int i;
4084
4085 for (i = 0; i < 4; i++)
4086 CODING_ISO_DESIGNATION (coding, i)
4087 = CODING_ISO_INITIAL (coding, i);
b73bfc1c 4088 }
df7492f9
KH
4089 bol_designation
4090 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
19a8d9e0 4091 }
df7492f9
KH
4092 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4093 ENCODE_RESET_PLANE_AND_REGISTER ();
4094 EMIT_ONE_ASCII_BYTE (c);
4ed46869 4095 }
df7492f9 4096 else if (ASCII_CHAR_P (c))
88993dfd 4097 {
df7492f9
KH
4098 if (ascii_compatible)
4099 EMIT_ONE_ASCII_BYTE (c);
93dec019 4100 else
19a8d9e0 4101 {
bf16eb23
KH
4102 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4103 ENCODE_ISO_CHARACTER (charset, c);
19a8d9e0 4104 }
4ed46869 4105 }
16eafb5d 4106 else if (CHAR_BYTE8_P (c))
88993dfd 4107 {
16eafb5d
KH
4108 c = CHAR_TO_BYTE8 (c);
4109 EMIT_ONE_BYTE (c);
88993dfd 4110 }
b73bfc1c 4111 else
df7492f9 4112 {
ff0dacd7 4113 struct charset *charset;
b73bfc1c 4114
ff0dacd7
KH
4115 if (preferred_charset_id >= 0)
4116 {
4117 charset = CHARSET_FROM_ID (preferred_charset_id);
4118 if (! CHAR_CHARSET_P (c, charset))
4119 charset = char_charset (c, charset_list, NULL);
4120 }
4121 else
4122 charset = char_charset (c, charset_list, NULL);
df7492f9
KH
4123 if (!charset)
4124 {
41cbe562
KH
4125 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4126 {
4127 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4128 charset = CHARSET_FROM_ID (charset_ascii);
4129 }
4130 else
4131 {
4132 c = coding->default_char;
4133 charset = char_charset (c, charset_list, NULL);
4134 }
df7492f9
KH
4135 }
4136 ENCODE_ISO_CHARACTER (charset, c);
4137 }
84fbb8a0 4138 }
b73bfc1c 4139
df7492f9
KH
4140 if (coding->mode & CODING_MODE_LAST_BLOCK
4141 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4142 {
4143 ASSURE_DESTINATION (safe_room);
4144 ENCODE_RESET_PLANE_AND_REGISTER ();
4145 }
065e3595 4146 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4147 CODING_ISO_BOL (coding) = bol_designation;
4148 coding->produced_char += produced_chars;
4149 coding->produced = dst - coding->destination;
4150 return 0;
4ed46869
KH
4151}
4152
4153\f
df7492f9 4154/*** 8,9. SJIS and BIG5 handlers ***/
4ed46869 4155
df7492f9 4156/* Although SJIS and BIG5 are not ISO's coding system, they are used
4ed46869
KH
4157 quite widely. So, for the moment, Emacs supports them in the bare
4158 C code. But, in the future, they may be supported only by CCL. */
4159
4160/* SJIS is a coding system encoding three character sets: ASCII, right
4161 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
4162 as is. A character of charset katakana-jisx0201 is encoded by
4163 "position-code + 0x80". A character of charset japanese-jisx0208
4164 is encoded in 2-byte but two position-codes are divided and shifted
df7492f9 4165 so that it fit in the range below.
4ed46869
KH
4166
4167 --- CODE RANGE of SJIS ---
4168 (character set) (range)
4169 ASCII 0x00 .. 0x7F
df7492f9 4170 KATAKANA-JISX0201 0xA0 .. 0xDF
c28a9453 4171 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
d14d03ac 4172 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4ed46869
KH
4173 -------------------------------
4174
4175*/
4176
4177/* BIG5 is a coding system encoding two character sets: ASCII and
4178 Big5. An ASCII character is encoded as is. Big5 is a two-byte
df7492f9 4179 character set and is encoded in two-byte.
4ed46869
KH
4180
4181 --- CODE RANGE of BIG5 ---
4182 (character set) (range)
4183 ASCII 0x00 .. 0x7F
4184 Big5 (1st byte) 0xA1 .. 0xFE
4185 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
4186 --------------------------
4187
df7492f9 4188 */
4ed46869
KH
4189
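/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in SJIS.  If it may be, return 1, adding
   CATEGORY_MASK_SJIS to DETECT_INFO->found when an SJIS-specific byte
   sequence was seen.  Otherwise add CATEGORY_MASK_SJIS to
   DETECT_INFO->rejected and return 0.  */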
4ed46869 4193
0a28aafb 4194static int
ff0dacd7 4195detect_coding_sjis (coding, detect_info)
df7492f9 4196 struct coding_system *coding;
ff0dacd7 4197 struct coding_detection_info *detect_info;
4ed46869 4198{
065e3595 4199 const unsigned char *src = coding->source, *src_base;
8f924df7 4200 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
4201 int multibytep = coding->src_multibyte;
4202 int consumed_chars = 0;
4203 int found = 0;
b73bfc1c 4204 int c;
df7492f9 4205
ff0dacd7 4206 detect_info->checked |= CATEGORY_MASK_SJIS;
df7492f9
KH
4207 /* A coding system of this category is always ASCII compatible. */
4208 src += coding->head_ascii;
4ed46869 4209
b73bfc1c 4210 while (1)
4ed46869 4211 {
065e3595 4212 src_base = src;
df7492f9 4213 ONE_MORE_BYTE (c);
682169fe
KH
4214 if (c < 0x80)
4215 continue;
df7492f9 4216 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
4ed46869 4217 {
df7492f9 4218 ONE_MORE_BYTE (c);
682169fe 4219 if (c < 0x40 || c == 0x7F || c > 0xFC)
df7492f9 4220 break;
ff0dacd7 4221 found = CATEGORY_MASK_SJIS;
4ed46869 4222 }
df7492f9 4223 else if (c >= 0xA0 && c < 0xE0)
ff0dacd7 4224 found = CATEGORY_MASK_SJIS;
df7492f9
KH
4225 else
4226 break;
4ed46869 4227 }
ff0dacd7 4228 detect_info->rejected |= CATEGORY_MASK_SJIS;
df7492f9
KH
4229 return 0;
4230
4231 no_more_source:
065e3595 4232 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
89528eb3 4233 {
ff0dacd7 4234 detect_info->rejected |= CATEGORY_MASK_SJIS;
89528eb3 4235 return 0;
4ed46869 4236 }
ff0dacd7
KH
4237 detect_info->found |= found;
4238 return 1;
4ed46869
KH
4239}
4240
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in BIG5.  If it may be, return 1, adding
   CATEGORY_MASK_BIG5 to DETECT_INFO->found when a Big5 two-byte
   sequence was seen.  Otherwise return 0.  */
4ed46869 4244
0a28aafb 4245static int
ff0dacd7 4246detect_coding_big5 (coding, detect_info)
df7492f9 4247 struct coding_system *coding;
ff0dacd7 4248 struct coding_detection_info *detect_info;
4ed46869 4249{
065e3595 4250 const unsigned char *src = coding->source, *src_base;
8f924df7 4251 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
4252 int multibytep = coding->src_multibyte;
4253 int consumed_chars = 0;
4254 int found = 0;
b73bfc1c 4255 int c;
fa42c37f 4256
ff0dacd7 4257 detect_info->checked |= CATEGORY_MASK_BIG5;
df7492f9
KH
4258 /* A coding system of this category is always ASCII compatible. */
4259 src += coding->head_ascii;
fa42c37f 4260
b73bfc1c 4261 while (1)
fa42c37f 4262 {
065e3595 4263 src_base = src;
df7492f9
KH
4264 ONE_MORE_BYTE (c);
4265 if (c < 0x80)
fa42c37f 4266 continue;
df7492f9 4267 if (c >= 0xA1)
fa42c37f 4268 {
df7492f9
KH
4269 ONE_MORE_BYTE (c);
4270 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
fa42c37f 4271 return 0;
ff0dacd7 4272 found = CATEGORY_MASK_BIG5;
fa42c37f 4273 }
df7492f9
KH
4274 else
4275 break;
fa42c37f 4276 }
ff0dacd7 4277 detect_info->rejected |= CATEGORY_MASK_BIG5;
fa42c37f 4278 return 0;
fa42c37f 4279
df7492f9 4280 no_more_source:
065e3595 4281 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
89528eb3 4282 {
ff0dacd7 4283 detect_info->rejected |= CATEGORY_MASK_BIG5;
89528eb3
KH
4284 return 0;
4285 }
ff0dacd7
KH
4286 detect_info->found |= found;
4287 return 1;
fa42c37f
KH
4288}
4289
4ed46869
KH
/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
   decode_coding_sjis decodes SJIS text; decode_coding_big5 decodes
   BIG5 text.  */
fa42c37f 4292
b73bfc1c 4293static void
df7492f9 4294decode_coding_sjis (coding)
4ed46869 4295 struct coding_system *coding;
4ed46869 4296{
8f924df7
KH
4297 const unsigned char *src = coding->source + coding->consumed;
4298 const unsigned char *src_end = coding->source + coding->src_bytes;
4299 const unsigned char *src_base;
69a80ea3
KH
4300 int *charbuf = coding->charbuf + coding->charbuf_used;
4301 int *charbuf_end
4302 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
df7492f9
KH
4303 int consumed_chars = 0, consumed_chars_base;
4304 int multibytep = coding->src_multibyte;
4305 struct charset *charset_roman, *charset_kanji, *charset_kana;
57a47f8a 4306 struct charset *charset_kanji2;
24a73b0a 4307 Lisp_Object attrs, charset_list, val;
ff0dacd7
KH
4308 int char_offset = coding->produced_char;
4309 int last_offset = char_offset;
4310 int last_id = charset_ascii;
119852e7
KH
4311 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4312 int byte_after_cr = -1;
a5d301df 4313
24a73b0a 4314 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4315
4316 val = charset_list;
4317 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
89528eb3 4318 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
57a47f8a
KH
4319 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4320 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
fa42c37f 4321
b73bfc1c 4322 while (1)
4ed46869 4323 {
df7492f9 4324 int c, c1;
24a73b0a 4325 struct charset *charset;
fa42c37f 4326
b73bfc1c 4327 src_base = src;
df7492f9 4328 consumed_chars_base = consumed_chars;
fa42c37f 4329
df7492f9
KH
4330 if (charbuf >= charbuf_end)
4331 break;
4332
119852e7
KH
4333 if (byte_after_cr >= 0)
4334 c = byte_after_cr, byte_after_cr = -1;
4335 else
4336 ONE_MORE_BYTE (c);
065e3595
KH
4337 if (c < 0)
4338 goto invalid_code;
24a73b0a 4339 if (c < 0x80)
119852e7
KH
4340 {
4341 if (eol_crlf && c == '\r')
4342 ONE_MORE_BYTE (byte_after_cr);
4343 charset = charset_roman;
4344 }
57a47f8a 4345 else if (c == 0x80 || c == 0xA0)
8e921c4b 4346 goto invalid_code;
57a47f8a
KH
4347 else if (c >= 0xA1 && c <= 0xDF)
4348 {
4349 /* SJIS -> JISX0201-Kana */
4350 c &= 0x7F;
4351 charset = charset_kana;
4352 }
4353 else if (c <= 0xEF)
df7492f9 4354 {
57a47f8a
KH
4355 /* SJIS -> JISX0208 */
4356 ONE_MORE_BYTE (c1);
4357 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
24a73b0a 4358 goto invalid_code;
57a47f8a
KH
4359 c = (c << 8) | c1;
4360 SJIS_TO_JIS (c);
4361 charset = charset_kanji;
4362 }
4363 else if (c <= 0xFC && charset_kanji2)
4364 {
c6876370 4365 /* SJIS -> JISX0213-2 */
57a47f8a
KH
4366 ONE_MORE_BYTE (c1);
4367 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
24a73b0a 4368 goto invalid_code;
57a47f8a
KH
4369 c = (c << 8) | c1;
4370 SJIS_TO_JIS2 (c);
4371 charset = charset_kanji2;
df7492f9 4372 }
57a47f8a
KH
4373 else
4374 goto invalid_code;
24a73b0a
KH
4375 if (charset->id != charset_ascii
4376 && last_id != charset->id)
4377 {
4378 if (last_id != charset_ascii)
69a80ea3 4379 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
24a73b0a
KH
4380 last_id = charset->id;
4381 last_offset = char_offset;
4382 }
4383 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
df7492f9 4384 *charbuf++ = c;
ff0dacd7 4385 char_offset++;
df7492f9 4386 continue;
b73bfc1c 4387
df7492f9
KH
4388 invalid_code:
4389 src = src_base;
4390 consumed_chars = consumed_chars_base;
4391 ONE_MORE_BYTE (c);
065e3595 4392 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
ff0dacd7 4393 char_offset++;
df7492f9
KH
4394 coding->errors++;
4395 }
fa42c37f 4396
df7492f9 4397 no_more_source:
ff0dacd7 4398 if (last_id != charset_ascii)
69a80ea3 4399 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
4400 coding->consumed_char += consumed_chars_base;
4401 coding->consumed = src_base - coding->source;
4402 coding->charbuf_used = charbuf - coding->charbuf;
fa42c37f
KH
4403}
4404
b73bfc1c 4405static void
df7492f9 4406decode_coding_big5 (coding)
4ed46869 4407 struct coding_system *coding;
4ed46869 4408{
8f924df7
KH
4409 const unsigned char *src = coding->source + coding->consumed;
4410 const unsigned char *src_end = coding->source + coding->src_bytes;
4411 const unsigned char *src_base;
69a80ea3
KH
4412 int *charbuf = coding->charbuf + coding->charbuf_used;
4413 int *charbuf_end
4414 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
df7492f9
KH
4415 int consumed_chars = 0, consumed_chars_base;
4416 int multibytep = coding->src_multibyte;
4417 struct charset *charset_roman, *charset_big5;
24a73b0a 4418 Lisp_Object attrs, charset_list, val;
ff0dacd7
KH
4419 int char_offset = coding->produced_char;
4420 int last_offset = char_offset;
4421 int last_id = charset_ascii;
119852e7
KH
4422 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4423 int byte_after_cr = -1;
df7492f9 4424
24a73b0a 4425 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4426 val = charset_list;
4427 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4428 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4ed46869 4429
b73bfc1c 4430 while (1)
4ed46869 4431 {
df7492f9 4432 int c, c1;
24a73b0a 4433 struct charset *charset;
b73bfc1c
KH
4434
4435 src_base = src;
df7492f9
KH
4436 consumed_chars_base = consumed_chars;
4437
4438 if (charbuf >= charbuf_end)
4439 break;
4440
119852e7 4441 if (byte_after_cr >= 0)
14daee73 4442 c = byte_after_cr, byte_after_cr = -1;
119852e7
KH
4443 else
4444 ONE_MORE_BYTE (c);
b73bfc1c 4445
065e3595
KH
4446 if (c < 0)
4447 goto invalid_code;
24a73b0a 4448 if (c < 0x80)
119852e7 4449 {
14daee73 4450 if (eol_crlf && c == '\r')
119852e7
KH
4451 ONE_MORE_BYTE (byte_after_cr);
4452 charset = charset_roman;
4453 }
24a73b0a 4454 else
4ed46869 4455 {
24a73b0a
KH
4456 /* BIG5 -> Big5 */
4457 if (c < 0xA1 || c > 0xFE)
4458 goto invalid_code;
4459 ONE_MORE_BYTE (c1);
4460 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4461 goto invalid_code;
4462 c = c << 8 | c1;
4463 charset = charset_big5;
4ed46869 4464 }
24a73b0a
KH
4465 if (charset->id != charset_ascii
4466 && last_id != charset->id)
df7492f9 4467 {
24a73b0a 4468 if (last_id != charset_ascii)
69a80ea3 4469 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
24a73b0a
KH
4470 last_id = charset->id;
4471 last_offset = char_offset;
4ed46869 4472 }
24a73b0a 4473 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
df7492f9 4474 *charbuf++ = c;
ff0dacd7 4475 char_offset++;
fb88bf2d
KH
4476 continue;
4477
df7492f9 4478 invalid_code:
4ed46869 4479 src = src_base;
df7492f9
KH
4480 consumed_chars = consumed_chars_base;
4481 ONE_MORE_BYTE (c);
065e3595 4482 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
ff0dacd7 4483 char_offset++;
df7492f9 4484 coding->errors++;
fb88bf2d 4485 }
d46c5b12 4486
df7492f9 4487 no_more_source:
ff0dacd7 4488 if (last_id != charset_ascii)
69a80ea3 4489 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
4490 coding->consumed_char += consumed_chars_base;
4491 coding->consumed = src_base - coding->source;
4492 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
4493}
4494
4495/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
b73bfc1c
KH
4496 This function can encode charsets `ascii', `katakana-jisx0201',
4497 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4498 are sure that all these charsets are registered as official charset
4ed46869
KH
4499 (i.e. do not have extended leading-codes). Characters of other
4500 charsets are produced without any encoding. If SJIS_P is 1, encode
4501 SJIS text, else encode BIG5 text. */
4502
df7492f9
KH
4503static int
4504encode_coding_sjis (coding)
4ed46869 4505 struct coding_system *coding;
4ed46869 4506{
df7492f9
KH
4507 int multibytep = coding->dst_multibyte;
4508 int *charbuf = coding->charbuf;
4509 int *charbuf_end = charbuf + coding->charbuf_used;
4510 unsigned char *dst = coding->destination + coding->produced;
4511 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4512 int safe_room = 4;
4513 int produced_chars = 0;
24a73b0a 4514 Lisp_Object attrs, charset_list, val;
df7492f9
KH
4515 int ascii_compatible;
4516 struct charset *charset_roman, *charset_kanji, *charset_kana;
57a47f8a 4517 struct charset *charset_kanji2;
df7492f9 4518 int c;
a5d301df 4519
24a73b0a 4520 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4521 val = charset_list;
4522 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4523 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
57a47f8a
KH
4524 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4525 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4ed46869 4526
df7492f9 4527 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
93dec019 4528
df7492f9
KH
4529 while (charbuf < charbuf_end)
4530 {
4531 ASSURE_DESTINATION (safe_room);
4532 c = *charbuf++;
b73bfc1c 4533 /* Now encode the character C. */
df7492f9
KH
4534 if (ASCII_CHAR_P (c) && ascii_compatible)
4535 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
4536 else if (CHAR_BYTE8_P (c))
4537 {
4538 c = CHAR_TO_BYTE8 (c);
4539 EMIT_ONE_BYTE (c);
4540 }
df7492f9 4541 else
b73bfc1c 4542 {
df7492f9
KH
4543 unsigned code;
4544 struct charset *charset = char_charset (c, charset_list, &code);
4545
4546 if (!charset)
4ed46869 4547 {
41cbe562 4548 if (coding->mode & CODING_MODE_SAFE_ENCODING)
b73bfc1c 4549 {
41cbe562
KH
4550 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4551 charset = CHARSET_FROM_ID (charset_ascii);
b73bfc1c 4552 }
41cbe562 4553 else
b73bfc1c 4554 {
41cbe562
KH
4555 c = coding->default_char;
4556 charset = char_charset (c, charset_list, &code);
b73bfc1c 4557 }
b73bfc1c 4558 }
df7492f9
KH
4559 if (code == CHARSET_INVALID_CODE (charset))
4560 abort ();
4561 if (charset == charset_kanji)
4562 {
4563 int c1, c2;
4564 JIS_TO_SJIS (code);
4565 c1 = code >> 8, c2 = code & 0xFF;
4566 EMIT_TWO_BYTES (c1, c2);
4567 }
4568 else if (charset == charset_kana)
4569 EMIT_ONE_BYTE (code | 0x80);
57a47f8a
KH
4570 else if (charset_kanji2 && charset == charset_kanji2)
4571 {
4572 int c1, c2;
4573
4574 c1 = code >> 8;
4575 if (c1 == 0x21 || (c1 >= 0x23 && c1 < 0x25)
4576 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4577 {
4578 JIS_TO_SJIS2 (code);
4579 c1 = code >> 8, c2 = code & 0xFF;
4580 EMIT_TWO_BYTES (c1, c2);
4581 }
4582 else
4583 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4584 }
df7492f9
KH
4585 else
4586 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4587 }
4588 }
065e3595 4589 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4590 coding->produced_char += produced_chars;
4591 coding->produced = dst - coding->destination;
4592 return 0;
4593}
4594
4595static int
4596encode_coding_big5 (coding)
4597 struct coding_system *coding;
4598{
4599 int multibytep = coding->dst_multibyte;
4600 int *charbuf = coding->charbuf;
4601 int *charbuf_end = charbuf + coding->charbuf_used;
4602 unsigned char *dst = coding->destination + coding->produced;
4603 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4604 int safe_room = 4;
4605 int produced_chars = 0;
24a73b0a 4606 Lisp_Object attrs, charset_list, val;
df7492f9
KH
4607 int ascii_compatible;
4608 struct charset *charset_roman, *charset_big5;
4609 int c;
4610
24a73b0a 4611 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4612 val = charset_list;
4613 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4614 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4615 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4616
4617 while (charbuf < charbuf_end)
4618 {
4619 ASSURE_DESTINATION (safe_room);
4620 c = *charbuf++;
4621 /* Now encode the character C. */
4622 if (ASCII_CHAR_P (c) && ascii_compatible)
4623 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
4624 else if (CHAR_BYTE8_P (c))
4625 {
4626 c = CHAR_TO_BYTE8 (c);
4627 EMIT_ONE_BYTE (c);
b73bfc1c
KH
4628 }
4629 else
4630 {
df7492f9
KH
4631 unsigned code;
4632 struct charset *charset = char_charset (c, charset_list, &code);
4633
4634 if (! charset)
b73bfc1c 4635 {
41cbe562 4636 if (coding->mode & CODING_MODE_SAFE_ENCODING)
b73bfc1c 4637 {
41cbe562
KH
4638 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4639 charset = CHARSET_FROM_ID (charset_ascii);
b73bfc1c 4640 }
41cbe562 4641 else
0eecad43 4642 {
41cbe562
KH
4643 c = coding->default_char;
4644 charset = char_charset (c, charset_list, &code);
0eecad43 4645 }
4ed46869 4646 }
df7492f9
KH
4647 if (code == CHARSET_INVALID_CODE (charset))
4648 abort ();
4649 if (charset == charset_big5)
b73bfc1c 4650 {
df7492f9
KH
4651 int c1, c2;
4652
4653 c1 = code >> 8, c2 = code & 0xFF;
4654 EMIT_TWO_BYTES (c1, c2);
b73bfc1c 4655 }
df7492f9
KH
4656 else
4657 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4ed46869 4658 }
4ed46869 4659 }
065e3595 4660 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4661 coding->produced_char += produced_chars;
4662 coding->produced = dst - coding->destination;
4663 return 0;
4ed46869
KH
4664}
4665
4666\f
df7492f9 4667/*** 10. CCL handlers ***/
1397dc18
KH
4668
4669/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4670 Check if a text is encoded in a coding system of which
4671 encoder/decoder are written in CCL program. If it is, return
df7492f9 4672 CATEGORY_MASK_CCL, else return 0. */
1397dc18 4673
0a28aafb 4674static int
ff0dacd7 4675detect_coding_ccl (coding, detect_info)
df7492f9 4676 struct coding_system *coding;
ff0dacd7 4677 struct coding_detection_info *detect_info;
1397dc18 4678{
065e3595 4679 const unsigned char *src = coding->source, *src_base;
8f924df7 4680 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
4681 int multibytep = coding->src_multibyte;
4682 int consumed_chars = 0;
4683 int found = 0;
0e219d54 4684 unsigned char *valids;
df7492f9
KH
4685 int head_ascii = coding->head_ascii;
4686 Lisp_Object attrs;
4687
ff0dacd7
KH
4688 detect_info->checked |= CATEGORY_MASK_CCL;
4689
df7492f9 4690 coding = &coding_categories[coding_category_ccl];
0e219d54 4691 valids = CODING_CCL_VALIDS (coding);
df7492f9
KH
4692 attrs = CODING_ID_ATTRS (coding->id);
4693 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4694 src += head_ascii;
1397dc18 4695
b73bfc1c 4696 while (1)
1397dc18 4697 {
df7492f9 4698 int c;
065e3595
KH
4699
4700 src_base = src;
df7492f9 4701 ONE_MORE_BYTE (c);
065e3595 4702 if (c < 0 || ! valids[c])
df7492f9 4703 break;
ff0dacd7
KH
4704 if ((valids[c] > 1))
4705 found = CATEGORY_MASK_CCL;
df7492f9 4706 }
ff0dacd7 4707 detect_info->rejected |= CATEGORY_MASK_CCL;
df7492f9
KH
4708 return 0;
4709
4710 no_more_source:
ff0dacd7
KH
4711 detect_info->found |= found;
4712 return 1;
df7492f9
KH
4713}
4714
4715static void
4716decode_coding_ccl (coding)
4717 struct coding_system *coding;
4718{
7c78e542 4719 const unsigned char *src = coding->source + coding->consumed;
8f924df7 4720 const unsigned char *src_end = coding->source + coding->src_bytes;
69a80ea3
KH
4721 int *charbuf = coding->charbuf + coding->charbuf_used;
4722 int *charbuf_end = coding->charbuf + coding->charbuf_size;
df7492f9
KH
4723 int consumed_chars = 0;
4724 int multibytep = coding->src_multibyte;
4725 struct ccl_program ccl;
4726 int source_charbuf[1024];
4727 int source_byteidx[1024];
24a73b0a 4728 Lisp_Object attrs, charset_list;
df7492f9 4729
24a73b0a 4730 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4731 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4732
4733 while (src < src_end)
4734 {
7c78e542 4735 const unsigned char *p = src;
df7492f9
KH
4736 int *source, *source_end;
4737 int i = 0;
4738
4739 if (multibytep)
4740 while (i < 1024 && p < src_end)
4741 {
4742 source_byteidx[i] = p - src;
4743 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4744 }
4745 else
4746 while (i < 1024 && p < src_end)
4747 source_charbuf[i++] = *p++;
8f924df7 4748
df7492f9
KH
4749 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4750 ccl.last_block = 1;
4751
4752 source = source_charbuf;
4753 source_end = source + i;
4754 while (source < source_end)
4755 {
4756 ccl_driver (&ccl, source, charbuf,
8dcbea82
KH
4757 source_end - source, charbuf_end - charbuf,
4758 charset_list);
df7492f9
KH
4759 source += ccl.consumed;
4760 charbuf += ccl.produced;
4761 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4762 break;
4763 }
4764 if (source < source_end)
4765 src += source_byteidx[source - source_charbuf];
4766 else
4767 src = p;
4768 consumed_chars += source - source_charbuf;
4769
4770 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4771 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4772 break;
4773 }
4774
4775 switch (ccl.status)
4776 {
4777 case CCL_STAT_SUSPEND_BY_SRC:
065e3595 4778 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
df7492f9
KH
4779 break;
4780 case CCL_STAT_SUSPEND_BY_DST:
4781 break;
4782 case CCL_STAT_QUIT:
4783 case CCL_STAT_INVALID_CMD:
065e3595 4784 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
df7492f9
KH
4785 break;
4786 default:
065e3595 4787 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4788 break;
4789 }
4790 coding->consumed_char += consumed_chars;
4791 coding->consumed = src - coding->source;
4792 coding->charbuf_used = charbuf - coding->charbuf;
4793}
4794
4795static int
4796encode_coding_ccl (coding)
4797 struct coding_system *coding;
4798{
4799 struct ccl_program ccl;
4800 int multibytep = coding->dst_multibyte;
4801 int *charbuf = coding->charbuf;
4802 int *charbuf_end = charbuf + coding->charbuf_used;
4803 unsigned char *dst = coding->destination + coding->produced;
4804 unsigned char *dst_end = coding->destination + coding->dst_bytes;
df7492f9
KH
4805 int destination_charbuf[1024];
4806 int i, produced_chars = 0;
24a73b0a 4807 Lisp_Object attrs, charset_list;
df7492f9 4808
24a73b0a 4809 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4810 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4811
4812 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4813 ccl.dst_multibyte = coding->dst_multibyte;
4814
8cffd3e7 4815 while (charbuf < charbuf_end)
df7492f9 4816 {
df7492f9 4817 ccl_driver (&ccl, charbuf, destination_charbuf,
8cffd3e7 4818 charbuf_end - charbuf, 1024, charset_list);
df7492f9 4819 if (multibytep)
8cffd3e7
KH
4820 {
4821 ASSURE_DESTINATION (ccl.produced * 2);
4822 for (i = 0; i < ccl.produced; i++)
4823 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4824 }
df7492f9
KH
4825 else
4826 {
8cffd3e7 4827 ASSURE_DESTINATION (ccl.produced);
3ed051d4 4828 for (i = 0; i < ccl.produced; i++)
df7492f9
KH
4829 *dst++ = destination_charbuf[i] & 0xFF;
4830 produced_chars += ccl.produced;
4831 }
8cffd3e7
KH
4832 charbuf += ccl.consumed;
4833 if (ccl.status == CCL_STAT_QUIT
4834 || ccl.status == CCL_STAT_INVALID_CMD)
4835 break;
df7492f9
KH
4836 }
4837
4838 switch (ccl.status)
4839 {
4840 case CCL_STAT_SUSPEND_BY_SRC:
065e3595 4841 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
df7492f9
KH
4842 break;
4843 case CCL_STAT_SUSPEND_BY_DST:
065e3595 4844 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
df7492f9
KH
4845 break;
4846 case CCL_STAT_QUIT:
4847 case CCL_STAT_INVALID_CMD:
065e3595 4848 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
df7492f9
KH
4849 break;
4850 default:
065e3595 4851 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 4852 break;
1397dc18 4853 }
df7492f9
KH
4854
4855 coding->produced_char += produced_chars;
4856 coding->produced = dst - coding->destination;
4857 return 0;
1397dc18
KH
4858}
4859
df7492f9 4860
1397dc18 4861\f
df7492f9 4862/*** 10, 11. no-conversion handlers ***/
4ed46869 4863
b73bfc1c 4864/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4ed46869 4865
b73bfc1c 4866static void
df7492f9 4867decode_coding_raw_text (coding)
4ed46869 4868 struct coding_system *coding;
4ed46869 4869{
119852e7
KH
4870 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4871
df7492f9 4872 coding->chars_at_source = 1;
119852e7
KH
4873 coding->consumed_char = coding->src_chars;
4874 coding->consumed = coding->src_bytes;
4875 if (eol_crlf && coding->source[coding->src_bytes - 1] == '\r')
4876 {
4877 coding->consumed_char--;
4878 coding->consumed--;
4879 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4880 }
4881 else
4882 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 4883}
4ed46869 4884
df7492f9
KH
4885static int
4886encode_coding_raw_text (coding)
4887 struct coding_system *coding;
4888{
4889 int multibytep = coding->dst_multibyte;
4890 int *charbuf = coding->charbuf;
4891 int *charbuf_end = coding->charbuf + coding->charbuf_used;
4892 unsigned char *dst = coding->destination + coding->produced;
4893 unsigned char *dst_end = coding->destination + coding->dst_bytes;
a0ed9b27 4894 int produced_chars = 0;
b73bfc1c
KH
4895 int c;
4896
df7492f9 4897 if (multibytep)
b73bfc1c 4898 {
df7492f9 4899 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4ed46869 4900
df7492f9
KH
4901 if (coding->src_multibyte)
4902 while (charbuf < charbuf_end)
4903 {
4904 ASSURE_DESTINATION (safe_room);
4905 c = *charbuf++;
4906 if (ASCII_CHAR_P (c))
4907 EMIT_ONE_ASCII_BYTE (c);
4908 else if (CHAR_BYTE8_P (c))
4909 {
4910 c = CHAR_TO_BYTE8 (c);
4911 EMIT_ONE_BYTE (c);
4912 }
4913 else
4914 {
4915 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
93dec019 4916
df7492f9
KH
4917 CHAR_STRING_ADVANCE (c, p1);
4918 while (p0 < p1)
9d123124
KH
4919 {
4920 EMIT_ONE_BYTE (*p0);
4921 p0++;
4922 }
df7492f9
KH
4923 }
4924 }
b73bfc1c 4925 else
df7492f9
KH
4926 while (charbuf < charbuf_end)
4927 {
4928 ASSURE_DESTINATION (safe_room);
4929 c = *charbuf++;
4930 EMIT_ONE_BYTE (c);
4931 }
4932 }
4933 else
4ed46869 4934 {
df7492f9 4935 if (coding->src_multibyte)
d46c5b12 4936 {
df7492f9
KH
4937 int safe_room = MAX_MULTIBYTE_LENGTH;
4938
4939 while (charbuf < charbuf_end)
d46c5b12 4940 {
df7492f9
KH
4941 ASSURE_DESTINATION (safe_room);
4942 c = *charbuf++;
4943 if (ASCII_CHAR_P (c))
4944 *dst++ = c;
4945 else if (CHAR_BYTE8_P (c))
4946 *dst++ = CHAR_TO_BYTE8 (c);
b73bfc1c 4947 else
df7492f9 4948 CHAR_STRING_ADVANCE (c, dst);
d46c5b12
KH
4949 }
4950 }
df7492f9
KH
4951 else
4952 {
4953 ASSURE_DESTINATION (charbuf_end - charbuf);
4954 while (charbuf < charbuf_end && dst < dst_end)
4955 *dst++ = *charbuf++;
8f924df7 4956 }
319a3947 4957 produced_chars = dst - (coding->destination + coding->produced);
4ed46869 4958 }
065e3595 4959 record_conversion_result (coding, CODING_RESULT_SUCCESS);
a0ed9b27 4960 coding->produced_char += produced_chars;
df7492f9
KH
4961 coding->produced = dst - coding->destination;
4962 return 0;
4ed46869
KH
4963}
4964
ff0dacd7
KH
4965/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4966 Check if a text is encoded in a charset-based coding system. If it
4967 is, return 1, else return 0. */
4968
0a28aafb 4969static int
ff0dacd7 4970detect_coding_charset (coding, detect_info)
df7492f9 4971 struct coding_system *coding;
ff0dacd7 4972 struct coding_detection_info *detect_info;
1397dc18 4973{
065e3595 4974 const unsigned char *src = coding->source, *src_base;
8f924df7 4975 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
4976 int multibytep = coding->src_multibyte;
4977 int consumed_chars = 0;
07295713 4978 Lisp_Object attrs, valids, name;
584948ac 4979 int found = 0;
716b3fa0 4980 int head_ascii = coding->head_ascii;
07295713 4981 int check_latin_extra = 0;
1397dc18 4982
ff0dacd7
KH
4983 detect_info->checked |= CATEGORY_MASK_CHARSET;
4984
df7492f9
KH
4985 coding = &coding_categories[coding_category_charset];
4986 attrs = CODING_ID_ATTRS (coding->id);
4987 valids = AREF (attrs, coding_attr_charset_valids);
07295713
KH
4988 name = CODING_ID_NAME (coding->id);
4989 if (VECTORP (Vlatin_extra_code_table)
4990 && strcmp ((char *) SDATA (SYMBOL_NAME (name)), "iso-8859-"))
4991 check_latin_extra = 1;
df7492f9 4992 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
716b3fa0 4993 src += head_ascii;
1397dc18 4994
b73bfc1c 4995 while (1)
1397dc18 4996 {
df7492f9 4997 int c;
716b3fa0
KH
4998 Lisp_Object val;
4999 struct charset *charset;
5000 int dim, idx;
1397dc18 5001
065e3595 5002 src_base = src;
df7492f9 5003 ONE_MORE_BYTE (c);
065e3595
KH
5004 if (c < 0)
5005 continue;
716b3fa0
KH
5006 val = AREF (valids, c);
5007 if (NILP (val))
df7492f9 5008 break;
584948ac 5009 if (c >= 0x80)
07295713
KH
5010 {
5011 if (c < 0xA0
5012 && check_latin_extra
5013 && NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
5014 break;
5015 found = CATEGORY_MASK_CHARSET;
5016 }
716b3fa0
KH
5017 if (INTEGERP (val))
5018 {
5019 charset = CHARSET_FROM_ID (XFASTINT (val));
5020 dim = CHARSET_DIMENSION (charset);
5021 for (idx = 1; idx < dim; idx++)
5022 {
5023 if (src == src_end)
5024 goto too_short;
5025 ONE_MORE_BYTE (c);
3ed051d4 5026 if (c < charset->code_space[(dim - 1 - idx) * 2]
716b3fa0
KH
5027 || c > charset->code_space[(dim - 1 - idx) * 2 + 1])
5028 break;
5029 }
5030 if (idx < dim)
5031 break;
5032 }
5033 else
5034 {
5035 idx = 1;
5036 for (; CONSP (val); val = XCDR (val))
5037 {
5038 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5039 dim = CHARSET_DIMENSION (charset);
5040 while (idx < dim)
5041 {
5042 if (src == src_end)
5043 goto too_short;
5044 ONE_MORE_BYTE (c);
5045 if (c < charset->code_space[(dim - 1 - idx) * 4]
5046 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5047 break;
5048 idx++;
5049 }
5050 if (idx == dim)
5051 {
5052 val = Qnil;
5053 break;
5054 }
5055 }
5056 if (CONSP (val))
5057 break;
5058 }
df7492f9 5059 }
716b3fa0 5060 too_short:
ff0dacd7 5061 detect_info->rejected |= CATEGORY_MASK_CHARSET;
df7492f9 5062 return 0;
4ed46869 5063
df7492f9 5064 no_more_source:
ff0dacd7
KH
5065 detect_info->found |= found;
5066 return 1;
df7492f9 5067}
b73bfc1c 5068
b73bfc1c 5069static void
df7492f9 5070decode_coding_charset (coding)
4ed46869 5071 struct coding_system *coding;
4ed46869 5072{
8f924df7
KH
5073 const unsigned char *src = coding->source + coding->consumed;
5074 const unsigned char *src_end = coding->source + coding->src_bytes;
5075 const unsigned char *src_base;
69a80ea3
KH
5076 int *charbuf = coding->charbuf + coding->charbuf_used;
5077 int *charbuf_end
5078 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
df7492f9
KH
5079 int consumed_chars = 0, consumed_chars_base;
5080 int multibytep = coding->src_multibyte;
24a73b0a 5081 Lisp_Object attrs, charset_list, valids;
ff0dacd7
KH
5082 int char_offset = coding->produced_char;
5083 int last_offset = char_offset;
5084 int last_id = charset_ascii;
119852e7
KH
5085 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5086 int byte_after_cr = -1;
df7492f9 5087
24a73b0a 5088 CODING_GET_INFO (coding, attrs, charset_list);
4eb6d3f1 5089 valids = AREF (attrs, coding_attr_charset_valids);
b73bfc1c 5090
df7492f9 5091 while (1)
4ed46869 5092 {
4eb6d3f1 5093 int c;
24a73b0a
KH
5094 Lisp_Object val;
5095 struct charset *charset;
5096 int dim;
5097 int len = 1;
5098 unsigned code;
df7492f9
KH
5099
5100 src_base = src;
5101 consumed_chars_base = consumed_chars;
b73bfc1c 5102
df7492f9
KH
5103 if (charbuf >= charbuf_end)
5104 break;
5105
119852e7
KH
5106 if (byte_after_cr >= 0)
5107 {
5108 c = byte_after_cr;
5109 byte_after_cr = -1;
5110 }
5111 else
5112 {
5113 ONE_MORE_BYTE (c);
5114 if (eol_crlf && c == '\r')
5115 ONE_MORE_BYTE (byte_after_cr);
5116 }
065e3595
KH
5117 if (c < 0)
5118 goto invalid_code;
24a73b0a
KH
5119 code = c;
5120
5121 val = AREF (valids, c);
5122 if (NILP (val))
5123 goto invalid_code;
5124 if (INTEGERP (val))
d46c5b12 5125 {
24a73b0a
KH
5126 charset = CHARSET_FROM_ID (XFASTINT (val));
5127 dim = CHARSET_DIMENSION (charset);
5128 while (len < dim)
b73bfc1c 5129 {
24a73b0a
KH
5130 ONE_MORE_BYTE (c);
5131 code = (code << 8) | c;
5132 len++;
b73bfc1c 5133 }
24a73b0a
KH
5134 CODING_DECODE_CHAR (coding, src, src_base, src_end,
5135 charset, code, c);
d46c5b12 5136 }
df7492f9 5137 else
d46c5b12 5138 {
24a73b0a
KH
5139 /* VAL is a list of charset IDs. It is assured that the
5140 list is sorted by charset dimensions (smaller one
5141 comes first). */
5142 while (CONSP (val))
4eb6d3f1 5143 {
24a73b0a 5144 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
c7c66a95 5145 dim = CHARSET_DIMENSION (charset);
f9d71dcd 5146 while (len < dim)
4eb6d3f1 5147 {
acb2a965
KH
5148 ONE_MORE_BYTE (c);
5149 code = (code << 8) | c;
f9d71dcd 5150 len++;
4eb6d3f1 5151 }
24a73b0a
KH
5152 CODING_DECODE_CHAR (coding, src, src_base,
5153 src_end, charset, code, c);
5154 if (c >= 0)
5155 break;
5156 val = XCDR (val);
ff0dacd7 5157 }
d46c5b12 5158 }
24a73b0a
KH
5159 if (c < 0)
5160 goto invalid_code;
5161 if (charset->id != charset_ascii
5162 && last_id != charset->id)
5163 {
5164 if (last_id != charset_ascii)
69a80ea3 5165 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
24a73b0a
KH
5166 last_id = charset->id;
5167 last_offset = char_offset;
5168 }
5169
df7492f9 5170 *charbuf++ = c;
ff0dacd7 5171 char_offset++;
df7492f9
KH
5172 continue;
5173
5174 invalid_code:
5175 src = src_base;
5176 consumed_chars = consumed_chars_base;
5177 ONE_MORE_BYTE (c);
065e3595 5178 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 5179 char_offset++;
df7492f9 5180 coding->errors++;
4ed46869
KH
5181 }
5182
df7492f9 5183 no_more_source:
ff0dacd7 5184 if (last_id != charset_ascii)
69a80ea3 5185 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
5186 coding->consumed_char += consumed_chars_base;
5187 coding->consumed = src_base - coding->source;
5188 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
5189}
5190
df7492f9
KH
5191static int
5192encode_coding_charset (coding)
4ed46869 5193 struct coding_system *coding;
4ed46869 5194{
df7492f9
KH
5195 int multibytep = coding->dst_multibyte;
5196 int *charbuf = coding->charbuf;
5197 int *charbuf_end = charbuf + coding->charbuf_used;
5198 unsigned char *dst = coding->destination + coding->produced;
5199 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5200 int safe_room = MAX_MULTIBYTE_LENGTH;
5201 int produced_chars = 0;
24a73b0a 5202 Lisp_Object attrs, charset_list;
df7492f9 5203 int ascii_compatible;
b73bfc1c 5204 int c;
b73bfc1c 5205
24a73b0a 5206 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9 5207 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
fb88bf2d 5208
df7492f9 5209 while (charbuf < charbuf_end)
4ed46869 5210 {
4eb6d3f1 5211 struct charset *charset;
df7492f9 5212 unsigned code;
8f924df7 5213
df7492f9
KH
5214 ASSURE_DESTINATION (safe_room);
5215 c = *charbuf++;
5216 if (ascii_compatible && ASCII_CHAR_P (c))
5217 EMIT_ONE_ASCII_BYTE (c);
16eafb5d 5218 else if (CHAR_BYTE8_P (c))
4ed46869 5219 {
16eafb5d
KH
5220 c = CHAR_TO_BYTE8 (c);
5221 EMIT_ONE_BYTE (c);
d46c5b12 5222 }
d46c5b12 5223 else
b73bfc1c 5224 {
4eb6d3f1
KH
5225 charset = char_charset (c, charset_list, &code);
5226 if (charset)
5227 {
5228 if (CHARSET_DIMENSION (charset) == 1)
5229 EMIT_ONE_BYTE (code);
5230 else if (CHARSET_DIMENSION (charset) == 2)
5231 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5232 else if (CHARSET_DIMENSION (charset) == 3)
5233 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5234 else
5235 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5236 (code >> 8) & 0xFF, code & 0xFF);
5237 }
5238 else
41cbe562
KH
5239 {
5240 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5241 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5242 else
5243 c = coding->default_char;
5244 EMIT_ONE_BYTE (c);
5245 }
4ed46869 5246 }
4ed46869
KH
5247 }
5248
065e3595 5249 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
5250 coding->produced_char += produced_chars;
5251 coding->produced = dst - coding->destination;
5252 return 0;
4ed46869
KH
5253}
5254
5255\f
1397dc18 5256/*** 7. C library functions ***/
4ed46869 5257
df7492f9
KH
5258/* Setup coding context CODING from information about CODING_SYSTEM.
5259 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
5260 CODING_SYSTEM is invalid, signal an error. */
4ed46869 5261
ec6d2bb8 5262void
e0e989f6
KH
5263setup_coding_system (coding_system, coding)
5264 Lisp_Object coding_system;
4ed46869
KH
5265 struct coding_system *coding;
5266{
df7492f9
KH
5267 Lisp_Object attrs;
5268 Lisp_Object eol_type;
5269 Lisp_Object coding_type;
4608c386 5270 Lisp_Object val;
4ed46869 5271
df7492f9 5272 if (NILP (coding_system))
ae6f73fa 5273 coding_system = Qundecided;
c07c8e12 5274
df7492f9 5275 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
1f5dbf34 5276
df7492f9
KH
5277 attrs = CODING_ID_ATTRS (coding->id);
5278 eol_type = CODING_ID_EOL_TYPE (coding->id);
1f5dbf34 5279
df7492f9
KH
5280 coding->mode = 0;
5281 coding->head_ascii = -1;
4a015c45
KH
5282 if (VECTORP (eol_type))
5283 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5284 | CODING_REQUIRE_DETECTION_MASK);
5285 else if (! EQ (eol_type, Qunix))
5286 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5287 | CODING_REQUIRE_ENCODING_MASK);
5288 else
5289 coding->common_flags = 0;
5e5c78be
KH
5290 if (! NILP (CODING_ATTR_POST_READ (attrs)))
5291 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5292 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5293 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
8f924df7
KH
5294 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5295 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
1f5dbf34 5296
df7492f9 5297 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7
KH
5298 coding->max_charset_id = SCHARS (val) - 1;
5299 coding->safe_charsets = (char *) SDATA (val);
df7492f9 5300 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4608c386 5301
df7492f9
KH
5302 coding_type = CODING_ATTR_TYPE (attrs);
5303 if (EQ (coding_type, Qundecided))
d46c5b12 5304 {
df7492f9
KH
5305 coding->detector = NULL;
5306 coding->decoder = decode_coding_raw_text;
5307 coding->encoder = encode_coding_raw_text;
5308 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
d46c5b12 5309 }
df7492f9 5310 else if (EQ (coding_type, Qiso_2022))
d46c5b12 5311 {
df7492f9
KH
5312 int i;
5313 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5314
5315 /* Invoke graphic register 0 to plane 0. */
5316 CODING_ISO_INVOCATION (coding, 0) = 0;
5317 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5318 CODING_ISO_INVOCATION (coding, 1)
5319 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5320 /* Setup the initial status of designation. */
5321 for (i = 0; i < 4; i++)
5322 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5323 /* Not single shifting initially. */
5324 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5325 /* Beginning of buffer should also be regarded as bol. */
5326 CODING_ISO_BOL (coding) = 1;
5327 coding->detector = detect_coding_iso_2022;
5328 coding->decoder = decode_coding_iso_2022;
5329 coding->encoder = encode_coding_iso_2022;
5330 if (flags & CODING_ISO_FLAG_SAFE)
5331 coding->mode |= CODING_MODE_SAFE_ENCODING;
d46c5b12 5332 coding->common_flags
df7492f9
KH
5333 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5334 | CODING_REQUIRE_FLUSHING_MASK);
5335 if (flags & CODING_ISO_FLAG_COMPOSITION)
5336 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
ff0dacd7
KH
5337 if (flags & CODING_ISO_FLAG_DESIGNATION)
5338 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
df7492f9
KH
5339 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5340 {
5341 setup_iso_safe_charsets (attrs);
5342 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7
KH
5343 coding->max_charset_id = SCHARS (val) - 1;
5344 coding->safe_charsets = (char *) SDATA (val);
df7492f9
KH
5345 }
5346 CODING_ISO_FLAGS (coding) = flags;
d46c5b12 5347 }
df7492f9 5348 else if (EQ (coding_type, Qcharset))
d46c5b12 5349 {
df7492f9
KH
5350 coding->detector = detect_coding_charset;
5351 coding->decoder = decode_coding_charset;
5352 coding->encoder = encode_coding_charset;
d46c5b12 5353 coding->common_flags
df7492f9 5354 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
d46c5b12 5355 }
df7492f9 5356 else if (EQ (coding_type, Qutf_8))
d46c5b12 5357 {
a470d443
KH
5358 val = AREF (attrs, coding_attr_utf_bom);
5359 CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5360 : EQ (val, Qt) ? utf_with_bom
5361 : utf_without_bom);
df7492f9
KH
5362 coding->detector = detect_coding_utf_8;
5363 coding->decoder = decode_coding_utf_8;
5364 coding->encoder = encode_coding_utf_8;
5365 coding->common_flags
5366 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
a470d443
KH
5367 if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5368 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
df7492f9
KH
5369 }
5370 else if (EQ (coding_type, Qutf_16))
5371 {
a470d443
KH
5372 val = AREF (attrs, coding_attr_utf_bom);
5373 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5374 : EQ (val, Qt) ? utf_with_bom
5375 : utf_without_bom);
df7492f9 5376 val = AREF (attrs, coding_attr_utf_16_endian);
b49a1807 5377 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
df7492f9 5378 : utf_16_little_endian);
e19c3639 5379 CODING_UTF_16_SURROGATE (coding) = 0;
df7492f9
KH
5380 coding->detector = detect_coding_utf_16;
5381 coding->decoder = decode_coding_utf_16;
5382 coding->encoder = encode_coding_utf_16;
5383 coding->common_flags
5384 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
a470d443 5385 if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
b49a1807 5386 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
d46c5b12 5387 }
df7492f9 5388 else if (EQ (coding_type, Qccl))
4ed46869 5389 {
df7492f9
KH
5390 coding->detector = detect_coding_ccl;
5391 coding->decoder = decode_coding_ccl;
5392 coding->encoder = encode_coding_ccl;
c952af22 5393 coding->common_flags
df7492f9
KH
5394 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5395 | CODING_REQUIRE_FLUSHING_MASK);
5396 }
5397 else if (EQ (coding_type, Qemacs_mule))
5398 {
5399 coding->detector = detect_coding_emacs_mule;
5400 coding->decoder = decode_coding_emacs_mule;
5401 coding->encoder = encode_coding_emacs_mule;
c952af22 5402 coding->common_flags
df7492f9
KH
5403 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5404 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5405 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5406 {
5407 Lisp_Object tail, safe_charsets;
5408 int max_charset_id = 0;
5409
5410 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5411 tail = XCDR (tail))
5412 if (max_charset_id < XFASTINT (XCAR (tail)))
5413 max_charset_id = XFASTINT (XCAR (tail));
5414 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
5415 make_number (255));
5416 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5417 tail = XCDR (tail))
8f924df7 5418 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
df7492f9 5419 coding->max_charset_id = max_charset_id;
8f924df7 5420 coding->safe_charsets = (char *) SDATA (safe_charsets);
df7492f9
KH
5421 }
5422 }
5423 else if (EQ (coding_type, Qshift_jis))
5424 {
5425 coding->detector = detect_coding_sjis;
5426 coding->decoder = decode_coding_sjis;
5427 coding->encoder = encode_coding_sjis;
c952af22 5428 coding->common_flags
df7492f9
KH
5429 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5430 }
5431 else if (EQ (coding_type, Qbig5))
5432 {
5433 coding->detector = detect_coding_big5;
5434 coding->decoder = decode_coding_big5;
5435 coding->encoder = encode_coding_big5;
c952af22 5436 coding->common_flags
df7492f9
KH
5437 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5438 }
5439 else /* EQ (coding_type, Qraw_text) */
ec6d2bb8 5440 {
df7492f9
KH
5441 coding->detector = NULL;
5442 coding->decoder = decode_coding_raw_text;
5443 coding->encoder = encode_coding_raw_text;
ea29edf2
KH
5444 if (! EQ (eol_type, Qunix))
5445 {
5446 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5447 if (! VECTORP (eol_type))
5448 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5449 }
5450
4ed46869 5451 }
4ed46869 5452
df7492f9 5453 return;
4ed46869
KH
5454}
5455
0ff61e78
KH
5456/* Return a list of charsets supported by CODING. */
5457
5458Lisp_Object
5459coding_charset_list (coding)
5460 struct coding_system *coding;
5461{
35befdaa 5462 Lisp_Object attrs, charset_list;
0ff61e78
KH
5463
5464 CODING_GET_INFO (coding, attrs, charset_list);
5465 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5466 {
5467 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5468
5469 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5470 charset_list = Viso_2022_charset_list;
5471 }
5472 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5473 {
5474 charset_list = Vemacs_mule_charset_list;
5475 }
5476 return charset_list;
5477}
5478
5479
df7492f9
KH
5480/* Return raw-text or one of its subsidiaries that has the same
5481 eol_type as CODING-SYSTEM. */
ec6d2bb8 5482
df7492f9
KH
5483Lisp_Object
5484raw_text_coding_system (coding_system)
5485 Lisp_Object coding_system;
ec6d2bb8 5486{
0be8721c 5487 Lisp_Object spec, attrs;
df7492f9 5488 Lisp_Object eol_type, raw_text_eol_type;
ec6d2bb8 5489
d3e4cb56
KH
5490 if (NILP (coding_system))
5491 return Qraw_text;
df7492f9
KH
5492 spec = CODING_SYSTEM_SPEC (coding_system);
5493 attrs = AREF (spec, 0);
ec6d2bb8 5494
df7492f9
KH
5495 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5496 return coding_system;
ec6d2bb8 5497
df7492f9
KH
5498 eol_type = AREF (spec, 2);
5499 if (VECTORP (eol_type))
5500 return Qraw_text;
5501 spec = CODING_SYSTEM_SPEC (Qraw_text);
5502 raw_text_eol_type = AREF (spec, 2);
5503 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5504 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5505 : AREF (raw_text_eol_type, 2));
ec6d2bb8
KH
5506}
5507
54f78171 5508
df7492f9
KH
5509/* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5510 does, return one of the subsidiary that has the same eol-spec as
fcbcfb64
KH
5511 PARENT. Otherwise, return CODING_SYSTEM. If PARENT is nil,
5512 inherit end-of-line format from the system's setting
5513 (system_eol_type). */
df7492f9
KH
5514
5515Lisp_Object
5516coding_inherit_eol_type (coding_system, parent)
b74e4686 5517 Lisp_Object coding_system, parent;
54f78171 5518{
3e139625 5519 Lisp_Object spec, eol_type;
54f78171 5520
d3e4cb56
KH
5521 if (NILP (coding_system))
5522 coding_system = Qraw_text;
df7492f9 5523 spec = CODING_SYSTEM_SPEC (coding_system);
df7492f9 5524 eol_type = AREF (spec, 2);
fcbcfb64 5525 if (VECTORP (eol_type))
df7492f9 5526 {
df7492f9
KH
5527 Lisp_Object parent_eol_type;
5528
fcbcfb64
KH
5529 if (! NILP (parent))
5530 {
5531 Lisp_Object parent_spec;
5532
4a015c45 5533 parent_spec = CODING_SYSTEM_SPEC (parent);
fcbcfb64
KH
5534 parent_eol_type = AREF (parent_spec, 2);
5535 }
5536 else
5537 parent_eol_type = system_eol_type;
df7492f9
KH
5538 if (EQ (parent_eol_type, Qunix))
5539 coding_system = AREF (eol_type, 0);
5540 else if (EQ (parent_eol_type, Qdos))
5541 coding_system = AREF (eol_type, 1);
5542 else if (EQ (parent_eol_type, Qmac))
5543 coding_system = AREF (eol_type, 2);
54f78171 5544 }
df7492f9 5545 return coding_system;
54f78171
KH
5546}
5547
4ed46869
KH
5548/* Emacs has a mechanism to automatically detect a coding system if it
5549 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5550 it's impossible to distinguish some coding systems accurately
5551 because they use the same range of codes. So, at first, coding
5552 systems are categorized into 7, those are:
5553
0ef69138 5554 o coding-category-emacs-mule
4ed46869
KH
5555
5556 The category for a coding system which has the same code range
5557 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 5558 symbol) `emacs-mule' by default.
4ed46869
KH
5559
5560 o coding-category-sjis
5561
5562 The category for a coding system which has the same code range
5563 as SJIS. Assigned the coding-system (Lisp
7717c392 5564 symbol) `japanese-shift-jis' by default.
4ed46869
KH
5565
5566 o coding-category-iso-7
5567
5568 The category for a coding system which has the same code range
7717c392 5569 as ISO2022 of 7-bit environment. This doesn't use any locking
d46c5b12
KH
5570 shift and single shift functions. This can encode/decode all
5571 charsets. Assigned the coding-system (Lisp symbol)
5572 `iso-2022-7bit' by default.
5573
5574 o coding-category-iso-7-tight
5575
5576 Same as coding-category-iso-7 except that this can
5577 encode/decode only the specified charsets.
4ed46869
KH
5578
5579 o coding-category-iso-8-1
5580
5581 The category for a coding system which has the same code range
5582 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
5583 for DIMENSION1 charset. This doesn't use any locking shift
5584 and single shift functions. Assigned the coding-system (Lisp
5585 symbol) `iso-latin-1' by default.
4ed46869
KH
5586
5587 o coding-category-iso-8-2
5588
5589 The category for a coding system which has the same code range
5590 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
5591 for DIMENSION2 charset. This doesn't use any locking shift
5592 and single shift functions. Assigned the coding-system (Lisp
5593 symbol) `japanese-iso-8bit' by default.
4ed46869 5594
7717c392 5595 o coding-category-iso-7-else
4ed46869
KH
5596
5597 The category for a coding system which has the same code range
df7492f9 5598 as ISO2022 of 7-bit environment but uses locking shift or
7717c392
KH
5599 single shift functions. Assigned the coding-system (Lisp
5600 symbol) `iso-2022-7bit-lock' by default.
5601
5602 o coding-category-iso-8-else
5603
5604 The category for a coding system which has the same code range
df7492f9 5605 as ISO2022 of 8-bit environment but uses locking shift or
7717c392
KH
5606 single shift functions. Assigned the coding-system (Lisp
5607 symbol) `iso-2022-8bit-ss2' by default.
4ed46869
KH
5608
5609 o coding-category-big5
5610
5611 The category for a coding system which has the same code range
5612 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 5613 `cn-big5' by default.
4ed46869 5614
fa42c37f
KH
5615 o coding-category-utf-8
5616
5617 The category for a coding system which has the same code range
6e76ae91 5618 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
fa42c37f
KH
5619 symbol) `utf-8' by default.
5620
5621 o coding-category-utf-16-be
5622
5623 The category for a coding system in which a text has an
5624 Unicode signature (cf. Unicode Standard) in the order of BIG
5625 endian at the head. Assigned the coding-system (Lisp symbol)
5626 `utf-16-be' by default.
5627
5628 o coding-category-utf-16-le
5629
5630 The category for a coding system in which a text has an
5631 Unicode signature (cf. Unicode Standard) in the order of
5632 LITTLE endian at the head. Assigned the coding-system (Lisp
5633 symbol) `utf-16-le' by default.
5634
1397dc18
KH
5635 o coding-category-ccl
5636
5637 The category for a coding system of which encoder/decoder is
5638 written in CCL programs. The default value is nil, i.e., no
5639 coding system is assigned.
5640
4ed46869
KH
5641 o coding-category-binary
5642
5643 The category for a coding system not categorized in any of the
5644 above. Assigned the coding-system (Lisp symbol)
e0e989f6 5645 `no-conversion' by default.
4ed46869
KH
5646
5647 Each of them is a Lisp symbol and the value is an actual
df7492f9 5648 `coding-system's (this is also a Lisp symbol) assigned by a user.
4ed46869
KH
5649 What Emacs does actually is to detect a category of coding system.
5650 Then, it uses a `coding-system' assigned to it. If Emacs can't
df7492f9 5651 decide only one possible category, it selects a category of the
4ed46869
KH
5652 highest priority. Priorities of categories are also specified by a
5653 user in a Lisp variable `coding-category-list'.
5654
5655*/
5656
df7492f9
KH
5657#define EOL_SEEN_NONE 0
5658#define EOL_SEEN_LF 1
5659#define EOL_SEEN_CR 2
5660#define EOL_SEEN_CRLF 4
66cfb530 5661
ff0dacd7
KH
5662/* Detect how end-of-line of a text of length SRC_BYTES pointed by
5663 SOURCE is encoded. If CATEGORY is one of
5664 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5665 two-byte, else they are encoded by one-byte.
5666
5667 Return one of EOL_SEEN_XXX. */
4ed46869 5668
bc4bc72a 5669#define MAX_EOL_CHECK_COUNT 3
d46c5b12
KH
5670
5671static int
89528eb3 5672detect_eol (source, src_bytes, category)
f6cbaf43 5673 const unsigned char *source;
df7492f9 5674 EMACS_INT src_bytes;
89528eb3 5675 enum coding_category category;
4ed46869 5676{
f6cbaf43 5677 const unsigned char *src = source, *src_end = src + src_bytes;
4ed46869 5678 unsigned char c;
df7492f9
KH
5679 int total = 0;
5680 int eol_seen = EOL_SEEN_NONE;
4ed46869 5681
89528eb3 5682 if ((1 << category) & CATEGORY_MASK_UTF_16)
4ed46869 5683 {
df7492f9 5684 int msb, lsb;
fa42c37f 5685
89528eb3
KH
5686 msb = category == (coding_category_utf_16_le
5687 | coding_category_utf_16_le_nosig);
df7492f9 5688 lsb = 1 - msb;
fa42c37f 5689
df7492f9 5690 while (src + 1 < src_end)
fa42c37f 5691 {
df7492f9
KH
5692 c = src[lsb];
5693 if (src[msb] == 0 && (c == '\n' || c == '\r'))
fa42c37f 5694 {
df7492f9
KH
5695 int this_eol;
5696
5697 if (c == '\n')
5698 this_eol = EOL_SEEN_LF;
5699 else if (src + 3 >= src_end
5700 || src[msb + 2] != 0
5701 || src[lsb + 2] != '\n')
5702 this_eol = EOL_SEEN_CR;
fa42c37f 5703 else
8f924df7 5704 this_eol = EOL_SEEN_CRLF;
df7492f9
KH
5705
5706 if (eol_seen == EOL_SEEN_NONE)
5707 /* This is the first end-of-line. */
5708 eol_seen = this_eol;
5709 else if (eol_seen != this_eol)
fa42c37f 5710 {
df7492f9
KH
5711 /* The found type is different from what found before. */
5712 eol_seen = EOL_SEEN_LF;
5713 break;
fa42c37f 5714 }
df7492f9
KH
5715 if (++total == MAX_EOL_CHECK_COUNT)
5716 break;
fa42c37f 5717 }
df7492f9 5718 src += 2;
fa42c37f 5719 }
bcf26d6a 5720 }
d46c5b12 5721 else
c4825358 5722 {
df7492f9 5723 while (src < src_end)
27901516 5724 {
df7492f9
KH
5725 c = *src++;
5726 if (c == '\n' || c == '\r')
5727 {
5728 int this_eol;
d46c5b12 5729
df7492f9
KH
5730 if (c == '\n')
5731 this_eol = EOL_SEEN_LF;
5732 else if (src >= src_end || *src != '\n')
5733 this_eol = EOL_SEEN_CR;
5734 else
5735 this_eol = EOL_SEEN_CRLF, src++;
d46c5b12 5736
df7492f9
KH
5737 if (eol_seen == EOL_SEEN_NONE)
5738 /* This is the first end-of-line. */
5739 eol_seen = this_eol;
5740 else if (eol_seen != this_eol)
5741 {
5742 /* The found type is different from what found before. */
5743 eol_seen = EOL_SEEN_LF;
5744 break;
5745 }
5746 if (++total == MAX_EOL_CHECK_COUNT)
5747 break;
5748 }
5749 }
73be902c 5750 }
df7492f9 5751 return eol_seen;
73be902c
KH
5752}
5753
df7492f9 5754
24a73b0a 5755static Lisp_Object
df7492f9
KH
5756adjust_coding_eol_type (coding, eol_seen)
5757 struct coding_system *coding;
5758 int eol_seen;
73be902c 5759{
0be8721c 5760 Lisp_Object eol_type;
8f924df7 5761
df7492f9
KH
5762 eol_type = CODING_ID_EOL_TYPE (coding->id);
5763 if (eol_seen & EOL_SEEN_LF)
24a73b0a
KH
5764 {
5765 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5766 eol_type = Qunix;
5767 }
6f197c07 5768 else if (eol_seen & EOL_SEEN_CRLF)
24a73b0a
KH
5769 {
5770 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5771 eol_type = Qdos;
5772 }
6f197c07 5773 else if (eol_seen & EOL_SEEN_CR)
24a73b0a
KH
5774 {
5775 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5776 eol_type = Qmac;
5777 }
5778 return eol_type;
d46c5b12 5779}
4ed46869 5780
df7492f9
KH
5781/* Detect how a text specified in CODING is encoded. If a coding
5782 system is detected, update fields of CODING by the detected coding
5783 system. */
0a28aafb 5784
df7492f9
KH
5785void
5786detect_coding (coding)
d46c5b12 5787 struct coding_system *coding;
d46c5b12 5788{
8f924df7 5789 const unsigned char *src, *src_end;
d46c5b12 5790
df7492f9
KH
5791 coding->consumed = coding->consumed_char = 0;
5792 coding->produced = coding->produced_char = 0;
5793 coding_set_source (coding);
1c3478b0 5794
df7492f9 5795 src_end = coding->source + coding->src_bytes;
c0e16b14 5796 coding->head_ascii = 0;
1c3478b0 5797
df7492f9
KH
5798 /* If we have not yet decided the text encoding type, detect it
5799 now. */
5800 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
b73bfc1c 5801 {
df7492f9 5802 int c, i;
6cb21a4f 5803 struct coding_detection_info detect_info;
2f3cbb32 5804 int null_byte_found = 0, eight_bit_found = 0;
df7492f9 5805
6cb21a4f 5806 detect_info.checked = detect_info.found = detect_info.rejected = 0;
2f3cbb32 5807 for (src = coding->source; src < src_end; src++)
d46c5b12 5808 {
df7492f9 5809 c = *src;
6cb21a4f 5810 if (c & 0x80)
6cb21a4f 5811 {
2f3cbb32 5812 eight_bit_found = 1;
2f3cbb32
KH
5813 if (null_byte_found)
5814 break;
5815 }
5816 else if (c < 0x20)
5817 {
5818 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
5819 && ! inhibit_iso_escape_detection
5820 && ! detect_info.checked)
6cb21a4f 5821 {
2f3cbb32
KH
5822 if (detect_coding_iso_2022 (coding, &detect_info))
5823 {
5824 /* We have scanned the whole data. */
5825 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
c0e16b14
KH
5826 {
5827 /* We didn't find an 8-bit code. We may
5828 have found a null-byte, but it's very
5829 rare that a binary file confirm to
5830 ISO-2022. */
5831 src = src_end;
5832 coding->head_ascii = src - coding->source;
5833 }
5834 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
2f3cbb32
KH
5835 break;
5836 }
5837 }
5838 else if (! c)
5839 {
5840 null_byte_found = 1;
5841 if (eight_bit_found)
5842 break;
6cb21a4f 5843 }
c006c0c8
KH
5844 if (! eight_bit_found)
5845 coding->head_ascii++;
6cb21a4f 5846 }
c006c0c8 5847 else if (! eight_bit_found)
c0e16b14 5848 coding->head_ascii++;
d46c5b12 5849 }
df7492f9 5850
2f3cbb32
KH
5851 if (null_byte_found || eight_bit_found
5852 || coding->head_ascii < coding->src_bytes
6cb21a4f 5853 || detect_info.found)
d46c5b12 5854 {
ff0dacd7
KH
5855 enum coding_category category;
5856 struct coding_system *this;
df7492f9 5857
6cb21a4f
KH
5858 if (coding->head_ascii == coding->src_bytes)
5859 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
5860 for (i = 0; i < coding_category_raw_text; i++)
5861 {
5862 category = coding_priorities[i];
5863 this = coding_categories + category;
5864 if (detect_info.found & (1 << category))
24a73b0a 5865 break;
6cb21a4f
KH
5866 }
5867 else
2f3cbb32
KH
5868 {
5869 if (null_byte_found)
ff0dacd7 5870 {
2f3cbb32
KH
5871 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
5872 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
ff0dacd7 5873 }
2f3cbb32
KH
5874 for (i = 0; i < coding_category_raw_text; i++)
5875 {
5876 category = coding_priorities[i];
5877 this = coding_categories + category;
5878 if (this->id < 0)
5879 {
5880 /* No coding system of this category is defined. */
5881 detect_info.rejected |= (1 << category);
5882 }
5883 else if (category >= coding_category_raw_text)
5884 continue;
5885 else if (detect_info.checked & (1 << category))
5886 {
5887 if (detect_info.found & (1 << category))
5888 break;
5889 }
5890 else if ((*(this->detector)) (coding, &detect_info)
5891 && detect_info.found & (1 << category))
5892 {
5893 if (category == coding_category_utf_16_auto)
5894 {
5895 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5896 category = coding_category_utf_16_le;
5897 else
5898 category = coding_category_utf_16_be;
5899 }
5900 break;
5901 }
5902 }
2f3cbb32 5903 }
c0e16b14
KH
5904
5905 if (i < coding_category_raw_text)
5906 setup_coding_system (CODING_ID_NAME (this->id), coding);
5907 else if (null_byte_found)
5908 setup_coding_system (Qno_conversion, coding);
5909 else if ((detect_info.rejected & CATEGORY_MASK_ANY)
5910 == CATEGORY_MASK_ANY)
5911 setup_coding_system (Qraw_text, coding);
5912 else if (detect_info.rejected)
5913 for (i = 0; i < coding_category_raw_text; i++)
5914 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5915 {
5916 this = coding_categories + coding_priorities[i];
5917 setup_coding_system (CODING_ID_NAME (this->id), coding);
5918 break;
5919 }
d46c5b12 5920 }
b73bfc1c 5921 }
a470d443
KH
5922 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5923 == coding_category_utf_8_auto)
5924 {
5925 Lisp_Object coding_systems;
5926 struct coding_detection_info detect_info;
5927
5928 coding_systems
5929 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
5930 detect_info.found = detect_info.rejected = 0;
5931 coding->head_ascii = 0;
5932 if (CONSP (coding_systems)
5933 && detect_coding_utf_8 (coding, &detect_info))
5934 {
5935 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
5936 setup_coding_system (XCAR (coding_systems), coding);
5937 else
5938 setup_coding_system (XCDR (coding_systems), coding);
5939 }
5940 }
24a73b0a
KH
5941 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5942 == coding_category_utf_16_auto)
b49a1807
KH
5943 {
5944 Lisp_Object coding_systems;
5945 struct coding_detection_info detect_info;
5946
5947 coding_systems
a470d443 5948 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
b49a1807 5949 detect_info.found = detect_info.rejected = 0;
a470d443 5950 coding->head_ascii = 0;
b49a1807 5951 if (CONSP (coding_systems)
24a73b0a 5952 && detect_coding_utf_16 (coding, &detect_info))
b49a1807
KH
5953 {
5954 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5955 setup_coding_system (XCAR (coding_systems), coding);
24a73b0a 5956 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
b49a1807
KH
5957 setup_coding_system (XCDR (coding_systems), coding);
5958 }
5959 }
4ed46869 5960}
4ed46869 5961
d46c5b12 5962
aaaf0b1e 5963static void
df7492f9 5964decode_eol (coding)
aaaf0b1e 5965 struct coding_system *coding;
aaaf0b1e 5966{
24a73b0a
KH
5967 Lisp_Object eol_type;
5968 unsigned char *p, *pbeg, *pend;
3ed051d4 5969
24a73b0a
KH
5970 eol_type = CODING_ID_EOL_TYPE (coding->id);
5971 if (EQ (eol_type, Qunix))
5972 return;
5973
5974 if (NILP (coding->dst_object))
5975 pbeg = coding->destination;
5976 else
5977 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
5978 pend = pbeg + coding->produced;
5979
5980 if (VECTORP (eol_type))
aaaf0b1e 5981 {
df7492f9 5982 int eol_seen = EOL_SEEN_NONE;
4ed46869 5983
24a73b0a 5984 for (p = pbeg; p < pend; p++)
aaaf0b1e 5985 {
df7492f9
KH
5986 if (*p == '\n')
5987 eol_seen |= EOL_SEEN_LF;
5988 else if (*p == '\r')
aaaf0b1e 5989 {
df7492f9 5990 if (p + 1 < pend && *(p + 1) == '\n')
aaaf0b1e 5991 {
df7492f9
KH
5992 eol_seen |= EOL_SEEN_CRLF;
5993 p++;
aaaf0b1e 5994 }
aaaf0b1e 5995 else
df7492f9 5996 eol_seen |= EOL_SEEN_CR;
aaaf0b1e 5997 }
aaaf0b1e 5998 }
24a73b0a
KH
5999 if (eol_seen != EOL_SEEN_NONE
6000 && eol_seen != EOL_SEEN_LF
6001 && eol_seen != EOL_SEEN_CRLF
6002 && eol_seen != EOL_SEEN_CR)
6003 eol_seen = EOL_SEEN_LF;
df7492f9 6004 if (eol_seen != EOL_SEEN_NONE)
24a73b0a 6005 eol_type = adjust_coding_eol_type (coding, eol_seen);
aaaf0b1e 6006 }
d46c5b12 6007
24a73b0a 6008 if (EQ (eol_type, Qmac))
27901516 6009 {
24a73b0a 6010 for (p = pbeg; p < pend; p++)
df7492f9
KH
6011 if (*p == '\r')
6012 *p = '\n';
4ed46869 6013 }
24a73b0a 6014 else if (EQ (eol_type, Qdos))
df7492f9 6015 {
24a73b0a 6016 int n = 0;
b73bfc1c 6017
24a73b0a
KH
6018 if (NILP (coding->dst_object))
6019 {
4347441b
KH
6020 /* Start deleting '\r' from the tail to minimize the memory
6021 movement. */
24a73b0a
KH
6022 for (p = pend - 2; p >= pbeg; p--)
6023 if (*p == '\r')
6024 {
6025 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
6026 n++;
6027 }
6028 }
6029 else
6030 {
4347441b
KH
6031 int pos_byte = coding->dst_pos_byte;
6032 int pos = coding->dst_pos;
6033 int pos_end = pos + coding->produced_char - 1;
6034
6035 while (pos < pos_end)
6036 {
6037 p = BYTE_POS_ADDR (pos_byte);
6038 if (*p == '\r' && p[1] == '\n')
6039 {
6040 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6041 n++;
6042 pos_end--;
6043 }
6044 pos++;
69b8522d
KH
6045 if (coding->dst_multibyte)
6046 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6047 else
6048 pos_byte++;
4347441b 6049 }
24a73b0a
KH
6050 }
6051 coding->produced -= n;
6052 coding->produced_char -= n;
aaaf0b1e 6053 }
4ed46869
KH
6054}
6055
7d64c6ad 6056
a6f87d34
KH
6057/* Return a translation table (or list of them) from coding system
6058 attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6059 decoding (ENCODEP is zero). */
7d64c6ad 6060
e6a54062 6061static Lisp_Object
09ee6fdd
KH
6062get_translation_table (attrs, encodep, max_lookup)
6063 Lisp_Object attrs;
6064 int encodep, *max_lookup;
7d64c6ad
KH
6065{
6066 Lisp_Object standard, translation_table;
09ee6fdd 6067 Lisp_Object val;
7d64c6ad
KH
6068
6069 if (encodep)
6070 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6071 standard = Vstandard_translation_table_for_encode;
6072 else
6073 translation_table = CODING_ATTR_DECODE_TBL (attrs),
6074 standard = Vstandard_translation_table_for_decode;
7d64c6ad 6075 if (NILP (translation_table))
09ee6fdd
KH
6076 translation_table = standard;
6077 else
a6f87d34 6078 {
09ee6fdd
KH
6079 if (SYMBOLP (translation_table))
6080 translation_table = Fget (translation_table, Qtranslation_table);
6081 else if (CONSP (translation_table))
6082 {
6083 translation_table = Fcopy_sequence (translation_table);
6084 for (val = translation_table; CONSP (val); val = XCDR (val))
6085 if (SYMBOLP (XCAR (val)))
6086 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6087 }
6088 if (CHAR_TABLE_P (standard))
6089 {
6090 if (CONSP (translation_table))
6091 translation_table = nconc2 (translation_table,
6092 Fcons (standard, Qnil));
6093 else
6094 translation_table = Fcons (translation_table,
6095 Fcons (standard, Qnil));
6096 }
a6f87d34 6097 }
2170c8f0
KH
6098
6099 if (max_lookup)
09ee6fdd 6100 {
2170c8f0
KH
6101 *max_lookup = 1;
6102 if (CHAR_TABLE_P (translation_table)
6103 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6104 {
6105 val = XCHAR_TABLE (translation_table)->extras[1];
6106 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6107 *max_lookup = XFASTINT (val);
6108 }
6109 else if (CONSP (translation_table))
6110 {
6111 Lisp_Object tail, val;
09ee6fdd 6112
2170c8f0
KH
6113 for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6114 if (CHAR_TABLE_P (XCAR (tail))
6115 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6116 {
6117 val = XCHAR_TABLE (XCAR (tail))->extras[1];
6118 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6119 *max_lookup = XFASTINT (val);
6120 }
6121 }
a6f87d34 6122 }
7d64c6ad
KH
6123 return translation_table;
6124}
6125
09ee6fdd
KH
/* Look up character C in TABLE, which is a char-table or a list
   whose char-table elements are consulted in order.

   Whenever a table maps C to a character, C is replaced by that
   character and TRANS is reset to Qnil; in a list, the next table is
   then consulted with the new C.  In a list, the search stops at the
   first entry that is neither nil nor a character, and TRANS is left
   holding that entry.  For a single char-table, TRANS is left
   holding the entry found unless it was a character.  If TABLE is
   neither a char-table nor a cons, TRANS is Qnil and C is unchanged.

   C and TRANS must be lvalues.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)                       \
  do {                                                                  \
    (trans) = Qnil;                                                     \
    if (CHAR_TABLE_P (table))                                           \
      {                                                                 \
        (trans) = CHAR_TABLE_REF (table, c);                            \
        if (CHARACTERP (trans))                                         \
          {                                                             \
            (c) = XFASTINT (trans);                                     \
            (trans) = Qnil;                                             \
          }                                                             \
      }                                                                 \
    else if (CONSP (table))                                             \
      {                                                                 \
        Lisp_Object tbl_tail;                                           \
                                                                        \
        for (tbl_tail = (table); CONSP (tbl_tail);                      \
             tbl_tail = XCDR (tbl_tail))                                \
          {                                                             \
            if (! CHAR_TABLE_P (XCAR (tbl_tail)))                       \
              continue;                                                 \
            (trans) = CHAR_TABLE_REF (XCAR (tbl_tail), c);              \
            if (CHARACTERP (trans))                                     \
              {                                                         \
                (c) = XFASTINT (trans);                                 \
                (trans) = Qnil;                                         \
              }                                                         \
            else if (! NILP (trans))                                    \
              break;                                                    \
          }                                                             \
      }                                                                 \
  } while (0)
6150
7d64c6ad 6151
69a80ea3
KH
6152static Lisp_Object
6153get_translation (val, buf, buf_end, last_block, from_nchars, to_nchars)
6154 Lisp_Object val;
6155 int *buf, *buf_end;
6156 int last_block;
6157 int *from_nchars, *to_nchars;
6158{
433f7f87
KH
6159 /* VAL is TO or (([FROM-CHAR ...] . TO) ...) where TO is TO-CHAR or
6160 [TO-CHAR ...]. */
69a80ea3
KH
6161 if (CONSP (val))
6162 {
433f7f87 6163 Lisp_Object from, tail;
69a80ea3
KH
6164 int i, len;
6165
433f7f87 6166 for (tail = val; CONSP (tail); tail = XCDR (tail))
69a80ea3 6167 {
433f7f87
KH
6168 val = XCAR (tail);
6169 from = XCAR (val);
6170 len = ASIZE (from);
6171 for (i = 0; i < len; i++)
6172 {
6173 if (buf + i == buf_end)
6174 {
6175 if (! last_block)
6176 return Qt;
6177 break;
6178 }
6179 if (XINT (AREF (from, i)) != buf[i])
6180 break;
6181 }
6182 if (i == len)
6183 {
6184 val = XCDR (val);
6185 *from_nchars = len;
6186 break;
6187 }
69a80ea3 6188 }
433f7f87
KH
6189 if (! CONSP (tail))
6190 return Qnil;
69a80ea3
KH
6191 }
6192 if (VECTORP (val))
6193 *buf = XINT (AREF (val, 0)), *to_nchars = ASIZE (val);
6194 else
6195 *buf = XINT (val);
6196 return val;
6197}
6198
6199
d46c5b12 6200static int
69a80ea3 6201produce_chars (coding, translation_table, last_block)
df7492f9 6202 struct coding_system *coding;
69a80ea3
KH
6203 Lisp_Object translation_table;
6204 int last_block;
4ed46869 6205{
df7492f9
KH
6206 unsigned char *dst = coding->destination + coding->produced;
6207 unsigned char *dst_end = coding->destination + coding->dst_bytes;
119852e7
KH
6208 EMACS_INT produced;
6209 EMACS_INT produced_chars = 0;
69a80ea3 6210 int carryover = 0;
4ed46869 6211
df7492f9 6212 if (! coding->chars_at_source)
4ed46869 6213 {
119852e7 6214 /* Source characters are in coding->charbuf. */
fba4576f
AS
6215 int *buf = coding->charbuf;
6216 int *buf_end = buf + coding->charbuf_used;
4ed46869 6217
db274c7a
KH
6218 if (EQ (coding->src_object, coding->dst_object))
6219 {
6220 coding_set_source (coding);
6221 dst_end = ((unsigned char *) coding->source) + coding->consumed;
6222 }
4ed46869 6223
df7492f9 6224 while (buf < buf_end)
4ed46869 6225 {
69a80ea3 6226 int c = *buf, i;
bc4bc72a 6227
df7492f9
KH
6228 if (c >= 0)
6229 {
69a80ea3
KH
6230 int from_nchars = 1, to_nchars = 1;
6231 Lisp_Object trans = Qnil;
6232
09ee6fdd 6233 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
e6a54062 6234 if (! NILP (trans))
69a80ea3
KH
6235 {
6236 trans = get_translation (trans, buf, buf_end, last_block,
6237 &from_nchars, &to_nchars);
6238 if (EQ (trans, Qt))
6239 break;
6240 c = *buf;
6241 }
6242
6243 if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
6244 {
6245 dst = alloc_destination (coding,
6246 buf_end - buf
6247 + MAX_MULTIBYTE_LENGTH * to_nchars,
6248 dst);
db274c7a
KH
6249 if (EQ (coding->src_object, coding->dst_object))
6250 {
6251 coding_set_source (coding);
6252 dst_end = ((unsigned char *) coding->source) + coding->consumed;
6253 }
6254 else
6255 dst_end = coding->destination + coding->dst_bytes;
69a80ea3
KH
6256 }
6257
433f7f87 6258 for (i = 0; i < to_nchars; i++)
69a80ea3 6259 {
433f7f87
KH
6260 if (i > 0)
6261 c = XINT (AREF (trans, i));
69a80ea3
KH
6262 if (coding->dst_multibyte
6263 || ! CHAR_BYTE8_P (c))
db274c7a 6264 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
69a80ea3
KH
6265 else
6266 *dst++ = CHAR_TO_BYTE8 (c);
6267 }
6268 produced_chars += to_nchars;
6269 *buf++ = to_nchars;
6270 while (--from_nchars > 0)
6271 *buf++ = 0;
d46c5b12 6272 }
df7492f9 6273 else
69a80ea3
KH
6274 /* This is an annotation datum. (-C) is the length. */
6275 buf += -c;
4ed46869 6276 }
69a80ea3 6277 carryover = buf_end - buf;
4ed46869 6278 }
fa42c37f 6279 else
fa42c37f 6280 {
119852e7 6281 /* Source characters are at coding->source. */
8f924df7 6282 const unsigned char *src = coding->source;
119852e7 6283 const unsigned char *src_end = src + coding->consumed;
4ed46869 6284
db274c7a
KH
6285 if (EQ (coding->dst_object, coding->src_object))
6286 dst_end = (unsigned char *) src;
df7492f9 6287 if (coding->src_multibyte != coding->dst_multibyte)
fa42c37f 6288 {
df7492f9 6289 if (coding->src_multibyte)
fa42c37f 6290 {
71c81426 6291 int multibytep = 1;
119852e7 6292 EMACS_INT consumed_chars;
d46c5b12 6293
df7492f9
KH
6294 while (1)
6295 {
8f924df7 6296 const unsigned char *src_base = src;
df7492f9 6297 int c;
b73bfc1c 6298
df7492f9 6299 ONE_MORE_BYTE (c);
119852e7 6300 if (dst == dst_end)
df7492f9 6301 {
119852e7
KH
6302 if (EQ (coding->src_object, coding->dst_object))
6303 dst_end = (unsigned char *) src;
6304 if (dst == dst_end)
df7492f9 6305 {
119852e7
KH
6306 EMACS_INT offset = src - coding->source;
6307
6308 dst = alloc_destination (coding, src_end - src + 1,
6309 dst);
6310 dst_end = coding->destination + coding->dst_bytes;
6311 coding_set_source (coding);
6312 src = coding->source + offset;
6313 src_end = coding->source + coding->src_bytes;
db274c7a
KH
6314 if (EQ (coding->src_object, coding->dst_object))
6315 dst_end = (unsigned char *) src;
df7492f9 6316 }
df7492f9
KH
6317 }
6318 *dst++ = c;
6319 produced_chars++;
6320 }
6321 no_more_source:
6322 ;
fa42c37f
KH
6323 }
6324 else
df7492f9
KH
6325 while (src < src_end)
6326 {
71c81426 6327 int multibytep = 1;
df7492f9 6328 int c = *src++;
b73bfc1c 6329
df7492f9
KH
6330 if (dst >= dst_end - 1)
6331 {
2c78b7e1 6332 if (EQ (coding->src_object, coding->dst_object))
8f924df7 6333 dst_end = (unsigned char *) src;
2c78b7e1
KH
6334 if (dst >= dst_end - 1)
6335 {
119852e7 6336 EMACS_INT offset = src - coding->source;
db274c7a 6337 EMACS_INT more_bytes;
119852e7 6338
db274c7a
KH
6339 if (EQ (coding->src_object, coding->dst_object))
6340 more_bytes = ((src_end - src) / 2) + 2;
6341 else
6342 more_bytes = src_end - src + 2;
6343 dst = alloc_destination (coding, more_bytes, dst);
2c78b7e1
KH
6344 dst_end = coding->destination + coding->dst_bytes;
6345 coding_set_source (coding);
119852e7 6346 src = coding->source + offset;
2c78b7e1 6347 src_end = coding->source + coding->src_bytes;
db274c7a
KH
6348 if (EQ (coding->src_object, coding->dst_object))
6349 dst_end = (unsigned char *) src;
2c78b7e1 6350 }
df7492f9
KH
6351 }
6352 EMIT_ONE_BYTE (c);
6353 }
d46c5b12 6354 }
df7492f9
KH
6355 else
6356 {
6357 if (!EQ (coding->src_object, coding->dst_object))
fa42c37f 6358 {
119852e7 6359 EMACS_INT require = coding->src_bytes - coding->dst_bytes;
4ed46869 6360
df7492f9 6361 if (require > 0)
fa42c37f 6362 {
df7492f9
KH
6363 EMACS_INT offset = src - coding->source;
6364
6365 dst = alloc_destination (coding, require, dst);
6366 coding_set_source (coding);
6367 src = coding->source + offset;
6368 src_end = coding->source + coding->src_bytes;
fa42c37f
KH
6369 }
6370 }
119852e7 6371 produced_chars = coding->consumed_char;
df7492f9 6372 while (src < src_end)
14daee73 6373 *dst++ = *src++;
fa42c37f
KH
6374 }
6375 }
6376
df7492f9 6377 produced = dst - (coding->destination + coding->produced);
284201e4 6378 if (BUFFERP (coding->dst_object) && produced_chars > 0)
df7492f9
KH
6379 insert_from_gap (produced_chars, produced);
6380 coding->produced += produced;
6381 coding->produced_char += produced_chars;
69a80ea3 6382 return carryover;
fa42c37f
KH
6383}
6384
ff0dacd7
KH
6385/* Compose text in CODING->object according to the annotation data at
6386 CHARBUF. CHARBUF is an array:
6387 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
df7492f9 6388 */
4ed46869 6389
df7492f9 6390static INLINE void
69a80ea3 6391produce_composition (coding, charbuf, pos)
4ed46869 6392 struct coding_system *coding;
df7492f9 6393 int *charbuf;
69a80ea3 6394 EMACS_INT pos;
4ed46869 6395{
df7492f9 6396 int len;
69a80ea3 6397 EMACS_INT to;
df7492f9 6398 enum composition_method method;
df7492f9 6399 Lisp_Object components;
fa42c37f 6400
df7492f9 6401 len = -charbuf[0];
69a80ea3 6402 to = pos + charbuf[2];
9ffd559c
KH
6403 if (to <= pos)
6404 return;
69a80ea3 6405 method = (enum composition_method) (charbuf[3]);
d46c5b12 6406
df7492f9
KH
6407 if (method == COMPOSITION_RELATIVE)
6408 components = Qnil;
9ffd559c
KH
6409 else if (method >= COMPOSITION_WITH_RULE
6410 && method <= COMPOSITION_WITH_RULE_ALTCHARS)
d46c5b12 6411 {
df7492f9
KH
6412 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6413 int i;
b73bfc1c 6414
69a80ea3
KH
6415 len -= 4;
6416 charbuf += 4;
df7492f9 6417 for (i = 0; i < len; i++)
9ffd559c
KH
6418 {
6419 args[i] = make_number (charbuf[i]);
f75c90a9 6420 if (charbuf[i] < 0)
9ffd559c
KH
6421 return;
6422 }
df7492f9
KH
6423 components = (method == COMPOSITION_WITH_ALTCHARS
6424 ? Fstring (len, args) : Fvector (len, args));
d46c5b12 6425 }
9ffd559c
KH
6426 else
6427 return;
69a80ea3 6428 compose_text (pos, to, components, Qnil, coding->dst_object);
d46c5b12
KH
6429}
6430
d46c5b12 6431
ff0dacd7
KH
6432/* Put `charset' property on text in CODING->object according to
6433 the annotation data at CHARBUF. CHARBUF is an array:
69a80ea3 6434 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
ff0dacd7 6435 */
d46c5b12 6436
ff0dacd7 6437static INLINE void
69a80ea3 6438produce_charset (coding, charbuf, pos)
d46c5b12 6439 struct coding_system *coding;
ff0dacd7 6440 int *charbuf;
69a80ea3 6441 EMACS_INT pos;
d46c5b12 6442{
69a80ea3
KH
6443 EMACS_INT from = pos - charbuf[2];
6444 struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
b73bfc1c 6445
69a80ea3 6446 Fput_text_property (make_number (from), make_number (pos),
ff0dacd7
KH
6447 Qcharset, CHARSET_NAME (charset),
6448 coding->dst_object);
d46c5b12
KH
6449}
6450
d46c5b12 6451
df7492f9
KH
/* The number of elements first tried for the work area below.  */
#define CHARBUF_SIZE 0x4000

/* Allocate with alloca the character work area of CODING (a pointer
   to struct coding_system) and record its size in
   CODING->charbuf_size.  CHARBUF_SIZE elements are tried first, and
   the size is halved after each failed attempt while it is still
   larger than 1024.

   If no attempt succeeds, record CODING_RESULT_INSUFFICIENT_MEM and
   return CODING->result FROM THE CALLING FUNCTION.  Because of that
   return statement and of alloca, this macro can only be used
   directly in the body of a function that returns such a value.  */

#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int size = CHARBUF_SIZE;                                            \
                                                                        \
    (coding)->charbuf = NULL;                                           \
    while (size > 1024)                                                 \
      {                                                                 \
        (coding)->charbuf = (int *) alloca (sizeof (int) * size);       \
        if ((coding)->charbuf)                                          \
          break;                                                        \
        size >>= 1;                                                     \
      }                                                                 \
    if (! (coding)->charbuf)                                            \
      {                                                                 \
        record_conversion_result ((coding),                             \
                                  CODING_RESULT_INSUFFICIENT_MEM);      \
        return (coding)->result;                                        \
      }                                                                 \
    (coding)->charbuf_size = size;                                      \
  } while (0)
4ed46869 6473
d46c5b12
KH
6474
6475static void
69a80ea3 6476produce_annotation (coding, pos)
d46c5b12 6477 struct coding_system *coding;
69a80ea3 6478 EMACS_INT pos;
d46c5b12 6479{
df7492f9
KH
6480 int *charbuf = coding->charbuf;
6481 int *charbuf_end = charbuf + coding->charbuf_used;
d46c5b12 6482
ff0dacd7
KH
6483 if (NILP (coding->dst_object))
6484 return;
d46c5b12 6485
df7492f9 6486 while (charbuf < charbuf_end)
a84f1519 6487 {
df7492f9 6488 if (*charbuf >= 0)
69a80ea3 6489 pos += *charbuf++;
d46c5b12 6490 else
d46c5b12 6491 {
df7492f9 6492 int len = -*charbuf;
ff0dacd7 6493 switch (charbuf[1])
df7492f9
KH
6494 {
6495 case CODING_ANNOTATE_COMPOSITION_MASK:
69a80ea3 6496 produce_composition (coding, charbuf, pos);
df7492f9 6497 break;
ff0dacd7 6498 case CODING_ANNOTATE_CHARSET_MASK:
69a80ea3 6499 produce_charset (coding, charbuf, pos);
ff0dacd7 6500 break;
df7492f9
KH
6501 default:
6502 abort ();
6503 }
6504 charbuf += len;
d46c5b12 6505 }
a84f1519 6506 }
d46c5b12
KH
6507}
6508
df7492f9
KH
6509/* Decode the data at CODING->src_object into CODING->dst_object.
6510 CODING->src_object is a buffer, a string, or nil.
6511 CODING->dst_object is a buffer.
d46c5b12 6512
df7492f9
KH
6513 If CODING->src_object is a buffer, it must be the current buffer.
6514 In this case, if CODING->src_pos is positive, it is a position of
6515 the source text in the buffer, otherwise, the source text is in the
6516 gap area of the buffer, and CODING->src_pos specifies the offset of
6517 the text from GPT (which must be the same as PT). If this is the
6518 same buffer as CODING->dst_object, CODING->src_pos must be
6519 negative.
d46c5b12 6520
b6828792 6521 If CODING->src_object is a string, CODING->src_pos is an index to
df7492f9 6522 that string.
d46c5b12 6523
df7492f9
KH
6524 If CODING->src_object is nil, CODING->source must already point to
6525 the non-relocatable memory area. In this case, CODING->src_pos is
6526 an offset from CODING->source.
73be902c 6527
df7492f9
KH
6528 The decoded data is inserted at the current point of the buffer
6529 CODING->dst_object.
6530*/
d46c5b12 6531
df7492f9
KH
6532static int
6533decode_coding (coding)
d46c5b12 6534 struct coding_system *coding;
d46c5b12 6535{
df7492f9 6536 Lisp_Object attrs;
24a73b0a 6537 Lisp_Object undo_list;
7d64c6ad 6538 Lisp_Object translation_table;
69a80ea3
KH
6539 int carryover;
6540 int i;
d46c5b12 6541
df7492f9
KH
6542 if (BUFFERP (coding->src_object)
6543 && coding->src_pos > 0
6544 && coding->src_pos < GPT
6545 && coding->src_pos + coding->src_chars > GPT)
6546 move_gap_both (coding->src_pos, coding->src_pos_byte);
d46c5b12 6547
24a73b0a 6548 undo_list = Qt;
df7492f9 6549 if (BUFFERP (coding->dst_object))
1c3478b0 6550 {
df7492f9
KH
6551 if (current_buffer != XBUFFER (coding->dst_object))
6552 set_buffer_internal (XBUFFER (coding->dst_object));
6553 if (GPT != PT)
6554 move_gap_both (PT, PT_BYTE);
24a73b0a
KH
6555 undo_list = current_buffer->undo_list;
6556 current_buffer->undo_list = Qt;
1c3478b0
KH
6557 }
6558
df7492f9
KH
6559 coding->consumed = coding->consumed_char = 0;
6560 coding->produced = coding->produced_char = 0;
6561 coding->chars_at_source = 0;
065e3595 6562 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 6563 coding->errors = 0;
1c3478b0 6564
df7492f9
KH
6565 ALLOC_CONVERSION_WORK_AREA (coding);
6566
6567 attrs = CODING_ID_ATTRS (coding->id);
2170c8f0 6568 translation_table = get_translation_table (attrs, 0, NULL);
df7492f9 6569
69a80ea3 6570 carryover = 0;
df7492f9 6571 do
b73bfc1c 6572 {
69a80ea3
KH
6573 EMACS_INT pos = coding->dst_pos + coding->produced_char;
6574
df7492f9
KH
6575 coding_set_source (coding);
6576 coding->annotated = 0;
69a80ea3 6577 coding->charbuf_used = carryover;
df7492f9 6578 (*(coding->decoder)) (coding);
df7492f9 6579 coding_set_destination (coding);
69a80ea3 6580 carryover = produce_chars (coding, translation_table, 0);
df7492f9 6581 if (coding->annotated)
69a80ea3
KH
6582 produce_annotation (coding, pos);
6583 for (i = 0; i < carryover; i++)
6584 coding->charbuf[i]
6585 = coding->charbuf[coding->charbuf_used - carryover + i];
d46c5b12 6586 }
df7492f9 6587 while (coding->consumed < coding->src_bytes
54b367bb
KH
6588 && (coding->result == CODING_RESULT_SUCCESS
6589 || coding->result == CODING_RESULT_INVALID_SRC));
d46c5b12 6590
69a80ea3
KH
6591 if (carryover > 0)
6592 {
6593 coding_set_destination (coding);
6594 coding->charbuf_used = carryover;
6595 produce_chars (coding, translation_table, 1);
6596 }
6597
df7492f9
KH
6598 coding->carryover_bytes = 0;
6599 if (coding->consumed < coding->src_bytes)
d46c5b12 6600 {
df7492f9 6601 int nbytes = coding->src_bytes - coding->consumed;
8f924df7 6602 const unsigned char *src;
df7492f9
KH
6603
6604 coding_set_source (coding);
6605 coding_set_destination (coding);
6606 src = coding->source + coding->consumed;
6607
6608 if (coding->mode & CODING_MODE_LAST_BLOCK)
1c3478b0 6609 {
df7492f9
KH
6610 /* Flush out unprocessed data as binary chars. We are sure
6611 that the number of data is less than the size of
6612 coding->charbuf. */
065e3595 6613 coding->charbuf_used = 0;
df7492f9 6614 while (nbytes-- > 0)
1c3478b0 6615 {
df7492f9 6616 int c = *src++;
98725083 6617
1c91457d
KH
6618 if (c & 0x80)
6619 c = BYTE8_TO_CHAR (c);
6620 coding->charbuf[coding->charbuf_used++] = c;
1c3478b0 6621 }
f6cbaf43 6622 produce_chars (coding, Qnil, 1);
d46c5b12 6623 }
d46c5b12 6624 else
df7492f9
KH
6625 {
6626 /* Record unprocessed bytes in coding->carryover. We are
6627 sure that the number of data is less than the size of
6628 coding->carryover. */
6629 unsigned char *p = coding->carryover;
6630
6631 coding->carryover_bytes = nbytes;
6632 while (nbytes-- > 0)
6633 *p++ = *src++;
1c3478b0 6634 }
df7492f9 6635 coding->consumed = coding->src_bytes;
b73bfc1c 6636 }
69f76525 6637
4347441b
KH
6638 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
6639 decode_eol (coding);
24a73b0a
KH
6640 if (BUFFERP (coding->dst_object))
6641 {
6642 current_buffer->undo_list = undo_list;
6643 record_insert (coding->dst_pos, coding->produced_char);
6644 }
73be902c 6645 return coding->result;
4ed46869
KH
6646}
6647
aaaf0b1e 6648
e1c23804 6649/* Extract an annotation datum from a composition starting at POS and
ff0dacd7
KH
6650 ending before LIMIT of CODING->src_object (buffer or string), store
6651 the data in BUF, set *STOP to a starting position of the next
6652 composition (if any) or to LIMIT, and return the address of the
6653 next element of BUF.
6654
6655 If such an annotation is not found, set *STOP to a starting
6656 position of a composition after POS (if any) or to LIMIT, and
6657 return BUF. */
6658
6659static INLINE int *
6660handle_composition_annotation (pos, limit, coding, buf, stop)
6661 EMACS_INT pos, limit;
aaaf0b1e 6662 struct coding_system *coding;
ff0dacd7
KH
6663 int *buf;
6664 EMACS_INT *stop;
aaaf0b1e 6665{
ff0dacd7
KH
6666 EMACS_INT start, end;
6667 Lisp_Object prop;
aaaf0b1e 6668
ff0dacd7
KH
6669 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
6670 || end > limit)
6671 *stop = limit;
6672 else if (start > pos)
6673 *stop = start;
6674 else
aaaf0b1e 6675 {
ff0dacd7 6676 if (start == pos)
aaaf0b1e 6677 {
ff0dacd7
KH
6678 /* We found a composition. Store the corresponding
6679 annotation data in BUF. */
6680 int *head = buf;
6681 enum composition_method method = COMPOSITION_METHOD (prop);
6682 int nchars = COMPOSITION_LENGTH (prop);
6683
69a80ea3 6684 ADD_COMPOSITION_DATA (buf, nchars, method);
ff0dacd7 6685 if (method != COMPOSITION_RELATIVE)
aaaf0b1e 6686 {
ff0dacd7
KH
6687 Lisp_Object components;
6688 int len, i, i_byte;
6689
6690 components = COMPOSITION_COMPONENTS (prop);
6691 if (VECTORP (components))
aaaf0b1e 6692 {
ff0dacd7
KH
6693 len = XVECTOR (components)->size;
6694 for (i = 0; i < len; i++)
6695 *buf++ = XINT (AREF (components, i));
aaaf0b1e 6696 }
ff0dacd7 6697 else if (STRINGP (components))
aaaf0b1e 6698 {
8f924df7 6699 len = SCHARS (components);
ff0dacd7
KH
6700 i = i_byte = 0;
6701 while (i < len)
6702 {
6703 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
6704 buf++;
6705 }
6706 }
6707 else if (INTEGERP (components))
6708 {
6709 len = 1;
6710 *buf++ = XINT (components);
6711 }
6712 else if (CONSP (components))
6713 {
6714 for (len = 0; CONSP (components);
6715 len++, components = XCDR (components))
6716 *buf++ = XINT (XCAR (components));
aaaf0b1e 6717 }
aaaf0b1e 6718 else
ff0dacd7
KH
6719 abort ();
6720 *head -= len;
aaaf0b1e 6721 }
aaaf0b1e 6722 }
ff0dacd7
KH
6723
6724 if (find_composition (end, limit, &start, &end, &prop,
6725 coding->src_object)
6726 && end <= limit)
6727 *stop = start;
6728 else
6729 *stop = limit;
aaaf0b1e 6730 }
ff0dacd7
KH
6731 return buf;
6732}
6733
6734
e1c23804 6735/* Extract an annotation datum from a text property `charset' at POS of
ff0dacd7
KH
6736 CODING->src_object (buffer of string), store the data in BUF, set
6737 *STOP to the position where the value of `charset' property changes
6738 (limiting by LIMIT), and return the address of the next element of
6739 BUF.
6740
6741 If the property value is nil, set *STOP to the position where the
6742 property value is non-nil (limiting by LIMIT), and return BUF. */
6743
6744static INLINE int *
6745handle_charset_annotation (pos, limit, coding, buf, stop)
6746 EMACS_INT pos, limit;
6747 struct coding_system *coding;
6748 int *buf;
6749 EMACS_INT *stop;
6750{
6751 Lisp_Object val, next;
6752 int id;
6753
6754 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
6755 if (! NILP (val) && CHARSETP (val))
6756 id = XINT (CHARSET_SYMBOL_ID (val));
6757 else
6758 id = -1;
69a80ea3 6759 ADD_CHARSET_DATA (buf, 0, id);
ff0dacd7
KH
6760 next = Fnext_single_property_change (make_number (pos), Qcharset,
6761 coding->src_object,
6762 make_number (limit));
6763 *stop = XINT (next);
6764 return buf;
6765}
6766
6767
df7492f9 6768static void
09ee6fdd 6769consume_chars (coding, translation_table, max_lookup)
df7492f9 6770 struct coding_system *coding;
433f7f87 6771 Lisp_Object translation_table;
09ee6fdd 6772 int max_lookup;
df7492f9
KH
6773{
6774 int *buf = coding->charbuf;
ff0dacd7 6775 int *buf_end = coding->charbuf + coding->charbuf_size;
7c78e542 6776 const unsigned char *src = coding->source + coding->consumed;
4776e638 6777 const unsigned char *src_end = coding->source + coding->src_bytes;
ff0dacd7
KH
6778 EMACS_INT pos = coding->src_pos + coding->consumed_char;
6779 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
df7492f9
KH
6780 int multibytep = coding->src_multibyte;
6781 Lisp_Object eol_type;
6782 int c;
ff0dacd7 6783 EMACS_INT stop, stop_composition, stop_charset;
09ee6fdd 6784 int *lookup_buf = NULL;
433f7f87
KH
6785
6786 if (! NILP (translation_table))
09ee6fdd 6787 lookup_buf = alloca (sizeof (int) * max_lookup);
88993dfd 6788
df7492f9
KH
6789 eol_type = CODING_ID_EOL_TYPE (coding->id);
6790 if (VECTORP (eol_type))
6791 eol_type = Qunix;
88993dfd 6792
df7492f9
KH
6793 /* Note: composition handling is not yet implemented. */
6794 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
ec6d2bb8 6795
0b5670c9
KH
6796 if (NILP (coding->src_object))
6797 stop = stop_composition = stop_charset = end_pos;
ff0dacd7 6798 else
0b5670c9
KH
6799 {
6800 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6801 stop = stop_composition = pos;
6802 else
6803 stop = stop_composition = end_pos;
6804 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6805 stop = stop_charset = pos;
6806 else
6807 stop_charset = end_pos;
6808 }
ec6d2bb8 6809
24a73b0a 6810 /* Compensate for CRLF and conversion. */
ff0dacd7 6811 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
df7492f9 6812 while (buf < buf_end)
aaaf0b1e 6813 {
433f7f87
KH
6814 Lisp_Object trans;
6815
df7492f9 6816 if (pos == stop)
ec6d2bb8 6817 {
df7492f9
KH
6818 if (pos == end_pos)
6819 break;
ff0dacd7
KH
6820 if (pos == stop_composition)
6821 buf = handle_composition_annotation (pos, end_pos, coding,
6822 buf, &stop_composition);
6823 if (pos == stop_charset)
6824 buf = handle_charset_annotation (pos, end_pos, coding,
6825 buf, &stop_charset);
6826 stop = (stop_composition < stop_charset
6827 ? stop_composition : stop_charset);
df7492f9
KH
6828 }
6829
6830 if (! multibytep)
4776e638 6831 {
d3e4cb56 6832 EMACS_INT bytes;
aaaf0b1e 6833
ea29edf2
KH
6834 if (coding->encoder == encode_coding_raw_text)
6835 c = *src++, pos++;
6836 else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
db274c7a 6837 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
4776e638 6838 else
f03caae0 6839 c = BYTE8_TO_CHAR (*src), src++, pos++;
4776e638 6840 }
df7492f9 6841 else
db274c7a 6842 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
df7492f9
KH
6843 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6844 c = '\n';
6845 if (! EQ (eol_type, Qunix))
aaaf0b1e 6846 {
df7492f9 6847 if (c == '\n')
aaaf0b1e 6848 {
df7492f9
KH
6849 if (EQ (eol_type, Qdos))
6850 *buf++ = '\r';
6851 else
6852 c = '\r';
aaaf0b1e
KH
6853 }
6854 }
433f7f87 6855
e6a54062 6856 trans = Qnil;
09ee6fdd 6857 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
e6a54062 6858 if (NILP (trans))
433f7f87
KH
6859 *buf++ = c;
6860 else
6861 {
6862 int from_nchars = 1, to_nchars = 1;
6863 int *lookup_buf_end;
6864 const unsigned char *p = src;
6865 int i;
6866
6867 lookup_buf[0] = c;
6868 for (i = 1; i < max_lookup && p < src_end; i++)
6869 lookup_buf[i] = STRING_CHAR_ADVANCE (p);
6870 lookup_buf_end = lookup_buf + i;
6871 trans = get_translation (trans, lookup_buf, lookup_buf_end, 1,
6872 &from_nchars, &to_nchars);
6873 if (EQ (trans, Qt)
6874 || buf + to_nchars > buf_end)
6875 break;
6876 *buf++ = *lookup_buf;
6877 for (i = 1; i < to_nchars; i++)
6878 *buf++ = XINT (AREF (trans, i));
6879 for (i = 1; i < from_nchars; i++, pos++)
6880 src += MULTIBYTE_LENGTH_NO_CHECK (src);
6881 }
aaaf0b1e 6882 }
ec6d2bb8 6883
df7492f9
KH
6884 coding->consumed = src - coding->source;
6885 coding->consumed_char = pos - coding->src_pos;
6886 coding->charbuf_used = buf - coding->charbuf;
6887 coding->chars_at_source = 0;
aaaf0b1e
KH
6888}
6889
4ed46869 6890
df7492f9
KH
6891/* Encode the text at CODING->src_object into CODING->dst_object.
6892 CODING->src_object is a buffer or a string.
6893 CODING->dst_object is a buffer or nil.
6894
6895 If CODING->src_object is a buffer, it must be the current buffer.
6896 In this case, if CODING->src_pos is positive, it is a position of
6897 the source text in the buffer, otherwise. the source text is in the
6898 gap area of the buffer, and coding->src_pos specifies the offset of
6899 the text from GPT (which must be the same as PT). If this is the
6900 same buffer as CODING->dst_object, CODING->src_pos must be
6901 negative and CODING should not have `pre-write-conversion'.
6902
6903 If CODING->src_object is a string, CODING should not have
6904 `pre-write-conversion'.
6905
6906 If CODING->dst_object is a buffer, the encoded data is inserted at
6907 the current point of that buffer.
6908
6909 If CODING->dst_object is nil, the encoded data is placed at the
6910 memory area specified by CODING->destination. */
6911
6912static int
6913encode_coding (coding)
4ed46869 6914 struct coding_system *coding;
4ed46869 6915{
df7492f9 6916 Lisp_Object attrs;
7d64c6ad 6917 Lisp_Object translation_table;
09ee6fdd 6918 int max_lookup;
9861e777 6919
df7492f9 6920 attrs = CODING_ID_ATTRS (coding->id);
ea29edf2
KH
6921 if (coding->encoder == encode_coding_raw_text)
6922 translation_table = Qnil, max_lookup = 0;
6923 else
6924 translation_table = get_translation_table (attrs, 1, &max_lookup);
4ed46869 6925
df7492f9 6926 if (BUFFERP (coding->dst_object))
8844fa83 6927 {
df7492f9
KH
6928 set_buffer_internal (XBUFFER (coding->dst_object));
6929 coding->dst_multibyte
6930 = ! NILP (current_buffer->enable_multibyte_characters);
8844fa83 6931 }
4ed46869 6932
b73bfc1c 6933 coding->consumed = coding->consumed_char = 0;
df7492f9 6934 coding->produced = coding->produced_char = 0;
065e3595 6935 record_conversion_result (coding, CODING_RESULT_SUCCESS);
b73bfc1c 6936 coding->errors = 0;
b73bfc1c 6937
df7492f9 6938 ALLOC_CONVERSION_WORK_AREA (coding);
4ed46869 6939
df7492f9
KH
6940 do {
6941 coding_set_source (coding);
09ee6fdd 6942 consume_chars (coding, translation_table, max_lookup);
df7492f9
KH
6943 coding_set_destination (coding);
6944 (*(coding->encoder)) (coding);
6945 } while (coding->consumed_char < coding->src_chars);
6946
284201e4 6947 if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
df7492f9
KH
6948 insert_from_gap (coding->produced_char, coding->produced);
6949
6950 return (coding->result);
ec6d2bb8
KH
6951}
6952
fb88bf2d 6953
24a73b0a
KH
6954/* Name (or base name) of work buffer for code conversion. */
6955static Lisp_Object Vcode_conversion_workbuf_name;
d46c5b12 6956
24a73b0a
KH
6957/* A working buffer used by the top level conversion. Once it is
6958 created, it is never destroyed. It has the name
6959 Vcode_conversion_workbuf_name. The other working buffers are
6960 destroyed after the use is finished, and their names are modified
6961 versions of Vcode_conversion_workbuf_name. */
6962static Lisp_Object Vcode_conversion_reused_workbuf;
b73bfc1c 6963
24a73b0a
KH
6964/* 1 iff Vcode_conversion_reused_workbuf is already in use. */
6965static int reused_workbuf_in_use;
4ed46869 6966
24a73b0a
KH
6967
6968/* Return a working buffer of code convesion. MULTIBYTE specifies the
6969 multibyteness of returning buffer. */
b73bfc1c 6970
f6cbaf43 6971static Lisp_Object
24a73b0a 6972make_conversion_work_buffer (multibyte)
f6cbaf43 6973 int multibyte;
df7492f9 6974{
24a73b0a
KH
6975 Lisp_Object name, workbuf;
6976 struct buffer *current;
4ed46869 6977
24a73b0a 6978 if (reused_workbuf_in_use++)
065e3595
KH
6979 {
6980 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6981 workbuf = Fget_buffer_create (name);
6982 }
df7492f9 6983 else
065e3595 6984 {
065e3595 6985 if (NILP (Vcode_conversion_reused_workbuf))
a993c7a1
KH
6986 Vcode_conversion_reused_workbuf
6987 = Fget_buffer_create (Vcode_conversion_workbuf_name);
6988 workbuf = Vcode_conversion_reused_workbuf;
065e3595 6989 }
24a73b0a
KH
6990 current = current_buffer;
6991 set_buffer_internal (XBUFFER (workbuf));
3ed051d4 6992 Ferase_buffer ();
df7492f9 6993 current_buffer->undo_list = Qt;
24a73b0a 6994 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
df7492f9 6995 set_buffer_internal (current);
24a73b0a 6996 return workbuf;
df7492f9 6997}
d46c5b12 6998
24a73b0a 6999
4776e638 7000static Lisp_Object
24a73b0a
KH
7001code_conversion_restore (arg)
7002 Lisp_Object arg;
4776e638 7003{
24a73b0a 7004 Lisp_Object current, workbuf;
948bdcf3 7005 struct gcpro gcpro1;
24a73b0a 7006
948bdcf3 7007 GCPRO1 (arg);
24a73b0a
KH
7008 current = XCAR (arg);
7009 workbuf = XCDR (arg);
7010 if (! NILP (workbuf))
7011 {
7012 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7013 reused_workbuf_in_use = 0;
7014 else if (! NILP (Fbuffer_live_p (workbuf)))
7015 Fkill_buffer (workbuf);
7016 }
7017 set_buffer_internal (XBUFFER (current));
948bdcf3 7018 UNGCPRO;
4776e638
KH
7019 return Qnil;
7020}
b73bfc1c 7021
24a73b0a
KH
7022Lisp_Object
7023code_conversion_save (with_work_buf, multibyte)
4776e638 7024 int with_work_buf, multibyte;
df7492f9 7025{
24a73b0a 7026 Lisp_Object workbuf = Qnil;
b73bfc1c 7027
4776e638 7028 if (with_work_buf)
24a73b0a
KH
7029 workbuf = make_conversion_work_buffer (multibyte);
7030 record_unwind_protect (code_conversion_restore,
7031 Fcons (Fcurrent_buffer (), workbuf));
4776e638 7032 return workbuf;
df7492f9 7033}
d46c5b12 7034
df7492f9
KH
7035int
7036decode_coding_gap (coding, chars, bytes)
7037 struct coding_system *coding;
7038 EMACS_INT chars, bytes;
7039{
7040 int count = specpdl_ptr - specpdl;
5e5c78be 7041 Lisp_Object attrs;
fb88bf2d 7042
24a73b0a 7043 code_conversion_save (0, 0);
ec6d2bb8 7044
24a73b0a 7045 coding->src_object = Fcurrent_buffer ();
df7492f9
KH
7046 coding->src_chars = chars;
7047 coding->src_bytes = bytes;
7048 coding->src_pos = -chars;
7049 coding->src_pos_byte = -bytes;
7050 coding->src_multibyte = chars < bytes;
24a73b0a 7051 coding->dst_object = coding->src_object;
df7492f9
KH
7052 coding->dst_pos = PT;
7053 coding->dst_pos_byte = PT_BYTE;
71c81426 7054 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
4ed46869 7055
df7492f9
KH
7056 if (CODING_REQUIRE_DETECTION (coding))
7057 detect_coding (coding);
8f924df7 7058
9286b333 7059 coding->mode |= CODING_MODE_LAST_BLOCK;
287c57d7 7060 current_buffer->text->inhibit_shrinking = 1;
df7492f9 7061 decode_coding (coding);
287c57d7 7062 current_buffer->text->inhibit_shrinking = 0;
d46c5b12 7063
5e5c78be
KH
7064 attrs = CODING_ID_ATTRS (coding->id);
7065 if (! NILP (CODING_ATTR_POST_READ (attrs)))
b73bfc1c 7066 {
5e5c78be
KH
7067 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7068 Lisp_Object val;
7069
7070 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
5e5c78be
KH
7071 val = call1 (CODING_ATTR_POST_READ (attrs),
7072 make_number (coding->produced_char));
5e5c78be
KH
7073 CHECK_NATNUM (val);
7074 coding->produced_char += Z - prev_Z;
7075 coding->produced += Z_BYTE - prev_Z_BYTE;
b73bfc1c 7076 }
4ed46869 7077
df7492f9 7078 unbind_to (count, Qnil);
b73bfc1c
KH
7079 return coding->result;
7080}
52d41803 7081
4ed46869 7082int
df7492f9 7083encode_coding_gap (coding, chars, bytes)
4ed46869 7084 struct coding_system *coding;
df7492f9 7085 EMACS_INT chars, bytes;
4ed46869 7086{
df7492f9 7087 int count = specpdl_ptr - specpdl;
4ed46869 7088
24a73b0a 7089 code_conversion_save (0, 0);
4ed46869 7090
24a73b0a 7091 coding->src_object = Fcurrent_buffer ();
df7492f9
KH
7092 coding->src_chars = chars;
7093 coding->src_bytes = bytes;
7094 coding->src_pos = -chars;
7095 coding->src_pos_byte = -bytes;
7096 coding->src_multibyte = chars < bytes;
7097 coding->dst_object = coding->src_object;
7098 coding->dst_pos = PT;
7099 coding->dst_pos_byte = PT_BYTE;
4ed46869 7100
df7492f9 7101 encode_coding (coding);
b73bfc1c 7102
df7492f9
KH
7103 unbind_to (count, Qnil);
7104 return coding->result;
7105}
4ed46869 7106
d46c5b12 7107
df7492f9
KH
7108/* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7109 SRC_OBJECT into DST_OBJECT by coding context CODING.
b73bfc1c 7110
df7492f9 7111 SRC_OBJECT is a buffer, a string, or Qnil.
b73bfc1c 7112
df7492f9
KH
7113 If it is a buffer, the text is at point of the buffer. FROM and TO
7114 are positions in the buffer.
b73bfc1c 7115
df7492f9
KH
7116 If it is a string, the text is at the beginning of the string.
7117 FROM and TO are indices to the string.
4ed46869 7118
df7492f9
KH
7119 If it is nil, the text is at coding->source. FROM and TO are
7120 indices to coding->source.
bb10be8b 7121
df7492f9 7122 DST_OBJECT is a buffer, Qt, or Qnil.
4ed46869 7123
df7492f9
KH
7124 If it is a buffer, the decoded text is inserted at point of the
7125 buffer. If the buffer is the same as SRC_OBJECT, the source text
7126 is deleted.
4ed46869 7127
df7492f9
KH
7128 If it is Qt, a string is made from the decoded text, and
7129 set in CODING->dst_object.
d46c5b12 7130
df7492f9 7131 If it is Qnil, the decoded text is stored at CODING->destination.
2cb26057 7132 The caller must allocate CODING->dst_bytes bytes at
df7492f9
KH
7133 CODING->destination by xmalloc. If the decoded text is longer than
7134 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7135 */
d46c5b12 7136
df7492f9
KH
7137void
7138decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7139 dst_object)
d46c5b12 7140 struct coding_system *coding;
df7492f9
KH
7141 Lisp_Object src_object;
7142 EMACS_INT from, from_byte, to, to_byte;
7143 Lisp_Object dst_object;
d46c5b12 7144{
df7492f9
KH
7145 int count = specpdl_ptr - specpdl;
7146 unsigned char *destination;
7147 EMACS_INT dst_bytes;
7148 EMACS_INT chars = to - from;
7149 EMACS_INT bytes = to_byte - from_byte;
7150 Lisp_Object attrs;
4776e638 7151 int saved_pt = -1, saved_pt_byte;
64cedb0c 7152 int need_marker_adjustment = 0;
b3bfad50 7153 Lisp_Object old_deactivate_mark;
d46c5b12 7154
b3bfad50 7155 old_deactivate_mark = Vdeactivate_mark;
93dec019 7156
df7492f9 7157 if (NILP (dst_object))
d46c5b12 7158 {
df7492f9
KH
7159 destination = coding->destination;
7160 dst_bytes = coding->dst_bytes;
d46c5b12 7161 }
93dec019 7162
df7492f9
KH
7163 coding->src_object = src_object;
7164 coding->src_chars = chars;
7165 coding->src_bytes = bytes;
7166 coding->src_multibyte = chars < bytes;
70ad9fc4 7167
df7492f9 7168 if (STRINGP (src_object))
d46c5b12 7169 {
df7492f9
KH
7170 coding->src_pos = from;
7171 coding->src_pos_byte = from_byte;
d46c5b12 7172 }
df7492f9 7173 else if (BUFFERP (src_object))
88993dfd 7174 {
df7492f9
KH
7175 set_buffer_internal (XBUFFER (src_object));
7176 if (from != GPT)
7177 move_gap_both (from, from_byte);
7178 if (EQ (src_object, dst_object))
fb88bf2d 7179 {
64cedb0c
KH
7180 struct Lisp_Marker *tail;
7181
7182 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7183 {
7184 tail->need_adjustment
7185 = tail->charpos == (tail->insertion_type ? from : to);
7186 need_marker_adjustment |= tail->need_adjustment;
7187 }
4776e638 7188 saved_pt = PT, saved_pt_byte = PT_BYTE;
df7492f9 7189 TEMP_SET_PT_BOTH (from, from_byte);
f4a3cc44 7190 current_buffer->text->inhibit_shrinking = 1;
df7492f9
KH
7191 del_range_both (from, from_byte, to, to_byte, 1);
7192 coding->src_pos = -chars;
7193 coding->src_pos_byte = -bytes;
fb88bf2d 7194 }
df7492f9 7195 else
fb88bf2d 7196 {
df7492f9
KH
7197 coding->src_pos = from;
7198 coding->src_pos_byte = from_byte;
fb88bf2d 7199 }
88993dfd
KH
7200 }
7201
df7492f9
KH
7202 if (CODING_REQUIRE_DETECTION (coding))
7203 detect_coding (coding);
7204 attrs = CODING_ID_ATTRS (coding->id);
d46c5b12 7205
2cb26057
KH
7206 if (EQ (dst_object, Qt)
7207 || (! NILP (CODING_ATTR_POST_READ (attrs))
7208 && NILP (dst_object)))
b73bfc1c 7209 {
a1567c45
SM
7210 coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7211 coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
df7492f9
KH
7212 coding->dst_pos = BEG;
7213 coding->dst_pos_byte = BEG_BYTE;
b73bfc1c 7214 }
df7492f9 7215 else if (BUFFERP (dst_object))
d46c5b12 7216 {
24a73b0a 7217 code_conversion_save (0, 0);
df7492f9
KH
7218 coding->dst_object = dst_object;
7219 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7220 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7221 coding->dst_multibyte
7222 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
d46c5b12
KH
7223 }
7224 else
7225 {
24a73b0a 7226 code_conversion_save (0, 0);
df7492f9 7227 coding->dst_object = Qnil;
0154725e
SM
7228 /* Most callers presume this will return a multibyte result, and they
7229 won't use `binary' or `raw-text' anyway, so let's not worry about
7230 CODING_FOR_UNIBYTE. */
bb555731 7231 coding->dst_multibyte = 1;
d46c5b12
KH
7232 }
7233
df7492f9 7234 decode_coding (coding);
fa46990e 7235
df7492f9
KH
7236 if (BUFFERP (coding->dst_object))
7237 set_buffer_internal (XBUFFER (coding->dst_object));
d46c5b12 7238
df7492f9 7239 if (! NILP (CODING_ATTR_POST_READ (attrs)))
d46c5b12 7240 {
b3bfad50 7241 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
df7492f9 7242 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
2b4f9037 7243 Lisp_Object val;
d46c5b12 7244
c0cc7f7f 7245 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
b3bfad50
KH
7246 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7247 old_deactivate_mark);
d4850d67
KH
7248 val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7249 make_number (coding->produced_char));
df7492f9
KH
7250 UNGCPRO;
7251 CHECK_NATNUM (val);
7252 coding->produced_char += Z - prev_Z;
7253 coding->produced += Z_BYTE - prev_Z_BYTE;
d46c5b12 7254 }
de79a6a5 7255
df7492f9 7256 if (EQ (dst_object, Qt))
ec6d2bb8 7257 {
df7492f9
KH
7258 coding->dst_object = Fbuffer_string ();
7259 }
7260 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7261 {
7262 set_buffer_internal (XBUFFER (coding->dst_object));
7263 if (dst_bytes < coding->produced)
7264 {
b3bfad50 7265 destination = xrealloc (destination, coding->produced);
df7492f9
KH
7266 if (! destination)
7267 {
065e3595
KH
7268 record_conversion_result (coding,
7269 CODING_RESULT_INSUFFICIENT_DST);
df7492f9
KH
7270 unbind_to (count, Qnil);
7271 return;
7272 }
7273 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7274 move_gap_both (BEGV, BEGV_BYTE);
7275 bcopy (BEGV_ADDR, destination, coding->produced);
7276 coding->destination = destination;
d46c5b12 7277 }
ec6d2bb8 7278 }
b73bfc1c 7279
4776e638
KH
7280 if (saved_pt >= 0)
7281 {
7282 /* This is the case of:
7283 (BUFFERP (src_object) && EQ (src_object, dst_object))
7284 As we have moved PT while replacing the original buffer
7285 contents, we must recover it now. */
7286 set_buffer_internal (XBUFFER (src_object));
f4a3cc44 7287 current_buffer->text->inhibit_shrinking = 0;
4776e638
KH
7288 if (saved_pt < from)
7289 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7290 else if (saved_pt < from + chars)
7291 TEMP_SET_PT_BOTH (from, from_byte);
7292 else if (! NILP (current_buffer->enable_multibyte_characters))
7293 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7294 saved_pt_byte + (coding->produced - bytes));
7295 else
7296 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7297 saved_pt_byte + (coding->produced - bytes));
64cedb0c
KH
7298
7299 if (need_marker_adjustment)
7300 {
7301 struct Lisp_Marker *tail;
7302
7303 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7304 if (tail->need_adjustment)
7305 {
7306 tail->need_adjustment = 0;
7307 if (tail->insertion_type)
7308 {
7309 tail->bytepos = from_byte;
7310 tail->charpos = from;
7311 }
7312 else
7313 {
7314 tail->bytepos = from_byte + coding->produced;
7315 tail->charpos
7316 = (NILP (current_buffer->enable_multibyte_characters)
7317 ? tail->bytepos : from + coding->produced_char);
7318 }
7319 }
7320 }
d46c5b12 7321 }
4776e638 7322
b3bfad50 7323 Vdeactivate_mark = old_deactivate_mark;
065e3595 7324 unbind_to (count, coding->dst_object);
d46c5b12
KH
7325}
7326
d46c5b12 7327
df7492f9
KH
7328void
7329encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7330 dst_object)
d46c5b12 7331 struct coding_system *coding;
df7492f9
KH
7332 Lisp_Object src_object;
7333 EMACS_INT from, from_byte, to, to_byte;
7334 Lisp_Object dst_object;
d46c5b12 7335{
b73bfc1c 7336 int count = specpdl_ptr - specpdl;
df7492f9
KH
7337 EMACS_INT chars = to - from;
7338 EMACS_INT bytes = to_byte - from_byte;
7339 Lisp_Object attrs;
4776e638 7340 int saved_pt = -1, saved_pt_byte;
64cedb0c 7341 int need_marker_adjustment = 0;
c02d943b 7342 int kill_src_buffer = 0;
b3bfad50 7343 Lisp_Object old_deactivate_mark;
df7492f9 7344
b3bfad50 7345 old_deactivate_mark = Vdeactivate_mark;
df7492f9
KH
7346
7347 coding->src_object = src_object;
7348 coding->src_chars = chars;
7349 coding->src_bytes = bytes;
7350 coding->src_multibyte = chars < bytes;
7351
7352 attrs = CODING_ID_ATTRS (coding->id);
7353
64cedb0c
KH
7354 if (EQ (src_object, dst_object))
7355 {
7356 struct Lisp_Marker *tail;
7357
7358 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7359 {
7360 tail->need_adjustment
7361 = tail->charpos == (tail->insertion_type ? from : to);
7362 need_marker_adjustment |= tail->need_adjustment;
7363 }
7364 }
7365
df7492f9 7366 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6bac5b12 7367 {
24a73b0a 7368 coding->src_object = code_conversion_save (1, coding->src_multibyte);
df7492f9
KH
7369 set_buffer_internal (XBUFFER (coding->src_object));
7370 if (STRINGP (src_object))
7371 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7372 else if (BUFFERP (src_object))
7373 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7374 else
7375 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
d46c5b12 7376
df7492f9
KH
7377 if (EQ (src_object, dst_object))
7378 {
7379 set_buffer_internal (XBUFFER (src_object));
4776e638 7380 saved_pt = PT, saved_pt_byte = PT_BYTE;
df7492f9
KH
7381 del_range_both (from, from_byte, to, to_byte, 1);
7382 set_buffer_internal (XBUFFER (coding->src_object));
7383 }
7384
d4850d67
KH
7385 {
7386 Lisp_Object args[3];
b3bfad50 7387 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
d4850d67 7388
b3bfad50
KH
7389 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7390 old_deactivate_mark);
d4850d67
KH
7391 args[0] = CODING_ATTR_PRE_WRITE (attrs);
7392 args[1] = make_number (BEG);
7393 args[2] = make_number (Z);
7394 safe_call (3, args);
b3bfad50 7395 UNGCPRO;
d4850d67 7396 }
c02d943b
KH
7397 if (XBUFFER (coding->src_object) != current_buffer)
7398 kill_src_buffer = 1;
ac87bbef 7399 coding->src_object = Fcurrent_buffer ();
df7492f9
KH
7400 if (BEG != GPT)
7401 move_gap_both (BEG, BEG_BYTE);
7402 coding->src_chars = Z - BEG;
7403 coding->src_bytes = Z_BYTE - BEG_BYTE;
7404 coding->src_pos = BEG;
7405 coding->src_pos_byte = BEG_BYTE;
7406 coding->src_multibyte = Z < Z_BYTE;
7407 }
7408 else if (STRINGP (src_object))
d46c5b12 7409 {
24a73b0a 7410 code_conversion_save (0, 0);
df7492f9
KH
7411 coding->src_pos = from;
7412 coding->src_pos_byte = from_byte;
b73bfc1c 7413 }
df7492f9 7414 else if (BUFFERP (src_object))
b73bfc1c 7415 {
24a73b0a 7416 code_conversion_save (0, 0);
df7492f9 7417 set_buffer_internal (XBUFFER (src_object));
df7492f9 7418 if (EQ (src_object, dst_object))
d46c5b12 7419 {
4776e638 7420 saved_pt = PT, saved_pt_byte = PT_BYTE;
ff0dacd7
KH
7421 coding->src_object = del_range_1 (from, to, 1, 1);
7422 coding->src_pos = 0;
7423 coding->src_pos_byte = 0;
d46c5b12 7424 }
df7492f9 7425 else
d46c5b12 7426 {
ff0dacd7
KH
7427 if (from < GPT && to >= GPT)
7428 move_gap_both (from, from_byte);
df7492f9
KH
7429 coding->src_pos = from;
7430 coding->src_pos_byte = from_byte;
d46c5b12 7431 }
d46c5b12 7432 }
4776e638 7433 else
24a73b0a 7434 code_conversion_save (0, 0);
d46c5b12 7435
df7492f9 7436 if (BUFFERP (dst_object))
88993dfd 7437 {
df7492f9 7438 coding->dst_object = dst_object;
28f67a95
KH
7439 if (EQ (src_object, dst_object))
7440 {
7441 coding->dst_pos = from;
7442 coding->dst_pos_byte = from_byte;
7443 }
7444 else
7445 {
319a3947
KH
7446 struct buffer *current = current_buffer;
7447
7448 set_buffer_temp (XBUFFER (dst_object));
7449 coding->dst_pos = PT;
7450 coding->dst_pos_byte = PT_BYTE;
7451 move_gap_both (coding->dst_pos, coding->dst_pos_byte);
7452 set_buffer_temp (current);
28f67a95 7453 }
df7492f9
KH
7454 coding->dst_multibyte
7455 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
88993dfd 7456 }
df7492f9 7457 else if (EQ (dst_object, Qt))
d46c5b12 7458 {
df7492f9 7459 coding->dst_object = Qnil;
df7492f9 7460 coding->dst_bytes = coding->src_chars;
ac87bbef
KH
7461 if (coding->dst_bytes == 0)
7462 coding->dst_bytes = 1;
7463 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
df7492f9 7464 coding->dst_multibyte = 0;
d46c5b12
KH
7465 }
7466 else
7467 {
df7492f9
KH
7468 coding->dst_object = Qnil;
7469 coding->dst_multibyte = 0;
d46c5b12
KH
7470 }
7471
df7492f9 7472 encode_coding (coding);
d46c5b12 7473
df7492f9 7474 if (EQ (dst_object, Qt))
d46c5b12 7475 {
df7492f9
KH
7476 if (BUFFERP (coding->dst_object))
7477 coding->dst_object = Fbuffer_string ();
7478 else
d46c5b12 7479 {
df7492f9
KH
7480 coding->dst_object
7481 = make_unibyte_string ((char *) coding->destination,
7482 coding->produced);
7483 xfree (coding->destination);
d46c5b12 7484 }
4ed46869 7485 }
d46c5b12 7486
4776e638
KH
7487 if (saved_pt >= 0)
7488 {
7489 /* This is the case of:
7490 (BUFFERP (src_object) && EQ (src_object, dst_object))
7491 As we have moved PT while replacing the original buffer
7492 contents, we must recover it now. */
7493 set_buffer_internal (XBUFFER (src_object));
7494 if (saved_pt < from)
7495 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7496 else if (saved_pt < from + chars)
7497 TEMP_SET_PT_BOTH (from, from_byte);
7498 else if (! NILP (current_buffer->enable_multibyte_characters))
7499 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7500 saved_pt_byte + (coding->produced - bytes));
d46c5b12 7501 else
4776e638
KH
7502 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7503 saved_pt_byte + (coding->produced - bytes));
64cedb0c
KH
7504
7505 if (need_marker_adjustment)
7506 {
7507 struct Lisp_Marker *tail;
7508
7509 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7510 if (tail->need_adjustment)
7511 {
7512 tail->need_adjustment = 0;
7513 if (tail->insertion_type)
7514 {
7515 tail->bytepos = from_byte;
7516 tail->charpos = from;
7517 }
7518 else
7519 {
7520 tail->bytepos = from_byte + coding->produced;
7521 tail->charpos
7522 = (NILP (current_buffer->enable_multibyte_characters)
7523 ? tail->bytepos : from + coding->produced_char);
7524 }
7525 }
7526 }
4776e638
KH
7527 }
7528
c02d943b
KH
7529 if (kill_src_buffer)
7530 Fkill_buffer (coding->src_object);
b3bfad50
KH
7531
7532 Vdeactivate_mark = old_deactivate_mark;
df7492f9 7533 unbind_to (count, Qnil);
b73bfc1c
KH
7534}
7535
df7492f9 7536
b73bfc1c 7537Lisp_Object
df7492f9 7538preferred_coding_system ()
b73bfc1c 7539{
df7492f9 7540 int id = coding_categories[coding_priorities[0]].id;
2391eaa4 7541
df7492f9 7542 return CODING_ID_NAME (id);
4ed46869
KH
7543}
7544
7545\f
7546#ifdef emacs
1397dc18 7547/*** 11. Emacs Lisp library functions ***/
4ed46869 7548
4ed46869 7549DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
48b0f3ae 7550 doc: /* Return t if OBJECT is nil or a coding-system.
df7492f9 7551See the documentation of `define-coding-system' for information
48b0f3ae 7552about coding-system objects. */)
d4a1d553
JB
7553 (object)
7554 Lisp_Object object;
4ed46869 7555{
d4a1d553
JB
7556 if (NILP (object)
7557 || CODING_SYSTEM_ID (object) >= 0)
44e8490d 7558 return Qt;
d4a1d553
JB
7559 if (! SYMBOLP (object)
7560 || NILP (Fget (object, Qcoding_system_define_form)))
44e8490d
KH
7561 return Qnil;
7562 return Qt;
4ed46869
KH
7563}
7564
9d991de8
RS
7565DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
7566 Sread_non_nil_coding_system, 1, 1, 0,
48b0f3ae
PJ
7567 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
7568 (prompt)
4ed46869
KH
7569 Lisp_Object prompt;
7570{
e0e989f6 7571 Lisp_Object val;
9d991de8
RS
7572 do
7573 {
4608c386
KH
7574 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7575 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
9d991de8 7576 }
8f924df7 7577 while (SCHARS (val) == 0);
e0e989f6 7578 return (Fintern (val, Qnil));
4ed46869
KH
7579}
7580
9b787f3e 7581DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
48b0f3ae 7582 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
c7183fb8
GM
7583If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
7584Ignores case when completing coding systems (all Emacs coding systems
7585are lower-case). */)
48b0f3ae 7586 (prompt, default_coding_system)
9b787f3e 7587 Lisp_Object prompt, default_coding_system;
4ed46869 7588{
f44d27ce 7589 Lisp_Object val;
c7183fb8
GM
7590 int count = SPECPDL_INDEX ();
7591
9b787f3e 7592 if (SYMBOLP (default_coding_system))
57d25e6f 7593 default_coding_system = SYMBOL_NAME (default_coding_system);
c7183fb8 7594 specbind (Qcompletion_ignore_case, Qt);
4608c386 7595 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
9b787f3e
RS
7596 Qt, Qnil, Qcoding_system_history,
7597 default_coding_system, Qnil);
c7183fb8 7598 unbind_to (count, Qnil);
8f924df7 7599 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
7600}
7601
7602DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
7603 1, 1, 0,
48b0f3ae 7604 doc: /* Check validity of CODING-SYSTEM.
9ffd559c
KH
7605If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
7606It is valid if it is nil or a symbol defined as a coding system by the
7607function `define-coding-system'. */)
df7492f9 7608 (coding_system)
4ed46869
KH
7609 Lisp_Object coding_system;
7610{
44e8490d
KH
7611 Lisp_Object define_form;
7612
7613 define_form = Fget (coding_system, Qcoding_system_define_form);
7614 if (! NILP (define_form))
7615 {
7616 Fput (coding_system, Qcoding_system_define_form, Qnil);
7617 safe_eval (define_form);
7618 }
4ed46869
KH
7619 if (!NILP (Fcoding_system_p (coding_system)))
7620 return coding_system;
fcad4ec4 7621 xsignal1 (Qcoding_system_error, coding_system);
4ed46869 7622}
df7492f9 7623
3a73fa5d 7624\f
89528eb3
KH
7625/* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
7626 HIGHEST is nonzero, return the coding system of the highest
7627 priority among the detected coding systems. Otherwize return a
7628 list of detected coding systems sorted by their priorities. If
7629 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
7630 multibyte form but contains only ASCII and eight-bit chars.
7631 Otherwise, the bytes are raw bytes.
7632
7633 CODING-SYSTEM controls the detection as below:
7634
7635 If it is nil, detect both text-format and eol-format. If the
7636 text-format part of CODING-SYSTEM is already specified
7637 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
7638 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
7639 detect only text-format. */
7640
d46c5b12 7641Lisp_Object
24a73b0a
KH
7642detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
7643 coding_system)
8f924df7 7644 const unsigned char *src;
13818c30
SM
7645 EMACS_INT src_chars, src_bytes;
7646 int highest;
0a28aafb 7647 int multibytep;
df7492f9 7648 Lisp_Object coding_system;
4ed46869 7649{
8f924df7 7650 const unsigned char *src_end = src + src_bytes;
df7492f9
KH
7651 Lisp_Object attrs, eol_type;
7652 Lisp_Object val;
7653 struct coding_system coding;
89528eb3 7654 int id;
ff0dacd7 7655 struct coding_detection_info detect_info;
24a73b0a 7656 enum coding_category base_category;
2f3cbb32 7657 int null_byte_found = 0, eight_bit_found = 0;
b73bfc1c 7658
df7492f9
KH
7659 if (NILP (coding_system))
7660 coding_system = Qundecided;
7661 setup_coding_system (coding_system, &coding);
7662 attrs = CODING_ID_ATTRS (coding.id);
7663 eol_type = CODING_ID_EOL_TYPE (coding.id);
89528eb3 7664 coding_system = CODING_ATTR_BASE_NAME (attrs);
4ed46869 7665
df7492f9 7666 coding.source = src;
24a73b0a 7667 coding.src_chars = src_chars;
df7492f9
KH
7668 coding.src_bytes = src_bytes;
7669 coding.src_multibyte = multibytep;
7670 coding.consumed = 0;
89528eb3 7671 coding.mode |= CODING_MODE_LAST_BLOCK;
c0e16b14 7672 coding.head_ascii = 0;
d46c5b12 7673
ff0dacd7 7674 detect_info.checked = detect_info.found = detect_info.rejected = 0;
d46c5b12 7675
89528eb3 7676 /* At first, detect text-format if necessary. */
24a73b0a
KH
7677 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
7678 if (base_category == coding_category_undecided)
4ed46869 7679 {
ff0dacd7
KH
7680 enum coding_category category;
7681 struct coding_system *this;
7682 int c, i;
88993dfd 7683
24a73b0a 7684 /* Skip all ASCII bytes except for a few ISO2022 controls. */
2f3cbb32 7685 for (; src < src_end; src++)
4ed46869 7686 {
df7492f9 7687 c = *src;
6cb21a4f 7688 if (c & 0x80)
6cb21a4f 7689 {
2f3cbb32 7690 eight_bit_found = 1;
2f3cbb32
KH
7691 if (null_byte_found)
7692 break;
7693 }
c0e16b14 7694 else if (c < 0x20)
2f3cbb32
KH
7695 {
7696 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
7697 && ! inhibit_iso_escape_detection
7698 && ! detect_info.checked)
6cb21a4f 7699 {
2f3cbb32
KH
7700 if (detect_coding_iso_2022 (&coding, &detect_info))
7701 {
7702 /* We have scanned the whole data. */
7703 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
c0e16b14
KH
7704 {
7705 /* We didn't find an 8-bit code. We may
7706 have found a null-byte, but it's very
7707 rare that a binary file confirm to
7708 ISO-2022. */
7709 src = src_end;
7710 coding.head_ascii = src - coding.source;
7711 }
7712 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
2f3cbb32
KH
7713 break;
7714 }
7715 }
7716 else if (! c)
7717 {
7718 null_byte_found = 1;
7719 if (eight_bit_found)
7720 break;
6cb21a4f 7721 }
c006c0c8
KH
7722 if (! eight_bit_found)
7723 coding.head_ascii++;
6cb21a4f 7724 }
c006c0c8 7725 else if (! eight_bit_found)
c0e16b14 7726 coding.head_ascii++;
4ed46869 7727 }
88993dfd 7728
2f3cbb32
KH
7729 if (null_byte_found || eight_bit_found
7730 || coding.head_ascii < coding.src_bytes
6cb21a4f
KH
7731 || detect_info.found)
7732 {
2f3cbb32 7733 if (coding.head_ascii == coding.src_bytes)
6cb21a4f
KH
7734 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
7735 for (i = 0; i < coding_category_raw_text; i++)
ff0dacd7 7736 {
6cb21a4f 7737 category = coding_priorities[i];
c7266f4a 7738 this = coding_categories + category;
6cb21a4f 7739 if (detect_info.found & (1 << category))
ff0dacd7
KH
7740 break;
7741 }
6cb21a4f 7742 else
2f3cbb32
KH
7743 {
7744 if (null_byte_found)
7745 {
7746 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
7747 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
7748 }
7749 for (i = 0; i < coding_category_raw_text; i++)
7750 {
7751 category = coding_priorities[i];
7752 this = coding_categories + category;
6cb21a4f 7753
2f3cbb32
KH
7754 if (this->id < 0)
7755 {
7756 /* No coding system of this category is defined. */
7757 detect_info.rejected |= (1 << category);
7758 }
7759 else if (category >= coding_category_raw_text)
7760 continue;
7761 else if (detect_info.checked & (1 << category))
7762 {
7763 if (highest
7764 && (detect_info.found & (1 << category)))
6cb21a4f 7765 break;
2f3cbb32
KH
7766 }
7767 else if ((*(this->detector)) (&coding, &detect_info)
7768 && highest
7769 && (detect_info.found & (1 << category)))
7770 {
7771 if (category == coding_category_utf_16_auto)
7772 {
7773 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7774 category = coding_category_utf_16_le;
7775 else
7776 category = coding_category_utf_16_be;
7777 }
7778 break;
7779 }
7780 }
7781 }
6cb21a4f 7782 }
ec6d2bb8 7783
2f3cbb32 7784 if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY)
ec6d2bb8 7785 {
ff0dacd7 7786 detect_info.found = CATEGORY_MASK_RAW_TEXT;
89528eb3
KH
7787 id = coding_categories[coding_category_raw_text].id;
7788 val = Fcons (make_number (id), Qnil);
7789 }
ff0dacd7 7790 else if (! detect_info.rejected && ! detect_info.found)
89528eb3 7791 {
ff0dacd7 7792 detect_info.found = CATEGORY_MASK_ANY;
89528eb3
KH
7793 id = coding_categories[coding_category_undecided].id;
7794 val = Fcons (make_number (id), Qnil);
7795 }
7796 else if (highest)
7797 {
ff0dacd7 7798 if (detect_info.found)
ec6d2bb8 7799 {
ff0dacd7
KH
7800 detect_info.found = 1 << category;
7801 val = Fcons (make_number (this->id), Qnil);
7802 }
7803 else
7804 for (i = 0; i < coding_category_raw_text; i++)
7805 if (! (detect_info.rejected & (1 << coding_priorities[i])))
7806 {
7807 detect_info.found = 1 << coding_priorities[i];
7808 id = coding_categories[coding_priorities[i]].id;
7809 val = Fcons (make_number (id), Qnil);
7810 break;
7811 }
7812 }
89528eb3
KH
7813 else
7814 {
ff0dacd7
KH
7815 int mask = detect_info.rejected | detect_info.found;
7816 int found = 0;
89528eb3 7817 val = Qnil;
ec6d2bb8 7818
89528eb3 7819 for (i = coding_category_raw_text - 1; i >= 0; i--)
ff0dacd7
KH
7820 {
7821 category = coding_priorities[i];
7822 if (! (mask & (1 << category)))
ec6d2bb8 7823 {
ff0dacd7
KH
7824 found |= 1 << category;
7825 id = coding_categories[category].id;
c7266f4a
KH
7826 if (id >= 0)
7827 val = Fcons (make_number (id), val);
ff0dacd7
KH
7828 }
7829 }
7830 for (i = coding_category_raw_text - 1; i >= 0; i--)
7831 {
7832 category = coding_priorities[i];
7833 if (detect_info.found & (1 << category))
7834 {
7835 id = coding_categories[category].id;
7836 val = Fcons (make_number (id), val);
ec6d2bb8 7837 }
ec6d2bb8 7838 }
ff0dacd7 7839 detect_info.found |= found;
ec6d2bb8 7840 }
ec6d2bb8 7841 }
a470d443
KH
7842 else if (base_category == coding_category_utf_8_auto)
7843 {
7844 if (detect_coding_utf_8 (&coding, &detect_info))
7845 {
7846 struct coding_system *this;
7847
7848 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
7849 this = coding_categories + coding_category_utf_8_sig;
7850 else
7851 this = coding_categories + coding_category_utf_8_nosig;
7852 val = Fcons (make_number (this->id), Qnil);
7853 }
7854 }
24a73b0a
KH
7855 else if (base_category == coding_category_utf_16_auto)
7856 {
7857 if (detect_coding_utf_16 (&coding, &detect_info))
7858 {
24a73b0a
KH
7859 struct coding_system *this;
7860
7861 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7862 this = coding_categories + coding_category_utf_16_le;
7863 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
7864 this = coding_categories + coding_category_utf_16_be;
7865 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
7866 this = coding_categories + coding_category_utf_16_be_nosig;
7867 else
7868 this = coding_categories + coding_category_utf_16_le_nosig;
7869 val = Fcons (make_number (this->id), Qnil);
7870 }
7871 }
df7492f9
KH
7872 else
7873 {
ff0dacd7 7874 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
89528eb3 7875 val = Fcons (make_number (coding.id), Qnil);
4ed46869 7876 }
df7492f9 7877
89528eb3 7878 /* Then, detect eol-format if necessary. */
df7492f9 7879 {
89528eb3 7880 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
df7492f9
KH
7881 Lisp_Object tail;
7882
89528eb3
KH
7883 if (VECTORP (eol_type))
7884 {
ff0dacd7 7885 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
2f3cbb32
KH
7886 {
7887 if (null_byte_found)
7888 normal_eol = EOL_SEEN_LF;
7889 else
7890 normal_eol = detect_eol (coding.source, src_bytes,
7891 coding_category_raw_text);
7892 }
ff0dacd7
KH
7893 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
7894 | CATEGORY_MASK_UTF_16_BE_NOSIG))
89528eb3
KH
7895 utf_16_be_eol = detect_eol (coding.source, src_bytes,
7896 coding_category_utf_16_be);
ff0dacd7
KH
7897 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
7898 | CATEGORY_MASK_UTF_16_LE_NOSIG))
89528eb3
KH
7899 utf_16_le_eol = detect_eol (coding.source, src_bytes,
7900 coding_category_utf_16_le);
7901 }
7902 else
7903 {
7904 if (EQ (eol_type, Qunix))
7905 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
7906 else if (EQ (eol_type, Qdos))
7907 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
7908 else
7909 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
7910 }
7911
df7492f9
KH
7912 for (tail = val; CONSP (tail); tail = XCDR (tail))
7913 {
89528eb3 7914 enum coding_category category;
df7492f9 7915 int this_eol;
89528eb3
KH
7916
7917 id = XINT (XCAR (tail));
7918 attrs = CODING_ID_ATTRS (id);
7919 category = XINT (CODING_ATTR_CATEGORY (attrs));
7920 eol_type = CODING_ID_EOL_TYPE (id);
df7492f9
KH
7921 if (VECTORP (eol_type))
7922 {
89528eb3
KH
7923 if (category == coding_category_utf_16_be
7924 || category == coding_category_utf_16_be_nosig)
7925 this_eol = utf_16_be_eol;
7926 else if (category == coding_category_utf_16_le
7927 || category == coding_category_utf_16_le_nosig)
7928 this_eol = utf_16_le_eol;
df7492f9 7929 else
89528eb3
KH
7930 this_eol = normal_eol;
7931
df7492f9
KH
7932 if (this_eol == EOL_SEEN_LF)
7933 XSETCAR (tail, AREF (eol_type, 0));
7934 else if (this_eol == EOL_SEEN_CRLF)
7935 XSETCAR (tail, AREF (eol_type, 1));
7936 else if (this_eol == EOL_SEEN_CR)
7937 XSETCAR (tail, AREF (eol_type, 2));
89528eb3
KH
7938 else
7939 XSETCAR (tail, CODING_ID_NAME (id));
df7492f9 7940 }
89528eb3
KH
7941 else
7942 XSETCAR (tail, CODING_ID_NAME (id));
df7492f9
KH
7943 }
7944 }
ec6d2bb8 7945
03699b14 7946 return (highest ? XCAR (val) : val);
ec6d2bb8
KH
7947}
7948
ec6d2bb8 7949
d46c5b12
KH
7950DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
7951 2, 3, 0,
48b0f3ae
PJ
7952 doc: /* Detect coding system of the text in the region between START and END.
7953Return a list of possible coding systems ordered by priority.
ec6d2bb8 7954
12e0131a 7955If only ASCII characters are found (except for such ISO-2022 control
d4a1d553
JB
7956characters as ESC), it returns a list of single element `undecided'
7957or its subsidiary coding system according to a detected end-of-line
7958format.
ec6d2bb8 7959
48b0f3ae
PJ
7960If optional argument HIGHEST is non-nil, return the coding system of
7961highest priority. */)
7962 (start, end, highest)
d46c5b12
KH
7963 Lisp_Object start, end, highest;
7964{
7965 int from, to;
7966 int from_byte, to_byte;
ec6d2bb8 7967
b7826503
PJ
7968 CHECK_NUMBER_COERCE_MARKER (start);
7969 CHECK_NUMBER_COERCE_MARKER (end);
ec6d2bb8 7970
d46c5b12
KH
7971 validate_region (&start, &end);
7972 from = XINT (start), to = XINT (end);
7973 from_byte = CHAR_TO_BYTE (from);
7974 to_byte = CHAR_TO_BYTE (to);
ec6d2bb8 7975
d46c5b12
KH
7976 if (from < GPT && to >= GPT)
7977 move_gap_both (to, to_byte);
c210f766 7978
d46c5b12 7979 return detect_coding_system (BYTE_POS_ADDR (from_byte),
24a73b0a 7980 to - from, to_byte - from_byte,
0a28aafb
KH
7981 !NILP (highest),
7982 !NILP (current_buffer
df7492f9
KH
7983 ->enable_multibyte_characters),
7984 Qnil);
ec6d2bb8
KH
7985}
7986
d46c5b12
KH
7987DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
7988 1, 2, 0,
48b0f3ae
PJ
7989 doc: /* Detect coding system of the text in STRING.
7990Return a list of possible coding systems ordered by priority.
fb88bf2d 7991
12e0131a 7992If only ASCII characters are found (except for such ISO-2022 control
d4a1d553
JB
7993characters as ESC), it returns a list of single element `undecided'
7994or its subsidiary coding system according to a detected end-of-line
7995format.
d46c5b12 7996
48b0f3ae
PJ
7997If optional argument HIGHEST is non-nil, return the coding system of
7998highest priority. */)
7999 (string, highest)
d46c5b12
KH
8000 Lisp_Object string, highest;
8001{
b7826503 8002 CHECK_STRING (string);
b73bfc1c 8003
24a73b0a
KH
8004 return detect_coding_system (SDATA (string),
8005 SCHARS (string), SBYTES (string),
8f924df7 8006 !NILP (highest), STRING_MULTIBYTE (string),
df7492f9 8007 Qnil);
4ed46869 8008}
4ed46869 8009
b73bfc1c 8010
df7492f9
KH
8011static INLINE int
8012char_encodable_p (c, attrs)
8013 int c;
8014 Lisp_Object attrs;
05e6f5dc 8015{
df7492f9 8016 Lisp_Object tail;
df7492f9 8017 struct charset *charset;
7d64c6ad 8018 Lisp_Object translation_table;
d46c5b12 8019
7d64c6ad 8020 translation_table = CODING_ATTR_TRANS_TBL (attrs);
a6f87d34 8021 if (! NILP (translation_table))
7d64c6ad 8022 c = translate_char (translation_table, c);
df7492f9
KH
8023 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8024 CONSP (tail); tail = XCDR (tail))
e133c8fa 8025 {
df7492f9
KH
8026 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8027 if (CHAR_CHARSET_P (c, charset))
8028 break;
e133c8fa 8029 }
df7492f9 8030 return (! NILP (tail));
05e6f5dc 8031}
83fa074f 8032
fb88bf2d 8033
df7492f9
KH
8034/* Return a list of coding systems that safely encode the text between
8035 START and END. If EXCLUDE is non-nil, it is a list of coding
8036 systems not to check. The returned list doesn't contain any such
48468dac 8037 coding systems. In any case, if the text contains only ASCII or is
df7492f9 8038 unibyte, return t. */
e077cc80 8039
df7492f9
KH
8040DEFUN ("find-coding-systems-region-internal",
8041 Ffind_coding_systems_region_internal,
8042 Sfind_coding_systems_region_internal, 2, 3, 0,
8043 doc: /* Internal use only. */)
8044 (start, end, exclude)
8045 Lisp_Object start, end, exclude;
8046{
8047 Lisp_Object coding_attrs_list, safe_codings;
8048 EMACS_INT start_byte, end_byte;
7c78e542 8049 const unsigned char *p, *pbeg, *pend;
df7492f9
KH
8050 int c;
8051 Lisp_Object tail, elt;
d46c5b12 8052
df7492f9
KH
8053 if (STRINGP (start))
8054 {
8055 if (!STRING_MULTIBYTE (start)
8f924df7 8056 || SCHARS (start) == SBYTES (start))
df7492f9
KH
8057 return Qt;
8058 start_byte = 0;
8f924df7 8059 end_byte = SBYTES (start);
df7492f9
KH
8060 }
8061 else
d46c5b12 8062 {
df7492f9
KH
8063 CHECK_NUMBER_COERCE_MARKER (start);
8064 CHECK_NUMBER_COERCE_MARKER (end);
8065 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8066 args_out_of_range (start, end);
8067 if (NILP (current_buffer->enable_multibyte_characters))
8068 return Qt;
8069 start_byte = CHAR_TO_BYTE (XINT (start));
8070 end_byte = CHAR_TO_BYTE (XINT (end));
8071 if (XINT (end) - XINT (start) == end_byte - start_byte)
8072 return Qt;
d46c5b12 8073
e1c23804 8074 if (XINT (start) < GPT && XINT (end) > GPT)
d46c5b12 8075 {
e1c23804
DL
8076 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8077 move_gap_both (XINT (start), start_byte);
df7492f9 8078 else
e1c23804 8079 move_gap_both (XINT (end), end_byte);
d46c5b12
KH
8080 }
8081 }
8082
df7492f9
KH
8083 coding_attrs_list = Qnil;
8084 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8085 if (NILP (exclude)
8086 || NILP (Fmemq (XCAR (tail), exclude)))
8087 {
8088 Lisp_Object attrs;
d46c5b12 8089
df7492f9
KH
8090 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8091 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8092 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7d64c6ad
KH
8093 {
8094 ASET (attrs, coding_attr_trans_tbl,
2170c8f0 8095 get_translation_table (attrs, 1, NULL));
7d64c6ad
KH
8096 coding_attrs_list = Fcons (attrs, coding_attrs_list);
8097 }
df7492f9 8098 }
d46c5b12 8099
df7492f9 8100 if (STRINGP (start))
8f924df7 8101 p = pbeg = SDATA (start);
df7492f9
KH
8102 else
8103 p = pbeg = BYTE_POS_ADDR (start_byte);
8104 pend = p + (end_byte - start_byte);
b843d1ae 8105
df7492f9
KH
8106 while (p < pend && ASCII_BYTE_P (*p)) p++;
8107 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
d46c5b12 8108
05e6f5dc 8109 while (p < pend)
72d1a715 8110 {
df7492f9
KH
8111 if (ASCII_BYTE_P (*p))
8112 p++;
72d1a715
RS
8113 else
8114 {
df7492f9 8115 c = STRING_CHAR_ADVANCE (p);
12410ef1 8116
df7492f9
KH
8117 charset_map_loaded = 0;
8118 for (tail = coding_attrs_list; CONSP (tail);)
8119 {
8120 elt = XCAR (tail);
8121 if (NILP (elt))
8122 tail = XCDR (tail);
8123 else if (char_encodable_p (c, elt))
8124 tail = XCDR (tail);
8125 else if (CONSP (XCDR (tail)))
8126 {
8127 XSETCAR (tail, XCAR (XCDR (tail)));
8128 XSETCDR (tail, XCDR (XCDR (tail)));
8129 }
8130 else
8131 {
8132 XSETCAR (tail, Qnil);
8133 tail = XCDR (tail);
8134 }
8135 }
8136 if (charset_map_loaded)
8137 {
8138 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
05e6f5dc 8139
df7492f9 8140 if (STRINGP (start))
8f924df7 8141 pbeg = SDATA (start);
df7492f9
KH
8142 else
8143 pbeg = BYTE_POS_ADDR (start_byte);
8144 p = pbeg + p_offset;
8145 pend = pbeg + pend_offset;
8146 }
8147 }
ec6d2bb8 8148 }
fb88bf2d 8149
988b3759 8150 safe_codings = list2 (Qraw_text, Qno_conversion);
df7492f9
KH
8151 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8152 if (! NILP (XCAR (tail)))
8153 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
ec6d2bb8 8154
05e6f5dc
KH
8155 return safe_codings;
8156}
4956c225 8157
d46c5b12 8158
8f924df7
KH
8159DEFUN ("unencodable-char-position", Funencodable_char_position,
8160 Sunencodable_char_position, 3, 5, 0,
8161 doc: /*
8162Return position of first un-encodable character in a region.
d4a1d553 8163START and END specify the region and CODING-SYSTEM specifies the
8f924df7 8164encoding to check. Return nil if CODING-SYSTEM does encode the region.
d46c5b12 8165
8f924df7
KH
8166If optional 4th argument COUNT is non-nil, it specifies at most how
8167many un-encodable characters to search. In this case, the value is a
8168list of positions.
d46c5b12 8169
8f924df7
KH
8170If optional 5th argument STRING is non-nil, it is a string to search
8171for un-encodable characters. In that case, START and END are indexes
8172to the string. */)
8173 (start, end, coding_system, count, string)
8174 Lisp_Object start, end, coding_system, count, string;
8175{
8176 int n;
8177 struct coding_system coding;
7d64c6ad 8178 Lisp_Object attrs, charset_list, translation_table;
8f924df7
KH
8179 Lisp_Object positions;
8180 int from, to;
8181 const unsigned char *p, *stop, *pend;
8182 int ascii_compatible;
fb88bf2d 8183
8f924df7
KH
8184 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8185 attrs = CODING_ID_ATTRS (coding.id);
8186 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8187 return Qnil;
8188 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8189 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2170c8f0 8190 translation_table = get_translation_table (attrs, 1, NULL);
fb88bf2d 8191
8f924df7
KH
8192 if (NILP (string))
8193 {
8194 validate_region (&start, &end);
8195 from = XINT (start);
8196 to = XINT (end);
8197 if (NILP (current_buffer->enable_multibyte_characters)
8198 || (ascii_compatible
8199 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8200 return Qnil;
8201 p = CHAR_POS_ADDR (from);
8202 pend = CHAR_POS_ADDR (to);
8203 if (from < GPT && to >= GPT)
8204 stop = GPT_ADDR;
8205 else
8206 stop = pend;
8207 }
8208 else
8209 {
8210 CHECK_STRING (string);
8211 CHECK_NATNUM (start);
8212 CHECK_NATNUM (end);
8213 from = XINT (start);
8214 to = XINT (end);
8215 if (from > to
8216 || to > SCHARS (string))
8217 args_out_of_range_3 (string, start, end);
8218 if (! STRING_MULTIBYTE (string))
8219 return Qnil;
8220 p = SDATA (string) + string_char_to_byte (string, from);
8221 stop = pend = SDATA (string) + string_char_to_byte (string, to);
8222 if (ascii_compatible && (to - from) == (pend - p))
8223 return Qnil;
8224 }
f2558efd 8225
8f924df7
KH
8226 if (NILP (count))
8227 n = 1;
8228 else
b73bfc1c 8229 {
8f924df7
KH
8230 CHECK_NATNUM (count);
8231 n = XINT (count);
b73bfc1c
KH
8232 }
8233
8f924df7
KH
8234 positions = Qnil;
8235 while (1)
d46c5b12 8236 {
8f924df7 8237 int c;
ec6d2bb8 8238
8f924df7
KH
8239 if (ascii_compatible)
8240 while (p < stop && ASCII_BYTE_P (*p))
8241 p++, from++;
8242 if (p >= stop)
0e79d667 8243 {
8f924df7
KH
8244 if (p >= pend)
8245 break;
8246 stop = pend;
8247 p = GAP_END_ADDR;
0e79d667 8248 }
ec6d2bb8 8249
8f924df7
KH
8250 c = STRING_CHAR_ADVANCE (p);
8251 if (! (ASCII_CHAR_P (c) && ascii_compatible)
7d64c6ad
KH
8252 && ! char_charset (translate_char (translation_table, c),
8253 charset_list, NULL))
ec6d2bb8 8254 {
8f924df7
KH
8255 positions = Fcons (make_number (from), positions);
8256 n--;
8257 if (n == 0)
8258 break;
ec6d2bb8
KH
8259 }
8260
8f924df7
KH
8261 from++;
8262 }
d46c5b12 8263
8f924df7
KH
8264 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8265}
d46c5b12 8266
d46c5b12 8267
df7492f9
KH
8268DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8269 Scheck_coding_systems_region, 3, 3, 0,
8270 doc: /* Check if the region is encodable by coding systems.
d46c5b12 8271
df7492f9
KH
8272START and END are buffer positions specifying the region.
8273CODING-SYSTEM-LIST is a list of coding systems to check.
d46c5b12 8274
df7492f9 8275The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
d4a1d553 8276CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
df7492f9
KH
8277whole region, POS0, POS1, ... are buffer positions where non-encodable
8278characters are found.
93dec019 8279
df7492f9
KH
8280If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8281value is nil.
93dec019 8282
df7492f9
KH
8283START may be a string. In that case, check if the string is
8284encodable, and the value contains indices to the string instead of
8285buffer positions. END is ignored. */)
8286 (start, end, coding_system_list)
8287 Lisp_Object start, end, coding_system_list;
05e6f5dc 8288{
df7492f9
KH
8289 Lisp_Object list;
8290 EMACS_INT start_byte, end_byte;
8291 int pos;
7c78e542 8292 const unsigned char *p, *pbeg, *pend;
df7492f9 8293 int c;
7d64c6ad 8294 Lisp_Object tail, elt, attrs;
70ad9fc4 8295
05e6f5dc
KH
8296 if (STRINGP (start))
8297 {
df7492f9 8298 if (!STRING_MULTIBYTE (start)
8f924df7 8299 && SCHARS (start) != SBYTES (start))
df7492f9
KH
8300 return Qnil;
8301 start_byte = 0;
8f924df7 8302 end_byte = SBYTES (start);
df7492f9 8303 pos = 0;
d46c5b12 8304 }
05e6f5dc 8305 else
b73bfc1c 8306 {
b7826503
PJ
8307 CHECK_NUMBER_COERCE_MARKER (start);
8308 CHECK_NUMBER_COERCE_MARKER (end);
05e6f5dc
KH
8309 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8310 args_out_of_range (start, end);
8311 if (NILP (current_buffer->enable_multibyte_characters))
df7492f9
KH
8312 return Qnil;
8313 start_byte = CHAR_TO_BYTE (XINT (start));
8314 end_byte = CHAR_TO_BYTE (XINT (end));
8315 if (XINT (end) - XINT (start) == end_byte - start_byte)
05e6f5dc 8316 return Qt;
df7492f9 8317
e1c23804 8318 if (XINT (start) < GPT && XINT (end) > GPT)
b73bfc1c 8319 {
e1c23804
DL
8320 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8321 move_gap_both (XINT (start), start_byte);
df7492f9 8322 else
e1c23804 8323 move_gap_both (XINT (end), end_byte);
b73bfc1c 8324 }
e1c23804 8325 pos = XINT (start);
b73bfc1c 8326 }
7553d0e1 8327
df7492f9
KH
8328 list = Qnil;
8329 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
12410ef1 8330 {
df7492f9 8331 elt = XCAR (tail);
7d64c6ad 8332 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
2170c8f0
KH
8333 ASET (attrs, coding_attr_trans_tbl,
8334 get_translation_table (attrs, 1, NULL));
7d64c6ad 8335 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
12410ef1
KH
8336 }
8337
df7492f9 8338 if (STRINGP (start))
8f924df7 8339 p = pbeg = SDATA (start);
72d1a715 8340 else
df7492f9
KH
8341 p = pbeg = BYTE_POS_ADDR (start_byte);
8342 pend = p + (end_byte - start_byte);
4ed46869 8343
df7492f9
KH
8344 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8345 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
ec6d2bb8 8346
df7492f9 8347 while (p < pend)
d46c5b12 8348 {
df7492f9
KH
8349 if (ASCII_BYTE_P (*p))
8350 p++;
e133c8fa 8351 else
05e6f5dc 8352 {
df7492f9
KH
8353 c = STRING_CHAR_ADVANCE (p);
8354
8355 charset_map_loaded = 0;
8356 for (tail = list; CONSP (tail); tail = XCDR (tail))
8357 {
8358 elt = XCDR (XCAR (tail));
8359 if (! char_encodable_p (c, XCAR (elt)))
8360 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8361 }
8362 if (charset_map_loaded)
8363 {
8364 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8365
8366 if (STRINGP (start))
8f924df7 8367 pbeg = SDATA (start);
df7492f9
KH
8368 else
8369 pbeg = BYTE_POS_ADDR (start_byte);
8370 p = pbeg + p_offset;
8371 pend = pbeg + pend_offset;
8372 }
05e6f5dc 8373 }
df7492f9 8374 pos++;
d46c5b12 8375 }
4ed46869 8376
df7492f9
KH
8377 tail = list;
8378 list = Qnil;
8379 for (; CONSP (tail); tail = XCDR (tail))
ec6d2bb8 8380 {
df7492f9
KH
8381 elt = XCAR (tail);
8382 if (CONSP (XCDR (XCDR (elt))))
8383 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8384 list);
ec6d2bb8 8385 }
2b4f9037 8386
df7492f9 8387 return list;
d46c5b12
KH
8388}
8389
3fd9494b 8390
b73bfc1c 8391Lisp_Object
df7492f9
KH
8392code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
8393 Lisp_Object start, end, coding_system, dst_object;
8394 int encodep, norecord;
4ed46869 8395{
3a73fa5d 8396 struct coding_system coding;
df7492f9
KH
8397 EMACS_INT from, from_byte, to, to_byte;
8398 Lisp_Object src_object;
4ed46869 8399
b7826503
PJ
8400 CHECK_NUMBER_COERCE_MARKER (start);
8401 CHECK_NUMBER_COERCE_MARKER (end);
df7492f9
KH
8402 if (NILP (coding_system))
8403 coding_system = Qno_conversion;
8404 else
8405 CHECK_CODING_SYSTEM (coding_system);
8406 src_object = Fcurrent_buffer ();
8407 if (NILP (dst_object))
8408 dst_object = src_object;
8409 else if (! EQ (dst_object, Qt))
8410 CHECK_BUFFER (dst_object);
3a73fa5d 8411
d46c5b12
KH
8412 validate_region (&start, &end);
8413 from = XFASTINT (start);
df7492f9 8414 from_byte = CHAR_TO_BYTE (from);
d46c5b12 8415 to = XFASTINT (end);
df7492f9 8416 to_byte = CHAR_TO_BYTE (to);
764ca8da 8417
df7492f9
KH
8418 setup_coding_system (coding_system, &coding);
8419 coding.mode |= CODING_MODE_LAST_BLOCK;
ec6d2bb8 8420
df7492f9
KH
8421 if (encodep)
8422 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8423 dst_object);
8424 else
8425 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8426 dst_object);
8427 if (! norecord)
8428 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
ec6d2bb8 8429
df7492f9
KH
8430 return (BUFFERP (dst_object)
8431 ? make_number (coding.produced_char)
8432 : coding.dst_object);
4031e2bf 8433}
78108bcd 8434
4ed46869 8435
4031e2bf 8436DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
df7492f9 8437 3, 4, "r\nzCoding system: ",
48b0f3ae 8438 doc: /* Decode the current region from the specified coding system.
df7492f9
KH
8439When called from a program, takes four arguments:
8440 START, END, CODING-SYSTEM, and DESTINATION.
8441START and END are buffer positions.
8844fa83 8442
df7492f9 8443Optional 4th arguments DESTINATION specifies where the decoded text goes.
2354b80b 8444If nil, the region between START and END is replaced by the decoded text.
df7492f9 8445If buffer, the decoded text is inserted in the buffer.
446dcd75 8446In those cases, the length of the decoded text is returned.
319a3947 8447If DESTINATION is t, the decoded text is returned.
8844fa83 8448
48b0f3ae
PJ
8449This function sets `last-coding-system-used' to the precise coding system
8450used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
319a3947 8451not fully specified.) */)
df7492f9
KH
8452 (start, end, coding_system, destination)
8453 Lisp_Object start, end, coding_system, destination;
4031e2bf 8454{
df7492f9 8455 return code_convert_region (start, end, coding_system, destination, 0, 0);
3a73fa5d 8456}
8844fa83 8457
3a73fa5d 8458DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
df7492f9
KH
8459 3, 4, "r\nzCoding system: ",
8460 doc: /* Encode the current region by specified coding system.
d4a1d553
JB
8461When called from a program, takes four arguments:
8462 START, END, CODING-SYSTEM and DESTINATION.
8463START and END are buffer positions.
d46c5b12 8464
df7492f9
KH
8465Optional 4th arguments DESTINATION specifies where the encoded text goes.
8466If nil, the region between START and END is replace by the encoded text.
8467If buffer, the encoded text is inserted in the buffer.
446dcd75 8468In those cases, the length of the encoded text is returned.
319a3947 8469If DESTINATION is t, the encoded text is returned.
2391eaa4 8470
48b0f3ae
PJ
8471This function sets `last-coding-system-used' to the precise coding system
8472used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
319a3947 8473not fully specified.) */)
df7492f9
KH
8474 (start, end, coding_system, destination)
8475 Lisp_Object start, end, coding_system, destination;
3a73fa5d 8476{
df7492f9 8477 return code_convert_region (start, end, coding_system, destination, 1, 0);
b73bfc1c
KH
8478}
8479
8480Lisp_Object
df7492f9
KH
8481code_convert_string (string, coding_system, dst_object,
8482 encodep, nocopy, norecord)
8483 Lisp_Object string, coding_system, dst_object;
8484 int encodep, nocopy, norecord;
b73bfc1c 8485{
4031e2bf 8486 struct coding_system coding;
df7492f9 8487 EMACS_INT chars, bytes;
ec6d2bb8 8488
b7826503 8489 CHECK_STRING (string);
d46c5b12 8490 if (NILP (coding_system))
4956c225 8491 {
df7492f9
KH
8492 if (! norecord)
8493 Vlast_coding_system_used = Qno_conversion;
8494 if (NILP (dst_object))
8495 return (nocopy ? Fcopy_sequence (string) : string);
4956c225 8496 }
b73bfc1c 8497
df7492f9
KH
8498 if (NILP (coding_system))
8499 coding_system = Qno_conversion;
8500 else
8501 CHECK_CODING_SYSTEM (coding_system);
8502 if (NILP (dst_object))
8503 dst_object = Qt;
8504 else if (! EQ (dst_object, Qt))
8505 CHECK_BUFFER (dst_object);
73be902c 8506
df7492f9 8507 setup_coding_system (coding_system, &coding);
d46c5b12 8508 coding.mode |= CODING_MODE_LAST_BLOCK;
8f924df7
KH
8509 chars = SCHARS (string);
8510 bytes = SBYTES (string);
df7492f9
KH
8511 if (encodep)
8512 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8513 else
8514 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8515 if (! norecord)
8516 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
73be902c 8517
df7492f9
KH
8518 return (BUFFERP (dst_object)
8519 ? make_number (coding.produced_char)
8520 : coding.dst_object);
4ed46869 8521}
73be902c 8522
b73bfc1c 8523
ecec61c1 8524/* Encode or decode STRING according to CODING_SYSTEM.
ec6d2bb8 8525 Do not set Vlast_coding_system_used.
4ed46869 8526
ec6d2bb8
KH
8527 This function is called only from macros DECODE_FILE and
8528 ENCODE_FILE, thus we ignore character composition. */
4ed46869 8529
ecec61c1
KH
8530Lisp_Object
8531code_convert_string_norecord (string, coding_system, encodep)
8532 Lisp_Object string, coding_system;
8533 int encodep;
4ed46869 8534{
0be8721c 8535 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
4ed46869
KH
8536}
8537
4ed46869 8538
df7492f9
KH
8539DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
8540 2, 4, 0,
8541 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
8542
8543Optional third arg NOCOPY non-nil means it is OK to return STRING itself
8544if the decoding operation is trivial.
ecec61c1 8545
d4a1d553 8546Optional fourth arg BUFFER non-nil means that the decoded text is
a3f6ee6d 8547inserted in BUFFER instead of returned as a string. In this case,
319a3947 8548the return value is the length of the decoded text.
ecec61c1 8549
df7492f9
KH
8550This function sets `last-coding-system-used' to the precise coding system
8551used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
d4a1d553 8552not fully specified.) */)
df7492f9
KH
8553 (string, coding_system, nocopy, buffer)
8554 Lisp_Object string, coding_system, nocopy, buffer;
4ed46869 8555{
df7492f9
KH
8556 return code_convert_string (string, coding_system, buffer,
8557 0, ! NILP (nocopy), 0);
4ed46869
KH
8558}
8559
df7492f9
KH
8560DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
8561 2, 4, 0,
8562 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
8563
8564Optional third arg NOCOPY non-nil means it is OK to return STRING
8565itself if the encoding operation is trivial.
8566
d4a1d553 8567Optional fourth arg BUFFER non-nil means that the encoded text is
a3f6ee6d 8568inserted in BUFFER instead of returned as a string. In this case,
446dcd75 8569the return value is the length of the encoded text.
df7492f9
KH
8570
8571This function sets `last-coding-system-used' to the precise coding system
8572used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8573not fully specified.) */)
8574 (string, coding_system, nocopy, buffer)
8575 Lisp_Object string, coding_system, nocopy, buffer;
4ed46869 8576{
df7492f9 8577 return code_convert_string (string, coding_system, buffer,
c197f191 8578 1, ! NILP (nocopy), 1);
4ed46869 8579}
df7492f9 8580
3a73fa5d 8581\f
4ed46869 8582DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
48b0f3ae
PJ
8583 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
8584Return the corresponding character. */)
8585 (code)
4ed46869 8586 Lisp_Object code;
4ed46869 8587{
df7492f9
KH
8588 Lisp_Object spec, attrs, val;
8589 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
8590 int c;
4ed46869 8591
df7492f9
KH
8592 CHECK_NATNUM (code);
8593 c = XFASTINT (code);
8594 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8595 attrs = AREF (spec, 0);
4ed46869 8596
df7492f9
KH
8597 if (ASCII_BYTE_P (c)
8598 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8599 return code;
4ed46869 8600
df7492f9
KH
8601 val = CODING_ATTR_CHARSET_LIST (attrs);
8602 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
004068e4
KH
8603 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8604 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
fa42c37f 8605
df7492f9
KH
8606 if (c <= 0x7F)
8607 charset = charset_roman;
8608 else if (c >= 0xA0 && c < 0xDF)
55ab7be3 8609 {
df7492f9
KH
8610 charset = charset_kana;
8611 c -= 0x80;
4ed46869 8612 }
55ab7be3 8613 else
4ed46869 8614 {
004068e4 8615 int s1 = c >> 8, s2 = c & 0xFF;
df7492f9
KH
8616
8617 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
8618 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
8619 error ("Invalid code: %d", code);
8620 SJIS_TO_JIS (c);
8621 charset = charset_kanji;
4ed46869 8622 }
df7492f9
KH
8623 c = DECODE_CHAR (charset, c);
8624 if (c < 0)
8625 error ("Invalid code: %d", code);
8626 return make_number (c);
93dec019 8627}
4ed46869 8628
48b0f3ae 8629
4ed46869 8630DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
8acf0c0e 8631 doc: /* Encode a Japanese character CH to shift_jis encoding.
48b0f3ae
PJ
8632Return the corresponding code in SJIS. */)
8633 (ch)
df7492f9 8634 Lisp_Object ch;
4ed46869 8635{
df7492f9
KH
8636 Lisp_Object spec, attrs, charset_list;
8637 int c;
8638 struct charset *charset;
8639 unsigned code;
48b0f3ae 8640
df7492f9
KH
8641 CHECK_CHARACTER (ch);
8642 c = XFASTINT (ch);
8643 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8644 attrs = AREF (spec, 0);
8645
8646 if (ASCII_CHAR_P (c)
8647 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8648 return ch;
8649
8650 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8651 charset = char_charset (c, charset_list, &code);
8652 if (code == CHARSET_INVALID_CODE (charset))
8653 error ("Can't encode by shift_jis encoding: %d", c);
8654 JIS_TO_SJIS (code);
8655
8656 return make_number (code);
4ed46869
KH
8657}
8658
8659DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
48b0f3ae
PJ
8660 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
8661Return the corresponding character. */)
8662 (code)
4ed46869 8663 Lisp_Object code;
d46c5b12 8664{
df7492f9
KH
8665 Lisp_Object spec, attrs, val;
8666 struct charset *charset_roman, *charset_big5, *charset;
8667 int c;
6289dd10 8668
df7492f9
KH
8669 CHECK_NATNUM (code);
8670 c = XFASTINT (code);
8671 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8672 attrs = AREF (spec, 0);
4ed46869 8673
df7492f9
KH
8674 if (ASCII_BYTE_P (c)
8675 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8676 return code;
6289dd10 8677
df7492f9
KH
8678 val = CODING_ATTR_CHARSET_LIST (attrs);
8679 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8680 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
c210f766 8681
df7492f9
KH
8682 if (c <= 0x7F)
8683 charset = charset_roman;
c28a9453
KH
8684 else
8685 {
df7492f9
KH
8686 int b1 = c >> 8, b2 = c & 0x7F;
8687 if (b1 < 0xA1 || b1 > 0xFE
8688 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
8689 error ("Invalid code: %d", code);
8690 charset = charset_big5;
c28a9453 8691 }
df7492f9
KH
8692 c = DECODE_CHAR (charset, (unsigned )c);
8693 if (c < 0)
8694 error ("Invalid code: %d", code);
8695 return make_number (c);
d46c5b12 8696}
6289dd10 8697
4ed46869 8698DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
8acf0c0e 8699 doc: /* Encode the Big5 character CH to BIG5 coding system.
48b0f3ae
PJ
8700Return the corresponding character code in Big5. */)
8701 (ch)
4ed46869
KH
8702 Lisp_Object ch;
8703{
df7492f9
KH
8704 Lisp_Object spec, attrs, charset_list;
8705 struct charset *charset;
8706 int c;
8707 unsigned code;
8708
8709 CHECK_CHARACTER (ch);
8710 c = XFASTINT (ch);
8711 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8712 attrs = AREF (spec, 0);
8713 if (ASCII_CHAR_P (c)
8714 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8715 return ch;
8716
8717 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8718 charset = char_charset (c, charset_list, &code);
8719 if (code == CHARSET_INVALID_CODE (charset))
8720 error ("Can't encode by Big5 encoding: %d", c);
8721
8722 return make_number (code);
4ed46869 8723}
48b0f3ae 8724
3a73fa5d 8725\f
002fdb44 8726DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
68bba4e4 8727 Sset_terminal_coding_system_internal, 1, 2, 0,
48b0f3ae 8728 doc: /* Internal use only. */)
6ed8eeff 8729 (coding_system, terminal)
b74e4686 8730 Lisp_Object coding_system;
6ed8eeff 8731 Lisp_Object terminal;
4ed46869 8732{
6ed8eeff 8733 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
b7826503 8734 CHECK_SYMBOL (coding_system);
b8299c66 8735 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
70c22245 8736 /* We had better not send unsafe characters to terminal. */
c73bd236 8737 terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
df7492f9 8738 /* Characer composition should be disabled. */
c73bd236 8739 terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
b8299c66
KL
8740 terminal_coding->src_multibyte = 1;
8741 terminal_coding->dst_multibyte = 0;
4ed46869
KH
8742 return Qnil;
8743}
8744
c4825358
KH
8745DEFUN ("set-safe-terminal-coding-system-internal",
8746 Fset_safe_terminal_coding_system_internal,
48b0f3ae 8747 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
ddb67bdc 8748 doc: /* Internal use only. */)
48b0f3ae 8749 (coding_system)
b74e4686 8750 Lisp_Object coding_system;
d46c5b12 8751{
b7826503 8752 CHECK_SYMBOL (coding_system);
c4825358
KH
8753 setup_coding_system (Fcheck_coding_system (coding_system),
8754 &safe_terminal_coding);
df7492f9
KH
8755 /* Characer composition should be disabled. */
8756 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
b73bfc1c
KH
8757 safe_terminal_coding.src_multibyte = 1;
8758 safe_terminal_coding.dst_multibyte = 0;
c4825358
KH
8759 return Qnil;
8760}
4ed46869 8761
002fdb44 8762DEFUN ("terminal-coding-system", Fterminal_coding_system,
68bba4e4 8763 Sterminal_coding_system, 0, 1, 0,
6ed8eeff
KL
8764 doc: /* Return coding system specified for terminal output on the given terminal.
8765TERMINAL may be a terminal id, a frame, or nil for the selected
8766frame's terminal device. */)
8767 (terminal)
8768 Lisp_Object terminal;
4ed46869 8769{
985773c9
MB
8770 struct coding_system *terminal_coding
8771 = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
8772 Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
ae6f73fa 8773
ae6f73fa 8774 /* For backward compatibility, return nil if it is `undecided'. */
f75c90a9 8775 return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
4ed46869
KH
8776}
8777
002fdb44 8778DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
68bba4e4 8779 Sset_keyboard_coding_system_internal, 1, 2, 0,
48b0f3ae 8780 doc: /* Internal use only. */)
6ed8eeff 8781 (coding_system, terminal)
4ed46869 8782 Lisp_Object coding_system;
6ed8eeff 8783 Lisp_Object terminal;
4ed46869 8784{
6ed8eeff 8785 struct terminal *t = get_terminal (terminal, 1);
b7826503 8786 CHECK_SYMBOL (coding_system);
df7492f9 8787 setup_coding_system (Fcheck_coding_system (coding_system),
c73bd236 8788 TERMINAL_KEYBOARD_CODING (t));
df7492f9 8789 /* Characer composition should be disabled. */
c73bd236
MB
8790 TERMINAL_KEYBOARD_CODING (t)->common_flags
8791 &= ~CODING_ANNOTATE_COMPOSITION_MASK;
4ed46869
KH
8792 return Qnil;
8793}
8794
8795DEFUN ("keyboard-coding-system",
985773c9 8796 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
48b0f3ae 8797 doc: /* Return coding system specified for decoding keyboard input. */)
985773c9
MB
8798 (terminal)
8799 Lisp_Object terminal;
4ed46869 8800{
985773c9
MB
8801 return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
8802 (get_terminal (terminal, 1))->id);
4ed46869
KH
8803}
8804
4ed46869 8805\f
a5d301df
KH
8806DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
8807 Sfind_operation_coding_system, 1, MANY, 0,
48b0f3ae
PJ
8808 doc: /* Choose a coding system for an operation based on the target name.
8809The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
8810DECODING-SYSTEM is the coding system to use for decoding
8811\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
8812for encoding (in case OPERATION does encoding).
05e6f5dc 8813
48b0f3ae
PJ
8814The first argument OPERATION specifies an I/O primitive:
8815 For file I/O, `insert-file-contents' or `write-region'.
8816 For process I/O, `call-process', `call-process-region', or `start-process'.
8817 For network I/O, `open-network-stream'.
05e6f5dc 8818
48b0f3ae
PJ
8819The remaining arguments should be the same arguments that were passed
8820to the primitive. Depending on which primitive, one of those arguments
8821is selected as the TARGET. For example, if OPERATION does file I/O,
8822whichever argument specifies the file name is TARGET.
05e6f5dc 8823
48b0f3ae 8824TARGET has a meaning which depends on OPERATION:
b883cdb2 8825 For file I/O, TARGET is a file name (except for the special case below).
48b0f3ae 8826 For process I/O, TARGET is a process name.
d4a1d553 8827 For network I/O, TARGET is a service name or a port number.
05e6f5dc 8828
d4a1d553 8829This function looks up what is specified for TARGET in
48b0f3ae
PJ
8830`file-coding-system-alist', `process-coding-system-alist',
8831or `network-coding-system-alist' depending on OPERATION.
8832They may specify a coding system, a cons of coding systems,
8833or a function symbol to call.
8834In the last case, we call the function with one argument,
8835which is a list of all the arguments given to this function.
1011c487
MB
8836If the function can't decide a coding system, it can return
8837`undecided' so that the normal code-detection is performed.
48b0f3ae 8838
b883cdb2
MB
8839If OPERATION is `insert-file-contents', the argument corresponding to
8840TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
8841file name to look up, and BUFFER is a buffer that contains the file's
8842contents (not yet decoded). If `file-coding-system-alist' specifies a
8843function to call for FILENAME, that function should examine the
8844contents of BUFFER instead of reading the file.
8845
d918f936 8846usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
48b0f3ae 8847 (nargs, args)
4ed46869
KH
8848 int nargs;
8849 Lisp_Object *args;
6b89e3aa 8850{
4ed46869
KH
8851 Lisp_Object operation, target_idx, target, val;
8852 register Lisp_Object chain;
177c0ea7 8853
4ed46869
KH
8854 if (nargs < 2)
8855 error ("Too few arguments");
8856 operation = args[0];
8857 if (!SYMBOLP (operation)
8858 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3ed051d4 8859 error ("Invalid first argument");
4ed46869
KH
8860 if (nargs < 1 + XINT (target_idx))
8861 error ("Too few arguments for operation: %s",
8f924df7 8862 SDATA (SYMBOL_NAME (operation)));
4ed46869
KH
8863 target = args[XINT (target_idx) + 1];
8864 if (!(STRINGP (target)
091a0ff0
KH
8865 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
8866 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
4ed46869 8867 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
df7492f9 8868 error ("Invalid %dth argument", XINT (target_idx) + 1);
091a0ff0
KH
8869 if (CONSP (target))
8870 target = XCAR (target);
4ed46869 8871
2e34157c
RS
8872 chain = ((EQ (operation, Qinsert_file_contents)
8873 || EQ (operation, Qwrite_region))
02ba4723 8874 ? Vfile_coding_system_alist
2e34157c 8875 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
8876 ? Vnetwork_coding_system_alist
8877 : Vprocess_coding_system_alist));
4ed46869
KH
8878 if (NILP (chain))
8879 return Qnil;
8880
03699b14 8881 for (; CONSP (chain); chain = XCDR (chain))
6b89e3aa 8882 {
f44d27ce 8883 Lisp_Object elt;
6b89e3aa 8884
df7492f9 8885 elt = XCAR (chain);
4ed46869
KH
8886 if (CONSP (elt)
8887 && ((STRINGP (target)
03699b14
KR
8888 && STRINGP (XCAR (elt))
8889 && fast_string_match (XCAR (elt), target) >= 0)
8890 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
6b89e3aa 8891 {
03699b14 8892 val = XCDR (elt);
b19fd4c5
KH
8893 /* Here, if VAL is both a valid coding system and a valid
8894 function symbol, we return VAL as a coding system. */
02ba4723
KH
8895 if (CONSP (val))
8896 return val;
8897 if (! SYMBOLP (val))
8898 return Qnil;
8899 if (! NILP (Fcoding_system_p (val)))
8900 return Fcons (val, val);
b19fd4c5 8901 if (! NILP (Ffboundp (val)))
6b89e3aa 8902 {
e2b97060
MB
8903 /* We use call1 rather than safe_call1
8904 so as to get bug reports about functions called here
8905 which don't handle the current interface. */
8906 val = call1 (val, Flist (nargs, args));
b19fd4c5
KH
8907 if (CONSP (val))
8908 return val;
8909 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
8910 return Fcons (val, val);
6b89e3aa 8911 }
02ba4723 8912 return Qnil;
6b89e3aa
KH
8913 }
8914 }
4ed46869 8915 return Qnil;
6b89e3aa
KH
8916}
8917
df7492f9 8918DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
a3f6ee6d 8919 Sset_coding_system_priority, 0, MANY, 0,
da7db224 8920 doc: /* Assign higher priority to the coding systems given as arguments.
d4a1d553 8921If multiple coding systems belong to the same category,
a3181084
DL
8922all but the first one are ignored.
8923
d4a1d553 8924usage: (set-coding-system-priority &rest coding-systems) */)
df7492f9
KH
8925 (nargs, args)
8926 int nargs;
8927 Lisp_Object *args;
8928{
8929 int i, j;
8930 int changed[coding_category_max];
8931 enum coding_category priorities[coding_category_max];
8932
8933 bzero (changed, sizeof changed);
6b89e3aa 8934
df7492f9 8935 for (i = j = 0; i < nargs; i++)
6b89e3aa 8936 {
df7492f9
KH
8937 enum coding_category category;
8938 Lisp_Object spec, attrs;
6b89e3aa 8939
df7492f9
KH
8940 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
8941 attrs = AREF (spec, 0);
8942 category = XINT (CODING_ATTR_CATEGORY (attrs));
8943 if (changed[category])
8944 /* Ignore this coding system because a coding system of the
8945 same category already had a higher priority. */
8946 continue;
8947 changed[category] = 1;
8948 priorities[j++] = category;
8949 if (coding_categories[category].id >= 0
8950 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
8951 setup_coding_system (args[i], &coding_categories[category]);
ff563fce 8952 Fset (AREF (Vcoding_category_table, category), args[i]);
df7492f9 8953 }
6b89e3aa 8954
df7492f9
KH
8955 /* Now we have decided top J priorities. Reflect the order of the
8956 original priorities to the remaining priorities. */
6b89e3aa 8957
df7492f9 8958 for (i = j, j = 0; i < coding_category_max; i++, j++)
6b89e3aa 8959 {
df7492f9
KH
8960 while (j < coding_category_max
8961 && changed[coding_priorities[j]])
8962 j++;
8963 if (j == coding_category_max)
8964 abort ();
8965 priorities[i] = coding_priorities[j];
8966 }
6b89e3aa 8967
df7492f9 8968 bcopy (priorities, coding_priorities, sizeof priorities);
177c0ea7 8969
ff563fce
KH
8970 /* Update `coding-category-list'. */
8971 Vcoding_category_list = Qnil;
8972 for (i = coding_category_max - 1; i >= 0; i--)
8973 Vcoding_category_list
8974 = Fcons (AREF (Vcoding_category_table, priorities[i]),
8975 Vcoding_category_list);
6b89e3aa 8976
df7492f9 8977 return Qnil;
6b89e3aa
KH
8978}
8979
df7492f9
KH
8980DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
8981 Scoding_system_priority_list, 0, 1, 0,
da7db224
DL
8982 doc: /* Return a list of coding systems ordered by their priorities.
8983HIGHESTP non-nil means just return the highest priority one. */)
df7492f9
KH
8984 (highestp)
8985 Lisp_Object highestp;
d46c5b12
KH
8986{
8987 int i;
df7492f9 8988 Lisp_Object val;
6b89e3aa 8989
df7492f9 8990 for (i = 0, val = Qnil; i < coding_category_max; i++)
d46c5b12 8991 {
df7492f9
KH
8992 enum coding_category category = coding_priorities[i];
8993 int id = coding_categories[category].id;
8994 Lisp_Object attrs;
068a9dbd 8995
df7492f9
KH
8996 if (id < 0)
8997 continue;
8998 attrs = CODING_ID_ATTRS (id);
8999 if (! NILP (highestp))
9000 return CODING_ATTR_BASE_NAME (attrs);
9001 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9002 }
9003 return Fnreverse (val);
9004}
068a9dbd 9005
f0064e1f 9006static char *suffixes[] = { "-unix", "-dos", "-mac" };
068a9dbd
KH
9007
9008static Lisp_Object
df7492f9
KH
9009make_subsidiaries (base)
9010 Lisp_Object base;
068a9dbd 9011{
df7492f9 9012 Lisp_Object subsidiaries;
8f924df7 9013 int base_name_len = SBYTES (SYMBOL_NAME (base));
df7492f9
KH
9014 char *buf = (char *) alloca (base_name_len + 6);
9015 int i;
068a9dbd 9016
8f924df7 9017 bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
df7492f9
KH
9018 subsidiaries = Fmake_vector (make_number (3), Qnil);
9019 for (i = 0; i < 3; i++)
068a9dbd 9020 {
df7492f9
KH
9021 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
9022 ASET (subsidiaries, i, intern (buf));
068a9dbd 9023 }
df7492f9 9024 return subsidiaries;
068a9dbd
KH
9025}
9026
9027
df7492f9
KH
9028DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9029 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
1fcd6c8b
DL
9030 doc: /* For internal use only.
9031usage: (define-coding-system-internal ...) */)
df7492f9
KH
9032 (nargs, args)
9033 int nargs;
9034 Lisp_Object *args;
068a9dbd 9035{
df7492f9
KH
9036 Lisp_Object name;
9037 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
9038 Lisp_Object attrs; /* Vector of attributes. */
9039 Lisp_Object eol_type;
9040 Lisp_Object aliases;
9041 Lisp_Object coding_type, charset_list, safe_charsets;
9042 enum coding_category category;
9043 Lisp_Object tail, val;
9044 int max_charset_id = 0;
9045 int i;
068a9dbd 9046
df7492f9
KH
9047 if (nargs < coding_arg_max)
9048 goto short_args;
068a9dbd 9049
df7492f9 9050 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
068a9dbd 9051
df7492f9
KH
9052 name = args[coding_arg_name];
9053 CHECK_SYMBOL (name);
9054 CODING_ATTR_BASE_NAME (attrs) = name;
068a9dbd 9055
df7492f9
KH
9056 val = args[coding_arg_mnemonic];
9057 if (! STRINGP (val))
9058 CHECK_CHARACTER (val);
9059 CODING_ATTR_MNEMONIC (attrs) = val;
068a9dbd 9060
df7492f9
KH
9061 coding_type = args[coding_arg_coding_type];
9062 CHECK_SYMBOL (coding_type);
9063 CODING_ATTR_TYPE (attrs) = coding_type;
068a9dbd 9064
df7492f9
KH
9065 charset_list = args[coding_arg_charset_list];
9066 if (SYMBOLP (charset_list))
9067 {
9068 if (EQ (charset_list, Qiso_2022))
9069 {
9070 if (! EQ (coding_type, Qiso_2022))
9071 error ("Invalid charset-list");
9072 charset_list = Viso_2022_charset_list;
9073 }
9074 else if (EQ (charset_list, Qemacs_mule))
9075 {
9076 if (! EQ (coding_type, Qemacs_mule))
9077 error ("Invalid charset-list");
9078 charset_list = Vemacs_mule_charset_list;
9079 }
9080 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9081 if (max_charset_id < XFASTINT (XCAR (tail)))
9082 max_charset_id = XFASTINT (XCAR (tail));
9083 }
068a9dbd
KH
9084 else
9085 {
df7492f9 9086 charset_list = Fcopy_sequence (charset_list);
985773c9 9087 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
068a9dbd 9088 {
df7492f9
KH
9089 struct charset *charset;
9090
985773c9 9091 val = XCAR (tail);
df7492f9
KH
9092 CHECK_CHARSET_GET_CHARSET (val, charset);
9093 if (EQ (coding_type, Qiso_2022)
9094 ? CHARSET_ISO_FINAL (charset) < 0
9095 : EQ (coding_type, Qemacs_mule)
9096 ? CHARSET_EMACS_MULE_ID (charset) < 0
9097 : 0)
9098 error ("Can't handle charset `%s'",
8f924df7 9099 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
df7492f9 9100
8f924df7 9101 XSETCAR (tail, make_number (charset->id));
df7492f9
KH
9102 if (max_charset_id < charset->id)
9103 max_charset_id = charset->id;
068a9dbd
KH
9104 }
9105 }
df7492f9 9106 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
068a9dbd 9107
df7492f9
KH
9108 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
9109 make_number (255));
9110 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8f924df7 9111 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
df7492f9 9112 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
068a9dbd 9113
584948ac 9114 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
3a73fa5d 9115
df7492f9 9116 val = args[coding_arg_decode_translation_table];
a6f87d34 9117 if (! CHAR_TABLE_P (val) && ! CONSP (val))
7d64c6ad 9118 CHECK_SYMBOL (val);
df7492f9 9119 CODING_ATTR_DECODE_TBL (attrs) = val;
3a73fa5d 9120
df7492f9 9121 val = args[coding_arg_encode_translation_table];
a6f87d34 9122 if (! CHAR_TABLE_P (val) && ! CONSP (val))
7d64c6ad 9123 CHECK_SYMBOL (val);
df7492f9 9124 CODING_ATTR_ENCODE_TBL (attrs) = val;
d46c5b12 9125
df7492f9
KH
9126 val = args[coding_arg_post_read_conversion];
9127 CHECK_SYMBOL (val);
9128 CODING_ATTR_POST_READ (attrs) = val;
d46c5b12 9129
df7492f9
KH
9130 val = args[coding_arg_pre_write_conversion];
9131 CHECK_SYMBOL (val);
9132 CODING_ATTR_PRE_WRITE (attrs) = val;
3a73fa5d 9133
df7492f9
KH
9134 val = args[coding_arg_default_char];
9135 if (NILP (val))
9136 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
9137 else
9138 {
8f924df7 9139 CHECK_CHARACTER (val);
df7492f9
KH
9140 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9141 }
4031e2bf 9142
8f924df7
KH
9143 val = args[coding_arg_for_unibyte];
9144 CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
3a73fa5d 9145
df7492f9
KH
9146 val = args[coding_arg_plist];
9147 CHECK_LIST (val);
9148 CODING_ATTR_PLIST (attrs) = val;
3a73fa5d 9149
df7492f9
KH
9150 if (EQ (coding_type, Qcharset))
9151 {
c7c66a95
KH
9152 /* Generate a lisp vector of 256 elements. Each element is nil,
9153 integer, or a list of charset IDs.
3a73fa5d 9154
c7c66a95
KH
9155 If Nth element is nil, the byte code N is invalid in this
9156 coding system.
4ed46869 9157
c7c66a95
KH
9158 If Nth element is a number NUM, N is the first byte of a
9159 charset whose ID is NUM.
4ed46869 9160
c7c66a95
KH
9161 If Nth element is a list of charset IDs, N is the first byte
9162 of one of them. The list is sorted by dimensions of the
2bc515e4 9163 charsets. A charset of smaller dimension comes first. */
df7492f9 9164 val = Fmake_vector (make_number (256), Qnil);
4ed46869 9165
5c99c2e6 9166 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
df7492f9 9167 {
c7c66a95
KH
9168 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9169 int dim = CHARSET_DIMENSION (charset);
9170 int idx = (dim - 1) * 4;
4ed46869 9171
5c99c2e6 9172 if (CHARSET_ASCII_COMPATIBLE_P (charset))
584948ac 9173 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
4031e2bf 9174
15d143f7
KH
9175 for (i = charset->code_space[idx];
9176 i <= charset->code_space[idx + 1]; i++)
9177 {
c7c66a95
KH
9178 Lisp_Object tmp, tmp2;
9179 int dim2;
ec6d2bb8 9180
c7c66a95
KH
9181 tmp = AREF (val, i);
9182 if (NILP (tmp))
9183 tmp = XCAR (tail);
9184 else if (NUMBERP (tmp))
9185 {
9186 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9187 if (dim < dim2)
c7c66a95 9188 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
f9d71dcd
KH
9189 else
9190 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
c7c66a95 9191 }
15d143f7 9192 else
c7c66a95
KH
9193 {
9194 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9195 {
9196 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9197 if (dim < dim2)
9198 break;
9199 }
9200 if (NILP (tmp2))
9201 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9202 else
9203 {
9204 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9205 XSETCAR (tmp2, XCAR (tail));
9206 }
9207 }
9208 ASET (val, i, tmp);
15d143f7 9209 }
df7492f9
KH
9210 }
9211 ASET (attrs, coding_attr_charset_valids, val);
9212 category = coding_category_charset;
9213 }
9214 else if (EQ (coding_type, Qccl))
9215 {
9216 Lisp_Object valids;
ecec61c1 9217
df7492f9
KH
9218 if (nargs < coding_arg_ccl_max)
9219 goto short_args;
ecec61c1 9220
df7492f9
KH
9221 val = args[coding_arg_ccl_decoder];
9222 CHECK_CCL_PROGRAM (val);
9223 if (VECTORP (val))
9224 val = Fcopy_sequence (val);
9225 ASET (attrs, coding_attr_ccl_decoder, val);
ecec61c1 9226
df7492f9
KH
9227 val = args[coding_arg_ccl_encoder];
9228 CHECK_CCL_PROGRAM (val);
9229 if (VECTORP (val))
9230 val = Fcopy_sequence (val);
9231 ASET (attrs, coding_attr_ccl_encoder, val);
ecec61c1 9232
df7492f9
KH
9233 val = args[coding_arg_ccl_valids];
9234 valids = Fmake_string (make_number (256), make_number (0));
9235 for (tail = val; !NILP (tail); tail = Fcdr (tail))
9236 {
8dcbea82 9237 int from, to;
ecec61c1 9238
df7492f9
KH
9239 val = Fcar (tail);
9240 if (INTEGERP (val))
8dcbea82
KH
9241 {
9242 from = to = XINT (val);
9243 if (from < 0 || from > 255)
9244 args_out_of_range_3 (val, make_number (0), make_number (255));
9245 }
df7492f9
KH
9246 else
9247 {
df7492f9 9248 CHECK_CONS (val);
8f924df7
KH
9249 CHECK_NATNUM_CAR (val);
9250 CHECK_NATNUM_CDR (val);
df7492f9 9251 from = XINT (XCAR (val));
8f924df7 9252 if (from > 255)
8dcbea82
KH
9253 args_out_of_range_3 (XCAR (val),
9254 make_number (0), make_number (255));
df7492f9 9255 to = XINT (XCDR (val));
8dcbea82
KH
9256 if (to < from || to > 255)
9257 args_out_of_range_3 (XCDR (val),
9258 XCAR (val), make_number (255));
df7492f9 9259 }
8dcbea82 9260 for (i = from; i <= to; i++)
8f924df7 9261 SSET (valids, i, 1);
df7492f9
KH
9262 }
9263 ASET (attrs, coding_attr_ccl_valids, valids);
4ed46869 9264
df7492f9 9265 category = coding_category_ccl;
55ab7be3 9266 }
df7492f9 9267 else if (EQ (coding_type, Qutf_16))
55ab7be3 9268 {
df7492f9 9269 Lisp_Object bom, endian;
4ed46869 9270
584948ac 9271 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
4ed46869 9272
df7492f9
KH
9273 if (nargs < coding_arg_utf16_max)
9274 goto short_args;
4ed46869 9275
df7492f9
KH
9276 bom = args[coding_arg_utf16_bom];
9277 if (! NILP (bom) && ! EQ (bom, Qt))
9278 {
9279 CHECK_CONS (bom);
8f924df7
KH
9280 val = XCAR (bom);
9281 CHECK_CODING_SYSTEM (val);
9282 val = XCDR (bom);
9283 CHECK_CODING_SYSTEM (val);
df7492f9 9284 }
a470d443 9285 ASET (attrs, coding_attr_utf_bom, bom);
df7492f9
KH
9286
9287 endian = args[coding_arg_utf16_endian];
b49a1807
KH
9288 CHECK_SYMBOL (endian);
9289 if (NILP (endian))
9290 endian = Qbig;
9291 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8f924df7 9292 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
df7492f9
KH
9293 ASET (attrs, coding_attr_utf_16_endian, endian);
9294
9295 category = (CONSP (bom)
9296 ? coding_category_utf_16_auto
9297 : NILP (bom)
b49a1807 9298 ? (EQ (endian, Qbig)
df7492f9
KH
9299 ? coding_category_utf_16_be_nosig
9300 : coding_category_utf_16_le_nosig)
b49a1807 9301 : (EQ (endian, Qbig)
df7492f9
KH
9302 ? coding_category_utf_16_be
9303 : coding_category_utf_16_le));
9304 }
9305 else if (EQ (coding_type, Qiso_2022))
9306 {
9307 Lisp_Object initial, reg_usage, request, flags;
4776e638 9308 int i;
1397dc18 9309
df7492f9
KH
9310 if (nargs < coding_arg_iso2022_max)
9311 goto short_args;
9312
9313 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9314 CHECK_VECTOR (initial);
9315 for (i = 0; i < 4; i++)
9316 {
9317 val = Faref (initial, make_number (i));
9318 if (! NILP (val))
9319 {
584948ac
KH
9320 struct charset *charset;
9321
9322 CHECK_CHARSET_GET_CHARSET (val, charset);
9323 ASET (initial, i, make_number (CHARSET_ID (charset)));
9324 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9325 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9
KH
9326 }
9327 else
9328 ASET (initial, i, make_number (-1));
9329 }
9330
9331 reg_usage = args[coding_arg_iso2022_reg_usage];
9332 CHECK_CONS (reg_usage);
8f924df7
KH
9333 CHECK_NUMBER_CAR (reg_usage);
9334 CHECK_NUMBER_CDR (reg_usage);
df7492f9
KH
9335
9336 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9337 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
1397dc18 9338 {
df7492f9 9339 int id;
8f924df7 9340 Lisp_Object tmp;
df7492f9
KH
9341
9342 val = Fcar (tail);
9343 CHECK_CONS (val);
8f924df7
KH
9344 tmp = XCAR (val);
9345 CHECK_CHARSET_GET_ID (tmp, id);
9346 CHECK_NATNUM_CDR (val);
df7492f9
KH
9347 if (XINT (XCDR (val)) >= 4)
9348 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8f924df7 9349 XSETCAR (val, make_number (id));
1397dc18 9350 }
4ed46869 9351
df7492f9
KH
9352 flags = args[coding_arg_iso2022_flags];
9353 CHECK_NATNUM (flags);
9354 i = XINT (flags);
9355 if (EQ (args[coding_arg_charset_list], Qiso_2022))
9356 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
9357
9358 ASET (attrs, coding_attr_iso_initial, initial);
9359 ASET (attrs, coding_attr_iso_usage, reg_usage);
9360 ASET (attrs, coding_attr_iso_request, request);
9361 ASET (attrs, coding_attr_iso_flags, flags);
9362 setup_iso_safe_charsets (attrs);
9363
9364 if (i & CODING_ISO_FLAG_SEVEN_BITS)
9365 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9366 | CODING_ISO_FLAG_SINGLE_SHIFT))
9367 ? coding_category_iso_7_else
9368 : EQ (args[coding_arg_charset_list], Qiso_2022)
9369 ? coding_category_iso_7
9370 : coding_category_iso_7_tight);
9371 else
9372 {
9373 int id = XINT (AREF (initial, 1));
9374
c6fb6e98 9375 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
df7492f9
KH
9376 || EQ (args[coding_arg_charset_list], Qiso_2022)
9377 || id < 0)
9378 ? coding_category_iso_8_else
9379 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9380 ? coding_category_iso_8_1
9381 : coding_category_iso_8_2);
9382 }
0ce7886f
KH
9383 if (category != coding_category_iso_8_1
9384 && category != coding_category_iso_8_2)
9385 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
df7492f9
KH
9386 }
9387 else if (EQ (coding_type, Qemacs_mule))
c28a9453 9388 {
df7492f9
KH
9389 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9390 ASET (attrs, coding_attr_emacs_mule_full, Qt);
584948ac 9391 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9 9392 category = coding_category_emacs_mule;
c28a9453 9393 }
df7492f9 9394 else if (EQ (coding_type, Qshift_jis))
c28a9453 9395 {
df7492f9
KH
9396
9397 struct charset *charset;
9398
7d64c6ad 9399 if (XINT (Flength (charset_list)) != 3
6e07c25f 9400 && XINT (Flength (charset_list)) != 4)
7d64c6ad 9401 error ("There should be three or four charsets");
df7492f9
KH
9402
9403 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9404 if (CHARSET_DIMENSION (charset) != 1)
9405 error ("Dimension of charset %s is not one",
8f924df7 9406 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
584948ac
KH
9407 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9408 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9
KH
9409
9410 charset_list = XCDR (charset_list);
9411 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9412 if (CHARSET_DIMENSION (charset) != 1)
9413 error ("Dimension of charset %s is not one",
8f924df7 9414 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
df7492f9
KH
9415
9416 charset_list = XCDR (charset_list);
9417 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9418 if (CHARSET_DIMENSION (charset) != 2)
7d64c6ad
KH
9419 error ("Dimension of charset %s is not two",
9420 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9421
9422 charset_list = XCDR (charset_list);
2b917a06
KH
9423 if (! NILP (charset_list))
9424 {
9425 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9426 if (CHARSET_DIMENSION (charset) != 2)
9427 error ("Dimension of charset %s is not two",
9428 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9429 }
df7492f9
KH
9430
9431 category = coding_category_sjis;
9432 Vsjis_coding_system = name;
c28a9453 9433 }
df7492f9
KH
9434 else if (EQ (coding_type, Qbig5))
9435 {
9436 struct charset *charset;
4ed46869 9437
df7492f9
KH
9438 if (XINT (Flength (charset_list)) != 2)
9439 error ("There should be just two charsets");
9440
9441 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9442 if (CHARSET_DIMENSION (charset) != 1)
9443 error ("Dimension of charset %s is not one",
8f924df7 9444 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
584948ac
KH
9445 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9446 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9
KH
9447
9448 charset_list = XCDR (charset_list);
9449 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9450 if (CHARSET_DIMENSION (charset) != 2)
9451 error ("Dimension of charset %s is not two",
8f924df7 9452 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
4ed46869 9453
df7492f9
KH
9454 category = coding_category_big5;
9455 Vbig5_coding_system = name;
9456 }
9457 else if (EQ (coding_type, Qraw_text))
c28a9453 9458 {
584948ac
KH
9459 category = coding_category_raw_text;
9460 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
c28a9453 9461 }
df7492f9 9462 else if (EQ (coding_type, Qutf_8))
4ed46869 9463 {
a470d443
KH
9464 Lisp_Object bom;
9465
584948ac 9466 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
a470d443
KH
9467
9468 if (nargs < coding_arg_utf8_max)
9469 goto short_args;
9470
9471 bom = args[coding_arg_utf8_bom];
9472 if (! NILP (bom) && ! EQ (bom, Qt))
9473 {
9474 CHECK_CONS (bom);
9475 val = XCAR (bom);
9476 CHECK_CODING_SYSTEM (val);
9477 val = XCDR (bom);
9478 CHECK_CODING_SYSTEM (val);
9479 }
9480 ASET (attrs, coding_attr_utf_bom, bom);
9481
9482 category = (CONSP (bom) ? coding_category_utf_8_auto
9483 : NILP (bom) ? coding_category_utf_8_nosig
9484 : coding_category_utf_8_sig);
4ed46869 9485 }
df7492f9
KH
9486 else if (EQ (coding_type, Qundecided))
9487 category = coding_category_undecided;
4ed46869 9488 else
df7492f9 9489 error ("Invalid coding system type: %s",
8f924df7 9490 SDATA (SYMBOL_NAME (coding_type)));
4ed46869 9491
df7492f9 9492 CODING_ATTR_CATEGORY (attrs) = make_number (category);
01378f49
KH
9493 CODING_ATTR_PLIST (attrs)
9494 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
9495 CODING_ATTR_PLIST (attrs)));
35befdaa 9496 CODING_ATTR_PLIST (attrs)
3ed051d4 9497 = Fcons (QCascii_compatible_p,
35befdaa
KH
9498 Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9499 CODING_ATTR_PLIST (attrs)));
c4825358 9500
df7492f9
KH
9501 eol_type = args[coding_arg_eol_type];
9502 if (! NILP (eol_type)
9503 && ! EQ (eol_type, Qunix)
9504 && ! EQ (eol_type, Qdos)
9505 && ! EQ (eol_type, Qmac))
9506 error ("Invalid eol-type");
4ed46869 9507
df7492f9 9508 aliases = Fcons (name, Qnil);
4ed46869 9509
df7492f9
KH
9510 if (NILP (eol_type))
9511 {
9512 eol_type = make_subsidiaries (name);
9513 for (i = 0; i < 3; i++)
1397dc18 9514 {
df7492f9
KH
9515 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
9516
9517 this_name = AREF (eol_type, i);
9518 this_aliases = Fcons (this_name, Qnil);
9519 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
9520 this_spec = Fmake_vector (make_number (3), attrs);
9521 ASET (this_spec, 1, this_aliases);
9522 ASET (this_spec, 2, this_eol_type);
9523 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
9524 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
583f71ca
KH
9525 val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
9526 if (NILP (val))
9527 Vcoding_system_alist
9528 = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
9529 Vcoding_system_alist);
1397dc18 9530 }
d46c5b12 9531 }
4ed46869 9532
df7492f9
KH
9533 spec_vec = Fmake_vector (make_number (3), attrs);
9534 ASET (spec_vec, 1, aliases);
9535 ASET (spec_vec, 2, eol_type);
48b0f3ae 9536
df7492f9
KH
9537 Fputhash (name, spec_vec, Vcoding_system_hash_table);
9538 Vcoding_system_list = Fcons (name, Vcoding_system_list);
583f71ca
KH
9539 val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
9540 if (NILP (val))
9541 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
9542 Vcoding_system_alist);
48b0f3ae 9543
df7492f9
KH
9544 {
9545 int id = coding_categories[category].id;
48b0f3ae 9546
df7492f9
KH
9547 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
9548 setup_coding_system (name, &coding_categories[category]);
9549 }
48b0f3ae 9550
d46c5b12 9551 return Qnil;
48b0f3ae 9552
df7492f9
KH
9553 short_args:
9554 return Fsignal (Qwrong_number_of_arguments,
9555 Fcons (intern ("define-coding-system-internal"),
9556 make_number (nargs)));
d46c5b12 9557}
4ed46869 9558
d6925f38 9559
a6f87d34
KH
9560DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
9561 3, 3, 0,
9562 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
9563 (coding_system, prop, val)
9564 Lisp_Object coding_system, prop, val;
9565{
3dbe7859 9566 Lisp_Object spec, attrs;
a6f87d34
KH
9567
/* Element 0 of the spec is the attribute vector that the
   CODING_ATTR_* accessors below operate on. */
9568 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9569 attrs = AREF (spec, 0);
/* For each property recognized below, VAL is type-checked and then
   mirrored into the matching attribute slot. A PROP that matches
   none of the branches is only recorded in the plist at the end. */
9570 if (EQ (prop, QCmnemonic))
9571 {
/* A mnemonic is either a string or a character. */
9572 if (! STRINGP (val))
9573 CHECK_CHARACTER (val);
9574 CODING_ATTR_MNEMONIC (attrs) = val;
9575 }
/* QCdefalut_char (sic) is the symbol `:default-char'; see its DEFSYM
   in syms_of_coding. */
9576 else if (EQ (prop, QCdefalut_char))
9577 {
/* nil selects the space character. VAL itself is replaced, so the
   plist entry and the return value are the space character too. */
9578 if (NILP (val))
9579 val = make_number (' ');
9580 else
9581 CHECK_CHARACTER (val);
9582 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9583 }
9584 else if (EQ (prop, QCdecode_translation_table))
9585 {
/* Accept a char-table or a cons as is; anything else must be a
   symbol. */
9586 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9587 CHECK_SYMBOL (val);
9588 CODING_ATTR_DECODE_TBL (attrs) = val;
9589 }
9590 else if (EQ (prop, QCencode_translation_table))
9591 {
/* Same acceptance rule as for the decode translation table. */
9592 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9593 CHECK_SYMBOL (val);
9594 CODING_ATTR_ENCODE_TBL (attrs) = val;
9595 }
9596 else if (EQ (prop, QCpost_read_conversion))
9597 {
9598 CHECK_SYMBOL (val);
9599 CODING_ATTR_POST_READ (attrs) = val;
9600 }
9601 else if (EQ (prop, QCpre_write_conversion))
9602 {
9603 CHECK_SYMBOL (val);
9604 CODING_ATTR_PRE_WRITE (attrs) = val;
9605 }
35befdaa
KH
9606 else if (EQ (prop, QCascii_compatible_p))
9607 {
/* Stored as given; no type check is applied to VAL here. */
9608 CODING_ATTR_ASCII_COMPAT (attrs) = val;
9609 }
a6f87d34
KH
9610
/* Whatever PROP is, record PROP/VAL in the plist as well. */
9611 CODING_ATTR_PLIST (attrs)
9612 = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
9613 return val;
9614}
9615
9616
df7492f9
KH
9617DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
9618 Sdefine_coding_system_alias, 2, 2, 0,
9619 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
9620 (alias, coding_system)
9621 Lisp_Object alias, coding_system;
66cfb530 9622{
583f71ca 9623 Lisp_Object spec, aliases, eol_type, val;
4ed46869 9624
df7492f9
KH
9625 CHECK_SYMBOL (alias);
9626 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9627 aliases = AREF (spec, 1);
d4a1d553 9628 /* ALIASES should be a list of length more than zero, and the first
d6925f38
KH
9629 element is a base coding system. Append ALIAS at the tail of the
9630 list. */
df7492f9
KH
9631 while (!NILP (XCDR (aliases)))
9632 aliases = XCDR (aliases);
8f924df7 9633 XSETCDR (aliases, Fcons (alias, Qnil));
4ed46869 9634
/* Element 2 of the spec is the eol-type. When it is a vector, build
   the three subsidiary names for ALIAS and define each one, by a
   recursive call, as an alias of the corresponding element of that
   vector. */
df7492f9
KH
9635 eol_type = AREF (spec, 2);
9636 if (VECTORP (eol_type))
4ed46869 9637 {
df7492f9
KH
9638 Lisp_Object subsidiaries;
9639 int i;
4ed46869 9640
df7492f9
KH
9641 subsidiaries = make_subsidiaries (alias);
9642 for (i = 0; i < 3; i++)
9643 Fdefine_coding_system_alias (AREF (subsidiaries, i),
9644 AREF (eol_type, i));
4ed46869 9645 }
/* Register ALIAS in the hash table under the very same spec object as
   CODING-SYSTEM, and push it onto Vcoding_system_list. The alist
   gets a new entry only when ALIAS's name is not already in it. */
df7492f9
KH
9646
9647 Fputhash (alias, spec, Vcoding_system_hash_table);
d6925f38 9648 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
583f71ca
KH
9649 val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
9650 if (NILP (val))
9651 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
9652 Vcoding_system_alist);
66cfb530 9653
4ed46869
KH
9654 return Qnil;
9655}
9656
df7492f9
KH
9657DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
9658 1, 1, 0,
9659 doc: /* Return the base of CODING-SYSTEM.
da7db224 9660Any alias or subsidiary coding system is not a base coding system. */)
df7492f9
KH
9661 (coding_system)
9662 Lisp_Object coding_system;
d46c5b12 9663{
df7492f9 9664 Lisp_Object spec, attrs;
d46c5b12 9665
/* nil is answered with `no-conversion' directly, without looking up
   any spec. */
df7492f9
KH
9666 if (NILP (coding_system))
9667 return (Qno_conversion);
9668 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
/* The base name is read from the attribute vector, which is element 0
   of the spec. */
9669 attrs = AREF (spec, 0);
9670 return CODING_ATTR_BASE_NAME (attrs);
9671}
1397dc18 9672
df7492f9
KH
9673DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
9674 1, 1, 0,
9675 doc: "Return the property list of CODING-SYSTEM.")
9676 (coding_system)
9677 Lisp_Object coding_system;
9678{
9679 Lisp_Object spec, attrs;
1397dc18 9680
/* NOTE(review): the doc above is written as a string literal, while
   the other DEFUNs in this part of the file use a comment after
   `doc:'. Confirm that the doc extraction step accepts both forms. */
/* nil stands for `no-conversion'. */
df7492f9
KH
9681 if (NILP (coding_system))
9682 coding_system = Qno_conversion;
9683 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
/* The plist is read from the attribute vector (element 0 of the spec)
   and returned as is; no copy is made here. */
9684 attrs = AREF (spec, 0);
9685 return CODING_ATTR_PLIST (attrs);
d46c5b12
KH
9686}
9687
df7492f9
KH
9688
9689DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
9690 1, 1, 0,
da7db224 9691 doc: /* Return the list of aliases of CODING-SYSTEM. */)
df7492f9
KH
9692 (coding_system)
9693 Lisp_Object coding_system;
66cfb530 9694{
df7492f9 9695 Lisp_Object spec;
84d60297 9696
/* nil stands for `no-conversion'. */
df7492f9
KH
9697 if (NILP (coding_system))
9698 coding_system = Qno_conversion;
9699 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
/* Element 1 of the spec is the alias list. It is returned as stored,
   with no copy made here; Fdefine_coding_system_alias appends to this
   same list destructively. */
da7db224 9700 return AREF (spec, 1);
df7492f9 9701}
66cfb530 9702
df7492f9
KH
9703DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
9704 Scoding_system_eol_type, 1, 1, 0,
9705 doc: /* Return eol-type of CODING-SYSTEM.
d4a1d553 9706An eol-type is an integer 0, 1, 2, or a vector of coding systems.
66cfb530 9707
df7492f9
KH
9708Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
9709and CR respectively.
66cfb530 9710
df7492f9
KH
9711A vector value indicates that a format of end-of-line should be
9712detected automatically. Nth element of the vector is the subsidiary
9713coding system whose eol-type is N. */)
6b89e3aa
KH
9714 (coding_system)
9715 Lisp_Object coding_system;
9716{
df7492f9
KH
9717 Lisp_Object spec, eol_type;
9718 int n;
6b89e3aa 9719
/* nil stands for `no-conversion'. Anything that is not a coding
   system yields nil. */
df7492f9
KH
9720 if (NILP (coding_system))
9721 coding_system = Qno_conversion;
9722 if (! CODING_SYSTEM_P (coding_system))
9723 return Qnil;
9724 spec = CODING_SYSTEM_SPEC (coding_system);
/* Element 2 of the spec is the eol-type: either a vector, of which a
   fresh copy is returned, or a symbol that is mapped to an integer:
   `unix' to 0, `dos' to 1, anything else to 2. */
9725 eol_type = AREF (spec, 2);
9726 if (VECTORP (eol_type))
9727 return Fcopy_sequence (eol_type);
9728 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
9729 return make_number (n);
6b89e3aa
KH
9730}
9731
4ed46869
KH
9732#endif /* emacs */
9733
9734\f
1397dc18 9735/*** 12. Post-amble ***/
4ed46869 9736
/* Fill in the static tables of this file: the per-category entries and
   priority order, the ISO 2022 byte classification table, and the
   emacs-mule per-leading-byte table. */
dfcf069d 9737void
4ed46869
KH
9738init_coding_once ()
9739{
9740 int i;
9741
/* Give every category the id -1 and set up the priorities in
   identity order. */
df7492f9
KH
9742 for (i = 0; i < coding_category_max; i++)
9743 {
9744 coding_categories[i].id = -1;
9745 coding_priorities[i] = i;
9746 }
4ed46869
KH
9747
9748 /* ISO2022 specific initialize routine. */
/* The four range loops skip 0x20, 0x7F, 0xA0 and 0xFF; those bytes
   get their own classes right after the loops. */
9749 for (i = 0; i < 0x20; i++)
b73bfc1c 9750 iso_code_class[i] = ISO_control_0;
4ed46869
KH
9751 for (i = 0x21; i < 0x7F; i++)
9752 iso_code_class[i] = ISO_graphic_plane_0;
9753 for (i = 0x80; i < 0xA0; i++)
b73bfc1c 9754 iso_code_class[i] = ISO_control_1;
4ed46869
KH
9755 for (i = 0xA1; i < 0xFF; i++)
9756 iso_code_class[i] = ISO_graphic_plane_1;
9757 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
9758 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
/* These individual assignments come after the range loops, so they
   replace whatever class the loops gave to those codes. */
4ed46869
KH
9759 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
9760 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
9761 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
9762 iso_code_class[ISO_CODE_ESC] = ISO_escape;
9763 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
9764 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
9765 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
9766
/* Default every entry to 1, then override the four private leading
   codes with 3 or 4. Presumably the value is the byte length of a
   sequence starting with that byte -- TODO confirm against the users
   of emacs_mule_bytes. */
df7492f9
KH
9767 for (i = 0; i < 256; i++)
9768 {
9769 emacs_mule_bytes[i] = 1;
9770 }
7c78e542
KH
9771 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
9772 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
9773 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
9774 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
e0e989f6
KH
9775}
9776
9777#ifdef emacs
9778
dfcf069d 9779void
e0e989f6
KH
9780syms_of_coding ()
9781{
df7492f9 9782 staticpro (&Vcoding_system_hash_table);
8f924df7
KH
9783 {
9784 Lisp_Object args[2];
9785 args[0] = QCtest;
9786 args[1] = Qeq;
9787 Vcoding_system_hash_table = Fmake_hash_table (2, args);
9788 }
df7492f9
KH
9789
9790 staticpro (&Vsjis_coding_system);
9791 Vsjis_coding_system = Qnil;
e0e989f6 9792
df7492f9
KH
9793 staticpro (&Vbig5_coding_system);
9794 Vbig5_coding_system = Qnil;
9795
24a73b0a
KH
9796 staticpro (&Vcode_conversion_reused_workbuf);
9797 Vcode_conversion_reused_workbuf = Qnil;
9798
9799 staticpro (&Vcode_conversion_workbuf_name);
9800 Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
e0e989f6 9801
24a73b0a 9802 reused_workbuf_in_use = 0;
df7492f9
KH
9803
9804 DEFSYM (Qcharset, "charset");
9805 DEFSYM (Qtarget_idx, "target-idx");
9806 DEFSYM (Qcoding_system_history, "coding-system-history");
bb0115a2
RS
9807 Fset (Qcoding_system_history, Qnil);
9808
9ce27fde 9809 /* Target FILENAME is the first argument. */
e0e989f6 9810 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 9811 /* Target FILENAME is the third argument. */
e0e989f6
KH
9812 Fput (Qwrite_region, Qtarget_idx, make_number (2));
9813
df7492f9 9814 DEFSYM (Qcall_process, "call-process");
9ce27fde 9815 /* Target PROGRAM is the first argument. */
e0e989f6
KH
9816 Fput (Qcall_process, Qtarget_idx, make_number (0));
9817
df7492f9 9818 DEFSYM (Qcall_process_region, "call-process-region");
9ce27fde 9819 /* Target PROGRAM is the third argument. */
e0e989f6
KH
9820 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
9821
df7492f9 9822 DEFSYM (Qstart_process, "start-process");
9ce27fde 9823 /* Target PROGRAM is the third argument. */
e0e989f6
KH
9824 Fput (Qstart_process, Qtarget_idx, make_number (2));
9825
df7492f9 9826 DEFSYM (Qopen_network_stream, "open-network-stream");
9ce27fde 9827 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
9828 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
9829
df7492f9
KH
9830 DEFSYM (Qcoding_system, "coding-system");
9831 DEFSYM (Qcoding_aliases, "coding-aliases");
4ed46869 9832
df7492f9
KH
9833 DEFSYM (Qeol_type, "eol-type");
9834 DEFSYM (Qunix, "unix");
9835 DEFSYM (Qdos, "dos");
4ed46869 9836
df7492f9
KH
9837 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
9838 DEFSYM (Qpost_read_conversion, "post-read-conversion");
9839 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
9840 DEFSYM (Qdefault_char, "default-char");
9841 DEFSYM (Qundecided, "undecided");
9842 DEFSYM (Qno_conversion, "no-conversion");
9843 DEFSYM (Qraw_text, "raw-text");
4ed46869 9844
df7492f9 9845 DEFSYM (Qiso_2022, "iso-2022");
4ed46869 9846
df7492f9 9847 DEFSYM (Qutf_8, "utf-8");
8f924df7 9848 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
27901516 9849
df7492f9 9850 DEFSYM (Qutf_16, "utf-16");
df7492f9
KH
9851 DEFSYM (Qbig, "big");
9852 DEFSYM (Qlittle, "little");
27901516 9853
df7492f9
KH
9854 DEFSYM (Qshift_jis, "shift-jis");
9855 DEFSYM (Qbig5, "big5");
4ed46869 9856
df7492f9 9857 DEFSYM (Qcoding_system_p, "coding-system-p");
4ed46869 9858
df7492f9 9859 DEFSYM (Qcoding_system_error, "coding-system-error");
4ed46869
KH
9860 Fput (Qcoding_system_error, Qerror_conditions,
9861 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
9862 Fput (Qcoding_system_error, Qerror_message,
9ce27fde 9863 build_string ("Invalid coding system"));
4ed46869 9864
05e6f5dc
KH
9865 /* Intern this now in case it isn't already done.
9866 Setting this variable twice is harmless.
9867 But don't staticpro it here--that is done in alloc.c. */
9868 Qchar_table_extra_slots = intern ("char-table-extra-slots");
70c22245 9869
df7492f9 9870 DEFSYM (Qtranslation_table, "translation-table");
433f7f87 9871 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
df7492f9
KH
9872 DEFSYM (Qtranslation_table_id, "translation-table-id");
9873 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
9874 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
1397dc18 9875
df7492f9 9876 DEFSYM (Qvalid_codes, "valid-codes");
9ce27fde 9877
df7492f9 9878 DEFSYM (Qemacs_mule, "emacs-mule");
d46c5b12 9879
01378f49 9880 DEFSYM (QCcategory, ":category");
a6f87d34
KH
9881 DEFSYM (QCmnemonic, ":mnemonic");
9882 DEFSYM (QCdefalut_char, ":default-char");
9883 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
9884 DEFSYM (QCencode_translation_table, ":encode-translation-table");
9885 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
9886 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
35befdaa 9887 DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
01378f49 9888
df7492f9
KH
9889 Vcoding_category_table
9890 = Fmake_vector (make_number (coding_category_max), Qnil);
9891 staticpro (&Vcoding_category_table);
9892 /* The following are targets of code detection. */
9893 ASET (Vcoding_category_table, coding_category_iso_7,
9894 intern ("coding-category-iso-7"));
9895 ASET (Vcoding_category_table, coding_category_iso_7_tight,
9896 intern ("coding-category-iso-7-tight"));
9897 ASET (Vcoding_category_table, coding_category_iso_8_1,
9898 intern ("coding-category-iso-8-1"));
9899 ASET (Vcoding_category_table, coding_category_iso_8_2,
9900 intern ("coding-category-iso-8-2"));
9901 ASET (Vcoding_category_table, coding_category_iso_7_else,
9902 intern ("coding-category-iso-7-else"));
9903 ASET (Vcoding_category_table, coding_category_iso_8_else,
9904 intern ("coding-category-iso-8-else"));
a470d443
KH
9905 ASET (Vcoding_category_table, coding_category_utf_8_auto,
9906 intern ("coding-category-utf-8-auto"));
9907 ASET (Vcoding_category_table, coding_category_utf_8_nosig,
df7492f9 9908 intern ("coding-category-utf-8"));
a470d443
KH
9909 ASET (Vcoding_category_table, coding_category_utf_8_sig,
9910 intern ("coding-category-utf-8-sig"));
df7492f9
KH
9911 ASET (Vcoding_category_table, coding_category_utf_16_be,
9912 intern ("coding-category-utf-16-be"));
ff563fce
KH
9913 ASET (Vcoding_category_table, coding_category_utf_16_auto,
9914 intern ("coding-category-utf-16-auto"));
df7492f9
KH
9915 ASET (Vcoding_category_table, coding_category_utf_16_le,
9916 intern ("coding-category-utf-16-le"));
9917 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
9918 intern ("coding-category-utf-16-be-nosig"));
9919 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
9920 intern ("coding-category-utf-16-le-nosig"));
9921 ASET (Vcoding_category_table, coding_category_charset,
9922 intern ("coding-category-charset"));
9923 ASET (Vcoding_category_table, coding_category_sjis,
9924 intern ("coding-category-sjis"));
9925 ASET (Vcoding_category_table, coding_category_big5,
9926 intern ("coding-category-big5"));
9927 ASET (Vcoding_category_table, coding_category_ccl,
9928 intern ("coding-category-ccl"));
9929 ASET (Vcoding_category_table, coding_category_emacs_mule,
9930 intern ("coding-category-emacs-mule"));
9931 /* The following are NOT targets of code detection. */
9932 ASET (Vcoding_category_table, coding_category_raw_text,
9933 intern ("coding-category-raw-text"));
9934 ASET (Vcoding_category_table, coding_category_undecided,
9935 intern ("coding-category-undecided"));
ecf488bc 9936
065e3595
KH
9937 DEFSYM (Qinsufficient_source, "insufficient-source");
9938 DEFSYM (Qinconsistent_eol, "inconsistent-eol");
9939 DEFSYM (Qinvalid_source, "invalid-source");
9940 DEFSYM (Qinterrupted, "interrupted");
9941 DEFSYM (Qinsufficient_memory, "insufficient-memory");
44e8490d 9942 DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
065e3595 9943
4ed46869
KH
9944 defsubr (&Scoding_system_p);
9945 defsubr (&Sread_coding_system);
9946 defsubr (&Sread_non_nil_coding_system);
9947 defsubr (&Scheck_coding_system);
9948 defsubr (&Sdetect_coding_region);
d46c5b12 9949 defsubr (&Sdetect_coding_string);
05e6f5dc 9950 defsubr (&Sfind_coding_systems_region_internal);
068a9dbd 9951 defsubr (&Sunencodable_char_position);
df7492f9 9952 defsubr (&Scheck_coding_systems_region);
4ed46869
KH
9953 defsubr (&Sdecode_coding_region);
9954 defsubr (&Sencode_coding_region);
9955 defsubr (&Sdecode_coding_string);
9956 defsubr (&Sencode_coding_string);
9957 defsubr (&Sdecode_sjis_char);
9958 defsubr (&Sencode_sjis_char);
9959 defsubr (&Sdecode_big5_char);
9960 defsubr (&Sencode_big5_char);
1ba9e4ab 9961 defsubr (&Sset_terminal_coding_system_internal);
c4825358 9962 defsubr (&Sset_safe_terminal_coding_system_internal);
4ed46869 9963 defsubr (&Sterminal_coding_system);
1ba9e4ab 9964 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 9965 defsubr (&Skeyboard_coding_system);
a5d301df 9966 defsubr (&Sfind_operation_coding_system);
df7492f9 9967 defsubr (&Sset_coding_system_priority);
6b89e3aa 9968 defsubr (&Sdefine_coding_system_internal);
df7492f9 9969 defsubr (&Sdefine_coding_system_alias);
a6f87d34 9970 defsubr (&Scoding_system_put);
df7492f9
KH
9971 defsubr (&Scoding_system_base);
9972 defsubr (&Scoding_system_plist);
9973 defsubr (&Scoding_system_aliases);
9974 defsubr (&Scoding_system_eol_type);
9975 defsubr (&Scoding_system_priority_list);
4ed46869 9976
4608c386 9977 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
48b0f3ae
PJ
9978 doc: /* List of coding systems.
9979
9980Do not alter the value of this variable manually. This variable should be
df7492f9 9981updated by the functions `define-coding-system' and
48b0f3ae 9982`define-coding-system-alias'. */);
4608c386
KH
9983 Vcoding_system_list = Qnil;
9984
9985 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
48b0f3ae
PJ
9986 doc: /* Alist of coding system names.
9987Each element is one element list of coding system name.
446dcd75 9988This variable is given to `completing-read' as COLLECTION argument.
48b0f3ae
PJ
9989
9990Do not alter the value of this variable manually. This variable should be
9991updated by the functions `make-coding-system' and
9992`define-coding-system-alias'. */);
4608c386
KH
9993 Vcoding_system_alist = Qnil;
9994
4ed46869 9995 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
48b0f3ae
PJ
9996 doc: /* List of coding-categories (symbols) ordered by priority.
9997
9998On detecting a coding system, Emacs tries code detection algorithms
9999associated with each coding-category one by one in this order. When
10000one algorithm agrees with a byte sequence of source text, the coding
0ec31faf
KH
10001system bound to the corresponding coding-category is selected.
10002
42205607 10003Don't modify this variable directly, but use `set-coding-priority'. */);
4ed46869
KH
10004 {
10005 int i;
10006
10007 Vcoding_category_list = Qnil;
df7492f9 10008 for (i = coding_category_max - 1; i >= 0; i--)
4ed46869 10009 Vcoding_category_list
d46c5b12
KH
10010 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
10011 Vcoding_category_list);
4ed46869
KH
10012 }
10013
10014 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
48b0f3ae
PJ
10015 doc: /* Specify the coding system for read operations.
10016It is useful to bind this variable with `let', but do not set it globally.
10017If the value is a coding system, it is used for decoding on read operation.
446dcd75
JB
10018If not, an appropriate element is used from one of the coding system alists.
10019There are three such tables: `file-coding-system-alist',
48b0f3ae 10020`process-coding-system-alist', and `network-coding-system-alist'. */);
4ed46869
KH
10021 Vcoding_system_for_read = Qnil;
10022
10023 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
48b0f3ae
PJ
10024 doc: /* Specify the coding system for write operations.
10025Programs bind this variable with `let', but you should not set it globally.
10026If the value is a coding system, it is used for encoding of output,
10027when writing it to a file and when sending it to a file or subprocess.
10028
10029If this does not specify a coding system, an appropriate element
446dcd75
JB
10030is used from one of the coding system alists.
10031There are three such tables: `file-coding-system-alist',
48b0f3ae
PJ
10032`process-coding-system-alist', and `network-coding-system-alist'.
10033For output to files, if the above procedure does not specify a coding system,
10034the value of `buffer-file-coding-system' is used. */);
4ed46869
KH
10035 Vcoding_system_for_write = Qnil;
10036
10037 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
df7492f9
KH
10038 doc: /*
10039Coding system used in the latest file or process I/O. */);
4ed46869
KH
10040 Vlast_coding_system_used = Qnil;
10041
065e3595
KH
10042 DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
10043 doc: /*
10044Error status of the last code conversion.
10045
10046When an error was detected in the last code conversion, this variable
10047is set to one of the following symbols.
10048 `insufficient-source'
10049 `inconsistent-eol'
10050 `invalid-source'
10051 `interrupted'
10052 `insufficient-memory'
10053When no error was detected, the value doesn't change. So, to check
10054the error status of a code conversion by this variable, you must
10055explicitly set this variable to nil before performing code
10056conversion. */);
10057 Vlast_code_conversion_error = Qnil;
10058
9ce27fde 10059 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
df7492f9
KH
10060 doc: /*
10061*Non-nil means always inhibit code conversion of end-of-line format.
48b0f3ae
PJ
10062See info node `Coding Systems' and info node `Text and Binary' concerning
10063such conversion. */);
9ce27fde
KH
10064 inhibit_eol_conversion = 0;
10065
ed29121d 10066 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
df7492f9
KH
10067 doc: /*
10068Non-nil means process buffer inherits coding system of process output.
48b0f3ae
PJ
10069Bind it to t if the process output is to be treated as if it were a file
10070read from some filesystem. */);
ed29121d
EZ
10071 inherit_process_coding_system = 0;
10072
02ba4723 10073 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
df7492f9
KH
10074 doc: /*
10075Alist to decide a coding system to use for a file I/O operation.
48b0f3ae
PJ
10076The format is ((PATTERN . VAL) ...),
10077where PATTERN is a regular expression matching a file name,
10078VAL is a coding system, a cons of coding systems, or a function symbol.
10079If VAL is a coding system, it is used for both decoding and encoding
10080the file contents.
10081If VAL is a cons of coding systems, the car part is used for decoding,
10082and the cdr part is used for encoding.
10083If VAL is a function symbol, the function must return a coding system
2c53e699
KH
10084or a cons of coding systems which are used as above. The function is
10085called with an argument that is a list of the arguments with which
5a0bbd9a
KH
10086`find-operation-coding-system' was called. If the function can't decide
10087a coding system, it can return `undecided' so that the normal
10088code-detection is performed.
48b0f3ae
PJ
10089
10090See also the function `find-operation-coding-system'
10091and the variable `auto-coding-alist'. */);
02ba4723
KH
10092 Vfile_coding_system_alist = Qnil;
10093
10094 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
df7492f9
KH
10095 doc: /*
10096Alist to decide a coding system to use for a process I/O operation.
48b0f3ae
PJ
10097The format is ((PATTERN . VAL) ...),
10098where PATTERN is a regular expression matching a program name,
10099VAL is a coding system, a cons of coding systems, or a function symbol.
10100If VAL is a coding system, it is used for both decoding what received
10101from the program and encoding what sent to the program.
10102If VAL is a cons of coding systems, the car part is used for decoding,
10103and the cdr part is used for encoding.
10104If VAL is a function symbol, the function must return a coding system
10105or a cons of coding systems which are used as above.
10106
10107See also the function `find-operation-coding-system'. */);
02ba4723
KH
10108 Vprocess_coding_system_alist = Qnil;
10109
10110 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
df7492f9
KH
10111 doc: /*
10112Alist to decide a coding system to use for a network I/O operation.
48b0f3ae
PJ
10113The format is ((PATTERN . VAL) ...),
10114where PATTERN is a regular expression matching a network service name
10115or is a port number to connect to,
10116VAL is a coding system, a cons of coding systems, or a function symbol.
10117If VAL is a coding system, it is used for both decoding what received
10118from the network stream and encoding what sent to the network stream.
10119If VAL is a cons of coding systems, the car part is used for decoding,
10120and the cdr part is used for encoding.
10121If VAL is a function symbol, the function must return a coding system
10122or a cons of coding systems which are used as above.
10123
10124See also the function `find-operation-coding-system'. */);
02ba4723 10125 Vnetwork_coding_system_alist = Qnil;
4ed46869 10126
68c45bf0 10127 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
75205970
RS
10128 doc: /* Coding system to use with system messages.
10129Also used for decoding keyboard input on X Window system. */);
68c45bf0
PE
10130 Vlocale_coding_system = Qnil;
10131
005f0d35 10132 /* The eol mnemonics are reset in startup.el system-dependently. */
7722baf9 10133 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
df7492f9
KH
10134 doc: /*
10135*String displayed in mode line for UNIX-like (LF) end-of-line format. */);
7722baf9 10136 eol_mnemonic_unix = build_string (":");
4ed46869 10137
7722baf9 10138 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
df7492f9
KH
10139 doc: /*
10140*String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
7722baf9 10141 eol_mnemonic_dos = build_string ("\\");
4ed46869 10142
7722baf9 10143 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
df7492f9
KH
10144 doc: /*
10145*String displayed in mode line for MAC-like (CR) end-of-line format. */);
7722baf9 10146 eol_mnemonic_mac = build_string ("/");
4ed46869 10147
7722baf9 10148 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
df7492f9
KH
10149 doc: /*
10150*String displayed in mode line when end-of-line format is not yet determined. */);
7722baf9 10151 eol_mnemonic_undecided = build_string (":");
4ed46869 10152
84fbb8a0 10153 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
df7492f9
KH
10154 doc: /*
10155*Non-nil enables character translation while encoding and decoding. */);
84fbb8a0 10156 Venable_character_translation = Qt;
bdd9fb48 10157
f967223b 10158 DEFVAR_LISP ("standard-translation-table-for-decode",
48b0f3ae
PJ
10159 &Vstandard_translation_table_for_decode,
10160 doc: /* Table for translating characters while decoding. */);
f967223b 10161 Vstandard_translation_table_for_decode = Qnil;
bdd9fb48 10162
f967223b 10163 DEFVAR_LISP ("standard-translation-table-for-encode",
48b0f3ae
PJ
10164 &Vstandard_translation_table_for_encode,
10165 doc: /* Table for translating characters while encoding. */);
f967223b 10166 Vstandard_translation_table_for_encode = Qnil;
4ed46869 10167
df7492f9 10168 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
48b0f3ae
PJ
10169 doc: /* Alist of charsets vs revision numbers.
10170While encoding, if a charset (car part of an element) is found,
df7492f9
KH
10171designate it with the escape sequence identifying revision (cdr part
10172of the element). */);
10173 Vcharset_revision_table = Qnil;
02ba4723
KH
10174
10175 DEFVAR_LISP ("default-process-coding-system",
10176 &Vdefault_process_coding_system,
48b0f3ae
PJ
10177 doc: /* Cons of coding systems used for process I/O by default.
10178The car part is used for decoding a process output,
10179the cdr part is used for encoding a text to be sent to a process. */);
02ba4723 10180 Vdefault_process_coding_system = Qnil;
c4825358 10181
3f003981 10182 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
df7492f9
KH
10183 doc: /*
10184Table of extra Latin codes in the range 128..159 (inclusive).
48b0f3ae
PJ
10185This is a vector of length 256.
10186If Nth element is non-nil, the existence of code N in a file
10187\(or output of subprocess) doesn't prevent it to be detected as
10188a coding system of ISO 2022 variant which has a flag
10189`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10190or reading output of a subprocess.
446dcd75 10191Only 128th through 159th elements have a meaning. */);
3f003981 10192 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
d46c5b12
KH
10193
10194 DEFVAR_LISP ("select-safe-coding-system-function",
10195 &Vselect_safe_coding_system_function,
df7492f9
KH
10196 doc: /*
10197Function to call to select safe coding system for encoding a text.
48b0f3ae
PJ
10198
10199If set, this function is called to force a user to select a proper
10200coding system which can encode the text in the case that a default
fdecf907
GM
10201coding system used in each operation can't encode the text. The
10202function should take care that the buffer is not modified while
10203the coding system is being selected.
48b0f3ae
PJ
10204
10205The default value is `select-safe-coding-system' (which see). */);
d46c5b12
KH
10206 Vselect_safe_coding_system_function = Qnil;
10207
5d5bf4d8
KH
10208 DEFVAR_BOOL ("coding-system-require-warning",
10209 &coding_system_require_warning,
10210 doc: /* Internal use only.
6b89e3aa
KH
10211If non-nil, on writing a file, `select-safe-coding-system-function' is
10212called even if `coding-system-for-write' is non-nil. The command
10213`universal-coding-system-argument' binds this variable to t temporarily. */);
5d5bf4d8
KH
10214 coding_system_require_warning = 0;
10215
10216
22ab2303 10217 DEFVAR_BOOL ("inhibit-iso-escape-detection",
74383408 10218 &inhibit_iso_escape_detection,
df7492f9
KH
10219 doc: /*
10220If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
48b0f3ae
PJ
10221
10222By default, on reading a file, Emacs tries to detect how the text is
10223encoded. This code detection is sensitive to escape sequences. If
10224the sequence is valid as ISO2022, the code is determined as one of
10225the ISO2022 encodings, and the file is decoded by the corresponding
10226coding system (e.g. `iso-2022-7bit').
10227
10228However, there may be a case that you want to read escape sequences in
10229a file as is. In such a case, you can set this variable to non-nil.
10230Then, as the code detection ignores any escape sequences, no file is
10231detected as encoded in some ISO2022 encoding. The result is that all
10232escape sequences become visible in a buffer.
10233
10234The default value is nil, and it is strongly recommended not to change
10235it. That is because many Emacs Lisp source files that contain
10236non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10237in Emacs's distribution, and they won't be decoded correctly on
10238reading if you suppress escape sequence detection.
10239
10240The other way to read escape sequences in a file without decoding is
10241to explicitly specify some coding system that doesn't use ISO2022's
10242escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
74383408 10243 inhibit_iso_escape_detection = 0;
002fdb44
DL
10244
10245 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
15c8f9d1 10246 doc: /* Char table for translating self-inserting characters.
446dcd75
JB
10247This is applied to the result of input methods, not their input.
10248See also `keyboard-translate-table'. */);
002fdb44 10249 Vtranslation_table_for_input = Qnil;
8f924df7 10250
2c78b7e1
KH
10251 {
10252 Lisp_Object args[coding_arg_max];
8f924df7 10253 Lisp_Object plist[16];
2c78b7e1
KH
10254 int i;
10255
10256 for (i = 0; i < coding_arg_max; i++)
10257 args[i] = Qnil;
10258
10259 plist[0] = intern (":name");
10260 plist[1] = args[coding_arg_name] = Qno_conversion;
10261 plist[2] = intern (":mnemonic");
10262 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10263 plist[4] = intern (":coding-type");
10264 plist[5] = args[coding_arg_coding_type] = Qraw_text;
10265 plist[6] = intern (":ascii-compatible-p");
10266 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10267 plist[8] = intern (":default-char");
10268 plist[9] = args[coding_arg_default_char] = make_number (0);
8f924df7
KH
10269 plist[10] = intern (":for-unibyte");
10270 plist[11] = args[coding_arg_for_unibyte] = Qt;
10271 plist[12] = intern (":docstring");
10272 plist[13] = build_string ("Do no conversion.\n\
2c78b7e1
KH
10273\n\
10274When you visit a file with this coding, the file is read into a\n\
10275unibyte buffer as is, thus each byte of a file is treated as a\n\
10276character.");
8f924df7
KH
10277 plist[14] = intern (":eol-type");
10278 plist[15] = args[coding_arg_eol_type] = Qunix;
10279 args[coding_arg_plist] = Flist (16, plist);
2c78b7e1 10280 Fdefine_coding_system_internal (coding_arg_max, args);
ae6f73fa
KH
10281
10282 plist[1] = args[coding_arg_name] = Qundecided;
10283 plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10284 plist[5] = args[coding_arg_coding_type] = Qundecided;
10285 /* This is already set.
35befdaa 10286 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
ae6f73fa
KH
10287 plist[8] = intern (":charset-list");
10288 plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10289 plist[11] = args[coding_arg_for_unibyte] = Qnil;
10290 plist[13] = build_string ("No conversion on encoding, automatic conversion on decoding.");
10291 plist[15] = args[coding_arg_eol_type] = Qnil;
10292 args[coding_arg_plist] = Flist (16, plist);
10293 Fdefine_coding_system_internal (coding_arg_max, args);
2c78b7e1
KH
10294 }
10295
2c78b7e1 10296 setup_coding_system (Qno_conversion, &safe_terminal_coding);
ff563fce
KH
10297
10298 {
10299 int i;
10300
10301 for (i = 0; i < coding_category_max; i++)
10302 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10303 }
fcbcfb64
KH
10304#if defined (MSDOS) || defined (WINDOWSNT)
10305 system_eol_type = Qdos;
10306#else
10307 system_eol_type = Qunix;
10308#endif
10309 staticpro (&system_eol_type);
4ed46869
KH
10310}
10311
68c45bf0
PE
10312char *
10313emacs_strerror (error_number)
10314 int error_number;
10315{
10316 char *str;
10317
ca9c0567 10318 synchronize_system_messages_locale ();
68c45bf0
PE
10319 str = strerror (error_number);
10320
10321 if (! NILP (Vlocale_coding_system))
10322 {
10323 Lisp_Object dec = code_convert_string_norecord (build_string (str),
10324 Vlocale_coding_system,
10325 0);
d5db4077 10326 str = (char *) SDATA (dec);
68c45bf0
PE
10327 }
10328
10329 return str;
10330}
10331
4ed46869 10332#endif /* emacs */
9ffd559c
KH
10333
10334/* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
10335 (do not change this comment) */