* coding.c (make_conversion_work_buffer): Disable buffer modification
[bpt/emacs.git] / src / coding.c
CommitLineData
9542cb1f 1/* Coding system handler (conversion, detection, etc).
aaef169d 2 Copyright (C) 2001, 2002, 2003, 2004, 2005,
8cabe764 3 2006, 2007, 2008 Free Software Foundation, Inc.
7976eda0 4 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
8cabe764 5 2005, 2006, 2007, 2008
ce03bf76
KH
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H14PRO021
8f924df7 8 Copyright (C) 2003
df7492f9
KH
9 National Institute of Advanced Industrial Science and Technology (AIST)
10 Registration Number H13PRO009
4ed46869 11
369314dc
KH
12This file is part of GNU Emacs.
13
9ec0b715 14GNU Emacs is free software: you can redistribute it and/or modify
369314dc 15it under the terms of the GNU General Public License as published by
9ec0b715
GM
16the Free Software Foundation, either version 3 of the License, or
17(at your option) any later version.
4ed46869 18
369314dc
KH
19GNU Emacs is distributed in the hope that it will be useful,
20but WITHOUT ANY WARRANTY; without even the implied warranty of
21MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22GNU General Public License for more details.
4ed46869 23
369314dc 24You should have received a copy of the GNU General Public License
9ec0b715 25along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
4ed46869
KH
26
27/*** TABLE OF CONTENTS ***
28
b73bfc1c 29 0. General comments
4ed46869 30 1. Preamble
df7492f9
KH
31 2. Emacs' internal format (emacs-utf-8) handlers
32 3. UTF-8 handlers
33 4. UTF-16 handlers
34 5. Charset-base coding systems handlers
35 6. emacs-mule (old Emacs' internal format) handlers
36 7. ISO2022 handlers
37 8. Shift-JIS and BIG5 handlers
38 9. CCL handlers
39 10. C library functions
40 11. Emacs Lisp library functions
41 12. Postamble
4ed46869
KH
42
43*/
44
df7492f9 45/*** 0. General comments ***
b73bfc1c
KH
46
47
df7492f9 48CODING SYSTEM
4ed46869 49
5bad0796
DL
50 A coding system is an object for an encoding mechanism that contains
51 information about how to convert byte sequences to character
e19c3639
KH
52 sequences and vice versa. When we say "decode", it means converting
53 a byte sequence of a specific coding system into a character
54 sequence that is represented by Emacs' internal coding system
55 `emacs-utf-8', and when we say "encode", it means converting a
56 character sequence of emacs-utf-8 to a byte sequence of a specific
0ef69138 57 coding system.
4ed46869 58
e19c3639
KH
  In Emacs Lisp, a coding system is represented by a Lisp symbol.  At
  the C level, a coding system is represented by a vector of
  attributes stored in the hash table Vcoding_system_hash_table.  The
  conversion from a coding system symbol to its attributes vector is
  done by looking up the symbol in Vcoding_system_hash_table.
4ed46869 64
e19c3639 65 Coding systems are classified into the following types depending on
5bad0796 66 the encoding mechanism. Here's a brief description of the types.
4ed46869 67
df7492f9
KH
68 o UTF-8
69
70 o UTF-16
71
72 o Charset-base coding system
73
74 A coding system defined by one or more (coded) character sets.
5bad0796 75 Decoding and encoding are done by a code converter defined for each
df7492f9
KH
76 character set.
77
5bad0796 78 o Old Emacs internal format (emacs-mule)
df7492f9 79
5bad0796 80 The coding system adopted by old versions of Emacs (20 and 21).
4ed46869 81
df7492f9 82 o ISO2022-base coding system
4ed46869
KH
83
84 The most famous coding system for multiple character sets. X's
df7492f9
KH
85 Compound Text, various EUCs (Extended Unix Code), and coding systems
86 used in the Internet communication such as ISO-2022-JP are all
87 variants of ISO2022.
4ed46869 88
df7492f9 89 o SJIS (or Shift-JIS or MS-Kanji-Code)
93dec019 90
4ed46869
KH
91 A coding system to encode character sets: ASCII, JISX0201, and
92 JISX0208. Widely used for PC's in Japan. Details are described in
df7492f9 93 section 8.
4ed46869 94
df7492f9 95 o BIG5
4ed46869 96
df7492f9 97 A coding system to encode character sets: ASCII and Big5. Widely
cfb43547 98 used for Chinese (mainly in Taiwan and Hong Kong). Details are
df7492f9
KH
99 described in section 8. In this file, when we write "big5" (all
100 lowercase), we mean the coding system, and when we write "Big5"
101 (capitalized), we mean the character set.
4ed46869 102
df7492f9 103 o CCL
27901516 104
5bad0796 105 If a user wants to decode/encode text encoded in a coding system
df7492f9
KH
106 not listed above, he can supply a decoder and an encoder for it in
107 CCL (Code Conversion Language) programs. Emacs executes the CCL
108 program while decoding/encoding.
27901516 109
df7492f9 110 o Raw-text
4ed46869 111
5a936b46 112 A coding system for text containing raw eight-bit data. Emacs
5bad0796 113 treats each byte of source text as a character (except for
df7492f9 114 end-of-line conversion).
4ed46869 115
df7492f9
KH
116 o No-conversion
117
118 Like raw text, but don't do end-of-line conversion.
4ed46869 119
4ed46869 120
df7492f9 121END-OF-LINE FORMAT
4ed46869 122
5bad0796 123 How text end-of-line is encoded depends on operating system. For
df7492f9 124 instance, Unix's format is just one byte of LF (line-feed) code,
f4dee582 125 whereas DOS's format is two-byte sequence of `carriage-return' and
d46c5b12
KH
126 `line-feed' codes. MacOS's format is usually one byte of
127 `carriage-return'.
4ed46869 128
cfb43547 129 Since text character encoding and end-of-line encoding are
df7492f9
KH
130 independent, any coding system described above can take any format
131 of end-of-line (except for no-conversion).
4ed46869 132
e19c3639
KH
133STRUCT CODING_SYSTEM
134
135 Before using a coding system for code conversion (i.e. decoding and
136 encoding), we setup a structure of type `struct coding_system'.
137 This structure keeps various information about a specific code
5bad0796 138 conversion (e.g. the location of source and destination data).
4ed46869
KH
139
140*/
141
df7492f9
KH
142/* COMMON MACROS */
143
144
4ed46869
KH
145/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
146
df7492f9 147 These functions check if a byte sequence specified as a source in
ff0dacd7
KH
148 CODING conforms to the format of XXX, and update the members of
149 DETECT_INFO.
df7492f9 150
ff0dacd7 151 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
df7492f9
KH
152
153 Below is the template of these functions. */
154
#if 0
/* Template of a detector.  Scan the source bytes of CODING; record
   CATEGORY_MASK_XXX in DETECT_INFO->found or DETECT_INFO->rejected.
   Return 1 if the whole source conforms to XXX, otherwise 0.  */
static int
detect_coding_XXX (coding, detect_info)
     struct coding_system *coding;
     struct coding_detection_info *detect_info;
{
  const unsigned char *src = coding->source;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int multibytep = coding->src_multibyte;
  int consumed_chars = 0;
  int found = 0;
  ...;

  while (1)
    {
      /* Get one byte from the source.  If the source is exhausted,
         jump to no_more_source:.  */
      ONE_MORE_BYTE (c);

      if (! __C_conforms_to_XXX___ (c))
        break;
      /* Remember the category only when C is positive evidence for
         XXX (not merely compatible with it).  */
      if (__C_strongly_suggests_XXX__ (c))
        found = CATEGORY_MASK_XXX;
    }
  /* The byte sequence is invalid for XXX.  */
  detect_info->rejected |= CATEGORY_MASK_XXX;
  return 0;

 no_more_source:
  /* The source was exhausted without finding an invalid byte.  */
  detect_info->found |= found;
  return 1;
}
#endif
189
190/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
191
df7492f9
KH
192 These functions decode a byte sequence specified as a source by
193 CODING. The resulting multibyte text goes to a place pointed to by
194 CODING->charbuf, the length of which should not exceed
195 CODING->charbuf_size;
d46c5b12 196
df7492f9
KH
197 These functions set the information of original and decoded texts in
198 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
199 They also set CODING->result to one of CODING_RESULT_XXX indicating
200 how the decoding is finished.
d46c5b12 201
df7492f9 202 Below is the template of these functions. */
d46c5b12 203
#if 0
/* Template of a decoder.  Decode bytes of CODING->source (starting at
   CODING->consumed) into CODING->charbuf, then update
   CODING->consumed, CODING->consumed_char and CODING->charbuf_used.  */
static void
decode_coding_XXXX (coding)
     struct coding_system *coding;
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* SRC_BASE remembers the start position in source in each loop.
     The loop will be exited when there's not enough source code, or
     when there's no room in CHARBUF for a decoded character.  */
  const unsigned char *src_base;
  /* A buffer to produce decoded characters.  */
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  int multibytep = coding->src_multibyte;

  while (1)
    {
      src_base = src;
      if (charbuf >= charbuf_end)
        /* No more room to produce a decoded character.  */
        break;
      ONE_MORE_BYTE (c);
      /* Decode it.  */
    }

 no_more_source:
  if (src_base < src_end
      && coding->mode & CODING_MODE_LAST_BLOCK)
    /* If the source ends by partial bytes to construct a character,
       treat them as eight-bit raw data.  */
    while (src_base < src_end && charbuf < charbuf_end)
      *charbuf++ = *src_base++;
  /* Remember how many bytes and characters we consumed.  If the
     source is multibyte, the bytes and chars are not identical.  */
  coding->consumed = coding->consumed_char = src_base - coding->source;
  /* Remember how many characters we produced.  */
  coding->charbuf_used = charbuf - coding->charbuf;
}
#endif
244
245/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
246
df7492f9
KH
247 These functions encode SRC_BYTES length text at SOURCE of Emacs'
248 internal multibyte format by CODING. The resulting byte sequence
b73bfc1c
KH
249 goes to a place pointed to by DESTINATION, the length of which
250 should not exceed DST_BYTES.
d46c5b12 251
df7492f9
KH
252 These functions set the information of original and encoded texts in
253 the members produced, produced_char, consumed, and consumed_char of
254 the structure *CODING. They also set the member result to one of
255 CODING_RESULT_XXX indicating how the encoding finished.
d46c5b12 256
df7492f9
KH
  DST_BYTES zero means that the source area and the destination area
  overlap, which means that we can produce encoded text only until it
  reaches the head of the not-yet-encoded source text.
d46c5b12 260
df7492f9 261 Below is a template of these functions. */
#if 0
/* Template of an encoder.  Encode the characters in CODING->charbuf
   (CODING->charbuf_used of them) into CODING->destination, then
   update CODING->produced_char and CODING->produced.

   NOTE(review): the prototypes of the real encode_coding_* functions
   in this file declare them as returning int; this template still
   uses void -- confirm which is intended.  */
static void
encode_coding_XXX (coding)
     struct coding_system *coding;
{
  int multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  int *charbuf_end = coding->charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  /* Stop early enough that one more iteration cannot overrun DST_END.  */
  unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
  int produced_chars = 0;

  for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
    {
      int c = *charbuf;
      /* Encode C into DST, and increment DST.  */
    }
 label_no_more_destination:
  /* How many chars and bytes we produced.  */
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
}
#endif
286
4ed46869
KH
287\f
288/*** 1. Preamble ***/
289
68c45bf0 290#include <config.h>
4ed46869
KH
291#include <stdio.h>
292
4ed46869
KH
293#include "lisp.h"
294#include "buffer.h"
df7492f9 295#include "character.h"
4ed46869
KH
296#include "charset.h"
297#include "ccl.h"
df7492f9 298#include "composite.h"
4ed46869
KH
299#include "coding.h"
300#include "window.h"
b8299c66
KL
301#include "frame.h"
302#include "termhooks.h"
4ed46869 303
df7492f9 304Lisp_Object Vcoding_system_hash_table;
4ed46869 305
df7492f9 306Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
1965cb73
DL
307Lisp_Object Qunix, Qdos;
308extern Lisp_Object Qmac; /* frame.c */
4ed46869
KH
309Lisp_Object Qbuffer_file_coding_system;
310Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
df7492f9 311Lisp_Object Qdefault_char;
27901516 312Lisp_Object Qno_conversion, Qundecided;
df7492f9 313Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
b49a1807 314Lisp_Object Qbig, Qlittle;
bb0115a2 315Lisp_Object Qcoding_system_history;
1397dc18 316Lisp_Object Qvalid_codes;
a6f87d34
KH
317Lisp_Object QCcategory, QCmnemonic, QCdefalut_char;
318Lisp_Object QCdecode_translation_table, QCencode_translation_table;
319Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
35befdaa 320Lisp_Object QCascii_compatible_p;
4ed46869
KH
321
322extern Lisp_Object Qinsert_file_contents, Qwrite_region;
387f6ba5 323Lisp_Object Qcall_process, Qcall_process_region;
4ed46869
KH
324Lisp_Object Qstart_process, Qopen_network_stream;
325Lisp_Object Qtarget_idx;
326
065e3595
KH
327Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
328Lisp_Object Qinterrupted, Qinsufficient_memory;
329
c7183fb8
GM
330extern Lisp_Object Qcompletion_ignore_case;
331
44e8490d
KH
332/* If a symbol has this property, evaluate the value to define the
333 symbol as a coding system. */
334static Lisp_Object Qcoding_system_define_form;
335
5d5bf4d8
KH
336int coding_system_require_warning;
337
d46c5b12
KH
338Lisp_Object Vselect_safe_coding_system_function;
339
7722baf9
EZ
340/* Mnemonic string for each format of end-of-line. */
341Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
342/* Mnemonic string to indicate format of end-of-line is not yet
4ed46869 343 decided. */
7722baf9 344Lisp_Object eol_mnemonic_undecided;
4ed46869 345
fcbcfb64
KH
346/* Format of end-of-line decided by system. This is Qunix on
347 Unix and Mac, Qdos on DOS/Windows.
348 This has an effect only for external encoding (i.e. for output to
349 file and process), not for in-buffer or Lisp string encoding. */
350static Lisp_Object system_eol_type;
351
4ed46869
KH
#ifdef emacs

Lisp_Object Vcoding_system_list, Vcoding_system_alist;

Lisp_Object Qcoding_system_p, Qcoding_system_error;

/* Coding system emacs-mule and raw-text are for converting only
   end-of-line format.  */
Lisp_Object Qemacs_mule, Qraw_text;
Lisp_Object Qutf_8_emacs;

/* Coding-systems are handed between Emacs Lisp programs and C internal
   routines by the following three variables.  */
/* Coding-system for reading files and receiving data from process.  */
Lisp_Object Vcoding_system_for_read;
/* Coding-system for writing files and sending data to process.  */
Lisp_Object Vcoding_system_for_write;
/* Coding-system actually used in the latest I/O.  */
Lisp_Object Vlast_coding_system_used;
/* Set to non-nil when an error is detected during code conversion.  */
Lisp_Object Vlast_code_conversion_error;
/* A vector of length 256 which contains information about special
   Latin codes (especially for dealing with Microsoft codes).  */
Lisp_Object Vlatin_extra_code_table;

/* Flag to inhibit code conversion of end-of-line format.  */
int inhibit_eol_conversion;

/* Flag to inhibit ISO2022 escape sequence detection.  */
int inhibit_iso_escape_detection;

/* Flag to make buffer-file-coding-system inherit from process-coding.  */
int inherit_process_coding_system;

/* Coding system to be used to encode text for terminal display when
   terminal coding system is nil.  */
struct coding_system safe_terminal_coding;

Lisp_Object Vfile_coding_system_alist;
Lisp_Object Vprocess_coding_system_alist;
Lisp_Object Vnetwork_coding_system_alist;

Lisp_Object Vlocale_coding_system;

#endif /* emacs */
397
f967223b
KH
398/* Flag to tell if we look up translation table on character code
399 conversion. */
84fbb8a0 400Lisp_Object Venable_character_translation;
f967223b
KH
401/* Standard translation table to look up on decoding (reading). */
402Lisp_Object Vstandard_translation_table_for_decode;
403/* Standard translation table to look up on encoding (writing). */
404Lisp_Object Vstandard_translation_table_for_encode;
84fbb8a0 405
f967223b
KH
406Lisp_Object Qtranslation_table;
407Lisp_Object Qtranslation_table_id;
408Lisp_Object Qtranslation_table_for_decode;
409Lisp_Object Qtranslation_table_for_encode;
4ed46869
KH
410
411/* Alist of charsets vs revision number. */
df7492f9 412static Lisp_Object Vcharset_revision_table;
4ed46869 413
02ba4723
KH
414/* Default coding systems used for process I/O. */
415Lisp_Object Vdefault_process_coding_system;
416
002fdb44
DL
417/* Char table for translating Quail and self-inserting input. */
418Lisp_Object Vtranslation_table_for_input;
419
df7492f9
KH
420/* Two special coding systems. */
421Lisp_Object Vsjis_coding_system;
422Lisp_Object Vbig5_coding_system;
423
df7492f9
KH
/* ISO2022 section */

/* Integer stored at index REG of the `coding_attr_iso_initial'
   attribute of CODING.  */
#define CODING_ISO_INITIAL(coding, reg)                         \
  (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),            \
                     coding_attr_iso_initial),                  \
               (reg))))

/* (CODING)->safe_charsets[CHARSET_ID] when CHARSET_ID does not exceed
   (CODING)->max_charset_id, otherwise -1.  CHARSET_ID is evaluated
   twice, so it must not have side effects.  */
#define CODING_ISO_REQUEST(coding, charset_id)                  \
  (((charset_id) <= (coding)->max_charset_id                    \
    ? (coding)->safe_charsets[(charset_id)]                     \
    : -1))

/* Accessors for the ISO2022 specific members of CODING.  */
#define CODING_ISO_FLAGS(coding)                                \
  ((coding)->spec.iso_2022.flags)
#define CODING_ISO_DESIGNATION(coding, reg)                     \
  ((coding)->spec.iso_2022.current_designation[reg])
#define CODING_ISO_INVOCATION(coding, plane)                    \
  ((coding)->spec.iso_2022.current_invocation[plane])
#define CODING_ISO_SINGLE_SHIFTING(coding)                      \
  ((coding)->spec.iso_2022.single_shifting)
#define CODING_ISO_BOL(coding)                                  \
  ((coding)->spec.iso_2022.bol)
/* The designation of the register currently invoked to PLANE.  */
#define CODING_ISO_INVOKED_CHARSET(coding, plane)               \
  CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
450
/* Control characters of ISO2022.  */
                        /* code */      /* function */
#define ISO_CODE_LF     0x0A            /* line-feed */
#define ISO_CODE_CR     0x0D            /* carriage-return */
#define ISO_CODE_SO     0x0E            /* shift-out */
#define ISO_CODE_SI     0x0F            /* shift-in */
#define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
#define ISO_CODE_ESC    0x1B            /* escape */
#define ISO_CODE_SS2    0x8E            /* single-shift-2 */
#define ISO_CODE_SS3    0x8F            /* single-shift-3 */
#define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
462
/* Every 1-byte code of ISO2022 is classified into one of the
   following classes.  */
enum iso_code_class_type
  {
    ISO_control_0,              /* Control codes in the range
                                   0x00..0x1F and 0x7F, except for the
                                   following 4 codes.  */
    ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
    ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
    ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
    ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
    ISO_control_1,              /* Control codes in the range
                                   0x80..0x9F, except for the
                                   following 3 codes.  */
    ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
    ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
    ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
    ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
    ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
    ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
    ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
  };
05e6f5dc 485
df7492f9
KH
/* Each CODING_ISO_FLAG_XXX macro defines a flag bit of the
   `iso-flags' attribute of an iso2022 coding system.  */

/* If set, produce long-form designation sequence (e.g. ESC $ ( A)
   instead of the correct short-form sequence (e.g. ESC $ A).  */
#define CODING_ISO_FLAG_LONG_FORM       0x0001

/* If set, reset graphic planes and registers at end-of-line to the
   initial state.  */
#define CODING_ISO_FLAG_RESET_AT_EOL    0x0002

/* If set, reset graphic planes and registers before any control
   characters to the initial state.  */
#define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004

/* If set, encode by 7-bit environment.  */
#define CODING_ISO_FLAG_SEVEN_BITS      0x0008

/* If set, use locking-shift function.  */
#define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010

/* If set, use single-shift function.  Overrides
   CODING_ISO_FLAG_LOCKING_SHIFT.  */
#define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020

/* If set, use designation escape sequence.  */
#define CODING_ISO_FLAG_DESIGNATION     0x0040

/* If set, produce revision number sequence.  */
#define CODING_ISO_FLAG_REVISION        0x0080

/* If set, produce ISO6429's direction specifying sequence.  */
#define CODING_ISO_FLAG_DIRECTION       0x0100

/* If set, assume designation states are reset at beginning of line on
   output.  */
#define CODING_ISO_FLAG_INIT_AT_BOL     0x0200

/* If set, designation sequence should be placed at beginning of line
   on output.  */
#define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400

/* If set, do not encode unsafe characters on output.  */
#define CODING_ISO_FLAG_SAFE            0x0800

/* If set, extra latin codes (128..159) are accepted as a valid code
   on input.  */
#define CODING_ISO_FLAG_LATIN_EXTRA     0x1000

/* NOTE(review): the remaining flags are not documented in this file.  */
#define CODING_ISO_FLAG_COMPOSITION     0x2000

#define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000

#define CODING_ISO_FLAG_USE_ROMAN       0x8000

#define CODING_ISO_FLAG_USE_OLDJIS      0x10000

#define CODING_ISO_FLAG_FULL_SUPPORT    0x100000

/* A character to be produced on output if encoding of the original
   character is prohibited by CODING_ISO_FLAG_SAFE.  */
#define CODING_INHIBIT_CHARACTER_SUBSTITUTION   '?'
aa72b389 548
a470d443
KH
/* UTF-8 section */
#define CODING_UTF_8_BOM(coding)        \
  ((coding)->spec.utf_8_bom)

/* UTF-16 section */
#define CODING_UTF_16_BOM(coding)       \
  ((coding)->spec.utf_16.bom)

#define CODING_UTF_16_ENDIAN(coding)    \
  ((coding)->spec.utf_16.endian)

#define CODING_UTF_16_SURROGATE(coding) \
  ((coding)->spec.utf_16.surrogate)


/* CCL section */
/* Each macro reads one CCL related attribute from the attribute
   vector of CODING.  */
#define CODING_CCL_DECODER(coding)      \
  (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder))
#define CODING_CCL_ENCODER(coding)      \
  (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder))
#define CODING_CCL_VALIDS(coding)                                          \
  (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
4ed46869 571
/* Index for each coding category in `coding_categories'.  The
   CATEGORY_MASK_XXX flag bits are derived from these values, so the
   order matters.  */

enum coding_category
  {
    coding_category_iso_7,
    coding_category_iso_7_tight,
    coding_category_iso_8_1,
    coding_category_iso_8_2,
    coding_category_iso_7_else,
    coding_category_iso_8_else,
    coding_category_utf_8_auto,
    coding_category_utf_8_nosig,
    coding_category_utf_8_sig,
    coding_category_utf_16_auto,
    coding_category_utf_16_be,
    coding_category_utf_16_le,
    coding_category_utf_16_be_nosig,
    coding_category_utf_16_le_nosig,
    coding_category_charset,
    coding_category_sjis,
    coding_category_big5,
    coding_category_ccl,
    coding_category_emacs_mule,
    /* All above are targets of code detection.  */
    coding_category_raw_text,
    coding_category_undecided,
    coding_category_max
  };
600
/* Definitions of flag bits used in detect_coding_XXXX.  Each mask is
   the bit whose position is the corresponding `enum coding_category'
   value.  */
#define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
#define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
#define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
#define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
#define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
#define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
#define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
#define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
#define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
#define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
#define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
#define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
#define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
#define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
#define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
#define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
#define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
#define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
#define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
#define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)

/* This value is returned if detect_coding_mask () finds nothing other
   than ASCII characters.  */
#define CATEGORY_MASK_ANY               \
  (CATEGORY_MASK_ISO_7                  \
   | CATEGORY_MASK_ISO_7_TIGHT          \
   | CATEGORY_MASK_ISO_8_1              \
   | CATEGORY_MASK_ISO_8_2              \
   | CATEGORY_MASK_ISO_7_ELSE           \
   | CATEGORY_MASK_ISO_8_ELSE           \
   | CATEGORY_MASK_UTF_8_AUTO           \
   | CATEGORY_MASK_UTF_8_NOSIG          \
   | CATEGORY_MASK_UTF_8_SIG            \
   | CATEGORY_MASK_UTF_16_AUTO          \
   | CATEGORY_MASK_UTF_16_BE            \
   | CATEGORY_MASK_UTF_16_LE            \
   | CATEGORY_MASK_UTF_16_BE_NOSIG      \
   | CATEGORY_MASK_UTF_16_LE_NOSIG      \
   | CATEGORY_MASK_CHARSET              \
   | CATEGORY_MASK_SJIS                 \
   | CATEGORY_MASK_BIG5                 \
   | CATEGORY_MASK_CCL                  \
   | CATEGORY_MASK_EMACS_MULE)


#define CATEGORY_MASK_ISO_7BIT \
  (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)

#define CATEGORY_MASK_ISO_8BIT \
  (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)

#define CATEGORY_MASK_ISO_ELSE \
  (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)

#define CATEGORY_MASK_ISO_ESCAPE        \
  (CATEGORY_MASK_ISO_7                  \
   | CATEGORY_MASK_ISO_7_TIGHT          \
   | CATEGORY_MASK_ISO_7_ELSE           \
   | CATEGORY_MASK_ISO_8_ELSE)

#define CATEGORY_MASK_ISO               \
  (CATEGORY_MASK_ISO_7BIT               \
   | CATEGORY_MASK_ISO_8BIT             \
   | CATEGORY_MASK_ISO_ELSE)

#define CATEGORY_MASK_UTF_16            \
  (CATEGORY_MASK_UTF_16_AUTO            \
   | CATEGORY_MASK_UTF_16_BE            \
   | CATEGORY_MASK_UTF_16_LE            \
   | CATEGORY_MASK_UTF_16_BE_NOSIG      \
   | CATEGORY_MASK_UTF_16_LE_NOSIG)

#define CATEGORY_MASK_UTF_8             \
  (CATEGORY_MASK_UTF_8_AUTO             \
   | CATEGORY_MASK_UTF_8_NOSIG          \
   | CATEGORY_MASK_UTF_8_SIG)
df7492f9
KH
678
679/* List of symbols `coding-category-xxx' ordered by priority. This
680 variable is exposed to Emacs Lisp. */
681static Lisp_Object Vcoding_category_list;
682
683/* Table of coding categories (Lisp symbols). This variable is for
684 internal use oly. */
685static Lisp_Object Vcoding_category_table;
686
687/* Table of coding-categories ordered by priority. */
688static enum coding_category coding_priorities[coding_category_max];
689
690/* Nth element is a coding context for the coding system bound to the
691 Nth coding category. */
692static struct coding_system coding_categories[coding_category_max];
693
df7492f9
KH
694/*** Commonly used macros and functions ***/
695
/* Smaller / larger of A and B.  Defined only when not already
   provided.  Each argument may be evaluated twice, so arguments must
   not have side effects.  */
#ifndef min
#define min(a, b) ((a) < (b) ? (a) : (b))
#endif
#ifndef max
#define max(a, b) ((a) > (b) ? (a) : (b))
#endif
4ed46869 702
24a73b0a
KH
/* Set ATTRS to the attribute vector of CODING (looked up by
   CODING->id) and CHARSET_LIST to the charset list taken from those
   attributes.  ATTRS and CHARSET_LIST must be lvalues.  */
#define CODING_GET_INFO(coding, attrs, charset_list)            \
  do {                                                          \
    (attrs) = CODING_ID_ATTRS ((coding)->id);                   \
    (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);          \
  } while (0)
4ed46869 708
4ed46869 709
df7492f9
KH
/* Safely get one byte from the source text pointed by SRC which ends
   at SRC_END, set C to that byte, and increment CONSUMED_CHARS.  If
   there are no more bytes in the source, jump to `no_more_source';
   in that case, if SRC has advanced past SRC_BASE,
   CODING_RESULT_INSUFFICIENT_SRC is recorded first.

   If multibytep is nonzero and the byte has the 0x80 bit set: when
   (byte & 0xFE) is 0xC0, C is built from that byte and the next one;
   otherwise the multibyte character at SRC is read with string_char,
   C is set to the negative value of the character code, and
   CODING_RESULT_INVALID_SRC is recorded.

   The caller should declare and set these variables appropriately in
   advance:
        src, src_end, src_base, multibytep, consumed_chars, coding
   and must provide the label `no_more_source'.

   NOTE(review): the second byte of a 0xC0/0xC1 sequence is read
   without comparing SRC against SRC_END; this presumably relies on
   multibyte source text being well formed -- confirm.  */

#define ONE_MORE_BYTE(c)                                \
  do {                                                  \
    if (src == src_end)                                 \
      {                                                 \
        if (src_base < src)                             \
          record_conversion_result                      \
            (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
        goto no_more_source;                            \
      }                                                 \
    c = *src++;                                         \
    if (multibytep && (c & 0x80))                       \
      {                                                 \
        if ((c & 0xFE) == 0xC0)                         \
          c = ((c & 1) << 6) | *src++;                  \
        else                                            \
          {                                             \
            src--;                                      \
            c = - string_char (src, &src, NULL);        \
            record_conversion_result                    \
              (coding, CODING_RESULT_INVALID_SRC);      \
          }                                             \
      }                                                 \
    consumed_chars++;                                   \
  } while (0)
742
aa72b389 743
065e3595
KH
/* Like ONE_MORE_BYTE, but do not compare SRC against SRC_END and
   never jump to `no_more_source'.  The caller should declare and set
   these variables in advance: src, multibytep, consumed_chars,
   coding.  */
#define ONE_MORE_BYTE_NO_CHECK(c)                       \
  do {                                                  \
    c = *src++;                                         \
    if (multibytep && (c & 0x80))                       \
      {                                                 \
        if ((c & 0xFE) == 0xC0)                         \
          c = ((c & 1) << 6) | *src++;                  \
        else                                            \
          {                                             \
            src--;                                      \
            c = - string_char (src, &src, NULL);        \
            record_conversion_result                    \
              (coding, CODING_RESULT_INVALID_SRC);      \
          }                                             \
      }                                                 \
    consumed_chars++;                                   \
  } while (0)
761
aa72b389 762
df7492f9
KH
/* Store a byte C in the place pointed by DST and increment DST to the
   next free point, and increment PRODUCED_CHARS.  The caller should
   assure that C is 0..127, and declare and set the variables `dst'
   and `produced_chars' appropriately in advance.  */


#define EMIT_ONE_ASCII_BYTE(c)  \
  do {                          \
    produced_chars++;           \
    *dst++ = (c);               \
  } while (0)
aa72b389
KH
775
776
/* Like EMIT_ONE_ASCII_BYTE, but store two bytes: C1 and then C2.
   PRODUCED_CHARS is incremented by 2.  */

#define EMIT_TWO_ASCII_BYTES(c1, c2)    \
  do {                                  \
    produced_chars += 2;                \
    *dst++ = (c1);                      \
    *dst++ = (c2);                      \
  } while (0)
aa72b389
KH
784
785
df7492f9
KH
/* Store a byte C in the place pointed by DST and increment DST to the
   next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
   nonzero, store in an appropriate multibyte form: a value of 0x80
   or more is first converted with BYTE8_TO_CHAR, then the character
   is stored with CHAR_STRING_ADVANCE.  The caller should declare and
   set the variables `dst', `multibytep' and `produced_chars'
   appropriately in advance.  */

#define EMIT_ONE_BYTE(c)                        \
  do {                                          \
    produced_chars++;                           \
    if (multibytep)                             \
      {                                         \
        int ch = (c);                           \
        if (ch >= 0x80)                         \
          ch = BYTE8_TO_CHAR (ch);              \
        CHAR_STRING_ADVANCE (ch, dst);          \
      }                                         \
    else                                        \
      *dst++ = (c);                             \
  } while (0)
aa72b389 805
aa72b389 806
/* Like EMIT_ONE_BYTE, but emit two bytes: C1 and then C2.
   PRODUCED_CHARS is incremented by 2.  */

#define EMIT_TWO_BYTES(c1, c2)                  \
  do {                                          \
    produced_chars += 2;                        \
    if (multibytep)                             \
      {                                         \
        int ch;                                 \
                                                \
        ch = (c1);                              \
        if (ch >= 0x80)                         \
          ch = BYTE8_TO_CHAR (ch);              \
        CHAR_STRING_ADVANCE (ch, dst);          \
        ch = (c2);                              \
        if (ch >= 0x80)                         \
          ch = BYTE8_TO_CHAR (ch);              \
        CHAR_STRING_ADVANCE (ch, dst);          \
      }                                         \
    else                                        \
      {                                         \
        *dst++ = (c1);                          \
        *dst++ = (c2);                          \
      }                                         \
  } while (0)
831
832
df7492f9
KH
/* Emit C1, C2 and C3 in that order, using EMIT_ONE_BYTE for the first
   and EMIT_TWO_BYTES for the remaining two.  */
#define EMIT_THREE_BYTES(c1, c2, c3)    \
  do {                                  \
    EMIT_ONE_BYTE (c1);                 \
    EMIT_TWO_BYTES (c2, c3);            \
  } while (0)
aa72b389 838
aa72b389 839
df7492f9
KH
840#define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
841 do { \
842 EMIT_TWO_BYTES (c1, c2); \
843 EMIT_TWO_BYTES (c3, c4); \
844 } while (0)
aa72b389 845
aa72b389 846
f6cbaf43
KH
847/* Prototypes for static functions. */
848static void record_conversion_result P_ ((struct coding_system *coding,
849 enum coding_result_code result));
850static int detect_coding_utf_8 P_ ((struct coding_system *,
851 struct coding_detection_info *info));
852static void decode_coding_utf_8 P_ ((struct coding_system *));
853static int encode_coding_utf_8 P_ ((struct coding_system *));
854
855static int detect_coding_utf_16 P_ ((struct coding_system *,
856 struct coding_detection_info *info));
857static void decode_coding_utf_16 P_ ((struct coding_system *));
858static int encode_coding_utf_16 P_ ((struct coding_system *));
859
860static int detect_coding_iso_2022 P_ ((struct coding_system *,
861 struct coding_detection_info *info));
862static void decode_coding_iso_2022 P_ ((struct coding_system *));
863static int encode_coding_iso_2022 P_ ((struct coding_system *));
864
865static int detect_coding_emacs_mule P_ ((struct coding_system *,
866 struct coding_detection_info *info));
867static void decode_coding_emacs_mule P_ ((struct coding_system *));
868static int encode_coding_emacs_mule P_ ((struct coding_system *));
869
870static int detect_coding_sjis P_ ((struct coding_system *,
871 struct coding_detection_info *info));
872static void decode_coding_sjis P_ ((struct coding_system *));
873static int encode_coding_sjis P_ ((struct coding_system *));
874
875static int detect_coding_big5 P_ ((struct coding_system *,
876 struct coding_detection_info *info));
877static void decode_coding_big5 P_ ((struct coding_system *));
878static int encode_coding_big5 P_ ((struct coding_system *));
879
880static int detect_coding_ccl P_ ((struct coding_system *,
881 struct coding_detection_info *info));
882static void decode_coding_ccl P_ ((struct coding_system *));
883static int encode_coding_ccl P_ ((struct coding_system *));
884
885static void decode_coding_raw_text P_ ((struct coding_system *));
886static int encode_coding_raw_text P_ ((struct coding_system *));
887
888static void coding_set_source P_ ((struct coding_system *));
889static void coding_set_destination P_ ((struct coding_system *));
890static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
891static void coding_alloc_by_making_gap P_ ((struct coding_system *,
287c57d7 892 EMACS_INT, EMACS_INT));
f6cbaf43
KH
893static unsigned char *alloc_destination P_ ((struct coding_system *,
894 EMACS_INT, unsigned char *));
895static void setup_iso_safe_charsets P_ ((Lisp_Object));
896static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
897 int *, int *,
898 unsigned char *));
899static int detect_eol P_ ((const unsigned char *,
900 EMACS_INT, enum coding_category));
901static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
902static void decode_eol P_ ((struct coding_system *));
903static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
904static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *,
905 int, int *, int *));
906static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
907static INLINE void produce_composition P_ ((struct coding_system *, int *,
908 EMACS_INT));
909static INLINE void produce_charset P_ ((struct coding_system *, int *,
910 EMACS_INT));
911static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
912static int decode_coding P_ ((struct coding_system *));
913static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
3ed051d4 914 struct coding_system *,
f6cbaf43
KH
915 int *, EMACS_INT *));
916static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
917 struct coding_system *,
918 int *, EMACS_INT *));
919static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
920static int encode_coding P_ ((struct coding_system *));
921static Lisp_Object make_conversion_work_buffer P_ ((int));
922static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
923static INLINE int char_encodable_p P_ ((int, Lisp_Object));
924static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
925
065e3595
KH
926static void
927record_conversion_result (struct coding_system *coding,
928 enum coding_result_code result)
929{
930 coding->result = result;
931 switch (result)
932 {
933 case CODING_RESULT_INSUFFICIENT_SRC:
934 Vlast_code_conversion_error = Qinsufficient_source;
935 break;
936 case CODING_RESULT_INCONSISTENT_EOL:
937 Vlast_code_conversion_error = Qinconsistent_eol;
938 break;
939 case CODING_RESULT_INVALID_SRC:
940 Vlast_code_conversion_error = Qinvalid_source;
941 break;
942 case CODING_RESULT_INTERRUPT:
943 Vlast_code_conversion_error = Qinterrupted;
944 break;
945 case CODING_RESULT_INSUFFICIENT_MEM:
946 Vlast_code_conversion_error = Qinsufficient_memory;
947 break;
35befdaa
KH
948 default:
949 Vlast_code_conversion_error = intern ("Unknown error");
065e3595
KH
950 }
951}
952
df7492f9
KH
/* Set C to DECODE_CHAR (CHARSET, CODE).  The flag charset_map_loaded
   is cleared first; if it is set after the call, CODING's source
   address is recomputed with coding_set_source and SRC, SRC_BASE and
   SRC_END are shifted by the distance the source moved.

   All macro arguments are parenthesized where they are used as
   operands so that non-trivial argument expressions expand safely.  */
#define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
  do {                                                                  \
    charset_map_loaded = 0;                                             \
    (c) = DECODE_CHAR (charset, code);                                  \
    if (charset_map_loaded)                                             \
      {                                                                 \
        const unsigned char *orig = (coding)->source;                   \
        EMACS_INT offset;                                               \
                                                                        \
        coding_set_source (coding);                                     \
        offset = (coding)->source - orig;                               \
        (src) += offset;                                                \
        (src_base) += offset;                                           \
        (src_end) += offset;                                            \
      }                                                                 \
  } while (0)
969
970
119852e7
KH
/* If there is not at least BYTES bytes of room at dst, allocate more
   memory for coding->destination and update dst and dst_end.  We
   don't have to take care of coding->source, which may be relocated;
   that is handled by calling coding_set_source in encode_coding.  */
975
df7492f9
KH
/* Grow the destination when writing BYTES more bytes at `dst' would
   reach `dst_end'.  The amount requested from alloc_destination is
   BYTES plus one byte per element still pending in the caller's
   `charbuf'; `dst' and `dst_end' are refreshed afterwards.  */
#define ASSURE_DESTINATION(bytes)                               \
  do {                                                          \
    if (dst_end <= dst + (bytes))                               \
      {                                                         \
        int more_bytes = charbuf_end - charbuf + (bytes);       \
                                                                \
        dst = alloc_destination (coding, more_bytes, dst);      \
        dst_end = coding->destination + coding->dst_bytes;      \
      }                                                         \
  } while (0)
aa72b389 986
aa72b389 987
db274c7a
KH
988/* Store multibyte form of the character C in P, and advance P to the
989 end of the multibyte form. This is like CHAR_STRING_ADVANCE but it
990 never calls MAYBE_UNIFY_CHAR. */
991
/* Write the multibyte form of character C at P and advance P past it.
   The form is chosen by comparing C with MAX_1_BYTE_CHAR ...
   MAX_5_BYTE_CHAR; anything larger is written with BYTE8_STRING.

   C is parenthesized at every use so that an argument expression
   containing low-precedence operators (e.g. `a | b') is shifted and
   masked as a whole.  Note that C is still evaluated several times,
   so it must not have side effects.  */
#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)              \
  do {                                                  \
    if ((c) <= MAX_1_BYTE_CHAR)                         \
      *(p)++ = (c);                                     \
    else if ((c) <= MAX_2_BYTE_CHAR)                    \
      *(p)++ = (0xC0 | ((c) >> 6)),                     \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_3_BYTE_CHAR)                    \
      *(p)++ = (0xE0 | ((c) >> 12)),                    \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_4_BYTE_CHAR)                    \
      *(p)++ = (0xF0 | ((c) >> 18)),                    \
        *(p)++ = (0x80 | (((c) >> 12) & 0x3F)),         \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_5_BYTE_CHAR)                    \
      *(p)++ = 0xF8,                                    \
        *(p)++ = (0x80 | (((c) >> 18) & 0x0F)),         \
        *(p)++ = (0x80 | (((c) >> 12) & 0x3F)),         \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else                                                \
      (p) += BYTE8_STRING ((c) - 0x3FFF80, p);          \
  } while (0)
1017
1018
1019/* Return the character code of character whose multibyte form is at
1020 P, and advance P to the end of the multibyte form. This is like
1021 STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR. */
1022
/* Decode the multibyte form starting at P, advance P past it and
   yield the character code.  The length (1 to 5 bytes) is selected by
   testing bits 0x80, 0x20, 0x10 and 0x08 of the first byte.  For a
   two-byte form whose leading byte is below 0xC2, 0x3FFF80 is OR-ed
   into the result.  */
#define STRING_CHAR_ADVANCE_NO_UNIFY(p)                         \
  (((p)[0] & 0x80) == 0                                         \
   ? *(p)++                                                     \
   : ((p)[0] & 0x20) == 0                                       \
   ? ((p) += 2,                                                 \
      (((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)        \
       | (((p)[-2] & 0x1F) << 6)                                \
       | ((p)[-1] & 0x3F)))                                     \
   : ((p)[0] & 0x10) == 0                                       \
   ? ((p) += 3,                                                 \
      ((((p)[-3] & 0x0F) << 12)                                 \
       | (((p)[-2] & 0x3F) << 6)                                \
       | ((p)[-1] & 0x3F)))                                     \
   : ((p)[0] & 0x08) == 0                                       \
   ? ((p) += 4,                                                 \
      ((((p)[-4] & 0xF) << 18)                                  \
       | (((p)[-3] & 0x3F) << 12)                               \
       | (((p)[-2] & 0x3F) << 6)                                \
       | ((p)[-1] & 0x3F)))                                     \
   : ((p) += 5,                                                 \
      ((((p)[-4] & 0x3F) << 18)                                 \
       | (((p)[-3] & 0x3F) << 12)                               \
       | (((p)[-2] & 0x3F) << 6)                                \
       | ((p)[-1] & 0x3F))))
1047
aa72b389 1048
df7492f9
KH
1049static void
1050coding_set_source (coding)
aa72b389 1051 struct coding_system *coding;
aa72b389 1052{
df7492f9
KH
1053 if (BUFFERP (coding->src_object))
1054 {
2cb26057 1055 struct buffer *buf = XBUFFER (coding->src_object);
aa72b389 1056
df7492f9 1057 if (coding->src_pos < 0)
2cb26057 1058 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
df7492f9 1059 else
2cb26057 1060 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
aa72b389 1061 }
df7492f9 1062 else if (STRINGP (coding->src_object))
aa72b389 1063 {
8f924df7 1064 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
aa72b389 1065 }
df7492f9
KH
1066 else
1067 /* Otherwise, the source is C string and is never relocated
1068 automatically. Thus we don't have to update anything. */
1069 ;
1070}
aa72b389 1071
df7492f9
KH
1072static void
1073coding_set_destination (coding)
1074 struct coding_system *coding;
1075{
1076 if (BUFFERP (coding->dst_object))
aa72b389 1077 {
df7492f9 1078 if (coding->src_pos < 0)
aa72b389 1079 {
13818c30 1080 coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
28f67a95
KH
1081 coding->dst_bytes = (GAP_END_ADDR
1082 - (coding->src_bytes - coding->consumed)
1083 - coding->destination);
aa72b389 1084 }
df7492f9 1085 else
28f67a95
KH
1086 {
1087 /* We are sure that coding->dst_pos_byte is before the gap
1088 of the buffer. */
1089 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
13818c30 1090 + coding->dst_pos_byte - BEG_BYTE);
28f67a95
KH
1091 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1092 - coding->destination);
1093 }
df7492f9
KH
1094 }
1095 else
1096 /* Otherwise, the destination is C string and is never relocated
1097 automatically. Thus we don't have to update anything. */
1098 ;
1099}
1100
1101
1102static void
1103coding_alloc_by_realloc (coding, bytes)
1104 struct coding_system *coding;
1105 EMACS_INT bytes;
1106{
1107 coding->destination = (unsigned char *) xrealloc (coding->destination,
1108 coding->dst_bytes + bytes);
1109 coding->dst_bytes += bytes;
1110}
1111
1112static void
db274c7a 1113coding_alloc_by_making_gap (coding, gap_head_used, bytes)
df7492f9 1114 struct coding_system *coding;
db274c7a 1115 EMACS_INT gap_head_used, bytes;
df7492f9 1116{
db274c7a 1117 if (EQ (coding->src_object, coding->dst_object))
df7492f9 1118 {
db274c7a
KH
1119 /* The gap may contain the produced data at the head and not-yet
1120 consumed data at the tail. To preserve those data, we at
1121 first make the gap size to zero, then increase the gap
1122 size. */
1123 EMACS_INT add = GAP_SIZE;
1124
1125 GPT += gap_head_used, GPT_BYTE += gap_head_used;
1126 GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
df7492f9
KH
1127 make_gap (bytes);
1128 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
db274c7a 1129 GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
df7492f9 1130 }
730fff51 1131 else
df7492f9 1132 {
2c78b7e1
KH
1133 Lisp_Object this_buffer;
1134
1135 this_buffer = Fcurrent_buffer ();
df7492f9
KH
1136 set_buffer_internal (XBUFFER (coding->dst_object));
1137 make_gap (bytes);
1138 set_buffer_internal (XBUFFER (this_buffer));
aa72b389 1139 }
df7492f9 1140}
8f924df7 1141
df7492f9
KH
1142
1143static unsigned char *
1144alloc_destination (coding, nbytes, dst)
1145 struct coding_system *coding;
3e139625 1146 EMACS_INT nbytes;
df7492f9
KH
1147 unsigned char *dst;
1148{
1149 EMACS_INT offset = dst - coding->destination;
1150
1151 if (BUFFERP (coding->dst_object))
db274c7a
KH
1152 {
1153 struct buffer *buf = XBUFFER (coding->dst_object);
1154
1155 coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1156 }
aa72b389 1157 else
df7492f9 1158 coding_alloc_by_realloc (coding, nbytes);
065e3595 1159 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
1160 coding_set_destination (coding);
1161 dst = coding->destination + offset;
1162 return dst;
1163}
aa72b389 1164
ff0dacd7
KH
1165/** Macros for annotations. */
1166
1167/* Maximum length of annotation data (sum of annotations for
1168 composition and charset). */
69a80ea3 1169#define MAX_ANNOTATION_LENGTH (4 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 4)
ff0dacd7
KH
1170
1171/* An annotation data is stored in the array coding->charbuf in this
1172 format:
69a80ea3 1173 [ -LENGTH ANNOTATION_MASK NCHARS ... ]
ff0dacd7
KH
1174 LENGTH is the number of elements in the annotation.
1175 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
69a80ea3 1176 NCHARS is the number of characters in the text annotated.
ff0dacd7
KH
1177
1178 The format of the following elements depend on ANNOTATION_MASK.
1179
   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
     ... METHOD [ COMPOSITION-COMPONENTS ... ]
   METHOD is one of enum composition_method.
   Optional COMPOSITION-COMPONENTS are characters and composition
   rules.
1186
1187 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1188 follows. */
1189
/* Append an annotation header to BUF: the negated length LEN, the
   annotation MASK and the character count NCHARS, advancing BUF by
   three elements, and mark `coding' as annotated.

   The expansion deliberately does not end in a semicolon, so the
   macro can be used as a single statement, including as the body of
   an unbraced `if' that has an `else'.  */
#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)

/* Append a composition annotation to BUF: the common header (with a
   length of 4) followed by the composition METHOD.  */
#define ADD_COMPOSITION_DATA(buf, nchars, method)                           \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (method);                                                    \
  } while (0)


/* Append a charset annotation to BUF: the common header (with a
   length of 4) followed by the charset ID.  */
#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1210
df7492f9
KH
1211\f
1212/*** 2. Emacs' internal format (emacs-utf-8) ***/
1213
1214
1215
1216\f
1217/*** 3. UTF-8 ***/
1218
1219/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
1220 Check if a text is encoded in UTF-8. If it is, return 1, else
1221 return 0. */
df7492f9
KH
1222
/* Predicates on a single octet C of UTF-8 text.  */

/* C is below 0x80.  */
#define UTF_8_1_OCTET_P(c)              ((c) < 0x80)
/* C has the bit pattern 10xxxxxx.  */
#define UTF_8_EXTRA_OCTET_P(c)          (((c) & 0xC0) == 0x80)
/* C has the bit pattern 110xxxxx.  */
#define UTF_8_2_OCTET_LEADING_P(c)      (((c) & 0xE0) == 0xC0)
/* C has the bit pattern 1110xxxx.  */
#define UTF_8_3_OCTET_LEADING_P(c)      (((c) & 0xF0) == 0xE0)
/* C has the bit pattern 11110xxx.  */
#define UTF_8_4_OCTET_LEADING_P(c)      (((c) & 0xF8) == 0xF0)
/* C has the bit pattern 111110xx.  */
#define UTF_8_5_OCTET_LEADING_P(c)      (((c) & 0xFC) == 0xF8)

/* The byte order mark character and the three octets of its UTF-8
   encoding.  */
#define UTF_BOM         0xFEFF
#define UTF_8_BOM_1     0xEF
#define UTF_8_BOM_2     0xBB
#define UTF_8_BOM_3     0xBF
1234
df7492f9 1235static int
ff0dacd7 1236detect_coding_utf_8 (coding, detect_info)
df7492f9 1237 struct coding_system *coding;
ff0dacd7 1238 struct coding_detection_info *detect_info;
df7492f9 1239{
065e3595 1240 const unsigned char *src = coding->source, *src_base;
8f924df7 1241 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
1242 int multibytep = coding->src_multibyte;
1243 int consumed_chars = 0;
a470d443 1244 int bom_found = 0;
df7492f9
KH
1245 int found = 0;
1246
ff0dacd7 1247 detect_info->checked |= CATEGORY_MASK_UTF_8;
df7492f9
KH
1248 /* A coding system of this category is always ASCII compatible. */
1249 src += coding->head_ascii;
1250
1251 while (1)
aa72b389 1252 {
df7492f9 1253 int c, c1, c2, c3, c4;
aa72b389 1254
065e3595 1255 src_base = src;
df7492f9 1256 ONE_MORE_BYTE (c);
065e3595 1257 if (c < 0 || UTF_8_1_OCTET_P (c))
df7492f9
KH
1258 continue;
1259 ONE_MORE_BYTE (c1);
065e3595 1260 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
df7492f9
KH
1261 break;
1262 if (UTF_8_2_OCTET_LEADING_P (c))
aa72b389 1263 {
a470d443 1264 found = 1;
df7492f9 1265 continue;
aa72b389 1266 }
df7492f9 1267 ONE_MORE_BYTE (c2);
065e3595 1268 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
df7492f9
KH
1269 break;
1270 if (UTF_8_3_OCTET_LEADING_P (c))
aa72b389 1271 {
a470d443
KH
1272 found = 1;
1273 if (src_base == coding->source
1274 && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1275 bom_found = 1;
df7492f9 1276 continue;
aa72b389 1277 }
df7492f9 1278 ONE_MORE_BYTE (c3);
065e3595 1279 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
df7492f9
KH
1280 break;
1281 if (UTF_8_4_OCTET_LEADING_P (c))
aa72b389 1282 {
a470d443 1283 found = 1;
df7492f9
KH
1284 continue;
1285 }
1286 ONE_MORE_BYTE (c4);
065e3595 1287 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
df7492f9
KH
1288 break;
1289 if (UTF_8_5_OCTET_LEADING_P (c))
1290 {
a470d443 1291 found = 1;
df7492f9
KH
1292 continue;
1293 }
1294 break;
aa72b389 1295 }
ff0dacd7 1296 detect_info->rejected |= CATEGORY_MASK_UTF_8;
df7492f9 1297 return 0;
aa72b389 1298
df7492f9 1299 no_more_source:
065e3595 1300 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
aa72b389 1301 {
ff0dacd7 1302 detect_info->rejected |= CATEGORY_MASK_UTF_8;
89528eb3 1303 return 0;
aa72b389 1304 }
a470d443
KH
1305 if (bom_found)
1306 {
1307 /* The first character 0xFFFE doesn't necessarily mean a BOM. */
1308 detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1309 }
1310 else
1311 {
1312 detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
0e17387a
KH
1313 if (found)
1314 detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
a470d443 1315 }
ff0dacd7 1316 return 1;
aa72b389
KH
1317}
1318
4ed46869 1319
b73bfc1c 1320static void
df7492f9 1321decode_coding_utf_8 (coding)
b73bfc1c 1322 struct coding_system *coding;
b73bfc1c 1323{
8f924df7
KH
1324 const unsigned char *src = coding->source + coding->consumed;
1325 const unsigned char *src_end = coding->source + coding->src_bytes;
1326 const unsigned char *src_base;
69a80ea3
KH
1327 int *charbuf = coding->charbuf + coding->charbuf_used;
1328 int *charbuf_end = coding->charbuf + coding->charbuf_size;
453b38f0 1329 int consumed_chars = 0, consumed_chars_base = 0;
df7492f9 1330 int multibytep = coding->src_multibyte;
a470d443 1331 enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
24a73b0a 1332 Lisp_Object attr, charset_list;
119852e7
KH
1333 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1334 int byte_after_cr = -1;
4ed46869 1335
24a73b0a 1336 CODING_GET_INFO (coding, attr, charset_list);
df7492f9 1337
a470d443
KH
1338 if (bom != utf_without_bom)
1339 {
1340 int c1, c2, c3;
1341
1342 src_base = src;
1343 ONE_MORE_BYTE (c1);
1344 if (! UTF_8_3_OCTET_LEADING_P (c1))
1345 src = src_base;
1346 else
1347 {
159bd5a2 1348 ONE_MORE_BYTE (c2);
a470d443
KH
1349 if (! UTF_8_EXTRA_OCTET_P (c2))
1350 src = src_base;
1351 else
1352 {
159bd5a2 1353 ONE_MORE_BYTE (c3);
a470d443
KH
1354 if (! UTF_8_EXTRA_OCTET_P (c3))
1355 src = src_base;
1356 else
1357 {
1358 if ((c1 != UTF_8_BOM_1)
1359 || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1360 src = src_base;
1361 else
1362 CODING_UTF_8_BOM (coding) = utf_without_bom;
1363 }
1364 }
1365 }
1366 }
1367 CODING_UTF_8_BOM (coding) = utf_without_bom;
1368
1369
1370
df7492f9 1371 while (1)
b73bfc1c 1372 {
df7492f9 1373 int c, c1, c2, c3, c4, c5;
ec6d2bb8 1374
df7492f9
KH
1375 src_base = src;
1376 consumed_chars_base = consumed_chars;
4af310db 1377
df7492f9
KH
1378 if (charbuf >= charbuf_end)
1379 break;
1380
119852e7
KH
1381 if (byte_after_cr >= 0)
1382 c1 = byte_after_cr, byte_after_cr = -1;
1383 else
1384 ONE_MORE_BYTE (c1);
065e3595
KH
1385 if (c1 < 0)
1386 {
1387 c = - c1;
1388 }
1389 else if (UTF_8_1_OCTET_P(c1))
df7492f9 1390 {
119852e7
KH
1391 if (eol_crlf && c1 == '\r')
1392 ONE_MORE_BYTE (byte_after_cr);
df7492f9 1393 c = c1;
4af310db 1394 }
df7492f9 1395 else
4af310db 1396 {
df7492f9 1397 ONE_MORE_BYTE (c2);
065e3595 1398 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
df7492f9
KH
1399 goto invalid_code;
1400 if (UTF_8_2_OCTET_LEADING_P (c1))
4af310db 1401 {
b0edb2c5
DL
1402 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1403 /* Reject overlong sequences here and below. Encoders
1404 producing them are incorrect, they can be misleading,
1405 and they mess up read/write invariance. */
1406 if (c < 128)
1407 goto invalid_code;
4af310db 1408 }
df7492f9 1409 else
aa72b389 1410 {
df7492f9 1411 ONE_MORE_BYTE (c3);
065e3595 1412 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
df7492f9
KH
1413 goto invalid_code;
1414 if (UTF_8_3_OCTET_LEADING_P (c1))
b0edb2c5
DL
1415 {
1416 c = (((c1 & 0xF) << 12)
1417 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
72fe1301
DL
1418 if (c < 0x800
1419 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
b0edb2c5
DL
1420 goto invalid_code;
1421 }
df7492f9
KH
1422 else
1423 {
1424 ONE_MORE_BYTE (c4);
065e3595 1425 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
df7492f9
KH
1426 goto invalid_code;
1427 if (UTF_8_4_OCTET_LEADING_P (c1))
b0edb2c5 1428 {
df7492f9
KH
1429 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1430 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
b0edb2c5
DL
1431 if (c < 0x10000)
1432 goto invalid_code;
1433 }
df7492f9
KH
1434 else
1435 {
1436 ONE_MORE_BYTE (c5);
065e3595 1437 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
df7492f9
KH
1438 goto invalid_code;
1439 if (UTF_8_5_OCTET_LEADING_P (c1))
1440 {
1441 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1442 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1443 | (c5 & 0x3F));
b0edb2c5 1444 if ((c > MAX_CHAR) || (c < 0x200000))
df7492f9
KH
1445 goto invalid_code;
1446 }
1447 else
1448 goto invalid_code;
1449 }
1450 }
aa72b389 1451 }
b73bfc1c 1452 }
df7492f9
KH
1453
1454 *charbuf++ = c;
1455 continue;
1456
1457 invalid_code:
1458 src = src_base;
1459 consumed_chars = consumed_chars_base;
1460 ONE_MORE_BYTE (c);
1461 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1462 coding->errors++;
aa72b389
KH
1463 }
1464
df7492f9
KH
1465 no_more_source:
1466 coding->consumed_char += consumed_chars_base;
1467 coding->consumed = src_base - coding->source;
1468 coding->charbuf_used = charbuf - coding->charbuf;
1469}
1470
1471
1472static int
1473encode_coding_utf_8 (coding)
1474 struct coding_system *coding;
1475{
1476 int multibytep = coding->dst_multibyte;
1477 int *charbuf = coding->charbuf;
1478 int *charbuf_end = charbuf + coding->charbuf_used;
1479 unsigned char *dst = coding->destination + coding->produced;
1480 unsigned char *dst_end = coding->destination + coding->dst_bytes;
e19c3639 1481 int produced_chars = 0;
df7492f9
KH
1482 int c;
1483
a470d443
KH
1484 if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1485 {
1486 ASSURE_DESTINATION (3);
1487 EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1488 CODING_UTF_8_BOM (coding) = utf_without_bom;
1489 }
1490
df7492f9 1491 if (multibytep)
aa72b389 1492 {
df7492f9
KH
1493 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1494
1495 while (charbuf < charbuf_end)
b73bfc1c 1496 {
df7492f9 1497 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
8f924df7 1498
df7492f9
KH
1499 ASSURE_DESTINATION (safe_room);
1500 c = *charbuf++;
28f67a95
KH
1501 if (CHAR_BYTE8_P (c))
1502 {
1503 c = CHAR_TO_BYTE8 (c);
1504 EMIT_ONE_BYTE (c);
1505 }
1506 else
1507 {
db274c7a 1508 CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
28f67a95
KH
1509 for (p = str; p < pend; p++)
1510 EMIT_ONE_BYTE (*p);
1511 }
b73bfc1c 1512 }
aa72b389 1513 }
df7492f9
KH
1514 else
1515 {
1516 int safe_room = MAX_MULTIBYTE_LENGTH;
1517
1518 while (charbuf < charbuf_end)
b73bfc1c 1519 {
df7492f9
KH
1520 ASSURE_DESTINATION (safe_room);
1521 c = *charbuf++;
f03caae0
KH
1522 if (CHAR_BYTE8_P (c))
1523 *dst++ = CHAR_TO_BYTE8 (c);
1524 else
db274c7a 1525 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
df7492f9 1526 produced_chars++;
4ed46869
KH
1527 }
1528 }
065e3595 1529 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
1530 coding->produced_char += produced_chars;
1531 coding->produced = dst - coding->destination;
1532 return 0;
4ed46869
KH
1533}
1534
b73bfc1c 1535
df7492f9 1536/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
1537 Check if a text is encoded in one of UTF-16 based coding systems.
1538 If it is, return 1, else return 0. */
aa72b389 1539
df7492f9
KH
/* VAL is a 16-bit unit in the range 0xD800..0xDBFF.  */
#define UTF_16_HIGH_SURROGATE_P(val)    (((val) & 0xFC00) == 0xD800)

/* VAL is a 16-bit unit in the range 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val)     (((val) & 0xFC00) == 0xDC00)

/* VAL is 0xFFFE, 0xFFFF, or a low surrogate.  */
#define UTF_16_INVALID_P(val)           \
  (((val) == 0xFFFE)                    \
   || ((val) == 0xFFFF)                 \
   || UTF_16_LOW_SURROGATE_P (val))
aa72b389 1550
aa72b389 1551
df7492f9 1552static int
ff0dacd7 1553detect_coding_utf_16 (coding, detect_info)
aa72b389 1554 struct coding_system *coding;
ff0dacd7 1555 struct coding_detection_info *detect_info;
aa72b389 1556{
8f924df7
KH
1557 const unsigned char *src = coding->source, *src_base = src;
1558 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
1559 int multibytep = coding->src_multibyte;
1560 int consumed_chars = 0;
1561 int c1, c2;
aa72b389 1562
ff0dacd7 1563 detect_info->checked |= CATEGORY_MASK_UTF_16;
ff0dacd7 1564 if (coding->mode & CODING_MODE_LAST_BLOCK
24a73b0a 1565 && (coding->src_chars & 1))
ff0dacd7
KH
1566 {
1567 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1568 return 0;
1569 }
24a73b0a 1570
df7492f9
KH
1571 ONE_MORE_BYTE (c1);
1572 ONE_MORE_BYTE (c2);
df7492f9 1573 if ((c1 == 0xFF) && (c2 == 0xFE))
aa72b389 1574 {
b49a1807
KH
1575 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1576 | CATEGORY_MASK_UTF_16_AUTO);
24a73b0a
KH
1577 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1578 | CATEGORY_MASK_UTF_16_BE_NOSIG
1579 | CATEGORY_MASK_UTF_16_LE_NOSIG);
aa72b389 1580 }
df7492f9 1581 else if ((c1 == 0xFE) && (c2 == 0xFF))
ff0dacd7 1582 {
b49a1807
KH
1583 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1584 | CATEGORY_MASK_UTF_16_AUTO);
24a73b0a
KH
1585 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1586 | CATEGORY_MASK_UTF_16_BE_NOSIG
1587 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1588 }
2f3cbb32 1589 else
24a73b0a 1590 {
2f3cbb32
KH
1591 /* We check the dispersion of Eth and Oth bytes where E is even and
1592 O is odd. If both are high, we assume binary data.*/
1593 unsigned char e[256], o[256];
1594 unsigned e_num = 1, o_num = 1;
1595
1596 memset (e, 0, 256);
1597 memset (o, 0, 256);
1598 e[c1] = 1;
1599 o[c2] = 1;
1600
24a73b0a
KH
1601 detect_info->rejected
1602 |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
2f3cbb32
KH
1603
1604 while (1)
1605 {
1606 ONE_MORE_BYTE (c1);
1607 ONE_MORE_BYTE (c2);
1608 if (! e[c1])
1609 {
1610 e[c1] = 1;
1611 e_num++;
1612 if (e_num >= 128)
1613 break;
1614 }
1615 if (! o[c2])
1616 {
1617 o[c1] = 1;
1618 o_num++;
1619 if (o_num >= 128)
1620 break;
1621 }
1622 }
1623 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1624 return 0;
ff0dacd7 1625 }
2f3cbb32 1626
df7492f9 1627 no_more_source:
ff0dacd7 1628 return 1;
df7492f9 1629}
aa72b389 1630
df7492f9
KH
1631static void
1632decode_coding_utf_16 (coding)
1633 struct coding_system *coding;
1634{
8f924df7
KH
1635 const unsigned char *src = coding->source + coding->consumed;
1636 const unsigned char *src_end = coding->source + coding->src_bytes;
1637 const unsigned char *src_base;
69a80ea3
KH
1638 int *charbuf = coding->charbuf + coding->charbuf_used;
1639 int *charbuf_end = coding->charbuf + coding->charbuf_size;
3a8406e1 1640 int consumed_chars = 0, consumed_chars_base = 0;
df7492f9 1641 int multibytep = coding->src_multibyte;
a470d443 1642 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
df7492f9
KH
1643 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1644 int surrogate = CODING_UTF_16_SURROGATE (coding);
24a73b0a 1645 Lisp_Object attr, charset_list;
119852e7
KH
1646 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1647 int byte_after_cr1 = -1, byte_after_cr2 = -1;
df7492f9 1648
24a73b0a 1649 CODING_GET_INFO (coding, attr, charset_list);
df7492f9 1650
a470d443 1651 if (bom == utf_with_bom)
aa72b389 1652 {
df7492f9 1653 int c, c1, c2;
4af310db 1654
aa72b389 1655 src_base = src;
df7492f9
KH
1656 ONE_MORE_BYTE (c1);
1657 ONE_MORE_BYTE (c2);
e19c3639 1658 c = (c1 << 8) | c2;
aa72b389 1659
b49a1807
KH
1660 if (endian == utf_16_big_endian
1661 ? c != 0xFEFF : c != 0xFFFE)
aa72b389 1662 {
b49a1807
KH
1663 /* The first two bytes are not BOM. Treat them as bytes
1664 for a normal character. */
1665 src = src_base;
1666 coding->errors++;
aa72b389 1667 }
a470d443 1668 CODING_UTF_16_BOM (coding) = utf_without_bom;
b49a1807 1669 }
a470d443 1670 else if (bom == utf_detect_bom)
b49a1807
KH
1671 {
1672 /* We have already tried to detect BOM and failed in
1673 detect_coding. */
a470d443 1674 CODING_UTF_16_BOM (coding) = utf_without_bom;
df7492f9 1675 }
aa72b389 1676
df7492f9
KH
1677 while (1)
1678 {
1679 int c, c1, c2;
1680
1681 src_base = src;
1682 consumed_chars_base = consumed_chars;
1683
1684 if (charbuf + 2 >= charbuf_end)
1685 break;
1686
119852e7
KH
1687 if (byte_after_cr1 >= 0)
1688 c1 = byte_after_cr1, byte_after_cr1 = -1;
1689 else
1690 ONE_MORE_BYTE (c1);
065e3595
KH
1691 if (c1 < 0)
1692 {
1693 *charbuf++ = -c1;
1694 continue;
1695 }
119852e7
KH
1696 if (byte_after_cr2 >= 0)
1697 c2 = byte_after_cr2, byte_after_cr2 = -1;
1698 else
1699 ONE_MORE_BYTE (c2);
065e3595
KH
1700 if (c2 < 0)
1701 {
1702 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1703 *charbuf++ = -c2;
1704 continue;
1705 }
df7492f9 1706 c = (endian == utf_16_big_endian
e19c3639 1707 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
119852e7 1708
df7492f9 1709 if (surrogate)
fd3ae0b9 1710 {
df7492f9 1711 if (! UTF_16_LOW_SURROGATE_P (c))
fd3ae0b9 1712 {
df7492f9
KH
1713 if (endian == utf_16_big_endian)
1714 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1715 else
1716 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1717 *charbuf++ = c1;
1718 *charbuf++ = c2;
1719 coding->errors++;
1720 if (UTF_16_HIGH_SURROGATE_P (c))
1721 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
fd3ae0b9 1722 else
df7492f9 1723 *charbuf++ = c;
fd3ae0b9
KH
1724 }
1725 else
df7492f9
KH
1726 {
1727 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1728 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
29f7ffd0 1729 *charbuf++ = 0x10000 + c;
df7492f9 1730 }
fd3ae0b9 1731 }
aa72b389 1732 else
df7492f9
KH
1733 {
1734 if (UTF_16_HIGH_SURROGATE_P (c))
1735 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1736 else
119852e7
KH
1737 {
1738 if (eol_crlf && c == '\r')
1739 {
1740 ONE_MORE_BYTE (byte_after_cr1);
1741 ONE_MORE_BYTE (byte_after_cr2);
1742 }
1743 *charbuf++ = c;
1744 }
8f924df7 1745 }
aa72b389 1746 }
df7492f9
KH
1747
1748 no_more_source:
1749 coding->consumed_char += consumed_chars_base;
1750 coding->consumed = src_base - coding->source;
1751 coding->charbuf_used = charbuf - coding->charbuf;
aa72b389 1752}
b73bfc1c 1753
df7492f9
KH
1754static int
1755encode_coding_utf_16 (coding)
1756 struct coding_system *coding;
1757{
1758 int multibytep = coding->dst_multibyte;
1759 int *charbuf = coding->charbuf;
1760 int *charbuf_end = charbuf + coding->charbuf_used;
1761 unsigned char *dst = coding->destination + coding->produced;
1762 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1763 int safe_room = 8;
a470d443 1764 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
df7492f9
KH
1765 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1766 int produced_chars = 0;
24a73b0a 1767 Lisp_Object attrs, charset_list;
df7492f9 1768 int c;
4ed46869 1769
24a73b0a 1770 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9 1771
a470d443 1772 if (bom != utf_without_bom)
df7492f9
KH
1773 {
1774 ASSURE_DESTINATION (safe_room);
1775 if (big_endian)
df7492f9 1776 EMIT_TWO_BYTES (0xFE, 0xFF);
880cf180
KH
1777 else
1778 EMIT_TWO_BYTES (0xFF, 0xFE);
a470d443 1779 CODING_UTF_16_BOM (coding) = utf_without_bom;
df7492f9
KH
1780 }
1781
1782 while (charbuf < charbuf_end)
1783 {
1784 ASSURE_DESTINATION (safe_room);
1785 c = *charbuf++;
e19c3639
KH
1786 if (c >= MAX_UNICODE_CHAR)
1787 c = coding->default_char;
df7492f9
KH
1788
1789 if (c < 0x10000)
1790 {
1791 if (big_endian)
1792 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1793 else
1794 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1795 }
1796 else
1797 {
1798 int c1, c2;
1799
1800 c -= 0x10000;
1801 c1 = (c >> 10) + 0xD800;
1802 c2 = (c & 0x3FF) + 0xDC00;
1803 if (big_endian)
1804 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1805 else
1806 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1807 }
1808 }
065e3595 1809 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
1810 coding->produced = dst - coding->destination;
1811 coding->produced_char += produced_chars;
1812 return 0;
1813}
1814
1815\f
1816/*** 6. Old Emacs' internal format (emacs-mule) ***/
1817
1818/* Emacs' internal format for representation of multiple character
1819 sets is a kind of multi-byte encoding, i.e. characters are
1820 represented by variable-length sequences of one-byte codes.
1821
1822 ASCII characters and control characters (e.g. `tab', `newline') are
1823 represented by one-byte sequences which are their ASCII codes, in
1824 the range 0x00 through 0x7F.
1825
1826 8-bit characters of the range 0x80..0x9F are represented by
1827 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1828 code + 0x20).
1829
1830 8-bit characters of the range 0xA0..0xFF are represented by
1831 one-byte sequences which are their 8-bit code.
1832
1833 The other characters are represented by a sequence of `base
1834 leading-code', optional `extended leading-code', and one or two
1835 `position-code's. The length of the sequence is determined by the
1836 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1837 whereas extended leading-code and position-code take the range 0xA0
1838 through 0xFF. See `charset.h' for more details about leading-code
1839 and position-code.
1840
1841 --- CODE RANGE of Emacs' internal format ---
1842 character set range
1843 ------------- -----
1844 ascii 0x00..0x7F
1845 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1846 eight-bit-graphic 0xA0..0xFF
1847 ELSE 0x81..0x9D + [0xA0..0xFF]+
1848 ---------------------------------------------
1849
1850 As this is the internal character representation, the format is
1851 usually not used externally (i.e. in a file or in a data sent to a
1852 process). But, it is possible to have a text externally in this
1853 format (i.e. by encoding by the coding system `emacs-mule').
1854
1855 In that case, a sequence of one-byte codes has a slightly different
1856 form.
1857
1858 At first, all characters in eight-bit-control are represented by
1859 one-byte sequences which are their 8-bit code.
1860
1861 Next, character composition data are represented by the byte
1862 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1863 where,
1864 METHOD is 0xF2 plus one of composition method (enum
1865 composition_method),
1866
1867 BYTES is 0xA0 plus a byte length of this composition data,
1868
1869 CHARS is 0xA0 plus a number of characters composed by this
1870 data,
1871
1872 COMPONENTs are characters of multibyte form or composition
1873 rules encoded by two-byte of ASCII codes.
1874
1875 In addition, for backward compatibility, the following formats are
1876 also recognized as composition data on decoding.
1877
1878 0x80 MSEQ ...
1879 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1880
1881 Here,
1882 MSEQ is a multibyte form but in these special format:
1883 ASCII: 0xA0 ASCII_CODE+0x80,
1884 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1885 RULE is a one byte code of the range 0xA0..0xF0 that
1886 represents a composition rule.
1887 */
1888
/* Table indexed by a byte value.  The users visible in this file treat
   an entry as the total byte length (1 through 4) of the emacs-mule
   sequence introduced by that byte.  */
1889char emacs_mule_bytes[256];
1890
df7492f9 1891int
ff0dacd7 1892emacs_mule_char (coding, src, nbytes, nchars, id)
df7492f9 1893 struct coding_system *coding;
065e3595 1894 const unsigned char *src;
ff0dacd7 1895 int *nbytes, *nchars, *id;
df7492f9 1896{
8f924df7
KH
1897 const unsigned char *src_end = coding->source + coding->src_bytes;
1898 const unsigned char *src_base = src;
df7492f9 1899 int multibytep = coding->src_multibyte;
df7492f9
KH
1900 struct charset *charset;
1901 unsigned code;
1902 int c;
1903 int consumed_chars = 0;
1904
1905 ONE_MORE_BYTE (c);
065e3595 1906 if (c < 0)
df7492f9 1907 {
065e3595
KH
1908 c = -c;
1909 charset = emacs_mule_charset[0];
1910 }
1911 else
1912 {
4d41e8b7
KH
1913 if (c >= 0xA0)
1914 {
b3af4b28 1915 /* Old style component character of a composition. */
4d41e8b7
KH
1916 if (c == 0xA0)
1917 {
1918 ONE_MORE_BYTE (c);
1919 c -= 0x80;
1920 }
1921 else
1922 c -= 0x20;
1923 }
1924
065e3595 1925 switch (emacs_mule_bytes[c])
b73bfc1c 1926 {
065e3595 1927 case 2:
df7492f9
KH
1928 if (! (charset = emacs_mule_charset[c]))
1929 goto invalid_code;
1930 ONE_MORE_BYTE (c);
9ffd559c 1931 if (c < 0xA0)
065e3595 1932 goto invalid_code;
df7492f9 1933 code = c & 0x7F;
065e3595
KH
1934 break;
1935
1936 case 3:
1937 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1938 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1939 {
1940 ONE_MORE_BYTE (c);
9ffd559c 1941 if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
065e3595
KH
1942 goto invalid_code;
1943 ONE_MORE_BYTE (c);
9ffd559c 1944 if (c < 0xA0)
065e3595
KH
1945 goto invalid_code;
1946 code = c & 0x7F;
1947 }
1948 else
1949 {
1950 if (! (charset = emacs_mule_charset[c]))
1951 goto invalid_code;
1952 ONE_MORE_BYTE (c);
9ffd559c 1953 if (c < 0xA0)
065e3595
KH
1954 goto invalid_code;
1955 code = (c & 0x7F) << 8;
1956 ONE_MORE_BYTE (c);
9ffd559c 1957 if (c < 0xA0)
065e3595
KH
1958 goto invalid_code;
1959 code |= c & 0x7F;
1960 }
1961 break;
1962
1963 case 4:
1964 ONE_MORE_BYTE (c);
1965 if (c < 0 || ! (charset = emacs_mule_charset[c]))
df7492f9
KH
1966 goto invalid_code;
1967 ONE_MORE_BYTE (c);
9ffd559c 1968 if (c < 0xA0)
065e3595 1969 goto invalid_code;
781d7a48 1970 code = (c & 0x7F) << 8;
df7492f9 1971 ONE_MORE_BYTE (c);
9ffd559c 1972 if (c < 0xA0)
065e3595 1973 goto invalid_code;
df7492f9 1974 code |= c & 0x7F;
065e3595 1975 break;
df7492f9 1976
065e3595
KH
1977 case 1:
1978 code = c;
1979 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1980 ? charset_ascii : charset_eight_bit);
1981 break;
df7492f9 1982
065e3595
KH
1983 default:
1984 abort ();
1985 }
1986 c = DECODE_CHAR (charset, code);
1987 if (c < 0)
1988 goto invalid_code;
df7492f9 1989 }
df7492f9
KH
1990 *nbytes = src - src_base;
1991 *nchars = consumed_chars;
ff0dacd7
KH
1992 if (id)
1993 *id = charset->id;
df7492f9
KH
1994 return c;
1995
1996 no_more_source:
1997 return -2;
1998
1999 invalid_code:
2000 return -1;
2001}
2002
2003
2004/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
2005 Check if a text is encoded in `emacs-mule'. If it is, return 1,
2006 else return 0. */
df7492f9
KH
2007
2008static int
ff0dacd7 2009detect_coding_emacs_mule (coding, detect_info)
df7492f9 2010 struct coding_system *coding;
ff0dacd7 2011 struct coding_detection_info *detect_info;
df7492f9 2012{
065e3595 2013 const unsigned char *src = coding->source, *src_base;
8f924df7 2014 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
2015 int multibytep = coding->src_multibyte;
2016 int consumed_chars = 0;
2017 int c;
2018 int found = 0;
2019
ff0dacd7 2020 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
df7492f9
KH
2021 /* A coding system of this category is always ASCII compatible. */
2022 src += coding->head_ascii;
2023
2024 while (1)
2025 {
065e3595 2026 src_base = src;
df7492f9 2027 ONE_MORE_BYTE (c);
065e3595
KH
2028 if (c < 0)
2029 continue;
df7492f9
KH
2030 if (c == 0x80)
2031 {
2032 /* Perhaps the start of composite character. We simple skip
2033 it because analyzing it is too heavy for detecting. But,
2034 at least, we check that the composite character
3ed051d4 2035 constitutes of more than 4 bytes. */
8f924df7 2036 const unsigned char *src_base;
df7492f9
KH
2037
2038 repeat:
2039 src_base = src;
2040 do
2041 {
2042 ONE_MORE_BYTE (c);
2043 }
2044 while (c >= 0xA0);
2045
2046 if (src - src_base <= 4)
2047 break;
ff0dacd7 2048 found = CATEGORY_MASK_EMACS_MULE;
df7492f9
KH
2049 if (c == 0x80)
2050 goto repeat;
b73bfc1c 2051 }
df7492f9
KH
2052
2053 if (c < 0x80)
b73bfc1c 2054 {
df7492f9
KH
2055 if (c < 0x20
2056 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2057 break;
2058 }
2059 else
2060 {
0e219d54 2061 int more_bytes = emacs_mule_bytes[*src_base] - 1;
df7492f9 2062
0e219d54 2063 while (more_bytes > 0)
df7492f9
KH
2064 {
2065 ONE_MORE_BYTE (c);
0e219d54
KH
2066 if (c < 0xA0)
2067 {
2068 src--; /* Unread the last byte. */
2069 break;
2070 }
2071 more_bytes--;
df7492f9 2072 }
0e219d54 2073 if (more_bytes != 0)
df7492f9 2074 break;
ff0dacd7 2075 found = CATEGORY_MASK_EMACS_MULE;
4ed46869
KH
2076 }
2077 }
ff0dacd7 2078 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
df7492f9
KH
2079 return 0;
2080
2081 no_more_source:
065e3595 2082 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
89528eb3 2083 {
ff0dacd7 2084 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
89528eb3
KH
2085 return 0;
2086 }
ff0dacd7
KH
2087 detect_info->found |= found;
2088 return 1;
4ed46869
KH
2089}
2090
b73bfc1c 2091
df7492f9
KH
2092/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2093
/* Decode one character that is a component of an Emacs 20/21 style
   composition sequence at SRC, using emacs_mule_char.

   On success, store the character in *BUF, increment BUF, and advance
   SRC and consumed_chars past it.  If SRC is already at src_end, or
   emacs_mule_char returns -2, do nothing at all.  For any other
   negative result, jump to the label `invalid_code'.

   Because the body is wrapped in do ... while (0), the `break's below
   leave only this macro, not a loop in the caller; a caller can see
   that nothing was decoded only by noticing that BUF did not move.  */

#define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf)                         \
  do                                                                    \
    {                                                                   \
      int ch, len, cnt;                                                 \
                                                                        \
      if (src == src_end)                                               \
        break;                                                          \
      ch = emacs_mule_char (coding, src, &len, &cnt, NULL);             \
      if (ch == -2)                                                     \
        break;                                                          \
      if (ch < 0)                                                       \
        goto invalid_code;                                              \
      *buf++ = ch;                                                      \
      src += len;                                                       \
      consumed_chars += cnt;                                            \
    }                                                                   \
  while (0)
df7492f9
KH
2121
2122
/* Decode a composition rule that is a component of an Emacs 20 style
   composition sequence at SRC.  The rule is a single byte; after
   subtracting 0xA0 it must lie in 0..80 and is split into GREF
   (quotient by 9) and NREF (remainder by 9).  Store the encoded rule
   in *BUF and increment BUF.  If no byte is left or the byte is out
   of range, jump to the label `invalid_code'.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf)                      \
  do {                                                                  \
    int rule, gref, nref;                                               \
                                                                        \
    if (src >= src_end)                                                 \
      goto invalid_code;                                                \
    ONE_MORE_BYTE_NO_CHECK (rule);                                      \
    rule -= 0xA0;                                                       \
    if (! (0 <= rule && rule < 81))                                     \
      goto invalid_code;                                                \
                                                                        \
    gref = rule / 9;                                                    \
    nref = rule % 9;                                                    \
    *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);                      \
  } while (0)
2142
2143
/* Decode a composition rule that is a component of an Emacs 21 style
   composition sequence at SRC.  The rule occupies two bytes, GREF and
   NREF, each stored with 0x20 added.  Store the encoded rule in *BUF
   and increment BUF.  If fewer than two bytes are left, or either
   value is outside 0..80, jump to the label `invalid_code'.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf)                      \
  do {                                                                  \
    int gref, nref;                                                     \
                                                                        \
    if (src + 1 >= src_end)                                             \
      goto invalid_code;                                                \
    ONE_MORE_BYTE_NO_CHECK (gref);                                      \
    gref -= 0x20;                                                       \
    ONE_MORE_BYTE_NO_CHECK (nref);                                      \
    nref -= 0x20;                                                       \
    if (! (0 <= gref && gref < 81 && 0 <= nref && nref < 81))           \
      goto invalid_code;                                                \
    *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);                      \
  } while (0)
2164
2165
/* Decode the header (and, except for relative compositions, the
   components) of an Emacs 21 style composition.  C is the METHOD byte
   that has already been read; the macro reads the BYTES and CHARS
   bytes itself and records the composition with ADD_COMPOSITION_DATA.
   Jumps to `invalid_code' on a malformed or truncated sequence.  */

#define DECODE_EMACS_MULE_21_COMPOSITION(c)                             \
  do {                                                                  \
    /* Emacs 21 style format.  C holds METHOD (0xF2 plus the            \
       composition method); the next two bytes are BYTES (0xA0 plus     \
       the byte length of this composition information) and CHARS       \
       (0xA0 plus the number of characters composed).  */               \
    enum composition_method method = c - 0xF2;                          \
    int *charbuf_base = charbuf;                                        \
    int consumed_chars_limit;                                           \
    int nbytes, nchars;                                                 \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    nbytes = c - 0xA0;                                                  \
    if (nbytes < 3)                                                     \
      goto invalid_code;                                                \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    nchars = c - 0xA0;                                                  \
    ADD_COMPOSITION_DATA (charbuf, nchars, method);                     \
    consumed_chars_limit = consumed_chars_base + nbytes;                \
    if (method != COMPOSITION_RELATIVE)                                 \
      {                                                                 \
        int i = 0;                                                      \
        while (consumed_chars < consumed_chars_limit)                   \
          {                                                             \
            if (i % 2 && method != COMPOSITION_WITH_ALTCHARS)           \
              DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf);          \
            else                                                        \
              {                                                         \
                /* The char macro silently does nothing when the        \
                   source is exhausted.  Leave the loop then instead    \
                   of spinning; the check after the loop turns the      \
                   short sequence into invalid_code.  */                \
                int *charbuf_before = charbuf;                          \
                                                                        \
                DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf);           \
                if (charbuf == charbuf_before)                          \
                  break;                                                \
              }                                                         \
            i++;                                                        \
          }                                                             \
        if (consumed_chars < consumed_chars_limit)                      \
          goto invalid_code;                                            \
        /* Account for the I components stored after the header.  */    \
        charbuf_base[0] -= i;                                           \
      }                                                                 \
  } while (0)
93dec019 2205
aa72b389 2206
/* Decode an Emacs 20 style relative composition.  SRC is rewound to
   src_base, the leading 0x80 is skipped, and up to
   MAX_COMPOSITION_COMPONENTS component characters (bytes >= 0xA0) are
   decoded.  At least two components are required, otherwise control
   jumps to `invalid_code'.  The composition is then recorded with
   ADD_COMPOSITION_DATA followed by the component characters.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c)                    \
  do {                                                                  \
    /* Emacs 20 style format for relative composition.  */              \
    /* Store multibyte form of characters to be composed.  */           \
    enum composition_method method = COMPOSITION_RELATIVE;              \
    int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];                 \
    int *buf = components;                                              \
    int i, j;                                                           \
                                                                        \
    src = src_base;                                                     \
    ONE_MORE_BYTE (c);          /* skip 0x80 */                         \
    for (i = 0;                                                         \
         i < MAX_COMPOSITION_COMPONENTS                                 \
           && src < src_end /* never look at the byte past the end */   \
           && *src >= 0xA0;                                             \
         i++)                                                           \
      {                                                                 \
        int *buf_before = buf;                                          \
                                                                        \
        DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                       \
        /* Nothing stored means the source ran out; do not count the    \
           attempt, so that I matches the entries in COMPONENTS.  */    \
        if (buf == buf_before)                                          \
          break;                                                        \
      }                                                                 \
    if (i < 2)                                                          \
      goto invalid_code;                                                \
    ADD_COMPOSITION_DATA (charbuf, i, method);                          \
    for (j = 0; j < i; j++)                                             \
      *charbuf++ = components[j];                                       \
  } while (0)
2226
2227
/* Decode an Emacs 20 style rule-base composition: a component
   character followed by (rule, character) pairs while the next byte
   is >= 0xA0.  Jumps to `invalid_code' if fewer than two characters
   were decoded or the sequence does not end with a character, and to
   `no_more_source' if charbuf lacks room.  On success the composition
   is recorded with ADD_COMPOSITION_DATA, followed by the interleaved
   characters and rules, followed by the characters alone.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c)                    \
  do {                                                                  \
    /* Emacs 20 style format for rule-base composition.  */             \
    /* Store multibyte form of characters to be composed.  */           \
    enum composition_method method = COMPOSITION_WITH_RULE;             \
    int *charbuf_base = charbuf;                                        \
    int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];                 \
    int *buf = components;                                              \
    int i, j;                                                           \
                                                                        \
    DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                           \
    /* The char macro stores nothing when the source is exhausted;      \
       without a first character there is no valid composition.  */     \
    if (buf == components)                                              \
      goto invalid_code;                                                \
    for (i = 1; i < MAX_COMPOSITION_COMPONENTS; i++)                    \
      {                                                                 \
        int *buf_before;                                                \
                                                                        \
        /* Never look at the byte past the end of the source.  */       \
        if (src >= src_end || *src < 0xA0)                              \
          break;                                                        \
        DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf);                    \
        buf_before = buf;                                               \
        DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                       \
        /* A rule with no character after it: stop here.  The parity    \
           check below then rejects the sequence.  */                   \
        if (buf == buf_before)                                          \
          break;                                                        \
      }                                                                 \
    if (i <= 1 || (buf - components) % 2 == 0)                          \
      goto invalid_code;                                                \
    if (charbuf + i + (i / 2) + 1 >= charbuf_end)                       \
      goto no_more_source;                                              \
    ADD_COMPOSITION_DATA (charbuf, i, method);                          \
    /* I characters are interleaved with I - 1 rules.  */               \
    i = i * 2 - 1;                                                      \
    for (j = 0; j < i; j++)                                             \
      *charbuf++ = components[j];                                       \
    charbuf_base[0] -= i;                                               \
    /* Then the characters alone (the even-indexed entries).  */        \
    for (j = 0; j < i; j += 2)                                          \
      *charbuf++ = components[j];                                       \
  } while (0)
2258
aa72b389
KH
2259
2260static void
df7492f9 2261decode_coding_emacs_mule (coding)
aa72b389 2262 struct coding_system *coding;
aa72b389 2263{
8f924df7
KH
2264 const unsigned char *src = coding->source + coding->consumed;
2265 const unsigned char *src_end = coding->source + coding->src_bytes;
2266 const unsigned char *src_base;
69a80ea3
KH
2267 int *charbuf = coding->charbuf + coding->charbuf_used;
2268 int *charbuf_end
2269 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
df7492f9 2270 int consumed_chars = 0, consumed_chars_base;
df7492f9 2271 int multibytep = coding->src_multibyte;
24a73b0a 2272 Lisp_Object attrs, charset_list;
ff0dacd7
KH
2273 int char_offset = coding->produced_char;
2274 int last_offset = char_offset;
2275 int last_id = charset_ascii;
119852e7
KH
2276 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2277 int byte_after_cr = -1;
aa72b389 2278
24a73b0a 2279 CODING_GET_INFO (coding, attrs, charset_list);
aa72b389 2280
aa72b389
KH
2281 while (1)
2282 {
df7492f9
KH
2283 int c;
2284
aa72b389 2285 src_base = src;
df7492f9
KH
2286 consumed_chars_base = consumed_chars;
2287
2288 if (charbuf >= charbuf_end)
2289 break;
aa72b389 2290
119852e7
KH
2291 if (byte_after_cr >= 0)
2292 c = byte_after_cr, byte_after_cr = -1;
2293 else
2294 ONE_MORE_BYTE (c);
065e3595
KH
2295 if (c < 0)
2296 {
2297 *charbuf++ = -c;
2298 char_offset++;
2299 }
2300 else if (c < 0x80)
aa72b389 2301 {
119852e7
KH
2302 if (eol_crlf && c == '\r')
2303 ONE_MORE_BYTE (byte_after_cr);
df7492f9
KH
2304 *charbuf++ = c;
2305 char_offset++;
aa72b389 2306 }
df7492f9
KH
2307 else if (c == 0x80)
2308 {
df7492f9 2309 ONE_MORE_BYTE (c);
065e3595
KH
2310 if (c < 0)
2311 goto invalid_code;
781d7a48
KH
2312 if (c - 0xF2 >= COMPOSITION_RELATIVE
2313 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
df7492f9
KH
2314 DECODE_EMACS_MULE_21_COMPOSITION (c);
2315 else if (c < 0xC0)
2316 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
2317 else if (c == 0xFF)
2318 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
2319 else
2320 goto invalid_code;
2321 }
2322 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
2323 {
2324 int nbytes, nchars;
ff0dacd7
KH
2325 int id;
2326
781d7a48
KH
2327 src = src_base;
2328 consumed_chars = consumed_chars_base;
ff0dacd7 2329 c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
df7492f9
KH
2330 if (c < 0)
2331 {
2332 if (c == -2)
2333 break;
2334 goto invalid_code;
2335 }
ff0dacd7
KH
2336 if (last_id != id)
2337 {
2338 if (last_id != charset_ascii)
69a80ea3 2339 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
ff0dacd7
KH
2340 last_id = id;
2341 last_offset = char_offset;
2342 }
df7492f9 2343 *charbuf++ = c;
781d7a48
KH
2344 src += nbytes;
2345 consumed_chars += nchars;
df7492f9
KH
2346 char_offset++;
2347 }
4d41e8b7
KH
2348 else
2349 goto invalid_code;
df7492f9
KH
2350 continue;
2351
2352 invalid_code:
2353 src = src_base;
2354 consumed_chars = consumed_chars_base;
2355 ONE_MORE_BYTE (c);
2356 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 2357 char_offset++;
df7492f9
KH
2358 coding->errors++;
2359 }
2360
2361 no_more_source:
ff0dacd7 2362 if (last_id != charset_ascii)
69a80ea3 2363 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
2364 coding->consumed_char += consumed_chars_base;
2365 coding->consumed = src_base - coding->source;
2366 coding->charbuf_used = charbuf - coding->charbuf;
2367}
2368
2369
/* Store in CODES[0] and CODES[1] the leading code(s) used for the
   emacs-mule charset id ID.  An ID below 0xA0 is its own single
   leading code and CODES[1] is set to 0; otherwise CODES[0] is one of
   0x9A..0x9D chosen by the range ID falls in, and CODES[1] is ID.
   ID is evaluated more than once.  The expansion is a single
   statement without a trailing semicolon, so it can be used safely
   as the body of an if/else.  */

#define EMACS_MULE_LEADING_CODES(id, codes)     \
  do {                                          \
    if ((id) < 0xA0)                            \
      (codes)[0] = (id), (codes)[1] = 0;        \
    else if ((id) < 0xE0)                       \
      (codes)[0] = 0x9A, (codes)[1] = (id);     \
    else if ((id) < 0xF0)                       \
      (codes)[0] = 0x9B, (codes)[1] = (id);     \
    else if ((id) < 0xF5)                       \
      (codes)[0] = 0x9C, (codes)[1] = (id);     \
    else                                        \
      (codes)[0] = 0x9D, (codes)[1] = (id);     \
  } while (0)
2383
aa72b389 2384
df7492f9
KH
2385static int
2386encode_coding_emacs_mule (coding)
2387 struct coding_system *coding;
2388{
2389 int multibytep = coding->dst_multibyte;
2390 int *charbuf = coding->charbuf;
2391 int *charbuf_end = charbuf + coding->charbuf_used;
2392 unsigned char *dst = coding->destination + coding->produced;
2393 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2394 int safe_room = 8;
df7492f9 2395 int produced_chars = 0;
24a73b0a 2396 Lisp_Object attrs, charset_list;
df7492f9 2397 int c;
ff0dacd7 2398 int preferred_charset_id = -1;
df7492f9 2399
24a73b0a 2400 CODING_GET_INFO (coding, attrs, charset_list);
eccb6815
KH
2401 if (! EQ (charset_list, Vemacs_mule_charset_list))
2402 {
2403 CODING_ATTR_CHARSET_LIST (attrs)
2404 = charset_list = Vemacs_mule_charset_list;
2405 }
df7492f9
KH
2406
2407 while (charbuf < charbuf_end)
2408 {
2409 ASSURE_DESTINATION (safe_room);
2410 c = *charbuf++;
ff0dacd7
KH
2411
2412 if (c < 0)
2413 {
2414 /* Handle an annotation. */
2415 switch (*charbuf)
2416 {
2417 case CODING_ANNOTATE_COMPOSITION_MASK:
2418 /* Not yet implemented. */
2419 break;
2420 case CODING_ANNOTATE_CHARSET_MASK:
2421 preferred_charset_id = charbuf[3];
2422 if (preferred_charset_id >= 0
2423 && NILP (Fmemq (make_number (preferred_charset_id),
2424 charset_list)))
2425 preferred_charset_id = -1;
2426 break;
2427 default:
2428 abort ();
2429 }
2430 charbuf += -c - 1;
2431 continue;
2432 }
2433
df7492f9
KH
2434 if (ASCII_CHAR_P (c))
2435 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
2436 else if (CHAR_BYTE8_P (c))
2437 {
2438 c = CHAR_TO_BYTE8 (c);
2439 EMIT_ONE_BYTE (c);
2440 }
df7492f9 2441 else
aa72b389 2442 {
df7492f9
KH
2443 struct charset *charset;
2444 unsigned code;
2445 int dimension;
2446 int emacs_mule_id;
2447 unsigned char leading_codes[2];
2448
ff0dacd7
KH
2449 if (preferred_charset_id >= 0)
2450 {
2451 charset = CHARSET_FROM_ID (preferred_charset_id);
905ca9d2
KH
2452 if (CHAR_CHARSET_P (c, charset))
2453 code = ENCODE_CHAR (charset, c);
2454 else
2455 charset = char_charset (c, charset_list, &code);
ff0dacd7
KH
2456 }
2457 else
2458 charset = char_charset (c, charset_list, &code);
df7492f9
KH
2459 if (! charset)
2460 {
2461 c = coding->default_char;
2462 if (ASCII_CHAR_P (c))
2463 {
2464 EMIT_ONE_ASCII_BYTE (c);
2465 continue;
2466 }
2467 charset = char_charset (c, charset_list, &code);
2468 }
2469 dimension = CHARSET_DIMENSION (charset);
2470 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2471 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2472 EMIT_ONE_BYTE (leading_codes[0]);
2473 if (leading_codes[1])
2474 EMIT_ONE_BYTE (leading_codes[1]);
2475 if (dimension == 1)
1fa663f9 2476 EMIT_ONE_BYTE (code | 0x80);
aa72b389 2477 else
df7492f9 2478 {
1fa663f9 2479 code |= 0x8080;
df7492f9
KH
2480 EMIT_ONE_BYTE (code >> 8);
2481 EMIT_ONE_BYTE (code & 0xFF);
2482 }
aa72b389 2483 }
aa72b389 2484 }
065e3595 2485 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
2486 coding->produced_char += produced_chars;
2487 coding->produced = dst - coding->destination;
2488 return 0;
aa72b389 2489}
b73bfc1c 2490
4ed46869 2491\f
df7492f9 2492/*** 7. ISO2022 handlers ***/
4ed46869
KH
2493
2494/* The following note describes the coding system ISO2022 briefly.
39787efd 2495 Since the intention of this note is to help understand the
5a936b46 2496 functions in this file, some parts are NOT ACCURATE or are OVERLY
39787efd 2497 SIMPLIFIED. For thorough understanding, please refer to the
5a936b46 2498 original document of ISO2022. This is equivalent to the standard
cfb43547 2499 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
4ed46869
KH
2500
2501 ISO2022 provides many mechanisms to encode several character sets
cfb43547 2502 in 7-bit and 8-bit environments. For 7-bit environments, all text
39787efd
KH
2503 is encoded using bytes less than 128. This may make the encoded
2504 text a little bit longer, but the text passes more easily through
cfb43547 2505 several types of gateway, some of which strip off the MSB (Most
8ca3766a 2506 Significant Bit).
b73bfc1c 2507
cfb43547
DL
2508 There are two kinds of character sets: control character sets and
2509 graphic character sets. The former contain control characters such
4ed46869 2510 as `newline' and `escape' to provide control functions (control
39787efd 2511 functions are also provided by escape sequences). The latter
cfb43547 2512 contain graphic characters such as 'A' and '-'. Emacs recognizes
4ed46869
KH
2513 two control character sets and many graphic character sets.
2514
2515 Graphic character sets are classified into one of the following
39787efd
KH
2516 four classes, according to the number of bytes (DIMENSION) and
2517 number of characters in one dimension (CHARS) of the set:
2518 - DIMENSION1_CHARS94
2519 - DIMENSION1_CHARS96
2520 - DIMENSION2_CHARS94
2521 - DIMENSION2_CHARS96
2522
2523 In addition, each character set is assigned an identification tag,
cfb43547 2524 unique for each set, called the "final character" (denoted as <F>
39787efd
KH
2525 hereafter). The <F> of each character set is decided by ECMA(*)
2526 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2527 (0x30..0x3F are for private use only).
4ed46869
KH
2528
2529 Note (*): ECMA = European Computer Manufacturers Association
2530
cfb43547 2531 Here are examples of graphic character sets [NAME(<F>)]:
4ed46869
KH
2532 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2533 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2534 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2535 o DIMENSION2_CHARS96 -- none for the moment
2536
39787efd 2537 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4ed46869
KH
2538 C0 [0x00..0x1F] -- control character plane 0
2539 GL [0x20..0x7F] -- graphic character plane 0
2540 C1 [0x80..0x9F] -- control character plane 1
2541 GR [0xA0..0xFF] -- graphic character plane 1
2542
2543 A control character set is directly designated and invoked to C0 or
39787efd
KH
2544 C1 by an escape sequence. The most common case is that:
2545 - ISO646's control character set is designated/invoked to C0, and
2546 - ISO6429's control character set is designated/invoked to C1,
2547 and usually these designations/invocations are omitted in encoded
2548 text. In a 7-bit environment, only C0 can be used, and a control
2549 character for C1 is encoded by an appropriate escape sequence to
2550 fit into the environment. All control characters for C1 are
2551 defined to have corresponding escape sequences.
4ed46869
KH
2552
2553 A graphic character set is at first designated to one of four
2554 graphic registers (G0 through G3), then these graphic registers are
2555 invoked to GL or GR. These designations and invocations can be
2556 done independently. The most common case is that G0 is invoked to
39787efd
KH
2557 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2558 these invocations and designations are omitted in encoded text.
2559 In a 7-bit environment, only GL can be used.
4ed46869 2560
39787efd
KH
2561 When a graphic character set of CHARS94 is invoked to GL, codes
2562 0x20 and 0x7F of the GL area work as control characters SPACE and
2563 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2564 be used.
4ed46869
KH
2565
2566 There are two ways of invocation: locking-shift and single-shift.
2567 With locking-shift, the invocation lasts until the next different
39787efd
KH
2568 invocation, whereas with single-shift, the invocation affects the
2569 following character only and doesn't affect the locking-shift
2570 state. Invocations are done by the following control characters or
2571 escape sequences:
4ed46869
KH
2572
2573 ----------------------------------------------------------------------
39787efd 2574 abbrev function cntrl escape seq description
4ed46869 2575 ----------------------------------------------------------------------
39787efd
KH
2576 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2577 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2578 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2579 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2580 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2581 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2582 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2583 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2584 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4ed46869 2585 ----------------------------------------------------------------------
39787efd
KH
2586 (*) These are not used by any known coding system.
2587
2588 Control characters for these functions are defined by macros
2589 ISO_CODE_XXX in `coding.h'.
4ed46869 2590
39787efd 2591 Designations are done by the following escape sequences:
4ed46869
KH
2592 ----------------------------------------------------------------------
2593 escape sequence description
2594 ----------------------------------------------------------------------
2595 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2596 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2597 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2598 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2599 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2600 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2601 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2602 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2603 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2604 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2605 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2606 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2607 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2608 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2609 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2610 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2611 ----------------------------------------------------------------------
2612
2613 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
39787efd 2614 of dimension 1, chars 94, and final character <F>, etc...
4ed46869
KH
2615
2616 Note (*): Although these designations are not allowed in ISO2022,
2617 Emacs accepts them on decoding, and produces them on encoding
39787efd 2618 CHARS96 character sets in a coding system which is characterized as
4ed46869
KH
2619 7-bit environment, non-locking-shift, and non-single-shift.
2620
2621 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
df7492f9 2622 '(' must be omitted. We refer to this as "short-form" hereafter.
4ed46869 2623
cfb43547 2624 Now you may notice that there are a lot of ways of encoding the
39787efd
KH
2625 same multilingual text in ISO2022. Actually, there exist many
2626 coding systems such as Compound Text (used in X11's inter client
8ca3766a
DL
2627 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2628 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
4ed46869
KH
2629 localized platforms), and all of these are variants of ISO2022.
2630
2631 In addition to the above, Emacs handles two more kinds of escape
2632 sequences: ISO6429's direction specification and Emacs' private
2633 sequence for specifying character composition.
2634
39787efd 2635 ISO6429's direction specification takes the following form:
4ed46869
KH
2636 o CSI ']' -- end of the current direction
2637 o CSI '0' ']' -- end of the current direction
2638 o CSI '1' ']' -- start of left-to-right text
2639 o CSI '2' ']' -- start of right-to-left text
2640 The control character CSI (0x9B: control sequence introducer) is
39787efd
KH
2641 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2642
2643 Character composition specification takes the following form:
ec6d2bb8
KH
2644 o ESC '0' -- start relative composition
2645 o ESC '1' -- end composition
2646 o ESC '2' -- start rule-base composition (*)
2647 o ESC '3' -- start relative composition with alternate chars (**)
2648 o ESC '4' -- start rule-base composition with alternate chars (**)
b73bfc1c 2649 Since these are not standard escape sequences of any ISO standard,
cfb43547 2650 the use of them with these meanings is restricted to Emacs only.
ec6d2bb8 2651
5a936b46
DL
2652 (*) This form is used only in Emacs 20.7 and older versions,
2653 but newer versions can safely decode it.
cfb43547 2654 (**) This form is used only in Emacs 21.1 and newer versions,
5a936b46 2655 and older versions can't decode it.
ec6d2bb8 2656
cfb43547 2657 Here's a list of example usages of these composition escape
b73bfc1c 2658 sequences (categorized by `enum composition_method').
ec6d2bb8 2659
b73bfc1c 2660 COMPOSITION_RELATIVE:
ec6d2bb8 2661 ESC 0 CHAR [ CHAR ] ESC 1
8ca3766a 2662 COMPOSITION_WITH_RULE:
ec6d2bb8 2663 ESC 2 CHAR [ RULE CHAR ] ESC 1
b73bfc1c 2664 COMPOSITION_WITH_ALTCHARS:
ec6d2bb8 2665 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
b73bfc1c 2666 COMPOSITION_WITH_RULE_ALTCHARS:
ec6d2bb8 2667 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
4ed46869
KH
2668
/* Table of 256 `enum iso_code_class_type' entries.
   NOTE(review): presumably indexed by byte value and filled in
   elsewhere in this file -- confirm against the initialization code.  */
2669enum iso_code_class_type iso_code_class[256];
2670
/* Nonzero if charset ID is safe for CODING: ID must not exceed
   CODING->max_charset_id and its entry in CODING->safe_charsets must
   not be 255, the value setup_iso_safe_charsets uses to fill the
   table before assigning graphic registers.  The entry is compared
   through `unsigned char' so that the test gives the same answer
   whether the table's element type is signed or unsigned.  */

#define SAFE_CHARSET_P(coding, id)                                      \
  ((id) <= (coding)->max_charset_id                                     \
   && (unsigned char) (coding)->safe_charsets[id] != 255)
2674
2675
/* Nonzero if the coding system stored for CATEGORY in
   coding_categories has a non-negative CODING_ISO_INITIAL value for
   index 1.  NOTE(review): index 1 presumably means graphic register
   G1 -- confirm against CODING_ISO_INITIAL.  */

#define SHIFT_OUT_OK(category)                                  \
  (0 <= CODING_ISO_INITIAL (&coding_categories[category], 1))
2678
2679static void
f0064e1f
DL
2680setup_iso_safe_charsets (attrs)
2681 Lisp_Object attrs;
df7492f9
KH
2682{
2683 Lisp_Object charset_list, safe_charsets;
2684 Lisp_Object request;
2685 Lisp_Object reg_usage;
2686 Lisp_Object tail;
2687 int reg94, reg96;
2688 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2689 int max_charset_id;
2690
2691 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2692 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2693 && ! EQ (charset_list, Viso_2022_charset_list))
2694 {
2695 CODING_ATTR_CHARSET_LIST (attrs)
2696 = charset_list = Viso_2022_charset_list;
2697 ASET (attrs, coding_attr_safe_charsets, Qnil);
2698 }
2699
2700 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2701 return;
2702
2703 max_charset_id = 0;
2704 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2705 {
2706 int id = XINT (XCAR (tail));
2707 if (max_charset_id < id)
2708 max_charset_id = id;
2709 }
d46c5b12 2710
df7492f9
KH
2711 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2712 make_number (255));
2713 request = AREF (attrs, coding_attr_iso_request);
2714 reg_usage = AREF (attrs, coding_attr_iso_usage);
2715 reg94 = XINT (XCAR (reg_usage));
2716 reg96 = XINT (XCDR (reg_usage));
2717
2718 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2719 {
2720 Lisp_Object id;
2721 Lisp_Object reg;
2722 struct charset *charset;
2723
2724 id = XCAR (tail);
2725 charset = CHARSET_FROM_ID (XINT (id));
bf16eb23 2726 reg = Fcdr (Fassq (id, request));
df7492f9 2727 if (! NILP (reg))
8f924df7 2728 SSET (safe_charsets, XINT (id), XINT (reg));
df7492f9
KH
2729 else if (charset->iso_chars_96)
2730 {
2731 if (reg96 < 4)
8f924df7 2732 SSET (safe_charsets, XINT (id), reg96);
df7492f9
KH
2733 }
2734 else
2735 {
2736 if (reg94 < 4)
8f924df7 2737 SSET (safe_charsets, XINT (id), reg94);
df7492f9
KH
2738 }
2739 }
2740 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2741}
d46c5b12 2742
b6871cc7 2743
4ed46869 2744/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
2745 Check if a text is encoded in one of ISO-2022 based codig systems.
2746 If it is, return 1, else return 0. */
4ed46869 2747
0a28aafb 2748static int
ff0dacd7 2749detect_coding_iso_2022 (coding, detect_info)
df7492f9 2750 struct coding_system *coding;
ff0dacd7 2751 struct coding_detection_info *detect_info;
4ed46869 2752{
8f924df7
KH
2753 const unsigned char *src = coding->source, *src_base = src;
2754 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9 2755 int multibytep = coding->src_multibyte;
ff0dacd7 2756 int single_shifting = 0;
df7492f9
KH
2757 int id;
2758 int c, c1;
2759 int consumed_chars = 0;
2760 int i;
ff0dacd7
KH
2761 int rejected = 0;
2762 int found = 0;
cee53ed4 2763 int composition_count = -1;
ff0dacd7
KH
2764
2765 detect_info->checked |= CATEGORY_MASK_ISO;
df7492f9
KH
2766
2767 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2768 {
2769 struct coding_system *this = &(coding_categories[i]);
2770 Lisp_Object attrs, val;
2771
c6b278e7
KH
2772 if (this->id < 0)
2773 continue;
df7492f9
KH
2774 attrs = CODING_ID_ATTRS (this->id);
2775 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2776 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2777 setup_iso_safe_charsets (attrs);
2778 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7
KH
2779 this->max_charset_id = SCHARS (val) - 1;
2780 this->safe_charsets = (char *) SDATA (val);
df7492f9
KH
2781 }
2782
2783 /* A coding system of this category is always ASCII compatible. */
2784 src += coding->head_ascii;
3f003981 2785
ff0dacd7 2786 while (rejected != CATEGORY_MASK_ISO)
4ed46869 2787 {
065e3595 2788 src_base = src;
df7492f9 2789 ONE_MORE_BYTE (c);
4ed46869
KH
2790 switch (c)
2791 {
2792 case ISO_CODE_ESC:
74383408
KH
2793 if (inhibit_iso_escape_detection)
2794 break;
f46869e4 2795 single_shifting = 0;
df7492f9 2796 ONE_MORE_BYTE (c);
d46c5b12 2797 if (c >= '(' && c <= '/')
4ed46869 2798 {
bf9cdd4e 2799 /* Designation sequence for a charset of dimension 1. */
df7492f9 2800 ONE_MORE_BYTE (c1);
d46c5b12 2801 if (c1 < ' ' || c1 >= 0x80
df7492f9 2802 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
d46c5b12
KH
2803 /* Invalid designation sequence. Just ignore. */
2804 break;
bf9cdd4e
KH
2805 }
2806 else if (c == '$')
2807 {
2808 /* Designation sequence for a charset of dimension 2. */
df7492f9 2809 ONE_MORE_BYTE (c);
bf9cdd4e
KH
2810 if (c >= '@' && c <= 'B')
2811 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
ff0dacd7 2812 id = iso_charset_table[1][0][c];
bf9cdd4e 2813 else if (c >= '(' && c <= '/')
bcf26d6a 2814 {
df7492f9 2815 ONE_MORE_BYTE (c1);
d46c5b12 2816 if (c1 < ' ' || c1 >= 0x80
df7492f9 2817 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
d46c5b12
KH
2818 /* Invalid designation sequence. Just ignore. */
2819 break;
bcf26d6a 2820 }
bf9cdd4e 2821 else
ff0dacd7 2822 /* Invalid designation sequence. Just ignore it. */
d46c5b12
KH
2823 break;
2824 }
ae9ff118 2825 else if (c == 'N' || c == 'O')
d46c5b12 2826 {
ae9ff118 2827 /* ESC <Fe> for SS2 or SS3. */
ff0dacd7
KH
2828 single_shifting = 1;
2829 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
d46c5b12 2830 break;
4ed46869 2831 }
cee53ed4
KH
2832 else if (c == '1')
2833 {
2834 /* End of composition. */
2835 if (composition_count < 0
2836 || composition_count > MAX_COMPOSITION_COMPONENTS)
2837 /* Invalid */
2838 break;
2839 composition_count = -1;
2840 found |= CATEGORY_MASK_ISO;
2841 }
ec6d2bb8
KH
2842 else if (c >= '0' && c <= '4')
2843 {
2844 /* ESC <Fp> for start/end composition. */
cee53ed4 2845 composition_count = 0;
ec6d2bb8
KH
2846 break;
2847 }
bf9cdd4e 2848 else
df7492f9 2849 {
ff0dacd7 2850 /* Invalid escape sequence. Just ignore it. */
df7492f9
KH
2851 break;
2852 }
d46c5b12
KH
2853
2854 /* We found a valid designation sequence for CHARSET. */
ff0dacd7 2855 rejected |= CATEGORY_MASK_ISO_8BIT;
df7492f9
KH
2856 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2857 id))
ff0dacd7 2858 found |= CATEGORY_MASK_ISO_7;
d46c5b12 2859 else
ff0dacd7 2860 rejected |= CATEGORY_MASK_ISO_7;
df7492f9
KH
2861 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2862 id))
ff0dacd7 2863 found |= CATEGORY_MASK_ISO_7_TIGHT;
d46c5b12 2864 else
ff0dacd7 2865 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
df7492f9
KH
2866 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2867 id))
ff0dacd7 2868 found |= CATEGORY_MASK_ISO_7_ELSE;
ae9ff118 2869 else
ff0dacd7 2870 rejected |= CATEGORY_MASK_ISO_7_ELSE;
df7492f9
KH
2871 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2872 id))
ff0dacd7 2873 found |= CATEGORY_MASK_ISO_8_ELSE;
ae9ff118 2874 else
ff0dacd7 2875 rejected |= CATEGORY_MASK_ISO_8_ELSE;
4ed46869
KH
2876 break;
2877
4ed46869 2878 case ISO_CODE_SO:
d46c5b12 2879 case ISO_CODE_SI:
ff0dacd7 2880 /* Locking shift out/in. */
74383408
KH
2881 if (inhibit_iso_escape_detection)
2882 break;
f46869e4 2883 single_shifting = 0;
ff0dacd7 2884 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
d46c5b12
KH
2885 break;
2886
4ed46869 2887 case ISO_CODE_CSI:
ff0dacd7 2888 /* Control sequence introducer. */
f46869e4 2889 single_shifting = 0;
ff0dacd7
KH
2890 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2891 found |= CATEGORY_MASK_ISO_8_ELSE;
2892 goto check_extra_latin;
2893
4ed46869
KH
2894 case ISO_CODE_SS2:
2895 case ISO_CODE_SS3:
ff0dacd7
KH
2896 /* Single shift. */
2897 if (inhibit_iso_escape_detection)
2898 break;
75e2a253 2899 single_shifting = 0;
ff0dacd7
KH
2900 rejected |= CATEGORY_MASK_ISO_7BIT;
2901 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2902 & CODING_ISO_FLAG_SINGLE_SHIFT)
75e2a253 2903 found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
ff0dacd7
KH
2904 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2905 & CODING_ISO_FLAG_SINGLE_SHIFT)
75e2a253
KH
2906 found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
2907 if (single_shifting)
2908 break;
ff0dacd7 2909 goto check_extra_latin;
4ed46869
KH
2910
2911 default:
065e3595
KH
2912 if (c < 0)
2913 continue;
4ed46869 2914 if (c < 0x80)
f46869e4 2915 {
cee53ed4
KH
2916 if (composition_count >= 0)
2917 composition_count++;
f46869e4
KH
2918 single_shifting = 0;
2919 break;
2920 }
ff0dacd7 2921 if (c >= 0xA0)
c4825358 2922 {
ff0dacd7
KH
2923 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2924 found |= CATEGORY_MASK_ISO_8_1;
f46869e4 2925 /* Check the length of succeeding codes of the range
ff0dacd7
KH
2926 0xA0..0FF. If the byte length is even, we include
2927 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2928 only when we are not single shifting. */
2929 if (! single_shifting
2930 && ! (rejected & CATEGORY_MASK_ISO_8_2))
f46869e4 2931 {
e17de821 2932 int i = 1;
b73bfc1c
KH
2933 while (src < src_end)
2934 {
df7492f9 2935 ONE_MORE_BYTE (c);
b73bfc1c
KH
2936 if (c < 0xA0)
2937 break;
2938 i++;
2939 }
2940
2941 if (i & 1 && src < src_end)
cee53ed4
KH
2942 {
2943 rejected |= CATEGORY_MASK_ISO_8_2;
2944 if (composition_count >= 0)
2945 composition_count += i;
2946 }
f46869e4 2947 else
cee53ed4
KH
2948 {
2949 found |= CATEGORY_MASK_ISO_8_2;
2950 if (composition_count >= 0)
2951 composition_count += i / 2;
2952 }
f46869e4 2953 }
ff0dacd7 2954 break;
4ed46869 2955 }
ff0dacd7
KH
2956 check_extra_latin:
2957 single_shifting = 0;
2958 if (! VECTORP (Vlatin_extra_code_table)
2959 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2960 {
2961 rejected = CATEGORY_MASK_ISO;
2962 break;
2963 }
2964 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2965 & CODING_ISO_FLAG_LATIN_EXTRA)
2966 found |= CATEGORY_MASK_ISO_8_1;
2967 else
2968 rejected |= CATEGORY_MASK_ISO_8_1;
75e2a253 2969 rejected |= CATEGORY_MASK_ISO_8_2;
4ed46869
KH
2970 }
2971 }
ff0dacd7
KH
2972 detect_info->rejected |= CATEGORY_MASK_ISO;
2973 return 0;
4ed46869 2974
df7492f9 2975 no_more_source:
ff0dacd7
KH
2976 detect_info->rejected |= rejected;
2977 detect_info->found |= (found & ~rejected);
df7492f9 2978 return 1;
4ed46869 2979}
ec6d2bb8 2980
4ed46869 2981
134b9549
KH
/* Record in CODING that the charset selected by DIM, CHARS_96 and
   the final byte FINAL is designated to graphic register REG.

   If FINAL is outside '0'..127, or no charset corresponds to it, or
   the charset is not safe for CODING, the designation of REG is set
   to -2 and CHARS_96 is set to -1.  Otherwise the charset id is
   stored, after replacing JISX0201-Roman by ASCII and JISX0208.1978
   by JISX0208 when the corresponding flags of CODING ask for it.

   CHARS_96 must be an lvalue.  It is set to -1 whenever the escape
   sequence should be kept, which includes an ASCII designation to a
   register whose previous designation was invalid.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)                     \
  do {                                                                    \
    int new_id = -1;                                                      \
    int valid = ((final) >= '0' && (final) < 128);                        \
                                                                          \
    if (valid)                                                            \
      {                                                                   \
        new_id = ISO_CHARSET_TABLE (dim, chars_96, final);                \
        valid = (new_id >= 0 && SAFE_CHARSET_P (coding, new_id));         \
      }                                                                   \
    if (! valid)                                                          \
      {                                                                   \
        CODING_ISO_DESIGNATION (coding, reg) = -2;                        \
        chars_96 = -1;                                                    \
      }                                                                   \
    else                                                                  \
      {                                                                   \
        int old_id = CODING_ISO_DESIGNATION (coding, reg);                \
                                                                          \
        if (new_id == charset_jisx0201_roman                              \
            && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN))   \
          new_id = charset_ascii;                                         \
        else if (new_id == charset_jisx0208_1978                          \
                 && (CODING_ISO_FLAGS (coding)                            \
                     & CODING_ISO_FLAG_USE_OLDJIS))                       \
          new_id = charset_jisx0208;                                      \
        CODING_ISO_DESIGNATION (coding, reg) = new_id;                    \
        /* If there was an invalid designation to REG previously, and     \
           this designation is ASCII to REG, we should keep this          \
           designation sequence.  */                                      \
        if (old_id == -2 && new_id == charset_ascii)                      \
          chars_96 = -1;                                                  \
      }                                                                   \
  } while (0)
3014
d46c5b12 3015
df7492f9
KH
/* If a composition is in progress, abandon it: leave the composing
   state and flush the characters collected so far in `components'
   to `charbuf' as ordinary characters.  For the relative and
   altchars methods every component is flushed; for the rule-based
   methods only every second one (the entries in between are rules).
   Jump to `no_more_source' without changing anything when `charbuf'
   cannot hold `component_idx' more elements.  Do nothing when no
   composition is in progress.  */
#define MAYBE_FINISH_COMPOSITION()                                      \
  do {                                                                  \
    if (composition_state != COMPOSING_NO)                              \
      {                                                                 \
        int stride, k;                                                  \
                                                                        \
        if (charbuf + component_idx > charbuf_end)                      \
          goto no_more_source;                                          \
        composition_state = COMPOSING_NO;                               \
        stride = ((method == COMPOSITION_RELATIVE                       \
                   || method == COMPOSITION_WITH_ALTCHARS)              \
                  ? 1 : 2);                                             \
        for (k = 0; k < component_idx; k += stride)                     \
          *charbuf++ = components[k];                                   \
        if (stride == 1)                                                \
          char_offset += component_idx;                                 \
        else                                                            \
          char_offset += (component_idx / 2) + 1;                       \
      }                                                                 \
  } while (0)
3040
d46c5b12 3041
aa72b389
KH
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   C1 is the byte following ESC.  An ESC 0 seen in the state
   COMPOSING_COMPONENT_RULE ends the alternate-component part of a
   composition already in progress.  Any other case finishes a
   pending composition and starts a new one, but only if the
   terminator ESC 1 is present in the rest of the source.  When it is
   not, jump to `invalid_code' if this is the last block; otherwise
   record CODING_RESULT_INSUFFICIENT_SRC and jump to
   `no_more_source'.  */

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    if (c1 == '0'                                                          \
        && composition_state == COMPOSING_COMPONENT_RULE)                  \
      {                                                                    \
        component_len = component_idx;                                     \
        composition_state = COMPOSING_CHAR;                                \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        const unsigned char *p;                                            \
                                                                           \
        MAYBE_FINISH_COMPOSITION ();                                       \
        if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end)            \
          goto no_more_source;                                             \
        for (p = src; p < src_end - 1; p++)                                \
          if (*p == ISO_CODE_ESC && p[1] == '1')                           \
            break;                                                         \
        /* Use `>=': when SRC is already at SRC_END the loop above does    \
           not run and P equals SRC_END, which must count as "not          \
           found" too.  */                                                 \
        if (p >= src_end - 1)                                              \
          {                                                                \
            if (coding->mode & CODING_MODE_LAST_BLOCK)                     \
              goto invalid_code;                                           \
            /* The current composition doesn't end in the current          \
               source.  */                                                 \
            record_conversion_result                                       \
              (coding, CODING_RESULT_INSUFFICIENT_SRC);                    \
            goto no_more_source;                                           \
          }                                                                \
                                                                           \
        /* This is surely the start of a composition.  */                  \
        method = (c1 == '0' ? COMPOSITION_RELATIVE                         \
                  : c1 == '2' ? COMPOSITION_WITH_RULE                      \
                  : c1 == '3' ? COMPOSITION_WITH_ALTCHARS                  \
                  : COMPOSITION_WITH_RULE_ALTCHARS);                       \
        composition_state = (c1 <= '2' ? COMPOSING_CHAR                    \
                             : COMPOSING_COMPONENT_CHAR);                  \
        component_idx = component_len = 0;                                 \
      }                                                                    \
  } while (0)
3088
ec6d2bb8 3089
df7492f9
KH
/* Handle composition end sequence ESC 1.

   Emit a composition annotation into `charbuf' with
   ADD_COMPOSITION_DATA.  For every method except the relative one
   the annotation is followed by the stored components (the
   alternate components when there are any, all components
   otherwise), and the element at the start of the annotation is then
   overwritten with the negated number of elements written.  Finally
   the composed characters themselves are emitted, `char_offset' is
   advanced once per character, and the composing state is left.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    int nchars = (component_len > 0 ? component_idx - component_len     \
                  : method == COMPOSITION_RELATIVE ? component_idx      \
                  : (component_idx + 1) / 2);                           \
    int k;                                                              \
    int *annotation_head = charbuf;                                     \
                                                                        \
    ADD_COMPOSITION_DATA (charbuf, nchars, method);                     \
    if (method != COMPOSITION_RELATIVE)                                 \
      {                                                                 \
        int ncomponents = (component_len == 0                           \
                           ? component_idx : component_len);            \
                                                                        \
        for (k = 0; k < ncomponents; k++)                               \
          *charbuf++ = components[k];                                   \
        *annotation_head = annotation_head - charbuf;                   \
      }                                                                 \
    if (method == COMPOSITION_WITH_RULE)                                \
      {                                                                 \
        /* Components alternate CHAR RULE CHAR ...; skip the rules.  */ \
        for (k = 0; k < component_idx; k += 2, char_offset++)           \
          *charbuf++ = components[k];                                   \
      }                                                                 \
    else                                                                \
      {                                                                 \
        for (k = component_len; k < component_idx; k++, char_offset++)  \
          *charbuf++ = components[k];                                   \
      }                                                                 \
    coding->annotated = 1;                                              \
    composition_state = COMPOSING_NO;                                   \
  } while (0)
3120
df7492f9 3121
ec6d2bb8
KH
/* Decode a composition rule from the byte C1 (and maybe one more byte
   read from SRC into the variable `c2' of the enclosing function) and
   store the encoded rule back in C1, which must be an lvalue.

   With R = C1 - 32:
     R < 81   old one-byte format; the two reference points are R / 9
              and R % 9, where the value 4 is remapped to 10.
     R < 93   new two-byte format; the reference points are R - 81
              and (next byte) - 32.
     else     the rule is 0.  */

#define DECODE_COMPOSITION_RULE(c1)                                     \
  do {                                                                  \
    (c1) -= 32;                                                         \
    if (c1 >= 93)                                                       \
      c1 = 0;                                                           \
    else if (c1 >= 81)          /* new format (after ver.21) */         \
      {                                                                 \
        ONE_MORE_BYTE (c2);                                             \
        c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);                \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int gref = (c1) / 9;                                            \
        int nref = (c1) % 9;                                            \
                                                                        \
        if (gref == 4)                                                  \
          gref = 10;                                                    \
        if (nref == 4)                                                  \
          nref = 10;                                                    \
        c1 = COMPOSITION_ENCODE_RULE (gref, nref);                      \
      }                                                                 \
  } while (0)
88993dfd 3145
d46c5b12 3146
4ed46869
KH
3147/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3148
b73bfc1c 3149static void
df7492f9 3150decode_coding_iso_2022 (coding)
4ed46869 3151 struct coding_system *coding;
4ed46869 3152{
8f924df7
KH
3153 const unsigned char *src = coding->source + coding->consumed;
3154 const unsigned char *src_end = coding->source + coding->src_bytes;
3155 const unsigned char *src_base;
69a80ea3 3156 int *charbuf = coding->charbuf + coding->charbuf_used;
ff0dacd7 3157 int *charbuf_end
69a80ea3 3158 = coding->charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
df7492f9 3159 int consumed_chars = 0, consumed_chars_base;
df7492f9 3160 int multibytep = coding->src_multibyte;
4ed46869 3161 /* Charsets invoked to graphic plane 0 and 1 respectively. */
df7492f9
KH
3162 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3163 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
134b9549 3164 int charset_id_2, charset_id_3;
df7492f9
KH
3165 struct charset *charset;
3166 int c;
3167 /* For handling composition sequence. */
3168#define COMPOSING_NO 0
3169#define COMPOSING_CHAR 1
3170#define COMPOSING_RULE 2
3171#define COMPOSING_COMPONENT_CHAR 3
3172#define COMPOSING_COMPONENT_RULE 4
3173
3174 int composition_state = COMPOSING_NO;
3175 enum composition_method method;
3176 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
3177 int component_idx;
3178 int component_len;
24a73b0a 3179 Lisp_Object attrs, charset_list;
ff0dacd7
KH
3180 int char_offset = coding->produced_char;
3181 int last_offset = char_offset;
3182 int last_id = charset_ascii;
119852e7
KH
3183 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3184 int byte_after_cr = -1;
df7492f9 3185
24a73b0a 3186 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9 3187 setup_iso_safe_charsets (attrs);
287c57d7
KH
3188 /* Charset list may have been changed. */
3189 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3190 coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
b73bfc1c
KH
3191
3192 while (1)
4ed46869 3193 {
463f5630 3194 int c1, c2;
b73bfc1c
KH
3195
3196 src_base = src;
df7492f9
KH
3197 consumed_chars_base = consumed_chars;
3198
3199 if (charbuf >= charbuf_end)
3200 break;
3201
119852e7
KH
3202 if (byte_after_cr >= 0)
3203 c1 = byte_after_cr, byte_after_cr = -1;
3204 else
3205 ONE_MORE_BYTE (c1);
065e3595
KH
3206 if (c1 < 0)
3207 goto invalid_code;
4ed46869 3208
98725083 3209 /* We produce at most one character. */
4ed46869
KH
3210 switch (iso_code_class [c1])
3211 {
3212 case ISO_0x20_or_0x7F:
df7492f9 3213 if (composition_state != COMPOSING_NO)
ec6d2bb8 3214 {
df7492f9
KH
3215 if (composition_state == COMPOSING_RULE
3216 || composition_state == COMPOSING_COMPONENT_RULE)
3217 {
cee53ed4
KH
3218 if (component_idx < MAX_COMPOSITION_COMPONENTS * 2 + 1)
3219 {
3220 DECODE_COMPOSITION_RULE (c1);
3221 components[component_idx++] = c1;
3222 composition_state--;
3223 continue;
3224 }
3225 /* Too long composition. */
3226 MAYBE_FINISH_COMPOSITION ();
df7492f9 3227 }
4ed46869 3228 }
df7492f9
KH
3229 if (charset_id_0 < 0
3230 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
781d7a48
KH
3231 /* This is SPACE or DEL. */
3232 charset = CHARSET_FROM_ID (charset_ascii);
3233 else
3234 charset = CHARSET_FROM_ID (charset_id_0);
3235 break;
4ed46869
KH
3236
3237 case ISO_graphic_plane_0:
781d7a48 3238 if (composition_state != COMPOSING_NO)
b73bfc1c 3239 {
781d7a48
KH
3240 if (composition_state == COMPOSING_RULE
3241 || composition_state == COMPOSING_COMPONENT_RULE)
3242 {
cee53ed4
KH
3243 if (component_idx < MAX_COMPOSITION_COMPONENTS * 2 + 1)
3244 {
3245 DECODE_COMPOSITION_RULE (c1);
3246 components[component_idx++] = c1;
3247 composition_state--;
3248 continue;
3249 }
3250 MAYBE_FINISH_COMPOSITION ();
781d7a48 3251 }
b73bfc1c 3252 }
134b9549
KH
3253 if (charset_id_0 < 0)
3254 charset = CHARSET_FROM_ID (charset_ascii);
3255 else
3256 charset = CHARSET_FROM_ID (charset_id_0);
4ed46869
KH
3257 break;
3258
3259 case ISO_0xA0_or_0xFF:
df7492f9
KH
3260 if (charset_id_1 < 0
3261 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3262 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3263 goto invalid_code;
4ed46869
KH
3264 /* This is a graphic character, we fall down ... */
3265
3266 case ISO_graphic_plane_1:
df7492f9
KH
3267 if (charset_id_1 < 0)
3268 goto invalid_code;
3269 charset = CHARSET_FROM_ID (charset_id_1);
4ed46869
KH
3270 break;
3271
df7492f9 3272 case ISO_control_0:
119852e7
KH
3273 if (eol_crlf && c1 == '\r')
3274 ONE_MORE_BYTE (byte_after_cr);
df7492f9
KH
3275 MAYBE_FINISH_COMPOSITION ();
3276 charset = CHARSET_FROM_ID (charset_ascii);
4ed46869
KH
3277 break;
3278
df7492f9
KH
3279 case ISO_control_1:
3280 MAYBE_FINISH_COMPOSITION ();
3281 goto invalid_code;
3282
4ed46869 3283 case ISO_shift_out:
df7492f9
KH
3284 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3285 || CODING_ISO_DESIGNATION (coding, 1) < 0)
3286 goto invalid_code;
3287 CODING_ISO_INVOCATION (coding, 0) = 1;
3288 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3289 continue;
4ed46869
KH
3290
3291 case ISO_shift_in:
df7492f9
KH
3292 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3293 goto invalid_code;
3294 CODING_ISO_INVOCATION (coding, 0) = 0;
3295 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3296 continue;
4ed46869
KH
3297
3298 case ISO_single_shift_2_7:
3299 case ISO_single_shift_2:
df7492f9
KH
3300 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3301 goto invalid_code;
4ed46869
KH
3302 /* SS2 is handled as an escape sequence of ESC 'N' */
3303 c1 = 'N';
3304 goto label_escape_sequence;
3305
3306 case ISO_single_shift_3:
df7492f9
KH
3307 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3308 goto invalid_code;
4ed46869
KH
3309 /* SS2 is handled as an escape sequence of ESC 'O' */
3310 c1 = 'O';
3311 goto label_escape_sequence;
3312
3313 case ISO_control_sequence_introducer:
3314 /* CSI is handled as an escape sequence of ESC '[' ... */
3315 c1 = '[';
3316 goto label_escape_sequence;
3317
3318 case ISO_escape:
3319 ONE_MORE_BYTE (c1);
3320 label_escape_sequence:
df7492f9 3321 /* Escape sequences handled here are invocation,
4ed46869
KH
3322 designation, direction specification, and character
3323 composition specification. */
3324 switch (c1)
3325 {
3326 case '&': /* revision of following character set */
3327 ONE_MORE_BYTE (c1);
3328 if (!(c1 >= '@' && c1 <= '~'))
df7492f9 3329 goto invalid_code;
4ed46869
KH
3330 ONE_MORE_BYTE (c1);
3331 if (c1 != ISO_CODE_ESC)
df7492f9 3332 goto invalid_code;
4ed46869
KH
3333 ONE_MORE_BYTE (c1);
3334 goto label_escape_sequence;
3335
3336 case '$': /* designation of 2-byte character set */
df7492f9
KH
3337 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3338 goto invalid_code;
134b9549
KH
3339 {
3340 int reg, chars96;
3341
3342 ONE_MORE_BYTE (c1);
3343 if (c1 >= '@' && c1 <= 'B')
3344 { /* designation of JISX0208.1978, GB2312.1980,
88993dfd 3345 or JISX0208.1980 */
134b9549
KH
3346 reg = 0, chars96 = 0;
3347 }
3348 else if (c1 >= 0x28 && c1 <= 0x2B)
3349 { /* designation of DIMENSION2_CHARS94 character set */
3350 reg = c1 - 0x28, chars96 = 0;
3351 ONE_MORE_BYTE (c1);
3352 }
3353 else if (c1 >= 0x2C && c1 <= 0x2F)
3354 { /* designation of DIMENSION2_CHARS96 character set */
3355 reg = c1 - 0x2C, chars96 = 1;
3356 ONE_MORE_BYTE (c1);
3357 }
3358 else
3359 goto invalid_code;
3360 DECODE_DESIGNATION (reg, 2, chars96, c1);
3361 /* We must update these variables now. */
3362 if (reg == 0)
3363 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3364 else if (reg == 1)
3365 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3366 if (chars96 < 0)
3367 goto invalid_code;
3368 }
b73bfc1c 3369 continue;
4ed46869
KH
3370
3371 case 'n': /* invocation of locking-shift-2 */
df7492f9
KH
3372 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3373 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3374 goto invalid_code;
3375 CODING_ISO_INVOCATION (coding, 0) = 2;
3376 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3377 continue;
4ed46869
KH
3378
3379 case 'o': /* invocation of locking-shift-3 */
df7492f9
KH
3380 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3381 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3382 goto invalid_code;
3383 CODING_ISO_INVOCATION (coding, 0) = 3;
3384 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3385 continue;
4ed46869
KH
3386
3387 case 'N': /* invocation of single-shift-2 */
df7492f9
KH
3388 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3389 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3390 goto invalid_code;
134b9549
KH
3391 charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3392 if (charset_id_2 < 0)
3393 charset = CHARSET_FROM_ID (charset_ascii);
3394 else
3395 charset = CHARSET_FROM_ID (charset_id_2);
b73bfc1c 3396 ONE_MORE_BYTE (c1);
e7046a18 3397 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
df7492f9 3398 goto invalid_code;
4ed46869
KH
3399 break;
3400
3401 case 'O': /* invocation of single-shift-3 */
df7492f9
KH
3402 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3403 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3404 goto invalid_code;
134b9549
KH
3405 charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3406 if (charset_id_3 < 0)
3407 charset = CHARSET_FROM_ID (charset_ascii);
3408 else
3409 charset = CHARSET_FROM_ID (charset_id_3);
b73bfc1c 3410 ONE_MORE_BYTE (c1);
e7046a18 3411 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
df7492f9 3412 goto invalid_code;
4ed46869
KH
3413 break;
3414
ec6d2bb8 3415 case '0': case '2': case '3': case '4': /* start composition */
df7492f9
KH
3416 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3417 goto invalid_code;
ec6d2bb8 3418 DECODE_COMPOSITION_START (c1);
b73bfc1c 3419 continue;
4ed46869 3420
ec6d2bb8 3421 case '1': /* end composition */
df7492f9
KH
3422 if (composition_state == COMPOSING_NO)
3423 goto invalid_code;
3424 DECODE_COMPOSITION_END ();
b73bfc1c 3425 continue;
4ed46869
KH
3426
3427 case '[': /* specification of direction */
df7492f9
KH
3428 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3429 goto invalid_code;
4ed46869 3430 /* For the moment, nested direction is not supported.
d46c5b12 3431 So, `coding->mode & CODING_MODE_DIRECTION' zero means
df7492f9 3432 left-to-right, and nozero means right-to-left. */
4ed46869
KH
3433 ONE_MORE_BYTE (c1);
3434 switch (c1)
3435 {
3436 case ']': /* end of the current direction */
d46c5b12 3437 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869
KH
3438
3439 case '0': /* end of the current direction */
3440 case '1': /* start of left-to-right direction */
3441 ONE_MORE_BYTE (c1);
3442 if (c1 == ']')
d46c5b12 3443 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869 3444 else
df7492f9 3445 goto invalid_code;
4ed46869
KH
3446 break;
3447
3448 case '2': /* start of right-to-left direction */
3449 ONE_MORE_BYTE (c1);
3450 if (c1 == ']')
d46c5b12 3451 coding->mode |= CODING_MODE_DIRECTION;
4ed46869 3452 else
df7492f9 3453 goto invalid_code;
4ed46869
KH
3454 break;
3455
3456 default:
df7492f9 3457 goto invalid_code;
4ed46869 3458 }
b73bfc1c 3459 continue;
4ed46869 3460
103e0180 3461 case '%':
103e0180
KH
3462 ONE_MORE_BYTE (c1);
3463 if (c1 == '/')
3464 {
3465 /* CTEXT extended segment:
3466 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3467 We keep these bytes as is for the moment.
3468 They may be decoded by post-read-conversion. */
3469 int dim, M, L;
4776e638 3470 int size;
8f924df7 3471
103e0180
KH
3472 ONE_MORE_BYTE (dim);
3473 ONE_MORE_BYTE (M);
3474 ONE_MORE_BYTE (L);
3475 size = ((M - 128) * 128) + (L - 128);
4776e638
KH
3476 if (charbuf + 8 + size > charbuf_end)
3477 goto break_loop;
3478 *charbuf++ = ISO_CODE_ESC;
3479 *charbuf++ = '%';
3480 *charbuf++ = '/';
3481 *charbuf++ = dim;
3482 *charbuf++ = BYTE8_TO_CHAR (M);
3483 *charbuf++ = BYTE8_TO_CHAR (L);
103e0180
KH
3484 while (size-- > 0)
3485 {
3486 ONE_MORE_BYTE (c1);
4776e638 3487 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
103e0180 3488 }
103e0180
KH
3489 }
3490 else if (c1 == 'G')
3491 {
103e0180
KH
3492 /* XFree86 extension for embedding UTF-8 in CTEXT:
3493 ESC % G --UTF-8-BYTES-- ESC % @
3494 We keep these bytes as is for the moment.
3495 They may be decoded by post-read-conversion. */
4776e638
KH
3496 int *p = charbuf;
3497
3498 if (p + 6 > charbuf_end)
3499 goto break_loop;
3500 *p++ = ISO_CODE_ESC;
3501 *p++ = '%';
3502 *p++ = 'G';
3503 while (p < charbuf_end)
103e0180
KH
3504 {
3505 ONE_MORE_BYTE (c1);
3506 if (c1 == ISO_CODE_ESC
3507 && src + 1 < src_end
3508 && src[0] == '%'
3509 && src[1] == '@')
9ffd559c
KH
3510 {
3511 src += 2;
3512 break;
3513 }
4776e638 3514 *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
103e0180 3515 }
4776e638
KH
3516 if (p + 3 > charbuf_end)
3517 goto break_loop;
3518 *p++ = ISO_CODE_ESC;
3519 *p++ = '%';
3520 *p++ = '@';
3521 charbuf = p;
103e0180
KH
3522 }
3523 else
4776e638 3524 goto invalid_code;
103e0180 3525 continue;
4776e638 3526 break;
103e0180 3527
4ed46869 3528 default:
df7492f9
KH
3529 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3530 goto invalid_code;
134b9549
KH
3531 {
3532 int reg, chars96;
3533
3534 if (c1 >= 0x28 && c1 <= 0x2B)
3535 { /* designation of DIMENSION1_CHARS94 character set */
3536 reg = c1 - 0x28, chars96 = 0;
3537 ONE_MORE_BYTE (c1);
3538 }
3539 else if (c1 >= 0x2C && c1 <= 0x2F)
3540 { /* designation of DIMENSION1_CHARS96 character set */
3541 reg = c1 - 0x2C, chars96 = 1;
3542 ONE_MORE_BYTE (c1);
3543 }
3544 else
3545 goto invalid_code;
3546 DECODE_DESIGNATION (reg, 1, chars96, c1);
3547 /* We must update these variables now. */
3548 if (reg == 0)
3549 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3550 else if (reg == 1)
3551 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3552 if (chars96 < 0)
3553 goto invalid_code;
3554 }
b73bfc1c 3555 continue;
4ed46869 3556 }
b73bfc1c 3557 }
4ed46869 3558
ff0dacd7
KH
3559 if (charset->id != charset_ascii
3560 && last_id != charset->id)
3561 {
3562 if (last_id != charset_ascii)
69a80ea3 3563 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
ff0dacd7
KH
3564 last_id = charset->id;
3565 last_offset = char_offset;
3566 }
3567
b73bfc1c 3568 /* Now we know CHARSET and 1st position code C1 of a character.
df7492f9
KH
3569 Produce a decoded character while getting 2nd position code
3570 C2 if necessary. */
3571 c1 &= 0x7F;
3572 if (CHARSET_DIMENSION (charset) > 1)
b73bfc1c
KH
3573 {
3574 ONE_MORE_BYTE (c2);
df7492f9 3575 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
b73bfc1c 3576 /* C2 is not in a valid range. */
df7492f9
KH
3577 goto invalid_code;
3578 c1 = (c1 << 8) | (c2 & 0x7F);
3579 if (CHARSET_DIMENSION (charset) > 2)
3580 {
3581 ONE_MORE_BYTE (c2);
3582 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3583 /* C2 is not in a valid range. */
3584 goto invalid_code;
3585 c1 = (c1 << 8) | (c2 & 0x7F);
3586 }
3587 }
3588
3589 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3590 if (c < 0)
3591 {
3592 MAYBE_FINISH_COMPOSITION ();
3593 for (; src_base < src; src_base++, char_offset++)
3594 {
3595 if (ASCII_BYTE_P (*src_base))
3596 *charbuf++ = *src_base;
3597 else
3598 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3599 }
3600 }
3601 else if (composition_state == COMPOSING_NO)
3602 {
3603 *charbuf++ = c;
3604 char_offset++;
4ed46869 3605 }
df7492f9 3606 else
781d7a48 3607 {
cee53ed4
KH
3608 if (component_idx < MAX_COMPOSITION_COMPONENTS * 2 + 1)
3609 {
3610 components[component_idx++] = c;
3611 if (method == COMPOSITION_WITH_RULE
3612 || (method == COMPOSITION_WITH_RULE_ALTCHARS
3613 && composition_state == COMPOSING_COMPONENT_CHAR))
3614 composition_state++;
3615 }
3616 else
3617 {
3618 MAYBE_FINISH_COMPOSITION ();
3619 *charbuf++ = c;
3620 char_offset++;
3621 }
4ed46869
KH
3622 }
3623 continue;
3624
df7492f9
KH
3625 invalid_code:
3626 MAYBE_FINISH_COMPOSITION ();
4ed46869 3627 src = src_base;
df7492f9
KH
3628 consumed_chars = consumed_chars_base;
3629 ONE_MORE_BYTE (c);
065e3595 3630 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 3631 char_offset++;
df7492f9 3632 coding->errors++;
4776e638
KH
3633 continue;
3634
3635 break_loop:
3636 break;
4ed46869 3637 }
fb88bf2d 3638
df7492f9 3639 no_more_source:
ff0dacd7 3640 if (last_id != charset_ascii)
69a80ea3 3641 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
3642 coding->consumed_char += consumed_chars_base;
3643 coding->consumed = src_base - coding->source;
3644 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
3645}
3646
b73bfc1c 3647
/* ISO2022 encoding stuff.  */

/*
   It is not enough to say just "ISO2022" on encoding; we have to
   specify more details.  In Emacs, each coding system of ISO2022
   variant has the following specifications:
	1. Initial designation to G0 thru G3.
	2. Allows short-form designation?
	3. ASCII should be designated to G0 before control characters?
	4. ASCII should be designated to G0 at end of line?
	5. 7-bit environment or 8-bit environment?
	6. Use locking-shift?
	7. Use Single-shift?
   And the following two are only for Japanese:
	8. Use ASCII in place of JIS0201-1976-Roman?
	9. Use JISX0208-1983 in place of JISX0208-1978?
   These specifications are encoded in CODING_ISO_FLAGS (coding) as flag
   bits defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
   details.
*/
3668
/* Emit at DST, advancing it, the escape sequence that designates
   CHARSET to graphic register REG, then record that designation in
   CODING.  If CODING has CODING_ISO_FLAG_REVISION set and CHARSET has
   a non-negative revision number, `ESC & <'@' + revision>' is emitted
   first.  For a multi-dimension 94-charset designated to register 0
   whose <final-char> is '@', 'A', or 'B', the intermediate character
   is omitted (short form) unless CODING_ISO_FLAG_LONG_FORM is set.
   The output goes through the EMIT_* macros, so this must be expanded
   where their working variables are in scope.  */

#define ENCODE_DESIGNATION(charset, reg, coding)			\
  do {									\
    unsigned char final_char = CHARSET_ISO_FINAL (charset);		\
    /* Intermediate characters indexed by graphic register: one	\
       table for 96-charsets, the other for 94-charsets.  */		\
    const char *intermediates						\
      = CHARSET_ISO_CHARS_96 (charset) ? ",-./" : "()*+";		\
    int multi_dimension = CHARSET_DIMENSION (charset) != 1;		\
    int revision = -1;							\
									\
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)		\
      revision = CHARSET_ISO_REVISION (charset);			\
    if (revision >= 0)							\
      {									\
	EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');			\
	EMIT_ONE_BYTE ('@' + revision);					\
      }									\
									\
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);					\
    if (multi_dimension)						\
      EMIT_ONE_ASCII_BYTE ('$');					\
    /* Only the short form of a multi-dimension 94-charset drops	\
       the intermediate character.  */					\
    if (! multi_dimension						\
	|| CHARSET_ISO_CHARS_96 (charset)				\
	|| (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM)	\
	|| (reg) != 0							\
	|| final_char < '@' || final_char > 'B')			\
      EMIT_ONE_ASCII_BYTE (intermediates[reg]);				\
    EMIT_ONE_ASCII_BYTE (final_char);					\
									\
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);	\
  } while (0)
3716
df7492f9 3717
4ed46869
KH
/* The macros below emit the ISO2022 single-shift functions
   single-shift-2 and single-shift-3.  Each is written as a two-byte
   escape sequence (ESC N / ESC O) when CODING has
   CODING_ISO_FLAG_SEVEN_BITS set, and as the one-byte control code
   (ISO_CODE_SS2 / ISO_CODE_SS3) otherwise.  CODING is then flagged as
   single-shifting.  */

/* Common body: ESC_FINAL is the final byte of the escape-sequence
   form, CONTROL_CODE the one-byte form.  */
#define ENCODE_SINGLE_SHIFT_WITH(esc_final, control_code)		\
  do {									\
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))	\
      EMIT_ONE_BYTE (control_code);					\
    else								\
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, esc_final);			\
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;				\
  } while (0)


#define ENCODE_SINGLE_SHIFT_2 ENCODE_SINGLE_SHIFT_WITH ('N', ISO_CODE_SS2)


#define ENCODE_SINGLE_SHIFT_3 ENCODE_SINGLE_SHIFT_WITH ('O', ISO_CODE_SS3)
3740
df7492f9 3741
4ed46869
KH
/* The locking-shift macros emit the codes (control character or
   escape sequence) for the ISO2022 locking-shift functions shift-in,
   shift-out, locking-shift-2, and locking-shift-3.  Each one also
   records in CODING the number of the graphic register that is now
   invoked to graphic plane 0.  */

/* shift-in: graphic register 0.  */
#define ENCODE_SHIFT_IN						\
  do {								\
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);				\
    CODING_ISO_INVOCATION (coding, 0) = 0;			\
  } while (0)


/* shift-out: graphic register 1.  */
#define ENCODE_SHIFT_OUT					\
  do {								\
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);				\
    CODING_ISO_INVOCATION (coding, 0) = 1;			\
  } while (0)


/* locking-shift-2 (ESC n): graphic register 2.  */
#define ENCODE_LOCKING_SHIFT_2					\
  do {								\
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');			\
    CODING_ISO_INVOCATION (coding, 0) = 2;			\
  } while (0)
3765
df7492f9
KH
3766
/* Emit locking-shift-3 and record in CODING that graphic register 3
   is now invoked to graphic plane 0.  The escape sequence for
   locking-shift-3 is `ESC o'; `ESC n' is locking-shift-2 and must not
   be used here, or a decoder would invoke register 2 instead.  */
#define ENCODE_LOCKING_SHIFT_3					\
  do {								\
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');			\
    CODING_ISO_INVOCATION (coding, 0) = 3;			\
  } while (0)
3772
df7492f9 3773
f4dee582
RS
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be an lvalue: when CODING has CODING_ISO_FLAG_USE_ROMAN
   set and CHARSET is ASCII, it is replaced by the charset whose ID is
   charset_jisx0201_roman.  The body is a loop: if CHARSET is neither
   being single-shifted nor invoked to graphic plane 0 or 1,
   encode_invocation_designation is called and the checks are
   repeated.  C1 is parenthesized at every use so that an expression
   argument is masked as a whole.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)			\
  do {									\
    int id = CHARSET_ID (charset);					\
									\
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)	\
	&& id == charset_ascii)						\
      {									\
	id = charset_jisx0201_roman;					\
	charset = CHARSET_FROM_ID (id);					\
      }									\
									\
    if (CODING_ISO_SINGLE_SHIFTING (coding))				\
      {									\
	if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)	\
	  EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);				\
	else								\
	  EMIT_ONE_BYTE ((c1) | 0x80);					\
	CODING_ISO_SINGLE_SHIFTING (coding) = 0;			\
	break;								\
      }									\
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))			\
      {									\
	EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);				\
	break;								\
      }									\
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))			\
      {									\
	EMIT_ONE_BYTE ((c1) | 0x80);					\
	break;								\
      }									\
    /* Since CHARSET is not yet invoked to any graphic planes, we	\
       must invoke it, or, at first, designate it to some graphic	\
       register.  Then repeat the loop to actually produce the		\
       character.  */							\
    dst = encode_invocation_designation (charset, coding, dst,		\
					 &produced_chars);		\
  } while (1)
3816
df7492f9 3817
f4dee582
RS
/* Produce codes for a DIMENSION2 character whose character set is
   CHARSET and whose position-codes are C1 and C2.  Designation and
   invocation codes are also produced in advance if necessary.

   CHARSET must be an lvalue: when CODING has
   CODING_ISO_FLAG_USE_OLDJIS set and CHARSET's ID is charset_jisx0208,
   it is replaced by the charset whose ID is charset_jisx0208_1978.
   The body is a loop that repeats, after calling
   encode_invocation_designation, until one of the emitting branches
   is taken.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)		\
  do {									\
    int id = CHARSET_ID (charset);					\
									\
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)	\
	&& id == charset_jisx0208)					\
      {									\
	id = charset_jisx0208_1978;					\
	charset = CHARSET_FROM_ID (id);					\
      }									\
									\
    if (CODING_ISO_SINGLE_SHIFTING (coding))				\
      {									\
	if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)) \
	  EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);			\
	else								\
	  EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);		\
	CODING_ISO_SINGLE_SHIFTING (coding) = 0;			\
	break;								\
      }									\
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))			\
      {									\
	EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);		\
	break;								\
      }									\
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))			\
      {									\
	EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);			\
	break;								\
      }									\
    /* CHARSET is not invoked to either graphic plane yet.  Have it	\
       invoked (and, before that, designated to some graphic		\
       register if needed), then go around again to actually		\
       produce the character.  */					\
    dst = encode_invocation_designation (charset, coding, dst,		\
					 &produced_chars);		\
  } while (1)
3860
05e6f5dc 3861
df7492f9
KH
/* Encode character C of CHARSET.  The code of C in CHARSET is
   obtained with ENCODE_CHAR; it is emitted as a single position-code
   when CHARSET_DIMENSION (CHARSET) is 1, and otherwise as two
   position-codes taken from the code shifted right by 8 and from its
   low 8 bits.  */
#define ENCODE_ISO_CHARACTER(charset, c)				   \
  do {									   \
    int code = ENCODE_CHAR ((charset), (c));				   \
									   \
    if (CHARSET_DIMENSION (charset) != 1)				   \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
    else								   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);		   \
  } while (0)
bdd9fb48 3871
05e6f5dc 3872
4ed46869 3873/* Produce designation and invocation codes at a place pointed by DST
df7492f9 3874 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
4ed46869
KH
3875 Return new DST. */
3876
3877unsigned char *
df7492f9
KH
3878encode_invocation_designation (charset, coding, dst, p_nchars)
3879 struct charset *charset;
4ed46869
KH
3880 struct coding_system *coding;
3881 unsigned char *dst;
df7492f9 3882 int *p_nchars;
4ed46869 3883{
df7492f9
KH
3884 int multibytep = coding->dst_multibyte;
3885 int produced_chars = *p_nchars;
4ed46869 3886 int reg; /* graphic register number */
df7492f9 3887 int id = CHARSET_ID (charset);
4ed46869
KH
3888
3889 /* At first, check designations. */
3890 for (reg = 0; reg < 4; reg++)
df7492f9 3891 if (id == CODING_ISO_DESIGNATION (coding, reg))
4ed46869
KH
3892 break;
3893
3894 if (reg >= 4)
3895 {
3896 /* CHARSET is not yet designated to any graphic registers. */
3897 /* At first check the requested designation. */
df7492f9
KH
3898 reg = CODING_ISO_REQUEST (coding, id);
3899 if (reg < 0)
1ba9e4ab
KH
3900 /* Since CHARSET requests no special designation, designate it
3901 to graphic register 0. */
4ed46869
KH
3902 reg = 0;
3903
3904 ENCODE_DESIGNATION (charset, reg, coding);
3905 }
3906
df7492f9
KH
3907 if (CODING_ISO_INVOCATION (coding, 0) != reg
3908 && CODING_ISO_INVOCATION (coding, 1) != reg)
4ed46869
KH
3909 {
3910 /* Since the graphic register REG is not invoked to any graphic
3911 planes, invoke it to graphic plane 0. */
3912 switch (reg)
3913 {
3914 case 0: /* graphic register 0 */
3915 ENCODE_SHIFT_IN;
3916 break;
3917
3918 case 1: /* graphic register 1 */
3919 ENCODE_SHIFT_OUT;
3920 break;
3921
3922 case 2: /* graphic register 2 */
df7492f9 3923 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4ed46869
KH
3924 ENCODE_SINGLE_SHIFT_2;
3925 else
3926 ENCODE_LOCKING_SHIFT_2;
3927 break;
3928
3929 case 3: /* graphic register 3 */
df7492f9 3930 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4ed46869
KH
3931 ENCODE_SINGLE_SHIFT_3;
3932 else
3933 ENCODE_LOCKING_SHIFT_3;
3934 break;
3935 }
3936 }
b73bfc1c 3937
df7492f9 3938 *p_nchars = produced_chars;
4ed46869
KH
3939 return dst;
3940}
3941
df7492f9
KH
/* The following three macros produce codes for indicating direction
   of text.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER emits the control sequence
   introducer: `ESC [' when CODING has CODING_ISO_FLAG_SEVEN_BITS set,
   the single byte ISO_CODE_CSI otherwise.  The flag is tested with
   `&', like every other CODING_ISO_FLAG_XXX test, because
   CODING_ISO_FLAGS is a set of flag bits and may have other bits set
   as well.  It is an object-like macro and is invoked without
   arguments.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER				\
  do {									\
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)	\
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');				\
    else								\
      EMIT_ONE_BYTE (ISO_CODE_CSI);					\
  } while (0)


/* Emit the introducer followed by `2 ]' (right-to-left).  */
#define ENCODE_DIRECTION_R2L()			\
  do {						\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;		\
    EMIT_TWO_ASCII_BYTES ('2', ']');		\
  } while (0)


/* Emit the introducer followed by `0 ]' (left-to-right).  */
#define ENCODE_DIRECTION_L2R()			\
  do {						\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;		\
    EMIT_TWO_ASCII_BYTES ('0', ']');		\
  } while (0)
4ed46869 3965
4ed46869
KH
3966
/* Produce codes for designation and invocation to reset the graphic
   planes and registers to initial state: shift in if graphic plane 0
   does not have register 0 invoked, then re-designate every graphic
   register whose current designation differs from its (non-negative)
   initial one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()				\
  do {									\
    int reg = 0;							\
									\
    if (CODING_ISO_INVOCATION (coding, 0) != 0)				\
      ENCODE_SHIFT_IN;							\
    for (; reg < 4; reg++)						\
      {									\
	int initial_id = CODING_ISO_INITIAL (coding, reg);		\
	struct charset *charset;					\
									\
	/* Nothing to do for a register that has no initial charset	\
	   or already holds it.  */					\
	if (initial_id < 0						\
	    || CODING_ISO_DESIGNATION (coding, reg) == initial_id)	\
	  continue;							\
	charset = CHARSET_FROM_ID (initial_id);				\
	ENCODE_DESIGNATION (charset, reg, coding);			\
      }									\
  } while (0)
3985
df7492f9 3986
bdd9fb48 3987/* Produce designation sequences of charsets in the line started from
b73bfc1c 3988 SRC to a place pointed by DST, and return updated DST.
bdd9fb48
KH
3989
3990 If the current block ends before any end-of-line, we may fail to
d46c5b12
KH
3991 find all the necessary designations. */
3992
b73bfc1c 3993static unsigned char *
df7492f9 3994encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
e0e989f6 3995 struct coding_system *coding;
df7492f9
KH
3996 int *charbuf, *charbuf_end;
3997 unsigned char *dst;
e0e989f6 3998{
df7492f9 3999 struct charset *charset;
bdd9fb48
KH
4000 /* Table of charsets to be designated to each graphic register. */
4001 int r[4];
df7492f9
KH
4002 int c, found = 0, reg;
4003 int produced_chars = 0;
4004 int multibytep = coding->dst_multibyte;
4005 Lisp_Object attrs;
4006 Lisp_Object charset_list;
4007
4008 attrs = CODING_ID_ATTRS (coding->id);
4009 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4010 if (EQ (charset_list, Qiso_2022))
4011 charset_list = Viso_2022_charset_list;
bdd9fb48
KH
4012
4013 for (reg = 0; reg < 4; reg++)
4014 r[reg] = -1;
4015
b73bfc1c 4016 while (found < 4)
e0e989f6 4017 {
df7492f9
KH
4018 int id;
4019
4020 c = *charbuf++;
b73bfc1c
KH
4021 if (c == '\n')
4022 break;
df7492f9
KH
4023 charset = char_charset (c, charset_list, NULL);
4024 id = CHARSET_ID (charset);
4025 reg = CODING_ISO_REQUEST (coding, id);
4026 if (reg >= 0 && r[reg] < 0)
bdd9fb48
KH
4027 {
4028 found++;
df7492f9 4029 r[reg] = id;
bdd9fb48 4030 }
bdd9fb48
KH
4031 }
4032
4033 if (found)
4034 {
4035 for (reg = 0; reg < 4; reg++)
4036 if (r[reg] >= 0
df7492f9
KH
4037 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4038 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
e0e989f6 4039 }
b73bfc1c
KH
4040
4041 return dst;
e0e989f6
KH
4042}
4043
4ed46869
KH
4044/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
4045
df7492f9
KH
4046static int
4047encode_coding_iso_2022 (coding)
4ed46869 4048 struct coding_system *coding;
4ed46869 4049{
df7492f9
KH
4050 int multibytep = coding->dst_multibyte;
4051 int *charbuf = coding->charbuf;
4052 int *charbuf_end = charbuf + coding->charbuf_used;
4053 unsigned char *dst = coding->destination + coding->produced;
4054 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4055 int safe_room = 16;
4056 int bol_designation
4057 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4058 && CODING_ISO_BOL (coding));
4059 int produced_chars = 0;
4060 Lisp_Object attrs, eol_type, charset_list;
4061 int ascii_compatible;
b73bfc1c 4062 int c;
ff0dacd7 4063 int preferred_charset_id = -1;
05e6f5dc 4064
24a73b0a
KH
4065 CODING_GET_INFO (coding, attrs, charset_list);
4066 eol_type = CODING_ID_EOL_TYPE (coding->id);
4067 if (VECTORP (eol_type))
4068 eol_type = Qunix;
4069
004068e4 4070 setup_iso_safe_charsets (attrs);
ff0dacd7 4071 /* Charset list may have been changed. */
287c57d7 4072 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8f924df7 4073 coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
0eecad43 4074
df7492f9 4075 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
bdd9fb48 4076
df7492f9 4077 while (charbuf < charbuf_end)
4ed46869 4078 {
df7492f9 4079 ASSURE_DESTINATION (safe_room);
b73bfc1c 4080
df7492f9 4081 if (bol_designation)
b73bfc1c 4082 {
df7492f9 4083 unsigned char *dst_prev = dst;
4ed46869 4084
bdd9fb48 4085 /* We have to produce designation sequences if any now. */
df7492f9
KH
4086 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
4087 bol_designation = 0;
4088 /* We are sure that designation sequences are all ASCII bytes. */
4089 produced_chars += dst - dst_prev;
e0e989f6
KH
4090 }
4091
df7492f9 4092 c = *charbuf++;
ec6d2bb8 4093
ff0dacd7
KH
4094 if (c < 0)
4095 {
4096 /* Handle an annotation. */
4097 switch (*charbuf)
ec6d2bb8 4098 {
ff0dacd7
KH
4099 case CODING_ANNOTATE_COMPOSITION_MASK:
4100 /* Not yet implemented. */
4101 break;
4102 case CODING_ANNOTATE_CHARSET_MASK:
cf7dfdf5 4103 preferred_charset_id = charbuf[2];
ff0dacd7
KH
4104 if (preferred_charset_id >= 0
4105 && NILP (Fmemq (make_number (preferred_charset_id),
4106 charset_list)))
4107 preferred_charset_id = -1;
4108 break;
4109 default:
4110 abort ();
4ed46869 4111 }
ff0dacd7
KH
4112 charbuf += -c - 1;
4113 continue;
4ed46869 4114 }
ec6d2bb8 4115
b73bfc1c
KH
4116 /* Now encode the character C. */
4117 if (c < 0x20 || c == 0x7F)
4118 {
df7492f9
KH
4119 if (c == '\n'
4120 || (c == '\r' && EQ (eol_type, Qmac)))
19a8d9e0 4121 {
df7492f9
KH
4122 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4123 ENCODE_RESET_PLANE_AND_REGISTER ();
4124 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
b73bfc1c 4125 {
df7492f9
KH
4126 int i;
4127
4128 for (i = 0; i < 4; i++)
4129 CODING_ISO_DESIGNATION (coding, i)
4130 = CODING_ISO_INITIAL (coding, i);
b73bfc1c 4131 }
df7492f9
KH
4132 bol_designation
4133 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
19a8d9e0 4134 }
df7492f9
KH
4135 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4136 ENCODE_RESET_PLANE_AND_REGISTER ();
4137 EMIT_ONE_ASCII_BYTE (c);
4ed46869 4138 }
df7492f9 4139 else if (ASCII_CHAR_P (c))
88993dfd 4140 {
df7492f9
KH
4141 if (ascii_compatible)
4142 EMIT_ONE_ASCII_BYTE (c);
93dec019 4143 else
19a8d9e0 4144 {
bf16eb23
KH
4145 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4146 ENCODE_ISO_CHARACTER (charset, c);
19a8d9e0 4147 }
4ed46869 4148 }
16eafb5d 4149 else if (CHAR_BYTE8_P (c))
88993dfd 4150 {
16eafb5d
KH
4151 c = CHAR_TO_BYTE8 (c);
4152 EMIT_ONE_BYTE (c);
88993dfd 4153 }
b73bfc1c 4154 else
df7492f9 4155 {
ff0dacd7 4156 struct charset *charset;
b73bfc1c 4157
ff0dacd7
KH
4158 if (preferred_charset_id >= 0)
4159 {
4160 charset = CHARSET_FROM_ID (preferred_charset_id);
4161 if (! CHAR_CHARSET_P (c, charset))
4162 charset = char_charset (c, charset_list, NULL);
4163 }
4164 else
4165 charset = char_charset (c, charset_list, NULL);
df7492f9
KH
4166 if (!charset)
4167 {
41cbe562
KH
4168 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4169 {
4170 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4171 charset = CHARSET_FROM_ID (charset_ascii);
4172 }
4173 else
4174 {
4175 c = coding->default_char;
4176 charset = char_charset (c, charset_list, NULL);
4177 }
df7492f9
KH
4178 }
4179 ENCODE_ISO_CHARACTER (charset, c);
4180 }
84fbb8a0 4181 }
b73bfc1c 4182
df7492f9
KH
4183 if (coding->mode & CODING_MODE_LAST_BLOCK
4184 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4185 {
4186 ASSURE_DESTINATION (safe_room);
4187 ENCODE_RESET_PLANE_AND_REGISTER ();
4188 }
065e3595 4189 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4190 CODING_ISO_BOL (coding) = bol_designation;
4191 coding->produced_char += produced_chars;
4192 coding->produced = dst - coding->destination;
4193 return 0;
4ed46869
KH
4194}
4195
4196\f
/*** 8. SJIS and BIG5 handlers ***/

/* Although SJIS and BIG5 are not ISO's coding systems, they are used
   quite widely.  So, for the moment, Emacs supports them in the bare
   C code.  But, in the future, they may be supported only by CCL.  */

/* SJIS is a coding system encoding three character sets: ASCII, right
   half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
   as is.  A character of charset katakana-jisx0201 is encoded by
   "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, but the two position-codes are divided and
   shifted so that it fits in the range below.

   --- CODE RANGE of SJIS ---
   (character set)	(range)
   ASCII		0x00 .. 0x7F
   KATAKANA-JISX0201	0xA0 .. 0xDF
   JISX0208 (1st byte)	0x81 .. 0x9F and 0xE0 .. 0xEF
	    (2nd byte)	0x40 .. 0x7E and 0x80 .. 0xFC
   -------------------------------

*/

/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set and is encoded in two bytes.

   --- CODE RANGE of BIG5 ---
   (character set)	(range)
   ASCII		0x00 .. 0x7F
   Big5 (1st byte)	0xA1 .. 0xFE
	(2nd byte)	0x40 .. 0x7E and 0xA1 .. 0xFE
   --------------------------

  */
4ed46869
KH
4232
4233/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4234 Check if a text is encoded in SJIS. If it is, return
df7492f9 4235 CATEGORY_MASK_SJIS, else return 0. */
4ed46869 4236
0a28aafb 4237static int
ff0dacd7 4238detect_coding_sjis (coding, detect_info)
df7492f9 4239 struct coding_system *coding;
ff0dacd7 4240 struct coding_detection_info *detect_info;
4ed46869 4241{
065e3595 4242 const unsigned char *src = coding->source, *src_base;
8f924df7 4243 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
4244 int multibytep = coding->src_multibyte;
4245 int consumed_chars = 0;
4246 int found = 0;
b73bfc1c 4247 int c;
df7492f9 4248
ff0dacd7 4249 detect_info->checked |= CATEGORY_MASK_SJIS;
df7492f9
KH
4250 /* A coding system of this category is always ASCII compatible. */
4251 src += coding->head_ascii;
4ed46869 4252
b73bfc1c 4253 while (1)
4ed46869 4254 {
065e3595 4255 src_base = src;
df7492f9 4256 ONE_MORE_BYTE (c);
682169fe
KH
4257 if (c < 0x80)
4258 continue;
df7492f9 4259 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
4ed46869 4260 {
df7492f9 4261 ONE_MORE_BYTE (c);
682169fe 4262 if (c < 0x40 || c == 0x7F || c > 0xFC)
df7492f9 4263 break;
ff0dacd7 4264 found = CATEGORY_MASK_SJIS;
4ed46869 4265 }
df7492f9 4266 else if (c >= 0xA0 && c < 0xE0)
ff0dacd7 4267 found = CATEGORY_MASK_SJIS;
df7492f9
KH
4268 else
4269 break;
4ed46869 4270 }
ff0dacd7 4271 detect_info->rejected |= CATEGORY_MASK_SJIS;
df7492f9
KH
4272 return 0;
4273
4274 no_more_source:
065e3595 4275 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
89528eb3 4276 {
ff0dacd7 4277 detect_info->rejected |= CATEGORY_MASK_SJIS;
89528eb3 4278 return 0;
4ed46869 4279 }
ff0dacd7
KH
4280 detect_info->found |= found;
4281 return 1;
4ed46869
KH
4282}
4283
4284/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4285 Check if a text is encoded in BIG5. If it is, return
df7492f9 4286 CATEGORY_MASK_BIG5, else return 0. */
4ed46869 4287
0a28aafb 4288static int
ff0dacd7 4289detect_coding_big5 (coding, detect_info)
df7492f9 4290 struct coding_system *coding;
ff0dacd7 4291 struct coding_detection_info *detect_info;
4ed46869 4292{
065e3595 4293 const unsigned char *src = coding->source, *src_base;
8f924df7 4294 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
4295 int multibytep = coding->src_multibyte;
4296 int consumed_chars = 0;
4297 int found = 0;
b73bfc1c 4298 int c;
fa42c37f 4299
ff0dacd7 4300 detect_info->checked |= CATEGORY_MASK_BIG5;
df7492f9
KH
4301 /* A coding system of this category is always ASCII compatible. */
4302 src += coding->head_ascii;
fa42c37f 4303
b73bfc1c 4304 while (1)
fa42c37f 4305 {
065e3595 4306 src_base = src;
df7492f9
KH
4307 ONE_MORE_BYTE (c);
4308 if (c < 0x80)
fa42c37f 4309 continue;
df7492f9 4310 if (c >= 0xA1)
fa42c37f 4311 {
df7492f9
KH
4312 ONE_MORE_BYTE (c);
4313 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
fa42c37f 4314 return 0;
ff0dacd7 4315 found = CATEGORY_MASK_BIG5;
fa42c37f 4316 }
df7492f9
KH
4317 else
4318 break;
fa42c37f 4319 }
ff0dacd7 4320 detect_info->rejected |= CATEGORY_MASK_BIG5;
fa42c37f 4321 return 0;
fa42c37f 4322
df7492f9 4323 no_more_source:
065e3595 4324 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
89528eb3 4325 {
ff0dacd7 4326 detect_info->rejected |= CATEGORY_MASK_BIG5;
89528eb3
KH
4327 return 0;
4328 }
ff0dacd7
KH
4329 detect_info->found |= found;
4330 return 1;
fa42c37f
KH
4331}
4332
4ed46869
KH
4333/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4334 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
fa42c37f 4335
b73bfc1c 4336static void
df7492f9 4337decode_coding_sjis (coding)
4ed46869 4338 struct coding_system *coding;
4ed46869 4339{
8f924df7
KH
4340 const unsigned char *src = coding->source + coding->consumed;
4341 const unsigned char *src_end = coding->source + coding->src_bytes;
4342 const unsigned char *src_base;
69a80ea3
KH
4343 int *charbuf = coding->charbuf + coding->charbuf_used;
4344 int *charbuf_end
4345 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
df7492f9
KH
4346 int consumed_chars = 0, consumed_chars_base;
4347 int multibytep = coding->src_multibyte;
4348 struct charset *charset_roman, *charset_kanji, *charset_kana;
57a47f8a 4349 struct charset *charset_kanji2;
24a73b0a 4350 Lisp_Object attrs, charset_list, val;
ff0dacd7
KH
4351 int char_offset = coding->produced_char;
4352 int last_offset = char_offset;
4353 int last_id = charset_ascii;
119852e7
KH
4354 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4355 int byte_after_cr = -1;
a5d301df 4356
24a73b0a 4357 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4358
4359 val = charset_list;
4360 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
89528eb3 4361 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
57a47f8a
KH
4362 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4363 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
fa42c37f 4364
b73bfc1c 4365 while (1)
4ed46869 4366 {
df7492f9 4367 int c, c1;
24a73b0a 4368 struct charset *charset;
fa42c37f 4369
b73bfc1c 4370 src_base = src;
df7492f9 4371 consumed_chars_base = consumed_chars;
fa42c37f 4372
df7492f9
KH
4373 if (charbuf >= charbuf_end)
4374 break;
4375
119852e7
KH
4376 if (byte_after_cr >= 0)
4377 c = byte_after_cr, byte_after_cr = -1;
4378 else
4379 ONE_MORE_BYTE (c);
065e3595
KH
4380 if (c < 0)
4381 goto invalid_code;
24a73b0a 4382 if (c < 0x80)
119852e7
KH
4383 {
4384 if (eol_crlf && c == '\r')
4385 ONE_MORE_BYTE (byte_after_cr);
4386 charset = charset_roman;
4387 }
57a47f8a 4388 else if (c == 0x80 || c == 0xA0)
8e921c4b 4389 goto invalid_code;
57a47f8a
KH
4390 else if (c >= 0xA1 && c <= 0xDF)
4391 {
4392 /* SJIS -> JISX0201-Kana */
4393 c &= 0x7F;
4394 charset = charset_kana;
4395 }
4396 else if (c <= 0xEF)
df7492f9 4397 {
57a47f8a
KH
4398 /* SJIS -> JISX0208 */
4399 ONE_MORE_BYTE (c1);
4400 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
24a73b0a 4401 goto invalid_code;
57a47f8a
KH
4402 c = (c << 8) | c1;
4403 SJIS_TO_JIS (c);
4404 charset = charset_kanji;
4405 }
4406 else if (c <= 0xFC && charset_kanji2)
4407 {
c6876370 4408 /* SJIS -> JISX0213-2 */
57a47f8a
KH
4409 ONE_MORE_BYTE (c1);
4410 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
24a73b0a 4411 goto invalid_code;
57a47f8a
KH
4412 c = (c << 8) | c1;
4413 SJIS_TO_JIS2 (c);
4414 charset = charset_kanji2;
df7492f9 4415 }
57a47f8a
KH
4416 else
4417 goto invalid_code;
24a73b0a
KH
4418 if (charset->id != charset_ascii
4419 && last_id != charset->id)
4420 {
4421 if (last_id != charset_ascii)
69a80ea3 4422 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
24a73b0a
KH
4423 last_id = charset->id;
4424 last_offset = char_offset;
4425 }
4426 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
df7492f9 4427 *charbuf++ = c;
ff0dacd7 4428 char_offset++;
df7492f9 4429 continue;
b73bfc1c 4430
df7492f9
KH
4431 invalid_code:
4432 src = src_base;
4433 consumed_chars = consumed_chars_base;
4434 ONE_MORE_BYTE (c);
065e3595 4435 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
ff0dacd7 4436 char_offset++;
df7492f9
KH
4437 coding->errors++;
4438 }
fa42c37f 4439
df7492f9 4440 no_more_source:
ff0dacd7 4441 if (last_id != charset_ascii)
69a80ea3 4442 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
4443 coding->consumed_char += consumed_chars_base;
4444 coding->consumed = src_base - coding->source;
4445 coding->charbuf_used = charbuf - coding->charbuf;
fa42c37f
KH
4446}
4447
b73bfc1c 4448static void
df7492f9 4449decode_coding_big5 (coding)
4ed46869 4450 struct coding_system *coding;
4ed46869 4451{
8f924df7
KH
4452 const unsigned char *src = coding->source + coding->consumed;
4453 const unsigned char *src_end = coding->source + coding->src_bytes;
4454 const unsigned char *src_base;
69a80ea3
KH
4455 int *charbuf = coding->charbuf + coding->charbuf_used;
4456 int *charbuf_end
4457 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
df7492f9
KH
4458 int consumed_chars = 0, consumed_chars_base;
4459 int multibytep = coding->src_multibyte;
4460 struct charset *charset_roman, *charset_big5;
24a73b0a 4461 Lisp_Object attrs, charset_list, val;
ff0dacd7
KH
4462 int char_offset = coding->produced_char;
4463 int last_offset = char_offset;
4464 int last_id = charset_ascii;
119852e7
KH
4465 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4466 int byte_after_cr = -1;
df7492f9 4467
24a73b0a 4468 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4469 val = charset_list;
4470 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4471 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4ed46869 4472
b73bfc1c 4473 while (1)
4ed46869 4474 {
df7492f9 4475 int c, c1;
24a73b0a 4476 struct charset *charset;
b73bfc1c
KH
4477
4478 src_base = src;
df7492f9
KH
4479 consumed_chars_base = consumed_chars;
4480
4481 if (charbuf >= charbuf_end)
4482 break;
4483
119852e7 4484 if (byte_after_cr >= 0)
14daee73 4485 c = byte_after_cr, byte_after_cr = -1;
119852e7
KH
4486 else
4487 ONE_MORE_BYTE (c);
b73bfc1c 4488
065e3595
KH
4489 if (c < 0)
4490 goto invalid_code;
24a73b0a 4491 if (c < 0x80)
119852e7 4492 {
14daee73 4493 if (eol_crlf && c == '\r')
119852e7
KH
4494 ONE_MORE_BYTE (byte_after_cr);
4495 charset = charset_roman;
4496 }
24a73b0a 4497 else
4ed46869 4498 {
24a73b0a
KH
4499 /* BIG5 -> Big5 */
4500 if (c < 0xA1 || c > 0xFE)
4501 goto invalid_code;
4502 ONE_MORE_BYTE (c1);
4503 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4504 goto invalid_code;
4505 c = c << 8 | c1;
4506 charset = charset_big5;
4ed46869 4507 }
24a73b0a
KH
4508 if (charset->id != charset_ascii
4509 && last_id != charset->id)
df7492f9 4510 {
24a73b0a 4511 if (last_id != charset_ascii)
69a80ea3 4512 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
24a73b0a
KH
4513 last_id = charset->id;
4514 last_offset = char_offset;
4ed46869 4515 }
24a73b0a 4516 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
df7492f9 4517 *charbuf++ = c;
ff0dacd7 4518 char_offset++;
fb88bf2d
KH
4519 continue;
4520
df7492f9 4521 invalid_code:
4ed46869 4522 src = src_base;
df7492f9
KH
4523 consumed_chars = consumed_chars_base;
4524 ONE_MORE_BYTE (c);
065e3595 4525 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
ff0dacd7 4526 char_offset++;
df7492f9 4527 coding->errors++;
fb88bf2d 4528 }
d46c5b12 4529
df7492f9 4530 no_more_source:
ff0dacd7 4531 if (last_id != charset_ascii)
69a80ea3 4532 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
4533 coding->consumed_char += consumed_chars_base;
4534 coding->consumed = src_base - coding->source;
4535 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
4536}
4537
4538/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
b73bfc1c
KH
4539 This function can encode charsets `ascii', `katakana-jisx0201',
4540 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4541 are sure that all these charsets are registered as official charset
4ed46869
KH
4542 (i.e. do not have extended leading-codes). Characters of other
4543 charsets are produced without any encoding. If SJIS_P is 1, encode
4544 SJIS text, else encode BIG5 text. */
4545
df7492f9
KH
4546static int
4547encode_coding_sjis (coding)
4ed46869 4548 struct coding_system *coding;
4ed46869 4549{
df7492f9
KH
4550 int multibytep = coding->dst_multibyte;
4551 int *charbuf = coding->charbuf;
4552 int *charbuf_end = charbuf + coding->charbuf_used;
4553 unsigned char *dst = coding->destination + coding->produced;
4554 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4555 int safe_room = 4;
4556 int produced_chars = 0;
24a73b0a 4557 Lisp_Object attrs, charset_list, val;
df7492f9
KH
4558 int ascii_compatible;
4559 struct charset *charset_roman, *charset_kanji, *charset_kana;
57a47f8a 4560 struct charset *charset_kanji2;
df7492f9 4561 int c;
a5d301df 4562
24a73b0a 4563 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4564 val = charset_list;
4565 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4566 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
57a47f8a
KH
4567 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4568 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4ed46869 4569
df7492f9 4570 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
93dec019 4571
df7492f9
KH
4572 while (charbuf < charbuf_end)
4573 {
4574 ASSURE_DESTINATION (safe_room);
4575 c = *charbuf++;
b73bfc1c 4576 /* Now encode the character C. */
df7492f9
KH
4577 if (ASCII_CHAR_P (c) && ascii_compatible)
4578 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
4579 else if (CHAR_BYTE8_P (c))
4580 {
4581 c = CHAR_TO_BYTE8 (c);
4582 EMIT_ONE_BYTE (c);
4583 }
df7492f9 4584 else
b73bfc1c 4585 {
df7492f9
KH
4586 unsigned code;
4587 struct charset *charset = char_charset (c, charset_list, &code);
4588
4589 if (!charset)
4ed46869 4590 {
41cbe562 4591 if (coding->mode & CODING_MODE_SAFE_ENCODING)
b73bfc1c 4592 {
41cbe562
KH
4593 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4594 charset = CHARSET_FROM_ID (charset_ascii);
b73bfc1c 4595 }
41cbe562 4596 else
b73bfc1c 4597 {
41cbe562
KH
4598 c = coding->default_char;
4599 charset = char_charset (c, charset_list, &code);
b73bfc1c 4600 }
b73bfc1c 4601 }
df7492f9
KH
4602 if (code == CHARSET_INVALID_CODE (charset))
4603 abort ();
4604 if (charset == charset_kanji)
4605 {
4606 int c1, c2;
4607 JIS_TO_SJIS (code);
4608 c1 = code >> 8, c2 = code & 0xFF;
4609 EMIT_TWO_BYTES (c1, c2);
4610 }
4611 else if (charset == charset_kana)
4612 EMIT_ONE_BYTE (code | 0x80);
57a47f8a
KH
4613 else if (charset_kanji2 && charset == charset_kanji2)
4614 {
4615 int c1, c2;
4616
4617 c1 = code >> 8;
4618 if (c1 == 0x21 || (c1 >= 0x23 && c1 < 0x25)
4619 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4620 {
4621 JIS_TO_SJIS2 (code);
4622 c1 = code >> 8, c2 = code & 0xFF;
4623 EMIT_TWO_BYTES (c1, c2);
4624 }
4625 else
4626 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4627 }
df7492f9
KH
4628 else
4629 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4630 }
4631 }
065e3595 4632 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4633 coding->produced_char += produced_chars;
4634 coding->produced = dst - coding->destination;
4635 return 0;
4636}
4637
4638static int
4639encode_coding_big5 (coding)
4640 struct coding_system *coding;
4641{
4642 int multibytep = coding->dst_multibyte;
4643 int *charbuf = coding->charbuf;
4644 int *charbuf_end = charbuf + coding->charbuf_used;
4645 unsigned char *dst = coding->destination + coding->produced;
4646 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4647 int safe_room = 4;
4648 int produced_chars = 0;
24a73b0a 4649 Lisp_Object attrs, charset_list, val;
df7492f9
KH
4650 int ascii_compatible;
4651 struct charset *charset_roman, *charset_big5;
4652 int c;
4653
24a73b0a 4654 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4655 val = charset_list;
4656 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4657 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4658 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4659
4660 while (charbuf < charbuf_end)
4661 {
4662 ASSURE_DESTINATION (safe_room);
4663 c = *charbuf++;
4664 /* Now encode the character C. */
4665 if (ASCII_CHAR_P (c) && ascii_compatible)
4666 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
4667 else if (CHAR_BYTE8_P (c))
4668 {
4669 c = CHAR_TO_BYTE8 (c);
4670 EMIT_ONE_BYTE (c);
b73bfc1c
KH
4671 }
4672 else
4673 {
df7492f9
KH
4674 unsigned code;
4675 struct charset *charset = char_charset (c, charset_list, &code);
4676
4677 if (! charset)
b73bfc1c 4678 {
41cbe562 4679 if (coding->mode & CODING_MODE_SAFE_ENCODING)
b73bfc1c 4680 {
41cbe562
KH
4681 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4682 charset = CHARSET_FROM_ID (charset_ascii);
b73bfc1c 4683 }
41cbe562 4684 else
0eecad43 4685 {
41cbe562
KH
4686 c = coding->default_char;
4687 charset = char_charset (c, charset_list, &code);
0eecad43 4688 }
4ed46869 4689 }
df7492f9
KH
4690 if (code == CHARSET_INVALID_CODE (charset))
4691 abort ();
4692 if (charset == charset_big5)
b73bfc1c 4693 {
df7492f9
KH
4694 int c1, c2;
4695
4696 c1 = code >> 8, c2 = code & 0xFF;
4697 EMIT_TWO_BYTES (c1, c2);
b73bfc1c 4698 }
df7492f9
KH
4699 else
4700 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4ed46869 4701 }
4ed46869 4702 }
065e3595 4703 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4704 coding->produced_char += produced_chars;
4705 coding->produced = dst - coding->destination;
4706 return 0;
4ed46869
KH
4707}
4708
4709\f
/*** 9. CCL handlers ***/
1397dc18
KH
4711
4712/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4713 Check if a text is encoded in a coding system of which
4714 encoder/decoder are written in CCL program. If it is, return
df7492f9 4715 CATEGORY_MASK_CCL, else return 0. */
1397dc18 4716
0a28aafb 4717static int
ff0dacd7 4718detect_coding_ccl (coding, detect_info)
df7492f9 4719 struct coding_system *coding;
ff0dacd7 4720 struct coding_detection_info *detect_info;
1397dc18 4721{
065e3595 4722 const unsigned char *src = coding->source, *src_base;
8f924df7 4723 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
4724 int multibytep = coding->src_multibyte;
4725 int consumed_chars = 0;
4726 int found = 0;
0e219d54 4727 unsigned char *valids;
df7492f9
KH
4728 int head_ascii = coding->head_ascii;
4729 Lisp_Object attrs;
4730
ff0dacd7
KH
4731 detect_info->checked |= CATEGORY_MASK_CCL;
4732
df7492f9 4733 coding = &coding_categories[coding_category_ccl];
0e219d54 4734 valids = CODING_CCL_VALIDS (coding);
df7492f9
KH
4735 attrs = CODING_ID_ATTRS (coding->id);
4736 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4737 src += head_ascii;
1397dc18 4738
b73bfc1c 4739 while (1)
1397dc18 4740 {
df7492f9 4741 int c;
065e3595
KH
4742
4743 src_base = src;
df7492f9 4744 ONE_MORE_BYTE (c);
065e3595 4745 if (c < 0 || ! valids[c])
df7492f9 4746 break;
ff0dacd7
KH
4747 if ((valids[c] > 1))
4748 found = CATEGORY_MASK_CCL;
df7492f9 4749 }
ff0dacd7 4750 detect_info->rejected |= CATEGORY_MASK_CCL;
df7492f9
KH
4751 return 0;
4752
4753 no_more_source:
ff0dacd7
KH
4754 detect_info->found |= found;
4755 return 1;
df7492f9
KH
4756}
4757
4758static void
4759decode_coding_ccl (coding)
4760 struct coding_system *coding;
4761{
7c78e542 4762 const unsigned char *src = coding->source + coding->consumed;
8f924df7 4763 const unsigned char *src_end = coding->source + coding->src_bytes;
69a80ea3
KH
4764 int *charbuf = coding->charbuf + coding->charbuf_used;
4765 int *charbuf_end = coding->charbuf + coding->charbuf_size;
df7492f9
KH
4766 int consumed_chars = 0;
4767 int multibytep = coding->src_multibyte;
4768 struct ccl_program ccl;
4769 int source_charbuf[1024];
4770 int source_byteidx[1024];
24a73b0a 4771 Lisp_Object attrs, charset_list;
df7492f9 4772
24a73b0a 4773 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4774 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4775
4776 while (src < src_end)
4777 {
7c78e542 4778 const unsigned char *p = src;
df7492f9
KH
4779 int *source, *source_end;
4780 int i = 0;
4781
4782 if (multibytep)
4783 while (i < 1024 && p < src_end)
4784 {
4785 source_byteidx[i] = p - src;
4786 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4787 }
4788 else
4789 while (i < 1024 && p < src_end)
4790 source_charbuf[i++] = *p++;
8f924df7 4791
df7492f9
KH
4792 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4793 ccl.last_block = 1;
4794
4795 source = source_charbuf;
4796 source_end = source + i;
4797 while (source < source_end)
4798 {
4799 ccl_driver (&ccl, source, charbuf,
8dcbea82
KH
4800 source_end - source, charbuf_end - charbuf,
4801 charset_list);
df7492f9
KH
4802 source += ccl.consumed;
4803 charbuf += ccl.produced;
4804 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4805 break;
4806 }
4807 if (source < source_end)
4808 src += source_byteidx[source - source_charbuf];
4809 else
4810 src = p;
4811 consumed_chars += source - source_charbuf;
4812
4813 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4814 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4815 break;
4816 }
4817
4818 switch (ccl.status)
4819 {
4820 case CCL_STAT_SUSPEND_BY_SRC:
065e3595 4821 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
df7492f9
KH
4822 break;
4823 case CCL_STAT_SUSPEND_BY_DST:
4824 break;
4825 case CCL_STAT_QUIT:
4826 case CCL_STAT_INVALID_CMD:
065e3595 4827 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
df7492f9
KH
4828 break;
4829 default:
065e3595 4830 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4831 break;
4832 }
4833 coding->consumed_char += consumed_chars;
4834 coding->consumed = src - coding->source;
4835 coding->charbuf_used = charbuf - coding->charbuf;
4836}
4837
4838static int
4839encode_coding_ccl (coding)
4840 struct coding_system *coding;
4841{
4842 struct ccl_program ccl;
4843 int multibytep = coding->dst_multibyte;
4844 int *charbuf = coding->charbuf;
4845 int *charbuf_end = charbuf + coding->charbuf_used;
4846 unsigned char *dst = coding->destination + coding->produced;
4847 unsigned char *dst_end = coding->destination + coding->dst_bytes;
df7492f9
KH
4848 int destination_charbuf[1024];
4849 int i, produced_chars = 0;
24a73b0a 4850 Lisp_Object attrs, charset_list;
df7492f9 4851
24a73b0a 4852 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4853 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4854
4855 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4856 ccl.dst_multibyte = coding->dst_multibyte;
4857
8cffd3e7 4858 while (charbuf < charbuf_end)
df7492f9 4859 {
df7492f9 4860 ccl_driver (&ccl, charbuf, destination_charbuf,
8cffd3e7 4861 charbuf_end - charbuf, 1024, charset_list);
df7492f9 4862 if (multibytep)
8cffd3e7
KH
4863 {
4864 ASSURE_DESTINATION (ccl.produced * 2);
4865 for (i = 0; i < ccl.produced; i++)
4866 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4867 }
df7492f9
KH
4868 else
4869 {
8cffd3e7 4870 ASSURE_DESTINATION (ccl.produced);
3ed051d4 4871 for (i = 0; i < ccl.produced; i++)
df7492f9
KH
4872 *dst++ = destination_charbuf[i] & 0xFF;
4873 produced_chars += ccl.produced;
4874 }
8cffd3e7
KH
4875 charbuf += ccl.consumed;
4876 if (ccl.status == CCL_STAT_QUIT
4877 || ccl.status == CCL_STAT_INVALID_CMD)
4878 break;
df7492f9
KH
4879 }
4880
4881 switch (ccl.status)
4882 {
4883 case CCL_STAT_SUSPEND_BY_SRC:
065e3595 4884 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
df7492f9
KH
4885 break;
4886 case CCL_STAT_SUSPEND_BY_DST:
065e3595 4887 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
df7492f9
KH
4888 break;
4889 case CCL_STAT_QUIT:
4890 case CCL_STAT_INVALID_CMD:
065e3595 4891 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
df7492f9
KH
4892 break;
4893 default:
065e3595 4894 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 4895 break;
1397dc18 4896 }
df7492f9
KH
4897
4898 coding->produced_char += produced_chars;
4899 coding->produced = dst - coding->destination;
4900 return 0;
1397dc18
KH
4901}
4902
df7492f9 4903
1397dc18 4904\f
df7492f9 4905/*** 10, 11. no-conversion handlers ***/
4ed46869 4906
b73bfc1c 4907/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4ed46869 4908
b73bfc1c 4909static void
df7492f9 4910decode_coding_raw_text (coding)
4ed46869 4911 struct coding_system *coding;
4ed46869 4912{
119852e7
KH
4913 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4914
df7492f9 4915 coding->chars_at_source = 1;
119852e7
KH
4916 coding->consumed_char = coding->src_chars;
4917 coding->consumed = coding->src_bytes;
4918 if (eol_crlf && coding->source[coding->src_bytes - 1] == '\r')
4919 {
4920 coding->consumed_char--;
4921 coding->consumed--;
4922 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4923 }
4924 else
4925 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 4926}
4ed46869 4927
df7492f9
KH
4928static int
4929encode_coding_raw_text (coding)
4930 struct coding_system *coding;
4931{
4932 int multibytep = coding->dst_multibyte;
4933 int *charbuf = coding->charbuf;
4934 int *charbuf_end = coding->charbuf + coding->charbuf_used;
4935 unsigned char *dst = coding->destination + coding->produced;
4936 unsigned char *dst_end = coding->destination + coding->dst_bytes;
a0ed9b27 4937 int produced_chars = 0;
b73bfc1c
KH
4938 int c;
4939
df7492f9 4940 if (multibytep)
b73bfc1c 4941 {
df7492f9 4942 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4ed46869 4943
df7492f9
KH
4944 if (coding->src_multibyte)
4945 while (charbuf < charbuf_end)
4946 {
4947 ASSURE_DESTINATION (safe_room);
4948 c = *charbuf++;
4949 if (ASCII_CHAR_P (c))
4950 EMIT_ONE_ASCII_BYTE (c);
4951 else if (CHAR_BYTE8_P (c))
4952 {
4953 c = CHAR_TO_BYTE8 (c);
4954 EMIT_ONE_BYTE (c);
4955 }
4956 else
4957 {
4958 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
93dec019 4959
df7492f9
KH
4960 CHAR_STRING_ADVANCE (c, p1);
4961 while (p0 < p1)
9d123124
KH
4962 {
4963 EMIT_ONE_BYTE (*p0);
4964 p0++;
4965 }
df7492f9
KH
4966 }
4967 }
b73bfc1c 4968 else
df7492f9
KH
4969 while (charbuf < charbuf_end)
4970 {
4971 ASSURE_DESTINATION (safe_room);
4972 c = *charbuf++;
4973 EMIT_ONE_BYTE (c);
4974 }
4975 }
4976 else
4ed46869 4977 {
df7492f9 4978 if (coding->src_multibyte)
d46c5b12 4979 {
df7492f9
KH
4980 int safe_room = MAX_MULTIBYTE_LENGTH;
4981
4982 while (charbuf < charbuf_end)
d46c5b12 4983 {
df7492f9
KH
4984 ASSURE_DESTINATION (safe_room);
4985 c = *charbuf++;
4986 if (ASCII_CHAR_P (c))
4987 *dst++ = c;
4988 else if (CHAR_BYTE8_P (c))
4989 *dst++ = CHAR_TO_BYTE8 (c);
b73bfc1c 4990 else
df7492f9 4991 CHAR_STRING_ADVANCE (c, dst);
d46c5b12
KH
4992 }
4993 }
df7492f9
KH
4994 else
4995 {
4996 ASSURE_DESTINATION (charbuf_end - charbuf);
4997 while (charbuf < charbuf_end && dst < dst_end)
4998 *dst++ = *charbuf++;
8f924df7 4999 }
319a3947 5000 produced_chars = dst - (coding->destination + coding->produced);
4ed46869 5001 }
065e3595 5002 record_conversion_result (coding, CODING_RESULT_SUCCESS);
a0ed9b27 5003 coding->produced_char += produced_chars;
df7492f9
KH
5004 coding->produced = dst - coding->destination;
5005 return 0;
4ed46869
KH
5006}
5007
ff0dacd7
KH
5008/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5009 Check if a text is encoded in a charset-based coding system. If it
5010 is, return 1, else return 0. */
5011
0a28aafb 5012static int
ff0dacd7 5013detect_coding_charset (coding, detect_info)
df7492f9 5014 struct coding_system *coding;
ff0dacd7 5015 struct coding_detection_info *detect_info;
1397dc18 5016{
065e3595 5017 const unsigned char *src = coding->source, *src_base;
8f924df7 5018 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
5019 int multibytep = coding->src_multibyte;
5020 int consumed_chars = 0;
07295713 5021 Lisp_Object attrs, valids, name;
584948ac 5022 int found = 0;
716b3fa0 5023 int head_ascii = coding->head_ascii;
07295713 5024 int check_latin_extra = 0;
1397dc18 5025
ff0dacd7
KH
5026 detect_info->checked |= CATEGORY_MASK_CHARSET;
5027
df7492f9
KH
5028 coding = &coding_categories[coding_category_charset];
5029 attrs = CODING_ID_ATTRS (coding->id);
5030 valids = AREF (attrs, coding_attr_charset_valids);
07295713
KH
5031 name = CODING_ID_NAME (coding->id);
5032 if (VECTORP (Vlatin_extra_code_table)
5033 && strcmp ((char *) SDATA (SYMBOL_NAME (name)), "iso-8859-"))
5034 check_latin_extra = 1;
df7492f9 5035 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
716b3fa0 5036 src += head_ascii;
1397dc18 5037
b73bfc1c 5038 while (1)
1397dc18 5039 {
df7492f9 5040 int c;
716b3fa0
KH
5041 Lisp_Object val;
5042 struct charset *charset;
5043 int dim, idx;
1397dc18 5044
065e3595 5045 src_base = src;
df7492f9 5046 ONE_MORE_BYTE (c);
065e3595
KH
5047 if (c < 0)
5048 continue;
716b3fa0
KH
5049 val = AREF (valids, c);
5050 if (NILP (val))
df7492f9 5051 break;
584948ac 5052 if (c >= 0x80)
07295713
KH
5053 {
5054 if (c < 0xA0
5055 && check_latin_extra
5056 && NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
5057 break;
5058 found = CATEGORY_MASK_CHARSET;
5059 }
716b3fa0
KH
5060 if (INTEGERP (val))
5061 {
5062 charset = CHARSET_FROM_ID (XFASTINT (val));
5063 dim = CHARSET_DIMENSION (charset);
5064 for (idx = 1; idx < dim; idx++)
5065 {
5066 if (src == src_end)
5067 goto too_short;
5068 ONE_MORE_BYTE (c);
3ed051d4 5069 if (c < charset->code_space[(dim - 1 - idx) * 2]
716b3fa0
KH
5070 || c > charset->code_space[(dim - 1 - idx) * 2 + 1])
5071 break;
5072 }
5073 if (idx < dim)
5074 break;
5075 }
5076 else
5077 {
5078 idx = 1;
5079 for (; CONSP (val); val = XCDR (val))
5080 {
5081 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5082 dim = CHARSET_DIMENSION (charset);
5083 while (idx < dim)
5084 {
5085 if (src == src_end)
5086 goto too_short;
5087 ONE_MORE_BYTE (c);
5088 if (c < charset->code_space[(dim - 1 - idx) * 4]
5089 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5090 break;
5091 idx++;
5092 }
5093 if (idx == dim)
5094 {
5095 val = Qnil;
5096 break;
5097 }
5098 }
5099 if (CONSP (val))
5100 break;
5101 }
df7492f9 5102 }
716b3fa0 5103 too_short:
ff0dacd7 5104 detect_info->rejected |= CATEGORY_MASK_CHARSET;
df7492f9 5105 return 0;
4ed46869 5106
df7492f9 5107 no_more_source:
ff0dacd7
KH
5108 detect_info->found |= found;
5109 return 1;
df7492f9 5110}
b73bfc1c 5111
b73bfc1c 5112static void
df7492f9 5113decode_coding_charset (coding)
4ed46869 5114 struct coding_system *coding;
4ed46869 5115{
8f924df7
KH
5116 const unsigned char *src = coding->source + coding->consumed;
5117 const unsigned char *src_end = coding->source + coding->src_bytes;
5118 const unsigned char *src_base;
69a80ea3
KH
5119 int *charbuf = coding->charbuf + coding->charbuf_used;
5120 int *charbuf_end
5121 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
df7492f9
KH
5122 int consumed_chars = 0, consumed_chars_base;
5123 int multibytep = coding->src_multibyte;
24a73b0a 5124 Lisp_Object attrs, charset_list, valids;
ff0dacd7
KH
5125 int char_offset = coding->produced_char;
5126 int last_offset = char_offset;
5127 int last_id = charset_ascii;
119852e7
KH
5128 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5129 int byte_after_cr = -1;
df7492f9 5130
24a73b0a 5131 CODING_GET_INFO (coding, attrs, charset_list);
4eb6d3f1 5132 valids = AREF (attrs, coding_attr_charset_valids);
b73bfc1c 5133
df7492f9 5134 while (1)
4ed46869 5135 {
4eb6d3f1 5136 int c;
24a73b0a
KH
5137 Lisp_Object val;
5138 struct charset *charset;
5139 int dim;
5140 int len = 1;
5141 unsigned code;
df7492f9
KH
5142
5143 src_base = src;
5144 consumed_chars_base = consumed_chars;
b73bfc1c 5145
df7492f9
KH
5146 if (charbuf >= charbuf_end)
5147 break;
5148
119852e7
KH
5149 if (byte_after_cr >= 0)
5150 {
5151 c = byte_after_cr;
5152 byte_after_cr = -1;
5153 }
5154 else
5155 {
5156 ONE_MORE_BYTE (c);
5157 if (eol_crlf && c == '\r')
5158 ONE_MORE_BYTE (byte_after_cr);
5159 }
065e3595
KH
5160 if (c < 0)
5161 goto invalid_code;
24a73b0a
KH
5162 code = c;
5163
5164 val = AREF (valids, c);
1b17adfd 5165 if (! INTEGERP (val) && ! CONSP (val))
24a73b0a
KH
5166 goto invalid_code;
5167 if (INTEGERP (val))
d46c5b12 5168 {
24a73b0a
KH
5169 charset = CHARSET_FROM_ID (XFASTINT (val));
5170 dim = CHARSET_DIMENSION (charset);
5171 while (len < dim)
b73bfc1c 5172 {
24a73b0a
KH
5173 ONE_MORE_BYTE (c);
5174 code = (code << 8) | c;
5175 len++;
b73bfc1c 5176 }
24a73b0a
KH
5177 CODING_DECODE_CHAR (coding, src, src_base, src_end,
5178 charset, code, c);
d46c5b12 5179 }
df7492f9 5180 else
d46c5b12 5181 {
24a73b0a
KH
5182 /* VAL is a list of charset IDs. It is assured that the
5183 list is sorted by charset dimensions (smaller one
5184 comes first). */
5185 while (CONSP (val))
4eb6d3f1 5186 {
24a73b0a 5187 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
c7c66a95 5188 dim = CHARSET_DIMENSION (charset);
f9d71dcd 5189 while (len < dim)
4eb6d3f1 5190 {
acb2a965
KH
5191 ONE_MORE_BYTE (c);
5192 code = (code << 8) | c;
f9d71dcd 5193 len++;
4eb6d3f1 5194 }
24a73b0a
KH
5195 CODING_DECODE_CHAR (coding, src, src_base,
5196 src_end, charset, code, c);
5197 if (c >= 0)
5198 break;
5199 val = XCDR (val);
ff0dacd7 5200 }
d46c5b12 5201 }
24a73b0a
KH
5202 if (c < 0)
5203 goto invalid_code;
5204 if (charset->id != charset_ascii
5205 && last_id != charset->id)
5206 {
5207 if (last_id != charset_ascii)
69a80ea3 5208 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
24a73b0a
KH
5209 last_id = charset->id;
5210 last_offset = char_offset;
5211 }
5212
df7492f9 5213 *charbuf++ = c;
ff0dacd7 5214 char_offset++;
df7492f9
KH
5215 continue;
5216
5217 invalid_code:
5218 src = src_base;
5219 consumed_chars = consumed_chars_base;
5220 ONE_MORE_BYTE (c);
065e3595 5221 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 5222 char_offset++;
df7492f9 5223 coding->errors++;
4ed46869
KH
5224 }
5225
df7492f9 5226 no_more_source:
ff0dacd7 5227 if (last_id != charset_ascii)
69a80ea3 5228 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
5229 coding->consumed_char += consumed_chars_base;
5230 coding->consumed = src_base - coding->source;
5231 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
5232}
5233
df7492f9
KH
5234static int
5235encode_coding_charset (coding)
4ed46869 5236 struct coding_system *coding;
4ed46869 5237{
df7492f9
KH
5238 int multibytep = coding->dst_multibyte;
5239 int *charbuf = coding->charbuf;
5240 int *charbuf_end = charbuf + coding->charbuf_used;
5241 unsigned char *dst = coding->destination + coding->produced;
5242 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5243 int safe_room = MAX_MULTIBYTE_LENGTH;
5244 int produced_chars = 0;
24a73b0a 5245 Lisp_Object attrs, charset_list;
df7492f9 5246 int ascii_compatible;
b73bfc1c 5247 int c;
b73bfc1c 5248
24a73b0a 5249 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9 5250 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
fb88bf2d 5251
df7492f9 5252 while (charbuf < charbuf_end)
4ed46869 5253 {
4eb6d3f1 5254 struct charset *charset;
df7492f9 5255 unsigned code;
8f924df7 5256
df7492f9
KH
5257 ASSURE_DESTINATION (safe_room);
5258 c = *charbuf++;
5259 if (ascii_compatible && ASCII_CHAR_P (c))
5260 EMIT_ONE_ASCII_BYTE (c);
16eafb5d 5261 else if (CHAR_BYTE8_P (c))
4ed46869 5262 {
16eafb5d
KH
5263 c = CHAR_TO_BYTE8 (c);
5264 EMIT_ONE_BYTE (c);
d46c5b12 5265 }
d46c5b12 5266 else
b73bfc1c 5267 {
4eb6d3f1
KH
5268 charset = char_charset (c, charset_list, &code);
5269 if (charset)
5270 {
5271 if (CHARSET_DIMENSION (charset) == 1)
5272 EMIT_ONE_BYTE (code);
5273 else if (CHARSET_DIMENSION (charset) == 2)
5274 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5275 else if (CHARSET_DIMENSION (charset) == 3)
5276 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5277 else
5278 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5279 (code >> 8) & 0xFF, code & 0xFF);
5280 }
5281 else
41cbe562
KH
5282 {
5283 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5284 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5285 else
5286 c = coding->default_char;
5287 EMIT_ONE_BYTE (c);
5288 }
4ed46869 5289 }
4ed46869
KH
5290 }
5291
065e3595 5292 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
5293 coding->produced_char += produced_chars;
5294 coding->produced = dst - coding->destination;
5295 return 0;
4ed46869
KH
5296}
5297
5298\f
/*** 10. C library functions ***/
4ed46869 5300
df7492f9
KH
5301/* Setup coding context CODING from information about CODING_SYSTEM.
5302 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
5303 CODING_SYSTEM is invalid, signal an error. */
4ed46869 5304
ec6d2bb8 5305void
e0e989f6
KH
5306setup_coding_system (coding_system, coding)
5307 Lisp_Object coding_system;
4ed46869
KH
5308 struct coding_system *coding;
5309{
df7492f9
KH
5310 Lisp_Object attrs;
5311 Lisp_Object eol_type;
5312 Lisp_Object coding_type;
4608c386 5313 Lisp_Object val;
4ed46869 5314
df7492f9 5315 if (NILP (coding_system))
ae6f73fa 5316 coding_system = Qundecided;
c07c8e12 5317
df7492f9 5318 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
1f5dbf34 5319
df7492f9
KH
5320 attrs = CODING_ID_ATTRS (coding->id);
5321 eol_type = CODING_ID_EOL_TYPE (coding->id);
1f5dbf34 5322
df7492f9
KH
5323 coding->mode = 0;
5324 coding->head_ascii = -1;
4a015c45
KH
5325 if (VECTORP (eol_type))
5326 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5327 | CODING_REQUIRE_DETECTION_MASK);
5328 else if (! EQ (eol_type, Qunix))
5329 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5330 | CODING_REQUIRE_ENCODING_MASK);
5331 else
5332 coding->common_flags = 0;
5e5c78be
KH
5333 if (! NILP (CODING_ATTR_POST_READ (attrs)))
5334 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5335 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5336 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
8f924df7
KH
5337 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5338 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
1f5dbf34 5339
df7492f9 5340 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7
KH
5341 coding->max_charset_id = SCHARS (val) - 1;
5342 coding->safe_charsets = (char *) SDATA (val);
df7492f9 5343 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4608c386 5344
df7492f9
KH
5345 coding_type = CODING_ATTR_TYPE (attrs);
5346 if (EQ (coding_type, Qundecided))
d46c5b12 5347 {
df7492f9
KH
5348 coding->detector = NULL;
5349 coding->decoder = decode_coding_raw_text;
5350 coding->encoder = encode_coding_raw_text;
5351 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
d46c5b12 5352 }
df7492f9 5353 else if (EQ (coding_type, Qiso_2022))
d46c5b12 5354 {
df7492f9
KH
5355 int i;
5356 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5357
5358 /* Invoke graphic register 0 to plane 0. */
5359 CODING_ISO_INVOCATION (coding, 0) = 0;
5360 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5361 CODING_ISO_INVOCATION (coding, 1)
5362 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5363 /* Setup the initial status of designation. */
5364 for (i = 0; i < 4; i++)
5365 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5366 /* Not single shifting initially. */
5367 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5368 /* Beginning of buffer should also be regarded as bol. */
5369 CODING_ISO_BOL (coding) = 1;
5370 coding->detector = detect_coding_iso_2022;
5371 coding->decoder = decode_coding_iso_2022;
5372 coding->encoder = encode_coding_iso_2022;
5373 if (flags & CODING_ISO_FLAG_SAFE)
5374 coding->mode |= CODING_MODE_SAFE_ENCODING;
d46c5b12 5375 coding->common_flags
df7492f9
KH
5376 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5377 | CODING_REQUIRE_FLUSHING_MASK);
5378 if (flags & CODING_ISO_FLAG_COMPOSITION)
5379 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
ff0dacd7
KH
5380 if (flags & CODING_ISO_FLAG_DESIGNATION)
5381 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
df7492f9
KH
5382 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5383 {
5384 setup_iso_safe_charsets (attrs);
5385 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7
KH
5386 coding->max_charset_id = SCHARS (val) - 1;
5387 coding->safe_charsets = (char *) SDATA (val);
df7492f9
KH
5388 }
5389 CODING_ISO_FLAGS (coding) = flags;
d46c5b12 5390 }
df7492f9 5391 else if (EQ (coding_type, Qcharset))
d46c5b12 5392 {
df7492f9
KH
5393 coding->detector = detect_coding_charset;
5394 coding->decoder = decode_coding_charset;
5395 coding->encoder = encode_coding_charset;
d46c5b12 5396 coding->common_flags
df7492f9 5397 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
d46c5b12 5398 }
df7492f9 5399 else if (EQ (coding_type, Qutf_8))
d46c5b12 5400 {
a470d443
KH
5401 val = AREF (attrs, coding_attr_utf_bom);
5402 CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5403 : EQ (val, Qt) ? utf_with_bom
5404 : utf_without_bom);
df7492f9
KH
5405 coding->detector = detect_coding_utf_8;
5406 coding->decoder = decode_coding_utf_8;
5407 coding->encoder = encode_coding_utf_8;
5408 coding->common_flags
5409 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
a470d443
KH
5410 if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5411 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
df7492f9
KH
5412 }
5413 else if (EQ (coding_type, Qutf_16))
5414 {
a470d443
KH
5415 val = AREF (attrs, coding_attr_utf_bom);
5416 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5417 : EQ (val, Qt) ? utf_with_bom
5418 : utf_without_bom);
df7492f9 5419 val = AREF (attrs, coding_attr_utf_16_endian);
b49a1807 5420 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
df7492f9 5421 : utf_16_little_endian);
e19c3639 5422 CODING_UTF_16_SURROGATE (coding) = 0;
df7492f9
KH
5423 coding->detector = detect_coding_utf_16;
5424 coding->decoder = decode_coding_utf_16;
5425 coding->encoder = encode_coding_utf_16;
5426 coding->common_flags
5427 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
a470d443 5428 if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
b49a1807 5429 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
d46c5b12 5430 }
df7492f9 5431 else if (EQ (coding_type, Qccl))
4ed46869 5432 {
df7492f9
KH
5433 coding->detector = detect_coding_ccl;
5434 coding->decoder = decode_coding_ccl;
5435 coding->encoder = encode_coding_ccl;
c952af22 5436 coding->common_flags
df7492f9
KH
5437 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5438 | CODING_REQUIRE_FLUSHING_MASK);
5439 }
5440 else if (EQ (coding_type, Qemacs_mule))
5441 {
5442 coding->detector = detect_coding_emacs_mule;
5443 coding->decoder = decode_coding_emacs_mule;
5444 coding->encoder = encode_coding_emacs_mule;
c952af22 5445 coding->common_flags
df7492f9
KH
5446 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5447 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5448 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5449 {
5450 Lisp_Object tail, safe_charsets;
5451 int max_charset_id = 0;
5452
5453 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5454 tail = XCDR (tail))
5455 if (max_charset_id < XFASTINT (XCAR (tail)))
5456 max_charset_id = XFASTINT (XCAR (tail));
5457 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
5458 make_number (255));
5459 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5460 tail = XCDR (tail))
8f924df7 5461 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
df7492f9 5462 coding->max_charset_id = max_charset_id;
8f924df7 5463 coding->safe_charsets = (char *) SDATA (safe_charsets);
df7492f9
KH
5464 }
5465 }
5466 else if (EQ (coding_type, Qshift_jis))
5467 {
5468 coding->detector = detect_coding_sjis;
5469 coding->decoder = decode_coding_sjis;
5470 coding->encoder = encode_coding_sjis;
c952af22 5471 coding->common_flags
df7492f9
KH
5472 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5473 }
5474 else if (EQ (coding_type, Qbig5))
5475 {
5476 coding->detector = detect_coding_big5;
5477 coding->decoder = decode_coding_big5;
5478 coding->encoder = encode_coding_big5;
c952af22 5479 coding->common_flags
df7492f9
KH
5480 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5481 }
5482 else /* EQ (coding_type, Qraw_text) */
ec6d2bb8 5483 {
df7492f9
KH
5484 coding->detector = NULL;
5485 coding->decoder = decode_coding_raw_text;
5486 coding->encoder = encode_coding_raw_text;
ea29edf2
KH
5487 if (! EQ (eol_type, Qunix))
5488 {
5489 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5490 if (! VECTORP (eol_type))
5491 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5492 }
5493
4ed46869 5494 }
4ed46869 5495
df7492f9 5496 return;
4ed46869
KH
5497}
5498
0ff61e78
KH
5499/* Return a list of charsets supported by CODING. */
5500
5501Lisp_Object
5502coding_charset_list (coding)
5503 struct coding_system *coding;
5504{
35befdaa 5505 Lisp_Object attrs, charset_list;
0ff61e78
KH
5506
5507 CODING_GET_INFO (coding, attrs, charset_list);
5508 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5509 {
5510 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5511
5512 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5513 charset_list = Viso_2022_charset_list;
5514 }
5515 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5516 {
5517 charset_list = Vemacs_mule_charset_list;
5518 }
5519 return charset_list;
5520}
5521
5522
df7492f9
KH
5523/* Return raw-text or one of its subsidiaries that has the same
5524 eol_type as CODING-SYSTEM. */
ec6d2bb8 5525
df7492f9
KH
5526Lisp_Object
5527raw_text_coding_system (coding_system)
5528 Lisp_Object coding_system;
ec6d2bb8 5529{
0be8721c 5530 Lisp_Object spec, attrs;
df7492f9 5531 Lisp_Object eol_type, raw_text_eol_type;
ec6d2bb8 5532
d3e4cb56
KH
5533 if (NILP (coding_system))
5534 return Qraw_text;
df7492f9
KH
5535 spec = CODING_SYSTEM_SPEC (coding_system);
5536 attrs = AREF (spec, 0);
ec6d2bb8 5537
df7492f9
KH
5538 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5539 return coding_system;
ec6d2bb8 5540
df7492f9
KH
5541 eol_type = AREF (spec, 2);
5542 if (VECTORP (eol_type))
5543 return Qraw_text;
5544 spec = CODING_SYSTEM_SPEC (Qraw_text);
5545 raw_text_eol_type = AREF (spec, 2);
5546 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5547 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5548 : AREF (raw_text_eol_type, 2));
ec6d2bb8
KH
5549}
5550
54f78171 5551
df7492f9
KH
5552/* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5553 does, return one of the subsidiary that has the same eol-spec as
fcbcfb64
KH
5554 PARENT. Otherwise, return CODING_SYSTEM. If PARENT is nil,
5555 inherit end-of-line format from the system's setting
5556 (system_eol_type). */
df7492f9
KH
5557
5558Lisp_Object
5559coding_inherit_eol_type (coding_system, parent)
b74e4686 5560 Lisp_Object coding_system, parent;
54f78171 5561{
3e139625 5562 Lisp_Object spec, eol_type;
54f78171 5563
d3e4cb56
KH
5564 if (NILP (coding_system))
5565 coding_system = Qraw_text;
df7492f9 5566 spec = CODING_SYSTEM_SPEC (coding_system);
df7492f9 5567 eol_type = AREF (spec, 2);
fcbcfb64 5568 if (VECTORP (eol_type))
df7492f9 5569 {
df7492f9
KH
5570 Lisp_Object parent_eol_type;
5571
fcbcfb64
KH
5572 if (! NILP (parent))
5573 {
5574 Lisp_Object parent_spec;
5575
4a015c45 5576 parent_spec = CODING_SYSTEM_SPEC (parent);
fcbcfb64
KH
5577 parent_eol_type = AREF (parent_spec, 2);
5578 }
5579 else
5580 parent_eol_type = system_eol_type;
df7492f9
KH
5581 if (EQ (parent_eol_type, Qunix))
5582 coding_system = AREF (eol_type, 0);
5583 else if (EQ (parent_eol_type, Qdos))
5584 coding_system = AREF (eol_type, 1);
5585 else if (EQ (parent_eol_type, Qmac))
5586 coding_system = AREF (eol_type, 2);
54f78171 5587 }
df7492f9 5588 return coding_system;
54f78171
KH
5589}
5590
4ed46869
KH
5591/* Emacs has a mechanism to automatically detect a coding system if it
5592 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5593 it's impossible to distinguish some coding systems accurately
5594 because they use the same range of codes. So, at first, coding
5595 systems are categorized into the following categories:
5596
0ef69138 5597 o coding-category-emacs-mule
4ed46869
KH
5598
5599 The category for a coding system which has the same code range
5600 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 5601 symbol) `emacs-mule' by default.
4ed46869
KH
5602
5603 o coding-category-sjis
5604
5605 The category for a coding system which has the same code range
5606 as SJIS. Assigned the coding-system (Lisp
7717c392 5607 symbol) `japanese-shift-jis' by default.
4ed46869
KH
5608
5609 o coding-category-iso-7
5610
5611 The category for a coding system which has the same code range
7717c392 5612 as ISO2022 of 7-bit environment. This doesn't use any locking
d46c5b12
KH
5613 shift and single shift functions. This can encode/decode all
5614 charsets. Assigned the coding-system (Lisp symbol)
5615 `iso-2022-7bit' by default.
5616
5617 o coding-category-iso-7-tight
5618
5619 Same as coding-category-iso-7 except that this can
5620 encode/decode only the specified charsets.
4ed46869
KH
5621
5622 o coding-category-iso-8-1
5623
5624 The category for a coding system which has the same code range
5625 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
5626 for DIMENSION1 charset. This doesn't use any locking shift
5627 and single shift functions. Assigned the coding-system (Lisp
5628 symbol) `iso-latin-1' by default.
4ed46869
KH
5629
5630 o coding-category-iso-8-2
5631
5632 The category for a coding system which has the same code range
5633 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
5634 for DIMENSION2 charset. This doesn't use any locking shift
5635 and single shift functions. Assigned the coding-system (Lisp
5636 symbol) `japanese-iso-8bit' by default.
4ed46869 5637
7717c392 5638 o coding-category-iso-7-else
4ed46869
KH
5639
5640 The category for a coding system which has the same code range
df7492f9 5641 as ISO2022 of 7-bit environment but uses locking shift or
7717c392
KH
5642 single shift functions. Assigned the coding-system (Lisp
5643 symbol) `iso-2022-7bit-lock' by default.
5644
5645 o coding-category-iso-8-else
5646
5647 The category for a coding system which has the same code range
df7492f9 5648 as ISO2022 of 8-bit environment but uses locking shift or
7717c392
KH
5649 single shift functions. Assigned the coding-system (Lisp
5650 symbol) `iso-2022-8bit-ss2' by default.
4ed46869
KH
5651
5652 o coding-category-big5
5653
5654 The category for a coding system which has the same code range
5655 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 5656 `cn-big5' by default.
4ed46869 5657
fa42c37f
KH
5658 o coding-category-utf-8
5659
5660 The category for a coding system which has the same code range
6e76ae91 5661 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
fa42c37f
KH
5662 symbol) `utf-8' by default.
5663
5664 o coding-category-utf-16-be
5665
5666 The category for a coding system in which a text has an
5667 Unicode signature (cf. Unicode Standard) in the order of BIG
5668 endian at the head. Assigned the coding-system (Lisp symbol)
5669 `utf-16-be' by default.
5670
5671 o coding-category-utf-16-le
5672
5673 The category for a coding system in which a text has an
5674 Unicode signature (cf. Unicode Standard) in the order of
5675 LITTLE endian at the head. Assigned the coding-system (Lisp
5676 symbol) `utf-16-le' by default.
5677
1397dc18
KH
5678 o coding-category-ccl
5679
5680 The category for a coding system of which encoder/decoder is
5681 written in CCL programs. The default value is nil, i.e., no
5682 coding system is assigned.
5683
4ed46869
KH
5684 o coding-category-binary
5685
5686 The category for a coding system not categorized in any of the
5687 above. Assigned the coding-system (Lisp symbol)
e0e989f6 5688 `no-conversion' by default.
4ed46869
KH
5689
5690 Each of them is a Lisp symbol and the value is an actual
df7492f9 5691 `coding-system's (this is also a Lisp symbol) assigned by a user.
4ed46869
KH
5692 What Emacs does actually is to detect a category of coding system.
5693 Then, it uses a `coding-system' assigned to it. If Emacs can't
df7492f9 5694 decide only one possible category, it selects a category of the
4ed46869
KH
5695 highest priority. Priorities of categories are also specified by a
5696 user in a Lisp variable `coding-category-list'.
5697
5698*/
5699
df7492f9
KH
5700#define EOL_SEEN_NONE 0
5701#define EOL_SEEN_LF 1
5702#define EOL_SEEN_CR 2
5703#define EOL_SEEN_CRLF 4
66cfb530 5704
ff0dacd7
KH
5705/* Detect how end-of-line of a text of length SRC_BYTES pointed by
5706 SOURCE is encoded. If CATEGORY is one of
5707 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5708 two-byte, else they are encoded by one-byte.
5709
5710 Return one of EOL_SEEN_XXX. */
4ed46869 5711
bc4bc72a 5712#define MAX_EOL_CHECK_COUNT 3
d46c5b12
KH
5713
5714static int
89528eb3 5715detect_eol (source, src_bytes, category)
f6cbaf43 5716 const unsigned char *source;
df7492f9 5717 EMACS_INT src_bytes;
89528eb3 5718 enum coding_category category;
4ed46869 5719{
f6cbaf43 5720 const unsigned char *src = source, *src_end = src + src_bytes;
4ed46869 5721 unsigned char c;
df7492f9
KH
5722 int total = 0;
5723 int eol_seen = EOL_SEEN_NONE;
4ed46869 5724
89528eb3 5725 if ((1 << category) & CATEGORY_MASK_UTF_16)
4ed46869 5726 {
df7492f9 5727 int msb, lsb;
fa42c37f 5728
89528eb3
KH
5729 msb = category == (coding_category_utf_16_le
5730 | coding_category_utf_16_le_nosig);
df7492f9 5731 lsb = 1 - msb;
fa42c37f 5732
df7492f9 5733 while (src + 1 < src_end)
fa42c37f 5734 {
df7492f9
KH
5735 c = src[lsb];
5736 if (src[msb] == 0 && (c == '\n' || c == '\r'))
fa42c37f 5737 {
df7492f9
KH
5738 int this_eol;
5739
5740 if (c == '\n')
5741 this_eol = EOL_SEEN_LF;
5742 else if (src + 3 >= src_end
5743 || src[msb + 2] != 0
5744 || src[lsb + 2] != '\n')
5745 this_eol = EOL_SEEN_CR;
fa42c37f 5746 else
8f924df7 5747 this_eol = EOL_SEEN_CRLF;
df7492f9
KH
5748
5749 if (eol_seen == EOL_SEEN_NONE)
5750 /* This is the first end-of-line. */
5751 eol_seen = this_eol;
5752 else if (eol_seen != this_eol)
fa42c37f 5753 {
df7492f9
KH
5754 /* The found type is different from what found before. */
5755 eol_seen = EOL_SEEN_LF;
5756 break;
fa42c37f 5757 }
df7492f9
KH
5758 if (++total == MAX_EOL_CHECK_COUNT)
5759 break;
fa42c37f 5760 }
df7492f9 5761 src += 2;
fa42c37f 5762 }
bcf26d6a 5763 }
d46c5b12 5764 else
c4825358 5765 {
df7492f9 5766 while (src < src_end)
27901516 5767 {
df7492f9
KH
5768 c = *src++;
5769 if (c == '\n' || c == '\r')
5770 {
5771 int this_eol;
d46c5b12 5772
df7492f9
KH
5773 if (c == '\n')
5774 this_eol = EOL_SEEN_LF;
5775 else if (src >= src_end || *src != '\n')
5776 this_eol = EOL_SEEN_CR;
5777 else
5778 this_eol = EOL_SEEN_CRLF, src++;
d46c5b12 5779
df7492f9
KH
5780 if (eol_seen == EOL_SEEN_NONE)
5781 /* This is the first end-of-line. */
5782 eol_seen = this_eol;
5783 else if (eol_seen != this_eol)
5784 {
5785 /* The found type is different from what found before. */
5786 eol_seen = EOL_SEEN_LF;
5787 break;
5788 }
5789 if (++total == MAX_EOL_CHECK_COUNT)
5790 break;
5791 }
5792 }
73be902c 5793 }
df7492f9 5794 return eol_seen;
73be902c
KH
5795}
5796
df7492f9 5797
24a73b0a 5798static Lisp_Object
df7492f9
KH
5799adjust_coding_eol_type (coding, eol_seen)
5800 struct coding_system *coding;
5801 int eol_seen;
73be902c 5802{
0be8721c 5803 Lisp_Object eol_type;
8f924df7 5804
df7492f9
KH
5805 eol_type = CODING_ID_EOL_TYPE (coding->id);
5806 if (eol_seen & EOL_SEEN_LF)
24a73b0a
KH
5807 {
5808 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5809 eol_type = Qunix;
5810 }
6f197c07 5811 else if (eol_seen & EOL_SEEN_CRLF)
24a73b0a
KH
5812 {
5813 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5814 eol_type = Qdos;
5815 }
6f197c07 5816 else if (eol_seen & EOL_SEEN_CR)
24a73b0a
KH
5817 {
5818 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5819 eol_type = Qmac;
5820 }
5821 return eol_type;
d46c5b12 5822}
4ed46869 5823
df7492f9
KH
5824/* Detect how a text specified in CODING is encoded. If a coding
5825 system is detected, update fields of CODING by the detected coding
5826 system. */
0a28aafb 5827
df7492f9
KH
5828void
5829detect_coding (coding)
d46c5b12 5830 struct coding_system *coding;
d46c5b12 5831{
8f924df7 5832 const unsigned char *src, *src_end;
d46c5b12 5833
df7492f9
KH
5834 coding->consumed = coding->consumed_char = 0;
5835 coding->produced = coding->produced_char = 0;
5836 coding_set_source (coding);
1c3478b0 5837
df7492f9 5838 src_end = coding->source + coding->src_bytes;
c0e16b14 5839 coding->head_ascii = 0;
1c3478b0 5840
df7492f9
KH
5841 /* If we have not yet decided the text encoding type, detect it
5842 now. */
5843 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
b73bfc1c 5844 {
df7492f9 5845 int c, i;
6cb21a4f 5846 struct coding_detection_info detect_info;
2f3cbb32 5847 int null_byte_found = 0, eight_bit_found = 0;
df7492f9 5848
6cb21a4f 5849 detect_info.checked = detect_info.found = detect_info.rejected = 0;
2f3cbb32 5850 for (src = coding->source; src < src_end; src++)
d46c5b12 5851 {
df7492f9 5852 c = *src;
6cb21a4f 5853 if (c & 0x80)
6cb21a4f 5854 {
2f3cbb32 5855 eight_bit_found = 1;
2f3cbb32
KH
5856 if (null_byte_found)
5857 break;
5858 }
5859 else if (c < 0x20)
5860 {
5861 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
5862 && ! inhibit_iso_escape_detection
5863 && ! detect_info.checked)
6cb21a4f 5864 {
2f3cbb32
KH
5865 if (detect_coding_iso_2022 (coding, &detect_info))
5866 {
5867 /* We have scanned the whole data. */
5868 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
c0e16b14
KH
5869 {
5870 /* We didn't find an 8-bit code. We may
5871 have found a null-byte, but it's very
5872 rare that a binary file confirm to
5873 ISO-2022. */
5874 src = src_end;
5875 coding->head_ascii = src - coding->source;
5876 }
5877 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
2f3cbb32
KH
5878 break;
5879 }
5880 }
5881 else if (! c)
5882 {
5883 null_byte_found = 1;
5884 if (eight_bit_found)
5885 break;
6cb21a4f 5886 }
c006c0c8
KH
5887 if (! eight_bit_found)
5888 coding->head_ascii++;
6cb21a4f 5889 }
c006c0c8 5890 else if (! eight_bit_found)
c0e16b14 5891 coding->head_ascii++;
d46c5b12 5892 }
df7492f9 5893
2f3cbb32
KH
5894 if (null_byte_found || eight_bit_found
5895 || coding->head_ascii < coding->src_bytes
6cb21a4f 5896 || detect_info.found)
d46c5b12 5897 {
ff0dacd7
KH
5898 enum coding_category category;
5899 struct coding_system *this;
df7492f9 5900
6cb21a4f
KH
5901 if (coding->head_ascii == coding->src_bytes)
5902 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
5903 for (i = 0; i < coding_category_raw_text; i++)
5904 {
5905 category = coding_priorities[i];
5906 this = coding_categories + category;
5907 if (detect_info.found & (1 << category))
24a73b0a 5908 break;
6cb21a4f
KH
5909 }
5910 else
2f3cbb32
KH
5911 {
5912 if (null_byte_found)
ff0dacd7 5913 {
2f3cbb32
KH
5914 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
5915 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
ff0dacd7 5916 }
2f3cbb32
KH
5917 for (i = 0; i < coding_category_raw_text; i++)
5918 {
5919 category = coding_priorities[i];
5920 this = coding_categories + category;
5921 if (this->id < 0)
5922 {
5923 /* No coding system of this category is defined. */
5924 detect_info.rejected |= (1 << category);
5925 }
5926 else if (category >= coding_category_raw_text)
5927 continue;
5928 else if (detect_info.checked & (1 << category))
5929 {
5930 if (detect_info.found & (1 << category))
5931 break;
5932 }
5933 else if ((*(this->detector)) (coding, &detect_info)
5934 && detect_info.found & (1 << category))
5935 {
5936 if (category == coding_category_utf_16_auto)
5937 {
5938 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5939 category = coding_category_utf_16_le;
5940 else
5941 category = coding_category_utf_16_be;
5942 }
5943 break;
5944 }
5945 }
2f3cbb32 5946 }
c0e16b14
KH
5947
5948 if (i < coding_category_raw_text)
5949 setup_coding_system (CODING_ID_NAME (this->id), coding);
5950 else if (null_byte_found)
5951 setup_coding_system (Qno_conversion, coding);
5952 else if ((detect_info.rejected & CATEGORY_MASK_ANY)
5953 == CATEGORY_MASK_ANY)
5954 setup_coding_system (Qraw_text, coding);
5955 else if (detect_info.rejected)
5956 for (i = 0; i < coding_category_raw_text; i++)
5957 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5958 {
5959 this = coding_categories + coding_priorities[i];
5960 setup_coding_system (CODING_ID_NAME (this->id), coding);
5961 break;
5962 }
d46c5b12 5963 }
b73bfc1c 5964 }
a470d443
KH
5965 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5966 == coding_category_utf_8_auto)
5967 {
5968 Lisp_Object coding_systems;
5969 struct coding_detection_info detect_info;
5970
5971 coding_systems
5972 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
5973 detect_info.found = detect_info.rejected = 0;
5974 coding->head_ascii = 0;
5975 if (CONSP (coding_systems)
5976 && detect_coding_utf_8 (coding, &detect_info))
5977 {
5978 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
5979 setup_coding_system (XCAR (coding_systems), coding);
5980 else
5981 setup_coding_system (XCDR (coding_systems), coding);
5982 }
5983 }
24a73b0a
KH
5984 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5985 == coding_category_utf_16_auto)
b49a1807
KH
5986 {
5987 Lisp_Object coding_systems;
5988 struct coding_detection_info detect_info;
5989
5990 coding_systems
a470d443 5991 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
b49a1807 5992 detect_info.found = detect_info.rejected = 0;
a470d443 5993 coding->head_ascii = 0;
b49a1807 5994 if (CONSP (coding_systems)
24a73b0a 5995 && detect_coding_utf_16 (coding, &detect_info))
b49a1807
KH
5996 {
5997 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5998 setup_coding_system (XCAR (coding_systems), coding);
24a73b0a 5999 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
b49a1807
KH
6000 setup_coding_system (XCDR (coding_systems), coding);
6001 }
6002 }
4ed46869 6003}
4ed46869 6004
d46c5b12 6005
aaaf0b1e 6006static void
df7492f9 6007decode_eol (coding)
aaaf0b1e 6008 struct coding_system *coding;
aaaf0b1e 6009{
24a73b0a
KH
6010 Lisp_Object eol_type;
6011 unsigned char *p, *pbeg, *pend;
3ed051d4 6012
24a73b0a
KH
6013 eol_type = CODING_ID_EOL_TYPE (coding->id);
6014 if (EQ (eol_type, Qunix))
6015 return;
6016
6017 if (NILP (coding->dst_object))
6018 pbeg = coding->destination;
6019 else
6020 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6021 pend = pbeg + coding->produced;
6022
6023 if (VECTORP (eol_type))
aaaf0b1e 6024 {
df7492f9 6025 int eol_seen = EOL_SEEN_NONE;
4ed46869 6026
24a73b0a 6027 for (p = pbeg; p < pend; p++)
aaaf0b1e 6028 {
df7492f9
KH
6029 if (*p == '\n')
6030 eol_seen |= EOL_SEEN_LF;
6031 else if (*p == '\r')
aaaf0b1e 6032 {
df7492f9 6033 if (p + 1 < pend && *(p + 1) == '\n')
aaaf0b1e 6034 {
df7492f9
KH
6035 eol_seen |= EOL_SEEN_CRLF;
6036 p++;
aaaf0b1e 6037 }
aaaf0b1e 6038 else
df7492f9 6039 eol_seen |= EOL_SEEN_CR;
aaaf0b1e 6040 }
aaaf0b1e 6041 }
24a73b0a
KH
6042 if (eol_seen != EOL_SEEN_NONE
6043 && eol_seen != EOL_SEEN_LF
6044 && eol_seen != EOL_SEEN_CRLF
6045 && eol_seen != EOL_SEEN_CR)
6046 eol_seen = EOL_SEEN_LF;
df7492f9 6047 if (eol_seen != EOL_SEEN_NONE)
24a73b0a 6048 eol_type = adjust_coding_eol_type (coding, eol_seen);
aaaf0b1e 6049 }
d46c5b12 6050
24a73b0a 6051 if (EQ (eol_type, Qmac))
27901516 6052 {
24a73b0a 6053 for (p = pbeg; p < pend; p++)
df7492f9
KH
6054 if (*p == '\r')
6055 *p = '\n';
4ed46869 6056 }
24a73b0a 6057 else if (EQ (eol_type, Qdos))
df7492f9 6058 {
24a73b0a 6059 int n = 0;
b73bfc1c 6060
24a73b0a
KH
6061 if (NILP (coding->dst_object))
6062 {
4347441b
KH
6063 /* Start deleting '\r' from the tail to minimize the memory
6064 movement. */
24a73b0a
KH
6065 for (p = pend - 2; p >= pbeg; p--)
6066 if (*p == '\r')
6067 {
6068 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
6069 n++;
6070 }
6071 }
6072 else
6073 {
4347441b
KH
6074 int pos_byte = coding->dst_pos_byte;
6075 int pos = coding->dst_pos;
6076 int pos_end = pos + coding->produced_char - 1;
6077
6078 while (pos < pos_end)
6079 {
6080 p = BYTE_POS_ADDR (pos_byte);
6081 if (*p == '\r' && p[1] == '\n')
6082 {
6083 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6084 n++;
6085 pos_end--;
6086 }
6087 pos++;
69b8522d
KH
6088 if (coding->dst_multibyte)
6089 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6090 else
6091 pos_byte++;
4347441b 6092 }
24a73b0a
KH
6093 }
6094 coding->produced -= n;
6095 coding->produced_char -= n;
aaaf0b1e 6096 }
4ed46869
KH
6097}
6098
7d64c6ad 6099
a6f87d34
KH
6100/* Return a translation table (or list of them) from coding system
6101 attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6102 decoding (ENCODEP is zero). */
7d64c6ad 6103
e6a54062 6104static Lisp_Object
09ee6fdd
KH
6105get_translation_table (attrs, encodep, max_lookup)
6106 Lisp_Object attrs;
6107 int encodep, *max_lookup;
7d64c6ad
KH
6108{
6109 Lisp_Object standard, translation_table;
09ee6fdd 6110 Lisp_Object val;
7d64c6ad
KH
6111
6112 if (encodep)
6113 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6114 standard = Vstandard_translation_table_for_encode;
6115 else
6116 translation_table = CODING_ATTR_DECODE_TBL (attrs),
6117 standard = Vstandard_translation_table_for_decode;
7d64c6ad 6118 if (NILP (translation_table))
09ee6fdd
KH
6119 translation_table = standard;
6120 else
a6f87d34 6121 {
09ee6fdd
KH
6122 if (SYMBOLP (translation_table))
6123 translation_table = Fget (translation_table, Qtranslation_table);
6124 else if (CONSP (translation_table))
6125 {
6126 translation_table = Fcopy_sequence (translation_table);
6127 for (val = translation_table; CONSP (val); val = XCDR (val))
6128 if (SYMBOLP (XCAR (val)))
6129 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6130 }
6131 if (CHAR_TABLE_P (standard))
6132 {
6133 if (CONSP (translation_table))
6134 translation_table = nconc2 (translation_table,
6135 Fcons (standard, Qnil));
6136 else
6137 translation_table = Fcons (translation_table,
6138 Fcons (standard, Qnil));
6139 }
a6f87d34 6140 }
2170c8f0
KH
6141
6142 if (max_lookup)
09ee6fdd 6143 {
2170c8f0
KH
6144 *max_lookup = 1;
6145 if (CHAR_TABLE_P (translation_table)
6146 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6147 {
6148 val = XCHAR_TABLE (translation_table)->extras[1];
6149 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6150 *max_lookup = XFASTINT (val);
6151 }
6152 else if (CONSP (translation_table))
6153 {
6154 Lisp_Object tail, val;
09ee6fdd 6155
2170c8f0
KH
6156 for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6157 if (CHAR_TABLE_P (XCAR (tail))
6158 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6159 {
6160 val = XCHAR_TABLE (XCAR (tail))->extras[1];
6161 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6162 *max_lookup = XFASTINT (val);
6163 }
6164 }
a6f87d34 6165 }
7d64c6ad
KH
6166 return translation_table;
6167}
6168
09ee6fdd
KH
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables.  C and TRANS must be lvalues.

   When a table maps C to a character, C is replaced by that character
   and TRANS is set to nil; in a list, the lookup then continues with
   the next table using the new C.  When a table maps C to a non-nil
   value that is not a character, TRANS is left holding that value
   (and, in a list, the remaining tables are skipped).  TRANS is nil
   when no such value was found.  */
#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    trans = Qnil;                                               \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        trans = CHAR_TABLE_REF (table, c);                      \
        if (CHARACTERP (trans))                                 \
          {                                                     \
            c = XFASTINT (trans);                               \
            trans = Qnil;                                       \
          }                                                     \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        Lisp_Object tail;                                       \
                                                                \
        for (tail = table; CONSP (tail); tail = XCDR (tail))    \
          {                                                     \
            if (! CHAR_TABLE_P (XCAR (tail)))                   \
              continue;                                         \
            trans = CHAR_TABLE_REF (XCAR (tail), c);            \
            if (CHARACTERP (trans))                             \
              {                                                 \
                c = XFASTINT (trans);                           \
                trans = Qnil;                                   \
              }                                                 \
            else if (! NILP (trans))                            \
              break;                                            \
          }                                                     \
      }                                                         \
  } while (0)
6193
7d64c6ad 6194
69a80ea3
KH
6195static Lisp_Object
6196get_translation (val, buf, buf_end, last_block, from_nchars, to_nchars)
6197 Lisp_Object val;
6198 int *buf, *buf_end;
6199 int last_block;
6200 int *from_nchars, *to_nchars;
6201{
433f7f87
KH
6202 /* VAL is TO or (([FROM-CHAR ...] . TO) ...) where TO is TO-CHAR or
6203 [TO-CHAR ...]. */
69a80ea3
KH
6204 if (CONSP (val))
6205 {
433f7f87 6206 Lisp_Object from, tail;
69a80ea3
KH
6207 int i, len;
6208
433f7f87 6209 for (tail = val; CONSP (tail); tail = XCDR (tail))
69a80ea3 6210 {
433f7f87
KH
6211 val = XCAR (tail);
6212 from = XCAR (val);
6213 len = ASIZE (from);
6214 for (i = 0; i < len; i++)
6215 {
6216 if (buf + i == buf_end)
6217 {
6218 if (! last_block)
6219 return Qt;
6220 break;
6221 }
6222 if (XINT (AREF (from, i)) != buf[i])
6223 break;
6224 }
6225 if (i == len)
6226 {
6227 val = XCDR (val);
6228 *from_nchars = len;
6229 break;
6230 }
69a80ea3 6231 }
433f7f87
KH
6232 if (! CONSP (tail))
6233 return Qnil;
69a80ea3
KH
6234 }
6235 if (VECTORP (val))
6236 *buf = XINT (AREF (val, 0)), *to_nchars = ASIZE (val);
6237 else
6238 *buf = XINT (val);
6239 return val;
6240}
6241
6242
/* Write decoded characters to the destination of CODING, starting at
   CODING->destination + CODING->produced.

   If CODING->chars_at_source is zero, the characters are taken from
   CODING->charbuf and passed through TRANSLATION_TABLE; otherwise
   the CODING->consumed bytes at CODING->source are copied.  The
   destination is grown with alloc_destination when needed.

   On return CODING->produced and CODING->produced_char have been
   advanced, and if the destination object is a buffer the new text
   has been handed to insert_from_gap.

   The return value is the number of CODING->charbuf elements left
   unprocessed; it is nonzero only when get_translation returned Qt
   (more characters are needed to decide a translation; this requires
   LAST_BLOCK to be zero).  */
d46c5b12 6243static int
69a80ea3 6244produce_chars (coding, translation_table, last_block)
df7492f9 6245 struct coding_system *coding;
69a80ea3
KH
6246 Lisp_Object translation_table;
6247 int last_block;
4ed46869 6248{
df7492f9
KH
6249 unsigned char *dst = coding->destination + coding->produced;
6250 unsigned char *dst_end = coding->destination + coding->dst_bytes;
119852e7
KH
6251 EMACS_INT produced;
6252 EMACS_INT produced_chars = 0;
69a80ea3 6253 int carryover = 0;
4ed46869 6254
df7492f9 6255 if (! coding->chars_at_source)
4ed46869 6256 {
119852e7 6257 /* Source characters are in coding->charbuf. */
fba4576f
AS
6258 int *buf = coding->charbuf;
6259 int *buf_end = buf + coding->charbuf_used;
4ed46869 6260
/* When source and destination are the same object, the destination
   limit is the end of the consumed part of the source.  */
db274c7a
KH
6261 if (EQ (coding->src_object, coding->dst_object))
6262 {
6263 coding_set_source (coding);
6264 dst_end = ((unsigned char *) coding->source) + coding->consumed;
6265 }
4ed46869 6266
df7492f9 6267 while (buf < buf_end)
4ed46869 6268 {
69a80ea3 6269 int c = *buf, i;
bc4bc72a 6270
/* A non-negative element is a character; a negative one starts an
   annotation.  */
df7492f9
KH
6271 if (c >= 0)
6272 {
69a80ea3
KH
6273 int from_nchars = 1, to_nchars = 1;
6274 Lisp_Object trans = Qnil;
6275
09ee6fdd 6276 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
e6a54062 6277 if (! NILP (trans))
69a80ea3
KH
6278 {
6279 trans = get_translation (trans, buf, buf_end, last_block,
6280 &from_nchars, &to_nchars);
/* Qt: stop here; the rest of charbuf is reported as carryover.  */
6281 if (EQ (trans, Qt))
6282 break;
6283 c = *buf;
6284 }
6285
/* Make sure there is room for TO_NCHARS characters of maximum
   multibyte length.  */
6286 if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
6287 {
6288 dst = alloc_destination (coding,
6289 buf_end - buf
6290 + MAX_MULTIBYTE_LENGTH * to_nchars,
6291 dst);
db274c7a
KH
6292 if (EQ (coding->src_object, coding->dst_object))
6293 {
6294 coding_set_source (coding);
6295 dst_end = ((unsigned char *) coding->source) + coding->consumed;
6296 }
6297 else
6298 dst_end = coding->destination + coding->dst_bytes;
69a80ea3
KH
6299 }
6300
/* The first character to write is C; the following ones come from
   the vector TRANS.  */
433f7f87 6301 for (i = 0; i < to_nchars; i++)
69a80ea3 6302 {
433f7f87
KH
6303 if (i > 0)
6304 c = XINT (AREF (trans, i));
69a80ea3
KH
6305 if (coding->dst_multibyte
6306 || ! CHAR_BYTE8_P (c))
db274c7a 6307 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
69a80ea3
KH
6308 else
6309 *dst++ = CHAR_TO_BYTE8 (c);
6310 }
6311 produced_chars += to_nchars;
/* Overwrite the consumed charbuf elements: the first gets the number
   of characters produced, the remaining FROM_NCHARS - 1 get zero.  */
6312 *buf++ = to_nchars;
6313 while (--from_nchars > 0)
6314 *buf++ = 0;
d46c5b12 6315 }
df7492f9 6316 else
69a80ea3
KH
6317 /* This is an annotation datum. (-C) is the length. */
6318 buf += -c;
4ed46869 6319 }
69a80ea3 6320 carryover = buf_end - buf;
4ed46869 6321 }
fa42c37f 6322 else
fa42c37f 6323 {
119852e7 6324 /* Source characters are at coding->source. */
8f924df7 6325 const unsigned char *src = coding->source;
119852e7 6326 const unsigned char *src_end = src + coding->consumed;
4ed46869 6327
db274c7a
KH
6328 if (EQ (coding->dst_object, coding->src_object))
6329 dst_end = (unsigned char *) src;
6330 if (coding->src_multibyte != coding->dst_multibyte)
fa42c37f 6331 {
df7492f9 6332 if (coding->src_multibyte)
fa42c37f 6333 {
/* NOTE(review): multibytep, consumed_chars and src_base are not
   referenced directly below; presumably ONE_MORE_BYTE (defined
   elsewhere) uses them and jumps to no_more_source when the source
   is exhausted -- confirm against the macro definition.  */
71c81426 6334 int multibytep = 1;
4533845d 6335 EMACS_INT consumed_chars = 0;
d46c5b12 6336
df7492f9
KH
6337 while (1)
6338 {
8f924df7 6339 const unsigned char *src_base = src;
df7492f9 6340 int c;
b73bfc1c 6341
df7492f9 6342 ONE_MORE_BYTE (c);
119852e7 6343 if (dst == dst_end)
df7492f9 6344 {
119852e7
KH
6345 if (EQ (coding->src_object, coding->dst_object))
6346 dst_end = (unsigned char *) src;
6347 if (dst == dst_end)
df7492f9 6348 {
/* SRC is remembered as an offset and recomputed after
   alloc_destination and coding_set_source.  */
119852e7
KH
6349 EMACS_INT offset = src - coding->source;
6350
6351 dst = alloc_destination (coding, src_end - src + 1,
6352 dst);
6353 dst_end = coding->destination + coding->dst_bytes;
6354 coding_set_source (coding);
6355 src = coding->source + offset;
6356 src_end = coding->source + coding->src_bytes;
db274c7a
KH
6357 if (EQ (coding->src_object, coding->dst_object))
6358 dst_end = (unsigned char *) src;
df7492f9 6359 }
df7492f9
KH
6360 }
6361 *dst++ = c;
6362 produced_chars++;
6363 }
6364 no_more_source:
6365 ;
fa42c37f
KH
6366 }
6367 else
df7492f9
KH
6368 while (src < src_end)
6369 {
71c81426 6370 int multibytep = 1;
df7492f9 6371 int c = *src++;
b73bfc1c 6372
df7492f9
KH
6373 if (dst >= dst_end - 1)
6374 {
2c78b7e1 6375 if (EQ (coding->src_object, coding->dst_object))
8f924df7 6376 dst_end = (unsigned char *) src;
2c78b7e1
KH
6377 if (dst >= dst_end - 1)
6378 {
119852e7 6379 EMACS_INT offset = src - coding->source;
db274c7a 6380 EMACS_INT more_bytes;
119852e7 6381
db274c7a
KH
6382 if (EQ (coding->src_object, coding->dst_object))
6383 more_bytes = ((src_end - src) / 2) + 2;
6384 else
6385 more_bytes = src_end - src + 2;
6386 dst = alloc_destination (coding, more_bytes, dst);
2c78b7e1
KH
6387 dst_end = coding->destination + coding->dst_bytes;
6388 coding_set_source (coding);
119852e7 6389 src = coding->source + offset;
2c78b7e1 6390 src_end = coding->source + coding->src_bytes;
db274c7a
KH
6391 if (EQ (coding->src_object, coding->dst_object))
6392 dst_end = (unsigned char *) src;
2c78b7e1 6393 }
df7492f9
KH
6394 }
6395 EMIT_ONE_BYTE (c);
6396 }
d46c5b12 6397 }
df7492f9
KH
6398 else
6399 {
/* Source and destination have the same multibyteness: plain byte
   copy, after enlarging the destination if it is smaller than the
   source.  */
6400 if (!EQ (coding->src_object, coding->dst_object))
fa42c37f 6401 {
119852e7 6402 EMACS_INT require = coding->src_bytes - coding->dst_bytes;
4ed46869 6403
df7492f9 6404 if (require > 0)
fa42c37f 6405 {
df7492f9
KH
6406 EMACS_INT offset = src - coding->source;
6407
6408 dst = alloc_destination (coding, require, dst);
6409 coding_set_source (coding);
6410 src = coding->source + offset;
6411 src_end = coding->source + coding->src_bytes;
fa42c37f
KH
6412 }
6413 }
119852e7 6414 produced_chars = coding->consumed_char;
df7492f9 6415 while (src < src_end)
14daee73 6416 *dst++ = *src++;
fa42c37f
KH
6417 }
6418 }
6419
/* PRODUCED is the number of bytes written by this call.  */
df7492f9 6420 produced = dst - (coding->destination + coding->produced);
284201e4 6421 if (BUFFERP (coding->dst_object) && produced_chars > 0)
df7492f9
KH
6422 insert_from_gap (produced_chars, produced);
6423 coding->produced += produced;
6424 coding->produced_char += produced_chars;
69a80ea3 6425 return carryover;
fa42c37f
KH
6426}
6427
ff0dacd7
KH
6428/* Compose text in CODING->object according to the annotation data at
6429 CHARBUF. CHARBUF is an array:
6430 [ -LENGTH ANNOTATION_MASK NCHARS METHOD [ COMPONENTS... ] ]
df7492f9 6431 */
4ed46869 6432
df7492f9 6433static INLINE void
69a80ea3 6434produce_composition (coding, charbuf, pos)
4ed46869 6435 struct coding_system *coding;
df7492f9 6436 int *charbuf;
69a80ea3 6437 EMACS_INT pos;
4ed46869 6438{
df7492f9 6439 int len;
69a80ea3 6440 EMACS_INT to;
df7492f9 6441 enum composition_method method;
df7492f9 6442 Lisp_Object components;
fa42c37f 6443
df7492f9 6444 len = -charbuf[0];
69a80ea3 6445 to = pos + charbuf[2];
9ffd559c
KH
6446 if (to <= pos)
6447 return;
69a80ea3 6448 method = (enum composition_method) (charbuf[3]);
d46c5b12 6449
df7492f9
KH
6450 if (method == COMPOSITION_RELATIVE)
6451 components = Qnil;
9ffd559c
KH
6452 else if (method >= COMPOSITION_WITH_RULE
6453 && method <= COMPOSITION_WITH_RULE_ALTCHARS)
d46c5b12 6454 {
df7492f9
KH
6455 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6456 int i;
b73bfc1c 6457
69a80ea3
KH
6458 len -= 4;
6459 charbuf += 4;
df7492f9 6460 for (i = 0; i < len; i++)
9ffd559c
KH
6461 {
6462 args[i] = make_number (charbuf[i]);
f75c90a9 6463 if (charbuf[i] < 0)
9ffd559c
KH
6464 return;
6465 }
df7492f9
KH
6466 components = (method == COMPOSITION_WITH_ALTCHARS
6467 ? Fstring (len, args) : Fvector (len, args));
d46c5b12 6468 }
9ffd559c
KH
6469 else
6470 return;
69a80ea3 6471 compose_text (pos, to, components, Qnil, coding->dst_object);
d46c5b12
KH
6472}
6473
d46c5b12 6474
ff0dacd7
KH
6475/* Put `charset' property on text in CODING->object according to
6476 the annotation data at CHARBUF. CHARBUF is an array:
69a80ea3 6477 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
ff0dacd7 6478 */
d46c5b12 6479
ff0dacd7 6480static INLINE void
69a80ea3 6481produce_charset (coding, charbuf, pos)
d46c5b12 6482 struct coding_system *coding;
ff0dacd7 6483 int *charbuf;
69a80ea3 6484 EMACS_INT pos;
d46c5b12 6485{
69a80ea3
KH
6486 EMACS_INT from = pos - charbuf[2];
6487 struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
b73bfc1c 6488
69a80ea3 6489 Fput_text_property (make_number (from), make_number (pos),
ff0dacd7
KH
6490 Qcharset, CHARSET_NAME (charset),
6491 coding->dst_object);
d46c5b12
KH
6492}
6493
d46c5b12 6494
/* Number of ints we first try to allocate for the conversion work
   buffer CODING->charbuf.  */
#define CHARBUF_SIZE 0x4000

/* Allocate CODING->charbuf with alloca and record its size in
   CODING->charbuf_size.  The request starts at CHARBUF_SIZE ints and
   is halved after each failed attempt while it is still above 1024.

   Because the storage comes from alloca, this must be expanded
   directly in the function that uses the buffer.  If no buffer could
   be obtained, CODING_RESULT_INSUFFICIENT_MEM is recorded and the
   ENCLOSING FUNCTION returns CODING->result.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)				\
  do {									\
    int size = CHARBUF_SIZE;						\
									\
    (coding)->charbuf = NULL;						\
    while (size > 1024)							\
      {									\
	(coding)->charbuf = (int *) alloca (sizeof (int) * size);	\
	if ((coding)->charbuf)						\
	  break;							\
	size >>= 1;							\
      }									\
    if (! (coding)->charbuf)						\
      {									\
	record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
	return (coding)->result;					\
      }									\
    (coding)->charbuf_size = size;					\
  } while (0)
4ed46869 6516
d46c5b12
KH
6517
6518static void
69a80ea3 6519produce_annotation (coding, pos)
d46c5b12 6520 struct coding_system *coding;
69a80ea3 6521 EMACS_INT pos;
d46c5b12 6522{
df7492f9
KH
6523 int *charbuf = coding->charbuf;
6524 int *charbuf_end = charbuf + coding->charbuf_used;
d46c5b12 6525
ff0dacd7
KH
6526 if (NILP (coding->dst_object))
6527 return;
d46c5b12 6528
df7492f9 6529 while (charbuf < charbuf_end)
a84f1519 6530 {
df7492f9 6531 if (*charbuf >= 0)
69a80ea3 6532 pos += *charbuf++;
d46c5b12 6533 else
d46c5b12 6534 {
df7492f9 6535 int len = -*charbuf;
ff0dacd7 6536 switch (charbuf[1])
df7492f9
KH
6537 {
6538 case CODING_ANNOTATE_COMPOSITION_MASK:
69a80ea3 6539 produce_composition (coding, charbuf, pos);
df7492f9 6540 break;
ff0dacd7 6541 case CODING_ANNOTATE_CHARSET_MASK:
69a80ea3 6542 produce_charset (coding, charbuf, pos);
ff0dacd7 6543 break;
df7492f9
KH
6544 default:
6545 abort ();
6546 }
6547 charbuf += len;
d46c5b12 6548 }
a84f1519 6549 }
d46c5b12
KH
6550}
6551
df7492f9
KH
6552/* Decode the data at CODING->src_object into CODING->dst_object.
6553 CODING->src_object is a buffer, a string, or nil.
6554 CODING->dst_object is a buffer.
d46c5b12 6555
df7492f9
KH
6556 If CODING->src_object is a buffer, it must be the current buffer.
6557 In this case, if CODING->src_pos is positive, it is a position of
6558 the source text in the buffer, otherwise, the source text is in the
6559 gap area of the buffer, and CODING->src_pos specifies the offset of
6560 the text from GPT (which must be the same as PT). If this is the
6561 same buffer as CODING->dst_object, CODING->src_pos must be
6562 negative.
d46c5b12 6563
b6828792 6564 If CODING->src_object is a string, CODING->src_pos is an index to
df7492f9 6565 that string.
d46c5b12 6566
df7492f9
KH
6567 If CODING->src_object is nil, CODING->source must already point to
6568 the non-relocatable memory area. In this case, CODING->src_pos is
6569 an offset from CODING->source.
73be902c 6570
df7492f9
KH
6571 The decoded data is inserted at the current point of the buffer
6572 CODING->dst_object.

 The return value is CODING->result.  When CODING->dst_object is a
 buffer, that buffer is made current, its undo_list is set to Qt
 while decoding, and afterwards the saved undo_list is put back and
 the whole insertion is recorded once with record_insert.
6573*/
d46c5b12 6574
df7492f9
KH
6575static int
6576decode_coding (coding)
d46c5b12 6577 struct coding_system *coding;
d46c5b12 6578{
df7492f9 6579 Lisp_Object attrs;
24a73b0a 6580 Lisp_Object undo_list;
7d64c6ad 6581 Lisp_Object translation_table;
69a80ea3
KH
6582 int carryover;
6583 int i;
d46c5b12 6584
/* If the gap lies strictly inside the source region of a buffer, move
   it to the start of the region (presumably so the source bytes are
   contiguous -- TODO confirm).  */
df7492f9
KH
6585 if (BUFFERP (coding->src_object)
6586 && coding->src_pos > 0
6587 && coding->src_pos < GPT
6588 && coding->src_pos + coding->src_chars > GPT)
6589 move_gap_both (coding->src_pos, coding->src_pos_byte);
d46c5b12 6590
24a73b0a 6591 undo_list = Qt;
df7492f9 6592 if (BUFFERP (coding->dst_object))
1c3478b0 6593 {
df7492f9
KH
6594 if (current_buffer != XBUFFER (coding->dst_object))
6595 set_buffer_internal (XBUFFER (coding->dst_object));
6596 if (GPT != PT)
6597 move_gap_both (PT, PT_BYTE);
/* Remember the destination's undo list and replace it with Qt for
   the duration of the decoding; it is restored near the end.  */
24a73b0a
KH
6598 undo_list = current_buffer->undo_list;
6599 current_buffer->undo_list = Qt;
1c3478b0
KH
6600 }
6601
df7492f9
KH
6602 coding->consumed = coding->consumed_char = 0;
6603 coding->produced = coding->produced_char = 0;
6604 coding->chars_at_source = 0;
065e3595 6605 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 6606 coding->errors = 0;
1c3478b0 6607
/* NOTE(review): ALLOC_CONVERSION_WORK_AREA contains a
   `return coding->result' on allocation failure.  On that path the
   undo_list replaced above would not be restored -- confirm whether
   this can happen in practice.  */
df7492f9
KH
6608 ALLOC_CONVERSION_WORK_AREA (coding);
6609
6610 attrs = CODING_ID_ATTRS (coding->id);
2170c8f0 6611 translation_table = get_translation_table (attrs, 0, NULL);
df7492f9 6612
/* Main loop: run the decoder to fill coding->charbuf, then hand the
   result to produce_chars.  produce_chars returns the number of
   trailing charbuf elements it did not produce (CARRYOVER); they are
   moved to the front of charbuf for the next iteration.  */
69a80ea3 6613 carryover = 0;
df7492f9 6614 do
b73bfc1c 6615 {
69a80ea3
KH
6616 EMACS_INT pos = coding->dst_pos + coding->produced_char;
6617
df7492f9
KH
6618 coding_set_source (coding);
6619 coding->annotated = 0;
69a80ea3 6620 coding->charbuf_used = carryover;
df7492f9 6621 (*(coding->decoder)) (coding);
df7492f9 6622 coding_set_destination (coding);
69a80ea3 6623 carryover = produce_chars (coding, translation_table, 0);
df7492f9 6624 if (coding->annotated)
69a80ea3
KH
6625 produce_annotation (coding, pos);
6626 for (i = 0; i < carryover; i++)
6627 coding->charbuf[i]
6628 = coding->charbuf[coding->charbuf_used - carryover + i];
d46c5b12 6629 }
df7492f9 6630 while (coding->consumed < coding->src_bytes
54b367bb
KH
6631 && (coding->result == CODING_RESULT_SUCCESS
6632 || coding->result == CODING_RESULT_INVALID_SRC));
d46c5b12 6633
/* Elements still carried over after the loop are produced by one more
   call to produce_chars, this time with its last argument set to 1.  */
69a80ea3
KH
6634 if (carryover > 0)
6635 {
6636 coding_set_destination (coding);
6637 coding->charbuf_used = carryover;
6638 produce_chars (coding, translation_table, 1);
6639 }
6640
/* Source bytes the decoder did not consume are either flushed as raw
   bytes (last block) or saved in coding->carryover for a later call.  */
df7492f9
KH
6641 coding->carryover_bytes = 0;
6642 if (coding->consumed < coding->src_bytes)
d46c5b12 6643 {
df7492f9 6644 int nbytes = coding->src_bytes - coding->consumed;
8f924df7 6645 const unsigned char *src;
df7492f9
KH
6646
6647 coding_set_source (coding);
6648 coding_set_destination (coding);
6649 src = coding->source + coding->consumed;
6650
6651 if (coding->mode & CODING_MODE_LAST_BLOCK)
1c3478b0 6652 {
df7492f9
KH
6653 /* Flush out unprocessed data as binary chars. We are sure
6654 that the number of data is less than the size of
6655 coding->charbuf. */
065e3595 6656 coding->charbuf_used = 0;
df7492f9 6657 while (nbytes-- > 0)
1c3478b0 6658 {
df7492f9 6659 int c = *src++;
98725083 6660
/* Bytes with the high bit set are converted with BYTE8_TO_CHAR;
   other bytes are stored unchanged.  */
1c91457d
KH
6661 if (c & 0x80)
6662 c = BYTE8_TO_CHAR (c);
6663 coding->charbuf[coding->charbuf_used++] = c;
1c3478b0 6664 }
f6cbaf43 6665 produce_chars (coding, Qnil, 1);
d46c5b12 6666 }
d46c5b12 6667 else
df7492f9
KH
6668 {
6669 /* Record unprocessed bytes in coding->carryover. We are
6670 sure that the number of data is less than the size of
6671 coding->carryover. */
6672 unsigned char *p = coding->carryover;
6673
6674 coding->carryover_bytes = nbytes;
6675 while (nbytes-- > 0)
6676 *p++ = *src++;
1c3478b0 6677 }
df7492f9 6678 coding->consumed = coding->src_bytes;
b73bfc1c 6679 }
69f76525 6680
/* End-of-line decoding is a separate pass, skipped when the coding
   system's eol type is `unix'.  */
4347441b
KH
6681 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
6682 decode_eol (coding);
/* Restore the undo list saved at the top and record the insertion of
   all produced characters in one step.  */
24a73b0a
KH
6683 if (BUFFERP (coding->dst_object))
6684 {
6685 current_buffer->undo_list = undo_list;
6686 record_insert (coding->dst_pos, coding->produced_char);
6687 }
73be902c 6688 return coding->result;
4ed46869
KH
6689}
6690
aaaf0b1e 6691
e1c23804 6692/* Extract an annotation datum from a composition starting at POS and
ff0dacd7
KH
6693 ending before LIMIT of CODING->src_object (buffer or string), store
6694 the data in BUF, set *STOP to a starting position of the next
6695 composition (if any) or to LIMIT, and return the address of the
6696 next element of BUF.
6697
6698 If such an annotation is not found, set *STOP to a starting
6699 position of a composition after POS (if any) or to LIMIT, and
6700 return BUF. */
6701
/* Details visible in the code below: a composition that extends
   beyond LIMIT is treated as not found (*STOP = LIMIT), and a
   composition that starts before POS stores nothing in BUF.  Component
   values are stored without checking the space left in BUF; the caller
   is presumably expected to reserve enough room -- TODO confirm.  */
6702static INLINE int *
6703handle_composition_annotation (pos, limit, coding, buf, stop)
6704 EMACS_INT pos, limit;
aaaf0b1e 6705 struct coding_system *coding;
ff0dacd7
KH
6706 int *buf;
6707 EMACS_INT *stop;
aaaf0b1e 6708{
ff0dacd7
KH
6709 EMACS_INT start, end;
6710 Lisp_Object prop;
aaaf0b1e 6711
ff0dacd7
KH
6712 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
6713 || end > limit)
6714 *stop = limit;
6715 else if (start > pos)
6716 *stop = start;
6717 else
aaaf0b1e 6718 {
ff0dacd7 6719 if (start == pos)
aaaf0b1e 6720 {
ff0dacd7
KH
6721 /* We found a composition. Store the corresponding
6722 annotation data in BUF. */
/* HEAD remembers the first element of the record so that the number
   of components can be subtracted from it below; presumably
   ADD_COMPOSITION_DATA stores the negated record length there and
   advances BUF -- TODO confirm against the macro.  */
6723 int *head = buf;
6724 enum composition_method method = COMPOSITION_METHOD (prop);
6725 int nchars = COMPOSITION_LENGTH (prop);
6726
69a80ea3 6727 ADD_COMPOSITION_DATA (buf, nchars, method);
ff0dacd7 6728 if (method != COMPOSITION_RELATIVE)
aaaf0b1e 6729 {
ff0dacd7
KH
6730 Lisp_Object components;
6731 int len, i, i_byte;
6732
/* The components may be a vector, a string, a single integer, or a
   list; each form is flattened into consecutive ints in BUF and LEN
   is set to the number of ints stored.  Any other type aborts.  */
6733 components = COMPOSITION_COMPONENTS (prop);
6734 if (VECTORP (components))
aaaf0b1e 6735 {
ff0dacd7
KH
6736 len = XVECTOR (components)->size;
6737 for (i = 0; i < len; i++)
6738 *buf++ = XINT (AREF (components, i));
aaaf0b1e 6739 }
ff0dacd7 6740 else if (STRINGP (components))
aaaf0b1e 6741 {
8f924df7 6742 len = SCHARS (components);
ff0dacd7
KH
6743 i = i_byte = 0;
6744 while (i < len)
6745 {
6746 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
6747 buf++;
6748 }
6749 }
6750 else if (INTEGERP (components))
6751 {
6752 len = 1;
6753 *buf++ = XINT (components);
6754 }
6755 else if (CONSP (components))
6756 {
6757 for (len = 0; CONSP (components);
6758 len++, components = XCDR (components))
6759 *buf++ = XINT (XCAR (components));
aaaf0b1e 6760 }
aaaf0b1e 6761 else
ff0dacd7
KH
6762 abort ();
6763 *head -= len;
aaaf0b1e 6764 }
aaaf0b1e 6765 }
ff0dacd7
KH
6766
/* Search again from END to find where the next composition that fits
   within LIMIT starts; that position (or LIMIT) becomes *STOP.  */
6767 if (find_composition (end, limit, &start, &end, &prop,
6768 coding->src_object)
6769 && end <= limit)
6770 *stop = start;
6771 else
6772 *stop = limit;
aaaf0b1e 6773 }
ff0dacd7
KH
6774 return buf;
6775}
6776
6777
e1c23804 6778/* Extract an annotation datum from a text property `charset' at POS of
ff0dacd7
KH
6779 CODING->src_object (buffer of string), store the data in BUF, set
6780 *STOP to the position where the value of `charset' property changes
6781 (limiting by LIMIT), and return the address of the next element of
6782 BUF.
6783
6784 If the property value is nil, set *STOP to the position where the
6785 property value is non-nil (limiting by LIMIT), and return BUF. */
6786
6787static INLINE int *
6788handle_charset_annotation (pos, limit, coding, buf, stop)
6789 EMACS_INT pos, limit;
6790 struct coding_system *coding;
6791 int *buf;
6792 EMACS_INT *stop;
6793{
6794 Lisp_Object val, next;
6795 int id;
6796
6797 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
6798 if (! NILP (val) && CHARSETP (val))
6799 id = XINT (CHARSET_SYMBOL_ID (val));
6800 else
6801 id = -1;
69a80ea3 6802 ADD_CHARSET_DATA (buf, 0, id);
ff0dacd7
KH
6803 next = Fnext_single_property_change (make_number (pos), Qcharset,
6804 coding->src_object,
6805 make_number (limit));
6806 *stop = XINT (next);
6807 return buf;
6808}
6809
6810
/* Read characters from CODING->source, starting CODING->consumed bytes
   in, and store them as ints in CODING->charbuf for the encoder.

   While reading, annotation records are inserted at positions where
   the handlers below report a stop, '\n' is rewritten according to the
   coding system's eol type, and each character is looked up in
   TRANSLATION_TABLE (skipped when it is nil).  MAX_LOOKUP is the
   number of ints allocated for the buffer used in that lookup.

   On return CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used describe how far reading got, and
   CODING->chars_at_source is reset to 0.  */
df7492f9 6811static void
09ee6fdd 6812consume_chars (coding, translation_table, max_lookup)
df7492f9 6813 struct coding_system *coding;
433f7f87 6814 Lisp_Object translation_table;
09ee6fdd 6815 int max_lookup;
df7492f9
KH
6816{
6817 int *buf = coding->charbuf;
ff0dacd7 6818 int *buf_end = coding->charbuf + coding->charbuf_size;
7c78e542 6819 const unsigned char *src = coding->source + coding->consumed;
4776e638 6820 const unsigned char *src_end = coding->source + coding->src_bytes;
ff0dacd7
KH
6821 EMACS_INT pos = coding->src_pos + coding->consumed_char;
6822 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
df7492f9
KH
6823 int multibytep = coding->src_multibyte;
6824 Lisp_Object eol_type;
6825 int c;
ff0dacd7 6826 EMACS_INT stop, stop_composition, stop_charset;
09ee6fdd 6827 int *lookup_buf = NULL;
433f7f87
KH
6828
6829 if (! NILP (translation_table))
09ee6fdd 6830 lookup_buf = alloca (sizeof (int) * max_lookup);
88993dfd 6831
/* An eol type that is still a vector is handled like `unix' here.  */
df7492f9
KH
6832 eol_type = CODING_ID_EOL_TYPE (coding->id);
6833 if (VECTORP (eol_type))
6834 eol_type = Qunix;
88993dfd 6835
df7492f9
KH
6836 /* Note: composition handling is not yet implemented. */
6837 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
ec6d2bb8 6838
/* STOP is the next position at which an annotation handler must run.
   Because the composition flag was just cleared above, the
   composition test below always takes its `else' branch.  */
0b5670c9
KH
6839 if (NILP (coding->src_object))
6840 stop = stop_composition = stop_charset = end_pos;
ff0dacd7 6841 else
0b5670c9
KH
6842 {
6843 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6844 stop = stop_composition = pos;
6845 else
6846 stop = stop_composition = end_pos;
6847 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6848 stop = stop_charset = pos;
6849 else
6850 stop_charset = end_pos;
6851 }
ec6d2bb8 6852
24a73b0a 6853 /* Compensate for CRLF and conversion. */
ff0dacd7 6854 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
df7492f9 6855 while (buf < buf_end)
aaaf0b1e 6856 {
433f7f87
KH
6857 Lisp_Object trans;
6858
df7492f9 6859 if (pos == stop)
ec6d2bb8 6860 {
df7492f9
KH
6861 if (pos == end_pos)
6862 break;
ff0dacd7
KH
6863 if (pos == stop_composition)
6864 buf = handle_composition_annotation (pos, end_pos, coding,
6865 buf, &stop_composition);
6866 if (pos == stop_charset)
6867 buf = handle_charset_annotation (pos, end_pos, coding,
6868 buf, &stop_charset);
6869 stop = (stop_composition < stop_charset
6870 ? stop_composition : stop_charset);
df7492f9
KH
6871 }
6872
/* Fetch the next character.  For a unibyte source, POS advances by
   the number of source bytes consumed; for a multibyte source it
   advances by one per character.  */
6873 if (! multibytep)
4776e638 6874 {
d3e4cb56 6875 EMACS_INT bytes;
aaaf0b1e 6876
ea29edf2
KH
6877 if (coding->encoder == encode_coding_raw_text)
6878 c = *src++, pos++;
6879 else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
db274c7a 6880 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
4776e638 6881 else
f03caae0 6882 c = BYTE8_TO_CHAR (*src), src++, pos++;
4776e638 6883 }
df7492f9 6884 else
db274c7a 6885 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
/* End-of-line handling: under selective display '\r' is read as
   '\n'; then, unless the eol type is `unix', '\n' becomes "\r\n" for
   `dos' and a lone '\r' for any other eol type.  */
df7492f9
KH
6886 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6887 c = '\n';
6888 if (! EQ (eol_type, Qunix))
aaaf0b1e 6889 {
df7492f9 6890 if (c == '\n')
aaaf0b1e 6891 {
df7492f9
KH
6892 if (EQ (eol_type, Qdos))
6893 *buf++ = '\r';
6894 else
6895 c = '\r';
aaaf0b1e
KH
6896 }
6897 }
433f7f87 6898
e6a54062 6899 trans = Qnil;
09ee6fdd 6900 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
e6a54062 6901 if (NILP (trans))
433f7f87
KH
6902 *buf++ = c;
6903 else
6904 {
6905 int from_nchars = 1, to_nchars = 1;
6906 int *lookup_buf_end;
6907 const unsigned char *p = src;
6908 int i;
6909
/* Collect C plus up to MAX_LOOKUP - 1 following characters (read
   through P, so SRC is not advanced yet) and pass them to
   get_translation.  */
6910 lookup_buf[0] = c;
6911 for (i = 1; i < max_lookup && p < src_end; i++)
6912 lookup_buf[i] = STRING_CHAR_ADVANCE (p);
6913 lookup_buf_end = lookup_buf + i;
6914 trans = get_translation (trans, lookup_buf, lookup_buf_end, 1,
6915 &from_nchars, &to_nchars);
/* Stop when get_translation returns Qt or when the translated
   characters would not fit in the remaining charbuf space.
   NOTE(review): on this early exit C has already been consumed from
   SRC and counted in POS but not stored in BUF -- confirm that the
   callers expect this.  */
6916 if (EQ (trans, Qt)
6917 || buf + to_nchars > buf_end)
6918 break;
6919 *buf++ = *lookup_buf;
6920 for (i = 1; i < to_nchars; i++)
6921 *buf++ = XINT (AREF (trans, i));
/* Skip the additional source characters covered by the translation.  */
6922 for (i = 1; i < from_nchars; i++, pos++)
6923 src += MULTIBYTE_LENGTH_NO_CHECK (src);
6924 }
aaaf0b1e 6925 }
ec6d2bb8 6926
df7492f9
KH
6927 coding->consumed = src - coding->source;
6928 coding->consumed_char = pos - coding->src_pos;
6929 coding->charbuf_used = buf - coding->charbuf;
6930 coding->chars_at_source = 0;
aaaf0b1e
KH
6931}
6932
4ed46869 6933
df7492f9
KH
6934/* Encode the text at CODING->src_object into CODING->dst_object.
6935 CODING->src_object is a buffer or a string.
6936 CODING->dst_object is a buffer or nil.
6937
6938 If CODING->src_object is a buffer, it must be the current buffer.
6939 In this case, if CODING->src_pos is positive, it is a position of
6940 the source text in the buffer, otherwise. the source text is in the
6941 gap area of the buffer, and coding->src_pos specifies the offset of
6942 the text from GPT (which must be the same as PT). If this is the
6943 same buffer as CODING->dst_object, CODING->src_pos must be
6944 negative and CODING should not have `pre-write-conversion'.
6945
6946 If CODING->src_object is a string, CODING should not have
6947 `pre-write-conversion'.
6948
6949 If CODING->dst_object is a buffer, the encoded data is inserted at
6950 the current point of that buffer.
6951
6952 If CODING->dst_object is nil, the encoded data is placed at the
6953 memory area specified by CODING->destination. */
6954
6955static int
6956encode_coding (coding)
4ed46869 6957 struct coding_system *coding;
4ed46869 6958{
df7492f9 6959 Lisp_Object attrs;
7d64c6ad 6960 Lisp_Object translation_table;
09ee6fdd 6961 int max_lookup;
9861e777 6962
df7492f9 6963 attrs = CODING_ID_ATTRS (coding->id);
ea29edf2
KH
6964 if (coding->encoder == encode_coding_raw_text)
6965 translation_table = Qnil, max_lookup = 0;
6966 else
6967 translation_table = get_translation_table (attrs, 1, &max_lookup);
4ed46869 6968
df7492f9 6969 if (BUFFERP (coding->dst_object))
8844fa83 6970 {
df7492f9
KH
6971 set_buffer_internal (XBUFFER (coding->dst_object));
6972 coding->dst_multibyte
6973 = ! NILP (current_buffer->enable_multibyte_characters);
8844fa83 6974 }
4ed46869 6975
b73bfc1c 6976 coding->consumed = coding->consumed_char = 0;
df7492f9 6977 coding->produced = coding->produced_char = 0;
065e3595 6978 record_conversion_result (coding, CODING_RESULT_SUCCESS);
b73bfc1c 6979 coding->errors = 0;
b73bfc1c 6980
df7492f9 6981 ALLOC_CONVERSION_WORK_AREA (coding);
4ed46869 6982
df7492f9
KH
6983 do {
6984 coding_set_source (coding);
09ee6fdd 6985 consume_chars (coding, translation_table, max_lookup);
df7492f9
KH
6986 coding_set_destination (coding);
6987 (*(coding->encoder)) (coding);
6988 } while (coding->consumed_char < coding->src_chars);
6989
284201e4 6990 if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
df7492f9
KH
6991 insert_from_gap (coding->produced_char, coding->produced);
6992
6993 return (coding->result);
ec6d2bb8
KH
6994}
6995
fb88bf2d 6996
24a73b0a
KH
6997/* Name (or base name) of work buffer for code conversion. */
6998static Lisp_Object Vcode_conversion_workbuf_name;
d46c5b12 6999
24a73b0a
KH
7000/* A working buffer used by the top level conversion. Once it is
7001 created, it is never destroyed. It has the name
7002 Vcode_conversion_workbuf_name. The other working buffers are
7003 destroyed after the use is finished, and their names are modified
7004 versions of Vcode_conversion_workbuf_name.
 (make_conversion_work_buffer creates this buffer again if it finds
 that it is no longer live.) */
7005static Lisp_Object Vcode_conversion_reused_workbuf;
b73bfc1c 7006
24a73b0a
KH
7007/* Nonzero iff Vcode_conversion_reused_workbuf is already in use.
 make_conversion_work_buffer increments this on every call and
 code_conversion_restore resets it to 0 when the reused buffer is
 released, so the value can be greater than 1. */
7008static int reused_workbuf_in_use;
4ed46869 7009
24a73b0a
KH
7010
7011/* Return a working buffer of code convesion. MULTIBYTE specifies the
7012 multibyteness of returning buffer. */
b73bfc1c 7013
f6cbaf43 7014static Lisp_Object
24a73b0a 7015make_conversion_work_buffer (multibyte)
f6cbaf43 7016 int multibyte;
df7492f9 7017{
24a73b0a
KH
7018 Lisp_Object name, workbuf;
7019 struct buffer *current;
4ed46869 7020
24a73b0a 7021 if (reused_workbuf_in_use++)
065e3595
KH
7022 {
7023 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7024 workbuf = Fget_buffer_create (name);
7025 }
df7492f9 7026 else
065e3595 7027 {
159bd5a2 7028 if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
a993c7a1
KH
7029 Vcode_conversion_reused_workbuf
7030 = Fget_buffer_create (Vcode_conversion_workbuf_name);
7031 workbuf = Vcode_conversion_reused_workbuf;
065e3595 7032 }
24a73b0a
KH
7033 current = current_buffer;
7034 set_buffer_internal (XBUFFER (workbuf));
3ed051d4 7035 Ferase_buffer ();
df7492f9 7036 current_buffer->undo_list = Qt;
24a73b0a 7037 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
df7492f9 7038 set_buffer_internal (current);
24a73b0a 7039 return workbuf;
df7492f9 7040}
d46c5b12 7041
24a73b0a 7042
4776e638 7043static Lisp_Object
24a73b0a
KH
7044code_conversion_restore (arg)
7045 Lisp_Object arg;
4776e638 7046{
24a73b0a 7047 Lisp_Object current, workbuf;
948bdcf3 7048 struct gcpro gcpro1;
24a73b0a 7049
948bdcf3 7050 GCPRO1 (arg);
24a73b0a
KH
7051 current = XCAR (arg);
7052 workbuf = XCDR (arg);
7053 if (! NILP (workbuf))
7054 {
7055 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7056 reused_workbuf_in_use = 0;
7057 else if (! NILP (Fbuffer_live_p (workbuf)))
7058 Fkill_buffer (workbuf);
7059 }
7060 set_buffer_internal (XBUFFER (current));
948bdcf3 7061 UNGCPRO;
4776e638
KH
7062 return Qnil;
7063}
b73bfc1c 7064
24a73b0a
KH
7065Lisp_Object
7066code_conversion_save (with_work_buf, multibyte)
4776e638 7067 int with_work_buf, multibyte;
df7492f9 7068{
24a73b0a 7069 Lisp_Object workbuf = Qnil;
b73bfc1c 7070
4776e638 7071 if (with_work_buf)
24a73b0a
KH
7072 workbuf = make_conversion_work_buffer (multibyte);
7073 record_unwind_protect (code_conversion_restore,
7074 Fcons (Fcurrent_buffer (), workbuf));
4776e638 7075 return workbuf;
df7492f9 7076}
d46c5b12 7077
df7492f9
KH
7078int
7079decode_coding_gap (coding, chars, bytes)
7080 struct coding_system *coding;
7081 EMACS_INT chars, bytes;
7082{
7083 int count = specpdl_ptr - specpdl;
5e5c78be 7084 Lisp_Object attrs;
fb88bf2d 7085
24a73b0a 7086 code_conversion_save (0, 0);
ec6d2bb8 7087
24a73b0a 7088 coding->src_object = Fcurrent_buffer ();
df7492f9
KH
7089 coding->src_chars = chars;
7090 coding->src_bytes = bytes;
7091 coding->src_pos = -chars;
7092 coding->src_pos_byte = -bytes;
7093 coding->src_multibyte = chars < bytes;
24a73b0a 7094 coding->dst_object = coding->src_object;
df7492f9
KH
7095 coding->dst_pos = PT;
7096 coding->dst_pos_byte = PT_BYTE;
71c81426 7097 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
4ed46869 7098
df7492f9
KH
7099 if (CODING_REQUIRE_DETECTION (coding))
7100 detect_coding (coding);
8f924df7 7101
9286b333 7102 coding->mode |= CODING_MODE_LAST_BLOCK;
287c57d7 7103 current_buffer->text->inhibit_shrinking = 1;
df7492f9 7104 decode_coding (coding);
287c57d7 7105 current_buffer->text->inhibit_shrinking = 0;
d46c5b12 7106
5e5c78be
KH
7107 attrs = CODING_ID_ATTRS (coding->id);
7108 if (! NILP (CODING_ATTR_POST_READ (attrs)))
b73bfc1c 7109 {
5e5c78be
KH
7110 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7111 Lisp_Object val;
7112
7113 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
5e5c78be
KH
7114 val = call1 (CODING_ATTR_POST_READ (attrs),
7115 make_number (coding->produced_char));
5e5c78be
KH
7116 CHECK_NATNUM (val);
7117 coding->produced_char += Z - prev_Z;
7118 coding->produced += Z_BYTE - prev_Z_BYTE;
b73bfc1c 7119 }
4ed46869 7120
df7492f9 7121 unbind_to (count, Qnil);
b73bfc1c
KH
7122 return coding->result;
7123}
52d41803 7124
4ed46869 7125int
df7492f9 7126encode_coding_gap (coding, chars, bytes)
4ed46869 7127 struct coding_system *coding;
df7492f9 7128 EMACS_INT chars, bytes;
4ed46869 7129{
df7492f9 7130 int count = specpdl_ptr - specpdl;
4ed46869 7131
24a73b0a 7132 code_conversion_save (0, 0);
4ed46869 7133
24a73b0a 7134 coding->src_object = Fcurrent_buffer ();
df7492f9
KH
7135 coding->src_chars = chars;
7136 coding->src_bytes = bytes;
7137 coding->src_pos = -chars;
7138 coding->src_pos_byte = -bytes;
7139 coding->src_multibyte = chars < bytes;
7140 coding->dst_object = coding->src_object;
7141 coding->dst_pos = PT;
7142 coding->dst_pos_byte = PT_BYTE;
4ed46869 7143
df7492f9 7144 encode_coding (coding);
b73bfc1c 7145
df7492f9
KH
7146 unbind_to (count, Qnil);
7147 return coding->result;
7148}
4ed46869 7149
d46c5b12 7150
df7492f9
KH
7151/* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7152 SRC_OBJECT into DST_OBJECT by coding context CODING.
b73bfc1c 7153
df7492f9 7154 SRC_OBJECT is a buffer, a string, or Qnil.
b73bfc1c 7155
df7492f9
KH
7156 If it is a buffer, the text is at point of the buffer. FROM and TO
7157 are positions in the buffer.
b73bfc1c 7158
df7492f9
KH
7159 If it is a string, the text is at the beginning of the string.
7160 FROM and TO are indices to the string.
4ed46869 7161
df7492f9
KH
7162 If it is nil, the text is at coding->source. FROM and TO are
7163 indices to coding->source.
bb10be8b 7164
df7492f9 7165 DST_OBJECT is a buffer, Qt, or Qnil.
4ed46869 7166
df7492f9
KH
7167 If it is a buffer, the decoded text is inserted at point of the
7168 buffer. If the buffer is the same as SRC_OBJECT, the source text
7169 is deleted.
4ed46869 7170
df7492f9
KH
7171 If it is Qt, a string is made from the decoded text, and
7172 set in CODING->dst_object.
d46c5b12 7173
df7492f9 7174 If it is Qnil, the decoded text is stored at CODING->destination.
2cb26057 7175 The caller must allocate CODING->dst_bytes bytes at
df7492f9
KH
7176 CODING->destination by xmalloc. If the decoded text is longer than
7177 CODING->dst_bytes, CODING->destination is relocated by xrealloc.

 When SRC_OBJECT and DST_OBJECT are the same buffer, point and the
 markers that sat exactly at the edges of the replaced region are
 repositioned afterwards.  Vdeactivate_mark is saved on entry and
 restored before the normal return.
7178 */
d46c5b12 7179
df7492f9
KH
7180void
7181decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7182 dst_object)
d46c5b12 7183 struct coding_system *coding;
df7492f9
KH
7184 Lisp_Object src_object;
7185 EMACS_INT from, from_byte, to, to_byte;
7186 Lisp_Object dst_object;
d46c5b12 7187{
df7492f9
KH
7188 int count = specpdl_ptr - specpdl;
7189 unsigned char *destination;
7190 EMACS_INT dst_bytes;
7191 EMACS_INT chars = to - from;
7192 EMACS_INT bytes = to_byte - from_byte;
7193 Lisp_Object attrs;
/* saved_pt stays -1 unless source and destination are the same
   buffer; a non-negative value triggers the point recovery at the end.
   NOTE(review): saved_pt and saved_pt_byte are plain int although
   they hold PT and PT_BYTE, while the other positions here are
   EMACS_INT -- confirm this cannot truncate.  */
4776e638 7194 int saved_pt = -1, saved_pt_byte;
64cedb0c 7195 int need_marker_adjustment = 0;
b3bfad50 7196 Lisp_Object old_deactivate_mark;
d46c5b12 7197
b3bfad50 7198 old_deactivate_mark = Vdeactivate_mark;
93dec019 7199
/* DESTINATION and DST_BYTES are set, and later used, only when
   DST_OBJECT is nil.  */
df7492f9 7200 if (NILP (dst_object))
d46c5b12 7201 {
df7492f9
KH
7202 destination = coding->destination;
7203 dst_bytes = coding->dst_bytes;
d46c5b12 7204 }
93dec019 7205
df7492f9
KH
7206 coding->src_object = src_object;
7207 coding->src_chars = chars;
7208 coding->src_bytes = bytes;
7209 coding->src_multibyte = chars < bytes;
70ad9fc4 7210
df7492f9 7211 if (STRINGP (src_object))
d46c5b12 7212 {
df7492f9
KH
7213 coding->src_pos = from;
7214 coding->src_pos_byte = from_byte;
d46c5b12 7215 }
df7492f9 7216 else if (BUFFERP (src_object))
88993dfd 7217 {
df7492f9
KH
7218 set_buffer_internal (XBUFFER (src_object));
7219 if (from != GPT)
7220 move_gap_both (from, from_byte);
7221 if (EQ (src_object, dst_object))
fb88bf2d 7222 {
64cedb0c
KH
7223 struct Lisp_Marker *tail;
7224
/* Flag the markers that sit exactly at an edge of the region (FROM
   for markers whose insertion_type is set, TO for the others) so that
   they can be repositioned once the text has been replaced.  */
7225 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7226 {
7227 tail->need_adjustment
7228 = tail->charpos == (tail->insertion_type ? from : to);
7229 need_marker_adjustment |= tail->need_adjustment;
7230 }
/* Delete the source text with shrinking inhibited and describe the
   source with negative positions, i.e. as text located in the gap
   (see the conventions documented at decode_coding).  */
4776e638 7231 saved_pt = PT, saved_pt_byte = PT_BYTE;
df7492f9 7232 TEMP_SET_PT_BOTH (from, from_byte);
f4a3cc44 7233 current_buffer->text->inhibit_shrinking = 1;
df7492f9
KH
7234 del_range_both (from, from_byte, to, to_byte, 1);
7235 coding->src_pos = -chars;
7236 coding->src_pos_byte = -bytes;
fb88bf2d 7237 }
df7492f9 7238 else
fb88bf2d 7239 {
df7492f9
KH
7240 coding->src_pos = from;
7241 coding->src_pos_byte = from_byte;
fb88bf2d 7242 }
88993dfd
KH
7243 }
7244
df7492f9
KH
7245 if (CODING_REQUIRE_DETECTION (coding))
7246 detect_coding (coding);
7247 attrs = CODING_ID_ATTRS (coding->id);
d46c5b12 7248
/* Decode into a work buffer when a string result is wanted, or when
   the result goes to memory but a post-read function has to run
   first.  */
2cb26057
KH
7249 if (EQ (dst_object, Qt)
7250 || (! NILP (CODING_ATTR_POST_READ (attrs))
7251 && NILP (dst_object)))
b73bfc1c 7252 {
a1567c45
SM
7253 coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7254 coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
df7492f9
KH
7255 coding->dst_pos = BEG;
7256 coding->dst_pos_byte = BEG_BYTE;
b73bfc1c 7257 }
df7492f9 7258 else if (BUFFERP (dst_object))
d46c5b12 7259 {
24a73b0a 7260 code_conversion_save (0, 0);
df7492f9
KH
7261 coding->dst_object = dst_object;
7262 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7263 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7264 coding->dst_multibyte
7265 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
d46c5b12
KH
7266 }
7267 else
7268 {
24a73b0a 7269 code_conversion_save (0, 0);
df7492f9 7270 coding->dst_object = Qnil;
0154725e
SM
7271 /* Most callers presume this will return a multibyte result, and they
7272 won't use `binary' or `raw-text' anyway, so let's not worry about
7273 CODING_FOR_UNIBYTE. */
bb555731 7274 coding->dst_multibyte = 1;
d46c5b12
KH
7275 }
7276
df7492f9 7277 decode_coding (coding);
fa46990e 7278
df7492f9
KH
7279 if (BUFFERP (coding->dst_object))
7280 set_buffer_internal (XBUFFER (coding->dst_object));
d46c5b12 7281
/* Run the post-read function, if any, with point at the start of the
   decoded text, then adjust the produced counts by the change it
   caused in Z and Z_BYTE.  */
df7492f9 7282 if (! NILP (CODING_ATTR_POST_READ (attrs)))
d46c5b12 7283 {
b3bfad50 7284 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
df7492f9 7285 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
2b4f9037 7286 Lisp_Object val;
d46c5b12 7287
c0cc7f7f 7288 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
b3bfad50
KH
7289 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7290 old_deactivate_mark);
d4850d67
KH
7291 val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7292 make_number (coding->produced_char));
df7492f9
KH
7293 UNGCPRO;
7294 CHECK_NATNUM (val);
7295 coding->produced_char += Z - prev_Z;
7296 coding->produced += Z_BYTE - prev_Z_BYTE;
d46c5b12 7297 }
de79a6a5 7298
df7492f9 7299 if (EQ (dst_object, Qt))
ec6d2bb8 7300 {
df7492f9
KH
7301 coding->dst_object = Fbuffer_string ();
7302 }
7303 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7304 {
/* The text was decoded into a work buffer but the caller wants it in
   memory.  NOTE(review): the copy into DESTINATION below happens only
   inside the `dst_bytes < coding->produced' branch -- confirm that the
   other case needs no copy.  */
7305 set_buffer_internal (XBUFFER (coding->dst_object));
7306 if (dst_bytes < coding->produced)
7307 {
b3bfad50 7308 destination = xrealloc (destination, coding->produced);
/* NOTE(review): this early return does not restore Vdeactivate_mark,
   unlike the normal exit at the end of the function.  */
df7492f9
KH
7309 if (! destination)
7310 {
065e3595
KH
7311 record_conversion_result (coding,
7312 CODING_RESULT_INSUFFICIENT_DST);
df7492f9
KH
7313 unbind_to (count, Qnil);
7314 return;
7315 }
7316 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7317 move_gap_both (BEGV, BEGV_BYTE);
7318 bcopy (BEGV_ADDR, destination, coding->produced);
7319 coding->destination = destination;
d46c5b12 7320 }
ec6d2bb8 7321 }
b73bfc1c 7322
4776e638
KH
7323 if (saved_pt >= 0)
7324 {
7325 /* This is the case of:
7326 (BUFFERP (src_object) && EQ (src_object, dst_object))
7327 As we have moved PT while replacing the original buffer
7328 contents, we must recover it now. */
7329 set_buffer_internal (XBUFFER (src_object));
f4a3cc44 7330 current_buffer->text->inhibit_shrinking = 0;
/* A point before the region is kept, a point inside the region moves
   to FROM, and a point at or after the end of the region is shifted
   by the size change (counted in bytes for a unibyte buffer).  */
4776e638
KH
7331 if (saved_pt < from)
7332 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7333 else if (saved_pt < from + chars)
7334 TEMP_SET_PT_BOTH (from, from_byte);
7335 else if (! NILP (current_buffer->enable_multibyte_characters))
7336 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7337 saved_pt_byte + (coding->produced - bytes));
7338 else
7339 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7340 saved_pt_byte + (coding->produced - bytes));
64cedb0c
KH
7341
/* Reposition the markers flagged earlier: those with insertion_type
   set go to FROM, the others go to the end of the decoded text.  */
7342 if (need_marker_adjustment)
7343 {
7344 struct Lisp_Marker *tail;
7345
7346 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7347 if (tail->need_adjustment)
7348 {
7349 tail->need_adjustment = 0;
7350 if (tail->insertion_type)
7351 {
7352 tail->bytepos = from_byte;
7353 tail->charpos = from;
7354 }
7355 else
7356 {
7357 tail->bytepos = from_byte + coding->produced;
7358 tail->charpos
7359 = (NILP (current_buffer->enable_multibyte_characters)
7360 ? tail->bytepos : from + coding->produced_char);
7361 }
7362 }
7363 }
d46c5b12 7364 }
4776e638 7365
b3bfad50 7366 Vdeactivate_mark = old_deactivate_mark;
065e3595 7367 unbind_to (count, coding->dst_object);
d46c5b12
KH
7368}
7369
d46c5b12 7370
df7492f9
KH
7371void
7372encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7373 dst_object)
d46c5b12 7374 struct coding_system *coding;
df7492f9
KH
7375 Lisp_Object src_object;
7376 EMACS_INT from, from_byte, to, to_byte;
7377 Lisp_Object dst_object;
d46c5b12 7378{
b73bfc1c 7379 int count = specpdl_ptr - specpdl;
df7492f9
KH
7380 EMACS_INT chars = to - from;
7381 EMACS_INT bytes = to_byte - from_byte;
7382 Lisp_Object attrs;
4776e638 7383 int saved_pt = -1, saved_pt_byte;
64cedb0c 7384 int need_marker_adjustment = 0;
c02d943b 7385 int kill_src_buffer = 0;
b3bfad50 7386 Lisp_Object old_deactivate_mark;
df7492f9 7387
b3bfad50 7388 old_deactivate_mark = Vdeactivate_mark;
df7492f9
KH
7389
7390 coding->src_object = src_object;
7391 coding->src_chars = chars;
7392 coding->src_bytes = bytes;
7393 coding->src_multibyte = chars < bytes;
7394
7395 attrs = CODING_ID_ATTRS (coding->id);
7396
64cedb0c
KH
7397 if (EQ (src_object, dst_object))
7398 {
7399 struct Lisp_Marker *tail;
7400
7401 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7402 {
7403 tail->need_adjustment
7404 = tail->charpos == (tail->insertion_type ? from : to);
7405 need_marker_adjustment |= tail->need_adjustment;
7406 }
7407 }
7408
df7492f9 7409 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6bac5b12 7410 {
24a73b0a 7411 coding->src_object = code_conversion_save (1, coding->src_multibyte);
df7492f9
KH
7412 set_buffer_internal (XBUFFER (coding->src_object));
7413 if (STRINGP (src_object))
7414 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7415 else if (BUFFERP (src_object))
7416 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7417 else
7418 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
d46c5b12 7419
df7492f9
KH
7420 if (EQ (src_object, dst_object))
7421 {
7422 set_buffer_internal (XBUFFER (src_object));
4776e638 7423 saved_pt = PT, saved_pt_byte = PT_BYTE;
df7492f9
KH
7424 del_range_both (from, from_byte, to, to_byte, 1);
7425 set_buffer_internal (XBUFFER (coding->src_object));
7426 }
7427
d4850d67
KH
7428 {
7429 Lisp_Object args[3];
b3bfad50 7430 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
d4850d67 7431
b3bfad50
KH
7432 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7433 old_deactivate_mark);
d4850d67
KH
7434 args[0] = CODING_ATTR_PRE_WRITE (attrs);
7435 args[1] = make_number (BEG);
7436 args[2] = make_number (Z);
7437 safe_call (3, args);
b3bfad50 7438 UNGCPRO;
d4850d67 7439 }
c02d943b
KH
7440 if (XBUFFER (coding->src_object) != current_buffer)
7441 kill_src_buffer = 1;
ac87bbef 7442 coding->src_object = Fcurrent_buffer ();
df7492f9
KH
7443 if (BEG != GPT)
7444 move_gap_both (BEG, BEG_BYTE);
7445 coding->src_chars = Z - BEG;
7446 coding->src_bytes = Z_BYTE - BEG_BYTE;
7447 coding->src_pos = BEG;
7448 coding->src_pos_byte = BEG_BYTE;
7449 coding->src_multibyte = Z < Z_BYTE;
7450 }
7451 else if (STRINGP (src_object))
d46c5b12 7452 {
24a73b0a 7453 code_conversion_save (0, 0);
df7492f9
KH
7454 coding->src_pos = from;
7455 coding->src_pos_byte = from_byte;
b73bfc1c 7456 }
df7492f9 7457 else if (BUFFERP (src_object))
b73bfc1c 7458 {
24a73b0a 7459 code_conversion_save (0, 0);
df7492f9 7460 set_buffer_internal (XBUFFER (src_object));
df7492f9 7461 if (EQ (src_object, dst_object))
d46c5b12 7462 {
4776e638 7463 saved_pt = PT, saved_pt_byte = PT_BYTE;
ff0dacd7
KH
7464 coding->src_object = del_range_1 (from, to, 1, 1);
7465 coding->src_pos = 0;
7466 coding->src_pos_byte = 0;
d46c5b12 7467 }
df7492f9 7468 else
d46c5b12 7469 {
ff0dacd7
KH
7470 if (from < GPT && to >= GPT)
7471 move_gap_both (from, from_byte);
df7492f9
KH
7472 coding->src_pos = from;
7473 coding->src_pos_byte = from_byte;
d46c5b12 7474 }
d46c5b12 7475 }
4776e638 7476 else
24a73b0a 7477 code_conversion_save (0, 0);
d46c5b12 7478
df7492f9 7479 if (BUFFERP (dst_object))
88993dfd 7480 {
df7492f9 7481 coding->dst_object = dst_object;
28f67a95
KH
7482 if (EQ (src_object, dst_object))
7483 {
7484 coding->dst_pos = from;
7485 coding->dst_pos_byte = from_byte;
7486 }
7487 else
7488 {
319a3947
KH
7489 struct buffer *current = current_buffer;
7490
7491 set_buffer_temp (XBUFFER (dst_object));
7492 coding->dst_pos = PT;
7493 coding->dst_pos_byte = PT_BYTE;
7494 move_gap_both (coding->dst_pos, coding->dst_pos_byte);
7495 set_buffer_temp (current);
28f67a95 7496 }
df7492f9
KH
7497 coding->dst_multibyte
7498 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
88993dfd 7499 }
df7492f9 7500 else if (EQ (dst_object, Qt))
d46c5b12 7501 {
df7492f9 7502 coding->dst_object = Qnil;
df7492f9 7503 coding->dst_bytes = coding->src_chars;
ac87bbef
KH
7504 if (coding->dst_bytes == 0)
7505 coding->dst_bytes = 1;
7506 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
df7492f9 7507 coding->dst_multibyte = 0;
d46c5b12
KH
7508 }
7509 else
7510 {
df7492f9
KH
7511 coding->dst_object = Qnil;
7512 coding->dst_multibyte = 0;
d46c5b12
KH
7513 }
7514
df7492f9 7515 encode_coding (coding);
d46c5b12 7516
df7492f9 7517 if (EQ (dst_object, Qt))
d46c5b12 7518 {
df7492f9
KH
7519 if (BUFFERP (coding->dst_object))
7520 coding->dst_object = Fbuffer_string ();
7521 else
d46c5b12 7522 {
df7492f9
KH
7523 coding->dst_object
7524 = make_unibyte_string ((char *) coding->destination,
7525 coding->produced);
7526 xfree (coding->destination);
d46c5b12 7527 }
4ed46869 7528 }
d46c5b12 7529
4776e638
KH
7530 if (saved_pt >= 0)
7531 {
7532 /* This is the case of:
7533 (BUFFERP (src_object) && EQ (src_object, dst_object))
7534 As we have moved PT while replacing the original buffer
7535 contents, we must recover it now. */
7536 set_buffer_internal (XBUFFER (src_object));
7537 if (saved_pt < from)
7538 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7539 else if (saved_pt < from + chars)
7540 TEMP_SET_PT_BOTH (from, from_byte);
7541 else if (! NILP (current_buffer->enable_multibyte_characters))
7542 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7543 saved_pt_byte + (coding->produced - bytes));
d46c5b12 7544 else
4776e638
KH
7545 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7546 saved_pt_byte + (coding->produced - bytes));
64cedb0c
KH
7547
7548 if (need_marker_adjustment)
7549 {
7550 struct Lisp_Marker *tail;
7551
7552 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7553 if (tail->need_adjustment)
7554 {
7555 tail->need_adjustment = 0;
7556 if (tail->insertion_type)
7557 {
7558 tail->bytepos = from_byte;
7559 tail->charpos = from;
7560 }
7561 else
7562 {
7563 tail->bytepos = from_byte + coding->produced;
7564 tail->charpos
7565 = (NILP (current_buffer->enable_multibyte_characters)
7566 ? tail->bytepos : from + coding->produced_char);
7567 }
7568 }
7569 }
4776e638
KH
7570 }
7571
c02d943b
KH
7572 if (kill_src_buffer)
7573 Fkill_buffer (coding->src_object);
b3bfad50
KH
7574
7575 Vdeactivate_mark = old_deactivate_mark;
df7492f9 7576 unbind_to (count, Qnil);
b73bfc1c
KH
7577}
7578
df7492f9 7579
b73bfc1c 7580Lisp_Object
df7492f9 7581preferred_coding_system ()
b73bfc1c 7582{
df7492f9 7583 int id = coding_categories[coding_priorities[0]].id;
2391eaa4 7584
df7492f9 7585 return CODING_ID_NAME (id);
4ed46869
KH
7586}
7587
7588\f
7589#ifdef emacs
1397dc18 7590/*** 11. Emacs Lisp library functions ***/
4ed46869 7591
4ed46869 7592DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
48b0f3ae 7593 doc: /* Return t if OBJECT is nil or a coding-system.
df7492f9 7594See the documentation of `define-coding-system' for information
48b0f3ae 7595about coding-system objects. */)
d4a1d553
JB
7596 (object)
7597 Lisp_Object object;
4ed46869 7598{
d4a1d553
JB
7599 if (NILP (object)
7600 || CODING_SYSTEM_ID (object) >= 0)
44e8490d 7601 return Qt;
d4a1d553
JB
7602 if (! SYMBOLP (object)
7603 || NILP (Fget (object, Qcoding_system_define_form)))
44e8490d
KH
7604 return Qnil;
7605 return Qt;
4ed46869
KH
7606}
7607
9d991de8
RS
7608DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
7609 Sread_non_nil_coding_system, 1, 1, 0,
48b0f3ae
PJ
7610 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
7611 (prompt)
4ed46869
KH
7612 Lisp_Object prompt;
7613{
e0e989f6 7614 Lisp_Object val;
9d991de8
RS
7615 do
7616 {
4608c386
KH
7617 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7618 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
9d991de8 7619 }
8f924df7 7620 while (SCHARS (val) == 0);
e0e989f6 7621 return (Fintern (val, Qnil));
4ed46869
KH
7622}
7623
9b787f3e 7624DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
48b0f3ae 7625 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
c7183fb8
GM
7626If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
7627Ignores case when completing coding systems (all Emacs coding systems
7628are lower-case). */)
48b0f3ae 7629 (prompt, default_coding_system)
9b787f3e 7630 Lisp_Object prompt, default_coding_system;
4ed46869 7631{
f44d27ce 7632 Lisp_Object val;
c7183fb8
GM
7633 int count = SPECPDL_INDEX ();
7634
9b787f3e 7635 if (SYMBOLP (default_coding_system))
57d25e6f 7636 default_coding_system = SYMBOL_NAME (default_coding_system);
c7183fb8 7637 specbind (Qcompletion_ignore_case, Qt);
4608c386 7638 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
9b787f3e
RS
7639 Qt, Qnil, Qcoding_system_history,
7640 default_coding_system, Qnil);
c7183fb8 7641 unbind_to (count, Qnil);
8f924df7 7642 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
7643}
7644
7645DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
7646 1, 1, 0,
48b0f3ae 7647 doc: /* Check validity of CODING-SYSTEM.
9ffd559c
KH
7648If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
7649It is valid if it is nil or a symbol defined as a coding system by the
7650function `define-coding-system'. */)
df7492f9 7651 (coding_system)
4ed46869
KH
7652 Lisp_Object coding_system;
7653{
44e8490d
KH
7654 Lisp_Object define_form;
7655
7656 define_form = Fget (coding_system, Qcoding_system_define_form);
7657 if (! NILP (define_form))
7658 {
7659 Fput (coding_system, Qcoding_system_define_form, Qnil);
7660 safe_eval (define_form);
7661 }
4ed46869
KH
7662 if (!NILP (Fcoding_system_p (coding_system)))
7663 return coding_system;
fcad4ec4 7664 xsignal1 (Qcoding_system_error, coding_system);
4ed46869 7665}
df7492f9 7666
3a73fa5d 7667\f
89528eb3
KH
7668/* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
7669 HIGHEST is nonzero, return the coding system of the highest
7670 priority among the detected coding systems. Otherwise return a
7671 list of detected coding systems sorted by their priorities. If
7672 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
7673 multibyte form but contain only ASCII and eight-bit chars.
7674 Otherwise, the bytes are raw bytes.
7675
7676 CODING-SYSTEM controls the detection as below:
7677
7678 If it is nil, detect both text-format and eol-format. If the
7679 text-format part of CODING-SYSTEM is already specified
7680 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
7681 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
7682 detect only text-format.
   SRC_CHARS is recorded in coding.src_chars for the detectors. */
7683
d46c5b12 7684Lisp_Object
24a73b0a
KH
7685detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
7686 coding_system)
8f924df7 7687 const unsigned char *src;
13818c30
SM
7688 EMACS_INT src_chars, src_bytes;
7689 int highest;
0a28aafb 7690 int multibytep;
df7492f9 7691 Lisp_Object coding_system;
4ed46869 7692{
8f924df7 7693 const unsigned char *src_end = src + src_bytes;
df7492f9 7694 Lisp_Object attrs, eol_type;
4533845d 7695 Lisp_Object val = Qnil;
df7492f9 7696 struct coding_system coding;
89528eb3 7697 int id;
ff0dacd7 7698 struct coding_detection_info detect_info;
24a73b0a 7699 enum coding_category base_category;
2f3cbb32 7700 int null_byte_found = 0, eight_bit_found = 0;
b73bfc1c 7701
df7492f9
KH
/* A nil CODING-SYSTEM is treated as `undecided'. */
7702 if (NILP (coding_system))
7703 coding_system = Qundecided;
7704 setup_coding_system (coding_system, &coding);
7705 attrs = CODING_ID_ATTRS (coding.id);
7706 eol_type = CODING_ID_EOL_TYPE (coding.id);
89528eb3 7707 coding_system = CODING_ATTR_BASE_NAME (attrs);
4ed46869 7708
df7492f9 7709 coding.source = src;
24a73b0a 7710 coding.src_chars = src_chars;
df7492f9
KH
7711 coding.src_bytes = src_bytes;
7712 coding.src_multibyte = multibytep;
7713 coding.consumed = 0;
89528eb3 7714 coding.mode |= CODING_MODE_LAST_BLOCK;
c0e16b14 7715 coding.head_ascii = 0;
d46c5b12 7716
ff0dacd7 7717 detect_info.checked = detect_info.found = detect_info.rejected = 0;
d46c5b12 7718
89528eb3 7719 /* At first, detect text-format if necessary. */
24a73b0a
KH
7720 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
7721 if (base_category == coding_category_undecided)
4ed46869 7722 {
ff0dacd7
KH
7723 enum coding_category category;
7724 struct coding_system *this;
7725 int c, i;
88993dfd 7726
24a73b0a 7727 /* Skip all ASCII bytes except for a few ISO2022 controls. */
/* coding.head_ascii counts the bytes seen before the first
   eight-bit byte; the scan stops early once both a null byte and
   an eight-bit byte have been seen. */
2f3cbb32 7728 for (; src < src_end; src++)
4ed46869 7729 {
df7492f9 7730 c = *src;
6cb21a4f 7731 if (c & 0x80)
6cb21a4f 7732 {
2f3cbb32 7733 eight_bit_found = 1;
2f3cbb32
KH
7734 if (null_byte_found)
7735 break;
7736 }
c0e16b14 7737 else if (c < 0x20)
2f3cbb32
KH
7738 {
7739 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
7740 && ! inhibit_iso_escape_detection
7741 && ! detect_info.checked)
6cb21a4f 7742 {
2f3cbb32
KH
7743 if (detect_coding_iso_2022 (&coding, &detect_info))
7744 {
7745 /* We have scanned the whole data. */
7746 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
c0e16b14
KH
7747 {
7748 /* We didn't find an 8-bit code. We may
7749 have found a null-byte, but it's very
7750 rare that a binary file conforms to
7751 ISO-2022. */
7752 src = src_end;
7753 coding.head_ascii = src - coding.source;
7754 }
7755 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
2f3cbb32
KH
7756 break;
7757 }
7758 }
7759 else if (! c)
7760 {
7761 null_byte_found = 1;
7762 if (eight_bit_found)
7763 break;
6cb21a4f 7764 }
c006c0c8
KH
7765 if (! eight_bit_found)
7766 coding.head_ascii++;
6cb21a4f 7767 }
c006c0c8 7768 else if (! eight_bit_found)
c0e16b14 7769 coding.head_ascii++;
4ed46869 7770 }
88993dfd 7771
2f3cbb32
KH
7772 if (null_byte_found || eight_bit_found
7773 || coding.head_ascii < coding.src_bytes
6cb21a4f
KH
7774 || detect_info.found)
7775 {
2f3cbb32 7776 if (coding.head_ascii == coding.src_bytes)
6cb21a4f
KH
7777 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
7778 for (i = 0; i < coding_category_raw_text; i++)
ff0dacd7 7779 {
6cb21a4f 7780 category = coding_priorities[i];
c7266f4a 7781 this = coding_categories + category;
6cb21a4f 7782 if (detect_info.found & (1 << category))
ff0dacd7
KH
7783 break;
7784 }
6cb21a4f 7785 else
2f3cbb32
KH
7786 {
7787 if (null_byte_found)
7788 {
/* With a null byte present, mark every category except the
   UTF-16 ones as already checked and rejected. */
7789 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
7790 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
7791 }
/* Run the detectors in priority order; with HIGHEST, stop at the
   first category that is found. */
7792 for (i = 0; i < coding_category_raw_text; i++)
7793 {
7794 category = coding_priorities[i];
7795 this = coding_categories + category;
6cb21a4f 7796
2f3cbb32
KH
7797 if (this->id < 0)
7798 {
7799 /* No coding system of this category is defined. */
7800 detect_info.rejected |= (1 << category);
7801 }
7802 else if (category >= coding_category_raw_text)
7803 continue;
7804 else if (detect_info.checked & (1 << category))
7805 {
7806 if (highest
7807 && (detect_info.found & (1 << category)))
6cb21a4f 7808 break;
2f3cbb32
KH
7809 }
7810 else if ((*(this->detector)) (&coding, &detect_info)
7811 && highest
7812 && (detect_info.found & (1 << category)))
7813 {
7814 if (category == coding_category_utf_16_auto)
7815 {
7816 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7817 category = coding_category_utf_16_le;
7818 else
7819 category = coding_category_utf_16_be;
7820 }
7821 break;
7822 }
7823 }
7824 }
6cb21a4f 7825 }
ec6d2bb8 7826
/* Every category was rejected: fall back to the raw-text category. */
2f3cbb32 7827 if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY)
ec6d2bb8 7828 {
ff0dacd7 7829 detect_info.found = CATEGORY_MASK_RAW_TEXT;
89528eb3
KH
7830 id = coding_categories[coding_category_raw_text].id;
7831 val = Fcons (make_number (id), Qnil);
7832 }
/* Nothing was rejected and nothing was found: report the undecided
   category. */
ff0dacd7 7833 else if (! detect_info.rejected && ! detect_info.found)
89528eb3 7834 {
ff0dacd7 7835 detect_info.found = CATEGORY_MASK_ANY;
89528eb3
KH
7836 id = coding_categories[coding_category_undecided].id;
7837 val = Fcons (make_number (id), Qnil);
7838 }
/* HIGHEST: produce a one-element list, either the category the loops
   above stopped at or the first non-rejected one in priority order. */
7839 else if (highest)
7840 {
ff0dacd7 7841 if (detect_info.found)
ec6d2bb8 7842 {
ff0dacd7
KH
7843 detect_info.found = 1 << category;
7844 val = Fcons (make_number (this->id), Qnil);
7845 }
7846 else
7847 for (i = 0; i < coding_category_raw_text; i++)
7848 if (! (detect_info.rejected & (1 << coding_priorities[i])))
7849 {
7850 detect_info.found = 1 << coding_priorities[i];
7851 id = coding_categories[coding_priorities[i]].id;
7852 val = Fcons (make_number (id), Qnil);
7853 break;
7854 }
7855 }
89528eb3
KH
7856 else
7857 {
/* Build the full list.  Both loops walk the priorities backwards and
   cons onto VAL, so the found categories end up first, followed by
   those that were neither found nor rejected, each group in priority
   order. */
ff0dacd7
KH
7858 int mask = detect_info.rejected | detect_info.found;
7859 int found = 0;
ec6d2bb8 7860
89528eb3 7861 for (i = coding_category_raw_text - 1; i >= 0; i--)
ff0dacd7
KH
7862 {
7863 category = coding_priorities[i];
7864 if (! (mask & (1 << category)))
ec6d2bb8 7865 {
ff0dacd7
KH
7866 found |= 1 << category;
7867 id = coding_categories[category].id;
c7266f4a
KH
7868 if (id >= 0)
7869 val = Fcons (make_number (id), val);
ff0dacd7
KH
7870 }
7871 }
7872 for (i = coding_category_raw_text - 1; i >= 0; i--)
7873 {
7874 category = coding_priorities[i];
7875 if (detect_info.found & (1 << category))
7876 {
7877 id = coding_categories[category].id;
7878 val = Fcons (make_number (id), val);
ec6d2bb8 7879 }
ec6d2bb8 7880 }
ff0dacd7 7881 detect_info.found |= found;
ec6d2bb8 7882 }
ec6d2bb8 7883 }
a470d443
KH
/* The base coding system is utf-8-auto: only decide between the
   variants with and without signature. */
7884 else if (base_category == coding_category_utf_8_auto)
7885 {
7886 if (detect_coding_utf_8 (&coding, &detect_info))
7887 {
7888 struct coding_system *this;
7889
7890 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
7891 this = coding_categories + coding_category_utf_8_sig;
7892 else
7893 this = coding_categories + coding_category_utf_8_nosig;
7894 val = Fcons (make_number (this->id), Qnil);
7895 }
7896 }
24a73b0a
KH
/* The base coding system is utf-16-auto: only decide between the
   UTF-16 variants. */
7897 else if (base_category == coding_category_utf_16_auto)
7898 {
7899 if (detect_coding_utf_16 (&coding, &detect_info))
7900 {
24a73b0a
KH
7901 struct coding_system *this;
7902
7903 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7904 this = coding_categories + coding_category_utf_16_le;
7905 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
7906 this = coding_categories + coding_category_utf_16_be;
7907 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
7908 this = coding_categories + coding_category_utf_16_be_nosig;
7909 else
7910 this = coding_categories + coding_category_utf_16_le_nosig;
7911 val = Fcons (make_number (this->id), Qnil);
7912 }
7913 }
df7492f9
KH
7914 else
7915 {
/* The text-format is already specified: report the given coding
   system itself. */
ff0dacd7 7916 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
89528eb3 7917 val = Fcons (make_number (coding.id), Qnil);
4ed46869 7918 }
df7492f9 7919
89528eb3 7920 /* Then, detect eol-format if necessary. */
df7492f9 7921 {
/* One eol result for each way of scanning the bytes; each starts out
   as -1. */
4533845d 7922 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
df7492f9
KH
7923 Lisp_Object tail;
7924
89528eb3
KH
7925 if (VECTORP (eol_type))
7926 {
ff0dacd7 7927 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
2f3cbb32
KH
7928 {
7929 if (null_byte_found)
7930 normal_eol = EOL_SEEN_LF;
7931 else
7932 normal_eol = detect_eol (coding.source, src_bytes,
7933 coding_category_raw_text);
7934 }
ff0dacd7
KH
7935 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
7936 | CATEGORY_MASK_UTF_16_BE_NOSIG))
89528eb3
KH
7937 utf_16_be_eol = detect_eol (coding.source, src_bytes,
7938 coding_category_utf_16_be);
ff0dacd7
KH
7939 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
7940 | CATEGORY_MASK_UTF_16_LE_NOSIG))
89528eb3
KH
7941 utf_16_le_eol = detect_eol (coding.source, src_bytes,
7942 coding_category_utf_16_le);
7943 }
7944 else
7945 {
/* The eol-format is already specified by the given coding system. */
7946 if (EQ (eol_type, Qunix))
7947 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
7948 else if (EQ (eol_type, Qdos))
7949 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
7950 else
7951 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
7952 }
7953
df7492f9
KH
/* Replace each coding-system id in VAL by a coding-system name: the
   element of the eol-type vector matching the eol result when there
   is one, otherwise CODING_ID_NAME of the id. */
7954 for (tail = val; CONSP (tail); tail = XCDR (tail))
7955 {
89528eb3 7956 enum coding_category category;
df7492f9 7957 int this_eol;
89528eb3
KH
7958
7959 id = XINT (XCAR (tail));
7960 attrs = CODING_ID_ATTRS (id);
7961 category = XINT (CODING_ATTR_CATEGORY (attrs));
7962 eol_type = CODING_ID_EOL_TYPE (id);
df7492f9
KH
7963 if (VECTORP (eol_type))
7964 {
89528eb3
KH
7965 if (category == coding_category_utf_16_be
7966 || category == coding_category_utf_16_be_nosig)
7967 this_eol = utf_16_be_eol;
7968 else if (category == coding_category_utf_16_le
7969 || category == coding_category_utf_16_le_nosig)
7970 this_eol = utf_16_le_eol;
df7492f9 7971 else
89528eb3
KH
7972 this_eol = normal_eol;
7973
df7492f9
KH
7974 if (this_eol == EOL_SEEN_LF)
7975 XSETCAR (tail, AREF (eol_type, 0));
7976 else if (this_eol == EOL_SEEN_CRLF)
7977 XSETCAR (tail, AREF (eol_type, 1));
7978 else if (this_eol == EOL_SEEN_CR)
7979 XSETCAR (tail, AREF (eol_type, 2));
89528eb3
KH
7980 else
7981 XSETCAR (tail, CODING_ID_NAME (id));
df7492f9 7982 }
89528eb3
KH
7983 else
7984 XSETCAR (tail, CODING_ID_NAME (id));
df7492f9
KH
7985 }
7986 }
ec6d2bb8 7987
/* With HIGHEST, return only the first element, or nil if VAL is
   empty. */
4533845d 7988 return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
ec6d2bb8
KH
7989}
7990
ec6d2bb8 7991
d46c5b12
KH
7992DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
7993 2, 3, 0,
48b0f3ae
PJ
7994 doc: /* Detect coding system of the text in the region between START and END.
7995Return a list of possible coding systems ordered by priority.
ec6d2bb8 7996
12e0131a 7997If only ASCII characters are found (except for such ISO-2022 control
d4a1d553
JB
7998characters as ESC), it returns a list of single element `undecided'
7999or its subsidiary coding system according to a detected end-of-line
8000format.
ec6d2bb8 8001
48b0f3ae
PJ
8002If optional argument HIGHEST is non-nil, return the coding system of
8003highest priority. */)
8004 (start, end, highest)
d46c5b12
KH
8005 Lisp_Object start, end, highest;
8006{
8007 int from, to;
8008 int from_byte, to_byte;
ec6d2bb8 8009
b7826503
PJ
8010 CHECK_NUMBER_COERCE_MARKER (start);
8011 CHECK_NUMBER_COERCE_MARKER (end);
ec6d2bb8 8012
d46c5b12
KH
8013 validate_region (&start, &end);
8014 from = XINT (start), to = XINT (end);
8015 from_byte = CHAR_TO_BYTE (from);
8016 to_byte = CHAR_TO_BYTE (to);
ec6d2bb8 8017
d46c5b12
KH
8018 if (from < GPT && to >= GPT)
8019 move_gap_both (to, to_byte);
c210f766 8020
d46c5b12 8021 return detect_coding_system (BYTE_POS_ADDR (from_byte),
24a73b0a 8022 to - from, to_byte - from_byte,
0a28aafb
KH
8023 !NILP (highest),
8024 !NILP (current_buffer
df7492f9
KH
8025 ->enable_multibyte_characters),
8026 Qnil);
ec6d2bb8
KH
8027}
8028
d46c5b12
KH
8029DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8030 1, 2, 0,
48b0f3ae
PJ
8031 doc: /* Detect coding system of the text in STRING.
8032Return a list of possible coding systems ordered by priority.
fb88bf2d 8033
12e0131a 8034If only ASCII characters are found (except for such ISO-2022 control
d4a1d553
JB
8035characters as ESC), it returns a list of single element `undecided'
8036or its subsidiary coding system according to a detected end-of-line
8037format.
d46c5b12 8038
48b0f3ae
PJ
8039If optional argument HIGHEST is non-nil, return the coding system of
8040highest priority. */)
8041 (string, highest)
d46c5b12
KH
8042 Lisp_Object string, highest;
8043{
b7826503 8044 CHECK_STRING (string);
b73bfc1c 8045
24a73b0a
KH
8046 return detect_coding_system (SDATA (string),
8047 SCHARS (string), SBYTES (string),
8f924df7 8048 !NILP (highest), STRING_MULTIBYTE (string),
df7492f9 8049 Qnil);
4ed46869 8050}
4ed46869 8051
b73bfc1c 8052
df7492f9
KH
8053static INLINE int
8054char_encodable_p (c, attrs)
8055 int c;
8056 Lisp_Object attrs;
05e6f5dc 8057{
df7492f9 8058 Lisp_Object tail;
df7492f9 8059 struct charset *charset;
7d64c6ad 8060 Lisp_Object translation_table;
d46c5b12 8061
7d64c6ad 8062 translation_table = CODING_ATTR_TRANS_TBL (attrs);
a6f87d34 8063 if (! NILP (translation_table))
7d64c6ad 8064 c = translate_char (translation_table, c);
df7492f9
KH
8065 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8066 CONSP (tail); tail = XCDR (tail))
e133c8fa 8067 {
df7492f9
KH
8068 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8069 if (CHAR_CHARSET_P (c, charset))
8070 break;
e133c8fa 8071 }
df7492f9 8072 return (! NILP (tail));
05e6f5dc 8073}
83fa074f 8074
fb88bf2d 8075
df7492f9
KH
8076/* Return a list of coding systems that safely encode the text between
8077 START and END. If EXCLUDE is non-nil, it is a list of coding
8078 systems not to check. The returned list doesn't contain any such
48468dac 8079 coding systems. In any case, if the text contains only ASCII or is
df7492f9 8080 unibyte, return t. */
e077cc80 8081
df7492f9
KH
8082DEFUN ("find-coding-systems-region-internal",
8083 Ffind_coding_systems_region_internal,
8084 Sfind_coding_systems_region_internal, 2, 3, 0,
8085 doc: /* Internal use only. */)
8086 (start, end, exclude)
8087 Lisp_Object start, end, exclude;
8088{
8089 Lisp_Object coding_attrs_list, safe_codings;
8090 EMACS_INT start_byte, end_byte;
7c78e542 8091 const unsigned char *p, *pbeg, *pend;
df7492f9
KH
8092 int c;
8093 Lisp_Object tail, elt;
d46c5b12 8094
df7492f9
KH
8095 if (STRINGP (start))
8096 {
8097 if (!STRING_MULTIBYTE (start)
8f924df7 8098 || SCHARS (start) == SBYTES (start))
df7492f9
KH
8099 return Qt;
8100 start_byte = 0;
8f924df7 8101 end_byte = SBYTES (start);
df7492f9
KH
8102 }
8103 else
d46c5b12 8104 {
df7492f9
KH
8105 CHECK_NUMBER_COERCE_MARKER (start);
8106 CHECK_NUMBER_COERCE_MARKER (end);
8107 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8108 args_out_of_range (start, end);
8109 if (NILP (current_buffer->enable_multibyte_characters))
8110 return Qt;
8111 start_byte = CHAR_TO_BYTE (XINT (start));
8112 end_byte = CHAR_TO_BYTE (XINT (end));
8113 if (XINT (end) - XINT (start) == end_byte - start_byte)
8114 return Qt;
d46c5b12 8115
e1c23804 8116 if (XINT (start) < GPT && XINT (end) > GPT)
d46c5b12 8117 {
e1c23804
DL
8118 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8119 move_gap_both (XINT (start), start_byte);
df7492f9 8120 else
e1c23804 8121 move_gap_both (XINT (end), end_byte);
d46c5b12
KH
8122 }
8123 }
8124
df7492f9
KH
8125 coding_attrs_list = Qnil;
8126 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8127 if (NILP (exclude)
8128 || NILP (Fmemq (XCAR (tail), exclude)))
8129 {
8130 Lisp_Object attrs;
d46c5b12 8131
df7492f9
KH
8132 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8133 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8134 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7d64c6ad
KH
8135 {
8136 ASET (attrs, coding_attr_trans_tbl,
2170c8f0 8137 get_translation_table (attrs, 1, NULL));
7d64c6ad
KH
8138 coding_attrs_list = Fcons (attrs, coding_attrs_list);
8139 }
df7492f9 8140 }
d46c5b12 8141
df7492f9 8142 if (STRINGP (start))
8f924df7 8143 p = pbeg = SDATA (start);
df7492f9
KH
8144 else
8145 p = pbeg = BYTE_POS_ADDR (start_byte);
8146 pend = p + (end_byte - start_byte);
b843d1ae 8147
df7492f9
KH
8148 while (p < pend && ASCII_BYTE_P (*p)) p++;
8149 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
d46c5b12 8150
05e6f5dc 8151 while (p < pend)
72d1a715 8152 {
df7492f9
KH
8153 if (ASCII_BYTE_P (*p))
8154 p++;
72d1a715
RS
8155 else
8156 {
df7492f9 8157 c = STRING_CHAR_ADVANCE (p);
12410ef1 8158
df7492f9
KH
8159 charset_map_loaded = 0;
8160 for (tail = coding_attrs_list; CONSP (tail);)
8161 {
8162 elt = XCAR (tail);
8163 if (NILP (elt))
8164 tail = XCDR (tail);
8165 else if (char_encodable_p (c, elt))
8166 tail = XCDR (tail);
8167 else if (CONSP (XCDR (tail)))
8168 {
8169 XSETCAR (tail, XCAR (XCDR (tail)));
8170 XSETCDR (tail, XCDR (XCDR (tail)));
8171 }
8172 else
8173 {
8174 XSETCAR (tail, Qnil);
8175 tail = XCDR (tail);
8176 }
8177 }
8178 if (charset_map_loaded)
8179 {
8180 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
05e6f5dc 8181
df7492f9 8182 if (STRINGP (start))
8f924df7 8183 pbeg = SDATA (start);
df7492f9
KH
8184 else
8185 pbeg = BYTE_POS_ADDR (start_byte);
8186 p = pbeg + p_offset;
8187 pend = pbeg + pend_offset;
8188 }
8189 }
ec6d2bb8 8190 }
fb88bf2d 8191
988b3759 8192 safe_codings = list2 (Qraw_text, Qno_conversion);
df7492f9
KH
8193 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8194 if (! NILP (XCAR (tail)))
8195 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
ec6d2bb8 8196
05e6f5dc
KH
8197 return safe_codings;
8198}
4956c225 8199
d46c5b12 8200
8f924df7
KH
8201DEFUN ("unencodable-char-position", Funencodable_char_position,
8202 Sunencodable_char_position, 3, 5, 0,
8203 doc: /*
8204Return position of first un-encodable character in a region.
d4a1d553 8205START and END specify the region and CODING-SYSTEM specifies the
8f924df7 8206encoding to check. Return nil if CODING-SYSTEM does encode the region.
d46c5b12 8207
8f924df7
KH
8208If optional 4th argument COUNT is non-nil, it specifies at most how
8209many un-encodable characters to search. In this case, the value is a
8210list of positions.
d46c5b12 8211
8f924df7
KH
8212If optional 5th argument STRING is non-nil, it is a string to search
8213for un-encodable characters. In that case, START and END are indexes
8214to the string. */)
8215 (start, end, coding_system, count, string)
8216 Lisp_Object start, end, coding_system, count, string;
8217{
8218 int n;
8219 struct coding_system coding;
7d64c6ad 8220 Lisp_Object attrs, charset_list, translation_table;
8f924df7
KH
8221 Lisp_Object positions;
8222 int from, to;
8223 const unsigned char *p, *stop, *pend;
8224 int ascii_compatible;
fb88bf2d 8225
8f924df7
KH
8226 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8227 attrs = CODING_ID_ATTRS (coding.id);
8228 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8229 return Qnil;
8230 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8231 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2170c8f0 8232 translation_table = get_translation_table (attrs, 1, NULL);
fb88bf2d 8233
8f924df7
KH
8234 if (NILP (string))
8235 {
8236 validate_region (&start, &end);
8237 from = XINT (start);
8238 to = XINT (end);
8239 if (NILP (current_buffer->enable_multibyte_characters)
8240 || (ascii_compatible
8241 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8242 return Qnil;
8243 p = CHAR_POS_ADDR (from);
8244 pend = CHAR_POS_ADDR (to);
8245 if (from < GPT && to >= GPT)
8246 stop = GPT_ADDR;
8247 else
8248 stop = pend;
8249 }
8250 else
8251 {
8252 CHECK_STRING (string);
8253 CHECK_NATNUM (start);
8254 CHECK_NATNUM (end);
8255 from = XINT (start);
8256 to = XINT (end);
8257 if (from > to
8258 || to > SCHARS (string))
8259 args_out_of_range_3 (string, start, end);
8260 if (! STRING_MULTIBYTE (string))
8261 return Qnil;
8262 p = SDATA (string) + string_char_to_byte (string, from);
8263 stop = pend = SDATA (string) + string_char_to_byte (string, to);
8264 if (ascii_compatible && (to - from) == (pend - p))
8265 return Qnil;
8266 }
f2558efd 8267
8f924df7
KH
8268 if (NILP (count))
8269 n = 1;
8270 else
b73bfc1c 8271 {
8f924df7
KH
8272 CHECK_NATNUM (count);
8273 n = XINT (count);
b73bfc1c
KH
8274 }
8275
8f924df7
KH
8276 positions = Qnil;
8277 while (1)
d46c5b12 8278 {
8f924df7 8279 int c;
ec6d2bb8 8280
8f924df7
KH
8281 if (ascii_compatible)
8282 while (p < stop && ASCII_BYTE_P (*p))
8283 p++, from++;
8284 if (p >= stop)
0e79d667 8285 {
8f924df7
KH
8286 if (p >= pend)
8287 break;
8288 stop = pend;
8289 p = GAP_END_ADDR;
0e79d667 8290 }
ec6d2bb8 8291
8f924df7
KH
8292 c = STRING_CHAR_ADVANCE (p);
8293 if (! (ASCII_CHAR_P (c) && ascii_compatible)
7d64c6ad
KH
8294 && ! char_charset (translate_char (translation_table, c),
8295 charset_list, NULL))
ec6d2bb8 8296 {
8f924df7
KH
8297 positions = Fcons (make_number (from), positions);
8298 n--;
8299 if (n == 0)
8300 break;
ec6d2bb8
KH
8301 }
8302
8f924df7
KH
8303 from++;
8304 }
d46c5b12 8305
8f924df7
KH
8306 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8307}
d46c5b12 8308
d46c5b12 8309
df7492f9
KH
8310DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8311 Scheck_coding_systems_region, 3, 3, 0,
8312 doc: /* Check if the region is encodable by coding systems.
d46c5b12 8313
df7492f9
KH
8314START and END are buffer positions specifying the region.
8315CODING-SYSTEM-LIST is a list of coding systems to check.
d46c5b12 8316
df7492f9 8317The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
d4a1d553 8318CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
df7492f9
KH
8319whole region, POS0, POS1, ... are buffer positions where non-encodable
8320characters are found.
93dec019 8321
df7492f9
KH
8322If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8323value is nil.
93dec019 8324
df7492f9
KH
8325START may be a string. In that case, check if the string is
8326encodable, and the value contains indices to the string instead of
8327buffer positions. END is ignored. */)
8328 (start, end, coding_system_list)
8329 Lisp_Object start, end, coding_system_list;
05e6f5dc 8330{
df7492f9
KH
8331 Lisp_Object list;
8332 EMACS_INT start_byte, end_byte;
8333 int pos;
7c78e542 8334 const unsigned char *p, *pbeg, *pend;
df7492f9 8335 int c;
7d64c6ad 8336 Lisp_Object tail, elt, attrs;
70ad9fc4 8337
05e6f5dc
KH
8338 if (STRINGP (start))
8339 {
df7492f9 8340 if (!STRING_MULTIBYTE (start)
8f924df7 8341 && SCHARS (start) != SBYTES (start))
df7492f9
KH
8342 return Qnil;
8343 start_byte = 0;
8f924df7 8344 end_byte = SBYTES (start);
df7492f9 8345 pos = 0;
d46c5b12 8346 }
05e6f5dc 8347 else
b73bfc1c 8348 {
b7826503
PJ
8349 CHECK_NUMBER_COERCE_MARKER (start);
8350 CHECK_NUMBER_COERCE_MARKER (end);
05e6f5dc
KH
8351 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8352 args_out_of_range (start, end);
8353 if (NILP (current_buffer->enable_multibyte_characters))
df7492f9
KH
8354 return Qnil;
8355 start_byte = CHAR_TO_BYTE (XINT (start));
8356 end_byte = CHAR_TO_BYTE (XINT (end));
8357 if (XINT (end) - XINT (start) == end_byte - start_byte)
05e6f5dc 8358 return Qt;
df7492f9 8359
e1c23804 8360 if (XINT (start) < GPT && XINT (end) > GPT)
b73bfc1c 8361 {
e1c23804
DL
8362 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8363 move_gap_both (XINT (start), start_byte);
df7492f9 8364 else
e1c23804 8365 move_gap_both (XINT (end), end_byte);
b73bfc1c 8366 }
e1c23804 8367 pos = XINT (start);
b73bfc1c 8368 }
7553d0e1 8369
df7492f9
KH
8370 list = Qnil;
8371 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
12410ef1 8372 {
df7492f9 8373 elt = XCAR (tail);
7d64c6ad 8374 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
2170c8f0
KH
8375 ASET (attrs, coding_attr_trans_tbl,
8376 get_translation_table (attrs, 1, NULL));
7d64c6ad 8377 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
12410ef1
KH
8378 }
8379
df7492f9 8380 if (STRINGP (start))
8f924df7 8381 p = pbeg = SDATA (start);
72d1a715 8382 else
df7492f9
KH
8383 p = pbeg = BYTE_POS_ADDR (start_byte);
8384 pend = p + (end_byte - start_byte);
4ed46869 8385
df7492f9
KH
8386 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8387 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
ec6d2bb8 8388
df7492f9 8389 while (p < pend)
d46c5b12 8390 {
df7492f9
KH
8391 if (ASCII_BYTE_P (*p))
8392 p++;
e133c8fa 8393 else
05e6f5dc 8394 {
df7492f9
KH
8395 c = STRING_CHAR_ADVANCE (p);
8396
8397 charset_map_loaded = 0;
8398 for (tail = list; CONSP (tail); tail = XCDR (tail))
8399 {
8400 elt = XCDR (XCAR (tail));
8401 if (! char_encodable_p (c, XCAR (elt)))
8402 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8403 }
8404 if (charset_map_loaded)
8405 {
8406 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8407
8408 if (STRINGP (start))
8f924df7 8409 pbeg = SDATA (start);
df7492f9
KH
8410 else
8411 pbeg = BYTE_POS_ADDR (start_byte);
8412 p = pbeg + p_offset;
8413 pend = pbeg + pend_offset;
8414 }
05e6f5dc 8415 }
df7492f9 8416 pos++;
d46c5b12 8417 }
4ed46869 8418
df7492f9
KH
8419 tail = list;
8420 list = Qnil;
8421 for (; CONSP (tail); tail = XCDR (tail))
ec6d2bb8 8422 {
df7492f9
KH
8423 elt = XCAR (tail);
8424 if (CONSP (XCDR (XCDR (elt))))
8425 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8426 list);
ec6d2bb8 8427 }
2b4f9037 8428
df7492f9 8429 return list;
d46c5b12
KH
8430}
8431
3fd9494b 8432
b73bfc1c 8433Lisp_Object
df7492f9
KH
8434code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
8435 Lisp_Object start, end, coding_system, dst_object;
8436 int encodep, norecord;
4ed46869 8437{
3a73fa5d 8438 struct coding_system coding;
df7492f9
KH
8439 EMACS_INT from, from_byte, to, to_byte;
8440 Lisp_Object src_object;
4ed46869 8441
b7826503
PJ
8442 CHECK_NUMBER_COERCE_MARKER (start);
8443 CHECK_NUMBER_COERCE_MARKER (end);
df7492f9
KH
8444 if (NILP (coding_system))
8445 coding_system = Qno_conversion;
8446 else
8447 CHECK_CODING_SYSTEM (coding_system);
8448 src_object = Fcurrent_buffer ();
8449 if (NILP (dst_object))
8450 dst_object = src_object;
8451 else if (! EQ (dst_object, Qt))
8452 CHECK_BUFFER (dst_object);
3a73fa5d 8453
d46c5b12
KH
8454 validate_region (&start, &end);
8455 from = XFASTINT (start);
df7492f9 8456 from_byte = CHAR_TO_BYTE (from);
d46c5b12 8457 to = XFASTINT (end);
df7492f9 8458 to_byte = CHAR_TO_BYTE (to);
764ca8da 8459
df7492f9
KH
8460 setup_coding_system (coding_system, &coding);
8461 coding.mode |= CODING_MODE_LAST_BLOCK;
ec6d2bb8 8462
df7492f9
KH
8463 if (encodep)
8464 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8465 dst_object);
8466 else
8467 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8468 dst_object);
8469 if (! norecord)
8470 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
ec6d2bb8 8471
df7492f9
KH
8472 return (BUFFERP (dst_object)
8473 ? make_number (coding.produced_char)
8474 : coding.dst_object);
4031e2bf 8475}
78108bcd 8476
4ed46869 8477
4031e2bf 8478DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
df7492f9 8479 3, 4, "r\nzCoding system: ",
48b0f3ae 8480 doc: /* Decode the current region from the specified coding system.
df7492f9
KH
8481When called from a program, takes four arguments:
8482 START, END, CODING-SYSTEM, and DESTINATION.
8483START and END are buffer positions.
8844fa83 8484
df7492f9 8485Optional 4th arguments DESTINATION specifies where the decoded text goes.
2354b80b 8486If nil, the region between START and END is replaced by the decoded text.
1560f91a
EZ
8487If buffer, the decoded text is inserted in that buffer after point (point
8488does not move).
446dcd75 8489In those cases, the length of the decoded text is returned.
319a3947 8490If DESTINATION is t, the decoded text is returned.
8844fa83 8491
48b0f3ae
PJ
8492This function sets `last-coding-system-used' to the precise coding system
8493used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
319a3947 8494not fully specified.) */)
df7492f9
KH
8495 (start, end, coding_system, destination)
8496 Lisp_Object start, end, coding_system, destination;
4031e2bf 8497{
df7492f9 8498 return code_convert_region (start, end, coding_system, destination, 0, 0);
3a73fa5d 8499}
8844fa83 8500
3a73fa5d 8501DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
df7492f9
KH
8502 3, 4, "r\nzCoding system: ",
8503 doc: /* Encode the current region by specified coding system.
d4a1d553
JB
8504When called from a program, takes four arguments:
8505 START, END, CODING-SYSTEM and DESTINATION.
8506START and END are buffer positions.
d46c5b12 8507
df7492f9
KH
8508Optional 4th arguments DESTINATION specifies where the encoded text goes.
8509If nil, the region between START and END is replace by the encoded text.
1560f91a
EZ
8510If buffer, the encoded text is inserted in that buffer after point (point
8511does not move).
446dcd75 8512In those cases, the length of the encoded text is returned.
319a3947 8513If DESTINATION is t, the encoded text is returned.
2391eaa4 8514
48b0f3ae
PJ
8515This function sets `last-coding-system-used' to the precise coding system
8516used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
319a3947 8517not fully specified.) */)
df7492f9
KH
8518 (start, end, coding_system, destination)
8519 Lisp_Object start, end, coding_system, destination;
3a73fa5d 8520{
df7492f9 8521 return code_convert_region (start, end, coding_system, destination, 1, 0);
b73bfc1c
KH
8522}
8523
8524Lisp_Object
df7492f9
KH
8525code_convert_string (string, coding_system, dst_object,
8526 encodep, nocopy, norecord)
8527 Lisp_Object string, coding_system, dst_object;
8528 int encodep, nocopy, norecord;
b73bfc1c 8529{
4031e2bf 8530 struct coding_system coding;
df7492f9 8531 EMACS_INT chars, bytes;
ec6d2bb8 8532
b7826503 8533 CHECK_STRING (string);
d46c5b12 8534 if (NILP (coding_system))
4956c225 8535 {
df7492f9
KH
8536 if (! norecord)
8537 Vlast_coding_system_used = Qno_conversion;
8538 if (NILP (dst_object))
8539 return (nocopy ? Fcopy_sequence (string) : string);
4956c225 8540 }
b73bfc1c 8541
df7492f9
KH
8542 if (NILP (coding_system))
8543 coding_system = Qno_conversion;
8544 else
8545 CHECK_CODING_SYSTEM (coding_system);
8546 if (NILP (dst_object))
8547 dst_object = Qt;
8548 else if (! EQ (dst_object, Qt))
8549 CHECK_BUFFER (dst_object);
73be902c 8550
df7492f9 8551 setup_coding_system (coding_system, &coding);
d46c5b12 8552 coding.mode |= CODING_MODE_LAST_BLOCK;
8f924df7
KH
8553 chars = SCHARS (string);
8554 bytes = SBYTES (string);
df7492f9
KH
8555 if (encodep)
8556 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8557 else
8558 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8559 if (! norecord)
8560 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
73be902c 8561
df7492f9
KH
8562 return (BUFFERP (dst_object)
8563 ? make_number (coding.produced_char)
8564 : coding.dst_object);
4ed46869 8565}
73be902c 8566
b73bfc1c 8567
ecec61c1 8568/* Encode or decode STRING according to CODING_SYSTEM.
ec6d2bb8 8569 Do not set Vlast_coding_system_used.
4ed46869 8570
ec6d2bb8
KH
8571 This function is called only from macros DECODE_FILE and
8572 ENCODE_FILE, thus we ignore character composition. */
4ed46869 8573
ecec61c1
KH
8574Lisp_Object
8575code_convert_string_norecord (string, coding_system, encodep)
8576 Lisp_Object string, coding_system;
8577 int encodep;
4ed46869 8578{
0be8721c 8579 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
4ed46869
KH
8580}
8581
4ed46869 8582
df7492f9
KH
8583DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
8584 2, 4, 0,
8585 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
8586
8587Optional third arg NOCOPY non-nil means it is OK to return STRING itself
8588if the decoding operation is trivial.
ecec61c1 8589
d4a1d553 8590Optional fourth arg BUFFER non-nil means that the decoded text is
1560f91a
EZ
8591inserted in that buffer after point (point does not move). In this
8592case, the return value is the length of the decoded text.
ecec61c1 8593
df7492f9
KH
8594This function sets `last-coding-system-used' to the precise coding system
8595used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
d4a1d553 8596not fully specified.) */)
df7492f9
KH
8597 (string, coding_system, nocopy, buffer)
8598 Lisp_Object string, coding_system, nocopy, buffer;
4ed46869 8599{
df7492f9
KH
8600 return code_convert_string (string, coding_system, buffer,
8601 0, ! NILP (nocopy), 0);
4ed46869
KH
8602}
8603
df7492f9
KH
8604DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
8605 2, 4, 0,
8606 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
8607
8608Optional third arg NOCOPY non-nil means it is OK to return STRING
8609itself if the encoding operation is trivial.
8610
d4a1d553 8611Optional fourth arg BUFFER non-nil means that the encoded text is
1560f91a
EZ
8612inserted in that buffer after point (point does not move). In this
8613case, the return value is the length of the encoded text.
df7492f9
KH
8614
8615This function sets `last-coding-system-used' to the precise coding system
8616used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8617not fully specified.) */)
8618 (string, coding_system, nocopy, buffer)
8619 Lisp_Object string, coding_system, nocopy, buffer;
4ed46869 8620{
df7492f9 8621 return code_convert_string (string, coding_system, buffer,
c197f191 8622 1, ! NILP (nocopy), 1);
4ed46869 8623}
df7492f9 8624
3a73fa5d 8625\f
4ed46869 8626DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
48b0f3ae
PJ
8627 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
8628Return the corresponding character. */)
8629 (code)
4ed46869 8630 Lisp_Object code;
4ed46869 8631{
df7492f9
KH
8632 Lisp_Object spec, attrs, val;
8633 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
8634 int c;
4ed46869 8635
df7492f9
KH
8636 CHECK_NATNUM (code);
8637 c = XFASTINT (code);
8638 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8639 attrs = AREF (spec, 0);
4ed46869 8640
df7492f9
KH
8641 if (ASCII_BYTE_P (c)
8642 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8643 return code;
4ed46869 8644
df7492f9
KH
8645 val = CODING_ATTR_CHARSET_LIST (attrs);
8646 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
004068e4
KH
8647 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8648 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
fa42c37f 8649
df7492f9
KH
8650 if (c <= 0x7F)
8651 charset = charset_roman;
8652 else if (c >= 0xA0 && c < 0xDF)
55ab7be3 8653 {
df7492f9
KH
8654 charset = charset_kana;
8655 c -= 0x80;
4ed46869 8656 }
55ab7be3 8657 else
4ed46869 8658 {
004068e4 8659 int s1 = c >> 8, s2 = c & 0xFF;
df7492f9
KH
8660
8661 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
8662 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
8663 error ("Invalid code: %d", code);
8664 SJIS_TO_JIS (c);
8665 charset = charset_kanji;
4ed46869 8666 }
df7492f9
KH
8667 c = DECODE_CHAR (charset, c);
8668 if (c < 0)
8669 error ("Invalid code: %d", code);
8670 return make_number (c);
93dec019 8671}
4ed46869 8672
48b0f3ae 8673
4ed46869 8674DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
8acf0c0e 8675 doc: /* Encode a Japanese character CH to shift_jis encoding.
48b0f3ae
PJ
8676Return the corresponding code in SJIS. */)
8677 (ch)
df7492f9 8678 Lisp_Object ch;
4ed46869 8679{
df7492f9
KH
8680 Lisp_Object spec, attrs, charset_list;
8681 int c;
8682 struct charset *charset;
8683 unsigned code;
48b0f3ae 8684
df7492f9
KH
8685 CHECK_CHARACTER (ch);
8686 c = XFASTINT (ch);
8687 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8688 attrs = AREF (spec, 0);
8689
8690 if (ASCII_CHAR_P (c)
8691 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8692 return ch;
8693
8694 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8695 charset = char_charset (c, charset_list, &code);
8696 if (code == CHARSET_INVALID_CODE (charset))
8697 error ("Can't encode by shift_jis encoding: %d", c);
8698 JIS_TO_SJIS (code);
8699
8700 return make_number (code);
4ed46869
KH
8701}
8702
8703DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
48b0f3ae
PJ
8704 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
8705Return the corresponding character. */)
8706 (code)
4ed46869 8707 Lisp_Object code;
d46c5b12 8708{
df7492f9
KH
8709 Lisp_Object spec, attrs, val;
8710 struct charset *charset_roman, *charset_big5, *charset;
8711 int c;
6289dd10 8712
df7492f9
KH
8713 CHECK_NATNUM (code);
8714 c = XFASTINT (code);
8715 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8716 attrs = AREF (spec, 0);
4ed46869 8717
df7492f9
KH
8718 if (ASCII_BYTE_P (c)
8719 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8720 return code;
6289dd10 8721
df7492f9
KH
8722 val = CODING_ATTR_CHARSET_LIST (attrs);
8723 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8724 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
c210f766 8725
df7492f9
KH
8726 if (c <= 0x7F)
8727 charset = charset_roman;
c28a9453
KH
8728 else
8729 {
df7492f9
KH
8730 int b1 = c >> 8, b2 = c & 0x7F;
8731 if (b1 < 0xA1 || b1 > 0xFE
8732 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
8733 error ("Invalid code: %d", code);
8734 charset = charset_big5;
c28a9453 8735 }
df7492f9
KH
8736 c = DECODE_CHAR (charset, (unsigned )c);
8737 if (c < 0)
8738 error ("Invalid code: %d", code);
8739 return make_number (c);
d46c5b12 8740}
6289dd10 8741
4ed46869 8742DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
8acf0c0e 8743 doc: /* Encode the Big5 character CH to BIG5 coding system.
48b0f3ae
PJ
8744Return the corresponding character code in Big5. */)
8745 (ch)
4ed46869
KH
8746 Lisp_Object ch;
8747{
df7492f9
KH
8748 Lisp_Object spec, attrs, charset_list;
8749 struct charset *charset;
8750 int c;
8751 unsigned code;
8752
8753 CHECK_CHARACTER (ch);
8754 c = XFASTINT (ch);
8755 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8756 attrs = AREF (spec, 0);
8757 if (ASCII_CHAR_P (c)
8758 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8759 return ch;
8760
8761 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8762 charset = char_charset (c, charset_list, &code);
8763 if (code == CHARSET_INVALID_CODE (charset))
8764 error ("Can't encode by Big5 encoding: %d", c);
8765
8766 return make_number (code);
4ed46869 8767}
48b0f3ae 8768
3a73fa5d 8769\f
002fdb44 8770DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
68bba4e4 8771 Sset_terminal_coding_system_internal, 1, 2, 0,
48b0f3ae 8772 doc: /* Internal use only. */)
6ed8eeff 8773 (coding_system, terminal)
b74e4686 8774 Lisp_Object coding_system;
6ed8eeff 8775 Lisp_Object terminal;
4ed46869 8776{
6ed8eeff 8777 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
b7826503 8778 CHECK_SYMBOL (coding_system);
b8299c66 8779 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
70c22245 8780 /* We had better not send unsafe characters to terminal. */
c73bd236 8781 terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
df7492f9 8782 /* Characer composition should be disabled. */
c73bd236 8783 terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
b8299c66
KL
8784 terminal_coding->src_multibyte = 1;
8785 terminal_coding->dst_multibyte = 0;
4ed46869
KH
8786 return Qnil;
8787}
8788
c4825358
KH
8789DEFUN ("set-safe-terminal-coding-system-internal",
8790 Fset_safe_terminal_coding_system_internal,
48b0f3ae 8791 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
ddb67bdc 8792 doc: /* Internal use only. */)
48b0f3ae 8793 (coding_system)
b74e4686 8794 Lisp_Object coding_system;
d46c5b12 8795{
b7826503 8796 CHECK_SYMBOL (coding_system);
c4825358
KH
8797 setup_coding_system (Fcheck_coding_system (coding_system),
8798 &safe_terminal_coding);
df7492f9
KH
8799 /* Characer composition should be disabled. */
8800 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
b73bfc1c
KH
8801 safe_terminal_coding.src_multibyte = 1;
8802 safe_terminal_coding.dst_multibyte = 0;
c4825358
KH
8803 return Qnil;
8804}
4ed46869 8805
002fdb44 8806DEFUN ("terminal-coding-system", Fterminal_coding_system,
68bba4e4 8807 Sterminal_coding_system, 0, 1, 0,
6ed8eeff
KL
8808 doc: /* Return coding system specified for terminal output on the given terminal.
8809TERMINAL may be a terminal id, a frame, or nil for the selected
8810frame's terminal device. */)
8811 (terminal)
8812 Lisp_Object terminal;
4ed46869 8813{
985773c9
MB
8814 struct coding_system *terminal_coding
8815 = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
8816 Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
ae6f73fa 8817
ae6f73fa 8818 /* For backward compatibility, return nil if it is `undecided'. */
f75c90a9 8819 return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
4ed46869
KH
8820}
8821
002fdb44 8822DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
68bba4e4 8823 Sset_keyboard_coding_system_internal, 1, 2, 0,
48b0f3ae 8824 doc: /* Internal use only. */)
6ed8eeff 8825 (coding_system, terminal)
4ed46869 8826 Lisp_Object coding_system;
6ed8eeff 8827 Lisp_Object terminal;
4ed46869 8828{
6ed8eeff 8829 struct terminal *t = get_terminal (terminal, 1);
b7826503 8830 CHECK_SYMBOL (coding_system);
df7492f9 8831 setup_coding_system (Fcheck_coding_system (coding_system),
c73bd236 8832 TERMINAL_KEYBOARD_CODING (t));
df7492f9 8833 /* Characer composition should be disabled. */
c73bd236
MB
8834 TERMINAL_KEYBOARD_CODING (t)->common_flags
8835 &= ~CODING_ANNOTATE_COMPOSITION_MASK;
4ed46869
KH
8836 return Qnil;
8837}
8838
8839DEFUN ("keyboard-coding-system",
985773c9 8840 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
48b0f3ae 8841 doc: /* Return coding system specified for decoding keyboard input. */)
985773c9
MB
8842 (terminal)
8843 Lisp_Object terminal;
4ed46869 8844{
985773c9
MB
8845 return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
8846 (get_terminal (terminal, 1))->id);
4ed46869
KH
8847}
8848
4ed46869 8849\f
a5d301df
KH
8850DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
8851 Sfind_operation_coding_system, 1, MANY, 0,
48b0f3ae
PJ
8852 doc: /* Choose a coding system for an operation based on the target name.
8853The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
8854DECODING-SYSTEM is the coding system to use for decoding
8855\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
8856for encoding (in case OPERATION does encoding).
05e6f5dc 8857
48b0f3ae
PJ
8858The first argument OPERATION specifies an I/O primitive:
8859 For file I/O, `insert-file-contents' or `write-region'.
8860 For process I/O, `call-process', `call-process-region', or `start-process'.
8861 For network I/O, `open-network-stream'.
05e6f5dc 8862
48b0f3ae
PJ
8863The remaining arguments should be the same arguments that were passed
8864to the primitive. Depending on which primitive, one of those arguments
8865is selected as the TARGET. For example, if OPERATION does file I/O,
8866whichever argument specifies the file name is TARGET.
05e6f5dc 8867
48b0f3ae 8868TARGET has a meaning which depends on OPERATION:
b883cdb2 8869 For file I/O, TARGET is a file name (except for the special case below).
48b0f3ae 8870 For process I/O, TARGET is a process name.
d4a1d553 8871 For network I/O, TARGET is a service name or a port number.
05e6f5dc 8872
d4a1d553 8873This function looks up what is specified for TARGET in
48b0f3ae
PJ
8874`file-coding-system-alist', `process-coding-system-alist',
8875or `network-coding-system-alist' depending on OPERATION.
8876They may specify a coding system, a cons of coding systems,
8877or a function symbol to call.
8878In the last case, we call the function with one argument,
8879which is a list of all the arguments given to this function.
1011c487
MB
8880If the function can't decide a coding system, it can return
8881`undecided' so that the normal code-detection is performed.
48b0f3ae 8882
b883cdb2
MB
8883If OPERATION is `insert-file-contents', the argument corresponding to
8884TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
8885file name to look up, and BUFFER is a buffer that contains the file's
8886contents (not yet decoded). If `file-coding-system-alist' specifies a
8887function to call for FILENAME, that function should examine the
8888contents of BUFFER instead of reading the file.
8889
d918f936 8890usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
48b0f3ae 8891 (nargs, args)
4ed46869
KH
8892 int nargs;
8893 Lisp_Object *args;
6b89e3aa 8894{
4ed46869
KH
8895 Lisp_Object operation, target_idx, target, val;
8896 register Lisp_Object chain;
177c0ea7 8897
4ed46869
KH
8898 if (nargs < 2)
8899 error ("Too few arguments");
8900 operation = args[0];
8901 if (!SYMBOLP (operation)
8902 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3ed051d4 8903 error ("Invalid first argument");
4ed46869
KH
8904 if (nargs < 1 + XINT (target_idx))
8905 error ("Too few arguments for operation: %s",
8f924df7 8906 SDATA (SYMBOL_NAME (operation)));
4ed46869
KH
8907 target = args[XINT (target_idx) + 1];
8908 if (!(STRINGP (target)
091a0ff0
KH
8909 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
8910 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
4ed46869 8911 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
df7492f9 8912 error ("Invalid %dth argument", XINT (target_idx) + 1);
091a0ff0
KH
8913 if (CONSP (target))
8914 target = XCAR (target);
4ed46869 8915
2e34157c
RS
8916 chain = ((EQ (operation, Qinsert_file_contents)
8917 || EQ (operation, Qwrite_region))
02ba4723 8918 ? Vfile_coding_system_alist
2e34157c 8919 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
8920 ? Vnetwork_coding_system_alist
8921 : Vprocess_coding_system_alist));
4ed46869
KH
8922 if (NILP (chain))
8923 return Qnil;
8924
03699b14 8925 for (; CONSP (chain); chain = XCDR (chain))
6b89e3aa 8926 {
f44d27ce 8927 Lisp_Object elt;
6b89e3aa 8928
df7492f9 8929 elt = XCAR (chain);
4ed46869
KH
8930 if (CONSP (elt)
8931 && ((STRINGP (target)
03699b14
KR
8932 && STRINGP (XCAR (elt))
8933 && fast_string_match (XCAR (elt), target) >= 0)
8934 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
6b89e3aa 8935 {
03699b14 8936 val = XCDR (elt);
b19fd4c5
KH
8937 /* Here, if VAL is both a valid coding system and a valid
8938 function symbol, we return VAL as a coding system. */
02ba4723
KH
8939 if (CONSP (val))
8940 return val;
8941 if (! SYMBOLP (val))
8942 return Qnil;
8943 if (! NILP (Fcoding_system_p (val)))
8944 return Fcons (val, val);
b19fd4c5 8945 if (! NILP (Ffboundp (val)))
6b89e3aa 8946 {
e2b97060
MB
8947 /* We use call1 rather than safe_call1
8948 so as to get bug reports about functions called here
8949 which don't handle the current interface. */
8950 val = call1 (val, Flist (nargs, args));
b19fd4c5
KH
8951 if (CONSP (val))
8952 return val;
8953 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
8954 return Fcons (val, val);
6b89e3aa 8955 }
02ba4723 8956 return Qnil;
6b89e3aa
KH
8957 }
8958 }
4ed46869 8959 return Qnil;
6b89e3aa
KH
8960}
8961
df7492f9 8962DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
a3f6ee6d 8963 Sset_coding_system_priority, 0, MANY, 0,
da7db224 8964 doc: /* Assign higher priority to the coding systems given as arguments.
d4a1d553 8965If multiple coding systems belong to the same category,
a3181084
DL
8966all but the first one are ignored.
8967
d4a1d553 8968usage: (set-coding-system-priority &rest coding-systems) */)
df7492f9
KH
8969 (nargs, args)
8970 int nargs;
8971 Lisp_Object *args;
8972{
8973 int i, j;
8974 int changed[coding_category_max];
8975 enum coding_category priorities[coding_category_max];
8976
8977 bzero (changed, sizeof changed);
6b89e3aa 8978
df7492f9 8979 for (i = j = 0; i < nargs; i++)
6b89e3aa 8980 {
df7492f9
KH
8981 enum coding_category category;
8982 Lisp_Object spec, attrs;
6b89e3aa 8983
df7492f9
KH
8984 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
8985 attrs = AREF (spec, 0);
8986 category = XINT (CODING_ATTR_CATEGORY (attrs));
8987 if (changed[category])
8988 /* Ignore this coding system because a coding system of the
8989 same category already had a higher priority. */
8990 continue;
8991 changed[category] = 1;
8992 priorities[j++] = category;
8993 if (coding_categories[category].id >= 0
8994 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
8995 setup_coding_system (args[i], &coding_categories[category]);
ff563fce 8996 Fset (AREF (Vcoding_category_table, category), args[i]);
df7492f9 8997 }
6b89e3aa 8998
df7492f9
KH
8999 /* Now we have decided top J priorities. Reflect the order of the
9000 original priorities to the remaining priorities. */
6b89e3aa 9001
df7492f9 9002 for (i = j, j = 0; i < coding_category_max; i++, j++)
6b89e3aa 9003 {
df7492f9
KH
9004 while (j < coding_category_max
9005 && changed[coding_priorities[j]])
9006 j++;
9007 if (j == coding_category_max)
9008 abort ();
9009 priorities[i] = coding_priorities[j];
9010 }
6b89e3aa 9011
df7492f9 9012 bcopy (priorities, coding_priorities, sizeof priorities);
177c0ea7 9013
ff563fce
KH
9014 /* Update `coding-category-list'. */
9015 Vcoding_category_list = Qnil;
9016 for (i = coding_category_max - 1; i >= 0; i--)
9017 Vcoding_category_list
9018 = Fcons (AREF (Vcoding_category_table, priorities[i]),
9019 Vcoding_category_list);
6b89e3aa 9020
df7492f9 9021 return Qnil;
6b89e3aa
KH
9022}
9023
df7492f9
KH
9024DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9025 Scoding_system_priority_list, 0, 1, 0,
da7db224
DL
9026 doc: /* Return a list of coding systems ordered by their priorities.
9027HIGHESTP non-nil means just return the highest priority one. */)
df7492f9
KH
9028 (highestp)
9029 Lisp_Object highestp;
d46c5b12
KH
9030{
9031 int i;
df7492f9 9032 Lisp_Object val;
6b89e3aa 9033
df7492f9 9034 for (i = 0, val = Qnil; i < coding_category_max; i++)
d46c5b12 9035 {
df7492f9
KH
9036 enum coding_category category = coding_priorities[i];
9037 int id = coding_categories[category].id;
9038 Lisp_Object attrs;
068a9dbd 9039
df7492f9
KH
9040 if (id < 0)
9041 continue;
9042 attrs = CODING_ID_ATTRS (id);
9043 if (! NILP (highestp))
9044 return CODING_ATTR_BASE_NAME (attrs);
9045 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9046 }
9047 return Fnreverse (val);
9048}
068a9dbd 9049
f0064e1f 9050static char *suffixes[] = { "-unix", "-dos", "-mac" };
068a9dbd
KH
9051
9052static Lisp_Object
df7492f9
KH
9053make_subsidiaries (base)
9054 Lisp_Object base;
068a9dbd 9055{
df7492f9 9056 Lisp_Object subsidiaries;
8f924df7 9057 int base_name_len = SBYTES (SYMBOL_NAME (base));
df7492f9
KH
9058 char *buf = (char *) alloca (base_name_len + 6);
9059 int i;
068a9dbd 9060
8f924df7 9061 bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
df7492f9
KH
9062 subsidiaries = Fmake_vector (make_number (3), Qnil);
9063 for (i = 0; i < 3; i++)
068a9dbd 9064 {
df7492f9
KH
9065 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
9066 ASET (subsidiaries, i, intern (buf));
068a9dbd 9067 }
df7492f9 9068 return subsidiaries;
068a9dbd
KH
9069}
9070
9071
df7492f9
KH
9072DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9073 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
1fcd6c8b
DL
9074 doc: /* For internal use only.
9075usage: (define-coding-system-internal ...) */)
df7492f9
KH
9076 (nargs, args)
9077 int nargs;
9078 Lisp_Object *args;
068a9dbd 9079{
df7492f9
KH
9080 Lisp_Object name;
9081 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
9082 Lisp_Object attrs; /* Vector of attributes. */
9083 Lisp_Object eol_type;
9084 Lisp_Object aliases;
9085 Lisp_Object coding_type, charset_list, safe_charsets;
9086 enum coding_category category;
9087 Lisp_Object tail, val;
9088 int max_charset_id = 0;
9089 int i;
068a9dbd 9090
df7492f9
KH
9091 if (nargs < coding_arg_max)
9092 goto short_args;
068a9dbd 9093
df7492f9 9094 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
068a9dbd 9095
df7492f9
KH
9096 name = args[coding_arg_name];
9097 CHECK_SYMBOL (name);
9098 CODING_ATTR_BASE_NAME (attrs) = name;
068a9dbd 9099
df7492f9
KH
9100 val = args[coding_arg_mnemonic];
9101 if (! STRINGP (val))
9102 CHECK_CHARACTER (val);
9103 CODING_ATTR_MNEMONIC (attrs) = val;
068a9dbd 9104
df7492f9
KH
9105 coding_type = args[coding_arg_coding_type];
9106 CHECK_SYMBOL (coding_type);
9107 CODING_ATTR_TYPE (attrs) = coding_type;
068a9dbd 9108
df7492f9
KH
9109 charset_list = args[coding_arg_charset_list];
9110 if (SYMBOLP (charset_list))
9111 {
9112 if (EQ (charset_list, Qiso_2022))
9113 {
9114 if (! EQ (coding_type, Qiso_2022))
9115 error ("Invalid charset-list");
9116 charset_list = Viso_2022_charset_list;
9117 }
9118 else if (EQ (charset_list, Qemacs_mule))
9119 {
9120 if (! EQ (coding_type, Qemacs_mule))
9121 error ("Invalid charset-list");
9122 charset_list = Vemacs_mule_charset_list;
9123 }
9124 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9125 if (max_charset_id < XFASTINT (XCAR (tail)))
9126 max_charset_id = XFASTINT (XCAR (tail));
9127 }
068a9dbd
KH
9128 else
9129 {
df7492f9 9130 charset_list = Fcopy_sequence (charset_list);
985773c9 9131 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
068a9dbd 9132 {
df7492f9
KH
9133 struct charset *charset;
9134
985773c9 9135 val = XCAR (tail);
df7492f9
KH
9136 CHECK_CHARSET_GET_CHARSET (val, charset);
9137 if (EQ (coding_type, Qiso_2022)
9138 ? CHARSET_ISO_FINAL (charset) < 0
9139 : EQ (coding_type, Qemacs_mule)
9140 ? CHARSET_EMACS_MULE_ID (charset) < 0
9141 : 0)
9142 error ("Can't handle charset `%s'",
8f924df7 9143 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
df7492f9 9144
8f924df7 9145 XSETCAR (tail, make_number (charset->id));
df7492f9
KH
9146 if (max_charset_id < charset->id)
9147 max_charset_id = charset->id;
068a9dbd
KH
9148 }
9149 }
df7492f9 9150 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
068a9dbd 9151
df7492f9
KH
9152 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
9153 make_number (255));
9154 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8f924df7 9155 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
df7492f9 9156 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
068a9dbd 9157
584948ac 9158 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
3a73fa5d 9159
df7492f9 9160 val = args[coding_arg_decode_translation_table];
a6f87d34 9161 if (! CHAR_TABLE_P (val) && ! CONSP (val))
7d64c6ad 9162 CHECK_SYMBOL (val);
df7492f9 9163 CODING_ATTR_DECODE_TBL (attrs) = val;
3a73fa5d 9164
df7492f9 9165 val = args[coding_arg_encode_translation_table];
a6f87d34 9166 if (! CHAR_TABLE_P (val) && ! CONSP (val))
7d64c6ad 9167 CHECK_SYMBOL (val);
df7492f9 9168 CODING_ATTR_ENCODE_TBL (attrs) = val;
d46c5b12 9169
df7492f9
KH
9170 val = args[coding_arg_post_read_conversion];
9171 CHECK_SYMBOL (val);
9172 CODING_ATTR_POST_READ (attrs) = val;
d46c5b12 9173
df7492f9
KH
9174 val = args[coding_arg_pre_write_conversion];
9175 CHECK_SYMBOL (val);
9176 CODING_ATTR_PRE_WRITE (attrs) = val;
3a73fa5d 9177
df7492f9
KH
9178 val = args[coding_arg_default_char];
9179 if (NILP (val))
9180 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
9181 else
9182 {
8f924df7 9183 CHECK_CHARACTER (val);
df7492f9
KH
9184 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9185 }
4031e2bf 9186
8f924df7
KH
9187 val = args[coding_arg_for_unibyte];
9188 CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
3a73fa5d 9189
df7492f9
KH
9190 val = args[coding_arg_plist];
9191 CHECK_LIST (val);
9192 CODING_ATTR_PLIST (attrs) = val;
3a73fa5d 9193
df7492f9
KH
9194 if (EQ (coding_type, Qcharset))
9195 {
c7c66a95
KH
9196 /* Generate a lisp vector of 256 elements. Each element is nil,
9197 integer, or a list of charset IDs.
3a73fa5d 9198
c7c66a95
KH
9199 If Nth element is nil, the byte code N is invalid in this
9200 coding system.
4ed46869 9201
c7c66a95
KH
9202 If Nth element is a number NUM, N is the first byte of a
9203 charset whose ID is NUM.
4ed46869 9204
c7c66a95
KH
9205 If Nth element is a list of charset IDs, N is the first byte
9206 of one of them. The list is sorted by dimensions of the
2bc515e4 9207 charsets. A charset of smaller dimension comes firtst. */
df7492f9 9208 val = Fmake_vector (make_number (256), Qnil);
4ed46869 9209
5c99c2e6 9210 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
df7492f9 9211 {
c7c66a95
KH
9212 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9213 int dim = CHARSET_DIMENSION (charset);
9214 int idx = (dim - 1) * 4;
4ed46869 9215
5c99c2e6 9216 if (CHARSET_ASCII_COMPATIBLE_P (charset))
584948ac 9217 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
4031e2bf 9218
15d143f7
KH
9219 for (i = charset->code_space[idx];
9220 i <= charset->code_space[idx + 1]; i++)
9221 {
c7c66a95
KH
9222 Lisp_Object tmp, tmp2;
9223 int dim2;
ec6d2bb8 9224
c7c66a95
KH
9225 tmp = AREF (val, i);
9226 if (NILP (tmp))
9227 tmp = XCAR (tail);
9228 else if (NUMBERP (tmp))
9229 {
9230 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9231 if (dim < dim2)
c7c66a95 9232 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
f9d71dcd
KH
9233 else
9234 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
c7c66a95 9235 }
15d143f7 9236 else
c7c66a95
KH
9237 {
9238 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9239 {
9240 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9241 if (dim < dim2)
9242 break;
9243 }
9244 if (NILP (tmp2))
9245 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9246 else
9247 {
9248 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9249 XSETCAR (tmp2, XCAR (tail));
9250 }
9251 }
9252 ASET (val, i, tmp);
15d143f7 9253 }
df7492f9
KH
9254 }
9255 ASET (attrs, coding_attr_charset_valids, val);
9256 category = coding_category_charset;
9257 }
9258 else if (EQ (coding_type, Qccl))
9259 {
9260 Lisp_Object valids;
ecec61c1 9261
df7492f9
KH
9262 if (nargs < coding_arg_ccl_max)
9263 goto short_args;
ecec61c1 9264
df7492f9
KH
9265 val = args[coding_arg_ccl_decoder];
9266 CHECK_CCL_PROGRAM (val);
9267 if (VECTORP (val))
9268 val = Fcopy_sequence (val);
9269 ASET (attrs, coding_attr_ccl_decoder, val);
ecec61c1 9270
df7492f9
KH
9271 val = args[coding_arg_ccl_encoder];
9272 CHECK_CCL_PROGRAM (val);
9273 if (VECTORP (val))
9274 val = Fcopy_sequence (val);
9275 ASET (attrs, coding_attr_ccl_encoder, val);
ecec61c1 9276
df7492f9
KH
9277 val = args[coding_arg_ccl_valids];
9278 valids = Fmake_string (make_number (256), make_number (0));
9279 for (tail = val; !NILP (tail); tail = Fcdr (tail))
9280 {
8dcbea82 9281 int from, to;
ecec61c1 9282
df7492f9
KH
9283 val = Fcar (tail);
9284 if (INTEGERP (val))
8dcbea82
KH
9285 {
9286 from = to = XINT (val);
9287 if (from < 0 || from > 255)
9288 args_out_of_range_3 (val, make_number (0), make_number (255));
9289 }
df7492f9
KH
9290 else
9291 {
df7492f9 9292 CHECK_CONS (val);
8f924df7
KH
9293 CHECK_NATNUM_CAR (val);
9294 CHECK_NATNUM_CDR (val);
df7492f9 9295 from = XINT (XCAR (val));
8f924df7 9296 if (from > 255)
8dcbea82
KH
9297 args_out_of_range_3 (XCAR (val),
9298 make_number (0), make_number (255));
df7492f9 9299 to = XINT (XCDR (val));
8dcbea82
KH
9300 if (to < from || to > 255)
9301 args_out_of_range_3 (XCDR (val),
9302 XCAR (val), make_number (255));
df7492f9 9303 }
8dcbea82 9304 for (i = from; i <= to; i++)
8f924df7 9305 SSET (valids, i, 1);
df7492f9
KH
9306 }
9307 ASET (attrs, coding_attr_ccl_valids, valids);
4ed46869 9308
df7492f9 9309 category = coding_category_ccl;
55ab7be3 9310 }
df7492f9 9311 else if (EQ (coding_type, Qutf_16))
55ab7be3 9312 {
df7492f9 9313 Lisp_Object bom, endian;
4ed46869 9314
584948ac 9315 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
4ed46869 9316
df7492f9
KH
9317 if (nargs < coding_arg_utf16_max)
9318 goto short_args;
4ed46869 9319
df7492f9
KH
9320 bom = args[coding_arg_utf16_bom];
9321 if (! NILP (bom) && ! EQ (bom, Qt))
9322 {
9323 CHECK_CONS (bom);
8f924df7
KH
9324 val = XCAR (bom);
9325 CHECK_CODING_SYSTEM (val);
9326 val = XCDR (bom);
9327 CHECK_CODING_SYSTEM (val);
df7492f9 9328 }
a470d443 9329 ASET (attrs, coding_attr_utf_bom, bom);
df7492f9
KH
9330
9331 endian = args[coding_arg_utf16_endian];
b49a1807
KH
9332 CHECK_SYMBOL (endian);
9333 if (NILP (endian))
9334 endian = Qbig;
9335 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8f924df7 9336 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
df7492f9
KH
9337 ASET (attrs, coding_attr_utf_16_endian, endian);
9338
9339 category = (CONSP (bom)
9340 ? coding_category_utf_16_auto
9341 : NILP (bom)
b49a1807 9342 ? (EQ (endian, Qbig)
df7492f9
KH
9343 ? coding_category_utf_16_be_nosig
9344 : coding_category_utf_16_le_nosig)
b49a1807 9345 : (EQ (endian, Qbig)
df7492f9
KH
9346 ? coding_category_utf_16_be
9347 : coding_category_utf_16_le));
9348 }
9349 else if (EQ (coding_type, Qiso_2022))
9350 {
9351 Lisp_Object initial, reg_usage, request, flags;
4776e638 9352 int i;
1397dc18 9353
df7492f9
KH
9354 if (nargs < coding_arg_iso2022_max)
9355 goto short_args;
9356
9357 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9358 CHECK_VECTOR (initial);
9359 for (i = 0; i < 4; i++)
9360 {
9361 val = Faref (initial, make_number (i));
9362 if (! NILP (val))
9363 {
584948ac
KH
9364 struct charset *charset;
9365
9366 CHECK_CHARSET_GET_CHARSET (val, charset);
9367 ASET (initial, i, make_number (CHARSET_ID (charset)));
9368 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9369 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9
KH
9370 }
9371 else
9372 ASET (initial, i, make_number (-1));
9373 }
9374
9375 reg_usage = args[coding_arg_iso2022_reg_usage];
9376 CHECK_CONS (reg_usage);
8f924df7
KH
9377 CHECK_NUMBER_CAR (reg_usage);
9378 CHECK_NUMBER_CDR (reg_usage);
df7492f9
KH
9379
9380 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9381 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
1397dc18 9382 {
df7492f9 9383 int id;
8f924df7 9384 Lisp_Object tmp;
df7492f9
KH
9385
9386 val = Fcar (tail);
9387 CHECK_CONS (val);
8f924df7
KH
9388 tmp = XCAR (val);
9389 CHECK_CHARSET_GET_ID (tmp, id);
9390 CHECK_NATNUM_CDR (val);
df7492f9
KH
9391 if (XINT (XCDR (val)) >= 4)
9392 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8f924df7 9393 XSETCAR (val, make_number (id));
1397dc18 9394 }
4ed46869 9395
df7492f9
KH
9396 flags = args[coding_arg_iso2022_flags];
9397 CHECK_NATNUM (flags);
9398 i = XINT (flags);
9399 if (EQ (args[coding_arg_charset_list], Qiso_2022))
9400 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
9401
9402 ASET (attrs, coding_attr_iso_initial, initial);
9403 ASET (attrs, coding_attr_iso_usage, reg_usage);
9404 ASET (attrs, coding_attr_iso_request, request);
9405 ASET (attrs, coding_attr_iso_flags, flags);
9406 setup_iso_safe_charsets (attrs);
9407
9408 if (i & CODING_ISO_FLAG_SEVEN_BITS)
9409 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9410 | CODING_ISO_FLAG_SINGLE_SHIFT))
9411 ? coding_category_iso_7_else
9412 : EQ (args[coding_arg_charset_list], Qiso_2022)
9413 ? coding_category_iso_7
9414 : coding_category_iso_7_tight);
9415 else
9416 {
9417 int id = XINT (AREF (initial, 1));
9418
c6fb6e98 9419 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
df7492f9
KH
9420 || EQ (args[coding_arg_charset_list], Qiso_2022)
9421 || id < 0)
9422 ? coding_category_iso_8_else
9423 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9424 ? coding_category_iso_8_1
9425 : coding_category_iso_8_2);
9426 }
0ce7886f
KH
9427 if (category != coding_category_iso_8_1
9428 && category != coding_category_iso_8_2)
9429 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
df7492f9
KH
9430 }
9431 else if (EQ (coding_type, Qemacs_mule))
c28a9453 9432 {
df7492f9
KH
9433 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9434 ASET (attrs, coding_attr_emacs_mule_full, Qt);
584948ac 9435 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9 9436 category = coding_category_emacs_mule;
c28a9453 9437 }
df7492f9 9438 else if (EQ (coding_type, Qshift_jis))
c28a9453 9439 {
df7492f9
KH
9440
9441 struct charset *charset;
9442
7d64c6ad 9443 if (XINT (Flength (charset_list)) != 3
6e07c25f 9444 && XINT (Flength (charset_list)) != 4)
7d64c6ad 9445 error ("There should be three or four charsets");
df7492f9
KH
9446
9447 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9448 if (CHARSET_DIMENSION (charset) != 1)
9449 error ("Dimension of charset %s is not one",
8f924df7 9450 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
584948ac
KH
9451 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9452 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9
KH
9453
9454 charset_list = XCDR (charset_list);
9455 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9456 if (CHARSET_DIMENSION (charset) != 1)
9457 error ("Dimension of charset %s is not one",
8f924df7 9458 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
df7492f9
KH
9459
9460 charset_list = XCDR (charset_list);
9461 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9462 if (CHARSET_DIMENSION (charset) != 2)
7d64c6ad
KH
9463 error ("Dimension of charset %s is not two",
9464 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9465
9466 charset_list = XCDR (charset_list);
2b917a06
KH
9467 if (! NILP (charset_list))
9468 {
9469 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9470 if (CHARSET_DIMENSION (charset) != 2)
9471 error ("Dimension of charset %s is not two",
9472 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9473 }
df7492f9
KH
9474
9475 category = coding_category_sjis;
9476 Vsjis_coding_system = name;
c28a9453 9477 }
df7492f9
KH
9478 else if (EQ (coding_type, Qbig5))
9479 {
9480 struct charset *charset;
4ed46869 9481
df7492f9
KH
9482 if (XINT (Flength (charset_list)) != 2)
9483 error ("There should be just two charsets");
9484
9485 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9486 if (CHARSET_DIMENSION (charset) != 1)
9487 error ("Dimension of charset %s is not one",
8f924df7 9488 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
584948ac
KH
9489 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9490 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9
KH
9491
9492 charset_list = XCDR (charset_list);
9493 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9494 if (CHARSET_DIMENSION (charset) != 2)
9495 error ("Dimension of charset %s is not two",
8f924df7 9496 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
4ed46869 9497
df7492f9
KH
9498 category = coding_category_big5;
9499 Vbig5_coding_system = name;
9500 }
9501 else if (EQ (coding_type, Qraw_text))
c28a9453 9502 {
584948ac
KH
9503 category = coding_category_raw_text;
9504 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
c28a9453 9505 }
df7492f9 9506 else if (EQ (coding_type, Qutf_8))
4ed46869 9507 {
a470d443
KH
9508 Lisp_Object bom;
9509
584948ac 9510 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
a470d443
KH
9511
9512 if (nargs < coding_arg_utf8_max)
9513 goto short_args;
9514
9515 bom = args[coding_arg_utf8_bom];
9516 if (! NILP (bom) && ! EQ (bom, Qt))
9517 {
9518 CHECK_CONS (bom);
9519 val = XCAR (bom);
9520 CHECK_CODING_SYSTEM (val);
9521 val = XCDR (bom);
9522 CHECK_CODING_SYSTEM (val);
9523 }
9524 ASET (attrs, coding_attr_utf_bom, bom);
9525
9526 category = (CONSP (bom) ? coding_category_utf_8_auto
9527 : NILP (bom) ? coding_category_utf_8_nosig
9528 : coding_category_utf_8_sig);
4ed46869 9529 }
df7492f9
KH
9530 else if (EQ (coding_type, Qundecided))
9531 category = coding_category_undecided;
4ed46869 9532 else
df7492f9 9533 error ("Invalid coding system type: %s",
8f924df7 9534 SDATA (SYMBOL_NAME (coding_type)));
4ed46869 9535
df7492f9 9536 CODING_ATTR_CATEGORY (attrs) = make_number (category);
01378f49
KH
9537 CODING_ATTR_PLIST (attrs)
9538 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
9539 CODING_ATTR_PLIST (attrs)));
35befdaa 9540 CODING_ATTR_PLIST (attrs)
3ed051d4 9541 = Fcons (QCascii_compatible_p,
35befdaa
KH
9542 Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9543 CODING_ATTR_PLIST (attrs)));
c4825358 9544
df7492f9
KH
9545 eol_type = args[coding_arg_eol_type];
9546 if (! NILP (eol_type)
9547 && ! EQ (eol_type, Qunix)
9548 && ! EQ (eol_type, Qdos)
9549 && ! EQ (eol_type, Qmac))
9550 error ("Invalid eol-type");
4ed46869 9551
df7492f9 9552 aliases = Fcons (name, Qnil);
4ed46869 9553
df7492f9
KH
9554 if (NILP (eol_type))
9555 {
9556 eol_type = make_subsidiaries (name);
9557 for (i = 0; i < 3; i++)
1397dc18 9558 {
df7492f9
KH
9559 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
9560
9561 this_name = AREF (eol_type, i);
9562 this_aliases = Fcons (this_name, Qnil);
9563 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
9564 this_spec = Fmake_vector (make_number (3), attrs);
9565 ASET (this_spec, 1, this_aliases);
9566 ASET (this_spec, 2, this_eol_type);
9567 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
9568 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
583f71ca
KH
9569 val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
9570 if (NILP (val))
9571 Vcoding_system_alist
9572 = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
9573 Vcoding_system_alist);
1397dc18 9574 }
d46c5b12 9575 }
4ed46869 9576
df7492f9
KH
9577 spec_vec = Fmake_vector (make_number (3), attrs);
9578 ASET (spec_vec, 1, aliases);
9579 ASET (spec_vec, 2, eol_type);
48b0f3ae 9580
df7492f9
KH
9581 Fputhash (name, spec_vec, Vcoding_system_hash_table);
9582 Vcoding_system_list = Fcons (name, Vcoding_system_list);
583f71ca
KH
9583 val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
9584 if (NILP (val))
9585 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
9586 Vcoding_system_alist);
48b0f3ae 9587
df7492f9
KH
9588 {
9589 int id = coding_categories[category].id;
48b0f3ae 9590
df7492f9
KH
9591 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
9592 setup_coding_system (name, &coding_categories[category]);
9593 }
48b0f3ae 9594
d46c5b12 9595 return Qnil;
48b0f3ae 9596
df7492f9
KH
9597 short_args:
9598 return Fsignal (Qwrong_number_of_arguments,
9599 Fcons (intern ("define-coding-system-internal"),
9600 make_number (nargs)));
d46c5b12 9601}
4ed46869 9602
d6925f38 9603
a6f87d34
KH
9604DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
9605 3, 3, 0,
9606 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
9607 (coding_system, prop, val)
9608 Lisp_Object coding_system, prop, val;
9609{
3dbe7859 9610 Lisp_Object spec, attrs;
a6f87d34
KH
9611
9612 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9613 attrs = AREF (spec, 0);
9614 if (EQ (prop, QCmnemonic))
9615 {
9616 if (! STRINGP (val))
9617 CHECK_CHARACTER (val);
9618 CODING_ATTR_MNEMONIC (attrs) = val;
9619 }
9620 else if (EQ (prop, QCdefalut_char))
9621 {
9622 if (NILP (val))
9623 val = make_number (' ');
9624 else
9625 CHECK_CHARACTER (val);
9626 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9627 }
9628 else if (EQ (prop, QCdecode_translation_table))
9629 {
9630 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9631 CHECK_SYMBOL (val);
9632 CODING_ATTR_DECODE_TBL (attrs) = val;
9633 }
9634 else if (EQ (prop, QCencode_translation_table))
9635 {
9636 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9637 CHECK_SYMBOL (val);
9638 CODING_ATTR_ENCODE_TBL (attrs) = val;
9639 }
9640 else if (EQ (prop, QCpost_read_conversion))
9641 {
9642 CHECK_SYMBOL (val);
9643 CODING_ATTR_POST_READ (attrs) = val;
9644 }
9645 else if (EQ (prop, QCpre_write_conversion))
9646 {
9647 CHECK_SYMBOL (val);
9648 CODING_ATTR_PRE_WRITE (attrs) = val;
9649 }
35befdaa
KH
9650 else if (EQ (prop, QCascii_compatible_p))
9651 {
9652 CODING_ATTR_ASCII_COMPAT (attrs) = val;
9653 }
a6f87d34
KH
9654
9655 CODING_ATTR_PLIST (attrs)
9656 = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
9657 return val;
9658}
9659
9660
df7492f9
KH
9661DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
9662 Sdefine_coding_system_alias, 2, 2, 0,
9663 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
9664 (alias, coding_system)
9665 Lisp_Object alias, coding_system;
66cfb530 9666{
583f71ca 9667 Lisp_Object spec, aliases, eol_type, val;
4ed46869 9668
df7492f9
KH
9669 CHECK_SYMBOL (alias);
9670 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9671 aliases = AREF (spec, 1);
d4a1d553 9672 /* ALIASES should be a list of length more than zero, and the first
d6925f38
KH
9673 element is a base coding system. Append ALIAS at the tail of the
9674 list. */
df7492f9
KH
9675 while (!NILP (XCDR (aliases)))
9676 aliases = XCDR (aliases);
8f924df7 9677 XSETCDR (aliases, Fcons (alias, Qnil));
4ed46869 9678
df7492f9
KH
9679 eol_type = AREF (spec, 2);
9680 if (VECTORP (eol_type))
4ed46869 9681 {
df7492f9
KH
9682 Lisp_Object subsidiaries;
9683 int i;
4ed46869 9684
df7492f9
KH
9685 subsidiaries = make_subsidiaries (alias);
9686 for (i = 0; i < 3; i++)
9687 Fdefine_coding_system_alias (AREF (subsidiaries, i),
9688 AREF (eol_type, i));
4ed46869 9689 }
df7492f9
KH
9690
9691 Fputhash (alias, spec, Vcoding_system_hash_table);
d6925f38 9692 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
583f71ca
KH
9693 val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
9694 if (NILP (val))
9695 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
9696 Vcoding_system_alist);
66cfb530 9697
4ed46869
KH
9698 return Qnil;
9699}
9700
df7492f9
KH
9701DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
9702 1, 1, 0,
9703 doc: /* Return the base of CODING-SYSTEM.
da7db224 9704Any alias or subsidiary coding system is not a base coding system. */)
df7492f9
KH
9705 (coding_system)
9706 Lisp_Object coding_system;
d46c5b12 9707{
df7492f9 9708 Lisp_Object spec, attrs;
d46c5b12 9709
df7492f9
KH
9710 if (NILP (coding_system))
9711 return (Qno_conversion);
9712 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9713 attrs = AREF (spec, 0);
9714 return CODING_ATTR_BASE_NAME (attrs);
9715}
1397dc18 9716
df7492f9
KH
9717DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
9718 1, 1, 0,
9719 doc: "Return the property list of CODING-SYSTEM.")
9720 (coding_system)
9721 Lisp_Object coding_system;
9722{
9723 Lisp_Object spec, attrs;
1397dc18 9724
df7492f9
KH
9725 if (NILP (coding_system))
9726 coding_system = Qno_conversion;
9727 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9728 attrs = AREF (spec, 0);
9729 return CODING_ATTR_PLIST (attrs);
d46c5b12
KH
9730}
9731
df7492f9
KH
9732
9733DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
9734 1, 1, 0,
da7db224 9735 doc: /* Return the list of aliases of CODING-SYSTEM. */)
df7492f9
KH
9736 (coding_system)
9737 Lisp_Object coding_system;
66cfb530 9738{
df7492f9 9739 Lisp_Object spec;
84d60297 9740
df7492f9
KH
9741 if (NILP (coding_system))
9742 coding_system = Qno_conversion;
9743 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
da7db224 9744 return AREF (spec, 1);
df7492f9 9745}
66cfb530 9746
df7492f9
KH
9747DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
9748 Scoding_system_eol_type, 1, 1, 0,
9749 doc: /* Return eol-type of CODING-SYSTEM.
d4a1d553 9750An eol-type is an integer 0, 1, 2, or a vector of coding systems.
66cfb530 9751
df7492f9
KH
9752Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
9753and CR respectively.
66cfb530 9754
df7492f9
KH
9755A vector value indicates that a format of end-of-line should be
9756detected automatically. Nth element of the vector is the subsidiary
9757coding system whose eol-type is N. */)
6b89e3aa
KH
9758 (coding_system)
9759 Lisp_Object coding_system;
9760{
df7492f9
KH
9761 Lisp_Object spec, eol_type;
9762 int n;
6b89e3aa 9763
df7492f9
KH
9764 if (NILP (coding_system))
9765 coding_system = Qno_conversion;
9766 if (! CODING_SYSTEM_P (coding_system))
9767 return Qnil;
9768 spec = CODING_SYSTEM_SPEC (coding_system);
9769 eol_type = AREF (spec, 2);
9770 if (VECTORP (eol_type))
9771 return Fcopy_sequence (eol_type);
9772 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
9773 return make_number (n);
6b89e3aa
KH
9774}
9775
4ed46869
KH
9776#endif /* emacs */
9777
9778\f
/*** 12. Post-amble ***/
4ed46869 9780
dfcf069d 9781void
4ed46869
KH
9782init_coding_once ()
9783{
9784 int i;
9785
df7492f9
KH
9786 for (i = 0; i < coding_category_max; i++)
9787 {
9788 coding_categories[i].id = -1;
9789 coding_priorities[i] = i;
9790 }
4ed46869
KH
9791
9792 /* ISO2022 specific initialize routine. */
9793 for (i = 0; i < 0x20; i++)
b73bfc1c 9794 iso_code_class[i] = ISO_control_0;
4ed46869
KH
9795 for (i = 0x21; i < 0x7F; i++)
9796 iso_code_class[i] = ISO_graphic_plane_0;
9797 for (i = 0x80; i < 0xA0; i++)
b73bfc1c 9798 iso_code_class[i] = ISO_control_1;
4ed46869
KH
9799 for (i = 0xA1; i < 0xFF; i++)
9800 iso_code_class[i] = ISO_graphic_plane_1;
9801 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
9802 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
4ed46869
KH
9803 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
9804 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
9805 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
9806 iso_code_class[ISO_CODE_ESC] = ISO_escape;
9807 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
9808 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
9809 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
9810
df7492f9
KH
9811 for (i = 0; i < 256; i++)
9812 {
9813 emacs_mule_bytes[i] = 1;
9814 }
7c78e542
KH
9815 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
9816 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
9817 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
9818 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
e0e989f6
KH
9819}
9820
9821#ifdef emacs
9822
dfcf069d 9823void
e0e989f6
KH
9824syms_of_coding ()
9825{
df7492f9 9826 staticpro (&Vcoding_system_hash_table);
8f924df7
KH
9827 {
9828 Lisp_Object args[2];
9829 args[0] = QCtest;
9830 args[1] = Qeq;
9831 Vcoding_system_hash_table = Fmake_hash_table (2, args);
9832 }
df7492f9
KH
9833
9834 staticpro (&Vsjis_coding_system);
9835 Vsjis_coding_system = Qnil;
e0e989f6 9836
df7492f9
KH
9837 staticpro (&Vbig5_coding_system);
9838 Vbig5_coding_system = Qnil;
9839
24a73b0a
KH
9840 staticpro (&Vcode_conversion_reused_workbuf);
9841 Vcode_conversion_reused_workbuf = Qnil;
9842
9843 staticpro (&Vcode_conversion_workbuf_name);
9844 Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
e0e989f6 9845
24a73b0a 9846 reused_workbuf_in_use = 0;
df7492f9
KH
9847
9848 DEFSYM (Qcharset, "charset");
9849 DEFSYM (Qtarget_idx, "target-idx");
9850 DEFSYM (Qcoding_system_history, "coding-system-history");
bb0115a2
RS
9851 Fset (Qcoding_system_history, Qnil);
9852
9ce27fde 9853 /* Target FILENAME is the first argument. */
e0e989f6 9854 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 9855 /* Target FILENAME is the third argument. */
e0e989f6
KH
9856 Fput (Qwrite_region, Qtarget_idx, make_number (2));
9857
df7492f9 9858 DEFSYM (Qcall_process, "call-process");
9ce27fde 9859 /* Target PROGRAM is the first argument. */
e0e989f6
KH
9860 Fput (Qcall_process, Qtarget_idx, make_number (0));
9861
df7492f9 9862 DEFSYM (Qcall_process_region, "call-process-region");
9ce27fde 9863 /* Target PROGRAM is the third argument. */
e0e989f6
KH
9864 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
9865
df7492f9 9866 DEFSYM (Qstart_process, "start-process");
9ce27fde 9867 /* Target PROGRAM is the third argument. */
e0e989f6
KH
9868 Fput (Qstart_process, Qtarget_idx, make_number (2));
9869
df7492f9 9870 DEFSYM (Qopen_network_stream, "open-network-stream");
9ce27fde 9871 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
9872 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
9873
df7492f9
KH
9874 DEFSYM (Qcoding_system, "coding-system");
9875 DEFSYM (Qcoding_aliases, "coding-aliases");
4ed46869 9876
df7492f9
KH
9877 DEFSYM (Qeol_type, "eol-type");
9878 DEFSYM (Qunix, "unix");
9879 DEFSYM (Qdos, "dos");
4ed46869 9880
df7492f9
KH
9881 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
9882 DEFSYM (Qpost_read_conversion, "post-read-conversion");
9883 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
9884 DEFSYM (Qdefault_char, "default-char");
9885 DEFSYM (Qundecided, "undecided");
9886 DEFSYM (Qno_conversion, "no-conversion");
9887 DEFSYM (Qraw_text, "raw-text");
4ed46869 9888
df7492f9 9889 DEFSYM (Qiso_2022, "iso-2022");
4ed46869 9890
df7492f9 9891 DEFSYM (Qutf_8, "utf-8");
8f924df7 9892 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
27901516 9893
df7492f9 9894 DEFSYM (Qutf_16, "utf-16");
df7492f9
KH
9895 DEFSYM (Qbig, "big");
9896 DEFSYM (Qlittle, "little");
27901516 9897
df7492f9
KH
9898 DEFSYM (Qshift_jis, "shift-jis");
9899 DEFSYM (Qbig5, "big5");
4ed46869 9900
df7492f9 9901 DEFSYM (Qcoding_system_p, "coding-system-p");
4ed46869 9902
df7492f9 9903 DEFSYM (Qcoding_system_error, "coding-system-error");
4ed46869
KH
9904 Fput (Qcoding_system_error, Qerror_conditions,
9905 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
9906 Fput (Qcoding_system_error, Qerror_message,
9ce27fde 9907 build_string ("Invalid coding system"));
4ed46869 9908
05e6f5dc
KH
9909 /* Intern this now in case it isn't already done.
9910 Setting this variable twice is harmless.
9911 But don't staticpro it here--that is done in alloc.c. */
9912 Qchar_table_extra_slots = intern ("char-table-extra-slots");
70c22245 9913
df7492f9 9914 DEFSYM (Qtranslation_table, "translation-table");
433f7f87 9915 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
df7492f9
KH
9916 DEFSYM (Qtranslation_table_id, "translation-table-id");
9917 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
9918 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
1397dc18 9919
df7492f9 9920 DEFSYM (Qvalid_codes, "valid-codes");
9ce27fde 9921
df7492f9 9922 DEFSYM (Qemacs_mule, "emacs-mule");
d46c5b12 9923
01378f49 9924 DEFSYM (QCcategory, ":category");
a6f87d34
KH
9925 DEFSYM (QCmnemonic, ":mnemonic");
9926 DEFSYM (QCdefalut_char, ":default-char");
9927 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
9928 DEFSYM (QCencode_translation_table, ":encode-translation-table");
9929 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
9930 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
35befdaa 9931 DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
01378f49 9932
df7492f9
KH
9933 Vcoding_category_table
9934 = Fmake_vector (make_number (coding_category_max), Qnil);
9935 staticpro (&Vcoding_category_table);
9936 /* Followings are target of code detection. */
9937 ASET (Vcoding_category_table, coding_category_iso_7,
9938 intern ("coding-category-iso-7"));
9939 ASET (Vcoding_category_table, coding_category_iso_7_tight,
9940 intern ("coding-category-iso-7-tight"));
9941 ASET (Vcoding_category_table, coding_category_iso_8_1,
9942 intern ("coding-category-iso-8-1"));
9943 ASET (Vcoding_category_table, coding_category_iso_8_2,
9944 intern ("coding-category-iso-8-2"));
9945 ASET (Vcoding_category_table, coding_category_iso_7_else,
9946 intern ("coding-category-iso-7-else"));
9947 ASET (Vcoding_category_table, coding_category_iso_8_else,
9948 intern ("coding-category-iso-8-else"));
a470d443
KH
9949 ASET (Vcoding_category_table, coding_category_utf_8_auto,
9950 intern ("coding-category-utf-8-auto"));
9951 ASET (Vcoding_category_table, coding_category_utf_8_nosig,
df7492f9 9952 intern ("coding-category-utf-8"));
a470d443
KH
9953 ASET (Vcoding_category_table, coding_category_utf_8_sig,
9954 intern ("coding-category-utf-8-sig"));
df7492f9
KH
9955 ASET (Vcoding_category_table, coding_category_utf_16_be,
9956 intern ("coding-category-utf-16-be"));
ff563fce
KH
9957 ASET (Vcoding_category_table, coding_category_utf_16_auto,
9958 intern ("coding-category-utf-16-auto"));
df7492f9
KH
9959 ASET (Vcoding_category_table, coding_category_utf_16_le,
9960 intern ("coding-category-utf-16-le"));
9961 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
9962 intern ("coding-category-utf-16-be-nosig"));
9963 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
9964 intern ("coding-category-utf-16-le-nosig"));
9965 ASET (Vcoding_category_table, coding_category_charset,
9966 intern ("coding-category-charset"));
9967 ASET (Vcoding_category_table, coding_category_sjis,
9968 intern ("coding-category-sjis"));
9969 ASET (Vcoding_category_table, coding_category_big5,
9970 intern ("coding-category-big5"));
9971 ASET (Vcoding_category_table, coding_category_ccl,
9972 intern ("coding-category-ccl"));
9973 ASET (Vcoding_category_table, coding_category_emacs_mule,
9974 intern ("coding-category-emacs-mule"));
9975 /* Followings are NOT target of code detection. */
9976 ASET (Vcoding_category_table, coding_category_raw_text,
9977 intern ("coding-category-raw-text"));
9978 ASET (Vcoding_category_table, coding_category_undecided,
9979 intern ("coding-category-undecided"));
ecf488bc 9980
065e3595
KH
9981 DEFSYM (Qinsufficient_source, "insufficient-source");
9982 DEFSYM (Qinconsistent_eol, "inconsistent-eol");
9983 DEFSYM (Qinvalid_source, "invalid-source");
9984 DEFSYM (Qinterrupted, "interrupted");
9985 DEFSYM (Qinsufficient_memory, "insufficient-memory");
44e8490d 9986 DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
065e3595 9987
4ed46869
KH
9988 defsubr (&Scoding_system_p);
9989 defsubr (&Sread_coding_system);
9990 defsubr (&Sread_non_nil_coding_system);
9991 defsubr (&Scheck_coding_system);
9992 defsubr (&Sdetect_coding_region);
d46c5b12 9993 defsubr (&Sdetect_coding_string);
05e6f5dc 9994 defsubr (&Sfind_coding_systems_region_internal);
068a9dbd 9995 defsubr (&Sunencodable_char_position);
df7492f9 9996 defsubr (&Scheck_coding_systems_region);
4ed46869
KH
9997 defsubr (&Sdecode_coding_region);
9998 defsubr (&Sencode_coding_region);
9999 defsubr (&Sdecode_coding_string);
10000 defsubr (&Sencode_coding_string);
10001 defsubr (&Sdecode_sjis_char);
10002 defsubr (&Sencode_sjis_char);
10003 defsubr (&Sdecode_big5_char);
10004 defsubr (&Sencode_big5_char);
1ba9e4ab 10005 defsubr (&Sset_terminal_coding_system_internal);
c4825358 10006 defsubr (&Sset_safe_terminal_coding_system_internal);
4ed46869 10007 defsubr (&Sterminal_coding_system);
1ba9e4ab 10008 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 10009 defsubr (&Skeyboard_coding_system);
a5d301df 10010 defsubr (&Sfind_operation_coding_system);
df7492f9 10011 defsubr (&Sset_coding_system_priority);
6b89e3aa 10012 defsubr (&Sdefine_coding_system_internal);
df7492f9 10013 defsubr (&Sdefine_coding_system_alias);
a6f87d34 10014 defsubr (&Scoding_system_put);
df7492f9
KH
10015 defsubr (&Scoding_system_base);
10016 defsubr (&Scoding_system_plist);
10017 defsubr (&Scoding_system_aliases);
10018 defsubr (&Scoding_system_eol_type);
10019 defsubr (&Scoding_system_priority_list);
4ed46869 10020
4608c386 10021 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
48b0f3ae
PJ
10022 doc: /* List of coding systems.
10023
10024Do not alter the value of this variable manually. This variable should be
df7492f9 10025updated by the functions `define-coding-system' and
48b0f3ae 10026`define-coding-system-alias'. */);
4608c386
KH
10027 Vcoding_system_list = Qnil;
10028
10029 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
48b0f3ae
PJ
10030 doc: /* Alist of coding system names.
10031Each element is a one-element list containing a coding system name.
446dcd75 10032This variable is given to `completing-read' as COLLECTION argument.
48b0f3ae
PJ
10033
10034Do not alter the value of this variable manually. This variable should be
10035updated by the functions `define-coding-system' and
10036`define-coding-system-alias'. */);
4608c386
KH
10037 Vcoding_system_alist = Qnil;
10038
4ed46869 10039 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
48b0f3ae
PJ
10040 doc: /* List of coding-categories (symbols) ordered by priority.
10041
10042On detecting a coding system, Emacs tries code detection algorithms
10043associated with each coding-category one by one in this order. When
10044one algorithm agrees with a byte sequence of source text, the coding
0ec31faf
KH
10045system bound to the corresponding coding-category is selected.
10046
42205607 10047Don't modify this variable directly, but use `set-coding-system-priority'. */);
4ed46869
KH
10048 {
10049 int i;
10050
10051 Vcoding_category_list = Qnil;
df7492f9 10052 for (i = coding_category_max - 1; i >= 0; i--)
4ed46869 10053 Vcoding_category_list
d46c5b12
KH
10054 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
10055 Vcoding_category_list);
4ed46869
KH
10056 }
10057
10058 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
48b0f3ae
PJ
10059 doc: /* Specify the coding system for read operations.
10060It is useful to bind this variable with `let', but do not set it globally.
10061If the value is a coding system, it is used for decoding on read operation.
446dcd75
JB
10062If not, an appropriate element is used from one of the coding system alists.
10063There are three such tables: `file-coding-system-alist',
48b0f3ae 10064`process-coding-system-alist', and `network-coding-system-alist'. */);
4ed46869
KH
10065 Vcoding_system_for_read = Qnil;
10066
10067 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
48b0f3ae
PJ
10068 doc: /* Specify the coding system for write operations.
10069Programs bind this variable with `let', but you should not set it globally.
10070If the value is a coding system, it is used for encoding of output,
10071when writing it to a file and when sending it to a file or subprocess.
10072
10073If this does not specify a coding system, an appropriate element
446dcd75
JB
10074is used from one of the coding system alists.
10075There are three such tables: `file-coding-system-alist',
48b0f3ae
PJ
10076`process-coding-system-alist', and `network-coding-system-alist'.
10077For output to files, if the above procedure does not specify a coding system,
10078the value of `buffer-file-coding-system' is used. */);
4ed46869
KH
10079 Vcoding_system_for_write = Qnil;
10080
10081 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
df7492f9
KH
10082 doc: /*
10083Coding system used in the latest file or process I/O. */);
4ed46869
KH
10084 Vlast_coding_system_used = Qnil;
10085
065e3595
KH
10086 DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
10087 doc: /*
10088Error status of the last code conversion.
10089
10090When an error was detected in the last code conversion, this variable
10091is set to one of the following symbols.
10092 `insufficient-source'
10093 `inconsistent-eol'
10094 `invalid-source'
10095 `interrupted'
10096 `insufficient-memory'
10097When no error was detected, the value doesn't change. So, to check
10098the error status of a code conversion by this variable, you must
10099explicitly set this variable to nil before performing code
10100conversion. */);
10101 Vlast_code_conversion_error = Qnil;
10102
9ce27fde 10103 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
df7492f9
KH
10104 doc: /*
10105*Non-nil means always inhibit code conversion of end-of-line format.
48b0f3ae
PJ
10106See info node `Coding Systems' and info node `Text and Binary' concerning
10107such conversion. */);
9ce27fde
KH
10108 inhibit_eol_conversion = 0;
10109
ed29121d 10110 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
df7492f9
KH
10111 doc: /*
10112Non-nil means process buffer inherits coding system of process output.
48b0f3ae
PJ
10113Bind it to t if the process output is to be treated as if it were a file
10114read from some filesystem. */);
ed29121d
EZ
10115 inherit_process_coding_system = 0;
10116
02ba4723 10117 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
df7492f9
KH
10118 doc: /*
10119Alist to decide a coding system to use for a file I/O operation.
48b0f3ae
PJ
10120The format is ((PATTERN . VAL) ...),
10121where PATTERN is a regular expression matching a file name,
10122VAL is a coding system, a cons of coding systems, or a function symbol.
10123If VAL is a coding system, it is used for both decoding and encoding
10124the file contents.
10125If VAL is a cons of coding systems, the car part is used for decoding,
10126and the cdr part is used for encoding.
10127If VAL is a function symbol, the function must return a coding system
2c53e699
KH
10128or a cons of coding systems which are used as above. The function is
10129called with an argument that is a list of the arguments with which
5a0bbd9a
KH
10130`find-operation-coding-system' was called. If the function can't decide
10131a coding system, it can return `undecided' so that the normal
10132code-detection is performed.
48b0f3ae
PJ
10133
10134See also the function `find-operation-coding-system'
10135and the variable `auto-coding-alist'. */);
02ba4723
KH
10136 Vfile_coding_system_alist = Qnil;
10137
10138 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
df7492f9
KH
10139 doc: /*
10140Alist to decide a coding system to use for a process I/O operation.
48b0f3ae
PJ
10141The format is ((PATTERN . VAL) ...),
10142where PATTERN is a regular expression matching a program name,
10143VAL is a coding system, a cons of coding systems, or a function symbol.
10144If VAL is a coding system, it is used for both decoding what is received
10145from the program and encoding what is sent to the program.
10146If VAL is a cons of coding systems, the car part is used for decoding,
10147and the cdr part is used for encoding.
10148If VAL is a function symbol, the function must return a coding system
10149or a cons of coding systems which are used as above.
10150
10151See also the function `find-operation-coding-system'. */);
02ba4723
KH
10152 Vprocess_coding_system_alist = Qnil;
10153
10154 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
df7492f9
KH
10155 doc: /*
10156Alist to decide a coding system to use for a network I/O operation.
48b0f3ae
PJ
10157The format is ((PATTERN . VAL) ...),
10158where PATTERN is a regular expression matching a network service name
10159or is a port number to connect to,
10160VAL is a coding system, a cons of coding systems, or a function symbol.
10161If VAL is a coding system, it is used for both decoding what is received
10162from the network stream and encoding what is sent to the network stream.
10163If VAL is a cons of coding systems, the car part is used for decoding,
10164and the cdr part is used for encoding.
10165If VAL is a function symbol, the function must return a coding system
10166or a cons of coding systems which are used as above.
10167
10168See also the function `find-operation-coding-system'. */);
02ba4723 10169 Vnetwork_coding_system_alist = Qnil;
4ed46869 10170
68c45bf0 10171 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
75205970
RS
10172 doc: /* Coding system to use with system messages.
10173Also used for decoding keyboard input on X Window system. */);
68c45bf0
PE
10174 Vlocale_coding_system = Qnil;
10175
005f0d35 10176 /* The eol mnemonics are reset in startup.el system-dependently. */
7722baf9 10177 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
df7492f9
KH
10178 doc: /*
10179*String displayed in mode line for UNIX-like (LF) end-of-line format. */);
7722baf9 10180 eol_mnemonic_unix = build_string (":");
4ed46869 10181
7722baf9 10182 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
df7492f9
KH
10183 doc: /*
10184*String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
7722baf9 10185 eol_mnemonic_dos = build_string ("\\");
4ed46869 10186
7722baf9 10187 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
df7492f9
KH
10188 doc: /*
10189*String displayed in mode line for MAC-like (CR) end-of-line format. */);
7722baf9 10190 eol_mnemonic_mac = build_string ("/");
4ed46869 10191
7722baf9 10192 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
df7492f9
KH
10193 doc: /*
10194*String displayed in mode line when end-of-line format is not yet determined. */);
7722baf9 10195 eol_mnemonic_undecided = build_string (":");
4ed46869 10196
84fbb8a0 10197 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
df7492f9
KH
10198 doc: /*
10199*Non-nil enables character translation while encoding and decoding. */);
84fbb8a0 10200 Venable_character_translation = Qt;
bdd9fb48 10201
f967223b 10202 DEFVAR_LISP ("standard-translation-table-for-decode",
48b0f3ae
PJ
10203 &Vstandard_translation_table_for_decode,
10204 doc: /* Table for translating characters while decoding. */);
f967223b 10205 Vstandard_translation_table_for_decode = Qnil;
bdd9fb48 10206
f967223b 10207 DEFVAR_LISP ("standard-translation-table-for-encode",
48b0f3ae
PJ
10208 &Vstandard_translation_table_for_encode,
10209 doc: /* Table for translating characters while encoding. */);
f967223b 10210 Vstandard_translation_table_for_encode = Qnil;
4ed46869 10211
df7492f9 10212 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
48b0f3ae
PJ
10213 doc: /* Alist of charsets vs revision numbers.
10214While encoding, if a charset (car part of an element) is found,
df7492f9
KH
10215designate it with the escape sequence identifying revision (cdr part
10216of the element). */);
10217 Vcharset_revision_table = Qnil;
02ba4723
KH
10218
10219 DEFVAR_LISP ("default-process-coding-system",
10220 &Vdefault_process_coding_system,
48b0f3ae
PJ
10221 doc: /* Cons of coding systems used for process I/O by default.
10222The car part is used for decoding a process output,
10223the cdr part is used for encoding a text to be sent to a process. */);
02ba4723 10224 Vdefault_process_coding_system = Qnil;
c4825358 10225
3f003981 10226 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
df7492f9
KH
10227 doc: /*
10228Table of extra Latin codes in the range 128..159 (inclusive).
48b0f3ae
PJ
10229This is a vector of length 256.
10230If Nth element is non-nil, the existence of code N in a file
10231\(or output of subprocess) doesn't prevent it to be detected as
10232a coding system of ISO 2022 variant which has a flag
10233`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10234or reading output of a subprocess.
446dcd75 10235Only 128th through 159th elements have a meaning. */);
3f003981 10236 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
d46c5b12
KH
10237
10238 DEFVAR_LISP ("select-safe-coding-system-function",
10239 &Vselect_safe_coding_system_function,
df7492f9
KH
10240 doc: /*
10241Function to call to select safe coding system for encoding a text.
48b0f3ae
PJ
10242
10243If set, this function is called to force a user to select a proper
10244coding system which can encode the text in the case that a default
fdecf907
GM
10245coding system used in each operation can't encode the text. The
10246function should take care that the buffer is not modified while
10247the coding system is being selected.
48b0f3ae
PJ
10248
10249The default value is `select-safe-coding-system' (which see). */);
d46c5b12
KH
10250 Vselect_safe_coding_system_function = Qnil;
10251
5d5bf4d8
KH
10252 DEFVAR_BOOL ("coding-system-require-warning",
10253 &coding_system_require_warning,
10254 doc: /* Internal use only.
6b89e3aa
KH
10255If non-nil, on writing a file, `select-safe-coding-system-function' is
10256called even if `coding-system-for-write' is non-nil. The command
10257`universal-coding-system-argument' binds this variable to t temporarily. */);
5d5bf4d8
KH
10258 coding_system_require_warning = 0;
10259
10260
22ab2303 10261 DEFVAR_BOOL ("inhibit-iso-escape-detection",
74383408 10262 &inhibit_iso_escape_detection,
df7492f9
KH
10263 doc: /*
10264If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
48b0f3ae
PJ
10265
10266By default, on reading a file, Emacs tries to detect how the text is
10267encoded. This code detection is sensitive to escape sequences. If
10268the sequence is valid as ISO2022, the code is determined as one of
10269the ISO2022 encodings, and the file is decoded by the corresponding
10270coding system (e.g. `iso-2022-7bit').
10271
10272However, there may be a case that you want to read escape sequences in
10273a file as is. In such a case, you can set this variable to non-nil.
10274Then, as the code detection ignores any escape sequences, no file is
10275detected as encoded in some ISO2022 encoding. The result is that all
10276escape sequences become visible in a buffer.
10277
10278The default value is nil, and it is strongly recommended not to change
10279it. That is because many Emacs Lisp source files that contain
10280non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10281in Emacs's distribution, and they won't be decoded correctly on
10282reading if you suppress escape sequence detection.
10283
10284The other way to read escape sequences in a file without decoding is
10285to explicitly specify some coding system that doesn't use ISO2022's
10286escape sequence (e.g. `latin-1') on reading by \\[universal-coding-system-argument]. */);
74383408 10287 inhibit_iso_escape_detection = 0;
002fdb44
DL
10288
10289 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
15c8f9d1 10290 doc: /* Char table for translating self-inserting characters.
446dcd75
JB
10291This is applied to the result of input methods, not their input.
10292See also `keyboard-translate-table'. */);
002fdb44 10293 Vtranslation_table_for_input = Qnil;
8f924df7 10294
2c78b7e1
KH
10295 {
10296 Lisp_Object args[coding_arg_max];
8f924df7 10297 Lisp_Object plist[16];
2c78b7e1
KH
10298 int i;
10299
10300 for (i = 0; i < coding_arg_max; i++)
10301 args[i] = Qnil;
10302
10303 plist[0] = intern (":name");
10304 plist[1] = args[coding_arg_name] = Qno_conversion;
10305 plist[2] = intern (":mnemonic");
10306 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10307 plist[4] = intern (":coding-type");
10308 plist[5] = args[coding_arg_coding_type] = Qraw_text;
10309 plist[6] = intern (":ascii-compatible-p");
10310 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10311 plist[8] = intern (":default-char");
10312 plist[9] = args[coding_arg_default_char] = make_number (0);
8f924df7
KH
10313 plist[10] = intern (":for-unibyte");
10314 plist[11] = args[coding_arg_for_unibyte] = Qt;
10315 plist[12] = intern (":docstring");
10316 plist[13] = build_string ("Do no conversion.\n\
2c78b7e1
KH
10317\n\
10318When you visit a file with this coding, the file is read into a\n\
10319unibyte buffer as is, thus each byte of a file is treated as a\n\
10320character.");
8f924df7
KH
10321 plist[14] = intern (":eol-type");
10322 plist[15] = args[coding_arg_eol_type] = Qunix;
10323 args[coding_arg_plist] = Flist (16, plist);
2c78b7e1 10324 Fdefine_coding_system_internal (coding_arg_max, args);
ae6f73fa
KH
10325
10326 plist[1] = args[coding_arg_name] = Qundecided;
10327 plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10328 plist[5] = args[coding_arg_coding_type] = Qundecided;
10329 /* This is already set.
35befdaa 10330 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
ae6f73fa
KH
10331 plist[8] = intern (":charset-list");
10332 plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10333 plist[11] = args[coding_arg_for_unibyte] = Qnil;
10334 plist[13] = build_string ("No conversion on encoding, automatic conversion on decoding.");
10335 plist[15] = args[coding_arg_eol_type] = Qnil;
10336 args[coding_arg_plist] = Flist (16, plist);
10337 Fdefine_coding_system_internal (coding_arg_max, args);
2c78b7e1
KH
10338 }
10339
2c78b7e1 10340 setup_coding_system (Qno_conversion, &safe_terminal_coding);
ff563fce
KH
10341
10342 {
10343 int i;
10344
10345 for (i = 0; i < coding_category_max; i++)
10346 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10347 }
fcbcfb64
KH
10348#if defined (MSDOS) || defined (WINDOWSNT)
10349 system_eol_type = Qdos;
10350#else
10351 system_eol_type = Qunix;
10352#endif
10353 staticpro (&system_eol_type);
4ed46869
KH
10354}
10355
68c45bf0
PE
10356char *
10357emacs_strerror (error_number)
10358 int error_number;
10359{
10360 char *str;
10361
ca9c0567 10362 synchronize_system_messages_locale ();
68c45bf0
PE
10363 str = strerror (error_number);
10364
10365 if (! NILP (Vlocale_coding_system))
10366 {
10367 Lisp_Object dec = code_convert_string_norecord (build_string (str),
10368 Vlocale_coding_system,
10369 0);
d5db4077 10370 str = (char *) SDATA (dec);
68c45bf0
PE
10371 }
10372
10373 return str;
10374}
10375
4ed46869 10376#endif /* emacs */
9ffd559c
KH
10377
10378/* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
10379 (do not change this comment) */