* keyboard.c (Fset_input_meta_mode): Doc fix.
[bpt/emacs.git] / src / coding.c
CommitLineData
9542cb1f 1/* Coding system handler (conversion, detection, etc).
aaef169d 2 Copyright (C) 2001, 2002, 2003, 2004, 2005,
76b6f707 3 2006, 2007, 2008, 2009 Free Software Foundation, Inc.
7976eda0 4 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
76b6f707 5 2005, 2006, 2007, 2008, 2009
ce03bf76
KH
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H14PRO021
8f924df7 8 Copyright (C) 2003
df7492f9
KH
9 National Institute of Advanced Industrial Science and Technology (AIST)
10 Registration Number H13PRO009
4ed46869 11
369314dc
KH
12This file is part of GNU Emacs.
13
9ec0b715 14GNU Emacs is free software: you can redistribute it and/or modify
369314dc 15it under the terms of the GNU General Public License as published by
9ec0b715
GM
16the Free Software Foundation, either version 3 of the License, or
17(at your option) any later version.
4ed46869 18
369314dc
KH
19GNU Emacs is distributed in the hope that it will be useful,
20but WITHOUT ANY WARRANTY; without even the implied warranty of
21MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22GNU General Public License for more details.
4ed46869 23
369314dc 24You should have received a copy of the GNU General Public License
9ec0b715 25along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
4ed46869
KH
26
27/*** TABLE OF CONTENTS ***
28
b73bfc1c 29 0. General comments
4ed46869 30 1. Preamble
df7492f9
KH
31 2. Emacs' internal format (emacs-utf-8) handlers
32 3. UTF-8 handlers
33 4. UTF-16 handlers
34 5. Charset-base coding systems handlers
35 6. emacs-mule (old Emacs' internal format) handlers
36 7. ISO2022 handlers
37 8. Shift-JIS and BIG5 handlers
38 9. CCL handlers
39 10. C library functions
40 11. Emacs Lisp library functions
41 12. Postamble
4ed46869
KH
42
43*/
44
df7492f9 45/*** 0. General comments ***
b73bfc1c
KH
46
47
df7492f9 48CODING SYSTEM
4ed46869 49
5bad0796
DL
50 A coding system is an object for an encoding mechanism that contains
51 information about how to convert byte sequences to character
e19c3639
KH
52 sequences and vice versa. When we say "decode", it means converting
53 a byte sequence of a specific coding system into a character
54 sequence that is represented by Emacs' internal coding system
55 `emacs-utf-8', and when we say "encode", it means converting a
56 character sequence of emacs-utf-8 to a byte sequence of a specific
0ef69138 57 coding system.
4ed46869 58
e19c3639
KH
59 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
60 C level, a coding system is represented by a vector of attributes
5bad0796 61 stored in the hash table Vcoding_system_hash_table. The conversion from
e19c3639
KH
62 coding system symbol to attributes vector is done by looking up
63 Vcoding_system_hash_table by the symbol.
4ed46869 64
e19c3639 65 Coding systems are classified into the following types depending on
5bad0796 66 the encoding mechanism. Here's a brief description of the types.
4ed46869 67
df7492f9
KH
68 o UTF-8
69
70 o UTF-16
71
72 o Charset-base coding system
73
74 A coding system defined by one or more (coded) character sets.
5bad0796 75 Decoding and encoding are done by a code converter defined for each
df7492f9
KH
76 character set.
77
5bad0796 78 o Old Emacs internal format (emacs-mule)
df7492f9 79
5bad0796 80 The coding system adopted by old versions of Emacs (20 and 21).
4ed46869 81
df7492f9 82 o ISO2022-base coding system
4ed46869
KH
83
84 The most famous coding system for multiple character sets. X's
df7492f9
KH
85 Compound Text, various EUCs (Extended Unix Code), and coding systems
86 used in the Internet communication such as ISO-2022-JP are all
87 variants of ISO2022.
4ed46869 88
df7492f9 89 o SJIS (or Shift-JIS or MS-Kanji-Code)
93dec019 90
4ed46869
KH
91 A coding system to encode character sets: ASCII, JISX0201, and
92 JISX0208. Widely used for PC's in Japan. Details are described in
df7492f9 93 section 8.
4ed46869 94
df7492f9 95 o BIG5
4ed46869 96
df7492f9 97 A coding system to encode character sets: ASCII and Big5. Widely
cfb43547 98 used for Chinese (mainly in Taiwan and Hong Kong). Details are
df7492f9
KH
99 described in section 8. In this file, when we write "big5" (all
100 lowercase), we mean the coding system, and when we write "Big5"
101 (capitalized), we mean the character set.
4ed46869 102
df7492f9 103 o CCL
27901516 104
5bad0796 105 If a user wants to decode/encode text encoded in a coding system
df7492f9
KH
106 not listed above, he can supply a decoder and an encoder for it in
107 CCL (Code Conversion Language) programs. Emacs executes the CCL
108 program while decoding/encoding.
27901516 109
df7492f9 110 o Raw-text
4ed46869 111
5a936b46 112 A coding system for text containing raw eight-bit data. Emacs
5bad0796 113 treats each byte of source text as a character (except for
df7492f9 114 end-of-line conversion).
4ed46869 115
df7492f9
KH
116 o No-conversion
117
118 Like raw text, but don't do end-of-line conversion.
4ed46869 119
4ed46869 120
df7492f9 121END-OF-LINE FORMAT
4ed46869 122
5bad0796 123 How text end-of-line is encoded depends on operating system. For
df7492f9 124 instance, Unix's format is just one byte of LF (line-feed) code,
f4dee582 125 whereas DOS's format is two-byte sequence of `carriage-return' and
d46c5b12
KH
126 `line-feed' codes. MacOS's format is usually one byte of
127 `carriage-return'.
4ed46869 128
cfb43547 129 Since text character encoding and end-of-line encoding are
df7492f9
KH
130 independent, any coding system described above can take any format
131 of end-of-line (except for no-conversion).
4ed46869 132
e19c3639
KH
133STRUCT CODING_SYSTEM
134
135 Before using a coding system for code conversion (i.e. decoding and
136 encoding), we set up a structure of type `struct coding_system'.
137 This structure keeps various information about a specific code
5bad0796 138 conversion (e.g. the location of source and destination data).
4ed46869
KH
139
140*/
141
df7492f9
KH
142/* COMMON MACROS */
143
144
4ed46869
KH
145/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
146
df7492f9 147 These functions check if a byte sequence specified as a source in
ff0dacd7
KH
148 CODING conforms to the format of XXX, and update the members of
149 DETECT_INFO.
df7492f9 150
ff0dacd7 151 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
df7492f9
KH
152
153 Below is the template of these functions. */
154
#if 0
/* Template of a detector for the coding system XXX.

   Read the source of CODING one byte at a time with ONE_MORE_BYTE.
   As soon as a byte that does not conform to XXX is seen, mark XXX
   as rejected in DETECT_INFO and return 0.  If the whole source is
   read without such a byte, merge the evidence collected in FOUND
   into DETECT_INFO->found and return 1.  */
static int
detect_coding_XXX (coding, detect_info)
     struct coding_system *coding;
     struct coding_detection_info *detect_info;
{
  const unsigned char *src = coding->source;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* ONE_MORE_BYTE compares SRC_BASE with SRC when the source runs
     out, so it has to be declared and kept up to date.  */
  const unsigned char *src_base;
  int multibytep = coding->src_multibyte;
  /* Incremented by ONE_MORE_BYTE.  */
  int consumed_chars = 0;
  int found = 0;
  /* Receives the byte fetched by ONE_MORE_BYTE.  */
  int c;
  ...;

  while (1)
    {
      src_base = src;
      /* Get one byte from the source.  If the source is exhausted, jump
         to no_more_source:.  */
      ONE_MORE_BYTE (c);

      if (! __C_conforms_to_XXX___ (c))
        break;
      /* Only a byte that positively suggests XXX counts as evidence.  */
      if (__C_strongly_suggests_XXX__ (c))
        found = CATEGORY_MASK_XXX;
    }
  /* The byte sequence is invalid for XXX.  */
  detect_info->rejected |= CATEGORY_MASK_XXX;
  return 0;

 no_more_source:
  /* The source was exhausted successfully.  */
  detect_info->found |= found;
  return 1;
}
#endif
189
190/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
191
df7492f9
KH
192 These functions decode a byte sequence specified as a source by
193 CODING. The resulting multibyte text goes to a place pointed to by
194 CODING->charbuf, the length of which should not exceed
195 CODING->charbuf_size.
d46c5b12 196
df7492f9
KH
197 These functions set the information of original and decoded texts in
198 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
199 They also set CODING->result to one of CODING_RESULT_XXX indicating
200 how the decoding is finished.
d46c5b12 201
df7492f9 202 Below is the template of these functions. */
d46c5b12 203
#if 0
/* Template of a decoder for the coding system XXXX.

   Decode the not-yet-consumed part of CODING's source into
   CODING->charbuf, then store in CODING how far the source was
   consumed and how many elements of CODING->charbuf are in use.  */
static void
decode_coding_XXXX (coding)
     struct coding_system *coding;
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* SRC_BASE remembers the start position in source in each loop.
     The loop will be exited when there's not enough source code, or
     when there's no room in CHARBUF for a decoded character.  */
  const unsigned char *src_base;
  /* A buffer to produce decoded characters.  */
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  int multibytep = coding->src_multibyte;
  /* Both of these are required by ONE_MORE_BYTE: it stores the
     fetched byte in C and increments CONSUMED_CHARS.  */
  int consumed_chars = 0;
  int c;

  while (1)
    {
      src_base = src;
      if (charbuf >= charbuf_end)
        /* No more room to produce a decoded character.  */
        break;
      ONE_MORE_BYTE (c);
      /* Decode it.  */
    }

 no_more_source:
  if (src_base < src_end
      && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      /* If the source ends by partial bytes to construct a character,
         treat them as eight-bit raw data.  */
      while (src_base < src_end && charbuf < charbuf_end)
        *charbuf++ = *src_base++;
    }
  /* Remember how many bytes and characters we consumed.  If the
     source is multibyte, the bytes and chars are not identical.  */
  coding->consumed = coding->consumed_char = src_base - coding->source;
  /* Remember how many characters we produced.  */
  coding->charbuf_used = charbuf - coding->charbuf;
}
#endif
244
245/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
246
df7492f9
KH
247 These functions encode SRC_BYTES length text at SOURCE of Emacs'
248 internal multibyte format by CODING. The resulting byte sequence
b73bfc1c
KH
249 goes to a place pointed to by DESTINATION, the length of which
250 should not exceed DST_BYTES.
d46c5b12 251
df7492f9
KH
252 These functions set the information of original and encoded texts in
253 the members produced, produced_char, consumed, and consumed_char of
254 the structure *CODING. They also set the member result to one of
255 CODING_RESULT_XXX indicating how the encoding finished.
d46c5b12 256
df7492f9
KH
257 DST_BYTES zero means that source area and destination area are
258 overlapped, which means that we can produce an encoded text until it
259 reaches the head of not-yet-encoded source text.
d46c5b12 260
df7492f9 261 Below is a template of these functions. */
#if 0
/* Template of an encoder for the coding system XXX.

   Encode the characters held in CODING->charbuf into
   CODING->destination, starting after the bytes already produced,
   then update CODING->produced_char and CODING->produced.  */
static void
encode_coding_XXX (coding)
     struct coding_system *coding;
{
  int multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  /* CHARBUF is a plain `int *'; the end of the used part has to be
     computed from CODING.  */
  int *charbuf_end = coding->charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  /* Stop early enough that one iteration can never run past DST_END.  */
  unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
  int produced_chars = 0;

  for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
    {
      int c = *charbuf;
      /* Encode C into DST, and increment DST.  */
    }
 label_no_more_destination:
  /* How many chars and bytes we produced.  */
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
}
#endif
286
4ed46869
KH
287\f
288/*** 1. Preamble ***/
289
68c45bf0 290#include <config.h>
4ed46869
KH
291#include <stdio.h>
292
4ed46869
KH
293#include "lisp.h"
294#include "buffer.h"
df7492f9 295#include "character.h"
4ed46869
KH
296#include "charset.h"
297#include "ccl.h"
df7492f9 298#include "composite.h"
4ed46869
KH
299#include "coding.h"
300#include "window.h"
b8299c66
KL
301#include "frame.h"
302#include "termhooks.h"
4ed46869 303
df7492f9 304Lisp_Object Vcoding_system_hash_table;
4ed46869 305
df7492f9 306Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
1965cb73
DL
307Lisp_Object Qunix, Qdos;
308extern Lisp_Object Qmac; /* frame.c */
4ed46869
KH
309Lisp_Object Qbuffer_file_coding_system;
310Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
df7492f9 311Lisp_Object Qdefault_char;
27901516 312Lisp_Object Qno_conversion, Qundecided;
df7492f9 313Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
b49a1807 314Lisp_Object Qbig, Qlittle;
bb0115a2 315Lisp_Object Qcoding_system_history;
1397dc18 316Lisp_Object Qvalid_codes;
2133e2d1 317Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
a6f87d34
KH
318Lisp_Object QCdecode_translation_table, QCencode_translation_table;
319Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
35befdaa 320Lisp_Object QCascii_compatible_p;
4ed46869
KH
321
322extern Lisp_Object Qinsert_file_contents, Qwrite_region;
387f6ba5 323Lisp_Object Qcall_process, Qcall_process_region;
4ed46869
KH
324Lisp_Object Qstart_process, Qopen_network_stream;
325Lisp_Object Qtarget_idx;
326
065e3595
KH
327Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
328Lisp_Object Qinterrupted, Qinsufficient_memory;
329
c7183fb8
GM
330extern Lisp_Object Qcompletion_ignore_case;
331
44e8490d
KH
332/* If a symbol has this property, evaluate the value to define the
333 symbol as a coding system. */
334static Lisp_Object Qcoding_system_define_form;
335
5d5bf4d8
KH
336int coding_system_require_warning;
337
d46c5b12
KH
338Lisp_Object Vselect_safe_coding_system_function;
339
7722baf9
EZ
340/* Mnemonic string for each format of end-of-line. */
341Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
342/* Mnemonic string to indicate format of end-of-line is not yet
4ed46869 343 decided. */
7722baf9 344Lisp_Object eol_mnemonic_undecided;
4ed46869 345
fcbcfb64
KH
346/* Format of end-of-line decided by system. This is Qunix on
347 Unix and Mac, Qdos on DOS/Windows.
348 This has an effect only for external encoding (i.e. for output to
349 file and process), not for in-buffer or Lisp string encoding. */
350static Lisp_Object system_eol_type;
351
4ed46869
KH
#ifdef emacs

Lisp_Object Vcoding_system_list;
Lisp_Object Vcoding_system_alist;

Lisp_Object Qcoding_system_p;
Lisp_Object Qcoding_system_error;

/* The coding systems emacs-mule and raw-text only convert the
   end-of-line format.  */
Lisp_Object Qemacs_mule;
Lisp_Object Qraw_text;
Lisp_Object Qutf_8_emacs;

/* The next three variables pass coding systems between Emacs Lisp
   programs and the internal C routines.  */

/* Coding system for reading files and receiving data from a process.  */
Lisp_Object Vcoding_system_for_read;

/* Coding system for writing files and sending data to a process.  */
Lisp_Object Vcoding_system_for_write;

/* Coding system actually used in the latest I/O.  */
Lisp_Object Vlast_coding_system_used;

/* Set to non-nil when an error is detected during code conversion.  */
Lisp_Object Vlast_code_conversion_error;

/* A vector of length 256 holding information about special Latin
   codes (especially for dealing with Microsoft codes).  */
Lisp_Object Vlatin_extra_code_table;

/* Flag to inhibit code conversion of end-of-line format.  */
int inhibit_eol_conversion;

/* Flag to inhibit ISO2022 escape sequence detection.  */
int inhibit_iso_escape_detection;

/* Flag to inhibit detection of binary files through null bytes.  */
int inhibit_null_byte_detection;

/* Flag to make buffer-file-coding-system inherit from
   process-coding.  */
int inherit_process_coding_system;

/* Coding system used to encode text for terminal display when the
   terminal coding system is nil.  */
struct coding_system safe_terminal_coding;

Lisp_Object Vfile_coding_system_alist;
Lisp_Object Vprocess_coding_system_alist;
Lisp_Object Vnetwork_coding_system_alist;

Lisp_Object Vlocale_coding_system;

#endif /* emacs */
400
f967223b
KH
401/* Flag to tell if we look up translation table on character code
402 conversion. */
84fbb8a0 403Lisp_Object Venable_character_translation;
f967223b
KH
404/* Standard translation table to look up on decoding (reading). */
405Lisp_Object Vstandard_translation_table_for_decode;
406/* Standard translation table to look up on encoding (writing). */
407Lisp_Object Vstandard_translation_table_for_encode;
84fbb8a0 408
f967223b
KH
409Lisp_Object Qtranslation_table;
410Lisp_Object Qtranslation_table_id;
411Lisp_Object Qtranslation_table_for_decode;
412Lisp_Object Qtranslation_table_for_encode;
4ed46869
KH
413
414/* Alist of charsets vs revision number. */
df7492f9 415static Lisp_Object Vcharset_revision_table;
4ed46869 416
02ba4723
KH
417/* Default coding systems used for process I/O. */
418Lisp_Object Vdefault_process_coding_system;
419
002fdb44
DL
420/* Char table for translating Quail and self-inserting input. */
421Lisp_Object Vtranslation_table_for_input;
422
df7492f9
KH
423/* Two special coding systems. */
424Lisp_Object Vsjis_coding_system;
425Lisp_Object Vbig5_coding_system;
426
df7492f9
KH
/* ISO2022 section */

/* The integer stored at index REG of the `coding_attr_iso_initial'
   attribute of CODING.  */
#define CODING_ISO_INITIAL(coding, reg)                                 \
  (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),                    \
                     coding_attr_iso_initial),                          \
               reg)))

/* The entry of CODING->safe_charsets for CHARSET_ID, or -1 when
   CHARSET_ID is greater than CODING->max_charset_id or the entry is
   255.  CHARSET_ID may be evaluated more than once.  */
#define CODING_ISO_REQUEST(coding, charset_id)                          \
  (((charset_id) > (coding)->max_charset_id                             \
    || (coding)->safe_charsets[charset_id] == 255)                      \
   ? -1                                                                 \
   : (coding)->safe_charsets[charset_id])

/* Accessors for the members of CODING->spec.iso_2022.  */
#define CODING_ISO_FLAGS(coding) ((coding)->spec.iso_2022.flags)

#define CODING_ISO_DESIGNATION(coding, reg)                             \
  ((coding)->spec.iso_2022.current_designation[reg])

#define CODING_ISO_INVOCATION(coding, plane)                            \
  ((coding)->spec.iso_2022.current_invocation[plane])

#define CODING_ISO_SINGLE_SHIFTING(coding)                              \
  ((coding)->spec.iso_2022.single_shifting)

#define CODING_ISO_BOL(coding) ((coding)->spec.iso_2022.bol)

/* The designation of the register currently invoked to PLANE.  */
#define CODING_ISO_INVOKED_CHARSET(coding, plane)                       \
  CODING_ISO_DESIGNATION ((coding),                                     \
                          CODING_ISO_INVOCATION ((coding), (plane)))

/* Address of (not the value of) the cmp_status member.  */
#define CODING_ISO_CMP_STATUS(coding)                                   \
  (&(coding)->spec.iso_2022.cmp_status)

#define CODING_ISO_EXTSEGMENT_LEN(coding)                               \
  ((coding)->spec.iso_2022.ctext_extended_segment_len)

#define CODING_ISO_EMBEDDED_UTF_8(coding)                               \
  ((coding)->spec.iso_2022.embedded_utf_8)
df7492f9
KH
461
/* Control characters of ISO2022.  */

/* line-feed */
#define ISO_CODE_LF     0x0A
/* carriage-return */
#define ISO_CODE_CR     0x0D
/* shift-out */
#define ISO_CODE_SO     0x0E
/* shift-in */
#define ISO_CODE_SI     0x0F
/* single-shift-2 for 7-bit code */
#define ISO_CODE_SS2_7  0x19
/* escape */
#define ISO_CODE_ESC    0x1B
/* single-shift-2 */
#define ISO_CODE_SS2    0x8E
/* single-shift-3 */
#define ISO_CODE_SS3    0x8F
/* control-sequence-introducer */
#define ISO_CODE_CSI    0x9B
473
/* Every one-byte code of ISO2022 is classified into one of the
   following classes.  */
enum iso_code_class_type
  {
    /* Control codes in the range 0x00..0x1F and 0x7F, except for the
       codes that have a class of their own right below.  */
    ISO_control_0,
    /* ISO_CODE_SO (0x0E) */
    ISO_shift_out,
    /* ISO_CODE_SI (0x0F) */
    ISO_shift_in,
    /* ISO_CODE_SS2_7 (0x19) */
    ISO_single_shift_2_7,
    /* ISO_CODE_ESC (0x1B) */
    ISO_escape,
    /* Control codes in the range 0x80..0x9F, except for the
       following 3 codes.  */
    ISO_control_1,
    /* ISO_CODE_SS2 (0x8E) */
    ISO_single_shift_2,
    /* ISO_CODE_SS3 (0x8F) */
    ISO_single_shift_3,
    /* ISO_CODE_CSI (0x9B) */
    ISO_control_sequence_introducer,
    /* Codes of the values 0x20 or 0x7F.  */
    ISO_0x20_or_0x7F,
    /* Graphic codes in the range 0x21..0x7E.  */
    ISO_graphic_plane_0,
    /* Codes of the values 0xA0 or 0xFF.  */
    ISO_0xA0_or_0xFF,
    /* Graphic codes in the range 0xA1..0xFE.  */
    ISO_graphic_plane_1
  };
05e6f5dc 496
df7492f9
KH
/** Each macro CODING_ISO_FLAG_XXX defines one flag bit of the
    `iso-flags' attribute of an iso2022 coding system.  */

/* Produce the long-form designation sequence (e.g. ESC $ ( A)
   instead of the correct short-form sequence (e.g. ESC $ A).  */
#define CODING_ISO_FLAG_LONG_FORM               0x000001

/* Reset graphic planes and registers to the initial state at
   end-of-line.  */
#define CODING_ISO_FLAG_RESET_AT_EOL            0x000002

/* Reset graphic planes and registers to the initial state before
   any control characters.  */
#define CODING_ISO_FLAG_RESET_AT_CNTL           0x000004

/* Encode by 7-bit environment.  */
#define CODING_ISO_FLAG_SEVEN_BITS              0x000008

/* Use locking-shift function.  */
#define CODING_ISO_FLAG_LOCKING_SHIFT           0x000010

/* Use single-shift function.  Overwrite
   CODING_ISO_FLAG_LOCKING_SHIFT.  */
#define CODING_ISO_FLAG_SINGLE_SHIFT            0x000020

/* Use designation escape sequence.  */
#define CODING_ISO_FLAG_DESIGNATION             0x000040

/* Produce revision number sequence.  */
#define CODING_ISO_FLAG_REVISION                0x000080

/* Produce ISO6429's direction specifying sequence.  */
#define CODING_ISO_FLAG_DIRECTION               0x000100

/* Assume designation states are reset at beginning of line on
   output.  */
#define CODING_ISO_FLAG_INIT_AT_BOL             0x000200

/* Designation sequence should be placed at beginning of line on
   output.  */
#define CODING_ISO_FLAG_DESIGNATE_AT_BOL        0x000400

/* Do not encode unsafe characters on output.  */
#define CODING_ISO_FLAG_SAFE                    0x000800

/* Extra latin codes (128..159) are accepted as a valid code on
   input.  */
#define CODING_ISO_FLAG_LATIN_EXTRA             0x001000

#define CODING_ISO_FLAG_COMPOSITION             0x002000

#define CODING_ISO_FLAG_EUC_TW_SHIFT            0x004000

#define CODING_ISO_FLAG_USE_ROMAN               0x008000

#define CODING_ISO_FLAG_USE_OLDJIS              0x010000

/* NOTE(review): this value is not the next bit after
   CODING_ISO_FLAG_USE_OLDJIS; presumably intentional -- confirm.  */
#define CODING_ISO_FLAG_FULL_SUPPORT            0x100000

/* A character to be produced on output if encoding of the original
   character is prohibited by CODING_ISO_FLAG_SAFE.  */
#define CODING_INHIBIT_CHARACTER_SUBSTITUTION   '?'
aa72b389 559
a470d443
KH
/* UTF-8 section */

/* The utf_8_bom member of CODING->spec.  */
#define CODING_UTF_8_BOM(coding) ((coding)->spec.utf_8_bom)

/* UTF-16 section */

/* Accessors for the members of CODING->spec.utf_16.  */
#define CODING_UTF_16_BOM(coding) ((coding)->spec.utf_16.bom)

#define CODING_UTF_16_ENDIAN(coding) ((coding)->spec.utf_16.endian)

#define CODING_UTF_16_SURROGATE(coding) ((coding)->spec.utf_16.surrogate)


/* CCL section */

/* The `coding_attr_ccl_decoder' attribute of CODING.  */
#define CODING_CCL_DECODER(coding)                                      \
  AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)

/* The `coding_attr_ccl_encoder' attribute of CODING.  */
#define CODING_CCL_ENCODER(coding)                                      \
  AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)

/* SDATA applied to the `coding_attr_ccl_valids' attribute of
   CODING.  */
#define CODING_CCL_VALIDS(coding)                                       \
  (SDATA (AREF (CODING_ID_ATTRS ((coding)->id),                         \
                coding_attr_ccl_valids)))
4ed46869 582
/* Index for each coding category in `coding_categories' */

enum coding_category
  {
    coding_category_iso_7,
    coding_category_iso_7_tight,
    coding_category_iso_8_1,
    coding_category_iso_8_2,
    coding_category_iso_7_else,
    coding_category_iso_8_else,
    coding_category_utf_8_auto,
    coding_category_utf_8_nosig,
    coding_category_utf_8_sig,
    coding_category_utf_16_auto,
    coding_category_utf_16_be,
    coding_category_utf_16_le,
    coding_category_utf_16_be_nosig,
    coding_category_utf_16_le_nosig,
    coding_category_charset,
    coding_category_sjis,
    coding_category_big5,
    coding_category_ccl,
    coding_category_emacs_mule,
    /* All above are targets of code detection.  */
    coding_category_raw_text,
    coding_category_undecided,
    coding_category_max
  };

/* Flag bits used in detect_coding_XXXX: one bit per coding category,
   at the position given by the category's enum value.  */
#define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
#define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
#define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
#define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
#define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
#define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
#define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
#define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
#define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
#define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
#define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
#define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
#define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
#define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
#define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
#define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
#define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
#define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
#define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
#define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)

/* Group masks.  */
#define CATEGORY_MASK_ISO_7BIT                                          \
  (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)

#define CATEGORY_MASK_ISO_8BIT                                          \
  (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)

#define CATEGORY_MASK_ISO_ELSE                                          \
  (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)

/* The 7-bit categories together with the two `else' categories.  */
#define CATEGORY_MASK_ISO_ESCAPE                                        \
  (CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_ELSE)

#define CATEGORY_MASK_ISO                                               \
  (CATEGORY_MASK_ISO_7BIT                                               \
   | CATEGORY_MASK_ISO_8BIT                                             \
   | CATEGORY_MASK_ISO_ELSE)

#define CATEGORY_MASK_UTF_16                                            \
  (CATEGORY_MASK_UTF_16_AUTO                                            \
   | CATEGORY_MASK_UTF_16_BE                                            \
   | CATEGORY_MASK_UTF_16_LE                                            \
   | CATEGORY_MASK_UTF_16_BE_NOSIG                                      \
   | CATEGORY_MASK_UTF_16_LE_NOSIG)

#define CATEGORY_MASK_UTF_8                                             \
  (CATEGORY_MASK_UTF_8_AUTO                                             \
   | CATEGORY_MASK_UTF_8_NOSIG                                          \
   | CATEGORY_MASK_UTF_8_SIG)

/* This value is returned if detect_coding_mask () finds nothing other
   than ASCII characters.  It has the bit of every category from
   coding_category_iso_7 through coding_category_emacs_mule.  */
#define CATEGORY_MASK_ANY                                               \
  (CATEGORY_MASK_ISO                                                    \
   | CATEGORY_MASK_UTF_8                                                \
   | CATEGORY_MASK_UTF_16                                               \
   | CATEGORY_MASK_CHARSET                                              \
   | CATEGORY_MASK_SJIS                                                 \
   | CATEGORY_MASK_BIG5                                                 \
   | CATEGORY_MASK_CCL                                                  \
   | CATEGORY_MASK_EMACS_MULE)
df7492f9
KH
689
690/* List of symbols `coding-category-xxx' ordered by priority. This
691 variable is exposed to Emacs Lisp. */
692static Lisp_Object Vcoding_category_list;
693
694/* Table of coding categories (Lisp symbols). This variable is for
695 internal use oly. */
696static Lisp_Object Vcoding_category_table;
697
698/* Table of coding-categories ordered by priority. */
699static enum coding_category coding_priorities[coding_category_max];
700
701/* Nth element is a coding context for the coding system bound to the
702 Nth coding category. */
703static struct coding_system coding_categories[coding_category_max];
704
df7492f9
KH
705/*** Commonly used macros and functions ***/
706
/* Smaller and larger of two values.  Each definition is skipped when
   a macro of that name already exists.  The selected argument is
   evaluated twice.  */
#if !defined (min)
# define min(a, b)  (((a) < (b)) ? (a) : (b))
#endif
#if !defined (max)
# define max(a, b)  (((a) > (b)) ? (a) : (b))
#endif
4ed46869 713
24a73b0a
KH
/* Set ATTRS to the attributes of CODING (CODING_ID_ATTRS of its id)
   and CHARSET_LIST to CODING_ATTR_CHARSET_LIST of those attributes.
   ATTRS and CHARSET_LIST must be lvalues.  */
#define CODING_GET_INFO(coding, attrs, charset_list)                    \
  do                                                                    \
    {                                                                   \
      (attrs) = CODING_ID_ATTRS ((coding)->id);                         \
      (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);                \
    }                                                                   \
  while (0)
4ed46869 719
4ed46869 720
df7492f9
KH
/* Fetch the next byte of the source into C and advance SRC.

   When SRC has already reached SRC_END, jump to the label
   `no_more_source'; before jumping, if SRC_BASE is behind SRC,
   record CODING_RESULT_INSUFFICIENT_SRC for CODING.

   When MULTIBYTEP is nonzero and the fetched byte has its high bit
   set, one of two things happens:
     - the byte is 0xC0 or 0xC1: the following byte is consumed as
       well and C becomes ((lead & 1) << 6) | trail;
     - otherwise: the character is re-read from its first byte with
       string_char, C is set to the negative value of the character
       code, and CODING_RESULT_INVALID_SRC is recorded for CODING.

   Whenever no jump is taken, CONSUMED_CHARS is incremented by one.

   The caller should declare and set these variables appropriately
   in advance:
        src, src_end, src_base, multibytep, consumed_chars, coding
   and provide the label `no_more_source'.  */

#define ONE_MORE_BYTE(c)                                                \
  do                                                                    \
    {                                                                   \
      if (src == src_end)                                               \
        {                                                               \
          if (src_base < src)                                           \
            record_conversion_result (coding,                           \
                                      CODING_RESULT_INSUFFICIENT_SRC);  \
          goto no_more_source;                                          \
        }                                                               \
      c = *src++;                                                       \
      if (multibytep && (c & 0x80))                                     \
        {                                                               \
          if ((c & 0xFE) != 0xC0)                                       \
            {                                                           \
              src--;                                                    \
              c = - string_char (src, &src, NULL);                      \
              record_conversion_result (coding,                         \
                                        CODING_RESULT_INVALID_SRC);     \
            }                                                           \
          else                                                          \
            c = ((c & 1) << 6) | *src++;                                \
        }                                                               \
      consumed_chars++;                                                 \
    }                                                                   \
  while (0)
753
/* Fetch two bytes of the source into C1 and C2, advancing SRC, after
   skipping any leading multibyte characters.  If the source runs out
   before both are fetched, jump to `no_more_source'.

   Fetching C1: when MULTIBYTEP is nonzero and the byte has its high
   bit set, a lead byte of 0xC0 or 0xC1 is combined with the next byte
   into ((lead & 1) << 6) | trail; any other such byte makes SRC skip
   BYTES_BY_CHAR_HEAD (byte) - 1 further bytes and the fetch is
   retried.

   Fetching C2: the 0xC0/0xC1 case is handled the same way; for any
   other byte with the high bit set (MULTIBYTEP nonzero) C2 is set to
   -1 and SRC is not advanced any further.

   This macro neither counts consumed characters nor records a
   conversion result.  The caller should declare and set these
   variables appropriately in advance:
        src, src_end, multibytep
   It is intended to be used in detect_coding_utf_16.  */

#define TWO_MORE_BYTES(c1, c2)                                          \
  do                                                                    \
    {                                                                   \
      do                                                                \
        {                                                               \
          if (src == src_end)                                           \
            goto no_more_source;                                        \
          c1 = *src++;                                                  \
          if (multibytep && (c1 & 0x80))                                \
            {                                                           \
              if ((c1 & 0xFE) != 0xC0)                                  \
                {                                                       \
                  src += BYTES_BY_CHAR_HEAD (c1) - 1;                   \
                  c1 = -1;                                              \
                }                                                       \
              else                                                      \
                c1 = ((c1 & 1) << 6) | *src++;                          \
            }                                                           \
        }                                                               \
      while (c1 < 0);                                                   \
      if (src == src_end)                                               \
        goto no_more_source;                                            \
      c2 = *src++;                                                      \
      if (multibytep && (c2 & 0x80))                                    \
        {                                                               \
          if ((c2 & 0xFE) != 0xC0)                                      \
            c2 = -1;                                                    \
          else                                                          \
            c2 = ((c2 & 1) << 6) | *src++;                              \
        }                                                               \
    }                                                                   \
  while (0)
792
aa72b389 793
065e3595
KH
/* Like ONE_MORE_BYTE, but without the test of SRC against SRC_END:
   the byte at SRC is fetched into C unconditionally and there is no
   jump to `no_more_source'.  The caller should declare and set these
   variables appropriately in advance:
        src, multibytep, consumed_chars, coding  */

#define ONE_MORE_BYTE_NO_CHECK(c)                                       \
  do                                                                    \
    {                                                                   \
      c = *src++;                                                       \
      if (multibytep && (c & 0x80))                                     \
        {                                                               \
          if ((c & 0xFE) != 0xC0)                                       \
            {                                                           \
              src--;                                                    \
              c = - string_char (src, &src, NULL);                      \
              record_conversion_result (coding,                         \
                                        CODING_RESULT_INVALID_SRC);     \
            }                                                           \
          else                                                          \
            c = ((c & 1) << 6) | *src++;                                \
        }                                                               \
      consumed_chars++;                                                 \
    }                                                                   \
  while (0)
811
aa72b389 812
df7492f9
KH
/* Count one produced character, store the byte C at DST and advance
   DST to the next free position.  The caller should make sure that C
   is in the range 0..127, and should declare and set the variables
   `dst' and `produced_chars' appropriately in advance.  */

#define EMIT_ONE_ASCII_BYTE(c)                                          \
  do                                                                    \
    {                                                                   \
      produced_chars++;                                                 \
      *dst++ = (c);                                                     \
    }                                                                   \
  while (0)
aa72b389
KH
825
826
/* Like EMIT_ONE_ASCII_BYTE, but store two bytes: C1 first, then C2.
   `produced_chars' grows by two.  */

#define EMIT_TWO_ASCII_BYTES(c1, c2)    \
  do {                                  \
    produced_chars += 2;                \
    *dst++ = (c1);                      \
    *dst++ = (c2);                      \
  } while (0)
aa72b389
KH
834
835
/* Emit the byte C at DST, advance DST past what was written, and
   count one more produced character in `produced_chars'.

   If `multibytep' is zero the byte is stored as is.  Otherwise it is
   stored in multibyte form via CHAR_STRING_ADVANCE; a byte >= 0x80 is
   first mapped with BYTE8_TO_CHAR.  The caller must declare and set
   `dst', `produced_chars' and `multibytep'.  */

#define EMIT_ONE_BYTE(c)                        \
  do {                                          \
    produced_chars++;                           \
    if (! multibytep)                           \
      *dst++ = (c);                             \
    else                                        \
      {                                         \
        int ch = (c);                           \
                                                \
        if (ch >= 0x80)                         \
          ch = BYTE8_TO_CHAR (ch);              \
        CHAR_STRING_ADVANCE (ch, dst);          \
      }                                         \
  } while (0)
aa72b389 855
aa72b389 856
/* Like EMIT_ONE_BYTE, but emit two bytes: C1 first, then C2.
   `produced_chars' grows by two.  */

#define EMIT_TWO_BYTES(c1, c2)                  \
  do {                                          \
    produced_chars += 2;                        \
    if (! multibytep)                           \
      {                                         \
        *dst++ = (c1);                          \
        *dst++ = (c2);                          \
      }                                         \
    else                                        \
      {                                         \
        int ch = (c1);                          \
                                                \
        if (ch >= 0x80)                         \
          ch = BYTE8_TO_CHAR (ch);              \
        CHAR_STRING_ADVANCE (ch, dst);          \
                                                \
        ch = (c2);                              \
        if (ch >= 0x80)                         \
          ch = BYTE8_TO_CHAR (ch);              \
        CHAR_STRING_ADVANCE (ch, dst);          \
      }                                         \
  } while (0)
881
882
/* Emit the three bytes C1, C2 and C3, in that order, using the same
   rules as EMIT_ONE_BYTE.  */

#define EMIT_THREE_BYTES(c1, c2, c3)    \
  do {                                  \
    EMIT_TWO_BYTES (c1, c2);            \
    EMIT_ONE_BYTE (c3);                 \
  } while (0)
aa72b389 888
aa72b389 889
/* Emit the four bytes C1, C2, C3 and C4, in that order, using the
   same rules as EMIT_ONE_BYTE.  */

#define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
  do {                                          \
    EMIT_ONE_BYTE (c1);                         \
    EMIT_THREE_BYTES (c2, c3, c4);              \
  } while (0)
aa72b389 895
aa72b389 896
f6cbaf43
KH
897/* Prototypes for static functions. */
898static void record_conversion_result P_ ((struct coding_system *coding,
899 enum coding_result_code result));
900static int detect_coding_utf_8 P_ ((struct coding_system *,
901 struct coding_detection_info *info));
902static void decode_coding_utf_8 P_ ((struct coding_system *));
903static int encode_coding_utf_8 P_ ((struct coding_system *));
904
905static int detect_coding_utf_16 P_ ((struct coding_system *,
906 struct coding_detection_info *info));
907static void decode_coding_utf_16 P_ ((struct coding_system *));
908static int encode_coding_utf_16 P_ ((struct coding_system *));
909
910static int detect_coding_iso_2022 P_ ((struct coding_system *,
911 struct coding_detection_info *info));
912static void decode_coding_iso_2022 P_ ((struct coding_system *));
913static int encode_coding_iso_2022 P_ ((struct coding_system *));
914
915static int detect_coding_emacs_mule P_ ((struct coding_system *,
916 struct coding_detection_info *info));
917static void decode_coding_emacs_mule P_ ((struct coding_system *));
918static int encode_coding_emacs_mule P_ ((struct coding_system *));
919
920static int detect_coding_sjis P_ ((struct coding_system *,
921 struct coding_detection_info *info));
922static void decode_coding_sjis P_ ((struct coding_system *));
923static int encode_coding_sjis P_ ((struct coding_system *));
924
925static int detect_coding_big5 P_ ((struct coding_system *,
926 struct coding_detection_info *info));
927static void decode_coding_big5 P_ ((struct coding_system *));
928static int encode_coding_big5 P_ ((struct coding_system *));
929
930static int detect_coding_ccl P_ ((struct coding_system *,
931 struct coding_detection_info *info));
932static void decode_coding_ccl P_ ((struct coding_system *));
933static int encode_coding_ccl P_ ((struct coding_system *));
934
935static void decode_coding_raw_text P_ ((struct coding_system *));
936static int encode_coding_raw_text P_ ((struct coding_system *));
937
938static void coding_set_source P_ ((struct coding_system *));
939static void coding_set_destination P_ ((struct coding_system *));
940static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
941static void coding_alloc_by_making_gap P_ ((struct coding_system *,
287c57d7 942 EMACS_INT, EMACS_INT));
f6cbaf43
KH
943static unsigned char *alloc_destination P_ ((struct coding_system *,
944 EMACS_INT, unsigned char *));
945static void setup_iso_safe_charsets P_ ((Lisp_Object));
946static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
947 int *, int *,
948 unsigned char *));
949static int detect_eol P_ ((const unsigned char *,
950 EMACS_INT, enum coding_category));
951static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
952static void decode_eol P_ ((struct coding_system *));
953static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
e951386e 954static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *));
f6cbaf43 955static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
f6cbaf43
KH
956static INLINE void produce_charset P_ ((struct coding_system *, int *,
957 EMACS_INT));
958static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
959static int decode_coding P_ ((struct coding_system *));
960static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
3ed051d4 961 struct coding_system *,
f6cbaf43
KH
962 int *, EMACS_INT *));
963static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
964 struct coding_system *,
965 int *, EMACS_INT *));
966static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
967static int encode_coding P_ ((struct coding_system *));
968static Lisp_Object make_conversion_work_buffer P_ ((int));
969static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
970static INLINE int char_encodable_p P_ ((int, Lisp_Object));
971static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
972
065e3595
KH
973static void
974record_conversion_result (struct coding_system *coding,
975 enum coding_result_code result)
976{
977 coding->result = result;
978 switch (result)
979 {
980 case CODING_RESULT_INSUFFICIENT_SRC:
981 Vlast_code_conversion_error = Qinsufficient_source;
982 break;
983 case CODING_RESULT_INCONSISTENT_EOL:
984 Vlast_code_conversion_error = Qinconsistent_eol;
985 break;
986 case CODING_RESULT_INVALID_SRC:
987 Vlast_code_conversion_error = Qinvalid_source;
988 break;
989 case CODING_RESULT_INTERRUPT:
990 Vlast_code_conversion_error = Qinterrupted;
991 break;
992 case CODING_RESULT_INSUFFICIENT_MEM:
993 Vlast_code_conversion_error = Qinsufficient_memory;
994 break;
35befdaa
KH
995 default:
996 Vlast_code_conversion_error = intern ("Unknown error");
065e3595
KH
997 }
998}
999
/* Decode CODE of CHARSET with DECODE_CHAR and store the result in C.

   `charset_map_loaded' is cleared before the call.  If it is nonzero
   afterwards, coding_set_source is called to refresh CODING->source,
   and SRC, SRC_BASE and SRC_END are shifted by however far
   CODING->source moved, so that they keep pointing at the same text.  */

#define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
  do {                                                                  \
    charset_map_loaded = 0;                                             \
    c = DECODE_CHAR (charset, code);                                    \
    if (charset_map_loaded != 0)                                        \
      {                                                                 \
        const unsigned char *orig = coding->source;                     \
        EMACS_INT offset;                                               \
                                                                        \
        coding_set_source (coding);                                     \
        offset = coding->source - orig;                                 \
        src_end += offset;                                              \
        src_base += offset;                                             \
        src += offset;                                                  \
      }                                                                 \
  } while (0)
1016
1017
/* Make sure there is room for BYTES more bytes at DST.

   If DST + BYTES reaches or passes DST_END, call alloc_destination to
   enlarge coding->destination, then update DST and DST_END for the
   new destination.  The amount requested is BYTES plus one byte for
   each element still left between CHARBUF and CHARBUF_END.

   The caller must declare and set `coding', `dst', `dst_end',
   `charbuf' and `charbuf_end'.

   The original comment says coding->source may be relocated as well,
   and that this is handled by calling coding_set_source in
   encode_coding rather than here.  */

#define ASSURE_DESTINATION(bytes)                               \
  do {                                                          \
    if (dst + (bytes) >= dst_end)                               \
      {                                                         \
        /* EMACS_INT, not int: alloc_destination takes an       \
           EMACS_INT, and a pointer difference may not fit in   \
           an int.  */                                          \
        EMACS_INT more_bytes = charbuf_end - charbuf + (bytes); \
                                                                \
        dst = alloc_destination (coding, more_bytes, dst);      \
        dst_end = coding->destination + coding->dst_bytes;      \
      }                                                         \
  } while (0)
aa72b389 1033
aa72b389 1034
/* Store the multibyte form of the character C at P, and advance P to
   the end of that form.  According to the original comment this is
   like CHAR_STRING_ADVANCE except that it never calls
   MAYBE_UNIFY_CHAR.

   The form is 1 to 5 bytes long depending on which of
   MAX_1_BYTE_CHAR .. MAX_5_BYTE_CHAR the character fits under; a
   character above MAX_5_BYTE_CHAR is written with BYTE8_STRING after
   subtracting 0x3FFF80.

   C and P are evaluated more than once, so neither may have side
   effects.  Every use of C is parenthesized so that an argument
   containing a low-precedence operator (e.g. `a | b') is encoded
   correctly.  */

#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)              \
  do {                                                  \
    if ((c) <= MAX_1_BYTE_CHAR)                         \
      *(p)++ = (c);                                     \
    else if ((c) <= MAX_2_BYTE_CHAR)                    \
      *(p)++ = (0xC0 | ((c) >> 6)),                     \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_3_BYTE_CHAR)                    \
      *(p)++ = (0xE0 | ((c) >> 12)),                    \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_4_BYTE_CHAR)                    \
      *(p)++ = (0xF0 | ((c) >> 18)),                    \
        *(p)++ = (0x80 | (((c) >> 12) & 0x3F)),         \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_5_BYTE_CHAR)                    \
      *(p)++ = 0xF8,                                    \
        *(p)++ = (0x80 | (((c) >> 18) & 0x0F)),         \
        *(p)++ = (0x80 | (((c) >> 12) & 0x3F)),         \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else                                                \
      (p) += BYTE8_STRING ((c) - 0x3FFF80, p);          \
  } while (0)
1064
1065
/* Return the character code of the character whose multibyte form
   starts at P, and advance P to the end of that form.  According to
   the original comment this is like STRING_CHAR_ADVANCE except that
   it never calls MAYBE_UNIFY_CHAR.

   The length (1 to 5 bytes) is taken from the bits 0x80, 0x20, 0x10
   and 0x08 of the first byte.  A two-byte form whose head is below
   0xC2 has 0x3FFF80 ORed into the result.  P is evaluated several
   times and must not have side effects.  */

#define STRING_CHAR_ADVANCE_NO_UNIFY(p)                         \
  (((p)[0] & 0x80) == 0                                         \
   ? *(p)++                                                     \
   : ((p)[0] & 0x20) == 0                                       \
   ? ((p) += 2,                                                 \
      ((((p)[-2] & 0x1F) << 6)                                  \
       | ((p)[-1] & 0x3F)                                       \
       | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)))    \
   : ((p)[0] & 0x10) == 0                                       \
   ? ((p) += 3,                                                 \
      ((((p)[-3] & 0x0F) << 12)                                 \
       | (((p)[-2] & 0x3F) << 6)                                \
       | ((p)[-1] & 0x3F)))                                     \
   : ((p)[0] & 0x08) == 0                                       \
   ? ((p) += 4,                                                 \
      ((((p)[-4] & 0xF) << 18)                                  \
       | (((p)[-3] & 0x3F) << 12)                               \
       | (((p)[-2] & 0x3F) << 6)                                \
       | ((p)[-1] & 0x3F)))                                     \
   : ((p) += 5,                                                 \
      ((((p)[-4] & 0x3F) << 18)                                 \
       | (((p)[-3] & 0x3F) << 12)                               \
       | (((p)[-2] & 0x3F) << 6)                                \
       | ((p)[-1] & 0x3F))))
1094
aa72b389 1095
df7492f9
KH
1096static void
1097coding_set_source (coding)
aa72b389 1098 struct coding_system *coding;
aa72b389 1099{
df7492f9
KH
1100 if (BUFFERP (coding->src_object))
1101 {
2cb26057 1102 struct buffer *buf = XBUFFER (coding->src_object);
aa72b389 1103
df7492f9 1104 if (coding->src_pos < 0)
2cb26057 1105 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
df7492f9 1106 else
2cb26057 1107 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
aa72b389 1108 }
df7492f9 1109 else if (STRINGP (coding->src_object))
aa72b389 1110 {
8f924df7 1111 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
aa72b389 1112 }
df7492f9
KH
1113 else
1114 /* Otherwise, the source is C string and is never relocated
1115 automatically. Thus we don't have to update anything. */
1116 ;
1117}
aa72b389 1118
df7492f9
KH
1119static void
1120coding_set_destination (coding)
1121 struct coding_system *coding;
1122{
1123 if (BUFFERP (coding->dst_object))
aa72b389 1124 {
df7492f9 1125 if (coding->src_pos < 0)
aa72b389 1126 {
13818c30 1127 coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
28f67a95
KH
1128 coding->dst_bytes = (GAP_END_ADDR
1129 - (coding->src_bytes - coding->consumed)
1130 - coding->destination);
aa72b389 1131 }
df7492f9 1132 else
28f67a95
KH
1133 {
1134 /* We are sure that coding->dst_pos_byte is before the gap
1135 of the buffer. */
1136 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
13818c30 1137 + coding->dst_pos_byte - BEG_BYTE);
28f67a95
KH
1138 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1139 - coding->destination);
1140 }
df7492f9
KH
1141 }
1142 else
1143 /* Otherwise, the destination is C string and is never relocated
1144 automatically. Thus we don't have to update anything. */
1145 ;
1146}
1147
1148
1149static void
1150coding_alloc_by_realloc (coding, bytes)
1151 struct coding_system *coding;
1152 EMACS_INT bytes;
1153{
1154 coding->destination = (unsigned char *) xrealloc (coding->destination,
1155 coding->dst_bytes + bytes);
1156 coding->dst_bytes += bytes;
1157}
1158
1159static void
db274c7a 1160coding_alloc_by_making_gap (coding, gap_head_used, bytes)
df7492f9 1161 struct coding_system *coding;
db274c7a 1162 EMACS_INT gap_head_used, bytes;
df7492f9 1163{
db274c7a 1164 if (EQ (coding->src_object, coding->dst_object))
df7492f9 1165 {
db274c7a
KH
1166 /* The gap may contain the produced data at the head and not-yet
1167 consumed data at the tail. To preserve those data, we at
1168 first make the gap size to zero, then increase the gap
1169 size. */
1170 EMACS_INT add = GAP_SIZE;
1171
1172 GPT += gap_head_used, GPT_BYTE += gap_head_used;
1173 GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
df7492f9
KH
1174 make_gap (bytes);
1175 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
db274c7a 1176 GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
df7492f9 1177 }
730fff51 1178 else
df7492f9 1179 {
2c78b7e1
KH
1180 Lisp_Object this_buffer;
1181
1182 this_buffer = Fcurrent_buffer ();
df7492f9
KH
1183 set_buffer_internal (XBUFFER (coding->dst_object));
1184 make_gap (bytes);
1185 set_buffer_internal (XBUFFER (this_buffer));
aa72b389 1186 }
df7492f9 1187}
8f924df7 1188
df7492f9
KH
1189
1190static unsigned char *
1191alloc_destination (coding, nbytes, dst)
1192 struct coding_system *coding;
3e139625 1193 EMACS_INT nbytes;
df7492f9
KH
1194 unsigned char *dst;
1195{
1196 EMACS_INT offset = dst - coding->destination;
1197
1198 if (BUFFERP (coding->dst_object))
db274c7a
KH
1199 {
1200 struct buffer *buf = XBUFFER (coding->dst_object);
1201
1202 coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1203 }
aa72b389 1204 else
df7492f9 1205 coding_alloc_by_realloc (coding, nbytes);
065e3595 1206 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
1207 coding_set_destination (coding);
1208 dst = coding->destination + offset;
1209 return dst;
1210}
aa72b389 1211
ff0dacd7
KH
1212/** Macros for annotations. */
1213
ff0dacd7
KH
1214/* An annotation data is stored in the array coding->charbuf in this
1215 format:
69a80ea3 1216 [ -LENGTH ANNOTATION_MASK NCHARS ... ]
ff0dacd7
KH
1217 LENGTH is the number of elements in the annotation.
1218 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
69a80ea3 1219 NCHARS is the number of characters in the text annotated.
ff0dacd7
KH
1220
1221 The format of the following elements depends on ANNOTATION_MASK.
1222
1223 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1224 follow:
e951386e
KH
1225 ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1226
1227 NBYTES is the number of bytes specified in the header part of
1228 old-style emacs-mule encoding, or 0 for the other kind of
1229 composition.
1230
ff0dacd7 1231 METHOD is one of enum composition_method.
e951386e 1232
ff0dacd7
KH
1233 Optional COMPOSITION-COMPONENTS are characters and composition
1234 rules.
1235
1236 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
e951386e
KH
1237 follows.
1238
1239 If ANNOTATION_MASK is 0, this annotation is just a space holder to
1240 recover from an invalid annotation, and should be skipped by
1241 produce_annotation. */
1242
1243/* Maximum length of the header of annotation data. */
1244#define MAX_ANNOTATION_LENGTH 5
ff0dacd7 1245
/* Store the three-element annotation header [ -LEN MASK NCHARS ] at
   BUF, advance BUF past it, and mark `coding' as annotated.  The
   caller must have `coding' in scope.

   The expansion deliberately has no trailing semicolon, so that
   `ADD_ANNOTATION_DATA (...);' is exactly one statement and can be
   used as the body of an `if' that has an `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1253
/* Store a composition annotation header at BUF and advance BUF past
   it.  The header is the generic annotation header (length 5, mask
   CODING_ANNOTATE_COMPOSITION_MASK, NCHARS) followed by NBYTES and
   METHOD.

   BUF, NBYTES and METHOD are parenthesized in the expansion, as in
   ADD_ANNOTATION_DATA, so that an argument such as `*pp' advances the
   pointer that was meant.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1260
1261
/* Store a charset annotation at BUF and advance BUF past it.  The
   annotation is the generic annotation header (length 4, mask
   CODING_ANNOTATE_CHARSET_MASK, NCHARS) followed by the charset ID.

   BUF and ID are parenthesized in the expansion, as in
   ADD_ANNOTATION_DATA, so that an argument such as `*pp' advances the
   pointer that was meant.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1267
df7492f9
KH
1268\f
1269/*** 2. Emacs' internal format (emacs-utf-8) ***/
1270
1271
1272
1273\f
1274/*** 3. UTF-8 ***/
1275
1276/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
1277 Check if a text is encoded in UTF-8. If it is, return 1, else
1278 return 0. */
df7492f9
KH
1279
/* Classification of a UTF-8 octet C by its high bits.  */

/* C is below 0x80, i.e. it forms a one-octet sequence.  */
#define UTF_8_1_OCTET_P(c)              ((c) < 0x80)
/* C has the form 10xxxxxx (a trailing octet).  */
#define UTF_8_EXTRA_OCTET_P(c)          (((c) & 0xC0) == 0x80)
/* C has the form 110xxxxx.  */
#define UTF_8_2_OCTET_LEADING_P(c)      (((c) & 0xE0) == 0xC0)
/* C has the form 1110xxxx.  */
#define UTF_8_3_OCTET_LEADING_P(c)      (((c) & 0xF0) == 0xE0)
/* C has the form 11110xxx.  */
#define UTF_8_4_OCTET_LEADING_P(c)      (((c) & 0xF8) == 0xF0)
/* C has the form 111110xx.  */
#define UTF_8_5_OCTET_LEADING_P(c)      (((c) & 0xFC) == 0xF8)

/* The byte order mark character, and the three octets that encode it
   in UTF-8.  */
#define UTF_BOM         0xFEFF
#define UTF_8_BOM_1     0xEF
#define UTF_8_BOM_2     0xBB
#define UTF_8_BOM_3     0xBF
1291
df7492f9 1292static int
ff0dacd7 1293detect_coding_utf_8 (coding, detect_info)
df7492f9 1294 struct coding_system *coding;
ff0dacd7 1295 struct coding_detection_info *detect_info;
df7492f9 1296{
065e3595 1297 const unsigned char *src = coding->source, *src_base;
8f924df7 1298 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
1299 int multibytep = coding->src_multibyte;
1300 int consumed_chars = 0;
a470d443 1301 int bom_found = 0;
df7492f9
KH
1302 int found = 0;
1303
ff0dacd7 1304 detect_info->checked |= CATEGORY_MASK_UTF_8;
df7492f9
KH
1305 /* A coding system of this category is always ASCII compatible. */
1306 src += coding->head_ascii;
1307
1308 while (1)
aa72b389 1309 {
df7492f9 1310 int c, c1, c2, c3, c4;
aa72b389 1311
065e3595 1312 src_base = src;
df7492f9 1313 ONE_MORE_BYTE (c);
065e3595 1314 if (c < 0 || UTF_8_1_OCTET_P (c))
df7492f9
KH
1315 continue;
1316 ONE_MORE_BYTE (c1);
065e3595 1317 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
df7492f9
KH
1318 break;
1319 if (UTF_8_2_OCTET_LEADING_P (c))
aa72b389 1320 {
a470d443 1321 found = 1;
df7492f9 1322 continue;
aa72b389 1323 }
df7492f9 1324 ONE_MORE_BYTE (c2);
065e3595 1325 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
df7492f9
KH
1326 break;
1327 if (UTF_8_3_OCTET_LEADING_P (c))
aa72b389 1328 {
a470d443
KH
1329 found = 1;
1330 if (src_base == coding->source
1331 && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1332 bom_found = 1;
df7492f9 1333 continue;
aa72b389 1334 }
df7492f9 1335 ONE_MORE_BYTE (c3);
065e3595 1336 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
df7492f9
KH
1337 break;
1338 if (UTF_8_4_OCTET_LEADING_P (c))
aa72b389 1339 {
a470d443 1340 found = 1;
df7492f9
KH
1341 continue;
1342 }
1343 ONE_MORE_BYTE (c4);
065e3595 1344 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
df7492f9
KH
1345 break;
1346 if (UTF_8_5_OCTET_LEADING_P (c))
1347 {
a470d443 1348 found = 1;
df7492f9
KH
1349 continue;
1350 }
1351 break;
aa72b389 1352 }
ff0dacd7 1353 detect_info->rejected |= CATEGORY_MASK_UTF_8;
df7492f9 1354 return 0;
aa72b389 1355
df7492f9 1356 no_more_source:
065e3595 1357 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
aa72b389 1358 {
ff0dacd7 1359 detect_info->rejected |= CATEGORY_MASK_UTF_8;
89528eb3 1360 return 0;
aa72b389 1361 }
a470d443
KH
1362 if (bom_found)
1363 {
1364 /* The first character 0xFFFE doesn't necessarily mean a BOM. */
1365 detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1366 }
1367 else
1368 {
1369 detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
0e17387a
KH
1370 if (found)
1371 detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
a470d443 1372 }
ff0dacd7 1373 return 1;
aa72b389
KH
1374}
1375
4ed46869 1376
b73bfc1c 1377static void
df7492f9 1378decode_coding_utf_8 (coding)
b73bfc1c 1379 struct coding_system *coding;
b73bfc1c 1380{
8f924df7
KH
1381 const unsigned char *src = coding->source + coding->consumed;
1382 const unsigned char *src_end = coding->source + coding->src_bytes;
1383 const unsigned char *src_base;
69a80ea3
KH
1384 int *charbuf = coding->charbuf + coding->charbuf_used;
1385 int *charbuf_end = coding->charbuf + coding->charbuf_size;
453b38f0 1386 int consumed_chars = 0, consumed_chars_base = 0;
df7492f9 1387 int multibytep = coding->src_multibyte;
a470d443 1388 enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
24a73b0a 1389 Lisp_Object attr, charset_list;
0a9564cb
EZ
1390 int eol_crlf =
1391 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 1392 int byte_after_cr = -1;
4ed46869 1393
24a73b0a 1394 CODING_GET_INFO (coding, attr, charset_list);
df7492f9 1395
a470d443
KH
1396 if (bom != utf_without_bom)
1397 {
1398 int c1, c2, c3;
1399
1400 src_base = src;
1401 ONE_MORE_BYTE (c1);
1402 if (! UTF_8_3_OCTET_LEADING_P (c1))
1403 src = src_base;
1404 else
1405 {
159bd5a2 1406 ONE_MORE_BYTE (c2);
a470d443
KH
1407 if (! UTF_8_EXTRA_OCTET_P (c2))
1408 src = src_base;
1409 else
1410 {
159bd5a2 1411 ONE_MORE_BYTE (c3);
a470d443
KH
1412 if (! UTF_8_EXTRA_OCTET_P (c3))
1413 src = src_base;
1414 else
1415 {
1416 if ((c1 != UTF_8_BOM_1)
1417 || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1418 src = src_base;
1419 else
1420 CODING_UTF_8_BOM (coding) = utf_without_bom;
1421 }
1422 }
1423 }
1424 }
1425 CODING_UTF_8_BOM (coding) = utf_without_bom;
1426
1427
1428
df7492f9 1429 while (1)
b73bfc1c 1430 {
df7492f9 1431 int c, c1, c2, c3, c4, c5;
ec6d2bb8 1432
df7492f9
KH
1433 src_base = src;
1434 consumed_chars_base = consumed_chars;
4af310db 1435
df7492f9 1436 if (charbuf >= charbuf_end)
b71f6f73
KH
1437 {
1438 if (byte_after_cr >= 0)
1439 src_base--;
1440 break;
1441 }
df7492f9 1442
119852e7
KH
1443 if (byte_after_cr >= 0)
1444 c1 = byte_after_cr, byte_after_cr = -1;
1445 else
1446 ONE_MORE_BYTE (c1);
065e3595
KH
1447 if (c1 < 0)
1448 {
1449 c = - c1;
1450 }
1451 else if (UTF_8_1_OCTET_P(c1))
df7492f9 1452 {
119852e7
KH
1453 if (eol_crlf && c1 == '\r')
1454 ONE_MORE_BYTE (byte_after_cr);
df7492f9 1455 c = c1;
4af310db 1456 }
df7492f9 1457 else
4af310db 1458 {
df7492f9 1459 ONE_MORE_BYTE (c2);
065e3595 1460 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
df7492f9
KH
1461 goto invalid_code;
1462 if (UTF_8_2_OCTET_LEADING_P (c1))
4af310db 1463 {
b0edb2c5
DL
1464 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1465 /* Reject overlong sequences here and below. Encoders
1466 producing them are incorrect, they can be misleading,
1467 and they mess up read/write invariance. */
1468 if (c < 128)
1469 goto invalid_code;
4af310db 1470 }
df7492f9 1471 else
aa72b389 1472 {
df7492f9 1473 ONE_MORE_BYTE (c3);
065e3595 1474 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
df7492f9
KH
1475 goto invalid_code;
1476 if (UTF_8_3_OCTET_LEADING_P (c1))
b0edb2c5
DL
1477 {
1478 c = (((c1 & 0xF) << 12)
1479 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
72fe1301
DL
1480 if (c < 0x800
1481 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
b0edb2c5
DL
1482 goto invalid_code;
1483 }
df7492f9
KH
1484 else
1485 {
1486 ONE_MORE_BYTE (c4);
065e3595 1487 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
df7492f9
KH
1488 goto invalid_code;
1489 if (UTF_8_4_OCTET_LEADING_P (c1))
b0edb2c5 1490 {
df7492f9
KH
1491 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1492 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
b0edb2c5
DL
1493 if (c < 0x10000)
1494 goto invalid_code;
1495 }
df7492f9
KH
1496 else
1497 {
1498 ONE_MORE_BYTE (c5);
065e3595 1499 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
df7492f9
KH
1500 goto invalid_code;
1501 if (UTF_8_5_OCTET_LEADING_P (c1))
1502 {
1503 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1504 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1505 | (c5 & 0x3F));
b0edb2c5 1506 if ((c > MAX_CHAR) || (c < 0x200000))
df7492f9
KH
1507 goto invalid_code;
1508 }
1509 else
1510 goto invalid_code;
1511 }
1512 }
aa72b389 1513 }
b73bfc1c 1514 }
df7492f9
KH
1515
1516 *charbuf++ = c;
1517 continue;
1518
1519 invalid_code:
1520 src = src_base;
1521 consumed_chars = consumed_chars_base;
1522 ONE_MORE_BYTE (c);
1523 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1524 coding->errors++;
aa72b389
KH
1525 }
1526
df7492f9
KH
1527 no_more_source:
1528 coding->consumed_char += consumed_chars_base;
1529 coding->consumed = src_base - coding->source;
1530 coding->charbuf_used = charbuf - coding->charbuf;
1531}
1532
1533
1534static int
1535encode_coding_utf_8 (coding)
1536 struct coding_system *coding;
1537{
1538 int multibytep = coding->dst_multibyte;
1539 int *charbuf = coding->charbuf;
1540 int *charbuf_end = charbuf + coding->charbuf_used;
1541 unsigned char *dst = coding->destination + coding->produced;
1542 unsigned char *dst_end = coding->destination + coding->dst_bytes;
e19c3639 1543 int produced_chars = 0;
df7492f9
KH
1544 int c;
1545
a470d443
KH
1546 if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1547 {
1548 ASSURE_DESTINATION (3);
1549 EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1550 CODING_UTF_8_BOM (coding) = utf_without_bom;
1551 }
1552
df7492f9 1553 if (multibytep)
aa72b389 1554 {
df7492f9
KH
1555 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1556
1557 while (charbuf < charbuf_end)
b73bfc1c 1558 {
df7492f9 1559 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
8f924df7 1560
df7492f9
KH
1561 ASSURE_DESTINATION (safe_room);
1562 c = *charbuf++;
28f67a95
KH
1563 if (CHAR_BYTE8_P (c))
1564 {
1565 c = CHAR_TO_BYTE8 (c);
1566 EMIT_ONE_BYTE (c);
1567 }
1568 else
1569 {
db274c7a 1570 CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
28f67a95
KH
1571 for (p = str; p < pend; p++)
1572 EMIT_ONE_BYTE (*p);
1573 }
b73bfc1c 1574 }
aa72b389 1575 }
df7492f9
KH
1576 else
1577 {
1578 int safe_room = MAX_MULTIBYTE_LENGTH;
1579
1580 while (charbuf < charbuf_end)
b73bfc1c 1581 {
df7492f9
KH
1582 ASSURE_DESTINATION (safe_room);
1583 c = *charbuf++;
f03caae0
KH
1584 if (CHAR_BYTE8_P (c))
1585 *dst++ = CHAR_TO_BYTE8 (c);
1586 else
db274c7a 1587 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
df7492f9 1588 produced_chars++;
4ed46869
KH
1589 }
1590 }
065e3595 1591 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
1592 coding->produced_char += produced_chars;
1593 coding->produced = dst - coding->destination;
1594 return 0;
4ed46869
KH
1595}
1596
b73bfc1c 1597
df7492f9 1598/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
1599 Check if a text is encoded in one of UTF-16 based coding systems.
1600 If it is, return 1, else return 0. */
aa72b389 1601
/* VAL is in the range 0xD800..0xDBFF.  */
#define UTF_16_HIGH_SURROGATE_P(val)    (((val) & 0xFC00) == 0xD800)

/* VAL is in the range 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val)     (((val) & 0xFC00) == 0xDC00)

/* VAL is 0xFFFE, 0xFFFF, or a low surrogate.  */
#define UTF_16_INVALID_P(val)           \
  ((val) == 0xFFFE                      \
   || (val) == 0xFFFF                   \
   || UTF_16_LOW_SURROGATE_P (val))
aa72b389 1612
aa72b389 1613
df7492f9 1614static int
ff0dacd7 1615detect_coding_utf_16 (coding, detect_info)
aa72b389 1616 struct coding_system *coding;
ff0dacd7 1617 struct coding_detection_info *detect_info;
aa72b389 1618{
8f924df7
KH
1619 const unsigned char *src = coding->source, *src_base = src;
1620 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
1621 int multibytep = coding->src_multibyte;
1622 int consumed_chars = 0;
1623 int c1, c2;
aa72b389 1624
ff0dacd7 1625 detect_info->checked |= CATEGORY_MASK_UTF_16;
ff0dacd7 1626 if (coding->mode & CODING_MODE_LAST_BLOCK
24a73b0a 1627 && (coding->src_chars & 1))
ff0dacd7
KH
1628 {
1629 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1630 return 0;
1631 }
24a73b0a 1632
f56a4450 1633 TWO_MORE_BYTES (c1, c2);
df7492f9 1634 if ((c1 == 0xFF) && (c2 == 0xFE))
aa72b389 1635 {
b49a1807
KH
1636 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1637 | CATEGORY_MASK_UTF_16_AUTO);
24a73b0a
KH
1638 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1639 | CATEGORY_MASK_UTF_16_BE_NOSIG
1640 | CATEGORY_MASK_UTF_16_LE_NOSIG);
aa72b389 1641 }
df7492f9 1642 else if ((c1 == 0xFE) && (c2 == 0xFF))
ff0dacd7 1643 {
b49a1807
KH
1644 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1645 | CATEGORY_MASK_UTF_16_AUTO);
24a73b0a
KH
1646 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1647 | CATEGORY_MASK_UTF_16_BE_NOSIG
1648 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1649 }
220eeac9 1650 else if (c2 < 0)
f56a4450
KH
1651 {
1652 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1653 return 0;
1654 }
2f3cbb32 1655 else
24a73b0a 1656 {
2f3cbb32
KH
1657 /* We check the dispersion of Eth and Oth bytes where E is even and
1658 O is odd. If both are high, we assume binary data.*/
1659 unsigned char e[256], o[256];
1660 unsigned e_num = 1, o_num = 1;
1661
1662 memset (e, 0, 256);
1663 memset (o, 0, 256);
1664 e[c1] = 1;
1665 o[c2] = 1;
1666
24a73b0a
KH
1667 detect_info->rejected
1668 |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
2f3cbb32
KH
1669
1670 while (1)
1671 {
f56a4450 1672 TWO_MORE_BYTES (c1, c2);
220eeac9 1673 if (c2 < 0)
f56a4450 1674 break;
2f3cbb32
KH
1675 if (! e[c1])
1676 {
1677 e[c1] = 1;
1678 e_num++;
1679 if (e_num >= 128)
1680 break;
1681 }
1682 if (! o[c2])
1683 {
1684 o[c1] = 1;
1685 o_num++;
1686 if (o_num >= 128)
1687 break;
1688 }
1689 }
1690 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1691 return 0;
ff0dacd7 1692 }
2f3cbb32 1693
df7492f9 1694 no_more_source:
ff0dacd7 1695 return 1;
df7492f9 1696}
aa72b389 1697
df7492f9
KH
1698static void
1699decode_coding_utf_16 (coding)
1700 struct coding_system *coding;
1701{
8f924df7
KH
1702 const unsigned char *src = coding->source + coding->consumed;
1703 const unsigned char *src_end = coding->source + coding->src_bytes;
1704 const unsigned char *src_base;
69a80ea3 1705 int *charbuf = coding->charbuf + coding->charbuf_used;
df80c7f0
KH
1706 /* We may produces at most 3 chars in one loop. */
1707 int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
3a8406e1 1708 int consumed_chars = 0, consumed_chars_base = 0;
df7492f9 1709 int multibytep = coding->src_multibyte;
a470d443 1710 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
df7492f9
KH
1711 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1712 int surrogate = CODING_UTF_16_SURROGATE (coding);
24a73b0a 1713 Lisp_Object attr, charset_list;
0a9564cb
EZ
1714 int eol_crlf =
1715 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 1716 int byte_after_cr1 = -1, byte_after_cr2 = -1;
df7492f9 1717
24a73b0a 1718 CODING_GET_INFO (coding, attr, charset_list);
df7492f9 1719
a470d443 1720 if (bom == utf_with_bom)
aa72b389 1721 {
df7492f9 1722 int c, c1, c2;
4af310db 1723
aa72b389 1724 src_base = src;
df7492f9
KH
1725 ONE_MORE_BYTE (c1);
1726 ONE_MORE_BYTE (c2);
e19c3639 1727 c = (c1 << 8) | c2;
aa72b389 1728
b49a1807
KH
1729 if (endian == utf_16_big_endian
1730 ? c != 0xFEFF : c != 0xFFFE)
aa72b389 1731 {
b49a1807
KH
1732 /* The first two bytes are not BOM. Treat them as bytes
1733 for a normal character. */
1734 src = src_base;
1735 coding->errors++;
aa72b389 1736 }
a470d443 1737 CODING_UTF_16_BOM (coding) = utf_without_bom;
b49a1807 1738 }
a470d443 1739 else if (bom == utf_detect_bom)
b49a1807
KH
1740 {
1741 /* We have already tried to detect BOM and failed in
1742 detect_coding. */
a470d443 1743 CODING_UTF_16_BOM (coding) = utf_without_bom;
df7492f9 1744 }
aa72b389 1745
df7492f9
KH
1746 while (1)
1747 {
1748 int c, c1, c2;
1749
1750 src_base = src;
1751 consumed_chars_base = consumed_chars;
1752
df80c7f0 1753 if (charbuf >= charbuf_end)
b71f6f73
KH
1754 {
1755 if (byte_after_cr1 >= 0)
1756 src_base -= 2;
1757 break;
1758 }
df7492f9 1759
119852e7
KH
1760 if (byte_after_cr1 >= 0)
1761 c1 = byte_after_cr1, byte_after_cr1 = -1;
1762 else
1763 ONE_MORE_BYTE (c1);
065e3595
KH
1764 if (c1 < 0)
1765 {
1766 *charbuf++ = -c1;
1767 continue;
1768 }
119852e7
KH
1769 if (byte_after_cr2 >= 0)
1770 c2 = byte_after_cr2, byte_after_cr2 = -1;
1771 else
1772 ONE_MORE_BYTE (c2);
065e3595
KH
1773 if (c2 < 0)
1774 {
1775 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1776 *charbuf++ = -c2;
1777 continue;
1778 }
df7492f9 1779 c = (endian == utf_16_big_endian
e19c3639 1780 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
119852e7 1781
df7492f9 1782 if (surrogate)
fd3ae0b9 1783 {
df7492f9 1784 if (! UTF_16_LOW_SURROGATE_P (c))
fd3ae0b9 1785 {
df7492f9
KH
1786 if (endian == utf_16_big_endian)
1787 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1788 else
1789 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1790 *charbuf++ = c1;
1791 *charbuf++ = c2;
1792 coding->errors++;
1793 if (UTF_16_HIGH_SURROGATE_P (c))
1794 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
fd3ae0b9 1795 else
df7492f9 1796 *charbuf++ = c;
fd3ae0b9
KH
1797 }
1798 else
df7492f9
KH
1799 {
1800 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1801 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
29f7ffd0 1802 *charbuf++ = 0x10000 + c;
df7492f9 1803 }
fd3ae0b9 1804 }
aa72b389 1805 else
df7492f9
KH
1806 {
1807 if (UTF_16_HIGH_SURROGATE_P (c))
1808 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1809 else
119852e7
KH
1810 {
1811 if (eol_crlf && c == '\r')
1812 {
1813 ONE_MORE_BYTE (byte_after_cr1);
1814 ONE_MORE_BYTE (byte_after_cr2);
1815 }
1816 *charbuf++ = c;
1817 }
8f924df7 1818 }
aa72b389 1819 }
df7492f9
KH
1820
1821 no_more_source:
1822 coding->consumed_char += consumed_chars_base;
1823 coding->consumed = src_base - coding->source;
1824 coding->charbuf_used = charbuf - coding->charbuf;
aa72b389 1825}
b73bfc1c 1826
df7492f9
KH
1827static int
1828encode_coding_utf_16 (coding)
1829 struct coding_system *coding;
1830{
1831 int multibytep = coding->dst_multibyte;
1832 int *charbuf = coding->charbuf;
1833 int *charbuf_end = charbuf + coding->charbuf_used;
1834 unsigned char *dst = coding->destination + coding->produced;
1835 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1836 int safe_room = 8;
a470d443 1837 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
df7492f9
KH
1838 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1839 int produced_chars = 0;
24a73b0a 1840 Lisp_Object attrs, charset_list;
df7492f9 1841 int c;
4ed46869 1842
24a73b0a 1843 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9 1844
a470d443 1845 if (bom != utf_without_bom)
df7492f9
KH
1846 {
1847 ASSURE_DESTINATION (safe_room);
1848 if (big_endian)
df7492f9 1849 EMIT_TWO_BYTES (0xFE, 0xFF);
880cf180
KH
1850 else
1851 EMIT_TWO_BYTES (0xFF, 0xFE);
a470d443 1852 CODING_UTF_16_BOM (coding) = utf_without_bom;
df7492f9
KH
1853 }
1854
1855 while (charbuf < charbuf_end)
1856 {
1857 ASSURE_DESTINATION (safe_room);
1858 c = *charbuf++;
e19c3639
KH
1859 if (c >= MAX_UNICODE_CHAR)
1860 c = coding->default_char;
df7492f9
KH
1861
1862 if (c < 0x10000)
1863 {
1864 if (big_endian)
1865 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1866 else
1867 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1868 }
1869 else
1870 {
1871 int c1, c2;
1872
1873 c -= 0x10000;
1874 c1 = (c >> 10) + 0xD800;
1875 c2 = (c & 0x3FF) + 0xDC00;
1876 if (big_endian)
1877 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1878 else
1879 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1880 }
1881 }
065e3595 1882 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
1883 coding->produced = dst - coding->destination;
1884 coding->produced_char += produced_chars;
1885 return 0;
1886}
1887
1888\f
1889/*** 6. Old Emacs' internal format (emacs-mule) ***/
1890
1891/* Emacs' internal format for representation of multiple character
1892 sets is a kind of multi-byte encoding, i.e. characters are
1893 represented by variable-length sequences of one-byte codes.
1894
1895 ASCII characters and control characters (e.g. `tab', `newline') are
1896 represented by one-byte sequences which are their ASCII codes, in
1897 the range 0x00 through 0x7F.
1898
1899 8-bit characters of the range 0x80..0x9F are represented by
1900 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1901 code + 0x20).
1902
1903 8-bit characters of the range 0xA0..0xFF are represented by
1904 one-byte sequences which are their 8-bit code.
1905
1906 The other characters are represented by a sequence of `base
1907 leading-code', optional `extended leading-code', and one or two
1908 `position-code's. The length of the sequence is determined by the
1909 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1910 whereas extended leading-code and position-code take the range 0xA0
1911 through 0xFF. See `charset.h' for more details about leading-code
1912 and position-code.
1913
1914 --- CODE RANGE of Emacs' internal format ---
1915 character set range
1916 ------------- -----
1917 ascii 0x00..0x7F
1918 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1919 eight-bit-graphic 0xA0..0xFF
1920 ELSE 0x81..0x9D + [0xA0..0xFF]+
1921 ---------------------------------------------
1922
1923 As this is the internal character representation, the format is
1924 usually not used externally (i.e. in a file or in a data sent to a
1925 process). But, it is possible to have a text externally in this
1926 format (i.e. by encoding by the coding system `emacs-mule').
1927
1928 In that case, a sequence of one-byte codes has a slightly different
1929 form.
1930
1931 At first, all characters in eight-bit-control are represented by
1932 one-byte sequences which are their 8-bit code.
1933
1934 Next, character composition data are represented by the byte
1935 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1936 where,
e951386e 1937 METHOD is 0xF2 plus one of composition method (enum
df7492f9
KH
1938 composition_method),
1939
1940 BYTES is 0xA0 plus a byte length of this composition data,
1941
e951386e 1942 CHARS is 0xA0 plus a number of characters composed by this
df7492f9
KH
1943 data,
1944
1945 COMPONENTs are characters in multibyte form or composition
1946 rules encoded by two bytes of ASCII codes.
1947
1948 In addition, for backward compatibility, the following formats are
1949 also recognized as composition data on decoding.
1950
1951 0x80 MSEQ ...
1952 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1953
1954 Here,
1955 MSEQ is a multibyte form but in these special format:
1956 ASCII: 0xA0 ASCII_CODE+0x80,
1957 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1958 RULE is a one byte code of the range 0xA0..0xF0 that
1959 represents a composition rule.
1960 */
1961
1962char emacs_mule_bytes[256];
1963
e951386e
KH
1964
1965/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1966 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1967 else return 0. */
1968
1969static int
1970detect_coding_emacs_mule (coding, detect_info)
1971 struct coding_system *coding;
1972 struct coding_detection_info *detect_info;
1973{
1974 const unsigned char *src = coding->source, *src_base;
1975 const unsigned char *src_end = coding->source + coding->src_bytes;
1976 int multibytep = coding->src_multibyte;
1977 int consumed_chars = 0;
1978 int c;
1979 int found = 0;
1980
1981 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1982 /* A coding system of this category is always ASCII compatible. */
1983 src += coding->head_ascii;
1984
1985 while (1)
1986 {
1987 src_base = src;
1988 ONE_MORE_BYTE (c);
1989 if (c < 0)
1990 continue;
1991 if (c == 0x80)
1992 {
1993 /* Perhaps the start of composite character. We simply skip
1994 it because analyzing it is too heavy for detecting. But,
1995 at least, we check that the composite character
1996 constitutes of more than 4 bytes. */
1997 const unsigned char *src_base;
1998
1999 repeat:
2000 src_base = src;
2001 do
2002 {
2003 ONE_MORE_BYTE (c);
2004 }
2005 while (c >= 0xA0);
2006
2007 if (src - src_base <= 4)
2008 break;
2009 found = CATEGORY_MASK_EMACS_MULE;
2010 if (c == 0x80)
2011 goto repeat;
2012 }
2013
2014 if (c < 0x80)
2015 {
2016 if (c < 0x20
2017 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2018 break;
2019 }
2020 else
2021 {
2022 int more_bytes = emacs_mule_bytes[*src_base] - 1;
2023
2024 while (more_bytes > 0)
2025 {
2026 ONE_MORE_BYTE (c);
2027 if (c < 0xA0)
2028 {
2029 src--; /* Unread the last byte. */
2030 break;
2031 }
2032 more_bytes--;
2033 }
2034 if (more_bytes != 0)
2035 break;
2036 found = CATEGORY_MASK_EMACS_MULE;
2037 }
2038 }
2039 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2040 return 0;
2041
2042 no_more_source:
2043 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
2044 {
2045 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2046 return 0;
2047 }
2048 detect_info->found |= found;
2049 return 1;
2050}
2051
2052
2053/* Parse emacs-mule multibyte sequence at SRC and return the decoded
2054 character. If CMP_STATUS indicates that we must expect MSEQ or
2055 RULE described above, decode it and return the negative value of
2056 the deocded character or rule. If an invalid byte is found, return
2057 -1. If SRC is too short, return -2. */
2058
df7492f9 2059int
e951386e 2060emacs_mule_char (coding, src, nbytes, nchars, id, cmp_status)
df7492f9 2061 struct coding_system *coding;
065e3595 2062 const unsigned char *src;
ff0dacd7 2063 int *nbytes, *nchars, *id;
e951386e 2064 struct composition_status *cmp_status;
df7492f9 2065{
8f924df7
KH
2066 const unsigned char *src_end = coding->source + coding->src_bytes;
2067 const unsigned char *src_base = src;
df7492f9 2068 int multibytep = coding->src_multibyte;
df7492f9
KH
2069 struct charset *charset;
2070 unsigned code;
2071 int c;
2072 int consumed_chars = 0;
e951386e 2073 int mseq_found = 0;
df7492f9
KH
2074
2075 ONE_MORE_BYTE (c);
065e3595 2076 if (c < 0)
df7492f9 2077 {
065e3595
KH
2078 c = -c;
2079 charset = emacs_mule_charset[0];
2080 }
2081 else
2082 {
4d41e8b7
KH
2083 if (c >= 0xA0)
2084 {
e951386e
KH
2085 if (cmp_status->state != COMPOSING_NO
2086 && cmp_status->old_form)
4d41e8b7 2087 {
e951386e
KH
2088 if (cmp_status->state == COMPOSING_CHAR)
2089 {
2090 if (c == 0xA0)
2091 {
2092 ONE_MORE_BYTE (c);
2093 c -= 0x80;
2094 if (c < 0)
2095 goto invalid_code;
2096 }
2097 else
2098 c -= 0x20;
2099 mseq_found = 1;
2100 }
2101 else
2102 {
2103 *nbytes = src - src_base;
2104 *nchars = consumed_chars;
2105 return -c;
2106 }
4d41e8b7
KH
2107 }
2108 else
e951386e 2109 goto invalid_code;
4d41e8b7
KH
2110 }
2111
065e3595 2112 switch (emacs_mule_bytes[c])
b73bfc1c 2113 {
065e3595 2114 case 2:
df7492f9
KH
2115 if (! (charset = emacs_mule_charset[c]))
2116 goto invalid_code;
2117 ONE_MORE_BYTE (c);
9ffd559c 2118 if (c < 0xA0)
065e3595 2119 goto invalid_code;
df7492f9 2120 code = c & 0x7F;
065e3595
KH
2121 break;
2122
2123 case 3:
2124 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2125 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2126 {
2127 ONE_MORE_BYTE (c);
9ffd559c 2128 if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
065e3595
KH
2129 goto invalid_code;
2130 ONE_MORE_BYTE (c);
9ffd559c 2131 if (c < 0xA0)
065e3595
KH
2132 goto invalid_code;
2133 code = c & 0x7F;
2134 }
2135 else
2136 {
2137 if (! (charset = emacs_mule_charset[c]))
2138 goto invalid_code;
2139 ONE_MORE_BYTE (c);
9ffd559c 2140 if (c < 0xA0)
065e3595
KH
2141 goto invalid_code;
2142 code = (c & 0x7F) << 8;
2143 ONE_MORE_BYTE (c);
9ffd559c 2144 if (c < 0xA0)
065e3595
KH
2145 goto invalid_code;
2146 code |= c & 0x7F;
2147 }
2148 break;
2149
2150 case 4:
2151 ONE_MORE_BYTE (c);
2152 if (c < 0 || ! (charset = emacs_mule_charset[c]))
df7492f9
KH
2153 goto invalid_code;
2154 ONE_MORE_BYTE (c);
9ffd559c 2155 if (c < 0xA0)
065e3595 2156 goto invalid_code;
781d7a48 2157 code = (c & 0x7F) << 8;
df7492f9 2158 ONE_MORE_BYTE (c);
9ffd559c 2159 if (c < 0xA0)
065e3595 2160 goto invalid_code;
df7492f9 2161 code |= c & 0x7F;
065e3595 2162 break;
df7492f9 2163
065e3595
KH
2164 case 1:
2165 code = c;
2166 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
2167 ? charset_ascii : charset_eight_bit);
2168 break;
df7492f9 2169
065e3595
KH
2170 default:
2171 abort ();
2172 }
2173 c = DECODE_CHAR (charset, code);
2174 if (c < 0)
2175 goto invalid_code;
df7492f9 2176 }
df7492f9
KH
2177 *nbytes = src - src_base;
2178 *nchars = consumed_chars;
ff0dacd7
KH
2179 if (id)
2180 *id = charset->id;
e951386e 2181 return (mseq_found ? -c : c);
df7492f9
KH
2182
2183 no_more_source:
2184 return -2;
2185
2186 invalid_code:
2187 return -1;
2188}
2189
2190
e951386e 2191/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
df7492f9 2192
e951386e
KH
2193/* Handle these composition sequences ('|': the end of header elements,
2194 BYTES and CHARS >= 0xA0):
df7492f9 2195
e951386e
KH
2196 (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2197 (2) altchar composition: 0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2198 (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
df7492f9 2199
e951386e
KH
2200 and these old form:
2201
2202 (4) relative composition: 0x80 | MSEQ ... MSEQ
2203 (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
df7492f9 2204
e951386e
KH
2205 When the starter 0x80 and the following header elements are found,
2206 this annotation header is produced.
df7492f9 2207
e951386e 2208 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
df7492f9 2209
e951386e
KH
2210 NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2211 NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
df7492f9 2212
e951386e
KH
2213 Then, upon reading the following elements, these codes are produced
2214 until the composition end is found:
df7492f9 2215
e951386e
KH
2216 (1) CHAR ... CHAR
2217 (2) ALT ... ALT CHAR ... CHAR
2218 (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2219 (4) CHAR ... CHAR
2220 (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
4ed46869 2221
e951386e
KH
2222 When the composition end is found, LENGTH and NCHARS in the
2223 annotation header are updated as below:
b73bfc1c 2224
e951386e
KH
2225 (1) LENGTH: unchanged, NCHARS: unchanged
2226 (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2227 (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2228 (4) LENGTH: unchanged, NCHARS: number of CHARs
2229 (5) LENGTH: unchanged, NCHARS: number of CHARs
df7492f9 2230
e951386e
KH
2231 If an error is found while composing, the annotation header is
2232 changed to the original composition header (plus filler -1s) as
2233 below:
2234
2235 (1),(2),(3) [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2236 (5) [ 0x80 0xFF -1 -1 -1 ]
2237
2238 and the sequence [ -2 DECODED-RULE ] is changed to the original
2239 byte sequence as below:
2240 o the original byte sequence is B: [ B -1 ]
2241 o the original byte sequence is B1 B2: [ B1 B2 ]
2242
2243 Most of the routines are implemented by macros because many
2244 variables and labels in the caller decode_coding_emacs_mule must be
2245 accessible, and they are usually called just once (thus doesn't
2246 increase the size of compiled object). */
2247
/* Decode a composition rule represented by C as a component of a
   composition sequence of Emacs 20 style, and set RULE to the decoded
   rule.  C is reduced by 0xA0 in place; if the result is outside
   0..80 control jumps to the label `invalid_code'.  The result is
   split into two base-9 digits, each digit 4 is replaced by 10, and
   the pair is packed with COMPOSITION_ENCODE_RULE.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)          \
  do {                                                          \
    int gref, nref;                                             \
                                                                \
    c -= 0xA0;                                                  \
    if (! (c >= 0 && c < 81))                                   \
      goto invalid_code;                                        \
    nref = c % 9;                                               \
    gref = c / 9;                                               \
    if (nref == 4)                                              \
      nref = 10;                                                \
    if (gref == 4)                                              \
      gref = 10;                                                \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);                \
  } while (0)
2264
2265
e951386e
KH
/* Decode a composition rule represented by C and the following byte
   at SRC as a component of a composition sequence of Emacs 21 style,
   and set RULE to the decoded rule.  Each of the two bytes, less
   0x20, must lie in 0..80; otherwise control jumps to the label
   `invalid_code'.  C is overwritten by the second byte.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)          \
  do {                                                          \
    int gref = c - 0x20;                                        \
    int nref;                                                   \
                                                                \
    if (! (gref >= 0 && gref < 81))                             \
      goto invalid_code;                                        \
    ONE_MORE_BYTE (c);                                          \
    nref = c - 0x20;                                            \
    if (! (nref >= 0 && nref < 81))                             \
      goto invalid_code;                                        \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);                \
  } while (0)
2283
2284
e951386e
KH
/* Start of Emacs 21 style format.  On entry C holds the METHOD byte
   (0xF2 plus the composition method).  The next two bytes at SRC are
   BYTES (0xA0 plus the byte length of this composition information)
   and CHARS (0xA0 plus the number of characters composed by this
   composition).

   Jump to the label `invalid_code' if either byte is out of range:
   the byte length must be at least 3 (and exactly 4 for a relative
   composition), and the character count must be positive and less
   than MAX_COMPOSITION_COMPONENTS.  Otherwise initialize *CMP_STATUS
   for the new composition and add the annotation header to
   CHARBUF.  */

#define DECODE_EMACS_MULE_21_COMPOSITION()                              \
  do {                                                                  \
    enum composition_method method = c - 0xF2;                          \
    int nbytes, nchars;                                                 \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    nbytes = c - 0xA0;                                                  \
    if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
      goto invalid_code;                                                \
    ONE_MORE_BYTE (c);                                                  \
    nchars = c - 0xA0;                                                  \
    if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
      goto invalid_code;                                                \
    cmp_status->old_form = 0;                                           \
    cmp_status->method = method;                                        \
    if (method == COMPOSITION_RELATIVE)                                 \
      cmp_status->state = COMPOSING_CHAR;                               \
    else                                                                \
      cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    cmp_status->nchars = nchars;                                        \
    cmp_status->ncomps = nbytes - 4;                                    \
    ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
  } while (0)
93dec019 2317
aa72b389 2318
e951386e
KH
/* Start of Emacs 20 style format for relative composition.  Reset
   *CMP_STATUS to an old-form relative composition with no characters
   or components collected yet, and add an annotation header with
   zero counts to CHARBUF.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
  do {                                                          \
    cmp_status->ncomps = 0;                                     \
    cmp_status->nchars = 0;                                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->method = COMPOSITION_RELATIVE;                  \
    cmp_status->old_form = 1;                                   \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2330
2331
/* Start of Emacs 20 style format for rule-base composition.  Reset
   *CMP_STATUS to an old-form composition with rules, with no
   characters or components collected yet, and add an annotation
   header with zero counts to CHARBUF.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
  do {                                                          \
    cmp_status->ncomps = 0;                                     \
    cmp_status->nchars = 0;                                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->method = COMPOSITION_WITH_RULE;                 \
    cmp_status->old_form = 1;                                   \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2343
2344
e951386e
KH
/* Called after the composition starter 0x80 has been read.  Read the
   next byte into C and dispatch on it:
     0xF2 + a composition method -- Emacs 21 style header,
     0xA0..0xBF -- Emacs 20 style relative composition; the byte is
                   itself the first component, so SRC is rewound to
                   read it again,
     0xFF -- Emacs 20 style rule-base composition.
   Anything else jumps to the label `invalid_code'.  */

#define DECODE_EMACS_MULE_COMPOSITION_START()                           \
  do {                                                                  \
    const unsigned char *current_src = src;                             \
    int method_code;                                                    \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    method_code = c - 0xF2;                                             \
    if (method_code >= COMPOSITION_RELATIVE                             \
        && method_code <= COMPOSITION_WITH_RULE_ALTCHARS)               \
      DECODE_EMACS_MULE_21_COMPOSITION ();                              \
    else if (c >= 0xA0 && c < 0xC0)                                     \
      {                                                                 \
        DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();                   \
        /* Re-read C as a composition component.  */                    \
        src = current_src;                                              \
      }                                                                 \
    else if (c == 0xFF)                                                 \
      DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();                     \
    else                                                                \
      goto invalid_code;                                                \
  } while (0)
2368
/* Finish the current composition normally.  The annotation header
   starts CMP_STATUS->length elements before CHARBUF.  For an old-form
   composition store the collected character count in the third
   header element; otherwise, for any method beyond
   COMPOSITION_RELATIVE, store the third header element minus
   CMP_STATUS->length in the first one.  Then leave composing
   state.  */

#define EMACS_MULE_COMPOSITION_END()                                    \
  do {                                                                  \
    int *annotation_head = charbuf - cmp_status->length;                \
                                                                        \
    if (cmp_status->old_form)                                           \
      annotation_head[2] = cmp_status->nchars;                          \
    else if (cmp_status->method > COMPOSITION_RELATIVE)                 \
      annotation_head[0] = annotation_head[2] - cmp_status->length;     \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
2379
2380
2381static int
2382emacs_mule_finish_composition (charbuf, cmp_status)
2383 int *charbuf;
2384 struct composition_status *cmp_status;
2385{
2386 int idx = - cmp_status->length;
2387 int new_chars;
2388
2389 if (cmp_status->old_form && cmp_status->nchars > 0)
2390 {
2391 charbuf[idx + 2] = cmp_status->nchars;
2392 new_chars = 0;
2393 if (cmp_status->method == COMPOSITION_WITH_RULE
2394 && cmp_status->state == COMPOSING_CHAR)
2395 {
2396 /* The last rule was invalid. */
2397 int rule = charbuf[-1] + 0xA0;
2398
2399 charbuf[-2] = BYTE8_TO_CHAR (rule);
2400 charbuf[-1] = -1;
2401 new_chars = 1;
2402 }
2403 }
2404 else
2405 {
2406 charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2407
2408 if (cmp_status->method == COMPOSITION_WITH_RULE)
2409 {
2410 charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2411 charbuf[idx++] = -3;
2412 charbuf[idx++] = 0;
2413 new_chars = 1;
2414 }
2415 else
2416 {
2417 int nchars = charbuf[idx + 1] + 0xA0;
2418 int nbytes = charbuf[idx + 2] + 0xA0;
2419
2420 charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2421 charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2422 charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2423 charbuf[idx++] = -1;
2424 new_chars = 4;
2425 }
2426 }
2427 cmp_status->state = COMPOSING_NO;
2428 return new_chars;
2429}
2430
/* If a composition is in progress, terminate it with
   emacs_mule_finish_composition and advance CHAR_OFFSET by the value
   it returns.  */

#define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                           \
  do {                                                                  \
    if (! (cmp_status->state == COMPOSING_NO))                          \
      {                                                                 \
        char_offset                                                     \
          = (char_offset                                                \
             + emacs_mule_finish_composition (charbuf, cmp_status));    \
      }                                                                 \
  } while (0)
2436
aa72b389
KH
2437
2438static void
df7492f9 2439decode_coding_emacs_mule (coding)
aa72b389 2440 struct coding_system *coding;
aa72b389 2441{
8f924df7
KH
2442 const unsigned char *src = coding->source + coding->consumed;
2443 const unsigned char *src_end = coding->source + coding->src_bytes;
2444 const unsigned char *src_base;
69a80ea3 2445 int *charbuf = coding->charbuf + coding->charbuf_used;
df80c7f0
KH
2446 /* We may produce two annocations (charset and composition) in one
2447 loop and one more charset annocation at the end. */
69a80ea3 2448 int *charbuf_end
df80c7f0 2449 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
df7492f9 2450 int consumed_chars = 0, consumed_chars_base;
df7492f9 2451 int multibytep = coding->src_multibyte;
24a73b0a 2452 Lisp_Object attrs, charset_list;
ff0dacd7
KH
2453 int char_offset = coding->produced_char;
2454 int last_offset = char_offset;
2455 int last_id = charset_ascii;
0a9564cb
EZ
2456 int eol_crlf =
2457 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 2458 int byte_after_cr = -1;
e951386e 2459 struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
aa72b389 2460
24a73b0a 2461 CODING_GET_INFO (coding, attrs, charset_list);
aa72b389 2462
e951386e
KH
2463 if (cmp_status->state != COMPOSING_NO)
2464 {
2465 int i;
2466
2467 for (i = 0; i < cmp_status->length; i++)
2468 *charbuf++ = cmp_status->carryover[i];
2469 coding->annotated = 1;
2470 }
2471
aa72b389
KH
2472 while (1)
2473 {
e951386e 2474 int c, id;
df7492f9 2475
aa72b389 2476 src_base = src;
df7492f9
KH
2477 consumed_chars_base = consumed_chars;
2478
2479 if (charbuf >= charbuf_end)
b71f6f73
KH
2480 {
2481 if (byte_after_cr >= 0)
2482 src_base--;
2483 break;
2484 }
aa72b389 2485
119852e7
KH
2486 if (byte_after_cr >= 0)
2487 c = byte_after_cr, byte_after_cr = -1;
2488 else
2489 ONE_MORE_BYTE (c);
e951386e
KH
2490
2491 if (c < 0 || c == 0x80)
065e3595 2492 {
e951386e
KH
2493 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2494 if (c < 0)
2495 {
2496 *charbuf++ = -c;
2497 char_offset++;
2498 }
2499 else
2500 DECODE_EMACS_MULE_COMPOSITION_START ();
2501 continue;
065e3595 2502 }
e951386e
KH
2503
2504 if (c < 0x80)
aa72b389 2505 {
119852e7
KH
2506 if (eol_crlf && c == '\r')
2507 ONE_MORE_BYTE (byte_after_cr);
e951386e
KH
2508 id = charset_ascii;
2509 if (cmp_status->state != COMPOSING_NO)
2510 {
2511 if (cmp_status->old_form)
2512 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2513 else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2514 cmp_status->ncomps--;
2515 }
2516 }
2517 else
2518 {
2519 int nchars, nbytes;
2520
2521 c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2522 cmp_status);
2523 if (c < 0)
2524 {
2525 if (c == -1)
2526 goto invalid_code;
2527 if (c == -2)
2528 break;
2529 }
2530 src = src_base + nbytes;
2531 consumed_chars = consumed_chars_base + nchars;
2532 if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2533 cmp_status->ncomps -= nchars;
2534 }
2535
2536 /* Now if C >= 0, we found a normally encoded characer, if C <
2537 0, we found an old-style composition component character or
2538 rule. */
2539
2540 if (cmp_status->state == COMPOSING_NO)
2541 {
2542 if (last_id != id)
2543 {
2544 if (last_id != charset_ascii)
2545 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2546 last_id);
2547 last_id = id;
2548 last_offset = char_offset;
2549 }
df7492f9
KH
2550 *charbuf++ = c;
2551 char_offset++;
aa72b389 2552 }
e951386e 2553 else if (cmp_status->state == COMPOSING_CHAR)
df7492f9 2554 {
e951386e
KH
2555 if (cmp_status->old_form)
2556 {
2557 if (c >= 0)
2558 {
2559 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2560 *charbuf++ = c;
2561 char_offset++;
2562 }
2563 else
2564 {
2565 *charbuf++ = -c;
2566 cmp_status->nchars++;
2567 cmp_status->length++;
2568 if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2569 EMACS_MULE_COMPOSITION_END ();
2570 else if (cmp_status->method == COMPOSITION_WITH_RULE)
2571 cmp_status->state = COMPOSING_RULE;
2572 }
2573 }
df7492f9 2574 else
e951386e
KH
2575 {
2576 *charbuf++ = c;
2577 cmp_status->length++;
2578 cmp_status->nchars--;
2579 if (cmp_status->nchars == 0)
2580 EMACS_MULE_COMPOSITION_END ();
2581 }
df7492f9 2582 }
e951386e 2583 else if (cmp_status->state == COMPOSING_RULE)
df7492f9 2584 {
e951386e 2585 int rule;
ff0dacd7 2586
e951386e 2587 if (c >= 0)
df7492f9 2588 {
e951386e
KH
2589 EMACS_MULE_COMPOSITION_END ();
2590 *charbuf++ = c;
2591 char_offset++;
df7492f9 2592 }
e951386e 2593 else
ff0dacd7 2594 {
e951386e
KH
2595 c = -c;
2596 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2597 if (rule < 0)
2598 goto invalid_code;
2599 *charbuf++ = -2;
2600 *charbuf++ = rule;
2601 cmp_status->length += 2;
2602 cmp_status->state = COMPOSING_CHAR;
ff0dacd7 2603 }
e951386e
KH
2604 }
2605 else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2606 {
df7492f9 2607 *charbuf++ = c;
e951386e
KH
2608 cmp_status->length++;
2609 if (cmp_status->ncomps == 0)
2610 cmp_status->state = COMPOSING_CHAR;
2611 else if (cmp_status->ncomps > 0)
2612 {
2613 if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2614 cmp_status->state = COMPOSING_COMPONENT_RULE;
2615 }
2616 else
2617 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
df7492f9 2618 }
e951386e
KH
2619 else /* COMPOSING_COMPONENT_RULE */
2620 {
2621 int rule;
2622
2623 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2624 if (rule < 0)
2625 goto invalid_code;
2626 *charbuf++ = -2;
2627 *charbuf++ = rule;
2628 cmp_status->length += 2;
2629 cmp_status->ncomps--;
2630 if (cmp_status->ncomps > 0)
2631 cmp_status->state = COMPOSING_COMPONENT_CHAR;
2632 else
2633 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2634 }
2635 continue;
2636
2637 retry:
2638 src = src_base;
2639 consumed_chars = consumed_chars_base;
df7492f9
KH
2640 continue;
2641
2642 invalid_code:
e951386e 2643 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
df7492f9
KH
2644 src = src_base;
2645 consumed_chars = consumed_chars_base;
2646 ONE_MORE_BYTE (c);
2647 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 2648 char_offset++;
df7492f9
KH
2649 coding->errors++;
2650 }
2651
2652 no_more_source:
e951386e
KH
2653 if (cmp_status->state != COMPOSING_NO)
2654 {
2655 if (coding->mode & CODING_MODE_LAST_BLOCK)
2656 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2657 else
2658 {
2659 int i;
2660
2661 charbuf -= cmp_status->length;
2662 for (i = 0; i < cmp_status->length; i++)
2663 cmp_status->carryover[i] = charbuf[i];
2664 }
2665 }
ff0dacd7 2666 if (last_id != charset_ascii)
69a80ea3 2667 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
2668 coding->consumed_char += consumed_chars_base;
2669 coding->consumed = src_base - coding->source;
2670 coding->charbuf_used = charbuf - coding->charbuf;
2671}
2672
2673
/* Store in CODES[0] and CODES[1] the leading code(s) for the
   emacs-mule charset id ID.  An id below 0xA0 is its own single
   leading code, and CODES[1] is set to 0.  A larger id is stored in
   CODES[1], preceded by the leading code 0x9A, 0x9B, 0x9C or 0x9D
   chosen by the range the id falls in.

   ID is evaluated only once.  The macro expands to a single statement
   without a trailing semicolon, so it can be used as the body of an
   unbraced `if' that has an `else'.  */

#define EMACS_MULE_LEADING_CODES(id, codes)             \
  do {                                                  \
    int lc_id_ = (id);                                  \
                                                        \
    if (lc_id_ < 0xA0)                                  \
      (codes)[0] = lc_id_, (codes)[1] = 0;              \
    else if (lc_id_ < 0xE0)                             \
      (codes)[0] = 0x9A, (codes)[1] = lc_id_;           \
    else if (lc_id_ < 0xF0)                             \
      (codes)[0] = 0x9B, (codes)[1] = lc_id_;           \
    else if (lc_id_ < 0xF5)                             \
      (codes)[0] = 0x9C, (codes)[1] = lc_id_;           \
    else                                                \
      (codes)[0] = 0x9D, (codes)[1] = lc_id_;           \
  } while (0)
2687
aa72b389 2688
df7492f9
KH
2689static int
2690encode_coding_emacs_mule (coding)
2691 struct coding_system *coding;
2692{
2693 int multibytep = coding->dst_multibyte;
2694 int *charbuf = coding->charbuf;
2695 int *charbuf_end = charbuf + coding->charbuf_used;
2696 unsigned char *dst = coding->destination + coding->produced;
2697 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2698 int safe_room = 8;
df7492f9 2699 int produced_chars = 0;
24a73b0a 2700 Lisp_Object attrs, charset_list;
df7492f9 2701 int c;
ff0dacd7 2702 int preferred_charset_id = -1;
df7492f9 2703
24a73b0a 2704 CODING_GET_INFO (coding, attrs, charset_list);
eccb6815
KH
2705 if (! EQ (charset_list, Vemacs_mule_charset_list))
2706 {
2707 CODING_ATTR_CHARSET_LIST (attrs)
2708 = charset_list = Vemacs_mule_charset_list;
2709 }
df7492f9
KH
2710
2711 while (charbuf < charbuf_end)
2712 {
2713 ASSURE_DESTINATION (safe_room);
2714 c = *charbuf++;
ff0dacd7
KH
2715
2716 if (c < 0)
2717 {
2718 /* Handle an annotation. */
2719 switch (*charbuf)
2720 {
2721 case CODING_ANNOTATE_COMPOSITION_MASK:
2722 /* Not yet implemented. */
2723 break;
2724 case CODING_ANNOTATE_CHARSET_MASK:
2725 preferred_charset_id = charbuf[3];
2726 if (preferred_charset_id >= 0
2727 && NILP (Fmemq (make_number (preferred_charset_id),
2728 charset_list)))
2729 preferred_charset_id = -1;
2730 break;
2731 default:
2732 abort ();
2733 }
2734 charbuf += -c - 1;
2735 continue;
2736 }
2737
df7492f9
KH
2738 if (ASCII_CHAR_P (c))
2739 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
2740 else if (CHAR_BYTE8_P (c))
2741 {
2742 c = CHAR_TO_BYTE8 (c);
2743 EMIT_ONE_BYTE (c);
2744 }
df7492f9 2745 else
aa72b389 2746 {
df7492f9
KH
2747 struct charset *charset;
2748 unsigned code;
2749 int dimension;
2750 int emacs_mule_id;
2751 unsigned char leading_codes[2];
2752
ff0dacd7
KH
2753 if (preferred_charset_id >= 0)
2754 {
2755 charset = CHARSET_FROM_ID (preferred_charset_id);
905ca9d2
KH
2756 if (CHAR_CHARSET_P (c, charset))
2757 code = ENCODE_CHAR (charset, c);
2758 else
2759 charset = char_charset (c, charset_list, &code);
ff0dacd7
KH
2760 }
2761 else
2762 charset = char_charset (c, charset_list, &code);
df7492f9
KH
2763 if (! charset)
2764 {
2765 c = coding->default_char;
2766 if (ASCII_CHAR_P (c))
2767 {
2768 EMIT_ONE_ASCII_BYTE (c);
2769 continue;
2770 }
2771 charset = char_charset (c, charset_list, &code);
2772 }
2773 dimension = CHARSET_DIMENSION (charset);
2774 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2775 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2776 EMIT_ONE_BYTE (leading_codes[0]);
2777 if (leading_codes[1])
2778 EMIT_ONE_BYTE (leading_codes[1]);
2779 if (dimension == 1)
1fa663f9 2780 EMIT_ONE_BYTE (code | 0x80);
aa72b389 2781 else
df7492f9 2782 {
1fa663f9 2783 code |= 0x8080;
df7492f9
KH
2784 EMIT_ONE_BYTE (code >> 8);
2785 EMIT_ONE_BYTE (code & 0xFF);
2786 }
aa72b389 2787 }
aa72b389 2788 }
065e3595 2789 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
2790 coding->produced_char += produced_chars;
2791 coding->produced = dst - coding->destination;
2792 return 0;
aa72b389 2793}
b73bfc1c 2794
4ed46869 2795\f
df7492f9 2796/*** 7. ISO2022 handlers ***/
4ed46869
KH
2797
2798/* The following note describes the coding system ISO2022 briefly.
39787efd 2799 Since the intention of this note is to help understand the
5a936b46 2800 functions in this file, some parts are NOT ACCURATE or are OVERLY
39787efd 2801 SIMPLIFIED. For thorough understanding, please refer to the
5a936b46 2802 original document of ISO2022. This is equivalent to the standard
cfb43547 2803 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
4ed46869
KH
2804
2805 ISO2022 provides many mechanisms to encode several character sets
cfb43547 2806 in 7-bit and 8-bit environments. For 7-bit environments, all text
39787efd
KH
2807 is encoded using bytes less than 128. This may make the encoded
2808 text a little bit longer, but the text passes more easily through
cfb43547 2809 several types of gateway, some of which strip off the MSB (Most
8ca3766a 2810 Significant Bit).
b73bfc1c 2811
cfb43547
DL
2812 There are two kinds of character sets: control character sets and
2813 graphic character sets. The former contain control characters such
4ed46869 2814 as `newline' and `escape' to provide control functions (control
39787efd 2815 functions are also provided by escape sequences). The latter
cfb43547 2816 contain graphic characters such as 'A' and '-'. Emacs recognizes
4ed46869
KH
2817 two control character sets and many graphic character sets.
2818
2819 Graphic character sets are classified into one of the following
39787efd
KH
2820 four classes, according to the number of bytes (DIMENSION) and
2821 number of characters in one dimension (CHARS) of the set:
2822 - DIMENSION1_CHARS94
2823 - DIMENSION1_CHARS96
2824 - DIMENSION2_CHARS94
2825 - DIMENSION2_CHARS96
2826
2827 In addition, each character set is assigned an identification tag,
cfb43547 2828 unique for each set, called the "final character" (denoted as <F>
39787efd
KH
2829 hereafter). The <F> of each character set is decided by ECMA(*)
2830 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2831 (0x30..0x3F are for private use only).
4ed46869
KH
2832
2833 Note (*): ECMA = European Computer Manufacturers Association
2834
cfb43547 2835 Here are examples of graphic character sets [NAME(<F>)]:
4ed46869
KH
2836 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2837 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2838 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2839 o DIMENSION2_CHARS96 -- none for the moment
2840
39787efd 2841 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4ed46869
KH
2842 C0 [0x00..0x1F] -- control character plane 0
2843 GL [0x20..0x7F] -- graphic character plane 0
2844 C1 [0x80..0x9F] -- control character plane 1
2845 GR [0xA0..0xFF] -- graphic character plane 1
2846
2847 A control character set is directly designated and invoked to C0 or
39787efd
KH
2848 C1 by an escape sequence. The most common case is that:
2849 - ISO646's control character set is designated/invoked to C0, and
2850 - ISO6429's control character set is designated/invoked to C1,
2851 and usually these designations/invocations are omitted in encoded
2852 text. In a 7-bit environment, only C0 can be used, and a control
2853 character for C1 is encoded by an appropriate escape sequence to
2854 fit into the environment. All control characters for C1 are
2855 defined to have corresponding escape sequences.
4ed46869
KH
2856
2857 A graphic character set is at first designated to one of four
2858 graphic registers (G0 through G3), then these graphic registers are
2859 invoked to GL or GR. These designations and invocations can be
2860 done independently. The most common case is that G0 is invoked to
39787efd
KH
2861 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2862 these invocations and designations are omitted in encoded text.
2863 In a 7-bit environment, only GL can be used.
4ed46869 2864
39787efd
KH
2865 When a graphic character set of CHARS94 is invoked to GL, codes
2866 0x20 and 0x7F of the GL area work as control characters SPACE and
2867 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2868 be used.
4ed46869
KH
2869
2870 There are two ways of invocation: locking-shift and single-shift.
2871 With locking-shift, the invocation lasts until the next different
39787efd
KH
2872 invocation, whereas with single-shift, the invocation affects the
2873 following character only and doesn't affect the locking-shift
2874 state. Invocations are done by the following control characters or
2875 escape sequences:
4ed46869
KH
2876
2877 ----------------------------------------------------------------------
39787efd 2878 abbrev function cntrl escape seq description
4ed46869 2879 ----------------------------------------------------------------------
39787efd
KH
2880 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2881 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2882 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2883 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2884 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2885 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2886 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2887 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2888 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4ed46869 2889 ----------------------------------------------------------------------
39787efd
KH
2890 (*) These are not used by any known coding system.
2891
2892 Control characters for these functions are defined by macros
2893 ISO_CODE_XXX in `coding.h'.
4ed46869 2894
39787efd 2895 Designations are done by the following escape sequences:
4ed46869
KH
2896 ----------------------------------------------------------------------
2897 escape sequence description
2898 ----------------------------------------------------------------------
2899 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2900 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2901 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2902 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2903 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2904 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2905 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2906 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2907 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2908 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2909 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2910 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2911 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2912 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2913 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2914 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2915 ----------------------------------------------------------------------
2916
2917 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
39787efd 2918 of dimension 1, chars 94, and final character <F>, etc...
4ed46869
KH
2919
2920 Note (*): Although these designations are not allowed in ISO2022,
2921 Emacs accepts them on decoding, and produces them on encoding
39787efd 2922 CHARS96 character sets in a coding system which is characterized as
4ed46869
KH
2923 7-bit environment, non-locking-shift, and non-single-shift.
2924
2925 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
df7492f9 2926 '(' must be omitted. We refer to this as "short-form" hereafter.
4ed46869 2927
cfb43547 2928 Now you may notice that there are a lot of ways of encoding the
39787efd
KH
2929 same multilingual text in ISO2022. Actually, there exist many
2930 coding systems such as Compound Text (used in X11's inter client
8ca3766a
DL
2931 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2932 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
4ed46869
KH
2933 localized platforms), and all of these are variants of ISO2022.
2934
2935 In addition to the above, Emacs handles two more kinds of escape
2936 sequences: ISO6429's direction specification and Emacs' private
2937 sequence for specifying character composition.
2938
39787efd 2939 ISO6429's direction specification takes the following form:
4ed46869
KH
2940 o CSI ']' -- end of the current direction
2941 o CSI '0' ']' -- end of the current direction
2942 o CSI '1' ']' -- start of left-to-right text
2943 o CSI '2' ']' -- start of right-to-left text
2944 The control character CSI (0x9B: control sequence introducer) is
39787efd
KH
2945 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2946
2947 Character composition specification takes the following form:
ec6d2bb8
KH
2948 o ESC '0' -- start relative composition
2949 o ESC '1' -- end composition
2950 o ESC '2' -- start rule-base composition (*)
2951 o ESC '3' -- start relative composition with alternate chars (**)
2952 o ESC '4' -- start rule-base composition with alternate chars (**)
b73bfc1c 2953 Since these are not standard escape sequences of any ISO standard,
cfb43547 2954 the use of them with these meanings is restricted to Emacs only.
ec6d2bb8 2955
5a936b46
DL
2956 (*) This form is used only in Emacs 20.7 and older versions,
2957 but newer versions can safely decode it.
cfb43547 2958 (**) This form is used only in Emacs 21.1 and newer versions,
5a936b46 2959 and older versions can't decode it.
ec6d2bb8 2960
cfb43547 2961 Here's a list of example usages of these composition escape
b73bfc1c 2962 sequences (categorized by `enum composition_method').
ec6d2bb8 2963
b73bfc1c 2964 COMPOSITION_RELATIVE:
ec6d2bb8 2965 ESC 0 CHAR [ CHAR ] ESC 1
8ca3766a 2966 COMPOSITION_WITH_RULE:
ec6d2bb8 2967 ESC 2 CHAR [ RULE CHAR ] ESC 1
b73bfc1c 2968 COMPOSITION_WITH_ALTCHARS:
ec6d2bb8 2969 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
b73bfc1c 2970 COMPOSITION_WITH_RULE_ALTCHARS:
ec6d2bb8 2971 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
4ed46869
KH
2972
/* Table with one entry per byte value (256 entries) giving that
   byte's ISO code class.  decode_coding_iso_2022 dispatches on
   iso_code_class[c1] for each source byte c1.  */
2973enum iso_code_class_type iso_code_class[256];
2974
df7492f9
KH
/* Return nonzero if the charset whose ID is ID may be used with
   CODING, i.e. ID is a valid index into CODING->safe_charsets (valid
   indices are 0 .. CODING->max_charset_id) and the byte stored there
   is not 255, the value setup_iso_safe_charsets leaves for charsets
   that were given no register.

   A negative ID is rejected up front so that it is never used as an
   out-of-bounds index into safe_charsets.  ID is evaluated more than
   once, so it must not have side effects.  */
#define SAFE_CHARSET_P(coding, id)		\
  ((id) >= 0					\
   && (id) <= (coding)->max_charset_id		\
   && (coding)->safe_charsets[id] != 255)
df7492f9
KH
2978
2979
/* Nonzero if the coding system registered for detection category
   CATEGORY has a non-negative CODING_ISO_INITIAL value for register
   index 1 (presumably the initial designation of graphic register
   G1 -- confirm against the definition of CODING_ISO_INITIAL).  */
#define SHIFT_OUT_OK(category)	\
  (0 <= CODING_ISO_INITIAL (&coding_categories[category], 1))
2982
2983static void
f0064e1f
DL
2984setup_iso_safe_charsets (attrs)
2985 Lisp_Object attrs;
df7492f9
KH
2986{
2987 Lisp_Object charset_list, safe_charsets;
2988 Lisp_Object request;
2989 Lisp_Object reg_usage;
2990 Lisp_Object tail;
2991 int reg94, reg96;
2992 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2993 int max_charset_id;
2994
2995 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2996 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2997 && ! EQ (charset_list, Viso_2022_charset_list))
2998 {
2999 CODING_ATTR_CHARSET_LIST (attrs)
3000 = charset_list = Viso_2022_charset_list;
3001 ASET (attrs, coding_attr_safe_charsets, Qnil);
3002 }
3003
3004 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
3005 return;
3006
3007 max_charset_id = 0;
3008 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
3009 {
3010 int id = XINT (XCAR (tail));
3011 if (max_charset_id < id)
3012 max_charset_id = id;
3013 }
d46c5b12 3014
1b3b981b
AS
3015 safe_charsets = make_uninit_string (max_charset_id + 1);
3016 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
df7492f9
KH
3017 request = AREF (attrs, coding_attr_iso_request);
3018 reg_usage = AREF (attrs, coding_attr_iso_usage);
3019 reg94 = XINT (XCAR (reg_usage));
3020 reg96 = XINT (XCDR (reg_usage));
3021
3022 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
3023 {
3024 Lisp_Object id;
3025 Lisp_Object reg;
3026 struct charset *charset;
3027
3028 id = XCAR (tail);
3029 charset = CHARSET_FROM_ID (XINT (id));
bf16eb23 3030 reg = Fcdr (Fassq (id, request));
df7492f9 3031 if (! NILP (reg))
8f924df7 3032 SSET (safe_charsets, XINT (id), XINT (reg));
df7492f9
KH
3033 else if (charset->iso_chars_96)
3034 {
3035 if (reg96 < 4)
8f924df7 3036 SSET (safe_charsets, XINT (id), reg96);
df7492f9
KH
3037 }
3038 else
3039 {
3040 if (reg94 < 4)
8f924df7 3041 SSET (safe_charsets, XINT (id), reg94);
df7492f9
KH
3042 }
3043 }
3044 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
3045}
d46c5b12 3046
b6871cc7 3047
4ed46869 3048/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
3049 Check if a text is encoded in one of ISO-2022 based codig systems.
3050 If it is, return 1, else return 0. */
4ed46869 3051
0a28aafb 3052static int
ff0dacd7 3053detect_coding_iso_2022 (coding, detect_info)
df7492f9 3054 struct coding_system *coding;
ff0dacd7 3055 struct coding_detection_info *detect_info;
4ed46869 3056{
8f924df7
KH
3057 const unsigned char *src = coding->source, *src_base = src;
3058 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9 3059 int multibytep = coding->src_multibyte;
ff0dacd7 3060 int single_shifting = 0;
df7492f9
KH
3061 int id;
3062 int c, c1;
3063 int consumed_chars = 0;
3064 int i;
ff0dacd7
KH
3065 int rejected = 0;
3066 int found = 0;
cee53ed4 3067 int composition_count = -1;
ff0dacd7
KH
3068
3069 detect_info->checked |= CATEGORY_MASK_ISO;
df7492f9
KH
3070
3071 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
3072 {
3073 struct coding_system *this = &(coding_categories[i]);
3074 Lisp_Object attrs, val;
3075
c6b278e7
KH
3076 if (this->id < 0)
3077 continue;
df7492f9
KH
3078 attrs = CODING_ID_ATTRS (this->id);
3079 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
1b3b981b 3080 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
df7492f9
KH
3081 setup_iso_safe_charsets (attrs);
3082 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7 3083 this->max_charset_id = SCHARS (val) - 1;
1b3b981b 3084 this->safe_charsets = SDATA (val);
df7492f9
KH
3085 }
3086
3087 /* A coding system of this category is always ASCII compatible. */
3088 src += coding->head_ascii;
3f003981 3089
ff0dacd7 3090 while (rejected != CATEGORY_MASK_ISO)
4ed46869 3091 {
065e3595 3092 src_base = src;
df7492f9 3093 ONE_MORE_BYTE (c);
4ed46869
KH
3094 switch (c)
3095 {
3096 case ISO_CODE_ESC:
74383408
KH
3097 if (inhibit_iso_escape_detection)
3098 break;
f46869e4 3099 single_shifting = 0;
df7492f9 3100 ONE_MORE_BYTE (c);
d46c5b12 3101 if (c >= '(' && c <= '/')
4ed46869 3102 {
bf9cdd4e 3103 /* Designation sequence for a charset of dimension 1. */
df7492f9 3104 ONE_MORE_BYTE (c1);
d46c5b12 3105 if (c1 < ' ' || c1 >= 0x80
df7492f9 3106 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
d46c5b12
KH
3107 /* Invalid designation sequence. Just ignore. */
3108 break;
bf9cdd4e
KH
3109 }
3110 else if (c == '$')
3111 {
3112 /* Designation sequence for a charset of dimension 2. */
df7492f9 3113 ONE_MORE_BYTE (c);
bf9cdd4e
KH
3114 if (c >= '@' && c <= 'B')
3115 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
ff0dacd7 3116 id = iso_charset_table[1][0][c];
bf9cdd4e 3117 else if (c >= '(' && c <= '/')
bcf26d6a 3118 {
df7492f9 3119 ONE_MORE_BYTE (c1);
d46c5b12 3120 if (c1 < ' ' || c1 >= 0x80
df7492f9 3121 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
d46c5b12
KH
3122 /* Invalid designation sequence. Just ignore. */
3123 break;
bcf26d6a 3124 }
bf9cdd4e 3125 else
ff0dacd7 3126 /* Invalid designation sequence. Just ignore it. */
d46c5b12
KH
3127 break;
3128 }
ae9ff118 3129 else if (c == 'N' || c == 'O')
d46c5b12 3130 {
ae9ff118 3131 /* ESC <Fe> for SS2 or SS3. */
ff0dacd7
KH
3132 single_shifting = 1;
3133 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
d46c5b12 3134 break;
4ed46869 3135 }
cee53ed4
KH
3136 else if (c == '1')
3137 {
3138 /* End of composition. */
3139 if (composition_count < 0
3140 || composition_count > MAX_COMPOSITION_COMPONENTS)
3141 /* Invalid */
3142 break;
3143 composition_count = -1;
3144 found |= CATEGORY_MASK_ISO;
3145 }
ec6d2bb8
KH
3146 else if (c >= '0' && c <= '4')
3147 {
3148 /* ESC <Fp> for start/end composition. */
cee53ed4 3149 composition_count = 0;
ec6d2bb8
KH
3150 break;
3151 }
bf9cdd4e 3152 else
df7492f9 3153 {
ff0dacd7 3154 /* Invalid escape sequence. Just ignore it. */
df7492f9
KH
3155 break;
3156 }
d46c5b12
KH
3157
3158 /* We found a valid designation sequence for CHARSET. */
ff0dacd7 3159 rejected |= CATEGORY_MASK_ISO_8BIT;
df7492f9
KH
3160 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3161 id))
ff0dacd7 3162 found |= CATEGORY_MASK_ISO_7;
d46c5b12 3163 else
ff0dacd7 3164 rejected |= CATEGORY_MASK_ISO_7;
df7492f9
KH
3165 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3166 id))
ff0dacd7 3167 found |= CATEGORY_MASK_ISO_7_TIGHT;
d46c5b12 3168 else
ff0dacd7 3169 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
df7492f9
KH
3170 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3171 id))
ff0dacd7 3172 found |= CATEGORY_MASK_ISO_7_ELSE;
ae9ff118 3173 else
ff0dacd7 3174 rejected |= CATEGORY_MASK_ISO_7_ELSE;
df7492f9
KH
3175 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3176 id))
ff0dacd7 3177 found |= CATEGORY_MASK_ISO_8_ELSE;
ae9ff118 3178 else
ff0dacd7 3179 rejected |= CATEGORY_MASK_ISO_8_ELSE;
4ed46869
KH
3180 break;
3181
4ed46869 3182 case ISO_CODE_SO:
d46c5b12 3183 case ISO_CODE_SI:
ff0dacd7 3184 /* Locking shift out/in. */
74383408
KH
3185 if (inhibit_iso_escape_detection)
3186 break;
f46869e4 3187 single_shifting = 0;
ff0dacd7 3188 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
d46c5b12
KH
3189 break;
3190
4ed46869 3191 case ISO_CODE_CSI:
ff0dacd7 3192 /* Control sequence introducer. */
f46869e4 3193 single_shifting = 0;
ff0dacd7
KH
3194 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3195 found |= CATEGORY_MASK_ISO_8_ELSE;
3196 goto check_extra_latin;
3197
4ed46869
KH
3198 case ISO_CODE_SS2:
3199 case ISO_CODE_SS3:
ff0dacd7
KH
3200 /* Single shift. */
3201 if (inhibit_iso_escape_detection)
3202 break;
75e2a253 3203 single_shifting = 0;
ff0dacd7
KH
3204 rejected |= CATEGORY_MASK_ISO_7BIT;
3205 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3206 & CODING_ISO_FLAG_SINGLE_SHIFT)
75e2a253 3207 found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
ff0dacd7
KH
3208 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3209 & CODING_ISO_FLAG_SINGLE_SHIFT)
75e2a253
KH
3210 found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
3211 if (single_shifting)
3212 break;
ff0dacd7 3213 goto check_extra_latin;
4ed46869
KH
3214
3215 default:
065e3595
KH
3216 if (c < 0)
3217 continue;
4ed46869 3218 if (c < 0x80)
f46869e4 3219 {
cee53ed4
KH
3220 if (composition_count >= 0)
3221 composition_count++;
f46869e4
KH
3222 single_shifting = 0;
3223 break;
3224 }
ff0dacd7 3225 if (c >= 0xA0)
c4825358 3226 {
ff0dacd7
KH
3227 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3228 found |= CATEGORY_MASK_ISO_8_1;
f46869e4 3229 /* Check the length of succeeding codes of the range
ff0dacd7
KH
3230 0xA0..0FF. If the byte length is even, we include
3231 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
3232 only when we are not single shifting. */
3233 if (! single_shifting
3234 && ! (rejected & CATEGORY_MASK_ISO_8_2))
f46869e4 3235 {
e17de821 3236 int i = 1;
b73bfc1c
KH
3237 while (src < src_end)
3238 {
df7492f9 3239 ONE_MORE_BYTE (c);
b73bfc1c
KH
3240 if (c < 0xA0)
3241 break;
3242 i++;
3243 }
3244
3245 if (i & 1 && src < src_end)
cee53ed4
KH
3246 {
3247 rejected |= CATEGORY_MASK_ISO_8_2;
3248 if (composition_count >= 0)
3249 composition_count += i;
3250 }
f46869e4 3251 else
cee53ed4
KH
3252 {
3253 found |= CATEGORY_MASK_ISO_8_2;
3254 if (composition_count >= 0)
3255 composition_count += i / 2;
3256 }
f46869e4 3257 }
ff0dacd7 3258 break;
4ed46869 3259 }
ff0dacd7
KH
3260 check_extra_latin:
3261 single_shifting = 0;
3262 if (! VECTORP (Vlatin_extra_code_table)
3263 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3264 {
3265 rejected = CATEGORY_MASK_ISO;
3266 break;
3267 }
3268 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3269 & CODING_ISO_FLAG_LATIN_EXTRA)
3270 found |= CATEGORY_MASK_ISO_8_1;
3271 else
3272 rejected |= CATEGORY_MASK_ISO_8_1;
75e2a253 3273 rejected |= CATEGORY_MASK_ISO_8_2;
4ed46869
KH
3274 }
3275 }
ff0dacd7
KH
3276 detect_info->rejected |= CATEGORY_MASK_ISO;
3277 return 0;
4ed46869 3278
df7492f9 3279 no_more_source:
ff0dacd7
KH
3280 detect_info->rejected |= rejected;
3281 detect_info->found |= (found & ~rejected);
df7492f9 3282 return 1;
4ed46869 3283}
ec6d2bb8 3284
4ed46869 3285
134b9549
KH
/* Record in CODING that the charset identified by DIM, CHARS_96 and
   the final byte FINAL is designated to register REG.

   If FINAL is outside '0'..127, or ISO_CHARSET_TABLE has no charset
   for the combination, or the charset fails SAFE_CHARSET_P for
   CODING, the designation of REG is set to -2 and CHARS_96 is set to
   -1.  Otherwise the charset ID is stored as the designation of REG,
   after mapping charset_jisx0201_roman to charset_ascii when
   CODING_ISO_FLAG_USE_ROMAN is set and charset_jisx0208_1978 to
   charset_jisx0208 when CODING_ISO_FLAG_USE_OLDJIS is set.

   CHARS_96 must be an lvalue: it is set to -1 whenever the escape
   sequence should be kept.  Uses the variable `coding' of the
   expansion site.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final) \
  do { \
    int id, prev; \
 \
    if (final < '0' || final >= 128 \
        || (id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0 \
        || ! SAFE_CHARSET_P (coding, id)) \
      { \
        /* Remember that REG received an invalid designation.  */ \
        CODING_ISO_DESIGNATION (coding, reg) = -2; \
        chars_96 = -1; \
        break; \
      } \
 \
    prev = CODING_ISO_DESIGNATION (coding, reg); \
 \
    if (id == charset_jisx0201_roman) \
      { \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
          id = charset_ascii; \
      } \
    else if (id == charset_jisx0208_1978) \
      { \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
          id = charset_jisx0208; \
      } \
 \
    CODING_ISO_DESIGNATION (coding, reg) = id; \
 \
    /* An ASCII designation that follows an invalid designation to \
       the same register must be kept as well.  */ \
    if (prev == -2 && id == charset_ascii) \
      chars_96 = -1; \
  } while (0)
3318
d46c5b12 3319
e951386e
KH
3320/* Handle these composition sequences (ALT: alternate char):
3321
3322 (1) relative composition: ESC 0 CHAR ... ESC 1
3323 (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3324 (3) altchar composition: ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3325 (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3326
3327 When the start sequence (ESC 0/2/3/4) is found, this annotation
3328 header is produced.
3329
3330 [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3331
3332 Then, upon reading CHAR or RULE (one or two bytes), these codes are
3333 produced until the end sequence (ESC 1) is found:
3334
3335 (1) CHAR ... CHAR
3336 (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3337 (3) ALT ... ALT -1 -1 CHAR ... CHAR
3338 (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3339
3340 When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3341 annotation header are updated as below:
3342
3343 (1) LENGTH: unchanged, NCHARS: number of CHARs
3344 (2) LENGTH: unchanged, NCHARS: number of CHARs
3345 (3) LENGTH: += number of ALTs + 2, NCHARS: number of CHARs
3346 (4) LENGTH: += number of ALTs * 3, NCHARS: number of CHARs
3347
3348 If an error is found while composing, the annotation header is
3349 changed to:
3350
3351 [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3352
3353 and the sequence [ -2 DECODED-RULE ] is changed to the original
3354 byte sequence as below:
3355 o the original byte sequence is B: [ B -1 ]
3356 o the original byte sequence is B1 B2: [ B1 B2 ]
3357 and the sequence [ -1 -1 ] is changed to the original byte
3358 sequence:
3359 [ ESC '0' ]
3360*/
3361
/* Decode the composition rule that starts with the byte held in the
   variable c1 of the expansion site, reading one more source byte
   with ONE_MORE_BYTE when the rule uses the two-byte form.  Store
   the encoded rule in RULE and the number of source bytes it
   occupied in NBYTES.

   RULE is negative when the rule is invalid.  When c1 is below 32,
   RULE is negative and NBYTES is left untouched.  Two-byte rules
   that encode to a non-negative value get 0x100 added so that they
   can be told apart from one-byte rules.  */

#define DECODE_COMPOSITION_RULE(rule, nbytes) \
  do { \
    rule = c1 - 32; \
    if (rule >= 0) \
      { \
        if (rule >= 81)         /* new format (after ver.21) */ \
          { \
            int second; \
 \
            ONE_MORE_BYTE (second); \
            rule = COMPOSITION_ENCODE_RULE (rule - 81, second - 32); \
            if (rule >= 0) \
              rule += 0x100;    /* to distinguish it from the old format */ \
            nbytes = 2; \
          } \
        else                    /* old format (before ver.21) */ \
          { \
            int gref = (rule) / 9; \
            int nref = (rule) % 9; \
 \
            gref = (gref == 4 ? 10 : gref); \
            nref = (nref == 4 ? 10 : nref); \
            rule = COMPOSITION_ENCODE_RULE (gref, nref); \
            nbytes = 1; \
          } \
      } \
  } while (0)
3392
/* Turn the decoded composition rule RULE back into the element(s) it
   was decoded from and store them at charbuf[idx] and
   charbuf[idx + 1], using the variables charbuf, idx and new_chars
   of the expansion site.

   A rule below 0x100 (old format) becomes the single element
   32 + gref * 9 + nref followed by -1, and new_chars is incremented
   by one.  A rule of 0x100 or more (new format) becomes the two
   elements 32 + 81 + gref and 32 + nref, and new_chars is
   incremented by two.

   RULE is evaluated exactly once, before charbuf is written, so it
   may be an arbitrary expression, including charbuf[idx + 1].  */
#define ENCODE_COMPOSITION_RULE(rule) \
  do { \
    int enc_rule = (rule); \
    int gref = (enc_rule % 0x100) / 12; \
    int nref = (enc_rule % 0x100) % 12; \
 \
    if (enc_rule < 0x100)       /* old format */ \
      { \
        if (gref == 10) gref = 4; \
        if (nref == 10) nref = 4; \
        charbuf[idx] = 32 + gref * 9 + nref; \
        charbuf[idx + 1] = -1; \
        new_chars++; \
      } \
    else                        /* new format */ \
      { \
        charbuf[idx] = 32 + 81 + gref; \
        charbuf[idx + 1] = 32 + nref; \
        new_chars += 2; \
      } \
  } while (0)
3412
e951386e
KH
3413/* Finish the current composition as invalid. */
3414
3415static int finish_composition P_ ((int *, struct composition_status *));
3416
3417static int
3418finish_composition (charbuf, cmp_status)
3419 int *charbuf;
3420 struct composition_status *cmp_status;
3421{
3422 int idx = - cmp_status->length;
3423 int new_chars;
3424
3425 /* Recover the original ESC sequence */
3426 charbuf[idx++] = ISO_CODE_ESC;
3427 charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3428 : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3429 : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3430 /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3431 : '4');
3432 charbuf[idx++] = -2;
3433 charbuf[idx++] = 0;
3434 charbuf[idx++] = -1;
3435 new_chars = cmp_status->nchars;
3436 if (cmp_status->method >= COMPOSITION_WITH_RULE)
3437 for (; idx < 0; idx++)
3438 {
3439 int elt = charbuf[idx];
3440
3441 if (elt == -2)
3442 {
3443 ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3444 idx++;
3445 }
3446 else if (elt == -1)
3447 {
3448 charbuf[idx++] = ISO_CODE_ESC;
3449 charbuf[idx] = '0';
3450 new_chars += 2;
3451 }
3452 }
3453 cmp_status->state = COMPOSING_NO;
3454 return new_chars;
3455}
3456
/* If characters are under composition, i.e. cmp_status->state is
   not COMPOSING_NO, finish the composition with finish_composition
   and add its return value to char_offset.  Uses the variables
   cmp_status, charbuf and char_offset of the expansion site.  */
#define MAYBE_FINISH_COMPOSITION() \
  do { \
    if (COMPOSING_NO != cmp_status->state) \
      char_offset += finish_composition (charbuf, cmp_status); \
  } while (0)
d46c5b12 3463
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4,
   where C1 is the character that followed ESC.

   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   When C1 is '0' and an ESC 3 or ESC 4 composition is in the state
   where its alternate-char list may end, the ESC 0 is the separator
   between the alternate chars and the composed chars: [ -1 -1 ] is
   appended to charbuf and the state becomes COMPOSING_CHAR.

   In every other case any composition in progress is finished as
   invalid, and a new one is started: the method and state are set
   from C1, an annotation header is emitted with
   ADD_COMPOSITION_DATA, the counters in cmp_status are reset, and
   coding->annotated is set.

   Uses the variables cmp_status, charbuf, char_offset and coding of
   the expansion site.  */

#define DECODE_COMPOSITION_START(c1) \
  do { \
    if (c1 == '0' \
        && ((cmp_status->state == COMPOSING_COMPONENT_CHAR \
             && cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
            || (cmp_status->state == COMPOSING_COMPONENT_RULE \
                && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
      { \
        charbuf[0] = -1; \
        charbuf[1] = -1; \
        charbuf += 2; \
        cmp_status->length += 2; \
        cmp_status->state = COMPOSING_CHAR; \
      } \
    else \
      { \
        MAYBE_FINISH_COMPOSITION (); \
        if (c1 == '0') \
          cmp_status->method = COMPOSITION_RELATIVE; \
        else if (c1 == '2') \
          cmp_status->method = COMPOSITION_WITH_RULE; \
        else if (c1 == '3') \
          cmp_status->method = COMPOSITION_WITH_ALTCHARS; \
        else \
          cmp_status->method = COMPOSITION_WITH_RULE_ALTCHARS; \
        if (c1 <= '2') \
          cmp_status->state = COMPOSING_CHAR; \
        else \
          cmp_status->state = COMPOSING_COMPONENT_CHAR; \
        ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method); \
        cmp_status->length = MAX_ANNOTATION_LENGTH; \
        cmp_status->ncomps = 0; \
        cmp_status->nchars = 0; \
        coding->annotated = 1; \
      } \
  } while (0)
3504
ec6d2bb8 3505
/* Handle composition end sequence ESC 1.

   The sequence is invalid when no composed character has been
   stored (cmp_status->nchars is 0), or when
   (cmp_status->state == COMPOSING_CHAR) equals
   (cmp_status->method == COMPOSITION_WITH_RULE).  In that case any
   composition in progress is finished as invalid and control jumps
   to the label invalid_code of the expansion site.

   Otherwise the annotation header that starts cmp_status->length
   elements before charbuf is completed: its first element is
   decreased by ncomps + 2 for COMPOSITION_WITH_ALTCHARS or by
   ncomps * 3 for COMPOSITION_WITH_RULE_ALTCHARS, and its third
   element is set to nchars.  char_offset is advanced by nchars and
   the state is reset to COMPOSING_NO.  */

#define DECODE_COMPOSITION_END() \
  do { \
    int *cmp_header; \
 \
    if (cmp_status->nchars == 0 \
        || ((cmp_status->state == COMPOSING_CHAR) \
            == (cmp_status->method == COMPOSITION_WITH_RULE))) \
      { \
        MAYBE_FINISH_COMPOSITION (); \
        goto invalid_code; \
      } \
    cmp_header = charbuf - cmp_status->length; \
    if (cmp_status->method == COMPOSITION_WITH_ALTCHARS) \
      cmp_header[0] -= cmp_status->ncomps + 2; \
    else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS) \
      cmp_header[0] -= cmp_status->ncomps * 3; \
    cmp_header[2] = cmp_status->nchars; \
    char_offset += cmp_status->nchars; \
    cmp_status->state = COMPOSING_NO; \
  } while (0)
3525
/* Store a composition rule RULE in charbuf as the pair [ -2 RULE ],
   advance charbuf past it, add 2 to cmp_status->length and step
   cmp_status->state back by one.  Uses the variables charbuf and
   cmp_status of the expansion site.  */

#define STORE_COMPOSITION_RULE(rule) \
  do { \
    charbuf[0] = -2; \
    charbuf[1] = rule; \
    charbuf += 2; \
    cmp_status->length += 2; \
    cmp_status->state--; \
  } while (0)
ec6d2bb8 3535
e951386e
KH
/* Store a composed char or a component char C in charbuf, and update
   cmp_status: length is incremented, and so is nchars when the
   state is COMPOSING_CHAR, or ncomps otherwise.  The state is then
   advanced by one when the method is COMPOSITION_WITH_RULE, or when
   it is COMPOSITION_WITH_RULE_ALTCHARS and the state is
   COMPOSING_COMPONENT_CHAR.  Uses the variables charbuf and
   cmp_status of the expansion site.  */

#define STORE_COMPOSITION_CHAR(c) \
  do { \
    *charbuf++ = (c); \
    cmp_status->length++; \
    if (cmp_status->state != COMPOSING_CHAR) \
      cmp_status->ncomps++; \
    else \
      cmp_status->nchars++; \
    if (cmp_status->method == COMPOSITION_WITH_RULE \
        || (cmp_status->state == COMPOSING_COMPONENT_CHAR \
            && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)) \
      cmp_status->state++; \
  } while (0)
88993dfd 3552
d46c5b12 3553
4ed46869
KH
3554/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3555
b73bfc1c 3556static void
df7492f9 3557decode_coding_iso_2022 (coding)
4ed46869 3558 struct coding_system *coding;
4ed46869 3559{
8f924df7
KH
3560 const unsigned char *src = coding->source + coding->consumed;
3561 const unsigned char *src_end = coding->source + coding->src_bytes;
3562 const unsigned char *src_base;
69a80ea3 3563 int *charbuf = coding->charbuf + coding->charbuf_used;
df80c7f0
KH
3564 /* We may produce two annotations (charset and composition) in one
3565 loop and one more charset annotation at the end. */
ff0dacd7 3566 int *charbuf_end
df80c7f0 3567 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
df7492f9 3568 int consumed_chars = 0, consumed_chars_base;
df7492f9 3569 int multibytep = coding->src_multibyte;
4ed46869 3570 /* Charsets invoked to graphic plane 0 and 1 respectively. */
df7492f9
KH
3571 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3572 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
134b9549 3573 int charset_id_2, charset_id_3;
df7492f9
KH
3574 struct charset *charset;
3575 int c;
e951386e 3576 struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
24a73b0a 3577 Lisp_Object attrs, charset_list;
ff0dacd7
KH
3578 int char_offset = coding->produced_char;
3579 int last_offset = char_offset;
3580 int last_id = charset_ascii;
0a9564cb
EZ
3581 int eol_crlf =
3582 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 3583 int byte_after_cr = -1;
e951386e 3584 int i;
df7492f9 3585
24a73b0a 3586 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9 3587 setup_iso_safe_charsets (attrs);
287c57d7
KH
3588 /* Charset list may have been changed. */
3589 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
1b3b981b 3590 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
b73bfc1c 3591
e951386e
KH
3592 if (cmp_status->state != COMPOSING_NO)
3593 {
3594 for (i = 0; i < cmp_status->length; i++)
3595 *charbuf++ = cmp_status->carryover[i];
3596 coding->annotated = 1;
3597 }
3598
b73bfc1c 3599 while (1)
4ed46869 3600 {
463f5630 3601 int c1, c2;
b73bfc1c
KH
3602
3603 src_base = src;
df7492f9
KH
3604 consumed_chars_base = consumed_chars;
3605
3606 if (charbuf >= charbuf_end)
b71f6f73
KH
3607 {
3608 if (byte_after_cr >= 0)
3609 src_base--;
3610 break;
3611 }
df7492f9 3612
119852e7
KH
3613 if (byte_after_cr >= 0)
3614 c1 = byte_after_cr, byte_after_cr = -1;
3615 else
3616 ONE_MORE_BYTE (c1);
065e3595
KH
3617 if (c1 < 0)
3618 goto invalid_code;
4ed46869 3619
e951386e 3620 if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
4ed46869 3621 {
e951386e
KH
3622 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3623 char_offset++;
3624 CODING_ISO_EXTSEGMENT_LEN (coding)--;
3625 continue;
3626 }
3627
3628 if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3629 {
3630 if (c1 == ISO_CODE_ESC)
ec6d2bb8 3631 {
e951386e
KH
3632 if (src + 1 >= src_end)
3633 goto no_more_source;
3634 *charbuf++ = ISO_CODE_ESC;
3635 char_offset++;
3636 if (src[0] == '%' && src[1] == '@')
df7492f9 3637 {
e951386e
KH
3638 src += 2;
3639 consumed_chars += 2;
3640 char_offset += 2;
3641 /* We are sure charbuf can contain two more chars. */
3642 *charbuf++ = '%';
3643 *charbuf++ = '@';
3644 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
df7492f9 3645 }
4ed46869 3646 }
e951386e
KH
3647 else
3648 {
3649 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3650 char_offset++;
3651 }
3652 continue;
3653 }
3654
3655 if ((cmp_status->state == COMPOSING_RULE
3656 || cmp_status->state == COMPOSING_COMPONENT_RULE)
3657 && c1 != ISO_CODE_ESC)
3658 {
3659 int rule, nbytes;
3660
3661 DECODE_COMPOSITION_RULE (rule, nbytes);
3662 if (rule < 0)
3663 goto invalid_code;
3664 STORE_COMPOSITION_RULE (rule);
3665 continue;
3666 }
3667
3668 /* We produce at most one character. */
3669 switch (iso_code_class [c1])
3670 {
3671 case ISO_0x20_or_0x7F:
df7492f9
KH
3672 if (charset_id_0 < 0
3673 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
781d7a48
KH
3674 /* This is SPACE or DEL. */
3675 charset = CHARSET_FROM_ID (charset_ascii);
3676 else
3677 charset = CHARSET_FROM_ID (charset_id_0);
3678 break;
4ed46869
KH
3679
3680 case ISO_graphic_plane_0:
134b9549
KH
3681 if (charset_id_0 < 0)
3682 charset = CHARSET_FROM_ID (charset_ascii);
3683 else
3684 charset = CHARSET_FROM_ID (charset_id_0);
4ed46869
KH
3685 break;
3686
3687 case ISO_0xA0_or_0xFF:
df7492f9
KH
3688 if (charset_id_1 < 0
3689 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3690 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3691 goto invalid_code;
4ed46869
KH
3692 /* This is a graphic character, we fall down ... */
3693
3694 case ISO_graphic_plane_1:
df7492f9
KH
3695 if (charset_id_1 < 0)
3696 goto invalid_code;
3697 charset = CHARSET_FROM_ID (charset_id_1);
4ed46869
KH
3698 break;
3699
df7492f9 3700 case ISO_control_0:
119852e7
KH
3701 if (eol_crlf && c1 == '\r')
3702 ONE_MORE_BYTE (byte_after_cr);
df7492f9
KH
3703 MAYBE_FINISH_COMPOSITION ();
3704 charset = CHARSET_FROM_ID (charset_ascii);
4ed46869
KH
3705 break;
3706
df7492f9 3707 case ISO_control_1:
df7492f9
KH
3708 goto invalid_code;
3709
4ed46869 3710 case ISO_shift_out:
df7492f9
KH
3711 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3712 || CODING_ISO_DESIGNATION (coding, 1) < 0)
3713 goto invalid_code;
3714 CODING_ISO_INVOCATION (coding, 0) = 1;
3715 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3716 continue;
4ed46869
KH
3717
3718 case ISO_shift_in:
df7492f9
KH
3719 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3720 goto invalid_code;
3721 CODING_ISO_INVOCATION (coding, 0) = 0;
3722 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3723 continue;
4ed46869
KH
3724
3725 case ISO_single_shift_2_7:
3726 case ISO_single_shift_2:
df7492f9
KH
3727 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3728 goto invalid_code;
4ed46869
KH
3729 /* SS2 is handled as an escape sequence of ESC 'N' */
3730 c1 = 'N';
3731 goto label_escape_sequence;
3732
3733 case ISO_single_shift_3:
df7492f9
KH
3734 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3735 goto invalid_code;
4ed46869
KH
3736 /* SS2 is handled as an escape sequence of ESC 'O' */
3737 c1 = 'O';
3738 goto label_escape_sequence;
3739
3740 case ISO_control_sequence_introducer:
3741 /* CSI is handled as an escape sequence of ESC '[' ... */
3742 c1 = '[';
3743 goto label_escape_sequence;
3744
3745 case ISO_escape:
3746 ONE_MORE_BYTE (c1);
3747 label_escape_sequence:
df7492f9 3748 /* Escape sequences handled here are invocation,
4ed46869
KH
3749 designation, direction specification, and character
3750 composition specification. */
3751 switch (c1)
3752 {
3753 case '&': /* revision of following character set */
3754 ONE_MORE_BYTE (c1);
3755 if (!(c1 >= '@' && c1 <= '~'))
df7492f9 3756 goto invalid_code;
4ed46869
KH
3757 ONE_MORE_BYTE (c1);
3758 if (c1 != ISO_CODE_ESC)
df7492f9 3759 goto invalid_code;
4ed46869
KH
3760 ONE_MORE_BYTE (c1);
3761 goto label_escape_sequence;
3762
3763 case '$': /* designation of 2-byte character set */
df7492f9
KH
3764 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3765 goto invalid_code;
134b9549
KH
3766 {
3767 int reg, chars96;
3768
3769 ONE_MORE_BYTE (c1);
3770 if (c1 >= '@' && c1 <= 'B')
3771 { /* designation of JISX0208.1978, GB2312.1980,
88993dfd 3772 or JISX0208.1980 */
134b9549
KH
3773 reg = 0, chars96 = 0;
3774 }
3775 else if (c1 >= 0x28 && c1 <= 0x2B)
3776 { /* designation of DIMENSION2_CHARS94 character set */
3777 reg = c1 - 0x28, chars96 = 0;
3778 ONE_MORE_BYTE (c1);
3779 }
3780 else if (c1 >= 0x2C && c1 <= 0x2F)
3781 { /* designation of DIMENSION2_CHARS96 character set */
3782 reg = c1 - 0x2C, chars96 = 1;
3783 ONE_MORE_BYTE (c1);
3784 }
3785 else
3786 goto invalid_code;
3787 DECODE_DESIGNATION (reg, 2, chars96, c1);
3788 /* We must update these variables now. */
3789 if (reg == 0)
3790 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3791 else if (reg == 1)
3792 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3793 if (chars96 < 0)
3794 goto invalid_code;
3795 }
b73bfc1c 3796 continue;
4ed46869
KH
3797
3798 case 'n': /* invocation of locking-shift-2 */
df7492f9
KH
3799 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3800 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3801 goto invalid_code;
3802 CODING_ISO_INVOCATION (coding, 0) = 2;
3803 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3804 continue;
4ed46869
KH
3805
3806 case 'o': /* invocation of locking-shift-3 */
df7492f9
KH
3807 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3808 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3809 goto invalid_code;
3810 CODING_ISO_INVOCATION (coding, 0) = 3;
3811 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3812 continue;
4ed46869
KH
3813
3814 case 'N': /* invocation of single-shift-2 */
df7492f9
KH
3815 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3816 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3817 goto invalid_code;
134b9549
KH
3818 charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3819 if (charset_id_2 < 0)
3820 charset = CHARSET_FROM_ID (charset_ascii);
3821 else
3822 charset = CHARSET_FROM_ID (charset_id_2);
b73bfc1c 3823 ONE_MORE_BYTE (c1);
e7046a18 3824 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
df7492f9 3825 goto invalid_code;
4ed46869
KH
3826 break;
3827
3828 case 'O': /* invocation of single-shift-3 */
df7492f9
KH
3829 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3830 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3831 goto invalid_code;
134b9549
KH
3832 charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3833 if (charset_id_3 < 0)
3834 charset = CHARSET_FROM_ID (charset_ascii);
3835 else
3836 charset = CHARSET_FROM_ID (charset_id_3);
b73bfc1c 3837 ONE_MORE_BYTE (c1);
e7046a18 3838 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
df7492f9 3839 goto invalid_code;
4ed46869
KH
3840 break;
3841
ec6d2bb8 3842 case '0': case '2': case '3': case '4': /* start composition */
df7492f9
KH
3843 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3844 goto invalid_code;
e951386e
KH
3845 if (last_id != charset_ascii)
3846 {
3847 ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3848 last_id = charset_ascii;
3849 last_offset = char_offset;
3850 }
ec6d2bb8 3851 DECODE_COMPOSITION_START (c1);
b73bfc1c 3852 continue;
4ed46869 3853
ec6d2bb8 3854 case '1': /* end composition */
e951386e 3855 if (cmp_status->state == COMPOSING_NO)
df7492f9
KH
3856 goto invalid_code;
3857 DECODE_COMPOSITION_END ();
b73bfc1c 3858 continue;
4ed46869
KH
3859
3860 case '[': /* specification of direction */
df7492f9
KH
3861 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3862 goto invalid_code;
4ed46869 3863 /* For the moment, nested direction is not supported.
d46c5b12 3864 So, `coding->mode & CODING_MODE_DIRECTION' zero means
df7492f9 3865 left-to-right, and nozero means right-to-left. */
4ed46869
KH
3866 ONE_MORE_BYTE (c1);
3867 switch (c1)
3868 {
3869 case ']': /* end of the current direction */
d46c5b12 3870 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869
KH
3871
3872 case '0': /* end of the current direction */
3873 case '1': /* start of left-to-right direction */
3874 ONE_MORE_BYTE (c1);
3875 if (c1 == ']')
d46c5b12 3876 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869 3877 else
df7492f9 3878 goto invalid_code;
4ed46869
KH
3879 break;
3880
3881 case '2': /* start of right-to-left direction */
3882 ONE_MORE_BYTE (c1);
3883 if (c1 == ']')
d46c5b12 3884 coding->mode |= CODING_MODE_DIRECTION;
4ed46869 3885 else
df7492f9 3886 goto invalid_code;
4ed46869
KH
3887 break;
3888
3889 default:
df7492f9 3890 goto invalid_code;
4ed46869 3891 }
b73bfc1c 3892 continue;
4ed46869 3893
103e0180 3894 case '%':
103e0180
KH
3895 ONE_MORE_BYTE (c1);
3896 if (c1 == '/')
3897 {
3898 /* CTEXT extended segment:
3899 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3900 We keep these bytes as is for the moment.
3901 They may be decoded by post-read-conversion. */
3902 int dim, M, L;
4776e638 3903 int size;
8f924df7 3904
103e0180 3905 ONE_MORE_BYTE (dim);
e951386e
KH
3906 if (dim < 0 || dim > 4)
3907 goto invalid_code;
103e0180 3908 ONE_MORE_BYTE (M);
e951386e
KH
3909 if (M < 128)
3910 goto invalid_code;
103e0180 3911 ONE_MORE_BYTE (L);
e951386e
KH
3912 if (L < 128)
3913 goto invalid_code;
103e0180 3914 size = ((M - 128) * 128) + (L - 128);
e951386e 3915 if (charbuf + 6 > charbuf_end)
4776e638
KH
3916 goto break_loop;
3917 *charbuf++ = ISO_CODE_ESC;
3918 *charbuf++ = '%';
3919 *charbuf++ = '/';
3920 *charbuf++ = dim;
3921 *charbuf++ = BYTE8_TO_CHAR (M);
3922 *charbuf++ = BYTE8_TO_CHAR (L);
e951386e 3923 CODING_ISO_EXTSEGMENT_LEN (coding) = size;
103e0180
KH
3924 }
3925 else if (c1 == 'G')
3926 {
103e0180
KH
3927 /* XFree86 extension for embedding UTF-8 in CTEXT:
3928 ESC % G --UTF-8-BYTES-- ESC % @
3929 We keep these bytes as is for the moment.
3930 They may be decoded by post-read-conversion. */
e951386e 3931 if (charbuf + 3 > charbuf_end)
4776e638 3932 goto break_loop;
e951386e
KH
3933 *charbuf++ = ISO_CODE_ESC;
3934 *charbuf++ = '%';
3935 *charbuf++ = 'G';
3936 CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
103e0180
KH
3937 }
3938 else
4776e638 3939 goto invalid_code;
103e0180 3940 continue;
4776e638 3941 break;
103e0180 3942
4ed46869 3943 default:
df7492f9
KH
3944 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3945 goto invalid_code;
134b9549
KH
3946 {
3947 int reg, chars96;
3948
3949 if (c1 >= 0x28 && c1 <= 0x2B)
3950 { /* designation of DIMENSION1_CHARS94 character set */
3951 reg = c1 - 0x28, chars96 = 0;
3952 ONE_MORE_BYTE (c1);
3953 }
3954 else if (c1 >= 0x2C && c1 <= 0x2F)
3955 { /* designation of DIMENSION1_CHARS96 character set */
3956 reg = c1 - 0x2C, chars96 = 1;
3957 ONE_MORE_BYTE (c1);
3958 }
3959 else
3960 goto invalid_code;
3961 DECODE_DESIGNATION (reg, 1, chars96, c1);
3962 /* We must update these variables now. */
3963 if (reg == 0)
3964 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3965 else if (reg == 1)
3966 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3967 if (chars96 < 0)
3968 goto invalid_code;
3969 }
b73bfc1c 3970 continue;
4ed46869 3971 }
b73bfc1c 3972 }
4ed46869 3973
e951386e
KH
3974 if (cmp_status->state == COMPOSING_NO
3975 && charset->id != charset_ascii
ff0dacd7
KH
3976 && last_id != charset->id)
3977 {
3978 if (last_id != charset_ascii)
69a80ea3 3979 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
ff0dacd7
KH
3980 last_id = charset->id;
3981 last_offset = char_offset;
3982 }
3983
b73bfc1c 3984 /* Now we know CHARSET and 1st position code C1 of a character.
df7492f9
KH
3985 Produce a decoded character while getting 2nd position code
3986 C2 if necessary. */
3987 c1 &= 0x7F;
3988 if (CHARSET_DIMENSION (charset) > 1)
b73bfc1c
KH
3989 {
3990 ONE_MORE_BYTE (c2);
df7492f9 3991 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
b73bfc1c 3992 /* C2 is not in a valid range. */
df7492f9
KH
3993 goto invalid_code;
3994 c1 = (c1 << 8) | (c2 & 0x7F);
3995 if (CHARSET_DIMENSION (charset) > 2)
3996 {
3997 ONE_MORE_BYTE (c2);
3998 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3999 /* C2 is not in a valid range. */
4000 goto invalid_code;
4001 c1 = (c1 << 8) | (c2 & 0x7F);
4002 }
4003 }
4004
4005 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
4006 if (c < 0)
4007 {
4008 MAYBE_FINISH_COMPOSITION ();
4009 for (; src_base < src; src_base++, char_offset++)
4010 {
4011 if (ASCII_BYTE_P (*src_base))
4012 *charbuf++ = *src_base;
4013 else
4014 *charbuf++ = BYTE8_TO_CHAR (*src_base);
4015 }
4016 }
e951386e 4017 else if (cmp_status->state == COMPOSING_NO)
df7492f9
KH
4018 {
4019 *charbuf++ = c;
4020 char_offset++;
4ed46869 4021 }
e951386e
KH
4022 else if ((cmp_status->state == COMPOSING_CHAR
4023 ? cmp_status->nchars
4024 : cmp_status->ncomps)
4025 >= MAX_COMPOSITION_COMPONENTS)
781d7a48 4026 {
e951386e
KH
4027 /* Too long composition. */
4028 MAYBE_FINISH_COMPOSITION ();
4029 *charbuf++ = c;
4030 char_offset++;
4ed46869 4031 }
e951386e
KH
4032 else
4033 STORE_COMPOSITION_CHAR (c);
4ed46869
KH
4034 continue;
4035
df7492f9
KH
4036 invalid_code:
4037 MAYBE_FINISH_COMPOSITION ();
4ed46869 4038 src = src_base;
df7492f9
KH
4039 consumed_chars = consumed_chars_base;
4040 ONE_MORE_BYTE (c);
065e3595 4041 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 4042 char_offset++;
df7492f9 4043 coding->errors++;
4776e638
KH
4044 continue;
4045
4046 break_loop:
4047 break;
4ed46869 4048 }
fb88bf2d 4049
df7492f9 4050 no_more_source:
e951386e
KH
4051 if (cmp_status->state != COMPOSING_NO)
4052 {
4053 if (coding->mode & CODING_MODE_LAST_BLOCK)
4054 MAYBE_FINISH_COMPOSITION ();
4055 else
4056 {
4057 charbuf -= cmp_status->length;
4058 for (i = 0; i < cmp_status->length; i++)
4059 cmp_status->carryover[i] = charbuf[i];
4060 }
4061 }
4062 else if (last_id != charset_ascii)
69a80ea3 4063 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
4064 coding->consumed_char += consumed_chars_base;
4065 coding->consumed = src_base - coding->source;
4066 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
4067}
4068
b73bfc1c 4069
f4dee582 4070/* ISO2022 encoding stuff. */
4ed46869
KH
4071
/*
   Saying just "ISO2022" is not enough to describe an encoding; more
   details have to be specified.  In Emacs, each coding system of an
   ISO2022 variant has the following specifications:
	1. Initial designation to G0 thru G3.
	2. Allows short-form designation?
	3. ASCII should be designated to G0 before control characters?
	4. ASCII should be designated to G0 at end of line?
	5. 7-bit environment or 8-bit environment?
	6. Use locking-shift?
	7. Use single-shift?
   And the following two are only for Japanese:
	8. Use ASCII in place of JIS0201-1976-Roman?
	9. Use JISX0208-1983 in place of JISX0208-1978?
   These specifications are encoded in CODING_ISO_FLAGS (coding) as flag
   bits defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
   details.
*/
4090
/* Emit the escape sequence that designates CHARSET to graphic register
   REG, and record that designation in CODING.  The bytes produced are

     [ESC '&' ('@' + revision)]  ESC  [intermediates]  <final-char>

   The revision prefix appears only when CODING has
   CODING_ISO_FLAG_REVISION set and CHARSET has a non-negative ISO
   revision.  A one-dimensional charset gets a single intermediate
   taken from "()*+" (94-charset) or ",-./" (96-charset), indexed by
   REG.  A multi-dimensional charset gets '$' followed by the same kind
   of intermediate, except that for a 94-charset designated to register
   0 whose <final-char> is '@', 'A', or 'B' the second intermediate is
   dropped (short form) unless CODING has CODING_ISO_FLAG_LONG_FORM.

   Output goes through the EMIT_* macros, which are defined elsewhere
   in this file.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    const char *lead_94 = "()*+";                                       \
    const char *lead_96 = ",-./";                                       \
    unsigned char iso_final = CHARSET_ISO_FINAL (charset);              \
    int iso_revision                                                    \
      = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)         \
         ? (int) CHARSET_ISO_REVISION (charset)                         \
         : -1);                                                         \
    int lead_byte;                                                      \
                                                                        \
    /* Optional revision announcement.  */                              \
    if (iso_revision >= 0)                                              \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + iso_revision);                             \
      }                                                                 \
                                                                        \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        /* Multi-byte charset: '$' plus, usually, an intermediate.  */  \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        if (CHARSET_ISO_CHARS_96 (charset))                             \
          {                                                             \
            EMIT_ONE_ASCII_BYTE (lead_96[reg]);                         \
          }                                                             \
        else if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM) \
                 || reg != 0                                            \
                 || ! (iso_final >= '@' && iso_final <= 'B'))           \
          {                                                             \
            EMIT_ONE_ASCII_BYTE (lead_94[reg]);                         \
          }                                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* Single-byte charset: exactly one intermediate.  */           \
        lead_byte = (CHARSET_ISO_CHARS_96 (charset)                     \
                     ? lead_96[reg]                                     \
                     : lead_94[reg]);                                   \
        EMIT_ONE_ASCII_BYTE (lead_byte);                                \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (iso_final);                                    \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4138
df7492f9 4139
4ed46869
KH
4140/* The following two macros produce codes (control character or escape
4141 sequence) for ISO2022 single-shift functions (single-shift-2 and
4142 single-shift-3). */
4143
df7492f9
KH
/* Emit single-shift-2: the ISO_CODE_SS2 byte, or ESC 'N' when the
   caller's `coding' has CODING_ISO_FLAG_SEVEN_BITS set.  Afterwards
   mark `coding' as single-shifting; the ENCODE_ISO_CHARACTER_DIMENSION*
   macros test that mark and clear it once the shifted character has
   been produced.  */
#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      {                                                                 \
        EMIT_ONE_BYTE (ISO_CODE_SS2);                                   \
      }                                                                 \
    else                                                                \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                       \
      }                                                                 \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4152
df7492f9
KH
4153
/* Emit single-shift-3: the ISO_CODE_SS3 byte, or ESC 'O' when the
   caller's `coding' has CODING_ISO_FLAG_SEVEN_BITS set.  Afterwards
   mark `coding' as single-shifting; the ENCODE_ISO_CHARACTER_DIMENSION*
   macros test that mark and clear it once the shifted character has
   been produced.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      {                                                                 \
        EMIT_ONE_BYTE (ISO_CODE_SS3);                                   \
      }                                                                 \
    else                                                                \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                       \
      }                                                                 \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4162
df7492f9 4163
4ed46869
KH
4164/* The following four macros produce codes (control character or
4165 escape sequence) for ISO2022 locking-shift functions (shift-in,
4166 shift-out, locking-shift-2, and locking-shift-3). */
4167
df7492f9
KH
/* Shift-in: emit ISO_CODE_SI and record that graphic register 0 is
   now invoked to graphic plane 0 of the caller's `coding'.  */
#define ENCODE_SHIFT_IN                                                 \
  do                                                                    \
    {                                                                   \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                                \
      CODING_ISO_INVOCATION (coding, 0) = 0;                            \
    }                                                                   \
  while (0)
4173
df7492f9
KH
4174
/* Shift-out: emit ISO_CODE_SO and record that graphic register 1 is
   now invoked to graphic plane 0 of the caller's `coding'.  */
#define ENCODE_SHIFT_OUT                                                \
  do                                                                    \
    {                                                                   \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                                \
      CODING_ISO_INVOCATION (coding, 0) = 1;                            \
    }                                                                   \
  while (0)
4180
df7492f9
KH
4181
/* Locking-shift-2: emit ESC 'n' and record that graphic register 2 is
   now invoked to graphic plane 0 of the caller's `coding'.  */
#define ENCODE_LOCKING_SHIFT_2                                          \
  do                                                                    \
    {                                                                   \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');                         \
      CODING_ISO_INVOCATION (coding, 0) = 2;                            \
    }                                                                   \
  while (0)
4187
df7492f9
KH
4188
/* Locking-shift-3: emit ESC 'o' and record that graphic register 3 is
   now invoked to graphic plane 0 of the caller's `coding'.

   The final byte must be 'o'.  ESC 'n' is locking-shift-2, and the
   decoder in this file treats ESC 'o' as the invocation of
   locking-shift-3, so emitting 'n' here would make the decoder invoke
   register 2 while the encoder state says register 3.  */
#define ENCODE_LOCKING_SHIFT_3                                          \
  do {                                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');                           \
    CODING_ISO_INVOCATION (coding, 0) = 3;                              \
  } while (0)
4194
df7492f9 4195
f4dee582
RS
/* Produce the code for a one-dimensional character whose charset is
   CHARSET and whose position code is C1, first emitting whatever
   designation and invocation sequences are still needed.

   CHARSET must be a modifiable lvalue: when the caller's `coding' has
   CODING_ISO_FLAG_USE_ROMAN set and CHARSET is ASCII, CHARSET is
   replaced by the charset of `charset_jisx0201_roman'.

   C1 may be any integer expression.  It is parenthesized at every use
   so that an argument such as `x | y' is masked as a whole, matching
   ENCODE_ISO_CHARACTER_DIMENSION2.

   The body is a loop.  Each emitting branch leaves it with `break'.
   When CHARSET is invoked to neither graphic plane 0 nor 1,
   encode_invocation_designation is called to emit the missing
   sequences and the tests are repeated.  Uses the caller's `coding',
   `dst' and `produced_chars'.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single shift is pending; it covers this character.  */     \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        /* CHARSET is on graphic plane 0: 7-bit form.  */               \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        /* CHARSET is on graphic plane 1: high bit set.  */             \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4238
df7492f9 4239
f4dee582
RS
/* Produce the two bytes for a two-dimensional character of CHARSET
   whose position codes are C1 and C2, first emitting any designation
   and invocation sequences that are still needed.

   CHARSET must be a modifiable lvalue: when the caller's `coding' has
   CODING_ISO_FLAG_USE_OLDJIS set and CHARSET is `charset_jisx0208',
   CHARSET is replaced by the charset of `charset_jisx0208_1978'.

   The body loops until one of the emitting cases applies; see the
   comments on the individual cases.  Uses the caller's `coding', `dst'
   and `produced_chars'.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if (id == charset_jisx0208                                          \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        id = charset_jisx0208_1978;                                     \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    /* Case 1: a single shift is pending for this character.  */        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)) \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        else                                                            \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* Case 2: CHARSET is invoked to graphic plane 0.  */               \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))                   \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* Case 3: CHARSET is invoked to graphic plane 1.  */               \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))                   \
      {                                                                 \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* Otherwise CHARSET is on no graphic plane yet: designate and/or   \
       invoke it, then go round again to emit the character.  */        \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
4282
05e6f5dc 4283
df7492f9
KH
/* Encode character C of CHARSET.  The code point is obtained with
   ENCODE_CHAR; a one-dimensional CHARSET hands it unchanged to
   ENCODE_ISO_CHARACTER_DIMENSION1, any other CHARSET splits it into
   `code >> 8' and `code & 0xFF' for ENCODE_ISO_CHARACTER_DIMENSION2.
   CHARSET must be a modifiable lvalue because those macros may assign
   to it.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                \
  do {                                                                  \
    int position_code = ENCODE_CHAR ((charset),(c));                    \
                                                                        \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        ENCODE_ISO_CHARACTER_DIMENSION2 ((charset),                     \
                                         position_code >> 8,            \
                                         position_code & 0xFF);         \
      }                                                                 \
    else                                                                \
      {                                                                 \
        ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), position_code);     \
      }                                                                 \
  } while (0)
bdd9fb48 4293
05e6f5dc 4294
4ed46869 4295/* Produce designation and invocation codes at a place pointed by DST
df7492f9 4296 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
4ed46869
KH
4297 Return new DST. */
4298
4299unsigned char *
df7492f9
KH
4300encode_invocation_designation (charset, coding, dst, p_nchars)
4301 struct charset *charset;
4ed46869
KH
4302 struct coding_system *coding;
4303 unsigned char *dst;
df7492f9 4304 int *p_nchars;
4ed46869 4305{
df7492f9
KH
4306 int multibytep = coding->dst_multibyte;
4307 int produced_chars = *p_nchars;
4ed46869 4308 int reg; /* graphic register number */
df7492f9 4309 int id = CHARSET_ID (charset);
4ed46869
KH
4310
4311 /* At first, check designations. */
4312 for (reg = 0; reg < 4; reg++)
df7492f9 4313 if (id == CODING_ISO_DESIGNATION (coding, reg))
4ed46869
KH
4314 break;
4315
4316 if (reg >= 4)
4317 {
4318 /* CHARSET is not yet designated to any graphic registers. */
4319 /* At first check the requested designation. */
df7492f9
KH
4320 reg = CODING_ISO_REQUEST (coding, id);
4321 if (reg < 0)
1ba9e4ab
KH
4322 /* Since CHARSET requests no special designation, designate it
4323 to graphic register 0. */
4ed46869
KH
4324 reg = 0;
4325
4326 ENCODE_DESIGNATION (charset, reg, coding);
4327 }
4328
df7492f9
KH
4329 if (CODING_ISO_INVOCATION (coding, 0) != reg
4330 && CODING_ISO_INVOCATION (coding, 1) != reg)
4ed46869
KH
4331 {
4332 /* Since the graphic register REG is not invoked to any graphic
4333 planes, invoke it to graphic plane 0. */
4334 switch (reg)
4335 {
4336 case 0: /* graphic register 0 */
4337 ENCODE_SHIFT_IN;
4338 break;
4339
4340 case 1: /* graphic register 1 */
4341 ENCODE_SHIFT_OUT;
4342 break;
4343
4344 case 2: /* graphic register 2 */
df7492f9 4345 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4ed46869
KH
4346 ENCODE_SINGLE_SHIFT_2;
4347 else
4348 ENCODE_LOCKING_SHIFT_2;
4349 break;
4350
4351 case 3: /* graphic register 3 */
df7492f9 4352 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4ed46869
KH
4353 ENCODE_SINGLE_SHIFT_3;
4354 else
4355 ENCODE_LOCKING_SHIFT_3;
4356 break;
4357 }
4358 }
b73bfc1c 4359
df7492f9 4360 *p_nchars = produced_chars;
4ed46869
KH
4361 return dst;
4362}
4363
df7492f9
KH
4364/* The following three macros produce codes for indicating direction
4365 of text. */
/* Emit a control sequence introducer: ESC '[' when the caller's
   `coding' has CODING_ISO_FLAG_SEVEN_BITS set, the single byte
   ISO_CODE_CSI otherwise.

   CODING_ISO_FLAGS is a set of flag bits, so the 7-bit case has to be
   detected with `&'.  Comparing the whole flag word with `==' would
   select the 7-bit form only when no other flag at all is set.

   This is an object-like macro; use it without an argument list.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
  do {                                                                  \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
    else                                                                \
      EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
  } while (0)
4373
ec6d2bb8 4374
df7492f9
KH
/* Emit the sequence that starts right-to-left text: a control
   sequence introducer followed by '2' ']'.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it is
   used without an argument list.  Writing `(dst)' after it would leave
   a stray parenthesized expression behind the expansion's `while (0)'
   and fail to compile at the first use of this macro.  */
#define ENCODE_DIRECTION_R2L()                                          \
  do {                                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                                 \
    EMIT_TWO_ASCII_BYTES ('2', ']');                                    \
  } while (0)
4380
ec6d2bb8 4381
/* Emit the sequence that ends the current direction and returns to
   left-to-right text: a control sequence introducer followed by
   '0' ']'.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it is
   used without an argument list.  Writing `(dst)' after it would leave
   a stray parenthesized expression behind the expansion's `while (0)'
   and fail to compile at the first use of this macro.  */
#define ENCODE_DIRECTION_L2R()                                          \
  do {                                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                                 \
    EMIT_TWO_ASCII_BYTES ('0', ']');                                    \
  } while (0)
4ed46869 4387
4ed46869
KH
4388
/* Bring the graphic planes and registers of the caller's `coding' back
   to their initial state, emitting the codes that requires:
   a shift-in when graphic plane 0 does not currently hold register 0,
   then, for each register that has an initial charset different from
   its current designation, a designation of that initial charset.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int gr;                                                             \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
                                                                        \
    for (gr = 0; gr < 4; gr++)                                          \
      {                                                                 \
        struct charset *initial_charset;                                \
        int initial_id = CODING_ISO_INITIAL (coding, gr);               \
                                                                        \
        /* No initial charset for this register.  */                    \
        if (initial_id < 0)                                             \
          continue;                                                     \
        /* Already designated as initially.  */                         \
        if (CODING_ISO_DESIGNATION (coding, gr) == initial_id)          \
          continue;                                                     \
                                                                        \
        initial_charset                                                 \
          = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, gr));          \
        ENCODE_DESIGNATION (initial_charset, gr, coding);               \
      }                                                                 \
  } while (0)
4407
df7492f9 4408
bdd9fb48 4409/* Produce designation sequences of charsets in the line started from
b73bfc1c 4410 SRC to a place pointed by DST, and return updated DST.
bdd9fb48
KH
4411
4412 If the current block ends before any end-of-line, we may fail to
d46c5b12
KH
4413 find all the necessary designations. */
4414
b73bfc1c 4415static unsigned char *
df7492f9 4416encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
e0e989f6 4417 struct coding_system *coding;
df7492f9
KH
4418 int *charbuf, *charbuf_end;
4419 unsigned char *dst;
e0e989f6 4420{
df7492f9 4421 struct charset *charset;
bdd9fb48
KH
4422 /* Table of charsets to be designated to each graphic register. */
4423 int r[4];
df7492f9
KH
4424 int c, found = 0, reg;
4425 int produced_chars = 0;
4426 int multibytep = coding->dst_multibyte;
4427 Lisp_Object attrs;
4428 Lisp_Object charset_list;
4429
4430 attrs = CODING_ID_ATTRS (coding->id);
4431 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4432 if (EQ (charset_list, Qiso_2022))
4433 charset_list = Viso_2022_charset_list;
bdd9fb48
KH
4434
4435 for (reg = 0; reg < 4; reg++)
4436 r[reg] = -1;
4437
b73bfc1c 4438 while (found < 4)
e0e989f6 4439 {
df7492f9
KH
4440 int id;
4441
4442 c = *charbuf++;
b73bfc1c
KH
4443 if (c == '\n')
4444 break;
df7492f9
KH
4445 charset = char_charset (c, charset_list, NULL);
4446 id = CHARSET_ID (charset);
4447 reg = CODING_ISO_REQUEST (coding, id);
4448 if (reg >= 0 && r[reg] < 0)
bdd9fb48
KH
4449 {
4450 found++;
df7492f9 4451 r[reg] = id;
bdd9fb48 4452 }
bdd9fb48
KH
4453 }
4454
4455 if (found)
4456 {
4457 for (reg = 0; reg < 4; reg++)
4458 if (r[reg] >= 0
df7492f9
KH
4459 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4460 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
e0e989f6 4461 }
b73bfc1c
KH
4462
4463 return dst;
e0e989f6
KH
4464}
4465
4ed46869
KH
4466/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
4467
df7492f9
KH
4468static int
4469encode_coding_iso_2022 (coding)
4ed46869 4470 struct coding_system *coding;
4ed46869 4471{
df7492f9
KH
4472 int multibytep = coding->dst_multibyte;
4473 int *charbuf = coding->charbuf;
4474 int *charbuf_end = charbuf + coding->charbuf_used;
4475 unsigned char *dst = coding->destination + coding->produced;
4476 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4477 int safe_room = 16;
4478 int bol_designation
4479 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4480 && CODING_ISO_BOL (coding));
4481 int produced_chars = 0;
4482 Lisp_Object attrs, eol_type, charset_list;
4483 int ascii_compatible;
b73bfc1c 4484 int c;
ff0dacd7 4485 int preferred_charset_id = -1;
05e6f5dc 4486
24a73b0a 4487 CODING_GET_INFO (coding, attrs, charset_list);
0a9564cb 4488 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
24a73b0a
KH
4489 if (VECTORP (eol_type))
4490 eol_type = Qunix;
4491
004068e4 4492 setup_iso_safe_charsets (attrs);
ff0dacd7 4493 /* Charset list may have been changed. */
287c57d7 4494 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
1b3b981b 4495 coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
0eecad43 4496
df7492f9 4497 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
bdd9fb48 4498
df7492f9 4499 while (charbuf < charbuf_end)
4ed46869 4500 {
df7492f9 4501 ASSURE_DESTINATION (safe_room);
b73bfc1c 4502
df7492f9 4503 if (bol_designation)
b73bfc1c 4504 {
df7492f9 4505 unsigned char *dst_prev = dst;
4ed46869 4506
bdd9fb48 4507 /* We have to produce designation sequences if any now. */
df7492f9
KH
4508 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
4509 bol_designation = 0;
4510 /* We are sure that designation sequences are all ASCII bytes. */
4511 produced_chars += dst - dst_prev;
e0e989f6
KH
4512 }
4513
df7492f9 4514 c = *charbuf++;
ec6d2bb8 4515
ff0dacd7
KH
4516 if (c < 0)
4517 {
4518 /* Handle an annotation. */
4519 switch (*charbuf)
ec6d2bb8 4520 {
ff0dacd7
KH
4521 case CODING_ANNOTATE_COMPOSITION_MASK:
4522 /* Not yet implemented. */
4523 break;
4524 case CODING_ANNOTATE_CHARSET_MASK:
cf7dfdf5 4525 preferred_charset_id = charbuf[2];
ff0dacd7
KH
4526 if (preferred_charset_id >= 0
4527 && NILP (Fmemq (make_number (preferred_charset_id),
4528 charset_list)))
4529 preferred_charset_id = -1;
4530 break;
4531 default:
4532 abort ();
4ed46869 4533 }
ff0dacd7
KH
4534 charbuf += -c - 1;
4535 continue;
4ed46869 4536 }
ec6d2bb8 4537
b73bfc1c
KH
4538 /* Now encode the character C. */
4539 if (c < 0x20 || c == 0x7F)
4540 {
df7492f9
KH
4541 if (c == '\n'
4542 || (c == '\r' && EQ (eol_type, Qmac)))
19a8d9e0 4543 {
df7492f9
KH
4544 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4545 ENCODE_RESET_PLANE_AND_REGISTER ();
4546 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
b73bfc1c 4547 {
df7492f9
KH
4548 int i;
4549
4550 for (i = 0; i < 4; i++)
4551 CODING_ISO_DESIGNATION (coding, i)
4552 = CODING_ISO_INITIAL (coding, i);
b73bfc1c 4553 }
df7492f9
KH
4554 bol_designation
4555 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
19a8d9e0 4556 }
df7492f9
KH
4557 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4558 ENCODE_RESET_PLANE_AND_REGISTER ();
4559 EMIT_ONE_ASCII_BYTE (c);
4ed46869 4560 }
df7492f9 4561 else if (ASCII_CHAR_P (c))
88993dfd 4562 {
df7492f9
KH
4563 if (ascii_compatible)
4564 EMIT_ONE_ASCII_BYTE (c);
93dec019 4565 else
19a8d9e0 4566 {
bf16eb23
KH
4567 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4568 ENCODE_ISO_CHARACTER (charset, c);
19a8d9e0 4569 }
4ed46869 4570 }
16eafb5d 4571 else if (CHAR_BYTE8_P (c))
88993dfd 4572 {
16eafb5d
KH
4573 c = CHAR_TO_BYTE8 (c);
4574 EMIT_ONE_BYTE (c);
88993dfd 4575 }
b73bfc1c 4576 else
df7492f9 4577 {
ff0dacd7 4578 struct charset *charset;
b73bfc1c 4579
ff0dacd7
KH
4580 if (preferred_charset_id >= 0)
4581 {
4582 charset = CHARSET_FROM_ID (preferred_charset_id);
4583 if (! CHAR_CHARSET_P (c, charset))
4584 charset = char_charset (c, charset_list, NULL);
4585 }
4586 else
4587 charset = char_charset (c, charset_list, NULL);
df7492f9
KH
4588 if (!charset)
4589 {
41cbe562
KH
4590 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4591 {
4592 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4593 charset = CHARSET_FROM_ID (charset_ascii);
4594 }
4595 else
4596 {
4597 c = coding->default_char;
4598 charset = char_charset (c, charset_list, NULL);
4599 }
df7492f9
KH
4600 }
4601 ENCODE_ISO_CHARACTER (charset, c);
4602 }
84fbb8a0 4603 }
b73bfc1c 4604
df7492f9
KH
4605 if (coding->mode & CODING_MODE_LAST_BLOCK
4606 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4607 {
4608 ASSURE_DESTINATION (safe_room);
4609 ENCODE_RESET_PLANE_AND_REGISTER ();
4610 }
065e3595 4611 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4612 CODING_ISO_BOL (coding) = bol_designation;
4613 coding->produced_char += produced_chars;
4614 coding->produced = dst - coding->destination;
4615 return 0;
4ed46869
KH
4616}
4617
4618\f
/*** 8. Shift-JIS and BIG5 handlers ***/
4ed46869 4620
df7492f9 4621/* Although SJIS and BIG5 are not ISO's coding system, they are used
4ed46869
KH
4622 quite widely. So, for the moment, Emacs supports them in the bare
4623 C code. But, in the future, they may be supported only by CCL. */
4624
4625/* SJIS is a coding system encoding three character sets: ASCII, right
4626 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
4627 as is. A character of charset katakana-jisx0201 is encoded by
4628 "position-code + 0x80". A character of charset japanese-jisx0208
4629 is encoded in 2-byte but two position-codes are divided and shifted
df7492f9 4630 so that it fit in the range below.
4ed46869
KH
4631
4632 --- CODE RANGE of SJIS ---
4633 (character set) (range)
4634 ASCII 0x00 .. 0x7F
df7492f9 4635 KATAKANA-JISX0201 0xA0 .. 0xDF
c28a9453 4636 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
d14d03ac 4637 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4ed46869
KH
4638 -------------------------------
4639
4640*/
4641
4642/* BIG5 is a coding system encoding two character sets: ASCII and
4643 Big5. An ASCII character is encoded as is. Big5 is a two-byte
df7492f9 4644 character set and is encoded in two-byte.
4ed46869
KH
4645
4646 --- CODE RANGE of BIG5 ---
4647 (character set) (range)
4648 ASCII 0x00 .. 0x7F
4649 Big5 (1st byte) 0xA1 .. 0xFE
4650 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
4651 --------------------------
4652
df7492f9 4653 */
4ed46869
KH
4654
4655/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4656 Check if a text is encoded in SJIS. If it is, return
df7492f9 4657 CATEGORY_MASK_SJIS, else return 0. */
4ed46869 4658
0a28aafb 4659static int
ff0dacd7 4660detect_coding_sjis (coding, detect_info)
df7492f9 4661 struct coding_system *coding;
ff0dacd7 4662 struct coding_detection_info *detect_info;
4ed46869 4663{
065e3595 4664 const unsigned char *src = coding->source, *src_base;
8f924df7 4665 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
4666 int multibytep = coding->src_multibyte;
4667 int consumed_chars = 0;
4668 int found = 0;
b73bfc1c 4669 int c;
df7492f9 4670
ff0dacd7 4671 detect_info->checked |= CATEGORY_MASK_SJIS;
df7492f9
KH
4672 /* A coding system of this category is always ASCII compatible. */
4673 src += coding->head_ascii;
4ed46869 4674
b73bfc1c 4675 while (1)
4ed46869 4676 {
065e3595 4677 src_base = src;
df7492f9 4678 ONE_MORE_BYTE (c);
682169fe
KH
4679 if (c < 0x80)
4680 continue;
df7492f9 4681 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
4ed46869 4682 {
df7492f9 4683 ONE_MORE_BYTE (c);
682169fe 4684 if (c < 0x40 || c == 0x7F || c > 0xFC)
df7492f9 4685 break;
ff0dacd7 4686 found = CATEGORY_MASK_SJIS;
4ed46869 4687 }
df7492f9 4688 else if (c >= 0xA0 && c < 0xE0)
ff0dacd7 4689 found = CATEGORY_MASK_SJIS;
df7492f9
KH
4690 else
4691 break;
4ed46869 4692 }
ff0dacd7 4693 detect_info->rejected |= CATEGORY_MASK_SJIS;
df7492f9
KH
4694 return 0;
4695
4696 no_more_source:
065e3595 4697 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
89528eb3 4698 {
ff0dacd7 4699 detect_info->rejected |= CATEGORY_MASK_SJIS;
89528eb3 4700 return 0;
4ed46869 4701 }
ff0dacd7
KH
4702 detect_info->found |= found;
4703 return 1;
4ed46869
KH
4704}
4705
4706/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4707 Check if a text is encoded in BIG5. If it is, return
df7492f9 4708 CATEGORY_MASK_BIG5, else return 0. */
4ed46869 4709
0a28aafb 4710static int
ff0dacd7 4711detect_coding_big5 (coding, detect_info)
df7492f9 4712 struct coding_system *coding;
ff0dacd7 4713 struct coding_detection_info *detect_info;
4ed46869 4714{
065e3595 4715 const unsigned char *src = coding->source, *src_base;
8f924df7 4716 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
4717 int multibytep = coding->src_multibyte;
4718 int consumed_chars = 0;
4719 int found = 0;
b73bfc1c 4720 int c;
fa42c37f 4721
ff0dacd7 4722 detect_info->checked |= CATEGORY_MASK_BIG5;
df7492f9
KH
4723 /* A coding system of this category is always ASCII compatible. */
4724 src += coding->head_ascii;
fa42c37f 4725
b73bfc1c 4726 while (1)
fa42c37f 4727 {
065e3595 4728 src_base = src;
df7492f9
KH
4729 ONE_MORE_BYTE (c);
4730 if (c < 0x80)
fa42c37f 4731 continue;
df7492f9 4732 if (c >= 0xA1)
fa42c37f 4733 {
df7492f9
KH
4734 ONE_MORE_BYTE (c);
4735 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
fa42c37f 4736 return 0;
ff0dacd7 4737 found = CATEGORY_MASK_BIG5;
fa42c37f 4738 }
df7492f9
KH
4739 else
4740 break;
fa42c37f 4741 }
ff0dacd7 4742 detect_info->rejected |= CATEGORY_MASK_BIG5;
fa42c37f 4743 return 0;
fa42c37f 4744
df7492f9 4745 no_more_source:
065e3595 4746 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
89528eb3 4747 {
ff0dacd7 4748 detect_info->rejected |= CATEGORY_MASK_BIG5;
89528eb3
KH
4749 return 0;
4750 }
ff0dacd7
KH
4751 detect_info->found |= found;
4752 return 1;
fa42c37f
KH
4753}
4754
4ed46869
KH
4755/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4756 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
fa42c37f 4757
b73bfc1c 4758static void
df7492f9 4759decode_coding_sjis (coding)
4ed46869 4760 struct coding_system *coding;
4ed46869 4761{
8f924df7
KH
4762 const unsigned char *src = coding->source + coding->consumed;
4763 const unsigned char *src_end = coding->source + coding->src_bytes;
4764 const unsigned char *src_base;
69a80ea3 4765 int *charbuf = coding->charbuf + coding->charbuf_used;
df80c7f0
KH
4766 /* We may produce one charset annocation in one loop and one more at
4767 the end. */
69a80ea3 4768 int *charbuf_end
df80c7f0 4769 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
df7492f9
KH
4770 int consumed_chars = 0, consumed_chars_base;
4771 int multibytep = coding->src_multibyte;
4772 struct charset *charset_roman, *charset_kanji, *charset_kana;
57a47f8a 4773 struct charset *charset_kanji2;
24a73b0a 4774 Lisp_Object attrs, charset_list, val;
ff0dacd7
KH
4775 int char_offset = coding->produced_char;
4776 int last_offset = char_offset;
4777 int last_id = charset_ascii;
0a9564cb
EZ
4778 int eol_crlf =
4779 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 4780 int byte_after_cr = -1;
a5d301df 4781
24a73b0a 4782 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4783
4784 val = charset_list;
4785 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
89528eb3 4786 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
57a47f8a
KH
4787 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4788 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
fa42c37f 4789
b73bfc1c 4790 while (1)
4ed46869 4791 {
df7492f9 4792 int c, c1;
24a73b0a 4793 struct charset *charset;
fa42c37f 4794
b73bfc1c 4795 src_base = src;
df7492f9 4796 consumed_chars_base = consumed_chars;
fa42c37f 4797
df7492f9 4798 if (charbuf >= charbuf_end)
b71f6f73
KH
4799 {
4800 if (byte_after_cr >= 0)
4801 src_base--;
4802 break;
4803 }
df7492f9 4804
119852e7
KH
4805 if (byte_after_cr >= 0)
4806 c = byte_after_cr, byte_after_cr = -1;
4807 else
4808 ONE_MORE_BYTE (c);
065e3595
KH
4809 if (c < 0)
4810 goto invalid_code;
24a73b0a 4811 if (c < 0x80)
119852e7
KH
4812 {
4813 if (eol_crlf && c == '\r')
4814 ONE_MORE_BYTE (byte_after_cr);
4815 charset = charset_roman;
4816 }
57a47f8a 4817 else if (c == 0x80 || c == 0xA0)
8e921c4b 4818 goto invalid_code;
57a47f8a
KH
4819 else if (c >= 0xA1 && c <= 0xDF)
4820 {
4821 /* SJIS -> JISX0201-Kana */
4822 c &= 0x7F;
4823 charset = charset_kana;
4824 }
4825 else if (c <= 0xEF)
df7492f9 4826 {
57a47f8a
KH
4827 /* SJIS -> JISX0208 */
4828 ONE_MORE_BYTE (c1);
4829 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
24a73b0a 4830 goto invalid_code;
57a47f8a
KH
4831 c = (c << 8) | c1;
4832 SJIS_TO_JIS (c);
4833 charset = charset_kanji;
4834 }
4835 else if (c <= 0xFC && charset_kanji2)
4836 {
c6876370 4837 /* SJIS -> JISX0213-2 */
57a47f8a
KH
4838 ONE_MORE_BYTE (c1);
4839 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
24a73b0a 4840 goto invalid_code;
57a47f8a
KH
4841 c = (c << 8) | c1;
4842 SJIS_TO_JIS2 (c);
4843 charset = charset_kanji2;
df7492f9 4844 }
57a47f8a
KH
4845 else
4846 goto invalid_code;
24a73b0a
KH
4847 if (charset->id != charset_ascii
4848 && last_id != charset->id)
4849 {
4850 if (last_id != charset_ascii)
69a80ea3 4851 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
24a73b0a
KH
4852 last_id = charset->id;
4853 last_offset = char_offset;
4854 }
4855 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
df7492f9 4856 *charbuf++ = c;
ff0dacd7 4857 char_offset++;
df7492f9 4858 continue;
b73bfc1c 4859
df7492f9
KH
4860 invalid_code:
4861 src = src_base;
4862 consumed_chars = consumed_chars_base;
4863 ONE_MORE_BYTE (c);
065e3595 4864 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
ff0dacd7 4865 char_offset++;
df7492f9
KH
4866 coding->errors++;
4867 }
fa42c37f 4868
df7492f9 4869 no_more_source:
ff0dacd7 4870 if (last_id != charset_ascii)
69a80ea3 4871 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
4872 coding->consumed_char += consumed_chars_base;
4873 coding->consumed = src_base - coding->source;
4874 coding->charbuf_used = charbuf - coding->charbuf;
fa42c37f
KH
4875}
4876
b73bfc1c 4877static void
df7492f9 4878decode_coding_big5 (coding)
4ed46869 4879 struct coding_system *coding;
4ed46869 4880{
8f924df7
KH
4881 const unsigned char *src = coding->source + coding->consumed;
4882 const unsigned char *src_end = coding->source + coding->src_bytes;
4883 const unsigned char *src_base;
69a80ea3 4884 int *charbuf = coding->charbuf + coding->charbuf_used;
df80c7f0
KH
4885 /* We may produce one charset annocation in one loop and one more at
4886 the end. */
69a80ea3 4887 int *charbuf_end
df80c7f0 4888 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
df7492f9
KH
4889 int consumed_chars = 0, consumed_chars_base;
4890 int multibytep = coding->src_multibyte;
4891 struct charset *charset_roman, *charset_big5;
24a73b0a 4892 Lisp_Object attrs, charset_list, val;
ff0dacd7
KH
4893 int char_offset = coding->produced_char;
4894 int last_offset = char_offset;
4895 int last_id = charset_ascii;
0a9564cb
EZ
4896 int eol_crlf =
4897 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 4898 int byte_after_cr = -1;
df7492f9 4899
24a73b0a 4900 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4901 val = charset_list;
4902 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4903 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4ed46869 4904
b73bfc1c 4905 while (1)
4ed46869 4906 {
df7492f9 4907 int c, c1;
24a73b0a 4908 struct charset *charset;
b73bfc1c
KH
4909
4910 src_base = src;
df7492f9
KH
4911 consumed_chars_base = consumed_chars;
4912
4913 if (charbuf >= charbuf_end)
b71f6f73
KH
4914 {
4915 if (byte_after_cr >= 0)
4916 src_base--;
4917 break;
4918 }
df7492f9 4919
119852e7 4920 if (byte_after_cr >= 0)
14daee73 4921 c = byte_after_cr, byte_after_cr = -1;
119852e7
KH
4922 else
4923 ONE_MORE_BYTE (c);
b73bfc1c 4924
065e3595
KH
4925 if (c < 0)
4926 goto invalid_code;
24a73b0a 4927 if (c < 0x80)
119852e7 4928 {
14daee73 4929 if (eol_crlf && c == '\r')
119852e7
KH
4930 ONE_MORE_BYTE (byte_after_cr);
4931 charset = charset_roman;
4932 }
24a73b0a 4933 else
4ed46869 4934 {
24a73b0a
KH
4935 /* BIG5 -> Big5 */
4936 if (c < 0xA1 || c > 0xFE)
4937 goto invalid_code;
4938 ONE_MORE_BYTE (c1);
4939 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4940 goto invalid_code;
4941 c = c << 8 | c1;
4942 charset = charset_big5;
4ed46869 4943 }
24a73b0a
KH
4944 if (charset->id != charset_ascii
4945 && last_id != charset->id)
df7492f9 4946 {
24a73b0a 4947 if (last_id != charset_ascii)
69a80ea3 4948 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
24a73b0a
KH
4949 last_id = charset->id;
4950 last_offset = char_offset;
4ed46869 4951 }
24a73b0a 4952 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
df7492f9 4953 *charbuf++ = c;
ff0dacd7 4954 char_offset++;
fb88bf2d
KH
4955 continue;
4956
df7492f9 4957 invalid_code:
4ed46869 4958 src = src_base;
df7492f9
KH
4959 consumed_chars = consumed_chars_base;
4960 ONE_MORE_BYTE (c);
065e3595 4961 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
ff0dacd7 4962 char_offset++;
df7492f9 4963 coding->errors++;
fb88bf2d 4964 }
d46c5b12 4965
df7492f9 4966 no_more_source:
ff0dacd7 4967 if (last_id != charset_ascii)
69a80ea3 4968 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
4969 coding->consumed_char += consumed_chars_base;
4970 coding->consumed = src_base - coding->source;
4971 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
4972}
4973
4974/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
b73bfc1c
KH
4975 This function can encode charsets `ascii', `katakana-jisx0201',
4976 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4977 are sure that all these charsets are registered as official charset
4ed46869
KH
4978 (i.e. do not have extended leading-codes). Characters of other
4979 charsets are produced without any encoding. If SJIS_P is 1, encode
4980 SJIS text, else encode BIG5 text. */
4981
df7492f9
KH
4982static int
4983encode_coding_sjis (coding)
4ed46869 4984 struct coding_system *coding;
4ed46869 4985{
df7492f9
KH
4986 int multibytep = coding->dst_multibyte;
4987 int *charbuf = coding->charbuf;
4988 int *charbuf_end = charbuf + coding->charbuf_used;
4989 unsigned char *dst = coding->destination + coding->produced;
4990 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4991 int safe_room = 4;
4992 int produced_chars = 0;
24a73b0a 4993 Lisp_Object attrs, charset_list, val;
df7492f9
KH
4994 int ascii_compatible;
4995 struct charset *charset_roman, *charset_kanji, *charset_kana;
57a47f8a 4996 struct charset *charset_kanji2;
df7492f9 4997 int c;
a5d301df 4998
24a73b0a 4999 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
5000 val = charset_list;
5001 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5002 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
57a47f8a
KH
5003 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5004 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4ed46869 5005
df7492f9 5006 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
93dec019 5007
df7492f9
KH
5008 while (charbuf < charbuf_end)
5009 {
5010 ASSURE_DESTINATION (safe_room);
5011 c = *charbuf++;
b73bfc1c 5012 /* Now encode the character C. */
df7492f9
KH
5013 if (ASCII_CHAR_P (c) && ascii_compatible)
5014 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
5015 else if (CHAR_BYTE8_P (c))
5016 {
5017 c = CHAR_TO_BYTE8 (c);
5018 EMIT_ONE_BYTE (c);
5019 }
df7492f9 5020 else
b73bfc1c 5021 {
df7492f9
KH
5022 unsigned code;
5023 struct charset *charset = char_charset (c, charset_list, &code);
5024
5025 if (!charset)
4ed46869 5026 {
41cbe562 5027 if (coding->mode & CODING_MODE_SAFE_ENCODING)
b73bfc1c 5028 {
41cbe562
KH
5029 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5030 charset = CHARSET_FROM_ID (charset_ascii);
b73bfc1c 5031 }
41cbe562 5032 else
b73bfc1c 5033 {
41cbe562
KH
5034 c = coding->default_char;
5035 charset = char_charset (c, charset_list, &code);
b73bfc1c 5036 }
b73bfc1c 5037 }
df7492f9
KH
5038 if (code == CHARSET_INVALID_CODE (charset))
5039 abort ();
5040 if (charset == charset_kanji)
5041 {
5042 int c1, c2;
5043 JIS_TO_SJIS (code);
5044 c1 = code >> 8, c2 = code & 0xFF;
5045 EMIT_TWO_BYTES (c1, c2);
5046 }
5047 else if (charset == charset_kana)
5048 EMIT_ONE_BYTE (code | 0x80);
57a47f8a
KH
5049 else if (charset_kanji2 && charset == charset_kanji2)
5050 {
5051 int c1, c2;
5052
5053 c1 = code >> 8;
5054 if (c1 == 0x21 || (c1 >= 0x23 && c1 < 0x25)
5055 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
5056 {
5057 JIS_TO_SJIS2 (code);
5058 c1 = code >> 8, c2 = code & 0xFF;
5059 EMIT_TWO_BYTES (c1, c2);
5060 }
5061 else
5062 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5063 }
df7492f9
KH
5064 else
5065 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5066 }
5067 }
065e3595 5068 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
5069 coding->produced_char += produced_chars;
5070 coding->produced = dst - coding->destination;
5071 return 0;
5072}
5073
5074static int
5075encode_coding_big5 (coding)
5076 struct coding_system *coding;
5077{
5078 int multibytep = coding->dst_multibyte;
5079 int *charbuf = coding->charbuf;
5080 int *charbuf_end = charbuf + coding->charbuf_used;
5081 unsigned char *dst = coding->destination + coding->produced;
5082 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5083 int safe_room = 4;
5084 int produced_chars = 0;
24a73b0a 5085 Lisp_Object attrs, charset_list, val;
df7492f9
KH
5086 int ascii_compatible;
5087 struct charset *charset_roman, *charset_big5;
5088 int c;
5089
24a73b0a 5090 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
5091 val = charset_list;
5092 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5093 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5094 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5095
5096 while (charbuf < charbuf_end)
5097 {
5098 ASSURE_DESTINATION (safe_room);
5099 c = *charbuf++;
5100 /* Now encode the character C. */
5101 if (ASCII_CHAR_P (c) && ascii_compatible)
5102 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
5103 else if (CHAR_BYTE8_P (c))
5104 {
5105 c = CHAR_TO_BYTE8 (c);
5106 EMIT_ONE_BYTE (c);
b73bfc1c
KH
5107 }
5108 else
5109 {
df7492f9
KH
5110 unsigned code;
5111 struct charset *charset = char_charset (c, charset_list, &code);
5112
5113 if (! charset)
b73bfc1c 5114 {
41cbe562 5115 if (coding->mode & CODING_MODE_SAFE_ENCODING)
b73bfc1c 5116 {
41cbe562
KH
5117 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5118 charset = CHARSET_FROM_ID (charset_ascii);
b73bfc1c 5119 }
41cbe562 5120 else
0eecad43 5121 {
41cbe562
KH
5122 c = coding->default_char;
5123 charset = char_charset (c, charset_list, &code);
0eecad43 5124 }
4ed46869 5125 }
df7492f9
KH
5126 if (code == CHARSET_INVALID_CODE (charset))
5127 abort ();
5128 if (charset == charset_big5)
b73bfc1c 5129 {
df7492f9
KH
5130 int c1, c2;
5131
5132 c1 = code >> 8, c2 = code & 0xFF;
5133 EMIT_TWO_BYTES (c1, c2);
b73bfc1c 5134 }
df7492f9
KH
5135 else
5136 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4ed46869 5137 }
4ed46869 5138 }
065e3595 5139 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
5140 coding->produced_char += produced_chars;
5141 coding->produced = dst - coding->destination;
5142 return 0;
4ed46869
KH
5143}
5144
5145\f
/*** 9. CCL handlers ***/
1397dc18
KH
5147
5148/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5149 Check if a text is encoded in a coding system of which
5150 encoder/decoder are written in CCL program. If it is, return
df7492f9 5151 CATEGORY_MASK_CCL, else return 0. */
1397dc18 5152
0a28aafb 5153static int
ff0dacd7 5154detect_coding_ccl (coding, detect_info)
df7492f9 5155 struct coding_system *coding;
ff0dacd7 5156 struct coding_detection_info *detect_info;
1397dc18 5157{
065e3595 5158 const unsigned char *src = coding->source, *src_base;
8f924df7 5159 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
5160 int multibytep = coding->src_multibyte;
5161 int consumed_chars = 0;
5162 int found = 0;
0e219d54 5163 unsigned char *valids;
df7492f9
KH
5164 int head_ascii = coding->head_ascii;
5165 Lisp_Object attrs;
5166
ff0dacd7
KH
5167 detect_info->checked |= CATEGORY_MASK_CCL;
5168
df7492f9 5169 coding = &coding_categories[coding_category_ccl];
0e219d54 5170 valids = CODING_CCL_VALIDS (coding);
df7492f9
KH
5171 attrs = CODING_ID_ATTRS (coding->id);
5172 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5173 src += head_ascii;
1397dc18 5174
b73bfc1c 5175 while (1)
1397dc18 5176 {
df7492f9 5177 int c;
065e3595
KH
5178
5179 src_base = src;
df7492f9 5180 ONE_MORE_BYTE (c);
065e3595 5181 if (c < 0 || ! valids[c])
df7492f9 5182 break;
ff0dacd7
KH
5183 if ((valids[c] > 1))
5184 found = CATEGORY_MASK_CCL;
df7492f9 5185 }
ff0dacd7 5186 detect_info->rejected |= CATEGORY_MASK_CCL;
df7492f9
KH
5187 return 0;
5188
5189 no_more_source:
ff0dacd7
KH
5190 detect_info->found |= found;
5191 return 1;
df7492f9
KH
5192}
5193
5194static void
5195decode_coding_ccl (coding)
5196 struct coding_system *coding;
5197{
7c78e542 5198 const unsigned char *src = coding->source + coding->consumed;
8f924df7 5199 const unsigned char *src_end = coding->source + coding->src_bytes;
69a80ea3
KH
5200 int *charbuf = coding->charbuf + coding->charbuf_used;
5201 int *charbuf_end = coding->charbuf + coding->charbuf_size;
df7492f9
KH
5202 int consumed_chars = 0;
5203 int multibytep = coding->src_multibyte;
5204 struct ccl_program ccl;
5205 int source_charbuf[1024];
5206 int source_byteidx[1024];
24a73b0a 5207 Lisp_Object attrs, charset_list;
df7492f9 5208
24a73b0a 5209 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
5210 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
5211
5212 while (src < src_end)
5213 {
7c78e542 5214 const unsigned char *p = src;
df7492f9
KH
5215 int *source, *source_end;
5216 int i = 0;
5217
5218 if (multibytep)
5219 while (i < 1024 && p < src_end)
5220 {
5221 source_byteidx[i] = p - src;
5222 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5223 }
5224 else
5225 while (i < 1024 && p < src_end)
5226 source_charbuf[i++] = *p++;
8f924df7 5227
df7492f9
KH
5228 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5229 ccl.last_block = 1;
5230
5231 source = source_charbuf;
5232 source_end = source + i;
5233 while (source < source_end)
5234 {
5235 ccl_driver (&ccl, source, charbuf,
8dcbea82
KH
5236 source_end - source, charbuf_end - charbuf,
5237 charset_list);
df7492f9
KH
5238 source += ccl.consumed;
5239 charbuf += ccl.produced;
5240 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
5241 break;
5242 }
5243 if (source < source_end)
5244 src += source_byteidx[source - source_charbuf];
5245 else
5246 src = p;
5247 consumed_chars += source - source_charbuf;
5248
5249 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
5250 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
5251 break;
5252 }
5253
5254 switch (ccl.status)
5255 {
5256 case CCL_STAT_SUSPEND_BY_SRC:
065e3595 5257 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
df7492f9
KH
5258 break;
5259 case CCL_STAT_SUSPEND_BY_DST:
5260 break;
5261 case CCL_STAT_QUIT:
5262 case CCL_STAT_INVALID_CMD:
065e3595 5263 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
df7492f9
KH
5264 break;
5265 default:
065e3595 5266 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
5267 break;
5268 }
5269 coding->consumed_char += consumed_chars;
5270 coding->consumed = src - coding->source;
5271 coding->charbuf_used = charbuf - coding->charbuf;
5272}
5273
5274static int
5275encode_coding_ccl (coding)
5276 struct coding_system *coding;
5277{
5278 struct ccl_program ccl;
5279 int multibytep = coding->dst_multibyte;
5280 int *charbuf = coding->charbuf;
5281 int *charbuf_end = charbuf + coding->charbuf_used;
5282 unsigned char *dst = coding->destination + coding->produced;
5283 unsigned char *dst_end = coding->destination + coding->dst_bytes;
df7492f9
KH
5284 int destination_charbuf[1024];
5285 int i, produced_chars = 0;
24a73b0a 5286 Lisp_Object attrs, charset_list;
df7492f9 5287
24a73b0a 5288 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
5289 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
5290
5291 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
5292 ccl.dst_multibyte = coding->dst_multibyte;
5293
8cffd3e7 5294 while (charbuf < charbuf_end)
df7492f9 5295 {
df7492f9 5296 ccl_driver (&ccl, charbuf, destination_charbuf,
8cffd3e7 5297 charbuf_end - charbuf, 1024, charset_list);
df7492f9 5298 if (multibytep)
8cffd3e7
KH
5299 {
5300 ASSURE_DESTINATION (ccl.produced * 2);
5301 for (i = 0; i < ccl.produced; i++)
5302 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5303 }
df7492f9
KH
5304 else
5305 {
8cffd3e7 5306 ASSURE_DESTINATION (ccl.produced);
3ed051d4 5307 for (i = 0; i < ccl.produced; i++)
df7492f9
KH
5308 *dst++ = destination_charbuf[i] & 0xFF;
5309 produced_chars += ccl.produced;
5310 }
8cffd3e7
KH
5311 charbuf += ccl.consumed;
5312 if (ccl.status == CCL_STAT_QUIT
5313 || ccl.status == CCL_STAT_INVALID_CMD)
5314 break;
df7492f9
KH
5315 }
5316
5317 switch (ccl.status)
5318 {
5319 case CCL_STAT_SUSPEND_BY_SRC:
065e3595 5320 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
df7492f9
KH
5321 break;
5322 case CCL_STAT_SUSPEND_BY_DST:
065e3595 5323 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
df7492f9
KH
5324 break;
5325 case CCL_STAT_QUIT:
5326 case CCL_STAT_INVALID_CMD:
065e3595 5327 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
df7492f9
KH
5328 break;
5329 default:
065e3595 5330 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 5331 break;
1397dc18 5332 }
df7492f9
KH
5333
5334 coding->produced_char += produced_chars;
5335 coding->produced = dst - coding->destination;
5336 return 0;
1397dc18
KH
5337}
5338
df7492f9 5339
1397dc18 5340\f
df7492f9 5341/*** 10, 11. no-conversion handlers ***/
4ed46869 5342
b73bfc1c 5343/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4ed46869 5344
b73bfc1c 5345static void
df7492f9 5346decode_coding_raw_text (coding)
4ed46869 5347 struct coding_system *coding;
4ed46869 5348{
0a9564cb
EZ
5349 int eol_crlf =
5350 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
119852e7 5351
df7492f9 5352 coding->chars_at_source = 1;
119852e7
KH
5353 coding->consumed_char = coding->src_chars;
5354 coding->consumed = coding->src_bytes;
5355 if (eol_crlf && coding->source[coding->src_bytes - 1] == '\r')
5356 {
5357 coding->consumed_char--;
5358 coding->consumed--;
5359 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5360 }
5361 else
5362 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 5363}
4ed46869 5364
df7492f9
KH
5365static int
5366encode_coding_raw_text (coding)
5367 struct coding_system *coding;
5368{
5369 int multibytep = coding->dst_multibyte;
5370 int *charbuf = coding->charbuf;
5371 int *charbuf_end = coding->charbuf + coding->charbuf_used;
5372 unsigned char *dst = coding->destination + coding->produced;
5373 unsigned char *dst_end = coding->destination + coding->dst_bytes;
a0ed9b27 5374 int produced_chars = 0;
b73bfc1c
KH
5375 int c;
5376
df7492f9 5377 if (multibytep)
b73bfc1c 5378 {
df7492f9 5379 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4ed46869 5380
df7492f9
KH
5381 if (coding->src_multibyte)
5382 while (charbuf < charbuf_end)
5383 {
5384 ASSURE_DESTINATION (safe_room);
5385 c = *charbuf++;
5386 if (ASCII_CHAR_P (c))
5387 EMIT_ONE_ASCII_BYTE (c);
5388 else if (CHAR_BYTE8_P (c))
5389 {
5390 c = CHAR_TO_BYTE8 (c);
5391 EMIT_ONE_BYTE (c);
5392 }
5393 else
5394 {
5395 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
93dec019 5396
df7492f9
KH
5397 CHAR_STRING_ADVANCE (c, p1);
5398 while (p0 < p1)
9d123124
KH
5399 {
5400 EMIT_ONE_BYTE (*p0);
5401 p0++;
5402 }
df7492f9
KH
5403 }
5404 }
b73bfc1c 5405 else
df7492f9
KH
5406 while (charbuf < charbuf_end)
5407 {
5408 ASSURE_DESTINATION (safe_room);
5409 c = *charbuf++;
5410 EMIT_ONE_BYTE (c);
5411 }
5412 }
5413 else
4ed46869 5414 {
df7492f9 5415 if (coding->src_multibyte)
d46c5b12 5416 {
df7492f9
KH
5417 int safe_room = MAX_MULTIBYTE_LENGTH;
5418
5419 while (charbuf < charbuf_end)
d46c5b12 5420 {
df7492f9
KH
5421 ASSURE_DESTINATION (safe_room);
5422 c = *charbuf++;
5423 if (ASCII_CHAR_P (c))
5424 *dst++ = c;
5425 else if (CHAR_BYTE8_P (c))
5426 *dst++ = CHAR_TO_BYTE8 (c);
b73bfc1c 5427 else
df7492f9 5428 CHAR_STRING_ADVANCE (c, dst);
d46c5b12
KH
5429 }
5430 }
df7492f9
KH
5431 else
5432 {
5433 ASSURE_DESTINATION (charbuf_end - charbuf);
5434 while (charbuf < charbuf_end && dst < dst_end)
5435 *dst++ = *charbuf++;
8f924df7 5436 }
319a3947 5437 produced_chars = dst - (coding->destination + coding->produced);
4ed46869 5438 }
065e3595 5439 record_conversion_result (coding, CODING_RESULT_SUCCESS);
a0ed9b27 5440 coding->produced_char += produced_chars;
df7492f9
KH
5441 coding->produced = dst - coding->destination;
5442 return 0;
4ed46869
KH
5443}
5444
ff0dacd7
KH
5445/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5446 Check if a text is encoded in a charset-based coding system. If it
5447 is, return 1, else return 0. */
5448
0a28aafb 5449static int
ff0dacd7 5450detect_coding_charset (coding, detect_info)
df7492f9 5451 struct coding_system *coding;
ff0dacd7 5452 struct coding_detection_info *detect_info;
1397dc18 5453{
065e3595 5454 const unsigned char *src = coding->source, *src_base;
8f924df7 5455 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
5456 int multibytep = coding->src_multibyte;
5457 int consumed_chars = 0;
07295713 5458 Lisp_Object attrs, valids, name;
584948ac 5459 int found = 0;
716b3fa0 5460 int head_ascii = coding->head_ascii;
07295713 5461 int check_latin_extra = 0;
1397dc18 5462
ff0dacd7
KH
5463 detect_info->checked |= CATEGORY_MASK_CHARSET;
5464
df7492f9
KH
5465 coding = &coding_categories[coding_category_charset];
5466 attrs = CODING_ID_ATTRS (coding->id);
5467 valids = AREF (attrs, coding_attr_charset_valids);
07295713 5468 name = CODING_ID_NAME (coding->id);
237aabf4
JR
5469 if (strncmp ((char *) SDATA (SYMBOL_NAME (name)),
5470 "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5471 || strncmp ((char *) SDATA (SYMBOL_NAME (name)),
5472 "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
07295713 5473 check_latin_extra = 1;
237aabf4 5474
df7492f9 5475 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
716b3fa0 5476 src += head_ascii;
1397dc18 5477
b73bfc1c 5478 while (1)
1397dc18 5479 {
df7492f9 5480 int c;
716b3fa0
KH
5481 Lisp_Object val;
5482 struct charset *charset;
5483 int dim, idx;
1397dc18 5484
065e3595 5485 src_base = src;
df7492f9 5486 ONE_MORE_BYTE (c);
065e3595
KH
5487 if (c < 0)
5488 continue;
716b3fa0
KH
5489 val = AREF (valids, c);
5490 if (NILP (val))
df7492f9 5491 break;
584948ac 5492 if (c >= 0x80)
07295713
KH
5493 {
5494 if (c < 0xA0
237aabf4
JR
5495 && check_latin_extra
5496 && (!VECTORP (Vlatin_extra_code_table)
9f0526cb 5497 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])))
07295713
KH
5498 break;
5499 found = CATEGORY_MASK_CHARSET;
5500 }
716b3fa0
KH
5501 if (INTEGERP (val))
5502 {
5503 charset = CHARSET_FROM_ID (XFASTINT (val));
5504 dim = CHARSET_DIMENSION (charset);
5505 for (idx = 1; idx < dim; idx++)
5506 {
5507 if (src == src_end)
5508 goto too_short;
5509 ONE_MORE_BYTE (c);
3ed051d4 5510 if (c < charset->code_space[(dim - 1 - idx) * 2]
716b3fa0
KH
5511 || c > charset->code_space[(dim - 1 - idx) * 2 + 1])
5512 break;
5513 }
5514 if (idx < dim)
5515 break;
5516 }
5517 else
5518 {
5519 idx = 1;
5520 for (; CONSP (val); val = XCDR (val))
5521 {
5522 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5523 dim = CHARSET_DIMENSION (charset);
5524 while (idx < dim)
5525 {
5526 if (src == src_end)
5527 goto too_short;
5528 ONE_MORE_BYTE (c);
5529 if (c < charset->code_space[(dim - 1 - idx) * 4]
5530 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5531 break;
5532 idx++;
5533 }
5534 if (idx == dim)
5535 {
5536 val = Qnil;
5537 break;
5538 }
5539 }
5540 if (CONSP (val))
5541 break;
5542 }
df7492f9 5543 }
716b3fa0 5544 too_short:
ff0dacd7 5545 detect_info->rejected |= CATEGORY_MASK_CHARSET;
df7492f9 5546 return 0;
4ed46869 5547
df7492f9 5548 no_more_source:
ff0dacd7
KH
5549 detect_info->found |= found;
5550 return 1;
df7492f9 5551}
b73bfc1c 5552
/* Decode the source bytes of CODING, from CODING->source +
   CODING->consumed up to CODING->source + CODING->src_bytes, for a
   coding system of type `charset', appending the decoded characters
   to CODING->charbuf.

   The first byte of each sequence indexes the `charset_valids' vector
   of the coding system's attributes.  The entry found there is either
   a charset ID (an integer) or a list of charset IDs that are tried
   in turn; any other entry marks the byte as invalid.  An invalid
   sequence is emitted as a single byte and counted in CODING->errors.

   On exit CODING->consumed_char, CODING->consumed and
   CODING->charbuf_used are updated.

   NOTE(review): ONE_MORE_BYTE, CODING_DECODE_CHAR and ADD_CHARSET_DATA
   are macros defined outside this part of the file.  ONE_MORE_BYTE
   presumably jumps to `no_more_source' when the input is exhausted
   (nothing else here targets that label) -- confirm against the macro
   definition.  */
b73bfc1c 5553static void
df7492f9 5554decode_coding_charset (coding)
4ed46869 5555 struct coding_system *coding;
4ed46869 5556{
8f924df7
KH
5557 const unsigned char *src = coding->source + coding->consumed;
5558 const unsigned char *src_end = coding->source + coding->src_bytes;
5559 const unsigned char *src_base;
69a80ea3 5560 int *charbuf = coding->charbuf + coding->charbuf_used;
df80c7f0
KH
5561 /* We may produce one charset annotation in one loop and one more at
5562 the end. */
69a80ea3 5563 int *charbuf_end
df80c7f0 5564 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
df7492f9
KH
5565 int consumed_chars = 0, consumed_chars_base;
5566 int multibytep = coding->src_multibyte;
24a73b0a 5567 Lisp_Object attrs, charset_list, valids;
ff0dacd7
KH
5568 int char_offset = coding->produced_char;
5569 int last_offset = char_offset;
5570 int last_id = charset_ascii;
0a9564cb
EZ
5571 int eol_crlf =
5572 !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
/* Byte already fetched after a CR, or -1 if there is none pending. */
119852e7 5573 int byte_after_cr = -1;
df7492f9 5574
24a73b0a 5575 CODING_GET_INFO (coding, attrs, charset_list);
4eb6d3f1 5576 valids = AREF (attrs, coding_attr_charset_valids);
b73bfc1c 5577
df7492f9 5578 while (1)
4ed46869 5579 {
4eb6d3f1 5580 int c;
24a73b0a
KH
5581 Lisp_Object val;
5582 struct charset *charset;
5583 int dim;
5584 int len = 1;
5585 unsigned code;
df7492f9
KH
5586
/* Remember where this sequence starts so that it can be re-read
   (see `invalid_code') or left unconsumed on exit. */
5587 src_base = src;
5588 consumed_chars_base = consumed_chars;
b73bfc1c 5589
/* Stop when the output buffer has no more room.  A byte that was
   already fetched into BYTE_AFTER_CR is given back by moving SRC_BASE
   one byte backward, since SRC_BASE determines CODING->consumed. */
df7492f9 5590 if (charbuf >= charbuf_end)
b71f6f73
KH
5591 {
5592 if (byte_after_cr >= 0)
5593 src_base--;
5594 break;
5595 }
df7492f9 5596
/* Take the pending byte fetched after a CR if there is one;
   otherwise read a fresh byte, and when decoding DOS-style
   end-of-line also fetch the byte that follows a CR. */
119852e7
KH
5597 if (byte_after_cr >= 0)
5598 {
5599 c = byte_after_cr;
5600 byte_after_cr = -1;
5601 }
5602 else
5603 {
5604 ONE_MORE_BYTE (c);
5605 if (eol_crlf && c == '\r')
5606 ONE_MORE_BYTE (byte_after_cr);
5607 }
065e3595
KH
5608 if (c < 0)
5609 goto invalid_code;
24a73b0a
KH
5610 code = c;
5611
/* Look up the charset (or list of candidate charsets) selected by
   the first byte. */
5612 val = AREF (valids, c);
1b17adfd 5613 if (! INTEGERP (val) && ! CONSP (val))
24a73b0a
KH
5614 goto invalid_code;
5615 if (INTEGERP (val))
d46c5b12 5616 {
/* A single charset: collect the remaining bytes of the code point,
   one byte per extra dimension, then decode it. */
24a73b0a
KH
5617 charset = CHARSET_FROM_ID (XFASTINT (val));
5618 dim = CHARSET_DIMENSION (charset);
5619 while (len < dim)
b73bfc1c 5620 {
24a73b0a
KH
5621 ONE_MORE_BYTE (c);
5622 code = (code << 8) | c;
5623 len++;
b73bfc1c 5624 }
24a73b0a
KH
5625 CODING_DECODE_CHAR (coding, src, src_base, src_end,
5626 charset, code, c);
d46c5b12 5627 }
df7492f9 5628 else
d46c5b12 5629 {
24a73b0a
KH
5630 /* VAL is a list of charset IDs. It is assured that the
5631 list is sorted by charset dimensions (smaller one
5632 comes first). */
5633 while (CONSP (val))
4eb6d3f1 5634 {
24a73b0a 5635 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
c7c66a95 5636 dim = CHARSET_DIMENSION (charset);
/* LEN and CODE carry over between candidates, so only the bytes
   not read yet are fetched for a charset of larger dimension. */
f9d71dcd 5637 while (len < dim)
4eb6d3f1 5638 {
acb2a965
KH
5639 ONE_MORE_BYTE (c);
5640 code = (code << 8) | c;
f9d71dcd 5641 len++;
4eb6d3f1 5642 }
24a73b0a
KH
5643 CODING_DECODE_CHAR (coding, src, src_base,
5644 src_end, charset, code, c);
/* The first candidate that yields a non-negative character wins. */
5645 if (c >= 0)
5646 break;
5647 val = XCDR (val);
ff0dacd7 5648 }
d46c5b12 5649 }
24a73b0a
KH
5650 if (c < 0)
5651 goto invalid_code;
/* When the charset changes to a different non-ASCII charset, close
   the run of the previous non-ASCII charset with an annotation and
   start a new run at the current character offset. */
5652 if (charset->id != charset_ascii
5653 && last_id != charset->id)
5654 {
5655 if (last_id != charset_ascii)
69a80ea3 5656 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
24a73b0a
KH
5657 last_id = charset->id;
5658 last_offset = char_offset;
5659 }
5660
df7492f9 5661 *charbuf++ = c;
ff0dacd7 5662 char_offset++;
df7492f9
KH
5663 continue;
5664
/* Rewind to the start of the offending sequence, re-read only its
   first byte, store that byte (negated if it was negative, as is if
   ASCII, otherwise converted by BYTE8_TO_CHAR) and count an error. */
5665 invalid_code:
5666 src = src_base;
5667 consumed_chars = consumed_chars_base;
5668 ONE_MORE_BYTE (c);
065e3595 5669 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 5670 char_offset++;
df7492f9 5671 coding->errors++;
4ed46869
KH
5672 }
5673
df7492f9 5674 no_more_source:
/* Close the run of the last non-ASCII charset, then publish how much
   input was consumed and how much of the character buffer is used. */
ff0dacd7 5675 if (last_id != charset_ascii)
69a80ea3 5676 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
5677 coding->consumed_char += consumed_chars_base;
5678 coding->consumed = src_base - coding->source;
5679 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
5680}
5681
df7492f9
KH
5682static int
5683encode_coding_charset (coding)
4ed46869 5684 struct coding_system *coding;
4ed46869 5685{
df7492f9
KH
5686 int multibytep = coding->dst_multibyte;
5687 int *charbuf = coding->charbuf;
5688 int *charbuf_end = charbuf + coding->charbuf_used;
5689 unsigned char *dst = coding->destination + coding->produced;
5690 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5691 int safe_room = MAX_MULTIBYTE_LENGTH;
5692 int produced_chars = 0;
24a73b0a 5693 Lisp_Object attrs, charset_list;
df7492f9 5694 int ascii_compatible;
b73bfc1c 5695 int c;
b73bfc1c 5696
24a73b0a 5697 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9 5698 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
fb88bf2d 5699
df7492f9 5700 while (charbuf < charbuf_end)
4ed46869 5701 {
4eb6d3f1 5702 struct charset *charset;
df7492f9 5703 unsigned code;
8f924df7 5704
df7492f9
KH
5705 ASSURE_DESTINATION (safe_room);
5706 c = *charbuf++;
5707 if (ascii_compatible && ASCII_CHAR_P (c))
5708 EMIT_ONE_ASCII_BYTE (c);
16eafb5d 5709 else if (CHAR_BYTE8_P (c))
4ed46869 5710 {
16eafb5d
KH
5711 c = CHAR_TO_BYTE8 (c);
5712 EMIT_ONE_BYTE (c);
d46c5b12 5713 }
d46c5b12 5714 else
b73bfc1c 5715 {
4eb6d3f1
KH
5716 charset = char_charset (c, charset_list, &code);
5717 if (charset)
5718 {
5719 if (CHARSET_DIMENSION (charset) == 1)
5720 EMIT_ONE_BYTE (code);
5721 else if (CHARSET_DIMENSION (charset) == 2)
5722 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5723 else if (CHARSET_DIMENSION (charset) == 3)
5724 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5725 else
5726 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5727 (code >> 8) & 0xFF, code & 0xFF);
5728 }
5729 else
41cbe562
KH
5730 {
5731 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5732 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5733 else
5734 c = coding->default_char;
5735 EMIT_ONE_BYTE (c);
5736 }
4ed46869 5737 }
4ed46869
KH
5738 }
5739
065e3595 5740 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
5741 coding->produced_char += produced_chars;
5742 coding->produced = dst - coding->destination;
5743 return 0;
4ed46869
KH
5744}
5745
5746\f
1397dc18 5747/*** 10. C library functions ***/
4ed46869 5748
df7492f9
KH
5749/* Setup coding context CODING from information about CODING_SYSTEM.
5750 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
5751 CODING_SYSTEM is invalid, signal an error. */
4ed46869 5752
ec6d2bb8 5753void
e0e989f6
KH
5754setup_coding_system (coding_system, coding)
5755 Lisp_Object coding_system;
4ed46869
KH
5756 struct coding_system *coding;
5757{
df7492f9
KH
5758 Lisp_Object attrs;
5759 Lisp_Object eol_type;
5760 Lisp_Object coding_type;
4608c386 5761 Lisp_Object val;
4ed46869 5762
df7492f9 5763 if (NILP (coding_system))
ae6f73fa 5764 coding_system = Qundecided;
c07c8e12 5765
df7492f9 5766 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
1f5dbf34 5767
df7492f9 5768 attrs = CODING_ID_ATTRS (coding->id);
0a9564cb 5769 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
1f5dbf34 5770
df7492f9
KH
5771 coding->mode = 0;
5772 coding->head_ascii = -1;
4a015c45
KH
5773 if (VECTORP (eol_type))
5774 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5775 | CODING_REQUIRE_DETECTION_MASK);
5776 else if (! EQ (eol_type, Qunix))
5777 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5778 | CODING_REQUIRE_ENCODING_MASK);
5779 else
5780 coding->common_flags = 0;
5e5c78be
KH
5781 if (! NILP (CODING_ATTR_POST_READ (attrs)))
5782 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5783 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5784 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
8f924df7
KH
5785 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5786 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
1f5dbf34 5787
df7492f9 5788 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7 5789 coding->max_charset_id = SCHARS (val) - 1;
1b3b981b 5790 coding->safe_charsets = SDATA (val);
df7492f9 5791 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4608c386 5792
df7492f9
KH
5793 coding_type = CODING_ATTR_TYPE (attrs);
5794 if (EQ (coding_type, Qundecided))
d46c5b12 5795 {
df7492f9
KH
5796 coding->detector = NULL;
5797 coding->decoder = decode_coding_raw_text;
5798 coding->encoder = encode_coding_raw_text;
5799 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
d46c5b12 5800 }
df7492f9 5801 else if (EQ (coding_type, Qiso_2022))
d46c5b12 5802 {
df7492f9
KH
5803 int i;
5804 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5805
5806 /* Invoke graphic register 0 to plane 0. */
5807 CODING_ISO_INVOCATION (coding, 0) = 0;
5808 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5809 CODING_ISO_INVOCATION (coding, 1)
5810 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5811 /* Setup the initial status of designation. */
5812 for (i = 0; i < 4; i++)
5813 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5814 /* Not single shifting initially. */
5815 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5816 /* Beginning of buffer should also be regarded as bol. */
5817 CODING_ISO_BOL (coding) = 1;
5818 coding->detector = detect_coding_iso_2022;
5819 coding->decoder = decode_coding_iso_2022;
5820 coding->encoder = encode_coding_iso_2022;
5821 if (flags & CODING_ISO_FLAG_SAFE)
5822 coding->mode |= CODING_MODE_SAFE_ENCODING;
d46c5b12 5823 coding->common_flags
df7492f9
KH
5824 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5825 | CODING_REQUIRE_FLUSHING_MASK);
5826 if (flags & CODING_ISO_FLAG_COMPOSITION)
5827 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
ff0dacd7
KH
5828 if (flags & CODING_ISO_FLAG_DESIGNATION)
5829 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
df7492f9
KH
5830 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5831 {
5832 setup_iso_safe_charsets (attrs);
5833 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7 5834 coding->max_charset_id = SCHARS (val) - 1;
1b3b981b 5835 coding->safe_charsets = SDATA (val);
df7492f9
KH
5836 }
5837 CODING_ISO_FLAGS (coding) = flags;
e951386e
KH
5838 CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5839 CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5840 CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5841 CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
d46c5b12 5842 }
df7492f9 5843 else if (EQ (coding_type, Qcharset))
d46c5b12 5844 {
df7492f9
KH
5845 coding->detector = detect_coding_charset;
5846 coding->decoder = decode_coding_charset;
5847 coding->encoder = encode_coding_charset;
d46c5b12 5848 coding->common_flags
df7492f9 5849 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
d46c5b12 5850 }
df7492f9 5851 else if (EQ (coding_type, Qutf_8))
d46c5b12 5852 {
a470d443
KH
5853 val = AREF (attrs, coding_attr_utf_bom);
5854 CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5855 : EQ (val, Qt) ? utf_with_bom
5856 : utf_without_bom);
df7492f9
KH
5857 coding->detector = detect_coding_utf_8;
5858 coding->decoder = decode_coding_utf_8;
5859 coding->encoder = encode_coding_utf_8;
5860 coding->common_flags
5861 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
a470d443
KH
5862 if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5863 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
df7492f9
KH
5864 }
5865 else if (EQ (coding_type, Qutf_16))
5866 {
a470d443
KH
5867 val = AREF (attrs, coding_attr_utf_bom);
5868 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5869 : EQ (val, Qt) ? utf_with_bom
5870 : utf_without_bom);
df7492f9 5871 val = AREF (attrs, coding_attr_utf_16_endian);
b49a1807 5872 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
df7492f9 5873 : utf_16_little_endian);
e19c3639 5874 CODING_UTF_16_SURROGATE (coding) = 0;
df7492f9
KH
5875 coding->detector = detect_coding_utf_16;
5876 coding->decoder = decode_coding_utf_16;
5877 coding->encoder = encode_coding_utf_16;
5878 coding->common_flags
5879 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
a470d443 5880 if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
b49a1807 5881 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
d46c5b12 5882 }
df7492f9 5883 else if (EQ (coding_type, Qccl))
4ed46869 5884 {
df7492f9
KH
5885 coding->detector = detect_coding_ccl;
5886 coding->decoder = decode_coding_ccl;
5887 coding->encoder = encode_coding_ccl;
c952af22 5888 coding->common_flags
df7492f9
KH
5889 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5890 | CODING_REQUIRE_FLUSHING_MASK);
5891 }
5892 else if (EQ (coding_type, Qemacs_mule))
5893 {
5894 coding->detector = detect_coding_emacs_mule;
5895 coding->decoder = decode_coding_emacs_mule;
5896 coding->encoder = encode_coding_emacs_mule;
c952af22 5897 coding->common_flags
df7492f9 5898 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
e951386e 5899 coding->spec.emacs_mule.full_support = 1;
df7492f9
KH
5900 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5901 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5902 {
5903 Lisp_Object tail, safe_charsets;
5904 int max_charset_id = 0;
5905
5906 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5907 tail = XCDR (tail))
5908 if (max_charset_id < XFASTINT (XCAR (tail)))
5909 max_charset_id = XFASTINT (XCAR (tail));
1b3b981b
AS
5910 safe_charsets = make_uninit_string (max_charset_id + 1);
5911 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
df7492f9
KH
5912 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5913 tail = XCDR (tail))
8f924df7 5914 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
df7492f9 5915 coding->max_charset_id = max_charset_id;
1b3b981b 5916 coding->safe_charsets = SDATA (safe_charsets);
e951386e 5917 coding->spec.emacs_mule.full_support = 1;
df7492f9 5918 }
e951386e
KH
5919 coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5920 coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
df7492f9
KH
5921 }
5922 else if (EQ (coding_type, Qshift_jis))
5923 {
5924 coding->detector = detect_coding_sjis;
5925 coding->decoder = decode_coding_sjis;
5926 coding->encoder = encode_coding_sjis;
c952af22 5927 coding->common_flags
df7492f9
KH
5928 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5929 }
5930 else if (EQ (coding_type, Qbig5))
5931 {
5932 coding->detector = detect_coding_big5;
5933 coding->decoder = decode_coding_big5;
5934 coding->encoder = encode_coding_big5;
c952af22 5935 coding->common_flags
df7492f9
KH
5936 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5937 }
5938 else /* EQ (coding_type, Qraw_text) */
ec6d2bb8 5939 {
df7492f9
KH
5940 coding->detector = NULL;
5941 coding->decoder = decode_coding_raw_text;
5942 coding->encoder = encode_coding_raw_text;
ea29edf2
KH
5943 if (! EQ (eol_type, Qunix))
5944 {
5945 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5946 if (! VECTORP (eol_type))
5947 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5948 }
5949
4ed46869 5950 }
4ed46869 5951
df7492f9 5952 return;
4ed46869
KH
5953}
5954
0ff61e78
KH
5955/* Return a list of charsets supported by CODING. */
5956
5957Lisp_Object
5958coding_charset_list (coding)
5959 struct coding_system *coding;
5960{
35befdaa 5961 Lisp_Object attrs, charset_list;
0ff61e78
KH
5962
5963 CODING_GET_INFO (coding, attrs, charset_list);
5964 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5965 {
5966 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5967
5968 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5969 charset_list = Viso_2022_charset_list;
5970 }
5971 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5972 {
5973 charset_list = Vemacs_mule_charset_list;
5974 }
5975 return charset_list;
5976}
5977
5978
e9f91ece
KH
5979/* Return a list of charsets supported by CODING-SYSTEM. */
5980
5981Lisp_Object
5982coding_system_charset_list (coding_system)
5983 Lisp_Object coding_system;
5984{
5985 int id;
5986 Lisp_Object attrs, charset_list;
5987
5988 CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5989 attrs = CODING_ID_ATTRS (id);
5990
5991 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5992 {
5993 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5994
5995 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5996 charset_list = Viso_2022_charset_list;
5997 else
5998 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5999 }
6000 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
6001 {
6002 charset_list = Vemacs_mule_charset_list;
6003 }
6004 else
6005 {
6006 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6007 }
6008 return charset_list;
6009}
6010
6011
df7492f9
KH
6012/* Return raw-text or one of its subsidiaries that has the same
6013 eol_type as CODING-SYSTEM. */
ec6d2bb8 6014
df7492f9
KH
6015Lisp_Object
6016raw_text_coding_system (coding_system)
6017 Lisp_Object coding_system;
ec6d2bb8 6018{
0be8721c 6019 Lisp_Object spec, attrs;
df7492f9 6020 Lisp_Object eol_type, raw_text_eol_type;
ec6d2bb8 6021
d3e4cb56
KH
6022 if (NILP (coding_system))
6023 return Qraw_text;
df7492f9
KH
6024 spec = CODING_SYSTEM_SPEC (coding_system);
6025 attrs = AREF (spec, 0);
ec6d2bb8 6026
df7492f9
KH
6027 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
6028 return coding_system;
ec6d2bb8 6029
df7492f9
KH
6030 eol_type = AREF (spec, 2);
6031 if (VECTORP (eol_type))
6032 return Qraw_text;
6033 spec = CODING_SYSTEM_SPEC (Qraw_text);
6034 raw_text_eol_type = AREF (spec, 2);
6035 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
6036 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
6037 : AREF (raw_text_eol_type, 2));
ec6d2bb8
KH
6038}
6039
54f78171 6040
df7492f9
KH
6041/* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
6042 does, return one of the subsidiary that has the same eol-spec as
fcbcfb64
KH
6043 PARENT. Otherwise, return CODING_SYSTEM. If PARENT is nil,
6044 inherit end-of-line format from the system's setting
6045 (system_eol_type). */
df7492f9
KH
6046
6047Lisp_Object
6048coding_inherit_eol_type (coding_system, parent)
b74e4686 6049 Lisp_Object coding_system, parent;
54f78171 6050{
3e139625 6051 Lisp_Object spec, eol_type;
54f78171 6052
d3e4cb56
KH
6053 if (NILP (coding_system))
6054 coding_system = Qraw_text;
df7492f9 6055 spec = CODING_SYSTEM_SPEC (coding_system);
df7492f9 6056 eol_type = AREF (spec, 2);
fcbcfb64 6057 if (VECTORP (eol_type))
df7492f9 6058 {
df7492f9
KH
6059 Lisp_Object parent_eol_type;
6060
fcbcfb64
KH
6061 if (! NILP (parent))
6062 {
6063 Lisp_Object parent_spec;
6064
4a015c45 6065 parent_spec = CODING_SYSTEM_SPEC (parent);
fcbcfb64
KH
6066 parent_eol_type = AREF (parent_spec, 2);
6067 }
6068 else
6069 parent_eol_type = system_eol_type;
df7492f9
KH
6070 if (EQ (parent_eol_type, Qunix))
6071 coding_system = AREF (eol_type, 0);
6072 else if (EQ (parent_eol_type, Qdos))
6073 coding_system = AREF (eol_type, 1);
6074 else if (EQ (parent_eol_type, Qmac))
6075 coding_system = AREF (eol_type, 2);
54f78171 6076 }
df7492f9 6077 return coding_system;
54f78171
KH
6078}
6079
4ed46869
KH
6080/* Emacs has a mechanism to automatically detect a coding system if it
6081 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
6082 it's impossible to distinguish some coding systems accurately
6083 because they use the same range of codes. So, at first, coding
6084 systems are categorized into the following categories:
6085
0ef69138 6086 o coding-category-emacs-mule
4ed46869
KH
6087
6088 The category for a coding system which has the same code range
6089 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 6090 symbol) `emacs-mule' by default.
4ed46869
KH
6091
6092 o coding-category-sjis
6093
6094 The category for a coding system which has the same code range
6095 as SJIS. Assigned the coding-system (Lisp
7717c392 6096 symbol) `japanese-shift-jis' by default.
4ed46869
KH
6097
6098 o coding-category-iso-7
6099
6100 The category for a coding system which has the same code range
7717c392 6101 as ISO2022 of 7-bit environment. This doesn't use any locking
d46c5b12
KH
6102 shift and single shift functions. This can encode/decode all
6103 charsets. Assigned the coding-system (Lisp symbol)
6104 `iso-2022-7bit' by default.
6105
6106 o coding-category-iso-7-tight
6107
6108 Same as coding-category-iso-7 except that this can
6109 encode/decode only the specified charsets.
4ed46869
KH
6110
6111 o coding-category-iso-8-1
6112
6113 The category for a coding system which has the same code range
6114 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
6115 for DIMENSION1 charset. This doesn't use any locking shift
6116 and single shift functions. Assigned the coding-system (Lisp
6117 symbol) `iso-latin-1' by default.
4ed46869
KH
6118
6119 o coding-category-iso-8-2
6120
6121 The category for a coding system which has the same code range
6122 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
6123 for DIMENSION2 charset. This doesn't use any locking shift
6124 and single shift functions. Assigned the coding-system (Lisp
6125 symbol) `japanese-iso-8bit' by default.
4ed46869 6126
7717c392 6127 o coding-category-iso-7-else
4ed46869
KH
6128
6129 The category for a coding system which has the same code range
df7492f9 6130 as ISO2022 of 7-bit environment but uses locking shift or
7717c392
KH
6131 single shift functions. Assigned the coding-system (Lisp
6132 symbol) `iso-2022-7bit-lock' by default.
6133
6134 o coding-category-iso-8-else
6135
6136 The category for a coding system which has the same code range
df7492f9 6137 as ISO2022 of 8-bit environment but uses locking shift or
7717c392
KH
6138 single shift functions. Assigned the coding-system (Lisp
6139 symbol) `iso-2022-8bit-ss2' by default.
4ed46869
KH
6140
6141 o coding-category-big5
6142
6143 The category for a coding system which has the same code range
6144 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 6145 `cn-big5' by default.
4ed46869 6146
fa42c37f
KH
6147 o coding-category-utf-8
6148
6149 The category for a coding system which has the same code range
6e76ae91 6150 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
fa42c37f
KH
6151 symbol) `utf-8' by default.
6152
6153 o coding-category-utf-16-be
6154
6155 The category for a coding system in which a text has an
6156 Unicode signature (cf. Unicode Standard) in the order of BIG
6157 endian at the head. Assigned the coding-system (Lisp symbol)
6158 `utf-16-be' by default.
6159
6160 o coding-category-utf-16-le
6161
6162 The category for a coding system in which a text has an
6163 Unicode signature (cf. Unicode Standard) in the order of
6164 LITTLE endian at the head. Assigned the coding-system (Lisp
6165 symbol) `utf-16-le' by default.
6166
1397dc18
KH
6167 o coding-category-ccl
6168
6169 The category for a coding system of which encoder/decoder is
6170 written in CCL programs. The default value is nil, i.e., no
6171 coding system is assigned.
6172
4ed46869
KH
6173 o coding-category-binary
6174
6175 The category for a coding system not categorized in any of the
6176 above. Assigned the coding-system (Lisp symbol)
e0e989f6 6177 `no-conversion' by default.
4ed46869
KH
6178
6179 Each of them is a Lisp symbol and the value is an actual
df7492f9 6180 `coding-system's (this is also a Lisp symbol) assigned by a user.
4ed46869
KH
6181 What Emacs does actually is to detect a category of coding system.
6182 Then, it uses a `coding-system' assigned to it. If Emacs can't
df7492f9 6183 decide only one possible category, it selects a category of the
4ed46869
KH
6184 highest priority. Priorities of categories are also specified by a
6185 user in a Lisp variable `coding-category-list'.
6186
6187*/
6188
df7492f9
KH
6189#define EOL_SEEN_NONE 0
6190#define EOL_SEEN_LF 1
6191#define EOL_SEEN_CR 2
6192#define EOL_SEEN_CRLF 4
66cfb530 6193
ff0dacd7
KH
6194/* Detect how end-of-line of a text of length SRC_BYTES pointed by
6195 SOURCE is encoded. If CATEGORY is one of
6196 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6197 two-byte, else they are encoded by one-byte.
6198
6199 Return one of EOL_SEEN_XXX. */
4ed46869 6200
bc4bc72a 6201#define MAX_EOL_CHECK_COUNT 3
d46c5b12
KH
6202
6203static int
89528eb3 6204detect_eol (source, src_bytes, category)
f6cbaf43 6205 const unsigned char *source;
df7492f9 6206 EMACS_INT src_bytes;
89528eb3 6207 enum coding_category category;
4ed46869 6208{
f6cbaf43 6209 const unsigned char *src = source, *src_end = src + src_bytes;
4ed46869 6210 unsigned char c;
df7492f9
KH
6211 int total = 0;
6212 int eol_seen = EOL_SEEN_NONE;
4ed46869 6213
89528eb3 6214 if ((1 << category) & CATEGORY_MASK_UTF_16)
4ed46869 6215 {
df7492f9 6216 int msb, lsb;
fa42c37f 6217
89528eb3
KH
6218 msb = category == (coding_category_utf_16_le
6219 | coding_category_utf_16_le_nosig);
df7492f9 6220 lsb = 1 - msb;
fa42c37f 6221
df7492f9 6222 while (src + 1 < src_end)
fa42c37f 6223 {
df7492f9
KH
6224 c = src[lsb];
6225 if (src[msb] == 0 && (c == '\n' || c == '\r'))
fa42c37f 6226 {
df7492f9
KH
6227 int this_eol;
6228
6229 if (c == '\n')
6230 this_eol = EOL_SEEN_LF;
6231 else if (src + 3 >= src_end
6232 || src[msb + 2] != 0
6233 || src[lsb + 2] != '\n')
6234 this_eol = EOL_SEEN_CR;
fa42c37f 6235 else
75f4f1ac
EZ
6236 {
6237 this_eol = EOL_SEEN_CRLF;
6238 src += 2;
6239 }
df7492f9
KH
6240
6241 if (eol_seen == EOL_SEEN_NONE)
6242 /* This is the first end-of-line. */
6243 eol_seen = this_eol;
6244 else if (eol_seen != this_eol)
fa42c37f 6245 {
75f4f1ac
EZ
6246 /* The found type is different from what found before.
6247 Allow for stray ^M characters in DOS EOL files. */
6248 if (eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF
6249 || eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR)
6250 eol_seen = EOL_SEEN_CRLF;
6251 else
6252 {
6253 eol_seen = EOL_SEEN_LF;
6254 break;
6255 }
fa42c37f 6256 }
df7492f9
KH
6257 if (++total == MAX_EOL_CHECK_COUNT)
6258 break;
fa42c37f 6259 }
df7492f9 6260 src += 2;
fa42c37f 6261 }
bcf26d6a 6262 }
d46c5b12 6263 else
c4825358 6264 {
df7492f9 6265 while (src < src_end)
27901516 6266 {
df7492f9
KH
6267 c = *src++;
6268 if (c == '\n' || c == '\r')
6269 {
6270 int this_eol;
d46c5b12 6271
df7492f9
KH
6272 if (c == '\n')
6273 this_eol = EOL_SEEN_LF;
6274 else if (src >= src_end || *src != '\n')
6275 this_eol = EOL_SEEN_CR;
6276 else
6277 this_eol = EOL_SEEN_CRLF, src++;
d46c5b12 6278
df7492f9
KH
6279 if (eol_seen == EOL_SEEN_NONE)
6280 /* This is the first end-of-line. */
6281 eol_seen = this_eol;
6282 else if (eol_seen != this_eol)
6283 {
75f4f1ac
EZ
6284 /* The found type is different from what found before.
6285 Allow for stray ^M characters in DOS EOL files. */
6286 if (eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF
6287 || eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR)
6288 eol_seen = EOL_SEEN_CRLF;
6289 else
6290 {
6291 eol_seen = EOL_SEEN_LF;
6292 break;
6293 }
df7492f9
KH
6294 }
6295 if (++total == MAX_EOL_CHECK_COUNT)
6296 break;
6297 }
6298 }
73be902c 6299 }
df7492f9 6300 return eol_seen;
73be902c
KH
6301}
6302
df7492f9 6303
24a73b0a 6304static Lisp_Object
df7492f9
KH
6305adjust_coding_eol_type (coding, eol_seen)
6306 struct coding_system *coding;
6307 int eol_seen;
73be902c 6308{
0be8721c 6309 Lisp_Object eol_type;
8f924df7 6310
df7492f9
KH
6311 eol_type = CODING_ID_EOL_TYPE (coding->id);
6312 if (eol_seen & EOL_SEEN_LF)
24a73b0a
KH
6313 {
6314 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6315 eol_type = Qunix;
6316 }
6f197c07 6317 else if (eol_seen & EOL_SEEN_CRLF)
24a73b0a
KH
6318 {
6319 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6320 eol_type = Qdos;
6321 }
6f197c07 6322 else if (eol_seen & EOL_SEEN_CR)
24a73b0a
KH
6323 {
6324 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6325 eol_type = Qmac;
6326 }
6327 return eol_type;
d46c5b12 6328}
4ed46869 6329
df7492f9
KH
6330/* Detect how a text specified in CODING is encoded. If a coding
6331 system is detected, update fields of CODING by the detected coding
6332 system. */
0a28aafb 6333
df7492f9
KH
6334void
6335detect_coding (coding)
d46c5b12 6336 struct coding_system *coding;
d46c5b12 6337{
8f924df7 6338 const unsigned char *src, *src_end;
73cce38d 6339 int saved_mode = coding->mode;
d46c5b12 6340
df7492f9
KH
6341 coding->consumed = coding->consumed_char = 0;
6342 coding->produced = coding->produced_char = 0;
6343 coding_set_source (coding);
1c3478b0 6344
df7492f9 6345 src_end = coding->source + coding->src_bytes;
c0e16b14 6346 coding->head_ascii = 0;
1c3478b0 6347
df7492f9
KH
6348 /* If we have not yet decided the text encoding type, detect it
6349 now. */
6350 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
b73bfc1c 6351 {
df7492f9 6352 int c, i;
6cb21a4f 6353 struct coding_detection_info detect_info;
2f3cbb32 6354 int null_byte_found = 0, eight_bit_found = 0;
df7492f9 6355
6cb21a4f 6356 detect_info.checked = detect_info.found = detect_info.rejected = 0;
2f3cbb32 6357 for (src = coding->source; src < src_end; src++)
d46c5b12 6358 {
df7492f9 6359 c = *src;
6cb21a4f 6360 if (c & 0x80)
6cb21a4f 6361 {
2f3cbb32 6362 eight_bit_found = 1;
2f3cbb32
KH
6363 if (null_byte_found)
6364 break;
6365 }
6366 else if (c < 0x20)
6367 {
6368 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6369 && ! inhibit_iso_escape_detection
6370 && ! detect_info.checked)
6cb21a4f 6371 {
2f3cbb32
KH
6372 if (detect_coding_iso_2022 (coding, &detect_info))
6373 {
6374 /* We have scanned the whole data. */
6375 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
c0e16b14
KH
6376 {
6377 /* We didn't find an 8-bit code. We may
6378 have found a null-byte, but it's very
6379 rare that a binary file confirm to
6380 ISO-2022. */
6381 src = src_end;
6382 coding->head_ascii = src - coding->source;
6383 }
6384 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
2f3cbb32
KH
6385 break;
6386 }
6387 }
97b1b294 6388 else if (! c && !inhibit_null_byte_detection)
2f3cbb32
KH
6389 {
6390 null_byte_found = 1;
6391 if (eight_bit_found)
6392 break;
6cb21a4f 6393 }
c006c0c8
KH
6394 if (! eight_bit_found)
6395 coding->head_ascii++;
6cb21a4f 6396 }
c006c0c8 6397 else if (! eight_bit_found)
c0e16b14 6398 coding->head_ascii++;
d46c5b12 6399 }
df7492f9 6400
2f3cbb32
KH
6401 if (null_byte_found || eight_bit_found
6402 || coding->head_ascii < coding->src_bytes
6cb21a4f 6403 || detect_info.found)
d46c5b12 6404 {
ff0dacd7
KH
6405 enum coding_category category;
6406 struct coding_system *this;
df7492f9 6407
6cb21a4f
KH
6408 if (coding->head_ascii == coding->src_bytes)
6409 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
6410 for (i = 0; i < coding_category_raw_text; i++)
6411 {
6412 category = coding_priorities[i];
6413 this = coding_categories + category;
6414 if (detect_info.found & (1 << category))
24a73b0a 6415 break;
6cb21a4f
KH
6416 }
6417 else
2f3cbb32
KH
6418 {
6419 if (null_byte_found)
ff0dacd7 6420 {
2f3cbb32
KH
6421 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6422 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
ff0dacd7 6423 }
2f3cbb32
KH
6424 for (i = 0; i < coding_category_raw_text; i++)
6425 {
6426 category = coding_priorities[i];
6427 this = coding_categories + category;
6428 if (this->id < 0)
6429 {
6430 /* No coding system of this category is defined. */
6431 detect_info.rejected |= (1 << category);
6432 }
6433 else if (category >= coding_category_raw_text)
6434 continue;
6435 else if (detect_info.checked & (1 << category))
6436 {
6437 if (detect_info.found & (1 << category))
6438 break;
6439 }
6440 else if ((*(this->detector)) (coding, &detect_info)
6441 && detect_info.found & (1 << category))
6442 {
6443 if (category == coding_category_utf_16_auto)
6444 {
6445 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6446 category = coding_category_utf_16_le;
6447 else
6448 category = coding_category_utf_16_be;
6449 }
6450 break;
6451 }
6452 }
2f3cbb32 6453 }
c0e16b14
KH
6454
6455 if (i < coding_category_raw_text)
6456 setup_coding_system (CODING_ID_NAME (this->id), coding);
6457 else if (null_byte_found)
6458 setup_coding_system (Qno_conversion, coding);
6459 else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6460 == CATEGORY_MASK_ANY)
6461 setup_coding_system (Qraw_text, coding);
6462 else if (detect_info.rejected)
6463 for (i = 0; i < coding_category_raw_text; i++)
6464 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6465 {
6466 this = coding_categories + coding_priorities[i];
6467 setup_coding_system (CODING_ID_NAME (this->id), coding);
6468 break;
6469 }
d46c5b12 6470 }
b73bfc1c 6471 }
a470d443
KH
6472 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6473 == coding_category_utf_8_auto)
6474 {
6475 Lisp_Object coding_systems;
6476 struct coding_detection_info detect_info;
6477
6478 coding_systems
6479 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6480 detect_info.found = detect_info.rejected = 0;
6481 coding->head_ascii = 0;
6482 if (CONSP (coding_systems)
6483 && detect_coding_utf_8 (coding, &detect_info))
6484 {
6485 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6486 setup_coding_system (XCAR (coding_systems), coding);
6487 else
6488 setup_coding_system (XCDR (coding_systems), coding);
6489 }
6490 }
24a73b0a
KH
6491 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6492 == coding_category_utf_16_auto)
b49a1807
KH
6493 {
6494 Lisp_Object coding_systems;
6495 struct coding_detection_info detect_info;
6496
6497 coding_systems
a470d443 6498 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
b49a1807 6499 detect_info.found = detect_info.rejected = 0;
a470d443 6500 coding->head_ascii = 0;
b49a1807 6501 if (CONSP (coding_systems)
24a73b0a 6502 && detect_coding_utf_16 (coding, &detect_info))
b49a1807
KH
6503 {
6504 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6505 setup_coding_system (XCAR (coding_systems), coding);
24a73b0a 6506 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
b49a1807
KH
6507 setup_coding_system (XCDR (coding_systems), coding);
6508 }
6509 }
73cce38d 6510 coding->mode = saved_mode;
4ed46869 6511}
4ed46869 6512
d46c5b12 6513
aaaf0b1e 6514static void
df7492f9 6515decode_eol (coding)
aaaf0b1e 6516 struct coding_system *coding;
aaaf0b1e 6517{
24a73b0a
KH
6518 Lisp_Object eol_type;
6519 unsigned char *p, *pbeg, *pend;
3ed051d4 6520
24a73b0a 6521 eol_type = CODING_ID_EOL_TYPE (coding->id);
0a9564cb 6522 if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
24a73b0a
KH
6523 return;
6524
6525 if (NILP (coding->dst_object))
6526 pbeg = coding->destination;
6527 else
6528 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6529 pend = pbeg + coding->produced;
6530
6531 if (VECTORP (eol_type))
aaaf0b1e 6532 {
df7492f9 6533 int eol_seen = EOL_SEEN_NONE;
4ed46869 6534
24a73b0a 6535 for (p = pbeg; p < pend; p++)
aaaf0b1e 6536 {
df7492f9
KH
6537 if (*p == '\n')
6538 eol_seen |= EOL_SEEN_LF;
6539 else if (*p == '\r')
aaaf0b1e 6540 {
df7492f9 6541 if (p + 1 < pend && *(p + 1) == '\n')
aaaf0b1e 6542 {
df7492f9
KH
6543 eol_seen |= EOL_SEEN_CRLF;
6544 p++;
aaaf0b1e 6545 }
aaaf0b1e 6546 else
df7492f9 6547 eol_seen |= EOL_SEEN_CR;
aaaf0b1e 6548 }
aaaf0b1e 6549 }
75f4f1ac
EZ
6550 /* Handle DOS-style EOLs in a file with stray ^M characters. */
6551 if ((eol_seen & EOL_SEEN_CRLF) != 0
6552 && (eol_seen & EOL_SEEN_CR) != 0
6553 && (eol_seen & EOL_SEEN_LF) == 0)
6554 eol_seen = EOL_SEEN_CRLF;
6555 else if (eol_seen != EOL_SEEN_NONE
24a73b0a
KH
6556 && eol_seen != EOL_SEEN_LF
6557 && eol_seen != EOL_SEEN_CRLF
6558 && eol_seen != EOL_SEEN_CR)
6559 eol_seen = EOL_SEEN_LF;
df7492f9 6560 if (eol_seen != EOL_SEEN_NONE)
24a73b0a 6561 eol_type = adjust_coding_eol_type (coding, eol_seen);
aaaf0b1e 6562 }
d46c5b12 6563
24a73b0a 6564 if (EQ (eol_type, Qmac))
27901516 6565 {
24a73b0a 6566 for (p = pbeg; p < pend; p++)
df7492f9
KH
6567 if (*p == '\r')
6568 *p = '\n';
4ed46869 6569 }
24a73b0a 6570 else if (EQ (eol_type, Qdos))
df7492f9 6571 {
24a73b0a 6572 int n = 0;
b73bfc1c 6573
24a73b0a
KH
6574 if (NILP (coding->dst_object))
6575 {
4347441b
KH
6576 /* Start deleting '\r' from the tail to minimize the memory
6577 movement. */
24a73b0a
KH
6578 for (p = pend - 2; p >= pbeg; p--)
6579 if (*p == '\r')
6580 {
6581 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
6582 n++;
6583 }
6584 }
6585 else
6586 {
4347441b
KH
6587 int pos_byte = coding->dst_pos_byte;
6588 int pos = coding->dst_pos;
6589 int pos_end = pos + coding->produced_char - 1;
6590
6591 while (pos < pos_end)
6592 {
6593 p = BYTE_POS_ADDR (pos_byte);
6594 if (*p == '\r' && p[1] == '\n')
6595 {
6596 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6597 n++;
6598 pos_end--;
6599 }
6600 pos++;
69b8522d
KH
6601 if (coding->dst_multibyte)
6602 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6603 else
6604 pos_byte++;
4347441b 6605 }
24a73b0a
KH
6606 }
6607 coding->produced -= n;
6608 coding->produced_char -= n;
aaaf0b1e 6609 }
4ed46869
KH
6610}
6611
7d64c6ad 6612
a6f87d34
KH
6613/* Return a translation table (or list of them) from coding system
6614 attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6615 decoding (ENCODEP is zero). */
7d64c6ad 6616
e6a54062 6617static Lisp_Object
09ee6fdd
KH
6618get_translation_table (attrs, encodep, max_lookup)
6619 Lisp_Object attrs;
6620 int encodep, *max_lookup;
7d64c6ad
KH
6621{
6622 Lisp_Object standard, translation_table;
09ee6fdd 6623 Lisp_Object val;
7d64c6ad
KH
6624
6625 if (encodep)
6626 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6627 standard = Vstandard_translation_table_for_encode;
6628 else
6629 translation_table = CODING_ATTR_DECODE_TBL (attrs),
6630 standard = Vstandard_translation_table_for_decode;
7d64c6ad 6631 if (NILP (translation_table))
09ee6fdd
KH
6632 translation_table = standard;
6633 else
a6f87d34 6634 {
09ee6fdd
KH
6635 if (SYMBOLP (translation_table))
6636 translation_table = Fget (translation_table, Qtranslation_table);
6637 else if (CONSP (translation_table))
6638 {
6639 translation_table = Fcopy_sequence (translation_table);
6640 for (val = translation_table; CONSP (val); val = XCDR (val))
6641 if (SYMBOLP (XCAR (val)))
6642 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6643 }
6644 if (CHAR_TABLE_P (standard))
6645 {
6646 if (CONSP (translation_table))
6647 translation_table = nconc2 (translation_table,
6648 Fcons (standard, Qnil));
6649 else
6650 translation_table = Fcons (translation_table,
6651 Fcons (standard, Qnil));
6652 }
a6f87d34 6653 }
2170c8f0
KH
6654
6655 if (max_lookup)
09ee6fdd 6656 {
2170c8f0
KH
6657 *max_lookup = 1;
6658 if (CHAR_TABLE_P (translation_table)
6659 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6660 {
6661 val = XCHAR_TABLE (translation_table)->extras[1];
6662 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6663 *max_lookup = XFASTINT (val);
6664 }
6665 else if (CONSP (translation_table))
6666 {
6667 Lisp_Object tail, val;
09ee6fdd 6668
2170c8f0
KH
6669 for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6670 if (CHAR_TABLE_P (XCAR (tail))
6671 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6672 {
6673 val = XCHAR_TABLE (XCAR (tail))->extras[1];
6674 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6675 *max_lookup = XFASTINT (val);
6676 }
6677 }
a6f87d34 6678 }
7d64c6ad
KH
6679 return translation_table;
6680}
6681
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables.  C and TRANS must be lvalues.

   TRANS is first set to Qnil.  Whenever a table maps C to a
   character, C is replaced by that character and TRANS is reset to
   Qnil; with a list, the lookup then continues in the next table
   using the new C.  With a list, the first non-nil, non-character
   entry stops the lookup and is left in TRANS; with a single
   char-table, whatever non-character entry was found is left in
   TRANS.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)                       \
  do {                                                                  \
    trans = Qnil;                                                       \
    if (CHAR_TABLE_P (table))                                           \
      {                                                                 \
        trans = CHAR_TABLE_REF (table, c);                              \
        if (CHARACTERP (trans))                                         \
          {                                                             \
            c = XFASTINT (trans);                                       \
            trans = Qnil;                                               \
          }                                                             \
      }                                                                 \
    else if (CONSP (table))                                             \
      {                                                                 \
        Lisp_Object lookup_tail_ = table;                               \
                                                                        \
        while (CONSP (lookup_tail_))                                    \
          {                                                             \
            if (CHAR_TABLE_P (XCAR (lookup_tail_)))                     \
              {                                                         \
                trans = CHAR_TABLE_REF (XCAR (lookup_tail_), c);        \
                if (CHARACTERP (trans))                                 \
                  {                                                     \
                    c = XFASTINT (trans);                               \
                    trans = Qnil;                                       \
                  }                                                     \
                else if (! NILP (trans))                                \
                  break;                                                \
              }                                                         \
            lookup_tail_ = XCDR (lookup_tail_);                         \
          }                                                             \
      }                                                                 \
  } while (0)
6706
7d64c6ad 6707
e951386e
KH
6708/* Return a translation of character(s) at BUF according to TRANS.
6709 TRANS is TO-CHAR or ((FROM . TO) ...) where
6710 FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6711 The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6712 translation is found, and Qnil if not found..
6713 If BUF is too short to lookup characters in FROM, return Qt. */
6714
69a80ea3 6715static Lisp_Object
e951386e
KH
6716get_translation (trans, buf, buf_end)
6717 Lisp_Object trans;
69a80ea3 6718 int *buf, *buf_end;
69a80ea3 6719{
e951386e
KH
6720
6721 if (INTEGERP (trans))
6722 return trans;
6723 for (; CONSP (trans); trans = XCDR (trans))
69a80ea3 6724 {
e951386e
KH
6725 Lisp_Object val = XCAR (trans);
6726 Lisp_Object from = XCAR (val);
6727 int len = ASIZE (from);
6728 int i;
69a80ea3 6729
e951386e 6730 for (i = 0; i < len; i++)
69a80ea3 6731 {
e951386e
KH
6732 if (buf + i == buf_end)
6733 return Qt;
6734 if (XINT (AREF (from, i)) != buf[i])
6735 break;
69a80ea3 6736 }
e951386e
KH
6737 if (i == len)
6738 return val;
69a80ea3 6739 }
e951386e 6740 return Qnil;
69a80ea3
KH
6741}
6742
6743
d46c5b12 6744static int
69a80ea3 6745produce_chars (coding, translation_table, last_block)
df7492f9 6746 struct coding_system *coding;
69a80ea3
KH
6747 Lisp_Object translation_table;
6748 int last_block;
4ed46869 6749{
df7492f9
KH
6750 unsigned char *dst = coding->destination + coding->produced;
6751 unsigned char *dst_end = coding->destination + coding->dst_bytes;
119852e7
KH
6752 EMACS_INT produced;
6753 EMACS_INT produced_chars = 0;
69a80ea3 6754 int carryover = 0;
4ed46869 6755
df7492f9 6756 if (! coding->chars_at_source)
4ed46869 6757 {
119852e7 6758 /* Source characters are in coding->charbuf. */
fba4576f
AS
6759 int *buf = coding->charbuf;
6760 int *buf_end = buf + coding->charbuf_used;
4ed46869 6761
db274c7a
KH
6762 if (EQ (coding->src_object, coding->dst_object))
6763 {
6764 coding_set_source (coding);
6765 dst_end = ((unsigned char *) coding->source) + coding->consumed;
6766 }
4ed46869 6767
df7492f9 6768 while (buf < buf_end)
4ed46869 6769 {
69a80ea3 6770 int c = *buf, i;
bc4bc72a 6771
df7492f9
KH
6772 if (c >= 0)
6773 {
69a80ea3
KH
6774 int from_nchars = 1, to_nchars = 1;
6775 Lisp_Object trans = Qnil;
6776
09ee6fdd 6777 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
e6a54062 6778 if (! NILP (trans))
69a80ea3 6779 {
e951386e
KH
6780 trans = get_translation (trans, buf, buf_end);
6781 if (INTEGERP (trans))
6782 c = XINT (trans);
6783 else if (CONSP (trans))
6784 {
6785 from_nchars = ASIZE (XCAR (trans));
6786 trans = XCDR (trans);
6787 if (INTEGERP (trans))
6788 c = XINT (trans);
6789 else
6790 {
6791 to_nchars = ASIZE (trans);
6792 c = XINT (AREF (trans, 0));
6793 }
6794 }
6795 else if (EQ (trans, Qt) && ! last_block)
69a80ea3 6796 break;
69a80ea3
KH
6797 }
6798
6799 if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
6800 {
6801 dst = alloc_destination (coding,
6802 buf_end - buf
6803 + MAX_MULTIBYTE_LENGTH * to_nchars,
6804 dst);
db274c7a
KH
6805 if (EQ (coding->src_object, coding->dst_object))
6806 {
6807 coding_set_source (coding);
e951386e
KH
6808 dst_end = (((unsigned char *) coding->source)
6809 + coding->consumed);
db274c7a
KH
6810 }
6811 else
6812 dst_end = coding->destination + coding->dst_bytes;
69a80ea3
KH
6813 }
6814
433f7f87 6815 for (i = 0; i < to_nchars; i++)
69a80ea3 6816 {
433f7f87
KH
6817 if (i > 0)
6818 c = XINT (AREF (trans, i));
69a80ea3
KH
6819 if (coding->dst_multibyte
6820 || ! CHAR_BYTE8_P (c))
db274c7a 6821 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
69a80ea3
KH
6822 else
6823 *dst++ = CHAR_TO_BYTE8 (c);
6824 }
6825 produced_chars += to_nchars;
e951386e 6826 buf += from_nchars;
d46c5b12 6827 }
df7492f9 6828 else
69a80ea3
KH
6829 /* This is an annotation datum. (-C) is the length. */
6830 buf += -c;
4ed46869 6831 }
69a80ea3 6832 carryover = buf_end - buf;
4ed46869 6833 }
fa42c37f 6834 else
fa42c37f 6835 {
119852e7 6836 /* Source characters are at coding->source. */
8f924df7 6837 const unsigned char *src = coding->source;
119852e7 6838 const unsigned char *src_end = src + coding->consumed;
4ed46869 6839
db274c7a
KH
6840 if (EQ (coding->dst_object, coding->src_object))
6841 dst_end = (unsigned char *) src;
df7492f9 6842 if (coding->src_multibyte != coding->dst_multibyte)
fa42c37f 6843 {
df7492f9 6844 if (coding->src_multibyte)
fa42c37f 6845 {
71c81426 6846 int multibytep = 1;
4533845d 6847 EMACS_INT consumed_chars = 0;
d46c5b12 6848
df7492f9
KH
6849 while (1)
6850 {
8f924df7 6851 const unsigned char *src_base = src;
df7492f9 6852 int c;
b73bfc1c 6853
df7492f9 6854 ONE_MORE_BYTE (c);
119852e7 6855 if (dst == dst_end)
df7492f9 6856 {
119852e7
KH
6857 if (EQ (coding->src_object, coding->dst_object))
6858 dst_end = (unsigned char *) src;
6859 if (dst == dst_end)
df7492f9 6860 {
119852e7
KH
6861 EMACS_INT offset = src - coding->source;
6862
6863 dst = alloc_destination (coding, src_end - src + 1,
6864 dst);
6865 dst_end = coding->destination + coding->dst_bytes;
6866 coding_set_source (coding);
6867 src = coding->source + offset;
6868 src_end = coding->source + coding->src_bytes;
db274c7a
KH
6869 if (EQ (coding->src_object, coding->dst_object))
6870 dst_end = (unsigned char *) src;
df7492f9 6871 }
df7492f9
KH
6872 }
6873 *dst++ = c;
6874 produced_chars++;
6875 }
6876 no_more_source:
6877 ;
fa42c37f
KH
6878 }
6879 else
df7492f9
KH
6880 while (src < src_end)
6881 {
71c81426 6882 int multibytep = 1;
df7492f9 6883 int c = *src++;
b73bfc1c 6884
df7492f9
KH
6885 if (dst >= dst_end - 1)
6886 {
2c78b7e1 6887 if (EQ (coding->src_object, coding->dst_object))
8f924df7 6888 dst_end = (unsigned char *) src;
2c78b7e1
KH
6889 if (dst >= dst_end - 1)
6890 {
119852e7 6891 EMACS_INT offset = src - coding->source;
db274c7a 6892 EMACS_INT more_bytes;
119852e7 6893
db274c7a
KH
6894 if (EQ (coding->src_object, coding->dst_object))
6895 more_bytes = ((src_end - src) / 2) + 2;
6896 else
6897 more_bytes = src_end - src + 2;
6898 dst = alloc_destination (coding, more_bytes, dst);
2c78b7e1
KH
6899 dst_end = coding->destination + coding->dst_bytes;
6900 coding_set_source (coding);
119852e7 6901 src = coding->source + offset;
2c78b7e1 6902 src_end = coding->source + coding->src_bytes;
db274c7a
KH
6903 if (EQ (coding->src_object, coding->dst_object))
6904 dst_end = (unsigned char *) src;
2c78b7e1 6905 }
df7492f9
KH
6906 }
6907 EMIT_ONE_BYTE (c);
6908 }
d46c5b12 6909 }
df7492f9
KH
6910 else
6911 {
6912 if (!EQ (coding->src_object, coding->dst_object))
fa42c37f 6913 {
119852e7 6914 EMACS_INT require = coding->src_bytes - coding->dst_bytes;
4ed46869 6915
df7492f9 6916 if (require > 0)
fa42c37f 6917 {
df7492f9
KH
6918 EMACS_INT offset = src - coding->source;
6919
6920 dst = alloc_destination (coding, require, dst);
6921 coding_set_source (coding);
6922 src = coding->source + offset;
6923 src_end = coding->source + coding->src_bytes;
fa42c37f
KH
6924 }
6925 }
119852e7 6926 produced_chars = coding->consumed_char;
df7492f9 6927 while (src < src_end)
14daee73 6928 *dst++ = *src++;
fa42c37f
KH
6929 }
6930 }
6931
df7492f9 6932 produced = dst - (coding->destination + coding->produced);
284201e4 6933 if (BUFFERP (coding->dst_object) && produced_chars > 0)
df7492f9
KH
6934 insert_from_gap (produced_chars, produced);
6935 coding->produced += produced;
6936 coding->produced_char += produced_chars;
69a80ea3 6937 return carryover;
fa42c37f
KH
6938}
6939
ff0dacd7
KH
6940/* Compose text in CODING->object according to the annotation data at
6941 CHARBUF. CHARBUF is an array:
e951386e 6942 [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
df7492f9 6943 */
4ed46869 6944
df7492f9 6945static INLINE void
69a80ea3 6946produce_composition (coding, charbuf, pos)
4ed46869 6947 struct coding_system *coding;
df7492f9 6948 int *charbuf;
69a80ea3 6949 EMACS_INT pos;
4ed46869 6950{
df7492f9 6951 int len;
69a80ea3 6952 EMACS_INT to;
df7492f9 6953 enum composition_method method;
df7492f9 6954 Lisp_Object components;
fa42c37f 6955
e951386e 6956 len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
69a80ea3 6957 to = pos + charbuf[2];
e951386e 6958 method = (enum composition_method) (charbuf[4]);
d46c5b12 6959
df7492f9
KH
6960 if (method == COMPOSITION_RELATIVE)
6961 components = Qnil;
e951386e 6962 else
d46c5b12 6963 {
df7492f9 6964 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
e951386e 6965 int i, j;
b73bfc1c 6966
e951386e
KH
6967 if (method == COMPOSITION_WITH_RULE)
6968 len = charbuf[2] * 3 - 2;
6969 charbuf += MAX_ANNOTATION_LENGTH;
6970 /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6971 for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
9ffd559c 6972 {
e951386e
KH
6973 if (charbuf[i] >= 0)
6974 args[j] = make_number (charbuf[i]);
6975 else
6976 {
6977 i++;
6978 args[j] = make_number (charbuf[i] % 0x100);
6979 }
9ffd559c 6980 }
e951386e 6981 components = (i == j ? Fstring (j, args) : Fvector (j, args));
d46c5b12 6982 }
69a80ea3 6983 compose_text (pos, to, components, Qnil, coding->dst_object);
d46c5b12
KH
6984}
6985
d46c5b12 6986
ff0dacd7
KH
6987/* Put `charset' property on text in CODING->object according to
6988 the annotation data at CHARBUF. CHARBUF is an array:
69a80ea3 6989 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
ff0dacd7 6990 */
d46c5b12 6991
ff0dacd7 6992static INLINE void
69a80ea3 6993produce_charset (coding, charbuf, pos)
d46c5b12 6994 struct coding_system *coding;
ff0dacd7 6995 int *charbuf;
69a80ea3 6996 EMACS_INT pos;
d46c5b12 6997{
69a80ea3
KH
6998 EMACS_INT from = pos - charbuf[2];
6999 struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
b73bfc1c 7000
69a80ea3 7001 Fput_text_property (make_number (from), make_number (pos),
ff0dacd7
KH
7002 Qcharset, CHARSET_NAME (charset),
7003 coding->dst_object);
d46c5b12
KH
7004}
7005
d46c5b12 7006
/* Number of elements first tried for the conversion work area.  */
#define CHARBUF_SIZE 0x4000

/* Allocate CODING->charbuf with alloca and record the number of
   elements in CODING->charbuf_size.  CHARBUF_SIZE elements are tried
   first, and the size is halved after each allocation that yields
   NULL, for as long as it stays above 1024.

   If no area was obtained, CODING_RESULT_INSUFFICIENT_MEM is recorded
   and the function using this macro returns CODING->result; the macro
   therefore expands to a `return' statement.  */

#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int work_area_size_;                                                \
                                                                        \
    (coding)->charbuf = NULL;                                           \
    for (work_area_size_ = CHARBUF_SIZE;                                \
         work_area_size_ > 1024;                                        \
         work_area_size_ >>= 1)                                         \
      {                                                                 \
        (coding)->charbuf                                               \
          = (int *) alloca (sizeof (int) * work_area_size_);            \
        if ((coding)->charbuf)                                          \
          break;                                                        \
      }                                                                 \
    if (! (coding)->charbuf)                                            \
      {                                                                 \
        record_conversion_result ((coding),                             \
                                  CODING_RESULT_INSUFFICIENT_MEM);      \
        return (coding)->result;                                        \
      }                                                                 \
    (coding)->charbuf_size = work_area_size_;                           \
  } while (0)
4ed46869 7028
d46c5b12
KH
7029
7030static void
69a80ea3 7031produce_annotation (coding, pos)
d46c5b12 7032 struct coding_system *coding;
69a80ea3 7033 EMACS_INT pos;
d46c5b12 7034{
df7492f9
KH
7035 int *charbuf = coding->charbuf;
7036 int *charbuf_end = charbuf + coding->charbuf_used;
d46c5b12 7037
ff0dacd7
KH
7038 if (NILP (coding->dst_object))
7039 return;
d46c5b12 7040
df7492f9 7041 while (charbuf < charbuf_end)
a84f1519 7042 {
df7492f9 7043 if (*charbuf >= 0)
e951386e 7044 pos++, charbuf++;
d46c5b12 7045 else
d46c5b12 7046 {
df7492f9 7047 int len = -*charbuf;
e951386e
KH
7048
7049 if (len > 2)
7050 switch (charbuf[1])
7051 {
7052 case CODING_ANNOTATE_COMPOSITION_MASK:
7053 produce_composition (coding, charbuf, pos);
7054 break;
7055 case CODING_ANNOTATE_CHARSET_MASK:
7056 produce_charset (coding, charbuf, pos);
7057 break;
7058 }
df7492f9 7059 charbuf += len;
d46c5b12 7060 }
a84f1519 7061 }
d46c5b12
KH
7062}
7063
df7492f9
KH
7064/* Decode the data at CODING->src_object into CODING->dst_object.
7065 CODING->src_object is a buffer, a string, or nil.
7066 CODING->dst_object is a buffer.
d46c5b12 7067
df7492f9
KH
7068 If CODING->src_object is a buffer, it must be the current buffer.
7069 In this case, if CODING->src_pos is positive, it is a position of
7070 the source text in the buffer, otherwise, the source text is in the
7071 gap area of the buffer, and CODING->src_pos specifies the offset of
7072 the text from GPT (which must be the same as PT). If this is the
7073 same buffer as CODING->dst_object, CODING->src_pos must be
7074 negative.
d46c5b12 7075
b6828792 7076 If CODING->src_object is a string, CODING->src_pos is an index to
df7492f9 7077 that string.
d46c5b12 7078
df7492f9
KH
7079 If CODING->src_object is nil, CODING->source must already point to
7080 the non-relocatable memory area. In this case, CODING->src_pos is
7081 an offset from CODING->source.
73be902c 7082
df7492f9
KH
7083 The decoded data is inserted at the current point of the buffer
7084 CODING->dst_object.
7085*/
d46c5b12 7086
df7492f9
KH
7087static int
7088decode_coding (coding)
d46c5b12 7089 struct coding_system *coding;
d46c5b12 7090{
df7492f9 7091 Lisp_Object attrs;
24a73b0a 7092 Lisp_Object undo_list;
7d64c6ad 7093 Lisp_Object translation_table;
69a80ea3
KH
7094 int carryover;
7095 int i;
d46c5b12 7096
df7492f9
KH
7097 if (BUFFERP (coding->src_object)
7098 && coding->src_pos > 0
7099 && coding->src_pos < GPT
7100 && coding->src_pos + coding->src_chars > GPT)
7101 move_gap_both (coding->src_pos, coding->src_pos_byte);
d46c5b12 7102
24a73b0a 7103 undo_list = Qt;
df7492f9 7104 if (BUFFERP (coding->dst_object))
1c3478b0 7105 {
df7492f9
KH
7106 if (current_buffer != XBUFFER (coding->dst_object))
7107 set_buffer_internal (XBUFFER (coding->dst_object));
7108 if (GPT != PT)
7109 move_gap_both (PT, PT_BYTE);
24a73b0a
KH
7110 undo_list = current_buffer->undo_list;
7111 current_buffer->undo_list = Qt;
1c3478b0
KH
7112 }
7113
df7492f9
KH
7114 coding->consumed = coding->consumed_char = 0;
7115 coding->produced = coding->produced_char = 0;
7116 coding->chars_at_source = 0;
065e3595 7117 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 7118 coding->errors = 0;
1c3478b0 7119
df7492f9
KH
7120 ALLOC_CONVERSION_WORK_AREA (coding);
7121
7122 attrs = CODING_ID_ATTRS (coding->id);
2170c8f0 7123 translation_table = get_translation_table (attrs, 0, NULL);
df7492f9 7124
69a80ea3 7125 carryover = 0;
df7492f9 7126 do
b73bfc1c 7127 {
69a80ea3
KH
7128 EMACS_INT pos = coding->dst_pos + coding->produced_char;
7129
df7492f9
KH
7130 coding_set_source (coding);
7131 coding->annotated = 0;
69a80ea3 7132 coding->charbuf_used = carryover;
df7492f9 7133 (*(coding->decoder)) (coding);
df7492f9 7134 coding_set_destination (coding);
69a80ea3 7135 carryover = produce_chars (coding, translation_table, 0);
df7492f9 7136 if (coding->annotated)
69a80ea3
KH
7137 produce_annotation (coding, pos);
7138 for (i = 0; i < carryover; i++)
7139 coding->charbuf[i]
7140 = coding->charbuf[coding->charbuf_used - carryover + i];
d46c5b12 7141 }
df7492f9 7142 while (coding->consumed < coding->src_bytes
54b367bb
KH
7143 && (coding->result == CODING_RESULT_SUCCESS
7144 || coding->result == CODING_RESULT_INVALID_SRC));
d46c5b12 7145
69a80ea3
KH
7146 if (carryover > 0)
7147 {
7148 coding_set_destination (coding);
7149 coding->charbuf_used = carryover;
7150 produce_chars (coding, translation_table, 1);
7151 }
7152
df7492f9
KH
7153 coding->carryover_bytes = 0;
7154 if (coding->consumed < coding->src_bytes)
d46c5b12 7155 {
df7492f9 7156 int nbytes = coding->src_bytes - coding->consumed;
8f924df7 7157 const unsigned char *src;
df7492f9
KH
7158
7159 coding_set_source (coding);
7160 coding_set_destination (coding);
7161 src = coding->source + coding->consumed;
7162
7163 if (coding->mode & CODING_MODE_LAST_BLOCK)
1c3478b0 7164 {
df7492f9
KH
7165 /* Flush out unprocessed data as binary chars. We are sure
7166 that the number of data is less than the size of
7167 coding->charbuf. */
065e3595 7168 coding->charbuf_used = 0;
b2dab6c8
JR
7169 coding->chars_at_source = 0;
7170
df7492f9 7171 while (nbytes-- > 0)
1c3478b0 7172 {
df7492f9 7173 int c = *src++;
98725083 7174
1c91457d
KH
7175 if (c & 0x80)
7176 c = BYTE8_TO_CHAR (c);
7177 coding->charbuf[coding->charbuf_used++] = c;
1c3478b0 7178 }
f6cbaf43 7179 produce_chars (coding, Qnil, 1);
d46c5b12 7180 }
d46c5b12 7181 else
df7492f9
KH
7182 {
7183 /* Record unprocessed bytes in coding->carryover. We are
7184 sure that the number of data is less than the size of
7185 coding->carryover. */
7186 unsigned char *p = coding->carryover;
7187
f289d375
KH
7188 if (nbytes > sizeof coding->carryover)
7189 nbytes = sizeof coding->carryover;
df7492f9
KH
7190 coding->carryover_bytes = nbytes;
7191 while (nbytes-- > 0)
7192 *p++ = *src++;
1c3478b0 7193 }
df7492f9 7194 coding->consumed = coding->src_bytes;
b73bfc1c 7195 }
69f76525 7196
0a9564cb
EZ
7197 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7198 && !inhibit_eol_conversion)
4347441b 7199 decode_eol (coding);
24a73b0a
KH
7200 if (BUFFERP (coding->dst_object))
7201 {
7202 current_buffer->undo_list = undo_list;
7203 record_insert (coding->dst_pos, coding->produced_char);
7204 }
73be902c 7205 return coding->result;
4ed46869
KH
7206}
7207
aaaf0b1e 7208
e1c23804 7209/* Extract an annotation datum from a composition starting at POS and
ff0dacd7
KH
7210 ending before LIMIT of CODING->src_object (buffer or string), store
7211 the data in BUF, set *STOP to a starting position of the next
7212 composition (if any) or to LIMIT, and return the address of the
7213 next element of BUF.
7214
7215 If such an annotation is not found, set *STOP to a starting
7216 position of a composition after POS (if any) or to LIMIT, and
7217 return BUF. */
7218
7219static INLINE int *
7220handle_composition_annotation (pos, limit, coding, buf, stop)
7221 EMACS_INT pos, limit;
aaaf0b1e 7222 struct coding_system *coding;
ff0dacd7
KH
7223 int *buf;
7224 EMACS_INT *stop;
aaaf0b1e 7225{
ff0dacd7
KH
7226 EMACS_INT start, end;
7227 Lisp_Object prop;
aaaf0b1e 7228
ff0dacd7
KH
7229 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7230 || end > limit)
7231 *stop = limit;
7232 else if (start > pos)
7233 *stop = start;
7234 else
aaaf0b1e 7235 {
ff0dacd7 7236 if (start == pos)
aaaf0b1e 7237 {
ff0dacd7
KH
7238 /* We found a composition. Store the corresponding
7239 annotation data in BUF. */
7240 int *head = buf;
7241 enum composition_method method = COMPOSITION_METHOD (prop);
7242 int nchars = COMPOSITION_LENGTH (prop);
7243
e951386e 7244 ADD_COMPOSITION_DATA (buf, nchars, 0, method);
ff0dacd7 7245 if (method != COMPOSITION_RELATIVE)
aaaf0b1e 7246 {
ff0dacd7
KH
7247 Lisp_Object components;
7248 int len, i, i_byte;
7249
7250 components = COMPOSITION_COMPONENTS (prop);
7251 if (VECTORP (components))
aaaf0b1e 7252 {
ff0dacd7
KH
7253 len = XVECTOR (components)->size;
7254 for (i = 0; i < len; i++)
7255 *buf++ = XINT (AREF (components, i));
aaaf0b1e 7256 }
ff0dacd7 7257 else if (STRINGP (components))
aaaf0b1e 7258 {
8f924df7 7259 len = SCHARS (components);
ff0dacd7
KH
7260 i = i_byte = 0;
7261 while (i < len)
7262 {
7263 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7264 buf++;
7265 }
7266 }
7267 else if (INTEGERP (components))
7268 {
7269 len = 1;
7270 *buf++ = XINT (components);
7271 }
7272 else if (CONSP (components))
7273 {
7274 for (len = 0; CONSP (components);
7275 len++, components = XCDR (components))
7276 *buf++ = XINT (XCAR (components));
aaaf0b1e 7277 }
aaaf0b1e 7278 else
ff0dacd7
KH
7279 abort ();
7280 *head -= len;
aaaf0b1e 7281 }
aaaf0b1e 7282 }
ff0dacd7
KH
7283
7284 if (find_composition (end, limit, &start, &end, &prop,
7285 coding->src_object)
7286 && end <= limit)
7287 *stop = start;
7288 else
7289 *stop = limit;
aaaf0b1e 7290 }
ff0dacd7
KH
7291 return buf;
7292}
7293
7294
e1c23804 7295/* Extract an annotation datum from a text property `charset' at POS of
ff0dacd7
KH
7296 CODING->src_object (buffer of string), store the data in BUF, set
7297 *STOP to the position where the value of `charset' property changes
7298 (limiting by LIMIT), and return the address of the next element of
7299 BUF.
7300
7301 If the property value is nil, set *STOP to the position where the
7302 property value is non-nil (limiting by LIMIT), and return BUF. */
7303
7304static INLINE int *
7305handle_charset_annotation (pos, limit, coding, buf, stop)
7306 EMACS_INT pos, limit;
7307 struct coding_system *coding;
7308 int *buf;
7309 EMACS_INT *stop;
7310{
7311 Lisp_Object val, next;
7312 int id;
7313
7314 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7315 if (! NILP (val) && CHARSETP (val))
7316 id = XINT (CHARSET_SYMBOL_ID (val));
7317 else
7318 id = -1;
69a80ea3 7319 ADD_CHARSET_DATA (buf, 0, id);
ff0dacd7
KH
7320 next = Fnext_single_property_change (make_number (pos), Qcharset,
7321 coding->src_object,
7322 make_number (limit));
7323 *stop = XINT (next);
7324 return buf;
7325}
7326
7327
df7492f9 7328static void
09ee6fdd 7329consume_chars (coding, translation_table, max_lookup)
df7492f9 7330 struct coding_system *coding;
433f7f87 7331 Lisp_Object translation_table;
09ee6fdd 7332 int max_lookup;
df7492f9
KH
7333{
7334 int *buf = coding->charbuf;
ff0dacd7 7335 int *buf_end = coding->charbuf + coding->charbuf_size;
7c78e542 7336 const unsigned char *src = coding->source + coding->consumed;
4776e638 7337 const unsigned char *src_end = coding->source + coding->src_bytes;
ff0dacd7
KH
7338 EMACS_INT pos = coding->src_pos + coding->consumed_char;
7339 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
df7492f9
KH
7340 int multibytep = coding->src_multibyte;
7341 Lisp_Object eol_type;
7342 int c;
ff0dacd7 7343 EMACS_INT stop, stop_composition, stop_charset;
09ee6fdd 7344 int *lookup_buf = NULL;
433f7f87
KH
7345
7346 if (! NILP (translation_table))
09ee6fdd 7347 lookup_buf = alloca (sizeof (int) * max_lookup);
88993dfd 7348
0a9564cb 7349 eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
df7492f9
KH
7350 if (VECTORP (eol_type))
7351 eol_type = Qunix;
88993dfd 7352
df7492f9
KH
7353 /* Note: composition handling is not yet implemented. */
7354 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
ec6d2bb8 7355
0b5670c9
KH
7356 if (NILP (coding->src_object))
7357 stop = stop_composition = stop_charset = end_pos;
ff0dacd7 7358 else
0b5670c9
KH
7359 {
7360 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7361 stop = stop_composition = pos;
7362 else
7363 stop = stop_composition = end_pos;
7364 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7365 stop = stop_charset = pos;
7366 else
7367 stop_charset = end_pos;
7368 }
ec6d2bb8 7369
24a73b0a 7370 /* Compensate for CRLF and conversion. */
ff0dacd7 7371 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
df7492f9 7372 while (buf < buf_end)
aaaf0b1e 7373 {
433f7f87
KH
7374 Lisp_Object trans;
7375
df7492f9 7376 if (pos == stop)
ec6d2bb8 7377 {
df7492f9
KH
7378 if (pos == end_pos)
7379 break;
ff0dacd7
KH
7380 if (pos == stop_composition)
7381 buf = handle_composition_annotation (pos, end_pos, coding,
7382 buf, &stop_composition);
7383 if (pos == stop_charset)
7384 buf = handle_charset_annotation (pos, end_pos, coding,
7385 buf, &stop_charset);
7386 stop = (stop_composition < stop_charset
7387 ? stop_composition : stop_charset);
df7492f9
KH
7388 }
7389
7390 if (! multibytep)
4776e638 7391 {
d3e4cb56 7392 EMACS_INT bytes;
aaaf0b1e 7393
ea29edf2
KH
7394 if (coding->encoder == encode_coding_raw_text)
7395 c = *src++, pos++;
7396 else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
db274c7a 7397 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
4776e638 7398 else
f03caae0 7399 c = BYTE8_TO_CHAR (*src), src++, pos++;
4776e638 7400 }
df7492f9 7401 else
db274c7a 7402 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
df7492f9
KH
7403 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7404 c = '\n';
7405 if (! EQ (eol_type, Qunix))
aaaf0b1e 7406 {
df7492f9 7407 if (c == '\n')
aaaf0b1e 7408 {
df7492f9
KH
7409 if (EQ (eol_type, Qdos))
7410 *buf++ = '\r';
7411 else
7412 c = '\r';
aaaf0b1e
KH
7413 }
7414 }
433f7f87 7415
e6a54062 7416 trans = Qnil;
09ee6fdd 7417 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
e6a54062 7418 if (NILP (trans))
433f7f87
KH
7419 *buf++ = c;
7420 else
7421 {
7422 int from_nchars = 1, to_nchars = 1;
7423 int *lookup_buf_end;
7424 const unsigned char *p = src;
7425 int i;
7426
7427 lookup_buf[0] = c;
7428 for (i = 1; i < max_lookup && p < src_end; i++)
7429 lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7430 lookup_buf_end = lookup_buf + i;
e951386e
KH
7431 trans = get_translation (trans, lookup_buf, lookup_buf_end);
7432 if (INTEGERP (trans))
7433 c = XINT (trans);
7434 else if (CONSP (trans))
7435 {
7436 from_nchars = ASIZE (XCAR (trans));
7437 trans = XCDR (trans);
7438 if (INTEGERP (trans))
7439 c = XINT (trans);
7440 else
7441 {
7442 to_nchars = ASIZE (trans);
7443 if (buf + to_nchars > buf_end)
7444 break;
7445 c = XINT (AREF (trans, 0));
7446 }
7447 }
7448 else
433f7f87 7449 break;
e951386e 7450 *buf++ = c;
433f7f87
KH
7451 for (i = 1; i < to_nchars; i++)
7452 *buf++ = XINT (AREF (trans, i));
7453 for (i = 1; i < from_nchars; i++, pos++)
7454 src += MULTIBYTE_LENGTH_NO_CHECK (src);
7455 }
aaaf0b1e 7456 }
ec6d2bb8 7457
df7492f9
KH
7458 coding->consumed = src - coding->source;
7459 coding->consumed_char = pos - coding->src_pos;
7460 coding->charbuf_used = buf - coding->charbuf;
7461 coding->chars_at_source = 0;
aaaf0b1e
KH
7462}
7463
4ed46869 7464
df7492f9
KH
7465/* Encode the text at CODING->src_object into CODING->dst_object.
7466 CODING->src_object is a buffer or a string.
7467 CODING->dst_object is a buffer or nil.
7468
7469 If CODING->src_object is a buffer, it must be the current buffer.
7470 In this case, if CODING->src_pos is positive, it is a position of
7471 the source text in the buffer, otherwise. the source text is in the
7472 gap area of the buffer, and coding->src_pos specifies the offset of
7473 the text from GPT (which must be the same as PT). If this is the
7474 same buffer as CODING->dst_object, CODING->src_pos must be
7475 negative and CODING should not have `pre-write-conversion'.
7476
7477 If CODING->src_object is a string, CODING should not have
7478 `pre-write-conversion'.
7479
7480 If CODING->dst_object is a buffer, the encoded data is inserted at
7481 the current point of that buffer.
7482
7483 If CODING->dst_object is nil, the encoded data is placed at the
7484 memory area specified by CODING->destination. */
7485
7486static int
7487encode_coding (coding)
4ed46869 7488 struct coding_system *coding;
4ed46869 7489{
df7492f9 7490 Lisp_Object attrs;
7d64c6ad 7491 Lisp_Object translation_table;
09ee6fdd 7492 int max_lookup;
9861e777 7493
df7492f9 7494 attrs = CODING_ID_ATTRS (coding->id);
ea29edf2
KH
7495 if (coding->encoder == encode_coding_raw_text)
7496 translation_table = Qnil, max_lookup = 0;
7497 else
7498 translation_table = get_translation_table (attrs, 1, &max_lookup);
4ed46869 7499
df7492f9 7500 if (BUFFERP (coding->dst_object))
8844fa83 7501 {
df7492f9
KH
7502 set_buffer_internal (XBUFFER (coding->dst_object));
7503 coding->dst_multibyte
7504 = ! NILP (current_buffer->enable_multibyte_characters);
8844fa83 7505 }
4ed46869 7506
b73bfc1c 7507 coding->consumed = coding->consumed_char = 0;
df7492f9 7508 coding->produced = coding->produced_char = 0;
065e3595 7509 record_conversion_result (coding, CODING_RESULT_SUCCESS);
b73bfc1c 7510 coding->errors = 0;
b73bfc1c 7511
df7492f9 7512 ALLOC_CONVERSION_WORK_AREA (coding);
4ed46869 7513
df7492f9
KH
7514 do {
7515 coding_set_source (coding);
09ee6fdd 7516 consume_chars (coding, translation_table, max_lookup);
df7492f9
KH
7517 coding_set_destination (coding);
7518 (*(coding->encoder)) (coding);
7519 } while (coding->consumed_char < coding->src_chars);
7520
284201e4 7521 if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
df7492f9
KH
7522 insert_from_gap (coding->produced_char, coding->produced);
7523
7524 return (coding->result);
ec6d2bb8
KH
7525}
7526
fb88bf2d 7527
24a73b0a
KH
7528/* Name (or base name) of work buffer for code conversion. */
7529static Lisp_Object Vcode_conversion_workbuf_name;
d46c5b12 7530
24a73b0a
KH
7531/* A working buffer used by the top level conversion. Once it is
7532 created, it is never destroyed. It has the name
7533 Vcode_conversion_workbuf_name. The other working buffers are
7534 destroyed after the use is finished, and their names are modified
7535 versions of Vcode_conversion_workbuf_name. */
7536static Lisp_Object Vcode_conversion_reused_workbuf;
b73bfc1c 7537
24a73b0a
KH
7538/* Nonzero iff Vcode_conversion_reused_workbuf is already in use. */
7539static int reused_workbuf_in_use;
4ed46869 7540
24a73b0a
KH
7541
7542/* Return a working buffer of code convesion. MULTIBYTE specifies the
7543 multibyteness of returning buffer. */
b73bfc1c 7544
f6cbaf43 7545static Lisp_Object
24a73b0a 7546make_conversion_work_buffer (multibyte)
f6cbaf43 7547 int multibyte;
df7492f9 7548{
24a73b0a
KH
7549 Lisp_Object name, workbuf;
7550 struct buffer *current;
4ed46869 7551
24a73b0a 7552 if (reused_workbuf_in_use++)
065e3595
KH
7553 {
7554 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7555 workbuf = Fget_buffer_create (name);
7556 }
df7492f9 7557 else
065e3595 7558 {
159bd5a2 7559 if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
a993c7a1
KH
7560 Vcode_conversion_reused_workbuf
7561 = Fget_buffer_create (Vcode_conversion_workbuf_name);
7562 workbuf = Vcode_conversion_reused_workbuf;
065e3595 7563 }
24a73b0a
KH
7564 current = current_buffer;
7565 set_buffer_internal (XBUFFER (workbuf));
df36ff1f
CY
7566 /* We can't allow modification hooks to run in the work buffer. For
7567 instance, directory_files_internal assumes that file decoding
7568 doesn't compile new regexps. */
7569 Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
3ed051d4 7570 Ferase_buffer ();
df7492f9 7571 current_buffer->undo_list = Qt;
24a73b0a 7572 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
df7492f9 7573 set_buffer_internal (current);
24a73b0a 7574 return workbuf;
df7492f9 7575}
d46c5b12 7576
24a73b0a 7577
4776e638 7578static Lisp_Object
24a73b0a
KH
7579code_conversion_restore (arg)
7580 Lisp_Object arg;
4776e638 7581{
24a73b0a 7582 Lisp_Object current, workbuf;
948bdcf3 7583 struct gcpro gcpro1;
24a73b0a 7584
948bdcf3 7585 GCPRO1 (arg);
24a73b0a
KH
7586 current = XCAR (arg);
7587 workbuf = XCDR (arg);
7588 if (! NILP (workbuf))
7589 {
7590 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7591 reused_workbuf_in_use = 0;
7592 else if (! NILP (Fbuffer_live_p (workbuf)))
7593 Fkill_buffer (workbuf);
7594 }
7595 set_buffer_internal (XBUFFER (current));
948bdcf3 7596 UNGCPRO;
4776e638
KH
7597 return Qnil;
7598}
b73bfc1c 7599
24a73b0a
KH
7600Lisp_Object
7601code_conversion_save (with_work_buf, multibyte)
4776e638 7602 int with_work_buf, multibyte;
df7492f9 7603{
24a73b0a 7604 Lisp_Object workbuf = Qnil;
b73bfc1c 7605
4776e638 7606 if (with_work_buf)
24a73b0a
KH
7607 workbuf = make_conversion_work_buffer (multibyte);
7608 record_unwind_protect (code_conversion_restore,
7609 Fcons (Fcurrent_buffer (), workbuf));
4776e638 7610 return workbuf;
df7492f9 7611}
d46c5b12 7612
df7492f9
KH
7613int
7614decode_coding_gap (coding, chars, bytes)
7615 struct coding_system *coding;
7616 EMACS_INT chars, bytes;
7617{
7618 int count = specpdl_ptr - specpdl;
5e5c78be 7619 Lisp_Object attrs;
fb88bf2d 7620
24a73b0a 7621 code_conversion_save (0, 0);
ec6d2bb8 7622
24a73b0a 7623 coding->src_object = Fcurrent_buffer ();
df7492f9
KH
7624 coding->src_chars = chars;
7625 coding->src_bytes = bytes;
7626 coding->src_pos = -chars;
7627 coding->src_pos_byte = -bytes;
7628 coding->src_multibyte = chars < bytes;
24a73b0a 7629 coding->dst_object = coding->src_object;
df7492f9
KH
7630 coding->dst_pos = PT;
7631 coding->dst_pos_byte = PT_BYTE;
71c81426 7632 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
4ed46869 7633
df7492f9
KH
7634 if (CODING_REQUIRE_DETECTION (coding))
7635 detect_coding (coding);
8f924df7 7636
9286b333 7637 coding->mode |= CODING_MODE_LAST_BLOCK;
287c57d7 7638 current_buffer->text->inhibit_shrinking = 1;
df7492f9 7639 decode_coding (coding);
287c57d7 7640 current_buffer->text->inhibit_shrinking = 0;
d46c5b12 7641
5e5c78be
KH
7642 attrs = CODING_ID_ATTRS (coding->id);
7643 if (! NILP (CODING_ATTR_POST_READ (attrs)))
b73bfc1c 7644 {
5e5c78be
KH
7645 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7646 Lisp_Object val;
7647
7648 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
5e5c78be
KH
7649 val = call1 (CODING_ATTR_POST_READ (attrs),
7650 make_number (coding->produced_char));
5e5c78be
KH
7651 CHECK_NATNUM (val);
7652 coding->produced_char += Z - prev_Z;
7653 coding->produced += Z_BYTE - prev_Z_BYTE;
b73bfc1c 7654 }
4ed46869 7655
df7492f9 7656 unbind_to (count, Qnil);
b73bfc1c
KH
7657 return coding->result;
7658}
52d41803 7659
4ed46869 7660int
df7492f9 7661encode_coding_gap (coding, chars, bytes)
4ed46869 7662 struct coding_system *coding;
df7492f9 7663 EMACS_INT chars, bytes;
4ed46869 7664{
df7492f9 7665 int count = specpdl_ptr - specpdl;
4ed46869 7666
24a73b0a 7667 code_conversion_save (0, 0);
4ed46869 7668
24a73b0a 7669 coding->src_object = Fcurrent_buffer ();
df7492f9
KH
7670 coding->src_chars = chars;
7671 coding->src_bytes = bytes;
7672 coding->src_pos = -chars;
7673 coding->src_pos_byte = -bytes;
7674 coding->src_multibyte = chars < bytes;
7675 coding->dst_object = coding->src_object;
7676 coding->dst_pos = PT;
7677 coding->dst_pos_byte = PT_BYTE;
4ed46869 7678
df7492f9 7679 encode_coding (coding);
b73bfc1c 7680
df7492f9
KH
7681 unbind_to (count, Qnil);
7682 return coding->result;
7683}
4ed46869 7684
d46c5b12 7685
df7492f9
KH
7686/* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7687 SRC_OBJECT into DST_OBJECT by coding context CODING.
b73bfc1c 7688
df7492f9 7689 SRC_OBJECT is a buffer, a string, or Qnil.
b73bfc1c 7690
df7492f9
KH
7691 If it is a buffer, the text is at point of the buffer. FROM and TO
7692 are positions in the buffer.
b73bfc1c 7693
df7492f9
KH
7694 If it is a string, the text is at the beginning of the string.
7695 FROM and TO are indices to the string.
4ed46869 7696
df7492f9
KH
7697 If it is nil, the text is at coding->source. FROM and TO are
7698 indices to coding->source.
bb10be8b 7699
df7492f9 7700 DST_OBJECT is a buffer, Qt, or Qnil.
4ed46869 7701
df7492f9
KH
7702 If it is a buffer, the decoded text is inserted at point of the
7703 buffer. If the buffer is the same as SRC_OBJECT, the source text
7704 is deleted.
4ed46869 7705
df7492f9
KH
7706 If it is Qt, a string is made from the decoded text, and
7707 set in CODING->dst_object.
d46c5b12 7708
df7492f9 7709 If it is Qnil, the decoded text is stored at CODING->destination.
2cb26057 7710 The caller must allocate CODING->dst_bytes bytes at
df7492f9
KH
7711 CODING->destination by xmalloc. If the decoded text is longer than
7712 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7713 */
d46c5b12 7714
df7492f9
KH
7715void
7716decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7717 dst_object)
d46c5b12 7718 struct coding_system *coding;
df7492f9
KH
7719 Lisp_Object src_object;
7720 EMACS_INT from, from_byte, to, to_byte;
7721 Lisp_Object dst_object;
d46c5b12 7722{
df7492f9
KH
7723 int count = specpdl_ptr - specpdl;
7724 unsigned char *destination;
7725 EMACS_INT dst_bytes;
7726 EMACS_INT chars = to - from;
7727 EMACS_INT bytes = to_byte - from_byte;
7728 Lisp_Object attrs;
4776e638 7729 int saved_pt = -1, saved_pt_byte;
64cedb0c 7730 int need_marker_adjustment = 0;
b3bfad50 7731 Lisp_Object old_deactivate_mark;
d46c5b12 7732
b3bfad50 7733 old_deactivate_mark = Vdeactivate_mark;
93dec019 7734
df7492f9 7735 if (NILP (dst_object))
d46c5b12 7736 {
df7492f9
KH
7737 destination = coding->destination;
7738 dst_bytes = coding->dst_bytes;
d46c5b12 7739 }
93dec019 7740
df7492f9
KH
7741 coding->src_object = src_object;
7742 coding->src_chars = chars;
7743 coding->src_bytes = bytes;
7744 coding->src_multibyte = chars < bytes;
70ad9fc4 7745
df7492f9 7746 if (STRINGP (src_object))
d46c5b12 7747 {
df7492f9
KH
7748 coding->src_pos = from;
7749 coding->src_pos_byte = from_byte;
d46c5b12 7750 }
df7492f9 7751 else if (BUFFERP (src_object))
88993dfd 7752 {
df7492f9
KH
7753 set_buffer_internal (XBUFFER (src_object));
7754 if (from != GPT)
7755 move_gap_both (from, from_byte);
7756 if (EQ (src_object, dst_object))
fb88bf2d 7757 {
64cedb0c
KH
7758 struct Lisp_Marker *tail;
7759
7760 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7761 {
7762 tail->need_adjustment
7763 = tail->charpos == (tail->insertion_type ? from : to);
7764 need_marker_adjustment |= tail->need_adjustment;
7765 }
4776e638 7766 saved_pt = PT, saved_pt_byte = PT_BYTE;
df7492f9 7767 TEMP_SET_PT_BOTH (from, from_byte);
f4a3cc44 7768 current_buffer->text->inhibit_shrinking = 1;
df7492f9
KH
7769 del_range_both (from, from_byte, to, to_byte, 1);
7770 coding->src_pos = -chars;
7771 coding->src_pos_byte = -bytes;
fb88bf2d 7772 }
df7492f9 7773 else
fb88bf2d 7774 {
df7492f9
KH
7775 coding->src_pos = from;
7776 coding->src_pos_byte = from_byte;
fb88bf2d 7777 }
88993dfd
KH
7778 }
7779
df7492f9
KH
7780 if (CODING_REQUIRE_DETECTION (coding))
7781 detect_coding (coding);
7782 attrs = CODING_ID_ATTRS (coding->id);
d46c5b12 7783
2cb26057
KH
7784 if (EQ (dst_object, Qt)
7785 || (! NILP (CODING_ATTR_POST_READ (attrs))
7786 && NILP (dst_object)))
b73bfc1c 7787 {
a1567c45
SM
7788 coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7789 coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
df7492f9
KH
7790 coding->dst_pos = BEG;
7791 coding->dst_pos_byte = BEG_BYTE;
b73bfc1c 7792 }
df7492f9 7793 else if (BUFFERP (dst_object))
d46c5b12 7794 {
24a73b0a 7795 code_conversion_save (0, 0);
df7492f9
KH
7796 coding->dst_object = dst_object;
7797 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7798 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7799 coding->dst_multibyte
7800 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
d46c5b12
KH
7801 }
7802 else
7803 {
24a73b0a 7804 code_conversion_save (0, 0);
df7492f9 7805 coding->dst_object = Qnil;
0154725e
SM
7806 /* Most callers presume this will return a multibyte result, and they
7807 won't use `binary' or `raw-text' anyway, so let's not worry about
7808 CODING_FOR_UNIBYTE. */
bb555731 7809 coding->dst_multibyte = 1;
d46c5b12
KH
7810 }
7811
df7492f9 7812 decode_coding (coding);
fa46990e 7813
df7492f9
KH
7814 if (BUFFERP (coding->dst_object))
7815 set_buffer_internal (XBUFFER (coding->dst_object));
d46c5b12 7816
df7492f9 7817 if (! NILP (CODING_ATTR_POST_READ (attrs)))
d46c5b12 7818 {
b3bfad50 7819 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
df7492f9 7820 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
2b4f9037 7821 Lisp_Object val;
d46c5b12 7822
c0cc7f7f 7823 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
b3bfad50
KH
7824 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7825 old_deactivate_mark);
d4850d67
KH
7826 val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7827 make_number (coding->produced_char));
df7492f9
KH
7828 UNGCPRO;
7829 CHECK_NATNUM (val);
7830 coding->produced_char += Z - prev_Z;
7831 coding->produced += Z_BYTE - prev_Z_BYTE;
d46c5b12 7832 }
de79a6a5 7833
df7492f9 7834 if (EQ (dst_object, Qt))
ec6d2bb8 7835 {
df7492f9
KH
7836 coding->dst_object = Fbuffer_string ();
7837 }
7838 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7839 {
7840 set_buffer_internal (XBUFFER (coding->dst_object));
7841 if (dst_bytes < coding->produced)
7842 {
b3bfad50 7843 destination = xrealloc (destination, coding->produced);
df7492f9
KH
7844 if (! destination)
7845 {
065e3595
KH
7846 record_conversion_result (coding,
7847 CODING_RESULT_INSUFFICIENT_DST);
df7492f9
KH
7848 unbind_to (count, Qnil);
7849 return;
7850 }
7851 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7852 move_gap_both (BEGV, BEGV_BYTE);
7853 bcopy (BEGV_ADDR, destination, coding->produced);
7854 coding->destination = destination;
d46c5b12 7855 }
ec6d2bb8 7856 }
b73bfc1c 7857
4776e638
KH
7858 if (saved_pt >= 0)
7859 {
7860 /* This is the case of:
7861 (BUFFERP (src_object) && EQ (src_object, dst_object))
7862 As we have moved PT while replacing the original buffer
7863 contents, we must recover it now. */
7864 set_buffer_internal (XBUFFER (src_object));
f4a3cc44 7865 current_buffer->text->inhibit_shrinking = 0;
4776e638
KH
7866 if (saved_pt < from)
7867 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7868 else if (saved_pt < from + chars)
7869 TEMP_SET_PT_BOTH (from, from_byte);
7870 else if (! NILP (current_buffer->enable_multibyte_characters))
7871 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7872 saved_pt_byte + (coding->produced - bytes));
7873 else
7874 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7875 saved_pt_byte + (coding->produced - bytes));
64cedb0c
KH
7876
7877 if (need_marker_adjustment)
7878 {
7879 struct Lisp_Marker *tail;
7880
7881 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7882 if (tail->need_adjustment)
7883 {
7884 tail->need_adjustment = 0;
7885 if (tail->insertion_type)
7886 {
7887 tail->bytepos = from_byte;
7888 tail->charpos = from;
7889 }
7890 else
7891 {
7892 tail->bytepos = from_byte + coding->produced;
7893 tail->charpos
7894 = (NILP (current_buffer->enable_multibyte_characters)
7895 ? tail->bytepos : from + coding->produced_char);
7896 }
7897 }
7898 }
d46c5b12 7899 }
4776e638 7900
b3bfad50 7901 Vdeactivate_mark = old_deactivate_mark;
065e3595 7902 unbind_to (count, coding->dst_object);
d46c5b12
KH
7903}
7904
d46c5b12 7905
df7492f9
KH
7906void
7907encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7908 dst_object)
d46c5b12 7909 struct coding_system *coding;
df7492f9
KH
7910 Lisp_Object src_object;
7911 EMACS_INT from, from_byte, to, to_byte;
7912 Lisp_Object dst_object;
d46c5b12 7913{
b73bfc1c 7914 int count = specpdl_ptr - specpdl;
df7492f9
KH
7915 EMACS_INT chars = to - from;
7916 EMACS_INT bytes = to_byte - from_byte;
7917 Lisp_Object attrs;
4776e638 7918 int saved_pt = -1, saved_pt_byte;
64cedb0c 7919 int need_marker_adjustment = 0;
c02d943b 7920 int kill_src_buffer = 0;
b3bfad50 7921 Lisp_Object old_deactivate_mark;
df7492f9 7922
b3bfad50 7923 old_deactivate_mark = Vdeactivate_mark;
df7492f9
KH
7924
7925 coding->src_object = src_object;
7926 coding->src_chars = chars;
7927 coding->src_bytes = bytes;
7928 coding->src_multibyte = chars < bytes;
7929
7930 attrs = CODING_ID_ATTRS (coding->id);
7931
64cedb0c
KH
7932 if (EQ (src_object, dst_object))
7933 {
7934 struct Lisp_Marker *tail;
7935
7936 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7937 {
7938 tail->need_adjustment
7939 = tail->charpos == (tail->insertion_type ? from : to);
7940 need_marker_adjustment |= tail->need_adjustment;
7941 }
7942 }
7943
df7492f9 7944 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6bac5b12 7945 {
24a73b0a 7946 coding->src_object = code_conversion_save (1, coding->src_multibyte);
df7492f9
KH
7947 set_buffer_internal (XBUFFER (coding->src_object));
7948 if (STRINGP (src_object))
7949 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7950 else if (BUFFERP (src_object))
7951 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7952 else
7953 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
d46c5b12 7954
df7492f9
KH
7955 if (EQ (src_object, dst_object))
7956 {
7957 set_buffer_internal (XBUFFER (src_object));
4776e638 7958 saved_pt = PT, saved_pt_byte = PT_BYTE;
df7492f9
KH
7959 del_range_both (from, from_byte, to, to_byte, 1);
7960 set_buffer_internal (XBUFFER (coding->src_object));
7961 }
7962
d4850d67
KH
7963 {
7964 Lisp_Object args[3];
b3bfad50 7965 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
d4850d67 7966
b3bfad50
KH
7967 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7968 old_deactivate_mark);
d4850d67
KH
7969 args[0] = CODING_ATTR_PRE_WRITE (attrs);
7970 args[1] = make_number (BEG);
7971 args[2] = make_number (Z);
7972 safe_call (3, args);
b3bfad50 7973 UNGCPRO;
d4850d67 7974 }
c02d943b
KH
7975 if (XBUFFER (coding->src_object) != current_buffer)
7976 kill_src_buffer = 1;
ac87bbef 7977 coding->src_object = Fcurrent_buffer ();
df7492f9
KH
7978 if (BEG != GPT)
7979 move_gap_both (BEG, BEG_BYTE);
7980 coding->src_chars = Z - BEG;
7981 coding->src_bytes = Z_BYTE - BEG_BYTE;
7982 coding->src_pos = BEG;
7983 coding->src_pos_byte = BEG_BYTE;
7984 coding->src_multibyte = Z < Z_BYTE;
7985 }
7986 else if (STRINGP (src_object))
d46c5b12 7987 {
24a73b0a 7988 code_conversion_save (0, 0);
df7492f9
KH
7989 coding->src_pos = from;
7990 coding->src_pos_byte = from_byte;
b73bfc1c 7991 }
df7492f9 7992 else if (BUFFERP (src_object))
b73bfc1c 7993 {
24a73b0a 7994 code_conversion_save (0, 0);
df7492f9 7995 set_buffer_internal (XBUFFER (src_object));
df7492f9 7996 if (EQ (src_object, dst_object))
d46c5b12 7997 {
4776e638 7998 saved_pt = PT, saved_pt_byte = PT_BYTE;
ff0dacd7
KH
7999 coding->src_object = del_range_1 (from, to, 1, 1);
8000 coding->src_pos = 0;
8001 coding->src_pos_byte = 0;
d46c5b12 8002 }
df7492f9 8003 else
d46c5b12 8004 {
ff0dacd7
KH
8005 if (from < GPT && to >= GPT)
8006 move_gap_both (from, from_byte);
df7492f9
KH
8007 coding->src_pos = from;
8008 coding->src_pos_byte = from_byte;
d46c5b12 8009 }
d46c5b12 8010 }
4776e638 8011 else
24a73b0a 8012 code_conversion_save (0, 0);
d46c5b12 8013
df7492f9 8014 if (BUFFERP (dst_object))
88993dfd 8015 {
df7492f9 8016 coding->dst_object = dst_object;
28f67a95
KH
8017 if (EQ (src_object, dst_object))
8018 {
8019 coding->dst_pos = from;
8020 coding->dst_pos_byte = from_byte;
8021 }
8022 else
8023 {
319a3947
KH
8024 struct buffer *current = current_buffer;
8025
8026 set_buffer_temp (XBUFFER (dst_object));
8027 coding->dst_pos = PT;
8028 coding->dst_pos_byte = PT_BYTE;
8029 move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8030 set_buffer_temp (current);
28f67a95 8031 }
df7492f9
KH
8032 coding->dst_multibyte
8033 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
88993dfd 8034 }
df7492f9 8035 else if (EQ (dst_object, Qt))
d46c5b12 8036 {
df7492f9 8037 coding->dst_object = Qnil;
df7492f9 8038 coding->dst_bytes = coding->src_chars;
ac87bbef
KH
8039 if (coding->dst_bytes == 0)
8040 coding->dst_bytes = 1;
8041 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
df7492f9 8042 coding->dst_multibyte = 0;
d46c5b12
KH
8043 }
8044 else
8045 {
df7492f9
KH
8046 coding->dst_object = Qnil;
8047 coding->dst_multibyte = 0;
d46c5b12
KH
8048 }
8049
df7492f9 8050 encode_coding (coding);
d46c5b12 8051
df7492f9 8052 if (EQ (dst_object, Qt))
d46c5b12 8053 {
df7492f9
KH
8054 if (BUFFERP (coding->dst_object))
8055 coding->dst_object = Fbuffer_string ();
8056 else
d46c5b12 8057 {
df7492f9
KH
8058 coding->dst_object
8059 = make_unibyte_string ((char *) coding->destination,
8060 coding->produced);
8061 xfree (coding->destination);
d46c5b12 8062 }
4ed46869 8063 }
d46c5b12 8064
4776e638
KH
8065 if (saved_pt >= 0)
8066 {
8067 /* This is the case of:
8068 (BUFFERP (src_object) && EQ (src_object, dst_object))
8069 As we have moved PT while replacing the original buffer
8070 contents, we must recover it now. */
8071 set_buffer_internal (XBUFFER (src_object));
8072 if (saved_pt < from)
8073 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8074 else if (saved_pt < from + chars)
8075 TEMP_SET_PT_BOTH (from, from_byte);
8076 else if (! NILP (current_buffer->enable_multibyte_characters))
8077 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8078 saved_pt_byte + (coding->produced - bytes));
d46c5b12 8079 else
4776e638
KH
8080 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8081 saved_pt_byte + (coding->produced - bytes));
64cedb0c
KH
8082
8083 if (need_marker_adjustment)
8084 {
8085 struct Lisp_Marker *tail;
8086
8087 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8088 if (tail->need_adjustment)
8089 {
8090 tail->need_adjustment = 0;
8091 if (tail->insertion_type)
8092 {
8093 tail->bytepos = from_byte;
8094 tail->charpos = from;
8095 }
8096 else
8097 {
8098 tail->bytepos = from_byte + coding->produced;
8099 tail->charpos
8100 = (NILP (current_buffer->enable_multibyte_characters)
8101 ? tail->bytepos : from + coding->produced_char);
8102 }
8103 }
8104 }
4776e638
KH
8105 }
8106
c02d943b
KH
8107 if (kill_src_buffer)
8108 Fkill_buffer (coding->src_object);
b3bfad50
KH
8109
8110 Vdeactivate_mark = old_deactivate_mark;
df7492f9 8111 unbind_to (count, Qnil);
b73bfc1c
KH
8112}
8113
df7492f9 8114
b73bfc1c 8115Lisp_Object
df7492f9 8116preferred_coding_system ()
b73bfc1c 8117{
df7492f9 8118 int id = coding_categories[coding_priorities[0]].id;
2391eaa4 8119
df7492f9 8120 return CODING_ID_NAME (id);
4ed46869
KH
8121}
8122
8123\f
8124#ifdef emacs
1397dc18 8125/*** 11. Emacs Lisp library functions ***/
4ed46869 8126
4ed46869 8127DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
48b0f3ae 8128 doc: /* Return t if OBJECT is nil or a coding-system.
df7492f9 8129See the documentation of `define-coding-system' for information
48b0f3ae 8130about coding-system objects. */)
d4a1d553
JB
8131 (object)
8132 Lisp_Object object;
4ed46869 8133{
d4a1d553
JB
8134 if (NILP (object)
8135 || CODING_SYSTEM_ID (object) >= 0)
44e8490d 8136 return Qt;
d4a1d553
JB
8137 if (! SYMBOLP (object)
8138 || NILP (Fget (object, Qcoding_system_define_form)))
44e8490d
KH
8139 return Qnil;
8140 return Qt;
4ed46869
KH
8141}
8142
9d991de8
RS
8143DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8144 Sread_non_nil_coding_system, 1, 1, 0,
48b0f3ae
PJ
8145 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
8146 (prompt)
4ed46869
KH
8147 Lisp_Object prompt;
8148{
e0e989f6 8149 Lisp_Object val;
9d991de8
RS
8150 do
8151 {
4608c386
KH
8152 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8153 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
9d991de8 8154 }
8f924df7 8155 while (SCHARS (val) == 0);
e0e989f6 8156 return (Fintern (val, Qnil));
4ed46869
KH
8157}
8158
9b787f3e 8159DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
48b0f3ae 8160 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
c7183fb8
GM
8161If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8162Ignores case when completing coding systems (all Emacs coding systems
8163are lower-case). */)
48b0f3ae 8164 (prompt, default_coding_system)
9b787f3e 8165 Lisp_Object prompt, default_coding_system;
4ed46869 8166{
f44d27ce 8167 Lisp_Object val;
c7183fb8
GM
8168 int count = SPECPDL_INDEX ();
8169
9b787f3e 8170 if (SYMBOLP (default_coding_system))
57d25e6f 8171 default_coding_system = SYMBOL_NAME (default_coding_system);
c7183fb8 8172 specbind (Qcompletion_ignore_case, Qt);
4608c386 8173 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
9b787f3e
RS
8174 Qt, Qnil, Qcoding_system_history,
8175 default_coding_system, Qnil);
c7183fb8 8176 unbind_to (count, Qnil);
8f924df7 8177 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
8178}
8179
8180DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8181 1, 1, 0,
48b0f3ae 8182 doc: /* Check validity of CODING-SYSTEM.
9ffd559c
KH
8183If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8184It is valid if it is nil or a symbol defined as a coding system by the
8185function `define-coding-system'. */)
df7492f9 8186 (coding_system)
4ed46869
KH
8187 Lisp_Object coding_system;
8188{
44e8490d
KH
8189 Lisp_Object define_form;
8190
8191 define_form = Fget (coding_system, Qcoding_system_define_form);
8192 if (! NILP (define_form))
8193 {
8194 Fput (coding_system, Qcoding_system_define_form, Qnil);
8195 safe_eval (define_form);
8196 }
4ed46869
KH
8197 if (!NILP (Fcoding_system_p (coding_system)))
8198 return coding_system;
fcad4ec4 8199 xsignal1 (Qcoding_system_error, coding_system);
4ed46869 8200}
df7492f9 8201
3a73fa5d 8202\f
89528eb3
KH
8203/* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
8204 HIGHEST is nonzero, return the coding system of the highest
8205 priority among the detected coding systems. Otherwise return a
8206 list of detected coding systems sorted by their priorities. If
8207 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
8208 multibyte form but contains only ASCII and eight-bit chars.
8209 Otherwise, the bytes are raw bytes.
8210
8211 CODING-SYSTEM controls the detection as below:
8212
8213 If it is nil, detect both text-format and eol-format. If the
8214 text-format part of CODING-SYSTEM is already specified
8215 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
8216 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8217 detect only text-format. */
8218
d46c5b12 8219Lisp_Object
24a73b0a
KH
8220detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
8221 coding_system)
8f924df7 8222 const unsigned char *src;
13818c30
SM
8223 EMACS_INT src_chars, src_bytes;
8224 int highest;
0a28aafb 8225 int multibytep;
df7492f9 8226 Lisp_Object coding_system;
4ed46869 8227{
8f924df7 8228 const unsigned char *src_end = src + src_bytes;
df7492f9 8229 Lisp_Object attrs, eol_type;
4533845d 8230 Lisp_Object val = Qnil;
df7492f9 8231 struct coding_system coding;
89528eb3 8232 int id;
ff0dacd7 8233 struct coding_detection_info detect_info;
24a73b0a 8234 enum coding_category base_category;
2f3cbb32 8235 int null_byte_found = 0, eight_bit_found = 0;
b73bfc1c 8236
df7492f9
KH
8237 if (NILP (coding_system))
8238 coding_system = Qundecided;
8239 setup_coding_system (coding_system, &coding);
8240 attrs = CODING_ID_ATTRS (coding.id);
8241 eol_type = CODING_ID_EOL_TYPE (coding.id);
89528eb3 8242 coding_system = CODING_ATTR_BASE_NAME (attrs);
4ed46869 8243
df7492f9 8244 coding.source = src;
24a73b0a 8245 coding.src_chars = src_chars;
df7492f9
KH
8246 coding.src_bytes = src_bytes;
8247 coding.src_multibyte = multibytep;
8248 coding.consumed = 0;
89528eb3 8249 coding.mode |= CODING_MODE_LAST_BLOCK;
c0e16b14 8250 coding.head_ascii = 0;
d46c5b12 8251
ff0dacd7 8252 detect_info.checked = detect_info.found = detect_info.rejected = 0;
d46c5b12 8253
89528eb3 8254 /* At first, detect text-format if necessary. */
24a73b0a
KH
8255 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8256 if (base_category == coding_category_undecided)
4ed46869 8257 {
ff0dacd7
KH
8258 enum coding_category category;
8259 struct coding_system *this;
8260 int c, i;
88993dfd 8261
24a73b0a 8262 /* Skip all ASCII bytes except for a few ISO2022 controls. */
2f3cbb32 8263 for (; src < src_end; src++)
4ed46869 8264 {
df7492f9 8265 c = *src;
6cb21a4f 8266 if (c & 0x80)
6cb21a4f 8267 {
2f3cbb32 8268 eight_bit_found = 1;
2f3cbb32
KH
8269 if (null_byte_found)
8270 break;
8271 }
c0e16b14 8272 else if (c < 0x20)
2f3cbb32
KH
8273 {
8274 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8275 && ! inhibit_iso_escape_detection
8276 && ! detect_info.checked)
6cb21a4f 8277 {
2f3cbb32
KH
8278 if (detect_coding_iso_2022 (&coding, &detect_info))
8279 {
8280 /* We have scanned the whole data. */
8281 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
c0e16b14
KH
8282 {
8283 /* We didn't find an 8-bit code. We may
8284 have found a null-byte, but it's very
8285 rare that a binary file conforms to
8286 ISO-2022. */
8287 src = src_end;
8288 coding.head_ascii = src - coding.source;
8289 }
8290 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
2f3cbb32
KH
8291 break;
8292 }
8293 }
97b1b294 8294 else if (! c && !inhibit_null_byte_detection)
2f3cbb32
KH
8295 {
8296 null_byte_found = 1;
8297 if (eight_bit_found)
8298 break;
6cb21a4f 8299 }
c006c0c8
KH
8300 if (! eight_bit_found)
8301 coding.head_ascii++;
6cb21a4f 8302 }
c006c0c8 8303 else if (! eight_bit_found)
c0e16b14 8304 coding.head_ascii++;
4ed46869 8305 }
88993dfd 8306
2f3cbb32
KH
8307 if (null_byte_found || eight_bit_found
8308 || coding.head_ascii < coding.src_bytes
6cb21a4f
KH
8309 || detect_info.found)
8310 {
2f3cbb32 8311 if (coding.head_ascii == coding.src_bytes)
6cb21a4f
KH
8312 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
8313 for (i = 0; i < coding_category_raw_text; i++)
ff0dacd7 8314 {
6cb21a4f 8315 category = coding_priorities[i];
c7266f4a 8316 this = coding_categories + category;
6cb21a4f 8317 if (detect_info.found & (1 << category))
ff0dacd7
KH
8318 break;
8319 }
6cb21a4f 8320 else
2f3cbb32
KH
8321 {
8322 if (null_byte_found)
8323 {
8324 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8325 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8326 }
8327 for (i = 0; i < coding_category_raw_text; i++)
8328 {
8329 category = coding_priorities[i];
8330 this = coding_categories + category;
6cb21a4f 8331
2f3cbb32
KH
8332 if (this->id < 0)
8333 {
8334 /* No coding system of this category is defined. */
8335 detect_info.rejected |= (1 << category);
8336 }
8337 else if (category >= coding_category_raw_text)
8338 continue;
8339 else if (detect_info.checked & (1 << category))
8340 {
8341 if (highest
8342 && (detect_info.found & (1 << category)))
6cb21a4f 8343 break;
2f3cbb32
KH
8344 }
8345 else if ((*(this->detector)) (&coding, &detect_info)
8346 && highest
8347 && (detect_info.found & (1 << category)))
8348 {
8349 if (category == coding_category_utf_16_auto)
8350 {
8351 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8352 category = coding_category_utf_16_le;
8353 else
8354 category = coding_category_utf_16_be;
8355 }
8356 break;
8357 }
8358 }
8359 }
6cb21a4f 8360 }
ec6d2bb8 8361
4cddb209
KH
8362 if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8363 || null_byte_found)
ec6d2bb8 8364 {
ff0dacd7 8365 detect_info.found = CATEGORY_MASK_RAW_TEXT;
4cddb209 8366 id = CODING_SYSTEM_ID (Qno_conversion);
89528eb3
KH
8367 val = Fcons (make_number (id), Qnil);
8368 }
ff0dacd7 8369 else if (! detect_info.rejected && ! detect_info.found)
89528eb3 8370 {
ff0dacd7 8371 detect_info.found = CATEGORY_MASK_ANY;
89528eb3
KH
8372 id = coding_categories[coding_category_undecided].id;
8373 val = Fcons (make_number (id), Qnil);
8374 }
8375 else if (highest)
8376 {
ff0dacd7 8377 if (detect_info.found)
ec6d2bb8 8378 {
ff0dacd7
KH
8379 detect_info.found = 1 << category;
8380 val = Fcons (make_number (this->id), Qnil);
8381 }
8382 else
8383 for (i = 0; i < coding_category_raw_text; i++)
8384 if (! (detect_info.rejected & (1 << coding_priorities[i])))
8385 {
8386 detect_info.found = 1 << coding_priorities[i];
8387 id = coding_categories[coding_priorities[i]].id;
8388 val = Fcons (make_number (id), Qnil);
8389 break;
8390 }
8391 }
89528eb3
KH
8392 else
8393 {
ff0dacd7
KH
8394 int mask = detect_info.rejected | detect_info.found;
8395 int found = 0;
ec6d2bb8 8396
89528eb3 8397 for (i = coding_category_raw_text - 1; i >= 0; i--)
ff0dacd7
KH
8398 {
8399 category = coding_priorities[i];
8400 if (! (mask & (1 << category)))
ec6d2bb8 8401 {
ff0dacd7
KH
8402 found |= 1 << category;
8403 id = coding_categories[category].id;
c7266f4a
KH
8404 if (id >= 0)
8405 val = Fcons (make_number (id), val);
ff0dacd7
KH
8406 }
8407 }
8408 for (i = coding_category_raw_text - 1; i >= 0; i--)
8409 {
8410 category = coding_priorities[i];
8411 if (detect_info.found & (1 << category))
8412 {
8413 id = coding_categories[category].id;
8414 val = Fcons (make_number (id), val);
ec6d2bb8 8415 }
ec6d2bb8 8416 }
ff0dacd7 8417 detect_info.found |= found;
ec6d2bb8 8418 }
ec6d2bb8 8419 }
a470d443
KH
8420 else if (base_category == coding_category_utf_8_auto)
8421 {
8422 if (detect_coding_utf_8 (&coding, &detect_info))
8423 {
8424 struct coding_system *this;
8425
8426 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8427 this = coding_categories + coding_category_utf_8_sig;
8428 else
8429 this = coding_categories + coding_category_utf_8_nosig;
8430 val = Fcons (make_number (this->id), Qnil);
8431 }
8432 }
24a73b0a
KH
8433 else if (base_category == coding_category_utf_16_auto)
8434 {
8435 if (detect_coding_utf_16 (&coding, &detect_info))
8436 {
24a73b0a
KH
8437 struct coding_system *this;
8438
8439 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8440 this = coding_categories + coding_category_utf_16_le;
8441 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8442 this = coding_categories + coding_category_utf_16_be;
8443 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8444 this = coding_categories + coding_category_utf_16_be_nosig;
8445 else
8446 this = coding_categories + coding_category_utf_16_le_nosig;
8447 val = Fcons (make_number (this->id), Qnil);
8448 }
8449 }
df7492f9
KH
8450 else
8451 {
ff0dacd7 8452 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
89528eb3 8453 val = Fcons (make_number (coding.id), Qnil);
4ed46869 8454 }
df7492f9 8455
89528eb3 8456 /* Then, detect eol-format if necessary. */
df7492f9 8457 {
4533845d 8458 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
df7492f9
KH
8459 Lisp_Object tail;
8460
89528eb3
KH
8461 if (VECTORP (eol_type))
8462 {
ff0dacd7 8463 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
2f3cbb32
KH
8464 {
8465 if (null_byte_found)
8466 normal_eol = EOL_SEEN_LF;
8467 else
8468 normal_eol = detect_eol (coding.source, src_bytes,
8469 coding_category_raw_text);
8470 }
ff0dacd7
KH
8471 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8472 | CATEGORY_MASK_UTF_16_BE_NOSIG))
89528eb3
KH
8473 utf_16_be_eol = detect_eol (coding.source, src_bytes,
8474 coding_category_utf_16_be);
ff0dacd7
KH
8475 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8476 | CATEGORY_MASK_UTF_16_LE_NOSIG))
89528eb3
KH
8477 utf_16_le_eol = detect_eol (coding.source, src_bytes,
8478 coding_category_utf_16_le);
8479 }
8480 else
8481 {
8482 if (EQ (eol_type, Qunix))
8483 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8484 else if (EQ (eol_type, Qdos))
8485 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8486 else
8487 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8488 }
8489
df7492f9
KH
8490 for (tail = val; CONSP (tail); tail = XCDR (tail))
8491 {
89528eb3 8492 enum coding_category category;
df7492f9 8493 int this_eol;
89528eb3
KH
8494
8495 id = XINT (XCAR (tail));
8496 attrs = CODING_ID_ATTRS (id);
8497 category = XINT (CODING_ATTR_CATEGORY (attrs));
8498 eol_type = CODING_ID_EOL_TYPE (id);
df7492f9
KH
8499 if (VECTORP (eol_type))
8500 {
89528eb3
KH
8501 if (category == coding_category_utf_16_be
8502 || category == coding_category_utf_16_be_nosig)
8503 this_eol = utf_16_be_eol;
8504 else if (category == coding_category_utf_16_le
8505 || category == coding_category_utf_16_le_nosig)
8506 this_eol = utf_16_le_eol;
df7492f9 8507 else
89528eb3
KH
8508 this_eol = normal_eol;
8509
df7492f9
KH
8510 if (this_eol == EOL_SEEN_LF)
8511 XSETCAR (tail, AREF (eol_type, 0));
8512 else if (this_eol == EOL_SEEN_CRLF)
8513 XSETCAR (tail, AREF (eol_type, 1));
8514 else if (this_eol == EOL_SEEN_CR)
8515 XSETCAR (tail, AREF (eol_type, 2));
89528eb3
KH
8516 else
8517 XSETCAR (tail, CODING_ID_NAME (id));
df7492f9 8518 }
89528eb3
KH
8519 else
8520 XSETCAR (tail, CODING_ID_NAME (id));
df7492f9
KH
8521 }
8522 }
ec6d2bb8 8523
4533845d 8524 return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
ec6d2bb8
KH
8525}
8526
ec6d2bb8 8527
d46c5b12
KH
8528DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8529 2, 3, 0,
48b0f3ae
PJ
8530 doc: /* Detect coding system of the text in the region between START and END.
8531Return a list of possible coding systems ordered by priority.
b811c52b
KH
8532The coding systems to try and their priorities follows what
8533the function `coding-system-priority-list' (which see) returns.
ec6d2bb8 8534
12e0131a 8535If only ASCII characters are found (except for such ISO-2022 control
d4a1d553
JB
8536characters as ESC), it returns a list of single element `undecided'
8537or its subsidiary coding system according to a detected end-of-line
8538format.
ec6d2bb8 8539
48b0f3ae
PJ
8540If optional argument HIGHEST is non-nil, return the coding system of
8541highest priority. */)
8542 (start, end, highest)
d46c5b12
KH
8543 Lisp_Object start, end, highest;
8544{
8545 int from, to;
8546 int from_byte, to_byte;
ec6d2bb8 8547
b7826503
PJ
8548 CHECK_NUMBER_COERCE_MARKER (start);
8549 CHECK_NUMBER_COERCE_MARKER (end);
ec6d2bb8 8550
d46c5b12
KH
8551 validate_region (&start, &end);
8552 from = XINT (start), to = XINT (end);
8553 from_byte = CHAR_TO_BYTE (from);
8554 to_byte = CHAR_TO_BYTE (to);
ec6d2bb8 8555
d46c5b12
KH
8556 if (from < GPT && to >= GPT)
8557 move_gap_both (to, to_byte);
c210f766 8558
d46c5b12 8559 return detect_coding_system (BYTE_POS_ADDR (from_byte),
24a73b0a 8560 to - from, to_byte - from_byte,
0a28aafb
KH
8561 !NILP (highest),
8562 !NILP (current_buffer
df7492f9
KH
8563 ->enable_multibyte_characters),
8564 Qnil);
ec6d2bb8
KH
8565}
8566
d46c5b12
KH
8567DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8568 1, 2, 0,
48b0f3ae
PJ
8569 doc: /* Detect coding system of the text in STRING.
8570Return a list of possible coding systems ordered by priority.
67ceab9d
KH
8571The coding systems to try and their priorities follows what
8572the function `coding-system-priority-list' (which see) returns.
fb88bf2d 8573
12e0131a 8574If only ASCII characters are found (except for such ISO-2022 control
d4a1d553
JB
8575characters as ESC), it returns a list of single element `undecided'
8576or its subsidiary coding system according to a detected end-of-line
8577format.
d46c5b12 8578
48b0f3ae
PJ
8579If optional argument HIGHEST is non-nil, return the coding system of
8580highest priority. */)
8581 (string, highest)
d46c5b12
KH
8582 Lisp_Object string, highest;
8583{
b7826503 8584 CHECK_STRING (string);
b73bfc1c 8585
24a73b0a
KH
8586 return detect_coding_system (SDATA (string),
8587 SCHARS (string), SBYTES (string),
8f924df7 8588 !NILP (highest), STRING_MULTIBYTE (string),
df7492f9 8589 Qnil);
4ed46869 8590}
4ed46869 8591
b73bfc1c 8592
df7492f9
KH
8593static INLINE int
8594char_encodable_p (c, attrs)
8595 int c;
8596 Lisp_Object attrs;
05e6f5dc 8597{
df7492f9 8598 Lisp_Object tail;
df7492f9 8599 struct charset *charset;
7d64c6ad 8600 Lisp_Object translation_table;
d46c5b12 8601
7d64c6ad 8602 translation_table = CODING_ATTR_TRANS_TBL (attrs);
a6f87d34 8603 if (! NILP (translation_table))
7d64c6ad 8604 c = translate_char (translation_table, c);
df7492f9
KH
8605 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8606 CONSP (tail); tail = XCDR (tail))
e133c8fa 8607 {
df7492f9
KH
8608 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8609 if (CHAR_CHARSET_P (c, charset))
8610 break;
e133c8fa 8611 }
df7492f9 8612 return (! NILP (tail));
05e6f5dc 8613}
83fa074f 8614
fb88bf2d 8615
df7492f9
KH
8616/* Return a list of coding systems that safely encode the text between
8617 START and END. If EXCLUDE is non-nil, it is a list of coding
8618 systems not to check. The returned list doesn't contain any such
48468dac 8619 coding systems. In any case, if the text contains only ASCII or is
df7492f9 8620 unibyte, return t. */
e077cc80 8621
df7492f9
KH
8622DEFUN ("find-coding-systems-region-internal",
8623 Ffind_coding_systems_region_internal,
8624 Sfind_coding_systems_region_internal, 2, 3, 0,
8625 doc: /* Internal use only. */)
8626 (start, end, exclude)
8627 Lisp_Object start, end, exclude;
8628{
8629 Lisp_Object coding_attrs_list, safe_codings;
8630 EMACS_INT start_byte, end_byte;
7c78e542 8631 const unsigned char *p, *pbeg, *pend;
df7492f9
KH
8632 int c;
8633 Lisp_Object tail, elt;
d46c5b12 8634
df7492f9
KH
8635 if (STRINGP (start))
8636 {
8637 if (!STRING_MULTIBYTE (start)
8f924df7 8638 || SCHARS (start) == SBYTES (start))
df7492f9
KH
8639 return Qt;
8640 start_byte = 0;
8f924df7 8641 end_byte = SBYTES (start);
df7492f9
KH
8642 }
8643 else
d46c5b12 8644 {
df7492f9
KH
8645 CHECK_NUMBER_COERCE_MARKER (start);
8646 CHECK_NUMBER_COERCE_MARKER (end);
8647 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8648 args_out_of_range (start, end);
8649 if (NILP (current_buffer->enable_multibyte_characters))
8650 return Qt;
8651 start_byte = CHAR_TO_BYTE (XINT (start));
8652 end_byte = CHAR_TO_BYTE (XINT (end));
8653 if (XINT (end) - XINT (start) == end_byte - start_byte)
8654 return Qt;
d46c5b12 8655
e1c23804 8656 if (XINT (start) < GPT && XINT (end) > GPT)
d46c5b12 8657 {
e1c23804
DL
8658 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8659 move_gap_both (XINT (start), start_byte);
df7492f9 8660 else
e1c23804 8661 move_gap_both (XINT (end), end_byte);
d46c5b12
KH
8662 }
8663 }
8664
df7492f9
KH
8665 coding_attrs_list = Qnil;
8666 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8667 if (NILP (exclude)
8668 || NILP (Fmemq (XCAR (tail), exclude)))
8669 {
8670 Lisp_Object attrs;
d46c5b12 8671
df7492f9
KH
8672 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8673 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8674 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7d64c6ad
KH
8675 {
8676 ASET (attrs, coding_attr_trans_tbl,
2170c8f0 8677 get_translation_table (attrs, 1, NULL));
7d64c6ad
KH
8678 coding_attrs_list = Fcons (attrs, coding_attrs_list);
8679 }
df7492f9 8680 }
d46c5b12 8681
df7492f9 8682 if (STRINGP (start))
8f924df7 8683 p = pbeg = SDATA (start);
df7492f9
KH
8684 else
8685 p = pbeg = BYTE_POS_ADDR (start_byte);
8686 pend = p + (end_byte - start_byte);
b843d1ae 8687
df7492f9
KH
8688 while (p < pend && ASCII_BYTE_P (*p)) p++;
8689 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
d46c5b12 8690
05e6f5dc 8691 while (p < pend)
72d1a715 8692 {
df7492f9
KH
8693 if (ASCII_BYTE_P (*p))
8694 p++;
72d1a715
RS
8695 else
8696 {
df7492f9 8697 c = STRING_CHAR_ADVANCE (p);
12410ef1 8698
df7492f9
KH
8699 charset_map_loaded = 0;
8700 for (tail = coding_attrs_list; CONSP (tail);)
8701 {
8702 elt = XCAR (tail);
8703 if (NILP (elt))
8704 tail = XCDR (tail);
8705 else if (char_encodable_p (c, elt))
8706 tail = XCDR (tail);
8707 else if (CONSP (XCDR (tail)))
8708 {
8709 XSETCAR (tail, XCAR (XCDR (tail)));
8710 XSETCDR (tail, XCDR (XCDR (tail)));
8711 }
8712 else
8713 {
8714 XSETCAR (tail, Qnil);
8715 tail = XCDR (tail);
8716 }
8717 }
8718 if (charset_map_loaded)
8719 {
8720 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
05e6f5dc 8721
df7492f9 8722 if (STRINGP (start))
8f924df7 8723 pbeg = SDATA (start);
df7492f9
KH
8724 else
8725 pbeg = BYTE_POS_ADDR (start_byte);
8726 p = pbeg + p_offset;
8727 pend = pbeg + pend_offset;
8728 }
8729 }
ec6d2bb8 8730 }
fb88bf2d 8731
988b3759 8732 safe_codings = list2 (Qraw_text, Qno_conversion);
df7492f9
KH
8733 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8734 if (! NILP (XCAR (tail)))
8735 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
ec6d2bb8 8736
05e6f5dc
KH
8737 return safe_codings;
8738}
4956c225 8739
d46c5b12 8740
8f924df7
KH
8741DEFUN ("unencodable-char-position", Funencodable_char_position,
8742 Sunencodable_char_position, 3, 5, 0,
8743 doc: /*
8744Return position of first un-encodable character in a region.
d4a1d553 8745START and END specify the region and CODING-SYSTEM specifies the
8f924df7 8746encoding to check. Return nil if CODING-SYSTEM does encode the region.
d46c5b12 8747
8f924df7
KH
8748If optional 4th argument COUNT is non-nil, it specifies at most how
8749many un-encodable characters to search. In this case, the value is a
8750list of positions.
d46c5b12 8751
8f924df7
KH
8752If optional 5th argument STRING is non-nil, it is a string to search
8753for un-encodable characters. In that case, START and END are indexes
8754to the string. */)
8755 (start, end, coding_system, count, string)
8756 Lisp_Object start, end, coding_system, count, string;
8757{
8758 int n;
8759 struct coding_system coding;
7d64c6ad 8760 Lisp_Object attrs, charset_list, translation_table;
8f924df7
KH
8761 Lisp_Object positions;
8762 int from, to;
8763 const unsigned char *p, *stop, *pend;
8764 int ascii_compatible;
fb88bf2d 8765
8f924df7
KH
8766 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8767 attrs = CODING_ID_ATTRS (coding.id);
8768 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8769 return Qnil;
8770 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8771 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2170c8f0 8772 translation_table = get_translation_table (attrs, 1, NULL);
fb88bf2d 8773
8f924df7
KH
8774 if (NILP (string))
8775 {
8776 validate_region (&start, &end);
8777 from = XINT (start);
8778 to = XINT (end);
8779 if (NILP (current_buffer->enable_multibyte_characters)
8780 || (ascii_compatible
8781 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8782 return Qnil;
8783 p = CHAR_POS_ADDR (from);
8784 pend = CHAR_POS_ADDR (to);
8785 if (from < GPT && to >= GPT)
8786 stop = GPT_ADDR;
8787 else
8788 stop = pend;
8789 }
8790 else
8791 {
8792 CHECK_STRING (string);
8793 CHECK_NATNUM (start);
8794 CHECK_NATNUM (end);
8795 from = XINT (start);
8796 to = XINT (end);
8797 if (from > to
8798 || to > SCHARS (string))
8799 args_out_of_range_3 (string, start, end);
8800 if (! STRING_MULTIBYTE (string))
8801 return Qnil;
8802 p = SDATA (string) + string_char_to_byte (string, from);
8803 stop = pend = SDATA (string) + string_char_to_byte (string, to);
8804 if (ascii_compatible && (to - from) == (pend - p))
8805 return Qnil;
8806 }
f2558efd 8807
8f924df7
KH
8808 if (NILP (count))
8809 n = 1;
8810 else
b73bfc1c 8811 {
8f924df7
KH
8812 CHECK_NATNUM (count);
8813 n = XINT (count);
b73bfc1c
KH
8814 }
8815
8f924df7
KH
8816 positions = Qnil;
8817 while (1)
d46c5b12 8818 {
8f924df7 8819 int c;
ec6d2bb8 8820
8f924df7
KH
8821 if (ascii_compatible)
8822 while (p < stop && ASCII_BYTE_P (*p))
8823 p++, from++;
8824 if (p >= stop)
0e79d667 8825 {
8f924df7
KH
8826 if (p >= pend)
8827 break;
8828 stop = pend;
8829 p = GAP_END_ADDR;
0e79d667 8830 }
ec6d2bb8 8831
8f924df7
KH
8832 c = STRING_CHAR_ADVANCE (p);
8833 if (! (ASCII_CHAR_P (c) && ascii_compatible)
7d64c6ad
KH
8834 && ! char_charset (translate_char (translation_table, c),
8835 charset_list, NULL))
ec6d2bb8 8836 {
8f924df7
KH
8837 positions = Fcons (make_number (from), positions);
8838 n--;
8839 if (n == 0)
8840 break;
ec6d2bb8
KH
8841 }
8842
8f924df7
KH
8843 from++;
8844 }
d46c5b12 8845
8f924df7
KH
8846 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8847}
d46c5b12 8848
d46c5b12 8849
df7492f9
KH
8850DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8851 Scheck_coding_systems_region, 3, 3, 0,
8852 doc: /* Check if the region is encodable by coding systems.
d46c5b12 8853
df7492f9
KH
8854START and END are buffer positions specifying the region.
8855CODING-SYSTEM-LIST is a list of coding systems to check.
d46c5b12 8856
df7492f9 8857The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
d4a1d553 8858CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
df7492f9
KH
8859whole region, POS0, POS1, ... are buffer positions where non-encodable
8860characters are found.
93dec019 8861
df7492f9
KH
8862If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8863value is nil.
93dec019 8864
df7492f9
KH
8865START may be a string. In that case, check if the string is
8866encodable, and the value contains indices to the string instead of
5704f39a
KH
8867buffer positions. END is ignored.
8868
4c1958f4 8869If the current buffer (or START if it is a string) is unibyte, the value
5704f39a 8870is nil. */)
df7492f9
KH
8871 (start, end, coding_system_list)
8872 Lisp_Object start, end, coding_system_list;
05e6f5dc 8873{
df7492f9
KH
8874 Lisp_Object list;
8875 EMACS_INT start_byte, end_byte;
8876 int pos;
7c78e542 8877 const unsigned char *p, *pbeg, *pend;
df7492f9 8878 int c;
7d64c6ad 8879 Lisp_Object tail, elt, attrs;
70ad9fc4 8880
05e6f5dc
KH
8881 if (STRINGP (start))
8882 {
df7492f9 8883 if (!STRING_MULTIBYTE (start)
4c1958f4 8884 || SCHARS (start) == SBYTES (start))
df7492f9
KH
8885 return Qnil;
8886 start_byte = 0;
8f924df7 8887 end_byte = SBYTES (start);
df7492f9 8888 pos = 0;
d46c5b12 8889 }
05e6f5dc 8890 else
b73bfc1c 8891 {
b7826503
PJ
8892 CHECK_NUMBER_COERCE_MARKER (start);
8893 CHECK_NUMBER_COERCE_MARKER (end);
05e6f5dc
KH
8894 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8895 args_out_of_range (start, end);
8896 if (NILP (current_buffer->enable_multibyte_characters))
df7492f9
KH
8897 return Qnil;
8898 start_byte = CHAR_TO_BYTE (XINT (start));
8899 end_byte = CHAR_TO_BYTE (XINT (end));
8900 if (XINT (end) - XINT (start) == end_byte - start_byte)
5704f39a 8901 return Qnil;
df7492f9 8902
e1c23804 8903 if (XINT (start) < GPT && XINT (end) > GPT)
b73bfc1c 8904 {
e1c23804
DL
8905 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8906 move_gap_both (XINT (start), start_byte);
df7492f9 8907 else
e1c23804 8908 move_gap_both (XINT (end), end_byte);
b73bfc1c 8909 }
e1c23804 8910 pos = XINT (start);
b73bfc1c 8911 }
7553d0e1 8912
df7492f9
KH
8913 list = Qnil;
8914 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
12410ef1 8915 {
df7492f9 8916 elt = XCAR (tail);
7d64c6ad 8917 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
2170c8f0
KH
8918 ASET (attrs, coding_attr_trans_tbl,
8919 get_translation_table (attrs, 1, NULL));
7d64c6ad 8920 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
12410ef1
KH
8921 }
8922
df7492f9 8923 if (STRINGP (start))
8f924df7 8924 p = pbeg = SDATA (start);
72d1a715 8925 else
df7492f9
KH
8926 p = pbeg = BYTE_POS_ADDR (start_byte);
8927 pend = p + (end_byte - start_byte);
4ed46869 8928
df7492f9
KH
8929 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8930 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
ec6d2bb8 8931
df7492f9 8932 while (p < pend)
d46c5b12 8933 {
df7492f9
KH
8934 if (ASCII_BYTE_P (*p))
8935 p++;
e133c8fa 8936 else
05e6f5dc 8937 {
df7492f9
KH
8938 c = STRING_CHAR_ADVANCE (p);
8939
8940 charset_map_loaded = 0;
8941 for (tail = list; CONSP (tail); tail = XCDR (tail))
8942 {
8943 elt = XCDR (XCAR (tail));
8944 if (! char_encodable_p (c, XCAR (elt)))
8945 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8946 }
8947 if (charset_map_loaded)
8948 {
8949 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8950
8951 if (STRINGP (start))
8f924df7 8952 pbeg = SDATA (start);
df7492f9
KH
8953 else
8954 pbeg = BYTE_POS_ADDR (start_byte);
8955 p = pbeg + p_offset;
8956 pend = pbeg + pend_offset;
8957 }
05e6f5dc 8958 }
df7492f9 8959 pos++;
d46c5b12 8960 }
4ed46869 8961
df7492f9
KH
8962 tail = list;
8963 list = Qnil;
8964 for (; CONSP (tail); tail = XCDR (tail))
ec6d2bb8 8965 {
df7492f9
KH
8966 elt = XCAR (tail);
8967 if (CONSP (XCDR (XCDR (elt))))
8968 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8969 list);
ec6d2bb8 8970 }
2b4f9037 8971
df7492f9 8972 return list;
d46c5b12
KH
8973}
8974
3fd9494b 8975
b73bfc1c 8976Lisp_Object
df7492f9
KH
8977code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
8978 Lisp_Object start, end, coding_system, dst_object;
8979 int encodep, norecord;
4ed46869 8980{
3a73fa5d 8981 struct coding_system coding;
df7492f9
KH
8982 EMACS_INT from, from_byte, to, to_byte;
8983 Lisp_Object src_object;
4ed46869 8984
b7826503
PJ
8985 CHECK_NUMBER_COERCE_MARKER (start);
8986 CHECK_NUMBER_COERCE_MARKER (end);
df7492f9
KH
8987 if (NILP (coding_system))
8988 coding_system = Qno_conversion;
8989 else
8990 CHECK_CODING_SYSTEM (coding_system);
8991 src_object = Fcurrent_buffer ();
8992 if (NILP (dst_object))
8993 dst_object = src_object;
8994 else if (! EQ (dst_object, Qt))
8995 CHECK_BUFFER (dst_object);
3a73fa5d 8996
d46c5b12
KH
8997 validate_region (&start, &end);
8998 from = XFASTINT (start);
df7492f9 8999 from_byte = CHAR_TO_BYTE (from);
d46c5b12 9000 to = XFASTINT (end);
df7492f9 9001 to_byte = CHAR_TO_BYTE (to);
764ca8da 9002
df7492f9
KH
9003 setup_coding_system (coding_system, &coding);
9004 coding.mode |= CODING_MODE_LAST_BLOCK;
ec6d2bb8 9005
df7492f9
KH
9006 if (encodep)
9007 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9008 dst_object);
9009 else
9010 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9011 dst_object);
9012 if (! norecord)
9013 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
ec6d2bb8 9014
df7492f9
KH
9015 return (BUFFERP (dst_object)
9016 ? make_number (coding.produced_char)
9017 : coding.dst_object);
4031e2bf 9018}
78108bcd 9019
4ed46869 9020
4031e2bf 9021DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
df7492f9 9022 3, 4, "r\nzCoding system: ",
48b0f3ae 9023 doc: /* Decode the current region from the specified coding system.
df7492f9
KH
9024When called from a program, takes four arguments:
9025 START, END, CODING-SYSTEM, and DESTINATION.
9026START and END are buffer positions.
8844fa83 9027
df7492f9 9028Optional 4th arguments DESTINATION specifies where the decoded text goes.
2354b80b 9029If nil, the region between START and END is replaced by the decoded text.
1560f91a
EZ
9030If buffer, the decoded text is inserted in that buffer after point (point
9031does not move).
446dcd75 9032In those cases, the length of the decoded text is returned.
319a3947 9033If DESTINATION is t, the decoded text is returned.
8844fa83 9034
48b0f3ae
PJ
9035This function sets `last-coding-system-used' to the precise coding system
9036used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
319a3947 9037not fully specified.) */)
df7492f9
KH
9038 (start, end, coding_system, destination)
9039 Lisp_Object start, end, coding_system, destination;
4031e2bf 9040{
df7492f9 9041 return code_convert_region (start, end, coding_system, destination, 0, 0);
3a73fa5d 9042}
8844fa83 9043
3a73fa5d 9044DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
df7492f9
KH
9045 3, 4, "r\nzCoding system: ",
9046 doc: /* Encode the current region by specified coding system.
d4a1d553
JB
9047When called from a program, takes four arguments:
9048 START, END, CODING-SYSTEM and DESTINATION.
9049START and END are buffer positions.
d46c5b12 9050
df7492f9
KH
9051Optional 4th arguments DESTINATION specifies where the encoded text goes.
9052If nil, the region between START and END is replace by the encoded text.
1560f91a
EZ
9053If buffer, the encoded text is inserted in that buffer after point (point
9054does not move).
446dcd75 9055In those cases, the length of the encoded text is returned.
319a3947 9056If DESTINATION is t, the encoded text is returned.
2391eaa4 9057
48b0f3ae
PJ
9058This function sets `last-coding-system-used' to the precise coding system
9059used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
319a3947 9060not fully specified.) */)
df7492f9
KH
9061 (start, end, coding_system, destination)
9062 Lisp_Object start, end, coding_system, destination;
3a73fa5d 9063{
df7492f9 9064 return code_convert_region (start, end, coding_system, destination, 1, 0);
b73bfc1c
KH
9065}
9066
9067Lisp_Object
df7492f9
KH
9068code_convert_string (string, coding_system, dst_object,
9069 encodep, nocopy, norecord)
9070 Lisp_Object string, coding_system, dst_object;
9071 int encodep, nocopy, norecord;
b73bfc1c 9072{
4031e2bf 9073 struct coding_system coding;
df7492f9 9074 EMACS_INT chars, bytes;
ec6d2bb8 9075
b7826503 9076 CHECK_STRING (string);
d46c5b12 9077 if (NILP (coding_system))
4956c225 9078 {
df7492f9
KH
9079 if (! norecord)
9080 Vlast_coding_system_used = Qno_conversion;
9081 if (NILP (dst_object))
9082 return (nocopy ? Fcopy_sequence (string) : string);
4956c225 9083 }
b73bfc1c 9084
df7492f9
KH
9085 if (NILP (coding_system))
9086 coding_system = Qno_conversion;
9087 else
9088 CHECK_CODING_SYSTEM (coding_system);
9089 if (NILP (dst_object))
9090 dst_object = Qt;
9091 else if (! EQ (dst_object, Qt))
9092 CHECK_BUFFER (dst_object);
73be902c 9093
df7492f9 9094 setup_coding_system (coding_system, &coding);
d46c5b12 9095 coding.mode |= CODING_MODE_LAST_BLOCK;
8f924df7
KH
9096 chars = SCHARS (string);
9097 bytes = SBYTES (string);
df7492f9
KH
9098 if (encodep)
9099 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9100 else
9101 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9102 if (! norecord)
9103 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
73be902c 9104
df7492f9
KH
9105 return (BUFFERP (dst_object)
9106 ? make_number (coding.produced_char)
9107 : coding.dst_object);
4ed46869 9108}
73be902c 9109
b73bfc1c 9110
ecec61c1 9111/* Encode or decode STRING according to CODING_SYSTEM.
ec6d2bb8 9112 Do not set Vlast_coding_system_used.
4ed46869 9113
ec6d2bb8
KH
9114 This function is called only from macros DECODE_FILE and
9115 ENCODE_FILE, thus we ignore character composition. */
4ed46869 9116
ecec61c1
KH
9117Lisp_Object
9118code_convert_string_norecord (string, coding_system, encodep)
9119 Lisp_Object string, coding_system;
9120 int encodep;
4ed46869 9121{
0be8721c 9122 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
4ed46869
KH
9123}
9124
4ed46869 9125
df7492f9
KH
9126DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9127 2, 4, 0,
9128 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9129
9130Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9131if the decoding operation is trivial.
ecec61c1 9132
d4a1d553 9133Optional fourth arg BUFFER non-nil means that the decoded text is
1560f91a
EZ
9134inserted in that buffer after point (point does not move). In this
9135case, the return value is the length of the decoded text.
ecec61c1 9136
df7492f9
KH
9137This function sets `last-coding-system-used' to the precise coding system
9138used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
d4a1d553 9139not fully specified.) */)
df7492f9
KH
9140 (string, coding_system, nocopy, buffer)
9141 Lisp_Object string, coding_system, nocopy, buffer;
4ed46869 9142{
df7492f9
KH
9143 return code_convert_string (string, coding_system, buffer,
9144 0, ! NILP (nocopy), 0);
4ed46869
KH
9145}
9146
df7492f9
KH
9147DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9148 2, 4, 0,
9149 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9150
9151Optional third arg NOCOPY non-nil means it is OK to return STRING
9152itself if the encoding operation is trivial.
9153
d4a1d553 9154Optional fourth arg BUFFER non-nil means that the encoded text is
1560f91a
EZ
9155inserted in that buffer after point (point does not move). In this
9156case, the return value is the length of the encoded text.
df7492f9
KH
9157
9158This function sets `last-coding-system-used' to the precise coding system
9159used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9160not fully specified.) */)
9161 (string, coding_system, nocopy, buffer)
9162 Lisp_Object string, coding_system, nocopy, buffer;
4ed46869 9163{
df7492f9 9164 return code_convert_string (string, coding_system, buffer,
c197f191 9165 1, ! NILP (nocopy), 1);
4ed46869 9166}
df7492f9 9167
3a73fa5d 9168\f
4ed46869 9169DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
48b0f3ae
PJ
9170 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9171Return the corresponding character. */)
9172 (code)
4ed46869 9173 Lisp_Object code;
4ed46869 9174{
df7492f9
KH
9175 Lisp_Object spec, attrs, val;
9176 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9177 int c;
4ed46869 9178
df7492f9
KH
9179 CHECK_NATNUM (code);
9180 c = XFASTINT (code);
9181 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9182 attrs = AREF (spec, 0);
4ed46869 9183
df7492f9
KH
9184 if (ASCII_BYTE_P (c)
9185 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9186 return code;
4ed46869 9187
df7492f9
KH
9188 val = CODING_ATTR_CHARSET_LIST (attrs);
9189 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
004068e4
KH
9190 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9191 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
fa42c37f 9192
df7492f9
KH
9193 if (c <= 0x7F)
9194 charset = charset_roman;
9195 else if (c >= 0xA0 && c < 0xDF)
55ab7be3 9196 {
df7492f9
KH
9197 charset = charset_kana;
9198 c -= 0x80;
4ed46869 9199 }
55ab7be3 9200 else
4ed46869 9201 {
004068e4 9202 int s1 = c >> 8, s2 = c & 0xFF;
df7492f9
KH
9203
9204 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
9205 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
9206 error ("Invalid code: %d", code);
9207 SJIS_TO_JIS (c);
9208 charset = charset_kanji;
4ed46869 9209 }
df7492f9
KH
9210 c = DECODE_CHAR (charset, c);
9211 if (c < 0)
9212 error ("Invalid code: %d", code);
9213 return make_number (c);
93dec019 9214}
4ed46869 9215
48b0f3ae 9216
4ed46869 9217DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
8acf0c0e 9218 doc: /* Encode a Japanese character CH to shift_jis encoding.
48b0f3ae
PJ
9219Return the corresponding code in SJIS. */)
9220 (ch)
df7492f9 9221 Lisp_Object ch;
4ed46869 9222{
df7492f9
KH
9223 Lisp_Object spec, attrs, charset_list;
9224 int c;
9225 struct charset *charset;
9226 unsigned code;
48b0f3ae 9227
df7492f9
KH
9228 CHECK_CHARACTER (ch);
9229 c = XFASTINT (ch);
9230 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9231 attrs = AREF (spec, 0);
9232
9233 if (ASCII_CHAR_P (c)
9234 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9235 return ch;
9236
9237 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9238 charset = char_charset (c, charset_list, &code);
9239 if (code == CHARSET_INVALID_CODE (charset))
9240 error ("Can't encode by shift_jis encoding: %d", c);
9241 JIS_TO_SJIS (code);
9242
9243 return make_number (code);
4ed46869
KH
9244}
9245
9246DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
48b0f3ae
PJ
9247 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9248Return the corresponding character. */)
9249 (code)
4ed46869 9250 Lisp_Object code;
d46c5b12 9251{
df7492f9
KH
9252 Lisp_Object spec, attrs, val;
9253 struct charset *charset_roman, *charset_big5, *charset;
9254 int c;
6289dd10 9255
df7492f9
KH
9256 CHECK_NATNUM (code);
9257 c = XFASTINT (code);
9258 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9259 attrs = AREF (spec, 0);
4ed46869 9260
df7492f9
KH
9261 if (ASCII_BYTE_P (c)
9262 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9263 return code;
6289dd10 9264
df7492f9
KH
9265 val = CODING_ATTR_CHARSET_LIST (attrs);
9266 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9267 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
c210f766 9268
df7492f9
KH
9269 if (c <= 0x7F)
9270 charset = charset_roman;
c28a9453
KH
9271 else
9272 {
df7492f9
KH
9273 int b1 = c >> 8, b2 = c & 0x7F;
9274 if (b1 < 0xA1 || b1 > 0xFE
9275 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9276 error ("Invalid code: %d", code);
9277 charset = charset_big5;
c28a9453 9278 }
df7492f9
KH
9279 c = DECODE_CHAR (charset, (unsigned )c);
9280 if (c < 0)
9281 error ("Invalid code: %d", code);
9282 return make_number (c);
d46c5b12 9283}
6289dd10 9284
4ed46869 9285DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
8acf0c0e 9286 doc: /* Encode the Big5 character CH to BIG5 coding system.
48b0f3ae
PJ
9287Return the corresponding character code in Big5. */)
9288 (ch)
4ed46869
KH
9289 Lisp_Object ch;
9290{
df7492f9
KH
9291 Lisp_Object spec, attrs, charset_list;
9292 struct charset *charset;
9293 int c;
9294 unsigned code;
9295
9296 CHECK_CHARACTER (ch);
9297 c = XFASTINT (ch);
9298 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9299 attrs = AREF (spec, 0);
9300 if (ASCII_CHAR_P (c)
9301 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9302 return ch;
9303
9304 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9305 charset = char_charset (c, charset_list, &code);
9306 if (code == CHARSET_INVALID_CODE (charset))
9307 error ("Can't encode by Big5 encoding: %d", c);
9308
9309 return make_number (code);
4ed46869 9310}
48b0f3ae 9311
3a73fa5d 9312\f
002fdb44 9313DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
68bba4e4 9314 Sset_terminal_coding_system_internal, 1, 2, 0,
48b0f3ae 9315 doc: /* Internal use only. */)
6ed8eeff 9316 (coding_system, terminal)
b74e4686 9317 Lisp_Object coding_system;
6ed8eeff 9318 Lisp_Object terminal;
4ed46869 9319{
6ed8eeff 9320 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
b7826503 9321 CHECK_SYMBOL (coding_system);
b8299c66 9322 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
70c22245 9323 /* We had better not send unsafe characters to terminal. */
c73bd236 9324 terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
df7492f9 9325 /* Characer composition should be disabled. */
c73bd236 9326 terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
b8299c66
KL
9327 terminal_coding->src_multibyte = 1;
9328 terminal_coding->dst_multibyte = 0;
4ed46869
KH
9329 return Qnil;
9330}
9331
c4825358
KH
9332DEFUN ("set-safe-terminal-coding-system-internal",
9333 Fset_safe_terminal_coding_system_internal,
48b0f3ae 9334 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
ddb67bdc 9335 doc: /* Internal use only. */)
48b0f3ae 9336 (coding_system)
b74e4686 9337 Lisp_Object coding_system;
d46c5b12 9338{
b7826503 9339 CHECK_SYMBOL (coding_system);
c4825358
KH
9340 setup_coding_system (Fcheck_coding_system (coding_system),
9341 &safe_terminal_coding);
df7492f9
KH
9342 /* Characer composition should be disabled. */
9343 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
b73bfc1c
KH
9344 safe_terminal_coding.src_multibyte = 1;
9345 safe_terminal_coding.dst_multibyte = 0;
c4825358
KH
9346 return Qnil;
9347}
4ed46869 9348
002fdb44 9349DEFUN ("terminal-coding-system", Fterminal_coding_system,
68bba4e4 9350 Sterminal_coding_system, 0, 1, 0,
6ed8eeff 9351 doc: /* Return coding system specified for terminal output on the given terminal.
708e05dc 9352TERMINAL may be a terminal object, a frame, or nil for the selected
6ed8eeff
KL
9353frame's terminal device. */)
9354 (terminal)
9355 Lisp_Object terminal;
4ed46869 9356{
985773c9
MB
9357 struct coding_system *terminal_coding
9358 = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9359 Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
ae6f73fa 9360
ae6f73fa 9361 /* For backward compatibility, return nil if it is `undecided'. */
f75c90a9 9362 return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
4ed46869
KH
9363}
9364
002fdb44 9365DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
68bba4e4 9366 Sset_keyboard_coding_system_internal, 1, 2, 0,
48b0f3ae 9367 doc: /* Internal use only. */)
6ed8eeff 9368 (coding_system, terminal)
4ed46869 9369 Lisp_Object coding_system;
6ed8eeff 9370 Lisp_Object terminal;
4ed46869 9371{
6ed8eeff 9372 struct terminal *t = get_terminal (terminal, 1);
b7826503 9373 CHECK_SYMBOL (coding_system);
df7492f9 9374 setup_coding_system (Fcheck_coding_system (coding_system),
c73bd236 9375 TERMINAL_KEYBOARD_CODING (t));
df7492f9 9376 /* Characer composition should be disabled. */
c73bd236
MB
9377 TERMINAL_KEYBOARD_CODING (t)->common_flags
9378 &= ~CODING_ANNOTATE_COMPOSITION_MASK;
4ed46869
KH
9379 return Qnil;
9380}
9381
9382DEFUN ("keyboard-coding-system",
985773c9 9383 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
48b0f3ae 9384 doc: /* Return coding system specified for decoding keyboard input. */)
985773c9
MB
9385 (terminal)
9386 Lisp_Object terminal;
4ed46869 9387{
985773c9
MB
9388 return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9389 (get_terminal (terminal, 1))->id);
4ed46869
KH
9390}
9391
4ed46869 9392\f
a5d301df
KH
9393DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9394 Sfind_operation_coding_system, 1, MANY, 0,
48b0f3ae
PJ
9395 doc: /* Choose a coding system for an operation based on the target name.
9396The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9397DECODING-SYSTEM is the coding system to use for decoding
9398\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9399for encoding (in case OPERATION does encoding).
05e6f5dc 9400
48b0f3ae
PJ
9401The first argument OPERATION specifies an I/O primitive:
9402 For file I/O, `insert-file-contents' or `write-region'.
9403 For process I/O, `call-process', `call-process-region', or `start-process'.
9404 For network I/O, `open-network-stream'.
05e6f5dc 9405
48b0f3ae
PJ
9406The remaining arguments should be the same arguments that were passed
9407to the primitive. Depending on which primitive, one of those arguments
9408is selected as the TARGET. For example, if OPERATION does file I/O,
9409whichever argument specifies the file name is TARGET.
05e6f5dc 9410
48b0f3ae 9411TARGET has a meaning which depends on OPERATION:
b883cdb2 9412 For file I/O, TARGET is a file name (except for the special case below).
48b0f3ae 9413 For process I/O, TARGET is a process name.
d4a1d553 9414 For network I/O, TARGET is a service name or a port number.
05e6f5dc 9415
d4a1d553 9416This function looks up what is specified for TARGET in
48b0f3ae
PJ
9417`file-coding-system-alist', `process-coding-system-alist',
9418or `network-coding-system-alist' depending on OPERATION.
9419They may specify a coding system, a cons of coding systems,
9420or a function symbol to call.
9421In the last case, we call the function with one argument,
9422which is a list of all the arguments given to this function.
1011c487
MB
9423If the function can't decide a coding system, it can return
9424`undecided' so that the normal code-detection is performed.
48b0f3ae 9425
b883cdb2
MB
9426If OPERATION is `insert-file-contents', the argument corresponding to
9427TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
9428file name to look up, and BUFFER is a buffer that contains the file's
9429contents (not yet decoded). If `file-coding-system-alist' specifies a
9430function to call for FILENAME, that function should examine the
9431contents of BUFFER instead of reading the file.
9432
d918f936 9433usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
48b0f3ae 9434 (nargs, args)
4ed46869
KH
9435 int nargs;
9436 Lisp_Object *args;
6b89e3aa 9437{
4ed46869
KH
9438 Lisp_Object operation, target_idx, target, val;
9439 register Lisp_Object chain;
177c0ea7 9440
4ed46869
KH
9441 if (nargs < 2)
9442 error ("Too few arguments");
9443 operation = args[0];
9444 if (!SYMBOLP (operation)
9445 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3ed051d4 9446 error ("Invalid first argument");
4ed46869
KH
9447 if (nargs < 1 + XINT (target_idx))
9448 error ("Too few arguments for operation: %s",
8f924df7 9449 SDATA (SYMBOL_NAME (operation)));
4ed46869
KH
9450 target = args[XINT (target_idx) + 1];
9451 if (!(STRINGP (target)
091a0ff0
KH
9452 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9453 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
4ed46869 9454 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
df7492f9 9455 error ("Invalid %dth argument", XINT (target_idx) + 1);
091a0ff0
KH
9456 if (CONSP (target))
9457 target = XCAR (target);
4ed46869 9458
2e34157c
RS
9459 chain = ((EQ (operation, Qinsert_file_contents)
9460 || EQ (operation, Qwrite_region))
02ba4723 9461 ? Vfile_coding_system_alist
2e34157c 9462 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
9463 ? Vnetwork_coding_system_alist
9464 : Vprocess_coding_system_alist));
4ed46869
KH
9465 if (NILP (chain))
9466 return Qnil;
9467
03699b14 9468 for (; CONSP (chain); chain = XCDR (chain))
6b89e3aa 9469 {
f44d27ce 9470 Lisp_Object elt;
6b89e3aa 9471
df7492f9 9472 elt = XCAR (chain);
4ed46869
KH
9473 if (CONSP (elt)
9474 && ((STRINGP (target)
03699b14
KR
9475 && STRINGP (XCAR (elt))
9476 && fast_string_match (XCAR (elt), target) >= 0)
9477 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
6b89e3aa 9478 {
03699b14 9479 val = XCDR (elt);
b19fd4c5
KH
9480 /* Here, if VAL is both a valid coding system and a valid
9481 function symbol, we return VAL as a coding system. */
02ba4723
KH
9482 if (CONSP (val))
9483 return val;
9484 if (! SYMBOLP (val))
9485 return Qnil;
9486 if (! NILP (Fcoding_system_p (val)))
9487 return Fcons (val, val);
b19fd4c5 9488 if (! NILP (Ffboundp (val)))
6b89e3aa 9489 {
e2b97060
MB
9490 /* We use call1 rather than safe_call1
9491 so as to get bug reports about functions called here
9492 which don't handle the current interface. */
9493 val = call1 (val, Flist (nargs, args));
b19fd4c5
KH
9494 if (CONSP (val))
9495 return val;
9496 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9497 return Fcons (val, val);
6b89e3aa 9498 }
02ba4723 9499 return Qnil;
6b89e3aa
KH
9500 }
9501 }
4ed46869 9502 return Qnil;
6b89e3aa
KH
9503}
9504
df7492f9 9505DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
a3f6ee6d 9506 Sset_coding_system_priority, 0, MANY, 0,
da7db224 9507 doc: /* Assign higher priority to the coding systems given as arguments.
d4a1d553 9508If multiple coding systems belong to the same category,
a3181084
DL
9509all but the first one are ignored.
9510
d4a1d553 9511usage: (set-coding-system-priority &rest coding-systems) */)
df7492f9
KH
9512 (nargs, args)
9513 int nargs;
9514 Lisp_Object *args;
9515{
9516 int i, j;
9517 int changed[coding_category_max];
9518 enum coding_category priorities[coding_category_max];
9519
9520 bzero (changed, sizeof changed);
6b89e3aa 9521
df7492f9 9522 for (i = j = 0; i < nargs; i++)
6b89e3aa 9523 {
df7492f9
KH
9524 enum coding_category category;
9525 Lisp_Object spec, attrs;
6b89e3aa 9526
df7492f9
KH
9527 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9528 attrs = AREF (spec, 0);
9529 category = XINT (CODING_ATTR_CATEGORY (attrs));
9530 if (changed[category])
9531 /* Ignore this coding system because a coding system of the
9532 same category already had a higher priority. */
9533 continue;
9534 changed[category] = 1;
9535 priorities[j++] = category;
9536 if (coding_categories[category].id >= 0
9537 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9538 setup_coding_system (args[i], &coding_categories[category]);
ff563fce 9539 Fset (AREF (Vcoding_category_table, category), args[i]);
df7492f9 9540 }
6b89e3aa 9541
df7492f9
KH
9542 /* Now we have decided top J priorities. Reflect the order of the
9543 original priorities to the remaining priorities. */
6b89e3aa 9544
df7492f9 9545 for (i = j, j = 0; i < coding_category_max; i++, j++)
6b89e3aa 9546 {
df7492f9
KH
9547 while (j < coding_category_max
9548 && changed[coding_priorities[j]])
9549 j++;
9550 if (j == coding_category_max)
9551 abort ();
9552 priorities[i] = coding_priorities[j];
9553 }
6b89e3aa 9554
df7492f9 9555 bcopy (priorities, coding_priorities, sizeof priorities);
177c0ea7 9556
ff563fce
KH
9557 /* Update `coding-category-list'. */
9558 Vcoding_category_list = Qnil;
9559 for (i = coding_category_max - 1; i >= 0; i--)
9560 Vcoding_category_list
9561 = Fcons (AREF (Vcoding_category_table, priorities[i]),
9562 Vcoding_category_list);
6b89e3aa 9563
df7492f9 9564 return Qnil;
6b89e3aa
KH
9565}
9566
df7492f9
KH
9567DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9568 Scoding_system_priority_list, 0, 1, 0,
da7db224 9569 doc: /* Return a list of coding systems ordered by their priorities.
b811c52b
KH
9570The list contains a subset of coding systems; i.e. coding systems
9571assigned to each coding category (see `coding-category-list').
9572
da7db224 9573HIGHESTP non-nil means just return the highest priority one. */)
df7492f9
KH
9574 (highestp)
9575 Lisp_Object highestp;
d46c5b12
KH
9576{
9577 int i;
df7492f9 9578 Lisp_Object val;
6b89e3aa 9579
df7492f9 9580 for (i = 0, val = Qnil; i < coding_category_max; i++)
d46c5b12 9581 {
df7492f9
KH
9582 enum coding_category category = coding_priorities[i];
9583 int id = coding_categories[category].id;
9584 Lisp_Object attrs;
068a9dbd 9585
df7492f9
KH
9586 if (id < 0)
9587 continue;
9588 attrs = CODING_ID_ATTRS (id);
9589 if (! NILP (highestp))
9590 return CODING_ATTR_BASE_NAME (attrs);
9591 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9592 }
9593 return Fnreverse (val);
9594}
068a9dbd 9595
f0064e1f 9596static char *suffixes[] = { "-unix", "-dos", "-mac" };
068a9dbd
KH
9597
9598static Lisp_Object
df7492f9
KH
9599make_subsidiaries (base)
9600 Lisp_Object base;
068a9dbd 9601{
df7492f9 9602 Lisp_Object subsidiaries;
8f924df7 9603 int base_name_len = SBYTES (SYMBOL_NAME (base));
df7492f9
KH
9604 char *buf = (char *) alloca (base_name_len + 6);
9605 int i;
068a9dbd 9606
8f924df7 9607 bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
df7492f9
KH
9608 subsidiaries = Fmake_vector (make_number (3), Qnil);
9609 for (i = 0; i < 3; i++)
068a9dbd 9610 {
df7492f9
KH
9611 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
9612 ASET (subsidiaries, i, intern (buf));
068a9dbd 9613 }
df7492f9 9614 return subsidiaries;
068a9dbd
KH
9615}
9616
9617
df7492f9
KH
9618DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9619 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
1fcd6c8b
DL
9620 doc: /* For internal use only.
9621usage: (define-coding-system-internal ...) */)
df7492f9
KH
9622 (nargs, args)
9623 int nargs;
9624 Lisp_Object *args;
068a9dbd 9625{
df7492f9
KH
9626 Lisp_Object name;
9627 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
9628 Lisp_Object attrs; /* Vector of attributes. */
9629 Lisp_Object eol_type;
9630 Lisp_Object aliases;
9631 Lisp_Object coding_type, charset_list, safe_charsets;
9632 enum coding_category category;
9633 Lisp_Object tail, val;
9634 int max_charset_id = 0;
9635 int i;
068a9dbd 9636
df7492f9
KH
9637 if (nargs < coding_arg_max)
9638 goto short_args;
068a9dbd 9639
df7492f9 9640 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
068a9dbd 9641
df7492f9
KH
9642 name = args[coding_arg_name];
9643 CHECK_SYMBOL (name);
9644 CODING_ATTR_BASE_NAME (attrs) = name;
068a9dbd 9645
df7492f9
KH
9646 val = args[coding_arg_mnemonic];
9647 if (! STRINGP (val))
9648 CHECK_CHARACTER (val);
9649 CODING_ATTR_MNEMONIC (attrs) = val;
068a9dbd 9650
df7492f9
KH
9651 coding_type = args[coding_arg_coding_type];
9652 CHECK_SYMBOL (coding_type);
9653 CODING_ATTR_TYPE (attrs) = coding_type;
068a9dbd 9654
df7492f9
KH
9655 charset_list = args[coding_arg_charset_list];
9656 if (SYMBOLP (charset_list))
9657 {
9658 if (EQ (charset_list, Qiso_2022))
9659 {
9660 if (! EQ (coding_type, Qiso_2022))
9661 error ("Invalid charset-list");
9662 charset_list = Viso_2022_charset_list;
9663 }
9664 else if (EQ (charset_list, Qemacs_mule))
9665 {
9666 if (! EQ (coding_type, Qemacs_mule))
9667 error ("Invalid charset-list");
9668 charset_list = Vemacs_mule_charset_list;
9669 }
9670 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9671 if (max_charset_id < XFASTINT (XCAR (tail)))
9672 max_charset_id = XFASTINT (XCAR (tail));
9673 }
068a9dbd
KH
9674 else
9675 {
df7492f9 9676 charset_list = Fcopy_sequence (charset_list);
985773c9 9677 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
068a9dbd 9678 {
df7492f9
KH
9679 struct charset *charset;
9680
985773c9 9681 val = XCAR (tail);
df7492f9
KH
9682 CHECK_CHARSET_GET_CHARSET (val, charset);
9683 if (EQ (coding_type, Qiso_2022)
9684 ? CHARSET_ISO_FINAL (charset) < 0
9685 : EQ (coding_type, Qemacs_mule)
9686 ? CHARSET_EMACS_MULE_ID (charset) < 0
9687 : 0)
9688 error ("Can't handle charset `%s'",
8f924df7 9689 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
df7492f9 9690
8f924df7 9691 XSETCAR (tail, make_number (charset->id));
df7492f9
KH
9692 if (max_charset_id < charset->id)
9693 max_charset_id = charset->id;
068a9dbd
KH
9694 }
9695 }
df7492f9 9696 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
068a9dbd 9697
1b3b981b
AS
9698 safe_charsets = make_uninit_string (max_charset_id + 1);
9699 memset (SDATA (safe_charsets), 255, max_charset_id + 1);
df7492f9 9700 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8f924df7 9701 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
df7492f9 9702 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
068a9dbd 9703
584948ac 9704 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
3a73fa5d 9705
df7492f9 9706 val = args[coding_arg_decode_translation_table];
a6f87d34 9707 if (! CHAR_TABLE_P (val) && ! CONSP (val))
7d64c6ad 9708 CHECK_SYMBOL (val);
df7492f9 9709 CODING_ATTR_DECODE_TBL (attrs) = val;
3a73fa5d 9710
df7492f9 9711 val = args[coding_arg_encode_translation_table];
a6f87d34 9712 if (! CHAR_TABLE_P (val) && ! CONSP (val))
7d64c6ad 9713 CHECK_SYMBOL (val);
df7492f9 9714 CODING_ATTR_ENCODE_TBL (attrs) = val;
d46c5b12 9715
df7492f9
KH
9716 val = args[coding_arg_post_read_conversion];
9717 CHECK_SYMBOL (val);
9718 CODING_ATTR_POST_READ (attrs) = val;
d46c5b12 9719
df7492f9
KH
9720 val = args[coding_arg_pre_write_conversion];
9721 CHECK_SYMBOL (val);
9722 CODING_ATTR_PRE_WRITE (attrs) = val;
3a73fa5d 9723
df7492f9
KH
9724 val = args[coding_arg_default_char];
9725 if (NILP (val))
9726 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
9727 else
9728 {
8f924df7 9729 CHECK_CHARACTER (val);
df7492f9
KH
9730 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9731 }
4031e2bf 9732
8f924df7
KH
9733 val = args[coding_arg_for_unibyte];
9734 CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
3a73fa5d 9735
df7492f9
KH
9736 val = args[coding_arg_plist];
9737 CHECK_LIST (val);
9738 CODING_ATTR_PLIST (attrs) = val;
3a73fa5d 9739
df7492f9
KH
9740 if (EQ (coding_type, Qcharset))
9741 {
c7c66a95
KH
9742 /* Generate a lisp vector of 256 elements. Each element is nil,
9743 integer, or a list of charset IDs.
3a73fa5d 9744
c7c66a95
KH
9745 If Nth element is nil, the byte code N is invalid in this
9746 coding system.
4ed46869 9747
c7c66a95
KH
9748 If Nth element is a number NUM, N is the first byte of a
9749 charset whose ID is NUM.
4ed46869 9750
c7c66a95
KH
9751 If Nth element is a list of charset IDs, N is the first byte
9752 of one of them. The list is sorted by dimensions of the
2bc515e4 9753 charsets. A charset of smaller dimension comes first. */
df7492f9 9754 val = Fmake_vector (make_number (256), Qnil);
4ed46869 9755
5c99c2e6 9756 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
df7492f9 9757 {
c7c66a95
KH
9758 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9759 int dim = CHARSET_DIMENSION (charset);
9760 int idx = (dim - 1) * 4;
4ed46869 9761
5c99c2e6 9762 if (CHARSET_ASCII_COMPATIBLE_P (charset))
584948ac 9763 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
4031e2bf 9764
15d143f7
KH
9765 for (i = charset->code_space[idx];
9766 i <= charset->code_space[idx + 1]; i++)
9767 {
c7c66a95
KH
9768 Lisp_Object tmp, tmp2;
9769 int dim2;
ec6d2bb8 9770
c7c66a95
KH
9771 tmp = AREF (val, i);
9772 if (NILP (tmp))
9773 tmp = XCAR (tail);
9774 else if (NUMBERP (tmp))
9775 {
9776 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9777 if (dim < dim2)
c7c66a95 9778 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
f9d71dcd
KH
9779 else
9780 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
c7c66a95 9781 }
15d143f7 9782 else
c7c66a95
KH
9783 {
9784 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9785 {
9786 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9787 if (dim < dim2)
9788 break;
9789 }
9790 if (NILP (tmp2))
9791 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9792 else
9793 {
9794 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9795 XSETCAR (tmp2, XCAR (tail));
9796 }
9797 }
9798 ASET (val, i, tmp);
15d143f7 9799 }
df7492f9
KH
9800 }
9801 ASET (attrs, coding_attr_charset_valids, val);
9802 category = coding_category_charset;
9803 }
9804 else if (EQ (coding_type, Qccl))
9805 {
9806 Lisp_Object valids;
ecec61c1 9807
df7492f9
KH
9808 if (nargs < coding_arg_ccl_max)
9809 goto short_args;
ecec61c1 9810
df7492f9
KH
9811 val = args[coding_arg_ccl_decoder];
9812 CHECK_CCL_PROGRAM (val);
9813 if (VECTORP (val))
9814 val = Fcopy_sequence (val);
9815 ASET (attrs, coding_attr_ccl_decoder, val);
ecec61c1 9816
df7492f9
KH
9817 val = args[coding_arg_ccl_encoder];
9818 CHECK_CCL_PROGRAM (val);
9819 if (VECTORP (val))
9820 val = Fcopy_sequence (val);
9821 ASET (attrs, coding_attr_ccl_encoder, val);
ecec61c1 9822
df7492f9
KH
9823 val = args[coding_arg_ccl_valids];
9824 valids = Fmake_string (make_number (256), make_number (0));
9825 for (tail = val; !NILP (tail); tail = Fcdr (tail))
9826 {
8dcbea82 9827 int from, to;
ecec61c1 9828
df7492f9
KH
9829 val = Fcar (tail);
9830 if (INTEGERP (val))
8dcbea82
KH
9831 {
9832 from = to = XINT (val);
9833 if (from < 0 || from > 255)
9834 args_out_of_range_3 (val, make_number (0), make_number (255));
9835 }
df7492f9
KH
9836 else
9837 {
df7492f9 9838 CHECK_CONS (val);
8f924df7
KH
9839 CHECK_NATNUM_CAR (val);
9840 CHECK_NATNUM_CDR (val);
df7492f9 9841 from = XINT (XCAR (val));
8f924df7 9842 if (from > 255)
8dcbea82
KH
9843 args_out_of_range_3 (XCAR (val),
9844 make_number (0), make_number (255));
df7492f9 9845 to = XINT (XCDR (val));
8dcbea82
KH
9846 if (to < from || to > 255)
9847 args_out_of_range_3 (XCDR (val),
9848 XCAR (val), make_number (255));
df7492f9 9849 }
8dcbea82 9850 for (i = from; i <= to; i++)
8f924df7 9851 SSET (valids, i, 1);
df7492f9
KH
9852 }
9853 ASET (attrs, coding_attr_ccl_valids, valids);
4ed46869 9854
df7492f9 9855 category = coding_category_ccl;
55ab7be3 9856 }
df7492f9 9857 else if (EQ (coding_type, Qutf_16))
55ab7be3 9858 {
df7492f9 9859 Lisp_Object bom, endian;
4ed46869 9860
584948ac 9861 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
4ed46869 9862
df7492f9
KH
9863 if (nargs < coding_arg_utf16_max)
9864 goto short_args;
4ed46869 9865
df7492f9
KH
9866 bom = args[coding_arg_utf16_bom];
9867 if (! NILP (bom) && ! EQ (bom, Qt))
9868 {
9869 CHECK_CONS (bom);
8f924df7
KH
9870 val = XCAR (bom);
9871 CHECK_CODING_SYSTEM (val);
9872 val = XCDR (bom);
9873 CHECK_CODING_SYSTEM (val);
df7492f9 9874 }
a470d443 9875 ASET (attrs, coding_attr_utf_bom, bom);
df7492f9
KH
9876
9877 endian = args[coding_arg_utf16_endian];
b49a1807
KH
9878 CHECK_SYMBOL (endian);
9879 if (NILP (endian))
9880 endian = Qbig;
9881 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8f924df7 9882 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
df7492f9
KH
9883 ASET (attrs, coding_attr_utf_16_endian, endian);
9884
9885 category = (CONSP (bom)
9886 ? coding_category_utf_16_auto
9887 : NILP (bom)
b49a1807 9888 ? (EQ (endian, Qbig)
df7492f9
KH
9889 ? coding_category_utf_16_be_nosig
9890 : coding_category_utf_16_le_nosig)
b49a1807 9891 : (EQ (endian, Qbig)
df7492f9
KH
9892 ? coding_category_utf_16_be
9893 : coding_category_utf_16_le));
9894 }
9895 else if (EQ (coding_type, Qiso_2022))
9896 {
9897 Lisp_Object initial, reg_usage, request, flags;
4776e638 9898 int i;
1397dc18 9899
df7492f9
KH
9900 if (nargs < coding_arg_iso2022_max)
9901 goto short_args;
9902
9903 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9904 CHECK_VECTOR (initial);
9905 for (i = 0; i < 4; i++)
9906 {
9907 val = Faref (initial, make_number (i));
9908 if (! NILP (val))
9909 {
584948ac
KH
9910 struct charset *charset;
9911
9912 CHECK_CHARSET_GET_CHARSET (val, charset);
9913 ASET (initial, i, make_number (CHARSET_ID (charset)));
9914 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9915 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9
KH
9916 }
9917 else
9918 ASET (initial, i, make_number (-1));
9919 }
9920
9921 reg_usage = args[coding_arg_iso2022_reg_usage];
9922 CHECK_CONS (reg_usage);
8f924df7
KH
9923 CHECK_NUMBER_CAR (reg_usage);
9924 CHECK_NUMBER_CDR (reg_usage);
df7492f9
KH
9925
9926 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9927 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
1397dc18 9928 {
df7492f9 9929 int id;
8f924df7 9930 Lisp_Object tmp;
df7492f9
KH
9931
9932 val = Fcar (tail);
9933 CHECK_CONS (val);
8f924df7
KH
9934 tmp = XCAR (val);
9935 CHECK_CHARSET_GET_ID (tmp, id);
9936 CHECK_NATNUM_CDR (val);
df7492f9
KH
9937 if (XINT (XCDR (val)) >= 4)
9938 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8f924df7 9939 XSETCAR (val, make_number (id));
1397dc18 9940 }
4ed46869 9941
df7492f9
KH
9942 flags = args[coding_arg_iso2022_flags];
9943 CHECK_NATNUM (flags);
9944 i = XINT (flags);
9945 if (EQ (args[coding_arg_charset_list], Qiso_2022))
9946 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
9947
9948 ASET (attrs, coding_attr_iso_initial, initial);
9949 ASET (attrs, coding_attr_iso_usage, reg_usage);
9950 ASET (attrs, coding_attr_iso_request, request);
9951 ASET (attrs, coding_attr_iso_flags, flags);
9952 setup_iso_safe_charsets (attrs);
9953
9954 if (i & CODING_ISO_FLAG_SEVEN_BITS)
9955 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9956 | CODING_ISO_FLAG_SINGLE_SHIFT))
9957 ? coding_category_iso_7_else
9958 : EQ (args[coding_arg_charset_list], Qiso_2022)
9959 ? coding_category_iso_7
9960 : coding_category_iso_7_tight);
9961 else
9962 {
9963 int id = XINT (AREF (initial, 1));
9964
c6fb6e98 9965 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
df7492f9
KH
9966 || EQ (args[coding_arg_charset_list], Qiso_2022)
9967 || id < 0)
9968 ? coding_category_iso_8_else
9969 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9970 ? coding_category_iso_8_1
9971 : coding_category_iso_8_2);
9972 }
0ce7886f
KH
9973 if (category != coding_category_iso_8_1
9974 && category != coding_category_iso_8_2)
9975 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
df7492f9
KH
9976 }
9977 else if (EQ (coding_type, Qemacs_mule))
c28a9453 9978 {
df7492f9
KH
9979 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9980 ASET (attrs, coding_attr_emacs_mule_full, Qt);
584948ac 9981 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9 9982 category = coding_category_emacs_mule;
c28a9453 9983 }
df7492f9 9984 else if (EQ (coding_type, Qshift_jis))
c28a9453 9985 {
df7492f9
KH
9986
9987 struct charset *charset;
9988
7d64c6ad 9989 if (XINT (Flength (charset_list)) != 3
6e07c25f 9990 && XINT (Flength (charset_list)) != 4)
7d64c6ad 9991 error ("There should be three or four charsets");
df7492f9
KH
9992
9993 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9994 if (CHARSET_DIMENSION (charset) != 1)
9995 error ("Dimension of charset %s is not one",
8f924df7 9996 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
584948ac
KH
9997 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9998 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9
KH
9999
10000 charset_list = XCDR (charset_list);
10001 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10002 if (CHARSET_DIMENSION (charset) != 1)
10003 error ("Dimension of charset %s is not one",
8f924df7 10004 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
df7492f9
KH
10005
10006 charset_list = XCDR (charset_list);
10007 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10008 if (CHARSET_DIMENSION (charset) != 2)
7d64c6ad
KH
10009 error ("Dimension of charset %s is not two",
10010 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10011
10012 charset_list = XCDR (charset_list);
2b917a06
KH
10013 if (! NILP (charset_list))
10014 {
10015 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10016 if (CHARSET_DIMENSION (charset) != 2)
10017 error ("Dimension of charset %s is not two",
10018 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10019 }
df7492f9
KH
10020
10021 category = coding_category_sjis;
10022 Vsjis_coding_system = name;
c28a9453 10023 }
df7492f9
KH
10024 else if (EQ (coding_type, Qbig5))
10025 {
10026 struct charset *charset;
4ed46869 10027
df7492f9
KH
10028 if (XINT (Flength (charset_list)) != 2)
10029 error ("There should be just two charsets");
10030
10031 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10032 if (CHARSET_DIMENSION (charset) != 1)
10033 error ("Dimension of charset %s is not one",
8f924df7 10034 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
584948ac
KH
10035 if (CHARSET_ASCII_COMPATIBLE_P (charset))
10036 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9
KH
10037
10038 charset_list = XCDR (charset_list);
10039 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10040 if (CHARSET_DIMENSION (charset) != 2)
10041 error ("Dimension of charset %s is not two",
8f924df7 10042 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
4ed46869 10043
df7492f9
KH
10044 category = coding_category_big5;
10045 Vbig5_coding_system = name;
10046 }
10047 else if (EQ (coding_type, Qraw_text))
c28a9453 10048 {
584948ac
KH
10049 category = coding_category_raw_text;
10050 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
c28a9453 10051 }
df7492f9 10052 else if (EQ (coding_type, Qutf_8))
4ed46869 10053 {
a470d443
KH
10054 Lisp_Object bom;
10055
584948ac 10056 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
a470d443
KH
10057
10058 if (nargs < coding_arg_utf8_max)
10059 goto short_args;
10060
10061 bom = args[coding_arg_utf8_bom];
10062 if (! NILP (bom) && ! EQ (bom, Qt))
10063 {
10064 CHECK_CONS (bom);
10065 val = XCAR (bom);
10066 CHECK_CODING_SYSTEM (val);
10067 val = XCDR (bom);
10068 CHECK_CODING_SYSTEM (val);
10069 }
10070 ASET (attrs, coding_attr_utf_bom, bom);
10071
10072 category = (CONSP (bom) ? coding_category_utf_8_auto
10073 : NILP (bom) ? coding_category_utf_8_nosig
10074 : coding_category_utf_8_sig);
4ed46869 10075 }
df7492f9
KH
10076 else if (EQ (coding_type, Qundecided))
10077 category = coding_category_undecided;
4ed46869 10078 else
df7492f9 10079 error ("Invalid coding system type: %s",
8f924df7 10080 SDATA (SYMBOL_NAME (coding_type)));
4ed46869 10081
df7492f9 10082 CODING_ATTR_CATEGORY (attrs) = make_number (category);
01378f49
KH
10083 CODING_ATTR_PLIST (attrs)
10084 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
10085 CODING_ATTR_PLIST (attrs)));
35befdaa 10086 CODING_ATTR_PLIST (attrs)
3ed051d4 10087 = Fcons (QCascii_compatible_p,
35befdaa
KH
10088 Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10089 CODING_ATTR_PLIST (attrs)));
c4825358 10090
df7492f9
KH
10091 eol_type = args[coding_arg_eol_type];
10092 if (! NILP (eol_type)
10093 && ! EQ (eol_type, Qunix)
10094 && ! EQ (eol_type, Qdos)
10095 && ! EQ (eol_type, Qmac))
10096 error ("Invalid eol-type");
4ed46869 10097
df7492f9 10098 aliases = Fcons (name, Qnil);
4ed46869 10099
df7492f9
KH
10100 if (NILP (eol_type))
10101 {
10102 eol_type = make_subsidiaries (name);
10103 for (i = 0; i < 3; i++)
1397dc18 10104 {
df7492f9
KH
10105 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10106
10107 this_name = AREF (eol_type, i);
10108 this_aliases = Fcons (this_name, Qnil);
10109 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10110 this_spec = Fmake_vector (make_number (3), attrs);
10111 ASET (this_spec, 1, this_aliases);
10112 ASET (this_spec, 2, this_eol_type);
10113 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10114 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
583f71ca
KH
10115 val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10116 if (NILP (val))
10117 Vcoding_system_alist
10118 = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10119 Vcoding_system_alist);
1397dc18 10120 }
d46c5b12 10121 }
4ed46869 10122
df7492f9
KH
10123 spec_vec = Fmake_vector (make_number (3), attrs);
10124 ASET (spec_vec, 1, aliases);
10125 ASET (spec_vec, 2, eol_type);
48b0f3ae 10126
df7492f9
KH
10127 Fputhash (name, spec_vec, Vcoding_system_hash_table);
10128 Vcoding_system_list = Fcons (name, Vcoding_system_list);
583f71ca
KH
10129 val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10130 if (NILP (val))
10131 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10132 Vcoding_system_alist);
48b0f3ae 10133
df7492f9
KH
10134 {
10135 int id = coding_categories[category].id;
48b0f3ae 10136
df7492f9
KH
10137 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10138 setup_coding_system (name, &coding_categories[category]);
10139 }
48b0f3ae 10140
d46c5b12 10141 return Qnil;
48b0f3ae 10142
df7492f9
KH
10143 short_args:
10144 return Fsignal (Qwrong_number_of_arguments,
10145 Fcons (intern ("define-coding-system-internal"),
10146 make_number (nargs)));
d46c5b12 10147}
4ed46869 10148
d6925f38 10149
a6f87d34
KH
10150DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10151 3, 3, 0,
10152 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
10153 (coding_system, prop, val)
10154 Lisp_Object coding_system, prop, val;
10155{
3dbe7859 10156 Lisp_Object spec, attrs;
a6f87d34
KH
10157
10158 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10159 attrs = AREF (spec, 0);
10160 if (EQ (prop, QCmnemonic))
10161 {
10162 if (! STRINGP (val))
10163 CHECK_CHARACTER (val);
10164 CODING_ATTR_MNEMONIC (attrs) = val;
10165 }
2133e2d1 10166 else if (EQ (prop, QCdefault_char))
a6f87d34
KH
10167 {
10168 if (NILP (val))
10169 val = make_number (' ');
10170 else
10171 CHECK_CHARACTER (val);
10172 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
10173 }
10174 else if (EQ (prop, QCdecode_translation_table))
10175 {
10176 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10177 CHECK_SYMBOL (val);
10178 CODING_ATTR_DECODE_TBL (attrs) = val;
10179 }
10180 else if (EQ (prop, QCencode_translation_table))
10181 {
10182 if (! CHAR_TABLE_P (val) && ! CONSP (val))
10183 CHECK_SYMBOL (val);
10184 CODING_ATTR_ENCODE_TBL (attrs) = val;
10185 }
10186 else if (EQ (prop, QCpost_read_conversion))
10187 {
10188 CHECK_SYMBOL (val);
10189 CODING_ATTR_POST_READ (attrs) = val;
10190 }
10191 else if (EQ (prop, QCpre_write_conversion))
10192 {
10193 CHECK_SYMBOL (val);
10194 CODING_ATTR_PRE_WRITE (attrs) = val;
10195 }
35befdaa
KH
10196 else if (EQ (prop, QCascii_compatible_p))
10197 {
10198 CODING_ATTR_ASCII_COMPAT (attrs) = val;
10199 }
a6f87d34
KH
10200
10201 CODING_ATTR_PLIST (attrs)
10202 = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
10203 return val;
10204}
10205
10206
df7492f9
KH
10207DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10208 Sdefine_coding_system_alias, 2, 2, 0,
10209 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
10210 (alias, coding_system)
10211 Lisp_Object alias, coding_system;
66cfb530 10212{
583f71ca 10213 Lisp_Object spec, aliases, eol_type, val;
4ed46869 10214
df7492f9
KH
10215 CHECK_SYMBOL (alias);
10216 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10217 aliases = AREF (spec, 1);
d4a1d553 10218 /* ALIASES should be a list of length more than zero, and the first
d6925f38
KH
10219 element is a base coding system. Append ALIAS at the tail of the
10220 list. */
df7492f9
KH
10221 while (!NILP (XCDR (aliases)))
10222 aliases = XCDR (aliases);
8f924df7 10223 XSETCDR (aliases, Fcons (alias, Qnil));
4ed46869 10224
df7492f9
KH
10225 eol_type = AREF (spec, 2);
10226 if (VECTORP (eol_type))
4ed46869 10227 {
df7492f9
KH
10228 Lisp_Object subsidiaries;
10229 int i;
4ed46869 10230
df7492f9
KH
10231 subsidiaries = make_subsidiaries (alias);
10232 for (i = 0; i < 3; i++)
10233 Fdefine_coding_system_alias (AREF (subsidiaries, i),
10234 AREF (eol_type, i));
4ed46869 10235 }
df7492f9
KH
10236
10237 Fputhash (alias, spec, Vcoding_system_hash_table);
d6925f38 10238 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
583f71ca
KH
10239 val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10240 if (NILP (val))
10241 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10242 Vcoding_system_alist);
66cfb530 10243
4ed46869
KH
10244 return Qnil;
10245}
10246
df7492f9
KH
10247DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10248 1, 1, 0,
10249 doc: /* Return the base of CODING-SYSTEM.
da7db224 10250Any alias or subsidiary coding system is not a base coding system. */)
df7492f9
KH
10251 (coding_system)
10252 Lisp_Object coding_system;
d46c5b12 10253{
df7492f9 10254 Lisp_Object spec, attrs;
d46c5b12 10255
df7492f9
KH
10256 if (NILP (coding_system))
10257 return (Qno_conversion);
10258 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10259 attrs = AREF (spec, 0);
10260 return CODING_ATTR_BASE_NAME (attrs);
10261}
1397dc18 10262
df7492f9
KH
10263DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10264 1, 1, 0,
10265 doc: "Return the property list of CODING-SYSTEM.")
10266 (coding_system)
10267 Lisp_Object coding_system;
10268{
10269 Lisp_Object spec, attrs;
1397dc18 10270
df7492f9
KH
10271 if (NILP (coding_system))
10272 coding_system = Qno_conversion;
10273 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10274 attrs = AREF (spec, 0);
10275 return CODING_ATTR_PLIST (attrs);
d46c5b12
KH
10276}
10277
df7492f9
KH
10278
10279DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10280 1, 1, 0,
da7db224 10281 doc: /* Return the list of aliases of CODING-SYSTEM. */)
df7492f9
KH
10282 (coding_system)
10283 Lisp_Object coding_system;
66cfb530 10284{
df7492f9 10285 Lisp_Object spec;
84d60297 10286
df7492f9
KH
10287 if (NILP (coding_system))
10288 coding_system = Qno_conversion;
10289 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
da7db224 10290 return AREF (spec, 1);
df7492f9 10291}
66cfb530 10292
df7492f9
KH
10293DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10294 Scoding_system_eol_type, 1, 1, 0,
10295 doc: /* Return eol-type of CODING-SYSTEM.
d4a1d553 10296An eol-type is an integer 0, 1, 2, or a vector of coding systems.
66cfb530 10297
df7492f9
KH
10298Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10299and CR respectively.
66cfb530 10300
df7492f9
KH
10301A vector value indicates that a format of end-of-line should be
10302detected automatically. Nth element of the vector is the subsidiary
10303coding system whose eol-type is N. */)
6b89e3aa
KH
10304 (coding_system)
10305 Lisp_Object coding_system;
10306{
df7492f9
KH
10307 Lisp_Object spec, eol_type;
10308 int n;
6b89e3aa 10309
df7492f9
KH
10310 if (NILP (coding_system))
10311 coding_system = Qno_conversion;
10312 if (! CODING_SYSTEM_P (coding_system))
10313 return Qnil;
10314 spec = CODING_SYSTEM_SPEC (coding_system);
10315 eol_type = AREF (spec, 2);
10316 if (VECTORP (eol_type))
10317 return Fcopy_sequence (eol_type);
10318 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10319 return make_number (n);
6b89e3aa
KH
10320}
10321
4ed46869
KH
10322#endif /* emacs */
10323
10324\f
1397dc18 10325/*** 12. Postamble ***/
4ed46869 10326
dfcf069d 10327void
4ed46869
KH
10328init_coding_once ()
10329{
10330 int i;
10331
df7492f9
KH
10332 for (i = 0; i < coding_category_max; i++)
10333 {
10334 coding_categories[i].id = -1;
10335 coding_priorities[i] = i;
10336 }
4ed46869
KH
10337
10338 /* ISO2022 specific initialize routine. */
10339 for (i = 0; i < 0x20; i++)
b73bfc1c 10340 iso_code_class[i] = ISO_control_0;
4ed46869
KH
10341 for (i = 0x21; i < 0x7F; i++)
10342 iso_code_class[i] = ISO_graphic_plane_0;
10343 for (i = 0x80; i < 0xA0; i++)
b73bfc1c 10344 iso_code_class[i] = ISO_control_1;
4ed46869
KH
10345 for (i = 0xA1; i < 0xFF; i++)
10346 iso_code_class[i] = ISO_graphic_plane_1;
10347 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10348 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
4ed46869
KH
10349 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10350 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10351 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10352 iso_code_class[ISO_CODE_ESC] = ISO_escape;
10353 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10354 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10355 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10356
df7492f9
KH
10357 for (i = 0; i < 256; i++)
10358 {
10359 emacs_mule_bytes[i] = 1;
10360 }
7c78e542
KH
10361 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10362 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10363 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10364 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
e0e989f6
KH
10365}
10366
10367#ifdef emacs
10368
dfcf069d 10369void
e0e989f6
KH
10370syms_of_coding ()
10371{
df7492f9 10372 staticpro (&Vcoding_system_hash_table);
8f924df7
KH
10373 {
10374 Lisp_Object args[2];
10375 args[0] = QCtest;
10376 args[1] = Qeq;
10377 Vcoding_system_hash_table = Fmake_hash_table (2, args);
10378 }
df7492f9
KH
10379
10380 staticpro (&Vsjis_coding_system);
10381 Vsjis_coding_system = Qnil;
e0e989f6 10382
df7492f9
KH
10383 staticpro (&Vbig5_coding_system);
10384 Vbig5_coding_system = Qnil;
10385
24a73b0a
KH
10386 staticpro (&Vcode_conversion_reused_workbuf);
10387 Vcode_conversion_reused_workbuf = Qnil;
10388
10389 staticpro (&Vcode_conversion_workbuf_name);
10390 Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
e0e989f6 10391
24a73b0a 10392 reused_workbuf_in_use = 0;
df7492f9
KH
10393
10394 DEFSYM (Qcharset, "charset");
10395 DEFSYM (Qtarget_idx, "target-idx");
10396 DEFSYM (Qcoding_system_history, "coding-system-history");
bb0115a2
RS
10397 Fset (Qcoding_system_history, Qnil);
10398
9ce27fde 10399 /* Target FILENAME is the first argument. */
e0e989f6 10400 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 10401 /* Target FILENAME is the third argument. */
e0e989f6
KH
10402 Fput (Qwrite_region, Qtarget_idx, make_number (2));
10403
df7492f9 10404 DEFSYM (Qcall_process, "call-process");
9ce27fde 10405 /* Target PROGRAM is the first argument. */
e0e989f6
KH
10406 Fput (Qcall_process, Qtarget_idx, make_number (0));
10407
df7492f9 10408 DEFSYM (Qcall_process_region, "call-process-region");
9ce27fde 10409 /* Target PROGRAM is the third argument. */
e0e989f6
KH
10410 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10411
df7492f9 10412 DEFSYM (Qstart_process, "start-process");
9ce27fde 10413 /* Target PROGRAM is the third argument. */
e0e989f6
KH
10414 Fput (Qstart_process, Qtarget_idx, make_number (2));
10415
df7492f9 10416 DEFSYM (Qopen_network_stream, "open-network-stream");
9ce27fde 10417 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
10418 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10419
df7492f9
KH
10420 DEFSYM (Qcoding_system, "coding-system");
10421 DEFSYM (Qcoding_aliases, "coding-aliases");
4ed46869 10422
df7492f9
KH
10423 DEFSYM (Qeol_type, "eol-type");
10424 DEFSYM (Qunix, "unix");
10425 DEFSYM (Qdos, "dos");
4ed46869 10426
df7492f9
KH
10427 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10428 DEFSYM (Qpost_read_conversion, "post-read-conversion");
10429 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10430 DEFSYM (Qdefault_char, "default-char");
10431 DEFSYM (Qundecided, "undecided");
10432 DEFSYM (Qno_conversion, "no-conversion");
10433 DEFSYM (Qraw_text, "raw-text");
4ed46869 10434
df7492f9 10435 DEFSYM (Qiso_2022, "iso-2022");
4ed46869 10436
df7492f9 10437 DEFSYM (Qutf_8, "utf-8");
8f924df7 10438 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
27901516 10439
df7492f9 10440 DEFSYM (Qutf_16, "utf-16");
df7492f9
KH
10441 DEFSYM (Qbig, "big");
10442 DEFSYM (Qlittle, "little");
27901516 10443
df7492f9
KH
10444 DEFSYM (Qshift_jis, "shift-jis");
10445 DEFSYM (Qbig5, "big5");
4ed46869 10446
df7492f9 10447 DEFSYM (Qcoding_system_p, "coding-system-p");
4ed46869 10448
df7492f9 10449 DEFSYM (Qcoding_system_error, "coding-system-error");
4ed46869
KH
10450 Fput (Qcoding_system_error, Qerror_conditions,
10451 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
10452 Fput (Qcoding_system_error, Qerror_message,
9ce27fde 10453 build_string ("Invalid coding system"));
4ed46869 10454
05e6f5dc
KH
10455 /* Intern this now in case it isn't already done.
10456 Setting this variable twice is harmless.
10457 But don't staticpro it here--that is done in alloc.c. */
10458 Qchar_table_extra_slots = intern ("char-table-extra-slots");
70c22245 10459
df7492f9 10460 DEFSYM (Qtranslation_table, "translation-table");
433f7f87 10461 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
df7492f9
KH
10462 DEFSYM (Qtranslation_table_id, "translation-table-id");
10463 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10464 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
1397dc18 10465
df7492f9 10466 DEFSYM (Qvalid_codes, "valid-codes");
9ce27fde 10467
df7492f9 10468 DEFSYM (Qemacs_mule, "emacs-mule");
d46c5b12 10469
01378f49 10470 DEFSYM (QCcategory, ":category");
a6f87d34 10471 DEFSYM (QCmnemonic, ":mnemonic");
2133e2d1 10472 DEFSYM (QCdefault_char, ":default-char");
a6f87d34
KH
10473 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10474 DEFSYM (QCencode_translation_table, ":encode-translation-table");
10475 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10476 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
35befdaa 10477 DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
01378f49 10478
df7492f9
KH
10479 Vcoding_category_table
10480 = Fmake_vector (make_number (coding_category_max), Qnil);
10481 staticpro (&Vcoding_category_table);
10482 /* Followings are target of code detection. */
10483 ASET (Vcoding_category_table, coding_category_iso_7,
10484 intern ("coding-category-iso-7"));
10485 ASET (Vcoding_category_table, coding_category_iso_7_tight,
10486 intern ("coding-category-iso-7-tight"));
10487 ASET (Vcoding_category_table, coding_category_iso_8_1,
10488 intern ("coding-category-iso-8-1"));
10489 ASET (Vcoding_category_table, coding_category_iso_8_2,
10490 intern ("coding-category-iso-8-2"));
10491 ASET (Vcoding_category_table, coding_category_iso_7_else,
10492 intern ("coding-category-iso-7-else"));
10493 ASET (Vcoding_category_table, coding_category_iso_8_else,
10494 intern ("coding-category-iso-8-else"));
a470d443
KH
10495 ASET (Vcoding_category_table, coding_category_utf_8_auto,
10496 intern ("coding-category-utf-8-auto"));
10497 ASET (Vcoding_category_table, coding_category_utf_8_nosig,
df7492f9 10498 intern ("coding-category-utf-8"));
a470d443
KH
10499 ASET (Vcoding_category_table, coding_category_utf_8_sig,
10500 intern ("coding-category-utf-8-sig"));
df7492f9
KH
10501 ASET (Vcoding_category_table, coding_category_utf_16_be,
10502 intern ("coding-category-utf-16-be"));
ff563fce
KH
10503 ASET (Vcoding_category_table, coding_category_utf_16_auto,
10504 intern ("coding-category-utf-16-auto"));
df7492f9
KH
10505 ASET (Vcoding_category_table, coding_category_utf_16_le,
10506 intern ("coding-category-utf-16-le"));
10507 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10508 intern ("coding-category-utf-16-be-nosig"));
10509 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10510 intern ("coding-category-utf-16-le-nosig"));
10511 ASET (Vcoding_category_table, coding_category_charset,
10512 intern ("coding-category-charset"));
10513 ASET (Vcoding_category_table, coding_category_sjis,
10514 intern ("coding-category-sjis"));
10515 ASET (Vcoding_category_table, coding_category_big5,
10516 intern ("coding-category-big5"));
10517 ASET (Vcoding_category_table, coding_category_ccl,
10518 intern ("coding-category-ccl"));
10519 ASET (Vcoding_category_table, coding_category_emacs_mule,
10520 intern ("coding-category-emacs-mule"));
10521 /* Followings are NOT target of code detection. */
10522 ASET (Vcoding_category_table, coding_category_raw_text,
10523 intern ("coding-category-raw-text"));
10524 ASET (Vcoding_category_table, coding_category_undecided,
10525 intern ("coding-category-undecided"));
ecf488bc 10526
065e3595
KH
10527 DEFSYM (Qinsufficient_source, "insufficient-source");
10528 DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10529 DEFSYM (Qinvalid_source, "invalid-source");
10530 DEFSYM (Qinterrupted, "interrupted");
10531 DEFSYM (Qinsufficient_memory, "insufficient-memory");
44e8490d 10532 DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
065e3595 10533
4ed46869
KH
10534 defsubr (&Scoding_system_p);
10535 defsubr (&Sread_coding_system);
10536 defsubr (&Sread_non_nil_coding_system);
10537 defsubr (&Scheck_coding_system);
10538 defsubr (&Sdetect_coding_region);
d46c5b12 10539 defsubr (&Sdetect_coding_string);
05e6f5dc 10540 defsubr (&Sfind_coding_systems_region_internal);
068a9dbd 10541 defsubr (&Sunencodable_char_position);
df7492f9 10542 defsubr (&Scheck_coding_systems_region);
4ed46869
KH
10543 defsubr (&Sdecode_coding_region);
10544 defsubr (&Sencode_coding_region);
10545 defsubr (&Sdecode_coding_string);
10546 defsubr (&Sencode_coding_string);
10547 defsubr (&Sdecode_sjis_char);
10548 defsubr (&Sencode_sjis_char);
10549 defsubr (&Sdecode_big5_char);
10550 defsubr (&Sencode_big5_char);
1ba9e4ab 10551 defsubr (&Sset_terminal_coding_system_internal);
c4825358 10552 defsubr (&Sset_safe_terminal_coding_system_internal);
4ed46869 10553 defsubr (&Sterminal_coding_system);
1ba9e4ab 10554 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 10555 defsubr (&Skeyboard_coding_system);
a5d301df 10556 defsubr (&Sfind_operation_coding_system);
df7492f9 10557 defsubr (&Sset_coding_system_priority);
6b89e3aa 10558 defsubr (&Sdefine_coding_system_internal);
df7492f9 10559 defsubr (&Sdefine_coding_system_alias);
a6f87d34 10560 defsubr (&Scoding_system_put);
df7492f9
KH
10561 defsubr (&Scoding_system_base);
10562 defsubr (&Scoding_system_plist);
10563 defsubr (&Scoding_system_aliases);
10564 defsubr (&Scoding_system_eol_type);
10565 defsubr (&Scoding_system_priority_list);
4ed46869 10566
4608c386 10567 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
48b0f3ae
PJ
10568 doc: /* List of coding systems.
10569
10570Do not alter the value of this variable manually. This variable should be
df7492f9 10571updated by the functions `define-coding-system' and
48b0f3ae 10572`define-coding-system-alias'. */);
4608c386
KH
10573 Vcoding_system_list = Qnil;
10574
10575 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
48b0f3ae
PJ
10576 doc: /* Alist of coding system names.
10577Each element is one element list of coding system name.
446dcd75 10578This variable is given to `completing-read' as COLLECTION argument.
48b0f3ae
PJ
10579
10580Do not alter the value of this variable manually. This variable should be
10581updated by the functions `make-coding-system' and
10582`define-coding-system-alias'. */);
4608c386
KH
10583 Vcoding_system_alist = Qnil;
10584
4ed46869 10585 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
48b0f3ae
PJ
10586 doc: /* List of coding-categories (symbols) ordered by priority.
10587
10588On detecting a coding system, Emacs tries code detection algorithms
10589associated with each coding-category one by one in this order. When
10590one algorithm agrees with a byte sequence of source text, the coding
0ec31faf
KH
10591system bound to the corresponding coding-category is selected.
10592
42205607 10593Don't modify this variable directly, but use `set-coding-priority'. */);
4ed46869
KH
10594 {
10595 int i;
10596
10597 Vcoding_category_list = Qnil;
df7492f9 10598 for (i = coding_category_max - 1; i >= 0; i--)
4ed46869 10599 Vcoding_category_list
d46c5b12
KH
10600 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
10601 Vcoding_category_list);
4ed46869
KH
10602 }
10603
10604 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
48b0f3ae
PJ
10605 doc: /* Specify the coding system for read operations.
10606It is useful to bind this variable with `let', but do not set it globally.
10607If the value is a coding system, it is used for decoding on read operation.
446dcd75
JB
10608If not, an appropriate element is used from one of the coding system alists.
10609There are three such tables: `file-coding-system-alist',
48b0f3ae 10610`process-coding-system-alist', and `network-coding-system-alist'. */);
4ed46869
KH
10611 Vcoding_system_for_read = Qnil;
10612
10613 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
48b0f3ae
PJ
10614 doc: /* Specify the coding system for write operations.
10615Programs bind this variable with `let', but you should not set it globally.
10616If the value is a coding system, it is used for encoding of output,
10617when writing it to a file and when sending it to a file or subprocess.
10618
10619If this does not specify a coding system, an appropriate element
446dcd75
JB
10620is used from one of the coding system alists.
10621There are three such tables: `file-coding-system-alist',
48b0f3ae
PJ
10622`process-coding-system-alist', and `network-coding-system-alist'.
10623For output to files, if the above procedure does not specify a coding system,
10624the value of `buffer-file-coding-system' is used. */);
4ed46869
KH
10625 Vcoding_system_for_write = Qnil;
10626
10627 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
df7492f9
KH
10628 doc: /*
10629Coding system used in the latest file or process I/O. */);
4ed46869
KH
10630 Vlast_coding_system_used = Qnil;
10631
065e3595
KH
10632 DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
10633 doc: /*
10634Error status of the last code conversion.
10635
10636When an error was detected in the last code conversion, this variable
10637is set to one of the following symbols.
10638 `insufficient-source'
10639 `inconsistent-eol'
10640 `invalid-source'
10641 `interrupted'
10642 `insufficient-memory'
10643When no error was detected, the value doesn't change. So, to check
10644the error status of a code conversion by this variable, you must
10645explicitly set this variable to nil before performing code
10646conversion. */);
10647 Vlast_code_conversion_error = Qnil;
10648
9ce27fde 10649 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
df7492f9
KH
10650 doc: /*
10651*Non-nil means always inhibit code conversion of end-of-line format.
48b0f3ae
PJ
10652See info node `Coding Systems' and info node `Text and Binary' concerning
10653such conversion. */);
9ce27fde
KH
10654 inhibit_eol_conversion = 0;
10655
ed29121d 10656 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
df7492f9
KH
10657 doc: /*
10658Non-nil means process buffer inherits coding system of process output.
48b0f3ae
PJ
10659Bind it to t if the process output is to be treated as if it were a file
10660read from some filesystem. */);
ed29121d
EZ
10661 inherit_process_coding_system = 0;
10662
02ba4723 10663 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
df7492f9
KH
10664 doc: /*
10665Alist to decide a coding system to use for a file I/O operation.
48b0f3ae
PJ
10666The format is ((PATTERN . VAL) ...),
10667where PATTERN is a regular expression matching a file name,
10668VAL is a coding system, a cons of coding systems, or a function symbol.
10669If VAL is a coding system, it is used for both decoding and encoding
10670the file contents.
10671If VAL is a cons of coding systems, the car part is used for decoding,
10672and the cdr part is used for encoding.
10673If VAL is a function symbol, the function must return a coding system
2c53e699
KH
10674or a cons of coding systems which are used as above. The function is
10675called with an argument that is a list of the arguments with which
5a0bbd9a
KH
10676`find-operation-coding-system' was called. If the function can't decide
10677a coding system, it can return `undecided' so that the normal
10678code-detection is performed.
48b0f3ae
PJ
10679
10680See also the function `find-operation-coding-system'
10681and the variable `auto-coding-alist'. */);
02ba4723
KH
10682 Vfile_coding_system_alist = Qnil;
10683
10684 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
df7492f9
KH
10685 doc: /*
10686Alist to decide a coding system to use for a process I/O operation.
48b0f3ae
PJ
10687The format is ((PATTERN . VAL) ...),
10688where PATTERN is a regular expression matching a program name,
10689VAL is a coding system, a cons of coding systems, or a function symbol.
10690If VAL is a coding system, it is used for both decoding what received
10691from the program and encoding what sent to the program.
10692If VAL is a cons of coding systems, the car part is used for decoding,
10693and the cdr part is used for encoding.
10694If VAL is a function symbol, the function must return a coding system
10695or a cons of coding systems which are used as above.
10696
10697See also the function `find-operation-coding-system'. */);
02ba4723
KH
10698 Vprocess_coding_system_alist = Qnil;
10699
10700 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
df7492f9
KH
10701 doc: /*
10702Alist to decide a coding system to use for a network I/O operation.
48b0f3ae
PJ
10703The format is ((PATTERN . VAL) ...),
10704where PATTERN is a regular expression matching a network service name
10705or is a port number to connect to,
10706VAL is a coding system, a cons of coding systems, or a function symbol.
10707If VAL is a coding system, it is used for both decoding what received
10708from the network stream and encoding what sent to the network stream.
10709If VAL is a cons of coding systems, the car part is used for decoding,
10710and the cdr part is used for encoding.
10711If VAL is a function symbol, the function must return a coding system
10712or a cons of coding systems which are used as above.
10713
10714See also the function `find-operation-coding-system'. */);
02ba4723 10715 Vnetwork_coding_system_alist = Qnil;
4ed46869 10716
68c45bf0 10717 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
75205970
RS
10718 doc: /* Coding system to use with system messages.
10719Also used for decoding keyboard input on X Window system. */);
68c45bf0
PE
10720 Vlocale_coding_system = Qnil;
10721
005f0d35 10722 /* The eol mnemonics are reset in startup.el system-dependently. */
7722baf9 10723 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
df7492f9
KH
10724 doc: /*
10725*String displayed in mode line for UNIX-like (LF) end-of-line format. */);
7722baf9 10726 eol_mnemonic_unix = build_string (":");
4ed46869 10727
7722baf9 10728 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
df7492f9
KH
10729 doc: /*
10730*String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
7722baf9 10731 eol_mnemonic_dos = build_string ("\\");
4ed46869 10732
7722baf9 10733 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
df7492f9
KH
10734 doc: /*
10735*String displayed in mode line for MAC-like (CR) end-of-line format. */);
7722baf9 10736 eol_mnemonic_mac = build_string ("/");
4ed46869 10737
7722baf9 10738 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
df7492f9
KH
10739 doc: /*
10740*String displayed in mode line when end-of-line format is not yet determined. */);
7722baf9 10741 eol_mnemonic_undecided = build_string (":");
4ed46869 10742
84fbb8a0 10743 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
df7492f9
KH
10744 doc: /*
10745*Non-nil enables character translation while encoding and decoding. */);
84fbb8a0 10746 Venable_character_translation = Qt;
bdd9fb48 10747
f967223b 10748 DEFVAR_LISP ("standard-translation-table-for-decode",
48b0f3ae
PJ
10749 &Vstandard_translation_table_for_decode,
10750 doc: /* Table for translating characters while decoding. */);
f967223b 10751 Vstandard_translation_table_for_decode = Qnil;
bdd9fb48 10752
f967223b 10753 DEFVAR_LISP ("standard-translation-table-for-encode",
48b0f3ae
PJ
10754 &Vstandard_translation_table_for_encode,
10755 doc: /* Table for translating characters while encoding. */);
f967223b 10756 Vstandard_translation_table_for_encode = Qnil;
4ed46869 10757
df7492f9 10758 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
48b0f3ae
PJ
10759 doc: /* Alist of charsets vs revision numbers.
10760While encoding, if a charset (car part of an element) is found,
df7492f9
KH
10761designate it with the escape sequence identifying revision (cdr part
10762of the element). */);
10763 Vcharset_revision_table = Qnil;
02ba4723
KH
10764
10765 DEFVAR_LISP ("default-process-coding-system",
10766 &Vdefault_process_coding_system,
48b0f3ae
PJ
10767 doc: /* Cons of coding systems used for process I/O by default.
10768The car part is used for decoding a process output,
10769the cdr part is used for encoding a text to be sent to a process. */);
02ba4723 10770 Vdefault_process_coding_system = Qnil;
c4825358 10771
3f003981 10772 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
df7492f9
KH
10773 doc: /*
10774Table of extra Latin codes in the range 128..159 (inclusive).
48b0f3ae
PJ
10775This is a vector of length 256.
10776If Nth element is non-nil, the existence of code N in a file
10777\(or output of subprocess) doesn't prevent it from being detected as
10778a coding system of ISO 2022 variant which has a flag
10779`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10780or reading output of a subprocess.
446dcd75 10781Only 128th through 159th elements have a meaning. */);
3f003981 10782 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
d46c5b12
KH
10783
10784 DEFVAR_LISP ("select-safe-coding-system-function",
10785 &Vselect_safe_coding_system_function,
df7492f9
KH
10786 doc: /*
10787Function to call to select safe coding system for encoding a text.
48b0f3ae
PJ
10788
10789If set, this function is called to force a user to select a proper
10790coding system which can encode the text in the case that a default
fdecf907
GM
10791coding system used in each operation can't encode the text. The
10792function should take care that the buffer is not modified while
10793the coding system is being selected.
48b0f3ae
PJ
10794
10795The default value is `select-safe-coding-system' (which see). */);
d46c5b12
KH
10796 Vselect_safe_coding_system_function = Qnil;
10797
5d5bf4d8
KH
10798 DEFVAR_BOOL ("coding-system-require-warning",
10799 &coding_system_require_warning,
10800 doc: /* Internal use only.
6b89e3aa
KH
10801If non-nil, on writing a file, `select-safe-coding-system-function' is
10802called even if `coding-system-for-write' is non-nil. The command
10803`universal-coding-system-argument' binds this variable to t temporarily. */);
5d5bf4d8
KH
10804 coding_system_require_warning = 0;
10805
10806
22ab2303 10807 DEFVAR_BOOL ("inhibit-iso-escape-detection",
74383408 10808 &inhibit_iso_escape_detection,
df7492f9 10809 doc: /*
97b1b294 10810If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
48b0f3ae 10811
97b1b294
EZ
10812When Emacs reads text, it tries to detect how the text is encoded.
10813This code detection is sensitive to escape sequences. If Emacs sees
10814a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10815of the ISO2022 encodings, and decodes text by the corresponding coding
10816system (e.g. `iso-2022-7bit').
48b0f3ae
PJ
10817
10818However, there may be a case that you want to read escape sequences in
10819a file as is. In such a case, you can set this variable to non-nil.
97b1b294
EZ
10820Then the code detection will ignore any escape sequences, and no text is
10821detected as encoded in some ISO-2022 encoding. The result is that all
48b0f3ae
PJ
10822escape sequences become visible in a buffer.
10823
10824The default value is nil, and it is strongly recommended not to change
10825it. That is because many Emacs Lisp source files that contain
10826non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10827in Emacs's distribution, and they won't be decoded correctly on
10828reading if you suppress escape sequence detection.
10829
10830The other way to read escape sequences in a file without decoding is
97b1b294 10831to explicitly specify some coding system that doesn't use ISO-2022
48b0f3ae 10832escape sequence (e.g. `latin-1') on reading by \\[universal-coding-system-argument]. */);
74383408 10833 inhibit_iso_escape_detection = 0;
002fdb44 10834
97b1b294
EZ
10835 DEFVAR_BOOL ("inhibit-null-byte-detection",
10836 &inhibit_null_byte_detection,
10837 doc: /* If non-nil, Emacs ignores null bytes on code detection.
10838By default, Emacs treats text containing null bytes as binary data,
10839and does not attempt to decode it. The effect is as if you specified
10840`no-conversion' for reading that text.
10841
10842Set this to non-nil when a regular text happens to include null bytes.
10843Examples are Index nodes of Info files and null-byte delimited output
10844from GNU Find and GNU Grep. Emacs will then ignore the null bytes and
10845decode text as usual. */);
10846 inhibit_null_byte_detection = 0;
10847
002fdb44 10848 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
15c8f9d1 10849 doc: /* Char table for translating self-inserting characters.
446dcd75 10850This is applied to the result of input methods, not their input.
8434d0b8
EZ
10851See also `keyboard-translate-table'.
10852
10853Use of this variable for character code unification was rendered
10854obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10855internal character representation. */);
002fdb44 10856 Vtranslation_table_for_input = Qnil;
8f924df7 10857
2c78b7e1
KH
10858 {
10859 Lisp_Object args[coding_arg_max];
8f924df7 10860 Lisp_Object plist[16];
2c78b7e1
KH
10861 int i;
10862
10863 for (i = 0; i < coding_arg_max; i++)
10864 args[i] = Qnil;
10865
10866 plist[0] = intern (":name");
10867 plist[1] = args[coding_arg_name] = Qno_conversion;
10868 plist[2] = intern (":mnemonic");
10869 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10870 plist[4] = intern (":coding-type");
10871 plist[5] = args[coding_arg_coding_type] = Qraw_text;
10872 plist[6] = intern (":ascii-compatible-p");
10873 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10874 plist[8] = intern (":default-char");
10875 plist[9] = args[coding_arg_default_char] = make_number (0);
8f924df7
KH
10876 plist[10] = intern (":for-unibyte");
10877 plist[11] = args[coding_arg_for_unibyte] = Qt;
10878 plist[12] = intern (":docstring");
10879 plist[13] = build_string ("Do no conversion.\n\
2c78b7e1
KH
10880\n\
10881When you visit a file with this coding, the file is read into a\n\
10882unibyte buffer as is, thus each byte of a file is treated as a\n\
10883character.");
8f924df7
KH
10884 plist[14] = intern (":eol-type");
10885 plist[15] = args[coding_arg_eol_type] = Qunix;
10886 args[coding_arg_plist] = Flist (16, plist);
2c78b7e1 10887 Fdefine_coding_system_internal (coding_arg_max, args);
ae6f73fa
KH
10888
10889 plist[1] = args[coding_arg_name] = Qundecided;
10890 plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10891 plist[5] = args[coding_arg_coding_type] = Qundecided;
10892 /* This is already set.
35befdaa 10893 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
ae6f73fa
KH
10894 plist[8] = intern (":charset-list");
10895 plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10896 plist[11] = args[coding_arg_for_unibyte] = Qnil;
10897 plist[13] = build_string ("No conversion on encoding, automatic conversion on decoding.");
10898 plist[15] = args[coding_arg_eol_type] = Qnil;
10899 args[coding_arg_plist] = Flist (16, plist);
10900 Fdefine_coding_system_internal (coding_arg_max, args);
2c78b7e1
KH
10901 }
10902
2c78b7e1 10903 setup_coding_system (Qno_conversion, &safe_terminal_coding);
ff563fce
KH
10904
10905 {
10906 int i;
10907
10908 for (i = 0; i < coding_category_max; i++)
10909 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10910 }
fcbcfb64
KH
10911#if defined (MSDOS) || defined (WINDOWSNT)
10912 system_eol_type = Qdos;
10913#else
10914 system_eol_type = Qunix;
10915#endif
10916 staticpro (&system_eol_type);
4ed46869
KH
10917}
10918
68c45bf0
PE
/* Return the system error message for ERROR_NUMBER as a C string.

   The message is obtained from strerror after calling
   synchronize_system_messages_locale.  If Vlocale_coding_system is
   non-nil, the message is first wrapped in a Lisp string and passed
   through code_convert_string_norecord with that coding system; the
   data of the resulting Lisp string is returned instead.

   NOTE(review): in the converted case the returned pointer refers to
   the data of a Lisp string held only in a local variable of this
   function; presumably callers must consume the result before
   anything can trigger garbage collection -- confirm against callers.  */
10919char *
10920emacs_strerror (error_number)
10921 int error_number;
10922{
10923 char *str;
10924
/* Update the locale state for system messages before asking the C
   library for the text.  */
ca9c0567 10925 synchronize_system_messages_locale ();
68c45bf0
PE
10926 str = strerror (error_number);
10927
/* Convert only when a locale coding system has been configured.  */
10928 if (! NILP (Vlocale_coding_system))
10929 {
/* The final argument 0 presumably selects decoding rather than
   encoding (the result is named `dec') -- TODO confirm.  */
10930 Lisp_Object dec = code_convert_string_norecord (build_string (str),
10931 Vlocale_coding_system,
10932 0);
/* Hand back the raw bytes of the converted Lisp string.  */
d5db4077 10933 str = (char *) SDATA (dec);
68c45bf0
PE
10934 }
10935
10936 return str;
10937}
10938
4ed46869 10939#endif /* emacs */
9ffd559c
KH
10940
10941/* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
10942 (do not change this comment) */