(xfont_text_extents): Fix initial setting of metrics.
[bpt/emacs.git] / src / coding.c
CommitLineData
9542cb1f 1/* Coding system handler (conversion, detection, etc).
aaef169d 2 Copyright (C) 2001, 2002, 2003, 2004, 2005,
8cabe764 3 2006, 2007, 2008 Free Software Foundation, Inc.
7976eda0 4 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
8cabe764 5 2005, 2006, 2007, 2008
ce03bf76
KH
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H14PRO021
8f924df7 8 Copyright (C) 2003
df7492f9
KH
9 National Institute of Advanced Industrial Science and Technology (AIST)
10 Registration Number H13PRO009
4ed46869 11
369314dc
KH
12This file is part of GNU Emacs.
13
9ec0b715 14GNU Emacs is free software: you can redistribute it and/or modify
369314dc 15it under the terms of the GNU General Public License as published by
9ec0b715
GM
16the Free Software Foundation, either version 3 of the License, or
17(at your option) any later version.
4ed46869 18
369314dc
KH
19GNU Emacs is distributed in the hope that it will be useful,
20but WITHOUT ANY WARRANTY; without even the implied warranty of
21MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22GNU General Public License for more details.
4ed46869 23
369314dc 24You should have received a copy of the GNU General Public License
9ec0b715 25along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
4ed46869
KH
26
27/*** TABLE OF CONTENTS ***
28
b73bfc1c 29 0. General comments
4ed46869 30 1. Preamble
df7492f9
KH
31 2. Emacs' internal format (emacs-utf-8) handlers
32 3. UTF-8 handlers
33 4. UTF-16 handlers
34 5. Charset-base coding systems handlers
35 6. emacs-mule (old Emacs' internal format) handlers
36 7. ISO2022 handlers
37 8. Shift-JIS and BIG5 handlers
38 9. CCL handlers
39 10. C library functions
40 11. Emacs Lisp library functions
41 12. Postamble
4ed46869
KH
42
43*/
44
df7492f9 45/*** 0. General comments ***
b73bfc1c
KH
46
47
df7492f9 48CODING SYSTEM
4ed46869 49
5bad0796
DL
50 A coding system is an object for an encoding mechanism that contains
51 information about how to convert byte sequences to character
e19c3639
KH
52 sequences and vice versa. When we say "decode", it means converting
53 a byte sequence of a specific coding system into a character
54 sequence that is represented by Emacs' internal coding system
55 `emacs-utf-8', and when we say "encode", it means converting a
56 character sequence of emacs-utf-8 to a byte sequence of a specific
0ef69138 57 coding system.
4ed46869 58
e19c3639
KH
59 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
60 C level, a coding system is represented by a vector of attributes
5bad0796 61 stored in the hash table Vcoding_system_hash_table. The conversion from
e19c3639
KH
62 coding system symbol to attributes vector is done by looking up
63 Vcoding_system_hash_table by the symbol.
4ed46869 64
e19c3639 65 Coding systems are classified into the following types depending on
5bad0796 66 the encoding mechanism. Here's a brief description of the types.
4ed46869 67
df7492f9
KH
68 o UTF-8
69
70 o UTF-16
71
72 o Charset-base coding system
73
74 A coding system defined by one or more (coded) character sets.
5bad0796 75 Decoding and encoding are done by a code converter defined for each
df7492f9
KH
76 character set.
77
5bad0796 78 o Old Emacs internal format (emacs-mule)
df7492f9 79
5bad0796 80 The coding system adopted by old versions of Emacs (20 and 21).
4ed46869 81
df7492f9 82 o ISO2022-base coding system
4ed46869
KH
83
84 The most famous coding system for multiple character sets. X's
df7492f9
KH
85 Compound Text, various EUCs (Extended Unix Code), and coding systems
86 used in the Internet communication such as ISO-2022-JP are all
87 variants of ISO2022.
4ed46869 88
df7492f9 89 o SJIS (or Shift-JIS or MS-Kanji-Code)
93dec019 90
4ed46869
KH
91 A coding system to encode character sets: ASCII, JISX0201, and
92 JISX0208. Widely used for PC's in Japan. Details are described in
df7492f9 93 section 8.
4ed46869 94
df7492f9 95 o BIG5
4ed46869 96
df7492f9 97 A coding system to encode character sets: ASCII and Big5. Widely
cfb43547 98 used for Chinese (mainly in Taiwan and Hong Kong). Details are
df7492f9
KH
99 described in section 8. In this file, when we write "big5" (all
100 lowercase), we mean the coding system, and when we write "Big5"
101 (capitalized), we mean the character set.
4ed46869 102
df7492f9 103 o CCL
27901516 104
5bad0796 105 If a user wants to decode/encode text encoded in a coding system
df7492f9
KH
106 not listed above, he can supply a decoder and an encoder for it in
107 CCL (Code Conversion Language) programs. Emacs executes the CCL
108 program while decoding/encoding.
27901516 109
df7492f9 110 o Raw-text
4ed46869 111
5a936b46 112 A coding system for text containing raw eight-bit data. Emacs
5bad0796 113 treats each byte of source text as a character (except for
df7492f9 114 end-of-line conversion).
4ed46869 115
df7492f9
KH
116 o No-conversion
117
118 Like raw text, but don't do end-of-line conversion.
4ed46869 119
4ed46869 120
df7492f9 121END-OF-LINE FORMAT
4ed46869 122
5bad0796 123 How text end-of-line is encoded depends on operating system. For
df7492f9 124 instance, Unix's format is just one byte of LF (line-feed) code,
f4dee582 125 whereas DOS's format is two-byte sequence of `carriage-return' and
d46c5b12
KH
126 `line-feed' codes. MacOS's format is usually one byte of
127 `carriage-return'.
4ed46869 128
cfb43547 129 Since text character encoding and end-of-line encoding are
df7492f9
KH
130 independent, any coding system described above can take any format
131 of end-of-line (except for no-conversion).
4ed46869 132
e19c3639
KH
133STRUCT CODING_SYSTEM
134
135 Before using a coding system for code conversion (i.e. decoding and
136 encoding), we setup a structure of type `struct coding_system'.
137 This structure keeps various information about a specific code
5bad0796 138 conversion (e.g. the location of source and destination data).
4ed46869
KH
139
140*/
141
df7492f9
KH
142/* COMMON MACROS */
143
144
4ed46869
KH
145/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
146
df7492f9 147 These functions check if a byte sequence specified as a source in
ff0dacd7
KH
148 CODING conforms to the format of XXX, and update the members of
149 DETECT_INFO.
df7492f9 150
ff0dacd7 151 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
df7492f9
KH
152
153 Below is the template of these functions. */
154
4ed46869 155#if 0
df7492f9 156static int
ff0dacd7 157detect_coding_XXX (coding, detect_info)
df7492f9 158 struct coding_system *coding;
ff0dacd7 159 struct coding_detection_info *detect_info;
4ed46869 160{
f1d34bca
MB
161 const unsigned char *src = coding->source;
162 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9 163 int multibytep = coding->src_multibyte;
ff0dacd7 164 int consumed_chars = 0;
df7492f9
KH
165 int found = 0;
166 ...;
167
168 while (1)
169 {
170 /* Get one byte from the source. If the source is exhausted, jump
171 to no_more_source:. */
172 ONE_MORE_BYTE (c);
ff0dacd7
KH
173
174 if (! __C_conforms_to_XXX___ (c))
175 break;
176 if (__C_strongly_suggests_XXX__ (c))
177 found = CATEGORY_MASK_XXX;
df7492f9 178 }
ff0dacd7
KH
179 /* The byte sequence is invalid for XXX. */
180 detect_info->rejected |= CATEGORY_MASK_XXX;
df7492f9 181 return 0;
ff0dacd7 182
df7492f9 183 no_more_source:
ff0dacd7
KH
184 /* The source was exhausted successfully. */
185 detect_info->found |= found;
df7492f9 186 return 1;
4ed46869
KH
187}
188#endif
189
190/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
191
df7492f9
KH
192 These functions decode a byte sequence specified as a source by
193 CODING. The resulting multibyte text goes to a place pointed to by
194 CODING->charbuf, the length of which should not exceed
195 CODING->charbuf_size;
d46c5b12 196
df7492f9
KH
197 These functions set the information of original and decoded texts in
198 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
199 They also set CODING->result to one of CODING_RESULT_XXX indicating
200 how the decoding is finished.
d46c5b12 201
df7492f9 202 Below is the template of these functions. */
d46c5b12 203
4ed46869 204#if 0
b73bfc1c 205static void
df7492f9 206decode_coding_XXXX (coding)
4ed46869 207 struct coding_system *coding;
4ed46869 208{
f1d34bca
MB
209 const unsigned char *src = coding->source + coding->consumed;
210 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
211 /* SRC_BASE remembers the start position in source in each loop.
212 The loop will be exited when there's not enough source code, or
213 when there's no room in CHARBUF for a decoded character. */
f1d34bca 214 const unsigned char *src_base;
df7492f9 215 /* A buffer to produce decoded characters. */
69a80ea3
KH
216 int *charbuf = coding->charbuf + coding->charbuf_used;
217 int *charbuf_end = coding->charbuf + coding->charbuf_size;
df7492f9
KH
218 int multibytep = coding->src_multibyte;
219
220 while (1)
221 {
222 src_base = src;
223 if (charbuf >= charbuf_end)
224 /* No more room to produce a decoded character. */
225 break;
226 ONE_MORE_BYTE (c);
227 /* Decode it. */
228 }
229
230 no_more_source:
231 if (src_base < src_end
232 && coding->mode & CODING_MODE_LAST_BLOCK)
233 /* If the source ends by partial bytes to construct a character,
234 treat them as eight-bit raw data. */
235 while (src_base < src_end && charbuf < charbuf_end)
236 *charbuf++ = *src_base++;
237 /* Remember how many bytes and characters we consumed. If the
238 source is multibyte, the bytes and chars are not identical. */
239 coding->consumed = coding->consumed_char = src_base - coding->source;
240 /* Remember how many characters we produced. */
241 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
242}
243#endif
244
245/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
246
df7492f9
KH
247 These functions encode SRC_BYTES length text at SOURCE of Emacs'
248 internal multibyte format by CODING. The resulting byte sequence
b73bfc1c
KH
249 goes to a place pointed to by DESTINATION, the length of which
250 should not exceed DST_BYTES.
d46c5b12 251
df7492f9
KH
252 These functions set the information of original and encoded texts in
253 the members produced, produced_char, consumed, and consumed_char of
254 the structure *CODING. They also set the member result to one of
255 CODING_RESULT_XXX indicating how the encoding finished.
d46c5b12 256
df7492f9
KH
257 DST_BYTES zero means that source area and destination area are
258 overlapped, which means that we can produce an encoded text until it
259 reaches the head of not-yet-encoded source text.
d46c5b12 260
df7492f9 261 Below is a template of these functions. */
4ed46869 262#if 0
b73bfc1c 263static void
df7492f9 264encode_coding_XXX (coding)
4ed46869 265 struct coding_system *coding;
4ed46869 266{
df7492f9
KH
267 int multibytep = coding->dst_multibyte;
268 int *charbuf = coding->charbuf;
269 int *charbuf_end = coding->charbuf + coding->charbuf_used;
270 unsigned char *dst = coding->destination + coding->produced;
271 unsigned char *dst_end = coding->destination + coding->dst_bytes;
272 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
273 int produced_chars = 0;
274
275 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
276 {
277 int c = *charbuf;
278 /* Encode C into DST, and increment DST. */
279 }
280 label_no_more_destination:
281 /* How many chars and bytes we produced. */
282 coding->produced_char += produced_chars;
283 coding->produced = dst - coding->destination;
4ed46869
KH
284}
285#endif
286
4ed46869
KH
287\f
288/*** 1. Preamble ***/
289
68c45bf0 290#include <config.h>
4ed46869
KH
291#include <stdio.h>
292
4ed46869
KH
293#include "lisp.h"
294#include "buffer.h"
df7492f9 295#include "character.h"
4ed46869
KH
296#include "charset.h"
297#include "ccl.h"
df7492f9 298#include "composite.h"
4ed46869
KH
299#include "coding.h"
300#include "window.h"
b8299c66
KL
301#include "frame.h"
302#include "termhooks.h"
4ed46869 303
df7492f9 304Lisp_Object Vcoding_system_hash_table;
4ed46869 305
df7492f9 306Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
1965cb73
DL
307Lisp_Object Qunix, Qdos;
308extern Lisp_Object Qmac; /* frame.c */
4ed46869
KH
309Lisp_Object Qbuffer_file_coding_system;
310Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
df7492f9 311Lisp_Object Qdefault_char;
27901516 312Lisp_Object Qno_conversion, Qundecided;
df7492f9 313Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
b49a1807 314Lisp_Object Qbig, Qlittle;
bb0115a2 315Lisp_Object Qcoding_system_history;
1397dc18 316Lisp_Object Qvalid_codes;
a6f87d34
KH
317Lisp_Object QCcategory, QCmnemonic, QCdefalut_char;
318Lisp_Object QCdecode_translation_table, QCencode_translation_table;
319Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
35befdaa 320Lisp_Object QCascii_compatible_p;
4ed46869
KH
321
322extern Lisp_Object Qinsert_file_contents, Qwrite_region;
387f6ba5 323Lisp_Object Qcall_process, Qcall_process_region;
4ed46869
KH
324Lisp_Object Qstart_process, Qopen_network_stream;
325Lisp_Object Qtarget_idx;
326
065e3595
KH
327Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
328Lisp_Object Qinterrupted, Qinsufficient_memory;
329
c7183fb8
GM
330extern Lisp_Object Qcompletion_ignore_case;
331
44e8490d
KH
332/* If a symbol has this property, evaluate the value to define the
333 symbol as a coding system. */
334static Lisp_Object Qcoding_system_define_form;
335
5d5bf4d8
KH
336int coding_system_require_warning;
337
d46c5b12
KH
338Lisp_Object Vselect_safe_coding_system_function;
339
7722baf9
EZ
340/* Mnemonic string for each format of end-of-line. */
341Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
342/* Mnemonic string to indicate format of end-of-line is not yet
4ed46869 343 decided. */
7722baf9 344Lisp_Object eol_mnemonic_undecided;
4ed46869 345
fcbcfb64
KH
346/* Format of end-of-line decided by system. This is Qunix on
347 Unix and Mac, Qdos on DOS/Windows.
348 This has an effect only for external encoding (i.e. for output to
349 file and process), not for in-buffer or Lisp string encoding. */
350static Lisp_Object system_eol_type;
351
4ed46869
KH
352#ifdef emacs
353
4608c386
KH
354Lisp_Object Vcoding_system_list, Vcoding_system_alist;
355
356Lisp_Object Qcoding_system_p, Qcoding_system_error;
4ed46869 357
d46c5b12
KH
358/* Coding system emacs-mule and raw-text are for converting only
359 end-of-line format. */
360Lisp_Object Qemacs_mule, Qraw_text;
8f924df7 361Lisp_Object Qutf_8_emacs;
ecf488bc 362
4ed46869
KH
363/* Coding-systems are handed between Emacs Lisp programs and C internal
364 routines by the following three variables. */
365/* Coding-system for reading files and receiving data from process. */
366Lisp_Object Vcoding_system_for_read;
367/* Coding-system for writing files and sending data to process. */
368Lisp_Object Vcoding_system_for_write;
369/* Coding-system actually used in the latest I/O. */
370Lisp_Object Vlast_coding_system_used;
065e3595
KH
371/* Set to non-nil when an error is detected during code conversion. */
372Lisp_Object Vlast_code_conversion_error;
c4825358 373/* A vector of length 256 which contains information about special
94487c4e 374 Latin codes (especially for dealing with Microsoft codes). */
3f003981 375Lisp_Object Vlatin_extra_code_table;
c4825358 376
9ce27fde
KH
377/* Flag to inhibit code conversion of end-of-line format. */
378int inhibit_eol_conversion;
379
74383408
KH
380/* Flag to inhibit ISO2022 escape sequence detection. */
381int inhibit_iso_escape_detection;
382
ed29121d
EZ
383/* Flag to make buffer-file-coding-system inherit from process-coding. */
384int inherit_process_coding_system;
385
c4825358
KH
386/* Coding system to be used to encode text for terminal display when
387 terminal coding system is nil. */
388struct coding_system safe_terminal_coding;
389
02ba4723
KH
390Lisp_Object Vfile_coding_system_alist;
391Lisp_Object Vprocess_coding_system_alist;
392Lisp_Object Vnetwork_coding_system_alist;
4ed46869 393
68c45bf0
PE
394Lisp_Object Vlocale_coding_system;
395
4ed46869
KH
396#endif /* emacs */
397
f967223b
KH
398/* Flag to tell if we look up translation table on character code
399 conversion. */
84fbb8a0 400Lisp_Object Venable_character_translation;
f967223b
KH
401/* Standard translation table to look up on decoding (reading). */
402Lisp_Object Vstandard_translation_table_for_decode;
403/* Standard translation table to look up on encoding (writing). */
404Lisp_Object Vstandard_translation_table_for_encode;
84fbb8a0 405
f967223b
KH
406Lisp_Object Qtranslation_table;
407Lisp_Object Qtranslation_table_id;
408Lisp_Object Qtranslation_table_for_decode;
409Lisp_Object Qtranslation_table_for_encode;
4ed46869
KH
410
411/* Alist of charsets vs revision number. */
df7492f9 412static Lisp_Object Vcharset_revision_table;
4ed46869 413
02ba4723
KH
414/* Default coding systems used for process I/O. */
415Lisp_Object Vdefault_process_coding_system;
416
002fdb44
DL
417/* Char table for translating Quail and self-inserting input. */
418Lisp_Object Vtranslation_table_for_input;
419
df7492f9
KH
420/* Two special coding systems. */
421Lisp_Object Vsjis_coding_system;
422Lisp_Object Vbig5_coding_system;
423
df7492f9
KH
424/* ISO2022 section */
425
426#define CODING_ISO_INITIAL(coding, reg) \
427 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
428 coding_attr_iso_initial), \
429 reg)))
430
431
432#define CODING_ISO_REQUEST(coding, charset_id) \
433 ((charset_id <= (coding)->max_charset_id \
434 ? (coding)->safe_charsets[charset_id] \
435 : -1))
436
437
438#define CODING_ISO_FLAGS(coding) \
439 ((coding)->spec.iso_2022.flags)
440#define CODING_ISO_DESIGNATION(coding, reg) \
441 ((coding)->spec.iso_2022.current_designation[reg])
442#define CODING_ISO_INVOCATION(coding, plane) \
443 ((coding)->spec.iso_2022.current_invocation[plane])
444#define CODING_ISO_SINGLE_SHIFTING(coding) \
445 ((coding)->spec.iso_2022.single_shifting)
446#define CODING_ISO_BOL(coding) \
447 ((coding)->spec.iso_2022.bol)
448#define CODING_ISO_INVOKED_CHARSET(coding, plane) \
449 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
450
451/* Control characters of ISO2022. */
452 /* code */ /* function */
453#define ISO_CODE_LF 0x0A /* line-feed */
454#define ISO_CODE_CR 0x0D /* carriage-return */
455#define ISO_CODE_SO 0x0E /* shift-out */
456#define ISO_CODE_SI 0x0F /* shift-in */
457#define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
458#define ISO_CODE_ESC 0x1B /* escape */
459#define ISO_CODE_SS2 0x8E /* single-shift-2 */
460#define ISO_CODE_SS3 0x8F /* single-shift-3 */
461#define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
462
463/* All code (1-byte) of ISO2022 is classified into one of the
464 followings. */
465enum iso_code_class_type
466 {
467 ISO_control_0, /* Control codes in the range
468 0x00..0x1F and 0x7F, except for the
469 following 4 codes. */
df7492f9
KH
470 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
471 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
472 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
473 ISO_escape, /* ISO_CODE_ESC (0x1B) */
474 ISO_control_1, /* Control codes in the range
475 0x80..0x9F, except for the
476 following 3 codes. */
477 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
478 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
479 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
480 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
481 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
482 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
483 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
484 };
05e6f5dc 485
df7492f9
KH
486/** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
487 `iso-flags' attribute of an iso2022 coding system. */
05e6f5dc 488
df7492f9
KH
489/* If set, produce long-form designation sequence (e.g. ESC $ ( A)
490 instead of the correct short-form sequence (e.g. ESC $ A). */
491#define CODING_ISO_FLAG_LONG_FORM 0x0001
93dec019 492
df7492f9
KH
493/* If set, reset graphic planes and registers at end-of-line to the
494 initial state. */
495#define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
05e6f5dc 496
df7492f9
KH
497/* If set, reset graphic planes and registers before any control
498 characters to the initial state. */
499#define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
05e6f5dc 500
df7492f9
KH
501/* If set, encode by 7-bit environment. */
502#define CODING_ISO_FLAG_SEVEN_BITS 0x0008
4ed46869 503
df7492f9
KH
504/* If set, use locking-shift function. */
505#define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
b73bfc1c 506
df7492f9
KH
507/* If set, use single-shift function. Overwrite
508 CODING_ISO_FLAG_LOCKING_SHIFT. */
509#define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
b73bfc1c 510
df7492f9
KH
511/* If set, use designation escape sequence. */
512#define CODING_ISO_FLAG_DESIGNATION 0x0040
b73bfc1c 513
df7492f9
KH
514/* If set, produce revision number sequence. */
515#define CODING_ISO_FLAG_REVISION 0x0080
b73bfc1c 516
df7492f9
KH
517/* If set, produce ISO6429's direction specifying sequence. */
518#define CODING_ISO_FLAG_DIRECTION 0x0100
f4dee582 519
df7492f9
KH
520/* If set, assume designation states are reset at beginning of line on
521 output. */
522#define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
4ed46869 523
df7492f9
KH
524/* If set, designation sequence should be placed at beginning of line
525 on output. */
526#define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
aa72b389 527
df7492f9
KH
528/* If set, do not encode unsafe characters on output. */
529#define CODING_ISO_FLAG_SAFE 0x0800
aa72b389 530
df7492f9
KH
531/* If set, extra latin codes (128..159) are accepted as a valid code
532 on input. */
533#define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
aa72b389 534
df7492f9 535#define CODING_ISO_FLAG_COMPOSITION 0x2000
aa72b389 536
df7492f9 537#define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
aa72b389 538
bf16eb23 539#define CODING_ISO_FLAG_USE_ROMAN 0x8000
aa72b389 540
bf16eb23 541#define CODING_ISO_FLAG_USE_OLDJIS 0x10000
aa72b389 542
bf16eb23 543#define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
aa72b389 544
df7492f9
KH
545/* A character to be produced on output if encoding of the original
546 character is prohibited by CODING_ISO_FLAG_SAFE. */
547#define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
aa72b389 548
a470d443
KH
549/* UTF-8 section */
550#define CODING_UTF_8_BOM(coding) \
551 ((coding)->spec.utf_8_bom)
4ed46869 552
df7492f9
KH
553/* UTF-16 section */
554#define CODING_UTF_16_BOM(coding) \
555 ((coding)->spec.utf_16.bom)
4ed46869 556
df7492f9
KH
557#define CODING_UTF_16_ENDIAN(coding) \
558 ((coding)->spec.utf_16.endian)
4ed46869 559
df7492f9
KH
560#define CODING_UTF_16_SURROGATE(coding) \
561 ((coding)->spec.utf_16.surrogate)
4ed46869 562
4ed46869 563
df7492f9
KH
564/* CCL section */
565#define CODING_CCL_DECODER(coding) \
566 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
567#define CODING_CCL_ENCODER(coding) \
568 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
569#define CODING_CCL_VALIDS(coding) \
8f924df7 570 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
4ed46869 571
5a936b46 572/* Index for each coding category in `coding_categories' */
4ed46869 573
df7492f9
KH
574enum coding_category
575 {
576 coding_category_iso_7,
577 coding_category_iso_7_tight,
578 coding_category_iso_8_1,
579 coding_category_iso_8_2,
580 coding_category_iso_7_else,
581 coding_category_iso_8_else,
a470d443
KH
582 coding_category_utf_8_auto,
583 coding_category_utf_8_nosig,
584 coding_category_utf_8_sig,
df7492f9
KH
585 coding_category_utf_16_auto,
586 coding_category_utf_16_be,
587 coding_category_utf_16_le,
588 coding_category_utf_16_be_nosig,
589 coding_category_utf_16_le_nosig,
590 coding_category_charset,
591 coding_category_sjis,
592 coding_category_big5,
593 coding_category_ccl,
594 coding_category_emacs_mule,
595 /* All above are targets of code detection. */
596 coding_category_raw_text,
597 coding_category_undecided,
598 coding_category_max
599 };
600
601/* Definitions of flag bits used in detect_coding_XXXX. */
602#define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
603#define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
604#define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
605#define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
606#define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
607#define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
a470d443
KH
608#define CATEGORY_MASK_UTF_8_AUTO (1 << coding_category_utf_8_auto)
609#define CATEGORY_MASK_UTF_8_NOSIG (1 << coding_category_utf_8_nosig)
610#define CATEGORY_MASK_UTF_8_SIG (1 << coding_category_utf_8_sig)
b49a1807 611#define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
df7492f9
KH
612#define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
613#define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
614#define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
615#define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
616#define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
617#define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
618#define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
619#define CATEGORY_MASK_CCL (1 << coding_category_ccl)
620#define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
ff0dacd7 621#define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
df7492f9
KH
622
623/* This value is returned if detect_coding_mask () finds nothing other
624 than ASCII characters. */
625#define CATEGORY_MASK_ANY \
626 (CATEGORY_MASK_ISO_7 \
627 | CATEGORY_MASK_ISO_7_TIGHT \
628 | CATEGORY_MASK_ISO_8_1 \
629 | CATEGORY_MASK_ISO_8_2 \
630 | CATEGORY_MASK_ISO_7_ELSE \
631 | CATEGORY_MASK_ISO_8_ELSE \
a470d443
KH
632 | CATEGORY_MASK_UTF_8_AUTO \
633 | CATEGORY_MASK_UTF_8_NOSIG \
634 | CATEGORY_MASK_UTF_8_SIG \
2f3cbb32 635 | CATEGORY_MASK_UTF_16_AUTO \
df7492f9
KH
636 | CATEGORY_MASK_UTF_16_BE \
637 | CATEGORY_MASK_UTF_16_LE \
638 | CATEGORY_MASK_UTF_16_BE_NOSIG \
639 | CATEGORY_MASK_UTF_16_LE_NOSIG \
640 | CATEGORY_MASK_CHARSET \
641 | CATEGORY_MASK_SJIS \
642 | CATEGORY_MASK_BIG5 \
643 | CATEGORY_MASK_CCL \
644 | CATEGORY_MASK_EMACS_MULE)
645
646
647#define CATEGORY_MASK_ISO_7BIT \
648 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
649
650#define CATEGORY_MASK_ISO_8BIT \
651 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
652
653#define CATEGORY_MASK_ISO_ELSE \
654 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
655
656#define CATEGORY_MASK_ISO_ESCAPE \
657 (CATEGORY_MASK_ISO_7 \
658 | CATEGORY_MASK_ISO_7_TIGHT \
659 | CATEGORY_MASK_ISO_7_ELSE \
660 | CATEGORY_MASK_ISO_8_ELSE)
661
662#define CATEGORY_MASK_ISO \
663 ( CATEGORY_MASK_ISO_7BIT \
664 | CATEGORY_MASK_ISO_8BIT \
665 | CATEGORY_MASK_ISO_ELSE)
666
667#define CATEGORY_MASK_UTF_16 \
2f3cbb32
KH
668 (CATEGORY_MASK_UTF_16_AUTO \
669 | CATEGORY_MASK_UTF_16_BE \
df7492f9
KH
670 | CATEGORY_MASK_UTF_16_LE \
671 | CATEGORY_MASK_UTF_16_BE_NOSIG \
672 | CATEGORY_MASK_UTF_16_LE_NOSIG)
673
a470d443
KH
674#define CATEGORY_MASK_UTF_8 \
675 (CATEGORY_MASK_UTF_8_AUTO \
676 | CATEGORY_MASK_UTF_8_NOSIG \
677 | CATEGORY_MASK_UTF_8_SIG)
df7492f9
KH
678
679/* List of symbols `coding-category-xxx' ordered by priority. This
680 variable is exposed to Emacs Lisp. */
681static Lisp_Object Vcoding_category_list;
682
683/* Table of coding categories (Lisp symbols). This variable is for
684 internal use only. */
685static Lisp_Object Vcoding_category_table;
686
687/* Table of coding-categories ordered by priority. */
688static enum coding_category coding_priorities[coding_category_max];
689
690/* Nth element is a coding context for the coding system bound to the
691 Nth coding category. */
692static struct coding_system coding_categories[coding_category_max];
693
df7492f9
KH
694/*** Commonly used macros and functions ***/
695
696#ifndef min
697#define min(a, b) ((a) < (b) ? (a) : (b))
698#endif
699#ifndef max
700#define max(a, b) ((a) > (b) ? (a) : (b))
701#endif
4ed46869 702
24a73b0a
KH
703#define CODING_GET_INFO(coding, attrs, charset_list) \
704 do { \
705 (attrs) = CODING_ID_ATTRS ((coding)->id); \
706 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
df7492f9 707 } while (0)
4ed46869 708
4ed46869 709
df7492f9
KH
710/* Safely get one byte from the source text pointed by SRC which ends
711 at SRC_END, and set C to that byte. If there are not enough bytes
065e3595
KH
712 in the source, it jumps to `no_more_source'. If multibytep is
713 nonzero, and a multibyte character is found at SRC, set C to the
714 negative value of the character code. The caller should declare
715 and set these variables appropriately in advance:
716 src, src_base, src_end, multibytep, consumed_chars, coding */
aa72b389 717
065e3595
KH
718#define ONE_MORE_BYTE(c) \
719 do { \
720 if (src == src_end) \
721 { \
722 if (src_base < src) \
723 record_conversion_result \
724 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
725 goto no_more_source; \
726 } \
727 c = *src++; \
728 if (multibytep && (c & 0x80)) \
729 { \
730 if ((c & 0xFE) == 0xC0) \
731 c = ((c & 1) << 6) | *src++; \
732 else \
733 { \
35befdaa
KH
734 src--; \
735 c = - string_char (src, &src, NULL); \
065e3595
KH
736 record_conversion_result \
737 (coding, CODING_RESULT_INVALID_SRC); \
738 } \
739 } \
740 consumed_chars++; \
aa72b389
KH
741 } while (0)
742
aa72b389 743
065e3595
KH
744#define ONE_MORE_BYTE_NO_CHECK(c) \
745 do { \
746 c = *src++; \
747 if (multibytep && (c & 0x80)) \
748 { \
749 if ((c & 0xFE) == 0xC0) \
750 c = ((c & 1) << 6) | *src++; \
751 else \
752 { \
35befdaa
KH
753 src--; \
754 c = - string_char (src, &src, NULL); \
065e3595
KH
755 record_conversion_result \
756 (coding, CODING_RESULT_INVALID_SRC); \
757 } \
758 } \
759 consumed_chars++; \
aa72b389
KH
760 } while (0)
761
aa72b389 762
df7492f9
KH
763/* Store a byte C in the place pointed by DST and increment DST to the
764 next free point, and increment PRODUCED_CHARS. The caller should
765 assure that C is 0..127, and declare and set the variable `dst'
766 appropriately in advance.
767*/
aa72b389
KH
768
769
df7492f9
KH
770#define EMIT_ONE_ASCII_BYTE(c) \
771 do { \
772 produced_chars++; \
773 *dst++ = (c); \
b6871cc7 774 } while (0)
aa72b389
KH
775
776
df7492f9 777/* Like EMIT_ONE_ASCII_BYTE but store two bytes: C1 and C2. */
aa72b389 778
df7492f9
KH
779#define EMIT_TWO_ASCII_BYTES(c1, c2) \
780 do { \
781 produced_chars += 2; \
782 *dst++ = (c1), *dst++ = (c2); \
783 } while (0)
aa72b389
KH
784
785
df7492f9
KH
786/* Store a byte C in the place pointed by DST and increment DST to the
787 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
788 nonzero, store in an appropriate multibyte from. The caller should
789 declare and set the variables `dst' and `multibytep' appropriately
790 in advance. */
791
792#define EMIT_ONE_BYTE(c) \
793 do { \
794 produced_chars++; \
795 if (multibytep) \
796 { \
797 int ch = (c); \
798 if (ch >= 0x80) \
799 ch = BYTE8_TO_CHAR (ch); \
800 CHAR_STRING_ADVANCE (ch, dst); \
801 } \
802 else \
803 *dst++ = (c); \
aa72b389 804 } while (0)
aa72b389 805
aa72b389 806
df7492f9 807/* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
aa72b389 808
e19c3639
KH
809#define EMIT_TWO_BYTES(c1, c2) \
810 do { \
811 produced_chars += 2; \
812 if (multibytep) \
813 { \
814 int ch; \
815 \
816 ch = (c1); \
817 if (ch >= 0x80) \
818 ch = BYTE8_TO_CHAR (ch); \
819 CHAR_STRING_ADVANCE (ch, dst); \
820 ch = (c2); \
821 if (ch >= 0x80) \
822 ch = BYTE8_TO_CHAR (ch); \
823 CHAR_STRING_ADVANCE (ch, dst); \
824 } \
825 else \
826 { \
827 *dst++ = (c1); \
828 *dst++ = (c2); \
829 } \
aa72b389
KH
830 } while (0)
831
832
df7492f9
KH
833#define EMIT_THREE_BYTES(c1, c2, c3) \
834 do { \
835 EMIT_ONE_BYTE (c1); \
836 EMIT_TWO_BYTES (c2, c3); \
837 } while (0)
aa72b389 838
aa72b389 839
df7492f9
KH
/* Emit the four bytes C1, C2, C3 and C4, in that order, with the same
   per-byte handling as EMIT_ONE_BYTE.  */

#define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
  do                                    \
    {                                   \
      EMIT_THREE_BYTES (c1, c2, c3);    \
      EMIT_ONE_BYTE (c4);               \
    }                                   \
  while (0)
aa72b389 845
aa72b389 846
f6cbaf43
KH
847/* Prototypes for static functions. */
848static void record_conversion_result P_ ((struct coding_system *coding,
849 enum coding_result_code result));
850static int detect_coding_utf_8 P_ ((struct coding_system *,
851 struct coding_detection_info *info));
852static void decode_coding_utf_8 P_ ((struct coding_system *));
853static int encode_coding_utf_8 P_ ((struct coding_system *));
854
855static int detect_coding_utf_16 P_ ((struct coding_system *,
856 struct coding_detection_info *info));
857static void decode_coding_utf_16 P_ ((struct coding_system *));
858static int encode_coding_utf_16 P_ ((struct coding_system *));
859
860static int detect_coding_iso_2022 P_ ((struct coding_system *,
861 struct coding_detection_info *info));
862static void decode_coding_iso_2022 P_ ((struct coding_system *));
863static int encode_coding_iso_2022 P_ ((struct coding_system *));
864
865static int detect_coding_emacs_mule P_ ((struct coding_system *,
866 struct coding_detection_info *info));
867static void decode_coding_emacs_mule P_ ((struct coding_system *));
868static int encode_coding_emacs_mule P_ ((struct coding_system *));
869
870static int detect_coding_sjis P_ ((struct coding_system *,
871 struct coding_detection_info *info));
872static void decode_coding_sjis P_ ((struct coding_system *));
873static int encode_coding_sjis P_ ((struct coding_system *));
874
875static int detect_coding_big5 P_ ((struct coding_system *,
876 struct coding_detection_info *info));
877static void decode_coding_big5 P_ ((struct coding_system *));
878static int encode_coding_big5 P_ ((struct coding_system *));
879
880static int detect_coding_ccl P_ ((struct coding_system *,
881 struct coding_detection_info *info));
882static void decode_coding_ccl P_ ((struct coding_system *));
883static int encode_coding_ccl P_ ((struct coding_system *));
884
885static void decode_coding_raw_text P_ ((struct coding_system *));
886static int encode_coding_raw_text P_ ((struct coding_system *));
887
888static void coding_set_source P_ ((struct coding_system *));
889static void coding_set_destination P_ ((struct coding_system *));
890static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
891static void coding_alloc_by_making_gap P_ ((struct coding_system *,
287c57d7 892 EMACS_INT, EMACS_INT));
f6cbaf43
KH
893static unsigned char *alloc_destination P_ ((struct coding_system *,
894 EMACS_INT, unsigned char *));
895static void setup_iso_safe_charsets P_ ((Lisp_Object));
896static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
897 int *, int *,
898 unsigned char *));
899static int detect_eol P_ ((const unsigned char *,
900 EMACS_INT, enum coding_category));
901static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
902static void decode_eol P_ ((struct coding_system *));
903static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
904static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *,
905 int, int *, int *));
906static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
907static INLINE void produce_composition P_ ((struct coding_system *, int *,
908 EMACS_INT));
909static INLINE void produce_charset P_ ((struct coding_system *, int *,
910 EMACS_INT));
911static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
912static int decode_coding P_ ((struct coding_system *));
913static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
3ed051d4 914 struct coding_system *,
f6cbaf43
KH
915 int *, EMACS_INT *));
916static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
917 struct coding_system *,
918 int *, EMACS_INT *));
919static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
920static int encode_coding P_ ((struct coding_system *));
921static Lisp_Object make_conversion_work_buffer P_ ((int));
922static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
923static INLINE int char_encodable_p P_ ((int, Lisp_Object));
924static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
925
065e3595
KH
926static void
927record_conversion_result (struct coding_system *coding,
928 enum coding_result_code result)
929{
930 coding->result = result;
931 switch (result)
932 {
933 case CODING_RESULT_INSUFFICIENT_SRC:
934 Vlast_code_conversion_error = Qinsufficient_source;
935 break;
936 case CODING_RESULT_INCONSISTENT_EOL:
937 Vlast_code_conversion_error = Qinconsistent_eol;
938 break;
939 case CODING_RESULT_INVALID_SRC:
940 Vlast_code_conversion_error = Qinvalid_source;
941 break;
942 case CODING_RESULT_INTERRUPT:
943 Vlast_code_conversion_error = Qinterrupted;
944 break;
945 case CODING_RESULT_INSUFFICIENT_MEM:
946 Vlast_code_conversion_error = Qinsufficient_memory;
947 break;
35befdaa
KH
948 default:
949 Vlast_code_conversion_error = intern ("Unknown error");
065e3595
KH
950 }
951}
952
df7492f9
KH
/* Decode CODE of CHARSET with DECODE_CHAR and store the result in C.
   The flag charset_map_loaded is cleared beforehand; if it is nonzero
   afterwards, CODING->source is recomputed with coding_set_source and
   SRC, SRC_BASE and SRC_END are all shifted by the distance that
   CODING->source moved.  */

#define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
  do                                                                         \
    {                                                                        \
      charset_map_loaded = 0;                                                \
      c = DECODE_CHAR (charset, code);                                       \
      if (charset_map_loaded)                                                \
        {                                                                    \
          const unsigned char *old_source_ = coding->source;                 \
          EMACS_INT moved_;                                                  \
                                                                             \
          coding_set_source (coding);                                        \
          moved_ = coding->source - old_source_;                             \
          src += moved_;                                                     \
          src_base += moved_;                                                \
          src_end += moved_;                                                 \
        }                                                                    \
    }                                                                        \
  while (0)
969
970
119852e7
KH
/* Make sure that BYTES bytes can be stored at DST.  If DST + BYTES
   would reach DST_END, enlarge coding->destination with
   alloc_destination, asking for BYTES plus one byte per element still
   pending in the character buffer (CHARBUF_END - CHARBUF), and then
   update DST and DST_END.  We don't have to take care of
   coding->source which will be relocated.  It is handled by calling
   coding_set_source in encode_coding.

   The caller must have the variables `coding', `dst', `dst_end',
   `charbuf' and `charbuf_end' in scope.  MORE_BYTES is an EMACS_INT,
   matching the NBYTES parameter of alloc_destination, so the pointer
   difference is not narrowed to int.  */

#define ASSURE_DESTINATION(bytes)                                       \
  do {                                                                  \
    if (dst + (bytes) >= dst_end)                                       \
      {                                                                 \
        EMACS_INT more_bytes = charbuf_end - charbuf + (bytes);         \
                                                                        \
        dst = alloc_destination (coding, more_bytes, dst);              \
        dst_end = coding->destination + coding->dst_bytes;              \
      }                                                                 \
  } while (0)
aa72b389 986
aa72b389 987
db274c7a
KH
/* Store multibyte form of the character C in P, and advance P to the
   end of the multibyte form.  This is like CHAR_STRING_ADVANCE but it
   never calls MAYBE_UNIFY_CHAR.

   C and P are evaluated several times, so neither may have side
   effects.  Every use of C is parenthesized so that an argument built
   with a low-precedence operator (e.g. `hi | lo') is encoded as a
   whole.  */

#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)              \
  do {                                                  \
    if ((c) <= MAX_1_BYTE_CHAR)                         \
      *(p)++ = (c);                                     \
    else if ((c) <= MAX_2_BYTE_CHAR)                    \
      *(p)++ = (0xC0 | ((c) >> 6)),                     \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_3_BYTE_CHAR)                    \
      *(p)++ = (0xE0 | ((c) >> 12)),                    \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_4_BYTE_CHAR)                    \
      *(p)++ = (0xF0 | ((c) >> 18)),                    \
        *(p)++ = (0x80 | (((c) >> 12) & 0x3F)),         \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_5_BYTE_CHAR)                    \
      *(p)++ = 0xF8,                                    \
        *(p)++ = (0x80 | (((c) >> 18) & 0x0F)),         \
        *(p)++ = (0x80 | (((c) >> 12) & 0x3F)),         \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else                                                \
      (p) += BYTE8_STRING ((c) - 0x3FFF80, (p));        \
  } while (0)
1017
1018
/* Yield the character code of the character whose multibyte form
   starts at P, and advance P past that form.  This is like
   STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR.

   The length is taken from the leading byte: one byte when bit 0x80
   is clear, otherwise two, three or four bytes when bit 0x20, 0x10 or
   0x08 respectively is the first clear one, else five bytes.  A
   two-byte form whose leading byte is below 0xC2 additionally gets
   0x3FFF80 or-ed in.  P is evaluated several times.  */

#define STRING_CHAR_ADVANCE_NO_UNIFY(p)                                 \
  (((p)[0] & 0x80) == 0                                                 \
   ? *(p)++                                                             \
   : ((p)[0] & 0x20) == 0                                               \
   ? ((p) += 2,                                                         \
      ((((p)[-2] & 0x1F) << 6)                                          \
       | ((p)[-1] & 0x3F)                                               \
       | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)))            \
   : ((p)[0] & 0x10) == 0                                               \
   ? ((p) += 3,                                                         \
      ((((p)[-3] & 0x0F) << 12)                                         \
       | (((p)[-2] & 0x3F) << 6)                                        \
       | ((p)[-1] & 0x3F)))                                             \
   : ((p)[0] & 0x08) == 0                                               \
   ? ((p) += 4,                                                         \
      ((((p)[-4] & 0xF) << 18)                                          \
       | (((p)[-3] & 0x3F) << 12)                                       \
       | (((p)[-2] & 0x3F) << 6)                                        \
       | ((p)[-1] & 0x3F)))                                             \
   : ((p) += 5,                                                         \
      ((((p)[-4] & 0x3F) << 18)                                         \
       | (((p)[-3] & 0x3F) << 12)                                       \
       | (((p)[-2] & 0x3F) << 6)                                        \
       | ((p)[-1] & 0x3F))))
1047
aa72b389 1048
df7492f9
KH
1049static void
1050coding_set_source (coding)
aa72b389 1051 struct coding_system *coding;
aa72b389 1052{
df7492f9
KH
1053 if (BUFFERP (coding->src_object))
1054 {
2cb26057 1055 struct buffer *buf = XBUFFER (coding->src_object);
aa72b389 1056
df7492f9 1057 if (coding->src_pos < 0)
2cb26057 1058 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
df7492f9 1059 else
2cb26057 1060 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
aa72b389 1061 }
df7492f9 1062 else if (STRINGP (coding->src_object))
aa72b389 1063 {
8f924df7 1064 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
aa72b389 1065 }
df7492f9
KH
1066 else
1067 /* Otherwise, the source is C string and is never relocated
1068 automatically. Thus we don't have to update anything. */
1069 ;
1070}
aa72b389 1071
df7492f9
KH
1072static void
1073coding_set_destination (coding)
1074 struct coding_system *coding;
1075{
1076 if (BUFFERP (coding->dst_object))
aa72b389 1077 {
df7492f9 1078 if (coding->src_pos < 0)
aa72b389 1079 {
13818c30 1080 coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
28f67a95
KH
1081 coding->dst_bytes = (GAP_END_ADDR
1082 - (coding->src_bytes - coding->consumed)
1083 - coding->destination);
aa72b389 1084 }
df7492f9 1085 else
28f67a95
KH
1086 {
1087 /* We are sure that coding->dst_pos_byte is before the gap
1088 of the buffer. */
1089 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
13818c30 1090 + coding->dst_pos_byte - BEG_BYTE);
28f67a95
KH
1091 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1092 - coding->destination);
1093 }
df7492f9
KH
1094 }
1095 else
1096 /* Otherwise, the destination is C string and is never relocated
1097 automatically. Thus we don't have to update anything. */
1098 ;
1099}
1100
1101
1102static void
1103coding_alloc_by_realloc (coding, bytes)
1104 struct coding_system *coding;
1105 EMACS_INT bytes;
1106{
1107 coding->destination = (unsigned char *) xrealloc (coding->destination,
1108 coding->dst_bytes + bytes);
1109 coding->dst_bytes += bytes;
1110}
1111
1112static void
db274c7a 1113coding_alloc_by_making_gap (coding, gap_head_used, bytes)
df7492f9 1114 struct coding_system *coding;
db274c7a 1115 EMACS_INT gap_head_used, bytes;
df7492f9 1116{
db274c7a 1117 if (EQ (coding->src_object, coding->dst_object))
df7492f9 1118 {
db274c7a
KH
1119 /* The gap may contain the produced data at the head and not-yet
1120 consumed data at the tail. To preserve those data, we at
1121 first make the gap size to zero, then increase the gap
1122 size. */
1123 EMACS_INT add = GAP_SIZE;
1124
1125 GPT += gap_head_used, GPT_BYTE += gap_head_used;
1126 GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
df7492f9
KH
1127 make_gap (bytes);
1128 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
db274c7a 1129 GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
df7492f9 1130 }
730fff51 1131 else
df7492f9 1132 {
2c78b7e1
KH
1133 Lisp_Object this_buffer;
1134
1135 this_buffer = Fcurrent_buffer ();
df7492f9
KH
1136 set_buffer_internal (XBUFFER (coding->dst_object));
1137 make_gap (bytes);
1138 set_buffer_internal (XBUFFER (this_buffer));
aa72b389 1139 }
df7492f9 1140}
8f924df7 1141
df7492f9
KH
1142
1143static unsigned char *
1144alloc_destination (coding, nbytes, dst)
1145 struct coding_system *coding;
3e139625 1146 EMACS_INT nbytes;
df7492f9
KH
1147 unsigned char *dst;
1148{
1149 EMACS_INT offset = dst - coding->destination;
1150
1151 if (BUFFERP (coding->dst_object))
db274c7a
KH
1152 {
1153 struct buffer *buf = XBUFFER (coding->dst_object);
1154
1155 coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1156 }
aa72b389 1157 else
df7492f9 1158 coding_alloc_by_realloc (coding, nbytes);
065e3595 1159 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
1160 coding_set_destination (coding);
1161 dst = coding->destination + offset;
1162 return dst;
1163}
aa72b389 1164
ff0dacd7
KH
1165/** Macros for annotations. */
1166
1167/* Maximum length of annotation data (sum of annotations for
1168 composition and charset). */
69a80ea3 1169#define MAX_ANNOTATION_LENGTH (4 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 4)
ff0dacd7
KH
1170
1171/* An annotation data is stored in the array coding->charbuf in this
1172 format:
69a80ea3 1173 [ -LENGTH ANNOTATION_MASK NCHARS ... ]
ff0dacd7
KH
1174 LENGTH is the number of elements in the annotation.
1175 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
69a80ea3 1176 NCHARS is the number of characters in the text annotated.
ff0dacd7
KH
1177
1178 The format of the following elements depends on ANNOTATION_MASK.
1179
1180 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1181 follow:
1182 ... METHOD [ COMPOSITION-COMPONENTS ... ]
1183 METHOD is one of enum composition_method.
1184 Optional COMPOSITION-COMPONENTS are characters and composition
1185 rules.
1186
1187 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1188 follows. */
1189
/* Store the three-element annotation header [ -LEN MASK NCHARS ] at
   BUF, advance BUF past it, and set coding->annotated to 1.  The
   caller must have the variable `coding' in scope.

   The expansion deliberately has no trailing semicolon; the caller
   writes it.  That keeps the macro usable as the body of an unbraced
   `if' that is followed by `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1197
69a80ea3
KH
1198#define ADD_COMPOSITION_DATA(buf, nchars, method) \
1199 do { \
1200 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1201 *buf++ = method; \
ff0dacd7
KH
1202 } while (0)
1203
1204
69a80ea3
KH
1205#define ADD_CHARSET_DATA(buf, nchars, id) \
1206 do { \
1207 ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1208 *buf++ = id; \
ff0dacd7
KH
1209 } while (0)
1210
df7492f9
KH
1211\f
1212/*** 2. Emacs' internal format (emacs-utf-8) ***/
1213
1214
1215
1216\f
1217/*** 3. UTF-8 ***/
1218
1219/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
1220 Check if a text is encoded in UTF-8. If it is, return 1, else
1221 return 0. */
df7492f9
KH
1222
1223#define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1224#define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1225#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1226#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1227#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1228#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1229
a470d443
KH
1230#define UTF_BOM 0xFEFF
1231#define UTF_8_BOM_1 0xEF
1232#define UTF_8_BOM_2 0xBB
1233#define UTF_8_BOM_3 0xBF
1234
df7492f9 1235static int
ff0dacd7 1236detect_coding_utf_8 (coding, detect_info)
df7492f9 1237 struct coding_system *coding;
ff0dacd7 1238 struct coding_detection_info *detect_info;
df7492f9 1239{
065e3595 1240 const unsigned char *src = coding->source, *src_base;
8f924df7 1241 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
1242 int multibytep = coding->src_multibyte;
1243 int consumed_chars = 0;
a470d443 1244 int bom_found = 0;
df7492f9
KH
1245 int found = 0;
1246
ff0dacd7 1247 detect_info->checked |= CATEGORY_MASK_UTF_8;
df7492f9
KH
1248 /* A coding system of this category is always ASCII compatible. */
1249 src += coding->head_ascii;
1250
1251 while (1)
aa72b389 1252 {
df7492f9 1253 int c, c1, c2, c3, c4;
aa72b389 1254
065e3595 1255 src_base = src;
df7492f9 1256 ONE_MORE_BYTE (c);
065e3595 1257 if (c < 0 || UTF_8_1_OCTET_P (c))
df7492f9
KH
1258 continue;
1259 ONE_MORE_BYTE (c1);
065e3595 1260 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
df7492f9
KH
1261 break;
1262 if (UTF_8_2_OCTET_LEADING_P (c))
aa72b389 1263 {
a470d443 1264 found = 1;
df7492f9 1265 continue;
aa72b389 1266 }
df7492f9 1267 ONE_MORE_BYTE (c2);
065e3595 1268 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
df7492f9
KH
1269 break;
1270 if (UTF_8_3_OCTET_LEADING_P (c))
aa72b389 1271 {
a470d443
KH
1272 found = 1;
1273 if (src_base == coding->source
1274 && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1275 bom_found = 1;
df7492f9 1276 continue;
aa72b389 1277 }
df7492f9 1278 ONE_MORE_BYTE (c3);
065e3595 1279 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
df7492f9
KH
1280 break;
1281 if (UTF_8_4_OCTET_LEADING_P (c))
aa72b389 1282 {
a470d443 1283 found = 1;
df7492f9
KH
1284 continue;
1285 }
1286 ONE_MORE_BYTE (c4);
065e3595 1287 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
df7492f9
KH
1288 break;
1289 if (UTF_8_5_OCTET_LEADING_P (c))
1290 {
a470d443 1291 found = 1;
df7492f9
KH
1292 continue;
1293 }
1294 break;
aa72b389 1295 }
ff0dacd7 1296 detect_info->rejected |= CATEGORY_MASK_UTF_8;
df7492f9 1297 return 0;
aa72b389 1298
df7492f9 1299 no_more_source:
065e3595 1300 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
aa72b389 1301 {
ff0dacd7 1302 detect_info->rejected |= CATEGORY_MASK_UTF_8;
89528eb3 1303 return 0;
aa72b389 1304 }
a470d443
KH
1305 if (bom_found)
1306 {
1307 /* The first character 0xFFFE doesn't necessarily mean a BOM. */
1308 detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1309 }
1310 else
1311 {
1312 detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1313 detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
1314 }
ff0dacd7 1315 return 1;
aa72b389
KH
1316}
1317
4ed46869 1318
b73bfc1c 1319static void
df7492f9 1320decode_coding_utf_8 (coding)
b73bfc1c 1321 struct coding_system *coding;
b73bfc1c 1322{
8f924df7
KH
1323 const unsigned char *src = coding->source + coding->consumed;
1324 const unsigned char *src_end = coding->source + coding->src_bytes;
1325 const unsigned char *src_base;
69a80ea3
KH
1326 int *charbuf = coding->charbuf + coding->charbuf_used;
1327 int *charbuf_end = coding->charbuf + coding->charbuf_size;
df7492f9
KH
1328 int consumed_chars = 0, consumed_chars_base;
1329 int multibytep = coding->src_multibyte;
a470d443 1330 enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
24a73b0a 1331 Lisp_Object attr, charset_list;
119852e7
KH
1332 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1333 int byte_after_cr = -1;
4ed46869 1334
24a73b0a 1335 CODING_GET_INFO (coding, attr, charset_list);
df7492f9 1336
a470d443
KH
1337 if (bom != utf_without_bom)
1338 {
1339 int c1, c2, c3;
1340
1341 src_base = src;
1342 ONE_MORE_BYTE (c1);
1343 if (! UTF_8_3_OCTET_LEADING_P (c1))
1344 src = src_base;
1345 else
1346 {
1347 ONE_MORE_BYTE (c2);
1348 if (! UTF_8_EXTRA_OCTET_P (c2))
1349 src = src_base;
1350 else
1351 {
1352 ONE_MORE_BYTE (c3);
1353 if (! UTF_8_EXTRA_OCTET_P (c3))
1354 src = src_base;
1355 else
1356 {
1357 if ((c1 != UTF_8_BOM_1)
1358 || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1359 src = src_base;
1360 else
1361 CODING_UTF_8_BOM (coding) = utf_without_bom;
1362 }
1363 }
1364 }
1365 }
1366 CODING_UTF_8_BOM (coding) = utf_without_bom;
1367
1368
1369
df7492f9 1370 while (1)
b73bfc1c 1371 {
df7492f9 1372 int c, c1, c2, c3, c4, c5;
ec6d2bb8 1373
df7492f9
KH
1374 src_base = src;
1375 consumed_chars_base = consumed_chars;
4af310db 1376
df7492f9
KH
1377 if (charbuf >= charbuf_end)
1378 break;
1379
119852e7
KH
1380 if (byte_after_cr >= 0)
1381 c1 = byte_after_cr, byte_after_cr = -1;
1382 else
1383 ONE_MORE_BYTE (c1);
065e3595
KH
1384 if (c1 < 0)
1385 {
1386 c = - c1;
1387 }
1388 else if (UTF_8_1_OCTET_P(c1))
df7492f9 1389 {
119852e7
KH
1390 if (eol_crlf && c1 == '\r')
1391 ONE_MORE_BYTE (byte_after_cr);
df7492f9 1392 c = c1;
4af310db 1393 }
df7492f9 1394 else
4af310db 1395 {
df7492f9 1396 ONE_MORE_BYTE (c2);
065e3595 1397 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
df7492f9
KH
1398 goto invalid_code;
1399 if (UTF_8_2_OCTET_LEADING_P (c1))
4af310db 1400 {
b0edb2c5
DL
1401 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1402 /* Reject overlong sequences here and below. Encoders
1403 producing them are incorrect, they can be misleading,
1404 and they mess up read/write invariance. */
1405 if (c < 128)
1406 goto invalid_code;
4af310db 1407 }
df7492f9 1408 else
aa72b389 1409 {
df7492f9 1410 ONE_MORE_BYTE (c3);
065e3595 1411 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
df7492f9
KH
1412 goto invalid_code;
1413 if (UTF_8_3_OCTET_LEADING_P (c1))
b0edb2c5
DL
1414 {
1415 c = (((c1 & 0xF) << 12)
1416 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
72fe1301
DL
1417 if (c < 0x800
1418 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
b0edb2c5
DL
1419 goto invalid_code;
1420 }
df7492f9
KH
1421 else
1422 {
1423 ONE_MORE_BYTE (c4);
065e3595 1424 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
df7492f9
KH
1425 goto invalid_code;
1426 if (UTF_8_4_OCTET_LEADING_P (c1))
b0edb2c5 1427 {
df7492f9
KH
1428 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1429 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
b0edb2c5
DL
1430 if (c < 0x10000)
1431 goto invalid_code;
1432 }
df7492f9
KH
1433 else
1434 {
1435 ONE_MORE_BYTE (c5);
065e3595 1436 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
df7492f9
KH
1437 goto invalid_code;
1438 if (UTF_8_5_OCTET_LEADING_P (c1))
1439 {
1440 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1441 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1442 | (c5 & 0x3F));
b0edb2c5 1443 if ((c > MAX_CHAR) || (c < 0x200000))
df7492f9
KH
1444 goto invalid_code;
1445 }
1446 else
1447 goto invalid_code;
1448 }
1449 }
aa72b389 1450 }
b73bfc1c 1451 }
df7492f9
KH
1452
1453 *charbuf++ = c;
1454 continue;
1455
1456 invalid_code:
1457 src = src_base;
1458 consumed_chars = consumed_chars_base;
1459 ONE_MORE_BYTE (c);
1460 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1461 coding->errors++;
aa72b389
KH
1462 }
1463
df7492f9
KH
1464 no_more_source:
1465 coding->consumed_char += consumed_chars_base;
1466 coding->consumed = src_base - coding->source;
1467 coding->charbuf_used = charbuf - coding->charbuf;
1468}
1469
1470
1471static int
1472encode_coding_utf_8 (coding)
1473 struct coding_system *coding;
1474{
1475 int multibytep = coding->dst_multibyte;
1476 int *charbuf = coding->charbuf;
1477 int *charbuf_end = charbuf + coding->charbuf_used;
1478 unsigned char *dst = coding->destination + coding->produced;
1479 unsigned char *dst_end = coding->destination + coding->dst_bytes;
e19c3639 1480 int produced_chars = 0;
df7492f9
KH
1481 int c;
1482
a470d443
KH
1483 if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1484 {
1485 ASSURE_DESTINATION (3);
1486 EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1487 CODING_UTF_8_BOM (coding) = utf_without_bom;
1488 }
1489
df7492f9 1490 if (multibytep)
aa72b389 1491 {
df7492f9
KH
1492 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1493
1494 while (charbuf < charbuf_end)
b73bfc1c 1495 {
df7492f9 1496 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
8f924df7 1497
df7492f9
KH
1498 ASSURE_DESTINATION (safe_room);
1499 c = *charbuf++;
28f67a95
KH
1500 if (CHAR_BYTE8_P (c))
1501 {
1502 c = CHAR_TO_BYTE8 (c);
1503 EMIT_ONE_BYTE (c);
1504 }
1505 else
1506 {
db274c7a 1507 CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
28f67a95
KH
1508 for (p = str; p < pend; p++)
1509 EMIT_ONE_BYTE (*p);
1510 }
b73bfc1c 1511 }
aa72b389 1512 }
df7492f9
KH
1513 else
1514 {
1515 int safe_room = MAX_MULTIBYTE_LENGTH;
1516
1517 while (charbuf < charbuf_end)
b73bfc1c 1518 {
df7492f9
KH
1519 ASSURE_DESTINATION (safe_room);
1520 c = *charbuf++;
f03caae0
KH
1521 if (CHAR_BYTE8_P (c))
1522 *dst++ = CHAR_TO_BYTE8 (c);
1523 else
db274c7a 1524 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
df7492f9 1525 produced_chars++;
4ed46869
KH
1526 }
1527 }
065e3595 1528 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
1529 coding->produced_char += produced_chars;
1530 coding->produced = dst - coding->destination;
1531 return 0;
4ed46869
KH
1532}
1533
b73bfc1c 1534
df7492f9 1535/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
1536 Check if a text is encoded in one of UTF-16 based coding systems.
1537 If it is, return 1, else return 0. */
aa72b389 1538
df7492f9
KH
1539#define UTF_16_HIGH_SURROGATE_P(val) \
1540 (((val) & 0xFC00) == 0xD800)
1541
1542#define UTF_16_LOW_SURROGATE_P(val) \
1543 (((val) & 0xFC00) == 0xDC00)
93dec019 1544
df7492f9
KH
1545#define UTF_16_INVALID_P(val) \
1546 (((val) == 0xFFFE) \
1547 || ((val) == 0xFFFF) \
1548 || UTF_16_LOW_SURROGATE_P (val))
aa72b389 1549
aa72b389 1550
df7492f9 1551static int
ff0dacd7 1552detect_coding_utf_16 (coding, detect_info)
aa72b389 1553 struct coding_system *coding;
ff0dacd7 1554 struct coding_detection_info *detect_info;
aa72b389 1555{
8f924df7
KH
1556 const unsigned char *src = coding->source, *src_base = src;
1557 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
1558 int multibytep = coding->src_multibyte;
1559 int consumed_chars = 0;
1560 int c1, c2;
aa72b389 1561
ff0dacd7 1562 detect_info->checked |= CATEGORY_MASK_UTF_16;
ff0dacd7 1563 if (coding->mode & CODING_MODE_LAST_BLOCK
24a73b0a 1564 && (coding->src_chars & 1))
ff0dacd7
KH
1565 {
1566 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1567 return 0;
1568 }
24a73b0a 1569
df7492f9
KH
1570 ONE_MORE_BYTE (c1);
1571 ONE_MORE_BYTE (c2);
df7492f9 1572 if ((c1 == 0xFF) && (c2 == 0xFE))
aa72b389 1573 {
b49a1807
KH
1574 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1575 | CATEGORY_MASK_UTF_16_AUTO);
24a73b0a
KH
1576 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1577 | CATEGORY_MASK_UTF_16_BE_NOSIG
1578 | CATEGORY_MASK_UTF_16_LE_NOSIG);
aa72b389 1579 }
df7492f9 1580 else if ((c1 == 0xFE) && (c2 == 0xFF))
ff0dacd7 1581 {
b49a1807
KH
1582 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1583 | CATEGORY_MASK_UTF_16_AUTO);
24a73b0a
KH
1584 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1585 | CATEGORY_MASK_UTF_16_BE_NOSIG
1586 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1587 }
2f3cbb32 1588 else
24a73b0a 1589 {
2f3cbb32
KH
1590 /* We check the dispersion of Eth and Oth bytes where E is even and
1591 O is odd. If both are high, we assume binary data.*/
1592 unsigned char e[256], o[256];
1593 unsigned e_num = 1, o_num = 1;
1594
1595 memset (e, 0, 256);
1596 memset (o, 0, 256);
1597 e[c1] = 1;
1598 o[c2] = 1;
1599
24a73b0a
KH
1600 detect_info->rejected
1601 |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
2f3cbb32
KH
1602
1603 while (1)
1604 {
1605 ONE_MORE_BYTE (c1);
1606 ONE_MORE_BYTE (c2);
1607 if (! e[c1])
1608 {
1609 e[c1] = 1;
1610 e_num++;
1611 if (e_num >= 128)
1612 break;
1613 }
1614 if (! o[c2])
1615 {
1616 o[c1] = 1;
1617 o_num++;
1618 if (o_num >= 128)
1619 break;
1620 }
1621 }
1622 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1623 return 0;
ff0dacd7 1624 }
2f3cbb32 1625
df7492f9 1626 no_more_source:
ff0dacd7 1627 return 1;
df7492f9 1628}
aa72b389 1629
df7492f9
KH
1630static void
1631decode_coding_utf_16 (coding)
1632 struct coding_system *coding;
1633{
8f924df7
KH
1634 const unsigned char *src = coding->source + coding->consumed;
1635 const unsigned char *src_end = coding->source + coding->src_bytes;
1636 const unsigned char *src_base;
69a80ea3
KH
1637 int *charbuf = coding->charbuf + coding->charbuf_used;
1638 int *charbuf_end = coding->charbuf + coding->charbuf_size;
df7492f9
KH
1639 int consumed_chars = 0, consumed_chars_base;
1640 int multibytep = coding->src_multibyte;
a470d443 1641 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
df7492f9
KH
1642 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1643 int surrogate = CODING_UTF_16_SURROGATE (coding);
24a73b0a 1644 Lisp_Object attr, charset_list;
119852e7
KH
1645 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1646 int byte_after_cr1 = -1, byte_after_cr2 = -1;
df7492f9 1647
24a73b0a 1648 CODING_GET_INFO (coding, attr, charset_list);
df7492f9 1649
a470d443 1650 if (bom == utf_with_bom)
aa72b389 1651 {
df7492f9 1652 int c, c1, c2;
4af310db 1653
aa72b389 1654 src_base = src;
df7492f9
KH
1655 ONE_MORE_BYTE (c1);
1656 ONE_MORE_BYTE (c2);
e19c3639 1657 c = (c1 << 8) | c2;
aa72b389 1658
b49a1807
KH
1659 if (endian == utf_16_big_endian
1660 ? c != 0xFEFF : c != 0xFFFE)
aa72b389 1661 {
b49a1807
KH
1662 /* The first two bytes are not BOM. Treat them as bytes
1663 for a normal character. */
1664 src = src_base;
1665 coding->errors++;
aa72b389 1666 }
a470d443 1667 CODING_UTF_16_BOM (coding) = utf_without_bom;
b49a1807 1668 }
a470d443 1669 else if (bom == utf_detect_bom)
b49a1807
KH
1670 {
1671 /* We have already tried to detect BOM and failed in
1672 detect_coding. */
a470d443 1673 CODING_UTF_16_BOM (coding) = utf_without_bom;
df7492f9 1674 }
aa72b389 1675
df7492f9
KH
1676 while (1)
1677 {
1678 int c, c1, c2;
1679
1680 src_base = src;
1681 consumed_chars_base = consumed_chars;
1682
1683 if (charbuf + 2 >= charbuf_end)
1684 break;
1685
119852e7
KH
1686 if (byte_after_cr1 >= 0)
1687 c1 = byte_after_cr1, byte_after_cr1 = -1;
1688 else
1689 ONE_MORE_BYTE (c1);
065e3595
KH
1690 if (c1 < 0)
1691 {
1692 *charbuf++ = -c1;
1693 continue;
1694 }
119852e7
KH
1695 if (byte_after_cr2 >= 0)
1696 c2 = byte_after_cr2, byte_after_cr2 = -1;
1697 else
1698 ONE_MORE_BYTE (c2);
065e3595
KH
1699 if (c2 < 0)
1700 {
1701 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1702 *charbuf++ = -c2;
1703 continue;
1704 }
df7492f9 1705 c = (endian == utf_16_big_endian
e19c3639 1706 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
119852e7 1707
df7492f9 1708 if (surrogate)
fd3ae0b9 1709 {
df7492f9 1710 if (! UTF_16_LOW_SURROGATE_P (c))
fd3ae0b9 1711 {
df7492f9
KH
1712 if (endian == utf_16_big_endian)
1713 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1714 else
1715 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1716 *charbuf++ = c1;
1717 *charbuf++ = c2;
1718 coding->errors++;
1719 if (UTF_16_HIGH_SURROGATE_P (c))
1720 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
fd3ae0b9 1721 else
df7492f9 1722 *charbuf++ = c;
fd3ae0b9
KH
1723 }
1724 else
df7492f9
KH
1725 {
1726 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1727 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
29f7ffd0 1728 *charbuf++ = 0x10000 + c;
df7492f9 1729 }
fd3ae0b9 1730 }
aa72b389 1731 else
df7492f9
KH
1732 {
1733 if (UTF_16_HIGH_SURROGATE_P (c))
1734 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1735 else
119852e7
KH
1736 {
1737 if (eol_crlf && c == '\r')
1738 {
1739 ONE_MORE_BYTE (byte_after_cr1);
1740 ONE_MORE_BYTE (byte_after_cr2);
1741 }
1742 *charbuf++ = c;
1743 }
8f924df7 1744 }
aa72b389 1745 }
df7492f9
KH
1746
1747 no_more_source:
1748 coding->consumed_char += consumed_chars_base;
1749 coding->consumed = src_base - coding->source;
1750 coding->charbuf_used = charbuf - coding->charbuf;
aa72b389 1751}
b73bfc1c 1752
df7492f9
KH
1753static int
1754encode_coding_utf_16 (coding)
1755 struct coding_system *coding;
1756{
1757 int multibytep = coding->dst_multibyte;
1758 int *charbuf = coding->charbuf;
1759 int *charbuf_end = charbuf + coding->charbuf_used;
1760 unsigned char *dst = coding->destination + coding->produced;
1761 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1762 int safe_room = 8;
a470d443 1763 enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
df7492f9
KH
1764 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1765 int produced_chars = 0;
24a73b0a 1766 Lisp_Object attrs, charset_list;
df7492f9 1767 int c;
4ed46869 1768
24a73b0a 1769 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9 1770
a470d443 1771 if (bom != utf_without_bom)
df7492f9
KH
1772 {
1773 ASSURE_DESTINATION (safe_room);
1774 if (big_endian)
df7492f9 1775 EMIT_TWO_BYTES (0xFE, 0xFF);
880cf180
KH
1776 else
1777 EMIT_TWO_BYTES (0xFF, 0xFE);
a470d443 1778 CODING_UTF_16_BOM (coding) = utf_without_bom;
df7492f9
KH
1779 }
1780
1781 while (charbuf < charbuf_end)
1782 {
1783 ASSURE_DESTINATION (safe_room);
1784 c = *charbuf++;
e19c3639
KH
1785 if (c >= MAX_UNICODE_CHAR)
1786 c = coding->default_char;
df7492f9
KH
1787
1788 if (c < 0x10000)
1789 {
1790 if (big_endian)
1791 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1792 else
1793 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1794 }
1795 else
1796 {
1797 int c1, c2;
1798
1799 c -= 0x10000;
1800 c1 = (c >> 10) + 0xD800;
1801 c2 = (c & 0x3FF) + 0xDC00;
1802 if (big_endian)
1803 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1804 else
1805 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1806 }
1807 }
065e3595 1808 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
1809 coding->produced = dst - coding->destination;
1810 coding->produced_char += produced_chars;
1811 return 0;
1812}
1813
1814\f
1815/*** 6. Old Emacs' internal format (emacs-mule) ***/
1816
1817/* Emacs' internal format for representation of multiple character
1818 sets is a kind of multi-byte encoding, i.e. characters are
1819 represented by variable-length sequences of one-byte codes.
1820
1821 ASCII characters and control characters (e.g. `tab', `newline') are
1822 represented by one-byte sequences which are their ASCII codes, in
1823 the range 0x00 through 0x7F.
1824
1825 8-bit characters of the range 0x80..0x9F are represented by
1826 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1827 code + 0x20).
1828
1829 8-bit characters of the range 0xA0..0xFF are represented by
1830 one-byte sequences which are their 8-bit code.
1831
1832 The other characters are represented by a sequence of `base
1833 leading-code', optional `extended leading-code', and one or two
1834 `position-code's. The length of the sequence is determined by the
1835 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1836 whereas extended leading-code and position-code take the range 0xA0
1837 through 0xFF. See `charset.h' for more details about leading-code
1838 and position-code.
1839
1840 --- CODE RANGE of Emacs' internal format ---
1841 character set range
1842 ------------- -----
1843 ascii 0x00..0x7F
1844 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1845 eight-bit-graphic 0xA0..0xBF
1846 ELSE 0x81..0x9D + [0xA0..0xFF]+
1847 ---------------------------------------------
1848
1849 As this is the internal character representation, the format is
1850 usually not used externally (i.e. in a file or in a data sent to a
1851 process). But, it is possible to have a text externally in this
1852 format (i.e. by encoding by the coding system `emacs-mule').
1853
1854 In that case, a sequence of one-byte codes has a slightly different
1855 form.
1856
1857 At first, all characters in eight-bit-control are represented by
1858 one-byte sequences which are their 8-bit code.
1859
1860 Next, character composition data are represented by the byte
1861 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1862 where,
1863 METHOD is 0xF2 plus one of composition method (enum
1864 composition_method),
1865
1866 BYTES is 0xA0 plus a byte length of this composition data,
1867
1868 CHARS is 0xA0 plus a number of characters composed by this
1869 data,
1870
1871 COMPONENTs are characters of multibyte form or composition
1872 rules encoded by two-byte of ASCII codes.
1873
1874 In addition, for backward compatibility, the following formats are
1875 also recognized as composition data on decoding.
1876
1877 0x80 MSEQ ...
1878 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1879
1880 Here,
1881 MSEQ is a multibyte form but in these special format:
1882 ASCII: 0xA0 ASCII_CODE+0x80,
1883 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1884 RULE is a one byte code of the range 0xA0..0xF0 that
1885 represents a composition rule.
1886 */
1887
1888char emacs_mule_bytes[256];
1889
df7492f9 1890int
ff0dacd7 1891emacs_mule_char (coding, src, nbytes, nchars, id)
df7492f9 1892 struct coding_system *coding;
065e3595 1893 const unsigned char *src;
ff0dacd7 1894 int *nbytes, *nchars, *id;
df7492f9 1895{
8f924df7
KH
1896 const unsigned char *src_end = coding->source + coding->src_bytes;
1897 const unsigned char *src_base = src;
df7492f9 1898 int multibytep = coding->src_multibyte;
df7492f9
KH
1899 struct charset *charset;
1900 unsigned code;
1901 int c;
1902 int consumed_chars = 0;
1903
1904 ONE_MORE_BYTE (c);
065e3595 1905 if (c < 0)
df7492f9 1906 {
065e3595
KH
1907 c = -c;
1908 charset = emacs_mule_charset[0];
1909 }
1910 else
1911 {
4d41e8b7
KH
1912 if (c >= 0xA0)
1913 {
b3af4b28 1914 /* Old style component character of a composition. */
4d41e8b7
KH
1915 if (c == 0xA0)
1916 {
1917 ONE_MORE_BYTE (c);
1918 c -= 0x80;
1919 }
1920 else
1921 c -= 0x20;
1922 }
1923
065e3595 1924 switch (emacs_mule_bytes[c])
b73bfc1c 1925 {
065e3595 1926 case 2:
df7492f9
KH
1927 if (! (charset = emacs_mule_charset[c]))
1928 goto invalid_code;
1929 ONE_MORE_BYTE (c);
9ffd559c 1930 if (c < 0xA0)
065e3595 1931 goto invalid_code;
df7492f9 1932 code = c & 0x7F;
065e3595
KH
1933 break;
1934
1935 case 3:
1936 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1937 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1938 {
1939 ONE_MORE_BYTE (c);
9ffd559c 1940 if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
065e3595
KH
1941 goto invalid_code;
1942 ONE_MORE_BYTE (c);
9ffd559c 1943 if (c < 0xA0)
065e3595
KH
1944 goto invalid_code;
1945 code = c & 0x7F;
1946 }
1947 else
1948 {
1949 if (! (charset = emacs_mule_charset[c]))
1950 goto invalid_code;
1951 ONE_MORE_BYTE (c);
9ffd559c 1952 if (c < 0xA0)
065e3595
KH
1953 goto invalid_code;
1954 code = (c & 0x7F) << 8;
1955 ONE_MORE_BYTE (c);
9ffd559c 1956 if (c < 0xA0)
065e3595
KH
1957 goto invalid_code;
1958 code |= c & 0x7F;
1959 }
1960 break;
1961
1962 case 4:
1963 ONE_MORE_BYTE (c);
1964 if (c < 0 || ! (charset = emacs_mule_charset[c]))
df7492f9
KH
1965 goto invalid_code;
1966 ONE_MORE_BYTE (c);
9ffd559c 1967 if (c < 0xA0)
065e3595 1968 goto invalid_code;
781d7a48 1969 code = (c & 0x7F) << 8;
df7492f9 1970 ONE_MORE_BYTE (c);
9ffd559c 1971 if (c < 0xA0)
065e3595 1972 goto invalid_code;
df7492f9 1973 code |= c & 0x7F;
065e3595 1974 break;
df7492f9 1975
065e3595
KH
1976 case 1:
1977 code = c;
1978 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1979 ? charset_ascii : charset_eight_bit);
1980 break;
df7492f9 1981
065e3595
KH
1982 default:
1983 abort ();
1984 }
1985 c = DECODE_CHAR (charset, code);
1986 if (c < 0)
1987 goto invalid_code;
df7492f9 1988 }
df7492f9
KH
1989 *nbytes = src - src_base;
1990 *nchars = consumed_chars;
ff0dacd7
KH
1991 if (id)
1992 *id = charset->id;
df7492f9
KH
1993 return c;
1994
1995 no_more_source:
1996 return -2;
1997
1998 invalid_code:
1999 return -1;
2000}
2001
2002
2003/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
2004 Check if a text is encoded in `emacs-mule'. If it is, return 1,
2005 else return 0. */
df7492f9
KH
2006
2007static int
ff0dacd7 2008detect_coding_emacs_mule (coding, detect_info)
df7492f9 2009 struct coding_system *coding;
ff0dacd7 2010 struct coding_detection_info *detect_info;
df7492f9 2011{
065e3595 2012 const unsigned char *src = coding->source, *src_base;
8f924df7 2013 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
2014 int multibytep = coding->src_multibyte;
2015 int consumed_chars = 0;
2016 int c;
2017 int found = 0;
2018
ff0dacd7 2019 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
df7492f9
KH
2020 /* A coding system of this category is always ASCII compatible. */
2021 src += coding->head_ascii;
2022
2023 while (1)
2024 {
065e3595 2025 src_base = src;
df7492f9 2026 ONE_MORE_BYTE (c);
065e3595
KH
2027 if (c < 0)
2028 continue;
df7492f9
KH
2029 if (c == 0x80)
2030 {
2031 /* Perhaps the start of composite character. We simple skip
2032 it because analyzing it is too heavy for detecting. But,
2033 at least, we check that the composite character
3ed051d4 2034 constitutes of more than 4 bytes. */
8f924df7 2035 const unsigned char *src_base;
df7492f9
KH
2036
2037 repeat:
2038 src_base = src;
2039 do
2040 {
2041 ONE_MORE_BYTE (c);
2042 }
2043 while (c >= 0xA0);
2044
2045 if (src - src_base <= 4)
2046 break;
ff0dacd7 2047 found = CATEGORY_MASK_EMACS_MULE;
df7492f9
KH
2048 if (c == 0x80)
2049 goto repeat;
b73bfc1c 2050 }
df7492f9
KH
2051
2052 if (c < 0x80)
b73bfc1c 2053 {
df7492f9
KH
2054 if (c < 0x20
2055 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2056 break;
2057 }
2058 else
2059 {
0e219d54 2060 int more_bytes = emacs_mule_bytes[*src_base] - 1;
df7492f9 2061
0e219d54 2062 while (more_bytes > 0)
df7492f9
KH
2063 {
2064 ONE_MORE_BYTE (c);
0e219d54
KH
2065 if (c < 0xA0)
2066 {
2067 src--; /* Unread the last byte. */
2068 break;
2069 }
2070 more_bytes--;
df7492f9 2071 }
0e219d54 2072 if (more_bytes != 0)
df7492f9 2073 break;
ff0dacd7 2074 found = CATEGORY_MASK_EMACS_MULE;
4ed46869
KH
2075 }
2076 }
ff0dacd7 2077 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
df7492f9
KH
2078 return 0;
2079
2080 no_more_source:
065e3595 2081 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
89528eb3 2082 {
ff0dacd7 2083 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
89528eb3
KH
2084 return 0;
2085 }
ff0dacd7
KH
2086 detect_info->found |= found;
2087 return 1;
4ed46869
KH
2088}
2089
b73bfc1c 2090
df7492f9
KH
2091/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2092
2093/* Decode a character represented as a component of a composition
2094 sequence of Emacs 20/21 style at SRC. Store that character in *BUF,
2095 increment BUF, and advance SRC to the head of the next character (or
2096 an encoded composition rule). If SRC is at the end of the source or
2097 the character is truncated, do nothing. If SRC points to an invalid
2098 byte sequence, jump to the label `invalid_code'. */
2099
#define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
  do \
    { \
      int c; \
      int nbytes, nchars; \
 \
      /* Nothing left to read: leave BUF and SRC as they are.  */ \
      if (src == src_end) \
        break; \
      c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL); \
      /* -2 means the source ended inside the character.  */ \
      if (c == -2) \
        break; \
      if (c < 0) \
        goto invalid_code; \
      *buf++ = c; \
      src += nbytes; \
      consumed_chars += nchars; \
    } \
  while (0)
df7492f9
KH
2120
2121
2122/* Decode a composition rule represented as a component of composition
781d7a48
KH
2123 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
2124 and increment BUF. If SRC points to an invalid byte sequence, jump
2125 to the label `invalid_code'. */
df7492f9 2126
#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
  do { \
    int c, gref, nref; \
 \
    if (src >= src_end) \
      goto invalid_code; \
    ONE_MORE_BYTE_NO_CHECK (c); \
    /* The rule byte is 0xA0 plus (GREF * 9 + NREF).  */ \
    c -= 0xA0; \
    if (! (c >= 0 && c < 81)) \
      goto invalid_code; \
 \
    gref = c / 9; \
    nref = c % 9; \
    *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
  } while (0)
2141
2142
781d7a48
KH
2143/* Decode a composition rule represented as a component of composition
2144 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
2145 and increment BUF. If SRC points to an invalid byte sequence, jump
2146 to the label `invalid_code'. */
2147
#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
  do { \
    int gref, nref; \
 \
    /* Two bytes are needed: GREF + 0x20 and NREF + 0x20.  */ \
    if (src + 1 >= src_end) \
      goto invalid_code; \
    ONE_MORE_BYTE_NO_CHECK (gref); \
    gref -= 0x20; \
    ONE_MORE_BYTE_NO_CHECK (nref); \
    nref -= 0x20; \
    if (! (gref >= 0 && gref < 81 \
           && nref >= 0 && nref < 81)) \
      goto invalid_code; \
    *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
  } while (0)
2163
2164
/* Decode a composition of Emacs 21 style.  C is the byte that follows
   0x80; C - 0xF2 is taken as the composition method.  The next two
   source bytes are BYTES + 0xA0 and CHARS + 0xA0, where BYTES is the
   byte length of the composition information and CHARS is the number
   of characters composed.  Composition data is added to CHARBUF; for
   methods other than COMPOSITION_RELATIVE the component characters
   (and rules) follow it.

   NOTE(review): DECODE_EMACS_MULE_COMPOSITION_CHAR makes no progress
   when SRC has reached SRC_END or the character is truncated; with
   COMPOSITION_WITH_ALTCHARS the loop below then looks like it cannot
   terminate -- confirm.  */

#define DECODE_EMACS_MULE_21_COMPOSITION(c) \
  do { \
    enum composition_method method = c - 0xF2; \
    int *charbuf_base = charbuf; \
    int consumed_chars_limit; \
    int nbytes, nchars; \
 \
    ONE_MORE_BYTE (c); \
    if (c < 0) \
      goto invalid_code; \
    nbytes = c - 0xA0; \
    if (nbytes < 3) \
      goto invalid_code; \
    ONE_MORE_BYTE (c); \
    if (c < 0) \
      goto invalid_code; \
    nchars = c - 0xA0; \
    ADD_COMPOSITION_DATA (charbuf, nchars, method); \
    consumed_chars_limit = consumed_chars_base + nbytes; \
    if (method != COMPOSITION_RELATIVE) \
      { \
        int i; \
 \
        /* Components alternate between character and rule, except \
           for COMPOSITION_WITH_ALTCHARS, which has characters \
           only.  */ \
        for (i = 0; consumed_chars < consumed_chars_limit; i++) \
          { \
            if (method == COMPOSITION_WITH_ALTCHARS || i % 2 == 0) \
              DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
            else \
              DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
          } \
        if (consumed_chars < consumed_chars_limit) \
          goto invalid_code; \
        charbuf_base[0] -= i; \
      } \
  } while (0)
93dec019 2204
aa72b389 2205
d959f512
KH
/* Decode a relative composition of Emacs 20 style.  Reading restarts
   from SRC_BASE; the component characters are first collected in a
   local array and, when there are at least two of them, composition
   data and the components are appended to CHARBUF.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
  do { \
    enum composition_method method = COMPOSITION_RELATIVE; \
    int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
    int *buf = components; \
    int i, j; \
 \
    src = src_base; \
    ONE_MORE_BYTE (c);          /* skip 0x80 */ \
    i = 0; \
    while (*src >= 0xA0 && i < MAX_COMPOSITION_COMPONENTS) \
      { \
        DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
        i++; \
      } \
    if (i < 2) \
      goto invalid_code; \
    ADD_COMPOSITION_DATA (charbuf, i, method); \
    j = 0; \
    while (j < i) \
      { \
        *charbuf++ = components[j]; \
        j++; \
      } \
  } while (0)
2225
2226
/* Decode a rule-base composition of Emacs 20 style.  The components
   (character, rule, character, ...) are first collected in a local
   array.  Then composition data, all components, and finally the
   characters alone (every second component) are appended to
   CHARBUF.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
  do { \
    enum composition_method method = COMPOSITION_WITH_RULE; \
    int *charbuf_base = charbuf; \
    int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
    int *buf = components; \
    int i, j; \
 \
    DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
    for (i = 1; i < MAX_COMPOSITION_COMPONENTS && *src >= 0xA0; i++) \
      { \
        DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
        DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
      } \
    /* At least two characters are needed, and the collected \
       components must end with a character, i.e. their number must \
       be odd.  */ \
    if (i <= 1 || (buf - components) % 2 == 0) \
      goto invalid_code; \
    if (charbuf + i + (i / 2) + 1 >= charbuf_end) \
      goto no_more_source; \
    ADD_COMPOSITION_DATA (charbuf, i, method); \
    /* From here on I is the number of components, rules included.  */ \
    i = i * 2 - 1; \
    j = 0; \
    while (j < i) \
      { \
        *charbuf++ = components[j]; \
        j++; \
      } \
    charbuf_base[0] -= i; \
    j = 0; \
    while (j < i) \
      { \
        *charbuf++ = components[j]; \
        j += 2; \
      } \
  } while (0)
2257
aa72b389
KH
2258
2259static void
df7492f9 2260decode_coding_emacs_mule (coding)
aa72b389 2261 struct coding_system *coding;
aa72b389 2262{
8f924df7
KH
2263 const unsigned char *src = coding->source + coding->consumed;
2264 const unsigned char *src_end = coding->source + coding->src_bytes;
2265 const unsigned char *src_base;
69a80ea3
KH
2266 int *charbuf = coding->charbuf + coding->charbuf_used;
2267 int *charbuf_end
2268 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
df7492f9 2269 int consumed_chars = 0, consumed_chars_base;
df7492f9 2270 int multibytep = coding->src_multibyte;
24a73b0a 2271 Lisp_Object attrs, charset_list;
ff0dacd7
KH
2272 int char_offset = coding->produced_char;
2273 int last_offset = char_offset;
2274 int last_id = charset_ascii;
119852e7
KH
2275 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2276 int byte_after_cr = -1;
aa72b389 2277
24a73b0a 2278 CODING_GET_INFO (coding, attrs, charset_list);
aa72b389 2279
aa72b389
KH
2280 while (1)
2281 {
df7492f9
KH
2282 int c;
2283
aa72b389 2284 src_base = src;
df7492f9
KH
2285 consumed_chars_base = consumed_chars;
2286
2287 if (charbuf >= charbuf_end)
2288 break;
aa72b389 2289
119852e7
KH
2290 if (byte_after_cr >= 0)
2291 c = byte_after_cr, byte_after_cr = -1;
2292 else
2293 ONE_MORE_BYTE (c);
065e3595
KH
2294 if (c < 0)
2295 {
2296 *charbuf++ = -c;
2297 char_offset++;
2298 }
2299 else if (c < 0x80)
aa72b389 2300 {
119852e7
KH
2301 if (eol_crlf && c == '\r')
2302 ONE_MORE_BYTE (byte_after_cr);
df7492f9
KH
2303 *charbuf++ = c;
2304 char_offset++;
aa72b389 2305 }
df7492f9
KH
2306 else if (c == 0x80)
2307 {
df7492f9 2308 ONE_MORE_BYTE (c);
065e3595
KH
2309 if (c < 0)
2310 goto invalid_code;
781d7a48
KH
2311 if (c - 0xF2 >= COMPOSITION_RELATIVE
2312 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
df7492f9
KH
2313 DECODE_EMACS_MULE_21_COMPOSITION (c);
2314 else if (c < 0xC0)
2315 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
2316 else if (c == 0xFF)
2317 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
2318 else
2319 goto invalid_code;
2320 }
2321 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
2322 {
2323 int nbytes, nchars;
ff0dacd7
KH
2324 int id;
2325
781d7a48
KH
2326 src = src_base;
2327 consumed_chars = consumed_chars_base;
ff0dacd7 2328 c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
df7492f9
KH
2329 if (c < 0)
2330 {
2331 if (c == -2)
2332 break;
2333 goto invalid_code;
2334 }
ff0dacd7
KH
2335 if (last_id != id)
2336 {
2337 if (last_id != charset_ascii)
69a80ea3 2338 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
ff0dacd7
KH
2339 last_id = id;
2340 last_offset = char_offset;
2341 }
df7492f9 2342 *charbuf++ = c;
781d7a48
KH
2343 src += nbytes;
2344 consumed_chars += nchars;
df7492f9
KH
2345 char_offset++;
2346 }
4d41e8b7
KH
2347 else
2348 goto invalid_code;
df7492f9
KH
2349 continue;
2350
2351 invalid_code:
2352 src = src_base;
2353 consumed_chars = consumed_chars_base;
2354 ONE_MORE_BYTE (c);
2355 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 2356 char_offset++;
df7492f9
KH
2357 coding->errors++;
2358 }
2359
2360 no_more_source:
ff0dacd7 2361 if (last_id != charset_ascii)
69a80ea3 2362 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
2363 coding->consumed_char += consumed_chars_base;
2364 coding->consumed = src_base - coding->source;
2365 coding->charbuf_used = charbuf - coding->charbuf;
2366}
2367
2368
/* Store in CODES[0] and CODES[1] the leading codes for the emacs-mule
   charset ID.  An ID below 0xA0 is its own single leading code, and
   CODES[1] is set to 0.  Otherwise CODES[0] is one of 0x9A, 0x9B,
   0x9C and 0x9D, selected by the range of ID, and CODES[1] is ID.

   ID is evaluated only once.  The macro expands to a single statement
   without a trailing semicolon, so it can be used safely in an
   if/else.  */

#define EMACS_MULE_LEADING_CODES(id, codes) \
  do { \
    const int leading_id_ = (id); \
 \
    if (leading_id_ < 0xA0) \
      (codes)[0] = leading_id_, (codes)[1] = 0; \
    else if (leading_id_ < 0xE0) \
      (codes)[0] = 0x9A, (codes)[1] = leading_id_; \
    else if (leading_id_ < 0xF0) \
      (codes)[0] = 0x9B, (codes)[1] = leading_id_; \
    else if (leading_id_ < 0xF5) \
      (codes)[0] = 0x9C, (codes)[1] = leading_id_; \
    else \
      (codes)[0] = 0x9D, (codes)[1] = leading_id_; \
  } while (0)
2382
aa72b389 2383
df7492f9
KH
2384static int
2385encode_coding_emacs_mule (coding)
2386 struct coding_system *coding;
2387{
2388 int multibytep = coding->dst_multibyte;
2389 int *charbuf = coding->charbuf;
2390 int *charbuf_end = charbuf + coding->charbuf_used;
2391 unsigned char *dst = coding->destination + coding->produced;
2392 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2393 int safe_room = 8;
df7492f9 2394 int produced_chars = 0;
24a73b0a 2395 Lisp_Object attrs, charset_list;
df7492f9 2396 int c;
ff0dacd7 2397 int preferred_charset_id = -1;
df7492f9 2398
24a73b0a 2399 CODING_GET_INFO (coding, attrs, charset_list);
eccb6815
KH
2400 if (! EQ (charset_list, Vemacs_mule_charset_list))
2401 {
2402 CODING_ATTR_CHARSET_LIST (attrs)
2403 = charset_list = Vemacs_mule_charset_list;
2404 }
df7492f9
KH
2405
2406 while (charbuf < charbuf_end)
2407 {
2408 ASSURE_DESTINATION (safe_room);
2409 c = *charbuf++;
ff0dacd7
KH
2410
2411 if (c < 0)
2412 {
2413 /* Handle an annotation. */
2414 switch (*charbuf)
2415 {
2416 case CODING_ANNOTATE_COMPOSITION_MASK:
2417 /* Not yet implemented. */
2418 break;
2419 case CODING_ANNOTATE_CHARSET_MASK:
2420 preferred_charset_id = charbuf[3];
2421 if (preferred_charset_id >= 0
2422 && NILP (Fmemq (make_number (preferred_charset_id),
2423 charset_list)))
2424 preferred_charset_id = -1;
2425 break;
2426 default:
2427 abort ();
2428 }
2429 charbuf += -c - 1;
2430 continue;
2431 }
2432
df7492f9
KH
2433 if (ASCII_CHAR_P (c))
2434 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
2435 else if (CHAR_BYTE8_P (c))
2436 {
2437 c = CHAR_TO_BYTE8 (c);
2438 EMIT_ONE_BYTE (c);
2439 }
df7492f9 2440 else
aa72b389 2441 {
df7492f9
KH
2442 struct charset *charset;
2443 unsigned code;
2444 int dimension;
2445 int emacs_mule_id;
2446 unsigned char leading_codes[2];
2447
ff0dacd7
KH
2448 if (preferred_charset_id >= 0)
2449 {
2450 charset = CHARSET_FROM_ID (preferred_charset_id);
2451 if (! CHAR_CHARSET_P (c, charset))
2452 charset = char_charset (c, charset_list, NULL);
2453 }
2454 else
2455 charset = char_charset (c, charset_list, &code);
df7492f9
KH
2456 if (! charset)
2457 {
2458 c = coding->default_char;
2459 if (ASCII_CHAR_P (c))
2460 {
2461 EMIT_ONE_ASCII_BYTE (c);
2462 continue;
2463 }
2464 charset = char_charset (c, charset_list, &code);
2465 }
2466 dimension = CHARSET_DIMENSION (charset);
2467 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2468 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2469 EMIT_ONE_BYTE (leading_codes[0]);
2470 if (leading_codes[1])
2471 EMIT_ONE_BYTE (leading_codes[1]);
2472 if (dimension == 1)
1fa663f9 2473 EMIT_ONE_BYTE (code | 0x80);
aa72b389 2474 else
df7492f9 2475 {
1fa663f9 2476 code |= 0x8080;
df7492f9
KH
2477 EMIT_ONE_BYTE (code >> 8);
2478 EMIT_ONE_BYTE (code & 0xFF);
2479 }
aa72b389 2480 }
aa72b389 2481 }
065e3595 2482 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
2483 coding->produced_char += produced_chars;
2484 coding->produced = dst - coding->destination;
2485 return 0;
aa72b389 2486}
b73bfc1c 2487
4ed46869 2488\f
df7492f9 2489/*** 7. ISO2022 handlers ***/
4ed46869
KH
2490
2491/* The following note describes the coding system ISO2022 briefly.
39787efd 2492 Since the intention of this note is to help understand the
5a936b46 2493 functions in this file, some parts are NOT ACCURATE or are OVERLY
39787efd 2494 SIMPLIFIED. For thorough understanding, please refer to the
5a936b46 2495 original document of ISO2022. This is equivalent to the standard
cfb43547 2496 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
4ed46869
KH
2497
2498 ISO2022 provides many mechanisms to encode several character sets
cfb43547 2499 in 7-bit and 8-bit environments. For 7-bit environments, all text
39787efd
KH
2500 is encoded using bytes less than 128. This may make the encoded
2501 text a little bit longer, but the text passes more easily through
cfb43547 2502 several types of gateway, some of which strip off the MSB (Most
8ca3766a 2503 Significant Bit).
b73bfc1c 2504
cfb43547
DL
2505 There are two kinds of character sets: control character sets and
2506 graphic character sets. The former contain control characters such
4ed46869 2507 as `newline' and `escape' to provide control functions (control
39787efd 2508 functions are also provided by escape sequences). The latter
cfb43547 2509 contain graphic characters such as 'A' and '-'. Emacs recognizes
4ed46869
KH
2510 two control character sets and many graphic character sets.
2511
2512 Graphic character sets are classified into one of the following
39787efd
KH
2513 four classes, according to the number of bytes (DIMENSION) and
2514 number of characters in one dimension (CHARS) of the set:
2515 - DIMENSION1_CHARS94
2516 - DIMENSION1_CHARS96
2517 - DIMENSION2_CHARS94
2518 - DIMENSION2_CHARS96
2519
2520 In addition, each character set is assigned an identification tag,
cfb43547 2521 unique for each set, called the "final character" (denoted as <F>
39787efd
KH
2522 hereafter). The <F> of each character set is decided by ECMA(*)
2523 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2524 (0x30..0x3F are for private use only).
4ed46869
KH
2525
2526 Note (*): ECMA = European Computer Manufacturers Association
2527
cfb43547 2528 Here are examples of graphic character sets [NAME(<F>)]:
4ed46869
KH
2529 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2530 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2531 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2532 o DIMENSION2_CHARS96 -- none for the moment
2533
39787efd 2534 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4ed46869
KH
2535 C0 [0x00..0x1F] -- control character plane 0
2536 GL [0x20..0x7F] -- graphic character plane 0
2537 C1 [0x80..0x9F] -- control character plane 1
2538 GR [0xA0..0xFF] -- graphic character plane 1
2539
2540 A control character set is directly designated and invoked to C0 or
39787efd
KH
2541 C1 by an escape sequence. The most common case is that:
2542 - ISO646's control character set is designated/invoked to C0, and
2543 - ISO6429's control character set is designated/invoked to C1,
2544 and usually these designations/invocations are omitted in encoded
2545 text. In a 7-bit environment, only C0 can be used, and a control
2546 character for C1 is encoded by an appropriate escape sequence to
2547 fit into the environment. All control characters for C1 are
2548 defined to have corresponding escape sequences.
4ed46869
KH
2549
2550 A graphic character set is at first designated to one of four
2551 graphic registers (G0 through G3), then these graphic registers are
2552 invoked to GL or GR. These designations and invocations can be
2553 done independently. The most common case is that G0 is invoked to
39787efd
KH
2554 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2555 these invocations and designations are omitted in encoded text.
2556 In a 7-bit environment, only GL can be used.
4ed46869 2557
39787efd
KH
2558 When a graphic character set of CHARS94 is invoked to GL, codes
2559 0x20 and 0x7F of the GL area work as control characters SPACE and
2560 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2561 be used.
4ed46869
KH
2562
2563 There are two ways of invocation: locking-shift and single-shift.
2564 With locking-shift, the invocation lasts until the next different
39787efd
KH
2565 invocation, whereas with single-shift, the invocation affects the
2566 following character only and doesn't affect the locking-shift
2567 state. Invocations are done by the following control characters or
2568 escape sequences:
4ed46869
KH
2569
2570 ----------------------------------------------------------------------
39787efd 2571 abbrev function cntrl escape seq description
4ed46869 2572 ----------------------------------------------------------------------
39787efd
KH
2573 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2574 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2575 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2576 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2577 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2578 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2579 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2580 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2581 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4ed46869 2582 ----------------------------------------------------------------------
39787efd
KH
2583 (*) These are not used by any known coding system.
2584
2585 Control characters for these functions are defined by macros
2586 ISO_CODE_XXX in `coding.h'.
4ed46869 2587
39787efd 2588 Designations are done by the following escape sequences:
4ed46869
KH
2589 ----------------------------------------------------------------------
2590 escape sequence description
2591 ----------------------------------------------------------------------
2592 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2593 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2594 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2595 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2596 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2597 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2598 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2599 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2600 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2601 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2602 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2603 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2604 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2605 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2606 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2607 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2608 ----------------------------------------------------------------------
2609
2610 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
39787efd 2611 of dimension 1, chars 94, and final character <F>, etc...
4ed46869
KH
2612
2613 Note (*): Although these designations are not allowed in ISO2022,
2614 Emacs accepts them on decoding, and produces them on encoding
39787efd 2615 CHARS96 character sets in a coding system which is characterized as
4ed46869
KH
2616 7-bit environment, non-locking-shift, and non-single-shift.
2617
2618 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
df7492f9 2619 '(' must be omitted. We refer to this as "short-form" hereafter.
4ed46869 2620
cfb43547 2621 Now you may notice that there are a lot of ways of encoding the
39787efd
KH
2622 same multilingual text in ISO2022. Actually, there exist many
2623 coding systems such as Compound Text (used in X11's inter client
8ca3766a
DL
2624 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2625 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
4ed46869
KH
2626 localized platforms), and all of these are variants of ISO2022.
2627
2628 In addition to the above, Emacs handles two more kinds of escape
2629 sequences: ISO6429's direction specification and Emacs' private
2630 sequence for specifying character composition.
2631
39787efd 2632 ISO6429's direction specification takes the following form:
4ed46869
KH
2633 o CSI ']' -- end of the current direction
2634 o CSI '0' ']' -- end of the current direction
2635 o CSI '1' ']' -- start of left-to-right text
2636 o CSI '2' ']' -- start of right-to-left text
2637 The control character CSI (0x9B: control sequence introducer) is
39787efd
KH
2638 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2639
2640 Character composition specification takes the following form:
ec6d2bb8
KH
2641 o ESC '0' -- start relative composition
2642 o ESC '1' -- end composition
2643 o ESC '2' -- start rule-base composition (*)
2644 o ESC '3' -- start relative composition with alternate chars (**)
2645 o ESC '4' -- start rule-base composition with alternate chars (**)
b73bfc1c 2646 Since these are not standard escape sequences of any ISO standard,
cfb43547 2647 the use of them with these meanings is restricted to Emacs only.
ec6d2bb8 2648
5a936b46
DL
2649 (*) This form is used only in Emacs 20.7 and older versions,
2650 but newer versions can safely decode it.
cfb43547 2651 (**) This form is used only in Emacs 21.1 and newer versions,
5a936b46 2652 and older versions can't decode it.
ec6d2bb8 2653
cfb43547 2654 Here's a list of example usages of these composition escape
b73bfc1c 2655 sequences (categorized by `enum composition_method').
ec6d2bb8 2656
b73bfc1c 2657 COMPOSITION_RELATIVE:
ec6d2bb8 2658 ESC 0 CHAR [ CHAR ] ESC 1
8ca3766a 2659 COMPOSITION_WITH_RULE:
ec6d2bb8 2660 ESC 2 CHAR [ RULE CHAR ] ESC 1
b73bfc1c 2661 COMPOSITION_WITH_ALTCHARS:
ec6d2bb8 2662 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
b73bfc1c 2663 COMPOSITION_WITH_RULE_ALTCHARS:
ec6d2bb8 2664 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
4ed46869
KH
2665
2666enum iso_code_class_type iso_code_class[256];
2667
df7492f9
KH
/* Non-zero if the charset ID has an entry other than 255 in the
   safe-charsets table of CODING.  setup_iso_safe_charsets fills the
   table with 255 and overwrites the entries of usable charsets with a
   graphic register number, so 255 means "not safe".  The entry is
   compared as an unsigned byte; a test for `>= 0' could never fail on
   unsigned bytes.  (Assumes CODING->safe_charsets points to the bytes
   of that table -- confirm where it is assigned.)  */

#define SAFE_CHARSET_P(coding, id) \
  ((id) <= (coding)->max_charset_id \
   && (unsigned char) (coding)->safe_charsets[id] != 255)


/* Non-zero if the coding system of CATEGORY has a non-negative
   initial designation for graphic register 1.  */

#define SHIFT_OUT_OK(category) \
  (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2675
2676static void
f0064e1f
DL
2677setup_iso_safe_charsets (attrs)
2678 Lisp_Object attrs;
df7492f9
KH
2679{
2680 Lisp_Object charset_list, safe_charsets;
2681 Lisp_Object request;
2682 Lisp_Object reg_usage;
2683 Lisp_Object tail;
2684 int reg94, reg96;
2685 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2686 int max_charset_id;
2687
2688 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2689 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2690 && ! EQ (charset_list, Viso_2022_charset_list))
2691 {
2692 CODING_ATTR_CHARSET_LIST (attrs)
2693 = charset_list = Viso_2022_charset_list;
2694 ASET (attrs, coding_attr_safe_charsets, Qnil);
2695 }
2696
2697 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2698 return;
2699
2700 max_charset_id = 0;
2701 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2702 {
2703 int id = XINT (XCAR (tail));
2704 if (max_charset_id < id)
2705 max_charset_id = id;
2706 }
d46c5b12 2707
df7492f9
KH
2708 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2709 make_number (255));
2710 request = AREF (attrs, coding_attr_iso_request);
2711 reg_usage = AREF (attrs, coding_attr_iso_usage);
2712 reg94 = XINT (XCAR (reg_usage));
2713 reg96 = XINT (XCDR (reg_usage));
2714
2715 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2716 {
2717 Lisp_Object id;
2718 Lisp_Object reg;
2719 struct charset *charset;
2720
2721 id = XCAR (tail);
2722 charset = CHARSET_FROM_ID (XINT (id));
bf16eb23 2723 reg = Fcdr (Fassq (id, request));
df7492f9 2724 if (! NILP (reg))
8f924df7 2725 SSET (safe_charsets, XINT (id), XINT (reg));
df7492f9
KH
2726 else if (charset->iso_chars_96)
2727 {
2728 if (reg96 < 4)
8f924df7 2729 SSET (safe_charsets, XINT (id), reg96);
df7492f9
KH
2730 }
2731 else
2732 {
2733 if (reg94 < 4)
8f924df7 2734 SSET (safe_charsets, XINT (id), reg94);
df7492f9
KH
2735 }
2736 }
2737 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2738}
d46c5b12 2739
b6871cc7 2740
4ed46869 2741/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
2742 Check if a text is encoded in one of ISO-2022 based codig systems.
2743 If it is, return 1, else return 0. */
4ed46869 2744
0a28aafb 2745static int
ff0dacd7 2746detect_coding_iso_2022 (coding, detect_info)
df7492f9 2747 struct coding_system *coding;
ff0dacd7 2748 struct coding_detection_info *detect_info;
4ed46869 2749{
8f924df7
KH
2750 const unsigned char *src = coding->source, *src_base = src;
2751 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9 2752 int multibytep = coding->src_multibyte;
ff0dacd7 2753 int single_shifting = 0;
df7492f9
KH
2754 int id;
2755 int c, c1;
2756 int consumed_chars = 0;
2757 int i;
ff0dacd7
KH
2758 int rejected = 0;
2759 int found = 0;
2760
2761 detect_info->checked |= CATEGORY_MASK_ISO;
df7492f9
KH
2762
2763 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2764 {
2765 struct coding_system *this = &(coding_categories[i]);
2766 Lisp_Object attrs, val;
2767
c6b278e7
KH
2768 if (this->id < 0)
2769 continue;
df7492f9
KH
2770 attrs = CODING_ID_ATTRS (this->id);
2771 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2772 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2773 setup_iso_safe_charsets (attrs);
2774 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7
KH
2775 this->max_charset_id = SCHARS (val) - 1;
2776 this->safe_charsets = (char *) SDATA (val);
df7492f9
KH
2777 }
2778
2779 /* A coding system of this category is always ASCII compatible. */
2780 src += coding->head_ascii;
3f003981 2781
ff0dacd7 2782 while (rejected != CATEGORY_MASK_ISO)
4ed46869 2783 {
065e3595 2784 src_base = src;
df7492f9 2785 ONE_MORE_BYTE (c);
4ed46869
KH
2786 switch (c)
2787 {
2788 case ISO_CODE_ESC:
74383408
KH
2789 if (inhibit_iso_escape_detection)
2790 break;
f46869e4 2791 single_shifting = 0;
df7492f9 2792 ONE_MORE_BYTE (c);
d46c5b12 2793 if (c >= '(' && c <= '/')
4ed46869 2794 {
bf9cdd4e 2795 /* Designation sequence for a charset of dimension 1. */
df7492f9 2796 ONE_MORE_BYTE (c1);
d46c5b12 2797 if (c1 < ' ' || c1 >= 0x80
df7492f9 2798 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
d46c5b12
KH
2799 /* Invalid designation sequence. Just ignore. */
2800 break;
bf9cdd4e
KH
2801 }
2802 else if (c == '$')
2803 {
2804 /* Designation sequence for a charset of dimension 2. */
df7492f9 2805 ONE_MORE_BYTE (c);
bf9cdd4e
KH
2806 if (c >= '@' && c <= 'B')
2807 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
ff0dacd7 2808 id = iso_charset_table[1][0][c];
bf9cdd4e 2809 else if (c >= '(' && c <= '/')
bcf26d6a 2810 {
df7492f9 2811 ONE_MORE_BYTE (c1);
d46c5b12 2812 if (c1 < ' ' || c1 >= 0x80
df7492f9 2813 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
d46c5b12
KH
2814 /* Invalid designation sequence. Just ignore. */
2815 break;
bcf26d6a 2816 }
bf9cdd4e 2817 else
ff0dacd7 2818 /* Invalid designation sequence. Just ignore it. */
d46c5b12
KH
2819 break;
2820 }
ae9ff118 2821 else if (c == 'N' || c == 'O')
d46c5b12 2822 {
ae9ff118 2823 /* ESC <Fe> for SS2 or SS3. */
ff0dacd7
KH
2824 single_shifting = 1;
2825 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
d46c5b12 2826 break;
4ed46869 2827 }
ec6d2bb8
KH
2828 else if (c >= '0' && c <= '4')
2829 {
2830 /* ESC <Fp> for start/end composition. */
ff0dacd7 2831 found |= CATEGORY_MASK_ISO;
ec6d2bb8
KH
2832 break;
2833 }
bf9cdd4e 2834 else
df7492f9 2835 {
ff0dacd7 2836 /* Invalid escape sequence. Just ignore it. */
df7492f9
KH
2837 break;
2838 }
d46c5b12
KH
2839
2840 /* We found a valid designation sequence for CHARSET. */
ff0dacd7 2841 rejected |= CATEGORY_MASK_ISO_8BIT;
df7492f9
KH
2842 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2843 id))
ff0dacd7 2844 found |= CATEGORY_MASK_ISO_7;
d46c5b12 2845 else
ff0dacd7 2846 rejected |= CATEGORY_MASK_ISO_7;
df7492f9
KH
2847 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2848 id))
ff0dacd7 2849 found |= CATEGORY_MASK_ISO_7_TIGHT;
d46c5b12 2850 else
ff0dacd7 2851 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
df7492f9
KH
2852 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2853 id))
ff0dacd7 2854 found |= CATEGORY_MASK_ISO_7_ELSE;
ae9ff118 2855 else
ff0dacd7 2856 rejected |= CATEGORY_MASK_ISO_7_ELSE;
df7492f9
KH
2857 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2858 id))
ff0dacd7 2859 found |= CATEGORY_MASK_ISO_8_ELSE;
ae9ff118 2860 else
ff0dacd7 2861 rejected |= CATEGORY_MASK_ISO_8_ELSE;
4ed46869
KH
2862 break;
2863
4ed46869 2864 case ISO_CODE_SO:
d46c5b12 2865 case ISO_CODE_SI:
ff0dacd7 2866 /* Locking shift out/in. */
74383408
KH
2867 if (inhibit_iso_escape_detection)
2868 break;
f46869e4 2869 single_shifting = 0;
ff0dacd7 2870 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
d46c5b12
KH
2871 break;
2872
4ed46869 2873 case ISO_CODE_CSI:
ff0dacd7 2874 /* Control sequence introducer. */
f46869e4 2875 single_shifting = 0;
ff0dacd7
KH
2876 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2877 found |= CATEGORY_MASK_ISO_8_ELSE;
2878 goto check_extra_latin;
2879
4ed46869
KH
2880 case ISO_CODE_SS2:
2881 case ISO_CODE_SS3:
ff0dacd7
KH
2882 /* Single shift. */
2883 if (inhibit_iso_escape_detection)
2884 break;
75e2a253 2885 single_shifting = 0;
ff0dacd7
KH
2886 rejected |= CATEGORY_MASK_ISO_7BIT;
2887 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2888 & CODING_ISO_FLAG_SINGLE_SHIFT)
75e2a253 2889 found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
ff0dacd7
KH
2890 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2891 & CODING_ISO_FLAG_SINGLE_SHIFT)
75e2a253
KH
2892 found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
2893 if (single_shifting)
2894 break;
ff0dacd7 2895 goto check_extra_latin;
4ed46869
KH
2896
2897 default:
065e3595
KH
2898 if (c < 0)
2899 continue;
4ed46869 2900 if (c < 0x80)
f46869e4
KH
2901 {
2902 single_shifting = 0;
2903 break;
2904 }
ff0dacd7 2905 if (c >= 0xA0)
c4825358 2906 {
ff0dacd7
KH
2907 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2908 found |= CATEGORY_MASK_ISO_8_1;
f46869e4 2909 /* Check the length of succeeding codes of the range
ff0dacd7
KH
2910 0xA0..0FF. If the byte length is even, we include
2911 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2912 only when we are not single shifting. */
2913 if (! single_shifting
2914 && ! (rejected & CATEGORY_MASK_ISO_8_2))
f46869e4 2915 {
e17de821 2916 int i = 1;
b73bfc1c
KH
2917 while (src < src_end)
2918 {
df7492f9 2919 ONE_MORE_BYTE (c);
b73bfc1c
KH
2920 if (c < 0xA0)
2921 break;
2922 i++;
2923 }
2924
2925 if (i & 1 && src < src_end)
ff0dacd7 2926 rejected |= CATEGORY_MASK_ISO_8_2;
f46869e4 2927 else
ff0dacd7 2928 found |= CATEGORY_MASK_ISO_8_2;
f46869e4 2929 }
ff0dacd7 2930 break;
4ed46869 2931 }
ff0dacd7
KH
2932 check_extra_latin:
2933 single_shifting = 0;
2934 if (! VECTORP (Vlatin_extra_code_table)
2935 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2936 {
2937 rejected = CATEGORY_MASK_ISO;
2938 break;
2939 }
2940 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2941 & CODING_ISO_FLAG_LATIN_EXTRA)
2942 found |= CATEGORY_MASK_ISO_8_1;
2943 else
2944 rejected |= CATEGORY_MASK_ISO_8_1;
75e2a253 2945 rejected |= CATEGORY_MASK_ISO_8_2;
4ed46869
KH
2946 }
2947 }
ff0dacd7
KH
2948 detect_info->rejected |= CATEGORY_MASK_ISO;
2949 return 0;
4ed46869 2950
df7492f9 2951 no_more_source:
ff0dacd7
KH
2952 detect_info->rejected |= rejected;
2953 detect_info->found |= (found & ~rejected);
df7492f9 2954 return 1;
4ed46869 2955}
ec6d2bb8 2956
4ed46869 2957
134b9549
KH
/* Record in CODING that the charset selected by DIM, CHARS_96 and the
   final byte FINAL is designated to graphic register REG.

   If FINAL is out of range, no charset is registered for it, or the
   charset is not safe for CODING, the designation of REG is set to -2
   and CHARS_96 is set to -1.  CHARS_96 is also set to -1 when ASCII is
   designated to a register whose previous designation was invalid (-2).
   A CHARS_96 of -1 tells the caller that the escape sequence should be
   kept.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)			\
  do {									\
    int new_id = -1;							\
    int valid = (final >= '0' && final < 128);				\
									\
    if (valid)								\
      {									\
	new_id = ISO_CHARSET_TABLE (dim, chars_96, final);		\
	valid = (new_id >= 0 && SAFE_CHARSET_P (coding, new_id));	\
      }									\
    if (! valid)							\
      {									\
	CODING_ISO_DESIGNATION (coding, reg) = -2;			\
	chars_96 = -1;							\
      }									\
    else								\
      {									\
	int old_id = CODING_ISO_DESIGNATION (coding, reg);		\
									\
	/* Optionally substitute the charsets the coding system	\
	   treats as equivalent.  */					\
	if (new_id == charset_jisx0201_roman)				\
	  {								\
	    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)	\
	      new_id = charset_ascii;					\
	  }								\
	else if (new_id == charset_jisx0208_1978)			\
	  {								\
	    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)	\
	      new_id = charset_jisx0208;				\
	  }								\
	CODING_ISO_DESIGNATION (coding, reg) = new_id;			\
	/* If there was an invalid designation to REG previously, and	\
	   this designation is ASCII to REG, we should keep this	\
	   designation sequence.  */					\
	if (old_id == -2 && new_id == charset_ascii)			\
	  chars_96 = -1;						\
      }									\
  } while (0)
2990
d46c5b12 2991
df7492f9
KH
/* If a composition is in progress, abandon it: leave the composing
   state and flush the characters collected so far in `components' to
   CHARBUF as plain characters.  For the rule-based methods only every
   other element of `components' is flushed.  Jump to `no_more_source'
   if CHARBUF cannot hold the collected components.  */
#define MAYBE_FINISH_COMPOSITION()				\
  do {								\
    if (composition_state != COMPOSING_NO)			\
      {								\
	int idx, step;						\
								\
	/* It is assured that we have enough room for producing	\
	   characters stored in the table `components'.  */	\
	if (charbuf + component_idx > charbuf_end)		\
	  goto no_more_source;					\
	composition_state = COMPOSING_NO;			\
	if (method == COMPOSITION_RELATIVE			\
	    || method == COMPOSITION_WITH_ALTCHARS)		\
	  {							\
	    step = 1;						\
	    char_offset += component_idx;			\
	  }							\
	else							\
	  {							\
	    step = 2;						\
	    char_offset += (component_idx / 2) + 1;		\
	  }							\
	for (idx = 0; idx < component_idx; idx += step)		\
	  *charbuf++ = components[idx];				\
      }								\
  } while (0)
3016
d46c5b12 3017
aa72b389
KH
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   C1 is the byte following ESC.  An ESC 0 seen while collecting the
   alternate components of an ESC 3 / ESC 4 composition only switches to
   collecting the composed characters.  Otherwise a new composition is
   started, but only if its terminator ESC 1 is found in the remaining
   source; if not, CODING_RESULT_INSUFFICIENT_SRC is recorded and control
   jumps to `no_more_source'.  */

#define DECODE_COMPOSITION_START(c1)					\
  do {									\
    if (c1 == '0'							\
	&& composition_state == COMPOSING_COMPONENT_RULE)		\
      {									\
	component_len = component_idx;					\
	composition_state = COMPOSING_CHAR;				\
      }									\
    else								\
      {									\
	const unsigned char *p;						\
									\
	MAYBE_FINISH_COMPOSITION ();					\
	if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end)		\
	  goto no_more_source;						\
	for (p = src; p < src_end - 1; p++)				\
	  if (*p == ISO_CODE_ESC && p[1] == '1')			\
	    break;							\
	/* Use `>=', not `==': when the start sequence is the very	\
	   last thing in the source (src == src_end), the loop above	\
	   does not run and P is already beyond src_end - 1.  */	\
	if (p >= src_end - 1)						\
	  {								\
	    /* The current composition doesn't end in the current	\
	       source.  */						\
	    record_conversion_result					\
	      (coding, CODING_RESULT_INSUFFICIENT_SRC);			\
	    goto no_more_source;					\
	  }								\
									\
	/* This is surely the start of a composition.  */		\
	method = (c1 == '0' ? COMPOSITION_RELATIVE			\
		  : c1 == '2' ? COMPOSITION_WITH_RULE			\
		  : c1 == '3' ? COMPOSITION_WITH_ALTCHARS		\
		  : COMPOSITION_WITH_RULE_ALTCHARS);			\
	composition_state = (c1 <= '2' ? COMPOSING_CHAR			\
			     : COMPOSING_COMPONENT_CHAR);		\
	component_idx = component_len = 0;				\
      }									\
  } while (0)
3062
ec6d2bb8 3063
df7492f9
KH
/* Handle composition end sequence ESC 1.

   Emit the composition annotation into CHARBUF, followed (for every
   method except COMPOSITION_RELATIVE) by the collected components, and
   then by the composed characters themselves.  The first element of
   the annotation is then overwritten with the (negative) distance from
   the end of the components back to the start of the annotation.  */

#define DECODE_COMPOSITION_END()					\
  do {									\
    int *header = charbuf;						\
    int nchars;								\
    int k, first, stride;						\
									\
    if (component_len > 0)						\
      nchars = component_idx - component_len;				\
    else if (method == COMPOSITION_RELATIVE)				\
      nchars = component_idx;						\
    else								\
      nchars = (component_idx + 1) / 2;					\
									\
    ADD_COMPOSITION_DATA (charbuf, nchars, method);			\
    if (method != COMPOSITION_RELATIVE)					\
      {									\
	/* Without separate alternate components, all collected	\
	   elements are the components.  */				\
	int ncomponents = (component_len == 0				\
			   ? component_idx : component_len);		\
									\
	for (k = 0; k < ncomponents; k++)				\
	  *charbuf++ = components[k];					\
	*header = header - charbuf;					\
      }									\
    if (method == COMPOSITION_WITH_RULE)				\
      {									\
	/* Characters and rules alternate; skip the rules.  */		\
	first = 0;							\
	stride = 2;							\
      }									\
    else								\
      {									\
	first = component_len;						\
	stride = 1;							\
      }									\
    for (k = first; k < component_idx; k += stride)			\
      {									\
	*charbuf++ = components[k];					\
	char_offset++;							\
      }									\
    coding->annotated = 1;						\
    composition_state = COMPOSING_NO;					\
  } while (0)
3094
df7492f9 3095
ec6d2bb8
KH
/* Decode a composition rule from the byte C1 (and maybe one more byte
   read from SRC into C2), and replace C1 with the rule encoded by
   COMPOSITION_ENCODE_RULE.  C1 is set to 0 if it is not a valid rule
   byte.  */

#define DECODE_COMPOSITION_RULE(c1)					\
  do {									\
    (c1) -= 32;								\
    if (c1 >= 93)							\
      c1 = 0;								\
    else if (c1 >= 81)	/* new format (after ver.21) */			\
      {									\
	ONE_MORE_BYTE (c2);						\
	c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);		\
      }									\
    else		/* old format (before ver.21) */		\
      {									\
	int gref = (c1) / 9;						\
	int nref = (c1) % 9;						\
									\
	/* Reference point 4 of the old format maps to 10.  */		\
	gref = (gref == 4 ? 10 : gref);					\
	nref = (nref == 4 ? 10 : nref);					\
	c1 = COMPOSITION_ENCODE_RULE (gref, nref);			\
      }									\
  } while (0)
88993dfd 3119
d46c5b12 3120
4ed46869
KH
3121/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
3122
b73bfc1c 3123static void
df7492f9 3124decode_coding_iso_2022 (coding)
4ed46869 3125 struct coding_system *coding;
4ed46869 3126{
8f924df7
KH
3127 const unsigned char *src = coding->source + coding->consumed;
3128 const unsigned char *src_end = coding->source + coding->src_bytes;
3129 const unsigned char *src_base;
69a80ea3 3130 int *charbuf = coding->charbuf + coding->charbuf_used;
ff0dacd7 3131 int *charbuf_end
69a80ea3 3132 = coding->charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
df7492f9 3133 int consumed_chars = 0, consumed_chars_base;
df7492f9 3134 int multibytep = coding->src_multibyte;
4ed46869 3135 /* Charsets invoked to graphic plane 0 and 1 respectively. */
df7492f9
KH
3136 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3137 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
134b9549 3138 int charset_id_2, charset_id_3;
df7492f9
KH
3139 struct charset *charset;
3140 int c;
3141 /* For handling composition sequence. */
3142#define COMPOSING_NO 0
3143#define COMPOSING_CHAR 1
3144#define COMPOSING_RULE 2
3145#define COMPOSING_COMPONENT_CHAR 3
3146#define COMPOSING_COMPONENT_RULE 4
3147
3148 int composition_state = COMPOSING_NO;
3149 enum composition_method method;
3150 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
3151 int component_idx;
3152 int component_len;
24a73b0a 3153 Lisp_Object attrs, charset_list;
ff0dacd7
KH
3154 int char_offset = coding->produced_char;
3155 int last_offset = char_offset;
3156 int last_id = charset_ascii;
119852e7
KH
3157 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3158 int byte_after_cr = -1;
df7492f9 3159
24a73b0a 3160 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9 3161 setup_iso_safe_charsets (attrs);
287c57d7
KH
3162 /* Charset list may have been changed. */
3163 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3164 coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
b73bfc1c
KH
3165
3166 while (1)
4ed46869 3167 {
463f5630 3168 int c1, c2;
b73bfc1c
KH
3169
3170 src_base = src;
df7492f9
KH
3171 consumed_chars_base = consumed_chars;
3172
3173 if (charbuf >= charbuf_end)
3174 break;
3175
119852e7
KH
3176 if (byte_after_cr >= 0)
3177 c1 = byte_after_cr, byte_after_cr = -1;
3178 else
3179 ONE_MORE_BYTE (c1);
065e3595
KH
3180 if (c1 < 0)
3181 goto invalid_code;
4ed46869 3182
98725083 3183 /* We produce at most one character. */
4ed46869
KH
3184 switch (iso_code_class [c1])
3185 {
3186 case ISO_0x20_or_0x7F:
df7492f9 3187 if (composition_state != COMPOSING_NO)
ec6d2bb8 3188 {
df7492f9
KH
3189 if (composition_state == COMPOSING_RULE
3190 || composition_state == COMPOSING_COMPONENT_RULE)
3191 {
3192 DECODE_COMPOSITION_RULE (c1);
3193 components[component_idx++] = c1;
3194 composition_state--;
3195 continue;
3196 }
4ed46869 3197 }
df7492f9
KH
3198 if (charset_id_0 < 0
3199 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
781d7a48
KH
3200 /* This is SPACE or DEL. */
3201 charset = CHARSET_FROM_ID (charset_ascii);
3202 else
3203 charset = CHARSET_FROM_ID (charset_id_0);
3204 break;
4ed46869
KH
3205
3206 case ISO_graphic_plane_0:
781d7a48 3207 if (composition_state != COMPOSING_NO)
b73bfc1c 3208 {
781d7a48
KH
3209 if (composition_state == COMPOSING_RULE
3210 || composition_state == COMPOSING_COMPONENT_RULE)
3211 {
3212 DECODE_COMPOSITION_RULE (c1);
3213 components[component_idx++] = c1;
3214 composition_state--;
3215 continue;
3216 }
b73bfc1c 3217 }
134b9549
KH
3218 if (charset_id_0 < 0)
3219 charset = CHARSET_FROM_ID (charset_ascii);
3220 else
3221 charset = CHARSET_FROM_ID (charset_id_0);
4ed46869
KH
3222 break;
3223
3224 case ISO_0xA0_or_0xFF:
df7492f9
KH
3225 if (charset_id_1 < 0
3226 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3227 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3228 goto invalid_code;
4ed46869
KH
3229 /* This is a graphic character, we fall down ... */
3230
3231 case ISO_graphic_plane_1:
df7492f9
KH
3232 if (charset_id_1 < 0)
3233 goto invalid_code;
3234 charset = CHARSET_FROM_ID (charset_id_1);
4ed46869
KH
3235 break;
3236
df7492f9 3237 case ISO_control_0:
119852e7
KH
3238 if (eol_crlf && c1 == '\r')
3239 ONE_MORE_BYTE (byte_after_cr);
df7492f9
KH
3240 MAYBE_FINISH_COMPOSITION ();
3241 charset = CHARSET_FROM_ID (charset_ascii);
4ed46869
KH
3242 break;
3243
df7492f9
KH
3244 case ISO_control_1:
3245 MAYBE_FINISH_COMPOSITION ();
3246 goto invalid_code;
3247
4ed46869 3248 case ISO_shift_out:
df7492f9
KH
3249 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3250 || CODING_ISO_DESIGNATION (coding, 1) < 0)
3251 goto invalid_code;
3252 CODING_ISO_INVOCATION (coding, 0) = 1;
3253 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3254 continue;
4ed46869
KH
3255
3256 case ISO_shift_in:
df7492f9
KH
3257 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3258 goto invalid_code;
3259 CODING_ISO_INVOCATION (coding, 0) = 0;
3260 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3261 continue;
4ed46869
KH
3262
3263 case ISO_single_shift_2_7:
3264 case ISO_single_shift_2:
df7492f9
KH
3265 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3266 goto invalid_code;
4ed46869
KH
3267 /* SS2 is handled as an escape sequence of ESC 'N' */
3268 c1 = 'N';
3269 goto label_escape_sequence;
3270
3271 case ISO_single_shift_3:
df7492f9
KH
3272 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3273 goto invalid_code;
4ed46869
KH
3274 /* SS2 is handled as an escape sequence of ESC 'O' */
3275 c1 = 'O';
3276 goto label_escape_sequence;
3277
3278 case ISO_control_sequence_introducer:
3279 /* CSI is handled as an escape sequence of ESC '[' ... */
3280 c1 = '[';
3281 goto label_escape_sequence;
3282
3283 case ISO_escape:
3284 ONE_MORE_BYTE (c1);
3285 label_escape_sequence:
df7492f9 3286 /* Escape sequences handled here are invocation,
4ed46869
KH
3287 designation, direction specification, and character
3288 composition specification. */
3289 switch (c1)
3290 {
3291 case '&': /* revision of following character set */
3292 ONE_MORE_BYTE (c1);
3293 if (!(c1 >= '@' && c1 <= '~'))
df7492f9 3294 goto invalid_code;
4ed46869
KH
3295 ONE_MORE_BYTE (c1);
3296 if (c1 != ISO_CODE_ESC)
df7492f9 3297 goto invalid_code;
4ed46869
KH
3298 ONE_MORE_BYTE (c1);
3299 goto label_escape_sequence;
3300
3301 case '$': /* designation of 2-byte character set */
df7492f9
KH
3302 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3303 goto invalid_code;
134b9549
KH
3304 {
3305 int reg, chars96;
3306
3307 ONE_MORE_BYTE (c1);
3308 if (c1 >= '@' && c1 <= 'B')
3309 { /* designation of JISX0208.1978, GB2312.1980,
88993dfd 3310 or JISX0208.1980 */
134b9549
KH
3311 reg = 0, chars96 = 0;
3312 }
3313 else if (c1 >= 0x28 && c1 <= 0x2B)
3314 { /* designation of DIMENSION2_CHARS94 character set */
3315 reg = c1 - 0x28, chars96 = 0;
3316 ONE_MORE_BYTE (c1);
3317 }
3318 else if (c1 >= 0x2C && c1 <= 0x2F)
3319 { /* designation of DIMENSION2_CHARS96 character set */
3320 reg = c1 - 0x2C, chars96 = 1;
3321 ONE_MORE_BYTE (c1);
3322 }
3323 else
3324 goto invalid_code;
3325 DECODE_DESIGNATION (reg, 2, chars96, c1);
3326 /* We must update these variables now. */
3327 if (reg == 0)
3328 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3329 else if (reg == 1)
3330 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3331 if (chars96 < 0)
3332 goto invalid_code;
3333 }
b73bfc1c 3334 continue;
4ed46869
KH
3335
3336 case 'n': /* invocation of locking-shift-2 */
df7492f9
KH
3337 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3338 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3339 goto invalid_code;
3340 CODING_ISO_INVOCATION (coding, 0) = 2;
3341 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3342 continue;
4ed46869
KH
3343
3344 case 'o': /* invocation of locking-shift-3 */
df7492f9
KH
3345 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3346 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3347 goto invalid_code;
3348 CODING_ISO_INVOCATION (coding, 0) = 3;
3349 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3350 continue;
4ed46869
KH
3351
3352 case 'N': /* invocation of single-shift-2 */
df7492f9
KH
3353 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3354 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3355 goto invalid_code;
134b9549
KH
3356 charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3357 if (charset_id_2 < 0)
3358 charset = CHARSET_FROM_ID (charset_ascii);
3359 else
3360 charset = CHARSET_FROM_ID (charset_id_2);
b73bfc1c 3361 ONE_MORE_BYTE (c1);
e7046a18 3362 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
df7492f9 3363 goto invalid_code;
4ed46869
KH
3364 break;
3365
3366 case 'O': /* invocation of single-shift-3 */
df7492f9
KH
3367 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3368 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3369 goto invalid_code;
134b9549
KH
3370 charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3371 if (charset_id_3 < 0)
3372 charset = CHARSET_FROM_ID (charset_ascii);
3373 else
3374 charset = CHARSET_FROM_ID (charset_id_3);
b73bfc1c 3375 ONE_MORE_BYTE (c1);
e7046a18 3376 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
df7492f9 3377 goto invalid_code;
4ed46869
KH
3378 break;
3379
ec6d2bb8 3380 case '0': case '2': case '3': case '4': /* start composition */
df7492f9
KH
3381 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3382 goto invalid_code;
ec6d2bb8 3383 DECODE_COMPOSITION_START (c1);
b73bfc1c 3384 continue;
4ed46869 3385
ec6d2bb8 3386 case '1': /* end composition */
df7492f9
KH
3387 if (composition_state == COMPOSING_NO)
3388 goto invalid_code;
3389 DECODE_COMPOSITION_END ();
b73bfc1c 3390 continue;
4ed46869
KH
3391
3392 case '[': /* specification of direction */
df7492f9
KH
3393 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3394 goto invalid_code;
4ed46869 3395 /* For the moment, nested direction is not supported.
d46c5b12 3396 So, `coding->mode & CODING_MODE_DIRECTION' zero means
df7492f9 3397 left-to-right, and nozero means right-to-left. */
4ed46869
KH
3398 ONE_MORE_BYTE (c1);
3399 switch (c1)
3400 {
3401 case ']': /* end of the current direction */
d46c5b12 3402 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869
KH
3403
3404 case '0': /* end of the current direction */
3405 case '1': /* start of left-to-right direction */
3406 ONE_MORE_BYTE (c1);
3407 if (c1 == ']')
d46c5b12 3408 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869 3409 else
df7492f9 3410 goto invalid_code;
4ed46869
KH
3411 break;
3412
3413 case '2': /* start of right-to-left direction */
3414 ONE_MORE_BYTE (c1);
3415 if (c1 == ']')
d46c5b12 3416 coding->mode |= CODING_MODE_DIRECTION;
4ed46869 3417 else
df7492f9 3418 goto invalid_code;
4ed46869
KH
3419 break;
3420
3421 default:
df7492f9 3422 goto invalid_code;
4ed46869 3423 }
b73bfc1c 3424 continue;
4ed46869 3425
103e0180 3426 case '%':
103e0180
KH
3427 ONE_MORE_BYTE (c1);
3428 if (c1 == '/')
3429 {
3430 /* CTEXT extended segment:
3431 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3432 We keep these bytes as is for the moment.
3433 They may be decoded by post-read-conversion. */
3434 int dim, M, L;
4776e638 3435 int size;
8f924df7 3436
103e0180
KH
3437 ONE_MORE_BYTE (dim);
3438 ONE_MORE_BYTE (M);
3439 ONE_MORE_BYTE (L);
3440 size = ((M - 128) * 128) + (L - 128);
4776e638
KH
3441 if (charbuf + 8 + size > charbuf_end)
3442 goto break_loop;
3443 *charbuf++ = ISO_CODE_ESC;
3444 *charbuf++ = '%';
3445 *charbuf++ = '/';
3446 *charbuf++ = dim;
3447 *charbuf++ = BYTE8_TO_CHAR (M);
3448 *charbuf++ = BYTE8_TO_CHAR (L);
103e0180
KH
3449 while (size-- > 0)
3450 {
3451 ONE_MORE_BYTE (c1);
4776e638 3452 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
103e0180 3453 }
103e0180
KH
3454 }
3455 else if (c1 == 'G')
3456 {
103e0180
KH
3457 /* XFree86 extension for embedding UTF-8 in CTEXT:
3458 ESC % G --UTF-8-BYTES-- ESC % @
3459 We keep these bytes as is for the moment.
3460 They may be decoded by post-read-conversion. */
4776e638
KH
3461 int *p = charbuf;
3462
3463 if (p + 6 > charbuf_end)
3464 goto break_loop;
3465 *p++ = ISO_CODE_ESC;
3466 *p++ = '%';
3467 *p++ = 'G';
3468 while (p < charbuf_end)
103e0180
KH
3469 {
3470 ONE_MORE_BYTE (c1);
3471 if (c1 == ISO_CODE_ESC
3472 && src + 1 < src_end
3473 && src[0] == '%'
3474 && src[1] == '@')
9ffd559c
KH
3475 {
3476 src += 2;
3477 break;
3478 }
4776e638 3479 *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
103e0180 3480 }
4776e638
KH
3481 if (p + 3 > charbuf_end)
3482 goto break_loop;
3483 *p++ = ISO_CODE_ESC;
3484 *p++ = '%';
3485 *p++ = '@';
3486 charbuf = p;
103e0180
KH
3487 }
3488 else
4776e638 3489 goto invalid_code;
103e0180 3490 continue;
4776e638 3491 break;
103e0180 3492
4ed46869 3493 default:
df7492f9
KH
3494 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3495 goto invalid_code;
134b9549
KH
3496 {
3497 int reg, chars96;
3498
3499 if (c1 >= 0x28 && c1 <= 0x2B)
3500 { /* designation of DIMENSION1_CHARS94 character set */
3501 reg = c1 - 0x28, chars96 = 0;
3502 ONE_MORE_BYTE (c1);
3503 }
3504 else if (c1 >= 0x2C && c1 <= 0x2F)
3505 { /* designation of DIMENSION1_CHARS96 character set */
3506 reg = c1 - 0x2C, chars96 = 1;
3507 ONE_MORE_BYTE (c1);
3508 }
3509 else
3510 goto invalid_code;
3511 DECODE_DESIGNATION (reg, 1, chars96, c1);
3512 /* We must update these variables now. */
3513 if (reg == 0)
3514 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3515 else if (reg == 1)
3516 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3517 if (chars96 < 0)
3518 goto invalid_code;
3519 }
b73bfc1c 3520 continue;
4ed46869 3521 }
b73bfc1c 3522 }
4ed46869 3523
ff0dacd7
KH
3524 if (charset->id != charset_ascii
3525 && last_id != charset->id)
3526 {
3527 if (last_id != charset_ascii)
69a80ea3 3528 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
ff0dacd7
KH
3529 last_id = charset->id;
3530 last_offset = char_offset;
3531 }
3532
b73bfc1c 3533 /* Now we know CHARSET and 1st position code C1 of a character.
df7492f9
KH
3534 Produce a decoded character while getting 2nd position code
3535 C2 if necessary. */
3536 c1 &= 0x7F;
3537 if (CHARSET_DIMENSION (charset) > 1)
b73bfc1c
KH
3538 {
3539 ONE_MORE_BYTE (c2);
df7492f9 3540 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
b73bfc1c 3541 /* C2 is not in a valid range. */
df7492f9
KH
3542 goto invalid_code;
3543 c1 = (c1 << 8) | (c2 & 0x7F);
3544 if (CHARSET_DIMENSION (charset) > 2)
3545 {
3546 ONE_MORE_BYTE (c2);
3547 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3548 /* C2 is not in a valid range. */
3549 goto invalid_code;
3550 c1 = (c1 << 8) | (c2 & 0x7F);
3551 }
3552 }
3553
3554 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3555 if (c < 0)
3556 {
3557 MAYBE_FINISH_COMPOSITION ();
3558 for (; src_base < src; src_base++, char_offset++)
3559 {
3560 if (ASCII_BYTE_P (*src_base))
3561 *charbuf++ = *src_base;
3562 else
3563 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3564 }
3565 }
3566 else if (composition_state == COMPOSING_NO)
3567 {
3568 *charbuf++ = c;
3569 char_offset++;
4ed46869 3570 }
df7492f9 3571 else
781d7a48
KH
3572 {
3573 components[component_idx++] = c;
3574 if (method == COMPOSITION_WITH_RULE
3575 || (method == COMPOSITION_WITH_RULE_ALTCHARS
3576 && composition_state == COMPOSING_COMPONENT_CHAR))
3577 composition_state++;
4ed46869
KH
3578 }
3579 continue;
3580
df7492f9
KH
3581 invalid_code:
3582 MAYBE_FINISH_COMPOSITION ();
4ed46869 3583 src = src_base;
df7492f9
KH
3584 consumed_chars = consumed_chars_base;
3585 ONE_MORE_BYTE (c);
065e3595 3586 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 3587 char_offset++;
df7492f9 3588 coding->errors++;
4776e638
KH
3589 continue;
3590
3591 break_loop:
3592 break;
4ed46869 3593 }
fb88bf2d 3594
df7492f9 3595 no_more_source:
ff0dacd7 3596 if (last_id != charset_ascii)
69a80ea3 3597 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
3598 coding->consumed_char += consumed_chars_base;
3599 coding->consumed = src_base - coding->source;
3600 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
3601}
3602
b73bfc1c 3603
f4dee582 3604/* ISO2022 encoding stuff. */
4ed46869
KH
3605
3606/*
f4dee582 3607 It is not enough to say just "ISO2022" on encoding, we have to
df7492f9 3608 specify more details. In Emacs, each coding system of ISO2022
4ed46869 3609 variant has the following specifications:
df7492f9 3610 1. Initial designation to G0 thru G3.
4ed46869
KH
3611 2. Allows short-form designation?
3612 3. ASCII should be designated to G0 before control characters?
3613 4. ASCII should be designated to G0 at end of line?
3614 5. 7-bit environment or 8-bit environment?
3615 6. Use locking-shift?
3616 7. Use Single-shift?
3617 And the following two are only for Japanese:
3618 8. Use ASCII in place of JIS0201-1976-Roman?
3619 9. Use JISX0208-1983 in place of JISX0208-1978?
df7492f9
KH
3620 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3621 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
f4dee582 3622 details.
4ed46869
KH
3623*/
3624
/* Produce at DST (advancing DST) the escape sequence that designates
   CHARSET to graphic register REG, then record the designation in
   CODING.  The sequence has the form

     [ESC '&' <'@' + revision>] ESC ['$'] [<intermediate>] <final-char>

   The revision prefix appears only when CODING has
   CODING_ISO_FLAG_REVISION set and the revision number of CHARSET is
   not negative.  '$' appears when the dimension of CHARSET is not 1.
   For such a charset of 94 characters, <intermediate> is omitted
   (short form) when REG is 0, <final-char> is '@', 'A', or 'B', and
   CODING does not have CODING_ISO_FLAG_LONG_FORM set.

   REG indexes a four-character table, so it must be in 0..3.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
    /* Intermediate characters for registers G0..G3.  */                \
    char *intermediates                                                 \
      = CHARSET_ISO_CHARS_96 (charset) ? ",-./" : "()*+";               \
    int revision = -1;                                                  \
                                                                        \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
      revision = CHARSET_ISO_REVISION (charset);                        \
                                                                        \
    if (revision >= 0)                                                  \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + revision);                                 \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) == 1)                               \
      EMIT_ONE_ASCII_BYTE (intermediates[reg]);                         \
    else                                                                \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        /* Only a 94-character set designated to G0 with final          \
           character '@', 'A', or 'B' may use the short form.  */       \
        if (CHARSET_ISO_CHARS_96 (charset)                              \
            || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM    \
            || reg != 0                                                 \
            || final_char < '@' || final_char > 'B')                    \
          EMIT_ONE_ASCII_BYTE (intermediates[reg]);                     \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (final_char);                                   \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
3672
df7492f9 3673
4ed46869
KH
/* ISO2022 single-shift functions.  ENCODE_SINGLE_SHIFT_2 produces
   single-shift-2 and ENCODE_SINGLE_SHIFT_3 produces single-shift-3.
   Each emits the one-byte control character (ISO_CODE_SS2 or
   ISO_CODE_SS3) unless `coding' has CODING_ISO_FLAG_SEVEN_BITS set,
   in which case the escape sequence ESC 'N' or ESC 'O' is emitted
   instead.  Both then mark `coding' as single-shifting.  */

#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)


#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
3696
df7492f9 3697
4ed46869
KH
/* ISO2022 locking-shift functions (shift-in, shift-out,
   locking-shift-2, and locking-shift-3).  Each of the four macros
   emits the control character or escape sequence of the function and
   records in `coding' which graphic register is now invoked to
   graphic plane 0.  */

/* Shift-in: invoke graphic register 0.  */
#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
  } while (0)


/* Shift-out: invoke graphic register 1.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
  } while (0)


/* Locking-shift-2 (ESC 'n'): invoke graphic register 2.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
  } while (0)
3721
df7492f9
KH
3722
/* Locking-shift-3: invoke graphic register 3 to graphic plane 0.
   The escape sequence of LS3 is ESC 'o'; ESC 'n' is LS2 and would
   make a reader invoke graphic register 2 while we record 3.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
3728
df7492f9 3729
f4dee582
RS
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be an lvalue: when `coding' has
   CODING_ISO_FLAG_USE_ROMAN set and CHARSET is ASCII, CHARSET is
   replaced by the charset whose ID is charset_jisx0201_roman.

   The body is a loop.  Each pass either emits the byte and leaves
   the loop, or calls encode_invocation_designation so that a later
   pass finds CHARSET invoked.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single-shift was just produced; it covers only this        \
           one character.  */                                           \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
3772
df7492f9 3773
f4dee582
RS
/* Produce codes for a DIMENSION2 character of character set CHARSET
   whose position-codes are C1 and C2, producing designation and
   invocation codes beforehand when they are needed.

   CHARSET must be an lvalue: when `coding' has
   CODING_ISO_FLAG_USE_OLDJIS set and CHARSET is the one identified by
   charset_jisx0208, CHARSET is replaced by the charset whose ID is
   charset_jisx0208_1978.

   The body loops until one of the three emitting cases applies.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
        && id == charset_jisx0208)                                      \
      {                                                                 \
        id = charset_jisx0208_1978;                                     \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    /* Case 1: a single-shift is pending for this character.  */        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        else                                                            \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* Case 2: CHARSET is invoked to graphic plane 0.  */               \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))                   \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* Case 3: CHARSET is invoked to graphic plane 1.  */               \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))                   \
      {                                                                 \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* CHARSET is not invoked to any graphic plane yet.  Invoke it      \
       (designating it to some graphic register first if needed),       \
       then go around again to produce the character itself.  */        \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
3816
05e6f5dc 3817
df7492f9
KH
/* Produce codes for character C of character set CHARSET.  The code
   point of C in CHARSET is obtained with ENCODE_CHAR.  A charset of
   dimension 1 uses the code point as is; any other charset is
   treated as two-dimensional, with `code >> 8' as the first
   position-code and the low 8 bits as the second.  */

#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    int code = ENCODE_CHAR ((charset),(c));                                \
                                                                           \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
  } while (0)
bdd9fb48 3827
05e6f5dc 3828
4ed46869 3829/* Produce designation and invocation codes at a place pointed by DST
df7492f9 3830 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
4ed46869
KH
3831 Return new DST. */
3832
3833unsigned char *
df7492f9
KH
3834encode_invocation_designation (charset, coding, dst, p_nchars)
3835 struct charset *charset;
4ed46869
KH
3836 struct coding_system *coding;
3837 unsigned char *dst;
df7492f9 3838 int *p_nchars;
4ed46869 3839{
df7492f9
KH
3840 int multibytep = coding->dst_multibyte;
3841 int produced_chars = *p_nchars;
4ed46869 3842 int reg; /* graphic register number */
df7492f9 3843 int id = CHARSET_ID (charset);
4ed46869
KH
3844
3845 /* At first, check designations. */
3846 for (reg = 0; reg < 4; reg++)
df7492f9 3847 if (id == CODING_ISO_DESIGNATION (coding, reg))
4ed46869
KH
3848 break;
3849
3850 if (reg >= 4)
3851 {
3852 /* CHARSET is not yet designated to any graphic registers. */
3853 /* At first check the requested designation. */
df7492f9
KH
3854 reg = CODING_ISO_REQUEST (coding, id);
3855 if (reg < 0)
1ba9e4ab
KH
3856 /* Since CHARSET requests no special designation, designate it
3857 to graphic register 0. */
4ed46869
KH
3858 reg = 0;
3859
3860 ENCODE_DESIGNATION (charset, reg, coding);
3861 }
3862
df7492f9
KH
3863 if (CODING_ISO_INVOCATION (coding, 0) != reg
3864 && CODING_ISO_INVOCATION (coding, 1) != reg)
4ed46869
KH
3865 {
3866 /* Since the graphic register REG is not invoked to any graphic
3867 planes, invoke it to graphic plane 0. */
3868 switch (reg)
3869 {
3870 case 0: /* graphic register 0 */
3871 ENCODE_SHIFT_IN;
3872 break;
3873
3874 case 1: /* graphic register 1 */
3875 ENCODE_SHIFT_OUT;
3876 break;
3877
3878 case 2: /* graphic register 2 */
df7492f9 3879 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4ed46869
KH
3880 ENCODE_SINGLE_SHIFT_2;
3881 else
3882 ENCODE_LOCKING_SHIFT_2;
3883 break;
3884
3885 case 3: /* graphic register 3 */
df7492f9 3886 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4ed46869
KH
3887 ENCODE_SINGLE_SHIFT_3;
3888 else
3889 ENCODE_LOCKING_SHIFT_3;
3890 break;
3891 }
3892 }
b73bfc1c 3893
df7492f9 3894 *p_nchars = produced_chars;
4ed46869
KH
3895 return dst;
3896}
3897
df7492f9
KH
/* The following three macros produce codes for indicating direction
   of text.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER produces CSI: the two-byte
   sequence ESC '[' when `coding' has CODING_ISO_FLAG_SEVEN_BITS set,
   otherwise the single byte ISO_CODE_CSI.  The flag is tested with
   `&' like every other flag test in this file; comparing the whole
   flag word with `==' selected the 7-bit form only when no other
   flag was set.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so the
   two direction macros invoke it without an argument list.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
  do {                                                                  \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
    else                                                                \
      EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
  } while (0)


/* Produce CSI '2' ']' (right-to-left).  */
#define ENCODE_DIRECTION_R2L()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('2', ']');            \
  } while (0)


/* Produce CSI '0' ']' (left-to-right).  */
#define ENCODE_DIRECTION_L2R()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('0', ']');            \
  } while (0)
4ed46869 3921
4ed46869
KH
3922
/* Produce the codes that bring the graphic planes and registers back
   to the initial state: shift-in if graphic plane 0 does not hold
   graphic register 0, then a designation for every graphic register
   that has an initial charset (CODING_ISO_INITIAL >= 0) other than
   the charset currently designated to it.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reg;                                                            \
    struct charset *charset;                                            \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    for (reg = 0; reg < 4; reg++)                                       \
      {                                                                 \
        int initial = CODING_ISO_INITIAL (coding, reg);                 \
                                                                        \
        /* Nothing to do for a register with no initial charset or      \
           one that already holds it.  */                               \
        if (initial < 0                                                 \
            || CODING_ISO_DESIGNATION (coding, reg) == initial)         \
          continue;                                                     \
        charset = CHARSET_FROM_ID (initial);                            \
        ENCODE_DESIGNATION (charset, reg, coding);                      \
      }                                                                 \
  } while (0)
3941
df7492f9 3942
bdd9fb48 3943/* Produce designation sequences of charsets in the line started from
b73bfc1c 3944 SRC to a place pointed by DST, and return updated DST.
bdd9fb48
KH
3945
3946 If the current block ends before any end-of-line, we may fail to
d46c5b12
KH
3947 find all the necessary designations. */
3948
b73bfc1c 3949static unsigned char *
df7492f9 3950encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
e0e989f6 3951 struct coding_system *coding;
df7492f9
KH
3952 int *charbuf, *charbuf_end;
3953 unsigned char *dst;
e0e989f6 3954{
df7492f9 3955 struct charset *charset;
bdd9fb48
KH
3956 /* Table of charsets to be designated to each graphic register. */
3957 int r[4];
df7492f9
KH
3958 int c, found = 0, reg;
3959 int produced_chars = 0;
3960 int multibytep = coding->dst_multibyte;
3961 Lisp_Object attrs;
3962 Lisp_Object charset_list;
3963
3964 attrs = CODING_ID_ATTRS (coding->id);
3965 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3966 if (EQ (charset_list, Qiso_2022))
3967 charset_list = Viso_2022_charset_list;
bdd9fb48
KH
3968
3969 for (reg = 0; reg < 4; reg++)
3970 r[reg] = -1;
3971
b73bfc1c 3972 while (found < 4)
e0e989f6 3973 {
df7492f9
KH
3974 int id;
3975
3976 c = *charbuf++;
b73bfc1c
KH
3977 if (c == '\n')
3978 break;
df7492f9
KH
3979 charset = char_charset (c, charset_list, NULL);
3980 id = CHARSET_ID (charset);
3981 reg = CODING_ISO_REQUEST (coding, id);
3982 if (reg >= 0 && r[reg] < 0)
bdd9fb48
KH
3983 {
3984 found++;
df7492f9 3985 r[reg] = id;
bdd9fb48 3986 }
bdd9fb48
KH
3987 }
3988
3989 if (found)
3990 {
3991 for (reg = 0; reg < 4; reg++)
3992 if (r[reg] >= 0
df7492f9
KH
3993 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3994 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
e0e989f6 3995 }
b73bfc1c
KH
3996
3997 return dst;
e0e989f6
KH
3998}
3999
4ed46869
KH
4000/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
4001
df7492f9
KH
4002static int
4003encode_coding_iso_2022 (coding)
4ed46869 4004 struct coding_system *coding;
4ed46869 4005{
df7492f9
KH
4006 int multibytep = coding->dst_multibyte;
4007 int *charbuf = coding->charbuf;
4008 int *charbuf_end = charbuf + coding->charbuf_used;
4009 unsigned char *dst = coding->destination + coding->produced;
4010 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4011 int safe_room = 16;
4012 int bol_designation
4013 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4014 && CODING_ISO_BOL (coding));
4015 int produced_chars = 0;
4016 Lisp_Object attrs, eol_type, charset_list;
4017 int ascii_compatible;
b73bfc1c 4018 int c;
ff0dacd7 4019 int preferred_charset_id = -1;
05e6f5dc 4020
24a73b0a
KH
4021 CODING_GET_INFO (coding, attrs, charset_list);
4022 eol_type = CODING_ID_EOL_TYPE (coding->id);
4023 if (VECTORP (eol_type))
4024 eol_type = Qunix;
4025
004068e4 4026 setup_iso_safe_charsets (attrs);
ff0dacd7 4027 /* Charset list may have been changed. */
287c57d7 4028 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8f924df7 4029 coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
0eecad43 4030
df7492f9 4031 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
bdd9fb48 4032
df7492f9 4033 while (charbuf < charbuf_end)
4ed46869 4034 {
df7492f9 4035 ASSURE_DESTINATION (safe_room);
b73bfc1c 4036
df7492f9 4037 if (bol_designation)
b73bfc1c 4038 {
df7492f9 4039 unsigned char *dst_prev = dst;
4ed46869 4040
bdd9fb48 4041 /* We have to produce designation sequences if any now. */
df7492f9
KH
4042 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
4043 bol_designation = 0;
4044 /* We are sure that designation sequences are all ASCII bytes. */
4045 produced_chars += dst - dst_prev;
e0e989f6
KH
4046 }
4047
df7492f9 4048 c = *charbuf++;
ec6d2bb8 4049
ff0dacd7
KH
4050 if (c < 0)
4051 {
4052 /* Handle an annotation. */
4053 switch (*charbuf)
ec6d2bb8 4054 {
ff0dacd7
KH
4055 case CODING_ANNOTATE_COMPOSITION_MASK:
4056 /* Not yet implemented. */
4057 break;
4058 case CODING_ANNOTATE_CHARSET_MASK:
cf7dfdf5 4059 preferred_charset_id = charbuf[2];
ff0dacd7
KH
4060 if (preferred_charset_id >= 0
4061 && NILP (Fmemq (make_number (preferred_charset_id),
4062 charset_list)))
4063 preferred_charset_id = -1;
4064 break;
4065 default:
4066 abort ();
4ed46869 4067 }
ff0dacd7
KH
4068 charbuf += -c - 1;
4069 continue;
4ed46869 4070 }
ec6d2bb8 4071
b73bfc1c
KH
4072 /* Now encode the character C. */
4073 if (c < 0x20 || c == 0x7F)
4074 {
df7492f9
KH
4075 if (c == '\n'
4076 || (c == '\r' && EQ (eol_type, Qmac)))
19a8d9e0 4077 {
df7492f9
KH
4078 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4079 ENCODE_RESET_PLANE_AND_REGISTER ();
4080 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
b73bfc1c 4081 {
df7492f9
KH
4082 int i;
4083
4084 for (i = 0; i < 4; i++)
4085 CODING_ISO_DESIGNATION (coding, i)
4086 = CODING_ISO_INITIAL (coding, i);
b73bfc1c 4087 }
df7492f9
KH
4088 bol_designation
4089 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
19a8d9e0 4090 }
df7492f9
KH
4091 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4092 ENCODE_RESET_PLANE_AND_REGISTER ();
4093 EMIT_ONE_ASCII_BYTE (c);
4ed46869 4094 }
df7492f9 4095 else if (ASCII_CHAR_P (c))
88993dfd 4096 {
df7492f9
KH
4097 if (ascii_compatible)
4098 EMIT_ONE_ASCII_BYTE (c);
93dec019 4099 else
19a8d9e0 4100 {
bf16eb23
KH
4101 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4102 ENCODE_ISO_CHARACTER (charset, c);
19a8d9e0 4103 }
4ed46869 4104 }
16eafb5d 4105 else if (CHAR_BYTE8_P (c))
88993dfd 4106 {
16eafb5d
KH
4107 c = CHAR_TO_BYTE8 (c);
4108 EMIT_ONE_BYTE (c);
88993dfd 4109 }
b73bfc1c 4110 else
df7492f9 4111 {
ff0dacd7 4112 struct charset *charset;
b73bfc1c 4113
ff0dacd7
KH
4114 if (preferred_charset_id >= 0)
4115 {
4116 charset = CHARSET_FROM_ID (preferred_charset_id);
4117 if (! CHAR_CHARSET_P (c, charset))
4118 charset = char_charset (c, charset_list, NULL);
4119 }
4120 else
4121 charset = char_charset (c, charset_list, NULL);
df7492f9
KH
4122 if (!charset)
4123 {
41cbe562
KH
4124 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4125 {
4126 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4127 charset = CHARSET_FROM_ID (charset_ascii);
4128 }
4129 else
4130 {
4131 c = coding->default_char;
4132 charset = char_charset (c, charset_list, NULL);
4133 }
df7492f9
KH
4134 }
4135 ENCODE_ISO_CHARACTER (charset, c);
4136 }
84fbb8a0 4137 }
b73bfc1c 4138
df7492f9
KH
4139 if (coding->mode & CODING_MODE_LAST_BLOCK
4140 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4141 {
4142 ASSURE_DESTINATION (safe_room);
4143 ENCODE_RESET_PLANE_AND_REGISTER ();
4144 }
065e3595 4145 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4146 CODING_ISO_BOL (coding) = bol_designation;
4147 coding->produced_char += produced_chars;
4148 coding->produced = dst - coding->destination;
4149 return 0;
4ed46869
KH
4150}
4151
4152\f
df7492f9 4153/*** 8,9. SJIS and BIG5 handlers ***/
4ed46869 4154
df7492f9 4155/* Although SJIS and BIG5 are not ISO's coding system, they are used
4ed46869
KH
4156 quite widely. So, for the moment, Emacs supports them in the bare
4157 C code. But, in the future, they may be supported only by CCL. */
4158
4159/* SJIS is a coding system encoding three character sets: ASCII, right
4160 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
4161 as is. A character of charset katakana-jisx0201 is encoded by
4162 "position-code + 0x80". A character of charset japanese-jisx0208
4163 is encoded in 2-byte but two position-codes are divided and shifted
df7492f9 4164 so that it fit in the range below.
4ed46869
KH
4165
4166 --- CODE RANGE of SJIS ---
4167 (character set) (range)
4168 ASCII 0x00 .. 0x7F
df7492f9 4169 KATAKANA-JISX0201 0xA0 .. 0xDF
c28a9453 4170 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
d14d03ac 4171 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4ed46869
KH
4172 -------------------------------
4173
4174*/
4175
4176/* BIG5 is a coding system encoding two character sets: ASCII and
4177 Big5. An ASCII character is encoded as is. Big5 is a two-byte
df7492f9 4178 character set and is encoded in two-byte.
4ed46869
KH
4179
4180 --- CODE RANGE of BIG5 ---
4181 (character set) (range)
4182 ASCII 0x00 .. 0x7F
4183 Big5 (1st byte) 0xA1 .. 0xFE
4184 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
4185 --------------------------
4186
df7492f9 4187 */
4ed46869
KH
4188
4189/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4190 Check if a text is encoded in SJIS. If it is, return
df7492f9 4191 CATEGORY_MASK_SJIS, else return 0. */
4ed46869 4192
0a28aafb 4193static int
ff0dacd7 4194detect_coding_sjis (coding, detect_info)
df7492f9 4195 struct coding_system *coding;
ff0dacd7 4196 struct coding_detection_info *detect_info;
4ed46869 4197{
065e3595 4198 const unsigned char *src = coding->source, *src_base;
8f924df7 4199 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
4200 int multibytep = coding->src_multibyte;
4201 int consumed_chars = 0;
4202 int found = 0;
b73bfc1c 4203 int c;
df7492f9 4204
ff0dacd7 4205 detect_info->checked |= CATEGORY_MASK_SJIS;
df7492f9
KH
4206 /* A coding system of this category is always ASCII compatible. */
4207 src += coding->head_ascii;
4ed46869 4208
b73bfc1c 4209 while (1)
4ed46869 4210 {
065e3595 4211 src_base = src;
df7492f9 4212 ONE_MORE_BYTE (c);
682169fe
KH
4213 if (c < 0x80)
4214 continue;
df7492f9 4215 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
4ed46869 4216 {
df7492f9 4217 ONE_MORE_BYTE (c);
682169fe 4218 if (c < 0x40 || c == 0x7F || c > 0xFC)
df7492f9 4219 break;
ff0dacd7 4220 found = CATEGORY_MASK_SJIS;
4ed46869 4221 }
df7492f9 4222 else if (c >= 0xA0 && c < 0xE0)
ff0dacd7 4223 found = CATEGORY_MASK_SJIS;
df7492f9
KH
4224 else
4225 break;
4ed46869 4226 }
ff0dacd7 4227 detect_info->rejected |= CATEGORY_MASK_SJIS;
df7492f9
KH
4228 return 0;
4229
4230 no_more_source:
065e3595 4231 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
89528eb3 4232 {
ff0dacd7 4233 detect_info->rejected |= CATEGORY_MASK_SJIS;
89528eb3 4234 return 0;
4ed46869 4235 }
ff0dacd7
KH
4236 detect_info->found |= found;
4237 return 1;
4ed46869
KH
4238}
4239
4240/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4241 Check if a text is encoded in BIG5. If it is, return
df7492f9 4242 CATEGORY_MASK_BIG5, else return 0. */
4ed46869 4243
0a28aafb 4244static int
ff0dacd7 4245detect_coding_big5 (coding, detect_info)
df7492f9 4246 struct coding_system *coding;
ff0dacd7 4247 struct coding_detection_info *detect_info;
4ed46869 4248{
065e3595 4249 const unsigned char *src = coding->source, *src_base;
8f924df7 4250 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
4251 int multibytep = coding->src_multibyte;
4252 int consumed_chars = 0;
4253 int found = 0;
b73bfc1c 4254 int c;
fa42c37f 4255
ff0dacd7 4256 detect_info->checked |= CATEGORY_MASK_BIG5;
df7492f9
KH
4257 /* A coding system of this category is always ASCII compatible. */
4258 src += coding->head_ascii;
fa42c37f 4259
b73bfc1c 4260 while (1)
fa42c37f 4261 {
065e3595 4262 src_base = src;
df7492f9
KH
4263 ONE_MORE_BYTE (c);
4264 if (c < 0x80)
fa42c37f 4265 continue;
df7492f9 4266 if (c >= 0xA1)
fa42c37f 4267 {
df7492f9
KH
4268 ONE_MORE_BYTE (c);
4269 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
fa42c37f 4270 return 0;
ff0dacd7 4271 found = CATEGORY_MASK_BIG5;
fa42c37f 4272 }
df7492f9
KH
4273 else
4274 break;
fa42c37f 4275 }
ff0dacd7 4276 detect_info->rejected |= CATEGORY_MASK_BIG5;
fa42c37f 4277 return 0;
fa42c37f 4278
df7492f9 4279 no_more_source:
065e3595 4280 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
89528eb3 4281 {
ff0dacd7 4282 detect_info->rejected |= CATEGORY_MASK_BIG5;
89528eb3
KH
4283 return 0;
4284 }
ff0dacd7
KH
4285 detect_info->found |= found;
4286 return 1;
fa42c37f
KH
4287}
4288
4ed46869
KH
4289/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4290 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
fa42c37f 4291
b73bfc1c 4292static void
df7492f9 4293decode_coding_sjis (coding)
4ed46869 4294 struct coding_system *coding;
4ed46869 4295{
8f924df7
KH
4296 const unsigned char *src = coding->source + coding->consumed;
4297 const unsigned char *src_end = coding->source + coding->src_bytes;
4298 const unsigned char *src_base;
69a80ea3
KH
4299 int *charbuf = coding->charbuf + coding->charbuf_used;
4300 int *charbuf_end
4301 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
df7492f9
KH
4302 int consumed_chars = 0, consumed_chars_base;
4303 int multibytep = coding->src_multibyte;
4304 struct charset *charset_roman, *charset_kanji, *charset_kana;
57a47f8a 4305 struct charset *charset_kanji2;
24a73b0a 4306 Lisp_Object attrs, charset_list, val;
ff0dacd7
KH
4307 int char_offset = coding->produced_char;
4308 int last_offset = char_offset;
4309 int last_id = charset_ascii;
119852e7
KH
4310 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4311 int byte_after_cr = -1;
a5d301df 4312
24a73b0a 4313 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4314
4315 val = charset_list;
4316 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
89528eb3 4317 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
57a47f8a
KH
4318 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4319 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
fa42c37f 4320
b73bfc1c 4321 while (1)
4ed46869 4322 {
df7492f9 4323 int c, c1;
24a73b0a 4324 struct charset *charset;
fa42c37f 4325
b73bfc1c 4326 src_base = src;
df7492f9 4327 consumed_chars_base = consumed_chars;
fa42c37f 4328
df7492f9
KH
4329 if (charbuf >= charbuf_end)
4330 break;
4331
119852e7
KH
4332 if (byte_after_cr >= 0)
4333 c = byte_after_cr, byte_after_cr = -1;
4334 else
4335 ONE_MORE_BYTE (c);
065e3595
KH
4336 if (c < 0)
4337 goto invalid_code;
24a73b0a 4338 if (c < 0x80)
119852e7
KH
4339 {
4340 if (eol_crlf && c == '\r')
4341 ONE_MORE_BYTE (byte_after_cr);
4342 charset = charset_roman;
4343 }
57a47f8a 4344 else if (c == 0x80 || c == 0xA0)
8e921c4b 4345 goto invalid_code;
57a47f8a
KH
4346 else if (c >= 0xA1 && c <= 0xDF)
4347 {
4348 /* SJIS -> JISX0201-Kana */
4349 c &= 0x7F;
4350 charset = charset_kana;
4351 }
4352 else if (c <= 0xEF)
df7492f9 4353 {
57a47f8a
KH
4354 /* SJIS -> JISX0208 */
4355 ONE_MORE_BYTE (c1);
4356 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
24a73b0a 4357 goto invalid_code;
57a47f8a
KH
4358 c = (c << 8) | c1;
4359 SJIS_TO_JIS (c);
4360 charset = charset_kanji;
4361 }
4362 else if (c <= 0xFC && charset_kanji2)
4363 {
c6876370 4364 /* SJIS -> JISX0213-2 */
57a47f8a
KH
4365 ONE_MORE_BYTE (c1);
4366 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
24a73b0a 4367 goto invalid_code;
57a47f8a
KH
4368 c = (c << 8) | c1;
4369 SJIS_TO_JIS2 (c);
4370 charset = charset_kanji2;
df7492f9 4371 }
57a47f8a
KH
4372 else
4373 goto invalid_code;
24a73b0a
KH
4374 if (charset->id != charset_ascii
4375 && last_id != charset->id)
4376 {
4377 if (last_id != charset_ascii)
69a80ea3 4378 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
24a73b0a
KH
4379 last_id = charset->id;
4380 last_offset = char_offset;
4381 }
4382 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
df7492f9 4383 *charbuf++ = c;
ff0dacd7 4384 char_offset++;
df7492f9 4385 continue;
b73bfc1c 4386
df7492f9
KH
4387 invalid_code:
4388 src = src_base;
4389 consumed_chars = consumed_chars_base;
4390 ONE_MORE_BYTE (c);
065e3595 4391 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
ff0dacd7 4392 char_offset++;
df7492f9
KH
4393 coding->errors++;
4394 }
fa42c37f 4395
df7492f9 4396 no_more_source:
ff0dacd7 4397 if (last_id != charset_ascii)
69a80ea3 4398 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
4399 coding->consumed_char += consumed_chars_base;
4400 coding->consumed = src_base - coding->source;
4401 coding->charbuf_used = charbuf - coding->charbuf;
fa42c37f
KH
4402}
4403
b73bfc1c 4404static void
df7492f9 4405decode_coding_big5 (coding)
4ed46869 4406 struct coding_system *coding;
4ed46869 4407{
8f924df7
KH
4408 const unsigned char *src = coding->source + coding->consumed;
4409 const unsigned char *src_end = coding->source + coding->src_bytes;
4410 const unsigned char *src_base;
69a80ea3
KH
4411 int *charbuf = coding->charbuf + coding->charbuf_used;
4412 int *charbuf_end
4413 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
df7492f9
KH
4414 int consumed_chars = 0, consumed_chars_base;
4415 int multibytep = coding->src_multibyte;
4416 struct charset *charset_roman, *charset_big5;
24a73b0a 4417 Lisp_Object attrs, charset_list, val;
ff0dacd7
KH
4418 int char_offset = coding->produced_char;
4419 int last_offset = char_offset;
4420 int last_id = charset_ascii;
119852e7
KH
4421 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4422 int byte_after_cr = -1;
df7492f9 4423
24a73b0a 4424 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4425 val = charset_list;
4426 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4427 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4ed46869 4428
b73bfc1c 4429 while (1)
4ed46869 4430 {
df7492f9 4431 int c, c1;
24a73b0a 4432 struct charset *charset;
b73bfc1c
KH
4433
4434 src_base = src;
df7492f9
KH
4435 consumed_chars_base = consumed_chars;
4436
4437 if (charbuf >= charbuf_end)
4438 break;
4439
119852e7 4440 if (byte_after_cr >= 0)
14daee73 4441 c = byte_after_cr, byte_after_cr = -1;
119852e7
KH
4442 else
4443 ONE_MORE_BYTE (c);
b73bfc1c 4444
065e3595
KH
4445 if (c < 0)
4446 goto invalid_code;
24a73b0a 4447 if (c < 0x80)
119852e7 4448 {
14daee73 4449 if (eol_crlf && c == '\r')
119852e7
KH
4450 ONE_MORE_BYTE (byte_after_cr);
4451 charset = charset_roman;
4452 }
24a73b0a 4453 else
4ed46869 4454 {
24a73b0a
KH
4455 /* BIG5 -> Big5 */
4456 if (c < 0xA1 || c > 0xFE)
4457 goto invalid_code;
4458 ONE_MORE_BYTE (c1);
4459 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4460 goto invalid_code;
4461 c = c << 8 | c1;
4462 charset = charset_big5;
4ed46869 4463 }
24a73b0a
KH
4464 if (charset->id != charset_ascii
4465 && last_id != charset->id)
df7492f9 4466 {
24a73b0a 4467 if (last_id != charset_ascii)
69a80ea3 4468 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
24a73b0a
KH
4469 last_id = charset->id;
4470 last_offset = char_offset;
4ed46869 4471 }
24a73b0a 4472 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
df7492f9 4473 *charbuf++ = c;
ff0dacd7 4474 char_offset++;
fb88bf2d
KH
4475 continue;
4476
df7492f9 4477 invalid_code:
4ed46869 4478 src = src_base;
df7492f9
KH
4479 consumed_chars = consumed_chars_base;
4480 ONE_MORE_BYTE (c);
065e3595 4481 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
ff0dacd7 4482 char_offset++;
df7492f9 4483 coding->errors++;
fb88bf2d 4484 }
d46c5b12 4485
df7492f9 4486 no_more_source:
ff0dacd7 4487 if (last_id != charset_ascii)
69a80ea3 4488 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
4489 coding->consumed_char += consumed_chars_base;
4490 coding->consumed = src_base - coding->source;
4491 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
4492}
4493
4494/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
b73bfc1c
KH
4495 This function can encode charsets `ascii', `katakana-jisx0201',
4496 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4497 are sure that all these charsets are registered as official charset
4ed46869
KH
4498 (i.e. do not have extended leading-codes). Characters of other
4499 charsets are produced without any encoding. If SJIS_P is 1, encode
4500 SJIS text, else encode BIG5 text. */
4501
df7492f9
KH
4502static int
4503encode_coding_sjis (coding)
4ed46869 4504 struct coding_system *coding;
4ed46869 4505{
df7492f9
KH
4506 int multibytep = coding->dst_multibyte;
4507 int *charbuf = coding->charbuf;
4508 int *charbuf_end = charbuf + coding->charbuf_used;
4509 unsigned char *dst = coding->destination + coding->produced;
4510 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4511 int safe_room = 4;
4512 int produced_chars = 0;
24a73b0a 4513 Lisp_Object attrs, charset_list, val;
df7492f9
KH
4514 int ascii_compatible;
4515 struct charset *charset_roman, *charset_kanji, *charset_kana;
57a47f8a 4516 struct charset *charset_kanji2;
df7492f9 4517 int c;
a5d301df 4518
24a73b0a 4519 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4520 val = charset_list;
4521 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4522 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
57a47f8a
KH
4523 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4524 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4ed46869 4525
df7492f9 4526 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
93dec019 4527
df7492f9
KH
4528 while (charbuf < charbuf_end)
4529 {
4530 ASSURE_DESTINATION (safe_room);
4531 c = *charbuf++;
b73bfc1c 4532 /* Now encode the character C. */
df7492f9
KH
4533 if (ASCII_CHAR_P (c) && ascii_compatible)
4534 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
4535 else if (CHAR_BYTE8_P (c))
4536 {
4537 c = CHAR_TO_BYTE8 (c);
4538 EMIT_ONE_BYTE (c);
4539 }
df7492f9 4540 else
b73bfc1c 4541 {
df7492f9
KH
4542 unsigned code;
4543 struct charset *charset = char_charset (c, charset_list, &code);
4544
4545 if (!charset)
4ed46869 4546 {
41cbe562 4547 if (coding->mode & CODING_MODE_SAFE_ENCODING)
b73bfc1c 4548 {
41cbe562
KH
4549 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4550 charset = CHARSET_FROM_ID (charset_ascii);
b73bfc1c 4551 }
41cbe562 4552 else
b73bfc1c 4553 {
41cbe562
KH
4554 c = coding->default_char;
4555 charset = char_charset (c, charset_list, &code);
b73bfc1c 4556 }
b73bfc1c 4557 }
df7492f9
KH
4558 if (code == CHARSET_INVALID_CODE (charset))
4559 abort ();
4560 if (charset == charset_kanji)
4561 {
4562 int c1, c2;
4563 JIS_TO_SJIS (code);
4564 c1 = code >> 8, c2 = code & 0xFF;
4565 EMIT_TWO_BYTES (c1, c2);
4566 }
4567 else if (charset == charset_kana)
4568 EMIT_ONE_BYTE (code | 0x80);
57a47f8a
KH
4569 else if (charset_kanji2 && charset == charset_kanji2)
4570 {
4571 int c1, c2;
4572
4573 c1 = code >> 8;
4574 if (c1 == 0x21 || (c1 >= 0x23 && c1 < 0x25)
4575 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4576 {
4577 JIS_TO_SJIS2 (code);
4578 c1 = code >> 8, c2 = code & 0xFF;
4579 EMIT_TWO_BYTES (c1, c2);
4580 }
4581 else
4582 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4583 }
df7492f9
KH
4584 else
4585 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4586 }
4587 }
065e3595 4588 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4589 coding->produced_char += produced_chars;
4590 coding->produced = dst - coding->destination;
4591 return 0;
4592}
4593
4594static int
4595encode_coding_big5 (coding)
4596 struct coding_system *coding;
4597{
4598 int multibytep = coding->dst_multibyte;
4599 int *charbuf = coding->charbuf;
4600 int *charbuf_end = charbuf + coding->charbuf_used;
4601 unsigned char *dst = coding->destination + coding->produced;
4602 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4603 int safe_room = 4;
4604 int produced_chars = 0;
24a73b0a 4605 Lisp_Object attrs, charset_list, val;
df7492f9
KH
4606 int ascii_compatible;
4607 struct charset *charset_roman, *charset_big5;
4608 int c;
4609
24a73b0a 4610 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4611 val = charset_list;
4612 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4613 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4614 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4615
4616 while (charbuf < charbuf_end)
4617 {
4618 ASSURE_DESTINATION (safe_room);
4619 c = *charbuf++;
4620 /* Now encode the character C. */
4621 if (ASCII_CHAR_P (c) && ascii_compatible)
4622 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
4623 else if (CHAR_BYTE8_P (c))
4624 {
4625 c = CHAR_TO_BYTE8 (c);
4626 EMIT_ONE_BYTE (c);
b73bfc1c
KH
4627 }
4628 else
4629 {
df7492f9
KH
4630 unsigned code;
4631 struct charset *charset = char_charset (c, charset_list, &code);
4632
4633 if (! charset)
b73bfc1c 4634 {
41cbe562 4635 if (coding->mode & CODING_MODE_SAFE_ENCODING)
b73bfc1c 4636 {
41cbe562
KH
4637 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4638 charset = CHARSET_FROM_ID (charset_ascii);
b73bfc1c 4639 }
41cbe562 4640 else
0eecad43 4641 {
41cbe562
KH
4642 c = coding->default_char;
4643 charset = char_charset (c, charset_list, &code);
0eecad43 4644 }
4ed46869 4645 }
df7492f9
KH
4646 if (code == CHARSET_INVALID_CODE (charset))
4647 abort ();
4648 if (charset == charset_big5)
b73bfc1c 4649 {
df7492f9
KH
4650 int c1, c2;
4651
4652 c1 = code >> 8, c2 = code & 0xFF;
4653 EMIT_TWO_BYTES (c1, c2);
b73bfc1c 4654 }
df7492f9
KH
4655 else
4656 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4ed46869 4657 }
4ed46869 4658 }
065e3595 4659 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4660 coding->produced_char += produced_chars;
4661 coding->produced = dst - coding->destination;
4662 return 0;
4ed46869
KH
4663}
4664
4665\f
df7492f9 4666/*** 10. CCL handlers ***/
1397dc18
KH
4667
4668/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4669 Check if a text is encoded in a coding system of which
4670 encoder/decoder are written in CCL program. If it is, return
df7492f9 4671 CATEGORY_MASK_CCL, else return 0. */
1397dc18 4672
0a28aafb 4673static int
ff0dacd7 4674detect_coding_ccl (coding, detect_info)
df7492f9 4675 struct coding_system *coding;
ff0dacd7 4676 struct coding_detection_info *detect_info;
1397dc18 4677{
065e3595 4678 const unsigned char *src = coding->source, *src_base;
8f924df7 4679 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
4680 int multibytep = coding->src_multibyte;
4681 int consumed_chars = 0;
4682 int found = 0;
0e219d54 4683 unsigned char *valids;
df7492f9
KH
4684 int head_ascii = coding->head_ascii;
4685 Lisp_Object attrs;
4686
ff0dacd7
KH
4687 detect_info->checked |= CATEGORY_MASK_CCL;
4688
df7492f9 4689 coding = &coding_categories[coding_category_ccl];
0e219d54 4690 valids = CODING_CCL_VALIDS (coding);
df7492f9
KH
4691 attrs = CODING_ID_ATTRS (coding->id);
4692 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4693 src += head_ascii;
1397dc18 4694
b73bfc1c 4695 while (1)
1397dc18 4696 {
df7492f9 4697 int c;
065e3595
KH
4698
4699 src_base = src;
df7492f9 4700 ONE_MORE_BYTE (c);
065e3595 4701 if (c < 0 || ! valids[c])
df7492f9 4702 break;
ff0dacd7
KH
4703 if ((valids[c] > 1))
4704 found = CATEGORY_MASK_CCL;
df7492f9 4705 }
ff0dacd7 4706 detect_info->rejected |= CATEGORY_MASK_CCL;
df7492f9
KH
4707 return 0;
4708
4709 no_more_source:
ff0dacd7
KH
4710 detect_info->found |= found;
4711 return 1;
df7492f9
KH
4712}
4713
4714static void
4715decode_coding_ccl (coding)
4716 struct coding_system *coding;
4717{
7c78e542 4718 const unsigned char *src = coding->source + coding->consumed;
8f924df7 4719 const unsigned char *src_end = coding->source + coding->src_bytes;
69a80ea3
KH
4720 int *charbuf = coding->charbuf + coding->charbuf_used;
4721 int *charbuf_end = coding->charbuf + coding->charbuf_size;
df7492f9
KH
4722 int consumed_chars = 0;
4723 int multibytep = coding->src_multibyte;
4724 struct ccl_program ccl;
4725 int source_charbuf[1024];
4726 int source_byteidx[1024];
24a73b0a 4727 Lisp_Object attrs, charset_list;
df7492f9 4728
24a73b0a 4729 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4730 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4731
4732 while (src < src_end)
4733 {
7c78e542 4734 const unsigned char *p = src;
df7492f9
KH
4735 int *source, *source_end;
4736 int i = 0;
4737
4738 if (multibytep)
4739 while (i < 1024 && p < src_end)
4740 {
4741 source_byteidx[i] = p - src;
4742 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4743 }
4744 else
4745 while (i < 1024 && p < src_end)
4746 source_charbuf[i++] = *p++;
8f924df7 4747
df7492f9
KH
4748 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4749 ccl.last_block = 1;
4750
4751 source = source_charbuf;
4752 source_end = source + i;
4753 while (source < source_end)
4754 {
4755 ccl_driver (&ccl, source, charbuf,
8dcbea82
KH
4756 source_end - source, charbuf_end - charbuf,
4757 charset_list);
df7492f9
KH
4758 source += ccl.consumed;
4759 charbuf += ccl.produced;
4760 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4761 break;
4762 }
4763 if (source < source_end)
4764 src += source_byteidx[source - source_charbuf];
4765 else
4766 src = p;
4767 consumed_chars += source - source_charbuf;
4768
4769 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4770 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4771 break;
4772 }
4773
4774 switch (ccl.status)
4775 {
4776 case CCL_STAT_SUSPEND_BY_SRC:
065e3595 4777 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
df7492f9
KH
4778 break;
4779 case CCL_STAT_SUSPEND_BY_DST:
4780 break;
4781 case CCL_STAT_QUIT:
4782 case CCL_STAT_INVALID_CMD:
065e3595 4783 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
df7492f9
KH
4784 break;
4785 default:
065e3595 4786 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4787 break;
4788 }
4789 coding->consumed_char += consumed_chars;
4790 coding->consumed = src - coding->source;
4791 coding->charbuf_used = charbuf - coding->charbuf;
4792}
4793
4794static int
4795encode_coding_ccl (coding)
4796 struct coding_system *coding;
4797{
4798 struct ccl_program ccl;
4799 int multibytep = coding->dst_multibyte;
4800 int *charbuf = coding->charbuf;
4801 int *charbuf_end = charbuf + coding->charbuf_used;
4802 unsigned char *dst = coding->destination + coding->produced;
4803 unsigned char *dst_end = coding->destination + coding->dst_bytes;
df7492f9
KH
4804 int destination_charbuf[1024];
4805 int i, produced_chars = 0;
24a73b0a 4806 Lisp_Object attrs, charset_list;
df7492f9 4807
24a73b0a 4808 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4809 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4810
4811 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4812 ccl.dst_multibyte = coding->dst_multibyte;
4813
8cffd3e7 4814 while (charbuf < charbuf_end)
df7492f9 4815 {
df7492f9 4816 ccl_driver (&ccl, charbuf, destination_charbuf,
8cffd3e7 4817 charbuf_end - charbuf, 1024, charset_list);
df7492f9 4818 if (multibytep)
8cffd3e7
KH
4819 {
4820 ASSURE_DESTINATION (ccl.produced * 2);
4821 for (i = 0; i < ccl.produced; i++)
4822 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4823 }
df7492f9
KH
4824 else
4825 {
8cffd3e7 4826 ASSURE_DESTINATION (ccl.produced);
3ed051d4 4827 for (i = 0; i < ccl.produced; i++)
df7492f9
KH
4828 *dst++ = destination_charbuf[i] & 0xFF;
4829 produced_chars += ccl.produced;
4830 }
8cffd3e7
KH
4831 charbuf += ccl.consumed;
4832 if (ccl.status == CCL_STAT_QUIT
4833 || ccl.status == CCL_STAT_INVALID_CMD)
4834 break;
df7492f9
KH
4835 }
4836
4837 switch (ccl.status)
4838 {
4839 case CCL_STAT_SUSPEND_BY_SRC:
065e3595 4840 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
df7492f9
KH
4841 break;
4842 case CCL_STAT_SUSPEND_BY_DST:
065e3595 4843 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
df7492f9
KH
4844 break;
4845 case CCL_STAT_QUIT:
4846 case CCL_STAT_INVALID_CMD:
065e3595 4847 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
df7492f9
KH
4848 break;
4849 default:
065e3595 4850 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 4851 break;
1397dc18 4852 }
df7492f9
KH
4853
4854 coding->produced_char += produced_chars;
4855 coding->produced = dst - coding->destination;
4856 return 0;
1397dc18
KH
4857}
4858
df7492f9 4859
1397dc18 4860\f
df7492f9 4861/*** 10, 11. no-conversion handlers ***/
4ed46869 4862
b73bfc1c 4863/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4ed46869 4864
b73bfc1c 4865static void
df7492f9 4866decode_coding_raw_text (coding)
4ed46869 4867 struct coding_system *coding;
4ed46869 4868{
119852e7
KH
4869 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4870
df7492f9 4871 coding->chars_at_source = 1;
119852e7
KH
4872 coding->consumed_char = coding->src_chars;
4873 coding->consumed = coding->src_bytes;
4874 if (eol_crlf && coding->source[coding->src_bytes - 1] == '\r')
4875 {
4876 coding->consumed_char--;
4877 coding->consumed--;
4878 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4879 }
4880 else
4881 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 4882}
4ed46869 4883
df7492f9
KH
4884static int
4885encode_coding_raw_text (coding)
4886 struct coding_system *coding;
4887{
4888 int multibytep = coding->dst_multibyte;
4889 int *charbuf = coding->charbuf;
4890 int *charbuf_end = coding->charbuf + coding->charbuf_used;
4891 unsigned char *dst = coding->destination + coding->produced;
4892 unsigned char *dst_end = coding->destination + coding->dst_bytes;
a0ed9b27 4893 int produced_chars = 0;
b73bfc1c
KH
4894 int c;
4895
df7492f9 4896 if (multibytep)
b73bfc1c 4897 {
df7492f9 4898 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4ed46869 4899
df7492f9
KH
4900 if (coding->src_multibyte)
4901 while (charbuf < charbuf_end)
4902 {
4903 ASSURE_DESTINATION (safe_room);
4904 c = *charbuf++;
4905 if (ASCII_CHAR_P (c))
4906 EMIT_ONE_ASCII_BYTE (c);
4907 else if (CHAR_BYTE8_P (c))
4908 {
4909 c = CHAR_TO_BYTE8 (c);
4910 EMIT_ONE_BYTE (c);
4911 }
4912 else
4913 {
4914 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
93dec019 4915
df7492f9
KH
4916 CHAR_STRING_ADVANCE (c, p1);
4917 while (p0 < p1)
9d123124
KH
4918 {
4919 EMIT_ONE_BYTE (*p0);
4920 p0++;
4921 }
df7492f9
KH
4922 }
4923 }
b73bfc1c 4924 else
df7492f9
KH
4925 while (charbuf < charbuf_end)
4926 {
4927 ASSURE_DESTINATION (safe_room);
4928 c = *charbuf++;
4929 EMIT_ONE_BYTE (c);
4930 }
4931 }
4932 else
4ed46869 4933 {
df7492f9 4934 if (coding->src_multibyte)
d46c5b12 4935 {
df7492f9
KH
4936 int safe_room = MAX_MULTIBYTE_LENGTH;
4937
4938 while (charbuf < charbuf_end)
d46c5b12 4939 {
df7492f9
KH
4940 ASSURE_DESTINATION (safe_room);
4941 c = *charbuf++;
4942 if (ASCII_CHAR_P (c))
4943 *dst++ = c;
4944 else if (CHAR_BYTE8_P (c))
4945 *dst++ = CHAR_TO_BYTE8 (c);
b73bfc1c 4946 else
df7492f9 4947 CHAR_STRING_ADVANCE (c, dst);
d46c5b12
KH
4948 }
4949 }
df7492f9
KH
4950 else
4951 {
4952 ASSURE_DESTINATION (charbuf_end - charbuf);
4953 while (charbuf < charbuf_end && dst < dst_end)
4954 *dst++ = *charbuf++;
8f924df7 4955 }
319a3947 4956 produced_chars = dst - (coding->destination + coding->produced);
4ed46869 4957 }
065e3595 4958 record_conversion_result (coding, CODING_RESULT_SUCCESS);
a0ed9b27 4959 coding->produced_char += produced_chars;
df7492f9
KH
4960 coding->produced = dst - coding->destination;
4961 return 0;
4ed46869
KH
4962}
4963
ff0dacd7
KH
4964/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4965 Check if a text is encoded in a charset-based coding system. If it
4966 is, return 1, else return 0. */
4967
0a28aafb 4968static int
ff0dacd7 4969detect_coding_charset (coding, detect_info)
df7492f9 4970 struct coding_system *coding;
ff0dacd7 4971 struct coding_detection_info *detect_info;
1397dc18 4972{
065e3595 4973 const unsigned char *src = coding->source, *src_base;
8f924df7 4974 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
4975 int multibytep = coding->src_multibyte;
4976 int consumed_chars = 0;
4977 Lisp_Object attrs, valids;
584948ac 4978 int found = 0;
716b3fa0 4979 int head_ascii = coding->head_ascii;
1397dc18 4980
ff0dacd7
KH
4981 detect_info->checked |= CATEGORY_MASK_CHARSET;
4982
df7492f9
KH
4983 coding = &coding_categories[coding_category_charset];
4984 attrs = CODING_ID_ATTRS (coding->id);
4985 valids = AREF (attrs, coding_attr_charset_valids);
4986
4987 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
716b3fa0 4988 src += head_ascii;
1397dc18 4989
b73bfc1c 4990 while (1)
1397dc18 4991 {
df7492f9 4992 int c;
716b3fa0
KH
4993 Lisp_Object val;
4994 struct charset *charset;
4995 int dim, idx;
1397dc18 4996
065e3595 4997 src_base = src;
df7492f9 4998 ONE_MORE_BYTE (c);
065e3595
KH
4999 if (c < 0)
5000 continue;
716b3fa0
KH
5001 val = AREF (valids, c);
5002 if (NILP (val))
df7492f9 5003 break;
584948ac 5004 if (c >= 0x80)
ff0dacd7 5005 found = CATEGORY_MASK_CHARSET;
716b3fa0
KH
5006 if (INTEGERP (val))
5007 {
5008 charset = CHARSET_FROM_ID (XFASTINT (val));
5009 dim = CHARSET_DIMENSION (charset);
5010 for (idx = 1; idx < dim; idx++)
5011 {
5012 if (src == src_end)
5013 goto too_short;
5014 ONE_MORE_BYTE (c);
3ed051d4 5015 if (c < charset->code_space[(dim - 1 - idx) * 2]
716b3fa0
KH
5016 || c > charset->code_space[(dim - 1 - idx) * 2 + 1])
5017 break;
5018 }
5019 if (idx < dim)
5020 break;
5021 }
5022 else
5023 {
5024 idx = 1;
5025 for (; CONSP (val); val = XCDR (val))
5026 {
5027 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5028 dim = CHARSET_DIMENSION (charset);
5029 while (idx < dim)
5030 {
5031 if (src == src_end)
5032 goto too_short;
5033 ONE_MORE_BYTE (c);
5034 if (c < charset->code_space[(dim - 1 - idx) * 4]
5035 || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5036 break;
5037 idx++;
5038 }
5039 if (idx == dim)
5040 {
5041 val = Qnil;
5042 break;
5043 }
5044 }
5045 if (CONSP (val))
5046 break;
5047 }
df7492f9 5048 }
716b3fa0 5049 too_short:
ff0dacd7 5050 detect_info->rejected |= CATEGORY_MASK_CHARSET;
df7492f9 5051 return 0;
4ed46869 5052
df7492f9 5053 no_more_source:
ff0dacd7
KH
5054 detect_info->found |= found;
5055 return 1;
df7492f9 5056}
b73bfc1c 5057
b73bfc1c 5058static void
df7492f9 5059decode_coding_charset (coding)
4ed46869 5060 struct coding_system *coding;
4ed46869 5061{
8f924df7
KH
5062 const unsigned char *src = coding->source + coding->consumed;
5063 const unsigned char *src_end = coding->source + coding->src_bytes;
5064 const unsigned char *src_base;
69a80ea3
KH
5065 int *charbuf = coding->charbuf + coding->charbuf_used;
5066 int *charbuf_end
5067 = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
df7492f9
KH
5068 int consumed_chars = 0, consumed_chars_base;
5069 int multibytep = coding->src_multibyte;
24a73b0a 5070 Lisp_Object attrs, charset_list, valids;
ff0dacd7
KH
5071 int char_offset = coding->produced_char;
5072 int last_offset = char_offset;
5073 int last_id = charset_ascii;
119852e7
KH
5074 int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5075 int byte_after_cr = -1;
df7492f9 5076
24a73b0a 5077 CODING_GET_INFO (coding, attrs, charset_list);
4eb6d3f1 5078 valids = AREF (attrs, coding_attr_charset_valids);
b73bfc1c 5079
df7492f9 5080 while (1)
4ed46869 5081 {
4eb6d3f1 5082 int c;
24a73b0a
KH
5083 Lisp_Object val;
5084 struct charset *charset;
5085 int dim;
5086 int len = 1;
5087 unsigned code;
df7492f9
KH
5088
5089 src_base = src;
5090 consumed_chars_base = consumed_chars;
b73bfc1c 5091
df7492f9
KH
5092 if (charbuf >= charbuf_end)
5093 break;
5094
119852e7
KH
5095 if (byte_after_cr >= 0)
5096 {
5097 c = byte_after_cr;
5098 byte_after_cr = -1;
5099 }
5100 else
5101 {
5102 ONE_MORE_BYTE (c);
5103 if (eol_crlf && c == '\r')
5104 ONE_MORE_BYTE (byte_after_cr);
5105 }
065e3595
KH
5106 if (c < 0)
5107 goto invalid_code;
24a73b0a
KH
5108 code = c;
5109
5110 val = AREF (valids, c);
5111 if (NILP (val))
5112 goto invalid_code;
5113 if (INTEGERP (val))
d46c5b12 5114 {
24a73b0a
KH
5115 charset = CHARSET_FROM_ID (XFASTINT (val));
5116 dim = CHARSET_DIMENSION (charset);
5117 while (len < dim)
b73bfc1c 5118 {
24a73b0a
KH
5119 ONE_MORE_BYTE (c);
5120 code = (code << 8) | c;
5121 len++;
b73bfc1c 5122 }
24a73b0a
KH
5123 CODING_DECODE_CHAR (coding, src, src_base, src_end,
5124 charset, code, c);
d46c5b12 5125 }
df7492f9 5126 else
d46c5b12 5127 {
24a73b0a
KH
5128 /* VAL is a list of charset IDs. It is assured that the
5129 list is sorted by charset dimensions (smaller one
5130 comes first). */
5131 while (CONSP (val))
4eb6d3f1 5132 {
24a73b0a 5133 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
c7c66a95 5134 dim = CHARSET_DIMENSION (charset);
f9d71dcd 5135 while (len < dim)
4eb6d3f1 5136 {
acb2a965
KH
5137 ONE_MORE_BYTE (c);
5138 code = (code << 8) | c;
f9d71dcd 5139 len++;
4eb6d3f1 5140 }
24a73b0a
KH
5141 CODING_DECODE_CHAR (coding, src, src_base,
5142 src_end, charset, code, c);
5143 if (c >= 0)
5144 break;
5145 val = XCDR (val);
ff0dacd7 5146 }
d46c5b12 5147 }
24a73b0a
KH
5148 if (c < 0)
5149 goto invalid_code;
5150 if (charset->id != charset_ascii
5151 && last_id != charset->id)
5152 {
5153 if (last_id != charset_ascii)
69a80ea3 5154 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
24a73b0a
KH
5155 last_id = charset->id;
5156 last_offset = char_offset;
5157 }
5158
df7492f9 5159 *charbuf++ = c;
ff0dacd7 5160 char_offset++;
df7492f9
KH
5161 continue;
5162
5163 invalid_code:
5164 src = src_base;
5165 consumed_chars = consumed_chars_base;
5166 ONE_MORE_BYTE (c);
065e3595 5167 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 5168 char_offset++;
df7492f9 5169 coding->errors++;
4ed46869
KH
5170 }
5171
df7492f9 5172 no_more_source:
ff0dacd7 5173 if (last_id != charset_ascii)
69a80ea3 5174 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
df7492f9
KH
5175 coding->consumed_char += consumed_chars_base;
5176 coding->consumed = src_base - coding->source;
5177 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
5178}
5179
df7492f9
KH
5180static int
5181encode_coding_charset (coding)
4ed46869 5182 struct coding_system *coding;
4ed46869 5183{
df7492f9
KH
5184 int multibytep = coding->dst_multibyte;
5185 int *charbuf = coding->charbuf;
5186 int *charbuf_end = charbuf + coding->charbuf_used;
5187 unsigned char *dst = coding->destination + coding->produced;
5188 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5189 int safe_room = MAX_MULTIBYTE_LENGTH;
5190 int produced_chars = 0;
24a73b0a 5191 Lisp_Object attrs, charset_list;
df7492f9 5192 int ascii_compatible;
b73bfc1c 5193 int c;
b73bfc1c 5194
24a73b0a 5195 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9 5196 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
fb88bf2d 5197
df7492f9 5198 while (charbuf < charbuf_end)
4ed46869 5199 {
4eb6d3f1 5200 struct charset *charset;
df7492f9 5201 unsigned code;
8f924df7 5202
df7492f9
KH
5203 ASSURE_DESTINATION (safe_room);
5204 c = *charbuf++;
5205 if (ascii_compatible && ASCII_CHAR_P (c))
5206 EMIT_ONE_ASCII_BYTE (c);
16eafb5d 5207 else if (CHAR_BYTE8_P (c))
4ed46869 5208 {
16eafb5d
KH
5209 c = CHAR_TO_BYTE8 (c);
5210 EMIT_ONE_BYTE (c);
d46c5b12 5211 }
d46c5b12 5212 else
b73bfc1c 5213 {
4eb6d3f1
KH
5214 charset = char_charset (c, charset_list, &code);
5215 if (charset)
5216 {
5217 if (CHARSET_DIMENSION (charset) == 1)
5218 EMIT_ONE_BYTE (code);
5219 else if (CHARSET_DIMENSION (charset) == 2)
5220 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5221 else if (CHARSET_DIMENSION (charset) == 3)
5222 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5223 else
5224 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5225 (code >> 8) & 0xFF, code & 0xFF);
5226 }
5227 else
41cbe562
KH
5228 {
5229 if (coding->mode & CODING_MODE_SAFE_ENCODING)
5230 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5231 else
5232 c = coding->default_char;
5233 EMIT_ONE_BYTE (c);
5234 }
4ed46869 5235 }
4ed46869
KH
5236 }
5237
065e3595 5238 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
5239 coding->produced_char += produced_chars;
5240 coding->produced = dst - coding->destination;
5241 return 0;
4ed46869
KH
5242}
5243
5244\f
1397dc18 5245/*** 7. C library functions ***/
4ed46869 5246
df7492f9
KH
5247/* Setup coding context CODING from information about CODING_SYSTEM.
5248 If CODING_SYSTEM is nil, `undecided' is assumed. If
5249 CODING_SYSTEM is invalid, signal an error. */
4ed46869 5250
ec6d2bb8 5251void
e0e989f6
KH
5252setup_coding_system (coding_system, coding)
5253 Lisp_Object coding_system;
4ed46869
KH
5254 struct coding_system *coding;
5255{
df7492f9
KH
5256 Lisp_Object attrs;
5257 Lisp_Object eol_type;
5258 Lisp_Object coding_type;
4608c386 5259 Lisp_Object val;
4ed46869 5260
df7492f9 5261 if (NILP (coding_system))
ae6f73fa 5262 coding_system = Qundecided;
c07c8e12 5263
df7492f9 5264 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
1f5dbf34 5265
df7492f9
KH
5266 attrs = CODING_ID_ATTRS (coding->id);
5267 eol_type = CODING_ID_EOL_TYPE (coding->id);
1f5dbf34 5268
df7492f9
KH
5269 coding->mode = 0;
5270 coding->head_ascii = -1;
4a015c45
KH
5271 if (VECTORP (eol_type))
5272 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5273 | CODING_REQUIRE_DETECTION_MASK);
5274 else if (! EQ (eol_type, Qunix))
5275 coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5276 | CODING_REQUIRE_ENCODING_MASK);
5277 else
5278 coding->common_flags = 0;
5e5c78be
KH
5279 if (! NILP (CODING_ATTR_POST_READ (attrs)))
5280 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5281 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5282 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
8f924df7
KH
5283 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5284 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
1f5dbf34 5285
df7492f9 5286 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7
KH
5287 coding->max_charset_id = SCHARS (val) - 1;
5288 coding->safe_charsets = (char *) SDATA (val);
df7492f9 5289 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4608c386 5290
df7492f9
KH
5291 coding_type = CODING_ATTR_TYPE (attrs);
5292 if (EQ (coding_type, Qundecided))
d46c5b12 5293 {
df7492f9
KH
5294 coding->detector = NULL;
5295 coding->decoder = decode_coding_raw_text;
5296 coding->encoder = encode_coding_raw_text;
5297 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
d46c5b12 5298 }
df7492f9 5299 else if (EQ (coding_type, Qiso_2022))
d46c5b12 5300 {
df7492f9
KH
5301 int i;
5302 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5303
5304 /* Invoke graphic register 0 to plane 0. */
5305 CODING_ISO_INVOCATION (coding, 0) = 0;
5306 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
5307 CODING_ISO_INVOCATION (coding, 1)
5308 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5309 /* Setup the initial status of designation. */
5310 for (i = 0; i < 4; i++)
5311 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5312 /* Not single shifting initially. */
5313 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5314 /* Beginning of buffer should also be regarded as bol. */
5315 CODING_ISO_BOL (coding) = 1;
5316 coding->detector = detect_coding_iso_2022;
5317 coding->decoder = decode_coding_iso_2022;
5318 coding->encoder = encode_coding_iso_2022;
5319 if (flags & CODING_ISO_FLAG_SAFE)
5320 coding->mode |= CODING_MODE_SAFE_ENCODING;
d46c5b12 5321 coding->common_flags
df7492f9
KH
5322 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5323 | CODING_REQUIRE_FLUSHING_MASK);
5324 if (flags & CODING_ISO_FLAG_COMPOSITION)
5325 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
ff0dacd7
KH
5326 if (flags & CODING_ISO_FLAG_DESIGNATION)
5327 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
df7492f9
KH
5328 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5329 {
5330 setup_iso_safe_charsets (attrs);
5331 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7
KH
5332 coding->max_charset_id = SCHARS (val) - 1;
5333 coding->safe_charsets = (char *) SDATA (val);
df7492f9
KH
5334 }
5335 CODING_ISO_FLAGS (coding) = flags;
d46c5b12 5336 }
df7492f9 5337 else if (EQ (coding_type, Qcharset))
d46c5b12 5338 {
df7492f9
KH
5339 coding->detector = detect_coding_charset;
5340 coding->decoder = decode_coding_charset;
5341 coding->encoder = encode_coding_charset;
d46c5b12 5342 coding->common_flags
df7492f9 5343 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
d46c5b12 5344 }
df7492f9 5345 else if (EQ (coding_type, Qutf_8))
d46c5b12 5346 {
a470d443
KH
5347 val = AREF (attrs, coding_attr_utf_bom);
5348 CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5349 : EQ (val, Qt) ? utf_with_bom
5350 : utf_without_bom);
df7492f9
KH
5351 coding->detector = detect_coding_utf_8;
5352 coding->decoder = decode_coding_utf_8;
5353 coding->encoder = encode_coding_utf_8;
5354 coding->common_flags
5355 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
a470d443
KH
5356 if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5357 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
df7492f9
KH
5358 }
5359 else if (EQ (coding_type, Qutf_16))
5360 {
a470d443
KH
5361 val = AREF (attrs, coding_attr_utf_bom);
5362 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5363 : EQ (val, Qt) ? utf_with_bom
5364 : utf_without_bom);
df7492f9 5365 val = AREF (attrs, coding_attr_utf_16_endian);
b49a1807 5366 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
df7492f9 5367 : utf_16_little_endian);
e19c3639 5368 CODING_UTF_16_SURROGATE (coding) = 0;
df7492f9
KH
5369 coding->detector = detect_coding_utf_16;
5370 coding->decoder = decode_coding_utf_16;
5371 coding->encoder = encode_coding_utf_16;
5372 coding->common_flags
5373 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
a470d443 5374 if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
b49a1807 5375 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
d46c5b12 5376 }
df7492f9 5377 else if (EQ (coding_type, Qccl))
4ed46869 5378 {
df7492f9
KH
5379 coding->detector = detect_coding_ccl;
5380 coding->decoder = decode_coding_ccl;
5381 coding->encoder = encode_coding_ccl;
c952af22 5382 coding->common_flags
df7492f9
KH
5383 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5384 | CODING_REQUIRE_FLUSHING_MASK);
5385 }
5386 else if (EQ (coding_type, Qemacs_mule))
5387 {
5388 coding->detector = detect_coding_emacs_mule;
5389 coding->decoder = decode_coding_emacs_mule;
5390 coding->encoder = encode_coding_emacs_mule;
c952af22 5391 coding->common_flags
df7492f9
KH
5392 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5393 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5394 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5395 {
5396 Lisp_Object tail, safe_charsets;
5397 int max_charset_id = 0;
5398
5399 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5400 tail = XCDR (tail))
5401 if (max_charset_id < XFASTINT (XCAR (tail)))
5402 max_charset_id = XFASTINT (XCAR (tail));
5403 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
5404 make_number (255));
5405 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5406 tail = XCDR (tail))
8f924df7 5407 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
df7492f9 5408 coding->max_charset_id = max_charset_id;
8f924df7 5409 coding->safe_charsets = (char *) SDATA (safe_charsets);
df7492f9
KH
5410 }
5411 }
5412 else if (EQ (coding_type, Qshift_jis))
5413 {
5414 coding->detector = detect_coding_sjis;
5415 coding->decoder = decode_coding_sjis;
5416 coding->encoder = encode_coding_sjis;
c952af22 5417 coding->common_flags
df7492f9
KH
5418 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5419 }
5420 else if (EQ (coding_type, Qbig5))
5421 {
5422 coding->detector = detect_coding_big5;
5423 coding->decoder = decode_coding_big5;
5424 coding->encoder = encode_coding_big5;
c952af22 5425 coding->common_flags
df7492f9
KH
5426 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5427 }
5428 else /* EQ (coding_type, Qraw_text) */
ec6d2bb8 5429 {
df7492f9
KH
5430 coding->detector = NULL;
5431 coding->decoder = decode_coding_raw_text;
5432 coding->encoder = encode_coding_raw_text;
ea29edf2
KH
5433 if (! EQ (eol_type, Qunix))
5434 {
5435 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5436 if (! VECTORP (eol_type))
5437 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5438 }
5439
4ed46869 5440 }
4ed46869 5441
df7492f9 5442 return;
4ed46869
KH
5443}
5444
0ff61e78
KH
5445/* Return a list of charsets supported by CODING. */
5446
5447Lisp_Object
5448coding_charset_list (coding)
5449 struct coding_system *coding;
5450{
35befdaa 5451 Lisp_Object attrs, charset_list;
0ff61e78
KH
5452
5453 CODING_GET_INFO (coding, attrs, charset_list);
5454 if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5455 {
5456 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5457
5458 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5459 charset_list = Viso_2022_charset_list;
5460 }
5461 else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5462 {
5463 charset_list = Vemacs_mule_charset_list;
5464 }
5465 return charset_list;
5466}
5467
5468
df7492f9
KH
5469/* Return raw-text or one of its subsidiaries that has the same
5470 eol_type as CODING-SYSTEM. */
ec6d2bb8 5471
df7492f9
KH
5472Lisp_Object
5473raw_text_coding_system (coding_system)
5474 Lisp_Object coding_system;
ec6d2bb8 5475{
0be8721c 5476 Lisp_Object spec, attrs;
df7492f9 5477 Lisp_Object eol_type, raw_text_eol_type;
ec6d2bb8 5478
d3e4cb56
KH
5479 if (NILP (coding_system))
5480 return Qraw_text;
df7492f9
KH
5481 spec = CODING_SYSTEM_SPEC (coding_system);
5482 attrs = AREF (spec, 0);
ec6d2bb8 5483
df7492f9
KH
5484 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5485 return coding_system;
ec6d2bb8 5486
df7492f9
KH
5487 eol_type = AREF (spec, 2);
5488 if (VECTORP (eol_type))
5489 return Qraw_text;
5490 spec = CODING_SYSTEM_SPEC (Qraw_text);
5491 raw_text_eol_type = AREF (spec, 2);
5492 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5493 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5494 : AREF (raw_text_eol_type, 2));
ec6d2bb8
KH
5495}
5496
54f78171 5497
df7492f9
KH
5498/* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5499 does, return one of the subsidiary that has the same eol-spec as
fcbcfb64
KH
5500 PARENT. Otherwise, return CODING_SYSTEM. If PARENT is nil,
5501 inherit end-of-line format from the system's setting
5502 (system_eol_type). */
df7492f9
KH
5503
5504Lisp_Object
5505coding_inherit_eol_type (coding_system, parent)
b74e4686 5506 Lisp_Object coding_system, parent;
54f78171 5507{
3e139625 5508 Lisp_Object spec, eol_type;
54f78171 5509
d3e4cb56
KH
5510 if (NILP (coding_system))
5511 coding_system = Qraw_text;
df7492f9 5512 spec = CODING_SYSTEM_SPEC (coding_system);
df7492f9 5513 eol_type = AREF (spec, 2);
fcbcfb64 5514 if (VECTORP (eol_type))
df7492f9 5515 {
df7492f9
KH
5516 Lisp_Object parent_eol_type;
5517
fcbcfb64
KH
5518 if (! NILP (parent))
5519 {
5520 Lisp_Object parent_spec;
5521
4a015c45 5522 parent_spec = CODING_SYSTEM_SPEC (parent);
fcbcfb64
KH
5523 parent_eol_type = AREF (parent_spec, 2);
5524 }
5525 else
5526 parent_eol_type = system_eol_type;
df7492f9
KH
5527 if (EQ (parent_eol_type, Qunix))
5528 coding_system = AREF (eol_type, 0);
5529 else if (EQ (parent_eol_type, Qdos))
5530 coding_system = AREF (eol_type, 1);
5531 else if (EQ (parent_eol_type, Qmac))
5532 coding_system = AREF (eol_type, 2);
54f78171 5533 }
df7492f9 5534 return coding_system;
54f78171
KH
5535}
5536
4ed46869
KH
5537/* Emacs has a mechanism to automatically detect a coding system if it
5538 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5539 it's impossible to distinguish some coding systems accurately
5540 because they use the same range of codes. So, at first, coding
5541 systems are categorized into the following categories:
5542
0ef69138 5543 o coding-category-emacs-mule
4ed46869
KH
5544
5545 The category for a coding system which has the same code range
5546 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 5547 symbol) `emacs-mule' by default.
4ed46869
KH
5548
5549 o coding-category-sjis
5550
5551 The category for a coding system which has the same code range
5552 as SJIS. Assigned the coding-system (Lisp
7717c392 5553 symbol) `japanese-shift-jis' by default.
4ed46869
KH
5554
5555 o coding-category-iso-7
5556
5557 The category for a coding system which has the same code range
7717c392 5558 as ISO2022 of 7-bit environment. This doesn't use any locking
d46c5b12
KH
5559 shift and single shift functions. This can encode/decode all
5560 charsets. Assigned the coding-system (Lisp symbol)
5561 `iso-2022-7bit' by default.
5562
5563 o coding-category-iso-7-tight
5564
5565 Same as coding-category-iso-7 except that this can
5566 encode/decode only the specified charsets.
4ed46869
KH
5567
5568 o coding-category-iso-8-1
5569
5570 The category for a coding system which has the same code range
5571 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
5572 for DIMENSION1 charset. This doesn't use any locking shift
5573 and single shift functions. Assigned the coding-system (Lisp
5574 symbol) `iso-latin-1' by default.
4ed46869
KH
5575
5576 o coding-category-iso-8-2
5577
5578 The category for a coding system which has the same code range
5579 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
5580 for DIMENSION2 charset. This doesn't use any locking shift
5581 and single shift functions. Assigned the coding-system (Lisp
5582 symbol) `japanese-iso-8bit' by default.
4ed46869 5583
7717c392 5584 o coding-category-iso-7-else
4ed46869
KH
5585
5586 The category for a coding system which has the same code range
df7492f9 5587 as ISO2022 of 7-bit environment but uses locking shift or
7717c392
KH
5588 single shift functions. Assigned the coding-system (Lisp
5589 symbol) `iso-2022-7bit-lock' by default.
5590
5591 o coding-category-iso-8-else
5592
5593 The category for a coding system which has the same code range
df7492f9 5594 as ISO2022 of 8-bit environment but uses locking shift or
7717c392
KH
5595 single shift functions. Assigned the coding-system (Lisp
5596 symbol) `iso-2022-8bit-ss2' by default.
4ed46869
KH
5597
5598 o coding-category-big5
5599
5600 The category for a coding system which has the same code range
5601 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 5602 `cn-big5' by default.
4ed46869 5603
fa42c37f
KH
5604 o coding-category-utf-8
5605
5606 The category for a coding system which has the same code range
6e76ae91 5607 as UTF-8 (cf. RFC3629). Assigned the coding-system (Lisp
fa42c37f
KH
5608 symbol) `utf-8' by default.
5609
5610 o coding-category-utf-16-be
5611
5612 The category for a coding system in which a text has an
5613 Unicode signature (cf. Unicode Standard) in the order of BIG
5614 endian at the head. Assigned the coding-system (Lisp symbol)
5615 `utf-16-be' by default.
5616
5617 o coding-category-utf-16-le
5618
5619 The category for a coding system in which a text has an
5620 Unicode signature (cf. Unicode Standard) in the order of
5621 LITTLE endian at the head. Assigned the coding-system (Lisp
5622 symbol) `utf-16-le' by default.
5623
1397dc18
KH
5624 o coding-category-ccl
5625
5626 The category for a coding system of which encoder/decoder is
5627 written in CCL programs. The default value is nil, i.e., no
5628 coding system is assigned.
5629
4ed46869
KH
5630 o coding-category-binary
5631
5632 The category for a coding system not categorized in any of the
5633 above. Assigned the coding-system (Lisp symbol)
e0e989f6 5634 `no-conversion' by default.
4ed46869
KH
5635
5636 Each of them is a Lisp symbol and the value is an actual
df7492f9 5637 `coding-system's (this is also a Lisp symbol) assigned by a user.
4ed46869
KH
5638 What Emacs does actually is to detect a category of coding system.
5639 Then, it uses a `coding-system' assigned to it. If Emacs can't
df7492f9 5640 decide only one possible category, it selects a category of the
4ed46869
KH
5641 highest priority. Priorities of categories are also specified by a
5642 user in a Lisp variable `coding-category-list'.
5643
5644*/
5645
df7492f9
KH
5646#define EOL_SEEN_NONE 0
5647#define EOL_SEEN_LF 1
5648#define EOL_SEEN_CR 2
5649#define EOL_SEEN_CRLF 4
66cfb530 5650
ff0dacd7
KH
5651/* Detect how end-of-line of a text of length SRC_BYTES pointed by
5652 SOURCE is encoded. If CATEGORY is one of
5653 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5654 two-byte, else they are encoded by one-byte.
5655
5656 Return one of EOL_SEEN_XXX. */
4ed46869 5657
bc4bc72a 5658#define MAX_EOL_CHECK_COUNT 3
d46c5b12
KH
5659
5660static int
89528eb3 5661detect_eol (source, src_bytes, category)
f6cbaf43 5662 const unsigned char *source;
df7492f9 5663 EMACS_INT src_bytes;
89528eb3 5664 enum coding_category category;
4ed46869 5665{
f6cbaf43 5666 const unsigned char *src = source, *src_end = src + src_bytes;
4ed46869 5667 unsigned char c;
df7492f9
KH
5668 int total = 0;
5669 int eol_seen = EOL_SEEN_NONE;
4ed46869 5670
89528eb3 5671 if ((1 << category) & CATEGORY_MASK_UTF_16)
4ed46869 5672 {
df7492f9 5673 int msb, lsb;
fa42c37f 5674
89528eb3
KH
5675 msb = category == (coding_category_utf_16_le
5676 | coding_category_utf_16_le_nosig);
df7492f9 5677 lsb = 1 - msb;
fa42c37f 5678
df7492f9 5679 while (src + 1 < src_end)
fa42c37f 5680 {
df7492f9
KH
5681 c = src[lsb];
5682 if (src[msb] == 0 && (c == '\n' || c == '\r'))
fa42c37f 5683 {
df7492f9
KH
5684 int this_eol;
5685
5686 if (c == '\n')
5687 this_eol = EOL_SEEN_LF;
5688 else if (src + 3 >= src_end
5689 || src[msb + 2] != 0
5690 || src[lsb + 2] != '\n')
5691 this_eol = EOL_SEEN_CR;
fa42c37f 5692 else
8f924df7 5693 this_eol = EOL_SEEN_CRLF;
df7492f9
KH
5694
5695 if (eol_seen == EOL_SEEN_NONE)
5696 /* This is the first end-of-line. */
5697 eol_seen = this_eol;
5698 else if (eol_seen != this_eol)
fa42c37f 5699 {
df7492f9
KH
5700 /* The found type is different from what found before. */
5701 eol_seen = EOL_SEEN_LF;
5702 break;
fa42c37f 5703 }
df7492f9
KH
5704 if (++total == MAX_EOL_CHECK_COUNT)
5705 break;
fa42c37f 5706 }
df7492f9 5707 src += 2;
fa42c37f 5708 }
bcf26d6a 5709 }
d46c5b12 5710 else
c4825358 5711 {
df7492f9 5712 while (src < src_end)
27901516 5713 {
df7492f9
KH
5714 c = *src++;
5715 if (c == '\n' || c == '\r')
5716 {
5717 int this_eol;
d46c5b12 5718
df7492f9
KH
5719 if (c == '\n')
5720 this_eol = EOL_SEEN_LF;
5721 else if (src >= src_end || *src != '\n')
5722 this_eol = EOL_SEEN_CR;
5723 else
5724 this_eol = EOL_SEEN_CRLF, src++;
d46c5b12 5725
df7492f9
KH
5726 if (eol_seen == EOL_SEEN_NONE)
5727 /* This is the first end-of-line. */
5728 eol_seen = this_eol;
5729 else if (eol_seen != this_eol)
5730 {
5731 /* The found type is different from what found before. */
5732 eol_seen = EOL_SEEN_LF;
5733 break;
5734 }
5735 if (++total == MAX_EOL_CHECK_COUNT)
5736 break;
5737 }
5738 }
73be902c 5739 }
df7492f9 5740 return eol_seen;
73be902c
KH
5741}
5742
df7492f9 5743
24a73b0a 5744static Lisp_Object
df7492f9
KH
5745adjust_coding_eol_type (coding, eol_seen)
5746 struct coding_system *coding;
5747 int eol_seen;
73be902c 5748{
0be8721c 5749 Lisp_Object eol_type;
8f924df7 5750
df7492f9
KH
5751 eol_type = CODING_ID_EOL_TYPE (coding->id);
5752 if (eol_seen & EOL_SEEN_LF)
24a73b0a
KH
5753 {
5754 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5755 eol_type = Qunix;
5756 }
6f197c07 5757 else if (eol_seen & EOL_SEEN_CRLF)
24a73b0a
KH
5758 {
5759 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5760 eol_type = Qdos;
5761 }
6f197c07 5762 else if (eol_seen & EOL_SEEN_CR)
24a73b0a
KH
5763 {
5764 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5765 eol_type = Qmac;
5766 }
5767 return eol_type;
d46c5b12 5768}
4ed46869 5769
df7492f9
KH
5770/* Detect how a text specified in CODING is encoded. If a coding
5771 system is detected, update fields of CODING by the detected coding
5772 system. */
0a28aafb 5773
df7492f9
KH
5774void
5775detect_coding (coding)
d46c5b12 5776 struct coding_system *coding;
d46c5b12 5777{
8f924df7 5778 const unsigned char *src, *src_end;
d46c5b12 5779
df7492f9
KH
5780 coding->consumed = coding->consumed_char = 0;
5781 coding->produced = coding->produced_char = 0;
5782 coding_set_source (coding);
1c3478b0 5783
df7492f9 5784 src_end = coding->source + coding->src_bytes;
c0e16b14 5785 coding->head_ascii = 0;
1c3478b0 5786
df7492f9
KH
5787 /* If we have not yet decided the text encoding type, detect it
5788 now. */
5789 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
b73bfc1c 5790 {
df7492f9 5791 int c, i;
6cb21a4f 5792 struct coding_detection_info detect_info;
2f3cbb32 5793 int null_byte_found = 0, eight_bit_found = 0;
df7492f9 5794
6cb21a4f 5795 detect_info.checked = detect_info.found = detect_info.rejected = 0;
2f3cbb32 5796 for (src = coding->source; src < src_end; src++)
d46c5b12 5797 {
df7492f9 5798 c = *src;
6cb21a4f 5799 if (c & 0x80)
6cb21a4f 5800 {
2f3cbb32 5801 eight_bit_found = 1;
2f3cbb32
KH
5802 if (null_byte_found)
5803 break;
5804 }
5805 else if (c < 0x20)
5806 {
5807 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
5808 && ! inhibit_iso_escape_detection
5809 && ! detect_info.checked)
6cb21a4f 5810 {
2f3cbb32
KH
5811 if (detect_coding_iso_2022 (coding, &detect_info))
5812 {
5813 /* We have scanned the whole data. */
5814 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
c0e16b14
KH
5815 {
5816 /* We didn't find an 8-bit code. We may
5817 have found a null-byte, but it's very
5818 rare that a binary file confirm to
5819 ISO-2022. */
5820 src = src_end;
5821 coding->head_ascii = src - coding->source;
5822 }
5823 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
2f3cbb32
KH
5824 break;
5825 }
5826 }
5827 else if (! c)
5828 {
5829 null_byte_found = 1;
5830 if (eight_bit_found)
5831 break;
6cb21a4f 5832 }
c006c0c8
KH
5833 if (! eight_bit_found)
5834 coding->head_ascii++;
6cb21a4f 5835 }
c006c0c8 5836 else if (! eight_bit_found)
c0e16b14 5837 coding->head_ascii++;
d46c5b12 5838 }
df7492f9 5839
2f3cbb32
KH
5840 if (null_byte_found || eight_bit_found
5841 || coding->head_ascii < coding->src_bytes
6cb21a4f 5842 || detect_info.found)
d46c5b12 5843 {
ff0dacd7
KH
5844 enum coding_category category;
5845 struct coding_system *this;
df7492f9 5846
6cb21a4f
KH
5847 if (coding->head_ascii == coding->src_bytes)
5848 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
5849 for (i = 0; i < coding_category_raw_text; i++)
5850 {
5851 category = coding_priorities[i];
5852 this = coding_categories + category;
5853 if (detect_info.found & (1 << category))
24a73b0a 5854 break;
6cb21a4f
KH
5855 }
5856 else
2f3cbb32
KH
5857 {
5858 if (null_byte_found)
ff0dacd7 5859 {
2f3cbb32
KH
5860 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
5861 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
ff0dacd7 5862 }
2f3cbb32
KH
5863 for (i = 0; i < coding_category_raw_text; i++)
5864 {
5865 category = coding_priorities[i];
5866 this = coding_categories + category;
5867 if (this->id < 0)
5868 {
5869 /* No coding system of this category is defined. */
5870 detect_info.rejected |= (1 << category);
5871 }
5872 else if (category >= coding_category_raw_text)
5873 continue;
5874 else if (detect_info.checked & (1 << category))
5875 {
5876 if (detect_info.found & (1 << category))
5877 break;
5878 }
5879 else if ((*(this->detector)) (coding, &detect_info)
5880 && detect_info.found & (1 << category))
5881 {
5882 if (category == coding_category_utf_16_auto)
5883 {
5884 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5885 category = coding_category_utf_16_le;
5886 else
5887 category = coding_category_utf_16_be;
5888 }
5889 break;
5890 }
5891 }
2f3cbb32 5892 }
c0e16b14
KH
5893
5894 if (i < coding_category_raw_text)
5895 setup_coding_system (CODING_ID_NAME (this->id), coding);
5896 else if (null_byte_found)
5897 setup_coding_system (Qno_conversion, coding);
5898 else if ((detect_info.rejected & CATEGORY_MASK_ANY)
5899 == CATEGORY_MASK_ANY)
5900 setup_coding_system (Qraw_text, coding);
5901 else if (detect_info.rejected)
5902 for (i = 0; i < coding_category_raw_text; i++)
5903 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5904 {
5905 this = coding_categories + coding_priorities[i];
5906 setup_coding_system (CODING_ID_NAME (this->id), coding);
5907 break;
5908 }
d46c5b12 5909 }
b73bfc1c 5910 }
a470d443
KH
5911 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5912 == coding_category_utf_8_auto)
5913 {
5914 Lisp_Object coding_systems;
5915 struct coding_detection_info detect_info;
5916
5917 coding_systems
5918 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
5919 detect_info.found = detect_info.rejected = 0;
5920 coding->head_ascii = 0;
5921 if (CONSP (coding_systems)
5922 && detect_coding_utf_8 (coding, &detect_info))
5923 {
5924 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
5925 setup_coding_system (XCAR (coding_systems), coding);
5926 else
5927 setup_coding_system (XCDR (coding_systems), coding);
5928 }
5929 }
24a73b0a
KH
5930 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5931 == coding_category_utf_16_auto)
b49a1807
KH
5932 {
5933 Lisp_Object coding_systems;
5934 struct coding_detection_info detect_info;
5935
5936 coding_systems
a470d443 5937 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
b49a1807 5938 detect_info.found = detect_info.rejected = 0;
a470d443 5939 coding->head_ascii = 0;
b49a1807 5940 if (CONSP (coding_systems)
24a73b0a 5941 && detect_coding_utf_16 (coding, &detect_info))
b49a1807
KH
5942 {
5943 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5944 setup_coding_system (XCAR (coding_systems), coding);
24a73b0a 5945 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
b49a1807
KH
5946 setup_coding_system (XCDR (coding_systems), coding);
5947 }
5948 }
4ed46869 5949}
4ed46869 5950
d46c5b12 5951
aaaf0b1e 5952static void
df7492f9 5953decode_eol (coding)
aaaf0b1e 5954 struct coding_system *coding;
aaaf0b1e 5955{
24a73b0a
KH
5956 Lisp_Object eol_type;
5957 unsigned char *p, *pbeg, *pend;
3ed051d4 5958
24a73b0a
KH
5959 eol_type = CODING_ID_EOL_TYPE (coding->id);
5960 if (EQ (eol_type, Qunix))
5961 return;
5962
5963 if (NILP (coding->dst_object))
5964 pbeg = coding->destination;
5965 else
5966 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
5967 pend = pbeg + coding->produced;
5968
5969 if (VECTORP (eol_type))
aaaf0b1e 5970 {
df7492f9 5971 int eol_seen = EOL_SEEN_NONE;
4ed46869 5972
24a73b0a 5973 for (p = pbeg; p < pend; p++)
aaaf0b1e 5974 {
df7492f9
KH
5975 if (*p == '\n')
5976 eol_seen |= EOL_SEEN_LF;
5977 else if (*p == '\r')
aaaf0b1e 5978 {
df7492f9 5979 if (p + 1 < pend && *(p + 1) == '\n')
aaaf0b1e 5980 {
df7492f9
KH
5981 eol_seen |= EOL_SEEN_CRLF;
5982 p++;
aaaf0b1e 5983 }
aaaf0b1e 5984 else
df7492f9 5985 eol_seen |= EOL_SEEN_CR;
aaaf0b1e 5986 }
aaaf0b1e 5987 }
24a73b0a
KH
5988 if (eol_seen != EOL_SEEN_NONE
5989 && eol_seen != EOL_SEEN_LF
5990 && eol_seen != EOL_SEEN_CRLF
5991 && eol_seen != EOL_SEEN_CR)
5992 eol_seen = EOL_SEEN_LF;
df7492f9 5993 if (eol_seen != EOL_SEEN_NONE)
24a73b0a 5994 eol_type = adjust_coding_eol_type (coding, eol_seen);
aaaf0b1e 5995 }
d46c5b12 5996
24a73b0a 5997 if (EQ (eol_type, Qmac))
27901516 5998 {
24a73b0a 5999 for (p = pbeg; p < pend; p++)
df7492f9
KH
6000 if (*p == '\r')
6001 *p = '\n';
4ed46869 6002 }
24a73b0a 6003 else if (EQ (eol_type, Qdos))
df7492f9 6004 {
24a73b0a 6005 int n = 0;
b73bfc1c 6006
24a73b0a
KH
6007 if (NILP (coding->dst_object))
6008 {
4347441b
KH
6009 /* Start deleting '\r' from the tail to minimize the memory
6010 movement. */
24a73b0a
KH
6011 for (p = pend - 2; p >= pbeg; p--)
6012 if (*p == '\r')
6013 {
6014 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
6015 n++;
6016 }
6017 }
6018 else
6019 {
4347441b
KH
6020 int pos_byte = coding->dst_pos_byte;
6021 int pos = coding->dst_pos;
6022 int pos_end = pos + coding->produced_char - 1;
6023
6024 while (pos < pos_end)
6025 {
6026 p = BYTE_POS_ADDR (pos_byte);
6027 if (*p == '\r' && p[1] == '\n')
6028 {
6029 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6030 n++;
6031 pos_end--;
6032 }
6033 pos++;
69b8522d
KH
6034 if (coding->dst_multibyte)
6035 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6036 else
6037 pos_byte++;
4347441b 6038 }
24a73b0a
KH
6039 }
6040 coding->produced -= n;
6041 coding->produced_char -= n;
aaaf0b1e 6042 }
4ed46869
KH
6043}
6044
7d64c6ad 6045
a6f87d34
KH
6046/* Return a translation table (or list of them) from coding system
6047 attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6048 decoding (ENCODEP is zero). */
7d64c6ad 6049
e6a54062 6050static Lisp_Object
09ee6fdd
KH
6051get_translation_table (attrs, encodep, max_lookup)
6052 Lisp_Object attrs;
6053 int encodep, *max_lookup;
7d64c6ad
KH
6054{
6055 Lisp_Object standard, translation_table;
09ee6fdd 6056 Lisp_Object val;
7d64c6ad
KH
6057
6058 if (encodep)
6059 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6060 standard = Vstandard_translation_table_for_encode;
6061 else
6062 translation_table = CODING_ATTR_DECODE_TBL (attrs),
6063 standard = Vstandard_translation_table_for_decode;
7d64c6ad 6064 if (NILP (translation_table))
09ee6fdd
KH
6065 translation_table = standard;
6066 else
a6f87d34 6067 {
09ee6fdd
KH
6068 if (SYMBOLP (translation_table))
6069 translation_table = Fget (translation_table, Qtranslation_table);
6070 else if (CONSP (translation_table))
6071 {
6072 translation_table = Fcopy_sequence (translation_table);
6073 for (val = translation_table; CONSP (val); val = XCDR (val))
6074 if (SYMBOLP (XCAR (val)))
6075 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6076 }
6077 if (CHAR_TABLE_P (standard))
6078 {
6079 if (CONSP (translation_table))
6080 translation_table = nconc2 (translation_table,
6081 Fcons (standard, Qnil));
6082 else
6083 translation_table = Fcons (translation_table,
6084 Fcons (standard, Qnil));
6085 }
a6f87d34 6086 }
2170c8f0
KH
6087
6088 if (max_lookup)
09ee6fdd 6089 {
2170c8f0
KH
6090 *max_lookup = 1;
6091 if (CHAR_TABLE_P (translation_table)
6092 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6093 {
6094 val = XCHAR_TABLE (translation_table)->extras[1];
6095 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6096 *max_lookup = XFASTINT (val);
6097 }
6098 else if (CONSP (translation_table))
6099 {
6100 Lisp_Object tail, val;
09ee6fdd 6101
2170c8f0
KH
6102 for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6103 if (CHAR_TABLE_P (XCAR (tail))
6104 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6105 {
6106 val = XCHAR_TABLE (XCAR (tail))->extras[1];
6107 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6108 *max_lookup = XFASTINT (val);
6109 }
6110 }
a6f87d34 6111 }
7d64c6ad
KH
6112 return translation_table;
6113}
6114
09ee6fdd
KH
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables; any other TABLE leaves C alone and sets TRANS to nil.

   When an entry is itself a character, C is replaced by it and TRANS
   is reset to nil (for a list, the lookup then goes on with the next
   table using the new C).  When an entry is non-nil but not a
   character, it is left in TRANS and, for a list, the lookup stops.

   C and TRANS must be lvalues.  TABLE and C are evaluated more than
   once.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)                       \
  do {                                                                  \
    trans = Qnil;                                                       \
    if (CHAR_TABLE_P (table))                                           \
      {                                                                 \
        trans = CHAR_TABLE_REF (table, c);                              \
        if (CHARACTERP (trans))                                         \
          {                                                             \
            c = XFASTINT (trans);                                       \
            trans = Qnil;                                               \
          }                                                             \
      }                                                                 \
    else if (CONSP (table))                                             \
      {                                                                 \
        Lisp_Object lookup_rest_;                                       \
                                                                        \
        for (lookup_rest_ = table; CONSP (lookup_rest_);                \
             lookup_rest_ = XCDR (lookup_rest_))                        \
          {                                                             \
            if (! CHAR_TABLE_P (XCAR (lookup_rest_)))                   \
              continue;                                                 \
            trans = CHAR_TABLE_REF (XCAR (lookup_rest_), c);            \
            if (CHARACTERP (trans))                                     \
              {                                                         \
                c = XFASTINT (trans);                                   \
                trans = Qnil;                                           \
              }                                                         \
            else if (! NILP (trans))                                    \
              break;                                                    \
          }                                                             \
      }                                                                 \
  } while (0)
6139
7d64c6ad 6140
69a80ea3
KH
6141static Lisp_Object
6142get_translation (val, buf, buf_end, last_block, from_nchars, to_nchars)
6143 Lisp_Object val;
6144 int *buf, *buf_end;
6145 int last_block;
6146 int *from_nchars, *to_nchars;
6147{
433f7f87
KH
6148 /* VAL is TO or (([FROM-CHAR ...] . TO) ...) where TO is TO-CHAR or
6149 [TO-CHAR ...]. */
69a80ea3
KH
6150 if (CONSP (val))
6151 {
433f7f87 6152 Lisp_Object from, tail;
69a80ea3
KH
6153 int i, len;
6154
433f7f87 6155 for (tail = val; CONSP (tail); tail = XCDR (tail))
69a80ea3 6156 {
433f7f87
KH
6157 val = XCAR (tail);
6158 from = XCAR (val);
6159 len = ASIZE (from);
6160 for (i = 0; i < len; i++)
6161 {
6162 if (buf + i == buf_end)
6163 {
6164 if (! last_block)
6165 return Qt;
6166 break;
6167 }
6168 if (XINT (AREF (from, i)) != buf[i])
6169 break;
6170 }
6171 if (i == len)
6172 {
6173 val = XCDR (val);
6174 *from_nchars = len;
6175 break;
6176 }
69a80ea3 6177 }
433f7f87
KH
6178 if (! CONSP (tail))
6179 return Qnil;
69a80ea3
KH
6180 }
6181 if (VECTORP (val))
6182 *buf = XINT (AREF (val, 0)), *to_nchars = ASIZE (val);
6183 else
6184 *buf = XINT (val);
6185 return val;
6186}
6187
6188
d46c5b12 6189static int
69a80ea3 6190produce_chars (coding, translation_table, last_block)
df7492f9 6191 struct coding_system *coding;
69a80ea3
KH
6192 Lisp_Object translation_table;
6193 int last_block;
4ed46869 6194{
df7492f9
KH
6195 unsigned char *dst = coding->destination + coding->produced;
6196 unsigned char *dst_end = coding->destination + coding->dst_bytes;
119852e7
KH
6197 EMACS_INT produced;
6198 EMACS_INT produced_chars = 0;
69a80ea3 6199 int carryover = 0;
4ed46869 6200
df7492f9 6201 if (! coding->chars_at_source)
4ed46869 6202 {
119852e7 6203 /* Source characters are in coding->charbuf. */
fba4576f
AS
6204 int *buf = coding->charbuf;
6205 int *buf_end = buf + coding->charbuf_used;
4ed46869 6206
db274c7a
KH
6207 if (EQ (coding->src_object, coding->dst_object))
6208 {
6209 coding_set_source (coding);
6210 dst_end = ((unsigned char *) coding->source) + coding->consumed;
6211 }
4ed46869 6212
df7492f9 6213 while (buf < buf_end)
4ed46869 6214 {
69a80ea3 6215 int c = *buf, i;
bc4bc72a 6216
df7492f9
KH
6217 if (c >= 0)
6218 {
69a80ea3
KH
6219 int from_nchars = 1, to_nchars = 1;
6220 Lisp_Object trans = Qnil;
6221
09ee6fdd 6222 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
e6a54062 6223 if (! NILP (trans))
69a80ea3
KH
6224 {
6225 trans = get_translation (trans, buf, buf_end, last_block,
6226 &from_nchars, &to_nchars);
6227 if (EQ (trans, Qt))
6228 break;
6229 c = *buf;
6230 }
6231
6232 if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
6233 {
6234 dst = alloc_destination (coding,
6235 buf_end - buf
6236 + MAX_MULTIBYTE_LENGTH * to_nchars,
6237 dst);
db274c7a
KH
6238 if (EQ (coding->src_object, coding->dst_object))
6239 {
6240 coding_set_source (coding);
6241 dst_end = ((unsigned char *) coding->source) + coding->consumed;
6242 }
6243 else
6244 dst_end = coding->destination + coding->dst_bytes;
69a80ea3
KH
6245 }
6246
433f7f87 6247 for (i = 0; i < to_nchars; i++)
69a80ea3 6248 {
433f7f87
KH
6249 if (i > 0)
6250 c = XINT (AREF (trans, i));
69a80ea3
KH
6251 if (coding->dst_multibyte
6252 || ! CHAR_BYTE8_P (c))
db274c7a 6253 CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
69a80ea3
KH
6254 else
6255 *dst++ = CHAR_TO_BYTE8 (c);
6256 }
6257 produced_chars += to_nchars;
6258 *buf++ = to_nchars;
6259 while (--from_nchars > 0)
6260 *buf++ = 0;
d46c5b12 6261 }
df7492f9 6262 else
69a80ea3
KH
6263 /* This is an annotation datum. (-C) is the length. */
6264 buf += -c;
4ed46869 6265 }
69a80ea3 6266 carryover = buf_end - buf;
4ed46869 6267 }
fa42c37f 6268 else
fa42c37f 6269 {
119852e7 6270 /* Source characters are at coding->source. */
8f924df7 6271 const unsigned char *src = coding->source;
119852e7 6272 const unsigned char *src_end = src + coding->consumed;
4ed46869 6273
db274c7a
KH
6274 if (EQ (coding->dst_object, coding->src_object))
6275 dst_end = (unsigned char *) src;
df7492f9 6276 if (coding->src_multibyte != coding->dst_multibyte)
fa42c37f 6277 {
df7492f9 6278 if (coding->src_multibyte)
fa42c37f 6279 {
71c81426 6280 int multibytep = 1;
119852e7 6281 EMACS_INT consumed_chars;
d46c5b12 6282
df7492f9
KH
6283 while (1)
6284 {
8f924df7 6285 const unsigned char *src_base = src;
df7492f9 6286 int c;
b73bfc1c 6287
df7492f9 6288 ONE_MORE_BYTE (c);
119852e7 6289 if (dst == dst_end)
df7492f9 6290 {
119852e7
KH
6291 if (EQ (coding->src_object, coding->dst_object))
6292 dst_end = (unsigned char *) src;
6293 if (dst == dst_end)
df7492f9 6294 {
119852e7
KH
6295 EMACS_INT offset = src - coding->source;
6296
6297 dst = alloc_destination (coding, src_end - src + 1,
6298 dst);
6299 dst_end = coding->destination + coding->dst_bytes;
6300 coding_set_source (coding);
6301 src = coding->source + offset;
6302 src_end = coding->source + coding->src_bytes;
db274c7a
KH
6303 if (EQ (coding->src_object, coding->dst_object))
6304 dst_end = (unsigned char *) src;
df7492f9 6305 }
df7492f9
KH
6306 }
6307 *dst++ = c;
6308 produced_chars++;
6309 }
6310 no_more_source:
6311 ;
fa42c37f
KH
6312 }
6313 else
df7492f9
KH
6314 while (src < src_end)
6315 {
71c81426 6316 int multibytep = 1;
df7492f9 6317 int c = *src++;
b73bfc1c 6318
df7492f9
KH
6319 if (dst >= dst_end - 1)
6320 {
2c78b7e1 6321 if (EQ (coding->src_object, coding->dst_object))
8f924df7 6322 dst_end = (unsigned char *) src;
2c78b7e1
KH
6323 if (dst >= dst_end - 1)
6324 {
119852e7 6325 EMACS_INT offset = src - coding->source;
db274c7a 6326 EMACS_INT more_bytes;
119852e7 6327
db274c7a
KH
6328 if (EQ (coding->src_object, coding->dst_object))
6329 more_bytes = ((src_end - src) / 2) + 2;
6330 else
6331 more_bytes = src_end - src + 2;
6332 dst = alloc_destination (coding, more_bytes, dst);
2c78b7e1
KH
6333 dst_end = coding->destination + coding->dst_bytes;
6334 coding_set_source (coding);
119852e7 6335 src = coding->source + offset;
2c78b7e1 6336 src_end = coding->source + coding->src_bytes;
db274c7a
KH
6337 if (EQ (coding->src_object, coding->dst_object))
6338 dst_end = (unsigned char *) src;
2c78b7e1 6339 }
df7492f9
KH
6340 }
6341 EMIT_ONE_BYTE (c);
6342 }
d46c5b12 6343 }
df7492f9
KH
6344 else
6345 {
6346 if (!EQ (coding->src_object, coding->dst_object))
fa42c37f 6347 {
119852e7 6348 EMACS_INT require = coding->src_bytes - coding->dst_bytes;
4ed46869 6349
df7492f9 6350 if (require > 0)
fa42c37f 6351 {
df7492f9
KH
6352 EMACS_INT offset = src - coding->source;
6353
6354 dst = alloc_destination (coding, require, dst);
6355 coding_set_source (coding);
6356 src = coding->source + offset;
6357 src_end = coding->source + coding->src_bytes;
fa42c37f
KH
6358 }
6359 }
119852e7 6360 produced_chars = coding->consumed_char;
df7492f9 6361 while (src < src_end)
14daee73 6362 *dst++ = *src++;
fa42c37f
KH
6363 }
6364 }
6365
df7492f9 6366 produced = dst - (coding->destination + coding->produced);
284201e4 6367 if (BUFFERP (coding->dst_object) && produced_chars > 0)
df7492f9
KH
6368 insert_from_gap (produced_chars, produced);
6369 coding->produced += produced;
6370 coding->produced_char += produced_chars;
69a80ea3 6371 return carryover;
fa42c37f
KH
6372}
6373
ff0dacd7
KH
6374/* Compose text in CODING->object according to the annotation data at
6375 CHARBUF. CHARBUF is an array:
6376 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
df7492f9 6377 */
4ed46869 6378
df7492f9 6379static INLINE void
69a80ea3 6380produce_composition (coding, charbuf, pos)
4ed46869 6381 struct coding_system *coding;
df7492f9 6382 int *charbuf;
69a80ea3 6383 EMACS_INT pos;
4ed46869 6384{
df7492f9 6385 int len;
69a80ea3 6386 EMACS_INT to;
df7492f9 6387 enum composition_method method;
df7492f9 6388 Lisp_Object components;
fa42c37f 6389
df7492f9 6390 len = -charbuf[0];
69a80ea3 6391 to = pos + charbuf[2];
9ffd559c
KH
6392 if (to <= pos)
6393 return;
69a80ea3 6394 method = (enum composition_method) (charbuf[3]);
d46c5b12 6395
df7492f9
KH
6396 if (method == COMPOSITION_RELATIVE)
6397 components = Qnil;
9ffd559c
KH
6398 else if (method >= COMPOSITION_WITH_RULE
6399 && method <= COMPOSITION_WITH_RULE_ALTCHARS)
d46c5b12 6400 {
df7492f9
KH
6401 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6402 int i;
b73bfc1c 6403
69a80ea3
KH
6404 len -= 4;
6405 charbuf += 4;
df7492f9 6406 for (i = 0; i < len; i++)
9ffd559c
KH
6407 {
6408 args[i] = make_number (charbuf[i]);
f75c90a9 6409 if (charbuf[i] < 0)
9ffd559c
KH
6410 return;
6411 }
df7492f9
KH
6412 components = (method == COMPOSITION_WITH_ALTCHARS
6413 ? Fstring (len, args) : Fvector (len, args));
d46c5b12 6414 }
9ffd559c
KH
6415 else
6416 return;
69a80ea3 6417 compose_text (pos, to, components, Qnil, coding->dst_object);
d46c5b12
KH
6418}
6419
d46c5b12 6420
ff0dacd7
KH
6421/* Put `charset' property on text in CODING->object according to
6422 the annotation data at CHARBUF. CHARBUF is an array:
69a80ea3 6423 [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
ff0dacd7 6424 */
d46c5b12 6425
ff0dacd7 6426static INLINE void
69a80ea3 6427produce_charset (coding, charbuf, pos)
d46c5b12 6428 struct coding_system *coding;
ff0dacd7 6429 int *charbuf;
69a80ea3 6430 EMACS_INT pos;
d46c5b12 6431{
69a80ea3
KH
6432 EMACS_INT from = pos - charbuf[2];
6433 struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
b73bfc1c 6434
69a80ea3 6435 Fput_text_property (make_number (from), make_number (pos),
ff0dacd7
KH
6436 Qcharset, CHARSET_NAME (charset),
6437 coding->dst_object);
d46c5b12
KH
6438}
6439
d46c5b12 6440
df7492f9
KH
/* Default number of elements of the work area CODING->charbuf.  */
#define CHARBUF_SIZE 0x4000

/* Allocate the work area CODING->charbuf with alloca and record its
   size (in elements) in CODING->charbuf_size.  CHARBUF_SIZE elements
   are tried first; on failure the size is halved as long as it stays
   above 1024.

   CAUTION: if no area could be allocated, this macro records
   CODING_RESULT_INSUFFICIENT_MEM and executes `return
   CODING->result' in the function that uses it.  CODING is evaluated
   more than once.  */

#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int size = CHARBUF_SIZE;                                            \
                                                                        \
    (coding)->charbuf = NULL;                                           \
    while (size > 1024)                                                 \
      {                                                                 \
        (coding)->charbuf = (int *) alloca (sizeof (int) * size);       \
        if ((coding)->charbuf)                                          \
          break;                                                        \
        size >>= 1;                                                     \
      }                                                                 \
    if (! (coding)->charbuf)                                            \
      {                                                                 \
        record_conversion_result ((coding),                             \
                                  CODING_RESULT_INSUFFICIENT_MEM);      \
        return (coding)->result;                                        \
      }                                                                 \
    (coding)->charbuf_size = size;                                      \
  } while (0)
4ed46869 6462
d46c5b12
KH
6463
6464static void
69a80ea3 6465produce_annotation (coding, pos)
d46c5b12 6466 struct coding_system *coding;
69a80ea3 6467 EMACS_INT pos;
d46c5b12 6468{
df7492f9
KH
6469 int *charbuf = coding->charbuf;
6470 int *charbuf_end = charbuf + coding->charbuf_used;
d46c5b12 6471
ff0dacd7
KH
6472 if (NILP (coding->dst_object))
6473 return;
d46c5b12 6474
df7492f9 6475 while (charbuf < charbuf_end)
a84f1519 6476 {
df7492f9 6477 if (*charbuf >= 0)
69a80ea3 6478 pos += *charbuf++;
d46c5b12 6479 else
d46c5b12 6480 {
df7492f9 6481 int len = -*charbuf;
ff0dacd7 6482 switch (charbuf[1])
df7492f9
KH
6483 {
6484 case CODING_ANNOTATE_COMPOSITION_MASK:
69a80ea3 6485 produce_composition (coding, charbuf, pos);
df7492f9 6486 break;
ff0dacd7 6487 case CODING_ANNOTATE_CHARSET_MASK:
69a80ea3 6488 produce_charset (coding, charbuf, pos);
ff0dacd7 6489 break;
df7492f9
KH
6490 default:
6491 abort ();
6492 }
6493 charbuf += len;
d46c5b12 6494 }
a84f1519 6495 }
d46c5b12
KH
6496}
6497
df7492f9
KH
6498/* Decode the data at CODING->src_object into CODING->dst_object.
6499 CODING->src_object is a buffer, a string, or nil.
6500 CODING->dst_object is a buffer.
d46c5b12 6501
df7492f9
KH
6502 If CODING->src_object is a buffer, it must be the current buffer.
6503 In this case, if CODING->src_pos is positive, it is a position of
6504 the source text in the buffer, otherwise, the source text is in the
6505 gap area of the buffer, and CODING->src_pos specifies the offset of
6506 the text from GPT (which must be the same as PT). If this is the
6507 same buffer as CODING->dst_object, CODING->src_pos must be
6508 negative.
d46c5b12 6509
b6828792 6510 If CODING->src_object is a string, CODING->src_pos is an index to
df7492f9 6511 that string.
d46c5b12 6512
df7492f9
KH
6513 If CODING->src_object is nil, CODING->source must already point to
6514 the non-relocatable memory area. In this case, CODING->src_pos is
6515 an offset from CODING->source.
73be902c 6516
df7492f9
KH
6517 The decoded data is inserted at the current point of the buffer
6518 CODING->dst_object.
6519*/
d46c5b12 6520
df7492f9
KH
6521static int
6522decode_coding (coding)
d46c5b12 6523 struct coding_system *coding;
d46c5b12 6524{
df7492f9 6525 Lisp_Object attrs;
24a73b0a 6526 Lisp_Object undo_list;
7d64c6ad 6527 Lisp_Object translation_table;
69a80ea3
KH
6528 int carryover;
6529 int i;
d46c5b12 6530
df7492f9
KH
6531 if (BUFFERP (coding->src_object)
6532 && coding->src_pos > 0
6533 && coding->src_pos < GPT
6534 && coding->src_pos + coding->src_chars > GPT)
6535 move_gap_both (coding->src_pos, coding->src_pos_byte);
d46c5b12 6536
24a73b0a 6537 undo_list = Qt;
df7492f9 6538 if (BUFFERP (coding->dst_object))
1c3478b0 6539 {
df7492f9
KH
6540 if (current_buffer != XBUFFER (coding->dst_object))
6541 set_buffer_internal (XBUFFER (coding->dst_object));
6542 if (GPT != PT)
6543 move_gap_both (PT, PT_BYTE);
24a73b0a
KH
6544 undo_list = current_buffer->undo_list;
6545 current_buffer->undo_list = Qt;
1c3478b0
KH
6546 }
6547
df7492f9
KH
6548 coding->consumed = coding->consumed_char = 0;
6549 coding->produced = coding->produced_char = 0;
6550 coding->chars_at_source = 0;
065e3595 6551 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 6552 coding->errors = 0;
1c3478b0 6553
df7492f9
KH
6554 ALLOC_CONVERSION_WORK_AREA (coding);
6555
6556 attrs = CODING_ID_ATTRS (coding->id);
2170c8f0 6557 translation_table = get_translation_table (attrs, 0, NULL);
df7492f9 6558
69a80ea3 6559 carryover = 0;
df7492f9 6560 do
b73bfc1c 6561 {
69a80ea3
KH
6562 EMACS_INT pos = coding->dst_pos + coding->produced_char;
6563
df7492f9
KH
6564 coding_set_source (coding);
6565 coding->annotated = 0;
69a80ea3 6566 coding->charbuf_used = carryover;
df7492f9 6567 (*(coding->decoder)) (coding);
df7492f9 6568 coding_set_destination (coding);
69a80ea3 6569 carryover = produce_chars (coding, translation_table, 0);
df7492f9 6570 if (coding->annotated)
69a80ea3
KH
6571 produce_annotation (coding, pos);
6572 for (i = 0; i < carryover; i++)
6573 coding->charbuf[i]
6574 = coding->charbuf[coding->charbuf_used - carryover + i];
d46c5b12 6575 }
df7492f9 6576 while (coding->consumed < coding->src_bytes
54b367bb
KH
6577 && (coding->result == CODING_RESULT_SUCCESS
6578 || coding->result == CODING_RESULT_INVALID_SRC));
d46c5b12 6579
69a80ea3
KH
6580 if (carryover > 0)
6581 {
6582 coding_set_destination (coding);
6583 coding->charbuf_used = carryover;
6584 produce_chars (coding, translation_table, 1);
6585 }
6586
df7492f9
KH
6587 coding->carryover_bytes = 0;
6588 if (coding->consumed < coding->src_bytes)
d46c5b12 6589 {
df7492f9 6590 int nbytes = coding->src_bytes - coding->consumed;
8f924df7 6591 const unsigned char *src;
df7492f9
KH
6592
6593 coding_set_source (coding);
6594 coding_set_destination (coding);
6595 src = coding->source + coding->consumed;
6596
6597 if (coding->mode & CODING_MODE_LAST_BLOCK)
1c3478b0 6598 {
df7492f9
KH
6599 /* Flush out unprocessed data as binary chars. We are sure
6600 that the number of data is less than the size of
6601 coding->charbuf. */
065e3595 6602 coding->charbuf_used = 0;
df7492f9 6603 while (nbytes-- > 0)
1c3478b0 6604 {
df7492f9 6605 int c = *src++;
98725083 6606
1c91457d
KH
6607 if (c & 0x80)
6608 c = BYTE8_TO_CHAR (c);
6609 coding->charbuf[coding->charbuf_used++] = c;
1c3478b0 6610 }
f6cbaf43 6611 produce_chars (coding, Qnil, 1);
d46c5b12 6612 }
d46c5b12 6613 else
df7492f9
KH
6614 {
6615 /* Record unprocessed bytes in coding->carryover. We are
6616 sure that the number of data is less than the size of
6617 coding->carryover. */
6618 unsigned char *p = coding->carryover;
6619
6620 coding->carryover_bytes = nbytes;
6621 while (nbytes-- > 0)
6622 *p++ = *src++;
1c3478b0 6623 }
df7492f9 6624 coding->consumed = coding->src_bytes;
b73bfc1c 6625 }
69f76525 6626
4347441b
KH
6627 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
6628 decode_eol (coding);
24a73b0a
KH
6629 if (BUFFERP (coding->dst_object))
6630 {
6631 current_buffer->undo_list = undo_list;
6632 record_insert (coding->dst_pos, coding->produced_char);
6633 }
73be902c 6634 return coding->result;
4ed46869
KH
6635}
6636
aaaf0b1e 6637
e1c23804 6638/* Extract an annotation datum from a composition starting at POS and
ff0dacd7
KH
6639 ending before LIMIT of CODING->src_object (buffer or string), store
6640 the data in BUF, set *STOP to a starting position of the next
6641 composition (if any) or to LIMIT, and return the address of the
6642 next element of BUF.
6643
6644 If such an annotation is not found, set *STOP to a starting
6645 position of a composition after POS (if any) or to LIMIT, and
6646 return BUF. */
6647
6648static INLINE int *
6649handle_composition_annotation (pos, limit, coding, buf, stop)
6650 EMACS_INT pos, limit;
aaaf0b1e 6651 struct coding_system *coding;
ff0dacd7
KH
6652 int *buf;
6653 EMACS_INT *stop;
aaaf0b1e 6654{
ff0dacd7
KH
6655 EMACS_INT start, end;
6656 Lisp_Object prop;
aaaf0b1e 6657
ff0dacd7
KH
6658 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
6659 || end > limit)
6660 *stop = limit;
6661 else if (start > pos)
6662 *stop = start;
6663 else
aaaf0b1e 6664 {
ff0dacd7 6665 if (start == pos)
aaaf0b1e 6666 {
ff0dacd7
KH
6667 /* We found a composition. Store the corresponding
6668 annotation data in BUF. */
6669 int *head = buf;
6670 enum composition_method method = COMPOSITION_METHOD (prop);
6671 int nchars = COMPOSITION_LENGTH (prop);
6672
69a80ea3 6673 ADD_COMPOSITION_DATA (buf, nchars, method);
ff0dacd7 6674 if (method != COMPOSITION_RELATIVE)
aaaf0b1e 6675 {
ff0dacd7
KH
6676 Lisp_Object components;
6677 int len, i, i_byte;
6678
6679 components = COMPOSITION_COMPONENTS (prop);
6680 if (VECTORP (components))
aaaf0b1e 6681 {
ff0dacd7
KH
6682 len = XVECTOR (components)->size;
6683 for (i = 0; i < len; i++)
6684 *buf++ = XINT (AREF (components, i));
aaaf0b1e 6685 }
ff0dacd7 6686 else if (STRINGP (components))
aaaf0b1e 6687 {
8f924df7 6688 len = SCHARS (components);
ff0dacd7
KH
6689 i = i_byte = 0;
6690 while (i < len)
6691 {
6692 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
6693 buf++;
6694 }
6695 }
6696 else if (INTEGERP (components))
6697 {
6698 len = 1;
6699 *buf++ = XINT (components);
6700 }
6701 else if (CONSP (components))
6702 {
6703 for (len = 0; CONSP (components);
6704 len++, components = XCDR (components))
6705 *buf++ = XINT (XCAR (components));
aaaf0b1e 6706 }
aaaf0b1e 6707 else
ff0dacd7
KH
6708 abort ();
6709 *head -= len;
aaaf0b1e 6710 }
aaaf0b1e 6711 }
ff0dacd7
KH
6712
6713 if (find_composition (end, limit, &start, &end, &prop,
6714 coding->src_object)
6715 && end <= limit)
6716 *stop = start;
6717 else
6718 *stop = limit;
aaaf0b1e 6719 }
ff0dacd7
KH
6720 return buf;
6721}
6722
6723
e1c23804 6724/* Extract an annotation datum from a text property `charset' at POS of
ff0dacd7
KH
6725 CODING->src_object (buffer of string), store the data in BUF, set
6726 *STOP to the position where the value of `charset' property changes
6727 (limiting by LIMIT), and return the address of the next element of
6728 BUF.
6729
6730 If the property value is nil, set *STOP to the position where the
6731 property value is non-nil (limiting by LIMIT), and return BUF. */
6732
6733static INLINE int *
6734handle_charset_annotation (pos, limit, coding, buf, stop)
6735 EMACS_INT pos, limit;
6736 struct coding_system *coding;
6737 int *buf;
6738 EMACS_INT *stop;
6739{
6740 Lisp_Object val, next;
6741 int id;
6742
6743 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
6744 if (! NILP (val) && CHARSETP (val))
6745 id = XINT (CHARSET_SYMBOL_ID (val));
6746 else
6747 id = -1;
69a80ea3 6748 ADD_CHARSET_DATA (buf, 0, id);
ff0dacd7
KH
6749 next = Fnext_single_property_change (make_number (pos), Qcharset,
6750 coding->src_object,
6751 make_number (limit));
6752 *stop = XINT (next);
6753 return buf;
6754}
6755
6756
/* Read characters from CODING->source, starting at byte offset
   CODING->consumed, and store them as ints in CODING->charbuf.
   On return CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used describe how far the source was read and how
   many elements of the work area were filled.

   If TRANSLATION_TABLE is non-nil, each character is looked up in it;
   MAX_LOOKUP is the number of ints allocated for the scratch buffer
   handed to get_translation.  End-of-line conversion according to the
   coding system's EOL type is applied while reading.  */
df7492f9 6757static void
09ee6fdd 6758consume_chars (coding, translation_table, max_lookup)
df7492f9 6759 struct coding_system *coding;
433f7f87 6760 Lisp_Object translation_table;
09ee6fdd 6761 int max_lookup;
df7492f9
KH
6762{
6763 int *buf = coding->charbuf;
ff0dacd7 6764 int *buf_end = coding->charbuf + coding->charbuf_size;
7c78e542 6765 const unsigned char *src = coding->source + coding->consumed;
4776e638 6766 const unsigned char *src_end = coding->source + coding->src_bytes;
ff0dacd7
KH
6767 EMACS_INT pos = coding->src_pos + coding->consumed_char;
6768 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
df7492f9
KH
6769 int multibytep = coding->src_multibyte;
6770 Lisp_Object eol_type;
6771 int c;
ff0dacd7 6772 EMACS_INT stop, stop_composition, stop_charset;
09ee6fdd 6773 int *lookup_buf = NULL;
433f7f87
KH
6774
/* The lookup scratch buffer is needed only when a translation table
   is in effect.  */
6775 if (! NILP (translation_table))
09ee6fdd 6776 lookup_buf = alloca (sizeof (int) * max_lookup);
88993dfd 6777
df7492f9
KH
/* An EOL type that is still a vector is handled like `unix' here.  */
6778 eol_type = CODING_ID_EOL_TYPE (coding->id);
6779 if (VECTORP (eol_type))
6780 eol_type = Qunix;
88993dfd 6781
df7492f9
KH
6782 /* Note: composition handling is not yet implemented. */
6783 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
ec6d2bb8 6784
0b5670c9
KH
/* STOP is the next position at which an annotation handler has to be
   consulted.  With a nil source object there is nothing to consult
   before END_POS.  */
6785 if (NILP (coding->src_object))
6786 stop = stop_composition = stop_charset = end_pos;
ff0dacd7 6787 else
0b5670c9
KH
6788 {
6789 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6790 stop = stop_composition = pos;
6791 else
6792 stop = stop_composition = end_pos;
6793 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6794 stop = stop_charset = pos;
6795 else
6796 stop_charset = end_pos;
6797 }
ec6d2bb8 6798
24a73b0a 6799 /* Compensate for CRLF and conversion. */
ff0dacd7 6800 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
df7492f9 6801 while (buf < buf_end)
aaaf0b1e 6802 {
433f7f87
KH
6803 Lisp_Object trans;
6804
df7492f9 6805 if (pos == stop)
ec6d2bb8 6806 {
df7492f9
KH
/* All source characters have been read.  */
6807 if (pos == end_pos)
6808 break;
ff0dacd7
KH
/* Let the handlers store annotation data in BUF and compute the next
   stop position as the nearer of the two.  */
6809 if (pos == stop_composition)
6810 buf = handle_composition_annotation (pos, end_pos, coding,
6811 buf, &stop_composition);
6812 if (pos == stop_charset)
6813 buf = handle_charset_annotation (pos, end_pos, coding,
6814 buf, &stop_charset);
6815 stop = (stop_composition < stop_charset
6816 ? stop_composition : stop_charset);
df7492f9
KH
6817 }
6818
/* Fetch the next character into C, advancing SRC and POS.  */
6819 if (! multibytep)
4776e638 6820 {
d3e4cb56 6821 EMACS_INT bytes;
aaaf0b1e 6822
ea29edf2
KH
6823 if (coding->encoder == encode_coding_raw_text)
6824 c = *src++, pos++;
6825 else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
db274c7a 6826 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
4776e638 6827 else
f03caae0 6828 c = BYTE8_TO_CHAR (*src), src++, pos++;
4776e638 6829 }
df7492f9 6830 else
db274c7a 6831 c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
df7492f9
KH
/* End-of-line handling: under selective display a CR is treated as a
   newline; for a non-unix EOL type a newline becomes CR LF (`dos') or
   a lone CR (any other type).  */
6832 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6833 c = '\n';
6834 if (! EQ (eol_type, Qunix))
aaaf0b1e 6835 {
df7492f9 6836 if (c == '\n')
aaaf0b1e 6837 {
df7492f9
KH
6838 if (EQ (eol_type, Qdos))
6839 *buf++ = '\r';
6840 else
6841 c = '\r';
aaaf0b1e
KH
6842 }
6843 }
433f7f87 6844
e6a54062 6845 trans = Qnil;
09ee6fdd 6846 LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
e6a54062 6847 if (NILP (trans))
433f7f87
KH
6848 *buf++ = c;
6849 else
6850 {
6851 int from_nchars = 1, to_nchars = 1;
6852 int *lookup_buf_end;
6853 const unsigned char *p = src;
6854 int i;
6855
/* Collect C and up to MAX_LOOKUP - 1 following source characters in
   LOOKUP_BUF for get_translation; P reads ahead without moving SRC.  */
6856 lookup_buf[0] = c;
6857 for (i = 1; i < max_lookup && p < src_end; i++)
6858 lookup_buf[i] = STRING_CHAR_ADVANCE (p);
6859 lookup_buf_end = lookup_buf + i;
6860 trans = get_translation (trans, lookup_buf, lookup_buf_end, 1,
6861 &from_nchars, &to_nchars);
/* Stop filling when get_translation returned t or the TO_NCHARS
   resulting elements would not fit before BUF_END.  */
6862 if (EQ (trans, Qt)
6863 || buf + to_nchars > buf_end)
6864 break;
6865 *buf++ = *lookup_buf;
6866 for (i = 1; i < to_nchars; i++)
6867 *buf++ = XINT (AREF (trans, i));
/* Skip the remaining FROM_NCHARS - 1 source characters covered by
   this translation.  */
6868 for (i = 1; i < from_nchars; i++, pos++)
6869 src += MULTIBYTE_LENGTH_NO_CHECK (src);
6870 }
aaaf0b1e 6871 }
ec6d2bb8 6872
df7492f9
KH
/* Publish how much was read and how much of the work area is used.  */
6873 coding->consumed = src - coding->source;
6874 coding->consumed_char = pos - coding->src_pos;
6875 coding->charbuf_used = buf - coding->charbuf;
6876 coding->chars_at_source = 0;
aaaf0b1e
KH
6877}
6878
4ed46869 6879
df7492f9
KH
6880/* Encode the text at CODING->src_object into CODING->dst_object.
6881 CODING->src_object is a buffer or a string.
6882 CODING->dst_object is a buffer or nil.
6883
6884 If CODING->src_object is a buffer, it must be the current buffer.
6885 In this case, if CODING->src_pos is positive, it is a position of
6886 the source text in the buffer, otherwise. the source text is in the
6887 gap area of the buffer, and coding->src_pos specifies the offset of
6888 the text from GPT (which must be the same as PT). If this is the
6889 same buffer as CODING->dst_object, CODING->src_pos must be
6890 negative and CODING should not have `pre-write-conversion'.
6891
6892 If CODING->src_object is a string, CODING should not have
6893 `pre-write-conversion'.
6894
6895 If CODING->dst_object is a buffer, the encoded data is inserted at
6896 the current point of that buffer.
6897
6898 If CODING->dst_object is nil, the encoded data is placed at the
6899 memory area specified by CODING->destination. */
6900
6901static int
6902encode_coding (coding)
4ed46869 6903 struct coding_system *coding;
4ed46869 6904{
df7492f9 6905 Lisp_Object attrs;
7d64c6ad 6906 Lisp_Object translation_table;
09ee6fdd 6907 int max_lookup;
9861e777 6908
df7492f9 6909 attrs = CODING_ID_ATTRS (coding->id);
ea29edf2
KH
6910 if (coding->encoder == encode_coding_raw_text)
6911 translation_table = Qnil, max_lookup = 0;
6912 else
6913 translation_table = get_translation_table (attrs, 1, &max_lookup);
4ed46869 6914
df7492f9 6915 if (BUFFERP (coding->dst_object))
8844fa83 6916 {
df7492f9
KH
6917 set_buffer_internal (XBUFFER (coding->dst_object));
6918 coding->dst_multibyte
6919 = ! NILP (current_buffer->enable_multibyte_characters);
8844fa83 6920 }
4ed46869 6921
b73bfc1c 6922 coding->consumed = coding->consumed_char = 0;
df7492f9 6923 coding->produced = coding->produced_char = 0;
065e3595 6924 record_conversion_result (coding, CODING_RESULT_SUCCESS);
b73bfc1c 6925 coding->errors = 0;
b73bfc1c 6926
df7492f9 6927 ALLOC_CONVERSION_WORK_AREA (coding);
4ed46869 6928
df7492f9
KH
6929 do {
6930 coding_set_source (coding);
09ee6fdd 6931 consume_chars (coding, translation_table, max_lookup);
df7492f9
KH
6932 coding_set_destination (coding);
6933 (*(coding->encoder)) (coding);
6934 } while (coding->consumed_char < coding->src_chars);
6935
284201e4 6936 if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
df7492f9
KH
6937 insert_from_gap (coding->produced_char, coding->produced);
6938
6939 return (coding->result);
ec6d2bb8
KH
6940}
6941
fb88bf2d 6942
24a73b0a
KH
6943/* Name (or base name) of the work buffers used for code conversion.  */
6944static Lisp_Object Vcode_conversion_workbuf_name;
d46c5b12 6945
24a73b0a
KH
6946/* A working buffer used by the top level conversion.  Once it is
   created, it is never destroyed.  It has the name
   Vcode_conversion_workbuf_name.  The other working buffers are
   destroyed after their use is finished, and their names are modified
   versions of Vcode_conversion_workbuf_name.  */
6951static Lisp_Object Vcode_conversion_reused_workbuf;
b73bfc1c 6952
24a73b0a
KH
6953/* Nonzero iff Vcode_conversion_reused_workbuf is already in use.
   make_conversion_work_buffer increments it on every call, and
   code_conversion_restore resets it to 0 when it releases the reused
   buffer.  */
6954static int reused_workbuf_in_use;
4ed46869 6955
24a73b0a
KH
6956
6957/* Return a working buffer of code convesion. MULTIBYTE specifies the
6958 multibyteness of returning buffer. */
b73bfc1c 6959
f6cbaf43 6960static Lisp_Object
24a73b0a 6961make_conversion_work_buffer (multibyte)
f6cbaf43 6962 int multibyte;
df7492f9 6963{
24a73b0a
KH
6964 Lisp_Object name, workbuf;
6965 struct buffer *current;
4ed46869 6966
24a73b0a 6967 if (reused_workbuf_in_use++)
065e3595
KH
6968 {
6969 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6970 workbuf = Fget_buffer_create (name);
6971 }
df7492f9 6972 else
065e3595
KH
6973 {
6974 name = Vcode_conversion_workbuf_name;
6975 workbuf = Fget_buffer_create (name);
6976 if (NILP (Vcode_conversion_reused_workbuf))
6977 Vcode_conversion_reused_workbuf = workbuf;
6978 }
24a73b0a
KH
6979 current = current_buffer;
6980 set_buffer_internal (XBUFFER (workbuf));
3ed051d4 6981 Ferase_buffer ();
df7492f9 6982 current_buffer->undo_list = Qt;
24a73b0a 6983 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
df7492f9 6984 set_buffer_internal (current);
24a73b0a 6985 return workbuf;
df7492f9 6986}
d46c5b12 6987
24a73b0a 6988
4776e638 6989static Lisp_Object
24a73b0a
KH
6990code_conversion_restore (arg)
6991 Lisp_Object arg;
4776e638 6992{
24a73b0a 6993 Lisp_Object current, workbuf;
948bdcf3 6994 struct gcpro gcpro1;
24a73b0a 6995
948bdcf3 6996 GCPRO1 (arg);
24a73b0a
KH
6997 current = XCAR (arg);
6998 workbuf = XCDR (arg);
6999 if (! NILP (workbuf))
7000 {
7001 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7002 reused_workbuf_in_use = 0;
7003 else if (! NILP (Fbuffer_live_p (workbuf)))
7004 Fkill_buffer (workbuf);
7005 }
7006 set_buffer_internal (XBUFFER (current));
948bdcf3 7007 UNGCPRO;
4776e638
KH
7008 return Qnil;
7009}
b73bfc1c 7010
24a73b0a
KH
7011Lisp_Object
7012code_conversion_save (with_work_buf, multibyte)
4776e638 7013 int with_work_buf, multibyte;
df7492f9 7014{
24a73b0a 7015 Lisp_Object workbuf = Qnil;
b73bfc1c 7016
4776e638 7017 if (with_work_buf)
24a73b0a
KH
7018 workbuf = make_conversion_work_buffer (multibyte);
7019 record_unwind_protect (code_conversion_restore,
7020 Fcons (Fcurrent_buffer (), workbuf));
4776e638 7021 return workbuf;
df7492f9 7022}
d46c5b12 7023
df7492f9
KH
7024int
7025decode_coding_gap (coding, chars, bytes)
7026 struct coding_system *coding;
7027 EMACS_INT chars, bytes;
7028{
7029 int count = specpdl_ptr - specpdl;
5e5c78be 7030 Lisp_Object attrs;
fb88bf2d 7031
24a73b0a 7032 code_conversion_save (0, 0);
ec6d2bb8 7033
24a73b0a 7034 coding->src_object = Fcurrent_buffer ();
df7492f9
KH
7035 coding->src_chars = chars;
7036 coding->src_bytes = bytes;
7037 coding->src_pos = -chars;
7038 coding->src_pos_byte = -bytes;
7039 coding->src_multibyte = chars < bytes;
24a73b0a 7040 coding->dst_object = coding->src_object;
df7492f9
KH
7041 coding->dst_pos = PT;
7042 coding->dst_pos_byte = PT_BYTE;
71c81426 7043 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
4ed46869 7044
df7492f9
KH
7045 if (CODING_REQUIRE_DETECTION (coding))
7046 detect_coding (coding);
8f924df7 7047
9286b333 7048 coding->mode |= CODING_MODE_LAST_BLOCK;
287c57d7 7049 current_buffer->text->inhibit_shrinking = 1;
df7492f9 7050 decode_coding (coding);
287c57d7 7051 current_buffer->text->inhibit_shrinking = 0;
d46c5b12 7052
5e5c78be
KH
7053 attrs = CODING_ID_ATTRS (coding->id);
7054 if (! NILP (CODING_ATTR_POST_READ (attrs)))
b73bfc1c 7055 {
5e5c78be
KH
7056 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7057 Lisp_Object val;
7058
7059 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
5e5c78be
KH
7060 val = call1 (CODING_ATTR_POST_READ (attrs),
7061 make_number (coding->produced_char));
5e5c78be
KH
7062 CHECK_NATNUM (val);
7063 coding->produced_char += Z - prev_Z;
7064 coding->produced += Z_BYTE - prev_Z_BYTE;
b73bfc1c 7065 }
4ed46869 7066
df7492f9 7067 unbind_to (count, Qnil);
b73bfc1c
KH
7068 return coding->result;
7069}
52d41803 7070
4ed46869 7071int
df7492f9 7072encode_coding_gap (coding, chars, bytes)
4ed46869 7073 struct coding_system *coding;
df7492f9 7074 EMACS_INT chars, bytes;
4ed46869 7075{
df7492f9 7076 int count = specpdl_ptr - specpdl;
4ed46869 7077
24a73b0a 7078 code_conversion_save (0, 0);
4ed46869 7079
24a73b0a 7080 coding->src_object = Fcurrent_buffer ();
df7492f9
KH
7081 coding->src_chars = chars;
7082 coding->src_bytes = bytes;
7083 coding->src_pos = -chars;
7084 coding->src_pos_byte = -bytes;
7085 coding->src_multibyte = chars < bytes;
7086 coding->dst_object = coding->src_object;
7087 coding->dst_pos = PT;
7088 coding->dst_pos_byte = PT_BYTE;
4ed46869 7089
df7492f9 7090 encode_coding (coding);
b73bfc1c 7091
df7492f9
KH
7092 unbind_to (count, Qnil);
7093 return coding->result;
7094}
4ed46869 7095
d46c5b12 7096
df7492f9
KH
7097/* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7098 SRC_OBJECT into DST_OBJECT by coding context CODING.
b73bfc1c 7099
df7492f9 7100 SRC_OBJECT is a buffer, a string, or Qnil.
b73bfc1c 7101
df7492f9
KH
7102 If it is a buffer, the text is at point of the buffer. FROM and TO
7103 are positions in the buffer.
b73bfc1c 7104
df7492f9
KH
7105 If it is a string, the text is at the beginning of the string.
7106 FROM and TO are indices to the string.
4ed46869 7107
df7492f9
KH
7108 If it is nil, the text is at coding->source. FROM and TO are
7109 indices to coding->source.
bb10be8b 7110
df7492f9 7111 DST_OBJECT is a buffer, Qt, or Qnil.
4ed46869 7112
df7492f9
KH
7113 If it is a buffer, the decoded text is inserted at point of the
7114 buffer. If the buffer is the same as SRC_OBJECT, the source text
7115 is deleted.
4ed46869 7116
df7492f9
KH
7117 If it is Qt, a string is made from the decoded text, and
7118 set in CODING->dst_object.
d46c5b12 7119
df7492f9 7120 If it is Qnil, the decoded text is stored at CODING->destination.
2cb26057 7121 The caller must allocate CODING->dst_bytes bytes at
df7492f9
KH
7122 CODING->destination by xmalloc. If the decoded text is longer than
7123 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7124 */
d46c5b12 7125
df7492f9
KH
7126void
7127decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7128 dst_object)
d46c5b12 7129 struct coding_system *coding;
df7492f9
KH
7130 Lisp_Object src_object;
7131 EMACS_INT from, from_byte, to, to_byte;
7132 Lisp_Object dst_object;
d46c5b12 7133{
df7492f9
KH
7134 int count = specpdl_ptr - specpdl;
7135 unsigned char *destination;
7136 EMACS_INT dst_bytes;
7137 EMACS_INT chars = to - from;
7138 EMACS_INT bytes = to_byte - from_byte;
7139 Lisp_Object attrs;
4776e638 7140 int saved_pt = -1, saved_pt_byte;
64cedb0c 7141 int need_marker_adjustment = 0;
b3bfad50 7142 Lisp_Object old_deactivate_mark;
d46c5b12 7143
b3bfad50 7144 old_deactivate_mark = Vdeactivate_mark;
93dec019 7145
df7492f9 7146 if (NILP (dst_object))
d46c5b12 7147 {
df7492f9
KH
7148 destination = coding->destination;
7149 dst_bytes = coding->dst_bytes;
d46c5b12 7150 }
93dec019 7151
df7492f9
KH
7152 coding->src_object = src_object;
7153 coding->src_chars = chars;
7154 coding->src_bytes = bytes;
7155 coding->src_multibyte = chars < bytes;
70ad9fc4 7156
df7492f9 7157 if (STRINGP (src_object))
d46c5b12 7158 {
df7492f9
KH
7159 coding->src_pos = from;
7160 coding->src_pos_byte = from_byte;
d46c5b12 7161 }
df7492f9 7162 else if (BUFFERP (src_object))
88993dfd 7163 {
df7492f9
KH
7164 set_buffer_internal (XBUFFER (src_object));
7165 if (from != GPT)
7166 move_gap_both (from, from_byte);
7167 if (EQ (src_object, dst_object))
fb88bf2d 7168 {
64cedb0c
KH
7169 struct Lisp_Marker *tail;
7170
7171 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7172 {
7173 tail->need_adjustment
7174 = tail->charpos == (tail->insertion_type ? from : to);
7175 need_marker_adjustment |= tail->need_adjustment;
7176 }
4776e638 7177 saved_pt = PT, saved_pt_byte = PT_BYTE;
df7492f9 7178 TEMP_SET_PT_BOTH (from, from_byte);
f4a3cc44 7179 current_buffer->text->inhibit_shrinking = 1;
df7492f9
KH
7180 del_range_both (from, from_byte, to, to_byte, 1);
7181 coding->src_pos = -chars;
7182 coding->src_pos_byte = -bytes;
fb88bf2d 7183 }
df7492f9 7184 else
fb88bf2d 7185 {
df7492f9
KH
7186 coding->src_pos = from;
7187 coding->src_pos_byte = from_byte;
fb88bf2d 7188 }
88993dfd
KH
7189 }
7190
df7492f9
KH
7191 if (CODING_REQUIRE_DETECTION (coding))
7192 detect_coding (coding);
7193 attrs = CODING_ID_ATTRS (coding->id);
d46c5b12 7194
2cb26057
KH
7195 if (EQ (dst_object, Qt)
7196 || (! NILP (CODING_ATTR_POST_READ (attrs))
7197 && NILP (dst_object)))
b73bfc1c 7198 {
a1567c45
SM
7199 coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7200 coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
df7492f9
KH
7201 coding->dst_pos = BEG;
7202 coding->dst_pos_byte = BEG_BYTE;
b73bfc1c 7203 }
df7492f9 7204 else if (BUFFERP (dst_object))
d46c5b12 7205 {
24a73b0a 7206 code_conversion_save (0, 0);
df7492f9
KH
7207 coding->dst_object = dst_object;
7208 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7209 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7210 coding->dst_multibyte
7211 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
d46c5b12
KH
7212 }
7213 else
7214 {
24a73b0a 7215 code_conversion_save (0, 0);
df7492f9 7216 coding->dst_object = Qnil;
0154725e
SM
7217 /* Most callers presume this will return a multibyte result, and they
7218 won't use `binary' or `raw-text' anyway, so let's not worry about
7219 CODING_FOR_UNIBYTE. */
bb555731 7220 coding->dst_multibyte = 1;
d46c5b12
KH
7221 }
7222
df7492f9 7223 decode_coding (coding);
fa46990e 7224
df7492f9
KH
7225 if (BUFFERP (coding->dst_object))
7226 set_buffer_internal (XBUFFER (coding->dst_object));
d46c5b12 7227
df7492f9 7228 if (! NILP (CODING_ATTR_POST_READ (attrs)))
d46c5b12 7229 {
b3bfad50 7230 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
df7492f9 7231 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
2b4f9037 7232 Lisp_Object val;
d46c5b12 7233
c0cc7f7f 7234 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
b3bfad50
KH
7235 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7236 old_deactivate_mark);
d4850d67
KH
7237 val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7238 make_number (coding->produced_char));
df7492f9
KH
7239 UNGCPRO;
7240 CHECK_NATNUM (val);
7241 coding->produced_char += Z - prev_Z;
7242 coding->produced += Z_BYTE - prev_Z_BYTE;
d46c5b12 7243 }
de79a6a5 7244
df7492f9 7245 if (EQ (dst_object, Qt))
ec6d2bb8 7246 {
df7492f9
KH
7247 coding->dst_object = Fbuffer_string ();
7248 }
7249 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7250 {
7251 set_buffer_internal (XBUFFER (coding->dst_object));
7252 if (dst_bytes < coding->produced)
7253 {
b3bfad50 7254 destination = xrealloc (destination, coding->produced);
df7492f9
KH
7255 if (! destination)
7256 {
065e3595
KH
7257 record_conversion_result (coding,
7258 CODING_RESULT_INSUFFICIENT_DST);
df7492f9
KH
7259 unbind_to (count, Qnil);
7260 return;
7261 }
7262 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7263 move_gap_both (BEGV, BEGV_BYTE);
7264 bcopy (BEGV_ADDR, destination, coding->produced);
7265 coding->destination = destination;
d46c5b12 7266 }
ec6d2bb8 7267 }
b73bfc1c 7268
4776e638
KH
7269 if (saved_pt >= 0)
7270 {
7271 /* This is the case of:
7272 (BUFFERP (src_object) && EQ (src_object, dst_object))
7273 As we have moved PT while replacing the original buffer
7274 contents, we must recover it now. */
7275 set_buffer_internal (XBUFFER (src_object));
f4a3cc44 7276 current_buffer->text->inhibit_shrinking = 0;
4776e638
KH
7277 if (saved_pt < from)
7278 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7279 else if (saved_pt < from + chars)
7280 TEMP_SET_PT_BOTH (from, from_byte);
7281 else if (! NILP (current_buffer->enable_multibyte_characters))
7282 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7283 saved_pt_byte + (coding->produced - bytes));
7284 else
7285 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7286 saved_pt_byte + (coding->produced - bytes));
64cedb0c
KH
7287
7288 if (need_marker_adjustment)
7289 {
7290 struct Lisp_Marker *tail;
7291
7292 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7293 if (tail->need_adjustment)
7294 {
7295 tail->need_adjustment = 0;
7296 if (tail->insertion_type)
7297 {
7298 tail->bytepos = from_byte;
7299 tail->charpos = from;
7300 }
7301 else
7302 {
7303 tail->bytepos = from_byte + coding->produced;
7304 tail->charpos
7305 = (NILP (current_buffer->enable_multibyte_characters)
7306 ? tail->bytepos : from + coding->produced_char);
7307 }
7308 }
7309 }
d46c5b12 7310 }
4776e638 7311
b3bfad50 7312 Vdeactivate_mark = old_deactivate_mark;
065e3595 7313 unbind_to (count, coding->dst_object);
d46c5b12
KH
7314}
7315
d46c5b12 7316
df7492f9
KH
7317void
7318encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7319 dst_object)
d46c5b12 7320 struct coding_system *coding;
df7492f9
KH
7321 Lisp_Object src_object;
7322 EMACS_INT from, from_byte, to, to_byte;
7323 Lisp_Object dst_object;
d46c5b12 7324{
b73bfc1c 7325 int count = specpdl_ptr - specpdl;
df7492f9
KH
7326 EMACS_INT chars = to - from;
7327 EMACS_INT bytes = to_byte - from_byte;
7328 Lisp_Object attrs;
4776e638 7329 int saved_pt = -1, saved_pt_byte;
64cedb0c 7330 int need_marker_adjustment = 0;
c02d943b 7331 int kill_src_buffer = 0;
b3bfad50 7332 Lisp_Object old_deactivate_mark;
df7492f9 7333
b3bfad50 7334 old_deactivate_mark = Vdeactivate_mark;
df7492f9
KH
7335
7336 coding->src_object = src_object;
7337 coding->src_chars = chars;
7338 coding->src_bytes = bytes;
7339 coding->src_multibyte = chars < bytes;
7340
7341 attrs = CODING_ID_ATTRS (coding->id);
7342
64cedb0c
KH
7343 if (EQ (src_object, dst_object))
7344 {
7345 struct Lisp_Marker *tail;
7346
7347 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7348 {
7349 tail->need_adjustment
7350 = tail->charpos == (tail->insertion_type ? from : to);
7351 need_marker_adjustment |= tail->need_adjustment;
7352 }
7353 }
7354
df7492f9 7355 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6bac5b12 7356 {
24a73b0a 7357 coding->src_object = code_conversion_save (1, coding->src_multibyte);
df7492f9
KH
7358 set_buffer_internal (XBUFFER (coding->src_object));
7359 if (STRINGP (src_object))
7360 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7361 else if (BUFFERP (src_object))
7362 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7363 else
7364 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
d46c5b12 7365
df7492f9
KH
7366 if (EQ (src_object, dst_object))
7367 {
7368 set_buffer_internal (XBUFFER (src_object));
4776e638 7369 saved_pt = PT, saved_pt_byte = PT_BYTE;
df7492f9
KH
7370 del_range_both (from, from_byte, to, to_byte, 1);
7371 set_buffer_internal (XBUFFER (coding->src_object));
7372 }
7373
d4850d67
KH
7374 {
7375 Lisp_Object args[3];
b3bfad50 7376 struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
d4850d67 7377
b3bfad50
KH
7378 GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7379 old_deactivate_mark);
d4850d67
KH
7380 args[0] = CODING_ATTR_PRE_WRITE (attrs);
7381 args[1] = make_number (BEG);
7382 args[2] = make_number (Z);
7383 safe_call (3, args);
b3bfad50 7384 UNGCPRO;
d4850d67 7385 }
c02d943b
KH
7386 if (XBUFFER (coding->src_object) != current_buffer)
7387 kill_src_buffer = 1;
ac87bbef 7388 coding->src_object = Fcurrent_buffer ();
df7492f9
KH
7389 if (BEG != GPT)
7390 move_gap_both (BEG, BEG_BYTE);
7391 coding->src_chars = Z - BEG;
7392 coding->src_bytes = Z_BYTE - BEG_BYTE;
7393 coding->src_pos = BEG;
7394 coding->src_pos_byte = BEG_BYTE;
7395 coding->src_multibyte = Z < Z_BYTE;
7396 }
7397 else if (STRINGP (src_object))
d46c5b12 7398 {
24a73b0a 7399 code_conversion_save (0, 0);
df7492f9
KH
7400 coding->src_pos = from;
7401 coding->src_pos_byte = from_byte;
b73bfc1c 7402 }
df7492f9 7403 else if (BUFFERP (src_object))
b73bfc1c 7404 {
24a73b0a 7405 code_conversion_save (0, 0);
df7492f9 7406 set_buffer_internal (XBUFFER (src_object));
df7492f9 7407 if (EQ (src_object, dst_object))
d46c5b12 7408 {
4776e638 7409 saved_pt = PT, saved_pt_byte = PT_BYTE;
ff0dacd7
KH
7410 coding->src_object = del_range_1 (from, to, 1, 1);
7411 coding->src_pos = 0;
7412 coding->src_pos_byte = 0;
d46c5b12 7413 }
df7492f9 7414 else
d46c5b12 7415 {
ff0dacd7
KH
7416 if (from < GPT && to >= GPT)
7417 move_gap_both (from, from_byte);
df7492f9
KH
7418 coding->src_pos = from;
7419 coding->src_pos_byte = from_byte;
d46c5b12 7420 }
d46c5b12 7421 }
4776e638 7422 else
24a73b0a 7423 code_conversion_save (0, 0);
d46c5b12 7424
df7492f9 7425 if (BUFFERP (dst_object))
88993dfd 7426 {
df7492f9 7427 coding->dst_object = dst_object;
28f67a95
KH
7428 if (EQ (src_object, dst_object))
7429 {
7430 coding->dst_pos = from;
7431 coding->dst_pos_byte = from_byte;
7432 }
7433 else
7434 {
319a3947
KH
7435 struct buffer *current = current_buffer;
7436
7437 set_buffer_temp (XBUFFER (dst_object));
7438 coding->dst_pos = PT;
7439 coding->dst_pos_byte = PT_BYTE;
7440 move_gap_both (coding->dst_pos, coding->dst_pos_byte);
7441 set_buffer_temp (current);
28f67a95 7442 }
df7492f9
KH
7443 coding->dst_multibyte
7444 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
88993dfd 7445 }
df7492f9 7446 else if (EQ (dst_object, Qt))
d46c5b12 7447 {
df7492f9 7448 coding->dst_object = Qnil;
df7492f9 7449 coding->dst_bytes = coding->src_chars;
ac87bbef
KH
7450 if (coding->dst_bytes == 0)
7451 coding->dst_bytes = 1;
7452 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
df7492f9 7453 coding->dst_multibyte = 0;
d46c5b12
KH
7454 }
7455 else
7456 {
df7492f9
KH
7457 coding->dst_object = Qnil;
7458 coding->dst_multibyte = 0;
d46c5b12
KH
7459 }
7460
df7492f9 7461 encode_coding (coding);
d46c5b12 7462
df7492f9 7463 if (EQ (dst_object, Qt))
d46c5b12 7464 {
df7492f9
KH
7465 if (BUFFERP (coding->dst_object))
7466 coding->dst_object = Fbuffer_string ();
7467 else
d46c5b12 7468 {
df7492f9
KH
7469 coding->dst_object
7470 = make_unibyte_string ((char *) coding->destination,
7471 coding->produced);
7472 xfree (coding->destination);
d46c5b12 7473 }
4ed46869 7474 }
d46c5b12 7475
4776e638
KH
7476 if (saved_pt >= 0)
7477 {
7478 /* This is the case of:
7479 (BUFFERP (src_object) && EQ (src_object, dst_object))
7480 As we have moved PT while replacing the original buffer
7481 contents, we must recover it now. */
7482 set_buffer_internal (XBUFFER (src_object));
7483 if (saved_pt < from)
7484 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7485 else if (saved_pt < from + chars)
7486 TEMP_SET_PT_BOTH (from, from_byte);
7487 else if (! NILP (current_buffer->enable_multibyte_characters))
7488 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7489 saved_pt_byte + (coding->produced - bytes));
d46c5b12 7490 else
4776e638
KH
7491 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7492 saved_pt_byte + (coding->produced - bytes));
64cedb0c
KH
7493
7494 if (need_marker_adjustment)
7495 {
7496 struct Lisp_Marker *tail;
7497
7498 for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7499 if (tail->need_adjustment)
7500 {
7501 tail->need_adjustment = 0;
7502 if (tail->insertion_type)
7503 {
7504 tail->bytepos = from_byte;
7505 tail->charpos = from;
7506 }
7507 else
7508 {
7509 tail->bytepos = from_byte + coding->produced;
7510 tail->charpos
7511 = (NILP (current_buffer->enable_multibyte_characters)
7512 ? tail->bytepos : from + coding->produced_char);
7513 }
7514 }
7515 }
4776e638
KH
7516 }
7517
c02d943b
KH
7518 if (kill_src_buffer)
7519 Fkill_buffer (coding->src_object);
b3bfad50
KH
7520
7521 Vdeactivate_mark = old_deactivate_mark;
df7492f9 7522 unbind_to (count, Qnil);
b73bfc1c
KH
7523}
7524
df7492f9 7525
b73bfc1c 7526Lisp_Object
df7492f9 7527preferred_coding_system ()
b73bfc1c 7528{
df7492f9 7529 int id = coding_categories[coding_priorities[0]].id;
2391eaa4 7530
df7492f9 7531 return CODING_ID_NAME (id);
4ed46869
KH
7532}
7533
7534\f
7535#ifdef emacs
1397dc18 7536/*** 11. Emacs Lisp library functions ***/
4ed46869 7537
4ed46869 7538DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
48b0f3ae 7539 doc: /* Return t if OBJECT is nil or a coding-system.
df7492f9 7540See the documentation of `define-coding-system' for information
48b0f3ae 7541about coding-system objects. */)
d4a1d553
JB
7542 (object)
7543 Lisp_Object object;
4ed46869 7544{
d4a1d553
JB
7545 if (NILP (object)
7546 || CODING_SYSTEM_ID (object) >= 0)
44e8490d 7547 return Qt;
d4a1d553
JB
7548 if (! SYMBOLP (object)
7549 || NILP (Fget (object, Qcoding_system_define_form)))
44e8490d
KH
7550 return Qnil;
7551 return Qt;
4ed46869
KH
7552}
7553
9d991de8
RS
7554DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
7555 Sread_non_nil_coding_system, 1, 1, 0,
48b0f3ae
PJ
7556 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
7557 (prompt)
4ed46869
KH
7558 Lisp_Object prompt;
7559{
e0e989f6 7560 Lisp_Object val;
9d991de8
RS
7561 do
7562 {
4608c386
KH
7563 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7564 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
9d991de8 7565 }
8f924df7 7566 while (SCHARS (val) == 0);
e0e989f6 7567 return (Fintern (val, Qnil));
4ed46869
KH
7568}
7569
9b787f3e 7570DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
48b0f3ae 7571 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
c7183fb8
GM
7572If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
7573Ignores case when completing coding systems (all Emacs coding systems
7574are lower-case). */)
48b0f3ae 7575 (prompt, default_coding_system)
9b787f3e 7576 Lisp_Object prompt, default_coding_system;
4ed46869 7577{
f44d27ce 7578 Lisp_Object val;
c7183fb8
GM
7579 int count = SPECPDL_INDEX ();
7580
9b787f3e 7581 if (SYMBOLP (default_coding_system))
57d25e6f 7582 default_coding_system = SYMBOL_NAME (default_coding_system);
c7183fb8 7583 specbind (Qcompletion_ignore_case, Qt);
4608c386 7584 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
9b787f3e
RS
7585 Qt, Qnil, Qcoding_system_history,
7586 default_coding_system, Qnil);
c7183fb8 7587 unbind_to (count, Qnil);
8f924df7 7588 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
7589}
7590
7591DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
7592 1, 1, 0,
48b0f3ae 7593 doc: /* Check validity of CODING-SYSTEM.
9ffd559c
KH
7594If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
7595It is valid if it is nil or a symbol defined as a coding system by the
7596function `define-coding-system'. */)
df7492f9 7597 (coding_system)
4ed46869
KH
7598 Lisp_Object coding_system;
7599{
44e8490d
KH
7600 Lisp_Object define_form;
7601
7602 define_form = Fget (coding_system, Qcoding_system_define_form);
7603 if (! NILP (define_form))
7604 {
7605 Fput (coding_system, Qcoding_system_define_form, Qnil);
7606 safe_eval (define_form);
7607 }
4ed46869
KH
7608 if (!NILP (Fcoding_system_p (coding_system)))
7609 return coding_system;
fcad4ec4 7610 xsignal1 (Qcoding_system_error, coding_system);
4ed46869 7611}
df7492f9 7612
3a73fa5d 7613\f
89528eb3
KH
7614/* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
7615 HIGHEST is nonzero, return the coding system of the highest
7616 priority among the detected coding systems. Otherwize return a
7617 list of detected coding systems sorted by their priorities. If
7618 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
7619 multibyte form but contains only ASCII and eight-bit chars.
7620 Otherwise, the bytes are raw bytes.
7621
7622 CODING-SYSTEM controls the detection as below:
7623
7624 If it is nil, detect both text-format and eol-format. If the
7625 text-format part of CODING-SYSTEM is already specified
7626 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
7627 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
7628 detect only text-format. */
7629
d46c5b12 7630Lisp_Object
24a73b0a
KH
7631detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
7632 coding_system)
8f924df7 7633 const unsigned char *src;
13818c30
SM
7634 EMACS_INT src_chars, src_bytes;
7635 int highest;
0a28aafb 7636 int multibytep;
df7492f9 7637 Lisp_Object coding_system;
4ed46869 7638{
8f924df7 7639 const unsigned char *src_end = src + src_bytes;
df7492f9
KH
7640 Lisp_Object attrs, eol_type;
7641 Lisp_Object val;
7642 struct coding_system coding;
89528eb3 7643 int id;
ff0dacd7 7644 struct coding_detection_info detect_info;
24a73b0a 7645 enum coding_category base_category;
2f3cbb32 7646 int null_byte_found = 0, eight_bit_found = 0;
b73bfc1c 7647
df7492f9
KH
7648 if (NILP (coding_system))
7649 coding_system = Qundecided;
7650 setup_coding_system (coding_system, &coding);
7651 attrs = CODING_ID_ATTRS (coding.id);
7652 eol_type = CODING_ID_EOL_TYPE (coding.id);
89528eb3 7653 coding_system = CODING_ATTR_BASE_NAME (attrs);
4ed46869 7654
df7492f9 7655 coding.source = src;
24a73b0a 7656 coding.src_chars = src_chars;
df7492f9
KH
7657 coding.src_bytes = src_bytes;
7658 coding.src_multibyte = multibytep;
7659 coding.consumed = 0;
89528eb3 7660 coding.mode |= CODING_MODE_LAST_BLOCK;
c0e16b14 7661 coding.head_ascii = 0;
d46c5b12 7662
ff0dacd7 7663 detect_info.checked = detect_info.found = detect_info.rejected = 0;
d46c5b12 7664
89528eb3 7665 /* At first, detect text-format if necessary. */
24a73b0a
KH
7666 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
7667 if (base_category == coding_category_undecided)
4ed46869 7668 {
ff0dacd7
KH
7669 enum coding_category category;
7670 struct coding_system *this;
7671 int c, i;
88993dfd 7672
24a73b0a 7673 /* Skip all ASCII bytes except for a few ISO2022 controls. */
2f3cbb32 7674 for (; src < src_end; src++)
4ed46869 7675 {
df7492f9 7676 c = *src;
6cb21a4f 7677 if (c & 0x80)
6cb21a4f 7678 {
2f3cbb32 7679 eight_bit_found = 1;
2f3cbb32
KH
7680 if (null_byte_found)
7681 break;
7682 }
c0e16b14 7683 else if (c < 0x20)
2f3cbb32
KH
7684 {
7685 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
7686 && ! inhibit_iso_escape_detection
7687 && ! detect_info.checked)
6cb21a4f 7688 {
2f3cbb32
KH
7689 if (detect_coding_iso_2022 (&coding, &detect_info))
7690 {
7691 /* We have scanned the whole data. */
7692 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
c0e16b14
KH
7693 {
7694 /* We didn't find an 8-bit code. We may
7695 have found a null-byte, but it's very
7696 rare that a binary file confirm to
7697 ISO-2022. */
7698 src = src_end;
7699 coding.head_ascii = src - coding.source;
7700 }
7701 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
2f3cbb32
KH
7702 break;
7703 }
7704 }
7705 else if (! c)
7706 {
7707 null_byte_found = 1;
7708 if (eight_bit_found)
7709 break;
6cb21a4f 7710 }
c006c0c8
KH
7711 if (! eight_bit_found)
7712 coding.head_ascii++;
6cb21a4f 7713 }
c006c0c8 7714 else if (! eight_bit_found)
c0e16b14 7715 coding.head_ascii++;
4ed46869 7716 }
88993dfd 7717
2f3cbb32
KH
7718 if (null_byte_found || eight_bit_found
7719 || coding.head_ascii < coding.src_bytes
6cb21a4f
KH
7720 || detect_info.found)
7721 {
2f3cbb32 7722 if (coding.head_ascii == coding.src_bytes)
6cb21a4f
KH
7723 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
7724 for (i = 0; i < coding_category_raw_text; i++)
ff0dacd7 7725 {
6cb21a4f 7726 category = coding_priorities[i];
c7266f4a 7727 this = coding_categories + category;
6cb21a4f 7728 if (detect_info.found & (1 << category))
ff0dacd7
KH
7729 break;
7730 }
6cb21a4f 7731 else
2f3cbb32
KH
7732 {
7733 if (null_byte_found)
7734 {
7735 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
7736 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
7737 }
7738 for (i = 0; i < coding_category_raw_text; i++)
7739 {
7740 category = coding_priorities[i];
7741 this = coding_categories + category;
6cb21a4f 7742
2f3cbb32
KH
7743 if (this->id < 0)
7744 {
7745 /* No coding system of this category is defined. */
7746 detect_info.rejected |= (1 << category);
7747 }
7748 else if (category >= coding_category_raw_text)
7749 continue;
7750 else if (detect_info.checked & (1 << category))
7751 {
7752 if (highest
7753 && (detect_info.found & (1 << category)))
6cb21a4f 7754 break;
2f3cbb32
KH
7755 }
7756 else if ((*(this->detector)) (&coding, &detect_info)
7757 && highest
7758 && (detect_info.found & (1 << category)))
7759 {
7760 if (category == coding_category_utf_16_auto)
7761 {
7762 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7763 category = coding_category_utf_16_le;
7764 else
7765 category = coding_category_utf_16_be;
7766 }
7767 break;
7768 }
7769 }
7770 }
6cb21a4f 7771 }
ec6d2bb8 7772
2f3cbb32 7773 if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY)
ec6d2bb8 7774 {
ff0dacd7 7775 detect_info.found = CATEGORY_MASK_RAW_TEXT;
89528eb3
KH
7776 id = coding_categories[coding_category_raw_text].id;
7777 val = Fcons (make_number (id), Qnil);
7778 }
ff0dacd7 7779 else if (! detect_info.rejected && ! detect_info.found)
89528eb3 7780 {
ff0dacd7 7781 detect_info.found = CATEGORY_MASK_ANY;
89528eb3
KH
7782 id = coding_categories[coding_category_undecided].id;
7783 val = Fcons (make_number (id), Qnil);
7784 }
7785 else if (highest)
7786 {
ff0dacd7 7787 if (detect_info.found)
ec6d2bb8 7788 {
ff0dacd7
KH
7789 detect_info.found = 1 << category;
7790 val = Fcons (make_number (this->id), Qnil);
7791 }
7792 else
7793 for (i = 0; i < coding_category_raw_text; i++)
7794 if (! (detect_info.rejected & (1 << coding_priorities[i])))
7795 {
7796 detect_info.found = 1 << coding_priorities[i];
7797 id = coding_categories[coding_priorities[i]].id;
7798 val = Fcons (make_number (id), Qnil);
7799 break;
7800 }
7801 }
89528eb3
KH
7802 else
7803 {
ff0dacd7
KH
7804 int mask = detect_info.rejected | detect_info.found;
7805 int found = 0;
89528eb3 7806 val = Qnil;
ec6d2bb8 7807
89528eb3 7808 for (i = coding_category_raw_text - 1; i >= 0; i--)
ff0dacd7
KH
7809 {
7810 category = coding_priorities[i];
7811 if (! (mask & (1 << category)))
ec6d2bb8 7812 {
ff0dacd7
KH
7813 found |= 1 << category;
7814 id = coding_categories[category].id;
c7266f4a
KH
7815 if (id >= 0)
7816 val = Fcons (make_number (id), val);
ff0dacd7
KH
7817 }
7818 }
7819 for (i = coding_category_raw_text - 1; i >= 0; i--)
7820 {
7821 category = coding_priorities[i];
7822 if (detect_info.found & (1 << category))
7823 {
7824 id = coding_categories[category].id;
7825 val = Fcons (make_number (id), val);
ec6d2bb8 7826 }
ec6d2bb8 7827 }
ff0dacd7 7828 detect_info.found |= found;
ec6d2bb8 7829 }
ec6d2bb8 7830 }
a470d443
KH
7831 else if (base_category == coding_category_utf_8_auto)
7832 {
7833 if (detect_coding_utf_8 (&coding, &detect_info))
7834 {
7835 struct coding_system *this;
7836
7837 if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
7838 this = coding_categories + coding_category_utf_8_sig;
7839 else
7840 this = coding_categories + coding_category_utf_8_nosig;
7841 val = Fcons (make_number (this->id), Qnil);
7842 }
7843 }
24a73b0a
KH
7844 else if (base_category == coding_category_utf_16_auto)
7845 {
7846 if (detect_coding_utf_16 (&coding, &detect_info))
7847 {
24a73b0a
KH
7848 struct coding_system *this;
7849
7850 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7851 this = coding_categories + coding_category_utf_16_le;
7852 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
7853 this = coding_categories + coding_category_utf_16_be;
7854 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
7855 this = coding_categories + coding_category_utf_16_be_nosig;
7856 else
7857 this = coding_categories + coding_category_utf_16_le_nosig;
7858 val = Fcons (make_number (this->id), Qnil);
7859 }
7860 }
df7492f9
KH
7861 else
7862 {
ff0dacd7 7863 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
89528eb3 7864 val = Fcons (make_number (coding.id), Qnil);
4ed46869 7865 }
df7492f9 7866
89528eb3 7867 /* Then, detect eol-format if necessary. */
df7492f9 7868 {
89528eb3 7869 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
df7492f9
KH
7870 Lisp_Object tail;
7871
89528eb3
KH
7872 if (VECTORP (eol_type))
7873 {
ff0dacd7 7874 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
2f3cbb32
KH
7875 {
7876 if (null_byte_found)
7877 normal_eol = EOL_SEEN_LF;
7878 else
7879 normal_eol = detect_eol (coding.source, src_bytes,
7880 coding_category_raw_text);
7881 }
ff0dacd7
KH
7882 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
7883 | CATEGORY_MASK_UTF_16_BE_NOSIG))
89528eb3
KH
7884 utf_16_be_eol = detect_eol (coding.source, src_bytes,
7885 coding_category_utf_16_be);
ff0dacd7
KH
7886 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
7887 | CATEGORY_MASK_UTF_16_LE_NOSIG))
89528eb3
KH
7888 utf_16_le_eol = detect_eol (coding.source, src_bytes,
7889 coding_category_utf_16_le);
7890 }
7891 else
7892 {
7893 if (EQ (eol_type, Qunix))
7894 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
7895 else if (EQ (eol_type, Qdos))
7896 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
7897 else
7898 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
7899 }
7900
df7492f9
KH
7901 for (tail = val; CONSP (tail); tail = XCDR (tail))
7902 {
89528eb3 7903 enum coding_category category;
df7492f9 7904 int this_eol;
89528eb3
KH
7905
7906 id = XINT (XCAR (tail));
7907 attrs = CODING_ID_ATTRS (id);
7908 category = XINT (CODING_ATTR_CATEGORY (attrs));
7909 eol_type = CODING_ID_EOL_TYPE (id);
df7492f9
KH
7910 if (VECTORP (eol_type))
7911 {
89528eb3
KH
7912 if (category == coding_category_utf_16_be
7913 || category == coding_category_utf_16_be_nosig)
7914 this_eol = utf_16_be_eol;
7915 else if (category == coding_category_utf_16_le
7916 || category == coding_category_utf_16_le_nosig)
7917 this_eol = utf_16_le_eol;
df7492f9 7918 else
89528eb3
KH
7919 this_eol = normal_eol;
7920
df7492f9
KH
7921 if (this_eol == EOL_SEEN_LF)
7922 XSETCAR (tail, AREF (eol_type, 0));
7923 else if (this_eol == EOL_SEEN_CRLF)
7924 XSETCAR (tail, AREF (eol_type, 1));
7925 else if (this_eol == EOL_SEEN_CR)
7926 XSETCAR (tail, AREF (eol_type, 2));
89528eb3
KH
7927 else
7928 XSETCAR (tail, CODING_ID_NAME (id));
df7492f9 7929 }
89528eb3
KH
7930 else
7931 XSETCAR (tail, CODING_ID_NAME (id));
df7492f9
KH
7932 }
7933 }
ec6d2bb8 7934
03699b14 7935 return (highest ? XCAR (val) : val);
ec6d2bb8
KH
7936}
7937
ec6d2bb8 7938
d46c5b12
KH
7939DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
7940 2, 3, 0,
48b0f3ae
PJ
7941 doc: /* Detect coding system of the text in the region between START and END.
7942Return a list of possible coding systems ordered by priority.
ec6d2bb8 7943
12e0131a 7944If only ASCII characters are found (except for such ISO-2022 control
d4a1d553
JB
7945characters as ESC), it returns a list of single element `undecided'
7946or its subsidiary coding system according to a detected end-of-line
7947format.
ec6d2bb8 7948
48b0f3ae
PJ
7949If optional argument HIGHEST is non-nil, return the coding system of
7950highest priority. */)
7951 (start, end, highest)
d46c5b12
KH
7952 Lisp_Object start, end, highest;
7953{
7954 int from, to;
7955 int from_byte, to_byte;
ec6d2bb8 7956
b7826503
PJ
7957 CHECK_NUMBER_COERCE_MARKER (start);
7958 CHECK_NUMBER_COERCE_MARKER (end);
ec6d2bb8 7959
d46c5b12
KH
7960 validate_region (&start, &end);
7961 from = XINT (start), to = XINT (end);
7962 from_byte = CHAR_TO_BYTE (from);
7963 to_byte = CHAR_TO_BYTE (to);
ec6d2bb8 7964
d46c5b12
KH
7965 if (from < GPT && to >= GPT)
7966 move_gap_both (to, to_byte);
c210f766 7967
d46c5b12 7968 return detect_coding_system (BYTE_POS_ADDR (from_byte),
24a73b0a 7969 to - from, to_byte - from_byte,
0a28aafb
KH
7970 !NILP (highest),
7971 !NILP (current_buffer
df7492f9
KH
7972 ->enable_multibyte_characters),
7973 Qnil);
ec6d2bb8
KH
7974}
7975
d46c5b12
KH
7976DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
7977 1, 2, 0,
48b0f3ae
PJ
7978 doc: /* Detect coding system of the text in STRING.
7979Return a list of possible coding systems ordered by priority.
fb88bf2d 7980
12e0131a 7981If only ASCII characters are found (except for such ISO-2022 control
d4a1d553
JB
7982characters as ESC), it returns a list of single element `undecided'
7983or its subsidiary coding system according to a detected end-of-line
7984format.
d46c5b12 7985
48b0f3ae
PJ
7986If optional argument HIGHEST is non-nil, return the coding system of
7987highest priority. */)
7988 (string, highest)
d46c5b12
KH
7989 Lisp_Object string, highest;
7990{
b7826503 7991 CHECK_STRING (string);
b73bfc1c 7992
24a73b0a
KH
7993 return detect_coding_system (SDATA (string),
7994 SCHARS (string), SBYTES (string),
8f924df7 7995 !NILP (highest), STRING_MULTIBYTE (string),
df7492f9 7996 Qnil);
4ed46869 7997}
4ed46869 7998
b73bfc1c 7999
df7492f9
KH
8000static INLINE int
8001char_encodable_p (c, attrs)
8002 int c;
8003 Lisp_Object attrs;
05e6f5dc 8004{
df7492f9 8005 Lisp_Object tail;
df7492f9 8006 struct charset *charset;
7d64c6ad 8007 Lisp_Object translation_table;
d46c5b12 8008
7d64c6ad 8009 translation_table = CODING_ATTR_TRANS_TBL (attrs);
a6f87d34 8010 if (! NILP (translation_table))
7d64c6ad 8011 c = translate_char (translation_table, c);
df7492f9
KH
8012 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8013 CONSP (tail); tail = XCDR (tail))
e133c8fa 8014 {
df7492f9
KH
8015 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8016 if (CHAR_CHARSET_P (c, charset))
8017 break;
e133c8fa 8018 }
df7492f9 8019 return (! NILP (tail));
05e6f5dc 8020}
83fa074f 8021
fb88bf2d 8022
df7492f9
KH
8023/* Return a list of coding systems that safely encode the text between
8024 START and END. If EXCLUDE is non-nil, it is a list of coding
8025 systems not to check. The returned list doesn't contain any such
48468dac 8026 coding systems. In any case, if the text contains only ASCII or is
df7492f9 8027 unibyte, return t. */
e077cc80 8028
df7492f9
KH
8029DEFUN ("find-coding-systems-region-internal",
8030 Ffind_coding_systems_region_internal,
8031 Sfind_coding_systems_region_internal, 2, 3, 0,
8032 doc: /* Internal use only. */)
8033 (start, end, exclude)
8034 Lisp_Object start, end, exclude;
8035{
8036 Lisp_Object coding_attrs_list, safe_codings;
8037 EMACS_INT start_byte, end_byte;
7c78e542 8038 const unsigned char *p, *pbeg, *pend;
df7492f9
KH
8039 int c;
8040 Lisp_Object tail, elt;
d46c5b12 8041
df7492f9
KH
8042 if (STRINGP (start))
8043 {
8044 if (!STRING_MULTIBYTE (start)
8f924df7 8045 || SCHARS (start) == SBYTES (start))
df7492f9
KH
8046 return Qt;
8047 start_byte = 0;
8f924df7 8048 end_byte = SBYTES (start);
df7492f9
KH
8049 }
8050 else
d46c5b12 8051 {
df7492f9
KH
8052 CHECK_NUMBER_COERCE_MARKER (start);
8053 CHECK_NUMBER_COERCE_MARKER (end);
8054 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8055 args_out_of_range (start, end);
8056 if (NILP (current_buffer->enable_multibyte_characters))
8057 return Qt;
8058 start_byte = CHAR_TO_BYTE (XINT (start));
8059 end_byte = CHAR_TO_BYTE (XINT (end));
8060 if (XINT (end) - XINT (start) == end_byte - start_byte)
8061 return Qt;
d46c5b12 8062
e1c23804 8063 if (XINT (start) < GPT && XINT (end) > GPT)
d46c5b12 8064 {
e1c23804
DL
8065 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8066 move_gap_both (XINT (start), start_byte);
df7492f9 8067 else
e1c23804 8068 move_gap_both (XINT (end), end_byte);
d46c5b12
KH
8069 }
8070 }
8071
df7492f9
KH
8072 coding_attrs_list = Qnil;
8073 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8074 if (NILP (exclude)
8075 || NILP (Fmemq (XCAR (tail), exclude)))
8076 {
8077 Lisp_Object attrs;
d46c5b12 8078
df7492f9
KH
8079 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8080 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8081 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7d64c6ad
KH
8082 {
8083 ASET (attrs, coding_attr_trans_tbl,
2170c8f0 8084 get_translation_table (attrs, 1, NULL));
7d64c6ad
KH
8085 coding_attrs_list = Fcons (attrs, coding_attrs_list);
8086 }
df7492f9 8087 }
d46c5b12 8088
df7492f9 8089 if (STRINGP (start))
8f924df7 8090 p = pbeg = SDATA (start);
df7492f9
KH
8091 else
8092 p = pbeg = BYTE_POS_ADDR (start_byte);
8093 pend = p + (end_byte - start_byte);
b843d1ae 8094
df7492f9
KH
8095 while (p < pend && ASCII_BYTE_P (*p)) p++;
8096 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
d46c5b12 8097
05e6f5dc 8098 while (p < pend)
72d1a715 8099 {
df7492f9
KH
8100 if (ASCII_BYTE_P (*p))
8101 p++;
72d1a715
RS
8102 else
8103 {
df7492f9 8104 c = STRING_CHAR_ADVANCE (p);
12410ef1 8105
df7492f9
KH
8106 charset_map_loaded = 0;
8107 for (tail = coding_attrs_list; CONSP (tail);)
8108 {
8109 elt = XCAR (tail);
8110 if (NILP (elt))
8111 tail = XCDR (tail);
8112 else if (char_encodable_p (c, elt))
8113 tail = XCDR (tail);
8114 else if (CONSP (XCDR (tail)))
8115 {
8116 XSETCAR (tail, XCAR (XCDR (tail)));
8117 XSETCDR (tail, XCDR (XCDR (tail)));
8118 }
8119 else
8120 {
8121 XSETCAR (tail, Qnil);
8122 tail = XCDR (tail);
8123 }
8124 }
8125 if (charset_map_loaded)
8126 {
8127 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
05e6f5dc 8128
df7492f9 8129 if (STRINGP (start))
8f924df7 8130 pbeg = SDATA (start);
df7492f9
KH
8131 else
8132 pbeg = BYTE_POS_ADDR (start_byte);
8133 p = pbeg + p_offset;
8134 pend = pbeg + pend_offset;
8135 }
8136 }
ec6d2bb8 8137 }
fb88bf2d 8138
988b3759 8139 safe_codings = list2 (Qraw_text, Qno_conversion);
df7492f9
KH
8140 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8141 if (! NILP (XCAR (tail)))
8142 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
ec6d2bb8 8143
05e6f5dc
KH
8144 return safe_codings;
8145}
4956c225 8146
d46c5b12 8147
8f924df7
KH
8148DEFUN ("unencodable-char-position", Funencodable_char_position,
8149 Sunencodable_char_position, 3, 5, 0,
8150 doc: /*
8151Return position of first un-encodable character in a region.
d4a1d553 8152START and END specify the region and CODING-SYSTEM specifies the
8f924df7 8153encoding to check. Return nil if CODING-SYSTEM does encode the region.
d46c5b12 8154
8f924df7
KH
8155If optional 4th argument COUNT is non-nil, it specifies at most how
8156many un-encodable characters to search. In this case, the value is a
8157list of positions.
d46c5b12 8158
8f924df7
KH
8159If optional 5th argument STRING is non-nil, it is a string to search
8160for un-encodable characters. In that case, START and END are indexes
8161to the string. */)
8162 (start, end, coding_system, count, string)
8163 Lisp_Object start, end, coding_system, count, string;
8164{
8165 int n;
8166 struct coding_system coding;
7d64c6ad 8167 Lisp_Object attrs, charset_list, translation_table;
8f924df7
KH
8168 Lisp_Object positions;
8169 int from, to;
8170 const unsigned char *p, *stop, *pend;
8171 int ascii_compatible;
fb88bf2d 8172
8f924df7
KH
8173 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8174 attrs = CODING_ID_ATTRS (coding.id);
8175 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8176 return Qnil;
8177 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8178 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2170c8f0 8179 translation_table = get_translation_table (attrs, 1, NULL);
fb88bf2d 8180
8f924df7
KH
8181 if (NILP (string))
8182 {
8183 validate_region (&start, &end);
8184 from = XINT (start);
8185 to = XINT (end);
8186 if (NILP (current_buffer->enable_multibyte_characters)
8187 || (ascii_compatible
8188 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8189 return Qnil;
8190 p = CHAR_POS_ADDR (from);
8191 pend = CHAR_POS_ADDR (to);
8192 if (from < GPT && to >= GPT)
8193 stop = GPT_ADDR;
8194 else
8195 stop = pend;
8196 }
8197 else
8198 {
8199 CHECK_STRING (string);
8200 CHECK_NATNUM (start);
8201 CHECK_NATNUM (end);
8202 from = XINT (start);
8203 to = XINT (end);
8204 if (from > to
8205 || to > SCHARS (string))
8206 args_out_of_range_3 (string, start, end);
8207 if (! STRING_MULTIBYTE (string))
8208 return Qnil;
8209 p = SDATA (string) + string_char_to_byte (string, from);
8210 stop = pend = SDATA (string) + string_char_to_byte (string, to);
8211 if (ascii_compatible && (to - from) == (pend - p))
8212 return Qnil;
8213 }
f2558efd 8214
8f924df7
KH
8215 if (NILP (count))
8216 n = 1;
8217 else
b73bfc1c 8218 {
8f924df7
KH
8219 CHECK_NATNUM (count);
8220 n = XINT (count);
b73bfc1c
KH
8221 }
8222
8f924df7
KH
8223 positions = Qnil;
8224 while (1)
d46c5b12 8225 {
8f924df7 8226 int c;
ec6d2bb8 8227
8f924df7
KH
8228 if (ascii_compatible)
8229 while (p < stop && ASCII_BYTE_P (*p))
8230 p++, from++;
8231 if (p >= stop)
0e79d667 8232 {
8f924df7
KH
8233 if (p >= pend)
8234 break;
8235 stop = pend;
8236 p = GAP_END_ADDR;
0e79d667 8237 }
ec6d2bb8 8238
8f924df7
KH
8239 c = STRING_CHAR_ADVANCE (p);
8240 if (! (ASCII_CHAR_P (c) && ascii_compatible)
7d64c6ad
KH
8241 && ! char_charset (translate_char (translation_table, c),
8242 charset_list, NULL))
ec6d2bb8 8243 {
8f924df7
KH
8244 positions = Fcons (make_number (from), positions);
8245 n--;
8246 if (n == 0)
8247 break;
ec6d2bb8
KH
8248 }
8249
8f924df7
KH
8250 from++;
8251 }
d46c5b12 8252
8f924df7
KH
8253 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8254}
d46c5b12 8255
d46c5b12 8256
df7492f9
KH
8257DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8258 Scheck_coding_systems_region, 3, 3, 0,
8259 doc: /* Check if the region is encodable by coding systems.
d46c5b12 8260
df7492f9
KH
8261START and END are buffer positions specifying the region.
8262CODING-SYSTEM-LIST is a list of coding systems to check.
d46c5b12 8263
df7492f9 8264The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
d4a1d553 8265CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
df7492f9
KH
8266whole region, POS0, POS1, ... are buffer positions where non-encodable
8267characters are found.
93dec019 8268
df7492f9
KH
8269If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8270value is nil.
93dec019 8271
df7492f9
KH
8272START may be a string. In that case, check if the string is
8273encodable, and the value contains indices to the string instead of
8274buffer positions. END is ignored. */)
8275 (start, end, coding_system_list)
8276 Lisp_Object start, end, coding_system_list;
05e6f5dc 8277{
df7492f9
KH
8278 Lisp_Object list;
8279 EMACS_INT start_byte, end_byte;
8280 int pos;
7c78e542 8281 const unsigned char *p, *pbeg, *pend;
df7492f9 8282 int c;
7d64c6ad 8283 Lisp_Object tail, elt, attrs;
70ad9fc4 8284
05e6f5dc
KH
8285 if (STRINGP (start))
8286 {
df7492f9 8287 if (!STRING_MULTIBYTE (start)
8f924df7 8288 && SCHARS (start) != SBYTES (start))
df7492f9
KH
8289 return Qnil;
8290 start_byte = 0;
8f924df7 8291 end_byte = SBYTES (start);
df7492f9 8292 pos = 0;
d46c5b12 8293 }
05e6f5dc 8294 else
b73bfc1c 8295 {
b7826503
PJ
8296 CHECK_NUMBER_COERCE_MARKER (start);
8297 CHECK_NUMBER_COERCE_MARKER (end);
05e6f5dc
KH
8298 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8299 args_out_of_range (start, end);
8300 if (NILP (current_buffer->enable_multibyte_characters))
df7492f9
KH
8301 return Qnil;
8302 start_byte = CHAR_TO_BYTE (XINT (start));
8303 end_byte = CHAR_TO_BYTE (XINT (end));
8304 if (XINT (end) - XINT (start) == end_byte - start_byte)
05e6f5dc 8305 return Qt;
df7492f9 8306
e1c23804 8307 if (XINT (start) < GPT && XINT (end) > GPT)
b73bfc1c 8308 {
e1c23804
DL
8309 if ((GPT - XINT (start)) < (XINT (end) - GPT))
8310 move_gap_both (XINT (start), start_byte);
df7492f9 8311 else
e1c23804 8312 move_gap_both (XINT (end), end_byte);
b73bfc1c 8313 }
e1c23804 8314 pos = XINT (start);
b73bfc1c 8315 }
7553d0e1 8316
df7492f9
KH
8317 list = Qnil;
8318 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
12410ef1 8319 {
df7492f9 8320 elt = XCAR (tail);
7d64c6ad 8321 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
2170c8f0
KH
8322 ASET (attrs, coding_attr_trans_tbl,
8323 get_translation_table (attrs, 1, NULL));
7d64c6ad 8324 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
12410ef1
KH
8325 }
8326
df7492f9 8327 if (STRINGP (start))
8f924df7 8328 p = pbeg = SDATA (start);
72d1a715 8329 else
df7492f9
KH
8330 p = pbeg = BYTE_POS_ADDR (start_byte);
8331 pend = p + (end_byte - start_byte);
4ed46869 8332
df7492f9
KH
8333 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8334 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
ec6d2bb8 8335
df7492f9 8336 while (p < pend)
d46c5b12 8337 {
df7492f9
KH
8338 if (ASCII_BYTE_P (*p))
8339 p++;
e133c8fa 8340 else
05e6f5dc 8341 {
df7492f9
KH
8342 c = STRING_CHAR_ADVANCE (p);
8343
8344 charset_map_loaded = 0;
8345 for (tail = list; CONSP (tail); tail = XCDR (tail))
8346 {
8347 elt = XCDR (XCAR (tail));
8348 if (! char_encodable_p (c, XCAR (elt)))
8349 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8350 }
8351 if (charset_map_loaded)
8352 {
8353 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8354
8355 if (STRINGP (start))
8f924df7 8356 pbeg = SDATA (start);
df7492f9
KH
8357 else
8358 pbeg = BYTE_POS_ADDR (start_byte);
8359 p = pbeg + p_offset;
8360 pend = pbeg + pend_offset;
8361 }
05e6f5dc 8362 }
df7492f9 8363 pos++;
d46c5b12 8364 }
4ed46869 8365
df7492f9
KH
8366 tail = list;
8367 list = Qnil;
8368 for (; CONSP (tail); tail = XCDR (tail))
ec6d2bb8 8369 {
df7492f9
KH
8370 elt = XCAR (tail);
8371 if (CONSP (XCDR (XCDR (elt))))
8372 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8373 list);
ec6d2bb8 8374 }
2b4f9037 8375
df7492f9 8376 return list;
d46c5b12
KH
8377}
8378
3fd9494b 8379
b73bfc1c 8380Lisp_Object
df7492f9
KH
8381code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
8382 Lisp_Object start, end, coding_system, dst_object;
8383 int encodep, norecord;
4ed46869 8384{
3a73fa5d 8385 struct coding_system coding;
df7492f9
KH
8386 EMACS_INT from, from_byte, to, to_byte;
8387 Lisp_Object src_object;
4ed46869 8388
b7826503
PJ
8389 CHECK_NUMBER_COERCE_MARKER (start);
8390 CHECK_NUMBER_COERCE_MARKER (end);
df7492f9
KH
8391 if (NILP (coding_system))
8392 coding_system = Qno_conversion;
8393 else
8394 CHECK_CODING_SYSTEM (coding_system);
8395 src_object = Fcurrent_buffer ();
8396 if (NILP (dst_object))
8397 dst_object = src_object;
8398 else if (! EQ (dst_object, Qt))
8399 CHECK_BUFFER (dst_object);
3a73fa5d 8400
d46c5b12
KH
8401 validate_region (&start, &end);
8402 from = XFASTINT (start);
df7492f9 8403 from_byte = CHAR_TO_BYTE (from);
d46c5b12 8404 to = XFASTINT (end);
df7492f9 8405 to_byte = CHAR_TO_BYTE (to);
764ca8da 8406
df7492f9
KH
8407 setup_coding_system (coding_system, &coding);
8408 coding.mode |= CODING_MODE_LAST_BLOCK;
ec6d2bb8 8409
df7492f9
KH
8410 if (encodep)
8411 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8412 dst_object);
8413 else
8414 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8415 dst_object);
8416 if (! norecord)
8417 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
ec6d2bb8 8418
df7492f9
KH
8419 return (BUFFERP (dst_object)
8420 ? make_number (coding.produced_char)
8421 : coding.dst_object);
4031e2bf 8422}
78108bcd 8423
4ed46869 8424
4031e2bf 8425DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
df7492f9 8426 3, 4, "r\nzCoding system: ",
48b0f3ae 8427 doc: /* Decode the current region from the specified coding system.
df7492f9
KH
8428When called from a program, takes four arguments:
8429 START, END, CODING-SYSTEM, and DESTINATION.
8430START and END are buffer positions.
8844fa83 8431
df7492f9 8432Optional 4th arguments DESTINATION specifies where the decoded text goes.
2354b80b 8433If nil, the region between START and END is replaced by the decoded text.
df7492f9 8434If buffer, the decoded text is inserted in the buffer.
446dcd75 8435In those cases, the length of the decoded text is returned.
319a3947 8436If DESTINATION is t, the decoded text is returned.
8844fa83 8437
48b0f3ae
PJ
8438This function sets `last-coding-system-used' to the precise coding system
8439used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
319a3947 8440not fully specified.) */)
df7492f9
KH
8441 (start, end, coding_system, destination)
8442 Lisp_Object start, end, coding_system, destination;
4031e2bf 8443{
df7492f9 8444 return code_convert_region (start, end, coding_system, destination, 0, 0);
3a73fa5d 8445}
8844fa83 8446
3a73fa5d 8447DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
df7492f9
KH
8448 3, 4, "r\nzCoding system: ",
8449 doc: /* Encode the current region by specified coding system.
d4a1d553
JB
8450When called from a program, takes four arguments:
8451 START, END, CODING-SYSTEM and DESTINATION.
8452START and END are buffer positions.
d46c5b12 8453
df7492f9
KH
8454Optional 4th arguments DESTINATION specifies where the encoded text goes.
8455If nil, the region between START and END is replace by the encoded text.
8456If buffer, the encoded text is inserted in the buffer.
446dcd75 8457In those cases, the length of the encoded text is returned.
319a3947 8458If DESTINATION is t, the encoded text is returned.
2391eaa4 8459
48b0f3ae
PJ
8460This function sets `last-coding-system-used' to the precise coding system
8461used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
319a3947 8462not fully specified.) */)
df7492f9
KH
8463 (start, end, coding_system, destination)
8464 Lisp_Object start, end, coding_system, destination;
3a73fa5d 8465{
df7492f9 8466 return code_convert_region (start, end, coding_system, destination, 1, 0);
b73bfc1c
KH
8467}
8468
8469Lisp_Object
df7492f9
KH
8470code_convert_string (string, coding_system, dst_object,
8471 encodep, nocopy, norecord)
8472 Lisp_Object string, coding_system, dst_object;
8473 int encodep, nocopy, norecord;
b73bfc1c 8474{
4031e2bf 8475 struct coding_system coding;
df7492f9 8476 EMACS_INT chars, bytes;
ec6d2bb8 8477
b7826503 8478 CHECK_STRING (string);
d46c5b12 8479 if (NILP (coding_system))
4956c225 8480 {
df7492f9
KH
8481 if (! norecord)
8482 Vlast_coding_system_used = Qno_conversion;
8483 if (NILP (dst_object))
8484 return (nocopy ? Fcopy_sequence (string) : string);
4956c225 8485 }
b73bfc1c 8486
df7492f9
KH
8487 if (NILP (coding_system))
8488 coding_system = Qno_conversion;
8489 else
8490 CHECK_CODING_SYSTEM (coding_system);
8491 if (NILP (dst_object))
8492 dst_object = Qt;
8493 else if (! EQ (dst_object, Qt))
8494 CHECK_BUFFER (dst_object);
73be902c 8495
df7492f9 8496 setup_coding_system (coding_system, &coding);
d46c5b12 8497 coding.mode |= CODING_MODE_LAST_BLOCK;
8f924df7
KH
8498 chars = SCHARS (string);
8499 bytes = SBYTES (string);
df7492f9
KH
8500 if (encodep)
8501 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8502 else
8503 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8504 if (! norecord)
8505 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
73be902c 8506
df7492f9
KH
8507 return (BUFFERP (dst_object)
8508 ? make_number (coding.produced_char)
8509 : coding.dst_object);
4ed46869 8510}
73be902c 8511
b73bfc1c 8512
ecec61c1 8513/* Encode or decode STRING according to CODING_SYSTEM.
ec6d2bb8 8514 Do not set Vlast_coding_system_used.
4ed46869 8515
ec6d2bb8
KH
8516 This function is called only from macros DECODE_FILE and
8517 ENCODE_FILE, thus we ignore character composition. */
4ed46869 8518
ecec61c1
KH
8519Lisp_Object
8520code_convert_string_norecord (string, coding_system, encodep)
8521 Lisp_Object string, coding_system;
8522 int encodep;
4ed46869 8523{
0be8721c 8524 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
4ed46869
KH
8525}
8526
4ed46869 8527
df7492f9
KH
8528DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
8529 2, 4, 0,
8530 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
8531
8532Optional third arg NOCOPY non-nil means it is OK to return STRING itself
8533if the decoding operation is trivial.
ecec61c1 8534
d4a1d553 8535Optional fourth arg BUFFER non-nil means that the decoded text is
a3f6ee6d 8536inserted in BUFFER instead of returned as a string. In this case,
319a3947 8537the return value is the length of the decoded text.
ecec61c1 8538
df7492f9
KH
8539This function sets `last-coding-system-used' to the precise coding system
8540used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
d4a1d553 8541not fully specified.) */)
df7492f9
KH
8542 (string, coding_system, nocopy, buffer)
8543 Lisp_Object string, coding_system, nocopy, buffer;
4ed46869 8544{
df7492f9
KH
8545 return code_convert_string (string, coding_system, buffer,
8546 0, ! NILP (nocopy), 0);
4ed46869
KH
8547}
8548
df7492f9
KH
8549DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
8550 2, 4, 0,
8551 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
8552
8553Optional third arg NOCOPY non-nil means it is OK to return STRING
8554itself if the encoding operation is trivial.
8555
d4a1d553 8556Optional fourth arg BUFFER non-nil means that the encoded text is
a3f6ee6d 8557inserted in BUFFER instead of returned as a string. In this case,
446dcd75 8558the return value is the length of the encoded text.
df7492f9
KH
8559
8560This function sets `last-coding-system-used' to the precise coding system
8561used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8562not fully specified.) */)
8563 (string, coding_system, nocopy, buffer)
8564 Lisp_Object string, coding_system, nocopy, buffer;
4ed46869 8565{
df7492f9 8566 return code_convert_string (string, coding_system, buffer,
c197f191 8567 1, ! NILP (nocopy), 1);
4ed46869 8568}
df7492f9 8569
3a73fa5d 8570\f
4ed46869 8571DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
48b0f3ae
PJ
8572 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
8573Return the corresponding character. */)
8574 (code)
4ed46869 8575 Lisp_Object code;
4ed46869 8576{
df7492f9
KH
8577 Lisp_Object spec, attrs, val;
8578 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
8579 int c;
4ed46869 8580
df7492f9
KH
8581 CHECK_NATNUM (code);
8582 c = XFASTINT (code);
8583 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8584 attrs = AREF (spec, 0);
4ed46869 8585
df7492f9
KH
8586 if (ASCII_BYTE_P (c)
8587 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8588 return code;
4ed46869 8589
df7492f9
KH
8590 val = CODING_ATTR_CHARSET_LIST (attrs);
8591 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
004068e4
KH
8592 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8593 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
fa42c37f 8594
df7492f9
KH
8595 if (c <= 0x7F)
8596 charset = charset_roman;
8597 else if (c >= 0xA0 && c < 0xDF)
55ab7be3 8598 {
df7492f9
KH
8599 charset = charset_kana;
8600 c -= 0x80;
4ed46869 8601 }
55ab7be3 8602 else
4ed46869 8603 {
004068e4 8604 int s1 = c >> 8, s2 = c & 0xFF;
df7492f9
KH
8605
8606 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
8607 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
8608 error ("Invalid code: %d", code);
8609 SJIS_TO_JIS (c);
8610 charset = charset_kanji;
4ed46869 8611 }
df7492f9
KH
8612 c = DECODE_CHAR (charset, c);
8613 if (c < 0)
8614 error ("Invalid code: %d", code);
8615 return make_number (c);
93dec019 8616}
4ed46869 8617
48b0f3ae 8618
4ed46869 8619DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
8acf0c0e 8620 doc: /* Encode a Japanese character CH to shift_jis encoding.
48b0f3ae
PJ
8621Return the corresponding code in SJIS. */)
8622 (ch)
df7492f9 8623 Lisp_Object ch;
4ed46869 8624{
df7492f9
KH
8625 Lisp_Object spec, attrs, charset_list;
8626 int c;
8627 struct charset *charset;
8628 unsigned code;
48b0f3ae 8629
df7492f9
KH
8630 CHECK_CHARACTER (ch);
8631 c = XFASTINT (ch);
8632 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8633 attrs = AREF (spec, 0);
8634
8635 if (ASCII_CHAR_P (c)
8636 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8637 return ch;
8638
8639 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8640 charset = char_charset (c, charset_list, &code);
8641 if (code == CHARSET_INVALID_CODE (charset))
8642 error ("Can't encode by shift_jis encoding: %d", c);
8643 JIS_TO_SJIS (code);
8644
8645 return make_number (code);
4ed46869
KH
8646}
8647
8648DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
48b0f3ae
PJ
8649 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
8650Return the corresponding character. */)
8651 (code)
4ed46869 8652 Lisp_Object code;
d46c5b12 8653{
df7492f9
KH
8654 Lisp_Object spec, attrs, val;
8655 struct charset *charset_roman, *charset_big5, *charset;
8656 int c;
6289dd10 8657
df7492f9
KH
8658 CHECK_NATNUM (code);
8659 c = XFASTINT (code);
8660 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8661 attrs = AREF (spec, 0);
4ed46869 8662
df7492f9
KH
8663 if (ASCII_BYTE_P (c)
8664 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8665 return code;
6289dd10 8666
df7492f9
KH
8667 val = CODING_ATTR_CHARSET_LIST (attrs);
8668 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8669 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
c210f766 8670
df7492f9
KH
8671 if (c <= 0x7F)
8672 charset = charset_roman;
c28a9453
KH
8673 else
8674 {
df7492f9
KH
8675 int b1 = c >> 8, b2 = c & 0x7F;
8676 if (b1 < 0xA1 || b1 > 0xFE
8677 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
8678 error ("Invalid code: %d", code);
8679 charset = charset_big5;
c28a9453 8680 }
df7492f9
KH
8681 c = DECODE_CHAR (charset, (unsigned )c);
8682 if (c < 0)
8683 error ("Invalid code: %d", code);
8684 return make_number (c);
d46c5b12 8685}
6289dd10 8686
4ed46869 8687DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
8acf0c0e 8688 doc: /* Encode the Big5 character CH to BIG5 coding system.
48b0f3ae
PJ
8689Return the corresponding character code in Big5. */)
8690 (ch)
4ed46869
KH
8691 Lisp_Object ch;
8692{
df7492f9
KH
8693 Lisp_Object spec, attrs, charset_list;
8694 struct charset *charset;
8695 int c;
8696 unsigned code;
8697
8698 CHECK_CHARACTER (ch);
8699 c = XFASTINT (ch);
8700 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8701 attrs = AREF (spec, 0);
8702 if (ASCII_CHAR_P (c)
8703 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8704 return ch;
8705
8706 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8707 charset = char_charset (c, charset_list, &code);
8708 if (code == CHARSET_INVALID_CODE (charset))
8709 error ("Can't encode by Big5 encoding: %d", c);
8710
8711 return make_number (code);
4ed46869 8712}
48b0f3ae 8713
3a73fa5d 8714\f
002fdb44 8715DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
68bba4e4 8716 Sset_terminal_coding_system_internal, 1, 2, 0,
48b0f3ae 8717 doc: /* Internal use only. */)
6ed8eeff 8718 (coding_system, terminal)
b74e4686 8719 Lisp_Object coding_system;
6ed8eeff 8720 Lisp_Object terminal;
4ed46869 8721{
6ed8eeff 8722 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
b7826503 8723 CHECK_SYMBOL (coding_system);
b8299c66 8724 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
70c22245 8725 /* We had better not send unsafe characters to terminal. */
c73bd236 8726 terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
df7492f9 8727 /* Characer composition should be disabled. */
c73bd236 8728 terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
b8299c66
KL
8729 terminal_coding->src_multibyte = 1;
8730 terminal_coding->dst_multibyte = 0;
4ed46869
KH
8731 return Qnil;
8732}
8733
c4825358
KH
8734DEFUN ("set-safe-terminal-coding-system-internal",
8735 Fset_safe_terminal_coding_system_internal,
48b0f3ae 8736 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
ddb67bdc 8737 doc: /* Internal use only. */)
48b0f3ae 8738 (coding_system)
b74e4686 8739 Lisp_Object coding_system;
d46c5b12 8740{
b7826503 8741 CHECK_SYMBOL (coding_system);
c4825358
KH
8742 setup_coding_system (Fcheck_coding_system (coding_system),
8743 &safe_terminal_coding);
df7492f9
KH
8744 /* Characer composition should be disabled. */
8745 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
b73bfc1c
KH
8746 safe_terminal_coding.src_multibyte = 1;
8747 safe_terminal_coding.dst_multibyte = 0;
c4825358
KH
8748 return Qnil;
8749}
4ed46869 8750
002fdb44 8751DEFUN ("terminal-coding-system", Fterminal_coding_system,
68bba4e4 8752 Sterminal_coding_system, 0, 1, 0,
6ed8eeff
KL
8753 doc: /* Return coding system specified for terminal output on the given terminal.
8754TERMINAL may be a terminal id, a frame, or nil for the selected
8755frame's terminal device. */)
8756 (terminal)
8757 Lisp_Object terminal;
4ed46869 8758{
985773c9
MB
8759 struct coding_system *terminal_coding
8760 = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
8761 Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
ae6f73fa 8762
ae6f73fa 8763 /* For backward compatibility, return nil if it is `undecided'. */
f75c90a9 8764 return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
4ed46869
KH
8765}
8766
002fdb44 8767DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
68bba4e4 8768 Sset_keyboard_coding_system_internal, 1, 2, 0,
48b0f3ae 8769 doc: /* Internal use only. */)
6ed8eeff 8770 (coding_system, terminal)
4ed46869 8771 Lisp_Object coding_system;
6ed8eeff 8772 Lisp_Object terminal;
4ed46869 8773{
6ed8eeff 8774 struct terminal *t = get_terminal (terminal, 1);
b7826503 8775 CHECK_SYMBOL (coding_system);
df7492f9 8776 setup_coding_system (Fcheck_coding_system (coding_system),
c73bd236 8777 TERMINAL_KEYBOARD_CODING (t));
df7492f9 8778 /* Characer composition should be disabled. */
c73bd236
MB
8779 TERMINAL_KEYBOARD_CODING (t)->common_flags
8780 &= ~CODING_ANNOTATE_COMPOSITION_MASK;
4ed46869
KH
8781 return Qnil;
8782}
8783
8784DEFUN ("keyboard-coding-system",
985773c9 8785 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
48b0f3ae 8786 doc: /* Return coding system specified for decoding keyboard input. */)
985773c9
MB
8787 (terminal)
8788 Lisp_Object terminal;
4ed46869 8789{
985773c9
MB
8790 return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
8791 (get_terminal (terminal, 1))->id);
4ed46869
KH
8792}
8793
4ed46869 8794\f
a5d301df
KH
8795DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
8796 Sfind_operation_coding_system, 1, MANY, 0,
48b0f3ae
PJ
8797 doc: /* Choose a coding system for an operation based on the target name.
8798The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
8799DECODING-SYSTEM is the coding system to use for decoding
8800\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
8801for encoding (in case OPERATION does encoding).
05e6f5dc 8802
48b0f3ae
PJ
8803The first argument OPERATION specifies an I/O primitive:
8804 For file I/O, `insert-file-contents' or `write-region'.
8805 For process I/O, `call-process', `call-process-region', or `start-process'.
8806 For network I/O, `open-network-stream'.
05e6f5dc 8807
48b0f3ae
PJ
8808The remaining arguments should be the same arguments that were passed
8809to the primitive. Depending on which primitive, one of those arguments
8810is selected as the TARGET. For example, if OPERATION does file I/O,
8811whichever argument specifies the file name is TARGET.
05e6f5dc 8812
48b0f3ae 8813TARGET has a meaning which depends on OPERATION:
b883cdb2 8814 For file I/O, TARGET is a file name (except for the special case below).
48b0f3ae 8815 For process I/O, TARGET is a process name.
d4a1d553 8816 For network I/O, TARGET is a service name or a port number.
05e6f5dc 8817
d4a1d553 8818This function looks up what is specified for TARGET in
48b0f3ae
PJ
8819`file-coding-system-alist', `process-coding-system-alist',
8820or `network-coding-system-alist' depending on OPERATION.
8821They may specify a coding system, a cons of coding systems,
8822or a function symbol to call.
8823In the last case, we call the function with one argument,
8824which is a list of all the arguments given to this function.
1011c487
MB
8825If the function can't decide a coding system, it can return
8826`undecided' so that the normal code-detection is performed.
48b0f3ae 8827
b883cdb2
MB
8828If OPERATION is `insert-file-contents', the argument corresponding to
8829TARGET may be a cons (FILENAME . BUFFER). In that case, FILENAME is a
8830file name to look up, and BUFFER is a buffer that contains the file's
8831contents (not yet decoded). If `file-coding-system-alist' specifies a
8832function to call for FILENAME, that function should examine the
8833contents of BUFFER instead of reading the file.
8834
d918f936 8835usage: (find-operation-coding-system OPERATION ARGUMENTS...) */)
48b0f3ae 8836 (nargs, args)
4ed46869
KH
8837 int nargs;
8838 Lisp_Object *args;
6b89e3aa 8839{
4ed46869
KH
8840 Lisp_Object operation, target_idx, target, val;
8841 register Lisp_Object chain;
177c0ea7 8842
4ed46869
KH
8843 if (nargs < 2)
8844 error ("Too few arguments");
8845 operation = args[0];
8846 if (!SYMBOLP (operation)
8847 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3ed051d4 8848 error ("Invalid first argument");
4ed46869
KH
8849 if (nargs < 1 + XINT (target_idx))
8850 error ("Too few arguments for operation: %s",
8f924df7 8851 SDATA (SYMBOL_NAME (operation)));
4ed46869
KH
8852 target = args[XINT (target_idx) + 1];
8853 if (!(STRINGP (target)
091a0ff0
KH
8854 || (EQ (operation, Qinsert_file_contents) && CONSP (target)
8855 && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
4ed46869 8856 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
df7492f9 8857 error ("Invalid %dth argument", XINT (target_idx) + 1);
091a0ff0
KH
8858 if (CONSP (target))
8859 target = XCAR (target);
4ed46869 8860
2e34157c
RS
8861 chain = ((EQ (operation, Qinsert_file_contents)
8862 || EQ (operation, Qwrite_region))
02ba4723 8863 ? Vfile_coding_system_alist
2e34157c 8864 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
8865 ? Vnetwork_coding_system_alist
8866 : Vprocess_coding_system_alist));
4ed46869
KH
8867 if (NILP (chain))
8868 return Qnil;
8869
03699b14 8870 for (; CONSP (chain); chain = XCDR (chain))
6b89e3aa 8871 {
f44d27ce 8872 Lisp_Object elt;
6b89e3aa 8873
df7492f9 8874 elt = XCAR (chain);
4ed46869
KH
8875 if (CONSP (elt)
8876 && ((STRINGP (target)
03699b14
KR
8877 && STRINGP (XCAR (elt))
8878 && fast_string_match (XCAR (elt), target) >= 0)
8879 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
6b89e3aa 8880 {
03699b14 8881 val = XCDR (elt);
b19fd4c5
KH
8882 /* Here, if VAL is both a valid coding system and a valid
8883 function symbol, we return VAL as a coding system. */
02ba4723
KH
8884 if (CONSP (val))
8885 return val;
8886 if (! SYMBOLP (val))
8887 return Qnil;
8888 if (! NILP (Fcoding_system_p (val)))
8889 return Fcons (val, val);
b19fd4c5 8890 if (! NILP (Ffboundp (val)))
6b89e3aa 8891 {
e2b97060
MB
8892 /* We use call1 rather than safe_call1
8893 so as to get bug reports about functions called here
8894 which don't handle the current interface. */
8895 val = call1 (val, Flist (nargs, args));
b19fd4c5
KH
8896 if (CONSP (val))
8897 return val;
8898 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
8899 return Fcons (val, val);
6b89e3aa 8900 }
02ba4723 8901 return Qnil;
6b89e3aa
KH
8902 }
8903 }
4ed46869 8904 return Qnil;
6b89e3aa
KH
8905}
8906
df7492f9 8907DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
a3f6ee6d 8908 Sset_coding_system_priority, 0, MANY, 0,
da7db224 8909 doc: /* Assign higher priority to the coding systems given as arguments.
d4a1d553 8910If multiple coding systems belong to the same category,
a3181084
DL
8911all but the first one are ignored.
8912
d4a1d553 8913usage: (set-coding-system-priority &rest coding-systems) */)
df7492f9
KH
8914 (nargs, args)
8915 int nargs;
8916 Lisp_Object *args;
8917{
8918 int i, j;
8919 int changed[coding_category_max];
8920 enum coding_category priorities[coding_category_max];
8921
8922 bzero (changed, sizeof changed);
6b89e3aa 8923
df7492f9 8924 for (i = j = 0; i < nargs; i++)
6b89e3aa 8925 {
df7492f9
KH
8926 enum coding_category category;
8927 Lisp_Object spec, attrs;
6b89e3aa 8928
df7492f9
KH
8929 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
8930 attrs = AREF (spec, 0);
8931 category = XINT (CODING_ATTR_CATEGORY (attrs));
8932 if (changed[category])
8933 /* Ignore this coding system because a coding system of the
8934 same category already had a higher priority. */
8935 continue;
8936 changed[category] = 1;
8937 priorities[j++] = category;
8938 if (coding_categories[category].id >= 0
8939 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
8940 setup_coding_system (args[i], &coding_categories[category]);
ff563fce 8941 Fset (AREF (Vcoding_category_table, category), args[i]);
df7492f9 8942 }
6b89e3aa 8943
df7492f9
KH
8944 /* Now we have decided top J priorities. Reflect the order of the
8945 original priorities to the remaining priorities. */
6b89e3aa 8946
df7492f9 8947 for (i = j, j = 0; i < coding_category_max; i++, j++)
6b89e3aa 8948 {
df7492f9
KH
8949 while (j < coding_category_max
8950 && changed[coding_priorities[j]])
8951 j++;
8952 if (j == coding_category_max)
8953 abort ();
8954 priorities[i] = coding_priorities[j];
8955 }
6b89e3aa 8956
df7492f9 8957 bcopy (priorities, coding_priorities, sizeof priorities);
177c0ea7 8958
ff563fce
KH
8959 /* Update `coding-category-list'. */
8960 Vcoding_category_list = Qnil;
8961 for (i = coding_category_max - 1; i >= 0; i--)
8962 Vcoding_category_list
8963 = Fcons (AREF (Vcoding_category_table, priorities[i]),
8964 Vcoding_category_list);
6b89e3aa 8965
df7492f9 8966 return Qnil;
6b89e3aa
KH
8967}
8968
df7492f9
KH
8969DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
8970 Scoding_system_priority_list, 0, 1, 0,
da7db224
DL
8971 doc: /* Return a list of coding systems ordered by their priorities.
8972HIGHESTP non-nil means just return the highest priority one. */)
df7492f9
KH
8973 (highestp)
8974 Lisp_Object highestp;
d46c5b12
KH
8975{
8976 int i;
df7492f9 8977 Lisp_Object val;
6b89e3aa 8978
df7492f9 8979 for (i = 0, val = Qnil; i < coding_category_max; i++)
d46c5b12 8980 {
df7492f9
KH
8981 enum coding_category category = coding_priorities[i];
8982 int id = coding_categories[category].id;
8983 Lisp_Object attrs;
068a9dbd 8984
df7492f9
KH
8985 if (id < 0)
8986 continue;
8987 attrs = CODING_ID_ATTRS (id);
8988 if (! NILP (highestp))
8989 return CODING_ATTR_BASE_NAME (attrs);
8990 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
8991 }
8992 return Fnreverse (val);
8993}
068a9dbd 8994
f0064e1f 8995static char *suffixes[] = { "-unix", "-dos", "-mac" };
068a9dbd
KH
8996
8997static Lisp_Object
df7492f9
KH
8998make_subsidiaries (base)
8999 Lisp_Object base;
068a9dbd 9000{
df7492f9 9001 Lisp_Object subsidiaries;
8f924df7 9002 int base_name_len = SBYTES (SYMBOL_NAME (base));
df7492f9
KH
9003 char *buf = (char *) alloca (base_name_len + 6);
9004 int i;
068a9dbd 9005
8f924df7 9006 bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
df7492f9
KH
9007 subsidiaries = Fmake_vector (make_number (3), Qnil);
9008 for (i = 0; i < 3; i++)
068a9dbd 9009 {
df7492f9
KH
9010 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
9011 ASET (subsidiaries, i, intern (buf));
068a9dbd 9012 }
df7492f9 9013 return subsidiaries;
068a9dbd
KH
9014}
9015
9016
df7492f9
KH
9017DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9018 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
1fcd6c8b
DL
9019 doc: /* For internal use only.
9020usage: (define-coding-system-internal ...) */)
df7492f9
KH
9021 (nargs, args)
9022 int nargs;
9023 Lisp_Object *args;
068a9dbd 9024{
df7492f9
KH
9025 Lisp_Object name;
9026 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
9027 Lisp_Object attrs; /* Vector of attributes. */
9028 Lisp_Object eol_type;
9029 Lisp_Object aliases;
9030 Lisp_Object coding_type, charset_list, safe_charsets;
9031 enum coding_category category;
9032 Lisp_Object tail, val;
9033 int max_charset_id = 0;
9034 int i;
068a9dbd 9035
df7492f9
KH
9036 if (nargs < coding_arg_max)
9037 goto short_args;
068a9dbd 9038
df7492f9 9039 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
068a9dbd 9040
df7492f9
KH
9041 name = args[coding_arg_name];
9042 CHECK_SYMBOL (name);
9043 CODING_ATTR_BASE_NAME (attrs) = name;
068a9dbd 9044
df7492f9
KH
9045 val = args[coding_arg_mnemonic];
9046 if (! STRINGP (val))
9047 CHECK_CHARACTER (val);
9048 CODING_ATTR_MNEMONIC (attrs) = val;
068a9dbd 9049
df7492f9
KH
9050 coding_type = args[coding_arg_coding_type];
9051 CHECK_SYMBOL (coding_type);
9052 CODING_ATTR_TYPE (attrs) = coding_type;
068a9dbd 9053
df7492f9
KH
9054 charset_list = args[coding_arg_charset_list];
9055 if (SYMBOLP (charset_list))
9056 {
9057 if (EQ (charset_list, Qiso_2022))
9058 {
9059 if (! EQ (coding_type, Qiso_2022))
9060 error ("Invalid charset-list");
9061 charset_list = Viso_2022_charset_list;
9062 }
9063 else if (EQ (charset_list, Qemacs_mule))
9064 {
9065 if (! EQ (coding_type, Qemacs_mule))
9066 error ("Invalid charset-list");
9067 charset_list = Vemacs_mule_charset_list;
9068 }
9069 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9070 if (max_charset_id < XFASTINT (XCAR (tail)))
9071 max_charset_id = XFASTINT (XCAR (tail));
9072 }
068a9dbd
KH
9073 else
9074 {
df7492f9 9075 charset_list = Fcopy_sequence (charset_list);
985773c9 9076 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
068a9dbd 9077 {
df7492f9
KH
9078 struct charset *charset;
9079
985773c9 9080 val = XCAR (tail);
df7492f9
KH
9081 CHECK_CHARSET_GET_CHARSET (val, charset);
9082 if (EQ (coding_type, Qiso_2022)
9083 ? CHARSET_ISO_FINAL (charset) < 0
9084 : EQ (coding_type, Qemacs_mule)
9085 ? CHARSET_EMACS_MULE_ID (charset) < 0
9086 : 0)
9087 error ("Can't handle charset `%s'",
8f924df7 9088 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
df7492f9 9089
8f924df7 9090 XSETCAR (tail, make_number (charset->id));
df7492f9
KH
9091 if (max_charset_id < charset->id)
9092 max_charset_id = charset->id;
068a9dbd
KH
9093 }
9094 }
df7492f9 9095 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
068a9dbd 9096
df7492f9
KH
9097 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
9098 make_number (255));
9099 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8f924df7 9100 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
df7492f9 9101 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
068a9dbd 9102
584948ac 9103 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
3a73fa5d 9104
df7492f9 9105 val = args[coding_arg_decode_translation_table];
a6f87d34 9106 if (! CHAR_TABLE_P (val) && ! CONSP (val))
7d64c6ad 9107 CHECK_SYMBOL (val);
df7492f9 9108 CODING_ATTR_DECODE_TBL (attrs) = val;
3a73fa5d 9109
df7492f9 9110 val = args[coding_arg_encode_translation_table];
a6f87d34 9111 if (! CHAR_TABLE_P (val) && ! CONSP (val))
7d64c6ad 9112 CHECK_SYMBOL (val);
df7492f9 9113 CODING_ATTR_ENCODE_TBL (attrs) = val;
d46c5b12 9114
df7492f9
KH
9115 val = args[coding_arg_post_read_conversion];
9116 CHECK_SYMBOL (val);
9117 CODING_ATTR_POST_READ (attrs) = val;
d46c5b12 9118
df7492f9
KH
9119 val = args[coding_arg_pre_write_conversion];
9120 CHECK_SYMBOL (val);
9121 CODING_ATTR_PRE_WRITE (attrs) = val;
3a73fa5d 9122
df7492f9
KH
9123 val = args[coding_arg_default_char];
9124 if (NILP (val))
9125 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
9126 else
9127 {
8f924df7 9128 CHECK_CHARACTER (val);
df7492f9
KH
9129 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9130 }
4031e2bf 9131
8f924df7
KH
9132 val = args[coding_arg_for_unibyte];
9133 CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
3a73fa5d 9134
df7492f9
KH
9135 val = args[coding_arg_plist];
9136 CHECK_LIST (val);
9137 CODING_ATTR_PLIST (attrs) = val;
3a73fa5d 9138
df7492f9
KH
9139 if (EQ (coding_type, Qcharset))
9140 {
c7c66a95
KH
9141 /* Generate a lisp vector of 256 elements. Each element is nil,
9142 integer, or a list of charset IDs.
3a73fa5d 9143
c7c66a95
KH
9144 If Nth element is nil, the byte code N is invalid in this
9145 coding system.
4ed46869 9146
c7c66a95
KH
9147 If Nth element is a number NUM, N is the first byte of a
9148 charset whose ID is NUM.
4ed46869 9149
c7c66a95
KH
9150 If Nth element is a list of charset IDs, N is the first byte
9151 of one of them. The list is sorted by dimensions of the
2bc515e4 9152 charsets. A charset of smaller dimension comes first. */
df7492f9 9153 val = Fmake_vector (make_number (256), Qnil);
4ed46869 9154
5c99c2e6 9155 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
df7492f9 9156 {
c7c66a95
KH
9157 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9158 int dim = CHARSET_DIMENSION (charset);
9159 int idx = (dim - 1) * 4;
4ed46869 9160
5c99c2e6 9161 if (CHARSET_ASCII_COMPATIBLE_P (charset))
584948ac 9162 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
4031e2bf 9163
15d143f7
KH
9164 for (i = charset->code_space[idx];
9165 i <= charset->code_space[idx + 1]; i++)
9166 {
c7c66a95
KH
9167 Lisp_Object tmp, tmp2;
9168 int dim2;
ec6d2bb8 9169
c7c66a95
KH
9170 tmp = AREF (val, i);
9171 if (NILP (tmp))
9172 tmp = XCAR (tail);
9173 else if (NUMBERP (tmp))
9174 {
9175 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9176 if (dim < dim2)
c7c66a95 9177 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
f9d71dcd
KH
9178 else
9179 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
c7c66a95 9180 }
15d143f7 9181 else
c7c66a95
KH
9182 {
9183 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9184 {
9185 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9186 if (dim < dim2)
9187 break;
9188 }
9189 if (NILP (tmp2))
9190 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9191 else
9192 {
9193 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9194 XSETCAR (tmp2, XCAR (tail));
9195 }
9196 }
9197 ASET (val, i, tmp);
15d143f7 9198 }
df7492f9
KH
9199 }
9200 ASET (attrs, coding_attr_charset_valids, val);
9201 category = coding_category_charset;
9202 }
9203 else if (EQ (coding_type, Qccl))
9204 {
9205 Lisp_Object valids;
ecec61c1 9206
df7492f9
KH
9207 if (nargs < coding_arg_ccl_max)
9208 goto short_args;
ecec61c1 9209
df7492f9
KH
9210 val = args[coding_arg_ccl_decoder];
9211 CHECK_CCL_PROGRAM (val);
9212 if (VECTORP (val))
9213 val = Fcopy_sequence (val);
9214 ASET (attrs, coding_attr_ccl_decoder, val);
ecec61c1 9215
df7492f9
KH
9216 val = args[coding_arg_ccl_encoder];
9217 CHECK_CCL_PROGRAM (val);
9218 if (VECTORP (val))
9219 val = Fcopy_sequence (val);
9220 ASET (attrs, coding_attr_ccl_encoder, val);
ecec61c1 9221
df7492f9
KH
9222 val = args[coding_arg_ccl_valids];
9223 valids = Fmake_string (make_number (256), make_number (0));
9224 for (tail = val; !NILP (tail); tail = Fcdr (tail))
9225 {
8dcbea82 9226 int from, to;
ecec61c1 9227
df7492f9
KH
9228 val = Fcar (tail);
9229 if (INTEGERP (val))
8dcbea82
KH
9230 {
9231 from = to = XINT (val);
9232 if (from < 0 || from > 255)
9233 args_out_of_range_3 (val, make_number (0), make_number (255));
9234 }
df7492f9
KH
9235 else
9236 {
df7492f9 9237 CHECK_CONS (val);
8f924df7
KH
9238 CHECK_NATNUM_CAR (val);
9239 CHECK_NATNUM_CDR (val);
df7492f9 9240 from = XINT (XCAR (val));
8f924df7 9241 if (from > 255)
8dcbea82
KH
9242 args_out_of_range_3 (XCAR (val),
9243 make_number (0), make_number (255));
df7492f9 9244 to = XINT (XCDR (val));
8dcbea82
KH
9245 if (to < from || to > 255)
9246 args_out_of_range_3 (XCDR (val),
9247 XCAR (val), make_number (255));
df7492f9 9248 }
8dcbea82 9249 for (i = from; i <= to; i++)
8f924df7 9250 SSET (valids, i, 1);
df7492f9
KH
9251 }
9252 ASET (attrs, coding_attr_ccl_valids, valids);
4ed46869 9253
df7492f9 9254 category = coding_category_ccl;
55ab7be3 9255 }
df7492f9 9256 else if (EQ (coding_type, Qutf_16))
55ab7be3 9257 {
df7492f9 9258 Lisp_Object bom, endian;
4ed46869 9259
584948ac 9260 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
4ed46869 9261
df7492f9
KH
9262 if (nargs < coding_arg_utf16_max)
9263 goto short_args;
4ed46869 9264
df7492f9
KH
9265 bom = args[coding_arg_utf16_bom];
9266 if (! NILP (bom) && ! EQ (bom, Qt))
9267 {
9268 CHECK_CONS (bom);
8f924df7
KH
9269 val = XCAR (bom);
9270 CHECK_CODING_SYSTEM (val);
9271 val = XCDR (bom);
9272 CHECK_CODING_SYSTEM (val);
df7492f9 9273 }
a470d443 9274 ASET (attrs, coding_attr_utf_bom, bom);
df7492f9
KH
9275
9276 endian = args[coding_arg_utf16_endian];
b49a1807
KH
9277 CHECK_SYMBOL (endian);
9278 if (NILP (endian))
9279 endian = Qbig;
9280 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8f924df7 9281 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
df7492f9
KH
9282 ASET (attrs, coding_attr_utf_16_endian, endian);
9283
9284 category = (CONSP (bom)
9285 ? coding_category_utf_16_auto
9286 : NILP (bom)
b49a1807 9287 ? (EQ (endian, Qbig)
df7492f9
KH
9288 ? coding_category_utf_16_be_nosig
9289 : coding_category_utf_16_le_nosig)
b49a1807 9290 : (EQ (endian, Qbig)
df7492f9
KH
9291 ? coding_category_utf_16_be
9292 : coding_category_utf_16_le));
9293 }
9294 else if (EQ (coding_type, Qiso_2022))
9295 {
9296 Lisp_Object initial, reg_usage, request, flags;
4776e638 9297 int i;
1397dc18 9298
df7492f9
KH
9299 if (nargs < coding_arg_iso2022_max)
9300 goto short_args;
9301
9302 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9303 CHECK_VECTOR (initial);
9304 for (i = 0; i < 4; i++)
9305 {
9306 val = Faref (initial, make_number (i));
9307 if (! NILP (val))
9308 {
584948ac
KH
9309 struct charset *charset;
9310
9311 CHECK_CHARSET_GET_CHARSET (val, charset);
9312 ASET (initial, i, make_number (CHARSET_ID (charset)));
9313 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9314 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9
KH
9315 }
9316 else
9317 ASET (initial, i, make_number (-1));
9318 }
9319
9320 reg_usage = args[coding_arg_iso2022_reg_usage];
9321 CHECK_CONS (reg_usage);
8f924df7
KH
9322 CHECK_NUMBER_CAR (reg_usage);
9323 CHECK_NUMBER_CDR (reg_usage);
df7492f9
KH
9324
9325 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9326 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
1397dc18 9327 {
df7492f9 9328 int id;
8f924df7 9329 Lisp_Object tmp;
df7492f9
KH
9330
9331 val = Fcar (tail);
9332 CHECK_CONS (val);
8f924df7
KH
9333 tmp = XCAR (val);
9334 CHECK_CHARSET_GET_ID (tmp, id);
9335 CHECK_NATNUM_CDR (val);
df7492f9
KH
9336 if (XINT (XCDR (val)) >= 4)
9337 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8f924df7 9338 XSETCAR (val, make_number (id));
1397dc18 9339 }
4ed46869 9340
df7492f9
KH
9341 flags = args[coding_arg_iso2022_flags];
9342 CHECK_NATNUM (flags);
9343 i = XINT (flags);
9344 if (EQ (args[coding_arg_charset_list], Qiso_2022))
9345 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
9346
9347 ASET (attrs, coding_attr_iso_initial, initial);
9348 ASET (attrs, coding_attr_iso_usage, reg_usage);
9349 ASET (attrs, coding_attr_iso_request, request);
9350 ASET (attrs, coding_attr_iso_flags, flags);
9351 setup_iso_safe_charsets (attrs);
9352
9353 if (i & CODING_ISO_FLAG_SEVEN_BITS)
9354 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9355 | CODING_ISO_FLAG_SINGLE_SHIFT))
9356 ? coding_category_iso_7_else
9357 : EQ (args[coding_arg_charset_list], Qiso_2022)
9358 ? coding_category_iso_7
9359 : coding_category_iso_7_tight);
9360 else
9361 {
9362 int id = XINT (AREF (initial, 1));
9363
c6fb6e98 9364 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
df7492f9
KH
9365 || EQ (args[coding_arg_charset_list], Qiso_2022)
9366 || id < 0)
9367 ? coding_category_iso_8_else
9368 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9369 ? coding_category_iso_8_1
9370 : coding_category_iso_8_2);
9371 }
0ce7886f
KH
9372 if (category != coding_category_iso_8_1
9373 && category != coding_category_iso_8_2)
9374 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
df7492f9
KH
9375 }
9376 else if (EQ (coding_type, Qemacs_mule))
c28a9453 9377 {
df7492f9
KH
9378 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9379 ASET (attrs, coding_attr_emacs_mule_full, Qt);
584948ac 9380 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9 9381 category = coding_category_emacs_mule;
c28a9453 9382 }
df7492f9 9383 else if (EQ (coding_type, Qshift_jis))
c28a9453 9384 {
df7492f9
KH
9385
9386 struct charset *charset;
9387
7d64c6ad 9388 if (XINT (Flength (charset_list)) != 3
6e07c25f 9389 && XINT (Flength (charset_list)) != 4)
7d64c6ad 9390 error ("There should be three or four charsets");
df7492f9
KH
9391
9392 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9393 if (CHARSET_DIMENSION (charset) != 1)
9394 error ("Dimension of charset %s is not one",
8f924df7 9395 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
584948ac
KH
9396 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9397 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9
KH
9398
9399 charset_list = XCDR (charset_list);
9400 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9401 if (CHARSET_DIMENSION (charset) != 1)
9402 error ("Dimension of charset %s is not one",
8f924df7 9403 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
df7492f9
KH
9404
9405 charset_list = XCDR (charset_list);
9406 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9407 if (CHARSET_DIMENSION (charset) != 2)
7d64c6ad
KH
9408 error ("Dimension of charset %s is not two",
9409 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9410
9411 charset_list = XCDR (charset_list);
2b917a06
KH
9412 if (! NILP (charset_list))
9413 {
9414 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9415 if (CHARSET_DIMENSION (charset) != 2)
9416 error ("Dimension of charset %s is not two",
9417 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9418 }
df7492f9
KH
9419
9420 category = coding_category_sjis;
9421 Vsjis_coding_system = name;
c28a9453 9422 }
df7492f9
KH
9423 else if (EQ (coding_type, Qbig5))
9424 {
9425 struct charset *charset;
4ed46869 9426
df7492f9
KH
9427 if (XINT (Flength (charset_list)) != 2)
9428 error ("There should be just two charsets");
9429
9430 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9431 if (CHARSET_DIMENSION (charset) != 1)
9432 error ("Dimension of charset %s is not one",
8f924df7 9433 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
584948ac
KH
9434 if (CHARSET_ASCII_COMPATIBLE_P (charset))
9435 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9
KH
9436
9437 charset_list = XCDR (charset_list);
9438 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9439 if (CHARSET_DIMENSION (charset) != 2)
9440 error ("Dimension of charset %s is not two",
8f924df7 9441 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
4ed46869 9442
df7492f9
KH
9443 category = coding_category_big5;
9444 Vbig5_coding_system = name;
9445 }
9446 else if (EQ (coding_type, Qraw_text))
c28a9453 9447 {
584948ac
KH
9448 category = coding_category_raw_text;
9449 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
c28a9453 9450 }
df7492f9 9451 else if (EQ (coding_type, Qutf_8))
4ed46869 9452 {
a470d443
KH
9453 Lisp_Object bom;
9454
584948ac 9455 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
a470d443
KH
9456
9457 if (nargs < coding_arg_utf8_max)
9458 goto short_args;
9459
9460 bom = args[coding_arg_utf8_bom];
9461 if (! NILP (bom) && ! EQ (bom, Qt))
9462 {
9463 CHECK_CONS (bom);
9464 val = XCAR (bom);
9465 CHECK_CODING_SYSTEM (val);
9466 val = XCDR (bom);
9467 CHECK_CODING_SYSTEM (val);
9468 }
9469 ASET (attrs, coding_attr_utf_bom, bom);
9470
9471 category = (CONSP (bom) ? coding_category_utf_8_auto
9472 : NILP (bom) ? coding_category_utf_8_nosig
9473 : coding_category_utf_8_sig);
4ed46869 9474 }
df7492f9
KH
9475 else if (EQ (coding_type, Qundecided))
9476 category = coding_category_undecided;
4ed46869 9477 else
df7492f9 9478 error ("Invalid coding system type: %s",
8f924df7 9479 SDATA (SYMBOL_NAME (coding_type)));
4ed46869 9480
df7492f9 9481 CODING_ATTR_CATEGORY (attrs) = make_number (category);
01378f49
KH
9482 CODING_ATTR_PLIST (attrs)
9483 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
9484 CODING_ATTR_PLIST (attrs)));
35befdaa 9485 CODING_ATTR_PLIST (attrs)
3ed051d4 9486 = Fcons (QCascii_compatible_p,
35befdaa
KH
9487 Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9488 CODING_ATTR_PLIST (attrs)));
c4825358 9489
df7492f9
KH
9490 eol_type = args[coding_arg_eol_type];
9491 if (! NILP (eol_type)
9492 && ! EQ (eol_type, Qunix)
9493 && ! EQ (eol_type, Qdos)
9494 && ! EQ (eol_type, Qmac))
9495 error ("Invalid eol-type");
4ed46869 9496
df7492f9 9497 aliases = Fcons (name, Qnil);
4ed46869 9498
df7492f9
KH
9499 if (NILP (eol_type))
9500 {
9501 eol_type = make_subsidiaries (name);
9502 for (i = 0; i < 3; i++)
1397dc18 9503 {
df7492f9
KH
9504 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
9505
9506 this_name = AREF (eol_type, i);
9507 this_aliases = Fcons (this_name, Qnil);
9508 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
9509 this_spec = Fmake_vector (make_number (3), attrs);
9510 ASET (this_spec, 1, this_aliases);
9511 ASET (this_spec, 2, this_eol_type);
9512 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
9513 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
583f71ca
KH
9514 val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
9515 if (NILP (val))
9516 Vcoding_system_alist
9517 = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
9518 Vcoding_system_alist);
1397dc18 9519 }
d46c5b12 9520 }
4ed46869 9521
df7492f9
KH
9522 spec_vec = Fmake_vector (make_number (3), attrs);
9523 ASET (spec_vec, 1, aliases);
9524 ASET (spec_vec, 2, eol_type);
48b0f3ae 9525
df7492f9
KH
9526 Fputhash (name, spec_vec, Vcoding_system_hash_table);
9527 Vcoding_system_list = Fcons (name, Vcoding_system_list);
583f71ca
KH
9528 val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
9529 if (NILP (val))
9530 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
9531 Vcoding_system_alist);
48b0f3ae 9532
df7492f9
KH
9533 {
9534 int id = coding_categories[category].id;
48b0f3ae 9535
df7492f9
KH
9536 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
9537 setup_coding_system (name, &coding_categories[category]);
9538 }
48b0f3ae 9539
d46c5b12 9540 return Qnil;
48b0f3ae 9541
df7492f9
KH
9542 short_args:
9543 return Fsignal (Qwrong_number_of_arguments,
9544 Fcons (intern ("define-coding-system-internal"),
9545 make_number (nargs)));
d46c5b12 9546}
4ed46869 9547
d6925f38 9548
a6f87d34
KH
9549DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
9550 3, 3, 0,
9551 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
9552 (coding_system, prop, val)
9553 Lisp_Object coding_system, prop, val;
9554{
3dbe7859 9555 Lisp_Object spec, attrs;
a6f87d34
KH
9556
9557 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9558 attrs = AREF (spec, 0);
9559 if (EQ (prop, QCmnemonic))
9560 {
9561 if (! STRINGP (val))
9562 CHECK_CHARACTER (val);
9563 CODING_ATTR_MNEMONIC (attrs) = val;
9564 }
9565 else if (EQ (prop, QCdefalut_char))
9566 {
9567 if (NILP (val))
9568 val = make_number (' ');
9569 else
9570 CHECK_CHARACTER (val);
9571 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9572 }
9573 else if (EQ (prop, QCdecode_translation_table))
9574 {
9575 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9576 CHECK_SYMBOL (val);
9577 CODING_ATTR_DECODE_TBL (attrs) = val;
9578 }
9579 else if (EQ (prop, QCencode_translation_table))
9580 {
9581 if (! CHAR_TABLE_P (val) && ! CONSP (val))
9582 CHECK_SYMBOL (val);
9583 CODING_ATTR_ENCODE_TBL (attrs) = val;
9584 }
9585 else if (EQ (prop, QCpost_read_conversion))
9586 {
9587 CHECK_SYMBOL (val);
9588 CODING_ATTR_POST_READ (attrs) = val;
9589 }
9590 else if (EQ (prop, QCpre_write_conversion))
9591 {
9592 CHECK_SYMBOL (val);
9593 CODING_ATTR_PRE_WRITE (attrs) = val;
9594 }
35befdaa
KH
9595 else if (EQ (prop, QCascii_compatible_p))
9596 {
9597 CODING_ATTR_ASCII_COMPAT (attrs) = val;
9598 }
a6f87d34
KH
9599
9600 CODING_ATTR_PLIST (attrs)
9601 = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
9602 return val;
9603}
9604
9605
df7492f9
KH
9606DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
9607 Sdefine_coding_system_alias, 2, 2, 0,
9608 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
9609 (alias, coding_system)
9610 Lisp_Object alias, coding_system;
66cfb530 9611{
583f71ca 9612 Lisp_Object spec, aliases, eol_type, val;
4ed46869 9613
df7492f9
KH
9614 CHECK_SYMBOL (alias);
9615 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9616 aliases = AREF (spec, 1);
d4a1d553 9617 /* ALIASES should be a list of length more than zero, and the first
d6925f38
KH
9618 element is a base coding system. Append ALIAS at the tail of the
9619 list. */
df7492f9
KH
9620 while (!NILP (XCDR (aliases)))
9621 aliases = XCDR (aliases);
8f924df7 9622 XSETCDR (aliases, Fcons (alias, Qnil));
4ed46869 9623
df7492f9
KH
9624 eol_type = AREF (spec, 2);
9625 if (VECTORP (eol_type))
4ed46869 9626 {
df7492f9
KH
9627 Lisp_Object subsidiaries;
9628 int i;
4ed46869 9629
df7492f9
KH
9630 subsidiaries = make_subsidiaries (alias);
9631 for (i = 0; i < 3; i++)
9632 Fdefine_coding_system_alias (AREF (subsidiaries, i),
9633 AREF (eol_type, i));
4ed46869 9634 }
df7492f9
KH
9635
9636 Fputhash (alias, spec, Vcoding_system_hash_table);
d6925f38 9637 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
583f71ca
KH
9638 val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
9639 if (NILP (val))
9640 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
9641 Vcoding_system_alist);
66cfb530 9642
4ed46869
KH
9643 return Qnil;
9644}
9645
df7492f9
KH
9646DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
9647 1, 1, 0,
9648 doc: /* Return the base of CODING-SYSTEM.
da7db224 9649Any alias or subsidiary coding system is not a base coding system. */)
df7492f9
KH
9650 (coding_system)
9651 Lisp_Object coding_system;
d46c5b12 9652{
df7492f9 9653 Lisp_Object spec, attrs;
d46c5b12 9654
df7492f9
KH
9655 if (NILP (coding_system))
9656 return (Qno_conversion);
9657 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9658 attrs = AREF (spec, 0);
9659 return CODING_ATTR_BASE_NAME (attrs);
9660}
1397dc18 9661
df7492f9
KH
9662DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
9663 1, 1, 0,
9664 doc: "Return the property list of CODING-SYSTEM.")
9665 (coding_system)
9666 Lisp_Object coding_system;
9667{
9668 Lisp_Object spec, attrs;
1397dc18 9669
df7492f9
KH
9670 if (NILP (coding_system))
9671 coding_system = Qno_conversion;
9672 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9673 attrs = AREF (spec, 0);
9674 return CODING_ATTR_PLIST (attrs);
d46c5b12
KH
9675}
9676
df7492f9
KH
9677
9678DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
9679 1, 1, 0,
da7db224 9680 doc: /* Return the list of aliases of CODING-SYSTEM. */)
df7492f9
KH
9681 (coding_system)
9682 Lisp_Object coding_system;
66cfb530 9683{
df7492f9 9684 Lisp_Object spec;
84d60297 9685
df7492f9
KH
9686 if (NILP (coding_system))
9687 coding_system = Qno_conversion;
9688 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
da7db224 9689 return AREF (spec, 1);
df7492f9 9690}
66cfb530 9691
df7492f9
KH
9692DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
9693 Scoding_system_eol_type, 1, 1, 0,
9694 doc: /* Return eol-type of CODING-SYSTEM.
d4a1d553 9695An eol-type is an integer 0, 1, 2, or a vector of coding systems.
66cfb530 9696
df7492f9
KH
9697Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
9698and CR respectively.
66cfb530 9699
df7492f9
KH
9700A vector value indicates that a format of end-of-line should be
9701detected automatically. Nth element of the vector is the subsidiary
9702coding system whose eol-type is N. */)
6b89e3aa
KH
9703 (coding_system)
9704 Lisp_Object coding_system;
9705{
df7492f9
KH
9706 Lisp_Object spec, eol_type;
9707 int n;
6b89e3aa 9708
df7492f9
KH
9709 if (NILP (coding_system))
9710 coding_system = Qno_conversion;
9711 if (! CODING_SYSTEM_P (coding_system))
9712 return Qnil;
9713 spec = CODING_SYSTEM_SPEC (coding_system);
9714 eol_type = AREF (spec, 2);
9715 if (VECTORP (eol_type))
9716 return Fcopy_sequence (eol_type);
9717 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
9718 return make_number (n);
6b89e3aa
KH
9719}
9720
4ed46869
KH
9721#endif /* emacs */
9722
9723\f
1397dc18 9724/*** 12. Postamble ***/
4ed46869 9725
dfcf069d 9726void
4ed46869
KH
9727init_coding_once ()
9728{
9729 int i;
9730
df7492f9
KH
9731 for (i = 0; i < coding_category_max; i++)
9732 {
9733 coding_categories[i].id = -1;
9734 coding_priorities[i] = i;
9735 }
4ed46869
KH
9736
9737 /* ISO2022 specific initialize routine. */
9738 for (i = 0; i < 0x20; i++)
b73bfc1c 9739 iso_code_class[i] = ISO_control_0;
4ed46869
KH
9740 for (i = 0x21; i < 0x7F; i++)
9741 iso_code_class[i] = ISO_graphic_plane_0;
9742 for (i = 0x80; i < 0xA0; i++)
b73bfc1c 9743 iso_code_class[i] = ISO_control_1;
4ed46869
KH
9744 for (i = 0xA1; i < 0xFF; i++)
9745 iso_code_class[i] = ISO_graphic_plane_1;
9746 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
9747 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
4ed46869
KH
9748 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
9749 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
9750 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
9751 iso_code_class[ISO_CODE_ESC] = ISO_escape;
9752 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
9753 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
9754 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
9755
df7492f9
KH
9756 for (i = 0; i < 256; i++)
9757 {
9758 emacs_mule_bytes[i] = 1;
9759 }
7c78e542
KH
9760 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
9761 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
9762 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
9763 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
e0e989f6
KH
9764}
9765
9766#ifdef emacs
9767
dfcf069d 9768void
e0e989f6
KH
9769syms_of_coding ()
9770{
df7492f9 9771 staticpro (&Vcoding_system_hash_table);
8f924df7
KH
9772 {
9773 Lisp_Object args[2];
9774 args[0] = QCtest;
9775 args[1] = Qeq;
9776 Vcoding_system_hash_table = Fmake_hash_table (2, args);
9777 }
df7492f9
KH
9778
9779 staticpro (&Vsjis_coding_system);
9780 Vsjis_coding_system = Qnil;
e0e989f6 9781
df7492f9
KH
9782 staticpro (&Vbig5_coding_system);
9783 Vbig5_coding_system = Qnil;
9784
24a73b0a
KH
9785 staticpro (&Vcode_conversion_reused_workbuf);
9786 Vcode_conversion_reused_workbuf = Qnil;
9787
9788 staticpro (&Vcode_conversion_workbuf_name);
9789 Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
e0e989f6 9790
24a73b0a 9791 reused_workbuf_in_use = 0;
df7492f9
KH
9792
9793 DEFSYM (Qcharset, "charset");
9794 DEFSYM (Qtarget_idx, "target-idx");
9795 DEFSYM (Qcoding_system_history, "coding-system-history");
bb0115a2
RS
9796 Fset (Qcoding_system_history, Qnil);
9797
9ce27fde 9798 /* Target FILENAME is the first argument. */
e0e989f6 9799 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 9800 /* Target FILENAME is the third argument. */
e0e989f6
KH
9801 Fput (Qwrite_region, Qtarget_idx, make_number (2));
9802
df7492f9 9803 DEFSYM (Qcall_process, "call-process");
9ce27fde 9804 /* Target PROGRAM is the first argument. */
e0e989f6
KH
9805 Fput (Qcall_process, Qtarget_idx, make_number (0));
9806
df7492f9 9807 DEFSYM (Qcall_process_region, "call-process-region");
9ce27fde 9808 /* Target PROGRAM is the third argument. */
e0e989f6
KH
9809 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
9810
df7492f9 9811 DEFSYM (Qstart_process, "start-process");
9ce27fde 9812 /* Target PROGRAM is the third argument. */
e0e989f6
KH
9813 Fput (Qstart_process, Qtarget_idx, make_number (2));
9814
df7492f9 9815 DEFSYM (Qopen_network_stream, "open-network-stream");
9ce27fde 9816 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
9817 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
9818
df7492f9
KH
9819 DEFSYM (Qcoding_system, "coding-system");
9820 DEFSYM (Qcoding_aliases, "coding-aliases");
4ed46869 9821
df7492f9
KH
9822 DEFSYM (Qeol_type, "eol-type");
9823 DEFSYM (Qunix, "unix");
9824 DEFSYM (Qdos, "dos");
4ed46869 9825
df7492f9
KH
9826 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
9827 DEFSYM (Qpost_read_conversion, "post-read-conversion");
9828 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
9829 DEFSYM (Qdefault_char, "default-char");
9830 DEFSYM (Qundecided, "undecided");
9831 DEFSYM (Qno_conversion, "no-conversion");
9832 DEFSYM (Qraw_text, "raw-text");
4ed46869 9833
df7492f9 9834 DEFSYM (Qiso_2022, "iso-2022");
4ed46869 9835
df7492f9 9836 DEFSYM (Qutf_8, "utf-8");
8f924df7 9837 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
27901516 9838
df7492f9 9839 DEFSYM (Qutf_16, "utf-16");
df7492f9
KH
9840 DEFSYM (Qbig, "big");
9841 DEFSYM (Qlittle, "little");
27901516 9842
df7492f9
KH
9843 DEFSYM (Qshift_jis, "shift-jis");
9844 DEFSYM (Qbig5, "big5");
4ed46869 9845
df7492f9 9846 DEFSYM (Qcoding_system_p, "coding-system-p");
4ed46869 9847
df7492f9 9848 DEFSYM (Qcoding_system_error, "coding-system-error");
4ed46869
KH
9849 Fput (Qcoding_system_error, Qerror_conditions,
9850 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
9851 Fput (Qcoding_system_error, Qerror_message,
9ce27fde 9852 build_string ("Invalid coding system"));
4ed46869 9853
05e6f5dc
KH
9854 /* Intern this now in case it isn't already done.
9855 Setting this variable twice is harmless.
9856 But don't staticpro it here--that is done in alloc.c. */
9857 Qchar_table_extra_slots = intern ("char-table-extra-slots");
70c22245 9858
df7492f9 9859 DEFSYM (Qtranslation_table, "translation-table");
433f7f87 9860 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
df7492f9
KH
9861 DEFSYM (Qtranslation_table_id, "translation-table-id");
9862 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
9863 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
1397dc18 9864
df7492f9 9865 DEFSYM (Qvalid_codes, "valid-codes");
9ce27fde 9866
df7492f9 9867 DEFSYM (Qemacs_mule, "emacs-mule");
d46c5b12 9868
01378f49 9869 DEFSYM (QCcategory, ":category");
a6f87d34
KH
9870 DEFSYM (QCmnemonic, ":mnemonic");
9871 DEFSYM (QCdefalut_char, ":default-char");
9872 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
9873 DEFSYM (QCencode_translation_table, ":encode-translation-table");
9874 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
9875 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
35befdaa 9876 DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
01378f49 9877
df7492f9
KH
9878 Vcoding_category_table
9879 = Fmake_vector (make_number (coding_category_max), Qnil);
9880 staticpro (&Vcoding_category_table);
9881 /* Followings are target of code detection. */
9882 ASET (Vcoding_category_table, coding_category_iso_7,
9883 intern ("coding-category-iso-7"));
9884 ASET (Vcoding_category_table, coding_category_iso_7_tight,
9885 intern ("coding-category-iso-7-tight"));
9886 ASET (Vcoding_category_table, coding_category_iso_8_1,
9887 intern ("coding-category-iso-8-1"));
9888 ASET (Vcoding_category_table, coding_category_iso_8_2,
9889 intern ("coding-category-iso-8-2"));
9890 ASET (Vcoding_category_table, coding_category_iso_7_else,
9891 intern ("coding-category-iso-7-else"));
9892 ASET (Vcoding_category_table, coding_category_iso_8_else,
9893 intern ("coding-category-iso-8-else"));
a470d443
KH
9894 ASET (Vcoding_category_table, coding_category_utf_8_auto,
9895 intern ("coding-category-utf-8-auto"));
9896 ASET (Vcoding_category_table, coding_category_utf_8_nosig,
df7492f9 9897 intern ("coding-category-utf-8"));
a470d443
KH
9898 ASET (Vcoding_category_table, coding_category_utf_8_sig,
9899 intern ("coding-category-utf-8-sig"));
df7492f9
KH
9900 ASET (Vcoding_category_table, coding_category_utf_16_be,
9901 intern ("coding-category-utf-16-be"));
ff563fce
KH
9902 ASET (Vcoding_category_table, coding_category_utf_16_auto,
9903 intern ("coding-category-utf-16-auto"));
df7492f9
KH
9904 ASET (Vcoding_category_table, coding_category_utf_16_le,
9905 intern ("coding-category-utf-16-le"));
9906 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
9907 intern ("coding-category-utf-16-be-nosig"));
9908 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
9909 intern ("coding-category-utf-16-le-nosig"));
9910 ASET (Vcoding_category_table, coding_category_charset,
9911 intern ("coding-category-charset"));
9912 ASET (Vcoding_category_table, coding_category_sjis,
9913 intern ("coding-category-sjis"));
9914 ASET (Vcoding_category_table, coding_category_big5,
9915 intern ("coding-category-big5"));
9916 ASET (Vcoding_category_table, coding_category_ccl,
9917 intern ("coding-category-ccl"));
9918 ASET (Vcoding_category_table, coding_category_emacs_mule,
9919 intern ("coding-category-emacs-mule"));
9920 /* Followings are NOT target of code detection. */
9921 ASET (Vcoding_category_table, coding_category_raw_text,
9922 intern ("coding-category-raw-text"));
9923 ASET (Vcoding_category_table, coding_category_undecided,
9924 intern ("coding-category-undecided"));
ecf488bc 9925
065e3595
KH
9926 DEFSYM (Qinsufficient_source, "insufficient-source");
9927 DEFSYM (Qinconsistent_eol, "inconsistent-eol");
9928 DEFSYM (Qinvalid_source, "invalid-source");
9929 DEFSYM (Qinterrupted, "interrupted");
9930 DEFSYM (Qinsufficient_memory, "insufficient-memory");
44e8490d 9931 DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
065e3595 9932
4ed46869
KH
9933 defsubr (&Scoding_system_p);
9934 defsubr (&Sread_coding_system);
9935 defsubr (&Sread_non_nil_coding_system);
9936 defsubr (&Scheck_coding_system);
9937 defsubr (&Sdetect_coding_region);
d46c5b12 9938 defsubr (&Sdetect_coding_string);
05e6f5dc 9939 defsubr (&Sfind_coding_systems_region_internal);
068a9dbd 9940 defsubr (&Sunencodable_char_position);
df7492f9 9941 defsubr (&Scheck_coding_systems_region);
4ed46869
KH
9942 defsubr (&Sdecode_coding_region);
9943 defsubr (&Sencode_coding_region);
9944 defsubr (&Sdecode_coding_string);
9945 defsubr (&Sencode_coding_string);
9946 defsubr (&Sdecode_sjis_char);
9947 defsubr (&Sencode_sjis_char);
9948 defsubr (&Sdecode_big5_char);
9949 defsubr (&Sencode_big5_char);
1ba9e4ab 9950 defsubr (&Sset_terminal_coding_system_internal);
c4825358 9951 defsubr (&Sset_safe_terminal_coding_system_internal);
4ed46869 9952 defsubr (&Sterminal_coding_system);
1ba9e4ab 9953 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 9954 defsubr (&Skeyboard_coding_system);
a5d301df 9955 defsubr (&Sfind_operation_coding_system);
df7492f9 9956 defsubr (&Sset_coding_system_priority);
6b89e3aa 9957 defsubr (&Sdefine_coding_system_internal);
df7492f9 9958 defsubr (&Sdefine_coding_system_alias);
a6f87d34 9959 defsubr (&Scoding_system_put);
df7492f9
KH
9960 defsubr (&Scoding_system_base);
9961 defsubr (&Scoding_system_plist);
9962 defsubr (&Scoding_system_aliases);
9963 defsubr (&Scoding_system_eol_type);
9964 defsubr (&Scoding_system_priority_list);
4ed46869 9965
4608c386 9966 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
48b0f3ae
PJ
9967 doc: /* List of coding systems.
9968
9969Do not alter the value of this variable manually. This variable should be
df7492f9 9970updated by the functions `define-coding-system' and
48b0f3ae 9971`define-coding-system-alias'. */);
4608c386
KH
9972 Vcoding_system_list = Qnil;
9973
9974 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
48b0f3ae
PJ
9975 doc: /* Alist of coding system names.
9976Each element is a one-element list containing a coding system name.
446dcd75 9977This variable is given to `completing-read' as COLLECTION argument.
48b0f3ae
PJ
9978
9979Do not alter the value of this variable manually. This variable should be
9980updated by the functions `define-coding-system' and
9981`define-coding-system-alias'. */);
4608c386
KH
9982 Vcoding_system_alist = Qnil;
9983
4ed46869 9984 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
48b0f3ae
PJ
9985 doc: /* List of coding-categories (symbols) ordered by priority.
9986
9987On detecting a coding system, Emacs tries code detection algorithms
9988associated with each coding-category one by one in this order. When
9989one algorithm agrees with a byte sequence of source text, the coding
0ec31faf
KH
9990system bound to the corresponding coding-category is selected.
9991
42205607 9992Don't modify this variable directly, but use `set-coding-system-priority'. */);
4ed46869
KH
9993 {
9994 int i;
9995
9996 Vcoding_category_list = Qnil;
df7492f9 9997 for (i = coding_category_max - 1; i >= 0; i--)
4ed46869 9998 Vcoding_category_list
d46c5b12
KH
9999 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
10000 Vcoding_category_list);
4ed46869
KH
10001 }
10002
10003 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
48b0f3ae
PJ
10004 doc: /* Specify the coding system for read operations.
10005It is useful to bind this variable with `let', but do not set it globally.
10006If the value is a coding system, it is used for decoding on read operation.
446dcd75
JB
10007If not, an appropriate element is used from one of the coding system alists.
10008There are three such tables: `file-coding-system-alist',
48b0f3ae 10009`process-coding-system-alist', and `network-coding-system-alist'. */);
4ed46869
KH
10010 Vcoding_system_for_read = Qnil;
10011
10012 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
48b0f3ae
PJ
10013 doc: /* Specify the coding system for write operations.
10014Programs bind this variable with `let', but you should not set it globally.
10015If the value is a coding system, it is used for encoding of output,
10016when writing it to a file and when sending it to a file or subprocess.
10017
10018If this does not specify a coding system, an appropriate element
446dcd75
JB
10019is used from one of the coding system alists.
10020There are three such tables: `file-coding-system-alist',
48b0f3ae
PJ
10021`process-coding-system-alist', and `network-coding-system-alist'.
10022For output to files, if the above procedure does not specify a coding system,
10023the value of `buffer-file-coding-system' is used. */);
4ed46869
KH
10024 Vcoding_system_for_write = Qnil;
10025
10026 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
df7492f9
KH
10027 doc: /*
10028Coding system used in the latest file or process I/O. */);
4ed46869
KH
10029 Vlast_coding_system_used = Qnil;
10030
065e3595
KH
10031 DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
10032 doc: /*
10033Error status of the last code conversion.
10034
10035When an error was detected in the last code conversion, this variable
10036is set to one of the following symbols.
10037 `insufficient-source'
10038 `inconsistent-eol'
10039 `invalid-source'
10040 `interrupted'
10041 `insufficient-memory'
10042When no error was detected, the value doesn't change. So, to check
10043the error status of a code conversion by this variable, you must
10044explicitly set this variable to nil before performing code
10045conversion. */);
10046 Vlast_code_conversion_error = Qnil;
10047
9ce27fde 10048 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
df7492f9
KH
10049 doc: /*
10050*Non-nil means always inhibit code conversion of end-of-line format.
48b0f3ae
PJ
10051See info node `Coding Systems' and info node `Text and Binary' concerning
10052such conversion. */);
9ce27fde
KH
10053 inhibit_eol_conversion = 0;
10054
ed29121d 10055 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
df7492f9
KH
10056 doc: /*
10057Non-nil means process buffer inherits coding system of process output.
48b0f3ae
PJ
10058Bind it to t if the process output is to be treated as if it were a file
10059read from some filesystem. */);
ed29121d
EZ
10060 inherit_process_coding_system = 0;
10061
02ba4723 10062 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
df7492f9
KH
10063 doc: /*
10064Alist to decide a coding system to use for a file I/O operation.
48b0f3ae
PJ
10065The format is ((PATTERN . VAL) ...),
10066where PATTERN is a regular expression matching a file name,
10067VAL is a coding system, a cons of coding systems, or a function symbol.
10068If VAL is a coding system, it is used for both decoding and encoding
10069the file contents.
10070If VAL is a cons of coding systems, the car part is used for decoding,
10071and the cdr part is used for encoding.
10072If VAL is a function symbol, the function must return a coding system
2c53e699
KH
10073or a cons of coding systems which are used as above. The function is
10074called with an argument that is a list of the arguments with which
5a0bbd9a
KH
10075`find-operation-coding-system' was called. If the function can't decide
10076a coding system, it can return `undecided' so that the normal
10077code-detection is performed.
48b0f3ae
PJ
10078
10079See also the function `find-operation-coding-system'
10080and the variable `auto-coding-alist'. */);
02ba4723
KH
10081 Vfile_coding_system_alist = Qnil;
10082
10083 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
df7492f9
KH
10084 doc: /*
10085Alist to decide a coding system to use for a process I/O operation.
48b0f3ae
PJ
10086The format is ((PATTERN . VAL) ...),
10087where PATTERN is a regular expression matching a program name,
10088VAL is a coding system, a cons of coding systems, or a function symbol.
10089If VAL is a coding system, it is used for both decoding what is received
10090from the program and encoding what is sent to the program.
10091If VAL is a cons of coding systems, the car part is used for decoding,
10092and the cdr part is used for encoding.
10093If VAL is a function symbol, the function must return a coding system
10094or a cons of coding systems which are used as above.
10095
10096See also the function `find-operation-coding-system'. */);
02ba4723
KH
10097 Vprocess_coding_system_alist = Qnil;
10098
10099 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
df7492f9
KH
10100 doc: /*
10101Alist to decide a coding system to use for a network I/O operation.
48b0f3ae
PJ
10102The format is ((PATTERN . VAL) ...),
10103where PATTERN is a regular expression matching a network service name
10104or is a port number to connect to,
10105VAL is a coding system, a cons of coding systems, or a function symbol.
10106If VAL is a coding system, it is used for both decoding what is received
10107from the network stream and encoding what is sent to the network stream.
10108If VAL is a cons of coding systems, the car part is used for decoding,
10109and the cdr part is used for encoding.
10110If VAL is a function symbol, the function must return a coding system
10111or a cons of coding systems which are used as above.
10112
10113See also the function `find-operation-coding-system'. */);
02ba4723 10114 Vnetwork_coding_system_alist = Qnil;
4ed46869 10115
68c45bf0 10116 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
75205970
RS
10117 doc: /* Coding system to use with system messages.
10118Also used for decoding keyboard input on X Window system. */);
68c45bf0
PE
10119 Vlocale_coding_system = Qnil;
10120
005f0d35 10121 /* The eol mnemonics are reset in startup.el system-dependently. */
7722baf9 10122 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
df7492f9
KH
10123 doc: /*
10124*String displayed in mode line for UNIX-like (LF) end-of-line format. */);
7722baf9 10125 eol_mnemonic_unix = build_string (":");
4ed46869 10126
7722baf9 10127 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
df7492f9
KH
10128 doc: /*
10129*String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
7722baf9 10130 eol_mnemonic_dos = build_string ("\\");
4ed46869 10131
7722baf9 10132 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
df7492f9
KH
10133 doc: /*
10134*String displayed in mode line for MAC-like (CR) end-of-line format. */);
7722baf9 10135 eol_mnemonic_mac = build_string ("/");
4ed46869 10136
7722baf9 10137 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
df7492f9
KH
10138 doc: /*
10139*String displayed in mode line when end-of-line format is not yet determined. */);
7722baf9 10140 eol_mnemonic_undecided = build_string (":");
4ed46869 10141
84fbb8a0 10142 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
df7492f9
KH
10143 doc: /*
10144*Non-nil enables character translation while encoding and decoding. */);
84fbb8a0 10145 Venable_character_translation = Qt;
bdd9fb48 10146
f967223b 10147 DEFVAR_LISP ("standard-translation-table-for-decode",
48b0f3ae
PJ
10148 &Vstandard_translation_table_for_decode,
10149 doc: /* Table for translating characters while decoding. */);
f967223b 10150 Vstandard_translation_table_for_decode = Qnil;
bdd9fb48 10151
f967223b 10152 DEFVAR_LISP ("standard-translation-table-for-encode",
48b0f3ae
PJ
10153 &Vstandard_translation_table_for_encode,
10154 doc: /* Table for translating characters while encoding. */);
f967223b 10155 Vstandard_translation_table_for_encode = Qnil;
4ed46869 10156
df7492f9 10157 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
48b0f3ae
PJ
10158 doc: /* Alist of charsets vs revision numbers.
10159While encoding, if a charset (car part of an element) is found,
df7492f9
KH
10160designate it with the escape sequence identifying revision (cdr part
10161of the element). */);
10162 Vcharset_revision_table = Qnil;
02ba4723
KH
10163
10164 DEFVAR_LISP ("default-process-coding-system",
10165 &Vdefault_process_coding_system,
48b0f3ae
PJ
10166 doc: /* Cons of coding systems used for process I/O by default.
10167The car part is used for decoding a process output,
10168the cdr part is used for encoding a text to be sent to a process. */);
02ba4723 10169 Vdefault_process_coding_system = Qnil;
c4825358 10170
3f003981 10171 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
df7492f9
KH
10172 doc: /*
10173Table of extra Latin codes in the range 128..159 (inclusive).
48b0f3ae
PJ
10174This is a vector of length 256.
10175If Nth element is non-nil, the existence of code N in a file
10176\(or output of subprocess) doesn't prevent it from being detected as
10177a coding system of ISO 2022 variant which has a flag
10178`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10179or reading output of a subprocess.
446dcd75 10180Only 128th through 159th elements have a meaning. */);
3f003981 10181 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
d46c5b12
KH
10182
10183 DEFVAR_LISP ("select-safe-coding-system-function",
10184 &Vselect_safe_coding_system_function,
df7492f9
KH
10185 doc: /*
10186Function to call to select safe coding system for encoding a text.
48b0f3ae
PJ
10187
10188If set, this function is called to force a user to select a proper
10189coding system which can encode the text in the case that a default
fdecf907
GM
10190coding system used in each operation can't encode the text. The
10191function should take care that the buffer is not modified while
10192the coding system is being selected.
48b0f3ae
PJ
10193
10194The default value is `select-safe-coding-system' (which see). */);
d46c5b12
KH
10195 Vselect_safe_coding_system_function = Qnil;
10196
5d5bf4d8
KH
10197 DEFVAR_BOOL ("coding-system-require-warning",
10198 &coding_system_require_warning,
10199 doc: /* Internal use only.
6b89e3aa
KH
10200If non-nil, on writing a file, `select-safe-coding-system-function' is
10201called even if `coding-system-for-write' is non-nil. The command
10202`universal-coding-system-argument' binds this variable to t temporarily. */);
5d5bf4d8
KH
10203 coding_system_require_warning = 0;
10204
10205
22ab2303 10206 DEFVAR_BOOL ("inhibit-iso-escape-detection",
74383408 10207 &inhibit_iso_escape_detection,
df7492f9
KH
10208 doc: /*
10209If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
48b0f3ae
PJ
10210
10211By default, on reading a file, Emacs tries to detect how the text is
10212encoded. This code detection is sensitive to escape sequences. If
10213the sequence is valid as ISO2022, the code is determined as one of
10214the ISO2022 encodings, and the file is decoded by the corresponding
10215coding system (e.g. `iso-2022-7bit').
10216
10217However, there may be a case that you want to read escape sequences in
10218a file as is. In such a case, you can set this variable to non-nil.
10219Then, as the code detection ignores any escape sequences, no file is
10220detected as encoded in some ISO2022 encoding. The result is that all
10221escape sequences become visible in a buffer.
10222
10223The default value is nil, and it is strongly recommended not to change
10224it. That is because many Emacs Lisp source files that contain
10225non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10226in Emacs's distribution, and they won't be decoded correctly on
10227reading if you suppress escape sequence detection.
10228
10229The other way to read escape sequences in a file without decoding is
10230to explicitly specify some coding system that doesn't use ISO2022's
10231escape sequence (e.g. `latin-1') on reading by \\[universal-coding-system-argument]. */);
74383408 10232 inhibit_iso_escape_detection = 0;
002fdb44
DL
10233
10234 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
15c8f9d1 10235 doc: /* Char table for translating self-inserting characters.
446dcd75
JB
10236This is applied to the result of input methods, not their input.
10237See also `keyboard-translate-table'. */);
002fdb44 10238 Vtranslation_table_for_input = Qnil;
8f924df7 10239
2c78b7e1
KH
10240 {
10241 Lisp_Object args[coding_arg_max];
8f924df7 10242 Lisp_Object plist[16];
2c78b7e1
KH
10243 int i;
10244
10245 for (i = 0; i < coding_arg_max; i++)
10246 args[i] = Qnil;
10247
10248 plist[0] = intern (":name");
10249 plist[1] = args[coding_arg_name] = Qno_conversion;
10250 plist[2] = intern (":mnemonic");
10251 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10252 plist[4] = intern (":coding-type");
10253 plist[5] = args[coding_arg_coding_type] = Qraw_text;
10254 plist[6] = intern (":ascii-compatible-p");
10255 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10256 plist[8] = intern (":default-char");
10257 plist[9] = args[coding_arg_default_char] = make_number (0);
8f924df7
KH
10258 plist[10] = intern (":for-unibyte");
10259 plist[11] = args[coding_arg_for_unibyte] = Qt;
10260 plist[12] = intern (":docstring");
10261 plist[13] = build_string ("Do no conversion.\n\
2c78b7e1
KH
10262\n\
10263When you visit a file with this coding, the file is read into a\n\
10264unibyte buffer as is, thus each byte of a file is treated as a\n\
10265character.");
8f924df7
KH
10266 plist[14] = intern (":eol-type");
10267 plist[15] = args[coding_arg_eol_type] = Qunix;
10268 args[coding_arg_plist] = Flist (16, plist);
2c78b7e1 10269 Fdefine_coding_system_internal (coding_arg_max, args);
ae6f73fa
KH
10270
10271 plist[1] = args[coding_arg_name] = Qundecided;
10272 plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10273 plist[5] = args[coding_arg_coding_type] = Qundecided;
10274 /* This is already set.
35befdaa 10275 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
ae6f73fa
KH
10276 plist[8] = intern (":charset-list");
10277 plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10278 plist[11] = args[coding_arg_for_unibyte] = Qnil;
10279 plist[13] = build_string ("No conversion on encoding, automatic conversion on decoding.");
10280 plist[15] = args[coding_arg_eol_type] = Qnil;
10281 args[coding_arg_plist] = Flist (16, plist);
10282 Fdefine_coding_system_internal (coding_arg_max, args);
2c78b7e1
KH
10283 }
10284
2c78b7e1 10285 setup_coding_system (Qno_conversion, &safe_terminal_coding);
ff563fce
KH
10286
10287 {
10288 int i;
10289
10290 for (i = 0; i < coding_category_max; i++)
10291 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10292 }
fcbcfb64
KH
10293#if defined (MSDOS) || defined (WINDOWSNT)
10294 system_eol_type = Qdos;
10295#else
10296 system_eol_type = Qunix;
10297#endif
10298 staticpro (&system_eol_type);
4ed46869
KH
10299}
10300
68c45bf0
PE
10301char *
10302emacs_strerror (error_number)
10303 int error_number;
10304{
10305 char *str;
10306
ca9c0567 10307 synchronize_system_messages_locale ();
68c45bf0
PE
10308 str = strerror (error_number);
10309
10310 if (! NILP (Vlocale_coding_system))
10311 {
10312 Lisp_Object dec = code_convert_string_norecord (build_string (str),
10313 Vlocale_coding_system,
10314 0);
d5db4077 10315 str = (char *) SDATA (dec);
68c45bf0
PE
10316 }
10317
10318 return str;
10319}
10320
4ed46869 10321#endif /* emacs */
9ffd559c
KH
10322
10323/* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
10324 (do not change this comment) */