(x_encode_text): Allocate coding.destination here, and
[bpt/emacs.git] / src / coding.c
CommitLineData
4ed46869 1/* Coding system handler (conversion, detection, and etc).
4a2f9c6a 2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
203cb916 3 Licensed to the Free Software Foundation.
70ad9fc4 4 Copyright (C) 2001 Free Software Foundation, Inc.
df7492f9
KH
5 Copyright (C) 2001, 2002
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
4ed46869 8
369314dc
KH
9This file is part of GNU Emacs.
10
11GNU Emacs is free software; you can redistribute it and/or modify
12it under the terms of the GNU General Public License as published by
13the Free Software Foundation; either version 2, or (at your option)
14any later version.
4ed46869 15
369314dc
KH
16GNU Emacs is distributed in the hope that it will be useful,
17but WITHOUT ANY WARRANTY; without even the implied warranty of
18MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19GNU General Public License for more details.
4ed46869 20
369314dc
KH
21You should have received a copy of the GNU General Public License
22along with GNU Emacs; see the file COPYING. If not, write to
23the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24Boston, MA 02111-1307, USA. */
4ed46869
KH
25
26/*** TABLE OF CONTENTS ***
27
b73bfc1c 28 0. General comments
4ed46869 29 1. Preamble
df7492f9
KH
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
4ed46869
KH
41
42*/
43
df7492f9 44/*** 0. General comments ***
b73bfc1c
KH
45
46
df7492f9 47CODING SYSTEM
4ed46869 48
5bad0796
DL
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
e19c3639
KH
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 coding system.
57
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
5bad0796 60 stored in the hash table Vcharset_hash_table. The conversion from
e19c3639
KH
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
63
64 Coding systems are classified into the following types depending on
5bad0796 65 the encoding mechanism. Here's a brief description of the types.
df7492f9
KH
66
67 o UTF-8
68
69 o UTF-16
70
71 o Charset-base coding system
72
73 A coding system defined by one or more (coded) character sets.
5bad0796 74 Decoding and encoding are done by a code converter defined for each
df7492f9
KH
75 character set.
76
5bad0796 77 o Old Emacs internal format (emacs-mule)
df7492f9 78
5bad0796 79 The coding system adopted by old versions of Emacs (20 and 21).
4ed46869 80
df7492f9 81 o ISO2022-base coding system
93dec019 82
df7492f9
KH
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
87
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
89
4ed46869
KH
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
df7492f9 92 section 8.
4ed46869 93
df7492f9 94 o BIG5
4ed46869 95
df7492f9
KH
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used by Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
4ed46869 101
df7492f9 102 o CCL
27901516 103
5bad0796 104 If a user wants to decode/encode text encoded in a coding system
df7492f9
KH
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
27901516 108
df7492f9 109 o Raw-text
4ed46869 110
df7492f9 111 A coding system for a text containing raw eight-bit data. Emacs
5bad0796 112 treats each byte of source text as a character (except for
df7492f9 113 end-of-line conversion).
4ed46869 114
df7492f9
KH
115 o No-conversion
116
117 Like raw text, but don't do end-of-line conversion.
4ed46869 118
4ed46869 119
df7492f9 120END-OF-LINE FORMAT
4ed46869 121
5bad0796 122 How text end-of-line is encoded depends on operating system. For
df7492f9 123 instance, Unix's format is just one byte of LF (line-feed) code,
f4dee582 124 whereas DOS's format is two-byte sequence of `carriage-return' and
d46c5b12
KH
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
4ed46869 127
5bad0796 128 Since text character encoding and end-of-line encoding are
df7492f9
KH
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
4ed46869 131
e19c3639
KH
132STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
5bad0796 137 conversion (e.g. the location of source and destination data).
e19c3639 138
4ed46869
KH
139*/
140
df7492f9
KH
141/* COMMON MACROS */
142
143
4ed46869
KH
144/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145
df7492f9
KH
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX. Return 1 if the data contains
148 a byte sequence which can be decoded into non-ASCII characters by
149 the coding system. Otherwize (i.e. the data contains only ASCII
150 characters or invalid sequence) return 0.
151
152 It also resets some bits of an integer pointed by MASK. The macros
153 CATEGORY_MASK_XXX specifies each bit of this integer.
154
155 Below is the template of these functions. */
156
4ed46869 157#if 0
df7492f9
KH
158static int
159detect_coding_XXX (coding, mask)
160 struct coding_system *coding;
161 int *mask;
4ed46869 162{
df7492f9
KH
163 unsigned char *src = coding->source;
164 unsigned char *src_end = coding->source + coding->src_bytes;
165 int multibytep = coding->src_multibyte;
166 int c;
167 int found = 0;
168 ...;
169
170 while (1)
171 {
172 /* Get one byte from the source. If the souce is exausted, jump
173 to no_more_source:. */
174 ONE_MORE_BYTE (c);
175 /* Check if it conforms to XXX. If not, break the loop. */
176 }
177 /* As the data is invalid for XXX, reset a proper bits. */
178 *mask &= ~CODING_CATEGORY_XXX;
179 return 0;
180 no_more_source:
181 /* The source exausted. */
182 if (!found)
183 /* ASCII characters only. */
184 return 0;
185 /* Some data should be decoded into non-ASCII characters. */
186 *mask &= CODING_CATEGORY_XXX;
187 return 1;
4ed46869
KH
188}
189#endif
190
191/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
192
df7492f9
KH
193 These functions decode a byte sequence specified as a source by
194 CODING. The resulting multibyte text goes to a place pointed to by
195 CODING->charbuf, the length of which should not exceed
196 CODING->charbuf_size;
d46c5b12 197
df7492f9
KH
198 These functions set the information of original and decoded texts in
199 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
200 They also set CODING->result to one of CODING_RESULT_XXX indicating
201 how the decoding is finished.
d46c5b12 202
df7492f9 203 Below is the template of these functions. */
d46c5b12 204
4ed46869 205#if 0
b73bfc1c 206static void
df7492f9 207decode_coding_XXXX (coding)
4ed46869 208 struct coding_system *coding;
4ed46869 209{
df7492f9
KH
210 unsigned char *src = coding->source + coding->consumed;
211 unsigned char *src_end = coding->source + coding->src_bytes;
212 /* SRC_BASE remembers the start position in source in each loop.
213 The loop will be exited when there's not enough source code, or
214 when there's no room in CHARBUF for a decoded character. */
215 unsigned char *src_base;
216 /* A buffer to produce decoded characters. */
217 int *charbuf = coding->charbuf;
218 int *charbuf_end = charbuf + coding->charbuf_size;
219 int multibytep = coding->src_multibyte;
220
221 while (1)
222 {
223 src_base = src;
224 if (charbuf < charbuf_end)
225 /* No more room to produce a decoded character. */
226 break;
227 ONE_MORE_BYTE (c);
228 /* Decode it. */
229 }
230
231 no_more_source:
232 if (src_base < src_end
233 && coding->mode & CODING_MODE_LAST_BLOCK)
234 /* If the source ends by partial bytes to construct a character,
235 treat them as eight-bit raw data. */
236 while (src_base < src_end && charbuf < charbuf_end)
237 *charbuf++ = *src_base++;
238 /* Remember how many bytes and characters we consumed. If the
239 source is multibyte, the bytes and chars are not identical. */
240 coding->consumed = coding->consumed_char = src_base - coding->source;
241 /* Remember how many characters we produced. */
242 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
243}
244#endif
245
246/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
247
df7492f9
KH
248 These functions encode SRC_BYTES length text at SOURCE of Emacs'
249 internal multibyte format by CODING. The resulting byte sequence
b73bfc1c
KH
250 goes to a place pointed to by DESTINATION, the length of which
251 should not exceed DST_BYTES.
d46c5b12 252
df7492f9
KH
253 These functions set the information of original and encoded texts in
254 the members produced, produced_char, consumed, and consumed_char of
255 the structure *CODING. They also set the member result to one of
256 CODING_RESULT_XXX indicating how the encoding finished.
d46c5b12 257
df7492f9
KH
258 DST_BYTES zero means that source area and destination area are
259 overlapped, which means that we can produce a encoded text until it
260 reaches at the head of not-yet-encoded source text.
d46c5b12 261
df7492f9 262 Below is a template of these functions. */
4ed46869 263#if 0
b73bfc1c 264static void
df7492f9 265encode_coding_XXX (coding)
4ed46869 266 struct coding_system *coding;
4ed46869 267{
df7492f9
KH
268 int multibytep = coding->dst_multibyte;
269 int *charbuf = coding->charbuf;
270 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
271 unsigned char *dst = coding->destination + coding->produced;
272 unsigned char *dst_end = coding->destination + coding->dst_bytes;
273 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
274 int produced_chars = 0;
275
276 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
277 {
278 int c = *charbuf;
279 /* Encode C into DST, and increment DST. */
280 }
281 label_no_more_destination:
282 /* How many chars and bytes we produced. */
283 coding->produced_char += produced_chars;
284 coding->produced = dst - coding->destination;
4ed46869
KH
285}
286#endif
287
4ed46869
KH
288\f
289/*** 1. Preamble ***/
290
68c45bf0 291#include <config.h>
4ed46869
KH
292#include <stdio.h>
293
4ed46869
KH
294#include "lisp.h"
295#include "buffer.h"
df7492f9 296#include "character.h"
4ed46869
KH
297#include "charset.h"
298#include "ccl.h"
df7492f9 299#include "composite.h"
4ed46869
KH
300#include "coding.h"
301#include "window.h"
302
df7492f9 303Lisp_Object Vcoding_system_hash_table;
4ed46869 304
df7492f9
KH
305Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
306Lisp_Object Qunix, Qdos, Qmac;
4ed46869
KH
307Lisp_Object Qbuffer_file_coding_system;
308Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
df7492f9 309Lisp_Object Qdefault_char;
27901516 310Lisp_Object Qno_conversion, Qundecided;
df7492f9
KH
311Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
312Lisp_Object Qutf_16_be_nosig, Qutf_16_be, Qutf_16_le_nosig, Qutf_16_le;
313Lisp_Object Qsignature, Qendian, Qbig, Qlittle;
bb0115a2 314Lisp_Object Qcoding_system_history;
1397dc18 315Lisp_Object Qvalid_codes;
4ed46869
KH
316
317extern Lisp_Object Qinsert_file_contents, Qwrite_region;
318Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
319Lisp_Object Qstart_process, Qopen_network_stream;
320Lisp_Object Qtarget_idx;
321
d46c5b12
KH
322Lisp_Object Vselect_safe_coding_system_function;
323
7722baf9
EZ
324/* Mnemonic string for each format of end-of-line. */
325Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
326/* Mnemonic string to indicate format of end-of-line is not yet
4ed46869 327 decided. */
7722baf9 328Lisp_Object eol_mnemonic_undecided;
4ed46869
KH
329
330#ifdef emacs
331
4608c386
KH
332Lisp_Object Vcoding_system_list, Vcoding_system_alist;
333
334Lisp_Object Qcoding_system_p, Qcoding_system_error;
4ed46869 335
d46c5b12
KH
336/* Coding system emacs-mule and raw-text are for converting only
337 end-of-line format. */
338Lisp_Object Qemacs_mule, Qraw_text;
9ce27fde 339
4ed46869
KH
340/* Coding-systems are handed between Emacs Lisp programs and C internal
341 routines by the following three variables. */
342/* Coding-system for reading files and receiving data from process. */
343Lisp_Object Vcoding_system_for_read;
344/* Coding-system for writing files and sending data to process. */
345Lisp_Object Vcoding_system_for_write;
346/* Coding-system actually used in the latest I/O. */
347Lisp_Object Vlast_coding_system_used;
348
c4825358 349/* A vector of length 256 which contains information about special
94487c4e 350 Latin codes (especially for dealing with Microsoft codes). */
3f003981 351Lisp_Object Vlatin_extra_code_table;
c4825358 352
9ce27fde
KH
353/* Flag to inhibit code conversion of end-of-line format. */
354int inhibit_eol_conversion;
355
74383408
KH
356/* Flag to inhibit ISO2022 escape sequence detection. */
357int inhibit_iso_escape_detection;
358
ed29121d
EZ
359/* Flag to make buffer-file-coding-system inherit from process-coding. */
360int inherit_process_coding_system;
361
c4825358 362/* Coding system to be used to encode text for terminal display. */
4ed46869
KH
363struct coding_system terminal_coding;
364
c4825358
KH
365/* Coding system to be used to encode text for terminal display when
366 terminal coding system is nil. */
367struct coding_system safe_terminal_coding;
368
369/* Coding system of what is sent from terminal keyboard. */
4ed46869
KH
370struct coding_system keyboard_coding;
371
02ba4723
KH
372Lisp_Object Vfile_coding_system_alist;
373Lisp_Object Vprocess_coding_system_alist;
374Lisp_Object Vnetwork_coding_system_alist;
4ed46869 375
68c45bf0
PE
376Lisp_Object Vlocale_coding_system;
377
4ed46869
KH
378#endif /* emacs */
379
f967223b
KH
380/* Flag to tell if we look up translation table on character code
381 conversion. */
84fbb8a0 382Lisp_Object Venable_character_translation;
f967223b
KH
383/* Standard translation table to look up on decoding (reading). */
384Lisp_Object Vstandard_translation_table_for_decode;
385/* Standard translation table to look up on encoding (writing). */
386Lisp_Object Vstandard_translation_table_for_encode;
84fbb8a0 387
f967223b
KH
388Lisp_Object Qtranslation_table;
389Lisp_Object Qtranslation_table_id;
390Lisp_Object Qtranslation_table_for_decode;
391Lisp_Object Qtranslation_table_for_encode;
4ed46869
KH
392
393/* Alist of charsets vs revision number. */
df7492f9 394static Lisp_Object Vcharset_revision_table;
4ed46869 395
02ba4723
KH
396/* Default coding systems used for process I/O. */
397Lisp_Object Vdefault_process_coding_system;
398
b843d1ae
KH
399/* Global flag to tell that we can't call post-read-conversion and
400 pre-write-conversion functions. Usually the value is zero, but it
401 is set to 1 temporarily while such functions are running. This is
402 to avoid infinite recursive call. */
403static int inhibit_pre_post_conversion;
404
05e6f5dc
KH
405/* Char-table containing safe coding systems of each character. */
406Lisp_Object Vchar_coding_system_table;
407Lisp_Object Qchar_coding_system;
408
df7492f9
KH
409/* Two special coding systems. */
410Lisp_Object Vsjis_coding_system;
411Lisp_Object Vbig5_coding_system;
412
413
414static int detect_coding_utf_8 P_ ((struct coding_system *, int *));
415static void decode_coding_utf_8 P_ ((struct coding_system *));
416static int encode_coding_utf_8 P_ ((struct coding_system *));
417
418static int detect_coding_utf_16 P_ ((struct coding_system *, int *));
419static void decode_coding_utf_16 P_ ((struct coding_system *));
420static int encode_coding_utf_16 P_ ((struct coding_system *));
421
422static int detect_coding_iso_2022 P_ ((struct coding_system *, int *));
423static void decode_coding_iso_2022 P_ ((struct coding_system *));
424static int encode_coding_iso_2022 P_ ((struct coding_system *));
425
426static int detect_coding_emacs_mule P_ ((struct coding_system *, int *));
427static void decode_coding_emacs_mule P_ ((struct coding_system *));
428static int encode_coding_emacs_mule P_ ((struct coding_system *));
429
430static int detect_coding_sjis P_ ((struct coding_system *, int *));
431static void decode_coding_sjis P_ ((struct coding_system *));
432static int encode_coding_sjis P_ ((struct coding_system *));
433
434static int detect_coding_big5 P_ ((struct coding_system *, int *));
435static void decode_coding_big5 P_ ((struct coding_system *));
436static int encode_coding_big5 P_ ((struct coding_system *));
437
438static int detect_coding_ccl P_ ((struct coding_system *, int *));
439static void decode_coding_ccl P_ ((struct coding_system *));
440static int encode_coding_ccl P_ ((struct coding_system *));
441
442static void decode_coding_raw_text P_ ((struct coding_system *));
443static int encode_coding_raw_text P_ ((struct coding_system *));
444
445
446/* ISO2022 section */
447
448#define CODING_ISO_INITIAL(coding, reg) \
449 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
450 coding_attr_iso_initial), \
451 reg)))
452
453
454#define CODING_ISO_REQUEST(coding, charset_id) \
455 ((charset_id <= (coding)->max_charset_id \
456 ? (coding)->safe_charsets[charset_id] \
457 : -1))
458
459
460#define CODING_ISO_FLAGS(coding) \
461 ((coding)->spec.iso_2022.flags)
462#define CODING_ISO_DESIGNATION(coding, reg) \
463 ((coding)->spec.iso_2022.current_designation[reg])
464#define CODING_ISO_INVOCATION(coding, plane) \
465 ((coding)->spec.iso_2022.current_invocation[plane])
466#define CODING_ISO_SINGLE_SHIFTING(coding) \
467 ((coding)->spec.iso_2022.single_shifting)
468#define CODING_ISO_BOL(coding) \
469 ((coding)->spec.iso_2022.bol)
470#define CODING_ISO_INVOKED_CHARSET(coding, plane) \
471 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
472
473/* Control characters of ISO2022. */
474 /* code */ /* function */
475#define ISO_CODE_LF 0x0A /* line-feed */
476#define ISO_CODE_CR 0x0D /* carriage-return */
477#define ISO_CODE_SO 0x0E /* shift-out */
478#define ISO_CODE_SI 0x0F /* shift-in */
479#define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
480#define ISO_CODE_ESC 0x1B /* escape */
481#define ISO_CODE_SS2 0x8E /* single-shift-2 */
482#define ISO_CODE_SS3 0x8F /* single-shift-3 */
483#define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
484
485/* All code (1-byte) of ISO2022 is classified into one of the
486 followings. */
487enum iso_code_class_type
488 {
489 ISO_control_0, /* Control codes in the range
490 0x00..0x1F and 0x7F, except for the
491 following 5 codes. */
492 ISO_carriage_return, /* ISO_CODE_CR (0x0D) */
493 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
494 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
495 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
496 ISO_escape, /* ISO_CODE_SO (0x1B) */
497 ISO_control_1, /* Control codes in the range
498 0x80..0x9F, except for the
499 following 3 codes. */
500 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
501 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
502 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
503 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
504 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
505 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
506 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
507 };
05e6f5dc 508
df7492f9
KH
509/** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
510 `iso-flags' attribute of an iso2022 coding system. */
93dec019 511
df7492f9
KH
512/* If set, produce long-form designation sequence (e.g. ESC $ ( A)
513 instead of the correct short-form sequence (e.g. ESC $ A). */
514#define CODING_ISO_FLAG_LONG_FORM 0x0001
05e6f5dc 515
df7492f9
KH
516/* If set, reset graphic planes and registers at end-of-line to the
517 initial state. */
518#define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
05e6f5dc 519
df7492f9
KH
520/* If set, reset graphic planes and registers before any control
521 characters to the initial state. */
522#define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
4ed46869 523
df7492f9
KH
524/* If set, encode by 7-bit environment. */
525#define CODING_ISO_FLAG_SEVEN_BITS 0x0008
b73bfc1c 526
df7492f9
KH
527/* If set, use locking-shift function. */
528#define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
b73bfc1c 529
df7492f9
KH
530/* If set, use single-shift function. Overwrite
531 CODING_ISO_FLAG_LOCKING_SHIFT. */
532#define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
b73bfc1c 533
df7492f9
KH
534/* If set, use designation escape sequence. */
535#define CODING_ISO_FLAG_DESIGNATION 0x0040
b73bfc1c 536
df7492f9
KH
537/* If set, produce revision number sequence. */
538#define CODING_ISO_FLAG_REVISION 0x0080
f4dee582 539
df7492f9
KH
540/* If set, produce ISO6429's direction specifying sequence. */
541#define CODING_ISO_FLAG_DIRECTION 0x0100
4ed46869 542
df7492f9
KH
543/* If set, assume designation states are reset at beginning of line on
544 output. */
545#define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
aa72b389 546
df7492f9
KH
547/* If set, designation sequence should be placed at beginning of line
548 on output. */
549#define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
aa72b389 550
df7492f9
KH
551/* If set, do not encode unsafe charactes on output. */
552#define CODING_ISO_FLAG_SAFE 0x0800
aa72b389 553
df7492f9
KH
554/* If set, extra latin codes (128..159) are accepted as a valid code
555 on input. */
556#define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
aa72b389 557
df7492f9 558#define CODING_ISO_FLAG_COMPOSITION 0x2000
aa72b389 559
df7492f9 560#define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
aa72b389 561
df7492f9 562#define CODING_ISO_FLAG_FULL_SUPPORT 0x8000
aa72b389 563
df7492f9
KH
564/* A character to be produced on output if encoding of the original
565 character is prohibited by CODING_ISO_FLAG_SAFE. */
566#define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
aa72b389 567
aa72b389 568
df7492f9
KH
569/* UTF-16 section */
570#define CODING_UTF_16_BOM(coding) \
571 ((coding)->spec.utf_16.bom)
4ed46869 572
df7492f9
KH
573#define CODING_UTF_16_ENDIAN(coding) \
574 ((coding)->spec.utf_16.endian)
4ed46869 575
df7492f9
KH
576#define CODING_UTF_16_SURROGATE(coding) \
577 ((coding)->spec.utf_16.surrogate)
4ed46869 578
4ed46869 579
df7492f9
KH
580/* CCL section */
581#define CODING_CCL_DECODER(coding) \
582 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
583#define CODING_CCL_ENCODER(coding) \
584 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
585#define CODING_CCL_VALIDS(coding) \
586 (XSTRING (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)) \
587 ->data)
4ed46869 588
df7492f9 589/* Index for each coding category in `coding_category_table' */
4ed46869 590
df7492f9
KH
591enum coding_category
592 {
593 coding_category_iso_7,
594 coding_category_iso_7_tight,
595 coding_category_iso_8_1,
596 coding_category_iso_8_2,
597 coding_category_iso_7_else,
598 coding_category_iso_8_else,
599 coding_category_utf_8,
600 coding_category_utf_16_auto,
601 coding_category_utf_16_be,
602 coding_category_utf_16_le,
603 coding_category_utf_16_be_nosig,
604 coding_category_utf_16_le_nosig,
605 coding_category_charset,
606 coding_category_sjis,
607 coding_category_big5,
608 coding_category_ccl,
609 coding_category_emacs_mule,
610 /* All above are targets of code detection. */
611 coding_category_raw_text,
612 coding_category_undecided,
613 coding_category_max
614 };
615
616/* Definitions of flag bits used in detect_coding_XXXX. */
617#define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
618#define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
619#define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
620#define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
621#define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
622#define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
623#define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
624#define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
625#define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
626#define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
627#define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
628#define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
629#define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
630#define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
631#define CATEGORY_MASK_CCL (1 << coding_category_ccl)
632#define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
633
634/* This value is returned if detect_coding_mask () find nothing other
635 than ASCII characters. */
636#define CATEGORY_MASK_ANY \
637 (CATEGORY_MASK_ISO_7 \
638 | CATEGORY_MASK_ISO_7_TIGHT \
639 | CATEGORY_MASK_ISO_8_1 \
640 | CATEGORY_MASK_ISO_8_2 \
641 | CATEGORY_MASK_ISO_7_ELSE \
642 | CATEGORY_MASK_ISO_8_ELSE \
643 | CATEGORY_MASK_UTF_8 \
644 | CATEGORY_MASK_UTF_16_BE \
645 | CATEGORY_MASK_UTF_16_LE \
646 | CATEGORY_MASK_UTF_16_BE_NOSIG \
647 | CATEGORY_MASK_UTF_16_LE_NOSIG \
648 | CATEGORY_MASK_CHARSET \
649 | CATEGORY_MASK_SJIS \
650 | CATEGORY_MASK_BIG5 \
651 | CATEGORY_MASK_CCL \
652 | CATEGORY_MASK_EMACS_MULE)
653
654
655#define CATEGORY_MASK_ISO_7BIT \
656 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
657
658#define CATEGORY_MASK_ISO_8BIT \
659 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
660
661#define CATEGORY_MASK_ISO_ELSE \
662 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
663
664#define CATEGORY_MASK_ISO_ESCAPE \
665 (CATEGORY_MASK_ISO_7 \
666 | CATEGORY_MASK_ISO_7_TIGHT \
667 | CATEGORY_MASK_ISO_7_ELSE \
668 | CATEGORY_MASK_ISO_8_ELSE)
669
670#define CATEGORY_MASK_ISO \
671 ( CATEGORY_MASK_ISO_7BIT \
672 | CATEGORY_MASK_ISO_8BIT \
673 | CATEGORY_MASK_ISO_ELSE)
674
675#define CATEGORY_MASK_UTF_16 \
676 (CATEGORY_MASK_UTF_16_BE \
677 | CATEGORY_MASK_UTF_16_LE \
678 | CATEGORY_MASK_UTF_16_BE_NOSIG \
679 | CATEGORY_MASK_UTF_16_LE_NOSIG)
680
681
682/* List of symbols `coding-category-xxx' ordered by priority. This
683 variable is exposed to Emacs Lisp. */
684static Lisp_Object Vcoding_category_list;
685
686/* Table of coding categories (Lisp symbols). This variable is for
687 internal use oly. */
688static Lisp_Object Vcoding_category_table;
689
690/* Table of coding-categories ordered by priority. */
691static enum coding_category coding_priorities[coding_category_max];
692
693/* Nth element is a coding context for the coding system bound to the
694 Nth coding category. */
695static struct coding_system coding_categories[coding_category_max];
696
697static int detected_mask[coding_category_raw_text] =
698 { CATEGORY_MASK_ISO,
699 CATEGORY_MASK_ISO,
700 CATEGORY_MASK_ISO,
701 CATEGORY_MASK_ISO,
702 CATEGORY_MASK_ISO,
703 CATEGORY_MASK_ISO,
704 CATEGORY_MASK_UTF_8,
705 CATEGORY_MASK_UTF_16,
706 CATEGORY_MASK_UTF_16,
707 CATEGORY_MASK_UTF_16,
708 CATEGORY_MASK_UTF_16,
709 CATEGORY_MASK_UTF_16,
710 CATEGORY_MASK_CHARSET,
711 CATEGORY_MASK_SJIS,
712 CATEGORY_MASK_BIG5,
713 CATEGORY_MASK_CCL,
714 CATEGORY_MASK_EMACS_MULE
715 };
716
717/*** Commonly used macros and functions ***/
718
719#ifndef min
720#define min(a, b) ((a) < (b) ? (a) : (b))
721#endif
722#ifndef max
723#define max(a, b) ((a) > (b) ? (a) : (b))
724#endif
4ed46869 725
df7492f9
KH
726#define CODING_GET_INFO(coding, attrs, eol_type, charset_list) \
727 do { \
728 attrs = CODING_ID_ATTRS (coding->id); \
729 eol_type = CODING_ID_EOL_TYPE (coding->id); \
730 if (VECTORP (eol_type)) \
731 eol_type = Qunix; \
732 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
733 } while (0)
4ed46869 734
4ed46869 735
df7492f9
KH
736/* Safely get one byte from the source text pointed by SRC which ends
737 at SRC_END, and set C to that byte. If there are not enough bytes
738 in the source, it jumps to `no_more_source'. The caller
739 should declare and set these variables appropriately in advance:
740 src, src_end, multibytep
741*/
aa72b389 742
df7492f9 743#define ONE_MORE_BYTE(c) \
aa72b389 744 do { \
df7492f9
KH
745 if (src == src_end) \
746 { \
747 if (src_base < src) \
748 coding->result = CODING_RESULT_INSUFFICIENT_SRC; \
749 goto no_more_source; \
750 } \
751 c = *src++; \
752 if (multibytep && (c & 0x80)) \
753 { \
754 if ((c & 0xFE) != 0xC0) \
755 error ("Undecodable char found"); \
756 c = ((c & 1) << 6) | *src++; \
757 } \
758 consumed_chars++; \
aa72b389
KH
759 } while (0)
760
aa72b389 761
df7492f9
KH
762#define ONE_MORE_BYTE_NO_CHECK(c) \
763 do { \
764 c = *src++; \
765 if (multibytep && (c & 0x80)) \
766 { \
767 if ((c & 0xFE) != 0xC0) \
768 error ("Undecodable char found"); \
769 c = ((c & 1) << 6) | *src++; \
770 } \
aa72b389
KH
771 } while (0)
772
aa72b389 773
df7492f9
KH
774/* Store a byte C in the place pointed by DST and increment DST to the
775 next free point, and increment PRODUCED_CHARS. The caller should
776 assure that C is 0..127, and declare and set the variable `dst'
777 appropriately in advance.
778*/
aa72b389
KH
779
780
df7492f9
KH
781#define EMIT_ONE_ASCII_BYTE(c) \
782 do { \
783 produced_chars++; \
784 *dst++ = (c); \
785 } while (0)
aa72b389 786
aa72b389 787
df7492f9 788/* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */
aa72b389 789
df7492f9
KH
790#define EMIT_TWO_ASCII_BYTES(c1, c2) \
791 do { \
792 produced_chars += 2; \
793 *dst++ = (c1), *dst++ = (c2); \
794 } while (0)
aa72b389 795
df7492f9
KH
796
797/* Store a byte C in the place pointed by DST and increment DST to the
798 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
799 nonzero, store in an appropriate multibyte from. The caller should
800 declare and set the variables `dst' and `multibytep' appropriately
801 in advance. */
802
803#define EMIT_ONE_BYTE(c) \
804 do { \
805 produced_chars++; \
806 if (multibytep) \
807 { \
808 int ch = (c); \
809 if (ch >= 0x80) \
810 ch = BYTE8_TO_CHAR (ch); \
811 CHAR_STRING_ADVANCE (ch, dst); \
812 } \
813 else \
814 *dst++ = (c); \
aa72b389
KH
815 } while (0)
816
817
df7492f9 818/* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
aa72b389 819
e19c3639
KH
820#define EMIT_TWO_BYTES(c1, c2) \
821 do { \
822 produced_chars += 2; \
823 if (multibytep) \
824 { \
825 int ch; \
826 \
827 ch = (c1); \
828 if (ch >= 0x80) \
829 ch = BYTE8_TO_CHAR (ch); \
830 CHAR_STRING_ADVANCE (ch, dst); \
831 ch = (c2); \
832 if (ch >= 0x80) \
833 ch = BYTE8_TO_CHAR (ch); \
834 CHAR_STRING_ADVANCE (ch, dst); \
835 } \
836 else \
837 { \
838 *dst++ = (c1); \
839 *dst++ = (c2); \
840 } \
aa72b389
KH
841 } while (0)
842
843
df7492f9
KH
844#define EMIT_THREE_BYTES(c1, c2, c3) \
845 do { \
846 EMIT_ONE_BYTE (c1); \
847 EMIT_TWO_BYTES (c2, c3); \
848 } while (0)
aa72b389 849
aa72b389 850
df7492f9
KH
851#define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
852 do { \
853 EMIT_TWO_BYTES (c1, c2); \
854 EMIT_TWO_BYTES (c3, c4); \
855 } while (0)
aa72b389 856
aa72b389 857
df7492f9
KH
858#define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
859 do { \
860 charset_map_loaded = 0; \
861 c = DECODE_CHAR (charset, code); \
862 if (charset_map_loaded) \
863 { \
864 unsigned char *orig = coding->source; \
865 EMACS_INT offset; \
866 \
867 coding_set_source (coding); \
868 offset = coding->source - orig; \
869 src += offset; \
870 src_base += offset; \
871 src_end += offset; \
872 } \
873 } while (0)
aa72b389 874
aa72b389 875
df7492f9
KH
876#define ASSURE_DESTINATION(bytes) \
877 do { \
878 if (dst + (bytes) >= dst_end) \
879 { \
880 int more_bytes = charbuf_end - charbuf + (bytes); \
881 \
882 dst = alloc_destination (coding, more_bytes, dst); \
883 dst_end = coding->destination + coding->dst_bytes; \
884 } \
885 } while (0)
b1887814 886
df7492f9
KH
887
888
889static void
890coding_set_source (coding)
891 struct coding_system *coding;
892{
893 if (BUFFERP (coding->src_object))
894 {
895 if (coding->src_pos < 0)
896 coding->source = GAP_END_ADDR + coding->src_pos_byte;
897 else
898 {
e19c3639 899 struct buffer *buf = XBUFFER (coding->src_object);
e19c3639
KH
900 EMACS_INT gpt_byte = BUF_GPT_BYTE (buf);
901 unsigned char *beg_addr = BUF_BEG_ADDR (buf);
902
903 coding->source = beg_addr + coding->src_pos_byte - 1;
904 if (coding->src_pos_byte >= gpt_byte)
905 coding->source += BUF_GAP_SIZE (buf);
aa72b389
KH
906 }
907 }
df7492f9 908 else if (STRINGP (coding->src_object))
aa72b389 909 {
df7492f9
KH
910 coding->source = (XSTRING (coding->src_object)->data
911 + coding->src_pos_byte);
912 }
913 else
914 /* Otherwise, the source is C string and is never relocated
915 automatically. Thus we don't have to update anything. */
916 ;
917}
918
919static void
920coding_set_destination (coding)
921 struct coding_system *coding;
922{
923 if (BUFFERP (coding->dst_object))
924 {
925 /* We are sure that coding->dst_pos_byte is before the gap of the
926 buffer. */
927 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
928 + coding->dst_pos_byte - 1);
929 if (coding->src_pos < 0)
df7492f9
KH
930 coding->dst_bytes = (GAP_END_ADDR
931 - (coding->src_bytes - coding->consumed)
932 - coding->destination);
933 else
934 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
935 - coding->destination);
936 }
937 else
938 /* Otherwise, the destination is C string and is never relocated
939 automatically. Thus we don't have to update anything. */
940 ;
941}
942
943
944static void
945coding_alloc_by_realloc (coding, bytes)
946 struct coding_system *coding;
947 EMACS_INT bytes;
948{
949 coding->destination = (unsigned char *) xrealloc (coding->destination,
950 coding->dst_bytes + bytes);
951 coding->dst_bytes += bytes;
952}
953
954static void
955coding_alloc_by_making_gap (coding, bytes)
956 struct coding_system *coding;
957 EMACS_INT bytes;
958{
2c78b7e1
KH
959 if (BUFFERP (coding->dst_object)
960 && EQ (coding->src_object, coding->dst_object))
df7492f9
KH
961 {
962 EMACS_INT add = coding->src_bytes - coding->consumed;
963
964 GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
965 make_gap (bytes);
966 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
967 }
968 else
969 {
2c78b7e1
KH
970 Lisp_Object this_buffer;
971
972 this_buffer = Fcurrent_buffer ();
df7492f9
KH
973 set_buffer_internal (XBUFFER (coding->dst_object));
974 make_gap (bytes);
975 set_buffer_internal (XBUFFER (this_buffer));
976 }
977}
978
979
980static unsigned char *
981alloc_destination (coding, nbytes, dst)
982 struct coding_system *coding;
983 int nbytes;
984 unsigned char *dst;
985{
986 EMACS_INT offset = dst - coding->destination;
987
988 if (BUFFERP (coding->dst_object))
989 coding_alloc_by_making_gap (coding, nbytes);
990 else
991 coding_alloc_by_realloc (coding, nbytes);
992 coding->result = CODING_RESULT_SUCCESS;
993 coding_set_destination (coding);
994 dst = coding->destination + offset;
995 return dst;
996}
aa72b389 997
df7492f9
KH
998\f
999/*** 2. Emacs' internal format (emacs-utf-8) ***/
1000
1001
1002
1003\f
1004/*** 3. UTF-8 ***/
1005
1006/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1007 Check if a text is encoded in UTF-8. If it is, return
1008 CATEGORY_MASK_UTF_8, else return 0. */
1009
1010#define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1011#define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1012#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1013#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1014#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1015#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1016
1017static int
1018detect_coding_utf_8 (coding, mask)
1019 struct coding_system *coding;
1020 int *mask;
1021{
1022 unsigned char *src = coding->source, *src_base = src;
1023 unsigned char *src_end = coding->source + coding->src_bytes;
1024 int multibytep = coding->src_multibyte;
1025 int consumed_chars = 0;
1026 int found = 0;
1027
1028 /* A coding system of this category is always ASCII compatible. */
1029 src += coding->head_ascii;
1030
1031 while (1)
1032 {
1033 int c, c1, c2, c3, c4;
1034
1035 ONE_MORE_BYTE (c);
1036 if (UTF_8_1_OCTET_P (c))
1037 continue;
1038 ONE_MORE_BYTE (c1);
1039 if (! UTF_8_EXTRA_OCTET_P (c1))
1040 break;
1041 if (UTF_8_2_OCTET_LEADING_P (c))
1042 {
1043 found++;
1044 continue;
1045 }
1046 ONE_MORE_BYTE (c2);
1047 if (! UTF_8_EXTRA_OCTET_P (c2))
1048 break;
1049 if (UTF_8_3_OCTET_LEADING_P (c))
1050 {
1051 found++;
1052 continue;
1053 }
1054 ONE_MORE_BYTE (c3);
1055 if (! UTF_8_EXTRA_OCTET_P (c3))
1056 break;
1057 if (UTF_8_4_OCTET_LEADING_P (c))
aa72b389 1058 {
df7492f9
KH
1059 found++;
1060 continue;
1061 }
1062 ONE_MORE_BYTE (c4);
1063 if (! UTF_8_EXTRA_OCTET_P (c4))
1064 break;
1065 if (UTF_8_5_OCTET_LEADING_P (c))
1066 {
1067 found++;
1068 continue;
1069 }
1070 break;
1071 }
1072 *mask &= ~CATEGORY_MASK_UTF_8;
1073 return 0;
1074
1075 no_more_source:
1076 if (! found)
1077 return 0;
1078 *mask &= CATEGORY_MASK_UTF_8;
1079 return 1;
1080}
1081
1082
1083static void
1084decode_coding_utf_8 (coding)
1085 struct coding_system *coding;
1086{
1087 unsigned char *src = coding->source + coding->consumed;
1088 unsigned char *src_end = coding->source + coding->src_bytes;
1089 unsigned char *src_base;
1090 int *charbuf = coding->charbuf;
1091 int *charbuf_end = charbuf + coding->charbuf_size;
1092 int consumed_chars = 0, consumed_chars_base;
1093 int multibytep = coding->src_multibyte;
1094 Lisp_Object attr, eol_type, charset_list;
1095
1096 CODING_GET_INFO (coding, attr, eol_type, charset_list);
1097
1098 while (1)
1099 {
1100 int c, c1, c2, c3, c4, c5;
1101
1102 src_base = src;
1103 consumed_chars_base = consumed_chars;
1104
1105 if (charbuf >= charbuf_end)
1106 break;
1107
1108 ONE_MORE_BYTE (c1);
1109 if (UTF_8_1_OCTET_P(c1))
1110 {
1111 c = c1;
1112 if (c == '\r')
aa72b389 1113 {
df7492f9
KH
1114 if (EQ (eol_type, Qdos))
1115 {
1116 if (src == src_end)
1117 goto no_more_source;
1118 if (*src == '\n')
1119 ONE_MORE_BYTE (c);
1120 }
1121 else if (EQ (eol_type, Qmac))
1122 c = '\n';
aa72b389 1123 }
aa72b389 1124 }
df7492f9 1125 else
aa72b389 1126 {
df7492f9
KH
1127 ONE_MORE_BYTE (c2);
1128 if (! UTF_8_EXTRA_OCTET_P (c2))
1129 goto invalid_code;
1130 if (UTF_8_2_OCTET_LEADING_P (c1))
1131 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1132 else
aa72b389 1133 {
df7492f9
KH
1134 ONE_MORE_BYTE (c3);
1135 if (! UTF_8_EXTRA_OCTET_P (c3))
1136 goto invalid_code;
1137 if (UTF_8_3_OCTET_LEADING_P (c1))
1138 c = (((c1 & 0xF) << 12)
1139 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1140 else
1141 {
1142 ONE_MORE_BYTE (c4);
1143 if (! UTF_8_EXTRA_OCTET_P (c4))
1144 goto invalid_code;
1145 if (UTF_8_4_OCTET_LEADING_P (c1))
1146 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1147 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1148 else
1149 {
1150 ONE_MORE_BYTE (c5);
1151 if (! UTF_8_EXTRA_OCTET_P (c5))
1152 goto invalid_code;
1153 if (UTF_8_5_OCTET_LEADING_P (c1))
1154 {
1155 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1156 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1157 | (c5 & 0x3F));
1158 if (c > MAX_CHAR)
1159 goto invalid_code;
1160 }
1161 else
1162 goto invalid_code;
1163 }
1164 }
aa72b389 1165 }
aa72b389 1166 }
df7492f9
KH
1167
1168 *charbuf++ = c;
1169 continue;
1170
1171 invalid_code:
1172 src = src_base;
1173 consumed_chars = consumed_chars_base;
1174 ONE_MORE_BYTE (c);
1175 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1176 coding->errors++;
aa72b389
KH
1177 }
1178
df7492f9
KH
1179 no_more_source:
1180 coding->consumed_char += consumed_chars_base;
1181 coding->consumed = src_base - coding->source;
1182 coding->charbuf_used = charbuf - coding->charbuf;
1183}
1184
1185
1186static int
1187encode_coding_utf_8 (coding)
1188 struct coding_system *coding;
1189{
1190 int multibytep = coding->dst_multibyte;
1191 int *charbuf = coding->charbuf;
1192 int *charbuf_end = charbuf + coding->charbuf_used;
1193 unsigned char *dst = coding->destination + coding->produced;
1194 unsigned char *dst_end = coding->destination + coding->dst_bytes;
e19c3639 1195 int produced_chars = 0;
df7492f9
KH
1196 int c;
1197
1198 if (multibytep)
aa72b389 1199 {
df7492f9
KH
1200 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1201
1202 while (charbuf < charbuf_end)
aa72b389 1203 {
df7492f9
KH
1204 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1205
1206 ASSURE_DESTINATION (safe_room);
1207 c = *charbuf++;
1208 CHAR_STRING_ADVANCE (c, pend);
1209 for (p = str; p < pend; p++)
1210 EMIT_ONE_BYTE (*p);
aa72b389 1211 }
aa72b389 1212 }
df7492f9
KH
1213 else
1214 {
1215 int safe_room = MAX_MULTIBYTE_LENGTH;
1216
1217 while (charbuf < charbuf_end)
1218 {
1219 ASSURE_DESTINATION (safe_room);
1220 c = *charbuf++;
1221 dst += CHAR_STRING (c, dst);
1222 produced_chars++;
1223 }
1224 }
1225 coding->result = CODING_RESULT_SUCCESS;
1226 coding->produced_char += produced_chars;
1227 coding->produced = dst - coding->destination;
1228 return 0;
aa72b389
KH
1229}
1230
4ed46869 1231
df7492f9
KH
1232/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1233 Check if a text is encoded in UTF-16 Big Endian (endian == 1) or
1234 Little Endian (otherwise). If it is, return
1235 CATEGORY_MASK_UTF_16_BE or CATEGORY_MASK_UTF_16_LE,
1236 else return 0. */
1237
1238#define UTF_16_HIGH_SURROGATE_P(val) \
1239 (((val) & 0xFC00) == 0xD800)
1240
1241#define UTF_16_LOW_SURROGATE_P(val) \
1242 (((val) & 0xFC00) == 0xDC00)
1243
1244#define UTF_16_INVALID_P(val) \
1245 (((val) == 0xFFFE) \
1246 || ((val) == 0xFFFF) \
1247 || UTF_16_LOW_SURROGATE_P (val))
1248
1249
1250static int
1251detect_coding_utf_16 (coding, mask)
b73bfc1c 1252 struct coding_system *coding;
df7492f9 1253 int *mask;
b73bfc1c 1254{
df7492f9
KH
1255 unsigned char *src = coding->source, *src_base = src;
1256 unsigned char *src_end = coding->source + coding->src_bytes;
1257 int multibytep = coding->src_multibyte;
1258 int consumed_chars = 0;
1259 int c1, c2;
1260
1261 ONE_MORE_BYTE (c1);
1262 ONE_MORE_BYTE (c2);
4ed46869 1263
df7492f9 1264 if ((c1 == 0xFF) && (c2 == 0xFE))
b73bfc1c 1265 {
df7492f9
KH
1266 *mask &= CATEGORY_MASK_UTF_16_LE;
1267 return 1;
1268 }
1269 else if ((c1 == 0xFE) && (c2 == 0xFF))
1270 {
1271 *mask &= CATEGORY_MASK_UTF_16_BE;
1272 return 1;
1273 }
1274 no_more_source:
1275 return 0;
1276}
ec6d2bb8 1277
df7492f9
KH
1278static void
1279decode_coding_utf_16 (coding)
1280 struct coding_system *coding;
1281{
1282 unsigned char *src = coding->source + coding->consumed;
1283 unsigned char *src_end = coding->source + coding->src_bytes;
0be8721c 1284 unsigned char *src_base;
df7492f9
KH
1285 int *charbuf = coding->charbuf;
1286 int *charbuf_end = charbuf + coding->charbuf_size;
1287 int consumed_chars = 0, consumed_chars_base;
1288 int multibytep = coding->src_multibyte;
1289 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1290 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1291 int surrogate = CODING_UTF_16_SURROGATE (coding);
1292 Lisp_Object attr, eol_type, charset_list;
1293
1294 CODING_GET_INFO (coding, attr, eol_type, charset_list);
1295
1296 if (bom != utf_16_without_bom)
1297 {
1298 int c, c1, c2;
4af310db 1299
df7492f9
KH
1300 src_base = src;
1301 ONE_MORE_BYTE (c1);
1302 ONE_MORE_BYTE (c2);
e19c3639 1303 c = (c1 << 8) | c2;
df7492f9
KH
1304 if (bom == utf_16_with_bom)
1305 {
1306 if (endian == utf_16_big_endian
1307 ? c != 0xFFFE : c != 0xFEFF)
4af310db 1308 {
df7492f9
KH
1309 /* We are sure that there's enouph room at CHARBUF. */
1310 *charbuf++ = c1;
1311 *charbuf++ = c2;
1312 coding->errors++;
4af310db 1313 }
4af310db 1314 }
df7492f9 1315 else
4af310db 1316 {
df7492f9
KH
1317 if (c == 0xFFFE)
1318 CODING_UTF_16_ENDIAN (coding)
1319 = endian = utf_16_big_endian;
1320 else if (c == 0xFEFF)
1321 CODING_UTF_16_ENDIAN (coding)
1322 = endian = utf_16_little_endian;
1323 else
4af310db 1324 {
df7492f9
KH
1325 CODING_UTF_16_ENDIAN (coding)
1326 = endian = utf_16_big_endian;
1327 src = src_base;
4af310db 1328 }
4af310db 1329 }
df7492f9
KH
1330 CODING_UTF_16_BOM (coding) = utf_16_with_bom;
1331 }
1332
1333 while (1)
1334 {
1335 int c, c1, c2;
1336
1337 src_base = src;
1338 consumed_chars_base = consumed_chars;
1339
1340 if (charbuf + 2 >= charbuf_end)
1341 break;
1342
1343 ONE_MORE_BYTE (c1);
1344 ONE_MORE_BYTE (c2);
1345 c = (endian == utf_16_big_endian
e19c3639 1346 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
df7492f9 1347 if (surrogate)
aa72b389 1348 {
df7492f9 1349 if (! UTF_16_LOW_SURROGATE_P (c))
aa72b389 1350 {
df7492f9
KH
1351 if (endian == utf_16_big_endian)
1352 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1353 else
1354 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1355 *charbuf++ = c1;
1356 *charbuf++ = c2;
1357 coding->errors++;
1358 if (UTF_16_HIGH_SURROGATE_P (c))
1359 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1360 else
1361 *charbuf++ = c;
aa72b389 1362 }
df7492f9
KH
1363 else
1364 {
1365 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1366 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1367 *charbuf++ = c;
1368 }
1369 }
1370 else
1371 {
1372 if (UTF_16_HIGH_SURROGATE_P (c))
1373 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1374 else
1375 *charbuf++ = c;
1376 }
1377 }
1378
1379 no_more_source:
1380 coding->consumed_char += consumed_chars_base;
1381 coding->consumed = src_base - coding->source;
1382 coding->charbuf_used = charbuf - coding->charbuf;
1383}
1384
1385static int
1386encode_coding_utf_16 (coding)
1387 struct coding_system *coding;
1388{
1389 int multibytep = coding->dst_multibyte;
1390 int *charbuf = coding->charbuf;
1391 int *charbuf_end = charbuf + coding->charbuf_used;
1392 unsigned char *dst = coding->destination + coding->produced;
1393 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1394 int safe_room = 8;
1395 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1396 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1397 int produced_chars = 0;
1398 Lisp_Object attrs, eol_type, charset_list;
1399 int c;
1400
1401 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1402
1403 if (bom == utf_16_with_bom)
1404 {
1405 ASSURE_DESTINATION (safe_room);
1406 if (big_endian)
1407 EMIT_TWO_BYTES (0xFF, 0xFE);
1408 else
1409 EMIT_TWO_BYTES (0xFE, 0xFF);
1410 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1411 }
1412
1413 while (charbuf < charbuf_end)
1414 {
1415 ASSURE_DESTINATION (safe_room);
1416 c = *charbuf++;
e19c3639
KH
1417 if (c >= MAX_UNICODE_CHAR)
1418 c = coding->default_char;
df7492f9
KH
1419
1420 if (c < 0x10000)
1421 {
1422 if (big_endian)
1423 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1424 else
1425 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1426 }
1427 else
1428 {
1429 int c1, c2;
1430
1431 c -= 0x10000;
1432 c1 = (c >> 10) + 0xD800;
1433 c2 = (c & 0x3FF) + 0xDC00;
1434 if (big_endian)
1435 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1436 else
1437 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1438 }
1439 }
1440 coding->result = CODING_RESULT_SUCCESS;
1441 coding->produced = dst - coding->destination;
1442 coding->produced_char += produced_chars;
1443 return 0;
1444}
1445
1446\f
1447/*** 6. Old Emacs' internal format (emacs-mule) ***/
1448
1449/* Emacs' internal format for representation of multiple character
1450 sets is a kind of multi-byte encoding, i.e. characters are
1451 represented by variable-length sequences of one-byte codes.
1452
1453 ASCII characters and control characters (e.g. `tab', `newline') are
1454 represented by one-byte sequences which are their ASCII codes, in
1455 the range 0x00 through 0x7F.
1456
1457 8-bit characters of the range 0x80..0x9F are represented by
1458 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1459 code + 0x20).
1460
1461 8-bit characters of the range 0xA0..0xFF are represented by
1462 one-byte sequences which are their 8-bit code.
1463
1464 The other characters are represented by a sequence of `base
1465 leading-code', optional `extended leading-code', and one or two
1466 `position-code's. The length of the sequence is determined by the
1467 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1468 whereas extended leading-code and position-code take the range 0xA0
1469 through 0xFF. See `charset.h' for more details about leading-code
1470 and position-code.
1471
1472 --- CODE RANGE of Emacs' internal format ---
1473 character set range
1474 ------------- -----
1475 ascii 0x00..0x7F
1476 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1477 eight-bit-graphic 0xA0..0xBF
1478 ELSE 0x81..0x9D + [0xA0..0xFF]+
1479 ---------------------------------------------
1480
1481 As this is the internal character representation, the format is
1482 usually not used externally (i.e. in a file or in a data sent to a
1483 process). But, it is possible to have a text externally in this
1484 format (i.e. by encoding by the coding system `emacs-mule').
1485
1486 In that case, a sequence of one-byte codes has a slightly different
1487 form.
1488
1489 At first, all characters in eight-bit-control are represented by
1490 one-byte sequences which are their 8-bit code.
1491
1492 Next, character composition data are represented by the byte
1493 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1494 where,
1495 METHOD is 0xF0 plus one of composition method (enum
1496 composition_method),
1497
1498 BYTES is 0xA0 plus a byte length of this composition data,
1499
1500 CHARS is 0x20 plus a number of characters composed by this
1501 data,
1502
1503 COMPONENTs are characters of multibye form or composition
1504 rules encoded by two-byte of ASCII codes.
1505
1506 In addition, for backward compatibility, the following formats are
1507 also recognized as composition data on decoding.
1508
1509 0x80 MSEQ ...
1510 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1511
1512 Here,
1513 MSEQ is a multibyte form but in these special format:
1514 ASCII: 0xA0 ASCII_CODE+0x80,
1515 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1516 RULE is a one byte code of the range 0xA0..0xF0 that
1517 represents a composition rule.
1518 */
1519
1520char emacs_mule_bytes[256];
1521
1522/* Leading-code followed by extended leading-code. */
1523#define LEADING_CODE_PRIVATE_11 0x9A /* for private DIMENSION1 of 1-column */
1524#define LEADING_CODE_PRIVATE_12 0x9B /* for private DIMENSION1 of 2-column */
1525#define LEADING_CODE_PRIVATE_21 0x9C /* for private DIMENSION2 of 1-column */
1526#define LEADING_CODE_PRIVATE_22 0x9D /* for private DIMENSION2 of 2-column */
1527
1528
1529int
1530emacs_mule_char (coding, composition, nbytes, nchars)
1531 struct coding_system *coding;
1532 int composition;
1533 int *nbytes, *nchars;
1534{
1535 unsigned char *src = coding->source + coding->consumed;
1536 unsigned char *src_end = coding->source + coding->src_bytes;
1537 int multibytep = coding->src_multibyte;
1538 unsigned char *src_base = src;
1539 struct charset *charset;
1540 unsigned code;
1541 int c;
1542 int consumed_chars = 0;
1543
1544 ONE_MORE_BYTE (c);
1545 if (composition)
1546 {
1547 c -= 0x20;
1548 if (c == 0x80)
1549 {
1550 ONE_MORE_BYTE (c);
1551 if (c < 0xA0)
1552 goto invalid_code;
1553 *nbytes = src - src_base;
1554 *nchars = consumed_chars;
1555 return (c - 0x80);
aa72b389 1556 }
df7492f9
KH
1557 }
1558
1559 switch (emacs_mule_bytes[c])
1560 {
1561 case 2:
1562 if (! (charset = emacs_mule_charset[c]))
1563 goto invalid_code;
1564 ONE_MORE_BYTE (c);
1565 code = c & 0x7F;
1566 break;
1567
1568 case 3:
1569 if (c == LEADING_CODE_PRIVATE_11
1570 || c == LEADING_CODE_PRIVATE_12)
b73bfc1c 1571 {
df7492f9
KH
1572 ONE_MORE_BYTE (c);
1573 if (! (charset = emacs_mule_charset[c]))
1574 goto invalid_code;
1575 ONE_MORE_BYTE (c);
1576 code = c & 0x7F;
b73bfc1c
KH
1577 }
1578 else
1579 {
df7492f9
KH
1580 if (! (charset = emacs_mule_charset[c]))
1581 goto invalid_code;
1582 ONE_MORE_BYTE (c);
1583 code = (c & 0x7F) << 7;
1584 ONE_MORE_BYTE (c);
1585 code |= c & 0x7F;
1586 }
1587 break;
1588
1589 case 4:
1590 if (! (charset = emacs_mule_charset[c]))
1591 goto invalid_code;
1592 ONE_MORE_BYTE (c);
1593 code = (c & 0x7F) << 7;
1594 ONE_MORE_BYTE (c);
1595 code |= c & 0x7F;
1596 break;
1597
1598 case 1:
1599 code = c;
1600 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code) ? charset_ascii
1601 : code < 0xA0 ? charset_8_bit_control
1602 : charset_8_bit_graphic);
1603 break;
1604
1605 default:
1606 abort ();
1607 }
1608 c = DECODE_CHAR (charset, code);
1609 if (c < 0)
1610 goto invalid_code;
1611 *nbytes = src - src_base;
1612 *nchars = consumed_chars;
1613 return c;
1614
1615 no_more_source:
1616 return -2;
1617
1618 invalid_code:
1619 return -1;
1620}
1621
1622
1623/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1624 Check if a text is encoded in `emacs-mule'. */
1625
1626static int
1627detect_coding_emacs_mule (coding, mask)
1628 struct coding_system *coding;
1629 int *mask;
1630{
1631 unsigned char *src = coding->source, *src_base = src;
1632 unsigned char *src_end = coding->source + coding->src_bytes;
1633 int multibytep = coding->src_multibyte;
1634 int consumed_chars = 0;
1635 int c;
1636 int found = 0;
1637
1638 /* A coding system of this category is always ASCII compatible. */
1639 src += coding->head_ascii;
1640
1641 while (1)
1642 {
1643 ONE_MORE_BYTE (c);
1644
1645 if (c == 0x80)
1646 {
1647 /* Perhaps the start of composite character. We simple skip
1648 it because analyzing it is too heavy for detecting. But,
1649 at least, we check that the composite character
1650 constitues of more than 4 bytes. */
1651 unsigned char *src_base;
1652
1653 repeat:
1654 src_base = src;
1655 do
1656 {
1657 ONE_MORE_BYTE (c);
1658 }
1659 while (c >= 0xA0);
1660
1661 if (src - src_base <= 4)
1662 break;
1663 found = 1;
1664 if (c == 0x80)
1665 goto repeat;
b73bfc1c 1666 }
df7492f9
KH
1667
1668 if (c < 0x80)
b73bfc1c 1669 {
df7492f9
KH
1670 if (c < 0x20
1671 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1672 break;
1673 }
1674 else
1675 {
1676 unsigned char *src_base = src - 1;
1677
1678 do
1679 {
1680 ONE_MORE_BYTE (c);
1681 }
1682 while (c >= 0xA0);
1683 if (src - src_base != emacs_mule_bytes[*src_base])
1684 break;
1685 found = 1;
4ed46869
KH
1686 }
1687 }
df7492f9
KH
1688 *mask &= ~CATEGORY_MASK_EMACS_MULE;
1689 return 0;
1690
1691 no_more_source:
1692 if (!found)
1693 return 0;
1694 *mask &= CATEGORY_MASK_EMACS_MULE;
1695 return 1;
4ed46869
KH
1696}
1697
b73bfc1c 1698
df7492f9
KH
1699/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1700
1701/* Decode a character represented as a component of composition
1702 sequence of Emacs 20/21 style at SRC. Set C to that character and
1703 update SRC to the head of next character (or an encoded composition
1704 rule). If SRC doesn't points a composition component, set C to -1.
1705 If SRC points an invalid byte sequence, global exit by a return
1706 value 0. */
1707
1708#define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1709 if (1) \
1710 { \
1711 int c; \
1712 int nbytes, nchars; \
1713 \
1714 if (src == src_end) \
1715 break; \
1716 c = emacs_mule_char (coding, 1, &nbytes, &nchars); \
1717 if (c < 0) \
1718 { \
1719 if (c == -2) \
1720 break; \
1721 goto invalid_code; \
1722 } \
1723 *buf++ = c; \
1724 src += nbytes; \
1725 consumed_chars += nchars; \
1726 } \
1727 else
1728
1729
1730/* Decode a composition rule represented as a component of composition
1731 sequence of Emacs 20 style at SRC. Set C to the rule. If SRC
1732 points an invalid byte sequence, set C to -1. */
1733
1734#define DECODE_EMACS_MULE_COMPOSITION_RULE(buf) \
1735 do { \
1736 int c, gref, nref; \
1737 \
1738 if (src < src_end) \
1739 goto invalid_code; \
1740 ONE_MORE_BYTE_NO_CHECK (c); \
1741 c -= 0xA0; \
1742 if (c < 0 || c >= 81) \
1743 goto invalid_code; \
1744 \
1745 gref = c / 9, nref = c % 9; \
1746 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1747 } while (0)
1748
1749
1750#define ADD_COMPOSITION_DATA(buf, method, nchars) \
1751 do { \
1752 *buf++ = -5; \
1753 *buf++ = coding->produced_char + char_offset; \
1754 *buf++ = CODING_ANNOTATE_COMPOSITION_MASK; \
1755 *buf++ = method; \
1756 *buf++ = nchars; \
1757 } while (0)
aa72b389 1758
df7492f9
KH
1759
1760#define DECODE_EMACS_MULE_21_COMPOSITION(c) \
aa72b389 1761 do { \
df7492f9
KH
1762 /* Emacs 21 style format. The first three bytes at SRC are \
1763 (METHOD - 0xF0), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1764 the byte length of this composition information, CHARS is the \
1765 number of characters composed by this composition. */ \
1766 enum composition_method method = c - 0xF0; \
1767 int consumed_chars_limit; \
1768 int nbytes, nchars; \
1769 \
1770 ONE_MORE_BYTE (c); \
1771 nbytes = c - 0xA0; \
1772 if (nbytes < 3) \
1773 goto invalid_code; \
1774 ONE_MORE_BYTE (c); \
1775 nchars = c - 0xA0; \
1776 ADD_COMPOSITION_DATA (charbuf, method, nchars); \
1777 consumed_chars_limit = consumed_chars_base + nbytes; \
1778 if (method != COMPOSITION_RELATIVE) \
aa72b389 1779 { \
df7492f9
KH
1780 int i = 0; \
1781 while (consumed_chars < consumed_chars_limit) \
aa72b389 1782 { \
df7492f9
KH
1783 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1784 DECODE_EMACS_MULE_COMPOSITION_RULE (charbuf); \
1785 else \
1786 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
aa72b389 1787 } \
df7492f9
KH
1788 if (consumed_chars < consumed_chars_limit) \
1789 goto invalid_code; \
aa72b389
KH
1790 } \
1791 } while (0)
93dec019 1792
aa72b389 1793
df7492f9
KH
1794#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1795 do { \
1796 /* Emacs 20 style format for relative composition. */ \
1797 /* Store multibyte form of characters to be composed. */ \
1798 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1799 int *buf = components; \
1800 int i, j; \
1801 \
1802 src = src_base; \
1803 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1804 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1805 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1806 if (i < 2) \
1807 goto invalid_code; \
1808 ADD_COMPOSITION_DATA (charbuf, COMPOSITION_RELATIVE, i); \
1809 for (j = 0; j < i; j++) \
1810 *charbuf++ = components[j]; \
1811 } while (0)
1812
1813
1814#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
1815 do { \
1816 /* Emacs 20 style format for rule-base composition. */ \
1817 /* Store multibyte form of characters to be composed. */ \
1818 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1819 int *buf = components; \
1820 int i, j; \
1821 \
1822 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1823 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1824 { \
1825 DECODE_EMACS_MULE_COMPOSITION_RULE (buf); \
1826 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1827 } \
1828 if (i < 1 || (buf - components) % 2 == 0) \
1829 goto invalid_code; \
1830 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
1831 goto no_more_source; \
1832 ADD_COMPOSITION_DATA (buf, COMPOSITION_WITH_RULE, i); \
1833 for (j = 0; j < i; j++) \
1834 *charbuf++ = components[j]; \
1835 for (j = 0; j < i; j += 2) \
1836 *charbuf++ = components[j]; \
1837 } while (0)
1838
aa72b389
KH
1839
1840static void
df7492f9 1841decode_coding_emacs_mule (coding)
aa72b389 1842 struct coding_system *coding;
aa72b389 1843{
df7492f9
KH
1844 unsigned char *src = coding->source + coding->consumed;
1845 unsigned char *src_end = coding->source + coding->src_bytes;
aa72b389 1846 unsigned char *src_base;
df7492f9
KH
1847 int *charbuf = coding->charbuf;
1848 int *charbuf_end = charbuf + coding->charbuf_size;
1849 int consumed_chars = 0, consumed_chars_base;
1850 int char_offset = 0;
1851 int multibytep = coding->src_multibyte;
1852 Lisp_Object attrs, eol_type, charset_list;
aa72b389 1853
df7492f9 1854 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
aa72b389 1855
aa72b389
KH
1856 while (1)
1857 {
df7492f9
KH
1858 int c;
1859
aa72b389 1860 src_base = src;
df7492f9
KH
1861 consumed_chars_base = consumed_chars;
1862
1863 if (charbuf >= charbuf_end)
1864 break;
aa72b389 1865
df7492f9
KH
1866 ONE_MORE_BYTE (c);
1867
1868 if (c < 0x80)
aa72b389 1869 {
df7492f9
KH
1870 if (c == '\r')
1871 {
1872 if (EQ (eol_type, Qdos))
1873 {
1874 if (src == src_end)
1875 goto no_more_source;
1876 if (*src == '\n')
1877 ONE_MORE_BYTE (c);
1878 }
1879 else if (EQ (eol_type, Qmac))
1880 c = '\n';
1881 }
1882 *charbuf++ = c;
1883 char_offset++;
aa72b389 1884 }
df7492f9
KH
1885 else if (c == 0x80)
1886 {
1887 if (charbuf + 5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 > charbuf_end)
1888 break;
1889 ONE_MORE_BYTE (c);
1890 if (c - 0xF0 >= COMPOSITION_RELATIVE
1891 && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
1892 DECODE_EMACS_MULE_21_COMPOSITION (c);
1893 else if (c < 0xC0)
1894 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
1895 else if (c == 0xFF)
1896 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
1897 else
1898 goto invalid_code;
1899 }
1900 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
1901 {
1902 int nbytes, nchars;
1903 src--;
1904 c = emacs_mule_char (coding, 0, &nbytes, &nchars);
1905 if (c < 0)
1906 {
1907 if (c == -2)
1908 break;
1909 goto invalid_code;
1910 }
1911 *charbuf++ = c;
1912 char_offset++;
1913 }
1914 continue;
1915
1916 invalid_code:
1917 src = src_base;
1918 consumed_chars = consumed_chars_base;
1919 ONE_MORE_BYTE (c);
1920 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1921 coding->errors++;
1922 }
1923
1924 no_more_source:
1925 coding->consumed_char += consumed_chars_base;
1926 coding->consumed = src_base - coding->source;
1927 coding->charbuf_used = charbuf - coding->charbuf;
1928}
1929
1930
1931#define EMACS_MULE_LEADING_CODES(id, codes) \
1932 do { \
1933 if (id < 0xA0) \
1934 codes[0] = id, codes[1] = 0; \
1935 else if (id < 0xE0) \
1936 codes[0] = 0x9A, codes[1] = id; \
1937 else if (id < 0xF0) \
1938 codes[0] = 0x9B, codes[1] = id; \
1939 else if (id < 0xF5) \
1940 codes[0] = 0x9C, codes[1] = id; \
1941 else \
1942 codes[0] = 0x9D, codes[1] = id; \
1943 } while (0);
1944
aa72b389 1945
df7492f9
KH
1946static int
1947encode_coding_emacs_mule (coding)
1948 struct coding_system *coding;
1949{
1950 int multibytep = coding->dst_multibyte;
1951 int *charbuf = coding->charbuf;
1952 int *charbuf_end = charbuf + coding->charbuf_used;
1953 unsigned char *dst = coding->destination + coding->produced;
1954 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1955 int safe_room = 8;
df7492f9
KH
1956 int produced_chars = 0;
1957 Lisp_Object attrs, eol_type, charset_list;
1958 int c;
1959
1960 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1961
1962 while (charbuf < charbuf_end)
1963 {
1964 ASSURE_DESTINATION (safe_room);
1965 c = *charbuf++;
1966 if (ASCII_CHAR_P (c))
1967 EMIT_ONE_ASCII_BYTE (c);
1968 else
aa72b389 1969 {
df7492f9
KH
1970 struct charset *charset;
1971 unsigned code;
1972 int dimension;
1973 int emacs_mule_id;
1974 unsigned char leading_codes[2];
1975
1976 charset = char_charset (c, charset_list, &code);
1977 if (! charset)
1978 {
1979 c = coding->default_char;
1980 if (ASCII_CHAR_P (c))
1981 {
1982 EMIT_ONE_ASCII_BYTE (c);
1983 continue;
1984 }
1985 charset = char_charset (c, charset_list, &code);
1986 }
1987 dimension = CHARSET_DIMENSION (charset);
1988 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
1989 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
1990 EMIT_ONE_BYTE (leading_codes[0]);
1991 if (leading_codes[1])
1992 EMIT_ONE_BYTE (leading_codes[1]);
1993 if (dimension == 1)
1994 EMIT_ONE_BYTE (code);
aa72b389 1995 else
df7492f9
KH
1996 {
1997 EMIT_ONE_BYTE (code >> 8);
1998 EMIT_ONE_BYTE (code & 0xFF);
1999 }
aa72b389 2000 }
aa72b389 2001 }
df7492f9
KH
2002 coding->result = CODING_RESULT_SUCCESS;
2003 coding->produced_char += produced_chars;
2004 coding->produced = dst - coding->destination;
2005 return 0;
aa72b389 2006}
b73bfc1c 2007
4ed46869 2008\f
df7492f9 2009/*** 7. ISO2022 handlers ***/
4ed46869
KH
2010
2011/* The following note describes the coding system ISO2022 briefly.
39787efd 2012 Since the intention of this note is to help understand the
df7492f9 2013 functions in this file, some parts are NOT ACCURATE or OVERLY
39787efd 2014 SIMPLIFIED. For thorough understanding, please refer to the
df7492f9 2015 original document of ISO2022.
4ed46869
KH
2016
2017 ISO2022 provides many mechanisms to encode several character sets
df7492f9 2018 in 7-bit and 8-bit environments. For 7-bite environments, all text
39787efd
KH
2019 is encoded using bytes less than 128. This may make the encoded
2020 text a little bit longer, but the text passes more easily through
df7492f9 2021 several gateways, some of which strip off MSB (Most Signigant Bit).
b73bfc1c 2022
df7492f9
KH
2023 There are two kinds of character sets: control character set and
2024 graphic character set. The former contains control characters such
4ed46869 2025 as `newline' and `escape' to provide control functions (control
39787efd 2026 functions are also provided by escape sequences). The latter
df7492f9 2027 contains graphic characters such as 'A' and '-'. Emacs recognizes
4ed46869
KH
2028 two control character sets and many graphic character sets.
2029
2030 Graphic character sets are classified into one of the following
39787efd
KH
2031 four classes, according to the number of bytes (DIMENSION) and
2032 number of characters in one dimension (CHARS) of the set:
2033 - DIMENSION1_CHARS94
2034 - DIMENSION1_CHARS96
2035 - DIMENSION2_CHARS94
2036 - DIMENSION2_CHARS96
2037
2038 In addition, each character set is assigned an identification tag,
df7492f9 2039 unique for each set, called "final character" (denoted as <F>
39787efd
KH
2040 hereafter). The <F> of each character set is decided by ECMA(*)
2041 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2042 (0x30..0x3F are for private use only).
4ed46869
KH
2043
2044 Note (*): ECMA = European Computer Manufacturers Association
2045
df7492f9 2046 Here are examples of graphic character set [NAME(<F>)]:
4ed46869
KH
2047 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2048 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2049 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2050 o DIMENSION2_CHARS96 -- none for the moment
2051
39787efd 2052 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4ed46869
KH
2053 C0 [0x00..0x1F] -- control character plane 0
2054 GL [0x20..0x7F] -- graphic character plane 0
2055 C1 [0x80..0x9F] -- control character plane 1
2056 GR [0xA0..0xFF] -- graphic character plane 1
2057
2058 A control character set is directly designated and invoked to C0 or
39787efd
KH
2059 C1 by an escape sequence. The most common case is that:
2060 - ISO646's control character set is designated/invoked to C0, and
2061 - ISO6429's control character set is designated/invoked to C1,
2062 and usually these designations/invocations are omitted in encoded
2063 text. In a 7-bit environment, only C0 can be used, and a control
2064 character for C1 is encoded by an appropriate escape sequence to
2065 fit into the environment. All control characters for C1 are
2066 defined to have corresponding escape sequences.
4ed46869
KH
2067
2068 A graphic character set is at first designated to one of four
2069 graphic registers (G0 through G3), then these graphic registers are
2070 invoked to GL or GR. These designations and invocations can be
2071 done independently. The most common case is that G0 is invoked to
39787efd
KH
2072 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2073 these invocations and designations are omitted in encoded text.
2074 In a 7-bit environment, only GL can be used.
4ed46869 2075
39787efd
KH
2076 When a graphic character set of CHARS94 is invoked to GL, codes
2077 0x20 and 0x7F of the GL area work as control characters SPACE and
2078 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2079 be used.
4ed46869
KH
2080
2081 There are two ways of invocation: locking-shift and single-shift.
2082 With locking-shift, the invocation lasts until the next different
39787efd
KH
2083 invocation, whereas with single-shift, the invocation affects the
2084 following character only and doesn't affect the locking-shift
2085 state. Invocations are done by the following control characters or
2086 escape sequences:
4ed46869
KH
2087
2088 ----------------------------------------------------------------------
39787efd 2089 abbrev function cntrl escape seq description
4ed46869 2090 ----------------------------------------------------------------------
39787efd
KH
2091 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2092 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2093 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2094 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2095 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2096 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2097 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2098 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2099 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4ed46869 2100 ----------------------------------------------------------------------
39787efd
KH
2101 (*) These are not used by any known coding system.
2102
2103 Control characters for these functions are defined by macros
2104 ISO_CODE_XXX in `coding.h'.
4ed46869 2105
39787efd 2106 Designations are done by the following escape sequences:
4ed46869
KH
2107 ----------------------------------------------------------------------
2108 escape sequence description
2109 ----------------------------------------------------------------------
2110 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2111 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2112 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2113 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2114 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2115 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2116 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2117 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2118 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2119 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2120 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2121 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2122 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2123 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2124 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2125 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2126 ----------------------------------------------------------------------
2127
2128 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
39787efd 2129 of dimension 1, chars 94, and final character <F>, etc...
4ed46869
KH
2130
2131 Note (*): Although these designations are not allowed in ISO2022,
2132 Emacs accepts them on decoding, and produces them on encoding
39787efd 2133 CHARS96 character sets in a coding system which is characterized as
4ed46869
KH
2134 7-bit environment, non-locking-shift, and non-single-shift.
2135
2136 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
df7492f9 2137 '(' must be omitted. We refer to this as "short-form" hereafter.
4ed46869 2138
df7492f9 2139 Now you may notice that there are a lot of ways for encoding the
39787efd
KH
2140 same multilingual text in ISO2022. Actually, there exist many
2141 coding systems such as Compound Text (used in X11's inter client
df7492f9
KH
2142 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
2143 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4ed46869
KH
2144 localized platforms), and all of these are variants of ISO2022.
2145
2146 In addition to the above, Emacs handles two more kinds of escape
2147 sequences: ISO6429's direction specification and Emacs' private
2148 sequence for specifying character composition.
2149
39787efd 2150 ISO6429's direction specification takes the following form:
4ed46869
KH
2151 o CSI ']' -- end of the current direction
2152 o CSI '0' ']' -- end of the current direction
2153 o CSI '1' ']' -- start of left-to-right text
2154 o CSI '2' ']' -- start of right-to-left text
2155 The control character CSI (0x9B: control sequence introducer) is
39787efd
KH
2156 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2157
2158 Character composition specification takes the following form:
ec6d2bb8
KH
2159 o ESC '0' -- start relative composition
2160 o ESC '1' -- end composition
2161 o ESC '2' -- start rule-base composition (*)
2162 o ESC '3' -- start relative composition with alternate chars (**)
2163 o ESC '4' -- start rule-base composition with alternate chars (**)
b73bfc1c 2164 Since these are not standard escape sequences of any ISO standard,
df7492f9 2165 the use of them for these meaning is restricted to Emacs only.
ec6d2bb8 2166
df7492f9 2167 (*) This form is used only in Emacs 20.5 and the older versions,
b73bfc1c 2168 but the newer versions can safely decode it.
df7492f9 2169 (**) This form is used only in Emacs 21.1 and the newer versions,
b73bfc1c 2170 and the older versions can't decode it.
ec6d2bb8 2171
df7492f9 2172 Here's a list of examples usages of these composition escape
b73bfc1c 2173 sequences (categorized by `enum composition_method').
ec6d2bb8 2174
b73bfc1c 2175 COMPOSITION_RELATIVE:
ec6d2bb8 2176 ESC 0 CHAR [ CHAR ] ESC 1
df7492f9 2177 COMPOSITOIN_WITH_RULE:
ec6d2bb8 2178 ESC 2 CHAR [ RULE CHAR ] ESC 1
b73bfc1c 2179 COMPOSITION_WITH_ALTCHARS:
ec6d2bb8 2180 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
b73bfc1c 2181 COMPOSITION_WITH_RULE_ALTCHARS:
ec6d2bb8 2182 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
4ed46869
KH
2183
2184enum iso_code_class_type iso_code_class[256];
2185
df7492f9
KH
2186#define SAFE_CHARSET_P(coding, id) \
2187 ((id) <= (coding)->max_charset_id \
2188 && (coding)->safe_charsets[id] >= 0)
2189
2190
2191#define SHIFT_OUT_OK(category) \
2192 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2193
2194static void
2195setup_iso_safe_charsets (Lisp_Object attrs)
2196{
2197 Lisp_Object charset_list, safe_charsets;
2198 Lisp_Object request;
2199 Lisp_Object reg_usage;
2200 Lisp_Object tail;
2201 int reg94, reg96;
2202 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2203 int max_charset_id;
2204
2205 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2206 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2207 && ! EQ (charset_list, Viso_2022_charset_list))
2208 {
2209 CODING_ATTR_CHARSET_LIST (attrs)
2210 = charset_list = Viso_2022_charset_list;
2211 ASET (attrs, coding_attr_safe_charsets, Qnil);
2212 }
2213
2214 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2215 return;
2216
2217 max_charset_id = 0;
2218 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2219 {
2220 int id = XINT (XCAR (tail));
2221 if (max_charset_id < id)
2222 max_charset_id = id;
2223 }
2224
2225 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2226 make_number (255));
2227 request = AREF (attrs, coding_attr_iso_request);
2228 reg_usage = AREF (attrs, coding_attr_iso_usage);
2229 reg94 = XINT (XCAR (reg_usage));
2230 reg96 = XINT (XCDR (reg_usage));
2231
2232 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2233 {
2234 Lisp_Object id;
2235 Lisp_Object reg;
2236 struct charset *charset;
2237
2238 id = XCAR (tail);
2239 charset = CHARSET_FROM_ID (XINT (id));
2240 reg = Fcdr (Fassq (request, id));
2241 if (! NILP (reg))
2242 XSTRING (safe_charsets)->data[XINT (id)] = XINT (reg);
2243 else if (charset->iso_chars_96)
2244 {
2245 if (reg96 < 4)
2246 XSTRING (safe_charsets)->data[XINT (id)] = reg96;
2247 }
2248 else
2249 {
2250 if (reg94 < 4)
2251 XSTRING (safe_charsets)->data[XINT (id)] = reg94;
2252 }
2253 }
2254 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2255}
d46c5b12 2256
d46c5b12 2257
4ed46869 2258/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
df7492f9 2259 Check if a text is encoded in ISO2022. If it is, returns an
4ed46869 2260 integer in which appropriate flag bits any of:
df7492f9
KH
2261 CATEGORY_MASK_ISO_7
2262 CATEGORY_MASK_ISO_7_TIGHT
2263 CATEGORY_MASK_ISO_8_1
2264 CATEGORY_MASK_ISO_8_2
2265 CATEGORY_MASK_ISO_7_ELSE
2266 CATEGORY_MASK_ISO_8_ELSE
4ed46869
KH
2267 are set. If a code which should never appear in ISO2022 is found,
2268 returns 0. */
2269
0a28aafb 2270static int
df7492f9
KH
2271detect_coding_iso_2022 (coding, mask)
2272 struct coding_system *coding;
2273 int *mask;
4ed46869 2274{
df7492f9
KH
2275 unsigned char *src = coding->source, *src_base = src;
2276 unsigned char *src_end = coding->source + coding->src_bytes;
2277 int multibytep = coding->src_multibyte;
2278 int mask_iso = CATEGORY_MASK_ISO;
2279 int mask_found = 0, mask_8bit_found = 0;
f46869e4 2280 int reg[4], shift_out = 0, single_shifting = 0;
df7492f9
KH
2281 int id;
2282 int c, c1;
2283 int consumed_chars = 0;
2284 int i;
2285
2286 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2287 {
2288 struct coding_system *this = &(coding_categories[i]);
2289 Lisp_Object attrs, val;
2290
2291 attrs = CODING_ID_ATTRS (this->id);
2292 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2293 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2294 setup_iso_safe_charsets (attrs);
2295 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2296 this->max_charset_id = XSTRING (val)->size - 1;
2297 this->safe_charsets = (char *) XSTRING (val)->data;
2298 }
2299
2300 /* A coding system of this category is always ASCII compatible. */
2301 src += coding->head_ascii;
3f003981 2302
df7492f9
KH
2303 reg[0] = charset_ascii, reg[1] = reg[2] = reg[3] = -1;
2304 while (mask_iso && src < src_end)
4ed46869 2305 {
df7492f9 2306 ONE_MORE_BYTE (c);
4ed46869
KH
2307 switch (c)
2308 {
2309 case ISO_CODE_ESC:
74383408
KH
2310 if (inhibit_iso_escape_detection)
2311 break;
f46869e4 2312 single_shifting = 0;
df7492f9 2313 ONE_MORE_BYTE (c);
d46c5b12 2314 if (c >= '(' && c <= '/')
4ed46869 2315 {
bf9cdd4e 2316 /* Designation sequence for a charset of dimension 1. */
df7492f9 2317 ONE_MORE_BYTE (c1);
d46c5b12 2318 if (c1 < ' ' || c1 >= 0x80
df7492f9 2319 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
d46c5b12
KH
2320 /* Invalid designation sequence. Just ignore. */
2321 break;
df7492f9 2322 reg[(c - '(') % 4] = id;
bf9cdd4e
KH
2323 }
2324 else if (c == '$')
2325 {
2326 /* Designation sequence for a charset of dimension 2. */
df7492f9 2327 ONE_MORE_BYTE (c);
bf9cdd4e
KH
2328 if (c >= '@' && c <= 'B')
2329 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
df7492f9 2330 reg[0] = id = iso_charset_table[1][0][c];
bf9cdd4e 2331 else if (c >= '(' && c <= '/')
bcf26d6a 2332 {
df7492f9 2333 ONE_MORE_BYTE (c1);
d46c5b12 2334 if (c1 < ' ' || c1 >= 0x80
df7492f9 2335 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
d46c5b12
KH
2336 /* Invalid designation sequence. Just ignore. */
2337 break;
df7492f9 2338 reg[(c - '(') % 4] = id;
bcf26d6a 2339 }
bf9cdd4e 2340 else
d46c5b12
KH
2341 /* Invalid designation sequence. Just ignore. */
2342 break;
2343 }
ae9ff118 2344 else if (c == 'N' || c == 'O')
d46c5b12 2345 {
ae9ff118 2346 /* ESC <Fe> for SS2 or SS3. */
df7492f9 2347 mask_iso &= CATEGORY_MASK_ISO_7_ELSE;
d46c5b12 2348 break;
4ed46869 2349 }
ec6d2bb8
KH
2350 else if (c >= '0' && c <= '4')
2351 {
2352 /* ESC <Fp> for start/end composition. */
df7492f9 2353 mask_found |= CATEGORY_MASK_ISO;
ec6d2bb8
KH
2354 break;
2355 }
bf9cdd4e 2356 else
df7492f9
KH
2357 {
2358 /* Invalid escape sequence. */
2359 mask_iso &= ~CATEGORY_MASK_ISO_ESCAPE;
2360 break;
2361 }
d46c5b12
KH
2362
2363 /* We found a valid designation sequence for CHARSET. */
df7492f9
KH
2364 mask_iso &= ~CATEGORY_MASK_ISO_8BIT;
2365 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2366 id))
2367 mask_found |= CATEGORY_MASK_ISO_7;
d46c5b12 2368 else
df7492f9
KH
2369 mask_iso &= ~CATEGORY_MASK_ISO_7;
2370 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2371 id))
2372 mask_found |= CATEGORY_MASK_ISO_7_TIGHT;
d46c5b12 2373 else
df7492f9
KH
2374 mask_iso &= ~CATEGORY_MASK_ISO_7_TIGHT;
2375 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2376 id))
2377 mask_found |= CATEGORY_MASK_ISO_7_ELSE;
ae9ff118 2378 else
df7492f9
KH
2379 mask_iso &= ~CATEGORY_MASK_ISO_7_ELSE;
2380 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2381 id))
2382 mask_found |= CATEGORY_MASK_ISO_8_ELSE;
ae9ff118 2383 else
df7492f9 2384 mask_iso &= ~CATEGORY_MASK_ISO_8_ELSE;
4ed46869
KH
2385 break;
2386
4ed46869 2387 case ISO_CODE_SO:
74383408
KH
2388 if (inhibit_iso_escape_detection)
2389 break;
f46869e4 2390 single_shifting = 0;
d46c5b12
KH
2391 if (shift_out == 0
2392 && (reg[1] >= 0
df7492f9
KH
2393 || SHIFT_OUT_OK (coding_category_iso_7_else)
2394 || SHIFT_OUT_OK (coding_category_iso_8_else)))
d46c5b12
KH
2395 {
2396 /* Locking shift out. */
df7492f9
KH
2397 mask_iso &= ~CATEGORY_MASK_ISO_7BIT;
2398 mask_found |= CATEGORY_MASK_ISO_ELSE;
d46c5b12 2399 }
e0e989f6 2400 break;
df7492f9 2401
d46c5b12 2402 case ISO_CODE_SI:
74383408
KH
2403 if (inhibit_iso_escape_detection)
2404 break;
f46869e4 2405 single_shifting = 0;
d46c5b12
KH
2406 if (shift_out == 1)
2407 {
2408 /* Locking shift in. */
df7492f9
KH
2409 mask_iso &= ~CATEGORY_MASK_ISO_7BIT;
2410 mask_found |= CATEGORY_MASK_ISO_ELSE;
d46c5b12
KH
2411 }
2412 break;
2413
4ed46869 2414 case ISO_CODE_CSI:
f46869e4 2415 single_shifting = 0;
4ed46869
KH
2416 case ISO_CODE_SS2:
2417 case ISO_CODE_SS3:
3f003981 2418 {
df7492f9 2419 int newmask = CATEGORY_MASK_ISO_8_ELSE;
3f003981 2420
74383408
KH
2421 if (inhibit_iso_escape_detection)
2422 break;
70c22245
KH
2423 if (c != ISO_CODE_CSI)
2424 {
df7492f9
KH
2425 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2426 & CODING_ISO_FLAG_SINGLE_SHIFT)
2427 newmask |= CATEGORY_MASK_ISO_8_1;
2428 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2429 & CODING_ISO_FLAG_SINGLE_SHIFT)
2430 newmask |= CATEGORY_MASK_ISO_8_2;
f46869e4 2431 single_shifting = 1;
70c22245 2432 }
3f003981
KH
2433 if (VECTORP (Vlatin_extra_code_table)
2434 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2435 {
df7492f9
KH
2436 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2437 & CODING_ISO_FLAG_LATIN_EXTRA)
2438 newmask |= CATEGORY_MASK_ISO_8_1;
2439 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2440 & CODING_ISO_FLAG_LATIN_EXTRA)
2441 newmask |= CATEGORY_MASK_ISO_8_2;
3f003981 2442 }
df7492f9 2443 mask_iso &= newmask;
d46c5b12 2444 mask_found |= newmask;
3f003981
KH
2445 }
2446 break;
4ed46869
KH
2447
2448 default:
2449 if (c < 0x80)
f46869e4
KH
2450 {
2451 single_shifting = 0;
2452 break;
2453 }
4ed46869 2454 else if (c < 0xA0)
c4825358 2455 {
f46869e4 2456 single_shifting = 0;
df7492f9 2457 mask_8bit_found = 1;
3f003981
KH
2458 if (VECTORP (Vlatin_extra_code_table)
2459 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
c4825358 2460 {
3f003981
KH
2461 int newmask = 0;
2462
df7492f9
KH
2463 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2464 & CODING_ISO_FLAG_LATIN_EXTRA)
2465 newmask |= CATEGORY_MASK_ISO_8_1;
2466 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2467 & CODING_ISO_FLAG_LATIN_EXTRA)
2468 newmask |= CATEGORY_MASK_ISO_8_2;
2469 mask_iso &= newmask;
d46c5b12 2470 mask_found |= newmask;
c4825358 2471 }
3f003981
KH
2472 else
2473 return 0;
c4825358 2474 }
4ed46869
KH
2475 else
2476 {
df7492f9
KH
2477 mask_iso &= ~(CATEGORY_MASK_ISO_7BIT
2478 | CATEGORY_MASK_ISO_7_ELSE);
2479 mask_found |= CATEGORY_MASK_ISO_8_1;
2480 mask_8bit_found = 1;
f46869e4
KH
2481 /* Check the length of succeeding codes of the range
2482 0xA0..0FF. If the byte length is odd, we exclude
df7492f9 2483 CATEGORY_MASK_ISO_8_2. We can check this only
f46869e4 2484 when we are not single shifting. */
b73bfc1c 2485 if (!single_shifting
df7492f9 2486 && mask_iso & CATEGORY_MASK_ISO_8_2)
f46869e4 2487 {
e17de821 2488 int i = 1;
b73bfc1c
KH
2489 while (src < src_end)
2490 {
df7492f9 2491 ONE_MORE_BYTE (c);
b73bfc1c
KH
2492 if (c < 0xA0)
2493 break;
2494 i++;
2495 }
2496
2497 if (i & 1 && src < src_end)
df7492f9 2498 mask_iso &= ~CATEGORY_MASK_ISO_8_2;
f46869e4 2499 else
df7492f9 2500 mask_found |= CATEGORY_MASK_ISO_8_2;
f46869e4 2501 }
4ed46869
KH
2502 }
2503 break;
2504 }
2505 }
df7492f9
KH
2506 no_more_source:
2507 if (!mask_iso)
2508 {
2509 *mask &= ~CATEGORY_MASK_ISO;
2510 return 0;
2511 }
2512 if (!mask_found)
2513 return 0;
2514 *mask &= mask_iso & mask_found;
2515 if (! mask_8bit_found)
2516 *mask &= ~(CATEGORY_MASK_ISO_8BIT | CATEGORY_MASK_ISO_8_ELSE);
2517 return 1;
4ed46869
KH
2518}
2519
4ed46869
KH
2520
2521/* Set designation state into CODING. */
df7492f9
KH
2522#define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2523 do { \
2524 int id, prev; \
2525 \
2526 if (final < '0' || final >= 128 \
2527 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2528 || !SAFE_CHARSET_P (coding, id)) \
2529 { \
2530 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2531 goto invalid_code; \
2532 } \
2533 prev = CODING_ISO_DESIGNATION (coding, reg); \
2534 CODING_ISO_DESIGNATION (coding, reg) = id; \
2535 /* If there was an invalid designation to REG previously, and this \
2536 designation is ASCII to REG, we should keep this designation \
2537 sequence. */ \
2538 if (prev == -2 && id == charset_ascii) \
2539 goto invalid_code; \
4ed46869
KH
2540 } while (0)
2541
d46c5b12 2542
df7492f9
KH
2543#define MAYBE_FINISH_COMPOSITION() \
2544 do { \
2545 int i; \
2546 if (composition_state == COMPOSING_NO) \
2547 break; \
2548 /* It is assured that we have enough room for producing \
2549 characters stored in the table `components'. */ \
2550 if (charbuf + component_idx > charbuf_end) \
2551 goto no_more_source; \
2552 composition_state = COMPOSING_NO; \
2553 if (method == COMPOSITION_RELATIVE \
2554 || method == COMPOSITION_WITH_ALTCHARS) \
2555 { \
2556 for (i = 0; i < component_idx; i++) \
2557 *charbuf++ = components[i]; \
2558 char_offset += component_idx; \
2559 } \
2560 else \
2561 { \
2562 for (i = 0; i < component_idx; i += 2) \
2563 *charbuf++ = components[i]; \
2564 char_offset += (component_idx / 2) + 1; \
2565 } \
2566 } while (0)
2567
d46c5b12 2568
aa72b389
KH
2569/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2570 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2571 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
df7492f9
KH
2572 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2573 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
aa72b389 2574 */
ec6d2bb8 2575
df7492f9
KH
2576#define DECODE_COMPOSITION_START(c1) \
2577 do { \
2578 if (c1 == '0' \
2579 && composition_state == COMPOSING_COMPONENT_CHAR) \
2580 { \
2581 component_len = component_idx; \
2582 composition_state = COMPOSING_CHAR; \
2583 } \
2584 else \
2585 { \
2586 unsigned char *p; \
2587 \
2588 MAYBE_FINISH_COMPOSITION (); \
2589 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2590 goto no_more_source; \
2591 for (p = src; p < src_end - 1; p++) \
2592 if (*p == ISO_CODE_ESC && p[1] == '1') \
2593 break; \
2594 if (p == src_end - 1) \
2595 { \
2596 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2597 goto invalid_code; \
2598 goto no_more_source; \
2599 } \
2600 \
2601 /* This is surely the start of a composition. */ \
2602 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2603 : c1 == '2' ? COMPOSITION_WITH_RULE \
2604 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2605 : COMPOSITION_WITH_RULE_ALTCHARS); \
2606 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2607 : COMPOSING_COMPONENT_CHAR); \
2608 component_idx = component_len = 0; \
2609 } \
ec6d2bb8
KH
2610 } while (0)
2611
ec6d2bb8 2612
df7492f9
KH
2613/* Handle compositoin end sequence ESC 1. */
2614
2615#define DECODE_COMPOSITION_END() \
ec6d2bb8 2616 do { \
df7492f9
KH
2617 int nchars = (component_len > 0 ? component_idx - component_len \
2618 : method == COMPOSITION_RELATIVE ? component_idx \
2619 : (component_idx + 1) / 2); \
2620 int i; \
2621 int *saved_charbuf = charbuf; \
2622 \
2623 ADD_COMPOSITION_DATA (charbuf, method, nchars); \
2624 if (method != COMPOSITION_RELATIVE) \
ec6d2bb8 2625 { \
df7492f9
KH
2626 if (component_len == 0) \
2627 for (i = 0; i < component_idx; i++) \
2628 *charbuf++ = components[i]; \
2629 else \
2630 for (i = 0; i < component_len; i++) \
2631 *charbuf++ = components[i]; \
2632 *saved_charbuf = saved_charbuf - charbuf; \
ec6d2bb8 2633 } \
df7492f9
KH
2634 if (method == COMPOSITION_WITH_RULE) \
2635 for (i = 0; i < component_idx; i += 2, char_offset++) \
2636 *charbuf++ = components[i]; \
ec6d2bb8 2637 else \
df7492f9
KH
2638 for (i = component_len; i < component_idx; i++, char_offset++) \
2639 *charbuf++ = components[i]; \
2640 coding->annotated = 1; \
2641 composition_state = COMPOSING_NO; \
ec6d2bb8
KH
2642 } while (0)
2643
df7492f9 2644
ec6d2bb8
KH
2645/* Decode a composition rule from the byte C1 (and maybe one more byte
2646 from SRC) and store one encoded composition rule in
2647 coding->cmp_data. */
2648
2649#define DECODE_COMPOSITION_RULE(c1) \
2650 do { \
ec6d2bb8
KH
2651 (c1) -= 32; \
2652 if (c1 < 81) /* old format (before ver.21) */ \
2653 { \
2654 int gref = (c1) / 9; \
2655 int nref = (c1) % 9; \
2656 if (gref == 4) gref = 10; \
2657 if (nref == 4) nref = 10; \
df7492f9 2658 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
ec6d2bb8 2659 } \
b73bfc1c 2660 else if (c1 < 93) /* new format (after ver.21) */ \
ec6d2bb8
KH
2661 { \
2662 ONE_MORE_BYTE (c2); \
df7492f9 2663 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
ec6d2bb8 2664 } \
df7492f9
KH
2665 else \
2666 c1 = 0; \
ec6d2bb8 2667 } while (0)
88993dfd 2668
d46c5b12 2669
4ed46869
KH
2670/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2671
b73bfc1c 2672static void
df7492f9 2673decode_coding_iso_2022 (coding)
4ed46869 2674 struct coding_system *coding;
4ed46869 2675{
df7492f9
KH
2676 unsigned char *src = coding->source + coding->consumed;
2677 unsigned char *src_end = coding->source + coding->src_bytes;
b73bfc1c 2678 unsigned char *src_base;
df7492f9
KH
2679 int *charbuf = coding->charbuf;
2680 int *charbuf_end = charbuf + coding->charbuf_size - 4;
2681 int consumed_chars = 0, consumed_chars_base;
2682 int char_offset = 0;
2683 int multibytep = coding->src_multibyte;
2684 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2685 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2686 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2687 struct charset *charset;
2688 int c;
2689 /* For handling composition sequence. */
2690#define COMPOSING_NO 0
2691#define COMPOSING_CHAR 1
2692#define COMPOSING_RULE 2
2693#define COMPOSING_COMPONENT_CHAR 3
2694#define COMPOSING_COMPONENT_RULE 4
2695
2696 int composition_state = COMPOSING_NO;
2697 enum composition_method method;
2698 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2699 int component_idx;
2700 int component_len;
2701 Lisp_Object attrs, eol_type, charset_list;
2702
2703 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2704 setup_iso_safe_charsets (attrs);
b73bfc1c
KH
2705
2706 while (1)
4ed46869 2707 {
b73bfc1c
KH
2708 int c1, c2;
2709
2710 src_base = src;
df7492f9
KH
2711 consumed_chars_base = consumed_chars;
2712
2713 if (charbuf >= charbuf_end)
2714 break;
2715
b73bfc1c 2716 ONE_MORE_BYTE (c1);
4ed46869 2717
ec6d2bb8 2718 /* We produce no character or one character. */
4ed46869
KH
2719 switch (iso_code_class [c1])
2720 {
2721 case ISO_0x20_or_0x7F:
df7492f9 2722 if (composition_state != COMPOSING_NO)
ec6d2bb8 2723 {
df7492f9
KH
2724 if (composition_state == COMPOSING_RULE
2725 || composition_state == COMPOSING_COMPONENT_RULE)
2726 {
2727 DECODE_COMPOSITION_RULE (c1);
2728 components[component_idx++] = c1;
2729 composition_state--;
2730 continue;
2731 }
2732 else if (method == COMPOSITION_WITH_RULE)
2733 composition_state = COMPOSING_RULE;
2734 else if (method == COMPOSITION_WITH_RULE_ALTCHARS
2735 && composition_state == COMPOSING_COMPONENT_CHAR)
2736 composition_state = COMPOSING_COMPONENT_CHAR;
ec6d2bb8 2737 }
df7492f9
KH
2738 if (charset_id_0 < 0
2739 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
4ed46869
KH
2740 {
2741 /* This is SPACE or DEL. */
df7492f9 2742 charset = CHARSET_FROM_ID (charset_ascii);
4ed46869
KH
2743 break;
2744 }
2745 /* This is a graphic character, we fall down ... */
2746
2747 case ISO_graphic_plane_0:
df7492f9 2748 if (composition_state == COMPOSING_RULE)
b73bfc1c
KH
2749 {
2750 DECODE_COMPOSITION_RULE (c1);
df7492f9
KH
2751 components[component_idx++] = c1;
2752 composition_state = COMPOSING_CHAR;
b73bfc1c 2753 }
df7492f9 2754 charset = CHARSET_FROM_ID (charset_id_0);
4ed46869
KH
2755 break;
2756
2757 case ISO_0xA0_or_0xFF:
df7492f9
KH
2758 if (charset_id_1 < 0
2759 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2760 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2761 goto invalid_code;
4ed46869
KH
2762 /* This is a graphic character, we fall down ... */
2763
2764 case ISO_graphic_plane_1:
df7492f9
KH
2765 if (charset_id_1 < 0)
2766 goto invalid_code;
2767 charset = CHARSET_FROM_ID (charset_id_1);
4ed46869
KH
2768 break;
2769
2770 case ISO_carriage_return:
df7492f9 2771 if (c1 == '\r')
4ed46869 2772 {
df7492f9 2773 if (EQ (eol_type, Qdos))
4ed46869 2774 {
df7492f9
KH
2775 if (src == src_end)
2776 goto no_more_source;
2777 if (*src == '\n')
2778 ONE_MORE_BYTE (c1);
4ed46869 2779 }
df7492f9
KH
2780 else if (EQ (eol_type, Qmac))
2781 c1 = '\n';
4ed46869 2782 }
df7492f9
KH
2783 /* fall through */
2784
2785 case ISO_control_0:
2786 MAYBE_FINISH_COMPOSITION ();
2787 charset = CHARSET_FROM_ID (charset_ascii);
4ed46869
KH
2788 break;
2789
df7492f9
KH
2790 case ISO_control_1:
2791 MAYBE_FINISH_COMPOSITION ();
2792 goto invalid_code;
2793
4ed46869 2794 case ISO_shift_out:
df7492f9
KH
2795 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2796 || CODING_ISO_DESIGNATION (coding, 1) < 0)
2797 goto invalid_code;
2798 CODING_ISO_INVOCATION (coding, 0) = 1;
2799 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 2800 continue;
4ed46869
KH
2801
2802 case ISO_shift_in:
df7492f9
KH
2803 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
2804 goto invalid_code;
2805 CODING_ISO_INVOCATION (coding, 0) = 0;
2806 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 2807 continue;
4ed46869
KH
2808
2809 case ISO_single_shift_2_7:
2810 case ISO_single_shift_2:
df7492f9
KH
2811 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2812 goto invalid_code;
4ed46869
KH
2813 /* SS2 is handled as an escape sequence of ESC 'N' */
2814 c1 = 'N';
2815 goto label_escape_sequence;
2816
2817 case ISO_single_shift_3:
df7492f9
KH
2818 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2819 goto invalid_code;
4ed46869
KH
2820 /* SS2 is handled as an escape sequence of ESC 'O' */
2821 c1 = 'O';
2822 goto label_escape_sequence;
2823
2824 case ISO_control_sequence_introducer:
2825 /* CSI is handled as an escape sequence of ESC '[' ... */
2826 c1 = '[';
2827 goto label_escape_sequence;
2828
2829 case ISO_escape:
2830 ONE_MORE_BYTE (c1);
2831 label_escape_sequence:
df7492f9 2832 /* Escape sequences handled here are invocation,
4ed46869
KH
2833 designation, direction specification, and character
2834 composition specification. */
2835 switch (c1)
2836 {
2837 case '&': /* revision of following character set */
2838 ONE_MORE_BYTE (c1);
2839 if (!(c1 >= '@' && c1 <= '~'))
df7492f9 2840 goto invalid_code;
4ed46869
KH
2841 ONE_MORE_BYTE (c1);
2842 if (c1 != ISO_CODE_ESC)
df7492f9 2843 goto invalid_code;
4ed46869
KH
2844 ONE_MORE_BYTE (c1);
2845 goto label_escape_sequence;
2846
2847 case '$': /* designation of 2-byte character set */
df7492f9
KH
2848 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
2849 goto invalid_code;
4ed46869
KH
2850 ONE_MORE_BYTE (c1);
2851 if (c1 >= '@' && c1 <= 'B')
2852 { /* designation of JISX0208.1978, GB2312.1980,
88993dfd 2853 or JISX0208.1980 */
df7492f9 2854 DECODE_DESIGNATION (0, 2, 0, c1);
4ed46869
KH
2855 }
2856 else if (c1 >= 0x28 && c1 <= 0x2B)
2857 { /* designation of DIMENSION2_CHARS94 character set */
2858 ONE_MORE_BYTE (c2);
df7492f9 2859 DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2);
4ed46869
KH
2860 }
2861 else if (c1 >= 0x2C && c1 <= 0x2F)
2862 { /* designation of DIMENSION2_CHARS96 character set */
2863 ONE_MORE_BYTE (c2);
df7492f9 2864 DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2);
4ed46869
KH
2865 }
2866 else
df7492f9 2867 goto invalid_code;
b73bfc1c 2868 /* We must update these variables now. */
df7492f9
KH
2869 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2870 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
b73bfc1c 2871 continue;
4ed46869
KH
2872
2873 case 'n': /* invocation of locking-shift-2 */
df7492f9
KH
2874 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2875 || CODING_ISO_DESIGNATION (coding, 2) < 0)
2876 goto invalid_code;
2877 CODING_ISO_INVOCATION (coding, 0) = 2;
2878 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 2879 continue;
4ed46869
KH
2880
2881 case 'o': /* invocation of locking-shift-3 */
df7492f9
KH
2882 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2883 || CODING_ISO_DESIGNATION (coding, 3) < 0)
2884 goto invalid_code;
2885 CODING_ISO_INVOCATION (coding, 0) = 3;
2886 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 2887 continue;
4ed46869
KH
2888
2889 case 'N': /* invocation of single-shift-2 */
df7492f9
KH
2890 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
2891 || CODING_ISO_DESIGNATION (coding, 2) < 0)
2892 goto invalid_code;
2893 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2));
b73bfc1c 2894 ONE_MORE_BYTE (c1);
e7046a18 2895 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
df7492f9 2896 goto invalid_code;
4ed46869
KH
2897 break;
2898
2899 case 'O': /* invocation of single-shift-3 */
df7492f9
KH
2900 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
2901 || CODING_ISO_DESIGNATION (coding, 3) < 0)
2902 goto invalid_code;
2903 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3));
b73bfc1c 2904 ONE_MORE_BYTE (c1);
e7046a18 2905 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
df7492f9 2906 goto invalid_code;
4ed46869
KH
2907 break;
2908
ec6d2bb8 2909 case '0': case '2': case '3': case '4': /* start composition */
df7492f9
KH
2910 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
2911 goto invalid_code;
ec6d2bb8 2912 DECODE_COMPOSITION_START (c1);
b73bfc1c 2913 continue;
4ed46869 2914
ec6d2bb8 2915 case '1': /* end composition */
df7492f9
KH
2916 if (composition_state == COMPOSING_NO)
2917 goto invalid_code;
2918 DECODE_COMPOSITION_END ();
b73bfc1c 2919 continue;
4ed46869
KH
2920
2921 case '[': /* specification of direction */
df7492f9
KH
2922 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
2923 goto invalid_code;
4ed46869 2924 /* For the moment, nested direction is not supported.
d46c5b12 2925 So, `coding->mode & CODING_MODE_DIRECTION' zero means
df7492f9 2926 left-to-right, and nozero means right-to-left. */
4ed46869
KH
2927 ONE_MORE_BYTE (c1);
2928 switch (c1)
2929 {
2930 case ']': /* end of the current direction */
d46c5b12 2931 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869
KH
2932
2933 case '0': /* end of the current direction */
2934 case '1': /* start of left-to-right direction */
2935 ONE_MORE_BYTE (c1);
2936 if (c1 == ']')
d46c5b12 2937 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869 2938 else
df7492f9 2939 goto invalid_code;
4ed46869
KH
2940 break;
2941
2942 case '2': /* start of right-to-left direction */
2943 ONE_MORE_BYTE (c1);
2944 if (c1 == ']')
d46c5b12 2945 coding->mode |= CODING_MODE_DIRECTION;
4ed46869 2946 else
df7492f9 2947 goto invalid_code;
4ed46869
KH
2948 break;
2949
2950 default:
df7492f9 2951 goto invalid_code;
4ed46869 2952 }
b73bfc1c 2953 continue;
4ed46869
KH
2954
2955 default:
df7492f9
KH
2956 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
2957 goto invalid_code;
4ed46869
KH
2958 if (c1 >= 0x28 && c1 <= 0x2B)
2959 { /* designation of DIMENSION1_CHARS94 character set */
2960 ONE_MORE_BYTE (c2);
df7492f9 2961 DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2);
4ed46869
KH
2962 }
2963 else if (c1 >= 0x2C && c1 <= 0x2F)
2964 { /* designation of DIMENSION1_CHARS96 character set */
2965 ONE_MORE_BYTE (c2);
df7492f9 2966 DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2);
4ed46869
KH
2967 }
2968 else
df7492f9 2969 goto invalid_code;
b73bfc1c 2970 /* We must update these variables now. */
df7492f9
KH
2971 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2972 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
b73bfc1c 2973 continue;
4ed46869 2974 }
b73bfc1c 2975 }
4ed46869 2976
b73bfc1c 2977 /* Now we know CHARSET and 1st position code C1 of a character.
df7492f9
KH
2978 Produce a decoded character while getting 2nd position code
2979 C2 if necessary. */
2980 c1 &= 0x7F;
2981 if (CHARSET_DIMENSION (charset) > 1)
b73bfc1c
KH
2982 {
2983 ONE_MORE_BYTE (c2);
df7492f9 2984 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
b73bfc1c 2985 /* C2 is not in a valid range. */
df7492f9
KH
2986 goto invalid_code;
2987 c1 = (c1 << 8) | (c2 & 0x7F);
2988 if (CHARSET_DIMENSION (charset) > 2)
2989 {
2990 ONE_MORE_BYTE (c2);
2991 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
2992 /* C2 is not in a valid range. */
2993 goto invalid_code;
2994 c1 = (c1 << 8) | (c2 & 0x7F);
2995 }
2996 }
2997
2998 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
2999 if (c < 0)
3000 {
3001 MAYBE_FINISH_COMPOSITION ();
3002 for (; src_base < src; src_base++, char_offset++)
3003 {
3004 if (ASCII_BYTE_P (*src_base))
3005 *charbuf++ = *src_base;
3006 else
3007 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3008 }
3009 }
3010 else if (composition_state == COMPOSING_NO)
3011 {
3012 *charbuf++ = c;
3013 char_offset++;
4ed46869 3014 }
df7492f9
KH
3015 else
3016 components[component_idx++] = c;
4ed46869
KH
3017 continue;
3018
df7492f9
KH
3019 invalid_code:
3020 MAYBE_FINISH_COMPOSITION ();
4ed46869 3021 src = src_base;
df7492f9
KH
3022 consumed_chars = consumed_chars_base;
3023 ONE_MORE_BYTE (c);
3024 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3025 coding->errors++;
4ed46869 3026 }
fb88bf2d 3027
df7492f9
KH
3028 no_more_source:
3029 coding->consumed_char += consumed_chars_base;
3030 coding->consumed = src_base - coding->source;
3031 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
3032}
3033
b73bfc1c 3034
f4dee582 3035/* ISO2022 encoding stuff. */
4ed46869
KH
3036
3037/*
f4dee582 3038 It is not enough to say just "ISO2022" on encoding, we have to
df7492f9 3039 specify more details. In Emacs, each coding system of ISO2022
4ed46869 3040 variant has the following specifications:
df7492f9 3041 1. Initial designation to G0 thru G3.
4ed46869
KH
3042 2. Allows short-form designation?
3043 3. ASCII should be designated to G0 before control characters?
3044 4. ASCII should be designated to G0 at end of line?
3045 5. 7-bit environment or 8-bit environment?
3046 6. Use locking-shift?
3047 7. Use Single-shift?
3048 And the following two are only for Japanese:
3049 8. Use ASCII in place of JIS0201-1976-Roman?
3050 9. Use JISX0208-1983 in place of JISX0208-1978?
df7492f9
KH
3051 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3052 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
f4dee582 3053 details.
4ed46869
KH
3054*/
3055
3056/* Produce codes (escape sequence) for designating CHARSET to graphic
b73bfc1c
KH
3057 register REG at DST, and increment DST. If <final-char> of CHARSET is
3058 '@', 'A', or 'B' and the coding system CODING allows, produce
3059 designation sequence of short-form. */
4ed46869
KH
3060
3061#define ENCODE_DESIGNATION(charset, reg, coding) \
3062 do { \
df7492f9 3063 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
4ed46869
KH
3064 char *intermediate_char_94 = "()*+"; \
3065 char *intermediate_char_96 = ",-./"; \
df7492f9
KH
3066 int revision = -1; \
3067 int c; \
3068 \
3069 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3070 revision = XINT (CHARSET_ISO_REVISION (charset)); \
3071 \
3072 if (revision >= 0) \
70c22245 3073 { \
df7492f9
KH
3074 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3075 EMIT_ONE_BYTE ('@' + revision); \
4ed46869 3076 } \
df7492f9 3077 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
4ed46869
KH
3078 if (CHARSET_DIMENSION (charset) == 1) \
3079 { \
df7492f9
KH
3080 if (! CHARSET_ISO_CHARS_96 (charset)) \
3081 c = intermediate_char_94[reg]; \
4ed46869 3082 else \
df7492f9
KH
3083 c = intermediate_char_96[reg]; \
3084 EMIT_ONE_ASCII_BYTE (c); \
4ed46869
KH
3085 } \
3086 else \
3087 { \
df7492f9
KH
3088 EMIT_ONE_ASCII_BYTE ('$'); \
3089 if (! CHARSET_ISO_CHARS_96 (charset)) \
4ed46869 3090 { \
df7492f9 3091 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
b73bfc1c
KH
3092 || reg != 0 \
3093 || final_char < '@' || final_char > 'B') \
df7492f9 3094 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
4ed46869
KH
3095 } \
3096 else \
df7492f9 3097 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
4ed46869 3098 } \
df7492f9
KH
3099 EMIT_ONE_ASCII_BYTE (final_char); \
3100 \
3101 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
4ed46869
KH
3102 } while (0)
3103
df7492f9 3104
4ed46869
KH
3105/* The following two macros produce codes (control character or escape
3106 sequence) for ISO2022 single-shift functions (single-shift-2 and
3107 single-shift-3). */
3108
df7492f9
KH
3109#define ENCODE_SINGLE_SHIFT_2 \
3110 do { \
3111 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3112 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3113 else \
3114 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3115 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4ed46869
KH
3116 } while (0)
3117
df7492f9
KH
3118
3119#define ENCODE_SINGLE_SHIFT_3 \
3120 do { \
3121 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3122 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3123 else \
3124 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3125 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4ed46869
KH
3126 } while (0)
3127
df7492f9 3128
4ed46869
KH
3129/* The following four macros produce codes (control character or
3130 escape sequence) for ISO2022 locking-shift functions (shift-in,
3131 shift-out, locking-shift-2, and locking-shift-3). */
3132
df7492f9
KH
3133#define ENCODE_SHIFT_IN \
3134 do { \
3135 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3136 CODING_ISO_INVOCATION (coding, 0) = 0; \
4ed46869
KH
3137 } while (0)
3138
df7492f9
KH
3139
3140#define ENCODE_SHIFT_OUT \
3141 do { \
3142 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3143 CODING_ISO_INVOCATION (coding, 0) = 1; \
4ed46869
KH
3144 } while (0)
3145
df7492f9
KH
3146
3147#define ENCODE_LOCKING_SHIFT_2 \
3148 do { \
3149 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3150 CODING_ISO_INVOCATION (coding, 0) = 2; \
4ed46869
KH
3151 } while (0)
3152
df7492f9
KH
3153
3154#define ENCODE_LOCKING_SHIFT_3 \
3155 do { \
3156 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3157 CODING_ISO_INVOCATION (coding, 0) = 3; \
4ed46869
KH
3158 } while (0)
3159
df7492f9 3160
f4dee582
RS
3161/* Produce codes for a DIMENSION1 character whose character set is
3162 CHARSET and whose position-code is C1. Designation and invocation
4ed46869
KH
3163 sequences are also produced in advance if necessary. */
3164
6e85d753
KH
3165#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3166 do { \
df7492f9
KH
3167 int id = CHARSET_ID (charset); \
3168 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
6e85d753 3169 { \
df7492f9
KH
3170 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3171 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
6e85d753 3172 else \
df7492f9
KH
3173 EMIT_ONE_BYTE (c1 | 0x80); \
3174 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
6e85d753
KH
3175 break; \
3176 } \
df7492f9 3177 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
6e85d753 3178 { \
df7492f9 3179 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
6e85d753
KH
3180 break; \
3181 } \
df7492f9 3182 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
6e85d753 3183 { \
df7492f9 3184 EMIT_ONE_BYTE (c1 | 0x80); \
6e85d753
KH
3185 break; \
3186 } \
6e85d753
KH
3187 else \
3188 /* Since CHARSET is not yet invoked to any graphic planes, we \
3189 must invoke it, or, at first, designate it to some graphic \
3190 register. Then repeat the loop to actually produce the \
3191 character. */ \
df7492f9
KH
3192 dst = encode_invocation_designation (charset, coding, dst, \
3193 &produced_chars); \
4ed46869
KH
3194 } while (1)
3195
df7492f9 3196
f4dee582
RS
3197/* Produce codes for a DIMENSION2 character whose character set is
3198 CHARSET and whose position-codes are C1 and C2. Designation and
4ed46869
KH
3199 invocation codes are also produced in advance if necessary. */
3200
6e85d753
KH
3201#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3202 do { \
df7492f9
KH
3203 int id = CHARSET_ID (charset); \
3204 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
6e85d753 3205 { \
df7492f9
KH
3206 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3207 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
6e85d753 3208 else \
df7492f9
KH
3209 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3210 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
6e85d753
KH
3211 break; \
3212 } \
df7492f9 3213 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
6e85d753 3214 { \
df7492f9 3215 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
6e85d753
KH
3216 break; \
3217 } \
df7492f9 3218 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
6e85d753 3219 { \
df7492f9 3220 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
6e85d753
KH
3221 break; \
3222 } \
6e85d753
KH
3223 else \
3224 /* Since CHARSET is not yet invoked to any graphic planes, we \
3225 must invoke it, or, at first, designate it to some graphic \
3226 register. Then repeat the loop to actually produce the \
3227 character. */ \
df7492f9
KH
3228 dst = encode_invocation_designation (charset, coding, dst, \
3229 &produced_chars); \
4ed46869
KH
3230 } while (1)
3231
05e6f5dc 3232
df7492f9
KH
3233#define ENCODE_ISO_CHARACTER(charset, c) \
3234 do { \
3235 int code = ENCODE_CHAR ((charset),(c)); \
3236 \
3237 if (CHARSET_DIMENSION (charset) == 1) \
3238 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3239 else \
3240 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
84fbb8a0 3241 } while (0)
bdd9fb48 3242
05e6f5dc 3243
4ed46869 3244/* Produce designation and invocation codes at a place pointed by DST
df7492f9 3245 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
4ed46869
KH
3246 Return new DST. */
3247
3248unsigned char *
df7492f9
KH
3249encode_invocation_designation (charset, coding, dst, p_nchars)
3250 struct charset *charset;
4ed46869
KH
3251 struct coding_system *coding;
3252 unsigned char *dst;
df7492f9 3253 int *p_nchars;
4ed46869 3254{
df7492f9
KH
3255 int multibytep = coding->dst_multibyte;
3256 int produced_chars = *p_nchars;
4ed46869 3257 int reg; /* graphic register number */
df7492f9 3258 int id = CHARSET_ID (charset);
4ed46869
KH
3259
3260 /* At first, check designations. */
3261 for (reg = 0; reg < 4; reg++)
df7492f9 3262 if (id == CODING_ISO_DESIGNATION (coding, reg))
4ed46869
KH
3263 break;
3264
3265 if (reg >= 4)
3266 {
3267 /* CHARSET is not yet designated to any graphic registers. */
3268 /* At first check the requested designation. */
df7492f9
KH
3269 reg = CODING_ISO_REQUEST (coding, id);
3270 if (reg < 0)
1ba9e4ab
KH
3271 /* Since CHARSET requests no special designation, designate it
3272 to graphic register 0. */
4ed46869
KH
3273 reg = 0;
3274
3275 ENCODE_DESIGNATION (charset, reg, coding);
3276 }
3277
df7492f9
KH
3278 if (CODING_ISO_INVOCATION (coding, 0) != reg
3279 && CODING_ISO_INVOCATION (coding, 1) != reg)
4ed46869
KH
3280 {
3281 /* Since the graphic register REG is not invoked to any graphic
3282 planes, invoke it to graphic plane 0. */
3283 switch (reg)
3284 {
3285 case 0: /* graphic register 0 */
3286 ENCODE_SHIFT_IN;
3287 break;
3288
3289 case 1: /* graphic register 1 */
3290 ENCODE_SHIFT_OUT;
3291 break;
3292
3293 case 2: /* graphic register 2 */
df7492f9 3294 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4ed46869
KH
3295 ENCODE_SINGLE_SHIFT_2;
3296 else
3297 ENCODE_LOCKING_SHIFT_2;
3298 break;
3299
3300 case 3: /* graphic register 3 */
df7492f9 3301 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4ed46869
KH
3302 ENCODE_SINGLE_SHIFT_3;
3303 else
3304 ENCODE_LOCKING_SHIFT_3;
3305 break;
3306 }
3307 }
b73bfc1c 3308
df7492f9 3309 *p_nchars = produced_chars;
4ed46869
KH
3310 return dst;
3311}
3312
df7492f9
KH
3313/* The following three macros produce codes for indicating direction
3314 of text. */
3315#define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
ec6d2bb8 3316 do { \
df7492f9
KH
3317 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
3318 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
ec6d2bb8 3319 else \
df7492f9 3320 EMIT_ONE_BYTE (ISO_CODE_CSI); \
ec6d2bb8
KH
3321 } while (0)
3322
ec6d2bb8 3323
df7492f9
KH
3324#define ENCODE_DIRECTION_R2L() \
3325 do { \
3326 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3327 EMIT_TWO_ASCII_BYTES ('2', ']'); \
ec6d2bb8
KH
3328 } while (0)
3329
ec6d2bb8 3330
df7492f9 3331#define ENCODE_DIRECTION_L2R() \
ec6d2bb8 3332 do { \
df7492f9
KH
3333 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3334 EMIT_TWO_ASCII_BYTES ('0', ']'); \
4ed46869
KH
3335 } while (0)
3336
4ed46869
KH
3337
3338/* Produce codes for designation and invocation to reset the graphic
3339 planes and registers to initial state. */
df7492f9
KH
3340#define ENCODE_RESET_PLANE_AND_REGISTER() \
3341 do { \
3342 int reg; \
3343 struct charset *charset; \
3344 \
3345 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3346 ENCODE_SHIFT_IN; \
3347 for (reg = 0; reg < 4; reg++) \
3348 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3349 && (CODING_ISO_DESIGNATION (coding, reg) \
3350 != CODING_ISO_INITIAL (coding, reg))) \
3351 { \
3352 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3353 ENCODE_DESIGNATION (charset, reg, coding); \
3354 } \
4ed46869
KH
3355 } while (0)
3356
df7492f9 3357
bdd9fb48 3358/* Produce designation sequences of charsets in the line started from
b73bfc1c 3359 SRC to a place pointed by DST, and return updated DST.
bdd9fb48
KH
3360
3361 If the current block ends before any end-of-line, we may fail to
d46c5b12
KH
3362 find all the necessary designations. */
3363
b73bfc1c 3364static unsigned char *
df7492f9 3365encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
e0e989f6 3366 struct coding_system *coding;
df7492f9
KH
3367 int *charbuf, *charbuf_end;
3368 unsigned char *dst;
e0e989f6 3369{
df7492f9 3370 struct charset *charset;
bdd9fb48
KH
3371 /* Table of charsets to be designated to each graphic register. */
3372 int r[4];
df7492f9
KH
3373 int c, found = 0, reg;
3374 int produced_chars = 0;
3375 int multibytep = coding->dst_multibyte;
3376 Lisp_Object attrs;
3377 Lisp_Object charset_list;
3378
3379 attrs = CODING_ID_ATTRS (coding->id);
3380 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3381 if (EQ (charset_list, Qiso_2022))
3382 charset_list = Viso_2022_charset_list;
bdd9fb48
KH
3383
3384 for (reg = 0; reg < 4; reg++)
3385 r[reg] = -1;
3386
b73bfc1c 3387 while (found < 4)
e0e989f6 3388 {
df7492f9
KH
3389 int id;
3390
3391 c = *charbuf++;
b73bfc1c
KH
3392 if (c == '\n')
3393 break;
df7492f9
KH
3394 charset = char_charset (c, charset_list, NULL);
3395 id = CHARSET_ID (charset);
3396 reg = CODING_ISO_REQUEST (coding, id);
3397 if (reg >= 0 && r[reg] < 0)
bdd9fb48
KH
3398 {
3399 found++;
df7492f9 3400 r[reg] = id;
bdd9fb48 3401 }
bdd9fb48
KH
3402 }
3403
3404 if (found)
3405 {
3406 for (reg = 0; reg < 4; reg++)
3407 if (r[reg] >= 0
df7492f9
KH
3408 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3409 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
e0e989f6 3410 }
b73bfc1c
KH
3411
3412 return dst;
e0e989f6
KH
3413}
3414
4ed46869
KH
3415/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3416
df7492f9
KH
3417static int
3418encode_coding_iso_2022 (coding)
4ed46869 3419 struct coding_system *coding;
4ed46869 3420{
df7492f9
KH
3421 int multibytep = coding->dst_multibyte;
3422 int *charbuf = coding->charbuf;
3423 int *charbuf_end = charbuf + coding->charbuf_used;
3424 unsigned char *dst = coding->destination + coding->produced;
3425 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3426 int safe_room = 16;
3427 int bol_designation
3428 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3429 && CODING_ISO_BOL (coding));
3430 int produced_chars = 0;
3431 Lisp_Object attrs, eol_type, charset_list;
3432 int ascii_compatible;
b73bfc1c 3433 int c;
05e6f5dc 3434
df7492f9 3435 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
004068e4
KH
3436 setup_iso_safe_charsets (attrs);
3437 coding->safe_charsets
3438 = (char *) XSTRING (CODING_ATTR_SAFE_CHARSETS(attrs))->data;
bdd9fb48 3439
df7492f9 3440 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4ed46869 3441
df7492f9 3442 while (charbuf < charbuf_end)
4ed46869 3443 {
df7492f9 3444 ASSURE_DESTINATION (safe_room);
b73bfc1c 3445
df7492f9 3446 if (bol_designation)
b73bfc1c 3447 {
df7492f9 3448 unsigned char *dst_prev = dst;
4ed46869 3449
bdd9fb48 3450 /* We have to produce designation sequences if any now. */
df7492f9
KH
3451 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3452 bol_designation = 0;
3453 /* We are sure that designation sequences are all ASCII bytes. */
3454 produced_chars += dst - dst_prev;
4ed46869 3455 }
ec6d2bb8 3456
df7492f9 3457 c = *charbuf++;
4ed46869 3458
b73bfc1c
KH
3459 /* Now encode the character C. */
3460 if (c < 0x20 || c == 0x7F)
3461 {
df7492f9
KH
3462 if (c == '\n'
3463 || (c == '\r' && EQ (eol_type, Qmac)))
19a8d9e0 3464 {
df7492f9
KH
3465 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3466 ENCODE_RESET_PLANE_AND_REGISTER ();
3467 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
b73bfc1c 3468 {
df7492f9
KH
3469 int i;
3470
3471 for (i = 0; i < 4; i++)
3472 CODING_ISO_DESIGNATION (coding, i)
3473 = CODING_ISO_INITIAL (coding, i);
b73bfc1c 3474 }
df7492f9
KH
3475 bol_designation
3476 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
19a8d9e0 3477 }
df7492f9
KH
3478 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3479 ENCODE_RESET_PLANE_AND_REGISTER ();
3480 EMIT_ONE_ASCII_BYTE (c);
4ed46869 3481 }
df7492f9 3482 else if (ASCII_CHAR_P (c))
88993dfd 3483 {
df7492f9
KH
3484 if (ascii_compatible)
3485 EMIT_ONE_ASCII_BYTE (c);
3486 else
3487 ENCODE_ISO_CHARACTER (CHARSET_FROM_ID (charset_ascii), c);
88993dfd 3488 }
b73bfc1c 3489 else
df7492f9
KH
3490 {
3491 struct charset *charset = char_charset (c, charset_list, NULL);
b73bfc1c 3492
df7492f9
KH
3493 if (!charset)
3494 {
3495 c = coding->default_char;
3496 charset = char_charset (c, charset_list, NULL);
3497 }
3498 ENCODE_ISO_CHARACTER (charset, c);
3499 }
84fbb8a0 3500 }
b73bfc1c 3501
df7492f9
KH
3502 if (coding->mode & CODING_MODE_LAST_BLOCK
3503 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3504 {
3505 ASSURE_DESTINATION (safe_room);
3506 ENCODE_RESET_PLANE_AND_REGISTER ();
3507 }
3508 coding->result = CODING_RESULT_SUCCESS;
3509 CODING_ISO_BOL (coding) = bol_designation;
3510 coding->produced_char += produced_chars;
3511 coding->produced = dst - coding->destination;
3512 return 0;
4ed46869
KH
3513}
3514
3515\f
df7492f9 3516/*** 8,9. SJIS and BIG5 handlers ***/
4ed46869 3517
df7492f9 3518/* Although SJIS and BIG5 are not ISO's coding system, they are used
4ed46869
KH
3519 quite widely. So, for the moment, Emacs supports them in the bare
3520 C code. But, in the future, they may be supported only by CCL. */
3521
3522/* SJIS is a coding system encoding three character sets: ASCII, right
3523 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3524 as is. A character of charset katakana-jisx0201 is encoded by
3525 "position-code + 0x80". A character of charset japanese-jisx0208
3526 is encoded in 2-byte but two position-codes are divided and shifted
df7492f9 3527 so that it fit in the range below.
4ed46869
KH
3528
3529 --- CODE RANGE of SJIS ---
3530 (character set) (range)
3531 ASCII 0x00 .. 0x7F
df7492f9 3532 KATAKANA-JISX0201 0xA0 .. 0xDF
c28a9453 3533 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
d14d03ac 3534 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4ed46869
KH
3535 -------------------------------
3536
3537*/
3538
3539/* BIG5 is a coding system encoding two character sets: ASCII and
3540 Big5. An ASCII character is encoded as is. Big5 is a two-byte
df7492f9 3541 character set and is encoded in two-byte.
4ed46869
KH
3542
3543 --- CODE RANGE of BIG5 ---
3544 (character set) (range)
3545 ASCII 0x00 .. 0x7F
3546 Big5 (1st byte) 0xA1 .. 0xFE
3547 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3548 --------------------------
3549
df7492f9 3550 */
4ed46869
KH
3551
3552/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3553 Check if a text is encoded in SJIS. If it is, return
df7492f9 3554 CATEGORY_MASK_SJIS, else return 0. */
4ed46869 3555
0a28aafb 3556static int
df7492f9
KH
3557detect_coding_sjis (coding, mask)
3558 struct coding_system *coding;
3559 int *mask;
4ed46869 3560{
df7492f9
KH
3561 unsigned char *src = coding->source, *src_base = src;
3562 unsigned char *src_end = coding->source + coding->src_bytes;
3563 int multibytep = coding->src_multibyte;
3564 int consumed_chars = 0;
3565 int found = 0;
b73bfc1c 3566 int c;
df7492f9
KH
3567
3568 /* A coding system of this category is always ASCII compatible. */
3569 src += coding->head_ascii;
4ed46869 3570
b73bfc1c 3571 while (1)
4ed46869 3572 {
df7492f9 3573 ONE_MORE_BYTE (c);
682169fe
KH
3574 if (c < 0x80)
3575 continue;
df7492f9 3576 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
4ed46869 3577 {
df7492f9 3578 ONE_MORE_BYTE (c);
682169fe 3579 if (c < 0x40 || c == 0x7F || c > 0xFC)
df7492f9
KH
3580 break;
3581 found = 1;
4ed46869 3582 }
df7492f9
KH
3583 else if (c >= 0xA0 && c < 0xE0)
3584 found = 1;
3585 else
3586 break;
4ed46869 3587 }
df7492f9
KH
3588 *mask &= ~CATEGORY_MASK_SJIS;
3589 return 0;
3590
3591 no_more_source:
3592 if (!found)
3593 return 0;
3594 *mask &= CATEGORY_MASK_SJIS;
3595 return 1;
4ed46869
KH
3596}
3597
3598/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3599 Check if a text is encoded in BIG5. If it is, return
df7492f9 3600 CATEGORY_MASK_BIG5, else return 0. */
4ed46869 3601
0a28aafb 3602static int
df7492f9
KH
3603detect_coding_big5 (coding, mask)
3604 struct coding_system *coding;
3605 int *mask;
4ed46869 3606{
df7492f9
KH
3607 unsigned char *src = coding->source, *src_base = src;
3608 unsigned char *src_end = coding->source + coding->src_bytes;
3609 int multibytep = coding->src_multibyte;
3610 int consumed_chars = 0;
3611 int found = 0;
b73bfc1c 3612 int c;
fa42c37f 3613
df7492f9
KH
3614 /* A coding system of this category is always ASCII compatible. */
3615 src += coding->head_ascii;
fa42c37f 3616
b73bfc1c 3617 while (1)
fa42c37f 3618 {
df7492f9
KH
3619 ONE_MORE_BYTE (c);
3620 if (c < 0x80)
fa42c37f 3621 continue;
df7492f9 3622 if (c >= 0xA1)
fa42c37f 3623 {
df7492f9
KH
3624 ONE_MORE_BYTE (c);
3625 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
fa42c37f 3626 return 0;
df7492f9 3627 found = 1;
fa42c37f 3628 }
df7492f9
KH
3629 else
3630 break;
fa42c37f 3631 }
df7492f9 3632 *mask &= ~CATEGORY_MASK_BIG5;
fa42c37f 3633 return 0;
df7492f9
KH
3634
3635 no_more_source:
3636 if (!found)
3637 return 0;
3638 *mask &= CATEGORY_MASK_BIG5;
3639 return 1;
fa42c37f
KH
3640}
3641
4ed46869
KH
3642/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3643 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3644
b73bfc1c 3645static void
df7492f9 3646decode_coding_sjis (coding)
4ed46869 3647 struct coding_system *coding;
4ed46869 3648{
df7492f9
KH
3649 unsigned char *src = coding->source + coding->consumed;
3650 unsigned char *src_end = coding->source + coding->src_bytes;
b73bfc1c 3651 unsigned char *src_base;
df7492f9
KH
3652 int *charbuf = coding->charbuf;
3653 int *charbuf_end = charbuf + coding->charbuf_size;
3654 int consumed_chars = 0, consumed_chars_base;
3655 int multibytep = coding->src_multibyte;
3656 struct charset *charset_roman, *charset_kanji, *charset_kana;
3657 Lisp_Object attrs, eol_type, charset_list, val;
a5d301df 3658
df7492f9
KH
3659 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3660
3661 val = charset_list;
3662 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3663 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3664 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val)));
4ed46869 3665
b73bfc1c 3666 while (1)
4ed46869 3667 {
df7492f9 3668 int c, c1;
b73bfc1c
KH
3669
3670 src_base = src;
df7492f9
KH
3671 consumed_chars_base = consumed_chars;
3672
3673 if (charbuf >= charbuf_end)
3674 break;
3675
3676 ONE_MORE_BYTE (c);
b73bfc1c 3677
df7492f9 3678 if (c == '\r')
4ed46869 3679 {
df7492f9 3680 if (EQ (eol_type, Qdos))
4ed46869 3681 {
df7492f9
KH
3682 if (src == src_end)
3683 goto no_more_source;
3684 if (*src == '\n')
3685 ONE_MORE_BYTE (c);
4ed46869 3686 }
df7492f9
KH
3687 else if (EQ (eol_type, Qmac))
3688 c = '\n';
4ed46869 3689 }
54f78171 3690 else
df7492f9
KH
3691 {
3692 struct charset *charset;
3693
3694 if (c < 0x80)
3695 charset = charset_roman;
3696 else
4ed46869 3697 {
df7492f9
KH
3698 if (c >= 0xF0)
3699 goto invalid_code;
3700 if (c < 0xA0 || c >= 0xE0)
fb88bf2d 3701 {
54f78171 3702 /* SJIS -> JISX0208 */
df7492f9
KH
3703 ONE_MORE_BYTE (c1);
3704 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
3705 goto invalid_code;
3706 c = (c << 8) | c1;
3707 SJIS_TO_JIS (c);
3708 charset = charset_kanji;
5e34de15 3709 }
fb88bf2d 3710 else
b73bfc1c 3711 /* SJIS -> JISX0201-Kana */
df7492f9
KH
3712 charset = charset_kana;
3713 }
3714 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
3715 }
3716 *charbuf++ = c;
3717 continue;
3718
3719 invalid_code:
3720 src = src_base;
3721 consumed_chars = consumed_chars_base;
3722 ONE_MORE_BYTE (c);
3723 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3724 coding->errors++;
3725 }
3726
3727 no_more_source:
3728 coding->consumed_char += consumed_chars_base;
3729 coding->consumed = src_base - coding->source;
3730 coding->charbuf_used = charbuf - coding->charbuf;
3731}
3732
3733static void
3734decode_coding_big5 (coding)
3735 struct coding_system *coding;
3736{
3737 unsigned char *src = coding->source + coding->consumed;
3738 unsigned char *src_end = coding->source + coding->src_bytes;
3739 unsigned char *src_base;
3740 int *charbuf = coding->charbuf;
3741 int *charbuf_end = charbuf + coding->charbuf_size;
3742 int consumed_chars = 0, consumed_chars_base;
3743 int multibytep = coding->src_multibyte;
3744 struct charset *charset_roman, *charset_big5;
3745 Lisp_Object attrs, eol_type, charset_list, val;
3746
3747 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3748 val = charset_list;
3749 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3750 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
3751
3752 while (1)
3753 {
3754 int c, c1;
3755
3756 src_base = src;
3757 consumed_chars_base = consumed_chars;
3758
3759 if (charbuf >= charbuf_end)
3760 break;
3761
3762 ONE_MORE_BYTE (c);
3763
3764 if (c == '\r')
3765 {
3766 if (EQ (eol_type, Qdos))
3767 {
3768 if (src == src_end)
3769 goto no_more_source;
3770 if (*src == '\n')
3771 ONE_MORE_BYTE (c);
4ed46869 3772 }
df7492f9
KH
3773 else if (EQ (eol_type, Qmac))
3774 c = '\n';
3775 }
3776 else
3777 {
3778 struct charset *charset;
3779 if (c < 0x80)
3780 charset = charset_roman;
fb88bf2d 3781 else
fb88bf2d 3782 {
54f78171 3783 /* BIG5 -> Big5 */
df7492f9
KH
3784 if (c < 0xA1 || c > 0xFE)
3785 goto invalid_code;
3786 ONE_MORE_BYTE (c1);
3787 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
3788 goto invalid_code;
3789 c = c << 8 | c1;
3790 charset = charset_big5;
4ed46869 3791 }
df7492f9 3792 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4ed46869 3793 }
4ed46869 3794
df7492f9 3795 *charbuf++ = c;
fb88bf2d
KH
3796 continue;
3797
df7492f9 3798 invalid_code:
4ed46869 3799 src = src_base;
df7492f9
KH
3800 consumed_chars = consumed_chars_base;
3801 ONE_MORE_BYTE (c);
3802 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3803 coding->errors++;
fb88bf2d 3804 }
d46c5b12 3805
df7492f9
KH
3806 no_more_source:
3807 coding->consumed_char += consumed_chars_base;
3808 coding->consumed = src_base - coding->source;
3809 coding->charbuf_used = charbuf - coding->charbuf;
3810}
3811
3812/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3813 This function can encode charsets `ascii', `katakana-jisx0201',
3814 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
3815 are sure that all these charsets are registered as official charset
3816 (i.e. do not have extended leading-codes). Characters of other
3817 charsets are produced without any encoding. If SJIS_P is 1, encode
3818 SJIS text, else encode BIG5 text. */
3819
3820static int
3821encode_coding_sjis (coding)
3822 struct coding_system *coding;
3823{
3824 int multibytep = coding->dst_multibyte;
3825 int *charbuf = coding->charbuf;
3826 int *charbuf_end = charbuf + coding->charbuf_used;
3827 unsigned char *dst = coding->destination + coding->produced;
3828 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3829 int safe_room = 4;
3830 int produced_chars = 0;
3831 Lisp_Object attrs, eol_type, charset_list, val;
3832 int ascii_compatible;
3833 struct charset *charset_roman, *charset_kanji, *charset_kana;
3834 int c;
3835
3836 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3837 val = charset_list;
3838 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3839 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3840 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
3841
3842 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3843
3844 while (charbuf < charbuf_end)
3845 {
3846 ASSURE_DESTINATION (safe_room);
3847 c = *charbuf++;
3848 /* Now encode the character C. */
3849 if (ASCII_CHAR_P (c) && ascii_compatible)
3850 EMIT_ONE_ASCII_BYTE (c);
3851 else
3852 {
3853 unsigned code;
3854 struct charset *charset = char_charset (c, charset_list, &code);
3855
3856 if (!charset)
3857 {
3858 c = coding->default_char;
3859 charset = char_charset (c, charset_list, &code);
3860 }
3861 if (code == CHARSET_INVALID_CODE (charset))
3862 abort ();
3863 if (charset == charset_kanji)
3864 {
3865 int c1, c2;
3866 JIS_TO_SJIS (code);
3867 c1 = code >> 8, c2 = code & 0xFF;
3868 EMIT_TWO_BYTES (c1, c2);
3869 }
3870 else if (charset == charset_kana)
3871 EMIT_ONE_BYTE (code | 0x80);
3872 else
3873 EMIT_ONE_ASCII_BYTE (code & 0x7F);
3874 }
3875 }
3876 coding->result = CODING_RESULT_SUCCESS;
3877 coding->produced_char += produced_chars;
3878 coding->produced = dst - coding->destination;
3879 return 0;
3880}
3881
3882static int
3883encode_coding_big5 (coding)
3884 struct coding_system *coding;
3885{
3886 int multibytep = coding->dst_multibyte;
3887 int *charbuf = coding->charbuf;
3888 int *charbuf_end = charbuf + coding->charbuf_used;
3889 unsigned char *dst = coding->destination + coding->produced;
3890 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3891 int safe_room = 4;
3892 int produced_chars = 0;
3893 Lisp_Object attrs, eol_type, charset_list, val;
3894 int ascii_compatible;
3895 struct charset *charset_roman, *charset_big5;
3896 int c;
3897
3898 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3899 val = charset_list;
3900 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3901 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
3902 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3903
3904 while (charbuf < charbuf_end)
3905 {
3906 ASSURE_DESTINATION (safe_room);
3907 c = *charbuf++;
3908 /* Now encode the character C. */
3909 if (ASCII_CHAR_P (c) && ascii_compatible)
3910 EMIT_ONE_ASCII_BYTE (c);
3911 else
3912 {
3913 unsigned code;
3914 struct charset *charset = char_charset (c, charset_list, &code);
3915
3916 if (! charset)
3917 {
3918 c = coding->default_char;
3919 charset = char_charset (c, charset_list, &code);
3920 }
3921 if (code == CHARSET_INVALID_CODE (charset))
3922 abort ();
3923 if (charset == charset_big5)
3924 {
3925 int c1, c2;
3926
3927 c1 = code >> 8, c2 = code & 0xFF;
3928 EMIT_TWO_BYTES (c1, c2);
3929 }
3930 else
3931 EMIT_ONE_ASCII_BYTE (code & 0x7F);
3932 }
3933 }
3934 coding->result = CODING_RESULT_SUCCESS;
3935 coding->produced_char += produced_chars;
3936 coding->produced = dst - coding->destination;
3937 return 0;
3938}
3939
3940\f
3941/*** 10. CCL handlers ***/
3942
3943/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3944 Check if a text is encoded in a coding system of which
3945 encoder/decoder are written in CCL program. If it is, return
3946 CATEGORY_MASK_CCL, else return 0. */
3947
3948static int
3949detect_coding_ccl (coding, mask)
3950 struct coding_system *coding;
3951 int *mask;
3952{
3953 unsigned char *src = coding->source, *src_base = src;
3954 unsigned char *src_end = coding->source + coding->src_bytes;
3955 int multibytep = coding->src_multibyte;
3956 int consumed_chars = 0;
3957 int found = 0;
3958 unsigned char *valids = CODING_CCL_VALIDS (coding);
3959 int head_ascii = coding->head_ascii;
3960 Lisp_Object attrs;
3961
3962 coding = &coding_categories[coding_category_ccl];
3963 attrs = CODING_ID_ATTRS (coding->id);
3964 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
3965 src += head_ascii;
3966
3967 while (1)
3968 {
3969 int c;
3970 ONE_MORE_BYTE (c);
3971 if (! valids[c])
3972 break;
3973 if (!found && valids[c] > 1)
3974 found = 1;
3975 }
3976 *mask &= ~CATEGORY_MASK_CCL;
3977 return 0;
3978
3979 no_more_source:
3980 if (!found)
3981 return 0;
3982 *mask &= CATEGORY_MASK_CCL;
3983 return 1;
3984}
3985
3986static void
3987decode_coding_ccl (coding)
3988 struct coding_system *coding;
3989{
3990 unsigned char *src = coding->source + coding->consumed;
3991 unsigned char *src_end = coding->source + coding->src_bytes;
3992 int *charbuf = coding->charbuf;
3993 int *charbuf_end = charbuf + coding->charbuf_size;
3994 int consumed_chars = 0;
3995 int multibytep = coding->src_multibyte;
3996 struct ccl_program ccl;
3997 int source_charbuf[1024];
3998 int source_byteidx[1024];
3999
4000 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4001
4002 while (src < src_end)
4003 {
4004 unsigned char *p = src;
4005 int *source, *source_end;
4006 int i = 0;
4007
4008 if (multibytep)
4009 while (i < 1024 && p < src_end)
4010 {
4011 source_byteidx[i] = p - src;
4012 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4013 }
4014 else
4015 while (i < 1024 && p < src_end)
4016 source_charbuf[i++] = *p++;
4017
4018 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4019 ccl.last_block = 1;
4020
4021 source = source_charbuf;
4022 source_end = source + i;
4023 while (source < source_end)
4024 {
4025 ccl_driver (&ccl, source, charbuf,
4026 source_end - source, charbuf_end - charbuf);
4027 source += ccl.consumed;
4028 charbuf += ccl.produced;
4029 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4030 break;
4031 }
4032 if (source < source_end)
4033 src += source_byteidx[source - source_charbuf];
4034 else
4035 src = p;
4036 consumed_chars += source - source_charbuf;
4037
4038 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4039 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4040 break;
4041 }
4042
4043 switch (ccl.status)
4044 {
4045 case CCL_STAT_SUSPEND_BY_SRC:
4046 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4047 break;
4048 case CCL_STAT_SUSPEND_BY_DST:
4049 break;
4050 case CCL_STAT_QUIT:
4051 case CCL_STAT_INVALID_CMD:
4052 coding->result = CODING_RESULT_INTERRUPT;
4053 break;
4054 default:
4055 coding->result = CODING_RESULT_SUCCESS;
4056 break;
4057 }
4058 coding->consumed_char += consumed_chars;
4059 coding->consumed = src - coding->source;
4060 coding->charbuf_used = charbuf - coding->charbuf;
4061}
4062
4063static int
4064encode_coding_ccl (coding)
4065 struct coding_system *coding;
4066{
4067 struct ccl_program ccl;
4068 int multibytep = coding->dst_multibyte;
4069 int *charbuf = coding->charbuf;
4070 int *charbuf_end = charbuf + coding->charbuf_used;
4071 unsigned char *dst = coding->destination + coding->produced;
4072 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4073 unsigned char *adjusted_dst_end = dst_end - 1;
4074 int destination_charbuf[1024];
4075 int i, produced_chars = 0;
4076
4077 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4078
4079 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4080 ccl.dst_multibyte = coding->dst_multibyte;
4081
4082 while (charbuf < charbuf_end && dst < adjusted_dst_end)
4083 {
4084 int dst_bytes = dst_end - dst;
4085 if (dst_bytes > 1024)
4086 dst_bytes = 1024;
4087
4088 ccl_driver (&ccl, charbuf, destination_charbuf,
4089 charbuf_end - charbuf, dst_bytes);
4090 charbuf += ccl.consumed;
4091 if (multibytep)
4092 for (i = 0; i < ccl.produced; i++)
4093 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4094 else
4095 {
4096 for (i = 0; i < ccl.produced; i++)
4097 *dst++ = destination_charbuf[i] & 0xFF;
4098 produced_chars += ccl.produced;
4099 }
4100 }
4101
4102 switch (ccl.status)
4103 {
4104 case CCL_STAT_SUSPEND_BY_SRC:
4105 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4106 break;
4107 case CCL_STAT_SUSPEND_BY_DST:
4108 coding->result = CODING_RESULT_INSUFFICIENT_DST;
4109 break;
4110 case CCL_STAT_QUIT:
4111 case CCL_STAT_INVALID_CMD:
4112 coding->result = CODING_RESULT_INTERRUPT;
4113 break;
4114 default:
4115 coding->result = CODING_RESULT_SUCCESS;
4116 break;
4117 }
4118
4119 coding->produced_char += produced_chars;
4120 coding->produced = dst - coding->destination;
4121 return 0;
4ed46869
KH
4122}
4123
df7492f9
KH
4124
4125\f
4126/*** 10, 11. no-conversion handlers ***/
4127
4128/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4ed46869 4129
b73bfc1c 4130static void
df7492f9 4131decode_coding_raw_text (coding)
4ed46869 4132 struct coding_system *coding;
4ed46869 4133{
df7492f9 4134 coding->chars_at_source = 1;
2c78b7e1
KH
4135 coding->consumed_char = 0;
4136 coding->consumed = 0;
df7492f9
KH
4137 coding->result = CODING_RESULT_SUCCESS;
4138}
4ed46869 4139
df7492f9
KH
4140static int
4141encode_coding_raw_text (coding)
4142 struct coding_system *coding;
4143{
4144 int multibytep = coding->dst_multibyte;
4145 int *charbuf = coding->charbuf;
4146 int *charbuf_end = coding->charbuf + coding->charbuf_used;
4147 unsigned char *dst = coding->destination + coding->produced;
4148 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4149 int produced_chars = 0;
4150 int c;
a5d301df 4151
df7492f9 4152 if (multibytep)
b73bfc1c 4153 {
df7492f9 4154 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4ed46869 4155
df7492f9
KH
4156 if (coding->src_multibyte)
4157 while (charbuf < charbuf_end)
4158 {
4159 ASSURE_DESTINATION (safe_room);
4160 c = *charbuf++;
4161 if (ASCII_CHAR_P (c))
4162 EMIT_ONE_ASCII_BYTE (c);
4163 else if (CHAR_BYTE8_P (c))
4164 {
4165 c = CHAR_TO_BYTE8 (c);
4166 EMIT_ONE_BYTE (c);
4167 }
4168 else
4169 {
4170 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
93dec019 4171
df7492f9
KH
4172 CHAR_STRING_ADVANCE (c, p1);
4173 while (p0 < p1)
4174 EMIT_ONE_BYTE (*p0);
4175 }
4176 }
b73bfc1c 4177 else
df7492f9
KH
4178 while (charbuf < charbuf_end)
4179 {
4180 ASSURE_DESTINATION (safe_room);
4181 c = *charbuf++;
4182 EMIT_ONE_BYTE (c);
4183 }
4184 }
4185 else
4186 {
4187 if (coding->src_multibyte)
b73bfc1c 4188 {
df7492f9
KH
4189 int safe_room = MAX_MULTIBYTE_LENGTH;
4190
4191 while (charbuf < charbuf_end)
b73bfc1c 4192 {
df7492f9
KH
4193 ASSURE_DESTINATION (safe_room);
4194 c = *charbuf++;
4195 if (ASCII_CHAR_P (c))
4196 *dst++ = c;
4197 else if (CHAR_BYTE8_P (c))
4198 *dst++ = CHAR_TO_BYTE8 (c);
b73bfc1c 4199 else
df7492f9
KH
4200 CHAR_STRING_ADVANCE (c, dst);
4201 produced_chars++;
b73bfc1c 4202 }
4ed46869 4203 }
df7492f9
KH
4204 else
4205 {
4206 ASSURE_DESTINATION (charbuf_end - charbuf);
4207 while (charbuf < charbuf_end && dst < dst_end)
4208 *dst++ = *charbuf++;
4209 produced_chars = dst - (coding->destination + coding->dst_bytes);
4210 }
4ed46869 4211 }
df7492f9
KH
4212 coding->result = CODING_RESULT_SUCCESS;
4213 coding->produced_char += produced_chars;
4214 coding->produced = dst - coding->destination;
4215 return 0;
4ed46869
KH
4216}
4217
0a28aafb 4218static int
df7492f9
KH
4219detect_coding_charset (coding, mask)
4220 struct coding_system *coding;
4221 int *mask;
1397dc18 4222{
df7492f9
KH
4223 unsigned char *src = coding->source, *src_base = src;
4224 unsigned char *src_end = coding->source + coding->src_bytes;
4225 int multibytep = coding->src_multibyte;
4226 int consumed_chars = 0;
4227 Lisp_Object attrs, valids;
1397dc18 4228
df7492f9
KH
4229 coding = &coding_categories[coding_category_charset];
4230 attrs = CODING_ID_ATTRS (coding->id);
4231 valids = AREF (attrs, coding_attr_charset_valids);
4232
4233 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4234 src += coding->head_ascii;
1397dc18 4235
b73bfc1c 4236 while (1)
1397dc18 4237 {
df7492f9 4238 int c;
1397dc18 4239
df7492f9
KH
4240 ONE_MORE_BYTE (c);
4241 if (NILP (AREF (valids, c)))
4242 break;
4243 }
4244 *mask &= ~CATEGORY_MASK_CHARSET;
4245 return 0;
4ed46869 4246
df7492f9
KH
4247 no_more_source:
4248 *mask &= CATEGORY_MASK_CHARSET;
4249 return 1;
4250}
4ed46869 4251
b73bfc1c 4252static void
df7492f9 4253decode_coding_charset (coding)
4ed46869 4254 struct coding_system *coding;
4ed46869 4255{
df7492f9
KH
4256 unsigned char *src = coding->source + coding->consumed;
4257 unsigned char *src_end = coding->source + coding->src_bytes;
b73bfc1c 4258 unsigned char *src_base;
df7492f9
KH
4259 int *charbuf = coding->charbuf;
4260 int *charbuf_end = charbuf + coding->charbuf_size;
4261 int consumed_chars = 0, consumed_chars_base;
4262 int multibytep = coding->src_multibyte;
4eb6d3f1 4263 Lisp_Object attrs, eol_type, charset_list, valids;
df7492f9
KH
4264
4265 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4eb6d3f1 4266 valids = AREF (attrs, coding_attr_charset_valids);
b73bfc1c 4267
df7492f9 4268 while (1)
4ed46869 4269 {
4eb6d3f1 4270 int c;
df7492f9
KH
4271
4272 src_base = src;
4273 consumed_chars_base = consumed_chars;
b73bfc1c 4274
df7492f9
KH
4275 if (charbuf >= charbuf_end)
4276 break;
4277
4eb6d3f1 4278 ONE_MORE_BYTE (c);
df7492f9 4279 if (c == '\r')
d46c5b12 4280 {
df7492f9 4281 if (EQ (eol_type, Qdos))
b73bfc1c 4282 {
4eb6d3f1
KH
4283 if (src < src_end
4284 && *src == '\n')
df7492f9 4285 ONE_MORE_BYTE (c);
b73bfc1c 4286 }
df7492f9 4287 else if (EQ (eol_type, Qmac))
b73bfc1c 4288 c = '\n';
d46c5b12 4289 }
df7492f9 4290 else
d46c5b12 4291 {
4eb6d3f1
KH
4292 Lisp_Object val;
4293 struct charset *charset;
4294 int c1;
4295
4296 val = AREF (valids, c);
4297 if (NILP (val))
4298 goto invalid_code;
4299 charset = CHARSET_FROM_ID (XFASTINT (val));
4300 if (CHARSET_DIMENSION (charset) > 1)
4301 {
4302 ONE_MORE_BYTE (c1);
4303 c = (c << 8) | c1;
4304 if (CHARSET_DIMENSION (charset) > 2)
4305 {
4306 ONE_MORE_BYTE (c1);
4307 c = (c << 8) | c1;
4308 if (CHARSET_DIMENSION (charset) > 3)
4309 {
4310 ONE_MORE_BYTE (c1);
4311 c = (c << 8) | c1;
4312 }
4313 }
4314 }
4315 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
df7492f9
KH
4316 if (c < 0)
4317 goto invalid_code;
d46c5b12 4318 }
df7492f9
KH
4319 *charbuf++ = c;
4320 continue;
4321
4322 invalid_code:
4323 src = src_base;
4324 consumed_chars = consumed_chars_base;
4325 ONE_MORE_BYTE (c);
4326 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4327 coding->errors++;
4ed46869
KH
4328 }
4329
df7492f9
KH
4330 no_more_source:
4331 coding->consumed_char += consumed_chars_base;
4332 coding->consumed = src_base - coding->source;
4333 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
4334}
4335
df7492f9
KH
4336static int
4337encode_coding_charset (coding)
4ed46869 4338 struct coding_system *coding;
4ed46869 4339{
df7492f9
KH
4340 int multibytep = coding->dst_multibyte;
4341 int *charbuf = coding->charbuf;
4342 int *charbuf_end = charbuf + coding->charbuf_used;
4343 unsigned char *dst = coding->destination + coding->produced;
4344 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4345 int safe_room = MAX_MULTIBYTE_LENGTH;
4346 int produced_chars = 0;
df7492f9
KH
4347 Lisp_Object attrs, eol_type, charset_list;
4348 int ascii_compatible;
b73bfc1c 4349 int c;
b73bfc1c 4350
df7492f9 4351 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
df7492f9 4352 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
fb88bf2d 4353
df7492f9 4354 while (charbuf < charbuf_end)
4ed46869 4355 {
4eb6d3f1 4356 struct charset *charset;
df7492f9
KH
4357 unsigned code;
4358
4359 ASSURE_DESTINATION (safe_room);
4360 c = *charbuf++;
4361 if (ascii_compatible && ASCII_CHAR_P (c))
4362 EMIT_ONE_ASCII_BYTE (c);
d46c5b12 4363 else
4eb6d3f1
KH
4364 {
4365 charset = char_charset (c, charset_list, &code);
4366 if (charset)
4367 {
4368 if (CHARSET_DIMENSION (charset) == 1)
4369 EMIT_ONE_BYTE (code);
4370 else if (CHARSET_DIMENSION (charset) == 2)
4371 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4372 else if (CHARSET_DIMENSION (charset) == 3)
4373 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4374 else
4375 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4376 (code >> 8) & 0xFF, code & 0xFF);
4377 }
4378 else
4379 EMIT_ONE_BYTE (coding->default_char);
4380 }
4ed46869
KH
4381 }
4382
df7492f9
KH
4383 coding->result = CODING_RESULT_SUCCESS;
4384 coding->produced_char += produced_chars;
4385 coding->produced = dst - coding->destination;
4386 return 0;
4ed46869
KH
4387}
4388
4389\f
1397dc18 4390/*** 7. C library functions ***/
4ed46869 4391
df7492f9 4392/* In Emacs Lisp, coding system is represented by a Lisp symbol which
4ed46869 4393 has a property `coding-system'. The value of this property is a
df7492f9 4394 vector of length 5 (called as coding-vector). Among elements of
4ed46869
KH
4395 this vector, the first (element[0]) and the fifth (element[4])
4396 carry important information for decoding/encoding. Before
4397 decoding/encoding, this information should be set in fields of a
4398 structure of type `coding_system'.
4399
df7492f9 4400 A value of property `coding-system' can be a symbol of another
4ed46869
KH
4401 subsidiary coding-system. In that case, Emacs gets coding-vector
4402 from that symbol.
4403
4404 `element[0]' contains information to be set in `coding->type'. The
4405 value and its meaning is as follows:
4406
0ef69138
KH
4407 0 -- coding_type_emacs_mule
4408 1 -- coding_type_sjis
df7492f9 4409 2 -- coding_type_iso_2022
0ef69138
KH
4410 3 -- coding_type_big5
4411 4 -- coding_type_ccl encoder/decoder written in CCL
4412 nil -- coding_type_no_conversion
4413 t -- coding_type_undecided (automatic conversion on decoding,
4414 no-conversion on encoding)
4ed46869
KH
4415
4416 `element[4]' contains information to be set in `coding->flags' and
4417 `coding->spec'. The meaning varies by `coding->type'.
4418
df7492f9 4419 If `coding->type' is `coding_type_iso_2022', element[4] is a vector
4ed46869
KH
4420 of length 32 (of which the first 13 sub-elements are used now).
4421 Meanings of these sub-elements are:
4422
df7492f9
KH
4423 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso_2022'
4424 If the value is an integer of valid charset, the charset is
4425 assumed to be designated to graphic register N initially.
4ed46869 4426
df7492f9
KH
4427 If the value is minus, it is a minus value of charset which
4428 reserves graphic register N, which means that the charset is
4429 not designated initially but should be designated to graphic
4430 register N just before encoding a character in that charset.
1397dc18 4431
df7492f9
KH
4432 If the value is nil, graphic register N is never used on
4433 encoding.
4434
4435 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
4436 Each value takes t or nil. See the section ISO2022 of
4437 `coding.h' for more information.
1397dc18 4438
df7492f9
KH
4439 If `coding->type' is `coding_type_big5', element[4] is t to denote
4440 BIG5-ETen or nil to denote BIG5-HKU.
4ed46869 4441
df7492f9 4442 If `coding->type' takes the other value, element[4] is ignored.
27901516 4443
df7492f9
KH
4444 Emacs Lisp's coding system also carries information about format of
4445 end-of-line in a value of property `eol-type'. If the value is
4446 integer, 0 means eol_lf, 1 means eol_crlf, and 2 means eol_cr. If
4447 it is not integer, it should be a vector of subsidiary coding
4448 systems of which property `eol-type' has one of above values.
4ed46869 4449
df7492f9 4450*/
4ed46869 4451
df7492f9
KH
4452/* Setup coding context CODING from information about CODING_SYSTEM.
4453 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4454 CODING_SYSTEM is invalid, signal an error. */
ec6d2bb8
KH
4455
4456void
df7492f9
KH
4457setup_coding_system (coding_system, coding)
4458 Lisp_Object coding_system;
ec6d2bb8
KH
4459 struct coding_system *coding;
4460{
df7492f9
KH
4461 Lisp_Object attrs;
4462 Lisp_Object eol_type;
4463 Lisp_Object coding_type;
4464 Lisp_Object val;
ec6d2bb8 4465
df7492f9
KH
4466 if (NILP (coding_system))
4467 coding_system = Qno_conversion;
4468
4469 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4470
4471 attrs = CODING_ID_ATTRS (coding->id);
4472 eol_type = CODING_ID_EOL_TYPE (coding->id);
4473
4474 coding->mode = 0;
4475 coding->head_ascii = -1;
4476 coding->common_flags
4477 = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4478
4479 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4480 coding->max_charset_id = XSTRING (val)->size - 1;
4481 coding->safe_charsets = (char *) XSTRING (val)->data;
4482 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4483
4484 coding_type = CODING_ATTR_TYPE (attrs);
4485 if (EQ (coding_type, Qundecided))
4486 {
4487 coding->detector = NULL;
4488 coding->decoder = decode_coding_raw_text;
4489 coding->encoder = encode_coding_raw_text;
4490 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4491 }
4492 else if (EQ (coding_type, Qiso_2022))
4493 {
4494 int i;
4495 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4496
4497 /* Invoke graphic register 0 to plane 0. */
4498 CODING_ISO_INVOCATION (coding, 0) = 0;
4499 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4500 CODING_ISO_INVOCATION (coding, 1)
4501 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4502 /* Setup the initial status of designation. */
4503 for (i = 0; i < 4; i++)
4504 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4505 /* Not single shifting initially. */
4506 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4507 /* Beginning of buffer should also be regarded as bol. */
4508 CODING_ISO_BOL (coding) = 1;
4509 coding->detector = detect_coding_iso_2022;
4510 coding->decoder = decode_coding_iso_2022;
4511 coding->encoder = encode_coding_iso_2022;
4512 if (flags & CODING_ISO_FLAG_SAFE)
4513 coding->mode |= CODING_MODE_SAFE_ENCODING;
4514 coding->common_flags
4515 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4516 | CODING_REQUIRE_FLUSHING_MASK);
4517 if (flags & CODING_ISO_FLAG_COMPOSITION)
4518 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4519 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4520 {
4521 setup_iso_safe_charsets (attrs);
4522 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4523 coding->max_charset_id = XSTRING (val)->size - 1;
4524 coding->safe_charsets = (char *) XSTRING (val)->data;
4525 }
4526 CODING_ISO_FLAGS (coding) = flags;
4527 }
4528 else if (EQ (coding_type, Qcharset))
4529 {
4530 coding->detector = detect_coding_charset;
4531 coding->decoder = decode_coding_charset;
4532 coding->encoder = encode_coding_charset;
4533 coding->common_flags
4534 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4535 }
4536 else if (EQ (coding_type, Qutf_8))
4537 {
4538 coding->detector = detect_coding_utf_8;
4539 coding->decoder = decode_coding_utf_8;
4540 coding->encoder = encode_coding_utf_8;
4541 coding->common_flags
4542 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4543 }
4544 else if (EQ (coding_type, Qutf_16))
4545 {
4546 val = AREF (attrs, coding_attr_utf_16_bom);
4547 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4548 : EQ (val, Qt) ? utf_16_with_bom
4549 : utf_16_without_bom);
4550 val = AREF (attrs, coding_attr_utf_16_endian);
4551 CODING_UTF_16_ENDIAN (coding) = (NILP (val) ? utf_16_big_endian
4552 : utf_16_little_endian);
e19c3639 4553 CODING_UTF_16_SURROGATE (coding) = 0;
df7492f9
KH
4554 coding->detector = detect_coding_utf_16;
4555 coding->decoder = decode_coding_utf_16;
4556 coding->encoder = encode_coding_utf_16;
4557 coding->common_flags
4558 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4559 }
4560 else if (EQ (coding_type, Qccl))
4561 {
4562 coding->detector = detect_coding_ccl;
4563 coding->decoder = decode_coding_ccl;
4564 coding->encoder = encode_coding_ccl;
4565 coding->common_flags
4566 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4567 | CODING_REQUIRE_FLUSHING_MASK);
4568 }
4569 else if (EQ (coding_type, Qemacs_mule))
4570 {
4571 coding->detector = detect_coding_emacs_mule;
4572 coding->decoder = decode_coding_emacs_mule;
4573 coding->encoder = encode_coding_emacs_mule;
4574 coding->common_flags
4575 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4576 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
4577 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
4578 {
4579 Lisp_Object tail, safe_charsets;
4580 int max_charset_id = 0;
4581
4582 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4583 tail = XCDR (tail))
4584 if (max_charset_id < XFASTINT (XCAR (tail)))
4585 max_charset_id = XFASTINT (XCAR (tail));
4586 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
4587 make_number (255));
4588 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4589 tail = XCDR (tail))
4590 XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0;
4591 coding->max_charset_id = max_charset_id;
4592 coding->safe_charsets = (char *) XSTRING (safe_charsets)->data;
4593 }
4594 }
4595 else if (EQ (coding_type, Qshift_jis))
4596 {
4597 coding->detector = detect_coding_sjis;
4598 coding->decoder = decode_coding_sjis;
4599 coding->encoder = encode_coding_sjis;
4600 coding->common_flags
4601 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4602 }
4603 else if (EQ (coding_type, Qbig5))
4604 {
4605 coding->detector = detect_coding_big5;
4606 coding->decoder = decode_coding_big5;
4607 coding->encoder = encode_coding_big5;
4608 coding->common_flags
4609 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4610 }
4611 else /* EQ (coding_type, Qraw_text) */
ec6d2bb8 4612 {
df7492f9
KH
4613 coding->detector = NULL;
4614 coding->decoder = decode_coding_raw_text;
4615 coding->encoder = encode_coding_raw_text;
4616 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
ec6d2bb8 4617 }
df7492f9
KH
4618
4619 return;
ec6d2bb8
KH
4620}
4621
df7492f9
KH
4622/* Return raw-text or one of its subsidiaries that has the same
4623 eol_type as CODING-SYSTEM. */
ec6d2bb8 4624
df7492f9
KH
4625Lisp_Object
4626raw_text_coding_system (coding_system)
4627 Lisp_Object coding_system;
ec6d2bb8 4628{
0be8721c 4629 Lisp_Object spec, attrs;
df7492f9
KH
4630 Lisp_Object eol_type, raw_text_eol_type;
4631
4632 spec = CODING_SYSTEM_SPEC (coding_system);
4633 attrs = AREF (spec, 0);
4634
4635 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
4636 return coding_system;
ec6d2bb8 4637
df7492f9
KH
4638 eol_type = AREF (spec, 2);
4639 if (VECTORP (eol_type))
4640 return Qraw_text;
4641 spec = CODING_SYSTEM_SPEC (Qraw_text);
4642 raw_text_eol_type = AREF (spec, 2);
4643 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
4644 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
4645 : AREF (raw_text_eol_type, 2));
ec6d2bb8
KH
4646}
4647
54f78171 4648
df7492f9
KH
4649/* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
4650 does, return one of the subsidiary that has the same eol-spec as
4651 PARENT. Otherwise, return CODING_SYSTEM. */
4652
4653Lisp_Object
4654coding_inherit_eol_type (coding_system, parent)
b74e4686 4655 Lisp_Object coding_system, parent;
54f78171 4656{
df7492f9 4657 Lisp_Object spec, attrs, eol_type;
54f78171 4658
df7492f9
KH
4659 spec = CODING_SYSTEM_SPEC (coding_system);
4660 attrs = AREF (spec, 0);
4661 eol_type = AREF (spec, 2);
4662 if (VECTORP (eol_type))
4663 {
4664 Lisp_Object parent_spec;
df7492f9
KH
4665 Lisp_Object parent_eol_type;
4666
4667 parent_spec
4668 = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
4669 parent_eol_type = AREF (parent_spec, 2);
4670 if (EQ (parent_eol_type, Qunix))
4671 coding_system = AREF (eol_type, 0);
4672 else if (EQ (parent_eol_type, Qdos))
4673 coding_system = AREF (eol_type, 1);
4674 else if (EQ (parent_eol_type, Qmac))
4675 coding_system = AREF (eol_type, 2);
54f78171 4676 }
df7492f9 4677 return coding_system;
54f78171
KH
4678}
4679
4ed46869
KH
4680/* Emacs has a mechanism to automatically detect a coding system if it
4681 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
4682 it's impossible to distinguish some coding systems accurately
4683 because they use the same range of codes. So, at first, coding
4684 systems are categorized into 7, those are:
4685
0ef69138 4686 o coding-category-emacs-mule
4ed46869
KH
4687
4688 The category for a coding system which has the same code range
4689 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 4690 symbol) `emacs-mule' by default.
4ed46869
KH
4691
4692 o coding-category-sjis
4693
4694 The category for a coding system which has the same code range
4695 as SJIS. Assigned the coding-system (Lisp
7717c392 4696 symbol) `japanese-shift-jis' by default.
4ed46869
KH
4697
4698 o coding-category-iso-7
4699
4700 The category for a coding system which has the same code range
7717c392 4701 as ISO2022 of 7-bit environment. This doesn't use any locking
d46c5b12
KH
4702 shift and single shift functions. This can encode/decode all
4703 charsets. Assigned the coding-system (Lisp symbol)
4704 `iso-2022-7bit' by default.
4705
4706 o coding-category-iso-7-tight
4707
4708 Same as coding-category-iso-7 except that this can
4709 encode/decode only the specified charsets.
4ed46869
KH
4710
4711 o coding-category-iso-8-1
4712
4713 The category for a coding system which has the same code range
4714 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
4715 for DIMENSION1 charset. This doesn't use any locking shift
4716 and single shift functions. Assigned the coding-system (Lisp
4717 symbol) `iso-latin-1' by default.
4ed46869
KH
4718
4719 o coding-category-iso-8-2
4720
4721 The category for a coding system which has the same code range
4722 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
4723 for DIMENSION2 charset. This doesn't use any locking shift
4724 and single shift functions. Assigned the coding-system (Lisp
4725 symbol) `japanese-iso-8bit' by default.
4ed46869 4726
7717c392 4727 o coding-category-iso-7-else
4ed46869
KH
4728
4729 The category for a coding system which has the same code range
df7492f9 4730 as ISO2022 of 7-bit environemnt but uses locking shift or
7717c392
KH
4731 single shift functions. Assigned the coding-system (Lisp
4732 symbol) `iso-2022-7bit-lock' by default.
4733
4734 o coding-category-iso-8-else
4735
4736 The category for a coding system which has the same code range
df7492f9 4737 as ISO2022 of 8-bit environemnt but uses locking shift or
7717c392
KH
4738 single shift functions. Assigned the coding-system (Lisp
4739 symbol) `iso-2022-8bit-ss2' by default.
4ed46869
KH
4740
4741 o coding-category-big5
4742
4743 The category for a coding system which has the same code range
4744 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 4745 `cn-big5' by default.
4ed46869 4746
fa42c37f
KH
4747 o coding-category-utf-8
4748
4749 The category for a coding system which has the same code range
4750 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
4751 symbol) `utf-8' by default.
4752
4753 o coding-category-utf-16-be
4754
4755 The category for a coding system in which a text has an
4756 Unicode signature (cf. Unicode Standard) in the order of BIG
4757 endian at the head. Assigned the coding-system (Lisp symbol)
4758 `utf-16-be' by default.
4759
4760 o coding-category-utf-16-le
4761
4762 The category for a coding system in which a text has an
4763 Unicode signature (cf. Unicode Standard) in the order of
4764 LITTLE endian at the head. Assigned the coding-system (Lisp
4765 symbol) `utf-16-le' by default.
4766
1397dc18
KH
4767 o coding-category-ccl
4768
4769 The category for a coding system of which encoder/decoder is
4770 written in CCL programs. The default value is nil, i.e., no
4771 coding system is assigned.
4772
4ed46869
KH
4773 o coding-category-binary
4774
4775 The category for a coding system not categorized in any of the
4776 above. Assigned the coding-system (Lisp symbol)
e0e989f6 4777 `no-conversion' by default.
4ed46869
KH
4778
4779 Each of them is a Lisp symbol and the value is an actual
df7492f9 4780 `coding-system's (this is also a Lisp symbol) assigned by a user.
4ed46869
KH
4781 What Emacs does actually is to detect a category of coding system.
4782 Then, it uses a `coding-system' assigned to it. If Emacs can't
df7492f9 4783 decide only one possible category, it selects a category of the
4ed46869
KH
4784 highest priority. Priorities of categories are also specified by a
4785 user in a Lisp variable `coding-category-list'.
4786
4787*/
4788
df7492f9
KH
4789#define EOL_SEEN_NONE 0
4790#define EOL_SEEN_LF 1
4791#define EOL_SEEN_CR 2
4792#define EOL_SEEN_CRLF 4
4ed46869 4793
df7492f9
KH
4794/* Detect how end-of-line of a text of length CODING->src_bytes
4795 pointed by CODING->source is encoded. Return one of
4796 EOL_SEEN_XXX. */
4ed46869 4797
bc4bc72a
RS
4798#define MAX_EOL_CHECK_COUNT 3
4799
d46c5b12 4800static int
df7492f9
KH
4801detect_eol (coding, source, src_bytes)
4802 struct coding_system *coding;
d46c5b12 4803 unsigned char *source;
df7492f9 4804 EMACS_INT src_bytes;
4ed46869 4805{
df7492f9 4806 Lisp_Object attrs, coding_type;
d46c5b12 4807 unsigned char *src = source, *src_end = src + src_bytes;
4ed46869 4808 unsigned char c;
df7492f9
KH
4809 int total = 0;
4810 int eol_seen = EOL_SEEN_NONE;
4ed46869 4811
df7492f9
KH
4812 attrs = CODING_ID_ATTRS (coding->id);
4813 coding_type = CODING_ATTR_TYPE (attrs);
d46c5b12 4814
df7492f9 4815 if (EQ (coding_type, Qccl))
4ed46869 4816 {
df7492f9 4817 int msb, lsb;
fa42c37f 4818
df7492f9
KH
4819 msb = coding->spec.utf_16.endian == utf_16_little_endian;
4820 lsb = 1 - msb;
fa42c37f 4821
df7492f9 4822 while (src + 1 < src_end)
fa42c37f 4823 {
df7492f9
KH
4824 c = src[lsb];
4825 if (src[msb] == 0 && (c == '\n' || c == '\r'))
fa42c37f 4826 {
df7492f9
KH
4827 int this_eol;
4828
4829 if (c == '\n')
4830 this_eol = EOL_SEEN_LF;
4831 else if (src + 3 >= src_end
4832 || src[msb + 2] != 0
4833 || src[lsb + 2] != '\n')
4834 this_eol = EOL_SEEN_CR;
fa42c37f 4835 else
df7492f9
KH
4836 this_eol = EOL_SEEN_CRLF;
4837
4838 if (eol_seen == EOL_SEEN_NONE)
4839 /* This is the first end-of-line. */
4840 eol_seen = this_eol;
4841 else if (eol_seen != this_eol)
fa42c37f 4842 {
df7492f9
KH
4843 /* The found type is different from what found before. */
4844 eol_seen = EOL_SEEN_LF;
4845 break;
fa42c37f 4846 }
df7492f9
KH
4847 if (++total == MAX_EOL_CHECK_COUNT)
4848 break;
fa42c37f 4849 }
df7492f9 4850 src += 2;
fa42c37f 4851 }
df7492f9 4852 }
d46c5b12 4853 else
27901516 4854 {
df7492f9 4855 while (src < src_end)
27901516 4856 {
df7492f9
KH
4857 c = *src++;
4858 if (c == '\n' || c == '\r')
4859 {
4860 int this_eol;
d46c5b12 4861
df7492f9
KH
4862 if (c == '\n')
4863 this_eol = EOL_SEEN_LF;
4864 else if (src >= src_end || *src != '\n')
4865 this_eol = EOL_SEEN_CR;
4866 else
4867 this_eol = EOL_SEEN_CRLF, src++;
d46c5b12 4868
df7492f9
KH
4869 if (eol_seen == EOL_SEEN_NONE)
4870 /* This is the first end-of-line. */
4871 eol_seen = this_eol;
4872 else if (eol_seen != this_eol)
4873 {
4874 /* The found type is different from what found before. */
4875 eol_seen = EOL_SEEN_LF;
4876 break;
4877 }
4878 if (++total == MAX_EOL_CHECK_COUNT)
4879 break;
4880 }
4881 }
73be902c 4882 }
df7492f9 4883 return eol_seen;
73be902c
KH
4884}
4885
df7492f9 4886
73be902c 4887static void
df7492f9
KH
4888adjust_coding_eol_type (coding, eol_seen)
4889 struct coding_system *coding;
4890 int eol_seen;
73be902c 4891{
0be8721c 4892 Lisp_Object eol_type;
df7492f9
KH
4893
4894 eol_type = CODING_ID_EOL_TYPE (coding->id);
4895 if (eol_seen & EOL_SEEN_LF)
4896 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
4897 else if (eol_type & EOL_SEEN_CRLF)
4898 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
4899 else if (eol_type & EOL_SEEN_CR)
4900 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
d46c5b12
KH
4901}
4902
df7492f9
KH
4903/* Detect how a text specified in CODING is encoded. If a coding
4904 system is detected, update fields of CODING by the detected coding
4905 system. */
4906
4907void
4908detect_coding (coding)
d46c5b12 4909 struct coding_system *coding;
d46c5b12 4910{
df7492f9
KH
4911 unsigned char *src, *src_end;
4912 Lisp_Object attrs, coding_type;
d46c5b12 4913
df7492f9
KH
4914 coding->consumed = coding->consumed_char = 0;
4915 coding->produced = coding->produced_char = 0;
4916 coding_set_source (coding);
1c3478b0 4917
df7492f9 4918 src_end = coding->source + coding->src_bytes;
1c3478b0 4919
df7492f9
KH
4920 /* If we have not yet decided the text encoding type, detect it
4921 now. */
4922 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
b73bfc1c 4923 {
df7492f9
KH
4924 int mask = CATEGORY_MASK_ANY;
4925 int c, i;
4926
4927 for (src = coding->source; src < src_end; src++)
4928 {
4929 c = *src;
4930 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
4931 || c == ISO_CODE_SI
4932 || c == ISO_CODE_SO)))
4933 break;
4934 }
4935 coding->head_ascii = src - (coding->source + coding->consumed);
4936
4937 if (coding->head_ascii < coding->src_bytes)
1c3478b0 4938 {
df7492f9
KH
4939 int detected = 0;
4940
4941 for (i = 0; i < coding_category_raw_text; i++)
1c3478b0 4942 {
df7492f9
KH
4943 enum coding_category category = coding_priorities[i];
4944 struct coding_system *this = coding_categories + category;
4945
4946 if (category >= coding_category_raw_text
4947 || detected & (1 << category))
4948 continue;
4949
4950 if (this->id < 0)
1c3478b0 4951 {
df7492f9
KH
4952 /* No coding system of this category is defined. */
4953 mask &= ~(1 << category);
4954 }
4955 else
4956 {
4957 detected |= detected_mask[category];
4958 if ((*(this->detector)) (coding, &mask))
4959 break;
1c3478b0
KH
4960 }
4961 }
df7492f9
KH
4962 if (! mask)
4963 setup_coding_system (Qraw_text, coding);
4964 else if (mask != CATEGORY_MASK_ANY)
4965 for (i = 0; i < coding_category_raw_text; i++)
4966 {
4967 enum coding_category category = coding_priorities[i];
4968 struct coding_system *this = coding_categories + category;
4969
4970 if (mask & (1 << category))
4971 {
4972 setup_coding_system (CODING_ID_NAME (this->id), coding);
4973 break;
4974 }
4975 }
1c3478b0 4976 }
b73bfc1c 4977 }
69f76525 4978
df7492f9
KH
4979 attrs = CODING_ID_ATTRS (coding->id);
4980 coding_type = CODING_ATTR_TYPE (attrs);
4981
4982 /* If we have not yet decided the EOL type, detect it now. But, the
4983 detection is impossible for a CCL based coding system, in which
4984 case, we detct the EOL type after decoding. */
4985 if (VECTORP (CODING_ID_EOL_TYPE (coding->id))
4986 && ! EQ (coding_type, Qccl))
d46c5b12 4987 {
df7492f9
KH
4988 int eol_seen = detect_eol (coding, coding->source, coding->src_bytes);
4989
4990 if (eol_seen != EOL_SEEN_NONE)
4991 adjust_coding_eol_type (coding, eol_seen);
d46c5b12 4992 }
4ed46869
KH
4993}
4994
aaaf0b1e
KH
4995
4996static void
df7492f9 4997decode_eol (coding)
aaaf0b1e 4998 struct coding_system *coding;
aaaf0b1e 4999{
df7492f9 5000 if (VECTORP (CODING_ID_EOL_TYPE (coding->id)))
aaaf0b1e 5001 {
df7492f9
KH
5002 unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5003 unsigned char *pend = p + coding->produced;
5004 int eol_seen = EOL_SEEN_NONE;
aaaf0b1e 5005
df7492f9 5006 for (; p < pend; p++)
aaaf0b1e 5007 {
df7492f9
KH
5008 if (*p == '\n')
5009 eol_seen |= EOL_SEEN_LF;
5010 else if (*p == '\r')
aaaf0b1e 5011 {
df7492f9 5012 if (p + 1 < pend && *(p + 1) == '\n')
aaaf0b1e 5013 {
df7492f9
KH
5014 eol_seen |= EOL_SEEN_CRLF;
5015 p++;
aaaf0b1e 5016 }
aaaf0b1e 5017 else
df7492f9 5018 eol_seen |= EOL_SEEN_CR;
aaaf0b1e 5019 }
aaaf0b1e 5020 }
df7492f9
KH
5021 if (eol_seen != EOL_SEEN_NONE)
5022 adjust_coding_eol_type (coding, eol_seen);
aaaf0b1e 5023 }
aaaf0b1e 5024
df7492f9
KH
5025 if (EQ (CODING_ID_EOL_TYPE (coding->id), Qmac))
5026 {
5027 unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5028 unsigned char *pend = p + coding->produced;
5029
5030 for (; p < pend; p++)
5031 if (*p == '\r')
5032 *p = '\n';
5033 }
5034 else if (EQ (CODING_ID_EOL_TYPE (coding->id), Qdos))
5035 {
5036 unsigned char *p, *pbeg, *pend;
5037 Lisp_Object undo_list;
5038
5039 move_gap_both (coding->dst_pos + coding->produced_char,
5040 coding->dst_pos_byte + coding->produced);
5041 undo_list = current_buffer->undo_list;
5042 current_buffer->undo_list = Qt;
5043 del_range_2 (coding->dst_pos, coding->dst_pos_byte, GPT, GPT_BYTE, Qnil);
5044 current_buffer->undo_list = undo_list;
5045 pbeg = GPT_ADDR;
5046 pend = pbeg + coding->produced;
5047
5048 for (p = pend - 1; p >= pbeg; p--)
5049 if (*p == '\r')
5050 {
5051 safe_bcopy ((char *) (p + 1), (char *) p, pend - p - 1);
5052 pend--;
5053 }
5054 coding->produced_char -= coding->produced - (pend - pbeg);
5055 coding->produced = pend - pbeg;
5056 insert_from_gap (coding->produced_char, coding->produced);
aaaf0b1e
KH
5057 }
5058}
5059
df7492f9
KH
5060static void
5061translate_chars (coding, table)
4ed46869 5062 struct coding_system *coding;
df7492f9 5063 Lisp_Object table;
4ed46869 5064{
df7492f9
KH
5065 int *charbuf = coding->charbuf;
5066 int *charbuf_end = charbuf + coding->charbuf_used;
5067 int c;
5068
5069 if (coding->chars_at_source)
5070 return;
4ed46869 5071
df7492f9 5072 while (charbuf < charbuf_end)
8844fa83 5073 {
df7492f9
KH
5074 c = *charbuf;
5075 if (c < 0)
5076 charbuf += c;
5077 else
5078 *charbuf++ = translate_char (table, c);
8844fa83 5079 }
df7492f9 5080}
4ed46869 5081
df7492f9
KH
5082static int
5083produce_chars (coding)
5084 struct coding_system *coding;
5085{
5086 unsigned char *dst = coding->destination + coding->produced;
5087 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5088 int produced;
5089 int produced_chars = 0;
b73bfc1c 5090
df7492f9 5091 if (! coding->chars_at_source)
4ed46869 5092 {
df7492f9
KH
5093 /* Characters are in coding->charbuf. */
5094 int *buf = coding->charbuf;
5095 int *buf_end = buf + coding->charbuf_used;
5096 unsigned char *adjusted_dst_end;
4ed46869 5097
df7492f9
KH
5098 if (BUFFERP (coding->src_object)
5099 && EQ (coding->src_object, coding->dst_object))
5100 dst_end = coding->source + coding->consumed;
5101 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
4ed46869 5102
df7492f9
KH
5103 while (buf < buf_end)
5104 {
5105 int c = *buf++;
5106
5107 if (dst >= adjusted_dst_end)
5108 {
5109 dst = alloc_destination (coding,
5110 buf_end - buf + MAX_MULTIBYTE_LENGTH,
5111 dst);
5112 dst_end = coding->destination + coding->dst_bytes;
5113 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5114 }
5115 if (c >= 0)
5116 {
5117 if (coding->dst_multibyte
5118 || ! CHAR_BYTE8_P (c))
5119 CHAR_STRING_ADVANCE (c, dst);
5120 else
5121 *dst++ = CHAR_TO_BYTE8 (c);
5122 produced_chars++;
5123 }
5124 else
5125 /* This is an annotation data. */
5126 buf -= c + 1;
5127 }
5128 }
5129 else
5130 {
df7492f9
KH
5131 unsigned char *src = coding->source;
5132 unsigned char *src_end = src + coding->src_bytes;
5133 Lisp_Object eol_type;
b73bfc1c 5134
df7492f9 5135 eol_type = CODING_ID_EOL_TYPE (coding->id);
4ed46869 5136
df7492f9 5137 if (coding->src_multibyte != coding->dst_multibyte)
aaaf0b1e 5138 {
df7492f9
KH
5139 if (coding->src_multibyte)
5140 {
71c81426 5141 int multibytep = 1;
df7492f9 5142 int consumed_chars;
d46c5b12 5143
df7492f9
KH
5144 while (1)
5145 {
5146 unsigned char *src_base = src;
5147 int c;
b73bfc1c 5148
df7492f9
KH
5149 ONE_MORE_BYTE (c);
5150 if (c == '\r')
5151 {
5152 if (EQ (eol_type, Qdos))
5153 {
5154 if (src < src_end
5155 && *src == '\n')
5156 c = *src++;
5157 }
5158 else if (EQ (eol_type, Qmac))
5159 c = '\n';
5160 }
5161 if (dst == dst_end)
5162 {
2c78b7e1 5163 coding->consumed = src - coding->source;
b73bfc1c 5164
2c78b7e1
KH
5165 if (EQ (coding->src_object, coding->dst_object))
5166 dst_end = src;
5167 if (dst == dst_end)
5168 {
5169 dst = alloc_destination (coding, src_end - src + 1,
5170 dst);
5171 dst_end = coding->destination + coding->dst_bytes;
5172 coding_set_source (coding);
5173 src = coding->source + coding->consumed;
5174 src_end = coding->source + coding->src_bytes;
5175 }
df7492f9
KH
5176 }
5177 *dst++ = c;
5178 produced_chars++;
5179 }
5180 no_more_source:
5181 ;
5182 }
5183 else
5184 while (src < src_end)
5185 {
71c81426 5186 int multibytep = 1;
df7492f9 5187 int c = *src++;
b73bfc1c 5188
df7492f9
KH
5189 if (c == '\r')
5190 {
5191 if (EQ (eol_type, Qdos))
5192 {
5193 if (src < src_end
5194 && *src == '\n')
5195 c = *src++;
5196 }
5197 else if (EQ (eol_type, Qmac))
5198 c = '\n';
5199 }
5200 if (dst >= dst_end - 1)
5201 {
2c78b7e1 5202 coding->consumed = src - coding->source;
df7492f9 5203
2c78b7e1
KH
5204 if (EQ (coding->src_object, coding->dst_object))
5205 dst_end = src;
5206 if (dst >= dst_end - 1)
5207 {
5208 dst = alloc_destination (coding, src_end - src + 2,
5209 dst);
5210 dst_end = coding->destination + coding->dst_bytes;
5211 coding_set_source (coding);
5212 src = coding->source + coding->consumed;
5213 src_end = coding->source + coding->src_bytes;
5214 }
df7492f9
KH
5215 }
5216 EMIT_ONE_BYTE (c);
5217 }
d46c5b12 5218 }
df7492f9
KH
5219 else
5220 {
5221 if (!EQ (coding->src_object, coding->dst_object))
5222 {
5223 int require = coding->src_bytes - coding->dst_bytes;
4ed46869 5224
df7492f9
KH
5225 if (require > 0)
5226 {
5227 EMACS_INT offset = src - coding->source;
5228
5229 dst = alloc_destination (coding, require, dst);
5230 coding_set_source (coding);
5231 src = coding->source + offset;
5232 src_end = coding->source + coding->src_bytes;
5233 }
5234 }
5235 produced_chars = coding->src_chars;
5236 while (src < src_end)
5237 {
5238 int c = *src++;
5239
5240 if (c == '\r')
5241 {
5242 if (EQ (eol_type, Qdos))
5243 {
5244 if (src < src_end
5245 && *src == '\n')
5246 c = *src++;
5247 produced_chars--;
5248 }
5249 else if (EQ (eol_type, Qmac))
5250 c = '\n';
5251 }
5252 *dst++ = c;
5253 }
5254 }
2c78b7e1
KH
5255 coding->consumed = coding->src_bytes;
5256 coding->consumed_char = coding->src_chars;
b73bfc1c 5257 }
4ed46869 5258
df7492f9
KH
5259 produced = dst - (coding->destination + coding->produced);
5260 if (BUFFERP (coding->dst_object))
5261 insert_from_gap (produced_chars, produced);
5262 coding->produced += produced;
5263 coding->produced_char += produced_chars;
5264 return produced_chars;
b73bfc1c 5265}
52d41803 5266
df7492f9
KH
5267/* [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN ]
5268 or
5269 [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN COMPONENTS... ]
5270 */
4ed46869 5271
df7492f9
KH
5272static INLINE void
5273produce_composition (coding, charbuf)
4ed46869 5274 struct coding_system *coding;
df7492f9 5275 int *charbuf;
4ed46869 5276{
df7492f9
KH
5277 Lisp_Object buffer;
5278 int len;
5279 EMACS_INT pos;
5280 enum composition_method method;
5281 int cmp_len;
5282 Lisp_Object components;
5283
5284 buffer = coding->dst_object;
5285 len = -charbuf[0];
5286 pos = coding->dst_pos + charbuf[1];
5287 method = (enum composition_method) (charbuf[3]);
5288 cmp_len = charbuf[4];
5289
5290 if (method == COMPOSITION_RELATIVE)
5291 components = Qnil;
5292 else
d46c5b12 5293 {
df7492f9
KH
5294 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5295 int i;
4ed46869 5296
df7492f9
KH
5297 len -= 5;
5298 charbuf += 5;
5299 for (i = 0; i < len; i++)
5300 args[i] = make_number (charbuf[i]);
5301 components = (method == COMPOSITION_WITH_ALTCHARS
5302 ? Fstring (len, args) : Fvector (len, args));
5303 }
5304 compose_text (pos, pos + cmp_len, components, Qnil, Qnil);
5305}
b73bfc1c 5306
df7492f9
KH
5307static int *
5308save_composition_data (buf, buf_end, prop)
5309 int *buf, *buf_end;
5310 Lisp_Object prop;
5311{
5312 enum composition_method method = COMPOSITION_METHOD (prop);
5313 int cmp_len = COMPOSITION_LENGTH (prop);
4ed46869 5314
df7492f9
KH
5315 if (buf + 4 + (MAX_COMPOSITION_COMPONENTS * 2 - 1) > buf_end)
5316 return NULL;
d46c5b12 5317
df7492f9
KH
5318 buf[1] = CODING_ANNOTATE_COMPOSITION_MASK;
5319 buf[2] = method;
5320 buf[3] = cmp_len;
b73bfc1c 5321
df7492f9
KH
5322 if (method == COMPOSITION_RELATIVE)
5323 buf[0] = 4;
5324 else
b73bfc1c 5325 {
df7492f9
KH
5326 Lisp_Object components;
5327 int len, i;
b73bfc1c 5328
df7492f9
KH
5329 components = COMPOSITION_COMPONENTS (prop);
5330 if (VECTORP (components))
d46c5b12 5331 {
df7492f9
KH
5332 len = XVECTOR (components)->size;
5333 for (i = 0; i < len; i++)
5334 buf[4 + i] = XINT (AREF (components, i));
5335 }
5336 else if (STRINGP (components))
5337 {
5338 int i_byte;
b73bfc1c 5339
df7492f9
KH
5340 len = XSTRING (components)->size;
5341 i = i_byte = 0;
5342 while (i < len)
5343 FETCH_STRING_CHAR_ADVANCE (buf[4 + i], components, i, i_byte);
5344 }
5345 else if (INTEGERP (components))
5346 {
5347 len = 1;
5348 buf[4] = XINT (components);
5349 }
5350 else if (CONSP (components))
5351 {
5352 for (len = 0; CONSP (components);
5353 len++, components = XCDR (components))
5354 buf[4 + len] = XINT (XCAR (components));
d46c5b12 5355 }
df7492f9
KH
5356 else
5357 abort ();
5358 buf[0] = 4 + len;
4ed46869 5359 }
df7492f9 5360 return (buf + buf[0]);
4ed46869
KH
5361}
5362
df7492f9
KH
5363#define CHARBUF_SIZE 0x4000
5364
5365#define ALLOC_CONVERSION_WORK_AREA(coding) \
5366 do { \
5367 int size = CHARBUF_SIZE;; \
5368 \
5369 coding->charbuf = NULL; \
5370 while (size > 1024) \
5371 { \
5372 coding->charbuf = (int *) alloca (sizeof (int) * size); \
5373 if (coding->charbuf) \
5374 break; \
5375 size >>= 1; \
5376 } \
5377 if (! coding->charbuf) \
5378 { \
5379 coding->result = CODING_RESULT_INSUFFICIENT_MEM; \
5380 return coding->result; \
5381 } \
5382 coding->charbuf_size = size; \
5383 } while (0)
4ed46869 5384
d46c5b12
KH
5385
5386static void
df7492f9 5387produce_annotation (coding)
d46c5b12 5388 struct coding_system *coding;
d46c5b12 5389{
df7492f9
KH
5390 int *charbuf = coding->charbuf;
5391 int *charbuf_end = charbuf + coding->charbuf_used;
d46c5b12 5392
df7492f9 5393 while (charbuf < charbuf_end)
d46c5b12 5394 {
df7492f9
KH
5395 if (*charbuf >= 0)
5396 charbuf++;
d46c5b12 5397 else
d46c5b12 5398 {
df7492f9
KH
5399 int len = -*charbuf;
5400 switch (charbuf[2])
5401 {
5402 case CODING_ANNOTATE_COMPOSITION_MASK:
5403 produce_composition (coding, charbuf);
5404 break;
5405 default:
5406 abort ();
5407 }
5408 charbuf += len;
d46c5b12 5409 }
df7492f9
KH
5410 }
5411}
d46c5b12 5412
df7492f9
KH
5413/* Decode the data at CODING->src_object into CODING->dst_object.
5414 CODING->src_object is a buffer, a string, or nil.
5415 CODING->dst_object is a buffer.
de79a6a5 5416
df7492f9
KH
5417 If CODING->src_object is a buffer, it must be the current buffer.
5418 In this case, if CODING->src_pos is positive, it is a position of
5419 the source text in the buffer, otherwise, the source text is in the
5420 gap area of the buffer, and CODING->src_pos specifies the offset of
5421 the text from GPT (which must be the same as PT). If this is the
5422 same buffer as CODING->dst_object, CODING->src_pos must be
5423 negative.
b73bfc1c 5424
df7492f9
KH
5425 If CODING->src_object is a string, CODING->src_pos in an index to
5426 that string.
d46c5b12 5427
df7492f9
KH
5428 If CODING->src_object is nil, CODING->source must already point to
5429 the non-relocatable memory area. In this case, CODING->src_pos is
5430 an offset from CODING->source.
d46c5b12 5431
df7492f9
KH
5432 The decoded data is inserted at the current point of the buffer
5433 CODING->dst_object.
5434*/
5435
5436static int
5437decode_coding (coding)
d46c5b12 5438 struct coding_system *coding;
d46c5b12 5439{
df7492f9 5440 Lisp_Object attrs;
d46c5b12 5441
df7492f9
KH
5442 if (BUFFERP (coding->src_object)
5443 && coding->src_pos > 0
5444 && coding->src_pos < GPT
5445 && coding->src_pos + coding->src_chars > GPT)
5446 move_gap_both (coding->src_pos, coding->src_pos_byte);
d46c5b12 5447
df7492f9 5448 if (BUFFERP (coding->dst_object))
88993dfd 5449 {
df7492f9
KH
5450 if (current_buffer != XBUFFER (coding->dst_object))
5451 set_buffer_internal (XBUFFER (coding->dst_object));
5452 if (GPT != PT)
5453 move_gap_both (PT, PT_BYTE);
88993dfd
KH
5454 }
5455
df7492f9
KH
5456 coding->consumed = coding->consumed_char = 0;
5457 coding->produced = coding->produced_char = 0;
5458 coding->chars_at_source = 0;
5459 coding->result = CODING_RESULT_SUCCESS;
5460 coding->errors = 0;
5461
5462 ALLOC_CONVERSION_WORK_AREA (coding);
5463
5464 attrs = CODING_ID_ATTRS (coding->id);
5465
5466 do
d46c5b12 5467 {
df7492f9
KH
5468 coding_set_source (coding);
5469 coding->annotated = 0;
5470 (*(coding->decoder)) (coding);
5471 if (!NILP (CODING_ATTR_DECODE_TBL (attrs)))
5472 translate_chars (CODING_ATTR_DECODE_TBL (attrs), coding);
5473 coding_set_destination (coding);
5474 produce_chars (coding);
5475 if (coding->annotated)
5476 produce_annotation (coding);
d46c5b12 5477 }
df7492f9
KH
5478 while (coding->consumed < coding->src_bytes
5479 && ! coding->result);
d46c5b12 5480
df7492f9
KH
5481 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qccl)
5482 && SYMBOLP (CODING_ID_EOL_TYPE (coding->id))
5483 && ! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
5484 decode_eol (coding);
d46c5b12 5485
df7492f9
KH
5486 coding->carryover_bytes = 0;
5487 if (coding->consumed < coding->src_bytes)
d46c5b12 5488 {
df7492f9
KH
5489 int nbytes = coding->src_bytes - coding->consumed;
5490 unsigned char *src;
5491
5492 coding_set_source (coding);
5493 coding_set_destination (coding);
5494 src = coding->source + coding->consumed;
5495
5496 if (coding->mode & CODING_MODE_LAST_BLOCK)
d46c5b12 5497 {
df7492f9
KH
5498 /* Flush out unprocessed data as binary chars. We are sure
5499 that the number of data is less than the size of
5500 coding->charbuf. */
5501 int *charbuf = coding->charbuf;
5502
5503 while (nbytes-- > 0)
d46c5b12 5504 {
df7492f9
KH
5505 int c = *src++;
5506 *charbuf++ = (c & 0x80 ? - c : c);
d46c5b12 5507 }
df7492f9 5508 produce_chars (coding);
d46c5b12 5509 }
d46c5b12 5510 else
df7492f9
KH
5511 {
5512 /* Record unprocessed bytes in coding->carryover. We are
5513 sure that the number of data is less than the size of
5514 coding->carryover. */
5515 unsigned char *p = coding->carryover;
5516
5517 coding->carryover_bytes = nbytes;
5518 while (nbytes-- > 0)
5519 *p++ = *src++;
5520 }
5521 coding->consumed = coding->src_bytes;
5522 }
b73bfc1c 5523
df7492f9 5524 return coding->result;
d46c5b12
KH
5525}
5526
df7492f9
KH
5527static void
5528consume_chars (coding)
5529 struct coding_system *coding;
5530{
5531 int *buf = coding->charbuf;
5532 /* -1 is to compensate for CRLF. */
5533 int *buf_end = coding->charbuf + coding->charbuf_size - 1;
5534 unsigned char *src = coding->source + coding->consumed;
5535 int pos = coding->src_pos + coding->consumed_char;
5536 int end_pos = coding->src_pos + coding->src_chars;
5537 int multibytep = coding->src_multibyte;
5538 Lisp_Object eol_type;
5539 int c;
5540 int start, end, stop;
5541 Lisp_Object object, prop;
88993dfd 5542
df7492f9
KH
5543 eol_type = CODING_ID_EOL_TYPE (coding->id);
5544 if (VECTORP (eol_type))
5545 eol_type = Qunix;
88993dfd 5546
df7492f9 5547 object = coding->src_object;
b843d1ae 5548
df7492f9
KH
5549 /* Note: composition handling is not yet implemented. */
5550 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
ec6d2bb8 5551
df7492f9
KH
5552 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK
5553 && find_composition (pos, end_pos, &start, &end, &prop, object)
5554 && end <= end_pos
5555 && (start >= pos
5556 || (find_composition (end, end_pos, &start, &end, &prop, object)
5557 && end <= end_pos)))
5558 stop = start;
5559 else
5560 stop = end_pos;
ec6d2bb8 5561
df7492f9 5562 while (buf < buf_end)
ec6d2bb8 5563 {
df7492f9 5564 if (pos == stop)
ec6d2bb8 5565 {
df7492f9 5566 int *p;
ec6d2bb8 5567
df7492f9
KH
5568 if (pos == end_pos)
5569 break;
5570 p = save_composition_data (buf, buf_end, prop);
5571 if (p == NULL)
5572 break;
5573 buf = p;
5574 if (find_composition (end, end_pos, &start, &end, &prop, object)
5575 && end <= end_pos)
5576 stop = start;
5577 else
5578 stop = end_pos;
5579 }
5580
5581 if (! multibytep)
5582 c = *src++;
5583 else
5584 c = STRING_CHAR_ADVANCE (src);
5585 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
5586 c = '\n';
5587 if (! EQ (eol_type, Qunix))
5588 {
5589 if (c == '\n')
5590 {
5591 if (EQ (eol_type, Qdos))
5592 *buf++ = '\r';
5593 else
5594 c = '\r';
ec6d2bb8 5595 }
ec6d2bb8 5596 }
df7492f9
KH
5597 *buf++ = c;
5598 pos++;
ec6d2bb8 5599 }
ec6d2bb8 5600
df7492f9
KH
5601 coding->consumed = src - coding->source;
5602 coding->consumed_char = pos - coding->src_pos;
5603 coding->charbuf_used = buf - coding->charbuf;
5604 coding->chars_at_source = 0;
ec6d2bb8
KH
5605}
5606
ec6d2bb8 5607
df7492f9
KH
5608/* Encode the text at CODING->src_object into CODING->dst_object.
5609 CODING->src_object is a buffer or a string.
5610 CODING->dst_object is a buffer or nil.
5611
5612 If CODING->src_object is a buffer, it must be the current buffer.
5613 In this case, if CODING->src_pos is positive, it is a position of
5614 the source text in the buffer, otherwise. the source text is in the
5615 gap area of the buffer, and coding->src_pos specifies the offset of
5616 the text from GPT (which must be the same as PT). If this is the
5617 same buffer as CODING->dst_object, CODING->src_pos must be
5618 negative and CODING should not have `pre-write-conversion'.
5619
5620 If CODING->src_object is a string, CODING should not have
5621 `pre-write-conversion'.
5622
5623 If CODING->dst_object is a buffer, the encoded data is inserted at
5624 the current point of that buffer.
5625
5626 If CODING->dst_object is nil, the encoded data is placed at the
5627 memory area specified by CODING->destination. */
5628
5629static int
5630encode_coding (coding)
ec6d2bb8 5631 struct coding_system *coding;
ec6d2bb8 5632{
df7492f9 5633 Lisp_Object attrs;
ec6d2bb8 5634
df7492f9 5635 attrs = CODING_ID_ATTRS (coding->id);
ec6d2bb8 5636
df7492f9 5637 if (BUFFERP (coding->dst_object))
ec6d2bb8 5638 {
df7492f9
KH
5639 set_buffer_internal (XBUFFER (coding->dst_object));
5640 coding->dst_multibyte
5641 = ! NILP (current_buffer->enable_multibyte_characters);
5642 }
ec6d2bb8 5643
df7492f9
KH
5644 coding->consumed = coding->consumed_char = 0;
5645 coding->produced = coding->produced_char = 0;
5646 coding->result = CODING_RESULT_SUCCESS;
5647 coding->errors = 0;
ec6d2bb8 5648
df7492f9 5649 ALLOC_CONVERSION_WORK_AREA (coding);
ec6d2bb8 5650
df7492f9
KH
5651 do {
5652 coding_set_source (coding);
5653 consume_chars (coding);
5654
5655 if (!NILP (CODING_ATTR_ENCODE_TBL (attrs)))
5656 translate_chars (CODING_ATTR_ENCODE_TBL (attrs), coding);
5657
5658 coding_set_destination (coding);
5659 (*(coding->encoder)) (coding);
5660 } while (coding->consumed_char < coding->src_chars);
5661
5662 if (BUFFERP (coding->dst_object))
5663 insert_from_gap (coding->produced_char, coding->produced);
5664
5665 return (coding->result);
ec6d2bb8
KH
5666}
5667
df7492f9 5668/* Work buffer */
fb88bf2d 5669
df7492f9
KH
5670/* List of currently used working buffer. */
5671Lisp_Object Vcode_conversion_work_buf_list;
d46c5b12 5672
df7492f9
KH
5673/* A working buffer used by the top level conversion. */
5674Lisp_Object Vcode_conversion_reused_work_buf;
b73bfc1c 5675
4ed46869 5676
df7492f9
KH
5677/* Return a working buffer that can be freely used by the following
5678 code conversion. MULTIBYTEP specifies the multibyteness of the
5679 buffer. */
b73bfc1c 5680
df7492f9
KH
5681Lisp_Object
5682make_conversion_work_buffer (multibytep)
5683 int multibytep;
5684{
5685 struct buffer *current = current_buffer;
5686 Lisp_Object buf;
d46c5b12 5687
df7492f9 5688 if (NILP (Vcode_conversion_work_buf_list))
e133c8fa 5689 {
df7492f9
KH
5690 if (NILP (Vcode_conversion_reused_work_buf))
5691 Vcode_conversion_reused_work_buf
5692 = Fget_buffer_create (build_string (" *code-conversion-work*"));
5693 Vcode_conversion_work_buf_list
5694 = Fcons (Vcode_conversion_reused_work_buf, Qnil);
e133c8fa 5695 }
df7492f9 5696 else
d46c5b12 5697 {
df7492f9
KH
5698 int depth = Flength (Vcode_conversion_work_buf_list);
5699 char str[128];
e077cc80 5700
df7492f9
KH
5701 sprintf (str, " *code-conversion-work*<%d>", depth);
5702 Vcode_conversion_work_buf_list
5703 = Fcons (Fget_buffer_create (build_string (str)),
5704 Vcode_conversion_work_buf_list);
d46c5b12 5705 }
d46c5b12 5706
df7492f9
KH
5707 buf = XCAR (Vcode_conversion_work_buf_list);
5708 set_buffer_internal (XBUFFER (buf));
5709 current_buffer->undo_list = Qt;
5710 Ferase_buffer ();
5711 Fset_buffer_multibyte (multibytep ? Qt : Qnil);
5712 set_buffer_internal (current);
5713 return buf;
5714}
d46c5b12 5715
df7492f9 5716static struct coding_system *saved_coding;
d46c5b12 5717
df7492f9
KH
5718Lisp_Object
5719code_conversion_restore (info)
5720 Lisp_Object info;
5721{
5722 int depth = Flength (Vcode_conversion_work_buf_list);
5723 Lisp_Object buf;
d46c5b12 5724
df7492f9 5725 if (depth > 0)
d46c5b12 5726 {
df7492f9
KH
5727 buf = XCAR (Vcode_conversion_work_buf_list);
5728 Vcode_conversion_work_buf_list = XCDR (Vcode_conversion_work_buf_list);
5729 if (depth > 1 && !NILP (Fbuffer_live_p (buf)))
5730 Fkill_buffer (buf);
5731 }
d46c5b12 5732
df7492f9
KH
5733 if (saved_coding->dst_object == Qt
5734 && saved_coding->destination)
5735 xfree (saved_coding->destination);
b843d1ae 5736
df7492f9
KH
5737 return save_excursion_restore (info);
5738}
d46c5b12 5739
12410ef1 5740
df7492f9
KH
5741int
5742decode_coding_gap (coding, chars, bytes)
5743 struct coding_system *coding;
5744 EMACS_INT chars, bytes;
5745{
5746 int count = specpdl_ptr - specpdl;
fb88bf2d 5747
df7492f9
KH
5748 saved_coding = coding;
5749 record_unwind_protect (code_conversion_restore, save_excursion_save ());
ec6d2bb8 5750
df7492f9
KH
5751 coding->src_object = Fcurrent_buffer ();
5752 coding->src_chars = chars;
5753 coding->src_bytes = bytes;
5754 coding->src_pos = -chars;
5755 coding->src_pos_byte = -bytes;
5756 coding->src_multibyte = chars < bytes;
5757 coding->dst_object = coding->src_object;
5758 coding->dst_pos = PT;
5759 coding->dst_pos_byte = PT_BYTE;
71c81426 5760 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
4956c225 5761
df7492f9
KH
5762 if (CODING_REQUIRE_DETECTION (coding))
5763 detect_coding (coding);
5764
5765 decode_coding (coding);
d46c5b12 5766
df7492f9
KH
5767 unbind_to (count, Qnil);
5768 return coding->result;
5769}
d46c5b12 5770
df7492f9
KH
5771int
5772encode_coding_gap (coding, chars, bytes)
5773 struct coding_system *coding;
5774 EMACS_INT chars, bytes;
5775{
5776 int count = specpdl_ptr - specpdl;
5777 Lisp_Object buffer;
d46c5b12 5778
df7492f9
KH
5779 saved_coding = coding;
5780 record_unwind_protect (code_conversion_restore, save_excursion_save ());
fb88bf2d 5781
df7492f9
KH
5782 buffer = Fcurrent_buffer ();
5783 coding->src_object = buffer;
5784 coding->src_chars = chars;
5785 coding->src_bytes = bytes;
5786 coding->src_pos = -chars;
5787 coding->src_pos_byte = -bytes;
5788 coding->src_multibyte = chars < bytes;
5789 coding->dst_object = coding->src_object;
5790 coding->dst_pos = PT;
5791 coding->dst_pos_byte = PT_BYTE;
fb88bf2d 5792
df7492f9 5793 encode_coding (coding);
f2558efd 5794
df7492f9
KH
5795 unbind_to (count, Qnil);
5796 return coding->result;
5797}
b73bfc1c 5798
d46c5b12 5799
df7492f9
KH
5800/* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
5801 SRC_OBJECT into DST_OBJECT by coding context CODING.
ec6d2bb8 5802
df7492f9 5803 SRC_OBJECT is a buffer, a string, or Qnil.
ec6d2bb8 5804
df7492f9
KH
5805 If it is a buffer, the text is at point of the buffer. FROM and TO
5806 are positions in the buffer.
ec6d2bb8 5807
df7492f9
KH
5808 If it is a string, the text is at the beginning of the string.
5809 FROM and TO are indices to the string.
ec6d2bb8 5810
df7492f9
KH
5811 If it is nil, the text is at coding->source. FROM and TO are
5812 indices to coding->source.
ec6d2bb8 5813
df7492f9 5814 DST_OBJECT is a buffer, Qt, or Qnil.
d46c5b12 5815
df7492f9
KH
5816 If it is a buffer, the decoded text is inserted at point of the
5817 buffer. If the buffer is the same as SRC_OBJECT, the source text
5818 is deleted.
d46c5b12 5819
df7492f9
KH
5820 If it is Qt, a string is made from the decoded text, and
5821 set in CODING->dst_object.
d46c5b12 5822
df7492f9
KH
5823 If it is Qnil, the decoded text is stored at CODING->destination.
5824 The called must allocate CODING->dst_bytes bytes at
5825 CODING->destination by xmalloc. If the decoded text is longer than
5826 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
5827 */
d46c5b12 5828
df7492f9
KH
5829void
5830decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
5831 dst_object)
5832 struct coding_system *coding;
5833 Lisp_Object src_object;
5834 EMACS_INT from, from_byte, to, to_byte;
5835 Lisp_Object dst_object;
5836{
5837 int count = specpdl_ptr - specpdl;
5838 unsigned char *destination;
5839 EMACS_INT dst_bytes;
5840 EMACS_INT chars = to - from;
5841 EMACS_INT bytes = to_byte - from_byte;
5842 Lisp_Object attrs;
d46c5b12 5843
df7492f9
KH
5844 saved_coding = coding;
5845 record_unwind_protect (code_conversion_restore, save_excursion_save ());
93dec019 5846
df7492f9
KH
5847 if (NILP (dst_object))
5848 {
5849 destination = coding->destination;
5850 dst_bytes = coding->dst_bytes;
5851 }
93dec019 5852
df7492f9
KH
5853 coding->src_object = src_object;
5854 coding->src_chars = chars;
5855 coding->src_bytes = bytes;
5856 coding->src_multibyte = chars < bytes;
70ad9fc4 5857
df7492f9
KH
5858 if (STRINGP (src_object))
5859 {
5860 coding->src_pos = from;
5861 coding->src_pos_byte = from_byte;
5862 }
5863 else if (BUFFERP (src_object))
5864 {
5865 set_buffer_internal (XBUFFER (src_object));
5866 if (from != GPT)
5867 move_gap_both (from, from_byte);
5868 if (EQ (src_object, dst_object))
fb88bf2d 5869 {
df7492f9
KH
5870 TEMP_SET_PT_BOTH (from, from_byte);
5871 del_range_both (from, from_byte, to, to_byte, 1);
5872 coding->src_pos = -chars;
5873 coding->src_pos_byte = -bytes;
fb88bf2d 5874 }
df7492f9 5875 else
fb88bf2d 5876 {
df7492f9
KH
5877 coding->src_pos = from;
5878 coding->src_pos_byte = from_byte;
fb88bf2d 5879 }
d46c5b12 5880 }
fb88bf2d 5881
df7492f9
KH
5882 if (CODING_REQUIRE_DETECTION (coding))
5883 detect_coding (coding);
5884 attrs = CODING_ID_ATTRS (coding->id);
5885
5886 if (! NILP (CODING_ATTR_POST_READ (attrs))
5887 || EQ (dst_object, Qt))
b73bfc1c 5888 {
df7492f9
KH
5889 coding->dst_object = make_conversion_work_buffer (1);
5890 coding->dst_pos = BEG;
5891 coding->dst_pos_byte = BEG_BYTE;
5892 coding->dst_multibyte = 1;
b73bfc1c 5893 }
df7492f9 5894 else if (BUFFERP (dst_object))
12410ef1 5895 {
df7492f9
KH
5896 coding->dst_object = dst_object;
5897 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
5898 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
5899 coding->dst_multibyte
5900 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
12410ef1 5901 }
72d1a715 5902 else
df7492f9
KH
5903 {
5904 coding->dst_object = Qnil;
5905 coding->dst_multibyte = 1;
5906 }
5907
5908 decode_coding (coding);
4ed46869 5909
df7492f9
KH
5910 if (BUFFERP (coding->dst_object))
5911 set_buffer_internal (XBUFFER (coding->dst_object));
ec6d2bb8 5912
df7492f9 5913 if (! NILP (CODING_ATTR_POST_READ (attrs)))
d46c5b12 5914 {
df7492f9
KH
5915 struct gcpro gcpro1, gcpro2;
5916 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
2b4f9037 5917 Lisp_Object val;
4ed46869 5918
c0cc7f7f 5919 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
df7492f9
KH
5920 GCPRO2 (coding->src_object, coding->dst_object);
5921 val = call1 (CODING_ATTR_POST_READ (attrs),
5922 make_number (coding->produced_char));
5923 UNGCPRO;
5924 CHECK_NATNUM (val);
5925 coding->produced_char += Z - prev_Z;
5926 coding->produced += Z_BYTE - prev_Z_BYTE;
d46c5b12 5927 }
4ed46869 5928
df7492f9 5929 if (EQ (dst_object, Qt))
ec6d2bb8 5930 {
df7492f9
KH
5931 coding->dst_object = Fbuffer_string ();
5932 }
5933 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
5934 {
5935 set_buffer_internal (XBUFFER (coding->dst_object));
5936 if (dst_bytes < coding->produced)
5937 {
5938 destination
5939 = (unsigned char *) xrealloc (destination, coding->produced);
5940 if (! destination)
5941 {
5942 coding->result = CODING_RESULT_INSUFFICIENT_DST;
5943 unbind_to (count, Qnil);
5944 return;
5945 }
5946 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
5947 move_gap_both (BEGV, BEGV_BYTE);
5948 bcopy (BEGV_ADDR, destination, coding->produced);
5949 coding->destination = destination;
5950 }
ec6d2bb8 5951 }
2b4f9037 5952
df7492f9 5953 unbind_to (count, Qnil);
d46c5b12
KH
5954}
5955
df7492f9
KH
5956
5957void
5958encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
5959 dst_object)
b73bfc1c 5960 struct coding_system *coding;
df7492f9
KH
5961 Lisp_Object src_object;
5962 EMACS_INT from, from_byte, to, to_byte;
5963 Lisp_Object dst_object;
b73bfc1c
KH
5964{
5965 int count = specpdl_ptr - specpdl;
df7492f9
KH
5966 EMACS_INT chars = to - from;
5967 EMACS_INT bytes = to_byte - from_byte;
5968 Lisp_Object attrs;
5969
5970 saved_coding = coding;
5971 record_unwind_protect (code_conversion_restore, save_excursion_save ());
5972
5973 coding->src_object = src_object;
5974 coding->src_chars = chars;
5975 coding->src_bytes = bytes;
5976 coding->src_multibyte = chars < bytes;
5977
5978 attrs = CODING_ID_ATTRS (coding->id);
5979
5980 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6bac5b12 5981 {
df7492f9 5982 Lisp_Object val;
b73bfc1c 5983
df7492f9
KH
5984 coding->src_object = make_conversion_work_buffer (coding->src_multibyte);
5985 set_buffer_internal (XBUFFER (coding->src_object));
5986 if (STRINGP (src_object))
5987 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
5988 else if (BUFFERP (src_object))
5989 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
5990 else
5991 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
5992
5993 if (EQ (src_object, dst_object))
5994 {
5995 set_buffer_internal (XBUFFER (src_object));
5996 del_range_both (from, from_byte, to, to_byte, 1);
5997 set_buffer_internal (XBUFFER (coding->src_object));
5998 }
5999
6000 val = call2 (CODING_ATTR_PRE_WRITE (attrs),
6001 make_number (1), make_number (chars));
6002 CHECK_NATNUM (val);
6003 if (BEG != GPT)
6004 move_gap_both (BEG, BEG_BYTE);
6005 coding->src_chars = Z - BEG;
6006 coding->src_bytes = Z_BYTE - BEG_BYTE;
6007 coding->src_pos = BEG;
6008 coding->src_pos_byte = BEG_BYTE;
6009 coding->src_multibyte = Z < Z_BYTE;
6010 }
6011 else if (STRINGP (src_object))
6012 {
6013 coding->src_pos = from;
6014 coding->src_pos_byte = from_byte;
6015 }
6016 else if (BUFFERP (src_object))
d46c5b12 6017 {
df7492f9
KH
6018 set_buffer_internal (XBUFFER (src_object));
6019 if (from != GPT)
6020 move_gap_both (from, from_byte);
6021 if (EQ (src_object, dst_object))
d46c5b12 6022 {
df7492f9
KH
6023 del_range_both (from, from_byte, to, to_byte, 1);
6024 coding->src_pos = -chars;
6025 coding->src_pos_byte = -bytes;
d46c5b12 6026 }
df7492f9 6027 else
d46c5b12 6028 {
df7492f9
KH
6029 coding->src_pos = from;
6030 coding->src_pos_byte = from_byte;
d46c5b12
KH
6031 }
6032 }
4ed46869 6033
df7492f9 6034 if (BUFFERP (dst_object))
d46c5b12 6035 {
df7492f9
KH
6036 coding->dst_object = dst_object;
6037 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6038 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6039 coding->dst_multibyte
6040 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
b73bfc1c 6041 }
df7492f9 6042 else if (EQ (dst_object, Qt))
4956c225 6043 {
df7492f9
KH
6044 coding->dst_object = Qnil;
6045 coding->destination = (unsigned char *) xmalloc (coding->src_chars);
6046 coding->dst_bytes = coding->src_chars;
6047 coding->dst_multibyte = 0;
4956c225 6048 }
df7492f9 6049 else
78108bcd 6050 {
df7492f9
KH
6051 coding->dst_object = Qnil;
6052 coding->dst_multibyte = 0;
78108bcd
KH
6053 }
6054
df7492f9 6055 encode_coding (coding);
4ed46869 6056
df7492f9 6057 if (EQ (dst_object, Qt))
4ed46869 6058 {
df7492f9
KH
6059 if (BUFFERP (coding->dst_object))
6060 coding->dst_object = Fbuffer_string ();
6061 else
73be902c 6062 {
df7492f9
KH
6063 coding->dst_object
6064 = make_unibyte_string ((char *) coding->destination,
6065 coding->produced);
6066 xfree (coding->destination);
73be902c 6067 }
4ed46869 6068 }
d46c5b12 6069
df7492f9 6070 unbind_to (count, Qnil);
b73bfc1c
KH
6071}
6072
df7492f9 6073
b73bfc1c 6074Lisp_Object
df7492f9 6075preferred_coding_system ()
b73bfc1c 6076{
df7492f9 6077 int id = coding_categories[coding_priorities[0]].id;
2391eaa4 6078
df7492f9 6079 return CODING_ID_NAME (id);
4ed46869
KH
6080}
6081
6082\f
6083#ifdef emacs
1397dc18 6084/*** 8. Emacs Lisp library functions ***/
4ed46869 6085
4ed46869 6086DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
48b0f3ae 6087 doc: /* Return t if OBJECT is nil or a coding-system.
df7492f9 6088See the documentation of `define-coding-system' for information
48b0f3ae
PJ
6089about coding-system objects. */)
6090 (obj)
4ed46869
KH
6091 Lisp_Object obj;
6092{
df7492f9 6093 return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
4ed46869
KH
6094}
6095
9d991de8
RS
6096DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6097 Sread_non_nil_coding_system, 1, 1, 0,
48b0f3ae
PJ
6098 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6099 (prompt)
4ed46869
KH
6100 Lisp_Object prompt;
6101{
e0e989f6 6102 Lisp_Object val;
9d991de8
RS
6103 do
6104 {
4608c386
KH
6105 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6106 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
9d991de8
RS
6107 }
6108 while (XSTRING (val)->size == 0);
e0e989f6 6109 return (Fintern (val, Qnil));
4ed46869
KH
6110}
6111
9b787f3e 6112DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
48b0f3ae
PJ
6113 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6114If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6115 (prompt, default_coding_system)
9b787f3e 6116 Lisp_Object prompt, default_coding_system;
4ed46869 6117{
f44d27ce 6118 Lisp_Object val;
9b787f3e
RS
6119 if (SYMBOLP (default_coding_system))
6120 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
4608c386 6121 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
9b787f3e
RS
6122 Qt, Qnil, Qcoding_system_history,
6123 default_coding_system, Qnil);
e0e989f6 6124 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
6125}
6126
6127DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6128 1, 1, 0,
48b0f3ae
PJ
6129 doc: /* Check validity of CODING-SYSTEM.
6130If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6131It is valid if it is a symbol with a non-nil `coding-system' property.
6132The value of property should be a vector of length 5. */)
df7492f9 6133 (coding_system)
4ed46869
KH
6134 Lisp_Object coding_system;
6135{
b7826503 6136 CHECK_SYMBOL (coding_system);
4ed46869
KH
6137 if (!NILP (Fcoding_system_p (coding_system)))
6138 return coding_system;
6139 while (1)
02ba4723 6140 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4ed46869 6141}
df7492f9 6142
3a73fa5d 6143\f
d46c5b12 6144Lisp_Object
df7492f9 6145detect_coding_system (src, src_bytes, highest, multibytep, coding_system)
d46c5b12
KH
6146 unsigned char *src;
6147 int src_bytes, highest;
0a28aafb 6148 int multibytep;
df7492f9 6149 Lisp_Object coding_system;
4ed46869 6150{
df7492f9
KH
6151 unsigned char *src_end = src + src_bytes;
6152 int mask = CATEGORY_MASK_ANY;
6153 int detected = 0;
6154 int c, i;
6155 Lisp_Object attrs, eol_type;
6156 Lisp_Object val;
6157 struct coding_system coding;
6158
6159 if (NILP (coding_system))
6160 coding_system = Qundecided;
6161 setup_coding_system (coding_system, &coding);
6162 attrs = CODING_ID_ATTRS (coding.id);
6163 eol_type = CODING_ID_EOL_TYPE (coding.id);
4ed46869 6164
df7492f9
KH
6165 coding.source = src;
6166 coding.src_bytes = src_bytes;
6167 coding.src_multibyte = multibytep;
6168 coding.consumed = 0;
4ed46869 6169
df7492f9 6170 if (XINT (CODING_ATTR_CATEGORY (attrs)) != coding_category_undecided)
4ed46869 6171 {
df7492f9 6172 mask = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
4ed46869 6173 }
df7492f9 6174 else
4ed46869 6175 {
df7492f9
KH
6176 coding_system = Qnil;
6177 for (; src < src_end; src++)
4ed46869 6178 {
df7492f9
KH
6179 c = *src;
6180 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
6181 || c == ISO_CODE_SI
6182 || c == ISO_CODE_SO)))
d46c5b12 6183 break;
4ed46869 6184 }
df7492f9
KH
6185 coding.head_ascii = src - coding.source;
6186
6187 if (src < src_end)
6188 for (i = 0; i < coding_category_raw_text; i++)
6189 {
6190 enum coding_category category = coding_priorities[i];
6191 struct coding_system *this = coding_categories + category;
6192
6193 if (category >= coding_category_raw_text
6194 || detected & (1 << category))
6195 continue;
6196
6197 if (this->id < 0)
6198 {
6199 /* No coding system of this category is defined. */
6200 mask &= ~(1 << category);
6201 }
6202 else
6203 {
6204 detected |= detected_mask[category];
6205 if ((*(coding_categories[category].detector)) (&coding, &mask)
6206 && highest)
6207 {
6208 mask &= detected_mask[category];
6209 break;
6210 }
6211 }
6212 }
4ed46869 6213 }
4ed46869 6214
df7492f9
KH
6215 if (!mask)
6216 val = Fcons (make_number (coding_category_raw_text), Qnil);
6217 else if (mask == CATEGORY_MASK_ANY)
6218 val = Fcons (make_number (coding_category_undecided), Qnil);
6219 else if (highest)
4ed46869 6220 {
df7492f9
KH
6221 for (i = 0; i < coding_category_raw_text; i++)
6222 if (mask & (1 << coding_priorities[i]))
6223 {
6224 val = Fcons (make_number (coding_priorities[i]), Qnil);
6225 break;
6226 }
6227 }
6228 else
6229 {
6230 val = Qnil;
6231 for (i = coding_category_raw_text - 1; i >= 0; i--)
6232 if (mask & (1 << coding_priorities[i]))
6233 val = Fcons (make_number (coding_priorities[i]), val);
4ed46869 6234 }
df7492f9
KH
6235
6236 {
6237 int one_byte_eol = -1, two_byte_eol = -1;
6238 Lisp_Object tail;
6239
6240 for (tail = val; CONSP (tail); tail = XCDR (tail))
6241 {
6242 struct coding_system *this
6243 = (NILP (coding_system) ? coding_categories + XINT (XCAR (tail))
6244 : &coding);
6245 int this_eol;
6246
6247 attrs = CODING_ID_ATTRS (this->id);
6248 eol_type = CODING_ID_EOL_TYPE (this->id);
6249 XSETCAR (tail, CODING_ID_NAME (this->id));
6250 if (VECTORP (eol_type))
6251 {
6252 if (EQ (CODING_ATTR_TYPE (attrs), Qutf_16))
6253 {
6254 if (two_byte_eol < 0)
6255 two_byte_eol = detect_eol (this, coding.source, src_bytes);
6256 this_eol = two_byte_eol;
6257 }
6258 else
6259 {
6260 if (one_byte_eol < 0)
6261 one_byte_eol =detect_eol (this, coding.source, src_bytes);
6262 this_eol = one_byte_eol;
6263 }
6264 if (this_eol == EOL_SEEN_LF)
6265 XSETCAR (tail, AREF (eol_type, 0));
6266 else if (this_eol == EOL_SEEN_CRLF)
6267 XSETCAR (tail, AREF (eol_type, 1));
6268 else if (this_eol == EOL_SEEN_CR)
6269 XSETCAR (tail, AREF (eol_type, 2));
6270 }
6271 }
6272 }
6273
03699b14 6274 return (highest ? XCAR (val) : val);
93dec019 6275}
4ed46869 6276
df7492f9 6277
d46c5b12
KH
6278DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6279 2, 3, 0,
48b0f3ae
PJ
6280 doc: /* Detect coding system of the text in the region between START and END.
6281Return a list of possible coding systems ordered by priority.
6282
6283If only ASCII characters are found, it returns a list of single element
6284`undecided' or its subsidiary coding system according to a detected
6285end-of-line format.
6286
6287If optional argument HIGHEST is non-nil, return the coding system of
6288highest priority. */)
6289 (start, end, highest)
d46c5b12
KH
6290 Lisp_Object start, end, highest;
6291{
6292 int from, to;
6293 int from_byte, to_byte;
6289dd10 6294
b7826503
PJ
6295 CHECK_NUMBER_COERCE_MARKER (start);
6296 CHECK_NUMBER_COERCE_MARKER (end);
4ed46869 6297
d46c5b12
KH
6298 validate_region (&start, &end);
6299 from = XINT (start), to = XINT (end);
6300 from_byte = CHAR_TO_BYTE (from);
6301 to_byte = CHAR_TO_BYTE (to);
6289dd10 6302
d46c5b12
KH
6303 if (from < GPT && to >= GPT)
6304 move_gap_both (to, to_byte);
c210f766 6305
d46c5b12 6306 return detect_coding_system (BYTE_POS_ADDR (from_byte),
df7492f9 6307 to_byte - from_byte,
0a28aafb
KH
6308 !NILP (highest),
6309 !NILP (current_buffer
df7492f9
KH
6310 ->enable_multibyte_characters),
6311 Qnil);
d46c5b12 6312}
6289dd10 6313
d46c5b12
KH
6314DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6315 1, 2, 0,
48b0f3ae
PJ
6316 doc: /* Detect coding system of the text in STRING.
6317Return a list of possible coding systems ordered by priority.
6318
6319If only ASCII characters are found, it returns a list of single element
6320`undecided' or its subsidiary coding system according to a detected
6321end-of-line format.
6322
6323If optional argument HIGHEST is non-nil, return the coding system of
6324highest priority. */)
6325 (string, highest)
d46c5b12
KH
6326 Lisp_Object string, highest;
6327{
b7826503 6328 CHECK_STRING (string);
4ed46869 6329
d46c5b12 6330 return detect_coding_system (XSTRING (string)->data,
df7492f9 6331 STRING_BYTES (XSTRING (string)),
0a28aafb 6332 !NILP (highest),
df7492f9
KH
6333 STRING_MULTIBYTE (string),
6334 Qnil);
4ed46869
KH
6335}
6336
05e6f5dc 6337
df7492f9
KH
6338static INLINE int
6339char_encodable_p (c, attrs)
6340 int c;
6341 Lisp_Object attrs;
05e6f5dc 6342{
df7492f9 6343 Lisp_Object tail;
df7492f9 6344 struct charset *charset;
05e6f5dc 6345
df7492f9
KH
6346 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
6347 CONSP (tail); tail = XCDR (tail))
05e6f5dc 6348 {
df7492f9
KH
6349 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
6350 if (CHAR_CHARSET_P (c, charset))
6351 break;
05e6f5dc 6352 }
df7492f9 6353 return (! NILP (tail));
05e6f5dc
KH
6354}
6355
6356
df7492f9
KH
6357/* Return a list of coding systems that safely encode the text between
6358 START and END. If EXCLUDE is non-nil, it is a list of coding
6359 systems not to check. The returned list doesn't contain any such
6360 coding systems. In any case, If the text contains only ASCII or is
6361 unibyte, return t. */
6362
6363DEFUN ("find-coding-systems-region-internal",
6364 Ffind_coding_systems_region_internal,
6365 Sfind_coding_systems_region_internal, 2, 3, 0,
6366 doc: /* Internal use only. */)
6367 (start, end, exclude)
6368 Lisp_Object start, end, exclude;
6369{
6370 Lisp_Object coding_attrs_list, safe_codings;
6371 EMACS_INT start_byte, end_byte;
6372 unsigned char *p, *pbeg, *pend;
6373 int c;
6374 Lisp_Object tail, elt;
05e6f5dc 6375
df7492f9
KH
6376 if (STRINGP (start))
6377 {
6378 if (!STRING_MULTIBYTE (start)
6379 && XSTRING (start)->size != STRING_BYTES (XSTRING (start)))
6380 return Qt;
6381 start_byte = 0;
6382 end_byte = STRING_BYTES (XSTRING (start));
6383 }
6384 else
6385 {
6386 CHECK_NUMBER_COERCE_MARKER (start);
6387 CHECK_NUMBER_COERCE_MARKER (end);
6388 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6389 args_out_of_range (start, end);
6390 if (NILP (current_buffer->enable_multibyte_characters))
6391 return Qt;
6392 start_byte = CHAR_TO_BYTE (XINT (start));
6393 end_byte = CHAR_TO_BYTE (XINT (end));
6394 if (XINT (end) - XINT (start) == end_byte - start_byte)
6395 return Qt;
05e6f5dc 6396
df7492f9
KH
6397 if (start < GPT && end > GPT)
6398 {
6399 if ((GPT - start) < (end - GPT))
6400 move_gap_both (start, start_byte);
6401 else
6402 move_gap_both (end, end_byte);
6403 }
6404 }
05e6f5dc 6405
df7492f9
KH
6406 coding_attrs_list = Qnil;
6407 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
6408 if (NILP (exclude)
6409 || NILP (Fmemq (XCAR (tail), exclude)))
6410 {
6411 Lisp_Object attrs;
05e6f5dc 6412
df7492f9
KH
6413 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
6414 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
6415 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6416 coding_attrs_list = Fcons (attrs, coding_attrs_list);
6417 }
6418
6419 if (STRINGP (start))
6420 p = pbeg = XSTRING (start)->data;
6421 else
6422 p = pbeg = BYTE_POS_ADDR (start_byte);
6423 pend = p + (end_byte - start_byte);
6424
6425 while (p < pend && ASCII_BYTE_P (*p)) p++;
6426 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
05e6f5dc
KH
6427
6428 while (p < pend)
6429 {
df7492f9
KH
6430 if (ASCII_BYTE_P (*p))
6431 p++;
6432 else
6433 {
6434 c = STRING_CHAR_ADVANCE (p);
6435
6436 charset_map_loaded = 0;
6437 for (tail = coding_attrs_list; CONSP (tail);)
6438 {
6439 elt = XCAR (tail);
6440 if (NILP (elt))
6441 tail = XCDR (tail);
6442 else if (char_encodable_p (c, elt))
6443 tail = XCDR (tail);
6444 else if (CONSP (XCDR (tail)))
6445 {
6446 XSETCAR (tail, XCAR (XCDR (tail)));
6447 XSETCDR (tail, XCDR (XCDR (tail)));
6448 }
6449 else
6450 {
6451 XSETCAR (tail, Qnil);
6452 tail = XCDR (tail);
6453 }
6454 }
6455 if (charset_map_loaded)
6456 {
6457 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
05e6f5dc 6458
df7492f9
KH
6459 if (STRINGP (start))
6460 pbeg = XSTRING (start)->data;
6461 else
6462 pbeg = BYTE_POS_ADDR (start_byte);
6463 p = pbeg + p_offset;
6464 pend = pbeg + pend_offset;
6465 }
6466 }
05e6f5dc 6467 }
df7492f9
KH
6468
6469 safe_codings = Qnil;
6470 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
6471 if (! NILP (XCAR (tail)))
6472 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
6473
05e6f5dc
KH
6474 return safe_codings;
6475}
6476
6477
df7492f9
KH
6478DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
6479 Scheck_coding_systems_region, 3, 3, 0,
6480 doc: /* Check if the region is encodable by coding systems.
05e6f5dc 6481
df7492f9
KH
6482START and END are buffer positions specifying the region.
6483CODING-SYSTEM-LIST is a list of coding systems to check.
6484
6485The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
6486CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
6487whole region, POS0, POS1, ... are buffer positions where non-encodable
6488characters are found.
6489
6490If all coding systems in CODING-SYSTEM-LIST can encode the region, the
6491value is nil.
6492
6493START may be a string. In that case, check if the string is
6494encodable, and the value contains indices to the string instead of
6495buffer positions. END is ignored. */)
6496 (start, end, coding_system_list)
6497 Lisp_Object start, end, coding_system_list;
05e6f5dc 6498{
df7492f9
KH
6499 Lisp_Object list;
6500 EMACS_INT start_byte, end_byte;
6501 int pos;
6502 unsigned char *p, *pbeg, *pend;
6503 int c;
6504 Lisp_Object tail, elt;
05e6f5dc
KH
6505
6506 if (STRINGP (start))
6507 {
df7492f9
KH
6508 if (!STRING_MULTIBYTE (start)
6509 && XSTRING (start)->size != STRING_BYTES (XSTRING (start)))
6510 return Qnil;
6511 start_byte = 0;
6512 end_byte = STRING_BYTES (XSTRING (start));
6513 pos = 0;
05e6f5dc
KH
6514 }
6515 else
6516 {
b7826503
PJ
6517 CHECK_NUMBER_COERCE_MARKER (start);
6518 CHECK_NUMBER_COERCE_MARKER (end);
05e6f5dc
KH
6519 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6520 args_out_of_range (start, end);
6521 if (NILP (current_buffer->enable_multibyte_characters))
df7492f9
KH
6522 return Qnil;
6523 start_byte = CHAR_TO_BYTE (XINT (start));
6524 end_byte = CHAR_TO_BYTE (XINT (end));
6525 if (XINT (end) - XINT (start) == end_byte - start_byte)
05e6f5dc 6526 return Qt;
df7492f9
KH
6527
6528 if (start < GPT && end > GPT)
6529 {
6530 if ((GPT - start) < (end - GPT))
6531 move_gap_both (start, start_byte);
6532 else
6533 move_gap_both (end, end_byte);
6534 }
6535 pos = start;
6536 }
6537
6538 list = Qnil;
6539 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
6540 {
6541 elt = XCAR (tail);
6542 list = Fcons (Fcons (elt, Fcons (AREF (CODING_SYSTEM_SPEC (elt), 0),
6543 Qnil)),
6544 list);
05e6f5dc
KH
6545 }
6546
df7492f9
KH
6547 if (STRINGP (start))
6548 p = pbeg = XSTRING (start)->data;
6549 else
6550 p = pbeg = BYTE_POS_ADDR (start_byte);
6551 pend = p + (end_byte - start_byte);
6552
6553 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
6554 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
6555
6556 while (p < pend)
05e6f5dc 6557 {
df7492f9
KH
6558 if (ASCII_BYTE_P (*p))
6559 p++;
6560 else
05e6f5dc 6561 {
df7492f9
KH
6562 c = STRING_CHAR_ADVANCE (p);
6563
6564 charset_map_loaded = 0;
6565 for (tail = list; CONSP (tail); tail = XCDR (tail))
6566 {
6567 elt = XCDR (XCAR (tail));
6568 if (! char_encodable_p (c, XCAR (elt)))
6569 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
6570 }
6571 if (charset_map_loaded)
6572 {
6573 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
6574
6575 if (STRINGP (start))
6576 pbeg = XSTRING (start)->data;
6577 else
6578 pbeg = BYTE_POS_ADDR (start_byte);
6579 p = pbeg + p_offset;
6580 pend = pbeg + pend_offset;
6581 }
05e6f5dc 6582 }
df7492f9 6583 pos++;
05e6f5dc
KH
6584 }
6585
df7492f9
KH
6586 tail = list;
6587 list = Qnil;
6588 for (; CONSP (tail); tail = XCDR (tail))
05e6f5dc 6589 {
df7492f9
KH
6590 elt = XCAR (tail);
6591 if (CONSP (XCDR (XCDR (elt))))
6592 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
6593 list);
05e6f5dc 6594 }
df7492f9
KH
6595
6596 return list;
05e6f5dc
KH
6597}
6598
6599
df7492f9 6600
4031e2bf 6601Lisp_Object
df7492f9
KH
6602code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
6603 Lisp_Object start, end, coding_system, dst_object;
6604 int encodep, norecord;
3a73fa5d
RS
6605{
6606 struct coding_system coding;
df7492f9
KH
6607 EMACS_INT from, from_byte, to, to_byte;
6608 Lisp_Object src_object;
3a73fa5d 6609
b7826503
PJ
6610 CHECK_NUMBER_COERCE_MARKER (start);
6611 CHECK_NUMBER_COERCE_MARKER (end);
df7492f9
KH
6612 if (NILP (coding_system))
6613 coding_system = Qno_conversion;
6614 else
6615 CHECK_CODING_SYSTEM (coding_system);
6616 src_object = Fcurrent_buffer ();
6617 if (NILP (dst_object))
6618 dst_object = src_object;
6619 else if (! EQ (dst_object, Qt))
6620 CHECK_BUFFER (dst_object);
3a73fa5d 6621
d46c5b12
KH
6622 validate_region (&start, &end);
6623 from = XFASTINT (start);
df7492f9 6624 from_byte = CHAR_TO_BYTE (from);
d46c5b12 6625 to = XFASTINT (end);
df7492f9 6626 to_byte = CHAR_TO_BYTE (to);
d46c5b12 6627
df7492f9
KH
6628 setup_coding_system (coding_system, &coding);
6629 coding.mode |= CODING_MODE_LAST_BLOCK;
6630
6631 if (encodep)
6632 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
6633 dst_object);
6634 else
6635 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
6636 dst_object);
6637 if (! norecord)
6638 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
d46c5b12 6639
df7492f9
KH
6640 if (coding.result != CODING_RESULT_SUCCESS)
6641 error ("Code conversion error: %d", coding.result);
3a73fa5d 6642
df7492f9
KH
6643 return (BUFFERP (dst_object)
6644 ? make_number (coding.produced_char)
6645 : coding.dst_object);
4031e2bf
KH
6646}
6647
df7492f9 6648
4031e2bf 6649DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
df7492f9 6650 3, 4, "r\nzCoding system: ",
48b0f3ae 6651 doc: /* Decode the current region from the specified coding system.
df7492f9
KH
6652When called from a program, takes four arguments:
6653 START, END, CODING-SYSTEM, and DESTINATION.
6654START and END are buffer positions.
6655
6656Optional 4th arguments DESTINATION specifies where the decoded text goes.
6657If nil, the region between START and END is replace by the decoded text.
6658If buffer, the decoded text is inserted in the buffer.
6659If t, the decoded text is returned.
6660
48b0f3ae
PJ
6661This function sets `last-coding-system-used' to the precise coding system
6662used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6663not fully specified.)
6664It returns the length of the decoded text. */)
df7492f9
KH
6665 (start, end, coding_system, destination)
6666 Lisp_Object start, end, coding_system, destination;
4031e2bf 6667{
df7492f9 6668 return code_convert_region (start, end, coding_system, destination, 0, 0);
3a73fa5d
RS
6669}
6670
6671DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
df7492f9
KH
6672 3, 4, "r\nzCoding system: ",
6673 doc: /* Encode the current region by specified coding system.
48b0f3ae
PJ
6674When called from a program, takes three arguments:
6675START, END, and CODING-SYSTEM. START and END are buffer positions.
df7492f9
KH
6676
6677Optional 4th arguments DESTINATION specifies where the encoded text goes.
6678If nil, the region between START and END is replace by the encoded text.
6679If buffer, the encoded text is inserted in the buffer.
6680If t, the encoded text is returned.
6681
48b0f3ae
PJ
6682This function sets `last-coding-system-used' to the precise coding system
6683used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6684not fully specified.)
6685It returns the length of the encoded text. */)
df7492f9
KH
6686 (start, end, coding_system, destination)
6687 Lisp_Object start, end, coding_system, destination;
3a73fa5d 6688{
df7492f9 6689 return code_convert_region (start, end, coding_system, destination, 1, 0);
4031e2bf 6690}
3a73fa5d 6691
4031e2bf 6692Lisp_Object
df7492f9
KH
6693code_convert_string (string, coding_system, dst_object,
6694 encodep, nocopy, norecord)
6695 Lisp_Object string, coding_system, dst_object;
6696 int encodep, nocopy, norecord;
4031e2bf
KH
6697{
6698 struct coding_system coding;
df7492f9 6699 EMACS_INT chars, bytes;
3a73fa5d 6700
b7826503 6701 CHECK_STRING (string);
d46c5b12 6702 if (NILP (coding_system))
df7492f9
KH
6703 {
6704 if (! norecord)
6705 Vlast_coding_system_used = Qno_conversion;
6706 if (NILP (dst_object))
6707 return (nocopy ? Fcopy_sequence (string) : string);
6708 }
4ed46869 6709
df7492f9
KH
6710 if (NILP (coding_system))
6711 coding_system = Qno_conversion;
6712 else
6713 CHECK_CODING_SYSTEM (coding_system);
6714 if (NILP (dst_object))
6715 dst_object = Qt;
6716 else if (! EQ (dst_object, Qt))
6717 CHECK_BUFFER (dst_object);
5f1cd180 6718
df7492f9 6719 setup_coding_system (coding_system, &coding);
d46c5b12 6720 coding.mode |= CODING_MODE_LAST_BLOCK;
df7492f9
KH
6721 chars = XSTRING (string)->size;
6722 bytes = STRING_BYTES (XSTRING (string));
6723 if (encodep)
6724 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
6725 else
6726 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
6727 if (! norecord)
6728 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
ec6d2bb8 6729
df7492f9
KH
6730 if (coding.result != CODING_RESULT_SUCCESS)
6731 error ("Code conversion error: %d", coding.result);
4ed46869 6732
df7492f9
KH
6733 return (BUFFERP (dst_object)
6734 ? make_number (coding.produced_char)
6735 : coding.dst_object);
4ed46869
KH
6736}
6737
4031e2bf 6738
ecec61c1 6739/* Encode or decode STRING according to CODING_SYSTEM.
ec6d2bb8
KH
6740 Do not set Vlast_coding_system_used.
6741
6742 This function is called only from macros DECODE_FILE and
6743 ENCODE_FILE, thus we ignore character composition. */
ecec61c1
KH
6744
6745Lisp_Object
6746code_convert_string_norecord (string, coding_system, encodep)
6747 Lisp_Object string, coding_system;
6748 int encodep;
6749{
0be8721c 6750 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
df7492f9 6751}
ecec61c1 6752
ecec61c1 6753
df7492f9
KH
6754DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
6755 2, 4, 0,
6756 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
6757
6758Optional third arg NOCOPY non-nil means it is OK to return STRING itself
6759if the decoding operation is trivial.
ecec61c1 6760
df7492f9
KH
6761Optional fourth arg BUFFER non-nil meant that the decoded text is
6762inserted in BUFFER instead of returned as a astring. In this case,
6763the return value is BUFFER.
ecec61c1 6764
df7492f9
KH
6765This function sets `last-coding-system-used' to the precise coding system
6766used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6767not fully specified. */)
6768 (string, coding_system, nocopy, buffer)
6769 Lisp_Object string, coding_system, nocopy, buffer;
6770{
6771 return code_convert_string (string, coding_system, buffer,
6772 0, ! NILP (nocopy), 0);
6773}
6774
6775DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
6776 2, 4, 0,
6777 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
6778
6779Optional third arg NOCOPY non-nil means it is OK to return STRING
6780itself if the encoding operation is trivial.
6781
6782Optional fourth arg BUFFER non-nil meant that the encoded text is
6783inserted in BUFFER instead of returned as a astring. In this case,
6784the return value is BUFFER.
6785
6786This function sets `last-coding-system-used' to the precise coding system
6787used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6788not fully specified.) */)
6789 (string, coding_system, nocopy, buffer)
6790 Lisp_Object string, coding_system, nocopy, buffer;
6791{
6792 return code_convert_string (string, coding_system, buffer,
6793 nocopy, ! NILP (nocopy), 1);
ecec61c1 6794}
df7492f9 6795
3a73fa5d 6796\f
4ed46869 6797DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
48b0f3ae
PJ
6798 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
6799Return the corresponding character. */)
6800 (code)
4ed46869
KH
6801 Lisp_Object code;
6802{
df7492f9
KH
6803 Lisp_Object spec, attrs, val;
6804 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
6805 int c;
6806
6807 CHECK_NATNUM (code);
6808 c = XFASTINT (code);
6809 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
6810 attrs = AREF (spec, 0);
4ed46869 6811
df7492f9
KH
6812 if (ASCII_BYTE_P (c)
6813 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
6814 return code;
6815
6816 val = CODING_ATTR_CHARSET_LIST (attrs);
6817 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
004068e4
KH
6818 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
6819 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
df7492f9
KH
6820
6821 if (c <= 0x7F)
6822 charset = charset_roman;
6823 else if (c >= 0xA0 && c < 0xDF)
55ab7be3 6824 {
df7492f9
KH
6825 charset = charset_kana;
6826 c -= 0x80;
55ab7be3
KH
6827 }
6828 else
6829 {
004068e4 6830 int s1 = c >> 8, s2 = c & 0xFF;
df7492f9
KH
6831
6832 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
6833 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
6834 error ("Invalid code: %d", code);
6835 SJIS_TO_JIS (c);
6836 charset = charset_kanji;
55ab7be3 6837 }
df7492f9
KH
6838 c = DECODE_CHAR (charset, c);
6839 if (c < 0)
6840 error ("Invalid code: %d", code);
6841 return make_number (c);
4ed46869
KH
6842}
6843
df7492f9 6844
4ed46869 6845DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
48b0f3ae
PJ
6846 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
6847Return the corresponding code in SJIS. */)
6848 (ch)
df7492f9 6849 Lisp_Object ch;
4ed46869 6850{
df7492f9
KH
6851 Lisp_Object spec, attrs, charset_list;
6852 int c;
6853 struct charset *charset;
6854 unsigned code;
4ed46869 6855
df7492f9
KH
6856 CHECK_CHARACTER (ch);
6857 c = XFASTINT (ch);
6858 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
6859 attrs = AREF (spec, 0);
6860
6861 if (ASCII_CHAR_P (c)
6862 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
6863 return ch;
6864
6865 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6866 charset = char_charset (c, charset_list, &code);
6867 if (code == CHARSET_INVALID_CODE (charset))
6868 error ("Can't encode by shift_jis encoding: %d", c);
6869 JIS_TO_SJIS (code);
6870
6871 return make_number (code);
4ed46869
KH
6872}
6873
6874DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
48b0f3ae
PJ
6875 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
6876Return the corresponding character. */)
6877 (code)
4ed46869
KH
6878 Lisp_Object code;
6879{
df7492f9
KH
6880 Lisp_Object spec, attrs, val;
6881 struct charset *charset_roman, *charset_big5, *charset;
6882 int c;
4ed46869 6883
df7492f9
KH
6884 CHECK_NATNUM (code);
6885 c = XFASTINT (code);
6886 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
6887 attrs = AREF (spec, 0);
6888
6889 if (ASCII_BYTE_P (c)
6890 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
6891 return code;
6892
6893 val = CODING_ATTR_CHARSET_LIST (attrs);
6894 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
6895 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
6896
6897 if (c <= 0x7F)
6898 charset = charset_roman;
c28a9453
KH
6899 else
6900 {
df7492f9
KH
6901 int b1 = c >> 8, b2 = c & 0x7F;
6902 if (b1 < 0xA1 || b1 > 0xFE
6903 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
6904 error ("Invalid code: %d", code);
6905 charset = charset_big5;
c28a9453 6906 }
df7492f9
KH
6907 c = DECODE_CHAR (charset, (unsigned )c);
6908 if (c < 0)
6909 error ("Invalid code: %d", code);
6910 return make_number (c);
4ed46869
KH
6911}
6912
6913DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
48b0f3ae
PJ
6914 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
6915Return the corresponding character code in Big5. */)
6916 (ch)
4ed46869
KH
6917 Lisp_Object ch;
6918{
df7492f9
KH
6919 Lisp_Object spec, attrs, charset_list;
6920 struct charset *charset;
6921 int c;
6922 unsigned code;
6923
6924 CHECK_CHARACTER (ch);
6925 c = XFASTINT (ch);
6926 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
6927 attrs = AREF (spec, 0);
6928 if (ASCII_CHAR_P (c)
6929 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
6930 return ch;
6931
6932 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6933 charset = char_charset (c, charset_list, &code);
6934 if (code == CHARSET_INVALID_CODE (charset))
6935 error ("Can't encode by Big5 encoding: %d", c);
6936
6937 return make_number (code);
4ed46869 6938}
df7492f9 6939
3a73fa5d 6940\f
1ba9e4ab
KH
6941DEFUN ("set-terminal-coding-system-internal",
6942 Fset_terminal_coding_system_internal,
48b0f3ae
PJ
6943 Sset_terminal_coding_system_internal, 1, 1, 0,
6944 doc: /* Internal use only. */)
6945 (coding_system)
b74e4686 6946 Lisp_Object coding_system;
4ed46869 6947{
b7826503 6948 CHECK_SYMBOL (coding_system);
df7492f9
KH
6949 setup_coding_system (Fcheck_coding_system (coding_system),
6950 &terminal_coding);
6951
70c22245 6952 /* We had better not send unsafe characters to terminal. */
df7492f9
KH
6953 terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
6954 /* Characer composition should be disabled. */
6955 terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
b73bfc1c
KH
6956 terminal_coding.src_multibyte = 1;
6957 terminal_coding.dst_multibyte = 0;
4ed46869
KH
6958 return Qnil;
6959}
6960
c4825358
KH
6961DEFUN ("set-safe-terminal-coding-system-internal",
6962 Fset_safe_terminal_coding_system_internal,
48b0f3ae 6963 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
ddb67bdc 6964 doc: /* Internal use only. */)
48b0f3ae 6965 (coding_system)
b74e4686 6966 Lisp_Object coding_system;
c4825358 6967{
b7826503 6968 CHECK_SYMBOL (coding_system);
c4825358
KH
6969 setup_coding_system (Fcheck_coding_system (coding_system),
6970 &safe_terminal_coding);
df7492f9
KH
6971 /* Characer composition should be disabled. */
6972 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
b73bfc1c
KH
6973 safe_terminal_coding.src_multibyte = 1;
6974 safe_terminal_coding.dst_multibyte = 0;
c4825358
KH
6975 return Qnil;
6976}
6977
4ed46869
KH
6978DEFUN ("terminal-coding-system",
6979 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
48b0f3ae
PJ
6980 doc: /* Return coding system specified for terminal output. */)
6981 ()
4ed46869 6982{
df7492f9 6983 return CODING_ID_NAME (terminal_coding.id);
4ed46869
KH
6984}
6985
1ba9e4ab
KH
6986DEFUN ("set-keyboard-coding-system-internal",
6987 Fset_keyboard_coding_system_internal,
48b0f3ae
PJ
6988 Sset_keyboard_coding_system_internal, 1, 1, 0,
6989 doc: /* Internal use only. */)
6990 (coding_system)
4ed46869
KH
6991 Lisp_Object coding_system;
6992{
b7826503 6993 CHECK_SYMBOL (coding_system);
df7492f9
KH
6994 setup_coding_system (Fcheck_coding_system (coding_system),
6995 &keyboard_coding);
6996 /* Characer composition should be disabled. */
6997 keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
4ed46869
KH
6998 return Qnil;
6999}
7000
7001DEFUN ("keyboard-coding-system",
7002 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
48b0f3ae
PJ
7003 doc: /* Return coding system specified for decoding keyboard input. */)
7004 ()
4ed46869 7005{
df7492f9 7006 return CODING_ID_NAME (keyboard_coding.id);
4ed46869
KH
7007}
7008
7009\f
a5d301df
KH
7010DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7011 Sfind_operation_coding_system, 1, MANY, 0,
48b0f3ae
PJ
7012 doc: /* Choose a coding system for an operation based on the target name.
7013The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7014DECODING-SYSTEM is the coding system to use for decoding
7015\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7016for encoding (in case OPERATION does encoding).
7017
7018The first argument OPERATION specifies an I/O primitive:
7019 For file I/O, `insert-file-contents' or `write-region'.
7020 For process I/O, `call-process', `call-process-region', or `start-process'.
7021 For network I/O, `open-network-stream'.
7022
7023The remaining arguments should be the same arguments that were passed
7024to the primitive. Depending on which primitive, one of those arguments
7025is selected as the TARGET. For example, if OPERATION does file I/O,
7026whichever argument specifies the file name is TARGET.
7027
7028TARGET has a meaning which depends on OPERATION:
7029 For file I/O, TARGET is a file name.
7030 For process I/O, TARGET is a process name.
7031 For network I/O, TARGET is a service name or a port number
7032
7033This function looks up what specified for TARGET in,
7034`file-coding-system-alist', `process-coding-system-alist',
7035or `network-coding-system-alist' depending on OPERATION.
7036They may specify a coding system, a cons of coding systems,
7037or a function symbol to call.
7038In the last case, we call the function with one argument,
7039which is a list of all the arguments given to this function.
7040
7041usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7042 (nargs, args)
4ed46869
KH
7043 int nargs;
7044 Lisp_Object *args;
7045{
7046 Lisp_Object operation, target_idx, target, val;
7047 register Lisp_Object chain;
7048
7049 if (nargs < 2)
7050 error ("Too few arguments");
7051 operation = args[0];
7052 if (!SYMBOLP (operation)
7053 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
df7492f9 7054 error ("Invalid first arguement");
4ed46869
KH
7055 if (nargs < 1 + XINT (target_idx))
7056 error ("Too few arguments for operation: %s",
7057 XSYMBOL (operation)->name->data);
7058 target = args[XINT (target_idx) + 1];
7059 if (!(STRINGP (target)
7060 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
df7492f9 7061 error ("Invalid %dth argument", XINT (target_idx) + 1);
4ed46869 7062
2e34157c
RS
7063 chain = ((EQ (operation, Qinsert_file_contents)
7064 || EQ (operation, Qwrite_region))
02ba4723 7065 ? Vfile_coding_system_alist
2e34157c 7066 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
7067 ? Vnetwork_coding_system_alist
7068 : Vprocess_coding_system_alist));
4ed46869
KH
7069 if (NILP (chain))
7070 return Qnil;
7071
03699b14 7072 for (; CONSP (chain); chain = XCDR (chain))
4ed46869 7073 {
f44d27ce 7074 Lisp_Object elt;
4ed46869 7075
df7492f9 7076 elt = XCAR (chain);
4ed46869
KH
7077 if (CONSP (elt)
7078 && ((STRINGP (target)
03699b14
KR
7079 && STRINGP (XCAR (elt))
7080 && fast_string_match (XCAR (elt), target) >= 0)
7081 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
02ba4723 7082 {
03699b14 7083 val = XCDR (elt);
b19fd4c5
KH
7084 /* Here, if VAL is both a valid coding system and a valid
7085 function symbol, we return VAL as a coding system. */
02ba4723
KH
7086 if (CONSP (val))
7087 return val;
7088 if (! SYMBOLP (val))
7089 return Qnil;
7090 if (! NILP (Fcoding_system_p (val)))
7091 return Fcons (val, val);
b19fd4c5
KH
7092 if (! NILP (Ffboundp (val)))
7093 {
7094 val = call1 (val, Flist (nargs, args));
7095 if (CONSP (val))
7096 return val;
7097 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7098 return Fcons (val, val);
7099 }
02ba4723
KH
7100 return Qnil;
7101 }
4ed46869
KH
7102 }
7103 return Qnil;
7104}
7105
df7492f9
KH
7106DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
7107 Sset_coding_system_priority, 1, MANY, 0,
7108 doc: /* Put higher priority to coding systems of the arguments. */)
7109 (nargs, args)
7110 int nargs;
7111 Lisp_Object *args;
7112{
7113 int i, j;
7114 int changed[coding_category_max];
7115 enum coding_category priorities[coding_category_max];
7116
7117 bzero (changed, sizeof changed);
7118
7119 for (i = j = 0; i < nargs; i++)
7120 {
7121 enum coding_category category;
7122 Lisp_Object spec, attrs;
7123
7124 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
7125 attrs = AREF (spec, 0);
7126 category = XINT (CODING_ATTR_CATEGORY (attrs));
7127 if (changed[category])
7128 /* Ignore this coding system because a coding system of the
7129 same category already had a higher priority. */
7130 continue;
7131 changed[category] = 1;
7132 priorities[j++] = category;
7133 if (coding_categories[category].id >= 0
7134 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
7135 setup_coding_system (args[i], &coding_categories[category]);
7136 }
7137
7138 /* Now we have decided top J priorities. Reflect the order of the
7139 original priorities to the remaining priorities. */
7140
7141 for (i = j, j = 0; i < coding_category_max; i++, j++)
7142 {
7143 while (j < coding_category_max
7144 && changed[coding_priorities[j]])
7145 j++;
7146 if (j == coding_category_max)
7147 abort ();
7148 priorities[i] = coding_priorities[j];
7149 }
7150
7151 bcopy (priorities, coding_priorities, sizeof priorities);
7152 return Qnil;
7153}
7154
7155DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
7156 Scoding_system_priority_list, 0, 1, 0,
7157 doc: /* Return a list of coding systems ordered by their priorities. */)
7158 (highestp)
7159 Lisp_Object highestp;
d46c5b12
KH
7160{
7161 int i;
df7492f9 7162 Lisp_Object val;
d46c5b12 7163
df7492f9 7164 for (i = 0, val = Qnil; i < coding_category_max; i++)
d46c5b12 7165 {
df7492f9
KH
7166 enum coding_category category = coding_priorities[i];
7167 int id = coding_categories[category].id;
7168 Lisp_Object attrs;
7169
7170 if (id < 0)
7171 continue;
7172 attrs = CODING_ID_ATTRS (id);
7173 if (! NILP (highestp))
7174 return CODING_ATTR_BASE_NAME (attrs);
7175 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
7176 }
7177 return Fnreverse (val);
7178}
7179
7180static Lisp_Object
7181make_subsidiaries (base)
7182 Lisp_Object base;
7183{
7184 Lisp_Object subsidiaries;
7185 char *suffixes[] = { "-unix", "-dos", "-mac" };
7186 int base_name_len = STRING_BYTES (XSYMBOL (base)->name);
7187 char *buf = (char *) alloca (base_name_len + 6);
7188 int i;
7189
7190 bcopy (XSYMBOL (base)->name->data, buf, base_name_len);
7191 subsidiaries = Fmake_vector (make_number (3), Qnil);
7192 for (i = 0; i < 3; i++)
7193 {
7194 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
7195 ASET (subsidiaries, i, intern (buf));
7196 }
7197 return subsidiaries;
7198}
7199
7200
7201DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7202 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
7203 doc: /* For internal use only. */)
7204 (nargs, args)
7205 int nargs;
7206 Lisp_Object *args;
7207{
7208 Lisp_Object name;
7209 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
7210 Lisp_Object attrs; /* Vector of attributes. */
7211 Lisp_Object eol_type;
7212 Lisp_Object aliases;
7213 Lisp_Object coding_type, charset_list, safe_charsets;
7214 enum coding_category category;
7215 Lisp_Object tail, val;
7216 int max_charset_id = 0;
7217 int i;
7218
7219 if (nargs < coding_arg_max)
7220 goto short_args;
7221
7222 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
7223
7224 name = args[coding_arg_name];
7225 CHECK_SYMBOL (name);
7226 CODING_ATTR_BASE_NAME (attrs) = name;
7227
7228 val = args[coding_arg_mnemonic];
7229 if (! STRINGP (val))
7230 CHECK_CHARACTER (val);
7231 CODING_ATTR_MNEMONIC (attrs) = val;
7232
7233 coding_type = args[coding_arg_coding_type];
7234 CHECK_SYMBOL (coding_type);
7235 CODING_ATTR_TYPE (attrs) = coding_type;
7236
7237 charset_list = args[coding_arg_charset_list];
7238 if (SYMBOLP (charset_list))
7239 {
7240 if (EQ (charset_list, Qiso_2022))
7241 {
7242 if (! EQ (coding_type, Qiso_2022))
7243 error ("Invalid charset-list");
7244 charset_list = Viso_2022_charset_list;
7245 }
7246 else if (EQ (charset_list, Qemacs_mule))
7247 {
7248 if (! EQ (coding_type, Qemacs_mule))
7249 error ("Invalid charset-list");
7250 charset_list = Vemacs_mule_charset_list;
7251 }
7252 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7253 if (max_charset_id < XFASTINT (XCAR (tail)))
7254 max_charset_id = XFASTINT (XCAR (tail));
7255 }
7256 else
7257 {
7258 charset_list = Fcopy_sequence (charset_list);
7259 for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
7260 {
7261 struct charset *charset;
7262
7263 val = Fcar (tail);
7264 CHECK_CHARSET_GET_CHARSET (val, charset);
7265 if (EQ (coding_type, Qiso_2022)
7266 ? CHARSET_ISO_FINAL (charset) < 0
7267 : EQ (coding_type, Qemacs_mule)
7268 ? CHARSET_EMACS_MULE_ID (charset) < 0
7269 : 0)
7270 error ("Can't handle charset `%s'",
7271 XSYMBOL (CHARSET_NAME (charset))->name->data);
7272
7273 XCAR (tail) = make_number (charset->id);
7274 if (max_charset_id < charset->id)
7275 max_charset_id = charset->id;
7276 }
7277 }
7278 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
7279
7280 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
7281 make_number (255));
7282 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7283 XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0;
7284 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
7285
7286 val = args[coding_arg_decode_translation_table];
7287 if (! NILP (val))
7288 CHECK_CHAR_TABLE (val);
7289 CODING_ATTR_DECODE_TBL (attrs) = val;
7290
7291 val = args[coding_arg_encode_translation_table];
7292 if (! NILP (val))
7293 CHECK_CHAR_TABLE (val);
7294 CODING_ATTR_ENCODE_TBL (attrs) = val;
7295
7296 val = args[coding_arg_post_read_conversion];
7297 CHECK_SYMBOL (val);
7298 CODING_ATTR_POST_READ (attrs) = val;
7299
7300 val = args[coding_arg_pre_write_conversion];
7301 CHECK_SYMBOL (val);
7302 CODING_ATTR_PRE_WRITE (attrs) = val;
7303
7304 val = args[coding_arg_default_char];
7305 if (NILP (val))
7306 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
7307 else
7308 {
7309 CHECK_CHARACTER (val);
7310 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
7311 }
7312
7313 val = args[coding_arg_plist];
7314 CHECK_LIST (val);
7315 CODING_ATTR_PLIST (attrs) = val;
7316
7317 if (EQ (coding_type, Qcharset))
7318 {
7319 val = Fmake_vector (make_number (256), Qnil);
7320
7321 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7322 {
7323 struct charset *charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
15d143f7 7324 int idx = (CHARSET_DIMENSION (charset) - 1) * 4;
df7492f9 7325
15d143f7
KH
7326 for (i = charset->code_space[idx];
7327 i <= charset->code_space[idx + 1]; i++)
7328 {
7329 if (NILP (AREF (val, i)))
7330 ASET (val, i, XCAR (tail));
7331 else
7332 error ("Charsets conflicts in the first byte");
7333 }
df7492f9
KH
7334 }
7335 ASET (attrs, coding_attr_charset_valids, val);
7336 category = coding_category_charset;
7337 }
7338 else if (EQ (coding_type, Qccl))
7339 {
7340 Lisp_Object valids;
7341
7342 if (nargs < coding_arg_ccl_max)
7343 goto short_args;
7344
7345 val = args[coding_arg_ccl_decoder];
7346 CHECK_CCL_PROGRAM (val);
7347 if (VECTORP (val))
7348 val = Fcopy_sequence (val);
7349 ASET (attrs, coding_attr_ccl_decoder, val);
7350
7351 val = args[coding_arg_ccl_encoder];
7352 CHECK_CCL_PROGRAM (val);
7353 if (VECTORP (val))
7354 val = Fcopy_sequence (val);
7355 ASET (attrs, coding_attr_ccl_encoder, val);
7356
7357 val = args[coding_arg_ccl_valids];
7358 valids = Fmake_string (make_number (256), make_number (0));
7359 for (tail = val; !NILP (tail); tail = Fcdr (tail))
7360 {
7361 val = Fcar (tail);
7362 if (INTEGERP (val))
7363 ASET (valids, XINT (val), 1);
7364 else
7365 {
7366 int from, to;
7367
7368 CHECK_CONS (val);
7369 CHECK_NUMBER (XCAR (val));
7370 CHECK_NUMBER (XCDR (val));
7371 from = XINT (XCAR (val));
7372 to = XINT (XCDR (val));
7373 for (i = from; i <= to; i++)
7374 ASET (valids, i, 1);
7375 }
7376 }
7377 ASET (attrs, coding_attr_ccl_valids, valids);
7378
7379 category = coding_category_ccl;
7380 }
7381 else if (EQ (coding_type, Qutf_16))
7382 {
7383 Lisp_Object bom, endian;
7384
7385 if (nargs < coding_arg_utf16_max)
7386 goto short_args;
7387
7388 bom = args[coding_arg_utf16_bom];
7389 if (! NILP (bom) && ! EQ (bom, Qt))
7390 {
7391 CHECK_CONS (bom);
7392 CHECK_CODING_SYSTEM (XCAR (bom));
7393 CHECK_CODING_SYSTEM (XCDR (bom));
7394 }
7395 ASET (attrs, coding_attr_utf_16_bom, bom);
7396
7397 endian = args[coding_arg_utf16_endian];
7398 ASET (attrs, coding_attr_utf_16_endian, endian);
7399
7400 category = (CONSP (bom)
7401 ? coding_category_utf_16_auto
7402 : NILP (bom)
7403 ? (NILP (endian)
7404 ? coding_category_utf_16_be_nosig
7405 : coding_category_utf_16_le_nosig)
7406 : (NILP (endian)
7407 ? coding_category_utf_16_be
7408 : coding_category_utf_16_le));
7409 }
7410 else if (EQ (coding_type, Qiso_2022))
7411 {
7412 Lisp_Object initial, reg_usage, request, flags;
0be8721c 7413 int i, id;
1397dc18 7414
df7492f9
KH
7415 if (nargs < coding_arg_iso2022_max)
7416 goto short_args;
7417
7418 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
7419 CHECK_VECTOR (initial);
7420 for (i = 0; i < 4; i++)
7421 {
7422 val = Faref (initial, make_number (i));
7423 if (! NILP (val))
7424 {
7425 CHECK_CHARSET_GET_ID (val, id);
7426 ASET (initial, i, make_number (id));
7427 }
7428 else
7429 ASET (initial, i, make_number (-1));
7430 }
7431
7432 reg_usage = args[coding_arg_iso2022_reg_usage];
7433 CHECK_CONS (reg_usage);
7434 CHECK_NATNUM (XCAR (reg_usage));
7435 CHECK_NATNUM (XCDR (reg_usage));
7436
7437 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
7438 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
1397dc18 7439 {
df7492f9
KH
7440 int id;
7441
7442 val = Fcar (tail);
7443 CHECK_CONS (val);
7444 CHECK_CHARSET_GET_ID (XCAR (val), id);
7445 CHECK_NATNUM (XCDR (val));
7446 if (XINT (XCDR (val)) >= 4)
7447 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
7448 XCAR (val) = make_number (id);
1397dc18 7449 }
df7492f9
KH
7450
7451 flags = args[coding_arg_iso2022_flags];
7452 CHECK_NATNUM (flags);
7453 i = XINT (flags);
7454 if (EQ (args[coding_arg_charset_list], Qiso_2022))
7455 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
7456
7457 ASET (attrs, coding_attr_iso_initial, initial);
7458 ASET (attrs, coding_attr_iso_usage, reg_usage);
7459 ASET (attrs, coding_attr_iso_request, request);
7460 ASET (attrs, coding_attr_iso_flags, flags);
7461 setup_iso_safe_charsets (attrs);
7462
7463 if (i & CODING_ISO_FLAG_SEVEN_BITS)
7464 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
7465 | CODING_ISO_FLAG_SINGLE_SHIFT))
7466 ? coding_category_iso_7_else
7467 : EQ (args[coding_arg_charset_list], Qiso_2022)
7468 ? coding_category_iso_7
7469 : coding_category_iso_7_tight);
7470 else
7471 {
7472 int id = XINT (AREF (initial, 1));
7473
7474 category = (((i & (CODING_ISO_FLAG_LOCKING_SHIFT
7475 | CODING_ISO_FLAG_SINGLE_SHIFT))
7476 || EQ (args[coding_arg_charset_list], Qiso_2022)
7477 || id < 0)
7478 ? coding_category_iso_8_else
7479 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
7480 ? coding_category_iso_8_1
7481 : coding_category_iso_8_2);
7482 }
7483 }
7484 else if (EQ (coding_type, Qemacs_mule))
7485 {
7486 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
7487 ASET (attrs, coding_attr_emacs_mule_full, Qt);
7488
7489 category = coding_category_emacs_mule;
7490 }
7491 else if (EQ (coding_type, Qshift_jis))
7492 {
7493
7494 struct charset *charset;
7495
7496 if (XINT (Flength (charset_list)) != 3)
7497 error ("There should be just three charsets");
7498
7499 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7500 if (CHARSET_DIMENSION (charset) != 1)
7501 error ("Dimension of charset %s is not one",
7502 XSYMBOL (CHARSET_NAME (charset))->name->data);
7503
7504 charset_list = XCDR (charset_list);
7505 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7506 if (CHARSET_DIMENSION (charset) != 1)
7507 error ("Dimension of charset %s is not one",
7508 XSYMBOL (CHARSET_NAME (charset))->name->data);
7509
7510 charset_list = XCDR (charset_list);
7511 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7512 if (CHARSET_DIMENSION (charset) != 2)
7513 error ("Dimension of charset %s is not two",
7514 XSYMBOL (CHARSET_NAME (charset))->name->data);
7515
7516 category = coding_category_sjis;
7517 Vsjis_coding_system = name;
7518 }
7519 else if (EQ (coding_type, Qbig5))
7520 {
7521 struct charset *charset;
7522
7523 if (XINT (Flength (charset_list)) != 2)
7524 error ("There should be just two charsets");
7525
7526 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7527 if (CHARSET_DIMENSION (charset) != 1)
7528 error ("Dimension of charset %s is not one",
7529 XSYMBOL (CHARSET_NAME (charset))->name->data);
7530
7531 charset_list = XCDR (charset_list);
7532 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7533 if (CHARSET_DIMENSION (charset) != 2)
7534 error ("Dimension of charset %s is not two",
7535 XSYMBOL (CHARSET_NAME (charset))->name->data);
7536
7537 category = coding_category_big5;
7538 Vbig5_coding_system = name;
7539 }
7540 else if (EQ (coding_type, Qraw_text))
7541 category = coding_category_raw_text;
7542 else if (EQ (coding_type, Qutf_8))
7543 category = coding_category_utf_8;
7544 else if (EQ (coding_type, Qundecided))
7545 category = coding_category_undecided;
7546 else
7547 error ("Invalid coding system type: %s",
7548 XSYMBOL (coding_type)->name->data);
7549
7550 CODING_ATTR_CATEGORY (attrs) = make_number (category);
7551
7552 eol_type = args[coding_arg_eol_type];
7553 if (! NILP (eol_type)
7554 && ! EQ (eol_type, Qunix)
7555 && ! EQ (eol_type, Qdos)
7556 && ! EQ (eol_type, Qmac))
7557 error ("Invalid eol-type");
7558
7559 aliases = Fcons (name, Qnil);
7560
7561 if (NILP (eol_type))
7562 {
7563 eol_type = make_subsidiaries (name);
7564 for (i = 0; i < 3; i++)
1397dc18 7565 {
df7492f9
KH
7566 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
7567
7568 this_name = AREF (eol_type, i);
7569 this_aliases = Fcons (this_name, Qnil);
7570 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
7571 this_spec = Fmake_vector (make_number (3), attrs);
7572 ASET (this_spec, 1, this_aliases);
7573 ASET (this_spec, 2, this_eol_type);
7574 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
7575 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
7576 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
7577 Vcoding_system_alist);
1397dc18 7578 }
d46c5b12 7579 }
1397dc18 7580
df7492f9
KH
7581 spec_vec = Fmake_vector (make_number (3), attrs);
7582 ASET (spec_vec, 1, aliases);
7583 ASET (spec_vec, 2, eol_type);
7584
7585 Fputhash (name, spec_vec, Vcoding_system_hash_table);
7586 Vcoding_system_list = Fcons (name, Vcoding_system_list);
7587 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
7588 Vcoding_system_alist);
7589
7590 {
7591 int id = coding_categories[category].id;
7592
7593 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
7594 setup_coding_system (name, &coding_categories[category]);
7595 }
7596
d46c5b12 7597 return Qnil;
df7492f9
KH
7598
7599 short_args:
7600 return Fsignal (Qwrong_number_of_arguments,
7601 Fcons (intern ("define-coding-system-internal"),
7602 make_number (nargs)));
d46c5b12
KH
7603}
7604
df7492f9
KH
7605DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
7606 Sdefine_coding_system_alias, 2, 2, 0,
7607 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
7608 (alias, coding_system)
7609 Lisp_Object alias, coding_system;
66cfb530 7610{
df7492f9 7611 Lisp_Object spec, aliases, eol_type;
84d60297 7612
df7492f9
KH
7613 CHECK_SYMBOL (alias);
7614 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7615 aliases = AREF (spec, 1);
7616 while (!NILP (XCDR (aliases)))
7617 aliases = XCDR (aliases);
7618 XCDR (aliases) = Fcons (alias, Qnil);
66cfb530 7619
df7492f9
KH
7620 eol_type = AREF (spec, 2);
7621 if (VECTORP (eol_type))
66cfb530 7622 {
df7492f9
KH
7623 Lisp_Object subsidiaries;
7624 int i;
7625
7626 subsidiaries = make_subsidiaries (alias);
7627 for (i = 0; i < 3; i++)
7628 Fdefine_coding_system_alias (AREF (subsidiaries, i),
7629 AREF (eol_type, i));
7630
7631 ASET (spec, 2, subsidiaries);
66cfb530 7632 }
df7492f9
KH
7633
7634 Fputhash (alias, spec, Vcoding_system_hash_table);
5bad0796
DL
7635 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
7636 Vcoding_system_alist);
66cfb530
KH
7637
7638 return Qnil;
7639}
7640
df7492f9
KH
7641DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
7642 1, 1, 0,
7643 doc: /* Return the base of CODING-SYSTEM.
7644Any alias or subsidiary coding systems are not base coding system. */)
7645 (coding_system)
7646 Lisp_Object coding_system;
7647{
7648 Lisp_Object spec, attrs;
7649
7650 if (NILP (coding_system))
7651 return (Qno_conversion);
7652 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7653 attrs = AREF (spec, 0);
7654 return CODING_ATTR_BASE_NAME (attrs);
7655}
7656
7657DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
7658 1, 1, 0,
7659 doc: "Return the property list of CODING-SYSTEM.")
7660 (coding_system)
7661 Lisp_Object coding_system;
7662{
7663 Lisp_Object spec, attrs;
7664
7665 if (NILP (coding_system))
7666 coding_system = Qno_conversion;
7667 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7668 attrs = AREF (spec, 0);
7669 return CODING_ATTR_PLIST (attrs);
7670}
7671
7672
7673DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
7674 1, 1, 0,
7675 doc: /* Return the list of aliases of CODING-SYSTEM.
7676A base coding system is what made by `define-coding-system'.
7677Any alias nor subsidiary coding systems are not base coding system. */)
7678 (coding_system)
7679 Lisp_Object coding_system;
7680{
7681 Lisp_Object spec;
7682
7683 if (NILP (coding_system))
7684 coding_system = Qno_conversion;
7685 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7686 return AREF (spec, 2);
7687}
7688
7689DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
7690 Scoding_system_eol_type, 1, 1, 0,
7691 doc: /* Return eol-type of CODING-SYSTEM.
7692An eol-type is integer 0, 1, 2, or a vector of coding systems.
7693
7694Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
7695and CR respectively.
7696
7697A vector value indicates that a format of end-of-line should be
7698detected automatically. Nth element of the vector is the subsidiary
7699coding system whose eol-type is N. */)
7700 (coding_system)
7701 Lisp_Object coding_system;
7702{
7703 Lisp_Object spec, eol_type;
7704 int n;
7705
7706 if (NILP (coding_system))
7707 coding_system = Qno_conversion;
7708 if (! CODING_SYSTEM_P (coding_system))
7709 return Qnil;
7710 spec = CODING_SYSTEM_SPEC (coding_system);
7711 eol_type = AREF (spec, 2);
7712 if (VECTORP (eol_type))
7713 return Fcopy_sequence (eol_type);
7714 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
7715 return make_number (n);
7716}
7717
4ed46869
KH
7718#endif /* emacs */
7719
7720\f
1397dc18 7721/*** 9. Post-amble ***/
4ed46869 7722
dfcf069d 7723void
4ed46869
KH
7724init_coding_once ()
7725{
7726 int i;
7727
df7492f9
KH
7728 for (i = 0; i < coding_category_max; i++)
7729 {
7730 coding_categories[i].id = -1;
7731 coding_priorities[i] = i;
7732 }
4ed46869
KH
7733
7734 /* ISO2022 specific initialize routine. */
7735 for (i = 0; i < 0x20; i++)
b73bfc1c 7736 iso_code_class[i] = ISO_control_0;
4ed46869
KH
7737 for (i = 0x21; i < 0x7F; i++)
7738 iso_code_class[i] = ISO_graphic_plane_0;
7739 for (i = 0x80; i < 0xA0; i++)
b73bfc1c 7740 iso_code_class[i] = ISO_control_1;
4ed46869
KH
7741 for (i = 0xA1; i < 0xFF; i++)
7742 iso_code_class[i] = ISO_graphic_plane_1;
7743 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7744 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7745 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7746 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7747 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7748 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7749 iso_code_class[ISO_CODE_ESC] = ISO_escape;
7750 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7751 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7752 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7753
b843d1ae 7754 inhibit_pre_post_conversion = 0;
df7492f9
KH
7755
7756 for (i = 0; i < 256; i++)
7757 {
7758 emacs_mule_bytes[i] = 1;
7759 }
e0e989f6
KH
7760}
7761
7762#ifdef emacs
7763
dfcf069d 7764void
e0e989f6
KH
7765syms_of_coding ()
7766{
df7492f9
KH
7767 staticpro (&Vcoding_system_hash_table);
7768 Vcoding_system_hash_table = Fmakehash (Qeq);
7769
7770 staticpro (&Vsjis_coding_system);
7771 Vsjis_coding_system = Qnil;
7772
7773 staticpro (&Vbig5_coding_system);
7774 Vbig5_coding_system = Qnil;
7775
7776 staticpro (&Vcode_conversion_work_buf_list);
7777 Vcode_conversion_work_buf_list = Qnil;
e0e989f6 7778
df7492f9
KH
7779 staticpro (&Vcode_conversion_reused_work_buf);
7780 Vcode_conversion_reused_work_buf = Qnil;
7781
7782 DEFSYM (Qcharset, "charset");
7783 DEFSYM (Qtarget_idx, "target-idx");
7784 DEFSYM (Qcoding_system_history, "coding-system-history");
bb0115a2
RS
7785 Fset (Qcoding_system_history, Qnil);
7786
9ce27fde 7787 /* Target FILENAME is the first argument. */
e0e989f6 7788 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 7789 /* Target FILENAME is the third argument. */
e0e989f6
KH
7790 Fput (Qwrite_region, Qtarget_idx, make_number (2));
7791
df7492f9 7792 DEFSYM (Qcall_process, "call-process");
9ce27fde 7793 /* Target PROGRAM is the first argument. */
e0e989f6
KH
7794 Fput (Qcall_process, Qtarget_idx, make_number (0));
7795
df7492f9 7796 DEFSYM (Qcall_process_region, "call-process-region");
9ce27fde 7797 /* Target PROGRAM is the third argument. */
e0e989f6
KH
7798 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7799
df7492f9 7800 DEFSYM (Qstart_process, "start-process");
9ce27fde 7801 /* Target PROGRAM is the third argument. */
e0e989f6
KH
7802 Fput (Qstart_process, Qtarget_idx, make_number (2));
7803
df7492f9 7804 DEFSYM (Qopen_network_stream, "open-network-stream");
9ce27fde 7805 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
7806 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7807
df7492f9
KH
7808 DEFSYM (Qcoding_system, "coding-system");
7809 DEFSYM (Qcoding_aliases, "coding-aliases");
4ed46869 7810
df7492f9
KH
7811 DEFSYM (Qeol_type, "eol-type");
7812 DEFSYM (Qunix, "unix");
7813 DEFSYM (Qdos, "dos");
7814 DEFSYM (Qmac, "mac");
4ed46869 7815
df7492f9
KH
7816 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
7817 DEFSYM (Qpost_read_conversion, "post-read-conversion");
7818 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
7819 DEFSYM (Qdefault_char, "default-char");
7820 DEFSYM (Qundecided, "undecided");
7821 DEFSYM (Qno_conversion, "no-conversion");
7822 DEFSYM (Qraw_text, "raw-text");
4ed46869 7823
df7492f9 7824 DEFSYM (Qiso_2022, "iso-2022");
4ed46869 7825
df7492f9 7826 DEFSYM (Qutf_8, "utf-8");
27901516 7827
df7492f9
KH
7828 DEFSYM (Qutf_16, "utf-16");
7829 DEFSYM (Qutf_16_be, "utf-16-be");
7830 DEFSYM (Qutf_16_be_nosig, "utf-16-be-nosig");
7831 DEFSYM (Qutf_16_le, "utf-16-l3");
7832 DEFSYM (Qutf_16_le_nosig, "utf-16-le-nosig");
7833 DEFSYM (Qsignature, "signature");
7834 DEFSYM (Qendian, "endian");
7835 DEFSYM (Qbig, "big");
7836 DEFSYM (Qlittle, "little");
27901516 7837
df7492f9
KH
7838 DEFSYM (Qshift_jis, "shift-jis");
7839 DEFSYM (Qbig5, "big5");
4ed46869 7840
df7492f9 7841 DEFSYM (Qcoding_system_p, "coding-system-p");
4ed46869 7842
df7492f9 7843 DEFSYM (Qcoding_system_error, "coding-system-error");
4ed46869
KH
7844 Fput (Qcoding_system_error, Qerror_conditions,
7845 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7846 Fput (Qcoding_system_error, Qerror_message,
9ce27fde 7847 build_string ("Invalid coding system"));
4ed46869 7848
df7492f9
KH
7849 /* Intern this now in case it isn't already done.
7850 Setting this variable twice is harmless.
7851 But don't staticpro it here--that is done in alloc.c. */
7852 Qchar_table_extra_slots = intern ("char-table-extra-slots");
4ed46869 7853
df7492f9 7854 DEFSYM (Qtranslation_table, "translation-table");
1397dc18 7855 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
df7492f9
KH
7856 DEFSYM (Qtranslation_table_id, "translation-table-id");
7857 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
7858 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
bdd9fb48 7859
df7492f9 7860 DEFSYM (Qchar_coding_system, "char-coding-system");
84fbb8a0 7861
df7492f9 7862 Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (2));
a5d301df 7863
df7492f9 7864 DEFSYM (Qvalid_codes, "valid-codes");
05e6f5dc 7865
df7492f9 7866 DEFSYM (Qemacs_mule, "emacs-mule");
05e6f5dc 7867
df7492f9
KH
7868 Vcoding_category_table
7869 = Fmake_vector (make_number (coding_category_max), Qnil);
7870 staticpro (&Vcoding_category_table);
7871 /* Followings are target of code detection. */
7872 ASET (Vcoding_category_table, coding_category_iso_7,
7873 intern ("coding-category-iso-7"));
7874 ASET (Vcoding_category_table, coding_category_iso_7_tight,
7875 intern ("coding-category-iso-7-tight"));
7876 ASET (Vcoding_category_table, coding_category_iso_8_1,
7877 intern ("coding-category-iso-8-1"));
7878 ASET (Vcoding_category_table, coding_category_iso_8_2,
7879 intern ("coding-category-iso-8-2"));
7880 ASET (Vcoding_category_table, coding_category_iso_7_else,
7881 intern ("coding-category-iso-7-else"));
7882 ASET (Vcoding_category_table, coding_category_iso_8_else,
7883 intern ("coding-category-iso-8-else"));
7884 ASET (Vcoding_category_table, coding_category_utf_8,
7885 intern ("coding-category-utf-8"));
7886 ASET (Vcoding_category_table, coding_category_utf_16_be,
7887 intern ("coding-category-utf-16-be"));
7888 ASET (Vcoding_category_table, coding_category_utf_16_le,
7889 intern ("coding-category-utf-16-le"));
7890 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
7891 intern ("coding-category-utf-16-be-nosig"));
7892 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
7893 intern ("coding-category-utf-16-le-nosig"));
7894 ASET (Vcoding_category_table, coding_category_charset,
7895 intern ("coding-category-charset"));
7896 ASET (Vcoding_category_table, coding_category_sjis,
7897 intern ("coding-category-sjis"));
7898 ASET (Vcoding_category_table, coding_category_big5,
7899 intern ("coding-category-big5"));
7900 ASET (Vcoding_category_table, coding_category_ccl,
7901 intern ("coding-category-ccl"));
7902 ASET (Vcoding_category_table, coding_category_emacs_mule,
7903 intern ("coding-category-emacs-mule"));
7904 /* Followings are NOT target of code detection. */
7905 ASET (Vcoding_category_table, coding_category_raw_text,
7906 intern ("coding-category-raw-text"));
7907 ASET (Vcoding_category_table, coding_category_undecided,
7908 intern ("coding-category-undecided"));
70c22245 7909
4ed46869
KH
7910 defsubr (&Scoding_system_p);
7911 defsubr (&Sread_coding_system);
7912 defsubr (&Sread_non_nil_coding_system);
7913 defsubr (&Scheck_coding_system);
7914 defsubr (&Sdetect_coding_region);
d46c5b12 7915 defsubr (&Sdetect_coding_string);
05e6f5dc 7916 defsubr (&Sfind_coding_systems_region_internal);
df7492f9 7917 defsubr (&Scheck_coding_systems_region);
4ed46869
KH
7918 defsubr (&Sdecode_coding_region);
7919 defsubr (&Sencode_coding_region);
7920 defsubr (&Sdecode_coding_string);
7921 defsubr (&Sencode_coding_string);
7922 defsubr (&Sdecode_sjis_char);
7923 defsubr (&Sencode_sjis_char);
7924 defsubr (&Sdecode_big5_char);
7925 defsubr (&Sencode_big5_char);
1ba9e4ab 7926 defsubr (&Sset_terminal_coding_system_internal);
c4825358 7927 defsubr (&Sset_safe_terminal_coding_system_internal);
4ed46869 7928 defsubr (&Sterminal_coding_system);
1ba9e4ab 7929 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 7930 defsubr (&Skeyboard_coding_system);
a5d301df 7931 defsubr (&Sfind_operation_coding_system);
df7492f9
KH
7932 defsubr (&Sset_coding_system_priority);
7933 defsubr (&Sdefine_coding_system_internal);
7934 defsubr (&Sdefine_coding_system_alias);
7935 defsubr (&Scoding_system_base);
7936 defsubr (&Scoding_system_plist);
7937 defsubr (&Scoding_system_aliases);
7938 defsubr (&Scoding_system_eol_type);
7939 defsubr (&Scoding_system_priority_list);
4ed46869 7940
4608c386 7941 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
48b0f3ae
PJ
7942 doc: /* List of coding systems.
7943
7944Do not alter the value of this variable manually. This variable should be
df7492f9 7945updated by the functions `define-coding-system' and
48b0f3ae 7946`define-coding-system-alias'. */);
4608c386
KH
7947 Vcoding_system_list = Qnil;
7948
7949 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
48b0f3ae
PJ
7950 doc: /* Alist of coding system names.
7951Each element is one element list of coding system name.
7952This variable is given to `completing-read' as TABLE argument.
7953
7954Do not alter the value of this variable manually. This variable should be
7955updated by the functions `make-coding-system' and
7956`define-coding-system-alias'. */);
4608c386
KH
7957 Vcoding_system_alist = Qnil;
7958
4ed46869 7959 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
48b0f3ae
PJ
7960 doc: /* List of coding-categories (symbols) ordered by priority.
7961
7962On detecting a coding system, Emacs tries code detection algorithms
7963associated with each coding-category one by one in this order. When
7964one algorithm agrees with a byte sequence of source text, the coding
7965system bound to the corresponding coding-category is selected. */);
4ed46869
KH
7966 {
7967 int i;
7968
7969 Vcoding_category_list = Qnil;
df7492f9 7970 for (i = coding_category_max - 1; i >= 0; i--)
4ed46869 7971 Vcoding_category_list
d46c5b12
KH
7972 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7973 Vcoding_category_list);
4ed46869
KH
7974 }
7975
7976 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
48b0f3ae
PJ
7977 doc: /* Specify the coding system for read operations.
7978It is useful to bind this variable with `let', but do not set it globally.
7979If the value is a coding system, it is used for decoding on read operation.
7980If not, an appropriate element is used from one of the coding system alists:
7981There are three such tables, `file-coding-system-alist',
7982`process-coding-system-alist', and `network-coding-system-alist'. */);
4ed46869
KH
7983 Vcoding_system_for_read = Qnil;
7984
7985 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
48b0f3ae
PJ
7986 doc: /* Specify the coding system for write operations.
7987Programs bind this variable with `let', but you should not set it globally.
7988If the value is a coding system, it is used for encoding of output,
7989when writing it to a file and when sending it to a file or subprocess.
7990
7991If this does not specify a coding system, an appropriate element
7992is used from one of the coding system alists:
7993There are three such tables, `file-coding-system-alist',
7994`process-coding-system-alist', and `network-coding-system-alist'.
7995For output to files, if the above procedure does not specify a coding system,
7996the value of `buffer-file-coding-system' is used. */);
4ed46869
KH
7997 Vcoding_system_for_write = Qnil;
7998
7999 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
df7492f9
KH
8000 doc: /*
8001Coding system used in the latest file or process I/O. */);
4ed46869
KH
8002 Vlast_coding_system_used = Qnil;
8003
9ce27fde 8004 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
df7492f9
KH
8005 doc: /*
8006*Non-nil means always inhibit code conversion of end-of-line format.
48b0f3ae
PJ
8007See info node `Coding Systems' and info node `Text and Binary' concerning
8008such conversion. */);
9ce27fde
KH
8009 inhibit_eol_conversion = 0;
8010
ed29121d 8011 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
df7492f9
KH
8012 doc: /*
8013Non-nil means process buffer inherits coding system of process output.
48b0f3ae
PJ
8014Bind it to t if the process output is to be treated as if it were a file
8015read from some filesystem. */);
ed29121d
EZ
8016 inherit_process_coding_system = 0;
8017
02ba4723 8018 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
df7492f9
KH
8019 doc: /*
8020Alist to decide a coding system to use for a file I/O operation.
48b0f3ae
PJ
8021The format is ((PATTERN . VAL) ...),
8022where PATTERN is a regular expression matching a file name,
8023VAL is a coding system, a cons of coding systems, or a function symbol.
8024If VAL is a coding system, it is used for both decoding and encoding
8025the file contents.
8026If VAL is a cons of coding systems, the car part is used for decoding,
8027and the cdr part is used for encoding.
8028If VAL is a function symbol, the function must return a coding system
0192762c
DL
8029or a cons of coding systems which are used as above. The function gets
8030the arguments with which `find-operation-coding-systems' was called.
48b0f3ae
PJ
8031
8032See also the function `find-operation-coding-system'
8033and the variable `auto-coding-alist'. */);
02ba4723
KH
8034 Vfile_coding_system_alist = Qnil;
8035
8036 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
df7492f9
KH
8037 doc: /*
8038Alist to decide a coding system to use for a process I/O operation.
48b0f3ae
PJ
8039The format is ((PATTERN . VAL) ...),
8040where PATTERN is a regular expression matching a program name,
8041VAL is a coding system, a cons of coding systems, or a function symbol.
8042If VAL is a coding system, it is used for both decoding what received
8043from the program and encoding what sent to the program.
8044If VAL is a cons of coding systems, the car part is used for decoding,
8045and the cdr part is used for encoding.
8046If VAL is a function symbol, the function must return a coding system
8047or a cons of coding systems which are used as above.
8048
8049See also the function `find-operation-coding-system'. */);
02ba4723
KH
8050 Vprocess_coding_system_alist = Qnil;
8051
8052 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
df7492f9
KH
8053 doc: /*
8054Alist to decide a coding system to use for a network I/O operation.
48b0f3ae
PJ
8055The format is ((PATTERN . VAL) ...),
8056where PATTERN is a regular expression matching a network service name
8057or is a port number to connect to,
8058VAL is a coding system, a cons of coding systems, or a function symbol.
8059If VAL is a coding system, it is used for both decoding what received
8060from the network stream and encoding what sent to the network stream.
8061If VAL is a cons of coding systems, the car part is used for decoding,
8062and the cdr part is used for encoding.
8063If VAL is a function symbol, the function must return a coding system
8064or a cons of coding systems which are used as above.
8065
8066See also the function `find-operation-coding-system'. */);
02ba4723 8067 Vnetwork_coding_system_alist = Qnil;
4ed46869 8068
68c45bf0 8069 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
75205970
RS
8070 doc: /* Coding system to use with system messages.
8071Also used for decoding keyboard input on X Window system. */);
68c45bf0
PE
8072 Vlocale_coding_system = Qnil;
8073
005f0d35 8074 /* The eol mnemonics are reset in startup.el system-dependently. */
7722baf9 8075 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
df7492f9
KH
8076 doc: /*
8077*String displayed in mode line for UNIX-like (LF) end-of-line format. */);
7722baf9 8078 eol_mnemonic_unix = build_string (":");
4ed46869 8079
7722baf9 8080 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
df7492f9
KH
8081 doc: /*
8082*String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
7722baf9 8083 eol_mnemonic_dos = build_string ("\\");
4ed46869 8084
7722baf9 8085 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
df7492f9
KH
8086 doc: /*
8087*String displayed in mode line for MAC-like (CR) end-of-line format. */);
7722baf9 8088 eol_mnemonic_mac = build_string ("/");
4ed46869 8089
7722baf9 8090 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
df7492f9
KH
8091 doc: /*
8092*String displayed in mode line when end-of-line format is not yet determined. */);
7722baf9 8093 eol_mnemonic_undecided = build_string (":");
4ed46869 8094
84fbb8a0 8095 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
df7492f9
KH
8096 doc: /*
8097*Non-nil enables character translation while encoding and decoding. */);
84fbb8a0 8098 Venable_character_translation = Qt;
bdd9fb48 8099
f967223b 8100 DEFVAR_LISP ("standard-translation-table-for-decode",
48b0f3ae
PJ
8101 &Vstandard_translation_table_for_decode,
8102 doc: /* Table for translating characters while decoding. */);
f967223b 8103 Vstandard_translation_table_for_decode = Qnil;
bdd9fb48 8104
f967223b 8105 DEFVAR_LISP ("standard-translation-table-for-encode",
48b0f3ae
PJ
8106 &Vstandard_translation_table_for_encode,
8107 doc: /* Table for translating characters while encoding. */);
f967223b 8108 Vstandard_translation_table_for_encode = Qnil;
4ed46869 8109
df7492f9 8110 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
48b0f3ae
PJ
8111 doc: /* Alist of charsets vs revision numbers.
8112While encoding, if a charset (car part of an element) is found,
df7492f9
KH
8113designate it with the escape sequence identifying revision (cdr part
8114of the element). */);
8115 Vcharset_revision_table = Qnil;
02ba4723
KH
8116
8117 DEFVAR_LISP ("default-process-coding-system",
8118 &Vdefault_process_coding_system,
48b0f3ae
PJ
8119 doc: /* Cons of coding systems used for process I/O by default.
8120The car part is used for decoding a process output,
8121the cdr part is used for encoding a text to be sent to a process. */);
02ba4723 8122 Vdefault_process_coding_system = Qnil;
c4825358 8123
3f003981 8124 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
df7492f9
KH
8125 doc: /*
8126Table of extra Latin codes in the range 128..159 (inclusive).
48b0f3ae
PJ
8127This is a vector of length 256.
8128If Nth element is non-nil, the existence of code N in a file
8129\(or output of subprocess) doesn't prevent it to be detected as
8130a coding system of ISO 2022 variant which has a flag
8131`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8132or reading output of a subprocess.
8133Only 128th through 159th elements has a meaning. */);
3f003981 8134 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
d46c5b12
KH
8135
8136 DEFVAR_LISP ("select-safe-coding-system-function",
8137 &Vselect_safe_coding_system_function,
df7492f9
KH
8138 doc: /*
8139Function to call to select safe coding system for encoding a text.
48b0f3ae
PJ
8140
8141If set, this function is called to force a user to select a proper
8142coding system which can encode the text in the case that a default
8143coding system used in each operation can't encode the text.
8144
8145The default value is `select-safe-coding-system' (which see). */);
d46c5b12
KH
8146 Vselect_safe_coding_system_function = Qnil;
8147
05e6f5dc 8148 DEFVAR_LISP ("char-coding-system-table", &Vchar_coding_system_table,
df7492f9
KH
8149 doc: /*
8150Char-table containing safe coding systems of each characters.
48b0f3ae
PJ
8151Each element doesn't include such generic coding systems that can
8152encode any characters. They are in the first extra slot. */);
05e6f5dc
KH
8153 Vchar_coding_system_table = Fmake_char_table (Qchar_coding_system, Qnil);
8154
22ab2303 8155 DEFVAR_BOOL ("inhibit-iso-escape-detection",
74383408 8156 &inhibit_iso_escape_detection,
df7492f9
KH
8157 doc: /*
8158If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
48b0f3ae
PJ
8159
8160By default, on reading a file, Emacs tries to detect how the text is
8161encoded. This code detection is sensitive to escape sequences. If
8162the sequence is valid as ISO2022, the code is determined as one of
8163the ISO2022 encodings, and the file is decoded by the corresponding
8164coding system (e.g. `iso-2022-7bit').
8165
8166However, there may be a case that you want to read escape sequences in
8167a file as is. In such a case, you can set this variable to non-nil.
8168Then, as the code detection ignores any escape sequences, no file is
8169detected as encoded in some ISO2022 encoding. The result is that all
8170escape sequences become visible in a buffer.
8171
8172The default value is nil, and it is strongly recommended not to change
8173it. That is because many Emacs Lisp source files that contain
8174non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8175in Emacs's distribution, and they won't be decoded correctly on
8176reading if you suppress escape sequence detection.
8177
8178The other way to read escape sequences in a file without decoding is
8179to explicitly specify some coding system that doesn't use ISO2022's
8180escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
74383408 8181 inhibit_iso_escape_detection = 0;
2c78b7e1
KH
8182
8183 {
8184 Lisp_Object args[coding_arg_max];
8185 Lisp_Object plist[14];
8186 int i;
8187
8188 for (i = 0; i < coding_arg_max; i++)
8189 args[i] = Qnil;
8190
8191 plist[0] = intern (":name");
8192 plist[1] = args[coding_arg_name] = Qno_conversion;
8193 plist[2] = intern (":mnemonic");
8194 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
8195 plist[4] = intern (":coding-type");
8196 plist[5] = args[coding_arg_coding_type] = Qraw_text;
8197 plist[6] = intern (":ascii-compatible-p");
8198 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
8199 plist[8] = intern (":default-char");
8200 plist[9] = args[coding_arg_default_char] = make_number (0);
8201 plist[10] = intern (":docstring");
8202 plist[11] = build_string ("Do no conversion.\n\
8203\n\
8204When you visit a file with this coding, the file is read into a\n\
8205unibyte buffer as is, thus each byte of a file is treated as a\n\
8206character.");
8207 plist[12] = intern (":eol-type");
8208 plist[13] = args[coding_arg_eol_type] = Qunix;
8209 args[coding_arg_plist] = Flist (14, plist);
8210 Fdefine_coding_system_internal (coding_arg_max, args);
8211 }
8212
8213 setup_coding_system (Qno_conversion, &keyboard_coding);
8214 setup_coding_system (Qno_conversion, &terminal_coding);
8215 setup_coding_system (Qno_conversion, &safe_terminal_coding);
4ed46869
KH
8216}
8217
68c45bf0
PE
8218char *
8219emacs_strerror (error_number)
8220 int error_number;
8221{
8222 char *str;
8223
ca9c0567 8224 synchronize_system_messages_locale ();
68c45bf0
PE
8225 str = strerror (error_number);
8226
8227 if (! NILP (Vlocale_coding_system))
8228 {
8229 Lisp_Object dec = code_convert_string_norecord (build_string (str),
8230 Vlocale_coding_system,
8231 0);
8232 str = (char *) XSTRING (dec)->data;
8233 }
8234
8235 return str;
8236}
8237
4ed46869 8238#endif /* emacs */