(Fdefine_charset_internal): Fix bug for the case of re-defining a
[bpt/emacs.git] / src / coding.c
CommitLineData
4ed46869 1/* Coding system handler (conversion, detection, and etc).
4a2f9c6a 2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
203cb916 3 Licensed to the Free Software Foundation.
70ad9fc4 4 Copyright (C) 2001 Free Software Foundation, Inc.
df7492f9
KH
5 Copyright (C) 2001, 2002
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
4ed46869 8
369314dc
KH
9This file is part of GNU Emacs.
10
11GNU Emacs is free software; you can redistribute it and/or modify
12it under the terms of the GNU General Public License as published by
13the Free Software Foundation; either version 2, or (at your option)
14any later version.
4ed46869 15
369314dc
KH
16GNU Emacs is distributed in the hope that it will be useful,
17but WITHOUT ANY WARRANTY; without even the implied warranty of
18MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19GNU General Public License for more details.
4ed46869 20
369314dc
KH
21You should have received a copy of the GNU General Public License
22along with GNU Emacs; see the file COPYING. If not, write to
23the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24Boston, MA 02111-1307, USA. */
4ed46869
KH
25
26/*** TABLE OF CONTENTS ***
27
b73bfc1c 28 0. General comments
4ed46869 29 1. Preamble
df7492f9
KH
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
4ed46869
KH
41
42*/
43
df7492f9 44/*** 0. General comments ***
b73bfc1c
KH
45
46
df7492f9 47CODING SYSTEM
4ed46869 48
5bad0796
DL
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
e19c3639
KH
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 coding system.
57
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
5bad0796 60 stored in the hash table Vcharset_hash_table. The conversion from
e19c3639
KH
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
63
64 Coding systems are classified into the following types depending on
5bad0796 65 the encoding mechanism. Here's a brief description of the types.
df7492f9
KH
66
67 o UTF-8
68
69 o UTF-16
70
71 o Charset-base coding system
72
73 A coding system defined by one or more (coded) character sets.
5bad0796 74 Decoding and encoding are done by a code converter defined for each
df7492f9
KH
75 character set.
76
5bad0796 77 o Old Emacs internal format (emacs-mule)
df7492f9 78
5bad0796 79 The coding system adopted by old versions of Emacs (20 and 21).
4ed46869 80
df7492f9 81 o ISO2022-base coding system
93dec019 82
df7492f9
KH
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
87
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
89
4ed46869
KH
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
df7492f9 92 section 8.
4ed46869 93
df7492f9 94 o BIG5
4ed46869 95
df7492f9
KH
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used by Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
4ed46869 101
df7492f9 102 o CCL
27901516 103
5bad0796 104 If a user wants to decode/encode text encoded in a coding system
df7492f9
KH
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
27901516 108
df7492f9 109 o Raw-text
4ed46869 110
df7492f9 111 A coding system for a text containing raw eight-bit data. Emacs
5bad0796 112 treats each byte of source text as a character (except for
df7492f9 113 end-of-line conversion).
4ed46869 114
df7492f9
KH
115 o No-conversion
116
117 Like raw text, but don't do end-of-line conversion.
4ed46869 118
4ed46869 119
df7492f9 120END-OF-LINE FORMAT
4ed46869 121
5bad0796 122 How text end-of-line is encoded depends on operating system. For
df7492f9 123 instance, Unix's format is just one byte of LF (line-feed) code,
f4dee582 124 whereas DOS's format is two-byte sequence of `carriage-return' and
d46c5b12
KH
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
4ed46869 127
5bad0796 128 Since text character encoding and end-of-line encoding are
df7492f9
KH
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
4ed46869 131
e19c3639
KH
132STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
5bad0796 137 conversion (e.g. the location of source and destination data).
e19c3639 138
4ed46869
KH
139*/
140
df7492f9
KH
141/* COMMON MACROS */
142
143
4ed46869
KH
144/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145
df7492f9
KH
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX. Return 1 if the data contains
148 a byte sequence which can be decoded into non-ASCII characters by
149 the coding system. Otherwize (i.e. the data contains only ASCII
150 characters or invalid sequence) return 0.
151
152 It also resets some bits of an integer pointed by MASK. The macros
153 CATEGORY_MASK_XXX specifies each bit of this integer.
154
155 Below is the template of these functions. */
156
4ed46869 157#if 0
df7492f9
KH
158static int
159detect_coding_XXX (coding, mask)
160 struct coding_system *coding;
161 int *mask;
4ed46869 162{
df7492f9
KH
163 unsigned char *src = coding->source;
164 unsigned char *src_end = coding->source + coding->src_bytes;
165 int multibytep = coding->src_multibyte;
166 int c;
167 int found = 0;
168 ...;
169
170 while (1)
171 {
172 /* Get one byte from the source. If the souce is exausted, jump
173 to no_more_source:. */
174 ONE_MORE_BYTE (c);
175 /* Check if it conforms to XXX. If not, break the loop. */
176 }
177 /* As the data is invalid for XXX, reset a proper bits. */
178 *mask &= ~CODING_CATEGORY_XXX;
179 return 0;
180 no_more_source:
181 /* The source exausted. */
182 if (!found)
183 /* ASCII characters only. */
184 return 0;
185 /* Some data should be decoded into non-ASCII characters. */
186 *mask &= CODING_CATEGORY_XXX;
187 return 1;
4ed46869
KH
188}
189#endif
190
191/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
192
df7492f9
KH
193 These functions decode a byte sequence specified as a source by
194 CODING. The resulting multibyte text goes to a place pointed to by
195 CODING->charbuf, the length of which should not exceed
196 CODING->charbuf_size;
d46c5b12 197
df7492f9
KH
198 These functions set the information of original and decoded texts in
199 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
200 They also set CODING->result to one of CODING_RESULT_XXX indicating
201 how the decoding is finished.
d46c5b12 202
df7492f9 203 Below is the template of these functions. */
d46c5b12 204
4ed46869 205#if 0
b73bfc1c 206static void
df7492f9 207decode_coding_XXXX (coding)
4ed46869 208 struct coding_system *coding;
4ed46869 209{
df7492f9
KH
210 unsigned char *src = coding->source + coding->consumed;
211 unsigned char *src_end = coding->source + coding->src_bytes;
212 /* SRC_BASE remembers the start position in source in each loop.
213 The loop will be exited when there's not enough source code, or
214 when there's no room in CHARBUF for a decoded character. */
215 unsigned char *src_base;
216 /* A buffer to produce decoded characters. */
217 int *charbuf = coding->charbuf;
218 int *charbuf_end = charbuf + coding->charbuf_size;
219 int multibytep = coding->src_multibyte;
220
221 while (1)
222 {
223 src_base = src;
224 if (charbuf < charbuf_end)
225 /* No more room to produce a decoded character. */
226 break;
227 ONE_MORE_BYTE (c);
228 /* Decode it. */
229 }
230
231 no_more_source:
232 if (src_base < src_end
233 && coding->mode & CODING_MODE_LAST_BLOCK)
234 /* If the source ends by partial bytes to construct a character,
235 treat them as eight-bit raw data. */
236 while (src_base < src_end && charbuf < charbuf_end)
237 *charbuf++ = *src_base++;
238 /* Remember how many bytes and characters we consumed. If the
239 source is multibyte, the bytes and chars are not identical. */
240 coding->consumed = coding->consumed_char = src_base - coding->source;
241 /* Remember how many characters we produced. */
242 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
243}
244#endif
245
246/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
247
df7492f9
KH
248 These functions encode SRC_BYTES length text at SOURCE of Emacs'
249 internal multibyte format by CODING. The resulting byte sequence
b73bfc1c
KH
250 goes to a place pointed to by DESTINATION, the length of which
251 should not exceed DST_BYTES.
d46c5b12 252
df7492f9
KH
253 These functions set the information of original and encoded texts in
254 the members produced, produced_char, consumed, and consumed_char of
255 the structure *CODING. They also set the member result to one of
256 CODING_RESULT_XXX indicating how the encoding finished.
d46c5b12 257
df7492f9
KH
258 DST_BYTES zero means that source area and destination area are
259 overlapped, which means that we can produce a encoded text until it
260 reaches at the head of not-yet-encoded source text.
d46c5b12 261
df7492f9 262 Below is a template of these functions. */
4ed46869 263#if 0
b73bfc1c 264static void
df7492f9 265encode_coding_XXX (coding)
4ed46869 266 struct coding_system *coding;
4ed46869 267{
df7492f9
KH
268 int multibytep = coding->dst_multibyte;
269 int *charbuf = coding->charbuf;
270 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
271 unsigned char *dst = coding->destination + coding->produced;
272 unsigned char *dst_end = coding->destination + coding->dst_bytes;
273 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
274 int produced_chars = 0;
275
276 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
277 {
278 int c = *charbuf;
279 /* Encode C into DST, and increment DST. */
280 }
281 label_no_more_destination:
282 /* How many chars and bytes we produced. */
283 coding->produced_char += produced_chars;
284 coding->produced = dst - coding->destination;
4ed46869
KH
285}
286#endif
287
4ed46869
KH
288\f
289/*** 1. Preamble ***/
290
68c45bf0 291#include <config.h>
4ed46869
KH
292#include <stdio.h>
293
4ed46869
KH
294#include "lisp.h"
295#include "buffer.h"
df7492f9 296#include "character.h"
4ed46869
KH
297#include "charset.h"
298#include "ccl.h"
df7492f9 299#include "composite.h"
4ed46869
KH
300#include "coding.h"
301#include "window.h"
302
df7492f9 303Lisp_Object Vcoding_system_hash_table;
4ed46869 304
df7492f9
KH
305Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
306Lisp_Object Qunix, Qdos, Qmac;
4ed46869
KH
307Lisp_Object Qbuffer_file_coding_system;
308Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
df7492f9 309Lisp_Object Qdefault_char;
27901516 310Lisp_Object Qno_conversion, Qundecided;
df7492f9
KH
311Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
312Lisp_Object Qutf_16_be_nosig, Qutf_16_be, Qutf_16_le_nosig, Qutf_16_le;
313Lisp_Object Qsignature, Qendian, Qbig, Qlittle;
bb0115a2 314Lisp_Object Qcoding_system_history;
1397dc18 315Lisp_Object Qvalid_codes;
4ed46869
KH
316
317extern Lisp_Object Qinsert_file_contents, Qwrite_region;
318Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
319Lisp_Object Qstart_process, Qopen_network_stream;
320Lisp_Object Qtarget_idx;
321
d46c5b12
KH
322Lisp_Object Vselect_safe_coding_system_function;
323
7722baf9
EZ
324/* Mnemonic string for each format of end-of-line. */
325Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
326/* Mnemonic string to indicate format of end-of-line is not yet
4ed46869 327 decided. */
7722baf9 328Lisp_Object eol_mnemonic_undecided;
4ed46869
KH
329
330#ifdef emacs
331
4608c386
KH
332Lisp_Object Vcoding_system_list, Vcoding_system_alist;
333
334Lisp_Object Qcoding_system_p, Qcoding_system_error;
4ed46869 335
d46c5b12
KH
336/* Coding system emacs-mule and raw-text are for converting only
337 end-of-line format. */
338Lisp_Object Qemacs_mule, Qraw_text;
9ce27fde 339
4ed46869
KH
340/* Coding-systems are handed between Emacs Lisp programs and C internal
341 routines by the following three variables. */
342/* Coding-system for reading files and receiving data from process. */
343Lisp_Object Vcoding_system_for_read;
344/* Coding-system for writing files and sending data to process. */
345Lisp_Object Vcoding_system_for_write;
346/* Coding-system actually used in the latest I/O. */
347Lisp_Object Vlast_coding_system_used;
348
c4825358 349/* A vector of length 256 which contains information about special
94487c4e 350 Latin codes (especially for dealing with Microsoft codes). */
3f003981 351Lisp_Object Vlatin_extra_code_table;
c4825358 352
9ce27fde
KH
353/* Flag to inhibit code conversion of end-of-line format. */
354int inhibit_eol_conversion;
355
74383408
KH
356/* Flag to inhibit ISO2022 escape sequence detection. */
357int inhibit_iso_escape_detection;
358
ed29121d
EZ
359/* Flag to make buffer-file-coding-system inherit from process-coding. */
360int inherit_process_coding_system;
361
c4825358 362/* Coding system to be used to encode text for terminal display. */
4ed46869
KH
363struct coding_system terminal_coding;
364
c4825358
KH
365/* Coding system to be used to encode text for terminal display when
366 terminal coding system is nil. */
367struct coding_system safe_terminal_coding;
368
369/* Coding system of what is sent from terminal keyboard. */
4ed46869
KH
370struct coding_system keyboard_coding;
371
02ba4723
KH
372Lisp_Object Vfile_coding_system_alist;
373Lisp_Object Vprocess_coding_system_alist;
374Lisp_Object Vnetwork_coding_system_alist;
4ed46869 375
68c45bf0
PE
376Lisp_Object Vlocale_coding_system;
377
4ed46869
KH
378#endif /* emacs */
379
f967223b
KH
380/* Flag to tell if we look up translation table on character code
381 conversion. */
84fbb8a0 382Lisp_Object Venable_character_translation;
f967223b
KH
383/* Standard translation table to look up on decoding (reading). */
384Lisp_Object Vstandard_translation_table_for_decode;
385/* Standard translation table to look up on encoding (writing). */
386Lisp_Object Vstandard_translation_table_for_encode;
84fbb8a0 387
f967223b
KH
388Lisp_Object Qtranslation_table;
389Lisp_Object Qtranslation_table_id;
390Lisp_Object Qtranslation_table_for_decode;
391Lisp_Object Qtranslation_table_for_encode;
4ed46869
KH
392
393/* Alist of charsets vs revision number. */
df7492f9 394static Lisp_Object Vcharset_revision_table;
4ed46869 395
02ba4723
KH
396/* Default coding systems used for process I/O. */
397Lisp_Object Vdefault_process_coding_system;
398
b843d1ae
KH
399/* Global flag to tell that we can't call post-read-conversion and
400 pre-write-conversion functions. Usually the value is zero, but it
401 is set to 1 temporarily while such functions are running. This is
402 to avoid infinite recursive call. */
403static int inhibit_pre_post_conversion;
404
df7492f9
KH
405/* Two special coding systems. */
406Lisp_Object Vsjis_coding_system;
407Lisp_Object Vbig5_coding_system;
408
409
410static int detect_coding_utf_8 P_ ((struct coding_system *, int *));
411static void decode_coding_utf_8 P_ ((struct coding_system *));
412static int encode_coding_utf_8 P_ ((struct coding_system *));
413
414static int detect_coding_utf_16 P_ ((struct coding_system *, int *));
415static void decode_coding_utf_16 P_ ((struct coding_system *));
416static int encode_coding_utf_16 P_ ((struct coding_system *));
417
418static int detect_coding_iso_2022 P_ ((struct coding_system *, int *));
419static void decode_coding_iso_2022 P_ ((struct coding_system *));
420static int encode_coding_iso_2022 P_ ((struct coding_system *));
421
422static int detect_coding_emacs_mule P_ ((struct coding_system *, int *));
423static void decode_coding_emacs_mule P_ ((struct coding_system *));
424static int encode_coding_emacs_mule P_ ((struct coding_system *));
425
426static int detect_coding_sjis P_ ((struct coding_system *, int *));
427static void decode_coding_sjis P_ ((struct coding_system *));
428static int encode_coding_sjis P_ ((struct coding_system *));
429
430static int detect_coding_big5 P_ ((struct coding_system *, int *));
431static void decode_coding_big5 P_ ((struct coding_system *));
432static int encode_coding_big5 P_ ((struct coding_system *));
433
434static int detect_coding_ccl P_ ((struct coding_system *, int *));
435static void decode_coding_ccl P_ ((struct coding_system *));
436static int encode_coding_ccl P_ ((struct coding_system *));
437
438static void decode_coding_raw_text P_ ((struct coding_system *));
439static int encode_coding_raw_text P_ ((struct coding_system *));
440
441
442/* ISO2022 section */
443
444#define CODING_ISO_INITIAL(coding, reg) \
445 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
446 coding_attr_iso_initial), \
447 reg)))
448
449
450#define CODING_ISO_REQUEST(coding, charset_id) \
451 ((charset_id <= (coding)->max_charset_id \
452 ? (coding)->safe_charsets[charset_id] \
453 : -1))
454
455
456#define CODING_ISO_FLAGS(coding) \
457 ((coding)->spec.iso_2022.flags)
458#define CODING_ISO_DESIGNATION(coding, reg) \
459 ((coding)->spec.iso_2022.current_designation[reg])
460#define CODING_ISO_INVOCATION(coding, plane) \
461 ((coding)->spec.iso_2022.current_invocation[plane])
462#define CODING_ISO_SINGLE_SHIFTING(coding) \
463 ((coding)->spec.iso_2022.single_shifting)
464#define CODING_ISO_BOL(coding) \
465 ((coding)->spec.iso_2022.bol)
466#define CODING_ISO_INVOKED_CHARSET(coding, plane) \
467 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
468
469/* Control characters of ISO2022. */
470 /* code */ /* function */
471#define ISO_CODE_LF 0x0A /* line-feed */
472#define ISO_CODE_CR 0x0D /* carriage-return */
473#define ISO_CODE_SO 0x0E /* shift-out */
474#define ISO_CODE_SI 0x0F /* shift-in */
475#define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
476#define ISO_CODE_ESC 0x1B /* escape */
477#define ISO_CODE_SS2 0x8E /* single-shift-2 */
478#define ISO_CODE_SS3 0x8F /* single-shift-3 */
479#define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
480
481/* All code (1-byte) of ISO2022 is classified into one of the
482 followings. */
483enum iso_code_class_type
484 {
485 ISO_control_0, /* Control codes in the range
486 0x00..0x1F and 0x7F, except for the
487 following 5 codes. */
488 ISO_carriage_return, /* ISO_CODE_CR (0x0D) */
489 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
490 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
491 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
492 ISO_escape, /* ISO_CODE_SO (0x1B) */
493 ISO_control_1, /* Control codes in the range
494 0x80..0x9F, except for the
495 following 3 codes. */
496 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
497 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
498 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
499 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
500 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
501 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
502 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
503 };
05e6f5dc 504
df7492f9
KH
505/** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
506 `iso-flags' attribute of an iso2022 coding system. */
93dec019 507
df7492f9
KH
508/* If set, produce long-form designation sequence (e.g. ESC $ ( A)
509 instead of the correct short-form sequence (e.g. ESC $ A). */
510#define CODING_ISO_FLAG_LONG_FORM 0x0001
05e6f5dc 511
df7492f9
KH
512/* If set, reset graphic planes and registers at end-of-line to the
513 initial state. */
514#define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
05e6f5dc 515
df7492f9
KH
516/* If set, reset graphic planes and registers before any control
517 characters to the initial state. */
518#define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
4ed46869 519
df7492f9
KH
520/* If set, encode by 7-bit environment. */
521#define CODING_ISO_FLAG_SEVEN_BITS 0x0008
b73bfc1c 522
df7492f9
KH
523/* If set, use locking-shift function. */
524#define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
b73bfc1c 525
df7492f9
KH
526/* If set, use single-shift function. Overwrite
527 CODING_ISO_FLAG_LOCKING_SHIFT. */
528#define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
b73bfc1c 529
df7492f9
KH
530/* If set, use designation escape sequence. */
531#define CODING_ISO_FLAG_DESIGNATION 0x0040
b73bfc1c 532
df7492f9
KH
533/* If set, produce revision number sequence. */
534#define CODING_ISO_FLAG_REVISION 0x0080
f4dee582 535
df7492f9
KH
536/* If set, produce ISO6429's direction specifying sequence. */
537#define CODING_ISO_FLAG_DIRECTION 0x0100
4ed46869 538
df7492f9
KH
539/* If set, assume designation states are reset at beginning of line on
540 output. */
541#define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
aa72b389 542
df7492f9
KH
543/* If set, designation sequence should be placed at beginning of line
544 on output. */
545#define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
aa72b389 546
df7492f9
KH
547/* If set, do not encode unsafe charactes on output. */
548#define CODING_ISO_FLAG_SAFE 0x0800
aa72b389 549
df7492f9
KH
550/* If set, extra latin codes (128..159) are accepted as a valid code
551 on input. */
552#define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
aa72b389 553
df7492f9 554#define CODING_ISO_FLAG_COMPOSITION 0x2000
aa72b389 555
df7492f9 556#define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
aa72b389 557
df7492f9 558#define CODING_ISO_FLAG_FULL_SUPPORT 0x8000
aa72b389 559
df7492f9
KH
560/* A character to be produced on output if encoding of the original
561 character is prohibited by CODING_ISO_FLAG_SAFE. */
562#define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
aa72b389 563
aa72b389 564
df7492f9
KH
565/* UTF-16 section */
566#define CODING_UTF_16_BOM(coding) \
567 ((coding)->spec.utf_16.bom)
4ed46869 568
df7492f9
KH
569#define CODING_UTF_16_ENDIAN(coding) \
570 ((coding)->spec.utf_16.endian)
4ed46869 571
df7492f9
KH
572#define CODING_UTF_16_SURROGATE(coding) \
573 ((coding)->spec.utf_16.surrogate)
4ed46869 574
4ed46869 575
df7492f9
KH
576/* CCL section */
577#define CODING_CCL_DECODER(coding) \
578 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
579#define CODING_CCL_ENCODER(coding) \
580 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
581#define CODING_CCL_VALIDS(coding) \
582 (XSTRING (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)) \
583 ->data)
4ed46869 584
df7492f9 585/* Index for each coding category in `coding_category_table' */
4ed46869 586
df7492f9
KH
587enum coding_category
588 {
589 coding_category_iso_7,
590 coding_category_iso_7_tight,
591 coding_category_iso_8_1,
592 coding_category_iso_8_2,
593 coding_category_iso_7_else,
594 coding_category_iso_8_else,
595 coding_category_utf_8,
596 coding_category_utf_16_auto,
597 coding_category_utf_16_be,
598 coding_category_utf_16_le,
599 coding_category_utf_16_be_nosig,
600 coding_category_utf_16_le_nosig,
601 coding_category_charset,
602 coding_category_sjis,
603 coding_category_big5,
604 coding_category_ccl,
605 coding_category_emacs_mule,
606 /* All above are targets of code detection. */
607 coding_category_raw_text,
608 coding_category_undecided,
609 coding_category_max
610 };
611
612/* Definitions of flag bits used in detect_coding_XXXX. */
613#define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
614#define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
615#define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
616#define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
617#define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
618#define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
619#define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
620#define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
621#define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
622#define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
623#define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
624#define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
625#define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
626#define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
627#define CATEGORY_MASK_CCL (1 << coding_category_ccl)
628#define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
629
630/* This value is returned if detect_coding_mask () find nothing other
631 than ASCII characters. */
632#define CATEGORY_MASK_ANY \
633 (CATEGORY_MASK_ISO_7 \
634 | CATEGORY_MASK_ISO_7_TIGHT \
635 | CATEGORY_MASK_ISO_8_1 \
636 | CATEGORY_MASK_ISO_8_2 \
637 | CATEGORY_MASK_ISO_7_ELSE \
638 | CATEGORY_MASK_ISO_8_ELSE \
639 | CATEGORY_MASK_UTF_8 \
640 | CATEGORY_MASK_UTF_16_BE \
641 | CATEGORY_MASK_UTF_16_LE \
642 | CATEGORY_MASK_UTF_16_BE_NOSIG \
643 | CATEGORY_MASK_UTF_16_LE_NOSIG \
644 | CATEGORY_MASK_CHARSET \
645 | CATEGORY_MASK_SJIS \
646 | CATEGORY_MASK_BIG5 \
647 | CATEGORY_MASK_CCL \
648 | CATEGORY_MASK_EMACS_MULE)
649
650
651#define CATEGORY_MASK_ISO_7BIT \
652 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
653
654#define CATEGORY_MASK_ISO_8BIT \
655 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
656
657#define CATEGORY_MASK_ISO_ELSE \
658 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
659
660#define CATEGORY_MASK_ISO_ESCAPE \
661 (CATEGORY_MASK_ISO_7 \
662 | CATEGORY_MASK_ISO_7_TIGHT \
663 | CATEGORY_MASK_ISO_7_ELSE \
664 | CATEGORY_MASK_ISO_8_ELSE)
665
666#define CATEGORY_MASK_ISO \
667 ( CATEGORY_MASK_ISO_7BIT \
668 | CATEGORY_MASK_ISO_8BIT \
669 | CATEGORY_MASK_ISO_ELSE)
670
671#define CATEGORY_MASK_UTF_16 \
672 (CATEGORY_MASK_UTF_16_BE \
673 | CATEGORY_MASK_UTF_16_LE \
674 | CATEGORY_MASK_UTF_16_BE_NOSIG \
675 | CATEGORY_MASK_UTF_16_LE_NOSIG)
676
677
678/* List of symbols `coding-category-xxx' ordered by priority. This
679 variable is exposed to Emacs Lisp. */
680static Lisp_Object Vcoding_category_list;
681
682/* Table of coding categories (Lisp symbols). This variable is for
683 internal use oly. */
684static Lisp_Object Vcoding_category_table;
685
686/* Table of coding-categories ordered by priority. */
687static enum coding_category coding_priorities[coding_category_max];
688
689/* Nth element is a coding context for the coding system bound to the
690 Nth coding category. */
691static struct coding_system coding_categories[coding_category_max];
692
693static int detected_mask[coding_category_raw_text] =
694 { CATEGORY_MASK_ISO,
695 CATEGORY_MASK_ISO,
696 CATEGORY_MASK_ISO,
697 CATEGORY_MASK_ISO,
698 CATEGORY_MASK_ISO,
699 CATEGORY_MASK_ISO,
700 CATEGORY_MASK_UTF_8,
701 CATEGORY_MASK_UTF_16,
702 CATEGORY_MASK_UTF_16,
703 CATEGORY_MASK_UTF_16,
704 CATEGORY_MASK_UTF_16,
705 CATEGORY_MASK_UTF_16,
706 CATEGORY_MASK_CHARSET,
707 CATEGORY_MASK_SJIS,
708 CATEGORY_MASK_BIG5,
709 CATEGORY_MASK_CCL,
710 CATEGORY_MASK_EMACS_MULE
711 };
712
713/*** Commonly used macros and functions ***/
714
715#ifndef min
716#define min(a, b) ((a) < (b) ? (a) : (b))
717#endif
718#ifndef max
719#define max(a, b) ((a) > (b) ? (a) : (b))
720#endif
4ed46869 721
df7492f9
KH
722#define CODING_GET_INFO(coding, attrs, eol_type, charset_list) \
723 do { \
724 attrs = CODING_ID_ATTRS (coding->id); \
725 eol_type = CODING_ID_EOL_TYPE (coding->id); \
726 if (VECTORP (eol_type)) \
727 eol_type = Qunix; \
728 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
729 } while (0)
4ed46869 730
4ed46869 731
df7492f9
KH
732/* Safely get one byte from the source text pointed by SRC which ends
733 at SRC_END, and set C to that byte. If there are not enough bytes
734 in the source, it jumps to `no_more_source'. The caller
735 should declare and set these variables appropriately in advance:
736 src, src_end, multibytep
737*/
aa72b389 738
df7492f9 739#define ONE_MORE_BYTE(c) \
aa72b389 740 do { \
df7492f9
KH
741 if (src == src_end) \
742 { \
743 if (src_base < src) \
744 coding->result = CODING_RESULT_INSUFFICIENT_SRC; \
745 goto no_more_source; \
746 } \
747 c = *src++; \
748 if (multibytep && (c & 0x80)) \
749 { \
750 if ((c & 0xFE) != 0xC0) \
751 error ("Undecodable char found"); \
752 c = ((c & 1) << 6) | *src++; \
753 } \
754 consumed_chars++; \
aa72b389
KH
755 } while (0)
756
aa72b389 757
df7492f9
KH
758#define ONE_MORE_BYTE_NO_CHECK(c) \
759 do { \
760 c = *src++; \
761 if (multibytep && (c & 0x80)) \
762 { \
763 if ((c & 0xFE) != 0xC0) \
764 error ("Undecodable char found"); \
765 c = ((c & 1) << 6) | *src++; \
766 } \
aa72b389
KH
767 } while (0)
768
aa72b389 769
df7492f9
KH
770/* Store a byte C in the place pointed by DST and increment DST to the
771 next free point, and increment PRODUCED_CHARS. The caller should
772 assure that C is 0..127, and declare and set the variable `dst'
773 appropriately in advance.
774*/
aa72b389
KH
775
776
df7492f9
KH
777#define EMIT_ONE_ASCII_BYTE(c) \
778 do { \
779 produced_chars++; \
780 *dst++ = (c); \
781 } while (0)
aa72b389 782
aa72b389 783
df7492f9 784/* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */
aa72b389 785
df7492f9
KH
786#define EMIT_TWO_ASCII_BYTES(c1, c2) \
787 do { \
788 produced_chars += 2; \
789 *dst++ = (c1), *dst++ = (c2); \
790 } while (0)
aa72b389 791
df7492f9
KH
792
793/* Store a byte C in the place pointed by DST and increment DST to the
794 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
795 nonzero, store in an appropriate multibyte from. The caller should
796 declare and set the variables `dst' and `multibytep' appropriately
797 in advance. */
798
799#define EMIT_ONE_BYTE(c) \
800 do { \
801 produced_chars++; \
802 if (multibytep) \
803 { \
804 int ch = (c); \
805 if (ch >= 0x80) \
806 ch = BYTE8_TO_CHAR (ch); \
807 CHAR_STRING_ADVANCE (ch, dst); \
808 } \
809 else \
810 *dst++ = (c); \
aa72b389
KH
811 } while (0)
812
813
df7492f9 814/* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
aa72b389 815
e19c3639
KH
816#define EMIT_TWO_BYTES(c1, c2) \
817 do { \
818 produced_chars += 2; \
819 if (multibytep) \
820 { \
821 int ch; \
822 \
823 ch = (c1); \
824 if (ch >= 0x80) \
825 ch = BYTE8_TO_CHAR (ch); \
826 CHAR_STRING_ADVANCE (ch, dst); \
827 ch = (c2); \
828 if (ch >= 0x80) \
829 ch = BYTE8_TO_CHAR (ch); \
830 CHAR_STRING_ADVANCE (ch, dst); \
831 } \
832 else \
833 { \
834 *dst++ = (c1); \
835 *dst++ = (c2); \
836 } \
aa72b389
KH
837 } while (0)
838
839
df7492f9
KH
840#define EMIT_THREE_BYTES(c1, c2, c3) \
841 do { \
842 EMIT_ONE_BYTE (c1); \
843 EMIT_TWO_BYTES (c2, c3); \
844 } while (0)
aa72b389 845
aa72b389 846
df7492f9
KH
847#define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
848 do { \
849 EMIT_TWO_BYTES (c1, c2); \
850 EMIT_TWO_BYTES (c3, c4); \
851 } while (0)
aa72b389 852
aa72b389 853
df7492f9
KH
854#define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
855 do { \
856 charset_map_loaded = 0; \
857 c = DECODE_CHAR (charset, code); \
858 if (charset_map_loaded) \
859 { \
860 unsigned char *orig = coding->source; \
861 EMACS_INT offset; \
862 \
863 coding_set_source (coding); \
864 offset = coding->source - orig; \
865 src += offset; \
866 src_base += offset; \
867 src_end += offset; \
868 } \
869 } while (0)
aa72b389 870
aa72b389 871
df7492f9
KH
872#define ASSURE_DESTINATION(bytes) \
873 do { \
874 if (dst + (bytes) >= dst_end) \
875 { \
876 int more_bytes = charbuf_end - charbuf + (bytes); \
877 \
878 dst = alloc_destination (coding, more_bytes, dst); \
879 dst_end = coding->destination + coding->dst_bytes; \
880 } \
881 } while (0)
b1887814 882
df7492f9
KH
883
884
885static void
886coding_set_source (coding)
887 struct coding_system *coding;
888{
889 if (BUFFERP (coding->src_object))
890 {
891 if (coding->src_pos < 0)
892 coding->source = GAP_END_ADDR + coding->src_pos_byte;
893 else
894 {
e19c3639 895 struct buffer *buf = XBUFFER (coding->src_object);
e19c3639
KH
896 EMACS_INT gpt_byte = BUF_GPT_BYTE (buf);
897 unsigned char *beg_addr = BUF_BEG_ADDR (buf);
898
899 coding->source = beg_addr + coding->src_pos_byte - 1;
900 if (coding->src_pos_byte >= gpt_byte)
901 coding->source += BUF_GAP_SIZE (buf);
aa72b389
KH
902 }
903 }
df7492f9 904 else if (STRINGP (coding->src_object))
aa72b389 905 {
df7492f9
KH
906 coding->source = (XSTRING (coding->src_object)->data
907 + coding->src_pos_byte);
908 }
909 else
910 /* Otherwise, the source is C string and is never relocated
911 automatically. Thus we don't have to update anything. */
912 ;
913}
914
915static void
916coding_set_destination (coding)
917 struct coding_system *coding;
918{
919 if (BUFFERP (coding->dst_object))
920 {
921 /* We are sure that coding->dst_pos_byte is before the gap of the
922 buffer. */
923 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
924 + coding->dst_pos_byte - 1);
925 if (coding->src_pos < 0)
df7492f9
KH
926 coding->dst_bytes = (GAP_END_ADDR
927 - (coding->src_bytes - coding->consumed)
928 - coding->destination);
929 else
930 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
931 - coding->destination);
932 }
933 else
934 /* Otherwise, the destination is C string and is never relocated
935 automatically. Thus we don't have to update anything. */
936 ;
937}
938
939
940static void
941coding_alloc_by_realloc (coding, bytes)
942 struct coding_system *coding;
943 EMACS_INT bytes;
944{
945 coding->destination = (unsigned char *) xrealloc (coding->destination,
946 coding->dst_bytes + bytes);
947 coding->dst_bytes += bytes;
948}
949
950static void
951coding_alloc_by_making_gap (coding, bytes)
952 struct coding_system *coding;
953 EMACS_INT bytes;
954{
2c78b7e1
KH
955 if (BUFFERP (coding->dst_object)
956 && EQ (coding->src_object, coding->dst_object))
df7492f9
KH
957 {
958 EMACS_INT add = coding->src_bytes - coding->consumed;
959
960 GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
961 make_gap (bytes);
962 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
963 }
964 else
965 {
2c78b7e1
KH
966 Lisp_Object this_buffer;
967
968 this_buffer = Fcurrent_buffer ();
df7492f9
KH
969 set_buffer_internal (XBUFFER (coding->dst_object));
970 make_gap (bytes);
971 set_buffer_internal (XBUFFER (this_buffer));
972 }
973}
974
975
976static unsigned char *
977alloc_destination (coding, nbytes, dst)
978 struct coding_system *coding;
979 int nbytes;
980 unsigned char *dst;
981{
982 EMACS_INT offset = dst - coding->destination;
983
984 if (BUFFERP (coding->dst_object))
985 coding_alloc_by_making_gap (coding, nbytes);
986 else
987 coding_alloc_by_realloc (coding, nbytes);
988 coding->result = CODING_RESULT_SUCCESS;
989 coding_set_destination (coding);
990 dst = coding->destination + offset;
991 return dst;
992}
aa72b389 993
df7492f9
KH
994\f
995/*** 2. Emacs' internal format (emacs-utf-8) ***/
996
997
998
999\f
1000/*** 3. UTF-8 ***/
1001
1002/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1003 Check if a text is encoded in UTF-8. If it is, return
1004 CATEGORY_MASK_UTF_8, else return 0. */
1005
1006#define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1007#define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1008#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1009#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1010#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1011#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1012
1013static int
1014detect_coding_utf_8 (coding, mask)
1015 struct coding_system *coding;
1016 int *mask;
1017{
1018 unsigned char *src = coding->source, *src_base = src;
1019 unsigned char *src_end = coding->source + coding->src_bytes;
1020 int multibytep = coding->src_multibyte;
1021 int consumed_chars = 0;
1022 int found = 0;
1023
1024 /* A coding system of this category is always ASCII compatible. */
1025 src += coding->head_ascii;
1026
1027 while (1)
1028 {
1029 int c, c1, c2, c3, c4;
1030
1031 ONE_MORE_BYTE (c);
1032 if (UTF_8_1_OCTET_P (c))
1033 continue;
1034 ONE_MORE_BYTE (c1);
1035 if (! UTF_8_EXTRA_OCTET_P (c1))
1036 break;
1037 if (UTF_8_2_OCTET_LEADING_P (c))
1038 {
1039 found++;
1040 continue;
1041 }
1042 ONE_MORE_BYTE (c2);
1043 if (! UTF_8_EXTRA_OCTET_P (c2))
1044 break;
1045 if (UTF_8_3_OCTET_LEADING_P (c))
1046 {
1047 found++;
1048 continue;
1049 }
1050 ONE_MORE_BYTE (c3);
1051 if (! UTF_8_EXTRA_OCTET_P (c3))
1052 break;
1053 if (UTF_8_4_OCTET_LEADING_P (c))
aa72b389 1054 {
df7492f9
KH
1055 found++;
1056 continue;
1057 }
1058 ONE_MORE_BYTE (c4);
1059 if (! UTF_8_EXTRA_OCTET_P (c4))
1060 break;
1061 if (UTF_8_5_OCTET_LEADING_P (c))
1062 {
1063 found++;
1064 continue;
1065 }
1066 break;
1067 }
1068 *mask &= ~CATEGORY_MASK_UTF_8;
1069 return 0;
1070
1071 no_more_source:
1072 if (! found)
1073 return 0;
1074 *mask &= CATEGORY_MASK_UTF_8;
1075 return 1;
1076}
1077
1078
1079static void
1080decode_coding_utf_8 (coding)
1081 struct coding_system *coding;
1082{
1083 unsigned char *src = coding->source + coding->consumed;
1084 unsigned char *src_end = coding->source + coding->src_bytes;
1085 unsigned char *src_base;
1086 int *charbuf = coding->charbuf;
1087 int *charbuf_end = charbuf + coding->charbuf_size;
1088 int consumed_chars = 0, consumed_chars_base;
1089 int multibytep = coding->src_multibyte;
1090 Lisp_Object attr, eol_type, charset_list;
1091
1092 CODING_GET_INFO (coding, attr, eol_type, charset_list);
1093
1094 while (1)
1095 {
1096 int c, c1, c2, c3, c4, c5;
1097
1098 src_base = src;
1099 consumed_chars_base = consumed_chars;
1100
1101 if (charbuf >= charbuf_end)
1102 break;
1103
1104 ONE_MORE_BYTE (c1);
1105 if (UTF_8_1_OCTET_P(c1))
1106 {
1107 c = c1;
1108 if (c == '\r')
aa72b389 1109 {
df7492f9
KH
1110 if (EQ (eol_type, Qdos))
1111 {
1112 if (src == src_end)
1113 goto no_more_source;
1114 if (*src == '\n')
1115 ONE_MORE_BYTE (c);
1116 }
1117 else if (EQ (eol_type, Qmac))
1118 c = '\n';
aa72b389 1119 }
aa72b389 1120 }
df7492f9 1121 else
aa72b389 1122 {
df7492f9
KH
1123 ONE_MORE_BYTE (c2);
1124 if (! UTF_8_EXTRA_OCTET_P (c2))
1125 goto invalid_code;
1126 if (UTF_8_2_OCTET_LEADING_P (c1))
1127 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1128 else
aa72b389 1129 {
df7492f9
KH
1130 ONE_MORE_BYTE (c3);
1131 if (! UTF_8_EXTRA_OCTET_P (c3))
1132 goto invalid_code;
1133 if (UTF_8_3_OCTET_LEADING_P (c1))
1134 c = (((c1 & 0xF) << 12)
1135 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1136 else
1137 {
1138 ONE_MORE_BYTE (c4);
1139 if (! UTF_8_EXTRA_OCTET_P (c4))
1140 goto invalid_code;
1141 if (UTF_8_4_OCTET_LEADING_P (c1))
1142 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1143 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1144 else
1145 {
1146 ONE_MORE_BYTE (c5);
1147 if (! UTF_8_EXTRA_OCTET_P (c5))
1148 goto invalid_code;
1149 if (UTF_8_5_OCTET_LEADING_P (c1))
1150 {
1151 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1152 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1153 | (c5 & 0x3F));
1154 if (c > MAX_CHAR)
1155 goto invalid_code;
1156 }
1157 else
1158 goto invalid_code;
1159 }
1160 }
aa72b389 1161 }
aa72b389 1162 }
df7492f9
KH
1163
1164 *charbuf++ = c;
1165 continue;
1166
1167 invalid_code:
1168 src = src_base;
1169 consumed_chars = consumed_chars_base;
1170 ONE_MORE_BYTE (c);
1171 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1172 coding->errors++;
aa72b389
KH
1173 }
1174
df7492f9
KH
1175 no_more_source:
1176 coding->consumed_char += consumed_chars_base;
1177 coding->consumed = src_base - coding->source;
1178 coding->charbuf_used = charbuf - coding->charbuf;
1179}
1180
1181
1182static int
1183encode_coding_utf_8 (coding)
1184 struct coding_system *coding;
1185{
1186 int multibytep = coding->dst_multibyte;
1187 int *charbuf = coding->charbuf;
1188 int *charbuf_end = charbuf + coding->charbuf_used;
1189 unsigned char *dst = coding->destination + coding->produced;
1190 unsigned char *dst_end = coding->destination + coding->dst_bytes;
e19c3639 1191 int produced_chars = 0;
df7492f9
KH
1192 int c;
1193
1194 if (multibytep)
aa72b389 1195 {
df7492f9
KH
1196 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1197
1198 while (charbuf < charbuf_end)
aa72b389 1199 {
df7492f9
KH
1200 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1201
1202 ASSURE_DESTINATION (safe_room);
1203 c = *charbuf++;
1204 CHAR_STRING_ADVANCE (c, pend);
1205 for (p = str; p < pend; p++)
1206 EMIT_ONE_BYTE (*p);
aa72b389 1207 }
aa72b389 1208 }
df7492f9
KH
1209 else
1210 {
1211 int safe_room = MAX_MULTIBYTE_LENGTH;
1212
1213 while (charbuf < charbuf_end)
1214 {
1215 ASSURE_DESTINATION (safe_room);
1216 c = *charbuf++;
1217 dst += CHAR_STRING (c, dst);
1218 produced_chars++;
1219 }
1220 }
1221 coding->result = CODING_RESULT_SUCCESS;
1222 coding->produced_char += produced_chars;
1223 coding->produced = dst - coding->destination;
1224 return 0;
aa72b389
KH
1225}
1226
4ed46869 1227
df7492f9
KH
1228/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1229 Check if a text is encoded in UTF-16 Big Endian (endian == 1) or
1230 Little Endian (otherwise). If it is, return
1231 CATEGORY_MASK_UTF_16_BE or CATEGORY_MASK_UTF_16_LE,
1232 else return 0. */
1233
1234#define UTF_16_HIGH_SURROGATE_P(val) \
1235 (((val) & 0xFC00) == 0xD800)
1236
1237#define UTF_16_LOW_SURROGATE_P(val) \
1238 (((val) & 0xFC00) == 0xDC00)
1239
1240#define UTF_16_INVALID_P(val) \
1241 (((val) == 0xFFFE) \
1242 || ((val) == 0xFFFF) \
1243 || UTF_16_LOW_SURROGATE_P (val))
1244
1245
1246static int
1247detect_coding_utf_16 (coding, mask)
b73bfc1c 1248 struct coding_system *coding;
df7492f9 1249 int *mask;
b73bfc1c 1250{
df7492f9
KH
1251 unsigned char *src = coding->source, *src_base = src;
1252 unsigned char *src_end = coding->source + coding->src_bytes;
1253 int multibytep = coding->src_multibyte;
1254 int consumed_chars = 0;
1255 int c1, c2;
1256
1257 ONE_MORE_BYTE (c1);
1258 ONE_MORE_BYTE (c2);
4ed46869 1259
df7492f9 1260 if ((c1 == 0xFF) && (c2 == 0xFE))
b73bfc1c 1261 {
df7492f9
KH
1262 *mask &= CATEGORY_MASK_UTF_16_LE;
1263 return 1;
1264 }
1265 else if ((c1 == 0xFE) && (c2 == 0xFF))
1266 {
1267 *mask &= CATEGORY_MASK_UTF_16_BE;
1268 return 1;
1269 }
1270 no_more_source:
1271 return 0;
1272}
ec6d2bb8 1273
df7492f9
KH
1274static void
1275decode_coding_utf_16 (coding)
1276 struct coding_system *coding;
1277{
1278 unsigned char *src = coding->source + coding->consumed;
1279 unsigned char *src_end = coding->source + coding->src_bytes;
0be8721c 1280 unsigned char *src_base;
df7492f9
KH
1281 int *charbuf = coding->charbuf;
1282 int *charbuf_end = charbuf + coding->charbuf_size;
1283 int consumed_chars = 0, consumed_chars_base;
1284 int multibytep = coding->src_multibyte;
1285 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1286 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1287 int surrogate = CODING_UTF_16_SURROGATE (coding);
1288 Lisp_Object attr, eol_type, charset_list;
1289
1290 CODING_GET_INFO (coding, attr, eol_type, charset_list);
1291
1292 if (bom != utf_16_without_bom)
1293 {
1294 int c, c1, c2;
4af310db 1295
df7492f9
KH
1296 src_base = src;
1297 ONE_MORE_BYTE (c1);
1298 ONE_MORE_BYTE (c2);
e19c3639 1299 c = (c1 << 8) | c2;
df7492f9
KH
1300 if (bom == utf_16_with_bom)
1301 {
1302 if (endian == utf_16_big_endian
1303 ? c != 0xFFFE : c != 0xFEFF)
4af310db 1304 {
df7492f9
KH
1305 /* We are sure that there's enouph room at CHARBUF. */
1306 *charbuf++ = c1;
1307 *charbuf++ = c2;
1308 coding->errors++;
4af310db 1309 }
4af310db 1310 }
df7492f9 1311 else
4af310db 1312 {
df7492f9
KH
1313 if (c == 0xFFFE)
1314 CODING_UTF_16_ENDIAN (coding)
1315 = endian = utf_16_big_endian;
1316 else if (c == 0xFEFF)
1317 CODING_UTF_16_ENDIAN (coding)
1318 = endian = utf_16_little_endian;
1319 else
4af310db 1320 {
df7492f9
KH
1321 CODING_UTF_16_ENDIAN (coding)
1322 = endian = utf_16_big_endian;
1323 src = src_base;
4af310db 1324 }
4af310db 1325 }
df7492f9
KH
1326 CODING_UTF_16_BOM (coding) = utf_16_with_bom;
1327 }
1328
1329 while (1)
1330 {
1331 int c, c1, c2;
1332
1333 src_base = src;
1334 consumed_chars_base = consumed_chars;
1335
1336 if (charbuf + 2 >= charbuf_end)
1337 break;
1338
1339 ONE_MORE_BYTE (c1);
1340 ONE_MORE_BYTE (c2);
1341 c = (endian == utf_16_big_endian
e19c3639 1342 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
df7492f9 1343 if (surrogate)
aa72b389 1344 {
df7492f9 1345 if (! UTF_16_LOW_SURROGATE_P (c))
aa72b389 1346 {
df7492f9
KH
1347 if (endian == utf_16_big_endian)
1348 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1349 else
1350 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1351 *charbuf++ = c1;
1352 *charbuf++ = c2;
1353 coding->errors++;
1354 if (UTF_16_HIGH_SURROGATE_P (c))
1355 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1356 else
1357 *charbuf++ = c;
aa72b389 1358 }
df7492f9
KH
1359 else
1360 {
1361 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1362 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1363 *charbuf++ = c;
1364 }
1365 }
1366 else
1367 {
1368 if (UTF_16_HIGH_SURROGATE_P (c))
1369 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1370 else
1371 *charbuf++ = c;
1372 }
1373 }
1374
1375 no_more_source:
1376 coding->consumed_char += consumed_chars_base;
1377 coding->consumed = src_base - coding->source;
1378 coding->charbuf_used = charbuf - coding->charbuf;
1379}
1380
1381static int
1382encode_coding_utf_16 (coding)
1383 struct coding_system *coding;
1384{
1385 int multibytep = coding->dst_multibyte;
1386 int *charbuf = coding->charbuf;
1387 int *charbuf_end = charbuf + coding->charbuf_used;
1388 unsigned char *dst = coding->destination + coding->produced;
1389 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1390 int safe_room = 8;
1391 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1392 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1393 int produced_chars = 0;
1394 Lisp_Object attrs, eol_type, charset_list;
1395 int c;
1396
1397 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1398
1399 if (bom == utf_16_with_bom)
1400 {
1401 ASSURE_DESTINATION (safe_room);
1402 if (big_endian)
1403 EMIT_TWO_BYTES (0xFF, 0xFE);
1404 else
1405 EMIT_TWO_BYTES (0xFE, 0xFF);
1406 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1407 }
1408
1409 while (charbuf < charbuf_end)
1410 {
1411 ASSURE_DESTINATION (safe_room);
1412 c = *charbuf++;
e19c3639
KH
1413 if (c >= MAX_UNICODE_CHAR)
1414 c = coding->default_char;
df7492f9
KH
1415
1416 if (c < 0x10000)
1417 {
1418 if (big_endian)
1419 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1420 else
1421 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1422 }
1423 else
1424 {
1425 int c1, c2;
1426
1427 c -= 0x10000;
1428 c1 = (c >> 10) + 0xD800;
1429 c2 = (c & 0x3FF) + 0xDC00;
1430 if (big_endian)
1431 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1432 else
1433 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1434 }
1435 }
1436 coding->result = CODING_RESULT_SUCCESS;
1437 coding->produced = dst - coding->destination;
1438 coding->produced_char += produced_chars;
1439 return 0;
1440}
1441
1442\f
1443/*** 6. Old Emacs' internal format (emacs-mule) ***/
1444
1445/* Emacs' internal format for representation of multiple character
1446 sets is a kind of multi-byte encoding, i.e. characters are
1447 represented by variable-length sequences of one-byte codes.
1448
1449 ASCII characters and control characters (e.g. `tab', `newline') are
1450 represented by one-byte sequences which are their ASCII codes, in
1451 the range 0x00 through 0x7F.
1452
1453 8-bit characters of the range 0x80..0x9F are represented by
1454 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1455 code + 0x20).
1456
1457 8-bit characters of the range 0xA0..0xFF are represented by
1458 one-byte sequences which are their 8-bit code.
1459
1460 The other characters are represented by a sequence of `base
1461 leading-code', optional `extended leading-code', and one or two
1462 `position-code's. The length of the sequence is determined by the
1463 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1464 whereas extended leading-code and position-code take the range 0xA0
1465 through 0xFF. See `charset.h' for more details about leading-code
1466 and position-code.
1467
1468 --- CODE RANGE of Emacs' internal format ---
1469 character set range
1470 ------------- -----
1471 ascii 0x00..0x7F
1472 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1473 eight-bit-graphic 0xA0..0xBF
1474 ELSE 0x81..0x9D + [0xA0..0xFF]+
1475 ---------------------------------------------
1476
1477 As this is the internal character representation, the format is
1478 usually not used externally (i.e. in a file or in a data sent to a
1479 process). But, it is possible to have a text externally in this
1480 format (i.e. by encoding by the coding system `emacs-mule').
1481
1482 In that case, a sequence of one-byte codes has a slightly different
1483 form.
1484
1485 At first, all characters in eight-bit-control are represented by
1486 one-byte sequences which are their 8-bit code.
1487
1488 Next, character composition data are represented by the byte
1489 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1490 where,
1491 METHOD is 0xF0 plus one of composition method (enum
1492 composition_method),
1493
1494 BYTES is 0xA0 plus a byte length of this composition data,
1495
1496 CHARS is 0x20 plus a number of characters composed by this
1497 data,
1498
1499 COMPONENTs are characters of multibye form or composition
1500 rules encoded by two-byte of ASCII codes.
1501
1502 In addition, for backward compatibility, the following formats are
1503 also recognized as composition data on decoding.
1504
1505 0x80 MSEQ ...
1506 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1507
1508 Here,
1509 MSEQ is a multibyte form but in these special format:
1510 ASCII: 0xA0 ASCII_CODE+0x80,
1511 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1512 RULE is a one byte code of the range 0xA0..0xF0 that
1513 represents a composition rule.
1514 */
1515
1516char emacs_mule_bytes[256];
1517
1518/* Leading-code followed by extended leading-code. */
1519#define LEADING_CODE_PRIVATE_11 0x9A /* for private DIMENSION1 of 1-column */
1520#define LEADING_CODE_PRIVATE_12 0x9B /* for private DIMENSION1 of 2-column */
1521#define LEADING_CODE_PRIVATE_21 0x9C /* for private DIMENSION2 of 1-column */
1522#define LEADING_CODE_PRIVATE_22 0x9D /* for private DIMENSION2 of 2-column */
1523
1524
1525int
1526emacs_mule_char (coding, composition, nbytes, nchars)
1527 struct coding_system *coding;
1528 int composition;
1529 int *nbytes, *nchars;
1530{
1531 unsigned char *src = coding->source + coding->consumed;
1532 unsigned char *src_end = coding->source + coding->src_bytes;
1533 int multibytep = coding->src_multibyte;
1534 unsigned char *src_base = src;
1535 struct charset *charset;
1536 unsigned code;
1537 int c;
1538 int consumed_chars = 0;
1539
1540 ONE_MORE_BYTE (c);
1541 if (composition)
1542 {
1543 c -= 0x20;
1544 if (c == 0x80)
1545 {
1546 ONE_MORE_BYTE (c);
1547 if (c < 0xA0)
1548 goto invalid_code;
1549 *nbytes = src - src_base;
1550 *nchars = consumed_chars;
1551 return (c - 0x80);
aa72b389 1552 }
df7492f9
KH
1553 }
1554
1555 switch (emacs_mule_bytes[c])
1556 {
1557 case 2:
1558 if (! (charset = emacs_mule_charset[c]))
1559 goto invalid_code;
1560 ONE_MORE_BYTE (c);
1561 code = c & 0x7F;
1562 break;
1563
1564 case 3:
1565 if (c == LEADING_CODE_PRIVATE_11
1566 || c == LEADING_CODE_PRIVATE_12)
b73bfc1c 1567 {
df7492f9
KH
1568 ONE_MORE_BYTE (c);
1569 if (! (charset = emacs_mule_charset[c]))
1570 goto invalid_code;
1571 ONE_MORE_BYTE (c);
1572 code = c & 0x7F;
b73bfc1c
KH
1573 }
1574 else
1575 {
df7492f9
KH
1576 if (! (charset = emacs_mule_charset[c]))
1577 goto invalid_code;
1578 ONE_MORE_BYTE (c);
1579 code = (c & 0x7F) << 7;
1580 ONE_MORE_BYTE (c);
1581 code |= c & 0x7F;
1582 }
1583 break;
1584
1585 case 4:
1586 if (! (charset = emacs_mule_charset[c]))
1587 goto invalid_code;
1588 ONE_MORE_BYTE (c);
1589 code = (c & 0x7F) << 7;
1590 ONE_MORE_BYTE (c);
1591 code |= c & 0x7F;
1592 break;
1593
1594 case 1:
1595 code = c;
1596 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code) ? charset_ascii
1597 : code < 0xA0 ? charset_8_bit_control
1598 : charset_8_bit_graphic);
1599 break;
1600
1601 default:
1602 abort ();
1603 }
1604 c = DECODE_CHAR (charset, code);
1605 if (c < 0)
1606 goto invalid_code;
1607 *nbytes = src - src_base;
1608 *nchars = consumed_chars;
1609 return c;
1610
1611 no_more_source:
1612 return -2;
1613
1614 invalid_code:
1615 return -1;
1616}
1617
1618
1619/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1620 Check if a text is encoded in `emacs-mule'. */
1621
1622static int
1623detect_coding_emacs_mule (coding, mask)
1624 struct coding_system *coding;
1625 int *mask;
1626{
1627 unsigned char *src = coding->source, *src_base = src;
1628 unsigned char *src_end = coding->source + coding->src_bytes;
1629 int multibytep = coding->src_multibyte;
1630 int consumed_chars = 0;
1631 int c;
1632 int found = 0;
1633
1634 /* A coding system of this category is always ASCII compatible. */
1635 src += coding->head_ascii;
1636
1637 while (1)
1638 {
1639 ONE_MORE_BYTE (c);
1640
1641 if (c == 0x80)
1642 {
1643 /* Perhaps the start of composite character. We simple skip
1644 it because analyzing it is too heavy for detecting. But,
1645 at least, we check that the composite character
1646 constitues of more than 4 bytes. */
1647 unsigned char *src_base;
1648
1649 repeat:
1650 src_base = src;
1651 do
1652 {
1653 ONE_MORE_BYTE (c);
1654 }
1655 while (c >= 0xA0);
1656
1657 if (src - src_base <= 4)
1658 break;
1659 found = 1;
1660 if (c == 0x80)
1661 goto repeat;
b73bfc1c 1662 }
df7492f9
KH
1663
1664 if (c < 0x80)
b73bfc1c 1665 {
df7492f9
KH
1666 if (c < 0x20
1667 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1668 break;
1669 }
1670 else
1671 {
1672 unsigned char *src_base = src - 1;
1673
1674 do
1675 {
1676 ONE_MORE_BYTE (c);
1677 }
1678 while (c >= 0xA0);
1679 if (src - src_base != emacs_mule_bytes[*src_base])
1680 break;
1681 found = 1;
4ed46869
KH
1682 }
1683 }
df7492f9
KH
1684 *mask &= ~CATEGORY_MASK_EMACS_MULE;
1685 return 0;
1686
1687 no_more_source:
1688 if (!found)
1689 return 0;
1690 *mask &= CATEGORY_MASK_EMACS_MULE;
1691 return 1;
4ed46869
KH
1692}
1693
b73bfc1c 1694
df7492f9
KH
1695/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1696
1697/* Decode a character represented as a component of composition
1698 sequence of Emacs 20/21 style at SRC. Set C to that character and
1699 update SRC to the head of next character (or an encoded composition
1700 rule). If SRC doesn't points a composition component, set C to -1.
1701 If SRC points an invalid byte sequence, global exit by a return
1702 value 0. */
1703
1704#define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1705 if (1) \
1706 { \
1707 int c; \
1708 int nbytes, nchars; \
1709 \
1710 if (src == src_end) \
1711 break; \
1712 c = emacs_mule_char (coding, 1, &nbytes, &nchars); \
1713 if (c < 0) \
1714 { \
1715 if (c == -2) \
1716 break; \
1717 goto invalid_code; \
1718 } \
1719 *buf++ = c; \
1720 src += nbytes; \
1721 consumed_chars += nchars; \
1722 } \
1723 else
1724
1725
1726/* Decode a composition rule represented as a component of composition
1727 sequence of Emacs 20 style at SRC. Set C to the rule. If SRC
1728 points an invalid byte sequence, set C to -1. */
1729
1730#define DECODE_EMACS_MULE_COMPOSITION_RULE(buf) \
1731 do { \
1732 int c, gref, nref; \
1733 \
1734 if (src < src_end) \
1735 goto invalid_code; \
1736 ONE_MORE_BYTE_NO_CHECK (c); \
1737 c -= 0xA0; \
1738 if (c < 0 || c >= 81) \
1739 goto invalid_code; \
1740 \
1741 gref = c / 9, nref = c % 9; \
1742 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1743 } while (0)
1744
1745
1746#define ADD_COMPOSITION_DATA(buf, method, nchars) \
1747 do { \
1748 *buf++ = -5; \
1749 *buf++ = coding->produced_char + char_offset; \
1750 *buf++ = CODING_ANNOTATE_COMPOSITION_MASK; \
1751 *buf++ = method; \
1752 *buf++ = nchars; \
1753 } while (0)
aa72b389 1754
df7492f9
KH
1755
1756#define DECODE_EMACS_MULE_21_COMPOSITION(c) \
aa72b389 1757 do { \
df7492f9
KH
1758 /* Emacs 21 style format. The first three bytes at SRC are \
1759 (METHOD - 0xF0), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1760 the byte length of this composition information, CHARS is the \
1761 number of characters composed by this composition. */ \
1762 enum composition_method method = c - 0xF0; \
1763 int consumed_chars_limit; \
1764 int nbytes, nchars; \
1765 \
1766 ONE_MORE_BYTE (c); \
1767 nbytes = c - 0xA0; \
1768 if (nbytes < 3) \
1769 goto invalid_code; \
1770 ONE_MORE_BYTE (c); \
1771 nchars = c - 0xA0; \
1772 ADD_COMPOSITION_DATA (charbuf, method, nchars); \
1773 consumed_chars_limit = consumed_chars_base + nbytes; \
1774 if (method != COMPOSITION_RELATIVE) \
aa72b389 1775 { \
df7492f9
KH
1776 int i = 0; \
1777 while (consumed_chars < consumed_chars_limit) \
aa72b389 1778 { \
df7492f9
KH
1779 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1780 DECODE_EMACS_MULE_COMPOSITION_RULE (charbuf); \
1781 else \
1782 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
aa72b389 1783 } \
df7492f9
KH
1784 if (consumed_chars < consumed_chars_limit) \
1785 goto invalid_code; \
aa72b389
KH
1786 } \
1787 } while (0)
93dec019 1788
aa72b389 1789
df7492f9
KH
1790#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1791 do { \
1792 /* Emacs 20 style format for relative composition. */ \
1793 /* Store multibyte form of characters to be composed. */ \
1794 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1795 int *buf = components; \
1796 int i, j; \
1797 \
1798 src = src_base; \
1799 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1800 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1801 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1802 if (i < 2) \
1803 goto invalid_code; \
1804 ADD_COMPOSITION_DATA (charbuf, COMPOSITION_RELATIVE, i); \
1805 for (j = 0; j < i; j++) \
1806 *charbuf++ = components[j]; \
1807 } while (0)
1808
1809
1810#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
1811 do { \
1812 /* Emacs 20 style format for rule-base composition. */ \
1813 /* Store multibyte form of characters to be composed. */ \
1814 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1815 int *buf = components; \
1816 int i, j; \
1817 \
1818 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1819 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1820 { \
1821 DECODE_EMACS_MULE_COMPOSITION_RULE (buf); \
1822 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1823 } \
1824 if (i < 1 || (buf - components) % 2 == 0) \
1825 goto invalid_code; \
1826 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
1827 goto no_more_source; \
1828 ADD_COMPOSITION_DATA (buf, COMPOSITION_WITH_RULE, i); \
1829 for (j = 0; j < i; j++) \
1830 *charbuf++ = components[j]; \
1831 for (j = 0; j < i; j += 2) \
1832 *charbuf++ = components[j]; \
1833 } while (0)
1834
aa72b389
KH
1835
1836static void
df7492f9 1837decode_coding_emacs_mule (coding)
aa72b389 1838 struct coding_system *coding;
aa72b389 1839{
df7492f9
KH
1840 unsigned char *src = coding->source + coding->consumed;
1841 unsigned char *src_end = coding->source + coding->src_bytes;
aa72b389 1842 unsigned char *src_base;
df7492f9
KH
1843 int *charbuf = coding->charbuf;
1844 int *charbuf_end = charbuf + coding->charbuf_size;
1845 int consumed_chars = 0, consumed_chars_base;
1846 int char_offset = 0;
1847 int multibytep = coding->src_multibyte;
1848 Lisp_Object attrs, eol_type, charset_list;
aa72b389 1849
df7492f9 1850 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
aa72b389 1851
aa72b389
KH
1852 while (1)
1853 {
df7492f9
KH
1854 int c;
1855
aa72b389 1856 src_base = src;
df7492f9
KH
1857 consumed_chars_base = consumed_chars;
1858
1859 if (charbuf >= charbuf_end)
1860 break;
aa72b389 1861
df7492f9
KH
1862 ONE_MORE_BYTE (c);
1863
1864 if (c < 0x80)
aa72b389 1865 {
df7492f9
KH
1866 if (c == '\r')
1867 {
1868 if (EQ (eol_type, Qdos))
1869 {
1870 if (src == src_end)
1871 goto no_more_source;
1872 if (*src == '\n')
1873 ONE_MORE_BYTE (c);
1874 }
1875 else if (EQ (eol_type, Qmac))
1876 c = '\n';
1877 }
1878 *charbuf++ = c;
1879 char_offset++;
aa72b389 1880 }
df7492f9
KH
1881 else if (c == 0x80)
1882 {
1883 if (charbuf + 5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 > charbuf_end)
1884 break;
1885 ONE_MORE_BYTE (c);
1886 if (c - 0xF0 >= COMPOSITION_RELATIVE
1887 && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
1888 DECODE_EMACS_MULE_21_COMPOSITION (c);
1889 else if (c < 0xC0)
1890 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
1891 else if (c == 0xFF)
1892 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
1893 else
1894 goto invalid_code;
1895 }
1896 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
1897 {
1898 int nbytes, nchars;
1899 src--;
1900 c = emacs_mule_char (coding, 0, &nbytes, &nchars);
1901 if (c < 0)
1902 {
1903 if (c == -2)
1904 break;
1905 goto invalid_code;
1906 }
1907 *charbuf++ = c;
1908 char_offset++;
1909 }
1910 continue;
1911
1912 invalid_code:
1913 src = src_base;
1914 consumed_chars = consumed_chars_base;
1915 ONE_MORE_BYTE (c);
1916 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1917 coding->errors++;
1918 }
1919
1920 no_more_source:
1921 coding->consumed_char += consumed_chars_base;
1922 coding->consumed = src_base - coding->source;
1923 coding->charbuf_used = charbuf - coding->charbuf;
1924}
1925
1926
1927#define EMACS_MULE_LEADING_CODES(id, codes) \
1928 do { \
1929 if (id < 0xA0) \
1930 codes[0] = id, codes[1] = 0; \
1931 else if (id < 0xE0) \
1932 codes[0] = 0x9A, codes[1] = id; \
1933 else if (id < 0xF0) \
1934 codes[0] = 0x9B, codes[1] = id; \
1935 else if (id < 0xF5) \
1936 codes[0] = 0x9C, codes[1] = id; \
1937 else \
1938 codes[0] = 0x9D, codes[1] = id; \
1939 } while (0);
1940
aa72b389 1941
df7492f9
KH
1942static int
1943encode_coding_emacs_mule (coding)
1944 struct coding_system *coding;
1945{
1946 int multibytep = coding->dst_multibyte;
1947 int *charbuf = coding->charbuf;
1948 int *charbuf_end = charbuf + coding->charbuf_used;
1949 unsigned char *dst = coding->destination + coding->produced;
1950 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1951 int safe_room = 8;
df7492f9
KH
1952 int produced_chars = 0;
1953 Lisp_Object attrs, eol_type, charset_list;
1954 int c;
1955
1956 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1957
1958 while (charbuf < charbuf_end)
1959 {
1960 ASSURE_DESTINATION (safe_room);
1961 c = *charbuf++;
1962 if (ASCII_CHAR_P (c))
1963 EMIT_ONE_ASCII_BYTE (c);
1964 else
aa72b389 1965 {
df7492f9
KH
1966 struct charset *charset;
1967 unsigned code;
1968 int dimension;
1969 int emacs_mule_id;
1970 unsigned char leading_codes[2];
1971
1972 charset = char_charset (c, charset_list, &code);
1973 if (! charset)
1974 {
1975 c = coding->default_char;
1976 if (ASCII_CHAR_P (c))
1977 {
1978 EMIT_ONE_ASCII_BYTE (c);
1979 continue;
1980 }
1981 charset = char_charset (c, charset_list, &code);
1982 }
1983 dimension = CHARSET_DIMENSION (charset);
1984 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
1985 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
1986 EMIT_ONE_BYTE (leading_codes[0]);
1987 if (leading_codes[1])
1988 EMIT_ONE_BYTE (leading_codes[1]);
1989 if (dimension == 1)
1990 EMIT_ONE_BYTE (code);
aa72b389 1991 else
df7492f9
KH
1992 {
1993 EMIT_ONE_BYTE (code >> 8);
1994 EMIT_ONE_BYTE (code & 0xFF);
1995 }
aa72b389 1996 }
aa72b389 1997 }
df7492f9
KH
1998 coding->result = CODING_RESULT_SUCCESS;
1999 coding->produced_char += produced_chars;
2000 coding->produced = dst - coding->destination;
2001 return 0;
aa72b389 2002}
b73bfc1c 2003
4ed46869 2004\f
df7492f9 2005/*** 7. ISO2022 handlers ***/
4ed46869
KH
2006
2007/* The following note describes the coding system ISO2022 briefly.
39787efd 2008 Since the intention of this note is to help understand the
df7492f9 2009 functions in this file, some parts are NOT ACCURATE or OVERLY
39787efd 2010 SIMPLIFIED. For thorough understanding, please refer to the
df7492f9 2011 original document of ISO2022.
4ed46869
KH
2012
2013 ISO2022 provides many mechanisms to encode several character sets
df7492f9 2014 in 7-bit and 8-bit environments. For 7-bite environments, all text
39787efd
KH
2015 is encoded using bytes less than 128. This may make the encoded
2016 text a little bit longer, but the text passes more easily through
df7492f9 2017 several gateways, some of which strip off MSB (Most Signigant Bit).
b73bfc1c 2018
df7492f9
KH
2019 There are two kinds of character sets: control character set and
2020 graphic character set. The former contains control characters such
4ed46869 2021 as `newline' and `escape' to provide control functions (control
39787efd 2022 functions are also provided by escape sequences). The latter
df7492f9 2023 contains graphic characters such as 'A' and '-'. Emacs recognizes
4ed46869
KH
2024 two control character sets and many graphic character sets.
2025
2026 Graphic character sets are classified into one of the following
39787efd
KH
2027 four classes, according to the number of bytes (DIMENSION) and
2028 number of characters in one dimension (CHARS) of the set:
2029 - DIMENSION1_CHARS94
2030 - DIMENSION1_CHARS96
2031 - DIMENSION2_CHARS94
2032 - DIMENSION2_CHARS96
2033
2034 In addition, each character set is assigned an identification tag,
df7492f9 2035 unique for each set, called "final character" (denoted as <F>
39787efd
KH
2036 hereafter). The <F> of each character set is decided by ECMA(*)
2037 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2038 (0x30..0x3F are for private use only).
4ed46869
KH
2039
2040 Note (*): ECMA = European Computer Manufacturers Association
2041
df7492f9 2042 Here are examples of graphic character set [NAME(<F>)]:
4ed46869
KH
2043 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2044 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2045 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2046 o DIMENSION2_CHARS96 -- none for the moment
2047
39787efd 2048 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4ed46869
KH
2049 C0 [0x00..0x1F] -- control character plane 0
2050 GL [0x20..0x7F] -- graphic character plane 0
2051 C1 [0x80..0x9F] -- control character plane 1
2052 GR [0xA0..0xFF] -- graphic character plane 1
2053
2054 A control character set is directly designated and invoked to C0 or
39787efd
KH
2055 C1 by an escape sequence. The most common case is that:
2056 - ISO646's control character set is designated/invoked to C0, and
2057 - ISO6429's control character set is designated/invoked to C1,
2058 and usually these designations/invocations are omitted in encoded
2059 text. In a 7-bit environment, only C0 can be used, and a control
2060 character for C1 is encoded by an appropriate escape sequence to
2061 fit into the environment. All control characters for C1 are
2062 defined to have corresponding escape sequences.
4ed46869
KH
2063
2064 A graphic character set is at first designated to one of four
2065 graphic registers (G0 through G3), then these graphic registers are
2066 invoked to GL or GR. These designations and invocations can be
2067 done independently. The most common case is that G0 is invoked to
39787efd
KH
2068 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2069 these invocations and designations are omitted in encoded text.
2070 In a 7-bit environment, only GL can be used.
4ed46869 2071
39787efd
KH
2072 When a graphic character set of CHARS94 is invoked to GL, codes
2073 0x20 and 0x7F of the GL area work as control characters SPACE and
2074 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2075 be used.
4ed46869
KH
2076
2077 There are two ways of invocation: locking-shift and single-shift.
2078 With locking-shift, the invocation lasts until the next different
39787efd
KH
2079 invocation, whereas with single-shift, the invocation affects the
2080 following character only and doesn't affect the locking-shift
2081 state. Invocations are done by the following control characters or
2082 escape sequences:
4ed46869
KH
2083
2084 ----------------------------------------------------------------------
39787efd 2085 abbrev function cntrl escape seq description
4ed46869 2086 ----------------------------------------------------------------------
39787efd
KH
2087 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2088 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2089 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2090 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2091 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2092 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2093 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2094 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2095 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4ed46869 2096 ----------------------------------------------------------------------
39787efd
KH
2097 (*) These are not used by any known coding system.
2098
2099 Control characters for these functions are defined by macros
2100 ISO_CODE_XXX in `coding.h'.
4ed46869 2101
39787efd 2102 Designations are done by the following escape sequences:
4ed46869
KH
2103 ----------------------------------------------------------------------
2104 escape sequence description
2105 ----------------------------------------------------------------------
2106 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2107 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2108 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2109 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2110 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2111 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2112 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2113 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2114 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2115 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2116 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2117 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2118 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2119 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2120 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2121 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2122 ----------------------------------------------------------------------
2123
2124 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
39787efd 2125 of dimension 1, chars 94, and final character <F>, etc...
4ed46869
KH
2126
2127 Note (*): Although these designations are not allowed in ISO2022,
2128 Emacs accepts them on decoding, and produces them on encoding
39787efd 2129 CHARS96 character sets in a coding system which is characterized as
4ed46869
KH
2130 7-bit environment, non-locking-shift, and non-single-shift.
2131
2132 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
df7492f9 2133 '(' must be omitted. We refer to this as "short-form" hereafter.
4ed46869 2134
df7492f9 2135 Now you may notice that there are a lot of ways for encoding the
39787efd
KH
2136 same multilingual text in ISO2022. Actually, there exist many
2137 coding systems such as Compound Text (used in X11's inter client
df7492f9
KH
2138 communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
2139 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4ed46869
KH
2140 localized platforms), and all of these are variants of ISO2022.
2141
2142 In addition to the above, Emacs handles two more kinds of escape
2143 sequences: ISO6429's direction specification and Emacs' private
2144 sequence for specifying character composition.
2145
39787efd 2146 ISO6429's direction specification takes the following form:
4ed46869
KH
2147 o CSI ']' -- end of the current direction
2148 o CSI '0' ']' -- end of the current direction
2149 o CSI '1' ']' -- start of left-to-right text
2150 o CSI '2' ']' -- start of right-to-left text
2151 The control character CSI (0x9B: control sequence introducer) is
39787efd
KH
2152 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2153
2154 Character composition specification takes the following form:
ec6d2bb8
KH
2155 o ESC '0' -- start relative composition
2156 o ESC '1' -- end composition
2157 o ESC '2' -- start rule-base composition (*)
2158 o ESC '3' -- start relative composition with alternate chars (**)
2159 o ESC '4' -- start rule-base composition with alternate chars (**)
b73bfc1c 2160 Since these are not standard escape sequences of any ISO standard,
df7492f9 2161 the use of them for these meaning is restricted to Emacs only.
ec6d2bb8 2162
df7492f9 2163 (*) This form is used only in Emacs 20.5 and the older versions,
b73bfc1c 2164 but the newer versions can safely decode it.
df7492f9 2165 (**) This form is used only in Emacs 21.1 and the newer versions,
b73bfc1c 2166 and the older versions can't decode it.
ec6d2bb8 2167
df7492f9 2168 Here's a list of examples usages of these composition escape
b73bfc1c 2169 sequences (categorized by `enum composition_method').
ec6d2bb8 2170
b73bfc1c 2171 COMPOSITION_RELATIVE:
ec6d2bb8 2172 ESC 0 CHAR [ CHAR ] ESC 1
df7492f9 2173 COMPOSITOIN_WITH_RULE:
ec6d2bb8 2174 ESC 2 CHAR [ RULE CHAR ] ESC 1
b73bfc1c 2175 COMPOSITION_WITH_ALTCHARS:
ec6d2bb8 2176 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
b73bfc1c 2177 COMPOSITION_WITH_RULE_ALTCHARS:
ec6d2bb8 2178 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
4ed46869
KH
2179
2180enum iso_code_class_type iso_code_class[256];
2181
df7492f9
KH
2182#define SAFE_CHARSET_P(coding, id) \
2183 ((id) <= (coding)->max_charset_id \
2184 && (coding)->safe_charsets[id] >= 0)
2185
2186
2187#define SHIFT_OUT_OK(category) \
2188 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2189
2190static void
2191setup_iso_safe_charsets (Lisp_Object attrs)
2192{
2193 Lisp_Object charset_list, safe_charsets;
2194 Lisp_Object request;
2195 Lisp_Object reg_usage;
2196 Lisp_Object tail;
2197 int reg94, reg96;
2198 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2199 int max_charset_id;
2200
2201 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2202 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2203 && ! EQ (charset_list, Viso_2022_charset_list))
2204 {
2205 CODING_ATTR_CHARSET_LIST (attrs)
2206 = charset_list = Viso_2022_charset_list;
2207 ASET (attrs, coding_attr_safe_charsets, Qnil);
2208 }
2209
2210 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2211 return;
2212
2213 max_charset_id = 0;
2214 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2215 {
2216 int id = XINT (XCAR (tail));
2217 if (max_charset_id < id)
2218 max_charset_id = id;
2219 }
2220
2221 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2222 make_number (255));
2223 request = AREF (attrs, coding_attr_iso_request);
2224 reg_usage = AREF (attrs, coding_attr_iso_usage);
2225 reg94 = XINT (XCAR (reg_usage));
2226 reg96 = XINT (XCDR (reg_usage));
2227
2228 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2229 {
2230 Lisp_Object id;
2231 Lisp_Object reg;
2232 struct charset *charset;
2233
2234 id = XCAR (tail);
2235 charset = CHARSET_FROM_ID (XINT (id));
2236 reg = Fcdr (Fassq (request, id));
2237 if (! NILP (reg))
2238 XSTRING (safe_charsets)->data[XINT (id)] = XINT (reg);
2239 else if (charset->iso_chars_96)
2240 {
2241 if (reg96 < 4)
2242 XSTRING (safe_charsets)->data[XINT (id)] = reg96;
2243 }
2244 else
2245 {
2246 if (reg94 < 4)
2247 XSTRING (safe_charsets)->data[XINT (id)] = reg94;
2248 }
2249 }
2250 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2251}
d46c5b12 2252
d46c5b12 2253
4ed46869 2254/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
df7492f9 2255 Check if a text is encoded in ISO2022. If it is, returns an
4ed46869 2256 integer in which appropriate flag bits any of:
df7492f9
KH
2257 CATEGORY_MASK_ISO_7
2258 CATEGORY_MASK_ISO_7_TIGHT
2259 CATEGORY_MASK_ISO_8_1
2260 CATEGORY_MASK_ISO_8_2
2261 CATEGORY_MASK_ISO_7_ELSE
2262 CATEGORY_MASK_ISO_8_ELSE
4ed46869
KH
2263 are set. If a code which should never appear in ISO2022 is found,
2264 returns 0. */
2265
0a28aafb 2266static int
df7492f9
KH
2267detect_coding_iso_2022 (coding, mask)
2268 struct coding_system *coding;
2269 int *mask;
4ed46869 2270{
df7492f9
KH
2271 unsigned char *src = coding->source, *src_base = src;
2272 unsigned char *src_end = coding->source + coding->src_bytes;
2273 int multibytep = coding->src_multibyte;
2274 int mask_iso = CATEGORY_MASK_ISO;
2275 int mask_found = 0, mask_8bit_found = 0;
f46869e4 2276 int reg[4], shift_out = 0, single_shifting = 0;
df7492f9
KH
2277 int id;
2278 int c, c1;
2279 int consumed_chars = 0;
2280 int i;
2281
2282 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2283 {
2284 struct coding_system *this = &(coding_categories[i]);
2285 Lisp_Object attrs, val;
2286
2287 attrs = CODING_ID_ATTRS (this->id);
2288 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2289 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2290 setup_iso_safe_charsets (attrs);
2291 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2292 this->max_charset_id = XSTRING (val)->size - 1;
2293 this->safe_charsets = (char *) XSTRING (val)->data;
2294 }
2295
2296 /* A coding system of this category is always ASCII compatible. */
2297 src += coding->head_ascii;
3f003981 2298
df7492f9
KH
2299 reg[0] = charset_ascii, reg[1] = reg[2] = reg[3] = -1;
2300 while (mask_iso && src < src_end)
4ed46869 2301 {
df7492f9 2302 ONE_MORE_BYTE (c);
4ed46869
KH
2303 switch (c)
2304 {
2305 case ISO_CODE_ESC:
74383408
KH
2306 if (inhibit_iso_escape_detection)
2307 break;
f46869e4 2308 single_shifting = 0;
df7492f9 2309 ONE_MORE_BYTE (c);
d46c5b12 2310 if (c >= '(' && c <= '/')
4ed46869 2311 {
bf9cdd4e 2312 /* Designation sequence for a charset of dimension 1. */
df7492f9 2313 ONE_MORE_BYTE (c1);
d46c5b12 2314 if (c1 < ' ' || c1 >= 0x80
df7492f9 2315 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
d46c5b12
KH
2316 /* Invalid designation sequence. Just ignore. */
2317 break;
df7492f9 2318 reg[(c - '(') % 4] = id;
bf9cdd4e
KH
2319 }
2320 else if (c == '$')
2321 {
2322 /* Designation sequence for a charset of dimension 2. */
df7492f9 2323 ONE_MORE_BYTE (c);
bf9cdd4e
KH
2324 if (c >= '@' && c <= 'B')
2325 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
df7492f9 2326 reg[0] = id = iso_charset_table[1][0][c];
bf9cdd4e 2327 else if (c >= '(' && c <= '/')
bcf26d6a 2328 {
df7492f9 2329 ONE_MORE_BYTE (c1);
d46c5b12 2330 if (c1 < ' ' || c1 >= 0x80
df7492f9 2331 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
d46c5b12
KH
2332 /* Invalid designation sequence. Just ignore. */
2333 break;
df7492f9 2334 reg[(c - '(') % 4] = id;
bcf26d6a 2335 }
bf9cdd4e 2336 else
d46c5b12
KH
2337 /* Invalid designation sequence. Just ignore. */
2338 break;
2339 }
ae9ff118 2340 else if (c == 'N' || c == 'O')
d46c5b12 2341 {
ae9ff118 2342 /* ESC <Fe> for SS2 or SS3. */
df7492f9 2343 mask_iso &= CATEGORY_MASK_ISO_7_ELSE;
d46c5b12 2344 break;
4ed46869 2345 }
ec6d2bb8
KH
2346 else if (c >= '0' && c <= '4')
2347 {
2348 /* ESC <Fp> for start/end composition. */
df7492f9 2349 mask_found |= CATEGORY_MASK_ISO;
ec6d2bb8
KH
2350 break;
2351 }
bf9cdd4e 2352 else
df7492f9
KH
2353 {
2354 /* Invalid escape sequence. */
2355 mask_iso &= ~CATEGORY_MASK_ISO_ESCAPE;
2356 break;
2357 }
d46c5b12
KH
2358
2359 /* We found a valid designation sequence for CHARSET. */
df7492f9
KH
2360 mask_iso &= ~CATEGORY_MASK_ISO_8BIT;
2361 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2362 id))
2363 mask_found |= CATEGORY_MASK_ISO_7;
d46c5b12 2364 else
df7492f9
KH
2365 mask_iso &= ~CATEGORY_MASK_ISO_7;
2366 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2367 id))
2368 mask_found |= CATEGORY_MASK_ISO_7_TIGHT;
d46c5b12 2369 else
df7492f9
KH
2370 mask_iso &= ~CATEGORY_MASK_ISO_7_TIGHT;
2371 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2372 id))
2373 mask_found |= CATEGORY_MASK_ISO_7_ELSE;
ae9ff118 2374 else
df7492f9
KH
2375 mask_iso &= ~CATEGORY_MASK_ISO_7_ELSE;
2376 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2377 id))
2378 mask_found |= CATEGORY_MASK_ISO_8_ELSE;
ae9ff118 2379 else
df7492f9 2380 mask_iso &= ~CATEGORY_MASK_ISO_8_ELSE;
4ed46869
KH
2381 break;
2382
4ed46869 2383 case ISO_CODE_SO:
74383408
KH
2384 if (inhibit_iso_escape_detection)
2385 break;
f46869e4 2386 single_shifting = 0;
d46c5b12
KH
2387 if (shift_out == 0
2388 && (reg[1] >= 0
df7492f9
KH
2389 || SHIFT_OUT_OK (coding_category_iso_7_else)
2390 || SHIFT_OUT_OK (coding_category_iso_8_else)))
d46c5b12
KH
2391 {
2392 /* Locking shift out. */
df7492f9
KH
2393 mask_iso &= ~CATEGORY_MASK_ISO_7BIT;
2394 mask_found |= CATEGORY_MASK_ISO_ELSE;
d46c5b12 2395 }
e0e989f6 2396 break;
df7492f9 2397
d46c5b12 2398 case ISO_CODE_SI:
74383408
KH
2399 if (inhibit_iso_escape_detection)
2400 break;
f46869e4 2401 single_shifting = 0;
d46c5b12
KH
2402 if (shift_out == 1)
2403 {
2404 /* Locking shift in. */
df7492f9
KH
2405 mask_iso &= ~CATEGORY_MASK_ISO_7BIT;
2406 mask_found |= CATEGORY_MASK_ISO_ELSE;
d46c5b12
KH
2407 }
2408 break;
2409
4ed46869 2410 case ISO_CODE_CSI:
f46869e4 2411 single_shifting = 0;
4ed46869
KH
2412 case ISO_CODE_SS2:
2413 case ISO_CODE_SS3:
3f003981 2414 {
df7492f9 2415 int newmask = CATEGORY_MASK_ISO_8_ELSE;
3f003981 2416
74383408
KH
2417 if (inhibit_iso_escape_detection)
2418 break;
70c22245
KH
2419 if (c != ISO_CODE_CSI)
2420 {
df7492f9
KH
2421 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2422 & CODING_ISO_FLAG_SINGLE_SHIFT)
2423 newmask |= CATEGORY_MASK_ISO_8_1;
2424 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2425 & CODING_ISO_FLAG_SINGLE_SHIFT)
2426 newmask |= CATEGORY_MASK_ISO_8_2;
f46869e4 2427 single_shifting = 1;
70c22245 2428 }
3f003981
KH
2429 if (VECTORP (Vlatin_extra_code_table)
2430 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2431 {
df7492f9
KH
2432 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2433 & CODING_ISO_FLAG_LATIN_EXTRA)
2434 newmask |= CATEGORY_MASK_ISO_8_1;
2435 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2436 & CODING_ISO_FLAG_LATIN_EXTRA)
2437 newmask |= CATEGORY_MASK_ISO_8_2;
3f003981 2438 }
df7492f9 2439 mask_iso &= newmask;
d46c5b12 2440 mask_found |= newmask;
3f003981
KH
2441 }
2442 break;
4ed46869
KH
2443
2444 default:
2445 if (c < 0x80)
f46869e4
KH
2446 {
2447 single_shifting = 0;
2448 break;
2449 }
4ed46869 2450 else if (c < 0xA0)
c4825358 2451 {
f46869e4 2452 single_shifting = 0;
df7492f9 2453 mask_8bit_found = 1;
3f003981
KH
2454 if (VECTORP (Vlatin_extra_code_table)
2455 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
c4825358 2456 {
3f003981
KH
2457 int newmask = 0;
2458
df7492f9
KH
2459 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2460 & CODING_ISO_FLAG_LATIN_EXTRA)
2461 newmask |= CATEGORY_MASK_ISO_8_1;
2462 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2463 & CODING_ISO_FLAG_LATIN_EXTRA)
2464 newmask |= CATEGORY_MASK_ISO_8_2;
2465 mask_iso &= newmask;
d46c5b12 2466 mask_found |= newmask;
c4825358 2467 }
3f003981
KH
2468 else
2469 return 0;
c4825358 2470 }
4ed46869
KH
2471 else
2472 {
df7492f9
KH
2473 mask_iso &= ~(CATEGORY_MASK_ISO_7BIT
2474 | CATEGORY_MASK_ISO_7_ELSE);
2475 mask_found |= CATEGORY_MASK_ISO_8_1;
2476 mask_8bit_found = 1;
f46869e4
KH
2477 /* Check the length of succeeding codes of the range
2478 0xA0..0FF. If the byte length is odd, we exclude
df7492f9 2479 CATEGORY_MASK_ISO_8_2. We can check this only
f46869e4 2480 when we are not single shifting. */
b73bfc1c 2481 if (!single_shifting
df7492f9 2482 && mask_iso & CATEGORY_MASK_ISO_8_2)
f46869e4 2483 {
e17de821 2484 int i = 1;
b73bfc1c
KH
2485 while (src < src_end)
2486 {
df7492f9 2487 ONE_MORE_BYTE (c);
b73bfc1c
KH
2488 if (c < 0xA0)
2489 break;
2490 i++;
2491 }
2492
2493 if (i & 1 && src < src_end)
df7492f9 2494 mask_iso &= ~CATEGORY_MASK_ISO_8_2;
f46869e4 2495 else
df7492f9 2496 mask_found |= CATEGORY_MASK_ISO_8_2;
f46869e4 2497 }
4ed46869
KH
2498 }
2499 break;
2500 }
2501 }
df7492f9
KH
2502 no_more_source:
2503 if (!mask_iso)
2504 {
2505 *mask &= ~CATEGORY_MASK_ISO;
2506 return 0;
2507 }
2508 if (!mask_found)
2509 return 0;
2510 *mask &= mask_iso & mask_found;
2511 if (! mask_8bit_found)
2512 *mask &= ~(CATEGORY_MASK_ISO_8BIT | CATEGORY_MASK_ISO_8_ELSE);
2513 return 1;
4ed46869
KH
2514}
2515
4ed46869
KH
2516
2517/* Set designation state into CODING. */
df7492f9
KH
2518#define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2519 do { \
2520 int id, prev; \
2521 \
2522 if (final < '0' || final >= 128 \
2523 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2524 || !SAFE_CHARSET_P (coding, id)) \
2525 { \
2526 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2527 goto invalid_code; \
2528 } \
2529 prev = CODING_ISO_DESIGNATION (coding, reg); \
2530 CODING_ISO_DESIGNATION (coding, reg) = id; \
2531 /* If there was an invalid designation to REG previously, and this \
2532 designation is ASCII to REG, we should keep this designation \
2533 sequence. */ \
2534 if (prev == -2 && id == charset_ascii) \
2535 goto invalid_code; \
4ed46869
KH
2536 } while (0)
2537
d46c5b12 2538
df7492f9
KH
2539#define MAYBE_FINISH_COMPOSITION() \
2540 do { \
2541 int i; \
2542 if (composition_state == COMPOSING_NO) \
2543 break; \
2544 /* It is assured that we have enough room for producing \
2545 characters stored in the table `components'. */ \
2546 if (charbuf + component_idx > charbuf_end) \
2547 goto no_more_source; \
2548 composition_state = COMPOSING_NO; \
2549 if (method == COMPOSITION_RELATIVE \
2550 || method == COMPOSITION_WITH_ALTCHARS) \
2551 { \
2552 for (i = 0; i < component_idx; i++) \
2553 *charbuf++ = components[i]; \
2554 char_offset += component_idx; \
2555 } \
2556 else \
2557 { \
2558 for (i = 0; i < component_idx; i += 2) \
2559 *charbuf++ = components[i]; \
2560 char_offset += (component_idx / 2) + 1; \
2561 } \
2562 } while (0)
2563
d46c5b12 2564
aa72b389
KH
2565/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2566 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2567 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
df7492f9
KH
2568 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2569 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
aa72b389 2570 */
ec6d2bb8 2571
df7492f9
KH
2572#define DECODE_COMPOSITION_START(c1) \
2573 do { \
2574 if (c1 == '0' \
2575 && composition_state == COMPOSING_COMPONENT_CHAR) \
2576 { \
2577 component_len = component_idx; \
2578 composition_state = COMPOSING_CHAR; \
2579 } \
2580 else \
2581 { \
2582 unsigned char *p; \
2583 \
2584 MAYBE_FINISH_COMPOSITION (); \
2585 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2586 goto no_more_source; \
2587 for (p = src; p < src_end - 1; p++) \
2588 if (*p == ISO_CODE_ESC && p[1] == '1') \
2589 break; \
2590 if (p == src_end - 1) \
2591 { \
2592 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2593 goto invalid_code; \
2594 goto no_more_source; \
2595 } \
2596 \
2597 /* This is surely the start of a composition. */ \
2598 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2599 : c1 == '2' ? COMPOSITION_WITH_RULE \
2600 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2601 : COMPOSITION_WITH_RULE_ALTCHARS); \
2602 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2603 : COMPOSING_COMPONENT_CHAR); \
2604 component_idx = component_len = 0; \
2605 } \
ec6d2bb8
KH
2606 } while (0)
2607
ec6d2bb8 2608
df7492f9
KH
2609/* Handle compositoin end sequence ESC 1. */
2610
2611#define DECODE_COMPOSITION_END() \
ec6d2bb8 2612 do { \
df7492f9
KH
2613 int nchars = (component_len > 0 ? component_idx - component_len \
2614 : method == COMPOSITION_RELATIVE ? component_idx \
2615 : (component_idx + 1) / 2); \
2616 int i; \
2617 int *saved_charbuf = charbuf; \
2618 \
2619 ADD_COMPOSITION_DATA (charbuf, method, nchars); \
2620 if (method != COMPOSITION_RELATIVE) \
ec6d2bb8 2621 { \
df7492f9
KH
2622 if (component_len == 0) \
2623 for (i = 0; i < component_idx; i++) \
2624 *charbuf++ = components[i]; \
2625 else \
2626 for (i = 0; i < component_len; i++) \
2627 *charbuf++ = components[i]; \
2628 *saved_charbuf = saved_charbuf - charbuf; \
ec6d2bb8 2629 } \
df7492f9
KH
2630 if (method == COMPOSITION_WITH_RULE) \
2631 for (i = 0; i < component_idx; i += 2, char_offset++) \
2632 *charbuf++ = components[i]; \
ec6d2bb8 2633 else \
df7492f9
KH
2634 for (i = component_len; i < component_idx; i++, char_offset++) \
2635 *charbuf++ = components[i]; \
2636 coding->annotated = 1; \
2637 composition_state = COMPOSING_NO; \
ec6d2bb8
KH
2638 } while (0)
2639
df7492f9 2640
ec6d2bb8
KH
2641/* Decode a composition rule from the byte C1 (and maybe one more byte
2642 from SRC) and store one encoded composition rule in
2643 coding->cmp_data. */
2644
2645#define DECODE_COMPOSITION_RULE(c1) \
2646 do { \
ec6d2bb8
KH
2647 (c1) -= 32; \
2648 if (c1 < 81) /* old format (before ver.21) */ \
2649 { \
2650 int gref = (c1) / 9; \
2651 int nref = (c1) % 9; \
2652 if (gref == 4) gref = 10; \
2653 if (nref == 4) nref = 10; \
df7492f9 2654 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
ec6d2bb8 2655 } \
b73bfc1c 2656 else if (c1 < 93) /* new format (after ver.21) */ \
ec6d2bb8
KH
2657 { \
2658 ONE_MORE_BYTE (c2); \
df7492f9 2659 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
ec6d2bb8 2660 } \
df7492f9
KH
2661 else \
2662 c1 = 0; \
ec6d2bb8 2663 } while (0)
88993dfd 2664
d46c5b12 2665
4ed46869
KH
2666/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2667
b73bfc1c 2668static void
df7492f9 2669decode_coding_iso_2022 (coding)
4ed46869 2670 struct coding_system *coding;
4ed46869 2671{
df7492f9
KH
2672 unsigned char *src = coding->source + coding->consumed;
2673 unsigned char *src_end = coding->source + coding->src_bytes;
b73bfc1c 2674 unsigned char *src_base;
df7492f9
KH
2675 int *charbuf = coding->charbuf;
2676 int *charbuf_end = charbuf + coding->charbuf_size - 4;
2677 int consumed_chars = 0, consumed_chars_base;
2678 int char_offset = 0;
2679 int multibytep = coding->src_multibyte;
2680 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2681 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2682 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2683 struct charset *charset;
2684 int c;
2685 /* For handling composition sequence. */
2686#define COMPOSING_NO 0
2687#define COMPOSING_CHAR 1
2688#define COMPOSING_RULE 2
2689#define COMPOSING_COMPONENT_CHAR 3
2690#define COMPOSING_COMPONENT_RULE 4
2691
2692 int composition_state = COMPOSING_NO;
2693 enum composition_method method;
2694 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2695 int component_idx;
2696 int component_len;
2697 Lisp_Object attrs, eol_type, charset_list;
2698
2699 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2700 setup_iso_safe_charsets (attrs);
b73bfc1c
KH
2701
2702 while (1)
4ed46869 2703 {
b73bfc1c
KH
2704 int c1, c2;
2705
2706 src_base = src;
df7492f9
KH
2707 consumed_chars_base = consumed_chars;
2708
2709 if (charbuf >= charbuf_end)
2710 break;
2711
b73bfc1c 2712 ONE_MORE_BYTE (c1);
4ed46869 2713
ec6d2bb8 2714 /* We produce no character or one character. */
4ed46869
KH
2715 switch (iso_code_class [c1])
2716 {
2717 case ISO_0x20_or_0x7F:
df7492f9 2718 if (composition_state != COMPOSING_NO)
ec6d2bb8 2719 {
df7492f9
KH
2720 if (composition_state == COMPOSING_RULE
2721 || composition_state == COMPOSING_COMPONENT_RULE)
2722 {
2723 DECODE_COMPOSITION_RULE (c1);
2724 components[component_idx++] = c1;
2725 composition_state--;
2726 continue;
2727 }
2728 else if (method == COMPOSITION_WITH_RULE)
2729 composition_state = COMPOSING_RULE;
2730 else if (method == COMPOSITION_WITH_RULE_ALTCHARS
2731 && composition_state == COMPOSING_COMPONENT_CHAR)
2732 composition_state = COMPOSING_COMPONENT_CHAR;
ec6d2bb8 2733 }
df7492f9
KH
2734 if (charset_id_0 < 0
2735 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
4ed46869
KH
2736 {
2737 /* This is SPACE or DEL. */
df7492f9 2738 charset = CHARSET_FROM_ID (charset_ascii);
4ed46869
KH
2739 break;
2740 }
2741 /* This is a graphic character, we fall down ... */
2742
2743 case ISO_graphic_plane_0:
df7492f9 2744 if (composition_state == COMPOSING_RULE)
b73bfc1c
KH
2745 {
2746 DECODE_COMPOSITION_RULE (c1);
df7492f9
KH
2747 components[component_idx++] = c1;
2748 composition_state = COMPOSING_CHAR;
b73bfc1c 2749 }
df7492f9 2750 charset = CHARSET_FROM_ID (charset_id_0);
4ed46869
KH
2751 break;
2752
2753 case ISO_0xA0_or_0xFF:
df7492f9
KH
2754 if (charset_id_1 < 0
2755 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2756 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2757 goto invalid_code;
4ed46869
KH
2758 /* This is a graphic character, we fall down ... */
2759
2760 case ISO_graphic_plane_1:
df7492f9
KH
2761 if (charset_id_1 < 0)
2762 goto invalid_code;
2763 charset = CHARSET_FROM_ID (charset_id_1);
4ed46869
KH
2764 break;
2765
2766 case ISO_carriage_return:
df7492f9 2767 if (c1 == '\r')
4ed46869 2768 {
df7492f9 2769 if (EQ (eol_type, Qdos))
4ed46869 2770 {
df7492f9
KH
2771 if (src == src_end)
2772 goto no_more_source;
2773 if (*src == '\n')
2774 ONE_MORE_BYTE (c1);
4ed46869 2775 }
df7492f9
KH
2776 else if (EQ (eol_type, Qmac))
2777 c1 = '\n';
4ed46869 2778 }
df7492f9
KH
2779 /* fall through */
2780
2781 case ISO_control_0:
2782 MAYBE_FINISH_COMPOSITION ();
2783 charset = CHARSET_FROM_ID (charset_ascii);
4ed46869
KH
2784 break;
2785
df7492f9
KH
2786 case ISO_control_1:
2787 MAYBE_FINISH_COMPOSITION ();
2788 goto invalid_code;
2789
4ed46869 2790 case ISO_shift_out:
df7492f9
KH
2791 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2792 || CODING_ISO_DESIGNATION (coding, 1) < 0)
2793 goto invalid_code;
2794 CODING_ISO_INVOCATION (coding, 0) = 1;
2795 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 2796 continue;
4ed46869
KH
2797
2798 case ISO_shift_in:
df7492f9
KH
2799 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
2800 goto invalid_code;
2801 CODING_ISO_INVOCATION (coding, 0) = 0;
2802 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 2803 continue;
4ed46869
KH
2804
2805 case ISO_single_shift_2_7:
2806 case ISO_single_shift_2:
df7492f9
KH
2807 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2808 goto invalid_code;
4ed46869
KH
2809 /* SS2 is handled as an escape sequence of ESC 'N' */
2810 c1 = 'N';
2811 goto label_escape_sequence;
2812
2813 case ISO_single_shift_3:
df7492f9
KH
2814 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2815 goto invalid_code;
4ed46869
KH
2816 /* SS2 is handled as an escape sequence of ESC 'O' */
2817 c1 = 'O';
2818 goto label_escape_sequence;
2819
2820 case ISO_control_sequence_introducer:
2821 /* CSI is handled as an escape sequence of ESC '[' ... */
2822 c1 = '[';
2823 goto label_escape_sequence;
2824
2825 case ISO_escape:
2826 ONE_MORE_BYTE (c1);
2827 label_escape_sequence:
df7492f9 2828 /* Escape sequences handled here are invocation,
4ed46869
KH
2829 designation, direction specification, and character
2830 composition specification. */
2831 switch (c1)
2832 {
2833 case '&': /* revision of following character set */
2834 ONE_MORE_BYTE (c1);
2835 if (!(c1 >= '@' && c1 <= '~'))
df7492f9 2836 goto invalid_code;
4ed46869
KH
2837 ONE_MORE_BYTE (c1);
2838 if (c1 != ISO_CODE_ESC)
df7492f9 2839 goto invalid_code;
4ed46869
KH
2840 ONE_MORE_BYTE (c1);
2841 goto label_escape_sequence;
2842
2843 case '$': /* designation of 2-byte character set */
df7492f9
KH
2844 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
2845 goto invalid_code;
4ed46869
KH
2846 ONE_MORE_BYTE (c1);
2847 if (c1 >= '@' && c1 <= 'B')
2848 { /* designation of JISX0208.1978, GB2312.1980,
88993dfd 2849 or JISX0208.1980 */
df7492f9 2850 DECODE_DESIGNATION (0, 2, 0, c1);
4ed46869
KH
2851 }
2852 else if (c1 >= 0x28 && c1 <= 0x2B)
2853 { /* designation of DIMENSION2_CHARS94 character set */
2854 ONE_MORE_BYTE (c2);
df7492f9 2855 DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2);
4ed46869
KH
2856 }
2857 else if (c1 >= 0x2C && c1 <= 0x2F)
2858 { /* designation of DIMENSION2_CHARS96 character set */
2859 ONE_MORE_BYTE (c2);
df7492f9 2860 DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2);
4ed46869
KH
2861 }
2862 else
df7492f9 2863 goto invalid_code;
b73bfc1c 2864 /* We must update these variables now. */
df7492f9
KH
2865 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2866 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
b73bfc1c 2867 continue;
4ed46869
KH
2868
2869 case 'n': /* invocation of locking-shift-2 */
df7492f9
KH
2870 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2871 || CODING_ISO_DESIGNATION (coding, 2) < 0)
2872 goto invalid_code;
2873 CODING_ISO_INVOCATION (coding, 0) = 2;
2874 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 2875 continue;
4ed46869
KH
2876
2877 case 'o': /* invocation of locking-shift-3 */
df7492f9
KH
2878 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2879 || CODING_ISO_DESIGNATION (coding, 3) < 0)
2880 goto invalid_code;
2881 CODING_ISO_INVOCATION (coding, 0) = 3;
2882 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 2883 continue;
4ed46869
KH
2884
2885 case 'N': /* invocation of single-shift-2 */
df7492f9
KH
2886 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
2887 || CODING_ISO_DESIGNATION (coding, 2) < 0)
2888 goto invalid_code;
2889 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2));
b73bfc1c 2890 ONE_MORE_BYTE (c1);
e7046a18 2891 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
df7492f9 2892 goto invalid_code;
4ed46869
KH
2893 break;
2894
2895 case 'O': /* invocation of single-shift-3 */
df7492f9
KH
2896 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
2897 || CODING_ISO_DESIGNATION (coding, 3) < 0)
2898 goto invalid_code;
2899 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3));
b73bfc1c 2900 ONE_MORE_BYTE (c1);
e7046a18 2901 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
df7492f9 2902 goto invalid_code;
4ed46869
KH
2903 break;
2904
ec6d2bb8 2905 case '0': case '2': case '3': case '4': /* start composition */
df7492f9
KH
2906 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
2907 goto invalid_code;
ec6d2bb8 2908 DECODE_COMPOSITION_START (c1);
b73bfc1c 2909 continue;
4ed46869 2910
ec6d2bb8 2911 case '1': /* end composition */
df7492f9
KH
2912 if (composition_state == COMPOSING_NO)
2913 goto invalid_code;
2914 DECODE_COMPOSITION_END ();
b73bfc1c 2915 continue;
4ed46869
KH
2916
2917 case '[': /* specification of direction */
df7492f9
KH
2918 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
2919 goto invalid_code;
4ed46869 2920 /* For the moment, nested direction is not supported.
d46c5b12 2921 So, `coding->mode & CODING_MODE_DIRECTION' zero means
df7492f9 2922 left-to-right, and nozero means right-to-left. */
4ed46869
KH
2923 ONE_MORE_BYTE (c1);
2924 switch (c1)
2925 {
2926 case ']': /* end of the current direction */
d46c5b12 2927 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869
KH
2928
2929 case '0': /* end of the current direction */
2930 case '1': /* start of left-to-right direction */
2931 ONE_MORE_BYTE (c1);
2932 if (c1 == ']')
d46c5b12 2933 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869 2934 else
df7492f9 2935 goto invalid_code;
4ed46869
KH
2936 break;
2937
2938 case '2': /* start of right-to-left direction */
2939 ONE_MORE_BYTE (c1);
2940 if (c1 == ']')
d46c5b12 2941 coding->mode |= CODING_MODE_DIRECTION;
4ed46869 2942 else
df7492f9 2943 goto invalid_code;
4ed46869
KH
2944 break;
2945
2946 default:
df7492f9 2947 goto invalid_code;
4ed46869 2948 }
b73bfc1c 2949 continue;
4ed46869
KH
2950
2951 default:
df7492f9
KH
2952 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
2953 goto invalid_code;
4ed46869
KH
2954 if (c1 >= 0x28 && c1 <= 0x2B)
2955 { /* designation of DIMENSION1_CHARS94 character set */
2956 ONE_MORE_BYTE (c2);
df7492f9 2957 DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2);
4ed46869
KH
2958 }
2959 else if (c1 >= 0x2C && c1 <= 0x2F)
2960 { /* designation of DIMENSION1_CHARS96 character set */
2961 ONE_MORE_BYTE (c2);
df7492f9 2962 DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2);
4ed46869
KH
2963 }
2964 else
df7492f9 2965 goto invalid_code;
b73bfc1c 2966 /* We must update these variables now. */
df7492f9
KH
2967 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2968 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
b73bfc1c 2969 continue;
4ed46869 2970 }
b73bfc1c 2971 }
4ed46869 2972
b73bfc1c 2973 /* Now we know CHARSET and 1st position code C1 of a character.
df7492f9
KH
2974 Produce a decoded character while getting 2nd position code
2975 C2 if necessary. */
2976 c1 &= 0x7F;
2977 if (CHARSET_DIMENSION (charset) > 1)
b73bfc1c
KH
2978 {
2979 ONE_MORE_BYTE (c2);
df7492f9 2980 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
b73bfc1c 2981 /* C2 is not in a valid range. */
df7492f9
KH
2982 goto invalid_code;
2983 c1 = (c1 << 8) | (c2 & 0x7F);
2984 if (CHARSET_DIMENSION (charset) > 2)
2985 {
2986 ONE_MORE_BYTE (c2);
2987 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
2988 /* C2 is not in a valid range. */
2989 goto invalid_code;
2990 c1 = (c1 << 8) | (c2 & 0x7F);
2991 }
2992 }
2993
2994 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
2995 if (c < 0)
2996 {
2997 MAYBE_FINISH_COMPOSITION ();
2998 for (; src_base < src; src_base++, char_offset++)
2999 {
3000 if (ASCII_BYTE_P (*src_base))
3001 *charbuf++ = *src_base;
3002 else
3003 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3004 }
3005 }
3006 else if (composition_state == COMPOSING_NO)
3007 {
3008 *charbuf++ = c;
3009 char_offset++;
4ed46869 3010 }
df7492f9
KH
3011 else
3012 components[component_idx++] = c;
4ed46869
KH
3013 continue;
3014
df7492f9
KH
3015 invalid_code:
3016 MAYBE_FINISH_COMPOSITION ();
4ed46869 3017 src = src_base;
df7492f9
KH
3018 consumed_chars = consumed_chars_base;
3019 ONE_MORE_BYTE (c);
3020 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3021 coding->errors++;
4ed46869 3022 }
fb88bf2d 3023
df7492f9
KH
3024 no_more_source:
3025 coding->consumed_char += consumed_chars_base;
3026 coding->consumed = src_base - coding->source;
3027 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
3028}
3029
b73bfc1c 3030
f4dee582 3031/* ISO2022 encoding stuff. */
4ed46869
KH
3032
3033/*
f4dee582 3034 It is not enough to say just "ISO2022" on encoding, we have to
df7492f9 3035 specify more details. In Emacs, each coding system of ISO2022
4ed46869 3036 variant has the following specifications:
df7492f9 3037 1. Initial designation to G0 thru G3.
4ed46869
KH
3038 2. Allows short-form designation?
3039 3. ASCII should be designated to G0 before control characters?
3040 4. ASCII should be designated to G0 at end of line?
3041 5. 7-bit environment or 8-bit environment?
3042 6. Use locking-shift?
3043 7. Use Single-shift?
3044 And the following two are only for Japanese:
3045 8. Use ASCII in place of JIS0201-1976-Roman?
3046 9. Use JISX0208-1983 in place of JISX0208-1978?
df7492f9
KH
3047 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3048 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
f4dee582 3049 details.
4ed46869
KH
3050*/
3051
3052/* Produce codes (escape sequence) for designating CHARSET to graphic
b73bfc1c
KH
3053 register REG at DST, and increment DST. If <final-char> of CHARSET is
3054 '@', 'A', or 'B' and the coding system CODING allows, produce
3055 designation sequence of short-form. */
4ed46869
KH
3056
3057#define ENCODE_DESIGNATION(charset, reg, coding) \
3058 do { \
df7492f9 3059 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
4ed46869
KH
3060 char *intermediate_char_94 = "()*+"; \
3061 char *intermediate_char_96 = ",-./"; \
df7492f9
KH
3062 int revision = -1; \
3063 int c; \
3064 \
3065 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3066 revision = XINT (CHARSET_ISO_REVISION (charset)); \
3067 \
3068 if (revision >= 0) \
70c22245 3069 { \
df7492f9
KH
3070 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3071 EMIT_ONE_BYTE ('@' + revision); \
4ed46869 3072 } \
df7492f9 3073 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
4ed46869
KH
3074 if (CHARSET_DIMENSION (charset) == 1) \
3075 { \
df7492f9
KH
3076 if (! CHARSET_ISO_CHARS_96 (charset)) \
3077 c = intermediate_char_94[reg]; \
4ed46869 3078 else \
df7492f9
KH
3079 c = intermediate_char_96[reg]; \
3080 EMIT_ONE_ASCII_BYTE (c); \
4ed46869
KH
3081 } \
3082 else \
3083 { \
df7492f9
KH
3084 EMIT_ONE_ASCII_BYTE ('$'); \
3085 if (! CHARSET_ISO_CHARS_96 (charset)) \
4ed46869 3086 { \
df7492f9 3087 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
b73bfc1c
KH
3088 || reg != 0 \
3089 || final_char < '@' || final_char > 'B') \
df7492f9 3090 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
4ed46869
KH
3091 } \
3092 else \
df7492f9 3093 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
4ed46869 3094 } \
df7492f9
KH
3095 EMIT_ONE_ASCII_BYTE (final_char); \
3096 \
3097 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
4ed46869
KH
3098 } while (0)
3099
df7492f9 3100
4ed46869
KH
3101/* The following two macros produce codes (control character or escape
3102 sequence) for ISO2022 single-shift functions (single-shift-2 and
3103 single-shift-3). */
3104
df7492f9
KH
3105#define ENCODE_SINGLE_SHIFT_2 \
3106 do { \
3107 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3108 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3109 else \
3110 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3111 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4ed46869
KH
3112 } while (0)
3113
df7492f9
KH
3114
3115#define ENCODE_SINGLE_SHIFT_3 \
3116 do { \
3117 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3118 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3119 else \
3120 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3121 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4ed46869
KH
3122 } while (0)
3123
df7492f9 3124
4ed46869
KH
3125/* The following four macros produce codes (control character or
3126 escape sequence) for ISO2022 locking-shift functions (shift-in,
3127 shift-out, locking-shift-2, and locking-shift-3). */
3128
df7492f9
KH
3129#define ENCODE_SHIFT_IN \
3130 do { \
3131 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3132 CODING_ISO_INVOCATION (coding, 0) = 0; \
4ed46869
KH
3133 } while (0)
3134
df7492f9
KH
3135
3136#define ENCODE_SHIFT_OUT \
3137 do { \
3138 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3139 CODING_ISO_INVOCATION (coding, 0) = 1; \
4ed46869
KH
3140 } while (0)
3141
df7492f9
KH
3142
3143#define ENCODE_LOCKING_SHIFT_2 \
3144 do { \
3145 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3146 CODING_ISO_INVOCATION (coding, 0) = 2; \
4ed46869
KH
3147 } while (0)
3148
df7492f9
KH
3149
3150#define ENCODE_LOCKING_SHIFT_3 \
3151 do { \
3152 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3153 CODING_ISO_INVOCATION (coding, 0) = 3; \
4ed46869
KH
3154 } while (0)
3155
df7492f9 3156
f4dee582
RS
3157/* Produce codes for a DIMENSION1 character whose character set is
3158 CHARSET and whose position-code is C1. Designation and invocation
4ed46869
KH
3159 sequences are also produced in advance if necessary. */
3160
6e85d753
KH
3161#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3162 do { \
df7492f9
KH
3163 int id = CHARSET_ID (charset); \
3164 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
6e85d753 3165 { \
df7492f9
KH
3166 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3167 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
6e85d753 3168 else \
df7492f9
KH
3169 EMIT_ONE_BYTE (c1 | 0x80); \
3170 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
6e85d753
KH
3171 break; \
3172 } \
df7492f9 3173 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
6e85d753 3174 { \
df7492f9 3175 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
6e85d753
KH
3176 break; \
3177 } \
df7492f9 3178 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
6e85d753 3179 { \
df7492f9 3180 EMIT_ONE_BYTE (c1 | 0x80); \
6e85d753
KH
3181 break; \
3182 } \
6e85d753
KH
3183 else \
3184 /* Since CHARSET is not yet invoked to any graphic planes, we \
3185 must invoke it, or, at first, designate it to some graphic \
3186 register. Then repeat the loop to actually produce the \
3187 character. */ \
df7492f9
KH
3188 dst = encode_invocation_designation (charset, coding, dst, \
3189 &produced_chars); \
4ed46869
KH
3190 } while (1)
3191
df7492f9 3192
f4dee582
RS
3193/* Produce codes for a DIMENSION2 character whose character set is
3194 CHARSET and whose position-codes are C1 and C2. Designation and
4ed46869
KH
3195 invocation codes are also produced in advance if necessary. */
3196
6e85d753
KH
3197#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3198 do { \
df7492f9
KH
3199 int id = CHARSET_ID (charset); \
3200 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
6e85d753 3201 { \
df7492f9
KH
3202 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3203 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
6e85d753 3204 else \
df7492f9
KH
3205 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3206 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
6e85d753
KH
3207 break; \
3208 } \
df7492f9 3209 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
6e85d753 3210 { \
df7492f9 3211 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
6e85d753
KH
3212 break; \
3213 } \
df7492f9 3214 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
6e85d753 3215 { \
df7492f9 3216 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
6e85d753
KH
3217 break; \
3218 } \
6e85d753
KH
3219 else \
3220 /* Since CHARSET is not yet invoked to any graphic planes, we \
3221 must invoke it, or, at first, designate it to some graphic \
3222 register. Then repeat the loop to actually produce the \
3223 character. */ \
df7492f9
KH
3224 dst = encode_invocation_designation (charset, coding, dst, \
3225 &produced_chars); \
4ed46869
KH
3226 } while (1)
3227
05e6f5dc 3228
df7492f9
KH
3229#define ENCODE_ISO_CHARACTER(charset, c) \
3230 do { \
3231 int code = ENCODE_CHAR ((charset),(c)); \
3232 \
3233 if (CHARSET_DIMENSION (charset) == 1) \
3234 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3235 else \
3236 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
84fbb8a0 3237 } while (0)
bdd9fb48 3238
05e6f5dc 3239
4ed46869 3240/* Produce designation and invocation codes at a place pointed by DST
df7492f9 3241 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
4ed46869
KH
3242 Return new DST. */
3243
3244unsigned char *
df7492f9
KH
3245encode_invocation_designation (charset, coding, dst, p_nchars)
3246 struct charset *charset;
4ed46869
KH
3247 struct coding_system *coding;
3248 unsigned char *dst;
df7492f9 3249 int *p_nchars;
4ed46869 3250{
df7492f9
KH
3251 int multibytep = coding->dst_multibyte;
3252 int produced_chars = *p_nchars;
4ed46869 3253 int reg; /* graphic register number */
df7492f9 3254 int id = CHARSET_ID (charset);
4ed46869
KH
3255
3256 /* At first, check designations. */
3257 for (reg = 0; reg < 4; reg++)
df7492f9 3258 if (id == CODING_ISO_DESIGNATION (coding, reg))
4ed46869
KH
3259 break;
3260
3261 if (reg >= 4)
3262 {
3263 /* CHARSET is not yet designated to any graphic registers. */
3264 /* At first check the requested designation. */
df7492f9
KH
3265 reg = CODING_ISO_REQUEST (coding, id);
3266 if (reg < 0)
1ba9e4ab
KH
3267 /* Since CHARSET requests no special designation, designate it
3268 to graphic register 0. */
4ed46869
KH
3269 reg = 0;
3270
3271 ENCODE_DESIGNATION (charset, reg, coding);
3272 }
3273
df7492f9
KH
3274 if (CODING_ISO_INVOCATION (coding, 0) != reg
3275 && CODING_ISO_INVOCATION (coding, 1) != reg)
4ed46869
KH
3276 {
3277 /* Since the graphic register REG is not invoked to any graphic
3278 planes, invoke it to graphic plane 0. */
3279 switch (reg)
3280 {
3281 case 0: /* graphic register 0 */
3282 ENCODE_SHIFT_IN;
3283 break;
3284
3285 case 1: /* graphic register 1 */
3286 ENCODE_SHIFT_OUT;
3287 break;
3288
3289 case 2: /* graphic register 2 */
df7492f9 3290 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4ed46869
KH
3291 ENCODE_SINGLE_SHIFT_2;
3292 else
3293 ENCODE_LOCKING_SHIFT_2;
3294 break;
3295
3296 case 3: /* graphic register 3 */
df7492f9 3297 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4ed46869
KH
3298 ENCODE_SINGLE_SHIFT_3;
3299 else
3300 ENCODE_LOCKING_SHIFT_3;
3301 break;
3302 }
3303 }
b73bfc1c 3304
df7492f9 3305 *p_nchars = produced_chars;
4ed46869
KH
3306 return dst;
3307}
3308
df7492f9
KH
3309/* The following three macros produce codes for indicating direction
3310 of text. */
3311#define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
ec6d2bb8 3312 do { \
df7492f9
KH
3313 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
3314 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
ec6d2bb8 3315 else \
df7492f9 3316 EMIT_ONE_BYTE (ISO_CODE_CSI); \
ec6d2bb8
KH
3317 } while (0)
3318
ec6d2bb8 3319
df7492f9
KH
3320#define ENCODE_DIRECTION_R2L() \
3321 do { \
3322 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3323 EMIT_TWO_ASCII_BYTES ('2', ']'); \
ec6d2bb8
KH
3324 } while (0)
3325
ec6d2bb8 3326
df7492f9 3327#define ENCODE_DIRECTION_L2R() \
ec6d2bb8 3328 do { \
df7492f9
KH
3329 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3330 EMIT_TWO_ASCII_BYTES ('0', ']'); \
4ed46869
KH
3331 } while (0)
3332
4ed46869
KH
3333
3334/* Produce codes for designation and invocation to reset the graphic
3335 planes and registers to initial state. */
df7492f9
KH
3336#define ENCODE_RESET_PLANE_AND_REGISTER() \
3337 do { \
3338 int reg; \
3339 struct charset *charset; \
3340 \
3341 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3342 ENCODE_SHIFT_IN; \
3343 for (reg = 0; reg < 4; reg++) \
3344 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3345 && (CODING_ISO_DESIGNATION (coding, reg) \
3346 != CODING_ISO_INITIAL (coding, reg))) \
3347 { \
3348 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3349 ENCODE_DESIGNATION (charset, reg, coding); \
3350 } \
4ed46869
KH
3351 } while (0)
3352
df7492f9 3353
bdd9fb48 3354/* Produce designation sequences of charsets in the line started from
b73bfc1c 3355 SRC to a place pointed by DST, and return updated DST.
bdd9fb48
KH
3356
3357 If the current block ends before any end-of-line, we may fail to
d46c5b12
KH
3358 find all the necessary designations. */
3359
b73bfc1c 3360static unsigned char *
df7492f9 3361encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
e0e989f6 3362 struct coding_system *coding;
df7492f9
KH
3363 int *charbuf, *charbuf_end;
3364 unsigned char *dst;
e0e989f6 3365{
df7492f9 3366 struct charset *charset;
bdd9fb48
KH
3367 /* Table of charsets to be designated to each graphic register. */
3368 int r[4];
df7492f9
KH
3369 int c, found = 0, reg;
3370 int produced_chars = 0;
3371 int multibytep = coding->dst_multibyte;
3372 Lisp_Object attrs;
3373 Lisp_Object charset_list;
3374
3375 attrs = CODING_ID_ATTRS (coding->id);
3376 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3377 if (EQ (charset_list, Qiso_2022))
3378 charset_list = Viso_2022_charset_list;
bdd9fb48
KH
3379
3380 for (reg = 0; reg < 4; reg++)
3381 r[reg] = -1;
3382
b73bfc1c 3383 while (found < 4)
e0e989f6 3384 {
df7492f9
KH
3385 int id;
3386
3387 c = *charbuf++;
b73bfc1c
KH
3388 if (c == '\n')
3389 break;
df7492f9
KH
3390 charset = char_charset (c, charset_list, NULL);
3391 id = CHARSET_ID (charset);
3392 reg = CODING_ISO_REQUEST (coding, id);
3393 if (reg >= 0 && r[reg] < 0)
bdd9fb48
KH
3394 {
3395 found++;
df7492f9 3396 r[reg] = id;
bdd9fb48 3397 }
bdd9fb48
KH
3398 }
3399
3400 if (found)
3401 {
3402 for (reg = 0; reg < 4; reg++)
3403 if (r[reg] >= 0
df7492f9
KH
3404 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3405 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
e0e989f6 3406 }
b73bfc1c
KH
3407
3408 return dst;
e0e989f6
KH
3409}
3410
4ed46869
KH
3411/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3412
df7492f9
KH
3413static int
3414encode_coding_iso_2022 (coding)
4ed46869 3415 struct coding_system *coding;
4ed46869 3416{
df7492f9
KH
3417 int multibytep = coding->dst_multibyte;
3418 int *charbuf = coding->charbuf;
3419 int *charbuf_end = charbuf + coding->charbuf_used;
3420 unsigned char *dst = coding->destination + coding->produced;
3421 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3422 int safe_room = 16;
3423 int bol_designation
3424 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3425 && CODING_ISO_BOL (coding));
3426 int produced_chars = 0;
3427 Lisp_Object attrs, eol_type, charset_list;
3428 int ascii_compatible;
b73bfc1c 3429 int c;
05e6f5dc 3430
df7492f9 3431 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
004068e4
KH
3432 setup_iso_safe_charsets (attrs);
3433 coding->safe_charsets
3434 = (char *) XSTRING (CODING_ATTR_SAFE_CHARSETS(attrs))->data;
bdd9fb48 3435
df7492f9 3436 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4ed46869 3437
df7492f9 3438 while (charbuf < charbuf_end)
4ed46869 3439 {
df7492f9 3440 ASSURE_DESTINATION (safe_room);
b73bfc1c 3441
df7492f9 3442 if (bol_designation)
b73bfc1c 3443 {
df7492f9 3444 unsigned char *dst_prev = dst;
4ed46869 3445
bdd9fb48 3446 /* We have to produce designation sequences if any now. */
df7492f9
KH
3447 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3448 bol_designation = 0;
3449 /* We are sure that designation sequences are all ASCII bytes. */
3450 produced_chars += dst - dst_prev;
4ed46869 3451 }
ec6d2bb8 3452
df7492f9 3453 c = *charbuf++;
4ed46869 3454
b73bfc1c
KH
3455 /* Now encode the character C. */
3456 if (c < 0x20 || c == 0x7F)
3457 {
df7492f9
KH
3458 if (c == '\n'
3459 || (c == '\r' && EQ (eol_type, Qmac)))
19a8d9e0 3460 {
df7492f9
KH
3461 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3462 ENCODE_RESET_PLANE_AND_REGISTER ();
3463 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
b73bfc1c 3464 {
df7492f9
KH
3465 int i;
3466
3467 for (i = 0; i < 4; i++)
3468 CODING_ISO_DESIGNATION (coding, i)
3469 = CODING_ISO_INITIAL (coding, i);
b73bfc1c 3470 }
df7492f9
KH
3471 bol_designation
3472 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
19a8d9e0 3473 }
df7492f9
KH
3474 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3475 ENCODE_RESET_PLANE_AND_REGISTER ();
3476 EMIT_ONE_ASCII_BYTE (c);
4ed46869 3477 }
df7492f9 3478 else if (ASCII_CHAR_P (c))
88993dfd 3479 {
df7492f9
KH
3480 if (ascii_compatible)
3481 EMIT_ONE_ASCII_BYTE (c);
3482 else
3483 ENCODE_ISO_CHARACTER (CHARSET_FROM_ID (charset_ascii), c);
88993dfd 3484 }
b73bfc1c 3485 else
df7492f9
KH
3486 {
3487 struct charset *charset = char_charset (c, charset_list, NULL);
b73bfc1c 3488
df7492f9
KH
3489 if (!charset)
3490 {
41cbe562
KH
3491 if (coding->mode & CODING_MODE_SAFE_ENCODING)
3492 {
3493 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3494 charset = CHARSET_FROM_ID (charset_ascii);
3495 }
3496 else
3497 {
3498 c = coding->default_char;
3499 charset = char_charset (c, charset_list, NULL);
3500 }
df7492f9
KH
3501 }
3502 ENCODE_ISO_CHARACTER (charset, c);
3503 }
84fbb8a0 3504 }
b73bfc1c 3505
df7492f9
KH
3506 if (coding->mode & CODING_MODE_LAST_BLOCK
3507 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3508 {
3509 ASSURE_DESTINATION (safe_room);
3510 ENCODE_RESET_PLANE_AND_REGISTER ();
3511 }
3512 coding->result = CODING_RESULT_SUCCESS;
3513 CODING_ISO_BOL (coding) = bol_designation;
3514 coding->produced_char += produced_chars;
3515 coding->produced = dst - coding->destination;
3516 return 0;
4ed46869
KH
3517}
3518
3519\f
df7492f9 3520/*** 8,9. SJIS and BIG5 handlers ***/
4ed46869 3521
df7492f9 3522/* Although SJIS and BIG5 are not ISO's coding system, they are used
4ed46869
KH
3523 quite widely. So, for the moment, Emacs supports them in the bare
3524 C code. But, in the future, they may be supported only by CCL. */
3525
3526/* SJIS is a coding system encoding three character sets: ASCII, right
3527 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3528 as is. A character of charset katakana-jisx0201 is encoded by
3529 "position-code + 0x80". A character of charset japanese-jisx0208
3530 is encoded in 2-byte but two position-codes are divided and shifted
df7492f9 3531 so that it fit in the range below.
4ed46869
KH
3532
3533 --- CODE RANGE of SJIS ---
3534 (character set) (range)
3535 ASCII 0x00 .. 0x7F
df7492f9 3536 KATAKANA-JISX0201 0xA0 .. 0xDF
c28a9453 3537 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
d14d03ac 3538 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4ed46869
KH
3539 -------------------------------
3540
3541*/
3542
3543/* BIG5 is a coding system encoding two character sets: ASCII and
3544 Big5. An ASCII character is encoded as is. Big5 is a two-byte
df7492f9 3545 character set and is encoded in two-byte.
4ed46869
KH
3546
3547 --- CODE RANGE of BIG5 ---
3548 (character set) (range)
3549 ASCII 0x00 .. 0x7F
3550 Big5 (1st byte) 0xA1 .. 0xFE
3551 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3552 --------------------------
3553
df7492f9 3554 */
4ed46869
KH
3555
3556/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3557 Check if a text is encoded in SJIS. If it is, return
df7492f9 3558 CATEGORY_MASK_SJIS, else return 0. */
4ed46869 3559
0a28aafb 3560static int
df7492f9
KH
3561detect_coding_sjis (coding, mask)
3562 struct coding_system *coding;
3563 int *mask;
4ed46869 3564{
df7492f9
KH
3565 unsigned char *src = coding->source, *src_base = src;
3566 unsigned char *src_end = coding->source + coding->src_bytes;
3567 int multibytep = coding->src_multibyte;
3568 int consumed_chars = 0;
3569 int found = 0;
b73bfc1c 3570 int c;
df7492f9
KH
3571
3572 /* A coding system of this category is always ASCII compatible. */
3573 src += coding->head_ascii;
4ed46869 3574
b73bfc1c 3575 while (1)
4ed46869 3576 {
df7492f9 3577 ONE_MORE_BYTE (c);
682169fe
KH
3578 if (c < 0x80)
3579 continue;
df7492f9 3580 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
4ed46869 3581 {
df7492f9 3582 ONE_MORE_BYTE (c);
682169fe 3583 if (c < 0x40 || c == 0x7F || c > 0xFC)
df7492f9
KH
3584 break;
3585 found = 1;
4ed46869 3586 }
df7492f9
KH
3587 else if (c >= 0xA0 && c < 0xE0)
3588 found = 1;
3589 else
3590 break;
4ed46869 3591 }
df7492f9
KH
3592 *mask &= ~CATEGORY_MASK_SJIS;
3593 return 0;
3594
3595 no_more_source:
3596 if (!found)
3597 return 0;
3598 *mask &= CATEGORY_MASK_SJIS;
3599 return 1;
4ed46869
KH
3600}
3601
3602/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3603 Check if a text is encoded in BIG5. If it is, return
df7492f9 3604 CATEGORY_MASK_BIG5, else return 0. */
4ed46869 3605
0a28aafb 3606static int
df7492f9
KH
3607detect_coding_big5 (coding, mask)
3608 struct coding_system *coding;
3609 int *mask;
4ed46869 3610{
df7492f9
KH
3611 unsigned char *src = coding->source, *src_base = src;
3612 unsigned char *src_end = coding->source + coding->src_bytes;
3613 int multibytep = coding->src_multibyte;
3614 int consumed_chars = 0;
3615 int found = 0;
b73bfc1c 3616 int c;
fa42c37f 3617
df7492f9
KH
3618 /* A coding system of this category is always ASCII compatible. */
3619 src += coding->head_ascii;
fa42c37f 3620
b73bfc1c 3621 while (1)
fa42c37f 3622 {
df7492f9
KH
3623 ONE_MORE_BYTE (c);
3624 if (c < 0x80)
fa42c37f 3625 continue;
df7492f9 3626 if (c >= 0xA1)
fa42c37f 3627 {
df7492f9
KH
3628 ONE_MORE_BYTE (c);
3629 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
fa42c37f 3630 return 0;
df7492f9 3631 found = 1;
fa42c37f 3632 }
df7492f9
KH
3633 else
3634 break;
fa42c37f 3635 }
df7492f9 3636 *mask &= ~CATEGORY_MASK_BIG5;
fa42c37f 3637 return 0;
df7492f9
KH
3638
3639 no_more_source:
3640 if (!found)
3641 return 0;
3642 *mask &= CATEGORY_MASK_BIG5;
3643 return 1;
fa42c37f
KH
3644}
3645
4ed46869
KH
3646/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3647 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3648
b73bfc1c 3649static void
df7492f9 3650decode_coding_sjis (coding)
4ed46869 3651 struct coding_system *coding;
4ed46869 3652{
df7492f9
KH
3653 unsigned char *src = coding->source + coding->consumed;
3654 unsigned char *src_end = coding->source + coding->src_bytes;
b73bfc1c 3655 unsigned char *src_base;
df7492f9
KH
3656 int *charbuf = coding->charbuf;
3657 int *charbuf_end = charbuf + coding->charbuf_size;
3658 int consumed_chars = 0, consumed_chars_base;
3659 int multibytep = coding->src_multibyte;
3660 struct charset *charset_roman, *charset_kanji, *charset_kana;
3661 Lisp_Object attrs, eol_type, charset_list, val;
a5d301df 3662
df7492f9
KH
3663 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3664
3665 val = charset_list;
3666 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3667 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3668 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val)));
4ed46869 3669
b73bfc1c 3670 while (1)
4ed46869 3671 {
df7492f9 3672 int c, c1;
b73bfc1c
KH
3673
3674 src_base = src;
df7492f9
KH
3675 consumed_chars_base = consumed_chars;
3676
3677 if (charbuf >= charbuf_end)
3678 break;
3679
3680 ONE_MORE_BYTE (c);
b73bfc1c 3681
df7492f9 3682 if (c == '\r')
4ed46869 3683 {
df7492f9 3684 if (EQ (eol_type, Qdos))
4ed46869 3685 {
df7492f9
KH
3686 if (src == src_end)
3687 goto no_more_source;
3688 if (*src == '\n')
3689 ONE_MORE_BYTE (c);
4ed46869 3690 }
df7492f9
KH
3691 else if (EQ (eol_type, Qmac))
3692 c = '\n';
4ed46869 3693 }
54f78171 3694 else
df7492f9
KH
3695 {
3696 struct charset *charset;
3697
3698 if (c < 0x80)
3699 charset = charset_roman;
3700 else
4ed46869 3701 {
df7492f9
KH
3702 if (c >= 0xF0)
3703 goto invalid_code;
3704 if (c < 0xA0 || c >= 0xE0)
fb88bf2d 3705 {
54f78171 3706 /* SJIS -> JISX0208 */
df7492f9
KH
3707 ONE_MORE_BYTE (c1);
3708 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
3709 goto invalid_code;
3710 c = (c << 8) | c1;
3711 SJIS_TO_JIS (c);
3712 charset = charset_kanji;
5e34de15 3713 }
fb88bf2d 3714 else
b73bfc1c 3715 /* SJIS -> JISX0201-Kana */
df7492f9
KH
3716 charset = charset_kana;
3717 }
3718 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
3719 }
3720 *charbuf++ = c;
3721 continue;
3722
3723 invalid_code:
3724 src = src_base;
3725 consumed_chars = consumed_chars_base;
3726 ONE_MORE_BYTE (c);
3727 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3728 coding->errors++;
3729 }
3730
3731 no_more_source:
3732 coding->consumed_char += consumed_chars_base;
3733 coding->consumed = src_base - coding->source;
3734 coding->charbuf_used = charbuf - coding->charbuf;
3735}
3736
3737static void
3738decode_coding_big5 (coding)
3739 struct coding_system *coding;
3740{
3741 unsigned char *src = coding->source + coding->consumed;
3742 unsigned char *src_end = coding->source + coding->src_bytes;
3743 unsigned char *src_base;
3744 int *charbuf = coding->charbuf;
3745 int *charbuf_end = charbuf + coding->charbuf_size;
3746 int consumed_chars = 0, consumed_chars_base;
3747 int multibytep = coding->src_multibyte;
3748 struct charset *charset_roman, *charset_big5;
3749 Lisp_Object attrs, eol_type, charset_list, val;
3750
3751 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3752 val = charset_list;
3753 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3754 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
3755
3756 while (1)
3757 {
3758 int c, c1;
3759
3760 src_base = src;
3761 consumed_chars_base = consumed_chars;
3762
3763 if (charbuf >= charbuf_end)
3764 break;
3765
3766 ONE_MORE_BYTE (c);
3767
3768 if (c == '\r')
3769 {
3770 if (EQ (eol_type, Qdos))
3771 {
3772 if (src == src_end)
3773 goto no_more_source;
3774 if (*src == '\n')
3775 ONE_MORE_BYTE (c);
4ed46869 3776 }
df7492f9
KH
3777 else if (EQ (eol_type, Qmac))
3778 c = '\n';
3779 }
3780 else
3781 {
3782 struct charset *charset;
3783 if (c < 0x80)
3784 charset = charset_roman;
fb88bf2d 3785 else
fb88bf2d 3786 {
54f78171 3787 /* BIG5 -> Big5 */
df7492f9
KH
3788 if (c < 0xA1 || c > 0xFE)
3789 goto invalid_code;
3790 ONE_MORE_BYTE (c1);
3791 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
3792 goto invalid_code;
3793 c = c << 8 | c1;
3794 charset = charset_big5;
4ed46869 3795 }
df7492f9 3796 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4ed46869 3797 }
4ed46869 3798
df7492f9 3799 *charbuf++ = c;
fb88bf2d
KH
3800 continue;
3801
df7492f9 3802 invalid_code:
4ed46869 3803 src = src_base;
df7492f9
KH
3804 consumed_chars = consumed_chars_base;
3805 ONE_MORE_BYTE (c);
3806 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3807 coding->errors++;
fb88bf2d 3808 }
d46c5b12 3809
df7492f9
KH
3810 no_more_source:
3811 coding->consumed_char += consumed_chars_base;
3812 coding->consumed = src_base - coding->source;
3813 coding->charbuf_used = charbuf - coding->charbuf;
3814}
3815
3816/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3817 This function can encode charsets `ascii', `katakana-jisx0201',
3818 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
3819 are sure that all these charsets are registered as official charset
3820 (i.e. do not have extended leading-codes). Characters of other
3821 charsets are produced without any encoding. If SJIS_P is 1, encode
3822 SJIS text, else encode BIG5 text. */
3823
3824static int
3825encode_coding_sjis (coding)
3826 struct coding_system *coding;
3827{
3828 int multibytep = coding->dst_multibyte;
3829 int *charbuf = coding->charbuf;
3830 int *charbuf_end = charbuf + coding->charbuf_used;
3831 unsigned char *dst = coding->destination + coding->produced;
3832 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3833 int safe_room = 4;
3834 int produced_chars = 0;
3835 Lisp_Object attrs, eol_type, charset_list, val;
3836 int ascii_compatible;
3837 struct charset *charset_roman, *charset_kanji, *charset_kana;
3838 int c;
3839
3840 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3841 val = charset_list;
3842 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3843 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3844 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
3845
3846 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3847
3848 while (charbuf < charbuf_end)
3849 {
3850 ASSURE_DESTINATION (safe_room);
3851 c = *charbuf++;
3852 /* Now encode the character C. */
3853 if (ASCII_CHAR_P (c) && ascii_compatible)
3854 EMIT_ONE_ASCII_BYTE (c);
3855 else
3856 {
3857 unsigned code;
3858 struct charset *charset = char_charset (c, charset_list, &code);
3859
3860 if (!charset)
3861 {
41cbe562
KH
3862 if (coding->mode & CODING_MODE_SAFE_ENCODING)
3863 {
3864 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3865 charset = CHARSET_FROM_ID (charset_ascii);
3866 }
3867 else
3868 {
3869 c = coding->default_char;
3870 charset = char_charset (c, charset_list, &code);
3871 }
df7492f9
KH
3872 }
3873 if (code == CHARSET_INVALID_CODE (charset))
3874 abort ();
3875 if (charset == charset_kanji)
3876 {
3877 int c1, c2;
3878 JIS_TO_SJIS (code);
3879 c1 = code >> 8, c2 = code & 0xFF;
3880 EMIT_TWO_BYTES (c1, c2);
3881 }
3882 else if (charset == charset_kana)
3883 EMIT_ONE_BYTE (code | 0x80);
3884 else
3885 EMIT_ONE_ASCII_BYTE (code & 0x7F);
3886 }
3887 }
3888 coding->result = CODING_RESULT_SUCCESS;
3889 coding->produced_char += produced_chars;
3890 coding->produced = dst - coding->destination;
3891 return 0;
3892}
3893
3894static int
3895encode_coding_big5 (coding)
3896 struct coding_system *coding;
3897{
3898 int multibytep = coding->dst_multibyte;
3899 int *charbuf = coding->charbuf;
3900 int *charbuf_end = charbuf + coding->charbuf_used;
3901 unsigned char *dst = coding->destination + coding->produced;
3902 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3903 int safe_room = 4;
3904 int produced_chars = 0;
3905 Lisp_Object attrs, eol_type, charset_list, val;
3906 int ascii_compatible;
3907 struct charset *charset_roman, *charset_big5;
3908 int c;
3909
3910 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3911 val = charset_list;
3912 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3913 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
3914 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3915
3916 while (charbuf < charbuf_end)
3917 {
3918 ASSURE_DESTINATION (safe_room);
3919 c = *charbuf++;
3920 /* Now encode the character C. */
3921 if (ASCII_CHAR_P (c) && ascii_compatible)
3922 EMIT_ONE_ASCII_BYTE (c);
3923 else
3924 {
3925 unsigned code;
3926 struct charset *charset = char_charset (c, charset_list, &code);
3927
3928 if (! charset)
3929 {
41cbe562
KH
3930 if (coding->mode & CODING_MODE_SAFE_ENCODING)
3931 {
3932 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3933 charset = CHARSET_FROM_ID (charset_ascii);
3934 }
3935 else
3936 {
3937 c = coding->default_char;
3938 charset = char_charset (c, charset_list, &code);
3939 }
df7492f9
KH
3940 }
3941 if (code == CHARSET_INVALID_CODE (charset))
3942 abort ();
3943 if (charset == charset_big5)
3944 {
3945 int c1, c2;
3946
3947 c1 = code >> 8, c2 = code & 0xFF;
3948 EMIT_TWO_BYTES (c1, c2);
3949 }
3950 else
3951 EMIT_ONE_ASCII_BYTE (code & 0x7F);
3952 }
3953 }
3954 coding->result = CODING_RESULT_SUCCESS;
3955 coding->produced_char += produced_chars;
3956 coding->produced = dst - coding->destination;
3957 return 0;
3958}
3959
3960\f
3961/*** 10. CCL handlers ***/
3962
3963/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3964 Check if a text is encoded in a coding system of which
3965 encoder/decoder are written in CCL program. If it is, return
3966 CATEGORY_MASK_CCL, else return 0. */
3967
3968static int
3969detect_coding_ccl (coding, mask)
3970 struct coding_system *coding;
3971 int *mask;
3972{
3973 unsigned char *src = coding->source, *src_base = src;
3974 unsigned char *src_end = coding->source + coding->src_bytes;
3975 int multibytep = coding->src_multibyte;
3976 int consumed_chars = 0;
3977 int found = 0;
3978 unsigned char *valids = CODING_CCL_VALIDS (coding);
3979 int head_ascii = coding->head_ascii;
3980 Lisp_Object attrs;
3981
3982 coding = &coding_categories[coding_category_ccl];
3983 attrs = CODING_ID_ATTRS (coding->id);
3984 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
3985 src += head_ascii;
3986
3987 while (1)
3988 {
3989 int c;
3990 ONE_MORE_BYTE (c);
3991 if (! valids[c])
3992 break;
3993 if (!found && valids[c] > 1)
3994 found = 1;
3995 }
3996 *mask &= ~CATEGORY_MASK_CCL;
3997 return 0;
3998
3999 no_more_source:
4000 if (!found)
4001 return 0;
4002 *mask &= CATEGORY_MASK_CCL;
4003 return 1;
4004}
4005
4006static void
4007decode_coding_ccl (coding)
4008 struct coding_system *coding;
4009{
4010 unsigned char *src = coding->source + coding->consumed;
4011 unsigned char *src_end = coding->source + coding->src_bytes;
4012 int *charbuf = coding->charbuf;
4013 int *charbuf_end = charbuf + coding->charbuf_size;
4014 int consumed_chars = 0;
4015 int multibytep = coding->src_multibyte;
4016 struct ccl_program ccl;
4017 int source_charbuf[1024];
4018 int source_byteidx[1024];
4019
4020 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4021
4022 while (src < src_end)
4023 {
4024 unsigned char *p = src;
4025 int *source, *source_end;
4026 int i = 0;
4027
4028 if (multibytep)
4029 while (i < 1024 && p < src_end)
4030 {
4031 source_byteidx[i] = p - src;
4032 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4033 }
4034 else
4035 while (i < 1024 && p < src_end)
4036 source_charbuf[i++] = *p++;
4037
4038 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4039 ccl.last_block = 1;
4040
4041 source = source_charbuf;
4042 source_end = source + i;
4043 while (source < source_end)
4044 {
4045 ccl_driver (&ccl, source, charbuf,
4046 source_end - source, charbuf_end - charbuf);
4047 source += ccl.consumed;
4048 charbuf += ccl.produced;
4049 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4050 break;
4051 }
4052 if (source < source_end)
4053 src += source_byteidx[source - source_charbuf];
4054 else
4055 src = p;
4056 consumed_chars += source - source_charbuf;
4057
4058 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4059 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4060 break;
4061 }
4062
4063 switch (ccl.status)
4064 {
4065 case CCL_STAT_SUSPEND_BY_SRC:
4066 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4067 break;
4068 case CCL_STAT_SUSPEND_BY_DST:
4069 break;
4070 case CCL_STAT_QUIT:
4071 case CCL_STAT_INVALID_CMD:
4072 coding->result = CODING_RESULT_INTERRUPT;
4073 break;
4074 default:
4075 coding->result = CODING_RESULT_SUCCESS;
4076 break;
4077 }
4078 coding->consumed_char += consumed_chars;
4079 coding->consumed = src - coding->source;
4080 coding->charbuf_used = charbuf - coding->charbuf;
4081}
4082
4083static int
4084encode_coding_ccl (coding)
4085 struct coding_system *coding;
4086{
4087 struct ccl_program ccl;
4088 int multibytep = coding->dst_multibyte;
4089 int *charbuf = coding->charbuf;
4090 int *charbuf_end = charbuf + coding->charbuf_used;
4091 unsigned char *dst = coding->destination + coding->produced;
4092 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4093 unsigned char *adjusted_dst_end = dst_end - 1;
4094 int destination_charbuf[1024];
4095 int i, produced_chars = 0;
4096
4097 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4098
4099 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4100 ccl.dst_multibyte = coding->dst_multibyte;
4101
4102 while (charbuf < charbuf_end && dst < adjusted_dst_end)
4103 {
4104 int dst_bytes = dst_end - dst;
4105 if (dst_bytes > 1024)
4106 dst_bytes = 1024;
4107
4108 ccl_driver (&ccl, charbuf, destination_charbuf,
4109 charbuf_end - charbuf, dst_bytes);
4110 charbuf += ccl.consumed;
4111 if (multibytep)
4112 for (i = 0; i < ccl.produced; i++)
4113 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4114 else
4115 {
4116 for (i = 0; i < ccl.produced; i++)
4117 *dst++ = destination_charbuf[i] & 0xFF;
4118 produced_chars += ccl.produced;
4119 }
4120 }
4121
4122 switch (ccl.status)
4123 {
4124 case CCL_STAT_SUSPEND_BY_SRC:
4125 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4126 break;
4127 case CCL_STAT_SUSPEND_BY_DST:
4128 coding->result = CODING_RESULT_INSUFFICIENT_DST;
4129 break;
4130 case CCL_STAT_QUIT:
4131 case CCL_STAT_INVALID_CMD:
4132 coding->result = CODING_RESULT_INTERRUPT;
4133 break;
4134 default:
4135 coding->result = CODING_RESULT_SUCCESS;
4136 break;
4137 }
4138
4139 coding->produced_char += produced_chars;
4140 coding->produced = dst - coding->destination;
4141 return 0;
4ed46869
KH
4142}
4143
df7492f9
KH
4144
4145\f
4146/*** 10, 11. no-conversion handlers ***/
4147
4148/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4ed46869 4149
b73bfc1c 4150static void
df7492f9 4151decode_coding_raw_text (coding)
4ed46869 4152 struct coding_system *coding;
4ed46869 4153{
df7492f9 4154 coding->chars_at_source = 1;
2c78b7e1
KH
4155 coding->consumed_char = 0;
4156 coding->consumed = 0;
df7492f9
KH
4157 coding->result = CODING_RESULT_SUCCESS;
4158}
4ed46869 4159
df7492f9
KH
4160static int
4161encode_coding_raw_text (coding)
4162 struct coding_system *coding;
4163{
4164 int multibytep = coding->dst_multibyte;
4165 int *charbuf = coding->charbuf;
4166 int *charbuf_end = coding->charbuf + coding->charbuf_used;
4167 unsigned char *dst = coding->destination + coding->produced;
4168 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4169 int produced_chars = 0;
4170 int c;
a5d301df 4171
df7492f9 4172 if (multibytep)
b73bfc1c 4173 {
df7492f9 4174 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4ed46869 4175
df7492f9
KH
4176 if (coding->src_multibyte)
4177 while (charbuf < charbuf_end)
4178 {
4179 ASSURE_DESTINATION (safe_room);
4180 c = *charbuf++;
4181 if (ASCII_CHAR_P (c))
4182 EMIT_ONE_ASCII_BYTE (c);
4183 else if (CHAR_BYTE8_P (c))
4184 {
4185 c = CHAR_TO_BYTE8 (c);
4186 EMIT_ONE_BYTE (c);
4187 }
4188 else
4189 {
4190 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
93dec019 4191
df7492f9
KH
4192 CHAR_STRING_ADVANCE (c, p1);
4193 while (p0 < p1)
4194 EMIT_ONE_BYTE (*p0);
4195 }
4196 }
b73bfc1c 4197 else
df7492f9
KH
4198 while (charbuf < charbuf_end)
4199 {
4200 ASSURE_DESTINATION (safe_room);
4201 c = *charbuf++;
4202 EMIT_ONE_BYTE (c);
4203 }
4204 }
4205 else
4206 {
4207 if (coding->src_multibyte)
b73bfc1c 4208 {
df7492f9
KH
4209 int safe_room = MAX_MULTIBYTE_LENGTH;
4210
4211 while (charbuf < charbuf_end)
b73bfc1c 4212 {
df7492f9
KH
4213 ASSURE_DESTINATION (safe_room);
4214 c = *charbuf++;
4215 if (ASCII_CHAR_P (c))
4216 *dst++ = c;
4217 else if (CHAR_BYTE8_P (c))
4218 *dst++ = CHAR_TO_BYTE8 (c);
b73bfc1c 4219 else
df7492f9
KH
4220 CHAR_STRING_ADVANCE (c, dst);
4221 produced_chars++;
b73bfc1c 4222 }
4ed46869 4223 }
df7492f9
KH
4224 else
4225 {
4226 ASSURE_DESTINATION (charbuf_end - charbuf);
4227 while (charbuf < charbuf_end && dst < dst_end)
4228 *dst++ = *charbuf++;
4229 produced_chars = dst - (coding->destination + coding->dst_bytes);
4230 }
4ed46869 4231 }
df7492f9
KH
4232 coding->result = CODING_RESULT_SUCCESS;
4233 coding->produced_char += produced_chars;
4234 coding->produced = dst - coding->destination;
4235 return 0;
4ed46869
KH
4236}
4237
0a28aafb 4238static int
df7492f9
KH
4239detect_coding_charset (coding, mask)
4240 struct coding_system *coding;
4241 int *mask;
1397dc18 4242{
df7492f9
KH
4243 unsigned char *src = coding->source, *src_base = src;
4244 unsigned char *src_end = coding->source + coding->src_bytes;
4245 int multibytep = coding->src_multibyte;
4246 int consumed_chars = 0;
4247 Lisp_Object attrs, valids;
1397dc18 4248
df7492f9
KH
4249 coding = &coding_categories[coding_category_charset];
4250 attrs = CODING_ID_ATTRS (coding->id);
4251 valids = AREF (attrs, coding_attr_charset_valids);
4252
4253 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4254 src += coding->head_ascii;
1397dc18 4255
b73bfc1c 4256 while (1)
1397dc18 4257 {
df7492f9 4258 int c;
1397dc18 4259
df7492f9
KH
4260 ONE_MORE_BYTE (c);
4261 if (NILP (AREF (valids, c)))
4262 break;
4263 }
4264 *mask &= ~CATEGORY_MASK_CHARSET;
4265 return 0;
4ed46869 4266
df7492f9
KH
4267 no_more_source:
4268 *mask &= CATEGORY_MASK_CHARSET;
4269 return 1;
4270}
4ed46869 4271
b73bfc1c 4272static void
df7492f9 4273decode_coding_charset (coding)
4ed46869 4274 struct coding_system *coding;
4ed46869 4275{
df7492f9
KH
4276 unsigned char *src = coding->source + coding->consumed;
4277 unsigned char *src_end = coding->source + coding->src_bytes;
b73bfc1c 4278 unsigned char *src_base;
df7492f9
KH
4279 int *charbuf = coding->charbuf;
4280 int *charbuf_end = charbuf + coding->charbuf_size;
4281 int consumed_chars = 0, consumed_chars_base;
4282 int multibytep = coding->src_multibyte;
4eb6d3f1 4283 Lisp_Object attrs, eol_type, charset_list, valids;
df7492f9
KH
4284
4285 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4eb6d3f1 4286 valids = AREF (attrs, coding_attr_charset_valids);
b73bfc1c 4287
df7492f9 4288 while (1)
4ed46869 4289 {
4eb6d3f1 4290 int c;
df7492f9
KH
4291
4292 src_base = src;
4293 consumed_chars_base = consumed_chars;
b73bfc1c 4294
df7492f9
KH
4295 if (charbuf >= charbuf_end)
4296 break;
4297
4eb6d3f1 4298 ONE_MORE_BYTE (c);
df7492f9 4299 if (c == '\r')
d46c5b12 4300 {
df7492f9 4301 if (EQ (eol_type, Qdos))
b73bfc1c 4302 {
4eb6d3f1
KH
4303 if (src < src_end
4304 && *src == '\n')
df7492f9 4305 ONE_MORE_BYTE (c);
b73bfc1c 4306 }
df7492f9 4307 else if (EQ (eol_type, Qmac))
b73bfc1c 4308 c = '\n';
d46c5b12 4309 }
df7492f9 4310 else
d46c5b12 4311 {
4eb6d3f1
KH
4312 Lisp_Object val;
4313 struct charset *charset;
4314 int c1;
4315
4316 val = AREF (valids, c);
4317 if (NILP (val))
4318 goto invalid_code;
4319 charset = CHARSET_FROM_ID (XFASTINT (val));
4320 if (CHARSET_DIMENSION (charset) > 1)
4321 {
4322 ONE_MORE_BYTE (c1);
4323 c = (c << 8) | c1;
4324 if (CHARSET_DIMENSION (charset) > 2)
4325 {
4326 ONE_MORE_BYTE (c1);
4327 c = (c << 8) | c1;
4328 if (CHARSET_DIMENSION (charset) > 3)
4329 {
4330 ONE_MORE_BYTE (c1);
4331 c = (c << 8) | c1;
4332 }
4333 }
4334 }
4335 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
df7492f9
KH
4336 if (c < 0)
4337 goto invalid_code;
d46c5b12 4338 }
df7492f9
KH
4339 *charbuf++ = c;
4340 continue;
4341
4342 invalid_code:
4343 src = src_base;
4344 consumed_chars = consumed_chars_base;
4345 ONE_MORE_BYTE (c);
4346 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4347 coding->errors++;
4ed46869
KH
4348 }
4349
df7492f9
KH
4350 no_more_source:
4351 coding->consumed_char += consumed_chars_base;
4352 coding->consumed = src_base - coding->source;
4353 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
4354}
4355
df7492f9
KH
4356static int
4357encode_coding_charset (coding)
4ed46869 4358 struct coding_system *coding;
4ed46869 4359{
df7492f9
KH
4360 int multibytep = coding->dst_multibyte;
4361 int *charbuf = coding->charbuf;
4362 int *charbuf_end = charbuf + coding->charbuf_used;
4363 unsigned char *dst = coding->destination + coding->produced;
4364 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4365 int safe_room = MAX_MULTIBYTE_LENGTH;
4366 int produced_chars = 0;
df7492f9
KH
4367 Lisp_Object attrs, eol_type, charset_list;
4368 int ascii_compatible;
b73bfc1c 4369 int c;
b73bfc1c 4370
df7492f9 4371 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
df7492f9 4372 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
fb88bf2d 4373
df7492f9 4374 while (charbuf < charbuf_end)
4ed46869 4375 {
4eb6d3f1 4376 struct charset *charset;
df7492f9
KH
4377 unsigned code;
4378
4379 ASSURE_DESTINATION (safe_room);
4380 c = *charbuf++;
4381 if (ascii_compatible && ASCII_CHAR_P (c))
4382 EMIT_ONE_ASCII_BYTE (c);
d46c5b12 4383 else
4eb6d3f1
KH
4384 {
4385 charset = char_charset (c, charset_list, &code);
4386 if (charset)
4387 {
4388 if (CHARSET_DIMENSION (charset) == 1)
4389 EMIT_ONE_BYTE (code);
4390 else if (CHARSET_DIMENSION (charset) == 2)
4391 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4392 else if (CHARSET_DIMENSION (charset) == 3)
4393 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4394 else
4395 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4396 (code >> 8) & 0xFF, code & 0xFF);
4397 }
4398 else
41cbe562
KH
4399 {
4400 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4401 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4402 else
4403 c = coding->default_char;
4404 EMIT_ONE_BYTE (c);
4405 }
4eb6d3f1 4406 }
4ed46869
KH
4407 }
4408
df7492f9
KH
4409 coding->result = CODING_RESULT_SUCCESS;
4410 coding->produced_char += produced_chars;
4411 coding->produced = dst - coding->destination;
4412 return 0;
4ed46869
KH
4413}
4414
4415\f
1397dc18 4416/*** 7. C library functions ***/
4ed46869 4417
df7492f9 4418/* In Emacs Lisp, coding system is represented by a Lisp symbol which
4ed46869 4419 has a property `coding-system'. The value of this property is a
df7492f9 4420 vector of length 5 (called as coding-vector). Among elements of
4ed46869
KH
4421 this vector, the first (element[0]) and the fifth (element[4])
4422 carry important information for decoding/encoding. Before
4423 decoding/encoding, this information should be set in fields of a
4424 structure of type `coding_system'.
4425
df7492f9 4426 A value of property `coding-system' can be a symbol of another
4ed46869
KH
4427 subsidiary coding-system. In that case, Emacs gets coding-vector
4428 from that symbol.
4429
4430 `element[0]' contains information to be set in `coding->type'. The
4431 value and its meaning is as follows:
4432
0ef69138
KH
4433 0 -- coding_type_emacs_mule
4434 1 -- coding_type_sjis
df7492f9 4435 2 -- coding_type_iso_2022
0ef69138
KH
4436 3 -- coding_type_big5
4437 4 -- coding_type_ccl encoder/decoder written in CCL
4438 nil -- coding_type_no_conversion
4439 t -- coding_type_undecided (automatic conversion on decoding,
4440 no-conversion on encoding)
4ed46869
KH
4441
4442 `element[4]' contains information to be set in `coding->flags' and
4443 `coding->spec'. The meaning varies by `coding->type'.
4444
df7492f9 4445 If `coding->type' is `coding_type_iso_2022', element[4] is a vector
4ed46869
KH
4446 of length 32 (of which the first 13 sub-elements are used now).
4447 Meanings of these sub-elements are:
4448
df7492f9
KH
4449 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso_2022'
4450 If the value is an integer of valid charset, the charset is
4451 assumed to be designated to graphic register N initially.
4ed46869 4452
df7492f9
KH
4453 If the value is minus, it is a minus value of charset which
4454 reserves graphic register N, which means that the charset is
4455 not designated initially but should be designated to graphic
4456 register N just before encoding a character in that charset.
1397dc18 4457
df7492f9
KH
4458 If the value is nil, graphic register N is never used on
4459 encoding.
4460
4461 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
4462 Each value takes t or nil. See the section ISO2022 of
4463 `coding.h' for more information.
1397dc18 4464
df7492f9
KH
4465 If `coding->type' is `coding_type_big5', element[4] is t to denote
4466 BIG5-ETen or nil to denote BIG5-HKU.
4ed46869 4467
df7492f9 4468 If `coding->type' takes the other value, element[4] is ignored.
27901516 4469
df7492f9
KH
4470 Emacs Lisp's coding system also carries information about format of
4471 end-of-line in a value of property `eol-type'. If the value is
4472 integer, 0 means eol_lf, 1 means eol_crlf, and 2 means eol_cr. If
4473 it is not integer, it should be a vector of subsidiary coding
4474 systems of which property `eol-type' has one of above values.
4ed46869 4475
df7492f9 4476*/
4ed46869 4477
df7492f9
KH
4478/* Setup coding context CODING from information about CODING_SYSTEM.
4479 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4480 CODING_SYSTEM is invalid, signal an error. */
ec6d2bb8
KH
4481
4482void
df7492f9
KH
4483setup_coding_system (coding_system, coding)
4484 Lisp_Object coding_system;
ec6d2bb8
KH
4485 struct coding_system *coding;
4486{
df7492f9
KH
4487 Lisp_Object attrs;
4488 Lisp_Object eol_type;
4489 Lisp_Object coding_type;
4490 Lisp_Object val;
ec6d2bb8 4491
df7492f9
KH
4492 if (NILP (coding_system))
4493 coding_system = Qno_conversion;
4494
4495 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4496
4497 attrs = CODING_ID_ATTRS (coding->id);
4498 eol_type = CODING_ID_EOL_TYPE (coding->id);
4499
4500 coding->mode = 0;
4501 coding->head_ascii = -1;
4502 coding->common_flags
4503 = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4504
4505 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4506 coding->max_charset_id = XSTRING (val)->size - 1;
4507 coding->safe_charsets = (char *) XSTRING (val)->data;
4508 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4509
4510 coding_type = CODING_ATTR_TYPE (attrs);
4511 if (EQ (coding_type, Qundecided))
4512 {
4513 coding->detector = NULL;
4514 coding->decoder = decode_coding_raw_text;
4515 coding->encoder = encode_coding_raw_text;
4516 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4517 }
4518 else if (EQ (coding_type, Qiso_2022))
4519 {
4520 int i;
4521 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4522
4523 /* Invoke graphic register 0 to plane 0. */
4524 CODING_ISO_INVOCATION (coding, 0) = 0;
4525 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4526 CODING_ISO_INVOCATION (coding, 1)
4527 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4528 /* Setup the initial status of designation. */
4529 for (i = 0; i < 4; i++)
4530 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4531 /* Not single shifting initially. */
4532 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4533 /* Beginning of buffer should also be regarded as bol. */
4534 CODING_ISO_BOL (coding) = 1;
4535 coding->detector = detect_coding_iso_2022;
4536 coding->decoder = decode_coding_iso_2022;
4537 coding->encoder = encode_coding_iso_2022;
4538 if (flags & CODING_ISO_FLAG_SAFE)
4539 coding->mode |= CODING_MODE_SAFE_ENCODING;
4540 coding->common_flags
4541 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4542 | CODING_REQUIRE_FLUSHING_MASK);
4543 if (flags & CODING_ISO_FLAG_COMPOSITION)
4544 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4545 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4546 {
4547 setup_iso_safe_charsets (attrs);
4548 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4549 coding->max_charset_id = XSTRING (val)->size - 1;
4550 coding->safe_charsets = (char *) XSTRING (val)->data;
4551 }
4552 CODING_ISO_FLAGS (coding) = flags;
4553 }
4554 else if (EQ (coding_type, Qcharset))
4555 {
4556 coding->detector = detect_coding_charset;
4557 coding->decoder = decode_coding_charset;
4558 coding->encoder = encode_coding_charset;
4559 coding->common_flags
4560 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4561 }
4562 else if (EQ (coding_type, Qutf_8))
4563 {
4564 coding->detector = detect_coding_utf_8;
4565 coding->decoder = decode_coding_utf_8;
4566 coding->encoder = encode_coding_utf_8;
4567 coding->common_flags
4568 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4569 }
4570 else if (EQ (coding_type, Qutf_16))
4571 {
4572 val = AREF (attrs, coding_attr_utf_16_bom);
4573 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4574 : EQ (val, Qt) ? utf_16_with_bom
4575 : utf_16_without_bom);
4576 val = AREF (attrs, coding_attr_utf_16_endian);
4577 CODING_UTF_16_ENDIAN (coding) = (NILP (val) ? utf_16_big_endian
4578 : utf_16_little_endian);
e19c3639 4579 CODING_UTF_16_SURROGATE (coding) = 0;
df7492f9
KH
4580 coding->detector = detect_coding_utf_16;
4581 coding->decoder = decode_coding_utf_16;
4582 coding->encoder = encode_coding_utf_16;
4583 coding->common_flags
4584 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4585 }
4586 else if (EQ (coding_type, Qccl))
4587 {
4588 coding->detector = detect_coding_ccl;
4589 coding->decoder = decode_coding_ccl;
4590 coding->encoder = encode_coding_ccl;
4591 coding->common_flags
4592 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4593 | CODING_REQUIRE_FLUSHING_MASK);
4594 }
4595 else if (EQ (coding_type, Qemacs_mule))
4596 {
4597 coding->detector = detect_coding_emacs_mule;
4598 coding->decoder = decode_coding_emacs_mule;
4599 coding->encoder = encode_coding_emacs_mule;
4600 coding->common_flags
4601 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4602 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
4603 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
4604 {
4605 Lisp_Object tail, safe_charsets;
4606 int max_charset_id = 0;
4607
4608 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4609 tail = XCDR (tail))
4610 if (max_charset_id < XFASTINT (XCAR (tail)))
4611 max_charset_id = XFASTINT (XCAR (tail));
4612 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
4613 make_number (255));
4614 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4615 tail = XCDR (tail))
4616 XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0;
4617 coding->max_charset_id = max_charset_id;
4618 coding->safe_charsets = (char *) XSTRING (safe_charsets)->data;
4619 }
4620 }
4621 else if (EQ (coding_type, Qshift_jis))
4622 {
4623 coding->detector = detect_coding_sjis;
4624 coding->decoder = decode_coding_sjis;
4625 coding->encoder = encode_coding_sjis;
4626 coding->common_flags
4627 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4628 }
4629 else if (EQ (coding_type, Qbig5))
4630 {
4631 coding->detector = detect_coding_big5;
4632 coding->decoder = decode_coding_big5;
4633 coding->encoder = encode_coding_big5;
4634 coding->common_flags
4635 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4636 }
4637 else /* EQ (coding_type, Qraw_text) */
ec6d2bb8 4638 {
df7492f9
KH
4639 coding->detector = NULL;
4640 coding->decoder = decode_coding_raw_text;
4641 coding->encoder = encode_coding_raw_text;
4642 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
ec6d2bb8 4643 }
df7492f9
KH
4644
4645 return;
ec6d2bb8
KH
4646}
4647
df7492f9
KH
4648/* Return raw-text or one of its subsidiaries that has the same
4649 eol_type as CODING-SYSTEM. */
ec6d2bb8 4650
df7492f9
KH
4651Lisp_Object
4652raw_text_coding_system (coding_system)
4653 Lisp_Object coding_system;
ec6d2bb8 4654{
0be8721c 4655 Lisp_Object spec, attrs;
df7492f9
KH
4656 Lisp_Object eol_type, raw_text_eol_type;
4657
4658 spec = CODING_SYSTEM_SPEC (coding_system);
4659 attrs = AREF (spec, 0);
4660
4661 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
4662 return coding_system;
ec6d2bb8 4663
df7492f9
KH
4664 eol_type = AREF (spec, 2);
4665 if (VECTORP (eol_type))
4666 return Qraw_text;
4667 spec = CODING_SYSTEM_SPEC (Qraw_text);
4668 raw_text_eol_type = AREF (spec, 2);
4669 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
4670 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
4671 : AREF (raw_text_eol_type, 2));
ec6d2bb8
KH
4672}
4673
54f78171 4674
df7492f9
KH
4675/* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
4676 does, return one of the subsidiary that has the same eol-spec as
4677 PARENT. Otherwise, return CODING_SYSTEM. */
4678
4679Lisp_Object
4680coding_inherit_eol_type (coding_system, parent)
b74e4686 4681 Lisp_Object coding_system, parent;
54f78171 4682{
df7492f9 4683 Lisp_Object spec, attrs, eol_type;
54f78171 4684
df7492f9
KH
4685 spec = CODING_SYSTEM_SPEC (coding_system);
4686 attrs = AREF (spec, 0);
4687 eol_type = AREF (spec, 2);
4688 if (VECTORP (eol_type))
4689 {
4690 Lisp_Object parent_spec;
df7492f9
KH
4691 Lisp_Object parent_eol_type;
4692
4693 parent_spec
4694 = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
4695 parent_eol_type = AREF (parent_spec, 2);
4696 if (EQ (parent_eol_type, Qunix))
4697 coding_system = AREF (eol_type, 0);
4698 else if (EQ (parent_eol_type, Qdos))
4699 coding_system = AREF (eol_type, 1);
4700 else if (EQ (parent_eol_type, Qmac))
4701 coding_system = AREF (eol_type, 2);
54f78171 4702 }
df7492f9 4703 return coding_system;
54f78171
KH
4704}
4705
4ed46869
KH
4706/* Emacs has a mechanism to automatically detect a coding system if it
4707 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
4708 it's impossible to distinguish some coding systems accurately
4709 because they use the same range of codes. So, at first, coding
4710 systems are categorized into 7, those are:
4711
0ef69138 4712 o coding-category-emacs-mule
4ed46869
KH
4713
4714 The category for a coding system which has the same code range
4715 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 4716 symbol) `emacs-mule' by default.
4ed46869
KH
4717
4718 o coding-category-sjis
4719
4720 The category for a coding system which has the same code range
4721 as SJIS. Assigned the coding-system (Lisp
7717c392 4722 symbol) `japanese-shift-jis' by default.
4ed46869
KH
4723
4724 o coding-category-iso-7
4725
4726 The category for a coding system which has the same code range
7717c392 4727 as ISO2022 of 7-bit environment. This doesn't use any locking
d46c5b12
KH
4728 shift and single shift functions. This can encode/decode all
4729 charsets. Assigned the coding-system (Lisp symbol)
4730 `iso-2022-7bit' by default.
4731
4732 o coding-category-iso-7-tight
4733
4734 Same as coding-category-iso-7 except that this can
4735 encode/decode only the specified charsets.
4ed46869
KH
4736
4737 o coding-category-iso-8-1
4738
4739 The category for a coding system which has the same code range
4740 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
4741 for DIMENSION1 charset. This doesn't use any locking shift
4742 and single shift functions. Assigned the coding-system (Lisp
4743 symbol) `iso-latin-1' by default.
4ed46869
KH
4744
4745 o coding-category-iso-8-2
4746
4747 The category for a coding system which has the same code range
4748 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
4749 for DIMENSION2 charset. This doesn't use any locking shift
4750 and single shift functions. Assigned the coding-system (Lisp
4751 symbol) `japanese-iso-8bit' by default.
4ed46869 4752
7717c392 4753 o coding-category-iso-7-else
4ed46869
KH
4754
4755 The category for a coding system which has the same code range
df7492f9 4756 as ISO2022 of 7-bit environemnt but uses locking shift or
7717c392
KH
4757 single shift functions. Assigned the coding-system (Lisp
4758 symbol) `iso-2022-7bit-lock' by default.
4759
4760 o coding-category-iso-8-else
4761
4762 The category for a coding system which has the same code range
df7492f9 4763 as ISO2022 of 8-bit environemnt but uses locking shift or
7717c392
KH
4764 single shift functions. Assigned the coding-system (Lisp
4765 symbol) `iso-2022-8bit-ss2' by default.
4ed46869
KH
4766
4767 o coding-category-big5
4768
4769 The category for a coding system which has the same code range
4770 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 4771 `cn-big5' by default.
4ed46869 4772
fa42c37f
KH
4773 o coding-category-utf-8
4774
4775 The category for a coding system which has the same code range
4776 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
4777 symbol) `utf-8' by default.
4778
4779 o coding-category-utf-16-be
4780
4781 The category for a coding system in which a text has an
4782 Unicode signature (cf. Unicode Standard) in the order of BIG
4783 endian at the head. Assigned the coding-system (Lisp symbol)
4784 `utf-16-be' by default.
4785
4786 o coding-category-utf-16-le
4787
4788 The category for a coding system in which a text has an
4789 Unicode signature (cf. Unicode Standard) in the order of
4790 LITTLE endian at the head. Assigned the coding-system (Lisp
4791 symbol) `utf-16-le' by default.
4792
1397dc18
KH
4793 o coding-category-ccl
4794
4795 The category for a coding system of which encoder/decoder is
4796 written in CCL programs. The default value is nil, i.e., no
4797 coding system is assigned.
4798
4ed46869
KH
4799 o coding-category-binary
4800
4801 The category for a coding system not categorized in any of the
4802 above. Assigned the coding-system (Lisp symbol)
e0e989f6 4803 `no-conversion' by default.
4ed46869
KH
4804
4805 Each of them is a Lisp symbol and the value is an actual
df7492f9 4806 `coding-system's (this is also a Lisp symbol) assigned by a user.
4ed46869
KH
4807 What Emacs does actually is to detect a category of coding system.
4808 Then, it uses a `coding-system' assigned to it. If Emacs can't
df7492f9 4809 decide only one possible category, it selects a category of the
4ed46869
KH
4810 highest priority. Priorities of categories are also specified by a
4811 user in a Lisp variable `coding-category-list'.
4812
4813*/
4814
df7492f9
KH
4815#define EOL_SEEN_NONE 0
4816#define EOL_SEEN_LF 1
4817#define EOL_SEEN_CR 2
4818#define EOL_SEEN_CRLF 4
4ed46869 4819
df7492f9
KH
4820/* Detect how end-of-line of a text of length CODING->src_bytes
4821 pointed by CODING->source is encoded. Return one of
4822 EOL_SEEN_XXX. */
4ed46869 4823
bc4bc72a
RS
4824#define MAX_EOL_CHECK_COUNT 3
4825
d46c5b12 4826static int
df7492f9
KH
4827detect_eol (coding, source, src_bytes)
4828 struct coding_system *coding;
d46c5b12 4829 unsigned char *source;
df7492f9 4830 EMACS_INT src_bytes;
4ed46869 4831{
df7492f9 4832 Lisp_Object attrs, coding_type;
d46c5b12 4833 unsigned char *src = source, *src_end = src + src_bytes;
4ed46869 4834 unsigned char c;
df7492f9
KH
4835 int total = 0;
4836 int eol_seen = EOL_SEEN_NONE;
4ed46869 4837
df7492f9
KH
4838 attrs = CODING_ID_ATTRS (coding->id);
4839 coding_type = CODING_ATTR_TYPE (attrs);
d46c5b12 4840
df7492f9 4841 if (EQ (coding_type, Qccl))
4ed46869 4842 {
df7492f9 4843 int msb, lsb;
fa42c37f 4844
df7492f9
KH
4845 msb = coding->spec.utf_16.endian == utf_16_little_endian;
4846 lsb = 1 - msb;
fa42c37f 4847
df7492f9 4848 while (src + 1 < src_end)
fa42c37f 4849 {
df7492f9
KH
4850 c = src[lsb];
4851 if (src[msb] == 0 && (c == '\n' || c == '\r'))
fa42c37f 4852 {
df7492f9
KH
4853 int this_eol;
4854
4855 if (c == '\n')
4856 this_eol = EOL_SEEN_LF;
4857 else if (src + 3 >= src_end
4858 || src[msb + 2] != 0
4859 || src[lsb + 2] != '\n')
4860 this_eol = EOL_SEEN_CR;
fa42c37f 4861 else
df7492f9
KH
4862 this_eol = EOL_SEEN_CRLF;
4863
4864 if (eol_seen == EOL_SEEN_NONE)
4865 /* This is the first end-of-line. */
4866 eol_seen = this_eol;
4867 else if (eol_seen != this_eol)
fa42c37f 4868 {
df7492f9
KH
4869 /* The found type is different from what found before. */
4870 eol_seen = EOL_SEEN_LF;
4871 break;
fa42c37f 4872 }
df7492f9
KH
4873 if (++total == MAX_EOL_CHECK_COUNT)
4874 break;
fa42c37f 4875 }
df7492f9 4876 src += 2;
fa42c37f 4877 }
df7492f9 4878 }
d46c5b12 4879 else
27901516 4880 {
df7492f9 4881 while (src < src_end)
27901516 4882 {
df7492f9
KH
4883 c = *src++;
4884 if (c == '\n' || c == '\r')
4885 {
4886 int this_eol;
d46c5b12 4887
df7492f9
KH
4888 if (c == '\n')
4889 this_eol = EOL_SEEN_LF;
4890 else if (src >= src_end || *src != '\n')
4891 this_eol = EOL_SEEN_CR;
4892 else
4893 this_eol = EOL_SEEN_CRLF, src++;
d46c5b12 4894
df7492f9
KH
4895 if (eol_seen == EOL_SEEN_NONE)
4896 /* This is the first end-of-line. */
4897 eol_seen = this_eol;
4898 else if (eol_seen != this_eol)
4899 {
4900 /* The found type is different from what found before. */
4901 eol_seen = EOL_SEEN_LF;
4902 break;
4903 }
4904 if (++total == MAX_EOL_CHECK_COUNT)
4905 break;
4906 }
4907 }
73be902c 4908 }
df7492f9 4909 return eol_seen;
73be902c
KH
4910}
4911
df7492f9 4912
73be902c 4913static void
df7492f9
KH
4914adjust_coding_eol_type (coding, eol_seen)
4915 struct coding_system *coding;
4916 int eol_seen;
73be902c 4917{
0be8721c 4918 Lisp_Object eol_type;
df7492f9
KH
4919
4920 eol_type = CODING_ID_EOL_TYPE (coding->id);
4921 if (eol_seen & EOL_SEEN_LF)
4922 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
4923 else if (eol_type & EOL_SEEN_CRLF)
4924 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
4925 else if (eol_type & EOL_SEEN_CR)
4926 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
d46c5b12
KH
4927}
4928
df7492f9
KH
4929/* Detect how a text specified in CODING is encoded. If a coding
4930 system is detected, update fields of CODING by the detected coding
4931 system. */
4932
4933void
4934detect_coding (coding)
d46c5b12 4935 struct coding_system *coding;
d46c5b12 4936{
df7492f9
KH
4937 unsigned char *src, *src_end;
4938 Lisp_Object attrs, coding_type;
d46c5b12 4939
df7492f9
KH
4940 coding->consumed = coding->consumed_char = 0;
4941 coding->produced = coding->produced_char = 0;
4942 coding_set_source (coding);
1c3478b0 4943
df7492f9 4944 src_end = coding->source + coding->src_bytes;
1c3478b0 4945
df7492f9
KH
4946 /* If we have not yet decided the text encoding type, detect it
4947 now. */
4948 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
b73bfc1c 4949 {
df7492f9
KH
4950 int mask = CATEGORY_MASK_ANY;
4951 int c, i;
4952
4953 for (src = coding->source; src < src_end; src++)
4954 {
4955 c = *src;
4956 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
4957 || c == ISO_CODE_SI
4958 || c == ISO_CODE_SO)))
4959 break;
4960 }
4961 coding->head_ascii = src - (coding->source + coding->consumed);
4962
4963 if (coding->head_ascii < coding->src_bytes)
1c3478b0 4964 {
df7492f9
KH
4965 int detected = 0;
4966
4967 for (i = 0; i < coding_category_raw_text; i++)
1c3478b0 4968 {
df7492f9
KH
4969 enum coding_category category = coding_priorities[i];
4970 struct coding_system *this = coding_categories + category;
4971
4972 if (category >= coding_category_raw_text
4973 || detected & (1 << category))
4974 continue;
4975
4976 if (this->id < 0)
1c3478b0 4977 {
df7492f9
KH
4978 /* No coding system of this category is defined. */
4979 mask &= ~(1 << category);
4980 }
4981 else
4982 {
4983 detected |= detected_mask[category];
4984 if ((*(this->detector)) (coding, &mask))
4985 break;
1c3478b0
KH
4986 }
4987 }
df7492f9
KH
4988 if (! mask)
4989 setup_coding_system (Qraw_text, coding);
4990 else if (mask != CATEGORY_MASK_ANY)
4991 for (i = 0; i < coding_category_raw_text; i++)
4992 {
4993 enum coding_category category = coding_priorities[i];
4994 struct coding_system *this = coding_categories + category;
4995
4996 if (mask & (1 << category))
4997 {
4998 setup_coding_system (CODING_ID_NAME (this->id), coding);
4999 break;
5000 }
5001 }
1c3478b0 5002 }
b73bfc1c 5003 }
69f76525 5004
df7492f9
KH
5005 attrs = CODING_ID_ATTRS (coding->id);
5006 coding_type = CODING_ATTR_TYPE (attrs);
5007
5008 /* If we have not yet decided the EOL type, detect it now. But, the
5009 detection is impossible for a CCL based coding system, in which
5010 case, we detct the EOL type after decoding. */
5011 if (VECTORP (CODING_ID_EOL_TYPE (coding->id))
5012 && ! EQ (coding_type, Qccl))
d46c5b12 5013 {
df7492f9
KH
5014 int eol_seen = detect_eol (coding, coding->source, coding->src_bytes);
5015
5016 if (eol_seen != EOL_SEEN_NONE)
5017 adjust_coding_eol_type (coding, eol_seen);
d46c5b12 5018 }
4ed46869
KH
5019}
5020
aaaf0b1e
KH
5021
5022static void
df7492f9 5023decode_eol (coding)
aaaf0b1e 5024 struct coding_system *coding;
aaaf0b1e 5025{
df7492f9 5026 if (VECTORP (CODING_ID_EOL_TYPE (coding->id)))
aaaf0b1e 5027 {
df7492f9
KH
5028 unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5029 unsigned char *pend = p + coding->produced;
5030 int eol_seen = EOL_SEEN_NONE;
aaaf0b1e 5031
df7492f9 5032 for (; p < pend; p++)
aaaf0b1e 5033 {
df7492f9
KH
5034 if (*p == '\n')
5035 eol_seen |= EOL_SEEN_LF;
5036 else if (*p == '\r')
aaaf0b1e 5037 {
df7492f9 5038 if (p + 1 < pend && *(p + 1) == '\n')
aaaf0b1e 5039 {
df7492f9
KH
5040 eol_seen |= EOL_SEEN_CRLF;
5041 p++;
aaaf0b1e 5042 }
aaaf0b1e 5043 else
df7492f9 5044 eol_seen |= EOL_SEEN_CR;
aaaf0b1e 5045 }
aaaf0b1e 5046 }
df7492f9
KH
5047 if (eol_seen != EOL_SEEN_NONE)
5048 adjust_coding_eol_type (coding, eol_seen);
aaaf0b1e 5049 }
aaaf0b1e 5050
df7492f9
KH
5051 if (EQ (CODING_ID_EOL_TYPE (coding->id), Qmac))
5052 {
5053 unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5054 unsigned char *pend = p + coding->produced;
5055
5056 for (; p < pend; p++)
5057 if (*p == '\r')
5058 *p = '\n';
5059 }
5060 else if (EQ (CODING_ID_EOL_TYPE (coding->id), Qdos))
5061 {
5062 unsigned char *p, *pbeg, *pend;
5063 Lisp_Object undo_list;
5064
5065 move_gap_both (coding->dst_pos + coding->produced_char,
5066 coding->dst_pos_byte + coding->produced);
5067 undo_list = current_buffer->undo_list;
5068 current_buffer->undo_list = Qt;
5069 del_range_2 (coding->dst_pos, coding->dst_pos_byte, GPT, GPT_BYTE, Qnil);
5070 current_buffer->undo_list = undo_list;
5071 pbeg = GPT_ADDR;
5072 pend = pbeg + coding->produced;
5073
5074 for (p = pend - 1; p >= pbeg; p--)
5075 if (*p == '\r')
5076 {
5077 safe_bcopy ((char *) (p + 1), (char *) p, pend - p - 1);
5078 pend--;
5079 }
5080 coding->produced_char -= coding->produced - (pend - pbeg);
5081 coding->produced = pend - pbeg;
5082 insert_from_gap (coding->produced_char, coding->produced);
aaaf0b1e
KH
5083 }
5084}
5085
df7492f9
KH
5086static void
5087translate_chars (coding, table)
4ed46869 5088 struct coding_system *coding;
df7492f9 5089 Lisp_Object table;
4ed46869 5090{
df7492f9
KH
5091 int *charbuf = coding->charbuf;
5092 int *charbuf_end = charbuf + coding->charbuf_used;
5093 int c;
5094
5095 if (coding->chars_at_source)
5096 return;
4ed46869 5097
df7492f9 5098 while (charbuf < charbuf_end)
8844fa83 5099 {
df7492f9
KH
5100 c = *charbuf;
5101 if (c < 0)
5102 charbuf += c;
5103 else
5104 *charbuf++ = translate_char (table, c);
8844fa83 5105 }
df7492f9 5106}
4ed46869 5107
df7492f9
KH
5108static int
5109produce_chars (coding)
5110 struct coding_system *coding;
5111{
5112 unsigned char *dst = coding->destination + coding->produced;
5113 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5114 int produced;
5115 int produced_chars = 0;
b73bfc1c 5116
df7492f9 5117 if (! coding->chars_at_source)
4ed46869 5118 {
df7492f9
KH
5119 /* Characters are in coding->charbuf. */
5120 int *buf = coding->charbuf;
5121 int *buf_end = buf + coding->charbuf_used;
5122 unsigned char *adjusted_dst_end;
4ed46869 5123
df7492f9
KH
5124 if (BUFFERP (coding->src_object)
5125 && EQ (coding->src_object, coding->dst_object))
5126 dst_end = coding->source + coding->consumed;
5127 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
4ed46869 5128
df7492f9
KH
5129 while (buf < buf_end)
5130 {
5131 int c = *buf++;
5132
5133 if (dst >= adjusted_dst_end)
5134 {
5135 dst = alloc_destination (coding,
5136 buf_end - buf + MAX_MULTIBYTE_LENGTH,
5137 dst);
5138 dst_end = coding->destination + coding->dst_bytes;
5139 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5140 }
5141 if (c >= 0)
5142 {
5143 if (coding->dst_multibyte
5144 || ! CHAR_BYTE8_P (c))
5145 CHAR_STRING_ADVANCE (c, dst);
5146 else
5147 *dst++ = CHAR_TO_BYTE8 (c);
5148 produced_chars++;
5149 }
5150 else
5151 /* This is an annotation data. */
5152 buf -= c + 1;
5153 }
5154 }
5155 else
5156 {
df7492f9
KH
5157 unsigned char *src = coding->source;
5158 unsigned char *src_end = src + coding->src_bytes;
5159 Lisp_Object eol_type;
b73bfc1c 5160
df7492f9 5161 eol_type = CODING_ID_EOL_TYPE (coding->id);
4ed46869 5162
df7492f9 5163 if (coding->src_multibyte != coding->dst_multibyte)
aaaf0b1e 5164 {
df7492f9
KH
5165 if (coding->src_multibyte)
5166 {
71c81426 5167 int multibytep = 1;
df7492f9 5168 int consumed_chars;
d46c5b12 5169
df7492f9
KH
5170 while (1)
5171 {
5172 unsigned char *src_base = src;
5173 int c;
b73bfc1c 5174
df7492f9
KH
5175 ONE_MORE_BYTE (c);
5176 if (c == '\r')
5177 {
5178 if (EQ (eol_type, Qdos))
5179 {
5180 if (src < src_end
5181 && *src == '\n')
5182 c = *src++;
5183 }
5184 else if (EQ (eol_type, Qmac))
5185 c = '\n';
5186 }
5187 if (dst == dst_end)
5188 {
2c78b7e1 5189 coding->consumed = src - coding->source;
b73bfc1c 5190
2c78b7e1
KH
5191 if (EQ (coding->src_object, coding->dst_object))
5192 dst_end = src;
5193 if (dst == dst_end)
5194 {
5195 dst = alloc_destination (coding, src_end - src + 1,
5196 dst);
5197 dst_end = coding->destination + coding->dst_bytes;
5198 coding_set_source (coding);
5199 src = coding->source + coding->consumed;
5200 src_end = coding->source + coding->src_bytes;
5201 }
df7492f9
KH
5202 }
5203 *dst++ = c;
5204 produced_chars++;
5205 }
5206 no_more_source:
5207 ;
5208 }
5209 else
5210 while (src < src_end)
5211 {
71c81426 5212 int multibytep = 1;
df7492f9 5213 int c = *src++;
b73bfc1c 5214
df7492f9
KH
5215 if (c == '\r')
5216 {
5217 if (EQ (eol_type, Qdos))
5218 {
5219 if (src < src_end
5220 && *src == '\n')
5221 c = *src++;
5222 }
5223 else if (EQ (eol_type, Qmac))
5224 c = '\n';
5225 }
5226 if (dst >= dst_end - 1)
5227 {
2c78b7e1 5228 coding->consumed = src - coding->source;
df7492f9 5229
2c78b7e1
KH
5230 if (EQ (coding->src_object, coding->dst_object))
5231 dst_end = src;
5232 if (dst >= dst_end - 1)
5233 {
5234 dst = alloc_destination (coding, src_end - src + 2,
5235 dst);
5236 dst_end = coding->destination + coding->dst_bytes;
5237 coding_set_source (coding);
5238 src = coding->source + coding->consumed;
5239 src_end = coding->source + coding->src_bytes;
5240 }
df7492f9
KH
5241 }
5242 EMIT_ONE_BYTE (c);
5243 }
d46c5b12 5244 }
df7492f9
KH
5245 else
5246 {
5247 if (!EQ (coding->src_object, coding->dst_object))
5248 {
5249 int require = coding->src_bytes - coding->dst_bytes;
4ed46869 5250
df7492f9
KH
5251 if (require > 0)
5252 {
5253 EMACS_INT offset = src - coding->source;
5254
5255 dst = alloc_destination (coding, require, dst);
5256 coding_set_source (coding);
5257 src = coding->source + offset;
5258 src_end = coding->source + coding->src_bytes;
5259 }
5260 }
5261 produced_chars = coding->src_chars;
5262 while (src < src_end)
5263 {
5264 int c = *src++;
5265
5266 if (c == '\r')
5267 {
5268 if (EQ (eol_type, Qdos))
5269 {
5270 if (src < src_end
5271 && *src == '\n')
5272 c = *src++;
5273 produced_chars--;
5274 }
5275 else if (EQ (eol_type, Qmac))
5276 c = '\n';
5277 }
5278 *dst++ = c;
5279 }
5280 }
2c78b7e1
KH
5281 coding->consumed = coding->src_bytes;
5282 coding->consumed_char = coding->src_chars;
b73bfc1c 5283 }
4ed46869 5284
df7492f9
KH
5285 produced = dst - (coding->destination + coding->produced);
5286 if (BUFFERP (coding->dst_object))
5287 insert_from_gap (produced_chars, produced);
5288 coding->produced += produced;
5289 coding->produced_char += produced_chars;
5290 return produced_chars;
b73bfc1c 5291}
52d41803 5292
df7492f9
KH
5293/* [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN ]
5294 or
5295 [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN COMPONENTS... ]
5296 */
4ed46869 5297
df7492f9
KH
5298static INLINE void
5299produce_composition (coding, charbuf)
4ed46869 5300 struct coding_system *coding;
df7492f9 5301 int *charbuf;
4ed46869 5302{
df7492f9
KH
5303 Lisp_Object buffer;
5304 int len;
5305 EMACS_INT pos;
5306 enum composition_method method;
5307 int cmp_len;
5308 Lisp_Object components;
5309
5310 buffer = coding->dst_object;
5311 len = -charbuf[0];
5312 pos = coding->dst_pos + charbuf[1];
5313 method = (enum composition_method) (charbuf[3]);
5314 cmp_len = charbuf[4];
5315
5316 if (method == COMPOSITION_RELATIVE)
5317 components = Qnil;
5318 else
d46c5b12 5319 {
df7492f9
KH
5320 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5321 int i;
4ed46869 5322
df7492f9
KH
5323 len -= 5;
5324 charbuf += 5;
5325 for (i = 0; i < len; i++)
5326 args[i] = make_number (charbuf[i]);
5327 components = (method == COMPOSITION_WITH_ALTCHARS
5328 ? Fstring (len, args) : Fvector (len, args));
5329 }
5330 compose_text (pos, pos + cmp_len, components, Qnil, Qnil);
5331}
b73bfc1c 5332
df7492f9
KH
5333static int *
5334save_composition_data (buf, buf_end, prop)
5335 int *buf, *buf_end;
5336 Lisp_Object prop;
5337{
5338 enum composition_method method = COMPOSITION_METHOD (prop);
5339 int cmp_len = COMPOSITION_LENGTH (prop);
4ed46869 5340
df7492f9
KH
5341 if (buf + 4 + (MAX_COMPOSITION_COMPONENTS * 2 - 1) > buf_end)
5342 return NULL;
d46c5b12 5343
df7492f9
KH
5344 buf[1] = CODING_ANNOTATE_COMPOSITION_MASK;
5345 buf[2] = method;
5346 buf[3] = cmp_len;
b73bfc1c 5347
df7492f9
KH
5348 if (method == COMPOSITION_RELATIVE)
5349 buf[0] = 4;
5350 else
b73bfc1c 5351 {
df7492f9
KH
5352 Lisp_Object components;
5353 int len, i;
b73bfc1c 5354
df7492f9
KH
5355 components = COMPOSITION_COMPONENTS (prop);
5356 if (VECTORP (components))
d46c5b12 5357 {
df7492f9
KH
5358 len = XVECTOR (components)->size;
5359 for (i = 0; i < len; i++)
5360 buf[4 + i] = XINT (AREF (components, i));
5361 }
5362 else if (STRINGP (components))
5363 {
5364 int i_byte;
b73bfc1c 5365
df7492f9
KH
5366 len = XSTRING (components)->size;
5367 i = i_byte = 0;
5368 while (i < len)
5369 FETCH_STRING_CHAR_ADVANCE (buf[4 + i], components, i, i_byte);
5370 }
5371 else if (INTEGERP (components))
5372 {
5373 len = 1;
5374 buf[4] = XINT (components);
5375 }
5376 else if (CONSP (components))
5377 {
5378 for (len = 0; CONSP (components);
5379 len++, components = XCDR (components))
5380 buf[4 + len] = XINT (XCAR (components));
d46c5b12 5381 }
df7492f9
KH
5382 else
5383 abort ();
5384 buf[0] = 4 + len;
4ed46869 5385 }
df7492f9 5386 return (buf + buf[0]);
4ed46869
KH
5387}
5388
df7492f9
KH
5389#define CHARBUF_SIZE 0x4000
5390
5391#define ALLOC_CONVERSION_WORK_AREA(coding) \
5392 do { \
5393 int size = CHARBUF_SIZE;; \
5394 \
5395 coding->charbuf = NULL; \
5396 while (size > 1024) \
5397 { \
5398 coding->charbuf = (int *) alloca (sizeof (int) * size); \
5399 if (coding->charbuf) \
5400 break; \
5401 size >>= 1; \
5402 } \
5403 if (! coding->charbuf) \
5404 { \
5405 coding->result = CODING_RESULT_INSUFFICIENT_MEM; \
5406 return coding->result; \
5407 } \
5408 coding->charbuf_size = size; \
5409 } while (0)
4ed46869 5410
d46c5b12
KH
5411
5412static void
df7492f9 5413produce_annotation (coding)
d46c5b12 5414 struct coding_system *coding;
d46c5b12 5415{
df7492f9
KH
5416 int *charbuf = coding->charbuf;
5417 int *charbuf_end = charbuf + coding->charbuf_used;
d46c5b12 5418
df7492f9 5419 while (charbuf < charbuf_end)
d46c5b12 5420 {
df7492f9
KH
5421 if (*charbuf >= 0)
5422 charbuf++;
d46c5b12 5423 else
d46c5b12 5424 {
df7492f9
KH
5425 int len = -*charbuf;
5426 switch (charbuf[2])
5427 {
5428 case CODING_ANNOTATE_COMPOSITION_MASK:
5429 produce_composition (coding, charbuf);
5430 break;
5431 default:
5432 abort ();
5433 }
5434 charbuf += len;
d46c5b12 5435 }
df7492f9
KH
5436 }
5437}
d46c5b12 5438
df7492f9
KH
5439/* Decode the data at CODING->src_object into CODING->dst_object.
5440 CODING->src_object is a buffer, a string, or nil.
5441 CODING->dst_object is a buffer.
de79a6a5 5442
df7492f9
KH
5443 If CODING->src_object is a buffer, it must be the current buffer.
5444 In this case, if CODING->src_pos is positive, it is a position of
5445 the source text in the buffer, otherwise, the source text is in the
5446 gap area of the buffer, and CODING->src_pos specifies the offset of
5447 the text from GPT (which must be the same as PT). If this is the
5448 same buffer as CODING->dst_object, CODING->src_pos must be
5449 negative.
b73bfc1c 5450
df7492f9
KH
5451 If CODING->src_object is a string, CODING->src_pos in an index to
5452 that string.
d46c5b12 5453
df7492f9
KH
5454 If CODING->src_object is nil, CODING->source must already point to
5455 the non-relocatable memory area. In this case, CODING->src_pos is
5456 an offset from CODING->source.
d46c5b12 5457
df7492f9
KH
5458 The decoded data is inserted at the current point of the buffer
5459 CODING->dst_object.
5460*/
5461
5462static int
5463decode_coding (coding)
d46c5b12 5464 struct coding_system *coding;
d46c5b12 5465{
df7492f9 5466 Lisp_Object attrs;
d46c5b12 5467
df7492f9
KH
5468 if (BUFFERP (coding->src_object)
5469 && coding->src_pos > 0
5470 && coding->src_pos < GPT
5471 && coding->src_pos + coding->src_chars > GPT)
5472 move_gap_both (coding->src_pos, coding->src_pos_byte);
d46c5b12 5473
df7492f9 5474 if (BUFFERP (coding->dst_object))
88993dfd 5475 {
df7492f9
KH
5476 if (current_buffer != XBUFFER (coding->dst_object))
5477 set_buffer_internal (XBUFFER (coding->dst_object));
5478 if (GPT != PT)
5479 move_gap_both (PT, PT_BYTE);
88993dfd
KH
5480 }
5481
df7492f9
KH
5482 coding->consumed = coding->consumed_char = 0;
5483 coding->produced = coding->produced_char = 0;
5484 coding->chars_at_source = 0;
5485 coding->result = CODING_RESULT_SUCCESS;
5486 coding->errors = 0;
5487
5488 ALLOC_CONVERSION_WORK_AREA (coding);
5489
5490 attrs = CODING_ID_ATTRS (coding->id);
5491
5492 do
d46c5b12 5493 {
df7492f9
KH
5494 coding_set_source (coding);
5495 coding->annotated = 0;
5496 (*(coding->decoder)) (coding);
5497 if (!NILP (CODING_ATTR_DECODE_TBL (attrs)))
5498 translate_chars (CODING_ATTR_DECODE_TBL (attrs), coding);
5499 coding_set_destination (coding);
5500 produce_chars (coding);
5501 if (coding->annotated)
5502 produce_annotation (coding);
d46c5b12 5503 }
df7492f9
KH
5504 while (coding->consumed < coding->src_bytes
5505 && ! coding->result);
d46c5b12 5506
df7492f9
KH
5507 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qccl)
5508 && SYMBOLP (CODING_ID_EOL_TYPE (coding->id))
5509 && ! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
5510 decode_eol (coding);
d46c5b12 5511
df7492f9
KH
5512 coding->carryover_bytes = 0;
5513 if (coding->consumed < coding->src_bytes)
d46c5b12 5514 {
df7492f9
KH
5515 int nbytes = coding->src_bytes - coding->consumed;
5516 unsigned char *src;
5517
5518 coding_set_source (coding);
5519 coding_set_destination (coding);
5520 src = coding->source + coding->consumed;
5521
5522 if (coding->mode & CODING_MODE_LAST_BLOCK)
d46c5b12 5523 {
df7492f9
KH
5524 /* Flush out unprocessed data as binary chars. We are sure
5525 that the number of data is less than the size of
5526 coding->charbuf. */
5527 int *charbuf = coding->charbuf;
5528
5529 while (nbytes-- > 0)
d46c5b12 5530 {
df7492f9
KH
5531 int c = *src++;
5532 *charbuf++ = (c & 0x80 ? - c : c);
d46c5b12 5533 }
df7492f9 5534 produce_chars (coding);
d46c5b12 5535 }
d46c5b12 5536 else
df7492f9
KH
5537 {
5538 /* Record unprocessed bytes in coding->carryover. We are
5539 sure that the number of data is less than the size of
5540 coding->carryover. */
5541 unsigned char *p = coding->carryover;
5542
5543 coding->carryover_bytes = nbytes;
5544 while (nbytes-- > 0)
5545 *p++ = *src++;
5546 }
5547 coding->consumed = coding->src_bytes;
5548 }
b73bfc1c 5549
df7492f9 5550 return coding->result;
d46c5b12
KH
5551}
5552
df7492f9
KH
5553static void
5554consume_chars (coding)
5555 struct coding_system *coding;
5556{
5557 int *buf = coding->charbuf;
5558 /* -1 is to compensate for CRLF. */
5559 int *buf_end = coding->charbuf + coding->charbuf_size - 1;
5560 unsigned char *src = coding->source + coding->consumed;
5561 int pos = coding->src_pos + coding->consumed_char;
5562 int end_pos = coding->src_pos + coding->src_chars;
5563 int multibytep = coding->src_multibyte;
5564 Lisp_Object eol_type;
5565 int c;
5566 int start, end, stop;
5567 Lisp_Object object, prop;
88993dfd 5568
df7492f9
KH
5569 eol_type = CODING_ID_EOL_TYPE (coding->id);
5570 if (VECTORP (eol_type))
5571 eol_type = Qunix;
88993dfd 5572
df7492f9 5573 object = coding->src_object;
b843d1ae 5574
df7492f9
KH
5575 /* Note: composition handling is not yet implemented. */
5576 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
ec6d2bb8 5577
df7492f9
KH
5578 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK
5579 && find_composition (pos, end_pos, &start, &end, &prop, object)
5580 && end <= end_pos
5581 && (start >= pos
5582 || (find_composition (end, end_pos, &start, &end, &prop, object)
5583 && end <= end_pos)))
5584 stop = start;
5585 else
5586 stop = end_pos;
ec6d2bb8 5587
df7492f9 5588 while (buf < buf_end)
ec6d2bb8 5589 {
df7492f9 5590 if (pos == stop)
ec6d2bb8 5591 {
df7492f9 5592 int *p;
ec6d2bb8 5593
df7492f9
KH
5594 if (pos == end_pos)
5595 break;
5596 p = save_composition_data (buf, buf_end, prop);
5597 if (p == NULL)
5598 break;
5599 buf = p;
5600 if (find_composition (end, end_pos, &start, &end, &prop, object)
5601 && end <= end_pos)
5602 stop = start;
5603 else
5604 stop = end_pos;
5605 }
5606
5607 if (! multibytep)
5608 c = *src++;
5609 else
5610 c = STRING_CHAR_ADVANCE (src);
5611 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
5612 c = '\n';
5613 if (! EQ (eol_type, Qunix))
5614 {
5615 if (c == '\n')
5616 {
5617 if (EQ (eol_type, Qdos))
5618 *buf++ = '\r';
5619 else
5620 c = '\r';
ec6d2bb8 5621 }
ec6d2bb8 5622 }
df7492f9
KH
5623 *buf++ = c;
5624 pos++;
ec6d2bb8 5625 }
ec6d2bb8 5626
df7492f9
KH
5627 coding->consumed = src - coding->source;
5628 coding->consumed_char = pos - coding->src_pos;
5629 coding->charbuf_used = buf - coding->charbuf;
5630 coding->chars_at_source = 0;
ec6d2bb8
KH
5631}
5632
ec6d2bb8 5633
df7492f9
KH
5634/* Encode the text at CODING->src_object into CODING->dst_object.
5635 CODING->src_object is a buffer or a string.
5636 CODING->dst_object is a buffer or nil.
5637
5638 If CODING->src_object is a buffer, it must be the current buffer.
5639 In this case, if CODING->src_pos is positive, it is a position of
5640 the source text in the buffer, otherwise. the source text is in the
5641 gap area of the buffer, and coding->src_pos specifies the offset of
5642 the text from GPT (which must be the same as PT). If this is the
5643 same buffer as CODING->dst_object, CODING->src_pos must be
5644 negative and CODING should not have `pre-write-conversion'.
5645
5646 If CODING->src_object is a string, CODING should not have
5647 `pre-write-conversion'.
5648
5649 If CODING->dst_object is a buffer, the encoded data is inserted at
5650 the current point of that buffer.
5651
5652 If CODING->dst_object is nil, the encoded data is placed at the
5653 memory area specified by CODING->destination. */
5654
5655static int
5656encode_coding (coding)
ec6d2bb8 5657 struct coding_system *coding;
ec6d2bb8 5658{
df7492f9 5659 Lisp_Object attrs;
ec6d2bb8 5660
df7492f9 5661 attrs = CODING_ID_ATTRS (coding->id);
ec6d2bb8 5662
df7492f9 5663 if (BUFFERP (coding->dst_object))
ec6d2bb8 5664 {
df7492f9
KH
5665 set_buffer_internal (XBUFFER (coding->dst_object));
5666 coding->dst_multibyte
5667 = ! NILP (current_buffer->enable_multibyte_characters);
5668 }
ec6d2bb8 5669
df7492f9
KH
5670 coding->consumed = coding->consumed_char = 0;
5671 coding->produced = coding->produced_char = 0;
5672 coding->result = CODING_RESULT_SUCCESS;
5673 coding->errors = 0;
ec6d2bb8 5674
df7492f9 5675 ALLOC_CONVERSION_WORK_AREA (coding);
ec6d2bb8 5676
df7492f9
KH
5677 do {
5678 coding_set_source (coding);
5679 consume_chars (coding);
5680
5681 if (!NILP (CODING_ATTR_ENCODE_TBL (attrs)))
5682 translate_chars (CODING_ATTR_ENCODE_TBL (attrs), coding);
5683
5684 coding_set_destination (coding);
5685 (*(coding->encoder)) (coding);
5686 } while (coding->consumed_char < coding->src_chars);
5687
5688 if (BUFFERP (coding->dst_object))
5689 insert_from_gap (coding->produced_char, coding->produced);
5690
5691 return (coding->result);
ec6d2bb8
KH
5692}
5693
df7492f9 5694/* Work buffer */
fb88bf2d 5695
df7492f9
KH
5696/* List of currently used working buffer. */
5697Lisp_Object Vcode_conversion_work_buf_list;
d46c5b12 5698
df7492f9
KH
5699/* A working buffer used by the top level conversion. */
5700Lisp_Object Vcode_conversion_reused_work_buf;
b73bfc1c 5701
4ed46869 5702
df7492f9
KH
5703/* Return a working buffer that can be freely used by the following
5704 code conversion. MULTIBYTEP specifies the multibyteness of the
5705 buffer. */
b73bfc1c 5706
df7492f9
KH
5707Lisp_Object
5708make_conversion_work_buffer (multibytep)
5709 int multibytep;
5710{
5711 struct buffer *current = current_buffer;
5712 Lisp_Object buf;
d46c5b12 5713
df7492f9 5714 if (NILP (Vcode_conversion_work_buf_list))
e133c8fa 5715 {
df7492f9
KH
5716 if (NILP (Vcode_conversion_reused_work_buf))
5717 Vcode_conversion_reused_work_buf
5718 = Fget_buffer_create (build_string (" *code-conversion-work*"));
5719 Vcode_conversion_work_buf_list
5720 = Fcons (Vcode_conversion_reused_work_buf, Qnil);
e133c8fa 5721 }
df7492f9 5722 else
d46c5b12 5723 {
df7492f9
KH
5724 int depth = Flength (Vcode_conversion_work_buf_list);
5725 char str[128];
e077cc80 5726
df7492f9
KH
5727 sprintf (str, " *code-conversion-work*<%d>", depth);
5728 Vcode_conversion_work_buf_list
5729 = Fcons (Fget_buffer_create (build_string (str)),
5730 Vcode_conversion_work_buf_list);
d46c5b12 5731 }
d46c5b12 5732
df7492f9
KH
5733 buf = XCAR (Vcode_conversion_work_buf_list);
5734 set_buffer_internal (XBUFFER (buf));
5735 current_buffer->undo_list = Qt;
5736 Ferase_buffer ();
5737 Fset_buffer_multibyte (multibytep ? Qt : Qnil);
5738 set_buffer_internal (current);
5739 return buf;
5740}
d46c5b12 5741
df7492f9 5742static struct coding_system *saved_coding;
d46c5b12 5743
df7492f9
KH
5744Lisp_Object
5745code_conversion_restore (info)
5746 Lisp_Object info;
5747{
5748 int depth = Flength (Vcode_conversion_work_buf_list);
5749 Lisp_Object buf;
d46c5b12 5750
df7492f9 5751 if (depth > 0)
d46c5b12 5752 {
df7492f9
KH
5753 buf = XCAR (Vcode_conversion_work_buf_list);
5754 Vcode_conversion_work_buf_list = XCDR (Vcode_conversion_work_buf_list);
5755 if (depth > 1 && !NILP (Fbuffer_live_p (buf)))
5756 Fkill_buffer (buf);
5757 }
d46c5b12 5758
df7492f9
KH
5759 if (saved_coding->dst_object == Qt
5760 && saved_coding->destination)
5761 xfree (saved_coding->destination);
b843d1ae 5762
df7492f9
KH
5763 return save_excursion_restore (info);
5764}
d46c5b12 5765
12410ef1 5766
df7492f9
KH
5767int
5768decode_coding_gap (coding, chars, bytes)
5769 struct coding_system *coding;
5770 EMACS_INT chars, bytes;
5771{
5772 int count = specpdl_ptr - specpdl;
fb88bf2d 5773
df7492f9
KH
5774 saved_coding = coding;
5775 record_unwind_protect (code_conversion_restore, save_excursion_save ());
ec6d2bb8 5776
df7492f9
KH
5777 coding->src_object = Fcurrent_buffer ();
5778 coding->src_chars = chars;
5779 coding->src_bytes = bytes;
5780 coding->src_pos = -chars;
5781 coding->src_pos_byte = -bytes;
5782 coding->src_multibyte = chars < bytes;
5783 coding->dst_object = coding->src_object;
5784 coding->dst_pos = PT;
5785 coding->dst_pos_byte = PT_BYTE;
71c81426 5786 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
4956c225 5787
df7492f9
KH
5788 if (CODING_REQUIRE_DETECTION (coding))
5789 detect_coding (coding);
5790
5791 decode_coding (coding);
d46c5b12 5792
df7492f9
KH
5793 unbind_to (count, Qnil);
5794 return coding->result;
5795}
d46c5b12 5796
df7492f9
KH
5797int
5798encode_coding_gap (coding, chars, bytes)
5799 struct coding_system *coding;
5800 EMACS_INT chars, bytes;
5801{
5802 int count = specpdl_ptr - specpdl;
5803 Lisp_Object buffer;
d46c5b12 5804
df7492f9
KH
5805 saved_coding = coding;
5806 record_unwind_protect (code_conversion_restore, save_excursion_save ());
fb88bf2d 5807
df7492f9
KH
5808 buffer = Fcurrent_buffer ();
5809 coding->src_object = buffer;
5810 coding->src_chars = chars;
5811 coding->src_bytes = bytes;
5812 coding->src_pos = -chars;
5813 coding->src_pos_byte = -bytes;
5814 coding->src_multibyte = chars < bytes;
5815 coding->dst_object = coding->src_object;
5816 coding->dst_pos = PT;
5817 coding->dst_pos_byte = PT_BYTE;
fb88bf2d 5818
df7492f9 5819 encode_coding (coding);
f2558efd 5820
df7492f9
KH
5821 unbind_to (count, Qnil);
5822 return coding->result;
5823}
b73bfc1c 5824
d46c5b12 5825
df7492f9
KH
5826/* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
5827 SRC_OBJECT into DST_OBJECT by coding context CODING.
ec6d2bb8 5828
df7492f9 5829 SRC_OBJECT is a buffer, a string, or Qnil.
ec6d2bb8 5830
df7492f9
KH
5831 If it is a buffer, the text is at point of the buffer. FROM and TO
5832 are positions in the buffer.
ec6d2bb8 5833
df7492f9
KH
5834 If it is a string, the text is at the beginning of the string.
5835 FROM and TO are indices to the string.
ec6d2bb8 5836
df7492f9
KH
5837 If it is nil, the text is at coding->source. FROM and TO are
5838 indices to coding->source.
ec6d2bb8 5839
df7492f9 5840 DST_OBJECT is a buffer, Qt, or Qnil.
d46c5b12 5841
df7492f9
KH
5842 If it is a buffer, the decoded text is inserted at point of the
5843 buffer. If the buffer is the same as SRC_OBJECT, the source text
5844 is deleted.
d46c5b12 5845
df7492f9
KH
5846 If it is Qt, a string is made from the decoded text, and
5847 set in CODING->dst_object.
d46c5b12 5848
df7492f9
KH
5849 If it is Qnil, the decoded text is stored at CODING->destination.
5850 The called must allocate CODING->dst_bytes bytes at
5851 CODING->destination by xmalloc. If the decoded text is longer than
5852 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
5853 */
d46c5b12 5854
df7492f9
KH
5855void
5856decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
5857 dst_object)
5858 struct coding_system *coding;
5859 Lisp_Object src_object;
5860 EMACS_INT from, from_byte, to, to_byte;
5861 Lisp_Object dst_object;
5862{
5863 int count = specpdl_ptr - specpdl;
5864 unsigned char *destination;
5865 EMACS_INT dst_bytes;
5866 EMACS_INT chars = to - from;
5867 EMACS_INT bytes = to_byte - from_byte;
5868 Lisp_Object attrs;
d46c5b12 5869
df7492f9
KH
5870 saved_coding = coding;
5871 record_unwind_protect (code_conversion_restore, save_excursion_save ());
93dec019 5872
df7492f9
KH
5873 if (NILP (dst_object))
5874 {
5875 destination = coding->destination;
5876 dst_bytes = coding->dst_bytes;
5877 }
93dec019 5878
df7492f9
KH
5879 coding->src_object = src_object;
5880 coding->src_chars = chars;
5881 coding->src_bytes = bytes;
5882 coding->src_multibyte = chars < bytes;
70ad9fc4 5883
df7492f9
KH
5884 if (STRINGP (src_object))
5885 {
5886 coding->src_pos = from;
5887 coding->src_pos_byte = from_byte;
5888 }
5889 else if (BUFFERP (src_object))
5890 {
5891 set_buffer_internal (XBUFFER (src_object));
5892 if (from != GPT)
5893 move_gap_both (from, from_byte);
5894 if (EQ (src_object, dst_object))
fb88bf2d 5895 {
df7492f9
KH
5896 TEMP_SET_PT_BOTH (from, from_byte);
5897 del_range_both (from, from_byte, to, to_byte, 1);
5898 coding->src_pos = -chars;
5899 coding->src_pos_byte = -bytes;
fb88bf2d 5900 }
df7492f9 5901 else
fb88bf2d 5902 {
df7492f9
KH
5903 coding->src_pos = from;
5904 coding->src_pos_byte = from_byte;
fb88bf2d 5905 }
d46c5b12 5906 }
fb88bf2d 5907
df7492f9
KH
5908 if (CODING_REQUIRE_DETECTION (coding))
5909 detect_coding (coding);
5910 attrs = CODING_ID_ATTRS (coding->id);
5911
5912 if (! NILP (CODING_ATTR_POST_READ (attrs))
5913 || EQ (dst_object, Qt))
b73bfc1c 5914 {
df7492f9
KH
5915 coding->dst_object = make_conversion_work_buffer (1);
5916 coding->dst_pos = BEG;
5917 coding->dst_pos_byte = BEG_BYTE;
5918 coding->dst_multibyte = 1;
b73bfc1c 5919 }
df7492f9 5920 else if (BUFFERP (dst_object))
12410ef1 5921 {
df7492f9
KH
5922 coding->dst_object = dst_object;
5923 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
5924 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
5925 coding->dst_multibyte
5926 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
12410ef1 5927 }
72d1a715 5928 else
df7492f9
KH
5929 {
5930 coding->dst_object = Qnil;
5931 coding->dst_multibyte = 1;
5932 }
5933
5934 decode_coding (coding);
4ed46869 5935
df7492f9
KH
5936 if (BUFFERP (coding->dst_object))
5937 set_buffer_internal (XBUFFER (coding->dst_object));
ec6d2bb8 5938
df7492f9 5939 if (! NILP (CODING_ATTR_POST_READ (attrs)))
d46c5b12 5940 {
df7492f9
KH
5941 struct gcpro gcpro1, gcpro2;
5942 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
2b4f9037 5943 Lisp_Object val;
4ed46869 5944
c0cc7f7f 5945 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
df7492f9
KH
5946 GCPRO2 (coding->src_object, coding->dst_object);
5947 val = call1 (CODING_ATTR_POST_READ (attrs),
5948 make_number (coding->produced_char));
5949 UNGCPRO;
5950 CHECK_NATNUM (val);
5951 coding->produced_char += Z - prev_Z;
5952 coding->produced += Z_BYTE - prev_Z_BYTE;
d46c5b12 5953 }
4ed46869 5954
df7492f9 5955 if (EQ (dst_object, Qt))
ec6d2bb8 5956 {
df7492f9
KH
5957 coding->dst_object = Fbuffer_string ();
5958 }
5959 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
5960 {
5961 set_buffer_internal (XBUFFER (coding->dst_object));
5962 if (dst_bytes < coding->produced)
5963 {
5964 destination
5965 = (unsigned char *) xrealloc (destination, coding->produced);
5966 if (! destination)
5967 {
5968 coding->result = CODING_RESULT_INSUFFICIENT_DST;
5969 unbind_to (count, Qnil);
5970 return;
5971 }
5972 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
5973 move_gap_both (BEGV, BEGV_BYTE);
5974 bcopy (BEGV_ADDR, destination, coding->produced);
5975 coding->destination = destination;
5976 }
ec6d2bb8 5977 }
2b4f9037 5978
df7492f9 5979 unbind_to (count, Qnil);
d46c5b12
KH
5980}
5981
df7492f9
KH
5982
5983void
5984encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
5985 dst_object)
b73bfc1c 5986 struct coding_system *coding;
df7492f9
KH
5987 Lisp_Object src_object;
5988 EMACS_INT from, from_byte, to, to_byte;
5989 Lisp_Object dst_object;
b73bfc1c
KH
5990{
5991 int count = specpdl_ptr - specpdl;
df7492f9
KH
5992 EMACS_INT chars = to - from;
5993 EMACS_INT bytes = to_byte - from_byte;
5994 Lisp_Object attrs;
5995
5996 saved_coding = coding;
5997 record_unwind_protect (code_conversion_restore, save_excursion_save ());
5998
5999 coding->src_object = src_object;
6000 coding->src_chars = chars;
6001 coding->src_bytes = bytes;
6002 coding->src_multibyte = chars < bytes;
6003
6004 attrs = CODING_ID_ATTRS (coding->id);
6005
6006 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6bac5b12 6007 {
df7492f9
KH
6008 coding->src_object = make_conversion_work_buffer (coding->src_multibyte);
6009 set_buffer_internal (XBUFFER (coding->src_object));
6010 if (STRINGP (src_object))
6011 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6012 else if (BUFFERP (src_object))
6013 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6014 else
6015 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6016
6017 if (EQ (src_object, dst_object))
6018 {
6019 set_buffer_internal (XBUFFER (src_object));
6020 del_range_both (from, from_byte, to, to_byte, 1);
6021 set_buffer_internal (XBUFFER (coding->src_object));
6022 }
6023
ac87bbef
KH
6024 call2 (CODING_ATTR_PRE_WRITE (attrs),
6025 make_number (BEG), make_number (Z));
6026 coding->src_object = Fcurrent_buffer ();
df7492f9
KH
6027 if (BEG != GPT)
6028 move_gap_both (BEG, BEG_BYTE);
6029 coding->src_chars = Z - BEG;
6030 coding->src_bytes = Z_BYTE - BEG_BYTE;
6031 coding->src_pos = BEG;
6032 coding->src_pos_byte = BEG_BYTE;
6033 coding->src_multibyte = Z < Z_BYTE;
6034 }
6035 else if (STRINGP (src_object))
6036 {
6037 coding->src_pos = from;
6038 coding->src_pos_byte = from_byte;
6039 }
6040 else if (BUFFERP (src_object))
d46c5b12 6041 {
df7492f9
KH
6042 set_buffer_internal (XBUFFER (src_object));
6043 if (from != GPT)
6044 move_gap_both (from, from_byte);
6045 if (EQ (src_object, dst_object))
d46c5b12 6046 {
df7492f9
KH
6047 del_range_both (from, from_byte, to, to_byte, 1);
6048 coding->src_pos = -chars;
6049 coding->src_pos_byte = -bytes;
d46c5b12 6050 }
df7492f9 6051 else
d46c5b12 6052 {
df7492f9
KH
6053 coding->src_pos = from;
6054 coding->src_pos_byte = from_byte;
d46c5b12
KH
6055 }
6056 }
4ed46869 6057
df7492f9 6058 if (BUFFERP (dst_object))
d46c5b12 6059 {
df7492f9
KH
6060 coding->dst_object = dst_object;
6061 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6062 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6063 coding->dst_multibyte
6064 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
b73bfc1c 6065 }
df7492f9 6066 else if (EQ (dst_object, Qt))
4956c225 6067 {
df7492f9 6068 coding->dst_object = Qnil;
df7492f9 6069 coding->dst_bytes = coding->src_chars;
ac87bbef
KH
6070 if (coding->dst_bytes == 0)
6071 coding->dst_bytes = 1;
6072 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
df7492f9 6073 coding->dst_multibyte = 0;
4956c225 6074 }
df7492f9 6075 else
78108bcd 6076 {
df7492f9
KH
6077 coding->dst_object = Qnil;
6078 coding->dst_multibyte = 0;
78108bcd
KH
6079 }
6080
df7492f9 6081 encode_coding (coding);
4ed46869 6082
df7492f9 6083 if (EQ (dst_object, Qt))
4ed46869 6084 {
df7492f9
KH
6085 if (BUFFERP (coding->dst_object))
6086 coding->dst_object = Fbuffer_string ();
6087 else
73be902c 6088 {
df7492f9
KH
6089 coding->dst_object
6090 = make_unibyte_string ((char *) coding->destination,
6091 coding->produced);
6092 xfree (coding->destination);
73be902c 6093 }
4ed46869 6094 }
d46c5b12 6095
df7492f9 6096 unbind_to (count, Qnil);
b73bfc1c
KH
6097}
6098
df7492f9 6099
b73bfc1c 6100Lisp_Object
df7492f9 6101preferred_coding_system ()
b73bfc1c 6102{
df7492f9 6103 int id = coding_categories[coding_priorities[0]].id;
2391eaa4 6104
df7492f9 6105 return CODING_ID_NAME (id);
4ed46869
KH
6106}
6107
6108\f
6109#ifdef emacs
1397dc18 6110/*** 8. Emacs Lisp library functions ***/
4ed46869 6111
4ed46869 6112DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
48b0f3ae 6113 doc: /* Return t if OBJECT is nil or a coding-system.
df7492f9 6114See the documentation of `define-coding-system' for information
48b0f3ae
PJ
6115about coding-system objects. */)
6116 (obj)
4ed46869
KH
6117 Lisp_Object obj;
6118{
df7492f9 6119 return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
4ed46869
KH
6120}
6121
9d991de8
RS
6122DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6123 Sread_non_nil_coding_system, 1, 1, 0,
48b0f3ae
PJ
6124 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6125 (prompt)
4ed46869
KH
6126 Lisp_Object prompt;
6127{
e0e989f6 6128 Lisp_Object val;
9d991de8
RS
6129 do
6130 {
4608c386
KH
6131 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6132 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
9d991de8
RS
6133 }
6134 while (XSTRING (val)->size == 0);
e0e989f6 6135 return (Fintern (val, Qnil));
4ed46869
KH
6136}
6137
9b787f3e 6138DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
48b0f3ae
PJ
6139 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6140If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6141 (prompt, default_coding_system)
9b787f3e 6142 Lisp_Object prompt, default_coding_system;
4ed46869 6143{
f44d27ce 6144 Lisp_Object val;
9b787f3e
RS
6145 if (SYMBOLP (default_coding_system))
6146 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
4608c386 6147 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
9b787f3e
RS
6148 Qt, Qnil, Qcoding_system_history,
6149 default_coding_system, Qnil);
e0e989f6 6150 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
6151}
6152
6153DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6154 1, 1, 0,
48b0f3ae
PJ
6155 doc: /* Check validity of CODING-SYSTEM.
6156If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6157It is valid if it is a symbol with a non-nil `coding-system' property.
6158The value of property should be a vector of length 5. */)
df7492f9 6159 (coding_system)
4ed46869
KH
6160 Lisp_Object coding_system;
6161{
b7826503 6162 CHECK_SYMBOL (coding_system);
4ed46869
KH
6163 if (!NILP (Fcoding_system_p (coding_system)))
6164 return coding_system;
6165 while (1)
02ba4723 6166 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4ed46869 6167}
df7492f9 6168
3a73fa5d 6169\f
d46c5b12 6170Lisp_Object
df7492f9 6171detect_coding_system (src, src_bytes, highest, multibytep, coding_system)
d46c5b12
KH
6172 unsigned char *src;
6173 int src_bytes, highest;
0a28aafb 6174 int multibytep;
df7492f9 6175 Lisp_Object coding_system;
4ed46869 6176{
df7492f9
KH
6177 unsigned char *src_end = src + src_bytes;
6178 int mask = CATEGORY_MASK_ANY;
6179 int detected = 0;
6180 int c, i;
6181 Lisp_Object attrs, eol_type;
6182 Lisp_Object val;
6183 struct coding_system coding;
6184
6185 if (NILP (coding_system))
6186 coding_system = Qundecided;
6187 setup_coding_system (coding_system, &coding);
6188 attrs = CODING_ID_ATTRS (coding.id);
6189 eol_type = CODING_ID_EOL_TYPE (coding.id);
4ed46869 6190
df7492f9
KH
6191 coding.source = src;
6192 coding.src_bytes = src_bytes;
6193 coding.src_multibyte = multibytep;
6194 coding.consumed = 0;
4ed46869 6195
df7492f9 6196 if (XINT (CODING_ATTR_CATEGORY (attrs)) != coding_category_undecided)
4ed46869 6197 {
df7492f9 6198 mask = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
4ed46869 6199 }
df7492f9 6200 else
4ed46869 6201 {
df7492f9
KH
6202 coding_system = Qnil;
6203 for (; src < src_end; src++)
4ed46869 6204 {
df7492f9
KH
6205 c = *src;
6206 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
6207 || c == ISO_CODE_SI
6208 || c == ISO_CODE_SO)))
d46c5b12 6209 break;
4ed46869 6210 }
df7492f9
KH
6211 coding.head_ascii = src - coding.source;
6212
6213 if (src < src_end)
6214 for (i = 0; i < coding_category_raw_text; i++)
6215 {
6216 enum coding_category category = coding_priorities[i];
6217 struct coding_system *this = coding_categories + category;
6218
6219 if (category >= coding_category_raw_text
6220 || detected & (1 << category))
6221 continue;
6222
6223 if (this->id < 0)
6224 {
6225 /* No coding system of this category is defined. */
6226 mask &= ~(1 << category);
6227 }
6228 else
6229 {
6230 detected |= detected_mask[category];
6231 if ((*(coding_categories[category].detector)) (&coding, &mask)
6232 && highest)
6233 {
6234 mask &= detected_mask[category];
6235 break;
6236 }
6237 }
6238 }
4ed46869 6239 }
4ed46869 6240
df7492f9
KH
6241 if (!mask)
6242 val = Fcons (make_number (coding_category_raw_text), Qnil);
6243 else if (mask == CATEGORY_MASK_ANY)
6244 val = Fcons (make_number (coding_category_undecided), Qnil);
6245 else if (highest)
4ed46869 6246 {
df7492f9
KH
6247 for (i = 0; i < coding_category_raw_text; i++)
6248 if (mask & (1 << coding_priorities[i]))
6249 {
6250 val = Fcons (make_number (coding_priorities[i]), Qnil);
6251 break;
6252 }
6253 }
6254 else
6255 {
6256 val = Qnil;
6257 for (i = coding_category_raw_text - 1; i >= 0; i--)
6258 if (mask & (1 << coding_priorities[i]))
6259 val = Fcons (make_number (coding_priorities[i]), val);
4ed46869 6260 }
df7492f9
KH
6261
6262 {
6263 int one_byte_eol = -1, two_byte_eol = -1;
6264 Lisp_Object tail;
6265
6266 for (tail = val; CONSP (tail); tail = XCDR (tail))
6267 {
6268 struct coding_system *this
6269 = (NILP (coding_system) ? coding_categories + XINT (XCAR (tail))
6270 : &coding);
6271 int this_eol;
6272
6273 attrs = CODING_ID_ATTRS (this->id);
6274 eol_type = CODING_ID_EOL_TYPE (this->id);
6275 XSETCAR (tail, CODING_ID_NAME (this->id));
6276 if (VECTORP (eol_type))
6277 {
6278 if (EQ (CODING_ATTR_TYPE (attrs), Qutf_16))
6279 {
6280 if (two_byte_eol < 0)
6281 two_byte_eol = detect_eol (this, coding.source, src_bytes);
6282 this_eol = two_byte_eol;
6283 }
6284 else
6285 {
6286 if (one_byte_eol < 0)
6287 one_byte_eol =detect_eol (this, coding.source, src_bytes);
6288 this_eol = one_byte_eol;
6289 }
6290 if (this_eol == EOL_SEEN_LF)
6291 XSETCAR (tail, AREF (eol_type, 0));
6292 else if (this_eol == EOL_SEEN_CRLF)
6293 XSETCAR (tail, AREF (eol_type, 1));
6294 else if (this_eol == EOL_SEEN_CR)
6295 XSETCAR (tail, AREF (eol_type, 2));
6296 }
6297 }
6298 }
6299
03699b14 6300 return (highest ? XCAR (val) : val);
93dec019 6301}
4ed46869 6302
df7492f9 6303
d46c5b12
KH
6304DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6305 2, 3, 0,
48b0f3ae
PJ
6306 doc: /* Detect coding system of the text in the region between START and END.
6307Return a list of possible coding systems ordered by priority.
6308
6309If only ASCII characters are found, it returns a list of single element
6310`undecided' or its subsidiary coding system according to a detected
6311end-of-line format.
6312
6313If optional argument HIGHEST is non-nil, return the coding system of
6314highest priority. */)
6315 (start, end, highest)
d46c5b12
KH
6316 Lisp_Object start, end, highest;
6317{
6318 int from, to;
6319 int from_byte, to_byte;
6289dd10 6320
b7826503
PJ
6321 CHECK_NUMBER_COERCE_MARKER (start);
6322 CHECK_NUMBER_COERCE_MARKER (end);
4ed46869 6323
d46c5b12
KH
6324 validate_region (&start, &end);
6325 from = XINT (start), to = XINT (end);
6326 from_byte = CHAR_TO_BYTE (from);
6327 to_byte = CHAR_TO_BYTE (to);
6289dd10 6328
d46c5b12
KH
6329 if (from < GPT && to >= GPT)
6330 move_gap_both (to, to_byte);
c210f766 6331
d46c5b12 6332 return detect_coding_system (BYTE_POS_ADDR (from_byte),
df7492f9 6333 to_byte - from_byte,
0a28aafb
KH
6334 !NILP (highest),
6335 !NILP (current_buffer
df7492f9
KH
6336 ->enable_multibyte_characters),
6337 Qnil);
d46c5b12 6338}
6289dd10 6339
d46c5b12
KH
6340DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6341 1, 2, 0,
48b0f3ae
PJ
6342 doc: /* Detect coding system of the text in STRING.
6343Return a list of possible coding systems ordered by priority.
6344
6345If only ASCII characters are found, it returns a list of single element
6346`undecided' or its subsidiary coding system according to a detected
6347end-of-line format.
6348
6349If optional argument HIGHEST is non-nil, return the coding system of
6350highest priority. */)
6351 (string, highest)
d46c5b12
KH
6352 Lisp_Object string, highest;
6353{
b7826503 6354 CHECK_STRING (string);
4ed46869 6355
d46c5b12 6356 return detect_coding_system (XSTRING (string)->data,
df7492f9 6357 STRING_BYTES (XSTRING (string)),
0a28aafb 6358 !NILP (highest),
df7492f9
KH
6359 STRING_MULTIBYTE (string),
6360 Qnil);
4ed46869
KH
6361}
6362
05e6f5dc 6363
df7492f9
KH
6364static INLINE int
6365char_encodable_p (c, attrs)
6366 int c;
6367 Lisp_Object attrs;
05e6f5dc 6368{
df7492f9 6369 Lisp_Object tail;
df7492f9 6370 struct charset *charset;
05e6f5dc 6371
df7492f9
KH
6372 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
6373 CONSP (tail); tail = XCDR (tail))
05e6f5dc 6374 {
df7492f9
KH
6375 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
6376 if (CHAR_CHARSET_P (c, charset))
6377 break;
05e6f5dc 6378 }
df7492f9 6379 return (! NILP (tail));
05e6f5dc
KH
6380}
6381
6382
df7492f9
KH
6383/* Return a list of coding systems that safely encode the text between
6384 START and END. If EXCLUDE is non-nil, it is a list of coding
6385 systems not to check. The returned list doesn't contain any such
6386 coding systems. In any case, If the text contains only ASCII or is
6387 unibyte, return t. */
6388
6389DEFUN ("find-coding-systems-region-internal",
6390 Ffind_coding_systems_region_internal,
6391 Sfind_coding_systems_region_internal, 2, 3, 0,
6392 doc: /* Internal use only. */)
6393 (start, end, exclude)
6394 Lisp_Object start, end, exclude;
6395{
6396 Lisp_Object coding_attrs_list, safe_codings;
6397 EMACS_INT start_byte, end_byte;
6398 unsigned char *p, *pbeg, *pend;
6399 int c;
6400 Lisp_Object tail, elt;
05e6f5dc 6401
df7492f9
KH
6402 if (STRINGP (start))
6403 {
6404 if (!STRING_MULTIBYTE (start)
6405 && XSTRING (start)->size != STRING_BYTES (XSTRING (start)))
6406 return Qt;
6407 start_byte = 0;
6408 end_byte = STRING_BYTES (XSTRING (start));
6409 }
6410 else
6411 {
6412 CHECK_NUMBER_COERCE_MARKER (start);
6413 CHECK_NUMBER_COERCE_MARKER (end);
6414 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6415 args_out_of_range (start, end);
6416 if (NILP (current_buffer->enable_multibyte_characters))
6417 return Qt;
6418 start_byte = CHAR_TO_BYTE (XINT (start));
6419 end_byte = CHAR_TO_BYTE (XINT (end));
6420 if (XINT (end) - XINT (start) == end_byte - start_byte)
6421 return Qt;
05e6f5dc 6422
df7492f9
KH
6423 if (start < GPT && end > GPT)
6424 {
6425 if ((GPT - start) < (end - GPT))
6426 move_gap_both (start, start_byte);
6427 else
6428 move_gap_both (end, end_byte);
6429 }
6430 }
05e6f5dc 6431
df7492f9
KH
6432 coding_attrs_list = Qnil;
6433 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
6434 if (NILP (exclude)
6435 || NILP (Fmemq (XCAR (tail), exclude)))
6436 {
6437 Lisp_Object attrs;
05e6f5dc 6438
df7492f9
KH
6439 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
6440 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
6441 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6442 coding_attrs_list = Fcons (attrs, coding_attrs_list);
6443 }
6444
6445 if (STRINGP (start))
6446 p = pbeg = XSTRING (start)->data;
6447 else
6448 p = pbeg = BYTE_POS_ADDR (start_byte);
6449 pend = p + (end_byte - start_byte);
6450
6451 while (p < pend && ASCII_BYTE_P (*p)) p++;
6452 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
05e6f5dc
KH
6453
6454 while (p < pend)
6455 {
df7492f9
KH
6456 if (ASCII_BYTE_P (*p))
6457 p++;
6458 else
6459 {
6460 c = STRING_CHAR_ADVANCE (p);
6461
6462 charset_map_loaded = 0;
6463 for (tail = coding_attrs_list; CONSP (tail);)
6464 {
6465 elt = XCAR (tail);
6466 if (NILP (elt))
6467 tail = XCDR (tail);
6468 else if (char_encodable_p (c, elt))
6469 tail = XCDR (tail);
6470 else if (CONSP (XCDR (tail)))
6471 {
6472 XSETCAR (tail, XCAR (XCDR (tail)));
6473 XSETCDR (tail, XCDR (XCDR (tail)));
6474 }
6475 else
6476 {
6477 XSETCAR (tail, Qnil);
6478 tail = XCDR (tail);
6479 }
6480 }
6481 if (charset_map_loaded)
6482 {
6483 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
05e6f5dc 6484
df7492f9
KH
6485 if (STRINGP (start))
6486 pbeg = XSTRING (start)->data;
6487 else
6488 pbeg = BYTE_POS_ADDR (start_byte);
6489 p = pbeg + p_offset;
6490 pend = pbeg + pend_offset;
6491 }
6492 }
05e6f5dc 6493 }
df7492f9
KH
6494
6495 safe_codings = Qnil;
6496 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
6497 if (! NILP (XCAR (tail)))
6498 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
6499
05e6f5dc
KH
6500 return safe_codings;
6501}
6502
6503
df7492f9
KH
6504DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
6505 Scheck_coding_systems_region, 3, 3, 0,
6506 doc: /* Check if the region is encodable by coding systems.
05e6f5dc 6507
df7492f9
KH
6508START and END are buffer positions specifying the region.
6509CODING-SYSTEM-LIST is a list of coding systems to check.
6510
6511The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
6512CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
6513whole region, POS0, POS1, ... are buffer positions where non-encodable
6514characters are found.
6515
6516If all coding systems in CODING-SYSTEM-LIST can encode the region, the
6517value is nil.
6518
6519START may be a string. In that case, check if the string is
6520encodable, and the value contains indices to the string instead of
6521buffer positions. END is ignored. */)
6522 (start, end, coding_system_list)
6523 Lisp_Object start, end, coding_system_list;
05e6f5dc 6524{
df7492f9
KH
6525 Lisp_Object list;
6526 EMACS_INT start_byte, end_byte;
6527 int pos;
6528 unsigned char *p, *pbeg, *pend;
6529 int c;
6530 Lisp_Object tail, elt;
05e6f5dc
KH
6531
6532 if (STRINGP (start))
6533 {
df7492f9
KH
6534 if (!STRING_MULTIBYTE (start)
6535 && XSTRING (start)->size != STRING_BYTES (XSTRING (start)))
6536 return Qnil;
6537 start_byte = 0;
6538 end_byte = STRING_BYTES (XSTRING (start));
6539 pos = 0;
05e6f5dc
KH
6540 }
6541 else
6542 {
b7826503
PJ
6543 CHECK_NUMBER_COERCE_MARKER (start);
6544 CHECK_NUMBER_COERCE_MARKER (end);
05e6f5dc
KH
6545 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6546 args_out_of_range (start, end);
6547 if (NILP (current_buffer->enable_multibyte_characters))
df7492f9
KH
6548 return Qnil;
6549 start_byte = CHAR_TO_BYTE (XINT (start));
6550 end_byte = CHAR_TO_BYTE (XINT (end));
6551 if (XINT (end) - XINT (start) == end_byte - start_byte)
05e6f5dc 6552 return Qt;
df7492f9
KH
6553
6554 if (start < GPT && end > GPT)
6555 {
6556 if ((GPT - start) < (end - GPT))
6557 move_gap_both (start, start_byte);
6558 else
6559 move_gap_both (end, end_byte);
6560 }
6561 pos = start;
6562 }
6563
6564 list = Qnil;
6565 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
6566 {
6567 elt = XCAR (tail);
6568 list = Fcons (Fcons (elt, Fcons (AREF (CODING_SYSTEM_SPEC (elt), 0),
6569 Qnil)),
6570 list);
05e6f5dc
KH
6571 }
6572
df7492f9
KH
6573 if (STRINGP (start))
6574 p = pbeg = XSTRING (start)->data;
6575 else
6576 p = pbeg = BYTE_POS_ADDR (start_byte);
6577 pend = p + (end_byte - start_byte);
6578
6579 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
6580 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
6581
6582 while (p < pend)
05e6f5dc 6583 {
df7492f9
KH
6584 if (ASCII_BYTE_P (*p))
6585 p++;
6586 else
05e6f5dc 6587 {
df7492f9
KH
6588 c = STRING_CHAR_ADVANCE (p);
6589
6590 charset_map_loaded = 0;
6591 for (tail = list; CONSP (tail); tail = XCDR (tail))
6592 {
6593 elt = XCDR (XCAR (tail));
6594 if (! char_encodable_p (c, XCAR (elt)))
6595 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
6596 }
6597 if (charset_map_loaded)
6598 {
6599 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
6600
6601 if (STRINGP (start))
6602 pbeg = XSTRING (start)->data;
6603 else
6604 pbeg = BYTE_POS_ADDR (start_byte);
6605 p = pbeg + p_offset;
6606 pend = pbeg + pend_offset;
6607 }
05e6f5dc 6608 }
df7492f9 6609 pos++;
05e6f5dc
KH
6610 }
6611
df7492f9
KH
6612 tail = list;
6613 list = Qnil;
6614 for (; CONSP (tail); tail = XCDR (tail))
05e6f5dc 6615 {
df7492f9
KH
6616 elt = XCAR (tail);
6617 if (CONSP (XCDR (XCDR (elt))))
6618 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
6619 list);
05e6f5dc 6620 }
df7492f9
KH
6621
6622 return list;
05e6f5dc
KH
6623}
6624
6625
df7492f9 6626
4031e2bf 6627Lisp_Object
df7492f9
KH
6628code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
6629 Lisp_Object start, end, coding_system, dst_object;
6630 int encodep, norecord;
3a73fa5d
RS
6631{
6632 struct coding_system coding;
df7492f9
KH
6633 EMACS_INT from, from_byte, to, to_byte;
6634 Lisp_Object src_object;
3a73fa5d 6635
b7826503
PJ
6636 CHECK_NUMBER_COERCE_MARKER (start);
6637 CHECK_NUMBER_COERCE_MARKER (end);
df7492f9
KH
6638 if (NILP (coding_system))
6639 coding_system = Qno_conversion;
6640 else
6641 CHECK_CODING_SYSTEM (coding_system);
6642 src_object = Fcurrent_buffer ();
6643 if (NILP (dst_object))
6644 dst_object = src_object;
6645 else if (! EQ (dst_object, Qt))
6646 CHECK_BUFFER (dst_object);
3a73fa5d 6647
d46c5b12
KH
6648 validate_region (&start, &end);
6649 from = XFASTINT (start);
df7492f9 6650 from_byte = CHAR_TO_BYTE (from);
d46c5b12 6651 to = XFASTINT (end);
df7492f9 6652 to_byte = CHAR_TO_BYTE (to);
d46c5b12 6653
df7492f9
KH
6654 setup_coding_system (coding_system, &coding);
6655 coding.mode |= CODING_MODE_LAST_BLOCK;
6656
6657 if (encodep)
6658 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
6659 dst_object);
6660 else
6661 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
6662 dst_object);
6663 if (! norecord)
6664 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
d46c5b12 6665
df7492f9
KH
6666 if (coding.result != CODING_RESULT_SUCCESS)
6667 error ("Code conversion error: %d", coding.result);
3a73fa5d 6668
df7492f9
KH
6669 return (BUFFERP (dst_object)
6670 ? make_number (coding.produced_char)
6671 : coding.dst_object);
4031e2bf
KH
6672}
6673
df7492f9 6674
4031e2bf 6675DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
df7492f9 6676 3, 4, "r\nzCoding system: ",
48b0f3ae 6677 doc: /* Decode the current region from the specified coding system.
df7492f9
KH
6678When called from a program, takes four arguments:
6679 START, END, CODING-SYSTEM, and DESTINATION.
6680START and END are buffer positions.
6681
6682Optional 4th arguments DESTINATION specifies where the decoded text goes.
6683If nil, the region between START and END is replace by the decoded text.
6684If buffer, the decoded text is inserted in the buffer.
6685If t, the decoded text is returned.
6686
48b0f3ae
PJ
6687This function sets `last-coding-system-used' to the precise coding system
6688used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6689not fully specified.)
6690It returns the length of the decoded text. */)
df7492f9
KH
6691 (start, end, coding_system, destination)
6692 Lisp_Object start, end, coding_system, destination;
4031e2bf 6693{
df7492f9 6694 return code_convert_region (start, end, coding_system, destination, 0, 0);
3a73fa5d
RS
6695}
6696
6697DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
df7492f9
KH
6698 3, 4, "r\nzCoding system: ",
6699 doc: /* Encode the current region by specified coding system.
48b0f3ae
PJ
6700When called from a program, takes three arguments:
6701START, END, and CODING-SYSTEM. START and END are buffer positions.
df7492f9
KH
6702
6703Optional 4th arguments DESTINATION specifies where the encoded text goes.
6704If nil, the region between START and END is replace by the encoded text.
6705If buffer, the encoded text is inserted in the buffer.
6706If t, the encoded text is returned.
6707
48b0f3ae
PJ
6708This function sets `last-coding-system-used' to the precise coding system
6709used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6710not fully specified.)
6711It returns the length of the encoded text. */)
df7492f9
KH
6712 (start, end, coding_system, destination)
6713 Lisp_Object start, end, coding_system, destination;
3a73fa5d 6714{
df7492f9 6715 return code_convert_region (start, end, coding_system, destination, 1, 0);
4031e2bf 6716}
3a73fa5d 6717
4031e2bf 6718Lisp_Object
df7492f9
KH
6719code_convert_string (string, coding_system, dst_object,
6720 encodep, nocopy, norecord)
6721 Lisp_Object string, coding_system, dst_object;
6722 int encodep, nocopy, norecord;
4031e2bf
KH
6723{
6724 struct coding_system coding;
df7492f9 6725 EMACS_INT chars, bytes;
3a73fa5d 6726
b7826503 6727 CHECK_STRING (string);
d46c5b12 6728 if (NILP (coding_system))
df7492f9
KH
6729 {
6730 if (! norecord)
6731 Vlast_coding_system_used = Qno_conversion;
6732 if (NILP (dst_object))
6733 return (nocopy ? Fcopy_sequence (string) : string);
6734 }
4ed46869 6735
df7492f9
KH
6736 if (NILP (coding_system))
6737 coding_system = Qno_conversion;
6738 else
6739 CHECK_CODING_SYSTEM (coding_system);
6740 if (NILP (dst_object))
6741 dst_object = Qt;
6742 else if (! EQ (dst_object, Qt))
6743 CHECK_BUFFER (dst_object);
5f1cd180 6744
df7492f9 6745 setup_coding_system (coding_system, &coding);
d46c5b12 6746 coding.mode |= CODING_MODE_LAST_BLOCK;
df7492f9
KH
6747 chars = XSTRING (string)->size;
6748 bytes = STRING_BYTES (XSTRING (string));
6749 if (encodep)
6750 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
6751 else
6752 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
6753 if (! norecord)
6754 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
ec6d2bb8 6755
df7492f9
KH
6756 if (coding.result != CODING_RESULT_SUCCESS)
6757 error ("Code conversion error: %d", coding.result);
4ed46869 6758
df7492f9
KH
6759 return (BUFFERP (dst_object)
6760 ? make_number (coding.produced_char)
6761 : coding.dst_object);
4ed46869
KH
6762}
6763
4031e2bf 6764
ecec61c1 6765/* Encode or decode STRING according to CODING_SYSTEM.
ec6d2bb8
KH
6766 Do not set Vlast_coding_system_used.
6767
6768 This function is called only from macros DECODE_FILE and
6769 ENCODE_FILE, thus we ignore character composition. */
ecec61c1
KH
6770
6771Lisp_Object
6772code_convert_string_norecord (string, coding_system, encodep)
6773 Lisp_Object string, coding_system;
6774 int encodep;
6775{
0be8721c 6776 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
df7492f9 6777}
ecec61c1 6778
ecec61c1 6779
df7492f9
KH
6780DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
6781 2, 4, 0,
6782 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
6783
6784Optional third arg NOCOPY non-nil means it is OK to return STRING itself
6785if the decoding operation is trivial.
ecec61c1 6786
df7492f9
KH
6787Optional fourth arg BUFFER non-nil meant that the decoded text is
6788inserted in BUFFER instead of returned as a astring. In this case,
6789the return value is BUFFER.
ecec61c1 6790
df7492f9
KH
6791This function sets `last-coding-system-used' to the precise coding system
6792used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6793not fully specified. */)
6794 (string, coding_system, nocopy, buffer)
6795 Lisp_Object string, coding_system, nocopy, buffer;
6796{
6797 return code_convert_string (string, coding_system, buffer,
6798 0, ! NILP (nocopy), 0);
6799}
6800
6801DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
6802 2, 4, 0,
6803 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
6804
6805Optional third arg NOCOPY non-nil means it is OK to return STRING
6806itself if the encoding operation is trivial.
6807
6808Optional fourth arg BUFFER non-nil meant that the encoded text is
6809inserted in BUFFER instead of returned as a astring. In this case,
6810the return value is BUFFER.
6811
6812This function sets `last-coding-system-used' to the precise coding system
6813used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6814not fully specified.) */)
6815 (string, coding_system, nocopy, buffer)
6816 Lisp_Object string, coding_system, nocopy, buffer;
6817{
6818 return code_convert_string (string, coding_system, buffer,
6819 nocopy, ! NILP (nocopy), 1);
ecec61c1 6820}
df7492f9 6821
3a73fa5d 6822\f
4ed46869 6823DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
48b0f3ae
PJ
6824 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
6825Return the corresponding character. */)
6826 (code)
4ed46869
KH
6827 Lisp_Object code;
6828{
df7492f9
KH
6829 Lisp_Object spec, attrs, val;
6830 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
6831 int c;
6832
6833 CHECK_NATNUM (code);
6834 c = XFASTINT (code);
6835 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
6836 attrs = AREF (spec, 0);
4ed46869 6837
df7492f9
KH
6838 if (ASCII_BYTE_P (c)
6839 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
6840 return code;
6841
6842 val = CODING_ATTR_CHARSET_LIST (attrs);
6843 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
004068e4
KH
6844 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
6845 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
df7492f9
KH
6846
6847 if (c <= 0x7F)
6848 charset = charset_roman;
6849 else if (c >= 0xA0 && c < 0xDF)
55ab7be3 6850 {
df7492f9
KH
6851 charset = charset_kana;
6852 c -= 0x80;
55ab7be3
KH
6853 }
6854 else
6855 {
004068e4 6856 int s1 = c >> 8, s2 = c & 0xFF;
df7492f9
KH
6857
6858 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
6859 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
6860 error ("Invalid code: %d", code);
6861 SJIS_TO_JIS (c);
6862 charset = charset_kanji;
55ab7be3 6863 }
df7492f9
KH
6864 c = DECODE_CHAR (charset, c);
6865 if (c < 0)
6866 error ("Invalid code: %d", code);
6867 return make_number (c);
4ed46869
KH
6868}
6869
df7492f9 6870
4ed46869 6871DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
48b0f3ae
PJ
6872 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
6873Return the corresponding code in SJIS. */)
6874 (ch)
df7492f9 6875 Lisp_Object ch;
4ed46869 6876{
df7492f9
KH
6877 Lisp_Object spec, attrs, charset_list;
6878 int c;
6879 struct charset *charset;
6880 unsigned code;
4ed46869 6881
df7492f9
KH
6882 CHECK_CHARACTER (ch);
6883 c = XFASTINT (ch);
6884 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
6885 attrs = AREF (spec, 0);
6886
6887 if (ASCII_CHAR_P (c)
6888 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
6889 return ch;
6890
6891 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6892 charset = char_charset (c, charset_list, &code);
6893 if (code == CHARSET_INVALID_CODE (charset))
6894 error ("Can't encode by shift_jis encoding: %d", c);
6895 JIS_TO_SJIS (code);
6896
6897 return make_number (code);
4ed46869
KH
6898}
6899
6900DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
48b0f3ae
PJ
6901 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
6902Return the corresponding character. */)
6903 (code)
4ed46869
KH
6904 Lisp_Object code;
6905{
df7492f9
KH
6906 Lisp_Object spec, attrs, val;
6907 struct charset *charset_roman, *charset_big5, *charset;
6908 int c;
4ed46869 6909
df7492f9
KH
6910 CHECK_NATNUM (code);
6911 c = XFASTINT (code);
6912 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
6913 attrs = AREF (spec, 0);
6914
6915 if (ASCII_BYTE_P (c)
6916 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
6917 return code;
6918
6919 val = CODING_ATTR_CHARSET_LIST (attrs);
6920 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
6921 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
6922
6923 if (c <= 0x7F)
6924 charset = charset_roman;
c28a9453
KH
6925 else
6926 {
df7492f9
KH
6927 int b1 = c >> 8, b2 = c & 0x7F;
6928 if (b1 < 0xA1 || b1 > 0xFE
6929 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
6930 error ("Invalid code: %d", code);
6931 charset = charset_big5;
c28a9453 6932 }
df7492f9
KH
6933 c = DECODE_CHAR (charset, (unsigned )c);
6934 if (c < 0)
6935 error ("Invalid code: %d", code);
6936 return make_number (c);
4ed46869
KH
6937}
6938
6939DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
48b0f3ae
PJ
6940 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
6941Return the corresponding character code in Big5. */)
6942 (ch)
4ed46869
KH
6943 Lisp_Object ch;
6944{
df7492f9
KH
6945 Lisp_Object spec, attrs, charset_list;
6946 struct charset *charset;
6947 int c;
6948 unsigned code;
6949
6950 CHECK_CHARACTER (ch);
6951 c = XFASTINT (ch);
6952 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
6953 attrs = AREF (spec, 0);
6954 if (ASCII_CHAR_P (c)
6955 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
6956 return ch;
6957
6958 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6959 charset = char_charset (c, charset_list, &code);
6960 if (code == CHARSET_INVALID_CODE (charset))
6961 error ("Can't encode by Big5 encoding: %d", c);
6962
6963 return make_number (code);
4ed46869 6964}
df7492f9 6965
3a73fa5d 6966\f
1ba9e4ab
KH
6967DEFUN ("set-terminal-coding-system-internal",
6968 Fset_terminal_coding_system_internal,
48b0f3ae
PJ
6969 Sset_terminal_coding_system_internal, 1, 1, 0,
6970 doc: /* Internal use only. */)
6971 (coding_system)
b74e4686 6972 Lisp_Object coding_system;
4ed46869 6973{
b7826503 6974 CHECK_SYMBOL (coding_system);
df7492f9
KH
6975 setup_coding_system (Fcheck_coding_system (coding_system),
6976 &terminal_coding);
6977
70c22245 6978 /* We had better not send unsafe characters to terminal. */
df7492f9
KH
6979 terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
6980 /* Characer composition should be disabled. */
6981 terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
b73bfc1c
KH
6982 terminal_coding.src_multibyte = 1;
6983 terminal_coding.dst_multibyte = 0;
4ed46869
KH
6984 return Qnil;
6985}
6986
c4825358
KH
6987DEFUN ("set-safe-terminal-coding-system-internal",
6988 Fset_safe_terminal_coding_system_internal,
48b0f3ae 6989 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
ddb67bdc 6990 doc: /* Internal use only. */)
48b0f3ae 6991 (coding_system)
b74e4686 6992 Lisp_Object coding_system;
c4825358 6993{
b7826503 6994 CHECK_SYMBOL (coding_system);
c4825358
KH
6995 setup_coding_system (Fcheck_coding_system (coding_system),
6996 &safe_terminal_coding);
df7492f9
KH
6997 /* Characer composition should be disabled. */
6998 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
b73bfc1c
KH
6999 safe_terminal_coding.src_multibyte = 1;
7000 safe_terminal_coding.dst_multibyte = 0;
c4825358
KH
7001 return Qnil;
7002}
7003
4ed46869
KH
7004DEFUN ("terminal-coding-system",
7005 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
48b0f3ae
PJ
7006 doc: /* Return coding system specified for terminal output. */)
7007 ()
4ed46869 7008{
df7492f9 7009 return CODING_ID_NAME (terminal_coding.id);
4ed46869
KH
7010}
7011
1ba9e4ab
KH
7012DEFUN ("set-keyboard-coding-system-internal",
7013 Fset_keyboard_coding_system_internal,
48b0f3ae
PJ
7014 Sset_keyboard_coding_system_internal, 1, 1, 0,
7015 doc: /* Internal use only. */)
7016 (coding_system)
4ed46869
KH
7017 Lisp_Object coding_system;
7018{
b7826503 7019 CHECK_SYMBOL (coding_system);
df7492f9
KH
7020 setup_coding_system (Fcheck_coding_system (coding_system),
7021 &keyboard_coding);
7022 /* Characer composition should be disabled. */
7023 keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
4ed46869
KH
7024 return Qnil;
7025}
7026
7027DEFUN ("keyboard-coding-system",
7028 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
48b0f3ae
PJ
7029 doc: /* Return coding system specified for decoding keyboard input. */)
7030 ()
4ed46869 7031{
df7492f9 7032 return CODING_ID_NAME (keyboard_coding.id);
4ed46869
KH
7033}
7034
7035\f
a5d301df
KH
7036DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7037 Sfind_operation_coding_system, 1, MANY, 0,
48b0f3ae
PJ
7038 doc: /* Choose a coding system for an operation based on the target name.
7039The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7040DECODING-SYSTEM is the coding system to use for decoding
7041\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7042for encoding (in case OPERATION does encoding).
7043
7044The first argument OPERATION specifies an I/O primitive:
7045 For file I/O, `insert-file-contents' or `write-region'.
7046 For process I/O, `call-process', `call-process-region', or `start-process'.
7047 For network I/O, `open-network-stream'.
7048
7049The remaining arguments should be the same arguments that were passed
7050to the primitive. Depending on which primitive, one of those arguments
7051is selected as the TARGET. For example, if OPERATION does file I/O,
7052whichever argument specifies the file name is TARGET.
7053
7054TARGET has a meaning which depends on OPERATION:
7055 For file I/O, TARGET is a file name.
7056 For process I/O, TARGET is a process name.
7057 For network I/O, TARGET is a service name or a port number
7058
7059This function looks up what specified for TARGET in,
7060`file-coding-system-alist', `process-coding-system-alist',
7061or `network-coding-system-alist' depending on OPERATION.
7062They may specify a coding system, a cons of coding systems,
7063or a function symbol to call.
7064In the last case, we call the function with one argument,
7065which is a list of all the arguments given to this function.
7066
7067usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7068 (nargs, args)
4ed46869
KH
7069 int nargs;
7070 Lisp_Object *args;
7071{
7072 Lisp_Object operation, target_idx, target, val;
7073 register Lisp_Object chain;
7074
7075 if (nargs < 2)
7076 error ("Too few arguments");
7077 operation = args[0];
7078 if (!SYMBOLP (operation)
7079 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
df7492f9 7080 error ("Invalid first arguement");
4ed46869
KH
7081 if (nargs < 1 + XINT (target_idx))
7082 error ("Too few arguments for operation: %s",
7083 XSYMBOL (operation)->name->data);
7084 target = args[XINT (target_idx) + 1];
7085 if (!(STRINGP (target)
7086 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
df7492f9 7087 error ("Invalid %dth argument", XINT (target_idx) + 1);
4ed46869 7088
2e34157c
RS
7089 chain = ((EQ (operation, Qinsert_file_contents)
7090 || EQ (operation, Qwrite_region))
02ba4723 7091 ? Vfile_coding_system_alist
2e34157c 7092 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
7093 ? Vnetwork_coding_system_alist
7094 : Vprocess_coding_system_alist));
4ed46869
KH
7095 if (NILP (chain))
7096 return Qnil;
7097
03699b14 7098 for (; CONSP (chain); chain = XCDR (chain))
4ed46869 7099 {
f44d27ce 7100 Lisp_Object elt;
4ed46869 7101
df7492f9 7102 elt = XCAR (chain);
4ed46869
KH
7103 if (CONSP (elt)
7104 && ((STRINGP (target)
03699b14
KR
7105 && STRINGP (XCAR (elt))
7106 && fast_string_match (XCAR (elt), target) >= 0)
7107 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
02ba4723 7108 {
03699b14 7109 val = XCDR (elt);
b19fd4c5
KH
7110 /* Here, if VAL is both a valid coding system and a valid
7111 function symbol, we return VAL as a coding system. */
02ba4723
KH
7112 if (CONSP (val))
7113 return val;
7114 if (! SYMBOLP (val))
7115 return Qnil;
7116 if (! NILP (Fcoding_system_p (val)))
7117 return Fcons (val, val);
b19fd4c5
KH
7118 if (! NILP (Ffboundp (val)))
7119 {
7120 val = call1 (val, Flist (nargs, args));
7121 if (CONSP (val))
7122 return val;
7123 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7124 return Fcons (val, val);
7125 }
02ba4723
KH
7126 return Qnil;
7127 }
4ed46869
KH
7128 }
7129 return Qnil;
7130}
7131
df7492f9
KH
7132DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
7133 Sset_coding_system_priority, 1, MANY, 0,
1fcd6c8b
DL
7134 doc: /* Assign higher priority to coding systems given as arguments.
7135usage: (set-coding-system-priority CODING-SYSTEM ...) */)
df7492f9
KH
7136 (nargs, args)
7137 int nargs;
7138 Lisp_Object *args;
7139{
7140 int i, j;
7141 int changed[coding_category_max];
7142 enum coding_category priorities[coding_category_max];
7143
7144 bzero (changed, sizeof changed);
7145
7146 for (i = j = 0; i < nargs; i++)
7147 {
7148 enum coding_category category;
7149 Lisp_Object spec, attrs;
7150
7151 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
7152 attrs = AREF (spec, 0);
7153 category = XINT (CODING_ATTR_CATEGORY (attrs));
7154 if (changed[category])
7155 /* Ignore this coding system because a coding system of the
7156 same category already had a higher priority. */
7157 continue;
7158 changed[category] = 1;
7159 priorities[j++] = category;
7160 if (coding_categories[category].id >= 0
7161 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
7162 setup_coding_system (args[i], &coding_categories[category]);
7163 }
7164
7165 /* Now we have decided top J priorities. Reflect the order of the
7166 original priorities to the remaining priorities. */
7167
7168 for (i = j, j = 0; i < coding_category_max; i++, j++)
7169 {
7170 while (j < coding_category_max
7171 && changed[coding_priorities[j]])
7172 j++;
7173 if (j == coding_category_max)
7174 abort ();
7175 priorities[i] = coding_priorities[j];
7176 }
7177
7178 bcopy (priorities, coding_priorities, sizeof priorities);
7179 return Qnil;
7180}
7181
7182DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
7183 Scoding_system_priority_list, 0, 1, 0,
7184 doc: /* Return a list of coding systems ordered by their priorities. */)
7185 (highestp)
7186 Lisp_Object highestp;
d46c5b12
KH
7187{
7188 int i;
df7492f9 7189 Lisp_Object val;
d46c5b12 7190
df7492f9 7191 for (i = 0, val = Qnil; i < coding_category_max; i++)
d46c5b12 7192 {
df7492f9
KH
7193 enum coding_category category = coding_priorities[i];
7194 int id = coding_categories[category].id;
7195 Lisp_Object attrs;
7196
7197 if (id < 0)
7198 continue;
7199 attrs = CODING_ID_ATTRS (id);
7200 if (! NILP (highestp))
7201 return CODING_ATTR_BASE_NAME (attrs);
7202 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
7203 }
7204 return Fnreverse (val);
7205}
7206
7207static Lisp_Object
7208make_subsidiaries (base)
7209 Lisp_Object base;
7210{
7211 Lisp_Object subsidiaries;
7212 char *suffixes[] = { "-unix", "-dos", "-mac" };
7213 int base_name_len = STRING_BYTES (XSYMBOL (base)->name);
7214 char *buf = (char *) alloca (base_name_len + 6);
7215 int i;
7216
7217 bcopy (XSYMBOL (base)->name->data, buf, base_name_len);
7218 subsidiaries = Fmake_vector (make_number (3), Qnil);
7219 for (i = 0; i < 3; i++)
7220 {
7221 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
7222 ASET (subsidiaries, i, intern (buf));
7223 }
7224 return subsidiaries;
7225}
7226
7227
7228DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7229 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
1fcd6c8b
DL
7230 doc: /* For internal use only.
7231usage: (define-coding-system-internal ...) */)
df7492f9
KH
7232 (nargs, args)
7233 int nargs;
7234 Lisp_Object *args;
7235{
7236 Lisp_Object name;
7237 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
7238 Lisp_Object attrs; /* Vector of attributes. */
7239 Lisp_Object eol_type;
7240 Lisp_Object aliases;
7241 Lisp_Object coding_type, charset_list, safe_charsets;
7242 enum coding_category category;
7243 Lisp_Object tail, val;
7244 int max_charset_id = 0;
7245 int i;
7246
7247 if (nargs < coding_arg_max)
7248 goto short_args;
7249
7250 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
7251
7252 name = args[coding_arg_name];
7253 CHECK_SYMBOL (name);
7254 CODING_ATTR_BASE_NAME (attrs) = name;
7255
7256 val = args[coding_arg_mnemonic];
7257 if (! STRINGP (val))
7258 CHECK_CHARACTER (val);
7259 CODING_ATTR_MNEMONIC (attrs) = val;
7260
7261 coding_type = args[coding_arg_coding_type];
7262 CHECK_SYMBOL (coding_type);
7263 CODING_ATTR_TYPE (attrs) = coding_type;
7264
7265 charset_list = args[coding_arg_charset_list];
7266 if (SYMBOLP (charset_list))
7267 {
7268 if (EQ (charset_list, Qiso_2022))
7269 {
7270 if (! EQ (coding_type, Qiso_2022))
7271 error ("Invalid charset-list");
7272 charset_list = Viso_2022_charset_list;
7273 }
7274 else if (EQ (charset_list, Qemacs_mule))
7275 {
7276 if (! EQ (coding_type, Qemacs_mule))
7277 error ("Invalid charset-list");
7278 charset_list = Vemacs_mule_charset_list;
7279 }
7280 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7281 if (max_charset_id < XFASTINT (XCAR (tail)))
7282 max_charset_id = XFASTINT (XCAR (tail));
7283 }
7284 else
7285 {
7286 charset_list = Fcopy_sequence (charset_list);
7287 for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
7288 {
7289 struct charset *charset;
7290
7291 val = Fcar (tail);
7292 CHECK_CHARSET_GET_CHARSET (val, charset);
7293 if (EQ (coding_type, Qiso_2022)
7294 ? CHARSET_ISO_FINAL (charset) < 0
7295 : EQ (coding_type, Qemacs_mule)
7296 ? CHARSET_EMACS_MULE_ID (charset) < 0
7297 : 0)
7298 error ("Can't handle charset `%s'",
7299 XSYMBOL (CHARSET_NAME (charset))->name->data);
7300
7301 XCAR (tail) = make_number (charset->id);
7302 if (max_charset_id < charset->id)
7303 max_charset_id = charset->id;
7304 }
7305 }
7306 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
7307
7308 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
7309 make_number (255));
7310 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7311 XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0;
7312 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
7313
7314 val = args[coding_arg_decode_translation_table];
7315 if (! NILP (val))
7316 CHECK_CHAR_TABLE (val);
7317 CODING_ATTR_DECODE_TBL (attrs) = val;
7318
7319 val = args[coding_arg_encode_translation_table];
7320 if (! NILP (val))
7321 CHECK_CHAR_TABLE (val);
7322 CODING_ATTR_ENCODE_TBL (attrs) = val;
7323
7324 val = args[coding_arg_post_read_conversion];
7325 CHECK_SYMBOL (val);
7326 CODING_ATTR_POST_READ (attrs) = val;
7327
7328 val = args[coding_arg_pre_write_conversion];
7329 CHECK_SYMBOL (val);
7330 CODING_ATTR_PRE_WRITE (attrs) = val;
7331
7332 val = args[coding_arg_default_char];
7333 if (NILP (val))
7334 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
7335 else
7336 {
7337 CHECK_CHARACTER (val);
7338 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
7339 }
7340
7341 val = args[coding_arg_plist];
7342 CHECK_LIST (val);
7343 CODING_ATTR_PLIST (attrs) = val;
7344
7345 if (EQ (coding_type, Qcharset))
7346 {
7347 val = Fmake_vector (make_number (256), Qnil);
7348
7349 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7350 {
7351 struct charset *charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
15d143f7 7352 int idx = (CHARSET_DIMENSION (charset) - 1) * 4;
df7492f9 7353
15d143f7
KH
7354 for (i = charset->code_space[idx];
7355 i <= charset->code_space[idx + 1]; i++)
7356 {
7357 if (NILP (AREF (val, i)))
7358 ASET (val, i, XCAR (tail));
7359 else
7360 error ("Charsets conflicts in the first byte");
7361 }
df7492f9
KH
7362 }
7363 ASET (attrs, coding_attr_charset_valids, val);
7364 category = coding_category_charset;
7365 }
7366 else if (EQ (coding_type, Qccl))
7367 {
7368 Lisp_Object valids;
7369
7370 if (nargs < coding_arg_ccl_max)
7371 goto short_args;
7372
7373 val = args[coding_arg_ccl_decoder];
7374 CHECK_CCL_PROGRAM (val);
7375 if (VECTORP (val))
7376 val = Fcopy_sequence (val);
7377 ASET (attrs, coding_attr_ccl_decoder, val);
7378
7379 val = args[coding_arg_ccl_encoder];
7380 CHECK_CCL_PROGRAM (val);
7381 if (VECTORP (val))
7382 val = Fcopy_sequence (val);
7383 ASET (attrs, coding_attr_ccl_encoder, val);
7384
7385 val = args[coding_arg_ccl_valids];
7386 valids = Fmake_string (make_number (256), make_number (0));
7387 for (tail = val; !NILP (tail); tail = Fcdr (tail))
7388 {
7389 val = Fcar (tail);
7390 if (INTEGERP (val))
7391 ASET (valids, XINT (val), 1);
7392 else
7393 {
7394 int from, to;
7395
7396 CHECK_CONS (val);
7397 CHECK_NUMBER (XCAR (val));
7398 CHECK_NUMBER (XCDR (val));
7399 from = XINT (XCAR (val));
7400 to = XINT (XCDR (val));
7401 for (i = from; i <= to; i++)
7402 ASET (valids, i, 1);
7403 }
7404 }
7405 ASET (attrs, coding_attr_ccl_valids, valids);
7406
7407 category = coding_category_ccl;
7408 }
7409 else if (EQ (coding_type, Qutf_16))
7410 {
7411 Lisp_Object bom, endian;
7412
7413 if (nargs < coding_arg_utf16_max)
7414 goto short_args;
7415
7416 bom = args[coding_arg_utf16_bom];
7417 if (! NILP (bom) && ! EQ (bom, Qt))
7418 {
7419 CHECK_CONS (bom);
7420 CHECK_CODING_SYSTEM (XCAR (bom));
7421 CHECK_CODING_SYSTEM (XCDR (bom));
7422 }
7423 ASET (attrs, coding_attr_utf_16_bom, bom);
7424
7425 endian = args[coding_arg_utf16_endian];
7426 ASET (attrs, coding_attr_utf_16_endian, endian);
7427
7428 category = (CONSP (bom)
7429 ? coding_category_utf_16_auto
7430 : NILP (bom)
7431 ? (NILP (endian)
7432 ? coding_category_utf_16_be_nosig
7433 : coding_category_utf_16_le_nosig)
7434 : (NILP (endian)
7435 ? coding_category_utf_16_be
7436 : coding_category_utf_16_le));
7437 }
7438 else if (EQ (coding_type, Qiso_2022))
7439 {
7440 Lisp_Object initial, reg_usage, request, flags;
0be8721c 7441 int i, id;
1397dc18 7442
df7492f9
KH
7443 if (nargs < coding_arg_iso2022_max)
7444 goto short_args;
7445
7446 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
7447 CHECK_VECTOR (initial);
7448 for (i = 0; i < 4; i++)
7449 {
7450 val = Faref (initial, make_number (i));
7451 if (! NILP (val))
7452 {
7453 CHECK_CHARSET_GET_ID (val, id);
7454 ASET (initial, i, make_number (id));
7455 }
7456 else
7457 ASET (initial, i, make_number (-1));
7458 }
7459
7460 reg_usage = args[coding_arg_iso2022_reg_usage];
7461 CHECK_CONS (reg_usage);
7462 CHECK_NATNUM (XCAR (reg_usage));
7463 CHECK_NATNUM (XCDR (reg_usage));
7464
7465 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
7466 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
1397dc18 7467 {
df7492f9
KH
7468 int id;
7469
7470 val = Fcar (tail);
7471 CHECK_CONS (val);
7472 CHECK_CHARSET_GET_ID (XCAR (val), id);
7473 CHECK_NATNUM (XCDR (val));
7474 if (XINT (XCDR (val)) >= 4)
7475 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
7476 XCAR (val) = make_number (id);
1397dc18 7477 }
df7492f9
KH
7478
7479 flags = args[coding_arg_iso2022_flags];
7480 CHECK_NATNUM (flags);
7481 i = XINT (flags);
7482 if (EQ (args[coding_arg_charset_list], Qiso_2022))
7483 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
7484
7485 ASET (attrs, coding_attr_iso_initial, initial);
7486 ASET (attrs, coding_attr_iso_usage, reg_usage);
7487 ASET (attrs, coding_attr_iso_request, request);
7488 ASET (attrs, coding_attr_iso_flags, flags);
7489 setup_iso_safe_charsets (attrs);
7490
7491 if (i & CODING_ISO_FLAG_SEVEN_BITS)
7492 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
7493 | CODING_ISO_FLAG_SINGLE_SHIFT))
7494 ? coding_category_iso_7_else
7495 : EQ (args[coding_arg_charset_list], Qiso_2022)
7496 ? coding_category_iso_7
7497 : coding_category_iso_7_tight);
7498 else
7499 {
7500 int id = XINT (AREF (initial, 1));
7501
7502 category = (((i & (CODING_ISO_FLAG_LOCKING_SHIFT
7503 | CODING_ISO_FLAG_SINGLE_SHIFT))
7504 || EQ (args[coding_arg_charset_list], Qiso_2022)
7505 || id < 0)
7506 ? coding_category_iso_8_else
7507 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
7508 ? coding_category_iso_8_1
7509 : coding_category_iso_8_2);
7510 }
7511 }
7512 else if (EQ (coding_type, Qemacs_mule))
7513 {
7514 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
7515 ASET (attrs, coding_attr_emacs_mule_full, Qt);
7516
7517 category = coding_category_emacs_mule;
7518 }
7519 else if (EQ (coding_type, Qshift_jis))
7520 {
7521
7522 struct charset *charset;
7523
7524 if (XINT (Flength (charset_list)) != 3)
7525 error ("There should be just three charsets");
7526
7527 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7528 if (CHARSET_DIMENSION (charset) != 1)
7529 error ("Dimension of charset %s is not one",
7530 XSYMBOL (CHARSET_NAME (charset))->name->data);
7531
7532 charset_list = XCDR (charset_list);
7533 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7534 if (CHARSET_DIMENSION (charset) != 1)
7535 error ("Dimension of charset %s is not one",
7536 XSYMBOL (CHARSET_NAME (charset))->name->data);
7537
7538 charset_list = XCDR (charset_list);
7539 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7540 if (CHARSET_DIMENSION (charset) != 2)
7541 error ("Dimension of charset %s is not two",
7542 XSYMBOL (CHARSET_NAME (charset))->name->data);
7543
7544 category = coding_category_sjis;
7545 Vsjis_coding_system = name;
7546 }
7547 else if (EQ (coding_type, Qbig5))
7548 {
7549 struct charset *charset;
7550
7551 if (XINT (Flength (charset_list)) != 2)
7552 error ("There should be just two charsets");
7553
7554 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7555 if (CHARSET_DIMENSION (charset) != 1)
7556 error ("Dimension of charset %s is not one",
7557 XSYMBOL (CHARSET_NAME (charset))->name->data);
7558
7559 charset_list = XCDR (charset_list);
7560 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7561 if (CHARSET_DIMENSION (charset) != 2)
7562 error ("Dimension of charset %s is not two",
7563 XSYMBOL (CHARSET_NAME (charset))->name->data);
7564
7565 category = coding_category_big5;
7566 Vbig5_coding_system = name;
7567 }
7568 else if (EQ (coding_type, Qraw_text))
7569 category = coding_category_raw_text;
7570 else if (EQ (coding_type, Qutf_8))
7571 category = coding_category_utf_8;
7572 else if (EQ (coding_type, Qundecided))
7573 category = coding_category_undecided;
7574 else
7575 error ("Invalid coding system type: %s",
7576 XSYMBOL (coding_type)->name->data);
7577
7578 CODING_ATTR_CATEGORY (attrs) = make_number (category);
7579
7580 eol_type = args[coding_arg_eol_type];
7581 if (! NILP (eol_type)
7582 && ! EQ (eol_type, Qunix)
7583 && ! EQ (eol_type, Qdos)
7584 && ! EQ (eol_type, Qmac))
7585 error ("Invalid eol-type");
7586
7587 aliases = Fcons (name, Qnil);
7588
7589 if (NILP (eol_type))
7590 {
7591 eol_type = make_subsidiaries (name);
7592 for (i = 0; i < 3; i++)
1397dc18 7593 {
df7492f9
KH
7594 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
7595
7596 this_name = AREF (eol_type, i);
7597 this_aliases = Fcons (this_name, Qnil);
7598 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
7599 this_spec = Fmake_vector (make_number (3), attrs);
7600 ASET (this_spec, 1, this_aliases);
7601 ASET (this_spec, 2, this_eol_type);
7602 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
7603 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
7604 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
7605 Vcoding_system_alist);
1397dc18 7606 }
d46c5b12 7607 }
1397dc18 7608
df7492f9
KH
7609 spec_vec = Fmake_vector (make_number (3), attrs);
7610 ASET (spec_vec, 1, aliases);
7611 ASET (spec_vec, 2, eol_type);
7612
7613 Fputhash (name, spec_vec, Vcoding_system_hash_table);
7614 Vcoding_system_list = Fcons (name, Vcoding_system_list);
7615 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
7616 Vcoding_system_alist);
7617
7618 {
7619 int id = coding_categories[category].id;
7620
7621 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
7622 setup_coding_system (name, &coding_categories[category]);
7623 }
7624
d46c5b12 7625 return Qnil;
df7492f9
KH
7626
7627 short_args:
7628 return Fsignal (Qwrong_number_of_arguments,
7629 Fcons (intern ("define-coding-system-internal"),
7630 make_number (nargs)));
d46c5b12
KH
7631}
7632
df7492f9
KH
7633DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
7634 Sdefine_coding_system_alias, 2, 2, 0,
7635 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
7636 (alias, coding_system)
7637 Lisp_Object alias, coding_system;
66cfb530 7638{
df7492f9 7639 Lisp_Object spec, aliases, eol_type;
84d60297 7640
df7492f9
KH
7641 CHECK_SYMBOL (alias);
7642 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7643 aliases = AREF (spec, 1);
7644 while (!NILP (XCDR (aliases)))
7645 aliases = XCDR (aliases);
7646 XCDR (aliases) = Fcons (alias, Qnil);
66cfb530 7647
df7492f9
KH
7648 eol_type = AREF (spec, 2);
7649 if (VECTORP (eol_type))
66cfb530 7650 {
df7492f9
KH
7651 Lisp_Object subsidiaries;
7652 int i;
7653
7654 subsidiaries = make_subsidiaries (alias);
7655 for (i = 0; i < 3; i++)
7656 Fdefine_coding_system_alias (AREF (subsidiaries, i),
7657 AREF (eol_type, i));
7658
7659 ASET (spec, 2, subsidiaries);
66cfb530 7660 }
df7492f9
KH
7661
7662 Fputhash (alias, spec, Vcoding_system_hash_table);
5bad0796
DL
7663 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
7664 Vcoding_system_alist);
66cfb530
KH
7665
7666 return Qnil;
7667}
7668
df7492f9
KH
7669DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
7670 1, 1, 0,
7671 doc: /* Return the base of CODING-SYSTEM.
7672Any alias or subsidiary coding systems are not base coding system. */)
7673 (coding_system)
7674 Lisp_Object coding_system;
7675{
7676 Lisp_Object spec, attrs;
7677
7678 if (NILP (coding_system))
7679 return (Qno_conversion);
7680 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7681 attrs = AREF (spec, 0);
7682 return CODING_ATTR_BASE_NAME (attrs);
7683}
7684
7685DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
7686 1, 1, 0,
7687 doc: "Return the property list of CODING-SYSTEM.")
7688 (coding_system)
7689 Lisp_Object coding_system;
7690{
7691 Lisp_Object spec, attrs;
7692
7693 if (NILP (coding_system))
7694 coding_system = Qno_conversion;
7695 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7696 attrs = AREF (spec, 0);
7697 return CODING_ATTR_PLIST (attrs);
7698}
7699
7700
7701DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
7702 1, 1, 0,
7703 doc: /* Return the list of aliases of CODING-SYSTEM.
7704A base coding system is what made by `define-coding-system'.
7705Any alias nor subsidiary coding systems are not base coding system. */)
7706 (coding_system)
7707 Lisp_Object coding_system;
7708{
7709 Lisp_Object spec;
7710
7711 if (NILP (coding_system))
7712 coding_system = Qno_conversion;
7713 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7714 return AREF (spec, 2);
7715}
7716
7717DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
7718 Scoding_system_eol_type, 1, 1, 0,
7719 doc: /* Return eol-type of CODING-SYSTEM.
7720An eol-type is integer 0, 1, 2, or a vector of coding systems.
7721
7722Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
7723and CR respectively.
7724
7725A vector value indicates that a format of end-of-line should be
7726detected automatically. Nth element of the vector is the subsidiary
7727coding system whose eol-type is N. */)
7728 (coding_system)
7729 Lisp_Object coding_system;
7730{
7731 Lisp_Object spec, eol_type;
7732 int n;
7733
7734 if (NILP (coding_system))
7735 coding_system = Qno_conversion;
7736 if (! CODING_SYSTEM_P (coding_system))
7737 return Qnil;
7738 spec = CODING_SYSTEM_SPEC (coding_system);
7739 eol_type = AREF (spec, 2);
7740 if (VECTORP (eol_type))
7741 return Fcopy_sequence (eol_type);
7742 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
7743 return make_number (n);
7744}
7745
4ed46869
KH
7746#endif /* emacs */
7747
7748\f
1397dc18 7749/*** 9. Post-amble ***/
4ed46869 7750
dfcf069d 7751void
4ed46869
KH
7752init_coding_once ()
7753{
7754 int i;
7755
df7492f9
KH
7756 for (i = 0; i < coding_category_max; i++)
7757 {
7758 coding_categories[i].id = -1;
7759 coding_priorities[i] = i;
7760 }
4ed46869
KH
7761
7762 /* ISO2022 specific initialize routine. */
7763 for (i = 0; i < 0x20; i++)
b73bfc1c 7764 iso_code_class[i] = ISO_control_0;
4ed46869
KH
7765 for (i = 0x21; i < 0x7F; i++)
7766 iso_code_class[i] = ISO_graphic_plane_0;
7767 for (i = 0x80; i < 0xA0; i++)
b73bfc1c 7768 iso_code_class[i] = ISO_control_1;
4ed46869
KH
7769 for (i = 0xA1; i < 0xFF; i++)
7770 iso_code_class[i] = ISO_graphic_plane_1;
7771 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7772 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7773 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7774 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7775 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7776 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7777 iso_code_class[ISO_CODE_ESC] = ISO_escape;
7778 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7779 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7780 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7781
b843d1ae 7782 inhibit_pre_post_conversion = 0;
df7492f9
KH
7783
7784 for (i = 0; i < 256; i++)
7785 {
7786 emacs_mule_bytes[i] = 1;
7787 }
e0e989f6
KH
7788}
7789
7790#ifdef emacs
7791
dfcf069d 7792void
e0e989f6
KH
7793syms_of_coding ()
7794{
df7492f9
KH
7795 staticpro (&Vcoding_system_hash_table);
7796 Vcoding_system_hash_table = Fmakehash (Qeq);
7797
7798 staticpro (&Vsjis_coding_system);
7799 Vsjis_coding_system = Qnil;
7800
7801 staticpro (&Vbig5_coding_system);
7802 Vbig5_coding_system = Qnil;
7803
7804 staticpro (&Vcode_conversion_work_buf_list);
7805 Vcode_conversion_work_buf_list = Qnil;
e0e989f6 7806
df7492f9
KH
7807 staticpro (&Vcode_conversion_reused_work_buf);
7808 Vcode_conversion_reused_work_buf = Qnil;
7809
7810 DEFSYM (Qcharset, "charset");
7811 DEFSYM (Qtarget_idx, "target-idx");
7812 DEFSYM (Qcoding_system_history, "coding-system-history");
bb0115a2
RS
7813 Fset (Qcoding_system_history, Qnil);
7814
9ce27fde 7815 /* Target FILENAME is the first argument. */
e0e989f6 7816 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 7817 /* Target FILENAME is the third argument. */
e0e989f6
KH
7818 Fput (Qwrite_region, Qtarget_idx, make_number (2));
7819
df7492f9 7820 DEFSYM (Qcall_process, "call-process");
9ce27fde 7821 /* Target PROGRAM is the first argument. */
e0e989f6
KH
7822 Fput (Qcall_process, Qtarget_idx, make_number (0));
7823
df7492f9 7824 DEFSYM (Qcall_process_region, "call-process-region");
9ce27fde 7825 /* Target PROGRAM is the third argument. */
e0e989f6
KH
7826 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7827
df7492f9 7828 DEFSYM (Qstart_process, "start-process");
9ce27fde 7829 /* Target PROGRAM is the third argument. */
e0e989f6
KH
7830 Fput (Qstart_process, Qtarget_idx, make_number (2));
7831
df7492f9 7832 DEFSYM (Qopen_network_stream, "open-network-stream");
9ce27fde 7833 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
7834 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7835
df7492f9
KH
7836 DEFSYM (Qcoding_system, "coding-system");
7837 DEFSYM (Qcoding_aliases, "coding-aliases");
4ed46869 7838
df7492f9
KH
7839 DEFSYM (Qeol_type, "eol-type");
7840 DEFSYM (Qunix, "unix");
7841 DEFSYM (Qdos, "dos");
7842 DEFSYM (Qmac, "mac");
4ed46869 7843
df7492f9
KH
7844 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
7845 DEFSYM (Qpost_read_conversion, "post-read-conversion");
7846 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
7847 DEFSYM (Qdefault_char, "default-char");
7848 DEFSYM (Qundecided, "undecided");
7849 DEFSYM (Qno_conversion, "no-conversion");
7850 DEFSYM (Qraw_text, "raw-text");
4ed46869 7851
df7492f9 7852 DEFSYM (Qiso_2022, "iso-2022");
4ed46869 7853
df7492f9 7854 DEFSYM (Qutf_8, "utf-8");
27901516 7855
df7492f9
KH
7856 DEFSYM (Qutf_16, "utf-16");
7857 DEFSYM (Qutf_16_be, "utf-16-be");
7858 DEFSYM (Qutf_16_be_nosig, "utf-16-be-nosig");
7859 DEFSYM (Qutf_16_le, "utf-16-l3");
7860 DEFSYM (Qutf_16_le_nosig, "utf-16-le-nosig");
7861 DEFSYM (Qsignature, "signature");
7862 DEFSYM (Qendian, "endian");
7863 DEFSYM (Qbig, "big");
7864 DEFSYM (Qlittle, "little");
27901516 7865
df7492f9
KH
7866 DEFSYM (Qshift_jis, "shift-jis");
7867 DEFSYM (Qbig5, "big5");
4ed46869 7868
df7492f9 7869 DEFSYM (Qcoding_system_p, "coding-system-p");
4ed46869 7870
df7492f9 7871 DEFSYM (Qcoding_system_error, "coding-system-error");
4ed46869
KH
7872 Fput (Qcoding_system_error, Qerror_conditions,
7873 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7874 Fput (Qcoding_system_error, Qerror_message,
9ce27fde 7875 build_string ("Invalid coding system"));
4ed46869 7876
df7492f9
KH
7877 /* Intern this now in case it isn't already done.
7878 Setting this variable twice is harmless.
7879 But don't staticpro it here--that is done in alloc.c. */
7880 Qchar_table_extra_slots = intern ("char-table-extra-slots");
4ed46869 7881
df7492f9 7882 DEFSYM (Qtranslation_table, "translation-table");
1397dc18 7883 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
df7492f9
KH
7884 DEFSYM (Qtranslation_table_id, "translation-table-id");
7885 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
7886 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
bdd9fb48 7887
df7492f9 7888 DEFSYM (Qvalid_codes, "valid-codes");
05e6f5dc 7889
df7492f9 7890 DEFSYM (Qemacs_mule, "emacs-mule");
05e6f5dc 7891
df7492f9
KH
7892 Vcoding_category_table
7893 = Fmake_vector (make_number (coding_category_max), Qnil);
7894 staticpro (&Vcoding_category_table);
7895 /* Followings are target of code detection. */
7896 ASET (Vcoding_category_table, coding_category_iso_7,
7897 intern ("coding-category-iso-7"));
7898 ASET (Vcoding_category_table, coding_category_iso_7_tight,
7899 intern ("coding-category-iso-7-tight"));
7900 ASET (Vcoding_category_table, coding_category_iso_8_1,
7901 intern ("coding-category-iso-8-1"));
7902 ASET (Vcoding_category_table, coding_category_iso_8_2,
7903 intern ("coding-category-iso-8-2"));
7904 ASET (Vcoding_category_table, coding_category_iso_7_else,
7905 intern ("coding-category-iso-7-else"));
7906 ASET (Vcoding_category_table, coding_category_iso_8_else,
7907 intern ("coding-category-iso-8-else"));
7908 ASET (Vcoding_category_table, coding_category_utf_8,
7909 intern ("coding-category-utf-8"));
7910 ASET (Vcoding_category_table, coding_category_utf_16_be,
7911 intern ("coding-category-utf-16-be"));
7912 ASET (Vcoding_category_table, coding_category_utf_16_le,
7913 intern ("coding-category-utf-16-le"));
7914 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
7915 intern ("coding-category-utf-16-be-nosig"));
7916 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
7917 intern ("coding-category-utf-16-le-nosig"));
7918 ASET (Vcoding_category_table, coding_category_charset,
7919 intern ("coding-category-charset"));
7920 ASET (Vcoding_category_table, coding_category_sjis,
7921 intern ("coding-category-sjis"));
7922 ASET (Vcoding_category_table, coding_category_big5,
7923 intern ("coding-category-big5"));
7924 ASET (Vcoding_category_table, coding_category_ccl,
7925 intern ("coding-category-ccl"));
7926 ASET (Vcoding_category_table, coding_category_emacs_mule,
7927 intern ("coding-category-emacs-mule"));
7928 /* Followings are NOT target of code detection. */
7929 ASET (Vcoding_category_table, coding_category_raw_text,
7930 intern ("coding-category-raw-text"));
7931 ASET (Vcoding_category_table, coding_category_undecided,
7932 intern ("coding-category-undecided"));
70c22245 7933
4ed46869
KH
7934 defsubr (&Scoding_system_p);
7935 defsubr (&Sread_coding_system);
7936 defsubr (&Sread_non_nil_coding_system);
7937 defsubr (&Scheck_coding_system);
7938 defsubr (&Sdetect_coding_region);
d46c5b12 7939 defsubr (&Sdetect_coding_string);
05e6f5dc 7940 defsubr (&Sfind_coding_systems_region_internal);
df7492f9 7941 defsubr (&Scheck_coding_systems_region);
4ed46869
KH
7942 defsubr (&Sdecode_coding_region);
7943 defsubr (&Sencode_coding_region);
7944 defsubr (&Sdecode_coding_string);
7945 defsubr (&Sencode_coding_string);
7946 defsubr (&Sdecode_sjis_char);
7947 defsubr (&Sencode_sjis_char);
7948 defsubr (&Sdecode_big5_char);
7949 defsubr (&Sencode_big5_char);
1ba9e4ab 7950 defsubr (&Sset_terminal_coding_system_internal);
c4825358 7951 defsubr (&Sset_safe_terminal_coding_system_internal);
4ed46869 7952 defsubr (&Sterminal_coding_system);
1ba9e4ab 7953 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 7954 defsubr (&Skeyboard_coding_system);
a5d301df 7955 defsubr (&Sfind_operation_coding_system);
df7492f9
KH
7956 defsubr (&Sset_coding_system_priority);
7957 defsubr (&Sdefine_coding_system_internal);
7958 defsubr (&Sdefine_coding_system_alias);
7959 defsubr (&Scoding_system_base);
7960 defsubr (&Scoding_system_plist);
7961 defsubr (&Scoding_system_aliases);
7962 defsubr (&Scoding_system_eol_type);
7963 defsubr (&Scoding_system_priority_list);
4ed46869 7964
4608c386 7965 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
48b0f3ae
PJ
7966 doc: /* List of coding systems.
7967
7968Do not alter the value of this variable manually. This variable should be
df7492f9 7969updated by the functions `define-coding-system' and
48b0f3ae 7970`define-coding-system-alias'. */);
4608c386
KH
7971 Vcoding_system_list = Qnil;
7972
7973 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
48b0f3ae
PJ
7974 doc: /* Alist of coding system names.
7975Each element is one element list of coding system name.
7976This variable is given to `completing-read' as TABLE argument.
7977
7978Do not alter the value of this variable manually. This variable should be
7979updated by the functions `make-coding-system' and
7980`define-coding-system-alias'. */);
4608c386
KH
7981 Vcoding_system_alist = Qnil;
7982
4ed46869 7983 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
48b0f3ae
PJ
7984 doc: /* List of coding-categories (symbols) ordered by priority.
7985
7986On detecting a coding system, Emacs tries code detection algorithms
7987associated with each coding-category one by one in this order. When
7988one algorithm agrees with a byte sequence of source text, the coding
7989system bound to the corresponding coding-category is selected. */);
4ed46869
KH
7990 {
7991 int i;
7992
7993 Vcoding_category_list = Qnil;
df7492f9 7994 for (i = coding_category_max - 1; i >= 0; i--)
4ed46869 7995 Vcoding_category_list
d46c5b12
KH
7996 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7997 Vcoding_category_list);
4ed46869
KH
7998 }
7999
8000 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
48b0f3ae
PJ
8001 doc: /* Specify the coding system for read operations.
8002It is useful to bind this variable with `let', but do not set it globally.
8003If the value is a coding system, it is used for decoding on read operation.
8004If not, an appropriate element is used from one of the coding system alists:
8005There are three such tables, `file-coding-system-alist',
8006`process-coding-system-alist', and `network-coding-system-alist'. */);
4ed46869
KH
8007 Vcoding_system_for_read = Qnil;
8008
8009 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
48b0f3ae
PJ
8010 doc: /* Specify the coding system for write operations.
8011Programs bind this variable with `let', but you should not set it globally.
8012If the value is a coding system, it is used for encoding of output,
8013when writing it to a file and when sending it to a file or subprocess.
8014
8015If this does not specify a coding system, an appropriate element
8016is used from one of the coding system alists:
8017There are three such tables, `file-coding-system-alist',
8018`process-coding-system-alist', and `network-coding-system-alist'.
8019For output to files, if the above procedure does not specify a coding system,
8020the value of `buffer-file-coding-system' is used. */);
4ed46869
KH
8021 Vcoding_system_for_write = Qnil;
8022
8023 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
df7492f9
KH
8024 doc: /*
8025Coding system used in the latest file or process I/O. */);
4ed46869
KH
8026 Vlast_coding_system_used = Qnil;
8027
9ce27fde 8028 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
df7492f9
KH
8029 doc: /*
8030*Non-nil means always inhibit code conversion of end-of-line format.
48b0f3ae
PJ
8031See info node `Coding Systems' and info node `Text and Binary' concerning
8032such conversion. */);
9ce27fde
KH
8033 inhibit_eol_conversion = 0;
8034
ed29121d 8035 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
df7492f9
KH
8036 doc: /*
8037Non-nil means process buffer inherits coding system of process output.
48b0f3ae
PJ
8038Bind it to t if the process output is to be treated as if it were a file
8039read from some filesystem. */);
ed29121d
EZ
8040 inherit_process_coding_system = 0;
8041
02ba4723 8042 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
df7492f9
KH
8043 doc: /*
8044Alist to decide a coding system to use for a file I/O operation.
48b0f3ae
PJ
8045The format is ((PATTERN . VAL) ...),
8046where PATTERN is a regular expression matching a file name,
8047VAL is a coding system, a cons of coding systems, or a function symbol.
8048If VAL is a coding system, it is used for both decoding and encoding
8049the file contents.
8050If VAL is a cons of coding systems, the car part is used for decoding,
8051and the cdr part is used for encoding.
8052If VAL is a function symbol, the function must return a coding system
0192762c
DL
8053or a cons of coding systems which are used as above. The function gets
8054the arguments with which `find-operation-coding-systems' was called.
48b0f3ae
PJ
8055
8056See also the function `find-operation-coding-system'
8057and the variable `auto-coding-alist'. */);
02ba4723
KH
8058 Vfile_coding_system_alist = Qnil;
8059
8060 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
df7492f9
KH
8061 doc: /*
8062Alist to decide a coding system to use for a process I/O operation.
48b0f3ae
PJ
8063The format is ((PATTERN . VAL) ...),
8064where PATTERN is a regular expression matching a program name,
8065VAL is a coding system, a cons of coding systems, or a function symbol.
8066If VAL is a coding system, it is used for both decoding what received
8067from the program and encoding what sent to the program.
8068If VAL is a cons of coding systems, the car part is used for decoding,
8069and the cdr part is used for encoding.
8070If VAL is a function symbol, the function must return a coding system
8071or a cons of coding systems which are used as above.
8072
8073See also the function `find-operation-coding-system'. */);
02ba4723
KH
8074 Vprocess_coding_system_alist = Qnil;
8075
8076 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
df7492f9
KH
8077 doc: /*
8078Alist to decide a coding system to use for a network I/O operation.
48b0f3ae
PJ
8079The format is ((PATTERN . VAL) ...),
8080where PATTERN is a regular expression matching a network service name
8081or is a port number to connect to,
8082VAL is a coding system, a cons of coding systems, or a function symbol.
8083If VAL is a coding system, it is used for both decoding what received
8084from the network stream and encoding what sent to the network stream.
8085If VAL is a cons of coding systems, the car part is used for decoding,
8086and the cdr part is used for encoding.
8087If VAL is a function symbol, the function must return a coding system
8088or a cons of coding systems which are used as above.
8089
8090See also the function `find-operation-coding-system'. */);
02ba4723 8091 Vnetwork_coding_system_alist = Qnil;
4ed46869 8092
68c45bf0 8093 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
75205970
RS
8094 doc: /* Coding system to use with system messages.
8095Also used for decoding keyboard input on X Window system. */);
68c45bf0
PE
8096 Vlocale_coding_system = Qnil;
8097
005f0d35 8098 /* The eol mnemonics are reset in startup.el system-dependently. */
7722baf9 8099 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
df7492f9
KH
8100 doc: /*
8101*String displayed in mode line for UNIX-like (LF) end-of-line format. */);
7722baf9 8102 eol_mnemonic_unix = build_string (":");
4ed46869 8103
7722baf9 8104 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
df7492f9
KH
8105 doc: /*
8106*String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
7722baf9 8107 eol_mnemonic_dos = build_string ("\\");
4ed46869 8108
7722baf9 8109 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
df7492f9
KH
8110 doc: /*
8111*String displayed in mode line for MAC-like (CR) end-of-line format. */);
7722baf9 8112 eol_mnemonic_mac = build_string ("/");
4ed46869 8113
7722baf9 8114 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
df7492f9
KH
8115 doc: /*
8116*String displayed in mode line when end-of-line format is not yet determined. */);
7722baf9 8117 eol_mnemonic_undecided = build_string (":");
4ed46869 8118
84fbb8a0 8119 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
df7492f9
KH
8120 doc: /*
8121*Non-nil enables character translation while encoding and decoding. */);
84fbb8a0 8122 Venable_character_translation = Qt;
bdd9fb48 8123
f967223b 8124 DEFVAR_LISP ("standard-translation-table-for-decode",
48b0f3ae
PJ
8125 &Vstandard_translation_table_for_decode,
8126 doc: /* Table for translating characters while decoding. */);
f967223b 8127 Vstandard_translation_table_for_decode = Qnil;
bdd9fb48 8128
f967223b 8129 DEFVAR_LISP ("standard-translation-table-for-encode",
48b0f3ae
PJ
8130 &Vstandard_translation_table_for_encode,
8131 doc: /* Table for translating characters while encoding. */);
f967223b 8132 Vstandard_translation_table_for_encode = Qnil;
4ed46869 8133
df7492f9 8134 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
48b0f3ae
PJ
8135 doc: /* Alist of charsets vs revision numbers.
8136While encoding, if a charset (car part of an element) is found,
df7492f9
KH
8137designate it with the escape sequence identifying revision (cdr part
8138of the element). */);
8139 Vcharset_revision_table = Qnil;
02ba4723
KH
8140
8141 DEFVAR_LISP ("default-process-coding-system",
8142 &Vdefault_process_coding_system,
48b0f3ae
PJ
8143 doc: /* Cons of coding systems used for process I/O by default.
8144The car part is used for decoding a process output,
8145the cdr part is used for encoding a text to be sent to a process. */);
02ba4723 8146 Vdefault_process_coding_system = Qnil;
c4825358 8147
3f003981 8148 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
df7492f9
KH
8149 doc: /*
8150Table of extra Latin codes in the range 128..159 (inclusive).
48b0f3ae
PJ
8151This is a vector of length 256.
8152If Nth element is non-nil, the existence of code N in a file
8153\(or output of subprocess) doesn't prevent it to be detected as
8154a coding system of ISO 2022 variant which has a flag
8155`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8156or reading output of a subprocess.
8157Only 128th through 159th elements has a meaning. */);
3f003981 8158 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
d46c5b12
KH
8159
8160 DEFVAR_LISP ("select-safe-coding-system-function",
8161 &Vselect_safe_coding_system_function,
df7492f9
KH
8162 doc: /*
8163Function to call to select safe coding system for encoding a text.
48b0f3ae
PJ
8164
8165If set, this function is called to force a user to select a proper
8166coding system which can encode the text in the case that a default
8167coding system used in each operation can't encode the text.
8168
8169The default value is `select-safe-coding-system' (which see). */);
d46c5b12
KH
8170 Vselect_safe_coding_system_function = Qnil;
8171
22ab2303 8172 DEFVAR_BOOL ("inhibit-iso-escape-detection",
74383408 8173 &inhibit_iso_escape_detection,
df7492f9
KH
8174 doc: /*
8175If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
48b0f3ae
PJ
8176
8177By default, on reading a file, Emacs tries to detect how the text is
8178encoded. This code detection is sensitive to escape sequences. If
8179the sequence is valid as ISO2022, the code is determined as one of
8180the ISO2022 encodings, and the file is decoded by the corresponding
8181coding system (e.g. `iso-2022-7bit').
8182
8183However, there may be a case that you want to read escape sequences in
8184a file as is. In such a case, you can set this variable to non-nil.
8185Then, as the code detection ignores any escape sequences, no file is
8186detected as encoded in some ISO2022 encoding. The result is that all
8187escape sequences become visible in a buffer.
8188
8189The default value is nil, and it is strongly recommended not to change
8190it. That is because many Emacs Lisp source files that contain
8191non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8192in Emacs's distribution, and they won't be decoded correctly on
8193reading if you suppress escape sequence detection.
8194
8195The other way to read escape sequences in a file without decoding is
8196to explicitly specify some coding system that doesn't use ISO2022's
8197escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
74383408 8198 inhibit_iso_escape_detection = 0;
2c78b7e1
KH
8199
8200 {
8201 Lisp_Object args[coding_arg_max];
8202 Lisp_Object plist[14];
8203 int i;
8204
8205 for (i = 0; i < coding_arg_max; i++)
8206 args[i] = Qnil;
8207
8208 plist[0] = intern (":name");
8209 plist[1] = args[coding_arg_name] = Qno_conversion;
8210 plist[2] = intern (":mnemonic");
8211 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
8212 plist[4] = intern (":coding-type");
8213 plist[5] = args[coding_arg_coding_type] = Qraw_text;
8214 plist[6] = intern (":ascii-compatible-p");
8215 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
8216 plist[8] = intern (":default-char");
8217 plist[9] = args[coding_arg_default_char] = make_number (0);
8218 plist[10] = intern (":docstring");
8219 plist[11] = build_string ("Do no conversion.\n\
8220\n\
8221When you visit a file with this coding, the file is read into a\n\
8222unibyte buffer as is, thus each byte of a file is treated as a\n\
8223character.");
8224 plist[12] = intern (":eol-type");
8225 plist[13] = args[coding_arg_eol_type] = Qunix;
8226 args[coding_arg_plist] = Flist (14, plist);
8227 Fdefine_coding_system_internal (coding_arg_max, args);
8228 }
8229
8230 setup_coding_system (Qno_conversion, &keyboard_coding);
8231 setup_coding_system (Qno_conversion, &terminal_coding);
8232 setup_coding_system (Qno_conversion, &safe_terminal_coding);
4ed46869
KH
8233}
8234
68c45bf0
PE
8235char *
8236emacs_strerror (error_number)
8237 int error_number;
8238{
8239 char *str;
8240
ca9c0567 8241 synchronize_system_messages_locale ();
68c45bf0
PE
8242 str = strerror (error_number);
8243
8244 if (! NILP (Vlocale_coding_system))
8245 {
8246 Lisp_Object dec = code_convert_string_norecord (build_string (str),
8247 Vlocale_coding_system,
8248 0);
8249 str = (char *) XSTRING (dec)->data;
8250 }
8251
8252 return str;
8253}
8254
4ed46869 8255#endif /* emacs */