(detect_coding_emacs_mule): Fix counting of encoded
[bpt/emacs.git] / src / coding.c
CommitLineData
9542cb1f 1/* Coding system handler (conversion, detection, etc).
4a2f9c6a 2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
8f924df7 3 Licensed to the Free Software Foundation.
6f197c07 4 Copyright (C) 2001, 2002 Free Software Foundation, Inc.
8f924df7 5 Copyright (C) 2003
df7492f9
KH
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
4ed46869 8
369314dc
KH
9This file is part of GNU Emacs.
10
11GNU Emacs is free software; you can redistribute it and/or modify
12it under the terms of the GNU General Public License as published by
13the Free Software Foundation; either version 2, or (at your option)
14any later version.
4ed46869 15
369314dc
KH
16GNU Emacs is distributed in the hope that it will be useful,
17but WITHOUT ANY WARRANTY; without even the implied warranty of
18MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19GNU General Public License for more details.
4ed46869 20
369314dc
KH
21You should have received a copy of the GNU General Public License
22along with GNU Emacs; see the file COPYING. If not, write to
23the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24Boston, MA 02111-1307, USA. */
4ed46869
KH
25
26/*** TABLE OF CONTENTS ***
27
b73bfc1c 28 0. General comments
4ed46869 29 1. Preamble
df7492f9
KH
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
4ed46869
KH
41
42*/
43
df7492f9 44/*** 0. General comments ***
b73bfc1c
KH
45
46
df7492f9 47CODING SYSTEM
4ed46869 48
5bad0796
DL
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
e19c3639
KH
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
0ef69138 56 coding system.
4ed46869 57
e19c3639
KH
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
5bad0796 60 stored in the hash table Vcoding_system_hash_table. The conversion from
e19c3639
KH
61 coding system symbol to attributes vector is done by looking up
62 Vcoding_system_hash_table by the symbol.
4ed46869 63
e19c3639 64 Coding systems are classified into the following types depending on
5bad0796 65 the encoding mechanism. Here's a brief description of the types.
4ed46869 66
df7492f9
KH
67 o UTF-8
68
69 o UTF-16
70
71 o Charset-base coding system
72
73 A coding system defined by one or more (coded) character sets.
5bad0796 74 Decoding and encoding are done by a code converter defined for each
df7492f9
KH
75 character set.
76
5bad0796 77 o Old Emacs internal format (emacs-mule)
df7492f9 78
5bad0796 79 The coding system adopted by old versions of Emacs (20 and 21).
4ed46869 80
df7492f9 81 o ISO2022-base coding system
4ed46869
KH
82
83 The most famous coding system for multiple character sets. X's
df7492f9
KH
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
4ed46869 87
df7492f9 88 o SJIS (or Shift-JIS or MS-Kanji-Code)
93dec019 89
4ed46869
KH
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
df7492f9 92 section 8.
4ed46869 93
df7492f9 94 o BIG5
4ed46869 95
df7492f9 96 A coding system to encode character sets: ASCII and Big5. Widely
cfb43547 97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
df7492f9
KH
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
4ed46869 101
df7492f9 102 o CCL
27901516 103
5bad0796 104 If a user wants to decode/encode text encoded in a coding system
df7492f9
KH
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
27901516 108
df7492f9 109 o Raw-text
4ed46869 110
5a936b46 111 A coding system for text containing raw eight-bit data. Emacs
5bad0796 112 treats each byte of source text as a character (except for
df7492f9 113 end-of-line conversion).
4ed46869 114
df7492f9
KH
115 o No-conversion
116
117 Like raw text, but don't do end-of-line conversion.
4ed46869 118
4ed46869 119
df7492f9 120END-OF-LINE FORMAT
4ed46869 121
5bad0796 122 How text end-of-line is encoded depends on operating system. For
df7492f9 123 instance, Unix's format is just one byte of LF (line-feed) code,
f4dee582 124 whereas DOS's format is two-byte sequence of `carriage-return' and
d46c5b12
KH
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
4ed46869 127
cfb43547 128 Since text character encoding and end-of-line encoding are
df7492f9
KH
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
4ed46869 131
e19c3639
KH
132STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
5bad0796 137 conversion (e.g. the location of source and destination data).
4ed46869
KH
138
139*/
140
df7492f9
KH
141/* COMMON MACROS */
142
143
4ed46869
KH
144/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145
df7492f9 146 These functions check if a byte sequence specified as a source in
ff0dacd7
KH
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
df7492f9 149
ff0dacd7 150 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
df7492f9
KH
151
152 Below is the template of these functions. */
153
4ed46869 154#if 0
df7492f9 155static int
ff0dacd7 156detect_coding_XXX (coding, detect_info)
df7492f9 157 struct coding_system *coding;
ff0dacd7 158 struct coding_detection_info *detect_info;
4ed46869 159{
df7492f9
KH
160 unsigned char *src = coding->source;
161 unsigned char *src_end = coding->source + coding->src_bytes;
162 int multibytep = coding->src_multibyte;
ff0dacd7 163 int consumed_chars = 0;
df7492f9
KH
164 int found = 0;
165 ...;
166
167 while (1)
168 {
169 /* Get one byte from the source. If the souce is exausted, jump
170 to no_more_source:. */
171 ONE_MORE_BYTE (c);
ff0dacd7
KH
172
173 if (! __C_conforms_to_XXX___ (c))
174 break;
175 if (! __C_strongly_suggests_XXX__ (c))
176 found = CATEGORY_MASK_XXX;
df7492f9 177 }
ff0dacd7
KH
178 /* The byte sequence is invalid for XXX. */
179 detect_info->rejected |= CATEGORY_MASK_XXX;
df7492f9 180 return 0;
ff0dacd7 181
df7492f9 182 no_more_source:
ff0dacd7
KH
183 /* The source exausted successfully. */
184 detect_info->found |= found;
df7492f9 185 return 1;
4ed46869
KH
186}
187#endif
188
189/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
190
df7492f9
KH
191 These functions decode a byte sequence specified as a source by
192 CODING. The resulting multibyte text goes to a place pointed to by
193 CODING->charbuf, the length of which should not exceed
194 CODING->charbuf_size;
d46c5b12 195
df7492f9
KH
196 These functions set the information of original and decoded texts in
197 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
198 They also set CODING->result to one of CODING_RESULT_XXX indicating
199 how the decoding is finished.
d46c5b12 200
df7492f9 201 Below is the template of these functions. */
d46c5b12 202
4ed46869 203#if 0
b73bfc1c 204static void
df7492f9 205decode_coding_XXXX (coding)
4ed46869 206 struct coding_system *coding;
4ed46869 207{
df7492f9
KH
208 unsigned char *src = coding->source + coding->consumed;
209 unsigned char *src_end = coding->source + coding->src_bytes;
210 /* SRC_BASE remembers the start position in source in each loop.
211 The loop will be exited when there's not enough source code, or
212 when there's no room in CHARBUF for a decoded character. */
213 unsigned char *src_base;
214 /* A buffer to produce decoded characters. */
215 int *charbuf = coding->charbuf;
216 int *charbuf_end = charbuf + coding->charbuf_size;
217 int multibytep = coding->src_multibyte;
218
219 while (1)
220 {
221 src_base = src;
222 if (charbuf < charbuf_end)
223 /* No more room to produce a decoded character. */
224 break;
225 ONE_MORE_BYTE (c);
226 /* Decode it. */
227 }
228
229 no_more_source:
230 if (src_base < src_end
231 && coding->mode & CODING_MODE_LAST_BLOCK)
232 /* If the source ends by partial bytes to construct a character,
233 treat them as eight-bit raw data. */
234 while (src_base < src_end && charbuf < charbuf_end)
235 *charbuf++ = *src_base++;
236 /* Remember how many bytes and characters we consumed. If the
237 source is multibyte, the bytes and chars are not identical. */
238 coding->consumed = coding->consumed_char = src_base - coding->source;
239 /* Remember how many characters we produced. */
240 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
241}
242#endif
243
244/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
245
df7492f9
KH
246 These functions encode SRC_BYTES length text at SOURCE of Emacs'
247 internal multibyte format by CODING. The resulting byte sequence
b73bfc1c
KH
248 goes to a place pointed to by DESTINATION, the length of which
249 should not exceed DST_BYTES.
d46c5b12 250
df7492f9
KH
251 These functions set the information of original and encoded texts in
252 the members produced, produced_char, consumed, and consumed_char of
253 the structure *CODING. They also set the member result to one of
254 CODING_RESULT_XXX indicating how the encoding finished.
d46c5b12 255
df7492f9
KH
256 DST_BYTES zero means that source area and destination area are
257 overlapped, which means that we can produce an encoded text until it
258 reaches the head of the not-yet-encoded source text.
d46c5b12 259
df7492f9 260 Below is a template of these functions. */
4ed46869 261#if 0
b73bfc1c 262static void
df7492f9 263encode_coding_XXX (coding)
4ed46869 264 struct coding_system *coding;
4ed46869 265{
df7492f9
KH
266 int multibytep = coding->dst_multibyte;
267 int *charbuf = coding->charbuf;
268 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
269 unsigned char *dst = coding->destination + coding->produced;
270 unsigned char *dst_end = coding->destination + coding->dst_bytes;
271 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
272 int produced_chars = 0;
273
274 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
275 {
276 int c = *charbuf;
277 /* Encode C into DST, and increment DST. */
278 }
279 label_no_more_destination:
280 /* How many chars and bytes we produced. */
281 coding->produced_char += produced_chars;
282 coding->produced = dst - coding->destination;
4ed46869
KH
283}
284#endif
285
4ed46869
KH
286\f
287/*** 1. Preamble ***/
288
68c45bf0 289#include <config.h>
4ed46869
KH
290#include <stdio.h>
291
4ed46869
KH
292#include "lisp.h"
293#include "buffer.h"
df7492f9 294#include "character.h"
4ed46869
KH
295#include "charset.h"
296#include "ccl.h"
df7492f9 297#include "composite.h"
4ed46869
KH
298#include "coding.h"
299#include "window.h"
4ed46869 300
df7492f9 301Lisp_Object Vcoding_system_hash_table;
4ed46869 302
df7492f9 303Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
1965cb73
DL
304Lisp_Object Qunix, Qdos;
305extern Lisp_Object Qmac; /* frame.c */
4ed46869
KH
306Lisp_Object Qbuffer_file_coding_system;
307Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
df7492f9 308Lisp_Object Qdefault_char;
27901516 309Lisp_Object Qno_conversion, Qundecided;
df7492f9 310Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
b49a1807 311Lisp_Object Qbig, Qlittle;
bb0115a2 312Lisp_Object Qcoding_system_history;
1397dc18 313Lisp_Object Qvalid_codes;
a6f87d34
KH
314Lisp_Object QCcategory, QCmnemonic, QCdefalut_char;
315Lisp_Object QCdecode_translation_table, QCencode_translation_table;
316Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
4ed46869
KH
317
318extern Lisp_Object Qinsert_file_contents, Qwrite_region;
319Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
320Lisp_Object Qstart_process, Qopen_network_stream;
321Lisp_Object Qtarget_idx;
322
065e3595
KH
323Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
324Lisp_Object Qinterrupted, Qinsufficient_memory;
325
5d5bf4d8
KH
326int coding_system_require_warning;
327
d46c5b12
KH
328Lisp_Object Vselect_safe_coding_system_function;
329
7722baf9
EZ
330/* Mnemonic string for each format of end-of-line. */
331Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
332/* Mnemonic string to indicate format of end-of-line is not yet
4ed46869 333 decided. */
7722baf9 334Lisp_Object eol_mnemonic_undecided;
4ed46869
KH
335
336#ifdef emacs
337
4608c386
KH
338Lisp_Object Vcoding_system_list, Vcoding_system_alist;
339
340Lisp_Object Qcoding_system_p, Qcoding_system_error;
4ed46869 341
d46c5b12
KH
342/* Coding system emacs-mule and raw-text are for converting only
343 end-of-line format. */
344Lisp_Object Qemacs_mule, Qraw_text;
8f924df7 345Lisp_Object Qutf_8_emacs;
ecf488bc 346
4ed46869
KH
347/* Coding-systems are handed between Emacs Lisp programs and C internal
348 routines by the following three variables. */
349/* Coding-system for reading files and receiving data from process. */
350Lisp_Object Vcoding_system_for_read;
351/* Coding-system for writing files and sending data to process. */
352Lisp_Object Vcoding_system_for_write;
353/* Coding-system actually used in the latest I/O. */
354Lisp_Object Vlast_coding_system_used;
065e3595
KH
355/* Set to non-nil when an error is detected during code conversion. */
356Lisp_Object Vlast_code_conversion_error;
c4825358 357/* A vector of length 256 which contains information about special
94487c4e 358 Latin codes (especially for dealing with Microsoft codes). */
3f003981 359Lisp_Object Vlatin_extra_code_table;
c4825358 360
9ce27fde
KH
361/* Flag to inhibit code conversion of end-of-line format. */
362int inhibit_eol_conversion;
363
74383408
KH
364/* Flag to inhibit ISO2022 escape sequence detection. */
365int inhibit_iso_escape_detection;
366
ed29121d
EZ
367/* Flag to make buffer-file-coding-system inherit from process-coding. */
368int inherit_process_coding_system;
369
c4825358 370/* Coding system to be used to encode text for terminal display. */
4ed46869
KH
371struct coding_system terminal_coding;
372
c4825358
KH
373/* Coding system to be used to encode text for terminal display when
374 terminal coding system is nil. */
375struct coding_system safe_terminal_coding;
376
377/* Coding system of what is sent from terminal keyboard. */
4ed46869
KH
378struct coding_system keyboard_coding;
379
02ba4723
KH
380Lisp_Object Vfile_coding_system_alist;
381Lisp_Object Vprocess_coding_system_alist;
382Lisp_Object Vnetwork_coding_system_alist;
4ed46869 383
68c45bf0
PE
384Lisp_Object Vlocale_coding_system;
385
4ed46869
KH
386#endif /* emacs */
387
f967223b
KH
388/* Flag to tell if we look up translation table on character code
389 conversion. */
84fbb8a0 390Lisp_Object Venable_character_translation;
f967223b
KH
391/* Standard translation table to look up on decoding (reading). */
392Lisp_Object Vstandard_translation_table_for_decode;
393/* Standard translation table to look up on encoding (writing). */
394Lisp_Object Vstandard_translation_table_for_encode;
84fbb8a0 395
f967223b
KH
396Lisp_Object Qtranslation_table;
397Lisp_Object Qtranslation_table_id;
398Lisp_Object Qtranslation_table_for_decode;
399Lisp_Object Qtranslation_table_for_encode;
4ed46869
KH
400
401/* Alist of charsets vs revision number. */
df7492f9 402static Lisp_Object Vcharset_revision_table;
4ed46869 403
02ba4723
KH
404/* Default coding systems used for process I/O. */
405Lisp_Object Vdefault_process_coding_system;
406
002fdb44
DL
407/* Char table for translating Quail and self-inserting input. */
408Lisp_Object Vtranslation_table_for_input;
409
df7492f9
KH
410/* Two special coding systems. */
411Lisp_Object Vsjis_coding_system;
412Lisp_Object Vbig5_coding_system;
413
065e3595
KH
414static void record_conversion_result (struct coding_system *coding,
415 enum coding_result_code result);
ff0dacd7
KH
416static int detect_coding_utf_8 P_ ((struct coding_system *,
417 struct coding_detection_info *info));
df7492f9
KH
418static void decode_coding_utf_8 P_ ((struct coding_system *));
419static int encode_coding_utf_8 P_ ((struct coding_system *));
420
ff0dacd7
KH
421static int detect_coding_utf_16 P_ ((struct coding_system *,
422 struct coding_detection_info *info));
df7492f9
KH
423static void decode_coding_utf_16 P_ ((struct coding_system *));
424static int encode_coding_utf_16 P_ ((struct coding_system *));
425
ff0dacd7
KH
426static int detect_coding_iso_2022 P_ ((struct coding_system *,
427 struct coding_detection_info *info));
df7492f9
KH
428static void decode_coding_iso_2022 P_ ((struct coding_system *));
429static int encode_coding_iso_2022 P_ ((struct coding_system *));
430
ff0dacd7
KH
431static int detect_coding_emacs_mule P_ ((struct coding_system *,
432 struct coding_detection_info *info));
df7492f9
KH
433static void decode_coding_emacs_mule P_ ((struct coding_system *));
434static int encode_coding_emacs_mule P_ ((struct coding_system *));
435
ff0dacd7
KH
436static int detect_coding_sjis P_ ((struct coding_system *,
437 struct coding_detection_info *info));
df7492f9
KH
438static void decode_coding_sjis P_ ((struct coding_system *));
439static int encode_coding_sjis P_ ((struct coding_system *));
440
ff0dacd7
KH
441static int detect_coding_big5 P_ ((struct coding_system *,
442 struct coding_detection_info *info));
df7492f9
KH
443static void decode_coding_big5 P_ ((struct coding_system *));
444static int encode_coding_big5 P_ ((struct coding_system *));
445
ff0dacd7
KH
446static int detect_coding_ccl P_ ((struct coding_system *,
447 struct coding_detection_info *info));
df7492f9
KH
448static void decode_coding_ccl P_ ((struct coding_system *));
449static int encode_coding_ccl P_ ((struct coding_system *));
450
451static void decode_coding_raw_text P_ ((struct coding_system *));
452static int encode_coding_raw_text P_ ((struct coding_system *));
453
454
455/* ISO2022 section */
456
457#define CODING_ISO_INITIAL(coding, reg) \
458 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
459 coding_attr_iso_initial), \
460 reg)))
461
462
/* Return the entry for CHARSET_ID in CODING's safe_charsets table,
   or -1 if CHARSET_ID is greater than CODING's max_charset_id.
   CHARSET_ID is evaluated more than once, so it must not have side
   effects.  */
#define CODING_ISO_REQUEST(coding, charset_id)          \
  (((charset_id) <= (coding)->max_charset_id            \
    ? (coding)->safe_charsets[charset_id]               \
    : -1))
467
468
469#define CODING_ISO_FLAGS(coding) \
470 ((coding)->spec.iso_2022.flags)
471#define CODING_ISO_DESIGNATION(coding, reg) \
472 ((coding)->spec.iso_2022.current_designation[reg])
473#define CODING_ISO_INVOCATION(coding, plane) \
474 ((coding)->spec.iso_2022.current_invocation[plane])
475#define CODING_ISO_SINGLE_SHIFTING(coding) \
476 ((coding)->spec.iso_2022.single_shifting)
477#define CODING_ISO_BOL(coding) \
478 ((coding)->spec.iso_2022.bol)
479#define CODING_ISO_INVOKED_CHARSET(coding, plane) \
480 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
481
482/* Control characters of ISO2022. */
483 /* code */ /* function */
484#define ISO_CODE_LF 0x0A /* line-feed */
485#define ISO_CODE_CR 0x0D /* carriage-return */
486#define ISO_CODE_SO 0x0E /* shift-out */
487#define ISO_CODE_SI 0x0F /* shift-in */
488#define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
489#define ISO_CODE_ESC 0x1B /* escape */
490#define ISO_CODE_SS2 0x8E /* single-shift-2 */
491#define ISO_CODE_SS3 0x8F /* single-shift-3 */
492#define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
493
494/* All code (1-byte) of ISO2022 is classified into one of the
495 followings. */
496enum iso_code_class_type
497 {
498 ISO_control_0, /* Control codes in the range
499 0x00..0x1F and 0x7F, except for the
500 following 4 codes. */
df7492f9
KH
501 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
502 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
503 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
504 ISO_escape, /* ISO_CODE_ESC (0x1B) */
505 ISO_control_1, /* Control codes in the range
506 0x80..0x9F, except for the
507 following 3 codes. */
508 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
509 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
510 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
511 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
512 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
513 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
514 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
515 };
05e6f5dc 516
df7492f9
KH
517/** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
518 `iso-flags' attribute of an iso2022 coding system. */
05e6f5dc 519
df7492f9
KH
520/* If set, produce long-form designation sequence (e.g. ESC $ ( A)
521 instead of the correct short-form sequence (e.g. ESC $ A). */
522#define CODING_ISO_FLAG_LONG_FORM 0x0001
93dec019 523
df7492f9
KH
524/* If set, reset graphic planes and registers at end-of-line to the
525 initial state. */
526#define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
05e6f5dc 527
df7492f9
KH
528/* If set, reset graphic planes and registers before any control
529 characters to the initial state. */
530#define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
05e6f5dc 531
df7492f9
KH
532/* If set, encode by 7-bit environment. */
533#define CODING_ISO_FLAG_SEVEN_BITS 0x0008
4ed46869 534
df7492f9
KH
535/* If set, use locking-shift function. */
536#define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
b73bfc1c 537
df7492f9
KH
538/* If set, use single-shift function. Overwrite
539 CODING_ISO_FLAG_LOCKING_SHIFT. */
540#define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
b73bfc1c 541
df7492f9
KH
542/* If set, use designation escape sequence. */
543#define CODING_ISO_FLAG_DESIGNATION 0x0040
b73bfc1c 544
df7492f9
KH
545/* If set, produce revision number sequence. */
546#define CODING_ISO_FLAG_REVISION 0x0080
b73bfc1c 547
df7492f9
KH
548/* If set, produce ISO6429's direction specifying sequence. */
549#define CODING_ISO_FLAG_DIRECTION 0x0100
f4dee582 550
df7492f9
KH
551/* If set, assume designation states are reset at beginning of line on
552 output. */
553#define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
4ed46869 554
df7492f9
KH
555/* If set, designation sequence should be placed at beginning of line
556 on output. */
557#define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
aa72b389 558
df7492f9
KH
559/* If set, do not encode unsafe characters on output. */
560#define CODING_ISO_FLAG_SAFE 0x0800
aa72b389 561
df7492f9
KH
562/* If set, extra latin codes (128..159) are accepted as a valid code
563 on input. */
564#define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
aa72b389 565
df7492f9 566#define CODING_ISO_FLAG_COMPOSITION 0x2000
aa72b389 567
df7492f9 568#define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
aa72b389 569
bf16eb23 570#define CODING_ISO_FLAG_USE_ROMAN 0x8000
aa72b389 571
bf16eb23 572#define CODING_ISO_FLAG_USE_OLDJIS 0x10000
aa72b389 573
bf16eb23 574#define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
aa72b389 575
df7492f9
KH
576/* A character to be produced on output if encoding of the original
577 character is prohibited by CODING_ISO_FLAG_SAFE. */
578#define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
aa72b389 579
4ed46869 580
df7492f9
KH
581/* UTF-16 section */
582#define CODING_UTF_16_BOM(coding) \
583 ((coding)->spec.utf_16.bom)
4ed46869 584
df7492f9
KH
585#define CODING_UTF_16_ENDIAN(coding) \
586 ((coding)->spec.utf_16.endian)
4ed46869 587
df7492f9
KH
588#define CODING_UTF_16_SURROGATE(coding) \
589 ((coding)->spec.utf_16.surrogate)
4ed46869 590
4ed46869 591
df7492f9
KH
592/* CCL section */
593#define CODING_CCL_DECODER(coding) \
594 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
595#define CODING_CCL_ENCODER(coding) \
596 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
597#define CODING_CCL_VALIDS(coding) \
8f924df7 598 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
4ed46869 599
5a936b46 600/* Index for each coding category in `coding_categories' */
4ed46869 601
df7492f9
KH
602enum coding_category
603 {
604 coding_category_iso_7,
605 coding_category_iso_7_tight,
606 coding_category_iso_8_1,
607 coding_category_iso_8_2,
608 coding_category_iso_7_else,
609 coding_category_iso_8_else,
610 coding_category_utf_8,
611 coding_category_utf_16_auto,
612 coding_category_utf_16_be,
613 coding_category_utf_16_le,
614 coding_category_utf_16_be_nosig,
615 coding_category_utf_16_le_nosig,
616 coding_category_charset,
617 coding_category_sjis,
618 coding_category_big5,
619 coding_category_ccl,
620 coding_category_emacs_mule,
621 /* All above are targets of code detection. */
622 coding_category_raw_text,
623 coding_category_undecided,
624 coding_category_max
625 };
626
627/* Definitions of flag bits used in detect_coding_XXXX. */
628#define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
629#define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
630#define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
631#define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
632#define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
633#define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
634#define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
b49a1807 635#define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
df7492f9
KH
636#define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
637#define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
638#define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
639#define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
640#define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
641#define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
642#define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
643#define CATEGORY_MASK_CCL (1 << coding_category_ccl)
644#define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
ff0dacd7 645#define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
df7492f9
KH
646
647/* This value is returned if detect_coding_mask () finds nothing other
648 than ASCII characters. */
649#define CATEGORY_MASK_ANY \
650 (CATEGORY_MASK_ISO_7 \
651 | CATEGORY_MASK_ISO_7_TIGHT \
652 | CATEGORY_MASK_ISO_8_1 \
653 | CATEGORY_MASK_ISO_8_2 \
654 | CATEGORY_MASK_ISO_7_ELSE \
655 | CATEGORY_MASK_ISO_8_ELSE \
656 | CATEGORY_MASK_UTF_8 \
657 | CATEGORY_MASK_UTF_16_BE \
658 | CATEGORY_MASK_UTF_16_LE \
659 | CATEGORY_MASK_UTF_16_BE_NOSIG \
660 | CATEGORY_MASK_UTF_16_LE_NOSIG \
661 | CATEGORY_MASK_CHARSET \
662 | CATEGORY_MASK_SJIS \
663 | CATEGORY_MASK_BIG5 \
664 | CATEGORY_MASK_CCL \
665 | CATEGORY_MASK_EMACS_MULE)
666
667
668#define CATEGORY_MASK_ISO_7BIT \
669 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
670
671#define CATEGORY_MASK_ISO_8BIT \
672 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
673
674#define CATEGORY_MASK_ISO_ELSE \
675 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
676
677#define CATEGORY_MASK_ISO_ESCAPE \
678 (CATEGORY_MASK_ISO_7 \
679 | CATEGORY_MASK_ISO_7_TIGHT \
680 | CATEGORY_MASK_ISO_7_ELSE \
681 | CATEGORY_MASK_ISO_8_ELSE)
682
683#define CATEGORY_MASK_ISO \
684 ( CATEGORY_MASK_ISO_7BIT \
685 | CATEGORY_MASK_ISO_8BIT \
686 | CATEGORY_MASK_ISO_ELSE)
687
688#define CATEGORY_MASK_UTF_16 \
689 (CATEGORY_MASK_UTF_16_BE \
690 | CATEGORY_MASK_UTF_16_LE \
691 | CATEGORY_MASK_UTF_16_BE_NOSIG \
692 | CATEGORY_MASK_UTF_16_LE_NOSIG)
693
694
695/* List of symbols `coding-category-xxx' ordered by priority. This
696 variable is exposed to Emacs Lisp. */
697static Lisp_Object Vcoding_category_list;
698
699/* Table of coding categories (Lisp symbols). This variable is for
700 internal use only. */
701static Lisp_Object Vcoding_category_table;
702
703/* Table of coding-categories ordered by priority. */
704static enum coding_category coding_priorities[coding_category_max];
705
706/* Nth element is a coding context for the coding system bound to the
707 Nth coding category. */
708static struct coding_system coding_categories[coding_category_max];
709
df7492f9
KH
710/*** Commonly used macros and functions ***/
711
712#ifndef min
713#define min(a, b) ((a) < (b) ? (a) : (b))
714#endif
715#ifndef max
716#define max(a, b) ((a) > (b) ? (a) : (b))
717#endif
4ed46869 718
24a73b0a
KH
719#define CODING_GET_INFO(coding, attrs, charset_list) \
720 do { \
721 (attrs) = CODING_ID_ATTRS ((coding)->id); \
722 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
df7492f9 723 } while (0)
4ed46869 724
4ed46869 725
df7492f9
KH
726/* Safely get one byte from the source text pointed by SRC which ends
727 at SRC_END, and set C to that byte. If there are not enough bytes
065e3595
KH
728 in the source, it jumps to `no_more_source'. If multibytep is
729 nonzero, and a multibyte character is found at SRC, set C to the
730 negative value of the character code. The caller should declare
731 and set these variables appropriately in advance:
732 src, src_end, multibytep */
aa72b389 733
065e3595
KH
734#define ONE_MORE_BYTE(c) \
735 do { \
736 if (src == src_end) \
737 { \
738 if (src_base < src) \
739 record_conversion_result \
740 (coding, CODING_RESULT_INSUFFICIENT_SRC); \
741 goto no_more_source; \
742 } \
743 c = *src++; \
744 if (multibytep && (c & 0x80)) \
745 { \
746 if ((c & 0xFE) == 0xC0) \
747 c = ((c & 1) << 6) | *src++; \
748 else \
749 { \
750 c = - string_char (--src, &src, NULL); \
751 record_conversion_result \
752 (coding, CODING_RESULT_INVALID_SRC); \
753 } \
754 } \
755 consumed_chars++; \
aa72b389
KH
756 } while (0)
757
aa72b389 758
065e3595
KH
/* Like ONE_MORE_BYTE, but without the `src == src_end' test: it never
   jumps to `no_more_source', so the caller is responsible for making
   sure the source bytes it reads are available.  As in ONE_MORE_BYTE,
   when multibytep is nonzero a lead byte of 0xC0 or 0xC1 is combined
   with the next byte into a single value, any other byte with the
   high bit set makes C the negated result of string_char and records
   CODING_RESULT_INVALID_SRC, and consumed_chars is incremented once.
   The caller should declare and set these variables in advance:
        src, multibytep, consumed_chars, coding  */
759#define ONE_MORE_BYTE_NO_CHECK(c) \
760 do { \
761 c = *src++; \
762 if (multibytep && (c & 0x80)) \
763 { \
764 if ((c & 0xFE) == 0xC0) \
765 c = ((c & 1) << 6) | *src++; \
766 else \
767 { \
768 c = - string_char (--src, &src, NULL); \
769 record_conversion_result \
770 (coding, CODING_RESULT_INVALID_SRC); \
771 } \
772 } \
773 consumed_chars++; \
aa72b389
KH
774 } while (0)
775
aa72b389 776
df7492f9
KH
777/* Store a byte C in the place pointed by DST and increment DST to the
778 next free point, and increment PRODUCED_CHARS. The caller should
779 assure that C is 0..127, and declare and set the variable `dst'
780 appropriately in advance.
781*/
aa72b389
KH
782
783
df7492f9
KH
784#define EMIT_ONE_ASCII_BYTE(c) \
785 do { \
786 produced_chars++; \
787 *dst++ = (c); \
b6871cc7 788 } while (0)
aa72b389
KH
789
790
df7492f9 791/* Like EMIT_ONE_ASCII_BYTE, but store two bytes: C1 and C2. */
aa72b389 792
df7492f9
KH
793#define EMIT_TWO_ASCII_BYTES(c1, c2) \
794 do { \
795 produced_chars += 2; \
796 *dst++ = (c1), *dst++ = (c2); \
797 } while (0)
aa72b389
KH
798
799
df7492f9
KH
800/* Store a byte C in the place pointed by DST and increment DST to the
801 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
802 nonzero, store in an appropriate multibyte form. The caller should
803 declare and set the variables `dst' and `multibytep' appropriately
804 in advance. */
805
806#define EMIT_ONE_BYTE(c) \
807 do { \
808 produced_chars++; \
809 if (multibytep) \
810 { \
811 int ch = (c); \
812 if (ch >= 0x80) \
813 ch = BYTE8_TO_CHAR (ch); \
814 CHAR_STRING_ADVANCE (ch, dst); \
815 } \
816 else \
817 *dst++ = (c); \
aa72b389 818 } while (0)
aa72b389 819
aa72b389 820
/* Like EMIT_ONE_BYTE, but emit two bytes, C1 and then C2.  Each byte
   is handled exactly as EMIT_ONE_BYTE handles a single byte, so
   PRODUCED_CHARS grows by two in total.  */

#define EMIT_TWO_BYTES(c1, c2)  \
  do {                          \
    EMIT_ONE_BYTE (c1);         \
    EMIT_ONE_BYTE (c2);         \
  } while (0)
845
846
df7492f9
KH
/* Like EMIT_ONE_BYTE, but emit three bytes: C1, C2 and C3, in that
   order.  */

#define EMIT_THREE_BYTES(c1, c2, c3)    \
  do {                                  \
    EMIT_TWO_BYTES (c1, c2);            \
    EMIT_ONE_BYTE (c3);                 \
  } while (0)
aa72b389 852
aa72b389 853
df7492f9
KH
/* Like EMIT_ONE_BYTE, but emit four bytes: C1, C2, C3 and C4, in that
   order.  */

#define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
  do {                                  \
    EMIT_ONE_BYTE (c1);                 \
    EMIT_ONE_BYTE (c2);                 \
    EMIT_ONE_BYTE (c3);                 \
    EMIT_ONE_BYTE (c4);                 \
  } while (0)
aa72b389 859
aa72b389 860
065e3595
KH
861static void
862record_conversion_result (struct coding_system *coding,
863 enum coding_result_code result)
864{
865 coding->result = result;
866 switch (result)
867 {
868 case CODING_RESULT_INSUFFICIENT_SRC:
869 Vlast_code_conversion_error = Qinsufficient_source;
870 break;
871 case CODING_RESULT_INCONSISTENT_EOL:
872 Vlast_code_conversion_error = Qinconsistent_eol;
873 break;
874 case CODING_RESULT_INVALID_SRC:
875 Vlast_code_conversion_error = Qinvalid_source;
876 break;
877 case CODING_RESULT_INTERRUPT:
878 Vlast_code_conversion_error = Qinterrupted;
879 break;
880 case CODING_RESULT_INSUFFICIENT_MEM:
881 Vlast_code_conversion_error = Qinsufficient_memory;
882 break;
883 }
884}
885
df7492f9
KH
/* Set C to the character that CODE designates in CHARSET (via
   DECODE_CHAR).

   charset_map_loaded is cleared before the call; if it is nonzero
   afterwards, CODING->source is recomputed with coding_set_source and
   the pointers SRC, SRC_BASE and SRC_END are all shifted by the
   distance CODING->source moved.  (Presumably loading a charset map
   can relocate the source text -- confirm against DECODE_CHAR.)  */

#define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
  do {                                                                  \
    charset_map_loaded = 0;                                             \
    c = DECODE_CHAR (charset, code);                                    \
    if (charset_map_loaded != 0)                                        \
      {                                                                 \
        const unsigned char *orig = coding->source;                     \
        EMACS_INT offset;                                               \
                                                                        \
        coding_set_source (coding);                                     \
        offset = coding->source - orig;                                 \
        src_end += offset;                                              \
        src_base += offset;                                             \
        src += offset;                                                  \
      }                                                                 \
  } while (0)
902
903
df7492f9
KH
/* Make sure that BYTES more bytes can be stored at DST.

   If DST + BYTES reaches or passes DST_END, the destination is
   enlarged through alloc_destination by BYTES plus the number of
   elements still pending between CHARBUF and CHARBUF_END; DST and
   DST_END are then refreshed, since the destination may have been
   relocated.

   The caller must have `dst', `dst_end', `charbuf', `charbuf_end' and
   `coding' in scope.  */

#define ASSURE_DESTINATION(bytes)                               \
  do {                                                          \
    if (! (dst + (bytes) < dst_end))                            \
      {                                                         \
        int more_bytes = charbuf_end - charbuf + (bytes);       \
                                                                \
        dst = alloc_destination (coding, more_bytes, dst);      \
        dst_end = coding->destination + coding->dst_bytes;      \
      }                                                         \
  } while (0)
aa72b389 914
aa72b389 915
aa72b389 916
df7492f9
KH
917static void
918coding_set_source (coding)
aa72b389 919 struct coding_system *coding;
aa72b389 920{
df7492f9
KH
921 if (BUFFERP (coding->src_object))
922 {
2cb26057 923 struct buffer *buf = XBUFFER (coding->src_object);
aa72b389 924
df7492f9 925 if (coding->src_pos < 0)
2cb26057 926 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
df7492f9 927 else
2cb26057 928 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
aa72b389 929 }
df7492f9 930 else if (STRINGP (coding->src_object))
aa72b389 931 {
8f924df7 932 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
aa72b389 933 }
df7492f9
KH
934 else
935 /* Otherwise, the source is C string and is never relocated
936 automatically. Thus we don't have to update anything. */
937 ;
938}
aa72b389 939
df7492f9
KH
940static void
941coding_set_destination (coding)
942 struct coding_system *coding;
943{
944 if (BUFFERP (coding->dst_object))
aa72b389 945 {
df7492f9 946 if (coding->src_pos < 0)
aa72b389 947 {
28f67a95
KH
948 coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
949 coding->dst_bytes = (GAP_END_ADDR
950 - (coding->src_bytes - coding->consumed)
951 - coding->destination);
aa72b389 952 }
df7492f9 953 else
28f67a95
KH
954 {
955 /* We are sure that coding->dst_pos_byte is before the gap
956 of the buffer. */
957 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
958 + coding->dst_pos_byte - 1);
959 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
960 - coding->destination);
961 }
df7492f9
KH
962 }
963 else
964 /* Otherwise, the destination is C string and is never relocated
965 automatically. Thus we don't have to update anything. */
966 ;
967}
968
969
970static void
971coding_alloc_by_realloc (coding, bytes)
972 struct coding_system *coding;
973 EMACS_INT bytes;
974{
975 coding->destination = (unsigned char *) xrealloc (coding->destination,
976 coding->dst_bytes + bytes);
977 coding->dst_bytes += bytes;
978}
979
980static void
981coding_alloc_by_making_gap (coding, bytes)
982 struct coding_system *coding;
983 EMACS_INT bytes;
984{
2c78b7e1
KH
985 if (BUFFERP (coding->dst_object)
986 && EQ (coding->src_object, coding->dst_object))
df7492f9
KH
987 {
988 EMACS_INT add = coding->src_bytes - coding->consumed;
989
990 GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
991 make_gap (bytes);
992 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
993 }
994 else
995 {
2c78b7e1
KH
996 Lisp_Object this_buffer;
997
998 this_buffer = Fcurrent_buffer ();
df7492f9
KH
999 set_buffer_internal (XBUFFER (coding->dst_object));
1000 make_gap (bytes);
1001 set_buffer_internal (XBUFFER (this_buffer));
aa72b389 1002 }
df7492f9 1003}
8f924df7 1004
df7492f9
KH
1005
1006static unsigned char *
1007alloc_destination (coding, nbytes, dst)
1008 struct coding_system *coding;
3e139625 1009 EMACS_INT nbytes;
df7492f9
KH
1010 unsigned char *dst;
1011{
1012 EMACS_INT offset = dst - coding->destination;
1013
1014 if (BUFFERP (coding->dst_object))
1015 coding_alloc_by_making_gap (coding, nbytes);
aa72b389 1016 else
df7492f9 1017 coding_alloc_by_realloc (coding, nbytes);
065e3595 1018 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
1019 coding_set_destination (coding);
1020 dst = coding->destination + offset;
1021 return dst;
1022}
aa72b389 1023
ff0dacd7
KH
1024/** Macros for annotations. */
1025
/* Maximum length of annotation data (sum of annotations for
   composition and charset).  The first and last terms are the
   5-element lengths passed by ADD_COMPOSITION_DATA and
   ADD_CHARSET_DATA; the middle term is the room for composition
   components.  */
#define MAX_ANNOTATION_LENGTH \
  ((4 + 1) + ((MAX_COMPOSITION_COMPONENTS * 2) - 1) + (4 + 1))
1029
1030/* An annotation data is stored in the array coding->charbuf in this
1031 format:
1032 [ -LENGTH ANNOTATION_MASK FROM TO ... ]
1033 LENGTH is the number of elements in the annotation.
1034 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1035 FROM and TO specify the range of text annotated. They are relative
1036 to coding->src_pos (on encoding) or coding->dst_pos (on decoding).
1037
1038 The format of the following elements depend on ANNOTATION_MASK.
1039
1040 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1041 follows:
1042 ... METHOD [ COMPOSITION-COMPONENTS ... ]
1043 METHOD is one of enum composition_method.
1044 Optional COMPOSITION-COMPONENTS are characters and composition
1045 rules.
1046
1047 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1048 follows. */
1049
/* Store the four-element header of an annotation at BUF and advance
   BUF past it:  -LEN, MASK, FROM, TO.  Also mark CODING (which must
   be in scope at the point of use) as annotated.

   LEN is the total number of elements of the annotation, including
   any that the caller stores after this header.

   The expansion deliberately does NOT end with a semicolon, so that
   the macro behaves like a single statement and can be used in an
   unbraced if/else.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, from, to)   \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (from);                                  \
    *(buf)++ = (to);                                    \
    coding->annotated = 1;                              \
  } while (0)
1058
/* Store a composition annotation at BUF and advance BUF past it: the
   common header (length 5, CODING_ANNOTATE_COMPOSITION_MASK, FROM,
   TO) followed by METHOD.  Arguments are parenthesized in the
   expansion, consistently with ADD_ANNOTATION_DATA.  */

#define ADD_COMPOSITION_DATA(buf, from, to, method)                           \
  do {                                                                        \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, from, to); \
    *(buf)++ = (method);                                                      \
  } while (0)
1064
1065
/* Store a charset annotation at BUF and advance BUF past it: the
   common header (length 5, CODING_ANNOTATE_CHARSET_MASK, FROM, TO)
   followed by the charset ID.  Arguments are parenthesized in the
   expansion, consistently with ADD_ANNOTATION_DATA.  */

#define ADD_CHARSET_DATA(buf, from, to, id)                               \
  do {                                                                    \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_CHARSET_MASK, from, to); \
    *(buf)++ = (id);                                                      \
  } while (0)
1071
df7492f9
KH
1072\f
1073/*** 2. Emacs' internal format (emacs-utf-8) ***/
1074
1075
1076
1077\f
1078/*** 3. UTF-8 ***/
1079
1080/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
1081 Check if a text is encoded in UTF-8. If it is, return 1, else
1082 return 0. */
df7492f9
KH
1083
/* Classification of a single UTF-8 octet C by its high bits.  */

/* 0xxxxxxx: C is below 0x80 (a complete one-octet sequence).  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* 10xxxxxx: trailing octet of a multi-octet sequence.  */
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
/* 110xxxxx: leading octet of a two-octet sequence.  */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
/* 1110xxxx: leading octet of a three-octet sequence.  */
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
/* 11110xxx: leading octet of a four-octet sequence.  */
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
/* 111110xx: leading octet of a five-octet sequence.  */
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1090
1091static int
ff0dacd7 1092detect_coding_utf_8 (coding, detect_info)
df7492f9 1093 struct coding_system *coding;
ff0dacd7 1094 struct coding_detection_info *detect_info;
df7492f9 1095{
065e3595 1096 const unsigned char *src = coding->source, *src_base;
8f924df7 1097 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
1098 int multibytep = coding->src_multibyte;
1099 int consumed_chars = 0;
1100 int found = 0;
1101
ff0dacd7 1102 detect_info->checked |= CATEGORY_MASK_UTF_8;
df7492f9
KH
1103 /* A coding system of this category is always ASCII compatible. */
1104 src += coding->head_ascii;
1105
1106 while (1)
aa72b389 1107 {
df7492f9 1108 int c, c1, c2, c3, c4;
aa72b389 1109
065e3595 1110 src_base = src;
df7492f9 1111 ONE_MORE_BYTE (c);
065e3595 1112 if (c < 0 || UTF_8_1_OCTET_P (c))
df7492f9
KH
1113 continue;
1114 ONE_MORE_BYTE (c1);
065e3595 1115 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
df7492f9
KH
1116 break;
1117 if (UTF_8_2_OCTET_LEADING_P (c))
aa72b389 1118 {
ff0dacd7 1119 found = CATEGORY_MASK_UTF_8;
df7492f9 1120 continue;
aa72b389 1121 }
df7492f9 1122 ONE_MORE_BYTE (c2);
065e3595 1123 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
df7492f9
KH
1124 break;
1125 if (UTF_8_3_OCTET_LEADING_P (c))
aa72b389 1126 {
ff0dacd7 1127 found = CATEGORY_MASK_UTF_8;
df7492f9 1128 continue;
aa72b389 1129 }
df7492f9 1130 ONE_MORE_BYTE (c3);
065e3595 1131 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
df7492f9
KH
1132 break;
1133 if (UTF_8_4_OCTET_LEADING_P (c))
aa72b389 1134 {
ff0dacd7 1135 found = CATEGORY_MASK_UTF_8;
df7492f9
KH
1136 continue;
1137 }
1138 ONE_MORE_BYTE (c4);
065e3595 1139 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
df7492f9
KH
1140 break;
1141 if (UTF_8_5_OCTET_LEADING_P (c))
1142 {
ff0dacd7 1143 found = CATEGORY_MASK_UTF_8;
df7492f9
KH
1144 continue;
1145 }
1146 break;
aa72b389 1147 }
ff0dacd7 1148 detect_info->rejected |= CATEGORY_MASK_UTF_8;
df7492f9 1149 return 0;
aa72b389 1150
df7492f9 1151 no_more_source:
065e3595 1152 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
aa72b389 1153 {
ff0dacd7 1154 detect_info->rejected |= CATEGORY_MASK_UTF_8;
89528eb3 1155 return 0;
aa72b389 1156 }
ff0dacd7
KH
1157 detect_info->found |= found;
1158 return 1;
aa72b389
KH
1159}
1160
4ed46869 1161
b73bfc1c 1162static void
df7492f9 1163decode_coding_utf_8 (coding)
b73bfc1c 1164 struct coding_system *coding;
b73bfc1c 1165{
8f924df7
KH
1166 const unsigned char *src = coding->source + coding->consumed;
1167 const unsigned char *src_end = coding->source + coding->src_bytes;
1168 const unsigned char *src_base;
df7492f9
KH
1169 int *charbuf = coding->charbuf;
1170 int *charbuf_end = charbuf + coding->charbuf_size;
1171 int consumed_chars = 0, consumed_chars_base;
1172 int multibytep = coding->src_multibyte;
24a73b0a 1173 Lisp_Object attr, charset_list;
4ed46869 1174
24a73b0a 1175 CODING_GET_INFO (coding, attr, charset_list);
df7492f9
KH
1176
1177 while (1)
b73bfc1c 1178 {
df7492f9 1179 int c, c1, c2, c3, c4, c5;
ec6d2bb8 1180
df7492f9
KH
1181 src_base = src;
1182 consumed_chars_base = consumed_chars;
4af310db 1183
df7492f9
KH
1184 if (charbuf >= charbuf_end)
1185 break;
1186
1187 ONE_MORE_BYTE (c1);
065e3595
KH
1188 if (c1 < 0)
1189 {
1190 c = - c1;
1191 }
1192 else if (UTF_8_1_OCTET_P(c1))
df7492f9
KH
1193 {
1194 c = c1;
4af310db 1195 }
df7492f9 1196 else
4af310db 1197 {
df7492f9 1198 ONE_MORE_BYTE (c2);
065e3595 1199 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
df7492f9
KH
1200 goto invalid_code;
1201 if (UTF_8_2_OCTET_LEADING_P (c1))
4af310db 1202 {
b0edb2c5
DL
1203 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1204 /* Reject overlong sequences here and below. Encoders
1205 producing them are incorrect, they can be misleading,
1206 and they mess up read/write invariance. */
1207 if (c < 128)
1208 goto invalid_code;
4af310db 1209 }
df7492f9 1210 else
aa72b389 1211 {
df7492f9 1212 ONE_MORE_BYTE (c3);
065e3595 1213 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
df7492f9
KH
1214 goto invalid_code;
1215 if (UTF_8_3_OCTET_LEADING_P (c1))
b0edb2c5
DL
1216 {
1217 c = (((c1 & 0xF) << 12)
1218 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
72fe1301
DL
1219 if (c < 0x800
1220 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
b0edb2c5
DL
1221 goto invalid_code;
1222 }
df7492f9
KH
1223 else
1224 {
1225 ONE_MORE_BYTE (c4);
065e3595 1226 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
df7492f9
KH
1227 goto invalid_code;
1228 if (UTF_8_4_OCTET_LEADING_P (c1))
b0edb2c5 1229 {
df7492f9
KH
1230 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1231 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
b0edb2c5
DL
1232 if (c < 0x10000)
1233 goto invalid_code;
1234 }
df7492f9
KH
1235 else
1236 {
1237 ONE_MORE_BYTE (c5);
065e3595 1238 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
df7492f9
KH
1239 goto invalid_code;
1240 if (UTF_8_5_OCTET_LEADING_P (c1))
1241 {
1242 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1243 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1244 | (c5 & 0x3F));
b0edb2c5 1245 if ((c > MAX_CHAR) || (c < 0x200000))
df7492f9
KH
1246 goto invalid_code;
1247 }
1248 else
1249 goto invalid_code;
1250 }
1251 }
aa72b389 1252 }
b73bfc1c 1253 }
df7492f9
KH
1254
1255 *charbuf++ = c;
1256 continue;
1257
1258 invalid_code:
1259 src = src_base;
1260 consumed_chars = consumed_chars_base;
1261 ONE_MORE_BYTE (c);
1262 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1263 coding->errors++;
aa72b389
KH
1264 }
1265
df7492f9
KH
1266 no_more_source:
1267 coding->consumed_char += consumed_chars_base;
1268 coding->consumed = src_base - coding->source;
1269 coding->charbuf_used = charbuf - coding->charbuf;
1270}
1271
1272
1273static int
1274encode_coding_utf_8 (coding)
1275 struct coding_system *coding;
1276{
1277 int multibytep = coding->dst_multibyte;
1278 int *charbuf = coding->charbuf;
1279 int *charbuf_end = charbuf + coding->charbuf_used;
1280 unsigned char *dst = coding->destination + coding->produced;
1281 unsigned char *dst_end = coding->destination + coding->dst_bytes;
e19c3639 1282 int produced_chars = 0;
df7492f9
KH
1283 int c;
1284
1285 if (multibytep)
aa72b389 1286 {
df7492f9
KH
1287 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1288
1289 while (charbuf < charbuf_end)
b73bfc1c 1290 {
df7492f9 1291 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
8f924df7 1292
df7492f9
KH
1293 ASSURE_DESTINATION (safe_room);
1294 c = *charbuf++;
28f67a95
KH
1295 if (CHAR_BYTE8_P (c))
1296 {
1297 c = CHAR_TO_BYTE8 (c);
1298 EMIT_ONE_BYTE (c);
1299 }
1300 else
1301 {
1302 CHAR_STRING_ADVANCE (c, pend);
1303 for (p = str; p < pend; p++)
1304 EMIT_ONE_BYTE (*p);
1305 }
b73bfc1c 1306 }
aa72b389 1307 }
df7492f9
KH
1308 else
1309 {
1310 int safe_room = MAX_MULTIBYTE_LENGTH;
1311
1312 while (charbuf < charbuf_end)
b73bfc1c 1313 {
df7492f9
KH
1314 ASSURE_DESTINATION (safe_room);
1315 c = *charbuf++;
1316 dst += CHAR_STRING (c, dst);
1317 produced_chars++;
4ed46869
KH
1318 }
1319 }
065e3595 1320 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
1321 coding->produced_char += produced_chars;
1322 coding->produced = dst - coding->destination;
1323 return 0;
4ed46869
KH
1324}
1325
b73bfc1c 1326
df7492f9 1327/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
1328 Check if a text is encoded in one of UTF-16 based coding systems.
1329 If it is, return 1, else return 0. */
aa72b389 1330
df7492f9
KH
/* Nonzero if the 16-bit code unit VAL is a high (leading) surrogate,
   i.e. in the range 0xD800..0xDBFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) (((val) & 0xFC00) == 0xD800)

/* Nonzero if the 16-bit code unit VAL is a low (trailing) surrogate,
   i.e. in the range 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) (((val) & 0xFC00) == 0xDC00)

/* Nonzero if VAL is 0xFFFE, 0xFFFF, or a low surrogate.  */
#define UTF_16_INVALID_P(val)           \
  ((val) == 0xFFFE                      \
   || (val) == 0xFFFF                   \
   || UTF_16_LOW_SURROGATE_P (val))
aa72b389 1341
aa72b389 1342
df7492f9 1343static int
ff0dacd7 1344detect_coding_utf_16 (coding, detect_info)
aa72b389 1345 struct coding_system *coding;
ff0dacd7 1346 struct coding_detection_info *detect_info;
aa72b389 1347{
8f924df7
KH
1348 const unsigned char *src = coding->source, *src_base = src;
1349 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
1350 int multibytep = coding->src_multibyte;
1351 int consumed_chars = 0;
1352 int c1, c2;
aa72b389 1353
ff0dacd7 1354 detect_info->checked |= CATEGORY_MASK_UTF_16;
ff0dacd7 1355 if (coding->mode & CODING_MODE_LAST_BLOCK
24a73b0a 1356 && (coding->src_chars & 1))
ff0dacd7
KH
1357 {
1358 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1359 return 0;
1360 }
24a73b0a 1361
df7492f9
KH
1362 ONE_MORE_BYTE (c1);
1363 ONE_MORE_BYTE (c2);
df7492f9 1364 if ((c1 == 0xFF) && (c2 == 0xFE))
aa72b389 1365 {
b49a1807
KH
1366 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1367 | CATEGORY_MASK_UTF_16_AUTO);
24a73b0a
KH
1368 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1369 | CATEGORY_MASK_UTF_16_BE_NOSIG
1370 | CATEGORY_MASK_UTF_16_LE_NOSIG);
aa72b389 1371 }
df7492f9 1372 else if ((c1 == 0xFE) && (c2 == 0xFF))
ff0dacd7 1373 {
b49a1807
KH
1374 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1375 | CATEGORY_MASK_UTF_16_AUTO);
24a73b0a
KH
1376 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1377 | CATEGORY_MASK_UTF_16_BE_NOSIG
1378 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1379 }
065e3595 1380 else if (c1 >= 0 && c2 >= 0)
24a73b0a
KH
1381 {
1382 unsigned char b1[256], b2[256];
1383 int b1_variants = 1, b2_variants = 1;
1384 int n;
1385
1386 bzero (b1, 256), bzero (b2, 256);
1387 b1[c1]++, b2[c2]++;
1388 for (n = 0; n < 256 && src < src_end; n++)
1389 {
065e3595 1390 src_base = src;
24a73b0a
KH
1391 ONE_MORE_BYTE (c1);
1392 ONE_MORE_BYTE (c2);
065e3595
KH
1393 if (c1 < 0 || c2 < 0)
1394 break;
24a73b0a
KH
1395 if (! b1[c1++]) b1_variants++;
1396 if (! b2[c2++]) b2_variants++;
1397 }
1398 if (b1_variants < b2_variants)
1399 detect_info->found |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1400 else
1401 detect_info->found |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1402 detect_info->rejected
1403 |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
ff0dacd7 1404 }
df7492f9 1405 no_more_source:
ff0dacd7 1406 return 1;
df7492f9 1407}
aa72b389 1408
df7492f9
KH
1409static void
1410decode_coding_utf_16 (coding)
1411 struct coding_system *coding;
1412{
8f924df7
KH
1413 const unsigned char *src = coding->source + coding->consumed;
1414 const unsigned char *src_end = coding->source + coding->src_bytes;
1415 const unsigned char *src_base;
df7492f9
KH
1416 int *charbuf = coding->charbuf;
1417 int *charbuf_end = charbuf + coding->charbuf_size;
1418 int consumed_chars = 0, consumed_chars_base;
1419 int multibytep = coding->src_multibyte;
1420 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1421 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1422 int surrogate = CODING_UTF_16_SURROGATE (coding);
24a73b0a 1423 Lisp_Object attr, charset_list;
df7492f9 1424
24a73b0a 1425 CODING_GET_INFO (coding, attr, charset_list);
df7492f9 1426
b49a1807 1427 if (bom == utf_16_with_bom)
aa72b389 1428 {
df7492f9 1429 int c, c1, c2;
4af310db 1430
aa72b389 1431 src_base = src;
df7492f9
KH
1432 ONE_MORE_BYTE (c1);
1433 ONE_MORE_BYTE (c2);
e19c3639 1434 c = (c1 << 8) | c2;
aa72b389 1435
b49a1807
KH
1436 if (endian == utf_16_big_endian
1437 ? c != 0xFEFF : c != 0xFFFE)
aa72b389 1438 {
b49a1807
KH
1439 /* The first two bytes are not BOM. Treat them as bytes
1440 for a normal character. */
1441 src = src_base;
1442 coding->errors++;
aa72b389 1443 }
b49a1807
KH
1444 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1445 }
1446 else if (bom == utf_16_detect_bom)
1447 {
1448 /* We have already tried to detect BOM and failed in
1449 detect_coding. */
1450 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
df7492f9 1451 }
aa72b389 1452
df7492f9
KH
1453 while (1)
1454 {
1455 int c, c1, c2;
1456
1457 src_base = src;
1458 consumed_chars_base = consumed_chars;
1459
1460 if (charbuf + 2 >= charbuf_end)
1461 break;
1462
1463 ONE_MORE_BYTE (c1);
065e3595
KH
1464 if (c1 < 0)
1465 {
1466 *charbuf++ = -c1;
1467 continue;
1468 }
df7492f9 1469 ONE_MORE_BYTE (c2);
065e3595
KH
1470 if (c2 < 0)
1471 {
1472 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1473 *charbuf++ = -c2;
1474 continue;
1475 }
df7492f9 1476 c = (endian == utf_16_big_endian
e19c3639 1477 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
df7492f9 1478 if (surrogate)
fd3ae0b9 1479 {
df7492f9 1480 if (! UTF_16_LOW_SURROGATE_P (c))
fd3ae0b9 1481 {
df7492f9
KH
1482 if (endian == utf_16_big_endian)
1483 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1484 else
1485 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1486 *charbuf++ = c1;
1487 *charbuf++ = c2;
1488 coding->errors++;
1489 if (UTF_16_HIGH_SURROGATE_P (c))
1490 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
fd3ae0b9 1491 else
df7492f9 1492 *charbuf++ = c;
fd3ae0b9
KH
1493 }
1494 else
df7492f9
KH
1495 {
1496 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1497 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
29f7ffd0 1498 *charbuf++ = 0x10000 + c;
df7492f9 1499 }
fd3ae0b9 1500 }
aa72b389 1501 else
df7492f9
KH
1502 {
1503 if (UTF_16_HIGH_SURROGATE_P (c))
1504 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1505 else
1506 *charbuf++ = c;
8f924df7 1507 }
aa72b389 1508 }
df7492f9
KH
1509
1510 no_more_source:
1511 coding->consumed_char += consumed_chars_base;
1512 coding->consumed = src_base - coding->source;
1513 coding->charbuf_used = charbuf - coding->charbuf;
aa72b389 1514}
b73bfc1c 1515
df7492f9
KH
1516static int
1517encode_coding_utf_16 (coding)
1518 struct coding_system *coding;
1519{
1520 int multibytep = coding->dst_multibyte;
1521 int *charbuf = coding->charbuf;
1522 int *charbuf_end = charbuf + coding->charbuf_used;
1523 unsigned char *dst = coding->destination + coding->produced;
1524 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1525 int safe_room = 8;
1526 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1527 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1528 int produced_chars = 0;
24a73b0a 1529 Lisp_Object attrs, charset_list;
df7492f9 1530 int c;
4ed46869 1531
24a73b0a 1532 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9 1533
b49a1807 1534 if (bom != utf_16_without_bom)
df7492f9
KH
1535 {
1536 ASSURE_DESTINATION (safe_room);
1537 if (big_endian)
df7492f9 1538 EMIT_TWO_BYTES (0xFE, 0xFF);
880cf180
KH
1539 else
1540 EMIT_TWO_BYTES (0xFF, 0xFE);
df7492f9
KH
1541 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1542 }
1543
1544 while (charbuf < charbuf_end)
1545 {
1546 ASSURE_DESTINATION (safe_room);
1547 c = *charbuf++;
e19c3639
KH
1548 if (c >= MAX_UNICODE_CHAR)
1549 c = coding->default_char;
df7492f9
KH
1550
1551 if (c < 0x10000)
1552 {
1553 if (big_endian)
1554 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1555 else
1556 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1557 }
1558 else
1559 {
1560 int c1, c2;
1561
1562 c -= 0x10000;
1563 c1 = (c >> 10) + 0xD800;
1564 c2 = (c & 0x3FF) + 0xDC00;
1565 if (big_endian)
1566 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1567 else
1568 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1569 }
1570 }
065e3595 1571 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
1572 coding->produced = dst - coding->destination;
1573 coding->produced_char += produced_chars;
1574 return 0;
1575}
1576
1577\f
1578/*** 6. Old Emacs' internal format (emacs-mule) ***/
1579
1580/* Emacs' internal format for representation of multiple character
1581 sets is a kind of multi-byte encoding, i.e. characters are
1582 represented by variable-length sequences of one-byte codes.
1583
1584 ASCII characters and control characters (e.g. `tab', `newline') are
1585 represented by one-byte sequences which are their ASCII codes, in
1586 the range 0x00 through 0x7F.
1587
1588 8-bit characters of the range 0x80..0x9F are represented by
1589 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1590 code + 0x20).
1591
1592 8-bit characters of the range 0xA0..0xFF are represented by
1593 one-byte sequences which are their 8-bit code.
1594
1595 The other characters are represented by a sequence of `base
1596 leading-code', optional `extended leading-code', and one or two
1597 `position-code's. The length of the sequence is determined by the
1598 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1599 whereas extended leading-code and position-code take the range 0xA0
1600 through 0xFF. See `charset.h' for more details about leading-code
1601 and position-code.
1602
1603 --- CODE RANGE of Emacs' internal format ---
1604 character set range
1605 ------------- -----
1606 ascii 0x00..0x7F
1607 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1608 eight-bit-graphic 0xA0..0xBF
1609 ELSE 0x81..0x9D + [0xA0..0xFF]+
1610 ---------------------------------------------
1611
1612 As this is the internal character representation, the format is
1613 usually not used externally (i.e. in a file or in a data sent to a
1614 process). But, it is possible to have a text externally in this
1615 format (i.e. by encoding by the coding system `emacs-mule').
1616
1617 In that case, a sequence of one-byte codes has a slightly different
1618 form.
1619
1620 At first, all characters in eight-bit-control are represented by
1621 one-byte sequences which are their 8-bit code.
1622
1623 Next, character composition data are represented by the byte
1624 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1625 where,
1626 METHOD is 0xF2 plus one of composition method (enum
1627 composition_method),
1628
1629 BYTES is 0xA0 plus a byte length of this composition data,
1630
1631 CHARS is 0x20 plus a number of characters composed by this
1632 data,
1633
1634 COMPONENTs are characters of multibyte form or composition
1635 rules encoded by two-byte of ASCII codes.
1636
1637 In addition, for backward compatibility, the following formats are
1638 also recognized as composition data on decoding.
1639
1640 0x80 MSEQ ...
1641 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1642
1643 Here,
1644 MSEQ is a multibyte form but in these special format:
1645 ASCII: 0xA0 ASCII_CODE+0x80,
1646 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1647 RULE is a one byte code of the range 0xA0..0xF0 that
1648 represents a composition rule.
1649 */
1650
/* Table indexed by the first byte of an emacs-mule sequence.
   emacs_mule_char expects the entry to be 1, 2, 3 or 4 and uses it
   to select how many bytes make up the character; any other value
   makes it abort.  */
char emacs_mule_bytes[256];
1652
df7492f9 1653int
ff0dacd7 1654emacs_mule_char (coding, src, nbytes, nchars, id)
df7492f9 1655 struct coding_system *coding;
065e3595 1656 const unsigned char *src;
ff0dacd7 1657 int *nbytes, *nchars, *id;
df7492f9 1658{
8f924df7
KH
1659 const unsigned char *src_end = coding->source + coding->src_bytes;
1660 const unsigned char *src_base = src;
df7492f9 1661 int multibytep = coding->src_multibyte;
df7492f9
KH
1662 struct charset *charset;
1663 unsigned code;
1664 int c;
1665 int consumed_chars = 0;
1666
1667 ONE_MORE_BYTE (c);
065e3595 1668 if (c < 0)
df7492f9 1669 {
065e3595
KH
1670 c = -c;
1671 charset = emacs_mule_charset[0];
1672 }
1673 else
1674 {
1675 switch (emacs_mule_bytes[c])
b73bfc1c 1676 {
065e3595 1677 case 2:
df7492f9
KH
1678 if (! (charset = emacs_mule_charset[c]))
1679 goto invalid_code;
1680 ONE_MORE_BYTE (c);
065e3595
KH
1681 if (c < 0)
1682 goto invalid_code;
df7492f9 1683 code = c & 0x7F;
065e3595
KH
1684 break;
1685
1686 case 3:
1687 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1688 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1689 {
1690 ONE_MORE_BYTE (c);
1691 if (c < 0 || ! (charset = emacs_mule_charset[c]))
1692 goto invalid_code;
1693 ONE_MORE_BYTE (c);
1694 if (c < 0)
1695 goto invalid_code;
1696 code = c & 0x7F;
1697 }
1698 else
1699 {
1700 if (! (charset = emacs_mule_charset[c]))
1701 goto invalid_code;
1702 ONE_MORE_BYTE (c);
1703 if (c < 0)
1704 goto invalid_code;
1705 code = (c & 0x7F) << 8;
1706 ONE_MORE_BYTE (c);
1707 if (c < 0)
1708 goto invalid_code;
1709 code |= c & 0x7F;
1710 }
1711 break;
1712
1713 case 4:
1714 ONE_MORE_BYTE (c);
1715 if (c < 0 || ! (charset = emacs_mule_charset[c]))
df7492f9
KH
1716 goto invalid_code;
1717 ONE_MORE_BYTE (c);
065e3595
KH
1718 if (c < 0)
1719 goto invalid_code;
781d7a48 1720 code = (c & 0x7F) << 8;
df7492f9 1721 ONE_MORE_BYTE (c);
065e3595
KH
1722 if (c < 0)
1723 goto invalid_code;
df7492f9 1724 code |= c & 0x7F;
065e3595 1725 break;
df7492f9 1726
065e3595
KH
1727 case 1:
1728 code = c;
1729 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1730 ? charset_ascii : charset_eight_bit);
1731 break;
df7492f9 1732
065e3595
KH
1733 default:
1734 abort ();
1735 }
1736 c = DECODE_CHAR (charset, code);
1737 if (c < 0)
1738 goto invalid_code;
df7492f9 1739 }
df7492f9
KH
1740 *nbytes = src - src_base;
1741 *nchars = consumed_chars;
ff0dacd7
KH
1742 if (id)
1743 *id = charset->id;
df7492f9
KH
1744 return c;
1745
1746 no_more_source:
1747 return -2;
1748
1749 invalid_code:
1750 return -1;
1751}
1752
1753
1754/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
1755 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1756 else return 0. */
df7492f9
KH
1757
1758static int
ff0dacd7 1759detect_coding_emacs_mule (coding, detect_info)
df7492f9 1760 struct coding_system *coding;
ff0dacd7 1761 struct coding_detection_info *detect_info;
df7492f9 1762{
065e3595 1763 const unsigned char *src = coding->source, *src_base;
8f924df7 1764 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
1765 int multibytep = coding->src_multibyte;
1766 int consumed_chars = 0;
1767 int c;
1768 int found = 0;
1769
ff0dacd7 1770 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
df7492f9
KH
1771 /* A coding system of this category is always ASCII compatible. */
1772 src += coding->head_ascii;
1773
1774 while (1)
1775 {
065e3595 1776 src_base = src;
df7492f9 1777 ONE_MORE_BYTE (c);
065e3595
KH
1778 if (c < 0)
1779 continue;
df7492f9
KH
1780 if (c == 0x80)
1781 {
1782 /* Perhaps the start of composite character. We simple skip
1783 it because analyzing it is too heavy for detecting. But,
1784 at least, we check that the composite character
1785 constitues of more than 4 bytes. */
8f924df7 1786 const unsigned char *src_base;
df7492f9
KH
1787
1788 repeat:
1789 src_base = src;
1790 do
1791 {
1792 ONE_MORE_BYTE (c);
1793 }
1794 while (c >= 0xA0);
1795
1796 if (src - src_base <= 4)
1797 break;
ff0dacd7 1798 found = CATEGORY_MASK_EMACS_MULE;
df7492f9
KH
1799 if (c == 0x80)
1800 goto repeat;
b73bfc1c 1801 }
df7492f9
KH
1802
1803 if (c < 0x80)
b73bfc1c 1804 {
df7492f9
KH
1805 if (c < 0x20
1806 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1807 break;
1808 }
1809 else
1810 {
0e219d54 1811 int more_bytes = emacs_mule_bytes[*src_base] - 1;
df7492f9 1812
0e219d54 1813 while (more_bytes > 0)
df7492f9
KH
1814 {
1815 ONE_MORE_BYTE (c);
0e219d54
KH
1816 if (c < 0xA0)
1817 {
1818 src--; /* Unread the last byte. */
1819 break;
1820 }
1821 more_bytes--;
df7492f9 1822 }
0e219d54 1823 if (more_bytes != 0)
df7492f9 1824 break;
ff0dacd7 1825 found = CATEGORY_MASK_EMACS_MULE;
4ed46869
KH
1826 }
1827 }
ff0dacd7 1828 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
df7492f9
KH
1829 return 0;
1830
1831 no_more_source:
065e3595 1832 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
89528eb3 1833 {
ff0dacd7 1834 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
89528eb3
KH
1835 return 0;
1836 }
ff0dacd7
KH
1837 detect_info->found |= found;
1838 return 1;
4ed46869
KH
1839}
1840
b73bfc1c 1841
df7492f9
KH
1842/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1843
/* Decode a character represented as a component of a composition
   sequence of Emacs 20/21 style at SRC.  Store the character in *BUF,
   increment BUF, advance SRC to the head of the next character (or
   encoded composition rule), and add the consumed characters to
   CONSUMED_CHARS.

   If SRC is already at SRC_END, or if the source ends in the middle
   of the character, execute `break' in the caller's enclosing loop
   or switch.  If SRC points to an invalid byte sequence, jump to the
   caller's label `invalid_code'.

   The expansion has the form `if (1) { ... } else', rather than a
   do-while wrapper, precisely so that the `break' statements apply
   to the caller's construct; the trailing `else' swallows the
   semicolon written after the macro call.  */

#define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf)                         \
  if (1)                                                                \
    {                                                                   \
      int c;                                                            \
      int nbytes, nchars;                                               \
                                                                        \
      if (src == src_end)                                               \
        break;                                                          \
      c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);        \
      if (c == -2)                                                      \
        break;                                                          \
      if (c < 0)                                                        \
        goto invalid_code;                                              \
      *buf++ = c;                                                       \
      src += nbytes;                                                    \
      consumed_chars += nchars;                                         \
    }                                                                   \
  else
1871
1872
/* Decode one composition rule of an Emacs 20 style composition
   sequence at SRC.  The rule is read as a single byte whose value,
   after subtracting 0x20, must lie in 0..80; it is split into
   GREF = value / 9 and NREF = value % 9.  The rule encoded by
   COMPOSITION_ENCODE_RULE is stored in *BUF and BUF is advanced.

   If no byte is left at SRC, or the byte is outside the range above,
   control jumps to the label `invalid_code'.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf)                      \
  do {                                                                  \
    int rule, gref, nref;                                               \
                                                                        \
    if (src >= src_end)                                                 \
      goto invalid_code;                                                \
    ONE_MORE_BYTE_NO_CHECK (rule);                                      \
    rule -= 0x20;                                                       \
    if (! (rule >= 0 && rule < 81))                                     \
      goto invalid_code;                                                \
                                                                        \
    gref = rule / 9;                                                    \
    nref = rule % 9;                                                    \
    *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);                      \
  } while (0)
1892
1893
781d7a48
KH
/* Decode one composition rule of an Emacs 21 style composition
   sequence at SRC.  The rule is read as two bytes, GREF + 0x20
   followed by NREF + 0x20.  The rule encoded by
   COMPOSITION_ENCODE_RULE is stored in *BUF and BUF is advanced.

   If fewer than two bytes are left at SRC, or either value is
   outside 0..80, control jumps to the label `invalid_code'.

   NOTE(review): the bound 81 is the same one used for the combined
   single-byte rule of the Emacs 20 format; confirm that it is the
   intended limit for each of GREF and NREF here.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf)                      \
  do {                                                                  \
    int g, n;                                                           \
                                                                        \
    if (src_end - src < 2)                                              \
      goto invalid_code;                                                \
    ONE_MORE_BYTE_NO_CHECK (g);                                         \
    g -= 0x20;                                                          \
    ONE_MORE_BYTE_NO_CHECK (n);                                         \
    n -= 0x20;                                                          \
    if (! (g >= 0 && g < 81 && n >= 0 && n < 81))                       \
      goto invalid_code;                                                \
    *buf++ = COMPOSITION_ENCODE_RULE (g, n);                            \
  } while (0)
1914
1915
/* Decode an Emacs 21 style composition sequence.  C is the byte that
   followed 0x80 and holds METHOD + 0xF2.  The next two bytes at SRC
   are BYTES + 0xA0 and CHARS + 0xA0, where BYTES is the byte length
   of this composition information and CHARS is the number of
   characters composed by this composition.

   A composition annotation is appended to CHARBUF with
   ADD_COMPOSITION_DATA.  For every method other than
   COMPOSITION_RELATIVE the components are then decoded into CHARBUF
   until CONSUMED_CHARS reaches CONSUMED_CHARS_BASE + BYTES, and the
   first element of the annotation is decreased by the number of
   decoded components.

   Jumps to `invalid_code' when a header byte is negative, BYTES is
   less than 3, or the components end early; ONE_MORE_BYTE supplies
   its own handling when the source runs out.  */

#define DECODE_EMACS_MULE_21_COMPOSITION(c)                             \
  do {                                                                  \
    enum composition_method method = c - 0xF2;                          \
    int *charbuf_base = charbuf;                                        \
    int from, to;                                                       \
    int consumed_chars_limit;                                           \
    int nbytes, nchars;                                                 \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    nbytes = c - 0xA0;                                                  \
    if (nbytes < 3)                                                     \
      goto invalid_code;                                                \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    nchars = c - 0xA0;                                                  \
    /* FROM and TO are character positions, so they are based on the   \
       character counter produced_char (as in the Emacs 20 style       \
       macros), not on the byte counter.  */                            \
    from = coding->produced_char + char_offset;                         \
    to = from + nchars;                                                 \
    ADD_COMPOSITION_DATA (charbuf, from, to, method);                   \
    consumed_chars_limit = consumed_chars_base + nbytes;                \
    if (method != COMPOSITION_RELATIVE)                                 \
      {                                                                 \
        int i = 0;                                                      \
        while (consumed_chars < consumed_chars_limit)                   \
          {                                                             \
            /* Odd components are rules unless the method is           \
               COMPOSITION_WITH_ALTCHARS.  */                           \
            if (i % 2 && method != COMPOSITION_WITH_ALTCHARS)           \
              DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf);          \
            else                                                        \
              DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf);             \
            i++;                                                        \
          }                                                             \
        /* The loop above can be left early by a `break' inside        \
           DECODE_EMACS_MULE_COMPOSITION_CHAR.  */                      \
        if (consumed_chars < consumed_chars_limit)                      \
          goto invalid_code;                                            \
        charbuf_base[0] -= i;                                           \
      }                                                                 \
  } while (0)
93dec019 1958
aa72b389 1959
df7492f9
KH
/* Decode an Emacs 20 style relative composition.  Reading restarts
   at SRC_BASE: the leading 0x80 is skipped, then up to
   MAX_COMPOSITION_COMPONENTS characters are decoded into a local
   table.  At least two characters are required, otherwise control
   jumps to `invalid_code'.  A composition annotation is appended to
   CHARBUF with ADD_COMPOSITION_DATA, followed by the decoded
   characters.  C is used as scratch for the skipped byte.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c)                    \
  do {                                                                  \
    enum composition_method method = COMPOSITION_RELATIVE;              \
    int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];                 \
    int *buf = components;                                              \
    int i, j;                                                           \
    int from, to;                                                       \
                                                                        \
    /* Rewind both the source pointer and the consumed-character       \
       count before re-reading; rewinding only SRC would leave the     \
       bytes read so far counted twice.  */                             \
    src = src_base;                                                     \
    consumed_chars = consumed_chars_base;                               \
    ONE_MORE_BYTE (c);          /* skip 0x80 */                         \
    for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)                    \
      DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                         \
    if (i < 2)                                                          \
      goto invalid_code;                                                \
    from = coding->produced_char + char_offset;                         \
    to = from + i;                                                      \
    ADD_COMPOSITION_DATA (charbuf, from, to, method);                   \
    for (j = 0; j < i; j++)                                             \
      *charbuf++ = components[j];                                       \
  } while (0)
1982
1983
/* Decode an Emacs 20 style rule-base composition.  One character is
   decoded first, then up to MAX_COMPOSITION_COMPONENTS (rule,
   character) pairs, all into a local table.  The sequence is invalid
   (jump to `invalid_code') when no pair was decoded or when the table
   ends with a rule.  A composition annotation is then appended to
   CHARBUF with ADD_COMPOSITION_DATA, followed by entries copied from
   the table.  Jumps to `no_more_source' when CHARBUF has no room.

   NOTE(review): I counts (rule, character) pairs, yet the copy loops
   below use it as a count of table entries; confirm the intended
   layout of the data stored in CHARBUF.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c)                    \
  do {                                                                  \
    enum composition_method method = COMPOSITION_WITH_RULE;             \
    /* Room for the leading character plus                             \
       MAX_COMPOSITION_COMPONENTS (rule, character) pairs, which is    \
       the most the loop below can store.  */                           \
    int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];                 \
    int *buf = components;                                              \
    int i, j;                                                           \
    int from, to;                                                       \
                                                                        \
    DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                           \
    for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)                    \
      {                                                                 \
        DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf);                    \
        DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                       \
      }                                                                 \
    if (i < 1 || (buf - components) % 2 == 0)                           \
      goto invalid_code;                                                \
    /* Give up only when CHARBUF can NOT hold the data.  */             \
    if (charbuf + i + (i / 2) + 1 > charbuf_end)                        \
      goto no_more_source;                                              \
    from = coding->produced_char + char_offset;                         \
    to = from + i;                                                      \
    /* The annotation belongs in CHARBUF, like in the other            \
       composition macros, not in the local table.  */                  \
    ADD_COMPOSITION_DATA (charbuf, from, to, method);                   \
    for (j = 0; j < i; j++)                                             \
      *charbuf++ = components[j];                                       \
    for (j = 0; j < i; j += 2)                                          \
      *charbuf++ = components[j];                                       \
  } while (0)
2012
aa72b389
KH
2013
2014static void
df7492f9 2015decode_coding_emacs_mule (coding)
aa72b389 2016 struct coding_system *coding;
aa72b389 2017{
8f924df7
KH
2018 const unsigned char *src = coding->source + coding->consumed;
2019 const unsigned char *src_end = coding->source + coding->src_bytes;
2020 const unsigned char *src_base;
df7492f9 2021 int *charbuf = coding->charbuf;
ff0dacd7 2022 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
df7492f9 2023 int consumed_chars = 0, consumed_chars_base;
df7492f9 2024 int multibytep = coding->src_multibyte;
24a73b0a 2025 Lisp_Object attrs, charset_list;
ff0dacd7
KH
2026 int char_offset = coding->produced_char;
2027 int last_offset = char_offset;
2028 int last_id = charset_ascii;
aa72b389 2029
24a73b0a 2030 CODING_GET_INFO (coding, attrs, charset_list);
aa72b389 2031
aa72b389
KH
2032 while (1)
2033 {
df7492f9
KH
2034 int c;
2035
aa72b389 2036 src_base = src;
df7492f9
KH
2037 consumed_chars_base = consumed_chars;
2038
2039 if (charbuf >= charbuf_end)
2040 break;
aa72b389 2041
df7492f9 2042 ONE_MORE_BYTE (c);
065e3595
KH
2043 if (c < 0)
2044 {
2045 *charbuf++ = -c;
2046 char_offset++;
2047 }
2048 else if (c < 0x80)
aa72b389 2049 {
df7492f9
KH
2050 *charbuf++ = c;
2051 char_offset++;
aa72b389 2052 }
df7492f9
KH
2053 else if (c == 0x80)
2054 {
df7492f9 2055 ONE_MORE_BYTE (c);
065e3595
KH
2056 if (c < 0)
2057 goto invalid_code;
781d7a48
KH
2058 if (c - 0xF2 >= COMPOSITION_RELATIVE
2059 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
df7492f9
KH
2060 DECODE_EMACS_MULE_21_COMPOSITION (c);
2061 else if (c < 0xC0)
2062 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
2063 else if (c == 0xFF)
2064 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
2065 else
2066 goto invalid_code;
2067 }
2068 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
2069 {
2070 int nbytes, nchars;
ff0dacd7
KH
2071 int id;
2072
781d7a48
KH
2073 src = src_base;
2074 consumed_chars = consumed_chars_base;
ff0dacd7 2075 c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
df7492f9
KH
2076 if (c < 0)
2077 {
2078 if (c == -2)
2079 break;
2080 goto invalid_code;
2081 }
ff0dacd7
KH
2082 if (last_id != id)
2083 {
2084 if (last_id != charset_ascii)
2085 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2086 last_id = id;
2087 last_offset = char_offset;
2088 }
df7492f9 2089 *charbuf++ = c;
781d7a48
KH
2090 src += nbytes;
2091 consumed_chars += nchars;
df7492f9
KH
2092 char_offset++;
2093 }
2094 continue;
2095
2096 invalid_code:
2097 src = src_base;
2098 consumed_chars = consumed_chars_base;
2099 ONE_MORE_BYTE (c);
2100 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 2101 char_offset++;
df7492f9
KH
2102 coding->errors++;
2103 }
2104
2105 no_more_source:
ff0dacd7
KH
2106 if (last_id != charset_ascii)
2107 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
df7492f9
KH
2108 coding->consumed_char += consumed_chars_base;
2109 coding->consumed = src_base - coding->source;
2110 coding->charbuf_used = charbuf - coding->charbuf;
2111}
2112
2113
/* Store in CODES[0] and CODES[1] the leading code(s) for the
   emacs-mule charset ID.

     ID < 0xA0          CODES = { ID,   0  }
     0xA0 <= ID < 0xE0  CODES = { 0x9A, ID }
     0xE0 <= ID < 0xF0  CODES = { 0x9B, ID }
     0xF0 <= ID < 0xF5  CODES = { 0x9C, ID }
     otherwise          CODES = { 0x9D, ID }

   ID is evaluated exactly once.  The expansion is a single statement
   without a trailing semicolon, so the macro can be used safely as
   the body of an `if' that has an `else'.  */

#define EMACS_MULE_LEADING_CODES(id, codes)                             \
  do {                                                                  \
    const int mule_id_ = (id);                                          \
                                                                        \
    if (mule_id_ < 0xA0)                                                \
      (codes)[0] = mule_id_, (codes)[1] = 0;                            \
    else if (mule_id_ < 0xE0)                                           \
      (codes)[0] = 0x9A, (codes)[1] = mule_id_;                         \
    else if (mule_id_ < 0xF0)                                           \
      (codes)[0] = 0x9B, (codes)[1] = mule_id_;                         \
    else if (mule_id_ < 0xF5)                                           \
      (codes)[0] = 0x9C, (codes)[1] = mule_id_;                         \
    else                                                                \
      (codes)[0] = 0x9D, (codes)[1] = mule_id_;                         \
  } while (0)
2127
aa72b389 2128
df7492f9
KH
2129static int
2130encode_coding_emacs_mule (coding)
2131 struct coding_system *coding;
2132{
2133 int multibytep = coding->dst_multibyte;
2134 int *charbuf = coding->charbuf;
2135 int *charbuf_end = charbuf + coding->charbuf_used;
2136 unsigned char *dst = coding->destination + coding->produced;
2137 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2138 int safe_room = 8;
df7492f9 2139 int produced_chars = 0;
24a73b0a 2140 Lisp_Object attrs, charset_list;
df7492f9 2141 int c;
ff0dacd7 2142 int preferred_charset_id = -1;
df7492f9 2143
24a73b0a 2144 CODING_GET_INFO (coding, attrs, charset_list);
eccb6815
KH
2145 if (! EQ (charset_list, Vemacs_mule_charset_list))
2146 {
2147 CODING_ATTR_CHARSET_LIST (attrs)
2148 = charset_list = Vemacs_mule_charset_list;
2149 }
df7492f9
KH
2150
2151 while (charbuf < charbuf_end)
2152 {
2153 ASSURE_DESTINATION (safe_room);
2154 c = *charbuf++;
ff0dacd7
KH
2155
2156 if (c < 0)
2157 {
2158 /* Handle an annotation. */
2159 switch (*charbuf)
2160 {
2161 case CODING_ANNOTATE_COMPOSITION_MASK:
2162 /* Not yet implemented. */
2163 break;
2164 case CODING_ANNOTATE_CHARSET_MASK:
2165 preferred_charset_id = charbuf[3];
2166 if (preferred_charset_id >= 0
2167 && NILP (Fmemq (make_number (preferred_charset_id),
2168 charset_list)))
2169 preferred_charset_id = -1;
2170 break;
2171 default:
2172 abort ();
2173 }
2174 charbuf += -c - 1;
2175 continue;
2176 }
2177
df7492f9
KH
2178 if (ASCII_CHAR_P (c))
2179 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
2180 else if (CHAR_BYTE8_P (c))
2181 {
2182 c = CHAR_TO_BYTE8 (c);
2183 EMIT_ONE_BYTE (c);
2184 }
df7492f9 2185 else
aa72b389 2186 {
df7492f9
KH
2187 struct charset *charset;
2188 unsigned code;
2189 int dimension;
2190 int emacs_mule_id;
2191 unsigned char leading_codes[2];
2192
ff0dacd7
KH
2193 if (preferred_charset_id >= 0)
2194 {
2195 charset = CHARSET_FROM_ID (preferred_charset_id);
2196 if (! CHAR_CHARSET_P (c, charset))
2197 charset = char_charset (c, charset_list, NULL);
2198 }
2199 else
2200 charset = char_charset (c, charset_list, &code);
df7492f9
KH
2201 if (! charset)
2202 {
2203 c = coding->default_char;
2204 if (ASCII_CHAR_P (c))
2205 {
2206 EMIT_ONE_ASCII_BYTE (c);
2207 continue;
2208 }
2209 charset = char_charset (c, charset_list, &code);
2210 }
2211 dimension = CHARSET_DIMENSION (charset);
2212 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2213 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2214 EMIT_ONE_BYTE (leading_codes[0]);
2215 if (leading_codes[1])
2216 EMIT_ONE_BYTE (leading_codes[1]);
2217 if (dimension == 1)
1fa663f9 2218 EMIT_ONE_BYTE (code | 0x80);
aa72b389 2219 else
df7492f9 2220 {
1fa663f9 2221 code |= 0x8080;
df7492f9
KH
2222 EMIT_ONE_BYTE (code >> 8);
2223 EMIT_ONE_BYTE (code & 0xFF);
2224 }
aa72b389 2225 }
aa72b389 2226 }
065e3595 2227 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
2228 coding->produced_char += produced_chars;
2229 coding->produced = dst - coding->destination;
2230 return 0;
aa72b389 2231}
b73bfc1c 2232
4ed46869 2233\f
df7492f9 2234/*** 7. ISO2022 handlers ***/
4ed46869
KH
2235
2236/* The following note describes the coding system ISO2022 briefly.
39787efd 2237 Since the intention of this note is to help understand the
5a936b46 2238 functions in this file, some parts are NOT ACCURATE or are OVERLY
39787efd 2239 SIMPLIFIED. For thorough understanding, please refer to the
5a936b46 2240 original document of ISO2022. This is equivalent to the standard
cfb43547 2241 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
4ed46869
KH
2242
2243 ISO2022 provides many mechanisms to encode several character sets
cfb43547 2244 in 7-bit and 8-bit environments. For 7-bit environments, all text
39787efd
KH
2245 is encoded using bytes less than 128. This may make the encoded
2246 text a little bit longer, but the text passes more easily through
cfb43547 2247 several types of gateway, some of which strip off the MSB (Most
8ca3766a 2248 Significant Bit).
b73bfc1c 2249
cfb43547
DL
2250 There are two kinds of character sets: control character sets and
2251 graphic character sets. The former contain control characters such
4ed46869 2252 as `newline' and `escape' to provide control functions (control
39787efd 2253 functions are also provided by escape sequences). The latter
cfb43547 2254 contain graphic characters such as 'A' and '-'. Emacs recognizes
4ed46869
KH
2255 two control character sets and many graphic character sets.
2256
2257 Graphic character sets are classified into one of the following
39787efd
KH
2258 four classes, according to the number of bytes (DIMENSION) and
2259 number of characters in one dimension (CHARS) of the set:
2260 - DIMENSION1_CHARS94
2261 - DIMENSION1_CHARS96
2262 - DIMENSION2_CHARS94
2263 - DIMENSION2_CHARS96
2264
2265 In addition, each character set is assigned an identification tag,
cfb43547 2266 unique for each set, called the "final character" (denoted as <F>
39787efd
KH
2267 hereafter). The <F> of each character set is decided by ECMA(*)
2268 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2269 (0x30..0x3F are for private use only).
4ed46869
KH
2270
2271 Note (*): ECMA = European Computer Manufacturers Association
2272
cfb43547 2273 Here are examples of graphic character sets [NAME(<F>)]:
4ed46869
KH
2274 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2275 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2276 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2277 o DIMENSION2_CHARS96 -- none for the moment
2278
39787efd 2279 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4ed46869
KH
2280 C0 [0x00..0x1F] -- control character plane 0
2281 GL [0x20..0x7F] -- graphic character plane 0
2282 C1 [0x80..0x9F] -- control character plane 1
2283 GR [0xA0..0xFF] -- graphic character plane 1
2284
2285 A control character set is directly designated and invoked to C0 or
39787efd
KH
2286 C1 by an escape sequence. The most common case is that:
2287 - ISO646's control character set is designated/invoked to C0, and
2288 - ISO6429's control character set is designated/invoked to C1,
2289 and usually these designations/invocations are omitted in encoded
2290 text. In a 7-bit environment, only C0 can be used, and a control
2291 character for C1 is encoded by an appropriate escape sequence to
2292 fit into the environment. All control characters for C1 are
2293 defined to have corresponding escape sequences.
4ed46869
KH
2294
2295 A graphic character set is at first designated to one of four
2296 graphic registers (G0 through G3), then these graphic registers are
2297 invoked to GL or GR. These designations and invocations can be
2298 done independently. The most common case is that G0 is invoked to
39787efd
KH
2299 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2300 these invocations and designations are omitted in encoded text.
2301 In a 7-bit environment, only GL can be used.
4ed46869 2302
39787efd
KH
2303 When a graphic character set of CHARS94 is invoked to GL, codes
2304 0x20 and 0x7F of the GL area work as control characters SPACE and
2305 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2306 be used.
4ed46869
KH
2307
2308 There are two ways of invocation: locking-shift and single-shift.
2309 With locking-shift, the invocation lasts until the next different
39787efd
KH
2310 invocation, whereas with single-shift, the invocation affects the
2311 following character only and doesn't affect the locking-shift
2312 state. Invocations are done by the following control characters or
2313 escape sequences:
4ed46869
KH
2314
2315 ----------------------------------------------------------------------
39787efd 2316 abbrev function cntrl escape seq description
4ed46869 2317 ----------------------------------------------------------------------
39787efd
KH
2318 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2319 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2320 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2321 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2322 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2323 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2324 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2325 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2326 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4ed46869 2327 ----------------------------------------------------------------------
39787efd
KH
2328 (*) These are not used by any known coding system.
2329
2330 Control characters for these functions are defined by macros
2331 ISO_CODE_XXX in `coding.h'.
4ed46869 2332
39787efd 2333 Designations are done by the following escape sequences:
4ed46869
KH
2334 ----------------------------------------------------------------------
2335 escape sequence description
2336 ----------------------------------------------------------------------
2337 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2338 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2339 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2340 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2341 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2342 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2343 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2344 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2345 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2346 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2347 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2348 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2349 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2350 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2351 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2352 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2353 ----------------------------------------------------------------------
2354
2355 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
39787efd 2356 of dimension 1, chars 94, and final character <F>, etc...
4ed46869
KH
2357
2358 Note (*): Although these designations are not allowed in ISO2022,
2359 Emacs accepts them on decoding, and produces them on encoding
39787efd 2360 CHARS96 character sets in a coding system which is characterized as
4ed46869
KH
2361 7-bit environment, non-locking-shift, and non-single-shift.
2362
2363 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
df7492f9 2364 '(' must be omitted. We refer to this as "short-form" hereafter.
4ed46869 2365
cfb43547 2366 Now you may notice that there are a lot of ways of encoding the
39787efd
KH
2367 same multilingual text in ISO2022. Actually, there exist many
2368 coding systems such as Compound Text (used in X11's inter client
8ca3766a
DL
2369 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2370 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
4ed46869
KH
2371 localized platforms), and all of these are variants of ISO2022.
2372
2373 In addition to the above, Emacs handles two more kinds of escape
2374 sequences: ISO6429's direction specification and Emacs' private
2375 sequence for specifying character composition.
2376
39787efd 2377 ISO6429's direction specification takes the following form:
4ed46869
KH
2378 o CSI ']' -- end of the current direction
2379 o CSI '0' ']' -- end of the current direction
2380 o CSI '1' ']' -- start of left-to-right text
2381 o CSI '2' ']' -- start of right-to-left text
2382 The control character CSI (0x9B: control sequence introducer) is
39787efd
KH
2383 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2384
2385 Character composition specification takes the following form:
ec6d2bb8
KH
2386 o ESC '0' -- start relative composition
2387 o ESC '1' -- end composition
2388 o ESC '2' -- start rule-base composition (*)
2389 o ESC '3' -- start relative composition with alternate chars (**)
2390 o ESC '4' -- start rule-base composition with alternate chars (**)
b73bfc1c 2391 Since these are not standard escape sequences of any ISO standard,
cfb43547 2392 the use of them with these meanings is restricted to Emacs only.
ec6d2bb8 2393
5a936b46
DL
2394 (*) This form is used only in Emacs 20.7 and older versions,
2395 but newer versions can safely decode it.
cfb43547 2396 (**) This form is used only in Emacs 21.1 and newer versions,
5a936b46 2397 and older versions can't decode it.
ec6d2bb8 2398
cfb43547 2399 Here's a list of example usages of these composition escape
b73bfc1c 2400 sequences (categorized by `enum composition_method').
ec6d2bb8 2401
b73bfc1c 2402 COMPOSITION_RELATIVE:
ec6d2bb8 2403 ESC 0 CHAR [ CHAR ] ESC 1
8ca3766a 2404 COMPOSITION_WITH_RULE:
ec6d2bb8 2405 ESC 2 CHAR [ RULE CHAR ] ESC 1
b73bfc1c 2406 COMPOSITION_WITH_ALTCHARS:
ec6d2bb8 2407 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
b73bfc1c 2408 COMPOSITION_WITH_RULE_ALTCHARS:
ec6d2bb8 2409 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
4ed46869
KH
2410
/* Table of 256 `enum iso_code_class_type' entries.
   NOTE(review): presumably indexed by byte value to give the ISO2022
   class of each code -- confirm against the code that fills it.  */
2411enum iso_code_class_type iso_code_class[256];
2412
df7492f9
KH
/* Non-zero if the charset whose ID is ID is safe for CODING, i.e. ID
   is within 0..CODING->max_charset_id and the corresponding element
   of CODING->safe_charsets has its high bit clear.

   The table is filled with 255 for charsets that are not safe.  The
   element is examined as an unsigned byte so that the result does
   not depend on whether plain `char' is signed on the host, and a
   negative ID (no charset) is never used as an index.  */

#define SAFE_CHARSET_P(coding, id)                                      \
  ((id) >= 0                                                            \
   && (id) <= (coding)->max_charset_id                                  \
   && ((unsigned char) (coding)->safe_charsets[id]) < 0x80)
2416
2417
/* Non-zero unless CODING_ISO_INITIAL, taken for index 1 of the
   coding system stored in coding_categories[CATEGORY], is negative.  */

#define SHIFT_OUT_OK(category)                                          \
  (! (CODING_ISO_INITIAL (&coding_categories[category], 1) < 0))
2420
2421static void
f0064e1f
DL
2422setup_iso_safe_charsets (attrs)
2423 Lisp_Object attrs;
df7492f9
KH
2424{
2425 Lisp_Object charset_list, safe_charsets;
2426 Lisp_Object request;
2427 Lisp_Object reg_usage;
2428 Lisp_Object tail;
2429 int reg94, reg96;
2430 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2431 int max_charset_id;
2432
2433 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2434 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2435 && ! EQ (charset_list, Viso_2022_charset_list))
2436 {
2437 CODING_ATTR_CHARSET_LIST (attrs)
2438 = charset_list = Viso_2022_charset_list;
2439 ASET (attrs, coding_attr_safe_charsets, Qnil);
2440 }
2441
2442 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2443 return;
2444
2445 max_charset_id = 0;
2446 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2447 {
2448 int id = XINT (XCAR (tail));
2449 if (max_charset_id < id)
2450 max_charset_id = id;
2451 }
d46c5b12 2452
df7492f9
KH
2453 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2454 make_number (255));
2455 request = AREF (attrs, coding_attr_iso_request);
2456 reg_usage = AREF (attrs, coding_attr_iso_usage);
2457 reg94 = XINT (XCAR (reg_usage));
2458 reg96 = XINT (XCDR (reg_usage));
2459
2460 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2461 {
2462 Lisp_Object id;
2463 Lisp_Object reg;
2464 struct charset *charset;
2465
2466 id = XCAR (tail);
2467 charset = CHARSET_FROM_ID (XINT (id));
bf16eb23 2468 reg = Fcdr (Fassq (id, request));
df7492f9 2469 if (! NILP (reg))
8f924df7 2470 SSET (safe_charsets, XINT (id), XINT (reg));
df7492f9
KH
2471 else if (charset->iso_chars_96)
2472 {
2473 if (reg96 < 4)
8f924df7 2474 SSET (safe_charsets, XINT (id), reg96);
df7492f9
KH
2475 }
2476 else
2477 {
2478 if (reg94 < 4)
8f924df7 2479 SSET (safe_charsets, XINT (id), reg94);
df7492f9
KH
2480 }
2481 }
2482 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2483}
d46c5b12 2484
b6871cc7 2485
4ed46869 2486/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
2487 Check if a text is encoded in one of ISO-2022 based codig systems.
2488 If it is, return 1, else return 0. */
4ed46869 2489
0a28aafb 2490static int
ff0dacd7 2491detect_coding_iso_2022 (coding, detect_info)
df7492f9 2492 struct coding_system *coding;
ff0dacd7 2493 struct coding_detection_info *detect_info;
4ed46869 2494{
8f924df7
KH
2495 const unsigned char *src = coding->source, *src_base = src;
2496 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9 2497 int multibytep = coding->src_multibyte;
ff0dacd7 2498 int single_shifting = 0;
df7492f9
KH
2499 int id;
2500 int c, c1;
2501 int consumed_chars = 0;
2502 int i;
ff0dacd7
KH
2503 int rejected = 0;
2504 int found = 0;
2505
2506 detect_info->checked |= CATEGORY_MASK_ISO;
df7492f9
KH
2507
2508 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2509 {
2510 struct coding_system *this = &(coding_categories[i]);
2511 Lisp_Object attrs, val;
2512
2513 attrs = CODING_ID_ATTRS (this->id);
2514 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2515 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2516 setup_iso_safe_charsets (attrs);
2517 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7
KH
2518 this->max_charset_id = SCHARS (val) - 1;
2519 this->safe_charsets = (char *) SDATA (val);
df7492f9
KH
2520 }
2521
2522 /* A coding system of this category is always ASCII compatible. */
2523 src += coding->head_ascii;
3f003981 2524
ff0dacd7 2525 while (rejected != CATEGORY_MASK_ISO)
4ed46869 2526 {
065e3595 2527 src_base = src;
df7492f9 2528 ONE_MORE_BYTE (c);
4ed46869
KH
2529 switch (c)
2530 {
2531 case ISO_CODE_ESC:
74383408
KH
2532 if (inhibit_iso_escape_detection)
2533 break;
f46869e4 2534 single_shifting = 0;
df7492f9 2535 ONE_MORE_BYTE (c);
d46c5b12 2536 if (c >= '(' && c <= '/')
4ed46869 2537 {
bf9cdd4e 2538 /* Designation sequence for a charset of dimension 1. */
df7492f9 2539 ONE_MORE_BYTE (c1);
d46c5b12 2540 if (c1 < ' ' || c1 >= 0x80
df7492f9 2541 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
d46c5b12
KH
2542 /* Invalid designation sequence. Just ignore. */
2543 break;
bf9cdd4e
KH
2544 }
2545 else if (c == '$')
2546 {
2547 /* Designation sequence for a charset of dimension 2. */
df7492f9 2548 ONE_MORE_BYTE (c);
bf9cdd4e
KH
2549 if (c >= '@' && c <= 'B')
2550 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
ff0dacd7 2551 id = iso_charset_table[1][0][c];
bf9cdd4e 2552 else if (c >= '(' && c <= '/')
bcf26d6a 2553 {
df7492f9 2554 ONE_MORE_BYTE (c1);
d46c5b12 2555 if (c1 < ' ' || c1 >= 0x80
df7492f9 2556 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
d46c5b12
KH
2557 /* Invalid designation sequence. Just ignore. */
2558 break;
bcf26d6a 2559 }
bf9cdd4e 2560 else
ff0dacd7 2561 /* Invalid designation sequence. Just ignore it. */
d46c5b12
KH
2562 break;
2563 }
ae9ff118 2564 else if (c == 'N' || c == 'O')
d46c5b12 2565 {
ae9ff118 2566 /* ESC <Fe> for SS2 or SS3. */
ff0dacd7
KH
2567 single_shifting = 1;
2568 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
d46c5b12 2569 break;
4ed46869 2570 }
ec6d2bb8
KH
2571 else if (c >= '0' && c <= '4')
2572 {
2573 /* ESC <Fp> for start/end composition. */
ff0dacd7 2574 found |= CATEGORY_MASK_ISO;
ec6d2bb8
KH
2575 break;
2576 }
bf9cdd4e 2577 else
df7492f9 2578 {
ff0dacd7 2579 /* Invalid escape sequence. Just ignore it. */
df7492f9
KH
2580 break;
2581 }
d46c5b12
KH
2582
2583 /* We found a valid designation sequence for CHARSET. */
ff0dacd7 2584 rejected |= CATEGORY_MASK_ISO_8BIT;
df7492f9
KH
2585 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2586 id))
ff0dacd7 2587 found |= CATEGORY_MASK_ISO_7;
d46c5b12 2588 else
ff0dacd7 2589 rejected |= CATEGORY_MASK_ISO_7;
df7492f9
KH
2590 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2591 id))
ff0dacd7 2592 found |= CATEGORY_MASK_ISO_7_TIGHT;
d46c5b12 2593 else
ff0dacd7 2594 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
df7492f9
KH
2595 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2596 id))
ff0dacd7 2597 found |= CATEGORY_MASK_ISO_7_ELSE;
ae9ff118 2598 else
ff0dacd7 2599 rejected |= CATEGORY_MASK_ISO_7_ELSE;
df7492f9
KH
2600 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2601 id))
ff0dacd7 2602 found |= CATEGORY_MASK_ISO_8_ELSE;
ae9ff118 2603 else
ff0dacd7 2604 rejected |= CATEGORY_MASK_ISO_8_ELSE;
4ed46869
KH
2605 break;
2606
4ed46869 2607 case ISO_CODE_SO:
d46c5b12 2608 case ISO_CODE_SI:
ff0dacd7 2609 /* Locking shift out/in. */
74383408
KH
2610 if (inhibit_iso_escape_detection)
2611 break;
f46869e4 2612 single_shifting = 0;
ff0dacd7
KH
2613 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2614 found |= CATEGORY_MASK_ISO_ELSE;
d46c5b12
KH
2615 break;
2616
4ed46869 2617 case ISO_CODE_CSI:
ff0dacd7 2618 /* Control sequence introducer. */
f46869e4 2619 single_shifting = 0;
ff0dacd7
KH
2620 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2621 found |= CATEGORY_MASK_ISO_8_ELSE;
2622 goto check_extra_latin;
2623
4ed46869
KH
2624 case ISO_CODE_SS2:
2625 case ISO_CODE_SS3:
ff0dacd7
KH
2626 /* Single shift. */
2627 if (inhibit_iso_escape_detection)
2628 break;
75e2a253 2629 single_shifting = 0;
ff0dacd7
KH
2630 rejected |= CATEGORY_MASK_ISO_7BIT;
2631 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2632 & CODING_ISO_FLAG_SINGLE_SHIFT)
75e2a253 2633 found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
ff0dacd7
KH
2634 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2635 & CODING_ISO_FLAG_SINGLE_SHIFT)
75e2a253
KH
2636 found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
2637 if (single_shifting)
2638 break;
ff0dacd7 2639 goto check_extra_latin;
4ed46869
KH
2640
2641 default:
065e3595
KH
2642 if (c < 0)
2643 continue;
4ed46869 2644 if (c < 0x80)
f46869e4
KH
2645 {
2646 single_shifting = 0;
2647 break;
2648 }
ff0dacd7 2649 if (c >= 0xA0)
c4825358 2650 {
ff0dacd7
KH
2651 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2652 found |= CATEGORY_MASK_ISO_8_1;
f46869e4 2653 /* Check the length of succeeding codes of the range
ff0dacd7
KH
2654 0xA0..0FF. If the byte length is even, we include
2655 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2656 only when we are not single shifting. */
2657 if (! single_shifting
2658 && ! (rejected & CATEGORY_MASK_ISO_8_2))
f46869e4 2659 {
e17de821 2660 int i = 1;
b73bfc1c
KH
2661 while (src < src_end)
2662 {
df7492f9 2663 ONE_MORE_BYTE (c);
b73bfc1c
KH
2664 if (c < 0xA0)
2665 break;
2666 i++;
2667 }
2668
2669 if (i & 1 && src < src_end)
ff0dacd7 2670 rejected |= CATEGORY_MASK_ISO_8_2;
f46869e4 2671 else
ff0dacd7 2672 found |= CATEGORY_MASK_ISO_8_2;
f46869e4 2673 }
ff0dacd7 2674 break;
4ed46869 2675 }
ff0dacd7
KH
2676 check_extra_latin:
2677 single_shifting = 0;
2678 if (! VECTORP (Vlatin_extra_code_table)
2679 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2680 {
2681 rejected = CATEGORY_MASK_ISO;
2682 break;
2683 }
2684 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2685 & CODING_ISO_FLAG_LATIN_EXTRA)
2686 found |= CATEGORY_MASK_ISO_8_1;
2687 else
2688 rejected |= CATEGORY_MASK_ISO_8_1;
75e2a253 2689 rejected |= CATEGORY_MASK_ISO_8_2;
4ed46869
KH
2690 }
2691 }
ff0dacd7
KH
2692 detect_info->rejected |= CATEGORY_MASK_ISO;
2693 return 0;
4ed46869 2694
df7492f9 2695 no_more_source:
ff0dacd7
KH
2696 detect_info->rejected |= rejected;
2697 detect_info->found |= (found & ~rejected);
df7492f9 2698 return 1;
4ed46869 2699}
ec6d2bb8 2700
4ed46869
KH
2701
/* Set designation state into CODING: record that the charset
   selected by DIM, CHARS_96 and the final byte FINAL is designated
   to register REG.

   If FINAL is outside '0'..127, no charset is found in
   ISO_CHARSET_TABLE, or the charset is not safe for CODING, the
   designation of REG is set to -2 and control jumps to
   `invalid_code'.

   charset_jisx0201_roman is replaced by charset_ascii when
   CODING_ISO_FLAG_USE_ROMAN is set, and charset_jisx0208_1978 by
   charset_jisx0208 when CODING_ISO_FLAG_USE_OLDJIS is set.  */

#define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
  do {                                                                  \
    int cs, was;                                                        \
                                                                        \
    cs = ((final < '0' || final >= 128)                                 \
          ? -1                                                          \
          : ISO_CHARSET_TABLE (dim, chars_96, final));                  \
    if (cs < 0 || ! SAFE_CHARSET_P (coding, cs))                        \
      {                                                                 \
        CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
        goto invalid_code;                                              \
      }                                                                 \
    was = CODING_ISO_DESIGNATION (coding, reg);                         \
    if (cs == charset_jisx0201_roman)                                   \
      cs = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)     \
            ? charset_ascii : cs);                                      \
    else if (cs == charset_jisx0208_1978)                               \
      cs = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)    \
            ? charset_jisx0208 : cs);                                   \
    CODING_ISO_DESIGNATION (coding, reg) = cs;                          \
    /* If there was an invalid designation to REG previously, and      \
       this designation is ASCII to REG, we should keep this           \
       designation sequence.  */                                        \
    if (was == -2 && cs == charset_ascii)                               \
      goto invalid_code;                                                \
  } while (0)
2732
d46c5b12 2733
df7492f9
KH
/* If a composition is in progress, abandon it: copy the characters
   collected so far in `components' to CHARBUF as ordinary characters,
   advance `char_offset' by the number of characters stored, and reset
   `composition_state' to COMPOSING_NO.  Does nothing when no
   composition is in progress.

   Jumps to the enclosing function's `no_more_source' label when
   CHARBUF lacks room for `component_idx' more elements.  */

#define MAYBE_FINISH_COMPOSITION()                                      \
  do {                                                                  \
    int i;                                                              \
    if (composition_state == COMPOSING_NO)                              \
      break;                                                            \
    /* Make sure there is room for the characters stored in the        \
       table `components'.  */                                          \
    if (charbuf + component_idx > charbuf_end)                          \
      goto no_more_source;                                              \
    composition_state = COMPOSING_NO;                                   \
    if (method == COMPOSITION_RELATIVE                                  \
        || method == COMPOSITION_WITH_ALTCHARS)                         \
      {                                                                 \
        /* Every element of `components' is a character.  */            \
        for (i = 0; i < component_idx; i++)                             \
          *charbuf++ = components[i];                                   \
        char_offset += component_idx;                                   \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* Elements alternate CHAR RULE CHAR ...; emit the even ones.  \
           That stores (component_idx + 1) / 2 characters, so advance  \
           char_offset by exactly that amount.  */                      \
        for (i = 0; i < component_idx; i += 2)                          \
          *charbuf++ = components[i];                                   \
        char_offset += (component_idx + 1) / 2;                         \
      }                                                                 \
  } while (0)
2758
d46c5b12 2759
aa72b389
KH
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   C1 is the byte that followed ESC.  When C1 is '0' and we are in
   the COMPOSING_COMPONENT_RULE state, the sequence ends the
   alternate components: remember their count in `component_len'
   and switch to COMPOSING_CHAR.

   Otherwise a new composition starts.  Any composition in progress
   is finished first.  The composition is accepted only when the
   terminator ESC 1 is found in the remaining source; if it is not,
   jump to `invalid_code' for the last block, or to `no_more_source'
   to retry once more source is available.  */

#define DECODE_COMPOSITION_START(c1)                                    \
  do {                                                                  \
    if (c1 == '0'                                                       \
        && composition_state == COMPOSING_COMPONENT_RULE)               \
      {                                                                 \
        component_len = component_idx;                                  \
        composition_state = COMPOSING_CHAR;                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        const unsigned char *p;                                         \
                                                                        \
        MAYBE_FINISH_COMPOSITION ();                                    \
        if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end)         \
          goto no_more_source;                                          \
        for (p = src; p < src_end - 1; p++)                             \
          if (*p == ISO_CODE_ESC && p[1] == '1')                        \
            break;                                                      \
        /* `>=' rather than `==': when SRC is already at SRC_END the   \
           loop never runs and P is beyond SRC_END - 1; that too       \
           means that no terminator was found.  */                      \
        if (p >= src_end - 1)                                           \
          {                                                             \
            if (coding->mode & CODING_MODE_LAST_BLOCK)                  \
              goto invalid_code;                                        \
            goto no_more_source;                                        \
          }                                                             \
                                                                        \
        /* This is surely the start of a composition.  */               \
        method = (c1 == '0' ? COMPOSITION_RELATIVE                      \
                  : c1 == '2' ? COMPOSITION_WITH_RULE                   \
                  : c1 == '3' ? COMPOSITION_WITH_ALTCHARS               \
                  : COMPOSITION_WITH_RULE_ALTCHARS);                    \
        composition_state = (c1 <= '2' ? COMPOSING_CHAR                 \
                             : COMPOSING_COMPONENT_CHAR);               \
        component_idx = component_len = 0;                              \
      }                                                                 \
  } while (0)
2802
ec6d2bb8 2803
df7492f9
KH
/* Handle composition end sequence ESC 1.

   Store a composition annotation in CHARBUF, followed (except for
   relative composition) by the composition components, and then the
   composed characters themselves; `char_offset' advances by one per
   composed character.  Finally mark CODING as annotated and leave
   the composing state.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    int k;                                                              \
    int composed_len;                                                   \
    int *annotation_head = charbuf;                                     \
    int from = char_offset;                                             \
    int to;                                                             \
                                                                        \
    /* Number of characters covered by the composition.  */             \
    if (component_len > 0)                                              \
      composed_len = component_idx - component_len;                     \
    else if (method == COMPOSITION_RELATIVE)                            \
      composed_len = component_idx;                                     \
    else                                                                \
      composed_len = (component_idx + 1) / 2;                           \
    to = from + composed_len;                                           \
                                                                        \
    ADD_COMPOSITION_DATA (charbuf, from, to, method);                   \
    if (method != COMPOSITION_RELATIVE)                                 \
      {                                                                 \
        /* Append the components to the annotation and fix up the      \
           annotation's first element accordingly.  */                  \
        int ncomponents                                                 \
          = (component_len == 0 ? component_idx : component_len);       \
                                                                        \
        for (k = 0; k < ncomponents; k++)                               \
          *charbuf++ = components[k];                                   \
        *annotation_head = annotation_head - charbuf;                   \
      }                                                                 \
    if (method == COMPOSITION_WITH_RULE)                                \
      {                                                                 \
        for (k = 0; k < component_idx; k += 2, char_offset++)           \
          *charbuf++ = components[k];                                   \
      }                                                                 \
    else                                                                \
      {                                                                 \
        for (k = component_len; k < component_idx; k++, char_offset++)  \
          *charbuf++ = components[k];                                   \
      }                                                                 \
    coding->annotated = 1;                                              \
    composition_state = COMPOSING_NO;                                   \
  } while (0)
2836
df7492f9 2837
ec6d2bb8
KH
/* Decode a composition rule from the byte C1 (and maybe one more byte
   read from SRC into the enclosing scope's variable `c2'), and store
   the encoded rule back into C1.  C1 is set to 0 when the byte does
   not denote a rule.  */

#define DECODE_COMPOSITION_RULE(c1)                                     \
  do {                                                                  \
    (c1) -= 32;                                                         \
    if ((c1) >= 93)                                                     \
      (c1) = 0;                                                         \
    else if ((c1) >= 81)        /* new format (after ver.21) */         \
      {                                                                 \
        ONE_MORE_BYTE (c2);                                             \
        (c1) = COMPOSITION_ENCODE_RULE ((c1) - 81, c2 - 32);            \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int gref = (c1) / 9;                                            \
        int nref = (c1) % 9;                                            \
                                                                        \
        /* Reference point 4 of the old format is remapped to 10.  */   \
        (c1) = COMPOSITION_ENCODE_RULE (gref == 4 ? 10 : gref,          \
                                        nref == 4 ? 10 : nref);         \
      }                                                                 \
  } while (0)
88993dfd 2861
d46c5b12 2862
4ed46869
KH
2863/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2864
b73bfc1c 2865static void
df7492f9 2866decode_coding_iso_2022 (coding)
4ed46869 2867 struct coding_system *coding;
4ed46869 2868{
8f924df7
KH
2869 const unsigned char *src = coding->source + coding->consumed;
2870 const unsigned char *src_end = coding->source + coding->src_bytes;
2871 const unsigned char *src_base;
df7492f9 2872 int *charbuf = coding->charbuf;
ff0dacd7
KH
2873 int *charbuf_end
2874 = charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
df7492f9 2875 int consumed_chars = 0, consumed_chars_base;
df7492f9 2876 int multibytep = coding->src_multibyte;
4ed46869 2877 /* Charsets invoked to graphic plane 0 and 1 respectively. */
df7492f9
KH
2878 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2879 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2880 struct charset *charset;
2881 int c;
2882 /* For handling composition sequence. */
2883#define COMPOSING_NO 0
2884#define COMPOSING_CHAR 1
2885#define COMPOSING_RULE 2
2886#define COMPOSING_COMPONENT_CHAR 3
2887#define COMPOSING_COMPONENT_RULE 4
2888
2889 int composition_state = COMPOSING_NO;
2890 enum composition_method method;
2891 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2892 int component_idx;
2893 int component_len;
24a73b0a 2894 Lisp_Object attrs, charset_list;
ff0dacd7
KH
2895 int char_offset = coding->produced_char;
2896 int last_offset = char_offset;
2897 int last_id = charset_ascii;
df7492f9 2898
24a73b0a 2899 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9 2900 setup_iso_safe_charsets (attrs);
b73bfc1c
KH
2901
2902 while (1)
4ed46869 2903 {
463f5630 2904 int c1, c2;
b73bfc1c
KH
2905
2906 src_base = src;
df7492f9
KH
2907 consumed_chars_base = consumed_chars;
2908
2909 if (charbuf >= charbuf_end)
2910 break;
2911
b73bfc1c 2912 ONE_MORE_BYTE (c1);
065e3595
KH
2913 if (c1 < 0)
2914 goto invalid_code;
4ed46869 2915
98725083 2916 /* We produce at most one character. */
4ed46869
KH
2917 switch (iso_code_class [c1])
2918 {
2919 case ISO_0x20_or_0x7F:
df7492f9 2920 if (composition_state != COMPOSING_NO)
ec6d2bb8 2921 {
df7492f9
KH
2922 if (composition_state == COMPOSING_RULE
2923 || composition_state == COMPOSING_COMPONENT_RULE)
2924 {
2925 DECODE_COMPOSITION_RULE (c1);
2926 components[component_idx++] = c1;
2927 composition_state--;
2928 continue;
2929 }
4ed46869 2930 }
df7492f9
KH
2931 if (charset_id_0 < 0
2932 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
781d7a48
KH
2933 /* This is SPACE or DEL. */
2934 charset = CHARSET_FROM_ID (charset_ascii);
2935 else
2936 charset = CHARSET_FROM_ID (charset_id_0);
2937 break;
4ed46869
KH
2938
2939 case ISO_graphic_plane_0:
781d7a48 2940 if (composition_state != COMPOSING_NO)
b73bfc1c 2941 {
781d7a48
KH
2942 if (composition_state == COMPOSING_RULE
2943 || composition_state == COMPOSING_COMPONENT_RULE)
2944 {
2945 DECODE_COMPOSITION_RULE (c1);
2946 components[component_idx++] = c1;
2947 composition_state--;
2948 continue;
2949 }
b73bfc1c 2950 }
df7492f9 2951 charset = CHARSET_FROM_ID (charset_id_0);
4ed46869
KH
2952 break;
2953
2954 case ISO_0xA0_or_0xFF:
df7492f9
KH
2955 if (charset_id_1 < 0
2956 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2957 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2958 goto invalid_code;
4ed46869
KH
2959 /* This is a graphic character, we fall down ... */
2960
2961 case ISO_graphic_plane_1:
df7492f9
KH
2962 if (charset_id_1 < 0)
2963 goto invalid_code;
2964 charset = CHARSET_FROM_ID (charset_id_1);
4ed46869
KH
2965 break;
2966
df7492f9
KH
2967 case ISO_control_0:
2968 MAYBE_FINISH_COMPOSITION ();
2969 charset = CHARSET_FROM_ID (charset_ascii);
4ed46869
KH
2970 break;
2971
df7492f9
KH
2972 case ISO_control_1:
2973 MAYBE_FINISH_COMPOSITION ();
2974 goto invalid_code;
2975
4ed46869 2976 case ISO_shift_out:
df7492f9
KH
2977 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2978 || CODING_ISO_DESIGNATION (coding, 1) < 0)
2979 goto invalid_code;
2980 CODING_ISO_INVOCATION (coding, 0) = 1;
2981 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 2982 continue;
4ed46869
KH
2983
2984 case ISO_shift_in:
df7492f9
KH
2985 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
2986 goto invalid_code;
2987 CODING_ISO_INVOCATION (coding, 0) = 0;
2988 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 2989 continue;
4ed46869
KH
2990
2991 case ISO_single_shift_2_7:
2992 case ISO_single_shift_2:
df7492f9
KH
2993 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2994 goto invalid_code;
4ed46869
KH
2995 /* SS2 is handled as an escape sequence of ESC 'N' */
2996 c1 = 'N';
2997 goto label_escape_sequence;
2998
2999 case ISO_single_shift_3:
df7492f9
KH
3000 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3001 goto invalid_code;
4ed46869
KH
3002 /* SS2 is handled as an escape sequence of ESC 'O' */
3003 c1 = 'O';
3004 goto label_escape_sequence;
3005
3006 case ISO_control_sequence_introducer:
3007 /* CSI is handled as an escape sequence of ESC '[' ... */
3008 c1 = '[';
3009 goto label_escape_sequence;
3010
3011 case ISO_escape:
3012 ONE_MORE_BYTE (c1);
3013 label_escape_sequence:
df7492f9 3014 /* Escape sequences handled here are invocation,
4ed46869
KH
3015 designation, direction specification, and character
3016 composition specification. */
3017 switch (c1)
3018 {
3019 case '&': /* revision of following character set */
3020 ONE_MORE_BYTE (c1);
3021 if (!(c1 >= '@' && c1 <= '~'))
df7492f9 3022 goto invalid_code;
4ed46869
KH
3023 ONE_MORE_BYTE (c1);
3024 if (c1 != ISO_CODE_ESC)
df7492f9 3025 goto invalid_code;
4ed46869
KH
3026 ONE_MORE_BYTE (c1);
3027 goto label_escape_sequence;
3028
3029 case '$': /* designation of 2-byte character set */
df7492f9
KH
3030 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3031 goto invalid_code;
4ed46869
KH
3032 ONE_MORE_BYTE (c1);
3033 if (c1 >= '@' && c1 <= 'B')
3034 { /* designation of JISX0208.1978, GB2312.1980,
88993dfd 3035 or JISX0208.1980 */
df7492f9 3036 DECODE_DESIGNATION (0, 2, 0, c1);
4ed46869
KH
3037 }
3038 else if (c1 >= 0x28 && c1 <= 0x2B)
3039 { /* designation of DIMENSION2_CHARS94 character set */
3040 ONE_MORE_BYTE (c2);
df7492f9 3041 DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2);
4ed46869
KH
3042 }
3043 else if (c1 >= 0x2C && c1 <= 0x2F)
3044 { /* designation of DIMENSION2_CHARS96 character set */
3045 ONE_MORE_BYTE (c2);
df7492f9 3046 DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2);
4ed46869
KH
3047 }
3048 else
df7492f9 3049 goto invalid_code;
b73bfc1c 3050 /* We must update these variables now. */
df7492f9
KH
3051 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3052 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
b73bfc1c 3053 continue;
4ed46869
KH
3054
3055 case 'n': /* invocation of locking-shift-2 */
df7492f9
KH
3056 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3057 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3058 goto invalid_code;
3059 CODING_ISO_INVOCATION (coding, 0) = 2;
3060 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3061 continue;
4ed46869
KH
3062
3063 case 'o': /* invocation of locking-shift-3 */
df7492f9
KH
3064 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3065 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3066 goto invalid_code;
3067 CODING_ISO_INVOCATION (coding, 0) = 3;
3068 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3069 continue;
4ed46869
KH
3070
3071 case 'N': /* invocation of single-shift-2 */
df7492f9
KH
3072 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3073 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3074 goto invalid_code;
3075 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2));
b73bfc1c 3076 ONE_MORE_BYTE (c1);
e7046a18 3077 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
df7492f9 3078 goto invalid_code;
4ed46869
KH
3079 break;
3080
3081 case 'O': /* invocation of single-shift-3 */
df7492f9
KH
3082 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3083 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3084 goto invalid_code;
3085 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3));
b73bfc1c 3086 ONE_MORE_BYTE (c1);
e7046a18 3087 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
df7492f9 3088 goto invalid_code;
4ed46869
KH
3089 break;
3090
ec6d2bb8 3091 case '0': case '2': case '3': case '4': /* start composition */
df7492f9
KH
3092 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3093 goto invalid_code;
ec6d2bb8 3094 DECODE_COMPOSITION_START (c1);
b73bfc1c 3095 continue;
4ed46869 3096
ec6d2bb8 3097 case '1': /* end composition */
df7492f9
KH
3098 if (composition_state == COMPOSING_NO)
3099 goto invalid_code;
3100 DECODE_COMPOSITION_END ();
b73bfc1c 3101 continue;
4ed46869
KH
3102
3103 case '[': /* specification of direction */
df7492f9
KH
3104 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3105 goto invalid_code;
4ed46869 3106 /* For the moment, nested direction is not supported.
d46c5b12 3107 So, `coding->mode & CODING_MODE_DIRECTION' zero means
df7492f9 3108 left-to-right, and nozero means right-to-left. */
4ed46869
KH
3109 ONE_MORE_BYTE (c1);
3110 switch (c1)
3111 {
3112 case ']': /* end of the current direction */
d46c5b12 3113 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869
KH
3114
3115 case '0': /* end of the current direction */
3116 case '1': /* start of left-to-right direction */
3117 ONE_MORE_BYTE (c1);
3118 if (c1 == ']')
d46c5b12 3119 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869 3120 else
df7492f9 3121 goto invalid_code;
4ed46869
KH
3122 break;
3123
3124 case '2': /* start of right-to-left direction */
3125 ONE_MORE_BYTE (c1);
3126 if (c1 == ']')
d46c5b12 3127 coding->mode |= CODING_MODE_DIRECTION;
4ed46869 3128 else
df7492f9 3129 goto invalid_code;
4ed46869
KH
3130 break;
3131
3132 default:
df7492f9 3133 goto invalid_code;
4ed46869 3134 }
b73bfc1c 3135 continue;
4ed46869 3136
103e0180 3137 case '%':
103e0180
KH
3138 ONE_MORE_BYTE (c1);
3139 if (c1 == '/')
3140 {
3141 /* CTEXT extended segment:
3142 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3143 We keep these bytes as is for the moment.
3144 They may be decoded by post-read-conversion. */
3145 int dim, M, L;
4776e638 3146 int size;
8f924df7 3147
103e0180
KH
3148 ONE_MORE_BYTE (dim);
3149 ONE_MORE_BYTE (M);
3150 ONE_MORE_BYTE (L);
3151 size = ((M - 128) * 128) + (L - 128);
4776e638
KH
3152 if (charbuf + 8 + size > charbuf_end)
3153 goto break_loop;
3154 *charbuf++ = ISO_CODE_ESC;
3155 *charbuf++ = '%';
3156 *charbuf++ = '/';
3157 *charbuf++ = dim;
3158 *charbuf++ = BYTE8_TO_CHAR (M);
3159 *charbuf++ = BYTE8_TO_CHAR (L);
103e0180
KH
3160 while (size-- > 0)
3161 {
3162 ONE_MORE_BYTE (c1);
4776e638 3163 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
103e0180 3164 }
103e0180
KH
3165 }
3166 else if (c1 == 'G')
3167 {
103e0180
KH
3168 /* XFree86 extension for embedding UTF-8 in CTEXT:
3169 ESC % G --UTF-8-BYTES-- ESC % @
3170 We keep these bytes as is for the moment.
3171 They may be decoded by post-read-conversion. */
4776e638
KH
3172 int *p = charbuf;
3173
3174 if (p + 6 > charbuf_end)
3175 goto break_loop;
3176 *p++ = ISO_CODE_ESC;
3177 *p++ = '%';
3178 *p++ = 'G';
3179 while (p < charbuf_end)
103e0180
KH
3180 {
3181 ONE_MORE_BYTE (c1);
3182 if (c1 == ISO_CODE_ESC
3183 && src + 1 < src_end
3184 && src[0] == '%'
3185 && src[1] == '@')
3186 break;
4776e638 3187 *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
103e0180 3188 }
4776e638
KH
3189 if (p + 3 > charbuf_end)
3190 goto break_loop;
3191 *p++ = ISO_CODE_ESC;
3192 *p++ = '%';
3193 *p++ = '@';
3194 charbuf = p;
103e0180
KH
3195 }
3196 else
4776e638 3197 goto invalid_code;
103e0180 3198 continue;
4776e638 3199 break;
103e0180 3200
4ed46869 3201 default:
df7492f9
KH
3202 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3203 goto invalid_code;
4ed46869
KH
3204 if (c1 >= 0x28 && c1 <= 0x2B)
3205 { /* designation of DIMENSION1_CHARS94 character set */
3206 ONE_MORE_BYTE (c2);
df7492f9 3207 DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2);
4ed46869
KH
3208 }
3209 else if (c1 >= 0x2C && c1 <= 0x2F)
3210 { /* designation of DIMENSION1_CHARS96 character set */
3211 ONE_MORE_BYTE (c2);
df7492f9 3212 DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2);
4ed46869
KH
3213 }
3214 else
df7492f9 3215 goto invalid_code;
b73bfc1c 3216 /* We must update these variables now. */
df7492f9
KH
3217 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3218 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
b73bfc1c 3219 continue;
4ed46869 3220 }
b73bfc1c 3221 }
4ed46869 3222
ff0dacd7
KH
3223 if (charset->id != charset_ascii
3224 && last_id != charset->id)
3225 {
3226 if (last_id != charset_ascii)
3227 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3228 last_id = charset->id;
3229 last_offset = char_offset;
3230 }
3231
b73bfc1c 3232 /* Now we know CHARSET and 1st position code C1 of a character.
df7492f9
KH
3233 Produce a decoded character while getting 2nd position code
3234 C2 if necessary. */
3235 c1 &= 0x7F;
3236 if (CHARSET_DIMENSION (charset) > 1)
b73bfc1c
KH
3237 {
3238 ONE_MORE_BYTE (c2);
df7492f9 3239 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
b73bfc1c 3240 /* C2 is not in a valid range. */
df7492f9
KH
3241 goto invalid_code;
3242 c1 = (c1 << 8) | (c2 & 0x7F);
3243 if (CHARSET_DIMENSION (charset) > 2)
3244 {
3245 ONE_MORE_BYTE (c2);
3246 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3247 /* C2 is not in a valid range. */
3248 goto invalid_code;
3249 c1 = (c1 << 8) | (c2 & 0x7F);
3250 }
3251 }
3252
3253 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3254 if (c < 0)
3255 {
3256 MAYBE_FINISH_COMPOSITION ();
3257 for (; src_base < src; src_base++, char_offset++)
3258 {
3259 if (ASCII_BYTE_P (*src_base))
3260 *charbuf++ = *src_base;
3261 else
3262 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3263 }
3264 }
3265 else if (composition_state == COMPOSING_NO)
3266 {
3267 *charbuf++ = c;
3268 char_offset++;
4ed46869 3269 }
df7492f9 3270 else
781d7a48
KH
3271 {
3272 components[component_idx++] = c;
3273 if (method == COMPOSITION_WITH_RULE
3274 || (method == COMPOSITION_WITH_RULE_ALTCHARS
3275 && composition_state == COMPOSING_COMPONENT_CHAR))
3276 composition_state++;
4ed46869
KH
3277 }
3278 continue;
3279
df7492f9
KH
3280 invalid_code:
3281 MAYBE_FINISH_COMPOSITION ();
4ed46869 3282 src = src_base;
df7492f9
KH
3283 consumed_chars = consumed_chars_base;
3284 ONE_MORE_BYTE (c);
065e3595 3285 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 3286 char_offset++;
df7492f9 3287 coding->errors++;
4776e638
KH
3288 continue;
3289
3290 break_loop:
3291 break;
4ed46869 3292 }
fb88bf2d 3293
df7492f9 3294 no_more_source:
ff0dacd7
KH
3295 if (last_id != charset_ascii)
3296 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
df7492f9
KH
3297 coding->consumed_char += consumed_chars_base;
3298 coding->consumed = src_base - coding->source;
3299 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
3300}
3301
b73bfc1c 3302
f4dee582 3303/* ISO2022 encoding stuff. */
4ed46869
KH
3304
3305/*
f4dee582 3306 It is not enough to say just "ISO2022" on encoding, we have to
df7492f9 3307 specify more details. In Emacs, each coding system of ISO2022
4ed46869 3308 variant has the following specifications:
df7492f9 3309 1. Initial designation to G0 thru G3.
4ed46869
KH
3310 2. Allows short-form designation?
3311 3. ASCII should be designated to G0 before control characters?
3312 4. ASCII should be designated to G0 at end of line?
3313 5. 7-bit environment or 8-bit environment?
3314 6. Use locking-shift?
3315 7. Use Single-shift?
3316 And the following two are only for Japanese:
3317 8. Use ASCII in place of JIS0201-1976-Roman?
3318 9. Use JISX0208-1983 in place of JISX0208-1978?
df7492f9
KH
3319 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3320 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
f4dee582 3321 details.
4ed46869
KH
3322*/
3323
/* Emit the escape sequence that designates CHARSET to graphic
   register REG, and record the designation in CODING.  When the
   coding system asks for revision numbers and CHARSET has one, the
   sequence is preceded by ESC & <revision>.  For a 94-character
   multibyte charset designated to register 0 whose final character
   is '@', 'A', or 'B', the short form (without the intermediate
   character) is used unless CODING requests the long form.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
    /* Intermediate characters, indexed by graphic register.  */        \
    const char *designation_intermediates                               \
      = (CHARSET_ISO_CHARS_96 (charset) ? ",-./" : "()*+");             \
    int revision = -1;                                                  \
                                                                        \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
      revision = CHARSET_ISO_REVISION (charset);                        \
                                                                        \
    if (revision >= 0)                                                  \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + revision);                                 \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) == 1)                               \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE (designation_intermediates[reg]);           \
      }                                                                 \
    else                                                                \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        /* Only the short form omits the intermediate character.  */    \
        if (CHARSET_ISO_CHARS_96 (charset)                              \
            || (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM)  \
            || reg != 0                                                 \
            || final_char < '@' || final_char > 'B')                    \
          {                                                             \
            EMIT_ONE_ASCII_BYTE (designation_intermediates[reg]);       \
          }                                                             \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (final_char);                                   \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
3371
df7492f9 3372
4ed46869
KH
/* The following two macros produce the code for the ISO2022
   single-shift functions (single-shift-2 and single-shift-3): the
   escape sequence form when CODING is a 7-bit environment, the
   single control character otherwise.  Both also record in CODING
   that a single shift is pending.  */

#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      {                                                                 \
        EMIT_ONE_BYTE (ISO_CODE_SS2);                                   \
      }                                                                 \
    else                                                                \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                       \
      }                                                                 \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)


#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      {                                                                 \
        EMIT_ONE_BYTE (ISO_CODE_SS3);                                   \
      }                                                                 \
    else                                                                \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                       \
      }                                                                 \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
3395
df7492f9 3396
4ed46869
KH
/* The following four macros produce codes (control character or
   escape sequence) for ISO2022 locking-shift functions (shift-in,
   shift-out, locking-shift-2, and locking-shift-3), and record in
   CODING which graphic register is now invoked to graphic plane 0.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
  } while (0)


#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
  } while (0)


#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
  } while (0)


/* Locking-shift-3 is ESC 'o'; ESC 'n' is locking-shift-2.  The
   decoder accepts exactly these two final bytes.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
3427
df7492f9 3428
f4dee582
RS
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be an lvalue: when CODING asks for JISX0201-Roman in
   place of ASCII, it is replaced by that charset.  The macro loops
   until the character has been emitted: while CHARSET is invoked to
   neither graphic plane, it calls encode_invocation_designation and
   tries again.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single shift is pending; it covers this character only.  */ \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
3471
df7492f9 3472
f4dee582
RS
/* Produce codes for a DIMENSION2 character whose character set is
   CHARSET and whose position-codes are C1 and C2.  Designation and
   invocation codes are also produced in advance if necessary.

   CHARSET must be an lvalue: when CODING asks for JISX0208-1978 in
   place of JISX0208, it is replaced by that charset.  Each pass of
   the loop either emits the two bytes and leaves, or invokes
   (designating first if need be) CHARSET and tries again.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if (id == charset_jisx0208                                          \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        id = charset_jisx0208_1978;                                     \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single shift is pending; it covers this character only.  */ \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        else                                                            \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))                   \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))                   \
      {                                                                 \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not yet invoked to either graphic plane.  Invoke it  \
       (designating it to a graphic register first if necessary) and   \
       go around again to actually produce the character.  */           \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
3515
05e6f5dc 3516
df7492f9
KH
/* Produce codes for the character C of CHARSET: encode C to its code
   point in CHARSET, then emit it as a one-byte character when the
   dimension of CHARSET is 1, and otherwise as a two-byte character
   made of the code point shifted right by 8 and its low 8 bits.  */

#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    int code = ENCODE_CHAR ((charset),(c));                                \
                                                                           \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      {                                                                    \
        ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                 \
      }                                                                    \
  } while (0)
bdd9fb48 3526
05e6f5dc 3527
4ed46869 3528/* Produce designation and invocation codes at a place pointed by DST
df7492f9 3529 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
4ed46869
KH
3530 Return new DST. */
3531
3532unsigned char *
df7492f9
KH
3533encode_invocation_designation (charset, coding, dst, p_nchars)
3534 struct charset *charset;
4ed46869
KH
3535 struct coding_system *coding;
3536 unsigned char *dst;
df7492f9 3537 int *p_nchars;
4ed46869 3538{
df7492f9
KH
3539 int multibytep = coding->dst_multibyte;
3540 int produced_chars = *p_nchars;
4ed46869 3541 int reg; /* graphic register number */
df7492f9 3542 int id = CHARSET_ID (charset);
4ed46869
KH
3543
3544 /* At first, check designations. */
3545 for (reg = 0; reg < 4; reg++)
df7492f9 3546 if (id == CODING_ISO_DESIGNATION (coding, reg))
4ed46869
KH
3547 break;
3548
3549 if (reg >= 4)
3550 {
3551 /* CHARSET is not yet designated to any graphic registers. */
3552 /* At first check the requested designation. */
df7492f9
KH
3553 reg = CODING_ISO_REQUEST (coding, id);
3554 if (reg < 0)
1ba9e4ab
KH
3555 /* Since CHARSET requests no special designation, designate it
3556 to graphic register 0. */
4ed46869
KH
3557 reg = 0;
3558
3559 ENCODE_DESIGNATION (charset, reg, coding);
3560 }
3561
df7492f9
KH
3562 if (CODING_ISO_INVOCATION (coding, 0) != reg
3563 && CODING_ISO_INVOCATION (coding, 1) != reg)
4ed46869
KH
3564 {
3565 /* Since the graphic register REG is not invoked to any graphic
3566 planes, invoke it to graphic plane 0. */
3567 switch (reg)
3568 {
3569 case 0: /* graphic register 0 */
3570 ENCODE_SHIFT_IN;
3571 break;
3572
3573 case 1: /* graphic register 1 */
3574 ENCODE_SHIFT_OUT;
3575 break;
3576
3577 case 2: /* graphic register 2 */
df7492f9 3578 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4ed46869
KH
3579 ENCODE_SINGLE_SHIFT_2;
3580 else
3581 ENCODE_LOCKING_SHIFT_2;
3582 break;
3583
3584 case 3: /* graphic register 3 */
df7492f9 3585 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4ed46869
KH
3586 ENCODE_SINGLE_SHIFT_3;
3587 else
3588 ENCODE_LOCKING_SHIFT_3;
3589 break;
3590 }
3591 }
b73bfc1c 3592
df7492f9 3593 *p_nchars = produced_chars;
4ed46869
KH
3594 return dst;
3595}
3596
df7492f9
KH
/* The following three macros produce codes for indicating direction
   of text.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER emits CSI: as ESC '[' when the
   seven-bits flag of CODING is set, as the single byte ISO_CODE_CSI
   otherwise.  The flag is tested with `&'; comparing the whole flag
   word for equality would select the 7-bit form only when no other
   flag is set.  It is an object-like macro, so it is used without
   an argument list.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
  do {                                                                  \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
    else                                                                \
      EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
  } while (0)


/* Start of right-to-left text: CSI 2 ].  */
#define ENCODE_DIRECTION_R2L()                                          \
  do {                                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                                 \
    EMIT_TWO_ASCII_BYTES ('2', ']');                                    \
  } while (0)


/* End of the current direction, i.e. back to left-to-right: CSI 0 ].  */
#define ENCODE_DIRECTION_L2R()                                          \
  do {                                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                                 \
    EMIT_TWO_ASCII_BYTES ('0', ']');                                    \
  } while (0)
4ed46869 3620
4ed46869
KH
3621
/* Produce codes for designation and invocation to reset the graphic
   planes and registers to initial state: shift in when graphic
   register 0 is not the one invoked to graphic plane 0, then
   re-designate every graphic register that has an initial charset
   but currently holds a different designation.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int gr = 0;                                                         \
    struct charset *initial_charset;                                    \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      {                                                                 \
        ENCODE_SHIFT_IN;                                                \
      }                                                                 \
    while (gr < 4)                                                      \
      {                                                                 \
        if (CODING_ISO_INITIAL (coding, gr) >= 0                        \
            && (CODING_ISO_DESIGNATION (coding, gr)                     \
                != CODING_ISO_INITIAL (coding, gr)))                    \
          {                                                             \
            initial_charset                                             \
              = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, gr));      \
            ENCODE_DESIGNATION (initial_charset, gr, coding);           \
          }                                                             \
        gr++;                                                           \
      }                                                                 \
  } while (0)
3640
df7492f9 3641
bdd9fb48 3642/* Produce designation sequences of charsets in the line started from
b73bfc1c 3643 SRC to a place pointed by DST, and return updated DST.
bdd9fb48
KH
3644
3645 If the current block ends before any end-of-line, we may fail to
d46c5b12
KH
3646 find all the necessary designations. */
3647
b73bfc1c 3648static unsigned char *
df7492f9 3649encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
e0e989f6 3650 struct coding_system *coding;
df7492f9
KH
3651 int *charbuf, *charbuf_end;
3652 unsigned char *dst;
e0e989f6 3653{
df7492f9 3654 struct charset *charset;
bdd9fb48
KH
3655 /* Table of charsets to be designated to each graphic register. */
3656 int r[4];
df7492f9
KH
3657 int c, found = 0, reg;
3658 int produced_chars = 0;
3659 int multibytep = coding->dst_multibyte;
3660 Lisp_Object attrs;
3661 Lisp_Object charset_list;
3662
3663 attrs = CODING_ID_ATTRS (coding->id);
3664 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3665 if (EQ (charset_list, Qiso_2022))
3666 charset_list = Viso_2022_charset_list;
bdd9fb48
KH
3667
3668 for (reg = 0; reg < 4; reg++)
3669 r[reg] = -1;
3670
b73bfc1c 3671 while (found < 4)
e0e989f6 3672 {
df7492f9
KH
3673 int id;
3674
3675 c = *charbuf++;
b73bfc1c
KH
3676 if (c == '\n')
3677 break;
df7492f9
KH
3678 charset = char_charset (c, charset_list, NULL);
3679 id = CHARSET_ID (charset);
3680 reg = CODING_ISO_REQUEST (coding, id);
3681 if (reg >= 0 && r[reg] < 0)
bdd9fb48
KH
3682 {
3683 found++;
df7492f9 3684 r[reg] = id;
bdd9fb48 3685 }
bdd9fb48
KH
3686 }
3687
3688 if (found)
3689 {
3690 for (reg = 0; reg < 4; reg++)
3691 if (r[reg] >= 0
df7492f9
KH
3692 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3693 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
e0e989f6 3694 }
b73bfc1c
KH
3695
3696 return dst;
e0e989f6
KH
3697}
3698
4ed46869
KH
3699/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3700
df7492f9
KH
3701static int
3702encode_coding_iso_2022 (coding)
4ed46869 3703 struct coding_system *coding;
4ed46869 3704{
df7492f9
KH
3705 int multibytep = coding->dst_multibyte;
3706 int *charbuf = coding->charbuf;
3707 int *charbuf_end = charbuf + coding->charbuf_used;
3708 unsigned char *dst = coding->destination + coding->produced;
3709 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3710 int safe_room = 16;
3711 int bol_designation
3712 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3713 && CODING_ISO_BOL (coding));
3714 int produced_chars = 0;
3715 Lisp_Object attrs, eol_type, charset_list;
3716 int ascii_compatible;
b73bfc1c 3717 int c;
ff0dacd7 3718 int preferred_charset_id = -1;
05e6f5dc 3719
24a73b0a
KH
3720 CODING_GET_INFO (coding, attrs, charset_list);
3721 eol_type = CODING_ID_EOL_TYPE (coding->id);
3722 if (VECTORP (eol_type))
3723 eol_type = Qunix;
3724
004068e4 3725 setup_iso_safe_charsets (attrs);
ff0dacd7
KH
3726 /* Charset list may have been changed. */
3727 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
8f924df7 3728 coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
0eecad43 3729
df7492f9 3730 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
bdd9fb48 3731
df7492f9 3732 while (charbuf < charbuf_end)
4ed46869 3733 {
df7492f9 3734 ASSURE_DESTINATION (safe_room);
b73bfc1c 3735
df7492f9 3736 if (bol_designation)
b73bfc1c 3737 {
df7492f9 3738 unsigned char *dst_prev = dst;
4ed46869 3739
bdd9fb48 3740 /* We have to produce designation sequences if any now. */
df7492f9
KH
3741 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3742 bol_designation = 0;
3743 /* We are sure that designation sequences are all ASCII bytes. */
3744 produced_chars += dst - dst_prev;
e0e989f6
KH
3745 }
3746
df7492f9 3747 c = *charbuf++;
ec6d2bb8 3748
ff0dacd7
KH
3749 if (c < 0)
3750 {
3751 /* Handle an annotation. */
3752 switch (*charbuf)
ec6d2bb8 3753 {
ff0dacd7
KH
3754 case CODING_ANNOTATE_COMPOSITION_MASK:
3755 /* Not yet implemented. */
3756 break;
3757 case CODING_ANNOTATE_CHARSET_MASK:
3758 preferred_charset_id = charbuf[3];
3759 if (preferred_charset_id >= 0
3760 && NILP (Fmemq (make_number (preferred_charset_id),
3761 charset_list)))
3762 preferred_charset_id = -1;
3763 break;
3764 default:
3765 abort ();
4ed46869 3766 }
ff0dacd7
KH
3767 charbuf += -c - 1;
3768 continue;
4ed46869 3769 }
ec6d2bb8 3770
b73bfc1c
KH
3771 /* Now encode the character C. */
3772 if (c < 0x20 || c == 0x7F)
3773 {
df7492f9
KH
3774 if (c == '\n'
3775 || (c == '\r' && EQ (eol_type, Qmac)))
19a8d9e0 3776 {
df7492f9
KH
3777 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3778 ENCODE_RESET_PLANE_AND_REGISTER ();
3779 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
b73bfc1c 3780 {
df7492f9
KH
3781 int i;
3782
3783 for (i = 0; i < 4; i++)
3784 CODING_ISO_DESIGNATION (coding, i)
3785 = CODING_ISO_INITIAL (coding, i);
b73bfc1c 3786 }
df7492f9
KH
3787 bol_designation
3788 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
19a8d9e0 3789 }
df7492f9
KH
3790 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3791 ENCODE_RESET_PLANE_AND_REGISTER ();
3792 EMIT_ONE_ASCII_BYTE (c);
4ed46869 3793 }
df7492f9 3794 else if (ASCII_CHAR_P (c))
88993dfd 3795 {
df7492f9
KH
3796 if (ascii_compatible)
3797 EMIT_ONE_ASCII_BYTE (c);
93dec019 3798 else
19a8d9e0 3799 {
bf16eb23
KH
3800 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3801 ENCODE_ISO_CHARACTER (charset, c);
19a8d9e0 3802 }
4ed46869 3803 }
16eafb5d 3804 else if (CHAR_BYTE8_P (c))
88993dfd 3805 {
16eafb5d
KH
3806 c = CHAR_TO_BYTE8 (c);
3807 EMIT_ONE_BYTE (c);
88993dfd 3808 }
b73bfc1c 3809 else
df7492f9 3810 {
ff0dacd7 3811 struct charset *charset;
b73bfc1c 3812
ff0dacd7
KH
3813 if (preferred_charset_id >= 0)
3814 {
3815 charset = CHARSET_FROM_ID (preferred_charset_id);
3816 if (! CHAR_CHARSET_P (c, charset))
3817 charset = char_charset (c, charset_list, NULL);
3818 }
3819 else
3820 charset = char_charset (c, charset_list, NULL);
df7492f9
KH
3821 if (!charset)
3822 {
41cbe562
KH
3823 if (coding->mode & CODING_MODE_SAFE_ENCODING)
3824 {
3825 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3826 charset = CHARSET_FROM_ID (charset_ascii);
3827 }
3828 else
3829 {
3830 c = coding->default_char;
3831 charset = char_charset (c, charset_list, NULL);
3832 }
df7492f9
KH
3833 }
3834 ENCODE_ISO_CHARACTER (charset, c);
3835 }
84fbb8a0 3836 }
b73bfc1c 3837
df7492f9
KH
3838 if (coding->mode & CODING_MODE_LAST_BLOCK
3839 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3840 {
3841 ASSURE_DESTINATION (safe_room);
3842 ENCODE_RESET_PLANE_AND_REGISTER ();
3843 }
065e3595 3844 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
3845 CODING_ISO_BOL (coding) = bol_designation;
3846 coding->produced_char += produced_chars;
3847 coding->produced = dst - coding->destination;
3848 return 0;
4ed46869
KH
3849}
3850
3851\f
df7492f9 3852/*** 8. Shift-JIS and BIG5 handlers ***/
4ed46869 3853
df7492f9 3854/* Although SJIS and BIG5 are not ISO's coding system, they are used
4ed46869
KH
3855 quite widely. So, for the moment, Emacs supports them in the bare
3856 C code. But, in the future, they may be supported only by CCL. */
3857
3858/* SJIS is a coding system encoding three character sets: ASCII, right
3859 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3860 as is. A character of charset katakana-jisx0201 is encoded by
3861 "position-code + 0x80". A character of charset japanese-jisx0208
3862 is encoded in two bytes, but its two position-codes are divided and
df7492f9 3863 shifted so that they fit in the ranges below.
4ed46869
KH
3864
3865 --- CODE RANGE of SJIS ---
3866 (character set) (range)
3867 ASCII 0x00 .. 0x7F
df7492f9 3868 KATAKANA-JISX0201 0xA0 .. 0xDF
c28a9453 3869 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
d14d03ac 3870 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4ed46869
KH
3871 -------------------------------
3872
3873*/
3874
3875/* BIG5 is a coding system encoding two character sets: ASCII and
3876 Big5. An ASCII character is encoded as is. Big5 is a two-byte
df7492f9 3877 character set and is encoded in two bytes.
4ed46869
KH
3878
3879 --- CODE RANGE of BIG5 ---
3880 (character set) (range)
3881 ASCII 0x00 .. 0x7F
3882 Big5 (1st byte) 0xA1 .. 0xFE
3883 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3884 --------------------------
3885
df7492f9 3886 */
4ed46869
KH
3887
3888/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3889 Check if a text is encoded in SJIS. If it is, return
df7492f9 3890 CATEGORY_MASK_SJIS, else return 0. */
4ed46869 3891
0a28aafb 3892static int
ff0dacd7 3893detect_coding_sjis (coding, detect_info)
df7492f9 3894 struct coding_system *coding;
ff0dacd7 3895 struct coding_detection_info *detect_info;
4ed46869 3896{
065e3595 3897 const unsigned char *src = coding->source, *src_base;
8f924df7 3898 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
3899 int multibytep = coding->src_multibyte;
3900 int consumed_chars = 0;
3901 int found = 0;
b73bfc1c 3902 int c;
df7492f9 3903
ff0dacd7 3904 detect_info->checked |= CATEGORY_MASK_SJIS;
df7492f9
KH
3905 /* A coding system of this category is always ASCII compatible. */
3906 src += coding->head_ascii;
4ed46869 3907
b73bfc1c 3908 while (1)
4ed46869 3909 {
065e3595 3910 src_base = src;
df7492f9 3911 ONE_MORE_BYTE (c);
682169fe
KH
3912 if (c < 0x80)
3913 continue;
df7492f9 3914 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
4ed46869 3915 {
df7492f9 3916 ONE_MORE_BYTE (c);
682169fe 3917 if (c < 0x40 || c == 0x7F || c > 0xFC)
df7492f9 3918 break;
ff0dacd7 3919 found = CATEGORY_MASK_SJIS;
4ed46869 3920 }
df7492f9 3921 else if (c >= 0xA0 && c < 0xE0)
ff0dacd7 3922 found = CATEGORY_MASK_SJIS;
df7492f9
KH
3923 else
3924 break;
4ed46869 3925 }
ff0dacd7 3926 detect_info->rejected |= CATEGORY_MASK_SJIS;
df7492f9
KH
3927 return 0;
3928
3929 no_more_source:
065e3595 3930 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
89528eb3 3931 {
ff0dacd7 3932 detect_info->rejected |= CATEGORY_MASK_SJIS;
89528eb3 3933 return 0;
4ed46869 3934 }
ff0dacd7
KH
3935 detect_info->found |= found;
3936 return 1;
4ed46869
KH
3937}
3938
3939/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3940 Check if a text is encoded in BIG5. If it is, return
df7492f9 3941 CATEGORY_MASK_BIG5, else return 0. */
4ed46869 3942
0a28aafb 3943static int
ff0dacd7 3944detect_coding_big5 (coding, detect_info)
df7492f9 3945 struct coding_system *coding;
ff0dacd7 3946 struct coding_detection_info *detect_info;
4ed46869 3947{
065e3595 3948 const unsigned char *src = coding->source, *src_base;
8f924df7 3949 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
3950 int multibytep = coding->src_multibyte;
3951 int consumed_chars = 0;
3952 int found = 0;
b73bfc1c 3953 int c;
fa42c37f 3954
ff0dacd7 3955 detect_info->checked |= CATEGORY_MASK_BIG5;
df7492f9
KH
3956 /* A coding system of this category is always ASCII compatible. */
3957 src += coding->head_ascii;
fa42c37f 3958
b73bfc1c 3959 while (1)
fa42c37f 3960 {
065e3595 3961 src_base = src;
df7492f9
KH
3962 ONE_MORE_BYTE (c);
3963 if (c < 0x80)
fa42c37f 3964 continue;
df7492f9 3965 if (c >= 0xA1)
fa42c37f 3966 {
df7492f9
KH
3967 ONE_MORE_BYTE (c);
3968 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
fa42c37f 3969 return 0;
ff0dacd7 3970 found = CATEGORY_MASK_BIG5;
fa42c37f 3971 }
df7492f9
KH
3972 else
3973 break;
fa42c37f 3974 }
ff0dacd7 3975 detect_info->rejected |= CATEGORY_MASK_BIG5;
fa42c37f 3976 return 0;
fa42c37f 3977
df7492f9 3978 no_more_source:
065e3595 3979 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
89528eb3 3980 {
ff0dacd7 3981 detect_info->rejected |= CATEGORY_MASK_BIG5;
89528eb3
KH
3982 return 0;
3983 }
ff0dacd7
KH
3984 detect_info->found |= found;
3985 return 1;
fa42c37f
KH
3986}
3987
4ed46869
KH
3988/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3989 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
fa42c37f 3990
b73bfc1c 3991static void
df7492f9 3992decode_coding_sjis (coding)
4ed46869 3993 struct coding_system *coding;
4ed46869 3994{
8f924df7
KH
3995 const unsigned char *src = coding->source + coding->consumed;
3996 const unsigned char *src_end = coding->source + coding->src_bytes;
3997 const unsigned char *src_base;
df7492f9 3998 int *charbuf = coding->charbuf;
ff0dacd7 3999 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
df7492f9
KH
4000 int consumed_chars = 0, consumed_chars_base;
4001 int multibytep = coding->src_multibyte;
4002 struct charset *charset_roman, *charset_kanji, *charset_kana;
57a47f8a 4003 struct charset *charset_kanji2;
24a73b0a 4004 Lisp_Object attrs, charset_list, val;
ff0dacd7
KH
4005 int char_offset = coding->produced_char;
4006 int last_offset = char_offset;
4007 int last_id = charset_ascii;
a5d301df 4008
24a73b0a 4009 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4010
4011 val = charset_list;
4012 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
89528eb3 4013 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
57a47f8a
KH
4014 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4015 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
fa42c37f 4016
b73bfc1c 4017 while (1)
4ed46869 4018 {
df7492f9 4019 int c, c1;
24a73b0a 4020 struct charset *charset;
fa42c37f 4021
b73bfc1c 4022 src_base = src;
df7492f9 4023 consumed_chars_base = consumed_chars;
fa42c37f 4024
df7492f9
KH
4025 if (charbuf >= charbuf_end)
4026 break;
4027
4028 ONE_MORE_BYTE (c);
065e3595
KH
4029 if (c < 0)
4030 goto invalid_code;
24a73b0a
KH
4031 if (c < 0x80)
4032 charset = charset_roman;
57a47f8a 4033 else if (c == 0x80 || c == 0xA0)
8e921c4b 4034 goto invalid_code;
57a47f8a
KH
4035 else if (c >= 0xA1 && c <= 0xDF)
4036 {
4037 /* SJIS -> JISX0201-Kana */
4038 c &= 0x7F;
4039 charset = charset_kana;
4040 }
4041 else if (c <= 0xEF)
df7492f9 4042 {
57a47f8a
KH
4043 /* SJIS -> JISX0208 */
4044 ONE_MORE_BYTE (c1);
4045 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
24a73b0a 4046 goto invalid_code;
57a47f8a
KH
4047 c = (c << 8) | c1;
4048 SJIS_TO_JIS (c);
4049 charset = charset_kanji;
4050 }
4051 else if (c <= 0xFC && charset_kanji2)
4052 {
c6876370 4053 /* SJIS -> JISX0213-2 */
57a47f8a
KH
4054 ONE_MORE_BYTE (c1);
4055 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
24a73b0a 4056 goto invalid_code;
57a47f8a
KH
4057 c = (c << 8) | c1;
4058 SJIS_TO_JIS2 (c);
4059 charset = charset_kanji2;
df7492f9 4060 }
57a47f8a
KH
4061 else
4062 goto invalid_code;
24a73b0a
KH
4063 if (charset->id != charset_ascii
4064 && last_id != charset->id)
4065 {
4066 if (last_id != charset_ascii)
4067 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4068 last_id = charset->id;
4069 last_offset = char_offset;
4070 }
4071 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
df7492f9 4072 *charbuf++ = c;
ff0dacd7 4073 char_offset++;
df7492f9 4074 continue;
b73bfc1c 4075
df7492f9
KH
4076 invalid_code:
4077 src = src_base;
4078 consumed_chars = consumed_chars_base;
4079 ONE_MORE_BYTE (c);
065e3595 4080 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
ff0dacd7 4081 char_offset++;
df7492f9
KH
4082 coding->errors++;
4083 }
fa42c37f 4084
df7492f9 4085 no_more_source:
ff0dacd7
KH
4086 if (last_id != charset_ascii)
4087 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
df7492f9
KH
4088 coding->consumed_char += consumed_chars_base;
4089 coding->consumed = src_base - coding->source;
4090 coding->charbuf_used = charbuf - coding->charbuf;
fa42c37f
KH
4091}
4092
b73bfc1c 4093static void
df7492f9 4094decode_coding_big5 (coding)
4ed46869 4095 struct coding_system *coding;
4ed46869 4096{
8f924df7
KH
4097 const unsigned char *src = coding->source + coding->consumed;
4098 const unsigned char *src_end = coding->source + coding->src_bytes;
4099 const unsigned char *src_base;
df7492f9 4100 int *charbuf = coding->charbuf;
ff0dacd7 4101 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
df7492f9
KH
4102 int consumed_chars = 0, consumed_chars_base;
4103 int multibytep = coding->src_multibyte;
4104 struct charset *charset_roman, *charset_big5;
24a73b0a 4105 Lisp_Object attrs, charset_list, val;
ff0dacd7
KH
4106 int char_offset = coding->produced_char;
4107 int last_offset = char_offset;
4108 int last_id = charset_ascii;
df7492f9 4109
24a73b0a 4110 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4111 val = charset_list;
4112 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4113 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4ed46869 4114
b73bfc1c 4115 while (1)
4ed46869 4116 {
df7492f9 4117 int c, c1;
24a73b0a 4118 struct charset *charset;
b73bfc1c
KH
4119
4120 src_base = src;
df7492f9
KH
4121 consumed_chars_base = consumed_chars;
4122
4123 if (charbuf >= charbuf_end)
4124 break;
4125
4126 ONE_MORE_BYTE (c);
b73bfc1c 4127
065e3595
KH
4128 if (c < 0)
4129 goto invalid_code;
24a73b0a
KH
4130 if (c < 0x80)
4131 charset = charset_roman;
4132 else
4ed46869 4133 {
24a73b0a
KH
4134 /* BIG5 -> Big5 */
4135 if (c < 0xA1 || c > 0xFE)
4136 goto invalid_code;
4137 ONE_MORE_BYTE (c1);
4138 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4139 goto invalid_code;
4140 c = c << 8 | c1;
4141 charset = charset_big5;
4ed46869 4142 }
24a73b0a
KH
4143 if (charset->id != charset_ascii
4144 && last_id != charset->id)
df7492f9 4145 {
24a73b0a
KH
4146 if (last_id != charset_ascii)
4147 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4148 last_id = charset->id;
4149 last_offset = char_offset;
4ed46869 4150 }
24a73b0a 4151 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
df7492f9 4152 *charbuf++ = c;
ff0dacd7 4153 char_offset++;
fb88bf2d
KH
4154 continue;
4155
df7492f9 4156 invalid_code:
4ed46869 4157 src = src_base;
df7492f9
KH
4158 consumed_chars = consumed_chars_base;
4159 ONE_MORE_BYTE (c);
065e3595 4160 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
ff0dacd7 4161 char_offset++;
df7492f9 4162 coding->errors++;
fb88bf2d 4163 }
d46c5b12 4164
df7492f9 4165 no_more_source:
ff0dacd7
KH
4166 if (last_id != charset_ascii)
4167 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
df7492f9
KH
4168 coding->consumed_char += consumed_chars_base;
4169 coding->consumed = src_base - coding->source;
4170 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
4171}
4172
4173/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
b73bfc1c
KH
4174 This function can encode charsets `ascii', `katakana-jisx0201',
4175 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4176 are sure that all these charsets are registered as official charset
4ed46869
KH
4177 (i.e. do not have extended leading-codes). Characters of other
4178 charsets are produced without any encoding. If SJIS_P is 1, encode
4179 SJIS text, else encode BIG5 text. */
4180
df7492f9
KH
4181static int
4182encode_coding_sjis (coding)
4ed46869 4183 struct coding_system *coding;
4ed46869 4184{
df7492f9
KH
4185 int multibytep = coding->dst_multibyte;
4186 int *charbuf = coding->charbuf;
4187 int *charbuf_end = charbuf + coding->charbuf_used;
4188 unsigned char *dst = coding->destination + coding->produced;
4189 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4190 int safe_room = 4;
4191 int produced_chars = 0;
24a73b0a 4192 Lisp_Object attrs, charset_list, val;
df7492f9
KH
4193 int ascii_compatible;
4194 struct charset *charset_roman, *charset_kanji, *charset_kana;
57a47f8a 4195 struct charset *charset_kanji2;
df7492f9 4196 int c;
a5d301df 4197
24a73b0a 4198 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4199 val = charset_list;
4200 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4201 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
57a47f8a
KH
4202 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4203 charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4ed46869 4204
df7492f9 4205 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
93dec019 4206
df7492f9
KH
4207 while (charbuf < charbuf_end)
4208 {
4209 ASSURE_DESTINATION (safe_room);
4210 c = *charbuf++;
b73bfc1c 4211 /* Now encode the character C. */
df7492f9
KH
4212 if (ASCII_CHAR_P (c) && ascii_compatible)
4213 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
4214 else if (CHAR_BYTE8_P (c))
4215 {
4216 c = CHAR_TO_BYTE8 (c);
4217 EMIT_ONE_BYTE (c);
4218 }
df7492f9 4219 else
b73bfc1c 4220 {
df7492f9
KH
4221 unsigned code;
4222 struct charset *charset = char_charset (c, charset_list, &code);
4223
4224 if (!charset)
4ed46869 4225 {
41cbe562 4226 if (coding->mode & CODING_MODE_SAFE_ENCODING)
b73bfc1c 4227 {
41cbe562
KH
4228 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4229 charset = CHARSET_FROM_ID (charset_ascii);
b73bfc1c 4230 }
41cbe562 4231 else
b73bfc1c 4232 {
41cbe562
KH
4233 c = coding->default_char;
4234 charset = char_charset (c, charset_list, &code);
b73bfc1c 4235 }
b73bfc1c 4236 }
df7492f9
KH
4237 if (code == CHARSET_INVALID_CODE (charset))
4238 abort ();
4239 if (charset == charset_kanji)
4240 {
4241 int c1, c2;
4242 JIS_TO_SJIS (code);
4243 c1 = code >> 8, c2 = code & 0xFF;
4244 EMIT_TWO_BYTES (c1, c2);
4245 }
4246 else if (charset == charset_kana)
4247 EMIT_ONE_BYTE (code | 0x80);
57a47f8a
KH
4248 else if (charset_kanji2 && charset == charset_kanji2)
4249 {
4250 int c1, c2;
4251
4252 c1 = code >> 8;
4253 if (c1 == 0x21 || (c1 >= 0x23 && c1 < 0x25)
4254 || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4255 {
4256 JIS_TO_SJIS2 (code);
4257 c1 = code >> 8, c2 = code & 0xFF;
4258 EMIT_TWO_BYTES (c1, c2);
4259 }
4260 else
4261 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4262 }
df7492f9
KH
4263 else
4264 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4265 }
4266 }
065e3595 4267 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4268 coding->produced_char += produced_chars;
4269 coding->produced = dst - coding->destination;
4270 return 0;
4271}
4272
4273static int
4274encode_coding_big5 (coding)
4275 struct coding_system *coding;
4276{
4277 int multibytep = coding->dst_multibyte;
4278 int *charbuf = coding->charbuf;
4279 int *charbuf_end = charbuf + coding->charbuf_used;
4280 unsigned char *dst = coding->destination + coding->produced;
4281 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4282 int safe_room = 4;
4283 int produced_chars = 0;
24a73b0a 4284 Lisp_Object attrs, charset_list, val;
df7492f9
KH
4285 int ascii_compatible;
4286 struct charset *charset_roman, *charset_big5;
4287 int c;
4288
24a73b0a 4289 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4290 val = charset_list;
4291 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4292 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4293 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4294
4295 while (charbuf < charbuf_end)
4296 {
4297 ASSURE_DESTINATION (safe_room);
4298 c = *charbuf++;
4299 /* Now encode the character C. */
4300 if (ASCII_CHAR_P (c) && ascii_compatible)
4301 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
4302 else if (CHAR_BYTE8_P (c))
4303 {
4304 c = CHAR_TO_BYTE8 (c);
4305 EMIT_ONE_BYTE (c);
b73bfc1c
KH
4306 }
4307 else
4308 {
df7492f9
KH
4309 unsigned code;
4310 struct charset *charset = char_charset (c, charset_list, &code);
4311
4312 if (! charset)
b73bfc1c 4313 {
41cbe562 4314 if (coding->mode & CODING_MODE_SAFE_ENCODING)
b73bfc1c 4315 {
41cbe562
KH
4316 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4317 charset = CHARSET_FROM_ID (charset_ascii);
b73bfc1c 4318 }
41cbe562 4319 else
0eecad43 4320 {
41cbe562
KH
4321 c = coding->default_char;
4322 charset = char_charset (c, charset_list, &code);
0eecad43 4323 }
4ed46869 4324 }
df7492f9
KH
4325 if (code == CHARSET_INVALID_CODE (charset))
4326 abort ();
4327 if (charset == charset_big5)
b73bfc1c 4328 {
df7492f9
KH
4329 int c1, c2;
4330
4331 c1 = code >> 8, c2 = code & 0xFF;
4332 EMIT_TWO_BYTES (c1, c2);
b73bfc1c 4333 }
df7492f9
KH
4334 else
4335 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4ed46869 4336 }
4ed46869 4337 }
065e3595 4338 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4339 coding->produced_char += produced_chars;
4340 coding->produced = dst - coding->destination;
4341 return 0;
4ed46869
KH
4342}
4343
4344\f
df7492f9 4345/*** 9. CCL handlers ***/
1397dc18
KH
4346
4347/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4348 Check if a text is encoded in a coding system of which
4349 encoder/decoder are written in CCL program. If it is, return
df7492f9 4350 CATEGORY_MASK_CCL, else return 0. */
1397dc18 4351
0a28aafb 4352static int
ff0dacd7 4353detect_coding_ccl (coding, detect_info)
df7492f9 4354 struct coding_system *coding;
ff0dacd7 4355 struct coding_detection_info *detect_info;
1397dc18 4356{
065e3595 4357 const unsigned char *src = coding->source, *src_base;
8f924df7 4358 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
4359 int multibytep = coding->src_multibyte;
4360 int consumed_chars = 0;
4361 int found = 0;
0e219d54 4362 unsigned char *valids;
df7492f9
KH
4363 int head_ascii = coding->head_ascii;
4364 Lisp_Object attrs;
4365
ff0dacd7
KH
4366 detect_info->checked |= CATEGORY_MASK_CCL;
4367
df7492f9 4368 coding = &coding_categories[coding_category_ccl];
0e219d54 4369 valids = CODING_CCL_VALIDS (coding);
df7492f9
KH
4370 attrs = CODING_ID_ATTRS (coding->id);
4371 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4372 src += head_ascii;
1397dc18 4373
b73bfc1c 4374 while (1)
1397dc18 4375 {
df7492f9 4376 int c;
065e3595
KH
4377
4378 src_base = src;
df7492f9 4379 ONE_MORE_BYTE (c);
065e3595 4380 if (c < 0 || ! valids[c])
df7492f9 4381 break;
ff0dacd7
KH
4382 if ((valids[c] > 1))
4383 found = CATEGORY_MASK_CCL;
df7492f9 4384 }
ff0dacd7 4385 detect_info->rejected |= CATEGORY_MASK_CCL;
df7492f9
KH
4386 return 0;
4387
4388 no_more_source:
ff0dacd7
KH
4389 detect_info->found |= found;
4390 return 1;
df7492f9
KH
4391}
4392
4393static void
4394decode_coding_ccl (coding)
4395 struct coding_system *coding;
4396{
7c78e542 4397 const unsigned char *src = coding->source + coding->consumed;
8f924df7 4398 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
4399 int *charbuf = coding->charbuf;
4400 int *charbuf_end = charbuf + coding->charbuf_size;
4401 int consumed_chars = 0;
4402 int multibytep = coding->src_multibyte;
4403 struct ccl_program ccl;
4404 int source_charbuf[1024];
4405 int source_byteidx[1024];
24a73b0a 4406 Lisp_Object attrs, charset_list;
df7492f9 4407
24a73b0a 4408 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4409 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4410
4411 while (src < src_end)
4412 {
7c78e542 4413 const unsigned char *p = src;
df7492f9
KH
4414 int *source, *source_end;
4415 int i = 0;
4416
4417 if (multibytep)
4418 while (i < 1024 && p < src_end)
4419 {
4420 source_byteidx[i] = p - src;
4421 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4422 }
4423 else
4424 while (i < 1024 && p < src_end)
4425 source_charbuf[i++] = *p++;
8f924df7 4426
df7492f9
KH
4427 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4428 ccl.last_block = 1;
4429
4430 source = source_charbuf;
4431 source_end = source + i;
4432 while (source < source_end)
4433 {
4434 ccl_driver (&ccl, source, charbuf,
8dcbea82
KH
4435 source_end - source, charbuf_end - charbuf,
4436 charset_list);
df7492f9
KH
4437 source += ccl.consumed;
4438 charbuf += ccl.produced;
4439 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4440 break;
4441 }
4442 if (source < source_end)
4443 src += source_byteidx[source - source_charbuf];
4444 else
4445 src = p;
4446 consumed_chars += source - source_charbuf;
4447
4448 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4449 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4450 break;
4451 }
4452
4453 switch (ccl.status)
4454 {
4455 case CCL_STAT_SUSPEND_BY_SRC:
065e3595 4456 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
df7492f9
KH
4457 break;
4458 case CCL_STAT_SUSPEND_BY_DST:
4459 break;
4460 case CCL_STAT_QUIT:
4461 case CCL_STAT_INVALID_CMD:
065e3595 4462 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
df7492f9
KH
4463 break;
4464 default:
065e3595 4465 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4466 break;
4467 }
4468 coding->consumed_char += consumed_chars;
4469 coding->consumed = src - coding->source;
4470 coding->charbuf_used = charbuf - coding->charbuf;
4471}
4472
4473static int
4474encode_coding_ccl (coding)
4475 struct coding_system *coding;
4476{
4477 struct ccl_program ccl;
4478 int multibytep = coding->dst_multibyte;
4479 int *charbuf = coding->charbuf;
4480 int *charbuf_end = charbuf + coding->charbuf_used;
4481 unsigned char *dst = coding->destination + coding->produced;
4482 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4483 unsigned char *adjusted_dst_end = dst_end - 1;
4484 int destination_charbuf[1024];
4485 int i, produced_chars = 0;
24a73b0a 4486 Lisp_Object attrs, charset_list;
df7492f9 4487
24a73b0a 4488 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4489 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4490
4491 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4492 ccl.dst_multibyte = coding->dst_multibyte;
4493
4494 while (charbuf < charbuf_end && dst < adjusted_dst_end)
4495 {
4496 int dst_bytes = dst_end - dst;
4497 if (dst_bytes > 1024)
4498 dst_bytes = 1024;
4499
4500 ccl_driver (&ccl, charbuf, destination_charbuf,
8dcbea82 4501 charbuf_end - charbuf, dst_bytes, charset_list);
df7492f9
KH
4502 charbuf += ccl.consumed;
4503 if (multibytep)
4504 for (i = 0; i < ccl.produced; i++)
4505 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4506 else
4507 {
4508 for (i = 0; i < ccl.produced; i++)
4509 *dst++ = destination_charbuf[i] & 0xFF;
4510 produced_chars += ccl.produced;
4511 }
4512 }
4513
4514 switch (ccl.status)
4515 {
4516 case CCL_STAT_SUSPEND_BY_SRC:
065e3595 4517 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
df7492f9
KH
4518 break;
4519 case CCL_STAT_SUSPEND_BY_DST:
065e3595 4520 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
df7492f9
KH
4521 break;
4522 case CCL_STAT_QUIT:
4523 case CCL_STAT_INVALID_CMD:
065e3595 4524 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
df7492f9
KH
4525 break;
4526 default:
065e3595 4527 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 4528 break;
1397dc18 4529 }
df7492f9
KH
4530
4531 coding->produced_char += produced_chars;
4532 coding->produced = dst - coding->destination;
4533 return 0;
1397dc18
KH
4534}
4535
df7492f9 4536
1397dc18 4537\f
df7492f9 4538/*** 10, 11. no-conversion handlers ***/
4ed46869 4539
b73bfc1c 4540/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4ed46869 4541
b73bfc1c 4542static void
df7492f9 4543decode_coding_raw_text (coding)
4ed46869 4544 struct coding_system *coding;
4ed46869 4545{
df7492f9 4546 coding->chars_at_source = 1;
2c78b7e1
KH
4547 coding->consumed_char = 0;
4548 coding->consumed = 0;
065e3595 4549 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 4550}
4ed46869 4551
df7492f9
KH
4552static int
4553encode_coding_raw_text (coding)
4554 struct coding_system *coding;
4555{
4556 int multibytep = coding->dst_multibyte;
4557 int *charbuf = coding->charbuf;
4558 int *charbuf_end = coding->charbuf + coding->charbuf_used;
4559 unsigned char *dst = coding->destination + coding->produced;
4560 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4561 int produced_chars = 0;
b73bfc1c
KH
4562 int c;
4563
df7492f9 4564 if (multibytep)
b73bfc1c 4565 {
df7492f9 4566 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4ed46869 4567
df7492f9
KH
4568 if (coding->src_multibyte)
4569 while (charbuf < charbuf_end)
4570 {
4571 ASSURE_DESTINATION (safe_room);
4572 c = *charbuf++;
4573 if (ASCII_CHAR_P (c))
4574 EMIT_ONE_ASCII_BYTE (c);
4575 else if (CHAR_BYTE8_P (c))
4576 {
4577 c = CHAR_TO_BYTE8 (c);
4578 EMIT_ONE_BYTE (c);
4579 }
4580 else
4581 {
4582 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
93dec019 4583
df7492f9
KH
4584 CHAR_STRING_ADVANCE (c, p1);
4585 while (p0 < p1)
9d123124
KH
4586 {
4587 EMIT_ONE_BYTE (*p0);
4588 p0++;
4589 }
df7492f9
KH
4590 }
4591 }
b73bfc1c 4592 else
df7492f9
KH
4593 while (charbuf < charbuf_end)
4594 {
4595 ASSURE_DESTINATION (safe_room);
4596 c = *charbuf++;
4597 EMIT_ONE_BYTE (c);
4598 }
4599 }
4600 else
4ed46869 4601 {
df7492f9 4602 if (coding->src_multibyte)
d46c5b12 4603 {
df7492f9
KH
4604 int safe_room = MAX_MULTIBYTE_LENGTH;
4605
4606 while (charbuf < charbuf_end)
d46c5b12 4607 {
df7492f9
KH
4608 ASSURE_DESTINATION (safe_room);
4609 c = *charbuf++;
4610 if (ASCII_CHAR_P (c))
4611 *dst++ = c;
4612 else if (CHAR_BYTE8_P (c))
4613 *dst++ = CHAR_TO_BYTE8 (c);
b73bfc1c 4614 else
df7492f9
KH
4615 CHAR_STRING_ADVANCE (c, dst);
4616 produced_chars++;
d46c5b12
KH
4617 }
4618 }
df7492f9
KH
4619 else
4620 {
4621 ASSURE_DESTINATION (charbuf_end - charbuf);
4622 while (charbuf < charbuf_end && dst < dst_end)
4623 *dst++ = *charbuf++;
4624 produced_chars = dst - (coding->destination + coding->dst_bytes);
8f924df7 4625 }
4ed46869 4626 }
065e3595 4627 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4628 coding->produced_char += produced_chars;
4629 coding->produced = dst - coding->destination;
4630 return 0;
4ed46869
KH
4631}
4632
ff0dacd7
KH
4633/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4634 Check if a text is encoded in a charset-based coding system. If it
4635 is, return 1, else return 0. */
4636
0a28aafb 4637static int
ff0dacd7 4638detect_coding_charset (coding, detect_info)
df7492f9 4639 struct coding_system *coding;
ff0dacd7 4640 struct coding_detection_info *detect_info;
1397dc18 4641{
065e3595 4642 const unsigned char *src = coding->source, *src_base;
8f924df7 4643 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
4644 int multibytep = coding->src_multibyte;
4645 int consumed_chars = 0;
4646 Lisp_Object attrs, valids;
584948ac 4647 int found = 0;
1397dc18 4648
ff0dacd7
KH
4649 detect_info->checked |= CATEGORY_MASK_CHARSET;
4650
df7492f9
KH
4651 coding = &coding_categories[coding_category_charset];
4652 attrs = CODING_ID_ATTRS (coding->id);
4653 valids = AREF (attrs, coding_attr_charset_valids);
4654
4655 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4656 src += coding->head_ascii;
1397dc18 4657
b73bfc1c 4658 while (1)
1397dc18 4659 {
df7492f9 4660 int c;
1397dc18 4661
065e3595 4662 src_base = src;
df7492f9 4663 ONE_MORE_BYTE (c);
065e3595
KH
4664 if (c < 0)
4665 continue;
df7492f9
KH
4666 if (NILP (AREF (valids, c)))
4667 break;
584948ac 4668 if (c >= 0x80)
ff0dacd7 4669 found = CATEGORY_MASK_CHARSET;
df7492f9 4670 }
ff0dacd7 4671 detect_info->rejected |= CATEGORY_MASK_CHARSET;
df7492f9 4672 return 0;
4ed46869 4673
df7492f9 4674 no_more_source:
ff0dacd7
KH
4675 detect_info->found |= found;
4676 return 1;
df7492f9 4677}
b73bfc1c 4678
/* Decode the source bytes of CODING as a charset-based coding system
   and store the decoded characters in CODING->charbuf.

   The first byte of each sequence is looked up in the
   `coding_attr_charset_valids' table.  The entry is nil (invalid
   byte), a charset ID, or a list of charset IDs to try in order.
   Bytes that cannot be decoded are stored individually and counted
   in CODING->errors.  On exit CODING->consumed, CODING->consumed_char
   and CODING->charbuf_used are updated.  */
b73bfc1c 4679 static void
df7492f9 4680 decode_coding_charset (coding)
4ed46869 4681 struct coding_system *coding;
4ed46869 4682 {
8f924df7
KH
4683 const unsigned char *src = coding->source + coding->consumed;
4684 const unsigned char *src_end = coding->source + coding->src_bytes;
4685 const unsigned char *src_base;
df7492f9 4686 int *charbuf = coding->charbuf;
/* Stop MAX_ANNOTATION_LENGTH elements short of the end of charbuf;
   presumably this leaves room for the data added by
   ADD_CHARSET_DATA below.  */
ff0dacd7 4687 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
df7492f9
KH
4688 int consumed_chars = 0, consumed_chars_base;
4689 int multibytep = coding->src_multibyte;
24a73b0a 4690 Lisp_Object attrs, charset_list, valids;
/* char_offset counts decoded characters; last_offset and last_id
   remember where the current run of a non-ASCII charset started and
   which charset it is.  */
ff0dacd7
KH
4691 int char_offset = coding->produced_char;
4692 int last_offset = char_offset;
4693 int last_id = charset_ascii;
df7492f9 4694
24a73b0a 4695 CODING_GET_INFO (coding, attrs, charset_list);
4eb6d3f1 4696 valids = AREF (attrs, coding_attr_charset_valids);
b73bfc1c 4697
df7492f9 4698 while (1)
4ed46869 4699 {
4eb6d3f1 4700 int c;
24a73b0a
KH
4701 Lisp_Object val;
4702 struct charset *charset;
4703 int dim;
4704 int len = 1;
4705 unsigned code;
df7492f9
KH
4706
/* Remember where this sequence starts so that it can be re-read
   from invalid_code, and so that the consumed counts exclude a
   partially read sequence.  */
4707 src_base = src;
4708 consumed_chars_base = consumed_chars;
b73bfc1c 4709
df7492f9
KH
4710 if (charbuf >= charbuf_end)
4711 break;
4712
4eb6d3f1 4713 ONE_MORE_BYTE (c);
065e3595
KH
4714 if (c < 0)
4715 goto invalid_code;
24a73b0a
KH
4716 code = c;
4717
4718 val = AREF (valids, c);
4719 if (NILP (val))
4720 goto invalid_code;
4721 if (INTEGERP (val))
d46c5b12 4722 {
/* VAL is a single charset ID: read the remaining bytes of the code
   point, as many as the charset dimension requires.
   NOTE(review): the bytes read in this inner loop are not checked
   for a negative value before being merged into CODE.  */
24a73b0a
KH
4723 charset = CHARSET_FROM_ID (XFASTINT (val));
4724 dim = CHARSET_DIMENSION (charset);
4725 while (len < dim)
b73bfc1c 4726 {
24a73b0a
KH
4727 ONE_MORE_BYTE (c);
4728 code = (code << 8) | c;
4729 len++;
b73bfc1c 4730 }
24a73b0a
KH
4731 CODING_DECODE_CHAR (coding, src, src_base, src_end,
4732 charset, code, c);
d46c5b12 4733 }
df7492f9 4734 else
d46c5b12 4735 {
24a73b0a
KH
4736 /* VAL is a list of charset IDs. It is assured that the
4737 list is sorted by charset dimensions (smaller one
4738 comes first). */
4739 while (CONSP (val))
4eb6d3f1 4740 {
24a73b0a 4741 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
c7c66a95 4742 dim = CHARSET_DIMENSION (charset);
/* LEN and CODE carry over between candidates, so only the extra
   bytes needed by a larger dimension are read.  */
f9d71dcd 4743 while (len < dim)
4eb6d3f1 4744 {
acb2a965
KH
4745 ONE_MORE_BYTE (c);
4746 code = (code << 8) | c;
f9d71dcd 4747 len++;
4eb6d3f1 4748 }
24a73b0a
KH
4749 CODING_DECODE_CHAR (coding, src, src_base,
4750 src_end, charset, code, c);
/* The first charset that yields a non-negative character wins.  */
4751 if (c >= 0)
4752 break;
4753 val = XCDR (val);
ff0dacd7 4754 }
d46c5b12 4755 }
24a73b0a
KH
4756 if (c < 0)
4757 goto invalid_code;
/* When the charset changes to a different non-ASCII charset, close
   the previous non-ASCII run (if any) with ADD_CHARSET_DATA and
   start a new run at the current character offset.  */
4758 if (charset->id != charset_ascii
4759 && last_id != charset->id)
4760 {
4761 if (last_id != charset_ascii)
4762 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4763 last_id = charset->id;
4764 last_offset = char_offset;
4765 }
4766
df7492f9 4767 *charbuf++ = c;
ff0dacd7 4768 char_offset++;
df7492f9
KH
4769 continue;
4770
/* Undecodable sequence: rewind to its first byte, consume just that
   one byte, and store it as -C if ONE_MORE_BYTE returned a negative
   value, as itself if it is an ASCII byte, and as a raw-byte
   character otherwise.  */
4771 invalid_code:
4772 src = src_base;
4773 consumed_chars = consumed_chars_base;
4774 ONE_MORE_BYTE (c);
065e3595 4775 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 4776 char_offset++;
df7492f9 4777 coding->errors++;
4ed46869
KH
4778 }
4779
/* Reached by falling out of the loop when charbuf is full, and
   presumably also by a jump from ONE_MORE_BYTE when the source is
   exhausted.  The *_base values are used so that a sequence that was
   only partially read is not counted as consumed.  */
df7492f9 4780 no_more_source:
ff0dacd7
KH
4781 if (last_id != charset_ascii)
4782 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
df7492f9
KH
4783 coding->consumed_char += consumed_chars_base;
4784 coding->consumed = src_base - coding->source;
4785 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
4786 }
4787
df7492f9
KH
4788static int
4789encode_coding_charset (coding)
4ed46869 4790 struct coding_system *coding;
4ed46869 4791{
df7492f9
KH
4792 int multibytep = coding->dst_multibyte;
4793 int *charbuf = coding->charbuf;
4794 int *charbuf_end = charbuf + coding->charbuf_used;
4795 unsigned char *dst = coding->destination + coding->produced;
4796 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4797 int safe_room = MAX_MULTIBYTE_LENGTH;
4798 int produced_chars = 0;
24a73b0a 4799 Lisp_Object attrs, charset_list;
df7492f9 4800 int ascii_compatible;
b73bfc1c 4801 int c;
b73bfc1c 4802
24a73b0a 4803 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9 4804 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
fb88bf2d 4805
df7492f9 4806 while (charbuf < charbuf_end)
4ed46869 4807 {
4eb6d3f1 4808 struct charset *charset;
df7492f9 4809 unsigned code;
8f924df7 4810
df7492f9
KH
4811 ASSURE_DESTINATION (safe_room);
4812 c = *charbuf++;
4813 if (ascii_compatible && ASCII_CHAR_P (c))
4814 EMIT_ONE_ASCII_BYTE (c);
16eafb5d 4815 else if (CHAR_BYTE8_P (c))
4ed46869 4816 {
16eafb5d
KH
4817 c = CHAR_TO_BYTE8 (c);
4818 EMIT_ONE_BYTE (c);
d46c5b12 4819 }
d46c5b12 4820 else
b73bfc1c 4821 {
4eb6d3f1
KH
4822 charset = char_charset (c, charset_list, &code);
4823 if (charset)
4824 {
4825 if (CHARSET_DIMENSION (charset) == 1)
4826 EMIT_ONE_BYTE (code);
4827 else if (CHARSET_DIMENSION (charset) == 2)
4828 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4829 else if (CHARSET_DIMENSION (charset) == 3)
4830 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4831 else
4832 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4833 (code >> 8) & 0xFF, code & 0xFF);
4834 }
4835 else
41cbe562
KH
4836 {
4837 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4838 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4839 else
4840 c = coding->default_char;
4841 EMIT_ONE_BYTE (c);
4842 }
4ed46869 4843 }
4ed46869
KH
4844 }
4845
065e3595 4846 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4847 coding->produced_char += produced_chars;
4848 coding->produced = dst - coding->destination;
4849 return 0;
4ed46869
KH
4850}
4851
4852\f
1397dc18 4853 /*** 10. C library functions ***/
4ed46869 4854
df7492f9
KH
4855/* Setup coding context CODING from information about CODING_SYSTEM.
4856 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4857 CODING_SYSTEM is invalid, signal an error. */
4ed46869 4858
ec6d2bb8 4859void
e0e989f6
KH
4860setup_coding_system (coding_system, coding)
4861 Lisp_Object coding_system;
4ed46869
KH
4862 struct coding_system *coding;
4863{
df7492f9
KH
4864 Lisp_Object attrs;
4865 Lisp_Object eol_type;
4866 Lisp_Object coding_type;
4608c386 4867 Lisp_Object val;
4ed46869 4868
df7492f9
KH
4869 if (NILP (coding_system))
4870 coding_system = Qno_conversion;
c07c8e12 4871
df7492f9 4872 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
1f5dbf34 4873
df7492f9
KH
4874 attrs = CODING_ID_ATTRS (coding->id);
4875 eol_type = CODING_ID_EOL_TYPE (coding->id);
1f5dbf34 4876
df7492f9
KH
4877 coding->mode = 0;
4878 coding->head_ascii = -1;
4879 coding->common_flags
4880 = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
5e5c78be
KH
4881 if (! NILP (CODING_ATTR_POST_READ (attrs)))
4882 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
4883 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
4884 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
8f924df7
KH
4885 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
4886 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
1f5dbf34 4887
df7492f9 4888 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7
KH
4889 coding->max_charset_id = SCHARS (val) - 1;
4890 coding->safe_charsets = (char *) SDATA (val);
df7492f9 4891 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4608c386 4892
df7492f9
KH
4893 coding_type = CODING_ATTR_TYPE (attrs);
4894 if (EQ (coding_type, Qundecided))
d46c5b12 4895 {
df7492f9
KH
4896 coding->detector = NULL;
4897 coding->decoder = decode_coding_raw_text;
4898 coding->encoder = encode_coding_raw_text;
4899 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
d46c5b12 4900 }
df7492f9 4901 else if (EQ (coding_type, Qiso_2022))
d46c5b12 4902 {
df7492f9
KH
4903 int i;
4904 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4905
4906 /* Invoke graphic register 0 to plane 0. */
4907 CODING_ISO_INVOCATION (coding, 0) = 0;
4908 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4909 CODING_ISO_INVOCATION (coding, 1)
4910 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4911 /* Setup the initial status of designation. */
4912 for (i = 0; i < 4; i++)
4913 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4914 /* Not single shifting initially. */
4915 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4916 /* Beginning of buffer should also be regarded as bol. */
4917 CODING_ISO_BOL (coding) = 1;
4918 coding->detector = detect_coding_iso_2022;
4919 coding->decoder = decode_coding_iso_2022;
4920 coding->encoder = encode_coding_iso_2022;
4921 if (flags & CODING_ISO_FLAG_SAFE)
4922 coding->mode |= CODING_MODE_SAFE_ENCODING;
d46c5b12 4923 coding->common_flags
df7492f9
KH
4924 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4925 | CODING_REQUIRE_FLUSHING_MASK);
4926 if (flags & CODING_ISO_FLAG_COMPOSITION)
4927 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
ff0dacd7
KH
4928 if (flags & CODING_ISO_FLAG_DESIGNATION)
4929 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
df7492f9
KH
4930 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4931 {
4932 setup_iso_safe_charsets (attrs);
4933 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7
KH
4934 coding->max_charset_id = SCHARS (val) - 1;
4935 coding->safe_charsets = (char *) SDATA (val);
df7492f9
KH
4936 }
4937 CODING_ISO_FLAGS (coding) = flags;
d46c5b12 4938 }
df7492f9 4939 else if (EQ (coding_type, Qcharset))
d46c5b12 4940 {
df7492f9
KH
4941 coding->detector = detect_coding_charset;
4942 coding->decoder = decode_coding_charset;
4943 coding->encoder = encode_coding_charset;
d46c5b12 4944 coding->common_flags
df7492f9 4945 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
d46c5b12 4946 }
df7492f9 4947 else if (EQ (coding_type, Qutf_8))
d46c5b12 4948 {
df7492f9
KH
4949 coding->detector = detect_coding_utf_8;
4950 coding->decoder = decode_coding_utf_8;
4951 coding->encoder = encode_coding_utf_8;
4952 coding->common_flags
4953 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4954 }
4955 else if (EQ (coding_type, Qutf_16))
4956 {
4957 val = AREF (attrs, coding_attr_utf_16_bom);
4958 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4959 : EQ (val, Qt) ? utf_16_with_bom
4960 : utf_16_without_bom);
4961 val = AREF (attrs, coding_attr_utf_16_endian);
b49a1807 4962 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
df7492f9 4963 : utf_16_little_endian);
e19c3639 4964 CODING_UTF_16_SURROGATE (coding) = 0;
df7492f9
KH
4965 coding->detector = detect_coding_utf_16;
4966 coding->decoder = decode_coding_utf_16;
4967 coding->encoder = encode_coding_utf_16;
4968 coding->common_flags
4969 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
b49a1807
KH
4970 if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
4971 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
d46c5b12 4972 }
df7492f9 4973 else if (EQ (coding_type, Qccl))
4ed46869 4974 {
df7492f9
KH
4975 coding->detector = detect_coding_ccl;
4976 coding->decoder = decode_coding_ccl;
4977 coding->encoder = encode_coding_ccl;
c952af22 4978 coding->common_flags
df7492f9
KH
4979 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4980 | CODING_REQUIRE_FLUSHING_MASK);
4981 }
4982 else if (EQ (coding_type, Qemacs_mule))
4983 {
4984 coding->detector = detect_coding_emacs_mule;
4985 coding->decoder = decode_coding_emacs_mule;
4986 coding->encoder = encode_coding_emacs_mule;
c952af22 4987 coding->common_flags
df7492f9
KH
4988 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4989 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
4990 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
4991 {
4992 Lisp_Object tail, safe_charsets;
4993 int max_charset_id = 0;
4994
4995 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4996 tail = XCDR (tail))
4997 if (max_charset_id < XFASTINT (XCAR (tail)))
4998 max_charset_id = XFASTINT (XCAR (tail));
4999 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
5000 make_number (255));
5001 for (tail = Vemacs_mule_charset_list; CONSP (tail);
5002 tail = XCDR (tail))
8f924df7 5003 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
df7492f9 5004 coding->max_charset_id = max_charset_id;
8f924df7 5005 coding->safe_charsets = (char *) SDATA (safe_charsets);
df7492f9
KH
5006 }
5007 }
5008 else if (EQ (coding_type, Qshift_jis))
5009 {
5010 coding->detector = detect_coding_sjis;
5011 coding->decoder = decode_coding_sjis;
5012 coding->encoder = encode_coding_sjis;
c952af22 5013 coding->common_flags
df7492f9
KH
5014 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5015 }
5016 else if (EQ (coding_type, Qbig5))
5017 {
5018 coding->detector = detect_coding_big5;
5019 coding->decoder = decode_coding_big5;
5020 coding->encoder = encode_coding_big5;
c952af22 5021 coding->common_flags
df7492f9
KH
5022 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5023 }
5024 else /* EQ (coding_type, Qraw_text) */
ec6d2bb8 5025 {
df7492f9
KH
5026 coding->detector = NULL;
5027 coding->decoder = decode_coding_raw_text;
5028 coding->encoder = encode_coding_raw_text;
4ed46869 5029 }
4ed46869 5030
df7492f9 5031 return;
4ed46869
KH
5032}
5033
df7492f9
KH
5034/* Return raw-text or one of its subsidiaries that has the same
5035 eol_type as CODING-SYSTEM. */
ec6d2bb8 5036
df7492f9
KH
5037Lisp_Object
5038raw_text_coding_system (coding_system)
5039 Lisp_Object coding_system;
ec6d2bb8 5040{
0be8721c 5041 Lisp_Object spec, attrs;
df7492f9 5042 Lisp_Object eol_type, raw_text_eol_type;
ec6d2bb8 5043
d3e4cb56
KH
5044 if (NILP (coding_system))
5045 return Qraw_text;
df7492f9
KH
5046 spec = CODING_SYSTEM_SPEC (coding_system);
5047 attrs = AREF (spec, 0);
ec6d2bb8 5048
df7492f9
KH
5049 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5050 return coding_system;
ec6d2bb8 5051
df7492f9
KH
5052 eol_type = AREF (spec, 2);
5053 if (VECTORP (eol_type))
5054 return Qraw_text;
5055 spec = CODING_SYSTEM_SPEC (Qraw_text);
5056 raw_text_eol_type = AREF (spec, 2);
5057 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5058 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5059 : AREF (raw_text_eol_type, 2));
ec6d2bb8
KH
5060}
5061
54f78171 5062
df7492f9
KH
5063/* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5064 does, return one of the subsidiary that has the same eol-spec as
5065 PARENT. Otherwise, return CODING_SYSTEM. */
5066
5067Lisp_Object
5068coding_inherit_eol_type (coding_system, parent)
b74e4686 5069 Lisp_Object coding_system, parent;
54f78171 5070{
3e139625 5071 Lisp_Object spec, eol_type;
54f78171 5072
d3e4cb56
KH
5073 if (NILP (coding_system))
5074 coding_system = Qraw_text;
df7492f9 5075 spec = CODING_SYSTEM_SPEC (coding_system);
df7492f9 5076 eol_type = AREF (spec, 2);
d3e4cb56
KH
5077 if (VECTORP (eol_type)
5078 && ! NILP (parent))
df7492f9
KH
5079 {
5080 Lisp_Object parent_spec;
df7492f9
KH
5081 Lisp_Object parent_eol_type;
5082
5083 parent_spec
5084 = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
5085 parent_eol_type = AREF (parent_spec, 2);
5086 if (EQ (parent_eol_type, Qunix))
5087 coding_system = AREF (eol_type, 0);
5088 else if (EQ (parent_eol_type, Qdos))
5089 coding_system = AREF (eol_type, 1);
5090 else if (EQ (parent_eol_type, Qmac))
5091 coding_system = AREF (eol_type, 2);
54f78171 5092 }
df7492f9 5093 return coding_system;
54f78171
KH
5094}
5095
4ed46869
KH
5096/* Emacs has a mechanism to automatically detect a coding system if it
5097 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5098 it's impossible to distinguish some coding systems accurately
5099 because they use the same range of codes. So, at first, coding
5100 systems are categorized into the following categories:
5101
0ef69138 5102 o coding-category-emacs-mule
4ed46869
KH
5103
5104 The category for a coding system which has the same code range
5105 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 5106 symbol) `emacs-mule' by default.
4ed46869
KH
5107
5108 o coding-category-sjis
5109
5110 The category for a coding system which has the same code range
5111 as SJIS. Assigned the coding-system (Lisp
7717c392 5112 symbol) `japanese-shift-jis' by default.
4ed46869
KH
5113
5114 o coding-category-iso-7
5115
5116 The category for a coding system which has the same code range
7717c392 5117 as ISO2022 of 7-bit environment. This doesn't use any locking
d46c5b12
KH
5118 shift and single shift functions. This can encode/decode all
5119 charsets. Assigned the coding-system (Lisp symbol)
5120 `iso-2022-7bit' by default.
5121
5122 o coding-category-iso-7-tight
5123
5124 Same as coding-category-iso-7 except that this can
5125 encode/decode only the specified charsets.
4ed46869
KH
5126
5127 o coding-category-iso-8-1
5128
5129 The category for a coding system which has the same code range
5130 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
5131 for DIMENSION1 charset. This doesn't use any locking shift
5132 and single shift functions. Assigned the coding-system (Lisp
5133 symbol) `iso-latin-1' by default.
4ed46869
KH
5134
5135 o coding-category-iso-8-2
5136
5137 The category for a coding system which has the same code range
5138 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
5139 for DIMENSION2 charset. This doesn't use any locking shift
5140 and single shift functions. Assigned the coding-system (Lisp
5141 symbol) `japanese-iso-8bit' by default.
4ed46869 5142
7717c392 5143 o coding-category-iso-7-else
4ed46869
KH
5144
5145 The category for a coding system which has the same code range
df7492f9 5146 as ISO2022 of 7-bit environment but uses locking shift or
7717c392
KH
5147 single shift functions. Assigned the coding-system (Lisp
5148 symbol) `iso-2022-7bit-lock' by default.
5149
5150 o coding-category-iso-8-else
5151
5152 The category for a coding system which has the same code range
df7492f9 5153 as ISO2022 of 8-bit environment but uses locking shift or
7717c392
KH
5154 single shift functions. Assigned the coding-system (Lisp
5155 symbol) `iso-2022-8bit-ss2' by default.
4ed46869
KH
5156
5157 o coding-category-big5
5158
5159 The category for a coding system which has the same code range
5160 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 5161 `cn-big5' by default.
4ed46869 5162
fa42c37f
KH
5163 o coding-category-utf-8
5164
5165 The category for a coding system which has the same code range
5166 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
5167 symbol) `utf-8' by default.
5168
5169 o coding-category-utf-16-be
5170
5171 The category for a coding system in which a text has an
5172 Unicode signature (cf. Unicode Standard) in the order of BIG
5173 endian at the head. Assigned the coding-system (Lisp symbol)
5174 `utf-16-be' by default.
5175
5176 o coding-category-utf-16-le
5177
5178 The category for a coding system in which a text has an
5179 Unicode signature (cf. Unicode Standard) in the order of
5180 LITTLE endian at the head. Assigned the coding-system (Lisp
5181 symbol) `utf-16-le' by default.
5182
1397dc18
KH
5183 o coding-category-ccl
5184
5185 The category for a coding system of which encoder/decoder is
5186 written in CCL programs. The default value is nil, i.e., no
5187 coding system is assigned.
5188
4ed46869
KH
5189 o coding-category-binary
5190
5191 The category for a coding system not categorized in any of the
5192 above. Assigned the coding-system (Lisp symbol)
e0e989f6 5193 `no-conversion' by default.
4ed46869
KH
5194
5195 Each of them is a Lisp symbol and the value is an actual
df7492f9 5196 `coding-system's (this is also a Lisp symbol) assigned by a user.
4ed46869
KH
5197 What Emacs does actually is to detect a category of coding system.
5198 Then, it uses a `coding-system' assigned to it. If Emacs can't
df7492f9 5199 decide only one possible category, it selects a category of the
4ed46869
KH
5200 highest priority. Priorities of categories are also specified by a
5201 user in a Lisp variable `coding-category-list'.
5202
5203*/
5204
df7492f9
KH
5205#define EOL_SEEN_NONE 0
5206#define EOL_SEEN_LF 1
5207#define EOL_SEEN_CR 2
5208#define EOL_SEEN_CRLF 4
66cfb530 5209
ff0dacd7
KH
5210/* Detect how end-of-line of a text of length SRC_BYTES pointed by
5211 SOURCE is encoded. If CATEGORY is one of
5212 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5213 two-byte, else they are encoded by one-byte.
5214
5215 Return one of EOL_SEEN_XXX. */
4ed46869 5216
bc4bc72a 5217#define MAX_EOL_CHECK_COUNT 3
d46c5b12
KH
5218
5219static int
89528eb3 5220detect_eol (source, src_bytes, category)
d46c5b12 5221 unsigned char *source;
df7492f9 5222 EMACS_INT src_bytes;
89528eb3 5223 enum coding_category category;
4ed46869 5224{
d46c5b12 5225 unsigned char *src = source, *src_end = src + src_bytes;
4ed46869 5226 unsigned char c;
df7492f9
KH
5227 int total = 0;
5228 int eol_seen = EOL_SEEN_NONE;
4ed46869 5229
89528eb3 5230 if ((1 << category) & CATEGORY_MASK_UTF_16)
4ed46869 5231 {
df7492f9 5232 int msb, lsb;
fa42c37f 5233
89528eb3
KH
5234 msb = category == (coding_category_utf_16_le
5235 | coding_category_utf_16_le_nosig);
df7492f9 5236 lsb = 1 - msb;
fa42c37f 5237
df7492f9 5238 while (src + 1 < src_end)
fa42c37f 5239 {
df7492f9
KH
5240 c = src[lsb];
5241 if (src[msb] == 0 && (c == '\n' || c == '\r'))
fa42c37f 5242 {
df7492f9
KH
5243 int this_eol;
5244
5245 if (c == '\n')
5246 this_eol = EOL_SEEN_LF;
5247 else if (src + 3 >= src_end
5248 || src[msb + 2] != 0
5249 || src[lsb + 2] != '\n')
5250 this_eol = EOL_SEEN_CR;
fa42c37f 5251 else
8f924df7 5252 this_eol = EOL_SEEN_CRLF;
df7492f9
KH
5253
5254 if (eol_seen == EOL_SEEN_NONE)
5255 /* This is the first end-of-line. */
5256 eol_seen = this_eol;
5257 else if (eol_seen != this_eol)
fa42c37f 5258 {
df7492f9
KH
5259 /* The found type is different from what found before. */
5260 eol_seen = EOL_SEEN_LF;
5261 break;
fa42c37f 5262 }
df7492f9
KH
5263 if (++total == MAX_EOL_CHECK_COUNT)
5264 break;
fa42c37f 5265 }
df7492f9 5266 src += 2;
fa42c37f 5267 }
bcf26d6a 5268 }
d46c5b12 5269 else
c4825358 5270 {
df7492f9 5271 while (src < src_end)
27901516 5272 {
df7492f9
KH
5273 c = *src++;
5274 if (c == '\n' || c == '\r')
5275 {
5276 int this_eol;
d46c5b12 5277
df7492f9
KH
5278 if (c == '\n')
5279 this_eol = EOL_SEEN_LF;
5280 else if (src >= src_end || *src != '\n')
5281 this_eol = EOL_SEEN_CR;
5282 else
5283 this_eol = EOL_SEEN_CRLF, src++;
d46c5b12 5284
df7492f9
KH
5285 if (eol_seen == EOL_SEEN_NONE)
5286 /* This is the first end-of-line. */
5287 eol_seen = this_eol;
5288 else if (eol_seen != this_eol)
5289 {
5290 /* The found type is different from what found before. */
5291 eol_seen = EOL_SEEN_LF;
5292 break;
5293 }
5294 if (++total == MAX_EOL_CHECK_COUNT)
5295 break;
5296 }
5297 }
73be902c 5298 }
df7492f9 5299 return eol_seen;
73be902c
KH
5300}
5301
df7492f9 5302
24a73b0a 5303static Lisp_Object
df7492f9
KH
5304adjust_coding_eol_type (coding, eol_seen)
5305 struct coding_system *coding;
5306 int eol_seen;
73be902c 5307{
0be8721c 5308 Lisp_Object eol_type;
8f924df7 5309
df7492f9
KH
5310 eol_type = CODING_ID_EOL_TYPE (coding->id);
5311 if (eol_seen & EOL_SEEN_LF)
24a73b0a
KH
5312 {
5313 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5314 eol_type = Qunix;
5315 }
6f197c07 5316 else if (eol_seen & EOL_SEEN_CRLF)
24a73b0a
KH
5317 {
5318 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5319 eol_type = Qdos;
5320 }
6f197c07 5321 else if (eol_seen & EOL_SEEN_CR)
24a73b0a
KH
5322 {
5323 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5324 eol_type = Qmac;
5325 }
5326 return eol_type;
d46c5b12 5327}
4ed46869 5328
df7492f9
KH
5329/* Detect how a text specified in CODING is encoded. If a coding
5330 system is detected, update fields of CODING by the detected coding
5331 system. */
0a28aafb 5332
df7492f9
KH
5333void
5334detect_coding (coding)
d46c5b12 5335 struct coding_system *coding;
d46c5b12 5336{
8f924df7 5337 const unsigned char *src, *src_end;
df7492f9 5338 Lisp_Object attrs, coding_type;
d46c5b12 5339
df7492f9
KH
5340 coding->consumed = coding->consumed_char = 0;
5341 coding->produced = coding->produced_char = 0;
5342 coding_set_source (coding);
1c3478b0 5343
df7492f9 5344 src_end = coding->source + coding->src_bytes;
1c3478b0 5345
df7492f9
KH
5346 /* If we have not yet decided the text encoding type, detect it
5347 now. */
5348 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
b73bfc1c 5349 {
df7492f9
KH
5350 int c, i;
5351
24a73b0a 5352 for (i = 0, src = coding->source; src < src_end; i++, src++)
d46c5b12 5353 {
df7492f9 5354 c = *src;
75e2a253 5355 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
df7492f9
KH
5356 || c == ISO_CODE_SI
5357 || c == ISO_CODE_SO)))
5358 break;
d46c5b12 5359 }
df7492f9
KH
5360 coding->head_ascii = src - (coding->source + coding->consumed);
5361
5362 if (coding->head_ascii < coding->src_bytes)
d46c5b12 5363 {
ff0dacd7
KH
5364 struct coding_detection_info detect_info;
5365 enum coding_category category;
5366 struct coding_system *this;
df7492f9 5367
ff0dacd7 5368 detect_info.checked = detect_info.found = detect_info.rejected = 0;
df7492f9 5369 for (i = 0; i < coding_category_raw_text; i++)
d46c5b12 5370 {
ff0dacd7
KH
5371 category = coding_priorities[i];
5372 this = coding_categories + category;
df7492f9 5373 if (this->id < 0)
fa42c37f 5374 {
df7492f9 5375 /* No coding system of this category is defined. */
ff0dacd7 5376 detect_info.rejected |= (1 << category);
fa42c37f 5377 }
ff0dacd7 5378 else if (category >= coding_category_raw_text)
89528eb3 5379 continue;
ff0dacd7 5380 else if (detect_info.checked & (1 << category))
fa42c37f 5381 {
ff0dacd7
KH
5382 if (detect_info.found & (1 << category))
5383 break;
fa42c37f 5384 }
ff0dacd7
KH
5385 else if ((*(this->detector)) (coding, &detect_info)
5386 && detect_info.found & (1 << category))
24a73b0a
KH
5387 {
5388 if (category == coding_category_utf_16_auto)
5389 {
5390 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5391 category = coding_category_utf_16_le;
5392 else
5393 category = coding_category_utf_16_be;
5394 }
5395 break;
5396 }
d46c5b12 5397 }
ff0dacd7
KH
5398 if (i < coding_category_raw_text)
5399 setup_coding_system (CODING_ID_NAME (this->id), coding);
5400 else if (detect_info.rejected == CATEGORY_MASK_ANY)
df7492f9 5401 setup_coding_system (Qraw_text, coding);
ff0dacd7 5402 else if (detect_info.rejected)
df7492f9 5403 for (i = 0; i < coding_category_raw_text; i++)
ff0dacd7
KH
5404 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5405 {
5406 this = coding_categories + coding_priorities[i];
5407 setup_coding_system (CODING_ID_NAME (this->id), coding);
5408 break;
5409 }
d46c5b12 5410 }
b73bfc1c 5411 }
24a73b0a
KH
5412 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5413 == coding_category_utf_16_auto)
b49a1807
KH
5414 {
5415 Lisp_Object coding_systems;
5416 struct coding_detection_info detect_info;
5417
5418 coding_systems
5419 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5420 detect_info.found = detect_info.rejected = 0;
5421 if (CONSP (coding_systems)
24a73b0a 5422 && detect_coding_utf_16 (coding, &detect_info))
b49a1807
KH
5423 {
5424 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5425 setup_coding_system (XCAR (coding_systems), coding);
24a73b0a 5426 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
b49a1807
KH
5427 setup_coding_system (XCDR (coding_systems), coding);
5428 }
5429 }
4ed46869 5430}
4ed46869 5431
d46c5b12 5432
/* Convert the end-of-line format of the text just produced for
   CODING to LF, in place.

   The text is CODING->produced bytes long and starts at
   CODING->destination, or at byte position CODING->dst_pos_byte when
   CODING->dst_object is non-nil.  Nothing is done for the unix
   format.  If the format is still undecided (a vector), it is first
   detected from the text and CODING is adjusted accordingly.  */
aaaf0b1e 5433 static void
df7492f9 5434 decode_eol (coding)
aaaf0b1e 5435 struct coding_system *coding;
aaaf0b1e 5436 {
24a73b0a
KH
5437 Lisp_Object eol_type;
5438 unsigned char *p, *pbeg, *pend;
5439
5440 eol_type = CODING_ID_EOL_TYPE (coding->id);
5441 if (EQ (eol_type, Qunix))
5442 return;
5443
5444 if (NILP (coding->dst_object))
5445 pbeg = coding->destination;
5446 else
5447 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
5448 pend = pbeg + coding->produced;
5449
5450 if (VECTORP (eol_type))
aaaf0b1e 5451 {
/* Collect, as a bit mask, every kind of line end that occurs.  */
df7492f9 5452 int eol_seen = EOL_SEEN_NONE;
4ed46869 5453
24a73b0a 5454 for (p = pbeg; p < pend; p++)
aaaf0b1e 5455 {
df7492f9
KH
5456 if (*p == '\n')
5457 eol_seen |= EOL_SEEN_LF;
5458 else if (*p == '\r')
aaaf0b1e 5459 {
df7492f9 5460 if (p + 1 < pend && *(p + 1) == '\n')
aaaf0b1e 5461 {
df7492f9
KH
5462 eol_seen |= EOL_SEEN_CRLF;
5463 p++;
aaaf0b1e 5464 }
aaaf0b1e 5465 else
df7492f9 5466 eol_seen |= EOL_SEEN_CR;
aaaf0b1e 5467 }
aaaf0b1e 5468 }
/* More than one kind of line end was seen: treat the text as LF.  */
24a73b0a
KH
5469 if (eol_seen != EOL_SEEN_NONE
5470 && eol_seen != EOL_SEEN_LF
5471 && eol_seen != EOL_SEEN_CRLF
5472 && eol_seen != EOL_SEEN_CR)
5473 eol_seen = EOL_SEEN_LF;
df7492f9 5474 if (eol_seen != EOL_SEEN_NONE)
24a73b0a 5475 eol_type = adjust_coding_eol_type (coding, eol_seen);
aaaf0b1e 5476 }
d46c5b12 5477
24a73b0a 5478 if (EQ (eol_type, Qmac))
27901516 5479 {
/* Mac format: replace every CR by LF; the length does not change.  */
24a73b0a 5480 for (p = pbeg; p < pend; p++)
df7492f9
KH
5481 if (*p == '\r')
5482 *p = '\n';
4ed46869 5483 }
24a73b0a 5484 else if (EQ (eol_type, Qdos))
df7492f9 5485 {
/* N counts the CR bytes removed.  */
24a73b0a 5486 int n = 0;
b73bfc1c 5487
/* Both loops below scan backward starting at the next-to-last byte,
   so a CR in the very last byte is kept.
   NOTE(review): the test is only `*p == '\r''; it does not check
   that the CR is followed by LF.  */
24a73b0a
KH
5488 if (NILP (coding->dst_object))
5489 {
5490 for (p = pend - 2; p >= pbeg; p--)
5491 if (*p == '\r')
5492 {
/* Shift the bytes after the CR down by one and shrink the text.  */
5493 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
5494 n++;
5495 }
5496 }
5497 else
5498 {
5499 for (p = pend - 2; p >= pbeg; p--)
5500 if (*p == '\r')
5501 {
/* Delete the single CR byte with del_range_2, at the position that
   corresponds to P.  */
5502 int pos_byte = coding->dst_pos_byte + (p - pbeg);
5503 int pos = BYTE_TO_CHAR (pos_byte);
5504
5505 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
5506 n++;
5507 }
5508 }
5509 coding->produced -= n;
5510 coding->produced_char -= n;
aaaf0b1e 5511 }
4ed46869
KH
5512 }
5513
7d64c6ad 5514
a6f87d34
KH
5515/* Return a translation table (or list of them) from coding system
5516 attribute vector ATTRS for encoding (ENCODEP is nonzero) or
5517 decoding (ENCODEP is zero). */
7d64c6ad
KH
5518
5519static INLINE
5520get_translation_table (attrs, encodep)
5521{
5522 Lisp_Object standard, translation_table;
5523
5524 if (encodep)
5525 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
5526 standard = Vstandard_translation_table_for_encode;
5527 else
5528 translation_table = CODING_ATTR_DECODE_TBL (attrs),
5529 standard = Vstandard_translation_table_for_decode;
7d64c6ad 5530 if (NILP (translation_table))
a6f87d34
KH
5531 return standard;
5532 if (SYMBOLP (translation_table))
5533 translation_table = Fget (translation_table, Qtranslation_table);
5534 else if (CONSP (translation_table))
5535 {
5536 Lisp_Object val;
5537
5538 translation_table = Fcopy_sequence (translation_table);
5539 for (val = translation_table; CONSP (val); val = XCDR (val))
5540 if (SYMBOLP (XCAR (val)))
5541 XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
5542 }
5543 if (! NILP (standard))
5544 {
5545 if (CONSP (translation_table))
5546 translation_table = nconc2 (translation_table, Fcons (standard, Qnil));
5547 else
5548 translation_table = Fcons (translation_table, Fcons (standard, Qnil));
5549 }
7d64c6ad
KH
5550 return translation_table;
5551}
5552
5553
df7492f9
KH
5554static void
5555translate_chars (coding, table)
4ed46869 5556 struct coding_system *coding;
df7492f9 5557 Lisp_Object table;
4ed46869 5558{
df7492f9
KH
5559 int *charbuf = coding->charbuf;
5560 int *charbuf_end = charbuf + coding->charbuf_used;
5561 int c;
d46c5b12 5562
df7492f9
KH
5563 if (coding->chars_at_source)
5564 return;
4ed46869 5565
df7492f9 5566 while (charbuf < charbuf_end)
8844fa83 5567 {
df7492f9
KH
5568 c = *charbuf;
5569 if (c < 0)
7d64c6ad 5570 charbuf += -c;
df7492f9
KH
5571 else
5572 *charbuf++ = translate_char (table, c);
8844fa83 5573 }
df7492f9 5574}
bc4bc72a 5575
d46c5b12 5576static int
df7492f9
KH
5577produce_chars (coding)
5578 struct coding_system *coding;
4ed46869 5579{
df7492f9
KH
5580 unsigned char *dst = coding->destination + coding->produced;
5581 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5582 int produced;
5583 int produced_chars = 0;
4ed46869 5584
df7492f9 5585 if (! coding->chars_at_source)
4ed46869 5586 {
df7492f9 5587 /* Characters are in coding->charbuf. */
fba4576f
AS
5588 int *buf = coding->charbuf;
5589 int *buf_end = buf + coding->charbuf_used;
df7492f9 5590 unsigned char *adjusted_dst_end;
4ed46869 5591
df7492f9
KH
5592 if (BUFFERP (coding->src_object)
5593 && EQ (coding->src_object, coding->dst_object))
8f924df7 5594 dst_end = ((unsigned char *) coding->source) + coding->consumed;
df7492f9 5595 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
4ed46869 5596
df7492f9 5597 while (buf < buf_end)
4ed46869 5598 {
df7492f9 5599 int c = *buf++;
bc4bc72a 5600
df7492f9 5601 if (dst >= adjusted_dst_end)
d46c5b12 5602 {
df7492f9
KH
5603 dst = alloc_destination (coding,
5604 buf_end - buf + MAX_MULTIBYTE_LENGTH,
5605 dst);
5606 dst_end = coding->destination + coding->dst_bytes;
5607 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5608 }
5609 if (c >= 0)
5610 {
5611 if (coding->dst_multibyte
5612 || ! CHAR_BYTE8_P (c))
5613 CHAR_STRING_ADVANCE (c, dst);
5614 else
5615 *dst++ = CHAR_TO_BYTE8 (c);
5616 produced_chars++;
d46c5b12 5617 }
df7492f9 5618 else
d3e4cb56
KH
5619 /* This is an annotation datum. (-C) is the length of
5620 it. */
5621 buf += -c - 1;
4ed46869
KH
5622 }
5623 }
fa42c37f 5624 else
fa42c37f 5625 {
8f924df7
KH
5626 const unsigned char *src = coding->source;
5627 const unsigned char *src_end = src + coding->src_bytes;
df7492f9 5628 Lisp_Object eol_type;
fa42c37f 5629
df7492f9 5630 eol_type = CODING_ID_EOL_TYPE (coding->id);
4ed46869 5631
df7492f9 5632 if (coding->src_multibyte != coding->dst_multibyte)
fa42c37f 5633 {
df7492f9 5634 if (coding->src_multibyte)
fa42c37f 5635 {
71c81426 5636 int multibytep = 1;
df7492f9 5637 int consumed_chars;
d46c5b12 5638
df7492f9
KH
5639 while (1)
5640 {
8f924df7 5641 const unsigned char *src_base = src;
df7492f9 5642 int c;
b73bfc1c 5643
df7492f9
KH
5644 ONE_MORE_BYTE (c);
5645 if (c == '\r')
5646 {
5647 if (EQ (eol_type, Qdos))
5648 {
98725083
KH
5649 if (src == src_end)
5650 {
065e3595
KH
5651 record_conversion_result
5652 (coding, CODING_RESULT_INSUFFICIENT_SRC);
98725083
KH
5653 goto no_more_source;
5654 }
5655 if (*src == '\n')
df7492f9
KH
5656 c = *src++;
5657 }
5658 else if (EQ (eol_type, Qmac))
5659 c = '\n';
5660 }
5661 if (dst == dst_end)
5662 {
2c78b7e1 5663 coding->consumed = src - coding->source;
b73bfc1c 5664
2c78b7e1 5665 if (EQ (coding->src_object, coding->dst_object))
8f924df7 5666 dst_end = (unsigned char *) src;
2c78b7e1
KH
5667 if (dst == dst_end)
5668 {
5669 dst = alloc_destination (coding, src_end - src + 1,
5670 dst);
5671 dst_end = coding->destination + coding->dst_bytes;
5672 coding_set_source (coding);
5673 src = coding->source + coding->consumed;
5674 src_end = coding->source + coding->src_bytes;
5675 }
df7492f9
KH
5676 }
5677 *dst++ = c;
5678 produced_chars++;
5679 }
5680 no_more_source:
5681 ;
fa42c37f
KH
5682 }
5683 else
df7492f9
KH
5684 while (src < src_end)
5685 {
71c81426 5686 int multibytep = 1;
df7492f9 5687 int c = *src++;
b73bfc1c 5688
df7492f9
KH
5689 if (c == '\r')
5690 {
5691 if (EQ (eol_type, Qdos))
5692 {
5693 if (src < src_end
5694 && *src == '\n')
5695 c = *src++;
5696 }
5697 else if (EQ (eol_type, Qmac))
5698 c = '\n';
5699 }
5700 if (dst >= dst_end - 1)
5701 {
2c78b7e1 5702 coding->consumed = src - coding->source;
df7492f9 5703
2c78b7e1 5704 if (EQ (coding->src_object, coding->dst_object))
8f924df7 5705 dst_end = (unsigned char *) src;
2c78b7e1
KH
5706 if (dst >= dst_end - 1)
5707 {
5708 dst = alloc_destination (coding, src_end - src + 2,
5709 dst);
5710 dst_end = coding->destination + coding->dst_bytes;
5711 coding_set_source (coding);
5712 src = coding->source + coding->consumed;
5713 src_end = coding->source + coding->src_bytes;
5714 }
df7492f9
KH
5715 }
5716 EMIT_ONE_BYTE (c);
5717 }
d46c5b12 5718 }
df7492f9
KH
5719 else
5720 {
5721 if (!EQ (coding->src_object, coding->dst_object))
fa42c37f 5722 {
df7492f9 5723 int require = coding->src_bytes - coding->dst_bytes;
4ed46869 5724
df7492f9 5725 if (require > 0)
fa42c37f 5726 {
df7492f9
KH
5727 EMACS_INT offset = src - coding->source;
5728
5729 dst = alloc_destination (coding, require, dst);
5730 coding_set_source (coding);
5731 src = coding->source + offset;
5732 src_end = coding->source + coding->src_bytes;
fa42c37f
KH
5733 }
5734 }
df7492f9
KH
5735 produced_chars = coding->src_chars;
5736 while (src < src_end)
fa42c37f 5737 {
df7492f9
KH
5738 int c = *src++;
5739
5740 if (c == '\r')
5741 {
5742 if (EQ (eol_type, Qdos))
5743 {
5744 if (src < src_end
5745 && *src == '\n')
5746 c = *src++;
5747 produced_chars--;
5748 }
5749 else if (EQ (eol_type, Qmac))
5750 c = '\n';
5751 }
5752 *dst++ = c;
fa42c37f
KH
5753 }
5754 }
2c78b7e1
KH
5755 coding->consumed = coding->src_bytes;
5756 coding->consumed_char = coding->src_chars;
fa42c37f
KH
5757 }
5758
df7492f9
KH
5759 produced = dst - (coding->destination + coding->produced);
5760 if (BUFFERP (coding->dst_object))
5761 insert_from_gap (produced_chars, produced);
5762 coding->produced += produced;
5763 coding->produced_char += produced_chars;
5764 return produced_chars;
fa42c37f
KH
5765}
5766
ff0dacd7
KH
5767/* Compose text in CODING->object according to the annotation data at
5768 CHARBUF. CHARBUF is an array:
5769 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
df7492f9 5770 */
4ed46869 5771
df7492f9
KH
5772static INLINE void
5773produce_composition (coding, charbuf)
4ed46869 5774 struct coding_system *coding;
df7492f9 5775 int *charbuf;
4ed46869 5776{
df7492f9 5777 int len;
ff0dacd7 5778 EMACS_INT from, to;
df7492f9 5779 enum composition_method method;
df7492f9 5780 Lisp_Object components;
fa42c37f 5781
df7492f9 5782 len = -charbuf[0];
ff0dacd7
KH
5783 from = coding->dst_pos + charbuf[2];
5784 to = coding->dst_pos + charbuf[3];
5785 method = (enum composition_method) (charbuf[4]);
d46c5b12 5786
df7492f9
KH
5787 if (method == COMPOSITION_RELATIVE)
5788 components = Qnil;
d46c5b12 5789 else
d46c5b12 5790 {
df7492f9
KH
5791 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5792 int i;
b73bfc1c 5793
df7492f9
KH
5794 len -= 5;
5795 charbuf += 5;
5796 for (i = 0; i < len; i++)
5797 args[i] = make_number (charbuf[i]);
5798 components = (method == COMPOSITION_WITH_ALTCHARS
5799 ? Fstring (len, args) : Fvector (len, args));
d46c5b12 5800 }
ff0dacd7 5801 compose_text (from, to, components, Qnil, coding->dst_object);
d46c5b12
KH
5802}
5803
d46c5b12 5804
ff0dacd7
KH
5805/* Put `charset' property on text in CODING->object according to
5806 the annotation data at CHARBUF. CHARBUF is an array:
5807 [ -LENGTH ANNOTATION_MASK FROM TO CHARSET-ID ]
5808 */
d46c5b12 5809
ff0dacd7
KH
5810static INLINE void
5811produce_charset (coding, charbuf)
d46c5b12 5812 struct coding_system *coding;
ff0dacd7 5813 int *charbuf;
d46c5b12 5814{
ff0dacd7
KH
5815 EMACS_INT from = coding->dst_pos + charbuf[2];
5816 EMACS_INT to = coding->dst_pos + charbuf[3];
5817 struct charset *charset = CHARSET_FROM_ID (charbuf[4]);
b73bfc1c 5818
ff0dacd7
KH
5819 Fput_text_property (make_number (from), make_number (to),
5820 Qcharset, CHARSET_NAME (charset),
5821 coding->dst_object);
d46c5b12
KH
5822}
5823
d46c5b12 5824
df7492f9
KH
/* Number of ints first requested for the conversion work area.  */
#define CHARBUF_SIZE 0x4000

/* Allocate, with alloca, the work area CODING->charbuf in the frame
   of the calling function and record its size in
   CODING->charbuf_size.  CHARBUF_SIZE ints are tried first; on a null
   result the request is halved while it is still above 1024.  If no
   area could be obtained, CODING_RESULT_INSUFFICIENT_MEM is recorded
   and the *calling function* returns CODING->result, so this macro
   may only be used in functions returning a conversion result.
   CODING may be any expression yielding a `struct coding_system *';
   it is evaluated more than once.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int size = CHARBUF_SIZE;                                            \
                                                                        \
    (coding)->charbuf = NULL;                                           \
    while (size > 1024)                                                 \
      {                                                                 \
        (coding)->charbuf = (int *) alloca (sizeof (int) * size);       \
        if ((coding)->charbuf)                                          \
          break;                                                        \
        size >>= 1;                                                     \
      }                                                                 \
    if (! (coding)->charbuf)                                            \
      {                                                                 \
        record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
        return (coding)->result;                                        \
      }                                                                 \
    (coding)->charbuf_size = size;                                      \
  } while (0)
4ed46869 5846
d46c5b12
KH
5847
5848static void
df7492f9 5849produce_annotation (coding)
d46c5b12 5850 struct coding_system *coding;
d46c5b12 5851{
df7492f9
KH
5852 int *charbuf = coding->charbuf;
5853 int *charbuf_end = charbuf + coding->charbuf_used;
d46c5b12 5854
ff0dacd7
KH
5855 if (NILP (coding->dst_object))
5856 return;
d46c5b12 5857
df7492f9 5858 while (charbuf < charbuf_end)
a84f1519 5859 {
df7492f9
KH
5860 if (*charbuf >= 0)
5861 charbuf++;
d46c5b12 5862 else
d46c5b12 5863 {
df7492f9 5864 int len = -*charbuf;
ff0dacd7 5865 switch (charbuf[1])
df7492f9
KH
5866 {
5867 case CODING_ANNOTATE_COMPOSITION_MASK:
5868 produce_composition (coding, charbuf);
5869 break;
ff0dacd7
KH
5870 case CODING_ANNOTATE_CHARSET_MASK:
5871 produce_charset (coding, charbuf);
5872 break;
df7492f9
KH
5873 default:
5874 abort ();
5875 }
5876 charbuf += len;
d46c5b12 5877 }
a84f1519 5878 }
d46c5b12
KH
5879}
5880
df7492f9
KH
5881/* Decode the data at CODING->src_object into CODING->dst_object.
5882 CODING->src_object is a buffer, a string, or nil.
5883 CODING->dst_object is a buffer.
d46c5b12 5884
df7492f9
KH
5885 If CODING->src_object is a buffer, it must be the current buffer.
5886 In this case, if CODING->src_pos is positive, it is a position of
5887 the source text in the buffer, otherwise, the source text is in the
5888 gap area of the buffer, and CODING->src_pos specifies the offset of
5889 the text from GPT (which must be the same as PT). If this is the
5890 same buffer as CODING->dst_object, CODING->src_pos must be
5891 negative.
d46c5b12 5892
df7492f9
KH
5893 If CODING->src_object is a string, CODING->src_pos in an index to
5894 that string.
d46c5b12 5895
df7492f9
KH
5896 If CODING->src_object is nil, CODING->source must already point to
5897 the non-relocatable memory area. In this case, CODING->src_pos is
5898 an offset from CODING->source.
73be902c 5899
df7492f9
KH
5900 The decoded data is inserted at the current point of the buffer
5901 CODING->dst_object.
5902*/
d46c5b12 5903
df7492f9
KH
5904static int
5905decode_coding (coding)
d46c5b12 5906 struct coding_system *coding;
d46c5b12 5907{
df7492f9 5908 Lisp_Object attrs;
24a73b0a 5909 Lisp_Object undo_list;
7d64c6ad 5910 Lisp_Object translation_table;
d46c5b12 5911
df7492f9
KH
5912 if (BUFFERP (coding->src_object)
5913 && coding->src_pos > 0
5914 && coding->src_pos < GPT
5915 && coding->src_pos + coding->src_chars > GPT)
5916 move_gap_both (coding->src_pos, coding->src_pos_byte);
d46c5b12 5917
24a73b0a 5918 undo_list = Qt;
df7492f9 5919 if (BUFFERP (coding->dst_object))
1c3478b0 5920 {
df7492f9
KH
5921 if (current_buffer != XBUFFER (coding->dst_object))
5922 set_buffer_internal (XBUFFER (coding->dst_object));
5923 if (GPT != PT)
5924 move_gap_both (PT, PT_BYTE);
24a73b0a
KH
5925 undo_list = current_buffer->undo_list;
5926 current_buffer->undo_list = Qt;
1c3478b0
KH
5927 }
5928
df7492f9
KH
5929 coding->consumed = coding->consumed_char = 0;
5930 coding->produced = coding->produced_char = 0;
5931 coding->chars_at_source = 0;
065e3595 5932 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 5933 coding->errors = 0;
1c3478b0 5934
df7492f9
KH
5935 ALLOC_CONVERSION_WORK_AREA (coding);
5936
5937 attrs = CODING_ID_ATTRS (coding->id);
a6f87d34 5938 translation_table = get_translation_table (attrs, 0);
df7492f9
KH
5939
5940 do
b73bfc1c 5941 {
df7492f9
KH
5942 coding_set_source (coding);
5943 coding->annotated = 0;
5944 (*(coding->decoder)) (coding);
7d64c6ad
KH
5945 if (!NILP (translation_table))
5946 translate_chars (coding, translation_table);
df7492f9
KH
5947 coding_set_destination (coding);
5948 produce_chars (coding);
5949 if (coding->annotated)
5950 produce_annotation (coding);
d46c5b12 5951 }
df7492f9
KH
5952 while (coding->consumed < coding->src_bytes
5953 && ! coding->result);
d46c5b12 5954
df7492f9
KH
5955 coding->carryover_bytes = 0;
5956 if (coding->consumed < coding->src_bytes)
d46c5b12 5957 {
df7492f9 5958 int nbytes = coding->src_bytes - coding->consumed;
8f924df7 5959 const unsigned char *src;
df7492f9
KH
5960
5961 coding_set_source (coding);
5962 coding_set_destination (coding);
5963 src = coding->source + coding->consumed;
5964
5965 if (coding->mode & CODING_MODE_LAST_BLOCK)
1c3478b0 5966 {
df7492f9
KH
5967 /* Flush out unprocessed data as binary chars. We are sure
5968 that the number of data is less than the size of
5969 coding->charbuf. */
065e3595 5970 coding->charbuf_used = 0;
df7492f9 5971 while (nbytes-- > 0)
1c3478b0 5972 {
df7492f9 5973 int c = *src++;
98725083
KH
5974
5975 coding->charbuf[coding->charbuf_used++] = (c & 0x80 ? - c : c);
1c3478b0 5976 }
df7492f9 5977 produce_chars (coding);
d46c5b12 5978 }
d46c5b12 5979 else
df7492f9
KH
5980 {
5981 /* Record unprocessed bytes in coding->carryover. We are
5982 sure that the number of data is less than the size of
5983 coding->carryover. */
5984 unsigned char *p = coding->carryover;
5985
5986 coding->carryover_bytes = nbytes;
5987 while (nbytes-- > 0)
5988 *p++ = *src++;
1c3478b0 5989 }
df7492f9 5990 coding->consumed = coding->src_bytes;
b73bfc1c 5991 }
69f76525 5992
24a73b0a
KH
5993 if (BUFFERP (coding->dst_object))
5994 {
5995 current_buffer->undo_list = undo_list;
5996 record_insert (coding->dst_pos, coding->produced_char);
5997 }
5998 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
5999 decode_eol (coding);
73be902c 6000 return coding->result;
4ed46869
KH
6001}
6002
aaaf0b1e 6003
e1c23804 6004/* Extract an annotation datum from a composition starting at POS and
ff0dacd7
KH
6005 ending before LIMIT of CODING->src_object (buffer or string), store
6006 the data in BUF, set *STOP to a starting position of the next
6007 composition (if any) or to LIMIT, and return the address of the
6008 next element of BUF.
6009
6010 If such an annotation is not found, set *STOP to a starting
6011 position of a composition after POS (if any) or to LIMIT, and
6012 return BUF. */
6013
6014static INLINE int *
6015handle_composition_annotation (pos, limit, coding, buf, stop)
6016 EMACS_INT pos, limit;
aaaf0b1e 6017 struct coding_system *coding;
ff0dacd7
KH
6018 int *buf;
6019 EMACS_INT *stop;
aaaf0b1e 6020{
ff0dacd7
KH
6021 EMACS_INT start, end;
6022 Lisp_Object prop;
aaaf0b1e 6023
ff0dacd7
KH
6024 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
6025 || end > limit)
6026 *stop = limit;
6027 else if (start > pos)
6028 *stop = start;
6029 else
aaaf0b1e 6030 {
ff0dacd7 6031 if (start == pos)
aaaf0b1e 6032 {
ff0dacd7
KH
6033 /* We found a composition. Store the corresponding
6034 annotation data in BUF. */
6035 int *head = buf;
6036 enum composition_method method = COMPOSITION_METHOD (prop);
6037 int nchars = COMPOSITION_LENGTH (prop);
6038
6039 ADD_COMPOSITION_DATA (buf, 0, nchars, method);
6040 if (method != COMPOSITION_RELATIVE)
aaaf0b1e 6041 {
ff0dacd7
KH
6042 Lisp_Object components;
6043 int len, i, i_byte;
6044
6045 components = COMPOSITION_COMPONENTS (prop);
6046 if (VECTORP (components))
aaaf0b1e 6047 {
ff0dacd7
KH
6048 len = XVECTOR (components)->size;
6049 for (i = 0; i < len; i++)
6050 *buf++ = XINT (AREF (components, i));
aaaf0b1e 6051 }
ff0dacd7 6052 else if (STRINGP (components))
aaaf0b1e 6053 {
8f924df7 6054 len = SCHARS (components);
ff0dacd7
KH
6055 i = i_byte = 0;
6056 while (i < len)
6057 {
6058 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
6059 buf++;
6060 }
6061 }
6062 else if (INTEGERP (components))
6063 {
6064 len = 1;
6065 *buf++ = XINT (components);
6066 }
6067 else if (CONSP (components))
6068 {
6069 for (len = 0; CONSP (components);
6070 len++, components = XCDR (components))
6071 *buf++ = XINT (XCAR (components));
aaaf0b1e 6072 }
aaaf0b1e 6073 else
ff0dacd7
KH
6074 abort ();
6075 *head -= len;
aaaf0b1e 6076 }
aaaf0b1e 6077 }
ff0dacd7
KH
6078
6079 if (find_composition (end, limit, &start, &end, &prop,
6080 coding->src_object)
6081 && end <= limit)
6082 *stop = start;
6083 else
6084 *stop = limit;
aaaf0b1e 6085 }
ff0dacd7
KH
6086 return buf;
6087}
6088
6089
e1c23804 6090/* Extract an annotation datum from a text property `charset' at POS of
ff0dacd7
KH
6091 CODING->src_object (buffer of string), store the data in BUF, set
6092 *STOP to the position where the value of `charset' property changes
6093 (limiting by LIMIT), and return the address of the next element of
6094 BUF.
6095
6096 If the property value is nil, set *STOP to the position where the
6097 property value is non-nil (limiting by LIMIT), and return BUF. */
6098
6099static INLINE int *
6100handle_charset_annotation (pos, limit, coding, buf, stop)
6101 EMACS_INT pos, limit;
6102 struct coding_system *coding;
6103 int *buf;
6104 EMACS_INT *stop;
6105{
6106 Lisp_Object val, next;
6107 int id;
6108
6109 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
6110 if (! NILP (val) && CHARSETP (val))
6111 id = XINT (CHARSET_SYMBOL_ID (val));
6112 else
6113 id = -1;
6114 ADD_CHARSET_DATA (buf, 0, 0, id);
6115 next = Fnext_single_property_change (make_number (pos), Qcharset,
6116 coding->src_object,
6117 make_number (limit));
6118 *stop = XINT (next);
6119 return buf;
6120}
6121
6122
df7492f9
KH
6123static void
6124consume_chars (coding)
6125 struct coding_system *coding;
6126{
6127 int *buf = coding->charbuf;
ff0dacd7 6128 int *buf_end = coding->charbuf + coding->charbuf_size;
7c78e542 6129 const unsigned char *src = coding->source + coding->consumed;
4776e638 6130 const unsigned char *src_end = coding->source + coding->src_bytes;
ff0dacd7
KH
6131 EMACS_INT pos = coding->src_pos + coding->consumed_char;
6132 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
df7492f9
KH
6133 int multibytep = coding->src_multibyte;
6134 Lisp_Object eol_type;
6135 int c;
ff0dacd7 6136 EMACS_INT stop, stop_composition, stop_charset;
88993dfd 6137
df7492f9
KH
6138 eol_type = CODING_ID_EOL_TYPE (coding->id);
6139 if (VECTORP (eol_type))
6140 eol_type = Qunix;
88993dfd 6141
df7492f9
KH
6142 /* Note: composition handling is not yet implemented. */
6143 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
ec6d2bb8 6144
0b5670c9
KH
6145 if (NILP (coding->src_object))
6146 stop = stop_composition = stop_charset = end_pos;
ff0dacd7 6147 else
0b5670c9
KH
6148 {
6149 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6150 stop = stop_composition = pos;
6151 else
6152 stop = stop_composition = end_pos;
6153 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6154 stop = stop_charset = pos;
6155 else
6156 stop_charset = end_pos;
6157 }
ec6d2bb8 6158
24a73b0a 6159 /* Compensate for CRLF and conversion. */
ff0dacd7 6160 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
df7492f9 6161 while (buf < buf_end)
aaaf0b1e 6162 {
df7492f9 6163 if (pos == stop)
ec6d2bb8 6164 {
df7492f9
KH
6165 if (pos == end_pos)
6166 break;
ff0dacd7
KH
6167 if (pos == stop_composition)
6168 buf = handle_composition_annotation (pos, end_pos, coding,
6169 buf, &stop_composition);
6170 if (pos == stop_charset)
6171 buf = handle_charset_annotation (pos, end_pos, coding,
6172 buf, &stop_charset);
6173 stop = (stop_composition < stop_charset
6174 ? stop_composition : stop_charset);
df7492f9
KH
6175 }
6176
6177 if (! multibytep)
4776e638 6178 {
d3e4cb56 6179 EMACS_INT bytes;
aaaf0b1e 6180
d3e4cb56
KH
6181 if (! CODING_FOR_UNIBYTE (coding)
6182 && (bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
4776e638
KH
6183 c = STRING_CHAR_ADVANCE (src), pos += bytes;
6184 else
6185 c = *src++, pos++;
6186 }
df7492f9 6187 else
4776e638 6188 c = STRING_CHAR_ADVANCE (src), pos++;
df7492f9
KH
6189 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6190 c = '\n';
6191 if (! EQ (eol_type, Qunix))
aaaf0b1e 6192 {
df7492f9 6193 if (c == '\n')
aaaf0b1e 6194 {
df7492f9
KH
6195 if (EQ (eol_type, Qdos))
6196 *buf++ = '\r';
6197 else
6198 c = '\r';
aaaf0b1e
KH
6199 }
6200 }
df7492f9 6201 *buf++ = c;
aaaf0b1e 6202 }
ec6d2bb8 6203
df7492f9
KH
6204 coding->consumed = src - coding->source;
6205 coding->consumed_char = pos - coding->src_pos;
6206 coding->charbuf_used = buf - coding->charbuf;
6207 coding->chars_at_source = 0;
aaaf0b1e
KH
6208}
6209
4ed46869 6210
df7492f9
KH
6211/* Encode the text at CODING->src_object into CODING->dst_object.
6212 CODING->src_object is a buffer or a string.
6213 CODING->dst_object is a buffer or nil.
6214
6215 If CODING->src_object is a buffer, it must be the current buffer.
6216 In this case, if CODING->src_pos is positive, it is a position of
6217 the source text in the buffer, otherwise. the source text is in the
6218 gap area of the buffer, and coding->src_pos specifies the offset of
6219 the text from GPT (which must be the same as PT). If this is the
6220 same buffer as CODING->dst_object, CODING->src_pos must be
6221 negative and CODING should not have `pre-write-conversion'.
6222
6223 If CODING->src_object is a string, CODING should not have
6224 `pre-write-conversion'.
6225
6226 If CODING->dst_object is a buffer, the encoded data is inserted at
6227 the current point of that buffer.
6228
6229 If CODING->dst_object is nil, the encoded data is placed at the
6230 memory area specified by CODING->destination. */
6231
6232static int
6233encode_coding (coding)
4ed46869 6234 struct coding_system *coding;
4ed46869 6235{
df7492f9 6236 Lisp_Object attrs;
7d64c6ad 6237 Lisp_Object translation_table;
9861e777 6238
df7492f9 6239 attrs = CODING_ID_ATTRS (coding->id);
7d64c6ad 6240 translation_table = get_translation_table (attrs, 1);
4ed46869 6241
df7492f9 6242 if (BUFFERP (coding->dst_object))
8844fa83 6243 {
df7492f9
KH
6244 set_buffer_internal (XBUFFER (coding->dst_object));
6245 coding->dst_multibyte
6246 = ! NILP (current_buffer->enable_multibyte_characters);
8844fa83 6247 }
4ed46869 6248
b73bfc1c 6249 coding->consumed = coding->consumed_char = 0;
df7492f9 6250 coding->produced = coding->produced_char = 0;
065e3595 6251 record_conversion_result (coding, CODING_RESULT_SUCCESS);
b73bfc1c 6252 coding->errors = 0;
b73bfc1c 6253
df7492f9 6254 ALLOC_CONVERSION_WORK_AREA (coding);
4ed46869 6255
df7492f9
KH
6256 do {
6257 coding_set_source (coding);
6258 consume_chars (coding);
4ed46869 6259
7d64c6ad
KH
6260 if (!NILP (translation_table))
6261 translate_chars (coding, translation_table);
b73bfc1c 6262
df7492f9
KH
6263 coding_set_destination (coding);
6264 (*(coding->encoder)) (coding);
6265 } while (coding->consumed_char < coding->src_chars);
6266
6267 if (BUFFERP (coding->dst_object))
6268 insert_from_gap (coding->produced_char, coding->produced);
6269
6270 return (coding->result);
ec6d2bb8
KH
6271}
6272
fb88bf2d 6273
24a73b0a
KH
6274/* Name (or base name) of work buffer for code conversion. */
6275static Lisp_Object Vcode_conversion_workbuf_name;
d46c5b12 6276
24a73b0a
KH
6277/* A working buffer used by the top level conversion. Once it is
6278 created, it is never destroyed. It has the name
6279 Vcode_conversion_workbuf_name. The other working buffers are
6280 destroyed after the use is finished, and their names are modified
6281 versions of Vcode_conversion_workbuf_name. */
6282static Lisp_Object Vcode_conversion_reused_workbuf;
b73bfc1c 6283
24a73b0a
KH
6284/* 1 iff Vcode_conversion_reused_workbuf is already in use. */
6285static int reused_workbuf_in_use;
4ed46869 6286
24a73b0a
KH
6287
6288/* Return a working buffer of code convesion. MULTIBYTE specifies the
6289 multibyteness of returning buffer. */
b73bfc1c 6290
df7492f9 6291Lisp_Object
24a73b0a 6292make_conversion_work_buffer (multibyte)
df7492f9 6293{
24a73b0a
KH
6294 Lisp_Object name, workbuf;
6295 struct buffer *current;
4ed46869 6296
24a73b0a 6297 if (reused_workbuf_in_use++)
065e3595
KH
6298 {
6299 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6300 workbuf = Fget_buffer_create (name);
6301 }
df7492f9 6302 else
065e3595
KH
6303 {
6304 name = Vcode_conversion_workbuf_name;
6305 workbuf = Fget_buffer_create (name);
6306 if (NILP (Vcode_conversion_reused_workbuf))
6307 Vcode_conversion_reused_workbuf = workbuf;
6308 }
24a73b0a
KH
6309 current = current_buffer;
6310 set_buffer_internal (XBUFFER (workbuf));
6311 Ferase_buffer ();
df7492f9 6312 current_buffer->undo_list = Qt;
24a73b0a 6313 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
df7492f9 6314 set_buffer_internal (current);
24a73b0a 6315 return workbuf;
df7492f9 6316}
d46c5b12 6317
24a73b0a 6318
4776e638 6319static Lisp_Object
24a73b0a
KH
6320code_conversion_restore (arg)
6321 Lisp_Object arg;
4776e638 6322{
24a73b0a
KH
6323 Lisp_Object current, workbuf;
6324
6325 current = XCAR (arg);
6326 workbuf = XCDR (arg);
6327 if (! NILP (workbuf))
6328 {
6329 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
6330 reused_workbuf_in_use = 0;
6331 else if (! NILP (Fbuffer_live_p (workbuf)))
6332 Fkill_buffer (workbuf);
6333 }
6334 set_buffer_internal (XBUFFER (current));
4776e638
KH
6335 return Qnil;
6336}
b73bfc1c 6337
24a73b0a
KH
6338Lisp_Object
6339code_conversion_save (with_work_buf, multibyte)
4776e638 6340 int with_work_buf, multibyte;
df7492f9 6341{
24a73b0a 6342 Lisp_Object workbuf = Qnil;
b73bfc1c 6343
4776e638 6344 if (with_work_buf)
24a73b0a
KH
6345 workbuf = make_conversion_work_buffer (multibyte);
6346 record_unwind_protect (code_conversion_restore,
6347 Fcons (Fcurrent_buffer (), workbuf));
4776e638 6348 return workbuf;
df7492f9 6349}
d46c5b12 6350
df7492f9
KH
6351int
6352decode_coding_gap (coding, chars, bytes)
6353 struct coding_system *coding;
6354 EMACS_INT chars, bytes;
6355{
6356 int count = specpdl_ptr - specpdl;
5e5c78be 6357 Lisp_Object attrs;
fb88bf2d 6358
24a73b0a 6359 code_conversion_save (0, 0);
ec6d2bb8 6360
24a73b0a 6361 coding->src_object = Fcurrent_buffer ();
df7492f9
KH
6362 coding->src_chars = chars;
6363 coding->src_bytes = bytes;
6364 coding->src_pos = -chars;
6365 coding->src_pos_byte = -bytes;
6366 coding->src_multibyte = chars < bytes;
24a73b0a 6367 coding->dst_object = coding->src_object;
df7492f9
KH
6368 coding->dst_pos = PT;
6369 coding->dst_pos_byte = PT_BYTE;
71c81426 6370 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
98725083 6371 coding->mode |= CODING_MODE_LAST_BLOCK;
4ed46869 6372
df7492f9
KH
6373 if (CODING_REQUIRE_DETECTION (coding))
6374 detect_coding (coding);
8f924df7 6375
df7492f9 6376 decode_coding (coding);
d46c5b12 6377
5e5c78be
KH
6378 attrs = CODING_ID_ATTRS (coding->id);
6379 if (! NILP (CODING_ATTR_POST_READ (attrs)))
b73bfc1c 6380 {
5e5c78be
KH
6381 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6382 Lisp_Object val;
6383
6384 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
5e5c78be
KH
6385 val = call1 (CODING_ATTR_POST_READ (attrs),
6386 make_number (coding->produced_char));
5e5c78be
KH
6387 CHECK_NATNUM (val);
6388 coding->produced_char += Z - prev_Z;
6389 coding->produced += Z_BYTE - prev_Z_BYTE;
b73bfc1c 6390 }
4ed46869 6391
df7492f9 6392 unbind_to (count, Qnil);
b73bfc1c
KH
6393 return coding->result;
6394}
52d41803 6395
4ed46869 6396int
df7492f9 6397encode_coding_gap (coding, chars, bytes)
4ed46869 6398 struct coding_system *coding;
df7492f9 6399 EMACS_INT chars, bytes;
4ed46869 6400{
df7492f9 6401 int count = specpdl_ptr - specpdl;
4ed46869 6402
24a73b0a 6403 code_conversion_save (0, 0);
4ed46869 6404
24a73b0a 6405 coding->src_object = Fcurrent_buffer ();
df7492f9
KH
6406 coding->src_chars = chars;
6407 coding->src_bytes = bytes;
6408 coding->src_pos = -chars;
6409 coding->src_pos_byte = -bytes;
6410 coding->src_multibyte = chars < bytes;
6411 coding->dst_object = coding->src_object;
6412 coding->dst_pos = PT;
6413 coding->dst_pos_byte = PT_BYTE;
4ed46869 6414
df7492f9 6415 encode_coding (coding);
b73bfc1c 6416
df7492f9
KH
6417 unbind_to (count, Qnil);
6418 return coding->result;
6419}
4ed46869 6420
d46c5b12 6421
df7492f9
KH
6422/* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6423 SRC_OBJECT into DST_OBJECT by coding context CODING.
b73bfc1c 6424
df7492f9 6425 SRC_OBJECT is a buffer, a string, or Qnil.
b73bfc1c 6426
df7492f9
KH
6427 If it is a buffer, the text is at point of the buffer. FROM and TO
6428 are positions in the buffer.
b73bfc1c 6429
df7492f9
KH
6430 If it is a string, the text is at the beginning of the string.
6431 FROM and TO are indices to the string.
4ed46869 6432
df7492f9
KH
6433 If it is nil, the text is at coding->source. FROM and TO are
6434 indices to coding->source.
bb10be8b 6435
df7492f9 6436 DST_OBJECT is a buffer, Qt, or Qnil.
4ed46869 6437
df7492f9
KH
6438 If it is a buffer, the decoded text is inserted at point of the
6439 buffer. If the buffer is the same as SRC_OBJECT, the source text
6440 is deleted.
4ed46869 6441
df7492f9
KH
6442 If it is Qt, a string is made from the decoded text, and
6443 set in CODING->dst_object.
d46c5b12 6444
df7492f9 6445 If it is Qnil, the decoded text is stored at CODING->destination.
2cb26057 6446 The caller must allocate CODING->dst_bytes bytes at
df7492f9
KH
6447 CODING->destination by xmalloc. If the decoded text is longer than
6448 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6449 */
d46c5b12 6450
df7492f9
KH
6451void
6452decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6453 dst_object)
d46c5b12 6454 struct coding_system *coding;
df7492f9
KH
6455 Lisp_Object src_object;
6456 EMACS_INT from, from_byte, to, to_byte;
6457 Lisp_Object dst_object;
d46c5b12 6458{
df7492f9
KH
6459 int count = specpdl_ptr - specpdl;
6460 unsigned char *destination;
6461 EMACS_INT dst_bytes;
6462 EMACS_INT chars = to - from;
6463 EMACS_INT bytes = to_byte - from_byte;
6464 Lisp_Object attrs;
4776e638
KH
6465 Lisp_Object buffer;
6466 int saved_pt = -1, saved_pt_byte;
d46c5b12 6467
4776e638 6468 buffer = Fcurrent_buffer ();
93dec019 6469
df7492f9 6470 if (NILP (dst_object))
d46c5b12 6471 {
df7492f9
KH
6472 destination = coding->destination;
6473 dst_bytes = coding->dst_bytes;
d46c5b12 6474 }
93dec019 6475
df7492f9
KH
6476 coding->src_object = src_object;
6477 coding->src_chars = chars;
6478 coding->src_bytes = bytes;
6479 coding->src_multibyte = chars < bytes;
70ad9fc4 6480
df7492f9 6481 if (STRINGP (src_object))
d46c5b12 6482 {
df7492f9
KH
6483 coding->src_pos = from;
6484 coding->src_pos_byte = from_byte;
d46c5b12 6485 }
df7492f9 6486 else if (BUFFERP (src_object))
88993dfd 6487 {
df7492f9
KH
6488 set_buffer_internal (XBUFFER (src_object));
6489 if (from != GPT)
6490 move_gap_both (from, from_byte);
6491 if (EQ (src_object, dst_object))
fb88bf2d 6492 {
4776e638 6493 saved_pt = PT, saved_pt_byte = PT_BYTE;
df7492f9
KH
6494 TEMP_SET_PT_BOTH (from, from_byte);
6495 del_range_both (from, from_byte, to, to_byte, 1);
6496 coding->src_pos = -chars;
6497 coding->src_pos_byte = -bytes;
fb88bf2d 6498 }
df7492f9 6499 else
fb88bf2d 6500 {
df7492f9
KH
6501 coding->src_pos = from;
6502 coding->src_pos_byte = from_byte;
fb88bf2d 6503 }
88993dfd
KH
6504 }
6505
df7492f9
KH
6506 if (CODING_REQUIRE_DETECTION (coding))
6507 detect_coding (coding);
6508 attrs = CODING_ID_ATTRS (coding->id);
d46c5b12 6509
2cb26057
KH
6510 if (EQ (dst_object, Qt)
6511 || (! NILP (CODING_ATTR_POST_READ (attrs))
6512 && NILP (dst_object)))
b73bfc1c 6513 {
24a73b0a 6514 coding->dst_object = code_conversion_save (1, 1);
df7492f9
KH
6515 coding->dst_pos = BEG;
6516 coding->dst_pos_byte = BEG_BYTE;
6517 coding->dst_multibyte = 1;
b73bfc1c 6518 }
df7492f9 6519 else if (BUFFERP (dst_object))
d46c5b12 6520 {
24a73b0a 6521 code_conversion_save (0, 0);
df7492f9
KH
6522 coding->dst_object = dst_object;
6523 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6524 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6525 coding->dst_multibyte
6526 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
d46c5b12
KH
6527 }
6528 else
6529 {
24a73b0a 6530 code_conversion_save (0, 0);
df7492f9
KH
6531 coding->dst_object = Qnil;
6532 coding->dst_multibyte = 1;
d46c5b12
KH
6533 }
6534
df7492f9 6535 decode_coding (coding);
fa46990e 6536
df7492f9
KH
6537 if (BUFFERP (coding->dst_object))
6538 set_buffer_internal (XBUFFER (coding->dst_object));
d46c5b12 6539
df7492f9 6540 if (! NILP (CODING_ATTR_POST_READ (attrs)))
d46c5b12 6541 {
df7492f9
KH
6542 struct gcpro gcpro1, gcpro2;
6543 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
2b4f9037 6544 Lisp_Object val;
d46c5b12 6545
c0cc7f7f 6546 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
df7492f9
KH
6547 GCPRO2 (coding->src_object, coding->dst_object);
6548 val = call1 (CODING_ATTR_POST_READ (attrs),
6549 make_number (coding->produced_char));
6550 UNGCPRO;
6551 CHECK_NATNUM (val);
6552 coding->produced_char += Z - prev_Z;
6553 coding->produced += Z_BYTE - prev_Z_BYTE;
d46c5b12 6554 }
de79a6a5 6555
df7492f9 6556 if (EQ (dst_object, Qt))
ec6d2bb8 6557 {
df7492f9
KH
6558 coding->dst_object = Fbuffer_string ();
6559 }
6560 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6561 {
6562 set_buffer_internal (XBUFFER (coding->dst_object));
6563 if (dst_bytes < coding->produced)
6564 {
6565 destination
6566 = (unsigned char *) xrealloc (destination, coding->produced);
6567 if (! destination)
6568 {
065e3595
KH
6569 record_conversion_result (coding,
6570 CODING_RESULT_INSUFFICIENT_DST);
df7492f9
KH
6571 unbind_to (count, Qnil);
6572 return;
6573 }
6574 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6575 move_gap_both (BEGV, BEGV_BYTE);
6576 bcopy (BEGV_ADDR, destination, coding->produced);
6577 coding->destination = destination;
d46c5b12 6578 }
ec6d2bb8 6579 }
b73bfc1c 6580
4776e638
KH
6581 if (saved_pt >= 0)
6582 {
6583 /* This is the case of:
6584 (BUFFERP (src_object) && EQ (src_object, dst_object))
6585 As we have moved PT while replacing the original buffer
6586 contents, we must recover it now. */
6587 set_buffer_internal (XBUFFER (src_object));
6588 if (saved_pt < from)
6589 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6590 else if (saved_pt < from + chars)
6591 TEMP_SET_PT_BOTH (from, from_byte);
6592 else if (! NILP (current_buffer->enable_multibyte_characters))
6593 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6594 saved_pt_byte + (coding->produced - bytes));
6595 else
6596 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6597 saved_pt_byte + (coding->produced - bytes));
d46c5b12 6598 }
4776e638 6599
065e3595 6600 unbind_to (count, coding->dst_object);
d46c5b12
KH
6601}
6602
d46c5b12 6603
df7492f9
KH
6604void
6605encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6606 dst_object)
d46c5b12 6607 struct coding_system *coding;
df7492f9
KH
6608 Lisp_Object src_object;
6609 EMACS_INT from, from_byte, to, to_byte;
6610 Lisp_Object dst_object;
d46c5b12 6611{
b73bfc1c 6612 int count = specpdl_ptr - specpdl;
df7492f9
KH
6613 EMACS_INT chars = to - from;
6614 EMACS_INT bytes = to_byte - from_byte;
6615 Lisp_Object attrs;
4776e638
KH
6616 Lisp_Object buffer;
6617 int saved_pt = -1, saved_pt_byte;
df7492f9 6618
4776e638 6619 buffer = Fcurrent_buffer ();
df7492f9
KH
6620
6621 coding->src_object = src_object;
6622 coding->src_chars = chars;
6623 coding->src_bytes = bytes;
6624 coding->src_multibyte = chars < bytes;
6625
6626 attrs = CODING_ID_ATTRS (coding->id);
6627
6628 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6bac5b12 6629 {
24a73b0a 6630 coding->src_object = code_conversion_save (1, coding->src_multibyte);
df7492f9
KH
6631 set_buffer_internal (XBUFFER (coding->src_object));
6632 if (STRINGP (src_object))
6633 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6634 else if (BUFFERP (src_object))
6635 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6636 else
6637 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
d46c5b12 6638
df7492f9
KH
6639 if (EQ (src_object, dst_object))
6640 {
6641 set_buffer_internal (XBUFFER (src_object));
4776e638 6642 saved_pt = PT, saved_pt_byte = PT_BYTE;
df7492f9
KH
6643 del_range_both (from, from_byte, to, to_byte, 1);
6644 set_buffer_internal (XBUFFER (coding->src_object));
6645 }
6646
ac87bbef
KH
6647 call2 (CODING_ATTR_PRE_WRITE (attrs),
6648 make_number (BEG), make_number (Z));
6649 coding->src_object = Fcurrent_buffer ();
df7492f9
KH
6650 if (BEG != GPT)
6651 move_gap_both (BEG, BEG_BYTE);
6652 coding->src_chars = Z - BEG;
6653 coding->src_bytes = Z_BYTE - BEG_BYTE;
6654 coding->src_pos = BEG;
6655 coding->src_pos_byte = BEG_BYTE;
6656 coding->src_multibyte = Z < Z_BYTE;
6657 }
6658 else if (STRINGP (src_object))
d46c5b12 6659 {
24a73b0a 6660 code_conversion_save (0, 0);
df7492f9
KH
6661 coding->src_pos = from;
6662 coding->src_pos_byte = from_byte;
b73bfc1c 6663 }
df7492f9 6664 else if (BUFFERP (src_object))
b73bfc1c 6665 {
24a73b0a 6666 code_conversion_save (0, 0);
df7492f9 6667 set_buffer_internal (XBUFFER (src_object));
df7492f9 6668 if (EQ (src_object, dst_object))
d46c5b12 6669 {
4776e638 6670 saved_pt = PT, saved_pt_byte = PT_BYTE;
ff0dacd7
KH
6671 coding->src_object = del_range_1 (from, to, 1, 1);
6672 coding->src_pos = 0;
6673 coding->src_pos_byte = 0;
d46c5b12 6674 }
df7492f9 6675 else
d46c5b12 6676 {
ff0dacd7
KH
6677 if (from < GPT && to >= GPT)
6678 move_gap_both (from, from_byte);
df7492f9
KH
6679 coding->src_pos = from;
6680 coding->src_pos_byte = from_byte;
d46c5b12 6681 }
d46c5b12 6682 }
4776e638 6683 else
24a73b0a 6684 code_conversion_save (0, 0);
d46c5b12 6685
df7492f9 6686 if (BUFFERP (dst_object))
88993dfd 6687 {
df7492f9 6688 coding->dst_object = dst_object;
28f67a95
KH
6689 if (EQ (src_object, dst_object))
6690 {
6691 coding->dst_pos = from;
6692 coding->dst_pos_byte = from_byte;
6693 }
6694 else
6695 {
6696 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6697 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6698 }
df7492f9
KH
6699 coding->dst_multibyte
6700 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
88993dfd 6701 }
df7492f9 6702 else if (EQ (dst_object, Qt))
d46c5b12 6703 {
df7492f9 6704 coding->dst_object = Qnil;
df7492f9 6705 coding->dst_bytes = coding->src_chars;
ac87bbef
KH
6706 if (coding->dst_bytes == 0)
6707 coding->dst_bytes = 1;
6708 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
df7492f9 6709 coding->dst_multibyte = 0;
d46c5b12
KH
6710 }
6711 else
6712 {
df7492f9
KH
6713 coding->dst_object = Qnil;
6714 coding->dst_multibyte = 0;
d46c5b12
KH
6715 }
6716
df7492f9 6717 encode_coding (coding);
d46c5b12 6718
df7492f9 6719 if (EQ (dst_object, Qt))
d46c5b12 6720 {
df7492f9
KH
6721 if (BUFFERP (coding->dst_object))
6722 coding->dst_object = Fbuffer_string ();
6723 else
d46c5b12 6724 {
df7492f9
KH
6725 coding->dst_object
6726 = make_unibyte_string ((char *) coding->destination,
6727 coding->produced);
6728 xfree (coding->destination);
d46c5b12 6729 }
4ed46869 6730 }
d46c5b12 6731
4776e638
KH
6732 if (saved_pt >= 0)
6733 {
6734 /* This is the case of:
6735 (BUFFERP (src_object) && EQ (src_object, dst_object))
6736 As we have moved PT while replacing the original buffer
6737 contents, we must recover it now. */
6738 set_buffer_internal (XBUFFER (src_object));
6739 if (saved_pt < from)
6740 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6741 else if (saved_pt < from + chars)
6742 TEMP_SET_PT_BOTH (from, from_byte);
6743 else if (! NILP (current_buffer->enable_multibyte_characters))
6744 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6745 saved_pt_byte + (coding->produced - bytes));
d46c5b12 6746 else
4776e638
KH
6747 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6748 saved_pt_byte + (coding->produced - bytes));
6749 }
6750
df7492f9 6751 unbind_to (count, Qnil);
b73bfc1c
KH
6752}
6753
df7492f9 6754
b73bfc1c 6755Lisp_Object
df7492f9 6756preferred_coding_system ()
b73bfc1c 6757{
df7492f9 6758 int id = coding_categories[coding_priorities[0]].id;
2391eaa4 6759
df7492f9 6760 return CODING_ID_NAME (id);
4ed46869
KH
6761}
6762
6763\f
6764#ifdef emacs
1397dc18 6765/*** 11. Emacs Lisp library functions ***/
4ed46869 6766
4ed46869 6767DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
48b0f3ae 6768 doc: /* Return t if OBJECT is nil or a coding-system.
df7492f9 6769See the documentation of `define-coding-system' for information
48b0f3ae
PJ
6770about coding-system objects. */)
6771 (obj)
4ed46869
KH
6772 Lisp_Object obj;
6773{
df7492f9 6774 return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
4ed46869
KH
6775}
6776
9d991de8
RS
6777DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6778 Sread_non_nil_coding_system, 1, 1, 0,
48b0f3ae
PJ
6779 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6780 (prompt)
4ed46869
KH
6781 Lisp_Object prompt;
6782{
e0e989f6 6783 Lisp_Object val;
9d991de8
RS
6784 do
6785 {
4608c386
KH
6786 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6787 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
9d991de8 6788 }
8f924df7 6789 while (SCHARS (val) == 0);
e0e989f6 6790 return (Fintern (val, Qnil));
4ed46869
KH
6791}
6792
9b787f3e 6793DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
48b0f3ae
PJ
6794 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6795If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6796 (prompt, default_coding_system)
9b787f3e 6797 Lisp_Object prompt, default_coding_system;
4ed46869 6798{
f44d27ce 6799 Lisp_Object val;
9b787f3e 6800 if (SYMBOLP (default_coding_system))
a3181084 6801 XSETSTRING (default_coding_system, XPNTR (SYMBOL_NAME (default_coding_system)));
4608c386 6802 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
9b787f3e
RS
6803 Qt, Qnil, Qcoding_system_history,
6804 default_coding_system, Qnil);
8f924df7 6805 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
6806}
6807
6808DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6809 1, 1, 0,
48b0f3ae 6810 doc: /* Check validity of CODING-SYSTEM.
b054002f 6811If valid, return CODING-SYSTEM, else signal a `coding-system-error' error. */)
df7492f9 6812 (coding_system)
4ed46869
KH
6813 Lisp_Object coding_system;
6814{
b7826503 6815 CHECK_SYMBOL (coding_system);
4ed46869
KH
6816 if (!NILP (Fcoding_system_p (coding_system)))
6817 return coding_system;
6818 while (1)
02ba4723 6819 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4ed46869 6820}
df7492f9 6821
3a73fa5d 6822\f
89528eb3
KH
6823/* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
6824 HIGHEST is nonzero, return the coding system of the highest
6825 priority among the detected coding systems. Otherwize return a
6826 list of detected coding systems sorted by their priorities. If
6827 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
6828 multibyte form but contains only ASCII and eight-bit chars.
6829 Otherwise, the bytes are raw bytes.
6830
6831 CODING-SYSTEM controls the detection as below:
6832
6833 If it is nil, detect both text-format and eol-format. If the
6834 text-format part of CODING-SYSTEM is already specified
6835 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
6836 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
6837 detect only text-format. */
6838
d46c5b12 6839Lisp_Object
24a73b0a
KH
6840detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
6841 coding_system)
8f924df7 6842 const unsigned char *src;
24a73b0a 6843 int src_chars, src_bytes, highest;
0a28aafb 6844 int multibytep;
df7492f9 6845 Lisp_Object coding_system;
4ed46869 6846{
8f924df7 6847 const unsigned char *src_end = src + src_bytes;
df7492f9
KH
6848 Lisp_Object attrs, eol_type;
6849 Lisp_Object val;
6850 struct coding_system coding;
89528eb3 6851 int id;
ff0dacd7 6852 struct coding_detection_info detect_info;
24a73b0a 6853 enum coding_category base_category;
b73bfc1c 6854
df7492f9
KH
6855 if (NILP (coding_system))
6856 coding_system = Qundecided;
6857 setup_coding_system (coding_system, &coding);
6858 attrs = CODING_ID_ATTRS (coding.id);
6859 eol_type = CODING_ID_EOL_TYPE (coding.id);
89528eb3 6860 coding_system = CODING_ATTR_BASE_NAME (attrs);
4ed46869 6861
df7492f9 6862 coding.source = src;
24a73b0a 6863 coding.src_chars = src_chars;
df7492f9
KH
6864 coding.src_bytes = src_bytes;
6865 coding.src_multibyte = multibytep;
6866 coding.consumed = 0;
89528eb3 6867 coding.mode |= CODING_MODE_LAST_BLOCK;
d46c5b12 6868
ff0dacd7 6869 detect_info.checked = detect_info.found = detect_info.rejected = 0;
d46c5b12 6870
89528eb3 6871 /* At first, detect text-format if necessary. */
24a73b0a
KH
6872 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
6873 if (base_category == coding_category_undecided)
4ed46869 6874 {
ff0dacd7
KH
6875 enum coding_category category;
6876 struct coding_system *this;
6877 int c, i;
88993dfd 6878
24a73b0a
KH
6879 /* Skip all ASCII bytes except for a few ISO2022 controls. */
6880 for (i = 0; src < src_end; i++, src++)
4ed46869 6881 {
df7492f9 6882 c = *src;
75e2a253 6883 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
24a73b0a
KH
6884 || c == ISO_CODE_SI
6885 || c == ISO_CODE_SO)))
d46c5b12 6886 break;
4ed46869 6887 }
df7492f9 6888 coding.head_ascii = src - coding.source;
88993dfd 6889
df7492f9
KH
6890 if (src < src_end)
6891 for (i = 0; i < coding_category_raw_text; i++)
6892 {
ff0dacd7
KH
6893 category = coding_priorities[i];
6894 this = coding_categories + category;
b843d1ae 6895
df7492f9
KH
6896 if (this->id < 0)
6897 {
6898 /* No coding system of this category is defined. */
ff0dacd7 6899 detect_info.rejected |= (1 << category);
df7492f9 6900 }
ff0dacd7 6901 else if (category >= coding_category_raw_text)
89528eb3 6902 continue;
ff0dacd7
KH
6903 else if (detect_info.checked & (1 << category))
6904 {
6905 if (highest
6906 && (detect_info.found & (1 << category)))
6907 break;
6908 }
df7492f9
KH
6909 else
6910 {
ff0dacd7 6911 if ((*(this->detector)) (&coding, &detect_info)
89528eb3 6912 && highest
ff0dacd7 6913 && (detect_info.found & (1 << category)))
24a73b0a
KH
6914 {
6915 if (category == coding_category_utf_16_auto)
6916 {
6917 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6918 category = coding_category_utf_16_le;
6919 else
6920 category = coding_category_utf_16_be;
6921 }
6922 break;
6923 }
df7492f9
KH
6924 }
6925 }
ec6d2bb8 6926
ff0dacd7 6927 if (detect_info.rejected == CATEGORY_MASK_ANY)
ec6d2bb8 6928 {
ff0dacd7 6929 detect_info.found = CATEGORY_MASK_RAW_TEXT;
89528eb3
KH
6930 id = coding_categories[coding_category_raw_text].id;
6931 val = Fcons (make_number (id), Qnil);
6932 }
ff0dacd7 6933 else if (! detect_info.rejected && ! detect_info.found)
89528eb3 6934 {
ff0dacd7 6935 detect_info.found = CATEGORY_MASK_ANY;
89528eb3
KH
6936 id = coding_categories[coding_category_undecided].id;
6937 val = Fcons (make_number (id), Qnil);
6938 }
6939 else if (highest)
6940 {
ff0dacd7 6941 if (detect_info.found)
ec6d2bb8 6942 {
ff0dacd7
KH
6943 detect_info.found = 1 << category;
6944 val = Fcons (make_number (this->id), Qnil);
6945 }
6946 else
6947 for (i = 0; i < coding_category_raw_text; i++)
6948 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6949 {
6950 detect_info.found = 1 << coding_priorities[i];
6951 id = coding_categories[coding_priorities[i]].id;
6952 val = Fcons (make_number (id), Qnil);
6953 break;
6954 }
6955 }
89528eb3
KH
6956 else
6957 {
ff0dacd7
KH
6958 int mask = detect_info.rejected | detect_info.found;
6959 int found = 0;
89528eb3 6960 val = Qnil;
ec6d2bb8 6961
89528eb3 6962 for (i = coding_category_raw_text - 1; i >= 0; i--)
ff0dacd7
KH
6963 {
6964 category = coding_priorities[i];
6965 if (! (mask & (1 << category)))
ec6d2bb8 6966 {
ff0dacd7
KH
6967 found |= 1 << category;
6968 id = coding_categories[category].id;
6969 val = Fcons (make_number (id), val);
6970 }
6971 }
6972 for (i = coding_category_raw_text - 1; i >= 0; i--)
6973 {
6974 category = coding_priorities[i];
6975 if (detect_info.found & (1 << category))
6976 {
6977 id = coding_categories[category].id;
6978 val = Fcons (make_number (id), val);
ec6d2bb8 6979 }
ec6d2bb8 6980 }
ff0dacd7 6981 detect_info.found |= found;
ec6d2bb8 6982 }
ec6d2bb8 6983 }
24a73b0a
KH
6984 else if (base_category == coding_category_utf_16_auto)
6985 {
6986 if (detect_coding_utf_16 (&coding, &detect_info))
6987 {
6988 enum coding_category category;
6989 struct coding_system *this;
6990
6991 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6992 this = coding_categories + coding_category_utf_16_le;
6993 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6994 this = coding_categories + coding_category_utf_16_be;
6995 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
6996 this = coding_categories + coding_category_utf_16_be_nosig;
6997 else
6998 this = coding_categories + coding_category_utf_16_le_nosig;
6999 val = Fcons (make_number (this->id), Qnil);
7000 }
7001 }
df7492f9
KH
7002 else
7003 {
ff0dacd7 7004 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
89528eb3 7005 val = Fcons (make_number (coding.id), Qnil);
4ed46869 7006 }
df7492f9 7007
89528eb3 7008 /* Then, detect eol-format if necessary. */
df7492f9 7009 {
89528eb3 7010 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
df7492f9
KH
7011 Lisp_Object tail;
7012
89528eb3
KH
7013 if (VECTORP (eol_type))
7014 {
ff0dacd7 7015 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
89528eb3
KH
7016 normal_eol = detect_eol (coding.source, src_bytes,
7017 coding_category_raw_text);
ff0dacd7
KH
7018 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
7019 | CATEGORY_MASK_UTF_16_BE_NOSIG))
89528eb3
KH
7020 utf_16_be_eol = detect_eol (coding.source, src_bytes,
7021 coding_category_utf_16_be);
ff0dacd7
KH
7022 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
7023 | CATEGORY_MASK_UTF_16_LE_NOSIG))
89528eb3
KH
7024 utf_16_le_eol = detect_eol (coding.source, src_bytes,
7025 coding_category_utf_16_le);
7026 }
7027 else
7028 {
7029 if (EQ (eol_type, Qunix))
7030 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
7031 else if (EQ (eol_type, Qdos))
7032 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
7033 else
7034 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
7035 }
7036
df7492f9
KH
7037 for (tail = val; CONSP (tail); tail = XCDR (tail))
7038 {
89528eb3 7039 enum coding_category category;
df7492f9 7040 int this_eol;
89528eb3
KH
7041
7042 id = XINT (XCAR (tail));
7043 attrs = CODING_ID_ATTRS (id);
7044 category = XINT (CODING_ATTR_CATEGORY (attrs));
7045 eol_type = CODING_ID_EOL_TYPE (id);
df7492f9
KH
7046 if (VECTORP (eol_type))
7047 {
89528eb3
KH
7048 if (category == coding_category_utf_16_be
7049 || category == coding_category_utf_16_be_nosig)
7050 this_eol = utf_16_be_eol;
7051 else if (category == coding_category_utf_16_le
7052 || category == coding_category_utf_16_le_nosig)
7053 this_eol = utf_16_le_eol;
df7492f9 7054 else
89528eb3
KH
7055 this_eol = normal_eol;
7056
df7492f9
KH
7057 if (this_eol == EOL_SEEN_LF)
7058 XSETCAR (tail, AREF (eol_type, 0));
7059 else if (this_eol == EOL_SEEN_CRLF)
7060 XSETCAR (tail, AREF (eol_type, 1));
7061 else if (this_eol == EOL_SEEN_CR)
7062 XSETCAR (tail, AREF (eol_type, 2));
89528eb3
KH
7063 else
7064 XSETCAR (tail, CODING_ID_NAME (id));
df7492f9 7065 }
89528eb3
KH
7066 else
7067 XSETCAR (tail, CODING_ID_NAME (id));
df7492f9
KH
7068 }
7069 }
ec6d2bb8 7070
03699b14 7071 return (highest ? XCAR (val) : val);
ec6d2bb8
KH
7072}
7073
ec6d2bb8 7074
d46c5b12
KH
7075DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
7076 2, 3, 0,
48b0f3ae
PJ
7077 doc: /* Detect coding system of the text in the region between START and END.
7078Return a list of possible coding systems ordered by priority.
ec6d2bb8 7079
48b0f3ae
PJ
7080If only ASCII characters are found, it returns a list of single element
7081`undecided' or its subsidiary coding system according to a detected
7082end-of-line format.
ec6d2bb8 7083
48b0f3ae
PJ
7084If optional argument HIGHEST is non-nil, return the coding system of
7085highest priority. */)
7086 (start, end, highest)
d46c5b12
KH
7087 Lisp_Object start, end, highest;
7088{
7089 int from, to;
7090 int from_byte, to_byte;
ec6d2bb8 7091
b7826503
PJ
7092 CHECK_NUMBER_COERCE_MARKER (start);
7093 CHECK_NUMBER_COERCE_MARKER (end);
ec6d2bb8 7094
d46c5b12
KH
7095 validate_region (&start, &end);
7096 from = XINT (start), to = XINT (end);
7097 from_byte = CHAR_TO_BYTE (from);
7098 to_byte = CHAR_TO_BYTE (to);
ec6d2bb8 7099
d46c5b12
KH
7100 if (from < GPT && to >= GPT)
7101 move_gap_both (to, to_byte);
c210f766 7102
d46c5b12 7103 return detect_coding_system (BYTE_POS_ADDR (from_byte),
24a73b0a 7104 to - from, to_byte - from_byte,
0a28aafb
KH
7105 !NILP (highest),
7106 !NILP (current_buffer
df7492f9
KH
7107 ->enable_multibyte_characters),
7108 Qnil);
ec6d2bb8
KH
7109}
7110
d46c5b12
KH
7111DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
7112 1, 2, 0,
48b0f3ae
PJ
7113 doc: /* Detect coding system of the text in STRING.
7114Return a list of possible coding systems ordered by priority.
fb88bf2d 7115
48b0f3ae
PJ
7116If only ASCII characters are found, it returns a list of single element
7117`undecided' or its subsidiary coding system according to a detected
7118end-of-line format.
d46c5b12 7119
48b0f3ae
PJ
7120If optional argument HIGHEST is non-nil, return the coding system of
7121highest priority. */)
7122 (string, highest)
d46c5b12
KH
7123 Lisp_Object string, highest;
7124{
b7826503 7125 CHECK_STRING (string);
b73bfc1c 7126
24a73b0a
KH
7127 return detect_coding_system (SDATA (string),
7128 SCHARS (string), SBYTES (string),
8f924df7 7129 !NILP (highest), STRING_MULTIBYTE (string),
df7492f9 7130 Qnil);
4ed46869 7131}
4ed46869 7132
b73bfc1c 7133
df7492f9
KH
7134static INLINE int
7135char_encodable_p (c, attrs)
7136 int c;
7137 Lisp_Object attrs;
05e6f5dc 7138{
df7492f9 7139 Lisp_Object tail;
df7492f9 7140 struct charset *charset;
7d64c6ad 7141 Lisp_Object translation_table;
d46c5b12 7142
7d64c6ad 7143 translation_table = CODING_ATTR_TRANS_TBL (attrs);
a6f87d34 7144 if (! NILP (translation_table))
7d64c6ad 7145 c = translate_char (translation_table, c);
df7492f9
KH
7146 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
7147 CONSP (tail); tail = XCDR (tail))
e133c8fa 7148 {
df7492f9
KH
7149 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7150 if (CHAR_CHARSET_P (c, charset))
7151 break;
e133c8fa 7152 }
df7492f9 7153 return (! NILP (tail));
05e6f5dc 7154}
83fa074f 7155
fb88bf2d 7156
df7492f9
KH
7157/* Return a list of coding systems that safely encode the text between
7158 START and END. If EXCLUDE is non-nil, it is a list of coding
7159 systems not to check. The returned list doesn't contain any such
48468dac 7160 coding systems. In any case, if the text contains only ASCII or is
df7492f9 7161 unibyte, return t. */
e077cc80 7162
df7492f9
KH
7163DEFUN ("find-coding-systems-region-internal",
7164 Ffind_coding_systems_region_internal,
7165 Sfind_coding_systems_region_internal, 2, 3, 0,
7166 doc: /* Internal use only. */)
7167 (start, end, exclude)
7168 Lisp_Object start, end, exclude;
7169{
7170 Lisp_Object coding_attrs_list, safe_codings;
7171 EMACS_INT start_byte, end_byte;
7c78e542 7172 const unsigned char *p, *pbeg, *pend;
df7492f9
KH
7173 int c;
7174 Lisp_Object tail, elt;
d46c5b12 7175
df7492f9
KH
7176 if (STRINGP (start))
7177 {
7178 if (!STRING_MULTIBYTE (start)
8f924df7 7179 || SCHARS (start) == SBYTES (start))
df7492f9
KH
7180 return Qt;
7181 start_byte = 0;
8f924df7 7182 end_byte = SBYTES (start);
df7492f9
KH
7183 }
7184 else
d46c5b12 7185 {
df7492f9
KH
7186 CHECK_NUMBER_COERCE_MARKER (start);
7187 CHECK_NUMBER_COERCE_MARKER (end);
7188 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7189 args_out_of_range (start, end);
7190 if (NILP (current_buffer->enable_multibyte_characters))
7191 return Qt;
7192 start_byte = CHAR_TO_BYTE (XINT (start));
7193 end_byte = CHAR_TO_BYTE (XINT (end));
7194 if (XINT (end) - XINT (start) == end_byte - start_byte)
7195 return Qt;
d46c5b12 7196
e1c23804 7197 if (XINT (start) < GPT && XINT (end) > GPT)
d46c5b12 7198 {
e1c23804
DL
7199 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7200 move_gap_both (XINT (start), start_byte);
df7492f9 7201 else
e1c23804 7202 move_gap_both (XINT (end), end_byte);
d46c5b12
KH
7203 }
7204 }
7205
df7492f9
KH
7206 coding_attrs_list = Qnil;
7207 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7208 if (NILP (exclude)
7209 || NILP (Fmemq (XCAR (tail), exclude)))
7210 {
7211 Lisp_Object attrs;
d46c5b12 7212
df7492f9
KH
7213 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7214 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7215 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7d64c6ad
KH
7216 {
7217 ASET (attrs, coding_attr_trans_tbl,
7218 get_translation_table (attrs, 1));
7219 coding_attrs_list = Fcons (attrs, coding_attrs_list);
7220 }
df7492f9 7221 }
d46c5b12 7222
df7492f9 7223 if (STRINGP (start))
8f924df7 7224 p = pbeg = SDATA (start);
df7492f9
KH
7225 else
7226 p = pbeg = BYTE_POS_ADDR (start_byte);
7227 pend = p + (end_byte - start_byte);
b843d1ae 7228
df7492f9
KH
7229 while (p < pend && ASCII_BYTE_P (*p)) p++;
7230 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
d46c5b12 7231
05e6f5dc 7232 while (p < pend)
72d1a715 7233 {
df7492f9
KH
7234 if (ASCII_BYTE_P (*p))
7235 p++;
72d1a715
RS
7236 else
7237 {
df7492f9 7238 c = STRING_CHAR_ADVANCE (p);
12410ef1 7239
df7492f9
KH
7240 charset_map_loaded = 0;
7241 for (tail = coding_attrs_list; CONSP (tail);)
7242 {
7243 elt = XCAR (tail);
7244 if (NILP (elt))
7245 tail = XCDR (tail);
7246 else if (char_encodable_p (c, elt))
7247 tail = XCDR (tail);
7248 else if (CONSP (XCDR (tail)))
7249 {
7250 XSETCAR (tail, XCAR (XCDR (tail)));
7251 XSETCDR (tail, XCDR (XCDR (tail)));
7252 }
7253 else
7254 {
7255 XSETCAR (tail, Qnil);
7256 tail = XCDR (tail);
7257 }
7258 }
7259 if (charset_map_loaded)
7260 {
7261 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
05e6f5dc 7262
df7492f9 7263 if (STRINGP (start))
8f924df7 7264 pbeg = SDATA (start);
df7492f9
KH
7265 else
7266 pbeg = BYTE_POS_ADDR (start_byte);
7267 p = pbeg + p_offset;
7268 pend = pbeg + pend_offset;
7269 }
7270 }
ec6d2bb8 7271 }
fb88bf2d 7272
df7492f9
KH
7273 safe_codings = Qnil;
7274 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7275 if (! NILP (XCAR (tail)))
7276 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
ec6d2bb8 7277
05e6f5dc
KH
7278 return safe_codings;
7279}
4956c225 7280
d46c5b12 7281
8f924df7
KH
7282DEFUN ("unencodable-char-position", Funencodable_char_position,
7283 Sunencodable_char_position, 3, 5, 0,
7284 doc: /*
7285Return position of first un-encodable character in a region.
7286START and END specfiy the region and CODING-SYSTEM specifies the
7287encoding to check. Return nil if CODING-SYSTEM does encode the region.
d46c5b12 7288
8f924df7
KH
7289If optional 4th argument COUNT is non-nil, it specifies at most how
7290many un-encodable characters to search. In this case, the value is a
7291list of positions.
d46c5b12 7292
8f924df7
KH
7293If optional 5th argument STRING is non-nil, it is a string to search
7294for un-encodable characters. In that case, START and END are indexes
7295to the string. */)
7296 (start, end, coding_system, count, string)
7297 Lisp_Object start, end, coding_system, count, string;
7298{
7299 int n;
7300 struct coding_system coding;
7d64c6ad 7301 Lisp_Object attrs, charset_list, translation_table;
8f924df7
KH
7302 Lisp_Object positions;
7303 int from, to;
7304 const unsigned char *p, *stop, *pend;
7305 int ascii_compatible;
fb88bf2d 7306
8f924df7
KH
7307 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7308 attrs = CODING_ID_ATTRS (coding.id);
7309 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
7310 return Qnil;
7311 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
7312 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7d64c6ad 7313 translation_table = get_translation_table (attrs, 1);
fb88bf2d 7314
8f924df7
KH
7315 if (NILP (string))
7316 {
7317 validate_region (&start, &end);
7318 from = XINT (start);
7319 to = XINT (end);
7320 if (NILP (current_buffer->enable_multibyte_characters)
7321 || (ascii_compatible
7322 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
7323 return Qnil;
7324 p = CHAR_POS_ADDR (from);
7325 pend = CHAR_POS_ADDR (to);
7326 if (from < GPT && to >= GPT)
7327 stop = GPT_ADDR;
7328 else
7329 stop = pend;
7330 }
7331 else
7332 {
7333 CHECK_STRING (string);
7334 CHECK_NATNUM (start);
7335 CHECK_NATNUM (end);
7336 from = XINT (start);
7337 to = XINT (end);
7338 if (from > to
7339 || to > SCHARS (string))
7340 args_out_of_range_3 (string, start, end);
7341 if (! STRING_MULTIBYTE (string))
7342 return Qnil;
7343 p = SDATA (string) + string_char_to_byte (string, from);
7344 stop = pend = SDATA (string) + string_char_to_byte (string, to);
7345 if (ascii_compatible && (to - from) == (pend - p))
7346 return Qnil;
7347 }
f2558efd 7348
8f924df7
KH
7349 if (NILP (count))
7350 n = 1;
7351 else
b73bfc1c 7352 {
8f924df7
KH
7353 CHECK_NATNUM (count);
7354 n = XINT (count);
b73bfc1c
KH
7355 }
7356
8f924df7
KH
7357 positions = Qnil;
7358 while (1)
d46c5b12 7359 {
8f924df7 7360 int c;
ec6d2bb8 7361
8f924df7
KH
7362 if (ascii_compatible)
7363 while (p < stop && ASCII_BYTE_P (*p))
7364 p++, from++;
7365 if (p >= stop)
0e79d667 7366 {
8f924df7
KH
7367 if (p >= pend)
7368 break;
7369 stop = pend;
7370 p = GAP_END_ADDR;
0e79d667 7371 }
ec6d2bb8 7372
8f924df7
KH
7373 c = STRING_CHAR_ADVANCE (p);
7374 if (! (ASCII_CHAR_P (c) && ascii_compatible)
7d64c6ad
KH
7375 && ! char_charset (translate_char (translation_table, c),
7376 charset_list, NULL))
ec6d2bb8 7377 {
8f924df7
KH
7378 positions = Fcons (make_number (from), positions);
7379 n--;
7380 if (n == 0)
7381 break;
ec6d2bb8
KH
7382 }
7383
8f924df7
KH
7384 from++;
7385 }
d46c5b12 7386
8f924df7
KH
7387 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
7388}
d46c5b12 7389
d46c5b12 7390
df7492f9
KH
7391DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7392 Scheck_coding_systems_region, 3, 3, 0,
7393 doc: /* Check if the region is encodable by coding systems.
d46c5b12 7394
df7492f9
KH
7395START and END are buffer positions specifying the region.
7396CODING-SYSTEM-LIST is a list of coding systems to check.
d46c5b12 7397
df7492f9
KH
7398The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7399CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7400whole region, POS0, POS1, ... are buffer positions where non-encodable
7401characters are found.
93dec019 7402
df7492f9
KH
7403If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7404value is nil.
93dec019 7405
df7492f9
KH
7406START may be a string. In that case, check if the string is
7407encodable, and the value contains indices to the string instead of
7408buffer positions. END is ignored. */)
7409 (start, end, coding_system_list)
7410 Lisp_Object start, end, coding_system_list;
05e6f5dc 7411{
df7492f9
KH
7412 Lisp_Object list;
7413 EMACS_INT start_byte, end_byte;
7414 int pos;
7c78e542 7415 const unsigned char *p, *pbeg, *pend;
df7492f9 7416 int c;
7d64c6ad 7417 Lisp_Object tail, elt, attrs;
70ad9fc4 7418
05e6f5dc
KH
7419 if (STRINGP (start))
7420 {
df7492f9 7421 if (!STRING_MULTIBYTE (start)
8f924df7 7422 && SCHARS (start) != SBYTES (start))
df7492f9
KH
7423 return Qnil;
7424 start_byte = 0;
8f924df7 7425 end_byte = SBYTES (start);
df7492f9 7426 pos = 0;
d46c5b12 7427 }
05e6f5dc 7428 else
b73bfc1c 7429 {
b7826503
PJ
7430 CHECK_NUMBER_COERCE_MARKER (start);
7431 CHECK_NUMBER_COERCE_MARKER (end);
05e6f5dc
KH
7432 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7433 args_out_of_range (start, end);
7434 if (NILP (current_buffer->enable_multibyte_characters))
df7492f9
KH
7435 return Qnil;
7436 start_byte = CHAR_TO_BYTE (XINT (start));
7437 end_byte = CHAR_TO_BYTE (XINT (end));
7438 if (XINT (end) - XINT (start) == end_byte - start_byte)
05e6f5dc 7439 return Qt;
df7492f9 7440
e1c23804 7441 if (XINT (start) < GPT && XINT (end) > GPT)
b73bfc1c 7442 {
e1c23804
DL
7443 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7444 move_gap_both (XINT (start), start_byte);
df7492f9 7445 else
e1c23804 7446 move_gap_both (XINT (end), end_byte);
b73bfc1c 7447 }
e1c23804 7448 pos = XINT (start);
b73bfc1c 7449 }
7553d0e1 7450
df7492f9
KH
7451 list = Qnil;
7452 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
12410ef1 7453 {
df7492f9 7454 elt = XCAR (tail);
7d64c6ad
KH
7455 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
7456 ASET (attrs, coding_attr_trans_tbl, get_translation_table (attrs, 1));
7457 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
12410ef1
KH
7458 }
7459
df7492f9 7460 if (STRINGP (start))
8f924df7 7461 p = pbeg = SDATA (start);
72d1a715 7462 else
df7492f9
KH
7463 p = pbeg = BYTE_POS_ADDR (start_byte);
7464 pend = p + (end_byte - start_byte);
4ed46869 7465
df7492f9
KH
7466 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7467 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
ec6d2bb8 7468
df7492f9 7469 while (p < pend)
d46c5b12 7470 {
df7492f9
KH
7471 if (ASCII_BYTE_P (*p))
7472 p++;
e133c8fa 7473 else
05e6f5dc 7474 {
df7492f9
KH
7475 c = STRING_CHAR_ADVANCE (p);
7476
7477 charset_map_loaded = 0;
7478 for (tail = list; CONSP (tail); tail = XCDR (tail))
7479 {
7480 elt = XCDR (XCAR (tail));
7481 if (! char_encodable_p (c, XCAR (elt)))
7482 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7483 }
7484 if (charset_map_loaded)
7485 {
7486 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7487
7488 if (STRINGP (start))
8f924df7 7489 pbeg = SDATA (start);
df7492f9
KH
7490 else
7491 pbeg = BYTE_POS_ADDR (start_byte);
7492 p = pbeg + p_offset;
7493 pend = pbeg + pend_offset;
7494 }
05e6f5dc 7495 }
df7492f9 7496 pos++;
d46c5b12 7497 }
4ed46869 7498
df7492f9
KH
7499 tail = list;
7500 list = Qnil;
7501 for (; CONSP (tail); tail = XCDR (tail))
ec6d2bb8 7502 {
df7492f9
KH
7503 elt = XCAR (tail);
7504 if (CONSP (XCDR (XCDR (elt))))
7505 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7506 list);
ec6d2bb8 7507 }
2b4f9037 7508
df7492f9 7509 return list;
d46c5b12
KH
7510}
7511
3fd9494b 7512
b73bfc1c 7513Lisp_Object
df7492f9
KH
7514code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7515 Lisp_Object start, end, coding_system, dst_object;
7516 int encodep, norecord;
4ed46869 7517{
3a73fa5d 7518 struct coding_system coding;
df7492f9
KH
7519 EMACS_INT from, from_byte, to, to_byte;
7520 Lisp_Object src_object;
4ed46869 7521
b7826503
PJ
7522 CHECK_NUMBER_COERCE_MARKER (start);
7523 CHECK_NUMBER_COERCE_MARKER (end);
df7492f9
KH
7524 if (NILP (coding_system))
7525 coding_system = Qno_conversion;
7526 else
7527 CHECK_CODING_SYSTEM (coding_system);
7528 src_object = Fcurrent_buffer ();
7529 if (NILP (dst_object))
7530 dst_object = src_object;
7531 else if (! EQ (dst_object, Qt))
7532 CHECK_BUFFER (dst_object);
3a73fa5d 7533
d46c5b12
KH
7534 validate_region (&start, &end);
7535 from = XFASTINT (start);
df7492f9 7536 from_byte = CHAR_TO_BYTE (from);
d46c5b12 7537 to = XFASTINT (end);
df7492f9 7538 to_byte = CHAR_TO_BYTE (to);
764ca8da 7539
df7492f9
KH
7540 setup_coding_system (coding_system, &coding);
7541 coding.mode |= CODING_MODE_LAST_BLOCK;
ec6d2bb8 7542
df7492f9
KH
7543 if (encodep)
7544 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7545 dst_object);
7546 else
7547 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7548 dst_object);
7549 if (! norecord)
7550 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
ec6d2bb8 7551
df7492f9
KH
7552 return (BUFFERP (dst_object)
7553 ? make_number (coding.produced_char)
7554 : coding.dst_object);
4031e2bf 7555}
78108bcd 7556
4ed46869 7557
4031e2bf 7558DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
df7492f9 7559 3, 4, "r\nzCoding system: ",
48b0f3ae 7560 doc: /* Decode the current region from the specified coding system.
df7492f9
KH
7561When called from a program, takes four arguments:
7562 START, END, CODING-SYSTEM, and DESTINATION.
7563START and END are buffer positions.
8844fa83 7564
df7492f9
KH
7565Optional 4th arguments DESTINATION specifies where the decoded text goes.
7566If nil, the region between START and END is replace by the decoded text.
7567If buffer, the decoded text is inserted in the buffer.
7568If t, the decoded text is returned.
8844fa83 7569
48b0f3ae
PJ
7570This function sets `last-coding-system-used' to the precise coding system
7571used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7572not fully specified.)
7573It returns the length of the decoded text. */)
df7492f9
KH
7574 (start, end, coding_system, destination)
7575 Lisp_Object start, end, coding_system, destination;
4031e2bf 7576{
df7492f9 7577 return code_convert_region (start, end, coding_system, destination, 0, 0);
3a73fa5d 7578}
8844fa83 7579
3a73fa5d 7580DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
df7492f9
KH
7581 3, 4, "r\nzCoding system: ",
7582 doc: /* Encode the current region by specified coding system.
48b0f3ae
PJ
7583When called from a program, takes three arguments:
7584START, END, and CODING-SYSTEM. START and END are buffer positions.
d46c5b12 7585
df7492f9
KH
7586Optional 4th arguments DESTINATION specifies where the encoded text goes.
7587If nil, the region between START and END is replace by the encoded text.
7588If buffer, the encoded text is inserted in the buffer.
7589If t, the encoded text is returned.
2391eaa4 7590
48b0f3ae
PJ
7591This function sets `last-coding-system-used' to the precise coding system
7592used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7593not fully specified.)
7594It returns the length of the encoded text. */)
df7492f9
KH
7595 (start, end, coding_system, destination)
7596 Lisp_Object start, end, coding_system, destination;
3a73fa5d 7597{
df7492f9 7598 return code_convert_region (start, end, coding_system, destination, 1, 0);
b73bfc1c
KH
7599}
7600
7601Lisp_Object
df7492f9
KH
7602code_convert_string (string, coding_system, dst_object,
7603 encodep, nocopy, norecord)
7604 Lisp_Object string, coding_system, dst_object;
7605 int encodep, nocopy, norecord;
b73bfc1c 7606{
4031e2bf 7607 struct coding_system coding;
df7492f9 7608 EMACS_INT chars, bytes;
ec6d2bb8 7609
b7826503 7610 CHECK_STRING (string);
d46c5b12 7611 if (NILP (coding_system))
4956c225 7612 {
df7492f9
KH
7613 if (! norecord)
7614 Vlast_coding_system_used = Qno_conversion;
7615 if (NILP (dst_object))
7616 return (nocopy ? Fcopy_sequence (string) : string);
4956c225 7617 }
b73bfc1c 7618
df7492f9
KH
7619 if (NILP (coding_system))
7620 coding_system = Qno_conversion;
7621 else
7622 CHECK_CODING_SYSTEM (coding_system);
7623 if (NILP (dst_object))
7624 dst_object = Qt;
7625 else if (! EQ (dst_object, Qt))
7626 CHECK_BUFFER (dst_object);
73be902c 7627
df7492f9 7628 setup_coding_system (coding_system, &coding);
d46c5b12 7629 coding.mode |= CODING_MODE_LAST_BLOCK;
8f924df7
KH
7630 chars = SCHARS (string);
7631 bytes = SBYTES (string);
df7492f9
KH
7632 if (encodep)
7633 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7634 else
7635 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7636 if (! norecord)
7637 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
73be902c 7638
df7492f9
KH
7639 return (BUFFERP (dst_object)
7640 ? make_number (coding.produced_char)
7641 : coding.dst_object);
4ed46869 7642}
73be902c 7643
b73bfc1c 7644
ecec61c1 7645/* Encode or decode STRING according to CODING_SYSTEM.
ec6d2bb8 7646 Do not set Vlast_coding_system_used.
4ed46869 7647
ec6d2bb8
KH
7648 This function is called only from macros DECODE_FILE and
7649 ENCODE_FILE, thus we ignore character composition. */
4ed46869 7650
ecec61c1
KH
7651Lisp_Object
7652code_convert_string_norecord (string, coding_system, encodep)
7653 Lisp_Object string, coding_system;
7654 int encodep;
4ed46869 7655{
0be8721c 7656 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
4ed46869
KH
7657}
7658
4ed46869 7659
df7492f9
KH
7660DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7661 2, 4, 0,
7662 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7663
7664Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7665if the decoding operation is trivial.
ecec61c1 7666
df7492f9 7667Optional fourth arg BUFFER non-nil meant that the decoded text is
a3f6ee6d 7668inserted in BUFFER instead of returned as a string. In this case,
df7492f9 7669the return value is BUFFER.
ecec61c1 7670
df7492f9
KH
7671This function sets `last-coding-system-used' to the precise coding system
7672used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7673not fully specified. */)
7674 (string, coding_system, nocopy, buffer)
7675 Lisp_Object string, coding_system, nocopy, buffer;
4ed46869 7676{
df7492f9
KH
7677 return code_convert_string (string, coding_system, buffer,
7678 0, ! NILP (nocopy), 0);
4ed46869
KH
7679}
7680
df7492f9
KH
7681DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7682 2, 4, 0,
7683 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7684
7685Optional third arg NOCOPY non-nil means it is OK to return STRING
7686itself if the encoding operation is trivial.
7687
7688Optional fourth arg BUFFER non-nil meant that the encoded text is
a3f6ee6d 7689inserted in BUFFER instead of returned as a string. In this case,
df7492f9
KH
7690the return value is BUFFER.
7691
7692This function sets `last-coding-system-used' to the precise coding system
7693used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7694not fully specified.) */)
7695 (string, coding_system, nocopy, buffer)
7696 Lisp_Object string, coding_system, nocopy, buffer;
4ed46869 7697{
df7492f9 7698 return code_convert_string (string, coding_system, buffer,
c197f191 7699 1, ! NILP (nocopy), 1);
4ed46869 7700}
df7492f9 7701
3a73fa5d 7702\f
4ed46869 7703DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
48b0f3ae
PJ
7704 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7705Return the corresponding character. */)
7706 (code)
4ed46869 7707 Lisp_Object code;
4ed46869 7708{
df7492f9
KH
7709 Lisp_Object spec, attrs, val;
7710 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
7711 int c;
4ed46869 7712
df7492f9
KH
7713 CHECK_NATNUM (code);
7714 c = XFASTINT (code);
7715 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7716 attrs = AREF (spec, 0);
4ed46869 7717
df7492f9
KH
7718 if (ASCII_BYTE_P (c)
7719 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7720 return code;
4ed46869 7721
df7492f9
KH
7722 val = CODING_ATTR_CHARSET_LIST (attrs);
7723 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
004068e4
KH
7724 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7725 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
fa42c37f 7726
df7492f9
KH
7727 if (c <= 0x7F)
7728 charset = charset_roman;
7729 else if (c >= 0xA0 && c < 0xDF)
55ab7be3 7730 {
df7492f9
KH
7731 charset = charset_kana;
7732 c -= 0x80;
4ed46869 7733 }
55ab7be3 7734 else
4ed46869 7735 {
004068e4 7736 int s1 = c >> 8, s2 = c & 0xFF;
df7492f9
KH
7737
7738 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
7739 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
7740 error ("Invalid code: %d", code);
7741 SJIS_TO_JIS (c);
7742 charset = charset_kanji;
4ed46869 7743 }
df7492f9
KH
7744 c = DECODE_CHAR (charset, c);
7745 if (c < 0)
7746 error ("Invalid code: %d", code);
7747 return make_number (c);
93dec019 7748}
4ed46869 7749
48b0f3ae 7750
4ed46869 7751DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
48b0f3ae
PJ
7752 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7753Return the corresponding code in SJIS. */)
7754 (ch)
df7492f9 7755 Lisp_Object ch;
4ed46869 7756{
df7492f9
KH
7757 Lisp_Object spec, attrs, charset_list;
7758 int c;
7759 struct charset *charset;
7760 unsigned code;
48b0f3ae 7761
df7492f9
KH
7762 CHECK_CHARACTER (ch);
7763 c = XFASTINT (ch);
7764 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7765 attrs = AREF (spec, 0);
7766
7767 if (ASCII_CHAR_P (c)
7768 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7769 return ch;
7770
7771 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7772 charset = char_charset (c, charset_list, &code);
7773 if (code == CHARSET_INVALID_CODE (charset))
7774 error ("Can't encode by shift_jis encoding: %d", c);
7775 JIS_TO_SJIS (code);
7776
7777 return make_number (code);
4ed46869
KH
7778}
7779
7780DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
48b0f3ae
PJ
7781 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7782Return the corresponding character. */)
7783 (code)
4ed46869 7784 Lisp_Object code;
d46c5b12 7785{
df7492f9
KH
7786 Lisp_Object spec, attrs, val;
7787 struct charset *charset_roman, *charset_big5, *charset;
7788 int c;
6289dd10 7789
df7492f9
KH
7790 CHECK_NATNUM (code);
7791 c = XFASTINT (code);
7792 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7793 attrs = AREF (spec, 0);
4ed46869 7794
df7492f9
KH
7795 if (ASCII_BYTE_P (c)
7796 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7797 return code;
6289dd10 7798
df7492f9
KH
7799 val = CODING_ATTR_CHARSET_LIST (attrs);
7800 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7801 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
c210f766 7802
df7492f9
KH
7803 if (c <= 0x7F)
7804 charset = charset_roman;
c28a9453
KH
7805 else
7806 {
df7492f9
KH
7807 int b1 = c >> 8, b2 = c & 0x7F;
7808 if (b1 < 0xA1 || b1 > 0xFE
7809 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
7810 error ("Invalid code: %d", code);
7811 charset = charset_big5;
c28a9453 7812 }
df7492f9
KH
7813 c = DECODE_CHAR (charset, (unsigned )c);
7814 if (c < 0)
7815 error ("Invalid code: %d", code);
7816 return make_number (c);
d46c5b12 7817}
6289dd10 7818
4ed46869 7819DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
48b0f3ae
PJ
7820 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7821Return the corresponding character code in Big5. */)
7822 (ch)
4ed46869
KH
7823 Lisp_Object ch;
7824{
df7492f9
KH
7825 Lisp_Object spec, attrs, charset_list;
7826 struct charset *charset;
7827 int c;
7828 unsigned code;
7829
7830 CHECK_CHARACTER (ch);
7831 c = XFASTINT (ch);
7832 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7833 attrs = AREF (spec, 0);
7834 if (ASCII_CHAR_P (c)
7835 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7836 return ch;
7837
7838 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7839 charset = char_charset (c, charset_list, &code);
7840 if (code == CHARSET_INVALID_CODE (charset))
7841 error ("Can't encode by Big5 encoding: %d", c);
7842
7843 return make_number (code);
4ed46869 7844}
48b0f3ae 7845
3a73fa5d 7846\f
1ba9e4ab
KH
7847DEFUN ("set-terminal-coding-system-internal",
7848 Fset_terminal_coding_system_internal,
48b0f3ae
PJ
7849 Sset_terminal_coding_system_internal, 1, 1, 0,
7850 doc: /* Internal use only. */)
7851 (coding_system)
b74e4686 7852 Lisp_Object coding_system;
4ed46869 7853{
b7826503 7854 CHECK_SYMBOL (coding_system);
df7492f9
KH
7855 setup_coding_system (Fcheck_coding_system (coding_system),
7856 &terminal_coding);
48b0f3ae 7857
70c22245 7858 /* We had better not send unsafe characters to terminal. */
df7492f9
KH
7859 terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
7860 /* Characer composition should be disabled. */
7861 terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
b73bfc1c
KH
7862 terminal_coding.src_multibyte = 1;
7863 terminal_coding.dst_multibyte = 0;
4ed46869
KH
7864 return Qnil;
7865}
7866
c4825358
KH
7867DEFUN ("set-safe-terminal-coding-system-internal",
7868 Fset_safe_terminal_coding_system_internal,
48b0f3ae 7869 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
ddb67bdc 7870 doc: /* Internal use only. */)
48b0f3ae 7871 (coding_system)
b74e4686 7872 Lisp_Object coding_system;
d46c5b12 7873{
b7826503 7874 CHECK_SYMBOL (coding_system);
c4825358
KH
7875 setup_coding_system (Fcheck_coding_system (coding_system),
7876 &safe_terminal_coding);
df7492f9
KH
7877 /* Characer composition should be disabled. */
7878 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
b73bfc1c
KH
7879 safe_terminal_coding.src_multibyte = 1;
7880 safe_terminal_coding.dst_multibyte = 0;
c4825358
KH
7881 return Qnil;
7882}
4ed46869 7883
4ed46869
KH
7884DEFUN ("terminal-coding-system",
7885 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
48b0f3ae
PJ
7886 doc: /* Return coding system specified for terminal output. */)
7887 ()
4ed46869 7888{
df7492f9 7889 return CODING_ID_NAME (terminal_coding.id);
4ed46869
KH
7890}
7891
1ba9e4ab
KH
7892DEFUN ("set-keyboard-coding-system-internal",
7893 Fset_keyboard_coding_system_internal,
48b0f3ae
PJ
7894 Sset_keyboard_coding_system_internal, 1, 1, 0,
7895 doc: /* Internal use only. */)
7896 (coding_system)
4ed46869
KH
7897 Lisp_Object coding_system;
7898{
b7826503 7899 CHECK_SYMBOL (coding_system);
df7492f9
KH
7900 setup_coding_system (Fcheck_coding_system (coding_system),
7901 &keyboard_coding);
7902 /* Characer composition should be disabled. */
7903 keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
4ed46869
KH
7904 return Qnil;
7905}
7906
7907DEFUN ("keyboard-coding-system",
7908 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
48b0f3ae
PJ
7909 doc: /* Return coding system specified for decoding keyboard input. */)
7910 ()
4ed46869 7911{
df7492f9 7912 return CODING_ID_NAME (keyboard_coding.id);
4ed46869
KH
7913}
7914
4ed46869 7915\f
a5d301df
KH
7916DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7917 Sfind_operation_coding_system, 1, MANY, 0,
48b0f3ae
PJ
7918 doc: /* Choose a coding system for an operation based on the target name.
7919The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7920DECODING-SYSTEM is the coding system to use for decoding
7921\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7922for encoding (in case OPERATION does encoding).
05e6f5dc 7923
48b0f3ae
PJ
7924The first argument OPERATION specifies an I/O primitive:
7925 For file I/O, `insert-file-contents' or `write-region'.
7926 For process I/O, `call-process', `call-process-region', or `start-process'.
7927 For network I/O, `open-network-stream'.
05e6f5dc 7928
48b0f3ae
PJ
7929The remaining arguments should be the same arguments that were passed
7930to the primitive. Depending on which primitive, one of those arguments
7931is selected as the TARGET. For example, if OPERATION does file I/O,
7932whichever argument specifies the file name is TARGET.
05e6f5dc 7933
48b0f3ae
PJ
7934TARGET has a meaning which depends on OPERATION:
7935 For file I/O, TARGET is a file name.
7936 For process I/O, TARGET is a process name.
7937 For network I/O, TARGET is a service name or a port number
05e6f5dc 7938
48b0f3ae
PJ
7939This function looks up what specified for TARGET in,
7940`file-coding-system-alist', `process-coding-system-alist',
7941or `network-coding-system-alist' depending on OPERATION.
7942They may specify a coding system, a cons of coding systems,
7943or a function symbol to call.
7944In the last case, we call the function with one argument,
7945which is a list of all the arguments given to this function.
7946
7947usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7948 (nargs, args)
4ed46869
KH
7949 int nargs;
7950 Lisp_Object *args;
6b89e3aa 7951{
4ed46869
KH
7952 Lisp_Object operation, target_idx, target, val;
7953 register Lisp_Object chain;
177c0ea7 7954
4ed46869
KH
7955 if (nargs < 2)
7956 error ("Too few arguments");
7957 operation = args[0];
7958 if (!SYMBOLP (operation)
7959 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
df7492f9 7960 error ("Invalid first arguement");
4ed46869
KH
7961 if (nargs < 1 + XINT (target_idx))
7962 error ("Too few arguments for operation: %s",
8f924df7 7963 SDATA (SYMBOL_NAME (operation)));
4ed46869
KH
7964 target = args[XINT (target_idx) + 1];
7965 if (!(STRINGP (target)
7966 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
df7492f9 7967 error ("Invalid %dth argument", XINT (target_idx) + 1);
4ed46869 7968
2e34157c
RS
7969 chain = ((EQ (operation, Qinsert_file_contents)
7970 || EQ (operation, Qwrite_region))
02ba4723 7971 ? Vfile_coding_system_alist
2e34157c 7972 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
7973 ? Vnetwork_coding_system_alist
7974 : Vprocess_coding_system_alist));
4ed46869
KH
7975 if (NILP (chain))
7976 return Qnil;
7977
03699b14 7978 for (; CONSP (chain); chain = XCDR (chain))
6b89e3aa 7979 {
f44d27ce 7980 Lisp_Object elt;
6b89e3aa 7981
df7492f9 7982 elt = XCAR (chain);
4ed46869
KH
7983 if (CONSP (elt)
7984 && ((STRINGP (target)
03699b14
KR
7985 && STRINGP (XCAR (elt))
7986 && fast_string_match (XCAR (elt), target) >= 0)
7987 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
6b89e3aa 7988 {
03699b14 7989 val = XCDR (elt);
b19fd4c5
KH
7990 /* Here, if VAL is both a valid coding system and a valid
7991 function symbol, we return VAL as a coding system. */
02ba4723
KH
7992 if (CONSP (val))
7993 return val;
7994 if (! SYMBOLP (val))
7995 return Qnil;
7996 if (! NILP (Fcoding_system_p (val)))
7997 return Fcons (val, val);
b19fd4c5 7998 if (! NILP (Ffboundp (val)))
6b89e3aa 7999 {
b19fd4c5
KH
8000 val = call1 (val, Flist (nargs, args));
8001 if (CONSP (val))
8002 return val;
8003 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
8004 return Fcons (val, val);
6b89e3aa 8005 }
02ba4723 8006 return Qnil;
6b89e3aa
KH
8007 }
8008 }
4ed46869 8009 return Qnil;
6b89e3aa
KH
8010}
8011
df7492f9 8012DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
a3f6ee6d 8013 Sset_coding_system_priority, 0, MANY, 0,
da7db224 8014 doc: /* Assign higher priority to the coding systems given as arguments.
ff563fce 8015If multiple coding systems belongs to the same category,
a3181084
DL
8016all but the first one are ignored.
8017
8018usage: (set-coding-system-priority ...) */)
df7492f9
KH
8019 (nargs, args)
8020 int nargs;
8021 Lisp_Object *args;
8022{
8023 int i, j;
8024 int changed[coding_category_max];
8025 enum coding_category priorities[coding_category_max];
8026
8027 bzero (changed, sizeof changed);
6b89e3aa 8028
df7492f9 8029 for (i = j = 0; i < nargs; i++)
6b89e3aa 8030 {
df7492f9
KH
8031 enum coding_category category;
8032 Lisp_Object spec, attrs;
6b89e3aa 8033
df7492f9
KH
8034 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
8035 attrs = AREF (spec, 0);
8036 category = XINT (CODING_ATTR_CATEGORY (attrs));
8037 if (changed[category])
8038 /* Ignore this coding system because a coding system of the
8039 same category already had a higher priority. */
8040 continue;
8041 changed[category] = 1;
8042 priorities[j++] = category;
8043 if (coding_categories[category].id >= 0
8044 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
8045 setup_coding_system (args[i], &coding_categories[category]);
ff563fce 8046 Fset (AREF (Vcoding_category_table, category), args[i]);
df7492f9 8047 }
6b89e3aa 8048
df7492f9
KH
8049 /* Now we have decided top J priorities. Reflect the order of the
8050 original priorities to the remaining priorities. */
6b89e3aa 8051
df7492f9 8052 for (i = j, j = 0; i < coding_category_max; i++, j++)
6b89e3aa 8053 {
df7492f9
KH
8054 while (j < coding_category_max
8055 && changed[coding_priorities[j]])
8056 j++;
8057 if (j == coding_category_max)
8058 abort ();
8059 priorities[i] = coding_priorities[j];
8060 }
6b89e3aa 8061
df7492f9 8062 bcopy (priorities, coding_priorities, sizeof priorities);
177c0ea7 8063
ff563fce
KH
8064 /* Update `coding-category-list'. */
8065 Vcoding_category_list = Qnil;
8066 for (i = coding_category_max - 1; i >= 0; i--)
8067 Vcoding_category_list
8068 = Fcons (AREF (Vcoding_category_table, priorities[i]),
8069 Vcoding_category_list);
6b89e3aa 8070
df7492f9 8071 return Qnil;
6b89e3aa
KH
8072}
8073
df7492f9
KH
8074DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
8075 Scoding_system_priority_list, 0, 1, 0,
da7db224
DL
8076 doc: /* Return a list of coding systems ordered by their priorities.
8077HIGHESTP non-nil means just return the highest priority one. */)
df7492f9
KH
8078 (highestp)
8079 Lisp_Object highestp;
d46c5b12
KH
8080{
8081 int i;
df7492f9 8082 Lisp_Object val;
6b89e3aa 8083
df7492f9 8084 for (i = 0, val = Qnil; i < coding_category_max; i++)
d46c5b12 8085 {
df7492f9
KH
8086 enum coding_category category = coding_priorities[i];
8087 int id = coding_categories[category].id;
8088 Lisp_Object attrs;
068a9dbd 8089
df7492f9
KH
8090 if (id < 0)
8091 continue;
8092 attrs = CODING_ID_ATTRS (id);
8093 if (! NILP (highestp))
8094 return CODING_ATTR_BASE_NAME (attrs);
8095 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
8096 }
8097 return Fnreverse (val);
8098}
068a9dbd 8099
f0064e1f 8100static char *suffixes[] = { "-unix", "-dos", "-mac" };
068a9dbd
KH
8101
8102static Lisp_Object
df7492f9
KH
8103make_subsidiaries (base)
8104 Lisp_Object base;
068a9dbd 8105{
df7492f9 8106 Lisp_Object subsidiaries;
8f924df7 8107 int base_name_len = SBYTES (SYMBOL_NAME (base));
df7492f9
KH
8108 char *buf = (char *) alloca (base_name_len + 6);
8109 int i;
068a9dbd 8110
8f924df7 8111 bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
df7492f9
KH
8112 subsidiaries = Fmake_vector (make_number (3), Qnil);
8113 for (i = 0; i < 3; i++)
068a9dbd 8114 {
df7492f9
KH
8115 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
8116 ASET (subsidiaries, i, intern (buf));
068a9dbd 8117 }
df7492f9 8118 return subsidiaries;
068a9dbd
KH
8119}
8120
8121
df7492f9
KH
8122DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
8123 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
1fcd6c8b
DL
8124 doc: /* For internal use only.
8125usage: (define-coding-system-internal ...) */)
df7492f9
KH
8126 (nargs, args)
8127 int nargs;
8128 Lisp_Object *args;
068a9dbd 8129{
df7492f9
KH
8130 Lisp_Object name;
8131 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
8132 Lisp_Object attrs; /* Vector of attributes. */
8133 Lisp_Object eol_type;
8134 Lisp_Object aliases;
8135 Lisp_Object coding_type, charset_list, safe_charsets;
8136 enum coding_category category;
8137 Lisp_Object tail, val;
8138 int max_charset_id = 0;
8139 int i;
068a9dbd 8140
df7492f9
KH
8141 if (nargs < coding_arg_max)
8142 goto short_args;
068a9dbd 8143
df7492f9 8144 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
068a9dbd 8145
df7492f9
KH
8146 name = args[coding_arg_name];
8147 CHECK_SYMBOL (name);
8148 CODING_ATTR_BASE_NAME (attrs) = name;
068a9dbd 8149
df7492f9
KH
8150 val = args[coding_arg_mnemonic];
8151 if (! STRINGP (val))
8152 CHECK_CHARACTER (val);
8153 CODING_ATTR_MNEMONIC (attrs) = val;
068a9dbd 8154
df7492f9
KH
8155 coding_type = args[coding_arg_coding_type];
8156 CHECK_SYMBOL (coding_type);
8157 CODING_ATTR_TYPE (attrs) = coding_type;
068a9dbd 8158
df7492f9
KH
8159 charset_list = args[coding_arg_charset_list];
8160 if (SYMBOLP (charset_list))
8161 {
8162 if (EQ (charset_list, Qiso_2022))
8163 {
8164 if (! EQ (coding_type, Qiso_2022))
8165 error ("Invalid charset-list");
8166 charset_list = Viso_2022_charset_list;
8167 }
8168 else if (EQ (charset_list, Qemacs_mule))
8169 {
8170 if (! EQ (coding_type, Qemacs_mule))
8171 error ("Invalid charset-list");
8172 charset_list = Vemacs_mule_charset_list;
8173 }
8174 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8175 if (max_charset_id < XFASTINT (XCAR (tail)))
8176 max_charset_id = XFASTINT (XCAR (tail));
8177 }
068a9dbd
KH
8178 else
8179 {
df7492f9
KH
8180 charset_list = Fcopy_sequence (charset_list);
8181 for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
068a9dbd 8182 {
df7492f9
KH
8183 struct charset *charset;
8184
8185 val = Fcar (tail);
8186 CHECK_CHARSET_GET_CHARSET (val, charset);
8187 if (EQ (coding_type, Qiso_2022)
8188 ? CHARSET_ISO_FINAL (charset) < 0
8189 : EQ (coding_type, Qemacs_mule)
8190 ? CHARSET_EMACS_MULE_ID (charset) < 0
8191 : 0)
8192 error ("Can't handle charset `%s'",
8f924df7 8193 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
df7492f9 8194
8f924df7 8195 XSETCAR (tail, make_number (charset->id));
df7492f9
KH
8196 if (max_charset_id < charset->id)
8197 max_charset_id = charset->id;
068a9dbd
KH
8198 }
8199 }
df7492f9 8200 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
068a9dbd 8201
df7492f9
KH
8202 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
8203 make_number (255));
8204 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8f924df7 8205 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
df7492f9 8206 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
068a9dbd 8207
584948ac 8208 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
3a73fa5d 8209
df7492f9 8210 val = args[coding_arg_decode_translation_table];
a6f87d34 8211 if (! CHAR_TABLE_P (val) && ! CONSP (val))
7d64c6ad 8212 CHECK_SYMBOL (val);
df7492f9 8213 CODING_ATTR_DECODE_TBL (attrs) = val;
3a73fa5d 8214
df7492f9 8215 val = args[coding_arg_encode_translation_table];
a6f87d34 8216 if (! CHAR_TABLE_P (val) && ! CONSP (val))
7d64c6ad 8217 CHECK_SYMBOL (val);
df7492f9 8218 CODING_ATTR_ENCODE_TBL (attrs) = val;
d46c5b12 8219
df7492f9
KH
8220 val = args[coding_arg_post_read_conversion];
8221 CHECK_SYMBOL (val);
8222 CODING_ATTR_POST_READ (attrs) = val;
d46c5b12 8223
df7492f9
KH
8224 val = args[coding_arg_pre_write_conversion];
8225 CHECK_SYMBOL (val);
8226 CODING_ATTR_PRE_WRITE (attrs) = val;
3a73fa5d 8227
df7492f9
KH
8228 val = args[coding_arg_default_char];
8229 if (NILP (val))
8230 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
8231 else
8232 {
8f924df7 8233 CHECK_CHARACTER (val);
df7492f9
KH
8234 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8235 }
4031e2bf 8236
8f924df7
KH
8237 val = args[coding_arg_for_unibyte];
8238 CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
3a73fa5d 8239
df7492f9
KH
8240 val = args[coding_arg_plist];
8241 CHECK_LIST (val);
8242 CODING_ATTR_PLIST (attrs) = val;
3a73fa5d 8243
df7492f9
KH
8244 if (EQ (coding_type, Qcharset))
8245 {
c7c66a95
KH
8246 /* Generate a lisp vector of 256 elements. Each element is nil,
8247 integer, or a list of charset IDs.
3a73fa5d 8248
c7c66a95
KH
8249 If Nth element is nil, the byte code N is invalid in this
8250 coding system.
4ed46869 8251
c7c66a95
KH
8252 If Nth element is a number NUM, N is the first byte of a
8253 charset whose ID is NUM.
4ed46869 8254
c7c66a95
KH
8255 If Nth element is a list of charset IDs, N is the first byte
8256 of one of them. The list is sorted by dimensions of the
2bc515e4 8257 charsets. A charset of smaller dimension comes first. */
df7492f9 8258 val = Fmake_vector (make_number (256), Qnil);
4ed46869 8259
5c99c2e6 8260 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
df7492f9 8261 {
c7c66a95
KH
8262 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8263 int dim = CHARSET_DIMENSION (charset);
8264 int idx = (dim - 1) * 4;
4ed46869 8265
5c99c2e6 8266 if (CHARSET_ASCII_COMPATIBLE_P (charset))
584948ac 8267 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
4031e2bf 8268
15d143f7
KH
8269 for (i = charset->code_space[idx];
8270 i <= charset->code_space[idx + 1]; i++)
8271 {
c7c66a95
KH
8272 Lisp_Object tmp, tmp2;
8273 int dim2;
ec6d2bb8 8274
c7c66a95
KH
8275 tmp = AREF (val, i);
8276 if (NILP (tmp))
8277 tmp = XCAR (tail);
8278 else if (NUMBERP (tmp))
8279 {
8280 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8281 if (dim < dim2)
c7c66a95 8282 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
f9d71dcd
KH
8283 else
8284 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
c7c66a95 8285 }
15d143f7 8286 else
c7c66a95
KH
8287 {
8288 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8289 {
8290 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8291 if (dim < dim2)
8292 break;
8293 }
8294 if (NILP (tmp2))
8295 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8296 else
8297 {
8298 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8299 XSETCAR (tmp2, XCAR (tail));
8300 }
8301 }
8302 ASET (val, i, tmp);
15d143f7 8303 }
df7492f9
KH
8304 }
8305 ASET (attrs, coding_attr_charset_valids, val);
8306 category = coding_category_charset;
8307 }
8308 else if (EQ (coding_type, Qccl))
8309 {
8310 Lisp_Object valids;
ecec61c1 8311
df7492f9
KH
8312 if (nargs < coding_arg_ccl_max)
8313 goto short_args;
ecec61c1 8314
df7492f9
KH
8315 val = args[coding_arg_ccl_decoder];
8316 CHECK_CCL_PROGRAM (val);
8317 if (VECTORP (val))
8318 val = Fcopy_sequence (val);
8319 ASET (attrs, coding_attr_ccl_decoder, val);
ecec61c1 8320
df7492f9
KH
8321 val = args[coding_arg_ccl_encoder];
8322 CHECK_CCL_PROGRAM (val);
8323 if (VECTORP (val))
8324 val = Fcopy_sequence (val);
8325 ASET (attrs, coding_attr_ccl_encoder, val);
ecec61c1 8326
df7492f9
KH
8327 val = args[coding_arg_ccl_valids];
8328 valids = Fmake_string (make_number (256), make_number (0));
8329 for (tail = val; !NILP (tail); tail = Fcdr (tail))
8330 {
8dcbea82 8331 int from, to;
ecec61c1 8332
df7492f9
KH
8333 val = Fcar (tail);
8334 if (INTEGERP (val))
8dcbea82
KH
8335 {
8336 from = to = XINT (val);
8337 if (from < 0 || from > 255)
8338 args_out_of_range_3 (val, make_number (0), make_number (255));
8339 }
df7492f9
KH
8340 else
8341 {
df7492f9 8342 CHECK_CONS (val);
8f924df7
KH
8343 CHECK_NATNUM_CAR (val);
8344 CHECK_NATNUM_CDR (val);
df7492f9 8345 from = XINT (XCAR (val));
8f924df7 8346 if (from > 255)
8dcbea82
KH
8347 args_out_of_range_3 (XCAR (val),
8348 make_number (0), make_number (255));
df7492f9 8349 to = XINT (XCDR (val));
8dcbea82
KH
8350 if (to < from || to > 255)
8351 args_out_of_range_3 (XCDR (val),
8352 XCAR (val), make_number (255));
df7492f9 8353 }
8dcbea82 8354 for (i = from; i <= to; i++)
8f924df7 8355 SSET (valids, i, 1);
df7492f9
KH
8356 }
8357 ASET (attrs, coding_attr_ccl_valids, valids);
4ed46869 8358
df7492f9 8359 category = coding_category_ccl;
55ab7be3 8360 }
df7492f9 8361 else if (EQ (coding_type, Qutf_16))
55ab7be3 8362 {
df7492f9 8363 Lisp_Object bom, endian;
4ed46869 8364
584948ac 8365 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
4ed46869 8366
df7492f9
KH
8367 if (nargs < coding_arg_utf16_max)
8368 goto short_args;
4ed46869 8369
df7492f9
KH
8370 bom = args[coding_arg_utf16_bom];
8371 if (! NILP (bom) && ! EQ (bom, Qt))
8372 {
8373 CHECK_CONS (bom);
8f924df7
KH
8374 val = XCAR (bom);
8375 CHECK_CODING_SYSTEM (val);
8376 val = XCDR (bom);
8377 CHECK_CODING_SYSTEM (val);
df7492f9
KH
8378 }
8379 ASET (attrs, coding_attr_utf_16_bom, bom);
8380
8381 endian = args[coding_arg_utf16_endian];
b49a1807
KH
8382 CHECK_SYMBOL (endian);
8383 if (NILP (endian))
8384 endian = Qbig;
8385 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8f924df7 8386 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
df7492f9
KH
8387 ASET (attrs, coding_attr_utf_16_endian, endian);
8388
8389 category = (CONSP (bom)
8390 ? coding_category_utf_16_auto
8391 : NILP (bom)
b49a1807 8392 ? (EQ (endian, Qbig)
df7492f9
KH
8393 ? coding_category_utf_16_be_nosig
8394 : coding_category_utf_16_le_nosig)
b49a1807 8395 : (EQ (endian, Qbig)
df7492f9
KH
8396 ? coding_category_utf_16_be
8397 : coding_category_utf_16_le));
8398 }
8399 else if (EQ (coding_type, Qiso_2022))
8400 {
8401 Lisp_Object initial, reg_usage, request, flags;
4776e638 8402 int i;
1397dc18 8403
df7492f9
KH
8404 if (nargs < coding_arg_iso2022_max)
8405 goto short_args;
8406
8407 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8408 CHECK_VECTOR (initial);
8409 for (i = 0; i < 4; i++)
8410 {
8411 val = Faref (initial, make_number (i));
8412 if (! NILP (val))
8413 {
584948ac
KH
8414 struct charset *charset;
8415
8416 CHECK_CHARSET_GET_CHARSET (val, charset);
8417 ASET (initial, i, make_number (CHARSET_ID (charset)));
8418 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
8419 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9
KH
8420 }
8421 else
8422 ASET (initial, i, make_number (-1));
8423 }
8424
8425 reg_usage = args[coding_arg_iso2022_reg_usage];
8426 CHECK_CONS (reg_usage);
8f924df7
KH
8427 CHECK_NUMBER_CAR (reg_usage);
8428 CHECK_NUMBER_CDR (reg_usage);
df7492f9
KH
8429
8430 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8431 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
1397dc18 8432 {
df7492f9 8433 int id;
8f924df7 8434 Lisp_Object tmp;
df7492f9
KH
8435
8436 val = Fcar (tail);
8437 CHECK_CONS (val);
8f924df7
KH
8438 tmp = XCAR (val);
8439 CHECK_CHARSET_GET_ID (tmp, id);
8440 CHECK_NATNUM_CDR (val);
df7492f9
KH
8441 if (XINT (XCDR (val)) >= 4)
8442 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8f924df7 8443 XSETCAR (val, make_number (id));
1397dc18 8444 }
4ed46869 8445
df7492f9
KH
8446 flags = args[coding_arg_iso2022_flags];
8447 CHECK_NATNUM (flags);
8448 i = XINT (flags);
8449 if (EQ (args[coding_arg_charset_list], Qiso_2022))
8450 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8451
8452 ASET (attrs, coding_attr_iso_initial, initial);
8453 ASET (attrs, coding_attr_iso_usage, reg_usage);
8454 ASET (attrs, coding_attr_iso_request, request);
8455 ASET (attrs, coding_attr_iso_flags, flags);
8456 setup_iso_safe_charsets (attrs);
8457
8458 if (i & CODING_ISO_FLAG_SEVEN_BITS)
8459 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8460 | CODING_ISO_FLAG_SINGLE_SHIFT))
8461 ? coding_category_iso_7_else
8462 : EQ (args[coding_arg_charset_list], Qiso_2022)
8463 ? coding_category_iso_7
8464 : coding_category_iso_7_tight);
8465 else
8466 {
8467 int id = XINT (AREF (initial, 1));
8468
c6fb6e98 8469 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
df7492f9
KH
8470 || EQ (args[coding_arg_charset_list], Qiso_2022)
8471 || id < 0)
8472 ? coding_category_iso_8_else
8473 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8474 ? coding_category_iso_8_1
8475 : coding_category_iso_8_2);
8476 }
0ce7886f
KH
8477 if (category != coding_category_iso_8_1
8478 && category != coding_category_iso_8_2)
8479 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
df7492f9
KH
8480 }
8481 else if (EQ (coding_type, Qemacs_mule))
c28a9453 8482 {
df7492f9
KH
8483 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8484 ASET (attrs, coding_attr_emacs_mule_full, Qt);
584948ac 8485 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9 8486 category = coding_category_emacs_mule;
c28a9453 8487 }
df7492f9 8488 else if (EQ (coding_type, Qshift_jis))
c28a9453 8489 {
df7492f9
KH
8490
8491 struct charset *charset;
8492
7d64c6ad 8493 if (XINT (Flength (charset_list)) != 3
6e07c25f 8494 && XINT (Flength (charset_list)) != 4)
7d64c6ad 8495 error ("There should be three or four charsets");
df7492f9
KH
8496
8497 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8498 if (CHARSET_DIMENSION (charset) != 1)
8499 error ("Dimension of charset %s is not one",
8f924df7 8500 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
584948ac
KH
8501 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8502 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9
KH
8503
8504 charset_list = XCDR (charset_list);
8505 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8506 if (CHARSET_DIMENSION (charset) != 1)
8507 error ("Dimension of charset %s is not one",
8f924df7 8508 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
df7492f9
KH
8509
8510 charset_list = XCDR (charset_list);
8511 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8512 if (CHARSET_DIMENSION (charset) != 2)
7d64c6ad
KH
8513 error ("Dimension of charset %s is not two",
8514 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8515
8516 charset_list = XCDR (charset_list);
2b917a06
KH
8517 if (! NILP (charset_list))
8518 {
8519 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8520 if (CHARSET_DIMENSION (charset) != 2)
8521 error ("Dimension of charset %s is not two",
8522 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8523 }
df7492f9
KH
8524
8525 category = coding_category_sjis;
8526 Vsjis_coding_system = name;
c28a9453 8527 }
df7492f9
KH
8528 else if (EQ (coding_type, Qbig5))
8529 {
8530 struct charset *charset;
4ed46869 8531
df7492f9
KH
8532 if (XINT (Flength (charset_list)) != 2)
8533 error ("There should be just two charsets");
8534
8535 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8536 if (CHARSET_DIMENSION (charset) != 1)
8537 error ("Dimension of charset %s is not one",
8f924df7 8538 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
584948ac
KH
8539 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8540 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9
KH
8541
8542 charset_list = XCDR (charset_list);
8543 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8544 if (CHARSET_DIMENSION (charset) != 2)
8545 error ("Dimension of charset %s is not two",
8f924df7 8546 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
4ed46869 8547
df7492f9
KH
8548 category = coding_category_big5;
8549 Vbig5_coding_system = name;
8550 }
8551 else if (EQ (coding_type, Qraw_text))
c28a9453 8552 {
584948ac
KH
8553 category = coding_category_raw_text;
8554 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
c28a9453 8555 }
df7492f9 8556 else if (EQ (coding_type, Qutf_8))
4ed46869 8557 {
584948ac
KH
8558 category = coding_category_utf_8;
8559 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
4ed46869 8560 }
df7492f9
KH
8561 else if (EQ (coding_type, Qundecided))
8562 category = coding_category_undecided;
4ed46869 8563 else
df7492f9 8564 error ("Invalid coding system type: %s",
8f924df7 8565 SDATA (SYMBOL_NAME (coding_type)));
4ed46869 8566
df7492f9 8567 CODING_ATTR_CATEGORY (attrs) = make_number (category);
01378f49
KH
8568 CODING_ATTR_PLIST (attrs)
8569 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
8570 CODING_ATTR_PLIST (attrs)));
c4825358 8571
df7492f9
KH
8572 eol_type = args[coding_arg_eol_type];
8573 if (! NILP (eol_type)
8574 && ! EQ (eol_type, Qunix)
8575 && ! EQ (eol_type, Qdos)
8576 && ! EQ (eol_type, Qmac))
8577 error ("Invalid eol-type");
4ed46869 8578
df7492f9 8579 aliases = Fcons (name, Qnil);
4ed46869 8580
df7492f9
KH
8581 if (NILP (eol_type))
8582 {
8583 eol_type = make_subsidiaries (name);
8584 for (i = 0; i < 3; i++)
1397dc18 8585 {
df7492f9
KH
8586 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
8587
8588 this_name = AREF (eol_type, i);
8589 this_aliases = Fcons (this_name, Qnil);
8590 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
8591 this_spec = Fmake_vector (make_number (3), attrs);
8592 ASET (this_spec, 1, this_aliases);
8593 ASET (this_spec, 2, this_eol_type);
8594 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
8595 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
8596 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
8597 Vcoding_system_alist);
1397dc18 8598 }
d46c5b12 8599 }
4ed46869 8600
df7492f9
KH
8601 spec_vec = Fmake_vector (make_number (3), attrs);
8602 ASET (spec_vec, 1, aliases);
8603 ASET (spec_vec, 2, eol_type);
48b0f3ae 8604
df7492f9
KH
8605 Fputhash (name, spec_vec, Vcoding_system_hash_table);
8606 Vcoding_system_list = Fcons (name, Vcoding_system_list);
8607 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
8608 Vcoding_system_alist);
48b0f3ae 8609
df7492f9
KH
8610 {
8611 int id = coding_categories[category].id;
48b0f3ae 8612
df7492f9
KH
8613 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
8614 setup_coding_system (name, &coding_categories[category]);
8615 }
48b0f3ae 8616
d46c5b12 8617 return Qnil;
48b0f3ae 8618
df7492f9
KH
8619 short_args:
8620 return Fsignal (Qwrong_number_of_arguments,
8621 Fcons (intern ("define-coding-system-internal"),
8622 make_number (nargs)));
d46c5b12 8623}
4ed46869 8624
d6925f38 8625
a6f87d34
KH
8626DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
8627 3, 3, 0,
8628 doc: /* Change value in CODING-SYSTEM's property list PROP to VAL. */)
8629 (coding_system, prop, val)
8630 Lisp_Object coding_system, prop, val;
8631{
8632 Lisp_Object spec, attrs, plist;
8633
8634 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8635 attrs = AREF (spec, 0);
8636 if (EQ (prop, QCmnemonic))
8637 {
8638 if (! STRINGP (val))
8639 CHECK_CHARACTER (val);
8640 CODING_ATTR_MNEMONIC (attrs) = val;
8641 }
8642 else if (EQ (prop, QCdefalut_char))
8643 {
8644 if (NILP (val))
8645 val = make_number (' ');
8646 else
8647 CHECK_CHARACTER (val);
8648 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8649 }
8650 else if (EQ (prop, QCdecode_translation_table))
8651 {
8652 if (! CHAR_TABLE_P (val) && ! CONSP (val))
8653 CHECK_SYMBOL (val);
8654 CODING_ATTR_DECODE_TBL (attrs) = val;
8655 }
8656 else if (EQ (prop, QCencode_translation_table))
8657 {
8658 if (! CHAR_TABLE_P (val) && ! CONSP (val))
8659 CHECK_SYMBOL (val);
8660 CODING_ATTR_ENCODE_TBL (attrs) = val;
8661 }
8662 else if (EQ (prop, QCpost_read_conversion))
8663 {
8664 CHECK_SYMBOL (val);
8665 CODING_ATTR_POST_READ (attrs) = val;
8666 }
8667 else if (EQ (prop, QCpre_write_conversion))
8668 {
8669 CHECK_SYMBOL (val);
8670 CODING_ATTR_PRE_WRITE (attrs) = val;
8671 }
8672
8673 CODING_ATTR_PLIST (attrs)
8674 = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
8675 return val;
8676}
8677
8678
df7492f9
KH
8679DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
8680 Sdefine_coding_system_alias, 2, 2, 0,
8681 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
8682 (alias, coding_system)
8683 Lisp_Object alias, coding_system;
66cfb530 8684{
df7492f9 8685 Lisp_Object spec, aliases, eol_type;
4ed46869 8686
df7492f9
KH
8687 CHECK_SYMBOL (alias);
8688 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8689 aliases = AREF (spec, 1);
d6925f38
KH
8690 /* ALISES should be a list of length more than zero, and the first
8691 element is a base coding system. Append ALIAS at the tail of the
8692 list. */
df7492f9
KH
8693 while (!NILP (XCDR (aliases)))
8694 aliases = XCDR (aliases);
8f924df7 8695 XSETCDR (aliases, Fcons (alias, Qnil));
4ed46869 8696
df7492f9
KH
8697 eol_type = AREF (spec, 2);
8698 if (VECTORP (eol_type))
4ed46869 8699 {
df7492f9
KH
8700 Lisp_Object subsidiaries;
8701 int i;
4ed46869 8702
df7492f9
KH
8703 subsidiaries = make_subsidiaries (alias);
8704 for (i = 0; i < 3; i++)
8705 Fdefine_coding_system_alias (AREF (subsidiaries, i),
8706 AREF (eol_type, i));
4ed46869 8707 }
df7492f9
KH
8708
8709 Fputhash (alias, spec, Vcoding_system_hash_table);
d6925f38 8710 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
5bad0796
DL
8711 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
8712 Vcoding_system_alist);
66cfb530 8713
4ed46869
KH
8714 return Qnil;
8715}
8716
df7492f9
KH
8717DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
8718 1, 1, 0,
8719 doc: /* Return the base of CODING-SYSTEM.
da7db224 8720Any alias or subsidiary coding system is not a base coding system. */)
df7492f9
KH
8721 (coding_system)
8722 Lisp_Object coding_system;
d46c5b12 8723{
df7492f9 8724 Lisp_Object spec, attrs;
d46c5b12 8725
df7492f9
KH
8726 if (NILP (coding_system))
8727 return (Qno_conversion);
8728 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8729 attrs = AREF (spec, 0);
8730 return CODING_ATTR_BASE_NAME (attrs);
8731}
1397dc18 8732
df7492f9
KH
8733DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
8734 1, 1, 0,
8735 doc: "Return the property list of CODING-SYSTEM.")
8736 (coding_system)
8737 Lisp_Object coding_system;
8738{
8739 Lisp_Object spec, attrs;
1397dc18 8740
df7492f9
KH
8741 if (NILP (coding_system))
8742 coding_system = Qno_conversion;
8743 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8744 attrs = AREF (spec, 0);
8745 return CODING_ATTR_PLIST (attrs);
d46c5b12
KH
8746}
8747
df7492f9
KH
8748
8749DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
8750 1, 1, 0,
da7db224 8751 doc: /* Return the list of aliases of CODING-SYSTEM. */)
df7492f9
KH
8752 (coding_system)
8753 Lisp_Object coding_system;
66cfb530 8754{
df7492f9 8755 Lisp_Object spec;
84d60297 8756
df7492f9
KH
8757 if (NILP (coding_system))
8758 coding_system = Qno_conversion;
8759 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
da7db224 8760 return AREF (spec, 1);
df7492f9 8761}
66cfb530 8762
df7492f9
KH
8763DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
8764 Scoding_system_eol_type, 1, 1, 0,
8765 doc: /* Return eol-type of CODING-SYSTEM.
8766An eol-type is integer 0, 1, 2, or a vector of coding systems.
66cfb530 8767
df7492f9
KH
8768Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
8769and CR respectively.
66cfb530 8770
df7492f9
KH
8771A vector value indicates that a format of end-of-line should be
8772detected automatically. Nth element of the vector is the subsidiary
8773coding system whose eol-type is N. */)
6b89e3aa
KH
8774 (coding_system)
8775 Lisp_Object coding_system;
8776{
df7492f9
KH
8777 Lisp_Object spec, eol_type;
8778 int n;
6b89e3aa 8779
df7492f9
KH
8780 if (NILP (coding_system))
8781 coding_system = Qno_conversion;
8782 if (! CODING_SYSTEM_P (coding_system))
8783 return Qnil;
8784 spec = CODING_SYSTEM_SPEC (coding_system);
8785 eol_type = AREF (spec, 2);
8786 if (VECTORP (eol_type))
8787 return Fcopy_sequence (eol_type);
8788 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
8789 return make_number (n);
6b89e3aa
KH
8790}
8791
4ed46869
KH
8792#endif /* emacs */
8793
8794\f
1397dc18 8795/*** 12. Postamble ***/
4ed46869 8796
dfcf069d 8797void
4ed46869
KH
8798init_coding_once ()
8799{
8800 int i;
8801
df7492f9
KH
8802 for (i = 0; i < coding_category_max; i++)
8803 {
8804 coding_categories[i].id = -1;
8805 coding_priorities[i] = i;
8806 }
4ed46869
KH
8807
8808 /* ISO2022 specific initialize routine. */
8809 for (i = 0; i < 0x20; i++)
b73bfc1c 8810 iso_code_class[i] = ISO_control_0;
4ed46869
KH
8811 for (i = 0x21; i < 0x7F; i++)
8812 iso_code_class[i] = ISO_graphic_plane_0;
8813 for (i = 0x80; i < 0xA0; i++)
b73bfc1c 8814 iso_code_class[i] = ISO_control_1;
4ed46869
KH
8815 for (i = 0xA1; i < 0xFF; i++)
8816 iso_code_class[i] = ISO_graphic_plane_1;
8817 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
8818 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
4ed46869
KH
8819 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
8820 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
8821 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
8822 iso_code_class[ISO_CODE_ESC] = ISO_escape;
8823 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
8824 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
8825 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
8826
df7492f9
KH
8827 for (i = 0; i < 256; i++)
8828 {
8829 emacs_mule_bytes[i] = 1;
8830 }
7c78e542
KH
8831 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
8832 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
8833 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
8834 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
e0e989f6
KH
8835}
8836
8837#ifdef emacs
8838
dfcf069d 8839void
e0e989f6
KH
8840syms_of_coding ()
8841{
df7492f9 8842 staticpro (&Vcoding_system_hash_table);
8f924df7
KH
8843 {
8844 Lisp_Object args[2];
8845 args[0] = QCtest;
8846 args[1] = Qeq;
8847 Vcoding_system_hash_table = Fmake_hash_table (2, args);
8848 }
df7492f9
KH
8849
8850 staticpro (&Vsjis_coding_system);
8851 Vsjis_coding_system = Qnil;
e0e989f6 8852
df7492f9
KH
8853 staticpro (&Vbig5_coding_system);
8854 Vbig5_coding_system = Qnil;
8855
24a73b0a
KH
8856 staticpro (&Vcode_conversion_reused_workbuf);
8857 Vcode_conversion_reused_workbuf = Qnil;
8858
8859 staticpro (&Vcode_conversion_workbuf_name);
8860 Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
e0e989f6 8861
24a73b0a 8862 reused_workbuf_in_use = 0;
df7492f9
KH
8863
8864 DEFSYM (Qcharset, "charset");
8865 DEFSYM (Qtarget_idx, "target-idx");
8866 DEFSYM (Qcoding_system_history, "coding-system-history");
bb0115a2
RS
8867 Fset (Qcoding_system_history, Qnil);
8868
9ce27fde 8869 /* Target FILENAME is the first argument. */
e0e989f6 8870 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 8871 /* Target FILENAME is the third argument. */
e0e989f6
KH
8872 Fput (Qwrite_region, Qtarget_idx, make_number (2));
8873
df7492f9 8874 DEFSYM (Qcall_process, "call-process");
9ce27fde 8875 /* Target PROGRAM is the first argument. */
e0e989f6
KH
8876 Fput (Qcall_process, Qtarget_idx, make_number (0));
8877
df7492f9 8878 DEFSYM (Qcall_process_region, "call-process-region");
9ce27fde 8879 /* Target PROGRAM is the third argument. */
e0e989f6
KH
8880 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
8881
df7492f9 8882 DEFSYM (Qstart_process, "start-process");
9ce27fde 8883 /* Target PROGRAM is the third argument. */
e0e989f6
KH
8884 Fput (Qstart_process, Qtarget_idx, make_number (2));
8885
df7492f9 8886 DEFSYM (Qopen_network_stream, "open-network-stream");
9ce27fde 8887 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
8888 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
8889
df7492f9
KH
8890 DEFSYM (Qcoding_system, "coding-system");
8891 DEFSYM (Qcoding_aliases, "coding-aliases");
4ed46869 8892
df7492f9
KH
8893 DEFSYM (Qeol_type, "eol-type");
8894 DEFSYM (Qunix, "unix");
8895 DEFSYM (Qdos, "dos");
4ed46869 8896
df7492f9
KH
8897 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
8898 DEFSYM (Qpost_read_conversion, "post-read-conversion");
8899 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
8900 DEFSYM (Qdefault_char, "default-char");
8901 DEFSYM (Qundecided, "undecided");
8902 DEFSYM (Qno_conversion, "no-conversion");
8903 DEFSYM (Qraw_text, "raw-text");
4ed46869 8904
df7492f9 8905 DEFSYM (Qiso_2022, "iso-2022");
4ed46869 8906
df7492f9 8907 DEFSYM (Qutf_8, "utf-8");
8f924df7 8908 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
27901516 8909
df7492f9 8910 DEFSYM (Qutf_16, "utf-16");
df7492f9
KH
8911 DEFSYM (Qbig, "big");
8912 DEFSYM (Qlittle, "little");
27901516 8913
df7492f9
KH
8914 DEFSYM (Qshift_jis, "shift-jis");
8915 DEFSYM (Qbig5, "big5");
4ed46869 8916
df7492f9 8917 DEFSYM (Qcoding_system_p, "coding-system-p");
4ed46869 8918
df7492f9 8919 DEFSYM (Qcoding_system_error, "coding-system-error");
4ed46869
KH
8920 Fput (Qcoding_system_error, Qerror_conditions,
8921 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
8922 Fput (Qcoding_system_error, Qerror_message,
9ce27fde 8923 build_string ("Invalid coding system"));
4ed46869 8924
05e6f5dc
KH
8925 /* Intern this now in case it isn't already done.
8926 Setting this variable twice is harmless.
8927 But don't staticpro it here--that is done in alloc.c. */
8928 Qchar_table_extra_slots = intern ("char-table-extra-slots");
70c22245 8929
df7492f9 8930 DEFSYM (Qtranslation_table, "translation-table");
1397dc18 8931 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
df7492f9
KH
8932 DEFSYM (Qtranslation_table_id, "translation-table-id");
8933 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
8934 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
1397dc18 8935
df7492f9 8936 DEFSYM (Qvalid_codes, "valid-codes");
9ce27fde 8937
df7492f9 8938 DEFSYM (Qemacs_mule, "emacs-mule");
d46c5b12 8939
01378f49 8940 DEFSYM (QCcategory, ":category");
a6f87d34
KH
8941 DEFSYM (QCmnemonic, ":mnemonic");
8942 DEFSYM (QCdefalut_char, ":default-char");
8943 DEFSYM (QCdecode_translation_table, ":decode-translation-table");
8944 DEFSYM (QCencode_translation_table, ":encode-translation-table");
8945 DEFSYM (QCpost_read_conversion, ":post-read-conversion");
8946 DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
01378f49 8947
df7492f9
KH
8948 Vcoding_category_table
8949 = Fmake_vector (make_number (coding_category_max), Qnil);
8950 staticpro (&Vcoding_category_table);
8951 /* The following are targets of code detection. */
8952 ASET (Vcoding_category_table, coding_category_iso_7,
8953 intern ("coding-category-iso-7"));
8954 ASET (Vcoding_category_table, coding_category_iso_7_tight,
8955 intern ("coding-category-iso-7-tight"));
8956 ASET (Vcoding_category_table, coding_category_iso_8_1,
8957 intern ("coding-category-iso-8-1"));
8958 ASET (Vcoding_category_table, coding_category_iso_8_2,
8959 intern ("coding-category-iso-8-2"));
8960 ASET (Vcoding_category_table, coding_category_iso_7_else,
8961 intern ("coding-category-iso-7-else"));
8962 ASET (Vcoding_category_table, coding_category_iso_8_else,
8963 intern ("coding-category-iso-8-else"));
8964 ASET (Vcoding_category_table, coding_category_utf_8,
8965 intern ("coding-category-utf-8"));
8966 ASET (Vcoding_category_table, coding_category_utf_16_be,
8967 intern ("coding-category-utf-16-be"));
ff563fce
KH
8968 ASET (Vcoding_category_table, coding_category_utf_16_auto,
8969 intern ("coding-category-utf-16-auto"));
df7492f9
KH
8970 ASET (Vcoding_category_table, coding_category_utf_16_le,
8971 intern ("coding-category-utf-16-le"));
8972 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
8973 intern ("coding-category-utf-16-be-nosig"));
8974 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
8975 intern ("coding-category-utf-16-le-nosig"));
8976 ASET (Vcoding_category_table, coding_category_charset,
8977 intern ("coding-category-charset"));
8978 ASET (Vcoding_category_table, coding_category_sjis,
8979 intern ("coding-category-sjis"));
8980 ASET (Vcoding_category_table, coding_category_big5,
8981 intern ("coding-category-big5"));
8982 ASET (Vcoding_category_table, coding_category_ccl,
8983 intern ("coding-category-ccl"));
8984 ASET (Vcoding_category_table, coding_category_emacs_mule,
8985 intern ("coding-category-emacs-mule"));
8986 /* The following are NOT targets of code detection. */
8987 ASET (Vcoding_category_table, coding_category_raw_text,
8988 intern ("coding-category-raw-text"));
8989 ASET (Vcoding_category_table, coding_category_undecided,
8990 intern ("coding-category-undecided"));
ecf488bc 8991
065e3595
KH
8992 DEFSYM (Qinsufficient_source, "insufficient-source");
8993 DEFSYM (Qinconsistent_eol, "inconsistent-eol");
8994 DEFSYM (Qinvalid_source, "invalid-source");
8995 DEFSYM (Qinterrupted, "interrupted");
8996 DEFSYM (Qinsufficient_memory, "insufficient-memory");
8997
4ed46869
KH
8998 defsubr (&Scoding_system_p);
8999 defsubr (&Sread_coding_system);
9000 defsubr (&Sread_non_nil_coding_system);
9001 defsubr (&Scheck_coding_system);
9002 defsubr (&Sdetect_coding_region);
d46c5b12 9003 defsubr (&Sdetect_coding_string);
05e6f5dc 9004 defsubr (&Sfind_coding_systems_region_internal);
068a9dbd 9005 defsubr (&Sunencodable_char_position);
df7492f9 9006 defsubr (&Scheck_coding_systems_region);
4ed46869
KH
9007 defsubr (&Sdecode_coding_region);
9008 defsubr (&Sencode_coding_region);
9009 defsubr (&Sdecode_coding_string);
9010 defsubr (&Sencode_coding_string);
9011 defsubr (&Sdecode_sjis_char);
9012 defsubr (&Sencode_sjis_char);
9013 defsubr (&Sdecode_big5_char);
9014 defsubr (&Sencode_big5_char);
1ba9e4ab 9015 defsubr (&Sset_terminal_coding_system_internal);
c4825358 9016 defsubr (&Sset_safe_terminal_coding_system_internal);
4ed46869 9017 defsubr (&Sterminal_coding_system);
1ba9e4ab 9018 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 9019 defsubr (&Skeyboard_coding_system);
a5d301df 9020 defsubr (&Sfind_operation_coding_system);
df7492f9 9021 defsubr (&Sset_coding_system_priority);
6b89e3aa 9022 defsubr (&Sdefine_coding_system_internal);
df7492f9 9023 defsubr (&Sdefine_coding_system_alias);
a6f87d34 9024 defsubr (&Scoding_system_put);
df7492f9
KH
9025 defsubr (&Scoding_system_base);
9026 defsubr (&Scoding_system_plist);
9027 defsubr (&Scoding_system_aliases);
9028 defsubr (&Scoding_system_eol_type);
9029 defsubr (&Scoding_system_priority_list);
4ed46869 9030
4608c386 9031 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
48b0f3ae
PJ
9032 doc: /* List of coding systems.
9033
9034Do not alter the value of this variable manually. This variable should be
df7492f9 9035updated by the functions `define-coding-system' and
48b0f3ae 9036`define-coding-system-alias'. */);
4608c386
KH
9037 Vcoding_system_list = Qnil;
9038
9039 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
48b0f3ae
PJ
9040 doc: /* Alist of coding system names.
9041Each element is one element list of coding system name.
9042This variable is given to `completing-read' as TABLE argument.
9043
9044Do not alter the value of this variable manually. This variable should be
9045updated by the functions `make-coding-system' and
9046`define-coding-system-alias'. */);
4608c386
KH
9047 Vcoding_system_alist = Qnil;
9048
4ed46869 9049 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
48b0f3ae
PJ
9050 doc: /* List of coding-categories (symbols) ordered by priority.
9051
9052On detecting a coding system, Emacs tries code detection algorithms
9053associated with each coding-category one by one in this order. When
9054one algorithm agrees with a byte sequence of source text, the coding
9055system bound to the corresponding coding-category is selected. */);
4ed46869
KH
9056 {
9057 int i;
9058
9059 Vcoding_category_list = Qnil;
df7492f9 9060 for (i = coding_category_max - 1; i >= 0; i--)
4ed46869 9061 Vcoding_category_list
d46c5b12
KH
9062 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
9063 Vcoding_category_list);
4ed46869
KH
9064 }
9065
9066 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
48b0f3ae
PJ
9067 doc: /* Specify the coding system for read operations.
9068It is useful to bind this variable with `let', but do not set it globally.
9069If the value is a coding system, it is used for decoding on read operation.
9070If not, an appropriate element is used from one of the coding system alists:
9071There are three such tables, `file-coding-system-alist',
9072`process-coding-system-alist', and `network-coding-system-alist'. */);
4ed46869
KH
9073 Vcoding_system_for_read = Qnil;
9074
9075 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
48b0f3ae
PJ
9076 doc: /* Specify the coding system for write operations.
9077Programs bind this variable with `let', but you should not set it globally.
9078If the value is a coding system, it is used for encoding of output,
9079when writing it to a file and when sending it to a file or subprocess.
9080
9081If this does not specify a coding system, an appropriate element
9082is used from one of the coding system alists:
9083There are three such tables, `file-coding-system-alist',
9084`process-coding-system-alist', and `network-coding-system-alist'.
9085For output to files, if the above procedure does not specify a coding system,
9086the value of `buffer-file-coding-system' is used. */);
4ed46869
KH
9087 Vcoding_system_for_write = Qnil;
9088
9089 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
df7492f9
KH
9090 doc: /*
9091Coding system used in the latest file or process I/O. */);
4ed46869
KH
9092 Vlast_coding_system_used = Qnil;
9093
065e3595
KH
9094 DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
9095 doc: /*
9096Error status of the last code conversion.
9097
9098When an error was detected in the last code conversion, this variable
9099is set to one of the following symbols.
9100 `insufficient-source'
9101 `inconsistent-eol'
9102 `invalid-source'
9103 `interrupted'
9104 `insufficient-memory'
9105When no error was detected, the value doesn't change. So, to check
9106the error status of a code conversion by this variable, you must
9107explicitly set this variable to nil before performing code
9108conversion. */);
9109 Vlast_code_conversion_error = Qnil;
9110
9ce27fde 9111 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
df7492f9
KH
9112 doc: /*
9113*Non-nil means always inhibit code conversion of end-of-line format.
48b0f3ae
PJ
9114See info node `Coding Systems' and info node `Text and Binary' concerning
9115such conversion. */);
9ce27fde
KH
9116 inhibit_eol_conversion = 0;
9117
ed29121d 9118 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
df7492f9
KH
9119 doc: /*
9120Non-nil means process buffer inherits coding system of process output.
48b0f3ae
PJ
9121Bind it to t if the process output is to be treated as if it were a file
9122read from some filesystem. */);
ed29121d
EZ
9123 inherit_process_coding_system = 0;
9124
02ba4723 9125 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
df7492f9
KH
9126 doc: /*
9127Alist to decide a coding system to use for a file I/O operation.
48b0f3ae
PJ
9128The format is ((PATTERN . VAL) ...),
9129where PATTERN is a regular expression matching a file name,
9130VAL is a coding system, a cons of coding systems, or a function symbol.
9131If VAL is a coding system, it is used for both decoding and encoding
9132the file contents.
9133If VAL is a cons of coding systems, the car part is used for decoding,
9134and the cdr part is used for encoding.
9135If VAL is a function symbol, the function must return a coding system
0192762c
DL
9136or a cons of coding systems which are used as above. The function gets
9137the arguments with which `find-operation-coding-system' was called.
48b0f3ae
PJ
9138
9139See also the function `find-operation-coding-system'
9140and the variable `auto-coding-alist'. */);
02ba4723
KH
9141 Vfile_coding_system_alist = Qnil;
9142
9143 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
df7492f9
KH
9144 doc: /*
9145Alist to decide a coding system to use for a process I/O operation.
48b0f3ae
PJ
9146The format is ((PATTERN . VAL) ...),
9147where PATTERN is a regular expression matching a program name,
9148VAL is a coding system, a cons of coding systems, or a function symbol.
9149If VAL is a coding system, it is used for both decoding what received
9150from the program and encoding what sent to the program.
9151If VAL is a cons of coding systems, the car part is used for decoding,
9152and the cdr part is used for encoding.
9153If VAL is a function symbol, the function must return a coding system
9154or a cons of coding systems which are used as above.
9155
9156See also the function `find-operation-coding-system'. */);
02ba4723
KH
9157 Vprocess_coding_system_alist = Qnil;
9158
9159 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
df7492f9
KH
9160 doc: /*
9161Alist to decide a coding system to use for a network I/O operation.
48b0f3ae
PJ
9162The format is ((PATTERN . VAL) ...),
9163where PATTERN is a regular expression matching a network service name
9164or is a port number to connect to,
9165VAL is a coding system, a cons of coding systems, or a function symbol.
9166If VAL is a coding system, it is used for both decoding what received
9167from the network stream and encoding what sent to the network stream.
9168If VAL is a cons of coding systems, the car part is used for decoding,
9169and the cdr part is used for encoding.
9170If VAL is a function symbol, the function must return a coding system
9171or a cons of coding systems which are used as above.
9172
9173See also the function `find-operation-coding-system'. */);
02ba4723 9174 Vnetwork_coding_system_alist = Qnil;
4ed46869 9175
68c45bf0 9176 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
75205970
RS
9177 doc: /* Coding system to use with system messages.
9178Also used for decoding keyboard input on X Window system. */);
68c45bf0
PE
9179 Vlocale_coding_system = Qnil;
9180
005f0d35 9181 /* The eol mnemonics are reset in startup.el system-dependently. */
7722baf9 9182 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
df7492f9
KH
9183 doc: /*
9184*String displayed in mode line for UNIX-like (LF) end-of-line format. */);
7722baf9 9185 eol_mnemonic_unix = build_string (":");
4ed46869 9186
7722baf9 9187 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
df7492f9
KH
9188 doc: /*
9189*String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
7722baf9 9190 eol_mnemonic_dos = build_string ("\\");
4ed46869 9191
7722baf9 9192 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
df7492f9
KH
9193 doc: /*
9194*String displayed in mode line for MAC-like (CR) end-of-line format. */);
7722baf9 9195 eol_mnemonic_mac = build_string ("/");
4ed46869 9196
7722baf9 9197 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
df7492f9
KH
9198 doc: /*
9199*String displayed in mode line when end-of-line format is not yet determined. */);
7722baf9 9200 eol_mnemonic_undecided = build_string (":");
4ed46869 9201
84fbb8a0 9202 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
df7492f9
KH
9203 doc: /*
9204*Non-nil enables character translation while encoding and decoding. */);
84fbb8a0 9205 Venable_character_translation = Qt;
bdd9fb48 9206
f967223b 9207 DEFVAR_LISP ("standard-translation-table-for-decode",
48b0f3ae
PJ
9208 &Vstandard_translation_table_for_decode,
9209 doc: /* Table for translating characters while decoding. */);
f967223b 9210 Vstandard_translation_table_for_decode = Qnil;
bdd9fb48 9211
f967223b 9212 DEFVAR_LISP ("standard-translation-table-for-encode",
48b0f3ae
PJ
9213 &Vstandard_translation_table_for_encode,
9214 doc: /* Table for translating characters while encoding. */);
f967223b 9215 Vstandard_translation_table_for_encode = Qnil;
4ed46869 9216
df7492f9 9217 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
48b0f3ae
PJ
9218 doc: /* Alist of charsets vs revision numbers.
9219While encoding, if a charset (car part of an element) is found,
df7492f9
KH
9220designate it with the escape sequence identifying revision (cdr part
9221of the element). */);
9222 Vcharset_revision_table = Qnil;
02ba4723
KH
9223
9224 DEFVAR_LISP ("default-process-coding-system",
9225 &Vdefault_process_coding_system,
48b0f3ae
PJ
9226 doc: /* Cons of coding systems used for process I/O by default.
9227The car part is used for decoding a process output,
9228the cdr part is used for encoding a text to be sent to a process. */);
02ba4723 9229 Vdefault_process_coding_system = Qnil;
c4825358 9230
3f003981 9231 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
df7492f9
KH
9232 doc: /*
9233Table of extra Latin codes in the range 128..159 (inclusive).
48b0f3ae
PJ
9234This is a vector of length 256.
9235If Nth element is non-nil, the existence of code N in a file
9236\(or output of subprocess) doesn't prevent it to be detected as
9237a coding system of ISO 2022 variant which has a flag
9238`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9239or reading output of a subprocess.
9240Only 128th through 159th elements have a meaning. */);
3f003981 9241 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
d46c5b12
KH
9242
9243 DEFVAR_LISP ("select-safe-coding-system-function",
9244 &Vselect_safe_coding_system_function,
df7492f9
KH
9245 doc: /*
9246Function to call to select safe coding system for encoding a text.
48b0f3ae
PJ
9247
9248If set, this function is called to force a user to select a proper
9249coding system which can encode the text in the case that a default
9250coding system used in each operation can't encode the text.
9251
9252The default value is `select-safe-coding-system' (which see). */);
d46c5b12
KH
9253 Vselect_safe_coding_system_function = Qnil;
9254
5d5bf4d8
KH
9255 DEFVAR_BOOL ("coding-system-require-warning",
9256 &coding_system_require_warning,
9257 doc: /* Internal use only.
6b89e3aa
KH
9258If non-nil, on writing a file, `select-safe-coding-system-function' is
9259called even if `coding-system-for-write' is non-nil. The command
9260`universal-coding-system-argument' binds this variable to t temporarily. */);
5d5bf4d8
KH
9261 coding_system_require_warning = 0;
9262
9263
22ab2303 9264 DEFVAR_BOOL ("inhibit-iso-escape-detection",
74383408 9265 &inhibit_iso_escape_detection,
df7492f9
KH
9266 doc: /*
9267If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
48b0f3ae
PJ
9268
9269By default, on reading a file, Emacs tries to detect how the text is
9270encoded. This code detection is sensitive to escape sequences. If
9271the sequence is valid as ISO2022, the code is determined as one of
9272the ISO2022 encodings, and the file is decoded by the corresponding
9273coding system (e.g. `iso-2022-7bit').
9274
9275However, there may be a case that you want to read escape sequences in
9276a file as is. In such a case, you can set this variable to non-nil.
9277Then, as the code detection ignores any escape sequences, no file is
9278detected as encoded in some ISO2022 encoding. The result is that all
9279escape sequences become visible in a buffer.
9280
9281The default value is nil, and it is strongly recommended not to change
9282it. That is because many Emacs Lisp source files that contain
9283non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9284in Emacs's distribution, and they won't be decoded correctly on
9285reading if you suppress escape sequence detection.
9286
9287The other way to read escape sequences in a file without decoding is
9288to explicitly specify some coding system that doesn't use ISO2022's
9289escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
74383408 9290 inhibit_iso_escape_detection = 0;
002fdb44
DL
9291
9292 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
15c8f9d1
DL
9293 doc: /* Char table for translating self-inserting characters.
9294This is applied to the result of input methods, not their input. See also
9295`keyboard-translate-table'. */);
002fdb44 9296 Vtranslation_table_for_input = Qnil;
8f924df7 9297
2c78b7e1
KH
9298 {
9299 Lisp_Object args[coding_arg_max];
8f924df7 9300 Lisp_Object plist[16];
2c78b7e1
KH
9301 int i;
9302
9303 for (i = 0; i < coding_arg_max; i++)
9304 args[i] = Qnil;
9305
9306 plist[0] = intern (":name");
9307 plist[1] = args[coding_arg_name] = Qno_conversion;
9308 plist[2] = intern (":mnemonic");
9309 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
9310 plist[4] = intern (":coding-type");
9311 plist[5] = args[coding_arg_coding_type] = Qraw_text;
9312 plist[6] = intern (":ascii-compatible-p");
9313 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
9314 plist[8] = intern (":default-char");
9315 plist[9] = args[coding_arg_default_char] = make_number (0);
8f924df7
KH
9316 plist[10] = intern (":for-unibyte");
9317 plist[11] = args[coding_arg_for_unibyte] = Qt;
9318 plist[12] = intern (":docstring");
9319 plist[13] = build_string ("Do no conversion.\n\
2c78b7e1
KH
9320\n\
9321When you visit a file with this coding, the file is read into a\n\
9322unibyte buffer as is, thus each byte of a file is treated as a\n\
9323character.");
8f924df7
KH
9324 plist[14] = intern (":eol-type");
9325 plist[15] = args[coding_arg_eol_type] = Qunix;
9326 args[coding_arg_plist] = Flist (16, plist);
2c78b7e1
KH
9327 Fdefine_coding_system_internal (coding_arg_max, args);
9328 }
9329
9330 setup_coding_system (Qno_conversion, &keyboard_coding);
9331 setup_coding_system (Qno_conversion, &terminal_coding);
9332 setup_coding_system (Qno_conversion, &safe_terminal_coding);
ff563fce
KH
9333
9334 {
9335 int i;
9336
9337 for (i = 0; i < coding_category_max; i++)
9338 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
9339 }
4ed46869
KH
9340}
9341
68c45bf0
PE
9342char *
9343emacs_strerror (error_number)
9344 int error_number;
9345{
9346 char *str;
9347
ca9c0567 9348 synchronize_system_messages_locale ();
68c45bf0
PE
9349 str = strerror (error_number);
9350
9351 if (! NILP (Vlocale_coding_system))
9352 {
9353 Lisp_Object dec = code_convert_string_norecord (build_string (str),
9354 Vlocale_coding_system,
9355 0);
d5db4077 9356 str = (char *) SDATA (dec);
68c45bf0
PE
9357 }
9358
9359 return str;
9360}
9361
4ed46869 9362#endif /* emacs */