(Fread_coding_system): Fix arg of XSETSTRING.
[bpt/emacs.git] / src / coding.c
CommitLineData
9542cb1f 1/* Coding system handler (conversion, detection, etc).
4a2f9c6a 2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
8f924df7 3 Licensed to the Free Software Foundation.
6f197c07 4 Copyright (C) 2001, 2002 Free Software Foundation, Inc.
8f924df7 5 Copyright (C) 2003
df7492f9
KH
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
4ed46869 8
369314dc
KH
9This file is part of GNU Emacs.
10
11GNU Emacs is free software; you can redistribute it and/or modify
12it under the terms of the GNU General Public License as published by
13the Free Software Foundation; either version 2, or (at your option)
14any later version.
4ed46869 15
369314dc
KH
16GNU Emacs is distributed in the hope that it will be useful,
17but WITHOUT ANY WARRANTY; without even the implied warranty of
18MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19GNU General Public License for more details.
4ed46869 20
369314dc
KH
21You should have received a copy of the GNU General Public License
22along with GNU Emacs; see the file COPYING. If not, write to
23the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24Boston, MA 02111-1307, USA. */
4ed46869
KH
25
26/*** TABLE OF CONTENTS ***
27
b73bfc1c 28 0. General comments
4ed46869 29 1. Preamble
df7492f9
KH
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
4ed46869
KH
41
42*/
43
df7492f9 44/*** 0. General comments ***
b73bfc1c
KH
45
46
df7492f9 47CODING SYSTEM
4ed46869 48
5bad0796
DL
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
e19c3639
KH
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
0ef69138 56 coding system.
4ed46869 57
e19c3639
KH
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
5bad0796 60 stored in the hash table Vcharset_hash_table. The conversion from
e19c3639
KH
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
4ed46869 63
e19c3639 64 Coding systems are classified into the following types depending on
5bad0796 65 the encoding mechanism. Here's a brief description of the types.
4ed46869 66
df7492f9
KH
67 o UTF-8
68
69 o UTF-16
70
71 o Charset-base coding system
72
73 A coding system defined by one or more (coded) character sets.
5bad0796 74 Decoding and encoding are done by a code converter defined for each
df7492f9
KH
75 character set.
76
5bad0796 77 o Old Emacs internal format (emacs-mule)
df7492f9 78
5bad0796 79 The coding system adopted by old versions of Emacs (20 and 21).
4ed46869 80
df7492f9 81 o ISO2022-base coding system
4ed46869
KH
82
83 The most famous coding system for multiple character sets. X's
df7492f9
KH
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
4ed46869 87
df7492f9 88 o SJIS (or Shift-JIS or MS-Kanji-Code)
93dec019 89
4ed46869
KH
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
df7492f9 92 section 8.
4ed46869 93
df7492f9 94 o BIG5
4ed46869 95
df7492f9 96 A coding system to encode character sets: ASCII and Big5. Widely
cfb43547 97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
df7492f9
KH
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
4ed46869 101
df7492f9 102 o CCL
27901516 103
5bad0796 104 If a user wants to decode/encode text encoded in a coding system
df7492f9
KH
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
27901516 108
df7492f9 109 o Raw-text
4ed46869 110
5a936b46 111 A coding system for text containing raw eight-bit data. Emacs
5bad0796 112 treats each byte of source text as a character (except for
df7492f9 113 end-of-line conversion).
4ed46869 114
df7492f9
KH
115 o No-conversion
116
117 Like raw text, but don't do end-of-line conversion.
4ed46869 118
4ed46869 119
df7492f9 120END-OF-LINE FORMAT
4ed46869 121
5bad0796 122 How text end-of-line is encoded depends on operating system. For
df7492f9 123 instance, Unix's format is just one byte of LF (line-feed) code,
f4dee582 124 whereas DOS's format is two-byte sequence of `carriage-return' and
d46c5b12
KH
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
4ed46869 127
cfb43547 128 Since text character encoding and end-of-line encoding are
df7492f9
KH
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
4ed46869 131
e19c3639
KH
132STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
5bad0796 137 conversion (e.g. the location of source and destination data).
4ed46869
KH
138
139*/
140
df7492f9
KH
141/* COMMON MACROS */
142
143
4ed46869
KH
144/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145
df7492f9 146 These functions check if a byte sequence specified as a source in
ff0dacd7
KH
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
df7492f9 149
ff0dacd7 150 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
df7492f9
KH
151
152 Below is the template of these functions. */
153
4ed46869 154#if 0
df7492f9 155static int
ff0dacd7 156detect_coding_XXX (coding, detect_info)
df7492f9 157 struct coding_system *coding;
ff0dacd7 158 struct coding_detection_info *detect_info;
4ed46869 159{
df7492f9
KH
160 unsigned char *src = coding->source;
161 unsigned char *src_end = coding->source + coding->src_bytes;
162 int multibytep = coding->src_multibyte;
ff0dacd7 163 int consumed_chars = 0;
df7492f9
KH
164 int found = 0;
165 ...;
166
167 while (1)
168 {
169 /* Get one byte from the source. If the souce is exausted, jump
170 to no_more_source:. */
171 ONE_MORE_BYTE (c);
ff0dacd7
KH
172
173 if (! __C_conforms_to_XXX___ (c))
174 break;
175 if (! __C_strongly_suggests_XXX__ (c))
176 found = CATEGORY_MASK_XXX;
df7492f9 177 }
ff0dacd7
KH
178 /* The byte sequence is invalid for XXX. */
179 detect_info->rejected |= CATEGORY_MASK_XXX;
df7492f9 180 return 0;
ff0dacd7 181
df7492f9 182 no_more_source:
ff0dacd7
KH
183 /* The source exausted successfully. */
184 detect_info->found |= found;
df7492f9 185 return 1;
4ed46869
KH
186}
187#endif
188
189/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
190
df7492f9
KH
191 These functions decode a byte sequence specified as a source by
192 CODING. The resulting multibyte text goes to a place pointed to by
193 CODING->charbuf, the length of which should not exceed
194 CODING->charbuf_size;
d46c5b12 195
df7492f9
KH
196 These functions set the information of original and decoded texts in
197 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
198 They also set CODING->result to one of CODING_RESULT_XXX indicating
199 how the decoding is finished.
d46c5b12 200
df7492f9 201 Below is the template of these functions. */
d46c5b12 202
4ed46869 203#if 0
b73bfc1c 204static void
df7492f9 205decode_coding_XXXX (coding)
4ed46869 206 struct coding_system *coding;
4ed46869 207{
df7492f9
KH
208 unsigned char *src = coding->source + coding->consumed;
209 unsigned char *src_end = coding->source + coding->src_bytes;
210 /* SRC_BASE remembers the start position in source in each loop.
211 The loop will be exited when there's not enough source code, or
212 when there's no room in CHARBUF for a decoded character. */
213 unsigned char *src_base;
214 /* A buffer to produce decoded characters. */
215 int *charbuf = coding->charbuf;
216 int *charbuf_end = charbuf + coding->charbuf_size;
217 int multibytep = coding->src_multibyte;
218
219 while (1)
220 {
221 src_base = src;
222 if (charbuf < charbuf_end)
223 /* No more room to produce a decoded character. */
224 break;
225 ONE_MORE_BYTE (c);
226 /* Decode it. */
227 }
228
229 no_more_source:
230 if (src_base < src_end
231 && coding->mode & CODING_MODE_LAST_BLOCK)
232 /* If the source ends by partial bytes to construct a character,
233 treat them as eight-bit raw data. */
234 while (src_base < src_end && charbuf < charbuf_end)
235 *charbuf++ = *src_base++;
236 /* Remember how many bytes and characters we consumed. If the
237 source is multibyte, the bytes and chars are not identical. */
238 coding->consumed = coding->consumed_char = src_base - coding->source;
239 /* Remember how many characters we produced. */
240 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
241}
242#endif
243
244/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
245
df7492f9
KH
246 These functions encode SRC_BYTES length text at SOURCE of Emacs'
247 internal multibyte format by CODING. The resulting byte sequence
b73bfc1c
KH
248 goes to a place pointed to by DESTINATION, the length of which
249 should not exceed DST_BYTES.
d46c5b12 250
df7492f9
KH
251 These functions set the information of original and encoded texts in
252 the members produced, produced_char, consumed, and consumed_char of
253 the structure *CODING. They also set the member result to one of
254 CODING_RESULT_XXX indicating how the encoding finished.
d46c5b12 255
df7492f9
KH
256 DST_BYTES zero means that source area and destination area are
257 overlapped, which means that we can produce a encoded text until it
258 reaches at the head of not-yet-encoded source text.
d46c5b12 259
df7492f9 260 Below is a template of these functions. */
4ed46869 261#if 0
b73bfc1c 262static void
df7492f9 263encode_coding_XXX (coding)
4ed46869 264 struct coding_system *coding;
4ed46869 265{
df7492f9
KH
266 int multibytep = coding->dst_multibyte;
267 int *charbuf = coding->charbuf;
268 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
269 unsigned char *dst = coding->destination + coding->produced;
270 unsigned char *dst_end = coding->destination + coding->dst_bytes;
271 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
272 int produced_chars = 0;
273
274 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
275 {
276 int c = *charbuf;
277 /* Encode C into DST, and increment DST. */
278 }
279 label_no_more_destination:
280 /* How many chars and bytes we produced. */
281 coding->produced_char += produced_chars;
282 coding->produced = dst - coding->destination;
4ed46869
KH
283}
284#endif
285
4ed46869
KH
286\f
287/*** 1. Preamble ***/
288
68c45bf0 289#include <config.h>
4ed46869
KH
290#include <stdio.h>
291
4ed46869
KH
292#include "lisp.h"
293#include "buffer.h"
df7492f9 294#include "character.h"
4ed46869
KH
295#include "charset.h"
296#include "ccl.h"
df7492f9 297#include "composite.h"
4ed46869
KH
298#include "coding.h"
299#include "window.h"
4ed46869 300
df7492f9 301Lisp_Object Vcoding_system_hash_table;
4ed46869 302
df7492f9 303Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
1965cb73
DL
304Lisp_Object Qunix, Qdos;
305extern Lisp_Object Qmac; /* frame.c */
4ed46869
KH
306Lisp_Object Qbuffer_file_coding_system;
307Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
df7492f9 308Lisp_Object Qdefault_char;
27901516 309Lisp_Object Qno_conversion, Qundecided;
df7492f9 310Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
b49a1807 311Lisp_Object Qbig, Qlittle;
bb0115a2 312Lisp_Object Qcoding_system_history;
1397dc18 313Lisp_Object Qvalid_codes;
01378f49 314Lisp_Object QCcategory;
4ed46869
KH
315
316extern Lisp_Object Qinsert_file_contents, Qwrite_region;
317Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
318Lisp_Object Qstart_process, Qopen_network_stream;
319Lisp_Object Qtarget_idx;
320
5d5bf4d8
KH
321int coding_system_require_warning;
322
d46c5b12
KH
323Lisp_Object Vselect_safe_coding_system_function;
324
7722baf9
EZ
325/* Mnemonic string for each format of end-of-line. */
326Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
327/* Mnemonic string to indicate format of end-of-line is not yet
4ed46869 328 decided. */
7722baf9 329Lisp_Object eol_mnemonic_undecided;
4ed46869
KH
330
331#ifdef emacs
332
4608c386
KH
333Lisp_Object Vcoding_system_list, Vcoding_system_alist;
334
335Lisp_Object Qcoding_system_p, Qcoding_system_error;
4ed46869 336
d46c5b12
KH
337/* Coding system emacs-mule and raw-text are for converting only
338 end-of-line format. */
339Lisp_Object Qemacs_mule, Qraw_text;
8f924df7 340Lisp_Object Qutf_8_emacs;
ecf488bc 341
4ed46869
KH
342/* Coding-systems are handed between Emacs Lisp programs and C internal
343 routines by the following three variables. */
344/* Coding-system for reading files and receiving data from process. */
345Lisp_Object Vcoding_system_for_read;
346/* Coding-system for writing files and sending data to process. */
347Lisp_Object Vcoding_system_for_write;
348/* Coding-system actually used in the latest I/O. */
349Lisp_Object Vlast_coding_system_used;
350
c4825358 351/* A vector of length 256 which contains information about special
94487c4e 352 Latin codes (especially for dealing with Microsoft codes). */
3f003981 353Lisp_Object Vlatin_extra_code_table;
c4825358 354
9ce27fde
KH
355/* Flag to inhibit code conversion of end-of-line format. */
356int inhibit_eol_conversion;
357
74383408
KH
358/* Flag to inhibit ISO2022 escape sequence detection. */
359int inhibit_iso_escape_detection;
360
ed29121d
EZ
361/* Flag to make buffer-file-coding-system inherit from process-coding. */
362int inherit_process_coding_system;
363
c4825358 364/* Coding system to be used to encode text for terminal display. */
4ed46869
KH
365struct coding_system terminal_coding;
366
c4825358
KH
367/* Coding system to be used to encode text for terminal display when
368 terminal coding system is nil. */
369struct coding_system safe_terminal_coding;
370
371/* Coding system of what is sent from terminal keyboard. */
4ed46869
KH
372struct coding_system keyboard_coding;
373
02ba4723
KH
374Lisp_Object Vfile_coding_system_alist;
375Lisp_Object Vprocess_coding_system_alist;
376Lisp_Object Vnetwork_coding_system_alist;
4ed46869 377
68c45bf0
PE
378Lisp_Object Vlocale_coding_system;
379
4ed46869
KH
380#endif /* emacs */
381
f967223b
KH
382/* Flag to tell if we look up translation table on character code
383 conversion. */
84fbb8a0 384Lisp_Object Venable_character_translation;
f967223b
KH
385/* Standard translation table to look up on decoding (reading). */
386Lisp_Object Vstandard_translation_table_for_decode;
387/* Standard translation table to look up on encoding (writing). */
388Lisp_Object Vstandard_translation_table_for_encode;
84fbb8a0 389
f967223b
KH
390Lisp_Object Qtranslation_table;
391Lisp_Object Qtranslation_table_id;
392Lisp_Object Qtranslation_table_for_decode;
393Lisp_Object Qtranslation_table_for_encode;
4ed46869
KH
394
395/* Alist of charsets vs revision number. */
df7492f9 396static Lisp_Object Vcharset_revision_table;
4ed46869 397
02ba4723
KH
398/* Default coding systems used for process I/O. */
399Lisp_Object Vdefault_process_coding_system;
400
002fdb44
DL
401/* Char table for translating Quail and self-inserting input. */
402Lisp_Object Vtranslation_table_for_input;
403
b843d1ae
KH
404/* Global flag to tell that we can't call post-read-conversion and
405 pre-write-conversion functions. Usually the value is zero, but it
406 is set to 1 temporarily while such functions are running. This is
407 to avoid infinite recursive call. */
408static int inhibit_pre_post_conversion;
409
df7492f9
KH
410/* Two special coding systems. */
411Lisp_Object Vsjis_coding_system;
412Lisp_Object Vbig5_coding_system;
413
414
ff0dacd7
KH
415static int detect_coding_utf_8 P_ ((struct coding_system *,
416 struct coding_detection_info *info));
df7492f9
KH
417static void decode_coding_utf_8 P_ ((struct coding_system *));
418static int encode_coding_utf_8 P_ ((struct coding_system *));
419
ff0dacd7
KH
420static int detect_coding_utf_16 P_ ((struct coding_system *,
421 struct coding_detection_info *info));
df7492f9
KH
422static void decode_coding_utf_16 P_ ((struct coding_system *));
423static int encode_coding_utf_16 P_ ((struct coding_system *));
424
ff0dacd7
KH
425static int detect_coding_iso_2022 P_ ((struct coding_system *,
426 struct coding_detection_info *info));
df7492f9
KH
427static void decode_coding_iso_2022 P_ ((struct coding_system *));
428static int encode_coding_iso_2022 P_ ((struct coding_system *));
429
ff0dacd7
KH
430static int detect_coding_emacs_mule P_ ((struct coding_system *,
431 struct coding_detection_info *info));
df7492f9
KH
432static void decode_coding_emacs_mule P_ ((struct coding_system *));
433static int encode_coding_emacs_mule P_ ((struct coding_system *));
434
ff0dacd7
KH
435static int detect_coding_sjis P_ ((struct coding_system *,
436 struct coding_detection_info *info));
df7492f9
KH
437static void decode_coding_sjis P_ ((struct coding_system *));
438static int encode_coding_sjis P_ ((struct coding_system *));
439
ff0dacd7
KH
440static int detect_coding_big5 P_ ((struct coding_system *,
441 struct coding_detection_info *info));
df7492f9
KH
442static void decode_coding_big5 P_ ((struct coding_system *));
443static int encode_coding_big5 P_ ((struct coding_system *));
444
ff0dacd7
KH
445static int detect_coding_ccl P_ ((struct coding_system *,
446 struct coding_detection_info *info));
df7492f9
KH
447static void decode_coding_ccl P_ ((struct coding_system *));
448static int encode_coding_ccl P_ ((struct coding_system *));
449
450static void decode_coding_raw_text P_ ((struct coding_system *));
451static int encode_coding_raw_text P_ ((struct coding_system *));
452
453
454/* ISO2022 section */
455
456#define CODING_ISO_INITIAL(coding, reg) \
457 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
458 coding_attr_iso_initial), \
459 reg)))
460
461
462#define CODING_ISO_REQUEST(coding, charset_id) \
463 ((charset_id <= (coding)->max_charset_id \
464 ? (coding)->safe_charsets[charset_id] \
465 : -1))
466
467
468#define CODING_ISO_FLAGS(coding) \
469 ((coding)->spec.iso_2022.flags)
470#define CODING_ISO_DESIGNATION(coding, reg) \
471 ((coding)->spec.iso_2022.current_designation[reg])
472#define CODING_ISO_INVOCATION(coding, plane) \
473 ((coding)->spec.iso_2022.current_invocation[plane])
474#define CODING_ISO_SINGLE_SHIFTING(coding) \
475 ((coding)->spec.iso_2022.single_shifting)
476#define CODING_ISO_BOL(coding) \
477 ((coding)->spec.iso_2022.bol)
478#define CODING_ISO_INVOKED_CHARSET(coding, plane) \
479 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
480
481/* Control characters of ISO2022. */
482 /* code */ /* function */
483#define ISO_CODE_LF 0x0A /* line-feed */
484#define ISO_CODE_CR 0x0D /* carriage-return */
485#define ISO_CODE_SO 0x0E /* shift-out */
486#define ISO_CODE_SI 0x0F /* shift-in */
487#define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
488#define ISO_CODE_ESC 0x1B /* escape */
489#define ISO_CODE_SS2 0x8E /* single-shift-2 */
490#define ISO_CODE_SS3 0x8F /* single-shift-3 */
491#define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
492
493/* All code (1-byte) of ISO2022 is classified into one of the
494 followings. */
495enum iso_code_class_type
496 {
497 ISO_control_0, /* Control codes in the range
498 0x00..0x1F and 0x7F, except for the
499 following 5 codes. */
500 ISO_carriage_return, /* ISO_CODE_CR (0x0D) */
501 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
502 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
503 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
504 ISO_escape, /* ISO_CODE_SO (0x1B) */
505 ISO_control_1, /* Control codes in the range
506 0x80..0x9F, except for the
507 following 3 codes. */
508 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
509 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
510 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
511 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
512 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
513 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
514 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
515 };
05e6f5dc 516
df7492f9
KH
517/** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
518 `iso-flags' attribute of an iso2022 coding system. */
05e6f5dc 519
df7492f9
KH
520/* If set, produce long-form designation sequence (e.g. ESC $ ( A)
521 instead of the correct short-form sequence (e.g. ESC $ A). */
522#define CODING_ISO_FLAG_LONG_FORM 0x0001
93dec019 523
df7492f9
KH
524/* If set, reset graphic planes and registers at end-of-line to the
525 initial state. */
526#define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
05e6f5dc 527
df7492f9
KH
528/* If set, reset graphic planes and registers before any control
529 characters to the initial state. */
530#define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
05e6f5dc 531
df7492f9
KH
532/* If set, encode by 7-bit environment. */
533#define CODING_ISO_FLAG_SEVEN_BITS 0x0008
4ed46869 534
df7492f9
KH
535/* If set, use locking-shift function. */
536#define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
b73bfc1c 537
df7492f9
KH
538/* If set, use single-shift function. Overwrite
539 CODING_ISO_FLAG_LOCKING_SHIFT. */
540#define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
b73bfc1c 541
df7492f9
KH
542/* If set, use designation escape sequence. */
543#define CODING_ISO_FLAG_DESIGNATION 0x0040
b73bfc1c 544
df7492f9
KH
545/* If set, produce revision number sequence. */
546#define CODING_ISO_FLAG_REVISION 0x0080
b73bfc1c 547
df7492f9
KH
548/* If set, produce ISO6429's direction specifying sequence. */
549#define CODING_ISO_FLAG_DIRECTION 0x0100
f4dee582 550
df7492f9
KH
551/* If set, assume designation states are reset at beginning of line on
552 output. */
553#define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
4ed46869 554
df7492f9
KH
555/* If set, designation sequence should be placed at beginning of line
556 on output. */
557#define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
aa72b389 558
df7492f9
KH
559/* If set, do not encode unsafe charactes on output. */
560#define CODING_ISO_FLAG_SAFE 0x0800
aa72b389 561
df7492f9
KH
562/* If set, extra latin codes (128..159) are accepted as a valid code
563 on input. */
564#define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
aa72b389 565
df7492f9 566#define CODING_ISO_FLAG_COMPOSITION 0x2000
aa72b389 567
df7492f9 568#define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
aa72b389 569
bf16eb23 570#define CODING_ISO_FLAG_USE_ROMAN 0x8000
aa72b389 571
bf16eb23 572#define CODING_ISO_FLAG_USE_OLDJIS 0x10000
aa72b389 573
bf16eb23 574#define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
aa72b389 575
df7492f9
KH
576/* A character to be produced on output if encoding of the original
577 character is prohibited by CODING_ISO_FLAG_SAFE. */
578#define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
aa72b389 579
4ed46869 580
df7492f9
KH
581/* UTF-16 section */
582#define CODING_UTF_16_BOM(coding) \
583 ((coding)->spec.utf_16.bom)
4ed46869 584
df7492f9
KH
585#define CODING_UTF_16_ENDIAN(coding) \
586 ((coding)->spec.utf_16.endian)
4ed46869 587
df7492f9
KH
588#define CODING_UTF_16_SURROGATE(coding) \
589 ((coding)->spec.utf_16.surrogate)
4ed46869 590
4ed46869 591
df7492f9
KH
592/* CCL section */
593#define CODING_CCL_DECODER(coding) \
594 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
595#define CODING_CCL_ENCODER(coding) \
596 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
597#define CODING_CCL_VALIDS(coding) \
8f924df7 598 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
4ed46869 599
5a936b46 600/* Index for each coding category in `coding_categories' */
4ed46869 601
df7492f9
KH
602enum coding_category
603 {
604 coding_category_iso_7,
605 coding_category_iso_7_tight,
606 coding_category_iso_8_1,
607 coding_category_iso_8_2,
608 coding_category_iso_7_else,
609 coding_category_iso_8_else,
610 coding_category_utf_8,
611 coding_category_utf_16_auto,
612 coding_category_utf_16_be,
613 coding_category_utf_16_le,
614 coding_category_utf_16_be_nosig,
615 coding_category_utf_16_le_nosig,
616 coding_category_charset,
617 coding_category_sjis,
618 coding_category_big5,
619 coding_category_ccl,
620 coding_category_emacs_mule,
621 /* All above are targets of code detection. */
622 coding_category_raw_text,
623 coding_category_undecided,
624 coding_category_max
625 };
626
627/* Definitions of flag bits used in detect_coding_XXXX. */
628#define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
629#define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
630#define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
631#define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
632#define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
633#define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
634#define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
b49a1807 635#define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
df7492f9
KH
636#define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
637#define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
638#define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
639#define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
640#define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
641#define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
642#define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
643#define CATEGORY_MASK_CCL (1 << coding_category_ccl)
644#define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
ff0dacd7 645#define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
df7492f9
KH
646
647/* This value is returned if detect_coding_mask () find nothing other
648 than ASCII characters. */
649#define CATEGORY_MASK_ANY \
650 (CATEGORY_MASK_ISO_7 \
651 | CATEGORY_MASK_ISO_7_TIGHT \
652 | CATEGORY_MASK_ISO_8_1 \
653 | CATEGORY_MASK_ISO_8_2 \
654 | CATEGORY_MASK_ISO_7_ELSE \
655 | CATEGORY_MASK_ISO_8_ELSE \
656 | CATEGORY_MASK_UTF_8 \
657 | CATEGORY_MASK_UTF_16_BE \
658 | CATEGORY_MASK_UTF_16_LE \
659 | CATEGORY_MASK_UTF_16_BE_NOSIG \
660 | CATEGORY_MASK_UTF_16_LE_NOSIG \
661 | CATEGORY_MASK_CHARSET \
662 | CATEGORY_MASK_SJIS \
663 | CATEGORY_MASK_BIG5 \
664 | CATEGORY_MASK_CCL \
665 | CATEGORY_MASK_EMACS_MULE)
666
667
668#define CATEGORY_MASK_ISO_7BIT \
669 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
670
671#define CATEGORY_MASK_ISO_8BIT \
672 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
673
674#define CATEGORY_MASK_ISO_ELSE \
675 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
676
677#define CATEGORY_MASK_ISO_ESCAPE \
678 (CATEGORY_MASK_ISO_7 \
679 | CATEGORY_MASK_ISO_7_TIGHT \
680 | CATEGORY_MASK_ISO_7_ELSE \
681 | CATEGORY_MASK_ISO_8_ELSE)
682
683#define CATEGORY_MASK_ISO \
684 ( CATEGORY_MASK_ISO_7BIT \
685 | CATEGORY_MASK_ISO_8BIT \
686 | CATEGORY_MASK_ISO_ELSE)
687
688#define CATEGORY_MASK_UTF_16 \
689 (CATEGORY_MASK_UTF_16_BE \
690 | CATEGORY_MASK_UTF_16_LE \
691 | CATEGORY_MASK_UTF_16_BE_NOSIG \
692 | CATEGORY_MASK_UTF_16_LE_NOSIG)
693
694
695/* List of symbols `coding-category-xxx' ordered by priority. This
696 variable is exposed to Emacs Lisp. */
697static Lisp_Object Vcoding_category_list;
698
699/* Table of coding categories (Lisp symbols). This variable is for
700 internal use oly. */
701static Lisp_Object Vcoding_category_table;
702
703/* Table of coding-categories ordered by priority. */
704static enum coding_category coding_priorities[coding_category_max];
705
706/* Nth element is a coding context for the coding system bound to the
707 Nth coding category. */
708static struct coding_system coding_categories[coding_category_max];
709
df7492f9
KH
710/*** Commonly used macros and functions ***/
711
712#ifndef min
713#define min(a, b) ((a) < (b) ? (a) : (b))
714#endif
715#ifndef max
716#define max(a, b) ((a) > (b) ? (a) : (b))
717#endif
4ed46869 718
df7492f9
KH
719#define CODING_GET_INFO(coding, attrs, eol_type, charset_list) \
720 do { \
721 attrs = CODING_ID_ATTRS (coding->id); \
722 eol_type = CODING_ID_EOL_TYPE (coding->id); \
723 if (VECTORP (eol_type)) \
724 eol_type = Qunix; \
725 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
726 } while (0)
4ed46869 727
4ed46869 728
df7492f9
KH
729/* Safely get one byte from the source text pointed by SRC which ends
730 at SRC_END, and set C to that byte. If there are not enough bytes
731 in the source, it jumps to `no_more_source'. The caller
732 should declare and set these variables appropriately in advance:
733 src, src_end, multibytep
734*/
aa72b389 735
df7492f9 736#define ONE_MORE_BYTE(c) \
aa72b389 737 do { \
df7492f9
KH
738 if (src == src_end) \
739 { \
740 if (src_base < src) \
741 coding->result = CODING_RESULT_INSUFFICIENT_SRC; \
742 goto no_more_source; \
743 } \
744 c = *src++; \
745 if (multibytep && (c & 0x80)) \
746 { \
747 if ((c & 0xFE) != 0xC0) \
748 error ("Undecodable char found"); \
749 c = ((c & 1) << 6) | *src++; \
750 } \
751 consumed_chars++; \
aa72b389
KH
752 } while (0)
753
aa72b389 754
df7492f9
KH
755#define ONE_MORE_BYTE_NO_CHECK(c) \
756 do { \
757 c = *src++; \
758 if (multibytep && (c & 0x80)) \
759 { \
760 if ((c & 0xFE) != 0xC0) \
761 error ("Undecodable char found"); \
762 c = ((c & 1) << 6) | *src++; \
763 } \
781d7a48 764 consumed_chars++; \
aa72b389
KH
765 } while (0)
766
aa72b389 767
df7492f9
KH
768/* Store a byte C in the place pointed by DST and increment DST to the
769 next free point, and increment PRODUCED_CHARS. The caller should
770 assure that C is 0..127, and declare and set the variable `dst'
771 appropriately in advance.
772*/
aa72b389
KH
773
774
df7492f9
KH
775#define EMIT_ONE_ASCII_BYTE(c) \
776 do { \
777 produced_chars++; \
778 *dst++ = (c); \
b6871cc7 779 } while (0)
aa72b389
KH
780
781
df7492f9 782/* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */
aa72b389 783
df7492f9
KH
784#define EMIT_TWO_ASCII_BYTES(c1, c2) \
785 do { \
786 produced_chars += 2; \
787 *dst++ = (c1), *dst++ = (c2); \
788 } while (0)
aa72b389
KH
789
790
df7492f9
KH
791/* Store a byte C in the place pointed by DST and increment DST to the
792 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
793 nonzero, store in an appropriate multibyte from. The caller should
794 declare and set the variables `dst' and `multibytep' appropriately
795 in advance. */
796
797#define EMIT_ONE_BYTE(c) \
798 do { \
799 produced_chars++; \
800 if (multibytep) \
801 { \
802 int ch = (c); \
803 if (ch >= 0x80) \
804 ch = BYTE8_TO_CHAR (ch); \
805 CHAR_STRING_ADVANCE (ch, dst); \
806 } \
807 else \
808 *dst++ = (c); \
aa72b389 809 } while (0)
aa72b389 810
aa72b389 811
df7492f9 812/* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
aa72b389 813
e19c3639
KH
814#define EMIT_TWO_BYTES(c1, c2) \
815 do { \
816 produced_chars += 2; \
817 if (multibytep) \
818 { \
819 int ch; \
820 \
821 ch = (c1); \
822 if (ch >= 0x80) \
823 ch = BYTE8_TO_CHAR (ch); \
824 CHAR_STRING_ADVANCE (ch, dst); \
825 ch = (c2); \
826 if (ch >= 0x80) \
827 ch = BYTE8_TO_CHAR (ch); \
828 CHAR_STRING_ADVANCE (ch, dst); \
829 } \
830 else \
831 { \
832 *dst++ = (c1); \
833 *dst++ = (c2); \
834 } \
aa72b389
KH
835 } while (0)
836
837
df7492f9
KH
838#define EMIT_THREE_BYTES(c1, c2, c3) \
839 do { \
840 EMIT_ONE_BYTE (c1); \
841 EMIT_TWO_BYTES (c2, c3); \
842 } while (0)
aa72b389 843
aa72b389 844
df7492f9
KH
845#define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
846 do { \
847 EMIT_TWO_BYTES (c1, c2); \
848 EMIT_TWO_BYTES (c3, c4); \
849 } while (0)
aa72b389 850
aa72b389 851
df7492f9
KH
852#define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
853 do { \
854 charset_map_loaded = 0; \
855 c = DECODE_CHAR (charset, code); \
856 if (charset_map_loaded) \
857 { \
8f924df7 858 const unsigned char *orig = coding->source; \
df7492f9
KH
859 EMACS_INT offset; \
860 \
861 coding_set_source (coding); \
862 offset = coding->source - orig; \
863 src += offset; \
864 src_base += offset; \
865 src_end += offset; \
866 } \
aa72b389
KH
867 } while (0)
868
869
df7492f9
KH
870#define ASSURE_DESTINATION(bytes) \
871 do { \
872 if (dst + (bytes) >= dst_end) \
873 { \
874 int more_bytes = charbuf_end - charbuf + (bytes); \
875 \
876 dst = alloc_destination (coding, more_bytes, dst); \
877 dst_end = coding->destination + coding->dst_bytes; \
878 } \
879 } while (0)
aa72b389 880
aa72b389 881
aa72b389 882
df7492f9
KH
883static void
884coding_set_source (coding)
aa72b389 885 struct coding_system *coding;
aa72b389 886{
df7492f9
KH
887 if (BUFFERP (coding->src_object))
888 {
2cb26057 889 struct buffer *buf = XBUFFER (coding->src_object);
aa72b389 890
df7492f9 891 if (coding->src_pos < 0)
2cb26057 892 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
df7492f9 893 else
2cb26057 894 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
aa72b389 895 }
df7492f9 896 else if (STRINGP (coding->src_object))
aa72b389 897 {
8f924df7 898 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
aa72b389 899 }
df7492f9
KH
900 else
901 /* Otherwise, the source is C string and is never relocated
902 automatically. Thus we don't have to update anything. */
903 ;
904}
aa72b389 905
df7492f9
KH
906static void
907coding_set_destination (coding)
908 struct coding_system *coding;
909{
910 if (BUFFERP (coding->dst_object))
aa72b389 911 {
df7492f9 912 if (coding->src_pos < 0)
aa72b389 913 {
28f67a95
KH
914 coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
915 coding->dst_bytes = (GAP_END_ADDR
916 - (coding->src_bytes - coding->consumed)
917 - coding->destination);
aa72b389 918 }
df7492f9 919 else
28f67a95
KH
920 {
921 /* We are sure that coding->dst_pos_byte is before the gap
922 of the buffer. */
923 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
924 + coding->dst_pos_byte - 1);
925 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
926 - coding->destination);
927 }
df7492f9
KH
928 }
929 else
930 /* Otherwise, the destination is C string and is never relocated
931 automatically. Thus we don't have to update anything. */
932 ;
933}
934
935
936static void
937coding_alloc_by_realloc (coding, bytes)
938 struct coding_system *coding;
939 EMACS_INT bytes;
940{
941 coding->destination = (unsigned char *) xrealloc (coding->destination,
942 coding->dst_bytes + bytes);
943 coding->dst_bytes += bytes;
944}
945
946static void
947coding_alloc_by_making_gap (coding, bytes)
948 struct coding_system *coding;
949 EMACS_INT bytes;
950{
2c78b7e1
KH
951 if (BUFFERP (coding->dst_object)
952 && EQ (coding->src_object, coding->dst_object))
df7492f9
KH
953 {
954 EMACS_INT add = coding->src_bytes - coding->consumed;
955
956 GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
957 make_gap (bytes);
958 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
959 }
960 else
961 {
2c78b7e1
KH
962 Lisp_Object this_buffer;
963
964 this_buffer = Fcurrent_buffer ();
df7492f9
KH
965 set_buffer_internal (XBUFFER (coding->dst_object));
966 make_gap (bytes);
967 set_buffer_internal (XBUFFER (this_buffer));
aa72b389 968 }
df7492f9 969}
8f924df7 970
df7492f9
KH
971
972static unsigned char *
973alloc_destination (coding, nbytes, dst)
974 struct coding_system *coding;
975 int nbytes;
976 unsigned char *dst;
977{
978 EMACS_INT offset = dst - coding->destination;
979
980 if (BUFFERP (coding->dst_object))
981 coding_alloc_by_making_gap (coding, nbytes);
aa72b389 982 else
df7492f9
KH
983 coding_alloc_by_realloc (coding, nbytes);
984 coding->result = CODING_RESULT_SUCCESS;
985 coding_set_destination (coding);
986 dst = coding->destination + offset;
987 return dst;
988}
aa72b389 989
ff0dacd7
KH
990/** Macros for annotations. */
991
992/* Maximum length of annotation data (sum of annotations for
993 composition and charset). */
994#define MAX_ANNOTATION_LENGTH (5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 5)
995
996/* An annotation data is stored in the array coding->charbuf in this
997 format:
998 [ -LENGTH ANNOTATION_MASK FROM TO ... ]
999 LENGTH is the number of elements in the annotation.
1000 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1001 FROM and TO specify the range of text annotated. They are relative
1002 to coding->src_pos (on encoding) or coding->dst_pos (on decoding).
1003
1004 The format of the following elements depend on ANNOTATION_MASK.
1005
1006 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1007 follows:
1008 ... METHOD [ COMPOSITION-COMPONENTS ... ]
1009 METHOD is one of enum composition_method.
1010 Optionnal COMPOSITION-COMPONENTS are characters and composition
1011 rules.
1012
1013 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1014 follows. */
1015
1016#define ADD_ANNOTATION_DATA(buf, len, mask, from, to) \
1017 do { \
1018 *(buf)++ = -(len); \
1019 *(buf)++ = (mask); \
1020 *(buf)++ = (from); \
1021 *(buf)++ = (to); \
1022 coding->annotated = 1; \
1023 } while (0);
1024
1025#define ADD_COMPOSITION_DATA(buf, from, to, method) \
1026 do { \
1027 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, from, to); \
1028 *buf++ = method; \
1029 } while (0)
1030
1031
1032#define ADD_CHARSET_DATA(buf, from, to, id) \
1033 do { \
1034 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_CHARSET_MASK, from, to); \
1035 *buf++ = id; \
1036 } while (0)
1037
df7492f9
KH
1038\f
1039/*** 2. Emacs' internal format (emacs-utf-8) ***/
1040
1041
1042
1043\f
1044/*** 3. UTF-8 ***/
1045
1046/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
1047 Check if a text is encoded in UTF-8. If it is, return 1, else
1048 return 0. */
df7492f9
KH
1049
1050#define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1051#define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1052#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1053#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1054#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1055#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1056
1057static int
ff0dacd7 1058detect_coding_utf_8 (coding, detect_info)
df7492f9 1059 struct coding_system *coding;
ff0dacd7 1060 struct coding_detection_info *detect_info;
df7492f9 1061{
8f924df7
KH
1062 const unsigned char *src = coding->source, *src_base = src;
1063 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
1064 int multibytep = coding->src_multibyte;
1065 int consumed_chars = 0;
1066 int found = 0;
89528eb3 1067 int incomplete;
df7492f9 1068
ff0dacd7 1069 detect_info->checked |= CATEGORY_MASK_UTF_8;
df7492f9
KH
1070 /* A coding system of this category is always ASCII compatible. */
1071 src += coding->head_ascii;
1072
1073 while (1)
aa72b389 1074 {
df7492f9 1075 int c, c1, c2, c3, c4;
aa72b389 1076
89528eb3 1077 incomplete = 0;
df7492f9
KH
1078 ONE_MORE_BYTE (c);
1079 if (UTF_8_1_OCTET_P (c))
1080 continue;
89528eb3 1081 incomplete = 1;
df7492f9
KH
1082 ONE_MORE_BYTE (c1);
1083 if (! UTF_8_EXTRA_OCTET_P (c1))
1084 break;
1085 if (UTF_8_2_OCTET_LEADING_P (c))
aa72b389 1086 {
ff0dacd7 1087 found = CATEGORY_MASK_UTF_8;
df7492f9 1088 continue;
aa72b389 1089 }
df7492f9
KH
1090 ONE_MORE_BYTE (c2);
1091 if (! UTF_8_EXTRA_OCTET_P (c2))
1092 break;
1093 if (UTF_8_3_OCTET_LEADING_P (c))
aa72b389 1094 {
ff0dacd7 1095 found = CATEGORY_MASK_UTF_8;
df7492f9 1096 continue;
aa72b389 1097 }
df7492f9
KH
1098 ONE_MORE_BYTE (c3);
1099 if (! UTF_8_EXTRA_OCTET_P (c3))
1100 break;
1101 if (UTF_8_4_OCTET_LEADING_P (c))
aa72b389 1102 {
ff0dacd7 1103 found = CATEGORY_MASK_UTF_8;
df7492f9
KH
1104 continue;
1105 }
1106 ONE_MORE_BYTE (c4);
1107 if (! UTF_8_EXTRA_OCTET_P (c4))
1108 break;
1109 if (UTF_8_5_OCTET_LEADING_P (c))
1110 {
ff0dacd7 1111 found = CATEGORY_MASK_UTF_8;
df7492f9
KH
1112 continue;
1113 }
1114 break;
aa72b389 1115 }
ff0dacd7 1116 detect_info->rejected |= CATEGORY_MASK_UTF_8;
df7492f9 1117 return 0;
aa72b389 1118
df7492f9 1119 no_more_source:
89528eb3 1120 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
aa72b389 1121 {
ff0dacd7 1122 detect_info->rejected |= CATEGORY_MASK_UTF_8;
89528eb3 1123 return 0;
aa72b389 1124 }
ff0dacd7
KH
1125 detect_info->found |= found;
1126 return 1;
aa72b389
KH
1127}
1128
4ed46869 1129
b73bfc1c 1130static void
df7492f9 1131decode_coding_utf_8 (coding)
b73bfc1c 1132 struct coding_system *coding;
b73bfc1c 1133{
8f924df7
KH
1134 const unsigned char *src = coding->source + coding->consumed;
1135 const unsigned char *src_end = coding->source + coding->src_bytes;
1136 const unsigned char *src_base;
df7492f9
KH
1137 int *charbuf = coding->charbuf;
1138 int *charbuf_end = charbuf + coding->charbuf_size;
1139 int consumed_chars = 0, consumed_chars_base;
1140 int multibytep = coding->src_multibyte;
1141 Lisp_Object attr, eol_type, charset_list;
4ed46869 1142
df7492f9
KH
1143 CODING_GET_INFO (coding, attr, eol_type, charset_list);
1144
1145 while (1)
b73bfc1c 1146 {
df7492f9 1147 int c, c1, c2, c3, c4, c5;
ec6d2bb8 1148
df7492f9
KH
1149 src_base = src;
1150 consumed_chars_base = consumed_chars;
4af310db 1151
df7492f9
KH
1152 if (charbuf >= charbuf_end)
1153 break;
1154
1155 ONE_MORE_BYTE (c1);
1156 if (UTF_8_1_OCTET_P(c1))
1157 {
1158 c = c1;
1159 if (c == '\r')
4af310db 1160 {
df7492f9 1161 if (EQ (eol_type, Qdos))
4af310db 1162 {
df7492f9 1163 if (src == src_end)
98725083
KH
1164 {
1165 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
1166 goto no_more_source;
1167 }
df7492f9
KH
1168 if (*src == '\n')
1169 ONE_MORE_BYTE (c);
4af310db 1170 }
df7492f9
KH
1171 else if (EQ (eol_type, Qmac))
1172 c = '\n';
4af310db 1173 }
4af310db 1174 }
df7492f9 1175 else
4af310db 1176 {
df7492f9
KH
1177 ONE_MORE_BYTE (c2);
1178 if (! UTF_8_EXTRA_OCTET_P (c2))
1179 goto invalid_code;
1180 if (UTF_8_2_OCTET_LEADING_P (c1))
4af310db 1181 {
b0edb2c5
DL
1182 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1183 /* Reject overlong sequences here and below. Encoders
1184 producing them are incorrect, they can be misleading,
1185 and they mess up read/write invariance. */
1186 if (c < 128)
1187 goto invalid_code;
4af310db 1188 }
df7492f9 1189 else
aa72b389 1190 {
df7492f9
KH
1191 ONE_MORE_BYTE (c3);
1192 if (! UTF_8_EXTRA_OCTET_P (c3))
1193 goto invalid_code;
1194 if (UTF_8_3_OCTET_LEADING_P (c1))
b0edb2c5
DL
1195 {
1196 c = (((c1 & 0xF) << 12)
1197 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
72fe1301
DL
1198 if (c < 0x800
1199 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
b0edb2c5
DL
1200 goto invalid_code;
1201 }
df7492f9
KH
1202 else
1203 {
1204 ONE_MORE_BYTE (c4);
1205 if (! UTF_8_EXTRA_OCTET_P (c4))
1206 goto invalid_code;
1207 if (UTF_8_4_OCTET_LEADING_P (c1))
b0edb2c5 1208 {
df7492f9
KH
1209 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1210 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
b0edb2c5
DL
1211 if (c < 0x10000)
1212 goto invalid_code;
1213 }
df7492f9
KH
1214 else
1215 {
1216 ONE_MORE_BYTE (c5);
1217 if (! UTF_8_EXTRA_OCTET_P (c5))
1218 goto invalid_code;
1219 if (UTF_8_5_OCTET_LEADING_P (c1))
1220 {
1221 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1222 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1223 | (c5 & 0x3F));
b0edb2c5 1224 if ((c > MAX_CHAR) || (c < 0x200000))
df7492f9
KH
1225 goto invalid_code;
1226 }
1227 else
1228 goto invalid_code;
1229 }
1230 }
aa72b389 1231 }
b73bfc1c 1232 }
df7492f9
KH
1233
1234 *charbuf++ = c;
1235 continue;
1236
1237 invalid_code:
1238 src = src_base;
1239 consumed_chars = consumed_chars_base;
1240 ONE_MORE_BYTE (c);
1241 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1242 coding->errors++;
aa72b389
KH
1243 }
1244
df7492f9
KH
1245 no_more_source:
1246 coding->consumed_char += consumed_chars_base;
1247 coding->consumed = src_base - coding->source;
1248 coding->charbuf_used = charbuf - coding->charbuf;
1249}
1250
1251
1252static int
1253encode_coding_utf_8 (coding)
1254 struct coding_system *coding;
1255{
1256 int multibytep = coding->dst_multibyte;
1257 int *charbuf = coding->charbuf;
1258 int *charbuf_end = charbuf + coding->charbuf_used;
1259 unsigned char *dst = coding->destination + coding->produced;
1260 unsigned char *dst_end = coding->destination + coding->dst_bytes;
e19c3639 1261 int produced_chars = 0;
df7492f9
KH
1262 int c;
1263
1264 if (multibytep)
aa72b389 1265 {
df7492f9
KH
1266 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1267
1268 while (charbuf < charbuf_end)
b73bfc1c 1269 {
df7492f9 1270 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
8f924df7 1271
df7492f9
KH
1272 ASSURE_DESTINATION (safe_room);
1273 c = *charbuf++;
28f67a95
KH
1274 if (CHAR_BYTE8_P (c))
1275 {
1276 c = CHAR_TO_BYTE8 (c);
1277 EMIT_ONE_BYTE (c);
1278 }
1279 else
1280 {
1281 CHAR_STRING_ADVANCE (c, pend);
1282 for (p = str; p < pend; p++)
1283 EMIT_ONE_BYTE (*p);
1284 }
b73bfc1c 1285 }
aa72b389 1286 }
df7492f9
KH
1287 else
1288 {
1289 int safe_room = MAX_MULTIBYTE_LENGTH;
1290
1291 while (charbuf < charbuf_end)
b73bfc1c 1292 {
df7492f9
KH
1293 ASSURE_DESTINATION (safe_room);
1294 c = *charbuf++;
1295 dst += CHAR_STRING (c, dst);
1296 produced_chars++;
4ed46869
KH
1297 }
1298 }
df7492f9
KH
1299 coding->result = CODING_RESULT_SUCCESS;
1300 coding->produced_char += produced_chars;
1301 coding->produced = dst - coding->destination;
1302 return 0;
4ed46869
KH
1303}
1304
b73bfc1c 1305
df7492f9 1306/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
1307 Check if a text is encoded in one of UTF-16 based coding systems.
1308 If it is, return 1, else return 0. */
aa72b389 1309
df7492f9
KH
1310#define UTF_16_HIGH_SURROGATE_P(val) \
1311 (((val) & 0xFC00) == 0xD800)
1312
1313#define UTF_16_LOW_SURROGATE_P(val) \
1314 (((val) & 0xFC00) == 0xDC00)
93dec019 1315
df7492f9
KH
1316#define UTF_16_INVALID_P(val) \
1317 (((val) == 0xFFFE) \
1318 || ((val) == 0xFFFF) \
1319 || UTF_16_LOW_SURROGATE_P (val))
aa72b389 1320
aa72b389 1321
df7492f9 1322static int
ff0dacd7 1323detect_coding_utf_16 (coding, detect_info)
aa72b389 1324 struct coding_system *coding;
ff0dacd7 1325 struct coding_detection_info *detect_info;
aa72b389 1326{
8f924df7
KH
1327 const unsigned char *src = coding->source, *src_base = src;
1328 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
1329 int multibytep = coding->src_multibyte;
1330 int consumed_chars = 0;
1331 int c1, c2;
aa72b389 1332
ff0dacd7 1333 detect_info->checked |= CATEGORY_MASK_UTF_16;
aa72b389 1334
ff0dacd7
KH
1335 if (coding->mode & CODING_MODE_LAST_BLOCK
1336 && (coding->src_bytes & 1))
1337 {
1338 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1339 return 0;
1340 }
df7492f9
KH
1341 ONE_MORE_BYTE (c1);
1342 ONE_MORE_BYTE (c2);
aa72b389 1343
df7492f9 1344 if ((c1 == 0xFF) && (c2 == 0xFE))
aa72b389 1345 {
b49a1807
KH
1346 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1347 | CATEGORY_MASK_UTF_16_AUTO);
ff0dacd7 1348 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE;
aa72b389 1349 }
df7492f9 1350 else if ((c1 == 0xFE) && (c2 == 0xFF))
ff0dacd7 1351 {
b49a1807
KH
1352 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1353 | CATEGORY_MASK_UTF_16_AUTO);
ff0dacd7
KH
1354 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE;
1355 }
df7492f9 1356 no_more_source:
ff0dacd7 1357 return 1;
df7492f9 1358}
aa72b389 1359
df7492f9
KH
1360static void
1361decode_coding_utf_16 (coding)
1362 struct coding_system *coding;
1363{
8f924df7
KH
1364 const unsigned char *src = coding->source + coding->consumed;
1365 const unsigned char *src_end = coding->source + coding->src_bytes;
1366 const unsigned char *src_base;
df7492f9
KH
1367 int *charbuf = coding->charbuf;
1368 int *charbuf_end = charbuf + coding->charbuf_size;
1369 int consumed_chars = 0, consumed_chars_base;
1370 int multibytep = coding->src_multibyte;
1371 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1372 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1373 int surrogate = CODING_UTF_16_SURROGATE (coding);
1374 Lisp_Object attr, eol_type, charset_list;
1375
1376 CODING_GET_INFO (coding, attr, eol_type, charset_list);
1377
b49a1807 1378 if (bom == utf_16_with_bom)
aa72b389 1379 {
df7492f9 1380 int c, c1, c2;
4af310db 1381
aa72b389 1382 src_base = src;
df7492f9
KH
1383 ONE_MORE_BYTE (c1);
1384 ONE_MORE_BYTE (c2);
e19c3639 1385 c = (c1 << 8) | c2;
aa72b389 1386
b49a1807
KH
1387 if (endian == utf_16_big_endian
1388 ? c != 0xFEFF : c != 0xFFFE)
aa72b389 1389 {
b49a1807
KH
1390 /* The first two bytes are not BOM. Treat them as bytes
1391 for a normal character. */
1392 src = src_base;
1393 coding->errors++;
aa72b389 1394 }
b49a1807
KH
1395 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1396 }
1397 else if (bom == utf_16_detect_bom)
1398 {
1399 /* We have already tried to detect BOM and failed in
1400 detect_coding. */
1401 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
df7492f9 1402 }
aa72b389 1403
df7492f9
KH
1404 while (1)
1405 {
1406 int c, c1, c2;
1407
1408 src_base = src;
1409 consumed_chars_base = consumed_chars;
1410
1411 if (charbuf + 2 >= charbuf_end)
1412 break;
1413
1414 ONE_MORE_BYTE (c1);
1415 ONE_MORE_BYTE (c2);
1416 c = (endian == utf_16_big_endian
e19c3639 1417 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
df7492f9 1418 if (surrogate)
fd3ae0b9 1419 {
df7492f9 1420 if (! UTF_16_LOW_SURROGATE_P (c))
fd3ae0b9 1421 {
df7492f9
KH
1422 if (endian == utf_16_big_endian)
1423 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1424 else
1425 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1426 *charbuf++ = c1;
1427 *charbuf++ = c2;
1428 coding->errors++;
1429 if (UTF_16_HIGH_SURROGATE_P (c))
1430 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
fd3ae0b9 1431 else
df7492f9 1432 *charbuf++ = c;
fd3ae0b9
KH
1433 }
1434 else
df7492f9
KH
1435 {
1436 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1437 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1438 *charbuf++ = c;
1439 }
fd3ae0b9 1440 }
aa72b389 1441 else
df7492f9
KH
1442 {
1443 if (UTF_16_HIGH_SURROGATE_P (c))
1444 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1445 else
1446 *charbuf++ = c;
8f924df7 1447 }
aa72b389 1448 }
df7492f9
KH
1449
1450 no_more_source:
1451 coding->consumed_char += consumed_chars_base;
1452 coding->consumed = src_base - coding->source;
1453 coding->charbuf_used = charbuf - coding->charbuf;
aa72b389 1454}
b73bfc1c 1455
df7492f9
KH
1456static int
1457encode_coding_utf_16 (coding)
1458 struct coding_system *coding;
1459{
1460 int multibytep = coding->dst_multibyte;
1461 int *charbuf = coding->charbuf;
1462 int *charbuf_end = charbuf + coding->charbuf_used;
1463 unsigned char *dst = coding->destination + coding->produced;
1464 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1465 int safe_room = 8;
1466 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1467 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1468 int produced_chars = 0;
1469 Lisp_Object attrs, eol_type, charset_list;
1470 int c;
4ed46869 1471
df7492f9
KH
1472 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1473
b49a1807 1474 if (bom != utf_16_without_bom)
df7492f9
KH
1475 {
1476 ASSURE_DESTINATION (safe_room);
1477 if (big_endian)
df7492f9 1478 EMIT_TWO_BYTES (0xFE, 0xFF);
880cf180
KH
1479 else
1480 EMIT_TWO_BYTES (0xFF, 0xFE);
df7492f9
KH
1481 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1482 }
1483
1484 while (charbuf < charbuf_end)
1485 {
1486 ASSURE_DESTINATION (safe_room);
1487 c = *charbuf++;
e19c3639
KH
1488 if (c >= MAX_UNICODE_CHAR)
1489 c = coding->default_char;
df7492f9
KH
1490
1491 if (c < 0x10000)
1492 {
1493 if (big_endian)
1494 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1495 else
1496 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1497 }
1498 else
1499 {
1500 int c1, c2;
1501
1502 c -= 0x10000;
1503 c1 = (c >> 10) + 0xD800;
1504 c2 = (c & 0x3FF) + 0xDC00;
1505 if (big_endian)
1506 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1507 else
1508 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1509 }
1510 }
1511 coding->result = CODING_RESULT_SUCCESS;
1512 coding->produced = dst - coding->destination;
1513 coding->produced_char += produced_chars;
1514 return 0;
1515}
1516
1517\f
1518/*** 6. Old Emacs' internal format (emacs-mule) ***/
1519
1520/* Emacs' internal format for representation of multiple character
1521 sets is a kind of multi-byte encoding, i.e. characters are
1522 represented by variable-length sequences of one-byte codes.
1523
1524 ASCII characters and control characters (e.g. `tab', `newline') are
1525 represented by one-byte sequences which are their ASCII codes, in
1526 the range 0x00 through 0x7F.
1527
1528 8-bit characters of the range 0x80..0x9F are represented by
1529 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1530 code + 0x20).
1531
1532 8-bit characters of the range 0xA0..0xFF are represented by
1533 one-byte sequences which are their 8-bit code.
1534
1535 The other characters are represented by a sequence of `base
1536 leading-code', optional `extended leading-code', and one or two
1537 `position-code's. The length of the sequence is determined by the
1538 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1539 whereas extended leading-code and position-code take the range 0xA0
1540 through 0xFF. See `charset.h' for more details about leading-code
1541 and position-code.
1542
1543 --- CODE RANGE of Emacs' internal format ---
1544 character set range
1545 ------------- -----
1546 ascii 0x00..0x7F
1547 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1548 eight-bit-graphic 0xA0..0xBF
1549 ELSE 0x81..0x9D + [0xA0..0xFF]+
1550 ---------------------------------------------
1551
1552 As this is the internal character representation, the format is
1553 usually not used externally (i.e. in a file or in a data sent to a
1554 process). But, it is possible to have a text externally in this
1555 format (i.e. by encoding by the coding system `emacs-mule').
1556
1557 In that case, a sequence of one-byte codes has a slightly different
1558 form.
1559
1560 At first, all characters in eight-bit-control are represented by
1561 one-byte sequences which are their 8-bit code.
1562
1563 Next, character composition data are represented by the byte
1564 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1565 where,
1566 METHOD is 0xF0 plus one of composition method (enum
1567 composition_method),
1568
1569 BYTES is 0xA0 plus a byte length of this composition data,
1570
1571 CHARS is 0x20 plus a number of characters composed by this
1572 data,
1573
1574 COMPONENTs are characters of multibye form or composition
1575 rules encoded by two-byte of ASCII codes.
1576
1577 In addition, for backward compatibility, the following formats are
1578 also recognized as composition data on decoding.
1579
1580 0x80 MSEQ ...
1581 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1582
1583 Here,
1584 MSEQ is a multibyte form but in these special format:
1585 ASCII: 0xA0 ASCII_CODE+0x80,
1586 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1587 RULE is a one byte code of the range 0xA0..0xF0 that
1588 represents a composition rule.
1589 */
1590
1591char emacs_mule_bytes[256];
1592
df7492f9 1593int
ff0dacd7 1594emacs_mule_char (coding, src, nbytes, nchars, id)
df7492f9 1595 struct coding_system *coding;
781d7a48 1596 unsigned char *src;
ff0dacd7 1597 int *nbytes, *nchars, *id;
df7492f9 1598{
8f924df7
KH
1599 const unsigned char *src_end = coding->source + coding->src_bytes;
1600 const unsigned char *src_base = src;
df7492f9 1601 int multibytep = coding->src_multibyte;
df7492f9
KH
1602 struct charset *charset;
1603 unsigned code;
1604 int c;
1605 int consumed_chars = 0;
1606
1607 ONE_MORE_BYTE (c);
df7492f9
KH
1608 switch (emacs_mule_bytes[c])
1609 {
1610 case 2:
1611 if (! (charset = emacs_mule_charset[c]))
1612 goto invalid_code;
1613 ONE_MORE_BYTE (c);
1614 code = c & 0x7F;
1615 break;
1616
1617 case 3:
7c78e542
KH
1618 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1619 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
b73bfc1c 1620 {
df7492f9
KH
1621 ONE_MORE_BYTE (c);
1622 if (! (charset = emacs_mule_charset[c]))
1623 goto invalid_code;
1624 ONE_MORE_BYTE (c);
1625 code = c & 0x7F;
b73bfc1c
KH
1626 }
1627 else
1628 {
df7492f9
KH
1629 if (! (charset = emacs_mule_charset[c]))
1630 goto invalid_code;
1631 ONE_MORE_BYTE (c);
781d7a48 1632 code = (c & 0x7F) << 8;
df7492f9
KH
1633 ONE_MORE_BYTE (c);
1634 code |= c & 0x7F;
1635 }
1636 break;
1637
1638 case 4:
781d7a48 1639 ONE_MORE_BYTE (c);
df7492f9
KH
1640 if (! (charset = emacs_mule_charset[c]))
1641 goto invalid_code;
1642 ONE_MORE_BYTE (c);
781d7a48 1643 code = (c & 0x7F) << 8;
df7492f9
KH
1644 ONE_MORE_BYTE (c);
1645 code |= c & 0x7F;
1646 break;
1647
1648 case 1:
1649 code = c;
9d123124
KH
1650 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1651 ? charset_ascii : charset_eight_bit);
df7492f9
KH
1652 break;
1653
1654 default:
1655 abort ();
1656 }
1657 c = DECODE_CHAR (charset, code);
1658 if (c < 0)
1659 goto invalid_code;
1660 *nbytes = src - src_base;
1661 *nchars = consumed_chars;
ff0dacd7
KH
1662 if (id)
1663 *id = charset->id;
df7492f9
KH
1664 return c;
1665
1666 no_more_source:
1667 return -2;
1668
1669 invalid_code:
1670 return -1;
1671}
1672
1673
1674/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
1675 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1676 else return 0. */
df7492f9
KH
1677
1678static int
ff0dacd7 1679detect_coding_emacs_mule (coding, detect_info)
df7492f9 1680 struct coding_system *coding;
ff0dacd7 1681 struct coding_detection_info *detect_info;
df7492f9 1682{
8f924df7
KH
1683 const unsigned char *src = coding->source, *src_base = src;
1684 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
1685 int multibytep = coding->src_multibyte;
1686 int consumed_chars = 0;
1687 int c;
1688 int found = 0;
89528eb3 1689 int incomplete;
df7492f9 1690
ff0dacd7 1691 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
df7492f9
KH
1692 /* A coding system of this category is always ASCII compatible. */
1693 src += coding->head_ascii;
1694
1695 while (1)
1696 {
89528eb3 1697 incomplete = 0;
df7492f9 1698 ONE_MORE_BYTE (c);
89528eb3 1699 incomplete = 1;
df7492f9
KH
1700
1701 if (c == 0x80)
1702 {
1703 /* Perhaps the start of composite character. We simple skip
1704 it because analyzing it is too heavy for detecting. But,
1705 at least, we check that the composite character
1706 constitues of more than 4 bytes. */
8f924df7 1707 const unsigned char *src_base;
df7492f9
KH
1708
1709 repeat:
1710 src_base = src;
1711 do
1712 {
1713 ONE_MORE_BYTE (c);
1714 }
1715 while (c >= 0xA0);
1716
1717 if (src - src_base <= 4)
1718 break;
ff0dacd7 1719 found = CATEGORY_MASK_EMACS_MULE;
df7492f9
KH
1720 if (c == 0x80)
1721 goto repeat;
b73bfc1c 1722 }
df7492f9
KH
1723
1724 if (c < 0x80)
b73bfc1c 1725 {
df7492f9
KH
1726 if (c < 0x20
1727 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1728 break;
1729 }
1730 else
1731 {
8f924df7 1732 const unsigned char *src_base = src - 1;
df7492f9
KH
1733
1734 do
1735 {
1736 ONE_MORE_BYTE (c);
1737 }
1738 while (c >= 0xA0);
1739 if (src - src_base != emacs_mule_bytes[*src_base])
1740 break;
ff0dacd7 1741 found = CATEGORY_MASK_EMACS_MULE;
4ed46869
KH
1742 }
1743 }
ff0dacd7 1744 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
df7492f9
KH
1745 return 0;
1746
1747 no_more_source:
89528eb3
KH
1748 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
1749 {
ff0dacd7 1750 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
89528eb3
KH
1751 return 0;
1752 }
ff0dacd7
KH
1753 detect_info->found |= found;
1754 return 1;
4ed46869
KH
1755}
1756
b73bfc1c 1757
df7492f9
KH
1758/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1759
1760/* Decode a character represented as a component of composition
1761 sequence of Emacs 20/21 style at SRC. Set C to that character and
1762 update SRC to the head of next character (or an encoded composition
1763 rule). If SRC doesn't points a composition component, set C to -1.
1764 If SRC points an invalid byte sequence, global exit by a return
1765 value 0. */
1766
1767#define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1768 if (1) \
1769 { \
1770 int c; \
1771 int nbytes, nchars; \
1772 \
1773 if (src == src_end) \
1774 break; \
ff0dacd7 1775 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
df7492f9
KH
1776 if (c < 0) \
1777 { \
1778 if (c == -2) \
1779 break; \
1780 goto invalid_code; \
1781 } \
1782 *buf++ = c; \
1783 src += nbytes; \
1784 consumed_chars += nchars; \
1785 } \
1786 else
1787
1788
1789/* Decode a composition rule represented as a component of composition
781d7a48
KH
1790 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
1791 and increment BUF. If SRC points an invalid byte sequence, set C
1792 to -1. */
df7492f9 1793
781d7a48 1794#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
df7492f9
KH
1795 do { \
1796 int c, gref, nref; \
1797 \
781d7a48 1798 if (src >= src_end) \
df7492f9
KH
1799 goto invalid_code; \
1800 ONE_MORE_BYTE_NO_CHECK (c); \
781d7a48 1801 c -= 0x20; \
df7492f9
KH
1802 if (c < 0 || c >= 81) \
1803 goto invalid_code; \
1804 \
1805 gref = c / 9, nref = c % 9; \
1806 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1807 } while (0)
1808
1809
781d7a48
KH
1810/* Decode a composition rule represented as a component of composition
1811 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
1812 and increment BUF. If SRC points an invalid byte sequence, set C
1813 to -1. */
1814
1815#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
1816 do { \
1817 int gref, nref; \
1818 \
1819 if (src + 1>= src_end) \
1820 goto invalid_code; \
1821 ONE_MORE_BYTE_NO_CHECK (gref); \
1822 gref -= 0x20; \
1823 ONE_MORE_BYTE_NO_CHECK (nref); \
1824 nref -= 0x20; \
1825 if (gref < 0 || gref >= 81 \
1826 || nref < 0 || nref >= 81) \
1827 goto invalid_code; \
1828 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1829 } while (0)
1830
1831
df7492f9 1832#define DECODE_EMACS_MULE_21_COMPOSITION(c) \
aa72b389 1833 do { \
df7492f9 1834 /* Emacs 21 style format. The first three bytes at SRC are \
781d7a48 1835 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
df7492f9
KH
1836 the byte length of this composition information, CHARS is the \
1837 number of characters composed by this composition. */ \
781d7a48
KH
1838 enum composition_method method = c - 0xF2; \
1839 int *charbuf_base = charbuf; \
ff0dacd7 1840 int from, to; \
df7492f9
KH
1841 int consumed_chars_limit; \
1842 int nbytes, nchars; \
1843 \
1844 ONE_MORE_BYTE (c); \
1845 nbytes = c - 0xA0; \
1846 if (nbytes < 3) \
1847 goto invalid_code; \
1848 ONE_MORE_BYTE (c); \
1849 nchars = c - 0xA0; \
ff0dacd7
KH
1850 from = coding->produced + char_offset; \
1851 to = from + nchars; \
1852 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
df7492f9
KH
1853 consumed_chars_limit = consumed_chars_base + nbytes; \
1854 if (method != COMPOSITION_RELATIVE) \
aa72b389 1855 { \
df7492f9
KH
1856 int i = 0; \
1857 while (consumed_chars < consumed_chars_limit) \
aa72b389 1858 { \
df7492f9 1859 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
781d7a48 1860 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
df7492f9
KH
1861 else \
1862 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
781d7a48 1863 i++; \
aa72b389 1864 } \
df7492f9
KH
1865 if (consumed_chars < consumed_chars_limit) \
1866 goto invalid_code; \
781d7a48 1867 charbuf_base[0] -= i; \
aa72b389
KH
1868 } \
1869 } while (0)
93dec019 1870
aa72b389 1871
df7492f9
KH
1872#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1873 do { \
1874 /* Emacs 20 style format for relative composition. */ \
1875 /* Store multibyte form of characters to be composed. */ \
ff0dacd7 1876 enum composition_method method = COMPOSITION_RELATIVE; \
df7492f9
KH
1877 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1878 int *buf = components; \
1879 int i, j; \
ff0dacd7 1880 int from, to; \
df7492f9
KH
1881 \
1882 src = src_base; \
1883 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1884 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1885 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1886 if (i < 2) \
1887 goto invalid_code; \
ff0dacd7
KH
1888 from = coding->produced_char + char_offset; \
1889 to = from + i; \
1890 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
df7492f9
KH
1891 for (j = 0; j < i; j++) \
1892 *charbuf++ = components[j]; \
1893 } while (0)
1894
1895
1896#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
1897 do { \
1898 /* Emacs 20 style format for rule-base composition. */ \
1899 /* Store multibyte form of characters to be composed. */ \
ff0dacd7 1900 enum composition_method method = COMPOSITION_WITH_RULE; \
df7492f9
KH
1901 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1902 int *buf = components; \
1903 int i, j; \
ff0dacd7 1904 int from, to; \
df7492f9
KH
1905 \
1906 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1907 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1908 { \
781d7a48 1909 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
df7492f9
KH
1910 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1911 } \
1912 if (i < 1 || (buf - components) % 2 == 0) \
1913 goto invalid_code; \
1914 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
1915 goto no_more_source; \
ff0dacd7
KH
1916 from = coding->produced_char + char_offset; \
1917 to = from + i; \
1918 ADD_COMPOSITION_DATA (buf, from, to, method); \
df7492f9
KH
1919 for (j = 0; j < i; j++) \
1920 *charbuf++ = components[j]; \
1921 for (j = 0; j < i; j += 2) \
1922 *charbuf++ = components[j]; \
1923 } while (0)
1924
aa72b389
KH
1925
1926static void
df7492f9 1927decode_coding_emacs_mule (coding)
aa72b389 1928 struct coding_system *coding;
aa72b389 1929{
8f924df7
KH
1930 const unsigned char *src = coding->source + coding->consumed;
1931 const unsigned char *src_end = coding->source + coding->src_bytes;
1932 const unsigned char *src_base;
df7492f9 1933 int *charbuf = coding->charbuf;
ff0dacd7 1934 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
df7492f9 1935 int consumed_chars = 0, consumed_chars_base;
df7492f9
KH
1936 int multibytep = coding->src_multibyte;
1937 Lisp_Object attrs, eol_type, charset_list;
ff0dacd7
KH
1938 int char_offset = coding->produced_char;
1939 int last_offset = char_offset;
1940 int last_id = charset_ascii;
aa72b389 1941
df7492f9 1942 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
aa72b389 1943
aa72b389
KH
1944 while (1)
1945 {
df7492f9
KH
1946 int c;
1947
aa72b389 1948 src_base = src;
df7492f9
KH
1949 consumed_chars_base = consumed_chars;
1950
1951 if (charbuf >= charbuf_end)
1952 break;
aa72b389 1953
df7492f9
KH
1954 ONE_MORE_BYTE (c);
1955
1956 if (c < 0x80)
aa72b389 1957 {
df7492f9
KH
1958 if (c == '\r')
1959 {
1960 if (EQ (eol_type, Qdos))
1961 {
1962 if (src == src_end)
98725083
KH
1963 {
1964 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
1965 goto no_more_source;
1966 }
df7492f9
KH
1967 if (*src == '\n')
1968 ONE_MORE_BYTE (c);
1969 }
1970 else if (EQ (eol_type, Qmac))
1971 c = '\n';
1972 }
1973 *charbuf++ = c;
1974 char_offset++;
aa72b389 1975 }
df7492f9
KH
1976 else if (c == 0x80)
1977 {
df7492f9 1978 ONE_MORE_BYTE (c);
781d7a48
KH
1979 if (c - 0xF2 >= COMPOSITION_RELATIVE
1980 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
df7492f9
KH
1981 DECODE_EMACS_MULE_21_COMPOSITION (c);
1982 else if (c < 0xC0)
1983 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
1984 else if (c == 0xFF)
1985 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
1986 else
1987 goto invalid_code;
1988 }
1989 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
1990 {
1991 int nbytes, nchars;
ff0dacd7
KH
1992 int id;
1993
781d7a48
KH
1994 src = src_base;
1995 consumed_chars = consumed_chars_base;
ff0dacd7 1996 c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
df7492f9
KH
1997 if (c < 0)
1998 {
1999 if (c == -2)
2000 break;
2001 goto invalid_code;
2002 }
ff0dacd7
KH
2003 if (last_id != id)
2004 {
2005 if (last_id != charset_ascii)
2006 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2007 last_id = id;
2008 last_offset = char_offset;
2009 }
df7492f9 2010 *charbuf++ = c;
781d7a48
KH
2011 src += nbytes;
2012 consumed_chars += nchars;
df7492f9
KH
2013 char_offset++;
2014 }
2015 continue;
2016
2017 invalid_code:
2018 src = src_base;
2019 consumed_chars = consumed_chars_base;
2020 ONE_MORE_BYTE (c);
2021 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 2022 char_offset++;
df7492f9
KH
2023 coding->errors++;
2024 }
2025
2026 no_more_source:
ff0dacd7
KH
2027 if (last_id != charset_ascii)
2028 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
df7492f9
KH
2029 coding->consumed_char += consumed_chars_base;
2030 coding->consumed = src_base - coding->source;
2031 coding->charbuf_used = charbuf - coding->charbuf;
2032}
2033
2034
2035#define EMACS_MULE_LEADING_CODES(id, codes) \
2036 do { \
2037 if (id < 0xA0) \
2038 codes[0] = id, codes[1] = 0; \
2039 else if (id < 0xE0) \
2040 codes[0] = 0x9A, codes[1] = id; \
2041 else if (id < 0xF0) \
2042 codes[0] = 0x9B, codes[1] = id; \
2043 else if (id < 0xF5) \
2044 codes[0] = 0x9C, codes[1] = id; \
2045 else \
2046 codes[0] = 0x9D, codes[1] = id; \
2047 } while (0);
2048
aa72b389 2049
df7492f9
KH
2050static int
2051encode_coding_emacs_mule (coding)
2052 struct coding_system *coding;
2053{
2054 int multibytep = coding->dst_multibyte;
2055 int *charbuf = coding->charbuf;
2056 int *charbuf_end = charbuf + coding->charbuf_used;
2057 unsigned char *dst = coding->destination + coding->produced;
2058 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2059 int safe_room = 8;
df7492f9
KH
2060 int produced_chars = 0;
2061 Lisp_Object attrs, eol_type, charset_list;
2062 int c;
ff0dacd7 2063 int preferred_charset_id = -1;
df7492f9
KH
2064
2065 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2066
2067 while (charbuf < charbuf_end)
2068 {
2069 ASSURE_DESTINATION (safe_room);
2070 c = *charbuf++;
ff0dacd7
KH
2071
2072 if (c < 0)
2073 {
2074 /* Handle an annotation. */
2075 switch (*charbuf)
2076 {
2077 case CODING_ANNOTATE_COMPOSITION_MASK:
2078 /* Not yet implemented. */
2079 break;
2080 case CODING_ANNOTATE_CHARSET_MASK:
2081 preferred_charset_id = charbuf[3];
2082 if (preferred_charset_id >= 0
2083 && NILP (Fmemq (make_number (preferred_charset_id),
2084 charset_list)))
2085 preferred_charset_id = -1;
2086 break;
2087 default:
2088 abort ();
2089 }
2090 charbuf += -c - 1;
2091 continue;
2092 }
2093
df7492f9
KH
2094 if (ASCII_CHAR_P (c))
2095 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
2096 else if (CHAR_BYTE8_P (c))
2097 {
2098 c = CHAR_TO_BYTE8 (c);
2099 EMIT_ONE_BYTE (c);
2100 }
df7492f9 2101 else
aa72b389 2102 {
df7492f9
KH
2103 struct charset *charset;
2104 unsigned code;
2105 int dimension;
2106 int emacs_mule_id;
2107 unsigned char leading_codes[2];
2108
ff0dacd7
KH
2109 if (preferred_charset_id >= 0)
2110 {
2111 charset = CHARSET_FROM_ID (preferred_charset_id);
2112 if (! CHAR_CHARSET_P (c, charset))
2113 charset = char_charset (c, charset_list, NULL);
2114 }
2115 else
2116 charset = char_charset (c, charset_list, &code);
df7492f9
KH
2117 if (! charset)
2118 {
2119 c = coding->default_char;
2120 if (ASCII_CHAR_P (c))
2121 {
2122 EMIT_ONE_ASCII_BYTE (c);
2123 continue;
2124 }
2125 charset = char_charset (c, charset_list, &code);
2126 }
2127 dimension = CHARSET_DIMENSION (charset);
2128 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2129 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2130 EMIT_ONE_BYTE (leading_codes[0]);
2131 if (leading_codes[1])
2132 EMIT_ONE_BYTE (leading_codes[1]);
2133 if (dimension == 1)
2134 EMIT_ONE_BYTE (code);
aa72b389 2135 else
df7492f9
KH
2136 {
2137 EMIT_ONE_BYTE (code >> 8);
2138 EMIT_ONE_BYTE (code & 0xFF);
2139 }
aa72b389 2140 }
aa72b389 2141 }
df7492f9
KH
2142 coding->result = CODING_RESULT_SUCCESS;
2143 coding->produced_char += produced_chars;
2144 coding->produced = dst - coding->destination;
2145 return 0;
aa72b389 2146}
b73bfc1c 2147
4ed46869 2148\f
df7492f9 2149/*** 7. ISO2022 handlers ***/
4ed46869
KH
2150
2151/* The following note describes the coding system ISO2022 briefly.
39787efd 2152 Since the intention of this note is to help understand the
5a936b46 2153 functions in this file, some parts are NOT ACCURATE or are OVERLY
39787efd 2154 SIMPLIFIED. For thorough understanding, please refer to the
5a936b46 2155 original document of ISO2022. This is equivalent to the standard
cfb43547 2156 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
4ed46869
KH
2157
2158 ISO2022 provides many mechanisms to encode several character sets
cfb43547 2159 in 7-bit and 8-bit environments. For 7-bit environments, all text
39787efd
KH
2160 is encoded using bytes less than 128. This may make the encoded
2161 text a little bit longer, but the text passes more easily through
cfb43547 2162 several types of gateway, some of which strip off the MSB (Most
8ca3766a 2163 Significant Bit).
b73bfc1c 2164
cfb43547
DL
2165 There are two kinds of character sets: control character sets and
2166 graphic character sets. The former contain control characters such
4ed46869 2167 as `newline' and `escape' to provide control functions (control
39787efd 2168 functions are also provided by escape sequences). The latter
cfb43547 2169 contain graphic characters such as 'A' and '-'. Emacs recognizes
4ed46869
KH
2170 two control character sets and many graphic character sets.
2171
2172 Graphic character sets are classified into one of the following
39787efd
KH
2173 four classes, according to the number of bytes (DIMENSION) and
2174 number of characters in one dimension (CHARS) of the set:
2175 - DIMENSION1_CHARS94
2176 - DIMENSION1_CHARS96
2177 - DIMENSION2_CHARS94
2178 - DIMENSION2_CHARS96
2179
2180 In addition, each character set is assigned an identification tag,
cfb43547 2181 unique for each set, called the "final character" (denoted as <F>
39787efd
KH
2182 hereafter). The <F> of each character set is decided by ECMA(*)
2183 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2184 (0x30..0x3F are for private use only).
4ed46869
KH
2185
2186 Note (*): ECMA = European Computer Manufacturers Association
2187
cfb43547 2188 Here are examples of graphic character sets [NAME(<F>)]:
4ed46869
KH
2189 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2190 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2191 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2192 o DIMENSION2_CHARS96 -- none for the moment
2193
39787efd 2194 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4ed46869
KH
2195 C0 [0x00..0x1F] -- control character plane 0
2196 GL [0x20..0x7F] -- graphic character plane 0
2197 C1 [0x80..0x9F] -- control character plane 1
2198 GR [0xA0..0xFF] -- graphic character plane 1
2199
2200 A control character set is directly designated and invoked to C0 or
39787efd
KH
2201 C1 by an escape sequence. The most common case is that:
2202 - ISO646's control character set is designated/invoked to C0, and
2203 - ISO6429's control character set is designated/invoked to C1,
2204 and usually these designations/invocations are omitted in encoded
2205 text. In a 7-bit environment, only C0 can be used, and a control
2206 character for C1 is encoded by an appropriate escape sequence to
2207 fit into the environment. All control characters for C1 are
2208 defined to have corresponding escape sequences.
4ed46869
KH
2209
2210 A graphic character set is at first designated to one of four
2211 graphic registers (G0 through G3), then these graphic registers are
2212 invoked to GL or GR. These designations and invocations can be
2213 done independently. The most common case is that G0 is invoked to
39787efd
KH
2214 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2215 these invocations and designations are omitted in encoded text.
2216 In a 7-bit environment, only GL can be used.
4ed46869 2217
39787efd
KH
2218 When a graphic character set of CHARS94 is invoked to GL, codes
2219 0x20 and 0x7F of the GL area work as control characters SPACE and
2220 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2221 be used.
4ed46869
KH
2222
2223 There are two ways of invocation: locking-shift and single-shift.
2224 With locking-shift, the invocation lasts until the next different
39787efd
KH
2225 invocation, whereas with single-shift, the invocation affects the
2226 following character only and doesn't affect the locking-shift
2227 state. Invocations are done by the following control characters or
2228 escape sequences:
4ed46869
KH
2229
2230 ----------------------------------------------------------------------
39787efd 2231 abbrev function cntrl escape seq description
4ed46869 2232 ----------------------------------------------------------------------
39787efd
KH
2233 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2234 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2235 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2236 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2237 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2238 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2239 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2240 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2241 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4ed46869 2242 ----------------------------------------------------------------------
39787efd
KH
2243 (*) These are not used by any known coding system.
2244
2245 Control characters for these functions are defined by macros
2246 ISO_CODE_XXX in `coding.h'.
4ed46869 2247
39787efd 2248 Designations are done by the following escape sequences:
4ed46869
KH
2249 ----------------------------------------------------------------------
2250 escape sequence description
2251 ----------------------------------------------------------------------
2252 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2253 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2254 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2255 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2256 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2257 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2258 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2259 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2260 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2261 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2262 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2263 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2264 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2265 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2266 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2267 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2268 ----------------------------------------------------------------------
2269
2270 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
39787efd 2271 of dimension 1, chars 94, and final character <F>, etc...
4ed46869
KH
2272
2273 Note (*): Although these designations are not allowed in ISO2022,
2274 Emacs accepts them on decoding, and produces them on encoding
39787efd 2275 CHARS96 character sets in a coding system which is characterized as
4ed46869
KH
2276 7-bit environment, non-locking-shift, and non-single-shift.
2277
2278 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
df7492f9 2279 '(' must be omitted. We refer to this as "short-form" hereafter.
4ed46869 2280
cfb43547 2281 Now you may notice that there are a lot of ways of encoding the
39787efd
KH
2282 same multilingual text in ISO2022. Actually, there exist many
2283 coding systems such as Compound Text (used in X11's inter client
8ca3766a
DL
2284 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2285 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
4ed46869
KH
2286 localized platforms), and all of these are variants of ISO2022.
2287
2288 In addition to the above, Emacs handles two more kinds of escape
2289 sequences: ISO6429's direction specification and Emacs' private
2290 sequence for specifying character composition.
2291
39787efd 2292 ISO6429's direction specification takes the following form:
4ed46869
KH
2293 o CSI ']' -- end of the current direction
2294 o CSI '0' ']' -- end of the current direction
2295 o CSI '1' ']' -- start of left-to-right text
2296 o CSI '2' ']' -- start of right-to-left text
2297 The control character CSI (0x9B: control sequence introducer) is
39787efd
KH
2298 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2299
2300 Character composition specification takes the following form:
ec6d2bb8
KH
2301 o ESC '0' -- start relative composition
2302 o ESC '1' -- end composition
2303 o ESC '2' -- start rule-base composition (*)
2304 o ESC '3' -- start relative composition with alternate chars (**)
2305 o ESC '4' -- start rule-base composition with alternate chars (**)
b73bfc1c 2306 Since these are not standard escape sequences of any ISO standard,
cfb43547 2307 the use of them with these meanings is restricted to Emacs only.
ec6d2bb8 2308
5a936b46
DL
2309 (*) This form is used only in Emacs 20.7 and older versions,
2310 but newer versions can safely decode it.
cfb43547 2311 (**) This form is used only in Emacs 21.1 and newer versions,
5a936b46 2312 and older versions can't decode it.
ec6d2bb8 2313
cfb43547 2314 Here's a list of example usages of these composition escape
b73bfc1c 2315 sequences (categorized by `enum composition_method').
ec6d2bb8 2316
b73bfc1c 2317 COMPOSITION_RELATIVE:
ec6d2bb8 2318 ESC 0 CHAR [ CHAR ] ESC 1
8ca3766a 2319 COMPOSITION_WITH_RULE:
ec6d2bb8 2320 ESC 2 CHAR [ RULE CHAR ] ESC 1
b73bfc1c 2321 COMPOSITION_WITH_ALTCHARS:
ec6d2bb8 2322 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
b73bfc1c 2323 COMPOSITION_WITH_RULE_ALTCHARS:
ec6d2bb8 2324 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
4ed46869
KH
2325
2326enum iso_code_class_type iso_code_class[256];
2327
df7492f9
KH
2328#define SAFE_CHARSET_P(coding, id) \
2329 ((id) <= (coding)->max_charset_id \
2330 && (coding)->safe_charsets[id] >= 0)
2331
2332
2333#define SHIFT_OUT_OK(category) \
2334 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2335
2336static void
f0064e1f
DL
2337setup_iso_safe_charsets (attrs)
2338 Lisp_Object attrs;
df7492f9
KH
2339{
2340 Lisp_Object charset_list, safe_charsets;
2341 Lisp_Object request;
2342 Lisp_Object reg_usage;
2343 Lisp_Object tail;
2344 int reg94, reg96;
2345 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2346 int max_charset_id;
2347
2348 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2349 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2350 && ! EQ (charset_list, Viso_2022_charset_list))
2351 {
2352 CODING_ATTR_CHARSET_LIST (attrs)
2353 = charset_list = Viso_2022_charset_list;
2354 ASET (attrs, coding_attr_safe_charsets, Qnil);
2355 }
2356
2357 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2358 return;
2359
2360 max_charset_id = 0;
2361 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2362 {
2363 int id = XINT (XCAR (tail));
2364 if (max_charset_id < id)
2365 max_charset_id = id;
2366 }
d46c5b12 2367
df7492f9
KH
2368 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2369 make_number (255));
2370 request = AREF (attrs, coding_attr_iso_request);
2371 reg_usage = AREF (attrs, coding_attr_iso_usage);
2372 reg94 = XINT (XCAR (reg_usage));
2373 reg96 = XINT (XCDR (reg_usage));
2374
2375 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2376 {
2377 Lisp_Object id;
2378 Lisp_Object reg;
2379 struct charset *charset;
2380
2381 id = XCAR (tail);
2382 charset = CHARSET_FROM_ID (XINT (id));
bf16eb23 2383 reg = Fcdr (Fassq (id, request));
df7492f9 2384 if (! NILP (reg))
8f924df7 2385 SSET (safe_charsets, XINT (id), XINT (reg));
df7492f9
KH
2386 else if (charset->iso_chars_96)
2387 {
2388 if (reg96 < 4)
8f924df7 2389 SSET (safe_charsets, XINT (id), reg96);
df7492f9
KH
2390 }
2391 else
2392 {
2393 if (reg94 < 4)
8f924df7 2394 SSET (safe_charsets, XINT (id), reg94);
df7492f9
KH
2395 }
2396 }
2397 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2398}
d46c5b12 2399
b6871cc7 2400
4ed46869 2401/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
2402 Check if a text is encoded in one of ISO-2022 based codig systems.
2403 If it is, return 1, else return 0. */
4ed46869 2404
0a28aafb 2405static int
ff0dacd7 2406detect_coding_iso_2022 (coding, detect_info)
df7492f9 2407 struct coding_system *coding;
ff0dacd7 2408 struct coding_detection_info *detect_info;
4ed46869 2409{
8f924df7
KH
2410 const unsigned char *src = coding->source, *src_base = src;
2411 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9 2412 int multibytep = coding->src_multibyte;
ff0dacd7 2413 int single_shifting = 0;
df7492f9
KH
2414 int id;
2415 int c, c1;
2416 int consumed_chars = 0;
2417 int i;
ff0dacd7
KH
2418 int rejected = 0;
2419 int found = 0;
2420
2421 detect_info->checked |= CATEGORY_MASK_ISO;
df7492f9
KH
2422
2423 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2424 {
2425 struct coding_system *this = &(coding_categories[i]);
2426 Lisp_Object attrs, val;
2427
2428 attrs = CODING_ID_ATTRS (this->id);
2429 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2430 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2431 setup_iso_safe_charsets (attrs);
2432 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7
KH
2433 this->max_charset_id = SCHARS (val) - 1;
2434 this->safe_charsets = (char *) SDATA (val);
df7492f9
KH
2435 }
2436
2437 /* A coding system of this category is always ASCII compatible. */
2438 src += coding->head_ascii;
3f003981 2439
ff0dacd7 2440 while (rejected != CATEGORY_MASK_ISO)
4ed46869 2441 {
df7492f9 2442 ONE_MORE_BYTE (c);
4ed46869
KH
2443 switch (c)
2444 {
2445 case ISO_CODE_ESC:
74383408
KH
2446 if (inhibit_iso_escape_detection)
2447 break;
f46869e4 2448 single_shifting = 0;
df7492f9 2449 ONE_MORE_BYTE (c);
d46c5b12 2450 if (c >= '(' && c <= '/')
4ed46869 2451 {
bf9cdd4e 2452 /* Designation sequence for a charset of dimension 1. */
df7492f9 2453 ONE_MORE_BYTE (c1);
d46c5b12 2454 if (c1 < ' ' || c1 >= 0x80
df7492f9 2455 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
d46c5b12
KH
2456 /* Invalid designation sequence. Just ignore. */
2457 break;
bf9cdd4e
KH
2458 }
2459 else if (c == '$')
2460 {
2461 /* Designation sequence for a charset of dimension 2. */
df7492f9 2462 ONE_MORE_BYTE (c);
bf9cdd4e
KH
2463 if (c >= '@' && c <= 'B')
2464 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
ff0dacd7 2465 id = iso_charset_table[1][0][c];
bf9cdd4e 2466 else if (c >= '(' && c <= '/')
bcf26d6a 2467 {
df7492f9 2468 ONE_MORE_BYTE (c1);
d46c5b12 2469 if (c1 < ' ' || c1 >= 0x80
df7492f9 2470 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
d46c5b12
KH
2471 /* Invalid designation sequence. Just ignore. */
2472 break;
bcf26d6a 2473 }
bf9cdd4e 2474 else
ff0dacd7 2475 /* Invalid designation sequence. Just ignore it. */
d46c5b12
KH
2476 break;
2477 }
ae9ff118 2478 else if (c == 'N' || c == 'O')
d46c5b12 2479 {
ae9ff118 2480 /* ESC <Fe> for SS2 or SS3. */
ff0dacd7
KH
2481 single_shifting = 1;
2482 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
d46c5b12 2483 break;
4ed46869 2484 }
ec6d2bb8
KH
2485 else if (c >= '0' && c <= '4')
2486 {
2487 /* ESC <Fp> for start/end composition. */
ff0dacd7 2488 found |= CATEGORY_MASK_ISO;
ec6d2bb8
KH
2489 break;
2490 }
bf9cdd4e 2491 else
df7492f9 2492 {
ff0dacd7 2493 /* Invalid escape sequence. Just ignore it. */
df7492f9
KH
2494 break;
2495 }
d46c5b12
KH
2496
2497 /* We found a valid designation sequence for CHARSET. */
ff0dacd7 2498 rejected |= CATEGORY_MASK_ISO_8BIT;
df7492f9
KH
2499 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2500 id))
ff0dacd7 2501 found |= CATEGORY_MASK_ISO_7;
d46c5b12 2502 else
ff0dacd7 2503 rejected |= CATEGORY_MASK_ISO_7;
df7492f9
KH
2504 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2505 id))
ff0dacd7 2506 found |= CATEGORY_MASK_ISO_7_TIGHT;
d46c5b12 2507 else
ff0dacd7 2508 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
df7492f9
KH
2509 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2510 id))
ff0dacd7 2511 found |= CATEGORY_MASK_ISO_7_ELSE;
ae9ff118 2512 else
ff0dacd7 2513 rejected |= CATEGORY_MASK_ISO_7_ELSE;
df7492f9
KH
2514 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2515 id))
ff0dacd7 2516 found |= CATEGORY_MASK_ISO_8_ELSE;
ae9ff118 2517 else
ff0dacd7 2518 rejected |= CATEGORY_MASK_ISO_8_ELSE;
4ed46869
KH
2519 break;
2520
4ed46869 2521 case ISO_CODE_SO:
d46c5b12 2522 case ISO_CODE_SI:
ff0dacd7 2523 /* Locking shift out/in. */
74383408
KH
2524 if (inhibit_iso_escape_detection)
2525 break;
f46869e4 2526 single_shifting = 0;
ff0dacd7
KH
2527 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2528 found |= CATEGORY_MASK_ISO_ELSE;
d46c5b12
KH
2529 break;
2530
4ed46869 2531 case ISO_CODE_CSI:
ff0dacd7 2532 /* Control sequence introducer. */
f46869e4 2533 single_shifting = 0;
ff0dacd7
KH
2534 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2535 found |= CATEGORY_MASK_ISO_8_ELSE;
2536 goto check_extra_latin;
2537
2538
4ed46869
KH
2539 case ISO_CODE_SS2:
2540 case ISO_CODE_SS3:
ff0dacd7
KH
2541 /* Single shift. */
2542 if (inhibit_iso_escape_detection)
2543 break;
2544 single_shifting = 1;
2545 rejected |= CATEGORY_MASK_ISO_7BIT;
2546 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2547 & CODING_ISO_FLAG_SINGLE_SHIFT)
2548 found |= CATEGORY_MASK_ISO_8_1;
2549 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2550 & CODING_ISO_FLAG_SINGLE_SHIFT)
2551 found |= CATEGORY_MASK_ISO_8_2;
2552 goto check_extra_latin;
4ed46869
KH
2553
2554 default:
2555 if (c < 0x80)
f46869e4
KH
2556 {
2557 single_shifting = 0;
2558 break;
2559 }
ff0dacd7 2560 if (c >= 0xA0)
c4825358 2561 {
ff0dacd7
KH
2562 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2563 found |= CATEGORY_MASK_ISO_8_1;
f46869e4 2564 /* Check the length of succeeding codes of the range
ff0dacd7
KH
2565 0xA0..0FF. If the byte length is even, we include
2566 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2567 only when we are not single shifting. */
2568 if (! single_shifting
2569 && ! (rejected & CATEGORY_MASK_ISO_8_2))
f46869e4 2570 {
e17de821 2571 int i = 1;
b73bfc1c
KH
2572 while (src < src_end)
2573 {
df7492f9 2574 ONE_MORE_BYTE (c);
b73bfc1c
KH
2575 if (c < 0xA0)
2576 break;
2577 i++;
2578 }
2579
2580 if (i & 1 && src < src_end)
ff0dacd7 2581 rejected |= CATEGORY_MASK_ISO_8_2;
f46869e4 2582 else
ff0dacd7 2583 found |= CATEGORY_MASK_ISO_8_2;
f46869e4 2584 }
ff0dacd7 2585 break;
4ed46869 2586 }
ff0dacd7
KH
2587 check_extra_latin:
2588 single_shifting = 0;
2589 if (! VECTORP (Vlatin_extra_code_table)
2590 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2591 {
2592 rejected = CATEGORY_MASK_ISO;
2593 break;
2594 }
2595 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2596 & CODING_ISO_FLAG_LATIN_EXTRA)
2597 found |= CATEGORY_MASK_ISO_8_1;
2598 else
2599 rejected |= CATEGORY_MASK_ISO_8_1;
2600 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2601 & CODING_ISO_FLAG_LATIN_EXTRA)
2602 found |= CATEGORY_MASK_ISO_8_2;
2603 else
2604 rejected |= CATEGORY_MASK_ISO_8_2;
4ed46869
KH
2605 }
2606 }
ff0dacd7
KH
2607 detect_info->rejected |= CATEGORY_MASK_ISO;
2608 return 0;
4ed46869 2609
df7492f9 2610 no_more_source:
ff0dacd7
KH
2611 detect_info->rejected |= rejected;
2612 detect_info->found |= (found & ~rejected);
df7492f9 2613 return 1;
4ed46869 2614}
ec6d2bb8 2615
4ed46869
KH
2616
2617/* Set designation state into CODING. */
df7492f9
KH
2618#define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2619 do { \
2620 int id, prev; \
2621 \
2622 if (final < '0' || final >= 128 \
2623 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2624 || !SAFE_CHARSET_P (coding, id)) \
2625 { \
2626 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2627 goto invalid_code; \
2628 } \
2629 prev = CODING_ISO_DESIGNATION (coding, reg); \
bf16eb23
KH
2630 if (id == charset_jisx0201_roman) \
2631 { \
2632 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2633 id = charset_ascii; \
2634 } \
2635 else if (id == charset_jisx0208_1978) \
2636 { \
2637 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2638 id = charset_jisx0208; \
2639 } \
df7492f9
KH
2640 CODING_ISO_DESIGNATION (coding, reg) = id; \
2641 /* If there was an invalid designation to REG previously, and this \
2642 designation is ASCII to REG, we should keep this designation \
2643 sequence. */ \
2644 if (prev == -2 && id == charset_ascii) \
2645 goto invalid_code; \
4ed46869
KH
2646 } while (0)
2647
d46c5b12 2648
df7492f9
KH
2649#define MAYBE_FINISH_COMPOSITION() \
2650 do { \
2651 int i; \
2652 if (composition_state == COMPOSING_NO) \
2653 break; \
2654 /* It is assured that we have enough room for producing \
2655 characters stored in the table `components'. */ \
2656 if (charbuf + component_idx > charbuf_end) \
2657 goto no_more_source; \
2658 composition_state = COMPOSING_NO; \
2659 if (method == COMPOSITION_RELATIVE \
2660 || method == COMPOSITION_WITH_ALTCHARS) \
2661 { \
2662 for (i = 0; i < component_idx; i++) \
2663 *charbuf++ = components[i]; \
2664 char_offset += component_idx; \
2665 } \
2666 else \
2667 { \
2668 for (i = 0; i < component_idx; i += 2) \
2669 *charbuf++ = components[i]; \
2670 char_offset += (component_idx / 2) + 1; \
2671 } \
2672 } while (0)
2673
d46c5b12 2674
aa72b389
KH
2675/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2676 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2677 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
df7492f9
KH
2678 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2679 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
aa72b389 2680 */
ec6d2bb8 2681
df7492f9
KH
2682#define DECODE_COMPOSITION_START(c1) \
2683 do { \
2684 if (c1 == '0' \
781d7a48 2685 && composition_state == COMPOSING_COMPONENT_RULE) \
df7492f9
KH
2686 { \
2687 component_len = component_idx; \
2688 composition_state = COMPOSING_CHAR; \
2689 } \
2690 else \
2691 { \
8f924df7 2692 const unsigned char *p; \
df7492f9
KH
2693 \
2694 MAYBE_FINISH_COMPOSITION (); \
2695 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2696 goto no_more_source; \
2697 for (p = src; p < src_end - 1; p++) \
2698 if (*p == ISO_CODE_ESC && p[1] == '1') \
2699 break; \
2700 if (p == src_end - 1) \
2701 { \
2702 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2703 goto invalid_code; \
2704 goto no_more_source; \
2705 } \
2706 \
2707 /* This is surely the start of a composition. */ \
2708 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2709 : c1 == '2' ? COMPOSITION_WITH_RULE \
2710 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2711 : COMPOSITION_WITH_RULE_ALTCHARS); \
2712 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2713 : COMPOSING_COMPONENT_CHAR); \
2714 component_idx = component_len = 0; \
2715 } \
ec6d2bb8
KH
2716 } while (0)
2717
ec6d2bb8 2718
df7492f9
KH
2719/* Handle compositoin end sequence ESC 1. */
2720
2721#define DECODE_COMPOSITION_END() \
ec6d2bb8 2722 do { \
df7492f9
KH
2723 int nchars = (component_len > 0 ? component_idx - component_len \
2724 : method == COMPOSITION_RELATIVE ? component_idx \
2725 : (component_idx + 1) / 2); \
2726 int i; \
2727 int *saved_charbuf = charbuf; \
8f924df7 2728 int from = char_offset; \
ff0dacd7 2729 int to = from + nchars; \
df7492f9 2730 \
ff0dacd7 2731 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
df7492f9 2732 if (method != COMPOSITION_RELATIVE) \
ec6d2bb8 2733 { \
df7492f9
KH
2734 if (component_len == 0) \
2735 for (i = 0; i < component_idx; i++) \
2736 *charbuf++ = components[i]; \
2737 else \
2738 for (i = 0; i < component_len; i++) \
2739 *charbuf++ = components[i]; \
2740 *saved_charbuf = saved_charbuf - charbuf; \
ec6d2bb8 2741 } \
df7492f9
KH
2742 if (method == COMPOSITION_WITH_RULE) \
2743 for (i = 0; i < component_idx; i += 2, char_offset++) \
2744 *charbuf++ = components[i]; \
ec6d2bb8 2745 else \
df7492f9
KH
2746 for (i = component_len; i < component_idx; i++, char_offset++) \
2747 *charbuf++ = components[i]; \
2748 coding->annotated = 1; \
2749 composition_state = COMPOSING_NO; \
ec6d2bb8
KH
2750 } while (0)
2751
df7492f9 2752
ec6d2bb8
KH
2753/* Decode a composition rule from the byte C1 (and maybe one more byte
2754 from SRC) and store one encoded composition rule in
2755 coding->cmp_data. */
2756
2757#define DECODE_COMPOSITION_RULE(c1) \
2758 do { \
ec6d2bb8
KH
2759 (c1) -= 32; \
2760 if (c1 < 81) /* old format (before ver.21) */ \
2761 { \
2762 int gref = (c1) / 9; \
2763 int nref = (c1) % 9; \
2764 if (gref == 4) gref = 10; \
2765 if (nref == 4) nref = 10; \
df7492f9 2766 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
ec6d2bb8 2767 } \
b73bfc1c 2768 else if (c1 < 93) /* new format (after ver.21) */ \
ec6d2bb8
KH
2769 { \
2770 ONE_MORE_BYTE (c2); \
df7492f9 2771 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
ec6d2bb8 2772 } \
df7492f9
KH
2773 else \
2774 c1 = 0; \
ec6d2bb8 2775 } while (0)
88993dfd 2776
d46c5b12 2777
4ed46869
KH
2778/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2779
b73bfc1c 2780static void
df7492f9 2781decode_coding_iso_2022 (coding)
4ed46869 2782 struct coding_system *coding;
4ed46869 2783{
8f924df7
KH
2784 const unsigned char *src = coding->source + coding->consumed;
2785 const unsigned char *src_end = coding->source + coding->src_bytes;
2786 const unsigned char *src_base;
df7492f9 2787 int *charbuf = coding->charbuf;
ff0dacd7
KH
2788 int *charbuf_end
2789 = charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
df7492f9 2790 int consumed_chars = 0, consumed_chars_base;
df7492f9 2791 int multibytep = coding->src_multibyte;
4ed46869 2792 /* Charsets invoked to graphic plane 0 and 1 respectively. */
df7492f9
KH
2793 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2794 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2795 struct charset *charset;
2796 int c;
2797 /* For handling composition sequence. */
2798#define COMPOSING_NO 0
2799#define COMPOSING_CHAR 1
2800#define COMPOSING_RULE 2
2801#define COMPOSING_COMPONENT_CHAR 3
2802#define COMPOSING_COMPONENT_RULE 4
2803
2804 int composition_state = COMPOSING_NO;
2805 enum composition_method method;
2806 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2807 int component_idx;
2808 int component_len;
2809 Lisp_Object attrs, eol_type, charset_list;
ff0dacd7
KH
2810 int char_offset = coding->produced_char;
2811 int last_offset = char_offset;
2812 int last_id = charset_ascii;
df7492f9
KH
2813
2814 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2815 setup_iso_safe_charsets (attrs);
b73bfc1c
KH
2816
2817 while (1)
4ed46869 2818 {
463f5630 2819 int c1, c2;
b73bfc1c
KH
2820
2821 src_base = src;
df7492f9
KH
2822 consumed_chars_base = consumed_chars;
2823
2824 if (charbuf >= charbuf_end)
2825 break;
2826
b73bfc1c 2827 ONE_MORE_BYTE (c1);
4ed46869 2828
98725083 2829 /* We produce at most one character. */
4ed46869
KH
2830 switch (iso_code_class [c1])
2831 {
2832 case ISO_0x20_or_0x7F:
df7492f9 2833 if (composition_state != COMPOSING_NO)
ec6d2bb8 2834 {
df7492f9
KH
2835 if (composition_state == COMPOSING_RULE
2836 || composition_state == COMPOSING_COMPONENT_RULE)
2837 {
2838 DECODE_COMPOSITION_RULE (c1);
2839 components[component_idx++] = c1;
2840 composition_state--;
2841 continue;
2842 }
4ed46869 2843 }
df7492f9
KH
2844 if (charset_id_0 < 0
2845 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
781d7a48
KH
2846 /* This is SPACE or DEL. */
2847 charset = CHARSET_FROM_ID (charset_ascii);
2848 else
2849 charset = CHARSET_FROM_ID (charset_id_0);
2850 break;
4ed46869
KH
2851
2852 case ISO_graphic_plane_0:
781d7a48 2853 if (composition_state != COMPOSING_NO)
b73bfc1c 2854 {
781d7a48
KH
2855 if (composition_state == COMPOSING_RULE
2856 || composition_state == COMPOSING_COMPONENT_RULE)
2857 {
2858 DECODE_COMPOSITION_RULE (c1);
2859 components[component_idx++] = c1;
2860 composition_state--;
2861 continue;
2862 }
b73bfc1c 2863 }
df7492f9 2864 charset = CHARSET_FROM_ID (charset_id_0);
4ed46869
KH
2865 break;
2866
2867 case ISO_0xA0_or_0xFF:
df7492f9
KH
2868 if (charset_id_1 < 0
2869 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2870 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2871 goto invalid_code;
4ed46869
KH
2872 /* This is a graphic character, we fall down ... */
2873
2874 case ISO_graphic_plane_1:
df7492f9
KH
2875 if (charset_id_1 < 0)
2876 goto invalid_code;
2877 charset = CHARSET_FROM_ID (charset_id_1);
4ed46869
KH
2878 break;
2879
2880 case ISO_carriage_return:
df7492f9 2881 if (c1 == '\r')
4ed46869 2882 {
df7492f9 2883 if (EQ (eol_type, Qdos))
4ed46869 2884 {
df7492f9 2885 if (src == src_end)
98725083
KH
2886 {
2887 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
2888 goto no_more_source;
8f924df7 2889 }
df7492f9
KH
2890 if (*src == '\n')
2891 ONE_MORE_BYTE (c1);
4ed46869 2892 }
df7492f9
KH
2893 else if (EQ (eol_type, Qmac))
2894 c1 = '\n';
4ed46869 2895 }
df7492f9
KH
2896 /* fall through */
2897
2898 case ISO_control_0:
2899 MAYBE_FINISH_COMPOSITION ();
2900 charset = CHARSET_FROM_ID (charset_ascii);
4ed46869
KH
2901 break;
2902
df7492f9
KH
2903 case ISO_control_1:
2904 MAYBE_FINISH_COMPOSITION ();
2905 goto invalid_code;
2906
4ed46869 2907 case ISO_shift_out:
df7492f9
KH
2908 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2909 || CODING_ISO_DESIGNATION (coding, 1) < 0)
2910 goto invalid_code;
2911 CODING_ISO_INVOCATION (coding, 0) = 1;
2912 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 2913 continue;
4ed46869
KH
2914
2915 case ISO_shift_in:
df7492f9
KH
2916 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
2917 goto invalid_code;
2918 CODING_ISO_INVOCATION (coding, 0) = 0;
2919 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 2920 continue;
4ed46869
KH
2921
2922 case ISO_single_shift_2_7:
2923 case ISO_single_shift_2:
df7492f9
KH
2924 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2925 goto invalid_code;
4ed46869
KH
2926 /* SS2 is handled as an escape sequence of ESC 'N' */
2927 c1 = 'N';
2928 goto label_escape_sequence;
2929
2930 case ISO_single_shift_3:
df7492f9
KH
2931 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2932 goto invalid_code;
4ed46869
KH
2933 /* SS2 is handled as an escape sequence of ESC 'O' */
2934 c1 = 'O';
2935 goto label_escape_sequence;
2936
2937 case ISO_control_sequence_introducer:
2938 /* CSI is handled as an escape sequence of ESC '[' ... */
2939 c1 = '[';
2940 goto label_escape_sequence;
2941
2942 case ISO_escape:
2943 ONE_MORE_BYTE (c1);
2944 label_escape_sequence:
df7492f9 2945 /* Escape sequences handled here are invocation,
4ed46869
KH
2946 designation, direction specification, and character
2947 composition specification. */
2948 switch (c1)
2949 {
2950 case '&': /* revision of following character set */
2951 ONE_MORE_BYTE (c1);
2952 if (!(c1 >= '@' && c1 <= '~'))
df7492f9 2953 goto invalid_code;
4ed46869
KH
2954 ONE_MORE_BYTE (c1);
2955 if (c1 != ISO_CODE_ESC)
df7492f9 2956 goto invalid_code;
4ed46869
KH
2957 ONE_MORE_BYTE (c1);
2958 goto label_escape_sequence;
2959
2960 case '$': /* designation of 2-byte character set */
df7492f9
KH
2961 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
2962 goto invalid_code;
4ed46869
KH
2963 ONE_MORE_BYTE (c1);
2964 if (c1 >= '@' && c1 <= 'B')
2965 { /* designation of JISX0208.1978, GB2312.1980,
88993dfd 2966 or JISX0208.1980 */
df7492f9 2967 DECODE_DESIGNATION (0, 2, 0, c1);
4ed46869
KH
2968 }
2969 else if (c1 >= 0x28 && c1 <= 0x2B)
2970 { /* designation of DIMENSION2_CHARS94 character set */
2971 ONE_MORE_BYTE (c2);
df7492f9 2972 DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2);
4ed46869
KH
2973 }
2974 else if (c1 >= 0x2C && c1 <= 0x2F)
2975 { /* designation of DIMENSION2_CHARS96 character set */
2976 ONE_MORE_BYTE (c2);
df7492f9 2977 DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2);
4ed46869
KH
2978 }
2979 else
df7492f9 2980 goto invalid_code;
b73bfc1c 2981 /* We must update these variables now. */
df7492f9
KH
2982 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2983 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
b73bfc1c 2984 continue;
4ed46869
KH
2985
2986 case 'n': /* invocation of locking-shift-2 */
df7492f9
KH
2987 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2988 || CODING_ISO_DESIGNATION (coding, 2) < 0)
2989 goto invalid_code;
2990 CODING_ISO_INVOCATION (coding, 0) = 2;
2991 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 2992 continue;
4ed46869
KH
2993
2994 case 'o': /* invocation of locking-shift-3 */
df7492f9
KH
2995 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2996 || CODING_ISO_DESIGNATION (coding, 3) < 0)
2997 goto invalid_code;
2998 CODING_ISO_INVOCATION (coding, 0) = 3;
2999 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3000 continue;
4ed46869
KH
3001
3002 case 'N': /* invocation of single-shift-2 */
df7492f9
KH
3003 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3004 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3005 goto invalid_code;
3006 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2));
b73bfc1c 3007 ONE_MORE_BYTE (c1);
e7046a18 3008 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
df7492f9 3009 goto invalid_code;
4ed46869
KH
3010 break;
3011
3012 case 'O': /* invocation of single-shift-3 */
df7492f9
KH
3013 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3014 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3015 goto invalid_code;
3016 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3));
b73bfc1c 3017 ONE_MORE_BYTE (c1);
e7046a18 3018 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
df7492f9 3019 goto invalid_code;
4ed46869
KH
3020 break;
3021
ec6d2bb8 3022 case '0': case '2': case '3': case '4': /* start composition */
df7492f9
KH
3023 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3024 goto invalid_code;
ec6d2bb8 3025 DECODE_COMPOSITION_START (c1);
b73bfc1c 3026 continue;
4ed46869 3027
ec6d2bb8 3028 case '1': /* end composition */
df7492f9
KH
3029 if (composition_state == COMPOSING_NO)
3030 goto invalid_code;
3031 DECODE_COMPOSITION_END ();
b73bfc1c 3032 continue;
4ed46869
KH
3033
3034 case '[': /* specification of direction */
df7492f9
KH
3035 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3036 goto invalid_code;
4ed46869 3037 /* For the moment, nested direction is not supported.
d46c5b12 3038 So, `coding->mode & CODING_MODE_DIRECTION' zero means
df7492f9 3039 left-to-right, and nozero means right-to-left. */
4ed46869
KH
3040 ONE_MORE_BYTE (c1);
3041 switch (c1)
3042 {
3043 case ']': /* end of the current direction */
d46c5b12 3044 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869
KH
3045
3046 case '0': /* end of the current direction */
3047 case '1': /* start of left-to-right direction */
3048 ONE_MORE_BYTE (c1);
3049 if (c1 == ']')
d46c5b12 3050 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869 3051 else
df7492f9 3052 goto invalid_code;
4ed46869
KH
3053 break;
3054
3055 case '2': /* start of right-to-left direction */
3056 ONE_MORE_BYTE (c1);
3057 if (c1 == ']')
d46c5b12 3058 coding->mode |= CODING_MODE_DIRECTION;
4ed46869 3059 else
df7492f9 3060 goto invalid_code;
4ed46869
KH
3061 break;
3062
3063 default:
df7492f9 3064 goto invalid_code;
4ed46869 3065 }
b73bfc1c 3066 continue;
4ed46869 3067
103e0180 3068 case '%':
103e0180
KH
3069 ONE_MORE_BYTE (c1);
3070 if (c1 == '/')
3071 {
3072 /* CTEXT extended segment:
3073 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3074 We keep these bytes as is for the moment.
3075 They may be decoded by post-read-conversion. */
3076 int dim, M, L;
4776e638 3077 int size;
8f924df7 3078
103e0180
KH
3079 ONE_MORE_BYTE (dim);
3080 ONE_MORE_BYTE (M);
3081 ONE_MORE_BYTE (L);
3082 size = ((M - 128) * 128) + (L - 128);
4776e638
KH
3083 if (charbuf + 8 + size > charbuf_end)
3084 goto break_loop;
3085 *charbuf++ = ISO_CODE_ESC;
3086 *charbuf++ = '%';
3087 *charbuf++ = '/';
3088 *charbuf++ = dim;
3089 *charbuf++ = BYTE8_TO_CHAR (M);
3090 *charbuf++ = BYTE8_TO_CHAR (L);
103e0180
KH
3091 while (size-- > 0)
3092 {
3093 ONE_MORE_BYTE (c1);
4776e638 3094 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
103e0180 3095 }
103e0180
KH
3096 }
3097 else if (c1 == 'G')
3098 {
103e0180
KH
3099 /* XFree86 extension for embedding UTF-8 in CTEXT:
3100 ESC % G --UTF-8-BYTES-- ESC % @
3101 We keep these bytes as is for the moment.
3102 They may be decoded by post-read-conversion. */
4776e638
KH
3103 int *p = charbuf;
3104
3105 if (p + 6 > charbuf_end)
3106 goto break_loop;
3107 *p++ = ISO_CODE_ESC;
3108 *p++ = '%';
3109 *p++ = 'G';
3110 while (p < charbuf_end)
103e0180
KH
3111 {
3112 ONE_MORE_BYTE (c1);
3113 if (c1 == ISO_CODE_ESC
3114 && src + 1 < src_end
3115 && src[0] == '%'
3116 && src[1] == '@')
3117 break;
4776e638 3118 *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
103e0180 3119 }
4776e638
KH
3120 if (p + 3 > charbuf_end)
3121 goto break_loop;
3122 *p++ = ISO_CODE_ESC;
3123 *p++ = '%';
3124 *p++ = '@';
3125 charbuf = p;
103e0180
KH
3126 }
3127 else
4776e638 3128 goto invalid_code;
103e0180 3129 continue;
4776e638 3130 break;
103e0180 3131
4ed46869 3132 default:
df7492f9
KH
3133 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3134 goto invalid_code;
4ed46869
KH
3135 if (c1 >= 0x28 && c1 <= 0x2B)
3136 { /* designation of DIMENSION1_CHARS94 character set */
3137 ONE_MORE_BYTE (c2);
df7492f9 3138 DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2);
4ed46869
KH
3139 }
3140 else if (c1 >= 0x2C && c1 <= 0x2F)
3141 { /* designation of DIMENSION1_CHARS96 character set */
3142 ONE_MORE_BYTE (c2);
df7492f9 3143 DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2);
4ed46869
KH
3144 }
3145 else
df7492f9 3146 goto invalid_code;
b73bfc1c 3147 /* We must update these variables now. */
df7492f9
KH
3148 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3149 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
b73bfc1c 3150 continue;
4ed46869 3151 }
b73bfc1c 3152 }
4ed46869 3153
ff0dacd7
KH
3154 if (charset->id != charset_ascii
3155 && last_id != charset->id)
3156 {
3157 if (last_id != charset_ascii)
3158 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3159 last_id = charset->id;
3160 last_offset = char_offset;
3161 }
3162
b73bfc1c 3163 /* Now we know CHARSET and 1st position code C1 of a character.
df7492f9
KH
3164 Produce a decoded character while getting 2nd position code
3165 C2 if necessary. */
3166 c1 &= 0x7F;
3167 if (CHARSET_DIMENSION (charset) > 1)
b73bfc1c
KH
3168 {
3169 ONE_MORE_BYTE (c2);
df7492f9 3170 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
b73bfc1c 3171 /* C2 is not in a valid range. */
df7492f9
KH
3172 goto invalid_code;
3173 c1 = (c1 << 8) | (c2 & 0x7F);
3174 if (CHARSET_DIMENSION (charset) > 2)
3175 {
3176 ONE_MORE_BYTE (c2);
3177 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3178 /* C2 is not in a valid range. */
3179 goto invalid_code;
3180 c1 = (c1 << 8) | (c2 & 0x7F);
3181 }
3182 }
3183
3184 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3185 if (c < 0)
3186 {
3187 MAYBE_FINISH_COMPOSITION ();
3188 for (; src_base < src; src_base++, char_offset++)
3189 {
3190 if (ASCII_BYTE_P (*src_base))
3191 *charbuf++ = *src_base;
3192 else
3193 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3194 }
3195 }
3196 else if (composition_state == COMPOSING_NO)
3197 {
3198 *charbuf++ = c;
3199 char_offset++;
4ed46869 3200 }
df7492f9 3201 else
781d7a48
KH
3202 {
3203 components[component_idx++] = c;
3204 if (method == COMPOSITION_WITH_RULE
3205 || (method == COMPOSITION_WITH_RULE_ALTCHARS
3206 && composition_state == COMPOSING_COMPONENT_CHAR))
3207 composition_state++;
4ed46869
KH
3208 }
3209 continue;
3210
df7492f9
KH
3211 invalid_code:
3212 MAYBE_FINISH_COMPOSITION ();
4ed46869 3213 src = src_base;
df7492f9
KH
3214 consumed_chars = consumed_chars_base;
3215 ONE_MORE_BYTE (c);
3216 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 3217 char_offset++;
df7492f9 3218 coding->errors++;
4776e638
KH
3219 continue;
3220
3221 break_loop:
3222 break;
4ed46869 3223 }
fb88bf2d 3224
df7492f9 3225 no_more_source:
ff0dacd7
KH
3226 if (last_id != charset_ascii)
3227 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
df7492f9
KH
3228 coding->consumed_char += consumed_chars_base;
3229 coding->consumed = src_base - coding->source;
3230 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
3231}
3232
b73bfc1c 3233
f4dee582 3234/* ISO2022 encoding stuff. */
4ed46869
KH
3235
3236/*
f4dee582 3237 It is not enough to say just "ISO2022" on encoding, we have to
df7492f9 3238 specify more details. In Emacs, each coding system of ISO2022
4ed46869 3239 variant has the following specifications:
df7492f9 3240 1. Initial designation to G0 thru G3.
4ed46869
KH
3241 2. Allows short-form designation?
3242 3. ASCII should be designated to G0 before control characters?
3243 4. ASCII should be designated to G0 at end of line?
3244 5. 7-bit environment or 8-bit environment?
3245 6. Use locking-shift?
3246 7. Use Single-shift?
3247 And the following two are only for Japanese:
3248 8. Use ASCII in place of JIS0201-1976-Roman?
3249 9. Use JISX0208-1983 in place of JISX0208-1978?
df7492f9
KH
3250 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3251 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
f4dee582 3252 details.
4ed46869
KH
3253*/
3254
3255/* Produce codes (escape sequence) for designating CHARSET to graphic
b73bfc1c
KH
3256 register REG at DST, and increment DST. If <final-char> of CHARSET is
3257 '@', 'A', or 'B' and the coding system CODING allows, produce
3258 designation sequence of short-form. */
4ed46869
KH
3259
3260#define ENCODE_DESIGNATION(charset, reg, coding) \
3261 do { \
df7492f9 3262 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
4ed46869
KH
3263 char *intermediate_char_94 = "()*+"; \
3264 char *intermediate_char_96 = ",-./"; \
df7492f9
KH
3265 int revision = -1; \
3266 int c; \
3267 \
3268 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
c197f191 3269 revision = CHARSET_ISO_REVISION (charset); \
df7492f9
KH
3270 \
3271 if (revision >= 0) \
70c22245 3272 { \
df7492f9
KH
3273 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3274 EMIT_ONE_BYTE ('@' + revision); \
4ed46869 3275 } \
df7492f9 3276 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
4ed46869
KH
3277 if (CHARSET_DIMENSION (charset) == 1) \
3278 { \
df7492f9
KH
3279 if (! CHARSET_ISO_CHARS_96 (charset)) \
3280 c = intermediate_char_94[reg]; \
4ed46869 3281 else \
df7492f9
KH
3282 c = intermediate_char_96[reg]; \
3283 EMIT_ONE_ASCII_BYTE (c); \
4ed46869
KH
3284 } \
3285 else \
3286 { \
df7492f9
KH
3287 EMIT_ONE_ASCII_BYTE ('$'); \
3288 if (! CHARSET_ISO_CHARS_96 (charset)) \
4ed46869 3289 { \
df7492f9 3290 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
b73bfc1c
KH
3291 || reg != 0 \
3292 || final_char < '@' || final_char > 'B') \
df7492f9 3293 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
4ed46869
KH
3294 } \
3295 else \
df7492f9 3296 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
4ed46869 3297 } \
df7492f9
KH
3298 EMIT_ONE_ASCII_BYTE (final_char); \
3299 \
3300 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
4ed46869
KH
3301 } while (0)
3302
df7492f9 3303
4ed46869
KH
3304/* The following two macros produce codes (control character or escape
3305 sequence) for ISO2022 single-shift functions (single-shift-2 and
3306 single-shift-3). */
3307
df7492f9
KH
3308#define ENCODE_SINGLE_SHIFT_2 \
3309 do { \
3310 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3311 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3312 else \
3313 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3314 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4ed46869
KH
3315 } while (0)
3316
df7492f9
KH
3317
3318#define ENCODE_SINGLE_SHIFT_3 \
3319 do { \
3320 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3321 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3322 else \
3323 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3324 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
4ed46869
KH
3325 } while (0)
3326
df7492f9 3327
4ed46869
KH
3328/* The following four macros produce codes (control character or
3329 escape sequence) for ISO2022 locking-shift functions (shift-in,
3330 shift-out, locking-shift-2, and locking-shift-3). */
3331
df7492f9
KH
3332#define ENCODE_SHIFT_IN \
3333 do { \
3334 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3335 CODING_ISO_INVOCATION (coding, 0) = 0; \
4ed46869
KH
3336 } while (0)
3337
df7492f9
KH
3338
3339#define ENCODE_SHIFT_OUT \
3340 do { \
3341 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3342 CODING_ISO_INVOCATION (coding, 0) = 1; \
4ed46869
KH
3343 } while (0)
3344
df7492f9
KH
3345
3346#define ENCODE_LOCKING_SHIFT_2 \
3347 do { \
3348 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3349 CODING_ISO_INVOCATION (coding, 0) = 2; \
4ed46869
KH
3350 } while (0)
3351
df7492f9
KH
3352
3353#define ENCODE_LOCKING_SHIFT_3 \
3354 do { \
3355 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3356 CODING_ISO_INVOCATION (coding, 0) = 3; \
4ed46869
KH
3357 } while (0)
3358
df7492f9 3359
f4dee582
RS
3360/* Produce codes for a DIMENSION1 character whose character set is
3361 CHARSET and whose position-code is C1. Designation and invocation
4ed46869
KH
3362 sequences are also produced in advance if necessary. */
3363
6e85d753
KH
3364#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3365 do { \
df7492f9 3366 int id = CHARSET_ID (charset); \
bf16eb23
KH
3367 \
3368 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3369 && id == charset_ascii) \
3370 { \
3371 id = charset_jisx0201_roman; \
3372 charset = CHARSET_FROM_ID (id); \
3373 } \
3374 \
df7492f9 3375 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
6e85d753 3376 { \
df7492f9
KH
3377 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3378 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
6e85d753 3379 else \
df7492f9
KH
3380 EMIT_ONE_BYTE (c1 | 0x80); \
3381 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
6e85d753
KH
3382 break; \
3383 } \
df7492f9 3384 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
6e85d753 3385 { \
df7492f9 3386 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
6e85d753
KH
3387 break; \
3388 } \
df7492f9 3389 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
6e85d753 3390 { \
df7492f9 3391 EMIT_ONE_BYTE (c1 | 0x80); \
6e85d753
KH
3392 break; \
3393 } \
6e85d753
KH
3394 else \
3395 /* Since CHARSET is not yet invoked to any graphic planes, we \
3396 must invoke it, or, at first, designate it to some graphic \
3397 register. Then repeat the loop to actually produce the \
3398 character. */ \
df7492f9
KH
3399 dst = encode_invocation_designation (charset, coding, dst, \
3400 &produced_chars); \
4ed46869
KH
3401 } while (1)
3402
df7492f9 3403
f4dee582
RS
3404/* Produce codes for a DIMENSION2 character whose character set is
3405 CHARSET and whose position-codes are C1 and C2. Designation and
4ed46869
KH
3406 invocation codes are also produced in advance if necessary. */
3407
6e85d753
KH
3408#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3409 do { \
df7492f9 3410 int id = CHARSET_ID (charset); \
bf16eb23
KH
3411 \
3412 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3413 && id == charset_jisx0208) \
3414 { \
3415 id = charset_jisx0208_1978; \
3416 charset = CHARSET_FROM_ID (id); \
3417 } \
3418 \
df7492f9 3419 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
6e85d753 3420 { \
df7492f9
KH
3421 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3422 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
6e85d753 3423 else \
df7492f9
KH
3424 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3425 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
6e85d753
KH
3426 break; \
3427 } \
df7492f9 3428 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
6e85d753 3429 { \
df7492f9 3430 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
6e85d753
KH
3431 break; \
3432 } \
df7492f9 3433 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
6e85d753 3434 { \
df7492f9 3435 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
6e85d753
KH
3436 break; \
3437 } \
6e85d753
KH
3438 else \
3439 /* Since CHARSET is not yet invoked to any graphic planes, we \
3440 must invoke it, or, at first, designate it to some graphic \
3441 register. Then repeat the loop to actually produce the \
3442 character. */ \
df7492f9
KH
3443 dst = encode_invocation_designation (charset, coding, dst, \
3444 &produced_chars); \
4ed46869
KH
3445 } while (1)
3446
05e6f5dc 3447
df7492f9
KH
3448#define ENCODE_ISO_CHARACTER(charset, c) \
3449 do { \
3450 int code = ENCODE_CHAR ((charset),(c)); \
3451 \
3452 if (CHARSET_DIMENSION (charset) == 1) \
3453 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3454 else \
3455 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
84fbb8a0 3456 } while (0)
bdd9fb48 3457
05e6f5dc 3458
4ed46869 3459/* Produce designation and invocation codes at a place pointed by DST
df7492f9 3460 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
4ed46869
KH
3461 Return new DST. */
3462
3463unsigned char *
df7492f9
KH
3464encode_invocation_designation (charset, coding, dst, p_nchars)
3465 struct charset *charset;
4ed46869
KH
3466 struct coding_system *coding;
3467 unsigned char *dst;
df7492f9 3468 int *p_nchars;
4ed46869 3469{
df7492f9
KH
3470 int multibytep = coding->dst_multibyte;
3471 int produced_chars = *p_nchars;
4ed46869 3472 int reg; /* graphic register number */
df7492f9 3473 int id = CHARSET_ID (charset);
4ed46869
KH
3474
3475 /* At first, check designations. */
3476 for (reg = 0; reg < 4; reg++)
df7492f9 3477 if (id == CODING_ISO_DESIGNATION (coding, reg))
4ed46869
KH
3478 break;
3479
3480 if (reg >= 4)
3481 {
3482 /* CHARSET is not yet designated to any graphic registers. */
3483 /* At first check the requested designation. */
df7492f9
KH
3484 reg = CODING_ISO_REQUEST (coding, id);
3485 if (reg < 0)
1ba9e4ab
KH
3486 /* Since CHARSET requests no special designation, designate it
3487 to graphic register 0. */
4ed46869
KH
3488 reg = 0;
3489
3490 ENCODE_DESIGNATION (charset, reg, coding);
3491 }
3492
df7492f9
KH
3493 if (CODING_ISO_INVOCATION (coding, 0) != reg
3494 && CODING_ISO_INVOCATION (coding, 1) != reg)
4ed46869
KH
3495 {
3496 /* Since the graphic register REG is not invoked to any graphic
3497 planes, invoke it to graphic plane 0. */
3498 switch (reg)
3499 {
3500 case 0: /* graphic register 0 */
3501 ENCODE_SHIFT_IN;
3502 break;
3503
3504 case 1: /* graphic register 1 */
3505 ENCODE_SHIFT_OUT;
3506 break;
3507
3508 case 2: /* graphic register 2 */
df7492f9 3509 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4ed46869
KH
3510 ENCODE_SINGLE_SHIFT_2;
3511 else
3512 ENCODE_LOCKING_SHIFT_2;
3513 break;
3514
3515 case 3: /* graphic register 3 */
df7492f9 3516 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4ed46869
KH
3517 ENCODE_SINGLE_SHIFT_3;
3518 else
3519 ENCODE_LOCKING_SHIFT_3;
3520 break;
3521 }
3522 }
b73bfc1c 3523
df7492f9 3524 *p_nchars = produced_chars;
4ed46869
KH
3525 return dst;
3526}
3527
df7492f9
KH
3528/* The following three macros produce codes for indicating direction
3529 of text. */
3530#define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
ec6d2bb8 3531 do { \
df7492f9
KH
3532 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
3533 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
ec6d2bb8 3534 else \
df7492f9 3535 EMIT_ONE_BYTE (ISO_CODE_CSI); \
ec6d2bb8
KH
3536 } while (0)
3537
ec6d2bb8 3538
df7492f9
KH
3539#define ENCODE_DIRECTION_R2L() \
3540 do { \
3541 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3542 EMIT_TWO_ASCII_BYTES ('2', ']'); \
ec6d2bb8
KH
3543 } while (0)
3544
ec6d2bb8 3545
df7492f9 3546#define ENCODE_DIRECTION_L2R() \
ec6d2bb8 3547 do { \
df7492f9
KH
3548 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3549 EMIT_TWO_ASCII_BYTES ('0', ']'); \
ec6d2bb8 3550 } while (0)
4ed46869 3551
4ed46869
KH
3552
3553/* Produce codes for designation and invocation to reset the graphic
3554 planes and registers to initial state. */
df7492f9
KH
3555#define ENCODE_RESET_PLANE_AND_REGISTER() \
3556 do { \
3557 int reg; \
3558 struct charset *charset; \
3559 \
3560 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3561 ENCODE_SHIFT_IN; \
3562 for (reg = 0; reg < 4; reg++) \
3563 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3564 && (CODING_ISO_DESIGNATION (coding, reg) \
3565 != CODING_ISO_INITIAL (coding, reg))) \
3566 { \
3567 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3568 ENCODE_DESIGNATION (charset, reg, coding); \
3569 } \
4ed46869
KH
3570 } while (0)
3571
df7492f9 3572
bdd9fb48 3573/* Produce designation sequences of charsets in the line started from
b73bfc1c 3574 SRC to a place pointed by DST, and return updated DST.
bdd9fb48
KH
3575
3576 If the current block ends before any end-of-line, we may fail to
d46c5b12
KH
3577 find all the necessary designations. */
3578
b73bfc1c 3579static unsigned char *
df7492f9 3580encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
e0e989f6 3581 struct coding_system *coding;
df7492f9
KH
3582 int *charbuf, *charbuf_end;
3583 unsigned char *dst;
e0e989f6 3584{
df7492f9 3585 struct charset *charset;
bdd9fb48
KH
3586 /* Table of charsets to be designated to each graphic register. */
3587 int r[4];
df7492f9
KH
3588 int c, found = 0, reg;
3589 int produced_chars = 0;
3590 int multibytep = coding->dst_multibyte;
3591 Lisp_Object attrs;
3592 Lisp_Object charset_list;
3593
3594 attrs = CODING_ID_ATTRS (coding->id);
3595 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3596 if (EQ (charset_list, Qiso_2022))
3597 charset_list = Viso_2022_charset_list;
bdd9fb48
KH
3598
3599 for (reg = 0; reg < 4; reg++)
3600 r[reg] = -1;
3601
b73bfc1c 3602 while (found < 4)
e0e989f6 3603 {
df7492f9
KH
3604 int id;
3605
3606 c = *charbuf++;
b73bfc1c
KH
3607 if (c == '\n')
3608 break;
df7492f9
KH
3609 charset = char_charset (c, charset_list, NULL);
3610 id = CHARSET_ID (charset);
3611 reg = CODING_ISO_REQUEST (coding, id);
3612 if (reg >= 0 && r[reg] < 0)
bdd9fb48
KH
3613 {
3614 found++;
df7492f9 3615 r[reg] = id;
bdd9fb48 3616 }
bdd9fb48
KH
3617 }
3618
3619 if (found)
3620 {
3621 for (reg = 0; reg < 4; reg++)
3622 if (r[reg] >= 0
df7492f9
KH
3623 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3624 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
e0e989f6 3625 }
b73bfc1c
KH
3626
3627 return dst;
e0e989f6
KH
3628}
3629
4ed46869
KH
3630/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3631
df7492f9
KH
3632static int
3633encode_coding_iso_2022 (coding)
4ed46869 3634 struct coding_system *coding;
4ed46869 3635{
df7492f9
KH
3636 int multibytep = coding->dst_multibyte;
3637 int *charbuf = coding->charbuf;
3638 int *charbuf_end = charbuf + coding->charbuf_used;
3639 unsigned char *dst = coding->destination + coding->produced;
3640 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3641 int safe_room = 16;
3642 int bol_designation
3643 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3644 && CODING_ISO_BOL (coding));
3645 int produced_chars = 0;
3646 Lisp_Object attrs, eol_type, charset_list;
3647 int ascii_compatible;
b73bfc1c 3648 int c;
ff0dacd7 3649 int preferred_charset_id = -1;
05e6f5dc 3650
df7492f9 3651 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
004068e4 3652 setup_iso_safe_charsets (attrs);
ff0dacd7
KH
3653 /* Charset list may have been changed. */
3654 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
8f924df7 3655 coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
0eecad43 3656
df7492f9 3657 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
bdd9fb48 3658
df7492f9 3659 while (charbuf < charbuf_end)
4ed46869 3660 {
df7492f9 3661 ASSURE_DESTINATION (safe_room);
b73bfc1c 3662
df7492f9 3663 if (bol_designation)
b73bfc1c 3664 {
df7492f9 3665 unsigned char *dst_prev = dst;
4ed46869 3666
bdd9fb48 3667 /* We have to produce designation sequences if any now. */
df7492f9
KH
3668 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3669 bol_designation = 0;
3670 /* We are sure that designation sequences are all ASCII bytes. */
3671 produced_chars += dst - dst_prev;
e0e989f6
KH
3672 }
3673
df7492f9 3674 c = *charbuf++;
ec6d2bb8 3675
ff0dacd7
KH
3676 if (c < 0)
3677 {
3678 /* Handle an annotation. */
3679 switch (*charbuf)
ec6d2bb8 3680 {
ff0dacd7
KH
3681 case CODING_ANNOTATE_COMPOSITION_MASK:
3682 /* Not yet implemented. */
3683 break;
3684 case CODING_ANNOTATE_CHARSET_MASK:
3685 preferred_charset_id = charbuf[3];
3686 if (preferred_charset_id >= 0
3687 && NILP (Fmemq (make_number (preferred_charset_id),
3688 charset_list)))
3689 preferred_charset_id = -1;
3690 break;
3691 default:
3692 abort ();
4ed46869 3693 }
ff0dacd7
KH
3694 charbuf += -c - 1;
3695 continue;
4ed46869 3696 }
ec6d2bb8 3697
b73bfc1c
KH
3698 /* Now encode the character C. */
3699 if (c < 0x20 || c == 0x7F)
3700 {
df7492f9
KH
3701 if (c == '\n'
3702 || (c == '\r' && EQ (eol_type, Qmac)))
19a8d9e0 3703 {
df7492f9
KH
3704 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3705 ENCODE_RESET_PLANE_AND_REGISTER ();
3706 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
b73bfc1c 3707 {
df7492f9
KH
3708 int i;
3709
3710 for (i = 0; i < 4; i++)
3711 CODING_ISO_DESIGNATION (coding, i)
3712 = CODING_ISO_INITIAL (coding, i);
b73bfc1c 3713 }
df7492f9
KH
3714 bol_designation
3715 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
19a8d9e0 3716 }
df7492f9
KH
3717 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3718 ENCODE_RESET_PLANE_AND_REGISTER ();
3719 EMIT_ONE_ASCII_BYTE (c);
4ed46869 3720 }
df7492f9 3721 else if (ASCII_CHAR_P (c))
88993dfd 3722 {
df7492f9
KH
3723 if (ascii_compatible)
3724 EMIT_ONE_ASCII_BYTE (c);
93dec019 3725 else
19a8d9e0 3726 {
bf16eb23
KH
3727 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3728 ENCODE_ISO_CHARACTER (charset, c);
19a8d9e0 3729 }
4ed46869 3730 }
16eafb5d 3731 else if (CHAR_BYTE8_P (c))
88993dfd 3732 {
16eafb5d
KH
3733 c = CHAR_TO_BYTE8 (c);
3734 EMIT_ONE_BYTE (c);
88993dfd 3735 }
b73bfc1c 3736 else
df7492f9 3737 {
ff0dacd7 3738 struct charset *charset;
b73bfc1c 3739
ff0dacd7
KH
3740 if (preferred_charset_id >= 0)
3741 {
3742 charset = CHARSET_FROM_ID (preferred_charset_id);
3743 if (! CHAR_CHARSET_P (c, charset))
3744 charset = char_charset (c, charset_list, NULL);
3745 }
3746 else
3747 charset = char_charset (c, charset_list, NULL);
df7492f9
KH
3748 if (!charset)
3749 {
41cbe562
KH
3750 if (coding->mode & CODING_MODE_SAFE_ENCODING)
3751 {
3752 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3753 charset = CHARSET_FROM_ID (charset_ascii);
3754 }
3755 else
3756 {
3757 c = coding->default_char;
3758 charset = char_charset (c, charset_list, NULL);
3759 }
df7492f9
KH
3760 }
3761 ENCODE_ISO_CHARACTER (charset, c);
3762 }
84fbb8a0 3763 }
b73bfc1c 3764
df7492f9
KH
3765 if (coding->mode & CODING_MODE_LAST_BLOCK
3766 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3767 {
3768 ASSURE_DESTINATION (safe_room);
3769 ENCODE_RESET_PLANE_AND_REGISTER ();
3770 }
3771 coding->result = CODING_RESULT_SUCCESS;
3772 CODING_ISO_BOL (coding) = bol_designation;
3773 coding->produced_char += produced_chars;
3774 coding->produced = dst - coding->destination;
3775 return 0;
4ed46869
KH
3776}
3777
3778\f
df7492f9 3779/*** 8,9. SJIS and BIG5 handlers ***/
4ed46869 3780
df7492f9 3781/* Although SJIS and BIG5 are not ISO's coding system, they are used
4ed46869
KH
3782 quite widely. So, for the moment, Emacs supports them in the bare
3783 C code. But, in the future, they may be supported only by CCL. */
3784
3785/* SJIS is a coding system encoding three character sets: ASCII, right
3786 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3787 as is. A character of charset katakana-jisx0201 is encoded by
3788 "position-code + 0x80". A character of charset japanese-jisx0208
3789 is encoded in 2-byte but two position-codes are divided and shifted
df7492f9 3790 so that it fit in the range below.
4ed46869
KH
3791
3792 --- CODE RANGE of SJIS ---
3793 (character set) (range)
3794 ASCII 0x00 .. 0x7F
df7492f9 3795 KATAKANA-JISX0201 0xA0 .. 0xDF
c28a9453 3796 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
d14d03ac 3797 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4ed46869
KH
3798 -------------------------------
3799
3800*/
3801
3802/* BIG5 is a coding system encoding two character sets: ASCII and
3803 Big5. An ASCII character is encoded as is. Big5 is a two-byte
df7492f9 3804 character set and is encoded in two-byte.
4ed46869
KH
3805
3806 --- CODE RANGE of BIG5 ---
3807 (character set) (range)
3808 ASCII 0x00 .. 0x7F
3809 Big5 (1st byte) 0xA1 .. 0xFE
3810 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3811 --------------------------
3812
df7492f9 3813 */
4ed46869
KH
3814
3815/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3816 Check if a text is encoded in SJIS. If it is, return
df7492f9 3817 CATEGORY_MASK_SJIS, else return 0. */
4ed46869 3818
0a28aafb 3819static int
ff0dacd7 3820detect_coding_sjis (coding, detect_info)
df7492f9 3821 struct coding_system *coding;
ff0dacd7 3822 struct coding_detection_info *detect_info;
4ed46869 3823{
8f924df7
KH
3824 const unsigned char *src = coding->source, *src_base = src;
3825 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
3826 int multibytep = coding->src_multibyte;
3827 int consumed_chars = 0;
3828 int found = 0;
b73bfc1c 3829 int c;
89528eb3 3830 int incomplete;
df7492f9 3831
ff0dacd7 3832 detect_info->checked |= CATEGORY_MASK_SJIS;
df7492f9
KH
3833 /* A coding system of this category is always ASCII compatible. */
3834 src += coding->head_ascii;
4ed46869 3835
b73bfc1c 3836 while (1)
4ed46869 3837 {
89528eb3 3838 incomplete = 0;
df7492f9 3839 ONE_MORE_BYTE (c);
89528eb3 3840 incomplete = 1;
682169fe
KH
3841 if (c < 0x80)
3842 continue;
df7492f9 3843 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
4ed46869 3844 {
df7492f9 3845 ONE_MORE_BYTE (c);
682169fe 3846 if (c < 0x40 || c == 0x7F || c > 0xFC)
df7492f9 3847 break;
ff0dacd7 3848 found = CATEGORY_MASK_SJIS;
4ed46869 3849 }
df7492f9 3850 else if (c >= 0xA0 && c < 0xE0)
ff0dacd7 3851 found = CATEGORY_MASK_SJIS;
df7492f9
KH
3852 else
3853 break;
4ed46869 3854 }
ff0dacd7 3855 detect_info->rejected |= CATEGORY_MASK_SJIS;
df7492f9
KH
3856 return 0;
3857
3858 no_more_source:
89528eb3
KH
3859 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
3860 {
ff0dacd7 3861 detect_info->rejected |= CATEGORY_MASK_SJIS;
89528eb3 3862 return 0;
4ed46869 3863 }
ff0dacd7
KH
3864 detect_info->found |= found;
3865 return 1;
4ed46869
KH
3866}
3867
3868/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3869 Check if a text is encoded in BIG5. If it is, return
df7492f9 3870 CATEGORY_MASK_BIG5, else return 0. */
4ed46869 3871
0a28aafb 3872static int
ff0dacd7 3873detect_coding_big5 (coding, detect_info)
df7492f9 3874 struct coding_system *coding;
ff0dacd7 3875 struct coding_detection_info *detect_info;
4ed46869 3876{
8f924df7
KH
3877 const unsigned char *src = coding->source, *src_base = src;
3878 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
3879 int multibytep = coding->src_multibyte;
3880 int consumed_chars = 0;
3881 int found = 0;
b73bfc1c 3882 int c;
89528eb3 3883 int incomplete;
fa42c37f 3884
ff0dacd7 3885 detect_info->checked |= CATEGORY_MASK_BIG5;
df7492f9
KH
3886 /* A coding system of this category is always ASCII compatible. */
3887 src += coding->head_ascii;
fa42c37f 3888
b73bfc1c 3889 while (1)
fa42c37f 3890 {
89528eb3 3891 incomplete = 0;
df7492f9 3892 ONE_MORE_BYTE (c);
89528eb3 3893 incomplete = 1;
df7492f9 3894 if (c < 0x80)
fa42c37f 3895 continue;
df7492f9 3896 if (c >= 0xA1)
fa42c37f 3897 {
df7492f9
KH
3898 ONE_MORE_BYTE (c);
3899 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
fa42c37f 3900 return 0;
ff0dacd7 3901 found = CATEGORY_MASK_BIG5;
fa42c37f 3902 }
df7492f9
KH
3903 else
3904 break;
fa42c37f 3905 }
ff0dacd7 3906 detect_info->rejected |= CATEGORY_MASK_BIG5;
fa42c37f 3907 return 0;
fa42c37f 3908
df7492f9 3909 no_more_source:
89528eb3
KH
3910 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
3911 {
ff0dacd7 3912 detect_info->rejected |= CATEGORY_MASK_BIG5;
89528eb3
KH
3913 return 0;
3914 }
ff0dacd7
KH
3915 detect_info->found |= found;
3916 return 1;
fa42c37f
KH
3917}
3918
4ed46869
KH
3919/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3920 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
fa42c37f 3921
b73bfc1c 3922static void
df7492f9 3923decode_coding_sjis (coding)
4ed46869 3924 struct coding_system *coding;
4ed46869 3925{
8f924df7
KH
3926 const unsigned char *src = coding->source + coding->consumed;
3927 const unsigned char *src_end = coding->source + coding->src_bytes;
3928 const unsigned char *src_base;
df7492f9 3929 int *charbuf = coding->charbuf;
ff0dacd7 3930 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
df7492f9
KH
3931 int consumed_chars = 0, consumed_chars_base;
3932 int multibytep = coding->src_multibyte;
3933 struct charset *charset_roman, *charset_kanji, *charset_kana;
3934 Lisp_Object attrs, eol_type, charset_list, val;
ff0dacd7
KH
3935 int char_offset = coding->produced_char;
3936 int last_offset = char_offset;
3937 int last_id = charset_ascii;
a5d301df 3938
df7492f9
KH
3939 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3940
3941 val = charset_list;
3942 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
89528eb3
KH
3943 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3944 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
fa42c37f 3945
b73bfc1c 3946 while (1)
4ed46869 3947 {
df7492f9 3948 int c, c1;
fa42c37f 3949
b73bfc1c 3950 src_base = src;
df7492f9 3951 consumed_chars_base = consumed_chars;
fa42c37f 3952
df7492f9
KH
3953 if (charbuf >= charbuf_end)
3954 break;
3955
3956 ONE_MORE_BYTE (c);
b73bfc1c 3957
df7492f9 3958 if (c == '\r')
4ed46869 3959 {
df7492f9 3960 if (EQ (eol_type, Qdos))
4ed46869 3961 {
df7492f9 3962 if (src == src_end)
98725083
KH
3963 {
3964 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
3965 goto no_more_source;
3966 }
df7492f9
KH
3967 if (*src == '\n')
3968 ONE_MORE_BYTE (c);
4ed46869 3969 }
df7492f9
KH
3970 else if (EQ (eol_type, Qmac))
3971 c = '\n';
4ed46869 3972 }
54f78171 3973 else
df7492f9
KH
3974 {
3975 struct charset *charset;
fa42c37f 3976
df7492f9
KH
3977 if (c < 0x80)
3978 charset = charset_roman;
3979 else
4ed46869 3980 {
df7492f9
KH
3981 if (c >= 0xF0)
3982 goto invalid_code;
3983 if (c < 0xA0 || c >= 0xE0)
fb88bf2d 3984 {
54f78171 3985 /* SJIS -> JISX0208 */
df7492f9
KH
3986 ONE_MORE_BYTE (c1);
3987 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
3988 goto invalid_code;
3989 c = (c << 8) | c1;
3990 SJIS_TO_JIS (c);
3991 charset = charset_kanji;
5e34de15 3992 }
7487494c 3993 else if (c > 0xA0)
89528eb3
KH
3994 {
3995 /* SJIS -> JISX0201-Kana */
3996 c &= 0x7F;
3997 charset = charset_kana;
3998 }
7487494c
KH
3999 else
4000 goto invalid_code;
df7492f9 4001 }
ff0dacd7
KH
4002 if (charset->id != charset_ascii
4003 && last_id != charset->id)
4004 {
4005 if (last_id != charset_ascii)
4006 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4007 last_id = charset->id;
4008 last_offset = char_offset;
4009 }
df7492f9
KH
4010 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4011 }
4012 *charbuf++ = c;
ff0dacd7 4013 char_offset++;
df7492f9 4014 continue;
b73bfc1c 4015
df7492f9
KH
4016 invalid_code:
4017 src = src_base;
4018 consumed_chars = consumed_chars_base;
4019 ONE_MORE_BYTE (c);
4020 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 4021 char_offset++;
df7492f9
KH
4022 coding->errors++;
4023 }
fa42c37f 4024
df7492f9 4025 no_more_source:
ff0dacd7
KH
4026 if (last_id != charset_ascii)
4027 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
df7492f9
KH
4028 coding->consumed_char += consumed_chars_base;
4029 coding->consumed = src_base - coding->source;
4030 coding->charbuf_used = charbuf - coding->charbuf;
fa42c37f
KH
4031}
4032
b73bfc1c 4033static void
df7492f9 4034decode_coding_big5 (coding)
4ed46869 4035 struct coding_system *coding;
4ed46869 4036{
8f924df7
KH
4037 const unsigned char *src = coding->source + coding->consumed;
4038 const unsigned char *src_end = coding->source + coding->src_bytes;
4039 const unsigned char *src_base;
df7492f9 4040 int *charbuf = coding->charbuf;
ff0dacd7 4041 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
df7492f9
KH
4042 int consumed_chars = 0, consumed_chars_base;
4043 int multibytep = coding->src_multibyte;
4044 struct charset *charset_roman, *charset_big5;
4045 Lisp_Object attrs, eol_type, charset_list, val;
ff0dacd7
KH
4046 int char_offset = coding->produced_char;
4047 int last_offset = char_offset;
4048 int last_id = charset_ascii;
df7492f9
KH
4049
4050 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4051 val = charset_list;
4052 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4053 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4ed46869 4054
b73bfc1c 4055 while (1)
4ed46869 4056 {
df7492f9 4057 int c, c1;
b73bfc1c
KH
4058
4059 src_base = src;
df7492f9
KH
4060 consumed_chars_base = consumed_chars;
4061
4062 if (charbuf >= charbuf_end)
4063 break;
4064
4065 ONE_MORE_BYTE (c);
b73bfc1c 4066
df7492f9 4067 if (c == '\r')
4ed46869 4068 {
df7492f9 4069 if (EQ (eol_type, Qdos))
4ed46869 4070 {
df7492f9 4071 if (src == src_end)
b73bfc1c 4072 {
98725083
KH
4073 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4074 goto no_more_source;
d46c5b12 4075 }
df7492f9
KH
4076 if (*src == '\n')
4077 ONE_MORE_BYTE (c);
4ed46869 4078 }
df7492f9
KH
4079 else if (EQ (eol_type, Qmac))
4080 c = '\n';
4ed46869 4081 }
54f78171 4082 else
df7492f9
KH
4083 {
4084 struct charset *charset;
4085 if (c < 0x80)
4086 charset = charset_roman;
fb88bf2d 4087 else
fb88bf2d 4088 {
54f78171 4089 /* BIG5 -> Big5 */
df7492f9
KH
4090 if (c < 0xA1 || c > 0xFE)
4091 goto invalid_code;
4092 ONE_MORE_BYTE (c1);
4093 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4094 goto invalid_code;
4095 c = c << 8 | c1;
4096 charset = charset_big5;
4ed46869 4097 }
ff0dacd7
KH
4098 if (charset->id != charset_ascii
4099 && last_id != charset->id)
4100 {
4101 if (last_id != charset_ascii)
4102 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4103 last_id = charset->id;
4104 last_offset = char_offset;
4ed46869 4105 }
df7492f9 4106 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4ed46869 4107 }
4ed46869 4108
df7492f9 4109 *charbuf++ = c;
ff0dacd7 4110 char_offset++;
fb88bf2d
KH
4111 continue;
4112
df7492f9 4113 invalid_code:
4ed46869 4114 src = src_base;
df7492f9
KH
4115 consumed_chars = consumed_chars_base;
4116 ONE_MORE_BYTE (c);
4117 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 4118 char_offset++;
df7492f9 4119 coding->errors++;
fb88bf2d 4120 }
d46c5b12 4121
df7492f9 4122 no_more_source:
ff0dacd7
KH
4123 if (last_id != charset_ascii)
4124 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
df7492f9
KH
4125 coding->consumed_char += consumed_chars_base;
4126 coding->consumed = src_base - coding->source;
4127 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
4128}
4129
4130/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
b73bfc1c
KH
4131 This function can encode charsets `ascii', `katakana-jisx0201',
4132 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4133 are sure that all these charsets are registered as official charset
4ed46869
KH
4134 (i.e. do not have extended leading-codes). Characters of other
4135 charsets are produced without any encoding. If SJIS_P is 1, encode
4136 SJIS text, else encode BIG5 text. */
4137
df7492f9
KH
4138static int
4139encode_coding_sjis (coding)
4ed46869 4140 struct coding_system *coding;
4ed46869 4141{
df7492f9
KH
4142 int multibytep = coding->dst_multibyte;
4143 int *charbuf = coding->charbuf;
4144 int *charbuf_end = charbuf + coding->charbuf_used;
4145 unsigned char *dst = coding->destination + coding->produced;
4146 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4147 int safe_room = 4;
4148 int produced_chars = 0;
4149 Lisp_Object attrs, eol_type, charset_list, val;
4150 int ascii_compatible;
4151 struct charset *charset_roman, *charset_kanji, *charset_kana;
4152 int c;
a5d301df 4153
df7492f9
KH
4154 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4155 val = charset_list;
4156 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4157 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4158 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
4ed46869 4159
df7492f9 4160 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
93dec019 4161
df7492f9
KH
4162 while (charbuf < charbuf_end)
4163 {
4164 ASSURE_DESTINATION (safe_room);
4165 c = *charbuf++;
b73bfc1c 4166 /* Now encode the character C. */
df7492f9
KH
4167 if (ASCII_CHAR_P (c) && ascii_compatible)
4168 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
4169 else if (CHAR_BYTE8_P (c))
4170 {
4171 c = CHAR_TO_BYTE8 (c);
4172 EMIT_ONE_BYTE (c);
4173 }
df7492f9 4174 else
b73bfc1c 4175 {
df7492f9
KH
4176 unsigned code;
4177 struct charset *charset = char_charset (c, charset_list, &code);
4178
4179 if (!charset)
4ed46869 4180 {
41cbe562 4181 if (coding->mode & CODING_MODE_SAFE_ENCODING)
b73bfc1c 4182 {
41cbe562
KH
4183 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4184 charset = CHARSET_FROM_ID (charset_ascii);
b73bfc1c 4185 }
41cbe562 4186 else
b73bfc1c 4187 {
41cbe562
KH
4188 c = coding->default_char;
4189 charset = char_charset (c, charset_list, &code);
b73bfc1c 4190 }
b73bfc1c 4191 }
df7492f9
KH
4192 if (code == CHARSET_INVALID_CODE (charset))
4193 abort ();
4194 if (charset == charset_kanji)
4195 {
4196 int c1, c2;
4197 JIS_TO_SJIS (code);
4198 c1 = code >> 8, c2 = code & 0xFF;
4199 EMIT_TWO_BYTES (c1, c2);
4200 }
4201 else if (charset == charset_kana)
4202 EMIT_ONE_BYTE (code | 0x80);
4203 else
4204 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4205 }
4206 }
4207 coding->result = CODING_RESULT_SUCCESS;
4208 coding->produced_char += produced_chars;
4209 coding->produced = dst - coding->destination;
4210 return 0;
4211}
4212
4213static int
4214encode_coding_big5 (coding)
4215 struct coding_system *coding;
4216{
4217 int multibytep = coding->dst_multibyte;
4218 int *charbuf = coding->charbuf;
4219 int *charbuf_end = charbuf + coding->charbuf_used;
4220 unsigned char *dst = coding->destination + coding->produced;
4221 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4222 int safe_room = 4;
4223 int produced_chars = 0;
4224 Lisp_Object attrs, eol_type, charset_list, val;
4225 int ascii_compatible;
4226 struct charset *charset_roman, *charset_big5;
4227 int c;
4228
4229 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4230 val = charset_list;
4231 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4232 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4233 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4234
4235 while (charbuf < charbuf_end)
4236 {
4237 ASSURE_DESTINATION (safe_room);
4238 c = *charbuf++;
4239 /* Now encode the character C. */
4240 if (ASCII_CHAR_P (c) && ascii_compatible)
4241 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
4242 else if (CHAR_BYTE8_P (c))
4243 {
4244 c = CHAR_TO_BYTE8 (c);
4245 EMIT_ONE_BYTE (c);
b73bfc1c
KH
4246 }
4247 else
4248 {
df7492f9
KH
4249 unsigned code;
4250 struct charset *charset = char_charset (c, charset_list, &code);
4251
4252 if (! charset)
b73bfc1c 4253 {
41cbe562 4254 if (coding->mode & CODING_MODE_SAFE_ENCODING)
b73bfc1c 4255 {
41cbe562
KH
4256 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4257 charset = CHARSET_FROM_ID (charset_ascii);
b73bfc1c 4258 }
41cbe562 4259 else
0eecad43 4260 {
41cbe562
KH
4261 c = coding->default_char;
4262 charset = char_charset (c, charset_list, &code);
0eecad43 4263 }
4ed46869 4264 }
df7492f9
KH
4265 if (code == CHARSET_INVALID_CODE (charset))
4266 abort ();
4267 if (charset == charset_big5)
b73bfc1c 4268 {
df7492f9
KH
4269 int c1, c2;
4270
4271 c1 = code >> 8, c2 = code & 0xFF;
4272 EMIT_TWO_BYTES (c1, c2);
b73bfc1c 4273 }
df7492f9
KH
4274 else
4275 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4ed46869 4276 }
4ed46869 4277 }
df7492f9
KH
4278 coding->result = CODING_RESULT_SUCCESS;
4279 coding->produced_char += produced_chars;
4280 coding->produced = dst - coding->destination;
4281 return 0;
4ed46869
KH
4282}
4283
4284\f
df7492f9 4285/*** 10. CCL handlers ***/
1397dc18
KH
4286
4287/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4288 Check if a text is encoded in a coding system of which
4289 encoder/decoder are written in CCL program. If it is, return
df7492f9 4290 CATEGORY_MASK_CCL, else return 0. */
1397dc18 4291
0a28aafb 4292static int
ff0dacd7 4293detect_coding_ccl (coding, detect_info)
df7492f9 4294 struct coding_system *coding;
ff0dacd7 4295 struct coding_detection_info *detect_info;
1397dc18 4296{
8f924df7
KH
4297 const unsigned char *src = coding->source, *src_base = src;
4298 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
4299 int multibytep = coding->src_multibyte;
4300 int consumed_chars = 0;
4301 int found = 0;
4302 unsigned char *valids = CODING_CCL_VALIDS (coding);
4303 int head_ascii = coding->head_ascii;
4304 Lisp_Object attrs;
4305
ff0dacd7
KH
4306 detect_info->checked |= CATEGORY_MASK_CCL;
4307
df7492f9
KH
4308 coding = &coding_categories[coding_category_ccl];
4309 attrs = CODING_ID_ATTRS (coding->id);
4310 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4311 src += head_ascii;
1397dc18 4312
b73bfc1c 4313 while (1)
1397dc18 4314 {
df7492f9
KH
4315 int c;
4316 ONE_MORE_BYTE (c);
4317 if (! valids[c])
4318 break;
ff0dacd7
KH
4319 if ((valids[c] > 1))
4320 found = CATEGORY_MASK_CCL;
df7492f9 4321 }
ff0dacd7 4322 detect_info->rejected |= CATEGORY_MASK_CCL;
df7492f9
KH
4323 return 0;
4324
4325 no_more_source:
ff0dacd7
KH
4326 detect_info->found |= found;
4327 return 1;
df7492f9
KH
4328}
4329
4330static void
4331decode_coding_ccl (coding)
4332 struct coding_system *coding;
4333{
7c78e542 4334 const unsigned char *src = coding->source + coding->consumed;
8f924df7 4335 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
4336 int *charbuf = coding->charbuf;
4337 int *charbuf_end = charbuf + coding->charbuf_size;
4338 int consumed_chars = 0;
4339 int multibytep = coding->src_multibyte;
4340 struct ccl_program ccl;
4341 int source_charbuf[1024];
4342 int source_byteidx[1024];
4776e638 4343 Lisp_Object attrs, eol_type, charset_list;
df7492f9 4344
8dcbea82 4345 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
df7492f9
KH
4346 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4347
4348 while (src < src_end)
4349 {
7c78e542 4350 const unsigned char *p = src;
df7492f9
KH
4351 int *source, *source_end;
4352 int i = 0;
4353
4354 if (multibytep)
4355 while (i < 1024 && p < src_end)
4356 {
4357 source_byteidx[i] = p - src;
4358 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4359 }
4360 else
4361 while (i < 1024 && p < src_end)
4362 source_charbuf[i++] = *p++;
8f924df7 4363
df7492f9
KH
4364 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4365 ccl.last_block = 1;
4366
4367 source = source_charbuf;
4368 source_end = source + i;
4369 while (source < source_end)
4370 {
4371 ccl_driver (&ccl, source, charbuf,
8dcbea82
KH
4372 source_end - source, charbuf_end - charbuf,
4373 charset_list);
df7492f9
KH
4374 source += ccl.consumed;
4375 charbuf += ccl.produced;
4376 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4377 break;
4378 }
4379 if (source < source_end)
4380 src += source_byteidx[source - source_charbuf];
4381 else
4382 src = p;
4383 consumed_chars += source - source_charbuf;
4384
4385 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4386 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4387 break;
4388 }
4389
4390 switch (ccl.status)
4391 {
4392 case CCL_STAT_SUSPEND_BY_SRC:
4393 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4394 break;
4395 case CCL_STAT_SUSPEND_BY_DST:
4396 break;
4397 case CCL_STAT_QUIT:
4398 case CCL_STAT_INVALID_CMD:
4399 coding->result = CODING_RESULT_INTERRUPT;
4400 break;
4401 default:
4402 coding->result = CODING_RESULT_SUCCESS;
4403 break;
4404 }
4405 coding->consumed_char += consumed_chars;
4406 coding->consumed = src - coding->source;
4407 coding->charbuf_used = charbuf - coding->charbuf;
4408}
4409
4410static int
4411encode_coding_ccl (coding)
4412 struct coding_system *coding;
4413{
4414 struct ccl_program ccl;
4415 int multibytep = coding->dst_multibyte;
4416 int *charbuf = coding->charbuf;
4417 int *charbuf_end = charbuf + coding->charbuf_used;
4418 unsigned char *dst = coding->destination + coding->produced;
4419 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4420 unsigned char *adjusted_dst_end = dst_end - 1;
4421 int destination_charbuf[1024];
4422 int i, produced_chars = 0;
8dcbea82 4423 Lisp_Object attrs, eol_type, charset_list;
df7492f9 4424
8dcbea82 4425 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
df7492f9
KH
4426 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4427
4428 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4429 ccl.dst_multibyte = coding->dst_multibyte;
4430
4431 while (charbuf < charbuf_end && dst < adjusted_dst_end)
4432 {
4433 int dst_bytes = dst_end - dst;
4434 if (dst_bytes > 1024)
4435 dst_bytes = 1024;
4436
4437 ccl_driver (&ccl, charbuf, destination_charbuf,
8dcbea82 4438 charbuf_end - charbuf, dst_bytes, charset_list);
df7492f9
KH
4439 charbuf += ccl.consumed;
4440 if (multibytep)
4441 for (i = 0; i < ccl.produced; i++)
4442 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4443 else
4444 {
4445 for (i = 0; i < ccl.produced; i++)
4446 *dst++ = destination_charbuf[i] & 0xFF;
4447 produced_chars += ccl.produced;
4448 }
4449 }
4450
4451 switch (ccl.status)
4452 {
4453 case CCL_STAT_SUSPEND_BY_SRC:
4454 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4455 break;
4456 case CCL_STAT_SUSPEND_BY_DST:
4457 coding->result = CODING_RESULT_INSUFFICIENT_DST;
4458 break;
4459 case CCL_STAT_QUIT:
4460 case CCL_STAT_INVALID_CMD:
4461 coding->result = CODING_RESULT_INTERRUPT;
4462 break;
4463 default:
4464 coding->result = CODING_RESULT_SUCCESS;
4465 break;
1397dc18 4466 }
df7492f9
KH
4467
4468 coding->produced_char += produced_chars;
4469 coding->produced = dst - coding->destination;
4470 return 0;
1397dc18
KH
4471}
4472
df7492f9 4473
1397dc18 4474\f
df7492f9 4475/*** 10, 11. no-conversion handlers ***/
4ed46869 4476
b73bfc1c 4477/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4ed46869 4478
b73bfc1c 4479static void
df7492f9 4480decode_coding_raw_text (coding)
4ed46869 4481 struct coding_system *coding;
4ed46869 4482{
df7492f9 4483 coding->chars_at_source = 1;
2c78b7e1
KH
4484 coding->consumed_char = 0;
4485 coding->consumed = 0;
df7492f9
KH
4486 coding->result = CODING_RESULT_SUCCESS;
4487}
4ed46869 4488
df7492f9
KH
4489static int
4490encode_coding_raw_text (coding)
4491 struct coding_system *coding;
4492{
4493 int multibytep = coding->dst_multibyte;
4494 int *charbuf = coding->charbuf;
4495 int *charbuf_end = coding->charbuf + coding->charbuf_used;
4496 unsigned char *dst = coding->destination + coding->produced;
4497 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4498 int produced_chars = 0;
b73bfc1c
KH
4499 int c;
4500
df7492f9 4501 if (multibytep)
b73bfc1c 4502 {
df7492f9 4503 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4ed46869 4504
df7492f9
KH
4505 if (coding->src_multibyte)
4506 while (charbuf < charbuf_end)
4507 {
4508 ASSURE_DESTINATION (safe_room);
4509 c = *charbuf++;
4510 if (ASCII_CHAR_P (c))
4511 EMIT_ONE_ASCII_BYTE (c);
4512 else if (CHAR_BYTE8_P (c))
4513 {
4514 c = CHAR_TO_BYTE8 (c);
4515 EMIT_ONE_BYTE (c);
4516 }
4517 else
4518 {
4519 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
93dec019 4520
df7492f9
KH
4521 CHAR_STRING_ADVANCE (c, p1);
4522 while (p0 < p1)
9d123124
KH
4523 {
4524 EMIT_ONE_BYTE (*p0);
4525 p0++;
4526 }
df7492f9
KH
4527 }
4528 }
b73bfc1c 4529 else
df7492f9
KH
4530 while (charbuf < charbuf_end)
4531 {
4532 ASSURE_DESTINATION (safe_room);
4533 c = *charbuf++;
4534 EMIT_ONE_BYTE (c);
4535 }
4536 }
4537 else
4ed46869 4538 {
df7492f9 4539 if (coding->src_multibyte)
d46c5b12 4540 {
df7492f9
KH
4541 int safe_room = MAX_MULTIBYTE_LENGTH;
4542
4543 while (charbuf < charbuf_end)
d46c5b12 4544 {
df7492f9
KH
4545 ASSURE_DESTINATION (safe_room);
4546 c = *charbuf++;
4547 if (ASCII_CHAR_P (c))
4548 *dst++ = c;
4549 else if (CHAR_BYTE8_P (c))
4550 *dst++ = CHAR_TO_BYTE8 (c);
b73bfc1c 4551 else
df7492f9
KH
4552 CHAR_STRING_ADVANCE (c, dst);
4553 produced_chars++;
d46c5b12
KH
4554 }
4555 }
df7492f9
KH
4556 else
4557 {
4558 ASSURE_DESTINATION (charbuf_end - charbuf);
4559 while (charbuf < charbuf_end && dst < dst_end)
4560 *dst++ = *charbuf++;
4561 produced_chars = dst - (coding->destination + coding->dst_bytes);
8f924df7 4562 }
4ed46869 4563 }
df7492f9
KH
4564 coding->result = CODING_RESULT_SUCCESS;
4565 coding->produced_char += produced_chars;
4566 coding->produced = dst - coding->destination;
4567 return 0;
4ed46869
KH
4568}
4569
ff0dacd7
KH
4570/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4571 Check if a text is encoded in a charset-based coding system. If it
4572 is, return 1, else return 0. */
4573
0a28aafb 4574static int
ff0dacd7 4575detect_coding_charset (coding, detect_info)
df7492f9 4576 struct coding_system *coding;
ff0dacd7 4577 struct coding_detection_info *detect_info;
1397dc18 4578{
8f924df7
KH
4579 const unsigned char *src = coding->source, *src_base = src;
4580 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
4581 int multibytep = coding->src_multibyte;
4582 int consumed_chars = 0;
4583 Lisp_Object attrs, valids;
584948ac 4584 int found = 0;
1397dc18 4585
ff0dacd7
KH
4586 detect_info->checked |= CATEGORY_MASK_CHARSET;
4587
df7492f9
KH
4588 coding = &coding_categories[coding_category_charset];
4589 attrs = CODING_ID_ATTRS (coding->id);
4590 valids = AREF (attrs, coding_attr_charset_valids);
4591
4592 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4593 src += coding->head_ascii;
1397dc18 4594
b73bfc1c 4595 while (1)
1397dc18 4596 {
df7492f9 4597 int c;
1397dc18 4598
df7492f9
KH
4599 ONE_MORE_BYTE (c);
4600 if (NILP (AREF (valids, c)))
4601 break;
584948ac 4602 if (c >= 0x80)
ff0dacd7 4603 found = CATEGORY_MASK_CHARSET;
df7492f9 4604 }
ff0dacd7 4605 detect_info->rejected |= CATEGORY_MASK_CHARSET;
df7492f9 4606 return 0;
4ed46869 4607
df7492f9 4608 no_more_source:
ff0dacd7
KH
4609 detect_info->found |= found;
4610 return 1;
df7492f9 4611}
b73bfc1c 4612
b73bfc1c 4613static void
df7492f9 4614decode_coding_charset (coding)
4ed46869 4615 struct coding_system *coding;
4ed46869 4616{
8f924df7
KH
4617 const unsigned char *src = coding->source + coding->consumed;
4618 const unsigned char *src_end = coding->source + coding->src_bytes;
4619 const unsigned char *src_base;
df7492f9 4620 int *charbuf = coding->charbuf;
ff0dacd7 4621 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
df7492f9
KH
4622 int consumed_chars = 0, consumed_chars_base;
4623 int multibytep = coding->src_multibyte;
4eb6d3f1 4624 Lisp_Object attrs, eol_type, charset_list, valids;
ff0dacd7
KH
4625 int char_offset = coding->produced_char;
4626 int last_offset = char_offset;
4627 int last_id = charset_ascii;
df7492f9
KH
4628
4629 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4eb6d3f1 4630 valids = AREF (attrs, coding_attr_charset_valids);
b73bfc1c 4631
df7492f9 4632 while (1)
4ed46869 4633 {
4eb6d3f1 4634 int c;
df7492f9
KH
4635
4636 src_base = src;
4637 consumed_chars_base = consumed_chars;
b73bfc1c 4638
df7492f9
KH
4639 if (charbuf >= charbuf_end)
4640 break;
4641
4eb6d3f1 4642 ONE_MORE_BYTE (c);
df7492f9 4643 if (c == '\r')
d46c5b12 4644 {
c7c66a95
KH
4645 /* Here we assume that no charset maps '\r' to something
4646 else. */
df7492f9 4647 if (EQ (eol_type, Qdos))
b73bfc1c 4648 {
98725083 4649 if (src == src_end)
b73bfc1c 4650 {
98725083
KH
4651 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4652 goto no_more_source;
b73bfc1c 4653 }
98725083 4654 if (*src == '\n')
df7492f9 4655 ONE_MORE_BYTE (c);
b73bfc1c 4656 }
df7492f9 4657 else if (EQ (eol_type, Qmac))
b73bfc1c 4658 c = '\n';
d46c5b12 4659 }
df7492f9 4660 else
d46c5b12 4661 {
4eb6d3f1
KH
4662 Lisp_Object val;
4663 struct charset *charset;
c7c66a95 4664 int dim;
acb2a965
KH
4665 int len = 1;
4666 unsigned code = c;
4eb6d3f1
KH
4667
4668 val = AREF (valids, c);
4669 if (NILP (val))
4670 goto invalid_code;
c7c66a95 4671 if (INTEGERP (val))
4eb6d3f1 4672 {
c7c66a95
KH
4673 charset = CHARSET_FROM_ID (XFASTINT (val));
4674 dim = CHARSET_DIMENSION (charset);
f9d71dcd 4675 while (len < dim)
4eb6d3f1 4676 {
acb2a965
KH
4677 ONE_MORE_BYTE (c);
4678 code = (code << 8) | c;
f9d71dcd 4679 len++;
4eb6d3f1 4680 }
c7c66a95
KH
4681 CODING_DECODE_CHAR (coding, src, src_base, src_end,
4682 charset, code, c);
4683 }
4684 else
4685 {
4686 /* VAL is a list of charset IDs. It is assured that the
4687 list is sorted by charset dimensions (smaller one
4688 comes first). */
c7c66a95
KH
4689 while (CONSP (val))
4690 {
4691 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4692 dim = CHARSET_DIMENSION (charset);
f9d71dcd 4693 while (len < dim)
c7c66a95 4694 {
acb2a965
KH
4695 ONE_MORE_BYTE (c);
4696 code = (code << 8) | c;
f9d71dcd 4697 len++;
c7c66a95 4698 }
c7c66a95
KH
4699 CODING_DECODE_CHAR (coding, src, src_base,
4700 src_end, charset, code, c);
4701 if (c >= 0)
4702 break;
4703 val = XCDR (val);
4704 }
4eb6d3f1 4705 }
df7492f9
KH
4706 if (c < 0)
4707 goto invalid_code;
ff0dacd7
KH
4708 if (charset->id != charset_ascii
4709 && last_id != charset->id)
4710 {
4711 if (last_id != charset_ascii)
4712 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4713 last_id = charset->id;
4714 last_offset = char_offset;
4715 }
d46c5b12 4716 }
df7492f9 4717 *charbuf++ = c;
ff0dacd7 4718 char_offset++;
df7492f9
KH
4719 continue;
4720
4721 invalid_code:
4722 src = src_base;
4723 consumed_chars = consumed_chars_base;
4724 ONE_MORE_BYTE (c);
4725 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 4726 char_offset++;
df7492f9 4727 coding->errors++;
4ed46869
KH
4728 }
4729
df7492f9 4730 no_more_source:
ff0dacd7
KH
4731 if (last_id != charset_ascii)
4732 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
df7492f9
KH
4733 coding->consumed_char += consumed_chars_base;
4734 coding->consumed = src_base - coding->source;
4735 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
4736}
4737
df7492f9
KH
4738static int
4739encode_coding_charset (coding)
4ed46869 4740 struct coding_system *coding;
4ed46869 4741{
df7492f9
KH
4742 int multibytep = coding->dst_multibyte;
4743 int *charbuf = coding->charbuf;
4744 int *charbuf_end = charbuf + coding->charbuf_used;
4745 unsigned char *dst = coding->destination + coding->produced;
4746 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4747 int safe_room = MAX_MULTIBYTE_LENGTH;
4748 int produced_chars = 0;
df7492f9
KH
4749 Lisp_Object attrs, eol_type, charset_list;
4750 int ascii_compatible;
b73bfc1c 4751 int c;
b73bfc1c 4752
df7492f9 4753 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
df7492f9 4754 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
fb88bf2d 4755
df7492f9 4756 while (charbuf < charbuf_end)
4ed46869 4757 {
4eb6d3f1 4758 struct charset *charset;
df7492f9 4759 unsigned code;
8f924df7 4760
df7492f9
KH
4761 ASSURE_DESTINATION (safe_room);
4762 c = *charbuf++;
4763 if (ascii_compatible && ASCII_CHAR_P (c))
4764 EMIT_ONE_ASCII_BYTE (c);
16eafb5d 4765 else if (CHAR_BYTE8_P (c))
4ed46869 4766 {
16eafb5d
KH
4767 c = CHAR_TO_BYTE8 (c);
4768 EMIT_ONE_BYTE (c);
d46c5b12 4769 }
d46c5b12 4770 else
b73bfc1c 4771 {
4eb6d3f1
KH
4772 charset = char_charset (c, charset_list, &code);
4773 if (charset)
4774 {
4775 if (CHARSET_DIMENSION (charset) == 1)
4776 EMIT_ONE_BYTE (code);
4777 else if (CHARSET_DIMENSION (charset) == 2)
4778 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4779 else if (CHARSET_DIMENSION (charset) == 3)
4780 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4781 else
4782 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4783 (code >> 8) & 0xFF, code & 0xFF);
4784 }
4785 else
41cbe562
KH
4786 {
4787 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4788 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4789 else
4790 c = coding->default_char;
4791 EMIT_ONE_BYTE (c);
4792 }
4ed46869 4793 }
4ed46869
KH
4794 }
4795
df7492f9
KH
4796 coding->result = CODING_RESULT_SUCCESS;
4797 coding->produced_char += produced_chars;
4798 coding->produced = dst - coding->destination;
4799 return 0;
4ed46869
KH
4800}
4801
4802\f
1397dc18 4803/*** 7. C library functions ***/
4ed46869 4804
df7492f9
KH
4805/* Setup coding context CODING from information about CODING_SYSTEM.
4806 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4807 CODING_SYSTEM is invalid, signal an error. */
4ed46869 4808
ec6d2bb8 4809void
e0e989f6
KH
4810setup_coding_system (coding_system, coding)
4811 Lisp_Object coding_system;
4ed46869
KH
4812 struct coding_system *coding;
4813{
df7492f9
KH
4814 Lisp_Object attrs;
4815 Lisp_Object eol_type;
4816 Lisp_Object coding_type;
4608c386 4817 Lisp_Object val;
4ed46869 4818
df7492f9
KH
4819 if (NILP (coding_system))
4820 coding_system = Qno_conversion;
c07c8e12 4821
df7492f9 4822 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
1f5dbf34 4823
df7492f9
KH
4824 attrs = CODING_ID_ATTRS (coding->id);
4825 eol_type = CODING_ID_EOL_TYPE (coding->id);
1f5dbf34 4826
df7492f9
KH
4827 coding->mode = 0;
4828 coding->head_ascii = -1;
4829 coding->common_flags
4830 = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
5e5c78be
KH
4831 if (! NILP (CODING_ATTR_POST_READ (attrs)))
4832 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
4833 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
4834 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
8f924df7
KH
4835 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
4836 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
1f5dbf34 4837
df7492f9 4838 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7
KH
4839 coding->max_charset_id = SCHARS (val) - 1;
4840 coding->safe_charsets = (char *) SDATA (val);
df7492f9 4841 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4608c386 4842
df7492f9
KH
4843 coding_type = CODING_ATTR_TYPE (attrs);
4844 if (EQ (coding_type, Qundecided))
d46c5b12 4845 {
df7492f9
KH
4846 coding->detector = NULL;
4847 coding->decoder = decode_coding_raw_text;
4848 coding->encoder = encode_coding_raw_text;
4849 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
d46c5b12 4850 }
df7492f9 4851 else if (EQ (coding_type, Qiso_2022))
d46c5b12 4852 {
df7492f9
KH
4853 int i;
4854 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4855
4856 /* Invoke graphic register 0 to plane 0. */
4857 CODING_ISO_INVOCATION (coding, 0) = 0;
4858 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4859 CODING_ISO_INVOCATION (coding, 1)
4860 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4861 /* Setup the initial status of designation. */
4862 for (i = 0; i < 4; i++)
4863 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4864 /* Not single shifting initially. */
4865 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4866 /* Beginning of buffer should also be regarded as bol. */
4867 CODING_ISO_BOL (coding) = 1;
4868 coding->detector = detect_coding_iso_2022;
4869 coding->decoder = decode_coding_iso_2022;
4870 coding->encoder = encode_coding_iso_2022;
4871 if (flags & CODING_ISO_FLAG_SAFE)
4872 coding->mode |= CODING_MODE_SAFE_ENCODING;
d46c5b12 4873 coding->common_flags
df7492f9
KH
4874 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4875 | CODING_REQUIRE_FLUSHING_MASK);
4876 if (flags & CODING_ISO_FLAG_COMPOSITION)
4877 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
ff0dacd7
KH
4878 if (flags & CODING_ISO_FLAG_DESIGNATION)
4879 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
df7492f9
KH
4880 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4881 {
4882 setup_iso_safe_charsets (attrs);
4883 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7
KH
4884 coding->max_charset_id = SCHARS (val) - 1;
4885 coding->safe_charsets = (char *) SDATA (val);
df7492f9
KH
4886 }
4887 CODING_ISO_FLAGS (coding) = flags;
d46c5b12 4888 }
df7492f9 4889 else if (EQ (coding_type, Qcharset))
d46c5b12 4890 {
df7492f9
KH
4891 coding->detector = detect_coding_charset;
4892 coding->decoder = decode_coding_charset;
4893 coding->encoder = encode_coding_charset;
d46c5b12 4894 coding->common_flags
df7492f9 4895 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
d46c5b12 4896 }
df7492f9 4897 else if (EQ (coding_type, Qutf_8))
d46c5b12 4898 {
df7492f9
KH
4899 coding->detector = detect_coding_utf_8;
4900 coding->decoder = decode_coding_utf_8;
4901 coding->encoder = encode_coding_utf_8;
4902 coding->common_flags
4903 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4904 }
4905 else if (EQ (coding_type, Qutf_16))
4906 {
4907 val = AREF (attrs, coding_attr_utf_16_bom);
4908 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4909 : EQ (val, Qt) ? utf_16_with_bom
4910 : utf_16_without_bom);
4911 val = AREF (attrs, coding_attr_utf_16_endian);
b49a1807 4912 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
df7492f9 4913 : utf_16_little_endian);
e19c3639 4914 CODING_UTF_16_SURROGATE (coding) = 0;
df7492f9
KH
4915 coding->detector = detect_coding_utf_16;
4916 coding->decoder = decode_coding_utf_16;
4917 coding->encoder = encode_coding_utf_16;
4918 coding->common_flags
4919 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
b49a1807
KH
4920 if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
4921 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
d46c5b12 4922 }
df7492f9 4923 else if (EQ (coding_type, Qccl))
4ed46869 4924 {
df7492f9
KH
4925 coding->detector = detect_coding_ccl;
4926 coding->decoder = decode_coding_ccl;
4927 coding->encoder = encode_coding_ccl;
c952af22 4928 coding->common_flags
df7492f9
KH
4929 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4930 | CODING_REQUIRE_FLUSHING_MASK);
4931 }
4932 else if (EQ (coding_type, Qemacs_mule))
4933 {
4934 coding->detector = detect_coding_emacs_mule;
4935 coding->decoder = decode_coding_emacs_mule;
4936 coding->encoder = encode_coding_emacs_mule;
c952af22 4937 coding->common_flags
df7492f9
KH
4938 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4939 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
4940 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
4941 {
4942 Lisp_Object tail, safe_charsets;
4943 int max_charset_id = 0;
4944
4945 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4946 tail = XCDR (tail))
4947 if (max_charset_id < XFASTINT (XCAR (tail)))
4948 max_charset_id = XFASTINT (XCAR (tail));
4949 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
4950 make_number (255));
4951 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4952 tail = XCDR (tail))
8f924df7 4953 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
df7492f9 4954 coding->max_charset_id = max_charset_id;
8f924df7 4955 coding->safe_charsets = (char *) SDATA (safe_charsets);
df7492f9
KH
4956 }
4957 }
4958 else if (EQ (coding_type, Qshift_jis))
4959 {
4960 coding->detector = detect_coding_sjis;
4961 coding->decoder = decode_coding_sjis;
4962 coding->encoder = encode_coding_sjis;
c952af22 4963 coding->common_flags
df7492f9
KH
4964 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4965 }
4966 else if (EQ (coding_type, Qbig5))
4967 {
4968 coding->detector = detect_coding_big5;
4969 coding->decoder = decode_coding_big5;
4970 coding->encoder = encode_coding_big5;
c952af22 4971 coding->common_flags
df7492f9
KH
4972 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4973 }
4974 else /* EQ (coding_type, Qraw_text) */
ec6d2bb8 4975 {
df7492f9
KH
4976 coding->detector = NULL;
4977 coding->decoder = decode_coding_raw_text;
4978 coding->encoder = encode_coding_raw_text;
4ed46869 4979 }
4ed46869 4980
df7492f9 4981 return;
4ed46869
KH
4982}
4983
df7492f9
KH
4984/* Return raw-text or one of its subsidiaries that has the same
4985 eol_type as CODING-SYSTEM. */
ec6d2bb8 4986
df7492f9
KH
4987Lisp_Object
4988raw_text_coding_system (coding_system)
4989 Lisp_Object coding_system;
ec6d2bb8 4990{
0be8721c 4991 Lisp_Object spec, attrs;
df7492f9 4992 Lisp_Object eol_type, raw_text_eol_type;
ec6d2bb8 4993
d3e4cb56
KH
4994 if (NILP (coding_system))
4995 return Qraw_text;
df7492f9
KH
4996 spec = CODING_SYSTEM_SPEC (coding_system);
4997 attrs = AREF (spec, 0);
ec6d2bb8 4998
df7492f9
KH
4999 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5000 return coding_system;
ec6d2bb8 5001
df7492f9
KH
5002 eol_type = AREF (spec, 2);
5003 if (VECTORP (eol_type))
5004 return Qraw_text;
5005 spec = CODING_SYSTEM_SPEC (Qraw_text);
5006 raw_text_eol_type = AREF (spec, 2);
5007 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5008 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5009 : AREF (raw_text_eol_type, 2));
ec6d2bb8
KH
5010}
5011
54f78171 5012
df7492f9
KH
5013/* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5014 does, return one of the subsidiary that has the same eol-spec as
5015 PARENT. Otherwise, return CODING_SYSTEM. */
5016
5017Lisp_Object
5018coding_inherit_eol_type (coding_system, parent)
b74e4686 5019 Lisp_Object coding_system, parent;
54f78171 5020{
df7492f9 5021 Lisp_Object spec, attrs, eol_type;
54f78171 5022
d3e4cb56
KH
5023 if (NILP (coding_system))
5024 coding_system = Qraw_text;
df7492f9
KH
5025 spec = CODING_SYSTEM_SPEC (coding_system);
5026 attrs = AREF (spec, 0);
5027 eol_type = AREF (spec, 2);
d3e4cb56
KH
5028 if (VECTORP (eol_type)
5029 && ! NILP (parent))
df7492f9
KH
5030 {
5031 Lisp_Object parent_spec;
df7492f9
KH
5032 Lisp_Object parent_eol_type;
5033
5034 parent_spec
5035 = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
5036 parent_eol_type = AREF (parent_spec, 2);
5037 if (EQ (parent_eol_type, Qunix))
5038 coding_system = AREF (eol_type, 0);
5039 else if (EQ (parent_eol_type, Qdos))
5040 coding_system = AREF (eol_type, 1);
5041 else if (EQ (parent_eol_type, Qmac))
5042 coding_system = AREF (eol_type, 2);
54f78171 5043 }
df7492f9 5044 return coding_system;
54f78171
KH
5045}
5046
4ed46869
KH
5047/* Emacs has a mechanism to automatically detect a coding system if it
5048 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5049 it's impossible to distinguish some coding systems accurately
5050 because they use the same range of codes. So, at first, coding
5051 systems are categorized into 7, those are:
5052
0ef69138 5053 o coding-category-emacs-mule
4ed46869
KH
5054
5055 The category for a coding system which has the same code range
5056 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 5057 symbol) `emacs-mule' by default.
4ed46869
KH
5058
5059 o coding-category-sjis
5060
5061 The category for a coding system which has the same code range
5062 as SJIS. Assigned the coding-system (Lisp
7717c392 5063 symbol) `japanese-shift-jis' by default.
4ed46869
KH
5064
5065 o coding-category-iso-7
5066
5067 The category for a coding system which has the same code range
7717c392 5068 as ISO2022 of 7-bit environment. This doesn't use any locking
d46c5b12
KH
5069 shift and single shift functions. This can encode/decode all
5070 charsets. Assigned the coding-system (Lisp symbol)
5071 `iso-2022-7bit' by default.
5072
5073 o coding-category-iso-7-tight
5074
5075 Same as coding-category-iso-7 except that this can
5076 encode/decode only the specified charsets.
4ed46869
KH
5077
5078 o coding-category-iso-8-1
5079
5080 The category for a coding system which has the same code range
5081 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
5082 for DIMENSION1 charset. This doesn't use any locking shift
5083 and single shift functions. Assigned the coding-system (Lisp
5084 symbol) `iso-latin-1' by default.
4ed46869
KH
5085
5086 o coding-category-iso-8-2
5087
5088 The category for a coding system which has the same code range
5089 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
5090 for DIMENSION2 charset. This doesn't use any locking shift
5091 and single shift functions. Assigned the coding-system (Lisp
5092 symbol) `japanese-iso-8bit' by default.
4ed46869 5093
7717c392 5094 o coding-category-iso-7-else
4ed46869
KH
5095
5096 The category for a coding system which has the same code range
df7492f9 5097 as ISO2022 of 7-bit environemnt but uses locking shift or
7717c392
KH
5098 single shift functions. Assigned the coding-system (Lisp
5099 symbol) `iso-2022-7bit-lock' by default.
5100
5101 o coding-category-iso-8-else
5102
5103 The category for a coding system which has the same code range
df7492f9 5104 as ISO2022 of 8-bit environemnt but uses locking shift or
7717c392
KH
5105 single shift functions. Assigned the coding-system (Lisp
5106 symbol) `iso-2022-8bit-ss2' by default.
4ed46869
KH
5107
5108 o coding-category-big5
5109
5110 The category for a coding system which has the same code range
5111 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 5112 `cn-big5' by default.
4ed46869 5113
fa42c37f
KH
5114 o coding-category-utf-8
5115
5116 The category for a coding system which has the same code range
5117 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
5118 symbol) `utf-8' by default.
5119
5120 o coding-category-utf-16-be
5121
5122 The category for a coding system in which a text has an
5123 Unicode signature (cf. Unicode Standard) in the order of BIG
5124 endian at the head. Assigned the coding-system (Lisp symbol)
5125 `utf-16-be' by default.
5126
5127 o coding-category-utf-16-le
5128
5129 The category for a coding system in which a text has an
5130 Unicode signature (cf. Unicode Standard) in the order of
5131 LITTLE endian at the head. Assigned the coding-system (Lisp
5132 symbol) `utf-16-le' by default.
5133
1397dc18
KH
5134 o coding-category-ccl
5135
5136 The category for a coding system of which encoder/decoder is
5137 written in CCL programs. The default value is nil, i.e., no
5138 coding system is assigned.
5139
4ed46869
KH
5140 o coding-category-binary
5141
5142 The category for a coding system not categorized in any of the
5143 above. Assigned the coding-system (Lisp symbol)
e0e989f6 5144 `no-conversion' by default.
4ed46869
KH
5145
5146 Each of them is a Lisp symbol and the value is an actual
df7492f9 5147 `coding-system's (this is also a Lisp symbol) assigned by a user.
4ed46869
KH
5148 What Emacs does actually is to detect a category of coding system.
5149 Then, it uses a `coding-system' assigned to it. If Emacs can't
df7492f9 5150 decide only one possible category, it selects a category of the
4ed46869
KH
5151 highest priority. Priorities of categories are also specified by a
5152 user in a Lisp variable `coding-category-list'.
5153
5154*/
5155
df7492f9
KH
5156#define EOL_SEEN_NONE 0
5157#define EOL_SEEN_LF 1
5158#define EOL_SEEN_CR 2
5159#define EOL_SEEN_CRLF 4
66cfb530 5160
ff0dacd7
KH
5161/* Detect how end-of-line of a text of length SRC_BYTES pointed by
5162 SOURCE is encoded. If CATEGORY is one of
5163 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5164 two-byte, else they are encoded by one-byte.
5165
5166 Return one of EOL_SEEN_XXX. */
4ed46869 5167
bc4bc72a 5168#define MAX_EOL_CHECK_COUNT 3
d46c5b12
KH
5169
5170static int
89528eb3 5171detect_eol (source, src_bytes, category)
d46c5b12 5172 unsigned char *source;
df7492f9 5173 EMACS_INT src_bytes;
89528eb3 5174 enum coding_category category;
4ed46869 5175{
d46c5b12 5176 unsigned char *src = source, *src_end = src + src_bytes;
4ed46869 5177 unsigned char c;
df7492f9
KH
5178 int total = 0;
5179 int eol_seen = EOL_SEEN_NONE;
4ed46869 5180
89528eb3 5181 if ((1 << category) & CATEGORY_MASK_UTF_16)
4ed46869 5182 {
df7492f9 5183 int msb, lsb;
fa42c37f 5184
89528eb3
KH
5185 msb = category == (coding_category_utf_16_le
5186 | coding_category_utf_16_le_nosig);
df7492f9 5187 lsb = 1 - msb;
fa42c37f 5188
df7492f9 5189 while (src + 1 < src_end)
fa42c37f 5190 {
df7492f9
KH
5191 c = src[lsb];
5192 if (src[msb] == 0 && (c == '\n' || c == '\r'))
fa42c37f 5193 {
df7492f9
KH
5194 int this_eol;
5195
5196 if (c == '\n')
5197 this_eol = EOL_SEEN_LF;
5198 else if (src + 3 >= src_end
5199 || src[msb + 2] != 0
5200 || src[lsb + 2] != '\n')
5201 this_eol = EOL_SEEN_CR;
fa42c37f 5202 else
8f924df7 5203 this_eol = EOL_SEEN_CRLF;
df7492f9
KH
5204
5205 if (eol_seen == EOL_SEEN_NONE)
5206 /* This is the first end-of-line. */
5207 eol_seen = this_eol;
5208 else if (eol_seen != this_eol)
fa42c37f 5209 {
df7492f9
KH
5210 /* The found type is different from what found before. */
5211 eol_seen = EOL_SEEN_LF;
5212 break;
fa42c37f 5213 }
df7492f9
KH
5214 if (++total == MAX_EOL_CHECK_COUNT)
5215 break;
fa42c37f 5216 }
df7492f9 5217 src += 2;
fa42c37f 5218 }
bcf26d6a 5219 }
d46c5b12 5220 else
c4825358 5221 {
df7492f9 5222 while (src < src_end)
27901516 5223 {
df7492f9
KH
5224 c = *src++;
5225 if (c == '\n' || c == '\r')
5226 {
5227 int this_eol;
d46c5b12 5228
df7492f9
KH
5229 if (c == '\n')
5230 this_eol = EOL_SEEN_LF;
5231 else if (src >= src_end || *src != '\n')
5232 this_eol = EOL_SEEN_CR;
5233 else
5234 this_eol = EOL_SEEN_CRLF, src++;
d46c5b12 5235
df7492f9
KH
5236 if (eol_seen == EOL_SEEN_NONE)
5237 /* This is the first end-of-line. */
5238 eol_seen = this_eol;
5239 else if (eol_seen != this_eol)
5240 {
5241 /* The found type is different from what found before. */
5242 eol_seen = EOL_SEEN_LF;
5243 break;
5244 }
5245 if (++total == MAX_EOL_CHECK_COUNT)
5246 break;
5247 }
5248 }
73be902c 5249 }
df7492f9 5250 return eol_seen;
73be902c
KH
5251}
5252
df7492f9 5253
73be902c 5254static void
df7492f9
KH
5255adjust_coding_eol_type (coding, eol_seen)
5256 struct coding_system *coding;
5257 int eol_seen;
73be902c 5258{
0be8721c 5259 Lisp_Object eol_type;
8f924df7 5260
df7492f9
KH
5261 eol_type = CODING_ID_EOL_TYPE (coding->id);
5262 if (eol_seen & EOL_SEEN_LF)
5263 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6f197c07 5264 else if (eol_seen & EOL_SEEN_CRLF)
df7492f9 5265 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6f197c07 5266 else if (eol_seen & EOL_SEEN_CR)
df7492f9 5267 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
d46c5b12 5268}
4ed46869 5269
df7492f9
KH
5270/* Detect how a text specified in CODING is encoded. If a coding
5271 system is detected, update fields of CODING by the detected coding
5272 system. */
0a28aafb 5273
df7492f9
KH
5274void
5275detect_coding (coding)
d46c5b12 5276 struct coding_system *coding;
d46c5b12 5277{
8f924df7 5278 const unsigned char *src, *src_end;
df7492f9 5279 Lisp_Object attrs, coding_type;
d46c5b12 5280
df7492f9
KH
5281 coding->consumed = coding->consumed_char = 0;
5282 coding->produced = coding->produced_char = 0;
5283 coding_set_source (coding);
1c3478b0 5284
df7492f9 5285 src_end = coding->source + coding->src_bytes;
1c3478b0 5286
df7492f9
KH
5287 /* If we have not yet decided the text encoding type, detect it
5288 now. */
5289 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
b73bfc1c 5290 {
df7492f9
KH
5291 int c, i;
5292
5293 for (src = coding->source; src < src_end; src++)
d46c5b12 5294 {
df7492f9
KH
5295 c = *src;
5296 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
5297 || c == ISO_CODE_SI
5298 || c == ISO_CODE_SO)))
5299 break;
d46c5b12 5300 }
df7492f9
KH
5301 coding->head_ascii = src - (coding->source + coding->consumed);
5302
5303 if (coding->head_ascii < coding->src_bytes)
d46c5b12 5304 {
ff0dacd7
KH
5305 struct coding_detection_info detect_info;
5306 enum coding_category category;
5307 struct coding_system *this;
df7492f9 5308
ff0dacd7 5309 detect_info.checked = detect_info.found = detect_info.rejected = 0;
df7492f9 5310 for (i = 0; i < coding_category_raw_text; i++)
d46c5b12 5311 {
ff0dacd7
KH
5312 category = coding_priorities[i];
5313 this = coding_categories + category;
df7492f9 5314 if (this->id < 0)
fa42c37f 5315 {
df7492f9 5316 /* No coding system of this category is defined. */
ff0dacd7 5317 detect_info.rejected |= (1 << category);
fa42c37f 5318 }
ff0dacd7 5319 else if (category >= coding_category_raw_text)
89528eb3 5320 continue;
ff0dacd7 5321 else if (detect_info.checked & (1 << category))
fa42c37f 5322 {
ff0dacd7
KH
5323 if (detect_info.found & (1 << category))
5324 break;
fa42c37f 5325 }
ff0dacd7
KH
5326 else if ((*(this->detector)) (coding, &detect_info)
5327 && detect_info.found & (1 << category))
5328 break;
d46c5b12 5329 }
ff0dacd7
KH
5330 if (i < coding_category_raw_text)
5331 setup_coding_system (CODING_ID_NAME (this->id), coding);
5332 else if (detect_info.rejected == CATEGORY_MASK_ANY)
df7492f9 5333 setup_coding_system (Qraw_text, coding);
ff0dacd7 5334 else if (detect_info.rejected)
df7492f9 5335 for (i = 0; i < coding_category_raw_text; i++)
ff0dacd7
KH
5336 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5337 {
5338 this = coding_categories + coding_priorities[i];
5339 setup_coding_system (CODING_ID_NAME (this->id), coding);
5340 break;
5341 }
d46c5b12 5342 }
b73bfc1c 5343 }
b49a1807
KH
5344 else if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qutf_16))
5345 {
5346 Lisp_Object coding_systems;
5347 struct coding_detection_info detect_info;
5348
5349 coding_systems
5350 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5351 detect_info.found = detect_info.rejected = 0;
5352 if (CONSP (coding_systems)
5353 && detect_coding_utf_16 (coding, &detect_info)
5354 && (detect_info.found & (CATEGORY_MASK_UTF_16_LE
5355 | CATEGORY_MASK_UTF_16_BE)))
5356 {
5357 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5358 setup_coding_system (XCAR (coding_systems), coding);
5359 else
5360 setup_coding_system (XCDR (coding_systems), coding);
5361 }
5362 }
4ed46869 5363
df7492f9
KH
5364 attrs = CODING_ID_ATTRS (coding->id);
5365 coding_type = CODING_ATTR_TYPE (attrs);
4ed46869 5366
df7492f9
KH
5367 /* If we have not yet decided the EOL type, detect it now. But, the
5368 detection is impossible for a CCL based coding system, in which
5369 case, we detct the EOL type after decoding. */
5370 if (VECTORP (CODING_ID_EOL_TYPE (coding->id))
5371 && ! EQ (coding_type, Qccl))
d46c5b12 5372 {
89528eb3
KH
5373 int eol_seen = detect_eol (coding->source, coding->src_bytes,
5374 XINT (CODING_ATTR_CATEGORY (attrs)));
4ed46869 5375
df7492f9
KH
5376 if (eol_seen != EOL_SEEN_NONE)
5377 adjust_coding_eol_type (coding, eol_seen);
d46c5b12 5378 }
4ed46869 5379}
4ed46869 5380
d46c5b12 5381
aaaf0b1e 5382static void
df7492f9 5383decode_eol (coding)
aaaf0b1e 5384 struct coding_system *coding;
aaaf0b1e 5385{
df7492f9 5386 if (VECTORP (CODING_ID_EOL_TYPE (coding->id)))
aaaf0b1e 5387 {
df7492f9
KH
5388 unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5389 unsigned char *pend = p + coding->produced;
5390 int eol_seen = EOL_SEEN_NONE;
4ed46869 5391
df7492f9 5392 for (; p < pend; p++)
aaaf0b1e 5393 {
df7492f9
KH
5394 if (*p == '\n')
5395 eol_seen |= EOL_SEEN_LF;
5396 else if (*p == '\r')
aaaf0b1e 5397 {
df7492f9 5398 if (p + 1 < pend && *(p + 1) == '\n')
aaaf0b1e 5399 {
df7492f9
KH
5400 eol_seen |= EOL_SEEN_CRLF;
5401 p++;
aaaf0b1e 5402 }
aaaf0b1e 5403 else
df7492f9 5404 eol_seen |= EOL_SEEN_CR;
aaaf0b1e 5405 }
aaaf0b1e 5406 }
df7492f9
KH
5407 if (eol_seen != EOL_SEEN_NONE)
5408 adjust_coding_eol_type (coding, eol_seen);
aaaf0b1e 5409 }
d46c5b12 5410
df7492f9 5411 if (EQ (CODING_ID_EOL_TYPE (coding->id), Qmac))
27901516 5412 {
df7492f9
KH
5413 unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5414 unsigned char *pend = p + coding->produced;
d46c5b12 5415
df7492f9
KH
5416 for (; p < pend; p++)
5417 if (*p == '\r')
5418 *p = '\n';
4ed46869 5419 }
df7492f9
KH
5420 else if (EQ (CODING_ID_EOL_TYPE (coding->id), Qdos))
5421 {
5422 unsigned char *p, *pbeg, *pend;
5423 Lisp_Object undo_list;
b73bfc1c 5424
df7492f9
KH
5425 move_gap_both (coding->dst_pos + coding->produced_char,
5426 coding->dst_pos_byte + coding->produced);
5427 undo_list = current_buffer->undo_list;
5428 current_buffer->undo_list = Qt;
c197f191 5429 del_range_2 (coding->dst_pos, coding->dst_pos_byte, GPT, GPT_BYTE, 0);
df7492f9
KH
5430 current_buffer->undo_list = undo_list;
5431 pbeg = GPT_ADDR;
5432 pend = pbeg + coding->produced;
b73bfc1c 5433
df7492f9
KH
5434 for (p = pend - 1; p >= pbeg; p--)
5435 if (*p == '\r')
5436 {
5437 safe_bcopy ((char *) (p + 1), (char *) p, pend - p - 1);
5438 pend--;
5439 }
5440 coding->produced_char -= coding->produced - (pend - pbeg);
5441 coding->produced = pend - pbeg;
5442 insert_from_gap (coding->produced_char, coding->produced);
aaaf0b1e 5443 }
4ed46869
KH
5444}
5445
df7492f9
KH
5446static void
5447translate_chars (coding, table)
4ed46869 5448 struct coding_system *coding;
df7492f9 5449 Lisp_Object table;
4ed46869 5450{
df7492f9
KH
5451 int *charbuf = coding->charbuf;
5452 int *charbuf_end = charbuf + coding->charbuf_used;
5453 int c;
d46c5b12 5454
df7492f9
KH
5455 if (coding->chars_at_source)
5456 return;
4ed46869 5457
df7492f9 5458 while (charbuf < charbuf_end)
8844fa83 5459 {
df7492f9
KH
5460 c = *charbuf;
5461 if (c < 0)
5462 charbuf += c;
5463 else
5464 *charbuf++ = translate_char (table, c);
8844fa83 5465 }
df7492f9 5466}
bc4bc72a 5467
d46c5b12 5468static int
df7492f9
KH
5469produce_chars (coding)
5470 struct coding_system *coding;
4ed46869 5471{
df7492f9
KH
5472 unsigned char *dst = coding->destination + coding->produced;
5473 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5474 int produced;
5475 int produced_chars = 0;
4ed46869 5476
df7492f9 5477 if (! coding->chars_at_source)
4ed46869 5478 {
df7492f9
KH
5479 /* Characters are in coding->charbuf. */
5480 int *buf = coding->charbuf;
5481 int *buf_end = buf + coding->charbuf_used;
5482 unsigned char *adjusted_dst_end;
4ed46869 5483
df7492f9
KH
5484 if (BUFFERP (coding->src_object)
5485 && EQ (coding->src_object, coding->dst_object))
8f924df7 5486 dst_end = ((unsigned char *) coding->source) + coding->consumed;
df7492f9 5487 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
4ed46869 5488
df7492f9 5489 while (buf < buf_end)
4ed46869 5490 {
df7492f9 5491 int c = *buf++;
bc4bc72a 5492
df7492f9 5493 if (dst >= adjusted_dst_end)
d46c5b12 5494 {
df7492f9
KH
5495 dst = alloc_destination (coding,
5496 buf_end - buf + MAX_MULTIBYTE_LENGTH,
5497 dst);
5498 dst_end = coding->destination + coding->dst_bytes;
5499 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5500 }
5501 if (c >= 0)
5502 {
5503 if (coding->dst_multibyte
5504 || ! CHAR_BYTE8_P (c))
5505 CHAR_STRING_ADVANCE (c, dst);
5506 else
5507 *dst++ = CHAR_TO_BYTE8 (c);
5508 produced_chars++;
d46c5b12 5509 }
df7492f9 5510 else
d3e4cb56
KH
5511 /* This is an annotation datum. (-C) is the length of
5512 it. */
5513 buf += -c - 1;
4ed46869
KH
5514 }
5515 }
fa42c37f 5516 else
fa42c37f 5517 {
8f924df7
KH
5518 const unsigned char *src = coding->source;
5519 const unsigned char *src_end = src + coding->src_bytes;
df7492f9 5520 Lisp_Object eol_type;
fa42c37f 5521
df7492f9 5522 eol_type = CODING_ID_EOL_TYPE (coding->id);
4ed46869 5523
df7492f9 5524 if (coding->src_multibyte != coding->dst_multibyte)
fa42c37f 5525 {
df7492f9 5526 if (coding->src_multibyte)
fa42c37f 5527 {
71c81426 5528 int multibytep = 1;
df7492f9 5529 int consumed_chars;
d46c5b12 5530
df7492f9
KH
5531 while (1)
5532 {
8f924df7 5533 const unsigned char *src_base = src;
df7492f9 5534 int c;
b73bfc1c 5535
df7492f9
KH
5536 ONE_MORE_BYTE (c);
5537 if (c == '\r')
5538 {
5539 if (EQ (eol_type, Qdos))
5540 {
98725083
KH
5541 if (src == src_end)
5542 {
5543 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
5544 goto no_more_source;
5545 }
5546 if (*src == '\n')
df7492f9
KH
5547 c = *src++;
5548 }
5549 else if (EQ (eol_type, Qmac))
5550 c = '\n';
5551 }
5552 if (dst == dst_end)
5553 {
2c78b7e1 5554 coding->consumed = src - coding->source;
b73bfc1c 5555
2c78b7e1 5556 if (EQ (coding->src_object, coding->dst_object))
8f924df7 5557 dst_end = (unsigned char *) src;
2c78b7e1
KH
5558 if (dst == dst_end)
5559 {
5560 dst = alloc_destination (coding, src_end - src + 1,
5561 dst);
5562 dst_end = coding->destination + coding->dst_bytes;
5563 coding_set_source (coding);
5564 src = coding->source + coding->consumed;
5565 src_end = coding->source + coding->src_bytes;
5566 }
df7492f9
KH
5567 }
5568 *dst++ = c;
5569 produced_chars++;
5570 }
5571 no_more_source:
5572 ;
fa42c37f
KH
5573 }
5574 else
df7492f9
KH
5575 while (src < src_end)
5576 {
71c81426 5577 int multibytep = 1;
df7492f9 5578 int c = *src++;
b73bfc1c 5579
df7492f9
KH
5580 if (c == '\r')
5581 {
5582 if (EQ (eol_type, Qdos))
5583 {
5584 if (src < src_end
5585 && *src == '\n')
5586 c = *src++;
5587 }
5588 else if (EQ (eol_type, Qmac))
5589 c = '\n';
5590 }
5591 if (dst >= dst_end - 1)
5592 {
2c78b7e1 5593 coding->consumed = src - coding->source;
df7492f9 5594
2c78b7e1 5595 if (EQ (coding->src_object, coding->dst_object))
8f924df7 5596 dst_end = (unsigned char *) src;
2c78b7e1
KH
5597 if (dst >= dst_end - 1)
5598 {
5599 dst = alloc_destination (coding, src_end - src + 2,
5600 dst);
5601 dst_end = coding->destination + coding->dst_bytes;
5602 coding_set_source (coding);
5603 src = coding->source + coding->consumed;
5604 src_end = coding->source + coding->src_bytes;
5605 }
df7492f9
KH
5606 }
5607 EMIT_ONE_BYTE (c);
5608 }
d46c5b12 5609 }
df7492f9
KH
5610 else
5611 {
5612 if (!EQ (coding->src_object, coding->dst_object))
fa42c37f 5613 {
df7492f9 5614 int require = coding->src_bytes - coding->dst_bytes;
4ed46869 5615
df7492f9 5616 if (require > 0)
fa42c37f 5617 {
df7492f9
KH
5618 EMACS_INT offset = src - coding->source;
5619
5620 dst = alloc_destination (coding, require, dst);
5621 coding_set_source (coding);
5622 src = coding->source + offset;
5623 src_end = coding->source + coding->src_bytes;
fa42c37f
KH
5624 }
5625 }
df7492f9
KH
5626 produced_chars = coding->src_chars;
5627 while (src < src_end)
fa42c37f 5628 {
df7492f9
KH
5629 int c = *src++;
5630
5631 if (c == '\r')
5632 {
5633 if (EQ (eol_type, Qdos))
5634 {
5635 if (src < src_end
5636 && *src == '\n')
5637 c = *src++;
5638 produced_chars--;
5639 }
5640 else if (EQ (eol_type, Qmac))
5641 c = '\n';
5642 }
5643 *dst++ = c;
fa42c37f
KH
5644 }
5645 }
2c78b7e1
KH
5646 coding->consumed = coding->src_bytes;
5647 coding->consumed_char = coding->src_chars;
fa42c37f
KH
5648 }
5649
df7492f9
KH
5650 produced = dst - (coding->destination + coding->produced);
5651 if (BUFFERP (coding->dst_object))
5652 insert_from_gap (produced_chars, produced);
5653 coding->produced += produced;
5654 coding->produced_char += produced_chars;
5655 return produced_chars;
fa42c37f
KH
5656}
5657
ff0dacd7
KH
5658/* Compose text in CODING->object according to the annotation data at
5659 CHARBUF. CHARBUF is an array:
5660 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
df7492f9 5661 */
4ed46869 5662
df7492f9
KH
5663static INLINE void
5664produce_composition (coding, charbuf)
4ed46869 5665 struct coding_system *coding;
df7492f9 5666 int *charbuf;
4ed46869 5667{
df7492f9 5668 int len;
ff0dacd7 5669 EMACS_INT from, to;
df7492f9 5670 enum composition_method method;
df7492f9 5671 Lisp_Object components;
fa42c37f 5672
df7492f9 5673 len = -charbuf[0];
ff0dacd7
KH
5674 from = coding->dst_pos + charbuf[2];
5675 to = coding->dst_pos + charbuf[3];
5676 method = (enum composition_method) (charbuf[4]);
d46c5b12 5677
df7492f9
KH
5678 if (method == COMPOSITION_RELATIVE)
5679 components = Qnil;
d46c5b12 5680 else
d46c5b12 5681 {
df7492f9
KH
5682 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5683 int i;
b73bfc1c 5684
df7492f9
KH
5685 len -= 5;
5686 charbuf += 5;
5687 for (i = 0; i < len; i++)
5688 args[i] = make_number (charbuf[i]);
5689 components = (method == COMPOSITION_WITH_ALTCHARS
5690 ? Fstring (len, args) : Fvector (len, args));
d46c5b12 5691 }
ff0dacd7 5692 compose_text (from, to, components, Qnil, coding->dst_object);
d46c5b12
KH
5693}
5694
d46c5b12 5695
ff0dacd7
KH
5696/* Put `charset' property on text in CODING->object according to
5697 the annotation data at CHARBUF. CHARBUF is an array:
5698 [ -LENGTH ANNOTATION_MASK FROM TO CHARSET-ID ]
5699 */
d46c5b12 5700
ff0dacd7
KH
5701static INLINE void
5702produce_charset (coding, charbuf)
d46c5b12 5703 struct coding_system *coding;
ff0dacd7 5704 int *charbuf;
d46c5b12 5705{
ff0dacd7
KH
5706 EMACS_INT from = coding->dst_pos + charbuf[2];
5707 EMACS_INT to = coding->dst_pos + charbuf[3];
5708 struct charset *charset = CHARSET_FROM_ID (charbuf[4]);
b73bfc1c 5709
ff0dacd7
KH
5710 Fput_text_property (make_number (from), make_number (to),
5711 Qcharset, CHARSET_NAME (charset),
5712 coding->dst_object);
d46c5b12
KH
5713}
5714
d46c5b12 5715
df7492f9
KH
5716#define CHARBUF_SIZE 0x4000
5717
5718#define ALLOC_CONVERSION_WORK_AREA(coding) \
5719 do { \
5720 int size = CHARBUF_SIZE;; \
5721 \
5722 coding->charbuf = NULL; \
5723 while (size > 1024) \
5724 { \
5725 coding->charbuf = (int *) alloca (sizeof (int) * size); \
5726 if (coding->charbuf) \
5727 break; \
5728 size >>= 1; \
5729 } \
5730 if (! coding->charbuf) \
5731 { \
5732 coding->result = CODING_RESULT_INSUFFICIENT_MEM; \
5733 return coding->result; \
5734 } \
5735 coding->charbuf_size = size; \
5736 } while (0)
4ed46869 5737
d46c5b12
KH
5738
5739static void
df7492f9 5740produce_annotation (coding)
d46c5b12 5741 struct coding_system *coding;
d46c5b12 5742{
df7492f9
KH
5743 int *charbuf = coding->charbuf;
5744 int *charbuf_end = charbuf + coding->charbuf_used;
d46c5b12 5745
ff0dacd7
KH
5746 if (NILP (coding->dst_object))
5747 return;
d46c5b12 5748
df7492f9 5749 while (charbuf < charbuf_end)
a84f1519 5750 {
df7492f9
KH
5751 if (*charbuf >= 0)
5752 charbuf++;
d46c5b12 5753 else
d46c5b12 5754 {
df7492f9 5755 int len = -*charbuf;
ff0dacd7 5756 switch (charbuf[1])
df7492f9
KH
5757 {
5758 case CODING_ANNOTATE_COMPOSITION_MASK:
5759 produce_composition (coding, charbuf);
5760 break;
ff0dacd7
KH
5761 case CODING_ANNOTATE_CHARSET_MASK:
5762 produce_charset (coding, charbuf);
5763 break;
df7492f9
KH
5764 default:
5765 abort ();
5766 }
5767 charbuf += len;
d46c5b12 5768 }
a84f1519 5769 }
d46c5b12
KH
5770}
5771
df7492f9
KH
5772/* Decode the data at CODING->src_object into CODING->dst_object.
5773 CODING->src_object is a buffer, a string, or nil.
5774 CODING->dst_object is a buffer.
d46c5b12 5775
df7492f9
KH
5776 If CODING->src_object is a buffer, it must be the current buffer.
5777 In this case, if CODING->src_pos is positive, it is a position of
5778 the source text in the buffer, otherwise, the source text is in the
5779 gap area of the buffer, and CODING->src_pos specifies the offset of
5780 the text from GPT (which must be the same as PT). If this is the
5781 same buffer as CODING->dst_object, CODING->src_pos must be
5782 negative.
d46c5b12 5783
df7492f9
KH
5784 If CODING->src_object is a string, CODING->src_pos in an index to
5785 that string.
d46c5b12 5786
df7492f9
KH
5787 If CODING->src_object is nil, CODING->source must already point to
5788 the non-relocatable memory area. In this case, CODING->src_pos is
5789 an offset from CODING->source.
73be902c 5790
df7492f9
KH
5791 The decoded data is inserted at the current point of the buffer
5792 CODING->dst_object.
5793*/
d46c5b12 5794
df7492f9
KH
5795static int
5796decode_coding (coding)
d46c5b12 5797 struct coding_system *coding;
d46c5b12 5798{
df7492f9 5799 Lisp_Object attrs;
d46c5b12 5800
df7492f9
KH
5801 if (BUFFERP (coding->src_object)
5802 && coding->src_pos > 0
5803 && coding->src_pos < GPT
5804 && coding->src_pos + coding->src_chars > GPT)
5805 move_gap_both (coding->src_pos, coding->src_pos_byte);
d46c5b12 5806
df7492f9 5807 if (BUFFERP (coding->dst_object))
1c3478b0 5808 {
df7492f9
KH
5809 if (current_buffer != XBUFFER (coding->dst_object))
5810 set_buffer_internal (XBUFFER (coding->dst_object));
5811 if (GPT != PT)
5812 move_gap_both (PT, PT_BYTE);
1c3478b0
KH
5813 }
5814
df7492f9
KH
5815 coding->consumed = coding->consumed_char = 0;
5816 coding->produced = coding->produced_char = 0;
5817 coding->chars_at_source = 0;
5818 coding->result = CODING_RESULT_SUCCESS;
5819 coding->errors = 0;
1c3478b0 5820
df7492f9
KH
5821 ALLOC_CONVERSION_WORK_AREA (coding);
5822
5823 attrs = CODING_ID_ATTRS (coding->id);
5824
5825 do
b73bfc1c 5826 {
df7492f9
KH
5827 coding_set_source (coding);
5828 coding->annotated = 0;
5829 (*(coding->decoder)) (coding);
5830 if (!NILP (CODING_ATTR_DECODE_TBL (attrs)))
da4109a9
KH
5831 translate_chars (coding, CODING_ATTR_DECODE_TBL (attrs));
5832 else if (!NILP (Vstandard_translation_table_for_decode))
5833 translate_chars (coding, Vstandard_translation_table_for_decode);
df7492f9
KH
5834 coding_set_destination (coding);
5835 produce_chars (coding);
5836 if (coding->annotated)
5837 produce_annotation (coding);
d46c5b12 5838 }
df7492f9
KH
5839 while (coding->consumed < coding->src_bytes
5840 && ! coding->result);
d46c5b12 5841
df7492f9
KH
5842 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qccl)
5843 && SYMBOLP (CODING_ID_EOL_TYPE (coding->id))
5844 && ! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
5845 decode_eol (coding);
d46c5b12 5846
df7492f9
KH
5847 coding->carryover_bytes = 0;
5848 if (coding->consumed < coding->src_bytes)
d46c5b12 5849 {
df7492f9 5850 int nbytes = coding->src_bytes - coding->consumed;
8f924df7 5851 const unsigned char *src;
df7492f9
KH
5852
5853 coding_set_source (coding);
5854 coding_set_destination (coding);
5855 src = coding->source + coding->consumed;
5856
5857 if (coding->mode & CODING_MODE_LAST_BLOCK)
1c3478b0 5858 {
df7492f9
KH
5859 /* Flush out unprocessed data as binary chars. We are sure
5860 that the number of data is less than the size of
5861 coding->charbuf. */
df7492f9 5862 while (nbytes-- > 0)
1c3478b0 5863 {
df7492f9 5864 int c = *src++;
98725083
KH
5865
5866 coding->charbuf[coding->charbuf_used++] = (c & 0x80 ? - c : c);
1c3478b0 5867 }
df7492f9 5868 produce_chars (coding);
d46c5b12 5869 }
d46c5b12 5870 else
df7492f9
KH
5871 {
5872 /* Record unprocessed bytes in coding->carryover. We are
5873 sure that the number of data is less than the size of
5874 coding->carryover. */
5875 unsigned char *p = coding->carryover;
5876
5877 coding->carryover_bytes = nbytes;
5878 while (nbytes-- > 0)
5879 *p++ = *src++;
1c3478b0 5880 }
df7492f9 5881 coding->consumed = coding->src_bytes;
b73bfc1c 5882 }
69f76525 5883
73be902c 5884 return coding->result;
4ed46869
KH
5885}
5886
aaaf0b1e 5887
e1c23804 5888/* Extract an annotation datum from a composition starting at POS and
ff0dacd7
KH
5889 ending before LIMIT of CODING->src_object (buffer or string), store
5890 the data in BUF, set *STOP to a starting position of the next
5891 composition (if any) or to LIMIT, and return the address of the
5892 next element of BUF.
5893
5894 If such an annotation is not found, set *STOP to a starting
5895 position of a composition after POS (if any) or to LIMIT, and
5896 return BUF. */
5897
5898static INLINE int *
5899handle_composition_annotation (pos, limit, coding, buf, stop)
5900 EMACS_INT pos, limit;
aaaf0b1e 5901 struct coding_system *coding;
ff0dacd7
KH
5902 int *buf;
5903 EMACS_INT *stop;
aaaf0b1e 5904{
ff0dacd7
KH
5905 EMACS_INT start, end;
5906 Lisp_Object prop;
aaaf0b1e 5907
ff0dacd7
KH
5908 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
5909 || end > limit)
5910 *stop = limit;
5911 else if (start > pos)
5912 *stop = start;
5913 else
aaaf0b1e 5914 {
ff0dacd7 5915 if (start == pos)
aaaf0b1e 5916 {
ff0dacd7
KH
5917 /* We found a composition. Store the corresponding
5918 annotation data in BUF. */
5919 int *head = buf;
5920 enum composition_method method = COMPOSITION_METHOD (prop);
5921 int nchars = COMPOSITION_LENGTH (prop);
5922
5923 ADD_COMPOSITION_DATA (buf, 0, nchars, method);
5924 if (method != COMPOSITION_RELATIVE)
aaaf0b1e 5925 {
ff0dacd7
KH
5926 Lisp_Object components;
5927 int len, i, i_byte;
5928
5929 components = COMPOSITION_COMPONENTS (prop);
5930 if (VECTORP (components))
aaaf0b1e 5931 {
ff0dacd7
KH
5932 len = XVECTOR (components)->size;
5933 for (i = 0; i < len; i++)
5934 *buf++ = XINT (AREF (components, i));
aaaf0b1e 5935 }
ff0dacd7 5936 else if (STRINGP (components))
aaaf0b1e 5937 {
8f924df7 5938 len = SCHARS (components);
ff0dacd7
KH
5939 i = i_byte = 0;
5940 while (i < len)
5941 {
5942 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
5943 buf++;
5944 }
5945 }
5946 else if (INTEGERP (components))
5947 {
5948 len = 1;
5949 *buf++ = XINT (components);
5950 }
5951 else if (CONSP (components))
5952 {
5953 for (len = 0; CONSP (components);
5954 len++, components = XCDR (components))
5955 *buf++ = XINT (XCAR (components));
aaaf0b1e 5956 }
aaaf0b1e 5957 else
ff0dacd7
KH
5958 abort ();
5959 *head -= len;
aaaf0b1e 5960 }
aaaf0b1e 5961 }
ff0dacd7
KH
5962
5963 if (find_composition (end, limit, &start, &end, &prop,
5964 coding->src_object)
5965 && end <= limit)
5966 *stop = start;
5967 else
5968 *stop = limit;
aaaf0b1e 5969 }
ff0dacd7
KH
5970 return buf;
5971}
5972
5973
e1c23804 5974/* Extract an annotation datum from a text property `charset' at POS of
ff0dacd7
KH
5975 CODING->src_object (buffer of string), store the data in BUF, set
5976 *STOP to the position where the value of `charset' property changes
5977 (limiting by LIMIT), and return the address of the next element of
5978 BUF.
5979
5980 If the property value is nil, set *STOP to the position where the
5981 property value is non-nil (limiting by LIMIT), and return BUF. */
5982
5983static INLINE int *
5984handle_charset_annotation (pos, limit, coding, buf, stop)
5985 EMACS_INT pos, limit;
5986 struct coding_system *coding;
5987 int *buf;
5988 EMACS_INT *stop;
5989{
5990 Lisp_Object val, next;
5991 int id;
5992
5993 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
5994 if (! NILP (val) && CHARSETP (val))
5995 id = XINT (CHARSET_SYMBOL_ID (val));
5996 else
5997 id = -1;
5998 ADD_CHARSET_DATA (buf, 0, 0, id);
5999 next = Fnext_single_property_change (make_number (pos), Qcharset,
6000 coding->src_object,
6001 make_number (limit));
6002 *stop = XINT (next);
6003 return buf;
6004}
6005
6006
df7492f9
KH
6007static void
6008consume_chars (coding)
6009 struct coding_system *coding;
6010{
6011 int *buf = coding->charbuf;
ff0dacd7 6012 int *buf_end = coding->charbuf + coding->charbuf_size;
7c78e542 6013 const unsigned char *src = coding->source + coding->consumed;
4776e638 6014 const unsigned char *src_end = coding->source + coding->src_bytes;
ff0dacd7
KH
6015 EMACS_INT pos = coding->src_pos + coding->consumed_char;
6016 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
df7492f9
KH
6017 int multibytep = coding->src_multibyte;
6018 Lisp_Object eol_type;
6019 int c;
ff0dacd7 6020 EMACS_INT stop, stop_composition, stop_charset;
88993dfd 6021
df7492f9
KH
6022 eol_type = CODING_ID_EOL_TYPE (coding->id);
6023 if (VECTORP (eol_type))
6024 eol_type = Qunix;
88993dfd 6025
df7492f9
KH
6026 /* Note: composition handling is not yet implemented. */
6027 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
ec6d2bb8 6028
ff0dacd7
KH
6029 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6030 stop = stop_composition = pos;
6031 else
6032 stop = stop_composition = end_pos;
6033 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6034 stop = stop_charset = pos;
df7492f9 6035 else
ff0dacd7 6036 stop_charset = end_pos;
ec6d2bb8 6037
ff0dacd7
KH
6038 /* Compensate for CRLF and annotation. */
6039 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
df7492f9 6040 while (buf < buf_end)
aaaf0b1e 6041 {
df7492f9 6042 if (pos == stop)
ec6d2bb8 6043 {
df7492f9
KH
6044 if (pos == end_pos)
6045 break;
ff0dacd7
KH
6046 if (pos == stop_composition)
6047 buf = handle_composition_annotation (pos, end_pos, coding,
6048 buf, &stop_composition);
6049 if (pos == stop_charset)
6050 buf = handle_charset_annotation (pos, end_pos, coding,
6051 buf, &stop_charset);
6052 stop = (stop_composition < stop_charset
6053 ? stop_composition : stop_charset);
df7492f9
KH
6054 }
6055
6056 if (! multibytep)
4776e638 6057 {
d3e4cb56 6058 EMACS_INT bytes;
aaaf0b1e 6059
d3e4cb56
KH
6060 if (! CODING_FOR_UNIBYTE (coding)
6061 && (bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
4776e638
KH
6062 c = STRING_CHAR_ADVANCE (src), pos += bytes;
6063 else
6064 c = *src++, pos++;
6065 }
df7492f9 6066 else
4776e638 6067 c = STRING_CHAR_ADVANCE (src), pos++;
df7492f9
KH
6068 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6069 c = '\n';
6070 if (! EQ (eol_type, Qunix))
aaaf0b1e 6071 {
df7492f9 6072 if (c == '\n')
aaaf0b1e 6073 {
df7492f9
KH
6074 if (EQ (eol_type, Qdos))
6075 *buf++ = '\r';
6076 else
6077 c = '\r';
aaaf0b1e
KH
6078 }
6079 }
df7492f9 6080 *buf++ = c;
aaaf0b1e 6081 }
ec6d2bb8 6082
df7492f9
KH
6083 coding->consumed = src - coding->source;
6084 coding->consumed_char = pos - coding->src_pos;
6085 coding->charbuf_used = buf - coding->charbuf;
6086 coding->chars_at_source = 0;
aaaf0b1e
KH
6087}
6088
4ed46869 6089
df7492f9
KH
6090/* Encode the text at CODING->src_object into CODING->dst_object.
6091 CODING->src_object is a buffer or a string.
6092 CODING->dst_object is a buffer or nil.
6093
6094 If CODING->src_object is a buffer, it must be the current buffer.
6095 In this case, if CODING->src_pos is positive, it is a position of
6096 the source text in the buffer, otherwise. the source text is in the
6097 gap area of the buffer, and coding->src_pos specifies the offset of
6098 the text from GPT (which must be the same as PT). If this is the
6099 same buffer as CODING->dst_object, CODING->src_pos must be
6100 negative and CODING should not have `pre-write-conversion'.
6101
6102 If CODING->src_object is a string, CODING should not have
6103 `pre-write-conversion'.
6104
6105 If CODING->dst_object is a buffer, the encoded data is inserted at
6106 the current point of that buffer.
6107
6108 If CODING->dst_object is nil, the encoded data is placed at the
6109 memory area specified by CODING->destination. */
6110
6111static int
6112encode_coding (coding)
4ed46869 6113 struct coding_system *coding;
4ed46869 6114{
df7492f9 6115 Lisp_Object attrs;
9861e777 6116
df7492f9 6117 attrs = CODING_ID_ATTRS (coding->id);
4ed46869 6118
df7492f9 6119 if (BUFFERP (coding->dst_object))
8844fa83 6120 {
df7492f9
KH
6121 set_buffer_internal (XBUFFER (coding->dst_object));
6122 coding->dst_multibyte
6123 = ! NILP (current_buffer->enable_multibyte_characters);
8844fa83 6124 }
4ed46869 6125
b73bfc1c 6126 coding->consumed = coding->consumed_char = 0;
df7492f9
KH
6127 coding->produced = coding->produced_char = 0;
6128 coding->result = CODING_RESULT_SUCCESS;
b73bfc1c 6129 coding->errors = 0;
b73bfc1c 6130
df7492f9 6131 ALLOC_CONVERSION_WORK_AREA (coding);
4ed46869 6132
df7492f9
KH
6133 do {
6134 coding_set_source (coding);
6135 consume_chars (coding);
4ed46869 6136
df7492f9 6137 if (!NILP (CODING_ATTR_ENCODE_TBL (attrs)))
da4109a9
KH
6138 translate_chars (coding, CODING_ATTR_ENCODE_TBL (attrs));
6139 else if (!NILP (Vstandard_translation_table_for_encode))
6140 translate_chars (coding, Vstandard_translation_table_for_encode);
b73bfc1c 6141
df7492f9
KH
6142 coding_set_destination (coding);
6143 (*(coding->encoder)) (coding);
6144 } while (coding->consumed_char < coding->src_chars);
6145
6146 if (BUFFERP (coding->dst_object))
6147 insert_from_gap (coding->produced_char, coding->produced);
6148
6149 return (coding->result);
ec6d2bb8
KH
6150}
6151
fb88bf2d 6152
4776e638
KH
6153/* Stack of working buffers used in code conversion. An nil element
6154 means that the code conversion of that level is not using a working
6155 buffer. */
df7492f9 6156Lisp_Object Vcode_conversion_work_buf_list;
d46c5b12 6157
df7492f9
KH
6158/* A working buffer used by the top level conversion. */
6159Lisp_Object Vcode_conversion_reused_work_buf;
b73bfc1c 6160
4ed46869 6161
df7492f9
KH
6162/* Return a working buffer that can be freely used by the following
6163 code conversion. MULTIBYTEP specifies the multibyteness of the
6164 buffer. */
b73bfc1c 6165
df7492f9 6166Lisp_Object
4776e638
KH
6167make_conversion_work_buffer (multibytep, depth)
6168 int multibytep, depth;
df7492f9
KH
6169{
6170 struct buffer *current = current_buffer;
4776e638 6171 Lisp_Object buf, name;
4ed46869 6172
4776e638 6173 if (depth == 0)
e133c8fa 6174 {
df7492f9
KH
6175 if (NILP (Vcode_conversion_reused_work_buf))
6176 Vcode_conversion_reused_work_buf
857dccb0 6177 = Fget_buffer_create (build_string (" *code-converting-work<0>*"));
4776e638 6178 buf = Vcode_conversion_reused_work_buf;
e133c8fa 6179 }
df7492f9 6180 else
d46c5b12 6181 {
4776e638 6182 if (depth < 0)
aaaf0b1e 6183 {
857dccb0 6184 name = build_string (" *code-converting-work*");
4776e638 6185 name = Fgenerate_new_buffer_name (name, Qnil);
aaaf0b1e 6186 }
4776e638 6187 else
9861e777 6188 {
4776e638 6189 char str[128];
d46c5b12 6190
857dccb0 6191 sprintf (str, " *code-converting-work*<%d>", depth);
4776e638
KH
6192 name = build_string (str);
6193 }
6194 buf = Fget_buffer_create (name);
b73bfc1c 6195 }
df7492f9
KH
6196 set_buffer_internal (XBUFFER (buf));
6197 current_buffer->undo_list = Qt;
6198 Ferase_buffer ();
8f924df7 6199 Fset_buffer_multibyte (multibytep ? Qt : Qnil);
df7492f9
KH
6200 set_buffer_internal (current);
6201 return buf;
6202}
d46c5b12 6203
4776e638
KH
6204static Lisp_Object
6205code_conversion_restore (buffer)
6206 Lisp_Object buffer;
6207{
6208 Lisp_Object workbuf;
6209
6210 workbuf = XCAR (Vcode_conversion_work_buf_list);
6211 if (! NILP (workbuf)
6212 && ! EQ (workbuf, Vcode_conversion_reused_work_buf)
6213 && ! NILP (Fbuffer_live_p (workbuf)))
6214 Fkill_buffer (workbuf);
6215 Vcode_conversion_work_buf_list = XCDR (Vcode_conversion_work_buf_list);
6216 set_buffer_internal (XBUFFER (buffer));
6217 return Qnil;
6218}
b73bfc1c 6219
4776e638
KH
6220static Lisp_Object
6221code_conversion_save (buffer, with_work_buf, multibyte)
6222 Lisp_Object buffer;
6223 int with_work_buf, multibyte;
df7492f9 6224{
4776e638 6225 Lisp_Object workbuf;
b73bfc1c 6226
4776e638 6227 if (with_work_buf)
b73bfc1c 6228 {
4776e638 6229 int depth = XINT (Flength (Vcode_conversion_work_buf_list));
b73bfc1c 6230
4776e638 6231 workbuf = make_conversion_work_buffer (multibyte, depth);
4ed46869 6232 }
4776e638
KH
6233 else
6234 workbuf = Qnil;
6235 Vcode_conversion_work_buf_list
6236 = Fcons (workbuf, Vcode_conversion_work_buf_list);
6237 record_unwind_protect (code_conversion_restore, buffer);
6238 return workbuf;
df7492f9 6239}
d46c5b12 6240
df7492f9
KH
6241int
6242decode_coding_gap (coding, chars, bytes)
6243 struct coding_system *coding;
6244 EMACS_INT chars, bytes;
6245{
6246 int count = specpdl_ptr - specpdl;
5e5c78be 6247 Lisp_Object attrs;
4776e638 6248 Lisp_Object buffer;
fb88bf2d 6249
4776e638
KH
6250 buffer = Fcurrent_buffer ();
6251 code_conversion_save (buffer, 0, 0);
ec6d2bb8 6252
4776e638 6253 coding->src_object = buffer;
df7492f9
KH
6254 coding->src_chars = chars;
6255 coding->src_bytes = bytes;
6256 coding->src_pos = -chars;
6257 coding->src_pos_byte = -bytes;
6258 coding->src_multibyte = chars < bytes;
5e5c78be 6259 coding->dst_object = buffer;
df7492f9
KH
6260 coding->dst_pos = PT;
6261 coding->dst_pos_byte = PT_BYTE;
71c81426 6262 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
98725083 6263 coding->mode |= CODING_MODE_LAST_BLOCK;
4ed46869 6264
df7492f9
KH
6265 if (CODING_REQUIRE_DETECTION (coding))
6266 detect_coding (coding);
8f924df7 6267
df7492f9 6268 decode_coding (coding);
d46c5b12 6269
5e5c78be
KH
6270 attrs = CODING_ID_ATTRS (coding->id);
6271 if (! NILP (CODING_ATTR_POST_READ (attrs)))
b73bfc1c 6272 {
5e5c78be
KH
6273 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6274 Lisp_Object val;
6275
6276 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
5e5c78be
KH
6277 val = call1 (CODING_ATTR_POST_READ (attrs),
6278 make_number (coding->produced_char));
5e5c78be
KH
6279 CHECK_NATNUM (val);
6280 coding->produced_char += Z - prev_Z;
6281 coding->produced += Z_BYTE - prev_Z_BYTE;
b73bfc1c 6282 }
4ed46869 6283
df7492f9 6284 unbind_to (count, Qnil);
b73bfc1c
KH
6285 return coding->result;
6286}
52d41803 6287
4ed46869 6288int
df7492f9 6289encode_coding_gap (coding, chars, bytes)
4ed46869 6290 struct coding_system *coding;
df7492f9 6291 EMACS_INT chars, bytes;
4ed46869 6292{
df7492f9
KH
6293 int count = specpdl_ptr - specpdl;
6294 Lisp_Object buffer;
4ed46869 6295
df7492f9 6296 buffer = Fcurrent_buffer ();
4776e638 6297 code_conversion_save (buffer, 0, 0);
4ed46869 6298
df7492f9
KH
6299 coding->src_object = buffer;
6300 coding->src_chars = chars;
6301 coding->src_bytes = bytes;
6302 coding->src_pos = -chars;
6303 coding->src_pos_byte = -bytes;
6304 coding->src_multibyte = chars < bytes;
6305 coding->dst_object = coding->src_object;
6306 coding->dst_pos = PT;
6307 coding->dst_pos_byte = PT_BYTE;
4ed46869 6308
df7492f9 6309 encode_coding (coding);
b73bfc1c 6310
df7492f9
KH
6311 unbind_to (count, Qnil);
6312 return coding->result;
6313}
4ed46869 6314
d46c5b12 6315
df7492f9
KH
6316/* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6317 SRC_OBJECT into DST_OBJECT by coding context CODING.
b73bfc1c 6318
df7492f9 6319 SRC_OBJECT is a buffer, a string, or Qnil.
b73bfc1c 6320
df7492f9
KH
6321 If it is a buffer, the text is at point of the buffer. FROM and TO
6322 are positions in the buffer.
b73bfc1c 6323
df7492f9
KH
6324 If it is a string, the text is at the beginning of the string.
6325 FROM and TO are indices to the string.
4ed46869 6326
df7492f9
KH
6327 If it is nil, the text is at coding->source. FROM and TO are
6328 indices to coding->source.
bb10be8b 6329
df7492f9 6330 DST_OBJECT is a buffer, Qt, or Qnil.
4ed46869 6331
df7492f9
KH
6332 If it is a buffer, the decoded text is inserted at point of the
6333 buffer. If the buffer is the same as SRC_OBJECT, the source text
6334 is deleted.
4ed46869 6335
df7492f9
KH
6336 If it is Qt, a string is made from the decoded text, and
6337 set in CODING->dst_object.
d46c5b12 6338
df7492f9 6339 If it is Qnil, the decoded text is stored at CODING->destination.
2cb26057 6340 The caller must allocate CODING->dst_bytes bytes at
df7492f9
KH
6341 CODING->destination by xmalloc. If the decoded text is longer than
6342 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6343 */
d46c5b12 6344
df7492f9
KH
6345void
6346decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6347 dst_object)
d46c5b12 6348 struct coding_system *coding;
df7492f9
KH
6349 Lisp_Object src_object;
6350 EMACS_INT from, from_byte, to, to_byte;
6351 Lisp_Object dst_object;
d46c5b12 6352{
df7492f9
KH
6353 int count = specpdl_ptr - specpdl;
6354 unsigned char *destination;
6355 EMACS_INT dst_bytes;
6356 EMACS_INT chars = to - from;
6357 EMACS_INT bytes = to_byte - from_byte;
6358 Lisp_Object attrs;
4776e638
KH
6359 Lisp_Object buffer;
6360 int saved_pt = -1, saved_pt_byte;
d46c5b12 6361
4776e638 6362 buffer = Fcurrent_buffer ();
93dec019 6363
df7492f9 6364 if (NILP (dst_object))
d46c5b12 6365 {
df7492f9
KH
6366 destination = coding->destination;
6367 dst_bytes = coding->dst_bytes;
d46c5b12 6368 }
93dec019 6369
df7492f9
KH
6370 coding->src_object = src_object;
6371 coding->src_chars = chars;
6372 coding->src_bytes = bytes;
6373 coding->src_multibyte = chars < bytes;
70ad9fc4 6374
df7492f9 6375 if (STRINGP (src_object))
d46c5b12 6376 {
df7492f9
KH
6377 coding->src_pos = from;
6378 coding->src_pos_byte = from_byte;
d46c5b12 6379 }
df7492f9 6380 else if (BUFFERP (src_object))
88993dfd 6381 {
df7492f9
KH
6382 set_buffer_internal (XBUFFER (src_object));
6383 if (from != GPT)
6384 move_gap_both (from, from_byte);
6385 if (EQ (src_object, dst_object))
fb88bf2d 6386 {
4776e638 6387 saved_pt = PT, saved_pt_byte = PT_BYTE;
df7492f9
KH
6388 TEMP_SET_PT_BOTH (from, from_byte);
6389 del_range_both (from, from_byte, to, to_byte, 1);
6390 coding->src_pos = -chars;
6391 coding->src_pos_byte = -bytes;
fb88bf2d 6392 }
df7492f9 6393 else
fb88bf2d 6394 {
df7492f9
KH
6395 coding->src_pos = from;
6396 coding->src_pos_byte = from_byte;
fb88bf2d 6397 }
88993dfd
KH
6398 }
6399
df7492f9
KH
6400 if (CODING_REQUIRE_DETECTION (coding))
6401 detect_coding (coding);
6402 attrs = CODING_ID_ATTRS (coding->id);
d46c5b12 6403
2cb26057
KH
6404 if (EQ (dst_object, Qt)
6405 || (! NILP (CODING_ATTR_POST_READ (attrs))
6406 && NILP (dst_object)))
b73bfc1c 6407 {
4776e638 6408 coding->dst_object = code_conversion_save (buffer, 1, 1);
df7492f9
KH
6409 coding->dst_pos = BEG;
6410 coding->dst_pos_byte = BEG_BYTE;
6411 coding->dst_multibyte = 1;
b73bfc1c 6412 }
df7492f9 6413 else if (BUFFERP (dst_object))
d46c5b12 6414 {
4776e638 6415 code_conversion_save (buffer, 0, 0);
df7492f9
KH
6416 coding->dst_object = dst_object;
6417 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6418 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6419 coding->dst_multibyte
6420 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
d46c5b12
KH
6421 }
6422 else
6423 {
4776e638 6424 code_conversion_save (buffer, 0, 0);
df7492f9
KH
6425 coding->dst_object = Qnil;
6426 coding->dst_multibyte = 1;
d46c5b12
KH
6427 }
6428
df7492f9 6429 decode_coding (coding);
fa46990e 6430
df7492f9
KH
6431 if (BUFFERP (coding->dst_object))
6432 set_buffer_internal (XBUFFER (coding->dst_object));
d46c5b12 6433
df7492f9 6434 if (! NILP (CODING_ATTR_POST_READ (attrs)))
d46c5b12 6435 {
df7492f9
KH
6436 struct gcpro gcpro1, gcpro2;
6437 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
2b4f9037 6438 Lisp_Object val;
d46c5b12 6439
c0cc7f7f 6440 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
df7492f9
KH
6441 GCPRO2 (coding->src_object, coding->dst_object);
6442 val = call1 (CODING_ATTR_POST_READ (attrs),
6443 make_number (coding->produced_char));
6444 UNGCPRO;
6445 CHECK_NATNUM (val);
6446 coding->produced_char += Z - prev_Z;
6447 coding->produced += Z_BYTE - prev_Z_BYTE;
d46c5b12 6448 }
de79a6a5 6449
df7492f9 6450 if (EQ (dst_object, Qt))
ec6d2bb8 6451 {
df7492f9
KH
6452 coding->dst_object = Fbuffer_string ();
6453 }
6454 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6455 {
6456 set_buffer_internal (XBUFFER (coding->dst_object));
6457 if (dst_bytes < coding->produced)
6458 {
6459 destination
6460 = (unsigned char *) xrealloc (destination, coding->produced);
6461 if (! destination)
6462 {
6463 coding->result = CODING_RESULT_INSUFFICIENT_DST;
6464 unbind_to (count, Qnil);
6465 return;
6466 }
6467 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6468 move_gap_both (BEGV, BEGV_BYTE);
6469 bcopy (BEGV_ADDR, destination, coding->produced);
6470 coding->destination = destination;
d46c5b12 6471 }
ec6d2bb8 6472 }
b73bfc1c 6473
4776e638
KH
6474 if (saved_pt >= 0)
6475 {
6476 /* This is the case of:
6477 (BUFFERP (src_object) && EQ (src_object, dst_object))
6478 As we have moved PT while replacing the original buffer
6479 contents, we must recover it now. */
6480 set_buffer_internal (XBUFFER (src_object));
6481 if (saved_pt < from)
6482 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6483 else if (saved_pt < from + chars)
6484 TEMP_SET_PT_BOTH (from, from_byte);
6485 else if (! NILP (current_buffer->enable_multibyte_characters))
6486 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6487 saved_pt_byte + (coding->produced - bytes));
6488 else
6489 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6490 saved_pt_byte + (coding->produced - bytes));
d46c5b12 6491 }
4776e638 6492
df7492f9 6493 unbind_to (count, Qnil);
d46c5b12
KH
6494}
6495
d46c5b12 6496
df7492f9
KH
6497void
6498encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6499 dst_object)
d46c5b12 6500 struct coding_system *coding;
df7492f9
KH
6501 Lisp_Object src_object;
6502 EMACS_INT from, from_byte, to, to_byte;
6503 Lisp_Object dst_object;
d46c5b12 6504{
b73bfc1c 6505 int count = specpdl_ptr - specpdl;
df7492f9
KH
6506 EMACS_INT chars = to - from;
6507 EMACS_INT bytes = to_byte - from_byte;
6508 Lisp_Object attrs;
4776e638
KH
6509 Lisp_Object buffer;
6510 int saved_pt = -1, saved_pt_byte;
df7492f9 6511
4776e638 6512 buffer = Fcurrent_buffer ();
df7492f9
KH
6513
6514 coding->src_object = src_object;
6515 coding->src_chars = chars;
6516 coding->src_bytes = bytes;
6517 coding->src_multibyte = chars < bytes;
6518
6519 attrs = CODING_ID_ATTRS (coding->id);
6520
6521 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6bac5b12 6522 {
4776e638
KH
6523 coding->src_object = code_conversion_save (buffer, 1,
6524 coding->src_multibyte);
df7492f9
KH
6525 set_buffer_internal (XBUFFER (coding->src_object));
6526 if (STRINGP (src_object))
6527 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6528 else if (BUFFERP (src_object))
6529 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6530 else
6531 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
d46c5b12 6532
df7492f9
KH
6533 if (EQ (src_object, dst_object))
6534 {
6535 set_buffer_internal (XBUFFER (src_object));
4776e638 6536 saved_pt = PT, saved_pt_byte = PT_BYTE;
df7492f9
KH
6537 del_range_both (from, from_byte, to, to_byte, 1);
6538 set_buffer_internal (XBUFFER (coding->src_object));
6539 }
6540
ac87bbef
KH
6541 call2 (CODING_ATTR_PRE_WRITE (attrs),
6542 make_number (BEG), make_number (Z));
6543 coding->src_object = Fcurrent_buffer ();
df7492f9
KH
6544 if (BEG != GPT)
6545 move_gap_both (BEG, BEG_BYTE);
6546 coding->src_chars = Z - BEG;
6547 coding->src_bytes = Z_BYTE - BEG_BYTE;
6548 coding->src_pos = BEG;
6549 coding->src_pos_byte = BEG_BYTE;
6550 coding->src_multibyte = Z < Z_BYTE;
6551 }
6552 else if (STRINGP (src_object))
d46c5b12 6553 {
4776e638 6554 code_conversion_save (buffer, 0, 0);
df7492f9
KH
6555 coding->src_pos = from;
6556 coding->src_pos_byte = from_byte;
b73bfc1c 6557 }
df7492f9 6558 else if (BUFFERP (src_object))
b73bfc1c 6559 {
4776e638 6560 code_conversion_save (buffer, 0, 0);
df7492f9 6561 set_buffer_internal (XBUFFER (src_object));
df7492f9 6562 if (EQ (src_object, dst_object))
d46c5b12 6563 {
4776e638 6564 saved_pt = PT, saved_pt_byte = PT_BYTE;
ff0dacd7
KH
6565 coding->src_object = del_range_1 (from, to, 1, 1);
6566 coding->src_pos = 0;
6567 coding->src_pos_byte = 0;
d46c5b12 6568 }
df7492f9 6569 else
d46c5b12 6570 {
ff0dacd7
KH
6571 if (from < GPT && to >= GPT)
6572 move_gap_both (from, from_byte);
df7492f9
KH
6573 coding->src_pos = from;
6574 coding->src_pos_byte = from_byte;
d46c5b12 6575 }
d46c5b12 6576 }
4776e638
KH
6577 else
6578 code_conversion_save (buffer, 0, 0);
d46c5b12 6579
df7492f9 6580 if (BUFFERP (dst_object))
88993dfd 6581 {
df7492f9 6582 coding->dst_object = dst_object;
28f67a95
KH
6583 if (EQ (src_object, dst_object))
6584 {
6585 coding->dst_pos = from;
6586 coding->dst_pos_byte = from_byte;
6587 }
6588 else
6589 {
6590 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6591 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6592 }
df7492f9
KH
6593 coding->dst_multibyte
6594 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
88993dfd 6595 }
df7492f9 6596 else if (EQ (dst_object, Qt))
d46c5b12 6597 {
df7492f9 6598 coding->dst_object = Qnil;
df7492f9 6599 coding->dst_bytes = coding->src_chars;
ac87bbef
KH
6600 if (coding->dst_bytes == 0)
6601 coding->dst_bytes = 1;
6602 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
df7492f9 6603 coding->dst_multibyte = 0;
d46c5b12
KH
6604 }
6605 else
6606 {
df7492f9
KH
6607 coding->dst_object = Qnil;
6608 coding->dst_multibyte = 0;
d46c5b12
KH
6609 }
6610
df7492f9 6611 encode_coding (coding);
d46c5b12 6612
df7492f9 6613 if (EQ (dst_object, Qt))
d46c5b12 6614 {
df7492f9
KH
6615 if (BUFFERP (coding->dst_object))
6616 coding->dst_object = Fbuffer_string ();
6617 else
d46c5b12 6618 {
df7492f9
KH
6619 coding->dst_object
6620 = make_unibyte_string ((char *) coding->destination,
6621 coding->produced);
6622 xfree (coding->destination);
d46c5b12 6623 }
4ed46869 6624 }
d46c5b12 6625
4776e638
KH
6626 if (saved_pt >= 0)
6627 {
6628 /* This is the case of:
6629 (BUFFERP (src_object) && EQ (src_object, dst_object))
6630 As we have moved PT while replacing the original buffer
6631 contents, we must recover it now. */
6632 set_buffer_internal (XBUFFER (src_object));
6633 if (saved_pt < from)
6634 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6635 else if (saved_pt < from + chars)
6636 TEMP_SET_PT_BOTH (from, from_byte);
6637 else if (! NILP (current_buffer->enable_multibyte_characters))
6638 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6639 saved_pt_byte + (coding->produced - bytes));
d46c5b12 6640 else
4776e638
KH
6641 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6642 saved_pt_byte + (coding->produced - bytes));
6643 }
6644
df7492f9 6645 unbind_to (count, Qnil);
b73bfc1c
KH
6646}
6647
df7492f9 6648
b73bfc1c 6649Lisp_Object
df7492f9 6650preferred_coding_system ()
b73bfc1c 6651{
df7492f9 6652 int id = coding_categories[coding_priorities[0]].id;
2391eaa4 6653
df7492f9 6654 return CODING_ID_NAME (id);
4ed46869
KH
6655}
6656
6657\f
6658#ifdef emacs
1397dc18 6659/*** 8. Emacs Lisp library functions ***/
4ed46869 6660
4ed46869 6661DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
48b0f3ae 6662 doc: /* Return t if OBJECT is nil or a coding-system.
df7492f9 6663See the documentation of `define-coding-system' for information
48b0f3ae
PJ
6664about coding-system objects. */)
6665 (obj)
4ed46869
KH
6666 Lisp_Object obj;
6667{
df7492f9 6668 return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
4ed46869
KH
6669}
6670
9d991de8
RS
6671DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6672 Sread_non_nil_coding_system, 1, 1, 0,
48b0f3ae
PJ
6673 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6674 (prompt)
4ed46869
KH
6675 Lisp_Object prompt;
6676{
e0e989f6 6677 Lisp_Object val;
9d991de8
RS
6678 do
6679 {
4608c386
KH
6680 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6681 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
9d991de8 6682 }
8f924df7 6683 while (SCHARS (val) == 0);
e0e989f6 6684 return (Fintern (val, Qnil));
4ed46869
KH
6685}
6686
9b787f3e 6687DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
48b0f3ae
PJ
6688 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6689If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6690 (prompt, default_coding_system)
9b787f3e 6691 Lisp_Object prompt, default_coding_system;
4ed46869 6692{
f44d27ce 6693 Lisp_Object val;
9b787f3e 6694 if (SYMBOLP (default_coding_system))
a3181084 6695 XSETSTRING (default_coding_system, XPNTR (SYMBOL_NAME (default_coding_system)));
4608c386 6696 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
9b787f3e
RS
6697 Qt, Qnil, Qcoding_system_history,
6698 default_coding_system, Qnil);
8f924df7 6699 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
6700}
6701
6702DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6703 1, 1, 0,
48b0f3ae 6704 doc: /* Check validity of CODING-SYSTEM.
b054002f 6705If valid, return CODING-SYSTEM, else signal a `coding-system-error' error. */)
df7492f9 6706 (coding_system)
4ed46869
KH
6707 Lisp_Object coding_system;
6708{
b7826503 6709 CHECK_SYMBOL (coding_system);
4ed46869
KH
6710 if (!NILP (Fcoding_system_p (coding_system)))
6711 return coding_system;
6712 while (1)
02ba4723 6713 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4ed46869 6714}
df7492f9 6715
3a73fa5d 6716\f
89528eb3
KH
6717/* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
6718 HIGHEST is nonzero, return the coding system of the highest
6719 priority among the detected coding systems. Otherwize return a
6720 list of detected coding systems sorted by their priorities. If
6721 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
6722 multibyte form but contains only ASCII and eight-bit chars.
6723 Otherwise, the bytes are raw bytes.
6724
6725 CODING-SYSTEM controls the detection as below:
6726
6727 If it is nil, detect both text-format and eol-format. If the
6728 text-format part of CODING-SYSTEM is already specified
6729 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
6730 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
6731 detect only text-format. */
6732
d46c5b12 6733Lisp_Object
df7492f9 6734detect_coding_system (src, src_bytes, highest, multibytep, coding_system)
8f924df7 6735 const unsigned char *src;
d46c5b12 6736 int src_bytes, highest;
0a28aafb 6737 int multibytep;
df7492f9 6738 Lisp_Object coding_system;
4ed46869 6739{
8f924df7 6740 const unsigned char *src_end = src + src_bytes;
df7492f9
KH
6741 Lisp_Object attrs, eol_type;
6742 Lisp_Object val;
6743 struct coding_system coding;
89528eb3 6744 int id;
ff0dacd7 6745 struct coding_detection_info detect_info;
b73bfc1c 6746
df7492f9
KH
6747 if (NILP (coding_system))
6748 coding_system = Qundecided;
6749 setup_coding_system (coding_system, &coding);
6750 attrs = CODING_ID_ATTRS (coding.id);
6751 eol_type = CODING_ID_EOL_TYPE (coding.id);
89528eb3 6752 coding_system = CODING_ATTR_BASE_NAME (attrs);
4ed46869 6753
df7492f9
KH
6754 coding.source = src;
6755 coding.src_bytes = src_bytes;
6756 coding.src_multibyte = multibytep;
6757 coding.consumed = 0;
89528eb3 6758 coding.mode |= CODING_MODE_LAST_BLOCK;
d46c5b12 6759
ff0dacd7 6760 detect_info.checked = detect_info.found = detect_info.rejected = 0;
d46c5b12 6761
89528eb3
KH
6762 /* At first, detect text-format if necessary. */
6763 if (XINT (CODING_ATTR_CATEGORY (attrs)) == coding_category_undecided)
4ed46869 6764 {
ff0dacd7
KH
6765 enum coding_category category;
6766 struct coding_system *this;
6767 int c, i;
88993dfd 6768
df7492f9 6769 for (; src < src_end; src++)
4ed46869 6770 {
df7492f9 6771 c = *src;
89528eb3
KH
6772 if (c & 0x80
6773 || (c < 0x20 && (c == ISO_CODE_ESC
6774 || c == ISO_CODE_SI
584948ac 6775 || c == ISO_CODE_SO)))
d46c5b12 6776 break;
4ed46869 6777 }
df7492f9 6778 coding.head_ascii = src - coding.source;
88993dfd 6779
df7492f9
KH
6780 if (src < src_end)
6781 for (i = 0; i < coding_category_raw_text; i++)
6782 {
ff0dacd7
KH
6783 category = coding_priorities[i];
6784 this = coding_categories + category;
b843d1ae 6785
df7492f9
KH
6786 if (this->id < 0)
6787 {
6788 /* No coding system of this category is defined. */
ff0dacd7 6789 detect_info.rejected |= (1 << category);
df7492f9 6790 }
ff0dacd7 6791 else if (category >= coding_category_raw_text)
89528eb3 6792 continue;
ff0dacd7
KH
6793 else if (detect_info.checked & (1 << category))
6794 {
6795 if (highest
6796 && (detect_info.found & (1 << category)))
6797 break;
6798 }
df7492f9
KH
6799 else
6800 {
ff0dacd7 6801 if ((*(this->detector)) (&coding, &detect_info)
89528eb3 6802 && highest
ff0dacd7
KH
6803 && (detect_info.found & (1 << category)))
6804 break;
df7492f9
KH
6805 }
6806 }
ec6d2bb8 6807
ec6d2bb8 6808
ff0dacd7 6809 if (detect_info.rejected == CATEGORY_MASK_ANY)
ec6d2bb8 6810 {
ff0dacd7 6811 detect_info.found = CATEGORY_MASK_RAW_TEXT;
89528eb3
KH
6812 id = coding_categories[coding_category_raw_text].id;
6813 val = Fcons (make_number (id), Qnil);
6814 }
ff0dacd7 6815 else if (! detect_info.rejected && ! detect_info.found)
89528eb3 6816 {
ff0dacd7 6817 detect_info.found = CATEGORY_MASK_ANY;
89528eb3
KH
6818 id = coding_categories[coding_category_undecided].id;
6819 val = Fcons (make_number (id), Qnil);
6820 }
6821 else if (highest)
6822 {
ff0dacd7 6823 if (detect_info.found)
ec6d2bb8 6824 {
ff0dacd7
KH
6825 detect_info.found = 1 << category;
6826 val = Fcons (make_number (this->id), Qnil);
6827 }
6828 else
6829 for (i = 0; i < coding_category_raw_text; i++)
6830 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6831 {
6832 detect_info.found = 1 << coding_priorities[i];
6833 id = coding_categories[coding_priorities[i]].id;
6834 val = Fcons (make_number (id), Qnil);
6835 break;
6836 }
6837 }
89528eb3
KH
6838 else
6839 {
ff0dacd7
KH
6840 int mask = detect_info.rejected | detect_info.found;
6841 int found = 0;
89528eb3 6842 val = Qnil;
ec6d2bb8 6843
89528eb3 6844 for (i = coding_category_raw_text - 1; i >= 0; i--)
ff0dacd7
KH
6845 {
6846 category = coding_priorities[i];
6847 if (! (mask & (1 << category)))
ec6d2bb8 6848 {
ff0dacd7
KH
6849 found |= 1 << category;
6850 id = coding_categories[category].id;
6851 val = Fcons (make_number (id), val);
6852 }
6853 }
6854 for (i = coding_category_raw_text - 1; i >= 0; i--)
6855 {
6856 category = coding_priorities[i];
6857 if (detect_info.found & (1 << category))
6858 {
6859 id = coding_categories[category].id;
6860 val = Fcons (make_number (id), val);
ec6d2bb8 6861 }
ec6d2bb8 6862 }
ff0dacd7 6863 detect_info.found |= found;
ec6d2bb8 6864 }
ec6d2bb8 6865 }
df7492f9
KH
6866 else
6867 {
ff0dacd7 6868 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
89528eb3 6869 val = Fcons (make_number (coding.id), Qnil);
4ed46869 6870 }
df7492f9 6871
89528eb3 6872 /* Then, detect eol-format if necessary. */
df7492f9 6873 {
89528eb3 6874 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
df7492f9
KH
6875 Lisp_Object tail;
6876
89528eb3
KH
6877 if (VECTORP (eol_type))
6878 {
ff0dacd7 6879 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
89528eb3
KH
6880 normal_eol = detect_eol (coding.source, src_bytes,
6881 coding_category_raw_text);
ff0dacd7
KH
6882 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
6883 | CATEGORY_MASK_UTF_16_BE_NOSIG))
89528eb3
KH
6884 utf_16_be_eol = detect_eol (coding.source, src_bytes,
6885 coding_category_utf_16_be);
ff0dacd7
KH
6886 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
6887 | CATEGORY_MASK_UTF_16_LE_NOSIG))
89528eb3
KH
6888 utf_16_le_eol = detect_eol (coding.source, src_bytes,
6889 coding_category_utf_16_le);
6890 }
6891 else
6892 {
6893 if (EQ (eol_type, Qunix))
6894 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
6895 else if (EQ (eol_type, Qdos))
6896 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
6897 else
6898 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
6899 }
6900
df7492f9
KH
6901 for (tail = val; CONSP (tail); tail = XCDR (tail))
6902 {
89528eb3 6903 enum coding_category category;
df7492f9 6904 int this_eol;
89528eb3
KH
6905
6906 id = XINT (XCAR (tail));
6907 attrs = CODING_ID_ATTRS (id);
6908 category = XINT (CODING_ATTR_CATEGORY (attrs));
6909 eol_type = CODING_ID_EOL_TYPE (id);
df7492f9
KH
6910 if (VECTORP (eol_type))
6911 {
89528eb3
KH
6912 if (category == coding_category_utf_16_be
6913 || category == coding_category_utf_16_be_nosig)
6914 this_eol = utf_16_be_eol;
6915 else if (category == coding_category_utf_16_le
6916 || category == coding_category_utf_16_le_nosig)
6917 this_eol = utf_16_le_eol;
df7492f9 6918 else
89528eb3
KH
6919 this_eol = normal_eol;
6920
df7492f9
KH
6921 if (this_eol == EOL_SEEN_LF)
6922 XSETCAR (tail, AREF (eol_type, 0));
6923 else if (this_eol == EOL_SEEN_CRLF)
6924 XSETCAR (tail, AREF (eol_type, 1));
6925 else if (this_eol == EOL_SEEN_CR)
6926 XSETCAR (tail, AREF (eol_type, 2));
89528eb3
KH
6927 else
6928 XSETCAR (tail, CODING_ID_NAME (id));
df7492f9 6929 }
89528eb3
KH
6930 else
6931 XSETCAR (tail, CODING_ID_NAME (id));
df7492f9
KH
6932 }
6933 }
ec6d2bb8 6934
03699b14 6935 return (highest ? XCAR (val) : val);
ec6d2bb8
KH
6936}
6937
ec6d2bb8 6938
d46c5b12
KH
6939DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6940 2, 3, 0,
48b0f3ae
PJ
6941 doc: /* Detect coding system of the text in the region between START and END.
6942Return a list of possible coding systems ordered by priority.
ec6d2bb8 6943
48b0f3ae
PJ
6944If only ASCII characters are found, it returns a list of single element
6945`undecided' or its subsidiary coding system according to a detected
6946end-of-line format.
ec6d2bb8 6947
48b0f3ae
PJ
6948If optional argument HIGHEST is non-nil, return the coding system of
6949highest priority. */)
6950 (start, end, highest)
d46c5b12
KH
6951 Lisp_Object start, end, highest;
6952{
6953 int from, to;
6954 int from_byte, to_byte;
ec6d2bb8 6955
b7826503
PJ
6956 CHECK_NUMBER_COERCE_MARKER (start);
6957 CHECK_NUMBER_COERCE_MARKER (end);
ec6d2bb8 6958
d46c5b12
KH
6959 validate_region (&start, &end);
6960 from = XINT (start), to = XINT (end);
6961 from_byte = CHAR_TO_BYTE (from);
6962 to_byte = CHAR_TO_BYTE (to);
ec6d2bb8 6963
d46c5b12
KH
6964 if (from < GPT && to >= GPT)
6965 move_gap_both (to, to_byte);
c210f766 6966
d46c5b12 6967 return detect_coding_system (BYTE_POS_ADDR (from_byte),
df7492f9 6968 to_byte - from_byte,
0a28aafb
KH
6969 !NILP (highest),
6970 !NILP (current_buffer
df7492f9
KH
6971 ->enable_multibyte_characters),
6972 Qnil);
ec6d2bb8
KH
6973}
6974
d46c5b12
KH
6975DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6976 1, 2, 0,
48b0f3ae
PJ
6977 doc: /* Detect coding system of the text in STRING.
6978Return a list of possible coding systems ordered by priority.
fb88bf2d 6979
48b0f3ae
PJ
6980If only ASCII characters are found, it returns a list of single element
6981`undecided' or its subsidiary coding system according to a detected
6982end-of-line format.
d46c5b12 6983
48b0f3ae
PJ
6984If optional argument HIGHEST is non-nil, return the coding system of
6985highest priority. */)
6986 (string, highest)
d46c5b12
KH
6987 Lisp_Object string, highest;
6988{
b7826503 6989 CHECK_STRING (string);
b73bfc1c 6990
8f924df7
KH
6991 return detect_coding_system (SDATA (string), SBYTES (string),
6992 !NILP (highest), STRING_MULTIBYTE (string),
df7492f9 6993 Qnil);
4ed46869 6994}
4ed46869 6995
b73bfc1c 6996
df7492f9
KH
6997static INLINE int
6998char_encodable_p (c, attrs)
6999 int c;
7000 Lisp_Object attrs;
05e6f5dc 7001{
df7492f9 7002 Lisp_Object tail;
df7492f9 7003 struct charset *charset;
d46c5b12 7004
df7492f9
KH
7005 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
7006 CONSP (tail); tail = XCDR (tail))
e133c8fa 7007 {
df7492f9
KH
7008 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7009 if (CHAR_CHARSET_P (c, charset))
7010 break;
e133c8fa 7011 }
df7492f9 7012 return (! NILP (tail));
05e6f5dc 7013}
83fa074f 7014
fb88bf2d 7015
df7492f9
KH
7016/* Return a list of coding systems that safely encode the text between
7017 START and END. If EXCLUDE is non-nil, it is a list of coding
7018 systems not to check. The returned list doesn't contain any such
48468dac 7019 coding systems. In any case, if the text contains only ASCII or is
df7492f9 7020 unibyte, return t. */
e077cc80 7021
df7492f9
KH
7022DEFUN ("find-coding-systems-region-internal",
7023 Ffind_coding_systems_region_internal,
7024 Sfind_coding_systems_region_internal, 2, 3, 0,
7025 doc: /* Internal use only. */)
7026 (start, end, exclude)
7027 Lisp_Object start, end, exclude;
7028{
7029 Lisp_Object coding_attrs_list, safe_codings;
7030 EMACS_INT start_byte, end_byte;
7c78e542 7031 const unsigned char *p, *pbeg, *pend;
df7492f9
KH
7032 int c;
7033 Lisp_Object tail, elt;
d46c5b12 7034
df7492f9
KH
7035 if (STRINGP (start))
7036 {
7037 if (!STRING_MULTIBYTE (start)
8f924df7 7038 || SCHARS (start) == SBYTES (start))
df7492f9
KH
7039 return Qt;
7040 start_byte = 0;
8f924df7 7041 end_byte = SBYTES (start);
df7492f9
KH
7042 }
7043 else
d46c5b12 7044 {
df7492f9
KH
7045 CHECK_NUMBER_COERCE_MARKER (start);
7046 CHECK_NUMBER_COERCE_MARKER (end);
7047 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7048 args_out_of_range (start, end);
7049 if (NILP (current_buffer->enable_multibyte_characters))
7050 return Qt;
7051 start_byte = CHAR_TO_BYTE (XINT (start));
7052 end_byte = CHAR_TO_BYTE (XINT (end));
7053 if (XINT (end) - XINT (start) == end_byte - start_byte)
7054 return Qt;
d46c5b12 7055
e1c23804 7056 if (XINT (start) < GPT && XINT (end) > GPT)
d46c5b12 7057 {
e1c23804
DL
7058 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7059 move_gap_both (XINT (start), start_byte);
df7492f9 7060 else
e1c23804 7061 move_gap_both (XINT (end), end_byte);
d46c5b12
KH
7062 }
7063 }
7064
df7492f9
KH
7065 coding_attrs_list = Qnil;
7066 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7067 if (NILP (exclude)
7068 || NILP (Fmemq (XCAR (tail), exclude)))
7069 {
7070 Lisp_Object attrs;
d46c5b12 7071
df7492f9
KH
7072 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7073 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7074 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7075 coding_attrs_list = Fcons (attrs, coding_attrs_list);
7076 }
d46c5b12 7077
df7492f9 7078 if (STRINGP (start))
8f924df7 7079 p = pbeg = SDATA (start);
df7492f9
KH
7080 else
7081 p = pbeg = BYTE_POS_ADDR (start_byte);
7082 pend = p + (end_byte - start_byte);
b843d1ae 7083
df7492f9
KH
7084 while (p < pend && ASCII_BYTE_P (*p)) p++;
7085 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
d46c5b12 7086
05e6f5dc 7087 while (p < pend)
72d1a715 7088 {
df7492f9
KH
7089 if (ASCII_BYTE_P (*p))
7090 p++;
72d1a715
RS
7091 else
7092 {
df7492f9 7093 c = STRING_CHAR_ADVANCE (p);
12410ef1 7094
df7492f9
KH
7095 charset_map_loaded = 0;
7096 for (tail = coding_attrs_list; CONSP (tail);)
7097 {
7098 elt = XCAR (tail);
7099 if (NILP (elt))
7100 tail = XCDR (tail);
7101 else if (char_encodable_p (c, elt))
7102 tail = XCDR (tail);
7103 else if (CONSP (XCDR (tail)))
7104 {
7105 XSETCAR (tail, XCAR (XCDR (tail)));
7106 XSETCDR (tail, XCDR (XCDR (tail)));
7107 }
7108 else
7109 {
7110 XSETCAR (tail, Qnil);
7111 tail = XCDR (tail);
7112 }
7113 }
7114 if (charset_map_loaded)
7115 {
7116 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
05e6f5dc 7117
df7492f9 7118 if (STRINGP (start))
8f924df7 7119 pbeg = SDATA (start);
df7492f9
KH
7120 else
7121 pbeg = BYTE_POS_ADDR (start_byte);
7122 p = pbeg + p_offset;
7123 pend = pbeg + pend_offset;
7124 }
7125 }
ec6d2bb8 7126 }
fb88bf2d 7127
df7492f9
KH
7128 safe_codings = Qnil;
7129 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7130 if (! NILP (XCAR (tail)))
7131 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
ec6d2bb8 7132
05e6f5dc
KH
7133 return safe_codings;
7134}
4956c225 7135
d46c5b12 7136
8f924df7
KH
7137DEFUN ("unencodable-char-position", Funencodable_char_position,
7138 Sunencodable_char_position, 3, 5, 0,
7139 doc: /*
7140Return position of first un-encodable character in a region.
7141START and END specfiy the region and CODING-SYSTEM specifies the
7142encoding to check. Return nil if CODING-SYSTEM does encode the region.
d46c5b12 7143
8f924df7
KH
7144If optional 4th argument COUNT is non-nil, it specifies at most how
7145many un-encodable characters to search. In this case, the value is a
7146list of positions.
d46c5b12 7147
8f924df7
KH
7148If optional 5th argument STRING is non-nil, it is a string to search
7149for un-encodable characters. In that case, START and END are indexes
7150to the string. */)
7151 (start, end, coding_system, count, string)
7152 Lisp_Object start, end, coding_system, count, string;
7153{
7154 int n;
7155 struct coding_system coding;
7156 Lisp_Object attrs, charset_list;
7157 Lisp_Object positions;
7158 int from, to;
7159 const unsigned char *p, *stop, *pend;
7160 int ascii_compatible;
fb88bf2d 7161
8f924df7
KH
7162 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7163 attrs = CODING_ID_ATTRS (coding.id);
7164 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
7165 return Qnil;
7166 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
7167 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
fb88bf2d 7168
8f924df7
KH
7169 if (NILP (string))
7170 {
7171 validate_region (&start, &end);
7172 from = XINT (start);
7173 to = XINT (end);
7174 if (NILP (current_buffer->enable_multibyte_characters)
7175 || (ascii_compatible
7176 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
7177 return Qnil;
7178 p = CHAR_POS_ADDR (from);
7179 pend = CHAR_POS_ADDR (to);
7180 if (from < GPT && to >= GPT)
7181 stop = GPT_ADDR;
7182 else
7183 stop = pend;
7184 }
7185 else
7186 {
7187 CHECK_STRING (string);
7188 CHECK_NATNUM (start);
7189 CHECK_NATNUM (end);
7190 from = XINT (start);
7191 to = XINT (end);
7192 if (from > to
7193 || to > SCHARS (string))
7194 args_out_of_range_3 (string, start, end);
7195 if (! STRING_MULTIBYTE (string))
7196 return Qnil;
7197 p = SDATA (string) + string_char_to_byte (string, from);
7198 stop = pend = SDATA (string) + string_char_to_byte (string, to);
7199 if (ascii_compatible && (to - from) == (pend - p))
7200 return Qnil;
7201 }
f2558efd 7202
8f924df7
KH
7203 if (NILP (count))
7204 n = 1;
7205 else
b73bfc1c 7206 {
8f924df7
KH
7207 CHECK_NATNUM (count);
7208 n = XINT (count);
b73bfc1c
KH
7209 }
7210
8f924df7
KH
7211 positions = Qnil;
7212 while (1)
d46c5b12 7213 {
8f924df7 7214 int c;
ec6d2bb8 7215
8f924df7
KH
7216 if (ascii_compatible)
7217 while (p < stop && ASCII_BYTE_P (*p))
7218 p++, from++;
7219 if (p >= stop)
0e79d667 7220 {
8f924df7
KH
7221 if (p >= pend)
7222 break;
7223 stop = pend;
7224 p = GAP_END_ADDR;
0e79d667 7225 }
ec6d2bb8 7226
8f924df7
KH
7227 c = STRING_CHAR_ADVANCE (p);
7228 if (! (ASCII_CHAR_P (c) && ascii_compatible)
7229 && ! char_charset (c, charset_list, NULL))
ec6d2bb8 7230 {
8f924df7
KH
7231 positions = Fcons (make_number (from), positions);
7232 n--;
7233 if (n == 0)
7234 break;
ec6d2bb8
KH
7235 }
7236
8f924df7
KH
7237 from++;
7238 }
d46c5b12 7239
8f924df7
KH
7240 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
7241}
d46c5b12 7242
d46c5b12 7243
df7492f9
KH
7244DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7245 Scheck_coding_systems_region, 3, 3, 0,
7246 doc: /* Check if the region is encodable by coding systems.
d46c5b12 7247
df7492f9
KH
7248START and END are buffer positions specifying the region.
7249CODING-SYSTEM-LIST is a list of coding systems to check.
d46c5b12 7250
df7492f9
KH
7251The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7252CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7253whole region, POS0, POS1, ... are buffer positions where non-encodable
7254characters are found.
93dec019 7255
df7492f9
KH
7256If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7257value is nil.
93dec019 7258
df7492f9
KH
7259START may be a string. In that case, check if the string is
7260encodable, and the value contains indices to the string instead of
7261buffer positions. END is ignored. */)
7262 (start, end, coding_system_list)
7263 Lisp_Object start, end, coding_system_list;
05e6f5dc 7264{
df7492f9
KH
7265 Lisp_Object list;
7266 EMACS_INT start_byte, end_byte;
7267 int pos;
7c78e542 7268 const unsigned char *p, *pbeg, *pend;
df7492f9
KH
7269 int c;
7270 Lisp_Object tail, elt;
70ad9fc4 7271
05e6f5dc
KH
7272 if (STRINGP (start))
7273 {
df7492f9 7274 if (!STRING_MULTIBYTE (start)
8f924df7 7275 && SCHARS (start) != SBYTES (start))
df7492f9
KH
7276 return Qnil;
7277 start_byte = 0;
8f924df7 7278 end_byte = SBYTES (start);
df7492f9 7279 pos = 0;
d46c5b12 7280 }
05e6f5dc 7281 else
b73bfc1c 7282 {
b7826503
PJ
7283 CHECK_NUMBER_COERCE_MARKER (start);
7284 CHECK_NUMBER_COERCE_MARKER (end);
05e6f5dc
KH
7285 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7286 args_out_of_range (start, end);
7287 if (NILP (current_buffer->enable_multibyte_characters))
df7492f9
KH
7288 return Qnil;
7289 start_byte = CHAR_TO_BYTE (XINT (start));
7290 end_byte = CHAR_TO_BYTE (XINT (end));
7291 if (XINT (end) - XINT (start) == end_byte - start_byte)
05e6f5dc 7292 return Qt;
df7492f9 7293
e1c23804 7294 if (XINT (start) < GPT && XINT (end) > GPT)
b73bfc1c 7295 {
e1c23804
DL
7296 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7297 move_gap_both (XINT (start), start_byte);
df7492f9 7298 else
e1c23804 7299 move_gap_both (XINT (end), end_byte);
b73bfc1c 7300 }
e1c23804 7301 pos = XINT (start);
b73bfc1c 7302 }
7553d0e1 7303
df7492f9
KH
7304 list = Qnil;
7305 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
12410ef1 7306 {
df7492f9
KH
7307 elt = XCAR (tail);
7308 list = Fcons (Fcons (elt, Fcons (AREF (CODING_SYSTEM_SPEC (elt), 0),
7309 Qnil)),
7310 list);
12410ef1
KH
7311 }
7312
df7492f9 7313 if (STRINGP (start))
8f924df7 7314 p = pbeg = SDATA (start);
72d1a715 7315 else
df7492f9
KH
7316 p = pbeg = BYTE_POS_ADDR (start_byte);
7317 pend = p + (end_byte - start_byte);
4ed46869 7318
df7492f9
KH
7319 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7320 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
ec6d2bb8 7321
df7492f9 7322 while (p < pend)
d46c5b12 7323 {
df7492f9
KH
7324 if (ASCII_BYTE_P (*p))
7325 p++;
e133c8fa 7326 else
05e6f5dc 7327 {
df7492f9
KH
7328 c = STRING_CHAR_ADVANCE (p);
7329
7330 charset_map_loaded = 0;
7331 for (tail = list; CONSP (tail); tail = XCDR (tail))
7332 {
7333 elt = XCDR (XCAR (tail));
7334 if (! char_encodable_p (c, XCAR (elt)))
7335 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7336 }
7337 if (charset_map_loaded)
7338 {
7339 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7340
7341 if (STRINGP (start))
8f924df7 7342 pbeg = SDATA (start);
df7492f9
KH
7343 else
7344 pbeg = BYTE_POS_ADDR (start_byte);
7345 p = pbeg + p_offset;
7346 pend = pbeg + pend_offset;
7347 }
05e6f5dc 7348 }
df7492f9 7349 pos++;
d46c5b12 7350 }
4ed46869 7351
df7492f9
KH
7352 tail = list;
7353 list = Qnil;
7354 for (; CONSP (tail); tail = XCDR (tail))
ec6d2bb8 7355 {
df7492f9
KH
7356 elt = XCAR (tail);
7357 if (CONSP (XCDR (XCDR (elt))))
7358 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7359 list);
ec6d2bb8 7360 }
2b4f9037 7361
df7492f9 7362 return list;
d46c5b12
KH
7363}
7364
3fd9494b 7365
b73bfc1c
KH
7366
7367Lisp_Object
df7492f9
KH
7368code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7369 Lisp_Object start, end, coding_system, dst_object;
7370 int encodep, norecord;
4ed46869 7371{
3a73fa5d 7372 struct coding_system coding;
df7492f9
KH
7373 EMACS_INT from, from_byte, to, to_byte;
7374 Lisp_Object src_object;
4ed46869 7375
b7826503
PJ
7376 CHECK_NUMBER_COERCE_MARKER (start);
7377 CHECK_NUMBER_COERCE_MARKER (end);
df7492f9
KH
7378 if (NILP (coding_system))
7379 coding_system = Qno_conversion;
7380 else
7381 CHECK_CODING_SYSTEM (coding_system);
7382 src_object = Fcurrent_buffer ();
7383 if (NILP (dst_object))
7384 dst_object = src_object;
7385 else if (! EQ (dst_object, Qt))
7386 CHECK_BUFFER (dst_object);
3a73fa5d 7387
d46c5b12
KH
7388 validate_region (&start, &end);
7389 from = XFASTINT (start);
df7492f9 7390 from_byte = CHAR_TO_BYTE (from);
d46c5b12 7391 to = XFASTINT (end);
df7492f9 7392 to_byte = CHAR_TO_BYTE (to);
764ca8da 7393
df7492f9
KH
7394 setup_coding_system (coding_system, &coding);
7395 coding.mode |= CODING_MODE_LAST_BLOCK;
ec6d2bb8 7396
df7492f9
KH
7397 if (encodep)
7398 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7399 dst_object);
7400 else
7401 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7402 dst_object);
7403 if (! norecord)
7404 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
ec6d2bb8 7405
df7492f9
KH
7406 if (coding.result != CODING_RESULT_SUCCESS)
7407 error ("Code conversion error: %d", coding.result);
b73bfc1c 7408
df7492f9
KH
7409 return (BUFFERP (dst_object)
7410 ? make_number (coding.produced_char)
7411 : coding.dst_object);
4031e2bf 7412}
78108bcd 7413
4ed46869 7414
4031e2bf 7415DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
df7492f9 7416 3, 4, "r\nzCoding system: ",
48b0f3ae 7417 doc: /* Decode the current region from the specified coding system.
df7492f9
KH
7418When called from a program, takes four arguments:
7419 START, END, CODING-SYSTEM, and DESTINATION.
7420START and END are buffer positions.
8844fa83 7421
df7492f9
KH
7422Optional 4th arguments DESTINATION specifies where the decoded text goes.
7423If nil, the region between START and END is replace by the decoded text.
7424If buffer, the decoded text is inserted in the buffer.
7425If t, the decoded text is returned.
8844fa83 7426
48b0f3ae
PJ
7427This function sets `last-coding-system-used' to the precise coding system
7428used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7429not fully specified.)
7430It returns the length of the decoded text. */)
df7492f9
KH
7431 (start, end, coding_system, destination)
7432 Lisp_Object start, end, coding_system, destination;
4031e2bf 7433{
df7492f9 7434 return code_convert_region (start, end, coding_system, destination, 0, 0);
3a73fa5d 7435}
8844fa83 7436
3a73fa5d 7437DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
df7492f9
KH
7438 3, 4, "r\nzCoding system: ",
7439 doc: /* Encode the current region by specified coding system.
48b0f3ae
PJ
7440When called from a program, takes three arguments:
7441START, END, and CODING-SYSTEM. START and END are buffer positions.
d46c5b12 7442
df7492f9
KH
7443Optional 4th arguments DESTINATION specifies where the encoded text goes.
7444If nil, the region between START and END is replace by the encoded text.
7445If buffer, the encoded text is inserted in the buffer.
7446If t, the encoded text is returned.
2391eaa4 7447
48b0f3ae
PJ
7448This function sets `last-coding-system-used' to the precise coding system
7449used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7450not fully specified.)
7451It returns the length of the encoded text. */)
df7492f9
KH
7452 (start, end, coding_system, destination)
7453 Lisp_Object start, end, coding_system, destination;
3a73fa5d 7454{
df7492f9 7455 return code_convert_region (start, end, coding_system, destination, 1, 0);
b73bfc1c
KH
7456}
7457
7458Lisp_Object
df7492f9
KH
7459code_convert_string (string, coding_system, dst_object,
7460 encodep, nocopy, norecord)
7461 Lisp_Object string, coding_system, dst_object;
7462 int encodep, nocopy, norecord;
b73bfc1c 7463{
4031e2bf 7464 struct coding_system coding;
df7492f9 7465 EMACS_INT chars, bytes;
ec6d2bb8 7466
b7826503 7467 CHECK_STRING (string);
d46c5b12 7468 if (NILP (coding_system))
4956c225 7469 {
df7492f9
KH
7470 if (! norecord)
7471 Vlast_coding_system_used = Qno_conversion;
7472 if (NILP (dst_object))
7473 return (nocopy ? Fcopy_sequence (string) : string);
4956c225 7474 }
b73bfc1c 7475
df7492f9
KH
7476 if (NILP (coding_system))
7477 coding_system = Qno_conversion;
7478 else
7479 CHECK_CODING_SYSTEM (coding_system);
7480 if (NILP (dst_object))
7481 dst_object = Qt;
7482 else if (! EQ (dst_object, Qt))
7483 CHECK_BUFFER (dst_object);
73be902c 7484
df7492f9 7485 setup_coding_system (coding_system, &coding);
d46c5b12 7486 coding.mode |= CODING_MODE_LAST_BLOCK;
8f924df7
KH
7487 chars = SCHARS (string);
7488 bytes = SBYTES (string);
df7492f9
KH
7489 if (encodep)
7490 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7491 else
7492 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7493 if (! norecord)
7494 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
73be902c 7495
df7492f9
KH
7496 if (coding.result != CODING_RESULT_SUCCESS)
7497 error ("Code conversion error: %d", coding.result);
2391eaa4 7498
df7492f9
KH
7499 return (BUFFERP (dst_object)
7500 ? make_number (coding.produced_char)
7501 : coding.dst_object);
4ed46869 7502}
73be902c 7503
b73bfc1c 7504
ecec61c1 7505/* Encode or decode STRING according to CODING_SYSTEM.
ec6d2bb8 7506 Do not set Vlast_coding_system_used.
4ed46869 7507
ec6d2bb8
KH
7508 This function is called only from macros DECODE_FILE and
7509 ENCODE_FILE, thus we ignore character composition. */
4ed46869 7510
ecec61c1
KH
7511Lisp_Object
7512code_convert_string_norecord (string, coding_system, encodep)
7513 Lisp_Object string, coding_system;
7514 int encodep;
4ed46869 7515{
0be8721c 7516 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
4ed46869
KH
7517}
7518
4ed46869 7519
df7492f9
KH
7520DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7521 2, 4, 0,
7522 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7523
7524Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7525if the decoding operation is trivial.
ecec61c1 7526
df7492f9 7527Optional fourth arg BUFFER non-nil meant that the decoded text is
a3f6ee6d 7528inserted in BUFFER instead of returned as a string. In this case,
df7492f9 7529the return value is BUFFER.
ecec61c1 7530
df7492f9
KH
7531This function sets `last-coding-system-used' to the precise coding system
7532used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7533not fully specified. */)
7534 (string, coding_system, nocopy, buffer)
7535 Lisp_Object string, coding_system, nocopy, buffer;
4ed46869 7536{
df7492f9
KH
7537 return code_convert_string (string, coding_system, buffer,
7538 0, ! NILP (nocopy), 0);
4ed46869
KH
7539}
7540
df7492f9
KH
7541DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7542 2, 4, 0,
7543 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7544
7545Optional third arg NOCOPY non-nil means it is OK to return STRING
7546itself if the encoding operation is trivial.
7547
7548Optional fourth arg BUFFER non-nil meant that the encoded text is
a3f6ee6d 7549inserted in BUFFER instead of returned as a string. In this case,
df7492f9
KH
7550the return value is BUFFER.
7551
7552This function sets `last-coding-system-used' to the precise coding system
7553used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7554not fully specified.) */)
7555 (string, coding_system, nocopy, buffer)
7556 Lisp_Object string, coding_system, nocopy, buffer;
4ed46869 7557{
df7492f9 7558 return code_convert_string (string, coding_system, buffer,
c197f191 7559 1, ! NILP (nocopy), 1);
4ed46869 7560}
df7492f9 7561
3a73fa5d 7562\f
4ed46869 7563DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
48b0f3ae
PJ
7564 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7565Return the corresponding character. */)
7566 (code)
4ed46869 7567 Lisp_Object code;
4ed46869 7568{
df7492f9
KH
7569 Lisp_Object spec, attrs, val;
7570 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
7571 int c;
4ed46869 7572
df7492f9
KH
7573 CHECK_NATNUM (code);
7574 c = XFASTINT (code);
7575 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7576 attrs = AREF (spec, 0);
4ed46869 7577
df7492f9
KH
7578 if (ASCII_BYTE_P (c)
7579 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7580 return code;
4ed46869 7581
df7492f9
KH
7582 val = CODING_ATTR_CHARSET_LIST (attrs);
7583 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
004068e4
KH
7584 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7585 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
fa42c37f 7586
df7492f9
KH
7587 if (c <= 0x7F)
7588 charset = charset_roman;
7589 else if (c >= 0xA0 && c < 0xDF)
55ab7be3 7590 {
df7492f9
KH
7591 charset = charset_kana;
7592 c -= 0x80;
4ed46869 7593 }
55ab7be3 7594 else
4ed46869 7595 {
004068e4 7596 int s1 = c >> 8, s2 = c & 0xFF;
df7492f9
KH
7597
7598 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
7599 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
7600 error ("Invalid code: %d", code);
7601 SJIS_TO_JIS (c);
7602 charset = charset_kanji;
4ed46869 7603 }
df7492f9
KH
7604 c = DECODE_CHAR (charset, c);
7605 if (c < 0)
7606 error ("Invalid code: %d", code);
7607 return make_number (c);
93dec019 7608}
4ed46869 7609
48b0f3ae 7610
4ed46869 7611DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
48b0f3ae
PJ
7612 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7613Return the corresponding code in SJIS. */)
7614 (ch)
df7492f9 7615 Lisp_Object ch;
4ed46869 7616{
df7492f9
KH
7617 Lisp_Object spec, attrs, charset_list;
7618 int c;
7619 struct charset *charset;
7620 unsigned code;
48b0f3ae 7621
df7492f9
KH
7622 CHECK_CHARACTER (ch);
7623 c = XFASTINT (ch);
7624 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7625 attrs = AREF (spec, 0);
7626
7627 if (ASCII_CHAR_P (c)
7628 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7629 return ch;
7630
7631 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7632 charset = char_charset (c, charset_list, &code);
7633 if (code == CHARSET_INVALID_CODE (charset))
7634 error ("Can't encode by shift_jis encoding: %d", c);
7635 JIS_TO_SJIS (code);
7636
7637 return make_number (code);
4ed46869
KH
7638}
7639
7640DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
48b0f3ae
PJ
7641 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7642Return the corresponding character. */)
7643 (code)
4ed46869 7644 Lisp_Object code;
d46c5b12 7645{
df7492f9
KH
7646 Lisp_Object spec, attrs, val;
7647 struct charset *charset_roman, *charset_big5, *charset;
7648 int c;
6289dd10 7649
df7492f9
KH
7650 CHECK_NATNUM (code);
7651 c = XFASTINT (code);
7652 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7653 attrs = AREF (spec, 0);
4ed46869 7654
df7492f9
KH
7655 if (ASCII_BYTE_P (c)
7656 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7657 return code;
6289dd10 7658
df7492f9
KH
7659 val = CODING_ATTR_CHARSET_LIST (attrs);
7660 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7661 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
c210f766 7662
df7492f9
KH
7663 if (c <= 0x7F)
7664 charset = charset_roman;
c28a9453
KH
7665 else
7666 {
df7492f9
KH
7667 int b1 = c >> 8, b2 = c & 0x7F;
7668 if (b1 < 0xA1 || b1 > 0xFE
7669 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
7670 error ("Invalid code: %d", code);
7671 charset = charset_big5;
c28a9453 7672 }
df7492f9
KH
7673 c = DECODE_CHAR (charset, (unsigned )c);
7674 if (c < 0)
7675 error ("Invalid code: %d", code);
7676 return make_number (c);
d46c5b12 7677}
6289dd10 7678
4ed46869 7679DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
48b0f3ae
PJ
7680 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7681Return the corresponding character code in Big5. */)
7682 (ch)
4ed46869
KH
7683 Lisp_Object ch;
7684{
df7492f9
KH
7685 Lisp_Object spec, attrs, charset_list;
7686 struct charset *charset;
7687 int c;
7688 unsigned code;
7689
7690 CHECK_CHARACTER (ch);
7691 c = XFASTINT (ch);
7692 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7693 attrs = AREF (spec, 0);
7694 if (ASCII_CHAR_P (c)
7695 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7696 return ch;
7697
7698 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7699 charset = char_charset (c, charset_list, &code);
7700 if (code == CHARSET_INVALID_CODE (charset))
7701 error ("Can't encode by Big5 encoding: %d", c);
7702
7703 return make_number (code);
4ed46869 7704}
48b0f3ae 7705
3a73fa5d 7706\f
1ba9e4ab
KH
7707DEFUN ("set-terminal-coding-system-internal",
7708 Fset_terminal_coding_system_internal,
48b0f3ae
PJ
7709 Sset_terminal_coding_system_internal, 1, 1, 0,
7710 doc: /* Internal use only. */)
7711 (coding_system)
b74e4686 7712 Lisp_Object coding_system;
4ed46869 7713{
b7826503 7714 CHECK_SYMBOL (coding_system);
df7492f9
KH
7715 setup_coding_system (Fcheck_coding_system (coding_system),
7716 &terminal_coding);
48b0f3ae 7717
70c22245 7718 /* We had better not send unsafe characters to terminal. */
df7492f9
KH
7719 terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
7720 /* Characer composition should be disabled. */
7721 terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
b73bfc1c
KH
7722 terminal_coding.src_multibyte = 1;
7723 terminal_coding.dst_multibyte = 0;
4ed46869
KH
7724 return Qnil;
7725}
7726
c4825358
KH
7727DEFUN ("set-safe-terminal-coding-system-internal",
7728 Fset_safe_terminal_coding_system_internal,
48b0f3ae 7729 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
ddb67bdc 7730 doc: /* Internal use only. */)
48b0f3ae 7731 (coding_system)
b74e4686 7732 Lisp_Object coding_system;
d46c5b12 7733{
b7826503 7734 CHECK_SYMBOL (coding_system);
c4825358
KH
7735 setup_coding_system (Fcheck_coding_system (coding_system),
7736 &safe_terminal_coding);
df7492f9
KH
7737 /* Characer composition should be disabled. */
7738 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
b73bfc1c
KH
7739 safe_terminal_coding.src_multibyte = 1;
7740 safe_terminal_coding.dst_multibyte = 0;
c4825358
KH
7741 return Qnil;
7742}
4ed46869 7743
4ed46869
KH
7744DEFUN ("terminal-coding-system",
7745 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
48b0f3ae
PJ
7746 doc: /* Return coding system specified for terminal output. */)
7747 ()
4ed46869 7748{
df7492f9 7749 return CODING_ID_NAME (terminal_coding.id);
4ed46869
KH
7750}
7751
1ba9e4ab
KH
7752DEFUN ("set-keyboard-coding-system-internal",
7753 Fset_keyboard_coding_system_internal,
48b0f3ae
PJ
7754 Sset_keyboard_coding_system_internal, 1, 1, 0,
7755 doc: /* Internal use only. */)
7756 (coding_system)
4ed46869
KH
7757 Lisp_Object coding_system;
7758{
b7826503 7759 CHECK_SYMBOL (coding_system);
df7492f9
KH
7760 setup_coding_system (Fcheck_coding_system (coding_system),
7761 &keyboard_coding);
7762 /* Characer composition should be disabled. */
7763 keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
4ed46869
KH
7764 return Qnil;
7765}
7766
7767DEFUN ("keyboard-coding-system",
7768 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
48b0f3ae
PJ
7769 doc: /* Return coding system specified for decoding keyboard input. */)
7770 ()
4ed46869 7771{
df7492f9 7772 return CODING_ID_NAME (keyboard_coding.id);
4ed46869
KH
7773}
7774
4ed46869 7775\f
a5d301df
KH
7776DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7777 Sfind_operation_coding_system, 1, MANY, 0,
48b0f3ae
PJ
7778 doc: /* Choose a coding system for an operation based on the target name.
7779The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7780DECODING-SYSTEM is the coding system to use for decoding
7781\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7782for encoding (in case OPERATION does encoding).
05e6f5dc 7783
48b0f3ae
PJ
7784The first argument OPERATION specifies an I/O primitive:
7785 For file I/O, `insert-file-contents' or `write-region'.
7786 For process I/O, `call-process', `call-process-region', or `start-process'.
7787 For network I/O, `open-network-stream'.
05e6f5dc 7788
48b0f3ae
PJ
7789The remaining arguments should be the same arguments that were passed
7790to the primitive. Depending on which primitive, one of those arguments
7791is selected as the TARGET. For example, if OPERATION does file I/O,
7792whichever argument specifies the file name is TARGET.
05e6f5dc 7793
48b0f3ae
PJ
7794TARGET has a meaning which depends on OPERATION:
7795 For file I/O, TARGET is a file name.
7796 For process I/O, TARGET is a process name.
7797 For network I/O, TARGET is a service name or a port number
05e6f5dc 7798
48b0f3ae
PJ
7799This function looks up what specified for TARGET in,
7800`file-coding-system-alist', `process-coding-system-alist',
7801or `network-coding-system-alist' depending on OPERATION.
7802They may specify a coding system, a cons of coding systems,
7803or a function symbol to call.
7804In the last case, we call the function with one argument,
7805which is a list of all the arguments given to this function.
7806
7807usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7808 (nargs, args)
4ed46869
KH
7809 int nargs;
7810 Lisp_Object *args;
6b89e3aa 7811{
4ed46869
KH
7812 Lisp_Object operation, target_idx, target, val;
7813 register Lisp_Object chain;
177c0ea7 7814
4ed46869
KH
7815 if (nargs < 2)
7816 error ("Too few arguments");
7817 operation = args[0];
7818 if (!SYMBOLP (operation)
7819 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
df7492f9 7820 error ("Invalid first arguement");
4ed46869
KH
7821 if (nargs < 1 + XINT (target_idx))
7822 error ("Too few arguments for operation: %s",
8f924df7 7823 SDATA (SYMBOL_NAME (operation)));
4ed46869
KH
7824 target = args[XINT (target_idx) + 1];
7825 if (!(STRINGP (target)
7826 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
df7492f9 7827 error ("Invalid %dth argument", XINT (target_idx) + 1);
4ed46869 7828
2e34157c
RS
7829 chain = ((EQ (operation, Qinsert_file_contents)
7830 || EQ (operation, Qwrite_region))
02ba4723 7831 ? Vfile_coding_system_alist
2e34157c 7832 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
7833 ? Vnetwork_coding_system_alist
7834 : Vprocess_coding_system_alist));
4ed46869
KH
7835 if (NILP (chain))
7836 return Qnil;
7837
03699b14 7838 for (; CONSP (chain); chain = XCDR (chain))
6b89e3aa 7839 {
f44d27ce 7840 Lisp_Object elt;
6b89e3aa 7841
df7492f9 7842 elt = XCAR (chain);
4ed46869
KH
7843 if (CONSP (elt)
7844 && ((STRINGP (target)
03699b14
KR
7845 && STRINGP (XCAR (elt))
7846 && fast_string_match (XCAR (elt), target) >= 0)
7847 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
6b89e3aa 7848 {
03699b14 7849 val = XCDR (elt);
b19fd4c5
KH
7850 /* Here, if VAL is both a valid coding system and a valid
7851 function symbol, we return VAL as a coding system. */
02ba4723
KH
7852 if (CONSP (val))
7853 return val;
7854 if (! SYMBOLP (val))
7855 return Qnil;
7856 if (! NILP (Fcoding_system_p (val)))
7857 return Fcons (val, val);
b19fd4c5 7858 if (! NILP (Ffboundp (val)))
6b89e3aa 7859 {
b19fd4c5
KH
7860 val = call1 (val, Flist (nargs, args));
7861 if (CONSP (val))
7862 return val;
7863 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7864 return Fcons (val, val);
6b89e3aa 7865 }
02ba4723 7866 return Qnil;
6b89e3aa
KH
7867 }
7868 }
4ed46869 7869 return Qnil;
6b89e3aa
KH
7870}
7871
df7492f9 7872DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
a3f6ee6d 7873 Sset_coding_system_priority, 0, MANY, 0,
da7db224 7874 doc: /* Assign higher priority to the coding systems given as arguments.
ff563fce 7875If multiple coding systems belongs to the same category,
a3181084
DL
7876all but the first one are ignored.
7877
7878usage: (set-coding-system-priority ...) */)
df7492f9
KH
7879 (nargs, args)
7880 int nargs;
7881 Lisp_Object *args;
7882{
7883 int i, j;
7884 int changed[coding_category_max];
7885 enum coding_category priorities[coding_category_max];
7886
7887 bzero (changed, sizeof changed);
6b89e3aa 7888
df7492f9 7889 for (i = j = 0; i < nargs; i++)
6b89e3aa 7890 {
df7492f9
KH
7891 enum coding_category category;
7892 Lisp_Object spec, attrs;
6b89e3aa 7893
df7492f9
KH
7894 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
7895 attrs = AREF (spec, 0);
7896 category = XINT (CODING_ATTR_CATEGORY (attrs));
7897 if (changed[category])
7898 /* Ignore this coding system because a coding system of the
7899 same category already had a higher priority. */
7900 continue;
7901 changed[category] = 1;
7902 priorities[j++] = category;
7903 if (coding_categories[category].id >= 0
7904 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
7905 setup_coding_system (args[i], &coding_categories[category]);
ff563fce 7906 Fset (AREF (Vcoding_category_table, category), args[i]);
df7492f9 7907 }
6b89e3aa 7908
df7492f9
KH
7909 /* Now we have decided top J priorities. Reflect the order of the
7910 original priorities to the remaining priorities. */
6b89e3aa 7911
df7492f9 7912 for (i = j, j = 0; i < coding_category_max; i++, j++)
6b89e3aa 7913 {
df7492f9
KH
7914 while (j < coding_category_max
7915 && changed[coding_priorities[j]])
7916 j++;
7917 if (j == coding_category_max)
7918 abort ();
7919 priorities[i] = coding_priorities[j];
7920 }
6b89e3aa 7921
df7492f9 7922 bcopy (priorities, coding_priorities, sizeof priorities);
177c0ea7 7923
ff563fce
KH
7924 /* Update `coding-category-list'. */
7925 Vcoding_category_list = Qnil;
7926 for (i = coding_category_max - 1; i >= 0; i--)
7927 Vcoding_category_list
7928 = Fcons (AREF (Vcoding_category_table, priorities[i]),
7929 Vcoding_category_list);
6b89e3aa 7930
df7492f9 7931 return Qnil;
6b89e3aa
KH
7932}
7933
df7492f9
KH
7934DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
7935 Scoding_system_priority_list, 0, 1, 0,
da7db224
DL
7936 doc: /* Return a list of coding systems ordered by their priorities.
7937HIGHESTP non-nil means just return the highest priority one. */)
df7492f9
KH
7938 (highestp)
7939 Lisp_Object highestp;
d46c5b12
KH
7940{
7941 int i;
df7492f9 7942 Lisp_Object val;
6b89e3aa 7943
df7492f9 7944 for (i = 0, val = Qnil; i < coding_category_max; i++)
d46c5b12 7945 {
df7492f9
KH
7946 enum coding_category category = coding_priorities[i];
7947 int id = coding_categories[category].id;
7948 Lisp_Object attrs;
068a9dbd 7949
df7492f9
KH
7950 if (id < 0)
7951 continue;
7952 attrs = CODING_ID_ATTRS (id);
7953 if (! NILP (highestp))
7954 return CODING_ATTR_BASE_NAME (attrs);
7955 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
7956 }
7957 return Fnreverse (val);
7958}
068a9dbd 7959
f0064e1f 7960static char *suffixes[] = { "-unix", "-dos", "-mac" };
068a9dbd
KH
7961
7962static Lisp_Object
df7492f9
KH
7963make_subsidiaries (base)
7964 Lisp_Object base;
068a9dbd 7965{
df7492f9 7966 Lisp_Object subsidiaries;
8f924df7 7967 int base_name_len = SBYTES (SYMBOL_NAME (base));
df7492f9
KH
7968 char *buf = (char *) alloca (base_name_len + 6);
7969 int i;
068a9dbd 7970
8f924df7 7971 bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
df7492f9
KH
7972 subsidiaries = Fmake_vector (make_number (3), Qnil);
7973 for (i = 0; i < 3; i++)
068a9dbd 7974 {
df7492f9
KH
7975 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
7976 ASET (subsidiaries, i, intern (buf));
068a9dbd 7977 }
df7492f9 7978 return subsidiaries;
068a9dbd
KH
7979}
7980
7981
df7492f9
KH
7982DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7983 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
1fcd6c8b
DL
7984 doc: /* For internal use only.
7985usage: (define-coding-system-internal ...) */)
df7492f9
KH
7986 (nargs, args)
7987 int nargs;
7988 Lisp_Object *args;
068a9dbd 7989{
df7492f9
KH
7990 Lisp_Object name;
7991 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
7992 Lisp_Object attrs; /* Vector of attributes. */
7993 Lisp_Object eol_type;
7994 Lisp_Object aliases;
7995 Lisp_Object coding_type, charset_list, safe_charsets;
7996 enum coding_category category;
7997 Lisp_Object tail, val;
7998 int max_charset_id = 0;
7999 int i;
068a9dbd 8000
df7492f9
KH
8001 if (nargs < coding_arg_max)
8002 goto short_args;
068a9dbd 8003
df7492f9 8004 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
068a9dbd 8005
df7492f9
KH
8006 name = args[coding_arg_name];
8007 CHECK_SYMBOL (name);
8008 CODING_ATTR_BASE_NAME (attrs) = name;
068a9dbd 8009
df7492f9
KH
8010 val = args[coding_arg_mnemonic];
8011 if (! STRINGP (val))
8012 CHECK_CHARACTER (val);
8013 CODING_ATTR_MNEMONIC (attrs) = val;
068a9dbd 8014
df7492f9
KH
8015 coding_type = args[coding_arg_coding_type];
8016 CHECK_SYMBOL (coding_type);
8017 CODING_ATTR_TYPE (attrs) = coding_type;
068a9dbd 8018
df7492f9
KH
8019 charset_list = args[coding_arg_charset_list];
8020 if (SYMBOLP (charset_list))
8021 {
8022 if (EQ (charset_list, Qiso_2022))
8023 {
8024 if (! EQ (coding_type, Qiso_2022))
8025 error ("Invalid charset-list");
8026 charset_list = Viso_2022_charset_list;
8027 }
8028 else if (EQ (charset_list, Qemacs_mule))
8029 {
8030 if (! EQ (coding_type, Qemacs_mule))
8031 error ("Invalid charset-list");
8032 charset_list = Vemacs_mule_charset_list;
8033 }
8034 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8035 if (max_charset_id < XFASTINT (XCAR (tail)))
8036 max_charset_id = XFASTINT (XCAR (tail));
8037 }
068a9dbd
KH
8038 else
8039 {
df7492f9
KH
8040 charset_list = Fcopy_sequence (charset_list);
8041 for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
068a9dbd 8042 {
df7492f9
KH
8043 struct charset *charset;
8044
8045 val = Fcar (tail);
8046 CHECK_CHARSET_GET_CHARSET (val, charset);
8047 if (EQ (coding_type, Qiso_2022)
8048 ? CHARSET_ISO_FINAL (charset) < 0
8049 : EQ (coding_type, Qemacs_mule)
8050 ? CHARSET_EMACS_MULE_ID (charset) < 0
8051 : 0)
8052 error ("Can't handle charset `%s'",
8f924df7 8053 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
df7492f9 8054
8f924df7 8055 XSETCAR (tail, make_number (charset->id));
df7492f9
KH
8056 if (max_charset_id < charset->id)
8057 max_charset_id = charset->id;
068a9dbd
KH
8058 }
8059 }
df7492f9 8060 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
068a9dbd 8061
df7492f9
KH
8062 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
8063 make_number (255));
8064 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8f924df7 8065 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
df7492f9 8066 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
068a9dbd 8067
584948ac 8068 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
3a73fa5d 8069
df7492f9
KH
8070 val = args[coding_arg_decode_translation_table];
8071 if (! NILP (val))
8072 CHECK_CHAR_TABLE (val);
8073 CODING_ATTR_DECODE_TBL (attrs) = val;
3a73fa5d 8074
df7492f9
KH
8075 val = args[coding_arg_encode_translation_table];
8076 if (! NILP (val))
8077 CHECK_CHAR_TABLE (val);
8078 CODING_ATTR_ENCODE_TBL (attrs) = val;
d46c5b12 8079
df7492f9
KH
8080 val = args[coding_arg_post_read_conversion];
8081 CHECK_SYMBOL (val);
8082 CODING_ATTR_POST_READ (attrs) = val;
d46c5b12 8083
df7492f9
KH
8084 val = args[coding_arg_pre_write_conversion];
8085 CHECK_SYMBOL (val);
8086 CODING_ATTR_PRE_WRITE (attrs) = val;
3a73fa5d 8087
df7492f9
KH
8088 val = args[coding_arg_default_char];
8089 if (NILP (val))
8090 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
8091 else
8092 {
8f924df7 8093 CHECK_CHARACTER (val);
df7492f9
KH
8094 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8095 }
4031e2bf 8096
8f924df7
KH
8097 val = args[coding_arg_for_unibyte];
8098 CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
3a73fa5d 8099
df7492f9
KH
8100 val = args[coding_arg_plist];
8101 CHECK_LIST (val);
8102 CODING_ATTR_PLIST (attrs) = val;
3a73fa5d 8103
df7492f9
KH
8104 if (EQ (coding_type, Qcharset))
8105 {
4187a77d 8106 Lisp_Object list;
c7c66a95
KH
8107 /* Generate a lisp vector of 256 elements. Each element is nil,
8108 integer, or a list of charset IDs.
3a73fa5d 8109
c7c66a95
KH
8110 If Nth element is nil, the byte code N is invalid in this
8111 coding system.
4ed46869 8112
c7c66a95
KH
8113 If Nth element is a number NUM, N is the first byte of a
8114 charset whose ID is NUM.
4ed46869 8115
c7c66a95
KH
8116 If Nth element is a list of charset IDs, N is the first byte
8117 of one of them. The list is sorted by dimensions of the
8118 charsets. A charset of smaller dimension comes firtst.
8119 */
4187a77d
KH
8120 for (list = Qnil, tail = charset_list; CONSP (tail); tail = XCDR (tail))
8121 {
8122 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
5f1cd180 8123
4187a77d
KH
8124 if (charset->method == CHARSET_METHOD_SUPERSET)
8125 {
8126 val = CHARSET_SUPERSET (charset);
8127 for (; CONSP (val); val = XCDR (val))
8f924df7 8128 list = Fcons (XCAR (XCAR (val)), list);
4187a77d
KH
8129 }
8130 else
8131 list = Fcons (XCAR (tail), list);
8132 }
ec6d2bb8 8133
df7492f9 8134 val = Fmake_vector (make_number (256), Qnil);
4ed46869 8135
4187a77d 8136 for (tail = Fnreverse (list); CONSP (tail); tail = XCDR (tail))
df7492f9 8137 {
c7c66a95
KH
8138 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8139 int dim = CHARSET_DIMENSION (charset);
8140 int idx = (dim - 1) * 4;
4ed46869 8141
584948ac
KH
8142 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8143 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
4031e2bf 8144
15d143f7
KH
8145 for (i = charset->code_space[idx];
8146 i <= charset->code_space[idx + 1]; i++)
8147 {
c7c66a95
KH
8148 Lisp_Object tmp, tmp2;
8149 int dim2;
ec6d2bb8 8150
c7c66a95
KH
8151 tmp = AREF (val, i);
8152 if (NILP (tmp))
8153 tmp = XCAR (tail);
8154 else if (NUMBERP (tmp))
8155 {
8156 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8157 if (dim < dim2)
c7c66a95 8158 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
f9d71dcd
KH
8159 else
8160 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
c7c66a95 8161 }
15d143f7 8162 else
c7c66a95
KH
8163 {
8164 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8165 {
8166 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8167 if (dim < dim2)
8168 break;
8169 }
8170 if (NILP (tmp2))
8171 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8172 else
8173 {
8174 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8175 XSETCAR (tmp2, XCAR (tail));
8176 }
8177 }
8178 ASET (val, i, tmp);
15d143f7 8179 }
df7492f9
KH
8180 }
8181 ASET (attrs, coding_attr_charset_valids, val);
8182 category = coding_category_charset;
8183 }
8184 else if (EQ (coding_type, Qccl))
8185 {
8186 Lisp_Object valids;
ecec61c1 8187
df7492f9
KH
8188 if (nargs < coding_arg_ccl_max)
8189 goto short_args;
ecec61c1 8190
df7492f9
KH
8191 val = args[coding_arg_ccl_decoder];
8192 CHECK_CCL_PROGRAM (val);
8193 if (VECTORP (val))
8194 val = Fcopy_sequence (val);
8195 ASET (attrs, coding_attr_ccl_decoder, val);
ecec61c1 8196
df7492f9
KH
8197 val = args[coding_arg_ccl_encoder];
8198 CHECK_CCL_PROGRAM (val);
8199 if (VECTORP (val))
8200 val = Fcopy_sequence (val);
8201 ASET (attrs, coding_attr_ccl_encoder, val);
ecec61c1 8202
df7492f9
KH
8203 val = args[coding_arg_ccl_valids];
8204 valids = Fmake_string (make_number (256), make_number (0));
8205 for (tail = val; !NILP (tail); tail = Fcdr (tail))
8206 {
8dcbea82 8207 int from, to;
ecec61c1 8208
df7492f9
KH
8209 val = Fcar (tail);
8210 if (INTEGERP (val))
8dcbea82
KH
8211 {
8212 from = to = XINT (val);
8213 if (from < 0 || from > 255)
8214 args_out_of_range_3 (val, make_number (0), make_number (255));
8215 }
df7492f9
KH
8216 else
8217 {
df7492f9 8218 CHECK_CONS (val);
8f924df7
KH
8219 CHECK_NATNUM_CAR (val);
8220 CHECK_NATNUM_CDR (val);
df7492f9 8221 from = XINT (XCAR (val));
8f924df7 8222 if (from > 255)
8dcbea82
KH
8223 args_out_of_range_3 (XCAR (val),
8224 make_number (0), make_number (255));
df7492f9 8225 to = XINT (XCDR (val));
8dcbea82
KH
8226 if (to < from || to > 255)
8227 args_out_of_range_3 (XCDR (val),
8228 XCAR (val), make_number (255));
df7492f9 8229 }
8dcbea82 8230 for (i = from; i <= to; i++)
8f924df7 8231 SSET (valids, i, 1);
df7492f9
KH
8232 }
8233 ASET (attrs, coding_attr_ccl_valids, valids);
4ed46869 8234
df7492f9 8235 category = coding_category_ccl;
55ab7be3 8236 }
df7492f9 8237 else if (EQ (coding_type, Qutf_16))
55ab7be3 8238 {
df7492f9 8239 Lisp_Object bom, endian;
4ed46869 8240
584948ac 8241 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
4ed46869 8242
df7492f9
KH
8243 if (nargs < coding_arg_utf16_max)
8244 goto short_args;
4ed46869 8245
df7492f9
KH
8246 bom = args[coding_arg_utf16_bom];
8247 if (! NILP (bom) && ! EQ (bom, Qt))
8248 {
8249 CHECK_CONS (bom);
8f924df7
KH
8250 val = XCAR (bom);
8251 CHECK_CODING_SYSTEM (val);
8252 val = XCDR (bom);
8253 CHECK_CODING_SYSTEM (val);
df7492f9
KH
8254 }
8255 ASET (attrs, coding_attr_utf_16_bom, bom);
8256
8257 endian = args[coding_arg_utf16_endian];
b49a1807
KH
8258 CHECK_SYMBOL (endian);
8259 if (NILP (endian))
8260 endian = Qbig;
8261 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8f924df7 8262 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
df7492f9
KH
8263 ASET (attrs, coding_attr_utf_16_endian, endian);
8264
8265 category = (CONSP (bom)
8266 ? coding_category_utf_16_auto
8267 : NILP (bom)
b49a1807 8268 ? (EQ (endian, Qbig)
df7492f9
KH
8269 ? coding_category_utf_16_be_nosig
8270 : coding_category_utf_16_le_nosig)
b49a1807 8271 : (EQ (endian, Qbig)
df7492f9
KH
8272 ? coding_category_utf_16_be
8273 : coding_category_utf_16_le));
8274 }
8275 else if (EQ (coding_type, Qiso_2022))
8276 {
8277 Lisp_Object initial, reg_usage, request, flags;
4776e638 8278 int i;
1397dc18 8279
df7492f9
KH
8280 if (nargs < coding_arg_iso2022_max)
8281 goto short_args;
8282
8283 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8284 CHECK_VECTOR (initial);
8285 for (i = 0; i < 4; i++)
8286 {
8287 val = Faref (initial, make_number (i));
8288 if (! NILP (val))
8289 {
584948ac
KH
8290 struct charset *charset;
8291
8292 CHECK_CHARSET_GET_CHARSET (val, charset);
8293 ASET (initial, i, make_number (CHARSET_ID (charset)));
8294 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
8295 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9
KH
8296 }
8297 else
8298 ASET (initial, i, make_number (-1));
8299 }
8300
8301 reg_usage = args[coding_arg_iso2022_reg_usage];
8302 CHECK_CONS (reg_usage);
8f924df7
KH
8303 CHECK_NUMBER_CAR (reg_usage);
8304 CHECK_NUMBER_CDR (reg_usage);
df7492f9
KH
8305
8306 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8307 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
1397dc18 8308 {
df7492f9 8309 int id;
8f924df7 8310 Lisp_Object tmp;
df7492f9
KH
8311
8312 val = Fcar (tail);
8313 CHECK_CONS (val);
8f924df7
KH
8314 tmp = XCAR (val);
8315 CHECK_CHARSET_GET_ID (tmp, id);
8316 CHECK_NATNUM_CDR (val);
df7492f9
KH
8317 if (XINT (XCDR (val)) >= 4)
8318 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8f924df7 8319 XSETCAR (val, make_number (id));
1397dc18 8320 }
4ed46869 8321
df7492f9
KH
8322 flags = args[coding_arg_iso2022_flags];
8323 CHECK_NATNUM (flags);
8324 i = XINT (flags);
8325 if (EQ (args[coding_arg_charset_list], Qiso_2022))
8326 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8327
8328 ASET (attrs, coding_attr_iso_initial, initial);
8329 ASET (attrs, coding_attr_iso_usage, reg_usage);
8330 ASET (attrs, coding_attr_iso_request, request);
8331 ASET (attrs, coding_attr_iso_flags, flags);
8332 setup_iso_safe_charsets (attrs);
8333
8334 if (i & CODING_ISO_FLAG_SEVEN_BITS)
8335 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8336 | CODING_ISO_FLAG_SINGLE_SHIFT))
8337 ? coding_category_iso_7_else
8338 : EQ (args[coding_arg_charset_list], Qiso_2022)
8339 ? coding_category_iso_7
8340 : coding_category_iso_7_tight);
8341 else
8342 {
8343 int id = XINT (AREF (initial, 1));
8344
c6fb6e98 8345 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
df7492f9
KH
8346 || EQ (args[coding_arg_charset_list], Qiso_2022)
8347 || id < 0)
8348 ? coding_category_iso_8_else
8349 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8350 ? coding_category_iso_8_1
8351 : coding_category_iso_8_2);
8352 }
0ce7886f
KH
8353 if (category != coding_category_iso_8_1
8354 && category != coding_category_iso_8_2)
8355 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
df7492f9
KH
8356 }
8357 else if (EQ (coding_type, Qemacs_mule))
c28a9453 8358 {
df7492f9
KH
8359 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8360 ASET (attrs, coding_attr_emacs_mule_full, Qt);
584948ac 8361 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9 8362 category = coding_category_emacs_mule;
c28a9453 8363 }
df7492f9 8364 else if (EQ (coding_type, Qshift_jis))
c28a9453 8365 {
df7492f9
KH
8366
8367 struct charset *charset;
8368
8369 if (XINT (Flength (charset_list)) != 3)
8370 error ("There should be just three charsets");
8371
8372 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8373 if (CHARSET_DIMENSION (charset) != 1)
8374 error ("Dimension of charset %s is not one",
8f924df7 8375 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
584948ac
KH
8376 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8377 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9
KH
8378
8379 charset_list = XCDR (charset_list);
8380 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8381 if (CHARSET_DIMENSION (charset) != 1)
8382 error ("Dimension of charset %s is not one",
8f924df7 8383 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
df7492f9
KH
8384
8385 charset_list = XCDR (charset_list);
8386 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8387 if (CHARSET_DIMENSION (charset) != 2)
8388 error ("Dimension of charset %s is not two",
8f924df7 8389 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
df7492f9
KH
8390
8391 category = coding_category_sjis;
8392 Vsjis_coding_system = name;
c28a9453 8393 }
df7492f9
KH
8394 else if (EQ (coding_type, Qbig5))
8395 {
8396 struct charset *charset;
4ed46869 8397
df7492f9
KH
8398 if (XINT (Flength (charset_list)) != 2)
8399 error ("There should be just two charsets");
8400
8401 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8402 if (CHARSET_DIMENSION (charset) != 1)
8403 error ("Dimension of charset %s is not one",
8f924df7 8404 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
584948ac
KH
8405 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8406 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9
KH
8407
8408 charset_list = XCDR (charset_list);
8409 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8410 if (CHARSET_DIMENSION (charset) != 2)
8411 error ("Dimension of charset %s is not two",
8f924df7 8412 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
4ed46869 8413
df7492f9
KH
8414 category = coding_category_big5;
8415 Vbig5_coding_system = name;
8416 }
8417 else if (EQ (coding_type, Qraw_text))
c28a9453 8418 {
584948ac
KH
8419 category = coding_category_raw_text;
8420 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
c28a9453 8421 }
df7492f9 8422 else if (EQ (coding_type, Qutf_8))
4ed46869 8423 {
584948ac
KH
8424 category = coding_category_utf_8;
8425 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
4ed46869 8426 }
df7492f9
KH
8427 else if (EQ (coding_type, Qundecided))
8428 category = coding_category_undecided;
4ed46869 8429 else
df7492f9 8430 error ("Invalid coding system type: %s",
8f924df7 8431 SDATA (SYMBOL_NAME (coding_type)));
4ed46869 8432
df7492f9 8433 CODING_ATTR_CATEGORY (attrs) = make_number (category);
01378f49
KH
8434 CODING_ATTR_PLIST (attrs)
8435 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
8436 CODING_ATTR_PLIST (attrs)));
c4825358 8437
df7492f9
KH
8438 eol_type = args[coding_arg_eol_type];
8439 if (! NILP (eol_type)
8440 && ! EQ (eol_type, Qunix)
8441 && ! EQ (eol_type, Qdos)
8442 && ! EQ (eol_type, Qmac))
8443 error ("Invalid eol-type");
4ed46869 8444
df7492f9 8445 aliases = Fcons (name, Qnil);
4ed46869 8446
df7492f9
KH
8447 if (NILP (eol_type))
8448 {
8449 eol_type = make_subsidiaries (name);
8450 for (i = 0; i < 3; i++)
1397dc18 8451 {
df7492f9
KH
8452 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
8453
8454 this_name = AREF (eol_type, i);
8455 this_aliases = Fcons (this_name, Qnil);
8456 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
8457 this_spec = Fmake_vector (make_number (3), attrs);
8458 ASET (this_spec, 1, this_aliases);
8459 ASET (this_spec, 2, this_eol_type);
8460 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
8461 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
8462 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
8463 Vcoding_system_alist);
1397dc18 8464 }
d46c5b12 8465 }
4ed46869 8466
df7492f9
KH
8467 spec_vec = Fmake_vector (make_number (3), attrs);
8468 ASET (spec_vec, 1, aliases);
8469 ASET (spec_vec, 2, eol_type);
48b0f3ae 8470
df7492f9
KH
8471 Fputhash (name, spec_vec, Vcoding_system_hash_table);
8472 Vcoding_system_list = Fcons (name, Vcoding_system_list);
8473 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
8474 Vcoding_system_alist);
48b0f3ae 8475
df7492f9
KH
8476 {
8477 int id = coding_categories[category].id;
48b0f3ae 8478
df7492f9
KH
8479 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
8480 setup_coding_system (name, &coding_categories[category]);
8481 }
48b0f3ae 8482
d46c5b12 8483 return Qnil;
48b0f3ae 8484
df7492f9
KH
8485 short_args:
8486 return Fsignal (Qwrong_number_of_arguments,
8487 Fcons (intern ("define-coding-system-internal"),
8488 make_number (nargs)));
d46c5b12 8489}
4ed46869 8490
da7db224 8491/* Fixme: should this record the alias relationships for
e1c23804 8492 diagnostics? Should it update coding-system-list? */
df7492f9
KH
8493DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
8494 Sdefine_coding_system_alias, 2, 2, 0,
8495 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
8496 (alias, coding_system)
8497 Lisp_Object alias, coding_system;
66cfb530 8498{
df7492f9 8499 Lisp_Object spec, aliases, eol_type;
4ed46869 8500
df7492f9
KH
8501 CHECK_SYMBOL (alias);
8502 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8503 aliases = AREF (spec, 1);
8504 while (!NILP (XCDR (aliases)))
8505 aliases = XCDR (aliases);
8f924df7 8506 XSETCDR (aliases, Fcons (alias, Qnil));
4ed46869 8507
df7492f9
KH
8508 eol_type = AREF (spec, 2);
8509 if (VECTORP (eol_type))
4ed46869 8510 {
df7492f9
KH
8511 Lisp_Object subsidiaries;
8512 int i;
4ed46869 8513
df7492f9
KH
8514 subsidiaries = make_subsidiaries (alias);
8515 for (i = 0; i < 3; i++)
8516 Fdefine_coding_system_alias (AREF (subsidiaries, i),
8517 AREF (eol_type, i));
8518
8519 ASET (spec, 2, subsidiaries);
4ed46869 8520 }
df7492f9
KH
8521
8522 Fputhash (alias, spec, Vcoding_system_hash_table);
5bad0796
DL
8523 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
8524 Vcoding_system_alist);
66cfb530 8525
4ed46869
KH
8526 return Qnil;
8527}
8528
df7492f9
KH
8529DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
8530 1, 1, 0,
8531 doc: /* Return the base of CODING-SYSTEM.
da7db224 8532Any alias or subsidiary coding system is not a base coding system. */)
df7492f9
KH
8533 (coding_system)
8534 Lisp_Object coding_system;
d46c5b12 8535{
df7492f9 8536 Lisp_Object spec, attrs;
d46c5b12 8537
df7492f9
KH
8538 if (NILP (coding_system))
8539 return (Qno_conversion);
8540 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8541 attrs = AREF (spec, 0);
8542 return CODING_ATTR_BASE_NAME (attrs);
8543}
1397dc18 8544
df7492f9
KH
8545DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
8546 1, 1, 0,
8547 doc: "Return the property list of CODING-SYSTEM.")
8548 (coding_system)
8549 Lisp_Object coding_system;
8550{
8551 Lisp_Object spec, attrs;
1397dc18 8552
df7492f9
KH
8553 if (NILP (coding_system))
8554 coding_system = Qno_conversion;
8555 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8556 attrs = AREF (spec, 0);
8557 return CODING_ATTR_PLIST (attrs);
d46c5b12
KH
8558}
8559
df7492f9
KH
8560
8561DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
8562 1, 1, 0,
da7db224 8563 doc: /* Return the list of aliases of CODING-SYSTEM. */)
df7492f9
KH
8564 (coding_system)
8565 Lisp_Object coding_system;
66cfb530 8566{
df7492f9 8567 Lisp_Object spec;
84d60297 8568
df7492f9
KH
8569 if (NILP (coding_system))
8570 coding_system = Qno_conversion;
8571 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
da7db224 8572 return AREF (spec, 1);
df7492f9 8573}
66cfb530 8574
df7492f9
KH
8575DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
8576 Scoding_system_eol_type, 1, 1, 0,
8577 doc: /* Return eol-type of CODING-SYSTEM.
8578An eol-type is integer 0, 1, 2, or a vector of coding systems.
66cfb530 8579
df7492f9
KH
8580Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
8581and CR respectively.
66cfb530 8582
df7492f9
KH
8583A vector value indicates that a format of end-of-line should be
8584detected automatically. Nth element of the vector is the subsidiary
8585coding system whose eol-type is N. */)
6b89e3aa
KH
8586 (coding_system)
8587 Lisp_Object coding_system;
8588{
df7492f9
KH
8589 Lisp_Object spec, eol_type;
8590 int n;
6b89e3aa 8591
df7492f9
KH
8592 if (NILP (coding_system))
8593 coding_system = Qno_conversion;
8594 if (! CODING_SYSTEM_P (coding_system))
8595 return Qnil;
8596 spec = CODING_SYSTEM_SPEC (coding_system);
8597 eol_type = AREF (spec, 2);
8598 if (VECTORP (eol_type))
8599 return Fcopy_sequence (eol_type);
8600 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
8601 return make_number (n);
6b89e3aa
KH
8602}
8603
4ed46869
KH
8604#endif /* emacs */
8605
8606\f
1397dc18 8607/*** 9. Post-amble ***/
4ed46869 8608
dfcf069d 8609void
4ed46869
KH
8610init_coding_once ()
8611{
8612 int i;
8613
df7492f9
KH
8614 for (i = 0; i < coding_category_max; i++)
8615 {
8616 coding_categories[i].id = -1;
8617 coding_priorities[i] = i;
8618 }
4ed46869
KH
8619
8620 /* ISO2022 specific initialize routine. */
8621 for (i = 0; i < 0x20; i++)
b73bfc1c 8622 iso_code_class[i] = ISO_control_0;
4ed46869
KH
8623 for (i = 0x21; i < 0x7F; i++)
8624 iso_code_class[i] = ISO_graphic_plane_0;
8625 for (i = 0x80; i < 0xA0; i++)
b73bfc1c 8626 iso_code_class[i] = ISO_control_1;
4ed46869
KH
8627 for (i = 0xA1; i < 0xFF; i++)
8628 iso_code_class[i] = ISO_graphic_plane_1;
8629 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
8630 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
8631 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
8632 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
8633 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
8634 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
8635 iso_code_class[ISO_CODE_ESC] = ISO_escape;
8636 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
8637 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
8638 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
8639
b843d1ae 8640 inhibit_pre_post_conversion = 0;
df7492f9
KH
8641
8642 for (i = 0; i < 256; i++)
8643 {
8644 emacs_mule_bytes[i] = 1;
8645 }
7c78e542
KH
8646 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
8647 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
8648 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
8649 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
e0e989f6
KH
8650}
8651
8652#ifdef emacs
8653
dfcf069d 8654void
e0e989f6
KH
8655syms_of_coding ()
8656{
df7492f9 8657 staticpro (&Vcoding_system_hash_table);
8f924df7
KH
8658 {
8659 Lisp_Object args[2];
8660 args[0] = QCtest;
8661 args[1] = Qeq;
8662 Vcoding_system_hash_table = Fmake_hash_table (2, args);
8663 }
df7492f9
KH
8664
8665 staticpro (&Vsjis_coding_system);
8666 Vsjis_coding_system = Qnil;
e0e989f6 8667
df7492f9
KH
8668 staticpro (&Vbig5_coding_system);
8669 Vbig5_coding_system = Qnil;
8670
8671 staticpro (&Vcode_conversion_work_buf_list);
8672 Vcode_conversion_work_buf_list = Qnil;
e0e989f6 8673
df7492f9
KH
8674 staticpro (&Vcode_conversion_reused_work_buf);
8675 Vcode_conversion_reused_work_buf = Qnil;
8676
8677 DEFSYM (Qcharset, "charset");
8678 DEFSYM (Qtarget_idx, "target-idx");
8679 DEFSYM (Qcoding_system_history, "coding-system-history");
bb0115a2
RS
8680 Fset (Qcoding_system_history, Qnil);
8681
9ce27fde 8682 /* Target FILENAME is the first argument. */
e0e989f6 8683 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 8684 /* Target FILENAME is the third argument. */
e0e989f6
KH
8685 Fput (Qwrite_region, Qtarget_idx, make_number (2));
8686
df7492f9 8687 DEFSYM (Qcall_process, "call-process");
9ce27fde 8688 /* Target PROGRAM is the first argument. */
e0e989f6
KH
8689 Fput (Qcall_process, Qtarget_idx, make_number (0));
8690
df7492f9 8691 DEFSYM (Qcall_process_region, "call-process-region");
9ce27fde 8692 /* Target PROGRAM is the third argument. */
e0e989f6
KH
8693 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
8694
df7492f9 8695 DEFSYM (Qstart_process, "start-process");
9ce27fde 8696 /* Target PROGRAM is the third argument. */
e0e989f6
KH
8697 Fput (Qstart_process, Qtarget_idx, make_number (2));
8698
df7492f9 8699 DEFSYM (Qopen_network_stream, "open-network-stream");
9ce27fde 8700 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
8701 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
8702
df7492f9
KH
8703 DEFSYM (Qcoding_system, "coding-system");
8704 DEFSYM (Qcoding_aliases, "coding-aliases");
4ed46869 8705
df7492f9
KH
8706 DEFSYM (Qeol_type, "eol-type");
8707 DEFSYM (Qunix, "unix");
8708 DEFSYM (Qdos, "dos");
4ed46869 8709
df7492f9
KH
8710 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
8711 DEFSYM (Qpost_read_conversion, "post-read-conversion");
8712 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
8713 DEFSYM (Qdefault_char, "default-char");
8714 DEFSYM (Qundecided, "undecided");
8715 DEFSYM (Qno_conversion, "no-conversion");
8716 DEFSYM (Qraw_text, "raw-text");
4ed46869 8717
df7492f9 8718 DEFSYM (Qiso_2022, "iso-2022");
4ed46869 8719
df7492f9 8720 DEFSYM (Qutf_8, "utf-8");
8f924df7 8721 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
27901516 8722
df7492f9 8723 DEFSYM (Qutf_16, "utf-16");
df7492f9
KH
8724 DEFSYM (Qbig, "big");
8725 DEFSYM (Qlittle, "little");
27901516 8726
df7492f9
KH
8727 DEFSYM (Qshift_jis, "shift-jis");
8728 DEFSYM (Qbig5, "big5");
4ed46869 8729
df7492f9 8730 DEFSYM (Qcoding_system_p, "coding-system-p");
4ed46869 8731
df7492f9 8732 DEFSYM (Qcoding_system_error, "coding-system-error");
4ed46869
KH
8733 Fput (Qcoding_system_error, Qerror_conditions,
8734 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
8735 Fput (Qcoding_system_error, Qerror_message,
9ce27fde 8736 build_string ("Invalid coding system"));
4ed46869 8737
05e6f5dc
KH
8738 /* Intern this now in case it isn't already done.
8739 Setting this variable twice is harmless.
8740 But don't staticpro it here--that is done in alloc.c. */
8741 Qchar_table_extra_slots = intern ("char-table-extra-slots");
70c22245 8742
df7492f9 8743 DEFSYM (Qtranslation_table, "translation-table");
1397dc18 8744 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
df7492f9
KH
8745 DEFSYM (Qtranslation_table_id, "translation-table-id");
8746 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
8747 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
1397dc18 8748
df7492f9 8749 DEFSYM (Qvalid_codes, "valid-codes");
9ce27fde 8750
df7492f9 8751 DEFSYM (Qemacs_mule, "emacs-mule");
d46c5b12 8752
01378f49
KH
8753 DEFSYM (QCcategory, ":category");
8754
df7492f9
KH
8755 Vcoding_category_table
8756 = Fmake_vector (make_number (coding_category_max), Qnil);
8757 staticpro (&Vcoding_category_table);
8758 /* Followings are target of code detection. */
8759 ASET (Vcoding_category_table, coding_category_iso_7,
8760 intern ("coding-category-iso-7"));
8761 ASET (Vcoding_category_table, coding_category_iso_7_tight,
8762 intern ("coding-category-iso-7-tight"));
8763 ASET (Vcoding_category_table, coding_category_iso_8_1,
8764 intern ("coding-category-iso-8-1"));
8765 ASET (Vcoding_category_table, coding_category_iso_8_2,
8766 intern ("coding-category-iso-8-2"));
8767 ASET (Vcoding_category_table, coding_category_iso_7_else,
8768 intern ("coding-category-iso-7-else"));
8769 ASET (Vcoding_category_table, coding_category_iso_8_else,
8770 intern ("coding-category-iso-8-else"));
8771 ASET (Vcoding_category_table, coding_category_utf_8,
8772 intern ("coding-category-utf-8"));
8773 ASET (Vcoding_category_table, coding_category_utf_16_be,
8774 intern ("coding-category-utf-16-be"));
ff563fce
KH
8775 ASET (Vcoding_category_table, coding_category_utf_16_auto,
8776 intern ("coding-category-utf-16-auto"));
df7492f9
KH
8777 ASET (Vcoding_category_table, coding_category_utf_16_le,
8778 intern ("coding-category-utf-16-le"));
8779 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
8780 intern ("coding-category-utf-16-be-nosig"));
8781 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
8782 intern ("coding-category-utf-16-le-nosig"));
8783 ASET (Vcoding_category_table, coding_category_charset,
8784 intern ("coding-category-charset"));
8785 ASET (Vcoding_category_table, coding_category_sjis,
8786 intern ("coding-category-sjis"));
8787 ASET (Vcoding_category_table, coding_category_big5,
8788 intern ("coding-category-big5"));
8789 ASET (Vcoding_category_table, coding_category_ccl,
8790 intern ("coding-category-ccl"));
8791 ASET (Vcoding_category_table, coding_category_emacs_mule,
8792 intern ("coding-category-emacs-mule"));
8793 /* Followings are NOT target of code detection. */
8794 ASET (Vcoding_category_table, coding_category_raw_text,
8795 intern ("coding-category-raw-text"));
8796 ASET (Vcoding_category_table, coding_category_undecided,
8797 intern ("coding-category-undecided"));
ecf488bc 8798
4ed46869
KH
8799 defsubr (&Scoding_system_p);
8800 defsubr (&Sread_coding_system);
8801 defsubr (&Sread_non_nil_coding_system);
8802 defsubr (&Scheck_coding_system);
8803 defsubr (&Sdetect_coding_region);
d46c5b12 8804 defsubr (&Sdetect_coding_string);
05e6f5dc 8805 defsubr (&Sfind_coding_systems_region_internal);
068a9dbd 8806 defsubr (&Sunencodable_char_position);
df7492f9 8807 defsubr (&Scheck_coding_systems_region);
4ed46869
KH
8808 defsubr (&Sdecode_coding_region);
8809 defsubr (&Sencode_coding_region);
8810 defsubr (&Sdecode_coding_string);
8811 defsubr (&Sencode_coding_string);
8812 defsubr (&Sdecode_sjis_char);
8813 defsubr (&Sencode_sjis_char);
8814 defsubr (&Sdecode_big5_char);
8815 defsubr (&Sencode_big5_char);
1ba9e4ab 8816 defsubr (&Sset_terminal_coding_system_internal);
c4825358 8817 defsubr (&Sset_safe_terminal_coding_system_internal);
4ed46869 8818 defsubr (&Sterminal_coding_system);
1ba9e4ab 8819 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 8820 defsubr (&Skeyboard_coding_system);
a5d301df 8821 defsubr (&Sfind_operation_coding_system);
df7492f9 8822 defsubr (&Sset_coding_system_priority);
6b89e3aa 8823 defsubr (&Sdefine_coding_system_internal);
df7492f9
KH
8824 defsubr (&Sdefine_coding_system_alias);
8825 defsubr (&Scoding_system_base);
8826 defsubr (&Scoding_system_plist);
8827 defsubr (&Scoding_system_aliases);
8828 defsubr (&Scoding_system_eol_type);
8829 defsubr (&Scoding_system_priority_list);
4ed46869 8830
4608c386 8831 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
48b0f3ae
PJ
8832 doc: /* List of coding systems.
8833
8834Do not alter the value of this variable manually. This variable should be
df7492f9 8835updated by the functions `define-coding-system' and
48b0f3ae 8836`define-coding-system-alias'. */);
4608c386
KH
8837 Vcoding_system_list = Qnil;
8838
8839 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
48b0f3ae
PJ
8840 doc: /* Alist of coding system names.
8841Each element is one element list of coding system name.
8842This variable is given to `completing-read' as TABLE argument.
8843
8844Do not alter the value of this variable manually. This variable should be
8845updated by the functions `make-coding-system' and
8846`define-coding-system-alias'. */);
4608c386
KH
8847 Vcoding_system_alist = Qnil;
8848
4ed46869 8849 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
48b0f3ae
PJ
8850 doc: /* List of coding-categories (symbols) ordered by priority.
8851
8852On detecting a coding system, Emacs tries code detection algorithms
8853associated with each coding-category one by one in this order. When
8854one algorithm agrees with a byte sequence of source text, the coding
8855system bound to the corresponding coding-category is selected. */);
4ed46869
KH
8856 {
8857 int i;
8858
8859 Vcoding_category_list = Qnil;
df7492f9 8860 for (i = coding_category_max - 1; i >= 0; i--)
4ed46869 8861 Vcoding_category_list
d46c5b12
KH
8862 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
8863 Vcoding_category_list);
4ed46869
KH
8864 }
8865
8866 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
48b0f3ae
PJ
8867 doc: /* Specify the coding system for read operations.
8868It is useful to bind this variable with `let', but do not set it globally.
8869If the value is a coding system, it is used for decoding on read operation.
8870If not, an appropriate element is used from one of the coding system alists:
8871There are three such tables, `file-coding-system-alist',
8872`process-coding-system-alist', and `network-coding-system-alist'. */);
4ed46869
KH
8873 Vcoding_system_for_read = Qnil;
8874
8875 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
48b0f3ae
PJ
8876 doc: /* Specify the coding system for write operations.
8877Programs bind this variable with `let', but you should not set it globally.
8878If the value is a coding system, it is used for encoding of output,
8879when writing it to a file and when sending it to a file or subprocess.
8880
8881If this does not specify a coding system, an appropriate element
8882is used from one of the coding system alists:
8883There are three such tables, `file-coding-system-alist',
8884`process-coding-system-alist', and `network-coding-system-alist'.
8885For output to files, if the above procedure does not specify a coding system,
8886the value of `buffer-file-coding-system' is used. */);
4ed46869
KH
8887 Vcoding_system_for_write = Qnil;
8888
8889 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
df7492f9
KH
8890 doc: /*
8891Coding system used in the latest file or process I/O. */);
4ed46869
KH
8892 Vlast_coding_system_used = Qnil;
8893
9ce27fde 8894 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
df7492f9
KH
8895 doc: /*
8896*Non-nil means always inhibit code conversion of end-of-line format.
48b0f3ae
PJ
8897See info node `Coding Systems' and info node `Text and Binary' concerning
8898such conversion. */);
9ce27fde
KH
8899 inhibit_eol_conversion = 0;
8900
ed29121d 8901 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
df7492f9
KH
8902 doc: /*
8903Non-nil means process buffer inherits coding system of process output.
48b0f3ae
PJ
8904Bind it to t if the process output is to be treated as if it were a file
8905read from some filesystem. */);
ed29121d
EZ
8906 inherit_process_coding_system = 0;
8907
02ba4723 8908 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
df7492f9
KH
8909 doc: /*
8910Alist to decide a coding system to use for a file I/O operation.
48b0f3ae
PJ
8911The format is ((PATTERN . VAL) ...),
8912where PATTERN is a regular expression matching a file name,
8913VAL is a coding system, a cons of coding systems, or a function symbol.
8914If VAL is a coding system, it is used for both decoding and encoding
8915the file contents.
8916If VAL is a cons of coding systems, the car part is used for decoding,
8917and the cdr part is used for encoding.
8918If VAL is a function symbol, the function must return a coding system
0192762c
DL
8919or a cons of coding systems which are used as above. The function gets
8920the arguments with which `find-operation-coding-systems' was called.
48b0f3ae
PJ
8921
8922See also the function `find-operation-coding-system'
8923and the variable `auto-coding-alist'. */);
02ba4723
KH
8924 Vfile_coding_system_alist = Qnil;
8925
8926 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
df7492f9
KH
8927 doc: /*
8928Alist to decide a coding system to use for a process I/O operation.
48b0f3ae
PJ
8929The format is ((PATTERN . VAL) ...),
8930where PATTERN is a regular expression matching a program name,
8931VAL is a coding system, a cons of coding systems, or a function symbol.
8932If VAL is a coding system, it is used for both decoding what received
8933from the program and encoding what sent to the program.
8934If VAL is a cons of coding systems, the car part is used for decoding,
8935and the cdr part is used for encoding.
8936If VAL is a function symbol, the function must return a coding system
8937or a cons of coding systems which are used as above.
8938
8939See also the function `find-operation-coding-system'. */);
02ba4723
KH
8940 Vprocess_coding_system_alist = Qnil;
8941
8942 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
df7492f9
KH
8943 doc: /*
8944Alist to decide a coding system to use for a network I/O operation.
48b0f3ae
PJ
8945The format is ((PATTERN . VAL) ...),
8946where PATTERN is a regular expression matching a network service name
8947or is a port number to connect to,
8948VAL is a coding system, a cons of coding systems, or a function symbol.
8949If VAL is a coding system, it is used for both decoding what received
8950from the network stream and encoding what sent to the network stream.
8951If VAL is a cons of coding systems, the car part is used for decoding,
8952and the cdr part is used for encoding.
8953If VAL is a function symbol, the function must return a coding system
8954or a cons of coding systems which are used as above.
8955
8956See also the function `find-operation-coding-system'. */);
02ba4723 8957 Vnetwork_coding_system_alist = Qnil;
4ed46869 8958
68c45bf0 8959 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
75205970
RS
8960 doc: /* Coding system to use with system messages.
8961Also used for decoding keyboard input on X Window system. */);
68c45bf0
PE
8962 Vlocale_coding_system = Qnil;
8963
005f0d35 8964 /* The eol mnemonics are reset in startup.el system-dependently. */
7722baf9 8965 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
df7492f9
KH
8966 doc: /*
8967*String displayed in mode line for UNIX-like (LF) end-of-line format. */);
7722baf9 8968 eol_mnemonic_unix = build_string (":");
4ed46869 8969
7722baf9 8970 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
df7492f9
KH
8971 doc: /*
8972*String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
7722baf9 8973 eol_mnemonic_dos = build_string ("\\");
4ed46869 8974
7722baf9 8975 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
df7492f9
KH
8976 doc: /*
8977*String displayed in mode line for MAC-like (CR) end-of-line format. */);
7722baf9 8978 eol_mnemonic_mac = build_string ("/");
4ed46869 8979
7722baf9 8980 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
df7492f9
KH
8981 doc: /*
8982*String displayed in mode line when end-of-line format is not yet determined. */);
7722baf9 8983 eol_mnemonic_undecided = build_string (":");
4ed46869 8984
84fbb8a0 8985 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
df7492f9
KH
8986 doc: /*
8987*Non-nil enables character translation while encoding and decoding. */);
84fbb8a0 8988 Venable_character_translation = Qt;
bdd9fb48 8989
f967223b 8990 DEFVAR_LISP ("standard-translation-table-for-decode",
48b0f3ae
PJ
8991 &Vstandard_translation_table_for_decode,
8992 doc: /* Table for translating characters while decoding. */);
f967223b 8993 Vstandard_translation_table_for_decode = Qnil;
bdd9fb48 8994
f967223b 8995 DEFVAR_LISP ("standard-translation-table-for-encode",
48b0f3ae
PJ
8996 &Vstandard_translation_table_for_encode,
8997 doc: /* Table for translating characters while encoding. */);
f967223b 8998 Vstandard_translation_table_for_encode = Qnil;
4ed46869 8999
df7492f9 9000 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
48b0f3ae
PJ
9001 doc: /* Alist of charsets vs revision numbers.
9002While encoding, if a charset (car part of an element) is found,
df7492f9
KH
9003designate it with the escape sequence identifying revision (cdr part
9004of the element). */);
9005 Vcharset_revision_table = Qnil;
02ba4723
KH
9006
9007 DEFVAR_LISP ("default-process-coding-system",
9008 &Vdefault_process_coding_system,
48b0f3ae
PJ
9009 doc: /* Cons of coding systems used for process I/O by default.
9010The car part is used for decoding a process output,
9011the cdr part is used for encoding a text to be sent to a process. */);
02ba4723 9012 Vdefault_process_coding_system = Qnil;
c4825358 9013
3f003981 9014 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
df7492f9
KH
9015 doc: /*
9016Table of extra Latin codes in the range 128..159 (inclusive).
48b0f3ae
PJ
9017This is a vector of length 256.
9018If Nth element is non-nil, the existence of code N in a file
9019\(or output of subprocess) doesn't prevent it to be detected as
9020a coding system of ISO 2022 variant which has a flag
9021`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9022or reading output of a subprocess.
9023Only 128th through 159th elements has a meaning. */);
3f003981 9024 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
d46c5b12
KH
9025
9026 DEFVAR_LISP ("select-safe-coding-system-function",
9027 &Vselect_safe_coding_system_function,
df7492f9
KH
9028 doc: /*
9029Function to call to select safe coding system for encoding a text.
48b0f3ae
PJ
9030
9031If set, this function is called to force a user to select a proper
9032coding system which can encode the text in the case that a default
9033coding system used in each operation can't encode the text.
9034
9035The default value is `select-safe-coding-system' (which see). */);
d46c5b12
KH
9036 Vselect_safe_coding_system_function = Qnil;
9037
5d5bf4d8
KH
9038 DEFVAR_BOOL ("coding-system-require-warning",
9039 &coding_system_require_warning,
9040 doc: /* Internal use only.
6b89e3aa
KH
9041If non-nil, on writing a file, `select-safe-coding-system-function' is
9042called even if `coding-system-for-write' is non-nil. The command
9043`universal-coding-system-argument' binds this variable to t temporarily. */);
5d5bf4d8
KH
9044 coding_system_require_warning = 0;
9045
9046
22ab2303 9047 DEFVAR_BOOL ("inhibit-iso-escape-detection",
74383408 9048 &inhibit_iso_escape_detection,
df7492f9
KH
9049 doc: /*
9050If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
48b0f3ae
PJ
9051
9052By default, on reading a file, Emacs tries to detect how the text is
9053encoded. This code detection is sensitive to escape sequences. If
9054the sequence is valid as ISO2022, the code is determined as one of
9055the ISO2022 encodings, and the file is decoded by the corresponding
9056coding system (e.g. `iso-2022-7bit').
9057
9058However, there may be a case that you want to read escape sequences in
9059a file as is. In such a case, you can set this variable to non-nil.
9060Then, as the code detection ignores any escape sequences, no file is
9061detected as encoded in some ISO2022 encoding. The result is that all
9062escape sequences become visible in a buffer.
9063
9064The default value is nil, and it is strongly recommended not to change
9065it. That is because many Emacs Lisp source files that contain
9066non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9067in Emacs's distribution, and they won't be decoded correctly on
9068reading if you suppress escape sequence detection.
9069
9070The other way to read escape sequences in a file without decoding is
9071to explicitly specify some coding system that doesn't use ISO2022's
9072escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
74383408 9073 inhibit_iso_escape_detection = 0;
002fdb44
DL
9074
9075 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
15c8f9d1
DL
9076 doc: /* Char table for translating self-inserting characters.
9077This is applied to the result of input methods, not their input. See also
9078`keyboard-translate-table'. */);
002fdb44 9079 Vtranslation_table_for_input = Qnil;
8f924df7 9080
2c78b7e1
KH
9081 {
9082 Lisp_Object args[coding_arg_max];
8f924df7 9083 Lisp_Object plist[16];
2c78b7e1
KH
9084 int i;
9085
9086 for (i = 0; i < coding_arg_max; i++)
9087 args[i] = Qnil;
9088
9089 plist[0] = intern (":name");
9090 plist[1] = args[coding_arg_name] = Qno_conversion;
9091 plist[2] = intern (":mnemonic");
9092 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
9093 plist[4] = intern (":coding-type");
9094 plist[5] = args[coding_arg_coding_type] = Qraw_text;
9095 plist[6] = intern (":ascii-compatible-p");
9096 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
9097 plist[8] = intern (":default-char");
9098 plist[9] = args[coding_arg_default_char] = make_number (0);
8f924df7
KH
9099 plist[10] = intern (":for-unibyte");
9100 plist[11] = args[coding_arg_for_unibyte] = Qt;
9101 plist[12] = intern (":docstring");
9102 plist[13] = build_string ("Do no conversion.\n\
2c78b7e1
KH
9103\n\
9104When you visit a file with this coding, the file is read into a\n\
9105unibyte buffer as is, thus each byte of a file is treated as a\n\
9106character.");
8f924df7
KH
9107 plist[14] = intern (":eol-type");
9108 plist[15] = args[coding_arg_eol_type] = Qunix;
9109 args[coding_arg_plist] = Flist (16, plist);
2c78b7e1
KH
9110 Fdefine_coding_system_internal (coding_arg_max, args);
9111 }
9112
9113 setup_coding_system (Qno_conversion, &keyboard_coding);
9114 setup_coding_system (Qno_conversion, &terminal_coding);
9115 setup_coding_system (Qno_conversion, &safe_terminal_coding);
ff563fce
KH
9116
9117 {
9118 int i;
9119
9120 for (i = 0; i < coding_category_max; i++)
9121 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
9122 }
4ed46869
KH
9123}
9124
68c45bf0
PE
9125char *
9126emacs_strerror (error_number)
9127 int error_number;
9128{
9129 char *str;
9130
ca9c0567 9131 synchronize_system_messages_locale ();
68c45bf0
PE
9132 str = strerror (error_number);
9133
9134 if (! NILP (Vlocale_coding_system))
9135 {
9136 Lisp_Object dec = code_convert_string_norecord (build_string (str),
9137 Vlocale_coding_system,
9138 0);
d5db4077 9139 str = (char *) SDATA (dec);
68c45bf0
PE
9140 }
9141
9142 return str;
9143}
9144
4ed46869 9145#endif /* emacs */