(get_translation_table): New function.
[bpt/emacs.git] / src / coding.c
CommitLineData
9542cb1f 1/* Coding system handler (conversion, detection, etc).
4a2f9c6a 2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
8f924df7 3 Licensed to the Free Software Foundation.
6f197c07 4 Copyright (C) 2001, 2002 Free Software Foundation, Inc.
8f924df7 5 Copyright (C) 2003
df7492f9
KH
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
4ed46869 8
369314dc
KH
9This file is part of GNU Emacs.
10
11GNU Emacs is free software; you can redistribute it and/or modify
12it under the terms of the GNU General Public License as published by
13the Free Software Foundation; either version 2, or (at your option)
14any later version.
4ed46869 15
369314dc
KH
16GNU Emacs is distributed in the hope that it will be useful,
17but WITHOUT ANY WARRANTY; without even the implied warranty of
18MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19GNU General Public License for more details.
4ed46869 20
369314dc
KH
21You should have received a copy of the GNU General Public License
22along with GNU Emacs; see the file COPYING. If not, write to
23the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24Boston, MA 02111-1307, USA. */
4ed46869
KH
25
26/*** TABLE OF CONTENTS ***
27
b73bfc1c 28 0. General comments
4ed46869 29 1. Preamble
df7492f9
KH
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
4ed46869
KH
41
42*/
43
df7492f9 44/*** 0. General comments ***
b73bfc1c
KH
45
46
df7492f9 47CODING SYSTEM
4ed46869 48
5bad0796
DL
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
e19c3639
KH
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
0ef69138 56 coding system.
4ed46869 57
e19c3639
KH
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
5bad0796 60 stored in the hash table Vcoding_system_hash_table. The conversion from
e19c3639
KH
61 coding system symbol to attributes vector is done by looking up
62 Vcoding_system_hash_table by the symbol.
4ed46869 63
e19c3639 64 Coding systems are classified into the following types depending on
5bad0796 65 the encoding mechanism. Here's a brief description of the types.
4ed46869 66
df7492f9
KH
67 o UTF-8
68
69 o UTF-16
70
71 o Charset-base coding system
72
73 A coding system defined by one or more (coded) character sets.
5bad0796 74 Decoding and encoding are done by a code converter defined for each
df7492f9
KH
75 character set.
76
5bad0796 77 o Old Emacs internal format (emacs-mule)
df7492f9 78
5bad0796 79 The coding system adopted by old versions of Emacs (20 and 21).
4ed46869 80
df7492f9 81 o ISO2022-base coding system
4ed46869
KH
82
83 The most famous coding system for multiple character sets. X's
df7492f9
KH
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
4ed46869 87
df7492f9 88 o SJIS (or Shift-JIS or MS-Kanji-Code)
93dec019 89
4ed46869
KH
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
df7492f9 92 section 8.
4ed46869 93
df7492f9 94 o BIG5
4ed46869 95
df7492f9 96 A coding system to encode character sets: ASCII and Big5. Widely
cfb43547 97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
df7492f9
KH
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
4ed46869 101
df7492f9 102 o CCL
27901516 103
5bad0796 104 If a user wants to decode/encode text encoded in a coding system
df7492f9
KH
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
27901516 108
df7492f9 109 o Raw-text
4ed46869 110
5a936b46 111 A coding system for text containing raw eight-bit data. Emacs
5bad0796 112 treats each byte of source text as a character (except for
df7492f9 113 end-of-line conversion).
4ed46869 114
df7492f9
KH
115 o No-conversion
116
117 Like raw text, but don't do end-of-line conversion.
4ed46869 118
4ed46869 119
df7492f9 120END-OF-LINE FORMAT
4ed46869 121
5bad0796 122 How text end-of-line is encoded depends on the operating system. For
df7492f9 123 instance, Unix's format is just one byte of LF (line-feed) code,
f4dee582 124 whereas DOS's format is a two-byte sequence of `carriage-return' and
d46c5b12
KH
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
4ed46869 127
cfb43547 128 Since text character encoding and end-of-line encoding are
df7492f9
KH
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
4ed46869 131
e19c3639
KH
132STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we set up a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
5bad0796 137 conversion (e.g. the location of source and destination data).
4ed46869
KH
138
139*/
140
df7492f9
KH
141/* COMMON MACROS */
142
143
4ed46869
KH
144/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145
df7492f9 146 These functions check if a byte sequence specified as a source in
ff0dacd7
KH
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
df7492f9 149
ff0dacd7 150 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
df7492f9
KH
151
152 Below is the template of these functions. */
153
4ed46869 154#if 0
df7492f9 155static int
ff0dacd7 156detect_coding_XXX (coding, detect_info)
df7492f9 157 struct coding_system *coding;
ff0dacd7 158 struct coding_detection_info *detect_info;
4ed46869 159{
df7492f9
KH
160 unsigned char *src = coding->source;
161 unsigned char *src_end = coding->source + coding->src_bytes;
162 int multibytep = coding->src_multibyte;
ff0dacd7 163 int consumed_chars = 0;
df7492f9
KH
164 int found = 0;
165 ...;
166
167 while (1)
168 {
169 /* Get one byte from the source. If the souce is exausted, jump
170 to no_more_source:. */
171 ONE_MORE_BYTE (c);
ff0dacd7
KH
172
173 if (! __C_conforms_to_XXX___ (c))
174 break;
175 if (! __C_strongly_suggests_XXX__ (c))
176 found = CATEGORY_MASK_XXX;
df7492f9 177 }
ff0dacd7
KH
178 /* The byte sequence is invalid for XXX. */
179 detect_info->rejected |= CATEGORY_MASK_XXX;
df7492f9 180 return 0;
ff0dacd7 181
df7492f9 182 no_more_source:
ff0dacd7
KH
183 /* The source exausted successfully. */
184 detect_info->found |= found;
df7492f9 185 return 1;
4ed46869
KH
186}
187#endif
188
189/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
190
df7492f9
KH
191 These functions decode a byte sequence specified as a source by
192 CODING. The resulting multibyte text goes to a place pointed to by
193 CODING->charbuf, the length of which should not exceed
194 CODING->charbuf_size;
d46c5b12 195
df7492f9
KH
196 These functions set the information of original and decoded texts in
197 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
198 They also set CODING->result to one of CODING_RESULT_XXX indicating
199 how the decoding is finished.
d46c5b12 200
df7492f9 201 Below is the template of these functions. */
d46c5b12 202
4ed46869 203#if 0
b73bfc1c 204static void
df7492f9 205decode_coding_XXXX (coding)
4ed46869 206 struct coding_system *coding;
4ed46869 207{
df7492f9
KH
208 unsigned char *src = coding->source + coding->consumed;
209 unsigned char *src_end = coding->source + coding->src_bytes;
210 /* SRC_BASE remembers the start position in source in each loop.
211 The loop will be exited when there's not enough source code, or
212 when there's no room in CHARBUF for a decoded character. */
213 unsigned char *src_base;
214 /* A buffer to produce decoded characters. */
215 int *charbuf = coding->charbuf;
216 int *charbuf_end = charbuf + coding->charbuf_size;
217 int multibytep = coding->src_multibyte;
218
219 while (1)
220 {
221 src_base = src;
222 if (charbuf < charbuf_end)
223 /* No more room to produce a decoded character. */
224 break;
225 ONE_MORE_BYTE (c);
226 /* Decode it. */
227 }
228
229 no_more_source:
230 if (src_base < src_end
231 && coding->mode & CODING_MODE_LAST_BLOCK)
232 /* If the source ends by partial bytes to construct a character,
233 treat them as eight-bit raw data. */
234 while (src_base < src_end && charbuf < charbuf_end)
235 *charbuf++ = *src_base++;
236 /* Remember how many bytes and characters we consumed. If the
237 source is multibyte, the bytes and chars are not identical. */
238 coding->consumed = coding->consumed_char = src_base - coding->source;
239 /* Remember how many characters we produced. */
240 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
241}
242#endif
243
244/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
245
df7492f9
KH
246 These functions encode SRC_BYTES length text at SOURCE of Emacs'
247 internal multibyte format by CODING. The resulting byte sequence
b73bfc1c
KH
248 goes to a place pointed to by DESTINATION, the length of which
249 should not exceed DST_BYTES.
d46c5b12 250
df7492f9
KH
251 These functions set the information of original and encoded texts in
252 the members produced, produced_char, consumed, and consumed_char of
253 the structure *CODING. They also set the member result to one of
254 CODING_RESULT_XXX indicating how the encoding finished.
d46c5b12 255
df7492f9
KH
256 DST_BYTES zero means that source area and destination area are
257 overlapped, which means that we can produce encoded text until it
258 reaches the head of the not-yet-encoded source text.
d46c5b12 259
df7492f9 260 Below is a template of these functions. */
4ed46869 261#if 0
b73bfc1c 262static void
df7492f9 263encode_coding_XXX (coding)
4ed46869 264 struct coding_system *coding;
4ed46869 265{
df7492f9
KH
266 int multibytep = coding->dst_multibyte;
267 int *charbuf = coding->charbuf;
268 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
269 unsigned char *dst = coding->destination + coding->produced;
270 unsigned char *dst_end = coding->destination + coding->dst_bytes;
271 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
272 int produced_chars = 0;
273
274 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
275 {
276 int c = *charbuf;
277 /* Encode C into DST, and increment DST. */
278 }
279 label_no_more_destination:
280 /* How many chars and bytes we produced. */
281 coding->produced_char += produced_chars;
282 coding->produced = dst - coding->destination;
4ed46869
KH
283}
284#endif
285
4ed46869
KH
286\f
287/*** 1. Preamble ***/
288
68c45bf0 289#include <config.h>
4ed46869
KH
290#include <stdio.h>
291
4ed46869
KH
292#include "lisp.h"
293#include "buffer.h"
df7492f9 294#include "character.h"
4ed46869
KH
295#include "charset.h"
296#include "ccl.h"
df7492f9 297#include "composite.h"
4ed46869
KH
298#include "coding.h"
299#include "window.h"
4ed46869 300
df7492f9 301Lisp_Object Vcoding_system_hash_table;
4ed46869 302
df7492f9 303Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
1965cb73
DL
304Lisp_Object Qunix, Qdos;
305extern Lisp_Object Qmac; /* frame.c */
4ed46869
KH
306Lisp_Object Qbuffer_file_coding_system;
307Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
df7492f9 308Lisp_Object Qdefault_char;
27901516 309Lisp_Object Qno_conversion, Qundecided;
df7492f9 310Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
b49a1807 311Lisp_Object Qbig, Qlittle;
bb0115a2 312Lisp_Object Qcoding_system_history;
1397dc18 313Lisp_Object Qvalid_codes;
01378f49 314Lisp_Object QCcategory;
4ed46869
KH
315
316extern Lisp_Object Qinsert_file_contents, Qwrite_region;
317Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
318Lisp_Object Qstart_process, Qopen_network_stream;
319Lisp_Object Qtarget_idx;
320
065e3595
KH
321Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
322Lisp_Object Qinterrupted, Qinsufficient_memory;
323
5d5bf4d8
KH
324int coding_system_require_warning;
325
d46c5b12
KH
326Lisp_Object Vselect_safe_coding_system_function;
327
7722baf9
EZ
328/* Mnemonic string for each format of end-of-line. */
329Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
330/* Mnemonic string to indicate format of end-of-line is not yet
4ed46869 331 decided. */
7722baf9 332Lisp_Object eol_mnemonic_undecided;
4ed46869
KH
333
334#ifdef emacs
335
4608c386
KH
336Lisp_Object Vcoding_system_list, Vcoding_system_alist;
337
338Lisp_Object Qcoding_system_p, Qcoding_system_error;
4ed46869 339
d46c5b12
KH
340/* Coding system emacs-mule and raw-text are for converting only
341 end-of-line format. */
342Lisp_Object Qemacs_mule, Qraw_text;
8f924df7 343Lisp_Object Qutf_8_emacs;
ecf488bc 344
4ed46869
KH
345/* Coding-systems are handed between Emacs Lisp programs and C internal
346 routines by the following three variables. */
347/* Coding-system for reading files and receiving data from process. */
348Lisp_Object Vcoding_system_for_read;
349/* Coding-system for writing files and sending data to process. */
350Lisp_Object Vcoding_system_for_write;
351/* Coding-system actually used in the latest I/O. */
352Lisp_Object Vlast_coding_system_used;
065e3595
KH
353/* Set to non-nil when an error is detected while code conversion. */
354Lisp_Object Vlast_code_conversion_error;
c4825358 355/* A vector of length 256 which contains information about special
94487c4e 356 Latin codes (especially for dealing with Microsoft codes). */
3f003981 357Lisp_Object Vlatin_extra_code_table;
c4825358 358
9ce27fde
KH
359/* Flag to inhibit code conversion of end-of-line format. */
360int inhibit_eol_conversion;
361
74383408
KH
362/* Flag to inhibit ISO2022 escape sequence detection. */
363int inhibit_iso_escape_detection;
364
ed29121d
EZ
365/* Flag to make buffer-file-coding-system inherit from process-coding. */
366int inherit_process_coding_system;
367
c4825358 368/* Coding system to be used to encode text for terminal display. */
4ed46869
KH
369struct coding_system terminal_coding;
370
c4825358
KH
371/* Coding system to be used to encode text for terminal display when
372 terminal coding system is nil. */
373struct coding_system safe_terminal_coding;
374
375/* Coding system of what is sent from terminal keyboard. */
4ed46869
KH
376struct coding_system keyboard_coding;
377
02ba4723
KH
378Lisp_Object Vfile_coding_system_alist;
379Lisp_Object Vprocess_coding_system_alist;
380Lisp_Object Vnetwork_coding_system_alist;
4ed46869 381
68c45bf0
PE
382Lisp_Object Vlocale_coding_system;
383
4ed46869
KH
384#endif /* emacs */
385
f967223b
KH
386/* Flag to tell if we look up translation table on character code
387 conversion. */
84fbb8a0 388Lisp_Object Venable_character_translation;
f967223b
KH
389/* Standard translation table to look up on decoding (reading). */
390Lisp_Object Vstandard_translation_table_for_decode;
391/* Standard translation table to look up on encoding (writing). */
392Lisp_Object Vstandard_translation_table_for_encode;
84fbb8a0 393
f967223b
KH
394Lisp_Object Qtranslation_table;
395Lisp_Object Qtranslation_table_id;
396Lisp_Object Qtranslation_table_for_decode;
397Lisp_Object Qtranslation_table_for_encode;
4ed46869
KH
398
399/* Alist of charsets vs revision number. */
df7492f9 400static Lisp_Object Vcharset_revision_table;
4ed46869 401
02ba4723
KH
402/* Default coding systems used for process I/O. */
403Lisp_Object Vdefault_process_coding_system;
404
002fdb44
DL
405/* Char table for translating Quail and self-inserting input. */
406Lisp_Object Vtranslation_table_for_input;
407
df7492f9
KH
408/* Two special coding systems. */
409Lisp_Object Vsjis_coding_system;
410Lisp_Object Vbig5_coding_system;
411
412
065e3595
KH
413static void record_conversion_result (struct coding_system *coding,
414 enum coding_result_code result);
ff0dacd7
KH
415static int detect_coding_utf_8 P_ ((struct coding_system *,
416 struct coding_detection_info *info));
df7492f9
KH
417static void decode_coding_utf_8 P_ ((struct coding_system *));
418static int encode_coding_utf_8 P_ ((struct coding_system *));
419
ff0dacd7
KH
420static int detect_coding_utf_16 P_ ((struct coding_system *,
421 struct coding_detection_info *info));
df7492f9
KH
422static void decode_coding_utf_16 P_ ((struct coding_system *));
423static int encode_coding_utf_16 P_ ((struct coding_system *));
424
ff0dacd7
KH
425static int detect_coding_iso_2022 P_ ((struct coding_system *,
426 struct coding_detection_info *info));
df7492f9
KH
427static void decode_coding_iso_2022 P_ ((struct coding_system *));
428static int encode_coding_iso_2022 P_ ((struct coding_system *));
429
ff0dacd7
KH
430static int detect_coding_emacs_mule P_ ((struct coding_system *,
431 struct coding_detection_info *info));
df7492f9
KH
432static void decode_coding_emacs_mule P_ ((struct coding_system *));
433static int encode_coding_emacs_mule P_ ((struct coding_system *));
434
ff0dacd7
KH
435static int detect_coding_sjis P_ ((struct coding_system *,
436 struct coding_detection_info *info));
df7492f9
KH
437static void decode_coding_sjis P_ ((struct coding_system *));
438static int encode_coding_sjis P_ ((struct coding_system *));
439
ff0dacd7
KH
440static int detect_coding_big5 P_ ((struct coding_system *,
441 struct coding_detection_info *info));
df7492f9
KH
442static void decode_coding_big5 P_ ((struct coding_system *));
443static int encode_coding_big5 P_ ((struct coding_system *));
444
ff0dacd7
KH
445static int detect_coding_ccl P_ ((struct coding_system *,
446 struct coding_detection_info *info));
df7492f9
KH
447static void decode_coding_ccl P_ ((struct coding_system *));
448static int encode_coding_ccl P_ ((struct coding_system *));
449
450static void decode_coding_raw_text P_ ((struct coding_system *));
451static int encode_coding_raw_text P_ ((struct coding_system *));
452
453
454/* ISO2022 section */
455
456#define CODING_ISO_INITIAL(coding, reg) \
457 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
458 coding_attr_iso_initial), \
459 reg)))
460
461
/* Return the entry for CHARSET_ID in CODING's safe_charsets table, or
   -1 if CHARSET_ID is greater than CODING->max_charset_id.
   CHARSET_ID is evaluated twice, so it must have no side effects.  */
#define CODING_ISO_REQUEST(coding, charset_id)          \
  (((charset_id) <= (coding)->max_charset_id            \
    ? (coding)->safe_charsets[(charset_id)]             \
    : -1))
466
467
468#define CODING_ISO_FLAGS(coding) \
469 ((coding)->spec.iso_2022.flags)
470#define CODING_ISO_DESIGNATION(coding, reg) \
471 ((coding)->spec.iso_2022.current_designation[reg])
472#define CODING_ISO_INVOCATION(coding, plane) \
473 ((coding)->spec.iso_2022.current_invocation[plane])
474#define CODING_ISO_SINGLE_SHIFTING(coding) \
475 ((coding)->spec.iso_2022.single_shifting)
476#define CODING_ISO_BOL(coding) \
477 ((coding)->spec.iso_2022.bol)
478#define CODING_ISO_INVOKED_CHARSET(coding, plane) \
479 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
480
481/* Control characters of ISO2022. */
482 /* code */ /* function */
483#define ISO_CODE_LF 0x0A /* line-feed */
484#define ISO_CODE_CR 0x0D /* carriage-return */
485#define ISO_CODE_SO 0x0E /* shift-out */
486#define ISO_CODE_SI 0x0F /* shift-in */
487#define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
488#define ISO_CODE_ESC 0x1B /* escape */
489#define ISO_CODE_SS2 0x8E /* single-shift-2 */
490#define ISO_CODE_SS3 0x8F /* single-shift-3 */
491#define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
492
493/* All code (1-byte) of ISO2022 is classified into one of the
494 followings. */
495enum iso_code_class_type
496 {
497 ISO_control_0, /* Control codes in the range
498 0x00..0x1F and 0x7F, except for the
499 following 4 codes. */
df7492f9
KH
500 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
501 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
502 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
503 ISO_escape, /* ISO_CODE_ESC (0x1B) */
504 ISO_control_1, /* Control codes in the range
505 0x80..0x9F, except for the
506 following 3 codes. */
507 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
508 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
509 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
510 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
511 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
512 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
513 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
514 };
05e6f5dc 515
df7492f9
KH
516/** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
517 `iso-flags' attribute of an iso2022 coding system. */
05e6f5dc 518
df7492f9
KH
519/* If set, produce long-form designation sequence (e.g. ESC $ ( A)
520 instead of the correct short-form sequence (e.g. ESC $ A). */
521#define CODING_ISO_FLAG_LONG_FORM 0x0001
93dec019 522
df7492f9
KH
523/* If set, reset graphic planes and registers at end-of-line to the
524 initial state. */
525#define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
05e6f5dc 526
df7492f9
KH
527/* If set, reset graphic planes and registers before any control
528 characters to the initial state. */
529#define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
05e6f5dc 530
df7492f9
KH
531/* If set, encode by 7-bit environment. */
532#define CODING_ISO_FLAG_SEVEN_BITS 0x0008
4ed46869 533
df7492f9
KH
534/* If set, use locking-shift function. */
535#define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
b73bfc1c 536
df7492f9
KH
537/* If set, use single-shift function. Overwrite
538 CODING_ISO_FLAG_LOCKING_SHIFT. */
539#define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
b73bfc1c 540
df7492f9
KH
541/* If set, use designation escape sequence. */
542#define CODING_ISO_FLAG_DESIGNATION 0x0040
b73bfc1c 543
df7492f9
KH
544/* If set, produce revision number sequence. */
545#define CODING_ISO_FLAG_REVISION 0x0080
b73bfc1c 546
df7492f9
KH
547/* If set, produce ISO6429's direction specifying sequence. */
548#define CODING_ISO_FLAG_DIRECTION 0x0100
f4dee582 549
df7492f9
KH
550/* If set, assume designation states are reset at beginning of line on
551 output. */
552#define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
4ed46869 553
df7492f9
KH
554/* If set, designation sequence should be placed at beginning of line
555 on output. */
556#define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
aa72b389 557
df7492f9
KH
558/* If set, do not encode unsafe characters on output. */
559#define CODING_ISO_FLAG_SAFE 0x0800
aa72b389 560
df7492f9
KH
561/* If set, extra latin codes (128..159) are accepted as a valid code
562 on input. */
563#define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
aa72b389 564
df7492f9 565#define CODING_ISO_FLAG_COMPOSITION 0x2000
aa72b389 566
df7492f9 567#define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
aa72b389 568
bf16eb23 569#define CODING_ISO_FLAG_USE_ROMAN 0x8000
aa72b389 570
bf16eb23 571#define CODING_ISO_FLAG_USE_OLDJIS 0x10000
aa72b389 572
bf16eb23 573#define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
aa72b389 574
df7492f9
KH
575/* A character to be produced on output if encoding of the original
576 character is prohibited by CODING_ISO_FLAG_SAFE. */
577#define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
aa72b389 578
4ed46869 579
df7492f9
KH
580/* UTF-16 section */
581#define CODING_UTF_16_BOM(coding) \
582 ((coding)->spec.utf_16.bom)
4ed46869 583
df7492f9
KH
584#define CODING_UTF_16_ENDIAN(coding) \
585 ((coding)->spec.utf_16.endian)
4ed46869 586
df7492f9
KH
587#define CODING_UTF_16_SURROGATE(coding) \
588 ((coding)->spec.utf_16.surrogate)
4ed46869 589
4ed46869 590
df7492f9
KH
591/* CCL section */
592#define CODING_CCL_DECODER(coding) \
593 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
594#define CODING_CCL_ENCODER(coding) \
595 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
596#define CODING_CCL_VALIDS(coding) \
8f924df7 597 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
4ed46869 598
5a936b46 599/* Index for each coding category in `coding_categories' */
4ed46869 600
df7492f9
KH
601enum coding_category
602 {
603 coding_category_iso_7,
604 coding_category_iso_7_tight,
605 coding_category_iso_8_1,
606 coding_category_iso_8_2,
607 coding_category_iso_7_else,
608 coding_category_iso_8_else,
609 coding_category_utf_8,
610 coding_category_utf_16_auto,
611 coding_category_utf_16_be,
612 coding_category_utf_16_le,
613 coding_category_utf_16_be_nosig,
614 coding_category_utf_16_le_nosig,
615 coding_category_charset,
616 coding_category_sjis,
617 coding_category_big5,
618 coding_category_ccl,
619 coding_category_emacs_mule,
620 /* All above are targets of code detection. */
621 coding_category_raw_text,
622 coding_category_undecided,
623 coding_category_max
624 };
625
626/* Definitions of flag bits used in detect_coding_XXXX. */
627#define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
628#define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
629#define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
630#define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
631#define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
632#define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
633#define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
b49a1807 634#define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
df7492f9
KH
635#define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
636#define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
637#define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
638#define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
639#define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
640#define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
641#define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
642#define CATEGORY_MASK_CCL (1 << coding_category_ccl)
643#define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
ff0dacd7 644#define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
df7492f9
KH
645
646/* This value is returned if detect_coding_mask () finds nothing other
647 than ASCII characters. */
648#define CATEGORY_MASK_ANY \
649 (CATEGORY_MASK_ISO_7 \
650 | CATEGORY_MASK_ISO_7_TIGHT \
651 | CATEGORY_MASK_ISO_8_1 \
652 | CATEGORY_MASK_ISO_8_2 \
653 | CATEGORY_MASK_ISO_7_ELSE \
654 | CATEGORY_MASK_ISO_8_ELSE \
655 | CATEGORY_MASK_UTF_8 \
656 | CATEGORY_MASK_UTF_16_BE \
657 | CATEGORY_MASK_UTF_16_LE \
658 | CATEGORY_MASK_UTF_16_BE_NOSIG \
659 | CATEGORY_MASK_UTF_16_LE_NOSIG \
660 | CATEGORY_MASK_CHARSET \
661 | CATEGORY_MASK_SJIS \
662 | CATEGORY_MASK_BIG5 \
663 | CATEGORY_MASK_CCL \
664 | CATEGORY_MASK_EMACS_MULE)
665
666
667#define CATEGORY_MASK_ISO_7BIT \
668 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
669
670#define CATEGORY_MASK_ISO_8BIT \
671 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
672
673#define CATEGORY_MASK_ISO_ELSE \
674 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
675
676#define CATEGORY_MASK_ISO_ESCAPE \
677 (CATEGORY_MASK_ISO_7 \
678 | CATEGORY_MASK_ISO_7_TIGHT \
679 | CATEGORY_MASK_ISO_7_ELSE \
680 | CATEGORY_MASK_ISO_8_ELSE)
681
682#define CATEGORY_MASK_ISO \
683 ( CATEGORY_MASK_ISO_7BIT \
684 | CATEGORY_MASK_ISO_8BIT \
685 | CATEGORY_MASK_ISO_ELSE)
686
687#define CATEGORY_MASK_UTF_16 \
688 (CATEGORY_MASK_UTF_16_BE \
689 | CATEGORY_MASK_UTF_16_LE \
690 | CATEGORY_MASK_UTF_16_BE_NOSIG \
691 | CATEGORY_MASK_UTF_16_LE_NOSIG)
692
693
694/* List of symbols `coding-category-xxx' ordered by priority. This
695 variable is exposed to Emacs Lisp. */
696static Lisp_Object Vcoding_category_list;
697
698/* Table of coding categories (Lisp symbols). This variable is for
699 internal use only. */
700static Lisp_Object Vcoding_category_table;
701
702/* Table of coding-categories ordered by priority. */
703static enum coding_category coding_priorities[coding_category_max];
704
705/* Nth element is a coding context for the coding system bound to the
706 Nth coding category. */
707static struct coding_system coding_categories[coding_category_max];
708
df7492f9
KH
709/*** Commonly used macros and functions ***/
710
711#ifndef min
712#define min(a, b) ((a) < (b) ? (a) : (b))
713#endif
714#ifndef max
715#define max(a, b) ((a) > (b) ? (a) : (b))
716#endif
4ed46869 717
24a73b0a
KH
718#define CODING_GET_INFO(coding, attrs, charset_list) \
719 do { \
720 (attrs) = CODING_ID_ATTRS ((coding)->id); \
721 (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); \
df7492f9 722 } while (0)
4ed46869 723
4ed46869 724
df7492f9
KH
725/* Safely get one byte from the source text pointed by SRC which ends
726 at SRC_END, and set C to that byte. If there are not enough bytes
065e3595
KH
727 in the source, it jumps to `no_more_source'. If multibytep is
728 nonzero, and a multibyte character is found at SRC, set C to the
729 negative value of the character code. The caller should declare
730 and set these variables appropriately in advance:
731 src, src_end, src_base, multibytep, coding, consumed_chars */
aa72b389 732
065e3595
KH
/* Fetch the next source byte (or multibyte character) into C.  When
   SRC has reached SRC_END, jump to `no_more_source' instead; in that
   case CODING_RESULT_INSUFFICIENT_SRC is recorded first if bytes were
   already taken since SRC_BASE.  The fetch itself is the same as
   ONE_MORE_BYTE_NO_CHECK.  */
#define ONE_MORE_BYTE(c)                                        \
  do {                                                          \
    if (src == src_end)                                         \
      {                                                         \
        if (src_base < src)                                     \
          record_conversion_result                              \
            (coding, CODING_RESULT_INSUFFICIENT_SRC);           \
        goto no_more_source;                                    \
      }                                                         \
    ONE_MORE_BYTE_NO_CHECK (c);                                 \
  } while (0)
756
aa72b389 757
065e3595
KH
/* Like ONE_MORE_BYTE, but without testing SRC against SRC_END first.
   The caller must declare and set `src', `multibytep', `coding' and
   `consumed_chars' in advance.  */
#define ONE_MORE_BYTE_NO_CHECK(c)                               \
  do {                                                          \
    c = *src++;                                                 \
    if (multibytep && (c & 0x80))                               \
      {                                                         \
        if ((c & 0xFE) != 0xC0)                                 \
          {                                                     \
            /* Some other multibyte character: C becomes the    \
               negated value returned by string_char, and the   \
               source is recorded as invalid.  */               \
            c = - string_char (--src, &src, NULL);              \
            record_conversion_result                            \
              (coding, CODING_RESULT_INVALID_SRC);              \
          }                                                     \
        else                                                    \
          /* Lead byte 0xC0 or 0xC1: combine its low bit with   \
             the next byte (presumably the two-byte form of a   \
             raw 8-bit byte -- TODO confirm).  */               \
          c = ((c & 1) << 6) | *src++;                          \
      }                                                         \
    consumed_chars++;                                           \
  } while (0)
774
aa72b389 775
df7492f9
KH
776/* Store a byte C in the place pointed by DST and increment DST to the
777 next free point, and increment PRODUCED_CHARS. The caller should
778 assure that C is 0..127, and declare and set the variable `dst'
779 appropriately in advance.
780*/
aa72b389
KH
781
782
df7492f9
KH
783#define EMIT_ONE_ASCII_BYTE(c) \
784 do { \
785 produced_chars++; \
786 *dst++ = (c); \
b6871cc7 787 } while (0)
aa72b389
KH
788
789
df7492f9 790/* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2. */
aa72b389 791
df7492f9
KH
792#define EMIT_TWO_ASCII_BYTES(c1, c2) \
793 do { \
794 produced_chars += 2; \
795 *dst++ = (c1), *dst++ = (c2); \
796 } while (0)
aa72b389
KH
797
798
df7492f9
KH
799/* Store a byte C in the place pointed by DST and increment DST to the
800 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
801 nonzero, store in an appropriate multibyte form. The caller should
802 declare and set the variables `dst' and `multibytep' appropriately
803 in advance. */
804
/* Emit the byte C at DST, advance DST and count one produced
   character.  With MULTIBYTEP zero the byte is stored as is;
   otherwise a value of 0x80 or more is first mapped through
   BYTE8_TO_CHAR and the result is written with CHAR_STRING_ADVANCE.  */
#define EMIT_ONE_BYTE(c)                        \
  do {                                          \
    produced_chars++;                           \
    if (! multibytep)                           \
      *dst++ = (c);                             \
    else                                        \
      {                                         \
        int ch = (c);                           \
        if (ch >= 0x80)                         \
          ch = BYTE8_TO_CHAR (ch);              \
        CHAR_STRING_ADVANCE (ch, dst);          \
      }                                         \
  } while (0)
aa72b389 818
aa72b389 819
df7492f9 820/* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
aa72b389 821
e19c3639
KH
/* Emit the bytes C1 and then C2, each exactly as EMIT_ONE_BYTE does;
   PRODUCED_CHARS grows by two in total.  */
#define EMIT_TWO_BYTES(c1, c2)  \
  do {                          \
    EMIT_ONE_BYTE (c1);         \
    EMIT_ONE_BYTE (c2);         \
  } while (0)
844
845
df7492f9
KH
/* Emit the three bytes C1, C2, C3 in that order, using the same
   rules as EMIT_ONE_BYTE for each of them.  */

#define EMIT_THREE_BYTES(c1, c2, c3)    \
  do {                                  \
    EMIT_TWO_BYTES (c1, c2);            \
    EMIT_ONE_BYTE (c3);                 \
  } while (0)


/* Emit the four bytes C1, C2, C3, C4 in that order, as two
   consecutive byte pairs.  */

#define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
  do {                                          \
    EMIT_TWO_BYTES (c1, c2);                    \
    EMIT_TWO_BYTES (c3, c4);                    \
  } while (0)
aa72b389 858
aa72b389 859
065e3595
KH
860static void
861record_conversion_result (struct coding_system *coding,
862 enum coding_result_code result)
863{
864 coding->result = result;
865 switch (result)
866 {
867 case CODING_RESULT_INSUFFICIENT_SRC:
868 Vlast_code_conversion_error = Qinsufficient_source;
869 break;
870 case CODING_RESULT_INCONSISTENT_EOL:
871 Vlast_code_conversion_error = Qinconsistent_eol;
872 break;
873 case CODING_RESULT_INVALID_SRC:
874 Vlast_code_conversion_error = Qinvalid_source;
875 break;
876 case CODING_RESULT_INTERRUPT:
877 Vlast_code_conversion_error = Qinterrupted;
878 break;
879 case CODING_RESULT_INSUFFICIENT_MEM:
880 Vlast_code_conversion_error = Qinsufficient_memory;
881 break;
882 }
883}
884
df7492f9
KH
/* Set C to the character that CODE denotes in CHARSET, using
   DECODE_CHAR.  If DECODE_CHAR sets `charset_map_loaded', refresh
   CODING->source with coding_set_source and shift SRC, SRC_BASE and
   SRC_END by the distance CODING->source moved, so that they keep
   pointing at the same text.  (Presumably loading a charset map can
   relocate the source text -- confirm against DECODE_CHAR.)  */

#define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
  do {                                                                  \
    charset_map_loaded = 0;                                             \
    c = DECODE_CHAR (charset, code);                                    \
    if (charset_map_loaded)                                             \
      {                                                                 \
        const unsigned char *source_before = coding->source;            \
        EMACS_INT source_shift;                                         \
                                                                        \
        coding_set_source (coding);                                     \
        source_shift = coding->source - source_before;                  \
        src += source_shift;                                            \
        src_base += source_shift;                                       \
        src_end += source_shift;                                        \
      }                                                                 \
  } while (0)
901
902
df7492f9
KH
/* Make sure that BYTES more bytes can be stored at DST.  If DST +
   BYTES reaches DST_END, grow the destination by BYTES plus the
   number of elements still pending between CHARBUF and CHARBUF_END,
   then reload DST and DST_END from the enlarged destination.  The
   caller must have `dst', `dst_end', `charbuf', `charbuf_end' and
   `coding' in scope.  */

#define ASSURE_DESTINATION(bytes)                               \
  do {                                                          \
    if (dst_end <= dst + (bytes))                               \
      {                                                         \
        int more_bytes = charbuf_end - charbuf + (bytes);       \
                                                                \
        dst = alloc_destination (coding, more_bytes, dst);      \
        dst_end = coding->destination + coding->dst_bytes;      \
      }                                                         \
  } while (0)
aa72b389 913
aa72b389 914
aa72b389 915
df7492f9
KH
916static void
917coding_set_source (coding)
aa72b389 918 struct coding_system *coding;
aa72b389 919{
df7492f9
KH
920 if (BUFFERP (coding->src_object))
921 {
2cb26057 922 struct buffer *buf = XBUFFER (coding->src_object);
aa72b389 923
df7492f9 924 if (coding->src_pos < 0)
2cb26057 925 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
df7492f9 926 else
2cb26057 927 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
aa72b389 928 }
df7492f9 929 else if (STRINGP (coding->src_object))
aa72b389 930 {
8f924df7 931 coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
aa72b389 932 }
df7492f9
KH
933 else
934 /* Otherwise, the source is C string and is never relocated
935 automatically. Thus we don't have to update anything. */
936 ;
937}
aa72b389 938
df7492f9
KH
939static void
940coding_set_destination (coding)
941 struct coding_system *coding;
942{
943 if (BUFFERP (coding->dst_object))
aa72b389 944 {
df7492f9 945 if (coding->src_pos < 0)
aa72b389 946 {
28f67a95
KH
947 coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
948 coding->dst_bytes = (GAP_END_ADDR
949 - (coding->src_bytes - coding->consumed)
950 - coding->destination);
aa72b389 951 }
df7492f9 952 else
28f67a95
KH
953 {
954 /* We are sure that coding->dst_pos_byte is before the gap
955 of the buffer. */
956 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
957 + coding->dst_pos_byte - 1);
958 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
959 - coding->destination);
960 }
df7492f9
KH
961 }
962 else
963 /* Otherwise, the destination is C string and is never relocated
964 automatically. Thus we don't have to update anything. */
965 ;
966}
967
968
969static void
970coding_alloc_by_realloc (coding, bytes)
971 struct coding_system *coding;
972 EMACS_INT bytes;
973{
974 coding->destination = (unsigned char *) xrealloc (coding->destination,
975 coding->dst_bytes + bytes);
976 coding->dst_bytes += bytes;
977}
978
979static void
980coding_alloc_by_making_gap (coding, bytes)
981 struct coding_system *coding;
982 EMACS_INT bytes;
983{
2c78b7e1
KH
984 if (BUFFERP (coding->dst_object)
985 && EQ (coding->src_object, coding->dst_object))
df7492f9
KH
986 {
987 EMACS_INT add = coding->src_bytes - coding->consumed;
988
989 GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
990 make_gap (bytes);
991 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
992 }
993 else
994 {
2c78b7e1
KH
995 Lisp_Object this_buffer;
996
997 this_buffer = Fcurrent_buffer ();
df7492f9
KH
998 set_buffer_internal (XBUFFER (coding->dst_object));
999 make_gap (bytes);
1000 set_buffer_internal (XBUFFER (this_buffer));
aa72b389 1001 }
df7492f9 1002}
8f924df7 1003
df7492f9
KH
1004
1005static unsigned char *
1006alloc_destination (coding, nbytes, dst)
1007 struct coding_system *coding;
3e139625 1008 EMACS_INT nbytes;
df7492f9
KH
1009 unsigned char *dst;
1010{
1011 EMACS_INT offset = dst - coding->destination;
1012
1013 if (BUFFERP (coding->dst_object))
1014 coding_alloc_by_making_gap (coding, nbytes);
aa72b389 1015 else
df7492f9 1016 coding_alloc_by_realloc (coding, nbytes);
065e3595 1017 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
1018 coding_set_destination (coding);
1019 dst = coding->destination + offset;
1020 return dst;
1021}
aa72b389 1022
ff0dacd7
KH
1023/** Macros for annotations. */
1024
/* Maximum length of annotation data: the sum of one composition
   annotation and one charset annotation.  (Presumably 5 elements of
   composition header, up to 2 * MAX_COMPOSITION_COMPONENTS - 1
   components and rules, and 5 elements of charset annotation.)  */
#define MAX_ANNOTATION_LENGTH \
  ((5 + ((MAX_COMPOSITION_COMPONENTS) * 2 - 1)) + 5)
1028
1029/* An annotation data is stored in the array coding->charbuf in this
1030 format:
1031 [ -LENGTH ANNOTATION_MASK FROM TO ... ]
1032 LENGTH is the number of elements in the annotation.
1033 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1034 FROM and TO specify the range of text annotated. They are relative
1035 to coding->src_pos (on encoding) or coding->dst_pos (on decoding).
1036
1037 The format of the following elements depend on ANNOTATION_MASK.
1038
1039 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1040 follows:
1041 ... METHOD [ COMPOSITION-COMPONENTS ... ]
1042 METHOD is one of enum composition_method.
   Optional COMPOSITION-COMPONENTS are characters and composition
1044 rules.
1045
1046 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1047 follows. */
1048
/* Append the four-element annotation header
     [ -LEN MASK FROM TO ]
   at BUF, advance BUF past it, and mark CODING as annotated.  The
   caller must have `coding' in scope.

   The expansion is a single statement WITHOUT a trailing semicolon,
   so the macro can be used safely as the body of an `if' that has an
   `else'.  Call sites supply the terminating semicolon.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, from, to)   \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (from);                                  \
    *(buf)++ = (to);                                    \
    coding->annotated = 1;                              \
  } while (0)

/* Append a composition annotation for the text range FROM..TO: the
   common header with length 5, followed by the composition METHOD.  */

#define ADD_COMPOSITION_DATA(buf, from, to, method)                           \
  do {                                                                        \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, from, to); \
    *(buf)++ = (method);                                                      \
  } while (0)


/* Append a charset annotation for the text range FROM..TO: the
   common header with length 5, followed by the charset ID.  */

#define ADD_CHARSET_DATA(buf, from, to, id)                                   \
  do {                                                                        \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_CHARSET_MASK, from, to);     \
    *(buf)++ = (id);                                                          \
  } while (0)
1070
df7492f9
KH
1071\f
1072/*** 2. Emacs' internal format (emacs-utf-8) ***/
1073
1074
1075
1076\f
1077/*** 3. UTF-8 ***/
1078
1079/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
1080 Check if a text is encoded in UTF-8. If it is, return 1, else
1081 return 0. */
df7492f9
KH
1082
/* Classification of a single octet C of a UTF-8 sequence by its
   high-order bits.  */

/* C is below 0x80, i.e. a complete one-octet sequence.  */
#define UTF_8_1_OCTET_P(c)              ((c) < 0x80)
/* Bit pattern 10xxxxxx: a trailing (continuation) octet.  */
#define UTF_8_EXTRA_OCTET_P(c)          (((c) & 0xC0) == 0x80)
/* Bit pattern 110xxxxx: leads a two-octet sequence.  */
#define UTF_8_2_OCTET_LEADING_P(c)      (((c) & 0xE0) == 0xC0)
/* Bit pattern 1110xxxx: leads a three-octet sequence.  */
#define UTF_8_3_OCTET_LEADING_P(c)      (((c) & 0xF0) == 0xE0)
/* Bit pattern 11110xxx: leads a four-octet sequence.  */
#define UTF_8_4_OCTET_LEADING_P(c)      (((c) & 0xF8) == 0xF0)
/* Bit pattern 111110xx: leads a five-octet sequence.  */
#define UTF_8_5_OCTET_LEADING_P(c)      (((c) & 0xFC) == 0xF8)
1089
1090static int
ff0dacd7 1091detect_coding_utf_8 (coding, detect_info)
df7492f9 1092 struct coding_system *coding;
ff0dacd7 1093 struct coding_detection_info *detect_info;
df7492f9 1094{
065e3595 1095 const unsigned char *src = coding->source, *src_base;
8f924df7 1096 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
1097 int multibytep = coding->src_multibyte;
1098 int consumed_chars = 0;
1099 int found = 0;
1100
ff0dacd7 1101 detect_info->checked |= CATEGORY_MASK_UTF_8;
df7492f9
KH
1102 /* A coding system of this category is always ASCII compatible. */
1103 src += coding->head_ascii;
1104
1105 while (1)
aa72b389 1106 {
df7492f9 1107 int c, c1, c2, c3, c4;
aa72b389 1108
065e3595 1109 src_base = src;
df7492f9 1110 ONE_MORE_BYTE (c);
065e3595 1111 if (c < 0 || UTF_8_1_OCTET_P (c))
df7492f9
KH
1112 continue;
1113 ONE_MORE_BYTE (c1);
065e3595 1114 if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
df7492f9
KH
1115 break;
1116 if (UTF_8_2_OCTET_LEADING_P (c))
aa72b389 1117 {
ff0dacd7 1118 found = CATEGORY_MASK_UTF_8;
df7492f9 1119 continue;
aa72b389 1120 }
df7492f9 1121 ONE_MORE_BYTE (c2);
065e3595 1122 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
df7492f9
KH
1123 break;
1124 if (UTF_8_3_OCTET_LEADING_P (c))
aa72b389 1125 {
ff0dacd7 1126 found = CATEGORY_MASK_UTF_8;
df7492f9 1127 continue;
aa72b389 1128 }
df7492f9 1129 ONE_MORE_BYTE (c3);
065e3595 1130 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
df7492f9
KH
1131 break;
1132 if (UTF_8_4_OCTET_LEADING_P (c))
aa72b389 1133 {
ff0dacd7 1134 found = CATEGORY_MASK_UTF_8;
df7492f9
KH
1135 continue;
1136 }
1137 ONE_MORE_BYTE (c4);
065e3595 1138 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
df7492f9
KH
1139 break;
1140 if (UTF_8_5_OCTET_LEADING_P (c))
1141 {
ff0dacd7 1142 found = CATEGORY_MASK_UTF_8;
df7492f9
KH
1143 continue;
1144 }
1145 break;
aa72b389 1146 }
ff0dacd7 1147 detect_info->rejected |= CATEGORY_MASK_UTF_8;
df7492f9 1148 return 0;
aa72b389 1149
df7492f9 1150 no_more_source:
065e3595 1151 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
aa72b389 1152 {
ff0dacd7 1153 detect_info->rejected |= CATEGORY_MASK_UTF_8;
89528eb3 1154 return 0;
aa72b389 1155 }
ff0dacd7
KH
1156 detect_info->found |= found;
1157 return 1;
aa72b389
KH
1158}
1159
4ed46869 1160
b73bfc1c 1161static void
df7492f9 1162decode_coding_utf_8 (coding)
b73bfc1c 1163 struct coding_system *coding;
b73bfc1c 1164{
8f924df7
KH
1165 const unsigned char *src = coding->source + coding->consumed;
1166 const unsigned char *src_end = coding->source + coding->src_bytes;
1167 const unsigned char *src_base;
df7492f9
KH
1168 int *charbuf = coding->charbuf;
1169 int *charbuf_end = charbuf + coding->charbuf_size;
1170 int consumed_chars = 0, consumed_chars_base;
1171 int multibytep = coding->src_multibyte;
24a73b0a 1172 Lisp_Object attr, charset_list;
4ed46869 1173
24a73b0a 1174 CODING_GET_INFO (coding, attr, charset_list);
df7492f9
KH
1175
1176 while (1)
b73bfc1c 1177 {
df7492f9 1178 int c, c1, c2, c3, c4, c5;
ec6d2bb8 1179
df7492f9
KH
1180 src_base = src;
1181 consumed_chars_base = consumed_chars;
4af310db 1182
df7492f9
KH
1183 if (charbuf >= charbuf_end)
1184 break;
1185
1186 ONE_MORE_BYTE (c1);
065e3595
KH
1187 if (c1 < 0)
1188 {
1189 c = - c1;
1190 }
1191 else if (UTF_8_1_OCTET_P(c1))
df7492f9
KH
1192 {
1193 c = c1;
4af310db 1194 }
df7492f9 1195 else
4af310db 1196 {
df7492f9 1197 ONE_MORE_BYTE (c2);
065e3595 1198 if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
df7492f9
KH
1199 goto invalid_code;
1200 if (UTF_8_2_OCTET_LEADING_P (c1))
4af310db 1201 {
b0edb2c5
DL
1202 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1203 /* Reject overlong sequences here and below. Encoders
1204 producing them are incorrect, they can be misleading,
1205 and they mess up read/write invariance. */
1206 if (c < 128)
1207 goto invalid_code;
4af310db 1208 }
df7492f9 1209 else
aa72b389 1210 {
df7492f9 1211 ONE_MORE_BYTE (c3);
065e3595 1212 if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
df7492f9
KH
1213 goto invalid_code;
1214 if (UTF_8_3_OCTET_LEADING_P (c1))
b0edb2c5
DL
1215 {
1216 c = (((c1 & 0xF) << 12)
1217 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
72fe1301
DL
1218 if (c < 0x800
1219 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
b0edb2c5
DL
1220 goto invalid_code;
1221 }
df7492f9
KH
1222 else
1223 {
1224 ONE_MORE_BYTE (c4);
065e3595 1225 if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
df7492f9
KH
1226 goto invalid_code;
1227 if (UTF_8_4_OCTET_LEADING_P (c1))
b0edb2c5 1228 {
df7492f9
KH
1229 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1230 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
b0edb2c5
DL
1231 if (c < 0x10000)
1232 goto invalid_code;
1233 }
df7492f9
KH
1234 else
1235 {
1236 ONE_MORE_BYTE (c5);
065e3595 1237 if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
df7492f9
KH
1238 goto invalid_code;
1239 if (UTF_8_5_OCTET_LEADING_P (c1))
1240 {
1241 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1242 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1243 | (c5 & 0x3F));
b0edb2c5 1244 if ((c > MAX_CHAR) || (c < 0x200000))
df7492f9
KH
1245 goto invalid_code;
1246 }
1247 else
1248 goto invalid_code;
1249 }
1250 }
aa72b389 1251 }
b73bfc1c 1252 }
df7492f9
KH
1253
1254 *charbuf++ = c;
1255 continue;
1256
1257 invalid_code:
1258 src = src_base;
1259 consumed_chars = consumed_chars_base;
1260 ONE_MORE_BYTE (c);
1261 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1262 coding->errors++;
aa72b389
KH
1263 }
1264
df7492f9
KH
1265 no_more_source:
1266 coding->consumed_char += consumed_chars_base;
1267 coding->consumed = src_base - coding->source;
1268 coding->charbuf_used = charbuf - coding->charbuf;
1269}
1270
1271
1272static int
1273encode_coding_utf_8 (coding)
1274 struct coding_system *coding;
1275{
1276 int multibytep = coding->dst_multibyte;
1277 int *charbuf = coding->charbuf;
1278 int *charbuf_end = charbuf + coding->charbuf_used;
1279 unsigned char *dst = coding->destination + coding->produced;
1280 unsigned char *dst_end = coding->destination + coding->dst_bytes;
e19c3639 1281 int produced_chars = 0;
df7492f9
KH
1282 int c;
1283
1284 if (multibytep)
aa72b389 1285 {
df7492f9
KH
1286 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1287
1288 while (charbuf < charbuf_end)
b73bfc1c 1289 {
df7492f9 1290 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
8f924df7 1291
df7492f9
KH
1292 ASSURE_DESTINATION (safe_room);
1293 c = *charbuf++;
28f67a95
KH
1294 if (CHAR_BYTE8_P (c))
1295 {
1296 c = CHAR_TO_BYTE8 (c);
1297 EMIT_ONE_BYTE (c);
1298 }
1299 else
1300 {
1301 CHAR_STRING_ADVANCE (c, pend);
1302 for (p = str; p < pend; p++)
1303 EMIT_ONE_BYTE (*p);
1304 }
b73bfc1c 1305 }
aa72b389 1306 }
df7492f9
KH
1307 else
1308 {
1309 int safe_room = MAX_MULTIBYTE_LENGTH;
1310
1311 while (charbuf < charbuf_end)
b73bfc1c 1312 {
df7492f9
KH
1313 ASSURE_DESTINATION (safe_room);
1314 c = *charbuf++;
1315 dst += CHAR_STRING (c, dst);
1316 produced_chars++;
4ed46869
KH
1317 }
1318 }
065e3595 1319 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
1320 coding->produced_char += produced_chars;
1321 coding->produced = dst - coding->destination;
1322 return 0;
4ed46869
KH
1323}
1324
b73bfc1c 1325
df7492f9 1326/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
1327 Check if a text is encoded in one of UTF-16 based coding systems.
1328 If it is, return 1, else return 0. */
aa72b389 1329
df7492f9
KH
/* VAL is a UTF-16 high (leading) surrogate, 0xD800..0xDBFF.  */
#define UTF_16_HIGH_SURROGATE_P(val)    \
  (((val) & 0xFC00) == 0xD800)

/* VAL is a UTF-16 low (trailing) surrogate, 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val)     \
  (((val) & 0xFC00) == 0xDC00)

/* VAL cannot stand on its own as a 16-bit code unit: it is 0xFFFE,
   0xFFFF, or a low surrogate.  */
#define UTF_16_INVALID_P(val)           \
  (((val) == 0xFFFE)                    \
   || ((val) == 0xFFFF)                 \
   || UTF_16_LOW_SURROGATE_P (val))
aa72b389 1340
aa72b389 1341
df7492f9 1342static int
ff0dacd7 1343detect_coding_utf_16 (coding, detect_info)
aa72b389 1344 struct coding_system *coding;
ff0dacd7 1345 struct coding_detection_info *detect_info;
aa72b389 1346{
8f924df7
KH
1347 const unsigned char *src = coding->source, *src_base = src;
1348 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
1349 int multibytep = coding->src_multibyte;
1350 int consumed_chars = 0;
1351 int c1, c2;
aa72b389 1352
ff0dacd7 1353 detect_info->checked |= CATEGORY_MASK_UTF_16;
ff0dacd7 1354 if (coding->mode & CODING_MODE_LAST_BLOCK
24a73b0a 1355 && (coding->src_chars & 1))
ff0dacd7
KH
1356 {
1357 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1358 return 0;
1359 }
24a73b0a 1360
df7492f9
KH
1361 ONE_MORE_BYTE (c1);
1362 ONE_MORE_BYTE (c2);
df7492f9 1363 if ((c1 == 0xFF) && (c2 == 0xFE))
aa72b389 1364 {
b49a1807
KH
1365 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1366 | CATEGORY_MASK_UTF_16_AUTO);
24a73b0a
KH
1367 detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1368 | CATEGORY_MASK_UTF_16_BE_NOSIG
1369 | CATEGORY_MASK_UTF_16_LE_NOSIG);
aa72b389 1370 }
df7492f9 1371 else if ((c1 == 0xFE) && (c2 == 0xFF))
ff0dacd7 1372 {
b49a1807
KH
1373 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1374 | CATEGORY_MASK_UTF_16_AUTO);
24a73b0a
KH
1375 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1376 | CATEGORY_MASK_UTF_16_BE_NOSIG
1377 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1378 }
065e3595 1379 else if (c1 >= 0 && c2 >= 0)
24a73b0a
KH
1380 {
1381 unsigned char b1[256], b2[256];
1382 int b1_variants = 1, b2_variants = 1;
1383 int n;
1384
1385 bzero (b1, 256), bzero (b2, 256);
1386 b1[c1]++, b2[c2]++;
1387 for (n = 0; n < 256 && src < src_end; n++)
1388 {
065e3595 1389 src_base = src;
24a73b0a
KH
1390 ONE_MORE_BYTE (c1);
1391 ONE_MORE_BYTE (c2);
065e3595
KH
1392 if (c1 < 0 || c2 < 0)
1393 break;
24a73b0a
KH
1394 if (! b1[c1++]) b1_variants++;
1395 if (! b2[c2++]) b2_variants++;
1396 }
1397 if (b1_variants < b2_variants)
1398 detect_info->found |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1399 else
1400 detect_info->found |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1401 detect_info->rejected
1402 |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
ff0dacd7 1403 }
df7492f9 1404 no_more_source:
ff0dacd7 1405 return 1;
df7492f9 1406}
aa72b389 1407
df7492f9
KH
1408static void
1409decode_coding_utf_16 (coding)
1410 struct coding_system *coding;
1411{
8f924df7
KH
1412 const unsigned char *src = coding->source + coding->consumed;
1413 const unsigned char *src_end = coding->source + coding->src_bytes;
1414 const unsigned char *src_base;
df7492f9
KH
1415 int *charbuf = coding->charbuf;
1416 int *charbuf_end = charbuf + coding->charbuf_size;
1417 int consumed_chars = 0, consumed_chars_base;
1418 int multibytep = coding->src_multibyte;
1419 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1420 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1421 int surrogate = CODING_UTF_16_SURROGATE (coding);
24a73b0a 1422 Lisp_Object attr, charset_list;
df7492f9 1423
24a73b0a 1424 CODING_GET_INFO (coding, attr, charset_list);
df7492f9 1425
b49a1807 1426 if (bom == utf_16_with_bom)
aa72b389 1427 {
df7492f9 1428 int c, c1, c2;
4af310db 1429
aa72b389 1430 src_base = src;
df7492f9
KH
1431 ONE_MORE_BYTE (c1);
1432 ONE_MORE_BYTE (c2);
e19c3639 1433 c = (c1 << 8) | c2;
aa72b389 1434
b49a1807
KH
1435 if (endian == utf_16_big_endian
1436 ? c != 0xFEFF : c != 0xFFFE)
aa72b389 1437 {
b49a1807
KH
1438 /* The first two bytes are not BOM. Treat them as bytes
1439 for a normal character. */
1440 src = src_base;
1441 coding->errors++;
aa72b389 1442 }
b49a1807
KH
1443 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1444 }
1445 else if (bom == utf_16_detect_bom)
1446 {
1447 /* We have already tried to detect BOM and failed in
1448 detect_coding. */
1449 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
df7492f9 1450 }
aa72b389 1451
df7492f9
KH
1452 while (1)
1453 {
1454 int c, c1, c2;
1455
1456 src_base = src;
1457 consumed_chars_base = consumed_chars;
1458
1459 if (charbuf + 2 >= charbuf_end)
1460 break;
1461
1462 ONE_MORE_BYTE (c1);
065e3595
KH
1463 if (c1 < 0)
1464 {
1465 *charbuf++ = -c1;
1466 continue;
1467 }
df7492f9 1468 ONE_MORE_BYTE (c2);
065e3595
KH
1469 if (c2 < 0)
1470 {
1471 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1472 *charbuf++ = -c2;
1473 continue;
1474 }
df7492f9 1475 c = (endian == utf_16_big_endian
e19c3639 1476 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
df7492f9 1477 if (surrogate)
fd3ae0b9 1478 {
df7492f9 1479 if (! UTF_16_LOW_SURROGATE_P (c))
fd3ae0b9 1480 {
df7492f9
KH
1481 if (endian == utf_16_big_endian)
1482 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1483 else
1484 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1485 *charbuf++ = c1;
1486 *charbuf++ = c2;
1487 coding->errors++;
1488 if (UTF_16_HIGH_SURROGATE_P (c))
1489 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
fd3ae0b9 1490 else
df7492f9 1491 *charbuf++ = c;
fd3ae0b9
KH
1492 }
1493 else
df7492f9
KH
1494 {
1495 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1496 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1497 *charbuf++ = c;
1498 }
fd3ae0b9 1499 }
aa72b389 1500 else
df7492f9
KH
1501 {
1502 if (UTF_16_HIGH_SURROGATE_P (c))
1503 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1504 else
1505 *charbuf++ = c;
8f924df7 1506 }
aa72b389 1507 }
df7492f9
KH
1508
1509 no_more_source:
1510 coding->consumed_char += consumed_chars_base;
1511 coding->consumed = src_base - coding->source;
1512 coding->charbuf_used = charbuf - coding->charbuf;
aa72b389 1513}
b73bfc1c 1514
df7492f9
KH
1515static int
1516encode_coding_utf_16 (coding)
1517 struct coding_system *coding;
1518{
1519 int multibytep = coding->dst_multibyte;
1520 int *charbuf = coding->charbuf;
1521 int *charbuf_end = charbuf + coding->charbuf_used;
1522 unsigned char *dst = coding->destination + coding->produced;
1523 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1524 int safe_room = 8;
1525 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1526 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1527 int produced_chars = 0;
24a73b0a 1528 Lisp_Object attrs, charset_list;
df7492f9 1529 int c;
4ed46869 1530
24a73b0a 1531 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9 1532
b49a1807 1533 if (bom != utf_16_without_bom)
df7492f9
KH
1534 {
1535 ASSURE_DESTINATION (safe_room);
1536 if (big_endian)
df7492f9 1537 EMIT_TWO_BYTES (0xFE, 0xFF);
880cf180
KH
1538 else
1539 EMIT_TWO_BYTES (0xFF, 0xFE);
df7492f9
KH
1540 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1541 }
1542
1543 while (charbuf < charbuf_end)
1544 {
1545 ASSURE_DESTINATION (safe_room);
1546 c = *charbuf++;
e19c3639
KH
1547 if (c >= MAX_UNICODE_CHAR)
1548 c = coding->default_char;
df7492f9
KH
1549
1550 if (c < 0x10000)
1551 {
1552 if (big_endian)
1553 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1554 else
1555 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1556 }
1557 else
1558 {
1559 int c1, c2;
1560
1561 c -= 0x10000;
1562 c1 = (c >> 10) + 0xD800;
1563 c2 = (c & 0x3FF) + 0xDC00;
1564 if (big_endian)
1565 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1566 else
1567 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1568 }
1569 }
065e3595 1570 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
1571 coding->produced = dst - coding->destination;
1572 coding->produced_char += produced_chars;
1573 return 0;
1574}
1575
1576\f
1577/*** 6. Old Emacs' internal format (emacs-mule) ***/
1578
1579/* Emacs' internal format for representation of multiple character
1580 sets is a kind of multi-byte encoding, i.e. characters are
1581 represented by variable-length sequences of one-byte codes.
1582
1583 ASCII characters and control characters (e.g. `tab', `newline') are
1584 represented by one-byte sequences which are their ASCII codes, in
1585 the range 0x00 through 0x7F.
1586
1587 8-bit characters of the range 0x80..0x9F are represented by
1588 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1589 code + 0x20).
1590
1591 8-bit characters of the range 0xA0..0xFF are represented by
1592 one-byte sequences which are their 8-bit code.
1593
1594 The other characters are represented by a sequence of `base
1595 leading-code', optional `extended leading-code', and one or two
1596 `position-code's. The length of the sequence is determined by the
1597 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1598 whereas extended leading-code and position-code take the range 0xA0
1599 through 0xFF. See `charset.h' for more details about leading-code
1600 and position-code.
1601
1602 --- CODE RANGE of Emacs' internal format ---
1603 character set range
1604 ------------- -----
1605 ascii 0x00..0x7F
1606 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1607 eight-bit-graphic 0xA0..0xBF
1608 ELSE 0x81..0x9D + [0xA0..0xFF]+
1609 ---------------------------------------------
1610
1611 As this is the internal character representation, the format is
1612 usually not used externally (i.e. in a file or in a data sent to a
1613 process). But, it is possible to have a text externally in this
1614 format (i.e. by encoding by the coding system `emacs-mule').
1615
1616 In that case, a sequence of one-byte codes has a slightly different
1617 form.
1618
1619 At first, all characters in eight-bit-control are represented by
1620 one-byte sequences which are their 8-bit code.
1621
1622 Next, character composition data are represented by the byte
1623 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1624 where,
1625 METHOD is 0xF0 plus one of composition method (enum
1626 composition_method),
1627
1628 BYTES is 0xA0 plus a byte length of this composition data,
1629
1630 CHARS is 0x20 plus a number of characters composed by this
1631 data,
1632
	COMPONENTs are characters of multibyte form or composition
1634 rules encoded by two-byte of ASCII codes.
1635
1636 In addition, for backward compatibility, the following formats are
1637 also recognized as composition data on decoding.
1638
1639 0x80 MSEQ ...
1640 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1641
1642 Here,
1643 MSEQ is a multibyte form but in these special format:
1644 ASCII: 0xA0 ASCII_CODE+0x80,
1645 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1646 RULE is a one byte code of the range 0xA0..0xF0 that
1647 represents a composition rule.
1648 */
1649
1650char emacs_mule_bytes[256];
1651
df7492f9 1652int
ff0dacd7 1653emacs_mule_char (coding, src, nbytes, nchars, id)
df7492f9 1654 struct coding_system *coding;
065e3595 1655 const unsigned char *src;
ff0dacd7 1656 int *nbytes, *nchars, *id;
df7492f9 1657{
8f924df7
KH
1658 const unsigned char *src_end = coding->source + coding->src_bytes;
1659 const unsigned char *src_base = src;
df7492f9 1660 int multibytep = coding->src_multibyte;
df7492f9
KH
1661 struct charset *charset;
1662 unsigned code;
1663 int c;
1664 int consumed_chars = 0;
1665
1666 ONE_MORE_BYTE (c);
065e3595 1667 if (c < 0)
df7492f9 1668 {
065e3595
KH
1669 c = -c;
1670 charset = emacs_mule_charset[0];
1671 }
1672 else
1673 {
1674 switch (emacs_mule_bytes[c])
b73bfc1c 1675 {
065e3595 1676 case 2:
df7492f9
KH
1677 if (! (charset = emacs_mule_charset[c]))
1678 goto invalid_code;
1679 ONE_MORE_BYTE (c);
065e3595
KH
1680 if (c < 0)
1681 goto invalid_code;
df7492f9 1682 code = c & 0x7F;
065e3595
KH
1683 break;
1684
1685 case 3:
1686 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1687 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1688 {
1689 ONE_MORE_BYTE (c);
1690 if (c < 0 || ! (charset = emacs_mule_charset[c]))
1691 goto invalid_code;
1692 ONE_MORE_BYTE (c);
1693 if (c < 0)
1694 goto invalid_code;
1695 code = c & 0x7F;
1696 }
1697 else
1698 {
1699 if (! (charset = emacs_mule_charset[c]))
1700 goto invalid_code;
1701 ONE_MORE_BYTE (c);
1702 if (c < 0)
1703 goto invalid_code;
1704 code = (c & 0x7F) << 8;
1705 ONE_MORE_BYTE (c);
1706 if (c < 0)
1707 goto invalid_code;
1708 code |= c & 0x7F;
1709 }
1710 break;
1711
1712 case 4:
1713 ONE_MORE_BYTE (c);
1714 if (c < 0 || ! (charset = emacs_mule_charset[c]))
df7492f9
KH
1715 goto invalid_code;
1716 ONE_MORE_BYTE (c);
065e3595
KH
1717 if (c < 0)
1718 goto invalid_code;
781d7a48 1719 code = (c & 0x7F) << 8;
df7492f9 1720 ONE_MORE_BYTE (c);
065e3595
KH
1721 if (c < 0)
1722 goto invalid_code;
df7492f9 1723 code |= c & 0x7F;
065e3595 1724 break;
df7492f9 1725
065e3595
KH
1726 case 1:
1727 code = c;
1728 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1729 ? charset_ascii : charset_eight_bit);
1730 break;
df7492f9 1731
065e3595
KH
1732 default:
1733 abort ();
1734 }
1735 c = DECODE_CHAR (charset, code);
1736 if (c < 0)
1737 goto invalid_code;
df7492f9 1738 }
df7492f9
KH
1739 *nbytes = src - src_base;
1740 *nchars = consumed_chars;
ff0dacd7
KH
1741 if (id)
1742 *id = charset->id;
df7492f9
KH
1743 return c;
1744
1745 no_more_source:
1746 return -2;
1747
1748 invalid_code:
1749 return -1;
1750}
1751
1752
1753/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
1754 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1755 else return 0. */
df7492f9
KH
1756
1757static int
ff0dacd7 1758detect_coding_emacs_mule (coding, detect_info)
df7492f9 1759 struct coding_system *coding;
ff0dacd7 1760 struct coding_detection_info *detect_info;
df7492f9 1761{
065e3595 1762 const unsigned char *src = coding->source, *src_base;
8f924df7 1763 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
1764 int multibytep = coding->src_multibyte;
1765 int consumed_chars = 0;
1766 int c;
1767 int found = 0;
1768
ff0dacd7 1769 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
df7492f9
KH
1770 /* A coding system of this category is always ASCII compatible. */
1771 src += coding->head_ascii;
1772
1773 while (1)
1774 {
065e3595 1775 src_base = src;
df7492f9 1776 ONE_MORE_BYTE (c);
065e3595
KH
1777 if (c < 0)
1778 continue;
df7492f9
KH
1779 if (c == 0x80)
1780 {
1781 /* Perhaps the start of composite character. We simple skip
1782 it because analyzing it is too heavy for detecting. But,
1783 at least, we check that the composite character
1784 constitues of more than 4 bytes. */
8f924df7 1785 const unsigned char *src_base;
df7492f9
KH
1786
1787 repeat:
1788 src_base = src;
1789 do
1790 {
1791 ONE_MORE_BYTE (c);
1792 }
1793 while (c >= 0xA0);
1794
1795 if (src - src_base <= 4)
1796 break;
ff0dacd7 1797 found = CATEGORY_MASK_EMACS_MULE;
df7492f9
KH
1798 if (c == 0x80)
1799 goto repeat;
b73bfc1c 1800 }
df7492f9
KH
1801
1802 if (c < 0x80)
b73bfc1c 1803 {
df7492f9
KH
1804 if (c < 0x20
1805 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1806 break;
1807 }
1808 else
1809 {
8f924df7 1810 const unsigned char *src_base = src - 1;
df7492f9
KH
1811
1812 do
1813 {
1814 ONE_MORE_BYTE (c);
1815 }
1816 while (c >= 0xA0);
1817 if (src - src_base != emacs_mule_bytes[*src_base])
1818 break;
ff0dacd7 1819 found = CATEGORY_MASK_EMACS_MULE;
4ed46869
KH
1820 }
1821 }
ff0dacd7 1822 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
df7492f9
KH
1823 return 0;
1824
1825 no_more_source:
065e3595 1826 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
89528eb3 1827 {
ff0dacd7 1828 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
89528eb3
KH
1829 return 0;
1830 }
ff0dacd7
KH
1831 detect_info->found |= found;
1832 return 1;
4ed46869
KH
1833}
1834
b73bfc1c 1835
df7492f9
KH
1836/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1837
/* Decode a character represented as a component of a composition
   sequence of Emacs 20/21 style at SRC.  Store the character in
   *BUF, increment BUF, and advance SRC and CONSUMED_CHARS past the
   character.

   This macro must be used directly inside a loop: if SRC is at
   SRC_END, or if the sequence at SRC is incomplete, it leaves the
   enclosing loop with `break'.  If SRC points at an invalid byte
   sequence, it jumps to the label `invalid_code'.

   The expansion ends in a dangling `else', so that the semicolon
   written after the macro call becomes the (empty) else branch.  */

#define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf)                         \
  if (1)                                                                \
    {                                                                   \
      int c;                                                            \
      int nbytes, nchars;                                               \
                                                                        \
      if (src == src_end)                                               \
        break;                                                          \
      c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);        \
      if (c == -2)                                                      \
        break;                                                          \
      if (c < 0)                                                        \
        goto invalid_code;                                              \
      *buf++ = c;                                                       \
      src += nbytes;                                                    \
      consumed_chars += nchars;                                         \
    }                                                                   \
  else
1865
1866
1867/* Decode a composition rule represented as a component of composition
781d7a48
KH
1868 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
1869 and increment BUF. If SRC points to an invalid byte sequence, jump to
1870 the label `invalid_code'. */
df7492f9 1871
#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf)              \
  do {                                                          \
    int rule, gref, nref;                                       \
                                                                \
    /* A rule occupies exactly one source byte.  */             \
    if (! (src < src_end))                                      \
      goto invalid_code;                                        \
    ONE_MORE_BYTE_NO_CHECK (rule);                              \
    rule -= 0x20;                                               \
    /* Only the 81 values 0..80 encode a (GREF, NREF) pair.  */ \
    if (! (rule >= 0 && rule < 81))                             \
      goto invalid_code;                                        \
                                                                \
    gref = rule / 9;                                            \
    nref = rule % 9;                                            \
    *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);              \
  } while (0)
1886
1887
781d7a48
KH
1888/* Decode a composition rule represented as a component of composition
1889 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
1890 and increment BUF. If SRC points to an invalid byte sequence, jump to
1891 the label `invalid_code'. */
1892
#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf)              \
  do {                                                          \
    int gref, nref;                                             \
                                                                \
    /* A rule occupies two source bytes: GREF then NREF, each   \
       stored with an offset of 0x20.  */                       \
    if (! (src + 1 < src_end))                                  \
      goto invalid_code;                                        \
    ONE_MORE_BYTE_NO_CHECK (gref);                              \
    gref -= 0x20;                                               \
    ONE_MORE_BYTE_NO_CHECK (nref);                              \
    nref -= 0x20;                                               \
    if (! (gref >= 0 && gref < 81                               \
           && nref >= 0 && nref < 81))                          \
      goto invalid_code;                                        \
    *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);              \
  } while (0)
1908
1909
/* Decode an Emacs 21 style composition whose method byte is C.

   Reads the BYTES and CHARS header bytes, records a composition
   annotation covering character positions FROM..TO in CHARBUF, and,
   unless the method is COMPOSITION_RELATIVE, decodes the component
   characters (and rules) that follow until BYTES source units have
   been consumed.  Jumps to `invalid_code' on a malformed header or a
   truncated component list.

   FROM is computed from CODING->produced_char, as in the Emacs 20
   style macros below; CODING->produced would mix a byte count into a
   character position.  */
#define DECODE_EMACS_MULE_21_COMPOSITION(c)                             \
  do {                                                                  \
    /* Emacs 21 style format.  The first three bytes at SRC are         \
       (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is  \
       the byte length of this composition information, CHARS is the    \
       number of characters composed by this composition.  */           \
    enum composition_method method = c - 0xF2;                          \
    int *charbuf_base = charbuf;                                        \
    int from, to;                                                       \
    int consumed_chars_limit;                                           \
    int nbytes, nchars;                                                 \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    nbytes = c - 0xA0;                                                  \
    if (nbytes < 3)                                                     \
      goto invalid_code;                                                \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    nchars = c - 0xA0;                                                  \
    from = coding->produced_char + char_offset;                         \
    to = from + nchars;                                                 \
    ADD_COMPOSITION_DATA (charbuf, from, to, method);                   \
    consumed_chars_limit = consumed_chars_base + nbytes;                \
    if (method != COMPOSITION_RELATIVE)                                 \
      {                                                                 \
        int i = 0;                                                      \
        while (consumed_chars < consumed_chars_limit)                   \
          {                                                             \
            /* Odd positions hold rules unless the method is the        \
               plain alternate-characters one.  */                      \
            if (i % 2 && method != COMPOSITION_WITH_ALTCHARS)           \
              DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf);          \
            else                                                        \
              DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf);             \
            i++;                                                        \
          }                                                             \
        /* The loop was left early by a `break' in the character        \
           decoder: the component list is truncated.  */                \
        if (consumed_chars < consumed_chars_limit)                      \
          goto invalid_code;                                            \
        charbuf_base[0] -= i;                                           \
      }                                                                 \
  } while (0)
93dec019 1952
aa72b389 1953
df7492f9
KH
/* Decode an Emacs 20 style relative composition.

   SRC is rewound to SRC_BASE and the leading 0x80 byte is skipped
   again.  Up to MAX_COMPOSITION_COMPONENTS characters are then
   collected into the local array COMPONENTS; the `for' loop ends early
   when DECODE_EMACS_MULE_COMPOSITION_CHAR executes `break'.  A
   sequence with fewer than two characters jumps to `invalid_code'.
   Otherwise a COMPOSITION_RELATIVE annotation for the I collected
   characters is added to CHARBUF, followed by the characters.

   NOTE(review): unlike the rule-base variant, no check against
   CHARBUF_END is made before writing to CHARBUF -- confirm that the
   caller's MAX_ANNOTATION_LENGTH margin is meant to cover this.  */
1954#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1955 do { \
1956 /* Emacs 20 style format for relative composition. */ \
1957 /* Store multibyte form of characters to be composed. */ \
ff0dacd7 1958 enum composition_method method = COMPOSITION_RELATIVE; \
df7492f9
KH
1959 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1960 int *buf = components; \
1961 int i, j; \
ff0dacd7 1962 int from, to; \
df7492f9
KH
1963 \
1964 src = src_base; \
1965 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1966 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1967 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1968 if (i < 2) \
1969 goto invalid_code; \
ff0dacd7
KH
1970 from = coding->produced_char + char_offset; \
1971 to = from + i; \
1972 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
df7492f9
KH
1973 for (j = 0; j < i; j++) \
1974 *charbuf++ = components[j]; \
1975 } while (0)
1976
1977
/* Decode an Emacs 20 style rule-base composition: one character
   followed by (rule, character) pairs, collected in the local array
   COMPONENTS and then emitted to CHARBUF after a
   COMPOSITION_WITH_RULE annotation.

   Jumps to `invalid_code' when no pair was read or an even number of
   components was collected, and to `no_more_source' when CHARBUF has
   no room left for the output.

   Corrections relative to the earlier form of this macro:
   - the room check jumped to `no_more_source' when there WAS room
     (`<' instead of `>=');
   - the annotation was written through BUF, i.e. past the collected
     entries of the local COMPONENTS array, instead of into CHARBUF as
     the relative-composition macro does;
   - the loop could store 1 + 2 * MAX_COMPOSITION_COMPONENTS entries
     in an array of 2 * MAX_COMPOSITION_COMPONENTS - 1; it now stops
     one pair earlier so that the array is filled at most exactly.

   NOTE(review): I counts (rule, character) pairs, so the sequence
   holds I + 1 characters and 2 * I + 1 components, while TO and the
   two copy loops below only account for I entries -- confirm against
   the contract of ADD_COMPOSITION_DATA.
   NOTE(review): a `break' executed by the first
   DECODE_EMACS_MULE_COMPOSITION_CHAR leaves this whole `do ... while
   (0)' without producing anything -- confirm that this is intended.  */
#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c)            \
  do {                                                          \
    /* Emacs 20 style format for rule-base composition.  */     \
    /* Store multibyte form of characters to be composed.  */   \
    enum composition_method method = COMPOSITION_WITH_RULE;     \
    int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
    int *buf = components;                                      \
    int i, j;                                                   \
    int from, to;                                               \
                                                                \
    DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                   \
    for (i = 0; i < MAX_COMPOSITION_COMPONENTS - 1; i++)        \
      {                                                         \
        DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf);            \
        DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);               \
      }                                                         \
    if (i < 1 || (buf - components) % 2 == 0)                   \
      goto invalid_code;                                        \
    if (charbuf + i + (i / 2) + 1 >= charbuf_end)               \
      goto no_more_source;                                      \
    from = coding->produced_char + char_offset;                 \
    to = from + i;                                              \
    ADD_COMPOSITION_DATA (charbuf, from, to, method);           \
    for (j = 0; j < i; j++)                                     \
      *charbuf++ = components[j];                               \
    for (j = 0; j < i; j += 2)                                  \
      *charbuf++ = components[j];                               \
  } while (0)
2006
aa72b389
KH
2007
2008static void
df7492f9 2009decode_coding_emacs_mule (coding)
aa72b389 2010 struct coding_system *coding;
aa72b389 2011{
8f924df7
KH
2012 const unsigned char *src = coding->source + coding->consumed;
2013 const unsigned char *src_end = coding->source + coding->src_bytes;
2014 const unsigned char *src_base;
df7492f9 2015 int *charbuf = coding->charbuf;
ff0dacd7 2016 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
df7492f9 2017 int consumed_chars = 0, consumed_chars_base;
df7492f9 2018 int multibytep = coding->src_multibyte;
24a73b0a 2019 Lisp_Object attrs, charset_list;
ff0dacd7
KH
2020 int char_offset = coding->produced_char;
2021 int last_offset = char_offset;
2022 int last_id = charset_ascii;
aa72b389 2023
24a73b0a 2024 CODING_GET_INFO (coding, attrs, charset_list);
aa72b389 2025
aa72b389
KH
2026 while (1)
2027 {
df7492f9
KH
2028 int c;
2029
aa72b389 2030 src_base = src;
df7492f9
KH
2031 consumed_chars_base = consumed_chars;
2032
2033 if (charbuf >= charbuf_end)
2034 break;
aa72b389 2035
df7492f9 2036 ONE_MORE_BYTE (c);
065e3595
KH
2037 if (c < 0)
2038 {
2039 *charbuf++ = -c;
2040 char_offset++;
2041 }
2042 else if (c < 0x80)
aa72b389 2043 {
df7492f9
KH
2044 *charbuf++ = c;
2045 char_offset++;
aa72b389 2046 }
df7492f9
KH
2047 else if (c == 0x80)
2048 {
df7492f9 2049 ONE_MORE_BYTE (c);
065e3595
KH
2050 if (c < 0)
2051 goto invalid_code;
781d7a48
KH
2052 if (c - 0xF2 >= COMPOSITION_RELATIVE
2053 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
df7492f9
KH
2054 DECODE_EMACS_MULE_21_COMPOSITION (c);
2055 else if (c < 0xC0)
2056 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
2057 else if (c == 0xFF)
2058 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
2059 else
2060 goto invalid_code;
2061 }
2062 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
2063 {
2064 int nbytes, nchars;
ff0dacd7
KH
2065 int id;
2066
781d7a48
KH
2067 src = src_base;
2068 consumed_chars = consumed_chars_base;
ff0dacd7 2069 c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
df7492f9
KH
2070 if (c < 0)
2071 {
2072 if (c == -2)
2073 break;
2074 goto invalid_code;
2075 }
ff0dacd7
KH
2076 if (last_id != id)
2077 {
2078 if (last_id != charset_ascii)
2079 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2080 last_id = id;
2081 last_offset = char_offset;
2082 }
df7492f9 2083 *charbuf++ = c;
781d7a48
KH
2084 src += nbytes;
2085 consumed_chars += nchars;
df7492f9
KH
2086 char_offset++;
2087 }
2088 continue;
2089
2090 invalid_code:
2091 src = src_base;
2092 consumed_chars = consumed_chars_base;
2093 ONE_MORE_BYTE (c);
2094 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 2095 char_offset++;
df7492f9
KH
2096 coding->errors++;
2097 }
2098
2099 no_more_source:
ff0dacd7
KH
2100 if (last_id != charset_ascii)
2101 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
df7492f9
KH
2102 coding->consumed_char += consumed_chars_base;
2103 coding->consumed = src_base - coding->source;
2104 coding->charbuf_used = charbuf - coding->charbuf;
2105}
2106
2107
/* Store in CODES[0] and CODES[1] the leading code(s) for the
   emacs-mule charset ID.  An ID below 0xA0 is its own single leading
   code and CODES[1] is set to 0; a larger ID is preceded by 0x9A,
   0x9B, 0x9C or 0x9D depending on its range and is itself stored in
   CODES[1].

   The expansion deliberately has no trailing semicolon, so that
   `if (x) EMACS_MULE_LEADING_CODES (id, codes); else ...' is valid.
   ID is evaluated more than once.  */
#define EMACS_MULE_LEADING_CODES(id, codes)             \
  do {                                                  \
    if ((id) < 0xA0)                                    \
      (codes)[0] = (id), (codes)[1] = 0;                \
    else if ((id) < 0xE0)                               \
      (codes)[0] = 0x9A, (codes)[1] = (id);             \
    else if ((id) < 0xF0)                               \
      (codes)[0] = 0x9B, (codes)[1] = (id);             \
    else if ((id) < 0xF5)                               \
      (codes)[0] = 0x9C, (codes)[1] = (id);             \
    else                                                \
      (codes)[0] = 0x9D, (codes)[1] = (id);             \
  } while (0)
2121
aa72b389 2122
df7492f9
KH
2123static int
2124encode_coding_emacs_mule (coding)
2125 struct coding_system *coding;
2126{
2127 int multibytep = coding->dst_multibyte;
2128 int *charbuf = coding->charbuf;
2129 int *charbuf_end = charbuf + coding->charbuf_used;
2130 unsigned char *dst = coding->destination + coding->produced;
2131 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2132 int safe_room = 8;
df7492f9 2133 int produced_chars = 0;
24a73b0a 2134 Lisp_Object attrs, charset_list;
df7492f9 2135 int c;
ff0dacd7 2136 int preferred_charset_id = -1;
df7492f9 2137
24a73b0a 2138 CODING_GET_INFO (coding, attrs, charset_list);
eccb6815
KH
2139 if (! EQ (charset_list, Vemacs_mule_charset_list))
2140 {
2141 CODING_ATTR_CHARSET_LIST (attrs)
2142 = charset_list = Vemacs_mule_charset_list;
2143 }
df7492f9
KH
2144
2145 while (charbuf < charbuf_end)
2146 {
2147 ASSURE_DESTINATION (safe_room);
2148 c = *charbuf++;
ff0dacd7
KH
2149
2150 if (c < 0)
2151 {
2152 /* Handle an annotation. */
2153 switch (*charbuf)
2154 {
2155 case CODING_ANNOTATE_COMPOSITION_MASK:
2156 /* Not yet implemented. */
2157 break;
2158 case CODING_ANNOTATE_CHARSET_MASK:
2159 preferred_charset_id = charbuf[3];
2160 if (preferred_charset_id >= 0
2161 && NILP (Fmemq (make_number (preferred_charset_id),
2162 charset_list)))
2163 preferred_charset_id = -1;
2164 break;
2165 default:
2166 abort ();
2167 }
2168 charbuf += -c - 1;
2169 continue;
2170 }
2171
df7492f9
KH
2172 if (ASCII_CHAR_P (c))
2173 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
2174 else if (CHAR_BYTE8_P (c))
2175 {
2176 c = CHAR_TO_BYTE8 (c);
2177 EMIT_ONE_BYTE (c);
2178 }
df7492f9 2179 else
aa72b389 2180 {
df7492f9
KH
2181 struct charset *charset;
2182 unsigned code;
2183 int dimension;
2184 int emacs_mule_id;
2185 unsigned char leading_codes[2];
2186
ff0dacd7
KH
2187 if (preferred_charset_id >= 0)
2188 {
2189 charset = CHARSET_FROM_ID (preferred_charset_id);
2190 if (! CHAR_CHARSET_P (c, charset))
2191 charset = char_charset (c, charset_list, NULL);
2192 }
2193 else
2194 charset = char_charset (c, charset_list, &code);
df7492f9
KH
2195 if (! charset)
2196 {
2197 c = coding->default_char;
2198 if (ASCII_CHAR_P (c))
2199 {
2200 EMIT_ONE_ASCII_BYTE (c);
2201 continue;
2202 }
2203 charset = char_charset (c, charset_list, &code);
2204 }
2205 dimension = CHARSET_DIMENSION (charset);
2206 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2207 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2208 EMIT_ONE_BYTE (leading_codes[0]);
2209 if (leading_codes[1])
2210 EMIT_ONE_BYTE (leading_codes[1]);
2211 if (dimension == 1)
1fa663f9 2212 EMIT_ONE_BYTE (code | 0x80);
aa72b389 2213 else
df7492f9 2214 {
1fa663f9 2215 code |= 0x8080;
df7492f9
KH
2216 EMIT_ONE_BYTE (code >> 8);
2217 EMIT_ONE_BYTE (code & 0xFF);
2218 }
aa72b389 2219 }
aa72b389 2220 }
065e3595 2221 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
2222 coding->produced_char += produced_chars;
2223 coding->produced = dst - coding->destination;
2224 return 0;
aa72b389 2225}
b73bfc1c 2226
4ed46869 2227\f
df7492f9 2228/*** 7. ISO2022 handlers ***/
4ed46869
KH
2229
2230/* The following note describes the coding system ISO2022 briefly.
39787efd 2231 Since the intention of this note is to help understand the
5a936b46 2232 functions in this file, some parts are NOT ACCURATE or are OVERLY
39787efd 2233 SIMPLIFIED. For thorough understanding, please refer to the
5a936b46 2234 original document of ISO2022. This is equivalent to the standard
cfb43547 2235 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
4ed46869
KH
2236
2237 ISO2022 provides many mechanisms to encode several character sets
cfb43547 2238 in 7-bit and 8-bit environments. For 7-bit environments, all text
39787efd
KH
2239 is encoded using bytes less than 128. This may make the encoded
2240 text a little bit longer, but the text passes more easily through
cfb43547 2241 several types of gateway, some of which strip off the MSB (Most
8ca3766a 2242 Significant Bit).
b73bfc1c 2243
cfb43547
DL
2244 There are two kinds of character sets: control character sets and
2245 graphic character sets. The former contain control characters such
4ed46869 2246 as `newline' and `escape' to provide control functions (control
39787efd 2247 functions are also provided by escape sequences). The latter
cfb43547 2248 contain graphic characters such as 'A' and '-'. Emacs recognizes
4ed46869
KH
2249 two control character sets and many graphic character sets.
2250
2251 Graphic character sets are classified into one of the following
39787efd
KH
2252 four classes, according to the number of bytes (DIMENSION) and
2253 number of characters in one dimension (CHARS) of the set:
2254 - DIMENSION1_CHARS94
2255 - DIMENSION1_CHARS96
2256 - DIMENSION2_CHARS94
2257 - DIMENSION2_CHARS96
2258
2259 In addition, each character set is assigned an identification tag,
cfb43547 2260 unique for each set, called the "final character" (denoted as <F>
39787efd
KH
2261 hereafter). The <F> of each character set is decided by ECMA(*)
2262 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2263 (0x30..0x3F are for private use only).
4ed46869
KH
2264
2265 Note (*): ECMA = European Computer Manufacturers Association
2266
cfb43547 2267 Here are examples of graphic character sets [NAME(<F>)]:
4ed46869
KH
2268 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2269 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2270 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2271 o DIMENSION2_CHARS96 -- none for the moment
2272
39787efd 2273 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4ed46869
KH
2274 C0 [0x00..0x1F] -- control character plane 0
2275 GL [0x20..0x7F] -- graphic character plane 0
2276 C1 [0x80..0x9F] -- control character plane 1
2277 GR [0xA0..0xFF] -- graphic character plane 1
2278
2279 A control character set is directly designated and invoked to C0 or
39787efd
KH
2280 C1 by an escape sequence. The most common case is that:
2281 - ISO646's control character set is designated/invoked to C0, and
2282 - ISO6429's control character set is designated/invoked to C1,
2283 and usually these designations/invocations are omitted in encoded
2284 text. In a 7-bit environment, only C0 can be used, and a control
2285 character for C1 is encoded by an appropriate escape sequence to
2286 fit into the environment. All control characters for C1 are
2287 defined to have corresponding escape sequences.
4ed46869
KH
2288
2289 A graphic character set is at first designated to one of four
2290 graphic registers (G0 through G3), then these graphic registers are
2291 invoked to GL or GR. These designations and invocations can be
2292 done independently. The most common case is that G0 is invoked to
39787efd
KH
2293 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2294 these invocations and designations are omitted in encoded text.
2295 In a 7-bit environment, only GL can be used.
4ed46869 2296
39787efd
KH
2297 When a graphic character set of CHARS94 is invoked to GL, codes
2298 0x20 and 0x7F of the GL area work as control characters SPACE and
2299 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2300 be used.
4ed46869
KH
2301
2302 There are two ways of invocation: locking-shift and single-shift.
2303 With locking-shift, the invocation lasts until the next different
39787efd
KH
2304 invocation, whereas with single-shift, the invocation affects the
2305 following character only and doesn't affect the locking-shift
2306 state. Invocations are done by the following control characters or
2307 escape sequences:
4ed46869
KH
2308
2309 ----------------------------------------------------------------------
39787efd 2310 abbrev function cntrl escape seq description
4ed46869 2311 ----------------------------------------------------------------------
39787efd
KH
2312 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2313 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2314 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2315 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2316 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2317 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2318 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2319 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2320 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4ed46869 2321 ----------------------------------------------------------------------
39787efd
KH
2322 (*) These are not used by any known coding system.
2323
2324 Control characters for these functions are defined by macros
2325 ISO_CODE_XXX in `coding.h'.
4ed46869 2326
39787efd 2327 Designations are done by the following escape sequences:
4ed46869
KH
2328 ----------------------------------------------------------------------
2329 escape sequence description
2330 ----------------------------------------------------------------------
2331 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2332 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2333 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2334 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2335 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2336 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2337 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2338 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2339 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2340 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2341 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2342 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2343 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2344 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2345 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2346 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2347 ----------------------------------------------------------------------
2348
2349 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
39787efd 2350 of dimension 1, chars 94, and final character <F>, etc...
4ed46869
KH
2351
2352 Note (*): Although these designations are not allowed in ISO2022,
2353 Emacs accepts them on decoding, and produces them on encoding
39787efd 2354 CHARS96 character sets in a coding system which is characterized as
4ed46869
KH
2355 7-bit environment, non-locking-shift, and non-single-shift.
2356
2357 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
df7492f9 2358 '(' must be omitted. We refer to this as "short-form" hereafter.
4ed46869 2359
cfb43547 2360 Now you may notice that there are a lot of ways of encoding the
39787efd
KH
2361 same multilingual text in ISO2022. Actually, there exist many
2362 coding systems such as Compound Text (used in X11's inter client
8ca3766a
DL
2363 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2364 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
4ed46869
KH
2365 localized platforms), and all of these are variants of ISO2022.
2366
2367 In addition to the above, Emacs handles two more kinds of escape
2368 sequences: ISO6429's direction specification and Emacs' private
2369 sequence for specifying character composition.
2370
39787efd 2371 ISO6429's direction specification takes the following form:
4ed46869
KH
2372 o CSI ']' -- end of the current direction
2373 o CSI '0' ']' -- end of the current direction
2374 o CSI '1' ']' -- start of left-to-right text
2375 o CSI '2' ']' -- start of right-to-left text
2376 The control character CSI (0x9B: control sequence introducer) is
39787efd
KH
2377 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2378
2379 Character composition specification takes the following form:
ec6d2bb8
KH
2380 o ESC '0' -- start relative composition
2381 o ESC '1' -- end composition
2382 o ESC '2' -- start rule-base composition (*)
2383 o ESC '3' -- start relative composition with alternate chars (**)
2384 o ESC '4' -- start rule-base composition with alternate chars (**)
b73bfc1c 2385 Since these are not standard escape sequences of any ISO standard,
cfb43547 2386 the use of them with these meanings is restricted to Emacs only.
ec6d2bb8 2387
5a936b46
DL
2388 (*) This form is used only in Emacs 20.7 and older versions,
2389 but newer versions can safely decode it.
cfb43547 2390 (**) This form is used only in Emacs 21.1 and newer versions,
5a936b46 2391 and older versions can't decode it.
ec6d2bb8 2392
cfb43547 2393 Here's a list of example usages of these composition escape
b73bfc1c 2394 sequences (categorized by `enum composition_method').
ec6d2bb8 2395
b73bfc1c 2396 COMPOSITION_RELATIVE:
ec6d2bb8 2397 ESC 0 CHAR [ CHAR ] ESC 1
8ca3766a 2398 COMPOSITION_WITH_RULE:
ec6d2bb8 2399 ESC 2 CHAR [ RULE CHAR ] ESC 1
b73bfc1c 2400 COMPOSITION_WITH_ALTCHARS:
ec6d2bb8 2401 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
b73bfc1c 2402 COMPOSITION_WITH_RULE_ALTCHARS:
ec6d2bb8 2403 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
4ed46869
KH
2404
/* Table with one entry per byte value (256 entries).
   NOTE(review): presumably gives the ISO 2022 code class of each
   byte and is filled in elsewhere in this file -- confirm.  */
2405enum iso_code_class_type iso_code_class[256];
2406
df7492f9
KH
/* Non-zero if the charset ID may be used with CODING, i.e. ID lies in
   0..CODING->max_charset_id and its entry in CODING->safe_charsets is
   not the "unsafe" marker 255.

   The entry is compared as an unsigned byte: the table is a plain
   `char' array filled with 255, so a `>= 0' test would succeed for
   every entry on hosts where `char' is unsigned.  A negative ID is
   rejected before it is used as an index.  ID is evaluated more than
   once.  */
#define SAFE_CHARSET_P(coding, id)                              \
  ((id) >= 0                                                    \
   && (id) <= (coding)->max_charset_id                          \
   && (unsigned char) (coding)->safe_charsets[id] != 255)
2410
2411
/* Non-zero if CODING_ISO_INITIAL (..., 1) is non-negative for the
   coding system registered for CATEGORY in `coding_categories'.
   NOTE(review): presumably that value is the charset initially
   designated to graphic register G1, so a non-negative value means
   shift-out is meaningful -- confirm against CODING_ISO_INITIAL.  */
2412#define SHIFT_OUT_OK(category) \
2413 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2414
2415static void
f0064e1f
DL
2416setup_iso_safe_charsets (attrs)
2417 Lisp_Object attrs;
df7492f9
KH
2418{
2419 Lisp_Object charset_list, safe_charsets;
2420 Lisp_Object request;
2421 Lisp_Object reg_usage;
2422 Lisp_Object tail;
2423 int reg94, reg96;
2424 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2425 int max_charset_id;
2426
2427 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2428 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2429 && ! EQ (charset_list, Viso_2022_charset_list))
2430 {
2431 CODING_ATTR_CHARSET_LIST (attrs)
2432 = charset_list = Viso_2022_charset_list;
2433 ASET (attrs, coding_attr_safe_charsets, Qnil);
2434 }
2435
2436 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2437 return;
2438
2439 max_charset_id = 0;
2440 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2441 {
2442 int id = XINT (XCAR (tail));
2443 if (max_charset_id < id)
2444 max_charset_id = id;
2445 }
d46c5b12 2446
df7492f9
KH
2447 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2448 make_number (255));
2449 request = AREF (attrs, coding_attr_iso_request);
2450 reg_usage = AREF (attrs, coding_attr_iso_usage);
2451 reg94 = XINT (XCAR (reg_usage));
2452 reg96 = XINT (XCDR (reg_usage));
2453
2454 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2455 {
2456 Lisp_Object id;
2457 Lisp_Object reg;
2458 struct charset *charset;
2459
2460 id = XCAR (tail);
2461 charset = CHARSET_FROM_ID (XINT (id));
bf16eb23 2462 reg = Fcdr (Fassq (id, request));
df7492f9 2463 if (! NILP (reg))
8f924df7 2464 SSET (safe_charsets, XINT (id), XINT (reg));
df7492f9
KH
2465 else if (charset->iso_chars_96)
2466 {
2467 if (reg96 < 4)
8f924df7 2468 SSET (safe_charsets, XINT (id), reg96);
df7492f9
KH
2469 }
2470 else
2471 {
2472 if (reg94 < 4)
8f924df7 2473 SSET (safe_charsets, XINT (id), reg94);
df7492f9
KH
2474 }
2475 }
2476 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2477}
d46c5b12 2478
b6871cc7 2479
4ed46869 2480/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
ff0dacd7
KH
2481 Check if a text is encoded in one of ISO-2022 based coding systems.
2482 If it is, return 1, else return 0. */
4ed46869 2483
0a28aafb 2484static int
ff0dacd7 2485detect_coding_iso_2022 (coding, detect_info)
df7492f9 2486 struct coding_system *coding;
ff0dacd7 2487 struct coding_detection_info *detect_info;
4ed46869 2488{
8f924df7
KH
2489 const unsigned char *src = coding->source, *src_base = src;
2490 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9 2491 int multibytep = coding->src_multibyte;
ff0dacd7 2492 int single_shifting = 0;
df7492f9
KH
2493 int id;
2494 int c, c1;
2495 int consumed_chars = 0;
2496 int i;
ff0dacd7
KH
2497 int rejected = 0;
2498 int found = 0;
2499
2500 detect_info->checked |= CATEGORY_MASK_ISO;
df7492f9
KH
2501
2502 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2503 {
2504 struct coding_system *this = &(coding_categories[i]);
2505 Lisp_Object attrs, val;
2506
2507 attrs = CODING_ID_ATTRS (this->id);
2508 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2509 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2510 setup_iso_safe_charsets (attrs);
2511 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7
KH
2512 this->max_charset_id = SCHARS (val) - 1;
2513 this->safe_charsets = (char *) SDATA (val);
df7492f9
KH
2514 }
2515
2516 /* A coding system of this category is always ASCII compatible. */
2517 src += coding->head_ascii;
3f003981 2518
ff0dacd7 2519 while (rejected != CATEGORY_MASK_ISO)
4ed46869 2520 {
065e3595 2521 src_base = src;
df7492f9 2522 ONE_MORE_BYTE (c);
4ed46869
KH
2523 switch (c)
2524 {
2525 case ISO_CODE_ESC:
74383408
KH
2526 if (inhibit_iso_escape_detection)
2527 break;
f46869e4 2528 single_shifting = 0;
df7492f9 2529 ONE_MORE_BYTE (c);
d46c5b12 2530 if (c >= '(' && c <= '/')
4ed46869 2531 {
bf9cdd4e 2532 /* Designation sequence for a charset of dimension 1. */
df7492f9 2533 ONE_MORE_BYTE (c1);
d46c5b12 2534 if (c1 < ' ' || c1 >= 0x80
df7492f9 2535 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
d46c5b12
KH
2536 /* Invalid designation sequence. Just ignore. */
2537 break;
bf9cdd4e
KH
2538 }
2539 else if (c == '$')
2540 {
2541 /* Designation sequence for a charset of dimension 2. */
df7492f9 2542 ONE_MORE_BYTE (c);
bf9cdd4e
KH
2543 if (c >= '@' && c <= 'B')
2544 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
ff0dacd7 2545 id = iso_charset_table[1][0][c];
bf9cdd4e 2546 else if (c >= '(' && c <= '/')
bcf26d6a 2547 {
df7492f9 2548 ONE_MORE_BYTE (c1);
d46c5b12 2549 if (c1 < ' ' || c1 >= 0x80
df7492f9 2550 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
d46c5b12
KH
2551 /* Invalid designation sequence. Just ignore. */
2552 break;
bcf26d6a 2553 }
bf9cdd4e 2554 else
ff0dacd7 2555 /* Invalid designation sequence. Just ignore it. */
d46c5b12
KH
2556 break;
2557 }
ae9ff118 2558 else if (c == 'N' || c == 'O')
d46c5b12 2559 {
ae9ff118 2560 /* ESC <Fe> for SS2 or SS3. */
ff0dacd7
KH
2561 single_shifting = 1;
2562 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
d46c5b12 2563 break;
4ed46869 2564 }
ec6d2bb8
KH
2565 else if (c >= '0' && c <= '4')
2566 {
2567 /* ESC <Fp> for start/end composition. */
ff0dacd7 2568 found |= CATEGORY_MASK_ISO;
ec6d2bb8
KH
2569 break;
2570 }
bf9cdd4e 2571 else
df7492f9 2572 {
ff0dacd7 2573 /* Invalid escape sequence. Just ignore it. */
df7492f9
KH
2574 break;
2575 }
d46c5b12
KH
2576
2577 /* We found a valid designation sequence for CHARSET. */
ff0dacd7 2578 rejected |= CATEGORY_MASK_ISO_8BIT;
df7492f9
KH
2579 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2580 id))
ff0dacd7 2581 found |= CATEGORY_MASK_ISO_7;
d46c5b12 2582 else
ff0dacd7 2583 rejected |= CATEGORY_MASK_ISO_7;
df7492f9
KH
2584 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2585 id))
ff0dacd7 2586 found |= CATEGORY_MASK_ISO_7_TIGHT;
d46c5b12 2587 else
ff0dacd7 2588 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
df7492f9
KH
2589 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2590 id))
ff0dacd7 2591 found |= CATEGORY_MASK_ISO_7_ELSE;
ae9ff118 2592 else
ff0dacd7 2593 rejected |= CATEGORY_MASK_ISO_7_ELSE;
df7492f9
KH
2594 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2595 id))
ff0dacd7 2596 found |= CATEGORY_MASK_ISO_8_ELSE;
ae9ff118 2597 else
ff0dacd7 2598 rejected |= CATEGORY_MASK_ISO_8_ELSE;
4ed46869
KH
2599 break;
2600
4ed46869 2601 case ISO_CODE_SO:
d46c5b12 2602 case ISO_CODE_SI:
ff0dacd7 2603 /* Locking shift out/in. */
74383408
KH
2604 if (inhibit_iso_escape_detection)
2605 break;
f46869e4 2606 single_shifting = 0;
ff0dacd7
KH
2607 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2608 found |= CATEGORY_MASK_ISO_ELSE;
d46c5b12
KH
2609 break;
2610
4ed46869 2611 case ISO_CODE_CSI:
ff0dacd7 2612 /* Control sequence introducer. */
f46869e4 2613 single_shifting = 0;
ff0dacd7
KH
2614 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2615 found |= CATEGORY_MASK_ISO_8_ELSE;
2616 goto check_extra_latin;
2617
2618
4ed46869
KH
2619 case ISO_CODE_SS2:
2620 case ISO_CODE_SS3:
ff0dacd7
KH
2621 /* Single shift. */
2622 if (inhibit_iso_escape_detection)
2623 break;
2624 single_shifting = 1;
2625 rejected |= CATEGORY_MASK_ISO_7BIT;
2626 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2627 & CODING_ISO_FLAG_SINGLE_SHIFT)
2628 found |= CATEGORY_MASK_ISO_8_1;
2629 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2630 & CODING_ISO_FLAG_SINGLE_SHIFT)
2631 found |= CATEGORY_MASK_ISO_8_2;
2632 goto check_extra_latin;
4ed46869
KH
2633
2634 default:
065e3595
KH
2635 if (c < 0)
2636 continue;
4ed46869 2637 if (c < 0x80)
f46869e4
KH
2638 {
2639 single_shifting = 0;
2640 break;
2641 }
ff0dacd7 2642 if (c >= 0xA0)
c4825358 2643 {
ff0dacd7
KH
2644 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2645 found |= CATEGORY_MASK_ISO_8_1;
f46869e4 2646 /* Check the length of succeeding codes of the range
ff0dacd7
KH
2647 0xA0..0FF. If the byte length is even, we include
2648 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2649 only when we are not single shifting. */
2650 if (! single_shifting
2651 && ! (rejected & CATEGORY_MASK_ISO_8_2))
f46869e4 2652 {
e17de821 2653 int i = 1;
b73bfc1c
KH
2654 while (src < src_end)
2655 {
df7492f9 2656 ONE_MORE_BYTE (c);
b73bfc1c
KH
2657 if (c < 0xA0)
2658 break;
2659 i++;
2660 }
2661
2662 if (i & 1 && src < src_end)
ff0dacd7 2663 rejected |= CATEGORY_MASK_ISO_8_2;
f46869e4 2664 else
ff0dacd7 2665 found |= CATEGORY_MASK_ISO_8_2;
f46869e4 2666 }
ff0dacd7 2667 break;
4ed46869 2668 }
ff0dacd7
KH
2669 check_extra_latin:
2670 single_shifting = 0;
2671 if (! VECTORP (Vlatin_extra_code_table)
2672 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2673 {
2674 rejected = CATEGORY_MASK_ISO;
2675 break;
2676 }
2677 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2678 & CODING_ISO_FLAG_LATIN_EXTRA)
2679 found |= CATEGORY_MASK_ISO_8_1;
2680 else
2681 rejected |= CATEGORY_MASK_ISO_8_1;
2682 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2683 & CODING_ISO_FLAG_LATIN_EXTRA)
2684 found |= CATEGORY_MASK_ISO_8_2;
2685 else
2686 rejected |= CATEGORY_MASK_ISO_8_2;
4ed46869
KH
2687 }
2688 }
ff0dacd7
KH
2689 detect_info->rejected |= CATEGORY_MASK_ISO;
2690 return 0;
4ed46869 2691
df7492f9 2692 no_more_source:
ff0dacd7
KH
2693 detect_info->rejected |= rejected;
2694 detect_info->found |= (found & ~rejected);
df7492f9 2695 return 1;
4ed46869 2696}
ec6d2bb8 2697
4ed46869
KH
2698
2699/* Set designation state into CODING. */
df7492f9
KH
/* Record in CODING that the charset selected by DIM, CHARS_96 and
   FINAL is designated to graphic register REG.

   If FINAL is outside '0'..127, if ISO_CHARSET_TABLE has no charset
   for the combination, or if the charset is not safe for CODING, the
   designation of REG is set to -2 and control jumps to
   `invalid_code'.

   The previous designation of REG is read into PREV before it is
   overwritten, because the final test needs it.  Before the charset
   is stored, charset_jisx0201_roman is replaced by charset_ascii
   when CODING_ISO_FLAG_USE_ROMAN is set, and charset_jisx0208_1978
   by charset_jisx0208 when CODING_ISO_FLAG_USE_OLDJIS is set.  */
2700#define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2701 do { \
2702 int id, prev; \
2703 \
2704 if (final < '0' || final >= 128 \
2705 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2706 || !SAFE_CHARSET_P (coding, id)) \
2707 { \
2708 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2709 goto invalid_code; \
2710 } \
2711 prev = CODING_ISO_DESIGNATION (coding, reg); \
bf16eb23
KH
2712 if (id == charset_jisx0201_roman) \
2713 { \
2714 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2715 id = charset_ascii; \
2716 } \
2717 else if (id == charset_jisx0208_1978) \
2718 { \
2719 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2720 id = charset_jisx0208; \
2721 } \
df7492f9
KH
2722 CODING_ISO_DESIGNATION (coding, reg) = id; \
2723 /* If there was an invalid designation to REG previously, and this \
2724 designation is ASCII to REG, we should keep this designation \
2725 sequence. */ \
2726 if (prev == -2 && id == charset_ascii) \
2727 goto invalid_code; \
4ed46869
KH
2728 } while (0)
2729
d46c5b12 2730
df7492f9
KH
/* If a composition is in progress, abandon it: flush the characters
   collected so far in `components' to CHARBUF as plain characters and
   return to the COMPOSING_NO state.  Jumps to `no_more_source' when
   CHARBUF lacks room for the pending components.  */
#define MAYBE_FINISH_COMPOSITION()                                      \
  do {                                                                  \
    if (composition_state != COMPOSING_NO)                              \
      {                                                                 \
        int idx, stride;                                                \
                                                                        \
        if (charbuf + component_idx > charbuf_end)                      \
          goto no_more_source;                                          \
        composition_state = COMPOSING_NO;                               \
        /* Without rules every slot is a character; with rules the      \
           characters sit in the even slots only.  */                   \
        stride = ((method == COMPOSITION_RELATIVE                       \
                   || method == COMPOSITION_WITH_ALTCHARS)              \
                  ? 1 : 2);                                             \
        for (idx = 0; idx < component_idx; idx += stride)               \
          *charbuf++ = components[idx];                                 \
        /* NOTE(review): for the rule-based case this adds              \
           (component_idx / 2) + 1 even when the loop above emitted     \
           fewer characters (even COMPONENT_IDX) -- confirm.  */        \
        if (stride == 1)                                                \
          char_offset += component_idx;                                 \
        else                                                            \
          char_offset += (component_idx / 2) + 1;                       \
      }                                                                 \
  } while (0)
2755
d46c5b12 2756
aa72b389
KH
/* Handle a composition start sequence: ESC 0, ESC 2, ESC 3, or ESC 4.
   C1 is the byte following ESC.
   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition  : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   An ESC 0 seen while in the COMPOSING_COMPONENT_RULE state only
   closes the component list of the current composition.  */

#define DECODE_COMPOSITION_START(c1)                                    \
  do {                                                                  \
    if (c1 == '0'                                                       \
        && composition_state == COMPOSING_COMPONENT_RULE)               \
      {                                                                 \
        component_len = component_idx;                                  \
        composition_state = COMPOSING_CHAR;                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        const unsigned char *scan;                                      \
                                                                        \
        MAYBE_FINISH_COMPOSITION ();                                    \
        if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end)         \
          goto no_more_source;                                          \
        /* Look ahead for the closing ESC 1.  */                        \
        scan = src;                                                     \
        while (scan < src_end - 1                                       \
               && ! (*scan == ISO_CODE_ESC && scan[1] == '1'))          \
          scan++;                                                       \
        if (scan == src_end - 1)                                        \
          {                                                             \
            /* No terminator in this block: an error if no more         \
               input will come, otherwise wait for the next block.  */  \
            if (coding->mode & CODING_MODE_LAST_BLOCK)                  \
              goto invalid_code;                                        \
            goto no_more_source;                                        \
          }                                                             \
                                                                        \
        /* This is surely the start of a composition.  */               \
        if (c1 == '0')                                                  \
          method = COMPOSITION_RELATIVE;                                \
        else if (c1 == '2')                                             \
          method = COMPOSITION_WITH_RULE;                               \
        else if (c1 == '3')                                             \
          method = COMPOSITION_WITH_ALTCHARS;                           \
        else                                                            \
          method = COMPOSITION_WITH_RULE_ALTCHARS;                      \
        if (c1 <= '2')                                                  \
          composition_state = COMPOSING_CHAR;                           \
        else                                                            \
          composition_state = COMPOSING_COMPONENT_CHAR;                 \
        component_idx = component_len = 0;                              \
      }                                                                 \
  } while (0)
2799
ec6d2bb8 2800
df7492f9
KH
/* Handle the composition end sequence ESC 1: write the composition
   annotation followed by the composed characters to CHARBUF, mark
   CODING as annotated and leave the composing state.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    int *annotation_head = charbuf;                                     \
    int start_pos = char_offset;                                        \
    int end_pos, char_count, k;                                         \
                                                                        \
    if (component_len > 0)                                              \
      char_count = component_idx - component_len;                       \
    else if (method == COMPOSITION_RELATIVE)                            \
      char_count = component_idx;                                       \
    else                                                                \
      char_count = (component_idx + 1) / 2;                             \
    end_pos = start_pos + char_count;                                   \
                                                                        \
    ADD_COMPOSITION_DATA (charbuf, start_pos, end_pos, method);         \
    if (method != COMPOSITION_RELATIVE)                                 \
      {                                                                 \
        /* Append the component list to the annotation, then store      \
           the (negative) annotation length in its first slot.  */      \
        int listed = (component_len == 0                                \
                      ? component_idx : component_len);                 \
                                                                        \
        for (k = 0; k < listed; k++)                                    \
          *charbuf++ = components[k];                                   \
        *annotation_head = annotation_head - charbuf;                   \
      }                                                                 \
    if (method == COMPOSITION_WITH_RULE)                                \
      {                                                                 \
        /* Characters and rules alternate; emit the characters.  */     \
        k = 0;                                                          \
        while (k < component_idx)                                       \
          {                                                             \
            *charbuf++ = components[k];                                 \
            k += 2;                                                     \
            char_offset++;                                              \
          }                                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        k = component_len;                                              \
        while (k < component_idx)                                       \
          {                                                             \
            *charbuf++ = components[k];                                 \
            k++;                                                        \
            char_offset++;                                              \
          }                                                             \
      }                                                                 \
    coding->annotated = 1;                                              \
    composition_state = COMPOSING_NO;                                   \
  } while (0)
2833
df7492f9 2834
ec6d2bb8
KH
/* Decode a composition rule from the byte C1 (and, for the new
   format, one more byte read from SRC into the enclosing C2) and
   store the encoded rule back into C1.  C1 becomes 0 when the byte
   is in neither format.  */

#define DECODE_COMPOSITION_RULE(c1)                                     \
  do {                                                                  \
    (c1) -= 32;                                                         \
    if (c1 >= 93)                                                       \
      c1 = 0;                                                           \
    else if (c1 >= 81)          /* new format (after ver.21) */         \
      {                                                                 \
        ONE_MORE_BYTE (c2);                                             \
        c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);                \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int global_ref = (c1) / 9;                                      \
        int new_ref = (c1) % 9;                                         \
                                                                        \
        /* Reference point 4 of the old format maps to 10.  */          \
        global_ref = (global_ref == 4 ? 10 : global_ref);               \
        new_ref = (new_ref == 4 ? 10 : new_ref);                        \
        c1 = COMPOSITION_ENCODE_RULE (global_ref, new_ref);             \
      }                                                                 \
  } while (0)
88993dfd 2858
d46c5b12 2859
4ed46869
KH
2860/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2861
b73bfc1c 2862static void
df7492f9 2863decode_coding_iso_2022 (coding)
4ed46869 2864 struct coding_system *coding;
4ed46869 2865{
8f924df7
KH
2866 const unsigned char *src = coding->source + coding->consumed;
2867 const unsigned char *src_end = coding->source + coding->src_bytes;
2868 const unsigned char *src_base;
df7492f9 2869 int *charbuf = coding->charbuf;
ff0dacd7
KH
2870 int *charbuf_end
2871 = charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
df7492f9 2872 int consumed_chars = 0, consumed_chars_base;
df7492f9 2873 int multibytep = coding->src_multibyte;
4ed46869 2874 /* Charsets invoked to graphic plane 0 and 1 respectively. */
df7492f9
KH
2875 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2876 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2877 struct charset *charset;
2878 int c;
2879 /* For handling composition sequence. */
2880#define COMPOSING_NO 0
2881#define COMPOSING_CHAR 1
2882#define COMPOSING_RULE 2
2883#define COMPOSING_COMPONENT_CHAR 3
2884#define COMPOSING_COMPONENT_RULE 4
2885
2886 int composition_state = COMPOSING_NO;
2887 enum composition_method method;
2888 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2889 int component_idx;
2890 int component_len;
24a73b0a 2891 Lisp_Object attrs, charset_list;
ff0dacd7
KH
2892 int char_offset = coding->produced_char;
2893 int last_offset = char_offset;
2894 int last_id = charset_ascii;
df7492f9 2895
24a73b0a 2896 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9 2897 setup_iso_safe_charsets (attrs);
b73bfc1c
KH
2898
2899 while (1)
4ed46869 2900 {
463f5630 2901 int c1, c2;
b73bfc1c
KH
2902
2903 src_base = src;
df7492f9
KH
2904 consumed_chars_base = consumed_chars;
2905
2906 if (charbuf >= charbuf_end)
2907 break;
2908
b73bfc1c 2909 ONE_MORE_BYTE (c1);
065e3595
KH
2910 if (c1 < 0)
2911 goto invalid_code;
4ed46869 2912
98725083 2913 /* We produce at most one character. */
4ed46869
KH
2914 switch (iso_code_class [c1])
2915 {
2916 case ISO_0x20_or_0x7F:
df7492f9 2917 if (composition_state != COMPOSING_NO)
ec6d2bb8 2918 {
df7492f9
KH
2919 if (composition_state == COMPOSING_RULE
2920 || composition_state == COMPOSING_COMPONENT_RULE)
2921 {
2922 DECODE_COMPOSITION_RULE (c1);
2923 components[component_idx++] = c1;
2924 composition_state--;
2925 continue;
2926 }
4ed46869 2927 }
df7492f9
KH
2928 if (charset_id_0 < 0
2929 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
781d7a48
KH
2930 /* This is SPACE or DEL. */
2931 charset = CHARSET_FROM_ID (charset_ascii);
2932 else
2933 charset = CHARSET_FROM_ID (charset_id_0);
2934 break;
4ed46869
KH
2935
2936 case ISO_graphic_plane_0:
781d7a48 2937 if (composition_state != COMPOSING_NO)
b73bfc1c 2938 {
781d7a48
KH
2939 if (composition_state == COMPOSING_RULE
2940 || composition_state == COMPOSING_COMPONENT_RULE)
2941 {
2942 DECODE_COMPOSITION_RULE (c1);
2943 components[component_idx++] = c1;
2944 composition_state--;
2945 continue;
2946 }
b73bfc1c 2947 }
df7492f9 2948 charset = CHARSET_FROM_ID (charset_id_0);
4ed46869
KH
2949 break;
2950
2951 case ISO_0xA0_or_0xFF:
df7492f9
KH
2952 if (charset_id_1 < 0
2953 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2954 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2955 goto invalid_code;
4ed46869
KH
2956 /* This is a graphic character, we fall down ... */
2957
2958 case ISO_graphic_plane_1:
df7492f9
KH
2959 if (charset_id_1 < 0)
2960 goto invalid_code;
2961 charset = CHARSET_FROM_ID (charset_id_1);
4ed46869
KH
2962 break;
2963
df7492f9
KH
2964 case ISO_control_0:
2965 MAYBE_FINISH_COMPOSITION ();
2966 charset = CHARSET_FROM_ID (charset_ascii);
4ed46869
KH
2967 break;
2968
df7492f9
KH
2969 case ISO_control_1:
2970 MAYBE_FINISH_COMPOSITION ();
2971 goto invalid_code;
2972
4ed46869 2973 case ISO_shift_out:
df7492f9
KH
2974 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2975 || CODING_ISO_DESIGNATION (coding, 1) < 0)
2976 goto invalid_code;
2977 CODING_ISO_INVOCATION (coding, 0) = 1;
2978 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 2979 continue;
4ed46869
KH
2980
2981 case ISO_shift_in:
df7492f9
KH
2982 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
2983 goto invalid_code;
2984 CODING_ISO_INVOCATION (coding, 0) = 0;
2985 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 2986 continue;
4ed46869
KH
2987
2988 case ISO_single_shift_2_7:
2989 case ISO_single_shift_2:
df7492f9
KH
2990 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2991 goto invalid_code;
4ed46869
KH
2992 /* SS2 is handled as an escape sequence of ESC 'N' */
2993 c1 = 'N';
2994 goto label_escape_sequence;
2995
2996 case ISO_single_shift_3:
df7492f9
KH
2997 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2998 goto invalid_code;
4ed46869
KH
2999 /* SS2 is handled as an escape sequence of ESC 'O' */
3000 c1 = 'O';
3001 goto label_escape_sequence;
3002
3003 case ISO_control_sequence_introducer:
3004 /* CSI is handled as an escape sequence of ESC '[' ... */
3005 c1 = '[';
3006 goto label_escape_sequence;
3007
3008 case ISO_escape:
3009 ONE_MORE_BYTE (c1);
3010 label_escape_sequence:
df7492f9 3011 /* Escape sequences handled here are invocation,
4ed46869
KH
3012 designation, direction specification, and character
3013 composition specification. */
3014 switch (c1)
3015 {
3016 case '&': /* revision of following character set */
3017 ONE_MORE_BYTE (c1);
3018 if (!(c1 >= '@' && c1 <= '~'))
df7492f9 3019 goto invalid_code;
4ed46869
KH
3020 ONE_MORE_BYTE (c1);
3021 if (c1 != ISO_CODE_ESC)
df7492f9 3022 goto invalid_code;
4ed46869
KH
3023 ONE_MORE_BYTE (c1);
3024 goto label_escape_sequence;
3025
3026 case '$': /* designation of 2-byte character set */
df7492f9
KH
3027 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3028 goto invalid_code;
4ed46869
KH
3029 ONE_MORE_BYTE (c1);
3030 if (c1 >= '@' && c1 <= 'B')
3031 { /* designation of JISX0208.1978, GB2312.1980,
88993dfd 3032 or JISX0208.1980 */
df7492f9 3033 DECODE_DESIGNATION (0, 2, 0, c1);
4ed46869
KH
3034 }
3035 else if (c1 >= 0x28 && c1 <= 0x2B)
3036 { /* designation of DIMENSION2_CHARS94 character set */
3037 ONE_MORE_BYTE (c2);
df7492f9 3038 DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2);
4ed46869
KH
3039 }
3040 else if (c1 >= 0x2C && c1 <= 0x2F)
3041 { /* designation of DIMENSION2_CHARS96 character set */
3042 ONE_MORE_BYTE (c2);
df7492f9 3043 DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2);
4ed46869
KH
3044 }
3045 else
df7492f9 3046 goto invalid_code;
b73bfc1c 3047 /* We must update these variables now. */
df7492f9
KH
3048 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3049 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
b73bfc1c 3050 continue;
4ed46869
KH
3051
3052 case 'n': /* invocation of locking-shift-2 */
df7492f9
KH
3053 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3054 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3055 goto invalid_code;
3056 CODING_ISO_INVOCATION (coding, 0) = 2;
3057 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3058 continue;
4ed46869
KH
3059
3060 case 'o': /* invocation of locking-shift-3 */
df7492f9
KH
3061 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3062 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3063 goto invalid_code;
3064 CODING_ISO_INVOCATION (coding, 0) = 3;
3065 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
b73bfc1c 3066 continue;
4ed46869
KH
3067
3068 case 'N': /* invocation of single-shift-2 */
df7492f9
KH
3069 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3070 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3071 goto invalid_code;
3072 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2));
b73bfc1c 3073 ONE_MORE_BYTE (c1);
e7046a18 3074 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
df7492f9 3075 goto invalid_code;
4ed46869
KH
3076 break;
3077
3078 case 'O': /* invocation of single-shift-3 */
df7492f9
KH
3079 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3080 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3081 goto invalid_code;
3082 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3));
b73bfc1c 3083 ONE_MORE_BYTE (c1);
e7046a18 3084 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
df7492f9 3085 goto invalid_code;
4ed46869
KH
3086 break;
3087
ec6d2bb8 3088 case '0': case '2': case '3': case '4': /* start composition */
df7492f9
KH
3089 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3090 goto invalid_code;
ec6d2bb8 3091 DECODE_COMPOSITION_START (c1);
b73bfc1c 3092 continue;
4ed46869 3093
ec6d2bb8 3094 case '1': /* end composition */
df7492f9
KH
3095 if (composition_state == COMPOSING_NO)
3096 goto invalid_code;
3097 DECODE_COMPOSITION_END ();
b73bfc1c 3098 continue;
4ed46869
KH
3099
3100 case '[': /* specification of direction */
df7492f9
KH
3101 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3102 goto invalid_code;
4ed46869 3103 /* For the moment, nested direction is not supported.
d46c5b12 3104 So, `coding->mode & CODING_MODE_DIRECTION' zero means
df7492f9 3105 left-to-right, and nozero means right-to-left. */
4ed46869
KH
3106 ONE_MORE_BYTE (c1);
3107 switch (c1)
3108 {
3109 case ']': /* end of the current direction */
d46c5b12 3110 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869
KH
3111
3112 case '0': /* end of the current direction */
3113 case '1': /* start of left-to-right direction */
3114 ONE_MORE_BYTE (c1);
3115 if (c1 == ']')
d46c5b12 3116 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869 3117 else
df7492f9 3118 goto invalid_code;
4ed46869
KH
3119 break;
3120
3121 case '2': /* start of right-to-left direction */
3122 ONE_MORE_BYTE (c1);
3123 if (c1 == ']')
d46c5b12 3124 coding->mode |= CODING_MODE_DIRECTION;
4ed46869 3125 else
df7492f9 3126 goto invalid_code;
4ed46869
KH
3127 break;
3128
3129 default:
df7492f9 3130 goto invalid_code;
4ed46869 3131 }
b73bfc1c 3132 continue;
4ed46869 3133
103e0180 3134 case '%':
103e0180
KH
3135 ONE_MORE_BYTE (c1);
3136 if (c1 == '/')
3137 {
3138 /* CTEXT extended segment:
3139 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3140 We keep these bytes as is for the moment.
3141 They may be decoded by post-read-conversion. */
3142 int dim, M, L;
4776e638 3143 int size;
8f924df7 3144
103e0180
KH
3145 ONE_MORE_BYTE (dim);
3146 ONE_MORE_BYTE (M);
3147 ONE_MORE_BYTE (L);
3148 size = ((M - 128) * 128) + (L - 128);
4776e638
KH
3149 if (charbuf + 8 + size > charbuf_end)
3150 goto break_loop;
3151 *charbuf++ = ISO_CODE_ESC;
3152 *charbuf++ = '%';
3153 *charbuf++ = '/';
3154 *charbuf++ = dim;
3155 *charbuf++ = BYTE8_TO_CHAR (M);
3156 *charbuf++ = BYTE8_TO_CHAR (L);
103e0180
KH
3157 while (size-- > 0)
3158 {
3159 ONE_MORE_BYTE (c1);
4776e638 3160 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
103e0180 3161 }
103e0180
KH
3162 }
3163 else if (c1 == 'G')
3164 {
103e0180
KH
3165 /* XFree86 extension for embedding UTF-8 in CTEXT:
3166 ESC % G --UTF-8-BYTES-- ESC % @
3167 We keep these bytes as is for the moment.
3168 They may be decoded by post-read-conversion. */
4776e638
KH
3169 int *p = charbuf;
3170
3171 if (p + 6 > charbuf_end)
3172 goto break_loop;
3173 *p++ = ISO_CODE_ESC;
3174 *p++ = '%';
3175 *p++ = 'G';
3176 while (p < charbuf_end)
103e0180
KH
3177 {
3178 ONE_MORE_BYTE (c1);
3179 if (c1 == ISO_CODE_ESC
3180 && src + 1 < src_end
3181 && src[0] == '%'
3182 && src[1] == '@')
3183 break;
4776e638 3184 *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
103e0180 3185 }
4776e638
KH
3186 if (p + 3 > charbuf_end)
3187 goto break_loop;
3188 *p++ = ISO_CODE_ESC;
3189 *p++ = '%';
3190 *p++ = '@';
3191 charbuf = p;
103e0180
KH
3192 }
3193 else
4776e638 3194 goto invalid_code;
103e0180 3195 continue;
4776e638 3196 break;
103e0180 3197
4ed46869 3198 default:
df7492f9
KH
3199 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3200 goto invalid_code;
4ed46869
KH
3201 if (c1 >= 0x28 && c1 <= 0x2B)
3202 { /* designation of DIMENSION1_CHARS94 character set */
3203 ONE_MORE_BYTE (c2);
df7492f9 3204 DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2);
4ed46869
KH
3205 }
3206 else if (c1 >= 0x2C && c1 <= 0x2F)
3207 { /* designation of DIMENSION1_CHARS96 character set */
3208 ONE_MORE_BYTE (c2);
df7492f9 3209 DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2);
4ed46869
KH
3210 }
3211 else
df7492f9 3212 goto invalid_code;
b73bfc1c 3213 /* We must update these variables now. */
df7492f9
KH
3214 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3215 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
b73bfc1c 3216 continue;
4ed46869 3217 }
b73bfc1c 3218 }
4ed46869 3219
ff0dacd7
KH
3220 if (charset->id != charset_ascii
3221 && last_id != charset->id)
3222 {
3223 if (last_id != charset_ascii)
3224 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3225 last_id = charset->id;
3226 last_offset = char_offset;
3227 }
3228
b73bfc1c 3229 /* Now we know CHARSET and 1st position code C1 of a character.
df7492f9
KH
3230 Produce a decoded character while getting 2nd position code
3231 C2 if necessary. */
3232 c1 &= 0x7F;
3233 if (CHARSET_DIMENSION (charset) > 1)
b73bfc1c
KH
3234 {
3235 ONE_MORE_BYTE (c2);
df7492f9 3236 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
b73bfc1c 3237 /* C2 is not in a valid range. */
df7492f9
KH
3238 goto invalid_code;
3239 c1 = (c1 << 8) | (c2 & 0x7F);
3240 if (CHARSET_DIMENSION (charset) > 2)
3241 {
3242 ONE_MORE_BYTE (c2);
3243 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3244 /* C2 is not in a valid range. */
3245 goto invalid_code;
3246 c1 = (c1 << 8) | (c2 & 0x7F);
3247 }
3248 }
3249
3250 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3251 if (c < 0)
3252 {
3253 MAYBE_FINISH_COMPOSITION ();
3254 for (; src_base < src; src_base++, char_offset++)
3255 {
3256 if (ASCII_BYTE_P (*src_base))
3257 *charbuf++ = *src_base;
3258 else
3259 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3260 }
3261 }
3262 else if (composition_state == COMPOSING_NO)
3263 {
3264 *charbuf++ = c;
3265 char_offset++;
4ed46869 3266 }
df7492f9 3267 else
781d7a48
KH
3268 {
3269 components[component_idx++] = c;
3270 if (method == COMPOSITION_WITH_RULE
3271 || (method == COMPOSITION_WITH_RULE_ALTCHARS
3272 && composition_state == COMPOSING_COMPONENT_CHAR))
3273 composition_state++;
4ed46869
KH
3274 }
3275 continue;
3276
df7492f9
KH
3277 invalid_code:
3278 MAYBE_FINISH_COMPOSITION ();
4ed46869 3279 src = src_base;
df7492f9
KH
3280 consumed_chars = consumed_chars_base;
3281 ONE_MORE_BYTE (c);
065e3595 3282 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 3283 char_offset++;
df7492f9 3284 coding->errors++;
4776e638
KH
3285 continue;
3286
3287 break_loop:
3288 break;
4ed46869 3289 }
fb88bf2d 3290
df7492f9 3291 no_more_source:
ff0dacd7
KH
3292 if (last_id != charset_ascii)
3293 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
df7492f9
KH
3294 coding->consumed_char += consumed_chars_base;
3295 coding->consumed = src_base - coding->source;
3296 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
3297}
3298
b73bfc1c 3299
f4dee582 3300/* ISO2022 encoding stuff. */
4ed46869
KH
3301
3302/*
f4dee582 3303 It is not enough to say just "ISO2022" on encoding, we have to
df7492f9 3304 specify more details. In Emacs, each coding system of ISO2022
4ed46869 3305 variant has the following specifications:
df7492f9 3306 1. Initial designation to G0 thru G3.
4ed46869
KH
3307 2. Allows short-form designation?
3308 3. ASCII should be designated to G0 before control characters?
3309 4. ASCII should be designated to G0 at end of line?
3310 5. 7-bit environment or 8-bit environment?
3311 6. Use locking-shift?
3312 7. Use Single-shift?
3313 And the following two are only for Japanese:
3314 8. Use ASCII in place of JIS0201-1976-Roman?
3315 9. Use JISX0208-1983 in place of JISX0208-1978?
df7492f9
KH
3316 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3317 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
f4dee582 3318 details.
4ed46869
KH
3319*/
3320
/* Emit the escape sequence that designates CHARSET to graphic
   register REG, and record the designation in CODING.  When the
   coding system asks for revision numbers, the sequence is preceded
   by ESC & <revision>.  For a 94^N charset designated to register 0
   whose <final-char> is '@', 'A', or 'B', the short form (no
   intermediate character) is used unless CODING requires the long
   form.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
    char *intermediate_char_94 = "()*+";                                \
    char *intermediate_char_96 = ",-./";                                \
    int revision = -1;                                                  \
    int intermediate;                                                   \
                                                                        \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
      revision = CHARSET_ISO_REVISION (charset);                        \
                                                                        \
    if (revision >= 0)                                                  \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + revision);                                 \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        /* Multi-byte charset: ESC $ [intermediate] final.  */          \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        if (CHARSET_ISO_CHARS_96 (charset))                             \
          EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
        else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM  \
                 || reg != 0                                            \
                 || final_char < '@' || final_char > 'B')               \
          EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);              \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* Single-byte charset: ESC intermediate final.  */             \
        intermediate = (CHARSET_ISO_CHARS_96 (charset)                  \
                        ? intermediate_char_96[reg]                     \
                        : intermediate_char_94[reg]);                   \
        EMIT_ONE_ASCII_BYTE (intermediate);                             \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (final_char);                                   \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
3368
df7492f9 3369
4ed46869
KH
/* Single-shift functions (single-shift-2 and single-shift-3).  In a
   7-bit coding system the escape-sequence form ESC N / ESC O is
   emitted, otherwise the control byte SS2 / SS3.  Either way CODING
   is flagged as single-shifting for the next character.  */

#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)


#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
3392
df7492f9 3393
4ed46869
KH
/* Locking-shift functions (shift-in, shift-out, locking-shift-2, and
   locking-shift-3).  Each macro emits the control character or escape
   sequence and records which graphic register is now invoked to
   graphic plane 0.  */

/* SI: invoke graphic register 0.  */
#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
  } while (0)


/* SO: invoke graphic register 1.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
  } while (0)


/* ESC n: invoke graphic register 2.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
  } while (0)
3417
df7492f9
KH
3418
/* Locking-shift-3: emit ESC o and record that graphic register 3 is
   now invoked to graphic plane 0.  The final byte must be 'o'; ESC n
   is locking-shift-2 and would make a decoder invoke register 2.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
3424
df7492f9 3425
f4dee582
RS
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be an lvalue: when the coding system uses
   JISX0201-Roman in place of ASCII, it is replaced in place.  C1 is
   parenthesized at every use so that an expression argument is masked
   as a whole.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single shift was just emitted; it covers exactly this      \
           one character.  */                                           \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
3468
df7492f9 3469
f4dee582
RS
/* Emit a two-byte (DIMENSION2) character of CHARSET with position
   codes C1 and C2, first emitting whatever designation and invocation
   codes are still needed.

   CHARSET must be an lvalue: when the coding system uses
   JISX0208-1978 in place of JISX0208, it is replaced in place.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if (id == charset_jisx0208                                          \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        id = charset_jisx0208_1978;                                     \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* The pending single shift applies to this character only.  */ \
        if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)) \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        else                                                            \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))                   \
      {                                                                 \
        /* Invoked to graphic plane 0: 7-bit codes.  */                 \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))                   \
      {                                                                 \
        /* Invoked to graphic plane 1: 8-bit codes.  */                 \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to any graphic plane yet.  Designate      \
       and/or invoke it, then go around again to emit the               \
       character itself.  */                                            \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
3512
05e6f5dc 3513
df7492f9
KH
/* Encode character C of CHARSET: compute its code point in CHARSET
   and emit it with the one-byte or two-byte macro according to the
   charset's dimension.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                \
  do {                                                                  \
    int code = ENCODE_CHAR ((charset),(c));                             \
                                                                        \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        /* High byte is the 1st position code, low byte the 2nd.  */    \
        ENCODE_ISO_CHARACTER_DIMENSION2 ((charset),                     \
                                         code >> 8, code & 0xFF);       \
      }                                                                 \
    else                                                                \
      {                                                                 \
        ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);              \
      }                                                                 \
  } while (0)
bdd9fb48 3523
05e6f5dc 3524
4ed46869 3525/* Produce designation and invocation codes at a place pointed by DST
df7492f9 3526 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
4ed46869
KH
3527 Return new DST. */
3528
3529unsigned char *
df7492f9
KH
3530encode_invocation_designation (charset, coding, dst, p_nchars)
3531 struct charset *charset;
4ed46869
KH
3532 struct coding_system *coding;
3533 unsigned char *dst;
df7492f9 3534 int *p_nchars;
4ed46869 3535{
df7492f9
KH
3536 int multibytep = coding->dst_multibyte;
3537 int produced_chars = *p_nchars;
4ed46869 3538 int reg; /* graphic register number */
df7492f9 3539 int id = CHARSET_ID (charset);
4ed46869
KH
3540
3541 /* At first, check designations. */
3542 for (reg = 0; reg < 4; reg++)
df7492f9 3543 if (id == CODING_ISO_DESIGNATION (coding, reg))
4ed46869
KH
3544 break;
3545
3546 if (reg >= 4)
3547 {
3548 /* CHARSET is not yet designated to any graphic registers. */
3549 /* At first check the requested designation. */
df7492f9
KH
3550 reg = CODING_ISO_REQUEST (coding, id);
3551 if (reg < 0)
1ba9e4ab
KH
3552 /* Since CHARSET requests no special designation, designate it
3553 to graphic register 0. */
4ed46869
KH
3554 reg = 0;
3555
3556 ENCODE_DESIGNATION (charset, reg, coding);
3557 }
3558
df7492f9
KH
3559 if (CODING_ISO_INVOCATION (coding, 0) != reg
3560 && CODING_ISO_INVOCATION (coding, 1) != reg)
4ed46869
KH
3561 {
3562 /* Since the graphic register REG is not invoked to any graphic
3563 planes, invoke it to graphic plane 0. */
3564 switch (reg)
3565 {
3566 case 0: /* graphic register 0 */
3567 ENCODE_SHIFT_IN;
3568 break;
3569
3570 case 1: /* graphic register 1 */
3571 ENCODE_SHIFT_OUT;
3572 break;
3573
3574 case 2: /* graphic register 2 */
df7492f9 3575 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4ed46869
KH
3576 ENCODE_SINGLE_SHIFT_2;
3577 else
3578 ENCODE_LOCKING_SHIFT_2;
3579 break;
3580
3581 case 3: /* graphic register 3 */
df7492f9 3582 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4ed46869
KH
3583 ENCODE_SINGLE_SHIFT_3;
3584 else
3585 ENCODE_LOCKING_SHIFT_3;
3586 break;
3587 }
3588 }
b73bfc1c 3589
df7492f9 3590 *p_nchars = produced_chars;
4ed46869
KH
3591 return dst;
3592}
3593
df7492f9
KH
/* The following three macros produce codes for indicating direction
   of text.  */

/* Emit a control sequence introducer: ESC [ when the coding system
   is 7-bit, otherwise the CSI control byte.  The seven-bit flag is
   tested with `&' like every other flag, so that it is honored when
   other flags are set as well.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
  do {                                                                  \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
    else                                                                \
      EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
  } while (0)


/* Emit CSI 2 ]: start of right-to-left text.
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro and
   takes no argument list.  */
#define ENCODE_DIRECTION_R2L()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('2', ']');            \
  } while (0)


/* Emit CSI 0 ]: end of the current direction (back to
   left-to-right).  */
#define ENCODE_DIRECTION_L2R()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('0', ']');            \
  } while (0)
4ed46869 3617
4ed46869
KH
3618
/* Emit the codes that bring the graphic planes and registers back to
   their initial state: shift in if graphic plane 0 does not invoke
   register 0, then re-designate every register whose current charset
   differs from its (valid) initial one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reg = 0;                                                        \
    struct charset *initial_charset;                                    \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    while (reg < 4)                                                     \
      {                                                                 \
        if (CODING_ISO_INITIAL (coding, reg) >= 0                       \
            && (CODING_ISO_INITIAL (coding, reg)                        \
                != CODING_ISO_DESIGNATION (coding, reg)))               \
          {                                                             \
            initial_charset                                             \
              = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg));     \
            ENCODE_DESIGNATION (initial_charset, reg, coding);          \
          }                                                             \
        reg++;                                                          \
      }                                                                 \
  } while (0)
3637
df7492f9 3638
bdd9fb48 3639/* Produce designation sequences of charsets in the line started from
b73bfc1c 3640 SRC to a place pointed by DST, and return updated DST.
bdd9fb48
KH
3641
3642 If the current block ends before any end-of-line, we may fail to
d46c5b12
KH
3643 find all the necessary designations. */
3644
b73bfc1c 3645static unsigned char *
df7492f9 3646encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
e0e989f6 3647 struct coding_system *coding;
df7492f9
KH
3648 int *charbuf, *charbuf_end;
3649 unsigned char *dst;
e0e989f6 3650{
df7492f9 3651 struct charset *charset;
bdd9fb48
KH
3652 /* Table of charsets to be designated to each graphic register. */
3653 int r[4];
df7492f9
KH
3654 int c, found = 0, reg;
3655 int produced_chars = 0;
3656 int multibytep = coding->dst_multibyte;
3657 Lisp_Object attrs;
3658 Lisp_Object charset_list;
3659
3660 attrs = CODING_ID_ATTRS (coding->id);
3661 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3662 if (EQ (charset_list, Qiso_2022))
3663 charset_list = Viso_2022_charset_list;
bdd9fb48
KH
3664
3665 for (reg = 0; reg < 4; reg++)
3666 r[reg] = -1;
3667
b73bfc1c 3668 while (found < 4)
e0e989f6 3669 {
df7492f9
KH
3670 int id;
3671
3672 c = *charbuf++;
b73bfc1c
KH
3673 if (c == '\n')
3674 break;
df7492f9
KH
3675 charset = char_charset (c, charset_list, NULL);
3676 id = CHARSET_ID (charset);
3677 reg = CODING_ISO_REQUEST (coding, id);
3678 if (reg >= 0 && r[reg] < 0)
bdd9fb48
KH
3679 {
3680 found++;
df7492f9 3681 r[reg] = id;
bdd9fb48 3682 }
bdd9fb48
KH
3683 }
3684
3685 if (found)
3686 {
3687 for (reg = 0; reg < 4; reg++)
3688 if (r[reg] >= 0
df7492f9
KH
3689 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3690 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
e0e989f6 3691 }
b73bfc1c
KH
3692
3693 return dst;
e0e989f6
KH
3694}
3695
4ed46869
KH
3696/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3697
df7492f9
KH
3698static int
3699encode_coding_iso_2022 (coding)
4ed46869 3700 struct coding_system *coding;
4ed46869 3701{
df7492f9
KH
3702 int multibytep = coding->dst_multibyte;
3703 int *charbuf = coding->charbuf;
3704 int *charbuf_end = charbuf + coding->charbuf_used;
3705 unsigned char *dst = coding->destination + coding->produced;
3706 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3707 int safe_room = 16;
3708 int bol_designation
3709 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3710 && CODING_ISO_BOL (coding));
3711 int produced_chars = 0;
3712 Lisp_Object attrs, eol_type, charset_list;
3713 int ascii_compatible;
b73bfc1c 3714 int c;
ff0dacd7 3715 int preferred_charset_id = -1;
05e6f5dc 3716
24a73b0a
KH
3717 CODING_GET_INFO (coding, attrs, charset_list);
3718 eol_type = CODING_ID_EOL_TYPE (coding->id);
3719 if (VECTORP (eol_type))
3720 eol_type = Qunix;
3721
004068e4 3722 setup_iso_safe_charsets (attrs);
ff0dacd7
KH
3723 /* Charset list may have been changed. */
3724 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
8f924df7 3725 coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
0eecad43 3726
df7492f9 3727 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
bdd9fb48 3728
df7492f9 3729 while (charbuf < charbuf_end)
4ed46869 3730 {
df7492f9 3731 ASSURE_DESTINATION (safe_room);
b73bfc1c 3732
df7492f9 3733 if (bol_designation)
b73bfc1c 3734 {
df7492f9 3735 unsigned char *dst_prev = dst;
4ed46869 3736
bdd9fb48 3737 /* We have to produce designation sequences if any now. */
df7492f9
KH
3738 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3739 bol_designation = 0;
3740 /* We are sure that designation sequences are all ASCII bytes. */
3741 produced_chars += dst - dst_prev;
e0e989f6
KH
3742 }
3743
df7492f9 3744 c = *charbuf++;
ec6d2bb8 3745
ff0dacd7
KH
3746 if (c < 0)
3747 {
3748 /* Handle an annotation. */
3749 switch (*charbuf)
ec6d2bb8 3750 {
ff0dacd7
KH
3751 case CODING_ANNOTATE_COMPOSITION_MASK:
3752 /* Not yet implemented. */
3753 break;
3754 case CODING_ANNOTATE_CHARSET_MASK:
3755 preferred_charset_id = charbuf[3];
3756 if (preferred_charset_id >= 0
3757 && NILP (Fmemq (make_number (preferred_charset_id),
3758 charset_list)))
3759 preferred_charset_id = -1;
3760 break;
3761 default:
3762 abort ();
4ed46869 3763 }
ff0dacd7
KH
3764 charbuf += -c - 1;
3765 continue;
4ed46869 3766 }
ec6d2bb8 3767
b73bfc1c
KH
3768 /* Now encode the character C. */
3769 if (c < 0x20 || c == 0x7F)
3770 {
df7492f9
KH
3771 if (c == '\n'
3772 || (c == '\r' && EQ (eol_type, Qmac)))
19a8d9e0 3773 {
df7492f9
KH
3774 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3775 ENCODE_RESET_PLANE_AND_REGISTER ();
3776 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
b73bfc1c 3777 {
df7492f9
KH
3778 int i;
3779
3780 for (i = 0; i < 4; i++)
3781 CODING_ISO_DESIGNATION (coding, i)
3782 = CODING_ISO_INITIAL (coding, i);
b73bfc1c 3783 }
df7492f9
KH
3784 bol_designation
3785 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
19a8d9e0 3786 }
df7492f9
KH
3787 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3788 ENCODE_RESET_PLANE_AND_REGISTER ();
3789 EMIT_ONE_ASCII_BYTE (c);
4ed46869 3790 }
df7492f9 3791 else if (ASCII_CHAR_P (c))
88993dfd 3792 {
df7492f9
KH
3793 if (ascii_compatible)
3794 EMIT_ONE_ASCII_BYTE (c);
93dec019 3795 else
19a8d9e0 3796 {
bf16eb23
KH
3797 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3798 ENCODE_ISO_CHARACTER (charset, c);
19a8d9e0 3799 }
4ed46869 3800 }
16eafb5d 3801 else if (CHAR_BYTE8_P (c))
88993dfd 3802 {
16eafb5d
KH
3803 c = CHAR_TO_BYTE8 (c);
3804 EMIT_ONE_BYTE (c);
88993dfd 3805 }
b73bfc1c 3806 else
df7492f9 3807 {
ff0dacd7 3808 struct charset *charset;
b73bfc1c 3809
ff0dacd7
KH
3810 if (preferred_charset_id >= 0)
3811 {
3812 charset = CHARSET_FROM_ID (preferred_charset_id);
3813 if (! CHAR_CHARSET_P (c, charset))
3814 charset = char_charset (c, charset_list, NULL);
3815 }
3816 else
3817 charset = char_charset (c, charset_list, NULL);
df7492f9
KH
3818 if (!charset)
3819 {
41cbe562
KH
3820 if (coding->mode & CODING_MODE_SAFE_ENCODING)
3821 {
3822 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3823 charset = CHARSET_FROM_ID (charset_ascii);
3824 }
3825 else
3826 {
3827 c = coding->default_char;
3828 charset = char_charset (c, charset_list, NULL);
3829 }
df7492f9
KH
3830 }
3831 ENCODE_ISO_CHARACTER (charset, c);
3832 }
84fbb8a0 3833 }
b73bfc1c 3834
df7492f9
KH
3835 if (coding->mode & CODING_MODE_LAST_BLOCK
3836 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3837 {
3838 ASSURE_DESTINATION (safe_room);
3839 ENCODE_RESET_PLANE_AND_REGISTER ();
3840 }
065e3595 3841 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
3842 CODING_ISO_BOL (coding) = bol_designation;
3843 coding->produced_char += produced_chars;
3844 coding->produced = dst - coding->destination;
3845 return 0;
4ed46869
KH
3846}
3847
3848\f
df7492f9 3849/*** 8. SJIS and BIG5 handlers ***/
4ed46869 3850
df7492f9 3851/* Although SJIS and BIG5 are not ISO's coding system, they are used
4ed46869
KH
3852 quite widely. So, for the moment, Emacs supports them in the bare
3853 C code. But, in the future, they may be supported only by CCL. */
3854
3855/* SJIS is a coding system encoding three character sets: ASCII, right
3856 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3857 as is. A character of charset katakana-jisx0201 is encoded by
3858 "position-code + 0x80". A character of charset japanese-jisx0208
3859 is encoded in 2-byte but two position-codes are divided and shifted
df7492f9 3860 so that they fit in the ranges below.
4ed46869
KH
3861
3862 --- CODE RANGE of SJIS ---
3863 (character set) (range)
3864 ASCII 0x00 .. 0x7F
df7492f9 3865 KATAKANA-JISX0201 0xA0 .. 0xDF
c28a9453 3866 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
d14d03ac 3867 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4ed46869
KH
3868 -------------------------------
3869
3870*/
3871
3872/* BIG5 is a coding system encoding two character sets: ASCII and
3873 Big5. An ASCII character is encoded as is. Big5 is a two-byte
df7492f9 3874 character set and is encoded in two bytes.
4ed46869
KH
3875
3876 --- CODE RANGE of BIG5 ---
3877 (character set) (range)
3878 ASCII 0x00 .. 0x7F
3879 Big5 (1st byte) 0xA1 .. 0xFE
3880 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3881 --------------------------
3882
df7492f9 3883 */
4ed46869
KH
3884
3885/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3886 Check if a text is encoded in SJIS. If it is, return
df7492f9 3887 CATEGORY_MASK_SJIS, else return 0. */
4ed46869 3888
0a28aafb 3889static int
ff0dacd7 3890detect_coding_sjis (coding, detect_info)
df7492f9 3891 struct coding_system *coding;
ff0dacd7 3892 struct coding_detection_info *detect_info;
4ed46869 3893{
065e3595 3894 const unsigned char *src = coding->source, *src_base;
8f924df7 3895 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
3896 int multibytep = coding->src_multibyte;
3897 int consumed_chars = 0;
3898 int found = 0;
b73bfc1c 3899 int c;
df7492f9 3900
ff0dacd7 3901 detect_info->checked |= CATEGORY_MASK_SJIS;
df7492f9
KH
3902 /* A coding system of this category is always ASCII compatible. */
3903 src += coding->head_ascii;
4ed46869 3904
b73bfc1c 3905 while (1)
4ed46869 3906 {
065e3595 3907 src_base = src;
df7492f9 3908 ONE_MORE_BYTE (c);
682169fe
KH
3909 if (c < 0x80)
3910 continue;
df7492f9 3911 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
4ed46869 3912 {
df7492f9 3913 ONE_MORE_BYTE (c);
682169fe 3914 if (c < 0x40 || c == 0x7F || c > 0xFC)
df7492f9 3915 break;
ff0dacd7 3916 found = CATEGORY_MASK_SJIS;
4ed46869 3917 }
df7492f9 3918 else if (c >= 0xA0 && c < 0xE0)
ff0dacd7 3919 found = CATEGORY_MASK_SJIS;
df7492f9
KH
3920 else
3921 break;
4ed46869 3922 }
ff0dacd7 3923 detect_info->rejected |= CATEGORY_MASK_SJIS;
df7492f9
KH
3924 return 0;
3925
3926 no_more_source:
065e3595 3927 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
89528eb3 3928 {
ff0dacd7 3929 detect_info->rejected |= CATEGORY_MASK_SJIS;
89528eb3 3930 return 0;
4ed46869 3931 }
ff0dacd7
KH
3932 detect_info->found |= found;
3933 return 1;
4ed46869
KH
3934}
3935
3936/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3937 Check if a text is encoded in BIG5. If it is, return
df7492f9 3938 CATEGORY_MASK_BIG5, else return 0. */
4ed46869 3939
0a28aafb 3940static int
ff0dacd7 3941detect_coding_big5 (coding, detect_info)
df7492f9 3942 struct coding_system *coding;
ff0dacd7 3943 struct coding_detection_info *detect_info;
4ed46869 3944{
065e3595 3945 const unsigned char *src = coding->source, *src_base;
8f924df7 3946 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
3947 int multibytep = coding->src_multibyte;
3948 int consumed_chars = 0;
3949 int found = 0;
b73bfc1c 3950 int c;
fa42c37f 3951
ff0dacd7 3952 detect_info->checked |= CATEGORY_MASK_BIG5;
df7492f9
KH
3953 /* A coding system of this category is always ASCII compatible. */
3954 src += coding->head_ascii;
fa42c37f 3955
b73bfc1c 3956 while (1)
fa42c37f 3957 {
065e3595 3958 src_base = src;
df7492f9
KH
3959 ONE_MORE_BYTE (c);
3960 if (c < 0x80)
fa42c37f 3961 continue;
df7492f9 3962 if (c >= 0xA1)
fa42c37f 3963 {
df7492f9
KH
3964 ONE_MORE_BYTE (c);
3965 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
fa42c37f 3966 return 0;
ff0dacd7 3967 found = CATEGORY_MASK_BIG5;
fa42c37f 3968 }
df7492f9
KH
3969 else
3970 break;
fa42c37f 3971 }
ff0dacd7 3972 detect_info->rejected |= CATEGORY_MASK_BIG5;
fa42c37f 3973 return 0;
fa42c37f 3974
df7492f9 3975 no_more_source:
065e3595 3976 if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
89528eb3 3977 {
ff0dacd7 3978 detect_info->rejected |= CATEGORY_MASK_BIG5;
89528eb3
KH
3979 return 0;
3980 }
ff0dacd7
KH
3981 detect_info->found |= found;
3982 return 1;
fa42c37f
KH
3983}
3984
4ed46869
KH
3985/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3986 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
fa42c37f 3987
b73bfc1c 3988static void
df7492f9 3989decode_coding_sjis (coding)
4ed46869 3990 struct coding_system *coding;
4ed46869 3991{
8f924df7
KH
3992 const unsigned char *src = coding->source + coding->consumed;
3993 const unsigned char *src_end = coding->source + coding->src_bytes;
3994 const unsigned char *src_base;
df7492f9 3995 int *charbuf = coding->charbuf;
ff0dacd7 3996 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
df7492f9
KH
3997 int consumed_chars = 0, consumed_chars_base;
3998 int multibytep = coding->src_multibyte;
3999 struct charset *charset_roman, *charset_kanji, *charset_kana;
24a73b0a 4000 Lisp_Object attrs, charset_list, val;
ff0dacd7
KH
4001 int char_offset = coding->produced_char;
4002 int last_offset = char_offset;
4003 int last_id = charset_ascii;
a5d301df 4004
24a73b0a 4005 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4006
4007 val = charset_list;
4008 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
89528eb3
KH
4009 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4010 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
fa42c37f 4011
b73bfc1c 4012 while (1)
4ed46869 4013 {
df7492f9 4014 int c, c1;
24a73b0a 4015 struct charset *charset;
fa42c37f 4016
b73bfc1c 4017 src_base = src;
df7492f9 4018 consumed_chars_base = consumed_chars;
fa42c37f 4019
df7492f9
KH
4020 if (charbuf >= charbuf_end)
4021 break;
4022
4023 ONE_MORE_BYTE (c);
065e3595
KH
4024 if (c < 0)
4025 goto invalid_code;
24a73b0a
KH
4026 if (c < 0x80)
4027 charset = charset_roman;
8e921c4b
KH
4028 else if (c == 0x80)
4029 goto invalid_code;
54f78171 4030 else
df7492f9 4031 {
24a73b0a
KH
4032 if (c >= 0xF0)
4033 goto invalid_code;
4034 if (c < 0xA0 || c >= 0xE0)
4ed46869 4035 {
24a73b0a
KH
4036 /* SJIS -> JISX0208 */
4037 ONE_MORE_BYTE (c1);
4038 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
7487494c 4039 goto invalid_code;
24a73b0a
KH
4040 c = (c << 8) | c1;
4041 SJIS_TO_JIS (c);
4042 charset = charset_kanji;
df7492f9 4043 }
24a73b0a 4044 else if (c > 0xA0)
ff0dacd7 4045 {
24a73b0a
KH
4046 /* SJIS -> JISX0201-Kana */
4047 c &= 0x7F;
4048 charset = charset_kana;
ff0dacd7 4049 }
24a73b0a
KH
4050 else
4051 goto invalid_code;
df7492f9 4052 }
24a73b0a
KH
4053 if (charset->id != charset_ascii
4054 && last_id != charset->id)
4055 {
4056 if (last_id != charset_ascii)
4057 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4058 last_id = charset->id;
4059 last_offset = char_offset;
4060 }
4061 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
df7492f9 4062 *charbuf++ = c;
ff0dacd7 4063 char_offset++;
df7492f9 4064 continue;
b73bfc1c 4065
df7492f9
KH
4066 invalid_code:
4067 src = src_base;
4068 consumed_chars = consumed_chars_base;
4069 ONE_MORE_BYTE (c);
065e3595 4070 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
ff0dacd7 4071 char_offset++;
df7492f9
KH
4072 coding->errors++;
4073 }
fa42c37f 4074
df7492f9 4075 no_more_source:
ff0dacd7
KH
4076 if (last_id != charset_ascii)
4077 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
df7492f9
KH
4078 coding->consumed_char += consumed_chars_base;
4079 coding->consumed = src_base - coding->source;
4080 coding->charbuf_used = charbuf - coding->charbuf;
fa42c37f
KH
4081}
4082
b73bfc1c 4083static void
df7492f9 4084decode_coding_big5 (coding)
4ed46869 4085 struct coding_system *coding;
4ed46869 4086{
8f924df7
KH
4087 const unsigned char *src = coding->source + coding->consumed;
4088 const unsigned char *src_end = coding->source + coding->src_bytes;
4089 const unsigned char *src_base;
df7492f9 4090 int *charbuf = coding->charbuf;
ff0dacd7 4091 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
df7492f9
KH
4092 int consumed_chars = 0, consumed_chars_base;
4093 int multibytep = coding->src_multibyte;
4094 struct charset *charset_roman, *charset_big5;
24a73b0a 4095 Lisp_Object attrs, charset_list, val;
ff0dacd7
KH
4096 int char_offset = coding->produced_char;
4097 int last_offset = char_offset;
4098 int last_id = charset_ascii;
df7492f9 4099
24a73b0a 4100 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4101 val = charset_list;
4102 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4103 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4ed46869 4104
b73bfc1c 4105 while (1)
4ed46869 4106 {
df7492f9 4107 int c, c1;
24a73b0a 4108 struct charset *charset;
b73bfc1c
KH
4109
4110 src_base = src;
df7492f9
KH
4111 consumed_chars_base = consumed_chars;
4112
4113 if (charbuf >= charbuf_end)
4114 break;
4115
4116 ONE_MORE_BYTE (c);
b73bfc1c 4117
065e3595
KH
4118 if (c < 0)
4119 goto invalid_code;
24a73b0a
KH
4120 if (c < 0x80)
4121 charset = charset_roman;
4122 else
4ed46869 4123 {
24a73b0a
KH
4124 /* BIG5 -> Big5 */
4125 if (c < 0xA1 || c > 0xFE)
4126 goto invalid_code;
4127 ONE_MORE_BYTE (c1);
4128 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4129 goto invalid_code;
4130 c = c << 8 | c1;
4131 charset = charset_big5;
4ed46869 4132 }
24a73b0a
KH
4133 if (charset->id != charset_ascii
4134 && last_id != charset->id)
df7492f9 4135 {
24a73b0a
KH
4136 if (last_id != charset_ascii)
4137 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4138 last_id = charset->id;
4139 last_offset = char_offset;
4ed46869 4140 }
24a73b0a 4141 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
df7492f9 4142 *charbuf++ = c;
ff0dacd7 4143 char_offset++;
fb88bf2d
KH
4144 continue;
4145
df7492f9 4146 invalid_code:
4ed46869 4147 src = src_base;
df7492f9
KH
4148 consumed_chars = consumed_chars_base;
4149 ONE_MORE_BYTE (c);
065e3595 4150 *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
ff0dacd7 4151 char_offset++;
df7492f9 4152 coding->errors++;
fb88bf2d 4153 }
d46c5b12 4154
df7492f9 4155 no_more_source:
ff0dacd7
KH
4156 if (last_id != charset_ascii)
4157 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
df7492f9
KH
4158 coding->consumed_char += consumed_chars_base;
4159 coding->consumed = src_base - coding->source;
4160 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
4161}
4162
4163/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
b73bfc1c
KH
4164 This function can encode charsets `ascii', `katakana-jisx0201',
4165 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4166 are sure that all these charsets are registered as official charset
4ed46869
KH
4167 (i.e. do not have extended leading-codes). Characters of other
4168 charsets are produced without any encoding. If SJIS_P is 1, encode
4169 SJIS text, else encode BIG5 text. */
4170
df7492f9
KH
4171static int
4172encode_coding_sjis (coding)
4ed46869 4173 struct coding_system *coding;
4ed46869 4174{
df7492f9
KH
4175 int multibytep = coding->dst_multibyte;
4176 int *charbuf = coding->charbuf;
4177 int *charbuf_end = charbuf + coding->charbuf_used;
4178 unsigned char *dst = coding->destination + coding->produced;
4179 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4180 int safe_room = 4;
4181 int produced_chars = 0;
24a73b0a 4182 Lisp_Object attrs, charset_list, val;
df7492f9
KH
4183 int ascii_compatible;
4184 struct charset *charset_roman, *charset_kanji, *charset_kana;
4185 int c;
a5d301df 4186
24a73b0a 4187 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4188 val = charset_list;
4189 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4190 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4191 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
4ed46869 4192
df7492f9 4193 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
93dec019 4194
df7492f9
KH
4195 while (charbuf < charbuf_end)
4196 {
4197 ASSURE_DESTINATION (safe_room);
4198 c = *charbuf++;
b73bfc1c 4199 /* Now encode the character C. */
df7492f9
KH
4200 if (ASCII_CHAR_P (c) && ascii_compatible)
4201 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
4202 else if (CHAR_BYTE8_P (c))
4203 {
4204 c = CHAR_TO_BYTE8 (c);
4205 EMIT_ONE_BYTE (c);
4206 }
df7492f9 4207 else
b73bfc1c 4208 {
df7492f9
KH
4209 unsigned code;
4210 struct charset *charset = char_charset (c, charset_list, &code);
4211
4212 if (!charset)
4ed46869 4213 {
41cbe562 4214 if (coding->mode & CODING_MODE_SAFE_ENCODING)
b73bfc1c 4215 {
41cbe562
KH
4216 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4217 charset = CHARSET_FROM_ID (charset_ascii);
b73bfc1c 4218 }
41cbe562 4219 else
b73bfc1c 4220 {
41cbe562
KH
4221 c = coding->default_char;
4222 charset = char_charset (c, charset_list, &code);
b73bfc1c 4223 }
b73bfc1c 4224 }
df7492f9
KH
4225 if (code == CHARSET_INVALID_CODE (charset))
4226 abort ();
4227 if (charset == charset_kanji)
4228 {
4229 int c1, c2;
4230 JIS_TO_SJIS (code);
4231 c1 = code >> 8, c2 = code & 0xFF;
4232 EMIT_TWO_BYTES (c1, c2);
4233 }
4234 else if (charset == charset_kana)
4235 EMIT_ONE_BYTE (code | 0x80);
4236 else
4237 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4238 }
4239 }
065e3595 4240 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4241 coding->produced_char += produced_chars;
4242 coding->produced = dst - coding->destination;
4243 return 0;
4244}
4245
4246static int
4247encode_coding_big5 (coding)
4248 struct coding_system *coding;
4249{
4250 int multibytep = coding->dst_multibyte;
4251 int *charbuf = coding->charbuf;
4252 int *charbuf_end = charbuf + coding->charbuf_used;
4253 unsigned char *dst = coding->destination + coding->produced;
4254 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4255 int safe_room = 4;
4256 int produced_chars = 0;
24a73b0a 4257 Lisp_Object attrs, charset_list, val;
df7492f9
KH
4258 int ascii_compatible;
4259 struct charset *charset_roman, *charset_big5;
4260 int c;
4261
24a73b0a 4262 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4263 val = charset_list;
4264 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4265 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4266 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4267
4268 while (charbuf < charbuf_end)
4269 {
4270 ASSURE_DESTINATION (safe_room);
4271 c = *charbuf++;
4272 /* Now encode the character C. */
4273 if (ASCII_CHAR_P (c) && ascii_compatible)
4274 EMIT_ONE_ASCII_BYTE (c);
16eafb5d
KH
4275 else if (CHAR_BYTE8_P (c))
4276 {
4277 c = CHAR_TO_BYTE8 (c);
4278 EMIT_ONE_BYTE (c);
b73bfc1c
KH
4279 }
4280 else
4281 {
df7492f9
KH
4282 unsigned code;
4283 struct charset *charset = char_charset (c, charset_list, &code);
4284
4285 if (! charset)
b73bfc1c 4286 {
41cbe562 4287 if (coding->mode & CODING_MODE_SAFE_ENCODING)
b73bfc1c 4288 {
41cbe562
KH
4289 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4290 charset = CHARSET_FROM_ID (charset_ascii);
b73bfc1c 4291 }
41cbe562 4292 else
0eecad43 4293 {
41cbe562
KH
4294 c = coding->default_char;
4295 charset = char_charset (c, charset_list, &code);
0eecad43 4296 }
4ed46869 4297 }
df7492f9
KH
4298 if (code == CHARSET_INVALID_CODE (charset))
4299 abort ();
4300 if (charset == charset_big5)
b73bfc1c 4301 {
df7492f9
KH
4302 int c1, c2;
4303
4304 c1 = code >> 8, c2 = code & 0xFF;
4305 EMIT_TWO_BYTES (c1, c2);
b73bfc1c 4306 }
df7492f9
KH
4307 else
4308 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4ed46869 4309 }
4ed46869 4310 }
065e3595 4311 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4312 coding->produced_char += produced_chars;
4313 coding->produced = dst - coding->destination;
4314 return 0;
4ed46869
KH
4315}
4316
4317\f
df7492f9 4318/*** 9. CCL handlers ***/
1397dc18
KH
4319
4320/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4321 Check if a text is encoded in a coding system of which
4322 encoder/decoder are written in CCL program. If it is, return
df7492f9 4323 CATEGORY_MASK_CCL, else return 0. */
1397dc18 4324
0a28aafb 4325static int
ff0dacd7 4326detect_coding_ccl (coding, detect_info)
df7492f9 4327 struct coding_system *coding;
ff0dacd7 4328 struct coding_detection_info *detect_info;
1397dc18 4329{
065e3595 4330 const unsigned char *src = coding->source, *src_base;
8f924df7 4331 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
4332 int multibytep = coding->src_multibyte;
4333 int consumed_chars = 0;
4334 int found = 0;
4335 unsigned char *valids = CODING_CCL_VALIDS (coding);
4336 int head_ascii = coding->head_ascii;
4337 Lisp_Object attrs;
4338
ff0dacd7
KH
4339 detect_info->checked |= CATEGORY_MASK_CCL;
4340
df7492f9
KH
4341 coding = &coding_categories[coding_category_ccl];
4342 attrs = CODING_ID_ATTRS (coding->id);
4343 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4344 src += head_ascii;
1397dc18 4345
b73bfc1c 4346 while (1)
1397dc18 4347 {
df7492f9 4348 int c;
065e3595
KH
4349
4350 src_base = src;
df7492f9 4351 ONE_MORE_BYTE (c);
065e3595 4352 if (c < 0 || ! valids[c])
df7492f9 4353 break;
ff0dacd7
KH
4354 if ((valids[c] > 1))
4355 found = CATEGORY_MASK_CCL;
df7492f9 4356 }
ff0dacd7 4357 detect_info->rejected |= CATEGORY_MASK_CCL;
df7492f9
KH
4358 return 0;
4359
4360 no_more_source:
ff0dacd7
KH
4361 detect_info->found |= found;
4362 return 1;
df7492f9
KH
4363}
4364
4365static void
4366decode_coding_ccl (coding)
4367 struct coding_system *coding;
4368{
7c78e542 4369 const unsigned char *src = coding->source + coding->consumed;
8f924df7 4370 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
4371 int *charbuf = coding->charbuf;
4372 int *charbuf_end = charbuf + coding->charbuf_size;
4373 int consumed_chars = 0;
4374 int multibytep = coding->src_multibyte;
4375 struct ccl_program ccl;
4376 int source_charbuf[1024];
4377 int source_byteidx[1024];
24a73b0a 4378 Lisp_Object attrs, charset_list;
df7492f9 4379
24a73b0a 4380 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4381 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4382
4383 while (src < src_end)
4384 {
7c78e542 4385 const unsigned char *p = src;
df7492f9
KH
4386 int *source, *source_end;
4387 int i = 0;
4388
4389 if (multibytep)
4390 while (i < 1024 && p < src_end)
4391 {
4392 source_byteidx[i] = p - src;
4393 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4394 }
4395 else
4396 while (i < 1024 && p < src_end)
4397 source_charbuf[i++] = *p++;
8f924df7 4398
df7492f9
KH
4399 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4400 ccl.last_block = 1;
4401
4402 source = source_charbuf;
4403 source_end = source + i;
4404 while (source < source_end)
4405 {
4406 ccl_driver (&ccl, source, charbuf,
8dcbea82
KH
4407 source_end - source, charbuf_end - charbuf,
4408 charset_list);
df7492f9
KH
4409 source += ccl.consumed;
4410 charbuf += ccl.produced;
4411 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4412 break;
4413 }
4414 if (source < source_end)
4415 src += source_byteidx[source - source_charbuf];
4416 else
4417 src = p;
4418 consumed_chars += source - source_charbuf;
4419
4420 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4421 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4422 break;
4423 }
4424
4425 switch (ccl.status)
4426 {
4427 case CCL_STAT_SUSPEND_BY_SRC:
065e3595 4428 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
df7492f9
KH
4429 break;
4430 case CCL_STAT_SUSPEND_BY_DST:
4431 break;
4432 case CCL_STAT_QUIT:
4433 case CCL_STAT_INVALID_CMD:
065e3595 4434 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
df7492f9
KH
4435 break;
4436 default:
065e3595 4437 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4438 break;
4439 }
4440 coding->consumed_char += consumed_chars;
4441 coding->consumed = src - coding->source;
4442 coding->charbuf_used = charbuf - coding->charbuf;
4443}
4444
4445static int
4446encode_coding_ccl (coding)
4447 struct coding_system *coding;
4448{
4449 struct ccl_program ccl;
4450 int multibytep = coding->dst_multibyte;
4451 int *charbuf = coding->charbuf;
4452 int *charbuf_end = charbuf + coding->charbuf_used;
4453 unsigned char *dst = coding->destination + coding->produced;
4454 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4455 unsigned char *adjusted_dst_end = dst_end - 1;
4456 int destination_charbuf[1024];
4457 int i, produced_chars = 0;
24a73b0a 4458 Lisp_Object attrs, charset_list;
df7492f9 4459
24a73b0a 4460 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9
KH
4461 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4462
4463 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4464 ccl.dst_multibyte = coding->dst_multibyte;
4465
4466 while (charbuf < charbuf_end && dst < adjusted_dst_end)
4467 {
4468 int dst_bytes = dst_end - dst;
4469 if (dst_bytes > 1024)
4470 dst_bytes = 1024;
4471
4472 ccl_driver (&ccl, charbuf, destination_charbuf,
8dcbea82 4473 charbuf_end - charbuf, dst_bytes, charset_list);
df7492f9
KH
4474 charbuf += ccl.consumed;
4475 if (multibytep)
4476 for (i = 0; i < ccl.produced; i++)
4477 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4478 else
4479 {
4480 for (i = 0; i < ccl.produced; i++)
4481 *dst++ = destination_charbuf[i] & 0xFF;
4482 produced_chars += ccl.produced;
4483 }
4484 }
4485
4486 switch (ccl.status)
4487 {
4488 case CCL_STAT_SUSPEND_BY_SRC:
065e3595 4489 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
df7492f9
KH
4490 break;
4491 case CCL_STAT_SUSPEND_BY_DST:
065e3595 4492 record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
df7492f9
KH
4493 break;
4494 case CCL_STAT_QUIT:
4495 case CCL_STAT_INVALID_CMD:
065e3595 4496 record_conversion_result (coding, CODING_RESULT_INTERRUPT);
df7492f9
KH
4497 break;
4498 default:
065e3595 4499 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 4500 break;
1397dc18 4501 }
df7492f9
KH
4502
4503 coding->produced_char += produced_chars;
4504 coding->produced = dst - coding->destination;
4505 return 0;
1397dc18
KH
4506}
4507
df7492f9 4508
1397dc18 4509\f
df7492f9 4510/*** 10, 11. no-conversion handlers ***/
4ed46869 4511
b73bfc1c 4512/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4ed46869 4513
b73bfc1c 4514static void
df7492f9 4515decode_coding_raw_text (coding)
4ed46869 4516 struct coding_system *coding;
4ed46869 4517{
df7492f9 4518 coding->chars_at_source = 1;
2c78b7e1
KH
4519 coding->consumed_char = 0;
4520 coding->consumed = 0;
065e3595 4521 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 4522}
4ed46869 4523
df7492f9
KH
4524static int
4525encode_coding_raw_text (coding)
4526 struct coding_system *coding;
4527{
4528 int multibytep = coding->dst_multibyte;
4529 int *charbuf = coding->charbuf;
4530 int *charbuf_end = coding->charbuf + coding->charbuf_used;
4531 unsigned char *dst = coding->destination + coding->produced;
4532 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4533 int produced_chars = 0;
b73bfc1c
KH
4534 int c;
4535
df7492f9 4536 if (multibytep)
b73bfc1c 4537 {
df7492f9 4538 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4ed46869 4539
df7492f9
KH
4540 if (coding->src_multibyte)
4541 while (charbuf < charbuf_end)
4542 {
4543 ASSURE_DESTINATION (safe_room);
4544 c = *charbuf++;
4545 if (ASCII_CHAR_P (c))
4546 EMIT_ONE_ASCII_BYTE (c);
4547 else if (CHAR_BYTE8_P (c))
4548 {
4549 c = CHAR_TO_BYTE8 (c);
4550 EMIT_ONE_BYTE (c);
4551 }
4552 else
4553 {
4554 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
93dec019 4555
df7492f9
KH
4556 CHAR_STRING_ADVANCE (c, p1);
4557 while (p0 < p1)
9d123124
KH
4558 {
4559 EMIT_ONE_BYTE (*p0);
4560 p0++;
4561 }
df7492f9
KH
4562 }
4563 }
b73bfc1c 4564 else
df7492f9
KH
4565 while (charbuf < charbuf_end)
4566 {
4567 ASSURE_DESTINATION (safe_room);
4568 c = *charbuf++;
4569 EMIT_ONE_BYTE (c);
4570 }
4571 }
4572 else
4ed46869 4573 {
df7492f9 4574 if (coding->src_multibyte)
d46c5b12 4575 {
df7492f9
KH
4576 int safe_room = MAX_MULTIBYTE_LENGTH;
4577
4578 while (charbuf < charbuf_end)
d46c5b12 4579 {
df7492f9
KH
4580 ASSURE_DESTINATION (safe_room);
4581 c = *charbuf++;
4582 if (ASCII_CHAR_P (c))
4583 *dst++ = c;
4584 else if (CHAR_BYTE8_P (c))
4585 *dst++ = CHAR_TO_BYTE8 (c);
b73bfc1c 4586 else
df7492f9
KH
4587 CHAR_STRING_ADVANCE (c, dst);
4588 produced_chars++;
d46c5b12
KH
4589 }
4590 }
df7492f9
KH
4591 else
4592 {
4593 ASSURE_DESTINATION (charbuf_end - charbuf);
4594 while (charbuf < charbuf_end && dst < dst_end)
4595 *dst++ = *charbuf++;
4596 produced_chars = dst - (coding->destination + coding->dst_bytes);
8f924df7 4597 }
4ed46869 4598 }
065e3595 4599 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4600 coding->produced_char += produced_chars;
4601 coding->produced = dst - coding->destination;
4602 return 0;
4ed46869
KH
4603}
4604
ff0dacd7
KH
4605/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4606 Check if a text is encoded in a charset-based coding system. If it
4607 is, return 1, else return 0. */
4608
0a28aafb 4609static int
ff0dacd7 4610detect_coding_charset (coding, detect_info)
df7492f9 4611 struct coding_system *coding;
ff0dacd7 4612 struct coding_detection_info *detect_info;
1397dc18 4613{
065e3595 4614 const unsigned char *src = coding->source, *src_base;
8f924df7 4615 const unsigned char *src_end = coding->source + coding->src_bytes;
df7492f9
KH
4616 int multibytep = coding->src_multibyte;
4617 int consumed_chars = 0;
4618 Lisp_Object attrs, valids;
584948ac 4619 int found = 0;
1397dc18 4620
ff0dacd7
KH
4621 detect_info->checked |= CATEGORY_MASK_CHARSET;
4622
df7492f9
KH
4623 coding = &coding_categories[coding_category_charset];
4624 attrs = CODING_ID_ATTRS (coding->id);
4625 valids = AREF (attrs, coding_attr_charset_valids);
4626
4627 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4628 src += coding->head_ascii;
1397dc18 4629
b73bfc1c 4630 while (1)
1397dc18 4631 {
df7492f9 4632 int c;
1397dc18 4633
065e3595 4634 src_base = src;
df7492f9 4635 ONE_MORE_BYTE (c);
065e3595
KH
4636 if (c < 0)
4637 continue;
df7492f9
KH
4638 if (NILP (AREF (valids, c)))
4639 break;
584948ac 4640 if (c >= 0x80)
ff0dacd7 4641 found = CATEGORY_MASK_CHARSET;
df7492f9 4642 }
ff0dacd7 4643 detect_info->rejected |= CATEGORY_MASK_CHARSET;
df7492f9 4644 return 0;
4ed46869 4645
df7492f9 4646 no_more_source:
ff0dacd7
KH
4647 detect_info->found |= found;
4648 return 1;
df7492f9 4649}
b73bfc1c 4650
b73bfc1c 4651static void
df7492f9 4652decode_coding_charset (coding)
4ed46869 4653 struct coding_system *coding;
4ed46869 4654{
8f924df7
KH
4655 const unsigned char *src = coding->source + coding->consumed;
4656 const unsigned char *src_end = coding->source + coding->src_bytes;
4657 const unsigned char *src_base;
df7492f9 4658 int *charbuf = coding->charbuf;
ff0dacd7 4659 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
df7492f9
KH
4660 int consumed_chars = 0, consumed_chars_base;
4661 int multibytep = coding->src_multibyte;
24a73b0a 4662 Lisp_Object attrs, charset_list, valids;
ff0dacd7
KH
4663 int char_offset = coding->produced_char;
4664 int last_offset = char_offset;
4665 int last_id = charset_ascii;
df7492f9 4666
24a73b0a 4667 CODING_GET_INFO (coding, attrs, charset_list);
4eb6d3f1 4668 valids = AREF (attrs, coding_attr_charset_valids);
b73bfc1c 4669
df7492f9 4670 while (1)
4ed46869 4671 {
4eb6d3f1 4672 int c;
24a73b0a
KH
4673 Lisp_Object val;
4674 struct charset *charset;
4675 int dim;
4676 int len = 1;
4677 unsigned code;
df7492f9
KH
4678
4679 src_base = src;
4680 consumed_chars_base = consumed_chars;
b73bfc1c 4681
df7492f9
KH
4682 if (charbuf >= charbuf_end)
4683 break;
4684
4eb6d3f1 4685 ONE_MORE_BYTE (c);
065e3595
KH
4686 if (c < 0)
4687 goto invalid_code;
24a73b0a
KH
4688 code = c;
4689
4690 val = AREF (valids, c);
4691 if (NILP (val))
4692 goto invalid_code;
4693 if (INTEGERP (val))
d46c5b12 4694 {
24a73b0a
KH
4695 charset = CHARSET_FROM_ID (XFASTINT (val));
4696 dim = CHARSET_DIMENSION (charset);
4697 while (len < dim)
b73bfc1c 4698 {
24a73b0a
KH
4699 ONE_MORE_BYTE (c);
4700 code = (code << 8) | c;
4701 len++;
b73bfc1c 4702 }
24a73b0a
KH
4703 CODING_DECODE_CHAR (coding, src, src_base, src_end,
4704 charset, code, c);
d46c5b12 4705 }
df7492f9 4706 else
d46c5b12 4707 {
24a73b0a
KH
4708 /* VAL is a list of charset IDs. It is assured that the
4709 list is sorted by charset dimensions (smaller one
4710 comes first). */
4711 while (CONSP (val))
4eb6d3f1 4712 {
24a73b0a 4713 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
c7c66a95 4714 dim = CHARSET_DIMENSION (charset);
f9d71dcd 4715 while (len < dim)
4eb6d3f1 4716 {
acb2a965
KH
4717 ONE_MORE_BYTE (c);
4718 code = (code << 8) | c;
f9d71dcd 4719 len++;
4eb6d3f1 4720 }
24a73b0a
KH
4721 CODING_DECODE_CHAR (coding, src, src_base,
4722 src_end, charset, code, c);
4723 if (c >= 0)
4724 break;
4725 val = XCDR (val);
ff0dacd7 4726 }
d46c5b12 4727 }
24a73b0a
KH
4728 if (c < 0)
4729 goto invalid_code;
4730 if (charset->id != charset_ascii
4731 && last_id != charset->id)
4732 {
4733 if (last_id != charset_ascii)
4734 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4735 last_id = charset->id;
4736 last_offset = char_offset;
4737 }
4738
df7492f9 4739 *charbuf++ = c;
ff0dacd7 4740 char_offset++;
df7492f9
KH
4741 continue;
4742
4743 invalid_code:
4744 src = src_base;
4745 consumed_chars = consumed_chars_base;
4746 ONE_MORE_BYTE (c);
065e3595 4747 *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
ff0dacd7 4748 char_offset++;
df7492f9 4749 coding->errors++;
4ed46869
KH
4750 }
4751
df7492f9 4752 no_more_source:
ff0dacd7
KH
4753 if (last_id != charset_ascii)
4754 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
df7492f9
KH
4755 coding->consumed_char += consumed_chars_base;
4756 coding->consumed = src_base - coding->source;
4757 coding->charbuf_used = charbuf - coding->charbuf;
4ed46869
KH
4758}
4759
df7492f9
KH
4760static int
4761encode_coding_charset (coding)
4ed46869 4762 struct coding_system *coding;
4ed46869 4763{
df7492f9
KH
4764 int multibytep = coding->dst_multibyte;
4765 int *charbuf = coding->charbuf;
4766 int *charbuf_end = charbuf + coding->charbuf_used;
4767 unsigned char *dst = coding->destination + coding->produced;
4768 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4769 int safe_room = MAX_MULTIBYTE_LENGTH;
4770 int produced_chars = 0;
24a73b0a 4771 Lisp_Object attrs, charset_list;
df7492f9 4772 int ascii_compatible;
b73bfc1c 4773 int c;
b73bfc1c 4774
24a73b0a 4775 CODING_GET_INFO (coding, attrs, charset_list);
df7492f9 4776 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
fb88bf2d 4777
df7492f9 4778 while (charbuf < charbuf_end)
4ed46869 4779 {
4eb6d3f1 4780 struct charset *charset;
df7492f9 4781 unsigned code;
8f924df7 4782
df7492f9
KH
4783 ASSURE_DESTINATION (safe_room);
4784 c = *charbuf++;
4785 if (ascii_compatible && ASCII_CHAR_P (c))
4786 EMIT_ONE_ASCII_BYTE (c);
16eafb5d 4787 else if (CHAR_BYTE8_P (c))
4ed46869 4788 {
16eafb5d
KH
4789 c = CHAR_TO_BYTE8 (c);
4790 EMIT_ONE_BYTE (c);
d46c5b12 4791 }
d46c5b12 4792 else
b73bfc1c 4793 {
4eb6d3f1
KH
4794 charset = char_charset (c, charset_list, &code);
4795 if (charset)
4796 {
4797 if (CHARSET_DIMENSION (charset) == 1)
4798 EMIT_ONE_BYTE (code);
4799 else if (CHARSET_DIMENSION (charset) == 2)
4800 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4801 else if (CHARSET_DIMENSION (charset) == 3)
4802 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4803 else
4804 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4805 (code >> 8) & 0xFF, code & 0xFF);
4806 }
4807 else
41cbe562
KH
4808 {
4809 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4810 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4811 else
4812 c = coding->default_char;
4813 EMIT_ONE_BYTE (c);
4814 }
4ed46869 4815 }
4ed46869
KH
4816 }
4817
065e3595 4818 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9
KH
4819 coding->produced_char += produced_chars;
4820 coding->produced = dst - coding->destination;
4821 return 0;
4ed46869
KH
4822}
4823
4824\f
1397dc18 4825/*** 7. C library functions ***/
4ed46869 4826
df7492f9
KH
4827/* Setup coding context CODING from information about CODING_SYSTEM.
4828 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4829 CODING_SYSTEM is invalid, signal an error. */
4ed46869 4830
ec6d2bb8 4831void
e0e989f6
KH
4832setup_coding_system (coding_system, coding)
4833 Lisp_Object coding_system;
4ed46869
KH
4834 struct coding_system *coding;
4835{
df7492f9
KH
4836 Lisp_Object attrs;
4837 Lisp_Object eol_type;
4838 Lisp_Object coding_type;
4608c386 4839 Lisp_Object val;
4ed46869 4840
df7492f9
KH
4841 if (NILP (coding_system))
4842 coding_system = Qno_conversion;
c07c8e12 4843
df7492f9 4844 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
1f5dbf34 4845
df7492f9
KH
4846 attrs = CODING_ID_ATTRS (coding->id);
4847 eol_type = CODING_ID_EOL_TYPE (coding->id);
1f5dbf34 4848
df7492f9
KH
4849 coding->mode = 0;
4850 coding->head_ascii = -1;
4851 coding->common_flags
4852 = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
5e5c78be
KH
4853 if (! NILP (CODING_ATTR_POST_READ (attrs)))
4854 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
4855 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
4856 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
8f924df7
KH
4857 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
4858 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
1f5dbf34 4859
df7492f9 4860 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7
KH
4861 coding->max_charset_id = SCHARS (val) - 1;
4862 coding->safe_charsets = (char *) SDATA (val);
df7492f9 4863 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4608c386 4864
df7492f9
KH
4865 coding_type = CODING_ATTR_TYPE (attrs);
4866 if (EQ (coding_type, Qundecided))
d46c5b12 4867 {
df7492f9
KH
4868 coding->detector = NULL;
4869 coding->decoder = decode_coding_raw_text;
4870 coding->encoder = encode_coding_raw_text;
4871 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
d46c5b12 4872 }
df7492f9 4873 else if (EQ (coding_type, Qiso_2022))
d46c5b12 4874 {
df7492f9
KH
4875 int i;
4876 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4877
4878 /* Invoke graphic register 0 to plane 0. */
4879 CODING_ISO_INVOCATION (coding, 0) = 0;
4880 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4881 CODING_ISO_INVOCATION (coding, 1)
4882 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4883 /* Setup the initial status of designation. */
4884 for (i = 0; i < 4; i++)
4885 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4886 /* Not single shifting initially. */
4887 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4888 /* Beginning of buffer should also be regarded as bol. */
4889 CODING_ISO_BOL (coding) = 1;
4890 coding->detector = detect_coding_iso_2022;
4891 coding->decoder = decode_coding_iso_2022;
4892 coding->encoder = encode_coding_iso_2022;
4893 if (flags & CODING_ISO_FLAG_SAFE)
4894 coding->mode |= CODING_MODE_SAFE_ENCODING;
d46c5b12 4895 coding->common_flags
df7492f9
KH
4896 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4897 | CODING_REQUIRE_FLUSHING_MASK);
4898 if (flags & CODING_ISO_FLAG_COMPOSITION)
4899 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
ff0dacd7
KH
4900 if (flags & CODING_ISO_FLAG_DESIGNATION)
4901 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
df7492f9
KH
4902 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4903 {
4904 setup_iso_safe_charsets (attrs);
4905 val = CODING_ATTR_SAFE_CHARSETS (attrs);
8f924df7
KH
4906 coding->max_charset_id = SCHARS (val) - 1;
4907 coding->safe_charsets = (char *) SDATA (val);
df7492f9
KH
4908 }
4909 CODING_ISO_FLAGS (coding) = flags;
d46c5b12 4910 }
df7492f9 4911 else if (EQ (coding_type, Qcharset))
d46c5b12 4912 {
df7492f9
KH
4913 coding->detector = detect_coding_charset;
4914 coding->decoder = decode_coding_charset;
4915 coding->encoder = encode_coding_charset;
d46c5b12 4916 coding->common_flags
df7492f9 4917 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
d46c5b12 4918 }
df7492f9 4919 else if (EQ (coding_type, Qutf_8))
d46c5b12 4920 {
df7492f9
KH
4921 coding->detector = detect_coding_utf_8;
4922 coding->decoder = decode_coding_utf_8;
4923 coding->encoder = encode_coding_utf_8;
4924 coding->common_flags
4925 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4926 }
4927 else if (EQ (coding_type, Qutf_16))
4928 {
4929 val = AREF (attrs, coding_attr_utf_16_bom);
4930 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4931 : EQ (val, Qt) ? utf_16_with_bom
4932 : utf_16_without_bom);
4933 val = AREF (attrs, coding_attr_utf_16_endian);
b49a1807 4934 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
df7492f9 4935 : utf_16_little_endian);
e19c3639 4936 CODING_UTF_16_SURROGATE (coding) = 0;
df7492f9
KH
4937 coding->detector = detect_coding_utf_16;
4938 coding->decoder = decode_coding_utf_16;
4939 coding->encoder = encode_coding_utf_16;
4940 coding->common_flags
4941 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
b49a1807
KH
4942 if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
4943 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
d46c5b12 4944 }
df7492f9 4945 else if (EQ (coding_type, Qccl))
4ed46869 4946 {
df7492f9
KH
4947 coding->detector = detect_coding_ccl;
4948 coding->decoder = decode_coding_ccl;
4949 coding->encoder = encode_coding_ccl;
c952af22 4950 coding->common_flags
df7492f9
KH
4951 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4952 | CODING_REQUIRE_FLUSHING_MASK);
4953 }
4954 else if (EQ (coding_type, Qemacs_mule))
4955 {
4956 coding->detector = detect_coding_emacs_mule;
4957 coding->decoder = decode_coding_emacs_mule;
4958 coding->encoder = encode_coding_emacs_mule;
c952af22 4959 coding->common_flags
df7492f9
KH
4960 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4961 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
4962 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
4963 {
4964 Lisp_Object tail, safe_charsets;
4965 int max_charset_id = 0;
4966
4967 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4968 tail = XCDR (tail))
4969 if (max_charset_id < XFASTINT (XCAR (tail)))
4970 max_charset_id = XFASTINT (XCAR (tail));
4971 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
4972 make_number (255));
4973 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4974 tail = XCDR (tail))
8f924df7 4975 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
df7492f9 4976 coding->max_charset_id = max_charset_id;
8f924df7 4977 coding->safe_charsets = (char *) SDATA (safe_charsets);
df7492f9
KH
4978 }
4979 }
4980 else if (EQ (coding_type, Qshift_jis))
4981 {
4982 coding->detector = detect_coding_sjis;
4983 coding->decoder = decode_coding_sjis;
4984 coding->encoder = encode_coding_sjis;
c952af22 4985 coding->common_flags
df7492f9
KH
4986 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4987 }
4988 else if (EQ (coding_type, Qbig5))
4989 {
4990 coding->detector = detect_coding_big5;
4991 coding->decoder = decode_coding_big5;
4992 coding->encoder = encode_coding_big5;
c952af22 4993 coding->common_flags
df7492f9
KH
4994 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4995 }
4996 else /* EQ (coding_type, Qraw_text) */
ec6d2bb8 4997 {
df7492f9
KH
4998 coding->detector = NULL;
4999 coding->decoder = decode_coding_raw_text;
5000 coding->encoder = encode_coding_raw_text;
4ed46869 5001 }
4ed46869 5002
df7492f9 5003 return;
4ed46869
KH
5004}
5005
df7492f9
KH
5006/* Return raw-text or one of its subsidiaries that has the same
5007 eol_type as CODING-SYSTEM. */
ec6d2bb8 5008
df7492f9
KH
5009Lisp_Object
5010raw_text_coding_system (coding_system)
5011 Lisp_Object coding_system;
ec6d2bb8 5012{
0be8721c 5013 Lisp_Object spec, attrs;
df7492f9 5014 Lisp_Object eol_type, raw_text_eol_type;
ec6d2bb8 5015
d3e4cb56
KH
5016 if (NILP (coding_system))
5017 return Qraw_text;
df7492f9
KH
5018 spec = CODING_SYSTEM_SPEC (coding_system);
5019 attrs = AREF (spec, 0);
ec6d2bb8 5020
df7492f9
KH
5021 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5022 return coding_system;
ec6d2bb8 5023
df7492f9
KH
5024 eol_type = AREF (spec, 2);
5025 if (VECTORP (eol_type))
5026 return Qraw_text;
5027 spec = CODING_SYSTEM_SPEC (Qraw_text);
5028 raw_text_eol_type = AREF (spec, 2);
5029 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5030 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5031 : AREF (raw_text_eol_type, 2));
ec6d2bb8
KH
5032}
5033
54f78171 5034
df7492f9
KH
5035/* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5036 does, return one of the subsidiary that has the same eol-spec as
5037 PARENT. Otherwise, return CODING_SYSTEM. */
5038
5039Lisp_Object
5040coding_inherit_eol_type (coding_system, parent)
b74e4686 5041 Lisp_Object coding_system, parent;
54f78171 5042{
3e139625 5043 Lisp_Object spec, eol_type;
54f78171 5044
d3e4cb56
KH
5045 if (NILP (coding_system))
5046 coding_system = Qraw_text;
df7492f9 5047 spec = CODING_SYSTEM_SPEC (coding_system);
df7492f9 5048 eol_type = AREF (spec, 2);
d3e4cb56
KH
5049 if (VECTORP (eol_type)
5050 && ! NILP (parent))
df7492f9
KH
5051 {
5052 Lisp_Object parent_spec;
df7492f9
KH
5053 Lisp_Object parent_eol_type;
5054
5055 parent_spec
5056 = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
5057 parent_eol_type = AREF (parent_spec, 2);
5058 if (EQ (parent_eol_type, Qunix))
5059 coding_system = AREF (eol_type, 0);
5060 else if (EQ (parent_eol_type, Qdos))
5061 coding_system = AREF (eol_type, 1);
5062 else if (EQ (parent_eol_type, Qmac))
5063 coding_system = AREF (eol_type, 2);
54f78171 5064 }
df7492f9 5065 return coding_system;
54f78171
KH
5066}
5067
4ed46869
KH
5068/* Emacs has a mechanism to automatically detect a coding system if it
5069 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5070 it's impossible to distinguish some coding systems accurately
5071 because they use the same range of codes. So, at first, coding
5072 systems are categorized into 7, those are:
5073
0ef69138 5074 o coding-category-emacs-mule
4ed46869
KH
5075
5076 The category for a coding system which has the same code range
5077 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 5078 symbol) `emacs-mule' by default.
4ed46869
KH
5079
5080 o coding-category-sjis
5081
5082 The category for a coding system which has the same code range
5083 as SJIS. Assigned the coding-system (Lisp
7717c392 5084 symbol) `japanese-shift-jis' by default.
4ed46869
KH
5085
5086 o coding-category-iso-7
5087
5088 The category for a coding system which has the same code range
7717c392 5089 as ISO2022 of 7-bit environment. This doesn't use any locking
d46c5b12
KH
5090 shift and single shift functions. This can encode/decode all
5091 charsets. Assigned the coding-system (Lisp symbol)
5092 `iso-2022-7bit' by default.
5093
5094 o coding-category-iso-7-tight
5095
5096 Same as coding-category-iso-7 except that this can
5097 encode/decode only the specified charsets.
4ed46869
KH
5098
5099 o coding-category-iso-8-1
5100
5101 The category for a coding system which has the same code range
5102 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
5103 for DIMENSION1 charset. This doesn't use any locking shift
5104 and single shift functions. Assigned the coding-system (Lisp
5105 symbol) `iso-latin-1' by default.
4ed46869
KH
5106
5107 o coding-category-iso-8-2
5108
5109 The category for a coding system which has the same code range
5110 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
5111 for DIMENSION2 charset. This doesn't use any locking shift
5112 and single shift functions. Assigned the coding-system (Lisp
5113 symbol) `japanese-iso-8bit' by default.
4ed46869 5114
7717c392 5115 o coding-category-iso-7-else
4ed46869
KH
5116
5117 The category for a coding system which has the same code range
df7492f9 5118 as ISO2022 of 7-bit environment but uses locking shift or
7717c392
KH
5119 single shift functions. Assigned the coding-system (Lisp
5120 symbol) `iso-2022-7bit-lock' by default.
5121
5122 o coding-category-iso-8-else
5123
5124 The category for a coding system which has the same code range
df7492f9 5125 as ISO2022 of 8-bit environment but uses locking shift or
7717c392
KH
5126 single shift functions. Assigned the coding-system (Lisp
5127 symbol) `iso-2022-8bit-ss2' by default.
4ed46869
KH
5128
5129 o coding-category-big5
5130
5131 The category for a coding system which has the same code range
5132 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 5133 `cn-big5' by default.
4ed46869 5134
fa42c37f
KH
5135 o coding-category-utf-8
5136
5137 The category for a coding system which has the same code range
5138 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
5139 symbol) `utf-8' by default.
5140
5141 o coding-category-utf-16-be
5142
5143 The category for a coding system in which a text has an
5144 Unicode signature (cf. Unicode Standard) in the order of BIG
5145 endian at the head. Assigned the coding-system (Lisp symbol)
5146 `utf-16-be' by default.
5147
5148 o coding-category-utf-16-le
5149
5150 The category for a coding system in which a text has an
5151 Unicode signature (cf. Unicode Standard) in the order of
5152 LITTLE endian at the head. Assigned the coding-system (Lisp
5153 symbol) `utf-16-le' by default.
5154
1397dc18
KH
5155 o coding-category-ccl
5156
5157 The category for a coding system of which encoder/decoder is
5158 written in CCL programs. The default value is nil, i.e., no
5159 coding system is assigned.
5160
4ed46869
KH
5161 o coding-category-binary
5162
5163 The category for a coding system not categorized in any of the
5164 above. Assigned the coding-system (Lisp symbol)
e0e989f6 5165 `no-conversion' by default.
4ed46869
KH
5166
5167 Each of them is a Lisp symbol and the value is an actual
df7492f9 5168 `coding-system's (this is also a Lisp symbol) assigned by a user.
4ed46869
KH
5169 What Emacs does actually is to detect a category of coding system.
5170 Then, it uses a `coding-system' assigned to it. If Emacs can't
df7492f9 5171 decide only one possible category, it selects a category of the
4ed46869
KH
5172 highest priority. Priorities of categories are also specified by a
5173 user in a Lisp variable `coding-category-list'.
5174
5175*/
5176
df7492f9
KH
/* Which end-of-line formats have been seen in a text.  The non-zero
   values are distinct bits, so several of them can be OR-ed into a
   single int.  */
#define EOL_SEEN_NONE   0
#define EOL_SEEN_LF     (1 << 0)
#define EOL_SEEN_CR     (1 << 1)
#define EOL_SEEN_CRLF   (1 << 2)
66cfb530 5181
ff0dacd7
KH
5182/* Detect how end-of-line of a text of length SRC_BYTES pointed by
5183 SOURCE is encoded. If CATEGORY is one of
5184 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5185 two-byte, else they are encoded by one-byte.
5186
5187 Return one of EOL_SEEN_XXX. */
4ed46869 5188
bc4bc72a 5189#define MAX_EOL_CHECK_COUNT 3
d46c5b12
KH
5190
5191static int
89528eb3 5192detect_eol (source, src_bytes, category)
d46c5b12 5193 unsigned char *source;
df7492f9 5194 EMACS_INT src_bytes;
89528eb3 5195 enum coding_category category;
4ed46869 5196{
d46c5b12 5197 unsigned char *src = source, *src_end = src + src_bytes;
4ed46869 5198 unsigned char c;
df7492f9
KH
5199 int total = 0;
5200 int eol_seen = EOL_SEEN_NONE;
4ed46869 5201
89528eb3 5202 if ((1 << category) & CATEGORY_MASK_UTF_16)
4ed46869 5203 {
df7492f9 5204 int msb, lsb;
fa42c37f 5205
89528eb3
KH
5206 msb = category == (coding_category_utf_16_le
5207 | coding_category_utf_16_le_nosig);
df7492f9 5208 lsb = 1 - msb;
fa42c37f 5209
df7492f9 5210 while (src + 1 < src_end)
fa42c37f 5211 {
df7492f9
KH
5212 c = src[lsb];
5213 if (src[msb] == 0 && (c == '\n' || c == '\r'))
fa42c37f 5214 {
df7492f9
KH
5215 int this_eol;
5216
5217 if (c == '\n')
5218 this_eol = EOL_SEEN_LF;
5219 else if (src + 3 >= src_end
5220 || src[msb + 2] != 0
5221 || src[lsb + 2] != '\n')
5222 this_eol = EOL_SEEN_CR;
fa42c37f 5223 else
8f924df7 5224 this_eol = EOL_SEEN_CRLF;
df7492f9
KH
5225
5226 if (eol_seen == EOL_SEEN_NONE)
5227 /* This is the first end-of-line. */
5228 eol_seen = this_eol;
5229 else if (eol_seen != this_eol)
fa42c37f 5230 {
df7492f9
KH
5231 /* The found type is different from what found before. */
5232 eol_seen = EOL_SEEN_LF;
5233 break;
fa42c37f 5234 }
df7492f9
KH
5235 if (++total == MAX_EOL_CHECK_COUNT)
5236 break;
fa42c37f 5237 }
df7492f9 5238 src += 2;
fa42c37f 5239 }
bcf26d6a 5240 }
d46c5b12 5241 else
c4825358 5242 {
df7492f9 5243 while (src < src_end)
27901516 5244 {
df7492f9
KH
5245 c = *src++;
5246 if (c == '\n' || c == '\r')
5247 {
5248 int this_eol;
d46c5b12 5249
df7492f9
KH
5250 if (c == '\n')
5251 this_eol = EOL_SEEN_LF;
5252 else if (src >= src_end || *src != '\n')
5253 this_eol = EOL_SEEN_CR;
5254 else
5255 this_eol = EOL_SEEN_CRLF, src++;
d46c5b12 5256
df7492f9
KH
5257 if (eol_seen == EOL_SEEN_NONE)
5258 /* This is the first end-of-line. */
5259 eol_seen = this_eol;
5260 else if (eol_seen != this_eol)
5261 {
5262 /* The found type is different from what found before. */
5263 eol_seen = EOL_SEEN_LF;
5264 break;
5265 }
5266 if (++total == MAX_EOL_CHECK_COUNT)
5267 break;
5268 }
5269 }
73be902c 5270 }
df7492f9 5271 return eol_seen;
73be902c
KH
5272}
5273
df7492f9 5274
24a73b0a 5275static Lisp_Object
df7492f9
KH
5276adjust_coding_eol_type (coding, eol_seen)
5277 struct coding_system *coding;
5278 int eol_seen;
73be902c 5279{
0be8721c 5280 Lisp_Object eol_type;
8f924df7 5281
df7492f9
KH
5282 eol_type = CODING_ID_EOL_TYPE (coding->id);
5283 if (eol_seen & EOL_SEEN_LF)
24a73b0a
KH
5284 {
5285 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5286 eol_type = Qunix;
5287 }
6f197c07 5288 else if (eol_seen & EOL_SEEN_CRLF)
24a73b0a
KH
5289 {
5290 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5291 eol_type = Qdos;
5292 }
6f197c07 5293 else if (eol_seen & EOL_SEEN_CR)
24a73b0a
KH
5294 {
5295 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5296 eol_type = Qmac;
5297 }
5298 return eol_type;
d46c5b12 5299}
4ed46869 5300
df7492f9
KH
5301/* Detect how a text specified in CODING is encoded. If a coding
5302 system is detected, update fields of CODING by the detected coding
5303 system. */
0a28aafb 5304
df7492f9
KH
5305void
5306detect_coding (coding)
d46c5b12 5307 struct coding_system *coding;
d46c5b12 5308{
8f924df7 5309 const unsigned char *src, *src_end;
df7492f9 5310 Lisp_Object attrs, coding_type;
d46c5b12 5311
df7492f9
KH
5312 coding->consumed = coding->consumed_char = 0;
5313 coding->produced = coding->produced_char = 0;
5314 coding_set_source (coding);
1c3478b0 5315
df7492f9 5316 src_end = coding->source + coding->src_bytes;
1c3478b0 5317
df7492f9
KH
5318 /* If we have not yet decided the text encoding type, detect it
5319 now. */
5320 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
b73bfc1c 5321 {
df7492f9
KH
5322 int c, i;
5323
24a73b0a 5324 for (i = 0, src = coding->source; src < src_end; i++, src++)
d46c5b12 5325 {
df7492f9 5326 c = *src;
24a73b0a
KH
5327 if (c & 0x80 || (c < 0x20 && (c == 0
5328 || c == ISO_CODE_ESC
df7492f9
KH
5329 || c == ISO_CODE_SI
5330 || c == ISO_CODE_SO)))
5331 break;
d46c5b12 5332 }
df7492f9
KH
5333 coding->head_ascii = src - (coding->source + coding->consumed);
5334
5335 if (coding->head_ascii < coding->src_bytes)
d46c5b12 5336 {
ff0dacd7
KH
5337 struct coding_detection_info detect_info;
5338 enum coding_category category;
5339 struct coding_system *this;
df7492f9 5340
ff0dacd7 5341 detect_info.checked = detect_info.found = detect_info.rejected = 0;
df7492f9 5342 for (i = 0; i < coding_category_raw_text; i++)
d46c5b12 5343 {
ff0dacd7
KH
5344 category = coding_priorities[i];
5345 this = coding_categories + category;
df7492f9 5346 if (this->id < 0)
fa42c37f 5347 {
df7492f9 5348 /* No coding system of this category is defined. */
ff0dacd7 5349 detect_info.rejected |= (1 << category);
fa42c37f 5350 }
ff0dacd7 5351 else if (category >= coding_category_raw_text)
89528eb3 5352 continue;
ff0dacd7 5353 else if (detect_info.checked & (1 << category))
fa42c37f 5354 {
ff0dacd7
KH
5355 if (detect_info.found & (1 << category))
5356 break;
fa42c37f 5357 }
ff0dacd7
KH
5358 else if ((*(this->detector)) (coding, &detect_info)
5359 && detect_info.found & (1 << category))
24a73b0a
KH
5360 {
5361 if (category == coding_category_utf_16_auto)
5362 {
5363 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5364 category = coding_category_utf_16_le;
5365 else
5366 category = coding_category_utf_16_be;
5367 }
5368 break;
5369 }
d46c5b12 5370 }
ff0dacd7
KH
5371 if (i < coding_category_raw_text)
5372 setup_coding_system (CODING_ID_NAME (this->id), coding);
5373 else if (detect_info.rejected == CATEGORY_MASK_ANY)
df7492f9 5374 setup_coding_system (Qraw_text, coding);
ff0dacd7 5375 else if (detect_info.rejected)
df7492f9 5376 for (i = 0; i < coding_category_raw_text; i++)
ff0dacd7
KH
5377 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5378 {
5379 this = coding_categories + coding_priorities[i];
5380 setup_coding_system (CODING_ID_NAME (this->id), coding);
5381 break;
5382 }
d46c5b12 5383 }
b73bfc1c 5384 }
24a73b0a
KH
5385 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5386 == coding_category_utf_16_auto)
b49a1807
KH
5387 {
5388 Lisp_Object coding_systems;
5389 struct coding_detection_info detect_info;
5390
5391 coding_systems
5392 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5393 detect_info.found = detect_info.rejected = 0;
5394 if (CONSP (coding_systems)
24a73b0a 5395 && detect_coding_utf_16 (coding, &detect_info))
b49a1807
KH
5396 {
5397 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5398 setup_coding_system (XCAR (coding_systems), coding);
24a73b0a 5399 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
b49a1807
KH
5400 setup_coding_system (XCDR (coding_systems), coding);
5401 }
5402 }
4ed46869 5403}
4ed46869 5404
d46c5b12 5405
aaaf0b1e 5406static void
df7492f9 5407decode_eol (coding)
aaaf0b1e 5408 struct coding_system *coding;
aaaf0b1e 5409{
24a73b0a
KH
5410 Lisp_Object eol_type;
5411 unsigned char *p, *pbeg, *pend;
5412
5413 eol_type = CODING_ID_EOL_TYPE (coding->id);
5414 if (EQ (eol_type, Qunix))
5415 return;
5416
5417 if (NILP (coding->dst_object))
5418 pbeg = coding->destination;
5419 else
5420 pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
5421 pend = pbeg + coding->produced;
5422
5423 if (VECTORP (eol_type))
aaaf0b1e 5424 {
df7492f9 5425 int eol_seen = EOL_SEEN_NONE;
4ed46869 5426
24a73b0a 5427 for (p = pbeg; p < pend; p++)
aaaf0b1e 5428 {
df7492f9
KH
5429 if (*p == '\n')
5430 eol_seen |= EOL_SEEN_LF;
5431 else if (*p == '\r')
aaaf0b1e 5432 {
df7492f9 5433 if (p + 1 < pend && *(p + 1) == '\n')
aaaf0b1e 5434 {
df7492f9
KH
5435 eol_seen |= EOL_SEEN_CRLF;
5436 p++;
aaaf0b1e 5437 }
aaaf0b1e 5438 else
df7492f9 5439 eol_seen |= EOL_SEEN_CR;
aaaf0b1e 5440 }
aaaf0b1e 5441 }
24a73b0a
KH
5442 if (eol_seen != EOL_SEEN_NONE
5443 && eol_seen != EOL_SEEN_LF
5444 && eol_seen != EOL_SEEN_CRLF
5445 && eol_seen != EOL_SEEN_CR)
5446 eol_seen = EOL_SEEN_LF;
df7492f9 5447 if (eol_seen != EOL_SEEN_NONE)
24a73b0a 5448 eol_type = adjust_coding_eol_type (coding, eol_seen);
aaaf0b1e 5449 }
d46c5b12 5450
24a73b0a 5451 if (EQ (eol_type, Qmac))
27901516 5452 {
24a73b0a 5453 for (p = pbeg; p < pend; p++)
df7492f9
KH
5454 if (*p == '\r')
5455 *p = '\n';
4ed46869 5456 }
24a73b0a 5457 else if (EQ (eol_type, Qdos))
df7492f9 5458 {
24a73b0a 5459 int n = 0;
b73bfc1c 5460
24a73b0a
KH
5461 if (NILP (coding->dst_object))
5462 {
5463 for (p = pend - 2; p >= pbeg; p--)
5464 if (*p == '\r')
5465 {
5466 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
5467 n++;
5468 }
5469 }
5470 else
5471 {
5472 for (p = pend - 2; p >= pbeg; p--)
5473 if (*p == '\r')
5474 {
5475 int pos_byte = coding->dst_pos_byte + (p - pbeg);
5476 int pos = BYTE_TO_CHAR (pos_byte);
5477
5478 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
5479 n++;
5480 }
5481 }
5482 coding->produced -= n;
5483 coding->produced_char -= n;
aaaf0b1e 5484 }
4ed46869
KH
5485}
5486
7d64c6ad
KH
5487
5488/* Return a translation table from coding system attribute vector ATTRS
5489 for encoding (ENCODEP is nonzero) or decoding (ENCODEP is zeor). */
5490
5491static INLINE
5492get_translation_table (attrs, encodep)
5493{
5494 Lisp_Object standard, translation_table;
5495
5496 if (encodep)
5497 translation_table = CODING_ATTR_ENCODE_TBL (attrs),
5498 standard = Vstandard_translation_table_for_encode;
5499 else
5500 translation_table = CODING_ATTR_DECODE_TBL (attrs),
5501 standard = Vstandard_translation_table_for_decode;
5502 if (! NILP (translation_table) && SYMBOLP (translation_table))
5503 translation_table = Fget (translation_table, Qtranslation_table);
5504 if (NILP (translation_table))
5505 translation_table = standard;
5506 if (! CHAR_TABLE_P (translation_table))
5507 translation_table = Qnil;
5508 return translation_table;
5509}
5510
5511
df7492f9
KH
5512static void
5513translate_chars (coding, table)
4ed46869 5514 struct coding_system *coding;
df7492f9 5515 Lisp_Object table;
4ed46869 5516{
df7492f9
KH
5517 int *charbuf = coding->charbuf;
5518 int *charbuf_end = charbuf + coding->charbuf_used;
5519 int c;
d46c5b12 5520
df7492f9
KH
5521 if (coding->chars_at_source)
5522 return;
4ed46869 5523
df7492f9 5524 while (charbuf < charbuf_end)
8844fa83 5525 {
df7492f9
KH
5526 c = *charbuf;
5527 if (c < 0)
7d64c6ad 5528 charbuf += -c;
df7492f9
KH
5529 else
5530 *charbuf++ = translate_char (table, c);
8844fa83 5531 }
df7492f9 5532}
bc4bc72a 5533
d46c5b12 5534static int
df7492f9
KH
5535produce_chars (coding)
5536 struct coding_system *coding;
4ed46869 5537{
df7492f9
KH
5538 unsigned char *dst = coding->destination + coding->produced;
5539 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5540 int produced;
5541 int produced_chars = 0;
4ed46869 5542
df7492f9 5543 if (! coding->chars_at_source)
4ed46869 5544 {
df7492f9 5545 /* Characters are in coding->charbuf. */
fba4576f
AS
5546 int *buf = coding->charbuf;
5547 int *buf_end = buf + coding->charbuf_used;
df7492f9 5548 unsigned char *adjusted_dst_end;
4ed46869 5549
df7492f9
KH
5550 if (BUFFERP (coding->src_object)
5551 && EQ (coding->src_object, coding->dst_object))
8f924df7 5552 dst_end = ((unsigned char *) coding->source) + coding->consumed;
df7492f9 5553 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
4ed46869 5554
df7492f9 5555 while (buf < buf_end)
4ed46869 5556 {
df7492f9 5557 int c = *buf++;
bc4bc72a 5558
df7492f9 5559 if (dst >= adjusted_dst_end)
d46c5b12 5560 {
df7492f9
KH
5561 dst = alloc_destination (coding,
5562 buf_end - buf + MAX_MULTIBYTE_LENGTH,
5563 dst);
5564 dst_end = coding->destination + coding->dst_bytes;
5565 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5566 }
5567 if (c >= 0)
5568 {
5569 if (coding->dst_multibyte
5570 || ! CHAR_BYTE8_P (c))
5571 CHAR_STRING_ADVANCE (c, dst);
5572 else
5573 *dst++ = CHAR_TO_BYTE8 (c);
5574 produced_chars++;
d46c5b12 5575 }
df7492f9 5576 else
d3e4cb56
KH
5577 /* This is an annotation datum. (-C) is the length of
5578 it. */
5579 buf += -c - 1;
4ed46869
KH
5580 }
5581 }
fa42c37f 5582 else
fa42c37f 5583 {
8f924df7
KH
5584 const unsigned char *src = coding->source;
5585 const unsigned char *src_end = src + coding->src_bytes;
df7492f9 5586 Lisp_Object eol_type;
fa42c37f 5587
df7492f9 5588 eol_type = CODING_ID_EOL_TYPE (coding->id);
4ed46869 5589
df7492f9 5590 if (coding->src_multibyte != coding->dst_multibyte)
fa42c37f 5591 {
df7492f9 5592 if (coding->src_multibyte)
fa42c37f 5593 {
71c81426 5594 int multibytep = 1;
df7492f9 5595 int consumed_chars;
d46c5b12 5596
df7492f9
KH
5597 while (1)
5598 {
8f924df7 5599 const unsigned char *src_base = src;
df7492f9 5600 int c;
b73bfc1c 5601
df7492f9
KH
5602 ONE_MORE_BYTE (c);
5603 if (c == '\r')
5604 {
5605 if (EQ (eol_type, Qdos))
5606 {
98725083
KH
5607 if (src == src_end)
5608 {
065e3595
KH
5609 record_conversion_result
5610 (coding, CODING_RESULT_INSUFFICIENT_SRC);
98725083
KH
5611 goto no_more_source;
5612 }
5613 if (*src == '\n')
df7492f9
KH
5614 c = *src++;
5615 }
5616 else if (EQ (eol_type, Qmac))
5617 c = '\n';
5618 }
5619 if (dst == dst_end)
5620 {
2c78b7e1 5621 coding->consumed = src - coding->source;
b73bfc1c 5622
2c78b7e1 5623 if (EQ (coding->src_object, coding->dst_object))
8f924df7 5624 dst_end = (unsigned char *) src;
2c78b7e1
KH
5625 if (dst == dst_end)
5626 {
5627 dst = alloc_destination (coding, src_end - src + 1,
5628 dst);
5629 dst_end = coding->destination + coding->dst_bytes;
5630 coding_set_source (coding);
5631 src = coding->source + coding->consumed;
5632 src_end = coding->source + coding->src_bytes;
5633 }
df7492f9
KH
5634 }
5635 *dst++ = c;
5636 produced_chars++;
5637 }
5638 no_more_source:
5639 ;
fa42c37f
KH
5640 }
5641 else
df7492f9
KH
5642 while (src < src_end)
5643 {
71c81426 5644 int multibytep = 1;
df7492f9 5645 int c = *src++;
b73bfc1c 5646
df7492f9
KH
5647 if (c == '\r')
5648 {
5649 if (EQ (eol_type, Qdos))
5650 {
5651 if (src < src_end
5652 && *src == '\n')
5653 c = *src++;
5654 }
5655 else if (EQ (eol_type, Qmac))
5656 c = '\n';
5657 }
5658 if (dst >= dst_end - 1)
5659 {
2c78b7e1 5660 coding->consumed = src - coding->source;
df7492f9 5661
2c78b7e1 5662 if (EQ (coding->src_object, coding->dst_object))
8f924df7 5663 dst_end = (unsigned char *) src;
2c78b7e1
KH
5664 if (dst >= dst_end - 1)
5665 {
5666 dst = alloc_destination (coding, src_end - src + 2,
5667 dst);
5668 dst_end = coding->destination + coding->dst_bytes;
5669 coding_set_source (coding);
5670 src = coding->source + coding->consumed;
5671 src_end = coding->source + coding->src_bytes;
5672 }
df7492f9
KH
5673 }
5674 EMIT_ONE_BYTE (c);
5675 }
d46c5b12 5676 }
df7492f9
KH
5677 else
5678 {
5679 if (!EQ (coding->src_object, coding->dst_object))
fa42c37f 5680 {
df7492f9 5681 int require = coding->src_bytes - coding->dst_bytes;
4ed46869 5682
df7492f9 5683 if (require > 0)
fa42c37f 5684 {
df7492f9
KH
5685 EMACS_INT offset = src - coding->source;
5686
5687 dst = alloc_destination (coding, require, dst);
5688 coding_set_source (coding);
5689 src = coding->source + offset;
5690 src_end = coding->source + coding->src_bytes;
fa42c37f
KH
5691 }
5692 }
df7492f9
KH
5693 produced_chars = coding->src_chars;
5694 while (src < src_end)
fa42c37f 5695 {
df7492f9
KH
5696 int c = *src++;
5697
5698 if (c == '\r')
5699 {
5700 if (EQ (eol_type, Qdos))
5701 {
5702 if (src < src_end
5703 && *src == '\n')
5704 c = *src++;
5705 produced_chars--;
5706 }
5707 else if (EQ (eol_type, Qmac))
5708 c = '\n';
5709 }
5710 *dst++ = c;
fa42c37f
KH
5711 }
5712 }
2c78b7e1
KH
5713 coding->consumed = coding->src_bytes;
5714 coding->consumed_char = coding->src_chars;
fa42c37f
KH
5715 }
5716
df7492f9
KH
5717 produced = dst - (coding->destination + coding->produced);
5718 if (BUFFERP (coding->dst_object))
5719 insert_from_gap (produced_chars, produced);
5720 coding->produced += produced;
5721 coding->produced_char += produced_chars;
5722 return produced_chars;
fa42c37f
KH
5723}
5724
ff0dacd7
KH
5725/* Compose text in CODING->object according to the annotation data at
5726 CHARBUF. CHARBUF is an array:
5727 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
df7492f9 5728 */
4ed46869 5729
df7492f9
KH
5730static INLINE void
5731produce_composition (coding, charbuf)
4ed46869 5732 struct coding_system *coding;
df7492f9 5733 int *charbuf;
4ed46869 5734{
df7492f9 5735 int len;
ff0dacd7 5736 EMACS_INT from, to;
df7492f9 5737 enum composition_method method;
df7492f9 5738 Lisp_Object components;
fa42c37f 5739
df7492f9 5740 len = -charbuf[0];
ff0dacd7
KH
5741 from = coding->dst_pos + charbuf[2];
5742 to = coding->dst_pos + charbuf[3];
5743 method = (enum composition_method) (charbuf[4]);
d46c5b12 5744
df7492f9
KH
5745 if (method == COMPOSITION_RELATIVE)
5746 components = Qnil;
d46c5b12 5747 else
d46c5b12 5748 {
df7492f9
KH
5749 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5750 int i;
b73bfc1c 5751
df7492f9
KH
5752 len -= 5;
5753 charbuf += 5;
5754 for (i = 0; i < len; i++)
5755 args[i] = make_number (charbuf[i]);
5756 components = (method == COMPOSITION_WITH_ALTCHARS
5757 ? Fstring (len, args) : Fvector (len, args));
d46c5b12 5758 }
ff0dacd7 5759 compose_text (from, to, components, Qnil, coding->dst_object);
d46c5b12
KH
5760}
5761
d46c5b12 5762
ff0dacd7
KH
5763/* Put `charset' property on text in CODING->object according to
5764 the annotation data at CHARBUF. CHARBUF is an array:
5765 [ -LENGTH ANNOTATION_MASK FROM TO CHARSET-ID ]
5766 */
d46c5b12 5767
ff0dacd7
KH
5768static INLINE void
5769produce_charset (coding, charbuf)
d46c5b12 5770 struct coding_system *coding;
ff0dacd7 5771 int *charbuf;
d46c5b12 5772{
ff0dacd7
KH
5773 EMACS_INT from = coding->dst_pos + charbuf[2];
5774 EMACS_INT to = coding->dst_pos + charbuf[3];
5775 struct charset *charset = CHARSET_FROM_ID (charbuf[4]);
b73bfc1c 5776
ff0dacd7
KH
5777 Fput_text_property (make_number (from), make_number (to),
5778 Qcharset, CHARSET_NAME (charset),
5779 coding->dst_object);
d46c5b12
KH
5780}
5781
d46c5b12 5782
df7492f9
KH
5783#define CHARBUF_SIZE 0x4000
5784
/* Allocate CODING->charbuf with alloca, trying CHARBUF_SIZE ints
   first and halving the request while it stays above 1024 ints.
   Record the size obtained in CODING->charbuf_size.  If no area
   could be obtained, record CODING_RESULT_INSUFFICIENT_MEM and make
   the *calling function* return CODING->result.  Because of alloca
   and the embedded `return', this must be expanded directly in the
   body of the conversion function.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int size = CHARBUF_SIZE;                                            \
                                                                        \
    (coding)->charbuf = NULL;                                           \
    while (size > 1024)                                                 \
      {                                                                 \
        (coding)->charbuf = (int *) alloca (sizeof (int) * size);       \
        if ((coding)->charbuf)                                          \
          break;                                                        \
        size >>= 1;                                                     \
      }                                                                 \
    if (! (coding)->charbuf)                                            \
      {                                                                 \
        record_conversion_result ((coding),                             \
                                  CODING_RESULT_INSUFFICIENT_MEM);      \
        return (coding)->result;                                        \
      }                                                                 \
    (coding)->charbuf_size = size;                                      \
  } while (0)
4ed46869 5804
d46c5b12
KH
5805
5806static void
df7492f9 5807produce_annotation (coding)
d46c5b12 5808 struct coding_system *coding;
d46c5b12 5809{
df7492f9
KH
5810 int *charbuf = coding->charbuf;
5811 int *charbuf_end = charbuf + coding->charbuf_used;
d46c5b12 5812
ff0dacd7
KH
5813 if (NILP (coding->dst_object))
5814 return;
d46c5b12 5815
df7492f9 5816 while (charbuf < charbuf_end)
a84f1519 5817 {
df7492f9
KH
5818 if (*charbuf >= 0)
5819 charbuf++;
d46c5b12 5820 else
d46c5b12 5821 {
df7492f9 5822 int len = -*charbuf;
ff0dacd7 5823 switch (charbuf[1])
df7492f9
KH
5824 {
5825 case CODING_ANNOTATE_COMPOSITION_MASK:
5826 produce_composition (coding, charbuf);
5827 break;
ff0dacd7
KH
5828 case CODING_ANNOTATE_CHARSET_MASK:
5829 produce_charset (coding, charbuf);
5830 break;
df7492f9
KH
5831 default:
5832 abort ();
5833 }
5834 charbuf += len;
d46c5b12 5835 }
a84f1519 5836 }
d46c5b12
KH
5837}
5838
df7492f9
KH
5839/* Decode the data at CODING->src_object into CODING->dst_object.
5840 CODING->src_object is a buffer, a string, or nil.
5841 CODING->dst_object is a buffer.
d46c5b12 5842
df7492f9
KH
5843 If CODING->src_object is a buffer, it must be the current buffer.
5844 In this case, if CODING->src_pos is positive, it is a position of
5845 the source text in the buffer, otherwise, the source text is in the
5846 gap area of the buffer, and CODING->src_pos specifies the offset of
5847 the text from GPT (which must be the same as PT). If this is the
5848 same buffer as CODING->dst_object, CODING->src_pos must be
5849 negative.
d46c5b12 5850
df7492f9
KH
   If CODING->src_object is a string, CODING->src_pos is an index to
5852 that string.
d46c5b12 5853
df7492f9
KH
5854 If CODING->src_object is nil, CODING->source must already point to
5855 the non-relocatable memory area. In this case, CODING->src_pos is
5856 an offset from CODING->source.
73be902c 5857
df7492f9
KH
5858 The decoded data is inserted at the current point of the buffer
5859 CODING->dst_object.
5860*/
d46c5b12 5861
df7492f9
KH
5862static int
5863decode_coding (coding)
d46c5b12 5864 struct coding_system *coding;
d46c5b12 5865{
df7492f9 5866 Lisp_Object attrs;
24a73b0a 5867 Lisp_Object undo_list;
7d64c6ad 5868 Lisp_Object translation_table;
d46c5b12 5869
df7492f9
KH
5870 if (BUFFERP (coding->src_object)
5871 && coding->src_pos > 0
5872 && coding->src_pos < GPT
5873 && coding->src_pos + coding->src_chars > GPT)
5874 move_gap_both (coding->src_pos, coding->src_pos_byte);
d46c5b12 5875
24a73b0a 5876 undo_list = Qt;
df7492f9 5877 if (BUFFERP (coding->dst_object))
1c3478b0 5878 {
df7492f9
KH
5879 if (current_buffer != XBUFFER (coding->dst_object))
5880 set_buffer_internal (XBUFFER (coding->dst_object));
5881 if (GPT != PT)
5882 move_gap_both (PT, PT_BYTE);
24a73b0a
KH
5883 undo_list = current_buffer->undo_list;
5884 current_buffer->undo_list = Qt;
1c3478b0
KH
5885 }
5886
df7492f9
KH
5887 coding->consumed = coding->consumed_char = 0;
5888 coding->produced = coding->produced_char = 0;
5889 coding->chars_at_source = 0;
065e3595 5890 record_conversion_result (coding, CODING_RESULT_SUCCESS);
df7492f9 5891 coding->errors = 0;
1c3478b0 5892
df7492f9
KH
5893 ALLOC_CONVERSION_WORK_AREA (coding);
5894
5895 attrs = CODING_ID_ATTRS (coding->id);
7d64c6ad 5896 translation_table = get_translation_table (attrs, 1);
df7492f9
KH
5897
5898 do
b73bfc1c 5899 {
df7492f9
KH
5900 coding_set_source (coding);
5901 coding->annotated = 0;
5902 (*(coding->decoder)) (coding);
7d64c6ad
KH
5903 if (!NILP (translation_table))
5904 translate_chars (coding, translation_table);
df7492f9
KH
5905 coding_set_destination (coding);
5906 produce_chars (coding);
5907 if (coding->annotated)
5908 produce_annotation (coding);
d46c5b12 5909 }
df7492f9
KH
5910 while (coding->consumed < coding->src_bytes
5911 && ! coding->result);
d46c5b12 5912
df7492f9
KH
5913 coding->carryover_bytes = 0;
5914 if (coding->consumed < coding->src_bytes)
d46c5b12 5915 {
df7492f9 5916 int nbytes = coding->src_bytes - coding->consumed;
8f924df7 5917 const unsigned char *src;
df7492f9
KH
5918
5919 coding_set_source (coding);
5920 coding_set_destination (coding);
5921 src = coding->source + coding->consumed;
5922
5923 if (coding->mode & CODING_MODE_LAST_BLOCK)
1c3478b0 5924 {
df7492f9
KH
5925 /* Flush out unprocessed data as binary chars. We are sure
5926 that the number of data is less than the size of
5927 coding->charbuf. */
065e3595 5928 coding->charbuf_used = 0;
df7492f9 5929 while (nbytes-- > 0)
1c3478b0 5930 {
df7492f9 5931 int c = *src++;
98725083
KH
5932
5933 coding->charbuf[coding->charbuf_used++] = (c & 0x80 ? - c : c);
1c3478b0 5934 }
df7492f9 5935 produce_chars (coding);
d46c5b12 5936 }
d46c5b12 5937 else
df7492f9
KH
5938 {
5939 /* Record unprocessed bytes in coding->carryover. We are
5940 sure that the number of data is less than the size of
5941 coding->carryover. */
5942 unsigned char *p = coding->carryover;
5943
5944 coding->carryover_bytes = nbytes;
5945 while (nbytes-- > 0)
5946 *p++ = *src++;
1c3478b0 5947 }
df7492f9 5948 coding->consumed = coding->src_bytes;
b73bfc1c 5949 }
69f76525 5950
24a73b0a
KH
5951 if (BUFFERP (coding->dst_object))
5952 {
5953 current_buffer->undo_list = undo_list;
5954 record_insert (coding->dst_pos, coding->produced_char);
5955 }
5956 if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
5957 decode_eol (coding);
73be902c 5958 return coding->result;
4ed46869
KH
5959}
5960
aaaf0b1e 5961
e1c23804 5962/* Extract an annotation datum from a composition starting at POS and
ff0dacd7
KH
5963 ending before LIMIT of CODING->src_object (buffer or string), store
5964 the data in BUF, set *STOP to a starting position of the next
5965 composition (if any) or to LIMIT, and return the address of the
5966 next element of BUF.
5967
5968 If such an annotation is not found, set *STOP to a starting
5969 position of a composition after POS (if any) or to LIMIT, and
5970 return BUF. */
5971
5972static INLINE int *
5973handle_composition_annotation (pos, limit, coding, buf, stop)
5974 EMACS_INT pos, limit;
aaaf0b1e 5975 struct coding_system *coding;
ff0dacd7
KH
5976 int *buf;
5977 EMACS_INT *stop;
aaaf0b1e 5978{
ff0dacd7
KH
5979 EMACS_INT start, end;
5980 Lisp_Object prop;
aaaf0b1e 5981
ff0dacd7
KH
5982 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
5983 || end > limit)
5984 *stop = limit;
5985 else if (start > pos)
5986 *stop = start;
5987 else
aaaf0b1e 5988 {
ff0dacd7 5989 if (start == pos)
aaaf0b1e 5990 {
ff0dacd7
KH
5991 /* We found a composition. Store the corresponding
5992 annotation data in BUF. */
5993 int *head = buf;
5994 enum composition_method method = COMPOSITION_METHOD (prop);
5995 int nchars = COMPOSITION_LENGTH (prop);
5996
5997 ADD_COMPOSITION_DATA (buf, 0, nchars, method);
5998 if (method != COMPOSITION_RELATIVE)
aaaf0b1e 5999 {
ff0dacd7
KH
6000 Lisp_Object components;
6001 int len, i, i_byte;
6002
6003 components = COMPOSITION_COMPONENTS (prop);
6004 if (VECTORP (components))
aaaf0b1e 6005 {
ff0dacd7
KH
6006 len = XVECTOR (components)->size;
6007 for (i = 0; i < len; i++)
6008 *buf++ = XINT (AREF (components, i));
aaaf0b1e 6009 }
ff0dacd7 6010 else if (STRINGP (components))
aaaf0b1e 6011 {
8f924df7 6012 len = SCHARS (components);
ff0dacd7
KH
6013 i = i_byte = 0;
6014 while (i < len)
6015 {
6016 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
6017 buf++;
6018 }
6019 }
6020 else if (INTEGERP (components))
6021 {
6022 len = 1;
6023 *buf++ = XINT (components);
6024 }
6025 else if (CONSP (components))
6026 {
6027 for (len = 0; CONSP (components);
6028 len++, components = XCDR (components))
6029 *buf++ = XINT (XCAR (components));
aaaf0b1e 6030 }
aaaf0b1e 6031 else
ff0dacd7
KH
6032 abort ();
6033 *head -= len;
aaaf0b1e 6034 }
aaaf0b1e 6035 }
ff0dacd7
KH
6036
6037 if (find_composition (end, limit, &start, &end, &prop,
6038 coding->src_object)
6039 && end <= limit)
6040 *stop = start;
6041 else
6042 *stop = limit;
aaaf0b1e 6043 }
ff0dacd7
KH
6044 return buf;
6045}
6046
6047
e1c23804 6048/* Extract an annotation datum from a text property `charset' at POS of
ff0dacd7
KH
   CODING->src_object (buffer or string), store the data in BUF, set
6050 *STOP to the position where the value of `charset' property changes
6051 (limiting by LIMIT), and return the address of the next element of
6052 BUF.
6053
6054 If the property value is nil, set *STOP to the position where the
6055 property value is non-nil (limiting by LIMIT), and return BUF. */
6056
6057static INLINE int *
6058handle_charset_annotation (pos, limit, coding, buf, stop)
6059 EMACS_INT pos, limit;
6060 struct coding_system *coding;
6061 int *buf;
6062 EMACS_INT *stop;
6063{
6064 Lisp_Object val, next;
6065 int id;
6066
6067 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
6068 if (! NILP (val) && CHARSETP (val))
6069 id = XINT (CHARSET_SYMBOL_ID (val));
6070 else
6071 id = -1;
6072 ADD_CHARSET_DATA (buf, 0, 0, id);
6073 next = Fnext_single_property_change (make_number (pos), Qcharset,
6074 coding->src_object,
6075 make_number (limit));
6076 *stop = XINT (next);
6077 return buf;
6078}
6079
6080
df7492f9
KH
6081static void
6082consume_chars (coding)
6083 struct coding_system *coding;
6084{
6085 int *buf = coding->charbuf;
ff0dacd7 6086 int *buf_end = coding->charbuf + coding->charbuf_size;
7c78e542 6087 const unsigned char *src = coding->source + coding->consumed;
4776e638 6088 const unsigned char *src_end = coding->source + coding->src_bytes;
ff0dacd7
KH
6089 EMACS_INT pos = coding->src_pos + coding->consumed_char;
6090 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
df7492f9
KH
6091 int multibytep = coding->src_multibyte;
6092 Lisp_Object eol_type;
6093 int c;
ff0dacd7 6094 EMACS_INT stop, stop_composition, stop_charset;
88993dfd 6095
df7492f9
KH
6096 eol_type = CODING_ID_EOL_TYPE (coding->id);
6097 if (VECTORP (eol_type))
6098 eol_type = Qunix;
88993dfd 6099
df7492f9
KH
6100 /* Note: composition handling is not yet implemented. */
6101 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
ec6d2bb8 6102
0b5670c9
KH
6103 if (NILP (coding->src_object))
6104 stop = stop_composition = stop_charset = end_pos;
ff0dacd7 6105 else
0b5670c9
KH
6106 {
6107 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6108 stop = stop_composition = pos;
6109 else
6110 stop = stop_composition = end_pos;
6111 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6112 stop = stop_charset = pos;
6113 else
6114 stop_charset = end_pos;
6115 }
ec6d2bb8 6116
24a73b0a 6117 /* Compensate for CRLF and conversion. */
ff0dacd7 6118 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
df7492f9 6119 while (buf < buf_end)
aaaf0b1e 6120 {
df7492f9 6121 if (pos == stop)
ec6d2bb8 6122 {
df7492f9
KH
6123 if (pos == end_pos)
6124 break;
ff0dacd7
KH
6125 if (pos == stop_composition)
6126 buf = handle_composition_annotation (pos, end_pos, coding,
6127 buf, &stop_composition);
6128 if (pos == stop_charset)
6129 buf = handle_charset_annotation (pos, end_pos, coding,
6130 buf, &stop_charset);
6131 stop = (stop_composition < stop_charset
6132 ? stop_composition : stop_charset);
df7492f9
KH
6133 }
6134
6135 if (! multibytep)
4776e638 6136 {
d3e4cb56 6137 EMACS_INT bytes;
aaaf0b1e 6138
d3e4cb56
KH
6139 if (! CODING_FOR_UNIBYTE (coding)
6140 && (bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
4776e638
KH
6141 c = STRING_CHAR_ADVANCE (src), pos += bytes;
6142 else
6143 c = *src++, pos++;
6144 }
df7492f9 6145 else
4776e638 6146 c = STRING_CHAR_ADVANCE (src), pos++;
df7492f9
KH
6147 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6148 c = '\n';
6149 if (! EQ (eol_type, Qunix))
aaaf0b1e 6150 {
df7492f9 6151 if (c == '\n')
aaaf0b1e 6152 {
df7492f9
KH
6153 if (EQ (eol_type, Qdos))
6154 *buf++ = '\r';
6155 else
6156 c = '\r';
aaaf0b1e
KH
6157 }
6158 }
df7492f9 6159 *buf++ = c;
aaaf0b1e 6160 }
ec6d2bb8 6161
df7492f9
KH
6162 coding->consumed = src - coding->source;
6163 coding->consumed_char = pos - coding->src_pos;
6164 coding->charbuf_used = buf - coding->charbuf;
6165 coding->chars_at_source = 0;
aaaf0b1e
KH
6166}
6167
4ed46869 6168
df7492f9
KH
6169/* Encode the text at CODING->src_object into CODING->dst_object.
6170 CODING->src_object is a buffer or a string.
6171 CODING->dst_object is a buffer or nil.
6172
6173 If CODING->src_object is a buffer, it must be the current buffer.
6174 In this case, if CODING->src_pos is positive, it is a position of
   the source text in the buffer, otherwise, the source text is in the
6176 gap area of the buffer, and coding->src_pos specifies the offset of
6177 the text from GPT (which must be the same as PT). If this is the
6178 same buffer as CODING->dst_object, CODING->src_pos must be
6179 negative and CODING should not have `pre-write-conversion'.
6180
6181 If CODING->src_object is a string, CODING should not have
6182 `pre-write-conversion'.
6183
6184 If CODING->dst_object is a buffer, the encoded data is inserted at
6185 the current point of that buffer.
6186
6187 If CODING->dst_object is nil, the encoded data is placed at the
6188 memory area specified by CODING->destination. */
6189
6190static int
6191encode_coding (coding)
4ed46869 6192 struct coding_system *coding;
4ed46869 6193{
df7492f9 6194 Lisp_Object attrs;
7d64c6ad 6195 Lisp_Object translation_table;
9861e777 6196
df7492f9 6197 attrs = CODING_ID_ATTRS (coding->id);
7d64c6ad 6198 translation_table = get_translation_table (attrs, 1);
4ed46869 6199
df7492f9 6200 if (BUFFERP (coding->dst_object))
8844fa83 6201 {
df7492f9
KH
6202 set_buffer_internal (XBUFFER (coding->dst_object));
6203 coding->dst_multibyte
6204 = ! NILP (current_buffer->enable_multibyte_characters);
8844fa83 6205 }
4ed46869 6206
b73bfc1c 6207 coding->consumed = coding->consumed_char = 0;
df7492f9 6208 coding->produced = coding->produced_char = 0;
065e3595 6209 record_conversion_result (coding, CODING_RESULT_SUCCESS);
b73bfc1c 6210 coding->errors = 0;
b73bfc1c 6211
df7492f9 6212 ALLOC_CONVERSION_WORK_AREA (coding);
4ed46869 6213
df7492f9
KH
6214 do {
6215 coding_set_source (coding);
6216 consume_chars (coding);
4ed46869 6217
7d64c6ad
KH
6218 if (!NILP (translation_table))
6219 translate_chars (coding, translation_table);
b73bfc1c 6220
df7492f9
KH
6221 coding_set_destination (coding);
6222 (*(coding->encoder)) (coding);
6223 } while (coding->consumed_char < coding->src_chars);
6224
6225 if (BUFFERP (coding->dst_object))
6226 insert_from_gap (coding->produced_char, coding->produced);
6227
6228 return (coding->result);
ec6d2bb8
KH
6229}
6230
fb88bf2d 6231
24a73b0a
KH
6232/* Name (or base name) of work buffer for code conversion. */
6233static Lisp_Object Vcode_conversion_workbuf_name;
d46c5b12 6234
24a73b0a
KH
6235/* A working buffer used by the top level conversion. Once it is
6236 created, it is never destroyed. It has the name
6237 Vcode_conversion_workbuf_name. The other working buffers are
6238 destroyed after the use is finished, and their names are modified
6239 versions of Vcode_conversion_workbuf_name. */
6240static Lisp_Object Vcode_conversion_reused_workbuf;
b73bfc1c 6241
24a73b0a
KH
6242/* 1 iff Vcode_conversion_reused_workbuf is already in use. */
6243static int reused_workbuf_in_use;
4ed46869 6244
24a73b0a
KH
6245
/* Return a working buffer for code conversion.  MULTIBYTE specifies the
   multibyteness of the returned buffer.  */
b73bfc1c 6248
df7492f9 6249Lisp_Object
24a73b0a 6250make_conversion_work_buffer (multibyte)
df7492f9 6251{
24a73b0a
KH
6252 Lisp_Object name, workbuf;
6253 struct buffer *current;
4ed46869 6254
24a73b0a 6255 if (reused_workbuf_in_use++)
065e3595
KH
6256 {
6257 name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6258 workbuf = Fget_buffer_create (name);
6259 }
df7492f9 6260 else
065e3595
KH
6261 {
6262 name = Vcode_conversion_workbuf_name;
6263 workbuf = Fget_buffer_create (name);
6264 if (NILP (Vcode_conversion_reused_workbuf))
6265 Vcode_conversion_reused_workbuf = workbuf;
6266 }
24a73b0a
KH
6267 current = current_buffer;
6268 set_buffer_internal (XBUFFER (workbuf));
6269 Ferase_buffer ();
df7492f9 6270 current_buffer->undo_list = Qt;
24a73b0a 6271 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
df7492f9 6272 set_buffer_internal (current);
24a73b0a 6273 return workbuf;
df7492f9 6274}
d46c5b12 6275
24a73b0a 6276
4776e638 6277static Lisp_Object
24a73b0a
KH
6278code_conversion_restore (arg)
6279 Lisp_Object arg;
4776e638 6280{
24a73b0a
KH
6281 Lisp_Object current, workbuf;
6282
6283 current = XCAR (arg);
6284 workbuf = XCDR (arg);
6285 if (! NILP (workbuf))
6286 {
6287 if (EQ (workbuf, Vcode_conversion_reused_workbuf))
6288 reused_workbuf_in_use = 0;
6289 else if (! NILP (Fbuffer_live_p (workbuf)))
6290 Fkill_buffer (workbuf);
6291 }
6292 set_buffer_internal (XBUFFER (current));
4776e638
KH
6293 return Qnil;
6294}
b73bfc1c 6295
24a73b0a
KH
6296Lisp_Object
6297code_conversion_save (with_work_buf, multibyte)
4776e638 6298 int with_work_buf, multibyte;
df7492f9 6299{
24a73b0a 6300 Lisp_Object workbuf = Qnil;
b73bfc1c 6301
4776e638 6302 if (with_work_buf)
24a73b0a
KH
6303 workbuf = make_conversion_work_buffer (multibyte);
6304 record_unwind_protect (code_conversion_restore,
6305 Fcons (Fcurrent_buffer (), workbuf));
4776e638 6306 return workbuf;
df7492f9 6307}
d46c5b12 6308
df7492f9
KH
6309int
6310decode_coding_gap (coding, chars, bytes)
6311 struct coding_system *coding;
6312 EMACS_INT chars, bytes;
6313{
6314 int count = specpdl_ptr - specpdl;
5e5c78be 6315 Lisp_Object attrs;
fb88bf2d 6316
24a73b0a 6317 code_conversion_save (0, 0);
ec6d2bb8 6318
24a73b0a 6319 coding->src_object = Fcurrent_buffer ();
df7492f9
KH
6320 coding->src_chars = chars;
6321 coding->src_bytes = bytes;
6322 coding->src_pos = -chars;
6323 coding->src_pos_byte = -bytes;
6324 coding->src_multibyte = chars < bytes;
24a73b0a 6325 coding->dst_object = coding->src_object;
df7492f9
KH
6326 coding->dst_pos = PT;
6327 coding->dst_pos_byte = PT_BYTE;
71c81426 6328 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
98725083 6329 coding->mode |= CODING_MODE_LAST_BLOCK;
4ed46869 6330
df7492f9
KH
6331 if (CODING_REQUIRE_DETECTION (coding))
6332 detect_coding (coding);
8f924df7 6333
df7492f9 6334 decode_coding (coding);
d46c5b12 6335
5e5c78be
KH
6336 attrs = CODING_ID_ATTRS (coding->id);
6337 if (! NILP (CODING_ATTR_POST_READ (attrs)))
b73bfc1c 6338 {
5e5c78be
KH
6339 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6340 Lisp_Object val;
6341
6342 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
5e5c78be
KH
6343 val = call1 (CODING_ATTR_POST_READ (attrs),
6344 make_number (coding->produced_char));
5e5c78be
KH
6345 CHECK_NATNUM (val);
6346 coding->produced_char += Z - prev_Z;
6347 coding->produced += Z_BYTE - prev_Z_BYTE;
b73bfc1c 6348 }
4ed46869 6349
df7492f9 6350 unbind_to (count, Qnil);
b73bfc1c
KH
6351 return coding->result;
6352}
52d41803 6353
4ed46869 6354int
df7492f9 6355encode_coding_gap (coding, chars, bytes)
4ed46869 6356 struct coding_system *coding;
df7492f9 6357 EMACS_INT chars, bytes;
4ed46869 6358{
df7492f9 6359 int count = specpdl_ptr - specpdl;
4ed46869 6360
24a73b0a 6361 code_conversion_save (0, 0);
4ed46869 6362
24a73b0a 6363 coding->src_object = Fcurrent_buffer ();
df7492f9
KH
6364 coding->src_chars = chars;
6365 coding->src_bytes = bytes;
6366 coding->src_pos = -chars;
6367 coding->src_pos_byte = -bytes;
6368 coding->src_multibyte = chars < bytes;
6369 coding->dst_object = coding->src_object;
6370 coding->dst_pos = PT;
6371 coding->dst_pos_byte = PT_BYTE;
4ed46869 6372
df7492f9 6373 encode_coding (coding);
b73bfc1c 6374
df7492f9
KH
6375 unbind_to (count, Qnil);
6376 return coding->result;
6377}
4ed46869 6378
d46c5b12 6379
df7492f9
KH
6380/* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6381 SRC_OBJECT into DST_OBJECT by coding context CODING.
b73bfc1c 6382
df7492f9 6383 SRC_OBJECT is a buffer, a string, or Qnil.
b73bfc1c 6384
df7492f9
KH
6385 If it is a buffer, the text is at point of the buffer. FROM and TO
6386 are positions in the buffer.
b73bfc1c 6387
df7492f9
KH
6388 If it is a string, the text is at the beginning of the string.
6389 FROM and TO are indices to the string.
4ed46869 6390
df7492f9
KH
6391 If it is nil, the text is at coding->source. FROM and TO are
6392 indices to coding->source.
bb10be8b 6393
df7492f9 6394 DST_OBJECT is a buffer, Qt, or Qnil.
4ed46869 6395
df7492f9
KH
6396 If it is a buffer, the decoded text is inserted at point of the
6397 buffer. If the buffer is the same as SRC_OBJECT, the source text
6398 is deleted.
4ed46869 6399
df7492f9
KH
6400 If it is Qt, a string is made from the decoded text, and
6401 set in CODING->dst_object.
d46c5b12 6402
df7492f9 6403 If it is Qnil, the decoded text is stored at CODING->destination.
2cb26057 6404 The caller must allocate CODING->dst_bytes bytes at
df7492f9
KH
6405 CODING->destination by xmalloc. If the decoded text is longer than
6406 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6407 */
d46c5b12 6408
df7492f9
KH
6409void
6410decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6411 dst_object)
d46c5b12 6412 struct coding_system *coding;
df7492f9
KH
6413 Lisp_Object src_object;
6414 EMACS_INT from, from_byte, to, to_byte;
6415 Lisp_Object dst_object;
d46c5b12 6416{
df7492f9
KH
6417 int count = specpdl_ptr - specpdl;
6418 unsigned char *destination;
6419 EMACS_INT dst_bytes;
6420 EMACS_INT chars = to - from;
6421 EMACS_INT bytes = to_byte - from_byte;
6422 Lisp_Object attrs;
4776e638
KH
6423 Lisp_Object buffer;
6424 int saved_pt = -1, saved_pt_byte;
d46c5b12 6425
4776e638 6426 buffer = Fcurrent_buffer ();
93dec019 6427
df7492f9 6428 if (NILP (dst_object))
d46c5b12 6429 {
df7492f9
KH
6430 destination = coding->destination;
6431 dst_bytes = coding->dst_bytes;
d46c5b12 6432 }
93dec019 6433
df7492f9
KH
6434 coding->src_object = src_object;
6435 coding->src_chars = chars;
6436 coding->src_bytes = bytes;
6437 coding->src_multibyte = chars < bytes;
70ad9fc4 6438
df7492f9 6439 if (STRINGP (src_object))
d46c5b12 6440 {
df7492f9
KH
6441 coding->src_pos = from;
6442 coding->src_pos_byte = from_byte;
d46c5b12 6443 }
df7492f9 6444 else if (BUFFERP (src_object))
88993dfd 6445 {
df7492f9
KH
6446 set_buffer_internal (XBUFFER (src_object));
6447 if (from != GPT)
6448 move_gap_both (from, from_byte);
6449 if (EQ (src_object, dst_object))
fb88bf2d 6450 {
4776e638 6451 saved_pt = PT, saved_pt_byte = PT_BYTE;
df7492f9
KH
6452 TEMP_SET_PT_BOTH (from, from_byte);
6453 del_range_both (from, from_byte, to, to_byte, 1);
6454 coding->src_pos = -chars;
6455 coding->src_pos_byte = -bytes;
fb88bf2d 6456 }
df7492f9 6457 else
fb88bf2d 6458 {
df7492f9
KH
6459 coding->src_pos = from;
6460 coding->src_pos_byte = from_byte;
fb88bf2d 6461 }
88993dfd
KH
6462 }
6463
df7492f9
KH
6464 if (CODING_REQUIRE_DETECTION (coding))
6465 detect_coding (coding);
6466 attrs = CODING_ID_ATTRS (coding->id);
d46c5b12 6467
2cb26057
KH
6468 if (EQ (dst_object, Qt)
6469 || (! NILP (CODING_ATTR_POST_READ (attrs))
6470 && NILP (dst_object)))
b73bfc1c 6471 {
24a73b0a 6472 coding->dst_object = code_conversion_save (1, 1);
df7492f9
KH
6473 coding->dst_pos = BEG;
6474 coding->dst_pos_byte = BEG_BYTE;
6475 coding->dst_multibyte = 1;
b73bfc1c 6476 }
df7492f9 6477 else if (BUFFERP (dst_object))
d46c5b12 6478 {
24a73b0a 6479 code_conversion_save (0, 0);
df7492f9
KH
6480 coding->dst_object = dst_object;
6481 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6482 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6483 coding->dst_multibyte
6484 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
d46c5b12
KH
6485 }
6486 else
6487 {
24a73b0a 6488 code_conversion_save (0, 0);
df7492f9
KH
6489 coding->dst_object = Qnil;
6490 coding->dst_multibyte = 1;
d46c5b12
KH
6491 }
6492
df7492f9 6493 decode_coding (coding);
fa46990e 6494
df7492f9
KH
6495 if (BUFFERP (coding->dst_object))
6496 set_buffer_internal (XBUFFER (coding->dst_object));
d46c5b12 6497
df7492f9 6498 if (! NILP (CODING_ATTR_POST_READ (attrs)))
d46c5b12 6499 {
df7492f9
KH
6500 struct gcpro gcpro1, gcpro2;
6501 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
2b4f9037 6502 Lisp_Object val;
d46c5b12 6503
c0cc7f7f 6504 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
df7492f9
KH
6505 GCPRO2 (coding->src_object, coding->dst_object);
6506 val = call1 (CODING_ATTR_POST_READ (attrs),
6507 make_number (coding->produced_char));
6508 UNGCPRO;
6509 CHECK_NATNUM (val);
6510 coding->produced_char += Z - prev_Z;
6511 coding->produced += Z_BYTE - prev_Z_BYTE;
d46c5b12 6512 }
de79a6a5 6513
df7492f9 6514 if (EQ (dst_object, Qt))
ec6d2bb8 6515 {
df7492f9
KH
6516 coding->dst_object = Fbuffer_string ();
6517 }
6518 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6519 {
6520 set_buffer_internal (XBUFFER (coding->dst_object));
6521 if (dst_bytes < coding->produced)
6522 {
6523 destination
6524 = (unsigned char *) xrealloc (destination, coding->produced);
6525 if (! destination)
6526 {
065e3595
KH
6527 record_conversion_result (coding,
6528 CODING_RESULT_INSUFFICIENT_DST);
df7492f9
KH
6529 unbind_to (count, Qnil);
6530 return;
6531 }
6532 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6533 move_gap_both (BEGV, BEGV_BYTE);
6534 bcopy (BEGV_ADDR, destination, coding->produced);
6535 coding->destination = destination;
d46c5b12 6536 }
ec6d2bb8 6537 }
b73bfc1c 6538
4776e638
KH
6539 if (saved_pt >= 0)
6540 {
6541 /* This is the case of:
6542 (BUFFERP (src_object) && EQ (src_object, dst_object))
6543 As we have moved PT while replacing the original buffer
6544 contents, we must recover it now. */
6545 set_buffer_internal (XBUFFER (src_object));
6546 if (saved_pt < from)
6547 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6548 else if (saved_pt < from + chars)
6549 TEMP_SET_PT_BOTH (from, from_byte);
6550 else if (! NILP (current_buffer->enable_multibyte_characters))
6551 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6552 saved_pt_byte + (coding->produced - bytes));
6553 else
6554 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6555 saved_pt_byte + (coding->produced - bytes));
d46c5b12 6556 }
4776e638 6557
065e3595 6558 unbind_to (count, coding->dst_object);
d46c5b12
KH
6559}
6560
d46c5b12 6561
df7492f9
KH
6562void
6563encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6564 dst_object)
d46c5b12 6565 struct coding_system *coding;
df7492f9
KH
6566 Lisp_Object src_object;
6567 EMACS_INT from, from_byte, to, to_byte;
6568 Lisp_Object dst_object;
d46c5b12 6569{
b73bfc1c 6570 int count = specpdl_ptr - specpdl;
df7492f9
KH
6571 EMACS_INT chars = to - from;
6572 EMACS_INT bytes = to_byte - from_byte;
6573 Lisp_Object attrs;
4776e638
KH
6574 Lisp_Object buffer;
6575 int saved_pt = -1, saved_pt_byte;
df7492f9 6576
4776e638 6577 buffer = Fcurrent_buffer ();
df7492f9
KH
6578
6579 coding->src_object = src_object;
6580 coding->src_chars = chars;
6581 coding->src_bytes = bytes;
6582 coding->src_multibyte = chars < bytes;
6583
6584 attrs = CODING_ID_ATTRS (coding->id);
6585
6586 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6bac5b12 6587 {
24a73b0a 6588 coding->src_object = code_conversion_save (1, coding->src_multibyte);
df7492f9
KH
6589 set_buffer_internal (XBUFFER (coding->src_object));
6590 if (STRINGP (src_object))
6591 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6592 else if (BUFFERP (src_object))
6593 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6594 else
6595 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
d46c5b12 6596
df7492f9
KH
6597 if (EQ (src_object, dst_object))
6598 {
6599 set_buffer_internal (XBUFFER (src_object));
4776e638 6600 saved_pt = PT, saved_pt_byte = PT_BYTE;
df7492f9
KH
6601 del_range_both (from, from_byte, to, to_byte, 1);
6602 set_buffer_internal (XBUFFER (coding->src_object));
6603 }
6604
ac87bbef
KH
6605 call2 (CODING_ATTR_PRE_WRITE (attrs),
6606 make_number (BEG), make_number (Z));
6607 coding->src_object = Fcurrent_buffer ();
df7492f9
KH
6608 if (BEG != GPT)
6609 move_gap_both (BEG, BEG_BYTE);
6610 coding->src_chars = Z - BEG;
6611 coding->src_bytes = Z_BYTE - BEG_BYTE;
6612 coding->src_pos = BEG;
6613 coding->src_pos_byte = BEG_BYTE;
6614 coding->src_multibyte = Z < Z_BYTE;
6615 }
6616 else if (STRINGP (src_object))
d46c5b12 6617 {
24a73b0a 6618 code_conversion_save (0, 0);
df7492f9
KH
6619 coding->src_pos = from;
6620 coding->src_pos_byte = from_byte;
b73bfc1c 6621 }
df7492f9 6622 else if (BUFFERP (src_object))
b73bfc1c 6623 {
24a73b0a 6624 code_conversion_save (0, 0);
df7492f9 6625 set_buffer_internal (XBUFFER (src_object));
df7492f9 6626 if (EQ (src_object, dst_object))
d46c5b12 6627 {
4776e638 6628 saved_pt = PT, saved_pt_byte = PT_BYTE;
ff0dacd7
KH
6629 coding->src_object = del_range_1 (from, to, 1, 1);
6630 coding->src_pos = 0;
6631 coding->src_pos_byte = 0;
d46c5b12 6632 }
df7492f9 6633 else
d46c5b12 6634 {
ff0dacd7
KH
6635 if (from < GPT && to >= GPT)
6636 move_gap_both (from, from_byte);
df7492f9
KH
6637 coding->src_pos = from;
6638 coding->src_pos_byte = from_byte;
d46c5b12 6639 }
d46c5b12 6640 }
4776e638 6641 else
24a73b0a 6642 code_conversion_save (0, 0);
d46c5b12 6643
df7492f9 6644 if (BUFFERP (dst_object))
88993dfd 6645 {
df7492f9 6646 coding->dst_object = dst_object;
28f67a95
KH
6647 if (EQ (src_object, dst_object))
6648 {
6649 coding->dst_pos = from;
6650 coding->dst_pos_byte = from_byte;
6651 }
6652 else
6653 {
6654 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6655 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6656 }
df7492f9
KH
6657 coding->dst_multibyte
6658 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
88993dfd 6659 }
df7492f9 6660 else if (EQ (dst_object, Qt))
d46c5b12 6661 {
df7492f9 6662 coding->dst_object = Qnil;
df7492f9 6663 coding->dst_bytes = coding->src_chars;
ac87bbef
KH
6664 if (coding->dst_bytes == 0)
6665 coding->dst_bytes = 1;
6666 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
df7492f9 6667 coding->dst_multibyte = 0;
d46c5b12
KH
6668 }
6669 else
6670 {
df7492f9
KH
6671 coding->dst_object = Qnil;
6672 coding->dst_multibyte = 0;
d46c5b12
KH
6673 }
6674
df7492f9 6675 encode_coding (coding);
d46c5b12 6676
df7492f9 6677 if (EQ (dst_object, Qt))
d46c5b12 6678 {
df7492f9
KH
6679 if (BUFFERP (coding->dst_object))
6680 coding->dst_object = Fbuffer_string ();
6681 else
d46c5b12 6682 {
df7492f9
KH
6683 coding->dst_object
6684 = make_unibyte_string ((char *) coding->destination,
6685 coding->produced);
6686 xfree (coding->destination);
d46c5b12 6687 }
4ed46869 6688 }
d46c5b12 6689
4776e638
KH
6690 if (saved_pt >= 0)
6691 {
6692 /* This is the case of:
6693 (BUFFERP (src_object) && EQ (src_object, dst_object))
6694 As we have moved PT while replacing the original buffer
6695 contents, we must recover it now. */
6696 set_buffer_internal (XBUFFER (src_object));
6697 if (saved_pt < from)
6698 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6699 else if (saved_pt < from + chars)
6700 TEMP_SET_PT_BOTH (from, from_byte);
6701 else if (! NILP (current_buffer->enable_multibyte_characters))
6702 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6703 saved_pt_byte + (coding->produced - bytes));
d46c5b12 6704 else
4776e638
KH
6705 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6706 saved_pt_byte + (coding->produced - bytes));
6707 }
6708
df7492f9 6709 unbind_to (count, Qnil);
b73bfc1c
KH
6710}
6711
df7492f9 6712
b73bfc1c 6713Lisp_Object
df7492f9 6714preferred_coding_system ()
b73bfc1c 6715{
df7492f9 6716 int id = coding_categories[coding_priorities[0]].id;
2391eaa4 6717
df7492f9 6718 return CODING_ID_NAME (id);
4ed46869
KH
6719}
6720
6721\f
6722#ifdef emacs
1397dc18 6723/*** 11. Emacs Lisp library functions ***/
4ed46869 6724
4ed46869 6725DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
48b0f3ae 6726 doc: /* Return t if OBJECT is nil or a coding-system.
df7492f9 6727See the documentation of `define-coding-system' for information
48b0f3ae
PJ
6728about coding-system objects. */)
6729 (obj)
4ed46869
KH
6730 Lisp_Object obj;
6731{
df7492f9 6732 return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
4ed46869
KH
6733}
6734
9d991de8
RS
6735DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6736 Sread_non_nil_coding_system, 1, 1, 0,
48b0f3ae
PJ
6737 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6738 (prompt)
4ed46869
KH
6739 Lisp_Object prompt;
6740{
e0e989f6 6741 Lisp_Object val;
9d991de8
RS
6742 do
6743 {
4608c386
KH
6744 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6745 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
9d991de8 6746 }
8f924df7 6747 while (SCHARS (val) == 0);
e0e989f6 6748 return (Fintern (val, Qnil));
4ed46869
KH
6749}
6750
9b787f3e 6751DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
48b0f3ae
PJ
6752 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6753If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6754 (prompt, default_coding_system)
9b787f3e 6755 Lisp_Object prompt, default_coding_system;
4ed46869 6756{
f44d27ce 6757 Lisp_Object val;
9b787f3e 6758 if (SYMBOLP (default_coding_system))
a3181084 6759 XSETSTRING (default_coding_system, XPNTR (SYMBOL_NAME (default_coding_system)));
4608c386 6760 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
9b787f3e
RS
6761 Qt, Qnil, Qcoding_system_history,
6762 default_coding_system, Qnil);
8f924df7 6763 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
6764}
6765
6766DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6767 1, 1, 0,
48b0f3ae 6768 doc: /* Check validity of CODING-SYSTEM.
b054002f 6769If valid, return CODING-SYSTEM, else signal a `coding-system-error' error. */)
df7492f9 6770 (coding_system)
4ed46869
KH
6771 Lisp_Object coding_system;
6772{
b7826503 6773 CHECK_SYMBOL (coding_system);
4ed46869
KH
6774 if (!NILP (Fcoding_system_p (coding_system)))
6775 return coding_system;
6776 while (1)
02ba4723 6777 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4ed46869 6778}
df7492f9 6779
3a73fa5d 6780\f
89528eb3
KH
6781/* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
6782 HIGHEST is nonzero, return the coding system of the highest
6783 priority among the detected coding systems. Otherwize return a
6784 list of detected coding systems sorted by their priorities. If
6785 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
6786 multibyte form but contains only ASCII and eight-bit chars.
6787 Otherwise, the bytes are raw bytes.
6788
6789 CODING-SYSTEM controls the detection as below:
6790
6791 If it is nil, detect both text-format and eol-format. If the
6792 text-format part of CODING-SYSTEM is already specified
6793 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
6794 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
6795 detect only text-format. */
6796
d46c5b12 6797Lisp_Object
24a73b0a
KH
6798detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
6799 coding_system)
8f924df7 6800 const unsigned char *src;
24a73b0a 6801 int src_chars, src_bytes, highest;
0a28aafb 6802 int multibytep;
df7492f9 6803 Lisp_Object coding_system;
4ed46869 6804{
8f924df7 6805 const unsigned char *src_end = src + src_bytes;
df7492f9
KH
6806 Lisp_Object attrs, eol_type;
6807 Lisp_Object val;
6808 struct coding_system coding;
89528eb3 6809 int id;
ff0dacd7 6810 struct coding_detection_info detect_info;
24a73b0a 6811 enum coding_category base_category;
b73bfc1c 6812
df7492f9
KH
6813 if (NILP (coding_system))
6814 coding_system = Qundecided;
6815 setup_coding_system (coding_system, &coding);
6816 attrs = CODING_ID_ATTRS (coding.id);
6817 eol_type = CODING_ID_EOL_TYPE (coding.id);
89528eb3 6818 coding_system = CODING_ATTR_BASE_NAME (attrs);
4ed46869 6819
df7492f9 6820 coding.source = src;
24a73b0a 6821 coding.src_chars = src_chars;
df7492f9
KH
6822 coding.src_bytes = src_bytes;
6823 coding.src_multibyte = multibytep;
6824 coding.consumed = 0;
89528eb3 6825 coding.mode |= CODING_MODE_LAST_BLOCK;
d46c5b12 6826
ff0dacd7 6827 detect_info.checked = detect_info.found = detect_info.rejected = 0;
d46c5b12 6828
89528eb3 6829 /* At first, detect text-format if necessary. */
24a73b0a
KH
6830 base_category = XINT (CODING_ATTR_CATEGORY (attrs));
6831 if (base_category == coding_category_undecided)
4ed46869 6832 {
ff0dacd7
KH
6833 enum coding_category category;
6834 struct coding_system *this;
6835 int c, i;
88993dfd 6836
24a73b0a
KH
6837 /* Skip all ASCII bytes except for a few ISO2022 controls. */
6838 for (i = 0; src < src_end; i++, src++)
4ed46869 6839 {
df7492f9 6840 c = *src;
24a73b0a
KH
6841 if (c & 0x80 || (c < 0x20 && (c == 0
6842 || c == ISO_CODE_ESC
6843 || c == ISO_CODE_SI
6844 || c == ISO_CODE_SO)))
d46c5b12 6845 break;
4ed46869 6846 }
df7492f9 6847 coding.head_ascii = src - coding.source;
88993dfd 6848
df7492f9
KH
6849 if (src < src_end)
6850 for (i = 0; i < coding_category_raw_text; i++)
6851 {
ff0dacd7
KH
6852 category = coding_priorities[i];
6853 this = coding_categories + category;
b843d1ae 6854
df7492f9
KH
6855 if (this->id < 0)
6856 {
6857 /* No coding system of this category is defined. */
ff0dacd7 6858 detect_info.rejected |= (1 << category);
df7492f9 6859 }
ff0dacd7 6860 else if (category >= coding_category_raw_text)
89528eb3 6861 continue;
ff0dacd7
KH
6862 else if (detect_info.checked & (1 << category))
6863 {
6864 if (highest
6865 && (detect_info.found & (1 << category)))
6866 break;
6867 }
df7492f9
KH
6868 else
6869 {
ff0dacd7 6870 if ((*(this->detector)) (&coding, &detect_info)
89528eb3 6871 && highest
ff0dacd7 6872 && (detect_info.found & (1 << category)))
24a73b0a
KH
6873 {
6874 if (category == coding_category_utf_16_auto)
6875 {
6876 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6877 category = coding_category_utf_16_le;
6878 else
6879 category = coding_category_utf_16_be;
6880 }
6881 break;
6882 }
df7492f9
KH
6883 }
6884 }
ec6d2bb8 6885
ff0dacd7 6886 if (detect_info.rejected == CATEGORY_MASK_ANY)
ec6d2bb8 6887 {
ff0dacd7 6888 detect_info.found = CATEGORY_MASK_RAW_TEXT;
89528eb3
KH
6889 id = coding_categories[coding_category_raw_text].id;
6890 val = Fcons (make_number (id), Qnil);
6891 }
ff0dacd7 6892 else if (! detect_info.rejected && ! detect_info.found)
89528eb3 6893 {
ff0dacd7 6894 detect_info.found = CATEGORY_MASK_ANY;
89528eb3
KH
6895 id = coding_categories[coding_category_undecided].id;
6896 val = Fcons (make_number (id), Qnil);
6897 }
6898 else if (highest)
6899 {
ff0dacd7 6900 if (detect_info.found)
ec6d2bb8 6901 {
ff0dacd7
KH
6902 detect_info.found = 1 << category;
6903 val = Fcons (make_number (this->id), Qnil);
6904 }
6905 else
6906 for (i = 0; i < coding_category_raw_text; i++)
6907 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6908 {
6909 detect_info.found = 1 << coding_priorities[i];
6910 id = coding_categories[coding_priorities[i]].id;
6911 val = Fcons (make_number (id), Qnil);
6912 break;
6913 }
6914 }
89528eb3
KH
6915 else
6916 {
ff0dacd7
KH
6917 int mask = detect_info.rejected | detect_info.found;
6918 int found = 0;
89528eb3 6919 val = Qnil;
ec6d2bb8 6920
89528eb3 6921 for (i = coding_category_raw_text - 1; i >= 0; i--)
ff0dacd7
KH
6922 {
6923 category = coding_priorities[i];
6924 if (! (mask & (1 << category)))
ec6d2bb8 6925 {
ff0dacd7
KH
6926 found |= 1 << category;
6927 id = coding_categories[category].id;
6928 val = Fcons (make_number (id), val);
6929 }
6930 }
6931 for (i = coding_category_raw_text - 1; i >= 0; i--)
6932 {
6933 category = coding_priorities[i];
6934 if (detect_info.found & (1 << category))
6935 {
6936 id = coding_categories[category].id;
6937 val = Fcons (make_number (id), val);
ec6d2bb8 6938 }
ec6d2bb8 6939 }
ff0dacd7 6940 detect_info.found |= found;
ec6d2bb8 6941 }
ec6d2bb8 6942 }
24a73b0a
KH
6943 else if (base_category == coding_category_utf_16_auto)
6944 {
6945 if (detect_coding_utf_16 (&coding, &detect_info))
6946 {
6947 enum coding_category category;
6948 struct coding_system *this;
6949
6950 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6951 this = coding_categories + coding_category_utf_16_le;
6952 else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6953 this = coding_categories + coding_category_utf_16_be;
6954 else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
6955 this = coding_categories + coding_category_utf_16_be_nosig;
6956 else
6957 this = coding_categories + coding_category_utf_16_le_nosig;
6958 val = Fcons (make_number (this->id), Qnil);
6959 }
6960 }
df7492f9
KH
6961 else
6962 {
ff0dacd7 6963 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
89528eb3 6964 val = Fcons (make_number (coding.id), Qnil);
4ed46869 6965 }
df7492f9 6966
89528eb3 6967 /* Then, detect eol-format if necessary. */
df7492f9 6968 {
89528eb3 6969 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
df7492f9
KH
6970 Lisp_Object tail;
6971
89528eb3
KH
6972 if (VECTORP (eol_type))
6973 {
ff0dacd7 6974 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
89528eb3
KH
6975 normal_eol = detect_eol (coding.source, src_bytes,
6976 coding_category_raw_text);
ff0dacd7
KH
6977 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
6978 | CATEGORY_MASK_UTF_16_BE_NOSIG))
89528eb3
KH
6979 utf_16_be_eol = detect_eol (coding.source, src_bytes,
6980 coding_category_utf_16_be);
ff0dacd7
KH
6981 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
6982 | CATEGORY_MASK_UTF_16_LE_NOSIG))
89528eb3
KH
6983 utf_16_le_eol = detect_eol (coding.source, src_bytes,
6984 coding_category_utf_16_le);
6985 }
6986 else
6987 {
6988 if (EQ (eol_type, Qunix))
6989 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
6990 else if (EQ (eol_type, Qdos))
6991 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
6992 else
6993 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
6994 }
6995
df7492f9
KH
6996 for (tail = val; CONSP (tail); tail = XCDR (tail))
6997 {
89528eb3 6998 enum coding_category category;
df7492f9 6999 int this_eol;
89528eb3
KH
7000
7001 id = XINT (XCAR (tail));
7002 attrs = CODING_ID_ATTRS (id);
7003 category = XINT (CODING_ATTR_CATEGORY (attrs));
7004 eol_type = CODING_ID_EOL_TYPE (id);
df7492f9
KH
7005 if (VECTORP (eol_type))
7006 {
89528eb3
KH
7007 if (category == coding_category_utf_16_be
7008 || category == coding_category_utf_16_be_nosig)
7009 this_eol = utf_16_be_eol;
7010 else if (category == coding_category_utf_16_le
7011 || category == coding_category_utf_16_le_nosig)
7012 this_eol = utf_16_le_eol;
df7492f9 7013 else
89528eb3
KH
7014 this_eol = normal_eol;
7015
df7492f9
KH
7016 if (this_eol == EOL_SEEN_LF)
7017 XSETCAR (tail, AREF (eol_type, 0));
7018 else if (this_eol == EOL_SEEN_CRLF)
7019 XSETCAR (tail, AREF (eol_type, 1));
7020 else if (this_eol == EOL_SEEN_CR)
7021 XSETCAR (tail, AREF (eol_type, 2));
89528eb3
KH
7022 else
7023 XSETCAR (tail, CODING_ID_NAME (id));
df7492f9 7024 }
89528eb3
KH
7025 else
7026 XSETCAR (tail, CODING_ID_NAME (id));
df7492f9
KH
7027 }
7028 }
ec6d2bb8 7029
03699b14 7030 return (highest ? XCAR (val) : val);
ec6d2bb8
KH
7031}
7032
ec6d2bb8 7033
d46c5b12
KH
7034DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
7035 2, 3, 0,
48b0f3ae
PJ
7036 doc: /* Detect coding system of the text in the region between START and END.
7037Return a list of possible coding systems ordered by priority.
ec6d2bb8 7038
48b0f3ae
PJ
7039If only ASCII characters are found, it returns a list of single element
7040`undecided' or its subsidiary coding system according to a detected
7041end-of-line format.
ec6d2bb8 7042
48b0f3ae
PJ
7043If optional argument HIGHEST is non-nil, return the coding system of
7044highest priority. */)
7045 (start, end, highest)
d46c5b12
KH
7046 Lisp_Object start, end, highest;
7047{
7048 int from, to;
7049 int from_byte, to_byte;
ec6d2bb8 7050
b7826503
PJ
7051 CHECK_NUMBER_COERCE_MARKER (start);
7052 CHECK_NUMBER_COERCE_MARKER (end);
ec6d2bb8 7053
d46c5b12
KH
7054 validate_region (&start, &end);
7055 from = XINT (start), to = XINT (end);
7056 from_byte = CHAR_TO_BYTE (from);
7057 to_byte = CHAR_TO_BYTE (to);
ec6d2bb8 7058
d46c5b12
KH
7059 if (from < GPT && to >= GPT)
7060 move_gap_both (to, to_byte);
c210f766 7061
d46c5b12 7062 return detect_coding_system (BYTE_POS_ADDR (from_byte),
24a73b0a 7063 to - from, to_byte - from_byte,
0a28aafb
KH
7064 !NILP (highest),
7065 !NILP (current_buffer
df7492f9
KH
7066 ->enable_multibyte_characters),
7067 Qnil);
ec6d2bb8
KH
7068}
7069
d46c5b12
KH
7070DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
7071 1, 2, 0,
48b0f3ae
PJ
7072 doc: /* Detect coding system of the text in STRING.
7073Return a list of possible coding systems ordered by priority.
fb88bf2d 7074
48b0f3ae
PJ
7075If only ASCII characters are found, it returns a list of single element
7076`undecided' or its subsidiary coding system according to a detected
7077end-of-line format.
d46c5b12 7078
48b0f3ae
PJ
7079If optional argument HIGHEST is non-nil, return the coding system of
7080highest priority. */)
7081 (string, highest)
d46c5b12
KH
7082 Lisp_Object string, highest;
7083{
b7826503 7084 CHECK_STRING (string);
b73bfc1c 7085
24a73b0a
KH
7086 return detect_coding_system (SDATA (string),
7087 SCHARS (string), SBYTES (string),
8f924df7 7088 !NILP (highest), STRING_MULTIBYTE (string),
df7492f9 7089 Qnil);
4ed46869 7090}
4ed46869 7091
b73bfc1c 7092
df7492f9
KH
7093static INLINE int
7094char_encodable_p (c, attrs)
7095 int c;
7096 Lisp_Object attrs;
05e6f5dc 7097{
df7492f9 7098 Lisp_Object tail;
df7492f9 7099 struct charset *charset;
7d64c6ad 7100 Lisp_Object translation_table;
d46c5b12 7101
7d64c6ad
KH
7102 translation_table = CODING_ATTR_TRANS_TBL (attrs);
7103 if (CHAR_TABLE_P (translation_table))
7104 c = translate_char (translation_table, c);
df7492f9
KH
7105 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
7106 CONSP (tail); tail = XCDR (tail))
e133c8fa 7107 {
df7492f9
KH
7108 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7109 if (CHAR_CHARSET_P (c, charset))
7110 break;
e133c8fa 7111 }
df7492f9 7112 return (! NILP (tail));
05e6f5dc 7113}
83fa074f 7114
fb88bf2d 7115
df7492f9
KH
7116/* Return a list of coding systems that safely encode the text between
7117 START and END. If EXCLUDE is non-nil, it is a list of coding
7118 systems not to check. The returned list doesn't contain any such
48468dac 7119 coding systems. In any case, if the text contains only ASCII or is
df7492f9 7120 unibyte, return t. */
e077cc80 7121
df7492f9
KH
7122DEFUN ("find-coding-systems-region-internal",
7123 Ffind_coding_systems_region_internal,
7124 Sfind_coding_systems_region_internal, 2, 3, 0,
7125 doc: /* Internal use only. */)
7126 (start, end, exclude)
7127 Lisp_Object start, end, exclude;
7128{
7129 Lisp_Object coding_attrs_list, safe_codings;
7130 EMACS_INT start_byte, end_byte;
7c78e542 7131 const unsigned char *p, *pbeg, *pend;
df7492f9
KH
7132 int c;
7133 Lisp_Object tail, elt;
d46c5b12 7134
df7492f9
KH
7135 if (STRINGP (start))
7136 {
7137 if (!STRING_MULTIBYTE (start)
8f924df7 7138 || SCHARS (start) == SBYTES (start))
df7492f9
KH
7139 return Qt;
7140 start_byte = 0;
8f924df7 7141 end_byte = SBYTES (start);
df7492f9
KH
7142 }
7143 else
d46c5b12 7144 {
df7492f9
KH
7145 CHECK_NUMBER_COERCE_MARKER (start);
7146 CHECK_NUMBER_COERCE_MARKER (end);
7147 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7148 args_out_of_range (start, end);
7149 if (NILP (current_buffer->enable_multibyte_characters))
7150 return Qt;
7151 start_byte = CHAR_TO_BYTE (XINT (start));
7152 end_byte = CHAR_TO_BYTE (XINT (end));
7153 if (XINT (end) - XINT (start) == end_byte - start_byte)
7154 return Qt;
d46c5b12 7155
e1c23804 7156 if (XINT (start) < GPT && XINT (end) > GPT)
d46c5b12 7157 {
e1c23804
DL
7158 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7159 move_gap_both (XINT (start), start_byte);
df7492f9 7160 else
e1c23804 7161 move_gap_both (XINT (end), end_byte);
d46c5b12
KH
7162 }
7163 }
7164
df7492f9
KH
7165 coding_attrs_list = Qnil;
7166 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7167 if (NILP (exclude)
7168 || NILP (Fmemq (XCAR (tail), exclude)))
7169 {
7170 Lisp_Object attrs;
d46c5b12 7171
df7492f9
KH
7172 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7173 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7174 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7d64c6ad
KH
7175 {
7176 ASET (attrs, coding_attr_trans_tbl,
7177 get_translation_table (attrs, 1));
7178 coding_attrs_list = Fcons (attrs, coding_attrs_list);
7179 }
df7492f9 7180 }
d46c5b12 7181
df7492f9 7182 if (STRINGP (start))
8f924df7 7183 p = pbeg = SDATA (start);
df7492f9
KH
7184 else
7185 p = pbeg = BYTE_POS_ADDR (start_byte);
7186 pend = p + (end_byte - start_byte);
b843d1ae 7187
df7492f9
KH
7188 while (p < pend && ASCII_BYTE_P (*p)) p++;
7189 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
d46c5b12 7190
05e6f5dc 7191 while (p < pend)
72d1a715 7192 {
df7492f9
KH
7193 if (ASCII_BYTE_P (*p))
7194 p++;
72d1a715
RS
7195 else
7196 {
df7492f9 7197 c = STRING_CHAR_ADVANCE (p);
12410ef1 7198
df7492f9
KH
7199 charset_map_loaded = 0;
7200 for (tail = coding_attrs_list; CONSP (tail);)
7201 {
7202 elt = XCAR (tail);
7203 if (NILP (elt))
7204 tail = XCDR (tail);
7205 else if (char_encodable_p (c, elt))
7206 tail = XCDR (tail);
7207 else if (CONSP (XCDR (tail)))
7208 {
7209 XSETCAR (tail, XCAR (XCDR (tail)));
7210 XSETCDR (tail, XCDR (XCDR (tail)));
7211 }
7212 else
7213 {
7214 XSETCAR (tail, Qnil);
7215 tail = XCDR (tail);
7216 }
7217 }
7218 if (charset_map_loaded)
7219 {
7220 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
05e6f5dc 7221
df7492f9 7222 if (STRINGP (start))
8f924df7 7223 pbeg = SDATA (start);
df7492f9
KH
7224 else
7225 pbeg = BYTE_POS_ADDR (start_byte);
7226 p = pbeg + p_offset;
7227 pend = pbeg + pend_offset;
7228 }
7229 }
ec6d2bb8 7230 }
fb88bf2d 7231
df7492f9
KH
7232 safe_codings = Qnil;
7233 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7234 if (! NILP (XCAR (tail)))
7235 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
ec6d2bb8 7236
05e6f5dc
KH
7237 return safe_codings;
7238}
4956c225 7239
d46c5b12 7240
8f924df7
KH
7241DEFUN ("unencodable-char-position", Funencodable_char_position,
7242 Sunencodable_char_position, 3, 5, 0,
7243 doc: /*
7244Return position of first un-encodable character in a region.
7245START and END specfiy the region and CODING-SYSTEM specifies the
7246encoding to check. Return nil if CODING-SYSTEM does encode the region.
d46c5b12 7247
8f924df7
KH
7248If optional 4th argument COUNT is non-nil, it specifies at most how
7249many un-encodable characters to search. In this case, the value is a
7250list of positions.
d46c5b12 7251
8f924df7
KH
7252If optional 5th argument STRING is non-nil, it is a string to search
7253for un-encodable characters. In that case, START and END are indexes
7254to the string. */)
7255 (start, end, coding_system, count, string)
7256 Lisp_Object start, end, coding_system, count, string;
7257{
7258 int n;
7259 struct coding_system coding;
7d64c6ad 7260 Lisp_Object attrs, charset_list, translation_table;
8f924df7
KH
7261 Lisp_Object positions;
7262 int from, to;
7263 const unsigned char *p, *stop, *pend;
7264 int ascii_compatible;
fb88bf2d 7265
8f924df7
KH
7266 setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7267 attrs = CODING_ID_ATTRS (coding.id);
7268 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
7269 return Qnil;
7270 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
7271 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7d64c6ad 7272 translation_table = get_translation_table (attrs, 1);
fb88bf2d 7273
8f924df7
KH
7274 if (NILP (string))
7275 {
7276 validate_region (&start, &end);
7277 from = XINT (start);
7278 to = XINT (end);
7279 if (NILP (current_buffer->enable_multibyte_characters)
7280 || (ascii_compatible
7281 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
7282 return Qnil;
7283 p = CHAR_POS_ADDR (from);
7284 pend = CHAR_POS_ADDR (to);
7285 if (from < GPT && to >= GPT)
7286 stop = GPT_ADDR;
7287 else
7288 stop = pend;
7289 }
7290 else
7291 {
7292 CHECK_STRING (string);
7293 CHECK_NATNUM (start);
7294 CHECK_NATNUM (end);
7295 from = XINT (start);
7296 to = XINT (end);
7297 if (from > to
7298 || to > SCHARS (string))
7299 args_out_of_range_3 (string, start, end);
7300 if (! STRING_MULTIBYTE (string))
7301 return Qnil;
7302 p = SDATA (string) + string_char_to_byte (string, from);
7303 stop = pend = SDATA (string) + string_char_to_byte (string, to);
7304 if (ascii_compatible && (to - from) == (pend - p))
7305 return Qnil;
7306 }
f2558efd 7307
8f924df7
KH
7308 if (NILP (count))
7309 n = 1;
7310 else
b73bfc1c 7311 {
8f924df7
KH
7312 CHECK_NATNUM (count);
7313 n = XINT (count);
b73bfc1c
KH
7314 }
7315
8f924df7
KH
7316 positions = Qnil;
7317 while (1)
d46c5b12 7318 {
8f924df7 7319 int c;
ec6d2bb8 7320
8f924df7
KH
7321 if (ascii_compatible)
7322 while (p < stop && ASCII_BYTE_P (*p))
7323 p++, from++;
7324 if (p >= stop)
0e79d667 7325 {
8f924df7
KH
7326 if (p >= pend)
7327 break;
7328 stop = pend;
7329 p = GAP_END_ADDR;
0e79d667 7330 }
ec6d2bb8 7331
8f924df7
KH
7332 c = STRING_CHAR_ADVANCE (p);
7333 if (! (ASCII_CHAR_P (c) && ascii_compatible)
7d64c6ad
KH
7334 && ! char_charset (translate_char (translation_table, c),
7335 charset_list, NULL))
ec6d2bb8 7336 {
8f924df7
KH
7337 positions = Fcons (make_number (from), positions);
7338 n--;
7339 if (n == 0)
7340 break;
ec6d2bb8
KH
7341 }
7342
8f924df7
KH
7343 from++;
7344 }
d46c5b12 7345
8f924df7
KH
7346 return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
7347}
d46c5b12 7348
d46c5b12 7349
df7492f9
KH
7350DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7351 Scheck_coding_systems_region, 3, 3, 0,
7352 doc: /* Check if the region is encodable by coding systems.
d46c5b12 7353
df7492f9
KH
7354START and END are buffer positions specifying the region.
7355CODING-SYSTEM-LIST is a list of coding systems to check.
d46c5b12 7356
df7492f9
KH
7357The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7358CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7359whole region, POS0, POS1, ... are buffer positions where non-encodable
7360characters are found.
93dec019 7361
df7492f9
KH
7362If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7363value is nil.
93dec019 7364
df7492f9
KH
7365START may be a string. In that case, check if the string is
7366encodable, and the value contains indices to the string instead of
7367buffer positions. END is ignored. */)
7368 (start, end, coding_system_list)
7369 Lisp_Object start, end, coding_system_list;
05e6f5dc 7370{
df7492f9
KH
7371 Lisp_Object list;
7372 EMACS_INT start_byte, end_byte;
7373 int pos;
7c78e542 7374 const unsigned char *p, *pbeg, *pend;
df7492f9 7375 int c;
7d64c6ad 7376 Lisp_Object tail, elt, attrs;
70ad9fc4 7377
05e6f5dc
KH
7378 if (STRINGP (start))
7379 {
df7492f9 7380 if (!STRING_MULTIBYTE (start)
8f924df7 7381 && SCHARS (start) != SBYTES (start))
df7492f9
KH
7382 return Qnil;
7383 start_byte = 0;
8f924df7 7384 end_byte = SBYTES (start);
df7492f9 7385 pos = 0;
d46c5b12 7386 }
05e6f5dc 7387 else
b73bfc1c 7388 {
b7826503
PJ
7389 CHECK_NUMBER_COERCE_MARKER (start);
7390 CHECK_NUMBER_COERCE_MARKER (end);
05e6f5dc
KH
7391 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7392 args_out_of_range (start, end);
7393 if (NILP (current_buffer->enable_multibyte_characters))
df7492f9
KH
7394 return Qnil;
7395 start_byte = CHAR_TO_BYTE (XINT (start));
7396 end_byte = CHAR_TO_BYTE (XINT (end));
7397 if (XINT (end) - XINT (start) == end_byte - start_byte)
05e6f5dc 7398 return Qt;
df7492f9 7399
e1c23804 7400 if (XINT (start) < GPT && XINT (end) > GPT)
b73bfc1c 7401 {
e1c23804
DL
7402 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7403 move_gap_both (XINT (start), start_byte);
df7492f9 7404 else
e1c23804 7405 move_gap_both (XINT (end), end_byte);
b73bfc1c 7406 }
e1c23804 7407 pos = XINT (start);
b73bfc1c 7408 }
7553d0e1 7409
df7492f9
KH
7410 list = Qnil;
7411 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
12410ef1 7412 {
df7492f9 7413 elt = XCAR (tail);
7d64c6ad
KH
7414 attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
7415 ASET (attrs, coding_attr_trans_tbl, get_translation_table (attrs, 1));
7416 list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
12410ef1
KH
7417 }
7418
df7492f9 7419 if (STRINGP (start))
8f924df7 7420 p = pbeg = SDATA (start);
72d1a715 7421 else
df7492f9
KH
7422 p = pbeg = BYTE_POS_ADDR (start_byte);
7423 pend = p + (end_byte - start_byte);
4ed46869 7424
df7492f9
KH
7425 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7426 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
ec6d2bb8 7427
df7492f9 7428 while (p < pend)
d46c5b12 7429 {
df7492f9
KH
7430 if (ASCII_BYTE_P (*p))
7431 p++;
e133c8fa 7432 else
05e6f5dc 7433 {
df7492f9
KH
7434 c = STRING_CHAR_ADVANCE (p);
7435
7436 charset_map_loaded = 0;
7437 for (tail = list; CONSP (tail); tail = XCDR (tail))
7438 {
7439 elt = XCDR (XCAR (tail));
7440 if (! char_encodable_p (c, XCAR (elt)))
7441 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7442 }
7443 if (charset_map_loaded)
7444 {
7445 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7446
7447 if (STRINGP (start))
8f924df7 7448 pbeg = SDATA (start);
df7492f9
KH
7449 else
7450 pbeg = BYTE_POS_ADDR (start_byte);
7451 p = pbeg + p_offset;
7452 pend = pbeg + pend_offset;
7453 }
05e6f5dc 7454 }
df7492f9 7455 pos++;
d46c5b12 7456 }
4ed46869 7457
df7492f9
KH
7458 tail = list;
7459 list = Qnil;
7460 for (; CONSP (tail); tail = XCDR (tail))
ec6d2bb8 7461 {
df7492f9
KH
7462 elt = XCAR (tail);
7463 if (CONSP (XCDR (XCDR (elt))))
7464 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7465 list);
ec6d2bb8 7466 }
2b4f9037 7467
df7492f9 7468 return list;
d46c5b12
KH
7469}
7470
3fd9494b 7471
b73bfc1c
KH
7472
7473Lisp_Object
df7492f9
KH
7474code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7475 Lisp_Object start, end, coding_system, dst_object;
7476 int encodep, norecord;
4ed46869 7477{
3a73fa5d 7478 struct coding_system coding;
df7492f9
KH
7479 EMACS_INT from, from_byte, to, to_byte;
7480 Lisp_Object src_object;
4ed46869 7481
b7826503
PJ
7482 CHECK_NUMBER_COERCE_MARKER (start);
7483 CHECK_NUMBER_COERCE_MARKER (end);
df7492f9
KH
7484 if (NILP (coding_system))
7485 coding_system = Qno_conversion;
7486 else
7487 CHECK_CODING_SYSTEM (coding_system);
7488 src_object = Fcurrent_buffer ();
7489 if (NILP (dst_object))
7490 dst_object = src_object;
7491 else if (! EQ (dst_object, Qt))
7492 CHECK_BUFFER (dst_object);
3a73fa5d 7493
d46c5b12
KH
7494 validate_region (&start, &end);
7495 from = XFASTINT (start);
df7492f9 7496 from_byte = CHAR_TO_BYTE (from);
d46c5b12 7497 to = XFASTINT (end);
df7492f9 7498 to_byte = CHAR_TO_BYTE (to);
764ca8da 7499
df7492f9
KH
7500 setup_coding_system (coding_system, &coding);
7501 coding.mode |= CODING_MODE_LAST_BLOCK;
ec6d2bb8 7502
df7492f9
KH
7503 if (encodep)
7504 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7505 dst_object);
7506 else
7507 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7508 dst_object);
7509 if (! norecord)
7510 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
ec6d2bb8 7511
df7492f9
KH
7512 return (BUFFERP (dst_object)
7513 ? make_number (coding.produced_char)
7514 : coding.dst_object);
4031e2bf 7515}
78108bcd 7516
4ed46869 7517
4031e2bf 7518DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
df7492f9 7519 3, 4, "r\nzCoding system: ",
48b0f3ae 7520 doc: /* Decode the current region from the specified coding system.
df7492f9
KH
7521When called from a program, takes four arguments:
7522 START, END, CODING-SYSTEM, and DESTINATION.
7523START and END are buffer positions.
8844fa83 7524
df7492f9
KH
7525Optional 4th arguments DESTINATION specifies where the decoded text goes.
7526If nil, the region between START and END is replace by the decoded text.
7527If buffer, the decoded text is inserted in the buffer.
7528If t, the decoded text is returned.
8844fa83 7529
48b0f3ae
PJ
7530This function sets `last-coding-system-used' to the precise coding system
7531used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7532not fully specified.)
7533It returns the length of the decoded text. */)
df7492f9
KH
7534 (start, end, coding_system, destination)
7535 Lisp_Object start, end, coding_system, destination;
4031e2bf 7536{
df7492f9 7537 return code_convert_region (start, end, coding_system, destination, 0, 0);
3a73fa5d 7538}
8844fa83 7539
3a73fa5d 7540DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
df7492f9
KH
7541 3, 4, "r\nzCoding system: ",
7542 doc: /* Encode the current region by specified coding system.
48b0f3ae
PJ
7543When called from a program, takes three arguments:
7544START, END, and CODING-SYSTEM. START and END are buffer positions.
d46c5b12 7545
df7492f9
KH
7546Optional 4th arguments DESTINATION specifies where the encoded text goes.
7547If nil, the region between START and END is replace by the encoded text.
7548If buffer, the encoded text is inserted in the buffer.
7549If t, the encoded text is returned.
2391eaa4 7550
48b0f3ae
PJ
7551This function sets `last-coding-system-used' to the precise coding system
7552used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7553not fully specified.)
7554It returns the length of the encoded text. */)
df7492f9
KH
7555 (start, end, coding_system, destination)
7556 Lisp_Object start, end, coding_system, destination;
3a73fa5d 7557{
df7492f9 7558 return code_convert_region (start, end, coding_system, destination, 1, 0);
b73bfc1c
KH
7559}
7560
7561Lisp_Object
df7492f9
KH
7562code_convert_string (string, coding_system, dst_object,
7563 encodep, nocopy, norecord)
7564 Lisp_Object string, coding_system, dst_object;
7565 int encodep, nocopy, norecord;
b73bfc1c 7566{
4031e2bf 7567 struct coding_system coding;
df7492f9 7568 EMACS_INT chars, bytes;
ec6d2bb8 7569
b7826503 7570 CHECK_STRING (string);
d46c5b12 7571 if (NILP (coding_system))
4956c225 7572 {
df7492f9
KH
7573 if (! norecord)
7574 Vlast_coding_system_used = Qno_conversion;
7575 if (NILP (dst_object))
7576 return (nocopy ? Fcopy_sequence (string) : string);
4956c225 7577 }
b73bfc1c 7578
df7492f9
KH
7579 if (NILP (coding_system))
7580 coding_system = Qno_conversion;
7581 else
7582 CHECK_CODING_SYSTEM (coding_system);
7583 if (NILP (dst_object))
7584 dst_object = Qt;
7585 else if (! EQ (dst_object, Qt))
7586 CHECK_BUFFER (dst_object);
73be902c 7587
df7492f9 7588 setup_coding_system (coding_system, &coding);
d46c5b12 7589 coding.mode |= CODING_MODE_LAST_BLOCK;
8f924df7
KH
7590 chars = SCHARS (string);
7591 bytes = SBYTES (string);
df7492f9
KH
7592 if (encodep)
7593 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7594 else
7595 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7596 if (! norecord)
7597 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
73be902c 7598
df7492f9
KH
7599 return (BUFFERP (dst_object)
7600 ? make_number (coding.produced_char)
7601 : coding.dst_object);
4ed46869 7602}
73be902c 7603
b73bfc1c 7604
ecec61c1 7605/* Encode or decode STRING according to CODING_SYSTEM.
ec6d2bb8 7606 Do not set Vlast_coding_system_used.
4ed46869 7607
ec6d2bb8
KH
7608 This function is called only from macros DECODE_FILE and
7609 ENCODE_FILE, thus we ignore character composition. */
4ed46869 7610
ecec61c1
KH
7611Lisp_Object
7612code_convert_string_norecord (string, coding_system, encodep)
7613 Lisp_Object string, coding_system;
7614 int encodep;
4ed46869 7615{
0be8721c 7616 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
4ed46869
KH
7617}
7618
4ed46869 7619
df7492f9
KH
7620DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7621 2, 4, 0,
7622 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7623
7624Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7625if the decoding operation is trivial.
ecec61c1 7626
df7492f9 7627Optional fourth arg BUFFER non-nil meant that the decoded text is
a3f6ee6d 7628inserted in BUFFER instead of returned as a string. In this case,
df7492f9 7629the return value is BUFFER.
ecec61c1 7630
df7492f9
KH
7631This function sets `last-coding-system-used' to the precise coding system
7632used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7633not fully specified. */)
7634 (string, coding_system, nocopy, buffer)
7635 Lisp_Object string, coding_system, nocopy, buffer;
4ed46869 7636{
df7492f9
KH
7637 return code_convert_string (string, coding_system, buffer,
7638 0, ! NILP (nocopy), 0);
4ed46869
KH
7639}
7640
df7492f9
KH
7641DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7642 2, 4, 0,
7643 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7644
7645Optional third arg NOCOPY non-nil means it is OK to return STRING
7646itself if the encoding operation is trivial.
7647
7648Optional fourth arg BUFFER non-nil meant that the encoded text is
a3f6ee6d 7649inserted in BUFFER instead of returned as a string. In this case,
df7492f9
KH
7650the return value is BUFFER.
7651
7652This function sets `last-coding-system-used' to the precise coding system
7653used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7654not fully specified.) */)
7655 (string, coding_system, nocopy, buffer)
7656 Lisp_Object string, coding_system, nocopy, buffer;
4ed46869 7657{
df7492f9 7658 return code_convert_string (string, coding_system, buffer,
c197f191 7659 1, ! NILP (nocopy), 1);
4ed46869 7660}
df7492f9 7661
3a73fa5d 7662\f
4ed46869 7663DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
48b0f3ae
PJ
7664 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7665Return the corresponding character. */)
7666 (code)
4ed46869 7667 Lisp_Object code;
4ed46869 7668{
df7492f9
KH
7669 Lisp_Object spec, attrs, val;
7670 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
7671 int c;
4ed46869 7672
df7492f9
KH
7673 CHECK_NATNUM (code);
7674 c = XFASTINT (code);
7675 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7676 attrs = AREF (spec, 0);
4ed46869 7677
df7492f9
KH
7678 if (ASCII_BYTE_P (c)
7679 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7680 return code;
4ed46869 7681
df7492f9
KH
7682 val = CODING_ATTR_CHARSET_LIST (attrs);
7683 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
004068e4
KH
7684 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7685 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
fa42c37f 7686
df7492f9
KH
7687 if (c <= 0x7F)
7688 charset = charset_roman;
7689 else if (c >= 0xA0 && c < 0xDF)
55ab7be3 7690 {
df7492f9
KH
7691 charset = charset_kana;
7692 c -= 0x80;
4ed46869 7693 }
55ab7be3 7694 else
4ed46869 7695 {
004068e4 7696 int s1 = c >> 8, s2 = c & 0xFF;
df7492f9
KH
7697
7698 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
7699 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
7700 error ("Invalid code: %d", code);
7701 SJIS_TO_JIS (c);
7702 charset = charset_kanji;
4ed46869 7703 }
df7492f9
KH
7704 c = DECODE_CHAR (charset, c);
7705 if (c < 0)
7706 error ("Invalid code: %d", code);
7707 return make_number (c);
93dec019 7708}
4ed46869 7709
48b0f3ae 7710
4ed46869 7711DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
48b0f3ae
PJ
7712 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7713Return the corresponding code in SJIS. */)
7714 (ch)
df7492f9 7715 Lisp_Object ch;
4ed46869 7716{
df7492f9
KH
7717 Lisp_Object spec, attrs, charset_list;
7718 int c;
7719 struct charset *charset;
7720 unsigned code;
48b0f3ae 7721
df7492f9
KH
7722 CHECK_CHARACTER (ch);
7723 c = XFASTINT (ch);
7724 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7725 attrs = AREF (spec, 0);
7726
7727 if (ASCII_CHAR_P (c)
7728 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7729 return ch;
7730
7731 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7732 charset = char_charset (c, charset_list, &code);
7733 if (code == CHARSET_INVALID_CODE (charset))
7734 error ("Can't encode by shift_jis encoding: %d", c);
7735 JIS_TO_SJIS (code);
7736
7737 return make_number (code);
4ed46869
KH
7738}
7739
7740DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
48b0f3ae
PJ
7741 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7742Return the corresponding character. */)
7743 (code)
4ed46869 7744 Lisp_Object code;
d46c5b12 7745{
df7492f9
KH
7746 Lisp_Object spec, attrs, val;
7747 struct charset *charset_roman, *charset_big5, *charset;
7748 int c;
6289dd10 7749
df7492f9
KH
7750 CHECK_NATNUM (code);
7751 c = XFASTINT (code);
7752 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7753 attrs = AREF (spec, 0);
4ed46869 7754
df7492f9
KH
7755 if (ASCII_BYTE_P (c)
7756 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7757 return code;
6289dd10 7758
df7492f9
KH
7759 val = CODING_ATTR_CHARSET_LIST (attrs);
7760 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7761 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
c210f766 7762
df7492f9
KH
7763 if (c <= 0x7F)
7764 charset = charset_roman;
c28a9453
KH
7765 else
7766 {
df7492f9
KH
7767 int b1 = c >> 8, b2 = c & 0x7F;
7768 if (b1 < 0xA1 || b1 > 0xFE
7769 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
7770 error ("Invalid code: %d", code);
7771 charset = charset_big5;
c28a9453 7772 }
df7492f9
KH
7773 c = DECODE_CHAR (charset, (unsigned )c);
7774 if (c < 0)
7775 error ("Invalid code: %d", code);
7776 return make_number (c);
d46c5b12 7777}
6289dd10 7778
4ed46869 7779DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
48b0f3ae
PJ
7780 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7781Return the corresponding character code in Big5. */)
7782 (ch)
4ed46869
KH
7783 Lisp_Object ch;
7784{
df7492f9
KH
7785 Lisp_Object spec, attrs, charset_list;
7786 struct charset *charset;
7787 int c;
7788 unsigned code;
7789
7790 CHECK_CHARACTER (ch);
7791 c = XFASTINT (ch);
7792 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7793 attrs = AREF (spec, 0);
7794 if (ASCII_CHAR_P (c)
7795 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7796 return ch;
7797
7798 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7799 charset = char_charset (c, charset_list, &code);
7800 if (code == CHARSET_INVALID_CODE (charset))
7801 error ("Can't encode by Big5 encoding: %d", c);
7802
7803 return make_number (code);
4ed46869 7804}
48b0f3ae 7805
3a73fa5d 7806\f
1ba9e4ab
KH
7807DEFUN ("set-terminal-coding-system-internal",
7808 Fset_terminal_coding_system_internal,
48b0f3ae
PJ
7809 Sset_terminal_coding_system_internal, 1, 1, 0,
7810 doc: /* Internal use only. */)
7811 (coding_system)
b74e4686 7812 Lisp_Object coding_system;
4ed46869 7813{
b7826503 7814 CHECK_SYMBOL (coding_system);
df7492f9
KH
7815 setup_coding_system (Fcheck_coding_system (coding_system),
7816 &terminal_coding);
48b0f3ae 7817
70c22245 7818 /* We had better not send unsafe characters to terminal. */
df7492f9
KH
7819 terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
7820 /* Characer composition should be disabled. */
7821 terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
b73bfc1c
KH
7822 terminal_coding.src_multibyte = 1;
7823 terminal_coding.dst_multibyte = 0;
4ed46869
KH
7824 return Qnil;
7825}
7826
c4825358
KH
7827DEFUN ("set-safe-terminal-coding-system-internal",
7828 Fset_safe_terminal_coding_system_internal,
48b0f3ae 7829 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
ddb67bdc 7830 doc: /* Internal use only. */)
48b0f3ae 7831 (coding_system)
b74e4686 7832 Lisp_Object coding_system;
d46c5b12 7833{
b7826503 7834 CHECK_SYMBOL (coding_system);
c4825358
KH
7835 setup_coding_system (Fcheck_coding_system (coding_system),
7836 &safe_terminal_coding);
df7492f9
KH
7837 /* Characer composition should be disabled. */
7838 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
b73bfc1c
KH
7839 safe_terminal_coding.src_multibyte = 1;
7840 safe_terminal_coding.dst_multibyte = 0;
c4825358
KH
7841 return Qnil;
7842}
4ed46869 7843
4ed46869
KH
7844DEFUN ("terminal-coding-system",
7845 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
48b0f3ae
PJ
7846 doc: /* Return coding system specified for terminal output. */)
7847 ()
4ed46869 7848{
df7492f9 7849 return CODING_ID_NAME (terminal_coding.id);
4ed46869
KH
7850}
7851
1ba9e4ab
KH
7852DEFUN ("set-keyboard-coding-system-internal",
7853 Fset_keyboard_coding_system_internal,
48b0f3ae
PJ
7854 Sset_keyboard_coding_system_internal, 1, 1, 0,
7855 doc: /* Internal use only. */)
7856 (coding_system)
4ed46869
KH
7857 Lisp_Object coding_system;
7858{
b7826503 7859 CHECK_SYMBOL (coding_system);
df7492f9
KH
7860 setup_coding_system (Fcheck_coding_system (coding_system),
7861 &keyboard_coding);
7862 /* Characer composition should be disabled. */
7863 keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
4ed46869
KH
7864 return Qnil;
7865}
7866
7867DEFUN ("keyboard-coding-system",
7868 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
48b0f3ae
PJ
7869 doc: /* Return coding system specified for decoding keyboard input. */)
7870 ()
4ed46869 7871{
df7492f9 7872 return CODING_ID_NAME (keyboard_coding.id);
4ed46869
KH
7873}
7874
4ed46869 7875\f
a5d301df
KH
7876DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7877 Sfind_operation_coding_system, 1, MANY, 0,
48b0f3ae
PJ
7878 doc: /* Choose a coding system for an operation based on the target name.
7879The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7880DECODING-SYSTEM is the coding system to use for decoding
7881\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7882for encoding (in case OPERATION does encoding).
05e6f5dc 7883
48b0f3ae
PJ
7884The first argument OPERATION specifies an I/O primitive:
7885 For file I/O, `insert-file-contents' or `write-region'.
7886 For process I/O, `call-process', `call-process-region', or `start-process'.
7887 For network I/O, `open-network-stream'.
05e6f5dc 7888
48b0f3ae
PJ
7889The remaining arguments should be the same arguments that were passed
7890to the primitive. Depending on which primitive, one of those arguments
7891is selected as the TARGET. For example, if OPERATION does file I/O,
7892whichever argument specifies the file name is TARGET.
05e6f5dc 7893
48b0f3ae
PJ
7894TARGET has a meaning which depends on OPERATION:
7895 For file I/O, TARGET is a file name.
7896 For process I/O, TARGET is a process name.
7897 For network I/O, TARGET is a service name or a port number
05e6f5dc 7898
48b0f3ae
PJ
7899This function looks up what specified for TARGET in,
7900`file-coding-system-alist', `process-coding-system-alist',
7901or `network-coding-system-alist' depending on OPERATION.
7902They may specify a coding system, a cons of coding systems,
7903or a function symbol to call.
7904In the last case, we call the function with one argument,
7905which is a list of all the arguments given to this function.
7906
7907usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7908 (nargs, args)
4ed46869
KH
7909 int nargs;
7910 Lisp_Object *args;
6b89e3aa 7911{
4ed46869
KH
7912 Lisp_Object operation, target_idx, target, val;
7913 register Lisp_Object chain;
177c0ea7 7914
4ed46869
KH
7915 if (nargs < 2)
7916 error ("Too few arguments");
7917 operation = args[0];
7918 if (!SYMBOLP (operation)
7919 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
df7492f9 7920 error ("Invalid first arguement");
4ed46869
KH
7921 if (nargs < 1 + XINT (target_idx))
7922 error ("Too few arguments for operation: %s",
8f924df7 7923 SDATA (SYMBOL_NAME (operation)));
4ed46869
KH
7924 target = args[XINT (target_idx) + 1];
7925 if (!(STRINGP (target)
7926 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
df7492f9 7927 error ("Invalid %dth argument", XINT (target_idx) + 1);
4ed46869 7928
2e34157c
RS
7929 chain = ((EQ (operation, Qinsert_file_contents)
7930 || EQ (operation, Qwrite_region))
02ba4723 7931 ? Vfile_coding_system_alist
2e34157c 7932 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
7933 ? Vnetwork_coding_system_alist
7934 : Vprocess_coding_system_alist));
4ed46869
KH
7935 if (NILP (chain))
7936 return Qnil;
7937
03699b14 7938 for (; CONSP (chain); chain = XCDR (chain))
6b89e3aa 7939 {
f44d27ce 7940 Lisp_Object elt;
6b89e3aa 7941
df7492f9 7942 elt = XCAR (chain);
4ed46869
KH
7943 if (CONSP (elt)
7944 && ((STRINGP (target)
03699b14
KR
7945 && STRINGP (XCAR (elt))
7946 && fast_string_match (XCAR (elt), target) >= 0)
7947 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
6b89e3aa 7948 {
03699b14 7949 val = XCDR (elt);
b19fd4c5
KH
7950 /* Here, if VAL is both a valid coding system and a valid
7951 function symbol, we return VAL as a coding system. */
02ba4723
KH
7952 if (CONSP (val))
7953 return val;
7954 if (! SYMBOLP (val))
7955 return Qnil;
7956 if (! NILP (Fcoding_system_p (val)))
7957 return Fcons (val, val);
b19fd4c5 7958 if (! NILP (Ffboundp (val)))
6b89e3aa 7959 {
b19fd4c5
KH
7960 val = call1 (val, Flist (nargs, args));
7961 if (CONSP (val))
7962 return val;
7963 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7964 return Fcons (val, val);
6b89e3aa 7965 }
02ba4723 7966 return Qnil;
6b89e3aa
KH
7967 }
7968 }
4ed46869 7969 return Qnil;
6b89e3aa
KH
7970}
7971
df7492f9 7972DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
a3f6ee6d 7973 Sset_coding_system_priority, 0, MANY, 0,
da7db224 7974 doc: /* Assign higher priority to the coding systems given as arguments.
ff563fce 7975If multiple coding systems belongs to the same category,
a3181084
DL
7976all but the first one are ignored.
7977
7978usage: (set-coding-system-priority ...) */)
df7492f9
KH
7979 (nargs, args)
7980 int nargs;
7981 Lisp_Object *args;
7982{
7983 int i, j;
7984 int changed[coding_category_max];
7985 enum coding_category priorities[coding_category_max];
7986
7987 bzero (changed, sizeof changed);
6b89e3aa 7988
df7492f9 7989 for (i = j = 0; i < nargs; i++)
6b89e3aa 7990 {
df7492f9
KH
7991 enum coding_category category;
7992 Lisp_Object spec, attrs;
6b89e3aa 7993
df7492f9
KH
7994 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
7995 attrs = AREF (spec, 0);
7996 category = XINT (CODING_ATTR_CATEGORY (attrs));
7997 if (changed[category])
7998 /* Ignore this coding system because a coding system of the
7999 same category already had a higher priority. */
8000 continue;
8001 changed[category] = 1;
8002 priorities[j++] = category;
8003 if (coding_categories[category].id >= 0
8004 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
8005 setup_coding_system (args[i], &coding_categories[category]);
ff563fce 8006 Fset (AREF (Vcoding_category_table, category), args[i]);
df7492f9 8007 }
6b89e3aa 8008
df7492f9
KH
8009 /* Now we have decided top J priorities. Reflect the order of the
8010 original priorities to the remaining priorities. */
6b89e3aa 8011
df7492f9 8012 for (i = j, j = 0; i < coding_category_max; i++, j++)
6b89e3aa 8013 {
df7492f9
KH
8014 while (j < coding_category_max
8015 && changed[coding_priorities[j]])
8016 j++;
8017 if (j == coding_category_max)
8018 abort ();
8019 priorities[i] = coding_priorities[j];
8020 }
6b89e3aa 8021
df7492f9 8022 bcopy (priorities, coding_priorities, sizeof priorities);
177c0ea7 8023
ff563fce
KH
8024 /* Update `coding-category-list'. */
8025 Vcoding_category_list = Qnil;
8026 for (i = coding_category_max - 1; i >= 0; i--)
8027 Vcoding_category_list
8028 = Fcons (AREF (Vcoding_category_table, priorities[i]),
8029 Vcoding_category_list);
6b89e3aa 8030
df7492f9 8031 return Qnil;
6b89e3aa
KH
8032}
8033
df7492f9
KH
8034DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
8035 Scoding_system_priority_list, 0, 1, 0,
da7db224
DL
8036 doc: /* Return a list of coding systems ordered by their priorities.
8037HIGHESTP non-nil means just return the highest priority one. */)
df7492f9
KH
8038 (highestp)
8039 Lisp_Object highestp;
d46c5b12
KH
8040{
8041 int i;
df7492f9 8042 Lisp_Object val;
6b89e3aa 8043
df7492f9 8044 for (i = 0, val = Qnil; i < coding_category_max; i++)
d46c5b12 8045 {
df7492f9
KH
8046 enum coding_category category = coding_priorities[i];
8047 int id = coding_categories[category].id;
8048 Lisp_Object attrs;
068a9dbd 8049
df7492f9
KH
8050 if (id < 0)
8051 continue;
8052 attrs = CODING_ID_ATTRS (id);
8053 if (! NILP (highestp))
8054 return CODING_ATTR_BASE_NAME (attrs);
8055 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
8056 }
8057 return Fnreverse (val);
8058}
068a9dbd 8059
f0064e1f 8060static char *suffixes[] = { "-unix", "-dos", "-mac" };
068a9dbd
KH
8061
8062static Lisp_Object
df7492f9
KH
8063make_subsidiaries (base)
8064 Lisp_Object base;
068a9dbd 8065{
df7492f9 8066 Lisp_Object subsidiaries;
8f924df7 8067 int base_name_len = SBYTES (SYMBOL_NAME (base));
df7492f9
KH
8068 char *buf = (char *) alloca (base_name_len + 6);
8069 int i;
068a9dbd 8070
8f924df7 8071 bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
df7492f9
KH
8072 subsidiaries = Fmake_vector (make_number (3), Qnil);
8073 for (i = 0; i < 3; i++)
068a9dbd 8074 {
df7492f9
KH
8075 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
8076 ASET (subsidiaries, i, intern (buf));
068a9dbd 8077 }
df7492f9 8078 return subsidiaries;
068a9dbd
KH
8079}
8080
8081
df7492f9
KH
8082DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
8083 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
1fcd6c8b
DL
8084 doc: /* For internal use only.
8085usage: (define-coding-system-internal ...) */)
df7492f9
KH
8086 (nargs, args)
8087 int nargs;
8088 Lisp_Object *args;
068a9dbd 8089{
df7492f9
KH
8090 Lisp_Object name;
8091 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
8092 Lisp_Object attrs; /* Vector of attributes. */
8093 Lisp_Object eol_type;
8094 Lisp_Object aliases;
8095 Lisp_Object coding_type, charset_list, safe_charsets;
8096 enum coding_category category;
8097 Lisp_Object tail, val;
8098 int max_charset_id = 0;
8099 int i;
068a9dbd 8100
df7492f9
KH
8101 if (nargs < coding_arg_max)
8102 goto short_args;
068a9dbd 8103
df7492f9 8104 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
068a9dbd 8105
df7492f9
KH
8106 name = args[coding_arg_name];
8107 CHECK_SYMBOL (name);
8108 CODING_ATTR_BASE_NAME (attrs) = name;
068a9dbd 8109
df7492f9
KH
8110 val = args[coding_arg_mnemonic];
8111 if (! STRINGP (val))
8112 CHECK_CHARACTER (val);
8113 CODING_ATTR_MNEMONIC (attrs) = val;
068a9dbd 8114
df7492f9
KH
8115 coding_type = args[coding_arg_coding_type];
8116 CHECK_SYMBOL (coding_type);
8117 CODING_ATTR_TYPE (attrs) = coding_type;
068a9dbd 8118
df7492f9
KH
8119 charset_list = args[coding_arg_charset_list];
8120 if (SYMBOLP (charset_list))
8121 {
8122 if (EQ (charset_list, Qiso_2022))
8123 {
8124 if (! EQ (coding_type, Qiso_2022))
8125 error ("Invalid charset-list");
8126 charset_list = Viso_2022_charset_list;
8127 }
8128 else if (EQ (charset_list, Qemacs_mule))
8129 {
8130 if (! EQ (coding_type, Qemacs_mule))
8131 error ("Invalid charset-list");
8132 charset_list = Vemacs_mule_charset_list;
8133 }
8134 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8135 if (max_charset_id < XFASTINT (XCAR (tail)))
8136 max_charset_id = XFASTINT (XCAR (tail));
8137 }
068a9dbd
KH
8138 else
8139 {
df7492f9
KH
8140 charset_list = Fcopy_sequence (charset_list);
8141 for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
068a9dbd 8142 {
df7492f9
KH
8143 struct charset *charset;
8144
8145 val = Fcar (tail);
8146 CHECK_CHARSET_GET_CHARSET (val, charset);
8147 if (EQ (coding_type, Qiso_2022)
8148 ? CHARSET_ISO_FINAL (charset) < 0
8149 : EQ (coding_type, Qemacs_mule)
8150 ? CHARSET_EMACS_MULE_ID (charset) < 0
8151 : 0)
8152 error ("Can't handle charset `%s'",
8f924df7 8153 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
df7492f9 8154
8f924df7 8155 XSETCAR (tail, make_number (charset->id));
df7492f9
KH
8156 if (max_charset_id < charset->id)
8157 max_charset_id = charset->id;
068a9dbd
KH
8158 }
8159 }
df7492f9 8160 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
068a9dbd 8161
df7492f9
KH
8162 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
8163 make_number (255));
8164 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8f924df7 8165 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
df7492f9 8166 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
068a9dbd 8167
584948ac 8168 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
3a73fa5d 8169
df7492f9 8170 val = args[coding_arg_decode_translation_table];
7d64c6ad
KH
8171 if (! CHAR_TABLE_P (val))
8172 CHECK_SYMBOL (val);
df7492f9 8173 CODING_ATTR_DECODE_TBL (attrs) = val;
3a73fa5d 8174
df7492f9 8175 val = args[coding_arg_encode_translation_table];
7d64c6ad
KH
8176 if (! CHAR_TABLE_P (val))
8177 CHECK_SYMBOL (val);
df7492f9 8178 CODING_ATTR_ENCODE_TBL (attrs) = val;
d46c5b12 8179
df7492f9
KH
8180 val = args[coding_arg_post_read_conversion];
8181 CHECK_SYMBOL (val);
8182 CODING_ATTR_POST_READ (attrs) = val;
d46c5b12 8183
df7492f9
KH
8184 val = args[coding_arg_pre_write_conversion];
8185 CHECK_SYMBOL (val);
8186 CODING_ATTR_PRE_WRITE (attrs) = val;
3a73fa5d 8187
df7492f9
KH
8188 val = args[coding_arg_default_char];
8189 if (NILP (val))
8190 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
8191 else
8192 {
8f924df7 8193 CHECK_CHARACTER (val);
df7492f9
KH
8194 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8195 }
4031e2bf 8196
8f924df7
KH
8197 val = args[coding_arg_for_unibyte];
8198 CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
3a73fa5d 8199
df7492f9
KH
8200 val = args[coding_arg_plist];
8201 CHECK_LIST (val);
8202 CODING_ATTR_PLIST (attrs) = val;
3a73fa5d 8203
df7492f9
KH
8204 if (EQ (coding_type, Qcharset))
8205 {
c7c66a95
KH
8206 /* Generate a lisp vector of 256 elements. Each element is nil,
8207 integer, or a list of charset IDs.
3a73fa5d 8208
c7c66a95
KH
8209 If Nth element is nil, the byte code N is invalid in this
8210 coding system.
4ed46869 8211
c7c66a95
KH
8212 If Nth element is a number NUM, N is the first byte of a
8213 charset whose ID is NUM.
4ed46869 8214
c7c66a95
KH
8215 If Nth element is a list of charset IDs, N is the first byte
8216 of one of them. The list is sorted by dimensions of the
2bc515e4 8217 charsets. A charset of smaller dimension comes first. */
df7492f9 8218 val = Fmake_vector (make_number (256), Qnil);
4ed46869 8219
5c99c2e6 8220 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
df7492f9 8221 {
c7c66a95
KH
8222 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8223 int dim = CHARSET_DIMENSION (charset);
8224 int idx = (dim - 1) * 4;
4ed46869 8225
5c99c2e6 8226 if (CHARSET_ASCII_COMPATIBLE_P (charset))
584948ac 8227 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
4031e2bf 8228
15d143f7
KH
8229 for (i = charset->code_space[idx];
8230 i <= charset->code_space[idx + 1]; i++)
8231 {
c7c66a95
KH
8232 Lisp_Object tmp, tmp2;
8233 int dim2;
ec6d2bb8 8234
c7c66a95
KH
8235 tmp = AREF (val, i);
8236 if (NILP (tmp))
8237 tmp = XCAR (tail);
8238 else if (NUMBERP (tmp))
8239 {
8240 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8241 if (dim < dim2)
c7c66a95 8242 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
f9d71dcd
KH
8243 else
8244 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
c7c66a95 8245 }
15d143f7 8246 else
c7c66a95
KH
8247 {
8248 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8249 {
8250 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8251 if (dim < dim2)
8252 break;
8253 }
8254 if (NILP (tmp2))
8255 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8256 else
8257 {
8258 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8259 XSETCAR (tmp2, XCAR (tail));
8260 }
8261 }
8262 ASET (val, i, tmp);
15d143f7 8263 }
df7492f9
KH
8264 }
8265 ASET (attrs, coding_attr_charset_valids, val);
8266 category = coding_category_charset;
8267 }
8268 else if (EQ (coding_type, Qccl))
8269 {
8270 Lisp_Object valids;
ecec61c1 8271
df7492f9
KH
8272 if (nargs < coding_arg_ccl_max)
8273 goto short_args;
ecec61c1 8274
df7492f9
KH
8275 val = args[coding_arg_ccl_decoder];
8276 CHECK_CCL_PROGRAM (val);
8277 if (VECTORP (val))
8278 val = Fcopy_sequence (val);
8279 ASET (attrs, coding_attr_ccl_decoder, val);
ecec61c1 8280
df7492f9
KH
8281 val = args[coding_arg_ccl_encoder];
8282 CHECK_CCL_PROGRAM (val);
8283 if (VECTORP (val))
8284 val = Fcopy_sequence (val);
8285 ASET (attrs, coding_attr_ccl_encoder, val);
ecec61c1 8286
df7492f9
KH
8287 val = args[coding_arg_ccl_valids];
8288 valids = Fmake_string (make_number (256), make_number (0));
8289 for (tail = val; !NILP (tail); tail = Fcdr (tail))
8290 {
8dcbea82 8291 int from, to;
ecec61c1 8292
df7492f9
KH
8293 val = Fcar (tail);
8294 if (INTEGERP (val))
8dcbea82
KH
8295 {
8296 from = to = XINT (val);
8297 if (from < 0 || from > 255)
8298 args_out_of_range_3 (val, make_number (0), make_number (255));
8299 }
df7492f9
KH
8300 else
8301 {
df7492f9 8302 CHECK_CONS (val);
8f924df7
KH
8303 CHECK_NATNUM_CAR (val);
8304 CHECK_NATNUM_CDR (val);
df7492f9 8305 from = XINT (XCAR (val));
8f924df7 8306 if (from > 255)
8dcbea82
KH
8307 args_out_of_range_3 (XCAR (val),
8308 make_number (0), make_number (255));
df7492f9 8309 to = XINT (XCDR (val));
8dcbea82
KH
8310 if (to < from || to > 255)
8311 args_out_of_range_3 (XCDR (val),
8312 XCAR (val), make_number (255));
df7492f9 8313 }
8dcbea82 8314 for (i = from; i <= to; i++)
8f924df7 8315 SSET (valids, i, 1);
df7492f9
KH
8316 }
8317 ASET (attrs, coding_attr_ccl_valids, valids);
4ed46869 8318
df7492f9 8319 category = coding_category_ccl;
55ab7be3 8320 }
df7492f9 8321 else if (EQ (coding_type, Qutf_16))
55ab7be3 8322 {
df7492f9 8323 Lisp_Object bom, endian;
4ed46869 8324
584948ac 8325 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
4ed46869 8326
df7492f9
KH
8327 if (nargs < coding_arg_utf16_max)
8328 goto short_args;
4ed46869 8329
df7492f9
KH
8330 bom = args[coding_arg_utf16_bom];
8331 if (! NILP (bom) && ! EQ (bom, Qt))
8332 {
8333 CHECK_CONS (bom);
8f924df7
KH
8334 val = XCAR (bom);
8335 CHECK_CODING_SYSTEM (val);
8336 val = XCDR (bom);
8337 CHECK_CODING_SYSTEM (val);
df7492f9
KH
8338 }
8339 ASET (attrs, coding_attr_utf_16_bom, bom);
8340
8341 endian = args[coding_arg_utf16_endian];
b49a1807
KH
8342 CHECK_SYMBOL (endian);
8343 if (NILP (endian))
8344 endian = Qbig;
8345 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8f924df7 8346 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
df7492f9
KH
8347 ASET (attrs, coding_attr_utf_16_endian, endian);
8348
8349 category = (CONSP (bom)
8350 ? coding_category_utf_16_auto
8351 : NILP (bom)
b49a1807 8352 ? (EQ (endian, Qbig)
df7492f9
KH
8353 ? coding_category_utf_16_be_nosig
8354 : coding_category_utf_16_le_nosig)
b49a1807 8355 : (EQ (endian, Qbig)
df7492f9
KH
8356 ? coding_category_utf_16_be
8357 : coding_category_utf_16_le));
8358 }
8359 else if (EQ (coding_type, Qiso_2022))
8360 {
8361 Lisp_Object initial, reg_usage, request, flags;
4776e638 8362 int i;
1397dc18 8363
df7492f9
KH
8364 if (nargs < coding_arg_iso2022_max)
8365 goto short_args;
8366
8367 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8368 CHECK_VECTOR (initial);
8369 for (i = 0; i < 4; i++)
8370 {
8371 val = Faref (initial, make_number (i));
8372 if (! NILP (val))
8373 {
584948ac
KH
8374 struct charset *charset;
8375
8376 CHECK_CHARSET_GET_CHARSET (val, charset);
8377 ASET (initial, i, make_number (CHARSET_ID (charset)));
8378 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
8379 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9
KH
8380 }
8381 else
8382 ASET (initial, i, make_number (-1));
8383 }
8384
8385 reg_usage = args[coding_arg_iso2022_reg_usage];
8386 CHECK_CONS (reg_usage);
8f924df7
KH
8387 CHECK_NUMBER_CAR (reg_usage);
8388 CHECK_NUMBER_CDR (reg_usage);
df7492f9
KH
8389
8390 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8391 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
1397dc18 8392 {
df7492f9 8393 int id;
8f924df7 8394 Lisp_Object tmp;
df7492f9
KH
8395
8396 val = Fcar (tail);
8397 CHECK_CONS (val);
8f924df7
KH
8398 tmp = XCAR (val);
8399 CHECK_CHARSET_GET_ID (tmp, id);
8400 CHECK_NATNUM_CDR (val);
df7492f9
KH
8401 if (XINT (XCDR (val)) >= 4)
8402 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8f924df7 8403 XSETCAR (val, make_number (id));
1397dc18 8404 }
4ed46869 8405
df7492f9
KH
8406 flags = args[coding_arg_iso2022_flags];
8407 CHECK_NATNUM (flags);
8408 i = XINT (flags);
8409 if (EQ (args[coding_arg_charset_list], Qiso_2022))
8410 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8411
8412 ASET (attrs, coding_attr_iso_initial, initial);
8413 ASET (attrs, coding_attr_iso_usage, reg_usage);
8414 ASET (attrs, coding_attr_iso_request, request);
8415 ASET (attrs, coding_attr_iso_flags, flags);
8416 setup_iso_safe_charsets (attrs);
8417
8418 if (i & CODING_ISO_FLAG_SEVEN_BITS)
8419 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8420 | CODING_ISO_FLAG_SINGLE_SHIFT))
8421 ? coding_category_iso_7_else
8422 : EQ (args[coding_arg_charset_list], Qiso_2022)
8423 ? coding_category_iso_7
8424 : coding_category_iso_7_tight);
8425 else
8426 {
8427 int id = XINT (AREF (initial, 1));
8428
c6fb6e98 8429 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
df7492f9
KH
8430 || EQ (args[coding_arg_charset_list], Qiso_2022)
8431 || id < 0)
8432 ? coding_category_iso_8_else
8433 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8434 ? coding_category_iso_8_1
8435 : coding_category_iso_8_2);
8436 }
0ce7886f
KH
8437 if (category != coding_category_iso_8_1
8438 && category != coding_category_iso_8_2)
8439 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
df7492f9
KH
8440 }
8441 else if (EQ (coding_type, Qemacs_mule))
c28a9453 8442 {
df7492f9
KH
8443 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8444 ASET (attrs, coding_attr_emacs_mule_full, Qt);
584948ac 8445 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9 8446 category = coding_category_emacs_mule;
c28a9453 8447 }
df7492f9 8448 else if (EQ (coding_type, Qshift_jis))
c28a9453 8449 {
df7492f9
KH
8450
8451 struct charset *charset;
8452
7d64c6ad
KH
8453 if (XINT (Flength (charset_list)) != 3
8454 || XINT (Flength (charset_list)) != 4)
8455 error ("There should be three or four charsets");
df7492f9
KH
8456
8457 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8458 if (CHARSET_DIMENSION (charset) != 1)
8459 error ("Dimension of charset %s is not one",
8f924df7 8460 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
584948ac
KH
8461 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8462 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9
KH
8463
8464 charset_list = XCDR (charset_list);
8465 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8466 if (CHARSET_DIMENSION (charset) != 1)
8467 error ("Dimension of charset %s is not one",
8f924df7 8468 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
df7492f9
KH
8469
8470 charset_list = XCDR (charset_list);
8471 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8472 if (CHARSET_DIMENSION (charset) != 2)
7d64c6ad
KH
8473 error ("Dimension of charset %s is not two",
8474 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8475
8476 charset_list = XCDR (charset_list);
8477 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8478 if (CHARSET_DIMENSION (charset) != 2)
df7492f9 8479 error ("Dimension of charset %s is not two",
8f924df7 8480 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
df7492f9
KH
8481
8482 category = coding_category_sjis;
8483 Vsjis_coding_system = name;
c28a9453 8484 }
df7492f9
KH
8485 else if (EQ (coding_type, Qbig5))
8486 {
8487 struct charset *charset;
4ed46869 8488
df7492f9
KH
8489 if (XINT (Flength (charset_list)) != 2)
8490 error ("There should be just two charsets");
8491
8492 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8493 if (CHARSET_DIMENSION (charset) != 1)
8494 error ("Dimension of charset %s is not one",
8f924df7 8495 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
584948ac
KH
8496 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8497 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
df7492f9
KH
8498
8499 charset_list = XCDR (charset_list);
8500 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8501 if (CHARSET_DIMENSION (charset) != 2)
8502 error ("Dimension of charset %s is not two",
8f924df7 8503 SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
4ed46869 8504
df7492f9
KH
8505 category = coding_category_big5;
8506 Vbig5_coding_system = name;
8507 }
8508 else if (EQ (coding_type, Qraw_text))
c28a9453 8509 {
584948ac
KH
8510 category = coding_category_raw_text;
8511 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
c28a9453 8512 }
df7492f9 8513 else if (EQ (coding_type, Qutf_8))
4ed46869 8514 {
584948ac
KH
8515 category = coding_category_utf_8;
8516 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
4ed46869 8517 }
df7492f9
KH
8518 else if (EQ (coding_type, Qundecided))
8519 category = coding_category_undecided;
4ed46869 8520 else
df7492f9 8521 error ("Invalid coding system type: %s",
8f924df7 8522 SDATA (SYMBOL_NAME (coding_type)));
4ed46869 8523
df7492f9 8524 CODING_ATTR_CATEGORY (attrs) = make_number (category);
01378f49
KH
8525 CODING_ATTR_PLIST (attrs)
8526 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
8527 CODING_ATTR_PLIST (attrs)));
c4825358 8528
df7492f9
KH
8529 eol_type = args[coding_arg_eol_type];
8530 if (! NILP (eol_type)
8531 && ! EQ (eol_type, Qunix)
8532 && ! EQ (eol_type, Qdos)
8533 && ! EQ (eol_type, Qmac))
8534 error ("Invalid eol-type");
4ed46869 8535
df7492f9 8536 aliases = Fcons (name, Qnil);
4ed46869 8537
df7492f9
KH
8538 if (NILP (eol_type))
8539 {
8540 eol_type = make_subsidiaries (name);
8541 for (i = 0; i < 3; i++)
1397dc18 8542 {
df7492f9
KH
8543 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
8544
8545 this_name = AREF (eol_type, i);
8546 this_aliases = Fcons (this_name, Qnil);
8547 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
8548 this_spec = Fmake_vector (make_number (3), attrs);
8549 ASET (this_spec, 1, this_aliases);
8550 ASET (this_spec, 2, this_eol_type);
8551 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
8552 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
8553 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
8554 Vcoding_system_alist);
1397dc18 8555 }
d46c5b12 8556 }
4ed46869 8557
df7492f9
KH
8558 spec_vec = Fmake_vector (make_number (3), attrs);
8559 ASET (spec_vec, 1, aliases);
8560 ASET (spec_vec, 2, eol_type);
48b0f3ae 8561
df7492f9
KH
8562 Fputhash (name, spec_vec, Vcoding_system_hash_table);
8563 Vcoding_system_list = Fcons (name, Vcoding_system_list);
8564 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
8565 Vcoding_system_alist);
48b0f3ae 8566
df7492f9
KH
8567 {
8568 int id = coding_categories[category].id;
48b0f3ae 8569
df7492f9
KH
8570 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
8571 setup_coding_system (name, &coding_categories[category]);
8572 }
48b0f3ae 8573
d46c5b12 8574 return Qnil;
48b0f3ae 8575
df7492f9
KH
8576 short_args:
8577 return Fsignal (Qwrong_number_of_arguments,
8578 Fcons (intern ("define-coding-system-internal"),
8579 make_number (nargs)));
d46c5b12 8580}
4ed46869 8581
d6925f38 8582
df7492f9
KH
8583DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
8584 Sdefine_coding_system_alias, 2, 2, 0,
8585 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
8586 (alias, coding_system)
8587 Lisp_Object alias, coding_system;
66cfb530 8588{
df7492f9 8589 Lisp_Object spec, aliases, eol_type;
4ed46869 8590
df7492f9
KH
8591 CHECK_SYMBOL (alias);
8592 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8593 aliases = AREF (spec, 1);
d6925f38
KH
8594 /* ALISES should be a list of length more than zero, and the first
8595 element is a base coding system. Append ALIAS at the tail of the
8596 list. */
df7492f9
KH
8597 while (!NILP (XCDR (aliases)))
8598 aliases = XCDR (aliases);
8f924df7 8599 XSETCDR (aliases, Fcons (alias, Qnil));
4ed46869 8600
df7492f9
KH
8601 eol_type = AREF (spec, 2);
8602 if (VECTORP (eol_type))
4ed46869 8603 {
df7492f9
KH
8604 Lisp_Object subsidiaries;
8605 int i;
4ed46869 8606
df7492f9
KH
8607 subsidiaries = make_subsidiaries (alias);
8608 for (i = 0; i < 3; i++)
8609 Fdefine_coding_system_alias (AREF (subsidiaries, i),
8610 AREF (eol_type, i));
4ed46869 8611 }
df7492f9
KH
8612
8613 Fputhash (alias, spec, Vcoding_system_hash_table);
d6925f38 8614 Vcoding_system_list = Fcons (alias, Vcoding_system_list);
5bad0796
DL
8615 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
8616 Vcoding_system_alist);
66cfb530 8617
4ed46869
KH
8618 return Qnil;
8619}
8620
df7492f9
KH
8621DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
8622 1, 1, 0,
8623 doc: /* Return the base of CODING-SYSTEM.
da7db224 8624Any alias or subsidiary coding system is not a base coding system. */)
df7492f9
KH
8625 (coding_system)
8626 Lisp_Object coding_system;
d46c5b12 8627{
df7492f9 8628 Lisp_Object spec, attrs;
d46c5b12 8629
df7492f9
KH
8630 if (NILP (coding_system))
8631 return (Qno_conversion);
8632 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8633 attrs = AREF (spec, 0);
8634 return CODING_ATTR_BASE_NAME (attrs);
8635}
1397dc18 8636
df7492f9
KH
8637DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
8638 1, 1, 0,
8639 doc: "Return the property list of CODING-SYSTEM.")
8640 (coding_system)
8641 Lisp_Object coding_system;
8642{
8643 Lisp_Object spec, attrs;
1397dc18 8644
df7492f9
KH
8645 if (NILP (coding_system))
8646 coding_system = Qno_conversion;
8647 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8648 attrs = AREF (spec, 0);
8649 return CODING_ATTR_PLIST (attrs);
d46c5b12
KH
8650}
8651
df7492f9
KH
8652
8653DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
8654 1, 1, 0,
da7db224 8655 doc: /* Return the list of aliases of CODING-SYSTEM. */)
df7492f9
KH
8656 (coding_system)
8657 Lisp_Object coding_system;
66cfb530 8658{
df7492f9 8659 Lisp_Object spec;
84d60297 8660
df7492f9
KH
8661 if (NILP (coding_system))
8662 coding_system = Qno_conversion;
8663 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
da7db224 8664 return AREF (spec, 1);
df7492f9 8665}
66cfb530 8666
df7492f9
KH
8667DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
8668 Scoding_system_eol_type, 1, 1, 0,
8669 doc: /* Return eol-type of CODING-SYSTEM.
8670An eol-type is integer 0, 1, 2, or a vector of coding systems.
66cfb530 8671
df7492f9
KH
8672Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
8673and CR respectively.
66cfb530 8674
df7492f9
KH
8675A vector value indicates that a format of end-of-line should be
8676detected automatically. Nth element of the vector is the subsidiary
8677coding system whose eol-type is N. */)
6b89e3aa
KH
8678 (coding_system)
8679 Lisp_Object coding_system;
8680{
df7492f9
KH
8681 Lisp_Object spec, eol_type;
8682 int n;
6b89e3aa 8683
df7492f9
KH
8684 if (NILP (coding_system))
8685 coding_system = Qno_conversion;
8686 if (! CODING_SYSTEM_P (coding_system))
8687 return Qnil;
8688 spec = CODING_SYSTEM_SPEC (coding_system);
8689 eol_type = AREF (spec, 2);
8690 if (VECTORP (eol_type))
8691 return Fcopy_sequence (eol_type);
8692 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
8693 return make_number (n);
6b89e3aa
KH
8694}
8695
4ed46869
KH
8696#endif /* emacs */
8697
8698\f
1397dc18 8699/*** 12. Postamble ***/
4ed46869 8700
dfcf069d 8701void
4ed46869
KH
8702init_coding_once ()
8703{
8704 int i;
8705
df7492f9
KH
8706 for (i = 0; i < coding_category_max; i++)
8707 {
8708 coding_categories[i].id = -1;
8709 coding_priorities[i] = i;
8710 }
4ed46869
KH
8711
8712 /* ISO2022 specific initialize routine. */
8713 for (i = 0; i < 0x20; i++)
b73bfc1c 8714 iso_code_class[i] = ISO_control_0;
4ed46869
KH
8715 for (i = 0x21; i < 0x7F; i++)
8716 iso_code_class[i] = ISO_graphic_plane_0;
8717 for (i = 0x80; i < 0xA0; i++)
b73bfc1c 8718 iso_code_class[i] = ISO_control_1;
4ed46869
KH
8719 for (i = 0xA1; i < 0xFF; i++)
8720 iso_code_class[i] = ISO_graphic_plane_1;
8721 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
8722 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
4ed46869
KH
8723 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
8724 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
8725 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
8726 iso_code_class[ISO_CODE_ESC] = ISO_escape;
8727 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
8728 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
8729 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
8730
df7492f9
KH
8731 for (i = 0; i < 256; i++)
8732 {
8733 emacs_mule_bytes[i] = 1;
8734 }
7c78e542
KH
8735 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
8736 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
8737 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
8738 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
e0e989f6
KH
8739}
8740
8741#ifdef emacs
8742
dfcf069d 8743void
e0e989f6
KH
8744syms_of_coding ()
8745{
df7492f9 8746 staticpro (&Vcoding_system_hash_table);
8f924df7
KH
8747 {
8748 Lisp_Object args[2];
8749 args[0] = QCtest;
8750 args[1] = Qeq;
8751 Vcoding_system_hash_table = Fmake_hash_table (2, args);
8752 }
df7492f9
KH
8753
8754 staticpro (&Vsjis_coding_system);
8755 Vsjis_coding_system = Qnil;
e0e989f6 8756
df7492f9
KH
8757 staticpro (&Vbig5_coding_system);
8758 Vbig5_coding_system = Qnil;
8759
24a73b0a
KH
8760 staticpro (&Vcode_conversion_reused_workbuf);
8761 Vcode_conversion_reused_workbuf = Qnil;
8762
8763 staticpro (&Vcode_conversion_workbuf_name);
8764 Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
e0e989f6 8765
24a73b0a 8766 reused_workbuf_in_use = 0;
df7492f9
KH
8767
8768 DEFSYM (Qcharset, "charset");
8769 DEFSYM (Qtarget_idx, "target-idx");
8770 DEFSYM (Qcoding_system_history, "coding-system-history");
bb0115a2
RS
8771 Fset (Qcoding_system_history, Qnil);
8772
9ce27fde 8773 /* Target FILENAME is the first argument. */
e0e989f6 8774 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 8775 /* Target FILENAME is the third argument. */
e0e989f6
KH
8776 Fput (Qwrite_region, Qtarget_idx, make_number (2));
8777
df7492f9 8778 DEFSYM (Qcall_process, "call-process");
9ce27fde 8779 /* Target PROGRAM is the first argument. */
e0e989f6
KH
8780 Fput (Qcall_process, Qtarget_idx, make_number (0));
8781
df7492f9 8782 DEFSYM (Qcall_process_region, "call-process-region");
9ce27fde 8783 /* Target PROGRAM is the third argument. */
e0e989f6
KH
8784 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
8785
df7492f9 8786 DEFSYM (Qstart_process, "start-process");
9ce27fde 8787 /* Target PROGRAM is the third argument. */
e0e989f6
KH
8788 Fput (Qstart_process, Qtarget_idx, make_number (2));
8789
df7492f9 8790 DEFSYM (Qopen_network_stream, "open-network-stream");
9ce27fde 8791 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
8792 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
8793
df7492f9
KH
8794 DEFSYM (Qcoding_system, "coding-system");
8795 DEFSYM (Qcoding_aliases, "coding-aliases");
4ed46869 8796
df7492f9
KH
8797 DEFSYM (Qeol_type, "eol-type");
8798 DEFSYM (Qunix, "unix");
8799 DEFSYM (Qdos, "dos");
4ed46869 8800
df7492f9
KH
8801 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
8802 DEFSYM (Qpost_read_conversion, "post-read-conversion");
8803 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
8804 DEFSYM (Qdefault_char, "default-char");
8805 DEFSYM (Qundecided, "undecided");
8806 DEFSYM (Qno_conversion, "no-conversion");
8807 DEFSYM (Qraw_text, "raw-text");
4ed46869 8808
df7492f9 8809 DEFSYM (Qiso_2022, "iso-2022");
4ed46869 8810
df7492f9 8811 DEFSYM (Qutf_8, "utf-8");
8f924df7 8812 DEFSYM (Qutf_8_emacs, "utf-8-emacs");
27901516 8813
df7492f9 8814 DEFSYM (Qutf_16, "utf-16");
df7492f9
KH
8815 DEFSYM (Qbig, "big");
8816 DEFSYM (Qlittle, "little");
27901516 8817
df7492f9
KH
8818 DEFSYM (Qshift_jis, "shift-jis");
8819 DEFSYM (Qbig5, "big5");
4ed46869 8820
df7492f9 8821 DEFSYM (Qcoding_system_p, "coding-system-p");
4ed46869 8822
df7492f9 8823 DEFSYM (Qcoding_system_error, "coding-system-error");
4ed46869
KH
8824 Fput (Qcoding_system_error, Qerror_conditions,
8825 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
8826 Fput (Qcoding_system_error, Qerror_message,
9ce27fde 8827 build_string ("Invalid coding system"));
4ed46869 8828
05e6f5dc
KH
8829 /* Intern this now in case it isn't already done.
8830 Setting this variable twice is harmless.
8831 But don't staticpro it here--that is done in alloc.c. */
8832 Qchar_table_extra_slots = intern ("char-table-extra-slots");
70c22245 8833
df7492f9 8834 DEFSYM (Qtranslation_table, "translation-table");
1397dc18 8835 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
df7492f9
KH
8836 DEFSYM (Qtranslation_table_id, "translation-table-id");
8837 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
8838 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
1397dc18 8839
df7492f9 8840 DEFSYM (Qvalid_codes, "valid-codes");
9ce27fde 8841
df7492f9 8842 DEFSYM (Qemacs_mule, "emacs-mule");
d46c5b12 8843
01378f49
KH
8844 DEFSYM (QCcategory, ":category");
8845
df7492f9
KH
8846 Vcoding_category_table
8847 = Fmake_vector (make_number (coding_category_max), Qnil);
8848 staticpro (&Vcoding_category_table);
8849 /* The following are targets of code detection. */
8850 ASET (Vcoding_category_table, coding_category_iso_7,
8851 intern ("coding-category-iso-7"));
8852 ASET (Vcoding_category_table, coding_category_iso_7_tight,
8853 intern ("coding-category-iso-7-tight"));
8854 ASET (Vcoding_category_table, coding_category_iso_8_1,
8855 intern ("coding-category-iso-8-1"));
8856 ASET (Vcoding_category_table, coding_category_iso_8_2,
8857 intern ("coding-category-iso-8-2"));
8858 ASET (Vcoding_category_table, coding_category_iso_7_else,
8859 intern ("coding-category-iso-7-else"));
8860 ASET (Vcoding_category_table, coding_category_iso_8_else,
8861 intern ("coding-category-iso-8-else"));
8862 ASET (Vcoding_category_table, coding_category_utf_8,
8863 intern ("coding-category-utf-8"));
8864 ASET (Vcoding_category_table, coding_category_utf_16_be,
8865 intern ("coding-category-utf-16-be"));
ff563fce
KH
8866 ASET (Vcoding_category_table, coding_category_utf_16_auto,
8867 intern ("coding-category-utf-16-auto"));
df7492f9
KH
8868 ASET (Vcoding_category_table, coding_category_utf_16_le,
8869 intern ("coding-category-utf-16-le"));
8870 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
8871 intern ("coding-category-utf-16-be-nosig"));
8872 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
8873 intern ("coding-category-utf-16-le-nosig"));
8874 ASET (Vcoding_category_table, coding_category_charset,
8875 intern ("coding-category-charset"));
8876 ASET (Vcoding_category_table, coding_category_sjis,
8877 intern ("coding-category-sjis"));
8878 ASET (Vcoding_category_table, coding_category_big5,
8879 intern ("coding-category-big5"));
8880 ASET (Vcoding_category_table, coding_category_ccl,
8881 intern ("coding-category-ccl"));
8882 ASET (Vcoding_category_table, coding_category_emacs_mule,
8883 intern ("coding-category-emacs-mule"));
8884 /* The following are NOT targets of code detection. */
8885 ASET (Vcoding_category_table, coding_category_raw_text,
8886 intern ("coding-category-raw-text"));
8887 ASET (Vcoding_category_table, coding_category_undecided,
8888 intern ("coding-category-undecided"));
ecf488bc 8889
065e3595
KH
8890 DEFSYM (Qinsufficient_source, "insufficient-source");
8891 DEFSYM (Qinconsistent_eol, "inconsistent-eol");
8892 DEFSYM (Qinvalid_source, "invalid-source");
8893 DEFSYM (Qinterrupted, "interrupted");
8894 DEFSYM (Qinsufficient_memory, "insufficient-memory");
8895
4ed46869
KH
8896 defsubr (&Scoding_system_p);
8897 defsubr (&Sread_coding_system);
8898 defsubr (&Sread_non_nil_coding_system);
8899 defsubr (&Scheck_coding_system);
8900 defsubr (&Sdetect_coding_region);
d46c5b12 8901 defsubr (&Sdetect_coding_string);
05e6f5dc 8902 defsubr (&Sfind_coding_systems_region_internal);
068a9dbd 8903 defsubr (&Sunencodable_char_position);
df7492f9 8904 defsubr (&Scheck_coding_systems_region);
4ed46869
KH
8905 defsubr (&Sdecode_coding_region);
8906 defsubr (&Sencode_coding_region);
8907 defsubr (&Sdecode_coding_string);
8908 defsubr (&Sencode_coding_string);
8909 defsubr (&Sdecode_sjis_char);
8910 defsubr (&Sencode_sjis_char);
8911 defsubr (&Sdecode_big5_char);
8912 defsubr (&Sencode_big5_char);
1ba9e4ab 8913 defsubr (&Sset_terminal_coding_system_internal);
c4825358 8914 defsubr (&Sset_safe_terminal_coding_system_internal);
4ed46869 8915 defsubr (&Sterminal_coding_system);
1ba9e4ab 8916 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 8917 defsubr (&Skeyboard_coding_system);
a5d301df 8918 defsubr (&Sfind_operation_coding_system);
df7492f9 8919 defsubr (&Sset_coding_system_priority);
6b89e3aa 8920 defsubr (&Sdefine_coding_system_internal);
df7492f9
KH
8921 defsubr (&Sdefine_coding_system_alias);
8922 defsubr (&Scoding_system_base);
8923 defsubr (&Scoding_system_plist);
8924 defsubr (&Scoding_system_aliases);
8925 defsubr (&Scoding_system_eol_type);
8926 defsubr (&Scoding_system_priority_list);
4ed46869 8927
4608c386 8928 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
48b0f3ae
PJ
8929 doc: /* List of coding systems.
8930
8931Do not alter the value of this variable manually. This variable should be
df7492f9 8932updated by the functions `define-coding-system' and
48b0f3ae 8933`define-coding-system-alias'. */);
4608c386
KH
8934 Vcoding_system_list = Qnil;
8935
8936 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
48b0f3ae
PJ
8937 doc: /* Alist of coding system names.
8938Each element is one element list of coding system name.
8939This variable is given to `completing-read' as TABLE argument.
8940
8941Do not alter the value of this variable manually. This variable should be
8942updated by the functions `make-coding-system' and
8943`define-coding-system-alias'. */);
4608c386
KH
8944 Vcoding_system_alist = Qnil;
8945
4ed46869 8946 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
48b0f3ae
PJ
8947 doc: /* List of coding-categories (symbols) ordered by priority.
8948
8949On detecting a coding system, Emacs tries code detection algorithms
8950associated with each coding-category one by one in this order. When
8951one algorithm agrees with a byte sequence of source text, the coding
8952system bound to the corresponding coding-category is selected. */);
4ed46869
KH
8953 {
8954 int i;
8955
8956 Vcoding_category_list = Qnil;
df7492f9 8957 for (i = coding_category_max - 1; i >= 0; i--)
4ed46869 8958 Vcoding_category_list
d46c5b12
KH
8959 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
8960 Vcoding_category_list);
4ed46869
KH
8961 }
8962
8963 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
48b0f3ae
PJ
8964 doc: /* Specify the coding system for read operations.
8965It is useful to bind this variable with `let', but do not set it globally.
8966If the value is a coding system, it is used for decoding on read operation.
8967If not, an appropriate element is used from one of the coding system alists:
8968There are three such tables, `file-coding-system-alist',
8969`process-coding-system-alist', and `network-coding-system-alist'. */);
4ed46869
KH
8970 Vcoding_system_for_read = Qnil;
8971
8972 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
48b0f3ae
PJ
8973 doc: /* Specify the coding system for write operations.
8974Programs bind this variable with `let', but you should not set it globally.
8975If the value is a coding system, it is used for encoding of output,
8976when writing it to a file and when sending it to a file or subprocess.
8977
8978If this does not specify a coding system, an appropriate element
8979is used from one of the coding system alists:
8980There are three such tables, `file-coding-system-alist',
8981`process-coding-system-alist', and `network-coding-system-alist'.
8982For output to files, if the above procedure does not specify a coding system,
8983the value of `buffer-file-coding-system' is used. */);
4ed46869
KH
8984 Vcoding_system_for_write = Qnil;
8985
8986 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
df7492f9
KH
8987 doc: /*
8988Coding system used in the latest file or process I/O. */);
4ed46869
KH
8989 Vlast_coding_system_used = Qnil;
8990
065e3595
KH
8991 DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
8992 doc: /*
8993Error status of the last code conversion.
8994
8995When an error was detected in the last code conversion, this variable
8996is set to one of the following symbols.
8997 `insufficient-source'
8998 `inconsistent-eol'
8999 `invalid-source'
9000 `interrupted'
9001 `insufficient-memory'
9002When no error was detected, the value doesn't change. So, to check
9003the error status of a code conversion by this variable, you must
9004explicitly set this variable to nil before performing code
9005conversion. */);
9006 Vlast_code_conversion_error = Qnil;
9007
9ce27fde 9008 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
df7492f9
KH
9009 doc: /*
9010*Non-nil means always inhibit code conversion of end-of-line format.
48b0f3ae
PJ
9011See info node `Coding Systems' and info node `Text and Binary' concerning
9012such conversion. */);
9ce27fde
KH
9013 inhibit_eol_conversion = 0;
9014
ed29121d 9015 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
df7492f9
KH
9016 doc: /*
9017Non-nil means process buffer inherits coding system of process output.
48b0f3ae
PJ
9018Bind it to t if the process output is to be treated as if it were a file
9019read from some filesystem. */);
ed29121d
EZ
9020 inherit_process_coding_system = 0;
9021
02ba4723 9022 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
df7492f9
KH
9023 doc: /*
9024Alist to decide a coding system to use for a file I/O operation.
48b0f3ae
PJ
9025The format is ((PATTERN . VAL) ...),
9026where PATTERN is a regular expression matching a file name,
9027VAL is a coding system, a cons of coding systems, or a function symbol.
9028If VAL is a coding system, it is used for both decoding and encoding
9029the file contents.
9030If VAL is a cons of coding systems, the car part is used for decoding,
9031and the cdr part is used for encoding.
9032If VAL is a function symbol, the function must return a coding system
0192762c
DL
9033or a cons of coding systems which are used as above. The function gets
9034the arguments with which `find-operation-coding-system' was called.
48b0f3ae
PJ
9035
9036See also the function `find-operation-coding-system'
9037and the variable `auto-coding-alist'. */);
02ba4723
KH
9038 Vfile_coding_system_alist = Qnil;
9039
9040 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
df7492f9
KH
9041 doc: /*
9042Alist to decide a coding system to use for a process I/O operation.
48b0f3ae
PJ
9043The format is ((PATTERN . VAL) ...),
9044where PATTERN is a regular expression matching a program name,
9045VAL is a coding system, a cons of coding systems, or a function symbol.
9046If VAL is a coding system, it is used for both decoding what is received
9047from the program and encoding what is sent to the program.
9048If VAL is a cons of coding systems, the car part is used for decoding,
9049and the cdr part is used for encoding.
9050If VAL is a function symbol, the function must return a coding system
9051or a cons of coding systems which are used as above.
9052
9053See also the function `find-operation-coding-system'. */);
02ba4723
KH
9054 Vprocess_coding_system_alist = Qnil;
9055
9056 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
df7492f9
KH
9057 doc: /*
9058Alist to decide a coding system to use for a network I/O operation.
48b0f3ae
PJ
9059The format is ((PATTERN . VAL) ...),
9060where PATTERN is a regular expression matching a network service name
9061or is a port number to connect to,
9062VAL is a coding system, a cons of coding systems, or a function symbol.
9063If VAL is a coding system, it is used for both decoding what is received
9064from the network stream and encoding what is sent to the network stream.
9065If VAL is a cons of coding systems, the car part is used for decoding,
9066and the cdr part is used for encoding.
9067If VAL is a function symbol, the function must return a coding system
9068or a cons of coding systems which are used as above.
9069
9070See also the function `find-operation-coding-system'. */);
02ba4723 9071 Vnetwork_coding_system_alist = Qnil;
4ed46869 9072
68c45bf0 9073 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
75205970
RS
9074 doc: /* Coding system to use with system messages.
9075Also used for decoding keyboard input on X Window system. */);
68c45bf0
PE
9076 Vlocale_coding_system = Qnil;
9077
005f0d35 9078 /* The eol mnemonics are reset in startup.el system-dependently. */
7722baf9 9079 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
df7492f9
KH
9080 doc: /*
9081*String displayed in mode line for UNIX-like (LF) end-of-line format. */);
7722baf9 9082 eol_mnemonic_unix = build_string (":");
4ed46869 9083
7722baf9 9084 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
df7492f9
KH
9085 doc: /*
9086*String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
7722baf9 9087 eol_mnemonic_dos = build_string ("\\");
4ed46869 9088
7722baf9 9089 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
df7492f9
KH
9090 doc: /*
9091*String displayed in mode line for MAC-like (CR) end-of-line format. */);
7722baf9 9092 eol_mnemonic_mac = build_string ("/");
4ed46869 9093
7722baf9 9094 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
df7492f9
KH
9095 doc: /*
9096*String displayed in mode line when end-of-line format is not yet determined. */);
7722baf9 9097 eol_mnemonic_undecided = build_string (":");
4ed46869 9098
84fbb8a0 9099 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
df7492f9
KH
9100 doc: /*
9101*Non-nil enables character translation while encoding and decoding. */);
84fbb8a0 9102 Venable_character_translation = Qt;
bdd9fb48 9103
f967223b 9104 DEFVAR_LISP ("standard-translation-table-for-decode",
48b0f3ae
PJ
9105 &Vstandard_translation_table_for_decode,
9106 doc: /* Table for translating characters while decoding. */);
f967223b 9107 Vstandard_translation_table_for_decode = Qnil;
bdd9fb48 9108
f967223b 9109 DEFVAR_LISP ("standard-translation-table-for-encode",
48b0f3ae
PJ
9110 &Vstandard_translation_table_for_encode,
9111 doc: /* Table for translating characters while encoding. */);
f967223b 9112 Vstandard_translation_table_for_encode = Qnil;
4ed46869 9113
df7492f9 9114 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
48b0f3ae
PJ
9115 doc: /* Alist of charsets vs revision numbers.
9116While encoding, if a charset (car part of an element) is found,
df7492f9
KH
9117designate it with the escape sequence identifying revision (cdr part
9118of the element). */);
9119 Vcharset_revision_table = Qnil;
02ba4723
KH
9120
9121 DEFVAR_LISP ("default-process-coding-system",
9122 &Vdefault_process_coding_system,
48b0f3ae
PJ
9123 doc: /* Cons of coding systems used for process I/O by default.
9124The car part is used for decoding a process output,
9125the cdr part is used for encoding a text to be sent to a process. */);
02ba4723 9126 Vdefault_process_coding_system = Qnil;
c4825358 9127
3f003981 9128 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
df7492f9
KH
9129 doc: /*
9130Table of extra Latin codes in the range 128..159 (inclusive).
48b0f3ae
PJ
9131This is a vector of length 256.
9132If Nth element is non-nil, the existence of code N in a file
9133\(or output of subprocess) doesn't prevent it from being detected as
9134a coding system of ISO 2022 variant which has a flag
9135`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9136or reading output of a subprocess.
9137Only the 128th through 159th elements have a meaning. */);
3f003981 9138 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
d46c5b12
KH
9139
9140 DEFVAR_LISP ("select-safe-coding-system-function",
9141 &Vselect_safe_coding_system_function,
df7492f9
KH
9142 doc: /*
9143Function to call to select safe coding system for encoding a text.
48b0f3ae
PJ
9144
9145If set, this function is called to force a user to select a proper
9146coding system which can encode the text in the case that a default
9147coding system used in each operation can't encode the text.
9148
9149The default value is `select-safe-coding-system' (which see). */);
d46c5b12
KH
9150 Vselect_safe_coding_system_function = Qnil;
9151
5d5bf4d8
KH
9152 DEFVAR_BOOL ("coding-system-require-warning",
9153 &coding_system_require_warning,
9154 doc: /* Internal use only.
6b89e3aa
KH
9155If non-nil, on writing a file, `select-safe-coding-system-function' is
9156called even if `coding-system-for-write' is non-nil. The command
9157`universal-coding-system-argument' binds this variable to t temporarily. */);
5d5bf4d8
KH
9158 coding_system_require_warning = 0;
9159
9160
22ab2303 9161 DEFVAR_BOOL ("inhibit-iso-escape-detection",
74383408 9162 &inhibit_iso_escape_detection,
df7492f9
KH
9163 doc: /*
9164If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
48b0f3ae
PJ
9165
9166By default, on reading a file, Emacs tries to detect how the text is
9167encoded. This code detection is sensitive to escape sequences. If
9168the sequence is valid as ISO2022, the code is determined as one of
9169the ISO2022 encodings, and the file is decoded by the corresponding
9170coding system (e.g. `iso-2022-7bit').
9171
9172However, there may be a case that you want to read escape sequences in
9173a file as is. In such a case, you can set this variable to non-nil.
9174Then, as the code detection ignores any escape sequences, no file is
9175detected as encoded in some ISO2022 encoding. The result is that all
9176escape sequences become visible in a buffer.
9177
9178The default value is nil, and it is strongly recommended not to change
9179it. That is because many Emacs Lisp source files that contain
9180non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9181in Emacs's distribution, and they won't be decoded correctly on
9182reading if you suppress escape sequence detection.
9183
9184The other way to read escape sequences in a file without decoding is
9185to explicitly specify some coding system that doesn't use ISO2022's
9186escape sequence (e.g. `latin-1') on reading by \\[universal-coding-system-argument]. */);
74383408 9187 inhibit_iso_escape_detection = 0;
002fdb44
DL
9188
9189 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
15c8f9d1
DL
9190 doc: /* Char table for translating self-inserting characters.
9191This is applied to the result of input methods, not their input. See also
9192`keyboard-translate-table'. */);
002fdb44 9193 Vtranslation_table_for_input = Qnil;
8f924df7 9194
2c78b7e1
KH
9195 {
9196 Lisp_Object args[coding_arg_max];
8f924df7 9197 Lisp_Object plist[16];
2c78b7e1
KH
9198 int i;
9199
9200 for (i = 0; i < coding_arg_max; i++)
9201 args[i] = Qnil;
9202
9203 plist[0] = intern (":name");
9204 plist[1] = args[coding_arg_name] = Qno_conversion;
9205 plist[2] = intern (":mnemonic");
9206 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
9207 plist[4] = intern (":coding-type");
9208 plist[5] = args[coding_arg_coding_type] = Qraw_text;
9209 plist[6] = intern (":ascii-compatible-p");
9210 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
9211 plist[8] = intern (":default-char");
9212 plist[9] = args[coding_arg_default_char] = make_number (0);
8f924df7
KH
9213 plist[10] = intern (":for-unibyte");
9214 plist[11] = args[coding_arg_for_unibyte] = Qt;
9215 plist[12] = intern (":docstring");
9216 plist[13] = build_string ("Do no conversion.\n\
2c78b7e1
KH
9217\n\
9218When you visit a file with this coding, the file is read into a\n\
9219unibyte buffer as is, thus each byte of a file is treated as a\n\
9220character.");
8f924df7
KH
9221 plist[14] = intern (":eol-type");
9222 plist[15] = args[coding_arg_eol_type] = Qunix;
9223 args[coding_arg_plist] = Flist (16, plist);
2c78b7e1
KH
9224 Fdefine_coding_system_internal (coding_arg_max, args);
9225 }
9226
9227 setup_coding_system (Qno_conversion, &keyboard_coding);
9228 setup_coding_system (Qno_conversion, &terminal_coding);
9229 setup_coding_system (Qno_conversion, &safe_terminal_coding);
ff563fce
KH
9230
9231 {
9232 int i;
9233
9234 for (i = 0; i < coding_category_max; i++)
9235 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
9236 }
4ed46869
KH
9237}
9238
68c45bf0
PE
9239char *
9240emacs_strerror (error_number)
9241 int error_number;
9242{
9243 char *str;
9244
ca9c0567 9245 synchronize_system_messages_locale ();
68c45bf0
PE
9246 str = strerror (error_number);
9247
9248 if (! NILP (Vlocale_coding_system))
9249 {
9250 Lisp_Object dec = code_convert_string_norecord (build_string (str),
9251 Vlocale_coding_system,
9252 0);
d5db4077 9253 str = (char *) SDATA (dec);
68c45bf0
PE
9254 }
9255
9256 return str;
9257}
9258
4ed46869 9259#endif /* emacs */