(custom-variable-prompt): Allow customization of
[bpt/emacs.git] / src / coding.c
CommitLineData
4ed46869 1/* Coding system handler (conversion, detection, and etc).
4a2f9c6a 2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
203cb916 3 Licensed to the Free Software Foundation.
4ed46869 4
369314dc
KH
5This file is part of GNU Emacs.
6
7GNU Emacs is free software; you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation; either version 2, or (at your option)
10any later version.
4ed46869 11
369314dc
KH
12GNU Emacs is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
4ed46869 16
369314dc
KH
17You should have received a copy of the GNU General Public License
18along with GNU Emacs; see the file COPYING. If not, write to
19the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20Boston, MA 02111-1307, USA. */
4ed46869
KH
21
22/*** TABLE OF CONTENTS ***
23
24 1. Preamble
0ef69138 25 2. Emacs' internal format (emacs-mule) handlers
4ed46869
KH
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
1397dc18
KH
28 5. CCL handlers
29 6. End-of-line handlers
30 7. C library functions
31 8. Emacs Lisp library functions
32 9. Post-amble
4ed46869
KH
33
34*/
35
36/*** GENERAL NOTE on CODING SYSTEM ***
37
  Coding system is an encoding mechanism of one or more character
  sets.  Here's a list of coding systems which Emacs can handle.  When
  we say "decode", it means converting some other coding system to
  Emacs' internal format (emacs-mule), and when we say "encode",
  it means converting the coding system emacs-mule to some other
  coding system.
4ed46869 44
0ef69138 45 0. Emacs' internal format (emacs-mule)
4ed46869
KH
46
47 Emacs itself holds a multi-lingual character in a buffer and a string
f4dee582 48 in a special format. Details are described in section 2.
4ed46869
KH
49
50 1. ISO2022
51
52 The most famous coding system for multiple character sets. X's
f4dee582
RS
53 Compound Text, various EUCs (Extended Unix Code), and coding
54 systems used in Internet communication such as ISO-2022-JP are
55 all variants of ISO2022. Details are described in section 3.
4ed46869
KH
56
57 2. SJIS (or Shift-JIS or MS-Kanji-Code)
58
59 A coding system to encode character sets: ASCII, JISX0201, and
60 JISX0208. Widely used for PC's in Japan. Details are described in
f4dee582 61 section 4.
4ed46869
KH
62
63 3. BIG5
64
65 A coding system to encode character sets: ASCII and Big5. Widely
66 used by Chinese (mainly in Taiwan and Hong Kong). Details are
f4dee582
RS
67 described in section 4. In this file, when we write "BIG5"
68 (all uppercase), we mean the coding system, and when we write
69 "Big5" (capitalized), we mean the character set.
4ed46869 70
27901516
KH
71 4. Raw text
72
4608c386
KH
73 A coding system for a text containing random 8-bit code. Emacs does
74 no code conversion on such a text except for end-of-line format.
27901516
KH
75
76 5. Other
4ed46869 77
f4dee582 78 If a user wants to read/write a text encoded in a coding system not
4ed46869
KH
79 listed above, he can supply a decoder and an encoder for it in CCL
80 (Code Conversion Language) programs. Emacs executes the CCL program
81 while reading/writing.
82
d46c5b12
KH
83 Emacs represents a coding system by a Lisp symbol that has a property
84 `coding-system'. But, before actually using the coding system, the
4ed46869 85 information about it is set in a structure of type `struct
f4dee582 86 coding_system' for rapid processing. See section 6 for more details.
4ed46869
KH
87
88*/
89
90/*** GENERAL NOTES on END-OF-LINE FORMAT ***
91
92 How end-of-line of a text is encoded depends on a system. For
93 instance, Unix's format is just one byte of `line-feed' code,
f4dee582 94 whereas DOS's format is two-byte sequence of `carriage-return' and
d46c5b12
KH
95 `line-feed' codes. MacOS's format is usually one byte of
96 `carriage-return'.
4ed46869 97
f4dee582
RS
98 Since text characters encoding and end-of-line encoding are
99 independent, any coding system described above can take
4ed46869 100 any format of end-of-line. So, Emacs has information of format of
f4dee582 101 end-of-line in each coding-system. See section 6 for more details.
4ed46869
KH
102
103*/
104
105/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
106
  These functions check if a text between SRC and SRC_END is encoded
  in the coding system category XXX.  Each returns an integer value in
  which appropriate flag bits for the category XXX are set.  The flag
  bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
  template of these functions.  */
112#if 0
113int
0ef69138 114detect_coding_emacs_mule (src, src_end)
4ed46869
KH
115 unsigned char *src, *src_end;
116{
117 ...
118}
119#endif
120
121/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
122
123 These functions decode SRC_BYTES length text at SOURCE encoded in
0ef69138 124 CODING to Emacs' internal format (emacs-mule). The resulting text
d46c5b12
KH
125 goes to a place pointed to by DESTINATION, the length of which
126 should not exceed DST_BYTES. These functions set the information of
127 original and decoded texts in the members produced, produced_char,
128 consumed, and consumed_char of the structure *CODING.
129
130 The return value is an integer (CODING_FINISH_XXX) indicating how
131 the decoding finished.
132
  DST_BYTES zero means that source area and destination area are
  overlapped, which means that we can produce a decoded text until it
  reaches the head of the not-yet-decoded source text.
136
137 Below is a template of these functions. */
4ed46869 138#if 0
d46c5b12 139decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
140 struct coding_system *coding;
141 unsigned char *source, *destination;
142 int src_bytes, dst_bytes;
4ed46869
KH
143{
144 ...
145}
146#endif
147
148/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
149
0ef69138
KH
150 These functions encode SRC_BYTES length text at SOURCE of Emacs'
151 internal format (emacs-mule) to CODING. The resulting text goes to
f4dee582 152 a place pointed to by DESTINATION, the length of which should not
d46c5b12
KH
153 exceed DST_BYTES. These functions set the information of
154 original and encoded texts in the members produced, produced_char,
155 consumed, and consumed_char of the structure *CODING.
156
157 The return value is an integer (CODING_FINISH_XXX) indicating how
158 the encoding finished.
159
  DST_BYTES zero means that source area and destination area are
  overlapped, which means that we can produce an encoded text until it
  reaches the head of the not-yet-encoded source text.
163
164 Below is a template of these functions. */
4ed46869 165#if 0
d46c5b12 166encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
167 struct coding_system *coding;
168 unsigned char *source, *destination;
169 int src_bytes, dst_bytes;
4ed46869
KH
170{
171 ...
172}
173#endif
174
175/*** COMMONLY USED MACROS ***/
176
/* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
   THREE_MORE_BYTES safely get one, two, and three bytes from the
   source text respectively.  If there are not enough bytes in the
   source, they jump to `label_end_of_loop' without consuming
   anything.  The caller should set variables `src' and `src_end' to
   appropriate areas in advance, and must provide the label.

   The number of remaining bytes is computed as `src_end - src'
   instead of comparing `src + N' with `src_end', because forming a
   pointer more than one element past the end of the source area is
   undefined behavior in C.  */

#define ONE_MORE_BYTE(c1)               \
  do {                                  \
    if (src < src_end)                  \
      c1 = *src++;                      \
    else                                \
      goto label_end_of_loop;           \
  } while (0)

#define TWO_MORE_BYTES(c1, c2)          \
  do {                                  \
    if (src_end - src >= 2)             \
      c1 = *src++, c2 = *src++;         \
    else                                \
      goto label_end_of_loop;           \
  } while (0)

#define THREE_MORE_BYTES(c1, c2, c3)            \
  do {                                          \
    if (src_end - src >= 3)                     \
      c1 = *src++, c2 = *src++, c3 = *src++;    \
    else                                        \
      goto label_end_of_loop;                   \
  } while (0)
206
207/* The following three macros DECODE_CHARACTER_ASCII,
208 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
209 the multi-byte form of a character of each class at the place
210 pointed by `dst'. The caller should set the variable `dst' to
211 point to an appropriate area and the variable `coding' to point to
212 the coding-system of the currently decoding text in advance. */
213
/* Decode one ASCII character C.  C is evaluated exactly once, so an
   argument with side effects is safe.

   While composing (COMPOSING_P (coding->composing) is nonzero), the
   character is written as the two bytes 0xA0 and `C | 0x80', and
   coding->composed_chars is incremented.  Otherwise C itself is
   written and coding->produced_char is incremented.

   coding->fake_multibyte is set when the byte just written is an
   8-bit code below 0xA0 (composing case) or C itself is 0x80 or
   greater (plain case).

   The caller must have `dst' and `coding' in scope.  */

#define DECODE_CHARACTER_ASCII(c)                               \
  do {                                                          \
    int decode_ascii_c_ = (c);                                  \
    if (COMPOSING_P (coding->composing))                        \
      {                                                         \
        int decode_ascii_pos_ = decode_ascii_c_ | 0x80;         \
        *dst++ = 0xA0, *dst++ = decode_ascii_pos_;              \
        coding->composed_chars++;                               \
        if (decode_ascii_pos_ < 0xA0)                           \
          coding->fake_multibyte = 1;                           \
      }                                                         \
    else                                                        \
      {                                                         \
        *dst++ = decode_ascii_c_;                               \
        coding->produced_char++;                                \
        if (decode_ascii_c_ >= 0x80)                            \
          coding->fake_multibyte = 1;                           \
      }                                                         \
  } while (0)
233
/* Decode one DIMENSION1 character whose charset is CHARSET and whose
   position-code is C.  C is evaluated exactly once.

   The base leading-code of CHARSET is written first (with 0x20 added
   while composing), then the extended leading-code if CHARSET has a
   nonzero one, then the position-code `C | 0x80'.  While composing,
   coding->composed_chars is incremented; otherwise
   coding->produced_char is.  coding->fake_multibyte is set when the
   position byte written is below 0xA0.

   The caller must have `dst' and `coding' in scope.  */

#define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
  do {                                                                  \
    unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
    int decode_dim1_pos_ = (c) | 0x80;                                  \
    if (COMPOSING_P (coding->composing))                                \
      {                                                                 \
        *dst++ = leading_code + 0x20;                                   \
        coding->composed_chars++;                                       \
      }                                                                 \
    else                                                                \
      {                                                                 \
        *dst++ = leading_code;                                          \
        coding->produced_char++;                                        \
      }                                                                 \
    /* Reuse LEADING_CODE for the optional extended leading-code.  */   \
    leading_code = CHARSET_LEADING_CODE_EXT (charset);                  \
    if (leading_code != 0)                                              \
      *dst++ = leading_code;                                            \
    *dst++ = decode_dim1_pos_;                                          \
    if (decode_dim1_pos_ < 0xA0)                                        \
      coding->fake_multibyte = 1;                                       \
  } while (0)
256
/* Decode one DIMENSION2 character whose charset is CHARSET and whose
   position-codes are C1 and C2.  The leading code(s) and the first
   position-code are produced by DECODE_CHARACTER_DIMENSION1; then the
   second position-code `C2 | 0x80' is written.  C2 is evaluated
   exactly once, after C1 has been handled.  coding->fake_multibyte
   is set when the second position byte is below 0xA0.

   The caller must have `dst' and `coding' in scope.  */

#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
  do {                                                  \
    DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
    {                                                   \
      int decode_dim2_pos_ = (c2) | 0x80;               \
      *dst++ = decode_dim2_pos_;                        \
      if (decode_dim2_pos_ < 0xA0)                      \
        coding->fake_multibyte = 1;                     \
    }                                                   \
  } while (0)
267
268\f
269/*** 1. Preamble ***/
270
271#include <stdio.h>
272
273#ifdef emacs
274
275#include <config.h>
276#include "lisp.h"
277#include "buffer.h"
278#include "charset.h"
279#include "ccl.h"
280#include "coding.h"
281#include "window.h"
282
283#else /* not emacs */
284
285#include "mulelib.h"
286
287#endif /* not emacs */
288
/* NOTE(review): the Q-prefixed objects below are presumably interned
   Lisp symbols and the V-prefixed one a Lisp variable; they are
   initialized outside this part of the file -- confirm.  */
Lisp_Object Qcoding_system, Qeol_type;
Lisp_Object Qbuffer_file_coding_system;
Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
Lisp_Object Qno_conversion, Qundecided;
Lisp_Object Qcoding_system_history;
Lisp_Object Qsafe_charsets;
Lisp_Object Qvalid_codes;

/* These two are defined in another file; only declared here.  */
extern Lisp_Object Qinsert_file_contents, Qwrite_region;
Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
Lisp_Object Qstart_process, Qopen_network_stream;
Lisp_Object Qtarget_idx;

Lisp_Object Vselect_safe_coding_system_function;

/* Mnemonic string for each format of end-of-line.  */
Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
/* Mnemonic string to indicate format of end-of-line is not yet
   decided.  */
Lisp_Object eol_mnemonic_undecided;

/* Format of end-of-line decided by system.  This is CODING_EOL_LF on
   Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
int system_eol_type;
313
4ed46869
KH
#ifdef emacs

Lisp_Object Vcoding_system_list, Vcoding_system_alist;

Lisp_Object Qcoding_system_p, Qcoding_system_error;

/* Coding system emacs-mule and raw-text are for converting only
   end-of-line format.  */
Lisp_Object Qemacs_mule, Qraw_text;

/* Coding-systems are handed between Emacs Lisp programs and C internal
   routines by the following three variables.  */
/* Coding-system for reading files and receiving data from process.  */
Lisp_Object Vcoding_system_for_read;
/* Coding-system for writing files and sending data to process.  */
Lisp_Object Vcoding_system_for_write;
/* Coding-system actually used in the latest I/O.  */
Lisp_Object Vlast_coding_system_used;

/* A vector of length 256 which contains information about special
   Latin codes (especially for dealing with Microsoft codes).  The
   ISO2022 detector indexes it by byte value and treats a non-nil
   element as "this code may appear".  */
Lisp_Object Vlatin_extra_code_table;

/* Flag to inhibit code conversion of end-of-line format.  */
int inhibit_eol_conversion;

/* Flag to make buffer-file-coding-system inherit from process-coding.  */
int inherit_process_coding_system;

/* Coding system to be used to encode text for terminal display.  */
struct coding_system terminal_coding;

/* Coding system to be used to encode text for terminal display when
   terminal coding system is nil.  */
struct coding_system safe_terminal_coding;

/* Coding system of what is sent from terminal keyboard.  */
struct coding_system keyboard_coding;

/* Default coding system to be used to write a file.  */
struct coding_system default_buffer_file_coding;

/* NOTE(review): presumably alists consulted to pick a coding system
   for file, process, and network I/O respectively -- confirm against
   their DEFVAR documentation.  */
Lisp_Object Vfile_coding_system_alist;
Lisp_Object Vprocess_coding_system_alist;
Lisp_Object Vnetwork_coding_system_alist;

#endif /* emacs */
361
Lisp_Object Qcoding_category, Qcoding_category_index;

/* List of symbols `coding-category-xxx' ordered by priority.  */
Lisp_Object Vcoding_category_list;

/* Table of coding categories (Lisp symbols).  */
Lisp_Object Vcoding_category_table;

/* Table of names of symbol for each coding-category.  The array has
   CODING_CATEGORY_IDX_MAX elements.
   NOTE(review): the order presumably has to match the
   CODING_CATEGORY_IDX_XXX constants -- confirm in `coding.h'.  */
char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
  "coding-category-emacs-mule",
  "coding-category-sjis",
  "coding-category-iso-7",
  "coding-category-iso-7-tight",
  "coding-category-iso-8-1",
  "coding-category-iso-8-2",
  "coding-category-iso-7-else",
  "coding-category-iso-8-else",
  "coding-category-ccl",
  "coding-category-big5",
  "coding-category-raw-text",
  "coding-category-binary"
};

/* Table of pointers to the coding system corresponding to each
   coding category.  An element may be a null pointer (CHARSET_OK
   checks for that before dereferencing).  */
struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];

/* Table of coding category masks.  Nth element is a mask for a coding
   category of which priority is Nth.  */
static
int coding_priorities[CODING_CATEGORY_IDX_MAX];
394
f967223b
KH
/* Flag to tell if we look up translation table on character code
   conversion.  */
Lisp_Object Venable_character_translation;
/* Standard translation table to look up on decoding (reading).  */
Lisp_Object Vstandard_translation_table_for_decode;
/* Standard translation table to look up on encoding (writing).  */
Lisp_Object Vstandard_translation_table_for_encode;

/* NOTE(review): presumably Lisp symbols related to translation
   tables; initialized outside this part of the file -- confirm.  */
Lisp_Object Qtranslation_table;
Lisp_Object Qtranslation_table_id;
Lisp_Object Qtranslation_table_for_decode;
Lisp_Object Qtranslation_table_for_encode;

/* Alist of charsets vs revision number.  */
Lisp_Object Vcharset_revision_alist;

/* Default coding systems used for process I/O.  */
Lisp_Object Vdefault_process_coding_system;
413
4ed46869 414\f
0ef69138 415/*** 2. Emacs internal format (emacs-mule) handlers ***/
4ed46869
KH
416
417/* Emacs' internal format for encoding multiple character sets is a
f4dee582
RS
418 kind of multi-byte encoding, i.e. characters are encoded by
419 variable-length sequences of one-byte codes. ASCII characters
420 and control characters (e.g. `tab', `newline') are represented by
421 one-byte sequences which are their ASCII codes, in the range 0x00
422 through 0x7F. The other characters are represented by a sequence
423 of `base leading-code', optional `extended leading-code', and one
424 or two `position-code's. The length of the sequence is determined
425 by the base leading-code. Leading-code takes the range 0x80
426 through 0x9F, whereas extended leading-code and position-code take
427 the range 0xA0 through 0xFF. See `charset.h' for more details
428 about leading-code and position-code.
429
430 There's one exception to this rule. Special leading-code
4ed46869
KH
431 `leading-code-composition' denotes that the following several
432 characters should be composed into one character. Leading-codes of
433 components (except for ASCII) are added 0x20. An ASCII character
434 component is represented by a 2-byte sequence of `0xA0' and
f4dee582
RS
435 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
436 details of composite character. Hence, we can summarize the code
4ed46869
KH
437 range as follows:
438
439 --- CODE RANGE of Emacs' internal format ---
440 (character set) (range)
441 ASCII 0x00 .. 0x7F
442 ELSE (1st byte) 0x80 .. 0x9F
443 (rest bytes) 0xA0 .. 0xFF
444 ---------------------------------------------
445
446 */
447
/* Class of each of the 256 possible byte values, indexed by the byte
   itself; detect_coding_emacs_mule dispatches on it.  The table is
   filled in outside this part of the file.  */
enum emacs_code_class_type emacs_code_class[256];
449
/* Consume one byte at *SRC and go on to the next statement only if
   that byte is 0xA0 or greater (i.e. in the range 0xA0..0xFF).

   If there is no byte left (SRC >= SRC_END), jump to
   `label_end_of_switch' without consuming anything.  If the byte is
   less than 0xA0, make the enclosing function return 0.

   The caller must have `src' and `src_end' in scope, provide the
   label, and be a function returning an integer.  */
#define CHECK_CODE_RANGE_A0_FF          \
  do {                                  \
    if (src >= src_end)                 \
      goto label_end_of_switch;         \
    else if (*src++ < 0xA0)             \
      return 0;                         \
  } while (0)
459
460/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
461 Check if a text is encoded in Emacs' internal format. If it is,
d46c5b12 462 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
4ed46869
KH
463
464int
0ef69138 465detect_coding_emacs_mule (src, src_end)
4ed46869
KH
466 unsigned char *src, *src_end;
467{
468 unsigned char c;
469 int composing = 0;
470
471 while (src < src_end)
472 {
473 c = *src++;
474
475 if (composing)
476 {
477 if (c < 0xA0)
478 composing = 0;
479 else
480 c -= 0x20;
481 }
482
483 switch (emacs_code_class[c])
484 {
485 case EMACS_ascii_code:
486 case EMACS_linefeed_code:
487 break;
488
489 case EMACS_control_code:
490 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
491 return 0;
492 break;
493
494 case EMACS_invalid_code:
495 return 0;
496
497 case EMACS_leading_code_composition: /* c == 0x80 */
498 if (composing)
499 CHECK_CODE_RANGE_A0_FF;
500 else
501 composing = 1;
502 break;
503
504 case EMACS_leading_code_4:
505 CHECK_CODE_RANGE_A0_FF;
506 /* fall down to check it two more times ... */
507
508 case EMACS_leading_code_3:
509 CHECK_CODE_RANGE_A0_FF;
510 /* fall down to check it one more time ... */
511
512 case EMACS_leading_code_2:
513 CHECK_CODE_RANGE_A0_FF;
514 break;
515
516 default:
517 label_end_of_switch:
518 break;
519 }
520 }
0ef69138 521 return CODING_CATEGORY_MASK_EMACS_MULE;
4ed46869
KH
522}
523
524\f
525/*** 3. ISO2022 handlers ***/
526
527/* The following note describes the coding system ISO2022 briefly.
39787efd
KH
528 Since the intention of this note is to help understand the
529 functions in this file, some parts are NOT ACCURATE or OVERLY
530 SIMPLIFIED. For thorough understanding, please refer to the
4ed46869
KH
531 original document of ISO2022.
532
  ISO2022 provides many mechanisms to encode several character sets
  in 7-bit and 8-bit environments.  For 7-bit environments, all text
  is encoded using bytes less than 128.  This may make the encoded
  text a little bit longer, but the text passes more easily through
  several gateways, some of which strip off MSB (Most Significant Bit).
538
539 There are two kinds of character sets: control character set and
4ed46869
KH
540 graphic character set. The former contains control characters such
541 as `newline' and `escape' to provide control functions (control
39787efd
KH
542 functions are also provided by escape sequences). The latter
543 contains graphic characters such as 'A' and '-'. Emacs recognizes
4ed46869
KH
544 two control character sets and many graphic character sets.
545
546 Graphic character sets are classified into one of the following
39787efd
KH
547 four classes, according to the number of bytes (DIMENSION) and
548 number of characters in one dimension (CHARS) of the set:
549 - DIMENSION1_CHARS94
550 - DIMENSION1_CHARS96
551 - DIMENSION2_CHARS94
552 - DIMENSION2_CHARS96
553
554 In addition, each character set is assigned an identification tag,
555 unique for each set, called "final character" (denoted as <F>
556 hereafter). The <F> of each character set is decided by ECMA(*)
557 when it is registered in ISO. The code range of <F> is 0x30..0x7F
558 (0x30..0x3F are for private use only).
4ed46869
KH
559
560 Note (*): ECMA = European Computer Manufacturers Association
561
562 Here are examples of graphic character set [NAME(<F>)]:
563 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
564 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
565 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
566 o DIMENSION2_CHARS96 -- none for the moment
567
39787efd 568 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4ed46869
KH
569 C0 [0x00..0x1F] -- control character plane 0
570 GL [0x20..0x7F] -- graphic character plane 0
571 C1 [0x80..0x9F] -- control character plane 1
572 GR [0xA0..0xFF] -- graphic character plane 1
573
574 A control character set is directly designated and invoked to C0 or
39787efd
KH
575 C1 by an escape sequence. The most common case is that:
576 - ISO646's control character set is designated/invoked to C0, and
577 - ISO6429's control character set is designated/invoked to C1,
578 and usually these designations/invocations are omitted in encoded
579 text. In a 7-bit environment, only C0 can be used, and a control
580 character for C1 is encoded by an appropriate escape sequence to
581 fit into the environment. All control characters for C1 are
582 defined to have corresponding escape sequences.
4ed46869
KH
583
584 A graphic character set is at first designated to one of four
585 graphic registers (G0 through G3), then these graphic registers are
586 invoked to GL or GR. These designations and invocations can be
587 done independently. The most common case is that G0 is invoked to
39787efd
KH
588 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
589 these invocations and designations are omitted in encoded text.
590 In a 7-bit environment, only GL can be used.
4ed46869 591
39787efd
KH
592 When a graphic character set of CHARS94 is invoked to GL, codes
593 0x20 and 0x7F of the GL area work as control characters SPACE and
594 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
595 be used.
4ed46869
KH
596
597 There are two ways of invocation: locking-shift and single-shift.
598 With locking-shift, the invocation lasts until the next different
39787efd
KH
599 invocation, whereas with single-shift, the invocation affects the
600 following character only and doesn't affect the locking-shift
601 state. Invocations are done by the following control characters or
602 escape sequences:
4ed46869
KH
603
604 ----------------------------------------------------------------------
39787efd 605 abbrev function cntrl escape seq description
4ed46869 606 ----------------------------------------------------------------------
39787efd
KH
607 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
608 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
609 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
610 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
611 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
612 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
613 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
614 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
615 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4ed46869 616 ----------------------------------------------------------------------
39787efd
KH
617 (*) These are not used by any known coding system.
618
619 Control characters for these functions are defined by macros
620 ISO_CODE_XXX in `coding.h'.
4ed46869 621
39787efd 622 Designations are done by the following escape sequences:
4ed46869
KH
623 ----------------------------------------------------------------------
624 escape sequence description
625 ----------------------------------------------------------------------
626 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
627 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
628 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
629 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
630 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
631 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
632 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
633 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
634 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
635 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
636 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
637 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
638 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
639 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
640 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
641 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
642 ----------------------------------------------------------------------
643
644 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
39787efd 645 of dimension 1, chars 94, and final character <F>, etc...
4ed46869
KH
646
647 Note (*): Although these designations are not allowed in ISO2022,
648 Emacs accepts them on decoding, and produces them on encoding
39787efd 649 CHARS96 character sets in a coding system which is characterized as
4ed46869
KH
650 7-bit environment, non-locking-shift, and non-single-shift.
651
652 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
39787efd 653 '(' can be omitted. We refer to this as "short-form" hereafter.
4ed46869
KH
654
  Now you may notice that there are a lot of ways for encoding the
  same multilingual text in ISO2022.  Actually, there exist many
  coding systems such as Compound Text (used in X11's inter client
  communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
  (used in Korean internet), EUC (Extended UNIX Code, used in Asian
  localized platforms), and all of these are variants of ISO2022.
661
662 In addition to the above, Emacs handles two more kinds of escape
663 sequences: ISO6429's direction specification and Emacs' private
664 sequence for specifying character composition.
665
39787efd 666 ISO6429's direction specification takes the following form:
4ed46869
KH
667 o CSI ']' -- end of the current direction
668 o CSI '0' ']' -- end of the current direction
669 o CSI '1' ']' -- start of left-to-right text
670 o CSI '2' ']' -- start of right-to-left text
671 The control character CSI (0x9B: control sequence introducer) is
39787efd
KH
672 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
673
674 Character composition specification takes the following form:
4ed46869
KH
675 o ESC '0' -- start character composition
676 o ESC '1' -- end character composition
39787efd
KH
  Since these are not standard escape sequences of any ISO standard,
  their use with these meanings is restricted to Emacs only.  */
4ed46869
KH
679
/* Table of 256 ISO2022 code classes.
   NOTE(review): presumably indexed by byte value and filled in
   outside this part of the file -- confirm.  */
enum iso_code_class_type iso_code_class[256];
681
f024b6aa
RS
/* Nonzero if the coding system registered for coding category IDX
   exists and accepts CHARSET: either CHARSET is marked in its
   `safe_charsets' table, or the coding system has a requested
   designation for CHARSET.  */
#define CHARSET_OK(idx, charset)                                        \
  (coding_system_table[idx] != 0                                        \
   && (coding_system_table[idx]->safe_charsets[charset] != 0            \
       || (CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION                     \
           != CODING_SPEC_ISO_REQUESTED_DESIGNATION                     \
              (coding_system_table[idx], charset))))
d46c5b12
KH
688
/* Nonzero if the coding system registered for coding category IDX
   exists and its initial designation for register 1 is not negative
   (NOTE(review): presumably meaning some charset is initially
   designated to G1, so that a shift-out is meaningful -- confirm).
   Like CHARSET_OK, this yields 0 instead of dereferencing a null
   pointer when no coding system is registered for IDX.  */
#define SHIFT_OUT_OK(idx)                                       \
  (coding_system_table[idx]                                     \
   && (CODING_SPEC_ISO_INITIAL_DESIGNATION                      \
       (coding_system_table[idx], 1) >= 0))
691
4ed46869
KH
692/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
693 Check if a text is encoded in ISO2022. If it is, returns an
694 integer in which appropriate flag bits any of:
695 CODING_CATEGORY_MASK_ISO_7
d46c5b12 696 CODING_CATEGORY_MASK_ISO_7_TIGHT
4ed46869
KH
697 CODING_CATEGORY_MASK_ISO_8_1
698 CODING_CATEGORY_MASK_ISO_8_2
7717c392
KH
699 CODING_CATEGORY_MASK_ISO_7_ELSE
700 CODING_CATEGORY_MASK_ISO_8_ELSE
4ed46869
KH
701 are set. If a code which should never appear in ISO2022 is found,
702 returns 0. */
703
704int
705detect_coding_iso2022 (src, src_end)
706 unsigned char *src, *src_end;
707{
d46c5b12
KH
708 int mask = CODING_CATEGORY_MASK_ISO;
709 int mask_found = 0;
f46869e4 710 int reg[4], shift_out = 0, single_shifting = 0;
d46c5b12 711 int c, c1, i, charset;
3f003981 712
d46c5b12 713 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
3f003981 714 while (mask && src < src_end)
4ed46869
KH
715 {
716 c = *src++;
717 switch (c)
718 {
719 case ISO_CODE_ESC:
f46869e4 720 single_shifting = 0;
e0e989f6 721 if (src >= src_end)
4ed46869
KH
722 break;
723 c = *src++;
d46c5b12 724 if (c >= '(' && c <= '/')
4ed46869 725 {
bf9cdd4e
KH
726 /* Designation sequence for a charset of dimension 1. */
727 if (src >= src_end)
728 break;
d46c5b12
KH
729 c1 = *src++;
730 if (c1 < ' ' || c1 >= 0x80
731 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
732 /* Invalid designation sequence. Just ignore. */
733 break;
734 reg[(c - '(') % 4] = charset;
bf9cdd4e
KH
735 }
736 else if (c == '$')
737 {
738 /* Designation sequence for a charset of dimension 2. */
739 if (src >= src_end)
740 break;
741 c = *src++;
742 if (c >= '@' && c <= 'B')
743 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
d46c5b12 744 reg[0] = charset = iso_charset_table[1][0][c];
bf9cdd4e 745 else if (c >= '(' && c <= '/')
bcf26d6a 746 {
bf9cdd4e
KH
747 if (src >= src_end)
748 break;
d46c5b12
KH
749 c1 = *src++;
750 if (c1 < ' ' || c1 >= 0x80
751 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
752 /* Invalid designation sequence. Just ignore. */
753 break;
754 reg[(c - '(') % 4] = charset;
bcf26d6a 755 }
bf9cdd4e 756 else
d46c5b12
KH
757 /* Invalid designation sequence. Just ignore. */
758 break;
759 }
ae9ff118 760 else if (c == 'N' || c == 'O')
d46c5b12 761 {
ae9ff118
KH
762 /* ESC <Fe> for SS2 or SS3. */
763 mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
d46c5b12 764 break;
4ed46869 765 }
bf9cdd4e 766 else if (c == '0' || c == '1' || c == '2')
ae9ff118 767 /* ESC <Fp> for start/end composition. Just ignore. */
d46c5b12 768 break;
bf9cdd4e 769 else
d46c5b12
KH
770 /* Invalid escape sequence. Just ignore. */
771 break;
772
773 /* We found a valid designation sequence for CHARSET. */
774 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
775 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset))
776 mask_found |= CODING_CATEGORY_MASK_ISO_7;
777 else
778 mask &= ~CODING_CATEGORY_MASK_ISO_7;
779 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset))
780 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
781 else
782 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
ae9ff118
KH
783 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset))
784 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
785 else
d46c5b12 786 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
ae9ff118
KH
787 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset))
788 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
789 else
d46c5b12 790 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
4ed46869
KH
791 break;
792
4ed46869 793 case ISO_CODE_SO:
f46869e4 794 single_shifting = 0;
d46c5b12
KH
795 if (shift_out == 0
796 && (reg[1] >= 0
797 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
798 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
799 {
800 /* Locking shift out. */
801 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
802 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
803 }
e0e989f6
KH
804 break;
805
d46c5b12 806 case ISO_CODE_SI:
f46869e4 807 single_shifting = 0;
d46c5b12
KH
808 if (shift_out == 1)
809 {
810 /* Locking shift in. */
811 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
812 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
813 }
814 break;
815
4ed46869 816 case ISO_CODE_CSI:
f46869e4 817 single_shifting = 0;
4ed46869
KH
818 case ISO_CODE_SS2:
819 case ISO_CODE_SS3:
3f003981
KH
820 {
821 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
822
70c22245
KH
823 if (c != ISO_CODE_CSI)
824 {
d46c5b12
KH
825 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
826 & CODING_FLAG_ISO_SINGLE_SHIFT)
70c22245 827 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
828 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
829 & CODING_FLAG_ISO_SINGLE_SHIFT)
70c22245 830 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
f46869e4 831 single_shifting = 1;
70c22245 832 }
3f003981
KH
833 if (VECTORP (Vlatin_extra_code_table)
834 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
835 {
d46c5b12
KH
836 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
837 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981 838 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
839 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
840 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981
KH
841 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
842 }
843 mask &= newmask;
d46c5b12 844 mask_found |= newmask;
3f003981
KH
845 }
846 break;
4ed46869
KH
847
848 default:
849 if (c < 0x80)
f46869e4
KH
850 {
851 single_shifting = 0;
852 break;
853 }
4ed46869 854 else if (c < 0xA0)
c4825358 855 {
f46869e4 856 single_shifting = 0;
3f003981
KH
857 if (VECTORP (Vlatin_extra_code_table)
858 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
c4825358 859 {
3f003981
KH
860 int newmask = 0;
861
d46c5b12
KH
862 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
863 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981 864 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
865 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
866 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981
KH
867 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
868 mask &= newmask;
d46c5b12 869 mask_found |= newmask;
c4825358 870 }
3f003981
KH
871 else
872 return 0;
c4825358 873 }
4ed46869
KH
874 else
875 {
7717c392 876 unsigned char *src_begin = src;
4ed46869 877
d46c5b12 878 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
7717c392 879 | CODING_CATEGORY_MASK_ISO_7_ELSE);
d46c5b12 880 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
f46869e4
KH
881 /* Check the length of succeeding codes of the range
882 0xA0..0FF. If the byte length is odd, we exclude
883 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
884 when we are not single shifting. */
885 if (!single_shifting)
886 {
887 while (src < src_end && *src >= 0xA0)
888 src++;
889 if ((src - src_begin - 1) & 1 && src < src_end)
890 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
891 else
892 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
893 }
4ed46869
KH
894 }
895 break;
896 }
897 }
898
d46c5b12 899 return (mask & mask_found);
4ed46869
KH
900}
901
/* Decode one character of CHARSET whose 1st position code is C1 and
   store the result at DST.  If CHARSET is a 2-dimensional charset,
   the 2nd position code is fetched from SRC into C2.  A negative
   CHARSET means we are decoding ill formed text; C1 is then produced
   as is.

   The macro relies on `coding', `src', `dst', `c2' and
   `translation_table' being in scope where it is expanded.  */

#define DECODE_ISO_CHARACTER(charset, c1)                               \
  do {                                                                  \
    int xlated_char, cs_alt = (charset);                                \
                                                                        \
    if (COMPOSING_HEAD_P (coding->composing))                           \
      {                                                                 \
        /* Head of a composition: produce the leading code first.  */   \
        *dst++ = LEADING_CODE_COMPOSITION;                              \
        if (COMPOSING_WITH_RULE_P (coding->composing))                  \
          {                                                             \
            /* To tell composition rules are embedded.  */              \
            *dst++ = 0xFF;                                              \
          }                                                             \
        coding->composing += 2;                                         \
      }                                                                 \
    if (cs_alt >= 0)                                                    \
      {                                                                 \
        if (CHARSET_DIMENSION (cs_alt) == 2)                            \
          {                                                             \
            ONE_MORE_BYTE (c2);                                         \
            if (iso_code_class[(c2) & 0x7F] != ISO_0x20_or_0x7F         \
                && iso_code_class[(c2) & 0x7F] != ISO_graphic_plane_0)  \
              {                                                         \
                /* The 2nd byte is not a graphic code: put it back     \
                   and treat C1 as ASCII.  */                           \
                src--;                                                  \
                cs_alt = CHARSET_ASCII;                                 \
              }                                                         \
          }                                                             \
        if (!NILP (translation_table))                                  \
          {                                                             \
            xlated_char = translate_char (translation_table,            \
                                          -1, cs_alt, c1, c2);          \
            if (xlated_char >= 0)                                       \
              {                                                         \
                SPLIT_CHAR (xlated_char, cs_alt, c1, c2);               \
              }                                                         \
          }                                                             \
      }                                                                 \
    if (cs_alt >= 0 && cs_alt != CHARSET_ASCII)                         \
      {                                                                 \
        if (CHARSET_DIMENSION (cs_alt) == 1)                            \
          {                                                             \
            DECODE_CHARACTER_DIMENSION1 (cs_alt, c1);                   \
          }                                                             \
        else                                                            \
          {                                                             \
            DECODE_CHARACTER_DIMENSION2 (cs_alt, c1, c2);               \
          }                                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        DECODE_CHARACTER_ASCII (c1);                                    \
      }                                                                 \
    if (COMPOSING_WITH_RULE_P (coding->composing))                      \
      {                                                                 \
        /* To tell a composition rule follows.  */                      \
        coding->composing = COMPOSING_WITH_RULE_RULE;                   \
      }                                                                 \
  } while (0)
946
/* Set designation state into CODING: designate to graphic register
   REG the charset identified by DIMENSION, CHARS and FINAL_CHAR.

   The designation is accepted only if the charset is known and either
   requests register REG or is one of CODING's safe charsets.  In every
   other case (and for the special "ASCII to G0 right after an invalid
   designation to G0" case below) control jumps to label_invalid_code,
   which must exist in the expanding function.  `coding' must be in
   scope.  REG and FINAL_CHAR are evaluated more than once.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
  do {                                                                     \
    int charset;                                                           \
                                                                           \
    if ((final_char) < '0' || (final_char) >= 128)                         \
      goto label_invalid_code;                                             \
    charset = ISO_CHARSET_TABLE (make_number (dimension),                  \
                                 make_number (chars),                      \
                                 make_number (final_char));                \
    if (charset >= 0                                                       \
        && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == (reg) \
            || coding->safe_charsets[charset]))                            \
      {                                                                    \
        if (coding->spec.iso2022.last_invalid_designation_register == 0    \
            && (reg) == 0                                                  \
            && charset == CHARSET_ASCII)                                   \
          {                                                                \
            /* We should insert this designation sequence as is so        \
               that it is surely written back to a file.  */               \
            coding->spec.iso2022.last_invalid_designation_register = -1;   \
            goto label_invalid_code;                                       \
          }                                                                \
        coding->spec.iso2022.last_invalid_designation_register = -1;       \
        if ((coding->mode & CODING_MODE_DIRECTION)                         \
            && CHARSET_REVERSE_CHARSET (charset) >= 0)                     \
          charset = CHARSET_REVERSE_CHARSET (charset);                     \
        CODING_SPEC_ISO_DESIGNATION (coding, (reg)) = charset;             \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        /* Remember which register got the invalid designation.  */        \
        coding->spec.iso2022.last_invalid_designation_register = (reg);    \
        goto label_invalid_code;                                           \
      }                                                                    \
  } while (0)
982
88993dfd
KH
983/* Return 0 if there's a valid composing sequence starting at SRC and
984 ending before SRC_END, else return -1. */
d46c5b12 985
84fbb8a0
KH
986int
987check_composing_code (coding, src, src_end)
d46c5b12
KH
988 struct coding_system *coding;
989 unsigned char *src, *src_end;
990{
d46c5b12
KH
991 int charset, c, c1, dim;
992
993 while (src < src_end)
994 {
88993dfd
KH
995 c = *src++;
996 if (c >= 0x20)
997 continue;
998 if (c != ISO_CODE_ESC || src >= src_end)
999 return -1;
1000 c = *src++;
1001 if (c == '1') /* end of compsition */
1002 return 0;
1003 if (src + 2 >= src_end
1004 || !coding->flags & CODING_FLAG_ISO_DESIGNATION)
1005 return -1;
1006
1007 dim = (c == '$');
1008 if (dim == 1)
1009 c = (*src >= '@' && *src <= 'B') ? '(' : *src++;
1010 if (c >= '(' && c <= '/')
d46c5b12 1011 {
88993dfd
KH
1012 c1 = *src++;
1013 if ((c1 < ' ' || c1 >= 0x80)
1014 || (charset = iso_charset_table[dim][c >= ','][c1]) < 0
1015 || ! coding->safe_charsets[charset]
1016 || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
1017 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1018 return -1;
d46c5b12 1019 }
88993dfd
KH
1020 else
1021 return -1;
d46c5b12 1022 }
88993dfd
KH
1023
1024 /* We have not found the sequence "ESC 1". */
1025 return -1;
d46c5b12
KH
1026}
1027
4ed46869
KH
1028/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1029
1030int
d46c5b12 1031decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
1032 struct coding_system *coding;
1033 unsigned char *source, *destination;
1034 int src_bytes, dst_bytes;
4ed46869
KH
1035{
1036 unsigned char *src = source;
1037 unsigned char *src_end = source + src_bytes;
1038 unsigned char *dst = destination;
1039 unsigned char *dst_end = destination + dst_bytes;
1040 /* Since the maximum bytes produced by each loop is 7, we subtract 6
1041 from DST_END to assure that overflow checking is necessary only
1042 at the head of loop. */
1043 unsigned char *adjusted_dst_end = dst_end - 6;
1044 int charset;
1045 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1046 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1047 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
84fbb8a0 1048 Lisp_Object translation_table
f967223b 1049 = coding->translation_table_for_decode;
d46c5b12 1050 int result = CODING_FINISH_NORMAL;
bdd9fb48 1051
84fbb8a0 1052 if (!NILP (Venable_character_translation) && NILP (translation_table))
f967223b 1053 translation_table = Vstandard_translation_table_for_decode;
4ed46869 1054
d46c5b12 1055 coding->produced_char = 0;
fb88bf2d 1056 coding->fake_multibyte = 0;
d46c5b12
KH
1057 while (src < src_end && (dst_bytes
1058 ? (dst < adjusted_dst_end)
1059 : (dst < src - 6)))
4ed46869
KH
1060 {
1061 /* SRC_BASE remembers the start position in source in each loop.
1062 The loop will be exited when there's not enough source text
1063 to analyze long escape sequence or 2-byte code (within macros
1064 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
1065 to SRC_BASE before exiting. */
1066 unsigned char *src_base = src;
bdd9fb48 1067 int c1 = *src++, c2;
4ed46869
KH
1068
1069 switch (iso_code_class [c1])
1070 {
1071 case ISO_0x20_or_0x7F:
1072 if (!coding->composing
1073 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
1074 {
1075 /* This is SPACE or DEL. */
1076 *dst++ = c1;
d46c5b12 1077 coding->produced_char++;
4ed46869
KH
1078 break;
1079 }
1080 /* This is a graphic character, we fall down ... */
1081
1082 case ISO_graphic_plane_0:
1083 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1084 {
1085 /* This is a composition rule. */
1086 *dst++ = c1 | 0x80;
1087 coding->composing = COMPOSING_WITH_RULE_TAIL;
1088 }
1089 else
1090 DECODE_ISO_CHARACTER (charset0, c1);
1091 break;
1092
1093 case ISO_0xA0_or_0xFF:
d46c5b12
KH
1094 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1095 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
fb88bf2d 1096 goto label_invalid_code;
4ed46869
KH
1097 /* This is a graphic character, we fall down ... */
1098
1099 case ISO_graphic_plane_1:
d46c5b12 1100 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
fb88bf2d 1101 goto label_invalid_code;
d46c5b12
KH
1102 else
1103 DECODE_ISO_CHARACTER (charset1, c1);
4ed46869
KH
1104 break;
1105
1106 case ISO_control_code:
1107 /* All ISO2022 control characters in this class have the
1108 same representation in Emacs internal format. */
d46c5b12
KH
1109 if (c1 == '\n'
1110 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1111 && (coding->eol_type == CODING_EOL_CR
1112 || coding->eol_type == CODING_EOL_CRLF))
1113 {
1114 result = CODING_FINISH_INCONSISTENT_EOL;
1115 goto label_end_of_loop_2;
1116 }
4ed46869 1117 *dst++ = c1;
d46c5b12 1118 coding->produced_char++;
174a4cbe
KH
1119 if (c1 >= 0x80)
1120 coding->fake_multibyte = 1;
4ed46869
KH
1121 break;
1122
1123 case ISO_carriage_return:
1124 if (coding->eol_type == CODING_EOL_CR)
d46c5b12 1125 *dst++ = '\n';
4ed46869
KH
1126 else if (coding->eol_type == CODING_EOL_CRLF)
1127 {
1128 ONE_MORE_BYTE (c1);
1129 if (c1 == ISO_CODE_LF)
1130 *dst++ = '\n';
1131 else
1132 {
d46c5b12
KH
1133 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1134 {
1135 result = CODING_FINISH_INCONSISTENT_EOL;
1136 goto label_end_of_loop_2;
1137 }
4ed46869 1138 src--;
d46c5b12 1139 *dst++ = '\r';
4ed46869
KH
1140 }
1141 }
1142 else
d46c5b12
KH
1143 *dst++ = c1;
1144 coding->produced_char++;
4ed46869
KH
1145 break;
1146
1147 case ISO_shift_out:
d46c5b12
KH
1148 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1149 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1150 goto label_invalid_code;
4ed46869
KH
1151 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1152 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1153 break;
1154
1155 case ISO_shift_in:
d46c5b12
KH
1156 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1157 goto label_invalid_code;
4ed46869
KH
1158 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1159 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1160 break;
1161
1162 case ISO_single_shift_2_7:
1163 case ISO_single_shift_2:
d46c5b12
KH
1164 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1165 goto label_invalid_code;
4ed46869
KH
1166 /* SS2 is handled as an escape sequence of ESC 'N' */
1167 c1 = 'N';
1168 goto label_escape_sequence;
1169
1170 case ISO_single_shift_3:
d46c5b12
KH
1171 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1172 goto label_invalid_code;
4ed46869
KH
1173 /* SS2 is handled as an escape sequence of ESC 'O' */
1174 c1 = 'O';
1175 goto label_escape_sequence;
1176
1177 case ISO_control_sequence_introducer:
1178 /* CSI is handled as an escape sequence of ESC '[' ... */
1179 c1 = '[';
1180 goto label_escape_sequence;
1181
1182 case ISO_escape:
1183 ONE_MORE_BYTE (c1);
1184 label_escape_sequence:
1185 /* Escape sequences handled by Emacs are invocation,
1186 designation, direction specification, and character
1187 composition specification. */
1188 switch (c1)
1189 {
1190 case '&': /* revision of following character set */
1191 ONE_MORE_BYTE (c1);
1192 if (!(c1 >= '@' && c1 <= '~'))
d46c5b12 1193 goto label_invalid_code;
4ed46869
KH
1194 ONE_MORE_BYTE (c1);
1195 if (c1 != ISO_CODE_ESC)
d46c5b12 1196 goto label_invalid_code;
4ed46869
KH
1197 ONE_MORE_BYTE (c1);
1198 goto label_escape_sequence;
1199
1200 case '$': /* designation of 2-byte character set */
d46c5b12
KH
1201 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1202 goto label_invalid_code;
4ed46869
KH
1203 ONE_MORE_BYTE (c1);
1204 if (c1 >= '@' && c1 <= 'B')
1205 { /* designation of JISX0208.1978, GB2312.1980,
88993dfd 1206 or JISX0208.1980 */
4ed46869
KH
1207 DECODE_DESIGNATION (0, 2, 94, c1);
1208 }
1209 else if (c1 >= 0x28 && c1 <= 0x2B)
1210 { /* designation of DIMENSION2_CHARS94 character set */
1211 ONE_MORE_BYTE (c2);
1212 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1213 }
1214 else if (c1 >= 0x2C && c1 <= 0x2F)
1215 { /* designation of DIMENSION2_CHARS96 character set */
1216 ONE_MORE_BYTE (c2);
1217 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1218 }
1219 else
d46c5b12 1220 goto label_invalid_code;
4ed46869
KH
1221 break;
1222
1223 case 'n': /* invocation of locking-shift-2 */
d46c5b12
KH
1224 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1225 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1226 goto label_invalid_code;
4ed46869 1227 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
e0e989f6 1228 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
1229 break;
1230
1231 case 'o': /* invocation of locking-shift-3 */
d46c5b12
KH
1232 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1233 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1234 goto label_invalid_code;
4ed46869 1235 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
e0e989f6 1236 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
1237 break;
1238
1239 case 'N': /* invocation of single-shift-2 */
d46c5b12
KH
1240 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1241 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1242 goto label_invalid_code;
4ed46869
KH
1243 ONE_MORE_BYTE (c1);
1244 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1245 DECODE_ISO_CHARACTER (charset, c1);
1246 break;
1247
1248 case 'O': /* invocation of single-shift-3 */
d46c5b12
KH
1249 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1250 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1251 goto label_invalid_code;
4ed46869
KH
1252 ONE_MORE_BYTE (c1);
1253 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1254 DECODE_ISO_CHARACTER (charset, c1);
1255 break;
1256
d46c5b12
KH
1257 case '0': case '2': /* start composing */
1258 /* Before processing composing, we must be sure that all
1259 characters being composed are supported by CODING.
88993dfd
KH
1260 If not, we must give up composing. */
1261 if (check_composing_code (coding, src, src_end) == 0)
1262 {
1263 /* We are looking at a valid composition sequence. */
1264 coding->composing = (c1 == '0'
1265 ? COMPOSING_NO_RULE_HEAD
1266 : COMPOSING_WITH_RULE_HEAD);
1267 coding->composed_chars = 0;
1268 }
1269 else
1270 {
1271 *dst++ = ISO_CODE_ESC;
1272 *dst++ = c1;
1273 coding->produced_char += 2;
1274 }
4ed46869
KH
1275 break;
1276
1277 case '1': /* end composing */
88993dfd
KH
1278 if (!coding->composing)
1279 {
1280 *dst++ = ISO_CODE_ESC;
1281 *dst++ = c1;
1282 coding->produced_char += 2;
1283 break;
1284 }
1285
de79a6a5
KH
1286 if (coding->composed_chars > 0)
1287 {
1288 if (coding->composed_chars == 1)
1289 {
1290 unsigned char *this_char_start = dst;
1291 int this_bytes;
1292
1293 /* Only one character is in the composing
1294 sequence. Make it a normal character. */
1295 while (*--this_char_start != LEADING_CODE_COMPOSITION);
1296 dst = (this_char_start
1297 + (coding->composing == COMPOSING_NO_RULE_TAIL
1298 ? 1 : 2));
1299 *dst -= 0x20;
1300 if (*dst == 0x80)
1301 *++dst &= 0x7F;
1302 this_bytes = BYTES_BY_CHAR_HEAD (*dst);
1303 while (this_bytes--) *this_char_start++ = *dst++;
1304 dst = this_char_start;
1305 }
1306 coding->produced_char++;
1307 }
4ed46869 1308 coding->composing = COMPOSING_NO;
4ed46869
KH
1309 break;
1310
1311 case '[': /* specification of direction */
d46c5b12
KH
1312 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1313 goto label_invalid_code;
4ed46869 1314 /* For the moment, nested direction is not supported.
d46c5b12
KH
1315 So, `coding->mode & CODING_MODE_DIRECTION' zero means
1316 left-to-right, and nozero means right-to-left. */
4ed46869
KH
1317 ONE_MORE_BYTE (c1);
1318 switch (c1)
1319 {
1320 case ']': /* end of the current direction */
d46c5b12 1321 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869
KH
1322
1323 case '0': /* end of the current direction */
1324 case '1': /* start of left-to-right direction */
1325 ONE_MORE_BYTE (c1);
1326 if (c1 == ']')
d46c5b12 1327 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869 1328 else
d46c5b12 1329 goto label_invalid_code;
4ed46869
KH
1330 break;
1331
1332 case '2': /* start of right-to-left direction */
1333 ONE_MORE_BYTE (c1);
1334 if (c1 == ']')
d46c5b12 1335 coding->mode |= CODING_MODE_DIRECTION;
4ed46869 1336 else
d46c5b12 1337 goto label_invalid_code;
4ed46869
KH
1338 break;
1339
1340 default:
d46c5b12 1341 goto label_invalid_code;
4ed46869
KH
1342 }
1343 break;
1344
1345 default:
d46c5b12
KH
1346 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1347 goto label_invalid_code;
4ed46869
KH
1348 if (c1 >= 0x28 && c1 <= 0x2B)
1349 { /* designation of DIMENSION1_CHARS94 character set */
1350 ONE_MORE_BYTE (c2);
1351 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1352 }
1353 else if (c1 >= 0x2C && c1 <= 0x2F)
1354 { /* designation of DIMENSION1_CHARS96 character set */
1355 ONE_MORE_BYTE (c2);
1356 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1357 }
1358 else
1359 {
d46c5b12 1360 goto label_invalid_code;
4ed46869
KH
1361 }
1362 }
1363 /* We must update these variables now. */
1364 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1365 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1366 break;
1367
d46c5b12 1368 label_invalid_code:
d46c5b12
KH
1369 while (src_base < src)
1370 *dst++ = *src_base++;
fb88bf2d 1371 coding->fake_multibyte = 1;
4ed46869
KH
1372 }
1373 continue;
1374
1375 label_end_of_loop:
d46c5b12
KH
1376 result = CODING_FINISH_INSUFFICIENT_SRC;
1377 label_end_of_loop_2:
4ed46869
KH
1378 src = src_base;
1379 break;
1380 }
1381
fb88bf2d 1382 if (src < src_end)
4ed46869 1383 {
fb88bf2d
KH
1384 if (result == CODING_FINISH_NORMAL)
1385 result = CODING_FINISH_INSUFFICIENT_DST;
1386 else if (result != CODING_FINISH_INCONSISTENT_EOL
1387 && coding->mode & CODING_MODE_LAST_BLOCK)
1388 {
1389 /* This is the last block of the text to be decoded. We had
1390 better just flush out all remaining codes in the text
1391 although they are not valid characters. */
1392 src_bytes = src_end - src;
1393 if (dst_bytes && (dst_end - dst < src_bytes))
1394 src_bytes = dst_end - dst;
1395 bcopy (src, dst, src_bytes);
1396 dst += src_bytes;
1397 src += src_bytes;
1398 coding->fake_multibyte = 1;
1399 }
4ed46869 1400 }
fb88bf2d 1401
d46c5b12
KH
1402 coding->consumed = coding->consumed_char = src - source;
1403 coding->produced = dst - destination;
1404 return result;
4ed46869
KH
1405}
1406
f4dee582 1407/* ISO2022 encoding stuff. */
4ed46869
KH
1408
1409/*
f4dee582 1410 It is not enough to say just "ISO2022" on encoding, we have to
d46c5b12 1411 specify more details. In Emacs, each coding system of ISO2022
4ed46869
KH
1412 variant has the following specifications:
1413 1. Initial designation to G0 thru G3.
1414 2. Allows short-form designation?
1415 3. ASCII should be designated to G0 before control characters?
1416 4. ASCII should be designated to G0 at end of line?
1417 5. 7-bit environment or 8-bit environment?
1418 6. Use locking-shift?
1419 7. Use Single-shift?
1420 And the following two are only for Japanese:
1421 8. Use ASCII in place of JIS0201-1976-Roman?
1422 9. Use JISX0208-1983 in place of JISX0208-1978?
1423 These specifications are encoded in `coding->flags' as flag bits
1424 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
f4dee582 1425 details.
4ed46869
KH
1426*/
1427
/* Produce at DST the escape sequence that designates CHARSET to graphic
   register REG, and record the designation in CODING.  A revision
   prefix "ESC & <rev>" is produced first when the charset's revision
   number is below 255.  For a 2-byte 94-charset designated to register
   0 whose <final-char> is '@', 'A', or 'B', the intermediate character
   is omitted (short form) if CODING has CODING_FLAG_ISO_SHORT_FORM.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                           \
  do {                                                                     \
    unsigned char desig_final = CHARSET_ISO_FINAL_CHAR (charset);          \
    int desig_revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset); \
    int desig_is_94 = (CHARSET_CHARS (charset) == 94);                     \
    /* Intermediate character selecting REG for 94- or 96-charsets.  */    \
    unsigned char desig_inter                                              \
      = (unsigned char) (desig_is_94 ? "()*+"[reg] : ",-./"[reg]);         \
                                                                           \
    if (desig_revision < 255)                                              \
      {                                                                    \
        *dst++ = ISO_CODE_ESC;                                             \
        *dst++ = '&';                                                      \
        *dst++ = '@' + desig_revision;                                     \
      }                                                                    \
    *dst++ = ISO_CODE_ESC;                                                 \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      {                                                                    \
        *dst++ = '$';                                                      \
        /* Skip the intermediate character only for the short form.  */    \
        if (! (desig_is_94                                                 \
               && (coding->flags & CODING_FLAG_ISO_SHORT_FORM)             \
               && reg == 0                                                 \
               && desig_final >= '@' && desig_final <= 'B'))               \
          *dst++ = desig_inter;                                            \
      }                                                                    \
    else                                                                   \
      *dst++ = desig_inter;                                                \
    *dst++ = desig_final;                                                  \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                   \
  } while (0)
1469
/* The two macros below produce the code for an ISO2022 single-shift
   function (single-shift-2 and single-shift-3 respectively): the
   escape sequence form in a 7-bit environment, the 8-bit control
   character otherwise.  Both mark CODING as being in single-shifting
   state.  */

#define ENCODE_SINGLE_SHIFT_2                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      {                                                         \
        *dst++ = ISO_CODE_SS2;                                  \
        coding->fake_multibyte = 1;                             \
      }                                                         \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'N';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      {                                                         \
        *dst++ = ISO_CODE_SS3;                                  \
        coding->fake_multibyte = 1;                             \
      }                                                         \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'O';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)
1497
/* The four macros below produce the code (control character or escape
   sequence) for an ISO2022 locking-shift function (shift-in,
   shift-out, locking-shift-2, and locking-shift-3), and record in
   CODING which graphic register is now invoked to graphic plane 0.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
    *dst++ = ISO_CODE_SI;                               \
  } while (0)

#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
    *dst++ = ISO_CODE_SO;                               \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'n';                                       \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'o';                                       \
  } while (0)
1525
f4dee582
RS
/* Produce the code for a DIMENSION1 character of CHARSET whose
   position-code is C1.  If CHARSET is not yet invoked to a graphic
   plane, the necessary designation and invocation sequences are
   produced first and the loop is repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* A single-shift was just produced; it covers this char.  */   \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)          \
                  ? (c1 & 0x7F) : (c1 | 0x80));                         \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        *dst++ = c1 & 0x7F;                                             \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        *dst++ = c1 | 0x80;                                             \
        break;                                                          \
      }                                                                 \
    if (coding->flags & CODING_FLAG_ISO_SAFE                            \
        && !coding->safe_charsets[charset])                             \
      {                                                                 \
        /* This character must not be encoded; produce one or two      \
           substitution characters (by CHARSET's width) instead.  */    \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
        if (CHARSET_WIDTH (charset) == 2)                               \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to any graphic plane yet.  Invoke it     \
       (designating it to some graphic register first if needed),      \
       then go around again to actually produce the character.  */     \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
1569
f4dee582
RS
/* Produce the codes for a DIMENSION2 character of CHARSET whose
   position-codes are C1 and C2.  If CHARSET is not yet invoked to a
   graphic plane, the necessary designation and invocation sequences
   are produced first and the loop is repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* A single-shift was just produced; it covers this char.  */   \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            *dst++ = c1 & 0x7F;                                         \
            *dst++ = c2 & 0x7F;                                         \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = c1 | 0x80;                                         \
            *dst++ = c2 | 0x80;                                         \
          }                                                             \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        *dst++ = c1 & 0x7F;                                             \
        *dst++ = c2 & 0x7F;                                             \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        *dst++ = c1 | 0x80;                                             \
        *dst++ = c2 | 0x80;                                             \
        break;                                                          \
      }                                                                 \
    if (coding->flags & CODING_FLAG_ISO_SAFE                            \
        && !coding->safe_charsets[charset])                             \
      {                                                                 \
        /* This character must not be encoded; produce one or two      \
           substitution characters (by CHARSET's width) instead.  */    \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
        if (CHARSET_WIDTH (charset) == 2)                               \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to any graphic plane yet.  Invoke it     \
       (designating it to some graphic register first if needed),      \
       then go around again to actually produce the character.  */     \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
1612
6f551029
KH
/* Produce the ISO2022 code for the character of CHARSET whose
   position-codes are C1 and C2, after applying `translation_table'
   (if non-nil).  A character of an undefined charset is produced as
   its raw bytes (masked to 7 bits in a 7-bit environment).  Unless a
   composition is in progress, coding->consumed_char is incremented.
   Expects `coding', `dst' and `translation_table' in scope.  */
#define ENCODE_ISO_CHARACTER(charset, c1, c2)                             \
  do {                                                                    \
    int enc_char, enc_charset = charset;                                  \
                                                                          \
    if (!NILP (translation_table))                                        \
      {                                                                   \
        enc_char = translate_char (translation_table, -1,                 \
                                   charset, c1, c2);                      \
        if (enc_char >= 0)                                                \
          {                                                               \
            SPLIT_CHAR (enc_char, enc_charset, c1, c2);                   \
          }                                                               \
      }                                                                   \
    if (!CHARSET_DEFINED_P (enc_charset))                                 \
      {                                                                   \
        if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))               \
          {                                                               \
            *dst++ = charset;                                             \
            *dst++ = c1;                                                  \
            if (c2)                                                       \
              *dst++ = c2;                                                \
          }                                                               \
        else                                                              \
          {                                                               \
            *dst++ = charset & 0x7f;                                      \
            *dst++ = c1 & 0x7f;                                           \
            if (c2)                                                       \
              *dst++ = c2 & 0x7f;                                         \
          }                                                               \
      }                                                                   \
    else if (CHARSET_DIMENSION (enc_charset) == 1)                        \
      {                                                                   \
        /* Optionally use JISX0201-Roman in place of ASCII.  */           \
        if (charset == CHARSET_ASCII                                      \
            && coding->flags & CODING_FLAG_ISO_USE_ROMAN)                 \
          enc_charset = charset_latin_jisx0201;                           \
        ENCODE_ISO_CHARACTER_DIMENSION1 (enc_charset, c1);                \
      }                                                                   \
    else                                                                  \
      {                                                                   \
        /* Optionally use JISX0208-1978 in place of JISX0208.  */         \
        if (charset == charset_jisx0208                                   \
            && coding->flags & CODING_FLAG_ISO_USE_OLDJIS)                \
          enc_charset = charset_jisx0208_1978;                            \
        ENCODE_ISO_CHARACTER_DIMENSION2 (enc_charset, c1, c2);            \
      }                                                                   \
    if (! COMPOSING_P (coding->composing))                                \
      coding->consumed_char++;                                            \
  } while (0)
bdd9fb48 1660
4ed46869
KH
1661/* Produce designation and invocation codes at a place pointed by DST
1662 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1663 Return new DST. */
1664
1665unsigned char *
1666encode_invocation_designation (charset, coding, dst)
1667 int charset;
1668 struct coding_system *coding;
1669 unsigned char *dst;
1670{
1671 int reg; /* graphic register number */
1672
1673 /* At first, check designations. */
1674 for (reg = 0; reg < 4; reg++)
1675 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1676 break;
1677
1678 if (reg >= 4)
1679 {
1680 /* CHARSET is not yet designated to any graphic registers. */
1681 /* At first check the requested designation. */
1682 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1ba9e4ab
KH
1683 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1684 /* Since CHARSET requests no special designation, designate it
1685 to graphic register 0. */
4ed46869
KH
1686 reg = 0;
1687
1688 ENCODE_DESIGNATION (charset, reg, coding);
1689 }
1690
1691 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1692 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1693 {
1694 /* Since the graphic register REG is not invoked to any graphic
1695 planes, invoke it to graphic plane 0. */
1696 switch (reg)
1697 {
1698 case 0: /* graphic register 0 */
1699 ENCODE_SHIFT_IN;
1700 break;
1701
1702 case 1: /* graphic register 1 */
1703 ENCODE_SHIFT_OUT;
1704 break;
1705
1706 case 2: /* graphic register 2 */
1707 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1708 ENCODE_SINGLE_SHIFT_2;
1709 else
1710 ENCODE_LOCKING_SHIFT_2;
1711 break;
1712
1713 case 3: /* graphic register 3 */
1714 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1715 ENCODE_SINGLE_SHIFT_3;
1716 else
1717 ENCODE_LOCKING_SHIFT_3;
1718 break;
1719 }
1720 }
1721 return dst;
1722}
1723
/* The following three macros produce codes for indicating composition:
   start of a composition without rules (ESC 0), start of a composition
   with rules (ESC 2), and end of a composition (ESC 1).  Each
   expansion is parenthesized so that it behaves as one expression
   wherever it is used.  */
#define ENCODE_COMPOSITION_NO_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
#define ENCODE_COMPOSITION_WITH_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '2')
#define ENCODE_COMPOSITION_END \
  (*dst++ = ISO_CODE_ESC, *dst++ = '1')
1728
/* The following three macros produce codes for indicating direction
   of text.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER produces CSI: as "ESC [" when
   CODING has CODING_FLAG_ISO_SEVEN_BITS set (tested as a bit, since
   other flag bits may be set too), else as the 8-bit control code.
   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R append "2]" and "0]"
   respectively.  All three are statement macros.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

#define ENCODE_DIRECTION_R2L                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '2', *dst++ = ']';                         \
  } while (0)

#define ENCODE_DIRECTION_L2R                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '0', *dst++ = ']';                         \
  } while (0)
1744
/* Produce the invocation and designation codes that bring the graphic
   planes and registers back to the initial state: shift in if graphic
   plane 0 does not invoke register 0, then re-designate every register
   whose current designation differs from its (non-negative) initial
   one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                     \
  do {                                                                      \
    int reset_reg = 0;                                                      \
                                                                            \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
      {                                                                     \
        ENCODE_SHIFT_IN;                                                    \
      }                                                                     \
    while (reset_reg < 4)                                                   \
      {                                                                     \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reset_reg) >= 0    \
            && (CODING_SPEC_ISO_DESIGNATION (coding, reset_reg)             \
                != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reset_reg))) \
          {                                                                 \
            ENCODE_DESIGNATION                                              \
              (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reset_reg),     \
               reset_reg, coding);                                          \
          }                                                                 \
        reset_reg++;                                                        \
      }                                                                     \
  } while (0)
1759
bdd9fb48 1760/* Produce designation sequences of charsets in the line started from
d46c5b12 1761 SRC to a place pointed by *DSTP, and update DSTP.
bdd9fb48
KH
1762
1763 If the current block ends before any end-of-line, we may fail to
d46c5b12
KH
1764 find all the necessary designations. */
1765
dfcf069d 1766void
bdd9fb48 1767encode_designation_at_bol (coding, table, src, src_end, dstp)
e0e989f6 1768 struct coding_system *coding;
bdd9fb48 1769 Lisp_Object table;
e0e989f6
KH
1770 unsigned char *src, *src_end, **dstp;
1771{
bdd9fb48
KH
1772 int charset, c, found = 0, reg;
1773 /* Table of charsets to be designated to each graphic register. */
1774 int r[4];
1775 unsigned char *dst = *dstp;
1776
1777 for (reg = 0; reg < 4; reg++)
1778 r[reg] = -1;
1779
1780 while (src < src_end && *src != '\n' && found < 4)
e0e989f6 1781 {
bdd9fb48
KH
1782 int bytes = BYTES_BY_CHAR_HEAD (*src);
1783
1784 if (NILP (table))
1785 charset = CHARSET_AT (src);
1786 else
e0e989f6 1787 {
35cb8686
RS
1788 int c_alt;
1789 unsigned char c1, c2;
bdd9fb48
KH
1790
1791 SPLIT_STRING(src, bytes, charset, c1, c2);
84fbb8a0 1792 if ((c_alt = translate_char (table, -1, charset, c1, c2)) >= 0)
bdd9fb48 1793 charset = CHAR_CHARSET (c_alt);
e0e989f6 1794 }
bdd9fb48 1795
e0e989f6 1796 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
d46c5b12 1797 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
bdd9fb48
KH
1798 {
1799 found++;
1800 r[reg] = charset;
1801 }
1802
1803 src += bytes;
1804 }
1805
1806 if (found)
1807 {
1808 for (reg = 0; reg < 4; reg++)
1809 if (r[reg] >= 0
1810 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1811 ENCODE_DESIGNATION (r[reg], reg, coding);
1812 *dstp = dst;
e0e989f6 1813 }
e0e989f6
KH
1814}
1815
4ed46869
KH
1816/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1817
1818int
d46c5b12 1819encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
1820 struct coding_system *coding;
1821 unsigned char *source, *destination;
1822 int src_bytes, dst_bytes;
4ed46869
KH
1823{
1824 unsigned char *src = source;
1825 unsigned char *src_end = source + src_bytes;
1826 unsigned char *dst = destination;
1827 unsigned char *dst_end = destination + dst_bytes;
e0e989f6 1828 /* Since the maximum bytes produced by each loop is 20, we subtract 19
4ed46869
KH
1829 from DST_END to assure overflow checking is necessary only at the
1830 head of loop. */
e0e989f6 1831 unsigned char *adjusted_dst_end = dst_end - 19;
84fbb8a0 1832 Lisp_Object translation_table
f967223b 1833 = coding->translation_table_for_encode;
d46c5b12 1834 int result = CODING_FINISH_NORMAL;
bdd9fb48 1835
84fbb8a0 1836 if (!NILP (Venable_character_translation) && NILP (translation_table))
f967223b 1837 translation_table = Vstandard_translation_table_for_encode;
4ed46869 1838
d46c5b12 1839 coding->consumed_char = 0;
fb88bf2d 1840 coding->fake_multibyte = 0;
d46c5b12
KH
1841 while (src < src_end && (dst_bytes
1842 ? (dst < adjusted_dst_end)
1843 : (dst < src - 19)))
4ed46869
KH
1844 {
1845 /* SRC_BASE remembers the start position in source in each loop.
1846 The loop will be exited when there's not enough source text
1847 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1848 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1849 reset to SRC_BASE before exiting. */
1850 unsigned char *src_base = src;
bdd9fb48 1851 int charset, c1, c2, c3, c4;
4ed46869 1852
e0e989f6
KH
1853 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1854 && CODING_SPEC_ISO_BOL (coding))
1855 {
bdd9fb48 1856 /* We have to produce designation sequences if any now. */
84fbb8a0 1857 encode_designation_at_bol (coding, translation_table,
bdd9fb48 1858 src, src_end, &dst);
e0e989f6
KH
1859 CODING_SPEC_ISO_BOL (coding) = 0;
1860 }
1861
1862 c1 = *src++;
4ed46869 1863 /* If we are seeing a component of a composite character, we are
d46c5b12
KH
1864 seeing a leading-code encoded irregularly for composition, or
1865 a composition rule if composing with rule. We must set C1 to
1866 a normal leading-code or an ASCII code. If we are not seeing
1867 a composite character, we must reset composition,
1868 designation, and invocation states. */
4ed46869
KH
1869 if (COMPOSING_P (coding->composing))
1870 {
1871 if (c1 < 0xA0)
1872 {
1873 /* We are not in a composite character any longer. */
1874 coding->composing = COMPOSING_NO;
d46c5b12 1875 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1876 ENCODE_COMPOSITION_END;
1877 }
1878 else
1879 {
1880 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1881 {
1882 *dst++ = c1 & 0x7F;
1883 coding->composing = COMPOSING_WITH_RULE_HEAD;
1884 continue;
1885 }
1886 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1887 coding->composing = COMPOSING_WITH_RULE_RULE;
1888 if (c1 == 0xA0)
1889 {
1890 /* This is an ASCII component. */
1891 ONE_MORE_BYTE (c1);
1892 c1 &= 0x7F;
1893 }
1894 else
1895 /* This is a leading-code of non ASCII component. */
1896 c1 -= 0x20;
1897 }
1898 }
1899
1900 /* Now encode one character. C1 is a control character, an
1901 ASCII character, or a leading-code of multi-byte character. */
1902 switch (emacs_code_class[c1])
1903 {
1904 case EMACS_ascii_code:
bdd9fb48 1905 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
4ed46869
KH
1906 break;
1907
1908 case EMACS_control_code:
1909 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1910 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869 1911 *dst++ = c1;
d46c5b12 1912 coding->consumed_char++;
4ed46869
KH
1913 break;
1914
1915 case EMACS_carriage_return_code:
d46c5b12 1916 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
4ed46869
KH
1917 {
1918 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1919 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869 1920 *dst++ = c1;
d46c5b12 1921 coding->consumed_char++;
4ed46869
KH
1922 break;
1923 }
1924 /* fall down to treat '\r' as '\n' ... */
1925
1926 case EMACS_linefeed_code:
1927 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
e0e989f6
KH
1928 ENCODE_RESET_PLANE_AND_REGISTER;
1929 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1930 bcopy (coding->spec.iso2022.initial_designation,
1931 coding->spec.iso2022.current_designation,
1932 sizeof coding->spec.iso2022.initial_designation);
4ed46869 1933 if (coding->eol_type == CODING_EOL_LF
0ef69138 1934 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
1935 *dst++ = ISO_CODE_LF;
1936 else if (coding->eol_type == CODING_EOL_CRLF)
1937 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1938 else
1939 *dst++ = ISO_CODE_CR;
e0e989f6 1940 CODING_SPEC_ISO_BOL (coding) = 1;
d46c5b12 1941 coding->consumed_char++;
4ed46869
KH
1942 break;
1943
1944 case EMACS_leading_code_2:
1945 ONE_MORE_BYTE (c2);
19a8d9e0
KH
1946 if (c2 < 0xA0)
1947 {
1948 /* invalid sequence */
1949 *dst++ = c1;
38cf95df
RS
1950 src--;
1951 coding->consumed_char++;
19a8d9e0
KH
1952 }
1953 else
1954 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
1955 break;
1956
1957 case EMACS_leading_code_3:
1958 TWO_MORE_BYTES (c2, c3);
19a8d9e0
KH
1959 if (c2 < 0xA0 || c3 < 0xA0)
1960 {
1961 /* invalid sequence */
1962 *dst++ = c1;
38cf95df
RS
1963 src -= 2;
1964 coding->consumed_char++;
19a8d9e0
KH
1965 }
1966 else if (c1 < LEADING_CODE_PRIVATE_11)
bdd9fb48 1967 ENCODE_ISO_CHARACTER (c1, c2, c3);
4ed46869 1968 else
bdd9fb48 1969 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
4ed46869
KH
1970 break;
1971
1972 case EMACS_leading_code_4:
1973 THREE_MORE_BYTES (c2, c3, c4);
19a8d9e0
KH
1974 if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1975 {
1976 /* invalid sequence */
1977 *dst++ = c1;
38cf95df
RS
1978 src -= 3;
1979 coding->consumed_char++;
19a8d9e0
KH
1980 }
1981 else
1982 ENCODE_ISO_CHARACTER (c2, c3, c4);
4ed46869
KH
1983 break;
1984
1985 case EMACS_leading_code_composition:
19a8d9e0
KH
1986 ONE_MORE_BYTE (c2);
1987 if (c2 < 0xA0)
1988 {
1989 /* invalid sequence */
1990 *dst++ = c1;
38cf95df
RS
1991 src--;
1992 coding->consumed_char++;
19a8d9e0
KH
1993 }
1994 else if (c2 == 0xFF)
4ed46869 1995 {
d46c5b12 1996 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1997 coding->composing = COMPOSING_WITH_RULE_HEAD;
1998 ENCODE_COMPOSITION_WITH_RULE_START;
d46c5b12 1999 coding->consumed_char++;
4ed46869
KH
2000 }
2001 else
2002 {
d46c5b12 2003 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
2004 /* Rewind one byte because it is a character code of
2005 composition elements. */
2006 src--;
2007 coding->composing = COMPOSING_NO_RULE_HEAD;
2008 ENCODE_COMPOSITION_NO_RULE_START;
d46c5b12 2009 coding->consumed_char++;
4ed46869
KH
2010 }
2011 break;
2012
2013 case EMACS_invalid_code:
3efbce95
KH
2014 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2015 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869 2016 *dst++ = c1;
d46c5b12 2017 coding->consumed_char++;
4ed46869
KH
2018 break;
2019 }
2020 continue;
2021 label_end_of_loop:
d46c5b12
KH
2022 result = CODING_FINISH_INSUFFICIENT_SRC;
2023 src = src_base;
4ed46869
KH
2024 break;
2025 }
2026
49cb52b4
KH
2027 if (src < src_end && result == CODING_FINISH_NORMAL)
2028 result = CODING_FINISH_INSUFFICIENT_DST;
2029
2030 /* If this is the last block of the text to be encoded, we must
2031 reset graphic planes and registers to the initial state, and
2032 flush out the carryover if any. */
2033 if (coding->mode & CODING_MODE_LAST_BLOCK)
84fbb8a0
KH
2034 {
2035 ENCODE_RESET_PLANE_AND_REGISTER;
2036 if (COMPOSING_P (coding->composing))
2037 ENCODE_COMPOSITION_END;
88993dfd
KH
2038 if (result == CODING_FINISH_INSUFFICIENT_SRC)
2039 {
2040 while (src < src_end && dst < dst_end)
2041 *dst++ = *src++;
2042 }
84fbb8a0 2043 }
d46c5b12
KH
2044 coding->consumed = src - source;
2045 coding->produced = coding->produced_char = dst - destination;
2046 return result;
4ed46869
KH
2047}
2048
2049\f
2050/*** 4. SJIS and BIG5 handlers ***/
2051
f4dee582 2052/* Although SJIS and BIG5 are not ISO's coding system, they are used
4ed46869
KH
2053 quite widely. So, for the moment, Emacs supports them in the bare
2054 C code. But, in the future, they may be supported only by CCL. */
2055
2056/* SJIS is a coding system encoding three character sets: ASCII, right
2057 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2058 as is. A character of charset katakana-jisx0201 is encoded by
2059 "position-code + 0x80". A character of charset japanese-jisx0208
2060 is encoded in 2 bytes, but the two position-codes are divided and
2061 shifted so that they fit in the ranges below.
2062
2063 --- CODE RANGE of SJIS ---
2064 (character set) (range)
2065 ASCII 0x00 .. 0x7F
2066 KATAKANA-JISX0201 0xA0 .. 0xDF
c28a9453 2067 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
d14d03ac 2068 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4ed46869
KH
2069 -------------------------------
2070
2071*/
2072
2073/* BIG5 is a coding system encoding two character sets: ASCII and
2074 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2075 character set and is encoded in two-byte.
2076
2077 --- CODE RANGE of BIG5 ---
2078 (character set) (range)
2079 ASCII 0x00 .. 0x7F
2080 Big5 (1st byte) 0xA1 .. 0xFE
2081 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2082 --------------------------
2083
2084 Since the number of characters in Big5 is larger than maximum
2085 characters in Emacs' charset (96x96), it can't be handled as one
2086 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2087 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2088 contains frequently used characters and the latter contains less
2089 frequently used characters. */
2090
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Every input argument is evaluated exactly once and is parenthesized,
   so arbitrary expressions may be passed.  The output arguments
   (CHARSET, C1, C2 for decoding; B1, B2 for encoding) must be
   lvalues.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Convert the BIG5 byte pair (B1, B2) to CHARSET and the position
   codes C1, C2.  The pair is first mapped to a linear index: the
   second byte contributes B2 - 0x40 when it is below 0x7F and
   B2 - 0x62 otherwise.  Rows starting at first byte 0xC9 belong to
   `charset_big5_2' and are re-based to index 0.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)				\
  do {									\
    unsigned int big5_hi = (b1);					\
    unsigned int big5_lo = (b2);					\
    unsigned int big5_idx						\
      = ((big5_hi - 0xA1) * BIG5_SAME_ROW				\
	 + big5_lo - (big5_lo < 0x7F ? 0x40 : 0x62));			\
    if (big5_hi < 0xC9)							\
      (charset) = charset_big5_1;					\
    else								\
      {									\
	(charset) = charset_big5_2;					\
	big5_idx -= (0xC9 - 0xA1) * BIG5_SAME_ROW;			\
      }									\
    (c1) = big5_idx / (0xFF - 0xA1) + 0x21;				\
    (c2) = big5_idx % (0xFF - 0xA1) + 0x21;				\
  } while (0)

/* Inverse of DECODE_BIG5: convert CHARSET and the position codes C1,
   C2 to the BIG5 byte pair (B1, B2).  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)				\
  do {									\
    unsigned int big5_idx						\
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);			\
    unsigned int big5_col;						\
    if ((charset) == charset_big5_2)					\
      big5_idx += BIG5_SAME_ROW * (0xC9 - 0xA1);			\
    big5_col = big5_idx % BIG5_SAME_ROW;				\
    (b1) = big5_idx / BIG5_SAME_ROW + 0xA1;				\
    (b2) = big5_col + (big5_col < 0x3F ? 0x40 : 0x62);			\
  } while (0)
a5d301df
KH
/* Produce the internal representation of one decoded character whose
   charset is CHARSET and whose position codes are C1 and C2.  When
   `translation_table' (taken from the expansion site) is non-nil and
   maps the character, the translated charset and position codes are
   used instead; in that case C1 and C2 are overwritten.  The character
   is then emitted as ASCII, as a one-dimensional character, or as a
   two-dimensional character, depending on the resulting charset.  */
#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)			\
  do {									\
    int charset_alt = (charset);					\
    int c_alt;								\
    if (!NILP (translation_table))					\
      {									\
	c_alt = translate_char (translation_table,			\
				-1, (charset), c1, c2);			\
	if (c_alt >= 0)							\
	  SPLIT_CHAR (c_alt, charset_alt, c1, c2);			\
      }									\
    if (charset_alt < 0 || charset_alt == CHARSET_ASCII)		\
      {									\
	DECODE_CHARACTER_ASCII (c1);					\
      }									\
    else if (CHARSET_DIMENSION (charset_alt) == 1)			\
      {									\
	DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);			\
      }									\
    else								\
      {									\
	DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);		\
      }									\
  } while (0)
84fbb8a0
KH
/* Encode one character of charset CHARSET with position codes C1 and
   C2, appending the result through `dst'.  `translation_table',
   `sjis_p', `coding' and `dst' are taken from the expansion site; C1
   and C2 must be lvalues because they may be overwritten.

   After optional translation through `translation_table':
     - ASCII is written as the single byte C1;
     - a one-dimensional charset is written as C1 alone when encoding
       SJIS and the charset is katakana-jisx0201, and otherwise as the
       two bytes <charset, C1> without any encoding;
     - a two-dimensional charset is converted with ENCODE_SJIS (SJIS
       and jisx0208) or ENCODE_BIG5 (BIG5 and big5-1/big5-2), and
       otherwise written as the three bytes <charset, C1, C2> without
       any encoding.
   coding->fake_multibyte is set in every branch except the ASCII,
   katakana and BIG5 ones.  coding->consumed_char is incremented
   once.

   The expansion ends in `while (0)' with no trailing semicolon, so
   that the macro followed by `;' is exactly one statement and can be
   used as the body of an `if' that has an `else'.  */
#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)			\
  do {									\
    int c_alt, charset_alt;						\
    if (!NILP (translation_table)					\
	&& ((c_alt = translate_char (translation_table, -1,		\
				     charset, c1, c2))			\
	    >= 0))							\
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);				\
    else								\
      charset_alt = charset;						\
    if (charset_alt == charset_ascii)					\
      *dst++ = c1;							\
    else if (CHARSET_DIMENSION (charset_alt) == 1)			\
      {									\
	if (sjis_p && charset_alt == charset_katakana_jisx0201)		\
	  *dst++ = c1;							\
	else								\
	  {								\
	    /* Not representable: pass the raw bytes through.  */	\
	    *dst++ = charset_alt, *dst++ = c1;				\
	    coding->fake_multibyte = 1;					\
	  }								\
      }									\
    else								\
      {									\
	c1 &= 0x7F, c2 &= 0x7F;						\
	if (sjis_p && charset_alt == charset_jisx0208)			\
	  {								\
	    unsigned char s1, s2;					\
									\
	    ENCODE_SJIS (c1, c2, s1, s2);				\
	    *dst++ = s1, *dst++ = s2;					\
	    coding->fake_multibyte = 1;					\
	  }								\
	else if (!sjis_p						\
		 && (charset_alt == charset_big5_1			\
		     || charset_alt == charset_big5_2))			\
	  {								\
	    unsigned char b1, b2;					\
									\
	    ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);			\
	    *dst++ = b1, *dst++ = b2;					\
	  }								\
	else								\
	  {								\
	    /* Not representable: pass the raw bytes through.  */	\
	    *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;		\
	    coding->fake_multibyte = 1;					\
	  }								\
      }									\
    coding->consumed_char++;						\
  } while (0)
2189
4ed46869
KH
2190/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2191 Check if a text is encoded in SJIS. If it is, return
2192 CODING_CATEGORY_MASK_SJIS, else return 0. */
2193
2194int
2195detect_coding_sjis (src, src_end)
2196 unsigned char *src, *src_end;
2197{
2198 unsigned char c;
2199
2200 while (src < src_end)
2201 {
2202 c = *src++;
4ed46869
KH
2203 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2204 {
2205 if (src < src_end && *src++ < 0x40)
2206 return 0;
2207 }
2208 }
2209 return CODING_CATEGORY_MASK_SJIS;
2210}
2211
2212/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2213 Check if a text is encoded in BIG5. If it is, return
2214 CODING_CATEGORY_MASK_BIG5, else return 0. */
2215
2216int
2217detect_coding_big5 (src, src_end)
2218 unsigned char *src, *src_end;
2219{
2220 unsigned char c;
2221
2222 while (src < src_end)
2223 {
2224 c = *src++;
4ed46869
KH
2225 if (c >= 0xA1)
2226 {
2227 if (src >= src_end)
2228 break;
2229 c = *src++;
2230 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2231 return 0;
2232 }
2233 }
2234 return CODING_CATEGORY_MASK_BIG5;
2235}
2236
2237/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2238 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
2239
2240int
2241decode_coding_sjis_big5 (coding, source, destination,
d46c5b12 2242 src_bytes, dst_bytes, sjis_p)
4ed46869
KH
2243 struct coding_system *coding;
2244 unsigned char *source, *destination;
2245 int src_bytes, dst_bytes;
4ed46869
KH
2246 int sjis_p;
2247{
2248 unsigned char *src = source;
2249 unsigned char *src_end = source + src_bytes;
2250 unsigned char *dst = destination;
2251 unsigned char *dst_end = destination + dst_bytes;
2252 /* Since the maximum bytes produced by each loop is 4, we subtract 3
2253 from DST_END to assure overflow checking is necessary only at the
2254 head of loop. */
2255 unsigned char *adjusted_dst_end = dst_end - 3;
84fbb8a0 2256 Lisp_Object translation_table
f967223b 2257 = coding->translation_table_for_decode;
d46c5b12 2258 int result = CODING_FINISH_NORMAL;
a5d301df 2259
84fbb8a0 2260 if (!NILP (Venable_character_translation) && NILP (translation_table))
f967223b 2261 translation_table = Vstandard_translation_table_for_decode;
4ed46869 2262
d46c5b12 2263 coding->produced_char = 0;
fb88bf2d 2264 coding->fake_multibyte = 0;
d46c5b12
KH
2265 while (src < src_end && (dst_bytes
2266 ? (dst < adjusted_dst_end)
2267 : (dst < src - 3)))
4ed46869
KH
2268 {
2269 /* SRC_BASE remembers the start position in source in each loop.
2270 The loop will be exited when there's not enough source text
2271 to analyze two-byte character (within macro ONE_MORE_BYTE).
2272 In that case, SRC is reset to SRC_BASE before exiting. */
2273 unsigned char *src_base = src;
2274 unsigned char c1 = *src++, c2, c3, c4;
2275
d46c5b12 2276 if (c1 < 0x20)
4ed46869 2277 {
d46c5b12 2278 if (c1 == '\r')
4ed46869 2279 {
d46c5b12
KH
2280 if (coding->eol_type == CODING_EOL_CRLF)
2281 {
2282 ONE_MORE_BYTE (c2);
2283 if (c2 == '\n')
2284 *dst++ = c2;
2285 else if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2286 {
2287 result = CODING_FINISH_INCONSISTENT_EOL;
2288 goto label_end_of_loop_2;
2289 }
2290 else
2291 /* To process C2 again, SRC is subtracted by 1. */
2292 *dst++ = c1, src--;
2293 }
2294 else if (coding->eol_type == CODING_EOL_CR)
2295 *dst++ = '\n';
4ed46869 2296 else
d46c5b12
KH
2297 *dst++ = c1;
2298 }
2299 else if (c1 == '\n'
2300 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2301 && (coding->eol_type == CODING_EOL_CR
2302 || coding->eol_type == CODING_EOL_CRLF))
2303 {
2304 result = CODING_FINISH_INCONSISTENT_EOL;
2305 goto label_end_of_loop_2;
4ed46869
KH
2306 }
2307 else
2308 *dst++ = c1;
d46c5b12 2309 coding->produced_char++;
4ed46869 2310 }
a5d301df
KH
2311 else if (c1 < 0x80)
2312 DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
54f78171 2313 else
4ed46869 2314 {
4ed46869
KH
2315 if (sjis_p)
2316 {
54f78171 2317 if (c1 < 0xA0 || (c1 >= 0xE0 && c1 < 0xF0))
fb88bf2d 2318 {
54f78171
KH
2319 /* SJIS -> JISX0208 */
2320 ONE_MORE_BYTE (c2);
d14d03ac 2321 if (c2 >= 0x40 && c2 != 0x7F && c2 <= 0xFC)
54f78171
KH
2322 {
2323 DECODE_SJIS (c1, c2, c3, c4);
2324 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
2325 }
2326 else
2327 goto label_invalid_code_2;
fb88bf2d 2328 }
54f78171
KH
2329 else if (c1 < 0xE0)
2330 /* SJIS -> JISX0201-Kana */
2331 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1,
2332 /* dummy */ c2);
fb88bf2d 2333 else
54f78171 2334 goto label_invalid_code_1;
4ed46869 2335 }
fb88bf2d 2336 else
fb88bf2d 2337 {
54f78171
KH
2338 /* BIG5 -> Big5 */
2339 if (c1 >= 0xA1 && c1 <= 0xFE)
fb88bf2d 2340 {
54f78171
KH
2341 ONE_MORE_BYTE (c2);
2342 if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE))
2343 {
2344 int charset;
4ed46869 2345
54f78171
KH
2346 DECODE_BIG5 (c1, c2, charset, c3, c4);
2347 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
2348 }
2349 else
2350 goto label_invalid_code_2;
fb88bf2d
KH
2351 }
2352 else
54f78171 2353 goto label_invalid_code_1;
4ed46869
KH
2354 }
2355 }
2356 continue;
2357
fb88bf2d
KH
2358 label_invalid_code_1:
2359 *dst++ = c1;
2360 coding->produced_char++;
2361 coding->fake_multibyte = 1;
2362 continue;
2363
2364 label_invalid_code_2:
2365 *dst++ = c1; *dst++= c2;
2366 coding->produced_char += 2;
2367 coding->fake_multibyte = 1;
2368 continue;
2369
4ed46869 2370 label_end_of_loop:
d46c5b12
KH
2371 result = CODING_FINISH_INSUFFICIENT_SRC;
2372 label_end_of_loop_2:
4ed46869
KH
2373 src = src_base;
2374 break;
2375 }
2376
fb88bf2d
KH
2377 if (src < src_end)
2378 {
2379 if (result == CODING_FINISH_NORMAL)
2380 result = CODING_FINISH_INSUFFICIENT_DST;
2381 else if (result != CODING_FINISH_INCONSISTENT_EOL
2382 && coding->mode & CODING_MODE_LAST_BLOCK)
2383 {
2384 src_bytes = src_end - src;
2385 if (dst_bytes && (dst_end - dst < src_bytes))
2386 src_bytes = dst_end - dst;
2387 bcopy (dst, src, src_bytes);
2388 src += src_bytes;
2389 dst += src_bytes;
2390 coding->fake_multibyte = 1;
2391 }
2392 }
d46c5b12
KH
2393
2394 coding->consumed = coding->consumed_char = src - source;
2395 coding->produced = dst - destination;
2396 return result;
4ed46869
KH
2397}
2398
2399/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2400 This function can encode `charset_ascii', `charset_katakana_jisx0201',
2401 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
2402 sure that all these charsets are registered as official charset
2403 (i.e. do not have extended leading-codes). Characters of other
2404 charsets are produced without any encoding. If SJIS_P is 1, encode
2405 SJIS text, else encode BIG5 text. */
2406
2407int
2408encode_coding_sjis_big5 (coding, source, destination,
d46c5b12 2409 src_bytes, dst_bytes, sjis_p)
4ed46869
KH
2410 struct coding_system *coding;
2411 unsigned char *source, *destination;
2412 int src_bytes, dst_bytes;
4ed46869
KH
2413 int sjis_p;
2414{
2415 unsigned char *src = source;
2416 unsigned char *src_end = source + src_bytes;
2417 unsigned char *dst = destination;
2418 unsigned char *dst_end = destination + dst_bytes;
2419 /* Since the maximum bytes produced by each loop is 2, we subtract 1
2420 from DST_END to assure overflow checking is necessary only at the
2421 head of loop. */
2422 unsigned char *adjusted_dst_end = dst_end - 1;
84fbb8a0 2423 Lisp_Object translation_table
f967223b 2424 = coding->translation_table_for_encode;
d46c5b12 2425 int result = CODING_FINISH_NORMAL;
a5d301df 2426
84fbb8a0 2427 if (!NILP (Venable_character_translation) && NILP (translation_table))
f967223b 2428 translation_table = Vstandard_translation_table_for_encode;
4ed46869 2429
d46c5b12 2430 coding->consumed_char = 0;
fb88bf2d 2431 coding->fake_multibyte = 0;
d46c5b12
KH
2432 while (src < src_end && (dst_bytes
2433 ? (dst < adjusted_dst_end)
2434 : (dst < src - 1)))
4ed46869
KH
2435 {
2436 /* SRC_BASE remembers the start position in source in each loop.
2437 The loop will be exited when there's not enough source text
2438 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2439 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
2440 before exiting. */
2441 unsigned char *src_base = src;
2442 unsigned char c1 = *src++, c2, c3, c4;
2443
2444 if (coding->composing)
2445 {
2446 if (c1 == 0xA0)
2447 {
2448 ONE_MORE_BYTE (c1);
2449 c1 &= 0x7F;
2450 }
2451 else if (c1 >= 0xA0)
2452 c1 -= 0x20;
2453 else
2454 coding->composing = 0;
2455 }
2456
2457 switch (emacs_code_class[c1])
2458 {
2459 case EMACS_ascii_code:
a5d301df
KH
2460 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2461 break;
2462
4ed46869
KH
2463 case EMACS_control_code:
2464 *dst++ = c1;
d46c5b12 2465 coding->consumed_char++;
4ed46869
KH
2466 break;
2467
2468 case EMACS_carriage_return_code:
d46c5b12 2469 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
4ed46869
KH
2470 {
2471 *dst++ = c1;
d46c5b12 2472 coding->consumed_char++;
4ed46869
KH
2473 break;
2474 }
2475 /* fall down to treat '\r' as '\n' ... */
2476
2477 case EMACS_linefeed_code:
2478 if (coding->eol_type == CODING_EOL_LF
0ef69138 2479 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2480 *dst++ = '\n';
2481 else if (coding->eol_type == CODING_EOL_CRLF)
2482 *dst++ = '\r', *dst++ = '\n';
2483 else
2484 *dst++ = '\r';
d46c5b12 2485 coding->consumed_char++;
4ed46869
KH
2486 break;
2487
2488 case EMACS_leading_code_2:
2489 ONE_MORE_BYTE (c2);
a5d301df 2490 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
2491 break;
2492
2493 case EMACS_leading_code_3:
2494 TWO_MORE_BYTES (c2, c3);
a5d301df 2495 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
4ed46869
KH
2496 break;
2497
2498 case EMACS_leading_code_4:
2499 THREE_MORE_BYTES (c2, c3, c4);
a5d301df 2500 ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
4ed46869
KH
2501 break;
2502
2503 case EMACS_leading_code_composition:
2504 coding->composing = 1;
2505 break;
2506
2507 default: /* i.e. case EMACS_invalid_code: */
2508 *dst++ = c1;
d46c5b12 2509 coding->consumed_char++;
4ed46869
KH
2510 }
2511 continue;
2512
2513 label_end_of_loop:
d46c5b12
KH
2514 result = CODING_FINISH_INSUFFICIENT_SRC;
2515 src = src_base;
4ed46869
KH
2516 break;
2517 }
2518
d46c5b12
KH
2519 if (result == CODING_FINISH_NORMAL
2520 && src < src_end)
2521 result = CODING_FINISH_INSUFFICIENT_DST;
2522 coding->consumed = src - source;
2523 coding->produced = coding->produced_char = dst - destination;
2524 return result;
4ed46869
KH
2525}
2526
2527\f
1397dc18
KH
2528/*** 5. CCL handlers ***/
2529
2530/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2531 Check if a text is encoded in a coding system of which
2532 encoder/decoder are written in CCL program. If it is, return
2533 CODING_CATEGORY_MASK_CCL, else return 0. */
2534
2535int
2536detect_coding_ccl (src, src_end)
2537 unsigned char *src, *src_end;
2538{
2539 unsigned char *valid;
2540
2541 /* No coding system is assigned to coding-category-ccl. */
2542 if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
2543 return 0;
2544
2545 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
2546 while (src < src_end)
2547 {
2548 if (! valid[*src]) return 0;
2549 src++;
2550 }
2551 return CODING_CATEGORY_MASK_CCL;
2552}
2553
2554\f
2555/*** 6. End-of-line handlers ***/
4ed46869
KH
2556
2557/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2558 This function is called only when `coding->eol_type' is
2559 CODING_EOL_CRLF or CODING_EOL_CR. */
2560
dfcf069d 2561int
d46c5b12 2562decode_eol (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
2563 struct coding_system *coding;
2564 unsigned char *source, *destination;
2565 int src_bytes, dst_bytes;
4ed46869
KH
2566{
2567 unsigned char *src = source;
2568 unsigned char *src_end = source + src_bytes;
2569 unsigned char *dst = destination;
2570 unsigned char *dst_end = destination + dst_bytes;
fb88bf2d 2571 unsigned char c;
d46c5b12
KH
2572 int result = CODING_FINISH_NORMAL;
2573
fb88bf2d
KH
2574 coding->fake_multibyte = 0;
2575
d46c5b12
KH
2576 if (src_bytes <= 0)
2577 return result;
4ed46869
KH
2578
2579 switch (coding->eol_type)
2580 {
2581 case CODING_EOL_CRLF:
2582 {
2583 /* Since the maximum bytes produced by each loop is 2, we
2584 subtract 1 from DST_END to assure overflow checking is
2585 necessary only at the head of loop. */
2586 unsigned char *adjusted_dst_end = dst_end - 1;
2587
d46c5b12
KH
2588 while (src < src_end && (dst_bytes
2589 ? (dst < adjusted_dst_end)
2590 : (dst < src - 1)))
4ed46869
KH
2591 {
2592 unsigned char *src_base = src;
fb88bf2d
KH
2593
2594 c = *src++;
4ed46869
KH
2595 if (c == '\r')
2596 {
2597 ONE_MORE_BYTE (c);
fdfcf19d
KH
2598 if (c == '\n')
2599 *dst++ = c;
2600 else
d46c5b12
KH
2601 {
2602 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2603 {
2604 result = CODING_FINISH_INCONSISTENT_EOL;
2605 goto label_end_of_loop_2;
2606 }
fdfcf19d 2607 src--;
d46c5b12 2608 *dst++ = '\r';
fb88bf2d
KH
2609 if (BASE_LEADING_CODE_P (c))
2610 coding->fake_multibyte = 1;
d46c5b12 2611 }
4ed46869 2612 }
d46c5b12
KH
2613 else if (c == '\n'
2614 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2615 {
2616 result = CODING_FINISH_INCONSISTENT_EOL;
2617 goto label_end_of_loop_2;
2618 }
4ed46869 2619 else
fb88bf2d
KH
2620 {
2621 *dst++ = c;
2622 if (BASE_LEADING_CODE_P (c))
2623 coding->fake_multibyte = 1;
2624 }
4ed46869
KH
2625 continue;
2626
2627 label_end_of_loop:
d46c5b12
KH
2628 result = CODING_FINISH_INSUFFICIENT_SRC;
2629 label_end_of_loop_2:
4ed46869
KH
2630 src = src_base;
2631 break;
2632 }
fdfcf19d
KH
2633 if (src < src_end)
2634 {
2635 if (result == CODING_FINISH_NORMAL)
2636 result = CODING_FINISH_INSUFFICIENT_DST;
2637 else if (result != CODING_FINISH_INCONSISTENT_EOL
2638 && coding->mode & CODING_MODE_LAST_BLOCK)
2639 {
2640 /* This is the last block of the text to be decoded.
2641 We flush out all remaining codes. */
2642 src_bytes = src_end - src;
2643 if (dst_bytes && (dst_end - dst < src_bytes))
2644 src_bytes = dst_end - dst;
2645 bcopy (src, dst, src_bytes);
2646 dst += src_bytes;
2647 src += src_bytes;
2648 }
2649 }
4ed46869 2650 }
d46c5b12 2651 break;
4ed46869
KH
2652
2653 case CODING_EOL_CR:
d46c5b12
KH
2654 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2655 {
fb88bf2d
KH
2656 while (src < src_end)
2657 {
2658 if ((c = *src++) == '\n')
2659 break;
2660 if (BASE_LEADING_CODE_P (c))
2661 coding->fake_multibyte = 1;
2662 }
d46c5b12
KH
2663 if (*--src == '\n')
2664 {
2665 src_bytes = src - source;
2666 result = CODING_FINISH_INCONSISTENT_EOL;
2667 }
2668 }
2669 if (dst_bytes && src_bytes > dst_bytes)
2670 {
2671 result = CODING_FINISH_INSUFFICIENT_DST;
2672 src_bytes = dst_bytes;
2673 }
2674 if (dst_bytes)
2675 bcopy (source, destination, src_bytes);
2676 else
2677 safe_bcopy (source, destination, src_bytes);
2678 src = source + src_bytes;
2679 while (src_bytes--) if (*dst++ == '\r') dst[-1] = '\n';
4ed46869
KH
2680 break;
2681
2682 default: /* i.e. case: CODING_EOL_LF */
d46c5b12
KH
2683 if (dst_bytes && src_bytes > dst_bytes)
2684 {
2685 result = CODING_FINISH_INSUFFICIENT_DST;
2686 src_bytes = dst_bytes;
2687 }
2688 if (dst_bytes)
2689 bcopy (source, destination, src_bytes);
2690 else
2691 safe_bcopy (source, destination, src_bytes);
2692 src += src_bytes;
993824c9 2693 dst += src_bytes;
fb88bf2d 2694 coding->fake_multibyte = 1;
4ed46869
KH
2695 break;
2696 }
2697
d46c5b12
KH
2698 coding->consumed = coding->consumed_char = src - source;
2699 coding->produced = coding->produced_char = dst - destination;
2700 return result;
4ed46869
KH
2701}
2702
2703/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2704 format of end-of-line according to `coding->eol_type'. If
d46c5b12
KH
2705 `coding->mode & CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code
2706 '\r' in source text also means end-of-line. */
4ed46869 2707
dfcf069d 2708int
d46c5b12 2709encode_eol (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
2710 struct coding_system *coding;
2711 unsigned char *source, *destination;
2712 int src_bytes, dst_bytes;
4ed46869
KH
2713{
2714 unsigned char *src = source;
2715 unsigned char *dst = destination;
d46c5b12 2716 int result = CODING_FINISH_NORMAL;
4ed46869 2717
fb88bf2d
KH
2718 coding->fake_multibyte = 0;
2719
d46c5b12
KH
2720 if (coding->eol_type == CODING_EOL_CRLF)
2721 {
2722 unsigned char c;
2723 unsigned char *src_end = source + src_bytes;
2724 unsigned char *dst_end = destination + dst_bytes;
2725 /* Since the maximum bytes produced by each loop is 2, we
2726 subtract 1 from DST_END to assure overflow checking is
2727 necessary only at the head of loop. */
2728 unsigned char *adjusted_dst_end = dst_end - 1;
2729
2730 while (src < src_end && (dst_bytes
2731 ? (dst < adjusted_dst_end)
2732 : (dst < src - 1)))
2733 {
2734 c = *src++;
2735 if (c == '\n'
2736 || (c == '\r' && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)))
2737 *dst++ = '\r', *dst++ = '\n';
2738 else
fb88bf2d
KH
2739 {
2740 *dst++ = c;
2741 if (BASE_LEADING_CODE_P (c))
2742 coding->fake_multibyte = 1;
2743 }
d46c5b12
KH
2744 }
2745 if (src < src_end)
2746 result = CODING_FINISH_INSUFFICIENT_DST;
2747 }
2748 else
4ed46869 2749 {
fb88bf2d
KH
2750 unsigned char c;
2751
d46c5b12 2752 if (dst_bytes && src_bytes > dst_bytes)
4ed46869 2753 {
d46c5b12
KH
2754 src_bytes = dst_bytes;
2755 result = CODING_FINISH_INSUFFICIENT_DST;
2756 }
2757 if (dst_bytes)
2758 bcopy (source, destination, src_bytes);
2759 else
993824c9
RS
2760 safe_bcopy (source, destination, src_bytes);
2761 dst_bytes = src_bytes;
2762 if (coding->eol_type == CODING_EOL_CR)
d46c5b12
KH
2763 {
2764 while (src_bytes--)
fb88bf2d
KH
2765 {
2766 if ((c = *dst++) == '\n')
2767 dst[-1] = '\r';
2768 else if (BASE_LEADING_CODE_P (c))
993824c9 2769 coding->fake_multibyte = 1;
fb88bf2d 2770 }
d46c5b12 2771 }
fb88bf2d 2772 else
d46c5b12 2773 {
fb88bf2d
KH
2774 if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2775 {
2776 while (src_bytes--)
2777 if (*dst++ == '\r') dst[-1] = '\n';
2778 }
2779 coding->fake_multibyte = 1;
4ed46869 2780 }
fb88bf2d
KH
2781 src = source + dst_bytes;
2782 dst = destination + dst_bytes;
4ed46869
KH
2783 }
2784
d46c5b12
KH
2785 coding->consumed = coding->consumed_char = src - source;
2786 coding->produced = coding->produced_char = dst - destination;
2787 return result;
4ed46869
KH
2788}
2789
2790\f
1397dc18 2791/*** 7. C library functions ***/
4ed46869
KH
2792
2793/* In Emacs Lisp, coding system is represented by a Lisp symbol which
2794 has a property `coding-system'. The value of this property is a
2795 vector of length 5 (called as coding-vector). Among elements of
2796 this vector, the first (element[0]) and the fifth (element[4])
2797 carry important information for decoding/encoding. Before
2798 decoding/encoding, this information should be set in fields of a
2799 structure of type `coding_system'.
2800
2801 A value of property `coding-system' can be a symbol of another
2802 subsidiary coding-system. In that case, Emacs gets coding-vector
2803 from that symbol.
2804
2805 `element[0]' contains information to be set in `coding->type'. The
2806 value and its meaning is as follows:
2807
0ef69138
KH
2808 0 -- coding_type_emacs_mule
2809 1 -- coding_type_sjis
2810 2 -- coding_type_iso2022
2811 3 -- coding_type_big5
2812 4 -- coding_type_ccl encoder/decoder written in CCL
 5 -- coding_type_raw_text
2813 nil -- coding_type_no_conversion
2814 t -- coding_type_undecided (automatic conversion on decoding,
2815 no-conversion on encoding)
4ed46869
KH
2816
2817 `element[4]' contains information to be set in `coding->flags' and
2818 `coding->spec'. The meaning varies by `coding->type'.
2819
2820 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2821 of length 32 (of which the first 17 sub-elements are used now).
2822 Meanings of these sub-elements are:
2823
2824 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2825 If the value is an integer of valid charset, the charset is
2826 assumed to be designated to graphic register N initially.
2827
2828 If the value is minus, it is a minus value of charset which
2829 reserves graphic register N, which means that the charset is
2830 not designated initially but should be designated to graphic
2831 register N just before encoding a character in that charset.
2832
2833 If the value is nil, graphic register N is never used on
2834 encoding.
2835
2836 sub-element[N] where N is 4 through 16: to be set in `coding->flags'
2837 Each value takes t or nil. See the section ISO2022 of
2838 `coding.h' for more information.
2839
2840 If `coding->type' is `coding_type_big5', element[4] is t to denote
2841 BIG5-ETen or nil to denote BIG5-HKU.
2842
2843 If `coding->type' takes the other value, element[4] is ignored.
2844
2845 Emacs Lisp's coding system also carries information about format of
2846 end-of-line in a value of property `eol-type'. If the value is
2847 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2848 means CODING_EOL_CR. If it is not integer, it should be a vector
2849 of subsidiary coding systems of which property `eol-type' has one
2850 of above values.
2851
2852*/
2853
2854/* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2855 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2856 is setup so that no conversion is necessary and return -1, else
2857 return 0. */
2858
2859int
e0e989f6
KH
2860setup_coding_system (coding_system, coding)
2861 Lisp_Object coding_system;
4ed46869
KH
2862 struct coding_system *coding;
2863{
d46c5b12 2864 Lisp_Object coding_spec, coding_type, eol_type, plist;
4608c386 2865 Lisp_Object val;
70c22245 2866 int i;
4ed46869 2867
d46c5b12 2868 /* Initialize some fields required for all kinds of coding systems. */
774324d6 2869 coding->symbol = coding_system;
d46c5b12
KH
2870 coding->common_flags = 0;
2871 coding->mode = 0;
2872 coding->heading_ascii = -1;
2873 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
1f5dbf34
KH
2874
2875 if (NILP (coding_system))
2876 goto label_invalid_coding_system;
2877
4608c386 2878 coding_spec = Fget (coding_system, Qcoding_system);
1f5dbf34 2879
4608c386
KH
2880 if (!VECTORP (coding_spec)
2881 || XVECTOR (coding_spec)->size != 5
2882 || !CONSP (XVECTOR (coding_spec)->contents[3]))
4ed46869 2883 goto label_invalid_coding_system;
4608c386 2884
d46c5b12
KH
2885 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
2886 if (VECTORP (eol_type))
2887 {
2888 coding->eol_type = CODING_EOL_UNDECIDED;
2889 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2890 }
2891 else if (XFASTINT (eol_type) == 1)
2892 {
2893 coding->eol_type = CODING_EOL_CRLF;
2894 coding->common_flags
2895 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2896 }
2897 else if (XFASTINT (eol_type) == 2)
2898 {
2899 coding->eol_type = CODING_EOL_CR;
2900 coding->common_flags
2901 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2902 }
2903 else
2904 coding->eol_type = CODING_EOL_LF;
2905
2906 coding_type = XVECTOR (coding_spec)->contents[0];
2907 /* Try short cut. */
2908 if (SYMBOLP (coding_type))
2909 {
2910 if (EQ (coding_type, Qt))
2911 {
2912 coding->type = coding_type_undecided;
2913 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
2914 }
2915 else
2916 coding->type = coding_type_no_conversion;
2917 return 0;
2918 }
2919
2920 /* Initialize remaining fields. */
2921 coding->composing = 0;
a63063ae 2922 coding->composed_chars = 0;
d46c5b12
KH
2923
2924 /* Get values of coding system properties:
2925 `post-read-conversion', `pre-write-conversion',
f967223b 2926 `translation-table-for-decode', `translation-table-for-encode'. */
4608c386
KH
2927 plist = XVECTOR (coding_spec)->contents[3];
2928 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
2929 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
f967223b 2930 val = Fplist_get (plist, Qtranslation_table_for_decode);
4608c386 2931 if (SYMBOLP (val))
f967223b
KH
2932 val = Fget (val, Qtranslation_table_for_decode);
2933 coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
2934 val = Fplist_get (plist, Qtranslation_table_for_encode);
4608c386 2935 if (SYMBOLP (val))
f967223b
KH
2936 val = Fget (val, Qtranslation_table_for_encode);
2937 coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
d46c5b12
KH
2938 val = Fplist_get (plist, Qcoding_category);
2939 if (!NILP (val))
2940 {
2941 val = Fget (val, Qcoding_category_index);
2942 if (INTEGERP (val))
2943 coding->category_idx = XINT (val);
2944 else
2945 goto label_invalid_coding_system;
2946 }
2947 else
2948 goto label_invalid_coding_system;
4608c386 2949
70c22245
KH
2950 val = Fplist_get (plist, Qsafe_charsets);
2951 if (EQ (val, Qt))
2952 {
2953 for (i = 0; i <= MAX_CHARSET; i++)
2954 coding->safe_charsets[i] = 1;
2955 }
2956 else
2957 {
2958 bzero (coding->safe_charsets, MAX_CHARSET + 1);
2959 while (CONSP (val))
2960 {
2961 if ((i = get_charset_id (XCONS (val)->car)) >= 0)
2962 coding->safe_charsets[i] = 1;
2963 val = XCONS (val)->cdr;
2964 }
2965 }
2966
d46c5b12 2967 switch (XFASTINT (coding_type))
4ed46869
KH
2968 {
2969 case 0:
0ef69138 2970 coding->type = coding_type_emacs_mule;
c952af22
KH
2971 if (!NILP (coding->post_read_conversion))
2972 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
2973 if (!NILP (coding->pre_write_conversion))
2974 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4ed46869
KH
2975 break;
2976
2977 case 1:
2978 coding->type = coding_type_sjis;
c952af22
KH
2979 coding->common_flags
2980 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869
KH
2981 break;
2982
2983 case 2:
2984 coding->type = coding_type_iso2022;
c952af22
KH
2985 coding->common_flags
2986 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 2987 {
70c22245 2988 Lisp_Object val, temp;
4ed46869 2989 Lisp_Object *flags;
d46c5b12 2990 int i, charset, reg_bits = 0;
4ed46869 2991
4608c386 2992 val = XVECTOR (coding_spec)->contents[4];
f44d27ce 2993
4ed46869
KH
2994 if (!VECTORP (val) || XVECTOR (val)->size != 32)
2995 goto label_invalid_coding_system;
2996
2997 flags = XVECTOR (val)->contents;
2998 coding->flags
2999 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3000 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3001 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3002 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3003 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3004 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3005 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3006 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
e0e989f6
KH
3007 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3008 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
c4825358
KH
3009 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3010 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3f003981 3011 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
c4825358 3012 );
4ed46869
KH
3013
3014 /* Invoke graphic register 0 to plane 0. */
3015 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3016 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
3017 CODING_SPEC_ISO_INVOCATION (coding, 1)
3018 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3019 /* Not single shifting at first. */
6e85d753 3020 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
e0e989f6 3021 /* Beginning of buffer should also be regarded as bol. */
6e85d753 3022 CODING_SPEC_ISO_BOL (coding) = 1;
4ed46869 3023
70c22245
KH
3024 for (charset = 0; charset <= MAX_CHARSET; charset++)
3025 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3026 val = Vcharset_revision_alist;
3027 while (CONSP (val))
3028 {
3029 charset = get_charset_id (Fcar_safe (XCONS (val)->car));
3030 if (charset >= 0
3031 && (temp = Fcdr_safe (XCONS (val)->car), INTEGERP (temp))
3032 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3033 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3034 val = XCONS (val)->cdr;
3035 }
3036
4ed46869
KH
3037 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3038 FLAGS[REG] can be one of below:
3039 integer CHARSET: CHARSET occupies register I,
3040 t: designate nothing to REG initially, but can be used
3041 by any charsets,
3042 list of integer, nil, or t: designate the first
3043 element (if integer) to REG initially, the remaining
3044 elements (if integer) is designated to REG on request,
d46c5b12 3045 if an element is t, REG can be used by any charsets,
4ed46869 3046 nil: REG is never used. */
467e7675 3047 for (charset = 0; charset <= MAX_CHARSET; charset++)
1ba9e4ab
KH
3048 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3049 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
4ed46869
KH
3050 for (i = 0; i < 4; i++)
3051 {
3052 if (INTEGERP (flags[i])
e0e989f6
KH
3053 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
3054 || (charset = get_charset_id (flags[i])) >= 0)
4ed46869
KH
3055 {
3056 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3057 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3058 }
3059 else if (EQ (flags[i], Qt))
3060 {
3061 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
d46c5b12
KH
3062 reg_bits |= 1 << i;
3063 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
4ed46869
KH
3064 }
3065 else if (CONSP (flags[i]))
3066 {
84d60297
RS
3067 Lisp_Object tail;
3068 tail = flags[i];
4ed46869 3069
d46c5b12 3070 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
4ed46869
KH
3071 if (INTEGERP (XCONS (tail)->car)
3072 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
3073 CHARSET_VALID_P (charset))
3074 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
4ed46869
KH
3075 {
3076 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3077 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3078 }
3079 else
3080 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3081 tail = XCONS (tail)->cdr;
3082 while (CONSP (tail))
3083 {
3084 if (INTEGERP (XCONS (tail)->car)
3085 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
3086 CHARSET_VALID_P (charset))
3087 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
70c22245
KH
3088 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3089 = i;
4ed46869 3090 else if (EQ (XCONS (tail)->car, Qt))
d46c5b12 3091 reg_bits |= 1 << i;
4ed46869
KH
3092 tail = XCONS (tail)->cdr;
3093 }
3094 }
3095 else
3096 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3097
3098 CODING_SPEC_ISO_DESIGNATION (coding, i)
3099 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3100 }
3101
d46c5b12 3102 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
4ed46869
KH
3103 {
3104 /* REG 1 can be used only by locking shift in 7-bit env. */
3105 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
d46c5b12 3106 reg_bits &= ~2;
4ed46869
KH
3107 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3108 /* Without any shifting, only REG 0 and 1 can be used. */
d46c5b12 3109 reg_bits &= 3;
4ed46869
KH
3110 }
3111
d46c5b12
KH
3112 if (reg_bits)
3113 for (charset = 0; charset <= MAX_CHARSET; charset++)
6e85d753 3114 {
d46c5b12
KH
3115 if (CHARSET_VALID_P (charset))
3116 {
3117 /* There exist some default graphic registers to be
3118 used CHARSET. */
3119
3120 /* We had better avoid designating a charset of
3121 CHARS96 to REG 0 as far as possible. */
3122 if (CHARSET_CHARS (charset) == 96)
3123 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3124 = (reg_bits & 2
3125 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3126 else
3127 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3128 = (reg_bits & 1
3129 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3130 }
6e85d753 3131 }
4ed46869 3132 }
c952af22 3133 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
d46c5b12 3134 coding->spec.iso2022.last_invalid_designation_register = -1;
4ed46869
KH
3135 break;
3136
3137 case 3:
3138 coding->type = coding_type_big5;
c952af22
KH
3139 coding->common_flags
3140 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 3141 coding->flags
4608c386 3142 = (NILP (XVECTOR (coding_spec)->contents[4])
4ed46869
KH
3143 ? CODING_FLAG_BIG5_HKU
3144 : CODING_FLAG_BIG5_ETEN);
3145 break;
3146
3147 case 4:
3148 coding->type = coding_type_ccl;
c952af22
KH
3149 coding->common_flags
3150 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 3151 {
84d60297 3152 Lisp_Object val;
d21ca14d
KH
3153 Lisp_Object decoder, encoder;
3154
84d60297 3155 val = XVECTOR (coding_spec)->contents[4];
4ed46869 3156 if (CONSP (val)
d21ca14d
KH
3157 && SYMBOLP (XCONS (val)->car)
3158 && !NILP (decoder = Fget (XCONS (val)->car, Qccl_program_idx))
f82423d7 3159 && !NILP (decoder = Fcdr (Faref (Vccl_program_table, decoder)))
d21ca14d
KH
3160 && SYMBOLP (XCONS (val)->cdr)
3161 && !NILP (encoder = Fget (XCONS (val)->cdr, Qccl_program_idx))
f82423d7 3162 && !NILP (encoder = Fcdr (Faref (Vccl_program_table, encoder))))
4ed46869 3163 {
d21ca14d
KH
3164 setup_ccl_program (&(coding->spec.ccl.decoder), decoder);
3165 setup_ccl_program (&(coding->spec.ccl.encoder), encoder);
4ed46869
KH
3166 }
3167 else
3168 goto label_invalid_coding_system;
1397dc18
KH
3169
3170 bzero (coding->spec.ccl.valid_codes, 256);
3171 val = Fplist_get (plist, Qvalid_codes);
3172 if (CONSP (val))
3173 {
3174 Lisp_Object this;
3175
7b179c2d 3176 for (; CONSP (val); val = XCONS (val)->cdr)
1397dc18 3177 {
7b179c2d 3178 this = XCONS (val)->car;
1397dc18
KH
3179 if (INTEGERP (this)
3180 && XINT (this) >= 0 && XINT (this) < 256)
3181 coding->spec.ccl.valid_codes[XINT (this)] = 1;
3182 else if (CONSP (this)
3183 && INTEGERP (XCONS (this)->car)
3184 && INTEGERP (XCONS (this)->cdr))
3185 {
3186 int start = XINT (XCONS (this)->car);
3187 int end = XINT (XCONS (this)->cdr);
3188
3189 if (start >= 0 && start <= end && end < 256)
e133c8fa 3190 while (start <= end)
1397dc18
KH
3191 coding->spec.ccl.valid_codes[start++] = 1;
3192 }
3193 }
3194 }
4ed46869 3195 }
c952af22 3196 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
4ed46869
KH
3197 break;
3198
27901516
KH
3199 case 5:
3200 coding->type = coding_type_raw_text;
3201 break;
3202
4ed46869 3203 default:
d46c5b12 3204 goto label_invalid_coding_system;
4ed46869
KH
3205 }
3206 return 0;
3207
3208 label_invalid_coding_system:
3209 coding->type = coding_type_no_conversion;
d46c5b12 3210 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
c952af22 3211 coding->common_flags = 0;
dec137e5 3212 coding->eol_type = CODING_EOL_LF;
d46c5b12 3213 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
4ed46869
KH
3214 return -1;
3215}
3216
54f78171
KH
3217/* Setup raw-text or one of its subsidiaries in the structure
3218 coding_system CODING according to the already setup value eol_type
3219 in CODING. CODING should be setup for some coding system in
3220 advance. */
3221
3222void
3223setup_raw_text_coding_system (coding)
3224 struct coding_system *coding;
3225{
3226 if (coding->type != coding_type_raw_text)
3227 {
3228 coding->symbol = Qraw_text;
3229 coding->type = coding_type_raw_text;
3230 if (coding->eol_type != CODING_EOL_UNDECIDED)
3231 {
84d60297
RS
3232 Lisp_Object subsidiaries;
3233 subsidiaries = Fget (Qraw_text, Qeol_type);
54f78171
KH
3234
3235 if (VECTORP (subsidiaries)
3236 && XVECTOR (subsidiaries)->size == 3)
3237 coding->symbol
3238 = XVECTOR (subsidiaries)->contents[coding->eol_type];
3239 }
3240 }
3241 return;
3242}
3243
4ed46869
KH
3244/* Emacs has a mechanism to automatically detect a coding system if it
3245 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
3246 it's impossible to distinguish some coding systems accurately
3247 because they use the same range of codes. So, at first, coding
3248 systems are grouped into the following categories:
3249
0ef69138 3250 o coding-category-emacs-mule
4ed46869
KH
3251
3252 The category for a coding system which has the same code range
3253 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 3254 symbol) `emacs-mule' by default.
4ed46869
KH
3255
3256 o coding-category-sjis
3257
3258 The category for a coding system which has the same code range
3259 as SJIS. Assigned the coding-system (Lisp
7717c392 3260 symbol) `japanese-shift-jis' by default.
4ed46869
KH
3261
3262 o coding-category-iso-7
3263
3264 The category for a coding system which has the same code range
7717c392 3265 as ISO2022 of 7-bit environment. This doesn't use any locking
d46c5b12
KH
3266 shift and single shift functions. This can encode/decode all
3267 charsets. Assigned the coding-system (Lisp symbol)
3268 `iso-2022-7bit' by default.
3269
3270 o coding-category-iso-7-tight
3271
3272 Same as coding-category-iso-7 except that this can
3273 encode/decode only the specified charsets.
4ed46869
KH
3274
3275 o coding-category-iso-8-1
3276
3277 The category for a coding system which has the same code range
3278 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
3279 for DIMENSION1 charset. This doesn't use any locking shift
3280 and single shift functions. Assigned the coding-system (Lisp
3281 symbol) `iso-latin-1' by default.
4ed46869
KH
3282
3283 o coding-category-iso-8-2
3284
3285 The category for a coding system which has the same code range
3286 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
3287 for DIMENSION2 charset. This doesn't use any locking shift
3288 and single shift functions. Assigned the coding-system (Lisp
3289 symbol) `japanese-iso-8bit' by default.
4ed46869 3290
7717c392 3291 o coding-category-iso-7-else
4ed46869
KH
3292
3293 The category for a coding system which has the same code range
7717c392
KH
3294 as ISO2022 of 7-bit environment but uses locking shift or
3295 single shift functions. Assigned the coding-system (Lisp
3296 symbol) `iso-2022-7bit-lock' by default.
3297
3298 o coding-category-iso-8-else
3299
3300 The category for a coding system which has the same code range
3301 as ISO2022 of 8-bit environment but uses locking shift or
3302 single shift functions. Assigned the coding-system (Lisp
3303 symbol) `iso-2022-8bit-ss2' by default.
4ed46869
KH
3304
3305 o coding-category-big5
3306
3307 The category for a coding system which has the same code range
3308 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 3309 `cn-big5' by default.
4ed46869 3310
1397dc18
KH
3311 o coding-category-ccl
3312
3313 The category for a coding system of which encoder/decoder is
3314 written in CCL programs. The default value is nil, i.e., no
3315 coding system is assigned.
3316
4ed46869
KH
3317 o coding-category-binary
3318
3319 The category for a coding system not categorized in any of the
3320 above. Assigned the coding-system (Lisp symbol)
e0e989f6 3321 `no-conversion' by default.
4ed46869
KH
3322
3323 Each of them is a Lisp symbol and the value is an actual
3324 `coding-system's (this is also a Lisp symbol) assigned by a user.
3325 What Emacs does actually is to detect a category of coding system.
3326 Then, it uses a `coding-system' assigned to it. If Emacs can't
3327 decide only one possible category, it selects a category of the
3328 highest priority. Priorities of categories are also specified by a
3329 user in a Lisp variable `coding-category-list'.
3330
3331*/
3332
66cfb530
KH
3333static
3334int ascii_skip_code[256];
3335
d46c5b12 3336/* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4ed46869
KH
3337 If it detects possible coding systems, return an integer in which
3338 appropriate flag bits are set. Flag bits are defined by macros
d46c5b12 3339 CODING_CATEGORY_MASK_XXX in `coding.h'.
4ed46869 3340
d46c5b12
KH
3341 How many ASCII characters are at the head is returned as *SKIP. */
3342
3343static int
3344detect_coding_mask (source, src_bytes, priorities, skip)
3345 unsigned char *source;
3346 int src_bytes, *priorities, *skip;
4ed46869
KH
3347{
3348 register unsigned char c;
d46c5b12 3349 unsigned char *src = source, *src_end = source + src_bytes;
66cfb530 3350 unsigned int mask;
d46c5b12 3351 int i;
4ed46869
KH
3352
3353 /* At first, skip all ASCII characters and control characters except
3354 for three ISO2022 specific control characters. */
66cfb530
KH
3355 ascii_skip_code[ISO_CODE_SO] = 0;
3356 ascii_skip_code[ISO_CODE_SI] = 0;
3357 ascii_skip_code[ISO_CODE_ESC] = 0;
3358
bcf26d6a 3359 label_loop_detect_coding:
66cfb530 3360 while (src < src_end && ascii_skip_code[*src]) src++;
d46c5b12 3361 *skip = src - source;
4ed46869
KH
3362
3363 if (src >= src_end)
3364 /* We found nothing other than ASCII. There's nothing to do. */
d46c5b12 3365 return 0;
4ed46869 3366
8a8147d6 3367 c = *src;
4ed46869
KH
3368 /* The text seems to be encoded in some multilingual coding system.
3369 Now, try to find in which coding system the text is encoded. */
3370 if (c < 0x80)
bcf26d6a
KH
3371 {
3372 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3373 /* C is an ISO2022 specific control code of C0. */
3374 mask = detect_coding_iso2022 (src, src_end);
1b2af4b0 3375 if (mask == 0)
d46c5b12
KH
3376 {
3377 /* No valid ISO2022 code follows C. Try again. */
3378 src++;
66cfb530
KH
3379 if (c == ISO_CODE_ESC)
3380 ascii_skip_code[ISO_CODE_ESC] = 1;
3381 else
3382 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
d46c5b12
KH
3383 goto label_loop_detect_coding;
3384 }
3385 if (priorities)
3386 goto label_return_highest_only;
bcf26d6a 3387 }
d46c5b12 3388 else
c4825358 3389 {
d46c5b12 3390 int try;
4ed46869 3391
d46c5b12
KH
3392 if (c < 0xA0)
3393 {
3394 /* C is the first byte of SJIS character code,
3395 or a leading-code of Emacs' internal format (emacs-mule). */
3396 try = CODING_CATEGORY_MASK_SJIS | CODING_CATEGORY_MASK_EMACS_MULE;
3397
3398 /* Or, if C is a special latin extra code,
3399 or is an ISO2022 specific control code of C1 (SS2 or SS3),
3400 or is an ISO2022 control-sequence-introducer (CSI),
3401 we should also consider the possibility of ISO2022 codings. */
3402 if ((VECTORP (Vlatin_extra_code_table)
3403 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3404 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3405 || (c == ISO_CODE_CSI
3406 && (src < src_end
3407 && (*src == ']'
3408 || ((*src == '0' || *src == '1' || *src == '2')
3409 && src + 1 < src_end
3410 && src[1] == ']')))))
3411 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3412 | CODING_CATEGORY_MASK_ISO_8BIT);
3413 }
c4825358 3414 else
d46c5b12
KH
3415 /* C is a character of ISO2022 in graphic plane right,
3416 or a SJIS's 1-byte character code (i.e. JISX0201),
3417 or the first byte of BIG5's 2-byte code. */
3418 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3419 | CODING_CATEGORY_MASK_ISO_8BIT
3420 | CODING_CATEGORY_MASK_SJIS
3421 | CODING_CATEGORY_MASK_BIG5);
3422
1397dc18
KH
3423 /* Or, we may have to consider the possibility of CCL. */
3424 if (coding_system_table[CODING_CATEGORY_IDX_CCL]
3425 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
3426 ->spec.ccl.valid_codes)[c])
3427 try |= CODING_CATEGORY_MASK_CCL;
3428
d46c5b12
KH
3429 mask = 0;
3430 if (priorities)
3431 {
3432 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3433 {
5ab13dd0 3434 if (priorities[i] & try & CODING_CATEGORY_MASK_ISO)
d46c5b12 3435 mask = detect_coding_iso2022 (src, src_end);
5ab13dd0 3436 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
d46c5b12 3437 mask = detect_coding_sjis (src, src_end);
5ab13dd0 3438 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
d46c5b12 3439 mask = detect_coding_big5 (src, src_end);
5ab13dd0 3440 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
d46c5b12 3441 mask = detect_coding_emacs_mule (src, src_end);
89fa8b36 3442 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
1397dc18 3443 mask = detect_coding_ccl (src, src_end);
5ab13dd0
RS
3444 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
3445 mask = CODING_CATEGORY_MASK_RAW_TEXT;
3446 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
3447 mask = CODING_CATEGORY_MASK_BINARY;
d46c5b12
KH
3448 if (mask)
3449 goto label_return_highest_only;
3450 }
3451 return CODING_CATEGORY_MASK_RAW_TEXT;
3452 }
3453 if (try & CODING_CATEGORY_MASK_ISO)
3454 mask |= detect_coding_iso2022 (src, src_end);
3455 if (try & CODING_CATEGORY_MASK_SJIS)
3456 mask |= detect_coding_sjis (src, src_end);
3457 if (try & CODING_CATEGORY_MASK_BIG5)
3458 mask |= detect_coding_big5 (src, src_end);
3459 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
1397dc18
KH
3460 mask |= detect_coding_emacs_mule (src, src_end);
3461 if (try & CODING_CATEGORY_MASK_CCL)
3462 mask |= detect_coding_ccl (src, src_end);
c4825358 3463 }
5ab13dd0 3464 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
d46c5b12
KH
3465
3466 label_return_highest_only:
3467 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3468 {
3469 if (mask & priorities[i])
3470 return priorities[i];
3471 }
3472 return CODING_CATEGORY_MASK_RAW_TEXT;
4ed46869
KH
3473}
3474
3475/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3476 The information of the detected coding system is set in CODING. */
3477
3478void
3479detect_coding (coding, src, src_bytes)
3480 struct coding_system *coding;
3481 unsigned char *src;
3482 int src_bytes;
3483{
d46c5b12
KH
3484 unsigned int idx;
3485 int skip, mask, i;
84d60297 3486 Lisp_Object val;
4ed46869 3487
84d60297 3488 val = Vcoding_category_list;
66cfb530 3489 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip);
d46c5b12 3490 coding->heading_ascii = skip;
4ed46869 3491
d46c5b12
KH
3492 if (!mask) return;
3493
3494 /* We found a single coding system of the highest priority in MASK. */
3495 idx = 0;
3496 while (mask && ! (mask & 1)) mask >>= 1, idx++;
3497 if (! mask)
3498 idx = CODING_CATEGORY_IDX_RAW_TEXT;
4ed46869 3499
d46c5b12
KH
3500 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3501
3502 if (coding->eol_type != CODING_EOL_UNDECIDED)
27901516 3503 {
84d60297 3504 Lisp_Object tmp;
d46c5b12 3505
84d60297 3506 tmp = Fget (val, Qeol_type);
d46c5b12
KH
3507 if (VECTORP (tmp))
3508 val = XVECTOR (tmp)->contents[coding->eol_type];
4ed46869 3509 }
d46c5b12
KH
3510 setup_coding_system (val, coding);
3511 /* Set this again because setup_coding_system reset this member. */
3512 coding->heading_ascii = skip;
4ed46869
KH
3513}
3514
d46c5b12
KH
3515/* Detect how end-of-line of a text of length SRC_BYTES pointed by
3516 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3517 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3518
3519 How many non-eol characters are at the head is returned as *SKIP. */
4ed46869 3520
bc4bc72a
RS
3521#define MAX_EOL_CHECK_COUNT 3
3522
d46c5b12
KH
3523static int
3524detect_eol_type (source, src_bytes, skip)
3525 unsigned char *source;
3526 int src_bytes, *skip;
4ed46869 3527{
d46c5b12 3528 unsigned char *src = source, *src_end = src + src_bytes;
4ed46869 3529 unsigned char c;
bc4bc72a
RS
3530 int total = 0; /* How many end-of-lines are found so far. */
3531 int eol_type = CODING_EOL_UNDECIDED;
3532 int this_eol_type;
4ed46869 3533
d46c5b12
KH
3534 *skip = 0;
3535
bc4bc72a 3536 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4ed46869
KH
3537 {
3538 c = *src++;
bc4bc72a 3539 if (c == '\n' || c == '\r')
4ed46869 3540 {
d46c5b12
KH
3541 if (*skip == 0)
3542 *skip = src - 1 - source;
bc4bc72a
RS
3543 total++;
3544 if (c == '\n')
3545 this_eol_type = CODING_EOL_LF;
3546 else if (src >= src_end || *src != '\n')
3547 this_eol_type = CODING_EOL_CR;
4ed46869 3548 else
bc4bc72a
RS
3549 this_eol_type = CODING_EOL_CRLF, src++;
3550
3551 if (eol_type == CODING_EOL_UNDECIDED)
3552 /* This is the first end-of-line. */
3553 eol_type = this_eol_type;
3554 else if (eol_type != this_eol_type)
d46c5b12
KH
3555 {
3556 /* The found type is different from what found before. */
3557 eol_type = CODING_EOL_INCONSISTENT;
3558 break;
3559 }
4ed46869
KH
3560 }
3561 }
bc4bc72a 3562
d46c5b12
KH
3563 if (*skip == 0)
3564 *skip = src_end - source;
85a02ca4 3565 return eol_type;
4ed46869
KH
3566}
3567
3568/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3569 is encoded. If it detects an appropriate format of end-of-line, it
3570 sets the information in *CODING. */
3571
3572void
3573detect_eol (coding, src, src_bytes)
3574 struct coding_system *coding;
3575 unsigned char *src;
3576 int src_bytes;
3577{
4608c386 3578 Lisp_Object val;
d46c5b12
KH
3579 int skip;
3580 int eol_type = detect_eol_type (src, src_bytes, &skip);
3581
3582 if (coding->heading_ascii > skip)
3583 coding->heading_ascii = skip;
3584 else
3585 skip = coding->heading_ascii;
4ed46869 3586
0ef69138 3587 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869 3588 return;
27901516
KH
3589 if (eol_type == CODING_EOL_INCONSISTENT)
3590 {
3591#if 0
3592 /* This code is suppressed until we find a better way to
992f23f2 3593 distinguish raw text file and binary file. */
27901516
KH
3594
3595 /* If we have already detected that the coding is raw-text, the
3596 coding should actually be no-conversion. */
3597 if (coding->type == coding_type_raw_text)
3598 {
3599 setup_coding_system (Qno_conversion, coding);
3600 return;
3601 }
3602 /* Else, let's decode only text code anyway. */
3603#endif /* 0 */
1b2af4b0 3604 eol_type = CODING_EOL_LF;
27901516
KH
3605 }
3606
4608c386 3607 val = Fget (coding->symbol, Qeol_type);
4ed46869 3608 if (VECTORP (val) && XVECTOR (val)->size == 3)
d46c5b12
KH
3609 {
3610 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
3611 coding->heading_ascii = skip;
3612 }
3613}
3614
/* Number of bytes added on top of every conversion buffer size
   estimate.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which the byte length of a text may grow when it is
   decoded by CODING (a pointer to struct coding_system):
     ISO2022            -> 3
     SJIS, BIG5         -> 2
     raw-text           -> 1
     CCL                -> the decoder program's buf_magnification
     anything else      -> 2
   CODING is evaluated more than once, so it must not have side
   effects.  Every use of the argument is parenthesized so that
   expressions such as `&cs' or `p + 1' expand correctly.  */
#define DECODING_BUFFER_MAG(coding)                                     \
  ((coding)->type == coding_type_iso2022                                \
   ? 3                                                                  \
   : (((coding)->type == coding_type_sjis                               \
       || (coding)->type == coding_type_big5)                           \
      ? 2                                                               \
      : ((coding)->type == coding_type_raw_text                         \
         ? 1                                                            \
         : ((coding)->type == coding_type_ccl                           \
            ? (coding)->spec.ccl.decoder.buf_magnification              \
            : 2))))
3627
3628/* Return maximum size (bytes) of a buffer enough for decoding
3629 SRC_BYTES of text encoded in CODING. */
3630
3631int
3632decoding_buffer_size (coding, src_bytes)
3633 struct coding_system *coding;
3634 int src_bytes;
3635{
3636 return (src_bytes * DECODING_BUFFER_MAG (coding)
3637 + CONVERSION_BUFFER_EXTRA_ROOM);
3638}
3639
3640/* Return maximum size (bytes) of a buffer enough for encoding
3641 SRC_BYTES of text to CODING. */
3642
3643int
3644encoding_buffer_size (coding, src_bytes)
3645 struct coding_system *coding;
3646 int src_bytes;
3647{
3648 int magnification;
3649
3650 if (coding->type == coding_type_ccl)
3651 magnification = coding->spec.ccl.encoder.buf_magnification;
3652 else
3653 magnification = 3;
3654
3655 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3656}
3657
/* Size used to start growing the conversion buffer when no usable
   size has been recorded yet.  */
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* The shared work buffer for code conversion, and its size in
   bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.  The shared buffer is reused when it is
   already big enough.  Otherwise a bigger one is allocated, with its
   size found by repeated doubling, and the old one is freed WITHOUT
   copying its contents.

   NOTE(review): this function never returns NULL by itself; what
   happens on memory exhaustion depends on xmalloc -- confirm.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      /* Doubling a zero (or negative) size never reaches SIZE, so
         start from the minimum size when nothing usable has been
         recorded yet.  */
      int real_size = (conversion_buffer_size > 0
                       ? conversion_buffer_size * 2
                       : MINIMUM_CONVERSION_BUFFER_SIZE);

      while (real_size < size) real_size *= 2;
      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
3686
3687int
3688ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
3689 struct coding_system *coding;
3690 unsigned char *source, *destination;
3691 int src_bytes, dst_bytes, encodep;
3692{
3693 struct ccl_program *ccl
3694 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
3695 int result;
3696
ae9ff118 3697 ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
7b179c2d 3698
d46c5b12
KH
3699 coding->produced = ccl_driver (ccl, source, destination,
3700 src_bytes, dst_bytes, &(coding->consumed));
69f76525 3701 coding->produced_char
48942766
KH
3702 = (encodep
3703 ? coding->produced
3704 : multibyte_chars_in_text (destination, coding->produced));
69f76525
KH
3705 coding->consumed_char
3706 = multibyte_chars_in_text (source, coding->consumed);
3707
d46c5b12
KH
3708 switch (ccl->status)
3709 {
3710 case CCL_STAT_SUSPEND_BY_SRC:
3711 result = CODING_FINISH_INSUFFICIENT_SRC;
3712 break;
3713 case CCL_STAT_SUSPEND_BY_DST:
3714 result = CODING_FINISH_INSUFFICIENT_DST;
3715 break;
9864ebce
KH
3716 case CCL_STAT_QUIT:
3717 case CCL_STAT_INVALID_CMD:
3718 result = CODING_FINISH_INTERRUPT;
3719 break;
d46c5b12
KH
3720 default:
3721 result = CODING_FINISH_NORMAL;
3722 break;
3723 }
3724 return result;
4ed46869
KH
3725}
3726
3727/* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
3728 decoding, it may detect coding system and format of end-of-line if
3729 those are not yet decided. */
3730
3731int
d46c5b12 3732decode_coding (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
3733 struct coding_system *coding;
3734 unsigned char *source, *destination;
3735 int src_bytes, dst_bytes;
4ed46869 3736{
d46c5b12 3737 int result;
4ed46869 3738
d4e57bcd 3739 if (src_bytes <= 0
944bd420 3740 && coding->type != coding_type_ccl
d4e57bcd
KH
3741 && ! (coding->mode & CODING_MODE_LAST_BLOCK
3742 && CODING_REQUIRE_FLUSHING (coding)))
4ed46869 3743 {
d46c5b12
KH
3744 coding->produced = coding->produced_char = 0;
3745 coding->consumed = coding->consumed_char = 0;
fb88bf2d 3746 coding->fake_multibyte = 0;
d46c5b12 3747 return CODING_FINISH_NORMAL;
4ed46869
KH
3748 }
3749
0ef69138 3750 if (coding->type == coding_type_undecided)
4ed46869
KH
3751 detect_coding (coding, source, src_bytes);
3752
0ef69138 3753 if (coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
3754 detect_eol (coding, source, src_bytes);
3755
4ed46869
KH
3756 switch (coding->type)
3757 {
0ef69138
KH
3758 case coding_type_emacs_mule:
3759 case coding_type_undecided:
27901516 3760 case coding_type_raw_text:
4ed46869 3761 if (coding->eol_type == CODING_EOL_LF
0ef69138 3762 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869 3763 goto label_no_conversion;
d46c5b12 3764 result = decode_eol (coding, source, destination, src_bytes, dst_bytes);
4ed46869
KH
3765 break;
3766
3767 case coding_type_sjis:
d46c5b12
KH
3768 result = decode_coding_sjis_big5 (coding, source, destination,
3769 src_bytes, dst_bytes, 1);
4ed46869
KH
3770 break;
3771
3772 case coding_type_iso2022:
d46c5b12
KH
3773 result = decode_coding_iso2022 (coding, source, destination,
3774 src_bytes, dst_bytes);
4ed46869
KH
3775 break;
3776
3777 case coding_type_big5:
d46c5b12
KH
3778 result = decode_coding_sjis_big5 (coding, source, destination,
3779 src_bytes, dst_bytes, 0);
4ed46869
KH
3780 break;
3781
3782 case coding_type_ccl:
d46c5b12
KH
3783 result = ccl_coding_driver (coding, source, destination,
3784 src_bytes, dst_bytes, 0);
3785 break;
3786
3787 default: /* i.e. case coding_type_no_conversion: */
3788 label_no_conversion:
3789 if (dst_bytes && src_bytes > dst_bytes)
3790 {
3791 coding->produced = dst_bytes;
3792 result = CODING_FINISH_INSUFFICIENT_DST;
3793 }
3794 else
3795 {
3796 coding->produced = src_bytes;
3797 result = CODING_FINISH_NORMAL;
3798 }
3799 if (dst_bytes)
3800 bcopy (source, destination, coding->produced);
3801 else
3802 safe_bcopy (source, destination, coding->produced);
fb88bf2d 3803 coding->fake_multibyte = 1;
d46c5b12
KH
3804 coding->consumed
3805 = coding->consumed_char = coding->produced_char = coding->produced;
4ed46869
KH
3806 break;
3807 }
3808
d46c5b12 3809 return result;
4ed46869
KH
3810}
3811
3812/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
3813
3814int
d46c5b12 3815encode_coding (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
3816 struct coding_system *coding;
3817 unsigned char *source, *destination;
3818 int src_bytes, dst_bytes;
4ed46869 3819{
d46c5b12 3820 int result;
4ed46869 3821
d4e57bcd
KH
3822 if (src_bytes <= 0
3823 && ! (coding->mode & CODING_MODE_LAST_BLOCK
3824 && CODING_REQUIRE_FLUSHING (coding)))
4ed46869 3825 {
d46c5b12
KH
3826 coding->produced = coding->produced_char = 0;
3827 coding->consumed = coding->consumed_char = 0;
fb88bf2d 3828 coding->fake_multibyte = 0;
d46c5b12
KH
3829 return CODING_FINISH_NORMAL;
3830 }
4ed46869 3831
d46c5b12
KH
3832 switch (coding->type)
3833 {
0ef69138
KH
3834 case coding_type_emacs_mule:
3835 case coding_type_undecided:
27901516 3836 case coding_type_raw_text:
4ed46869 3837 if (coding->eol_type == CODING_EOL_LF
0ef69138 3838 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869 3839 goto label_no_conversion;
d46c5b12 3840 result = encode_eol (coding, source, destination, src_bytes, dst_bytes);
4ed46869
KH
3841 break;
3842
3843 case coding_type_sjis:
d46c5b12
KH
3844 result = encode_coding_sjis_big5 (coding, source, destination,
3845 src_bytes, dst_bytes, 1);
4ed46869
KH
3846 break;
3847
3848 case coding_type_iso2022:
d46c5b12
KH
3849 result = encode_coding_iso2022 (coding, source, destination,
3850 src_bytes, dst_bytes);
4ed46869
KH
3851 break;
3852
3853 case coding_type_big5:
d46c5b12
KH
3854 result = encode_coding_sjis_big5 (coding, source, destination,
3855 src_bytes, dst_bytes, 0);
4ed46869
KH
3856 break;
3857
3858 case coding_type_ccl:
d46c5b12
KH
3859 result = ccl_coding_driver (coding, source, destination,
3860 src_bytes, dst_bytes, 1);
3861 break;
3862
3863 default: /* i.e. case coding_type_no_conversion: */
3864 label_no_conversion:
3865 if (dst_bytes && src_bytes > dst_bytes)
3866 {
3867 coding->produced = dst_bytes;
3868 result = CODING_FINISH_INSUFFICIENT_DST;
3869 }
3870 else
3871 {
3872 coding->produced = src_bytes;
3873 result = CODING_FINISH_NORMAL;
3874 }
3875 if (dst_bytes)
3876 bcopy (source, destination, coding->produced);
3877 else
3878 safe_bcopy (source, destination, coding->produced);
3879 if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
3880 {
3881 unsigned char *p = destination, *pend = p + coding->produced;
3882 while (p < pend)
3883 if (*p++ == '\015') p[-1] = '\n';
3884 }
fb88bf2d 3885 coding->fake_multibyte = 1;
d46c5b12
KH
3886 coding->consumed
3887 = coding->consumed_char = coding->produced_char = coding->produced;
4ed46869
KH
3888 break;
3889 }
3890
d46c5b12 3891 return result;
4ed46869
KH
3892}
3893
fb88bf2d
KH
3894/* Scan text in the region between *BEG and *END (byte positions),
3895 skip characters which we don't have to decode by coding system
3896 CODING at the head and tail, then set *BEG and *END to the region
3897 of the text we actually have to convert. The caller should move
3898 the gap out of the region in advance.
4ed46869 3899
d46c5b12
KH
3900 If STR is not NULL, *BEG and *END are indices into STR. */
3901
3902static void
3903shrink_decoding_region (beg, end, coding, str)
3904 int *beg, *end;
3905 struct coding_system *coding;
3906 unsigned char *str;
3907{
fb88bf2d 3908 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
d46c5b12 3909 int eol_conversion;
88993dfd 3910 Lisp_Object translation_table;
d46c5b12
KH
3911
3912 if (coding->type == coding_type_ccl
3913 || coding->type == coding_type_undecided
3914 || !NILP (coding->post_read_conversion))
3915 {
3916 /* We can't skip any data. */
3917 return;
3918 }
3919 else if (coding->type == coding_type_no_conversion)
3920 {
fb88bf2d
KH
3921 /* We need no conversion, but don't have to skip any data here.
3922 Decoding routine handles them effectively anyway. */
d46c5b12
KH
3923 return;
3924 }
3925
88993dfd
KH
3926 translation_table = coding->translation_table_for_decode;
3927 if (NILP (translation_table) && !NILP (Venable_character_translation))
3928 translation_table = Vstandard_translation_table_for_decode;
3929 if (CHAR_TABLE_P (translation_table))
3930 {
3931 int i;
3932 for (i = 0; i < 128; i++)
3933 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
3934 break;
3935 if (i < 128)
3936 /* Some ASCII character should be tranlsated. We give up
3937 shrinking. */
3938 return;
3939 }
3940
aa60dea6
KH
3941 eol_conversion = (coding->eol_type != CODING_EOL_LF);
3942
3943 if ((! eol_conversion) && (coding->heading_ascii >= 0))
d46c5b12
KH
3944 /* Detection routine has already found how much we can skip at the
3945 head. */
3946 *beg += coding->heading_ascii;
3947
3948 if (str)
3949 {
3950 begp_orig = begp = str + *beg;
3951 endp_orig = endp = str + *end;
3952 }
3953 else
3954 {
fb88bf2d 3955 begp_orig = begp = BYTE_POS_ADDR (*beg);
d46c5b12
KH
3956 endp_orig = endp = begp + *end - *beg;
3957 }
3958
d46c5b12
KH
3959 switch (coding->type)
3960 {
3961 case coding_type_emacs_mule:
3962 case coding_type_raw_text:
3963 if (eol_conversion)
3964 {
3965 if (coding->heading_ascii < 0)
fb88bf2d 3966 while (begp < endp && *begp != '\r' && *begp < 0x80) begp++;
ee59c65f 3967 while (begp < endp && endp[-1] != '\r' && endp[-1] < 0x80)
fb88bf2d 3968 endp--;
ee59c65f
RS
3969 /* Do not consider LF as ascii if preceded by CR, since that
3970 confuses eol decoding. */
3971 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
3972 endp++;
d46c5b12
KH
3973 }
3974 else
3975 begp = endp;
3976 break;
3977
3978 case coding_type_sjis:
3979 case coding_type_big5:
3980 /* We can skip all ASCII characters at the head. */
3981 if (coding->heading_ascii < 0)
3982 {
3983 if (eol_conversion)
de9d083c 3984 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
d46c5b12
KH
3985 else
3986 while (begp < endp && *begp < 0x80) begp++;
3987 }
3988 /* We can skip all ASCII characters at the tail except for the
3989 second byte of SJIS or BIG5 code. */
3990 if (eol_conversion)
de9d083c 3991 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
d46c5b12
KH
3992 else
3993 while (begp < endp && endp[-1] < 0x80) endp--;
ee59c65f
RS
3994 /* Do not consider LF as ascii if preceded by CR, since that
3995 confuses eol decoding. */
3996 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
3997 endp++;
d46c5b12
KH
3998 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
3999 endp++;
4000 break;
4001
4002 default: /* i.e. case coding_type_iso2022: */
622fece5
KH
4003 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4004 /* We can't skip any data. */
4005 break;
d46c5b12
KH
4006 if (coding->heading_ascii < 0)
4007 {
d46c5b12
KH
4008 /* We can skip all ASCII characters at the head except for a
4009 few control codes. */
4010 while (begp < endp && (c = *begp) < 0x80
4011 && c != ISO_CODE_CR && c != ISO_CODE_SO
4012 && c != ISO_CODE_SI && c != ISO_CODE_ESC
4013 && (!eol_conversion || c != ISO_CODE_LF))
4014 begp++;
4015 }
4016 switch (coding->category_idx)
4017 {
4018 case CODING_CATEGORY_IDX_ISO_8_1:
4019 case CODING_CATEGORY_IDX_ISO_8_2:
4020 /* We can skip all ASCII characters at the tail. */
4021 if (eol_conversion)
de9d083c 4022 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
d46c5b12
KH
4023 else
4024 while (begp < endp && endp[-1] < 0x80) endp--;
ee59c65f
RS
4025 /* Do not consider LF as ascii if preceded by CR, since that
4026 confuses eol decoding. */
4027 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4028 endp++;
d46c5b12
KH
4029 break;
4030
4031 case CODING_CATEGORY_IDX_ISO_7:
4032 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
de79a6a5
KH
4033 {
4034 /* We can skip all charactes at the tail except for 8-bit
4035 codes and ESC and the following 2-byte at the tail. */
4036 unsigned char *eight_bit = NULL;
4037
4038 if (eol_conversion)
4039 while (begp < endp
4040 && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4041 {
4042 if (!eight_bit && c & 0x80) eight_bit = endp;
4043 endp--;
4044 }
4045 else
4046 while (begp < endp
4047 && (c = endp[-1]) != ISO_CODE_ESC)
4048 {
4049 if (!eight_bit && c & 0x80) eight_bit = endp;
4050 endp--;
4051 }
4052 /* Do not consider LF as ascii if preceded by CR, since that
4053 confuses eol decoding. */
4054 if (begp < endp && endp < endp_orig
4055 && endp[-1] == '\r' && endp[0] == '\n')
4056 endp++;
4057 if (begp < endp && endp[-1] == ISO_CODE_ESC)
4058 {
4059 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
4060 /* This is an ASCII designation sequence. We can
4061 surely skip the tail. But, if we have
4062 encountered an 8-bit code, skip only the codes
4063 after that. */
4064 endp = eight_bit ? eight_bit : endp + 2;
4065 else
4066 /* Hmmm, we can't skip the tail. */
4067 endp = endp_orig;
4068 }
4069 else if (eight_bit)
4070 endp = eight_bit;
4071 }
d46c5b12
KH
4072 }
4073 }
4074 *beg += begp - begp_orig;
4075 *end += endp - endp_orig;
4076 return;
4077}
4078
4079/* Like shrink_decoding_region but for encoding. */
4080
4081static void
4082shrink_encoding_region (beg, end, coding, str)
4083 int *beg, *end;
4084 struct coding_system *coding;
4085 unsigned char *str;
4086{
4087 unsigned char *begp_orig, *begp, *endp_orig, *endp;
4088 int eol_conversion;
88993dfd 4089 Lisp_Object translation_table;
d46c5b12
KH
4090
4091 if (coding->type == coding_type_ccl)
4092 /* We can't skip any data. */
4093 return;
4094 else if (coding->type == coding_type_no_conversion)
4095 {
4096 /* We need no conversion. */
4097 *beg = *end;
4098 return;
4099 }
4100
88993dfd
KH
4101 translation_table = coding->translation_table_for_encode;
4102 if (NILP (translation_table) && !NILP (Venable_character_translation))
4103 translation_table = Vstandard_translation_table_for_encode;
4104 if (CHAR_TABLE_P (translation_table))
4105 {
4106 int i;
4107 for (i = 0; i < 128; i++)
4108 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4109 break;
4110 if (i < 128)
4111 /* Some ASCII character should be tranlsated. We give up
4112 shrinking. */
4113 return;
4114 }
4115
d46c5b12
KH
4116 if (str)
4117 {
4118 begp_orig = begp = str + *beg;
4119 endp_orig = endp = str + *end;
4120 }
4121 else
4122 {
fb88bf2d 4123 begp_orig = begp = BYTE_POS_ADDR (*beg);
d46c5b12
KH
4124 endp_orig = endp = begp + *end - *beg;
4125 }
4126
4127 eol_conversion = (coding->eol_type == CODING_EOL_CR
4128 || coding->eol_type == CODING_EOL_CRLF);
4129
4130 /* Here, we don't have to check coding->pre_write_conversion because
4131 the caller is expected to have handled it already. */
4132 switch (coding->type)
4133 {
4134 case coding_type_undecided:
4135 case coding_type_emacs_mule:
4136 case coding_type_raw_text:
4137 if (eol_conversion)
4138 {
4139 while (begp < endp && *begp != '\n') begp++;
4140 while (begp < endp && endp[-1] != '\n') endp--;
4141 }
4142 else
4143 begp = endp;
4144 break;
4145
4146 case coding_type_iso2022:
622fece5
KH
4147 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4148 /* We can't skip any data. */
4149 break;
d46c5b12
KH
4150 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
4151 {
4152 unsigned char *bol = begp;
4153 while (begp < endp && *begp < 0x80)
4154 {
4155 begp++;
4156 if (begp[-1] == '\n')
4157 bol = begp;
4158 }
4159 begp = bol;
4160 goto label_skip_tail;
4161 }
4162 /* fall down ... */
4163
4164 default:
4165 /* We can skip all ASCII characters at the head and tail. */
4166 if (eol_conversion)
4167 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
4168 else
4169 while (begp < endp && *begp < 0x80) begp++;
4170 label_skip_tail:
4171 if (eol_conversion)
4172 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
4173 else
4174 while (begp < endp && *(endp - 1) < 0x80) endp--;
4175 break;
4176 }
4177
4178 *beg += begp - begp_orig;
4179 *end += endp - endp_orig;
4180 return;
4181}
4182
88993dfd
KH
/* Shrinking the conversion region has some overhead of its own, so it
   is attempted only for regions longer than this many bytes.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the region *BEG..*END with shrink_encoding_region (if
   ENCODEP is nonzero) or shrink_decoding_region (otherwise), unless
   the region is not longer than the threshold above.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do                                                                    \
    {                                                                   \
      if (*(end) - *(beg) <= shrink_conversion_region_threshhold)       \
        break;                                                          \
      if (encodep)                                                      \
        shrink_encoding_region (beg, end, coding, str);                 \
      else                                                              \
        shrink_decoding_region (beg, end, coding, str);                 \
    }                                                                   \
  while (0)
4196
d46c5b12 4197/* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
fb88bf2d
KH
4198 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4199 coding system CODING, and return the status code of code conversion
4200 (currently, this value has no meaning).
4201
4202 How many characters (and bytes) are converted to how many
4203 characters (and bytes) are recorded in members of the structure
4204 CODING.
d46c5b12 4205
6e44253b 4206 If REPLACE is nonzero, we do various things as if the original text
d46c5b12 4207 is deleted and a new text is inserted. See the comments in
6e44253b 4208 replace_range (insdel.c) to know what we are doing. */
4ed46869
KH
4209
4210int
6e44253b
KH
4211code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
4212 int from, from_byte, to, to_byte, encodep, replace;
4ed46869 4213 struct coding_system *coding;
4ed46869 4214{
fb88bf2d
KH
4215 int len = to - from, len_byte = to_byte - from_byte;
4216 int require, inserted, inserted_byte;
12410ef1 4217 int head_skip, tail_skip, total_skip;
84d60297 4218 Lisp_Object saved_coding_symbol;
fb88bf2d
KH
4219 int multibyte = !NILP (current_buffer->enable_multibyte_characters);
4220 int first = 1;
4221 int fake_multibyte = 0;
4222 unsigned char *src, *dst;
84d60297 4223 Lisp_Object deletion;
e133c8fa 4224 int orig_point = PT, orig_len = len;
6abb9bd9 4225 int prev_Z;
84d60297
RS
4226
4227 deletion = Qnil;
4228 saved_coding_symbol = Qnil;
d46c5b12 4229
83fa074f 4230 if (from < PT && PT < to)
e133c8fa
KH
4231 {
4232 TEMP_SET_PT_BOTH (from, from_byte);
4233 orig_point = from;
4234 }
83fa074f 4235
6e44253b 4236 if (replace)
d46c5b12 4237 {
fb88bf2d
KH
4238 int saved_from = from;
4239
d46c5b12 4240 prepare_to_modify_buffer (from, to, &from);
fb88bf2d
KH
4241 if (saved_from != from)
4242 {
4243 to = from + len;
4244 if (multibyte)
4245 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
4246 else
4247 from_byte = from, to_byte = to;
4248 len_byte = to_byte - from_byte;
4249 }
d46c5b12 4250 }
d46c5b12
KH
4251
4252 if (! encodep && CODING_REQUIRE_DETECTION (coding))
4253 {
12410ef1 4254 /* We must detect encoding of text and eol format. */
d46c5b12
KH
4255
4256 if (from < GPT && to > GPT)
4257 move_gap_both (from, from_byte);
4258 if (coding->type == coding_type_undecided)
4259 {
fb88bf2d 4260 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
d46c5b12 4261 if (coding->type == coding_type_undecided)
12410ef1
KH
4262 /* It seems that the text contains only ASCII, but we
4263 should not left it undecided because the deeper
4264 decoding routine (decode_coding) tries to detect the
4265 encodings again in vain. */
d46c5b12
KH
4266 coding->type = coding_type_emacs_mule;
4267 }
4268 if (coding->eol_type == CODING_EOL_UNDECIDED)
4269 {
4270 saved_coding_symbol = coding->symbol;
4271 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
4272 if (coding->eol_type == CODING_EOL_UNDECIDED)
4273 coding->eol_type = CODING_EOL_LF;
4274 /* We had better recover the original eol format if we
4275 encounter an inconsitent eol format while decoding. */
4276 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4277 }
4278 }
4279
fb88bf2d
KH
4280 coding->consumed_char = len, coding->consumed = len_byte;
4281
d46c5b12
KH
4282 if (encodep
4283 ? ! CODING_REQUIRE_ENCODING (coding)
4284 : ! CODING_REQUIRE_DECODING (coding))
fb88bf2d
KH
4285 {
4286 coding->produced = len_byte;
12410ef1
KH
4287 if (multibyte
4288 && ! replace
4289 /* See the comment of the member heading_ascii in coding.h. */
4290 && coding->heading_ascii < len_byte)
fb88bf2d 4291 {
6e44253b
KH
4292 /* We still may have to combine byte at the head and the
4293 tail of the text in the region. */
12410ef1 4294 if (from < GPT && GPT < to)
6e44253b 4295 move_gap_both (to, to_byte);
12410ef1
KH
4296 len = multibyte_chars_in_text (BYTE_POS_ADDR (from_byte), len_byte);
4297 adjust_after_insert (from, from_byte, to, to_byte, len);
4298 coding->produced_char = len;
fb88bf2d
KH
4299 }
4300 else
68e3a8f1
AS
4301 {
4302 if (!replace)
4303 adjust_after_insert (from, from_byte, to, to_byte, len_byte);
4304 coding->produced_char = len_byte;
4305 }
fb88bf2d
KH
4306 return 0;
4307 }
d46c5b12
KH
4308
4309 /* Now we convert the text. */
4310
4311 /* For encoding, we must process pre-write-conversion in advance. */
4312 if (encodep
d46c5b12
KH
4313 && ! NILP (coding->pre_write_conversion)
4314 && SYMBOLP (coding->pre_write_conversion)
4315 && ! NILP (Ffboundp (coding->pre_write_conversion)))
4316 {
2b4f9037
KH
4317 /* The function in pre-write-conversion may put a new text in a
4318 new buffer. */
0007bdd0
KH
4319 struct buffer *prev = current_buffer;
4320 Lisp_Object new;
d46c5b12 4321
b39f748c
AS
4322 call2 (coding->pre_write_conversion,
4323 make_number (from), make_number (to));
d46c5b12
KH
4324 if (current_buffer != prev)
4325 {
4326 len = ZV - BEGV;
0007bdd0 4327 new = Fcurrent_buffer ();
d46c5b12 4328 set_buffer_internal_1 (prev);
ddbc19ff 4329 del_range_2 (from, from_byte, to, to_byte);
e133c8fa 4330 TEMP_SET_PT_BOTH (from, from_byte);
0007bdd0
KH
4331 insert_from_buffer (XBUFFER (new), 1, len, 0);
4332 Fkill_buffer (new);
e133c8fa
KH
4333 if (orig_point >= to)
4334 orig_point += len - orig_len;
4335 else if (orig_point > from)
4336 orig_point = from;
4337 orig_len = len;
d46c5b12 4338 to = from + len;
e133c8fa 4339 from_byte = multibyte ? CHAR_TO_BYTE (from) : from_byte;
fb88bf2d 4340 to_byte = multibyte ? CHAR_TO_BYTE (to) : to;
d46c5b12 4341 len_byte = to_byte - from_byte;
e133c8fa 4342 TEMP_SET_PT_BOTH (from, from_byte);
d46c5b12
KH
4343 }
4344 }
4345
12410ef1
KH
4346 if (replace)
4347 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
4348
d46c5b12 4349 /* Try to skip the heading and tailing ASCIIs. */
12410ef1
KH
4350 {
4351 int from_byte_orig = from_byte, to_byte_orig = to_byte;
4352
4353 if (from < GPT && GPT < to)
4354 move_gap_both (from, from_byte);
88993dfd 4355 SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
d4e57bcd 4356 if (from_byte == to_byte
944bd420 4357 && coding->type != coding_type_ccl
d4e57bcd
KH
4358 && ! (coding->mode & CODING_MODE_LAST_BLOCK
4359 && CODING_REQUIRE_FLUSHING (coding)))
12410ef1
KH
4360 {
4361 coding->produced = len_byte;
4362 coding->produced_char = multibyte ? len : len_byte;
4363 if (!replace)
4364 /* We must record and adjust for this new text now. */
4365 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
4366 return 0;
4367 }
fb88bf2d 4368
12410ef1
KH
4369 head_skip = from_byte - from_byte_orig;
4370 tail_skip = to_byte_orig - to_byte;
4371 total_skip = head_skip + tail_skip;
4372 from += head_skip;
4373 to -= tail_skip;
4374 len -= total_skip; len_byte -= total_skip;
4375 }
d46c5b12 4376
88993dfd 4377 /* The code conversion routine can not preserve text properties for
55d8d769
KH
4378 now. So, we must remove all text properties in the region.
4379 Here, we must suppress all modification hooks. */
88993dfd 4380 if (replace)
55d8d769
KH
4381 {
4382 int saved_inhibit_modification_hooks = inhibit_modification_hooks;
4383 inhibit_modification_hooks = 1;
4384 Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
4385 inhibit_modification_hooks = saved_inhibit_modification_hooks;
4386 }
88993dfd 4387
fb88bf2d
KH
4388 /* For converion, we must put the gap before the text in addition to
4389 making the gap larger for efficient decoding. The required gap
4390 size starts from 2000 which is the magic number used in make_gap.
4391 But, after one batch of conversion, it will be incremented if we
4392 find that it is not enough . */
d46c5b12
KH
4393 require = 2000;
4394
4395 if (GAP_SIZE < require)
4396 make_gap (require - GAP_SIZE);
4397 move_gap_both (from, from_byte);
4398
d46c5b12 4399 inserted = inserted_byte = 0;
fb88bf2d
KH
4400 src = GAP_END_ADDR, dst = GPT_ADDR;
4401
4402 GAP_SIZE += len_byte;
4403 ZV -= len;
4404 Z -= len;
4405 ZV_BYTE -= len_byte;
4406 Z_BYTE -= len_byte;
4407
f2558efd
KH
4408 if (GPT - BEG < beg_unchanged)
4409 beg_unchanged = GPT - BEG;
4410 if (Z - GPT < end_unchanged)
4411 end_unchanged = Z - GPT;
4412
d46c5b12
KH
4413 for (;;)
4414 {
fb88bf2d 4415 int result;
d46c5b12
KH
4416
4417 /* The buffer memory is changed from:
fb88bf2d
KH
4418 +--------+converted-text+---------+-------original-text------+---+
4419 |<-from->|<--inserted-->|---------|<-----------len---------->|---|
4420 |<------------------- GAP_SIZE -------------------->| */
d46c5b12 4421 if (encodep)
fb88bf2d 4422 result = encode_coding (coding, src, dst, len_byte, 0);
d46c5b12 4423 else
fb88bf2d 4424 result = decode_coding (coding, src, dst, len_byte, 0);
d46c5b12
KH
4425 /* to:
4426 +--------+-------converted-text--------+--+---original-text--+---+
fb88bf2d
KH
4427 |<-from->|<--inserted-->|<--produced-->|--|<-(len-consumed)->|---|
4428 |<------------------- GAP_SIZE -------------------->| */
4429 if (coding->fake_multibyte)
4430 fake_multibyte = 1;
d46c5b12 4431
fb88bf2d
KH
4432 if (!encodep && !multibyte)
4433 coding->produced_char = coding->produced;
d46c5b12
KH
4434 inserted += coding->produced_char;
4435 inserted_byte += coding->produced;
d46c5b12 4436 len_byte -= coding->consumed;
fb88bf2d
KH
4437 src += coding->consumed;
4438 dst += inserted_byte;
d46c5b12 4439
9864ebce
KH
4440 if (result == CODING_FINISH_NORMAL)
4441 {
4442 src += len_byte;
4443 break;
4444 }
d46c5b12
KH
4445 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4446 {
fb88bf2d 4447 unsigned char *pend = dst, *p = pend - inserted_byte;
d46c5b12
KH
4448
4449 /* Encode LFs back to the original eol format (CR or CRLF). */
4450 if (coding->eol_type == CODING_EOL_CR)
4451 {
4452 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
4453 }
4454 else
4455 {
d46c5b12
KH
4456 int count = 0;
4457
fb88bf2d
KH
4458 while (p < pend) if (*p++ == '\n') count++;
4459 if (src - dst < count)
d46c5b12 4460 {
fb88bf2d
KH
4461 /* We don't have sufficient room for putting LFs
4462 back to CRLF. We must record converted and
4463 not-yet-converted text back to the buffer
4464 content, enlarge the gap, then record them out of
4465 the buffer contents again. */
4466 int add = len_byte + inserted_byte;
4467
4468 GAP_SIZE -= add;
4469 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4470 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4471 make_gap (count - GAP_SIZE);
4472 GAP_SIZE += add;
4473 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4474 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4475 /* Don't forget to update SRC, DST, and PEND. */
4476 src = GAP_END_ADDR - len_byte;
4477 dst = GPT_ADDR + inserted_byte;
4478 pend = dst;
d46c5b12 4479 }
d46c5b12
KH
4480 inserted += count;
4481 inserted_byte += count;
fb88bf2d
KH
4482 coding->produced += count;
4483 p = dst = pend + count;
4484 while (count)
4485 {
4486 *--p = *--pend;
4487 if (*p == '\n') count--, *--p = '\r';
4488 }
d46c5b12
KH
4489 }
4490
4491 /* Suppress eol-format conversion in the further conversion. */
4492 coding->eol_type = CODING_EOL_LF;
4493
4494 /* Restore the original symbol. */
4495 coding->symbol = saved_coding_symbol;
fb88bf2d
KH
4496
4497 continue;
d46c5b12
KH
4498 }
4499 if (len_byte <= 0)
944bd420
KH
4500 {
4501 if (coding->type != coding_type_ccl
4502 || coding->mode & CODING_MODE_LAST_BLOCK)
4503 break;
4504 coding->mode |= CODING_MODE_LAST_BLOCK;
4505 continue;
4506 }
d46c5b12
KH
4507 if (result == CODING_FINISH_INSUFFICIENT_SRC)
4508 {
4509 /* The source text ends in invalid codes. Let's just
4510 make them valid buffer contents, and finish conversion. */
fb88bf2d 4511 inserted += len_byte;
d46c5b12 4512 inserted_byte += len_byte;
fb88bf2d 4513 while (len_byte--)
ee59c65f 4514 *dst++ = *src++;
fb88bf2d 4515 fake_multibyte = 1;
d46c5b12
KH
4516 break;
4517 }
9864ebce
KH
4518 if (result == CODING_FINISH_INTERRUPT)
4519 {
4520 /* The conversion procedure was interrupted by a user. */
4521 fake_multibyte = 1;
4522 break;
4523 }
4524 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
4525 if (coding->consumed < 1)
4526 {
4527 /* It's quite strange to require more memory without
4528 consuming any bytes. Perhaps CCL program bug. */
4529 fake_multibyte = 1;
4530 break;
4531 }
fb88bf2d
KH
4532 if (first)
4533 {
4534 /* We have just done the first batch of conversion which was
4535 stoped because of insufficient gap. Let's reconsider the
4536 required gap size (i.e. SRT - DST) now.
4537
4538 We have converted ORIG bytes (== coding->consumed) into
4539 NEW bytes (coding->produced). To convert the remaining
4540 LEN bytes, we may need REQUIRE bytes of gap, where:
4541 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
4542 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
4543 Here, we are sure that NEW >= ORIG. */
6e44253b
KH
4544 float ratio = coding->produced - coding->consumed;
4545 ratio /= coding->consumed;
4546 require = len_byte * ratio;
fb88bf2d
KH
4547 first = 0;
4548 }
4549 if ((src - dst) < (require + 2000))
4550 {
4551 /* See the comment above the previous call of make_gap. */
4552 int add = len_byte + inserted_byte;
4553
4554 GAP_SIZE -= add;
4555 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4556 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4557 make_gap (require + 2000);
4558 GAP_SIZE += add;
4559 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4560 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4561 /* Don't forget to update SRC, DST. */
4562 src = GAP_END_ADDR - len_byte;
4563 dst = GPT_ADDR + inserted_byte;
4564 }
d46c5b12 4565 }
fb88bf2d
KH
4566 if (src - dst > 0) *dst = 0; /* Put an anchor. */
4567
2b4f9037 4568 if (multibyte
88993dfd
KH
4569 && (encodep
4570 || fake_multibyte
4571 || (to - from) != (to_byte - from_byte)))
2b4f9037 4572 inserted = multibyte_chars_in_text (GPT_ADDR, inserted_byte);
7553d0e1 4573
12410ef1
KH
4574 /* If we have shrinked the conversion area, adjust it now. */
4575 if (total_skip > 0)
4576 {
4577 if (tail_skip > 0)
4578 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
4579 inserted += total_skip; inserted_byte += total_skip;
4580 GAP_SIZE += total_skip;
4581 GPT -= head_skip; GPT_BYTE -= head_skip;
4582 ZV -= total_skip; ZV_BYTE -= total_skip;
4583 Z -= total_skip; Z_BYTE -= total_skip;
4584 from -= head_skip; from_byte -= head_skip;
4585 to += tail_skip; to_byte += tail_skip;
4586 }
4587
6abb9bd9 4588 prev_Z = Z;
12410ef1 4589 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
6abb9bd9 4590 inserted = Z - prev_Z;
4ed46869 4591
2b4f9037 4592 if (! encodep && ! NILP (coding->post_read_conversion))
d46c5b12 4593 {
2b4f9037 4594 Lisp_Object val;
4ed46869 4595
e133c8fa
KH
4596 if (from != PT)
4597 TEMP_SET_PT_BOTH (from, from_byte);
6abb9bd9 4598 prev_Z = Z;
2b4f9037 4599 val = call1 (coding->post_read_conversion, make_number (inserted));
6abb9bd9 4600 CHECK_NUMBER (val, 0);
944bd420 4601 inserted += Z - prev_Z;
e133c8fa
KH
4602 }
4603
4604 if (orig_point >= from)
4605 {
4606 if (orig_point >= from + orig_len)
4607 orig_point += inserted - orig_len;
4608 else
4609 orig_point = from;
4610 TEMP_SET_PT (orig_point);
d46c5b12 4611 }
4ed46869 4612
2b4f9037
KH
4613 signal_after_change (from, to - from, inserted);
4614
fb88bf2d 4615 {
12410ef1
KH
4616 coding->consumed = to_byte - from_byte;
4617 coding->consumed_char = to - from;
4618 coding->produced = inserted_byte;
4619 coding->produced_char = inserted;
fb88bf2d 4620 }
7553d0e1 4621
fb88bf2d 4622 return 0;
d46c5b12
KH
4623}
4624
4625Lisp_Object
4626code_convert_string (str, coding, encodep, nocopy)
4627 Lisp_Object str;
4ed46869 4628 struct coding_system *coding;
d46c5b12 4629 int encodep, nocopy;
4ed46869 4630{
d46c5b12
KH
4631 int len;
4632 char *buf;
fc932ac6
RS
4633 int from = 0, to = XSTRING (str)->size;
4634 int to_byte = STRING_BYTES (XSTRING (str));
d46c5b12 4635 struct gcpro gcpro1;
84d60297 4636 Lisp_Object saved_coding_symbol;
d46c5b12 4637 int result;
4ed46869 4638
84d60297 4639 saved_coding_symbol = Qnil;
d46c5b12
KH
4640 if (encodep && !NILP (coding->pre_write_conversion)
4641 || !encodep && !NILP (coding->post_read_conversion))
4642 {
4643 /* Since we have to call Lisp functions which assume target text
4644 is in a buffer, after setting a temporary buffer, call
4645 code_convert_region. */
4646 int count = specpdl_ptr - specpdl;
4647 struct buffer *prev = current_buffer;
e133c8fa 4648
d46c5b12
KH
4649 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
4650 temp_output_buffer_setup (" *code-converting-work*");
4651 set_buffer_internal (XBUFFER (Vstandard_output));
4652 if (encodep)
4653 insert_from_string (str, 0, 0, to, to_byte, 0);
4654 else
4655 {
4656 /* We must insert the contents of STR as is without
4657 unibyte<->multibyte conversion. */
4658 current_buffer->enable_multibyte_characters = Qnil;
4659 insert_from_string (str, 0, 0, to_byte, to_byte, 0);
4660 current_buffer->enable_multibyte_characters = Qt;
4661 }
fb88bf2d 4662 code_convert_region (BEGV, BEGV_BYTE, ZV, ZV_BYTE, coding, encodep, 1);
d46c5b12
KH
4663 if (encodep)
4664 /* We must return the buffer contents as unibyte string. */
4665 current_buffer->enable_multibyte_characters = Qnil;
4666 str = make_buffer_string (BEGV, ZV, 0);
4667 set_buffer_internal (prev);
4668 return unbind_to (count, str);
4669 }
4ed46869 4670
d46c5b12
KH
4671 if (! encodep && CODING_REQUIRE_DETECTION (coding))
4672 {
4673 /* See the comments in code_convert_region. */
4674 if (coding->type == coding_type_undecided)
4675 {
4676 detect_coding (coding, XSTRING (str)->data, to_byte);
4677 if (coding->type == coding_type_undecided)
4678 coding->type = coding_type_emacs_mule;
4679 }
4680 if (coding->eol_type == CODING_EOL_UNDECIDED)
4681 {
4682 saved_coding_symbol = coding->symbol;
4683 detect_eol (coding, XSTRING (str)->data, to_byte);
4684 if (coding->eol_type == CODING_EOL_UNDECIDED)
4685 coding->eol_type = CODING_EOL_LF;
4686 /* We had better recover the original eol format if we
4687 encounter an inconsitent eol format while decoding. */
4688 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4689 }
4690 }
4ed46869 4691
d46c5b12
KH
4692 if (encodep
4693 ? ! CODING_REQUIRE_ENCODING (coding)
4694 : ! CODING_REQUIRE_DECODING (coding))
4695 from = to_byte;
4696 else
4697 {
4698 /* Try to skip the heading and tailing ASCIIs. */
88993dfd
KH
4699 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
4700 encodep);
d46c5b12 4701 }
e133c8fa
KH
4702 if (from == to_byte
4703 && coding->type != coding_type_ccl)
d46c5b12 4704 return (nocopy ? str : Fcopy_sequence (str));
4ed46869 4705
d46c5b12
KH
4706 if (encodep)
4707 len = encoding_buffer_size (coding, to_byte - from);
4708 else
4709 len = decoding_buffer_size (coding, to_byte - from);
fc932ac6 4710 len += from + STRING_BYTES (XSTRING (str)) - to_byte;
d46c5b12
KH
4711 GCPRO1 (str);
4712 buf = get_conversion_buffer (len);
4713 UNGCPRO;
4ed46869 4714
d46c5b12
KH
4715 if (from > 0)
4716 bcopy (XSTRING (str)->data, buf, from);
4717 result = (encodep
4718 ? encode_coding (coding, XSTRING (str)->data + from,
4719 buf + from, to_byte - from, len)
4720 : decode_coding (coding, XSTRING (str)->data + from,
f30cc612 4721 buf + from, to_byte - from, len));
d46c5b12 4722 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4ed46869 4723 {
d46c5b12
KH
4724 /* We simple try to decode the whole string again but without
4725 eol-conversion this time. */
4726 coding->eol_type = CODING_EOL_LF;
4727 coding->symbol = saved_coding_symbol;
4728 return code_convert_string (str, coding, encodep, nocopy);
4ed46869 4729 }
d46c5b12
KH
4730
4731 bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
fc932ac6 4732 STRING_BYTES (XSTRING (str)) - to_byte);
d46c5b12 4733
fc932ac6 4734 len = from + STRING_BYTES (XSTRING (str)) - to_byte;
d46c5b12
KH
4735 if (encodep)
4736 str = make_unibyte_string (buf, len + coding->produced);
4737 else
826bfb8b
KH
4738 {
4739 int chars= (coding->fake_multibyte
4740 ? multibyte_chars_in_text (buf + from, coding->produced)
4741 : coding->produced_char);
4742 str = make_multibyte_string (buf, len + chars, len + coding->produced);
4743 }
4744
d46c5b12 4745 return str;
4ed46869
KH
4746}
4747
4748\f
4749#ifdef emacs
1397dc18 4750/*** 8. Emacs Lisp library functions ***/
4ed46869 4751
4ed46869
KH
4752DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
4753 "Return t if OBJECT is nil or a coding-system.\n\
3a73fa5d
RS
4754See the documentation of `make-coding-system' for information\n\
4755about coding-system objects.")
4ed46869
KH
4756 (obj)
4757 Lisp_Object obj;
4758{
4608c386
KH
4759 if (NILP (obj))
4760 return Qt;
4761 if (!SYMBOLP (obj))
4762 return Qnil;
4763 /* Get coding-spec vector for OBJ. */
4764 obj = Fget (obj, Qcoding_system);
4765 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
4766 ? Qt : Qnil);
4ed46869
KH
4767}
4768
9d991de8
RS
4769DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
4770 Sread_non_nil_coding_system, 1, 1, 0,
e0e989f6 4771 "Read a coding system from the minibuffer, prompting with string PROMPT.")
4ed46869
KH
4772 (prompt)
4773 Lisp_Object prompt;
4774{
e0e989f6 4775 Lisp_Object val;
9d991de8
RS
4776 do
4777 {
4608c386
KH
4778 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
4779 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
9d991de8
RS
4780 }
4781 while (XSTRING (val)->size == 0);
e0e989f6 4782 return (Fintern (val, Qnil));
4ed46869
KH
4783}
4784
9b787f3e
RS
4785DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
4786 "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
4787If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
4788 (prompt, default_coding_system)
4789 Lisp_Object prompt, default_coding_system;
4ed46869 4790{
f44d27ce 4791 Lisp_Object val;
9b787f3e
RS
4792 if (SYMBOLP (default_coding_system))
4793 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
4608c386 4794 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
9b787f3e
RS
4795 Qt, Qnil, Qcoding_system_history,
4796 default_coding_system, Qnil);
e0e989f6 4797 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
4798}
4799
4800DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
4801 1, 1, 0,
4802 "Check validity of CODING-SYSTEM.\n\
3a73fa5d
RS
4803If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
4804It is valid if it is a symbol with a non-nil `coding-system' property.\n\
4ed46869
KH
4805The value of property should be a vector of length 5.")
4806 (coding_system)
4807 Lisp_Object coding_system;
4808{
4809 CHECK_SYMBOL (coding_system, 0);
4810 if (!NILP (Fcoding_system_p (coding_system)))
4811 return coding_system;
4812 while (1)
02ba4723 4813 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4ed46869 4814}
3a73fa5d 4815\f
d46c5b12
KH
4816Lisp_Object
4817detect_coding_system (src, src_bytes, highest)
4818 unsigned char *src;
4819 int src_bytes, highest;
4ed46869
KH
4820{
4821 int coding_mask, eol_type;
d46c5b12
KH
4822 Lisp_Object val, tmp;
4823 int dummy;
4ed46869 4824
d46c5b12
KH
4825 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy);
4826 eol_type = detect_eol_type (src, src_bytes, &dummy);
4827 if (eol_type == CODING_EOL_INCONSISTENT)
25b02698 4828 eol_type = CODING_EOL_UNDECIDED;
4ed46869 4829
d46c5b12 4830 if (!coding_mask)
4ed46869 4831 {
27901516 4832 val = Qundecided;
d46c5b12 4833 if (eol_type != CODING_EOL_UNDECIDED)
4ed46869 4834 {
f44d27ce
RS
4835 Lisp_Object val2;
4836 val2 = Fget (Qundecided, Qeol_type);
4ed46869
KH
4837 if (VECTORP (val2))
4838 val = XVECTOR (val2)->contents[eol_type];
4839 }
80e803b4 4840 return (highest ? val : Fcons (val, Qnil));
4ed46869 4841 }
4ed46869 4842
d46c5b12
KH
4843 /* At first, gather possible coding systems in VAL. */
4844 val = Qnil;
4845 for (tmp = Vcoding_category_list; !NILP (tmp); tmp = XCONS (tmp)->cdr)
4ed46869 4846 {
d46c5b12
KH
4847 int idx
4848 = XFASTINT (Fget (XCONS (tmp)->car, Qcoding_category_index));
4849 if (coding_mask & (1 << idx))
4ed46869 4850 {
d46c5b12
KH
4851 val = Fcons (Fsymbol_value (XCONS (tmp)->car), val);
4852 if (highest)
4853 break;
4ed46869
KH
4854 }
4855 }
d46c5b12
KH
4856 if (!highest)
4857 val = Fnreverse (val);
4ed46869 4858
65059037 4859 /* Then, replace the elements with subsidiary coding systems. */
d46c5b12 4860 for (tmp = val; !NILP (tmp); tmp = XCONS (tmp)->cdr)
4ed46869 4861 {
65059037
RS
4862 if (eol_type != CODING_EOL_UNDECIDED
4863 && eol_type != CODING_EOL_INCONSISTENT)
4ed46869 4864 {
d46c5b12
KH
4865 Lisp_Object eol;
4866 eol = Fget (XCONS (tmp)->car, Qeol_type);
4867 if (VECTORP (eol))
4868 XCONS (tmp)->car = XVECTOR (eol)->contents[eol_type];
4ed46869
KH
4869 }
4870 }
d46c5b12
KH
4871 return (highest ? XCONS (val)->car : val);
4872}
4ed46869 4873
d46c5b12
KH
4874DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
4875 2, 3, 0,
4876 "Detect coding system of the text in the region between START and END.\n\
4877Return a list of possible coding systems ordered by priority.\n\
4878\n\
80e803b4
KH
4879If only ASCII characters are found, it returns a list of single element\n\
4880`undecided' or its subsidiary coding system according to a detected\n\
4881end-of-line format.\n\
d46c5b12
KH
4882\n\
4883If optional argument HIGHEST is non-nil, return the coding system of\n\
4884highest priority.")
4885 (start, end, highest)
4886 Lisp_Object start, end, highest;
4887{
4888 int from, to;
4889 int from_byte, to_byte;
6289dd10 4890
d46c5b12
KH
4891 CHECK_NUMBER_COERCE_MARKER (start, 0);
4892 CHECK_NUMBER_COERCE_MARKER (end, 1);
4ed46869 4893
d46c5b12
KH
4894 validate_region (&start, &end);
4895 from = XINT (start), to = XINT (end);
4896 from_byte = CHAR_TO_BYTE (from);
4897 to_byte = CHAR_TO_BYTE (to);
6289dd10 4898
d46c5b12
KH
4899 if (from < GPT && to >= GPT)
4900 move_gap_both (to, to_byte);
4ed46869 4901
d46c5b12
KH
4902 return detect_coding_system (BYTE_POS_ADDR (from_byte),
4903 to_byte - from_byte,
4904 !NILP (highest));
4905}
6289dd10 4906
d46c5b12
KH
4907DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
4908 1, 2, 0,
4909 "Detect coding system of the text in STRING.\n\
4910Return a list of possible coding systems ordered by priority.\n\
4911\n\
80e803b4
KH
4912If only ASCII characters are found, it returns a list of single element\n\
4913`undecided' or its subsidiary coding system according to a detected\n\
4914end-of-line format.\n\
d46c5b12
KH
4915\n\
4916If optional argument HIGHEST is non-nil, return the coding system of\n\
4917highest priority.")
4918 (string, highest)
4919 Lisp_Object string, highest;
4920{
4921 CHECK_STRING (string, 0);
4ed46869 4922
d46c5b12 4923 return detect_coding_system (XSTRING (string)->data,
fc932ac6 4924 STRING_BYTES (XSTRING (string)),
d46c5b12 4925 !NILP (highest));
4ed46869
KH
4926}
4927
4031e2bf
KH
4928Lisp_Object
4929code_convert_region1 (start, end, coding_system, encodep)
d46c5b12 4930 Lisp_Object start, end, coding_system;
4031e2bf 4931 int encodep;
3a73fa5d
RS
4932{
4933 struct coding_system coding;
4031e2bf 4934 int from, to, len;
3a73fa5d 4935
d46c5b12
KH
4936 CHECK_NUMBER_COERCE_MARKER (start, 0);
4937 CHECK_NUMBER_COERCE_MARKER (end, 1);
3a73fa5d
RS
4938 CHECK_SYMBOL (coding_system, 2);
4939
d46c5b12
KH
4940 validate_region (&start, &end);
4941 from = XFASTINT (start);
4942 to = XFASTINT (end);
4943
3a73fa5d 4944 if (NILP (coding_system))
d46c5b12
KH
4945 return make_number (to - from);
4946
3a73fa5d 4947 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
d46c5b12 4948 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
3a73fa5d 4949
d46c5b12 4950 coding.mode |= CODING_MODE_LAST_BLOCK;
fb88bf2d
KH
4951 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
4952 &coding, encodep, 1);
f072a3e8 4953 Vlast_coding_system_used = coding.symbol;
fb88bf2d 4954 return make_number (coding.produced_char);
4031e2bf
KH
4955}
4956
4957DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
4958 3, 3, "r\nzCoding system: ",
4959 "Decode the current region by specified coding system.\n\
4960When called from a program, takes three arguments:\n\
4961START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
f072a3e8
RS
4962This function sets `last-coding-system-used' to the precise coding system\n\
4963used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
4964not fully specified.)\n\
4965It returns the length of the decoded text.")
4031e2bf
KH
4966 (start, end, coding_system)
4967 Lisp_Object start, end, coding_system;
4968{
4969 return code_convert_region1 (start, end, coding_system, 0);
3a73fa5d
RS
4970}
4971
4972DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
4973 3, 3, "r\nzCoding system: ",
d46c5b12 4974 "Encode the current region by specified coding system.\n\
3a73fa5d 4975When called from a program, takes three arguments:\n\
d46c5b12 4976START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
f072a3e8
RS
4977This function sets `last-coding-system-used' to the precise coding system\n\
4978used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
4979not fully specified.)\n\
4980It returns the length of the encoded text.")
d46c5b12
KH
4981 (start, end, coding_system)
4982 Lisp_Object start, end, coding_system;
3a73fa5d 4983{
4031e2bf
KH
4984 return code_convert_region1 (start, end, coding_system, 1);
4985}
3a73fa5d 4986
4031e2bf
KH
4987Lisp_Object
4988code_convert_string1 (string, coding_system, nocopy, encodep)
4989 Lisp_Object string, coding_system, nocopy;
4990 int encodep;
4991{
4992 struct coding_system coding;
3a73fa5d 4993
4031e2bf
KH
4994 CHECK_STRING (string, 0);
4995 CHECK_SYMBOL (coding_system, 1);
4ed46869 4996
d46c5b12 4997 if (NILP (coding_system))
4031e2bf 4998 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869 4999
d46c5b12
KH
5000 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5001 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5f1cd180 5002
d46c5b12 5003 coding.mode |= CODING_MODE_LAST_BLOCK;
f072a3e8 5004 Vlast_coding_system_used = coding.symbol;
4031e2bf 5005 return code_convert_string (string, &coding, encodep, !NILP (nocopy));
4ed46869
KH
5006}
5007
4ed46869 5008DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
e0e989f6
KH
5009 2, 3, 0,
5010 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
fe487a71 5011Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
f072a3e8
RS
5012if the decoding operation is trivial.\n\
5013This function sets `last-coding-system-used' to the precise coding system\n\
5014used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5015not fully specified.)")
e0e989f6
KH
5016 (string, coding_system, nocopy)
5017 Lisp_Object string, coding_system, nocopy;
4ed46869 5018{
f072a3e8 5019 return code_convert_string1 (string, coding_system, nocopy, 0);
4ed46869
KH
5020}
5021
5022DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
e0e989f6
KH
5023 2, 3, 0,
5024 "Encode STRING to CODING-SYSTEM, and return the result.\n\
fe487a71 5025Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
f072a3e8
RS
5026if the encoding operation is trivial.\n\
5027This function sets `last-coding-system-used' to the precise coding system\n\
5028used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5029not fully specified.)")
e0e989f6
KH
5030 (string, coding_system, nocopy)
5031 Lisp_Object string, coding_system, nocopy;
4ed46869 5032{
f072a3e8 5033 return code_convert_string1 (string, coding_system, nocopy, 1);
4ed46869 5034}
4031e2bf 5035
ecec61c1
KH
5036/* Encode or decode STRING according to CODING_SYSTEM.
5037 Do not set Vlast_coding_system_used. */
5038
5039Lisp_Object
5040code_convert_string_norecord (string, coding_system, encodep)
5041 Lisp_Object string, coding_system;
5042 int encodep;
5043{
5044 struct coding_system coding;
5045
5046 CHECK_STRING (string, 0);
5047 CHECK_SYMBOL (coding_system, 1);
5048
5049 if (NILP (coding_system))
5050 return string;
5051
5052 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5053 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5054
5055 coding.mode |= CODING_MODE_LAST_BLOCK;
5056 return code_convert_string (string, &coding, encodep, Qt);
5057}
3a73fa5d 5058\f
4ed46869 5059DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
55ab7be3 5060 "Decode a Japanese character which has CODE in shift_jis encoding.\n\
4ed46869
KH
5061Return the corresponding character.")
5062 (code)
5063 Lisp_Object code;
5064{
5065 unsigned char c1, c2, s1, s2;
5066 Lisp_Object val;
5067
5068 CHECK_NUMBER (code, 0);
5069 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
55ab7be3
KH
5070 if (s1 == 0)
5071 {
c28a9453
KH
5072 if (s2 < 0x80)
5073 XSETFASTINT (val, s2);
5074 else if (s2 >= 0xA0 || s2 <= 0xDF)
5075 XSETFASTINT (val,
5076 MAKE_NON_ASCII_CHAR (charset_katakana_jisx0201, s2, 0));
5077 else
9da8350f 5078 error ("Invalid Shift JIS code: %x", XFASTINT (code));
55ab7be3
KH
5079 }
5080 else
5081 {
5082 if ((s1 < 0x80 || s1 > 0x9F && s1 < 0xE0 || s1 > 0xEF)
5083 || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
9da8350f 5084 error ("Invalid Shift JIS code: %x", XFASTINT (code));
55ab7be3
KH
5085 DECODE_SJIS (s1, s2, c1, c2);
5086 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
5087 }
4ed46869
KH
5088 return val;
5089}
5090
5091DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
55ab7be3
KH
5092 "Encode a Japanese character CHAR to shift_jis encoding.\n\
5093Return the corresponding code in SJIS.")
4ed46869
KH
5094 (ch)
5095 Lisp_Object ch;
5096{
bcf26d6a 5097 int charset, c1, c2, s1, s2;
4ed46869
KH
5098 Lisp_Object val;
5099
5100 CHECK_NUMBER (ch, 0);
5101 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
c28a9453
KH
5102 if (charset == CHARSET_ASCII)
5103 {
5104 val = ch;
5105 }
5106 else if (charset == charset_jisx0208
5107 && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
4ed46869
KH
5108 {
5109 ENCODE_SJIS (c1, c2, s1, s2);
bcf26d6a 5110 XSETFASTINT (val, (s1 << 8) | s2);
4ed46869 5111 }
55ab7be3
KH
5112 else if (charset == charset_katakana_jisx0201
5113 && c1 > 0x20 && c2 < 0xE0)
5114 {
5115 XSETFASTINT (val, c1 | 0x80);
5116 }
4ed46869 5117 else
55ab7be3 5118 error ("Can't encode to shift_jis: %d", XFASTINT (ch));
4ed46869
KH
5119 return val;
5120}
5121
5122DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
c28a9453 5123 "Decode a Big5 character which has CODE in BIG5 coding system.\n\
4ed46869
KH
5124Return the corresponding character.")
5125 (code)
5126 Lisp_Object code;
5127{
5128 int charset;
5129 unsigned char b1, b2, c1, c2;
5130 Lisp_Object val;
5131
5132 CHECK_NUMBER (code, 0);
5133 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
c28a9453
KH
5134 if (b1 == 0)
5135 {
5136 if (b2 >= 0x80)
9da8350f 5137 error ("Invalid BIG5 code: %x", XFASTINT (code));
c28a9453
KH
5138 val = code;
5139 }
5140 else
5141 {
5142 if ((b1 < 0xA1 || b1 > 0xFE)
5143 || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
9da8350f 5144 error ("Invalid BIG5 code: %x", XFASTINT (code));
c28a9453
KH
5145 DECODE_BIG5 (b1, b2, charset, c1, c2);
5146 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
5147 }
4ed46869
KH
5148 return val;
5149}
5150
5151DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
d46c5b12 5152 "Encode the Big5 character CHAR to BIG5 coding system.\n\
4ed46869
KH
5153Return the corresponding character code in Big5.")
5154 (ch)
5155 Lisp_Object ch;
5156{
bcf26d6a 5157 int charset, c1, c2, b1, b2;
4ed46869
KH
5158 Lisp_Object val;
5159
5160 CHECK_NUMBER (ch, 0);
5161 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
c28a9453
KH
5162 if (charset == CHARSET_ASCII)
5163 {
5164 val = ch;
5165 }
5166 else if ((charset == charset_big5_1
5167 && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
5168 || (charset == charset_big5_2
5169 && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
4ed46869
KH
5170 {
5171 ENCODE_BIG5 (charset, c1, c2, b1, b2);
bcf26d6a 5172 XSETFASTINT (val, (b1 << 8) | b2);
4ed46869
KH
5173 }
5174 else
c28a9453 5175 error ("Can't encode to Big5: %d", XFASTINT (ch));
4ed46869
KH
5176 return val;
5177}
3a73fa5d 5178\f
1ba9e4ab
KH
5179DEFUN ("set-terminal-coding-system-internal",
5180 Fset_terminal_coding_system_internal,
5181 Sset_terminal_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
5182 (coding_system)
5183 Lisp_Object coding_system;
5184{
5185 CHECK_SYMBOL (coding_system, 0);
5186 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
70c22245 5187 /* We had better not send unsafe characters to terminal. */
6e85d753
KH
5188 terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
5189
4ed46869
KH
5190 return Qnil;
5191}
5192
c4825358
KH
5193DEFUN ("set-safe-terminal-coding-system-internal",
5194 Fset_safe_terminal_coding_system_internal,
5195 Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
5196 (coding_system)
5197 Lisp_Object coding_system;
5198{
5199 CHECK_SYMBOL (coding_system, 0);
5200 setup_coding_system (Fcheck_coding_system (coding_system),
5201 &safe_terminal_coding);
5202 return Qnil;
5203}
5204
4ed46869
KH
5205DEFUN ("terminal-coding-system",
5206 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3a73fa5d 5207 "Return coding system specified for terminal output.")
4ed46869
KH
5208 ()
5209{
5210 return terminal_coding.symbol;
5211}
5212
1ba9e4ab
KH
5213DEFUN ("set-keyboard-coding-system-internal",
5214 Fset_keyboard_coding_system_internal,
5215 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
5216 (coding_system)
5217 Lisp_Object coding_system;
5218{
5219 CHECK_SYMBOL (coding_system, 0);
5220 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
5221 return Qnil;
5222}
5223
5224DEFUN ("keyboard-coding-system",
5225 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3a73fa5d 5226 "Return coding system specified for decoding keyboard input.")
4ed46869
KH
5227 ()
5228{
5229 return keyboard_coding.symbol;
5230}
5231
5232\f
a5d301df
KH
5233DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
5234 Sfind_operation_coding_system, 1, MANY, 0,
5235 "Choose a coding system for an operation based on the target name.\n\
69f76525 5236The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
9ce27fde
KH
5237DECODING-SYSTEM is the coding system to use for decoding\n\
5238\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
5239for encoding (in case OPERATION does encoding).\n\
ccdb79f5
RS
5240\n\
5241The first argument OPERATION specifies an I/O primitive:\n\
5242 For file I/O, `insert-file-contents' or `write-region'.\n\
5243 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
5244 For network I/O, `open-network-stream'.\n\
5245\n\
5246The remaining arguments should be the same arguments that were passed\n\
5247to the primitive. Depending on which primitive, one of those arguments\n\
5248is selected as the TARGET. For example, if OPERATION does file I/O,\n\
5249whichever argument specifies the file name is TARGET.\n\
5250\n\
5251TARGET has a meaning which depends on OPERATION:\n\
4ed46869
KH
5252 For file I/O, TARGET is a file name.\n\
5253 For process I/O, TARGET is a process name.\n\
5254 For network I/O, TARGET is a service name or a port number\n\
5255\n\
02ba4723
KH
5256This function looks up what specified for TARGET in,\n\
5257`file-coding-system-alist', `process-coding-system-alist',\n\
5258or `network-coding-system-alist' depending on OPERATION.\n\
5259They may specify a coding system, a cons of coding systems,\n\
5260or a function symbol to call.\n\
5261In the last case, we call the function with one argument,\n\
9ce27fde 5262which is a list of all the arguments given to this function.")
4ed46869
KH
5263 (nargs, args)
5264 int nargs;
5265 Lisp_Object *args;
5266{
5267 Lisp_Object operation, target_idx, target, val;
5268 register Lisp_Object chain;
5269
5270 if (nargs < 2)
5271 error ("Too few arguments");
5272 operation = args[0];
5273 if (!SYMBOLP (operation)
5274 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
5275 error ("Invalid first arguement");
5276 if (nargs < 1 + XINT (target_idx))
5277 error ("Too few arguments for operation: %s",
5278 XSYMBOL (operation)->name->data);
5279 target = args[XINT (target_idx) + 1];
5280 if (!(STRINGP (target)
5281 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
5282 error ("Invalid %dth argument", XINT (target_idx) + 1);
5283
2e34157c
RS
5284 chain = ((EQ (operation, Qinsert_file_contents)
5285 || EQ (operation, Qwrite_region))
02ba4723 5286 ? Vfile_coding_system_alist
2e34157c 5287 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
5288 ? Vnetwork_coding_system_alist
5289 : Vprocess_coding_system_alist));
4ed46869
KH
5290 if (NILP (chain))
5291 return Qnil;
5292
02ba4723 5293 for (; CONSP (chain); chain = XCONS (chain)->cdr)
4ed46869 5294 {
f44d27ce
RS
5295 Lisp_Object elt;
5296 elt = XCONS (chain)->car;
4ed46869
KH
5297
5298 if (CONSP (elt)
5299 && ((STRINGP (target)
5300 && STRINGP (XCONS (elt)->car)
5301 && fast_string_match (XCONS (elt)->car, target) >= 0)
5302 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
02ba4723
KH
5303 {
5304 val = XCONS (elt)->cdr;
b19fd4c5
KH
5305 /* Here, if VAL is both a valid coding system and a valid
5306 function symbol, we return VAL as a coding system. */
02ba4723
KH
5307 if (CONSP (val))
5308 return val;
5309 if (! SYMBOLP (val))
5310 return Qnil;
5311 if (! NILP (Fcoding_system_p (val)))
5312 return Fcons (val, val);
b19fd4c5
KH
5313 if (! NILP (Ffboundp (val)))
5314 {
5315 val = call1 (val, Flist (nargs, args));
5316 if (CONSP (val))
5317 return val;
5318 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
5319 return Fcons (val, val);
5320 }
02ba4723
KH
5321 return Qnil;
5322 }
4ed46869
KH
5323 }
5324 return Qnil;
5325}
5326
1397dc18
KH
5327DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal,
5328 Supdate_coding_systems_internal, 0, 0, 0,
5329 "Update internal database for ISO2022 and CCL based coding systems.\n\
d46c5b12
KH
5330When values of the following coding categories are changed, you must\n\
5331call this function:\n\
5332 coding-category-iso-7, coding-category-iso-7-tight,\n\
5333 coding-category-iso-8-1, coding-category-iso-8-2,\n\
1397dc18
KH
5334 coding-category-iso-7-else, coding-category-iso-8-else,\n\
5335 coding-category-ccl")
d46c5b12
KH
5336 ()
5337{
5338 int i;
5339
1397dc18 5340 for (i = CODING_CATEGORY_IDX_ISO_7; i <= CODING_CATEGORY_IDX_CCL; i++)
d46c5b12 5341 {
1397dc18
KH
5342 Lisp_Object val;
5343
5344 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value;
5345 if (!NILP (val))
5346 {
5347 if (! coding_system_table[i])
5348 coding_system_table[i] = ((struct coding_system *)
5349 xmalloc (sizeof (struct coding_system)));
5350 setup_coding_system (val, coding_system_table[i]);
5351 }
5352 else if (coding_system_table[i])
5353 {
5354 xfree (coding_system_table[i]);
5355 coding_system_table[i] = NULL;
5356 }
d46c5b12 5357 }
1397dc18 5358
d46c5b12
KH
5359 return Qnil;
5360}
5361
66cfb530
KH
5362DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
5363 Sset_coding_priority_internal, 0, 0, 0,
5364 "Update internal database for the current value of `coding-category-list'.\n\
5365This function is internal use only.")
5366 ()
5367{
5368 int i = 0, idx;
84d60297
RS
5369 Lisp_Object val;
5370
5371 val = Vcoding_category_list;
66cfb530
KH
5372
5373 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
5374 {
5375 if (! SYMBOLP (XCONS (val)->car))
5376 break;
5377 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
5378 if (idx >= CODING_CATEGORY_IDX_MAX)
5379 break;
5380 coding_priorities[i++] = (1 << idx);
5381 val = XCONS (val)->cdr;
5382 }
5383 /* If coding-category-list is valid and contains all coding
5384 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
5385 the following code saves Emacs from craching. */
5386 while (i < CODING_CATEGORY_IDX_MAX)
5387 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
5388
5389 return Qnil;
5390}
5391
4ed46869
KH
5392#endif /* emacs */
5393
5394\f
1397dc18 5395/*** 9. Post-amble ***/
4ed46869 5396
6d74c3aa
KH
5397void
5398init_coding ()
5399{
5400 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
5401}
5402
dfcf069d 5403void
4ed46869
KH
5404init_coding_once ()
5405{
5406 int i;
5407
0ef69138 5408 /* Emacs' internal format specific initialize routine. */
4ed46869
KH
5409 for (i = 0; i <= 0x20; i++)
5410 emacs_code_class[i] = EMACS_control_code;
5411 emacs_code_class[0x0A] = EMACS_linefeed_code;
5412 emacs_code_class[0x0D] = EMACS_carriage_return_code;
5413 for (i = 0x21 ; i < 0x7F; i++)
5414 emacs_code_class[i] = EMACS_ascii_code;
5415 emacs_code_class[0x7F] = EMACS_control_code;
5416 emacs_code_class[0x80] = EMACS_leading_code_composition;
5417 for (i = 0x81; i < 0xFF; i++)
5418 emacs_code_class[i] = EMACS_invalid_code;
5419 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
5420 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
5421 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
5422 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
5423
5424 /* ISO2022 specific initialize routine. */
5425 for (i = 0; i < 0x20; i++)
5426 iso_code_class[i] = ISO_control_code;
5427 for (i = 0x21; i < 0x7F; i++)
5428 iso_code_class[i] = ISO_graphic_plane_0;
5429 for (i = 0x80; i < 0xA0; i++)
5430 iso_code_class[i] = ISO_control_code;
5431 for (i = 0xA1; i < 0xFF; i++)
5432 iso_code_class[i] = ISO_graphic_plane_1;
5433 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
5434 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
5435 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
5436 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
5437 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
5438 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
5439 iso_code_class[ISO_CODE_ESC] = ISO_escape;
5440 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
5441 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
5442 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
5443
e0e989f6 5444 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
e0e989f6
KH
5445
5446 setup_coding_system (Qnil, &keyboard_coding);
5447 setup_coding_system (Qnil, &terminal_coding);
c4825358 5448 setup_coding_system (Qnil, &safe_terminal_coding);
6bc51348 5449 setup_coding_system (Qnil, &default_buffer_file_coding);
9ce27fde 5450
d46c5b12
KH
5451 bzero (coding_system_table, sizeof coding_system_table);
5452
66cfb530
KH
5453 bzero (ascii_skip_code, sizeof ascii_skip_code);
5454 for (i = 0; i < 128; i++)
5455 ascii_skip_code[i] = 1;
5456
9ce27fde
KH
5457#if defined (MSDOS) || defined (WINDOWSNT)
5458 system_eol_type = CODING_EOL_CRLF;
5459#else
5460 system_eol_type = CODING_EOL_LF;
5461#endif
e0e989f6
KH
5462}
5463
5464#ifdef emacs
5465
dfcf069d 5466void
e0e989f6
KH
5467syms_of_coding ()
5468{
5469 Qtarget_idx = intern ("target-idx");
5470 staticpro (&Qtarget_idx);
5471
bb0115a2
RS
5472 Qcoding_system_history = intern ("coding-system-history");
5473 staticpro (&Qcoding_system_history);
5474 Fset (Qcoding_system_history, Qnil);
5475
9ce27fde 5476 /* Target FILENAME is the first argument. */
e0e989f6 5477 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 5478 /* Target FILENAME is the third argument. */
e0e989f6
KH
5479 Fput (Qwrite_region, Qtarget_idx, make_number (2));
5480
5481 Qcall_process = intern ("call-process");
5482 staticpro (&Qcall_process);
9ce27fde 5483 /* Target PROGRAM is the first argument. */
e0e989f6
KH
5484 Fput (Qcall_process, Qtarget_idx, make_number (0));
5485
5486 Qcall_process_region = intern ("call-process-region");
5487 staticpro (&Qcall_process_region);
9ce27fde 5488 /* Target PROGRAM is the third argument. */
e0e989f6
KH
5489 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
5490
5491 Qstart_process = intern ("start-process");
5492 staticpro (&Qstart_process);
9ce27fde 5493 /* Target PROGRAM is the third argument. */
e0e989f6
KH
5494 Fput (Qstart_process, Qtarget_idx, make_number (2));
5495
5496 Qopen_network_stream = intern ("open-network-stream");
5497 staticpro (&Qopen_network_stream);
9ce27fde 5498 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
5499 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
5500
4ed46869
KH
5501 Qcoding_system = intern ("coding-system");
5502 staticpro (&Qcoding_system);
5503
5504 Qeol_type = intern ("eol-type");
5505 staticpro (&Qeol_type);
5506
5507 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
5508 staticpro (&Qbuffer_file_coding_system);
5509
5510 Qpost_read_conversion = intern ("post-read-conversion");
5511 staticpro (&Qpost_read_conversion);
5512
5513 Qpre_write_conversion = intern ("pre-write-conversion");
5514 staticpro (&Qpre_write_conversion);
5515
27901516
KH
5516 Qno_conversion = intern ("no-conversion");
5517 staticpro (&Qno_conversion);
5518
5519 Qundecided = intern ("undecided");
5520 staticpro (&Qundecided);
5521
4ed46869
KH
5522 Qcoding_system_p = intern ("coding-system-p");
5523 staticpro (&Qcoding_system_p);
5524
5525 Qcoding_system_error = intern ("coding-system-error");
5526 staticpro (&Qcoding_system_error);
5527
5528 Fput (Qcoding_system_error, Qerror_conditions,
5529 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
5530 Fput (Qcoding_system_error, Qerror_message,
9ce27fde 5531 build_string ("Invalid coding system"));
4ed46869 5532
d46c5b12
KH
5533 Qcoding_category = intern ("coding-category");
5534 staticpro (&Qcoding_category);
4ed46869
KH
5535 Qcoding_category_index = intern ("coding-category-index");
5536 staticpro (&Qcoding_category_index);
5537
d46c5b12
KH
5538 Vcoding_category_table
5539 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
5540 staticpro (&Vcoding_category_table);
4ed46869
KH
5541 {
5542 int i;
5543 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
5544 {
d46c5b12
KH
5545 XVECTOR (Vcoding_category_table)->contents[i]
5546 = intern (coding_category_name[i]);
5547 Fput (XVECTOR (Vcoding_category_table)->contents[i],
5548 Qcoding_category_index, make_number (i));
4ed46869
KH
5549 }
5550 }
5551
f967223b
KH
5552 Qtranslation_table = intern ("translation-table");
5553 staticpro (&Qtranslation_table);
1397dc18 5554 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
bdd9fb48 5555
f967223b
KH
5556 Qtranslation_table_id = intern ("translation-table-id");
5557 staticpro (&Qtranslation_table_id);
84fbb8a0 5558
f967223b
KH
5559 Qtranslation_table_for_decode = intern ("translation-table-for-decode");
5560 staticpro (&Qtranslation_table_for_decode);
a5d301df 5561
f967223b
KH
5562 Qtranslation_table_for_encode = intern ("translation-table-for-encode");
5563 staticpro (&Qtranslation_table_for_encode);
a5d301df 5564
70c22245
KH
5565 Qsafe_charsets = intern ("safe-charsets");
5566 staticpro (&Qsafe_charsets);
5567
1397dc18
KH
5568 Qvalid_codes = intern ("valid-codes");
5569 staticpro (&Qvalid_codes);
5570
9ce27fde
KH
5571 Qemacs_mule = intern ("emacs-mule");
5572 staticpro (&Qemacs_mule);
5573
d46c5b12
KH
5574 Qraw_text = intern ("raw-text");
5575 staticpro (&Qraw_text);
5576
4ed46869
KH
5577 defsubr (&Scoding_system_p);
5578 defsubr (&Sread_coding_system);
5579 defsubr (&Sread_non_nil_coding_system);
5580 defsubr (&Scheck_coding_system);
5581 defsubr (&Sdetect_coding_region);
d46c5b12 5582 defsubr (&Sdetect_coding_string);
4ed46869
KH
5583 defsubr (&Sdecode_coding_region);
5584 defsubr (&Sencode_coding_region);
5585 defsubr (&Sdecode_coding_string);
5586 defsubr (&Sencode_coding_string);
5587 defsubr (&Sdecode_sjis_char);
5588 defsubr (&Sencode_sjis_char);
5589 defsubr (&Sdecode_big5_char);
5590 defsubr (&Sencode_big5_char);
1ba9e4ab 5591 defsubr (&Sset_terminal_coding_system_internal);
c4825358 5592 defsubr (&Sset_safe_terminal_coding_system_internal);
4ed46869 5593 defsubr (&Sterminal_coding_system);
1ba9e4ab 5594 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 5595 defsubr (&Skeyboard_coding_system);
a5d301df 5596 defsubr (&Sfind_operation_coding_system);
1397dc18 5597 defsubr (&Supdate_coding_systems_internal);
66cfb530 5598 defsubr (&Sset_coding_priority_internal);
4ed46869 5599
4608c386
KH
5600 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
5601 "List of coding systems.\n\
5602\n\
5603Do not alter the value of this variable manually. This variable should be\n\
5604updated by the functions `make-coding-system' and\n\
5605`define-coding-system-alias'.");
5606 Vcoding_system_list = Qnil;
5607
5608 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
5609 "Alist of coding system names.\n\
5610Each element is one element list of coding system name.\n\
5611This variable is given to `completing-read' as TABLE argument.\n\
5612\n\
5613Do not alter the value of this variable manually. This variable should be\n\
5614updated by the functions `make-coding-system' and\n\
5615`define-coding-system-alias'.");
5616 Vcoding_system_alist = Qnil;
5617
4ed46869
KH
5618 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
5619 "List of coding-categories (symbols) ordered by priority.");
5620 {
5621 int i;
5622
5623 Vcoding_category_list = Qnil;
5624 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
5625 Vcoding_category_list
d46c5b12
KH
5626 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
5627 Vcoding_category_list);
4ed46869
KH
5628 }
5629
5630 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
10bff6f1 5631 "Specify the coding system for read operations.\n\
2ebb362d 5632It is useful to bind this variable with `let', but do not set it globally.\n\
4ed46869 5633If the value is a coding system, it is used for decoding on read operation.\n\
a67a9c66 5634If not, an appropriate element is used from one of the coding system alists:\n\
10bff6f1 5635There are three such tables, `file-coding-system-alist',\n\
a67a9c66 5636`process-coding-system-alist', and `network-coding-system-alist'.");
4ed46869
KH
5637 Vcoding_system_for_read = Qnil;
5638
5639 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
10bff6f1 5640 "Specify the coding system for write operations.\n\
2ebb362d 5641It is useful to bind this variable with `let', but do not set it globally.\n\
4ed46869 5642If the value is a coding system, it is used for encoding on write operation.\n\
a67a9c66 5643If not, an appropriate element is used from one of the coding system alists:\n\
10bff6f1 5644There are three such tables, `file-coding-system-alist',\n\
a67a9c66 5645`process-coding-system-alist', and `network-coding-system-alist'.");
4ed46869
KH
5646 Vcoding_system_for_write = Qnil;
5647
5648 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
a67a9c66 5649 "Coding system used in the latest file or process I/O.");
4ed46869
KH
5650 Vlast_coding_system_used = Qnil;
5651
9ce27fde 5652 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
f07f4a24 5653 "*Non-nil means always inhibit code conversion of end-of-line format.\n\
94c7a214
DL
5654See info node `Coding Systems' and info node `Text and Binary' concerning\n\
5655such conversion.");
9ce27fde
KH
5656 inhibit_eol_conversion = 0;
5657
ed29121d
EZ
5658 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
5659 "Non-nil means process buffer inherits coding system of process output.\n\
5660Bind it to t if the process output is to be treated as if it were a file\n\
5661read from some filesystem.");
5662 inherit_process_coding_system = 0;
5663
02ba4723
KH
5664 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
5665 "Alist to decide a coding system to use for a file I/O operation.\n\
5666The format is ((PATTERN . VAL) ...),\n\
5667where PATTERN is a regular expression matching a file name,\n\
5668VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5669If VAL is a coding system, it is used for both decoding and encoding\n\
5670the file contents.\n\
5671If VAL is a cons of coding systems, the car part is used for decoding,\n\
5672and the cdr part is used for encoding.\n\
5673If VAL is a function symbol, the function must return a coding system\n\
5674or a cons of coding systems which are used as above.\n\
e0e989f6 5675\n\
a85a871a 5676See also the function `find-operation-coding-system'\n\
eda284ac 5677and the variable `auto-coding-alist'.");
02ba4723
KH
5678 Vfile_coding_system_alist = Qnil;
5679
5680 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
5681 "Alist to decide a coding system to use for a process I/O operation.\n\
5682The format is ((PATTERN . VAL) ...),\n\
5683where PATTERN is a regular expression matching a program name,\n\
5684VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5685If VAL is a coding system, it is used for both decoding what received\n\
5686from the program and encoding what sent to the program.\n\
5687If VAL is a cons of coding systems, the car part is used for decoding,\n\
5688and the cdr part is used for encoding.\n\
5689If VAL is a function symbol, the function must return a coding system\n\
5690or a cons of coding systems which are used as above.\n\
4ed46869 5691\n\
9ce27fde 5692See also the function `find-operation-coding-system'.");
02ba4723
KH
5693 Vprocess_coding_system_alist = Qnil;
5694
5695 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
5696 "Alist to decide a coding system to use for a network I/O operation.\n\
5697The format is ((PATTERN . VAL) ...),\n\
5698where PATTERN is a regular expression matching a network service name\n\
5699or is a port number to connect to,\n\
5700VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5701If VAL is a coding system, it is used for both decoding what received\n\
5702from the network stream and encoding what sent to the network stream.\n\
5703If VAL is a cons of coding systems, the car part is used for decoding,\n\
5704and the cdr part is used for encoding.\n\
5705If VAL is a function symbol, the function must return a coding system\n\
5706or a cons of coding systems which are used as above.\n\
4ed46869 5707\n\
9ce27fde 5708See also the function `find-operation-coding-system'.");
02ba4723 5709 Vnetwork_coding_system_alist = Qnil;
4ed46869 5710
7722baf9
EZ
5711 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
5712 "*String displayed in mode line for UNIX-like (LF) end-of-line format.");
5713 eol_mnemonic_unix = build_string (":");
4ed46869 5714
7722baf9
EZ
5715 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
5716 "*String displayed in mode line for DOS-like (CRLF) end-of-line format.");
5717 eol_mnemonic_dos = build_string ("\\");
4ed46869 5718
7722baf9
EZ
5719 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
5720 "*String displayed in mode line for MAC-like (CR) end-of-line format.");
5721 eol_mnemonic_mac = build_string ("/");
4ed46869 5722
7722baf9
EZ
5723 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
5724 "*String displayed in mode line when end-of-line format is not yet determined.");
5725 eol_mnemonic_undecided = build_string (":");
4ed46869 5726
84fbb8a0 5727 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
f967223b 5728 "*Non-nil enables character translation while encoding and decoding.");
84fbb8a0 5729 Venable_character_translation = Qt;
bdd9fb48 5730
f967223b
KH
5731 DEFVAR_LISP ("standard-translation-table-for-decode",
5732 &Vstandard_translation_table_for_decode,
84fbb8a0 5733 "Table for translating characters while decoding.");
f967223b 5734 Vstandard_translation_table_for_decode = Qnil;
bdd9fb48 5735
f967223b
KH
5736 DEFVAR_LISP ("standard-translation-table-for-encode",
5737 &Vstandard_translation_table_for_encode,
84fbb8a0 5738 "Table for translationg characters while encoding.");
f967223b 5739 Vstandard_translation_table_for_encode = Qnil;
4ed46869
KH
5740
5741 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
5742 "Alist of charsets vs revision numbers.\n\
5743While encoding, if a charset (car part of an element) is found,\n\
5744designate it with the escape sequence identifing revision (cdr part of the element).");
5745 Vcharset_revision_alist = Qnil;
02ba4723
KH
5746
5747 DEFVAR_LISP ("default-process-coding-system",
5748 &Vdefault_process_coding_system,
5749 "Cons of coding systems used for process I/O by default.\n\
5750The car part is used for decoding a process output,\n\
5751the cdr part is used for encoding a text to be sent to a process.");
5752 Vdefault_process_coding_system = Qnil;
c4825358 5753
3f003981
KH
5754 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
5755 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
c4825358
KH
5756This is a vector of length 256.\n\
5757If Nth element is non-nil, the existence of code N in a file\n\
bb0115a2 5758\(or output of subprocess) doesn't prevent it to be detected as\n\
3f003981
KH
5759a coding system of ISO 2022 variant which has a flag\n\
5760`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
c4825358
KH
5761or reading output of a subprocess.\n\
5762Only 128th through 159th elements has a meaning.");
3f003981 5763 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
d46c5b12
KH
5764
5765 DEFVAR_LISP ("select-safe-coding-system-function",
5766 &Vselect_safe_coding_system_function,
5767 "Function to call to select safe coding system for encoding a text.\n\
5768\n\
5769If set, this function is called to force a user to select a proper\n\
5770coding system which can encode the text in the case that a default\n\
5771coding system used in each operation can't encode the text.\n\
5772\n\
a85a871a 5773The default value is `select-safe-coding-system' (which see).");
d46c5b12
KH
5774 Vselect_safe_coding_system_function = Qnil;
5775
4ed46869
KH
5776}
5777
5778#endif /* emacs */