(ccl_driver) <CCL_Call>: Now CCL program ID to call may be
[bpt/emacs.git] / src / coding.c
CommitLineData
4ed46869 1/* Coding system handler (conversion, detection, and etc).
4a2f9c6a 2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
203cb916 3 Licensed to the Free Software Foundation.
4ed46869 4
369314dc
KH
5This file is part of GNU Emacs.
6
7GNU Emacs is free software; you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation; either version 2, or (at your option)
10any later version.
4ed46869 11
369314dc
KH
12GNU Emacs is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
4ed46869 16
369314dc
KH
17You should have received a copy of the GNU General Public License
18along with GNU Emacs; see the file COPYING. If not, write to
19the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20Boston, MA 02111-1307, USA. */
4ed46869
KH
21
22/*** TABLE OF CONTENTS ***
23
24 1. Preamble
0ef69138 25 2. Emacs' internal format (emacs-mule) handlers
4ed46869
KH
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
1397dc18
KH
28 5. CCL handlers
29 6. End-of-line handlers
30 7. C library functions
31 8. Emacs Lisp library functions
32 9. Post-amble
4ed46869
KH
33
34*/
35
36/*** GENERAL NOTE on CODING SYSTEM ***
37
38 Coding system is an encoding mechanism of one or more character
39 sets. Here's a list of coding systems which Emacs can handle. When
40 we say "decode", it means converting some other coding system to
0ef69138
KH
41 Emacs' internal format (emacs-mule), and when we say "encode",
42 it means converting the coding system emacs-mule to some other
43 coding system.
4ed46869 44
0ef69138 45 0. Emacs' internal format (emacs-mule)
4ed46869
KH
46
47 Emacs itself holds a multi-lingual character in a buffer and a string
f4dee582 48 in a special format. Details are described in section 2.
4ed46869
KH
49
50 1. ISO2022
51
52 The most famous coding system for multiple character sets. X's
f4dee582
RS
53 Compound Text, various EUCs (Extended Unix Code), and coding
54 systems used in Internet communication such as ISO-2022-JP are
55 all variants of ISO2022. Details are described in section 3.
4ed46869
KH
56
57 2. SJIS (or Shift-JIS or MS-Kanji-Code)
58
59 A coding system to encode character sets: ASCII, JISX0201, and
60 JISX0208. Widely used for PC's in Japan. Details are described in
f4dee582 61 section 4.
4ed46869
KH
62
63 3. BIG5
64
65 A coding system to encode character sets: ASCII and Big5. Widely
66 used by Chinese (mainly in Taiwan and Hong Kong). Details are
f4dee582
RS
67 described in section 4. In this file, when we write "BIG5"
68 (all uppercase), we mean the coding system, and when we write
69 "Big5" (capitalized), we mean the character set.
4ed46869 70
27901516
KH
71 4. Raw text
72
4608c386
KH
73 A coding system for a text containing random 8-bit code. Emacs does
74 no code conversion on such a text except for end-of-line format.
27901516
KH
75
76 5. Other
4ed46869 77
f4dee582 78 If a user wants to read/write a text encoded in a coding system not
4ed46869
KH
79 listed above, he can supply a decoder and an encoder for it in CCL
80 (Code Conversion Language) programs. Emacs executes the CCL program
81 while reading/writing.
82
d46c5b12
KH
83 Emacs represents a coding system by a Lisp symbol that has a property
84 `coding-system'. But, before actually using the coding system, the
4ed46869 85 information about it is set in a structure of type `struct
f4dee582 86 coding_system' for rapid processing. See section 6 for more details.
4ed46869
KH
87
88*/
89
90/*** GENERAL NOTES on END-OF-LINE FORMAT ***
91
92 How end-of-line of a text is encoded depends on a system. For
93 instance, Unix's format is just one byte of `line-feed' code,
f4dee582 94 whereas DOS's format is two-byte sequence of `carriage-return' and
d46c5b12
KH
95 `line-feed' codes. MacOS's format is usually one byte of
96 `carriage-return'.
4ed46869 97
f4dee582
RS
98 Since text characters encoding and end-of-line encoding are
99 independent, any coding system described above can take
4ed46869 100 any format of end-of-line. So, Emacs has information of format of
f4dee582 101 end-of-line in each coding-system. See section 6 for more details.
4ed46869
KH
102
103*/
104
105/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
106
107 These functions check if a text between SRC and SRC_END is encoded
108 in the coding system category XXX. Each returns an integer value in
109 which appropriate flag bits for the category XXX is set. The flag
110 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
111 template of these functions. */
112#if 0
113int
0ef69138 114detect_coding_emacs_mule (src, src_end)
4ed46869
KH
115 unsigned char *src, *src_end;
116{
117 ...
118}
119#endif
120
121/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
122
123 These functions decode SRC_BYTES length text at SOURCE encoded in
0ef69138 124 CODING to Emacs' internal format (emacs-mule). The resulting text
d46c5b12
KH
125 goes to a place pointed to by DESTINATION, the length of which
126 should not exceed DST_BYTES. These functions set the information of
127 original and decoded texts in the members produced, produced_char,
128 consumed, and consumed_char of the structure *CODING.
129
130 The return value is an integer (CODING_FINISH_XXX) indicating how
131 the decoding finished.
132
133 DST_BYTES zero means that source area and destination area are
134 overlapped, which means that we can produce a decoded text until it
135 reaches the head of the not-yet-decoded source text.
136
137 Below is a template of these functions. */
4ed46869 138#if 0
d46c5b12 139decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
140 struct coding_system *coding;
141 unsigned char *source, *destination;
142 int src_bytes, dst_bytes;
4ed46869
KH
143{
144 ...
145}
146#endif
147
148/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
149
0ef69138
KH
150 These functions encode SRC_BYTES length text at SOURCE of Emacs'
151 internal format (emacs-mule) to CODING. The resulting text goes to
f4dee582 152 a place pointed to by DESTINATION, the length of which should not
d46c5b12
KH
153 exceed DST_BYTES. These functions set the information of
154 original and encoded texts in the members produced, produced_char,
155 consumed, and consumed_char of the structure *CODING.
156
157 The return value is an integer (CODING_FINISH_XXX) indicating how
158 the encoding finished.
159
160 DST_BYTES zero means that source area and destination area are
161 overlapped, which means that we can produce an encoded text until it
162 reaches the head of the not-yet-encoded source text.
163
164 Below is a template of these functions. */
4ed46869 165#if 0
d46c5b12 166encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
167 struct coding_system *coding;
168 unsigned char *source, *destination;
169 int src_bytes, dst_bytes;
4ed46869
KH
170{
171 ...
172}
173#endif
174
175/*** COMMONLY USED MACROS ***/
176
177/* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
178 THREE_MORE_BYTES safely get one, two, and three bytes from the
179 source text respectively. If there are not enough bytes in the
180 source, they jump to `label_end_of_loop'. The caller should set
181 variables `src' and `src_end' to appropriate areas in advance. */
182
/* Fetch one byte from `src' into C1 and advance `src'.  If `src' has
   already reached `src_end', jump to `label_end_of_loop' instead,
   leaving C1 and `src' untouched.  */
#define ONE_MORE_BYTE(c1)                       \
  do {                                          \
    if (src >= src_end)                         \
      goto label_end_of_loop;                   \
    c1 = *src++;                                \
  } while (0)
190
/* Fetch two bytes from `src' into C1 and C2 (in that order) and
   advance `src' by two.  If fewer than two bytes remain before
   `src_end', jump to `label_end_of_loop' without consuming anything.

   The remaining length is computed as `src_end - src' rather than by
   comparing `src + 1' with `src_end', so that no pointer beyond
   one-past-the-end of the source is ever formed.  */
#define TWO_MORE_BYTES(c1, c2)                  \
  do {                                          \
    if (src_end - src < 2)                      \
      goto label_end_of_loop;                   \
    c1 = *src++;                                \
    c2 = *src++;                                \
  } while (0)
198
/* Fetch three bytes from `src' into C1, C2 and C3 (in that order) and
   advance `src' by three.  If fewer than three bytes remain before
   `src_end', jump to `label_end_of_loop' without consuming anything.

   The remaining length is computed as `src_end - src' rather than by
   comparing `src + 2' with `src_end', so that no pointer beyond
   one-past-the-end of the source is ever formed.  */
#define THREE_MORE_BYTES(c1, c2, c3)            \
  do {                                          \
    if (src_end - src < 3)                      \
      goto label_end_of_loop;                   \
    c1 = *src++;                                \
    c2 = *src++;                                \
    c3 = *src++;                                \
  } while (0)
206
207/* The following three macros DECODE_CHARACTER_ASCII,
208 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
209 the multi-byte form of a character of each class at the place
210 pointed by `dst'. The caller should set the variable `dst' to
211 point to an appropriate area and the variable `coding' to point to
212 the coding-system of the currently decoding text in advance. */
213
/* Decode one ASCII character C, storing it at `dst' and advancing `dst'.

   Outside of a composition, C is stored as a single byte and
   coding->produced_char is incremented; coding->fake_multibyte is set
   if C is 0x80 or greater.

   While composing (COMPOSING_P (coding->composing)), C is stored as
   the two bytes 0xA0 and (C | 0x80) and coding->composed_chars is
   incremented; coding->fake_multibyte is set if (C | 0x80) is less
   than 0xA0.

   C is evaluated more than once.  */

#define DECODE_CHARACTER_ASCII(c)                       \
  do {                                                  \
    if (!COMPOSING_P (coding->composing))               \
      {                                                 \
        *dst++ = (c);                                   \
        coding->produced_char++;                        \
        if ((c) >= 0x80)                                \
          coding->fake_multibyte = 1;                   \
      }                                                 \
    else                                                \
      {                                                 \
        *dst++ = 0xA0;                                  \
        *dst++ = (c) | 0x80;                            \
        coding->composed_chars++;                       \
        if (((c) | 0x80) < 0xA0)                        \
          coding->fake_multibyte = 1;                   \
      }                                                 \
  } while (0)
233
/* Decode one DIMENSION1 character whose charset is CHARSET and whose
   position-code is C, storing its multi-byte form at `dst'.

   The first byte is CHARSET_LEADING_CODE_BASE (CHARSET); while
   composing it is stored with 0x20 added and coding->composed_chars
   is incremented, otherwise it is stored as is and
   coding->produced_char is incremented.  If
   CHARSET_LEADING_CODE_EXT (CHARSET) is nonzero it is stored next.
   Finally (C | 0x80) is stored, and coding->fake_multibyte is set if
   that byte is less than 0xA0.

   C is evaluated more than once.  */

#define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
  do {                                                                  \
    unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
    unsigned char leading_code_ext;                                     \
    if (!COMPOSING_P (coding->composing))                               \
      {                                                                 \
        *dst++ = leading_code;                                          \
        coding->produced_char++;                                        \
      }                                                                 \
    else                                                                \
      {                                                                 \
        *dst++ = leading_code + 0x20;                                   \
        coding->composed_chars++;                                       \
      }                                                                 \
    leading_code_ext = CHARSET_LEADING_CODE_EXT (charset);              \
    if (leading_code_ext != 0)                                          \
      *dst++ = leading_code_ext;                                        \
    *dst++ = (c) | 0x80;                                                \
    if (((c) | 0x80) < 0xA0)                                            \
      coding->fake_multibyte = 1;                                       \
  } while (0)
256
/* Decode one DIMENSION2 character whose charset is CHARSET and whose
   position-codes are C1 and C2.

   The leading code(s) and the first position-code are produced by
   DECODE_CHARACTER_DIMENSION1 (CHARSET, C1); then (C2 | 0x80) is
   stored at `dst', and coding->fake_multibyte is set if that byte is
   less than 0xA0.

   C2 is evaluated more than once.  */

#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
  do {                                                  \
    DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
    *dst++ = (c2) | 0x80;                               \
    if (((c2) | 0x80) < 0xA0)                           \
      {                                                 \
        coding->fake_multibyte = 1;                     \
      }                                                 \
  } while (0)
267
268\f
269/*** 1. Preamble ***/
270
271#include <stdio.h>
272
273#ifdef emacs
274
275#include <config.h>
276#include "lisp.h"
277#include "buffer.h"
278#include "charset.h"
279#include "ccl.h"
280#include "coding.h"
281#include "window.h"
282
283#else /* not emacs */
284
285#include "mulelib.h"
286
287#endif /* not emacs */
288
289Lisp_Object Qcoding_system, Qeol_type;
290Lisp_Object Qbuffer_file_coding_system;
291Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
27901516 292Lisp_Object Qno_conversion, Qundecided;
bb0115a2 293Lisp_Object Qcoding_system_history;
70c22245 294Lisp_Object Qsafe_charsets;
1397dc18 295Lisp_Object Qvalid_codes;
4ed46869
KH
296
297extern Lisp_Object Qinsert_file_contents, Qwrite_region;
298Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
299Lisp_Object Qstart_process, Qopen_network_stream;
300Lisp_Object Qtarget_idx;
301
d46c5b12
KH
302Lisp_Object Vselect_safe_coding_system_function;
303
7722baf9
EZ
304/* Mnemonic string for each format of end-of-line. */
305Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
306/* Mnemonic string to indicate format of end-of-line is not yet
4ed46869 307 decided. */
7722baf9 308Lisp_Object eol_mnemonic_undecided;
4ed46869 309
9ce27fde
KH
310/* Format of end-of-line decided by system. This is CODING_EOL_LF on
311 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
312int system_eol_type;
313
4ed46869
KH
314#ifdef emacs
315
4608c386
KH
316Lisp_Object Vcoding_system_list, Vcoding_system_alist;
317
318Lisp_Object Qcoding_system_p, Qcoding_system_error;
4ed46869 319
d46c5b12
KH
320/* Coding system emacs-mule and raw-text are for converting only
321 end-of-line format. */
322Lisp_Object Qemacs_mule, Qraw_text;
9ce27fde 323
4ed46869
KH
324/* Coding-systems are handed between Emacs Lisp programs and C internal
325 routines by the following three variables. */
326/* Coding-system for reading files and receiving data from process. */
327Lisp_Object Vcoding_system_for_read;
328/* Coding-system for writing files and sending data to process. */
329Lisp_Object Vcoding_system_for_write;
330/* Coding-system actually used in the latest I/O. */
331Lisp_Object Vlast_coding_system_used;
332
c4825358 333/* A vector of length 256 which contains information about special
94487c4e 334 Latin codes (especially for dealing with Microsoft codes). */
3f003981 335Lisp_Object Vlatin_extra_code_table;
c4825358 336
9ce27fde
KH
337/* Flag to inhibit code conversion of end-of-line format. */
338int inhibit_eol_conversion;
339
ed29121d
EZ
340/* Flag to make buffer-file-coding-system inherit from process-coding. */
341int inherit_process_coding_system;
342
c4825358 343/* Coding system to be used to encode text for terminal display. */
4ed46869
KH
344struct coding_system terminal_coding;
345
c4825358
KH
346/* Coding system to be used to encode text for terminal display when
347 terminal coding system is nil. */
348struct coding_system safe_terminal_coding;
349
350/* Coding system of what is sent from terminal keyboard. */
4ed46869
KH
351struct coding_system keyboard_coding;
352
6bc51348
KH
353/* Default coding system to be used to write a file. */
354struct coding_system default_buffer_file_coding;
355
02ba4723
KH
356Lisp_Object Vfile_coding_system_alist;
357Lisp_Object Vprocess_coding_system_alist;
358Lisp_Object Vnetwork_coding_system_alist;
4ed46869
KH
359
360#endif /* emacs */
361
d46c5b12 362Lisp_Object Qcoding_category, Qcoding_category_index;
4ed46869
KH
363
364/* List of symbols `coding-category-xxx' ordered by priority. */
365Lisp_Object Vcoding_category_list;
366
d46c5b12
KH
367/* Table of coding categories (Lisp symbols). */
368Lisp_Object Vcoding_category_table;
4ed46869
KH
369
370/* Table of names of symbol for each coding-category. */
371char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
0ef69138 372 "coding-category-emacs-mule",
4ed46869
KH
373 "coding-category-sjis",
374 "coding-category-iso-7",
d46c5b12 375 "coding-category-iso-7-tight",
4ed46869
KH
376 "coding-category-iso-8-1",
377 "coding-category-iso-8-2",
7717c392
KH
378 "coding-category-iso-7-else",
379 "coding-category-iso-8-else",
89fa8b36 380 "coding-category-ccl",
4ed46869 381 "coding-category-big5",
27901516 382 "coding-category-raw-text",
89fa8b36 383 "coding-category-binary"
4ed46869
KH
384};
385
66cfb530 386/* Table of pointers to coding systems corresponding to each coding
d46c5b12
KH
387 categories. */
388struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
389
66cfb530
KH
390/* Table of coding category masks. Nth element is a mask for a coding
391 category of which priority is Nth. */
392static
393int coding_priorities[CODING_CATEGORY_IDX_MAX];
394
f967223b
KH
395/* Flag to tell if we look up translation table on character code
396 conversion. */
84fbb8a0 397Lisp_Object Venable_character_translation;
f967223b
KH
398/* Standard translation table to look up on decoding (reading). */
399Lisp_Object Vstandard_translation_table_for_decode;
400/* Standard translation table to look up on encoding (writing). */
401Lisp_Object Vstandard_translation_table_for_encode;
84fbb8a0 402
f967223b
KH
403Lisp_Object Qtranslation_table;
404Lisp_Object Qtranslation_table_id;
405Lisp_Object Qtranslation_table_for_decode;
406Lisp_Object Qtranslation_table_for_encode;
4ed46869
KH
407
408/* Alist of charsets vs revision number. */
409Lisp_Object Vcharset_revision_alist;
410
02ba4723
KH
411/* Default coding systems used for process I/O. */
412Lisp_Object Vdefault_process_coding_system;
413
4ed46869 414\f
0ef69138 415/*** 2. Emacs internal format (emacs-mule) handlers ***/
4ed46869
KH
416
417/* Emacs' internal format for encoding multiple character sets is a
f4dee582
RS
418 kind of multi-byte encoding, i.e. characters are encoded by
419 variable-length sequences of one-byte codes. ASCII characters
420 and control characters (e.g. `tab', `newline') are represented by
421 one-byte sequences which are their ASCII codes, in the range 0x00
422 through 0x7F. The other characters are represented by a sequence
423 of `base leading-code', optional `extended leading-code', and one
424 or two `position-code's. The length of the sequence is determined
425 by the base leading-code. Leading-code takes the range 0x80
426 through 0x9F, whereas extended leading-code and position-code take
427 the range 0xA0 through 0xFF. See `charset.h' for more details
428 about leading-code and position-code.
429
430 There's one exception to this rule. Special leading-code
4ed46869
KH
431 `leading-code-composition' denotes that the following several
432 characters should be composed into one character. Leading-codes of
433 components (except for ASCII) are added 0x20. An ASCII character
434 component is represented by a 2-byte sequence of `0xA0' and
f4dee582
RS
435 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
436 details of composite character. Hence, we can summarize the code
4ed46869
KH
437 range as follows:
438
439 --- CODE RANGE of Emacs' internal format ---
440 (character set) (range)
441 ASCII 0x00 .. 0x7F
442 ELSE (1st byte) 0x80 .. 0x9F
443 (rest bytes) 0xA0 .. 0xFF
444 ---------------------------------------------
445
446 */
447
448enum emacs_code_class_type emacs_code_class[256];
449
/* Consume one byte at `src' and go on to the next statement only if
   that byte is 0xA0 or greater.  If `src' has already reached
   `src_end', jump to `label_end_of_switch' instead; if the byte is
   less than 0xA0, return 0 from the enclosing function.  */
#define CHECK_CODE_RANGE_A0_FF                  \
  do {                                          \
    if (src >= src_end)                         \
      goto label_end_of_switch;                 \
    if (*src++ < 0xA0)                          \
      return 0;                                 \
  } while (0)
459
460/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
461 Check if a text is encoded in Emacs' internal format. If it is,
d46c5b12 462 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
4ed46869
KH
463
464int
0ef69138 465detect_coding_emacs_mule (src, src_end)
4ed46869
KH
466 unsigned char *src, *src_end;
467{
468 unsigned char c;
469 int composing = 0;
470
471 while (src < src_end)
472 {
473 c = *src++;
474
475 if (composing)
476 {
477 if (c < 0xA0)
478 composing = 0;
479 else
480 c -= 0x20;
481 }
482
483 switch (emacs_code_class[c])
484 {
485 case EMACS_ascii_code:
486 case EMACS_linefeed_code:
487 break;
488
489 case EMACS_control_code:
490 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
491 return 0;
492 break;
493
494 case EMACS_invalid_code:
495 return 0;
496
497 case EMACS_leading_code_composition: /* c == 0x80 */
498 if (composing)
499 CHECK_CODE_RANGE_A0_FF;
500 else
501 composing = 1;
502 break;
503
504 case EMACS_leading_code_4:
505 CHECK_CODE_RANGE_A0_FF;
506 /* fall down to check it two more times ... */
507
508 case EMACS_leading_code_3:
509 CHECK_CODE_RANGE_A0_FF;
510 /* fall down to check it one more time ... */
511
512 case EMACS_leading_code_2:
513 CHECK_CODE_RANGE_A0_FF;
514 break;
515
516 default:
517 label_end_of_switch:
518 break;
519 }
520 }
0ef69138 521 return CODING_CATEGORY_MASK_EMACS_MULE;
4ed46869
KH
522}
523
524\f
525/*** 3. ISO2022 handlers ***/
526
527/* The following note describes the coding system ISO2022 briefly.
39787efd
KH
528 Since the intention of this note is to help understand the
529 functions in this file, some parts are NOT ACCURATE or OVERLY
530 SIMPLIFIED. For thorough understanding, please refer to the
4ed46869
KH
531 original document of ISO2022.
532
533 ISO2022 provides many mechanisms to encode several character sets
39787efd
KH
534 in 7-bit and 8-bit environments. For 7-bit environments, all text
535 is encoded using bytes less than 128. This may make the encoded
536 text a little bit longer, but the text passes more easily through
537 several gateways, some of which strip off the MSB (Most Significant Bit).
538
539 There are two kinds of character sets: control character set and
4ed46869
KH
540 graphic character set. The former contains control characters such
541 as `newline' and `escape' to provide control functions (control
39787efd
KH
542 functions are also provided by escape sequences). The latter
543 contains graphic characters such as 'A' and '-'. Emacs recognizes
4ed46869
KH
544 two control character sets and many graphic character sets.
545
546 Graphic character sets are classified into one of the following
39787efd
KH
547 four classes, according to the number of bytes (DIMENSION) and
548 number of characters in one dimension (CHARS) of the set:
549 - DIMENSION1_CHARS94
550 - DIMENSION1_CHARS96
551 - DIMENSION2_CHARS94
552 - DIMENSION2_CHARS96
553
554 In addition, each character set is assigned an identification tag,
555 unique for each set, called "final character" (denoted as <F>
556 hereafter). The <F> of each character set is decided by ECMA(*)
557 when it is registered in ISO. The code range of <F> is 0x30..0x7F
558 (0x30..0x3F are for private use only).
4ed46869
KH
559
560 Note (*): ECMA = European Computer Manufacturers Association
561
562 Here are examples of graphic character set [NAME(<F>)]:
563 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
564 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
565 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
566 o DIMENSION2_CHARS96 -- none for the moment
567
39787efd 568 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4ed46869
KH
569 C0 [0x00..0x1F] -- control character plane 0
570 GL [0x20..0x7F] -- graphic character plane 0
571 C1 [0x80..0x9F] -- control character plane 1
572 GR [0xA0..0xFF] -- graphic character plane 1
573
574 A control character set is directly designated and invoked to C0 or
39787efd
KH
575 C1 by an escape sequence. The most common case is that:
576 - ISO646's control character set is designated/invoked to C0, and
577 - ISO6429's control character set is designated/invoked to C1,
578 and usually these designations/invocations are omitted in encoded
579 text. In a 7-bit environment, only C0 can be used, and a control
580 character for C1 is encoded by an appropriate escape sequence to
581 fit into the environment. All control characters for C1 are
582 defined to have corresponding escape sequences.
4ed46869
KH
583
584 A graphic character set is at first designated to one of four
585 graphic registers (G0 through G3), then these graphic registers are
586 invoked to GL or GR. These designations and invocations can be
587 done independently. The most common case is that G0 is invoked to
39787efd
KH
588 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
589 these invocations and designations are omitted in encoded text.
590 In a 7-bit environment, only GL can be used.
4ed46869 591
39787efd
KH
592 When a graphic character set of CHARS94 is invoked to GL, codes
593 0x20 and 0x7F of the GL area work as control characters SPACE and
594 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
595 be used.
4ed46869
KH
596
597 There are two ways of invocation: locking-shift and single-shift.
598 With locking-shift, the invocation lasts until the next different
39787efd
KH
599 invocation, whereas with single-shift, the invocation affects the
600 following character only and doesn't affect the locking-shift
601 state. Invocations are done by the following control characters or
602 escape sequences:
4ed46869
KH
603
604 ----------------------------------------------------------------------
39787efd 605 abbrev function cntrl escape seq description
4ed46869 606 ----------------------------------------------------------------------
39787efd
KH
607 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
608 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
609 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
610 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
611 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
612 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
613 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
614 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
615 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4ed46869 616 ----------------------------------------------------------------------
39787efd
KH
617 (*) These are not used by any known coding system.
618
619 Control characters for these functions are defined by macros
620 ISO_CODE_XXX in `coding.h'.
4ed46869 621
39787efd 622 Designations are done by the following escape sequences:
4ed46869
KH
623 ----------------------------------------------------------------------
624 escape sequence description
625 ----------------------------------------------------------------------
626 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
627 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
628 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
629 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
630 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
631 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
632 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
633 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
634 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
635 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
636 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
637 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
638 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
639 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
640 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
641 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
642 ----------------------------------------------------------------------
643
644 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
39787efd 645 of dimension 1, chars 94, and final character <F>, etc...
4ed46869
KH
646
647 Note (*): Although these designations are not allowed in ISO2022,
648 Emacs accepts them on decoding, and produces them on encoding
39787efd 649 CHARS96 character sets in a coding system which is characterized as
4ed46869
KH
650 7-bit environment, non-locking-shift, and non-single-shift.
651
652 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
39787efd 653 '(' can be omitted. We refer to this as "short-form" hereafter.
4ed46869
KH
654
655 Now you may notice that there are a lot of ways for encoding the
39787efd
KH
656 same multilingual text in ISO2022. Actually, there exist many
657 coding systems such as Compound Text (used in X11's inter client
658 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
659 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4ed46869
KH
660 localized platforms), and all of these are variants of ISO2022.
661
662 In addition to the above, Emacs handles two more kinds of escape
663 sequences: ISO6429's direction specification and Emacs' private
664 sequence for specifying character composition.
665
39787efd 666 ISO6429's direction specification takes the following form:
4ed46869
KH
667 o CSI ']' -- end of the current direction
668 o CSI '0' ']' -- end of the current direction
669 o CSI '1' ']' -- start of left-to-right text
670 o CSI '2' ']' -- start of right-to-left text
671 The control character CSI (0x9B: control sequence introducer) is
39787efd
KH
672 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
673
674 Character composition specification takes the following form:
4ed46869
KH
675 o ESC '0' -- start character composition
676 o ESC '1' -- end character composition
39787efd
KH
677 Since these are not standard escape sequences of any ISO standard,
678 the use of them with these meanings is restricted to Emacs only. */
4ed46869
KH
679
680enum iso_code_class_type iso_code_class[256];
681
f024b6aa
RS
/* Nonzero if the coding_system_table entry for category index IDX is
   non-null and that coding system either has a nonzero
   safe_charsets[CHARSET] or has a requested designation for CHARSET
   other than CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION.  Zero when the
   table entry is null.  */
#define CHARSET_OK(idx, charset)                                \
  (coding_system_table[idx] == 0                                \
   ? 0                                                          \
   : (coding_system_table[idx]->safe_charsets[charset] != 0     \
      || (CODING_SPEC_ISO_REQUESTED_DESIGNATION                 \
          (coding_system_table[idx], charset)                   \
          != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)))
d46c5b12
KH
688
/* Nonzero if the coding_system_table entry for category index IDX is
   non-null and CODING_SPEC_ISO_INITIAL_DESIGNATION for that coding
   system and register 1 (presumably graphic register G1 -- confirm
   against coding.h) is non-negative.

   The table entry is checked for null first, as CHARSET_OK does,
   because entries of coding_system_table may be unset; without the
   check an unset entry would be dereferenced.  */
#define SHIFT_OUT_OK(idx)                                               \
  (coding_system_table[idx] != 0                                        \
   && (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) \
       >= 0))
691
4ed46869
KH
692/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
693 Check if a text is encoded in ISO2022. If it is, returns an
694 integer in which appropriate flag bits any of:
695 CODING_CATEGORY_MASK_ISO_7
d46c5b12 696 CODING_CATEGORY_MASK_ISO_7_TIGHT
4ed46869
KH
697 CODING_CATEGORY_MASK_ISO_8_1
698 CODING_CATEGORY_MASK_ISO_8_2
7717c392
KH
699 CODING_CATEGORY_MASK_ISO_7_ELSE
700 CODING_CATEGORY_MASK_ISO_8_ELSE
4ed46869
KH
701 are set. If a code which should never appear in ISO2022 is found,
702 returns 0. */
703
704int
705detect_coding_iso2022 (src, src_end)
706 unsigned char *src, *src_end;
707{
d46c5b12
KH
708 int mask = CODING_CATEGORY_MASK_ISO;
709 int mask_found = 0;
f46869e4 710 int reg[4], shift_out = 0, single_shifting = 0;
d46c5b12 711 int c, c1, i, charset;
3f003981 712
d46c5b12 713 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
3f003981 714 while (mask && src < src_end)
4ed46869
KH
715 {
716 c = *src++;
717 switch (c)
718 {
719 case ISO_CODE_ESC:
f46869e4 720 single_shifting = 0;
e0e989f6 721 if (src >= src_end)
4ed46869
KH
722 break;
723 c = *src++;
d46c5b12 724 if (c >= '(' && c <= '/')
4ed46869 725 {
bf9cdd4e
KH
726 /* Designation sequence for a charset of dimension 1. */
727 if (src >= src_end)
728 break;
d46c5b12
KH
729 c1 = *src++;
730 if (c1 < ' ' || c1 >= 0x80
731 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
732 /* Invalid designation sequence. Just ignore. */
733 break;
734 reg[(c - '(') % 4] = charset;
bf9cdd4e
KH
735 }
736 else if (c == '$')
737 {
738 /* Designation sequence for a charset of dimension 2. */
739 if (src >= src_end)
740 break;
741 c = *src++;
742 if (c >= '@' && c <= 'B')
743 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
d46c5b12 744 reg[0] = charset = iso_charset_table[1][0][c];
bf9cdd4e 745 else if (c >= '(' && c <= '/')
bcf26d6a 746 {
bf9cdd4e
KH
747 if (src >= src_end)
748 break;
d46c5b12
KH
749 c1 = *src++;
750 if (c1 < ' ' || c1 >= 0x80
751 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
752 /* Invalid designation sequence. Just ignore. */
753 break;
754 reg[(c - '(') % 4] = charset;
bcf26d6a 755 }
bf9cdd4e 756 else
d46c5b12
KH
757 /* Invalid designation sequence. Just ignore. */
758 break;
759 }
ae9ff118 760 else if (c == 'N' || c == 'O')
d46c5b12 761 {
ae9ff118
KH
762 /* ESC <Fe> for SS2 or SS3. */
763 mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
d46c5b12 764 break;
4ed46869 765 }
bf9cdd4e 766 else if (c == '0' || c == '1' || c == '2')
ae9ff118 767 /* ESC <Fp> for start/end composition. Just ignore. */
d46c5b12 768 break;
bf9cdd4e 769 else
d46c5b12
KH
770 /* Invalid escape sequence. Just ignore. */
771 break;
772
773 /* We found a valid designation sequence for CHARSET. */
774 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
775 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset))
776 mask_found |= CODING_CATEGORY_MASK_ISO_7;
777 else
778 mask &= ~CODING_CATEGORY_MASK_ISO_7;
779 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset))
780 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
781 else
782 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
ae9ff118
KH
783 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset))
784 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
785 else
d46c5b12 786 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
ae9ff118
KH
787 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset))
788 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
789 else
d46c5b12 790 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
4ed46869
KH
791 break;
792
4ed46869 793 case ISO_CODE_SO:
f46869e4 794 single_shifting = 0;
d46c5b12
KH
795 if (shift_out == 0
796 && (reg[1] >= 0
797 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
798 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
799 {
800 /* Locking shift out. */
801 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
802 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
803 }
e0e989f6
KH
804 break;
805
d46c5b12 806 case ISO_CODE_SI:
f46869e4 807 single_shifting = 0;
d46c5b12
KH
808 if (shift_out == 1)
809 {
810 /* Locking shift in. */
811 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
812 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
813 }
814 break;
815
4ed46869 816 case ISO_CODE_CSI:
f46869e4 817 single_shifting = 0;
4ed46869
KH
818 case ISO_CODE_SS2:
819 case ISO_CODE_SS3:
3f003981
KH
820 {
821 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
822
70c22245
KH
823 if (c != ISO_CODE_CSI)
824 {
d46c5b12
KH
825 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
826 & CODING_FLAG_ISO_SINGLE_SHIFT)
70c22245 827 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
828 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
829 & CODING_FLAG_ISO_SINGLE_SHIFT)
70c22245 830 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
f46869e4 831 single_shifting = 1;
70c22245 832 }
3f003981
KH
833 if (VECTORP (Vlatin_extra_code_table)
834 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
835 {
d46c5b12
KH
836 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
837 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981 838 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
839 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
840 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981
KH
841 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
842 }
843 mask &= newmask;
d46c5b12 844 mask_found |= newmask;
3f003981
KH
845 }
846 break;
4ed46869
KH
847
848 default:
849 if (c < 0x80)
f46869e4
KH
850 {
851 single_shifting = 0;
852 break;
853 }
4ed46869 854 else if (c < 0xA0)
c4825358 855 {
f46869e4 856 single_shifting = 0;
3f003981
KH
857 if (VECTORP (Vlatin_extra_code_table)
858 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
c4825358 859 {
3f003981
KH
860 int newmask = 0;
861
d46c5b12
KH
862 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
863 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981 864 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
865 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
866 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981
KH
867 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
868 mask &= newmask;
d46c5b12 869 mask_found |= newmask;
c4825358 870 }
3f003981
KH
871 else
872 return 0;
c4825358 873 }
4ed46869
KH
874 else
875 {
7717c392 876 unsigned char *src_begin = src;
4ed46869 877
d46c5b12 878 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
7717c392 879 | CODING_CATEGORY_MASK_ISO_7_ELSE);
d46c5b12 880 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
f46869e4
KH
881 /* Check the length of succeeding codes of the range
882 0xA0..0xFF. If the byte length is odd, we exclude
883 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
884 when we are not single shifting. */
885 if (!single_shifting)
886 {
887 while (src < src_end && *src >= 0xA0)
888 src++;
889 if ((src - src_begin - 1) & 1 && src < src_end)
890 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
891 else
892 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
893 }
4ed46869
KH
894 }
895 break;
896 }
897 }
898
d46c5b12 899 return (mask & mask_found);
4ed46869
KH
900}
901
/* Decode one character whose charset is CHARSET and whose first
   position code is C1, storing the result at DST.

   If CHARSET has dimension 2, the second position code is fetched
   from SRC into C2 with ONE_MORE_BYTE.  If the class of that byte
   (looked up with the top bit masked off) is neither
   ISO_0x20_or_0x7F nor ISO_graphic_plane_0, the byte is pushed back
   onto SRC and C1 is decoded as ASCII instead.  A negative CHARSET
   means that we are decoding ill formed text, and all we can do is
   to read C1 as is.

   The expansion refers to these names from the surrounding scope:
   `coding', `src', `dst', `c2' and `translation_table'.  */

#define DECODE_ISO_CHARACTER(charset, c1)                               \
  do {                                                                  \
    int c_alt;                                                          \
    int charset_alt = (charset);                                        \
                                                                        \
    if (COMPOSING_HEAD_P (coding->composing))                           \
      {                                                                 \
        *dst++ = LEADING_CODE_COMPOSITION;                              \
        if (COMPOSING_WITH_RULE_P (coding->composing))                  \
          {                                                             \
            /* To tell composition rules are embedded.  */              \
            *dst++ = 0xFF;                                              \
          }                                                             \
        coding->composing += 2;                                         \
      }                                                                 \
                                                                        \
    if (charset_alt >= 0)                                               \
      {                                                                 \
        if (CHARSET_DIMENSION (charset_alt) == 2)                       \
          {                                                             \
            ONE_MORE_BYTE (c2);                                         \
            if (! (iso_code_class[(c2) & 0x7F] == ISO_0x20_or_0x7F      \
                   || (iso_code_class[(c2) & 0x7F]                      \
                       == ISO_graphic_plane_0)))                        \
              {                                                         \
                /* Not a usable 2nd byte: unread it, fall back to       \
                   ASCII for C1.  */                                    \
                src--;                                                  \
                charset_alt = CHARSET_ASCII;                            \
              }                                                         \
          }                                                             \
        if (!NILP (translation_table))                                  \
          {                                                             \
            c_alt = translate_char (translation_table,                  \
                                    -1, charset_alt, c1, c2);           \
            if (c_alt >= 0)                                             \
              {                                                         \
                SPLIT_CHAR (c_alt, charset_alt, c1, c2);                \
              }                                                         \
          }                                                             \
      }                                                                 \
                                                                        \
    if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
      {                                                                 \
        DECODE_CHARACTER_ASCII (c1);                                    \
      }                                                                 \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
      {                                                                 \
        DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                  \
      }                                                                 \
    else                                                                \
      {                                                                 \
        DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);              \
      }                                                                 \
                                                                        \
    if (COMPOSING_WITH_RULE_P (coding->composing))                      \
      {                                                                 \
        /* To tell a composition rule follows.  */                      \
        coding->composing = COMPOSING_WITH_RULE_RULE;                   \
      }                                                                 \
  } while (0)
946
/* Set designation state into CODING: designate the charset selected
   by DIMENSION, CHARS and FINAL_CHAR to graphic register REG.

   Control jumps to `label_invalid_code' (which must exist at the
   expansion site) when FINAL_CHAR is out of range, when no charset
   matches, or when the charset is neither requested for REG nor
   listed in CODING->safe_charsets.  In the latter two cases REG is
   remembered in `last_invalid_designation_register'.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)                \
  do {                                                                       \
    int charset;                                                             \
                                                                             \
    if (final_char < '0' || final_char >= 128)                               \
      goto label_invalid_code;                                               \
    charset = ISO_CHARSET_TABLE (make_number (dimension),                    \
                                 make_number (chars),                        \
                                 make_number (final_char));                  \
    if (charset < 0                                                          \
        || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) != reg   \
            && !coding->safe_charsets[charset]))                             \
      {                                                                      \
        /* Unknown or unacceptable charset for this register.  */            \
        coding->spec.iso2022.last_invalid_designation_register = reg;        \
        goto label_invalid_code;                                             \
      }                                                                      \
                                                                             \
    if (coding->spec.iso2022.last_invalid_designation_register == 0          \
        && reg == 0                                                          \
        && charset == CHARSET_ASCII)                                         \
      {                                                                      \
        /* We should insert this designation sequence as is so               \
           that it is surely written back to a file.  */                     \
        coding->spec.iso2022.last_invalid_designation_register = -1;         \
        goto label_invalid_code;                                             \
      }                                                                      \
    coding->spec.iso2022.last_invalid_designation_register = -1;             \
                                                                             \
    /* In right-to-left mode use the reverse charset when one exists.  */    \
    if ((coding->mode & CODING_MODE_DIRECTION)                               \
        && CHARSET_REVERSE_CHARSET (charset) >= 0)                           \
      {                                                                      \
        charset = CHARSET_REVERSE_CHARSET (charset);                         \
      }                                                                      \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                     \
  } while (0)
982
88993dfd
KH
983/* Return 0 if there's a valid composing sequence starting at SRC and
984 ending before SRC_END, else return -1. */
d46c5b12 985
84fbb8a0
KH
986int
987check_composing_code (coding, src, src_end)
d46c5b12
KH
988 struct coding_system *coding;
989 unsigned char *src, *src_end;
990{
d46c5b12
KH
991 int charset, c, c1, dim;
992
993 while (src < src_end)
994 {
88993dfd
KH
995 c = *src++;
996 if (c >= 0x20)
997 continue;
998 if (c != ISO_CODE_ESC || src >= src_end)
999 return -1;
1000 c = *src++;
1001 if (c == '1') /* end of compsition */
1002 return 0;
1003 if (src + 2 >= src_end
1004 || !coding->flags & CODING_FLAG_ISO_DESIGNATION)
1005 return -1;
1006
1007 dim = (c == '$');
1008 if (dim == 1)
1009 c = (*src >= '@' && *src <= 'B') ? '(' : *src++;
1010 if (c >= '(' && c <= '/')
d46c5b12 1011 {
88993dfd
KH
1012 c1 = *src++;
1013 if ((c1 < ' ' || c1 >= 0x80)
1014 || (charset = iso_charset_table[dim][c >= ','][c1]) < 0
1015 || ! coding->safe_charsets[charset]
1016 || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
1017 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1018 return -1;
d46c5b12 1019 }
88993dfd
KH
1020 else
1021 return -1;
d46c5b12 1022 }
88993dfd
KH
1023
1024 /* We have not found the sequence "ESC 1". */
1025 return -1;
d46c5b12
KH
1026}
1027
4ed46869
KH
1028/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1029
1030int
d46c5b12 1031decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
1032 struct coding_system *coding;
1033 unsigned char *source, *destination;
1034 int src_bytes, dst_bytes;
4ed46869
KH
1035{
1036 unsigned char *src = source;
1037 unsigned char *src_end = source + src_bytes;
1038 unsigned char *dst = destination;
1039 unsigned char *dst_end = destination + dst_bytes;
1040 /* Since the maximum bytes produced by each loop is 7, we subtract 6
1041 from DST_END to assure that overflow checking is necessary only
1042 at the head of loop. */
1043 unsigned char *adjusted_dst_end = dst_end - 6;
1044 int charset;
1045 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1046 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1047 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
84fbb8a0 1048 Lisp_Object translation_table
f967223b 1049 = coding->translation_table_for_decode;
d46c5b12 1050 int result = CODING_FINISH_NORMAL;
bdd9fb48 1051
84fbb8a0 1052 if (!NILP (Venable_character_translation) && NILP (translation_table))
f967223b 1053 translation_table = Vstandard_translation_table_for_decode;
4ed46869 1054
d46c5b12 1055 coding->produced_char = 0;
fb88bf2d 1056 coding->fake_multibyte = 0;
d46c5b12
KH
1057 while (src < src_end && (dst_bytes
1058 ? (dst < adjusted_dst_end)
1059 : (dst < src - 6)))
4ed46869
KH
1060 {
1061 /* SRC_BASE remembers the start position in source in each loop.
1062 The loop will be exited when there's not enough source text
1063 to analyze long escape sequence or 2-byte code (within macros
1064 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
1065 to SRC_BASE before exiting. */
1066 unsigned char *src_base = src;
bdd9fb48 1067 int c1 = *src++, c2;
4ed46869
KH
1068
1069 switch (iso_code_class [c1])
1070 {
1071 case ISO_0x20_or_0x7F:
1072 if (!coding->composing
1073 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
1074 {
1075 /* This is SPACE or DEL. */
1076 *dst++ = c1;
d46c5b12 1077 coding->produced_char++;
4ed46869
KH
1078 break;
1079 }
1080 /* This is a graphic character, we fall down ... */
1081
1082 case ISO_graphic_plane_0:
1083 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1084 {
1085 /* This is a composition rule. */
1086 *dst++ = c1 | 0x80;
1087 coding->composing = COMPOSING_WITH_RULE_TAIL;
1088 }
1089 else
1090 DECODE_ISO_CHARACTER (charset0, c1);
1091 break;
1092
1093 case ISO_0xA0_or_0xFF:
d46c5b12
KH
1094 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1095 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
fb88bf2d 1096 goto label_invalid_code;
4ed46869
KH
1097 /* This is a graphic character, we fall down ... */
1098
1099 case ISO_graphic_plane_1:
d46c5b12 1100 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
fb88bf2d 1101 goto label_invalid_code;
d46c5b12
KH
1102 else
1103 DECODE_ISO_CHARACTER (charset1, c1);
4ed46869
KH
1104 break;
1105
1106 case ISO_control_code:
1107 /* All ISO2022 control characters in this class have the
1108 same representation in Emacs internal format. */
d46c5b12
KH
1109 if (c1 == '\n'
1110 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1111 && (coding->eol_type == CODING_EOL_CR
1112 || coding->eol_type == CODING_EOL_CRLF))
1113 {
1114 result = CODING_FINISH_INCONSISTENT_EOL;
1115 goto label_end_of_loop_2;
1116 }
4ed46869 1117 *dst++ = c1;
d46c5b12 1118 coding->produced_char++;
174a4cbe
KH
1119 if (c1 >= 0x80)
1120 coding->fake_multibyte = 1;
4ed46869
KH
1121 break;
1122
1123 case ISO_carriage_return:
1124 if (coding->eol_type == CODING_EOL_CR)
d46c5b12 1125 *dst++ = '\n';
4ed46869
KH
1126 else if (coding->eol_type == CODING_EOL_CRLF)
1127 {
1128 ONE_MORE_BYTE (c1);
1129 if (c1 == ISO_CODE_LF)
1130 *dst++ = '\n';
1131 else
1132 {
d46c5b12
KH
1133 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1134 {
1135 result = CODING_FINISH_INCONSISTENT_EOL;
1136 goto label_end_of_loop_2;
1137 }
4ed46869 1138 src--;
d46c5b12 1139 *dst++ = '\r';
4ed46869
KH
1140 }
1141 }
1142 else
d46c5b12
KH
1143 *dst++ = c1;
1144 coding->produced_char++;
4ed46869
KH
1145 break;
1146
1147 case ISO_shift_out:
d46c5b12
KH
1148 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1149 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1150 goto label_invalid_code;
4ed46869
KH
1151 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1152 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1153 break;
1154
1155 case ISO_shift_in:
d46c5b12
KH
1156 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1157 goto label_invalid_code;
4ed46869
KH
1158 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1159 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1160 break;
1161
1162 case ISO_single_shift_2_7:
1163 case ISO_single_shift_2:
d46c5b12
KH
1164 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1165 goto label_invalid_code;
4ed46869
KH
1166 /* SS2 is handled as an escape sequence of ESC 'N' */
1167 c1 = 'N';
1168 goto label_escape_sequence;
1169
1170 case ISO_single_shift_3:
d46c5b12
KH
1171 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1172 goto label_invalid_code;
4ed46869
KH
1173 /* SS2 is handled as an escape sequence of ESC 'O' */
1174 c1 = 'O';
1175 goto label_escape_sequence;
1176
1177 case ISO_control_sequence_introducer:
1178 /* CSI is handled as an escape sequence of ESC '[' ... */
1179 c1 = '[';
1180 goto label_escape_sequence;
1181
1182 case ISO_escape:
1183 ONE_MORE_BYTE (c1);
1184 label_escape_sequence:
1185 /* Escape sequences handled by Emacs are invocation,
1186 designation, direction specification, and character
1187 composition specification. */
1188 switch (c1)
1189 {
1190 case '&': /* revision of following character set */
1191 ONE_MORE_BYTE (c1);
1192 if (!(c1 >= '@' && c1 <= '~'))
d46c5b12 1193 goto label_invalid_code;
4ed46869
KH
1194 ONE_MORE_BYTE (c1);
1195 if (c1 != ISO_CODE_ESC)
d46c5b12 1196 goto label_invalid_code;
4ed46869
KH
1197 ONE_MORE_BYTE (c1);
1198 goto label_escape_sequence;
1199
1200 case '$': /* designation of 2-byte character set */
d46c5b12
KH
1201 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1202 goto label_invalid_code;
4ed46869
KH
1203 ONE_MORE_BYTE (c1);
1204 if (c1 >= '@' && c1 <= 'B')
1205 { /* designation of JISX0208.1978, GB2312.1980,
88993dfd 1206 or JISX0208.1980 */
4ed46869
KH
1207 DECODE_DESIGNATION (0, 2, 94, c1);
1208 }
1209 else if (c1 >= 0x28 && c1 <= 0x2B)
1210 { /* designation of DIMENSION2_CHARS94 character set */
1211 ONE_MORE_BYTE (c2);
1212 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1213 }
1214 else if (c1 >= 0x2C && c1 <= 0x2F)
1215 { /* designation of DIMENSION2_CHARS96 character set */
1216 ONE_MORE_BYTE (c2);
1217 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1218 }
1219 else
d46c5b12 1220 goto label_invalid_code;
4ed46869
KH
1221 break;
1222
1223 case 'n': /* invocation of locking-shift-2 */
d46c5b12
KH
1224 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1225 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1226 goto label_invalid_code;
4ed46869 1227 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
e0e989f6 1228 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
1229 break;
1230
1231 case 'o': /* invocation of locking-shift-3 */
d46c5b12
KH
1232 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1233 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1234 goto label_invalid_code;
4ed46869 1235 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
e0e989f6 1236 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
1237 break;
1238
1239 case 'N': /* invocation of single-shift-2 */
d46c5b12
KH
1240 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1241 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1242 goto label_invalid_code;
4ed46869
KH
1243 ONE_MORE_BYTE (c1);
1244 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1245 DECODE_ISO_CHARACTER (charset, c1);
1246 break;
1247
1248 case 'O': /* invocation of single-shift-3 */
d46c5b12
KH
1249 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1250 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1251 goto label_invalid_code;
4ed46869
KH
1252 ONE_MORE_BYTE (c1);
1253 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1254 DECODE_ISO_CHARACTER (charset, c1);
1255 break;
1256
d46c5b12
KH
1257 case '0': case '2': /* start composing */
1258 /* Before processing composing, we must be sure that all
1259 characters being composed are supported by CODING.
88993dfd
KH
1260 If not, we must give up composing. */
1261 if (check_composing_code (coding, src, src_end) == 0)
1262 {
1263 /* We are looking at a valid composition sequence. */
1264 coding->composing = (c1 == '0'
1265 ? COMPOSING_NO_RULE_HEAD
1266 : COMPOSING_WITH_RULE_HEAD);
1267 coding->composed_chars = 0;
1268 }
1269 else
1270 {
1271 *dst++ = ISO_CODE_ESC;
1272 *dst++ = c1;
1273 coding->produced_char += 2;
1274 }
4ed46869
KH
1275 break;
1276
1277 case '1': /* end composing */
88993dfd
KH
1278 if (!coding->composing)
1279 {
1280 *dst++ = ISO_CODE_ESC;
1281 *dst++ = c1;
1282 coding->produced_char += 2;
1283 break;
1284 }
1285
de79a6a5
KH
1286 if (coding->composed_chars > 0)
1287 {
1288 if (coding->composed_chars == 1)
1289 {
1290 unsigned char *this_char_start = dst;
1291 int this_bytes;
1292
1293 /* Only one character is in the composing
1294 sequence. Make it a normal character. */
1295 while (*--this_char_start != LEADING_CODE_COMPOSITION);
1296 dst = (this_char_start
1297 + (coding->composing == COMPOSING_NO_RULE_TAIL
1298 ? 1 : 2));
1299 *dst -= 0x20;
1300 if (*dst == 0x80)
1301 *++dst &= 0x7F;
1302 this_bytes = BYTES_BY_CHAR_HEAD (*dst);
1303 while (this_bytes--) *this_char_start++ = *dst++;
1304 dst = this_char_start;
1305 }
1306 coding->produced_char++;
1307 }
4ed46869 1308 coding->composing = COMPOSING_NO;
4ed46869
KH
1309 break;
1310
1311 case '[': /* specification of direction */
d46c5b12
KH
1312 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1313 goto label_invalid_code;
4ed46869 1314 /* For the moment, nested direction is not supported.
d46c5b12
KH
1315 So, `coding->mode & CODING_MODE_DIRECTION' zero means
1316 left-to-right, and nozero means right-to-left. */
4ed46869
KH
1317 ONE_MORE_BYTE (c1);
1318 switch (c1)
1319 {
1320 case ']': /* end of the current direction */
d46c5b12 1321 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869
KH
1322
1323 case '0': /* end of the current direction */
1324 case '1': /* start of left-to-right direction */
1325 ONE_MORE_BYTE (c1);
1326 if (c1 == ']')
d46c5b12 1327 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869 1328 else
d46c5b12 1329 goto label_invalid_code;
4ed46869
KH
1330 break;
1331
1332 case '2': /* start of right-to-left direction */
1333 ONE_MORE_BYTE (c1);
1334 if (c1 == ']')
d46c5b12 1335 coding->mode |= CODING_MODE_DIRECTION;
4ed46869 1336 else
d46c5b12 1337 goto label_invalid_code;
4ed46869
KH
1338 break;
1339
1340 default:
d46c5b12 1341 goto label_invalid_code;
4ed46869
KH
1342 }
1343 break;
1344
1345 default:
d46c5b12
KH
1346 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1347 goto label_invalid_code;
4ed46869
KH
1348 if (c1 >= 0x28 && c1 <= 0x2B)
1349 { /* designation of DIMENSION1_CHARS94 character set */
1350 ONE_MORE_BYTE (c2);
1351 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1352 }
1353 else if (c1 >= 0x2C && c1 <= 0x2F)
1354 { /* designation of DIMENSION1_CHARS96 character set */
1355 ONE_MORE_BYTE (c2);
1356 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1357 }
1358 else
1359 {
d46c5b12 1360 goto label_invalid_code;
4ed46869
KH
1361 }
1362 }
1363 /* We must update these variables now. */
1364 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1365 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1366 break;
1367
d46c5b12 1368 label_invalid_code:
d46c5b12
KH
1369 while (src_base < src)
1370 *dst++ = *src_base++;
fb88bf2d 1371 coding->fake_multibyte = 1;
4ed46869
KH
1372 }
1373 continue;
1374
1375 label_end_of_loop:
d46c5b12
KH
1376 result = CODING_FINISH_INSUFFICIENT_SRC;
1377 label_end_of_loop_2:
4ed46869
KH
1378 src = src_base;
1379 break;
1380 }
1381
fb88bf2d 1382 if (src < src_end)
4ed46869 1383 {
fb88bf2d
KH
1384 if (result == CODING_FINISH_NORMAL)
1385 result = CODING_FINISH_INSUFFICIENT_DST;
1386 else if (result != CODING_FINISH_INCONSISTENT_EOL
1387 && coding->mode & CODING_MODE_LAST_BLOCK)
1388 {
1389 /* This is the last block of the text to be decoded. We had
1390 better just flush out all remaining codes in the text
1391 although they are not valid characters. */
1392 src_bytes = src_end - src;
1393 if (dst_bytes && (dst_end - dst < src_bytes))
1394 src_bytes = dst_end - dst;
1395 bcopy (src, dst, src_bytes);
1396 dst += src_bytes;
1397 src += src_bytes;
1398 coding->fake_multibyte = 1;
1399 }
4ed46869 1400 }
fb88bf2d 1401
d46c5b12
KH
1402 coding->consumed = coding->consumed_char = src - source;
1403 coding->produced = dst - destination;
1404 return result;
4ed46869
KH
1405}
1406
f4dee582 1407/* ISO2022 encoding stuff. */
4ed46869
KH
1408
1409/*
f4dee582 1410 It is not enough to say just "ISO2022" on encoding, we have to
d46c5b12 1411 specify more details. In Emacs, each coding system of ISO2022
4ed46869
KH
1412 variant has the following specifications:
1413 1. Initial designation to G0 thru G3.
1414 2. Allows short-form designation?
1415 3. ASCII should be designated to G0 before control characters?
1416 4. ASCII should be designated to G0 at end of line?
1417 5. 7-bit environment or 8-bit environment?
1418 6. Use locking-shift?
1419 7. Use Single-shift?
1420 And the following two are only for Japanese:
1421 8. Use ASCII in place of JIS0201-1976-Roman?
1422 9. Use JISX0208-1983 in place of JISX0208-1978?
1423 These specifications are encoded in `coding->flags' as flag bits
1424 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
f4dee582 1425 details.
4ed46869
KH
1426*/
1427
/* Produce codes (escape sequence) for designating CHARSET to graphic
   register REG, and record the designation in CODING.  If
   <final-char> of CHARSET is '@', 'A', or 'B' and the coding system
   CODING allows, produce designation sequence of short-form (i.e.
   without the intermediate character).  When the revision number of
   CHARSET is less than 255, the sequence is preceded by
   "ESC & <'@' + revision>".

   Output goes through `dst', which must be in scope at the
   expansion site.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
    char *intermediate_char_94 = "()*+";                                \
    char *intermediate_char_96 = ",-./";                                \
    int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);    \
                                                                        \
    if (revision < 255)                                                 \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = '&';                                                   \
        *dst++ = '@' + revision;                                        \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    /* Multi-byte charsets are introduced by '$'.  */                   \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      *dst++ = '$';                                                     \
    /* Intermediate character: selected by REG from the 94- or          \
       96-charset table; omitted only for the short form.  */           \
    if (CHARSET_CHARS (charset) != 94)                                  \
      *dst++ = (unsigned char) (intermediate_char_96[reg]);             \
    else if (CHARSET_DIMENSION (charset) == 1                           \
             || ! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
             || reg != 0                                                \
             || final_char < '@' || final_char > 'B')                   \
      *dst++ = (unsigned char) (intermediate_char_94[reg]);             \
    *dst++ = final_char;                                                \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
1469
/* The following two macros produce codes (control character or escape
   sequence) for ISO2022 single-shift functions (single-shift-2 and
   single-shift-3).  With CODING_FLAG_ISO_SEVEN_BITS the two-byte
   escape form is written; otherwise the single control byte is
   written and `coding->fake_multibyte' is set.  Both macros also set
   the single-shifting flag of CODING.  */

#define ENCODE_SINGLE_SHIFT_2                           \
  do {                                                  \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      {                                                 \
        *dst++ = ISO_CODE_SS2;                          \
        coding->fake_multibyte = 1;                     \
      }                                                 \
    else                                                \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = 'N';                                   \
      }                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                           \
  do {                                                  \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      {                                                 \
        *dst++ = ISO_CODE_SS3;                          \
        coding->fake_multibyte = 1;                     \
      }                                                 \
    else                                                \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = 'O';                                   \
      }                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
  } while (0)
1497
/* The following four macros produce codes (control character or
   escape sequence) for ISO2022 locking-shift functions (shift-in,
   shift-out, locking-shift-2, and locking-shift-3).  Each one also
   records in CODING which graphic register is now invoked to graphic
   plane 0.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
    *dst++ = ISO_CODE_SI;                               \
  } while (0)

#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
    *dst++ = ISO_CODE_SO;                               \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'n';                                       \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'o';                                       \
  } while (0)
1525
f4dee582
RS
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   The body is a loop: each pass either emits the character and
   leaves, or calls encode_invocation_designation and tries again.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                     \
  do {                                                                   \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                        \
      {                                                                  \
        /* The single-shifting flag is set; clear it after output.  */   \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                  \
          *dst++ = c1 & 0x7F;                                            \
        else                                                             \
          *dst++ = c1 | 0x80;                                            \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                    \
        break;                                                           \
      }                                                                  \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))            \
      {                                                                  \
        *dst++ = c1 & 0x7F;                                              \
        break;                                                           \
      }                                                                  \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))            \
      {                                                                  \
        *dst++ = c1 | 0x80;                                              \
        break;                                                           \
      }                                                                  \
    if (coding->flags & CODING_FLAG_ISO_SAFE                             \
        && !coding->safe_charsets[charset])                              \
      {                                                                  \
        /* We should not encode this character, instead produce one or   \
           two `?'s.  */                                                 \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                  \
        if (CHARSET_WIDTH (charset) == 2)                                \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                \
        break;                                                           \
      }                                                                  \
    /* Since CHARSET is not yet invoked to any graphic planes, we        \
       must invoke it, or, at first, designate it to some graphic        \
       register.  Then repeat the loop to actually produce the           \
       character.  */                                                    \
    dst = encode_invocation_designation (charset, coding, dst);          \
  } while (1)
1569
f4dee582
RS
/* Produce codes for a DIMENSION2 character whose character set is
   CHARSET and whose position-codes are C1 and C2.  Designation and
   invocation codes are also produced in advance if necessary.

   The body is a loop: each pass either emits the character and
   leaves, or calls encode_invocation_designation and tries again.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                 \
  do {                                                                   \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                        \
      {                                                                  \
        /* The single-shifting flag is set; clear it after output.  */   \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                  \
          {                                                              \
            *dst++ = c1 & 0x7F;                                          \
            *dst++ = c2 & 0x7F;                                          \
          }                                                              \
        else                                                             \
          {                                                              \
            *dst++ = c1 | 0x80;                                          \
            *dst++ = c2 | 0x80;                                          \
          }                                                              \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                    \
        break;                                                           \
      }                                                                  \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))            \
      {                                                                  \
        *dst++ = c1 & 0x7F;                                              \
        *dst++ = c2 & 0x7F;                                              \
        break;                                                           \
      }                                                                  \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))            \
      {                                                                  \
        *dst++ = c1 | 0x80;                                              \
        *dst++ = c2 | 0x80;                                              \
        break;                                                           \
      }                                                                  \
    if (coding->flags & CODING_FLAG_ISO_SAFE                             \
        && !coding->safe_charsets[charset])                              \
      {                                                                  \
        /* We should not encode this character, instead produce one or   \
           two `?'s.  */                                                 \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                  \
        if (CHARSET_WIDTH (charset) == 2)                                \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                \
        break;                                                           \
      }                                                                  \
    /* Since CHARSET is not yet invoked to any graphic planes, we        \
       must invoke it, or, at first, designate it to some graphic        \
       register.  Then repeat the loop to actually produce the           \
       character.  */                                                    \
    dst = encode_invocation_designation (charset, coding, dst);          \
  } while (1)
1612
6f551029
KH
/* Encode the character given by CHARSET, C1 and C2, after passing it
   through `translation_table' when that is non-nil.  A character of
   a defined charset is handed to the DIMENSION1 or DIMENSION2 macro;
   for an undefined charset the bytes CHARSET, C1 and (when nonzero)
   C2 are written directly, masked to 7 bits under
   CODING_FLAG_ISO_SEVEN_BITS.  `coding->consumed_char' is
   incremented unless a composition is in progress.

   Uses `coding', `dst' and `translation_table' from the expansion
   site.  */
#define ENCODE_ISO_CHARACTER(charset, c1, c2)                           \
  do {                                                                  \
    int c_alt, charset_alt;                                             \
                                                                        \
    if (NILP (translation_table))                                       \
      charset_alt = charset;                                            \
    else                                                                \
      {                                                                 \
        c_alt = translate_char (translation_table, -1,                  \
                                charset, c1, c2);                       \
        if (c_alt >= 0)                                                 \
          {                                                             \
            SPLIT_CHAR (c_alt, charset_alt, c1, c2);                    \
          }                                                             \
        else                                                            \
          charset_alt = charset;                                        \
      }                                                                 \
                                                                        \
    if (! CHARSET_DEFINED_P (charset_alt))                              \
      {                                                                 \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            *dst++ = charset & 0x7f;                                    \
            *dst++ = c1 & 0x7f;                                         \
            if (c2)                                                     \
              *dst++ = c2 & 0x7f;                                       \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = charset;                                           \
            *dst++ = c1;                                                \
            if (c2)                                                     \
              *dst++ = c2;                                              \
          }                                                             \
      }                                                                 \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
      {                                                                 \
        /* Optionally substitute JISX0201 Roman for ASCII.  */          \
        if (charset == CHARSET_ASCII                                    \
            && coding->flags & CODING_FLAG_ISO_USE_ROMAN)               \
          charset_alt = charset_latin_jisx0201;                         \
        ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1);              \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* Optionally substitute JISX0208-1978 for JISX0208.  */        \
        if (charset == charset_jisx0208                                 \
            && coding->flags & CODING_FLAG_ISO_USE_OLDJIS)              \
          charset_alt = charset_jisx0208_1978;                          \
        ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2);          \
      }                                                                 \
                                                                        \
    if (! COMPOSING_P (coding->composing))                              \
      coding->consumed_char++;                                          \
  } while (0)
bdd9fb48 1660
4ed46869
KH
1661/* Produce designation and invocation codes at a place pointed by DST
1662 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1663 Return new DST. */
1664
1665unsigned char *
1666encode_invocation_designation (charset, coding, dst)
1667 int charset;
1668 struct coding_system *coding;
1669 unsigned char *dst;
1670{
1671 int reg; /* graphic register number */
1672
1673 /* At first, check designations. */
1674 for (reg = 0; reg < 4; reg++)
1675 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1676 break;
1677
1678 if (reg >= 4)
1679 {
1680 /* CHARSET is not yet designated to any graphic registers. */
1681 /* At first check the requested designation. */
1682 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1ba9e4ab
KH
1683 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1684 /* Since CHARSET requests no special designation, designate it
1685 to graphic register 0. */
4ed46869
KH
1686 reg = 0;
1687
1688 ENCODE_DESIGNATION (charset, reg, coding);
1689 }
1690
1691 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1692 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1693 {
1694 /* Since the graphic register REG is not invoked to any graphic
1695 planes, invoke it to graphic plane 0. */
1696 switch (reg)
1697 {
1698 case 0: /* graphic register 0 */
1699 ENCODE_SHIFT_IN;
1700 break;
1701
1702 case 1: /* graphic register 1 */
1703 ENCODE_SHIFT_OUT;
1704 break;
1705
1706 case 2: /* graphic register 2 */
1707 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1708 ENCODE_SINGLE_SHIFT_2;
1709 else
1710 ENCODE_LOCKING_SHIFT_2;
1711 break;
1712
1713 case 3: /* graphic register 3 */
1714 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1715 ENCODE_SINGLE_SHIFT_3;
1716 else
1717 ENCODE_LOCKING_SHIFT_3;
1718 break;
1719 }
1720 }
1721 return dst;
1722}
1723
/* The following three macros produce codes for indicating the start
   (without or with composition rules) and the end of a composition.
   Each writes a two-byte escape sequence through `dst'.  */
#define ENCODE_COMPOSITION_NO_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
#define ENCODE_COMPOSITION_WITH_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '2')
#define ENCODE_COMPOSITION_END \
  (*dst++ = ISO_CODE_ESC, *dst++ = '1')
1728
/* The following three macros produce codes for indicating direction
   of text.  They write through `dst' and read `coding->flags'.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER writes "ESC [" when the
   CODING_FLAG_ISO_SEVEN_BITS bit is set in the flags, and the single
   byte ISO_CODE_CSI otherwise.  The flag must be tested as a bit;
   comparing the whole flag word for equality would take the 8-bit
   branch whenever any other flag is set as well.

   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R are statements (use
   them followed by `;'): the introducer, then '2' or '0', then ']'.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = '[';                                   \
      }                                                 \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

#define ENCODE_DIRECTION_R2L                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '2';                                       \
    *dst++ = ']';                                       \
  } while (0)

#define ENCODE_DIRECTION_L2R                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '0';                                       \
    *dst++ = ']';                                       \
  } while (0)
1744
/* Produce codes for designation and invocation to reset the graphic
   planes and registers to initial state: shift-in if graphic plane 0
   does not currently invoke register 0, then re-designate every
   register whose initial designation is non-negative and differs
   from the current one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                 \
  do {                                                                  \
    int reg = 0;                                                        \
                                                                        \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                    \
      {                                                                 \
        ENCODE_SHIFT_IN;                                                \
      }                                                                 \
    while (reg < 4)                                                     \
      {                                                                 \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0      \
            && (CODING_SPEC_ISO_DESIGNATION (coding, reg)               \
                != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)))  \
          {                                                             \
            ENCODE_DESIGNATION                                          \
              (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg),       \
               reg, coding);                                            \
          }                                                             \
        reg++;                                                          \
      }                                                                 \
  } while (0)
1759
bdd9fb48 1760/* Produce designation sequences of charsets in the line started from
d46c5b12 1761 SRC to a place pointed by *DSTP, and update DSTP.
bdd9fb48
KH
1762
1763 If the current block ends before any end-of-line, we may fail to
d46c5b12
KH
1764 find all the necessary designations. */
1765
dfcf069d 1766void
bdd9fb48 1767encode_designation_at_bol (coding, table, src, src_end, dstp)
e0e989f6 1768 struct coding_system *coding;
bdd9fb48 1769 Lisp_Object table;
e0e989f6
KH
1770 unsigned char *src, *src_end, **dstp;
1771{
bdd9fb48
KH
1772 int charset, c, found = 0, reg;
1773 /* Table of charsets to be designated to each graphic register. */
1774 int r[4];
1775 unsigned char *dst = *dstp;
1776
1777 for (reg = 0; reg < 4; reg++)
1778 r[reg] = -1;
1779
1780 while (src < src_end && *src != '\n' && found < 4)
e0e989f6 1781 {
bdd9fb48
KH
1782 int bytes = BYTES_BY_CHAR_HEAD (*src);
1783
1784 if (NILP (table))
1785 charset = CHARSET_AT (src);
1786 else
e0e989f6 1787 {
35cb8686
RS
1788 int c_alt;
1789 unsigned char c1, c2;
bdd9fb48
KH
1790
1791 SPLIT_STRING(src, bytes, charset, c1, c2);
84fbb8a0 1792 if ((c_alt = translate_char (table, -1, charset, c1, c2)) >= 0)
bdd9fb48 1793 charset = CHAR_CHARSET (c_alt);
e0e989f6 1794 }
bdd9fb48 1795
e0e989f6 1796 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
d46c5b12 1797 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
bdd9fb48
KH
1798 {
1799 found++;
1800 r[reg] = charset;
1801 }
1802
1803 src += bytes;
1804 }
1805
1806 if (found)
1807 {
1808 for (reg = 0; reg < 4; reg++)
1809 if (r[reg] >= 0
1810 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1811 ENCODE_DESIGNATION (r[reg], reg, coding);
1812 *dstp = dst;
e0e989f6 1813 }
e0e989f6
KH
1814}
1815
4ed46869
KH
1816/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
/* Encode the SRC_BYTES bytes at SOURCE into ISO2022 form at
   DESTINATION.  One source character is handled per loop iteration;
   the first byte C1 is classified through `emacs_code_class' and the
   matching case produces the output bytes.

   The return value is CODING_FINISH_NORMAL,
   CODING_FINISH_INSUFFICIENT_SRC (set at `label_end_of_loop'), or
   CODING_FINISH_INSUFFICIENT_DST (source bytes remain after the loop
   although no other condition was recorded).

   On return CODING->consumed is the number of source bytes used, and
   CODING->produced and CODING->produced_char are both the number of
   bytes written.  CODING->consumed_char is reset to 0 on entry and
   incremented in the cases below; NOTE(review): ENCODE_ISO_CHARACTER
   presumably increments it as well -- confirm against its
   definition.  */
1817
1818int
d46c5b12 1819encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
1820 struct coding_system *coding;
1821 unsigned char *source, *destination;
1822 int src_bytes, dst_bytes;
4ed46869
KH
1823{
1824 unsigned char *src = source;
1825 unsigned char *src_end = source + src_bytes;
1826 unsigned char *dst = destination;
1827 unsigned char *dst_end = destination + dst_bytes;
e0e989f6 1828 /* Since the maximum bytes produced by each loop is 20, we subtract 19
4ed46869
KH
1829 from DST_END to assure overflow checking is necessary only at the
1830 head of loop. */
e0e989f6 1831 unsigned char *adjusted_dst_end = dst_end - 19;
84fbb8a0 1832 Lisp_Object translation_table
f967223b 1833 = coding->translation_table_for_encode;
d46c5b12 1834 int result = CODING_FINISH_NORMAL;
bdd9fb48
/* With character translation enabled and no table of its own, the
   coding system falls back to the standard table for encoding.  */
84fbb8a0 1836 if (!NILP (Venable_character_translation) && NILP (translation_table))
f967223b 1837 translation_table = Vstandard_translation_table_for_encode;
4ed46869 1838
d46c5b12 1839 coding->consumed_char = 0;
fb88bf2d 1840 coding->fake_multibyte = 0;
/* When DST_BYTES is zero the loop instead runs only while DST stays
   more than 19 bytes behind SRC.  NOTE(review): presumably this is
   the case where source and destination share one buffer -- confirm
   against the callers.  */
d46c5b12
KH
1841 while (src < src_end && (dst_bytes
1842 ? (dst < adjusted_dst_end)
1843 : (dst < src - 19)))
4ed46869
KH
1844 {
1845 /* SRC_BASE remembers the start position in source in each loop.
1846 The loop will be exited when there's not enough source text
1847 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1848 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1849 reset to SRC_BASE before exiting. */
1850 unsigned char *src_base = src;
bdd9fb48 1851 int charset, c1, c2, c3, c4;
4ed46869 1852
e0e989f6
KH
1853 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1854 && CODING_SPEC_ISO_BOL (coding))
1855 {
bdd9fb48 1856 /* We have to produce designation sequences if any now. */
84fbb8a0 1857 encode_designation_at_bol (coding, translation_table,
bdd9fb48 1858 src, src_end, &dst);
e0e989f6
KH
1859 CODING_SPEC_ISO_BOL (coding) = 0;
1860 }
1861
1862 c1 = *src++;
4ed46869 1863 /* If we are seeing a component of a composite character, we are
d46c5b12
KH
1864 seeing a leading-code encoded irregularly for composition, or
1865 a composition rule if composing with rule. We must set C1 to
1866 a normal leading-code or an ASCII code. If we are not seeing
1867 a composite character, we must reset composition,
1868 designation, and invocation states. */
4ed46869
KH
1869 if (COMPOSING_P (coding->composing))
1870 {
1871 if (c1 < 0xA0)
1872 {
1873 /* We are not in a composite character any longer. */
1874 coding->composing = COMPOSING_NO;
d46c5b12 1875 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1876 ENCODE_COMPOSITION_END;
1877 }
1878 else
1879 {
1880 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1881 {
/* The byte is a composition rule: emit its low seven bits and expect
   a component next.  */
1882 *dst++ = c1 & 0x7F;
1883 coding->composing = COMPOSING_WITH_RULE_HEAD;
1884 continue;
1885 }
1886 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1887 coding->composing = COMPOSING_WITH_RULE_RULE;
1888 if (c1 == 0xA0)
1889 {
1890 /* This is an ASCII component. */
1891 ONE_MORE_BYTE (c1);
1892 c1 &= 0x7F;
1893 }
1894 else
1895 /* This is a leading-code of non ASCII component. */
1896 c1 -= 0x20;
1897 }
1898 }
1899
1900 /* Now encode one character. C1 is a control character, an
1901 ASCII character, or a leading-code of multi-byte character. */
1902 switch (emacs_code_class[c1])
1903 {
1904 case EMACS_ascii_code:
8dbb769e 1905 c2 = 0;
bdd9fb48 1906 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
4ed46869
KH
1907 break;
1908
1909 case EMACS_control_code:
1910 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1911 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869 1912 *dst++ = c1;
d46c5b12 1913 coding->consumed_char++;
4ed46869
KH
1914 break;
1915
1916 case EMACS_carriage_return_code:
d46c5b12 1917 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
4ed46869
KH
1918 {
1919 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1920 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869 1921 *dst++ = c1;
d46c5b12 1922 coding->consumed_char++;
4ed46869
KH
1923 break;
1924 }
1925 /* fall down to treat '\r' as '\n' ... */
1926
1927 case EMACS_linefeed_code:
1928 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
e0e989f6
KH
1929 ENCODE_RESET_PLANE_AND_REGISTER;
1930 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1931 bcopy (coding->spec.iso2022.initial_designation,
1932 coding->spec.iso2022.current_designation,
1933 sizeof coding->spec.iso2022.initial_designation);
/* Emit the end-of-line in the format selected by CODING->eol_type:
   LF (also used while undecided), CR LF, or CR.  */
4ed46869 1934 if (coding->eol_type == CODING_EOL_LF
0ef69138 1935 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
1936 *dst++ = ISO_CODE_LF;
1937 else if (coding->eol_type == CODING_EOL_CRLF)
1938 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1939 else
1940 *dst++ = ISO_CODE_CR;
e0e989f6 1941 CODING_SPEC_ISO_BOL (coding) = 1;
d46c5b12 1942 coding->consumed_char++;
4ed46869
KH
1943 break;
1944
1945 case EMACS_leading_code_2:
1946 ONE_MORE_BYTE (c2);
8dbb769e 1947 c3 = 0;
19a8d9e0
KH
1948 if (c2 < 0xA0)
1949 {
1950 /* invalid sequence */
/* Only C1 is copied to the output; SRC is moved back so that C2 is
   read again as the start of the next character.  */
1951 *dst++ = c1;
38cf95df
RS
1952 src--;
1953 coding->consumed_char++;
19a8d9e0
KH
1954 }
1955 else
1956 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
1957 break;
1958
1959 case EMACS_leading_code_3:
1960 TWO_MORE_BYTES (c2, c3);
8dbb769e 1961 c4 = 0;
19a8d9e0
KH
1962 if (c2 < 0xA0 || c3 < 0xA0)
1963 {
1964 /* invalid sequence */
1965 *dst++ = c1;
38cf95df
RS
1966 src -= 2;
1967 coding->consumed_char++;
19a8d9e0
KH
1968 }
1969 else if (c1 < LEADING_CODE_PRIVATE_11)
bdd9fb48 1970 ENCODE_ISO_CHARACTER (c1, c2, c3);
4ed46869 1971 else
bdd9fb48 1972 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
4ed46869
KH
1973 break;
1974
1975 case EMACS_leading_code_4:
1976 THREE_MORE_BYTES (c2, c3, c4);
19a8d9e0
KH
1977 if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1978 {
1979 /* invalid sequence */
1980 *dst++ = c1;
38cf95df
RS
1981 src -= 3;
1982 coding->consumed_char++;
19a8d9e0
KH
1983 }
1984 else
1985 ENCODE_ISO_CHARACTER (c2, c3, c4);
4ed46869
KH
1986 break;
1987
1988 case EMACS_leading_code_composition:
19a8d9e0
KH
1989 ONE_MORE_BYTE (c2);
1990 if (c2 < 0xA0)
1991 {
1992 /* invalid sequence */
1993 *dst++ = c1;
38cf95df
RS
1994 src--;
1995 coding->consumed_char++;
19a8d9e0
KH
1996 }
1997 else if (c2 == 0xFF)
4ed46869 1998 {
/* A second byte of 0xFF starts a composition with rule.  */
d46c5b12 1999 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
2000 coding->composing = COMPOSING_WITH_RULE_HEAD;
2001 ENCODE_COMPOSITION_WITH_RULE_START;
d46c5b12 2002 coding->consumed_char++;
4ed46869
KH
2003 }
2004 else
2005 {
d46c5b12 2006 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
2007 /* Rewind one byte because it is a character code of
2008 composition elements. */
2009 src--;
2010 coding->composing = COMPOSING_NO_RULE_HEAD;
2011 ENCODE_COMPOSITION_NO_RULE_START;
d46c5b12 2012 coding->consumed_char++;
4ed46869
KH
2013 }
2014 break;
2015
2016 case EMACS_invalid_code:
3efbce95
KH
2017 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2018 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869 2019 *dst++ = c1;
d46c5b12 2020 coding->consumed_char++;
4ed46869
KH
2021 break;
2022 }
2023 continue;
2024 label_end_of_loop:
/* Rewind to the start of the incomplete character and leave the
   loop; everything before SRC_BASE has been encoded.  */
d46c5b12
KH
2025 result = CODING_FINISH_INSUFFICIENT_SRC;
2026 src = src_base;
4ed46869
KH
2027 break;
2028 }
2029
49cb52b4
KH
2030 if (src < src_end && result == CODING_FINISH_NORMAL)
2031 result = CODING_FINISH_INSUFFICIENT_DST;
2032
2033 /* If this is the last block of the text to be encoded, we must
2034 reset graphic planes and registers to the initial state, and
2035 flush out the carryover if any. */
2036 if (coding->mode & CODING_MODE_LAST_BLOCK)
84fbb8a0
KH
2037 {
2038 ENCODE_RESET_PLANE_AND_REGISTER;
2039 if (COMPOSING_P (coding->composing))
2040 ENCODE_COMPOSITION_END;
88993dfd
KH
2041 if (result == CODING_FINISH_INSUFFICIENT_SRC)
2042 {
/* The trailing incomplete sequence is copied through unchanged, as
   far as the destination has room.  */
2043 while (src < src_end && dst < dst_end)
2044 *dst++ = *src++;
2045 }
84fbb8a0 2046 }
d46c5b12
KH
2047 coding->consumed = src - source;
2048 coding->produced = coding->produced_char = dst - destination;
2049 return result;
4ed46869
KH
2050}
2051
2052\f
2053/*** 4. SJIS and BIG5 handlers ***/
2054
f4dee582 2055/* Although SJIS and BIG5 are not ISO's coding systems, they are used
4ed46869
KH
2056 quite widely. So, for the moment, Emacs supports them in the bare
2057 C code. But, in the future, they may be supported only by CCL. */
2058
2059/* SJIS is a coding system encoding three character sets: ASCII, right
2060 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2061 as is. A character of charset katakana-jisx0201 is encoded by
2062 "position-code + 0x80". A character of charset japanese-jisx0208
2063 is encoded in two bytes, but the two position-codes are divided and
2064 shifted so that they fit in the ranges below.
2065
2066 --- CODE RANGE of SJIS ---
2067 (character set) (range)
2068 ASCII 0x00 .. 0x7F
2069 KATAKANA-JISX0201 0xA0 .. 0xDF
c28a9453 2070 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
d14d03ac 2071 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4ed46869
KH
2072 -------------------------------
2073
2074*/
2075
2076/* BIG5 is a coding system encoding two character sets: ASCII and
2077 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2078 character set and is encoded in two bytes.
2079
2080 --- CODE RANGE of BIG5 ---
2081 (character set) (range)
2082 ASCII 0x00 .. 0x7F
2083 Big5 (1st byte) 0xA1 .. 0xFE
2084 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2085 --------------------------
2086
2087 Since the number of characters in Big5 is larger than maximum
2088 characters in Emacs' charset (96x96), it can't be handled as one
2089 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2090 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2091 contains frequently used characters and the latter contains less
2092 frequently used characters. */
2093
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   DECODE_BIG5 reads B1 and B2 and stores into CHARSET, C1 and C2;
   ENCODE_BIG5 reads CHARSET, C1 and C2 and stores into B1 and B2.
   The output arguments must be lvalues.  Every argument is
   parenthesized in the expansion, so expressions may be passed as
   input arguments; inputs are evaluated more than once.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* The 2nd byte has a gap between 0x7E and 0xA1.  Subtracting 0x40 from
   the low range and 0x62 from the high range makes the two ranges
   contiguous, giving a linear index within the row.  First bytes
   below 0xC9 belong to `charset_big5_1', the rest to
   `charset_big5_2', whose index restarts at zero.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2) \
  do { \
    unsigned int temp \
      = ((b1) - 0xA1) * BIG5_SAME_ROW + (b2) - ((b2) < 0x7F ? 0x40 : 0x62); \
    if ((b1) < 0xC9) \
      (charset) = charset_big5_1; \
    else \
      { \
        (charset) = charset_big5_2; \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW; \
      } \
    (c1) = temp / (0xFF - 0xA1) + 0x21; \
    (c2) = temp % (0xFF - 0xA1) + 0x21; \
  } while (0)

/* Inverse of DECODE_BIG5.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2) \
  do { \
    unsigned int temp = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21); \
    if ((charset) == charset_big5_2) \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1); \
    (b1) = temp / BIG5_SAME_ROW + 0xA1; \
    (b2) = temp % BIG5_SAME_ROW; \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62; \
  } while (0)
2126
/* Produce in the destination the character of CHARSET whose
   position-codes are C1 and C2.  If `translation_table' is non-nil
   and maps the character (translate_char returns a non-negative
   value), the translated character is produced instead; C1 and C2 are
   then overwritten with its position-codes.  A resulting charset that
   is ASCII or negative is produced as an ASCII character; otherwise
   the charset's dimension selects the one- or two-byte form.  */
#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
  do { \
    int charset_alt = (charset); \
    int c_alt = -1; \
    if (!NILP (translation_table)) \
      c_alt = translate_char (translation_table, -1, (charset), c1, c2); \
    if (c_alt >= 0) \
      { \
        SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
      } \
    if (charset_alt < 0 || charset_alt == CHARSET_ASCII) \
      { \
        DECODE_CHARACTER_ASCII (c1); \
      } \
    else if (CHARSET_DIMENSION (charset_alt) != 1) \
      { \
        DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
      } \
    else \
      { \
        DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
      } \
  } while (0)
2141
/* Encode at DST the character of CHARSET whose position-codes are C1
   and C2, and count it in CODING->consumed_char.  The macro uses the
   variables `translation_table', `sjis_p', `coding' and `dst' of the
   enclosing function.

   If `translation_table' is non-nil and maps the character, the
   translated character is encoded instead (C1 and C2 are overwritten
   with its position-codes).  Then:

   - ASCII: C1 is produced as is.
   - Other one-dimensional charsets: for SJIS, katakana-jisx0201
     produces C1 and latin-jisx0201 produces C1 with the eighth bit
     cleared.  Anything else is produced as the two bytes CHARSET, C1
     and marks the output as fake multibyte.
   - Two-dimensional charsets: JISX0208 (either revision) under SJIS
     and Big5 under BIG5 are converted to their two-byte codes.
     Anything else is produced as the three bytes CHARSET, C1, C2
     (with the eighth bit of C1 and C2 cleared) and marks the output
     as fake multibyte.

   The expansion is a single statement without a trailing semicolon,
   so it must be followed by `;' at the point of use and is safe as
   the body of an `if' that has an `else'.  */
#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
  do { \
    int c_alt, charset_alt; \
    if (!NILP (translation_table) \
        && ((c_alt = translate_char (translation_table, -1, \
                                     charset, c1, c2)) \
            >= 0)) \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
    else \
      charset_alt = charset; \
    if (charset_alt == charset_ascii) \
      *dst++ = c1; \
    else if (CHARSET_DIMENSION (charset_alt) == 1) \
      { \
        if (sjis_p && charset_alt == charset_katakana_jisx0201) \
          *dst++ = c1; \
        else if (sjis_p && charset_alt == charset_latin_jisx0201) \
          *dst++ = c1 & 0x7F; \
        else \
          { \
            *dst++ = charset_alt, *dst++ = c1; \
            coding->fake_multibyte = 1; \
          } \
      } \
    else \
      { \
        c1 &= 0x7F, c2 &= 0x7F; \
        if (sjis_p && (charset_alt == charset_jisx0208 \
                       || charset_alt == charset_jisx0208_1978)) \
          { \
            unsigned char s1, s2; \
 \
            ENCODE_SJIS (c1, c2, s1, s2); \
            *dst++ = s1, *dst++ = s2; \
            coding->fake_multibyte = 1; \
          } \
        else if (!sjis_p \
                 && (charset_alt == charset_big5_1 \
                     || charset_alt == charset_big5_2)) \
          { \
            unsigned char b1, b2; \
 \
            ENCODE_BIG5 (charset_alt, c1, c2, b1, b2); \
            *dst++ = b1, *dst++ = b2; \
          } \
        else \
          { \
            *dst++ = charset_alt, *dst++ = c1, *dst++ = c2; \
            coding->fake_multibyte = 1; \
          } \
      } \
    coding->consumed_char++; \
  } while (0)
2195
4ed46869
KH
2196/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2197 Check if a text is encoded in SJIS. If it is, return
2198 CODING_CATEGORY_MASK_SJIS, else return 0. */
2199
2200int
2201detect_coding_sjis (src, src_end)
2202 unsigned char *src, *src_end;
2203{
2204 unsigned char c;
2205
2206 while (src < src_end)
2207 {
2208 c = *src++;
4ed46869
KH
2209 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2210 {
2211 if (src < src_end && *src++ < 0x40)
2212 return 0;
2213 }
2214 }
2215 return CODING_CATEGORY_MASK_SJIS;
2216}
2217
2218/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2219 Check if a text is encoded in BIG5. If it is, return
2220 CODING_CATEGORY_MASK_BIG5, else return 0. */
2221
2222int
2223detect_coding_big5 (src, src_end)
2224 unsigned char *src, *src_end;
2225{
2226 unsigned char c;
2227
2228 while (src < src_end)
2229 {
2230 c = *src++;
4ed46869
KH
2231 if (c >= 0xA1)
2232 {
2233 if (src >= src_end)
2234 break;
2235 c = *src++;
2236 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2237 return 0;
2238 }
2239 }
2240 return CODING_CATEGORY_MASK_BIG5;
2241}
2242
2243/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2244 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
2245
2246int
2247decode_coding_sjis_big5 (coding, source, destination,
d46c5b12 2248 src_bytes, dst_bytes, sjis_p)
4ed46869
KH
2249 struct coding_system *coding;
2250 unsigned char *source, *destination;
2251 int src_bytes, dst_bytes;
4ed46869
KH
2252 int sjis_p;
2253{
2254 unsigned char *src = source;
2255 unsigned char *src_end = source + src_bytes;
2256 unsigned char *dst = destination;
2257 unsigned char *dst_end = destination + dst_bytes;
2258 /* Since the maximum bytes produced by each loop is 4, we subtract 3
2259 from DST_END to assure overflow checking is necessary only at the
2260 head of loop. */
2261 unsigned char *adjusted_dst_end = dst_end - 3;
84fbb8a0 2262 Lisp_Object translation_table
f967223b 2263 = coding->translation_table_for_decode;
d46c5b12 2264 int result = CODING_FINISH_NORMAL;
a5d301df 2265
84fbb8a0 2266 if (!NILP (Venable_character_translation) && NILP (translation_table))
f967223b 2267 translation_table = Vstandard_translation_table_for_decode;
4ed46869 2268
d46c5b12 2269 coding->produced_char = 0;
fb88bf2d 2270 coding->fake_multibyte = 0;
d46c5b12
KH
2271 while (src < src_end && (dst_bytes
2272 ? (dst < adjusted_dst_end)
2273 : (dst < src - 3)))
4ed46869
KH
2274 {
2275 /* SRC_BASE remembers the start position in source in each loop.
2276 The loop will be exited when there's not enough source text
2277 to analyze two-byte character (within macro ONE_MORE_BYTE).
2278 In that case, SRC is reset to SRC_BASE before exiting. */
2279 unsigned char *src_base = src;
2280 unsigned char c1 = *src++, c2, c3, c4;
2281
d46c5b12 2282 if (c1 < 0x20)
4ed46869 2283 {
d46c5b12 2284 if (c1 == '\r')
4ed46869 2285 {
d46c5b12
KH
2286 if (coding->eol_type == CODING_EOL_CRLF)
2287 {
2288 ONE_MORE_BYTE (c2);
2289 if (c2 == '\n')
2290 *dst++ = c2;
2291 else if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2292 {
2293 result = CODING_FINISH_INCONSISTENT_EOL;
2294 goto label_end_of_loop_2;
2295 }
2296 else
2297 /* To process C2 again, SRC is subtracted by 1. */
2298 *dst++ = c1, src--;
2299 }
2300 else if (coding->eol_type == CODING_EOL_CR)
2301 *dst++ = '\n';
4ed46869 2302 else
d46c5b12
KH
2303 *dst++ = c1;
2304 }
2305 else if (c1 == '\n'
2306 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2307 && (coding->eol_type == CODING_EOL_CR
2308 || coding->eol_type == CODING_EOL_CRLF))
2309 {
2310 result = CODING_FINISH_INCONSISTENT_EOL;
2311 goto label_end_of_loop_2;
4ed46869
KH
2312 }
2313 else
2314 *dst++ = c1;
d46c5b12 2315 coding->produced_char++;
4ed46869 2316 }
a5d301df 2317 else if (c1 < 0x80)
5e34de15
KH
2318 {
2319 c2 = 0; /* avoid warning */
2320 DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2321 }
54f78171 2322 else
4ed46869 2323 {
4ed46869
KH
2324 if (sjis_p)
2325 {
54f78171 2326 if (c1 < 0xA0 || (c1 >= 0xE0 && c1 < 0xF0))
fb88bf2d 2327 {
54f78171
KH
2328 /* SJIS -> JISX0208 */
2329 ONE_MORE_BYTE (c2);
d14d03ac 2330 if (c2 >= 0x40 && c2 != 0x7F && c2 <= 0xFC)
54f78171
KH
2331 {
2332 DECODE_SJIS (c1, c2, c3, c4);
2333 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
2334 }
2335 else
2336 goto label_invalid_code_2;
fb88bf2d 2337 }
54f78171
KH
2338 else if (c1 < 0xE0)
2339 /* SJIS -> JISX0201-Kana */
5e34de15
KH
2340 {
2341 c2 = 0; /* avoid warning */
2342 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1,
2343 /* dummy */ c2);
2344 }
fb88bf2d 2345 else
54f78171 2346 goto label_invalid_code_1;
4ed46869 2347 }
fb88bf2d 2348 else
fb88bf2d 2349 {
54f78171
KH
2350 /* BIG5 -> Big5 */
2351 if (c1 >= 0xA1 && c1 <= 0xFE)
fb88bf2d 2352 {
54f78171
KH
2353 ONE_MORE_BYTE (c2);
2354 if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE))
2355 {
2356 int charset;
4ed46869 2357
54f78171
KH
2358 DECODE_BIG5 (c1, c2, charset, c3, c4);
2359 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
2360 }
2361 else
2362 goto label_invalid_code_2;
fb88bf2d
KH
2363 }
2364 else
54f78171 2365 goto label_invalid_code_1;
4ed46869
KH
2366 }
2367 }
2368 continue;
2369
fb88bf2d
KH
2370 label_invalid_code_1:
2371 *dst++ = c1;
2372 coding->produced_char++;
2373 coding->fake_multibyte = 1;
2374 continue;
2375
2376 label_invalid_code_2:
2377 *dst++ = c1; *dst++= c2;
2378 coding->produced_char += 2;
2379 coding->fake_multibyte = 1;
2380 continue;
2381
4ed46869 2382 label_end_of_loop:
d46c5b12
KH
2383 result = CODING_FINISH_INSUFFICIENT_SRC;
2384 label_end_of_loop_2:
4ed46869
KH
2385 src = src_base;
2386 break;
2387 }
2388
fb88bf2d
KH
2389 if (src < src_end)
2390 {
2391 if (result == CODING_FINISH_NORMAL)
2392 result = CODING_FINISH_INSUFFICIENT_DST;
2393 else if (result != CODING_FINISH_INCONSISTENT_EOL
2394 && coding->mode & CODING_MODE_LAST_BLOCK)
2395 {
2396 src_bytes = src_end - src;
2397 if (dst_bytes && (dst_end - dst < src_bytes))
2398 src_bytes = dst_end - dst;
2399 bcopy (dst, src, src_bytes);
2400 src += src_bytes;
2401 dst += src_bytes;
2402 coding->fake_multibyte = 1;
2403 }
2404 }
d46c5b12
KH
2405
2406 coding->consumed = coding->consumed_char = src - source;
2407 coding->produced = dst - destination;
2408 return result;
4ed46869
KH
2409}
2410
2411/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2412 This function can encode `charset_ascii', `charset_katakana_jisx0201',
2413 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
2414 sure that all these charsets are registered as official charset
2415 (i.e. do not have extended leading-codes). Characters of other
2416 charsets are produced without any encoding. If SJIS_P is 1, encode
2417 SJIS text, else encode BIG5 text. */
2418
2419int
2420encode_coding_sjis_big5 (coding, source, destination,
d46c5b12 2421 src_bytes, dst_bytes, sjis_p)
4ed46869
KH
2422 struct coding_system *coding;
2423 unsigned char *source, *destination;
2424 int src_bytes, dst_bytes;
4ed46869
KH
2425 int sjis_p;
2426{
2427 unsigned char *src = source;
2428 unsigned char *src_end = source + src_bytes;
2429 unsigned char *dst = destination;
2430 unsigned char *dst_end = destination + dst_bytes;
2431 /* Since the maximum bytes produced by each loop is 2, we subtract 1
2432 from DST_END to assure overflow checking is necessary only at the
2433 head of loop. */
2434 unsigned char *adjusted_dst_end = dst_end - 1;
84fbb8a0 2435 Lisp_Object translation_table
f967223b 2436 = coding->translation_table_for_encode;
d46c5b12 2437 int result = CODING_FINISH_NORMAL;
a5d301df 2438
84fbb8a0 2439 if (!NILP (Venable_character_translation) && NILP (translation_table))
f967223b 2440 translation_table = Vstandard_translation_table_for_encode;
4ed46869 2441
d46c5b12 2442 coding->consumed_char = 0;
fb88bf2d 2443 coding->fake_multibyte = 0;
d46c5b12
KH
2444 while (src < src_end && (dst_bytes
2445 ? (dst < adjusted_dst_end)
2446 : (dst < src - 1)))
4ed46869
KH
2447 {
2448 /* SRC_BASE remembers the start position in source in each loop.
2449 The loop will be exited when there's not enough source text
2450 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2451 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
2452 before exiting. */
2453 unsigned char *src_base = src;
2454 unsigned char c1 = *src++, c2, c3, c4;
2455
2456 if (coding->composing)
2457 {
2458 if (c1 == 0xA0)
2459 {
2460 ONE_MORE_BYTE (c1);
2461 c1 &= 0x7F;
2462 }
2463 else if (c1 >= 0xA0)
2464 c1 -= 0x20;
2465 else
2466 coding->composing = 0;
2467 }
2468
2469 switch (emacs_code_class[c1])
2470 {
2471 case EMACS_ascii_code:
a5d301df
KH
2472 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2473 break;
2474
4ed46869
KH
2475 case EMACS_control_code:
2476 *dst++ = c1;
d46c5b12 2477 coding->consumed_char++;
4ed46869
KH
2478 break;
2479
2480 case EMACS_carriage_return_code:
d46c5b12 2481 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
4ed46869
KH
2482 {
2483 *dst++ = c1;
d46c5b12 2484 coding->consumed_char++;
4ed46869
KH
2485 break;
2486 }
2487 /* fall down to treat '\r' as '\n' ... */
2488
2489 case EMACS_linefeed_code:
2490 if (coding->eol_type == CODING_EOL_LF
0ef69138 2491 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2492 *dst++ = '\n';
2493 else if (coding->eol_type == CODING_EOL_CRLF)
2494 *dst++ = '\r', *dst++ = '\n';
2495 else
2496 *dst++ = '\r';
d46c5b12 2497 coding->consumed_char++;
4ed46869
KH
2498 break;
2499
2500 case EMACS_leading_code_2:
2501 ONE_MORE_BYTE (c2);
a5d301df 2502 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
2503 break;
2504
2505 case EMACS_leading_code_3:
2506 TWO_MORE_BYTES (c2, c3);
a5d301df 2507 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
4ed46869
KH
2508 break;
2509
2510 case EMACS_leading_code_4:
2511 THREE_MORE_BYTES (c2, c3, c4);
a5d301df 2512 ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
4ed46869
KH
2513 break;
2514
2515 case EMACS_leading_code_composition:
2516 coding->composing = 1;
2517 break;
2518
2519 default: /* i.e. case EMACS_invalid_code: */
2520 *dst++ = c1;
d46c5b12 2521 coding->consumed_char++;
4ed46869
KH
2522 }
2523 continue;
2524
2525 label_end_of_loop:
d46c5b12
KH
2526 result = CODING_FINISH_INSUFFICIENT_SRC;
2527 src = src_base;
4ed46869
KH
2528 break;
2529 }
2530
d46c5b12
KH
2531 if (result == CODING_FINISH_NORMAL
2532 && src < src_end)
2533 result = CODING_FINISH_INSUFFICIENT_DST;
2534 coding->consumed = src - source;
2535 coding->produced = coding->produced_char = dst - destination;
2536 return result;
4ed46869
KH
2537}
2538
2539\f
1397dc18
KH
2540/*** 5. CCL handlers ***/
2541
2542/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2543 Check if a text is encoded in a coding system of which
2544 encoder/decoder are written in CCL program. If it is, return
2545 CODING_CATEGORY_MASK_CCL, else return 0. */
2546
2547int
2548detect_coding_ccl (src, src_end)
2549 unsigned char *src, *src_end;
2550{
2551 unsigned char *valid;
2552
2553 /* No coding system is assigned to coding-category-ccl. */
2554 if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
2555 return 0;
2556
2557 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
2558 while (src < src_end)
2559 {
2560 if (! valid[*src]) return 0;
2561 src++;
2562 }
2563 return CODING_CATEGORY_MASK_CCL;
2564}
2565
2566\f
2567/*** 6. End-of-line handlers ***/
4ed46869
KH
2568
2569/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2570 This function is called only when `coding->eol_type' is
2571 CODING_EOL_CRLF or CODING_EOL_CR. */
/* Copy the SRC_BYTES bytes at SOURCE to DESTINATION, converting the
   end-of-line format selected by CODING->eol_type to '\n': for
   CODING_EOL_CRLF each "\r\n" pair becomes "\n", for CODING_EOL_CR
   each '\r' becomes '\n'.  Any other eol_type copies the bytes
   unchanged.

   The return value is CODING_FINISH_NORMAL,
   CODING_FINISH_INSUFFICIENT_SRC, CODING_FINISH_INSUFFICIENT_DST, or
   CODING_FINISH_INCONSISTENT_EOL.  On return CODING->consumed and
   CODING->consumed_char are the number of source bytes used, and
   CODING->produced and CODING->produced_char are the number of bytes
   written.  */
2572
dfcf069d 2573int
d46c5b12 2574decode_eol (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
2575 struct coding_system *coding;
2576 unsigned char *source, *destination;
2577 int src_bytes, dst_bytes;
4ed46869
KH
2578{
2579 unsigned char *src = source;
2580 unsigned char *src_end = source + src_bytes;
2581 unsigned char *dst = destination;
2582 unsigned char *dst_end = destination + dst_bytes;
fb88bf2d 2583 unsigned char c;
d46c5b12
KH
2584 int result = CODING_FINISH_NORMAL;
2585
fb88bf2d
KH
2586 coding->fake_multibyte = 0;
2587
/* Nothing to decode: report zero bytes consumed and produced.  */
d46c5b12 2588 if (src_bytes <= 0)
716e0b0a
AI
2589 {
2590 coding->produced = coding->produced_char = 0;
2591 coding->consumed = coding->consumed_char = 0;
2592 return result;
2593 }
4ed46869
KH
2594
2595 switch (coding->eol_type)
2596 {
2597 case CODING_EOL_CRLF:
2598 {
2599 /* Since the maximum bytes produced by each loop is 2, we
2600 subtract 1 from DST_END to assure overflow checking is
2601 necessary only at the head of loop. */
2602 unsigned char *adjusted_dst_end = dst_end - 1;
2603
d46c5b12
KH
2604 while (src < src_end && (dst_bytes
2605 ? (dst < adjusted_dst_end)
2606 : (dst < src - 1)))
4ed46869
KH
2607 {
/* SRC_BASE is where this iteration started; SRC is reset to it at
   the labels below.  */
2608 unsigned char *src_base = src;
fb88bf2d
KH
2609
2610 c = *src++;
4ed46869
KH
2611 if (c == '\r')
2612 {
/* NOTE(review): ONE_MORE_BYTE is defined outside this section; it
   presumably jumps to `label_end_of_loop' when no source byte is
   left -- confirm against its definition.  */
2613 ONE_MORE_BYTE (c);
fdfcf19d
KH
2614 if (c == '\n')
2615 *dst++ = c;
2616 else
d46c5b12
KH
2617 {
2618 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2619 {
2620 result = CODING_FINISH_INCONSISTENT_EOL;
2621 goto label_end_of_loop_2;
2622 }
/* A lone '\r' is kept as is; SRC is moved back so that the byte
   after it is processed by the next iteration.  */
fdfcf19d 2623 src--;
d46c5b12 2624 *dst++ = '\r';
fb88bf2d
KH
2625 if (BASE_LEADING_CODE_P (c))
2626 coding->fake_multibyte = 1;
d46c5b12 2627 }
4ed46869 2628 }
d46c5b12
KH
2629 else if (c == '\n'
2630 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2631 {
2632 result = CODING_FINISH_INCONSISTENT_EOL;
2633 goto label_end_of_loop_2;
2634 }
4ed46869 2635 else
fb88bf2d
KH
2636 {
2637 *dst++ = c;
2638 if (BASE_LEADING_CODE_P (c))
2639 coding->fake_multibyte = 1;
2640 }
4ed46869
KH
2641 continue;
2642
2643 label_end_of_loop:
d46c5b12
KH
2644 result = CODING_FINISH_INSUFFICIENT_SRC;
2645 label_end_of_loop_2:
4ed46869
KH
2646 src = src_base;
2647 break;
2648 }
fdfcf19d
KH
2649 if (src < src_end)
2650 {
2651 if (result == CODING_FINISH_NORMAL)
2652 result = CODING_FINISH_INSUFFICIENT_DST;
2653 else if (result != CODING_FINISH_INCONSISTENT_EOL
2654 && coding->mode & CODING_MODE_LAST_BLOCK)
2655 {
2656 /* This is the last block of the text to be decoded.
2657 We flush out all remaining codes. */
2658 src_bytes = src_end - src;
2659 if (dst_bytes && (dst_end - dst < src_bytes))
2660 src_bytes = dst_end - dst;
2661 bcopy (src, dst, src_bytes);
2662 dst += src_bytes;
2663 src += src_bytes;
2664 }
2665 }
4ed46869 2666 }
d46c5b12 2667 break;
4ed46869
KH
2668
2669 case CODING_EOL_CR:
d46c5b12
KH
2670 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2671 {
/* Look for a '\n' in the source.  If one is found, only the bytes
   before it are converted and the result reports the
   inconsistency.  */
fb88bf2d
KH
2672 while (src < src_end)
2673 {
2674 if ((c = *src++) == '\n')
2675 break;
2676 if (BASE_LEADING_CODE_P (c))
2677 coding->fake_multibyte = 1;
2678 }
d46c5b12
KH
2679 if (*--src == '\n')
2680 {
2681 src_bytes = src - source;
2682 result = CODING_FINISH_INCONSISTENT_EOL;
2683 }
2684 }
2685 if (dst_bytes && src_bytes > dst_bytes)
2686 {
2687 result = CODING_FINISH_INSUFFICIENT_DST;
2688 src_bytes = dst_bytes;
2689 }
2690 if (dst_bytes)
2691 bcopy (source, destination, src_bytes);
2692 else
2693 safe_bcopy (source, destination, src_bytes);
2694 src = source + src_bytes;
/* Convert in place in the destination; this loop also advances DST
   past the copied bytes.  */
2695 while (src_bytes--) if (*dst++ == '\r') dst[-1] = '\n';
4ed46869
KH
2696 break;
2697
2698 default: /* i.e. case: CODING_EOL_LF */
d46c5b12
KH
2699 if (dst_bytes && src_bytes > dst_bytes)
2700 {
2701 result = CODING_FINISH_INSUFFICIENT_DST;
2702 src_bytes = dst_bytes;
2703 }
2704 if (dst_bytes)
2705 bcopy (source, destination, src_bytes);
2706 else
2707 safe_bcopy (source, destination, src_bytes);
2708 src += src_bytes;
993824c9 2709 dst += src_bytes;
fb88bf2d 2710 coding->fake_multibyte = 1;
4ed46869
KH
2711 break;
2712 }
2713
d46c5b12
KH
2714 coding->consumed = coding->consumed_char = src - source;
2715 coding->produced = coding->produced_char = dst - destination;
2716 return result;
4ed46869
KH
2717}
2718
2719/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2720 format of end-of-line according to `coding->eol_type'. If
d46c5b12
KH
2721 `coding->mode & CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code
2722 '\r' in source text also means end-of-line. */
4ed46869 2723
dfcf069d 2724int
d46c5b12 2725encode_eol (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
2726 struct coding_system *coding;
2727 unsigned char *source, *destination;
2728 int src_bytes, dst_bytes;
4ed46869
KH
2729{
2730 unsigned char *src = source;
2731 unsigned char *dst = destination;
d46c5b12 2732 int result = CODING_FINISH_NORMAL;
4ed46869 2733
fb88bf2d
KH
2734 coding->fake_multibyte = 0;
2735
d46c5b12
KH
2736 if (coding->eol_type == CODING_EOL_CRLF)
2737 {
2738 unsigned char c;
2739 unsigned char *src_end = source + src_bytes;
2740 unsigned char *dst_end = destination + dst_bytes;
2741 /* Since the maximum bytes produced by each loop is 2, we
2742 subtract 1 from DST_END to assure overflow checking is
2743 necessary only at the head of loop. */
2744 unsigned char *adjusted_dst_end = dst_end - 1;
2745
2746 while (src < src_end && (dst_bytes
2747 ? (dst < adjusted_dst_end)
2748 : (dst < src - 1)))
2749 {
2750 c = *src++;
2751 if (c == '\n'
2752 || (c == '\r' && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)))
2753 *dst++ = '\r', *dst++ = '\n';
2754 else
fb88bf2d
KH
2755 {
2756 *dst++ = c;
2757 if (BASE_LEADING_CODE_P (c))
2758 coding->fake_multibyte = 1;
2759 }
d46c5b12
KH
2760 }
2761 if (src < src_end)
2762 result = CODING_FINISH_INSUFFICIENT_DST;
2763 }
2764 else
4ed46869 2765 {
fb88bf2d
KH
2766 unsigned char c;
2767
d46c5b12 2768 if (dst_bytes && src_bytes > dst_bytes)
4ed46869 2769 {
d46c5b12
KH
2770 src_bytes = dst_bytes;
2771 result = CODING_FINISH_INSUFFICIENT_DST;
2772 }
2773 if (dst_bytes)
2774 bcopy (source, destination, src_bytes);
2775 else
993824c9
RS
2776 safe_bcopy (source, destination, src_bytes);
2777 dst_bytes = src_bytes;
2778 if (coding->eol_type == CODING_EOL_CR)
d46c5b12
KH
2779 {
2780 while (src_bytes--)
fb88bf2d
KH
2781 {
2782 if ((c = *dst++) == '\n')
2783 dst[-1] = '\r';
2784 else if (BASE_LEADING_CODE_P (c))
993824c9 2785 coding->fake_multibyte = 1;
fb88bf2d 2786 }
d46c5b12 2787 }
fb88bf2d 2788 else
d46c5b12 2789 {
fb88bf2d
KH
2790 if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2791 {
2792 while (src_bytes--)
2793 if (*dst++ == '\r') dst[-1] = '\n';
2794 }
2795 coding->fake_multibyte = 1;
4ed46869 2796 }
fb88bf2d
KH
2797 src = source + dst_bytes;
2798 dst = destination + dst_bytes;
4ed46869
KH
2799 }
2800
d46c5b12
KH
2801 coding->consumed = coding->consumed_char = src - source;
2802 coding->produced = coding->produced_char = dst - destination;
2803 return result;
4ed46869
KH
2804}
2805
2806\f
1397dc18 2807/*** 7. C library functions ***/
4ed46869
KH
2808
2809/* In Emacs Lisp, coding system is represented by a Lisp symbol which
2810 has a property `coding-system'. The value of this property is a
2811 vector of length 5 (called as coding-vector). Among elements of
2812 this vector, the first (element[0]) and the fifth (element[4])
2813 carry important information for decoding/encoding. Before
2814 decoding/encoding, this information should be set in fields of a
2815 structure of type `coding_system'.
2816
2817 A value of property `coding-system' can be a symbol of another
2818 subsidiary coding-system. In that case, Emacs gets coding-vector
2819 from that symbol.
2820
2821 `element[0]' contains information to be set in `coding->type'. The
2822 value and its meaning is as follows:
2823
0ef69138
KH
2824 0 -- coding_type_emacs_mule
2825 1 -- coding_type_sjis
2826 2 -- coding_type_iso2022
2827 3 -- coding_type_big5
2828 4 -- coding_type_ccl encoder/decoder written in CCL
2829 nil -- coding_type_no_conversion
2830 t -- coding_type_undecided (automatic conversion on decoding,
2831 no-conversion on encoding)
4ed46869
KH
2832
2833 `element[4]' contains information to be set in `coding->flags' and
2834 `coding->spec'. The meaning varies by `coding->type'.
2835
2836 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2837 of length 32 (of which the first 17 sub-elements are used now).
2838 Meanings of these sub-elements are:
2839
2840 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2841 If the value is an integer of valid charset, the charset is
2842 assumed to be designated to graphic register N initially.
2843
2844 If the value is minus, it is a minus value of charset which
2845 reserves graphic register N, which means that the charset is
2846 not designated initially but should be designated to graphic
2847 register N just before encoding a character in that charset.
2848
2849 If the value is nil, graphic register N is never used on
2850 encoding.
2851
2852 sub-element[N] where N is 4 through 16: to be set in `coding->flags'
2853 Each value takes t or nil. See the section ISO2022 of
2854 `coding.h' for more information.
2855
2856 If `coding->type' is `coding_type_big5', element[4] is t to denote
2857 BIG5-ETen or nil to denote BIG5-HKU.
2858
2859 If `coding->type' takes the other value, element[4] is ignored.
2860
2861 Emacs Lisp's coding system also carries information about format of
2862 end-of-line in a value of property `eol-type'. If the value is
2863 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2864 means CODING_EOL_CR. If it is not integer, it should be a vector
2865 of subsidiary coding systems of which property `eol-type' has one
2866 of above values.
2867
2868*/
2869
2870/* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2871 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2872 is setup so that no conversion is necessary and return -1, else
2873 return 0. */
2874
2875int
e0e989f6
KH
2876setup_coding_system (coding_system, coding)
2877 Lisp_Object coding_system;
4ed46869
KH
2878 struct coding_system *coding;
2879{
d46c5b12 2880 Lisp_Object coding_spec, coding_type, eol_type, plist;
4608c386 2881 Lisp_Object val;
70c22245 2882 int i;
4ed46869 2883
d46c5b12 2884 /* Initialize some fields required for all kinds of coding systems. */
774324d6 2885 coding->symbol = coding_system;
d46c5b12
KH
2886 coding->common_flags = 0;
2887 coding->mode = 0;
2888 coding->heading_ascii = -1;
2889 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
1f5dbf34
KH
2890
2891 if (NILP (coding_system))
2892 goto label_invalid_coding_system;
2893
4608c386 2894 coding_spec = Fget (coding_system, Qcoding_system);
1f5dbf34 2895
4608c386
KH
2896 if (!VECTORP (coding_spec)
2897 || XVECTOR (coding_spec)->size != 5
2898 || !CONSP (XVECTOR (coding_spec)->contents[3]))
4ed46869 2899 goto label_invalid_coding_system;
4608c386 2900
d46c5b12
KH
2901 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
2902 if (VECTORP (eol_type))
2903 {
2904 coding->eol_type = CODING_EOL_UNDECIDED;
2905 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2906 }
2907 else if (XFASTINT (eol_type) == 1)
2908 {
2909 coding->eol_type = CODING_EOL_CRLF;
2910 coding->common_flags
2911 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2912 }
2913 else if (XFASTINT (eol_type) == 2)
2914 {
2915 coding->eol_type = CODING_EOL_CR;
2916 coding->common_flags
2917 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2918 }
2919 else
2920 coding->eol_type = CODING_EOL_LF;
2921
2922 coding_type = XVECTOR (coding_spec)->contents[0];
2923 /* Try short cut. */
2924 if (SYMBOLP (coding_type))
2925 {
2926 if (EQ (coding_type, Qt))
2927 {
2928 coding->type = coding_type_undecided;
2929 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
2930 }
2931 else
2932 coding->type = coding_type_no_conversion;
2933 return 0;
2934 }
2935
2936 /* Initialize remaining fields. */
2937 coding->composing = 0;
a63063ae 2938 coding->composed_chars = 0;
d46c5b12
KH
2939
2940 /* Get values of coding system properties:
2941 `post-read-conversion', `pre-write-conversion',
f967223b 2942 `translation-table-for-decode', `translation-table-for-encode'. */
4608c386
KH
2943 plist = XVECTOR (coding_spec)->contents[3];
2944 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
2945 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
f967223b 2946 val = Fplist_get (plist, Qtranslation_table_for_decode);
4608c386 2947 if (SYMBOLP (val))
f967223b
KH
2948 val = Fget (val, Qtranslation_table_for_decode);
2949 coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
2950 val = Fplist_get (plist, Qtranslation_table_for_encode);
4608c386 2951 if (SYMBOLP (val))
f967223b
KH
2952 val = Fget (val, Qtranslation_table_for_encode);
2953 coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
d46c5b12
KH
2954 val = Fplist_get (plist, Qcoding_category);
2955 if (!NILP (val))
2956 {
2957 val = Fget (val, Qcoding_category_index);
2958 if (INTEGERP (val))
2959 coding->category_idx = XINT (val);
2960 else
2961 goto label_invalid_coding_system;
2962 }
2963 else
2964 goto label_invalid_coding_system;
4608c386 2965
70c22245
KH
2966 val = Fplist_get (plist, Qsafe_charsets);
2967 if (EQ (val, Qt))
2968 {
2969 for (i = 0; i <= MAX_CHARSET; i++)
2970 coding->safe_charsets[i] = 1;
2971 }
2972 else
2973 {
2974 bzero (coding->safe_charsets, MAX_CHARSET + 1);
2975 while (CONSP (val))
2976 {
2977 if ((i = get_charset_id (XCONS (val)->car)) >= 0)
2978 coding->safe_charsets[i] = 1;
2979 val = XCONS (val)->cdr;
2980 }
2981 }
2982
d46c5b12 2983 switch (XFASTINT (coding_type))
4ed46869
KH
2984 {
2985 case 0:
0ef69138 2986 coding->type = coding_type_emacs_mule;
c952af22
KH
2987 if (!NILP (coding->post_read_conversion))
2988 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
2989 if (!NILP (coding->pre_write_conversion))
2990 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4ed46869
KH
2991 break;
2992
2993 case 1:
2994 coding->type = coding_type_sjis;
c952af22
KH
2995 coding->common_flags
2996 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869
KH
2997 break;
2998
2999 case 2:
3000 coding->type = coding_type_iso2022;
c952af22
KH
3001 coding->common_flags
3002 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 3003 {
70c22245 3004 Lisp_Object val, temp;
4ed46869 3005 Lisp_Object *flags;
d46c5b12 3006 int i, charset, reg_bits = 0;
4ed46869 3007
4608c386 3008 val = XVECTOR (coding_spec)->contents[4];
f44d27ce 3009
4ed46869
KH
3010 if (!VECTORP (val) || XVECTOR (val)->size != 32)
3011 goto label_invalid_coding_system;
3012
3013 flags = XVECTOR (val)->contents;
3014 coding->flags
3015 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3016 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3017 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3018 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3019 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3020 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3021 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3022 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
e0e989f6
KH
3023 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3024 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
c4825358
KH
3025 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3026 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3f003981 3027 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
c4825358 3028 );
4ed46869
KH
3029
3030 /* Invoke graphic register 0 to plane 0. */
3031 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3032 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
3033 CODING_SPEC_ISO_INVOCATION (coding, 1)
3034 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3035 /* Not single shifting at first. */
6e85d753 3036 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
e0e989f6 3037 /* Beginning of buffer should also be regarded as bol. */
6e85d753 3038 CODING_SPEC_ISO_BOL (coding) = 1;
4ed46869 3039
70c22245
KH
3040 for (charset = 0; charset <= MAX_CHARSET; charset++)
3041 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3042 val = Vcharset_revision_alist;
3043 while (CONSP (val))
3044 {
3045 charset = get_charset_id (Fcar_safe (XCONS (val)->car));
3046 if (charset >= 0
3047 && (temp = Fcdr_safe (XCONS (val)->car), INTEGERP (temp))
3048 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3049 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3050 val = XCONS (val)->cdr;
3051 }
3052
4ed46869
KH
3053 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3054 FLAGS[REG] can be one of below:
3055 integer CHARSET: CHARSET occupies register I,
3056 t: designate nothing to REG initially, but can be used
3057 by any charsets,
3058 list of integer, nil, or t: designate the first
3059 element (if integer) to REG initially, the remaining
3060 elements (if integer) is designated to REG on request,
d46c5b12 3061 if an element is t, REG can be used by any charsets,
4ed46869 3062 nil: REG is never used. */
467e7675 3063 for (charset = 0; charset <= MAX_CHARSET; charset++)
1ba9e4ab
KH
3064 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3065 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
4ed46869
KH
3066 for (i = 0; i < 4; i++)
3067 {
3068 if (INTEGERP (flags[i])
e0e989f6
KH
3069 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
3070 || (charset = get_charset_id (flags[i])) >= 0)
4ed46869
KH
3071 {
3072 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3073 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3074 }
3075 else if (EQ (flags[i], Qt))
3076 {
3077 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
d46c5b12
KH
3078 reg_bits |= 1 << i;
3079 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
4ed46869
KH
3080 }
3081 else if (CONSP (flags[i]))
3082 {
84d60297
RS
3083 Lisp_Object tail;
3084 tail = flags[i];
4ed46869 3085
d46c5b12 3086 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
4ed46869
KH
3087 if (INTEGERP (XCONS (tail)->car)
3088 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
3089 CHARSET_VALID_P (charset))
3090 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
4ed46869
KH
3091 {
3092 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3093 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3094 }
3095 else
3096 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3097 tail = XCONS (tail)->cdr;
3098 while (CONSP (tail))
3099 {
3100 if (INTEGERP (XCONS (tail)->car)
3101 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
3102 CHARSET_VALID_P (charset))
3103 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
70c22245
KH
3104 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3105 = i;
4ed46869 3106 else if (EQ (XCONS (tail)->car, Qt))
d46c5b12 3107 reg_bits |= 1 << i;
4ed46869
KH
3108 tail = XCONS (tail)->cdr;
3109 }
3110 }
3111 else
3112 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3113
3114 CODING_SPEC_ISO_DESIGNATION (coding, i)
3115 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3116 }
3117
d46c5b12 3118 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
4ed46869
KH
3119 {
3120 /* REG 1 can be used only by locking shift in 7-bit env. */
3121 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
d46c5b12 3122 reg_bits &= ~2;
4ed46869
KH
3123 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3124 /* Without any shifting, only REG 0 and 1 can be used. */
d46c5b12 3125 reg_bits &= 3;
4ed46869
KH
3126 }
3127
d46c5b12
KH
3128 if (reg_bits)
3129 for (charset = 0; charset <= MAX_CHARSET; charset++)
6e85d753 3130 {
d46c5b12
KH
3131 if (CHARSET_VALID_P (charset))
3132 {
3133 /* There exist some default graphic registers to be
3134 used CHARSET. */
3135
3136 /* We had better avoid designating a charset of
3137 CHARS96 to REG 0 as far as possible. */
3138 if (CHARSET_CHARS (charset) == 96)
3139 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3140 = (reg_bits & 2
3141 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3142 else
3143 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3144 = (reg_bits & 1
3145 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3146 }
6e85d753 3147 }
4ed46869 3148 }
c952af22 3149 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
d46c5b12 3150 coding->spec.iso2022.last_invalid_designation_register = -1;
4ed46869
KH
3151 break;
3152
3153 case 3:
3154 coding->type = coding_type_big5;
c952af22
KH
3155 coding->common_flags
3156 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 3157 coding->flags
4608c386 3158 = (NILP (XVECTOR (coding_spec)->contents[4])
4ed46869
KH
3159 ? CODING_FLAG_BIG5_HKU
3160 : CODING_FLAG_BIG5_ETEN);
3161 break;
3162
3163 case 4:
3164 coding->type = coding_type_ccl;
c952af22
KH
3165 coding->common_flags
3166 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 3167 {
84d60297 3168 Lisp_Object val;
d21ca14d
KH
3169 Lisp_Object decoder, encoder;
3170
84d60297 3171 val = XVECTOR (coding_spec)->contents[4];
4ed46869 3172 if (CONSP (val)
d21ca14d
KH
3173 && SYMBOLP (XCONS (val)->car)
3174 && !NILP (decoder = Fget (XCONS (val)->car, Qccl_program_idx))
f82423d7 3175 && !NILP (decoder = Fcdr (Faref (Vccl_program_table, decoder)))
d21ca14d
KH
3176 && SYMBOLP (XCONS (val)->cdr)
3177 && !NILP (encoder = Fget (XCONS (val)->cdr, Qccl_program_idx))
f82423d7 3178 && !NILP (encoder = Fcdr (Faref (Vccl_program_table, encoder))))
4ed46869 3179 {
d21ca14d
KH
3180 setup_ccl_program (&(coding->spec.ccl.decoder), decoder);
3181 setup_ccl_program (&(coding->spec.ccl.encoder), encoder);
4ed46869
KH
3182 }
3183 else
3184 goto label_invalid_coding_system;
1397dc18
KH
3185
3186 bzero (coding->spec.ccl.valid_codes, 256);
3187 val = Fplist_get (plist, Qvalid_codes);
3188 if (CONSP (val))
3189 {
3190 Lisp_Object this;
3191
7b179c2d 3192 for (; CONSP (val); val = XCONS (val)->cdr)
1397dc18 3193 {
7b179c2d 3194 this = XCONS (val)->car;
1397dc18
KH
3195 if (INTEGERP (this)
3196 && XINT (this) >= 0 && XINT (this) < 256)
3197 coding->spec.ccl.valid_codes[XINT (this)] = 1;
3198 else if (CONSP (this)
3199 && INTEGERP (XCONS (this)->car)
3200 && INTEGERP (XCONS (this)->cdr))
3201 {
3202 int start = XINT (XCONS (this)->car);
3203 int end = XINT (XCONS (this)->cdr);
3204
3205 if (start >= 0 && start <= end && end < 256)
e133c8fa 3206 while (start <= end)
1397dc18
KH
3207 coding->spec.ccl.valid_codes[start++] = 1;
3208 }
3209 }
3210 }
4ed46869 3211 }
c952af22 3212 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
4ed46869
KH
3213 break;
3214
27901516
KH
3215 case 5:
3216 coding->type = coding_type_raw_text;
3217 break;
3218
4ed46869 3219 default:
d46c5b12 3220 goto label_invalid_coding_system;
4ed46869
KH
3221 }
3222 return 0;
3223
3224 label_invalid_coding_system:
3225 coding->type = coding_type_no_conversion;
d46c5b12 3226 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
c952af22 3227 coding->common_flags = 0;
dec137e5 3228 coding->eol_type = CODING_EOL_LF;
d46c5b12 3229 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
4ed46869
KH
3230 return -1;
3231}
3232
54f78171
KH
3233/* Setup raw-text or one of its subsidiaries in the structure
3234 coding_system CODING according to the already setup value eol_type
3235 in CODING. CODING should be setup for some coding system in
3236 advance. */
3237
3238void
3239setup_raw_text_coding_system (coding)
3240 struct coding_system *coding;
3241{
3242 if (coding->type != coding_type_raw_text)
3243 {
3244 coding->symbol = Qraw_text;
3245 coding->type = coding_type_raw_text;
3246 if (coding->eol_type != CODING_EOL_UNDECIDED)
3247 {
84d60297
RS
3248 Lisp_Object subsidiaries;
3249 subsidiaries = Fget (Qraw_text, Qeol_type);
54f78171
KH
3250
3251 if (VECTORP (subsidiaries)
3252 && XVECTOR (subsidiaries)->size == 3)
3253 coding->symbol
3254 = XVECTOR (subsidiaries)->contents[coding->eol_type];
3255 }
716e0b0a 3256 setup_coding_system (coding->symbol, coding);
54f78171
KH
3257 }
3258 return;
3259}
3260
4ed46869
KH
3261/* Emacs has a mechanism to automatically detect a coding system if it
3262 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
3263 it's impossible to distinguish some coding systems accurately
3264 because they use the same range of codes. So, at first, coding
3265 systems are categorized into the following categories:
3266
0ef69138 3267 o coding-category-emacs-mule
4ed46869
KH
3268
3269 The category for a coding system which has the same code range
3270 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 3271 symbol) `emacs-mule' by default.
4ed46869
KH
3272
3273 o coding-category-sjis
3274
3275 The category for a coding system which has the same code range
3276 as SJIS. Assigned the coding-system (Lisp
7717c392 3277 symbol) `japanese-shift-jis' by default.
4ed46869
KH
3278
3279 o coding-category-iso-7
3280
3281 The category for a coding system which has the same code range
7717c392 3282 as ISO2022 of 7-bit environment. This doesn't use any locking
d46c5b12
KH
3283 shift and single shift functions. This can encode/decode all
3284 charsets. Assigned the coding-system (Lisp symbol)
3285 `iso-2022-7bit' by default.
3286
3287 o coding-category-iso-7-tight
3288
3289 Same as coding-category-iso-7 except that this can
3290 encode/decode only the specified charsets.
4ed46869
KH
3291
3292 o coding-category-iso-8-1
3293
3294 The category for a coding system which has the same code range
3295 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
3296 for DIMENSION1 charset. This doesn't use any locking shift
3297 and single shift functions. Assigned the coding-system (Lisp
3298 symbol) `iso-latin-1' by default.
4ed46869
KH
3299
3300 o coding-category-iso-8-2
3301
3302 The category for a coding system which has the same code range
3303 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
3304 for DIMENSION2 charset. This doesn't use any locking shift
3305 and single shift functions. Assigned the coding-system (Lisp
3306 symbol) `japanese-iso-8bit' by default.
4ed46869 3307
7717c392 3308 o coding-category-iso-7-else
4ed46869
KH
3309
3310 The category for a coding system which has the same code range
7717c392
KH
3311 as ISO2022 of 7-bit environment but uses locking shift or
3312 single shift functions. Assigned the coding-system (Lisp
3313 symbol) `iso-2022-7bit-lock' by default.
3314
3315 o coding-category-iso-8-else
3316
3317 The category for a coding system which has the same code range
3318 as ISO2022 of 8-bit environment but uses locking shift or
3319 single shift functions. Assigned the coding-system (Lisp
3320 symbol) `iso-2022-8bit-ss2' by default.
4ed46869
KH
3321
3322 o coding-category-big5
3323
3324 The category for a coding system which has the same code range
3325 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 3326 `cn-big5' by default.
4ed46869 3327
1397dc18
KH
3328 o coding-category-ccl
3329
3330 The category for a coding system of which encoder/decoder is
3331 written in CCL programs. The default value is nil, i.e., no
3332 coding system is assigned.
3333
4ed46869
KH
3334 o coding-category-binary
3335
3336 The category for a coding system not categorized in any of the
3337 above. Assigned the coding-system (Lisp symbol)
e0e989f6 3338 `no-conversion' by default.
4ed46869
KH
3339
3340 Each of them is a Lisp symbol and the value is an actual
3341 `coding-system's (this is also a Lisp symbol) assigned by a user.
3342 What Emacs does actually is to detect a category of coding system.
3343 Then, it uses a `coding-system' assigned to it. If Emacs can't
3344 decide only one possible category, it selects a category of the
3345 highest priority. Priorities of categories are also specified by a
3346 user in a Lisp variable `coding-category-list'.
3347
3348*/
3349
66cfb530
KH
3350static
3351int ascii_skip_code[256];
3352
d46c5b12 3353/* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4ed46869
KH
3354 If it detects possible coding systems, return an integer in which
3355 appropriate flag bits are set. Flag bits are defined by macros
d46c5b12 3356 CODING_CATEGORY_MASK_XXX in `coding.h'.
4ed46869 3357
d46c5b12
KH
3358 How many ASCII characters are at the head is returned as *SKIP. */
3359
3360static int
3361detect_coding_mask (source, src_bytes, priorities, skip)
3362 unsigned char *source;
3363 int src_bytes, *priorities, *skip;
4ed46869
KH
3364{
3365 register unsigned char c;
d46c5b12 3366 unsigned char *src = source, *src_end = source + src_bytes;
66cfb530 3367 unsigned int mask;
d46c5b12 3368 int i;
4ed46869
KH
3369
3370 /* At first, skip all ASCII characters and control characters except
3371 for three ISO2022 specific control characters. */
66cfb530
KH
3372 ascii_skip_code[ISO_CODE_SO] = 0;
3373 ascii_skip_code[ISO_CODE_SI] = 0;
3374 ascii_skip_code[ISO_CODE_ESC] = 0;
3375
bcf26d6a 3376 label_loop_detect_coding:
66cfb530 3377 while (src < src_end && ascii_skip_code[*src]) src++;
d46c5b12 3378 *skip = src - source;
4ed46869
KH
3379
3380 if (src >= src_end)
3381 /* We found nothing other than ASCII. There's nothing to do. */
d46c5b12 3382 return 0;
4ed46869 3383
8a8147d6 3384 c = *src;
4ed46869
KH
3385 /* The text seems to be encoded in some multilingual coding system.
3386 Now, try to find in which coding system the text is encoded. */
3387 if (c < 0x80)
bcf26d6a
KH
3388 {
3389 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3390 /* C is an ISO2022 specific control code of C0. */
3391 mask = detect_coding_iso2022 (src, src_end);
1b2af4b0 3392 if (mask == 0)
d46c5b12
KH
3393 {
3394 /* No valid ISO2022 code follows C. Try again. */
3395 src++;
66cfb530
KH
3396 if (c == ISO_CODE_ESC)
3397 ascii_skip_code[ISO_CODE_ESC] = 1;
3398 else
3399 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
d46c5b12
KH
3400 goto label_loop_detect_coding;
3401 }
3402 if (priorities)
3403 goto label_return_highest_only;
bcf26d6a 3404 }
d46c5b12 3405 else
c4825358 3406 {
d46c5b12 3407 int try;
4ed46869 3408
d46c5b12
KH
3409 if (c < 0xA0)
3410 {
3411 /* C is the first byte of SJIS character code,
3412 or a leading-code of Emacs' internal format (emacs-mule). */
3413 try = CODING_CATEGORY_MASK_SJIS | CODING_CATEGORY_MASK_EMACS_MULE;
3414
3415 /* Or, if C is a special latin extra code,
3416 or is an ISO2022 specific control code of C1 (SS2 or SS3),
3417 or is an ISO2022 control-sequence-introducer (CSI),
3418 we should also consider the possibility of ISO2022 codings. */
3419 if ((VECTORP (Vlatin_extra_code_table)
3420 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3421 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3422 || (c == ISO_CODE_CSI
3423 && (src < src_end
3424 && (*src == ']'
3425 || ((*src == '0' || *src == '1' || *src == '2')
3426 && src + 1 < src_end
3427 && src[1] == ']')))))
3428 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3429 | CODING_CATEGORY_MASK_ISO_8BIT);
3430 }
c4825358 3431 else
d46c5b12
KH
3432 /* C is a character of ISO2022 in graphic plane right,
3433 or a SJIS's 1-byte character code (i.e. JISX0201),
3434 or the first byte of BIG5's 2-byte code. */
3435 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3436 | CODING_CATEGORY_MASK_ISO_8BIT
3437 | CODING_CATEGORY_MASK_SJIS
3438 | CODING_CATEGORY_MASK_BIG5);
3439
1397dc18
KH
3440 /* Or, we may have to consider the possibility of CCL. */
3441 if (coding_system_table[CODING_CATEGORY_IDX_CCL]
3442 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
3443 ->spec.ccl.valid_codes)[c])
3444 try |= CODING_CATEGORY_MASK_CCL;
3445
d46c5b12
KH
3446 mask = 0;
3447 if (priorities)
3448 {
3449 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3450 {
5ab13dd0 3451 if (priorities[i] & try & CODING_CATEGORY_MASK_ISO)
d46c5b12 3452 mask = detect_coding_iso2022 (src, src_end);
5ab13dd0 3453 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
d46c5b12 3454 mask = detect_coding_sjis (src, src_end);
5ab13dd0 3455 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
d46c5b12 3456 mask = detect_coding_big5 (src, src_end);
5ab13dd0 3457 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
d46c5b12 3458 mask = detect_coding_emacs_mule (src, src_end);
89fa8b36 3459 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
1397dc18 3460 mask = detect_coding_ccl (src, src_end);
5ab13dd0
RS
3461 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
3462 mask = CODING_CATEGORY_MASK_RAW_TEXT;
3463 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
3464 mask = CODING_CATEGORY_MASK_BINARY;
d46c5b12
KH
3465 if (mask)
3466 goto label_return_highest_only;
3467 }
3468 return CODING_CATEGORY_MASK_RAW_TEXT;
3469 }
3470 if (try & CODING_CATEGORY_MASK_ISO)
3471 mask |= detect_coding_iso2022 (src, src_end);
3472 if (try & CODING_CATEGORY_MASK_SJIS)
3473 mask |= detect_coding_sjis (src, src_end);
3474 if (try & CODING_CATEGORY_MASK_BIG5)
3475 mask |= detect_coding_big5 (src, src_end);
3476 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
1397dc18
KH
3477 mask |= detect_coding_emacs_mule (src, src_end);
3478 if (try & CODING_CATEGORY_MASK_CCL)
3479 mask |= detect_coding_ccl (src, src_end);
c4825358 3480 }
5ab13dd0 3481 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
d46c5b12
KH
3482
3483 label_return_highest_only:
3484 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3485 {
3486 if (mask & priorities[i])
3487 return priorities[i];
3488 }
3489 return CODING_CATEGORY_MASK_RAW_TEXT;
4ed46869
KH
3490}
3491
3492/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3493 The information of the detected coding system is set in CODING. */
3494
3495void
3496detect_coding (coding, src, src_bytes)
3497 struct coding_system *coding;
3498 unsigned char *src;
3499 int src_bytes;
3500{
d46c5b12
KH
3501 unsigned int idx;
3502 int skip, mask, i;
84d60297 3503 Lisp_Object val;
4ed46869 3504
84d60297 3505 val = Vcoding_category_list;
66cfb530 3506 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip);
d46c5b12 3507 coding->heading_ascii = skip;
4ed46869 3508
d46c5b12
KH
3509 if (!mask) return;
3510
3511 /* We found a single coding system of the highest priority in MASK. */
3512 idx = 0;
3513 while (mask && ! (mask & 1)) mask >>= 1, idx++;
3514 if (! mask)
3515 idx = CODING_CATEGORY_IDX_RAW_TEXT;
4ed46869 3516
d46c5b12
KH
3517 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3518
3519 if (coding->eol_type != CODING_EOL_UNDECIDED)
27901516 3520 {
84d60297 3521 Lisp_Object tmp;
d46c5b12 3522
84d60297 3523 tmp = Fget (val, Qeol_type);
d46c5b12
KH
3524 if (VECTORP (tmp))
3525 val = XVECTOR (tmp)->contents[coding->eol_type];
4ed46869 3526 }
d46c5b12
KH
3527 setup_coding_system (val, coding);
3528 /* Set this again because setup_coding_system reset this member. */
3529 coding->heading_ascii = skip;
4ed46869
KH
3530}
3531
d46c5b12
KH
3532/* Detect how end-of-line of a text of length SRC_BYTES pointed by
3533 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3534 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3535
3536 How many non-eol characters are at the head is returned as *SKIP. */
4ed46869 3537
bc4bc72a
RS
3538#define MAX_EOL_CHECK_COUNT 3
3539
d46c5b12
KH
3540static int
3541detect_eol_type (source, src_bytes, skip)
3542 unsigned char *source;
3543 int src_bytes, *skip;
4ed46869 3544{
d46c5b12 3545 unsigned char *src = source, *src_end = src + src_bytes;
4ed46869 3546 unsigned char c;
bc4bc72a
RS
3547 int total = 0; /* How many end-of-lines are found so far. */
3548 int eol_type = CODING_EOL_UNDECIDED;
3549 int this_eol_type;
4ed46869 3550
d46c5b12
KH
3551 *skip = 0;
3552
bc4bc72a 3553 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4ed46869
KH
3554 {
3555 c = *src++;
bc4bc72a 3556 if (c == '\n' || c == '\r')
4ed46869 3557 {
d46c5b12
KH
3558 if (*skip == 0)
3559 *skip = src - 1 - source;
bc4bc72a
RS
3560 total++;
3561 if (c == '\n')
3562 this_eol_type = CODING_EOL_LF;
3563 else if (src >= src_end || *src != '\n')
3564 this_eol_type = CODING_EOL_CR;
4ed46869 3565 else
bc4bc72a
RS
3566 this_eol_type = CODING_EOL_CRLF, src++;
3567
3568 if (eol_type == CODING_EOL_UNDECIDED)
3569 /* This is the first end-of-line. */
3570 eol_type = this_eol_type;
3571 else if (eol_type != this_eol_type)
d46c5b12
KH
3572 {
3573 /* The found type is different from what found before. */
3574 eol_type = CODING_EOL_INCONSISTENT;
3575 break;
3576 }
4ed46869
KH
3577 }
3578 }
bc4bc72a 3579
d46c5b12
KH
3580 if (*skip == 0)
3581 *skip = src_end - source;
85a02ca4 3582 return eol_type;
4ed46869
KH
3583}
3584
3585/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3586 is encoded. If it detects an appropriate format of end-of-line, it
3587 sets the information in *CODING. */
3588
3589void
3590detect_eol (coding, src, src_bytes)
3591 struct coding_system *coding;
3592 unsigned char *src;
3593 int src_bytes;
3594{
4608c386 3595 Lisp_Object val;
d46c5b12
KH
3596 int skip;
3597 int eol_type = detect_eol_type (src, src_bytes, &skip);
3598
3599 if (coding->heading_ascii > skip)
3600 coding->heading_ascii = skip;
3601 else
3602 skip = coding->heading_ascii;
4ed46869 3603
0ef69138 3604 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869 3605 return;
27901516
KH
3606 if (eol_type == CODING_EOL_INCONSISTENT)
3607 {
3608#if 0
3609 /* This code is suppressed until we find a better way to
992f23f2 3610 distinguish raw text file and binary file. */
27901516
KH
3611
3612 /* If we have already detected that the coding is raw-text, the
3613 coding should actually be no-conversion. */
3614 if (coding->type == coding_type_raw_text)
3615 {
3616 setup_coding_system (Qno_conversion, coding);
3617 return;
3618 }
3619 /* Else, let's decode only text code anyway. */
3620#endif /* 0 */
1b2af4b0 3621 eol_type = CODING_EOL_LF;
27901516
KH
3622 }
3623
4608c386 3624 val = Fget (coding->symbol, Qeol_type);
4ed46869 3625 if (VECTORP (val) && XVECTOR (val)->size == 3)
d46c5b12
KH
3626 {
3627 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
3628 coding->heading_ascii = skip;
3629 }
3630}
3631
/* Number of bytes added on top of the estimated size of a conversion
   buffer.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which the byte length of a text may grow when it is
   decoded by CODING (a pointer to struct coding_system).  CODING is
   parenthesized at every use so that any pointer expression, e.g.
   `&cs' or `p + 1', can be given as the argument.  */
#define DECODING_BUFFER_MAG(coding)                                          \
  ((coding)->type == coding_type_iso2022                                     \
   ? 3                                                                       \
   : (((coding)->type == coding_type_sjis                                    \
       || (coding)->type == coding_type_big5)                                \
      ? 2                                                                    \
      : ((coding)->type == coding_type_raw_text                              \
         ? 1                                                                 \
         : ((coding)->type == coding_type_ccl                                \
            ? (coding)->spec.ccl.decoder.buf_magnification                   \
            : 2))))
3644
3645/* Return maximum size (bytes) of a buffer enough for decoding
3646 SRC_BYTES of text encoded in CODING. */
3647
3648int
3649decoding_buffer_size (coding, src_bytes)
3650 struct coding_system *coding;
3651 int src_bytes;
3652{
3653 return (src_bytes * DECODING_BUFFER_MAG (coding)
3654 + CONVERSION_BUFFER_EXTRA_ROOM);
3655}
3656
3657/* Return maximum size (bytes) of a buffer enough for encoding
3658 SRC_BYTES of text to CODING. */
3659
3660int
3661encoding_buffer_size (coding, src_bytes)
3662 struct coding_system *coding;
3663 int src_bytes;
3664{
3665 int magnification;
3666
3667 if (coding->type == coding_type_ccl)
3668 magnification = coding->spec.ccl.encoder.buf_magnification;
3669 else
3670 magnification = 3;
3671
3672 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3673}
3674
/* Smallest size ever chosen when the conversion buffer is grown.  */
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* The shared work buffer handed out by get_conversion_buffer, and its
   current size in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.  The shared buffer is grown automatically by
   doubling when it is too small; its previous contents are NOT
   preserved in that case.

   NOTE(review): this function does not check the value of xmalloc.
   Presumably xmalloc signals an error itself instead of returning
   NULL -- confirm before relying on a NULL return here.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size * 2;

      /* If the buffer has not been set up yet, its size is 0 and
         doubling alone would never reach SIZE.  Start from the
         minimum size instead.  */
      if (real_size < MINIMUM_CONVERSION_BUFFER_SIZE)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;
      while (real_size < size) real_size *= 2;
      buf = (char *) xmalloc (real_size);
      /* How xfree treats NULL is not visible from here, so don't hand
         it a buffer that was never allocated.  */
      if (conversion_buffer)
        xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
3703
3704int
3705ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
3706 struct coding_system *coding;
3707 unsigned char *source, *destination;
3708 int src_bytes, dst_bytes, encodep;
3709{
3710 struct ccl_program *ccl
3711 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
3712 int result;
3713
ae9ff118 3714 ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
7b179c2d 3715
d46c5b12
KH
3716 coding->produced = ccl_driver (ccl, source, destination,
3717 src_bytes, dst_bytes, &(coding->consumed));
69f76525 3718 coding->produced_char
48942766
KH
3719 = (encodep
3720 ? coding->produced
3721 : multibyte_chars_in_text (destination, coding->produced));
69f76525
KH
3722 coding->consumed_char
3723 = multibyte_chars_in_text (source, coding->consumed);
3724
d46c5b12
KH
3725 switch (ccl->status)
3726 {
3727 case CCL_STAT_SUSPEND_BY_SRC:
3728 result = CODING_FINISH_INSUFFICIENT_SRC;
3729 break;
3730 case CCL_STAT_SUSPEND_BY_DST:
3731 result = CODING_FINISH_INSUFFICIENT_DST;
3732 break;
9864ebce
KH
3733 case CCL_STAT_QUIT:
3734 case CCL_STAT_INVALID_CMD:
3735 result = CODING_FINISH_INTERRUPT;
3736 break;
d46c5b12
KH
3737 default:
3738 result = CODING_FINISH_NORMAL;
3739 break;
3740 }
3741 return result;
4ed46869
KH
3742}
3743
3744/* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
3745 decoding, it may detect coding system and format of end-of-line if
52d41803
KH
3746 those are not yet decided.
3747
3748 This function does not make full use of DESTINATION buffer. For
3749 instance, if coding->type is coding_type_iso2022, it uses only
3750 (DST_BYTES - 7) bytes of DESTINATION buffer. In the case that
3751 DST_BYTES is decided by the function decoding_buffer_size, it
3752 contains extra 256 bytes (defined by CONVERSION_BUFFER_EXTRA_ROOM).
3753 So, this function can decode the full SOURCE. But, in the other
3754 case, if you want to avoid carry over, you must supply at least 7
3755 bytes more area in DESTINATION buffer than expected maximum bytes
3756 that will be produced by this function. */
4ed46869
KH
3757
3758int
d46c5b12 3759decode_coding (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
3760 struct coding_system *coding;
3761 unsigned char *source, *destination;
3762 int src_bytes, dst_bytes;
4ed46869 3763{
d46c5b12 3764 int result;
4ed46869 3765
d4e57bcd 3766 if (src_bytes <= 0
944bd420 3767 && coding->type != coding_type_ccl
d4e57bcd
KH
3768 && ! (coding->mode & CODING_MODE_LAST_BLOCK
3769 && CODING_REQUIRE_FLUSHING (coding)))
4ed46869 3770 {
d46c5b12
KH
3771 coding->produced = coding->produced_char = 0;
3772 coding->consumed = coding->consumed_char = 0;
fb88bf2d 3773 coding->fake_multibyte = 0;
d46c5b12 3774 return CODING_FINISH_NORMAL;
4ed46869
KH
3775 }
3776
0ef69138 3777 if (coding->type == coding_type_undecided)
4ed46869
KH
3778 detect_coding (coding, source, src_bytes);
3779
0ef69138 3780 if (coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
3781 detect_eol (coding, source, src_bytes);
3782
4ed46869
KH
3783 switch (coding->type)
3784 {
0ef69138
KH
3785 case coding_type_emacs_mule:
3786 case coding_type_undecided:
27901516 3787 case coding_type_raw_text:
4ed46869 3788 if (coding->eol_type == CODING_EOL_LF
0ef69138 3789 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869 3790 goto label_no_conversion;
d46c5b12 3791 result = decode_eol (coding, source, destination, src_bytes, dst_bytes);
4ed46869
KH
3792 break;
3793
3794 case coding_type_sjis:
d46c5b12
KH
3795 result = decode_coding_sjis_big5 (coding, source, destination,
3796 src_bytes, dst_bytes, 1);
4ed46869
KH
3797 break;
3798
3799 case coding_type_iso2022:
d46c5b12
KH
3800 result = decode_coding_iso2022 (coding, source, destination,
3801 src_bytes, dst_bytes);
4ed46869
KH
3802 break;
3803
3804 case coding_type_big5:
d46c5b12
KH
3805 result = decode_coding_sjis_big5 (coding, source, destination,
3806 src_bytes, dst_bytes, 0);
4ed46869
KH
3807 break;
3808
3809 case coding_type_ccl:
d46c5b12
KH
3810 result = ccl_coding_driver (coding, source, destination,
3811 src_bytes, dst_bytes, 0);
3812 break;
3813
3814 default: /* i.e. case coding_type_no_conversion: */
3815 label_no_conversion:
3816 if (dst_bytes && src_bytes > dst_bytes)
3817 {
3818 coding->produced = dst_bytes;
3819 result = CODING_FINISH_INSUFFICIENT_DST;
3820 }
3821 else
3822 {
3823 coding->produced = src_bytes;
3824 result = CODING_FINISH_NORMAL;
3825 }
3826 if (dst_bytes)
3827 bcopy (source, destination, coding->produced);
3828 else
3829 safe_bcopy (source, destination, coding->produced);
fb88bf2d 3830 coding->fake_multibyte = 1;
d46c5b12
KH
3831 coding->consumed
3832 = coding->consumed_char = coding->produced_char = coding->produced;
4ed46869
KH
3833 break;
3834 }
3835
d46c5b12 3836 return result;
4ed46869
KH
3837}
3838
52d41803
KH
3839/* See "GENERAL NOTES about `encode_coding_XXX ()' functions".
3840
3841 This function does not make full use of DESTINATION buffer. For
3842 instance, if coding->type is coding_type_iso2022, it uses only
3843 (DST_BYTES - 20) bytes of DESTINATION buffer. In the case that
3844 DST_BYTES is decided by the function encoding_buffer_size, it
3845 contains extra 256 bytes (defined by CONVERSION_BUFFER_EXTRA_ROOM).
3846 So, this function can encode the full SOURCE. But, in the other
3847 case, if you want to avoid carry over, you must supply at least 20
3848 bytes more area in DESTINATION buffer than expected maximum bytes
3849 that will be produced by this function. */
4ed46869
KH
3850
3851int
d46c5b12 3852encode_coding (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
3853 struct coding_system *coding;
3854 unsigned char *source, *destination;
3855 int src_bytes, dst_bytes;
4ed46869 3856{
d46c5b12 3857 int result;
4ed46869 3858
d4e57bcd
KH
3859 if (src_bytes <= 0
3860 && ! (coding->mode & CODING_MODE_LAST_BLOCK
3861 && CODING_REQUIRE_FLUSHING (coding)))
4ed46869 3862 {
d46c5b12
KH
3863 coding->produced = coding->produced_char = 0;
3864 coding->consumed = coding->consumed_char = 0;
fb88bf2d 3865 coding->fake_multibyte = 0;
d46c5b12
KH
3866 return CODING_FINISH_NORMAL;
3867 }
4ed46869 3868
d46c5b12
KH
3869 switch (coding->type)
3870 {
0ef69138
KH
3871 case coding_type_emacs_mule:
3872 case coding_type_undecided:
27901516 3873 case coding_type_raw_text:
4ed46869 3874 if (coding->eol_type == CODING_EOL_LF
0ef69138 3875 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869 3876 goto label_no_conversion;
d46c5b12 3877 result = encode_eol (coding, source, destination, src_bytes, dst_bytes);
4ed46869
KH
3878 break;
3879
3880 case coding_type_sjis:
d46c5b12
KH
3881 result = encode_coding_sjis_big5 (coding, source, destination,
3882 src_bytes, dst_bytes, 1);
4ed46869
KH
3883 break;
3884
3885 case coding_type_iso2022:
d46c5b12
KH
3886 result = encode_coding_iso2022 (coding, source, destination,
3887 src_bytes, dst_bytes);
4ed46869
KH
3888 break;
3889
3890 case coding_type_big5:
d46c5b12
KH
3891 result = encode_coding_sjis_big5 (coding, source, destination,
3892 src_bytes, dst_bytes, 0);
4ed46869
KH
3893 break;
3894
3895 case coding_type_ccl:
d46c5b12
KH
3896 result = ccl_coding_driver (coding, source, destination,
3897 src_bytes, dst_bytes, 1);
3898 break;
3899
3900 default: /* i.e. case coding_type_no_conversion: */
3901 label_no_conversion:
3902 if (dst_bytes && src_bytes > dst_bytes)
3903 {
3904 coding->produced = dst_bytes;
3905 result = CODING_FINISH_INSUFFICIENT_DST;
3906 }
3907 else
3908 {
3909 coding->produced = src_bytes;
3910 result = CODING_FINISH_NORMAL;
3911 }
3912 if (dst_bytes)
3913 bcopy (source, destination, coding->produced);
3914 else
3915 safe_bcopy (source, destination, coding->produced);
3916 if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
3917 {
3918 unsigned char *p = destination, *pend = p + coding->produced;
3919 while (p < pend)
3920 if (*p++ == '\015') p[-1] = '\n';
3921 }
fb88bf2d 3922 coding->fake_multibyte = 1;
d46c5b12
KH
3923 coding->consumed
3924 = coding->consumed_char = coding->produced_char = coding->produced;
4ed46869
KH
3925 break;
3926 }
3927
d46c5b12 3928 return result;
4ed46869
KH
3929}
3930
fb88bf2d
KH
3931/* Scan text in the region between *BEG and *END (byte positions),
3932 skip characters which we don't have to decode by coding system
3933 CODING at the head and tail, then set *BEG and *END to the region
3934 of the text we actually have to convert. The caller should move
3935 the gap out of the region in advance.
4ed46869 3936
d46c5b12
KH
3937 If STR is not NULL, *BEG and *END are indices into STR. */
3938
3939static void
3940shrink_decoding_region (beg, end, coding, str)
3941 int *beg, *end;
3942 struct coding_system *coding;
3943 unsigned char *str;
3944{
fb88bf2d 3945 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
d46c5b12 3946 int eol_conversion;
88993dfd 3947 Lisp_Object translation_table;
d46c5b12
KH
3948
3949 if (coding->type == coding_type_ccl
3950 || coding->type == coding_type_undecided
3951 || !NILP (coding->post_read_conversion))
3952 {
3953 /* We can't skip any data. */
3954 return;
3955 }
3956 else if (coding->type == coding_type_no_conversion)
3957 {
fb88bf2d
KH
3958 /* We need no conversion, but don't have to skip any data here.
3959 Decoding routine handles them effectively anyway. */
d46c5b12
KH
3960 return;
3961 }
3962
88993dfd
KH
3963 translation_table = coding->translation_table_for_decode;
3964 if (NILP (translation_table) && !NILP (Venable_character_translation))
3965 translation_table = Vstandard_translation_table_for_decode;
3966 if (CHAR_TABLE_P (translation_table))
3967 {
3968 int i;
3969 for (i = 0; i < 128; i++)
3970 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
3971 break;
3972 if (i < 128)
3973 /* Some ASCII character should be tranlsated. We give up
3974 shrinking. */
3975 return;
3976 }
3977
aa60dea6
KH
3978 eol_conversion = (coding->eol_type != CODING_EOL_LF);
3979
3980 if ((! eol_conversion) && (coding->heading_ascii >= 0))
d46c5b12
KH
3981 /* Detection routine has already found how much we can skip at the
3982 head. */
3983 *beg += coding->heading_ascii;
3984
3985 if (str)
3986 {
3987 begp_orig = begp = str + *beg;
3988 endp_orig = endp = str + *end;
3989 }
3990 else
3991 {
fb88bf2d 3992 begp_orig = begp = BYTE_POS_ADDR (*beg);
d46c5b12
KH
3993 endp_orig = endp = begp + *end - *beg;
3994 }
3995
d46c5b12
KH
3996 switch (coding->type)
3997 {
3998 case coding_type_emacs_mule:
3999 case coding_type_raw_text:
4000 if (eol_conversion)
4001 {
4002 if (coding->heading_ascii < 0)
fb88bf2d 4003 while (begp < endp && *begp != '\r' && *begp < 0x80) begp++;
ee59c65f 4004 while (begp < endp && endp[-1] != '\r' && endp[-1] < 0x80)
fb88bf2d 4005 endp--;
ee59c65f
RS
4006 /* Do not consider LF as ascii if preceded by CR, since that
4007 confuses eol decoding. */
4008 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4009 endp++;
d46c5b12
KH
4010 }
4011 else
4012 begp = endp;
4013 break;
4014
4015 case coding_type_sjis:
4016 case coding_type_big5:
4017 /* We can skip all ASCII characters at the head. */
4018 if (coding->heading_ascii < 0)
4019 {
4020 if (eol_conversion)
de9d083c 4021 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
d46c5b12
KH
4022 else
4023 while (begp < endp && *begp < 0x80) begp++;
4024 }
4025 /* We can skip all ASCII characters at the tail except for the
4026 second byte of SJIS or BIG5 code. */
4027 if (eol_conversion)
de9d083c 4028 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
d46c5b12
KH
4029 else
4030 while (begp < endp && endp[-1] < 0x80) endp--;
ee59c65f
RS
4031 /* Do not consider LF as ascii if preceded by CR, since that
4032 confuses eol decoding. */
4033 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4034 endp++;
d46c5b12
KH
4035 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
4036 endp++;
4037 break;
4038
4039 default: /* i.e. case coding_type_iso2022: */
622fece5
KH
4040 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4041 /* We can't skip any data. */
4042 break;
d46c5b12
KH
4043 if (coding->heading_ascii < 0)
4044 {
d46c5b12
KH
4045 /* We can skip all ASCII characters at the head except for a
4046 few control codes. */
4047 while (begp < endp && (c = *begp) < 0x80
4048 && c != ISO_CODE_CR && c != ISO_CODE_SO
4049 && c != ISO_CODE_SI && c != ISO_CODE_ESC
4050 && (!eol_conversion || c != ISO_CODE_LF))
4051 begp++;
4052 }
4053 switch (coding->category_idx)
4054 {
4055 case CODING_CATEGORY_IDX_ISO_8_1:
4056 case CODING_CATEGORY_IDX_ISO_8_2:
4057 /* We can skip all ASCII characters at the tail. */
4058 if (eol_conversion)
de9d083c 4059 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
d46c5b12
KH
4060 else
4061 while (begp < endp && endp[-1] < 0x80) endp--;
ee59c65f
RS
4062 /* Do not consider LF as ascii if preceded by CR, since that
4063 confuses eol decoding. */
4064 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4065 endp++;
d46c5b12
KH
4066 break;
4067
4068 case CODING_CATEGORY_IDX_ISO_7:
4069 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
de79a6a5
KH
4070 {
4071 /* We can skip all charactes at the tail except for 8-bit
4072 codes and ESC and the following 2-byte at the tail. */
4073 unsigned char *eight_bit = NULL;
4074
4075 if (eol_conversion)
4076 while (begp < endp
4077 && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4078 {
4079 if (!eight_bit && c & 0x80) eight_bit = endp;
4080 endp--;
4081 }
4082 else
4083 while (begp < endp
4084 && (c = endp[-1]) != ISO_CODE_ESC)
4085 {
4086 if (!eight_bit && c & 0x80) eight_bit = endp;
4087 endp--;
4088 }
4089 /* Do not consider LF as ascii if preceded by CR, since that
4090 confuses eol decoding. */
4091 if (begp < endp && endp < endp_orig
4092 && endp[-1] == '\r' && endp[0] == '\n')
4093 endp++;
4094 if (begp < endp && endp[-1] == ISO_CODE_ESC)
4095 {
4096 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
4097 /* This is an ASCII designation sequence. We can
4098 surely skip the tail. But, if we have
4099 encountered an 8-bit code, skip only the codes
4100 after that. */
4101 endp = eight_bit ? eight_bit : endp + 2;
4102 else
4103 /* Hmmm, we can't skip the tail. */
4104 endp = endp_orig;
4105 }
4106 else if (eight_bit)
4107 endp = eight_bit;
4108 }
d46c5b12
KH
4109 }
4110 }
4111 *beg += begp - begp_orig;
4112 *end += endp - endp_orig;
4113 return;
4114}
4115
4116/* Like shrink_decoding_region but for encoding. */
4117
4118static void
4119shrink_encoding_region (beg, end, coding, str)
4120 int *beg, *end;
4121 struct coding_system *coding;
4122 unsigned char *str;
4123{
4124 unsigned char *begp_orig, *begp, *endp_orig, *endp;
4125 int eol_conversion;
88993dfd 4126 Lisp_Object translation_table;
d46c5b12
KH
4127
4128 if (coding->type == coding_type_ccl)
4129 /* We can't skip any data. */
4130 return;
4131 else if (coding->type == coding_type_no_conversion)
4132 {
4133 /* We need no conversion. */
4134 *beg = *end;
4135 return;
4136 }
4137
88993dfd
KH
4138 translation_table = coding->translation_table_for_encode;
4139 if (NILP (translation_table) && !NILP (Venable_character_translation))
4140 translation_table = Vstandard_translation_table_for_encode;
4141 if (CHAR_TABLE_P (translation_table))
4142 {
4143 int i;
4144 for (i = 0; i < 128; i++)
4145 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4146 break;
4147 if (i < 128)
4148 /* Some ASCII character should be tranlsated. We give up
4149 shrinking. */
4150 return;
4151 }
4152
d46c5b12
KH
4153 if (str)
4154 {
4155 begp_orig = begp = str + *beg;
4156 endp_orig = endp = str + *end;
4157 }
4158 else
4159 {
fb88bf2d 4160 begp_orig = begp = BYTE_POS_ADDR (*beg);
d46c5b12
KH
4161 endp_orig = endp = begp + *end - *beg;
4162 }
4163
4164 eol_conversion = (coding->eol_type == CODING_EOL_CR
4165 || coding->eol_type == CODING_EOL_CRLF);
4166
4167 /* Here, we don't have to check coding->pre_write_conversion because
4168 the caller is expected to have handled it already. */
4169 switch (coding->type)
4170 {
4171 case coding_type_undecided:
4172 case coding_type_emacs_mule:
4173 case coding_type_raw_text:
4174 if (eol_conversion)
4175 {
4176 while (begp < endp && *begp != '\n') begp++;
4177 while (begp < endp && endp[-1] != '\n') endp--;
4178 }
4179 else
4180 begp = endp;
4181 break;
4182
4183 case coding_type_iso2022:
622fece5
KH
4184 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4185 /* We can't skip any data. */
4186 break;
d46c5b12
KH
4187 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
4188 {
4189 unsigned char *bol = begp;
4190 while (begp < endp && *begp < 0x80)
4191 {
4192 begp++;
4193 if (begp[-1] == '\n')
4194 bol = begp;
4195 }
4196 begp = bol;
4197 goto label_skip_tail;
4198 }
4199 /* fall down ... */
4200
4201 default:
4202 /* We can skip all ASCII characters at the head and tail. */
4203 if (eol_conversion)
4204 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
4205 else
4206 while (begp < endp && *begp < 0x80) begp++;
4207 label_skip_tail:
4208 if (eol_conversion)
4209 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
4210 else
4211 while (begp < endp && *(endp - 1) < 0x80) endp--;
4212 break;
4213 }
4214
4215 *beg += begp - begp_orig;
4216 *end += endp - endp_orig;
4217 return;
4218}
4219
88993dfd
KH
/* Shrinking the conversion region has some overhead of its own, so it
   is attempted only for regions longer than this many bytes.  */
static int shrink_conversion_region_threshhold = 1024;

/* Narrow the region *BEG .. *END before converting it by CODING,
   using shrink_encoding_region if ENCODEP is nonzero and
   shrink_decoding_region otherwise.  BEG and END are pointers to the
   byte positions; STR is passed on unchanged.  A region that is not
   longer than shrink_conversion_region_threshhold is left alone.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                   \
    if (*(end) - *(beg) <= shrink_conversion_region_threshhold)         \
      ; /* Too short to be worth scanning.  */                           \
    else if (encodep)                                                    \
      shrink_encoding_region (beg, end, coding, str);                    \
    else                                                                 \
      shrink_decoding_region (beg, end, coding, str);                    \
  } while (0)
4233
d46c5b12 4234/* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
fb88bf2d
KH
4235 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4236 coding system CODING, and return the status code of code conversion
4237 (currently, this value has no meaning).
4238
4239 How many characters (and bytes) are converted to how many
4240 characters (and bytes) are recorded in members of the structure
4241 CODING.
d46c5b12 4242
6e44253b 4243 If REPLACE is nonzero, we do various things as if the original text
d46c5b12 4244 is deleted and a new text is inserted. See the comments in
6e44253b 4245 replace_range (insdel.c) to know what we are doing. */
4ed46869
KH
4246
4247int
6e44253b
KH
4248code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
4249 int from, from_byte, to, to_byte, encodep, replace;
4ed46869 4250 struct coding_system *coding;
4ed46869 4251{
fb88bf2d
KH
4252 int len = to - from, len_byte = to_byte - from_byte;
4253 int require, inserted, inserted_byte;
12410ef1 4254 int head_skip, tail_skip, total_skip;
84d60297 4255 Lisp_Object saved_coding_symbol;
fb88bf2d
KH
4256 int multibyte = !NILP (current_buffer->enable_multibyte_characters);
4257 int first = 1;
4258 int fake_multibyte = 0;
4259 unsigned char *src, *dst;
84d60297 4260 Lisp_Object deletion;
e133c8fa 4261 int orig_point = PT, orig_len = len;
6abb9bd9 4262 int prev_Z;
84d60297
RS
4263
4264 deletion = Qnil;
4265 saved_coding_symbol = Qnil;
d46c5b12 4266
83fa074f 4267 if (from < PT && PT < to)
e133c8fa
KH
4268 {
4269 TEMP_SET_PT_BOTH (from, from_byte);
4270 orig_point = from;
4271 }
83fa074f 4272
6e44253b 4273 if (replace)
d46c5b12 4274 {
fb88bf2d
KH
4275 int saved_from = from;
4276
d46c5b12 4277 prepare_to_modify_buffer (from, to, &from);
fb88bf2d
KH
4278 if (saved_from != from)
4279 {
4280 to = from + len;
4281 if (multibyte)
4282 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
4283 else
4284 from_byte = from, to_byte = to;
4285 len_byte = to_byte - from_byte;
4286 }
d46c5b12 4287 }
d46c5b12
KH
4288
4289 if (! encodep && CODING_REQUIRE_DETECTION (coding))
4290 {
12410ef1 4291 /* We must detect encoding of text and eol format. */
d46c5b12
KH
4292
4293 if (from < GPT && to > GPT)
4294 move_gap_both (from, from_byte);
4295 if (coding->type == coding_type_undecided)
4296 {
fb88bf2d 4297 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
d46c5b12 4298 if (coding->type == coding_type_undecided)
12410ef1
KH
4299 /* It seems that the text contains only ASCII, but we
4300 should not left it undecided because the deeper
4301 decoding routine (decode_coding) tries to detect the
4302 encodings again in vain. */
d46c5b12
KH
4303 coding->type = coding_type_emacs_mule;
4304 }
4305 if (coding->eol_type == CODING_EOL_UNDECIDED)
4306 {
4307 saved_coding_symbol = coding->symbol;
4308 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
4309 if (coding->eol_type == CODING_EOL_UNDECIDED)
4310 coding->eol_type = CODING_EOL_LF;
4311 /* We had better recover the original eol format if we
4312 encounter an inconsitent eol format while decoding. */
4313 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4314 }
4315 }
4316
fb88bf2d
KH
4317 coding->consumed_char = len, coding->consumed = len_byte;
4318
d46c5b12
KH
4319 if (encodep
4320 ? ! CODING_REQUIRE_ENCODING (coding)
4321 : ! CODING_REQUIRE_DECODING (coding))
fb88bf2d
KH
4322 {
4323 coding->produced = len_byte;
12410ef1
KH
4324 if (multibyte
4325 && ! replace
4326 /* See the comment of the member heading_ascii in coding.h. */
4327 && coding->heading_ascii < len_byte)
fb88bf2d 4328 {
6e44253b
KH
4329 /* We still may have to combine byte at the head and the
4330 tail of the text in the region. */
12410ef1 4331 if (from < GPT && GPT < to)
6e44253b 4332 move_gap_both (to, to_byte);
12410ef1
KH
4333 len = multibyte_chars_in_text (BYTE_POS_ADDR (from_byte), len_byte);
4334 adjust_after_insert (from, from_byte, to, to_byte, len);
4335 coding->produced_char = len;
fb88bf2d
KH
4336 }
4337 else
68e3a8f1
AS
4338 {
4339 if (!replace)
4340 adjust_after_insert (from, from_byte, to, to_byte, len_byte);
4341 coding->produced_char = len_byte;
4342 }
fb88bf2d
KH
4343 return 0;
4344 }
d46c5b12
KH
4345
4346 /* Now we convert the text. */
4347
4348 /* For encoding, we must process pre-write-conversion in advance. */
4349 if (encodep
d46c5b12
KH
4350 && ! NILP (coding->pre_write_conversion)
4351 && SYMBOLP (coding->pre_write_conversion)
4352 && ! NILP (Ffboundp (coding->pre_write_conversion)))
4353 {
2b4f9037
KH
4354 /* The function in pre-write-conversion may put a new text in a
4355 new buffer. */
0007bdd0
KH
4356 struct buffer *prev = current_buffer;
4357 Lisp_Object new;
d46c5b12 4358
b39f748c
AS
4359 call2 (coding->pre_write_conversion,
4360 make_number (from), make_number (to));
d46c5b12
KH
4361 if (current_buffer != prev)
4362 {
4363 len = ZV - BEGV;
0007bdd0 4364 new = Fcurrent_buffer ();
d46c5b12 4365 set_buffer_internal_1 (prev);
ddbc19ff 4366 del_range_2 (from, from_byte, to, to_byte);
e133c8fa 4367 TEMP_SET_PT_BOTH (from, from_byte);
0007bdd0
KH
4368 insert_from_buffer (XBUFFER (new), 1, len, 0);
4369 Fkill_buffer (new);
e133c8fa
KH
4370 if (orig_point >= to)
4371 orig_point += len - orig_len;
4372 else if (orig_point > from)
4373 orig_point = from;
4374 orig_len = len;
d46c5b12 4375 to = from + len;
e133c8fa 4376 from_byte = multibyte ? CHAR_TO_BYTE (from) : from_byte;
fb88bf2d 4377 to_byte = multibyte ? CHAR_TO_BYTE (to) : to;
d46c5b12 4378 len_byte = to_byte - from_byte;
e133c8fa 4379 TEMP_SET_PT_BOTH (from, from_byte);
d46c5b12
KH
4380 }
4381 }
4382
12410ef1
KH
4383 if (replace)
4384 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
4385
d46c5b12 4386 /* Try to skip the heading and tailing ASCIIs. */
12410ef1
KH
4387 {
4388 int from_byte_orig = from_byte, to_byte_orig = to_byte;
4389
4390 if (from < GPT && GPT < to)
4391 move_gap_both (from, from_byte);
88993dfd 4392 SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
d4e57bcd 4393 if (from_byte == to_byte
944bd420 4394 && coding->type != coding_type_ccl
d4e57bcd
KH
4395 && ! (coding->mode & CODING_MODE_LAST_BLOCK
4396 && CODING_REQUIRE_FLUSHING (coding)))
12410ef1
KH
4397 {
4398 coding->produced = len_byte;
4399 coding->produced_char = multibyte ? len : len_byte;
4400 if (!replace)
4401 /* We must record and adjust for this new text now. */
4402 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
4403 return 0;
4404 }
fb88bf2d 4405
12410ef1
KH
4406 head_skip = from_byte - from_byte_orig;
4407 tail_skip = to_byte_orig - to_byte;
4408 total_skip = head_skip + tail_skip;
4409 from += head_skip;
4410 to -= tail_skip;
4411 len -= total_skip; len_byte -= total_skip;
4412 }
d46c5b12 4413
88993dfd 4414 /* The code conversion routine can not preserve text properties for
55d8d769
KH
4415 now. So, we must remove all text properties in the region.
4416 Here, we must suppress all modification hooks. */
88993dfd 4417 if (replace)
55d8d769
KH
4418 {
4419 int saved_inhibit_modification_hooks = inhibit_modification_hooks;
4420 inhibit_modification_hooks = 1;
4421 Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
4422 inhibit_modification_hooks = saved_inhibit_modification_hooks;
4423 }
88993dfd 4424
fb88bf2d
KH
4425 /* For converion, we must put the gap before the text in addition to
4426 making the gap larger for efficient decoding. The required gap
4427 size starts from 2000 which is the magic number used in make_gap.
4428 But, after one batch of conversion, it will be incremented if we
4429 find that it is not enough . */
d46c5b12
KH
4430 require = 2000;
4431
4432 if (GAP_SIZE < require)
4433 make_gap (require - GAP_SIZE);
4434 move_gap_both (from, from_byte);
4435
d46c5b12 4436 inserted = inserted_byte = 0;
fb88bf2d
KH
4437 src = GAP_END_ADDR, dst = GPT_ADDR;
4438
4439 GAP_SIZE += len_byte;
4440 ZV -= len;
4441 Z -= len;
4442 ZV_BYTE -= len_byte;
4443 Z_BYTE -= len_byte;
4444
f2558efd
KH
4445 if (GPT - BEG < beg_unchanged)
4446 beg_unchanged = GPT - BEG;
4447 if (Z - GPT < end_unchanged)
4448 end_unchanged = Z - GPT;
4449
d46c5b12
KH
4450 for (;;)
4451 {
fb88bf2d 4452 int result;
d46c5b12
KH
4453
4454 /* The buffer memory is changed from:
fb88bf2d
KH
4455 +--------+converted-text+---------+-------original-text------+---+
4456 |<-from->|<--inserted-->|---------|<-----------len---------->|---|
4457 |<------------------- GAP_SIZE -------------------->| */
d46c5b12 4458 if (encodep)
fb88bf2d 4459 result = encode_coding (coding, src, dst, len_byte, 0);
d46c5b12 4460 else
fb88bf2d 4461 result = decode_coding (coding, src, dst, len_byte, 0);
d46c5b12
KH
4462 /* to:
4463 +--------+-------converted-text--------+--+---original-text--+---+
fb88bf2d
KH
4464 |<-from->|<--inserted-->|<--produced-->|--|<-(len-consumed)->|---|
4465 |<------------------- GAP_SIZE -------------------->| */
4466 if (coding->fake_multibyte)
4467 fake_multibyte = 1;
d46c5b12 4468
fb88bf2d
KH
4469 if (!encodep && !multibyte)
4470 coding->produced_char = coding->produced;
d46c5b12
KH
4471 inserted += coding->produced_char;
4472 inserted_byte += coding->produced;
d46c5b12 4473 len_byte -= coding->consumed;
fb88bf2d
KH
4474 src += coding->consumed;
4475 dst += inserted_byte;
d46c5b12 4476
9864ebce
KH
4477 if (result == CODING_FINISH_NORMAL)
4478 {
4479 src += len_byte;
4480 break;
4481 }
d46c5b12
KH
4482 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4483 {
fb88bf2d 4484 unsigned char *pend = dst, *p = pend - inserted_byte;
38edf7d4 4485 Lisp_Object eol_type;
d46c5b12
KH
4486
4487 /* Encode LFs back to the original eol format (CR or CRLF). */
4488 if (coding->eol_type == CODING_EOL_CR)
4489 {
4490 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
4491 }
4492 else
4493 {
d46c5b12
KH
4494 int count = 0;
4495
fb88bf2d
KH
4496 while (p < pend) if (*p++ == '\n') count++;
4497 if (src - dst < count)
d46c5b12 4498 {
38edf7d4 4499 /* We don't have sufficient room for encoding LFs
fb88bf2d
KH
4500 back to CRLF. We must record converted and
4501 not-yet-converted text back to the buffer
4502 content, enlarge the gap, then record them out of
4503 the buffer contents again. */
4504 int add = len_byte + inserted_byte;
4505
4506 GAP_SIZE -= add;
4507 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4508 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4509 make_gap (count - GAP_SIZE);
4510 GAP_SIZE += add;
4511 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4512 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4513 /* Don't forget to update SRC, DST, and PEND. */
4514 src = GAP_END_ADDR - len_byte;
4515 dst = GPT_ADDR + inserted_byte;
4516 pend = dst;
d46c5b12 4517 }
d46c5b12
KH
4518 inserted += count;
4519 inserted_byte += count;
fb88bf2d
KH
4520 coding->produced += count;
4521 p = dst = pend + count;
4522 while (count)
4523 {
4524 *--p = *--pend;
4525 if (*p == '\n') count--, *--p = '\r';
4526 }
d46c5b12
KH
4527 }
4528
4529 /* Suppress eol-format conversion in the further conversion. */
4530 coding->eol_type = CODING_EOL_LF;
4531
38edf7d4
KH
4532 /* Set the coding system symbol to that for Unix-like EOL. */
4533 eol_type = Fget (saved_coding_symbol, Qeol_type);
4534 if (VECTORP (eol_type)
4535 && XVECTOR (eol_type)->size == 3
4536 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
4537 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
4538 else
4539 coding->symbol = saved_coding_symbol;
fb88bf2d
KH
4540
4541 continue;
d46c5b12
KH
4542 }
4543 if (len_byte <= 0)
944bd420
KH
4544 {
4545 if (coding->type != coding_type_ccl
4546 || coding->mode & CODING_MODE_LAST_BLOCK)
4547 break;
4548 coding->mode |= CODING_MODE_LAST_BLOCK;
4549 continue;
4550 }
d46c5b12
KH
4551 if (result == CODING_FINISH_INSUFFICIENT_SRC)
4552 {
4553 /* The source text ends in invalid codes. Let's just
4554 make them valid buffer contents, and finish conversion. */
fb88bf2d 4555 inserted += len_byte;
d46c5b12 4556 inserted_byte += len_byte;
fb88bf2d 4557 while (len_byte--)
ee59c65f 4558 *dst++ = *src++;
fb88bf2d 4559 fake_multibyte = 1;
d46c5b12
KH
4560 break;
4561 }
9864ebce
KH
4562 if (result == CODING_FINISH_INTERRUPT)
4563 {
4564 /* The conversion procedure was interrupted by a user. */
4565 fake_multibyte = 1;
4566 break;
4567 }
4568 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
4569 if (coding->consumed < 1)
4570 {
4571 /* It's quite strange to require more memory without
4572 consuming any bytes. Perhaps CCL program bug. */
4573 fake_multibyte = 1;
4574 break;
4575 }
fb88bf2d
KH
4576 if (first)
4577 {
4578 /* We have just done the first batch of conversion which was
4579 stoped because of insufficient gap. Let's reconsider the
4580 required gap size (i.e. SRT - DST) now.
4581
4582 We have converted ORIG bytes (== coding->consumed) into
4583 NEW bytes (coding->produced). To convert the remaining
4584 LEN bytes, we may need REQUIRE bytes of gap, where:
4585 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
4586 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
4587 Here, we are sure that NEW >= ORIG. */
6e44253b
KH
4588 float ratio = coding->produced - coding->consumed;
4589 ratio /= coding->consumed;
4590 require = len_byte * ratio;
fb88bf2d
KH
4591 first = 0;
4592 }
4593 if ((src - dst) < (require + 2000))
4594 {
4595 /* See the comment above the previous call of make_gap. */
4596 int add = len_byte + inserted_byte;
4597
4598 GAP_SIZE -= add;
4599 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4600 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4601 make_gap (require + 2000);
4602 GAP_SIZE += add;
4603 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4604 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4605 /* Don't forget to update SRC, DST. */
4606 src = GAP_END_ADDR - len_byte;
4607 dst = GPT_ADDR + inserted_byte;
4608 }
d46c5b12 4609 }
fb88bf2d
KH
4610 if (src - dst > 0) *dst = 0; /* Put an anchor. */
4611
2b4f9037 4612 if (multibyte
88993dfd
KH
4613 && (encodep
4614 || fake_multibyte
4615 || (to - from) != (to_byte - from_byte)))
2b4f9037 4616 inserted = multibyte_chars_in_text (GPT_ADDR, inserted_byte);
7553d0e1 4617
12410ef1
KH
4618 /* If we have shrinked the conversion area, adjust it now. */
4619 if (total_skip > 0)
4620 {
4621 if (tail_skip > 0)
4622 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
4623 inserted += total_skip; inserted_byte += total_skip;
4624 GAP_SIZE += total_skip;
4625 GPT -= head_skip; GPT_BYTE -= head_skip;
4626 ZV -= total_skip; ZV_BYTE -= total_skip;
4627 Z -= total_skip; Z_BYTE -= total_skip;
4628 from -= head_skip; from_byte -= head_skip;
4629 to += tail_skip; to_byte += tail_skip;
4630 }
4631
6abb9bd9 4632 prev_Z = Z;
12410ef1 4633 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
6abb9bd9 4634 inserted = Z - prev_Z;
4ed46869 4635
2b4f9037 4636 if (! encodep && ! NILP (coding->post_read_conversion))
d46c5b12 4637 {
2b4f9037 4638 Lisp_Object val;
4ed46869 4639
e133c8fa
KH
4640 if (from != PT)
4641 TEMP_SET_PT_BOTH (from, from_byte);
6abb9bd9 4642 prev_Z = Z;
2b4f9037 4643 val = call1 (coding->post_read_conversion, make_number (inserted));
6abb9bd9 4644 CHECK_NUMBER (val, 0);
944bd420 4645 inserted += Z - prev_Z;
e133c8fa
KH
4646 }
4647
4648 if (orig_point >= from)
4649 {
4650 if (orig_point >= from + orig_len)
4651 orig_point += inserted - orig_len;
4652 else
4653 orig_point = from;
4654 TEMP_SET_PT (orig_point);
d46c5b12 4655 }
4ed46869 4656
2b4f9037
KH
4657 signal_after_change (from, to - from, inserted);
4658
fb88bf2d 4659 {
12410ef1
KH
4660 coding->consumed = to_byte - from_byte;
4661 coding->consumed_char = to - from;
4662 coding->produced = inserted_byte;
4663 coding->produced_char = inserted;
fb88bf2d 4664 }
7553d0e1 4665
fb88bf2d 4666 return 0;
d46c5b12
KH
4667}
4668
4669Lisp_Object
4670code_convert_string (str, coding, encodep, nocopy)
4671 Lisp_Object str;
4ed46869 4672 struct coding_system *coding;
d46c5b12 4673 int encodep, nocopy;
4ed46869 4674{
d46c5b12
KH
4675 int len;
4676 char *buf;
fc932ac6
RS
4677 int from = 0, to = XSTRING (str)->size;
4678 int to_byte = STRING_BYTES (XSTRING (str));
d46c5b12 4679 struct gcpro gcpro1;
84d60297 4680 Lisp_Object saved_coding_symbol;
d46c5b12 4681 int result;
4ed46869 4682
84d60297 4683 saved_coding_symbol = Qnil;
d46c5b12
KH
4684 if (encodep && !NILP (coding->pre_write_conversion)
4685 || !encodep && !NILP (coding->post_read_conversion))
4686 {
4687 /* Since we have to call Lisp functions which assume target text
4688 is in a buffer, after setting a temporary buffer, call
4689 code_convert_region. */
4690 int count = specpdl_ptr - specpdl;
4691 struct buffer *prev = current_buffer;
e133c8fa 4692
d46c5b12
KH
4693 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
4694 temp_output_buffer_setup (" *code-converting-work*");
4695 set_buffer_internal (XBUFFER (Vstandard_output));
4696 if (encodep)
4697 insert_from_string (str, 0, 0, to, to_byte, 0);
4698 else
4699 {
4700 /* We must insert the contents of STR as is without
4701 unibyte<->multibyte conversion. */
4702 current_buffer->enable_multibyte_characters = Qnil;
4703 insert_from_string (str, 0, 0, to_byte, to_byte, 0);
4704 current_buffer->enable_multibyte_characters = Qt;
4705 }
fb88bf2d 4706 code_convert_region (BEGV, BEGV_BYTE, ZV, ZV_BYTE, coding, encodep, 1);
d46c5b12
KH
4707 if (encodep)
4708 /* We must return the buffer contents as unibyte string. */
4709 current_buffer->enable_multibyte_characters = Qnil;
4710 str = make_buffer_string (BEGV, ZV, 0);
4711 set_buffer_internal (prev);
4712 return unbind_to (count, str);
4713 }
4ed46869 4714
d46c5b12
KH
4715 if (! encodep && CODING_REQUIRE_DETECTION (coding))
4716 {
4717 /* See the comments in code_convert_region. */
4718 if (coding->type == coding_type_undecided)
4719 {
4720 detect_coding (coding, XSTRING (str)->data, to_byte);
4721 if (coding->type == coding_type_undecided)
4722 coding->type = coding_type_emacs_mule;
4723 }
4724 if (coding->eol_type == CODING_EOL_UNDECIDED)
4725 {
4726 saved_coding_symbol = coding->symbol;
4727 detect_eol (coding, XSTRING (str)->data, to_byte);
4728 if (coding->eol_type == CODING_EOL_UNDECIDED)
4729 coding->eol_type = CODING_EOL_LF;
4730 /* We had better recover the original eol format if we
4731 encounter an inconsitent eol format while decoding. */
4732 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4733 }
4734 }
4ed46869 4735
d46c5b12
KH
4736 if (encodep
4737 ? ! CODING_REQUIRE_ENCODING (coding)
4738 : ! CODING_REQUIRE_DECODING (coding))
4739 from = to_byte;
4740 else
4741 {
4742 /* Try to skip the heading and tailing ASCIIs. */
88993dfd
KH
4743 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
4744 encodep);
d46c5b12 4745 }
e133c8fa
KH
4746 if (from == to_byte
4747 && coding->type != coding_type_ccl)
d46c5b12 4748 return (nocopy ? str : Fcopy_sequence (str));
4ed46869 4749
d46c5b12
KH
4750 if (encodep)
4751 len = encoding_buffer_size (coding, to_byte - from);
4752 else
4753 len = decoding_buffer_size (coding, to_byte - from);
fc932ac6 4754 len += from + STRING_BYTES (XSTRING (str)) - to_byte;
d46c5b12
KH
4755 GCPRO1 (str);
4756 buf = get_conversion_buffer (len);
4757 UNGCPRO;
4ed46869 4758
d46c5b12
KH
4759 if (from > 0)
4760 bcopy (XSTRING (str)->data, buf, from);
4761 result = (encodep
4762 ? encode_coding (coding, XSTRING (str)->data + from,
4763 buf + from, to_byte - from, len)
4764 : decode_coding (coding, XSTRING (str)->data + from,
f30cc612 4765 buf + from, to_byte - from, len));
d46c5b12 4766 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4ed46869 4767 {
d46c5b12
KH
4768 /* We simple try to decode the whole string again but without
4769 eol-conversion this time. */
4770 coding->eol_type = CODING_EOL_LF;
4771 coding->symbol = saved_coding_symbol;
4772 return code_convert_string (str, coding, encodep, nocopy);
4ed46869 4773 }
d46c5b12
KH
4774
4775 bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
fc932ac6 4776 STRING_BYTES (XSTRING (str)) - to_byte);
d46c5b12 4777
fc932ac6 4778 len = from + STRING_BYTES (XSTRING (str)) - to_byte;
d46c5b12
KH
4779 if (encodep)
4780 str = make_unibyte_string (buf, len + coding->produced);
4781 else
826bfb8b
KH
4782 {
4783 int chars= (coding->fake_multibyte
4784 ? multibyte_chars_in_text (buf + from, coding->produced)
4785 : coding->produced_char);
4786 str = make_multibyte_string (buf, len + chars, len + coding->produced);
4787 }
4788
d46c5b12 4789 return str;
4ed46869
KH
4790}
4791
4792\f
4793#ifdef emacs
1397dc18 4794/*** 8. Emacs Lisp library functions ***/
4ed46869 4795
4ed46869
KH
4796DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
4797 "Return t if OBJECT is nil or a coding-system.\n\
3a73fa5d
RS
4798See the documentation of `make-coding-system' for information\n\
4799about coding-system objects.")
4ed46869
KH
4800 (obj)
4801 Lisp_Object obj;
4802{
4608c386
KH
4803 if (NILP (obj))
4804 return Qt;
4805 if (!SYMBOLP (obj))
4806 return Qnil;
4807 /* Get coding-spec vector for OBJ. */
4808 obj = Fget (obj, Qcoding_system);
4809 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
4810 ? Qt : Qnil);
4ed46869
KH
4811}
4812
9d991de8
RS
4813DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
4814 Sread_non_nil_coding_system, 1, 1, 0,
e0e989f6 4815 "Read a coding system from the minibuffer, prompting with string PROMPT.")
4ed46869
KH
4816 (prompt)
4817 Lisp_Object prompt;
4818{
e0e989f6 4819 Lisp_Object val;
9d991de8
RS
4820 do
4821 {
4608c386
KH
4822 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
4823 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
9d991de8
RS
4824 }
4825 while (XSTRING (val)->size == 0);
e0e989f6 4826 return (Fintern (val, Qnil));
4ed46869
KH
4827}
4828
9b787f3e
RS
4829DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
4830 "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
4831If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
4832 (prompt, default_coding_system)
4833 Lisp_Object prompt, default_coding_system;
4ed46869 4834{
f44d27ce 4835 Lisp_Object val;
9b787f3e
RS
4836 if (SYMBOLP (default_coding_system))
4837 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
4608c386 4838 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
9b787f3e
RS
4839 Qt, Qnil, Qcoding_system_history,
4840 default_coding_system, Qnil);
e0e989f6 4841 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
4842}
4843
4844DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
4845 1, 1, 0,
4846 "Check validity of CODING-SYSTEM.\n\
3a73fa5d
RS
4847If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
4848It is valid if it is a symbol with a non-nil `coding-system' property.\n\
4ed46869
KH
4849The value of property should be a vector of length 5.")
4850 (coding_system)
4851 Lisp_Object coding_system;
4852{
4853 CHECK_SYMBOL (coding_system, 0);
4854 if (!NILP (Fcoding_system_p (coding_system)))
4855 return coding_system;
4856 while (1)
02ba4723 4857 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4ed46869 4858}
3a73fa5d 4859\f
d46c5b12
KH
4860Lisp_Object
4861detect_coding_system (src, src_bytes, highest)
4862 unsigned char *src;
4863 int src_bytes, highest;
4ed46869
KH
4864{
4865 int coding_mask, eol_type;
d46c5b12
KH
4866 Lisp_Object val, tmp;
4867 int dummy;
4ed46869 4868
d46c5b12
KH
4869 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy);
4870 eol_type = detect_eol_type (src, src_bytes, &dummy);
4871 if (eol_type == CODING_EOL_INCONSISTENT)
25b02698 4872 eol_type = CODING_EOL_UNDECIDED;
4ed46869 4873
d46c5b12 4874 if (!coding_mask)
4ed46869 4875 {
27901516 4876 val = Qundecided;
d46c5b12 4877 if (eol_type != CODING_EOL_UNDECIDED)
4ed46869 4878 {
f44d27ce
RS
4879 Lisp_Object val2;
4880 val2 = Fget (Qundecided, Qeol_type);
4ed46869
KH
4881 if (VECTORP (val2))
4882 val = XVECTOR (val2)->contents[eol_type];
4883 }
80e803b4 4884 return (highest ? val : Fcons (val, Qnil));
4ed46869 4885 }
4ed46869 4886
d46c5b12
KH
4887 /* At first, gather possible coding systems in VAL. */
4888 val = Qnil;
4889 for (tmp = Vcoding_category_list; !NILP (tmp); tmp = XCONS (tmp)->cdr)
4ed46869 4890 {
d46c5b12
KH
4891 int idx
4892 = XFASTINT (Fget (XCONS (tmp)->car, Qcoding_category_index));
4893 if (coding_mask & (1 << idx))
4ed46869 4894 {
d46c5b12
KH
4895 val = Fcons (Fsymbol_value (XCONS (tmp)->car), val);
4896 if (highest)
4897 break;
4ed46869
KH
4898 }
4899 }
d46c5b12
KH
4900 if (!highest)
4901 val = Fnreverse (val);
4ed46869 4902
65059037 4903 /* Then, replace the elements with subsidiary coding systems. */
d46c5b12 4904 for (tmp = val; !NILP (tmp); tmp = XCONS (tmp)->cdr)
4ed46869 4905 {
65059037
RS
4906 if (eol_type != CODING_EOL_UNDECIDED
4907 && eol_type != CODING_EOL_INCONSISTENT)
4ed46869 4908 {
d46c5b12
KH
4909 Lisp_Object eol;
4910 eol = Fget (XCONS (tmp)->car, Qeol_type);
4911 if (VECTORP (eol))
4912 XCONS (tmp)->car = XVECTOR (eol)->contents[eol_type];
4ed46869
KH
4913 }
4914 }
d46c5b12
KH
4915 return (highest ? XCONS (val)->car : val);
4916}
4ed46869 4917
d46c5b12
KH
4918DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
4919 2, 3, 0,
4920 "Detect coding system of the text in the region between START and END.\n\
4921Return a list of possible coding systems ordered by priority.\n\
4922\n\
80e803b4
KH
4923If only ASCII characters are found, it returns a list of single element\n\
4924`undecided' or its subsidiary coding system according to a detected\n\
4925end-of-line format.\n\
d46c5b12
KH
4926\n\
4927If optional argument HIGHEST is non-nil, return the coding system of\n\
4928highest priority.")
4929 (start, end, highest)
4930 Lisp_Object start, end, highest;
4931{
4932 int from, to;
4933 int from_byte, to_byte;
6289dd10 4934
d46c5b12
KH
4935 CHECK_NUMBER_COERCE_MARKER (start, 0);
4936 CHECK_NUMBER_COERCE_MARKER (end, 1);
4ed46869 4937
d46c5b12
KH
4938 validate_region (&start, &end);
4939 from = XINT (start), to = XINT (end);
4940 from_byte = CHAR_TO_BYTE (from);
4941 to_byte = CHAR_TO_BYTE (to);
6289dd10 4942
d46c5b12
KH
4943 if (from < GPT && to >= GPT)
4944 move_gap_both (to, to_byte);
4ed46869 4945
d46c5b12
KH
4946 return detect_coding_system (BYTE_POS_ADDR (from_byte),
4947 to_byte - from_byte,
4948 !NILP (highest));
4949}
6289dd10 4950
d46c5b12
KH
4951DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
4952 1, 2, 0,
4953 "Detect coding system of the text in STRING.\n\
4954Return a list of possible coding systems ordered by priority.\n\
4955\n\
80e803b4
KH
4956If only ASCII characters are found, it returns a list of single element\n\
4957`undecided' or its subsidiary coding system according to a detected\n\
4958end-of-line format.\n\
d46c5b12
KH
4959\n\
4960If optional argument HIGHEST is non-nil, return the coding system of\n\
4961highest priority.")
4962 (string, highest)
4963 Lisp_Object string, highest;
4964{
4965 CHECK_STRING (string, 0);
4ed46869 4966
d46c5b12 4967 return detect_coding_system (XSTRING (string)->data,
fc932ac6 4968 STRING_BYTES (XSTRING (string)),
d46c5b12 4969 !NILP (highest));
4ed46869
KH
4970}
4971
4031e2bf
KH
4972Lisp_Object
4973code_convert_region1 (start, end, coding_system, encodep)
d46c5b12 4974 Lisp_Object start, end, coding_system;
4031e2bf 4975 int encodep;
3a73fa5d
RS
4976{
4977 struct coding_system coding;
4031e2bf 4978 int from, to, len;
3a73fa5d 4979
d46c5b12
KH
4980 CHECK_NUMBER_COERCE_MARKER (start, 0);
4981 CHECK_NUMBER_COERCE_MARKER (end, 1);
3a73fa5d
RS
4982 CHECK_SYMBOL (coding_system, 2);
4983
d46c5b12
KH
4984 validate_region (&start, &end);
4985 from = XFASTINT (start);
4986 to = XFASTINT (end);
4987
3a73fa5d 4988 if (NILP (coding_system))
d46c5b12
KH
4989 return make_number (to - from);
4990
3a73fa5d 4991 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
d46c5b12 4992 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
3a73fa5d 4993
d46c5b12 4994 coding.mode |= CODING_MODE_LAST_BLOCK;
fb88bf2d
KH
4995 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
4996 &coding, encodep, 1);
f072a3e8 4997 Vlast_coding_system_used = coding.symbol;
fb88bf2d 4998 return make_number (coding.produced_char);
4031e2bf
KH
4999}
5000
5001DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
5002 3, 3, "r\nzCoding system: ",
5003 "Decode the current region by specified coding system.\n\
5004When called from a program, takes three arguments:\n\
5005START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
f072a3e8
RS
5006This function sets `last-coding-system-used' to the precise coding system\n\
5007used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5008not fully specified.)\n\
5009It returns the length of the decoded text.")
4031e2bf
KH
5010 (start, end, coding_system)
5011 Lisp_Object start, end, coding_system;
5012{
5013 return code_convert_region1 (start, end, coding_system, 0);
3a73fa5d
RS
5014}
5015
5016DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
5017 3, 3, "r\nzCoding system: ",
d46c5b12 5018 "Encode the current region by specified coding system.\n\
3a73fa5d 5019When called from a program, takes three arguments:\n\
d46c5b12 5020START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
f072a3e8
RS
5021This function sets `last-coding-system-used' to the precise coding system\n\
5022used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5023not fully specified.)\n\
5024It returns the length of the encoded text.")
d46c5b12
KH
5025 (start, end, coding_system)
5026 Lisp_Object start, end, coding_system;
3a73fa5d 5027{
4031e2bf
KH
5028 return code_convert_region1 (start, end, coding_system, 1);
5029}
3a73fa5d 5030
4031e2bf
KH
5031Lisp_Object
5032code_convert_string1 (string, coding_system, nocopy, encodep)
5033 Lisp_Object string, coding_system, nocopy;
5034 int encodep;
5035{
5036 struct coding_system coding;
3a73fa5d 5037
4031e2bf
KH
5038 CHECK_STRING (string, 0);
5039 CHECK_SYMBOL (coding_system, 1);
4ed46869 5040
d46c5b12 5041 if (NILP (coding_system))
4031e2bf 5042 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869 5043
d46c5b12
KH
5044 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5045 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5f1cd180 5046
d46c5b12 5047 coding.mode |= CODING_MODE_LAST_BLOCK;
f072a3e8 5048 Vlast_coding_system_used = coding.symbol;
4031e2bf 5049 return code_convert_string (string, &coding, encodep, !NILP (nocopy));
4ed46869
KH
5050}
5051
4ed46869 5052DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
e0e989f6
KH
5053 2, 3, 0,
5054 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
fe487a71 5055Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
f072a3e8
RS
5056if the decoding operation is trivial.\n\
5057This function sets `last-coding-system-used' to the precise coding system\n\
5058used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5059not fully specified.)")
e0e989f6
KH
5060 (string, coding_system, nocopy)
5061 Lisp_Object string, coding_system, nocopy;
4ed46869 5062{
f072a3e8 5063 return code_convert_string1 (string, coding_system, nocopy, 0);
4ed46869
KH
5064}
5065
5066DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
e0e989f6
KH
5067 2, 3, 0,
5068 "Encode STRING to CODING-SYSTEM, and return the result.\n\
fe487a71 5069Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
f072a3e8
RS
5070if the encoding operation is trivial.\n\
5071This function sets `last-coding-system-used' to the precise coding system\n\
5072used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5073not fully specified.)")
e0e989f6
KH
5074 (string, coding_system, nocopy)
5075 Lisp_Object string, coding_system, nocopy;
4ed46869 5076{
f072a3e8 5077 return code_convert_string1 (string, coding_system, nocopy, 1);
4ed46869 5078}
4031e2bf 5079
ecec61c1
KH
5080/* Encode or decode STRING according to CODING_SYSTEM.
5081 Do not set Vlast_coding_system_used. */
5082
5083Lisp_Object
5084code_convert_string_norecord (string, coding_system, encodep)
5085 Lisp_Object string, coding_system;
5086 int encodep;
5087{
5088 struct coding_system coding;
5089
5090 CHECK_STRING (string, 0);
5091 CHECK_SYMBOL (coding_system, 1);
5092
5093 if (NILP (coding_system))
5094 return string;
5095
5096 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5097 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5098
5099 coding.mode |= CODING_MODE_LAST_BLOCK;
5100 return code_convert_string (string, &coding, encodep, Qt);
5101}
3a73fa5d 5102\f
4ed46869 5103DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
55ab7be3 5104 "Decode a Japanese character which has CODE in shift_jis encoding.\n\
4ed46869
KH
5105Return the corresponding character.")
5106 (code)
5107 Lisp_Object code;
5108{
5109 unsigned char c1, c2, s1, s2;
5110 Lisp_Object val;
5111
5112 CHECK_NUMBER (code, 0);
5113 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
55ab7be3
KH
5114 if (s1 == 0)
5115 {
c28a9453
KH
5116 if (s2 < 0x80)
5117 XSETFASTINT (val, s2);
5118 else if (s2 >= 0xA0 || s2 <= 0xDF)
5119 XSETFASTINT (val,
5120 MAKE_NON_ASCII_CHAR (charset_katakana_jisx0201, s2, 0));
5121 else
9da8350f 5122 error ("Invalid Shift JIS code: %x", XFASTINT (code));
55ab7be3
KH
5123 }
5124 else
5125 {
5126 if ((s1 < 0x80 || s1 > 0x9F && s1 < 0xE0 || s1 > 0xEF)
5127 || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
9da8350f 5128 error ("Invalid Shift JIS code: %x", XFASTINT (code));
55ab7be3
KH
5129 DECODE_SJIS (s1, s2, c1, c2);
5130 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
5131 }
4ed46869
KH
5132 return val;
5133}
5134
5135DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
55ab7be3
KH
5136 "Encode a Japanese character CHAR to shift_jis encoding.\n\
5137Return the corresponding code in SJIS.")
4ed46869
KH
5138 (ch)
5139 Lisp_Object ch;
5140{
bcf26d6a 5141 int charset, c1, c2, s1, s2;
4ed46869
KH
5142 Lisp_Object val;
5143
5144 CHECK_NUMBER (ch, 0);
5145 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
c28a9453
KH
5146 if (charset == CHARSET_ASCII)
5147 {
5148 val = ch;
5149 }
5150 else if (charset == charset_jisx0208
5151 && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
4ed46869
KH
5152 {
5153 ENCODE_SJIS (c1, c2, s1, s2);
bcf26d6a 5154 XSETFASTINT (val, (s1 << 8) | s2);
4ed46869 5155 }
55ab7be3
KH
5156 else if (charset == charset_katakana_jisx0201
5157 && c1 > 0x20 && c2 < 0xE0)
5158 {
5159 XSETFASTINT (val, c1 | 0x80);
5160 }
4ed46869 5161 else
55ab7be3 5162 error ("Can't encode to shift_jis: %d", XFASTINT (ch));
4ed46869
KH
5163 return val;
5164}
5165
5166DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
c28a9453 5167 "Decode a Big5 character which has CODE in BIG5 coding system.\n\
4ed46869
KH
5168Return the corresponding character.")
5169 (code)
5170 Lisp_Object code;
5171{
5172 int charset;
5173 unsigned char b1, b2, c1, c2;
5174 Lisp_Object val;
5175
5176 CHECK_NUMBER (code, 0);
5177 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
c28a9453
KH
5178 if (b1 == 0)
5179 {
5180 if (b2 >= 0x80)
9da8350f 5181 error ("Invalid BIG5 code: %x", XFASTINT (code));
c28a9453
KH
5182 val = code;
5183 }
5184 else
5185 {
5186 if ((b1 < 0xA1 || b1 > 0xFE)
5187 || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
9da8350f 5188 error ("Invalid BIG5 code: %x", XFASTINT (code));
c28a9453
KH
5189 DECODE_BIG5 (b1, b2, charset, c1, c2);
5190 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
5191 }
4ed46869
KH
5192 return val;
5193}
5194
5195DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
d46c5b12 5196 "Encode the Big5 character CHAR to BIG5 coding system.\n\
4ed46869
KH
5197Return the corresponding character code in Big5.")
5198 (ch)
5199 Lisp_Object ch;
5200{
bcf26d6a 5201 int charset, c1, c2, b1, b2;
4ed46869
KH
5202 Lisp_Object val;
5203
5204 CHECK_NUMBER (ch, 0);
5205 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
c28a9453
KH
5206 if (charset == CHARSET_ASCII)
5207 {
5208 val = ch;
5209 }
5210 else if ((charset == charset_big5_1
5211 && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
5212 || (charset == charset_big5_2
5213 && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
4ed46869
KH
5214 {
5215 ENCODE_BIG5 (charset, c1, c2, b1, b2);
bcf26d6a 5216 XSETFASTINT (val, (b1 << 8) | b2);
4ed46869
KH
5217 }
5218 else
c28a9453 5219 error ("Can't encode to Big5: %d", XFASTINT (ch));
4ed46869
KH
5220 return val;
5221}
3a73fa5d 5222\f
1ba9e4ab
KH
5223DEFUN ("set-terminal-coding-system-internal",
5224 Fset_terminal_coding_system_internal,
5225 Sset_terminal_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
5226 (coding_system)
5227 Lisp_Object coding_system;
5228{
5229 CHECK_SYMBOL (coding_system, 0);
5230 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
70c22245 5231 /* We had better not send unsafe characters to terminal. */
6e85d753
KH
5232 terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
5233
4ed46869
KH
5234 return Qnil;
5235}
5236
c4825358
KH
5237DEFUN ("set-safe-terminal-coding-system-internal",
5238 Fset_safe_terminal_coding_system_internal,
5239 Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
5240 (coding_system)
5241 Lisp_Object coding_system;
5242{
5243 CHECK_SYMBOL (coding_system, 0);
5244 setup_coding_system (Fcheck_coding_system (coding_system),
5245 &safe_terminal_coding);
5246 return Qnil;
5247}
5248
4ed46869
KH
5249DEFUN ("terminal-coding-system",
5250 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3a73fa5d 5251 "Return coding system specified for terminal output.")
4ed46869
KH
5252 ()
5253{
5254 return terminal_coding.symbol;
5255}
5256
1ba9e4ab
KH
5257DEFUN ("set-keyboard-coding-system-internal",
5258 Fset_keyboard_coding_system_internal,
5259 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
5260 (coding_system)
5261 Lisp_Object coding_system;
5262{
5263 CHECK_SYMBOL (coding_system, 0);
5264 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
5265 return Qnil;
5266}
5267
5268DEFUN ("keyboard-coding-system",
5269 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3a73fa5d 5270 "Return coding system specified for decoding keyboard input.")
4ed46869
KH
5271 ()
5272{
5273 return keyboard_coding.symbol;
5274}
5275
5276\f
a5d301df
KH
5277DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
5278 Sfind_operation_coding_system, 1, MANY, 0,
5279 "Choose a coding system for an operation based on the target name.\n\
69f76525 5280The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
9ce27fde
KH
5281DECODING-SYSTEM is the coding system to use for decoding\n\
5282\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
5283for encoding (in case OPERATION does encoding).\n\
ccdb79f5
RS
5284\n\
5285The first argument OPERATION specifies an I/O primitive:\n\
5286 For file I/O, `insert-file-contents' or `write-region'.\n\
5287 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
5288 For network I/O, `open-network-stream'.\n\
5289\n\
5290The remaining arguments should be the same arguments that were passed\n\
5291to the primitive. Depending on which primitive, one of those arguments\n\
5292is selected as the TARGET. For example, if OPERATION does file I/O,\n\
5293whichever argument specifies the file name is TARGET.\n\
5294\n\
5295TARGET has a meaning which depends on OPERATION:\n\
4ed46869
KH
5296 For file I/O, TARGET is a file name.\n\
5297 For process I/O, TARGET is a process name.\n\
5298 For network I/O, TARGET is a service name or a port number\n\
5299\n\
02ba4723
KH
5300This function looks up what specified for TARGET in,\n\
5301`file-coding-system-alist', `process-coding-system-alist',\n\
5302or `network-coding-system-alist' depending on OPERATION.\n\
5303They may specify a coding system, a cons of coding systems,\n\
5304or a function symbol to call.\n\
5305In the last case, we call the function with one argument,\n\
9ce27fde 5306which is a list of all the arguments given to this function.")
4ed46869
KH
5307 (nargs, args)
5308 int nargs;
5309 Lisp_Object *args;
5310{
5311 Lisp_Object operation, target_idx, target, val;
5312 register Lisp_Object chain;
5313
5314 if (nargs < 2)
5315 error ("Too few arguments");
5316 operation = args[0];
5317 if (!SYMBOLP (operation)
5318 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
5319 error ("Invalid first arguement");
5320 if (nargs < 1 + XINT (target_idx))
5321 error ("Too few arguments for operation: %s",
5322 XSYMBOL (operation)->name->data);
5323 target = args[XINT (target_idx) + 1];
5324 if (!(STRINGP (target)
5325 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
5326 error ("Invalid %dth argument", XINT (target_idx) + 1);
5327
2e34157c
RS
5328 chain = ((EQ (operation, Qinsert_file_contents)
5329 || EQ (operation, Qwrite_region))
02ba4723 5330 ? Vfile_coding_system_alist
2e34157c 5331 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
5332 ? Vnetwork_coding_system_alist
5333 : Vprocess_coding_system_alist));
4ed46869
KH
5334 if (NILP (chain))
5335 return Qnil;
5336
02ba4723 5337 for (; CONSP (chain); chain = XCONS (chain)->cdr)
4ed46869 5338 {
f44d27ce
RS
5339 Lisp_Object elt;
5340 elt = XCONS (chain)->car;
4ed46869
KH
5341
5342 if (CONSP (elt)
5343 && ((STRINGP (target)
5344 && STRINGP (XCONS (elt)->car)
5345 && fast_string_match (XCONS (elt)->car, target) >= 0)
5346 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
02ba4723
KH
5347 {
5348 val = XCONS (elt)->cdr;
b19fd4c5
KH
5349 /* Here, if VAL is both a valid coding system and a valid
5350 function symbol, we return VAL as a coding system. */
02ba4723
KH
5351 if (CONSP (val))
5352 return val;
5353 if (! SYMBOLP (val))
5354 return Qnil;
5355 if (! NILP (Fcoding_system_p (val)))
5356 return Fcons (val, val);
b19fd4c5
KH
5357 if (! NILP (Ffboundp (val)))
5358 {
5359 val = call1 (val, Flist (nargs, args));
5360 if (CONSP (val))
5361 return val;
5362 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
5363 return Fcons (val, val);
5364 }
02ba4723
KH
5365 return Qnil;
5366 }
4ed46869
KH
5367 }
5368 return Qnil;
5369}
5370
1397dc18
KH
5371DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal,
5372 Supdate_coding_systems_internal, 0, 0, 0,
5373 "Update internal database for ISO2022 and CCL based coding systems.\n\
d46c5b12
KH
5374When values of the following coding categories are changed, you must\n\
5375call this function:\n\
5376 coding-category-iso-7, coding-category-iso-7-tight,\n\
5377 coding-category-iso-8-1, coding-category-iso-8-2,\n\
1397dc18
KH
5378 coding-category-iso-7-else, coding-category-iso-8-else,\n\
5379 coding-category-ccl")
d46c5b12
KH
5380 ()
5381{
5382 int i;
5383
1397dc18 5384 for (i = CODING_CATEGORY_IDX_ISO_7; i <= CODING_CATEGORY_IDX_CCL; i++)
d46c5b12 5385 {
1397dc18
KH
5386 Lisp_Object val;
5387
5388 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value;
5389 if (!NILP (val))
5390 {
5391 if (! coding_system_table[i])
5392 coding_system_table[i] = ((struct coding_system *)
5393 xmalloc (sizeof (struct coding_system)));
5394 setup_coding_system (val, coding_system_table[i]);
5395 }
5396 else if (coding_system_table[i])
5397 {
5398 xfree (coding_system_table[i]);
5399 coding_system_table[i] = NULL;
5400 }
d46c5b12 5401 }
1397dc18 5402
d46c5b12
KH
5403 return Qnil;
5404}
5405
66cfb530
KH
5406DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
5407 Sset_coding_priority_internal, 0, 0, 0,
5408 "Update internal database for the current value of `coding-category-list'.\n\
5409This function is internal use only.")
5410 ()
5411{
5412 int i = 0, idx;
84d60297
RS
5413 Lisp_Object val;
5414
5415 val = Vcoding_category_list;
66cfb530
KH
5416
5417 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
5418 {
5419 if (! SYMBOLP (XCONS (val)->car))
5420 break;
5421 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
5422 if (idx >= CODING_CATEGORY_IDX_MAX)
5423 break;
5424 coding_priorities[i++] = (1 << idx);
5425 val = XCONS (val)->cdr;
5426 }
5427 /* If coding-category-list is valid and contains all coding
5428 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
5429 the following code saves Emacs from craching. */
5430 while (i < CODING_CATEGORY_IDX_MAX)
5431 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
5432
5433 return Qnil;
5434}
5435
4ed46869
KH
5436#endif /* emacs */
5437
5438\f
1397dc18 5439/*** 9. Post-amble ***/
4ed46869 5440
6d74c3aa
KH
5441void
5442init_coding ()
5443{
5444 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
5445}
5446
dfcf069d 5447void
4ed46869
KH
5448init_coding_once ()
5449{
5450 int i;
5451
0ef69138 5452 /* Emacs' internal format specific initialize routine. */
4ed46869
KH
5453 for (i = 0; i <= 0x20; i++)
5454 emacs_code_class[i] = EMACS_control_code;
5455 emacs_code_class[0x0A] = EMACS_linefeed_code;
5456 emacs_code_class[0x0D] = EMACS_carriage_return_code;
5457 for (i = 0x21 ; i < 0x7F; i++)
5458 emacs_code_class[i] = EMACS_ascii_code;
5459 emacs_code_class[0x7F] = EMACS_control_code;
5460 emacs_code_class[0x80] = EMACS_leading_code_composition;
5461 for (i = 0x81; i < 0xFF; i++)
5462 emacs_code_class[i] = EMACS_invalid_code;
5463 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
5464 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
5465 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
5466 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
5467
5468 /* ISO2022 specific initialize routine. */
5469 for (i = 0; i < 0x20; i++)
5470 iso_code_class[i] = ISO_control_code;
5471 for (i = 0x21; i < 0x7F; i++)
5472 iso_code_class[i] = ISO_graphic_plane_0;
5473 for (i = 0x80; i < 0xA0; i++)
5474 iso_code_class[i] = ISO_control_code;
5475 for (i = 0xA1; i < 0xFF; i++)
5476 iso_code_class[i] = ISO_graphic_plane_1;
5477 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
5478 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
5479 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
5480 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
5481 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
5482 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
5483 iso_code_class[ISO_CODE_ESC] = ISO_escape;
5484 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
5485 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
5486 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
5487
e0e989f6 5488 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
e0e989f6
KH
5489
5490 setup_coding_system (Qnil, &keyboard_coding);
5491 setup_coding_system (Qnil, &terminal_coding);
c4825358 5492 setup_coding_system (Qnil, &safe_terminal_coding);
6bc51348 5493 setup_coding_system (Qnil, &default_buffer_file_coding);
9ce27fde 5494
d46c5b12
KH
5495 bzero (coding_system_table, sizeof coding_system_table);
5496
66cfb530
KH
5497 bzero (ascii_skip_code, sizeof ascii_skip_code);
5498 for (i = 0; i < 128; i++)
5499 ascii_skip_code[i] = 1;
5500
9ce27fde
KH
5501#if defined (MSDOS) || defined (WINDOWSNT)
5502 system_eol_type = CODING_EOL_CRLF;
5503#else
5504 system_eol_type = CODING_EOL_LF;
5505#endif
e0e989f6
KH
5506}
5507
5508#ifdef emacs
5509
dfcf069d 5510void
e0e989f6
KH
5511syms_of_coding ()
5512{
5513 Qtarget_idx = intern ("target-idx");
5514 staticpro (&Qtarget_idx);
5515
bb0115a2
RS
5516 Qcoding_system_history = intern ("coding-system-history");
5517 staticpro (&Qcoding_system_history);
5518 Fset (Qcoding_system_history, Qnil);
5519
9ce27fde 5520 /* Target FILENAME is the first argument. */
e0e989f6 5521 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 5522 /* Target FILENAME is the third argument. */
e0e989f6
KH
5523 Fput (Qwrite_region, Qtarget_idx, make_number (2));
5524
5525 Qcall_process = intern ("call-process");
5526 staticpro (&Qcall_process);
9ce27fde 5527 /* Target PROGRAM is the first argument. */
e0e989f6
KH
5528 Fput (Qcall_process, Qtarget_idx, make_number (0));
5529
5530 Qcall_process_region = intern ("call-process-region");
5531 staticpro (&Qcall_process_region);
9ce27fde 5532 /* Target PROGRAM is the third argument. */
e0e989f6
KH
5533 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
5534
5535 Qstart_process = intern ("start-process");
5536 staticpro (&Qstart_process);
9ce27fde 5537 /* Target PROGRAM is the third argument. */
e0e989f6
KH
5538 Fput (Qstart_process, Qtarget_idx, make_number (2));
5539
5540 Qopen_network_stream = intern ("open-network-stream");
5541 staticpro (&Qopen_network_stream);
9ce27fde 5542 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
5543 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
5544
4ed46869
KH
5545 Qcoding_system = intern ("coding-system");
5546 staticpro (&Qcoding_system);
5547
5548 Qeol_type = intern ("eol-type");
5549 staticpro (&Qeol_type);
5550
5551 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
5552 staticpro (&Qbuffer_file_coding_system);
5553
5554 Qpost_read_conversion = intern ("post-read-conversion");
5555 staticpro (&Qpost_read_conversion);
5556
5557 Qpre_write_conversion = intern ("pre-write-conversion");
5558 staticpro (&Qpre_write_conversion);
5559
27901516
KH
5560 Qno_conversion = intern ("no-conversion");
5561 staticpro (&Qno_conversion);
5562
5563 Qundecided = intern ("undecided");
5564 staticpro (&Qundecided);
5565
4ed46869
KH
5566 Qcoding_system_p = intern ("coding-system-p");
5567 staticpro (&Qcoding_system_p);
5568
5569 Qcoding_system_error = intern ("coding-system-error");
5570 staticpro (&Qcoding_system_error);
5571
5572 Fput (Qcoding_system_error, Qerror_conditions,
5573 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
5574 Fput (Qcoding_system_error, Qerror_message,
9ce27fde 5575 build_string ("Invalid coding system"));
4ed46869 5576
d46c5b12
KH
5577 Qcoding_category = intern ("coding-category");
5578 staticpro (&Qcoding_category);
4ed46869
KH
5579 Qcoding_category_index = intern ("coding-category-index");
5580 staticpro (&Qcoding_category_index);
5581
d46c5b12
KH
5582 Vcoding_category_table
5583 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
5584 staticpro (&Vcoding_category_table);
4ed46869
KH
5585 {
5586 int i;
5587 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
5588 {
d46c5b12
KH
5589 XVECTOR (Vcoding_category_table)->contents[i]
5590 = intern (coding_category_name[i]);
5591 Fput (XVECTOR (Vcoding_category_table)->contents[i],
5592 Qcoding_category_index, make_number (i));
4ed46869
KH
5593 }
5594 }
5595
f967223b
KH
5596 Qtranslation_table = intern ("translation-table");
5597 staticpro (&Qtranslation_table);
1397dc18 5598 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
bdd9fb48 5599
f967223b
KH
5600 Qtranslation_table_id = intern ("translation-table-id");
5601 staticpro (&Qtranslation_table_id);
84fbb8a0 5602
f967223b
KH
5603 Qtranslation_table_for_decode = intern ("translation-table-for-decode");
5604 staticpro (&Qtranslation_table_for_decode);
a5d301df 5605
f967223b
KH
5606 Qtranslation_table_for_encode = intern ("translation-table-for-encode");
5607 staticpro (&Qtranslation_table_for_encode);
a5d301df 5608
70c22245
KH
5609 Qsafe_charsets = intern ("safe-charsets");
5610 staticpro (&Qsafe_charsets);
5611
1397dc18
KH
5612 Qvalid_codes = intern ("valid-codes");
5613 staticpro (&Qvalid_codes);
5614
9ce27fde
KH
5615 Qemacs_mule = intern ("emacs-mule");
5616 staticpro (&Qemacs_mule);
5617
d46c5b12
KH
5618 Qraw_text = intern ("raw-text");
5619 staticpro (&Qraw_text);
5620
4ed46869
KH
5621 defsubr (&Scoding_system_p);
5622 defsubr (&Sread_coding_system);
5623 defsubr (&Sread_non_nil_coding_system);
5624 defsubr (&Scheck_coding_system);
5625 defsubr (&Sdetect_coding_region);
d46c5b12 5626 defsubr (&Sdetect_coding_string);
4ed46869
KH
5627 defsubr (&Sdecode_coding_region);
5628 defsubr (&Sencode_coding_region);
5629 defsubr (&Sdecode_coding_string);
5630 defsubr (&Sencode_coding_string);
5631 defsubr (&Sdecode_sjis_char);
5632 defsubr (&Sencode_sjis_char);
5633 defsubr (&Sdecode_big5_char);
5634 defsubr (&Sencode_big5_char);
1ba9e4ab 5635 defsubr (&Sset_terminal_coding_system_internal);
c4825358 5636 defsubr (&Sset_safe_terminal_coding_system_internal);
4ed46869 5637 defsubr (&Sterminal_coding_system);
1ba9e4ab 5638 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 5639 defsubr (&Skeyboard_coding_system);
a5d301df 5640 defsubr (&Sfind_operation_coding_system);
1397dc18 5641 defsubr (&Supdate_coding_systems_internal);
66cfb530 5642 defsubr (&Sset_coding_priority_internal);
4ed46869 5643
4608c386
KH
5644 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
5645 "List of coding systems.\n\
5646\n\
5647Do not alter the value of this variable manually. This variable should be\n\
5648updated by the functions `make-coding-system' and\n\
5649`define-coding-system-alias'.");
5650 Vcoding_system_list = Qnil;
5651
5652 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
5653 "Alist of coding system names.\n\
5654Each element is one element list of coding system name.\n\
5655This variable is given to `completing-read' as TABLE argument.\n\
5656\n\
5657Do not alter the value of this variable manually. This variable should be\n\
5658updated by the functions `make-coding-system' and\n\
5659`define-coding-system-alias'.");
5660 Vcoding_system_alist = Qnil;
5661
4ed46869
KH
5662 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
5663 "List of coding-categories (symbols) ordered by priority.");
5664 {
5665 int i;
5666
5667 Vcoding_category_list = Qnil;
5668 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
5669 Vcoding_category_list
d46c5b12
KH
5670 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
5671 Vcoding_category_list);
4ed46869
KH
5672 }
5673
5674 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
10bff6f1 5675 "Specify the coding system for read operations.\n\
2ebb362d 5676It is useful to bind this variable with `let', but do not set it globally.\n\
4ed46869 5677If the value is a coding system, it is used for decoding on read operation.\n\
a67a9c66 5678If not, an appropriate element is used from one of the coding system alists:\n\
10bff6f1 5679There are three such tables, `file-coding-system-alist',\n\
a67a9c66 5680`process-coding-system-alist', and `network-coding-system-alist'.");
4ed46869
KH
5681 Vcoding_system_for_read = Qnil;
5682
5683 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
10bff6f1 5684 "Specify the coding system for write operations.\n\
928aedd8
RS
5685Programs bind this variable with `let', but you should not set it globally.\n\
5686If the value is a coding system, it is used for encoding of output,\n\
5687when writing it to a file and when sending it to a file or subprocess.\n\
5688\n\
5689If this does not specify a coding system, an appropriate element\n\
5690is used from one of the coding system alists:\n\
10bff6f1 5691There are three such tables, `file-coding-system-alist',\n\
928aedd8
RS
5692`process-coding-system-alist', and `network-coding-system-alist'.\n\
5693For output to files, if the above procedure does not specify a coding system,\n\
5694the value of `buffer-file-coding-system' is used.");
4ed46869
KH
5695 Vcoding_system_for_write = Qnil;
5696
5697 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
a67a9c66 5698 "Coding system used in the latest file or process I/O.");
4ed46869
KH
5699 Vlast_coding_system_used = Qnil;
5700
9ce27fde 5701 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
f07f4a24 5702 "*Non-nil means always inhibit code conversion of end-of-line format.\n\
94c7a214
DL
5703See info node `Coding Systems' and info node `Text and Binary' concerning\n\
5704such conversion.");
9ce27fde
KH
5705 inhibit_eol_conversion = 0;
5706
ed29121d
EZ
5707 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
5708 "Non-nil means process buffer inherits coding system of process output.\n\
5709Bind it to t if the process output is to be treated as if it were a file\n\
5710read from some filesystem.");
5711 inherit_process_coding_system = 0;
5712
02ba4723
KH
5713 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
5714 "Alist to decide a coding system to use for a file I/O operation.\n\
5715The format is ((PATTERN . VAL) ...),\n\
5716where PATTERN is a regular expression matching a file name,\n\
5717VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5718If VAL is a coding system, it is used for both decoding and encoding\n\
5719the file contents.\n\
5720If VAL is a cons of coding systems, the car part is used for decoding,\n\
5721and the cdr part is used for encoding.\n\
5722If VAL is a function symbol, the function must return a coding system\n\
5723or a cons of coding systems which are used as above.\n\
e0e989f6 5724\n\
a85a871a 5725See also the function `find-operation-coding-system'\n\
eda284ac 5726and the variable `auto-coding-alist'.");
02ba4723
KH
5727 Vfile_coding_system_alist = Qnil;
5728
5729 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
5730 "Alist to decide a coding system to use for a process I/O operation.\n\
5731The format is ((PATTERN . VAL) ...),\n\
5732where PATTERN is a regular expression matching a program name,\n\
5733VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5734If VAL is a coding system, it is used for both decoding what received\n\
5735from the program and encoding what sent to the program.\n\
5736If VAL is a cons of coding systems, the car part is used for decoding,\n\
5737and the cdr part is used for encoding.\n\
5738If VAL is a function symbol, the function must return a coding system\n\
5739or a cons of coding systems which are used as above.\n\
4ed46869 5740\n\
9ce27fde 5741See also the function `find-operation-coding-system'.");
02ba4723
KH
5742 Vprocess_coding_system_alist = Qnil;
5743
5744 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
5745 "Alist to decide a coding system to use for a network I/O operation.\n\
5746The format is ((PATTERN . VAL) ...),\n\
5747where PATTERN is a regular expression matching a network service name\n\
5748or is a port number to connect to,\n\
5749VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5750If VAL is a coding system, it is used for both decoding what received\n\
5751from the network stream and encoding what sent to the network stream.\n\
5752If VAL is a cons of coding systems, the car part is used for decoding,\n\
5753and the cdr part is used for encoding.\n\
5754If VAL is a function symbol, the function must return a coding system\n\
5755or a cons of coding systems which are used as above.\n\
4ed46869 5756\n\
9ce27fde 5757See also the function `find-operation-coding-system'.");
02ba4723 5758 Vnetwork_coding_system_alist = Qnil;
4ed46869 5759
7722baf9
EZ
5760 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
5761 "*String displayed in mode line for UNIX-like (LF) end-of-line format.");
5762 eol_mnemonic_unix = build_string (":");
4ed46869 5763
7722baf9
EZ
5764 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
5765 "*String displayed in mode line for DOS-like (CRLF) end-of-line format.");
5766 eol_mnemonic_dos = build_string ("\\");
4ed46869 5767
7722baf9
EZ
5768 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
5769 "*String displayed in mode line for MAC-like (CR) end-of-line format.");
5770 eol_mnemonic_mac = build_string ("/");
4ed46869 5771
7722baf9
EZ
5772 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
5773 "*String displayed in mode line when end-of-line format is not yet determined.");
5774 eol_mnemonic_undecided = build_string (":");
4ed46869 5775
84fbb8a0 5776 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
f967223b 5777 "*Non-nil enables character translation while encoding and decoding.");
84fbb8a0 5778 Venable_character_translation = Qt;
bdd9fb48 5779
f967223b
KH
5780 DEFVAR_LISP ("standard-translation-table-for-decode",
5781 &Vstandard_translation_table_for_decode,
84fbb8a0 5782 "Table for translating characters while decoding.");
f967223b 5783 Vstandard_translation_table_for_decode = Qnil;
bdd9fb48 5784
f967223b
KH
5785 DEFVAR_LISP ("standard-translation-table-for-encode",
5786 &Vstandard_translation_table_for_encode,
84fbb8a0 5787 "Table for translationg characters while encoding.");
f967223b 5788 Vstandard_translation_table_for_encode = Qnil;
4ed46869
KH
5789
5790 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
5791 "Alist of charsets vs revision numbers.\n\
5792While encoding, if a charset (car part of an element) is found,\n\
5793designate it with the escape sequence identifing revision (cdr part of the element).");
5794 Vcharset_revision_alist = Qnil;
02ba4723
KH
5795
5796 DEFVAR_LISP ("default-process-coding-system",
5797 &Vdefault_process_coding_system,
5798 "Cons of coding systems used for process I/O by default.\n\
5799The car part is used for decoding a process output,\n\
5800the cdr part is used for encoding a text to be sent to a process.");
5801 Vdefault_process_coding_system = Qnil;
c4825358 5802
3f003981
KH
5803 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
5804 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
c4825358
KH
5805This is a vector of length 256.\n\
5806If Nth element is non-nil, the existence of code N in a file\n\
bb0115a2 5807\(or output of subprocess) doesn't prevent it to be detected as\n\
3f003981
KH
5808a coding system of ISO 2022 variant which has a flag\n\
5809`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
c4825358
KH
5810or reading output of a subprocess.\n\
5811Only 128th through 159th elements has a meaning.");
3f003981 5812 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
d46c5b12
KH
5813
5814 DEFVAR_LISP ("select-safe-coding-system-function",
5815 &Vselect_safe_coding_system_function,
5816 "Function to call to select safe coding system for encoding a text.\n\
5817\n\
5818If set, this function is called to force a user to select a proper\n\
5819coding system which can encode the text in the case that a default\n\
5820coding system used in each operation can't encode the text.\n\
5821\n\
a85a871a 5822The default value is `select-safe-coding-system' (which see).");
d46c5b12
KH
5823 Vselect_safe_coding_system_function = Qnil;
5824
4ed46869
KH
5825}
5826
5827#endif /* emacs */