(copy_executable_and_move_sections): Ifdef out a
[bpt/emacs.git] / src / coding.c
CommitLineData
4ed46869 1/* Coding system handler (conversion, detection, and etc).
4a2f9c6a 2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
203cb916 3 Licensed to the Free Software Foundation.
4ed46869 4
369314dc
KH
5This file is part of GNU Emacs.
6
7GNU Emacs is free software; you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation; either version 2, or (at your option)
10any later version.
4ed46869 11
369314dc
KH
12GNU Emacs is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
4ed46869 16
369314dc
KH
17You should have received a copy of the GNU General Public License
18along with GNU Emacs; see the file COPYING. If not, write to
19the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20Boston, MA 02111-1307, USA. */
4ed46869
KH
21
22/*** TABLE OF CONTENTS ***
23
24 1. Preamble
0ef69138 25 2. Emacs' internal format (emacs-mule) handlers
4ed46869
KH
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
1397dc18
KH
28 5. CCL handlers
29 6. End-of-line handlers
30 7. C library functions
31 8. Emacs Lisp library functions
32 9. Post-amble
4ed46869
KH
33
34*/
35
36/*** GENERAL NOTE on CODING SYSTEM ***
37
38 Coding system is an encoding mechanism of one or more character
39 sets. Here's a list of coding systems which Emacs can handle. When
40 we say "decode", it means converting some other coding system to
0ef69138
KH
41 Emacs' internal format (emacs-mule), and when we say "encode",
42 it means converting the coding system emacs-mule to some other
43 coding system.
4ed46869 44
0ef69138 45 0. Emacs' internal format (emacs-mule)
4ed46869
KH
46
47 Emacs itself holds a multi-lingual character in a buffer and a string
f4dee582 48 in a special format. Details are described in section 2.
4ed46869
KH
49
50 1. ISO2022
51
52 The most famous coding system for multiple character sets. X's
f4dee582
RS
53 Compound Text, various EUCs (Extended Unix Code), and coding
54 systems used in Internet communication such as ISO-2022-JP are
55 all variants of ISO2022. Details are described in section 3.
4ed46869
KH
56
57 2. SJIS (or Shift-JIS or MS-Kanji-Code)
58
59 A coding system to encode character sets: ASCII, JISX0201, and
60 JISX0208. Widely used for PC's in Japan. Details are described in
f4dee582 61 section 4.
4ed46869
KH
62
63 3. BIG5
64
65 A coding system to encode character sets: ASCII and Big5. Widely
66 used by Chinese (mainly in Taiwan and Hong Kong). Details are
f4dee582
RS
67 described in section 4. In this file, when we write "BIG5"
68 (all uppercase), we mean the coding system, and when we write
69 "Big5" (capitalized), we mean the character set.
4ed46869 70
27901516
KH
71 4. Raw text
72
4608c386
KH
73 A coding system for a text containing random 8-bit code. Emacs does
74 no code conversion on such a text except for end-of-line format.
27901516
KH
75
76 5. Other
4ed46869 77
f4dee582 78 If a user wants to read/write a text encoded in a coding system not
4ed46869
KH
79 listed above, he can supply a decoder and an encoder for it in CCL
80 (Code Conversion Language) programs. Emacs executes the CCL program
81 while reading/writing.
82
d46c5b12
KH
83 Emacs represents a coding system by a Lisp symbol that has a property
84 `coding-system'. But, before actually using the coding system, the
4ed46869 85 information about it is set in a structure of type `struct
f4dee582 86 coding_system' for rapid processing. See section 6 for more details.
4ed46869
KH
87
88*/
89
90/*** GENERAL NOTES on END-OF-LINE FORMAT ***
91
92 How end-of-line of a text is encoded depends on a system. For
93 instance, Unix's format is just one byte of `line-feed' code,
f4dee582 94 whereas DOS's format is two-byte sequence of `carriage-return' and
d46c5b12
KH
95 `line-feed' codes. MacOS's format is usually one byte of
96 `carriage-return'.
4ed46869 97
f4dee582
RS
98 Since text characters encoding and end-of-line encoding are
99 independent, any coding system described above can take
4ed46869 100 any format of end-of-line. So, Emacs has information of format of
f4dee582 101 end-of-line in each coding-system. See section 6 for more details.
4ed46869
KH
102
103*/
104
105/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
106
107 These functions check if a text between SRC and SRC_END is encoded
108 in the coding system category XXX. Each returns an integer value in
109 which appropriate flag bits for the category XXX is set. The flag
110 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
111 template of these functions. */
112#if 0
113int
0ef69138 114detect_coding_emacs_mule (src, src_end)
4ed46869
KH
115 unsigned char *src, *src_end;
116{
117 ...
118}
119#endif
120
121/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
122
123 These functions decode SRC_BYTES length text at SOURCE encoded in
0ef69138 124 CODING to Emacs' internal format (emacs-mule). The resulting text
d46c5b12
KH
125 goes to a place pointed to by DESTINATION, the length of which
126 should not exceed DST_BYTES. These functions set the information of
127 original and decoded texts in the members produced, produced_char,
128 consumed, and consumed_char of the structure *CODING.
129
130 The return value is an integer (CODING_FINISH_XXX) indicating how
131 the decoding finished.
132
133 DST_BYTES zero means that source area and destination area are
134 overlapped, which means that we can produce a decoded text until it
135 reaches the head of the not-yet-decoded source text.
136
137 Below is a template of these functions. */
4ed46869 138#if 0
d46c5b12 139decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
140 struct coding_system *coding;
141 unsigned char *source, *destination;
142 int src_bytes, dst_bytes;
4ed46869
KH
143{
144 ...
145}
146#endif
147
148/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
149
0ef69138
KH
150 These functions encode SRC_BYTES length text at SOURCE of Emacs'
151 internal format (emacs-mule) to CODING. The resulting text goes to
f4dee582 152 a place pointed to by DESTINATION, the length of which should not
d46c5b12
KH
153 exceed DST_BYTES. These functions set the information of
154 original and encoded texts in the members produced, produced_char,
155 consumed, and consumed_char of the structure *CODING.
156
157 The return value is an integer (CODING_FINISH_XXX) indicating how
158 the encoding finished.
159
160 DST_BYTES zero means that source area and destination area are
161 overlapped, which means that we can produce an encoded text until it
162 reaches the head of the not-yet-encoded source text.
163
164 Below is a template of these functions. */
4ed46869 165#if 0
d46c5b12 166encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
167 struct coding_system *coding;
168 unsigned char *source, *destination;
169 int src_bytes, dst_bytes;
4ed46869
KH
170{
171 ...
172}
173#endif
174
175/*** COMMONLY USED MACROS ***/
176
177/* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
178 THREE_MORE_BYTES safely get one, two, and three bytes from the
179 source text respectively. If there are not enough bytes in the
180 source, they jump to `label_end_of_loop'. The caller should set
181 variables `src' and `src_end' to appropriate areas in advance. */
182
/* Fetch one byte from `src' into C1 and advance `src'.  If `src' has
   already reached `src_end', nothing is consumed and control jumps to
   `label_end_of_loop', which the calling function must define.  */
#define ONE_MORE_BYTE(c1)               \
  do {                                  \
    if (src < src_end)                  \
      (c1) = *src++;                    \
    else                                \
      goto label_end_of_loop;           \
  } while (0)
190
/* Fetch two bytes from `src' into C1 and C2 (in that order) and
   advance `src' past them.  If fewer than two bytes remain before
   `src_end', nothing is consumed and control jumps to
   `label_end_of_loop'.  The remaining length is computed as
   `src_end - src' so that no pointer beyond one-past-the-end of the
   source is ever formed.  */
#define TWO_MORE_BYTES(c1, c2)          \
  do {                                  \
    if (src_end - src >= 2)             \
      {                                 \
        (c1) = *src++;                  \
        (c2) = *src++;                  \
      }                                 \
    else                                \
      goto label_end_of_loop;           \
  } while (0)
198
/* Fetch three bytes from `src' into C1, C2 and C3 (in that order) and
   advance `src' past them.  If fewer than three bytes remain before
   `src_end', nothing is consumed and control jumps to
   `label_end_of_loop'.  The remaining length is computed as
   `src_end - src' so that no pointer beyond one-past-the-end of the
   source is ever formed.  */
#define THREE_MORE_BYTES(c1, c2, c3)    \
  do {                                  \
    if (src_end - src >= 3)             \
      {                                 \
        (c1) = *src++;                  \
        (c2) = *src++;                  \
        (c3) = *src++;                  \
      }                                 \
    else                                \
      goto label_end_of_loop;           \
  } while (0)
206
207/* The following three macros DECODE_CHARACTER_ASCII,
208 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
209 the multi-byte form of a character of each class at the place
210 pointed by `dst'. The caller should set the variable `dst' to
211 point to an appropriate area and the variable `coding' to point to
212 the coding-system of the currently decoding text in advance. */
213
214/* Decode one ASCII character C. */
215
de79a6a5
KH
/* Store character C at `dst' and advance `dst'.

   While composing (COMPOSING_P (coding->composing)), C is written as
   the two bytes 0xA0 and C | 0x80 and coding->composed_chars is
   incremented; otherwise C is written as a single byte and
   coding->produced_char is incremented.  coding->fake_multibyte is
   set when the byte carrying the character falls in 0x80..0x9F
   (composing) or is 0x80 or above (not composing).

   C is evaluated exactly once, so an argument with side effects is
   safe.  */
#define DECODE_CHARACTER_ASCII(c)                               \
  do {                                                          \
    int decode_ascii_code_ = (c);                               \
    if (COMPOSING_P (coding->composing))                        \
      {                                                         \
        *dst++ = 0xA0;                                          \
        *dst++ = decode_ascii_code_ | 0x80;                     \
        coding->composed_chars++;                               \
        if ((decode_ascii_code_ | 0x80) < 0xA0)                 \
          coding->fake_multibyte = 1;                           \
      }                                                         \
    else                                                        \
      {                                                         \
        *dst++ = decode_ascii_code_;                            \
        coding->produced_char++;                                \
        if (decode_ascii_code_ >= 0x80)                         \
          coding->fake_multibyte = 1;                           \
      }                                                         \
  } while (0)
233
f4dee582 234/* Decode one DIMENSION1 character whose charset is CHARSET and whose
4ed46869
KH
235 position-code is C. */
236
/* Store at `dst' the multi-byte form of a one-position-code character
   of CHARSET whose position-code is C, and advance `dst'.

   The base leading-code is written first (with 0x20 added while
   composing, in which case coding->composed_chars is incremented
   instead of coding->produced_char), then the extended leading-code
   when CHARSET_LEADING_CODE_EXT yields a non-zero value, then
   C | 0x80.  coding->fake_multibyte is set when C | 0x80 is below
   0xA0.

   C is evaluated exactly once.  */
#define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
  do {                                                                  \
    unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
    int decode_dim1_code_;                                              \
    if (COMPOSING_P (coding->composing))                                \
      {                                                                 \
        *dst++ = leading_code + 0x20;                                   \
        coding->composed_chars++;                                       \
      }                                                                 \
    else                                                                \
      {                                                                 \
        *dst++ = leading_code;                                          \
        coding->produced_char++;                                        \
      }                                                                 \
    /* Deliberate assignment: reuse LEADING_CODE for the extended       \
       leading-code and emit it only when there is one.  */             \
    if ((leading_code = CHARSET_LEADING_CODE_EXT (charset)) != 0)       \
      *dst++ = leading_code;                                            \
    decode_dim1_code_ = (c) | 0x80;                                     \
    *dst++ = decode_dim1_code_;                                         \
    if (decode_dim1_code_ < 0xA0)                                       \
      coding->fake_multibyte = 1;                                       \
  } while (0)
256
f4dee582 257/* Decode one DIMENSION2 character whose charset is CHARSET and whose
4ed46869
KH
258 position-codes are C1 and C2. */
259
/* Store at `dst' the multi-byte form of a two-position-code character
   of CHARSET whose position-codes are C1 and C2, and advance `dst'.

   Everything up to and including C1 is produced by
   DECODE_CHARACTER_DIMENSION1; this macro then appends C2 | 0x80 and
   sets coding->fake_multibyte when that byte is below 0xA0.

   C2 is evaluated exactly once, after C1 has been handled.  */
#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
  do {                                                  \
    int decode_dim2_code_;                              \
    DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
    decode_dim2_code_ = (c2) | 0x80;                    \
    *dst++ = decode_dim2_code_;                         \
    if (decode_dim2_code_ < 0xA0)                       \
      coding->fake_multibyte = 1;                       \
  } while (0)
267
268\f
269/*** 1. Preamble ***/
270
271#include <stdio.h>
272
273#ifdef emacs
274
275#include <config.h>
276#include "lisp.h"
277#include "buffer.h"
278#include "charset.h"
279#include "ccl.h"
280#include "coding.h"
281#include "window.h"
282
283#else /* not emacs */
284
285#include "mulelib.h"
286
287#endif /* not emacs */
288
289Lisp_Object Qcoding_system, Qeol_type;
290Lisp_Object Qbuffer_file_coding_system;
291Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
27901516 292Lisp_Object Qno_conversion, Qundecided;
bb0115a2 293Lisp_Object Qcoding_system_history;
70c22245 294Lisp_Object Qsafe_charsets;
1397dc18 295Lisp_Object Qvalid_codes;
4ed46869
KH
296
297extern Lisp_Object Qinsert_file_contents, Qwrite_region;
298Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
299Lisp_Object Qstart_process, Qopen_network_stream;
300Lisp_Object Qtarget_idx;
301
d46c5b12
KH
302Lisp_Object Vselect_safe_coding_system_function;
303
7722baf9
EZ
304/* Mnemonic string for each format of end-of-line. */
305Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
306/* Mnemonic string to indicate format of end-of-line is not yet
4ed46869 307 decided. */
7722baf9 308Lisp_Object eol_mnemonic_undecided;
4ed46869 309
9ce27fde
KH
310/* Format of end-of-line decided by system. This is CODING_EOL_LF on
311 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
312int system_eol_type;
313
4ed46869
KH
#ifdef emacs

/* Globals that exist only when this file is built as part of Emacs
   itself (the `emacs' macro is defined).  */

Lisp_Object Vcoding_system_list, Vcoding_system_alist;

Lisp_Object Qcoding_system_p, Qcoding_system_error;

/* Coding system emacs-mule and raw-text are for converting only
   end-of-line format.  */
Lisp_Object Qemacs_mule, Qraw_text;

/* Coding-systems are handed between Emacs Lisp programs and C internal
   routines by the following three variables.  */
/* Coding-system for reading files and receiving data from process.  */
Lisp_Object Vcoding_system_for_read;
/* Coding-system for writing files and sending data to process.  */
Lisp_Object Vcoding_system_for_write;
/* Coding-system actually used in the latest I/O.  */
Lisp_Object Vlast_coding_system_used;

/* A vector of length 256 which contains information about special
   Latin codes (especially for dealing with Microsoft codes).  */
Lisp_Object Vlatin_extra_code_table;

/* Flag to inhibit code conversion of end-of-line format.  */
int inhibit_eol_conversion;

/* Flag to make buffer-file-coding-system inherit from process-coding.  */
int inherit_process_coding_system;

/* Coding system to be used to encode text for terminal display.  */
struct coding_system terminal_coding;

/* Coding system to be used to encode text for terminal display when
   terminal coding system is nil.  */
struct coding_system safe_terminal_coding;

/* Coding system of what is sent from terminal keyboard.  */
struct coding_system keyboard_coding;

/* Default coding system to be used to write a file.  */
struct coding_system default_buffer_file_coding;

Lisp_Object Vfile_coding_system_alist;
Lisp_Object Vprocess_coding_system_alist;
Lisp_Object Vnetwork_coding_system_alist;

#endif /* emacs */
361
d46c5b12 362Lisp_Object Qcoding_category, Qcoding_category_index;
4ed46869
KH
363
364/* List of symbols `coding-category-xxx' ordered by priority. */
365Lisp_Object Vcoding_category_list;
366
d46c5b12
KH
367/* Table of coding categories (Lisp symbols). */
368Lisp_Object Vcoding_category_table;
4ed46869
KH
369
370/* Table of names of symbol for each coding-category. */
371char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
0ef69138 372 "coding-category-emacs-mule",
4ed46869
KH
373 "coding-category-sjis",
374 "coding-category-iso-7",
d46c5b12 375 "coding-category-iso-7-tight",
4ed46869
KH
376 "coding-category-iso-8-1",
377 "coding-category-iso-8-2",
7717c392
KH
378 "coding-category-iso-7-else",
379 "coding-category-iso-8-else",
89fa8b36 380 "coding-category-ccl",
4ed46869 381 "coding-category-big5",
27901516 382 "coding-category-raw-text",
89fa8b36 383 "coding-category-binary"
4ed46869
KH
384};
385
66cfb530 386/* Table of pointers to coding systems corresponding to each coding
d46c5b12
KH
387 categories. */
388struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
389
66cfb530
KH
390/* Table of coding category masks. Nth element is a mask for a coding
391 cateogry of which priority is Nth. */
392static
393int coding_priorities[CODING_CATEGORY_IDX_MAX];
394
f967223b
KH
395/* Flag to tell if we look up translation table on character code
396 conversion. */
84fbb8a0 397Lisp_Object Venable_character_translation;
f967223b
KH
398/* Standard translation table to look up on decoding (reading). */
399Lisp_Object Vstandard_translation_table_for_decode;
400/* Standard translation table to look up on encoding (writing). */
401Lisp_Object Vstandard_translation_table_for_encode;
84fbb8a0 402
f967223b
KH
403Lisp_Object Qtranslation_table;
404Lisp_Object Qtranslation_table_id;
405Lisp_Object Qtranslation_table_for_decode;
406Lisp_Object Qtranslation_table_for_encode;
4ed46869
KH
407
408/* Alist of charsets vs revision number. */
409Lisp_Object Vcharset_revision_alist;
410
02ba4723
KH
411/* Default coding systems used for process I/O. */
412Lisp_Object Vdefault_process_coding_system;
413
4ed46869 414\f
0ef69138 415/*** 2. Emacs internal format (emacs-mule) handlers ***/
4ed46869
KH
416
417/* Emacs' internal format for encoding multiple character sets is a
f4dee582
RS
418 kind of multi-byte encoding, i.e. characters are encoded by
419 variable-length sequences of one-byte codes. ASCII characters
420 and control characters (e.g. `tab', `newline') are represented by
421 one-byte sequences which are their ASCII codes, in the range 0x00
422 through 0x7F. The other characters are represented by a sequence
423 of `base leading-code', optional `extended leading-code', and one
424 or two `position-code's. The length of the sequence is determined
425 by the base leading-code. Leading-code takes the range 0x80
426 through 0x9F, whereas extended leading-code and position-code take
427 the range 0xA0 through 0xFF. See `charset.h' for more details
428 about leading-code and position-code.
429
430 There's one exception to this rule. Special leading-code
4ed46869
KH
431 `leading-code-composition' denotes that the following several
432 characters should be composed into one character. Leading-codes of
433 components (except for ASCII) are added 0x20. An ASCII character
434 component is represented by a 2-byte sequence of `0xA0' and
f4dee582
RS
435 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
436 details of composite character. Hence, we can summarize the code
4ed46869
KH
437 range as follows:
438
439 --- CODE RANGE of Emacs' internal format ---
440 (character set) (range)
441 ASCII 0x00 .. 0x7F
442 ELSE (1st byte) 0x80 .. 0x9F
443 (rest bytes) 0xA0 .. 0xFF
444 ---------------------------------------------
445
446 */
447
448enum emacs_code_class_type emacs_code_class[256];
449
/* Consume the byte at *SRC and go on to the next statement only if
   that byte is in the range 0xA0..0xFF (0xA0 itself is accepted).
   If the byte is below 0xA0, make the enclosing function return 0.
   If SRC has already reached SRC_END, consume nothing and jump to
   `label_end_of_switch', which the enclosing function must define.  */
#define CHECK_CODE_RANGE_A0_FF                  \
  do {                                          \
    if (src >= src_end)                         \
      goto label_end_of_switch;                 \
    else if (*src++ < 0xA0)                     \
      return 0;                                 \
  } while (0)
459
460/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
461 Check if a text is encoded in Emacs' internal format. If it is,
d46c5b12 462 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
4ed46869
KH
463
464int
0ef69138 465detect_coding_emacs_mule (src, src_end)
4ed46869
KH
466 unsigned char *src, *src_end;
467{
468 unsigned char c;
469 int composing = 0;
470
471 while (src < src_end)
472 {
473 c = *src++;
474
475 if (composing)
476 {
477 if (c < 0xA0)
478 composing = 0;
479 else
480 c -= 0x20;
481 }
482
483 switch (emacs_code_class[c])
484 {
485 case EMACS_ascii_code:
486 case EMACS_linefeed_code:
487 break;
488
489 case EMACS_control_code:
490 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
491 return 0;
492 break;
493
494 case EMACS_invalid_code:
495 return 0;
496
497 case EMACS_leading_code_composition: /* c == 0x80 */
498 if (composing)
499 CHECK_CODE_RANGE_A0_FF;
500 else
501 composing = 1;
502 break;
503
504 case EMACS_leading_code_4:
505 CHECK_CODE_RANGE_A0_FF;
506 /* fall down to check it two more times ... */
507
508 case EMACS_leading_code_3:
509 CHECK_CODE_RANGE_A0_FF;
510 /* fall down to check it one more time ... */
511
512 case EMACS_leading_code_2:
513 CHECK_CODE_RANGE_A0_FF;
514 break;
515
516 default:
517 label_end_of_switch:
518 break;
519 }
520 }
0ef69138 521 return CODING_CATEGORY_MASK_EMACS_MULE;
4ed46869
KH
522}
523
524\f
525/*** 3. ISO2022 handlers ***/
526
527/* The following note describes the coding system ISO2022 briefly.
39787efd
KH
528 Since the intention of this note is to help understand the
529 functions in this file, some parts are NOT ACCURATE or OVERLY
530 SIMPLIFIED. For thorough understanding, please refer to the
4ed46869
KH
531 original document of ISO2022.
532
533 ISO2022 provides many mechanisms to encode several character sets
39787efd
KH
534 in 7-bit and 8-bit environments. For 7-bit environments, all text
535 is encoded using bytes less than 128. This may make the encoded
536 text a little bit longer, but the text passes more easily through
537 several gateways, some of which strip off MSB (Most Significant Bit).
538
539 There are two kinds of character sets: control character set and
4ed46869
KH
540 graphic character set. The former contains control characters such
541 as `newline' and `escape' to provide control functions (control
39787efd
KH
542 functions are also provided by escape sequences). The latter
543 contains graphic characters such as 'A' and '-'. Emacs recognizes
4ed46869
KH
544 two control character sets and many graphic character sets.
545
546 Graphic character sets are classified into one of the following
39787efd
KH
547 four classes, according to the number of bytes (DIMENSION) and
548 number of characters in one dimension (CHARS) of the set:
549 - DIMENSION1_CHARS94
550 - DIMENSION1_CHARS96
551 - DIMENSION2_CHARS94
552 - DIMENSION2_CHARS96
553
554 In addition, each character set is assigned an identification tag,
555 unique for each set, called "final character" (denoted as <F>
556 hereafter). The <F> of each character set is decided by ECMA(*)
557 when it is registered in ISO. The code range of <F> is 0x30..0x7F
558 (0x30..0x3F are for private use only).
4ed46869
KH
559
560 Note (*): ECMA = European Computer Manufacturers Association
561
562 Here are examples of graphic character set [NAME(<F>)]:
563 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
564 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
565 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
566 o DIMENSION2_CHARS96 -- none for the moment
567
39787efd 568 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4ed46869
KH
569 C0 [0x00..0x1F] -- control character plane 0
570 GL [0x20..0x7F] -- graphic character plane 0
571 C1 [0x80..0x9F] -- control character plane 1
572 GR [0xA0..0xFF] -- graphic character plane 1
573
574 A control character set is directly designated and invoked to C0 or
39787efd
KH
575 C1 by an escape sequence. The most common case is that:
576 - ISO646's control character set is designated/invoked to C0, and
577 - ISO6429's control character set is designated/invoked to C1,
578 and usually these designations/invocations are omitted in encoded
579 text. In a 7-bit environment, only C0 can be used, and a control
580 character for C1 is encoded by an appropriate escape sequence to
581 fit into the environment. All control characters for C1 are
582 defined to have corresponding escape sequences.
4ed46869
KH
583
584 A graphic character set is at first designated to one of four
585 graphic registers (G0 through G3), then these graphic registers are
586 invoked to GL or GR. These designations and invocations can be
587 done independently. The most common case is that G0 is invoked to
39787efd
KH
588 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
589 these invocations and designations are omitted in encoded text.
590 In a 7-bit environment, only GL can be used.
4ed46869 591
39787efd
KH
592 When a graphic character set of CHARS94 is invoked to GL, codes
593 0x20 and 0x7F of the GL area work as control characters SPACE and
594 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
595 be used.
4ed46869
KH
596
597 There are two ways of invocation: locking-shift and single-shift.
598 With locking-shift, the invocation lasts until the next different
39787efd
KH
599 invocation, whereas with single-shift, the invocation affects the
600 following character only and doesn't affect the locking-shift
601 state. Invocations are done by the following control characters or
602 escape sequences:
4ed46869
KH
603
604 ----------------------------------------------------------------------
39787efd 605 abbrev function cntrl escape seq description
4ed46869 606 ----------------------------------------------------------------------
39787efd
KH
607 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
608 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
609 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
610 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
611 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
612 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
613 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
614 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
615 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4ed46869 616 ----------------------------------------------------------------------
39787efd
KH
617 (*) These are not used by any known coding system.
618
619 Control characters for these functions are defined by macros
620 ISO_CODE_XXX in `coding.h'.
4ed46869 621
39787efd 622 Designations are done by the following escape sequences:
4ed46869
KH
623 ----------------------------------------------------------------------
624 escape sequence description
625 ----------------------------------------------------------------------
626 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
627 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
628 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
629 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
630 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
631 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
632 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
633 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
634 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
635 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
636 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
637 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
638 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
639 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
640 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
641 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
642 ----------------------------------------------------------------------
643
644 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
39787efd 645 of dimension 1, chars 94, and final character <F>, etc...
4ed46869
KH
646
647 Note (*): Although these designations are not allowed in ISO2022,
648 Emacs accepts them on decoding, and produces them on encoding
39787efd 649 CHARS96 character sets in a coding system which is characterized as
4ed46869
KH
650 7-bit environment, non-locking-shift, and non-single-shift.
651
652 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
39787efd 653 '(' can be omitted. We refer to this as "short-form" hereafter.
4ed46869
KH
654
655 Now you may notice that there are a lot of ways for encoding the
39787efd
KH
656 same multilingual text in ISO2022. Actually, there exist many
657 coding systems such as Compound Text (used in X11's inter-client
658 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
659 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4ed46869
KH
660 localized platforms), and all of these are variants of ISO2022.
661
662 In addition to the above, Emacs handles two more kinds of escape
663 sequences: ISO6429's direction specification and Emacs' private
664 sequence for specifying character composition.
665
39787efd 666 ISO6429's direction specification takes the following form:
4ed46869
KH
667 o CSI ']' -- end of the current direction
668 o CSI '0' ']' -- end of the current direction
669 o CSI '1' ']' -- start of left-to-right text
670 o CSI '2' ']' -- start of right-to-left text
671 The control character CSI (0x9B: control sequence introducer) is
39787efd
KH
672 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
673
674 Character composition specification takes the following form:
4ed46869
KH
675 o ESC '0' -- start character composition
676 o ESC '1' -- end character composition
39787efd
KH
677 Since these are not standard escape sequences of any ISO standard,
678 the use of them for these meanings is restricted to Emacs only. */
4ed46869
KH
679
680enum iso_code_class_type iso_code_class[256];
681
f024b6aa
RS
/* Non-zero if a coding system is registered for category index IDX
   and it accepts CHARSET, i.e. CHARSET is flagged in its
   `safe_charsets' or it has a requested designation for CHARSET.
   IDX is evaluated more than once.  */
#define CHARSET_OK(idx, charset)                                \
  (coding_system_table[idx]                                     \
   && (coding_system_table[idx]->safe_charsets[charset]         \
       || (CODING_SPEC_ISO_REQUESTED_DESIGNATION                \
           (coding_system_table[idx], charset)                  \
           != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)))

/* Non-zero if a coding system is registered for category index IDX
   and its initial designation for graphic register 1 is non-negative.
   Like CHARSET_OK, this yields 0 instead of dereferencing a null
   table entry.  IDX is evaluated more than once.  */
#define SHIFT_OUT_OK(idx)                                       \
  (coding_system_table[idx]                                     \
   && (CODING_SPEC_ISO_INITIAL_DESIGNATION                      \
       (coding_system_table[idx], 1) >= 0))
691
4ed46869
KH
692/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
693 Check if a text is encoded in ISO2022. If it is, returns an
694 integer in which appropriate flag bits any of:
695 CODING_CATEGORY_MASK_ISO_7
d46c5b12 696 CODING_CATEGORY_MASK_ISO_7_TIGHT
4ed46869
KH
697 CODING_CATEGORY_MASK_ISO_8_1
698 CODING_CATEGORY_MASK_ISO_8_2
7717c392
KH
699 CODING_CATEGORY_MASK_ISO_7_ELSE
700 CODING_CATEGORY_MASK_ISO_8_ELSE
4ed46869
KH
701 are set. If a code which should never appear in ISO2022 is found,
702 returns 0. */
703
704int
705detect_coding_iso2022 (src, src_end)
706 unsigned char *src, *src_end;
707{
d46c5b12
KH
708 int mask = CODING_CATEGORY_MASK_ISO;
709 int mask_found = 0;
f46869e4 710 int reg[4], shift_out = 0, single_shifting = 0;
d46c5b12 711 int c, c1, i, charset;
3f003981 712
d46c5b12 713 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
3f003981 714 while (mask && src < src_end)
4ed46869
KH
715 {
716 c = *src++;
717 switch (c)
718 {
719 case ISO_CODE_ESC:
f46869e4 720 single_shifting = 0;
e0e989f6 721 if (src >= src_end)
4ed46869
KH
722 break;
723 c = *src++;
d46c5b12 724 if (c >= '(' && c <= '/')
4ed46869 725 {
bf9cdd4e
KH
726 /* Designation sequence for a charset of dimension 1. */
727 if (src >= src_end)
728 break;
d46c5b12
KH
729 c1 = *src++;
730 if (c1 < ' ' || c1 >= 0x80
731 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
732 /* Invalid designation sequence. Just ignore. */
733 break;
734 reg[(c - '(') % 4] = charset;
bf9cdd4e
KH
735 }
736 else if (c == '$')
737 {
738 /* Designation sequence for a charset of dimension 2. */
739 if (src >= src_end)
740 break;
741 c = *src++;
742 if (c >= '@' && c <= 'B')
743 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
d46c5b12 744 reg[0] = charset = iso_charset_table[1][0][c];
bf9cdd4e 745 else if (c >= '(' && c <= '/')
bcf26d6a 746 {
bf9cdd4e
KH
747 if (src >= src_end)
748 break;
d46c5b12
KH
749 c1 = *src++;
750 if (c1 < ' ' || c1 >= 0x80
751 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
752 /* Invalid designation sequence. Just ignore. */
753 break;
754 reg[(c - '(') % 4] = charset;
bcf26d6a 755 }
bf9cdd4e 756 else
d46c5b12
KH
757 /* Invalid designation sequence. Just ignore. */
758 break;
759 }
ae9ff118 760 else if (c == 'N' || c == 'O')
d46c5b12 761 {
ae9ff118
KH
762 /* ESC <Fe> for SS2 or SS3. */
763 mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
d46c5b12 764 break;
4ed46869 765 }
bf9cdd4e 766 else if (c == '0' || c == '1' || c == '2')
ae9ff118 767 /* ESC <Fp> for start/end composition. Just ignore. */
d46c5b12 768 break;
bf9cdd4e 769 else
d46c5b12
KH
770 /* Invalid escape sequence. Just ignore. */
771 break;
772
773 /* We found a valid designation sequence for CHARSET. */
774 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
775 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset))
776 mask_found |= CODING_CATEGORY_MASK_ISO_7;
777 else
778 mask &= ~CODING_CATEGORY_MASK_ISO_7;
779 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset))
780 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
781 else
782 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
ae9ff118
KH
783 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset))
784 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
785 else
d46c5b12 786 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
ae9ff118
KH
787 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset))
788 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
789 else
d46c5b12 790 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
4ed46869
KH
791 break;
792
4ed46869 793 case ISO_CODE_SO:
f46869e4 794 single_shifting = 0;
d46c5b12
KH
795 if (shift_out == 0
796 && (reg[1] >= 0
797 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
798 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
799 {
800 /* Locking shift out. */
801 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
802 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
803 }
e0e989f6
KH
804 break;
805
d46c5b12 806 case ISO_CODE_SI:
f46869e4 807 single_shifting = 0;
d46c5b12
KH
808 if (shift_out == 1)
809 {
810 /* Locking shift in. */
811 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
812 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
813 }
814 break;
815
4ed46869 816 case ISO_CODE_CSI:
f46869e4 817 single_shifting = 0;
4ed46869
KH
818 case ISO_CODE_SS2:
819 case ISO_CODE_SS3:
3f003981
KH
820 {
821 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
822
70c22245
KH
823 if (c != ISO_CODE_CSI)
824 {
d46c5b12
KH
825 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
826 & CODING_FLAG_ISO_SINGLE_SHIFT)
70c22245 827 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
828 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
829 & CODING_FLAG_ISO_SINGLE_SHIFT)
70c22245 830 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
f46869e4 831 single_shifting = 1;
70c22245 832 }
3f003981
KH
833 if (VECTORP (Vlatin_extra_code_table)
834 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
835 {
d46c5b12
KH
836 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
837 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981 838 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
839 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
840 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981
KH
841 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
842 }
843 mask &= newmask;
d46c5b12 844 mask_found |= newmask;
3f003981
KH
845 }
846 break;
4ed46869
KH
847
848 default:
849 if (c < 0x80)
f46869e4
KH
850 {
851 single_shifting = 0;
852 break;
853 }
4ed46869 854 else if (c < 0xA0)
c4825358 855 {
f46869e4 856 single_shifting = 0;
3f003981
KH
857 if (VECTORP (Vlatin_extra_code_table)
858 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
c4825358 859 {
3f003981
KH
860 int newmask = 0;
861
d46c5b12
KH
862 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
863 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981 864 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
865 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
866 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981
KH
867 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
868 mask &= newmask;
d46c5b12 869 mask_found |= newmask;
c4825358 870 }
3f003981
KH
871 else
872 return 0;
c4825358 873 }
4ed46869
KH
874 else
875 {
7717c392 876 unsigned char *src_begin = src;
4ed46869 877
d46c5b12 878 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
7717c392 879 | CODING_CATEGORY_MASK_ISO_7_ELSE);
d46c5b12 880 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
f46869e4
KH
881 /* Check the length of succeeding codes of the range
882 0xA0..0FF. If the byte length is odd, we exclude
883 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
884 when we are not single shifting. */
885 if (!single_shifting)
886 {
887 while (src < src_end && *src >= 0xA0)
888 src++;
889 if ((src - src_begin - 1) & 1 && src < src_end)
890 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
891 else
892 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
893 }
4ed46869
KH
894 }
895 break;
896 }
897 }
898
d46c5b12 899 return (mask & mask_found);
4ed46869
KH
900}
901
/* Decode one character whose charset is CHARSET and whose 1st
   position code is C1, and store the result at DST.  If the dimension
   of CHARSET is 2, the 2nd position code is fetched from SRC into C2.
   A negative CHARSET means that we are decoding ill formed text; all
   we can do then is to hand C1 to DECODE_CHARACTER_ASCII as is.

   The expansion refers to the variables `coding', `src', `dst', `c2'
   and `translation_table' of the place where it is used.  C1 is also
   passed to SPLIT_CHAR together with C2, so it should be a plain
   variable.  ONE_MORE_BYTE is used to fetch C2.  */

#define DECODE_ISO_CHARACTER(charset, c1)                                 \
  do {                                                                    \
    int c_alt, charset_alt = (charset);                                   \
                                                                          \
    if (COMPOSING_HEAD_P (coding->composing))                             \
      {                                                                   \
        /* Head of a composition: produce the leading code, followed     \
           by 0xFF when composition rules are embedded.  */               \
        *dst++ = LEADING_CODE_COMPOSITION;                                \
        if (COMPOSING_WITH_RULE_P (coding->composing))                    \
          *dst++ = 0xFF;                                                  \
        /* NOTE(review): presumably moves the HEAD state to the          \
           corresponding non-HEAD state; confirm against coding.h.  */    \
        coding->composing += 2;                                           \
      }                                                                   \
    if (charset_alt >= 0)                                                 \
      {                                                                   \
        if (CHARSET_DIMENSION (charset_alt) == 2)                         \
          {                                                               \
            ONE_MORE_BYTE (c2);                                           \
            if (! (iso_code_class[(c2) & 0x7F] == ISO_0x20_or_0x7F        \
                   || (iso_code_class[(c2) & 0x7F]                        \
                       == ISO_graphic_plane_0)))                          \
              {                                                           \
                /* C2 can't be a 2nd position code.  Give it back to     \
                   SRC and treat C1 as ASCII.  */                         \
                src--;                                                    \
                charset_alt = CHARSET_ASCII;                              \
              }                                                           \
          }                                                               \
        if (!NILP (translation_table))                                    \
          {                                                               \
            c_alt = translate_char (translation_table,                    \
                                    -1, charset_alt, c1, c2);             \
            if (c_alt >= 0)                                               \
              SPLIT_CHAR (c_alt, charset_alt, c1, c2);                    \
          }                                                               \
      }                                                                   \
    if (charset_alt < 0 || charset_alt == CHARSET_ASCII)                  \
      {                                                                   \
        DECODE_CHARACTER_ASCII (c1);                                      \
      }                                                                   \
    else if (CHARSET_DIMENSION (charset_alt) != 1)                        \
      {                                                                   \
        DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
      }                                                                   \
    else                                                                  \
      {                                                                   \
        DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
      }                                                                   \
    if (COMPOSING_WITH_RULE_P (coding->composing))                        \
      {                                                                   \
        /* To tell a composition rule follows.  */                        \
        coding->composing = COMPOSING_WITH_RULE_RULE;                     \
      }                                                                   \
  } while (0)
946
/* Set designation state into CODING: designate to graphic register REG
   the charset identified by DIMENSION, CHARS and FINAL_CHAR.

   Control jumps to `label_invalid_code' of the place of use when
   FINAL_CHAR is out of range, when no charset is registered for the
   triple, or when the charset neither requests REG nor is listed in
   `coding->safe_charsets'.  In the latter two cases REG is recorded in
   `last_invalid_designation_register'.  */

#define DECODE_DESIGNATION(reg, dimension, chars, final_char)             \
  do {                                                                    \
    int charset;                                                          \
                                                                          \
    if ((final_char) < '0' || (final_char) >= 128)                        \
      goto label_invalid_code;                                            \
    charset = ISO_CHARSET_TABLE (make_number (dimension),                 \
                                 make_number (chars),                     \
                                 make_number (final_char));               \
    if (charset < 0                                                       \
        || ((CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)      \
             != (reg))                                                    \
            && ! coding->safe_charsets[charset]))                         \
      {                                                                   \
        /* Unknown or unacceptable charset.  Remember the register.  */   \
        coding->spec.iso2022.last_invalid_designation_register = (reg);   \
        goto label_invalid_code;                                          \
      }                                                                   \
    if (coding->spec.iso2022.last_invalid_designation_register == 0       \
        && (reg) == 0                                                     \
        && charset == CHARSET_ASCII)                                      \
      {                                                                   \
        /* We should insert this designation sequence as is so           \
           that it is surely written back to a file.  */                  \
        coding->spec.iso2022.last_invalid_designation_register = -1;      \
        goto label_invalid_code;                                          \
      }                                                                   \
    coding->spec.iso2022.last_invalid_designation_register = -1;          \
    /* When the direction flag is set and a reverse charset exists,      \
       designate the reverse one instead.  */                             \
    if ((coding->mode & CODING_MODE_DIRECTION)                            \
        && CHARSET_REVERSE_CHARSET (charset) >= 0)                        \
      charset = CHARSET_REVERSE_CHARSET (charset);                        \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                  \
  } while (0)
982
88993dfd
KH
983/* Return 0 if there's a valid composing sequence starting at SRC and
984 ending before SRC_END, else return -1. */
d46c5b12 985
84fbb8a0
KH
986int
987check_composing_code (coding, src, src_end)
d46c5b12
KH
988 struct coding_system *coding;
989 unsigned char *src, *src_end;
990{
d46c5b12
KH
991 int charset, c, c1, dim;
992
993 while (src < src_end)
994 {
88993dfd
KH
995 c = *src++;
996 if (c >= 0x20)
997 continue;
998 if (c != ISO_CODE_ESC || src >= src_end)
999 return -1;
1000 c = *src++;
1001 if (c == '1') /* end of compsition */
1002 return 0;
1003 if (src + 2 >= src_end
1004 || !coding->flags & CODING_FLAG_ISO_DESIGNATION)
1005 return -1;
1006
1007 dim = (c == '$');
1008 if (dim == 1)
1009 c = (*src >= '@' && *src <= 'B') ? '(' : *src++;
1010 if (c >= '(' && c <= '/')
d46c5b12 1011 {
88993dfd
KH
1012 c1 = *src++;
1013 if ((c1 < ' ' || c1 >= 0x80)
1014 || (charset = iso_charset_table[dim][c >= ','][c1]) < 0
1015 || ! coding->safe_charsets[charset]
1016 || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
1017 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1018 return -1;
d46c5b12 1019 }
88993dfd
KH
1020 else
1021 return -1;
d46c5b12 1022 }
88993dfd
KH
1023
1024 /* We have not found the sequence "ESC 1". */
1025 return -1;
d46c5b12
KH
1026}
1027
4ed46869
KH
1028/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1029
1030int
d46c5b12 1031decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
1032 struct coding_system *coding;
1033 unsigned char *source, *destination;
1034 int src_bytes, dst_bytes;
4ed46869
KH
1035{
1036 unsigned char *src = source;
1037 unsigned char *src_end = source + src_bytes;
1038 unsigned char *dst = destination;
1039 unsigned char *dst_end = destination + dst_bytes;
1040 /* Since the maximum bytes produced by each loop is 7, we subtract 6
1041 from DST_END to assure that overflow checking is necessary only
1042 at the head of loop. */
1043 unsigned char *adjusted_dst_end = dst_end - 6;
1044 int charset;
1045 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1046 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1047 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
84fbb8a0 1048 Lisp_Object translation_table
f967223b 1049 = coding->translation_table_for_decode;
d46c5b12 1050 int result = CODING_FINISH_NORMAL;
bdd9fb48 1051
84fbb8a0 1052 if (!NILP (Venable_character_translation) && NILP (translation_table))
f967223b 1053 translation_table = Vstandard_translation_table_for_decode;
4ed46869 1054
d46c5b12 1055 coding->produced_char = 0;
fb88bf2d 1056 coding->fake_multibyte = 0;
d46c5b12
KH
1057 while (src < src_end && (dst_bytes
1058 ? (dst < adjusted_dst_end)
1059 : (dst < src - 6)))
4ed46869
KH
1060 {
1061 /* SRC_BASE remembers the start position in source in each loop.
1062 The loop will be exited when there's not enough source text
1063 to analyze long escape sequence or 2-byte code (within macros
1064 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
1065 to SRC_BASE before exiting. */
1066 unsigned char *src_base = src;
bdd9fb48 1067 int c1 = *src++, c2;
4ed46869
KH
1068
1069 switch (iso_code_class [c1])
1070 {
1071 case ISO_0x20_or_0x7F:
1072 if (!coding->composing
1073 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
1074 {
1075 /* This is SPACE or DEL. */
1076 *dst++ = c1;
d46c5b12 1077 coding->produced_char++;
4ed46869
KH
1078 break;
1079 }
1080 /* This is a graphic character, we fall down ... */
1081
1082 case ISO_graphic_plane_0:
1083 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1084 {
1085 /* This is a composition rule. */
1086 *dst++ = c1 | 0x80;
1087 coding->composing = COMPOSING_WITH_RULE_TAIL;
1088 }
1089 else
1090 DECODE_ISO_CHARACTER (charset0, c1);
1091 break;
1092
1093 case ISO_0xA0_or_0xFF:
d46c5b12
KH
1094 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1095 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
fb88bf2d 1096 goto label_invalid_code;
4ed46869
KH
1097 /* This is a graphic character, we fall down ... */
1098
1099 case ISO_graphic_plane_1:
d46c5b12 1100 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
fb88bf2d 1101 goto label_invalid_code;
d46c5b12
KH
1102 else
1103 DECODE_ISO_CHARACTER (charset1, c1);
4ed46869
KH
1104 break;
1105
1106 case ISO_control_code:
1107 /* All ISO2022 control characters in this class have the
1108 same representation in Emacs internal format. */
d46c5b12
KH
1109 if (c1 == '\n'
1110 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1111 && (coding->eol_type == CODING_EOL_CR
1112 || coding->eol_type == CODING_EOL_CRLF))
1113 {
1114 result = CODING_FINISH_INCONSISTENT_EOL;
1115 goto label_end_of_loop_2;
1116 }
4ed46869 1117 *dst++ = c1;
d46c5b12 1118 coding->produced_char++;
174a4cbe
KH
1119 if (c1 >= 0x80)
1120 coding->fake_multibyte = 1;
4ed46869
KH
1121 break;
1122
1123 case ISO_carriage_return:
1124 if (coding->eol_type == CODING_EOL_CR)
d46c5b12 1125 *dst++ = '\n';
4ed46869
KH
1126 else if (coding->eol_type == CODING_EOL_CRLF)
1127 {
1128 ONE_MORE_BYTE (c1);
1129 if (c1 == ISO_CODE_LF)
1130 *dst++ = '\n';
1131 else
1132 {
d46c5b12
KH
1133 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1134 {
1135 result = CODING_FINISH_INCONSISTENT_EOL;
1136 goto label_end_of_loop_2;
1137 }
4ed46869 1138 src--;
d46c5b12 1139 *dst++ = '\r';
4ed46869
KH
1140 }
1141 }
1142 else
d46c5b12
KH
1143 *dst++ = c1;
1144 coding->produced_char++;
4ed46869
KH
1145 break;
1146
1147 case ISO_shift_out:
d46c5b12
KH
1148 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1149 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1150 goto label_invalid_code;
4ed46869
KH
1151 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1152 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1153 break;
1154
1155 case ISO_shift_in:
d46c5b12
KH
1156 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1157 goto label_invalid_code;
4ed46869
KH
1158 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1159 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1160 break;
1161
1162 case ISO_single_shift_2_7:
1163 case ISO_single_shift_2:
d46c5b12
KH
1164 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1165 goto label_invalid_code;
4ed46869
KH
1166 /* SS2 is handled as an escape sequence of ESC 'N' */
1167 c1 = 'N';
1168 goto label_escape_sequence;
1169
1170 case ISO_single_shift_3:
d46c5b12
KH
1171 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1172 goto label_invalid_code;
4ed46869
KH
1173 /* SS2 is handled as an escape sequence of ESC 'O' */
1174 c1 = 'O';
1175 goto label_escape_sequence;
1176
1177 case ISO_control_sequence_introducer:
1178 /* CSI is handled as an escape sequence of ESC '[' ... */
1179 c1 = '[';
1180 goto label_escape_sequence;
1181
1182 case ISO_escape:
1183 ONE_MORE_BYTE (c1);
1184 label_escape_sequence:
1185 /* Escape sequences handled by Emacs are invocation,
1186 designation, direction specification, and character
1187 composition specification. */
1188 switch (c1)
1189 {
1190 case '&': /* revision of following character set */
1191 ONE_MORE_BYTE (c1);
1192 if (!(c1 >= '@' && c1 <= '~'))
d46c5b12 1193 goto label_invalid_code;
4ed46869
KH
1194 ONE_MORE_BYTE (c1);
1195 if (c1 != ISO_CODE_ESC)
d46c5b12 1196 goto label_invalid_code;
4ed46869
KH
1197 ONE_MORE_BYTE (c1);
1198 goto label_escape_sequence;
1199
1200 case '$': /* designation of 2-byte character set */
d46c5b12
KH
1201 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1202 goto label_invalid_code;
4ed46869
KH
1203 ONE_MORE_BYTE (c1);
1204 if (c1 >= '@' && c1 <= 'B')
1205 { /* designation of JISX0208.1978, GB2312.1980,
88993dfd 1206 or JISX0208.1980 */
4ed46869
KH
1207 DECODE_DESIGNATION (0, 2, 94, c1);
1208 }
1209 else if (c1 >= 0x28 && c1 <= 0x2B)
1210 { /* designation of DIMENSION2_CHARS94 character set */
1211 ONE_MORE_BYTE (c2);
1212 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1213 }
1214 else if (c1 >= 0x2C && c1 <= 0x2F)
1215 { /* designation of DIMENSION2_CHARS96 character set */
1216 ONE_MORE_BYTE (c2);
1217 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1218 }
1219 else
d46c5b12 1220 goto label_invalid_code;
4ed46869
KH
1221 break;
1222
1223 case 'n': /* invocation of locking-shift-2 */
d46c5b12
KH
1224 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1225 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1226 goto label_invalid_code;
4ed46869 1227 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
e0e989f6 1228 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
1229 break;
1230
1231 case 'o': /* invocation of locking-shift-3 */
d46c5b12
KH
1232 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1233 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1234 goto label_invalid_code;
4ed46869 1235 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
e0e989f6 1236 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
1237 break;
1238
1239 case 'N': /* invocation of single-shift-2 */
d46c5b12
KH
1240 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1241 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1242 goto label_invalid_code;
4ed46869
KH
1243 ONE_MORE_BYTE (c1);
1244 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1245 DECODE_ISO_CHARACTER (charset, c1);
1246 break;
1247
1248 case 'O': /* invocation of single-shift-3 */
d46c5b12
KH
1249 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1250 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1251 goto label_invalid_code;
4ed46869
KH
1252 ONE_MORE_BYTE (c1);
1253 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1254 DECODE_ISO_CHARACTER (charset, c1);
1255 break;
1256
d46c5b12
KH
1257 case '0': case '2': /* start composing */
1258 /* Before processing composing, we must be sure that all
1259 characters being composed are supported by CODING.
88993dfd
KH
1260 If not, we must give up composing. */
1261 if (check_composing_code (coding, src, src_end) == 0)
1262 {
1263 /* We are looking at a valid composition sequence. */
1264 coding->composing = (c1 == '0'
1265 ? COMPOSING_NO_RULE_HEAD
1266 : COMPOSING_WITH_RULE_HEAD);
1267 coding->composed_chars = 0;
1268 }
1269 else
1270 {
1271 *dst++ = ISO_CODE_ESC;
1272 *dst++ = c1;
1273 coding->produced_char += 2;
1274 }
4ed46869
KH
1275 break;
1276
1277 case '1': /* end composing */
88993dfd
KH
1278 if (!coding->composing)
1279 {
1280 *dst++ = ISO_CODE_ESC;
1281 *dst++ = c1;
1282 coding->produced_char += 2;
1283 break;
1284 }
1285
de79a6a5
KH
1286 if (coding->composed_chars > 0)
1287 {
1288 if (coding->composed_chars == 1)
1289 {
1290 unsigned char *this_char_start = dst;
1291 int this_bytes;
1292
1293 /* Only one character is in the composing
1294 sequence. Make it a normal character. */
1295 while (*--this_char_start != LEADING_CODE_COMPOSITION);
1296 dst = (this_char_start
1297 + (coding->composing == COMPOSING_NO_RULE_TAIL
1298 ? 1 : 2));
1299 *dst -= 0x20;
1300 if (*dst == 0x80)
1301 *++dst &= 0x7F;
1302 this_bytes = BYTES_BY_CHAR_HEAD (*dst);
1303 while (this_bytes--) *this_char_start++ = *dst++;
1304 dst = this_char_start;
1305 }
1306 coding->produced_char++;
1307 }
4ed46869 1308 coding->composing = COMPOSING_NO;
4ed46869
KH
1309 break;
1310
1311 case '[': /* specification of direction */
d46c5b12
KH
1312 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1313 goto label_invalid_code;
4ed46869 1314 /* For the moment, nested direction is not supported.
d46c5b12
KH
1315 So, `coding->mode & CODING_MODE_DIRECTION' zero means
1316 left-to-right, and nozero means right-to-left. */
4ed46869
KH
1317 ONE_MORE_BYTE (c1);
1318 switch (c1)
1319 {
1320 case ']': /* end of the current direction */
d46c5b12 1321 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869
KH
1322
1323 case '0': /* end of the current direction */
1324 case '1': /* start of left-to-right direction */
1325 ONE_MORE_BYTE (c1);
1326 if (c1 == ']')
d46c5b12 1327 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869 1328 else
d46c5b12 1329 goto label_invalid_code;
4ed46869
KH
1330 break;
1331
1332 case '2': /* start of right-to-left direction */
1333 ONE_MORE_BYTE (c1);
1334 if (c1 == ']')
d46c5b12 1335 coding->mode |= CODING_MODE_DIRECTION;
4ed46869 1336 else
d46c5b12 1337 goto label_invalid_code;
4ed46869
KH
1338 break;
1339
1340 default:
d46c5b12 1341 goto label_invalid_code;
4ed46869
KH
1342 }
1343 break;
1344
1345 default:
d46c5b12
KH
1346 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1347 goto label_invalid_code;
4ed46869
KH
1348 if (c1 >= 0x28 && c1 <= 0x2B)
1349 { /* designation of DIMENSION1_CHARS94 character set */
1350 ONE_MORE_BYTE (c2);
1351 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1352 }
1353 else if (c1 >= 0x2C && c1 <= 0x2F)
1354 { /* designation of DIMENSION1_CHARS96 character set */
1355 ONE_MORE_BYTE (c2);
1356 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1357 }
1358 else
1359 {
d46c5b12 1360 goto label_invalid_code;
4ed46869
KH
1361 }
1362 }
1363 /* We must update these variables now. */
1364 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1365 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1366 break;
1367
d46c5b12 1368 label_invalid_code:
d46c5b12
KH
1369 while (src_base < src)
1370 *dst++ = *src_base++;
fb88bf2d 1371 coding->fake_multibyte = 1;
4ed46869
KH
1372 }
1373 continue;
1374
1375 label_end_of_loop:
d46c5b12
KH
1376 result = CODING_FINISH_INSUFFICIENT_SRC;
1377 label_end_of_loop_2:
4ed46869
KH
1378 src = src_base;
1379 break;
1380 }
1381
fb88bf2d 1382 if (src < src_end)
4ed46869 1383 {
fb88bf2d
KH
1384 if (result == CODING_FINISH_NORMAL)
1385 result = CODING_FINISH_INSUFFICIENT_DST;
1386 else if (result != CODING_FINISH_INCONSISTENT_EOL
1387 && coding->mode & CODING_MODE_LAST_BLOCK)
1388 {
1389 /* This is the last block of the text to be decoded. We had
1390 better just flush out all remaining codes in the text
1391 although they are not valid characters. */
1392 src_bytes = src_end - src;
1393 if (dst_bytes && (dst_end - dst < src_bytes))
1394 src_bytes = dst_end - dst;
1395 bcopy (src, dst, src_bytes);
1396 dst += src_bytes;
1397 src += src_bytes;
1398 coding->fake_multibyte = 1;
1399 }
4ed46869 1400 }
fb88bf2d 1401
d46c5b12
KH
1402 coding->consumed = coding->consumed_char = src - source;
1403 coding->produced = dst - destination;
1404 return result;
4ed46869
KH
1405}
1406
f4dee582 1407/* ISO2022 encoding stuff. */
4ed46869
KH
1408
1409/*
f4dee582 1410 It is not enough to say just "ISO2022" on encoding, we have to
d46c5b12 1411 specify more details. In Emacs, each coding system of ISO2022
4ed46869
KH
1412 variant has the following specifications:
1413 1. Initial designation to G0 thru G3.
1414 2. Allows short-form designation?
1415 3. ASCII should be designated to G0 before control characters?
1416 4. ASCII should be designated to G0 at end of line?
1417 5. 7-bit environment or 8-bit environment?
1418 6. Use locking-shift?
1419 7. Use Single-shift?
1420 And the following two are only for Japanese:
1421 8. Use ASCII in place of JIS0201-1976-Roman?
1422 9. Use JISX0208-1983 in place of JISX0208-1978?
1423 These specifications are encoded in `coding->flags' as flag bits
1424 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
f4dee582 1425 details.
4ed46869
KH
1426*/
1427
/* Produce codes (escape sequence) at DST for designating CHARSET to
   graphic register REG, and record the designation in CODING.  When
   CODING has a revision number less than 255 for CHARSET, the sequence
   "ESC & <'@' + revision>" is produced first.  If <final-char> of
   CHARSET is '@', 'A', or 'B' and the coding system CODING allows,
   produce designation sequence of short-form.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                          \
  do {                                                                    \
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);          \
    char *intermediate_char_94 = "()*+";                                  \
    char *intermediate_char_96 = ",-./";                                  \
    int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);      \
                                                                          \
    if (revision < 255)                                                   \
      {                                                                   \
        *dst++ = ISO_CODE_ESC;                                            \
        *dst++ = '&';                                                     \
        *dst++ = '@' + revision;                                          \
      }                                                                   \
    *dst++ = ISO_CODE_ESC;                                                \
    if (CHARSET_DIMENSION (charset) != 1)                                 \
      {                                                                   \
        /* Multibyte charset.  */                                         \
        *dst++ = '$';                                                     \
        if (CHARSET_CHARS (charset) != 94)                                \
          *dst++ = (unsigned char) (intermediate_char_96[reg]);           \
        else if (! ((coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
                    && (reg) == 0                                         \
                    && final_char >= '@' && final_char <= 'B'))           \
          /* The short form is not usable, so the intermediate           \
             character can't be omitted.  */                              \
          *dst++ = (unsigned char) (intermediate_char_94[reg]);           \
      }                                                                   \
    else                                                                  \
      {                                                                   \
        *dst++ = (unsigned char) ((CHARSET_CHARS (charset) == 94          \
                                   ? intermediate_char_94                 \
                                   : intermediate_char_96)[reg]);         \
      }                                                                   \
    *dst++ = final_char;                                                  \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = (charset);                \
  } while (0)
1469
/* The following two macros produce codes (control character or escape
   sequence) for ISO2022 single-shift functions (single-shift-2 and
   single-shift-3).  The escape sequence is used when CODING has the
   flag CODING_FLAG_ISO_SEVEN_BITS.  Otherwise the 8-bit control code
   is produced and `coding->fake_multibyte' is set to 1.  In both
   cases, CODING is marked as single shifting.  */

#define ENCODE_SINGLE_SHIFT_2                           \
  do {                                                  \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      {                                                 \
        *dst++ = ISO_CODE_SS2;                          \
        coding->fake_multibyte = 1;                     \
      }                                                 \
    else                                                \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = 'N';                                   \
      }                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                           \
  do {                                                  \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      {                                                 \
        *dst++ = ISO_CODE_SS3;                          \
        coding->fake_multibyte = 1;                     \
      }                                                 \
    else                                                \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = 'O';                                   \
      }                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
  } while (0)
1497
/* The following four macros produce codes (control character or
   escape sequence) for ISO2022 locking-shift functions (shift-in,
   shift-out, locking-shift-2, and locking-shift-3).  Each of them
   also records in CODING which graphic register is now invoked to
   graphic plane 0.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
    *dst++ = ISO_CODE_SI;                               \
  } while (0)

#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
    *dst++ = ISO_CODE_SO;                               \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'n';                                       \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'o';                                       \
  } while (0)
1525
f4dee582
RS
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   The body is a loop.  Each round either produces the character and
   leaves the loop, or calls encode_invocation_designation and tries
   again.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                      \
  do {                                                                    \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                         \
      {                                                                   \
        /* A single-shift code was just produced.  */                     \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)            \
                  ? ((c1) & 0x7F) : ((c1) | 0x80));                       \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                     \
        break;                                                            \
      }                                                                   \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                   \
        *dst++ = (c1) & 0x7F;                                             \
        break;                                                            \
      }                                                                   \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                   \
        *dst++ = (c1) | 0x80;                                             \
        break;                                                            \
      }                                                                   \
    if ((coding->flags & CODING_FLAG_ISO_SAFE)                            \
        && !coding->safe_charsets[charset])                               \
      {                                                                   \
        /* We should not encode this character, instead produce one      \
           or two substitution codes according to the width.  */          \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                   \
        if (CHARSET_WIDTH (charset) == 2)                                 \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
        break;                                                            \
      }                                                                   \
    /* Since CHARSET is not yet invoked to any graphic planes, we        \
       must invoke it, or, at first, designate it to some graphic        \
       register.  Then repeat the loop to actually produce the           \
       character.  */                                                     \
    dst = encode_invocation_designation (charset, coding, dst);           \
  } while (1)
1569
f4dee582
RS
/* Produce codes for a DIMENSION2 character whose character set is
   CHARSET and whose position-codes are C1 and C2.  Designation and
   invocation codes are also produced in advance if necessary.

   The body is a loop.  Each round either produces the character and
   leaves the loop, or calls encode_invocation_designation and tries
   again.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                  \
  do {                                                                    \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                         \
      {                                                                   \
        /* A single-shift code was just produced.  */                     \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                   \
          {                                                               \
            *dst++ = (c1) & 0x7F;                                         \
            *dst++ = (c2) & 0x7F;                                         \
          }                                                               \
        else                                                              \
          {                                                               \
            *dst++ = (c1) | 0x80;                                         \
            *dst++ = (c2) | 0x80;                                         \
          }                                                               \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                     \
        break;                                                            \
      }                                                                   \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                   \
        *dst++ = (c1) & 0x7F;                                             \
        *dst++ = (c2) & 0x7F;                                             \
        break;                                                            \
      }                                                                   \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                   \
        *dst++ = (c1) | 0x80;                                             \
        *dst++ = (c2) | 0x80;                                             \
        break;                                                            \
      }                                                                   \
    if ((coding->flags & CODING_FLAG_ISO_SAFE)                            \
        && !coding->safe_charsets[charset])                               \
      {                                                                   \
        /* We should not encode this character, instead produce one      \
           or two substitution codes according to the width.  */          \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                   \
        if (CHARSET_WIDTH (charset) == 2)                                 \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
        break;                                                            \
      }                                                                   \
    /* Since CHARSET is not yet invoked to any graphic planes, we        \
       must invoke it, or, at first, designate it to some graphic        \
       register.  Then repeat the loop to actually produce the           \
       character.  */                                                     \
    dst = encode_invocation_designation (charset, coding, dst);           \
  } while (1)
1612
6f551029
KH
/* Produce codes at DST for the character whose charset is CHARSET and
   whose position-codes are C1 and C2.

   If `translation_table' of the place of use is not nil and it
   translates the character, the translated character is encoded
   instead; in that case SPLIT_CHAR stores the new position-codes in
   C1 and C2, so they should be plain variables.  The replacement of
   ASCII by JISX0201-Roman (CODING_FLAG_ISO_USE_ROMAN) and of JISX0208
   by JISX0208-1978 (CODING_FLAG_ISO_USE_OLDJIS) is decided on the
   charset actually being encoded, i.e. on the charset after the
   translation, so that it always matches C1 and C2.

   If the charset is not defined, CHARSET, C1 and C2 (if non-zero) are
   produced as is, with the 8th bit cleared when CODING has the flag
   CODING_FLAG_ISO_SEVEN_BITS.

   `coding->consumed_char' is incremented unless we are composing.  */

#define ENCODE_ISO_CHARACTER(charset, c1, c2)                             \
  do {                                                                    \
    int c_alt, charset_alt;                                               \
    if (!NILP (translation_table)                                         \
        && ((c_alt = translate_char (translation_table, -1,               \
                                     charset, c1, c2))                    \
            >= 0))                                                        \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                            \
    else                                                                  \
      charset_alt = charset;                                              \
    if (CHARSET_DEFINED_P (charset_alt))                                  \
      {                                                                   \
        if (CHARSET_DIMENSION (charset_alt) == 1)                         \
          {                                                               \
            if (charset_alt == CHARSET_ASCII                              \
                && coding->flags & CODING_FLAG_ISO_USE_ROMAN)             \
              charset_alt = charset_latin_jisx0201;                       \
            ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1);            \
          }                                                               \
        else                                                              \
          {                                                               \
            if (charset_alt == charset_jisx0208                           \
                && coding->flags & CODING_FLAG_ISO_USE_OLDJIS)            \
              charset_alt = charset_jisx0208_1978;                        \
            ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2);        \
          }                                                               \
      }                                                                   \
    else                                                                  \
      {                                                                   \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                   \
          {                                                               \
            *dst++ = charset & 0x7f;                                      \
            *dst++ = c1 & 0x7f;                                           \
            if (c2)                                                       \
              *dst++ = c2 & 0x7f;                                         \
          }                                                               \
        else                                                              \
          {                                                               \
            *dst++ = charset;                                             \
            *dst++ = c1;                                                  \
            if (c2)                                                       \
              *dst++ = c2;                                                \
          }                                                               \
      }                                                                   \
    if (! COMPOSING_P (coding->composing))                                \
      coding->consumed_char++;                                            \
  } while (0)
bdd9fb48 1660
4ed46869
KH
1661/* Produce designation and invocation codes at a place pointed by DST
1662 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1663 Return new DST. */
1664
1665unsigned char *
1666encode_invocation_designation (charset, coding, dst)
1667 int charset;
1668 struct coding_system *coding;
1669 unsigned char *dst;
1670{
1671 int reg; /* graphic register number */
1672
1673 /* At first, check designations. */
1674 for (reg = 0; reg < 4; reg++)
1675 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1676 break;
1677
1678 if (reg >= 4)
1679 {
1680 /* CHARSET is not yet designated to any graphic registers. */
1681 /* At first check the requested designation. */
1682 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1ba9e4ab
KH
1683 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1684 /* Since CHARSET requests no special designation, designate it
1685 to graphic register 0. */
4ed46869
KH
1686 reg = 0;
1687
1688 ENCODE_DESIGNATION (charset, reg, coding);
1689 }
1690
1691 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1692 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1693 {
1694 /* Since the graphic register REG is not invoked to any graphic
1695 planes, invoke it to graphic plane 0. */
1696 switch (reg)
1697 {
1698 case 0: /* graphic register 0 */
1699 ENCODE_SHIFT_IN;
1700 break;
1701
1702 case 1: /* graphic register 1 */
1703 ENCODE_SHIFT_OUT;
1704 break;
1705
1706 case 2: /* graphic register 2 */
1707 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1708 ENCODE_SINGLE_SHIFT_2;
1709 else
1710 ENCODE_LOCKING_SHIFT_2;
1711 break;
1712
1713 case 3: /* graphic register 3 */
1714 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1715 ENCODE_SINGLE_SHIFT_3;
1716 else
1717 ENCODE_LOCKING_SHIFT_3;
1718 break;
1719 }
1720 }
1721 return dst;
1722}
1723
/* The following three macros produce codes for indicating composition:
   "ESC 0" to start a composition without rules, "ESC 2" to start a
   composition with rules, and "ESC 1" to end a composition.  Each
   expansion is parenthesized so that the two stores always stay
   together, whatever surrounds the macro.  */
#define ENCODE_COMPOSITION_NO_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
#define ENCODE_COMPOSITION_WITH_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '2')
#define ENCODE_COMPOSITION_END \
  (*dst++ = ISO_CODE_ESC, *dst++ = '1')
1728
/* The following three macros produce codes for indicating direction
   of text.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER produces "ESC [" when CODING has
   the flag CODING_FLAG_ISO_SEVEN_BITS, and the 8-bit control code CSI
   otherwise.  The flag is tested as a bit; comparing the whole of
   `coding->flags' with it would fail whenever any other flag is set.

   ENCODE_DIRECTION_R2L produces the introducer followed by "2]", and
   ENCODE_DIRECTION_L2R the introducer followed by "0]".  Each of them
   is a single statement to be used as `ENCODE_DIRECTION_R2L;'.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

#define ENCODE_DIRECTION_R2L                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '2';                                       \
    *dst++ = ']';                                       \
  } while (0)

#define ENCODE_DIRECTION_L2R                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '0';                                       \
    *dst++ = ']';                                       \
  } while (0)
1744
/* Produce codes for designation and invocation to reset the graphic
   planes and registers to initial state: shift-in if graphic plane 0
   does not invoke graphic register 0, then a designation sequence for
   each graphic register that has an initial designation different
   from the current one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                   \
  do {                                                                    \
    int reg;                                                              \
                                                                          \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                      \
      ENCODE_SHIFT_IN;                                                    \
    for (reg = 0; reg < 4; reg++)                                         \
      {                                                                   \
        /* Skip a register that has no initial designation.  */           \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) < 0)        \
          continue;                                                       \
        /* Skip a register that is already in the initial state.  */      \
        if (CODING_SPEC_ISO_DESIGNATION (coding, reg)                     \
            == CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))         \
          continue;                                                       \
        ENCODE_DESIGNATION                                                \
          (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
      }                                                                   \
  } while (0)
1759
bdd9fb48 1760/* Produce designation sequences of charsets in the line started from
d46c5b12 1761 SRC to a place pointed by *DSTP, and update DSTP.
bdd9fb48
KH
1762
1763 If the current block ends before any end-of-line, we may fail to
d46c5b12
KH
1764 find all the necessary designations. */
1765
dfcf069d 1766void
bdd9fb48 1767encode_designation_at_bol (coding, table, src, src_end, dstp)
e0e989f6 1768 struct coding_system *coding;
bdd9fb48 1769 Lisp_Object table;
e0e989f6
KH
1770 unsigned char *src, *src_end, **dstp;
1771{
bdd9fb48
KH
1772 int charset, c, found = 0, reg;
1773 /* Table of charsets to be designated to each graphic register. */
1774 int r[4];
1775 unsigned char *dst = *dstp;
1776
1777 for (reg = 0; reg < 4; reg++)
1778 r[reg] = -1;
1779
1780 while (src < src_end && *src != '\n' && found < 4)
e0e989f6 1781 {
bdd9fb48
KH
1782 int bytes = BYTES_BY_CHAR_HEAD (*src);
1783
1784 if (NILP (table))
1785 charset = CHARSET_AT (src);
1786 else
e0e989f6 1787 {
35cb8686
RS
1788 int c_alt;
1789 unsigned char c1, c2;
bdd9fb48
KH
1790
1791 SPLIT_STRING(src, bytes, charset, c1, c2);
84fbb8a0 1792 if ((c_alt = translate_char (table, -1, charset, c1, c2)) >= 0)
bdd9fb48 1793 charset = CHAR_CHARSET (c_alt);
e0e989f6 1794 }
bdd9fb48 1795
e0e989f6 1796 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
d46c5b12 1797 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
bdd9fb48
KH
1798 {
1799 found++;
1800 r[reg] = charset;
1801 }
1802
1803 src += bytes;
1804 }
1805
1806 if (found)
1807 {
1808 for (reg = 0; reg < 4; reg++)
1809 if (r[reg] >= 0
1810 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1811 ENCODE_DESIGNATION (r[reg], reg, coding);
1812 *dstp = dst;
e0e989f6 1813 }
e0e989f6
KH
1814}
1815
4ed46869
KH
1816/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1817
1818int
d46c5b12 1819encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
1820 struct coding_system *coding;
1821 unsigned char *source, *destination;
1822 int src_bytes, dst_bytes;
4ed46869
KH
1823{
1824 unsigned char *src = source;
1825 unsigned char *src_end = source + src_bytes;
1826 unsigned char *dst = destination;
1827 unsigned char *dst_end = destination + dst_bytes;
e0e989f6 1828 /* Since the maximum bytes produced by each loop is 20, we subtract 19
4ed46869
KH
1829 from DST_END to assure overflow checking is necessary only at the
1830 head of loop. */
e0e989f6 1831 unsigned char *adjusted_dst_end = dst_end - 19;
84fbb8a0 1832 Lisp_Object translation_table
f967223b 1833 = coding->translation_table_for_encode;
d46c5b12 1834 int result = CODING_FINISH_NORMAL;
bdd9fb48 1835
84fbb8a0 1836 if (!NILP (Venable_character_translation) && NILP (translation_table))
f967223b 1837 translation_table = Vstandard_translation_table_for_encode;
4ed46869 1838
d46c5b12 1839 coding->consumed_char = 0;
fb88bf2d 1840 coding->fake_multibyte = 0;
d46c5b12
KH
1841 while (src < src_end && (dst_bytes
1842 ? (dst < adjusted_dst_end)
1843 : (dst < src - 19)))
4ed46869
KH
1844 {
1845 /* SRC_BASE remembers the start position in source in each loop.
1846 The loop will be exited when there's not enough source text
1847 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1848 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1849 reset to SRC_BASE before exiting. */
1850 unsigned char *src_base = src;
bdd9fb48 1851 int charset, c1, c2, c3, c4;
4ed46869 1852
e0e989f6
KH
1853 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1854 && CODING_SPEC_ISO_BOL (coding))
1855 {
bdd9fb48 1856 /* We have to produce designation sequences if any now. */
84fbb8a0 1857 encode_designation_at_bol (coding, translation_table,
bdd9fb48 1858 src, src_end, &dst);
e0e989f6
KH
1859 CODING_SPEC_ISO_BOL (coding) = 0;
1860 }
1861
1862 c1 = *src++;
4ed46869 1863 /* If we are seeing a component of a composite character, we are
d46c5b12
KH
1864 seeing a leading-code encoded irregularly for composition, or
1865 a composition rule if composing with rule. We must set C1 to
1866 a normal leading-code or an ASCII code. If we are not seeing
1867 a composite character, we must reset composition,
1868 designation, and invocation states. */
4ed46869
KH
1869 if (COMPOSING_P (coding->composing))
1870 {
1871 if (c1 < 0xA0)
1872 {
1873 /* We are not in a composite character any longer. */
1874 coding->composing = COMPOSING_NO;
d46c5b12 1875 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1876 ENCODE_COMPOSITION_END;
1877 }
1878 else
1879 {
1880 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1881 {
1882 *dst++ = c1 & 0x7F;
1883 coding->composing = COMPOSING_WITH_RULE_HEAD;
1884 continue;
1885 }
1886 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1887 coding->composing = COMPOSING_WITH_RULE_RULE;
1888 if (c1 == 0xA0)
1889 {
1890 /* This is an ASCII component. */
1891 ONE_MORE_BYTE (c1);
1892 c1 &= 0x7F;
1893 }
1894 else
1895 /* This is a leading-code of non ASCII component. */
1896 c1 -= 0x20;
1897 }
1898 }
1899
1900 /* Now encode one character. C1 is a control character, an
1901 ASCII character, or a leading-code of multi-byte character. */
1902 switch (emacs_code_class[c1])
1903 {
1904 case EMACS_ascii_code:
8dbb769e 1905 c2 = 0;
bdd9fb48 1906 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
4ed46869
KH
1907 break;
1908
1909 case EMACS_control_code:
1910 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1911 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869 1912 *dst++ = c1;
d46c5b12 1913 coding->consumed_char++;
4ed46869
KH
1914 break;
1915
1916 case EMACS_carriage_return_code:
d46c5b12 1917 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
4ed46869
KH
1918 {
1919 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1920 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869 1921 *dst++ = c1;
d46c5b12 1922 coding->consumed_char++;
4ed46869
KH
1923 break;
1924 }
1925 /* fall down to treat '\r' as '\n' ... */
1926
1927 case EMACS_linefeed_code:
1928 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
e0e989f6
KH
1929 ENCODE_RESET_PLANE_AND_REGISTER;
1930 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1931 bcopy (coding->spec.iso2022.initial_designation,
1932 coding->spec.iso2022.current_designation,
1933 sizeof coding->spec.iso2022.initial_designation);
4ed46869 1934 if (coding->eol_type == CODING_EOL_LF
0ef69138 1935 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
1936 *dst++ = ISO_CODE_LF;
1937 else if (coding->eol_type == CODING_EOL_CRLF)
1938 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1939 else
1940 *dst++ = ISO_CODE_CR;
e0e989f6 1941 CODING_SPEC_ISO_BOL (coding) = 1;
d46c5b12 1942 coding->consumed_char++;
4ed46869
KH
1943 break;
1944
1945 case EMACS_leading_code_2:
1946 ONE_MORE_BYTE (c2);
8dbb769e 1947 c3 = 0;
19a8d9e0
KH
1948 if (c2 < 0xA0)
1949 {
1950 /* invalid sequence */
1951 *dst++ = c1;
38cf95df
RS
1952 src--;
1953 coding->consumed_char++;
19a8d9e0
KH
1954 }
1955 else
1956 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
1957 break;
1958
1959 case EMACS_leading_code_3:
1960 TWO_MORE_BYTES (c2, c3);
8dbb769e 1961 c4 = 0;
19a8d9e0
KH
1962 if (c2 < 0xA0 || c3 < 0xA0)
1963 {
1964 /* invalid sequence */
1965 *dst++ = c1;
38cf95df
RS
1966 src -= 2;
1967 coding->consumed_char++;
19a8d9e0
KH
1968 }
1969 else if (c1 < LEADING_CODE_PRIVATE_11)
bdd9fb48 1970 ENCODE_ISO_CHARACTER (c1, c2, c3);
4ed46869 1971 else
bdd9fb48 1972 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
4ed46869
KH
1973 break;
1974
1975 case EMACS_leading_code_4:
1976 THREE_MORE_BYTES (c2, c3, c4);
19a8d9e0
KH
1977 if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1978 {
1979 /* invalid sequence */
1980 *dst++ = c1;
38cf95df
RS
1981 src -= 3;
1982 coding->consumed_char++;
19a8d9e0
KH
1983 }
1984 else
1985 ENCODE_ISO_CHARACTER (c2, c3, c4);
4ed46869
KH
1986 break;
1987
1988 case EMACS_leading_code_composition:
19a8d9e0
KH
1989 ONE_MORE_BYTE (c2);
1990 if (c2 < 0xA0)
1991 {
1992 /* invalid sequence */
1993 *dst++ = c1;
38cf95df
RS
1994 src--;
1995 coding->consumed_char++;
19a8d9e0
KH
1996 }
1997 else if (c2 == 0xFF)
4ed46869 1998 {
d46c5b12 1999 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
2000 coding->composing = COMPOSING_WITH_RULE_HEAD;
2001 ENCODE_COMPOSITION_WITH_RULE_START;
d46c5b12 2002 coding->consumed_char++;
4ed46869
KH
2003 }
2004 else
2005 {
d46c5b12 2006 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
2007 /* Rewind one byte because it is a character code of
2008 composition elements. */
2009 src--;
2010 coding->composing = COMPOSING_NO_RULE_HEAD;
2011 ENCODE_COMPOSITION_NO_RULE_START;
d46c5b12 2012 coding->consumed_char++;
4ed46869
KH
2013 }
2014 break;
2015
2016 case EMACS_invalid_code:
3efbce95
KH
2017 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2018 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869 2019 *dst++ = c1;
d46c5b12 2020 coding->consumed_char++;
4ed46869
KH
2021 break;
2022 }
2023 continue;
2024 label_end_of_loop:
d46c5b12
KH
2025 result = CODING_FINISH_INSUFFICIENT_SRC;
2026 src = src_base;
4ed46869
KH
2027 break;
2028 }
2029
49cb52b4
KH
2030 if (src < src_end && result == CODING_FINISH_NORMAL)
2031 result = CODING_FINISH_INSUFFICIENT_DST;
2032
2033 /* If this is the last block of the text to be encoded, we must
2034 reset graphic planes and registers to the initial state, and
2035 flush out the carryover if any. */
2036 if (coding->mode & CODING_MODE_LAST_BLOCK)
84fbb8a0
KH
2037 {
2038 ENCODE_RESET_PLANE_AND_REGISTER;
2039 if (COMPOSING_P (coding->composing))
2040 ENCODE_COMPOSITION_END;
88993dfd
KH
2041 if (result == CODING_FINISH_INSUFFICIENT_SRC)
2042 {
2043 while (src < src_end && dst < dst_end)
2044 *dst++ = *src++;
2045 }
84fbb8a0 2046 }
d46c5b12
KH
2047 coding->consumed = src - source;
2048 coding->produced = coding->produced_char = dst - destination;
2049 return result;
4ed46869
KH
2050}
2051
2052\f
2053/*** 4. SJIS and BIG5 handlers ***/
2054
f4dee582 2055/* Although SJIS and BIG5 are not ISO's coding system, they are used
4ed46869
KH
2056 quite widely. So, for the moment, Emacs supports them in the bare
2057 C code. But, in the future, they may be supported only by CCL. */
2058
2059/* SJIS is a coding system encoding three character sets: ASCII, right
2060 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2061 as is. A character of charset katakana-jisx0201 is encoded by
2062 "position-code + 0x80". A character of charset japanese-jisx0208
2063 is encoded in two bytes, but its two position-codes are divided and
2064 shifted so that they fit in the ranges below.
2065
2066 --- CODE RANGE of SJIS ---
2067 (character set) (range)
2068 ASCII 0x00 .. 0x7F
2069 KATAKANA-JISX0201 0xA0 .. 0xDF
c28a9453 2070 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
d14d03ac 2071 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4ed46869
KH
2072 -------------------------------
2073
2074*/
2075
2076/* BIG5 is a coding system encoding two character sets: ASCII and
2077 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2078 character set and is encoded in two-byte.
2079
2080 --- CODE RANGE of BIG5 ---
2081 (character set) (range)
2082 ASCII 0x00 .. 0x7F
2083 Big5 (1st byte) 0xA1 .. 0xFE
2084 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2085 --------------------------
2086
2087 Since the number of characters in Big5 is larger than maximum
2088 characters in Emacs' charset (96x96), it can't be handled as one
2089 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2090 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2091 contains frequently used characters and the latter contains less
2092 frequently used characters. */
2093
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Set CHARSET, C1 and C2 from the BIG5 bytes B1 and B2.  A first byte
   of 0xC9 or above selects `charset_big5_2'.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2) \
  do { \
    unsigned int big5_index \
      = ((b1) - 0xA1) * BIG5_SAME_ROW + (b2) - ((b2) < 0x7F ? 0x40 : 0x62); \
    if ((b1) >= 0xC9) \
      { \
        charset = charset_big5_2; \
        big5_index -= (0xC9 - 0xA1) * BIG5_SAME_ROW; \
      } \
    else \
      charset = charset_big5_1; \
    c1 = big5_index / (0xFF - 0xA1) + 0x21; \
    c2 = big5_index % (0xFF - 0xA1) + 0x21; \
  } while (0)

/* Set the BIG5 bytes B1 and B2 from CHARSET and the position-codes
   C1 and C2.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2) \
  do { \
    unsigned int big5_index \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21); \
    if ((charset) == charset_big5_2) \
      big5_index += BIG5_SAME_ROW * (0xC9 - 0xA1); \
    b1 = big5_index / BIG5_SAME_ROW + 0xA1; \
    b2 = big5_index % BIG5_SAME_ROW; \
    b2 += b2 < 0x3F ? 0x40 : 0x62; \
  } while (0)
2126
/* Produce the internal form of the character CHARSET, C1, C2.  If the
   local `translation_table' is non-nil and translates the character,
   the translated charset and position-codes are used instead.  A
   negative or ASCII charset is produced as ASCII; otherwise the
   dimension of the charset selects the one- or two-position form.  */
#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
  do { \
    int c_alt, charset_alt = (charset); \
    if (!NILP (translation_table)) \
      { \
        c_alt = translate_char (translation_table, -1, (charset), c1, c2); \
        if (c_alt >= 0) \
          { \
            SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
          } \
      } \
    if (charset_alt < 0 || charset_alt == CHARSET_ASCII) \
      { \
        DECODE_CHARACTER_ASCII (c1); \
      } \
    else if (CHARSET_DIMENSION (charset_alt) != 1) \
      { \
        DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
      } \
    else \
      { \
        DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
      } \
  } while (0)
2141
/* Encode the character CHARSET, C1, C2 and store the result at the
   local pointer `dst', advancing it.  Reads the locals
   `translation_table', `sjis_p' and `coding'.

   If `translation_table' is non-nil and translates the character, the
   translated charset and position-codes are encoded instead.  ASCII
   is stored as is.  With `sjis_p' nonzero, katakana-jisx0201 is
   stored as the single byte C1 and jisx0208 as two SJIS bytes; with
   `sjis_p' zero, the two Big5 charsets are stored as two BIG5 bytes.
   Any other charset is stored as its charset ID followed by its
   position-codes.  `coding->fake_multibyte' is set in every branch
   except the ASCII, katakana and BIG5 ones, and
   `coding->consumed_char' is incremented.

   The definition deliberately has no trailing semicolon, so that the
   macro followed by `;' forms exactly one statement and can be used
   as the body of an `if' that has an `else'.  */
#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
  do { \
    int c_alt, charset_alt; \
    if (!NILP (translation_table) \
        && ((c_alt = translate_char (translation_table, -1, \
                                     charset, c1, c2)) \
            >= 0)) \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
    else \
      charset_alt = charset; \
    if (charset_alt == charset_ascii) \
      *dst++ = c1; \
    else if (CHARSET_DIMENSION (charset_alt) == 1) \
      { \
        if (sjis_p && charset_alt == charset_katakana_jisx0201) \
          *dst++ = c1; \
        else \
          { \
            *dst++ = charset_alt, *dst++ = c1; \
            coding->fake_multibyte = 1; \
          } \
      } \
    else \
      { \
        c1 &= 0x7F, c2 &= 0x7F; \
        if (sjis_p && charset_alt == charset_jisx0208) \
          { \
            unsigned char s1, s2; \
 \
            ENCODE_SJIS (c1, c2, s1, s2); \
            *dst++ = s1, *dst++ = s2; \
            coding->fake_multibyte = 1; \
          } \
        else if (!sjis_p \
                 && (charset_alt == charset_big5_1 \
                     || charset_alt == charset_big5_2)) \
          { \
            unsigned char b1, b2; \
 \
            ENCODE_BIG5 (charset_alt, c1, c2, b1, b2); \
            *dst++ = b1, *dst++ = b2; \
          } \
        else \
          { \
            *dst++ = charset_alt, *dst++ = c1, *dst++ = c2; \
            coding->fake_multibyte = 1; \
          } \
      } \
    coding->consumed_char++; \
  } while (0)
2192
4ed46869
KH
2193/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2194 Check if a text is encoded in SJIS. If it is, return
2195 CODING_CATEGORY_MASK_SJIS, else return 0. */
2196
2197int
2198detect_coding_sjis (src, src_end)
2199 unsigned char *src, *src_end;
2200{
2201 unsigned char c;
2202
2203 while (src < src_end)
2204 {
2205 c = *src++;
4ed46869
KH
2206 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2207 {
2208 if (src < src_end && *src++ < 0x40)
2209 return 0;
2210 }
2211 }
2212 return CODING_CATEGORY_MASK_SJIS;
2213}
2214
2215/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2216 Check if a text is encoded in BIG5. If it is, return
2217 CODING_CATEGORY_MASK_BIG5, else return 0. */
2218
2219int
2220detect_coding_big5 (src, src_end)
2221 unsigned char *src, *src_end;
2222{
2223 unsigned char c;
2224
2225 while (src < src_end)
2226 {
2227 c = *src++;
4ed46869
KH
2228 if (c >= 0xA1)
2229 {
2230 if (src >= src_end)
2231 break;
2232 c = *src++;
2233 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2234 return 0;
2235 }
2236 }
2237 return CODING_CATEGORY_MASK_BIG5;
2238}
2239
2240/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2241 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
2242
2243int
2244decode_coding_sjis_big5 (coding, source, destination,
d46c5b12 2245 src_bytes, dst_bytes, sjis_p)
4ed46869
KH
2246 struct coding_system *coding;
2247 unsigned char *source, *destination;
2248 int src_bytes, dst_bytes;
4ed46869
KH
2249 int sjis_p;
2250{
2251 unsigned char *src = source;
2252 unsigned char *src_end = source + src_bytes;
2253 unsigned char *dst = destination;
2254 unsigned char *dst_end = destination + dst_bytes;
2255 /* Since the maximum bytes produced by each loop is 4, we subtract 3
2256 from DST_END to assure overflow checking is necessary only at the
2257 head of loop. */
2258 unsigned char *adjusted_dst_end = dst_end - 3;
84fbb8a0 2259 Lisp_Object translation_table
f967223b 2260 = coding->translation_table_for_decode;
d46c5b12 2261 int result = CODING_FINISH_NORMAL;
a5d301df 2262
84fbb8a0 2263 if (!NILP (Venable_character_translation) && NILP (translation_table))
f967223b 2264 translation_table = Vstandard_translation_table_for_decode;
4ed46869 2265
d46c5b12 2266 coding->produced_char = 0;
fb88bf2d 2267 coding->fake_multibyte = 0;
d46c5b12
KH
2268 while (src < src_end && (dst_bytes
2269 ? (dst < adjusted_dst_end)
2270 : (dst < src - 3)))
4ed46869
KH
2271 {
2272 /* SRC_BASE remembers the start position in source in each loop.
2273 The loop will be exited when there's not enough source text
2274 to analyze two-byte character (within macro ONE_MORE_BYTE).
2275 In that case, SRC is reset to SRC_BASE before exiting. */
2276 unsigned char *src_base = src;
2277 unsigned char c1 = *src++, c2, c3, c4;
2278
d46c5b12 2279 if (c1 < 0x20)
4ed46869 2280 {
d46c5b12 2281 if (c1 == '\r')
4ed46869 2282 {
d46c5b12
KH
2283 if (coding->eol_type == CODING_EOL_CRLF)
2284 {
2285 ONE_MORE_BYTE (c2);
2286 if (c2 == '\n')
2287 *dst++ = c2;
2288 else if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2289 {
2290 result = CODING_FINISH_INCONSISTENT_EOL;
2291 goto label_end_of_loop_2;
2292 }
2293 else
2294 /* To process C2 again, SRC is subtracted by 1. */
2295 *dst++ = c1, src--;
2296 }
2297 else if (coding->eol_type == CODING_EOL_CR)
2298 *dst++ = '\n';
4ed46869 2299 else
d46c5b12
KH
2300 *dst++ = c1;
2301 }
2302 else if (c1 == '\n'
2303 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2304 && (coding->eol_type == CODING_EOL_CR
2305 || coding->eol_type == CODING_EOL_CRLF))
2306 {
2307 result = CODING_FINISH_INCONSISTENT_EOL;
2308 goto label_end_of_loop_2;
4ed46869
KH
2309 }
2310 else
2311 *dst++ = c1;
d46c5b12 2312 coding->produced_char++;
4ed46869 2313 }
a5d301df
KH
2314 else if (c1 < 0x80)
2315 DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
54f78171 2316 else
4ed46869 2317 {
4ed46869
KH
2318 if (sjis_p)
2319 {
54f78171 2320 if (c1 < 0xA0 || (c1 >= 0xE0 && c1 < 0xF0))
fb88bf2d 2321 {
54f78171
KH
2322 /* SJIS -> JISX0208 */
2323 ONE_MORE_BYTE (c2);
d14d03ac 2324 if (c2 >= 0x40 && c2 != 0x7F && c2 <= 0xFC)
54f78171
KH
2325 {
2326 DECODE_SJIS (c1, c2, c3, c4);
2327 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
2328 }
2329 else
2330 goto label_invalid_code_2;
fb88bf2d 2331 }
54f78171
KH
2332 else if (c1 < 0xE0)
2333 /* SJIS -> JISX0201-Kana */
2334 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1,
2335 /* dummy */ c2);
fb88bf2d 2336 else
54f78171 2337 goto label_invalid_code_1;
4ed46869 2338 }
fb88bf2d 2339 else
fb88bf2d 2340 {
54f78171
KH
2341 /* BIG5 -> Big5 */
2342 if (c1 >= 0xA1 && c1 <= 0xFE)
fb88bf2d 2343 {
54f78171
KH
2344 ONE_MORE_BYTE (c2);
2345 if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE))
2346 {
2347 int charset;
4ed46869 2348
54f78171
KH
2349 DECODE_BIG5 (c1, c2, charset, c3, c4);
2350 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
2351 }
2352 else
2353 goto label_invalid_code_2;
fb88bf2d
KH
2354 }
2355 else
54f78171 2356 goto label_invalid_code_1;
4ed46869
KH
2357 }
2358 }
2359 continue;
2360
fb88bf2d
KH
2361 label_invalid_code_1:
2362 *dst++ = c1;
2363 coding->produced_char++;
2364 coding->fake_multibyte = 1;
2365 continue;
2366
2367 label_invalid_code_2:
2368 *dst++ = c1; *dst++= c2;
2369 coding->produced_char += 2;
2370 coding->fake_multibyte = 1;
2371 continue;
2372
4ed46869 2373 label_end_of_loop:
d46c5b12
KH
2374 result = CODING_FINISH_INSUFFICIENT_SRC;
2375 label_end_of_loop_2:
4ed46869
KH
2376 src = src_base;
2377 break;
2378 }
2379
fb88bf2d
KH
2380 if (src < src_end)
2381 {
2382 if (result == CODING_FINISH_NORMAL)
2383 result = CODING_FINISH_INSUFFICIENT_DST;
2384 else if (result != CODING_FINISH_INCONSISTENT_EOL
2385 && coding->mode & CODING_MODE_LAST_BLOCK)
2386 {
2387 src_bytes = src_end - src;
2388 if (dst_bytes && (dst_end - dst < src_bytes))
2389 src_bytes = dst_end - dst;
2390 bcopy (dst, src, src_bytes);
2391 src += src_bytes;
2392 dst += src_bytes;
2393 coding->fake_multibyte = 1;
2394 }
2395 }
d46c5b12
KH
2396
2397 coding->consumed = coding->consumed_char = src - source;
2398 coding->produced = dst - destination;
2399 return result;
4ed46869
KH
2400}
2401
2402/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2403 This function can encode `charset_ascii', `charset_katakana_jisx0201',
2404 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
2405 sure that all these charsets are registered as official charset
2406 (i.e. do not have extended leading-codes). Characters of other
2407 charsets are produced without any encoding. If SJIS_P is 1, encode
2408 SJIS text, else encode BIG5 text. */
2409
2410int
2411encode_coding_sjis_big5 (coding, source, destination,
d46c5b12 2412 src_bytes, dst_bytes, sjis_p)
4ed46869
KH
2413 struct coding_system *coding;
2414 unsigned char *source, *destination;
2415 int src_bytes, dst_bytes;
4ed46869
KH
2416 int sjis_p;
2417{
2418 unsigned char *src = source;
2419 unsigned char *src_end = source + src_bytes;
2420 unsigned char *dst = destination;
2421 unsigned char *dst_end = destination + dst_bytes;
2422 /* Since the maximum bytes produced by each loop is 2, we subtract 1
2423 from DST_END to assure overflow checking is necessary only at the
2424 head of loop. */
2425 unsigned char *adjusted_dst_end = dst_end - 1;
84fbb8a0 2426 Lisp_Object translation_table
f967223b 2427 = coding->translation_table_for_encode;
d46c5b12 2428 int result = CODING_FINISH_NORMAL;
a5d301df 2429
84fbb8a0 2430 if (!NILP (Venable_character_translation) && NILP (translation_table))
f967223b 2431 translation_table = Vstandard_translation_table_for_encode;
4ed46869 2432
d46c5b12 2433 coding->consumed_char = 0;
fb88bf2d 2434 coding->fake_multibyte = 0;
d46c5b12
KH
2435 while (src < src_end && (dst_bytes
2436 ? (dst < adjusted_dst_end)
2437 : (dst < src - 1)))
4ed46869
KH
2438 {
2439 /* SRC_BASE remembers the start position in source in each loop.
2440 The loop will be exited when there's not enough source text
2441 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2442 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
2443 before exiting. */
2444 unsigned char *src_base = src;
2445 unsigned char c1 = *src++, c2, c3, c4;
2446
2447 if (coding->composing)
2448 {
2449 if (c1 == 0xA0)
2450 {
2451 ONE_MORE_BYTE (c1);
2452 c1 &= 0x7F;
2453 }
2454 else if (c1 >= 0xA0)
2455 c1 -= 0x20;
2456 else
2457 coding->composing = 0;
2458 }
2459
2460 switch (emacs_code_class[c1])
2461 {
2462 case EMACS_ascii_code:
a5d301df
KH
2463 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2464 break;
2465
4ed46869
KH
2466 case EMACS_control_code:
2467 *dst++ = c1;
d46c5b12 2468 coding->consumed_char++;
4ed46869
KH
2469 break;
2470
2471 case EMACS_carriage_return_code:
d46c5b12 2472 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
4ed46869
KH
2473 {
2474 *dst++ = c1;
d46c5b12 2475 coding->consumed_char++;
4ed46869
KH
2476 break;
2477 }
2478 /* fall down to treat '\r' as '\n' ... */
2479
2480 case EMACS_linefeed_code:
2481 if (coding->eol_type == CODING_EOL_LF
0ef69138 2482 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2483 *dst++ = '\n';
2484 else if (coding->eol_type == CODING_EOL_CRLF)
2485 *dst++ = '\r', *dst++ = '\n';
2486 else
2487 *dst++ = '\r';
d46c5b12 2488 coding->consumed_char++;
4ed46869
KH
2489 break;
2490
2491 case EMACS_leading_code_2:
2492 ONE_MORE_BYTE (c2);
a5d301df 2493 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
2494 break;
2495
2496 case EMACS_leading_code_3:
2497 TWO_MORE_BYTES (c2, c3);
a5d301df 2498 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
4ed46869
KH
2499 break;
2500
2501 case EMACS_leading_code_4:
2502 THREE_MORE_BYTES (c2, c3, c4);
a5d301df 2503 ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
4ed46869
KH
2504 break;
2505
2506 case EMACS_leading_code_composition:
2507 coding->composing = 1;
2508 break;
2509
2510 default: /* i.e. case EMACS_invalid_code: */
2511 *dst++ = c1;
d46c5b12 2512 coding->consumed_char++;
4ed46869
KH
2513 }
2514 continue;
2515
2516 label_end_of_loop:
d46c5b12
KH
2517 result = CODING_FINISH_INSUFFICIENT_SRC;
2518 src = src_base;
4ed46869
KH
2519 break;
2520 }
2521
d46c5b12
KH
2522 if (result == CODING_FINISH_NORMAL
2523 && src < src_end)
2524 result = CODING_FINISH_INSUFFICIENT_DST;
2525 coding->consumed = src - source;
2526 coding->produced = coding->produced_char = dst - destination;
2527 return result;
4ed46869
KH
2528}
2529
2530\f
1397dc18
KH
2531/*** 5. CCL handlers ***/
2532
2533/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2534 Check if a text is encoded in a coding system of which
2535 encoder/decoder are written in CCL program. If it is, return
2536 CODING_CATEGORY_MASK_CCL, else return 0. */
2537
2538int
2539detect_coding_ccl (src, src_end)
2540 unsigned char *src, *src_end;
2541{
2542 unsigned char *valid;
2543
2544 /* No coding system is assigned to coding-category-ccl. */
2545 if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
2546 return 0;
2547
2548 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
2549 while (src < src_end)
2550 {
2551 if (! valid[*src]) return 0;
2552 src++;
2553 }
2554 return CODING_CATEGORY_MASK_CCL;
2555}
2556
2557\f
2558/*** 6. End-of-line handlers ***/
4ed46869
KH
2559
2560/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2561 This function is called only when `coding->eol_type' is
2562 CODING_EOL_CRLF or CODING_EOL_CR. */
2563
dfcf069d 2564int
d46c5b12 2565decode_eol (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
2566 struct coding_system *coding;
2567 unsigned char *source, *destination;
2568 int src_bytes, dst_bytes;
4ed46869
KH
2569{
2570 unsigned char *src = source;
2571 unsigned char *src_end = source + src_bytes;
2572 unsigned char *dst = destination;
2573 unsigned char *dst_end = destination + dst_bytes;
fb88bf2d 2574 unsigned char c;
d46c5b12
KH
2575 int result = CODING_FINISH_NORMAL;
2576
fb88bf2d
KH
2577 coding->fake_multibyte = 0;
2578
d46c5b12
KH
2579 if (src_bytes <= 0)
2580 return result;
4ed46869
KH
2581
2582 switch (coding->eol_type)
2583 {
2584 case CODING_EOL_CRLF:
2585 {
2586 /* Since the maximum bytes produced by each loop is 2, we
2587 subtract 1 from DST_END to assure overflow checking is
2588 necessary only at the head of loop. */
2589 unsigned char *adjusted_dst_end = dst_end - 1;
2590
d46c5b12
KH
2591 while (src < src_end && (dst_bytes
2592 ? (dst < adjusted_dst_end)
2593 : (dst < src - 1)))
4ed46869
KH
2594 {
2595 unsigned char *src_base = src;
fb88bf2d
KH
2596
2597 c = *src++;
4ed46869
KH
2598 if (c == '\r')
2599 {
2600 ONE_MORE_BYTE (c);
fdfcf19d
KH
2601 if (c == '\n')
2602 *dst++ = c;
2603 else
d46c5b12
KH
2604 {
2605 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2606 {
2607 result = CODING_FINISH_INCONSISTENT_EOL;
2608 goto label_end_of_loop_2;
2609 }
fdfcf19d 2610 src--;
d46c5b12 2611 *dst++ = '\r';
fb88bf2d
KH
2612 if (BASE_LEADING_CODE_P (c))
2613 coding->fake_multibyte = 1;
d46c5b12 2614 }
4ed46869 2615 }
d46c5b12
KH
2616 else if (c == '\n'
2617 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2618 {
2619 result = CODING_FINISH_INCONSISTENT_EOL;
2620 goto label_end_of_loop_2;
2621 }
4ed46869 2622 else
fb88bf2d
KH
2623 {
2624 *dst++ = c;
2625 if (BASE_LEADING_CODE_P (c))
2626 coding->fake_multibyte = 1;
2627 }
4ed46869
KH
2628 continue;
2629
2630 label_end_of_loop:
d46c5b12
KH
2631 result = CODING_FINISH_INSUFFICIENT_SRC;
2632 label_end_of_loop_2:
4ed46869
KH
2633 src = src_base;
2634 break;
2635 }
fdfcf19d
KH
2636 if (src < src_end)
2637 {
2638 if (result == CODING_FINISH_NORMAL)
2639 result = CODING_FINISH_INSUFFICIENT_DST;
2640 else if (result != CODING_FINISH_INCONSISTENT_EOL
2641 && coding->mode & CODING_MODE_LAST_BLOCK)
2642 {
2643 /* This is the last block of the text to be decoded.
2644 We flush out all remaining codes. */
2645 src_bytes = src_end - src;
2646 if (dst_bytes && (dst_end - dst < src_bytes))
2647 src_bytes = dst_end - dst;
2648 bcopy (src, dst, src_bytes);
2649 dst += src_bytes;
2650 src += src_bytes;
2651 }
2652 }
4ed46869 2653 }
d46c5b12 2654 break;
4ed46869
KH
2655
2656 case CODING_EOL_CR:
d46c5b12
KH
2657 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2658 {
fb88bf2d
KH
2659 while (src < src_end)
2660 {
2661 if ((c = *src++) == '\n')
2662 break;
2663 if (BASE_LEADING_CODE_P (c))
2664 coding->fake_multibyte = 1;
2665 }
d46c5b12
KH
2666 if (*--src == '\n')
2667 {
2668 src_bytes = src - source;
2669 result = CODING_FINISH_INCONSISTENT_EOL;
2670 }
2671 }
2672 if (dst_bytes && src_bytes > dst_bytes)
2673 {
2674 result = CODING_FINISH_INSUFFICIENT_DST;
2675 src_bytes = dst_bytes;
2676 }
2677 if (dst_bytes)
2678 bcopy (source, destination, src_bytes);
2679 else
2680 safe_bcopy (source, destination, src_bytes);
2681 src = source + src_bytes;
2682 while (src_bytes--) if (*dst++ == '\r') dst[-1] = '\n';
4ed46869
KH
2683 break;
2684
2685 default: /* i.e. case: CODING_EOL_LF */
d46c5b12
KH
2686 if (dst_bytes && src_bytes > dst_bytes)
2687 {
2688 result = CODING_FINISH_INSUFFICIENT_DST;
2689 src_bytes = dst_bytes;
2690 }
2691 if (dst_bytes)
2692 bcopy (source, destination, src_bytes);
2693 else
2694 safe_bcopy (source, destination, src_bytes);
2695 src += src_bytes;
993824c9 2696 dst += src_bytes;
fb88bf2d 2697 coding->fake_multibyte = 1;
4ed46869
KH
2698 break;
2699 }
2700
d46c5b12
KH
2701 coding->consumed = coding->consumed_char = src - source;
2702 coding->produced = coding->produced_char = dst - destination;
2703 return result;
4ed46869
KH
2704}
2705
2706/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2707 format of end-of-line according to `coding->eol_type'. If
d46c5b12
KH
2708 `coding->mode & CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code
2709 '\r' in source text also means end-of-line. */
4ed46869 2710
dfcf069d 2711int
d46c5b12 2712encode_eol (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
2713 struct coding_system *coding;
2714 unsigned char *source, *destination;
2715 int src_bytes, dst_bytes;
4ed46869
KH
2716{
2717 unsigned char *src = source;
2718 unsigned char *dst = destination;
d46c5b12 2719 int result = CODING_FINISH_NORMAL;
4ed46869 2720
fb88bf2d
KH
2721 coding->fake_multibyte = 0;
2722
d46c5b12
KH
2723 if (coding->eol_type == CODING_EOL_CRLF)
2724 {
2725 unsigned char c;
2726 unsigned char *src_end = source + src_bytes;
2727 unsigned char *dst_end = destination + dst_bytes;
2728 /* Since the maximum bytes produced by each loop is 2, we
2729 subtract 1 from DST_END to assure overflow checking is
2730 necessary only at the head of loop. */
2731 unsigned char *adjusted_dst_end = dst_end - 1;
2732
2733 while (src < src_end && (dst_bytes
2734 ? (dst < adjusted_dst_end)
2735 : (dst < src - 1)))
2736 {
2737 c = *src++;
2738 if (c == '\n'
2739 || (c == '\r' && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)))
2740 *dst++ = '\r', *dst++ = '\n';
2741 else
fb88bf2d
KH
2742 {
2743 *dst++ = c;
2744 if (BASE_LEADING_CODE_P (c))
2745 coding->fake_multibyte = 1;
2746 }
d46c5b12
KH
2747 }
2748 if (src < src_end)
2749 result = CODING_FINISH_INSUFFICIENT_DST;
2750 }
2751 else
4ed46869 2752 {
fb88bf2d
KH
2753 unsigned char c;
2754
d46c5b12 2755 if (dst_bytes && src_bytes > dst_bytes)
4ed46869 2756 {
d46c5b12
KH
2757 src_bytes = dst_bytes;
2758 result = CODING_FINISH_INSUFFICIENT_DST;
2759 }
2760 if (dst_bytes)
2761 bcopy (source, destination, src_bytes);
2762 else
993824c9
RS
2763 safe_bcopy (source, destination, src_bytes);
2764 dst_bytes = src_bytes;
2765 if (coding->eol_type == CODING_EOL_CR)
d46c5b12
KH
2766 {
2767 while (src_bytes--)
fb88bf2d
KH
2768 {
2769 if ((c = *dst++) == '\n')
2770 dst[-1] = '\r';
2771 else if (BASE_LEADING_CODE_P (c))
993824c9 2772 coding->fake_multibyte = 1;
fb88bf2d 2773 }
d46c5b12 2774 }
fb88bf2d 2775 else
d46c5b12 2776 {
fb88bf2d
KH
2777 if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2778 {
2779 while (src_bytes--)
2780 if (*dst++ == '\r') dst[-1] = '\n';
2781 }
2782 coding->fake_multibyte = 1;
4ed46869 2783 }
fb88bf2d
KH
2784 src = source + dst_bytes;
2785 dst = destination + dst_bytes;
4ed46869
KH
2786 }
2787
d46c5b12
KH
2788 coding->consumed = coding->consumed_char = src - source;
2789 coding->produced = coding->produced_char = dst - destination;
2790 return result;
4ed46869
KH
2791}
2792
2793\f
1397dc18 2794/*** 7. C library functions ***/
4ed46869
KH
2795
2796/* In Emacs Lisp, coding system is represented by a Lisp symbol which
2797 has a property `coding-system'. The value of this property is a
2798 vector of length 5 (called as coding-vector). Among elements of
2799 this vector, the first (element[0]) and the fifth (element[4])
2800 carry important information for decoding/encoding. Before
2801 decoding/encoding, this information should be set in fields of a
2802 structure of type `coding_system'.
2803
2804 A value of property `coding-system' can be a symbol of another
2805 subsidiary coding-system. In that case, Emacs gets coding-vector
2806 from that symbol.
2807
2808 `element[0]' contains information to be set in `coding->type'. The
2809 value and its meaning is as follows:
2810
0ef69138
KH
2811 0 -- coding_type_emacs_mule
2812 1 -- coding_type_sjis
2813 2 -- coding_type_iso2022
2814 3 -- coding_type_big5
2815 4 -- coding_type_ccl encoder/decoder written in CCL
2816 nil -- coding_type_no_conversion
2817 t -- coding_type_undecided (automatic conversion on decoding,
2818 no-conversion on encoding)
4ed46869
KH
2819
2820 `element[4]' contains information to be set in `coding->flags' and
2821 `coding->spec'. The meaning varies by `coding->type'.
2822
2823 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2824 of length 32 (of which the first 17 sub-elements are used now).
2825 Meanings of these sub-elements are:
2826
2827 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2828 If the value is an integer of valid charset, the charset is
2829 assumed to be designated to graphic register N initially.
2830
2831 If the value is minus, it is a minus value of charset which
2832 reserves graphic register N, which means that the charset is
2833 not designated initially but should be designated to graphic
2834 register N just before encoding a character in that charset.
2835
2836 If the value is nil, graphic register N is never used on
2837 encoding.
2838
2839 sub-element[N] where N is 4 through 16: to be set in `coding->flags'
2840 Each value takes t or nil. See the section ISO2022 of
2841 `coding.h' for more information.
2842
2843 If `coding->type' is `coding_type_big5', element[4] is t to denote
2844 BIG5-ETen or nil to denote BIG5-HKU.
2845
2846 If `coding->type' takes the other value, element[4] is ignored.
2847
2848 Emacs Lisp's coding system also carries information about format of
2849 end-of-line in a value of property `eol-type'. If the value is
2850 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2851 means CODING_EOL_CR. If it is not integer, it should be a vector
2852 of subsidiary coding systems of which property `eol-type' has one
2853 of above values.
2854
2855*/
2856
2857/* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2858 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2859 is setup so that no conversion is necessary and return -1, else
2860 return 0. */
2861
2862int
e0e989f6
KH
2863setup_coding_system (coding_system, coding)
2864 Lisp_Object coding_system;
4ed46869
KH
2865 struct coding_system *coding;
2866{
d46c5b12 2867 Lisp_Object coding_spec, coding_type, eol_type, plist;
4608c386 2868 Lisp_Object val;
70c22245 2869 int i;
4ed46869 2870
d46c5b12 2871 /* Initialize some fields required for all kinds of coding systems. */
774324d6 2872 coding->symbol = coding_system;
d46c5b12
KH
2873 coding->common_flags = 0;
2874 coding->mode = 0;
2875 coding->heading_ascii = -1;
2876 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
1f5dbf34
KH
2877
2878 if (NILP (coding_system))
2879 goto label_invalid_coding_system;
2880
4608c386 2881 coding_spec = Fget (coding_system, Qcoding_system);
1f5dbf34 2882
4608c386
KH
2883 if (!VECTORP (coding_spec)
2884 || XVECTOR (coding_spec)->size != 5
2885 || !CONSP (XVECTOR (coding_spec)->contents[3]))
4ed46869 2886 goto label_invalid_coding_system;
4608c386 2887
d46c5b12
KH
2888 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
2889 if (VECTORP (eol_type))
2890 {
2891 coding->eol_type = CODING_EOL_UNDECIDED;
2892 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2893 }
2894 else if (XFASTINT (eol_type) == 1)
2895 {
2896 coding->eol_type = CODING_EOL_CRLF;
2897 coding->common_flags
2898 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2899 }
2900 else if (XFASTINT (eol_type) == 2)
2901 {
2902 coding->eol_type = CODING_EOL_CR;
2903 coding->common_flags
2904 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2905 }
2906 else
2907 coding->eol_type = CODING_EOL_LF;
2908
2909 coding_type = XVECTOR (coding_spec)->contents[0];
2910 /* Try short cut. */
2911 if (SYMBOLP (coding_type))
2912 {
2913 if (EQ (coding_type, Qt))
2914 {
2915 coding->type = coding_type_undecided;
2916 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
2917 }
2918 else
2919 coding->type = coding_type_no_conversion;
2920 return 0;
2921 }
2922
2923 /* Initialize remaining fields. */
2924 coding->composing = 0;
a63063ae 2925 coding->composed_chars = 0;
d46c5b12
KH
2926
2927 /* Get values of coding system properties:
2928 `post-read-conversion', `pre-write-conversion',
f967223b 2929 `translation-table-for-decode', `translation-table-for-encode'. */
4608c386
KH
2930 plist = XVECTOR (coding_spec)->contents[3];
2931 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
2932 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
f967223b 2933 val = Fplist_get (plist, Qtranslation_table_for_decode);
4608c386 2934 if (SYMBOLP (val))
f967223b
KH
2935 val = Fget (val, Qtranslation_table_for_decode);
2936 coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
2937 val = Fplist_get (plist, Qtranslation_table_for_encode);
4608c386 2938 if (SYMBOLP (val))
f967223b
KH
2939 val = Fget (val, Qtranslation_table_for_encode);
2940 coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
d46c5b12
KH
2941 val = Fplist_get (plist, Qcoding_category);
2942 if (!NILP (val))
2943 {
2944 val = Fget (val, Qcoding_category_index);
2945 if (INTEGERP (val))
2946 coding->category_idx = XINT (val);
2947 else
2948 goto label_invalid_coding_system;
2949 }
2950 else
2951 goto label_invalid_coding_system;
4608c386 2952
70c22245
KH
2953 val = Fplist_get (plist, Qsafe_charsets);
2954 if (EQ (val, Qt))
2955 {
2956 for (i = 0; i <= MAX_CHARSET; i++)
2957 coding->safe_charsets[i] = 1;
2958 }
2959 else
2960 {
2961 bzero (coding->safe_charsets, MAX_CHARSET + 1);
2962 while (CONSP (val))
2963 {
2964 if ((i = get_charset_id (XCONS (val)->car)) >= 0)
2965 coding->safe_charsets[i] = 1;
2966 val = XCONS (val)->cdr;
2967 }
2968 }
2969
d46c5b12 2970 switch (XFASTINT (coding_type))
4ed46869
KH
2971 {
2972 case 0:
0ef69138 2973 coding->type = coding_type_emacs_mule;
c952af22
KH
2974 if (!NILP (coding->post_read_conversion))
2975 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
2976 if (!NILP (coding->pre_write_conversion))
2977 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4ed46869
KH
2978 break;
2979
2980 case 1:
2981 coding->type = coding_type_sjis;
c952af22
KH
2982 coding->common_flags
2983 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869
KH
2984 break;
2985
2986 case 2:
2987 coding->type = coding_type_iso2022;
c952af22
KH
2988 coding->common_flags
2989 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 2990 {
70c22245 2991 Lisp_Object val, temp;
4ed46869 2992 Lisp_Object *flags;
d46c5b12 2993 int i, charset, reg_bits = 0;
4ed46869 2994
4608c386 2995 val = XVECTOR (coding_spec)->contents[4];
f44d27ce 2996
4ed46869
KH
2997 if (!VECTORP (val) || XVECTOR (val)->size != 32)
2998 goto label_invalid_coding_system;
2999
3000 flags = XVECTOR (val)->contents;
3001 coding->flags
3002 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3003 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3004 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3005 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3006 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3007 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3008 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3009 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
e0e989f6
KH
3010 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3011 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
c4825358
KH
3012 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3013 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3f003981 3014 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
c4825358 3015 );
4ed46869
KH
3016
3017 /* Invoke graphic register 0 to plane 0. */
3018 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3019 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
3020 CODING_SPEC_ISO_INVOCATION (coding, 1)
3021 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3022 /* Not single shifting at first. */
6e85d753 3023 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
e0e989f6 3024 /* Beginning of buffer should also be regarded as bol. */
6e85d753 3025 CODING_SPEC_ISO_BOL (coding) = 1;
4ed46869 3026
70c22245
KH
3027 for (charset = 0; charset <= MAX_CHARSET; charset++)
3028 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3029 val = Vcharset_revision_alist;
3030 while (CONSP (val))
3031 {
3032 charset = get_charset_id (Fcar_safe (XCONS (val)->car));
3033 if (charset >= 0
3034 && (temp = Fcdr_safe (XCONS (val)->car), INTEGERP (temp))
3035 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3036 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3037 val = XCONS (val)->cdr;
3038 }
3039
4ed46869
KH
3040 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3041 FLAGS[REG] can be one of below:
3042 integer CHARSET: CHARSET occupies register I,
3043 t: designate nothing to REG initially, but can be used
3044 by any charsets,
3045 list of integer, nil, or t: designate the first
3046 element (if integer) to REG initially, the remaining
3047 elements (if integer) is designated to REG on request,
d46c5b12 3048 if an element is t, REG can be used by any charsets,
4ed46869 3049 nil: REG is never used. */
467e7675 3050 for (charset = 0; charset <= MAX_CHARSET; charset++)
1ba9e4ab
KH
3051 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3052 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
4ed46869
KH
3053 for (i = 0; i < 4; i++)
3054 {
3055 if (INTEGERP (flags[i])
e0e989f6
KH
3056 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
3057 || (charset = get_charset_id (flags[i])) >= 0)
4ed46869
KH
3058 {
3059 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3060 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3061 }
3062 else if (EQ (flags[i], Qt))
3063 {
3064 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
d46c5b12
KH
3065 reg_bits |= 1 << i;
3066 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
4ed46869
KH
3067 }
3068 else if (CONSP (flags[i]))
3069 {
84d60297
RS
3070 Lisp_Object tail;
3071 tail = flags[i];
4ed46869 3072
d46c5b12 3073 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
4ed46869
KH
3074 if (INTEGERP (XCONS (tail)->car)
3075 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
3076 CHARSET_VALID_P (charset))
3077 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
4ed46869
KH
3078 {
3079 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3080 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3081 }
3082 else
3083 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3084 tail = XCONS (tail)->cdr;
3085 while (CONSP (tail))
3086 {
3087 if (INTEGERP (XCONS (tail)->car)
3088 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
3089 CHARSET_VALID_P (charset))
3090 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
70c22245
KH
3091 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3092 = i;
4ed46869 3093 else if (EQ (XCONS (tail)->car, Qt))
d46c5b12 3094 reg_bits |= 1 << i;
4ed46869
KH
3095 tail = XCONS (tail)->cdr;
3096 }
3097 }
3098 else
3099 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3100
3101 CODING_SPEC_ISO_DESIGNATION (coding, i)
3102 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3103 }
3104
d46c5b12 3105 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
4ed46869
KH
3106 {
3107 /* REG 1 can be used only by locking shift in 7-bit env. */
3108 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
d46c5b12 3109 reg_bits &= ~2;
4ed46869
KH
3110 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3111 /* Without any shifting, only REG 0 and 1 can be used. */
d46c5b12 3112 reg_bits &= 3;
4ed46869
KH
3113 }
3114
d46c5b12
KH
3115 if (reg_bits)
3116 for (charset = 0; charset <= MAX_CHARSET; charset++)
6e85d753 3117 {
d46c5b12
KH
3118 if (CHARSET_VALID_P (charset))
3119 {
3120 /* There exist some default graphic registers to be
3121 used CHARSET. */
3122
3123 /* We had better avoid designating a charset of
3124 CHARS96 to REG 0 as far as possible. */
3125 if (CHARSET_CHARS (charset) == 96)
3126 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3127 = (reg_bits & 2
3128 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3129 else
3130 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3131 = (reg_bits & 1
3132 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3133 }
6e85d753 3134 }
4ed46869 3135 }
c952af22 3136 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
d46c5b12 3137 coding->spec.iso2022.last_invalid_designation_register = -1;
4ed46869
KH
3138 break;
3139
3140 case 3:
3141 coding->type = coding_type_big5;
c952af22
KH
3142 coding->common_flags
3143 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 3144 coding->flags
4608c386 3145 = (NILP (XVECTOR (coding_spec)->contents[4])
4ed46869
KH
3146 ? CODING_FLAG_BIG5_HKU
3147 : CODING_FLAG_BIG5_ETEN);
3148 break;
3149
3150 case 4:
3151 coding->type = coding_type_ccl;
c952af22
KH
3152 coding->common_flags
3153 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 3154 {
84d60297 3155 Lisp_Object val;
d21ca14d
KH
3156 Lisp_Object decoder, encoder;
3157
84d60297 3158 val = XVECTOR (coding_spec)->contents[4];
4ed46869 3159 if (CONSP (val)
d21ca14d
KH
3160 && SYMBOLP (XCONS (val)->car)
3161 && !NILP (decoder = Fget (XCONS (val)->car, Qccl_program_idx))
f82423d7 3162 && !NILP (decoder = Fcdr (Faref (Vccl_program_table, decoder)))
d21ca14d
KH
3163 && SYMBOLP (XCONS (val)->cdr)
3164 && !NILP (encoder = Fget (XCONS (val)->cdr, Qccl_program_idx))
f82423d7 3165 && !NILP (encoder = Fcdr (Faref (Vccl_program_table, encoder))))
4ed46869 3166 {
d21ca14d
KH
3167 setup_ccl_program (&(coding->spec.ccl.decoder), decoder);
3168 setup_ccl_program (&(coding->spec.ccl.encoder), encoder);
4ed46869
KH
3169 }
3170 else
3171 goto label_invalid_coding_system;
1397dc18
KH
3172
3173 bzero (coding->spec.ccl.valid_codes, 256);
3174 val = Fplist_get (plist, Qvalid_codes);
3175 if (CONSP (val))
3176 {
3177 Lisp_Object this;
3178
7b179c2d 3179 for (; CONSP (val); val = XCONS (val)->cdr)
1397dc18 3180 {
7b179c2d 3181 this = XCONS (val)->car;
1397dc18
KH
3182 if (INTEGERP (this)
3183 && XINT (this) >= 0 && XINT (this) < 256)
3184 coding->spec.ccl.valid_codes[XINT (this)] = 1;
3185 else if (CONSP (this)
3186 && INTEGERP (XCONS (this)->car)
3187 && INTEGERP (XCONS (this)->cdr))
3188 {
3189 int start = XINT (XCONS (this)->car);
3190 int end = XINT (XCONS (this)->cdr);
3191
3192 if (start >= 0 && start <= end && end < 256)
e133c8fa 3193 while (start <= end)
1397dc18
KH
3194 coding->spec.ccl.valid_codes[start++] = 1;
3195 }
3196 }
3197 }
4ed46869 3198 }
c952af22 3199 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
4ed46869
KH
3200 break;
3201
27901516
KH
3202 case 5:
3203 coding->type = coding_type_raw_text;
3204 break;
3205
4ed46869 3206 default:
d46c5b12 3207 goto label_invalid_coding_system;
4ed46869
KH
3208 }
3209 return 0;
3210
3211 label_invalid_coding_system:
3212 coding->type = coding_type_no_conversion;
d46c5b12 3213 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
c952af22 3214 coding->common_flags = 0;
dec137e5 3215 coding->eol_type = CODING_EOL_LF;
d46c5b12 3216 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
4ed46869
KH
3217 return -1;
3218}
3219
54f78171
KH
3220/* Setup raw-text or one of its subsidiaries in the structure
3221 coding_system CODING according to the already setup value eol_type
3222 in CODING. CODING should be setup for some coding system in
3223 advance. */
3224
3225void
3226setup_raw_text_coding_system (coding)
3227 struct coding_system *coding;
3228{
3229 if (coding->type != coding_type_raw_text)
3230 {
3231 coding->symbol = Qraw_text;
3232 coding->type = coding_type_raw_text;
3233 if (coding->eol_type != CODING_EOL_UNDECIDED)
3234 {
84d60297
RS
3235 Lisp_Object subsidiaries;
3236 subsidiaries = Fget (Qraw_text, Qeol_type);
54f78171
KH
3237
3238 if (VECTORP (subsidiaries)
3239 && XVECTOR (subsidiaries)->size == 3)
3240 coding->symbol
3241 = XVECTOR (subsidiaries)->contents[coding->eol_type];
3242 }
3243 }
3244 return;
3245}
3246
4ed46869
KH
3247/* Emacs has a mechanism to automatically detect a coding system if it
3248 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
3249 it's impossible to distinguish some coding systems accurately
3250 because they use the same range of codes. So, at first, coding
3251 systems are categorized as follows:
3252
0ef69138 3253 o coding-category-emacs-mule
4ed46869
KH
3254
3255 The category for a coding system which has the same code range
3256 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 3257 symbol) `emacs-mule' by default.
4ed46869
KH
3258
3259 o coding-category-sjis
3260
3261 The category for a coding system which has the same code range
3262 as SJIS. Assigned the coding-system (Lisp
7717c392 3263 symbol) `japanese-shift-jis' by default.
4ed46869
KH
3264
3265 o coding-category-iso-7
3266
3267 The category for a coding system which has the same code range
7717c392 3268 as ISO2022 of 7-bit environment. This doesn't use any locking
d46c5b12
KH
3269 shift and single shift functions. This can encode/decode all
3270 charsets. Assigned the coding-system (Lisp symbol)
3271 `iso-2022-7bit' by default.
3272
3273 o coding-category-iso-7-tight
3274
3275 Same as coding-category-iso-7 except that this can
3276 encode/decode only the specified charsets.
4ed46869
KH
3277
3278 o coding-category-iso-8-1
3279
3280 The category for a coding system which has the same code range
3281 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
3282 for DIMENSION1 charset. This doesn't use any locking shift
3283 and single shift functions. Assigned the coding-system (Lisp
3284 symbol) `iso-latin-1' by default.
4ed46869
KH
3285
3286 o coding-category-iso-8-2
3287
3288 The category for a coding system which has the same code range
3289 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
3290 for DIMENSION2 charset. This doesn't use any locking shift
3291 and single shift functions. Assigned the coding-system (Lisp
3292 symbol) `japanese-iso-8bit' by default.
4ed46869 3293
7717c392 3294 o coding-category-iso-7-else
4ed46869
KH
3295
3296 The category for a coding system which has the same code range
7717c392
KH
3297 as ISO2022 of 7-bit environment but uses locking shift or
3298 single shift functions. Assigned the coding-system (Lisp
3299 symbol) `iso-2022-7bit-lock' by default.
3300
3301 o coding-category-iso-8-else
3302
3303 The category for a coding system which has the same code range
3304 as ISO2022 of 8-bit environemnt but uses locking shift or
3305 single shift functions. Assigned the coding-system (Lisp
3306 symbol) `iso-2022-8bit-ss2' by default.
4ed46869
KH
3307
3308 o coding-category-big5
3309
3310 The category for a coding system which has the same code range
3311 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 3312 `cn-big5' by default.
4ed46869 3313
1397dc18
KH
3314 o coding-category-ccl
3315
3316 The category for a coding system of which encoder/decoder is
3317 written in CCL programs. The default value is nil, i.e., no
3318 coding system is assigned.
3319
4ed46869
KH
3320 o coding-category-binary
3321
3322 The category for a coding system not categorized in any of the
3323 above. Assigned the coding-system (Lisp symbol)
e0e989f6 3324 `no-conversion' by default.
4ed46869
KH
3325
3326 Each of them is a Lisp symbol and the value is an actual
3327 `coding-system's (this is also a Lisp symbol) assigned by a user.
3328 What Emacs does actually is to detect a category of coding system.
3329 Then, it uses a `coding-system' assigned to it. If Emacs can't
3330 decide only one possible category, it selects a category of the
3331 highest priority. Priorities of categories are also specified by a
3332 user in a Lisp variable `coding-category-list'.
3333
3334*/
3335
66cfb530
KH
3336static
3337int ascii_skip_code[256];
3338
d46c5b12 3339/* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4ed46869
KH
3340 If it detects possible coding systems, return an integer in which
3341 appropriate flag bits are set. Flag bits are defined by macros
d46c5b12 3342 CODING_CATEGORY_MASK_XXX in `coding.h'.
4ed46869 3343
d46c5b12
KH
3344 How many ASCII characters are at the head is returned as *SKIP. */
3345
3346static int
3347detect_coding_mask (source, src_bytes, priorities, skip)
3348 unsigned char *source;
3349 int src_bytes, *priorities, *skip;
4ed46869
KH
3350{
3351 register unsigned char c;
d46c5b12 3352 unsigned char *src = source, *src_end = source + src_bytes;
66cfb530 3353 unsigned int mask;
d46c5b12 3354 int i;
4ed46869
KH
3355
3356 /* At first, skip all ASCII characters and control characters except
3357 for three ISO2022 specific control characters. */
66cfb530
KH
3358 ascii_skip_code[ISO_CODE_SO] = 0;
3359 ascii_skip_code[ISO_CODE_SI] = 0;
3360 ascii_skip_code[ISO_CODE_ESC] = 0;
3361
bcf26d6a 3362 label_loop_detect_coding:
66cfb530 3363 while (src < src_end && ascii_skip_code[*src]) src++;
d46c5b12 3364 *skip = src - source;
4ed46869
KH
3365
3366 if (src >= src_end)
3367 /* We found nothing other than ASCII. There's nothing to do. */
d46c5b12 3368 return 0;
4ed46869 3369
8a8147d6 3370 c = *src;
4ed46869
KH
3371 /* The text seems to be encoded in some multilingual coding system.
3372 Now, try to find in which coding system the text is encoded. */
3373 if (c < 0x80)
bcf26d6a
KH
3374 {
3375 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3376 /* C is an ISO2022 specific control code of C0. */
3377 mask = detect_coding_iso2022 (src, src_end);
1b2af4b0 3378 if (mask == 0)
d46c5b12
KH
3379 {
3380 /* No valid ISO2022 code follows C. Try again. */
3381 src++;
66cfb530
KH
3382 if (c == ISO_CODE_ESC)
3383 ascii_skip_code[ISO_CODE_ESC] = 1;
3384 else
3385 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
d46c5b12
KH
3386 goto label_loop_detect_coding;
3387 }
3388 if (priorities)
3389 goto label_return_highest_only;
bcf26d6a 3390 }
d46c5b12 3391 else
c4825358 3392 {
d46c5b12 3393 int try;
4ed46869 3394
d46c5b12
KH
3395 if (c < 0xA0)
3396 {
3397 /* C is the first byte of SJIS character code,
3398 or a leading-code of Emacs' internal format (emacs-mule). */
3399 try = CODING_CATEGORY_MASK_SJIS | CODING_CATEGORY_MASK_EMACS_MULE;
3400
3401 /* Or, if C is a special latin extra code,
3402 or is an ISO2022 specific control code of C1 (SS2 or SS3),
3403 or is an ISO2022 control-sequence-introducer (CSI),
3404 we should also consider the possibility of ISO2022 codings. */
3405 if ((VECTORP (Vlatin_extra_code_table)
3406 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3407 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3408 || (c == ISO_CODE_CSI
3409 && (src < src_end
3410 && (*src == ']'
3411 || ((*src == '0' || *src == '1' || *src == '2')
3412 && src + 1 < src_end
3413 && src[1] == ']')))))
3414 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3415 | CODING_CATEGORY_MASK_ISO_8BIT);
3416 }
c4825358 3417 else
d46c5b12
KH
3418 /* C is a character of ISO2022 in graphic plane right,
3419 or a SJIS's 1-byte character code (i.e. JISX0201),
3420 or the first byte of BIG5's 2-byte code. */
3421 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3422 | CODING_CATEGORY_MASK_ISO_8BIT
3423 | CODING_CATEGORY_MASK_SJIS
3424 | CODING_CATEGORY_MASK_BIG5);
3425
1397dc18
KH
3426 /* Or, we may have to consider the possibility of CCL. */
3427 if (coding_system_table[CODING_CATEGORY_IDX_CCL]
3428 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
3429 ->spec.ccl.valid_codes)[c])
3430 try |= CODING_CATEGORY_MASK_CCL;
3431
d46c5b12
KH
3432 mask = 0;
3433 if (priorities)
3434 {
3435 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3436 {
5ab13dd0 3437 if (priorities[i] & try & CODING_CATEGORY_MASK_ISO)
d46c5b12 3438 mask = detect_coding_iso2022 (src, src_end);
5ab13dd0 3439 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
d46c5b12 3440 mask = detect_coding_sjis (src, src_end);
5ab13dd0 3441 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
d46c5b12 3442 mask = detect_coding_big5 (src, src_end);
5ab13dd0 3443 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
d46c5b12 3444 mask = detect_coding_emacs_mule (src, src_end);
89fa8b36 3445 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
1397dc18 3446 mask = detect_coding_ccl (src, src_end);
5ab13dd0
RS
3447 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
3448 mask = CODING_CATEGORY_MASK_RAW_TEXT;
3449 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
3450 mask = CODING_CATEGORY_MASK_BINARY;
d46c5b12
KH
3451 if (mask)
3452 goto label_return_highest_only;
3453 }
3454 return CODING_CATEGORY_MASK_RAW_TEXT;
3455 }
3456 if (try & CODING_CATEGORY_MASK_ISO)
3457 mask |= detect_coding_iso2022 (src, src_end);
3458 if (try & CODING_CATEGORY_MASK_SJIS)
3459 mask |= detect_coding_sjis (src, src_end);
3460 if (try & CODING_CATEGORY_MASK_BIG5)
3461 mask |= detect_coding_big5 (src, src_end);
3462 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
1397dc18
KH
3463 mask |= detect_coding_emacs_mule (src, src_end);
3464 if (try & CODING_CATEGORY_MASK_CCL)
3465 mask |= detect_coding_ccl (src, src_end);
c4825358 3466 }
5ab13dd0 3467 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
d46c5b12
KH
3468
3469 label_return_highest_only:
3470 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3471 {
3472 if (mask & priorities[i])
3473 return priorities[i];
3474 }
3475 return CODING_CATEGORY_MASK_RAW_TEXT;
4ed46869
KH
3476}
3477
3478/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3479 The information of the detected coding system is set in CODING. */
3480
3481void
3482detect_coding (coding, src, src_bytes)
3483 struct coding_system *coding;
3484 unsigned char *src;
3485 int src_bytes;
3486{
d46c5b12
KH
3487 unsigned int idx;
3488 int skip, mask, i;
84d60297 3489 Lisp_Object val;
4ed46869 3490
84d60297 3491 val = Vcoding_category_list;
66cfb530 3492 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip);
d46c5b12 3493 coding->heading_ascii = skip;
4ed46869 3494
d46c5b12
KH
3495 if (!mask) return;
3496
3497 /* We found a single coding system of the highest priority in MASK. */
3498 idx = 0;
3499 while (mask && ! (mask & 1)) mask >>= 1, idx++;
3500 if (! mask)
3501 idx = CODING_CATEGORY_IDX_RAW_TEXT;
4ed46869 3502
d46c5b12
KH
3503 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3504
3505 if (coding->eol_type != CODING_EOL_UNDECIDED)
27901516 3506 {
84d60297 3507 Lisp_Object tmp;
d46c5b12 3508
84d60297 3509 tmp = Fget (val, Qeol_type);
d46c5b12
KH
3510 if (VECTORP (tmp))
3511 val = XVECTOR (tmp)->contents[coding->eol_type];
4ed46869 3512 }
d46c5b12
KH
3513 setup_coding_system (val, coding);
3514 /* Set this again because setup_coding_system reset this member. */
3515 coding->heading_ascii = skip;
4ed46869
KH
3516}
3517
d46c5b12
KH
3518/* Detect how end-of-line of a text of length SRC_BYTES pointed by
3519 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3520 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3521
3522 How many non-eol characters are at the head is returned as *SKIP. */
4ed46869 3523
bc4bc72a
RS
3524#define MAX_EOL_CHECK_COUNT 3
3525
d46c5b12
KH
3526static int
3527detect_eol_type (source, src_bytes, skip)
3528 unsigned char *source;
3529 int src_bytes, *skip;
4ed46869 3530{
d46c5b12 3531 unsigned char *src = source, *src_end = src + src_bytes;
4ed46869 3532 unsigned char c;
bc4bc72a
RS
3533 int total = 0; /* How many end-of-lines are found so far. */
3534 int eol_type = CODING_EOL_UNDECIDED;
3535 int this_eol_type;
4ed46869 3536
d46c5b12
KH
3537 *skip = 0;
3538
bc4bc72a 3539 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4ed46869
KH
3540 {
3541 c = *src++;
bc4bc72a 3542 if (c == '\n' || c == '\r')
4ed46869 3543 {
d46c5b12
KH
3544 if (*skip == 0)
3545 *skip = src - 1 - source;
bc4bc72a
RS
3546 total++;
3547 if (c == '\n')
3548 this_eol_type = CODING_EOL_LF;
3549 else if (src >= src_end || *src != '\n')
3550 this_eol_type = CODING_EOL_CR;
4ed46869 3551 else
bc4bc72a
RS
3552 this_eol_type = CODING_EOL_CRLF, src++;
3553
3554 if (eol_type == CODING_EOL_UNDECIDED)
3555 /* This is the first end-of-line. */
3556 eol_type = this_eol_type;
3557 else if (eol_type != this_eol_type)
d46c5b12
KH
3558 {
3559 /* The found type is different from what found before. */
3560 eol_type = CODING_EOL_INCONSISTENT;
3561 break;
3562 }
4ed46869
KH
3563 }
3564 }
bc4bc72a 3565
d46c5b12
KH
3566 if (*skip == 0)
3567 *skip = src_end - source;
85a02ca4 3568 return eol_type;
4ed46869
KH
3569}
3570
3571/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3572 is encoded. If it detects an appropriate format of end-of-line, it
3573 sets the information in *CODING. */
3574
3575void
3576detect_eol (coding, src, src_bytes)
3577 struct coding_system *coding;
3578 unsigned char *src;
3579 int src_bytes;
3580{
4608c386 3581 Lisp_Object val;
d46c5b12
KH
3582 int skip;
3583 int eol_type = detect_eol_type (src, src_bytes, &skip);
3584
3585 if (coding->heading_ascii > skip)
3586 coding->heading_ascii = skip;
3587 else
3588 skip = coding->heading_ascii;
4ed46869 3589
0ef69138 3590 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869 3591 return;
27901516
KH
3592 if (eol_type == CODING_EOL_INCONSISTENT)
3593 {
3594#if 0
3595 /* This code is suppressed until we find a better way to
992f23f2 3596 distinguish raw text file and binary file. */
27901516
KH
3597
3598 /* If we have already detected that the coding is raw-text, the
3599 coding should actually be no-conversion. */
3600 if (coding->type == coding_type_raw_text)
3601 {
3602 setup_coding_system (Qno_conversion, coding);
3603 return;
3604 }
3605 /* Else, let's decode only text code anyway. */
3606#endif /* 0 */
1b2af4b0 3607 eol_type = CODING_EOL_LF;
27901516
KH
3608 }
3609
4608c386 3610 val = Fget (coding->symbol, Qeol_type);
4ed46869 3611 if (VECTORP (val) && XVECTOR (val)->size == 3)
d46c5b12
KH
3612 {
3613 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
3614 coding->heading_ascii = skip;
3615 }
3616}
3617
/* Number of spare bytes added on top of every conversion buffer size
   computed from a magnification factor.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which decoding may enlarge text encoded in CODING, a
   pointer to struct coding_system.  CODING is parenthesized at every
   use so that any pointer-valued expression (e.g. `&cs') works, but
   it is evaluated more than once, so it must not have side
   effects.  */
#define DECODING_BUFFER_MAG(coding)                                     \
  ((coding)->type == coding_type_iso2022                                \
   ? 3                                                                  \
   : (((coding)->type == coding_type_sjis                               \
       || (coding)->type == coding_type_big5)                           \
      ? 2                                                               \
      : ((coding)->type == coding_type_raw_text                         \
         ? 1                                                            \
         : ((coding)->type == coding_type_ccl                           \
            ? (coding)->spec.ccl.decoder.buf_magnification              \
            : 2))))
3630
3631/* Return maximum size (bytes) of a buffer enough for decoding
3632 SRC_BYTES of text encoded in CODING. */
3633
3634int
3635decoding_buffer_size (coding, src_bytes)
3636 struct coding_system *coding;
3637 int src_bytes;
3638{
3639 return (src_bytes * DECODING_BUFFER_MAG (coding)
3640 + CONVERSION_BUFFER_EXTRA_ROOM);
3641}
3642
3643/* Return maximum size (bytes) of a buffer enough for encoding
3644 SRC_BYTES of text to CODING. */
3645
3646int
3647encoding_buffer_size (coding, src_bytes)
3648 struct coding_system *coding;
3649 int src_bytes;
3650{
3651 int magnification;
3652
3653 if (coding->type == coding_type_ccl)
3654 magnification = coding->spec.ccl.encoder.buf_magnification;
3655 else
3656 magnification = 3;
3657
3658 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3659}
3660
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* The shared work buffer handed out by get_conversion_buffer, and
   its current size in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.  The shared buffer is replaced by a larger
   one when it is too small; the old contents are not preserved.

   NOTE(review): this comment used to say that NULL is returned when
   memory runs out, but nothing here checks the result of xmalloc;
   presumably xmalloc deals with allocation failure itself --
   confirm.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size * 2;

      /* Doubling zero never grows, so when no buffer has been set up
         yet start from the minimum size instead of looping forever.  */
      if (real_size < MINIMUM_CONVERSION_BUFFER_SIZE)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;
      while (real_size < size)
        real_size *= 2;

      /* Allocate the new buffer before releasing the old one.  */
      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
3689
3690int
3691ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
3692 struct coding_system *coding;
3693 unsigned char *source, *destination;
3694 int src_bytes, dst_bytes, encodep;
3695{
3696 struct ccl_program *ccl
3697 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
3698 int result;
3699
ae9ff118 3700 ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
7b179c2d 3701
d46c5b12
KH
3702 coding->produced = ccl_driver (ccl, source, destination,
3703 src_bytes, dst_bytes, &(coding->consumed));
69f76525 3704 coding->produced_char
48942766
KH
3705 = (encodep
3706 ? coding->produced
3707 : multibyte_chars_in_text (destination, coding->produced));
69f76525
KH
3708 coding->consumed_char
3709 = multibyte_chars_in_text (source, coding->consumed);
3710
d46c5b12
KH
3711 switch (ccl->status)
3712 {
3713 case CCL_STAT_SUSPEND_BY_SRC:
3714 result = CODING_FINISH_INSUFFICIENT_SRC;
3715 break;
3716 case CCL_STAT_SUSPEND_BY_DST:
3717 result = CODING_FINISH_INSUFFICIENT_DST;
3718 break;
9864ebce
KH
3719 case CCL_STAT_QUIT:
3720 case CCL_STAT_INVALID_CMD:
3721 result = CODING_FINISH_INTERRUPT;
3722 break;
d46c5b12
KH
3723 default:
3724 result = CODING_FINISH_NORMAL;
3725 break;
3726 }
3727 return result;
4ed46869
KH
3728}
3729
3730/* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
3731 decoding, it may detect coding system and format of end-of-line if
52d41803
KH
3732 those are not yet decided.
3733
3734 This function does not make full use of DESTINATION buffer. For
3735 instance, if coding->type is coding_type_iso2022, it uses only
3736 (DST_BYTES - 7) bytes of DESTINATION buffer. In the case that
3737 DST_BYTES is decided by the function decoding_buffer_size, it
3738 contains extra 256 bytes (defined by CONVERSION_BUFFER_EXTRA_ROOM).
3739 So, this function can decode the full SOURCE. But, in the other
3740 case, if you want to avoid carry over, you must supply at least 7
3741 bytes more area in DESTINATION buffer than expected maximum bytes
3742 that will be produced by this function. */
4ed46869
KH
3743
3744int
d46c5b12 3745decode_coding (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
3746 struct coding_system *coding;
3747 unsigned char *source, *destination;
3748 int src_bytes, dst_bytes;
4ed46869 3749{
d46c5b12 3750 int result;
4ed46869 3751
d4e57bcd 3752 if (src_bytes <= 0
944bd420 3753 && coding->type != coding_type_ccl
d4e57bcd
KH
3754 && ! (coding->mode & CODING_MODE_LAST_BLOCK
3755 && CODING_REQUIRE_FLUSHING (coding)))
4ed46869 3756 {
d46c5b12
KH
3757 coding->produced = coding->produced_char = 0;
3758 coding->consumed = coding->consumed_char = 0;
fb88bf2d 3759 coding->fake_multibyte = 0;
d46c5b12 3760 return CODING_FINISH_NORMAL;
4ed46869
KH
3761 }
3762
0ef69138 3763 if (coding->type == coding_type_undecided)
4ed46869
KH
3764 detect_coding (coding, source, src_bytes);
3765
0ef69138 3766 if (coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
3767 detect_eol (coding, source, src_bytes);
3768
4ed46869
KH
3769 switch (coding->type)
3770 {
0ef69138
KH
3771 case coding_type_emacs_mule:
3772 case coding_type_undecided:
27901516 3773 case coding_type_raw_text:
4ed46869 3774 if (coding->eol_type == CODING_EOL_LF
0ef69138 3775 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869 3776 goto label_no_conversion;
d46c5b12 3777 result = decode_eol (coding, source, destination, src_bytes, dst_bytes);
4ed46869
KH
3778 break;
3779
3780 case coding_type_sjis:
d46c5b12
KH
3781 result = decode_coding_sjis_big5 (coding, source, destination,
3782 src_bytes, dst_bytes, 1);
4ed46869
KH
3783 break;
3784
3785 case coding_type_iso2022:
d46c5b12
KH
3786 result = decode_coding_iso2022 (coding, source, destination,
3787 src_bytes, dst_bytes);
4ed46869
KH
3788 break;
3789
3790 case coding_type_big5:
d46c5b12
KH
3791 result = decode_coding_sjis_big5 (coding, source, destination,
3792 src_bytes, dst_bytes, 0);
4ed46869
KH
3793 break;
3794
3795 case coding_type_ccl:
d46c5b12
KH
3796 result = ccl_coding_driver (coding, source, destination,
3797 src_bytes, dst_bytes, 0);
3798 break;
3799
3800 default: /* i.e. case coding_type_no_conversion: */
3801 label_no_conversion:
3802 if (dst_bytes && src_bytes > dst_bytes)
3803 {
3804 coding->produced = dst_bytes;
3805 result = CODING_FINISH_INSUFFICIENT_DST;
3806 }
3807 else
3808 {
3809 coding->produced = src_bytes;
3810 result = CODING_FINISH_NORMAL;
3811 }
3812 if (dst_bytes)
3813 bcopy (source, destination, coding->produced);
3814 else
3815 safe_bcopy (source, destination, coding->produced);
fb88bf2d 3816 coding->fake_multibyte = 1;
d46c5b12
KH
3817 coding->consumed
3818 = coding->consumed_char = coding->produced_char = coding->produced;
4ed46869
KH
3819 break;
3820 }
3821
d46c5b12 3822 return result;
4ed46869
KH
3823}
3824
52d41803
KH
3825/* See "GENERAL NOTES about `encode_coding_XXX ()' functions".
3826
3827 This function does not make full use of DESTINATION buffer. For
3828 instance, if coding->type is coding_type_iso2022, it uses only
3829 (DST_BYTES - 20) bytes of DESTINATION buffer. In the case that
3830 DST_BYTES is decided by the function encoding_buffer_size, it
3831 contains extra 256 bytes (defined by CONVERSION_BUFFER_EXTRA_ROOM).
3832 So, this function can encode the full SOURCE. But, in the other
3833 case, if you want to avoid carry over, you must supply at least 20
3834 bytes more area in DESTINATION buffer than expected maximum bytes
3835 that will be produced by this function. */
4ed46869
KH
3836
3837int
d46c5b12 3838encode_coding (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
3839 struct coding_system *coding;
3840 unsigned char *source, *destination;
3841 int src_bytes, dst_bytes;
4ed46869 3842{
d46c5b12 3843 int result;
4ed46869 3844
d4e57bcd
KH
3845 if (src_bytes <= 0
3846 && ! (coding->mode & CODING_MODE_LAST_BLOCK
3847 && CODING_REQUIRE_FLUSHING (coding)))
4ed46869 3848 {
d46c5b12
KH
3849 coding->produced = coding->produced_char = 0;
3850 coding->consumed = coding->consumed_char = 0;
fb88bf2d 3851 coding->fake_multibyte = 0;
d46c5b12
KH
3852 return CODING_FINISH_NORMAL;
3853 }
4ed46869 3854
d46c5b12
KH
3855 switch (coding->type)
3856 {
0ef69138
KH
3857 case coding_type_emacs_mule:
3858 case coding_type_undecided:
27901516 3859 case coding_type_raw_text:
4ed46869 3860 if (coding->eol_type == CODING_EOL_LF
0ef69138 3861 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869 3862 goto label_no_conversion;
d46c5b12 3863 result = encode_eol (coding, source, destination, src_bytes, dst_bytes);
4ed46869
KH
3864 break;
3865
3866 case coding_type_sjis:
d46c5b12
KH
3867 result = encode_coding_sjis_big5 (coding, source, destination,
3868 src_bytes, dst_bytes, 1);
4ed46869
KH
3869 break;
3870
3871 case coding_type_iso2022:
d46c5b12
KH
3872 result = encode_coding_iso2022 (coding, source, destination,
3873 src_bytes, dst_bytes);
4ed46869
KH
3874 break;
3875
3876 case coding_type_big5:
d46c5b12
KH
3877 result = encode_coding_sjis_big5 (coding, source, destination,
3878 src_bytes, dst_bytes, 0);
4ed46869
KH
3879 break;
3880
3881 case coding_type_ccl:
d46c5b12
KH
3882 result = ccl_coding_driver (coding, source, destination,
3883 src_bytes, dst_bytes, 1);
3884 break;
3885
3886 default: /* i.e. case coding_type_no_conversion: */
3887 label_no_conversion:
3888 if (dst_bytes && src_bytes > dst_bytes)
3889 {
3890 coding->produced = dst_bytes;
3891 result = CODING_FINISH_INSUFFICIENT_DST;
3892 }
3893 else
3894 {
3895 coding->produced = src_bytes;
3896 result = CODING_FINISH_NORMAL;
3897 }
3898 if (dst_bytes)
3899 bcopy (source, destination, coding->produced);
3900 else
3901 safe_bcopy (source, destination, coding->produced);
3902 if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
3903 {
3904 unsigned char *p = destination, *pend = p + coding->produced;
3905 while (p < pend)
3906 if (*p++ == '\015') p[-1] = '\n';
3907 }
fb88bf2d 3908 coding->fake_multibyte = 1;
d46c5b12
KH
3909 coding->consumed
3910 = coding->consumed_char = coding->produced_char = coding->produced;
4ed46869
KH
3911 break;
3912 }
3913
d46c5b12 3914 return result;
4ed46869
KH
3915}
3916
fb88bf2d
KH
3917/* Scan text in the region between *BEG and *END (byte positions),
3918 skip characters which we don't have to decode by coding system
3919 CODING at the head and tail, then set *BEG and *END to the region
3920 of the text we actually have to convert. The caller should move
3921 the gap out of the region in advance.
4ed46869 3922
d46c5b12
KH
3923 If STR is not NULL, *BEG and *END are indices into STR. */
3924
3925static void
3926shrink_decoding_region (beg, end, coding, str)
3927 int *beg, *end;
3928 struct coding_system *coding;
3929 unsigned char *str;
3930{
fb88bf2d 3931 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
d46c5b12 3932 int eol_conversion;
88993dfd 3933 Lisp_Object translation_table;
d46c5b12
KH
3934
3935 if (coding->type == coding_type_ccl
3936 || coding->type == coding_type_undecided
3937 || !NILP (coding->post_read_conversion))
3938 {
3939 /* We can't skip any data. */
3940 return;
3941 }
3942 else if (coding->type == coding_type_no_conversion)
3943 {
fb88bf2d
KH
3944 /* We need no conversion, but don't have to skip any data here.
3945 Decoding routine handles them effectively anyway. */
d46c5b12
KH
3946 return;
3947 }
3948
88993dfd
KH
3949 translation_table = coding->translation_table_for_decode;
3950 if (NILP (translation_table) && !NILP (Venable_character_translation))
3951 translation_table = Vstandard_translation_table_for_decode;
3952 if (CHAR_TABLE_P (translation_table))
3953 {
3954 int i;
3955 for (i = 0; i < 128; i++)
3956 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
3957 break;
3958 if (i < 128)
3959 /* Some ASCII character should be tranlsated. We give up
3960 shrinking. */
3961 return;
3962 }
3963
aa60dea6
KH
3964 eol_conversion = (coding->eol_type != CODING_EOL_LF);
3965
3966 if ((! eol_conversion) && (coding->heading_ascii >= 0))
d46c5b12
KH
3967 /* Detection routine has already found how much we can skip at the
3968 head. */
3969 *beg += coding->heading_ascii;
3970
3971 if (str)
3972 {
3973 begp_orig = begp = str + *beg;
3974 endp_orig = endp = str + *end;
3975 }
3976 else
3977 {
fb88bf2d 3978 begp_orig = begp = BYTE_POS_ADDR (*beg);
d46c5b12
KH
3979 endp_orig = endp = begp + *end - *beg;
3980 }
3981
d46c5b12
KH
3982 switch (coding->type)
3983 {
3984 case coding_type_emacs_mule:
3985 case coding_type_raw_text:
3986 if (eol_conversion)
3987 {
3988 if (coding->heading_ascii < 0)
fb88bf2d 3989 while (begp < endp && *begp != '\r' && *begp < 0x80) begp++;
ee59c65f 3990 while (begp < endp && endp[-1] != '\r' && endp[-1] < 0x80)
fb88bf2d 3991 endp--;
ee59c65f
RS
3992 /* Do not consider LF as ascii if preceded by CR, since that
3993 confuses eol decoding. */
3994 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
3995 endp++;
d46c5b12
KH
3996 }
3997 else
3998 begp = endp;
3999 break;
4000
4001 case coding_type_sjis:
4002 case coding_type_big5:
4003 /* We can skip all ASCII characters at the head. */
4004 if (coding->heading_ascii < 0)
4005 {
4006 if (eol_conversion)
de9d083c 4007 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
d46c5b12
KH
4008 else
4009 while (begp < endp && *begp < 0x80) begp++;
4010 }
4011 /* We can skip all ASCII characters at the tail except for the
4012 second byte of SJIS or BIG5 code. */
4013 if (eol_conversion)
de9d083c 4014 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
d46c5b12
KH
4015 else
4016 while (begp < endp && endp[-1] < 0x80) endp--;
ee59c65f
RS
4017 /* Do not consider LF as ascii if preceded by CR, since that
4018 confuses eol decoding. */
4019 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4020 endp++;
d46c5b12
KH
4021 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
4022 endp++;
4023 break;
4024
4025 default: /* i.e. case coding_type_iso2022: */
622fece5
KH
4026 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4027 /* We can't skip any data. */
4028 break;
d46c5b12
KH
4029 if (coding->heading_ascii < 0)
4030 {
d46c5b12
KH
4031 /* We can skip all ASCII characters at the head except for a
4032 few control codes. */
4033 while (begp < endp && (c = *begp) < 0x80
4034 && c != ISO_CODE_CR && c != ISO_CODE_SO
4035 && c != ISO_CODE_SI && c != ISO_CODE_ESC
4036 && (!eol_conversion || c != ISO_CODE_LF))
4037 begp++;
4038 }
4039 switch (coding->category_idx)
4040 {
4041 case CODING_CATEGORY_IDX_ISO_8_1:
4042 case CODING_CATEGORY_IDX_ISO_8_2:
4043 /* We can skip all ASCII characters at the tail. */
4044 if (eol_conversion)
de9d083c 4045 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
d46c5b12
KH
4046 else
4047 while (begp < endp && endp[-1] < 0x80) endp--;
ee59c65f
RS
4048 /* Do not consider LF as ascii if preceded by CR, since that
4049 confuses eol decoding. */
4050 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4051 endp++;
d46c5b12
KH
4052 break;
4053
4054 case CODING_CATEGORY_IDX_ISO_7:
4055 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
de79a6a5
KH
4056 {
4057 /* We can skip all charactes at the tail except for 8-bit
4058 codes and ESC and the following 2-byte at the tail. */
4059 unsigned char *eight_bit = NULL;
4060
4061 if (eol_conversion)
4062 while (begp < endp
4063 && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4064 {
4065 if (!eight_bit && c & 0x80) eight_bit = endp;
4066 endp--;
4067 }
4068 else
4069 while (begp < endp
4070 && (c = endp[-1]) != ISO_CODE_ESC)
4071 {
4072 if (!eight_bit && c & 0x80) eight_bit = endp;
4073 endp--;
4074 }
4075 /* Do not consider LF as ascii if preceded by CR, since that
4076 confuses eol decoding. */
4077 if (begp < endp && endp < endp_orig
4078 && endp[-1] == '\r' && endp[0] == '\n')
4079 endp++;
4080 if (begp < endp && endp[-1] == ISO_CODE_ESC)
4081 {
4082 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
4083 /* This is an ASCII designation sequence. We can
4084 surely skip the tail. But, if we have
4085 encountered an 8-bit code, skip only the codes
4086 after that. */
4087 endp = eight_bit ? eight_bit : endp + 2;
4088 else
4089 /* Hmmm, we can't skip the tail. */
4090 endp = endp_orig;
4091 }
4092 else if (eight_bit)
4093 endp = eight_bit;
4094 }
d46c5b12
KH
4095 }
4096 }
4097 *beg += begp - begp_orig;
4098 *end += endp - endp_orig;
4099 return;
4100}
4101
4102/* Like shrink_decoding_region but for encoding. */
4103
4104static void
4105shrink_encoding_region (beg, end, coding, str)
4106 int *beg, *end;
4107 struct coding_system *coding;
4108 unsigned char *str;
4109{
4110 unsigned char *begp_orig, *begp, *endp_orig, *endp;
4111 int eol_conversion;
88993dfd 4112 Lisp_Object translation_table;
d46c5b12
KH
4113
4114 if (coding->type == coding_type_ccl)
4115 /* We can't skip any data. */
4116 return;
4117 else if (coding->type == coding_type_no_conversion)
4118 {
4119 /* We need no conversion. */
4120 *beg = *end;
4121 return;
4122 }
4123
88993dfd
KH
4124 translation_table = coding->translation_table_for_encode;
4125 if (NILP (translation_table) && !NILP (Venable_character_translation))
4126 translation_table = Vstandard_translation_table_for_encode;
4127 if (CHAR_TABLE_P (translation_table))
4128 {
4129 int i;
4130 for (i = 0; i < 128; i++)
4131 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4132 break;
4133 if (i < 128)
4134 /* Some ASCII character should be tranlsated. We give up
4135 shrinking. */
4136 return;
4137 }
4138
d46c5b12
KH
4139 if (str)
4140 {
4141 begp_orig = begp = str + *beg;
4142 endp_orig = endp = str + *end;
4143 }
4144 else
4145 {
fb88bf2d 4146 begp_orig = begp = BYTE_POS_ADDR (*beg);
d46c5b12
KH
4147 endp_orig = endp = begp + *end - *beg;
4148 }
4149
4150 eol_conversion = (coding->eol_type == CODING_EOL_CR
4151 || coding->eol_type == CODING_EOL_CRLF);
4152
4153 /* Here, we don't have to check coding->pre_write_conversion because
4154 the caller is expected to have handled it already. */
4155 switch (coding->type)
4156 {
4157 case coding_type_undecided:
4158 case coding_type_emacs_mule:
4159 case coding_type_raw_text:
4160 if (eol_conversion)
4161 {
4162 while (begp < endp && *begp != '\n') begp++;
4163 while (begp < endp && endp[-1] != '\n') endp--;
4164 }
4165 else
4166 begp = endp;
4167 break;
4168
4169 case coding_type_iso2022:
622fece5
KH
4170 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4171 /* We can't skip any data. */
4172 break;
d46c5b12
KH
4173 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
4174 {
4175 unsigned char *bol = begp;
4176 while (begp < endp && *begp < 0x80)
4177 {
4178 begp++;
4179 if (begp[-1] == '\n')
4180 bol = begp;
4181 }
4182 begp = bol;
4183 goto label_skip_tail;
4184 }
4185 /* fall down ... */
4186
4187 default:
4188 /* We can skip all ASCII characters at the head and tail. */
4189 if (eol_conversion)
4190 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
4191 else
4192 while (begp < endp && *begp < 0x80) begp++;
4193 label_skip_tail:
4194 if (eol_conversion)
4195 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
4196 else
4197 while (begp < endp && *(endp - 1) < 0x80) endp--;
4198 break;
4199 }
4200
4201 *beg += begp - begp_orig;
4202 *end += endp - endp_orig;
4203 return;
4204}
4205
88993dfd
KH
/* Shrinking a conversion region has some overhead of its own, so
   regions whose length does not exceed this value are left
   untouched.  */
static int shrink_conversion_region_threshhold = 1024;

/* If the region from *BEG to *END is longer than
   shrink_conversion_region_threshhold, narrow it with
   shrink_encoding_region (when ENCODEP is nonzero) or
   shrink_decoding_region (when ENCODEP is zero).  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (shrink_conversion_region_threshhold < *(end) - *(beg))          \
      {                                                                 \
        if (encodep)                                                    \
          shrink_encoding_region (beg, end, coding, str);               \
        else                                                            \
          shrink_decoding_region (beg, end, coding, str);               \
      }                                                                 \
  } while (0)
4219
d46c5b12 4220/* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
fb88bf2d
KH
4221 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4222 coding system CODING, and return the status code of code conversion
4223 (currently, this value has no meaning).
4224
4225 How many characters (and bytes) are converted to how many
4226 characters (and bytes) are recorded in members of the structure
4227 CODING.
d46c5b12 4228
6e44253b 4229 If REPLACE is nonzero, we do various things as if the original text
d46c5b12 4230 is deleted and a new text is inserted. See the comments in
6e44253b 4231 replace_range (insdel.c) to know what we are doing. */
4ed46869
KH
4232
4233int
6e44253b
KH
4234code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
4235 int from, from_byte, to, to_byte, encodep, replace;
4ed46869 4236 struct coding_system *coding;
4ed46869 4237{
fb88bf2d
KH
4238 int len = to - from, len_byte = to_byte - from_byte;
4239 int require, inserted, inserted_byte;
12410ef1 4240 int head_skip, tail_skip, total_skip;
84d60297 4241 Lisp_Object saved_coding_symbol;
fb88bf2d
KH
4242 int multibyte = !NILP (current_buffer->enable_multibyte_characters);
4243 int first = 1;
4244 int fake_multibyte = 0;
4245 unsigned char *src, *dst;
84d60297 4246 Lisp_Object deletion;
e133c8fa 4247 int orig_point = PT, orig_len = len;
6abb9bd9 4248 int prev_Z;
84d60297
RS
4249
4250 deletion = Qnil;
4251 saved_coding_symbol = Qnil;
d46c5b12 4252
83fa074f 4253 if (from < PT && PT < to)
e133c8fa
KH
4254 {
4255 TEMP_SET_PT_BOTH (from, from_byte);
4256 orig_point = from;
4257 }
83fa074f 4258
6e44253b 4259 if (replace)
d46c5b12 4260 {
fb88bf2d
KH
4261 int saved_from = from;
4262
d46c5b12 4263 prepare_to_modify_buffer (from, to, &from);
fb88bf2d
KH
4264 if (saved_from != from)
4265 {
4266 to = from + len;
4267 if (multibyte)
4268 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
4269 else
4270 from_byte = from, to_byte = to;
4271 len_byte = to_byte - from_byte;
4272 }
d46c5b12 4273 }
d46c5b12
KH
4274
4275 if (! encodep && CODING_REQUIRE_DETECTION (coding))
4276 {
12410ef1 4277 /* We must detect encoding of text and eol format. */
d46c5b12
KH
4278
4279 if (from < GPT && to > GPT)
4280 move_gap_both (from, from_byte);
4281 if (coding->type == coding_type_undecided)
4282 {
fb88bf2d 4283 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
d46c5b12 4284 if (coding->type == coding_type_undecided)
12410ef1
KH
4285 /* It seems that the text contains only ASCII, but we
4286 should not left it undecided because the deeper
4287 decoding routine (decode_coding) tries to detect the
4288 encodings again in vain. */
d46c5b12
KH
4289 coding->type = coding_type_emacs_mule;
4290 }
4291 if (coding->eol_type == CODING_EOL_UNDECIDED)
4292 {
4293 saved_coding_symbol = coding->symbol;
4294 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
4295 if (coding->eol_type == CODING_EOL_UNDECIDED)
4296 coding->eol_type = CODING_EOL_LF;
4297 /* We had better recover the original eol format if we
4298 encounter an inconsitent eol format while decoding. */
4299 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4300 }
4301 }
4302
fb88bf2d
KH
4303 coding->consumed_char = len, coding->consumed = len_byte;
4304
d46c5b12
KH
4305 if (encodep
4306 ? ! CODING_REQUIRE_ENCODING (coding)
4307 : ! CODING_REQUIRE_DECODING (coding))
fb88bf2d
KH
4308 {
4309 coding->produced = len_byte;
12410ef1
KH
4310 if (multibyte
4311 && ! replace
4312 /* See the comment of the member heading_ascii in coding.h. */
4313 && coding->heading_ascii < len_byte)
fb88bf2d 4314 {
6e44253b
KH
4315 /* We still may have to combine byte at the head and the
4316 tail of the text in the region. */
12410ef1 4317 if (from < GPT && GPT < to)
6e44253b 4318 move_gap_both (to, to_byte);
12410ef1
KH
4319 len = multibyte_chars_in_text (BYTE_POS_ADDR (from_byte), len_byte);
4320 adjust_after_insert (from, from_byte, to, to_byte, len);
4321 coding->produced_char = len;
fb88bf2d
KH
4322 }
4323 else
68e3a8f1
AS
4324 {
4325 if (!replace)
4326 adjust_after_insert (from, from_byte, to, to_byte, len_byte);
4327 coding->produced_char = len_byte;
4328 }
fb88bf2d
KH
4329 return 0;
4330 }
d46c5b12
KH
4331
4332 /* Now we convert the text. */
4333
4334 /* For encoding, we must process pre-write-conversion in advance. */
4335 if (encodep
d46c5b12
KH
4336 && ! NILP (coding->pre_write_conversion)
4337 && SYMBOLP (coding->pre_write_conversion)
4338 && ! NILP (Ffboundp (coding->pre_write_conversion)))
4339 {
2b4f9037
KH
4340 /* The function in pre-write-conversion may put a new text in a
4341 new buffer. */
0007bdd0
KH
4342 struct buffer *prev = current_buffer;
4343 Lisp_Object new;
d46c5b12 4344
b39f748c
AS
4345 call2 (coding->pre_write_conversion,
4346 make_number (from), make_number (to));
d46c5b12
KH
4347 if (current_buffer != prev)
4348 {
4349 len = ZV - BEGV;
0007bdd0 4350 new = Fcurrent_buffer ();
d46c5b12 4351 set_buffer_internal_1 (prev);
ddbc19ff 4352 del_range_2 (from, from_byte, to, to_byte);
e133c8fa 4353 TEMP_SET_PT_BOTH (from, from_byte);
0007bdd0
KH
4354 insert_from_buffer (XBUFFER (new), 1, len, 0);
4355 Fkill_buffer (new);
e133c8fa
KH
4356 if (orig_point >= to)
4357 orig_point += len - orig_len;
4358 else if (orig_point > from)
4359 orig_point = from;
4360 orig_len = len;
d46c5b12 4361 to = from + len;
e133c8fa 4362 from_byte = multibyte ? CHAR_TO_BYTE (from) : from_byte;
fb88bf2d 4363 to_byte = multibyte ? CHAR_TO_BYTE (to) : to;
d46c5b12 4364 len_byte = to_byte - from_byte;
e133c8fa 4365 TEMP_SET_PT_BOTH (from, from_byte);
d46c5b12
KH
4366 }
4367 }
4368
12410ef1
KH
4369 if (replace)
4370 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
4371
d46c5b12 4372 /* Try to skip the heading and tailing ASCIIs. */
12410ef1
KH
4373 {
4374 int from_byte_orig = from_byte, to_byte_orig = to_byte;
4375
4376 if (from < GPT && GPT < to)
4377 move_gap_both (from, from_byte);
88993dfd 4378 SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
d4e57bcd 4379 if (from_byte == to_byte
944bd420 4380 && coding->type != coding_type_ccl
d4e57bcd
KH
4381 && ! (coding->mode & CODING_MODE_LAST_BLOCK
4382 && CODING_REQUIRE_FLUSHING (coding)))
12410ef1
KH
4383 {
4384 coding->produced = len_byte;
4385 coding->produced_char = multibyte ? len : len_byte;
4386 if (!replace)
4387 /* We must record and adjust for this new text now. */
4388 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
4389 return 0;
4390 }
fb88bf2d 4391
12410ef1
KH
4392 head_skip = from_byte - from_byte_orig;
4393 tail_skip = to_byte_orig - to_byte;
4394 total_skip = head_skip + tail_skip;
4395 from += head_skip;
4396 to -= tail_skip;
4397 len -= total_skip; len_byte -= total_skip;
4398 }
d46c5b12 4399
88993dfd 4400 /* The code conversion routine can not preserve text properties for
55d8d769
KH
4401 now. So, we must remove all text properties in the region.
4402 Here, we must suppress all modification hooks. */
88993dfd 4403 if (replace)
55d8d769
KH
4404 {
4405 int saved_inhibit_modification_hooks = inhibit_modification_hooks;
4406 inhibit_modification_hooks = 1;
4407 Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
4408 inhibit_modification_hooks = saved_inhibit_modification_hooks;
4409 }
88993dfd 4410
fb88bf2d
KH
4411 /* For converion, we must put the gap before the text in addition to
4412 making the gap larger for efficient decoding. The required gap
4413 size starts from 2000 which is the magic number used in make_gap.
4414 But, after one batch of conversion, it will be incremented if we
4415 find that it is not enough . */
d46c5b12
KH
4416 require = 2000;
4417
4418 if (GAP_SIZE < require)
4419 make_gap (require - GAP_SIZE);
4420 move_gap_both (from, from_byte);
4421
d46c5b12 4422 inserted = inserted_byte = 0;
fb88bf2d
KH
4423 src = GAP_END_ADDR, dst = GPT_ADDR;
4424
4425 GAP_SIZE += len_byte;
4426 ZV -= len;
4427 Z -= len;
4428 ZV_BYTE -= len_byte;
4429 Z_BYTE -= len_byte;
4430
f2558efd
KH
4431 if (GPT - BEG < beg_unchanged)
4432 beg_unchanged = GPT - BEG;
4433 if (Z - GPT < end_unchanged)
4434 end_unchanged = Z - GPT;
4435
d46c5b12
KH
4436 for (;;)
4437 {
fb88bf2d 4438 int result;
d46c5b12
KH
4439
4440 /* The buffer memory is changed from:
fb88bf2d
KH
4441 +--------+converted-text+---------+-------original-text------+---+
4442 |<-from->|<--inserted-->|---------|<-----------len---------->|---|
4443 |<------------------- GAP_SIZE -------------------->| */
d46c5b12 4444 if (encodep)
fb88bf2d 4445 result = encode_coding (coding, src, dst, len_byte, 0);
d46c5b12 4446 else
fb88bf2d 4447 result = decode_coding (coding, src, dst, len_byte, 0);
d46c5b12
KH
4448 /* to:
4449 +--------+-------converted-text--------+--+---original-text--+---+
fb88bf2d
KH
4450 |<-from->|<--inserted-->|<--produced-->|--|<-(len-consumed)->|---|
4451 |<------------------- GAP_SIZE -------------------->| */
4452 if (coding->fake_multibyte)
4453 fake_multibyte = 1;
d46c5b12 4454
fb88bf2d
KH
4455 if (!encodep && !multibyte)
4456 coding->produced_char = coding->produced;
d46c5b12
KH
4457 inserted += coding->produced_char;
4458 inserted_byte += coding->produced;
d46c5b12 4459 len_byte -= coding->consumed;
fb88bf2d
KH
4460 src += coding->consumed;
4461 dst += inserted_byte;
d46c5b12 4462
9864ebce
KH
4463 if (result == CODING_FINISH_NORMAL)
4464 {
4465 src += len_byte;
4466 break;
4467 }
d46c5b12
KH
4468 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4469 {
fb88bf2d 4470 unsigned char *pend = dst, *p = pend - inserted_byte;
d46c5b12
KH
4471
4472 /* Encode LFs back to the original eol format (CR or CRLF). */
4473 if (coding->eol_type == CODING_EOL_CR)
4474 {
4475 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
4476 }
4477 else
4478 {
d46c5b12
KH
4479 int count = 0;
4480
fb88bf2d
KH
4481 while (p < pend) if (*p++ == '\n') count++;
4482 if (src - dst < count)
d46c5b12 4483 {
fb88bf2d
KH
4484 /* We don't have sufficient room for putting LFs
4485 back to CRLF. We must record converted and
4486 not-yet-converted text back to the buffer
4487 content, enlarge the gap, then record them out of
4488 the buffer contents again. */
4489 int add = len_byte + inserted_byte;
4490
4491 GAP_SIZE -= add;
4492 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4493 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4494 make_gap (count - GAP_SIZE);
4495 GAP_SIZE += add;
4496 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4497 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4498 /* Don't forget to update SRC, DST, and PEND. */
4499 src = GAP_END_ADDR - len_byte;
4500 dst = GPT_ADDR + inserted_byte;
4501 pend = dst;
d46c5b12 4502 }
d46c5b12
KH
4503 inserted += count;
4504 inserted_byte += count;
fb88bf2d
KH
4505 coding->produced += count;
4506 p = dst = pend + count;
4507 while (count)
4508 {
4509 *--p = *--pend;
4510 if (*p == '\n') count--, *--p = '\r';
4511 }
d46c5b12
KH
4512 }
4513
4514 /* Suppress eol-format conversion in the further conversion. */
4515 coding->eol_type = CODING_EOL_LF;
4516
4517 /* Restore the original symbol. */
4518 coding->symbol = saved_coding_symbol;
fb88bf2d
KH
4519
4520 continue;
d46c5b12
KH
4521 }
4522 if (len_byte <= 0)
944bd420
KH
4523 {
4524 if (coding->type != coding_type_ccl
4525 || coding->mode & CODING_MODE_LAST_BLOCK)
4526 break;
4527 coding->mode |= CODING_MODE_LAST_BLOCK;
4528 continue;
4529 }
d46c5b12
KH
4530 if (result == CODING_FINISH_INSUFFICIENT_SRC)
4531 {
4532 /* The source text ends in invalid codes. Let's just
4533 make them valid buffer contents, and finish conversion. */
fb88bf2d 4534 inserted += len_byte;
d46c5b12 4535 inserted_byte += len_byte;
fb88bf2d 4536 while (len_byte--)
ee59c65f 4537 *dst++ = *src++;
fb88bf2d 4538 fake_multibyte = 1;
d46c5b12
KH
4539 break;
4540 }
9864ebce
KH
4541 if (result == CODING_FINISH_INTERRUPT)
4542 {
4543 /* The conversion procedure was interrupted by a user. */
4544 fake_multibyte = 1;
4545 break;
4546 }
4547 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
4548 if (coding->consumed < 1)
4549 {
4550 /* It's quite strange to require more memory without
4551 consuming any bytes. Perhaps CCL program bug. */
4552 fake_multibyte = 1;
4553 break;
4554 }
fb88bf2d
KH
4555 if (first)
4556 {
4557 /* We have just done the first batch of conversion which was
4558 stopped because of insufficient gap. Let's reconsider the
4559 required gap size (i.e. SRC - DST) now.
4560
4561 We have converted ORIG bytes (== coding->consumed) into
4562 NEW bytes (coding->produced). To convert the remaining
4563 LEN bytes, we may need REQUIRE bytes of gap, where:
4564 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
4565 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
4566 Here, we are sure that NEW >= ORIG. */
6e44253b
KH
4567 float ratio = coding->produced - coding->consumed;
4568 ratio /= coding->consumed;
4569 require = len_byte * ratio;
fb88bf2d
KH
4570 first = 0;
4571 }
4572 if ((src - dst) < (require + 2000))
4573 {
4574 /* See the comment above the previous call of make_gap. */
4575 int add = len_byte + inserted_byte;
4576
4577 GAP_SIZE -= add;
4578 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4579 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4580 make_gap (require + 2000);
4581 GAP_SIZE += add;
4582 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4583 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4584 /* Don't forget to update SRC, DST. */
4585 src = GAP_END_ADDR - len_byte;
4586 dst = GPT_ADDR + inserted_byte;
4587 }
d46c5b12 4588 }
fb88bf2d
KH
4589 if (src - dst > 0) *dst = 0; /* Put an anchor. */
4590
2b4f9037 4591 if (multibyte
88993dfd
KH
4592 && (encodep
4593 || fake_multibyte
4594 || (to - from) != (to_byte - from_byte)))
2b4f9037 4595 inserted = multibyte_chars_in_text (GPT_ADDR, inserted_byte);
7553d0e1 4596
12410ef1
KH
4597 /* If we have shrunk the conversion area, adjust it now. */
4598 if (total_skip > 0)
4599 {
4600 if (tail_skip > 0)
4601 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
4602 inserted += total_skip; inserted_byte += total_skip;
4603 GAP_SIZE += total_skip;
4604 GPT -= head_skip; GPT_BYTE -= head_skip;
4605 ZV -= total_skip; ZV_BYTE -= total_skip;
4606 Z -= total_skip; Z_BYTE -= total_skip;
4607 from -= head_skip; from_byte -= head_skip;
4608 to += tail_skip; to_byte += tail_skip;
4609 }
4610
6abb9bd9 4611 prev_Z = Z;
12410ef1 4612 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
6abb9bd9 4613 inserted = Z - prev_Z;
4ed46869 4614
2b4f9037 4615 if (! encodep && ! NILP (coding->post_read_conversion))
d46c5b12 4616 {
2b4f9037 4617 Lisp_Object val;
4ed46869 4618
e133c8fa
KH
4619 if (from != PT)
4620 TEMP_SET_PT_BOTH (from, from_byte);
6abb9bd9 4621 prev_Z = Z;
2b4f9037 4622 val = call1 (coding->post_read_conversion, make_number (inserted));
6abb9bd9 4623 CHECK_NUMBER (val, 0);
944bd420 4624 inserted += Z - prev_Z;
e133c8fa
KH
4625 }
4626
4627 if (orig_point >= from)
4628 {
4629 if (orig_point >= from + orig_len)
4630 orig_point += inserted - orig_len;
4631 else
4632 orig_point = from;
4633 TEMP_SET_PT (orig_point);
d46c5b12 4634 }
4ed46869 4635
2b4f9037
KH
4636 signal_after_change (from, to - from, inserted);
4637
fb88bf2d 4638 {
12410ef1
KH
4639 coding->consumed = to_byte - from_byte;
4640 coding->consumed_char = to - from;
4641 coding->produced = inserted_byte;
4642 coding->produced_char = inserted;
fb88bf2d 4643 }
7553d0e1 4644
fb88bf2d 4645 return 0;
d46c5b12
KH
4646}
4647
/* Convert the Lisp string STR with CODING and return the result as a
   Lisp string: encode when ENCODEP is nonzero, decode otherwise.  If
   NOCOPY is nonzero and nothing needs converting, STR itself is
   returned instead of a copy.

   When CODING has a pre-write-conversion function (encoding) or a
   post-read-conversion function (decoding), the text is converted
   inside a temporary buffer by code_convert_region.  Otherwise the
   conversion writes into the buffer returned by
   get_conversion_buffer; the head and tail left out by
   SHRINK_CONVERSION_REGION are copied through unchanged.  If decoding
   reports CODING_FINISH_INCONSISTENT_EOL, the whole string is decoded
   again with eol conversion disabled.  */
4648Lisp_Object
4649code_convert_string (str, coding, encodep, nocopy)
4650 Lisp_Object str;
4ed46869 4651 struct coding_system *coding;
d46c5b12 4652 int encodep, nocopy;
4ed46869 4653{
d46c5b12
KH
4654 int len;
4655 char *buf;
fc932ac6
RS
/* FROM and TO_BYTE delimit, in bytes, the part of STR to convert;
   TO is the length of STR in characters.  */
4656 int from = 0, to = XSTRING (str)->size;
4657 int to_byte = STRING_BYTES (XSTRING (str));
d46c5b12 4658 struct gcpro gcpro1;
84d60297 4659 Lisp_Object saved_coding_symbol;
d46c5b12 4660 int result;
4ed46869 4661
84d60297 4662 saved_coding_symbol = Qnil;
d46c5b12
KH
4663 if (encodep && !NILP (coding->pre_write_conversion)
4664 || !encodep && !NILP (coding->post_read_conversion))
4665 {
4666 /* Since we have to call Lisp functions which assume target text
4667 is in a buffer, after setting a temporary buffer, call
4668 code_convert_region. */
/* COUNT is the specpdl depth handed to unbind_to below, which undoes
   the unwind-protect recorded here.  */
4669 int count = specpdl_ptr - specpdl;
4670 struct buffer *prev = current_buffer;
e133c8fa 4671
d46c5b12
KH
4672 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
4673 temp_output_buffer_setup (" *code-converting-work*");
4674 set_buffer_internal (XBUFFER (Vstandard_output));
4675 if (encodep)
4676 insert_from_string (str, 0, 0, to, to_byte, 0);
4677 else
4678 {
4679 /* We must insert the contents of STR as is without
4680 unibyte<->multibyte conversion. */
4681 current_buffer->enable_multibyte_characters = Qnil;
4682 insert_from_string (str, 0, 0, to_byte, to_byte, 0);
4683 current_buffer->enable_multibyte_characters = Qt;
4684 }
fb88bf2d 4685 code_convert_region (BEGV, BEGV_BYTE, ZV, ZV_BYTE, coding, encodep, 1);
d46c5b12
KH
4686 if (encodep)
4687 /* We must return the buffer contents as unibyte string. */
4688 current_buffer->enable_multibyte_characters = Qnil;
4689 str = make_buffer_string (BEGV, ZV, 0);
4690 set_buffer_internal (prev);
4691 return unbind_to (count, str);
4692 }
4ed46869 4693
d46c5b12
KH
4694 if (! encodep && CODING_REQUIRE_DETECTION (coding))
4695 {
4696 /* See the comments in code_convert_region. */
4697 if (coding->type == coding_type_undecided)
4698 {
4699 detect_coding (coding, XSTRING (str)->data, to_byte);
/* Detection left the type undecided: fall back to emacs-mule.  */
4700 if (coding->type == coding_type_undecided)
4701 coding->type = coding_type_emacs_mule;
4702 }
4703 if (coding->eol_type == CODING_EOL_UNDECIDED)
4704 {
/* Remember the symbol so that it can be restored if decoding later
   reports an inconsistent eol format.  */
4705 saved_coding_symbol = coding->symbol;
4706 detect_eol (coding, XSTRING (str)->data, to_byte);
4707 if (coding->eol_type == CODING_EOL_UNDECIDED)
4708 coding->eol_type = CODING_EOL_LF;
4709 /* We had better recover the original eol format if we
4710 encounter an inconsistent eol format while decoding. */
4711 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4712 }
4713 }
4ed46869 4714
d46c5b12
KH
/* When no conversion is required, make the region to convert empty.  */
4715 if (encodep
4716 ? ! CODING_REQUIRE_ENCODING (coding)
4717 : ! CODING_REQUIRE_DECODING (coding))
4718 from = to_byte;
4719 else
4720 {
4721 /* Try to skip the heading and tailing ASCIIs. */
88993dfd
KH
4722 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
4723 encodep);
d46c5b12 4724 }
e133c8fa
KH
/* Nothing is left to convert.  A CCL coding system is the exception:
   it is still run, even on an empty region.  */
4725 if (from == to_byte
4726 && coding->type != coding_type_ccl)
d46c5b12 4727 return (nocopy ? str : Fcopy_sequence (str));
4ed46869 4728
d46c5b12
KH
4729 if (encodep)
4730 len = encoding_buffer_size (coding, to_byte - from);
4731 else
4732 len = decoding_buffer_size (coding, to_byte - from);
/* Add room for the unconverted head (FROM bytes) and tail.  */
fc932ac6 4733 len += from + STRING_BYTES (XSTRING (str)) - to_byte;
d46c5b12
KH
/* NOTE(review): STR is GC-protected around get_conversion_buffer,
   presumably because that call may allocate -- confirm.  */
4734 GCPRO1 (str);
4735 buf = get_conversion_buffer (len);
4736 UNGCPRO;
4ed46869 4737
d46c5b12
KH
/* Copy the skipped head as is, then convert the middle part.  */
4738 if (from > 0)
4739 bcopy (XSTRING (str)->data, buf, from);
4740 result = (encodep
4741 ? encode_coding (coding, XSTRING (str)->data + from,
4742 buf + from, to_byte - from, len)
4743 : decode_coding (coding, XSTRING (str)->data + from,
f30cc612 4744 buf + from, to_byte - from, len));
d46c5b12 4745 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4ed46869 4746 {
d46c5b12
KH
4747 /* We simply try to decode the whole string again but without
4748 eol-conversion this time. */
4749 coding->eol_type = CODING_EOL_LF;
4750 coding->symbol = saved_coding_symbol;
4751 return code_convert_string (str, coding, encodep, nocopy);
4ed46869 4752 }
d46c5b12
KH
4753
/* Append the skipped tail right after the converted text.  */
4754 bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
fc932ac6 4755 STRING_BYTES (XSTRING (str)) - to_byte);
d46c5b12 4756
/* LEN is now the number of bytes that were copied without conversion.  */
fc932ac6 4757 len = from + STRING_BYTES (XSTRING (str)) - to_byte;
d46c5b12
KH
4758 if (encodep)
4759 str = make_unibyte_string (buf, len + coding->produced);
4760 else
826bfb8b
KH
4761 {
/* LEN is added to the character count as well as to the byte count,
   i.e. every unconverted byte is counted as one character.
   NOTE(review): presumably valid because only ASCII is skipped --
   confirm.  */
4762 int chars= (coding->fake_multibyte
4763 ? multibyte_chars_in_text (buf + from, coding->produced)
4764 : coding->produced_char);
4765 str = make_multibyte_string (buf, len + chars, len + coding->produced);
4766 }
4767
d46c5b12 4768 return str;
4ed46869
KH
4769}
4770
4771\f
4772#ifdef emacs
1397dc18 4773/*** 8. Emacs Lisp library functions ***/
4ed46869 4774
4ed46869
KH
4775DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
4776 "Return t if OBJECT is nil or a coding-system.\n\
3a73fa5d
RS
4777See the documentation of `make-coding-system' for information\n\
4778about coding-system objects.")
4ed46869
KH
4779 (obj)
4780 Lisp_Object obj;
4781{
4608c386
KH
4782 if (NILP (obj))
4783 return Qt;
4784 if (!SYMBOLP (obj))
4785 return Qnil;
4786 /* Get coding-spec vector for OBJ. */
4787 obj = Fget (obj, Qcoding_system);
4788 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
4789 ? Qt : Qnil);
4ed46869
KH
4790}
4791
9d991de8
RS
4792DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
4793 Sread_non_nil_coding_system, 1, 1, 0,
e0e989f6 4794 "Read a coding system from the minibuffer, prompting with string PROMPT.")
4ed46869
KH
4795 (prompt)
4796 Lisp_Object prompt;
4797{
e0e989f6 4798 Lisp_Object val;
9d991de8
RS
4799 do
4800 {
4608c386
KH
4801 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
4802 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
9d991de8
RS
4803 }
4804 while (XSTRING (val)->size == 0);
e0e989f6 4805 return (Fintern (val, Qnil));
4ed46869
KH
4806}
4807
9b787f3e
RS
4808DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
4809 "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
4810If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
4811 (prompt, default_coding_system)
4812 Lisp_Object prompt, default_coding_system;
4ed46869 4813{
f44d27ce 4814 Lisp_Object val;
9b787f3e
RS
4815 if (SYMBOLP (default_coding_system))
4816 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
4608c386 4817 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
9b787f3e
RS
4818 Qt, Qnil, Qcoding_system_history,
4819 default_coding_system, Qnil);
e0e989f6 4820 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
4821}
4822
4823DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
4824 1, 1, 0,
4825 "Check validity of CODING-SYSTEM.\n\
3a73fa5d
RS
4826If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
4827It is valid if it is a symbol with a non-nil `coding-system' property.\n\
4ed46869
KH
4828The value of property should be a vector of length 5.")
4829 (coding_system)
4830 Lisp_Object coding_system;
4831{
4832 CHECK_SYMBOL (coding_system, 0);
4833 if (!NILP (Fcoding_system_p (coding_system)))
4834 return coding_system;
4835 while (1)
02ba4723 4836 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4ed46869 4837}
3a73fa5d 4838\f
d46c5b12
KH
4839Lisp_Object
4840detect_coding_system (src, src_bytes, highest)
4841 unsigned char *src;
4842 int src_bytes, highest;
4ed46869
KH
4843{
4844 int coding_mask, eol_type;
d46c5b12
KH
4845 Lisp_Object val, tmp;
4846 int dummy;
4ed46869 4847
d46c5b12
KH
4848 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy);
4849 eol_type = detect_eol_type (src, src_bytes, &dummy);
4850 if (eol_type == CODING_EOL_INCONSISTENT)
25b02698 4851 eol_type = CODING_EOL_UNDECIDED;
4ed46869 4852
d46c5b12 4853 if (!coding_mask)
4ed46869 4854 {
27901516 4855 val = Qundecided;
d46c5b12 4856 if (eol_type != CODING_EOL_UNDECIDED)
4ed46869 4857 {
f44d27ce
RS
4858 Lisp_Object val2;
4859 val2 = Fget (Qundecided, Qeol_type);
4ed46869
KH
4860 if (VECTORP (val2))
4861 val = XVECTOR (val2)->contents[eol_type];
4862 }
80e803b4 4863 return (highest ? val : Fcons (val, Qnil));
4ed46869 4864 }
4ed46869 4865
d46c5b12
KH
4866 /* At first, gather possible coding systems in VAL. */
4867 val = Qnil;
4868 for (tmp = Vcoding_category_list; !NILP (tmp); tmp = XCONS (tmp)->cdr)
4ed46869 4869 {
d46c5b12
KH
4870 int idx
4871 = XFASTINT (Fget (XCONS (tmp)->car, Qcoding_category_index));
4872 if (coding_mask & (1 << idx))
4ed46869 4873 {
d46c5b12
KH
4874 val = Fcons (Fsymbol_value (XCONS (tmp)->car), val);
4875 if (highest)
4876 break;
4ed46869
KH
4877 }
4878 }
d46c5b12
KH
4879 if (!highest)
4880 val = Fnreverse (val);
4ed46869 4881
65059037 4882 /* Then, replace the elements with subsidiary coding systems. */
d46c5b12 4883 for (tmp = val; !NILP (tmp); tmp = XCONS (tmp)->cdr)
4ed46869 4884 {
65059037
RS
4885 if (eol_type != CODING_EOL_UNDECIDED
4886 && eol_type != CODING_EOL_INCONSISTENT)
4ed46869 4887 {
d46c5b12
KH
4888 Lisp_Object eol;
4889 eol = Fget (XCONS (tmp)->car, Qeol_type);
4890 if (VECTORP (eol))
4891 XCONS (tmp)->car = XVECTOR (eol)->contents[eol_type];
4ed46869
KH
4892 }
4893 }
d46c5b12
KH
4894 return (highest ? XCONS (val)->car : val);
4895}
4ed46869 4896
d46c5b12
KH
4897DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
4898 2, 3, 0,
4899 "Detect coding system of the text in the region between START and END.\n\
4900Return a list of possible coding systems ordered by priority.\n\
4901\n\
80e803b4
KH
4902If only ASCII characters are found, it returns a list of single element\n\
4903`undecided' or its subsidiary coding system according to a detected\n\
4904end-of-line format.\n\
d46c5b12
KH
4905\n\
4906If optional argument HIGHEST is non-nil, return the coding system of\n\
4907highest priority.")
4908 (start, end, highest)
4909 Lisp_Object start, end, highest;
4910{
4911 int from, to;
4912 int from_byte, to_byte;
6289dd10 4913
d46c5b12
KH
4914 CHECK_NUMBER_COERCE_MARKER (start, 0);
4915 CHECK_NUMBER_COERCE_MARKER (end, 1);
4ed46869 4916
d46c5b12
KH
4917 validate_region (&start, &end);
4918 from = XINT (start), to = XINT (end);
4919 from_byte = CHAR_TO_BYTE (from);
4920 to_byte = CHAR_TO_BYTE (to);
6289dd10 4921
d46c5b12
KH
4922 if (from < GPT && to >= GPT)
4923 move_gap_both (to, to_byte);
4ed46869 4924
d46c5b12
KH
4925 return detect_coding_system (BYTE_POS_ADDR (from_byte),
4926 to_byte - from_byte,
4927 !NILP (highest));
4928}
6289dd10 4929
d46c5b12
KH
4930DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
4931 1, 2, 0,
4932 "Detect coding system of the text in STRING.\n\
4933Return a list of possible coding systems ordered by priority.\n\
4934\n\
80e803b4
KH
4935If only ASCII characters are found, it returns a list of single element\n\
4936`undecided' or its subsidiary coding system according to a detected\n\
4937end-of-line format.\n\
d46c5b12
KH
4938\n\
4939If optional argument HIGHEST is non-nil, return the coding system of\n\
4940highest priority.")
4941 (string, highest)
4942 Lisp_Object string, highest;
4943{
4944 CHECK_STRING (string, 0);
4ed46869 4945
d46c5b12 4946 return detect_coding_system (XSTRING (string)->data,
fc932ac6 4947 STRING_BYTES (XSTRING (string)),
d46c5b12 4948 !NILP (highest));
4ed46869
KH
4949}
4950
4031e2bf
KH
4951Lisp_Object
4952code_convert_region1 (start, end, coding_system, encodep)
d46c5b12 4953 Lisp_Object start, end, coding_system;
4031e2bf 4954 int encodep;
3a73fa5d
RS
4955{
4956 struct coding_system coding;
4031e2bf 4957 int from, to, len;
3a73fa5d 4958
d46c5b12
KH
4959 CHECK_NUMBER_COERCE_MARKER (start, 0);
4960 CHECK_NUMBER_COERCE_MARKER (end, 1);
3a73fa5d
RS
4961 CHECK_SYMBOL (coding_system, 2);
4962
d46c5b12
KH
4963 validate_region (&start, &end);
4964 from = XFASTINT (start);
4965 to = XFASTINT (end);
4966
3a73fa5d 4967 if (NILP (coding_system))
d46c5b12
KH
4968 return make_number (to - from);
4969
3a73fa5d 4970 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
d46c5b12 4971 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
3a73fa5d 4972
d46c5b12 4973 coding.mode |= CODING_MODE_LAST_BLOCK;
fb88bf2d
KH
4974 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
4975 &coding, encodep, 1);
f072a3e8 4976 Vlast_coding_system_used = coding.symbol;
fb88bf2d 4977 return make_number (coding.produced_char);
4031e2bf
KH
4978}
4979
4980DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
4981 3, 3, "r\nzCoding system: ",
4982 "Decode the current region by specified coding system.\n\
4983When called from a program, takes three arguments:\n\
4984START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
f072a3e8
RS
4985This function sets `last-coding-system-used' to the precise coding system\n\
4986used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
4987not fully specified.)\n\
4988It returns the length of the decoded text.")
4031e2bf
KH
4989 (start, end, coding_system)
4990 Lisp_Object start, end, coding_system;
4991{
4992 return code_convert_region1 (start, end, coding_system, 0);
3a73fa5d
RS
4993}
4994
4995DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
4996 3, 3, "r\nzCoding system: ",
d46c5b12 4997 "Encode the current region by specified coding system.\n\
3a73fa5d 4998When called from a program, takes three arguments:\n\
d46c5b12 4999START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
f072a3e8
RS
5000This function sets `last-coding-system-used' to the precise coding system\n\
5001used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5002not fully specified.)\n\
5003It returns the length of the encoded text.")
d46c5b12
KH
5004 (start, end, coding_system)
5005 Lisp_Object start, end, coding_system;
3a73fa5d 5006{
4031e2bf
KH
5007 return code_convert_region1 (start, end, coding_system, 1);
5008}
3a73fa5d 5009
4031e2bf
KH
5010Lisp_Object
5011code_convert_string1 (string, coding_system, nocopy, encodep)
5012 Lisp_Object string, coding_system, nocopy;
5013 int encodep;
5014{
5015 struct coding_system coding;
3a73fa5d 5016
4031e2bf
KH
5017 CHECK_STRING (string, 0);
5018 CHECK_SYMBOL (coding_system, 1);
4ed46869 5019
d46c5b12 5020 if (NILP (coding_system))
4031e2bf 5021 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869 5022
d46c5b12
KH
5023 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5024 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5f1cd180 5025
d46c5b12 5026 coding.mode |= CODING_MODE_LAST_BLOCK;
f072a3e8 5027 Vlast_coding_system_used = coding.symbol;
4031e2bf 5028 return code_convert_string (string, &coding, encodep, !NILP (nocopy));
4ed46869
KH
5029}
5030
4ed46869 5031DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
e0e989f6
KH
5032 2, 3, 0,
5033 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
fe487a71 5034Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
f072a3e8
RS
5035if the decoding operation is trivial.\n\
5036This function sets `last-coding-system-used' to the precise coding system\n\
5037used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5038not fully specified.)")
e0e989f6
KH
5039 (string, coding_system, nocopy)
5040 Lisp_Object string, coding_system, nocopy;
4ed46869 5041{
f072a3e8 5042 return code_convert_string1 (string, coding_system, nocopy, 0);
4ed46869
KH
5043}
5044
5045DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
e0e989f6
KH
5046 2, 3, 0,
5047 "Encode STRING to CODING-SYSTEM, and return the result.\n\
fe487a71 5048Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
f072a3e8
RS
5049if the encoding operation is trivial.\n\
5050This function sets `last-coding-system-used' to the precise coding system\n\
5051used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5052not fully specified.)")
e0e989f6
KH
5053 (string, coding_system, nocopy)
5054 Lisp_Object string, coding_system, nocopy;
4ed46869 5055{
f072a3e8 5056 return code_convert_string1 (string, coding_system, nocopy, 1);
4ed46869 5057}
4031e2bf 5058
ecec61c1
KH
5059/* Encode or decode STRING according to CODING_SYSTEM.
5060 Do not set Vlast_coding_system_used. */
5061
5062Lisp_Object
5063code_convert_string_norecord (string, coding_system, encodep)
5064 Lisp_Object string, coding_system;
5065 int encodep;
5066{
5067 struct coding_system coding;
5068
5069 CHECK_STRING (string, 0);
5070 CHECK_SYMBOL (coding_system, 1);
5071
5072 if (NILP (coding_system))
5073 return string;
5074
5075 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5076 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5077
5078 coding.mode |= CODING_MODE_LAST_BLOCK;
5079 return code_convert_string (string, &coding, encodep, Qt);
5080}
3a73fa5d 5081\f
4ed46869 5082DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
55ab7be3 5083 "Decode a Japanese character which has CODE in shift_jis encoding.\n\
4ed46869
KH
5084Return the corresponding character.")
5085 (code)
5086 Lisp_Object code;
5087{
5088 unsigned char c1, c2, s1, s2;
5089 Lisp_Object val;
5090
5091 CHECK_NUMBER (code, 0);
5092 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
55ab7be3
KH
5093 if (s1 == 0)
5094 {
c28a9453
KH
5095 if (s2 < 0x80)
5096 XSETFASTINT (val, s2);
5097 else if (s2 >= 0xA0 || s2 <= 0xDF)
5098 XSETFASTINT (val,
5099 MAKE_NON_ASCII_CHAR (charset_katakana_jisx0201, s2, 0));
5100 else
9da8350f 5101 error ("Invalid Shift JIS code: %x", XFASTINT (code));
55ab7be3
KH
5102 }
5103 else
5104 {
5105 if ((s1 < 0x80 || s1 > 0x9F && s1 < 0xE0 || s1 > 0xEF)
5106 || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
9da8350f 5107 error ("Invalid Shift JIS code: %x", XFASTINT (code));
55ab7be3
KH
5108 DECODE_SJIS (s1, s2, c1, c2);
5109 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
5110 }
4ed46869
KH
5111 return val;
5112}
5113
5114DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
55ab7be3
KH
5115 "Encode a Japanese character CHAR to shift_jis encoding.\n\
5116Return the corresponding code in SJIS.")
4ed46869
KH
5117 (ch)
5118 Lisp_Object ch;
5119{
bcf26d6a 5120 int charset, c1, c2, s1, s2;
4ed46869
KH
5121 Lisp_Object val;
5122
5123 CHECK_NUMBER (ch, 0);
5124 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
c28a9453
KH
5125 if (charset == CHARSET_ASCII)
5126 {
5127 val = ch;
5128 }
5129 else if (charset == charset_jisx0208
5130 && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
4ed46869
KH
5131 {
5132 ENCODE_SJIS (c1, c2, s1, s2);
bcf26d6a 5133 XSETFASTINT (val, (s1 << 8) | s2);
4ed46869 5134 }
55ab7be3
KH
5135 else if (charset == charset_katakana_jisx0201
5136 && c1 > 0x20 && c2 < 0xE0)
5137 {
5138 XSETFASTINT (val, c1 | 0x80);
5139 }
4ed46869 5140 else
55ab7be3 5141 error ("Can't encode to shift_jis: %d", XFASTINT (ch));
4ed46869
KH
5142 return val;
5143}
5144
5145DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
c28a9453 5146 "Decode a Big5 character which has CODE in BIG5 coding system.\n\
4ed46869
KH
5147Return the corresponding character.")
5148 (code)
5149 Lisp_Object code;
5150{
5151 int charset;
5152 unsigned char b1, b2, c1, c2;
5153 Lisp_Object val;
5154
5155 CHECK_NUMBER (code, 0);
5156 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
c28a9453
KH
5157 if (b1 == 0)
5158 {
5159 if (b2 >= 0x80)
9da8350f 5160 error ("Invalid BIG5 code: %x", XFASTINT (code));
c28a9453
KH
5161 val = code;
5162 }
5163 else
5164 {
5165 if ((b1 < 0xA1 || b1 > 0xFE)
5166 || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
9da8350f 5167 error ("Invalid BIG5 code: %x", XFASTINT (code));
c28a9453
KH
5168 DECODE_BIG5 (b1, b2, charset, c1, c2);
5169 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
5170 }
4ed46869
KH
5171 return val;
5172}
5173
5174DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
d46c5b12 5175 "Encode the Big5 character CHAR to BIG5 coding system.\n\
4ed46869
KH
5176Return the corresponding character code in Big5.")
5177 (ch)
5178 Lisp_Object ch;
5179{
bcf26d6a 5180 int charset, c1, c2, b1, b2;
4ed46869
KH
5181 Lisp_Object val;
5182
5183 CHECK_NUMBER (ch, 0);
5184 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
c28a9453
KH
5185 if (charset == CHARSET_ASCII)
5186 {
5187 val = ch;
5188 }
5189 else if ((charset == charset_big5_1
5190 && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
5191 || (charset == charset_big5_2
5192 && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
4ed46869
KH
5193 {
5194 ENCODE_BIG5 (charset, c1, c2, b1, b2);
bcf26d6a 5195 XSETFASTINT (val, (b1 << 8) | b2);
4ed46869
KH
5196 }
5197 else
c28a9453 5198 error ("Can't encode to Big5: %d", XFASTINT (ch));
4ed46869
KH
5199 return val;
5200}
3a73fa5d 5201\f
1ba9e4ab
KH
5202DEFUN ("set-terminal-coding-system-internal",
5203 Fset_terminal_coding_system_internal,
5204 Sset_terminal_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
5205 (coding_system)
5206 Lisp_Object coding_system;
5207{
5208 CHECK_SYMBOL (coding_system, 0);
5209 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
70c22245 5210 /* We had better not send unsafe characters to terminal. */
6e85d753
KH
5211 terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
5212
4ed46869
KH
5213 return Qnil;
5214}
5215
c4825358
KH
5216DEFUN ("set-safe-terminal-coding-system-internal",
5217 Fset_safe_terminal_coding_system_internal,
5218 Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
5219 (coding_system)
5220 Lisp_Object coding_system;
5221{
5222 CHECK_SYMBOL (coding_system, 0);
5223 setup_coding_system (Fcheck_coding_system (coding_system),
5224 &safe_terminal_coding);
5225 return Qnil;
5226}
5227
4ed46869
KH
5228DEFUN ("terminal-coding-system",
5229 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3a73fa5d 5230 "Return coding system specified for terminal output.")
4ed46869
KH
5231 ()
5232{
5233 return terminal_coding.symbol;
5234}
5235
1ba9e4ab
KH
5236DEFUN ("set-keyboard-coding-system-internal",
5237 Fset_keyboard_coding_system_internal,
5238 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
5239 (coding_system)
5240 Lisp_Object coding_system;
5241{
5242 CHECK_SYMBOL (coding_system, 0);
5243 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
5244 return Qnil;
5245}
5246
5247DEFUN ("keyboard-coding-system",
5248 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3a73fa5d 5249 "Return coding system specified for decoding keyboard input.")
4ed46869
KH
5250 ()
5251{
5252 return keyboard_coding.symbol;
5253}
5254
5255\f
a5d301df
KH
5256DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
5257 Sfind_operation_coding_system, 1, MANY, 0,
5258 "Choose a coding system for an operation based on the target name.\n\
69f76525 5259The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
9ce27fde
KH
5260DECODING-SYSTEM is the coding system to use for decoding\n\
5261\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
5262for encoding (in case OPERATION does encoding).\n\
ccdb79f5
RS
5263\n\
5264The first argument OPERATION specifies an I/O primitive:\n\
5265 For file I/O, `insert-file-contents' or `write-region'.\n\
5266 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
5267 For network I/O, `open-network-stream'.\n\
5268\n\
5269The remaining arguments should be the same arguments that were passed\n\
5270to the primitive. Depending on which primitive, one of those arguments\n\
5271is selected as the TARGET. For example, if OPERATION does file I/O,\n\
5272whichever argument specifies the file name is TARGET.\n\
5273\n\
5274TARGET has a meaning which depends on OPERATION:\n\
4ed46869
KH
5275 For file I/O, TARGET is a file name.\n\
5276 For process I/O, TARGET is a process name.\n\
5277 For network I/O, TARGET is a service name or a port number\n\
5278\n\
02ba4723
KH
5279This function looks up what specified for TARGET in,\n\
5280`file-coding-system-alist', `process-coding-system-alist',\n\
5281or `network-coding-system-alist' depending on OPERATION.\n\
5282They may specify a coding system, a cons of coding systems,\n\
5283or a function symbol to call.\n\
5284In the last case, we call the function with one argument,\n\
9ce27fde 5285which is a list of all the arguments given to this function.")
4ed46869
KH
5286 (nargs, args)
5287 int nargs;
5288 Lisp_Object *args;
5289{
5290 Lisp_Object operation, target_idx, target, val;
5291 register Lisp_Object chain;
5292
5293 if (nargs < 2)
5294 error ("Too few arguments");
5295 operation = args[0];
5296 if (!SYMBOLP (operation)
5297 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
5298 error ("Invalid first arguement");
5299 if (nargs < 1 + XINT (target_idx))
5300 error ("Too few arguments for operation: %s",
5301 XSYMBOL (operation)->name->data);
5302 target = args[XINT (target_idx) + 1];
5303 if (!(STRINGP (target)
5304 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
5305 error ("Invalid %dth argument", XINT (target_idx) + 1);
5306
2e34157c
RS
5307 chain = ((EQ (operation, Qinsert_file_contents)
5308 || EQ (operation, Qwrite_region))
02ba4723 5309 ? Vfile_coding_system_alist
2e34157c 5310 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
5311 ? Vnetwork_coding_system_alist
5312 : Vprocess_coding_system_alist));
4ed46869
KH
5313 if (NILP (chain))
5314 return Qnil;
5315
02ba4723 5316 for (; CONSP (chain); chain = XCONS (chain)->cdr)
4ed46869 5317 {
f44d27ce
RS
5318 Lisp_Object elt;
5319 elt = XCONS (chain)->car;
4ed46869
KH
5320
5321 if (CONSP (elt)
5322 && ((STRINGP (target)
5323 && STRINGP (XCONS (elt)->car)
5324 && fast_string_match (XCONS (elt)->car, target) >= 0)
5325 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
02ba4723
KH
5326 {
5327 val = XCONS (elt)->cdr;
b19fd4c5
KH
5328 /* Here, if VAL is both a valid coding system and a valid
5329 function symbol, we return VAL as a coding system. */
02ba4723
KH
5330 if (CONSP (val))
5331 return val;
5332 if (! SYMBOLP (val))
5333 return Qnil;
5334 if (! NILP (Fcoding_system_p (val)))
5335 return Fcons (val, val);
b19fd4c5
KH
5336 if (! NILP (Ffboundp (val)))
5337 {
5338 val = call1 (val, Flist (nargs, args));
5339 if (CONSP (val))
5340 return val;
5341 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
5342 return Fcons (val, val);
5343 }
02ba4723
KH
5344 return Qnil;
5345 }
4ed46869
KH
5346 }
5347 return Qnil;
5348}
5349
1397dc18
KH
5350DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal,
5351 Supdate_coding_systems_internal, 0, 0, 0,
5352 "Update internal database for ISO2022 and CCL based coding systems.\n\
d46c5b12
KH
5353When values of the following coding categories are changed, you must\n\
5354call this function:\n\
5355 coding-category-iso-7, coding-category-iso-7-tight,\n\
5356 coding-category-iso-8-1, coding-category-iso-8-2,\n\
1397dc18
KH
5357 coding-category-iso-7-else, coding-category-iso-8-else,\n\
5358 coding-category-ccl")
d46c5b12
KH
5359 ()
5360{
5361 int i;
5362
1397dc18 5363 for (i = CODING_CATEGORY_IDX_ISO_7; i <= CODING_CATEGORY_IDX_CCL; i++)
d46c5b12 5364 {
1397dc18
KH
5365 Lisp_Object val;
5366
5367 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value;
5368 if (!NILP (val))
5369 {
5370 if (! coding_system_table[i])
5371 coding_system_table[i] = ((struct coding_system *)
5372 xmalloc (sizeof (struct coding_system)));
5373 setup_coding_system (val, coding_system_table[i]);
5374 }
5375 else if (coding_system_table[i])
5376 {
5377 xfree (coding_system_table[i]);
5378 coding_system_table[i] = NULL;
5379 }
d46c5b12 5380 }
1397dc18 5381
d46c5b12
KH
5382 return Qnil;
5383}
5384
66cfb530
KH
5385DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
5386 Sset_coding_priority_internal, 0, 0, 0,
5387 "Update internal database for the current value of `coding-category-list'.\n\
5388This function is internal use only.")
5389 ()
5390{
5391 int i = 0, idx;
84d60297
RS
5392 Lisp_Object val;
5393
5394 val = Vcoding_category_list;
66cfb530
KH
5395
5396 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
5397 {
5398 if (! SYMBOLP (XCONS (val)->car))
5399 break;
5400 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
5401 if (idx >= CODING_CATEGORY_IDX_MAX)
5402 break;
5403 coding_priorities[i++] = (1 << idx);
5404 val = XCONS (val)->cdr;
5405 }
5406 /* If coding-category-list is valid and contains all coding
5407 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
5408 the following code saves Emacs from craching. */
5409 while (i < CODING_CATEGORY_IDX_MAX)
5410 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
5411
5412 return Qnil;
5413}
5414
4ed46869
KH
5415#endif /* emacs */
5416
5417\f
1397dc18 5418/*** 9. Post-amble ***/
4ed46869 5419
6d74c3aa
KH
5420void
5421init_coding ()
5422{
5423 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
5424}
5425
dfcf069d 5426void
4ed46869
KH
5427init_coding_once ()
5428{
5429 int i;
5430
0ef69138 5431 /* Emacs' internal format specific initialize routine. */
4ed46869
KH
5432 for (i = 0; i <= 0x20; i++)
5433 emacs_code_class[i] = EMACS_control_code;
5434 emacs_code_class[0x0A] = EMACS_linefeed_code;
5435 emacs_code_class[0x0D] = EMACS_carriage_return_code;
5436 for (i = 0x21 ; i < 0x7F; i++)
5437 emacs_code_class[i] = EMACS_ascii_code;
5438 emacs_code_class[0x7F] = EMACS_control_code;
5439 emacs_code_class[0x80] = EMACS_leading_code_composition;
5440 for (i = 0x81; i < 0xFF; i++)
5441 emacs_code_class[i] = EMACS_invalid_code;
5442 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
5443 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
5444 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
5445 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
5446
5447 /* ISO2022 specific initialize routine. */
5448 for (i = 0; i < 0x20; i++)
5449 iso_code_class[i] = ISO_control_code;
5450 for (i = 0x21; i < 0x7F; i++)
5451 iso_code_class[i] = ISO_graphic_plane_0;
5452 for (i = 0x80; i < 0xA0; i++)
5453 iso_code_class[i] = ISO_control_code;
5454 for (i = 0xA1; i < 0xFF; i++)
5455 iso_code_class[i] = ISO_graphic_plane_1;
5456 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
5457 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
5458 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
5459 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
5460 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
5461 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
5462 iso_code_class[ISO_CODE_ESC] = ISO_escape;
5463 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
5464 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
5465 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
5466
e0e989f6 5467 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
e0e989f6
KH
5468
5469 setup_coding_system (Qnil, &keyboard_coding);
5470 setup_coding_system (Qnil, &terminal_coding);
c4825358 5471 setup_coding_system (Qnil, &safe_terminal_coding);
6bc51348 5472 setup_coding_system (Qnil, &default_buffer_file_coding);
9ce27fde 5473
d46c5b12
KH
5474 bzero (coding_system_table, sizeof coding_system_table);
5475
66cfb530
KH
5476 bzero (ascii_skip_code, sizeof ascii_skip_code);
5477 for (i = 0; i < 128; i++)
5478 ascii_skip_code[i] = 1;
5479
9ce27fde
KH
5480#if defined (MSDOS) || defined (WINDOWSNT)
5481 system_eol_type = CODING_EOL_CRLF;
5482#else
5483 system_eol_type = CODING_EOL_LF;
5484#endif
e0e989f6
KH
5485}
5486
5487#ifdef emacs
5488
/* Set up the Lisp-visible side of this file.  In order, this function:
   - interns and staticpros the symbols used from C, recording a
     `target-idx' property on the I/O primitives' symbols;
   - registers `coding-system-error' with its error conditions and
     message;
   - builds Vcoding_category_table, one symbol per coding category,
     each tagged with its index;
   - registers the subrs with defsubr;
   - defines the Lisp variables (DEFVAR_LISP / DEFVAR_BOOL) and gives
     each its initial value.
   The order matters in places: Vcoding_category_table must be filled
   before the initial value of `coding-category-list' is built from it.  */
dfcf069d 5489void
e0e989f6
KH
5490syms_of_coding ()
5491{
5492 Qtarget_idx = intern ("target-idx");
5493 staticpro (&Qtarget_idx);
5494
bb0115a2
RS
5495 Qcoding_system_history = intern ("coding-system-history");
5496 staticpro (&Qcoding_system_history);
5497 Fset (Qcoding_system_history, Qnil);
5498
9ce27fde 5499 /* Target FILENAME is the first argument. */
e0e989f6 5500 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 5501 /* Target FILENAME is the third argument. */
e0e989f6
KH
5502 Fput (Qwrite_region, Qtarget_idx, make_number (2));
5503
5504 Qcall_process = intern ("call-process");
5505 staticpro (&Qcall_process);
9ce27fde 5506 /* Target PROGRAM is the first argument. */
e0e989f6
KH
5507 Fput (Qcall_process, Qtarget_idx, make_number (0));
5508
5509 Qcall_process_region = intern ("call-process-region");
5510 staticpro (&Qcall_process_region);
9ce27fde 5511 /* Target PROGRAM is the third argument. */
e0e989f6
KH
5512 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
5513
5514 Qstart_process = intern ("start-process");
5515 staticpro (&Qstart_process);
9ce27fde 5516 /* Target PROGRAM is the third argument. */
e0e989f6
KH
5517 Fput (Qstart_process, Qtarget_idx, make_number (2));
5518
5519 Qopen_network_stream = intern ("open-network-stream");
5520 staticpro (&Qopen_network_stream);
9ce27fde 5521 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
5522 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
5523
  /* Intern and staticpro the remaining symbols this file refers to
     from C.  */
4ed46869
KH
5524 Qcoding_system = intern ("coding-system");
5525 staticpro (&Qcoding_system);
5526
5527 Qeol_type = intern ("eol-type");
5528 staticpro (&Qeol_type);
5529
5530 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
5531 staticpro (&Qbuffer_file_coding_system);
5532
5533 Qpost_read_conversion = intern ("post-read-conversion");
5534 staticpro (&Qpost_read_conversion);
5535
5536 Qpre_write_conversion = intern ("pre-write-conversion");
5537 staticpro (&Qpre_write_conversion);
5538
27901516
KH
5539 Qno_conversion = intern ("no-conversion");
5540 staticpro (&Qno_conversion);
5541
5542 Qundecided = intern ("undecided");
5543 staticpro (&Qundecided);
5544
4ed46869
KH
5545 Qcoding_system_p = intern ("coding-system-p");
5546 staticpro (&Qcoding_system_p);
5547
5548 Qcoding_system_error = intern ("coding-system-error");
5549 staticpro (&Qcoding_system_error);
5550
  /* Give `coding-system-error' the condition list
     (coding-system-error error) and its message string.  */
5551 Fput (Qcoding_system_error, Qerror_conditions,
5552 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
5553 Fput (Qcoding_system_error, Qerror_message,
9ce27fde 5554 build_string ("Invalid coding system"));
4ed46869 5555
d46c5b12
KH
5556 Qcoding_category = intern ("coding-category");
5557 staticpro (&Qcoding_category);
4ed46869
KH
5558 Qcoding_category_index = intern ("coding-category-index");
5559 staticpro (&Qcoding_category_index);
5560
  /* Build a vector of CODING_CATEGORY_IDX_MAX symbols, one per entry
     of coding_category_name; each symbol records its own position
     under the `coding-category-index' property.  */
d46c5b12
KH
5561 Vcoding_category_table
5562 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
5563 staticpro (&Vcoding_category_table);
4ed46869
KH
5564 {
5565 int i;
5566 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
5567 {
d46c5b12
KH
5568 XVECTOR (Vcoding_category_table)->contents[i]
5569 = intern (coding_category_name[i]);
5570 Fput (XVECTOR (Vcoding_category_table)->contents[i],
5571 Qcoding_category_index, make_number (i));
4ed46869
KH
5572 }
5573 }
5574
f967223b
KH
5575 Qtranslation_table = intern ("translation-table");
5576 staticpro (&Qtranslation_table);
1397dc18 5577 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
bdd9fb48 5578
f967223b
KH
5579 Qtranslation_table_id = intern ("translation-table-id");
5580 staticpro (&Qtranslation_table_id);
84fbb8a0 5581
f967223b
KH
5582 Qtranslation_table_for_decode = intern ("translation-table-for-decode");
5583 staticpro (&Qtranslation_table_for_decode);
a5d301df 5584
f967223b
KH
5585 Qtranslation_table_for_encode = intern ("translation-table-for-encode");
5586 staticpro (&Qtranslation_table_for_encode);
a5d301df 5587
70c22245
KH
5588 Qsafe_charsets = intern ("safe-charsets");
5589 staticpro (&Qsafe_charsets);
5590
1397dc18
KH
5591 Qvalid_codes = intern ("valid-codes");
5592 staticpro (&Qvalid_codes);
5593
9ce27fde
KH
5594 Qemacs_mule = intern ("emacs-mule");
5595 staticpro (&Qemacs_mule);
5596
d46c5b12
KH
5597 Qraw_text = intern ("raw-text");
5598 staticpro (&Qraw_text);
5599
  /* Register the Lisp primitives (subrs).  */
4ed46869
KH
5600 defsubr (&Scoding_system_p);
5601 defsubr (&Sread_coding_system);
5602 defsubr (&Sread_non_nil_coding_system);
5603 defsubr (&Scheck_coding_system);
5604 defsubr (&Sdetect_coding_region);
d46c5b12 5605 defsubr (&Sdetect_coding_string);
4ed46869
KH
5606 defsubr (&Sdecode_coding_region);
5607 defsubr (&Sencode_coding_region);
5608 defsubr (&Sdecode_coding_string);
5609 defsubr (&Sencode_coding_string);
5610 defsubr (&Sdecode_sjis_char);
5611 defsubr (&Sencode_sjis_char);
5612 defsubr (&Sdecode_big5_char);
5613 defsubr (&Sencode_big5_char);
1ba9e4ab 5614 defsubr (&Sset_terminal_coding_system_internal);
c4825358 5615 defsubr (&Sset_safe_terminal_coding_system_internal);
4ed46869 5616 defsubr (&Sterminal_coding_system);
1ba9e4ab 5617 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 5618 defsubr (&Skeyboard_coding_system);
a5d301df 5619 defsubr (&Sfind_operation_coding_system);
1397dc18 5620 defsubr (&Supdate_coding_systems_internal);
66cfb530 5621 defsubr (&Sset_coding_priority_internal);
4ed46869 5622
  /* Lisp variables and their initial values follow.  */
4608c386
KH
5623 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
5624 "List of coding systems.\n\
5625\n\
5626Do not alter the value of this variable manually. This variable should be\n\
5627updated by the functions `make-coding-system' and\n\
5628`define-coding-system-alias'.");
5629 Vcoding_system_list = Qnil;
5630
5631 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
5632 "Alist of coding system names.\n\
5633Each element is one element list of coding system name.\n\
5634This variable is given to `completing-read' as TABLE argument.\n\
5635\n\
5636Do not alter the value of this variable manually. This variable should be\n\
5637updated by the functions `make-coding-system' and\n\
5638`define-coding-system-alias'.");
5639 Vcoding_system_alist = Qnil;
5640
  /* The initial list holds every category symbol in table order:
     consing from the highest index down leaves index 0 at the head.  */
4ed46869
KH
5641 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
5642 "List of coding-categories (symbols) ordered by priority.");
5643 {
5644 int i;
5645
5646 Vcoding_category_list = Qnil;
5647 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
5648 Vcoding_category_list
d46c5b12
KH
5649 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
5650 Vcoding_category_list);
4ed46869
KH
5651 }
5652
5653 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
10bff6f1 5654 "Specify the coding system for read operations.\n\
2ebb362d 5655It is useful to bind this variable with `let', but do not set it globally.\n\
4ed46869 5656If the value is a coding system, it is used for decoding on read operation.\n\
a67a9c66 5657If not, an appropriate element is used from one of the coding system alists:\n\
10bff6f1 5658There are three such tables, `file-coding-system-alist',\n\
a67a9c66 5659`process-coding-system-alist', and `network-coding-system-alist'.");
4ed46869
KH
5660 Vcoding_system_for_read = Qnil;
5661
5662 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
10bff6f1 5663 "Specify the coding system for write operations.\n\
2ebb362d 5664It is useful to bind this variable with `let', but do not set it globally.\n\
4ed46869 5665If the value is a coding system, it is used for encoding on write operation.\n\
a67a9c66 5666If not, an appropriate element is used from one of the coding system alists:\n\
10bff6f1 5667There are three such tables, `file-coding-system-alist',\n\
a67a9c66 5668`process-coding-system-alist', and `network-coding-system-alist'.");
4ed46869
KH
5669 Vcoding_system_for_write = Qnil;
5670
5671 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
a67a9c66 5672 "Coding system used in the latest file or process I/O.");
4ed46869
KH
5673 Vlast_coding_system_used = Qnil;
5674
9ce27fde 5675 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
f07f4a24 5676 "*Non-nil means always inhibit code conversion of end-of-line format.\n\
94c7a214
DL
5677See info node `Coding Systems' and info node `Text and Binary' concerning\n\
5678such conversion.");
9ce27fde
KH
5679 inhibit_eol_conversion = 0;
5680
ed29121d
EZ
5681 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
5682 "Non-nil means process buffer inherits coding system of process output.\n\
5683Bind it to t if the process output is to be treated as if it were a file\n\
5684read from some filesystem.");
5685 inherit_process_coding_system = 0;
5686
02ba4723
KH
5687 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
5688 "Alist to decide a coding system to use for a file I/O operation.\n\
5689The format is ((PATTERN . VAL) ...),\n\
5690where PATTERN is a regular expression matching a file name,\n\
5691VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5692If VAL is a coding system, it is used for both decoding and encoding\n\
5693the file contents.\n\
5694If VAL is a cons of coding systems, the car part is used for decoding,\n\
5695and the cdr part is used for encoding.\n\
5696If VAL is a function symbol, the function must return a coding system\n\
5697or a cons of coding systems which are used as above.\n\
e0e989f6 5698\n\
a85a871a 5699See also the function `find-operation-coding-system'\n\
eda284ac 5700and the variable `auto-coding-alist'.");
02ba4723
KH
5701 Vfile_coding_system_alist = Qnil;
5702
5703 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
5704 "Alist to decide a coding system to use for a process I/O operation.\n\
5705The format is ((PATTERN . VAL) ...),\n\
5706where PATTERN is a regular expression matching a program name,\n\
5707VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5708If VAL is a coding system, it is used for both decoding what received\n\
5709from the program and encoding what sent to the program.\n\
5710If VAL is a cons of coding systems, the car part is used for decoding,\n\
5711and the cdr part is used for encoding.\n\
5712If VAL is a function symbol, the function must return a coding system\n\
5713or a cons of coding systems which are used as above.\n\
4ed46869 5714\n\
9ce27fde 5715See also the function `find-operation-coding-system'.");
02ba4723
KH
5716 Vprocess_coding_system_alist = Qnil;
5717
5718 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
5719 "Alist to decide a coding system to use for a network I/O operation.\n\
5720The format is ((PATTERN . VAL) ...),\n\
5721where PATTERN is a regular expression matching a network service name\n\
5722or is a port number to connect to,\n\
5723VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5724If VAL is a coding system, it is used for both decoding what received\n\
5725from the network stream and encoding what sent to the network stream.\n\
5726If VAL is a cons of coding systems, the car part is used for decoding,\n\
5727and the cdr part is used for encoding.\n\
5728If VAL is a function symbol, the function must return a coding system\n\
5729or a cons of coding systems which are used as above.\n\
4ed46869 5730\n\
9ce27fde 5731See also the function `find-operation-coding-system'.");
02ba4723 5732 Vnetwork_coding_system_alist = Qnil;
4ed46869 5733
  /* Mode-line mnemonics for the end-of-line formats.  Note that the
     UNIX-like and undecided formats are both initialized to ":".  */
7722baf9
EZ
5734 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
5735 "*String displayed in mode line for UNIX-like (LF) end-of-line format.");
5736 eol_mnemonic_unix = build_string (":");
4ed46869 5737
7722baf9
EZ
5738 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
5739 "*String displayed in mode line for DOS-like (CRLF) end-of-line format.");
5740 eol_mnemonic_dos = build_string ("\\");
4ed46869 5741
7722baf9
EZ
5742 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
5743 "*String displayed in mode line for MAC-like (CR) end-of-line format.");
5744 eol_mnemonic_mac = build_string ("/");
4ed46869 5745
7722baf9
EZ
5746 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
5747 "*String displayed in mode line when end-of-line format is not yet determined.");
5748 eol_mnemonic_undecided = build_string (":");
4ed46869 5749
84fbb8a0 5750 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
f967223b 5751 "*Non-nil enables character translation while encoding and decoding.");
84fbb8a0 5752 Venable_character_translation = Qt;
bdd9fb48 5753
f967223b
KH
5754 DEFVAR_LISP ("standard-translation-table-for-decode",
5755 &Vstandard_translation_table_for_decode,
84fbb8a0 5756 "Table for translating characters while decoding.");
f967223b 5757 Vstandard_translation_table_for_decode = Qnil;
bdd9fb48 5758
f967223b
KH
5759 DEFVAR_LISP ("standard-translation-table-for-encode",
5760 &Vstandard_translation_table_for_encode,
84fbb8a0 5761 "Table for translationg characters while encoding.");
f967223b 5762 Vstandard_translation_table_for_encode = Qnil;
4ed46869
KH
5763
  /* NOTE: the Lisp name ends in `-table' while the C variable is
     named ..._alist and the docstring describes an alist.  */
5764 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
5765 "Alist of charsets vs revision numbers.\n\
5766While encoding, if a charset (car part of an element) is found,\n\
5767designate it with the escape sequence identifing revision (cdr part of the element).");
5768 Vcharset_revision_alist = Qnil;
02ba4723
KH
5769
5770 DEFVAR_LISP ("default-process-coding-system",
5771 &Vdefault_process_coding_system,
5772 "Cons of coding systems used for process I/O by default.\n\
5773The car part is used for decoding a process output,\n\
5774the cdr part is used for encoding a text to be sent to a process.");
5775 Vdefault_process_coding_system = Qnil;
c4825358 5776
3f003981
KH
5777 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
5778 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
c4825358
KH
5779This is a vector of length 256.\n\
5780If Nth element is non-nil, the existence of code N in a file\n\
bb0115a2 5781\(or output of subprocess) doesn't prevent it to be detected as\n\
3f003981
KH
5782a coding system of ISO 2022 variant which has a flag\n\
5783`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
c4825358
KH
5784or reading output of a subprocess.\n\
5785Only 128th through 159th elements has a meaning.");
3f003981 5786 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
d46c5b12
KH
5787
  /* NOTE(review): the docstring names `select-safe-coding-system' as
     the default, but the value assigned here is nil; presumably the
     documented default is installed elsewhere -- TODO confirm.  */
5788 DEFVAR_LISP ("select-safe-coding-system-function",
5789 &Vselect_safe_coding_system_function,
5790 "Function to call to select safe coding system for encoding a text.\n\
5791\n\
5792If set, this function is called to force a user to select a proper\n\
5793coding system which can encode the text in the case that a default\n\
5794coding system used in each operation can't encode the text.\n\
5795\n\
a85a871a 5796The default value is `select-safe-coding-system' (which see).");
d46c5b12
KH
5797 Vselect_safe_coding_system_function = Qnil;
5798
4ed46869
KH
5799}
5800
5801#endif /* emacs */