(code_convert_string): Add record_unwind_protect to
[bpt/emacs.git] / src / coding.c
CommitLineData
4ed46869 1/* Coding system handler (conversion, detection, and etc).
4a2f9c6a 2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
203cb916 3 Licensed to the Free Software Foundation.
4ed46869 4
369314dc
KH
5This file is part of GNU Emacs.
6
7GNU Emacs is free software; you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation; either version 2, or (at your option)
10any later version.
4ed46869 11
369314dc
KH
12GNU Emacs is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
4ed46869 16
369314dc
KH
17You should have received a copy of the GNU General Public License
18along with GNU Emacs; see the file COPYING. If not, write to
19the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20Boston, MA 02111-1307, USA. */
4ed46869
KH
21
22/*** TABLE OF CONTENTS ***
23
24 1. Preamble
0ef69138 25 2. Emacs' internal format (emacs-mule) handlers
4ed46869
KH
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
1397dc18
KH
28 5. CCL handlers
29 6. End-of-line handlers
30 7. C library functions
31 8. Emacs Lisp library functions
32 9. Post-amble
4ed46869
KH
33
34*/
35
36/*** GENERAL NOTE on CODING SYSTEM ***
37
38 Coding system is an encoding mechanism of one or more character
39 sets. Here's a list of coding systems which Emacs can handle. When
40 we say "decode", it means converting some other coding system to
0ef69138
KH
41 Emacs' internal format (emacs-mule), and when we say "encode",
42 it means converting the coding system emacs-mule to some other
43 coding system.
4ed46869 44
0ef69138 45 0. Emacs' internal format (emacs-mule)
4ed46869
KH
46
47 Emacs itself holds a multi-lingual character in a buffer and a string
f4dee582 48 in a special format. Details are described in section 2.
4ed46869
KH
49
50 1. ISO2022
51
52 The most famous coding system for multiple character sets. X's
f4dee582
RS
53 Compound Text, various EUCs (Extended Unix Code), and coding
54 systems used in Internet communication such as ISO-2022-JP are
55 all variants of ISO2022. Details are described in section 3.
4ed46869
KH
56
57 2. SJIS (or Shift-JIS or MS-Kanji-Code)
58
59 A coding system to encode character sets: ASCII, JISX0201, and
60 JISX0208. Widely used for PC's in Japan. Details are described in
f4dee582 61 section 4.
4ed46869
KH
62
63 3. BIG5
64
65 A coding system to encode character sets: ASCII and Big5. Widely
66 used by Chinese (mainly in Taiwan and Hong Kong). Details are
f4dee582
RS
67 described in section 4. In this file, when we write "BIG5"
68 (all uppercase), we mean the coding system, and when we write
69 "Big5" (capitalized), we mean the character set.
4ed46869 70
27901516
KH
71 4. Raw text
72
4608c386
KH
73 A coding system for a text containing random 8-bit code. Emacs does
74 no code conversion on such a text except for end-of-line format.
27901516
KH
75
76 5. Other
4ed46869 77
f4dee582 78 If a user wants to read/write a text encoded in a coding system not
4ed46869
KH
79 listed above, he can supply a decoder and an encoder for it in CCL
80 (Code Conversion Language) programs. Emacs executes the CCL program
81 while reading/writing.
82
d46c5b12
KH
83 Emacs represents a coding system by a Lisp symbol that has a property
84 `coding-system'. But, before actually using the coding system, the
4ed46869 85 information about it is set in a structure of type `struct
f4dee582 86 coding_system' for rapid processing. See section 6 for more details.
4ed46869
KH
87
88*/
89
90/*** GENERAL NOTES on END-OF-LINE FORMAT ***
91
92 How end-of-line of a text is encoded depends on a system. For
93 instance, Unix's format is just one byte of `line-feed' code,
f4dee582 94 whereas DOS's format is two-byte sequence of `carriage-return' and
d46c5b12
KH
95 `line-feed' codes. MacOS's format is usually one byte of
96 `carriage-return'.
4ed46869 97
f4dee582
RS
98 Since text characters encoding and end-of-line encoding are
99 independent, any coding system described above can take
4ed46869 100 any format of end-of-line. So, Emacs has information of format of
f4dee582 101 end-of-line in each coding-system. See section 6 for more details.
4ed46869
KH
102
103*/
104
105/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
106
107 These functions check if a text between SRC and SRC_END is encoded
108 in the coding system category XXX. Each returns an integer value in
109 which appropriate flag bits for the category XXX is set. The flag
110 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
111 template of these functions. */
112#if 0
113int
0ef69138 114detect_coding_emacs_mule (src, src_end)
4ed46869
KH
115 unsigned char *src, *src_end;
116{
117 ...
118}
119#endif
120
121/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
122
123 These functions decode SRC_BYTES length text at SOURCE encoded in
0ef69138 124 CODING to Emacs' internal format (emacs-mule). The resulting text
d46c5b12
KH
125 goes to a place pointed to by DESTINATION, the length of which
126 should not exceed DST_BYTES. These functions set the information of
127 original and decoded texts in the members produced, produced_char,
128 consumed, and consumed_char of the structure *CODING.
129
130 The return value is an integer (CODING_FINISH_XXX) indicating how
131 the decoding finished.
132
133 DST_BYTES zero means that source area and destination area are
134 overlapped, which means that we can produce decoded text until it
135 reaches the head of the not-yet-decoded source text.
136
137 Below is a template of these functions. */
4ed46869 138#if 0
d46c5b12 139decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
140 struct coding_system *coding;
141 unsigned char *source, *destination;
142 int src_bytes, dst_bytes;
4ed46869
KH
143{
144 ...
145}
146#endif
147
148/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
149
0ef69138
KH
150 These functions encode SRC_BYTES length text at SOURCE of Emacs'
151 internal format (emacs-mule) to CODING. The resulting text goes to
f4dee582 152 a place pointed to by DESTINATION, the length of which should not
d46c5b12
KH
153 exceed DST_BYTES. These functions set the information of
154 original and encoded texts in the members produced, produced_char,
155 consumed, and consumed_char of the structure *CODING.
156
157 The return value is an integer (CODING_FINISH_XXX) indicating how
158 the encoding finished.
159
160 DST_BYTES zero means that source area and destination area are
161 overlapped, which means that we can produce encoded text until it
162 reaches the head of the not-yet-encoded source text.
163
164 Below is a template of these functions. */
4ed46869 165#if 0
d46c5b12 166encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
167 struct coding_system *coding;
168 unsigned char *source, *destination;
169 int src_bytes, dst_bytes;
4ed46869
KH
170{
171 ...
172}
173#endif
174
175/*** COMMONLY USED MACROS ***/
176
177/* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
178 THREE_MORE_BYTES safely get one, two, and three bytes from the
179 source text respectively. If there are not enough bytes in the
180 source, they jump to `label_end_of_loop'. The caller should set
181 variables `src' and `src_end' to appropriate areas in advance. */
182
/* Fetch the next source byte into C1, or jump to `label_end_of_loop'
   without touching C1 or `src' when no byte is left.  */
#define ONE_MORE_BYTE(c1)                       \
  do {                                          \
    if (src >= src_end)                         \
      goto label_end_of_loop;                   \
    c1 = *src++;                                \
  } while (0)

/* Fetch the next two source bytes into C1 and C2 (in that order), or
   jump to `label_end_of_loop' without consuming anything when fewer
   than two bytes are left.  */
#define TWO_MORE_BYTES(c1, c2)                  \
  do {                                          \
    if (src + 1 >= src_end)                     \
      goto label_end_of_loop;                   \
    c1 = *src++;                                \
    c2 = *src++;                                \
  } while (0)

/* Fetch the next three source bytes into C1, C2 and C3 (in that
   order), or jump to `label_end_of_loop' without consuming anything
   when fewer than three bytes are left.  */
#define THREE_MORE_BYTES(c1, c2, c3)            \
  do {                                          \
    if (src + 2 >= src_end)                     \
      goto label_end_of_loop;                   \
    c1 = *src++;                                \
    c2 = *src++;                                \
    c3 = *src++;                                \
  } while (0)
206
207/* The following three macros DECODE_CHARACTER_ASCII,
208 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
209 the multi-byte form of a character of each class at the place
210 pointed by `dst'. The caller should set the variable `dst' to
211 point to an appropriate area and the variable `coding' to point to
212 the coding-system of the currently decoding text in advance. */
213
214/* Decode one ASCII character C. */
215
de79a6a5
KH
/* Store the ASCII character C at `dst' and advance `dst'.

   While composing (COMPOSING_P (coding->composing)), C is written as
   the 2-byte component form 0xA0, C | 0x80, coding->composed_chars is
   incremented, and coding->fake_multibyte is set when C | 0x80 falls
   below 0xA0.  Otherwise C is written as one byte with its MSB
   cleared and coding->produced_char is incremented.

   C is evaluated exactly once, so an argument with side effects
   (e.g. `*p++') is safe.  */
#define DECODE_CHARACTER_ASCII(c)                                       \
  do {                                                                  \
    int dca_code_;                                                      \
    if (COMPOSING_P (coding->composing))                                \
      {                                                                 \
        dca_code_ = (c) | 0x80;                                         \
        *dst++ = 0xA0;                                                  \
        *dst++ = dca_code_;                                             \
        coding->composed_chars++;                                       \
        if (dca_code_ < 0xA0)                                           \
          coding->fake_multibyte = 1;                                   \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* If ASCII charset is invoked to GR,                           \
           we must reset MSB now.  */                                   \
        dca_code_ = (c) & 0x7F;                                         \
        *dst++ = dca_code_;                                             \
        coding->produced_char++;                                        \
      }                                                                 \
  } while (0)
233
f4dee582 234/* Decode one DIMENSION1 character whose charset is CHARSET and whose
4ed46869
KH
235 position-code is C. */
236
/* Store at `dst' the multi-byte form of the one-dimensional character
   whose charset is CHARSET and whose position-code is C, advancing
   `dst'.

   The base leading-code is written first (with 0x20 added while
   composing, in which case coding->composed_chars is incremented
   instead of coding->produced_char), then the extended leading-code
   if CHARSET has a nonzero one, then C | 0x80.
   coding->fake_multibyte is set when C | 0x80 falls below 0xA0.

   CHARSET and C are each evaluated exactly once.  */
#define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
  do {                                                                  \
    int dcd1_charset_ = (charset);                                      \
    int dcd1_position_;                                                 \
    unsigned char dcd1_leading_                                         \
      = CHARSET_LEADING_CODE_BASE (dcd1_charset_);                      \
    if (COMPOSING_P (coding->composing))                                \
      {                                                                 \
        *dst++ = dcd1_leading_ + 0x20;                                  \
        coding->composed_chars++;                                       \
      }                                                                 \
    else                                                                \
      {                                                                 \
        *dst++ = dcd1_leading_;                                         \
        coding->produced_char++;                                        \
      }                                                                 \
    /* A zero extended leading-code means CHARSET has none.  */         \
    dcd1_leading_ = CHARSET_LEADING_CODE_EXT (dcd1_charset_);           \
    if (dcd1_leading_ != 0)                                             \
      *dst++ = dcd1_leading_;                                           \
    dcd1_position_ = (c) | 0x80;                                        \
    *dst++ = dcd1_position_;                                            \
    if (dcd1_position_ < 0xA0)                                          \
      coding->fake_multibyte = 1;                                       \
  } while (0)
256
f4dee582 257/* Decode one DIMENSION2 character whose charset is CHARSET and whose
4ed46869
KH
258 position-codes are C1 and C2. */
259
/* Store at `dst' the multi-byte form of the two-dimensional character
   whose charset is CHARSET and whose position-codes are C1 and C2.
   Everything up to and including C1 is produced by
   DECODE_CHARACTER_DIMENSION1; C2 | 0x80 is then appended, and
   coding->fake_multibyte is set when that byte falls below 0xA0.

   C2 is evaluated exactly once, after C1 has been processed.  */
#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)                    \
  do {                                                                  \
    int dcd2_position_;                                                 \
    DECODE_CHARACTER_DIMENSION1 (charset, c1);                          \
    dcd2_position_ = (c2) | 0x80;                                       \
    *dst++ = dcd2_position_;                                            \
    if (dcd2_position_ < 0xA0)                                          \
      coding->fake_multibyte = 1;                                       \
  } while (0)
267
268\f
269/*** 1. Preamble ***/
270
271#include <stdio.h>
272
273#ifdef emacs
274
275#include <config.h>
276#include "lisp.h"
277#include "buffer.h"
278#include "charset.h"
279#include "ccl.h"
280#include "coding.h"
281#include "window.h"
282
283#else /* not emacs */
284
285#include "mulelib.h"
286
287#endif /* not emacs */
288
289Lisp_Object Qcoding_system, Qeol_type;
290Lisp_Object Qbuffer_file_coding_system;
291Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
27901516 292Lisp_Object Qno_conversion, Qundecided;
bb0115a2 293Lisp_Object Qcoding_system_history;
70c22245 294Lisp_Object Qsafe_charsets;
1397dc18 295Lisp_Object Qvalid_codes;
4ed46869
KH
296
297extern Lisp_Object Qinsert_file_contents, Qwrite_region;
298Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
299Lisp_Object Qstart_process, Qopen_network_stream;
300Lisp_Object Qtarget_idx;
301
d46c5b12
KH
302Lisp_Object Vselect_safe_coding_system_function;
303
7722baf9
EZ
304/* Mnemonic string for each format of end-of-line. */
305Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
306/* Mnemonic string to indicate format of end-of-line is not yet
4ed46869 307 decided. */
7722baf9 308Lisp_Object eol_mnemonic_undecided;
4ed46869 309
9ce27fde
KH
310/* Format of end-of-line decided by system. This is CODING_EOL_LF on
311 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
312int system_eol_type;
313
4ed46869
KH
314#ifdef emacs
315
4608c386
KH
316Lisp_Object Vcoding_system_list, Vcoding_system_alist;
317
318Lisp_Object Qcoding_system_p, Qcoding_system_error;
4ed46869 319
d46c5b12
KH
320/* Coding system emacs-mule and raw-text are for converting only
321 end-of-line format. */
322Lisp_Object Qemacs_mule, Qraw_text;
9ce27fde 323
4ed46869
KH
324/* Coding-systems are handed between Emacs Lisp programs and C internal
325 routines by the following three variables. */
326/* Coding-system for reading files and receiving data from process. */
327Lisp_Object Vcoding_system_for_read;
328/* Coding-system for writing files and sending data to process. */
329Lisp_Object Vcoding_system_for_write;
330/* Coding-system actually used in the latest I/O. */
331Lisp_Object Vlast_coding_system_used;
332
c4825358 333/* A vector of length 256 which contains information about special
94487c4e 334 Latin codes (especially for dealing with Microsoft codes). */
3f003981 335Lisp_Object Vlatin_extra_code_table;
c4825358 336
9ce27fde
KH
337/* Flag to inhibit code conversion of end-of-line format. */
338int inhibit_eol_conversion;
339
ed29121d
EZ
340/* Flag to make buffer-file-coding-system inherit from process-coding. */
341int inherit_process_coding_system;
342
c4825358 343/* Coding system to be used to encode text for terminal display. */
4ed46869
KH
344struct coding_system terminal_coding;
345
c4825358
KH
346/* Coding system to be used to encode text for terminal display when
347 terminal coding system is nil. */
348struct coding_system safe_terminal_coding;
349
350/* Coding system of what is sent from terminal keyboard. */
4ed46869
KH
351struct coding_system keyboard_coding;
352
6bc51348
KH
353/* Default coding system to be used to write a file. */
354struct coding_system default_buffer_file_coding;
355
02ba4723
KH
356Lisp_Object Vfile_coding_system_alist;
357Lisp_Object Vprocess_coding_system_alist;
358Lisp_Object Vnetwork_coding_system_alist;
4ed46869
KH
359
360#endif /* emacs */
361
d46c5b12 362Lisp_Object Qcoding_category, Qcoding_category_index;
4ed46869
KH
363
364/* List of symbols `coding-category-xxx' ordered by priority. */
365Lisp_Object Vcoding_category_list;
366
d46c5b12
KH
367/* Table of coding categories (Lisp symbols). */
368Lisp_Object Vcoding_category_table;
4ed46869
KH
369
370/* Table of names of symbol for each coding-category. */
371char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
0ef69138 372 "coding-category-emacs-mule",
4ed46869
KH
373 "coding-category-sjis",
374 "coding-category-iso-7",
d46c5b12 375 "coding-category-iso-7-tight",
4ed46869
KH
376 "coding-category-iso-8-1",
377 "coding-category-iso-8-2",
7717c392
KH
378 "coding-category-iso-7-else",
379 "coding-category-iso-8-else",
89fa8b36 380 "coding-category-ccl",
4ed46869 381 "coding-category-big5",
27901516 382 "coding-category-raw-text",
89fa8b36 383 "coding-category-binary"
4ed46869
KH
384};
385
66cfb530 386/* Table of pointers to coding systems corresponding to each coding
d46c5b12
KH
387 categories. */
388struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
389
66cfb530
KH
390/* Table of coding category masks. Nth element is a mask for a coding
391 category whose priority is Nth. */
392static
393int coding_priorities[CODING_CATEGORY_IDX_MAX];
394
f967223b
KH
395/* Flag to tell if we look up translation table on character code
396 conversion. */
84fbb8a0 397Lisp_Object Venable_character_translation;
f967223b
KH
398/* Standard translation table to look up on decoding (reading). */
399Lisp_Object Vstandard_translation_table_for_decode;
400/* Standard translation table to look up on encoding (writing). */
401Lisp_Object Vstandard_translation_table_for_encode;
84fbb8a0 402
f967223b
KH
403Lisp_Object Qtranslation_table;
404Lisp_Object Qtranslation_table_id;
405Lisp_Object Qtranslation_table_for_decode;
406Lisp_Object Qtranslation_table_for_encode;
4ed46869
KH
407
408/* Alist of charsets vs revision number. */
409Lisp_Object Vcharset_revision_alist;
410
02ba4723
KH
411/* Default coding systems used for process I/O. */
412Lisp_Object Vdefault_process_coding_system;
413
b843d1ae
KH
414/* Global flag to tell that we can't call post-read-conversion and
415 pre-write-conversion functions. Usually the value is zero, but it
416 is set to 1 temporarily while such functions are running. This is
417 to avoid infinite recursive call. */
418static int inhibit_pre_post_conversion;
419
4ed46869 420\f
0ef69138 421/*** 2. Emacs internal format (emacs-mule) handlers ***/
4ed46869
KH
422
423/* Emacs' internal format for encoding multiple character sets is a
f4dee582
RS
424 kind of multi-byte encoding, i.e. characters are encoded by
425 variable-length sequences of one-byte codes. ASCII characters
426 and control characters (e.g. `tab', `newline') are represented by
427 one-byte sequences which are their ASCII codes, in the range 0x00
428 through 0x7F. The other characters are represented by a sequence
429 of `base leading-code', optional `extended leading-code', and one
430 or two `position-code's. The length of the sequence is determined
431 by the base leading-code. Leading-code takes the range 0x80
432 through 0x9F, whereas extended leading-code and position-code take
433 the range 0xA0 through 0xFF. See `charset.h' for more details
434 about leading-code and position-code.
435
436 There's one exception to this rule. Special leading-code
4ed46869
KH
437 `leading-code-composition' denotes that the following several
438 characters should be composed into one character. Leading-codes of
439 components (except for ASCII) are added 0x20. An ASCII character
440 component is represented by a 2-byte sequence of `0xA0' and
f4dee582
RS
441 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
442 details of composite character. Hence, we can summarize the code
4ed46869
KH
443 range as follows:
444
445 --- CODE RANGE of Emacs' internal format ---
446 (character set) (range)
447 ASCII 0x00 .. 0x7F
448 ELSE (1st byte) 0x80 .. 0x9F
449 (rest bytes) 0xA0 .. 0xFF
450 ---------------------------------------------
451
452 */
453
454enum emacs_code_class_type emacs_code_class[256];
455
/* Consume the byte at *SRC and continue with the next statement only
   if that byte is at least 0xA0.  A smaller byte makes the enclosing
   function return 0.  If SRC has already reached SRC_END, nothing is
   consumed and control jumps to `label_end_of_switch'.  */
#define CHECK_CODE_RANGE_A0_FF                  \
  do {                                          \
    if (src < src_end)                          \
      {                                         \
        if (*src++ < 0xA0)                      \
          return 0;                             \
      }                                         \
    else                                        \
      goto label_end_of_switch;                 \
  } while (0)
465
466/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
467 Check if a text is encoded in Emacs' internal format. If it is,
d46c5b12 468 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
4ed46869
KH
469
470int
0ef69138 471detect_coding_emacs_mule (src, src_end)
4ed46869
KH
472 unsigned char *src, *src_end;
473{
474 unsigned char c;
475 int composing = 0;
476
477 while (src < src_end)
478 {
479 c = *src++;
480
481 if (composing)
482 {
483 if (c < 0xA0)
484 composing = 0;
485 else
486 c -= 0x20;
487 }
488
489 switch (emacs_code_class[c])
490 {
491 case EMACS_ascii_code:
492 case EMACS_linefeed_code:
493 break;
494
495 case EMACS_control_code:
496 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
497 return 0;
498 break;
499
500 case EMACS_invalid_code:
501 return 0;
502
503 case EMACS_leading_code_composition: /* c == 0x80 */
504 if (composing)
505 CHECK_CODE_RANGE_A0_FF;
506 else
507 composing = 1;
508 break;
509
510 case EMACS_leading_code_4:
511 CHECK_CODE_RANGE_A0_FF;
512 /* fall down to check it two more times ... */
513
514 case EMACS_leading_code_3:
515 CHECK_CODE_RANGE_A0_FF;
516 /* fall down to check it one more time ... */
517
518 case EMACS_leading_code_2:
519 CHECK_CODE_RANGE_A0_FF;
520 break;
521
522 default:
523 label_end_of_switch:
524 break;
525 }
526 }
0ef69138 527 return CODING_CATEGORY_MASK_EMACS_MULE;
4ed46869
KH
528}
529
530\f
531/*** 3. ISO2022 handlers ***/
532
533/* The following note describes the coding system ISO2022 briefly.
39787efd
KH
534 Since the intention of this note is to help understand the
535 functions in this file, some parts are NOT ACCURATE or OVERLY
536 SIMPLIFIED. For thorough understanding, please refer to the
4ed46869
KH
537 original document of ISO2022.
538
539 ISO2022 provides many mechanisms to encode several character sets
39787efd
KH
540 in 7-bit and 8-bit environments. For 7-bit environments, all text
541 is encoded using bytes less than 128. This may make the encoded
542 text a little bit longer, but the text passes more easily through
543 several gateways, some of which strip off MSB (Most Significant Bit).
544
545 There are two kinds of character sets: control character set and
4ed46869
KH
546 graphic character set. The former contains control characters such
547 as `newline' and `escape' to provide control functions (control
39787efd
KH
548 functions are also provided by escape sequences). The latter
549 contains graphic characters such as 'A' and '-'. Emacs recognizes
4ed46869
KH
550 two control character sets and many graphic character sets.
551
552 Graphic character sets are classified into one of the following
39787efd
KH
553 four classes, according to the number of bytes (DIMENSION) and
554 number of characters in one dimension (CHARS) of the set:
555 - DIMENSION1_CHARS94
556 - DIMENSION1_CHARS96
557 - DIMENSION2_CHARS94
558 - DIMENSION2_CHARS96
559
560 In addition, each character set is assigned an identification tag,
561 unique for each set, called "final character" (denoted as <F>
562 hereafter). The <F> of each character set is decided by ECMA(*)
563 when it is registered in ISO. The code range of <F> is 0x30..0x7F
564 (0x30..0x3F are for private use only).
4ed46869
KH
565
566 Note (*): ECMA = European Computer Manufacturers Association
567
568 Here are examples of graphic character set [NAME(<F>)]:
569 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
570 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
571 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
572 o DIMENSION2_CHARS96 -- none for the moment
573
39787efd 574 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4ed46869
KH
575 C0 [0x00..0x1F] -- control character plane 0
576 GL [0x20..0x7F] -- graphic character plane 0
577 C1 [0x80..0x9F] -- control character plane 1
578 GR [0xA0..0xFF] -- graphic character plane 1
579
580 A control character set is directly designated and invoked to C0 or
39787efd
KH
581 C1 by an escape sequence. The most common case is that:
582 - ISO646's control character set is designated/invoked to C0, and
583 - ISO6429's control character set is designated/invoked to C1,
584 and usually these designations/invocations are omitted in encoded
585 text. In a 7-bit environment, only C0 can be used, and a control
586 character for C1 is encoded by an appropriate escape sequence to
587 fit into the environment. All control characters for C1 are
588 defined to have corresponding escape sequences.
4ed46869
KH
589
590 A graphic character set is at first designated to one of four
591 graphic registers (G0 through G3), then these graphic registers are
592 invoked to GL or GR. These designations and invocations can be
593 done independently. The most common case is that G0 is invoked to
39787efd
KH
594 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
595 these invocations and designations are omitted in encoded text.
596 In a 7-bit environment, only GL can be used.
4ed46869 597
39787efd
KH
598 When a graphic character set of CHARS94 is invoked to GL, codes
599 0x20 and 0x7F of the GL area work as control characters SPACE and
600 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
601 be used.
4ed46869
KH
602
603 There are two ways of invocation: locking-shift and single-shift.
604 With locking-shift, the invocation lasts until the next different
39787efd
KH
605 invocation, whereas with single-shift, the invocation affects the
606 following character only and doesn't affect the locking-shift
607 state. Invocations are done by the following control characters or
608 escape sequences:
4ed46869
KH
609
610 ----------------------------------------------------------------------
39787efd 611 abbrev function cntrl escape seq description
4ed46869 612 ----------------------------------------------------------------------
39787efd
KH
613 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
614 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
615 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
616 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
617 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
618 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
619 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
620 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
621 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4ed46869 622 ----------------------------------------------------------------------
39787efd
KH
623 (*) These are not used by any known coding system.
624
625 Control characters for these functions are defined by macros
626 ISO_CODE_XXX in `coding.h'.
4ed46869 627
39787efd 628 Designations are done by the following escape sequences:
4ed46869
KH
629 ----------------------------------------------------------------------
630 escape sequence description
631 ----------------------------------------------------------------------
632 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
633 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
634 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
635 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
636 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
637 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
638 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
639 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
640 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
641 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
642 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
643 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
644 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
645 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
646 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
647 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
648 ----------------------------------------------------------------------
649
650 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
39787efd 651 of dimension 1, chars 94, and final character <F>, etc...
4ed46869
KH
652
653 Note (*): Although these designations are not allowed in ISO2022,
654 Emacs accepts them on decoding, and produces them on encoding
39787efd 655 CHARS96 character sets in a coding system which is characterized as
4ed46869
KH
656 7-bit environment, non-locking-shift, and non-single-shift.
657
658 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
39787efd 659 '(' can be omitted. We refer to this as "short-form" hereafter.
4ed46869
KH
660
661 Now you may notice that there are a lot of ways for encoding the
39787efd
KH
662 same multilingual text in ISO2022. Actually, there exist many
663 coding systems such as Compound Text (used in X11's inter client
664 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
665 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4ed46869
KH
666 localized platforms), and all of these are variants of ISO2022.
667
668 In addition to the above, Emacs handles two more kinds of escape
669 sequences: ISO6429's direction specification and Emacs' private
670 sequence for specifying character composition.
671
39787efd 672 ISO6429's direction specification takes the following form:
4ed46869
KH
673 o CSI ']' -- end of the current direction
674 o CSI '0' ']' -- end of the current direction
675 o CSI '1' ']' -- start of left-to-right text
676 o CSI '2' ']' -- start of right-to-left text
677 The control character CSI (0x9B: control sequence introducer) is
39787efd
KH
678 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
679
680 Character composition specification takes the following form:
4ed46869
KH
681 o ESC '0' -- start character composition
682 o ESC '1' -- end character composition
39787efd
KH
683 Since these are not standard escape sequences of any ISO standard,
684 their use with these meanings is restricted to Emacs only. */
4ed46869
KH
685
686enum iso_code_class_type iso_code_class[256];
687
f024b6aa
RS
/* Nonzero if a coding system is registered at index IDX of
   coding_system_table and that coding system either lists CHARSET in
   its safe_charsets or has a requested designation for CHARSET.
   Yields 0 when the table entry is null.  */
#define CHARSET_OK(idx, charset)                                        \
  (coding_system_table[idx]                                             \
   ? (coding_system_table[idx]->safe_charsets[charset]                  \
      || (CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION                      \
          != CODING_SPEC_ISO_REQUESTED_DESIGNATION                      \
               (coding_system_table[idx], charset)))                    \
   : 0)
d46c5b12
KH
694
/* Nonzero if a coding system is registered at index IDX of
   coding_system_table and its initial designation for graphic
   register 1 is non-negative.  The table entry is checked for null
   first, as CHARSET_OK does, so an unregistered category yields 0
   instead of going through a null pointer.  */
#define SHIFT_OUT_OK(idx) \
  (coding_system_table[idx] \
   && (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0))
697
4ed46869
KH
698/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
699 Check if a text is encoded in ISO2022. If it is, returns an
700 integer in which appropriate flag bits any of:
701 CODING_CATEGORY_MASK_ISO_7
d46c5b12 702 CODING_CATEGORY_MASK_ISO_7_TIGHT
4ed46869
KH
703 CODING_CATEGORY_MASK_ISO_8_1
704 CODING_CATEGORY_MASK_ISO_8_2
7717c392
KH
705 CODING_CATEGORY_MASK_ISO_7_ELSE
706 CODING_CATEGORY_MASK_ISO_8_ELSE
4ed46869
KH
707 are set. If a code which should never appear in ISO2022 is found,
708 returns 0. */
709
710int
711detect_coding_iso2022 (src, src_end)
712 unsigned char *src, *src_end;
713{
d46c5b12
KH
714 int mask = CODING_CATEGORY_MASK_ISO;
715 int mask_found = 0;
f46869e4 716 int reg[4], shift_out = 0, single_shifting = 0;
d46c5b12 717 int c, c1, i, charset;
3f003981 718
d46c5b12 719 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
3f003981 720 while (mask && src < src_end)
4ed46869
KH
721 {
722 c = *src++;
723 switch (c)
724 {
725 case ISO_CODE_ESC:
f46869e4 726 single_shifting = 0;
e0e989f6 727 if (src >= src_end)
4ed46869
KH
728 break;
729 c = *src++;
d46c5b12 730 if (c >= '(' && c <= '/')
4ed46869 731 {
bf9cdd4e
KH
732 /* Designation sequence for a charset of dimension 1. */
733 if (src >= src_end)
734 break;
d46c5b12
KH
735 c1 = *src++;
736 if (c1 < ' ' || c1 >= 0x80
737 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
738 /* Invalid designation sequence. Just ignore. */
739 break;
740 reg[(c - '(') % 4] = charset;
bf9cdd4e
KH
741 }
742 else if (c == '$')
743 {
744 /* Designation sequence for a charset of dimension 2. */
745 if (src >= src_end)
746 break;
747 c = *src++;
748 if (c >= '@' && c <= 'B')
749 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
d46c5b12 750 reg[0] = charset = iso_charset_table[1][0][c];
bf9cdd4e 751 else if (c >= '(' && c <= '/')
bcf26d6a 752 {
bf9cdd4e
KH
753 if (src >= src_end)
754 break;
d46c5b12
KH
755 c1 = *src++;
756 if (c1 < ' ' || c1 >= 0x80
757 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
758 /* Invalid designation sequence. Just ignore. */
759 break;
760 reg[(c - '(') % 4] = charset;
bcf26d6a 761 }
bf9cdd4e 762 else
d46c5b12
KH
763 /* Invalid designation sequence. Just ignore. */
764 break;
765 }
ae9ff118 766 else if (c == 'N' || c == 'O')
d46c5b12 767 {
ae9ff118
KH
768 /* ESC <Fe> for SS2 or SS3. */
769 mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
d46c5b12 770 break;
4ed46869 771 }
bf9cdd4e 772 else if (c == '0' || c == '1' || c == '2')
ae9ff118 773 /* ESC <Fp> for start/end composition. Just ignore. */
d46c5b12 774 break;
bf9cdd4e 775 else
d46c5b12
KH
776 /* Invalid escape sequence. Just ignore. */
777 break;
778
779 /* We found a valid designation sequence for CHARSET. */
780 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
781 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset))
782 mask_found |= CODING_CATEGORY_MASK_ISO_7;
783 else
784 mask &= ~CODING_CATEGORY_MASK_ISO_7;
785 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset))
786 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
787 else
788 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
ae9ff118
KH
789 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset))
790 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
791 else
d46c5b12 792 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
ae9ff118
KH
793 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset))
794 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
795 else
d46c5b12 796 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
4ed46869
KH
797 break;
798
4ed46869 799 case ISO_CODE_SO:
f46869e4 800 single_shifting = 0;
d46c5b12
KH
801 if (shift_out == 0
802 && (reg[1] >= 0
803 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
804 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
805 {
806 /* Locking shift out. */
807 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
808 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
809 }
e0e989f6
KH
810 break;
811
d46c5b12 812 case ISO_CODE_SI:
f46869e4 813 single_shifting = 0;
d46c5b12
KH
814 if (shift_out == 1)
815 {
816 /* Locking shift in. */
817 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
818 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
819 }
820 break;
821
4ed46869 822 case ISO_CODE_CSI:
f46869e4 823 single_shifting = 0;
4ed46869
KH
824 case ISO_CODE_SS2:
825 case ISO_CODE_SS3:
3f003981
KH
826 {
827 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
828
70c22245
KH
829 if (c != ISO_CODE_CSI)
830 {
d46c5b12
KH
831 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
832 & CODING_FLAG_ISO_SINGLE_SHIFT)
70c22245 833 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
834 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
835 & CODING_FLAG_ISO_SINGLE_SHIFT)
70c22245 836 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
f46869e4 837 single_shifting = 1;
70c22245 838 }
3f003981
KH
839 if (VECTORP (Vlatin_extra_code_table)
840 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
841 {
d46c5b12
KH
842 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
843 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981 844 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
845 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
846 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981
KH
847 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
848 }
849 mask &= newmask;
d46c5b12 850 mask_found |= newmask;
3f003981
KH
851 }
852 break;
4ed46869
KH
853
854 default:
855 if (c < 0x80)
f46869e4
KH
856 {
857 single_shifting = 0;
858 break;
859 }
4ed46869 860 else if (c < 0xA0)
c4825358 861 {
f46869e4 862 single_shifting = 0;
3f003981
KH
863 if (VECTORP (Vlatin_extra_code_table)
864 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
c4825358 865 {
3f003981
KH
866 int newmask = 0;
867
d46c5b12
KH
868 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
869 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981 870 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
871 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
872 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981
KH
873 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
874 mask &= newmask;
d46c5b12 875 mask_found |= newmask;
c4825358 876 }
3f003981
KH
877 else
878 return 0;
c4825358 879 }
4ed46869
KH
880 else
881 {
7717c392 882 unsigned char *src_begin = src;
4ed46869 883
d46c5b12 884 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
7717c392 885 | CODING_CATEGORY_MASK_ISO_7_ELSE);
d46c5b12 886 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
f46869e4
KH
887 /* Check the length of succeeding codes of the range
888 0xA0..0FF. If the byte length is odd, we exclude
889 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
890 when we are not single shifting. */
891 if (!single_shifting)
892 {
893 while (src < src_end && *src >= 0xA0)
894 src++;
895 if ((src - src_begin - 1) & 1 && src < src_end)
896 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
897 else
898 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
899 }
4ed46869
KH
900 }
901 break;
902 }
903 }
904
d46c5b12 905 return (mask & mask_found);
4ed46869
KH
906}
907
/* Decode one character whose charset is CHARSET and whose 1st
   position code is C1, writing the result at DST.  When CHARSET has
   dimension 2, the 2nd position code is fetched from SRC into C2; if
   that byte is neither of class ISO_0x20_or_0x7F nor of class
   ISO_graphic_plane_0 it is pushed back and C1 is treated as ASCII.
   A negative CHARSET means the text is ill formed, and C1 is simply
   emitted as is.  The macro relies on `coding', `src', `dst', `c2' and
   `translation_table' being in scope at the expansion site.  */

#define DECODE_ISO_CHARACTER(charset, c1)                               \
  do {                                                                  \
    int c_alt, charset_alt = (charset);                                 \
                                                                        \
    if (COMPOSING_HEAD_P (coding->composing))                           \
      {                                                                 \
        *dst++ = LEADING_CODE_COMPOSITION;                              \
        if (COMPOSING_WITH_RULE_P (coding->composing))                  \
          {                                                             \
            /* 0xFF tells that composition rules are embedded.  */      \
            *dst++ = 0xFF;                                              \
          }                                                             \
        coding->composing += 2;                                         \
      }                                                                 \
    if (charset_alt >= 0)                                               \
      {                                                                 \
        if (CHARSET_DIMENSION (charset_alt) == 2)                       \
          {                                                             \
            ONE_MORE_BYTE (c2);                                         \
            if (! (iso_code_class[(c2) & 0x7F] == ISO_0x20_or_0x7F      \
                   || (iso_code_class[(c2) & 0x7F]                      \
                       == ISO_graphic_plane_0)))                        \
              {                                                         \
                /* Not a valid 2nd byte: un-read it.  */                \
                src--;                                                  \
                charset_alt = CHARSET_ASCII;                            \
              }                                                         \
          }                                                             \
        if (!NILP (translation_table)                                   \
            && ((c_alt = translate_char (translation_table,             \
                                         -1, charset_alt, c1, c2)) >= 0)) \
          {                                                             \
            SPLIT_CHAR (c_alt, charset_alt, c1, c2);                    \
          }                                                             \
      }                                                                 \
    if (charset_alt < 0 || charset_alt == CHARSET_ASCII)                \
      {                                                                 \
        DECODE_CHARACTER_ASCII (c1);                                    \
      }                                                                 \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
      {                                                                 \
        DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                  \
      }                                                                 \
    else                                                                \
      {                                                                 \
        DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);              \
      }                                                                 \
    if (COMPOSING_WITH_RULE_P (coding->composing))                      \
      {                                                                 \
        /* The next thing to read is a composition rule.  */            \
        coding->composing = COMPOSING_WITH_RULE_RULE;                   \
      }                                                                 \
  } while (0)
952
/* Set designation state into CODING: designate to graphic register
   REG the charset identified by DIMENSION, CHARS and FINAL_CHAR.

   Control jumps to `label_invalid_code' (which must exist at the
   expansion site) when FINAL_CHAR is outside '0'..127, when no
   acceptable charset is found, or when ASCII is designated to
   register 0 right after an invalid designation to register 0 (so
   that the sequence is written back as is).  In the failure case the
   register is remembered in
   coding->spec.iso2022.last_invalid_designation_register.

   The macro arguments are parenthesized wherever they are used in an
   expression so that callers may pass arbitrary expressions.  */

#define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
  do {                                                                     \
    int charset;                                                           \
                                                                           \
    if ((final_char) < '0' || (final_char) >= 128)                         \
      goto label_invalid_code;                                             \
    charset = ISO_CHARSET_TABLE (make_number (dimension),                  \
                                 make_number (chars),                      \
                                 make_number (final_char));                \
    if (charset >= 0                                                       \
        && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == (reg) \
            || coding->safe_charsets[charset]))                            \
      {                                                                    \
        if (coding->spec.iso2022.last_invalid_designation_register == 0    \
            && (reg) == 0                                                  \
            && charset == CHARSET_ASCII)                                   \
          {                                                                \
            /* We should insert this designation sequence as is so         \
               that it is surely written back to a file.  */               \
            coding->spec.iso2022.last_invalid_designation_register = -1;   \
            goto label_invalid_code;                                       \
          }                                                                \
        coding->spec.iso2022.last_invalid_designation_register = -1;       \
        if ((coding->mode & CODING_MODE_DIRECTION)                         \
            && CHARSET_REVERSE_CHARSET (charset) >= 0)                     \
          charset = CHARSET_REVERSE_CHARSET (charset);                     \
        CODING_SPEC_ISO_DESIGNATION (coding, (reg)) = charset;             \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        coding->spec.iso2022.last_invalid_designation_register = (reg);    \
        goto label_invalid_code;                                           \
      }                                                                    \
  } while (0)
988
88993dfd
KH
989/* Return 0 if there's a valid composing sequence starting at SRC and
990 ending before SRC_END, else return -1. */
d46c5b12 991
84fbb8a0
KH
992int
993check_composing_code (coding, src, src_end)
d46c5b12
KH
994 struct coding_system *coding;
995 unsigned char *src, *src_end;
996{
d46c5b12
KH
997 int charset, c, c1, dim;
998
999 while (src < src_end)
1000 {
88993dfd
KH
1001 c = *src++;
1002 if (c >= 0x20)
1003 continue;
1004 if (c != ISO_CODE_ESC || src >= src_end)
1005 return -1;
1006 c = *src++;
1007 if (c == '1') /* end of compsition */
1008 return 0;
1009 if (src + 2 >= src_end
1010 || !coding->flags & CODING_FLAG_ISO_DESIGNATION)
1011 return -1;
1012
1013 dim = (c == '$');
1014 if (dim == 1)
1015 c = (*src >= '@' && *src <= 'B') ? '(' : *src++;
1016 if (c >= '(' && c <= '/')
d46c5b12 1017 {
88993dfd
KH
1018 c1 = *src++;
1019 if ((c1 < ' ' || c1 >= 0x80)
1020 || (charset = iso_charset_table[dim][c >= ','][c1]) < 0
1021 || ! coding->safe_charsets[charset]
1022 || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
1023 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1024 return -1;
d46c5b12 1025 }
88993dfd
KH
1026 else
1027 return -1;
d46c5b12 1028 }
88993dfd
KH
1029
1030 /* We have not found the sequence "ESC 1". */
1031 return -1;
d46c5b12
KH
1032}
1033
4ed46869
KH
1034/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1035
/* Decode the SRC_BYTES bytes of ISO2022 text at SOURCE into
   DESTINATION, using and updating the designation, invocation,
   composition and direction state kept in CODING.

   DST_BYTES is the size of DESTINATION.  When it is zero the main
   loop is bounded by `dst < src - 6' instead of by the destination
   size (presumably the in-place decoding case -- TODO confirm
   against the callers).

   On return coding->consumed and coding->consumed_char hold the
   number of source bytes processed and coding->produced the number
   of bytes written.  coding->produced_char is reset to 0 on entry
   and incremented by the code below as characters are emitted.
   coding->fake_multibyte is set to 1 when bytes are copied through
   undecoded (invalid sequences, control codes >= 0x80, final flush).

   The return value is one of CODING_FINISH_NORMAL,
   CODING_FINISH_INSUFFICIENT_SRC, CODING_FINISH_INSUFFICIENT_DST and
   CODING_FINISH_INCONSISTENT_EOL.  */
1036int
d46c5b12 1037decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
1038 struct coding_system *coding;
1039 unsigned char *source, *destination;
1040 int src_bytes, dst_bytes;
4ed46869
KH
1041{
1042 unsigned char *src = source;
1043 unsigned char *src_end = source + src_bytes;
1044 unsigned char *dst = destination;
1045 unsigned char *dst_end = destination + dst_bytes;
1046 /* Since the maximum bytes produced by each loop is 7, we subtract 6
1047 from DST_END to assure that overflow checking is necessary only
1048 at the head of loop. */
1049 unsigned char *adjusted_dst_end = dst_end - 6;
1050 int charset;
1051 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1052 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1053 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
84fbb8a0 1054 Lisp_Object translation_table
f967223b 1055 = coding->translation_table_for_decode;
d46c5b12 1056 int result = CODING_FINISH_NORMAL;
bdd9fb48 1057
/* Fall back to the standard decoding translation table when
   translation is enabled and CODING does not supply its own. */
84fbb8a0 1058 if (!NILP (Venable_character_translation) && NILP (translation_table))
f967223b 1059 translation_table = Vstandard_translation_table_for_decode;
4ed46869 1060
d46c5b12 1061 coding->produced_char = 0;
fb88bf2d 1062 coding->fake_multibyte = 0;
d46c5b12
KH
1063 while (src < src_end && (dst_bytes
1064 ? (dst < adjusted_dst_end)
1065 : (dst < src - 6)))
4ed46869
KH
1066 {
1067 /* SRC_BASE remembers the start position in source in each loop.
1068 The loop will be exited when there's not enough source text
1069 to analyze long escape sequence or 2-byte code (within macros
1070 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
1071 to SRC_BASE before exiting. */
1072 unsigned char *src_base = src;
bdd9fb48 1073 int c1 = *src++, c2;
4ed46869
KH
1074
1075 switch (iso_code_class [c1])
1076 {
1077 case ISO_0x20_or_0x7F:
1078 if (!coding->composing
1079 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
1080 {
1081 /* This is SPACE or DEL. */
1082 *dst++ = c1;
d46c5b12 1083 coding->produced_char++;
4ed46869
KH
1084 break;
1085 }
1086 /* This is a graphic character, we fall down ... */
1087
1088 case ISO_graphic_plane_0:
1089 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1090 {
1091 /* This is a composition rule. */
1092 *dst++ = c1 | 0x80;
1093 coding->composing = COMPOSING_WITH_RULE_TAIL;
1094 }
1095 else
1096 DECODE_ISO_CHARACTER (charset0, c1);
1097 break;
1098
1099 case ISO_0xA0_or_0xFF:
d46c5b12
KH
1100 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1101 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
fb88bf2d 1102 goto label_invalid_code;
4ed46869
KH
1103 /* This is a graphic character, we fall down ... */
1104
1105 case ISO_graphic_plane_1:
d46c5b12 1106 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
fb88bf2d 1107 goto label_invalid_code;
d46c5b12
KH
1108 else
1109 DECODE_ISO_CHARACTER (charset1, c1);
4ed46869
KH
1110 break;
1111
1112 case ISO_control_code:
1113 /* All ISO2022 control characters in this class have the
1114 same representation in Emacs internal format. */
/* A bare LF while expecting CR or CRLF line ends is reported as an
   inconsistent EOL when the caller asked for that check. */
d46c5b12
KH
1115 if (c1 == '\n'
1116 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1117 && (coding->eol_type == CODING_EOL_CR
1118 || coding->eol_type == CODING_EOL_CRLF))
1119 {
1120 result = CODING_FINISH_INCONSISTENT_EOL;
1121 goto label_end_of_loop_2;
1122 }
4ed46869 1123 *dst++ = c1;
d46c5b12 1124 coding->produced_char++;
174a4cbe
KH
1125 if (c1 >= 0x80)
1126 coding->fake_multibyte = 1;
4ed46869
KH
1127 break;
1128
1129 case ISO_carriage_return:
/* CR becomes '\n' for CR line ends; for CRLF line ends it does so
   only when followed by LF; otherwise it is emitted unchanged. */
1130 if (coding->eol_type == CODING_EOL_CR)
d46c5b12 1131 *dst++ = '\n';
4ed46869
KH
1132 else if (coding->eol_type == CODING_EOL_CRLF)
1133 {
1134 ONE_MORE_BYTE (c1);
1135 if (c1 == ISO_CODE_LF)
1136 *dst++ = '\n';
1137 else
1138 {
d46c5b12
KH
1139 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1140 {
1141 result = CODING_FINISH_INCONSISTENT_EOL;
1142 goto label_end_of_loop_2;
1143 }
4ed46869 1144 src--;
d46c5b12 1145 *dst++ = '\r';
4ed46869
KH
1146 }
1147 }
1148 else
d46c5b12
KH
1149 *dst++ = c1;
1150 coding->produced_char++;
4ed46869
KH
1151 break;
1152
1153 case ISO_shift_out:
d46c5b12
KH
1154 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1155 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1156 goto label_invalid_code;
4ed46869
KH
1157 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1158 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1159 break;
1160
1161 case ISO_shift_in:
d46c5b12
KH
1162 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1163 goto label_invalid_code;
4ed46869
KH
1164 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1165 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1166 break;
1167
1168 case ISO_single_shift_2_7:
1169 case ISO_single_shift_2:
d46c5b12
KH
1170 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1171 goto label_invalid_code;
4ed46869
KH
1172 /* SS2 is handled as an escape sequence of ESC 'N' */
1173 c1 = 'N';
1174 goto label_escape_sequence;
1175
1176 case ISO_single_shift_3:
d46c5b12
KH
1177 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1178 goto label_invalid_code;
4ed46869
KH
1179 /* SS3 is handled as an escape sequence of ESC 'O' */
1180 c1 = 'O';
1181 goto label_escape_sequence;
1182
1183 case ISO_control_sequence_introducer:
1184 /* CSI is handled as an escape sequence of ESC '[' ... */
1185 c1 = '[';
1186 goto label_escape_sequence;
1187
1188 case ISO_escape:
1189 ONE_MORE_BYTE (c1);
1190 label_escape_sequence:
1191 /* Escape sequences handled by Emacs are invocation,
1192 designation, direction specification, and character
1193 composition specification. */
1194 switch (c1)
1195 {
1196 case '&': /* revision of following character set */
/* Expect "ESC & <@..~> ESC ...": the revision byte itself is
   discarded and the following escape sequence is processed. */
1197 ONE_MORE_BYTE (c1);
1198 if (!(c1 >= '@' && c1 <= '~'))
d46c5b12 1199 goto label_invalid_code;
4ed46869
KH
1200 ONE_MORE_BYTE (c1);
1201 if (c1 != ISO_CODE_ESC)
d46c5b12 1202 goto label_invalid_code;
4ed46869
KH
1203 ONE_MORE_BYTE (c1);
1204 goto label_escape_sequence;
1205
1206 case '$': /* designation of 2-byte character set */
d46c5b12
KH
1207 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1208 goto label_invalid_code;
4ed46869
KH
1209 ONE_MORE_BYTE (c1);
1210 if (c1 >= '@' && c1 <= 'B')
1211 { /* designation of JISX0208.1978, GB2312.1980,
88993dfd 1212 or JISX0208.1980 */
4ed46869
KH
1213 DECODE_DESIGNATION (0, 2, 94, c1);
1214 }
1215 else if (c1 >= 0x28 && c1 <= 0x2B)
1216 { /* designation of DIMENSION2_CHARS94 character set */
1217 ONE_MORE_BYTE (c2);
1218 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1219 }
1220 else if (c1 >= 0x2C && c1 <= 0x2F)
1221 { /* designation of DIMENSION2_CHARS96 character set */
1222 ONE_MORE_BYTE (c2);
1223 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1224 }
1225 else
d46c5b12 1226 goto label_invalid_code;
4ed46869
KH
1227 break;
1228
1229 case 'n': /* invocation of locking-shift-2 */
d46c5b12
KH
1230 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1231 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1232 goto label_invalid_code;
4ed46869 1233 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
e0e989f6 1234 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
1235 break;
1236
1237 case 'o': /* invocation of locking-shift-3 */
d46c5b12
KH
1238 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1239 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1240 goto label_invalid_code;
4ed46869 1241 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
e0e989f6 1242 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
1243 break;
1244
1245 case 'N': /* invocation of single-shift-2 */
d46c5b12
KH
1246 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1247 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1248 goto label_invalid_code;
4ed46869
KH
1249 ONE_MORE_BYTE (c1);
1250 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1251 DECODE_ISO_CHARACTER (charset, c1);
1252 break;
1253
1254 case 'O': /* invocation of single-shift-3 */
d46c5b12
KH
1255 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1256 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1257 goto label_invalid_code;
4ed46869
KH
1258 ONE_MORE_BYTE (c1);
1259 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1260 DECODE_ISO_CHARACTER (charset, c1);
1261 break;
1262
d46c5b12
KH
1263 case '0': case '2': /* start composing */
1264 /* Before processing composing, we must be sure that all
1265 characters being composed are supported by CODING.
88993dfd
KH
1266 If not, we must give up composing. */
1267 if (check_composing_code (coding, src, src_end) == 0)
1268 {
1269 /* We are looking at a valid composition sequence. */
1270 coding->composing = (c1 == '0'
1271 ? COMPOSING_NO_RULE_HEAD
1272 : COMPOSING_WITH_RULE_HEAD);
1273 coding->composed_chars = 0;
1274 }
1275 else
1276 {
/* Not a valid composition: emit the two bytes "ESC <c1>" as is. */
1277 *dst++ = ISO_CODE_ESC;
1278 *dst++ = c1;
1279 coding->produced_char += 2;
1280 }
4ed46869
KH
1281 break;
1282
1283 case '1': /* end composing */
88993dfd
KH
1284 if (!coding->composing)
1285 {
/* "ESC 1" outside of a composition is emitted as is. */
1286 *dst++ = ISO_CODE_ESC;
1287 *dst++ = c1;
1288 coding->produced_char += 2;
1289 break;
1290 }
1291
de79a6a5
KH
1292 if (coding->composed_chars > 0)
1293 {
1294 if (coding->composed_chars == 1)
1295 {
1296 unsigned char *this_char_start = dst;
1297 int this_bytes;
1298
1299 /* Only one character is in the composing
1300 sequence. Make it a normal character. */
/* Scan back to the composition leading code, then move the bytes of
   the single character down over it. */
1301 while (*--this_char_start != LEADING_CODE_COMPOSITION);
1302 dst = (this_char_start
1303 + (coding->composing == COMPOSING_NO_RULE_TAIL
1304 ? 1 : 2));
1305 *dst -= 0x20;
1306 if (*dst == 0x80)
1307 *++dst &= 0x7F;
1308 this_bytes = BYTES_BY_CHAR_HEAD (*dst);
1309 while (this_bytes--) *this_char_start++ = *dst++;
1310 dst = this_char_start;
1311 }
1312 coding->produced_char++;
1313 }
4ed46869 1314 coding->composing = COMPOSING_NO;
4ed46869
KH
1315 break;
1316
1317 case '[': /* specification of direction */
d46c5b12
KH
1318 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1319 goto label_invalid_code;
4ed46869 1320 /* For the moment, nested direction is not supported.
d46c5b12
KH
1321 So, `coding->mode & CODING_MODE_DIRECTION' zero means
1322 left-to-right, and nonzero means right-to-left. */
4ed46869
KH
1323 ONE_MORE_BYTE (c1);
1324 switch (c1)
1325 {
1326 case ']': /* end of the current direction */
d46c5b12 1327 coding->mode &= ~CODING_MODE_DIRECTION;
/* NOTE(review): there is no `break' here, so control falls through
   into the '0'/'1' case below and one more byte is read -- confirm
   that this fall-through is intended. */
4ed46869
KH
1328
1329 case '0': /* end of the current direction */
1330 case '1': /* start of left-to-right direction */
1331 ONE_MORE_BYTE (c1);
1332 if (c1 == ']')
d46c5b12 1333 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869 1334 else
d46c5b12 1335 goto label_invalid_code;
4ed46869
KH
1336 break;
1337
1338 case '2': /* start of right-to-left direction */
1339 ONE_MORE_BYTE (c1);
1340 if (c1 == ']')
d46c5b12 1341 coding->mode |= CODING_MODE_DIRECTION;
4ed46869 1342 else
d46c5b12 1343 goto label_invalid_code;
4ed46869
KH
1344 break;
1345
1346 default:
d46c5b12 1347 goto label_invalid_code;
4ed46869
KH
1348 }
1349 break;
1350
1351 default:
d46c5b12
KH
1352 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1353 goto label_invalid_code;
4ed46869
KH
1354 if (c1 >= 0x28 && c1 <= 0x2B)
1355 { /* designation of DIMENSION1_CHARS94 character set */
1356 ONE_MORE_BYTE (c2);
1357 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1358 }
1359 else if (c1 >= 0x2C && c1 <= 0x2F)
1360 { /* designation of DIMENSION1_CHARS96 character set */
1361 ONE_MORE_BYTE (c2);
1362 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1363 }
1364 else
1365 {
d46c5b12 1366 goto label_invalid_code;
4ed46869
KH
1367 }
1368 }
1369 /* We must update these variables now. */
1370 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1371 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1372 break;
1373
d46c5b12 1374 label_invalid_code:
/* Copy the bytes read in this iteration (SRC_BASE up to SRC) to DST
   unchanged. */
d46c5b12
KH
1375 while (src_base < src)
1376 *dst++ = *src_base++;
fb88bf2d 1377 coding->fake_multibyte = 1;
4ed46869
KH
1378 }
1379 continue;
1380
/* Reached only by `goto': rewind SRC to the start of the current
   iteration and leave the loop. */
1381 label_end_of_loop:
d46c5b12
KH
1382 result = CODING_FINISH_INSUFFICIENT_SRC;
1383 label_end_of_loop_2:
4ed46869
KH
1384 src = src_base;
1385 break;
1386 }
1387
fb88bf2d 1388 if (src < src_end)
4ed46869 1389 {
fb88bf2d
KH
1390 if (result == CODING_FINISH_NORMAL)
1391 result = CODING_FINISH_INSUFFICIENT_DST;
1392 else if (result != CODING_FINISH_INCONSISTENT_EOL
1393 && coding->mode & CODING_MODE_LAST_BLOCK)
1394 {
1395 /* This is the last block of the text to be decoded. We had
1396 better just flush out all remaining codes in the text
1397 although they are not valid characters. */
1398 src_bytes = src_end - src;
1399 if (dst_bytes && (dst_end - dst < src_bytes))
1400 src_bytes = dst_end - dst;
1401 bcopy (src, dst, src_bytes);
1402 dst += src_bytes;
1403 src += src_bytes;
1404 coding->fake_multibyte = 1;
1405 }
4ed46869 1406 }
fb88bf2d 1407
/* Report how much source was consumed and how much output was
   produced. */
d46c5b12
KH
1408 coding->consumed = coding->consumed_char = src - source;
1409 coding->produced = dst - destination;
1410 return result;
4ed46869
KH
1411}
1412
f4dee582 1413/* ISO2022 encoding stuff. */
4ed46869
KH
1414
1415/*
f4dee582 1416 It is not enough to say just "ISO2022" on encoding, we have to
d46c5b12 1417 specify more details. In Emacs, each coding system of ISO2022
4ed46869
KH
1418 variant has the following specifications:
1419 1. Initial designation to G0 thru G3.
1420 2. Allows short-form designation?
1421 3. ASCII should be designated to G0 before control characters?
1422 4. ASCII should be designated to G0 at end of line?
1423 5. 7-bit environment or 8-bit environment?
1424 6. Use locking-shift?
1425 7. Use Single-shift?
1426 And the following two are only for Japanese:
1427 8. Use ASCII in place of JIS0201-1976-Roman?
1428 9. Use JISX0208-1983 in place of JISX0208-1978?
1429 These specifications are encoded in `coding->flags' as flag bits
1430 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
f4dee582 1431 details.
4ed46869
KH
1432*/
1433
/* Write at DST the escape sequence that designates CHARSET to graphic
   register REG, and record the designation in CODING.  When CODING
   holds a revision number below 255 for CHARSET, an "ESC & <rev>"
   sequence is written first.  For a 94-char charset of dimension 2
   whose <final-char> is '@', 'A', or 'B', the intermediate character
   is left out (short form) if REG is 0 and CODING has
   CODING_FLAG_ISO_SHORT_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                           \
  do {                                                                     \
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);           \
    char *intermediate_char_94 = "()*+";                                   \
    char *intermediate_char_96 = ",-./";                                   \
    int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);       \
                                                                           \
    if (revision < 255)                                                    \
      {                                                                    \
        *dst++ = ISO_CODE_ESC;                                             \
        *dst++ = '&';                                                      \
        *dst++ = '@' + revision;                                           \
      }                                                                    \
    *dst++ = ISO_CODE_ESC;                                                 \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      {                                                                    \
        /* Multi-byte charset: "ESC $ [intermediate] final".  */           \
        *dst++ = '$';                                                      \
        if (CHARSET_CHARS (charset) != 94)                                 \
          {                                                                \
            *dst++ = (unsigned char) (intermediate_char_96[reg]);          \
          }                                                                \
        else if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)            \
                 || reg != 0                                               \
                 || final_char < '@' || final_char > 'B')                  \
          {                                                                \
            *dst++ = (unsigned char) (intermediate_char_94[reg]);          \
          }                                                                \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        /* Single-byte charset: "ESC intermediate final".  */              \
        *dst++ = (unsigned char) (CHARSET_CHARS (charset) == 94            \
                                  ? intermediate_char_94[reg]              \
                                  : intermediate_char_96[reg]);            \
      }                                                                    \
    *dst++ = final_char;                                                   \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                   \
  } while (0)
1475
/* The two macros below write at DST the code for an ISO2022
   single-shift function (single-shift-2 and single-shift-3
   respectively): the escape sequence when CODING is restricted to
   seven bits, else the 8-bit control character, in which case
   coding->fake_multibyte is also set.  Both then mark CODING as being
   in the single-shifting state.  */

#define ENCODE_SINGLE_SHIFT_2                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      {                                                         \
        *dst++ = ISO_CODE_SS2;                                  \
        coding->fake_multibyte = 1;                             \
      }                                                         \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'N';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      {                                                         \
        *dst++ = ISO_CODE_SS3;                                  \
        coding->fake_multibyte = 1;                             \
      }                                                         \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'O';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)
1503
/* The four macros below write at DST the code (a control character
   or an escape sequence) for an ISO2022 locking-shift function --
   shift-in, shift-out, locking-shift-2 and locking-shift-3 -- and
   record in CODING which graphic register (0, 1, 2 or 3) is now
   invoked to graphic plane 0.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    *dst = ISO_CODE_SI;                                 \
    dst++;                                              \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
  } while (0)

#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    *dst = ISO_CODE_SO;                                 \
    dst++;                                              \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'n';                                       \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'o';                                       \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
  } while (0)
1531
f4dee582
RS
/* Write at DST the code for a DIMENSION1 character of charset CHARSET
   whose position-code is C1.  If CHARSET is not currently invoked to
   a graphic plane, the needed designation and invocation sequences
   are written first.  The body is a loop: each pass either emits the
   character and leaves, or calls encode_invocation_designation and
   tries again.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* A single-shift was just emitted; it covers this char.  */    \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)          \
                  ? c1 & 0x7F : c1 | 0x80);                             \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        *dst++ = c1 & 0x7F;                                             \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        *dst++ = c1 | 0x80;                                             \
        break;                                                          \
      }                                                                 \
    if ((coding->flags & CODING_FLAG_ISO_SAFE)                          \
        && !coding->safe_charsets[charset])                             \
      {                                                                 \
        /* This character must not be encoded; write one or two         \
           substitution characters in its place.  */                    \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
        if (CHARSET_WIDTH (charset) == 2)                               \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to any graphic plane yet: designate       \
       and/or invoke it, then go round again to write the               \
       character itself.  */                                            \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
1575
f4dee582
RS
/* Write at DST the codes for a DIMENSION2 character of charset
   CHARSET whose position-codes are C1 and C2.  If CHARSET is not
   currently invoked to a graphic plane, the needed designation and
   invocation sequences are written first.  The body is a loop: each
   pass either emits the character and leaves, or calls
   encode_invocation_designation and tries again.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* A single-shift was just emitted; it covers this char.  */    \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            *dst++ = c1 & 0x7F;                                         \
            *dst++ = c2 & 0x7F;                                         \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = c1 | 0x80;                                         \
            *dst++ = c2 | 0x80;                                         \
          }                                                             \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        *dst++ = c1 & 0x7F;                                             \
        *dst++ = c2 & 0x7F;                                             \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        *dst++ = c1 | 0x80;                                             \
        *dst++ = c2 | 0x80;                                             \
        break;                                                          \
      }                                                                 \
    if ((coding->flags & CODING_FLAG_ISO_SAFE)                          \
        && !coding->safe_charsets[charset])                             \
      {                                                                 \
        /* This character must not be encoded; write one or two         \
           substitution characters in its place.  */                    \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
        if (CHARSET_WIDTH (charset) == 2)                               \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to any graphic plane yet: designate       \
       and/or invoke it, then go round again to write the               \
       character itself.  */                                            \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
1618
6f551029
KH
/* Encode at DST the character given by CHARSET, C1 and C2.  The
   character is first mapped through `translation_table' (when that
   is non-nil and yields a translation), which may replace the
   charset and overwrite C1 and C2.  A character of a defined charset
   is written by the DIMENSION1/DIMENSION2 macros, with ASCII and
   JISX0208 optionally replaced by JISX0201-Roman and JISX0208-1978
   according to CODING's flags.  For an undefined charset the raw
   bytes CHARSET, C1 and (if nonzero) C2 are written, masked to seven
   bits when CODING is a 7-bit one.  Finally coding->consumed_char is
   incremented unless a composition is in progress.  */

#define ENCODE_ISO_CHARACTER(charset, c1, c2)                           \
  do {                                                                  \
    int c_alt, charset_alt;                                             \
                                                                        \
    if (NILP (translation_table)                                        \
        || ((c_alt = translate_char (translation_table, -1,             \
                                     charset, c1, c2))                  \
            < 0))                                                       \
      charset_alt = charset;                                            \
    else                                                                \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                          \
                                                                        \
    if (! CHARSET_DEFINED_P (charset_alt))                              \
      {                                                                 \
        if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))             \
          {                                                             \
            *dst++ = charset;                                           \
            *dst++ = c1;                                                \
            if (c2)                                                     \
              *dst++ = c2;                                              \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = charset & 0x7f;                                    \
            *dst++ = c1 & 0x7f;                                         \
            if (c2)                                                     \
              *dst++ = c2 & 0x7f;                                       \
          }                                                             \
      }                                                                 \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
      {                                                                 \
        if (charset == CHARSET_ASCII                                    \
            && (coding->flags & CODING_FLAG_ISO_USE_ROMAN))             \
          charset_alt = charset_latin_jisx0201;                         \
        ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1);              \
      }                                                                 \
    else                                                                \
      {                                                                 \
        if (charset == charset_jisx0208                                 \
            && (coding->flags & CODING_FLAG_ISO_USE_OLDJIS))            \
          charset_alt = charset_jisx0208_1978;                          \
        ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2);          \
      }                                                                 \
                                                                        \
    if (! COMPOSING_P (coding->composing))                              \
      coding->consumed_char++;                                          \
  } while (0)
bdd9fb48 1666
4ed46869
KH
1667/* Produce designation and invocation codes at a place pointed by DST
1668 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1669 Return new DST. */
1670
1671unsigned char *
1672encode_invocation_designation (charset, coding, dst)
1673 int charset;
1674 struct coding_system *coding;
1675 unsigned char *dst;
1676{
1677 int reg; /* graphic register number */
1678
1679 /* At first, check designations. */
1680 for (reg = 0; reg < 4; reg++)
1681 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1682 break;
1683
1684 if (reg >= 4)
1685 {
1686 /* CHARSET is not yet designated to any graphic registers. */
1687 /* At first check the requested designation. */
1688 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1ba9e4ab
KH
1689 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1690 /* Since CHARSET requests no special designation, designate it
1691 to graphic register 0. */
4ed46869
KH
1692 reg = 0;
1693
1694 ENCODE_DESIGNATION (charset, reg, coding);
1695 }
1696
1697 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1698 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1699 {
1700 /* Since the graphic register REG is not invoked to any graphic
1701 planes, invoke it to graphic plane 0. */
1702 switch (reg)
1703 {
1704 case 0: /* graphic register 0 */
1705 ENCODE_SHIFT_IN;
1706 break;
1707
1708 case 1: /* graphic register 1 */
1709 ENCODE_SHIFT_OUT;
1710 break;
1711
1712 case 2: /* graphic register 2 */
1713 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1714 ENCODE_SINGLE_SHIFT_2;
1715 else
1716 ENCODE_LOCKING_SHIFT_2;
1717 break;
1718
1719 case 3: /* graphic register 3 */
1720 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1721 ENCODE_SINGLE_SHIFT_3;
1722 else
1723 ENCODE_LOCKING_SHIFT_3;
1724 break;
1725 }
1726 }
1727 return dst;
1728}
1729
1730/* The following three macros produce codes for indicating composition:
   "ESC 0" starts a composition without rules, "ESC 2" starts one with
   rules, and "ESC 1" ends a composition.  Each writes two bytes at DST. */
1731#define ENCODE_COMPOSITION_NO_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '0'
1732#define ENCODE_COMPOSITION_WITH_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '2'
1733#define ENCODE_COMPOSITION_END *dst++ = ISO_CODE_ESC, *dst++ = '1'
1734
/* The following three macros produce codes for indicating direction
   of text.  Each one expands to a single `do { ... } while (0)'
   statement writing at DST, and uses `coding' and `dst' from the
   expansion site.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER writes "ESC [" when CODING has
   the CODING_FLAG_ISO_SEVEN_BITS bit set, else the 8-bit CSI.  The
   bit is tested with `&', as in the other macros of this file,
   because `coding->flags' carries several flag bits at once.

   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R write the introducer
   followed by "2]" and "0]" respectively.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

#define ENCODE_DIRECTION_R2L                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '2', *dst++ = ']';                         \
  } while (0)

#define ENCODE_DIRECTION_L2R                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '0', *dst++ = ']';                         \
  } while (0)
1750
/* Write at DST the invocation and designation codes that bring the
   graphic planes and registers back to their initial state: shift-in
   if plane 0 does not invoke register 0, then a designation for each
   register that has an initial charset differing from its current
   one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                     \
  do {                                                                      \
    int reg = 0;                                                            \
                                                                            \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
      {                                                                     \
        ENCODE_SHIFT_IN;                                                    \
      }                                                                     \
    while (reg < 4)                                                         \
      {                                                                     \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0          \
            && (CODING_SPEC_ISO_DESIGNATION (coding, reg)                   \
                != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)))      \
          {                                                                 \
            ENCODE_DESIGNATION                                              \
              (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
          }                                                                 \
        reg++;                                                              \
      }                                                                     \
  } while (0)
1765
bdd9fb48 1766/* Produce designation sequences of charsets in the line started from
   SRC to a place pointed by *DSTP, and update DSTP.

   The text from SRC is scanned until SRC_END, the first '\n', or the
   point where a charset has been found for all four graphic
   registers.  The charset of each character is taken directly from
   the text when TABLE is nil; otherwise the character is first
   translated through TABLE and the charset of the translated
   character is used when the translation succeeds.  The first
   charset found for each requested graphic register is remembered,
   and ENCODE_DESIGNATION is then expanded for every remembered
   charset that is not currently designated to its register.  *DSTP
   is written back only when at least one such charset was found.

   If the current block ends before any end-of-line, we may fail to
   find all the necessary designations. */
1771
dfcf069d 1772void
bdd9fb48 1773encode_designation_at_bol (coding, table, src, src_end, dstp)
e0e989f6 1774 struct coding_system *coding;
bdd9fb48 1775 Lisp_Object table;
e0e989f6
KH
1776 unsigned char *src, *src_end, **dstp;
1777{
bdd9fb48
KH
1778 int charset, c, found = 0, reg;
1779 /* Table of charsets to be designated to each graphic register. */
1780 int r[4];
/* ENCODE_DESIGNATION below is a macro; it presumably stores through
   this local DST (TODO confirm against its definition). */
1781 unsigned char *dst = *dstp;
1782
/* -1 means that no charset has been found for the register yet. */
1783 for (reg = 0; reg < 4; reg++)
1784 r[reg] = -1;
1785
1786 while (src < src_end && *src != '\n' && found < 4)
e0e989f6 1787 {
bdd9fb48
KH
1788 int bytes = BYTES_BY_CHAR_HEAD (*src);
1789
1790 if (NILP (table))
1791 charset = CHARSET_AT (src);
1792 else
e0e989f6 1793 {
35cb8686
RS
1794 int c_alt;
1795 unsigned char c1, c2;
bdd9fb48
KH
1796
1797 SPLIT_STRING(src, bytes, charset, c1, c2);
/* A negative value from translate_char leaves CHARSET as it is. */
84fbb8a0 1798 if ((c_alt = translate_char (table, -1, charset, c1, c2)) >= 0)
bdd9fb48 1799 charset = CHAR_CHARSET (c_alt);
e0e989f6 1800 }
bdd9fb48 1801
e0e989f6 1802 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
/* Only the first charset seen for a given register is recorded. */
d46c5b12 1803 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
bdd9fb48
KH
1804 {
1805 found++;
1806 r[reg] = charset;
1807 }
1808
1809 src += bytes;
1810 }
1811
1812 if (found)
1813 {
1814 for (reg = 0; reg < 4; reg++)
1815 if (r[reg] >= 0
1816 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1817 ENCODE_DESIGNATION (r[reg], reg, coding);
1818 *dstp = dst;
e0e989f6 1819 }
e0e989f6
KH
1820}
1821
4ed46869
KH
1822/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".

   Encode SRC_BYTES bytes of text at SOURCE and store the result at
   DESTINATION.  When DST_BYTES is nonzero, the main loop stops once
   DST comes within 19 bytes of DESTINATION + DST_BYTES; when it is
   zero, the loop instead requires DST to stay more than 19 bytes
   behind SRC (NOTE(review): this looks like the case of overlapping
   source and destination -- confirm against the callers).

   The translation table used is CODING->translation_table_for_encode,
   or Vstandard_translation_table_for_encode when that is nil and
   Venable_character_translation is non-nil.

   On return, CODING->consumed, CODING->consumed_char,
   CODING->produced and CODING->produced_char are set; produced_char
   is set to the same byte count as produced.  The value is
   CODING_FINISH_NORMAL, CODING_FINISH_INSUFFICIENT_SRC when the loop
   was left through label_end_of_loop, or
   CODING_FINISH_INSUFFICIENT_DST when the loop stopped normally with
   source text left over. */
1823
1824int
d46c5b12 1825encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
1826 struct coding_system *coding;
1827 unsigned char *source, *destination;
1828 int src_bytes, dst_bytes;
4ed46869
KH
1829{
1830 unsigned char *src = source;
1831 unsigned char *src_end = source + src_bytes;
1832 unsigned char *dst = destination;
1833 unsigned char *dst_end = destination + dst_bytes;
e0e989f6 1834 /* Since the maximum bytes produced by each loop is 20, we subtract 19
4ed46869
KH
1835 from DST_END to assure overflow checking is necessary only at the
1836 head of loop. */
e0e989f6 1837 unsigned char *adjusted_dst_end = dst_end - 19;
84fbb8a0 1838 Lisp_Object translation_table
f967223b 1839 = coding->translation_table_for_encode;
d46c5b12 1840 int result = CODING_FINISH_NORMAL;
bdd9fb48 1841
84fbb8a0 1842 if (!NILP (Venable_character_translation) && NILP (translation_table))
f967223b 1843 translation_table = Vstandard_translation_table_for_encode;
4ed46869 1844
d46c5b12 1845 coding->consumed_char = 0;
fb88bf2d 1846 coding->fake_multibyte = 0;
d46c5b12
KH
1847 while (src < src_end && (dst_bytes
1848 ? (dst < adjusted_dst_end)
1849 : (dst < src - 19)))
4ed46869
KH
1850 {
1851 /* SRC_BASE remembers the start position in source in each loop.
1852 The loop will be exited when there's not enough source text
1853 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1854 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1855 reset to SRC_BASE before exiting. */
1856 unsigned char *src_base = src;
bdd9fb48 1857 int charset, c1, c2, c3, c4;
4ed46869 1858
e0e989f6
KH
1859 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1860 && CODING_SPEC_ISO_BOL (coding))
1861 {
bdd9fb48 1862 /* We have to produce designation sequences if any now. */
84fbb8a0 1863 encode_designation_at_bol (coding, translation_table,
bdd9fb48 1864 src, src_end, &dst);
e0e989f6
KH
1865 CODING_SPEC_ISO_BOL (coding) = 0;
1866 }
1867
1868 c1 = *src++;
4ed46869 1869 /* If we are seeing a component of a composite character, we are
d46c5b12
KH
1870 seeing a leading-code encoded irregularly for composition, or
1871 a composition rule if composing with rule. We must set C1 to
1872 a normal leading-code or an ASCII code. If we are not seeing
1873 a composite character, we must reset composition,
1874 designation, and invocation states. */
4ed46869
KH
1875 if (COMPOSING_P (coding->composing))
1876 {
1877 if (c1 < 0xA0)
1878 {
1879 /* We are not in a composite character any longer. */
1880 coding->composing = COMPOSING_NO;
d46c5b12 1881 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1882 ENCODE_COMPOSITION_END;
1883 }
1884 else
1885 {
1886 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1887 {
/* C1 is a composition rule: it is stored with the high bit cleared,
   and the next byte is expected to start a component again. */
1888 *dst++ = c1 & 0x7F;
1889 coding->composing = COMPOSING_WITH_RULE_HEAD;
1890 continue;
1891 }
1892 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1893 coding->composing = COMPOSING_WITH_RULE_RULE;
1894 if (c1 == 0xA0)
1895 {
1896 /* This is an ASCII component. */
1897 ONE_MORE_BYTE (c1);
1898 c1 &= 0x7F;
1899 }
1900 else
1901 /* This is a leading-code of non ASCII component. */
1902 c1 -= 0x20;
1903 }
1904 }
1905
1906 /* Now encode one character. C1 is a control character, an
1907 ASCII character, or a leading-code of multi-byte character. */
1908 switch (emacs_code_class[c1])
1909 {
1910 case EMACS_ascii_code:
8dbb769e 1911 c2 = 0;
bdd9fb48 1912 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
4ed46869
KH
1913 break;
1914
1915 case EMACS_control_code:
1916 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1917 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869 1918 *dst++ = c1;
d46c5b12 1919 coding->consumed_char++;
4ed46869
KH
1920 break;
1921
1922 case EMACS_carriage_return_code:
d46c5b12 1923 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
4ed46869
KH
1924 {
1925 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1926 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869 1927 *dst++ = c1;
d46c5b12 1928 coding->consumed_char++;
4ed46869
KH
1929 break;
1930 }
1931 /* fall down to treat '\r' as '\n' ... */
1932
1933 case EMACS_linefeed_code:
1934 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
e0e989f6
KH
1935 ENCODE_RESET_PLANE_AND_REGISTER;
1936 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1937 bcopy (coding->spec.iso2022.initial_designation,
1938 coding->spec.iso2022.current_designation,
1939 sizeof coding->spec.iso2022.initial_designation);
/* Produce the end-of-line sequence selected by CODING->eol_type:
   LF for CODING_EOL_LF and CODING_EOL_UNDECIDED, CR LF for
   CODING_EOL_CRLF, and CR for anything else. */
4ed46869 1940 if (coding->eol_type == CODING_EOL_LF
0ef69138 1941 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
1942 *dst++ = ISO_CODE_LF;
1943 else if (coding->eol_type == CODING_EOL_CRLF)
1944 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1945 else
1946 *dst++ = ISO_CODE_CR;
e0e989f6 1947 CODING_SPEC_ISO_BOL (coding) = 1;
d46c5b12 1948 coding->consumed_char++;
4ed46869
KH
1949 break;
1950
1951 case EMACS_leading_code_2:
1952 ONE_MORE_BYTE (c2);
8dbb769e 1953 c3 = 0;
19a8d9e0
KH
1954 if (c2 < 0xA0)
1955 {
1956 /* invalid sequence */
/* Only C1 is produced; SRC is moved back so that the byte read into
   C2 is examined again by the next iteration. */
1957 *dst++ = c1;
38cf95df
RS
1958 src--;
1959 coding->consumed_char++;
19a8d9e0
KH
1960 }
1961 else
1962 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
1963 break;
1964
1965 case EMACS_leading_code_3:
1966 TWO_MORE_BYTES (c2, c3);
8dbb769e 1967 c4 = 0;
19a8d9e0
KH
1968 if (c2 < 0xA0 || c3 < 0xA0)
1969 {
1970 /* invalid sequence */
1971 *dst++ = c1;
38cf95df
RS
1972 src -= 2;
1973 coding->consumed_char++;
19a8d9e0
KH
1974 }
1975 else if (c1 < LEADING_CODE_PRIVATE_11)
bdd9fb48 1976 ENCODE_ISO_CHARACTER (c1, c2, c3);
4ed46869 1977 else
/* C1 is not below LEADING_CODE_PRIVATE_11 here, so C2 is passed in
   the charset position and C3 is the only position code. */
bdd9fb48 1978 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
4ed46869
KH
1979 break;
1980
1981 case EMACS_leading_code_4:
1982 THREE_MORE_BYTES (c2, c3, c4);
19a8d9e0
KH
1983 if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1984 {
1985 /* invalid sequence */
1986 *dst++ = c1;
38cf95df
RS
1987 src -= 3;
1988 coding->consumed_char++;
19a8d9e0
KH
1989 }
1990 else
1991 ENCODE_ISO_CHARACTER (c2, c3, c4);
4ed46869
KH
1992 break;
1993
1994 case EMACS_leading_code_composition:
19a8d9e0
KH
1995 ONE_MORE_BYTE (c2);
1996 if (c2 < 0xA0)
1997 {
1998 /* invalid sequence */
1999 *dst++ = c1;
38cf95df
RS
2000 src--;
2001 coding->consumed_char++;
19a8d9e0
KH
2002 }
2003 else if (c2 == 0xFF)
4ed46869 2004 {
/* A second byte of 0xFF starts a composition with rule; that byte
   stays consumed. */
d46c5b12 2005 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
2006 coding->composing = COMPOSING_WITH_RULE_HEAD;
2007 ENCODE_COMPOSITION_WITH_RULE_START;
d46c5b12 2008 coding->consumed_char++;
4ed46869
KH
2009 }
2010 else
2011 {
d46c5b12 2012 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
2013 /* Rewind one byte because it is a character code of
2014 composition elements. */
2015 src--;
2016 coding->composing = COMPOSING_NO_RULE_HEAD;
2017 ENCODE_COMPOSITION_NO_RULE_START;
d46c5b12 2018 coding->consumed_char++;
4ed46869
KH
2019 }
2020 break;
2021
2022 case EMACS_invalid_code:
3efbce95
KH
2023 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2024 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869 2025 *dst++ = c1;
d46c5b12 2026 coding->consumed_char++;
4ed46869
KH
2027 break;
2028 }
2029 continue;
/* No statement in this function jumps to the label below; it is
   presumably the target used by ONE_MORE_BYTE, TWO_MORE_BYTES and
   THREE_MORE_BYTES when the source runs out (TODO confirm against
   their definitions). */
2030 label_end_of_loop:
d46c5b12
KH
2031 result = CODING_FINISH_INSUFFICIENT_SRC;
2032 src = src_base;
4ed46869
KH
2033 break;
2034 }
2035
/* The loop ended without a shortage of source although source text
   remains, so it was stopped by the destination check. */
49cb52b4
KH
2036 if (src < src_end && result == CODING_FINISH_NORMAL)
2037 result = CODING_FINISH_INSUFFICIENT_DST;
2038
2039 /* If this is the last block of the text to be encoded, we must
2040 reset graphic planes and registers to the initial state, and
2041 flush out the carryover if any. */
2042 if (coding->mode & CODING_MODE_LAST_BLOCK)
84fbb8a0
KH
2043 {
2044 ENCODE_RESET_PLANE_AND_REGISTER;
2045 if (COMPOSING_P (coding->composing))
2046 ENCODE_COMPOSITION_END;
88993dfd
KH
2047 if (result == CODING_FINISH_INSUFFICIENT_SRC)
2048 {
/* The incomplete trailing sequence is copied without conversion, as
   far as DST_END allows. */
2049 while (src < src_end && dst < dst_end)
2050 *dst++ = *src++;
2051 }
84fbb8a0 2052 }
d46c5b12
KH
2053 coding->consumed = src - source;
2054 coding->produced = coding->produced_char = dst - destination;
2055 return result;
4ed46869
KH
2056}
2057
2058\f
2059/*** 4. SJIS and BIG5 handlers ***/
2060
f4dee582 2061/* Although SJIS and BIG5 are not ISO coding systems, they are used
4ed46869
KH
2062 quite widely. So, for the moment, Emacs supports them in the bare
2063 C code. But, in the future, they may be supported only by CCL. */
2064
2065/* SJIS is a coding system encoding three character sets: ASCII, right
2066 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2067 as is. A character of charset katakana-jisx0201 is encoded by
2068 "position-code + 0x80". A character of charset japanese-jisx0208
2069 is encoded in two bytes, but its two position-codes are divided and
2070 shifted so that they fit in the ranges below.
2071
2072 --- CODE RANGE of SJIS ---
2073 (character set) (range)
2074 ASCII 0x00 .. 0x7F
2075 KATAKANA-JISX0201 0xA0 .. 0xDF
c28a9453 2076 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
d14d03ac 2077 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4ed46869
KH
2078 -------------------------------
2079
2080*/
2081
2082/* BIG5 is a coding system encoding two character sets: ASCII and
2083 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2084 character set and is encoded in two bytes.
2085
2086 --- CODE RANGE of BIG5 ---
2087 (character set) (range)
2088 ASCII 0x00 .. 0x7F
2089 Big5 (1st byte) 0xA1 .. 0xFE
2090 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2091 --------------------------
2092
2093 Since the number of characters in Big5 is larger than maximum
2094 characters in Emacs' charset (96x96), it can't be handled as one
2095 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2096 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2097 contains frequently used characters and the latter contains less
2098 frequently used characters. */
2099
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   DECODE_BIG5 reads B1 and B2 and stores into CHARSET, C1 and C2;
   ENCODE_BIG5 reads CHARSET, C1 and C2 and stores into B1 and B2.
   The arguments that are stored into must be lvalues.  Every
   argument is parenthesized in the expansion, so the input arguments
   may be arbitrary expressions, but some of them are evaluated more
   than once and therefore must not have side effects.  Both macros
   declare a local variable named `temp'.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

#define DECODE_BIG5(b1, b2, charset, c1, c2) \
  do { \
    unsigned int temp \
      = (((b1) - 0xA1) * BIG5_SAME_ROW \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62)); \
    if ((b1) < 0xC9) \
      (charset) = charset_big5_1; \
    else \
      { \
        (charset) = charset_big5_2; \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW; \
      } \
    (c1) = temp / (0xFF - 0xA1) + 0x21; \
    (c2) = temp % (0xFF - 0xA1) + 0x21; \
  } while (0)

#define ENCODE_BIG5(charset, c1, c2, b1, b2) \
  do { \
    unsigned int temp = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21); \
    if ((charset) == charset_big5_2) \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1); \
    (b1) = temp / BIG5_SAME_ROW + 0xA1; \
    (b2) = temp % BIG5_SAME_ROW; \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62; \
  } while (0)
2132
a5d301df
KH
/* Decode one character given by CHARSET and the position codes C1 and
   C2.  The macro refers to the variable TRANSLATION_TABLE of the
   function in which it is expanded.  If that table is non-nil and
   translate_char returns a non-negative character for CHARSET, C1
   and C2, SPLIT_CHAR is applied to the translated character with
   CHARSET_ALT, C1 and C2 (presumably overwriting them -- confirm
   against the definition of SPLIT_CHAR); otherwise CHARSET_ALT stays
   equal to CHARSET.  The character is then handed to
   DECODE_CHARACTER_ASCII when CHARSET_ALT is CHARSET_ASCII or
   negative, to DECODE_CHARACTER_DIMENSION1 when CHARSET_DIMENSION
   (CHARSET_ALT) is 1, and to DECODE_CHARACTER_DIMENSION2 otherwise.  */
2133#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
2134 do { \
2135 int c_alt, charset_alt = (charset); \
84fbb8a0
KH
2136 if (!NILP (translation_table) \
2137 && ((c_alt = translate_char (translation_table, \
2138 -1, (charset), c1, c2)) >= 0)) \
55ab7be3 2139 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
a5d301df
KH
2140 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
2141 DECODE_CHARACTER_ASCII (c1); \
2142 else if (CHARSET_DIMENSION (charset_alt) == 1) \
2143 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
2144 else \
2145 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
2146 } while (0)
2147
84fbb8a0
KH
/* Encode one character given by CHARSET and the position codes C1 and
   C2, and store the result at DST.  The macro refers to the variables
   TRANSLATION_TABLE, SJIS_P, DST and CODING of the function in which
   it is expanded; C1 and C2 must be lvalues because they are
   modified.

   If TRANSLATION_TABLE is non-nil and translate_char returns a
   non-negative character, that character is split into CHARSET_ALT,
   C1 and C2 and encoded instead.  What is stored:

     charset_ascii                           C1
     dimension 1, SJIS_P, katakana-jisx0201  C1
     dimension 1, SJIS_P, latin-jisx0201     C1 & 0x7F
     dimension 1, anything else              CHARSET_ALT C1
     dimension 2, SJIS_P, jisx0208 (or the
       1978 version)                         2 bytes from ENCODE_SJIS
     dimension 2, !SJIS_P, big5-1 or big5-2  2 bytes from ENCODE_BIG5
     dimension 2, anything else              CHARSET_ALT C1 C2

   so at most 3 bytes are produced.  CODING->fake_multibyte is set in
   every case except the ASCII, JISX0201 and Big5 ones, and
   CODING->consumed_char is incremented once.

   The definition deliberately has no semicolon after `while (0)':
   the semicolon written at the point of use completes the statement,
   which keeps the macro usable as the body of an `if' that has an
   `else'.  */
#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
  do { \
    int c_alt, charset_alt; \
    if (!NILP (translation_table) \
        && ((c_alt = translate_char (translation_table, -1, \
                                     charset, c1, c2)) \
            >= 0)) \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
    else \
      charset_alt = charset; \
    if (charset_alt == charset_ascii) \
      *dst++ = c1; \
    else if (CHARSET_DIMENSION (charset_alt) == 1) \
      { \
        if (sjis_p && charset_alt == charset_katakana_jisx0201) \
          *dst++ = c1; \
        else if (sjis_p && charset_alt == charset_latin_jisx0201) \
          *dst++ = c1 & 0x7F; \
        else \
          { \
            *dst++ = charset_alt, *dst++ = c1; \
            coding->fake_multibyte = 1; \
          } \
      } \
    else \
      { \
        c1 &= 0x7F, c2 &= 0x7F; \
        if (sjis_p && (charset_alt == charset_jisx0208 \
                       || charset_alt == charset_jisx0208_1978)) \
          { \
            unsigned char s1, s2; \
 \
            ENCODE_SJIS (c1, c2, s1, s2); \
            *dst++ = s1, *dst++ = s2; \
            coding->fake_multibyte = 1; \
          } \
        else if (!sjis_p \
                 && (charset_alt == charset_big5_1 \
                     || charset_alt == charset_big5_2)) \
          { \
            unsigned char b1, b2; \
 \
            ENCODE_BIG5 (charset_alt, c1, c2, b1, b2); \
            *dst++ = b1, *dst++ = b2; \
          } \
        else \
          { \
            *dst++ = charset_alt, *dst++ = c1, *dst++ = c2; \
            coding->fake_multibyte = 1; \
          } \
      } \
    coding->consumed_char++; \
  } while (0)
2201
4ed46869
KH
2202/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2203 Check if a text is encoded in SJIS. If it is, return
2204 CODING_CATEGORY_MASK_SJIS, else return 0. */
2205
2206int
2207detect_coding_sjis (src, src_end)
2208 unsigned char *src, *src_end;
2209{
2210 unsigned char c;
2211
2212 while (src < src_end)
2213 {
2214 c = *src++;
4ed46869
KH
2215 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2216 {
2217 if (src < src_end && *src++ < 0x40)
2218 return 0;
2219 }
2220 }
2221 return CODING_CATEGORY_MASK_SJIS;
2222}
2223
2224/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2225 Check if a text is encoded in BIG5. If it is, return
2226 CODING_CATEGORY_MASK_BIG5, else return 0. */
2227
2228int
2229detect_coding_big5 (src, src_end)
2230 unsigned char *src, *src_end;
2231{
2232 unsigned char c;
2233
2234 while (src < src_end)
2235 {
2236 c = *src++;
4ed46869
KH
2237 if (c >= 0xA1)
2238 {
2239 if (src >= src_end)
2240 break;
2241 c = *src++;
2242 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2243 return 0;
2244 }
2245 }
2246 return CODING_CATEGORY_MASK_BIG5;
2247}
2248
2249/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2250 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
2251
2252int
2253decode_coding_sjis_big5 (coding, source, destination,
d46c5b12 2254 src_bytes, dst_bytes, sjis_p)
4ed46869
KH
2255 struct coding_system *coding;
2256 unsigned char *source, *destination;
2257 int src_bytes, dst_bytes;
4ed46869
KH
2258 int sjis_p;
2259{
2260 unsigned char *src = source;
2261 unsigned char *src_end = source + src_bytes;
2262 unsigned char *dst = destination;
2263 unsigned char *dst_end = destination + dst_bytes;
2264 /* Since the maximum bytes produced by each loop is 4, we subtract 3
2265 from DST_END to assure overflow checking is necessary only at the
2266 head of loop. */
2267 unsigned char *adjusted_dst_end = dst_end - 3;
84fbb8a0 2268 Lisp_Object translation_table
f967223b 2269 = coding->translation_table_for_decode;
d46c5b12 2270 int result = CODING_FINISH_NORMAL;
a5d301df 2271
84fbb8a0 2272 if (!NILP (Venable_character_translation) && NILP (translation_table))
f967223b 2273 translation_table = Vstandard_translation_table_for_decode;
4ed46869 2274
d46c5b12 2275 coding->produced_char = 0;
fb88bf2d 2276 coding->fake_multibyte = 0;
d46c5b12
KH
2277 while (src < src_end && (dst_bytes
2278 ? (dst < adjusted_dst_end)
2279 : (dst < src - 3)))
4ed46869
KH
2280 {
2281 /* SRC_BASE remembers the start position in source in each loop.
2282 The loop will be exited when there's not enough source text
2283 to analyze two-byte character (within macro ONE_MORE_BYTE).
2284 In that case, SRC is reset to SRC_BASE before exiting. */
2285 unsigned char *src_base = src;
2286 unsigned char c1 = *src++, c2, c3, c4;
2287
d46c5b12 2288 if (c1 < 0x20)
4ed46869 2289 {
d46c5b12 2290 if (c1 == '\r')
4ed46869 2291 {
d46c5b12
KH
2292 if (coding->eol_type == CODING_EOL_CRLF)
2293 {
2294 ONE_MORE_BYTE (c2);
2295 if (c2 == '\n')
2296 *dst++ = c2;
2297 else if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2298 {
2299 result = CODING_FINISH_INCONSISTENT_EOL;
2300 goto label_end_of_loop_2;
2301 }
2302 else
2303 /* To process C2 again, SRC is subtracted by 1. */
2304 *dst++ = c1, src--;
2305 }
2306 else if (coding->eol_type == CODING_EOL_CR)
2307 *dst++ = '\n';
4ed46869 2308 else
d46c5b12
KH
2309 *dst++ = c1;
2310 }
2311 else if (c1 == '\n'
2312 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2313 && (coding->eol_type == CODING_EOL_CR
2314 || coding->eol_type == CODING_EOL_CRLF))
2315 {
2316 result = CODING_FINISH_INCONSISTENT_EOL;
2317 goto label_end_of_loop_2;
4ed46869
KH
2318 }
2319 else
2320 *dst++ = c1;
d46c5b12 2321 coding->produced_char++;
4ed46869 2322 }
a5d301df 2323 else if (c1 < 0x80)
5e34de15
KH
2324 {
2325 c2 = 0; /* avoid warning */
2326 DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2327 }
54f78171 2328 else
4ed46869 2329 {
4ed46869
KH
2330 if (sjis_p)
2331 {
54f78171 2332 if (c1 < 0xA0 || (c1 >= 0xE0 && c1 < 0xF0))
fb88bf2d 2333 {
54f78171
KH
2334 /* SJIS -> JISX0208 */
2335 ONE_MORE_BYTE (c2);
d14d03ac 2336 if (c2 >= 0x40 && c2 != 0x7F && c2 <= 0xFC)
54f78171
KH
2337 {
2338 DECODE_SJIS (c1, c2, c3, c4);
2339 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
2340 }
2341 else
2342 goto label_invalid_code_2;
fb88bf2d 2343 }
54f78171
KH
2344 else if (c1 < 0xE0)
2345 /* SJIS -> JISX0201-Kana */
5e34de15
KH
2346 {
2347 c2 = 0; /* avoid warning */
2348 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1,
2349 /* dummy */ c2);
2350 }
fb88bf2d 2351 else
54f78171 2352 goto label_invalid_code_1;
4ed46869 2353 }
fb88bf2d 2354 else
fb88bf2d 2355 {
54f78171
KH
2356 /* BIG5 -> Big5 */
2357 if (c1 >= 0xA1 && c1 <= 0xFE)
fb88bf2d 2358 {
54f78171
KH
2359 ONE_MORE_BYTE (c2);
2360 if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE))
2361 {
2362 int charset;
4ed46869 2363
54f78171
KH
2364 DECODE_BIG5 (c1, c2, charset, c3, c4);
2365 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
2366 }
2367 else
2368 goto label_invalid_code_2;
fb88bf2d
KH
2369 }
2370 else
54f78171 2371 goto label_invalid_code_1;
4ed46869
KH
2372 }
2373 }
2374 continue;
2375
fb88bf2d
KH
2376 label_invalid_code_1:
2377 *dst++ = c1;
2378 coding->produced_char++;
2379 coding->fake_multibyte = 1;
2380 continue;
2381
2382 label_invalid_code_2:
2383 *dst++ = c1; *dst++= c2;
2384 coding->produced_char += 2;
2385 coding->fake_multibyte = 1;
2386 continue;
2387
4ed46869 2388 label_end_of_loop:
d46c5b12
KH
2389 result = CODING_FINISH_INSUFFICIENT_SRC;
2390 label_end_of_loop_2:
4ed46869
KH
2391 src = src_base;
2392 break;
2393 }
2394
fb88bf2d
KH
2395 if (src < src_end)
2396 {
2397 if (result == CODING_FINISH_NORMAL)
2398 result = CODING_FINISH_INSUFFICIENT_DST;
2399 else if (result != CODING_FINISH_INCONSISTENT_EOL
2400 && coding->mode & CODING_MODE_LAST_BLOCK)
2401 {
2402 src_bytes = src_end - src;
2403 if (dst_bytes && (dst_end - dst < src_bytes))
2404 src_bytes = dst_end - dst;
2405 bcopy (dst, src, src_bytes);
2406 src += src_bytes;
2407 dst += src_bytes;
2408 coding->fake_multibyte = 1;
2409 }
2410 }
d46c5b12
KH
2411
2412 coding->consumed = coding->consumed_char = src - source;
2413 coding->produced = dst - destination;
2414 return result;
4ed46869
KH
2415}
2416
2417/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2418 This function can encode `charset_ascii', `charset_katakana_jisx0201',
2419 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
2420 sure that all these charsets are registered as official charset
2421 (i.e. do not have extended leading-codes). Characters of other
2422 charsets are produced without any encoding. If SJIS_P is 1, encode
2423 SJIS text, else encode BIG5 text. */
2424
2425int
2426encode_coding_sjis_big5 (coding, source, destination,
d46c5b12 2427 src_bytes, dst_bytes, sjis_p)
4ed46869
KH
2428 struct coding_system *coding;
2429 unsigned char *source, *destination;
2430 int src_bytes, dst_bytes;
4ed46869
KH
2431 int sjis_p;
2432{
2433 unsigned char *src = source;
2434 unsigned char *src_end = source + src_bytes;
2435 unsigned char *dst = destination;
2436 unsigned char *dst_end = destination + dst_bytes;
2437 /* Since the maximum bytes produced by each loop is 2, we subtract 1
2438 from DST_END to assure overflow checking is necessary only at the
2439 head of loop. */
2440 unsigned char *adjusted_dst_end = dst_end - 1;
84fbb8a0 2441 Lisp_Object translation_table
f967223b 2442 = coding->translation_table_for_encode;
d46c5b12 2443 int result = CODING_FINISH_NORMAL;
a5d301df 2444
84fbb8a0 2445 if (!NILP (Venable_character_translation) && NILP (translation_table))
f967223b 2446 translation_table = Vstandard_translation_table_for_encode;
4ed46869 2447
d46c5b12 2448 coding->consumed_char = 0;
fb88bf2d 2449 coding->fake_multibyte = 0;
d46c5b12
KH
2450 while (src < src_end && (dst_bytes
2451 ? (dst < adjusted_dst_end)
2452 : (dst < src - 1)))
4ed46869
KH
2453 {
2454 /* SRC_BASE remembers the start position in source in each loop.
2455 The loop will be exited when there's not enough source text
2456 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2457 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
2458 before exiting. */
2459 unsigned char *src_base = src;
2460 unsigned char c1 = *src++, c2, c3, c4;
2461
2462 if (coding->composing)
2463 {
2464 if (c1 == 0xA0)
2465 {
2466 ONE_MORE_BYTE (c1);
2467 c1 &= 0x7F;
2468 }
2469 else if (c1 >= 0xA0)
2470 c1 -= 0x20;
2471 else
2472 coding->composing = 0;
2473 }
2474
2475 switch (emacs_code_class[c1])
2476 {
2477 case EMACS_ascii_code:
a5d301df
KH
2478 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2479 break;
2480
4ed46869
KH
2481 case EMACS_control_code:
2482 *dst++ = c1;
d46c5b12 2483 coding->consumed_char++;
4ed46869
KH
2484 break;
2485
2486 case EMACS_carriage_return_code:
d46c5b12 2487 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
4ed46869
KH
2488 {
2489 *dst++ = c1;
d46c5b12 2490 coding->consumed_char++;
4ed46869
KH
2491 break;
2492 }
2493 /* fall down to treat '\r' as '\n' ... */
2494
2495 case EMACS_linefeed_code:
2496 if (coding->eol_type == CODING_EOL_LF
0ef69138 2497 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2498 *dst++ = '\n';
2499 else if (coding->eol_type == CODING_EOL_CRLF)
2500 *dst++ = '\r', *dst++ = '\n';
2501 else
2502 *dst++ = '\r';
d46c5b12 2503 coding->consumed_char++;
4ed46869
KH
2504 break;
2505
2506 case EMACS_leading_code_2:
2507 ONE_MORE_BYTE (c2);
a5d301df 2508 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
2509 break;
2510
2511 case EMACS_leading_code_3:
2512 TWO_MORE_BYTES (c2, c3);
a5d301df 2513 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
4ed46869
KH
2514 break;
2515
2516 case EMACS_leading_code_4:
2517 THREE_MORE_BYTES (c2, c3, c4);
a5d301df 2518 ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
4ed46869
KH
2519 break;
2520
2521 case EMACS_leading_code_composition:
2522 coding->composing = 1;
2523 break;
2524
2525 default: /* i.e. case EMACS_invalid_code: */
2526 *dst++ = c1;
d46c5b12 2527 coding->consumed_char++;
4ed46869
KH
2528 }
2529 continue;
2530
2531 label_end_of_loop:
d46c5b12
KH
2532 result = CODING_FINISH_INSUFFICIENT_SRC;
2533 src = src_base;
4ed46869
KH
2534 break;
2535 }
2536
d46c5b12
KH
2537 if (result == CODING_FINISH_NORMAL
2538 && src < src_end)
2539 result = CODING_FINISH_INSUFFICIENT_DST;
2540 coding->consumed = src - source;
2541 coding->produced = coding->produced_char = dst - destination;
2542 return result;
4ed46869
KH
2543}
2544
2545\f
1397dc18
KH
2546/*** 5. CCL handlers ***/
2547
2548/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2549 Check if a text is encoded in a coding system of which
2550 encoder/decoder are written in CCL program. If it is, return
2551 CODING_CATEGORY_MASK_CCL, else return 0. */
2552
2553int
2554detect_coding_ccl (src, src_end)
2555 unsigned char *src, *src_end;
2556{
2557 unsigned char *valid;
2558
2559 /* No coding system is assigned to coding-category-ccl. */
2560 if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
2561 return 0;
2562
2563 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
2564 while (src < src_end)
2565 {
2566 if (! valid[*src]) return 0;
2567 src++;
2568 }
2569 return CODING_CATEGORY_MASK_CCL;
2570}
2571
2572\f
2573/*** 6. End-of-line handlers ***/
4ed46869
KH
2574
2575/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
   This function is called only when `coding->eol_type' is
   CODING_EOL_CRLF or CODING_EOL_CR.

   Copy SRC_BYTES bytes from SOURCE to DESTINATION while converting
   the end-of-line format given by CODING->eol_type to '\n':

   CODING_EOL_CRLF: each "\r\n" becomes '\n'.  A '\r' not followed by
   '\n' is copied as is, and so is a '\n' not preceded by '\r', unless
   CODING_MODE_INHIBIT_INCONSISTENT_EOL is set, in which case decoding
   stops in front of it with CODING_FINISH_INCONSISTENT_EOL.

   CODING_EOL_CR: each '\r' becomes '\n'.  If
   CODING_MODE_INHIBIT_INCONSISTENT_EOL is set, only the text in front
   of the first '\n' is converted and CODING_FINISH_INCONSISTENT_EOL
   is returned.

   Any other value: the text is copied unchanged.

   On return, CODING->consumed, CODING->consumed_char,
   CODING->produced and CODING->produced_char are set.  The value is
   CODING_FINISH_NORMAL, CODING_FINISH_INSUFFICIENT_SRC,
   CODING_FINISH_INSUFFICIENT_DST or CODING_FINISH_INCONSISTENT_EOL. */
2578
dfcf069d 2579int
d46c5b12 2580decode_eol (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
2581 struct coding_system *coding;
2582 unsigned char *source, *destination;
2583 int src_bytes, dst_bytes;
4ed46869
KH
2584{
2585 unsigned char *src = source;
2586 unsigned char *src_end = source + src_bytes;
2587 unsigned char *dst = destination;
2588 unsigned char *dst_end = destination + dst_bytes;
fb88bf2d 2589 unsigned char c;
d46c5b12
KH
2590 int result = CODING_FINISH_NORMAL;
2591
fb88bf2d
KH
2592 coding->fake_multibyte = 0;
2593
/* Nothing to decode: report that nothing was consumed or produced. */
d46c5b12 2594 if (src_bytes <= 0)
716e0b0a
AI
2595 {
2596 coding->produced = coding->produced_char = 0;
2597 coding->consumed = coding->consumed_char = 0;
2598 return result;
2599 }
4ed46869
KH
2600
2601 switch (coding->eol_type)
2602 {
2603 case CODING_EOL_CRLF:
2604 {
2605 /* Since the maximum bytes produced by each loop is 2, we
2606 subtract 1 from DST_END to assure overflow checking is
2607 necessary only at the head of loop. */
2608 unsigned char *adjusted_dst_end = dst_end - 1;
2609
d46c5b12
KH
2610 while (src < src_end && (dst_bytes
2611 ? (dst < adjusted_dst_end)
2612 : (dst < src - 1)))
4ed46869
KH
2613 {
2614 unsigned char *src_base = src;
fb88bf2d
KH
2615
2616 c = *src++;
4ed46869
KH
2617 if (c == '\r')
2618 {
2619 ONE_MORE_BYTE (c);
fdfcf19d
KH
2620 if (c == '\n')
2621 *dst++ = c;
2622 else
d46c5b12
KH
2623 {
2624 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2625 {
2626 result = CODING_FINISH_INCONSISTENT_EOL;
2627 goto label_end_of_loop_2;
2628 }
/* A lone '\r' is copied as is, and the byte read after it is pushed
   back so that the next iteration handles it. */
fdfcf19d 2629 src--;
d46c5b12 2630 *dst++ = '\r';
fb88bf2d
KH
2631 if (BASE_LEADING_CODE_P (c))
2632 coding->fake_multibyte = 1;
d46c5b12 2633 }
4ed46869 2634 }
d46c5b12
KH
2635 else if (c == '\n'
2636 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2637 {
2638 result = CODING_FINISH_INCONSISTENT_EOL;
2639 goto label_end_of_loop_2;
2640 }
4ed46869 2641 else
fb88bf2d
KH
2642 {
2643 *dst++ = c;
2644 if (BASE_LEADING_CODE_P (c))
2645 coding->fake_multibyte = 1;
2646 }
4ed46869
KH
2647 continue;
2648
/* No statement in this function jumps to the label below; it is
   presumably the target used by ONE_MORE_BYTE when the source runs
   out (TODO confirm against its definition). */
2649 label_end_of_loop:
d46c5b12
KH
2650 result = CODING_FINISH_INSUFFICIENT_SRC;
2651 label_end_of_loop_2:
4ed46869
KH
2652 src = src_base;
2653 break;
2654 }
fdfcf19d
KH
2655 if (src < src_end)
2656 {
2657 if (result == CODING_FINISH_NORMAL)
2658 result = CODING_FINISH_INSUFFICIENT_DST;
2659 else if (result != CODING_FINISH_INCONSISTENT_EOL
2660 && coding->mode & CODING_MODE_LAST_BLOCK)
2661 {
2662 /* This is the last block of the text to be decoded.
2663 We flush out all remaining codes. */
2664 src_bytes = src_end - src;
2665 if (dst_bytes && (dst_end - dst < src_bytes))
2666 src_bytes = dst_end - dst;
2667 bcopy (src, dst, src_bytes);
2668 dst += src_bytes;
2669 src += src_bytes;
2670 }
2671 }
4ed46869 2672 }
d46c5b12 2673 break;
4ed46869
KH
2674
2675 case CODING_EOL_CR:
d46c5b12
KH
2676 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2677 {
/* Look for the first '\n'.  SRC is left one byte past it, or at
   SRC_END when there is none. */
fb88bf2d
KH
2678 while (src < src_end)
2679 {
2680 if ((c = *src++) == '\n')
2681 break;
2682 if (BASE_LEADING_CODE_P (c))
2683 coding->fake_multibyte = 1;
2684 }
/* Step back to the last byte examined.  If it is '\n', only the
   text in front of it is converted. */
d46c5b12
KH
2685 if (*--src == '\n')
2686 {
2687 src_bytes = src - source;
2688 result = CODING_FINISH_INCONSISTENT_EOL;
2689 }
2690 }
2691 if (dst_bytes && src_bytes > dst_bytes)
2692 {
2693 result = CODING_FINISH_INSUFFICIENT_DST;
2694 src_bytes = dst_bytes;
2695 }
2696 if (dst_bytes)
2697 bcopy (source, destination, src_bytes);
2698 else
2699 safe_bcopy (source, destination, src_bytes);
2700 src = source + src_bytes;
/* Replace each '\r' in the copied text with '\n'; DST ends up just
   past the copied text. */
2701 while (src_bytes--) if (*dst++ == '\r') dst[-1] = '\n';
4ed46869
KH
2702 break;
2703
2704 default: /* i.e. case: CODING_EOL_LF */
d46c5b12
KH
2705 if (dst_bytes && src_bytes > dst_bytes)
2706 {
2707 result = CODING_FINISH_INSUFFICIENT_DST;
2708 src_bytes = dst_bytes;
2709 }
2710 if (dst_bytes)
2711 bcopy (source, destination, src_bytes);
2712 else
2713 safe_bcopy (source, destination, src_bytes);
2714 src += src_bytes;
993824c9 2715 dst += src_bytes;
fb88bf2d 2716 coding->fake_multibyte = 1;
4ed46869
KH
2717 break;
2718 }
2719
d46c5b12
KH
2720 coding->consumed = coding->consumed_char = src - source;
2721 coding->produced = coding->produced_char = dst - destination;
2722 return result;
4ed46869
KH
2723}
2724
2725/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2726 format of end-of-line according to `coding->eol_type'. If
d46c5b12
KH
2727 `coding->mode & CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code
2728 '\r' in source text also means end-of-line. */
4ed46869 2729
dfcf069d 2730int
d46c5b12 2731encode_eol (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
2732 struct coding_system *coding;
2733 unsigned char *source, *destination;
2734 int src_bytes, dst_bytes;
4ed46869
KH
2735{
2736 unsigned char *src = source;
2737 unsigned char *dst = destination;
d46c5b12 2738 int result = CODING_FINISH_NORMAL;
4ed46869 2739
fb88bf2d
KH
2740 coding->fake_multibyte = 0;
2741
d46c5b12
KH
2742 if (coding->eol_type == CODING_EOL_CRLF)
2743 {
2744 unsigned char c;
2745 unsigned char *src_end = source + src_bytes;
2746 unsigned char *dst_end = destination + dst_bytes;
2747 /* Since the maximum bytes produced by each loop is 2, we
2748 subtract 1 from DST_END to assure overflow checking is
2749 necessary only at the head of loop. */
2750 unsigned char *adjusted_dst_end = dst_end - 1;
2751
2752 while (src < src_end && (dst_bytes
2753 ? (dst < adjusted_dst_end)
2754 : (dst < src - 1)))
2755 {
2756 c = *src++;
2757 if (c == '\n'
2758 || (c == '\r' && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)))
2759 *dst++ = '\r', *dst++ = '\n';
2760 else
fb88bf2d
KH
2761 {
2762 *dst++ = c;
2763 if (BASE_LEADING_CODE_P (c))
2764 coding->fake_multibyte = 1;
2765 }
d46c5b12
KH
2766 }
2767 if (src < src_end)
2768 result = CODING_FINISH_INSUFFICIENT_DST;
2769 }
2770 else
4ed46869 2771 {
fb88bf2d
KH
2772 unsigned char c;
2773
d46c5b12 2774 if (dst_bytes && src_bytes > dst_bytes)
4ed46869 2775 {
d46c5b12
KH
2776 src_bytes = dst_bytes;
2777 result = CODING_FINISH_INSUFFICIENT_DST;
2778 }
2779 if (dst_bytes)
2780 bcopy (source, destination, src_bytes);
2781 else
993824c9
RS
2782 safe_bcopy (source, destination, src_bytes);
2783 dst_bytes = src_bytes;
2784 if (coding->eol_type == CODING_EOL_CR)
d46c5b12
KH
2785 {
2786 while (src_bytes--)
fb88bf2d
KH
2787 {
2788 if ((c = *dst++) == '\n')
2789 dst[-1] = '\r';
2790 else if (BASE_LEADING_CODE_P (c))
993824c9 2791 coding->fake_multibyte = 1;
fb88bf2d 2792 }
d46c5b12 2793 }
fb88bf2d 2794 else
d46c5b12 2795 {
fb88bf2d
KH
2796 if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2797 {
2798 while (src_bytes--)
2799 if (*dst++ == '\r') dst[-1] = '\n';
2800 }
2801 coding->fake_multibyte = 1;
4ed46869 2802 }
fb88bf2d
KH
2803 src = source + dst_bytes;
2804 dst = destination + dst_bytes;
4ed46869
KH
2805 }
2806
d46c5b12
KH
2807 coding->consumed = coding->consumed_char = src - source;
2808 coding->produced = coding->produced_char = dst - destination;
2809 return result;
4ed46869
KH
2810}
2811
2812\f
1397dc18 2813/*** 7. C library functions ***/
4ed46869
KH
2814
2815/* In Emacs Lisp, coding system is represented by a Lisp symbol which
2816 has a property `coding-system'. The value of this property is a
2817 vector of length 5 (called as coding-vector). Among elements of
2818 this vector, the first (element[0]) and the fifth (element[4])
2819 carry important information for decoding/encoding. Before
2820 decoding/encoding, this information should be set in fields of a
2821 structure of type `coding_system'.
2822
2823 A value of property `coding-system' can be a symbol of another
2824 subsidiary coding-system. In that case, Emacs gets coding-vector
2825 from that symbol.
2826
2827 `element[0]' contains information to be set in `coding->type'. The
2828 value and its meaning is as follows:
2829
0ef69138
KH
2830 0 -- coding_type_emacs_mule
2831 1 -- coding_type_sjis
2832 2 -- coding_type_iso2022
2833 3 -- coding_type_big5
2834 4 -- coding_type_ccl encoder/decoder written in CCL
2835 nil -- coding_type_no_conversion
2836 t -- coding_type_undecided (automatic conversion on decoding,
2837 no-conversion on encoding)
4ed46869
KH
2838
2839 `element[4]' contains information to be set in `coding->flags' and
2840 `coding->spec'. The meaning varies by `coding->type'.
2841
2842 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2843 of length 32 (of which the first 17 sub-elements are used now).
2844 Meanings of these sub-elements are:
2845
2846 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2847 If the value is an integer of valid charset, the charset is
2848 assumed to be designated to graphic register N initially.
2849
2850 If the value is minus, it is a minus value of charset which
2851 reserves graphic register N, which means that the charset is
2852 not designated initially but should be designated to graphic
2853 register N just before encoding a character in that charset.
2854
2855 If the value is nil, graphic register N is never used on
2856 encoding.
2857
2858 sub-element[N] where N is 4 through 16: to be set in `coding->flags'
2859 Each value takes t or nil. See the section ISO2022 of
2860 `coding.h' for more information.
2861
2862 If `coding->type' is `coding_type_big5', element[4] is t to denote
2863 BIG5-ETen or nil to denote BIG5-HKU.
2864
2865 If `coding->type' takes the other value, element[4] is ignored.
2866
2867 Emacs Lisp's coding system also carries information about format of
2868 end-of-line in a value of property `eol-type'. If the value is
2869 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2870 means CODING_EOL_CR. If it is not integer, it should be a vector
2871 of subsidiary coding systems of which property `eol-type' has one
2872 of above values.
2873
2874*/
2875
2876/* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2877 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2878 is setup so that no conversion is necessary and return -1, else
2879 return 0. */
2880
2881int
e0e989f6
KH
2882setup_coding_system (coding_system, coding)
2883 Lisp_Object coding_system;
4ed46869
KH
2884 struct coding_system *coding;
2885{
d46c5b12 2886 Lisp_Object coding_spec, coding_type, eol_type, plist;
4608c386 2887 Lisp_Object val;
70c22245 2888 int i;
4ed46869 2889
d46c5b12 2890 /* Initialize some fields required for all kinds of coding systems. */
774324d6 2891 coding->symbol = coding_system;
d46c5b12
KH
2892 coding->common_flags = 0;
2893 coding->mode = 0;
2894 coding->heading_ascii = -1;
2895 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
1f5dbf34
KH
2896
2897 if (NILP (coding_system))
2898 goto label_invalid_coding_system;
2899
4608c386 2900 coding_spec = Fget (coding_system, Qcoding_system);
1f5dbf34 2901
4608c386
KH
2902 if (!VECTORP (coding_spec)
2903 || XVECTOR (coding_spec)->size != 5
2904 || !CONSP (XVECTOR (coding_spec)->contents[3]))
4ed46869 2905 goto label_invalid_coding_system;
4608c386 2906
d46c5b12
KH
2907 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
2908 if (VECTORP (eol_type))
2909 {
2910 coding->eol_type = CODING_EOL_UNDECIDED;
2911 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2912 }
2913 else if (XFASTINT (eol_type) == 1)
2914 {
2915 coding->eol_type = CODING_EOL_CRLF;
2916 coding->common_flags
2917 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2918 }
2919 else if (XFASTINT (eol_type) == 2)
2920 {
2921 coding->eol_type = CODING_EOL_CR;
2922 coding->common_flags
2923 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2924 }
2925 else
2926 coding->eol_type = CODING_EOL_LF;
2927
2928 coding_type = XVECTOR (coding_spec)->contents[0];
2929 /* Try short cut. */
2930 if (SYMBOLP (coding_type))
2931 {
2932 if (EQ (coding_type, Qt))
2933 {
2934 coding->type = coding_type_undecided;
2935 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
2936 }
2937 else
2938 coding->type = coding_type_no_conversion;
2939 return 0;
2940 }
2941
2942 /* Initialize remaining fields. */
2943 coding->composing = 0;
a63063ae 2944 coding->composed_chars = 0;
d46c5b12
KH
2945
2946 /* Get values of coding system properties:
2947 `post-read-conversion', `pre-write-conversion',
f967223b 2948 `translation-table-for-decode', `translation-table-for-encode'. */
4608c386 2949 plist = XVECTOR (coding_spec)->contents[3];
b843d1ae
KH
2950 /* Pre & post conversion functions should be disabled if
2951 inhibit_eol_conversion is nozero. This is the case that a code
2952 conversion function is called while those functions are running. */
2953 if (! inhibit_pre_post_conversion)
2954 {
2955 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
2956 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
2957 }
f967223b 2958 val = Fplist_get (plist, Qtranslation_table_for_decode);
4608c386 2959 if (SYMBOLP (val))
f967223b
KH
2960 val = Fget (val, Qtranslation_table_for_decode);
2961 coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
2962 val = Fplist_get (plist, Qtranslation_table_for_encode);
4608c386 2963 if (SYMBOLP (val))
f967223b
KH
2964 val = Fget (val, Qtranslation_table_for_encode);
2965 coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
d46c5b12
KH
2966 val = Fplist_get (plist, Qcoding_category);
2967 if (!NILP (val))
2968 {
2969 val = Fget (val, Qcoding_category_index);
2970 if (INTEGERP (val))
2971 coding->category_idx = XINT (val);
2972 else
2973 goto label_invalid_coding_system;
2974 }
2975 else
2976 goto label_invalid_coding_system;
4608c386 2977
70c22245
KH
2978 val = Fplist_get (plist, Qsafe_charsets);
2979 if (EQ (val, Qt))
2980 {
2981 for (i = 0; i <= MAX_CHARSET; i++)
2982 coding->safe_charsets[i] = 1;
2983 }
2984 else
2985 {
2986 bzero (coding->safe_charsets, MAX_CHARSET + 1);
2987 while (CONSP (val))
2988 {
03699b14 2989 if ((i = get_charset_id (XCAR (val))) >= 0)
70c22245 2990 coding->safe_charsets[i] = 1;
03699b14 2991 val = XCDR (val);
70c22245
KH
2992 }
2993 }
2994
d46c5b12 2995 switch (XFASTINT (coding_type))
4ed46869
KH
2996 {
2997 case 0:
0ef69138 2998 coding->type = coding_type_emacs_mule;
c952af22
KH
2999 if (!NILP (coding->post_read_conversion))
3000 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3001 if (!NILP (coding->pre_write_conversion))
3002 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4ed46869
KH
3003 break;
3004
3005 case 1:
3006 coding->type = coding_type_sjis;
c952af22
KH
3007 coding->common_flags
3008 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869
KH
3009 break;
3010
3011 case 2:
3012 coding->type = coding_type_iso2022;
c952af22
KH
3013 coding->common_flags
3014 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 3015 {
70c22245 3016 Lisp_Object val, temp;
4ed46869 3017 Lisp_Object *flags;
d46c5b12 3018 int i, charset, reg_bits = 0;
4ed46869 3019
4608c386 3020 val = XVECTOR (coding_spec)->contents[4];
f44d27ce 3021
4ed46869
KH
3022 if (!VECTORP (val) || XVECTOR (val)->size != 32)
3023 goto label_invalid_coding_system;
3024
3025 flags = XVECTOR (val)->contents;
3026 coding->flags
3027 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3028 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3029 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3030 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3031 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3032 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3033 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3034 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
e0e989f6
KH
3035 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3036 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
c4825358
KH
3037 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3038 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3f003981 3039 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
c4825358 3040 );
4ed46869
KH
3041
3042 /* Invoke graphic register 0 to plane 0. */
3043 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3044 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
3045 CODING_SPEC_ISO_INVOCATION (coding, 1)
3046 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3047 /* Not single shifting at first. */
6e85d753 3048 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
e0e989f6 3049 /* Beginning of buffer should also be regarded as bol. */
6e85d753 3050 CODING_SPEC_ISO_BOL (coding) = 1;
4ed46869 3051
70c22245
KH
3052 for (charset = 0; charset <= MAX_CHARSET; charset++)
3053 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3054 val = Vcharset_revision_alist;
3055 while (CONSP (val))
3056 {
03699b14 3057 charset = get_charset_id (Fcar_safe (XCAR (val)));
70c22245 3058 if (charset >= 0
03699b14 3059 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
70c22245
KH
3060 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3061 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
03699b14 3062 val = XCDR (val);
70c22245
KH
3063 }
3064
4ed46869
KH
3065 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3066 FLAGS[REG] can be one of below:
3067 integer CHARSET: CHARSET occupies register I,
3068 t: designate nothing to REG initially, but can be used
3069 by any charsets,
3070 list of integer, nil, or t: designate the first
3071 element (if integer) to REG initially, the remaining
3072 elements (if integer) is designated to REG on request,
d46c5b12 3073 if an element is t, REG can be used by any charsets,
4ed46869 3074 nil: REG is never used. */
467e7675 3075 for (charset = 0; charset <= MAX_CHARSET; charset++)
1ba9e4ab
KH
3076 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3077 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
4ed46869
KH
3078 for (i = 0; i < 4; i++)
3079 {
3080 if (INTEGERP (flags[i])
e0e989f6
KH
3081 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
3082 || (charset = get_charset_id (flags[i])) >= 0)
4ed46869
KH
3083 {
3084 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3085 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3086 }
3087 else if (EQ (flags[i], Qt))
3088 {
3089 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
d46c5b12
KH
3090 reg_bits |= 1 << i;
3091 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
4ed46869
KH
3092 }
3093 else if (CONSP (flags[i]))
3094 {
84d60297
RS
3095 Lisp_Object tail;
3096 tail = flags[i];
4ed46869 3097
d46c5b12 3098 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
03699b14
KR
3099 if (INTEGERP (XCAR (tail))
3100 && (charset = XINT (XCAR (tail)),
e0e989f6 3101 CHARSET_VALID_P (charset))
03699b14 3102 || (charset = get_charset_id (XCAR (tail))) >= 0)
4ed46869
KH
3103 {
3104 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3105 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3106 }
3107 else
3108 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
03699b14 3109 tail = XCDR (tail);
4ed46869
KH
3110 while (CONSP (tail))
3111 {
03699b14
KR
3112 if (INTEGERP (XCAR (tail))
3113 && (charset = XINT (XCAR (tail)),
e0e989f6 3114 CHARSET_VALID_P (charset))
03699b14 3115 || (charset = get_charset_id (XCAR (tail))) >= 0)
70c22245
KH
3116 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3117 = i;
03699b14 3118 else if (EQ (XCAR (tail), Qt))
d46c5b12 3119 reg_bits |= 1 << i;
03699b14 3120 tail = XCDR (tail);
4ed46869
KH
3121 }
3122 }
3123 else
3124 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3125
3126 CODING_SPEC_ISO_DESIGNATION (coding, i)
3127 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3128 }
3129
d46c5b12 3130 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
4ed46869
KH
3131 {
3132 /* REG 1 can be used only by locking shift in 7-bit env. */
3133 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
d46c5b12 3134 reg_bits &= ~2;
4ed46869
KH
3135 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3136 /* Without any shifting, only REG 0 and 1 can be used. */
d46c5b12 3137 reg_bits &= 3;
4ed46869
KH
3138 }
3139
d46c5b12
KH
3140 if (reg_bits)
3141 for (charset = 0; charset <= MAX_CHARSET; charset++)
6e85d753 3142 {
d46c5b12
KH
3143 if (CHARSET_VALID_P (charset))
3144 {
3145 /* There exist some default graphic registers to be
3146 used CHARSET. */
3147
3148 /* We had better avoid designating a charset of
3149 CHARS96 to REG 0 as far as possible. */
3150 if (CHARSET_CHARS (charset) == 96)
3151 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3152 = (reg_bits & 2
3153 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3154 else
3155 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3156 = (reg_bits & 1
3157 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3158 }
6e85d753 3159 }
4ed46869 3160 }
c952af22 3161 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
d46c5b12 3162 coding->spec.iso2022.last_invalid_designation_register = -1;
4ed46869
KH
3163 break;
3164
3165 case 3:
3166 coding->type = coding_type_big5;
c952af22
KH
3167 coding->common_flags
3168 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 3169 coding->flags
4608c386 3170 = (NILP (XVECTOR (coding_spec)->contents[4])
4ed46869
KH
3171 ? CODING_FLAG_BIG5_HKU
3172 : CODING_FLAG_BIG5_ETEN);
3173 break;
3174
3175 case 4:
3176 coding->type = coding_type_ccl;
c952af22
KH
3177 coding->common_flags
3178 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 3179 {
84d60297 3180 val = XVECTOR (coding_spec)->contents[4];
ef4ced28
KH
3181 if (! CONSP (val)
3182 || setup_ccl_program (&(coding->spec.ccl.decoder),
03699b14 3183 XCAR (val)) < 0
ef4ced28 3184 || setup_ccl_program (&(coding->spec.ccl.encoder),
03699b14 3185 XCDR (val)) < 0)
4ed46869 3186 goto label_invalid_coding_system;
1397dc18
KH
3187
3188 bzero (coding->spec.ccl.valid_codes, 256);
3189 val = Fplist_get (plist, Qvalid_codes);
3190 if (CONSP (val))
3191 {
3192 Lisp_Object this;
3193
03699b14 3194 for (; CONSP (val); val = XCDR (val))
1397dc18 3195 {
03699b14 3196 this = XCAR (val);
1397dc18
KH
3197 if (INTEGERP (this)
3198 && XINT (this) >= 0 && XINT (this) < 256)
3199 coding->spec.ccl.valid_codes[XINT (this)] = 1;
3200 else if (CONSP (this)
03699b14
KR
3201 && INTEGERP (XCAR (this))
3202 && INTEGERP (XCDR (this)))
1397dc18 3203 {
03699b14
KR
3204 int start = XINT (XCAR (this));
3205 int end = XINT (XCDR (this));
1397dc18
KH
3206
3207 if (start >= 0 && start <= end && end < 256)
e133c8fa 3208 while (start <= end)
1397dc18
KH
3209 coding->spec.ccl.valid_codes[start++] = 1;
3210 }
3211 }
3212 }
4ed46869 3213 }
c952af22 3214 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
4ed46869
KH
3215 break;
3216
27901516
KH
3217 case 5:
3218 coding->type = coding_type_raw_text;
3219 break;
3220
4ed46869 3221 default:
d46c5b12 3222 goto label_invalid_coding_system;
4ed46869
KH
3223 }
3224 return 0;
3225
3226 label_invalid_coding_system:
3227 coding->type = coding_type_no_conversion;
d46c5b12 3228 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
c952af22 3229 coding->common_flags = 0;
dec137e5 3230 coding->eol_type = CODING_EOL_LF;
d46c5b12 3231 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
4ed46869
KH
3232 return -1;
3233}
3234
54f78171
KH
3235/* Setup raw-text or one of its subsidiaries in the structure
3236 coding_system CODING according to the already setup value eol_type
3237 in CODING. CODING should be setup for some coding system in
3238 advance. */
3239
3240void
3241setup_raw_text_coding_system (coding)
3242 struct coding_system *coding;
3243{
3244 if (coding->type != coding_type_raw_text)
3245 {
3246 coding->symbol = Qraw_text;
3247 coding->type = coding_type_raw_text;
3248 if (coding->eol_type != CODING_EOL_UNDECIDED)
3249 {
84d60297
RS
3250 Lisp_Object subsidiaries;
3251 subsidiaries = Fget (Qraw_text, Qeol_type);
54f78171
KH
3252
3253 if (VECTORP (subsidiaries)
3254 && XVECTOR (subsidiaries)->size == 3)
3255 coding->symbol
3256 = XVECTOR (subsidiaries)->contents[coding->eol_type];
3257 }
716e0b0a 3258 setup_coding_system (coding->symbol, coding);
54f78171
KH
3259 }
3260 return;
3261}
3262
4ed46869
KH
3263/* Emacs has a mechanism to automatically detect a coding system if it
3264 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
3265 it's impossible to distinguish some coding systems accurately
3266 because they use the same range of codes. So, at first, coding
3267 systems are categorized into the following categories:
3268
0ef69138 3269 o coding-category-emacs-mule
4ed46869
KH
3270
3271 The category for a coding system which has the same code range
3272 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 3273 symbol) `emacs-mule' by default.
4ed46869
KH
3274
3275 o coding-category-sjis
3276
3277 The category for a coding system which has the same code range
3278 as SJIS. Assigned the coding-system (Lisp
7717c392 3279 symbol) `japanese-shift-jis' by default.
4ed46869
KH
3280
3281 o coding-category-iso-7
3282
3283 The category for a coding system which has the same code range
7717c392 3284 as ISO2022 of 7-bit environment. This doesn't use any locking
d46c5b12
KH
3285 shift and single shift functions. This can encode/decode all
3286 charsets. Assigned the coding-system (Lisp symbol)
3287 `iso-2022-7bit' by default.
3288
3289 o coding-category-iso-7-tight
3290
3291 Same as coding-category-iso-7 except that this can
3292 encode/decode only the specified charsets.
4ed46869
KH
3293
3294 o coding-category-iso-8-1
3295
3296 The category for a coding system which has the same code range
3297 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
3298 for DIMENSION1 charset. This doesn't use any locking shift
3299 and single shift functions. Assigned the coding-system (Lisp
3300 symbol) `iso-latin-1' by default.
4ed46869
KH
3301
3302 o coding-category-iso-8-2
3303
3304 The category for a coding system which has the same code range
3305 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
3306 for DIMENSION2 charset. This doesn't use any locking shift
3307 and single shift functions. Assigned the coding-system (Lisp
3308 symbol) `japanese-iso-8bit' by default.
4ed46869 3309
7717c392 3310 o coding-category-iso-7-else
4ed46869
KH
3311
3312 The category for a coding system which has the same code range
7717c392
KH
3313 as ISO2022 of 7-bit environment but uses locking shift or
3314 single shift functions. Assigned the coding-system (Lisp
3315 symbol) `iso-2022-7bit-lock' by default.
3316
3317 o coding-category-iso-8-else
3318
3319 The category for a coding system which has the same code range
3320 as ISO2022 of 8-bit environment but uses locking shift or
3321 single shift functions. Assigned the coding-system (Lisp
3322 symbol) `iso-2022-8bit-ss2' by default.
4ed46869
KH
3323
3324 o coding-category-big5
3325
3326 The category for a coding system which has the same code range
3327 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 3328 `cn-big5' by default.
4ed46869 3329
1397dc18
KH
3330 o coding-category-ccl
3331
3332 The category for a coding system of which encoder/decoder is
3333 written in CCL programs. The default value is nil, i.e., no
3334 coding system is assigned.
3335
4ed46869
KH
3336 o coding-category-binary
3337
3338 The category for a coding system not categorized in any of the
3339 above. Assigned the coding-system (Lisp symbol)
e0e989f6 3340 `no-conversion' by default.
4ed46869
KH
3341
3342 Each of them is a Lisp symbol and the value is an actual
3343 `coding-system's (this is also a Lisp symbol) assigned by a user.
3344 What Emacs does actually is to detect a category of coding system.
3345 Then, it uses a `coding-system' assigned to it. If Emacs can't
3346 decide only one possible category, it selects a category of the
3347 highest priority. Priorities of categories are also specified by a
3348 user in a Lisp variable `coding-category-list'.
3349
3350*/
3351
66cfb530
KH
3352static
3353int ascii_skip_code[256];
3354
d46c5b12 3355/* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4ed46869
KH
3356 If it detects possible coding systems, return an integer in which
3357 appropriate flag bits are set. Flag bits are defined by macros
d46c5b12 3358 CODING_CATEGORY_MASK_XXX in `coding.h'.
4ed46869 3359
d46c5b12
KH
3360 How many ASCII characters are at the head is returned as *SKIP. */
3361
3362static int
3363detect_coding_mask (source, src_bytes, priorities, skip)
3364 unsigned char *source;
3365 int src_bytes, *priorities, *skip;
4ed46869
KH
3366{
3367 register unsigned char c;
d46c5b12 3368 unsigned char *src = source, *src_end = source + src_bytes;
66cfb530 3369 unsigned int mask;
d46c5b12 3370 int i;
4ed46869
KH
3371
3372 /* At first, skip all ASCII characters and control characters except
3373 for three ISO2022 specific control characters. */
66cfb530
KH
3374 ascii_skip_code[ISO_CODE_SO] = 0;
3375 ascii_skip_code[ISO_CODE_SI] = 0;
3376 ascii_skip_code[ISO_CODE_ESC] = 0;
3377
bcf26d6a 3378 label_loop_detect_coding:
66cfb530 3379 while (src < src_end && ascii_skip_code[*src]) src++;
d46c5b12 3380 *skip = src - source;
4ed46869
KH
3381
3382 if (src >= src_end)
3383 /* We found nothing other than ASCII. There's nothing to do. */
d46c5b12 3384 return 0;
4ed46869 3385
8a8147d6 3386 c = *src;
4ed46869
KH
3387 /* The text seems to be encoded in some multilingual coding system.
3388 Now, try to find in which coding system the text is encoded. */
3389 if (c < 0x80)
bcf26d6a
KH
3390 {
3391 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3392 /* C is an ISO2022 specific control code of C0. */
3393 mask = detect_coding_iso2022 (src, src_end);
1b2af4b0 3394 if (mask == 0)
d46c5b12
KH
3395 {
3396 /* No valid ISO2022 code follows C. Try again. */
3397 src++;
66cfb530
KH
3398 if (c == ISO_CODE_ESC)
3399 ascii_skip_code[ISO_CODE_ESC] = 1;
3400 else
3401 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
d46c5b12
KH
3402 goto label_loop_detect_coding;
3403 }
3404 if (priorities)
3405 goto label_return_highest_only;
bcf26d6a 3406 }
d46c5b12 3407 else
c4825358 3408 {
d46c5b12 3409 int try;
4ed46869 3410
d46c5b12
KH
3411 if (c < 0xA0)
3412 {
3413 /* C is the first byte of SJIS character code,
3414 or a leading-code of Emacs' internal format (emacs-mule). */
3415 try = CODING_CATEGORY_MASK_SJIS | CODING_CATEGORY_MASK_EMACS_MULE;
3416
3417 /* Or, if C is a special latin extra code,
3418 or is an ISO2022 specific control code of C1 (SS2 or SS3),
3419 or is an ISO2022 control-sequence-introducer (CSI),
3420 we should also consider the possibility of ISO2022 codings. */
3421 if ((VECTORP (Vlatin_extra_code_table)
3422 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3423 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3424 || (c == ISO_CODE_CSI
3425 && (src < src_end
3426 && (*src == ']'
3427 || ((*src == '0' || *src == '1' || *src == '2')
3428 && src + 1 < src_end
3429 && src[1] == ']')))))
3430 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3431 | CODING_CATEGORY_MASK_ISO_8BIT);
3432 }
c4825358 3433 else
d46c5b12
KH
3434 /* C is a character of ISO2022 in graphic plane right,
3435 or a SJIS's 1-byte character code (i.e. JISX0201),
3436 or the first byte of BIG5's 2-byte code. */
3437 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3438 | CODING_CATEGORY_MASK_ISO_8BIT
3439 | CODING_CATEGORY_MASK_SJIS
3440 | CODING_CATEGORY_MASK_BIG5);
3441
1397dc18
KH
3442 /* Or, we may have to consider the possibility of CCL. */
3443 if (coding_system_table[CODING_CATEGORY_IDX_CCL]
3444 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
3445 ->spec.ccl.valid_codes)[c])
3446 try |= CODING_CATEGORY_MASK_CCL;
3447
d46c5b12
KH
3448 mask = 0;
3449 if (priorities)
3450 {
3451 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3452 {
5ab13dd0 3453 if (priorities[i] & try & CODING_CATEGORY_MASK_ISO)
d46c5b12 3454 mask = detect_coding_iso2022 (src, src_end);
5ab13dd0 3455 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
d46c5b12 3456 mask = detect_coding_sjis (src, src_end);
5ab13dd0 3457 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
d46c5b12 3458 mask = detect_coding_big5 (src, src_end);
5ab13dd0 3459 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
d46c5b12 3460 mask = detect_coding_emacs_mule (src, src_end);
89fa8b36 3461 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
1397dc18 3462 mask = detect_coding_ccl (src, src_end);
5ab13dd0
RS
3463 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
3464 mask = CODING_CATEGORY_MASK_RAW_TEXT;
3465 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
3466 mask = CODING_CATEGORY_MASK_BINARY;
d46c5b12
KH
3467 if (mask)
3468 goto label_return_highest_only;
3469 }
3470 return CODING_CATEGORY_MASK_RAW_TEXT;
3471 }
3472 if (try & CODING_CATEGORY_MASK_ISO)
3473 mask |= detect_coding_iso2022 (src, src_end);
3474 if (try & CODING_CATEGORY_MASK_SJIS)
3475 mask |= detect_coding_sjis (src, src_end);
3476 if (try & CODING_CATEGORY_MASK_BIG5)
3477 mask |= detect_coding_big5 (src, src_end);
3478 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
1397dc18
KH
3479 mask |= detect_coding_emacs_mule (src, src_end);
3480 if (try & CODING_CATEGORY_MASK_CCL)
3481 mask |= detect_coding_ccl (src, src_end);
c4825358 3482 }
5ab13dd0 3483 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
d46c5b12
KH
3484
3485 label_return_highest_only:
3486 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3487 {
3488 if (mask & priorities[i])
3489 return priorities[i];
3490 }
3491 return CODING_CATEGORY_MASK_RAW_TEXT;
4ed46869
KH
3492}
3493
3494/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3495 The information of the detected coding system is set in CODING. */
3496
3497void
3498detect_coding (coding, src, src_bytes)
3499 struct coding_system *coding;
3500 unsigned char *src;
3501 int src_bytes;
3502{
d46c5b12
KH
3503 unsigned int idx;
3504 int skip, mask, i;
84d60297 3505 Lisp_Object val;
4ed46869 3506
84d60297 3507 val = Vcoding_category_list;
66cfb530 3508 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip);
d46c5b12 3509 coding->heading_ascii = skip;
4ed46869 3510
d46c5b12
KH
3511 if (!mask) return;
3512
3513 /* We found a single coding system of the highest priority in MASK. */
3514 idx = 0;
3515 while (mask && ! (mask & 1)) mask >>= 1, idx++;
3516 if (! mask)
3517 idx = CODING_CATEGORY_IDX_RAW_TEXT;
4ed46869 3518
d46c5b12
KH
3519 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3520
3521 if (coding->eol_type != CODING_EOL_UNDECIDED)
27901516 3522 {
84d60297 3523 Lisp_Object tmp;
d46c5b12 3524
84d60297 3525 tmp = Fget (val, Qeol_type);
d46c5b12
KH
3526 if (VECTORP (tmp))
3527 val = XVECTOR (tmp)->contents[coding->eol_type];
4ed46869 3528 }
d46c5b12
KH
3529 setup_coding_system (val, coding);
3530 /* Set this again because setup_coding_system reset this member. */
3531 coding->heading_ascii = skip;
4ed46869
KH
3532}
3533
d46c5b12
KH
3534/* Detect how end-of-line of a text of length SRC_BYTES pointed by
3535 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3536 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3537
3538 How many non-eol characters are at the head is returned as *SKIP. */
4ed46869 3539
bc4bc72a
RS
3540#define MAX_EOL_CHECK_COUNT 3
3541
d46c5b12
KH
3542static int
3543detect_eol_type (source, src_bytes, skip)
3544 unsigned char *source;
3545 int src_bytes, *skip;
4ed46869 3546{
d46c5b12 3547 unsigned char *src = source, *src_end = src + src_bytes;
4ed46869 3548 unsigned char c;
bc4bc72a
RS
3549 int total = 0; /* How many end-of-lines are found so far. */
3550 int eol_type = CODING_EOL_UNDECIDED;
3551 int this_eol_type;
4ed46869 3552
d46c5b12
KH
3553 *skip = 0;
3554
bc4bc72a 3555 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4ed46869
KH
3556 {
3557 c = *src++;
bc4bc72a 3558 if (c == '\n' || c == '\r')
4ed46869 3559 {
d46c5b12
KH
3560 if (*skip == 0)
3561 *skip = src - 1 - source;
bc4bc72a
RS
3562 total++;
3563 if (c == '\n')
3564 this_eol_type = CODING_EOL_LF;
3565 else if (src >= src_end || *src != '\n')
3566 this_eol_type = CODING_EOL_CR;
4ed46869 3567 else
bc4bc72a
RS
3568 this_eol_type = CODING_EOL_CRLF, src++;
3569
3570 if (eol_type == CODING_EOL_UNDECIDED)
3571 /* This is the first end-of-line. */
3572 eol_type = this_eol_type;
3573 else if (eol_type != this_eol_type)
d46c5b12
KH
3574 {
3575 /* The found type is different from what found before. */
3576 eol_type = CODING_EOL_INCONSISTENT;
3577 break;
3578 }
4ed46869
KH
3579 }
3580 }
bc4bc72a 3581
d46c5b12
KH
3582 if (*skip == 0)
3583 *skip = src_end - source;
85a02ca4 3584 return eol_type;
4ed46869
KH
3585}
3586
3587/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3588 is encoded. If it detects an appropriate format of end-of-line, it
3589 sets the information in *CODING. */
3590
3591void
3592detect_eol (coding, src, src_bytes)
3593 struct coding_system *coding;
3594 unsigned char *src;
3595 int src_bytes;
3596{
4608c386 3597 Lisp_Object val;
d46c5b12
KH
3598 int skip;
3599 int eol_type = detect_eol_type (src, src_bytes, &skip);
3600
3601 if (coding->heading_ascii > skip)
3602 coding->heading_ascii = skip;
3603 else
3604 skip = coding->heading_ascii;
4ed46869 3605
0ef69138 3606 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869 3607 return;
27901516
KH
3608 if (eol_type == CODING_EOL_INCONSISTENT)
3609 {
3610#if 0
3611 /* This code is suppressed until we find a better way to
992f23f2 3612 distinguish raw text file and binary file. */
27901516
KH
3613
3614 /* If we have already detected that the coding is raw-text, the
3615 coding should actually be no-conversion. */
3616 if (coding->type == coding_type_raw_text)
3617 {
3618 setup_coding_system (Qno_conversion, coding);
3619 return;
3620 }
3621 /* Else, let's decode only text code anyway. */
3622#endif /* 0 */
1b2af4b0 3623 eol_type = CODING_EOL_LF;
27901516
KH
3624 }
3625
4608c386 3626 val = Fget (coding->symbol, Qeol_type);
4ed46869 3627 if (VECTORP (val) && XVECTOR (val)->size == 3)
d46c5b12
KH
3628 {
3629 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
3630 coding->heading_ascii = skip;
3631 }
3632}
3633
/* Extra bytes added to every conversion buffer size estimate.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which the byte length of a text may grow when it is
   decoded by CODING (a pointer to struct coding_system).  The
   argument is parenthesized so that any pointer expression, e.g.
   `&cs', can be passed.  */
#define DECODING_BUFFER_MAG(coding)                                          \
  ((coding)->type == coding_type_iso2022                                     \
   ? 3                                                                       \
   : (((coding)->type == coding_type_sjis                                    \
       || (coding)->type == coding_type_big5)                                \
      ? 2                                                                    \
      : ((coding)->type == coding_type_raw_text                              \
         ? 1                                                                 \
         : ((coding)->type == coding_type_ccl                                \
            ? (coding)->spec.ccl.decoder.buf_magnification                   \
            : 2))))
3646
3647/* Return maximum size (bytes) of a buffer enough for decoding
3648 SRC_BYTES of text encoded in CODING. */
3649
3650int
3651decoding_buffer_size (coding, src_bytes)
3652 struct coding_system *coding;
3653 int src_bytes;
3654{
3655 return (src_bytes * DECODING_BUFFER_MAG (coding)
3656 + CONVERSION_BUFFER_EXTRA_ROOM);
3657}
3658
3659/* Return maximum size (bytes) of a buffer enough for encoding
3660 SRC_BYTES of text to CODING. */
3661
3662int
3663encoding_buffer_size (coding, src_bytes)
3664 struct coding_system *coding;
3665 int src_bytes;
3666{
3667 int magnification;
3668
3669 if (coding->type == coding_type_ccl)
3670 magnification = coding->spec.ccl.encoder.buf_magnification;
3671 else
3672 magnification = 3;
3673
3674 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3675}
3676
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* Work buffer shared by encoding and decoding, and its current size
   in bytes.  Both are maintained by get_conversion_buffer.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.  The shared buffer is reused when it is
   already large enough; otherwise it is replaced by a larger one
   (the old contents are NOT copied over).

   NOTE(review): the value of xmalloc is not checked here; presumably
   xmalloc does not return when memory is exhausted -- confirm.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size * 2;

      /* Doubling a size of zero would loop forever, so start from
         the minimum size when the buffer is still unset or tiny.  */
      if (real_size < MINIMUM_CONVERSION_BUFFER_SIZE)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;
      while (real_size < size)
        real_size *= 2;

      /* Allocate first, so that the old buffer is still in place if
         the allocation does not return.  */
      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
3705
3706int
3707ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
3708 struct coding_system *coding;
3709 unsigned char *source, *destination;
3710 int src_bytes, dst_bytes, encodep;
3711{
3712 struct ccl_program *ccl
3713 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
3714 int result;
3715
ae9ff118 3716 ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
7b179c2d 3717
d46c5b12
KH
3718 coding->produced = ccl_driver (ccl, source, destination,
3719 src_bytes, dst_bytes, &(coding->consumed));
69f76525 3720 coding->produced_char
48942766
KH
3721 = (encodep
3722 ? coding->produced
3723 : multibyte_chars_in_text (destination, coding->produced));
69f76525
KH
3724 coding->consumed_char
3725 = multibyte_chars_in_text (source, coding->consumed);
3726
d46c5b12
KH
3727 switch (ccl->status)
3728 {
3729 case CCL_STAT_SUSPEND_BY_SRC:
3730 result = CODING_FINISH_INSUFFICIENT_SRC;
3731 break;
3732 case CCL_STAT_SUSPEND_BY_DST:
3733 result = CODING_FINISH_INSUFFICIENT_DST;
3734 break;
9864ebce
KH
3735 case CCL_STAT_QUIT:
3736 case CCL_STAT_INVALID_CMD:
3737 result = CODING_FINISH_INTERRUPT;
3738 break;
d46c5b12
KH
3739 default:
3740 result = CODING_FINISH_NORMAL;
3741 break;
3742 }
3743 return result;
4ed46869
KH
3744}
3745
3746/* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
3747 decoding, it may detect coding system and format of end-of-line if
52d41803
KH
3748 those are not yet decided.
3749
3750 This function does not make full use of DESTINATION buffer. For
3751 instance, if coding->type is coding_type_iso2022, it uses only
3752 (DST_BYTES - 7) bytes of DESTINATION buffer. In the case that
3753 DST_BYTES is decided by the function decoding_buffer_size, it
3754 contains extra 256 bytes (defined by CONVERSION_BUFFER_EXTRA_ROOM).
3755 So, this function can decode the full SOURCE. But, in the other
3756 case, if you want to avoid carry over, you must supply at least 7
3757 bytes more area in DESTINATION buffer than expected maximum bytes
3758 that will be produced by this function. */
4ed46869
KH
3759
3760int
d46c5b12 3761decode_coding (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
3762 struct coding_system *coding;
3763 unsigned char *source, *destination;
3764 int src_bytes, dst_bytes;
4ed46869 3765{
d46c5b12 3766 int result;
4ed46869 3767
d4e57bcd 3768 if (src_bytes <= 0
944bd420 3769 && coding->type != coding_type_ccl
d4e57bcd
KH
3770 && ! (coding->mode & CODING_MODE_LAST_BLOCK
3771 && CODING_REQUIRE_FLUSHING (coding)))
4ed46869 3772 {
d46c5b12
KH
3773 coding->produced = coding->produced_char = 0;
3774 coding->consumed = coding->consumed_char = 0;
fb88bf2d 3775 coding->fake_multibyte = 0;
d46c5b12 3776 return CODING_FINISH_NORMAL;
4ed46869
KH
3777 }
3778
0ef69138 3779 if (coding->type == coding_type_undecided)
4ed46869
KH
3780 detect_coding (coding, source, src_bytes);
3781
0ef69138 3782 if (coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
3783 detect_eol (coding, source, src_bytes);
3784
4ed46869
KH
3785 switch (coding->type)
3786 {
0ef69138
KH
3787 case coding_type_emacs_mule:
3788 case coding_type_undecided:
27901516 3789 case coding_type_raw_text:
4ed46869 3790 if (coding->eol_type == CODING_EOL_LF
0ef69138 3791 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869 3792 goto label_no_conversion;
d46c5b12 3793 result = decode_eol (coding, source, destination, src_bytes, dst_bytes);
4ed46869
KH
3794 break;
3795
3796 case coding_type_sjis:
d46c5b12
KH
3797 result = decode_coding_sjis_big5 (coding, source, destination,
3798 src_bytes, dst_bytes, 1);
4ed46869
KH
3799 break;
3800
3801 case coding_type_iso2022:
d46c5b12
KH
3802 result = decode_coding_iso2022 (coding, source, destination,
3803 src_bytes, dst_bytes);
4ed46869
KH
3804 break;
3805
3806 case coding_type_big5:
d46c5b12
KH
3807 result = decode_coding_sjis_big5 (coding, source, destination,
3808 src_bytes, dst_bytes, 0);
4ed46869
KH
3809 break;
3810
3811 case coding_type_ccl:
d46c5b12
KH
3812 result = ccl_coding_driver (coding, source, destination,
3813 src_bytes, dst_bytes, 0);
3814 break;
3815
3816 default: /* i.e. case coding_type_no_conversion: */
3817 label_no_conversion:
3818 if (dst_bytes && src_bytes > dst_bytes)
3819 {
3820 coding->produced = dst_bytes;
3821 result = CODING_FINISH_INSUFFICIENT_DST;
3822 }
3823 else
3824 {
3825 coding->produced = src_bytes;
3826 result = CODING_FINISH_NORMAL;
3827 }
3828 if (dst_bytes)
3829 bcopy (source, destination, coding->produced);
3830 else
3831 safe_bcopy (source, destination, coding->produced);
fb88bf2d 3832 coding->fake_multibyte = 1;
d46c5b12
KH
3833 coding->consumed
3834 = coding->consumed_char = coding->produced_char = coding->produced;
4ed46869
KH
3835 break;
3836 }
3837
d46c5b12 3838 return result;
4ed46869
KH
3839}
3840
52d41803
KH
3841/* See "GENERAL NOTES about `encode_coding_XXX ()' functions".
3842
3843 This function does not make full use of DESTINATION buffer. For
3844 instance, if coding->type is coding_type_iso2022, it uses only
3845 (DST_BYTES - 20) bytes of DESTINATION buffer. In the case that
3846 DST_BYTES is decided by the function encoding_buffer_size, it
3847 contains extra 256 bytes (defined by CONVERSION_BUFFER_EXTRA_ROOM).
3848 So, this function can encode the full SOURCE. But, in the other
3849 case, if you want to avoid carry over, you must supply at least 20
3850 bytes more area in DESTINATION buffer than expected maximum bytes
3851 that will be produced by this function. */
4ed46869
KH
3852
3853int
d46c5b12 3854encode_coding (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
3855 struct coding_system *coding;
3856 unsigned char *source, *destination;
3857 int src_bytes, dst_bytes;
4ed46869 3858{
d46c5b12 3859 int result;
4ed46869 3860
d4e57bcd
KH
3861 if (src_bytes <= 0
3862 && ! (coding->mode & CODING_MODE_LAST_BLOCK
3863 && CODING_REQUIRE_FLUSHING (coding)))
4ed46869 3864 {
d46c5b12
KH
3865 coding->produced = coding->produced_char = 0;
3866 coding->consumed = coding->consumed_char = 0;
fb88bf2d 3867 coding->fake_multibyte = 0;
d46c5b12
KH
3868 return CODING_FINISH_NORMAL;
3869 }
4ed46869 3870
d46c5b12
KH
3871 switch (coding->type)
3872 {
0ef69138
KH
3873 case coding_type_emacs_mule:
3874 case coding_type_undecided:
27901516 3875 case coding_type_raw_text:
4ed46869 3876 if (coding->eol_type == CODING_EOL_LF
0ef69138 3877 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869 3878 goto label_no_conversion;
d46c5b12 3879 result = encode_eol (coding, source, destination, src_bytes, dst_bytes);
4ed46869
KH
3880 break;
3881
3882 case coding_type_sjis:
d46c5b12
KH
3883 result = encode_coding_sjis_big5 (coding, source, destination,
3884 src_bytes, dst_bytes, 1);
4ed46869
KH
3885 break;
3886
3887 case coding_type_iso2022:
d46c5b12
KH
3888 result = encode_coding_iso2022 (coding, source, destination,
3889 src_bytes, dst_bytes);
4ed46869
KH
3890 break;
3891
3892 case coding_type_big5:
d46c5b12
KH
3893 result = encode_coding_sjis_big5 (coding, source, destination,
3894 src_bytes, dst_bytes, 0);
4ed46869
KH
3895 break;
3896
3897 case coding_type_ccl:
d46c5b12
KH
3898 result = ccl_coding_driver (coding, source, destination,
3899 src_bytes, dst_bytes, 1);
3900 break;
3901
3902 default: /* i.e. case coding_type_no_conversion: */
3903 label_no_conversion:
3904 if (dst_bytes && src_bytes > dst_bytes)
3905 {
3906 coding->produced = dst_bytes;
3907 result = CODING_FINISH_INSUFFICIENT_DST;
3908 }
3909 else
3910 {
3911 coding->produced = src_bytes;
3912 result = CODING_FINISH_NORMAL;
3913 }
3914 if (dst_bytes)
3915 bcopy (source, destination, coding->produced);
3916 else
3917 safe_bcopy (source, destination, coding->produced);
3918 if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
3919 {
3920 unsigned char *p = destination, *pend = p + coding->produced;
3921 while (p < pend)
3922 if (*p++ == '\015') p[-1] = '\n';
3923 }
fb88bf2d 3924 coding->fake_multibyte = 1;
d46c5b12
KH
3925 coding->consumed
3926 = coding->consumed_char = coding->produced_char = coding->produced;
4ed46869
KH
3927 break;
3928 }
3929
d46c5b12 3930 return result;
4ed46869
KH
3931}
3932
fb88bf2d
KH
3933/* Scan text in the region between *BEG and *END (byte positions),
3934 skip characters which we don't have to decode by coding system
3935 CODING at the head and tail, then set *BEG and *END to the region
3936 of the text we actually have to convert. The caller should move
3937 the gap out of the region in advance.
4ed46869 3938
d46c5b12
KH
3939 If STR is not NULL, *BEG and *END are indices into STR. */
3940
3941static void
3942shrink_decoding_region (beg, end, coding, str)
3943 int *beg, *end;
3944 struct coding_system *coding;
3945 unsigned char *str;
3946{
fb88bf2d 3947 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
d46c5b12 3948 int eol_conversion;
88993dfd 3949 Lisp_Object translation_table;
d46c5b12
KH
3950
3951 if (coding->type == coding_type_ccl
3952 || coding->type == coding_type_undecided
3953 || !NILP (coding->post_read_conversion))
3954 {
3955 /* We can't skip any data. */
3956 return;
3957 }
3958 else if (coding->type == coding_type_no_conversion)
3959 {
fb88bf2d
KH
3960 /* We need no conversion, but don't have to skip any data here.
3961 Decoding routine handles them effectively anyway. */
d46c5b12
KH
3962 return;
3963 }
3964
88993dfd
KH
3965 translation_table = coding->translation_table_for_decode;
3966 if (NILP (translation_table) && !NILP (Venable_character_translation))
3967 translation_table = Vstandard_translation_table_for_decode;
3968 if (CHAR_TABLE_P (translation_table))
3969 {
3970 int i;
3971 for (i = 0; i < 128; i++)
3972 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
3973 break;
3974 if (i < 128)
3975 /* Some ASCII character should be tranlsated. We give up
3976 shrinking. */
3977 return;
3978 }
3979
aa60dea6
KH
3980 eol_conversion = (coding->eol_type != CODING_EOL_LF);
3981
3982 if ((! eol_conversion) && (coding->heading_ascii >= 0))
d46c5b12
KH
3983 /* Detection routine has already found how much we can skip at the
3984 head. */
3985 *beg += coding->heading_ascii;
3986
3987 if (str)
3988 {
3989 begp_orig = begp = str + *beg;
3990 endp_orig = endp = str + *end;
3991 }
3992 else
3993 {
fb88bf2d 3994 begp_orig = begp = BYTE_POS_ADDR (*beg);
d46c5b12
KH
3995 endp_orig = endp = begp + *end - *beg;
3996 }
3997
d46c5b12
KH
3998 switch (coding->type)
3999 {
4000 case coding_type_emacs_mule:
4001 case coding_type_raw_text:
4002 if (eol_conversion)
4003 {
4004 if (coding->heading_ascii < 0)
fb88bf2d 4005 while (begp < endp && *begp != '\r' && *begp < 0x80) begp++;
ee59c65f 4006 while (begp < endp && endp[-1] != '\r' && endp[-1] < 0x80)
fb88bf2d 4007 endp--;
ee59c65f
RS
4008 /* Do not consider LF as ascii if preceded by CR, since that
4009 confuses eol decoding. */
4010 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4011 endp++;
d46c5b12
KH
4012 }
4013 else
4014 begp = endp;
4015 break;
4016
4017 case coding_type_sjis:
4018 case coding_type_big5:
4019 /* We can skip all ASCII characters at the head. */
4020 if (coding->heading_ascii < 0)
4021 {
4022 if (eol_conversion)
de9d083c 4023 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
d46c5b12
KH
4024 else
4025 while (begp < endp && *begp < 0x80) begp++;
4026 }
4027 /* We can skip all ASCII characters at the tail except for the
4028 second byte of SJIS or BIG5 code. */
4029 if (eol_conversion)
de9d083c 4030 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
d46c5b12
KH
4031 else
4032 while (begp < endp && endp[-1] < 0x80) endp--;
ee59c65f
RS
4033 /* Do not consider LF as ascii if preceded by CR, since that
4034 confuses eol decoding. */
4035 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4036 endp++;
d46c5b12
KH
4037 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
4038 endp++;
4039 break;
4040
4041 default: /* i.e. case coding_type_iso2022: */
622fece5
KH
4042 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4043 /* We can't skip any data. */
4044 break;
d46c5b12
KH
4045 if (coding->heading_ascii < 0)
4046 {
d46c5b12
KH
4047 /* We can skip all ASCII characters at the head except for a
4048 few control codes. */
4049 while (begp < endp && (c = *begp) < 0x80
4050 && c != ISO_CODE_CR && c != ISO_CODE_SO
4051 && c != ISO_CODE_SI && c != ISO_CODE_ESC
4052 && (!eol_conversion || c != ISO_CODE_LF))
4053 begp++;
4054 }
4055 switch (coding->category_idx)
4056 {
4057 case CODING_CATEGORY_IDX_ISO_8_1:
4058 case CODING_CATEGORY_IDX_ISO_8_2:
4059 /* We can skip all ASCII characters at the tail. */
4060 if (eol_conversion)
de9d083c 4061 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
d46c5b12
KH
4062 else
4063 while (begp < endp && endp[-1] < 0x80) endp--;
ee59c65f
RS
4064 /* Do not consider LF as ascii if preceded by CR, since that
4065 confuses eol decoding. */
4066 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4067 endp++;
d46c5b12
KH
4068 break;
4069
4070 case CODING_CATEGORY_IDX_ISO_7:
4071 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
de79a6a5
KH
4072 {
4073 /* We can skip all charactes at the tail except for 8-bit
4074 codes and ESC and the following 2-byte at the tail. */
4075 unsigned char *eight_bit = NULL;
4076
4077 if (eol_conversion)
4078 while (begp < endp
4079 && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4080 {
4081 if (!eight_bit && c & 0x80) eight_bit = endp;
4082 endp--;
4083 }
4084 else
4085 while (begp < endp
4086 && (c = endp[-1]) != ISO_CODE_ESC)
4087 {
4088 if (!eight_bit && c & 0x80) eight_bit = endp;
4089 endp--;
4090 }
4091 /* Do not consider LF as ascii if preceded by CR, since that
4092 confuses eol decoding. */
4093 if (begp < endp && endp < endp_orig
4094 && endp[-1] == '\r' && endp[0] == '\n')
4095 endp++;
4096 if (begp < endp && endp[-1] == ISO_CODE_ESC)
4097 {
4098 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
4099 /* This is an ASCII designation sequence. We can
4100 surely skip the tail. But, if we have
4101 encountered an 8-bit code, skip only the codes
4102 after that. */
4103 endp = eight_bit ? eight_bit : endp + 2;
4104 else
4105 /* Hmmm, we can't skip the tail. */
4106 endp = endp_orig;
4107 }
4108 else if (eight_bit)
4109 endp = eight_bit;
4110 }
d46c5b12
KH
4111 }
4112 }
4113 *beg += begp - begp_orig;
4114 *end += endp - endp_orig;
4115 return;
4116}
4117
4118/* Like shrink_decoding_region but for encoding. */
4119
4120static void
4121shrink_encoding_region (beg, end, coding, str)
4122 int *beg, *end;
4123 struct coding_system *coding;
4124 unsigned char *str;
4125{
4126 unsigned char *begp_orig, *begp, *endp_orig, *endp;
4127 int eol_conversion;
88993dfd 4128 Lisp_Object translation_table;
d46c5b12
KH
4129
4130 if (coding->type == coding_type_ccl)
4131 /* We can't skip any data. */
4132 return;
4133 else if (coding->type == coding_type_no_conversion)
4134 {
4135 /* We need no conversion. */
4136 *beg = *end;
4137 return;
4138 }
4139
88993dfd
KH
4140 translation_table = coding->translation_table_for_encode;
4141 if (NILP (translation_table) && !NILP (Venable_character_translation))
4142 translation_table = Vstandard_translation_table_for_encode;
4143 if (CHAR_TABLE_P (translation_table))
4144 {
4145 int i;
4146 for (i = 0; i < 128; i++)
4147 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4148 break;
4149 if (i < 128)
4150 /* Some ASCII character should be tranlsated. We give up
4151 shrinking. */
4152 return;
4153 }
4154
d46c5b12
KH
4155 if (str)
4156 {
4157 begp_orig = begp = str + *beg;
4158 endp_orig = endp = str + *end;
4159 }
4160 else
4161 {
fb88bf2d 4162 begp_orig = begp = BYTE_POS_ADDR (*beg);
d46c5b12
KH
4163 endp_orig = endp = begp + *end - *beg;
4164 }
4165
4166 eol_conversion = (coding->eol_type == CODING_EOL_CR
4167 || coding->eol_type == CODING_EOL_CRLF);
4168
4169 /* Here, we don't have to check coding->pre_write_conversion because
4170 the caller is expected to have handled it already. */
4171 switch (coding->type)
4172 {
4173 case coding_type_undecided:
4174 case coding_type_emacs_mule:
4175 case coding_type_raw_text:
4176 if (eol_conversion)
4177 {
4178 while (begp < endp && *begp != '\n') begp++;
4179 while (begp < endp && endp[-1] != '\n') endp--;
4180 }
4181 else
4182 begp = endp;
4183 break;
4184
4185 case coding_type_iso2022:
622fece5
KH
4186 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4187 /* We can't skip any data. */
4188 break;
d46c5b12
KH
4189 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
4190 {
4191 unsigned char *bol = begp;
4192 while (begp < endp && *begp < 0x80)
4193 {
4194 begp++;
4195 if (begp[-1] == '\n')
4196 bol = begp;
4197 }
4198 begp = bol;
4199 goto label_skip_tail;
4200 }
4201 /* fall down ... */
4202
4203 default:
4204 /* We can skip all ASCII characters at the head and tail. */
4205 if (eol_conversion)
4206 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
4207 else
4208 while (begp < endp && *begp < 0x80) begp++;
4209 label_skip_tail:
4210 if (eol_conversion)
4211 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
4212 else
4213 while (begp < endp && *(endp - 1) < 0x80) endp--;
4214 break;
4215 }
4216
4217 *beg += begp - begp_orig;
4218 *end += endp - endp_orig;
4219 return;
4220}
4221
88993dfd
KH
/* Shrinking the conversion region has some overhead of its own, so
   it is attempted only when the region is longer than this many
   bytes.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the region *BEG ... *END (pointers to byte positions) with
   shrink_encoding_region if ENCODEP is nonzero, else with
   shrink_decoding_region, unless the region is not longer than the
   threshold above.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (*(end) - *(beg) <= shrink_conversion_region_threshhold)        \
      break;                                                            \
    if (encodep)                                                        \
      shrink_encoding_region (beg, end, coding, str);                   \
    else                                                                \
      shrink_decoding_region (beg, end, coding, str);                   \
  } while (0)
4235
b843d1ae
KH
4236static Lisp_Object
4237code_convert_region_unwind (dummy)
4238 Lisp_Object dummy;
4239{
4240 inhibit_pre_post_conversion = 0;
4241 return Qnil;
4242}
4243
d46c5b12 4244/* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
fb88bf2d
KH
4245 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4246 coding system CODING, and return the status code of code conversion
4247 (currently, this value has no meaning).
4248
4249 How many characters (and bytes) are converted to how many
4250 characters (and bytes) are recorded in members of the structure
4251 CODING.
d46c5b12 4252
6e44253b 4253 If REPLACE is nonzero, we do various things as if the original text
d46c5b12 4254 is deleted and a new text is inserted. See the comments in
6e44253b 4255 replace_range (insdel.c) to know what we are doing. */
4ed46869
KH
4256
4257int
6e44253b
KH
4258code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
4259 int from, from_byte, to, to_byte, encodep, replace;
4ed46869 4260 struct coding_system *coding;
4ed46869 4261{
fb88bf2d
KH
4262 int len = to - from, len_byte = to_byte - from_byte;
4263 int require, inserted, inserted_byte;
12410ef1 4264 int head_skip, tail_skip, total_skip;
84d60297 4265 Lisp_Object saved_coding_symbol;
fb88bf2d
KH
4266 int multibyte = !NILP (current_buffer->enable_multibyte_characters);
4267 int first = 1;
4268 int fake_multibyte = 0;
4269 unsigned char *src, *dst;
84d60297 4270 Lisp_Object deletion;
e133c8fa 4271 int orig_point = PT, orig_len = len;
6abb9bd9 4272 int prev_Z;
84d60297
RS
4273
4274 deletion = Qnil;
4275 saved_coding_symbol = Qnil;
d46c5b12 4276
83fa074f 4277 if (from < PT && PT < to)
e133c8fa
KH
4278 {
4279 TEMP_SET_PT_BOTH (from, from_byte);
4280 orig_point = from;
4281 }
83fa074f 4282
6e44253b 4283 if (replace)
d46c5b12 4284 {
fb88bf2d
KH
4285 int saved_from = from;
4286
d46c5b12 4287 prepare_to_modify_buffer (from, to, &from);
fb88bf2d
KH
4288 if (saved_from != from)
4289 {
4290 to = from + len;
4291 if (multibyte)
4292 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
4293 else
4294 from_byte = from, to_byte = to;
4295 len_byte = to_byte - from_byte;
4296 }
d46c5b12 4297 }
d46c5b12
KH
4298
4299 if (! encodep && CODING_REQUIRE_DETECTION (coding))
4300 {
12410ef1 4301 /* We must detect encoding of text and eol format. */
d46c5b12
KH
4302
4303 if (from < GPT && to > GPT)
4304 move_gap_both (from, from_byte);
4305 if (coding->type == coding_type_undecided)
4306 {
fb88bf2d 4307 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
d46c5b12 4308 if (coding->type == coding_type_undecided)
12410ef1
KH
4309 /* It seems that the text contains only ASCII, but we
4310 should not left it undecided because the deeper
4311 decoding routine (decode_coding) tries to detect the
4312 encodings again in vain. */
d46c5b12
KH
4313 coding->type = coding_type_emacs_mule;
4314 }
4315 if (coding->eol_type == CODING_EOL_UNDECIDED)
4316 {
4317 saved_coding_symbol = coding->symbol;
4318 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
4319 if (coding->eol_type == CODING_EOL_UNDECIDED)
4320 coding->eol_type = CODING_EOL_LF;
4321 /* We had better recover the original eol format if we
4322 encounter an inconsitent eol format while decoding. */
4323 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4324 }
4325 }
4326
fb88bf2d
KH
4327 coding->consumed_char = len, coding->consumed = len_byte;
4328
d46c5b12
KH
4329 if (encodep
4330 ? ! CODING_REQUIRE_ENCODING (coding)
4331 : ! CODING_REQUIRE_DECODING (coding))
fb88bf2d
KH
4332 {
4333 coding->produced = len_byte;
12410ef1
KH
4334 if (multibyte
4335 && ! replace
4336 /* See the comment of the member heading_ascii in coding.h. */
4337 && coding->heading_ascii < len_byte)
fb88bf2d 4338 {
6e44253b
KH
4339 /* We still may have to combine byte at the head and the
4340 tail of the text in the region. */
12410ef1 4341 if (from < GPT && GPT < to)
6e44253b 4342 move_gap_both (to, to_byte);
12410ef1
KH
4343 len = multibyte_chars_in_text (BYTE_POS_ADDR (from_byte), len_byte);
4344 adjust_after_insert (from, from_byte, to, to_byte, len);
4345 coding->produced_char = len;
fb88bf2d
KH
4346 }
4347 else
68e3a8f1
AS
4348 {
4349 if (!replace)
4350 adjust_after_insert (from, from_byte, to, to_byte, len_byte);
4351 coding->produced_char = len_byte;
4352 }
fb88bf2d
KH
4353 return 0;
4354 }
d46c5b12
KH
4355
4356 /* Now we convert the text. */
4357
4358 /* For encoding, we must process pre-write-conversion in advance. */
4359 if (encodep
d46c5b12
KH
4360 && ! NILP (coding->pre_write_conversion)
4361 && SYMBOLP (coding->pre_write_conversion)
4362 && ! NILP (Ffboundp (coding->pre_write_conversion)))
4363 {
2b4f9037
KH
4364 /* The function in pre-write-conversion may put a new text in a
4365 new buffer. */
0007bdd0
KH
4366 struct buffer *prev = current_buffer;
4367 Lisp_Object new;
b843d1ae 4368 int count = specpdl_ptr - specpdl;
d46c5b12 4369
b843d1ae
KH
4370 record_unwind_protect (code_convert_region_unwind, Qnil);
4371 /* We should not call any more pre-write/post-read-conversion
4372 functions while this pre-write-conversion is running. */
4373 inhibit_pre_post_conversion = 1;
b39f748c
AS
4374 call2 (coding->pre_write_conversion,
4375 make_number (from), make_number (to));
b843d1ae
KH
4376 inhibit_pre_post_conversion = 0;
4377 /* Discard the unwind protect. */
4378 specpdl_ptr--;
4379
d46c5b12
KH
4380 if (current_buffer != prev)
4381 {
4382 len = ZV - BEGV;
0007bdd0 4383 new = Fcurrent_buffer ();
d46c5b12 4384 set_buffer_internal_1 (prev);
ddbc19ff 4385 del_range_2 (from, from_byte, to, to_byte);
e133c8fa 4386 TEMP_SET_PT_BOTH (from, from_byte);
0007bdd0
KH
4387 insert_from_buffer (XBUFFER (new), 1, len, 0);
4388 Fkill_buffer (new);
e133c8fa
KH
4389 if (orig_point >= to)
4390 orig_point += len - orig_len;
4391 else if (orig_point > from)
4392 orig_point = from;
4393 orig_len = len;
d46c5b12 4394 to = from + len;
e133c8fa 4395 from_byte = multibyte ? CHAR_TO_BYTE (from) : from_byte;
fb88bf2d 4396 to_byte = multibyte ? CHAR_TO_BYTE (to) : to;
d46c5b12 4397 len_byte = to_byte - from_byte;
e133c8fa 4398 TEMP_SET_PT_BOTH (from, from_byte);
d46c5b12
KH
4399 }
4400 }
4401
12410ef1
KH
4402 if (replace)
4403 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
4404
d46c5b12 4405 /* Try to skip the heading and tailing ASCIIs. */
12410ef1
KH
4406 {
4407 int from_byte_orig = from_byte, to_byte_orig = to_byte;
4408
4409 if (from < GPT && GPT < to)
4410 move_gap_both (from, from_byte);
88993dfd 4411 SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
d4e57bcd 4412 if (from_byte == to_byte
944bd420 4413 && coding->type != coding_type_ccl
d4e57bcd
KH
4414 && ! (coding->mode & CODING_MODE_LAST_BLOCK
4415 && CODING_REQUIRE_FLUSHING (coding)))
12410ef1
KH
4416 {
4417 coding->produced = len_byte;
4418 coding->produced_char = multibyte ? len : len_byte;
4419 if (!replace)
4420 /* We must record and adjust for this new text now. */
4421 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
4422 return 0;
4423 }
fb88bf2d 4424
12410ef1
KH
4425 head_skip = from_byte - from_byte_orig;
4426 tail_skip = to_byte_orig - to_byte;
4427 total_skip = head_skip + tail_skip;
4428 from += head_skip;
4429 to -= tail_skip;
4430 len -= total_skip; len_byte -= total_skip;
4431 }
d46c5b12 4432
88993dfd 4433 /* The code conversion routine can not preserve text properties for
55d8d769
KH
4434 now. So, we must remove all text properties in the region.
4435 Here, we must suppress all modification hooks. */
88993dfd 4436 if (replace)
55d8d769
KH
4437 {
4438 int saved_inhibit_modification_hooks = inhibit_modification_hooks;
4439 inhibit_modification_hooks = 1;
4440 Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
4441 inhibit_modification_hooks = saved_inhibit_modification_hooks;
4442 }
88993dfd 4443
fb88bf2d
KH
4444 /* For converion, we must put the gap before the text in addition to
4445 making the gap larger for efficient decoding. The required gap
4446 size starts from 2000 which is the magic number used in make_gap.
4447 But, after one batch of conversion, it will be incremented if we
4448 find that it is not enough . */
d46c5b12
KH
4449 require = 2000;
4450
4451 if (GAP_SIZE < require)
4452 make_gap (require - GAP_SIZE);
4453 move_gap_both (from, from_byte);
4454
d46c5b12 4455 inserted = inserted_byte = 0;
fb88bf2d
KH
4456 src = GAP_END_ADDR, dst = GPT_ADDR;
4457
4458 GAP_SIZE += len_byte;
4459 ZV -= len;
4460 Z -= len;
4461 ZV_BYTE -= len_byte;
4462 Z_BYTE -= len_byte;
4463
d9f9a1bc
GM
4464 if (GPT - BEG < BEG_UNCHANGED)
4465 BEG_UNCHANGED = GPT - BEG;
4466 if (Z - GPT < END_UNCHANGED)
4467 END_UNCHANGED = Z - GPT;
f2558efd 4468
d46c5b12
KH
4469 for (;;)
4470 {
fb88bf2d 4471 int result;
d46c5b12
KH
4472
4473 /* The buffer memory is changed from:
fb88bf2d
KH
4474 +--------+converted-text+---------+-------original-text------+---+
4475 |<-from->|<--inserted-->|---------|<-----------len---------->|---|
4476 |<------------------- GAP_SIZE -------------------->| */
d46c5b12 4477 if (encodep)
fb88bf2d 4478 result = encode_coding (coding, src, dst, len_byte, 0);
d46c5b12 4479 else
fb88bf2d 4480 result = decode_coding (coding, src, dst, len_byte, 0);
d46c5b12
KH
4481 /* to:
4482 +--------+-------converted-text--------+--+---original-text--+---+
fb88bf2d
KH
4483 |<-from->|<--inserted-->|<--produced-->|--|<-(len-consumed)->|---|
4484 |<------------------- GAP_SIZE -------------------->| */
4485 if (coding->fake_multibyte)
4486 fake_multibyte = 1;
d46c5b12 4487
fb88bf2d
KH
4488 if (!encodep && !multibyte)
4489 coding->produced_char = coding->produced;
d46c5b12
KH
4490 inserted += coding->produced_char;
4491 inserted_byte += coding->produced;
d46c5b12 4492 len_byte -= coding->consumed;
fb88bf2d
KH
4493 src += coding->consumed;
4494 dst += inserted_byte;
d46c5b12 4495
9864ebce
KH
4496 if (result == CODING_FINISH_NORMAL)
4497 {
4498 src += len_byte;
4499 break;
4500 }
d46c5b12
KH
4501 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4502 {
fb88bf2d 4503 unsigned char *pend = dst, *p = pend - inserted_byte;
38edf7d4 4504 Lisp_Object eol_type;
d46c5b12
KH
4505
4506 /* Encode LFs back to the original eol format (CR or CRLF). */
4507 if (coding->eol_type == CODING_EOL_CR)
4508 {
4509 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
4510 }
4511 else
4512 {
d46c5b12
KH
4513 int count = 0;
4514
fb88bf2d
KH
4515 while (p < pend) if (*p++ == '\n') count++;
4516 if (src - dst < count)
d46c5b12 4517 {
38edf7d4 4518 /* We don't have sufficient room for encoding LFs
fb88bf2d
KH
4519 back to CRLF. We must record converted and
4520 not-yet-converted text back to the buffer
4521 content, enlarge the gap, then record them out of
4522 the buffer contents again. */
4523 int add = len_byte + inserted_byte;
4524
4525 GAP_SIZE -= add;
4526 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4527 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4528 make_gap (count - GAP_SIZE);
4529 GAP_SIZE += add;
4530 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4531 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4532 /* Don't forget to update SRC, DST, and PEND. */
4533 src = GAP_END_ADDR - len_byte;
4534 dst = GPT_ADDR + inserted_byte;
4535 pend = dst;
d46c5b12 4536 }
d46c5b12
KH
4537 inserted += count;
4538 inserted_byte += count;
fb88bf2d
KH
4539 coding->produced += count;
4540 p = dst = pend + count;
4541 while (count)
4542 {
4543 *--p = *--pend;
4544 if (*p == '\n') count--, *--p = '\r';
4545 }
d46c5b12
KH
4546 }
4547
4548 /* Suppress eol-format conversion in the further conversion. */
4549 coding->eol_type = CODING_EOL_LF;
4550
38edf7d4
KH
4551 /* Set the coding system symbol to that for Unix-like EOL. */
4552 eol_type = Fget (saved_coding_symbol, Qeol_type);
4553 if (VECTORP (eol_type)
4554 && XVECTOR (eol_type)->size == 3
4555 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
4556 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
4557 else
4558 coding->symbol = saved_coding_symbol;
fb88bf2d
KH
4559
4560 continue;
d46c5b12
KH
4561 }
4562 if (len_byte <= 0)
944bd420
KH
4563 {
4564 if (coding->type != coding_type_ccl
4565 || coding->mode & CODING_MODE_LAST_BLOCK)
4566 break;
4567 coding->mode |= CODING_MODE_LAST_BLOCK;
4568 continue;
4569 }
d46c5b12
KH
4570 if (result == CODING_FINISH_INSUFFICIENT_SRC)
4571 {
4572 /* The source text ends in invalid codes. Let's just
4573 make them valid buffer contents, and finish conversion. */
fb88bf2d 4574 inserted += len_byte;
d46c5b12 4575 inserted_byte += len_byte;
fb88bf2d 4576 while (len_byte--)
ee59c65f 4577 *dst++ = *src++;
fb88bf2d 4578 fake_multibyte = 1;
d46c5b12
KH
4579 break;
4580 }
9864ebce
KH
4581 if (result == CODING_FINISH_INTERRUPT)
4582 {
4583 /* The conversion procedure was interrupted by a user. */
4584 fake_multibyte = 1;
4585 break;
4586 }
4587 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
4588 if (coding->consumed < 1)
4589 {
4590 /* It's quite strange to require more memory without
4591 consuming any bytes. Perhaps CCL program bug. */
4592 fake_multibyte = 1;
4593 break;
4594 }
fb88bf2d
KH
4595 if (first)
4596 {
4597 /* We have just done the first batch of conversion which was
4598 stoped because of insufficient gap. Let's reconsider the
4599 required gap size (i.e. SRT - DST) now.
4600
4601 We have converted ORIG bytes (== coding->consumed) into
4602 NEW bytes (coding->produced). To convert the remaining
4603 LEN bytes, we may need REQUIRE bytes of gap, where:
4604 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
4605 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
4606 Here, we are sure that NEW >= ORIG. */
6e44253b
KH
4607 float ratio = coding->produced - coding->consumed;
4608 ratio /= coding->consumed;
4609 require = len_byte * ratio;
fb88bf2d
KH
4610 first = 0;
4611 }
4612 if ((src - dst) < (require + 2000))
4613 {
4614 /* See the comment above the previous call of make_gap. */
4615 int add = len_byte + inserted_byte;
4616
4617 GAP_SIZE -= add;
4618 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4619 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4620 make_gap (require + 2000);
4621 GAP_SIZE += add;
4622 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4623 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4624 /* Don't forget to update SRC, DST. */
4625 src = GAP_END_ADDR - len_byte;
4626 dst = GPT_ADDR + inserted_byte;
4627 }
d46c5b12 4628 }
fb88bf2d
KH
4629 if (src - dst > 0) *dst = 0; /* Put an anchor. */
4630
2b4f9037 4631 if (multibyte
88993dfd
KH
4632 && (encodep
4633 || fake_multibyte
4634 || (to - from) != (to_byte - from_byte)))
2b4f9037 4635 inserted = multibyte_chars_in_text (GPT_ADDR, inserted_byte);
7553d0e1 4636
12410ef1
KH
4637 /* If we have shrinked the conversion area, adjust it now. */
4638 if (total_skip > 0)
4639 {
4640 if (tail_skip > 0)
4641 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
4642 inserted += total_skip; inserted_byte += total_skip;
4643 GAP_SIZE += total_skip;
4644 GPT -= head_skip; GPT_BYTE -= head_skip;
4645 ZV -= total_skip; ZV_BYTE -= total_skip;
4646 Z -= total_skip; Z_BYTE -= total_skip;
4647 from -= head_skip; from_byte -= head_skip;
4648 to += tail_skip; to_byte += tail_skip;
4649 }
4650
6abb9bd9 4651 prev_Z = Z;
12410ef1 4652 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
6abb9bd9 4653 inserted = Z - prev_Z;
4ed46869 4654
2b4f9037 4655 if (! encodep && ! NILP (coding->post_read_conversion))
d46c5b12 4656 {
2b4f9037 4657 Lisp_Object val;
b843d1ae 4658 int count = specpdl_ptr - specpdl;
4ed46869 4659
e133c8fa
KH
4660 if (from != PT)
4661 TEMP_SET_PT_BOTH (from, from_byte);
6abb9bd9 4662 prev_Z = Z;
b843d1ae
KH
4663 record_unwind_protect (code_convert_region_unwind, Qnil);
4664 /* We should not call any more pre-write/post-read-conversion
4665 functions while this post-read-conversion is running. */
4666 inhibit_pre_post_conversion = 1;
2b4f9037 4667 val = call1 (coding->post_read_conversion, make_number (inserted));
b843d1ae
KH
4668 inhibit_pre_post_conversion = 0;
4669 /* Discard the unwind protect. */
4670 specpdl_ptr--;
6abb9bd9 4671 CHECK_NUMBER (val, 0);
944bd420 4672 inserted += Z - prev_Z;
e133c8fa
KH
4673 }
4674
4675 if (orig_point >= from)
4676 {
4677 if (orig_point >= from + orig_len)
4678 orig_point += inserted - orig_len;
4679 else
4680 orig_point = from;
4681 TEMP_SET_PT (orig_point);
d46c5b12 4682 }
4ed46869 4683
2b4f9037
KH
4684 signal_after_change (from, to - from, inserted);
4685
fb88bf2d 4686 {
12410ef1
KH
4687 coding->consumed = to_byte - from_byte;
4688 coding->consumed_char = to - from;
4689 coding->produced = inserted_byte;
4690 coding->produced_char = inserted;
fb88bf2d 4691 }
7553d0e1 4692
fb88bf2d 4693 return 0;
d46c5b12
KH
4694}
4695
4696Lisp_Object
4697code_convert_string (str, coding, encodep, nocopy)
4698 Lisp_Object str;
4ed46869 4699 struct coding_system *coding;
d46c5b12 4700 int encodep, nocopy;
4ed46869 4701{
d46c5b12
KH
4702 int len;
4703 char *buf;
fc932ac6
RS
4704 int from = 0, to = XSTRING (str)->size;
4705 int to_byte = STRING_BYTES (XSTRING (str));
d46c5b12 4706 struct gcpro gcpro1;
84d60297 4707 Lisp_Object saved_coding_symbol;
d46c5b12 4708 int result;
4ed46869 4709
84d60297 4710 saved_coding_symbol = Qnil;
b843d1ae
KH
4711 if ((encodep && !NILP (coding->pre_write_conversion)
4712 || !encodep && !NILP (coding->post_read_conversion)))
d46c5b12
KH
4713 {
4714 /* Since we have to call Lisp functions which assume target text
b843d1ae
KH
4715 is in a buffer, after setting a temporary buffer, call
4716 code_convert_region. */
d46c5b12
KH
4717 int count = specpdl_ptr - specpdl;
4718 struct buffer *prev = current_buffer;
b843d1ae 4719 int multibyte = STRING_MULTIBYTE (str);
e133c8fa 4720
d46c5b12 4721 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
b843d1ae
KH
4722 record_unwind_protect (code_convert_region_unwind, Qnil);
4723 inhibit_pre_post_conversion = 1;
4724 GCPRO1 (str);
d46c5b12
KH
4725 temp_output_buffer_setup (" *code-converting-work*");
4726 set_buffer_internal (XBUFFER (Vstandard_output));
b843d1ae
KH
4727 /* We must insert the contents of STR as is without
4728 unibyte<->multibyte conversion. For that, we adjust the
4729 multibyteness of the working buffer to that of STR. */
4730 Ferase_buffer (); /* for safety */
4731 current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
4732 insert_from_string (str, 0, 0, to, to_byte, 0);
4733 UNGCPRO;
fb88bf2d 4734 code_convert_region (BEGV, BEGV_BYTE, ZV, ZV_BYTE, coding, encodep, 1);
b843d1ae
KH
4735 /* Make a unibyte string if we are encoding, otherwise make a
4736 multibyte string. */
4737 Fset_buffer_multibyte (encodep ? Qnil : Qt);
d46c5b12 4738 str = make_buffer_string (BEGV, ZV, 0);
d46c5b12
KH
4739 return unbind_to (count, str);
4740 }
4ed46869 4741
d46c5b12
KH
4742 if (! encodep && CODING_REQUIRE_DETECTION (coding))
4743 {
4744 /* See the comments in code_convert_region. */
4745 if (coding->type == coding_type_undecided)
4746 {
4747 detect_coding (coding, XSTRING (str)->data, to_byte);
4748 if (coding->type == coding_type_undecided)
4749 coding->type = coding_type_emacs_mule;
4750 }
4751 if (coding->eol_type == CODING_EOL_UNDECIDED)
4752 {
4753 saved_coding_symbol = coding->symbol;
4754 detect_eol (coding, XSTRING (str)->data, to_byte);
4755 if (coding->eol_type == CODING_EOL_UNDECIDED)
4756 coding->eol_type = CODING_EOL_LF;
4757 /* We had better recover the original eol format if we
4758 encounter an inconsitent eol format while decoding. */
4759 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4760 }
4761 }
4ed46869 4762
d46c5b12
KH
4763 if (encodep
4764 ? ! CODING_REQUIRE_ENCODING (coding)
4765 : ! CODING_REQUIRE_DECODING (coding))
4766 from = to_byte;
4767 else
4768 {
4769 /* Try to skip the heading and tailing ASCIIs. */
88993dfd
KH
4770 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
4771 encodep);
d46c5b12 4772 }
e133c8fa
KH
4773 if (from == to_byte
4774 && coding->type != coding_type_ccl)
d46c5b12 4775 return (nocopy ? str : Fcopy_sequence (str));
4ed46869 4776
d46c5b12
KH
4777 if (encodep)
4778 len = encoding_buffer_size (coding, to_byte - from);
4779 else
4780 len = decoding_buffer_size (coding, to_byte - from);
fc932ac6 4781 len += from + STRING_BYTES (XSTRING (str)) - to_byte;
d46c5b12
KH
4782 GCPRO1 (str);
4783 buf = get_conversion_buffer (len);
4784 UNGCPRO;
4ed46869 4785
d46c5b12
KH
4786 if (from > 0)
4787 bcopy (XSTRING (str)->data, buf, from);
4788 result = (encodep
4789 ? encode_coding (coding, XSTRING (str)->data + from,
4790 buf + from, to_byte - from, len)
4791 : decode_coding (coding, XSTRING (str)->data + from,
f30cc612 4792 buf + from, to_byte - from, len));
d46c5b12 4793 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4ed46869 4794 {
d46c5b12
KH
4795 /* We simple try to decode the whole string again but without
4796 eol-conversion this time. */
4797 coding->eol_type = CODING_EOL_LF;
4798 coding->symbol = saved_coding_symbol;
4799 return code_convert_string (str, coding, encodep, nocopy);
4ed46869 4800 }
d46c5b12
KH
4801
4802 bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
fc932ac6 4803 STRING_BYTES (XSTRING (str)) - to_byte);
d46c5b12 4804
fc932ac6 4805 len = from + STRING_BYTES (XSTRING (str)) - to_byte;
d46c5b12
KH
4806 if (encodep)
4807 str = make_unibyte_string (buf, len + coding->produced);
4808 else
826bfb8b
KH
4809 {
4810 int chars= (coding->fake_multibyte
4811 ? multibyte_chars_in_text (buf + from, coding->produced)
4812 : coding->produced_char);
4813 str = make_multibyte_string (buf, len + chars, len + coding->produced);
4814 }
4815
d46c5b12 4816 return str;
4ed46869
KH
4817}
4818
4819\f
4820#ifdef emacs
1397dc18 4821/*** 8. Emacs Lisp library functions ***/
4ed46869 4822
4ed46869
KH
4823DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
4824 "Return t if OBJECT is nil or a coding-system.\n\
3a73fa5d
RS
4825See the documentation of `make-coding-system' for information\n\
4826about coding-system objects.")
4ed46869
KH
4827 (obj)
4828 Lisp_Object obj;
4829{
4608c386
KH
4830 if (NILP (obj))
4831 return Qt;
4832 if (!SYMBOLP (obj))
4833 return Qnil;
4834 /* Get coding-spec vector for OBJ. */
4835 obj = Fget (obj, Qcoding_system);
4836 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
4837 ? Qt : Qnil);
4ed46869
KH
4838}
4839
9d991de8
RS
4840DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
4841 Sread_non_nil_coding_system, 1, 1, 0,
e0e989f6 4842 "Read a coding system from the minibuffer, prompting with string PROMPT.")
4ed46869
KH
4843 (prompt)
4844 Lisp_Object prompt;
4845{
e0e989f6 4846 Lisp_Object val;
9d991de8
RS
4847 do
4848 {
4608c386
KH
4849 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
4850 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
9d991de8
RS
4851 }
4852 while (XSTRING (val)->size == 0);
e0e989f6 4853 return (Fintern (val, Qnil));
4ed46869
KH
4854}
4855
9b787f3e
RS
4856DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
4857 "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
4858If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
4859 (prompt, default_coding_system)
4860 Lisp_Object prompt, default_coding_system;
4ed46869 4861{
f44d27ce 4862 Lisp_Object val;
9b787f3e
RS
4863 if (SYMBOLP (default_coding_system))
4864 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
4608c386 4865 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
9b787f3e
RS
4866 Qt, Qnil, Qcoding_system_history,
4867 default_coding_system, Qnil);
e0e989f6 4868 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
4869}
4870
4871DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
4872 1, 1, 0,
4873 "Check validity of CODING-SYSTEM.\n\
3a73fa5d
RS
4874If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
4875It is valid if it is a symbol with a non-nil `coding-system' property.\n\
4ed46869
KH
4876The value of property should be a vector of length 5.")
4877 (coding_system)
4878 Lisp_Object coding_system;
4879{
4880 CHECK_SYMBOL (coding_system, 0);
4881 if (!NILP (Fcoding_system_p (coding_system)))
4882 return coding_system;
4883 while (1)
02ba4723 4884 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4ed46869 4885}
3a73fa5d 4886\f
d46c5b12
KH
4887Lisp_Object
4888detect_coding_system (src, src_bytes, highest)
4889 unsigned char *src;
4890 int src_bytes, highest;
4ed46869
KH
4891{
4892 int coding_mask, eol_type;
d46c5b12
KH
4893 Lisp_Object val, tmp;
4894 int dummy;
4ed46869 4895
d46c5b12
KH
4896 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy);
4897 eol_type = detect_eol_type (src, src_bytes, &dummy);
4898 if (eol_type == CODING_EOL_INCONSISTENT)
25b02698 4899 eol_type = CODING_EOL_UNDECIDED;
4ed46869 4900
d46c5b12 4901 if (!coding_mask)
4ed46869 4902 {
27901516 4903 val = Qundecided;
d46c5b12 4904 if (eol_type != CODING_EOL_UNDECIDED)
4ed46869 4905 {
f44d27ce
RS
4906 Lisp_Object val2;
4907 val2 = Fget (Qundecided, Qeol_type);
4ed46869
KH
4908 if (VECTORP (val2))
4909 val = XVECTOR (val2)->contents[eol_type];
4910 }
80e803b4 4911 return (highest ? val : Fcons (val, Qnil));
4ed46869 4912 }
4ed46869 4913
d46c5b12
KH
4914 /* At first, gather possible coding systems in VAL. */
4915 val = Qnil;
03699b14 4916 for (tmp = Vcoding_category_list; !NILP (tmp); tmp = XCDR (tmp))
4ed46869 4917 {
d46c5b12 4918 int idx
03699b14 4919 = XFASTINT (Fget (XCAR (tmp), Qcoding_category_index));
d46c5b12 4920 if (coding_mask & (1 << idx))
4ed46869 4921 {
03699b14 4922 val = Fcons (Fsymbol_value (XCAR (tmp)), val);
d46c5b12
KH
4923 if (highest)
4924 break;
4ed46869
KH
4925 }
4926 }
d46c5b12
KH
4927 if (!highest)
4928 val = Fnreverse (val);
4ed46869 4929
65059037 4930 /* Then, replace the elements with subsidiary coding systems. */
03699b14 4931 for (tmp = val; !NILP (tmp); tmp = XCDR (tmp))
4ed46869 4932 {
65059037
RS
4933 if (eol_type != CODING_EOL_UNDECIDED
4934 && eol_type != CODING_EOL_INCONSISTENT)
4ed46869 4935 {
d46c5b12 4936 Lisp_Object eol;
03699b14 4937 eol = Fget (XCAR (tmp), Qeol_type);
d46c5b12 4938 if (VECTORP (eol))
03699b14 4939 XCAR (tmp) = XVECTOR (eol)->contents[eol_type];
4ed46869
KH
4940 }
4941 }
03699b14 4942 return (highest ? XCAR (val) : val);
d46c5b12 4943}
4ed46869 4944
d46c5b12
KH
4945DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
4946 2, 3, 0,
4947 "Detect coding system of the text in the region between START and END.\n\
4948Return a list of possible coding systems ordered by priority.\n\
4949\n\
80e803b4
KH
4950If only ASCII characters are found, it returns a list of single element\n\
4951`undecided' or its subsidiary coding system according to a detected\n\
4952end-of-line format.\n\
d46c5b12
KH
4953\n\
4954If optional argument HIGHEST is non-nil, return the coding system of\n\
4955highest priority.")
4956 (start, end, highest)
4957 Lisp_Object start, end, highest;
4958{
4959 int from, to;
4960 int from_byte, to_byte;
6289dd10 4961
d46c5b12
KH
4962 CHECK_NUMBER_COERCE_MARKER (start, 0);
4963 CHECK_NUMBER_COERCE_MARKER (end, 1);
4ed46869 4964
d46c5b12
KH
4965 validate_region (&start, &end);
4966 from = XINT (start), to = XINT (end);
4967 from_byte = CHAR_TO_BYTE (from);
4968 to_byte = CHAR_TO_BYTE (to);
6289dd10 4969
d46c5b12
KH
4970 if (from < GPT && to >= GPT)
4971 move_gap_both (to, to_byte);
4ed46869 4972
d46c5b12
KH
4973 return detect_coding_system (BYTE_POS_ADDR (from_byte),
4974 to_byte - from_byte,
4975 !NILP (highest));
4976}
6289dd10 4977
d46c5b12
KH
4978DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
4979 1, 2, 0,
4980 "Detect coding system of the text in STRING.\n\
4981Return a list of possible coding systems ordered by priority.\n\
4982\n\
80e803b4
KH
4983If only ASCII characters are found, it returns a list of single element\n\
4984`undecided' or its subsidiary coding system according to a detected\n\
4985end-of-line format.\n\
d46c5b12
KH
4986\n\
4987If optional argument HIGHEST is non-nil, return the coding system of\n\
4988highest priority.")
4989 (string, highest)
4990 Lisp_Object string, highest;
4991{
4992 CHECK_STRING (string, 0);
4ed46869 4993
d46c5b12 4994 return detect_coding_system (XSTRING (string)->data,
fc932ac6 4995 STRING_BYTES (XSTRING (string)),
d46c5b12 4996 !NILP (highest));
4ed46869
KH
4997}
4998
4031e2bf
KH
4999Lisp_Object
5000code_convert_region1 (start, end, coding_system, encodep)
d46c5b12 5001 Lisp_Object start, end, coding_system;
4031e2bf 5002 int encodep;
3a73fa5d
RS
5003{
5004 struct coding_system coding;
4031e2bf 5005 int from, to, len;
3a73fa5d 5006
d46c5b12
KH
5007 CHECK_NUMBER_COERCE_MARKER (start, 0);
5008 CHECK_NUMBER_COERCE_MARKER (end, 1);
3a73fa5d
RS
5009 CHECK_SYMBOL (coding_system, 2);
5010
d46c5b12
KH
5011 validate_region (&start, &end);
5012 from = XFASTINT (start);
5013 to = XFASTINT (end);
5014
3a73fa5d 5015 if (NILP (coding_system))
d46c5b12
KH
5016 return make_number (to - from);
5017
3a73fa5d 5018 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
d46c5b12 5019 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
3a73fa5d 5020
d46c5b12 5021 coding.mode |= CODING_MODE_LAST_BLOCK;
fb88bf2d
KH
5022 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
5023 &coding, encodep, 1);
f072a3e8 5024 Vlast_coding_system_used = coding.symbol;
fb88bf2d 5025 return make_number (coding.produced_char);
4031e2bf
KH
5026}
5027
5028DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
5029 3, 3, "r\nzCoding system: ",
5030 "Decode the current region by specified coding system.\n\
5031When called from a program, takes three arguments:\n\
5032START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
f072a3e8
RS
5033This function sets `last-coding-system-used' to the precise coding system\n\
5034used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5035not fully specified.)\n\
5036It returns the length of the decoded text.")
4031e2bf
KH
5037 (start, end, coding_system)
5038 Lisp_Object start, end, coding_system;
5039{
5040 return code_convert_region1 (start, end, coding_system, 0);
3a73fa5d
RS
5041}
5042
5043DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
5044 3, 3, "r\nzCoding system: ",
d46c5b12 5045 "Encode the current region by specified coding system.\n\
3a73fa5d 5046When called from a program, takes three arguments:\n\
d46c5b12 5047START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
f072a3e8
RS
5048This function sets `last-coding-system-used' to the precise coding system\n\
5049used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5050not fully specified.)\n\
5051It returns the length of the encoded text.")
d46c5b12
KH
5052 (start, end, coding_system)
5053 Lisp_Object start, end, coding_system;
3a73fa5d 5054{
4031e2bf
KH
5055 return code_convert_region1 (start, end, coding_system, 1);
5056}
3a73fa5d 5057
4031e2bf
KH
5058Lisp_Object
5059code_convert_string1 (string, coding_system, nocopy, encodep)
5060 Lisp_Object string, coding_system, nocopy;
5061 int encodep;
5062{
5063 struct coding_system coding;
3a73fa5d 5064
4031e2bf
KH
5065 CHECK_STRING (string, 0);
5066 CHECK_SYMBOL (coding_system, 1);
4ed46869 5067
d46c5b12 5068 if (NILP (coding_system))
4031e2bf 5069 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869 5070
d46c5b12
KH
5071 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5072 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5f1cd180 5073
d46c5b12 5074 coding.mode |= CODING_MODE_LAST_BLOCK;
f072a3e8 5075 Vlast_coding_system_used = coding.symbol;
4031e2bf 5076 return code_convert_string (string, &coding, encodep, !NILP (nocopy));
4ed46869
KH
5077}
5078
4ed46869 5079DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
e0e989f6
KH
5080 2, 3, 0,
5081 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
fe487a71 5082Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
f072a3e8
RS
5083if the decoding operation is trivial.\n\
5084This function sets `last-coding-system-used' to the precise coding system\n\
5085used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5086not fully specified.)")
e0e989f6
KH
5087 (string, coding_system, nocopy)
5088 Lisp_Object string, coding_system, nocopy;
4ed46869 5089{
f072a3e8 5090 return code_convert_string1 (string, coding_system, nocopy, 0);
4ed46869
KH
5091}
5092
5093DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
e0e989f6
KH
5094 2, 3, 0,
5095 "Encode STRING to CODING-SYSTEM, and return the result.\n\
fe487a71 5096Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
f072a3e8
RS
5097if the encoding operation is trivial.\n\
5098This function sets `last-coding-system-used' to the precise coding system\n\
5099used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5100not fully specified.)")
e0e989f6
KH
5101 (string, coding_system, nocopy)
5102 Lisp_Object string, coding_system, nocopy;
4ed46869 5103{
f072a3e8 5104 return code_convert_string1 (string, coding_system, nocopy, 1);
4ed46869 5105}
4031e2bf 5106
ecec61c1
KH
5107/* Encode or decode STRING according to CODING_SYSTEM.
5108 Do not set Vlast_coding_system_used. */
5109
5110Lisp_Object
5111code_convert_string_norecord (string, coding_system, encodep)
5112 Lisp_Object string, coding_system;
5113 int encodep;
5114{
5115 struct coding_system coding;
5116
5117 CHECK_STRING (string, 0);
5118 CHECK_SYMBOL (coding_system, 1);
5119
5120 if (NILP (coding_system))
5121 return string;
5122
5123 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5124 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5125
5126 coding.mode |= CODING_MODE_LAST_BLOCK;
5127 return code_convert_string (string, &coding, encodep, Qt);
5128}
3a73fa5d 5129\f
4ed46869 5130DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
55ab7be3 5131 "Decode a Japanese character which has CODE in shift_jis encoding.\n\
4ed46869
KH
5132Return the corresponding character.")
5133 (code)
5134 Lisp_Object code;
5135{
5136 unsigned char c1, c2, s1, s2;
5137 Lisp_Object val;
5138
5139 CHECK_NUMBER (code, 0);
5140 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
55ab7be3
KH
5141 if (s1 == 0)
5142 {
c28a9453
KH
5143 if (s2 < 0x80)
5144 XSETFASTINT (val, s2);
5145 else if (s2 >= 0xA0 || s2 <= 0xDF)
5146 XSETFASTINT (val,
5147 MAKE_NON_ASCII_CHAR (charset_katakana_jisx0201, s2, 0));
5148 else
9da8350f 5149 error ("Invalid Shift JIS code: %x", XFASTINT (code));
55ab7be3
KH
5150 }
5151 else
5152 {
5153 if ((s1 < 0x80 || s1 > 0x9F && s1 < 0xE0 || s1 > 0xEF)
5154 || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
9da8350f 5155 error ("Invalid Shift JIS code: %x", XFASTINT (code));
55ab7be3
KH
5156 DECODE_SJIS (s1, s2, c1, c2);
5157 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
5158 }
4ed46869
KH
5159 return val;
5160}
5161
5162DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
55ab7be3
KH
5163 "Encode a Japanese character CHAR to shift_jis encoding.\n\
5164Return the corresponding code in SJIS.")
4ed46869
KH
5165 (ch)
5166 Lisp_Object ch;
5167{
bcf26d6a 5168 int charset, c1, c2, s1, s2;
4ed46869
KH
5169 Lisp_Object val;
5170
5171 CHECK_NUMBER (ch, 0);
5172 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
c28a9453
KH
5173 if (charset == CHARSET_ASCII)
5174 {
5175 val = ch;
5176 }
5177 else if (charset == charset_jisx0208
5178 && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
4ed46869
KH
5179 {
5180 ENCODE_SJIS (c1, c2, s1, s2);
bcf26d6a 5181 XSETFASTINT (val, (s1 << 8) | s2);
4ed46869 5182 }
55ab7be3
KH
5183 else if (charset == charset_katakana_jisx0201
5184 && c1 > 0x20 && c2 < 0xE0)
5185 {
5186 XSETFASTINT (val, c1 | 0x80);
5187 }
4ed46869 5188 else
55ab7be3 5189 error ("Can't encode to shift_jis: %d", XFASTINT (ch));
4ed46869
KH
5190 return val;
5191}
5192
5193DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
c28a9453 5194 "Decode a Big5 character which has CODE in BIG5 coding system.\n\
4ed46869
KH
5195Return the corresponding character.")
5196 (code)
5197 Lisp_Object code;
5198{
5199 int charset;
5200 unsigned char b1, b2, c1, c2;
5201 Lisp_Object val;
5202
5203 CHECK_NUMBER (code, 0);
5204 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
c28a9453
KH
5205 if (b1 == 0)
5206 {
5207 if (b2 >= 0x80)
9da8350f 5208 error ("Invalid BIG5 code: %x", XFASTINT (code));
c28a9453
KH
5209 val = code;
5210 }
5211 else
5212 {
5213 if ((b1 < 0xA1 || b1 > 0xFE)
5214 || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
9da8350f 5215 error ("Invalid BIG5 code: %x", XFASTINT (code));
c28a9453
KH
5216 DECODE_BIG5 (b1, b2, charset, c1, c2);
5217 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
5218 }
4ed46869
KH
5219 return val;
5220}
5221
5222DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
d46c5b12 5223 "Encode the Big5 character CHAR to BIG5 coding system.\n\
4ed46869
KH
5224Return the corresponding character code in Big5.")
5225 (ch)
5226 Lisp_Object ch;
5227{
bcf26d6a 5228 int charset, c1, c2, b1, b2;
4ed46869
KH
5229 Lisp_Object val;
5230
5231 CHECK_NUMBER (ch, 0);
5232 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
c28a9453
KH
5233 if (charset == CHARSET_ASCII)
5234 {
5235 val = ch;
5236 }
5237 else if ((charset == charset_big5_1
5238 && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
5239 || (charset == charset_big5_2
5240 && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
4ed46869
KH
5241 {
5242 ENCODE_BIG5 (charset, c1, c2, b1, b2);
bcf26d6a 5243 XSETFASTINT (val, (b1 << 8) | b2);
4ed46869
KH
5244 }
5245 else
c28a9453 5246 error ("Can't encode to Big5: %d", XFASTINT (ch));
4ed46869
KH
5247 return val;
5248}
3a73fa5d 5249\f
1ba9e4ab
KH
5250DEFUN ("set-terminal-coding-system-internal",
5251 Fset_terminal_coding_system_internal,
5252 Sset_terminal_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
5253 (coding_system)
5254 Lisp_Object coding_system;
5255{
5256 CHECK_SYMBOL (coding_system, 0);
5257 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
70c22245 5258 /* We had better not send unsafe characters to terminal. */
6e85d753
KH
5259 terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
5260
4ed46869
KH
5261 return Qnil;
5262}
5263
c4825358
KH
5264DEFUN ("set-safe-terminal-coding-system-internal",
5265 Fset_safe_terminal_coding_system_internal,
5266 Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
5267 (coding_system)
5268 Lisp_Object coding_system;
5269{
5270 CHECK_SYMBOL (coding_system, 0);
5271 setup_coding_system (Fcheck_coding_system (coding_system),
5272 &safe_terminal_coding);
5273 return Qnil;
5274}
5275
4ed46869
KH
5276DEFUN ("terminal-coding-system",
5277 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3a73fa5d 5278 "Return coding system specified for terminal output.")
4ed46869
KH
5279 ()
5280{
5281 return terminal_coding.symbol;
5282}
5283
1ba9e4ab
KH
5284DEFUN ("set-keyboard-coding-system-internal",
5285 Fset_keyboard_coding_system_internal,
5286 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
5287 (coding_system)
5288 Lisp_Object coding_system;
5289{
5290 CHECK_SYMBOL (coding_system, 0);
5291 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
5292 return Qnil;
5293}
5294
5295DEFUN ("keyboard-coding-system",
5296 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3a73fa5d 5297 "Return coding system specified for decoding keyboard input.")
4ed46869
KH
5298 ()
5299{
5300 return keyboard_coding.symbol;
5301}
5302
5303\f
a5d301df
KH
5304DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
5305 Sfind_operation_coding_system, 1, MANY, 0,
5306 "Choose a coding system for an operation based on the target name.\n\
69f76525 5307The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
9ce27fde
KH
5308DECODING-SYSTEM is the coding system to use for decoding\n\
5309\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
5310for encoding (in case OPERATION does encoding).\n\
ccdb79f5
RS
5311\n\
5312The first argument OPERATION specifies an I/O primitive:\n\
5313 For file I/O, `insert-file-contents' or `write-region'.\n\
5314 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
5315 For network I/O, `open-network-stream'.\n\
5316\n\
5317The remaining arguments should be the same arguments that were passed\n\
5318to the primitive. Depending on which primitive, one of those arguments\n\
5319is selected as the TARGET. For example, if OPERATION does file I/O,\n\
5320whichever argument specifies the file name is TARGET.\n\
5321\n\
5322TARGET has a meaning which depends on OPERATION:\n\
4ed46869
KH
5323 For file I/O, TARGET is a file name.\n\
5324 For process I/O, TARGET is a process name.\n\
5325 For network I/O, TARGET is a service name or a port number\n\
5326\n\
02ba4723
KH
5327This function looks up what specified for TARGET in,\n\
5328`file-coding-system-alist', `process-coding-system-alist',\n\
5329or `network-coding-system-alist' depending on OPERATION.\n\
5330They may specify a coding system, a cons of coding systems,\n\
5331or a function symbol to call.\n\
5332In the last case, we call the function with one argument,\n\
9ce27fde 5333which is a list of all the arguments given to this function.")
4ed46869
KH
5334 (nargs, args)
5335 int nargs;
5336 Lisp_Object *args;
5337{
5338 Lisp_Object operation, target_idx, target, val;
5339 register Lisp_Object chain;
5340
5341 if (nargs < 2)
5342 error ("Too few arguments");
5343 operation = args[0];
5344 if (!SYMBOLP (operation)
5345 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
5346 error ("Invalid first arguement");
5347 if (nargs < 1 + XINT (target_idx))
5348 error ("Too few arguments for operation: %s",
5349 XSYMBOL (operation)->name->data);
5350 target = args[XINT (target_idx) + 1];
5351 if (!(STRINGP (target)
5352 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
5353 error ("Invalid %dth argument", XINT (target_idx) + 1);
5354
2e34157c
RS
5355 chain = ((EQ (operation, Qinsert_file_contents)
5356 || EQ (operation, Qwrite_region))
02ba4723 5357 ? Vfile_coding_system_alist
2e34157c 5358 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
5359 ? Vnetwork_coding_system_alist
5360 : Vprocess_coding_system_alist));
4ed46869
KH
5361 if (NILP (chain))
5362 return Qnil;
5363
03699b14 5364 for (; CONSP (chain); chain = XCDR (chain))
4ed46869 5365 {
f44d27ce 5366 Lisp_Object elt;
03699b14 5367 elt = XCAR (chain);
4ed46869
KH
5368
5369 if (CONSP (elt)
5370 && ((STRINGP (target)
03699b14
KR
5371 && STRINGP (XCAR (elt))
5372 && fast_string_match (XCAR (elt), target) >= 0)
5373 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
02ba4723 5374 {
03699b14 5375 val = XCDR (elt);
b19fd4c5
KH
5376 /* Here, if VAL is both a valid coding system and a valid
5377 function symbol, we return VAL as a coding system. */
02ba4723
KH
5378 if (CONSP (val))
5379 return val;
5380 if (! SYMBOLP (val))
5381 return Qnil;
5382 if (! NILP (Fcoding_system_p (val)))
5383 return Fcons (val, val);
b19fd4c5
KH
5384 if (! NILP (Ffboundp (val)))
5385 {
5386 val = call1 (val, Flist (nargs, args));
5387 if (CONSP (val))
5388 return val;
5389 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
5390 return Fcons (val, val);
5391 }
02ba4723
KH
5392 return Qnil;
5393 }
4ed46869
KH
5394 }
5395 return Qnil;
5396}
5397
1397dc18
KH
5398DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal,
5399 Supdate_coding_systems_internal, 0, 0, 0,
5400 "Update internal database for ISO2022 and CCL based coding systems.\n\
d46c5b12
KH
5401When values of the following coding categories are changed, you must\n\
5402call this function:\n\
5403 coding-category-iso-7, coding-category-iso-7-tight,\n\
5404 coding-category-iso-8-1, coding-category-iso-8-2,\n\
1397dc18
KH
5405 coding-category-iso-7-else, coding-category-iso-8-else,\n\
5406 coding-category-ccl")
d46c5b12
KH
5407 ()
5408{
5409 int i;
5410
1397dc18 5411 for (i = CODING_CATEGORY_IDX_ISO_7; i <= CODING_CATEGORY_IDX_CCL; i++)
d46c5b12 5412 {
1397dc18
KH
5413 Lisp_Object val;
5414
5415 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value;
5416 if (!NILP (val))
5417 {
5418 if (! coding_system_table[i])
5419 coding_system_table[i] = ((struct coding_system *)
5420 xmalloc (sizeof (struct coding_system)));
5421 setup_coding_system (val, coding_system_table[i]);
5422 }
5423 else if (coding_system_table[i])
5424 {
5425 xfree (coding_system_table[i]);
5426 coding_system_table[i] = NULL;
5427 }
d46c5b12 5428 }
1397dc18 5429
d46c5b12
KH
5430 return Qnil;
5431}
5432
66cfb530
KH
5433DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
5434 Sset_coding_priority_internal, 0, 0, 0,
5435 "Update internal database for the current value of `coding-category-list'.\n\
5436This function is internal use only.")
5437 ()
5438{
5439 int i = 0, idx;
84d60297
RS
5440 Lisp_Object val;
5441
5442 val = Vcoding_category_list;
66cfb530
KH
5443
5444 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
5445 {
03699b14 5446 if (! SYMBOLP (XCAR (val)))
66cfb530 5447 break;
03699b14 5448 idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
66cfb530
KH
5449 if (idx >= CODING_CATEGORY_IDX_MAX)
5450 break;
5451 coding_priorities[i++] = (1 << idx);
03699b14 5452 val = XCDR (val);
66cfb530
KH
5453 }
5454 /* If coding-category-list is valid and contains all coding
5455 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
5456 the following code saves Emacs from craching. */
5457 while (i < CODING_CATEGORY_IDX_MAX)
5458 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
5459
5460 return Qnil;
5461}
5462
4ed46869
KH
5463#endif /* emacs */
5464
5465\f
1397dc18 5466/*** 9. Post-amble ***/
4ed46869 5467
6d74c3aa
KH
5468void
5469init_coding ()
5470{
5471 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
5472}
5473
dfcf069d 5474void
4ed46869
KH
5475init_coding_once ()
5476{
5477 int i;
5478
0ef69138 5479 /* Emacs' internal format specific initialize routine. */
4ed46869
KH
5480 for (i = 0; i <= 0x20; i++)
5481 emacs_code_class[i] = EMACS_control_code;
5482 emacs_code_class[0x0A] = EMACS_linefeed_code;
5483 emacs_code_class[0x0D] = EMACS_carriage_return_code;
5484 for (i = 0x21 ; i < 0x7F; i++)
5485 emacs_code_class[i] = EMACS_ascii_code;
5486 emacs_code_class[0x7F] = EMACS_control_code;
5487 emacs_code_class[0x80] = EMACS_leading_code_composition;
5488 for (i = 0x81; i < 0xFF; i++)
5489 emacs_code_class[i] = EMACS_invalid_code;
5490 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
5491 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
5492 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
5493 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
5494
5495 /* ISO2022 specific initialize routine. */
5496 for (i = 0; i < 0x20; i++)
5497 iso_code_class[i] = ISO_control_code;
5498 for (i = 0x21; i < 0x7F; i++)
5499 iso_code_class[i] = ISO_graphic_plane_0;
5500 for (i = 0x80; i < 0xA0; i++)
5501 iso_code_class[i] = ISO_control_code;
5502 for (i = 0xA1; i < 0xFF; i++)
5503 iso_code_class[i] = ISO_graphic_plane_1;
5504 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
5505 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
5506 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
5507 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
5508 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
5509 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
5510 iso_code_class[ISO_CODE_ESC] = ISO_escape;
5511 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
5512 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
5513 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
5514
e0e989f6 5515 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
e0e989f6
KH
5516
5517 setup_coding_system (Qnil, &keyboard_coding);
5518 setup_coding_system (Qnil, &terminal_coding);
c4825358 5519 setup_coding_system (Qnil, &safe_terminal_coding);
6bc51348 5520 setup_coding_system (Qnil, &default_buffer_file_coding);
9ce27fde 5521
d46c5b12
KH
5522 bzero (coding_system_table, sizeof coding_system_table);
5523
66cfb530
KH
5524 bzero (ascii_skip_code, sizeof ascii_skip_code);
5525 for (i = 0; i < 128; i++)
5526 ascii_skip_code[i] = 1;
5527
9ce27fde
KH
5528#if defined (MSDOS) || defined (WINDOWSNT)
5529 system_eol_type = CODING_EOL_CRLF;
5530#else
5531 system_eol_type = CODING_EOL_LF;
5532#endif
b843d1ae
KH
5533
5534 inhibit_pre_post_conversion = 0;
e0e989f6
KH
5535}
5536
5537#ifdef emacs
5538
dfcf069d 5539void
e0e989f6
KH
5540syms_of_coding ()
5541{
5542 Qtarget_idx = intern ("target-idx");
5543 staticpro (&Qtarget_idx);
5544
bb0115a2
RS
5545 Qcoding_system_history = intern ("coding-system-history");
5546 staticpro (&Qcoding_system_history);
5547 Fset (Qcoding_system_history, Qnil);
5548
9ce27fde 5549 /* Target FILENAME is the first argument. */
e0e989f6 5550 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 5551 /* Target FILENAME is the third argument. */
e0e989f6
KH
5552 Fput (Qwrite_region, Qtarget_idx, make_number (2));
5553
5554 Qcall_process = intern ("call-process");
5555 staticpro (&Qcall_process);
9ce27fde 5556 /* Target PROGRAM is the first argument. */
e0e989f6
KH
5557 Fput (Qcall_process, Qtarget_idx, make_number (0));
5558
5559 Qcall_process_region = intern ("call-process-region");
5560 staticpro (&Qcall_process_region);
9ce27fde 5561 /* Target PROGRAM is the third argument. */
e0e989f6
KH
5562 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
5563
5564 Qstart_process = intern ("start-process");
5565 staticpro (&Qstart_process);
9ce27fde 5566 /* Target PROGRAM is the third argument. */
e0e989f6
KH
5567 Fput (Qstart_process, Qtarget_idx, make_number (2));
5568
5569 Qopen_network_stream = intern ("open-network-stream");
5570 staticpro (&Qopen_network_stream);
9ce27fde 5571 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
5572 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
5573
4ed46869
KH
5574 Qcoding_system = intern ("coding-system");
5575 staticpro (&Qcoding_system);
5576
5577 Qeol_type = intern ("eol-type");
5578 staticpro (&Qeol_type);
5579
5580 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
5581 staticpro (&Qbuffer_file_coding_system);
5582
5583 Qpost_read_conversion = intern ("post-read-conversion");
5584 staticpro (&Qpost_read_conversion);
5585
5586 Qpre_write_conversion = intern ("pre-write-conversion");
5587 staticpro (&Qpre_write_conversion);
5588
27901516
KH
5589 Qno_conversion = intern ("no-conversion");
5590 staticpro (&Qno_conversion);
5591
5592 Qundecided = intern ("undecided");
5593 staticpro (&Qundecided);
5594
4ed46869
KH
5595 Qcoding_system_p = intern ("coding-system-p");
5596 staticpro (&Qcoding_system_p);
5597
5598 Qcoding_system_error = intern ("coding-system-error");
5599 staticpro (&Qcoding_system_error);
5600
5601 Fput (Qcoding_system_error, Qerror_conditions,
5602 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
5603 Fput (Qcoding_system_error, Qerror_message,
9ce27fde 5604 build_string ("Invalid coding system"));
4ed46869 5605
d46c5b12
KH
5606 Qcoding_category = intern ("coding-category");
5607 staticpro (&Qcoding_category);
4ed46869
KH
5608 Qcoding_category_index = intern ("coding-category-index");
5609 staticpro (&Qcoding_category_index);
5610
d46c5b12
KH
5611 Vcoding_category_table
5612 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
5613 staticpro (&Vcoding_category_table);
4ed46869
KH
5614 {
5615 int i;
5616 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
5617 {
d46c5b12
KH
5618 XVECTOR (Vcoding_category_table)->contents[i]
5619 = intern (coding_category_name[i]);
5620 Fput (XVECTOR (Vcoding_category_table)->contents[i],
5621 Qcoding_category_index, make_number (i));
4ed46869
KH
5622 }
5623 }
5624
f967223b
KH
5625 Qtranslation_table = intern ("translation-table");
5626 staticpro (&Qtranslation_table);
1397dc18 5627 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
bdd9fb48 5628
f967223b
KH
5629 Qtranslation_table_id = intern ("translation-table-id");
5630 staticpro (&Qtranslation_table_id);
84fbb8a0 5631
f967223b
KH
5632 Qtranslation_table_for_decode = intern ("translation-table-for-decode");
5633 staticpro (&Qtranslation_table_for_decode);
a5d301df 5634
f967223b
KH
5635 Qtranslation_table_for_encode = intern ("translation-table-for-encode");
5636 staticpro (&Qtranslation_table_for_encode);
a5d301df 5637
70c22245
KH
5638 Qsafe_charsets = intern ("safe-charsets");
5639 staticpro (&Qsafe_charsets);
5640
1397dc18
KH
5641 Qvalid_codes = intern ("valid-codes");
5642 staticpro (&Qvalid_codes);
5643
9ce27fde
KH
5644 Qemacs_mule = intern ("emacs-mule");
5645 staticpro (&Qemacs_mule);
5646
d46c5b12
KH
5647 Qraw_text = intern ("raw-text");
5648 staticpro (&Qraw_text);
5649
4ed46869
KH
5650 defsubr (&Scoding_system_p);
5651 defsubr (&Sread_coding_system);
5652 defsubr (&Sread_non_nil_coding_system);
5653 defsubr (&Scheck_coding_system);
5654 defsubr (&Sdetect_coding_region);
d46c5b12 5655 defsubr (&Sdetect_coding_string);
4ed46869
KH
5656 defsubr (&Sdecode_coding_region);
5657 defsubr (&Sencode_coding_region);
5658 defsubr (&Sdecode_coding_string);
5659 defsubr (&Sencode_coding_string);
5660 defsubr (&Sdecode_sjis_char);
5661 defsubr (&Sencode_sjis_char);
5662 defsubr (&Sdecode_big5_char);
5663 defsubr (&Sencode_big5_char);
1ba9e4ab 5664 defsubr (&Sset_terminal_coding_system_internal);
c4825358 5665 defsubr (&Sset_safe_terminal_coding_system_internal);
4ed46869 5666 defsubr (&Sterminal_coding_system);
1ba9e4ab 5667 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 5668 defsubr (&Skeyboard_coding_system);
a5d301df 5669 defsubr (&Sfind_operation_coding_system);
1397dc18 5670 defsubr (&Supdate_coding_systems_internal);
66cfb530 5671 defsubr (&Sset_coding_priority_internal);
4ed46869 5672
4608c386
KH
5673 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
5674 "List of coding systems.\n\
5675\n\
5676Do not alter the value of this variable manually. This variable should be\n\
5677updated by the functions `make-coding-system' and\n\
5678`define-coding-system-alias'.");
5679 Vcoding_system_list = Qnil;
5680
5681 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
5682 "Alist of coding system names.\n\
5683Each element is one element list of coding system name.\n\
5684This variable is given to `completing-read' as TABLE argument.\n\
5685\n\
5686Do not alter the value of this variable manually. This variable should be\n\
5687updated by the functions `make-coding-system' and\n\
5688`define-coding-system-alias'.");
5689 Vcoding_system_alist = Qnil;
5690
4ed46869
KH
5691 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
5692 "List of coding-categories (symbols) ordered by priority.");
5693 {
5694 int i;
5695
5696 Vcoding_category_list = Qnil;
5697 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
5698 Vcoding_category_list
d46c5b12
KH
5699 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
5700 Vcoding_category_list);
4ed46869
KH
5701 }
5702
5703 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
10bff6f1 5704 "Specify the coding system for read operations.\n\
2ebb362d 5705It is useful to bind this variable with `let', but do not set it globally.\n\
4ed46869 5706If the value is a coding system, it is used for decoding on read operation.\n\
a67a9c66 5707If not, an appropriate element is used from one of the coding system alists:\n\
10bff6f1 5708There are three such tables, `file-coding-system-alist',\n\
a67a9c66 5709`process-coding-system-alist', and `network-coding-system-alist'.");
4ed46869
KH
5710 Vcoding_system_for_read = Qnil;
5711
5712 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
10bff6f1 5713 "Specify the coding system for write operations.\n\
928aedd8
RS
5714Programs bind this variable with `let', but you should not set it globally.\n\
5715If the value is a coding system, it is used for encoding of output,\n\
5716when writing it to a file and when sending it to a file or subprocess.\n\
5717\n\
5718If this does not specify a coding system, an appropriate element\n\
5719is used from one of the coding system alists:\n\
10bff6f1 5720There are three such tables, `file-coding-system-alist',\n\
928aedd8
RS
5721`process-coding-system-alist', and `network-coding-system-alist'.\n\
5722For output to files, if the above procedure does not specify a coding system,\n\
5723the value of `buffer-file-coding-system' is used.");
4ed46869
KH
5724 Vcoding_system_for_write = Qnil;
5725
5726 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
a67a9c66 5727 "Coding system used in the latest file or process I/O.");
4ed46869
KH
5728 Vlast_coding_system_used = Qnil;
5729
9ce27fde 5730 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
f07f4a24 5731 "*Non-nil means always inhibit code conversion of end-of-line format.\n\
94c7a214
DL
5732See info node `Coding Systems' and info node `Text and Binary' concerning\n\
5733such conversion.");
9ce27fde
KH
5734 inhibit_eol_conversion = 0;
5735
ed29121d
EZ
5736 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
5737 "Non-nil means process buffer inherits coding system of process output.\n\
5738Bind it to t if the process output is to be treated as if it were a file\n\
5739read from some filesystem.");
5740 inherit_process_coding_system = 0;
5741
02ba4723
KH
5742 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
5743 "Alist to decide a coding system to use for a file I/O operation.\n\
5744The format is ((PATTERN . VAL) ...),\n\
5745where PATTERN is a regular expression matching a file name,\n\
5746VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5747If VAL is a coding system, it is used for both decoding and encoding\n\
5748the file contents.\n\
5749If VAL is a cons of coding systems, the car part is used for decoding,\n\
5750and the cdr part is used for encoding.\n\
5751If VAL is a function symbol, the function must return a coding system\n\
5752or a cons of coding systems which are used as above.\n\
e0e989f6 5753\n\
a85a871a 5754See also the function `find-operation-coding-system'\n\
eda284ac 5755and the variable `auto-coding-alist'.");
02ba4723
KH
5756 Vfile_coding_system_alist = Qnil;
5757
5758 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
5759 "Alist to decide a coding system to use for a process I/O operation.\n\
5760The format is ((PATTERN . VAL) ...),\n\
5761where PATTERN is a regular expression matching a program name,\n\
5762VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5763If VAL is a coding system, it is used for both decoding what received\n\
5764from the program and encoding what sent to the program.\n\
5765If VAL is a cons of coding systems, the car part is used for decoding,\n\
5766and the cdr part is used for encoding.\n\
5767If VAL is a function symbol, the function must return a coding system\n\
5768or a cons of coding systems which are used as above.\n\
4ed46869 5769\n\
9ce27fde 5770See also the function `find-operation-coding-system'.");
02ba4723
KH
5771 Vprocess_coding_system_alist = Qnil;
5772
5773 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
5774 "Alist to decide a coding system to use for a network I/O operation.\n\
5775The format is ((PATTERN . VAL) ...),\n\
5776where PATTERN is a regular expression matching a network service name\n\
5777or is a port number to connect to,\n\
5778VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5779If VAL is a coding system, it is used for both decoding what received\n\
5780from the network stream and encoding what sent to the network stream.\n\
5781If VAL is a cons of coding systems, the car part is used for decoding,\n\
5782and the cdr part is used for encoding.\n\
5783If VAL is a function symbol, the function must return a coding system\n\
5784or a cons of coding systems which are used as above.\n\
4ed46869 5785\n\
9ce27fde 5786See also the function `find-operation-coding-system'.");
02ba4723 5787 Vnetwork_coding_system_alist = Qnil;
4ed46869 5788
7722baf9
EZ
5789 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
5790 "*String displayed in mode line for UNIX-like (LF) end-of-line format.");
5791 eol_mnemonic_unix = build_string (":");
4ed46869 5792
7722baf9
EZ
5793 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
5794 "*String displayed in mode line for DOS-like (CRLF) end-of-line format.");
5795 eol_mnemonic_dos = build_string ("\\");
4ed46869 5796
7722baf9
EZ
5797 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
5798 "*String displayed in mode line for MAC-like (CR) end-of-line format.");
5799 eol_mnemonic_mac = build_string ("/");
4ed46869 5800
7722baf9
EZ
5801 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
5802 "*String displayed in mode line when end-of-line format is not yet determined.");
5803 eol_mnemonic_undecided = build_string (":");
4ed46869 5804
84fbb8a0 5805 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
f967223b 5806 "*Non-nil enables character translation while encoding and decoding.");
84fbb8a0 5807 Venable_character_translation = Qt;
bdd9fb48 5808
f967223b
KH
5809 DEFVAR_LISP ("standard-translation-table-for-decode",
5810 &Vstandard_translation_table_for_decode,
84fbb8a0 5811 "Table for translating characters while decoding.");
f967223b 5812 Vstandard_translation_table_for_decode = Qnil;
bdd9fb48 5813
f967223b
KH
5814 DEFVAR_LISP ("standard-translation-table-for-encode",
5815 &Vstandard_translation_table_for_encode,
84fbb8a0 5816 "Table for translationg characters while encoding.");
f967223b 5817 Vstandard_translation_table_for_encode = Qnil;
4ed46869
KH
5818
5819 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
5820 "Alist of charsets vs revision numbers.\n\
5821While encoding, if a charset (car part of an element) is found,\n\
5822designate it with the escape sequence identifing revision (cdr part of the element).");
5823 Vcharset_revision_alist = Qnil;
02ba4723
KH
5824
5825 DEFVAR_LISP ("default-process-coding-system",
5826 &Vdefault_process_coding_system,
5827 "Cons of coding systems used for process I/O by default.\n\
5828The car part is used for decoding a process output,\n\
5829the cdr part is used for encoding a text to be sent to a process.");
5830 Vdefault_process_coding_system = Qnil;
c4825358 5831
3f003981
KH
5832 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
5833 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
c4825358
KH
5834This is a vector of length 256.\n\
5835If Nth element is non-nil, the existence of code N in a file\n\
bb0115a2 5836\(or output of subprocess) doesn't prevent it to be detected as\n\
3f003981
KH
5837a coding system of ISO 2022 variant which has a flag\n\
5838`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
c4825358
KH
5839or reading output of a subprocess.\n\
5840Only 128th through 159th elements has a meaning.");
3f003981 5841 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
d46c5b12
KH
5842
5843 DEFVAR_LISP ("select-safe-coding-system-function",
5844 &Vselect_safe_coding_system_function,
5845 "Function to call to select safe coding system for encoding a text.\n\
5846\n\
5847If set, this function is called to force a user to select a proper\n\
5848coding system which can encode the text in the case that a default\n\
5849coding system used in each operation can't encode the text.\n\
5850\n\
a85a871a 5851The default value is `select-safe-coding-system' (which see).");
d46c5b12
KH
5852 Vselect_safe_coding_system_function = Qnil;
5853
4ed46869
KH
5854}
5855
5856#endif /* emacs */