("romanian-prefix"): New input method.
[bpt/emacs.git] / src / coding.c
CommitLineData
4ed46869 1/* Coding system handler (conversion, detection, and etc).
4a2f9c6a 2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
203cb916 3 Licensed to the Free Software Foundation.
4ed46869 4
369314dc
KH
5This file is part of GNU Emacs.
6
7GNU Emacs is free software; you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation; either version 2, or (at your option)
10any later version.
4ed46869 11
369314dc
KH
12GNU Emacs is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
4ed46869 16
369314dc
KH
17You should have received a copy of the GNU General Public License
18along with GNU Emacs; see the file COPYING. If not, write to
19the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20Boston, MA 02111-1307, USA. */
4ed46869
KH
21
22/*** TABLE OF CONTENTS ***
23
24 1. Preamble
0ef69138 25 2. Emacs' internal format (emacs-mule) handlers
4ed46869
KH
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
1397dc18
KH
28 5. CCL handlers
29 6. End-of-line handlers
30 7. C library functions
31 8. Emacs Lisp library functions
32 9. Post-amble
4ed46869
KH
33
34*/
35
36/*** GENERAL NOTE on CODING SYSTEM ***
37
38 Coding system is an encoding mechanism of one or more character
39 sets. Here's a list of coding systems which Emacs can handle. When
40 we say "decode", it means converting some other coding system to
0ef69138
KH
41 Emacs' internal format (emacs-mule), and when we say "encode",
42 it means converting the coding system emacs-mule to some other
43 coding system.
4ed46869 44
0ef69138 45 0. Emacs' internal format (emacs-mule)
4ed46869
KH
46
47 Emacs itself holds a multi-lingual character in a buffer and a string
f4dee582 48 in a special format. Details are described in section 2.
4ed46869
KH
49
50 1. ISO2022
51
52 The most famous coding system for multiple character sets. X's
f4dee582
RS
53 Compound Text, various EUCs (Extended Unix Code), and coding
54 systems used in Internet communication such as ISO-2022-JP are
55 all variants of ISO2022. Details are described in section 3.
4ed46869
KH
56
57 2. SJIS (or Shift-JIS or MS-Kanji-Code)
58
59 A coding system to encode character sets: ASCII, JISX0201, and
60 JISX0208. Widely used for PC's in Japan. Details are described in
f4dee582 61 section 4.
4ed46869
KH
62
63 3. BIG5
64
65 A coding system to encode character sets: ASCII and Big5. Widely
66 used by Chinese (mainly in Taiwan and Hong Kong). Details are
f4dee582
RS
67 described in section 4. In this file, when we write "BIG5"
68 (all uppercase), we mean the coding system, and when we write
69 "Big5" (capitalized), we mean the character set.
4ed46869 70
27901516
KH
71 4. Raw text
72
4608c386
KH
73 A coding system for a text containing random 8-bit code. Emacs does
74 no code conversion on such a text except for end-of-line format.
27901516
KH
75
76 5. Other
4ed46869 77
f4dee582 78 If a user wants to read/write a text encoded in a coding system not
4ed46869
KH
79 listed above, he can supply a decoder and an encoder for it in CCL
80 (Code Conversion Language) programs. Emacs executes the CCL program
81 while reading/writing.
82
d46c5b12
KH
83 Emacs represents a coding system by a Lisp symbol that has a property
84 `coding-system'. But, before actually using the coding system, the
4ed46869 85 information about it is set in a structure of type `struct
f4dee582 86 coding_system' for rapid processing. See section 6 for more details.
4ed46869
KH
87
88*/
89
90/*** GENERAL NOTES on END-OF-LINE FORMAT ***
91
92 How end-of-line of a text is encoded depends on a system. For
93 instance, Unix's format is just one byte of `line-feed' code,
f4dee582 94 whereas DOS's format is two-byte sequence of `carriage-return' and
d46c5b12
KH
95 `line-feed' codes. MacOS's format is usually one byte of
96 `carriage-return'.
4ed46869 97
f4dee582
RS
98 Since text characters encoding and end-of-line encoding are
99 independent, any coding system described above can take
4ed46869 100 any format of end-of-line. So, Emacs has information of format of
f4dee582 101 end-of-line in each coding-system. See section 6 for more details.
4ed46869
KH
102
103*/
104
105/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
106
107 These functions check if a text between SRC and SRC_END is encoded
108 in the coding system category XXX. Each returns an integer value in
109 which appropriate flag bits for the category XXX are set. The flag
110 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
111 template of these functions. */
112#if 0
113int
0ef69138 114detect_coding_emacs_mule (src, src_end)
4ed46869
KH
115 unsigned char *src, *src_end;
116{
117 ...
118}
119#endif
120
121/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
122
123 These functions decode SRC_BYTES length text at SOURCE encoded in
0ef69138 124 CODING to Emacs' internal format (emacs-mule). The resulting text
d46c5b12
KH
125 goes to a place pointed to by DESTINATION, the length of which
126 should not exceed DST_BYTES. These functions set the information of
127 original and decoded texts in the members produced, produced_char,
128 consumed, and consumed_char of the structure *CODING.
129
130 The return value is an integer (CODING_FINISH_XXX) indicating how
131 the decoding finished.
132
133 DST_BYTES zero means that source area and destination area are
134 overlapped, which means that we can produce a decoded text until it
135 reaches the head of the not-yet-decoded source text.
136
137 Below is a template of these functions. */
4ed46869 138#if 0
d46c5b12 139decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
140 struct coding_system *coding;
141 unsigned char *source, *destination;
142 int src_bytes, dst_bytes;
4ed46869
KH
143{
144 ...
145}
146#endif
147
148/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
149
0ef69138
KH
150 These functions encode SRC_BYTES length text at SOURCE of Emacs'
151 internal format (emacs-mule) to CODING. The resulting text goes to
f4dee582 152 a place pointed to by DESTINATION, the length of which should not
d46c5b12
KH
153 exceed DST_BYTES. These functions set the information of
154 original and encoded texts in the members produced, produced_char,
155 consumed, and consumed_char of the structure *CODING.
156
157 The return value is an integer (CODING_FINISH_XXX) indicating how
158 the encoding finished.
159
160 DST_BYTES zero means that source area and destination area are
161 overlapped, which means that we can produce an encoded text until it
162 reaches the head of the not-yet-encoded source text.
163
164 Below is a template of these functions. */
4ed46869 165#if 0
d46c5b12 166encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
167 struct coding_system *coding;
168 unsigned char *source, *destination;
169 int src_bytes, dst_bytes;
4ed46869
KH
170{
171 ...
172}
173#endif
174
175/*** COMMONLY USED MACROS ***/
176
177/* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
178 THREE_MORE_BYTES safely get one, two, and three bytes from the
179 source text respectively. If there are not enough bytes in the
180 source, they jump to `label_end_of_loop'. The caller should set
181 variables `src' and `src_end' to appropriate areas in advance. */
182
/* Read the byte at `src' into C1 and advance `src' by one.  If `src'
   has already reached `src_end', nothing is read and control jumps
   to `label_end_of_loop', which the enclosing function must define.  */
183#define ONE_MORE_BYTE(c1) \
184 do { \
185 if (src < src_end) \
186 c1 = *src++; \
187 else \
188 goto label_end_of_loop; \
189 } while (0)
190
/* Read two consecutive bytes at `src' into C1 and C2 (in that order)
   and advance `src' by two.  Both bytes must be available
   (`src + 1 < src_end'); otherwise `src' is left untouched and
   control jumps to `label_end_of_loop'.  */
191#define TWO_MORE_BYTES(c1, c2) \
192 do { \
193 if (src + 1 < src_end) \
194 c1 = *src++, c2 = *src++; \
195 else \
196 goto label_end_of_loop; \
197 } while (0)
198
/* Read three consecutive bytes at `src' into C1, C2 and C3 (in that
   order) and advance `src' by three.  All three bytes must be
   available (`src + 2 < src_end'); otherwise `src' is left untouched
   and control jumps to `label_end_of_loop'.  */
199#define THREE_MORE_BYTES(c1, c2, c3) \
200 do { \
201 if (src + 2 < src_end) \
202 c1 = *src++, c2 = *src++, c3 = *src++; \
203 else \
204 goto label_end_of_loop; \
205 } while (0)
206
207/* The following three macros DECODE_CHARACTER_ASCII,
208 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
209 the multi-byte form of a character of each class at the place
210 pointed by `dst'. The caller should set the variable `dst' to
211 point to an appropriate area and the variable `coding' to point to
212 the coding-system of the currently decoding text in advance. */
213
214/* Decode one ASCII character C. */
215
de79a6a5
KH
/* Store C at `dst' and advance `dst'.  While composing, i.e. when
   COMPOSING_P (coding->composing) is true, C is written as the two
   bytes 0xA0 and C | 0x80 and coding->composed_chars is incremented;
   otherwise C is written unchanged and coding->produced_char is
   incremented.  coding->fake_multibyte is set to 1 when the byte
   C | 0x80 written in the composing case is below 0xA0, or, in the
   non-composing case, when C is 0x80 or greater.  C is evaluated
   more than once, so it must not have side effects.  */
216#define DECODE_CHARACTER_ASCII(c) \
217 do { \
218 if (COMPOSING_P (coding->composing)) \
219 { \
220 *dst++ = 0xA0, *dst++ = (c) | 0x80; \
221 coding->composed_chars++; \
d14d03ac
KH
222 if (((c) | 0x80) < 0xA0) \
223 coding->fake_multibyte = 1; \
de79a6a5
KH
224 } \
225 else \
226 { \
227 *dst++ = (c); \
228 coding->produced_char++; \
d14d03ac
KH
229 if ((c) >= 0x80) \
230 coding->fake_multibyte = 1; \
de79a6a5 231 } \
4ed46869
KH
232 } while (0)
233
f4dee582 234/* Decode one DIMENSION1 character whose charset is CHARSET and whose
4ed46869
KH
235 position-code is C. */
236
/* Write the multi-byte form of a one-position-code character at `dst'
   and advance `dst'.  The bytes written are, in order:
     - the base leading code CHARSET_LEADING_CODE_BASE (CHARSET), with
       0x20 added while composing (coding->composed_chars is then
       incremented; otherwise coding->produced_char is incremented);
     - the extended leading code CHARSET_LEADING_CODE_EXT (CHARSET),
       only when it is nonzero;
     - the position code C | 0x80.
   The `=' in the `if' below is an intentional assignment: the
   extended leading code is fetched into `leading_code' and tested
   for being nonzero in one step.  coding->fake_multibyte is set to 1
   when C | 0x80 is below 0xA0.  CHARSET and C are evaluated more than
   once.  */
237#define DECODE_CHARACTER_DIMENSION1(charset, c) \
238 do { \
239 unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset); \
240 if (COMPOSING_P (coding->composing)) \
de79a6a5
KH
241 { \
242 *dst++ = leading_code + 0x20; \
243 coding->composed_chars++; \
244 } \
4ed46869 245 else \
d46c5b12
KH
246 { \
247 *dst++ = leading_code; \
248 coding->produced_char++; \
249 } \
4ed46869
KH
250 if (leading_code = CHARSET_LEADING_CODE_EXT (charset)) \
251 *dst++ = leading_code; \
252 *dst++ = (c) | 0x80; \
d14d03ac
KH
253 if (((c) | 0x80) < 0xA0) \
254 coding->fake_multibyte = 1; \
4ed46869
KH
255 } while (0)
256
f4dee582 257/* Decode one DIMENSION2 character whose charset is CHARSET and whose
4ed46869
KH
258 position-codes are C1 and C2. */
259
/* Write the multi-byte form of a two-position-code character at
   `dst'.  The leading code(s) and the first position code C1 are
   emitted by DECODE_CHARACTER_DIMENSION1 (which also updates the
   character counters in *coding); then the second position code
   C2 | 0x80 is appended.  coding->fake_multibyte is set to 1 when
   C2 | 0x80 is below 0xA0.  C2 is evaluated more than once.  */
260#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2) \
261 do { \
262 DECODE_CHARACTER_DIMENSION1 (charset, c1); \
263 *dst++ = (c2) | 0x80; \
d14d03ac
KH
264 if (((c2) | 0x80) < 0xA0) \
265 coding->fake_multibyte = 1; \
4ed46869
KH
266 } while (0)
267
268\f
269/*** 1. Preamble ***/
270
271#include <stdio.h>
272
273#ifdef emacs
274
275#include <config.h>
276#include "lisp.h"
277#include "buffer.h"
278#include "charset.h"
279#include "ccl.h"
280#include "coding.h"
281#include "window.h"
282
283#else /* not emacs */
284
285#include "mulelib.h"
286
287#endif /* not emacs */
288
289Lisp_Object Qcoding_system, Qeol_type;
290Lisp_Object Qbuffer_file_coding_system;
291Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
27901516 292Lisp_Object Qno_conversion, Qundecided;
bb0115a2 293Lisp_Object Qcoding_system_history;
70c22245 294Lisp_Object Qsafe_charsets;
1397dc18 295Lisp_Object Qvalid_codes;
4ed46869
KH
296
297extern Lisp_Object Qinsert_file_contents, Qwrite_region;
298Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
299Lisp_Object Qstart_process, Qopen_network_stream;
300Lisp_Object Qtarget_idx;
301
d46c5b12
KH
302Lisp_Object Vselect_safe_coding_system_function;
303
7722baf9
EZ
304/* Mnemonic string for each format of end-of-line. */
305Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
306/* Mnemonic string to indicate format of end-of-line is not yet
4ed46869 307 decided. */
7722baf9 308Lisp_Object eol_mnemonic_undecided;
4ed46869 309
9ce27fde
KH
310/* Format of end-of-line decided by system. This is CODING_EOL_LF on
311 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
312int system_eol_type;
313
4ed46869
KH
314#ifdef emacs
315
4608c386
KH
316Lisp_Object Vcoding_system_list, Vcoding_system_alist;
317
318Lisp_Object Qcoding_system_p, Qcoding_system_error;
4ed46869 319
d46c5b12
KH
320/* Coding system emacs-mule and raw-text are for converting only
321 end-of-line format. */
322Lisp_Object Qemacs_mule, Qraw_text;
9ce27fde 323
4ed46869
KH
324/* Coding-systems are handed between Emacs Lisp programs and C internal
325 routines by the following three variables. */
326/* Coding-system for reading files and receiving data from process. */
327Lisp_Object Vcoding_system_for_read;
328/* Coding-system for writing files and sending data to process. */
329Lisp_Object Vcoding_system_for_write;
330/* Coding-system actually used in the latest I/O. */
331Lisp_Object Vlast_coding_system_used;
332
c4825358 333/* A vector of length 256 which contains information about special
94487c4e 334 Latin codes (especially for dealing with Microsoft codes). */
3f003981 335Lisp_Object Vlatin_extra_code_table;
c4825358 336
9ce27fde
KH
337/* Flag to inhibit code conversion of end-of-line format. */
338int inhibit_eol_conversion;
339
ed29121d
EZ
340/* Flag to make buffer-file-coding-system inherit from process-coding. */
341int inherit_process_coding_system;
342
c4825358 343/* Coding system to be used to encode text for terminal display. */
4ed46869
KH
344struct coding_system terminal_coding;
345
c4825358
KH
346/* Coding system to be used to encode text for terminal display when
347 terminal coding system is nil. */
348struct coding_system safe_terminal_coding;
349
350/* Coding system of what is sent from terminal keyboard. */
4ed46869
KH
351struct coding_system keyboard_coding;
352
6bc51348
KH
353/* Default coding system to be used to write a file. */
354struct coding_system default_buffer_file_coding;
355
02ba4723
KH
356Lisp_Object Vfile_coding_system_alist;
357Lisp_Object Vprocess_coding_system_alist;
358Lisp_Object Vnetwork_coding_system_alist;
4ed46869
KH
359
360#endif /* emacs */
361
d46c5b12 362Lisp_Object Qcoding_category, Qcoding_category_index;
4ed46869
KH
363
364/* List of symbols `coding-category-xxx' ordered by priority. */
365Lisp_Object Vcoding_category_list;
366
d46c5b12
KH
367/* Table of coding categories (Lisp symbols). */
368Lisp_Object Vcoding_category_table;
4ed46869
KH
369
370/* Table of names of symbol for each coding-category. */
/* Twelve names are listed here, one per array slot, so each name is
   selected purely by its position.
   NOTE(review): the order presumably has to match the
   CODING_CATEGORY_IDX_XXX constants declared elsewhere (not visible
   in this file section) -- confirm before inserting or reordering
   entries.  */
371char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
0ef69138 372 "coding-category-emacs-mule",
4ed46869
KH
373 "coding-category-sjis",
374 "coding-category-iso-7",
d46c5b12 375 "coding-category-iso-7-tight",
4ed46869
KH
376 "coding-category-iso-8-1",
377 "coding-category-iso-8-2",
7717c392
KH
378 "coding-category-iso-7-else",
379 "coding-category-iso-8-else",
89fa8b36 380 "coding-category-ccl",
4ed46869 381 "coding-category-big5",
27901516 382 "coding-category-raw-text",
89fa8b36 383 "coding-category-binary"
4ed46869
KH
384};
385
66cfb530 386/* Table of pointers to coding systems corresponding to each coding
d46c5b12
KH
387 categories. */
388struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
389
66cfb530
KH
390/* Table of coding category masks. Nth element is a mask for a coding
391 category of which priority is Nth. */
392static
393int coding_priorities[CODING_CATEGORY_IDX_MAX];
394
f967223b
KH
395/* Flag to tell if we look up translation table on character code
396 conversion. */
84fbb8a0 397Lisp_Object Venable_character_translation;
f967223b
KH
398/* Standard translation table to look up on decoding (reading). */
399Lisp_Object Vstandard_translation_table_for_decode;
400/* Standard translation table to look up on encoding (writing). */
401Lisp_Object Vstandard_translation_table_for_encode;
84fbb8a0 402
f967223b
KH
403Lisp_Object Qtranslation_table;
404Lisp_Object Qtranslation_table_id;
405Lisp_Object Qtranslation_table_for_decode;
406Lisp_Object Qtranslation_table_for_encode;
4ed46869
KH
407
408/* Alist of charsets vs revision number. */
409Lisp_Object Vcharset_revision_alist;
410
02ba4723
KH
411/* Default coding systems used for process I/O. */
412Lisp_Object Vdefault_process_coding_system;
413
4ed46869 414\f
0ef69138 415/*** 2. Emacs internal format (emacs-mule) handlers ***/
4ed46869
KH
416
417/* Emacs' internal format for encoding multiple character sets is a
f4dee582
RS
418 kind of multi-byte encoding, i.e. characters are encoded by
419 variable-length sequences of one-byte codes. ASCII characters
420 and control characters (e.g. `tab', `newline') are represented by
421 one-byte sequences which are their ASCII codes, in the range 0x00
422 through 0x7F. The other characters are represented by a sequence
423 of `base leading-code', optional `extended leading-code', and one
424 or two `position-code's. The length of the sequence is determined
425 by the base leading-code. Leading-code takes the range 0x80
426 through 0x9F, whereas extended leading-code and position-code take
427 the range 0xA0 through 0xFF. See `charset.h' for more details
428 about leading-code and position-code.
429
430 There's one exception to this rule. Special leading-code
4ed46869
KH
431 `leading-code-composition' denotes that the following several
432 characters should be composed into one character. Leading-codes of
433 components (except for ASCII) are added 0x20. An ASCII character
434 component is represented by a 2-byte sequence of `0xA0' and
f4dee582
RS
435 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
436 details of composite character. Hence, we can summarize the code
4ed46869
KH
437 range as follows:
438
439 --- CODE RANGE of Emacs' internal format ---
440 (character set) (range)
441 ASCII 0x00 .. 0x7F
442 ELSE (1st byte) 0x80 .. 0x9F
443 (rest bytes) 0xA0 .. 0xFF
444 ---------------------------------------------
445
446 */
447
448enum emacs_code_class_type emacs_code_class[256];
449
450/* Go to the next statement only if *SRC is accessible and the code is
451 0xA0 or greater (the test is `*src++ < 0xA0', so 0xA0 itself passes).
 If SRC has already reached SRC_END, jump to `label_end_of_switch',
 which the enclosing function must define. If the byte is below
 0xA0, make the enclosing function return 0. SRC is advanced past
 the byte whenever the byte is examined. */
452#define CHECK_CODE_RANGE_A0_FF \
453 do { \
454 if (src >= src_end) \
455 goto label_end_of_switch; \
456 else if (*src++ < 0xA0) \
457 return 0; \
458 } while (0)
459
460/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
461 Check if a text is encoded in Emacs' internal format. If it is,
d46c5b12 462 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
4ed46869
KH
463
/* Scan the bytes from SRC up to (not including) SRC_END and decide
   whether they are consistent with Emacs' internal format.  Returns 0
   as soon as a byte contradicts the format, and
   CODING_CATEGORY_MASK_EMACS_MULE when the whole range has been
   scanned without contradiction (including the empty range).  */
464int
0ef69138 465detect_coding_emacs_mule (src, src_end)
4ed46869
KH
466 unsigned char *src, *src_end;
467{
468 unsigned char c;
/* Nonzero while scanning the components that follow a composition
   leading code.  */
469 int composing = 0;
470
471 while (src < src_end)
472 {
473 c = *src++;
474
/* Inside a composition, component leading codes carry an extra 0x20
   (see the format description at the head of this section).  A byte
   below 0xA0 ends the composition; any other byte has the 0x20
   removed so it is classified like an ordinary leading code.  */
475 if (composing)
476 {
477 if (c < 0xA0)
478 composing = 0;
479 else
480 c -= 0x20;
481 }
482
483 switch (emacs_code_class[c])
484 {
485 case EMACS_ascii_code:
486 case EMACS_linefeed_code:
487 break;
488
/* ESC, SI and SO make the text be rejected as internal format.
   NOTE(review): presumably because they signal ISO2022 text --
   confirm.  */
489 case EMACS_control_code:
490 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
491 return 0;
492 break;
493
494 case EMACS_invalid_code:
495 return 0;
496
/* Reached while already composing only when the original byte was
   0xA0 (0x80 after the subtraction above), the marker of an ASCII
   component; one following byte of 0xA0 or greater is then required.
   Otherwise this byte starts a composition.  */
497 case EMACS_leading_code_composition: /* c == 0x80 */
498 if (composing)
499 CHECK_CODE_RANGE_A0_FF;
500 else
501 composing = 1;
502 break;
503
/* The three cases below deliberately fall through, so leading_code_4
   checks three following bytes, leading_code_3 two and
   leading_code_2 one.  */
504 case EMACS_leading_code_4:
505 CHECK_CODE_RANGE_A0_FF;
506 /* fall down to check it two more times ... */
507
508 case EMACS_leading_code_3:
509 CHECK_CODE_RANGE_A0_FF;
510 /* fall down to check it one more time ... */
511
512 case EMACS_leading_code_2:
513 CHECK_CODE_RANGE_A0_FF;
514 break;
515
/* `label_end_of_switch' is the jump target of CHECK_CODE_RANGE_A0_FF
   when the input ends in the middle of a sequence; the loop condition
   then fails, so a truncated trailing sequence is not treated as a
   contradiction.  */
516 default:
517 label_end_of_switch:
518 break;
519 }
520 }
0ef69138 521 return CODING_CATEGORY_MASK_EMACS_MULE;
4ed46869
KH
522}
523
524\f
525/*** 3. ISO2022 handlers ***/
526
527/* The following note describes the coding system ISO2022 briefly.
39787efd
KH
528 Since the intention of this note is to help understand the
529 functions in this file, some parts are NOT ACCURATE or OVERLY
530 SIMPLIFIED. For thorough understanding, please refer to the
4ed46869
KH
531 original document of ISO2022.
532
533 ISO2022 provides many mechanisms to encode several character sets
39787efd
KH
534 in 7-bit and 8-bit environments. For 7-bit environments, all text
535 is encoded using bytes less than 128. This may make the encoded
536 text a little bit longer, but the text passes more easily through
537 several gateways, some of which strip off MSB (Most Significant Bit).
538
539 There are two kinds of character sets: control character set and
4ed46869
KH
540 graphic character set. The former contains control characters such
541 as `newline' and `escape' to provide control functions (control
39787efd
KH
542 functions are also provided by escape sequences). The latter
543 contains graphic characters such as 'A' and '-'. Emacs recognizes
4ed46869
KH
544 two control character sets and many graphic character sets.
545
546 Graphic character sets are classified into one of the following
39787efd
KH
547 four classes, according to the number of bytes (DIMENSION) and
548 number of characters in one dimension (CHARS) of the set:
549 - DIMENSION1_CHARS94
550 - DIMENSION1_CHARS96
551 - DIMENSION2_CHARS94
552 - DIMENSION2_CHARS96
553
554 In addition, each character set is assigned an identification tag,
555 unique for each set, called "final character" (denoted as <F>
556 hereafter). The <F> of each character set is decided by ECMA(*)
557 when it is registered in ISO. The code range of <F> is 0x30..0x7F
558 (0x30..0x3F are for private use only).
4ed46869
KH
559
560 Note (*): ECMA = European Computer Manufacturers Association
561
562 Here are examples of graphic character set [NAME(<F>)]:
563 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
564 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
565 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
566 o DIMENSION2_CHARS96 -- none for the moment
567
39787efd 568 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
4ed46869
KH
569 C0 [0x00..0x1F] -- control character plane 0
570 GL [0x20..0x7F] -- graphic character plane 0
571 C1 [0x80..0x9F] -- control character plane 1
572 GR [0xA0..0xFF] -- graphic character plane 1
573
574 A control character set is directly designated and invoked to C0 or
39787efd
KH
575 C1 by an escape sequence. The most common case is that:
576 - ISO646's control character set is designated/invoked to C0, and
577 - ISO6429's control character set is designated/invoked to C1,
578 and usually these designations/invocations are omitted in encoded
579 text. In a 7-bit environment, only C0 can be used, and a control
580 character for C1 is encoded by an appropriate escape sequence to
581 fit into the environment. All control characters for C1 are
582 defined to have corresponding escape sequences.
4ed46869
KH
583
584 A graphic character set is at first designated to one of four
585 graphic registers (G0 through G3), then these graphic registers are
586 invoked to GL or GR. These designations and invocations can be
587 done independently. The most common case is that G0 is invoked to
39787efd
KH
588 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
589 these invocations and designations are omitted in encoded text.
590 In a 7-bit environment, only GL can be used.
4ed46869 591
39787efd
KH
592 When a graphic character set of CHARS94 is invoked to GL, codes
593 0x20 and 0x7F of the GL area work as control characters SPACE and
594 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
595 be used.
4ed46869
KH
596
597 There are two ways of invocation: locking-shift and single-shift.
598 With locking-shift, the invocation lasts until the next different
39787efd
KH
599 invocation, whereas with single-shift, the invocation affects the
600 following character only and doesn't affect the locking-shift
601 state. Invocations are done by the following control characters or
602 escape sequences:
4ed46869
KH
603
604 ----------------------------------------------------------------------
39787efd 605 abbrev function cntrl escape seq description
4ed46869 606 ----------------------------------------------------------------------
39787efd
KH
607 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
608 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
609 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
610 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
611 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
612 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
613 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
614 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
615 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
4ed46869 616 ----------------------------------------------------------------------
39787efd
KH
617 (*) These are not used by any known coding system.
618
619 Control characters for these functions are defined by macros
620 ISO_CODE_XXX in `coding.h'.
4ed46869 621
39787efd 622 Designations are done by the following escape sequences:
4ed46869
KH
623 ----------------------------------------------------------------------
624 escape sequence description
625 ----------------------------------------------------------------------
626 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
627 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
628 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
629 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
630 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
631 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
632 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
633 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
634 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
635 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
636 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
637 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
638 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
639 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
640 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
641 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
642 ----------------------------------------------------------------------
643
644 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
39787efd 645 of dimension 1, chars 94, and final character <F>, etc...
4ed46869
KH
646
647 Note (*): Although these designations are not allowed in ISO2022,
648 Emacs accepts them on decoding, and produces them on encoding
39787efd 649 CHARS96 character sets in a coding system which is characterized as
4ed46869
KH
650 7-bit environment, non-locking-shift, and non-single-shift.
651
652 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
39787efd 653 '(' can be omitted. We refer to this as "short-form" hereafter.
4ed46869
KH
654
655 Now you may notice that there are a lot of ways for encoding the
39787efd
KH
656 same multilingual text in ISO2022. Actually, there exist many
657 coding systems such as Compound Text (used in X11's inter client
658 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
659 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
4ed46869
KH
660 localized platforms), and all of these are variants of ISO2022.
661
662 In addition to the above, Emacs handles two more kinds of escape
663 sequences: ISO6429's direction specification and Emacs' private
664 sequence for specifying character composition.
665
39787efd 666 ISO6429's direction specification takes the following form:
4ed46869
KH
667 o CSI ']' -- end of the current direction
668 o CSI '0' ']' -- end of the current direction
669 o CSI '1' ']' -- start of left-to-right text
670 o CSI '2' ']' -- start of right-to-left text
671 The control character CSI (0x9B: control sequence introducer) is
39787efd
KH
672 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
673
674 Character composition specification takes the following form:
4ed46869
KH
675 o ESC '0' -- start character composition
676 o ESC '1' -- end character composition
39787efd
KH
677 Since these are not standard escape sequences of any ISO standard,
678 the use of them for these meanings is restricted to Emacs only. */
4ed46869
KH
679
680enum iso_code_class_type iso_code_class[256];
681
f024b6aa
RS
/* Nonzero if the coding system registered for category index IDX
   exists (coding_system_table[IDX] is non-null) and accepts CHARSET,
   i.e. either its safe_charsets[CHARSET] entry is nonzero or
   CODING_SPEC_ISO_REQUESTED_DESIGNATION for CHARSET differs from
   CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION.  IDX and CHARSET are
   evaluated more than once.  */
682#define CHARSET_OK(idx, charset) \
683 (coding_system_table[idx] \
684 && (coding_system_table[idx]->safe_charsets[charset] \
685 || (CODING_SPEC_ISO_REQUESTED_DESIGNATION \
686 (coding_system_table[idx], charset) \
687 != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)))
d46c5b12
KH
688
/* Nonzero if CODING_SPEC_ISO_INITIAL_DESIGNATION yields a
   non-negative value for index 1 (presumably graphic register G1 --
   confirm against coding.h) of the coding system registered for
   category index IDX.
   NOTE(review): unlike CHARSET_OK, this does not test
   coding_system_table[IDX] for null before using it -- confirm that
   callers guarantee the entry is set.  */
689#define SHIFT_OUT_OK(idx) \
690 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
691
4ed46869
KH
692/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
693 Check if a text is encoded in ISO2022. If it is, returns an
694 integer in which appropriate flag bits any of:
695 CODING_CATEGORY_MASK_ISO_7
d46c5b12 696 CODING_CATEGORY_MASK_ISO_7_TIGHT
4ed46869
KH
697 CODING_CATEGORY_MASK_ISO_8_1
698 CODING_CATEGORY_MASK_ISO_8_2
7717c392
KH
699 CODING_CATEGORY_MASK_ISO_7_ELSE
700 CODING_CATEGORY_MASK_ISO_8_ELSE
4ed46869
KH
701 are set. If a code which should never appear in ISO2022 is found,
702 returns 0. */
703
704int
705detect_coding_iso2022 (src, src_end)
706 unsigned char *src, *src_end;
707{
d46c5b12
KH
708 int mask = CODING_CATEGORY_MASK_ISO;
709 int mask_found = 0;
f46869e4 710 int reg[4], shift_out = 0, single_shifting = 0;
d46c5b12 711 int c, c1, i, charset;
3f003981 712
d46c5b12 713 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
3f003981 714 while (mask && src < src_end)
4ed46869
KH
715 {
716 c = *src++;
717 switch (c)
718 {
719 case ISO_CODE_ESC:
f46869e4 720 single_shifting = 0;
e0e989f6 721 if (src >= src_end)
4ed46869
KH
722 break;
723 c = *src++;
d46c5b12 724 if (c >= '(' && c <= '/')
4ed46869 725 {
bf9cdd4e
KH
726 /* Designation sequence for a charset of dimension 1. */
727 if (src >= src_end)
728 break;
d46c5b12
KH
729 c1 = *src++;
730 if (c1 < ' ' || c1 >= 0x80
731 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
732 /* Invalid designation sequence. Just ignore. */
733 break;
734 reg[(c - '(') % 4] = charset;
bf9cdd4e
KH
735 }
736 else if (c == '$')
737 {
738 /* Designation sequence for a charset of dimension 2. */
739 if (src >= src_end)
740 break;
741 c = *src++;
742 if (c >= '@' && c <= 'B')
743 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
d46c5b12 744 reg[0] = charset = iso_charset_table[1][0][c];
bf9cdd4e 745 else if (c >= '(' && c <= '/')
bcf26d6a 746 {
bf9cdd4e
KH
747 if (src >= src_end)
748 break;
d46c5b12
KH
749 c1 = *src++;
750 if (c1 < ' ' || c1 >= 0x80
751 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
752 /* Invalid designation sequence. Just ignore. */
753 break;
754 reg[(c - '(') % 4] = charset;
bcf26d6a 755 }
bf9cdd4e 756 else
d46c5b12
KH
757 /* Invalid designation sequence. Just ignore. */
758 break;
759 }
ae9ff118 760 else if (c == 'N' || c == 'O')
d46c5b12 761 {
ae9ff118
KH
762 /* ESC <Fe> for SS2 or SS3. */
763 mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
d46c5b12 764 break;
4ed46869 765 }
bf9cdd4e 766 else if (c == '0' || c == '1' || c == '2')
ae9ff118 767 /* ESC <Fp> for start/end composition. Just ignore. */
d46c5b12 768 break;
bf9cdd4e 769 else
d46c5b12
KH
770 /* Invalid escape sequence. Just ignore. */
771 break;
772
773 /* We found a valid designation sequence for CHARSET. */
774 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
775 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset))
776 mask_found |= CODING_CATEGORY_MASK_ISO_7;
777 else
778 mask &= ~CODING_CATEGORY_MASK_ISO_7;
779 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset))
780 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
781 else
782 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
ae9ff118
KH
783 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset))
784 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
785 else
d46c5b12 786 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
ae9ff118
KH
787 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset))
788 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
789 else
d46c5b12 790 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
4ed46869
KH
791 break;
792
4ed46869 793 case ISO_CODE_SO:
f46869e4 794 single_shifting = 0;
d46c5b12
KH
795 if (shift_out == 0
796 && (reg[1] >= 0
797 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
798 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
799 {
800 /* Locking shift out. */
801 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
802 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
803 }
e0e989f6
KH
804 break;
805
d46c5b12 806 case ISO_CODE_SI:
f46869e4 807 single_shifting = 0;
d46c5b12
KH
808 if (shift_out == 1)
809 {
810 /* Locking shift in. */
811 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
812 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
813 }
814 break;
815
4ed46869 816 case ISO_CODE_CSI:
f46869e4 817 single_shifting = 0;
4ed46869
KH
818 case ISO_CODE_SS2:
819 case ISO_CODE_SS3:
3f003981
KH
820 {
821 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
822
70c22245
KH
823 if (c != ISO_CODE_CSI)
824 {
d46c5b12
KH
825 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
826 & CODING_FLAG_ISO_SINGLE_SHIFT)
70c22245 827 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
828 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
829 & CODING_FLAG_ISO_SINGLE_SHIFT)
70c22245 830 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
f46869e4 831 single_shifting = 1;
70c22245 832 }
3f003981
KH
833 if (VECTORP (Vlatin_extra_code_table)
834 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
835 {
d46c5b12
KH
836 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
837 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981 838 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
839 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
840 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981
KH
841 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
842 }
843 mask &= newmask;
d46c5b12 844 mask_found |= newmask;
3f003981
KH
845 }
846 break;
4ed46869
KH
847
848 default:
849 if (c < 0x80)
f46869e4
KH
850 {
851 single_shifting = 0;
852 break;
853 }
4ed46869 854 else if (c < 0xA0)
c4825358 855 {
f46869e4 856 single_shifting = 0;
3f003981
KH
857 if (VECTORP (Vlatin_extra_code_table)
858 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
c4825358 859 {
3f003981
KH
860 int newmask = 0;
861
d46c5b12
KH
862 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
863 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981 864 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
865 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
866 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981
KH
867 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
868 mask &= newmask;
d46c5b12 869 mask_found |= newmask;
c4825358 870 }
3f003981
KH
871 else
872 return 0;
c4825358 873 }
4ed46869
KH
874 else
875 {
7717c392 876 unsigned char *src_begin = src;
4ed46869 877
d46c5b12 878 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
7717c392 879 | CODING_CATEGORY_MASK_ISO_7_ELSE);
d46c5b12 880 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
f46869e4
KH
881 /* Check the length of succeeding codes of the range
882 0xA0..0FF. If the byte length is odd, we exclude
883 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
884 when we are not single shifting. */
885 if (!single_shifting)
886 {
887 while (src < src_end && *src >= 0xA0)
888 src++;
889 if ((src - src_begin - 1) & 1 && src < src_end)
890 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
891 else
892 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
893 }
4ed46869
KH
894 }
895 break;
896 }
897 }
898
d46c5b12 899 return (mask & mask_found);
4ed46869
KH
900}
901
/* Decode a character of which charset is CHARSET and the 1st position
   code is C1.  If dimension of CHARSET is 2, the 2nd position code is
   fetched from SRC and set to C2.  If CHARSET is negative, it means
   that we are decoding ill formed text, and what we can do is just to
   read C1 as is.

   The expansion refers to the variables `coding', `src', `dst', `c2'
   and `translation_table' of the expanding function.  If the second
   byte of a 2-dimension charset is not a graphic code, it is pushed
   back (SRC is decremented) and C1 is decoded as ASCII.  When
   `translation_table' is non-nil and translate_char returns a
   non-negative character, CHARSET, C1 and C2 are replaced by the
   components of that character, so C1 must be an lvalue.  */

#define DECODE_ISO_CHARACTER(charset, c1) \
  do { \
    int c_alt, charset_alt = (charset); \
    if (COMPOSING_HEAD_P (coding->composing)) \
      { \
        *dst++ = LEADING_CODE_COMPOSITION; \
        if (COMPOSING_WITH_RULE_P (coding->composing)) \
          /* To tell composition rules are embedded.  */ \
          *dst++ = 0xFF; \
        coding->composing += 2; \
      } \
    if (charset_alt >= 0) \
      { \
        if (CHARSET_DIMENSION (charset_alt) == 2) \
          { \
            ONE_MORE_BYTE (c2); \
            if (iso_code_class[(c2) & 0x7F] != ISO_0x20_or_0x7F \
                && iso_code_class[(c2) & 0x7F] != ISO_graphic_plane_0) \
              { \
                /* Not a valid 2nd byte; give it back to the source.  */ \
                src--; \
                charset_alt = CHARSET_ASCII; \
              } \
          } \
        if (!NILP (translation_table) \
            && ((c_alt = translate_char (translation_table, \
                                         -1, charset_alt, c1, c2)) >= 0)) \
          SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
      } \
    if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
      DECODE_CHARACTER_ASCII (c1); \
    else if (CHARSET_DIMENSION (charset_alt) == 1) \
      DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
    else \
      DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
    if (COMPOSING_WITH_RULE_P (coding->composing)) \
      /* To tell a composition rule follows.  */ \
      coding->composing = COMPOSING_WITH_RULE_RULE; \
  } while (0)
946
/* Set designation state into CODING.

   Designate to graphic register REG the charset identified by
   DIMENSION, CHARS and FINAL_CHAR.  The designation is accepted only
   if the charset is known and either requests REG in CODING or is
   marked in `coding->safe_charsets'.  Otherwise REG is remembered in
   `last_invalid_designation_register' and control jumps to
   `label_invalid_code' of the expanding function.

   A designation of ASCII to register 0 that directly follows an
   invalid designation of register 0 is also sent to
   `label_invalid_code', so that it is kept in the output as is.  */

#define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
  do { \
    int charset; \
    \
    if ((final_char) < '0' || (final_char) >= 128) \
      goto label_invalid_code; \
    charset = ISO_CHARSET_TABLE (make_number (dimension), \
                                 make_number (chars), \
                                 make_number (final_char)); \
    if (charset >= 0 \
        && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == (reg) \
            || coding->safe_charsets[charset])) \
      { \
        if (coding->spec.iso2022.last_invalid_designation_register == 0 \
            && (reg) == 0 \
            && charset == CHARSET_ASCII) \
          { \
            /* We should insert this designation sequence as is so \
               that it is surely written back to a file.  */ \
            coding->spec.iso2022.last_invalid_designation_register = -1; \
            goto label_invalid_code; \
          } \
        coding->spec.iso2022.last_invalid_designation_register = -1; \
        if ((coding->mode & CODING_MODE_DIRECTION) \
            && CHARSET_REVERSE_CHARSET (charset) >= 0) \
          charset = CHARSET_REVERSE_CHARSET (charset); \
        CODING_SPEC_ISO_DESIGNATION (coding, (reg)) = charset; \
      } \
    else \
      { \
        coding->spec.iso2022.last_invalid_designation_register = (reg); \
        goto label_invalid_code; \
      } \
  } while (0)
982
88993dfd
KH
983/* Return 0 if there's a valid composing sequence starting at SRC and
984 ending before SRC_END, else return -1. */
d46c5b12 985
84fbb8a0
KH
986int
987check_composing_code (coding, src, src_end)
d46c5b12
KH
988 struct coding_system *coding;
989 unsigned char *src, *src_end;
990{
d46c5b12
KH
991 int charset, c, c1, dim;
992
993 while (src < src_end)
994 {
88993dfd
KH
995 c = *src++;
996 if (c >= 0x20)
997 continue;
998 if (c != ISO_CODE_ESC || src >= src_end)
999 return -1;
1000 c = *src++;
1001 if (c == '1') /* end of compsition */
1002 return 0;
1003 if (src + 2 >= src_end
1004 || !coding->flags & CODING_FLAG_ISO_DESIGNATION)
1005 return -1;
1006
1007 dim = (c == '$');
1008 if (dim == 1)
1009 c = (*src >= '@' && *src <= 'B') ? '(' : *src++;
1010 if (c >= '(' && c <= '/')
d46c5b12 1011 {
88993dfd
KH
1012 c1 = *src++;
1013 if ((c1 < ' ' || c1 >= 0x80)
1014 || (charset = iso_charset_table[dim][c >= ','][c1]) < 0
1015 || ! coding->safe_charsets[charset]
1016 || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
1017 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1018 return -1;
d46c5b12 1019 }
88993dfd
KH
1020 else
1021 return -1;
d46c5b12 1022 }
88993dfd
KH
1023
1024 /* We have not found the sequence "ESC 1". */
1025 return -1;
d46c5b12
KH
1026}
1027
4ed46869
KH
1028/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1029
1030int
d46c5b12 1031decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
1032 struct coding_system *coding;
1033 unsigned char *source, *destination;
1034 int src_bytes, dst_bytes;
4ed46869
KH
1035{
1036 unsigned char *src = source;
1037 unsigned char *src_end = source + src_bytes;
1038 unsigned char *dst = destination;
1039 unsigned char *dst_end = destination + dst_bytes;
1040 /* Since the maximum bytes produced by each loop is 7, we subtract 6
1041 from DST_END to assure that overflow checking is necessary only
1042 at the head of loop. */
1043 unsigned char *adjusted_dst_end = dst_end - 6;
1044 int charset;
1045 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1046 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1047 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
84fbb8a0 1048 Lisp_Object translation_table
f967223b 1049 = coding->translation_table_for_decode;
d46c5b12 1050 int result = CODING_FINISH_NORMAL;
bdd9fb48 1051
84fbb8a0 1052 if (!NILP (Venable_character_translation) && NILP (translation_table))
f967223b 1053 translation_table = Vstandard_translation_table_for_decode;
4ed46869 1054
d46c5b12 1055 coding->produced_char = 0;
fb88bf2d 1056 coding->fake_multibyte = 0;
d46c5b12
KH
1057 while (src < src_end && (dst_bytes
1058 ? (dst < adjusted_dst_end)
1059 : (dst < src - 6)))
4ed46869
KH
1060 {
1061 /* SRC_BASE remembers the start position in source in each loop.
1062 The loop will be exited when there's not enough source text
1063 to analyze long escape sequence or 2-byte code (within macros
1064 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
1065 to SRC_BASE before exiting. */
1066 unsigned char *src_base = src;
bdd9fb48 1067 int c1 = *src++, c2;
4ed46869
KH
1068
1069 switch (iso_code_class [c1])
1070 {
1071 case ISO_0x20_or_0x7F:
1072 if (!coding->composing
1073 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
1074 {
1075 /* This is SPACE or DEL. */
1076 *dst++ = c1;
d46c5b12 1077 coding->produced_char++;
4ed46869
KH
1078 break;
1079 }
1080 /* This is a graphic character, we fall down ... */
1081
1082 case ISO_graphic_plane_0:
1083 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1084 {
1085 /* This is a composition rule. */
1086 *dst++ = c1 | 0x80;
1087 coding->composing = COMPOSING_WITH_RULE_TAIL;
1088 }
1089 else
1090 DECODE_ISO_CHARACTER (charset0, c1);
1091 break;
1092
1093 case ISO_0xA0_or_0xFF:
d46c5b12
KH
1094 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1095 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
fb88bf2d 1096 goto label_invalid_code;
4ed46869
KH
1097 /* This is a graphic character, we fall down ... */
1098
1099 case ISO_graphic_plane_1:
d46c5b12 1100 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
fb88bf2d 1101 goto label_invalid_code;
d46c5b12
KH
1102 else
1103 DECODE_ISO_CHARACTER (charset1, c1);
4ed46869
KH
1104 break;
1105
1106 case ISO_control_code:
1107 /* All ISO2022 control characters in this class have the
1108 same representation in Emacs internal format. */
d46c5b12
KH
1109 if (c1 == '\n'
1110 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1111 && (coding->eol_type == CODING_EOL_CR
1112 || coding->eol_type == CODING_EOL_CRLF))
1113 {
1114 result = CODING_FINISH_INCONSISTENT_EOL;
1115 goto label_end_of_loop_2;
1116 }
4ed46869 1117 *dst++ = c1;
d46c5b12 1118 coding->produced_char++;
174a4cbe
KH
1119 if (c1 >= 0x80)
1120 coding->fake_multibyte = 1;
4ed46869
KH
1121 break;
1122
1123 case ISO_carriage_return:
1124 if (coding->eol_type == CODING_EOL_CR)
d46c5b12 1125 *dst++ = '\n';
4ed46869
KH
1126 else if (coding->eol_type == CODING_EOL_CRLF)
1127 {
1128 ONE_MORE_BYTE (c1);
1129 if (c1 == ISO_CODE_LF)
1130 *dst++ = '\n';
1131 else
1132 {
d46c5b12
KH
1133 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1134 {
1135 result = CODING_FINISH_INCONSISTENT_EOL;
1136 goto label_end_of_loop_2;
1137 }
4ed46869 1138 src--;
d46c5b12 1139 *dst++ = '\r';
4ed46869
KH
1140 }
1141 }
1142 else
d46c5b12
KH
1143 *dst++ = c1;
1144 coding->produced_char++;
4ed46869
KH
1145 break;
1146
1147 case ISO_shift_out:
d46c5b12
KH
1148 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1149 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1150 goto label_invalid_code;
4ed46869
KH
1151 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1152 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1153 break;
1154
1155 case ISO_shift_in:
d46c5b12
KH
1156 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1157 goto label_invalid_code;
4ed46869
KH
1158 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1159 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1160 break;
1161
1162 case ISO_single_shift_2_7:
1163 case ISO_single_shift_2:
d46c5b12
KH
1164 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1165 goto label_invalid_code;
4ed46869
KH
1166 /* SS2 is handled as an escape sequence of ESC 'N' */
1167 c1 = 'N';
1168 goto label_escape_sequence;
1169
1170 case ISO_single_shift_3:
d46c5b12
KH
1171 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1172 goto label_invalid_code;
4ed46869
KH
1173 /* SS2 is handled as an escape sequence of ESC 'O' */
1174 c1 = 'O';
1175 goto label_escape_sequence;
1176
1177 case ISO_control_sequence_introducer:
1178 /* CSI is handled as an escape sequence of ESC '[' ... */
1179 c1 = '[';
1180 goto label_escape_sequence;
1181
1182 case ISO_escape:
1183 ONE_MORE_BYTE (c1);
1184 label_escape_sequence:
1185 /* Escape sequences handled by Emacs are invocation,
1186 designation, direction specification, and character
1187 composition specification. */
1188 switch (c1)
1189 {
1190 case '&': /* revision of following character set */
1191 ONE_MORE_BYTE (c1);
1192 if (!(c1 >= '@' && c1 <= '~'))
d46c5b12 1193 goto label_invalid_code;
4ed46869
KH
1194 ONE_MORE_BYTE (c1);
1195 if (c1 != ISO_CODE_ESC)
d46c5b12 1196 goto label_invalid_code;
4ed46869
KH
1197 ONE_MORE_BYTE (c1);
1198 goto label_escape_sequence;
1199
1200 case '$': /* designation of 2-byte character set */
d46c5b12
KH
1201 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1202 goto label_invalid_code;
4ed46869
KH
1203 ONE_MORE_BYTE (c1);
1204 if (c1 >= '@' && c1 <= 'B')
1205 { /* designation of JISX0208.1978, GB2312.1980,
88993dfd 1206 or JISX0208.1980 */
4ed46869
KH
1207 DECODE_DESIGNATION (0, 2, 94, c1);
1208 }
1209 else if (c1 >= 0x28 && c1 <= 0x2B)
1210 { /* designation of DIMENSION2_CHARS94 character set */
1211 ONE_MORE_BYTE (c2);
1212 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1213 }
1214 else if (c1 >= 0x2C && c1 <= 0x2F)
1215 { /* designation of DIMENSION2_CHARS96 character set */
1216 ONE_MORE_BYTE (c2);
1217 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1218 }
1219 else
d46c5b12 1220 goto label_invalid_code;
4ed46869
KH
1221 break;
1222
1223 case 'n': /* invocation of locking-shift-2 */
d46c5b12
KH
1224 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1225 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1226 goto label_invalid_code;
4ed46869 1227 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
e0e989f6 1228 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
1229 break;
1230
1231 case 'o': /* invocation of locking-shift-3 */
d46c5b12
KH
1232 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1233 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1234 goto label_invalid_code;
4ed46869 1235 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
e0e989f6 1236 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
1237 break;
1238
1239 case 'N': /* invocation of single-shift-2 */
d46c5b12
KH
1240 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1241 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1242 goto label_invalid_code;
4ed46869
KH
1243 ONE_MORE_BYTE (c1);
1244 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1245 DECODE_ISO_CHARACTER (charset, c1);
1246 break;
1247
1248 case 'O': /* invocation of single-shift-3 */
d46c5b12
KH
1249 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1250 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1251 goto label_invalid_code;
4ed46869
KH
1252 ONE_MORE_BYTE (c1);
1253 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1254 DECODE_ISO_CHARACTER (charset, c1);
1255 break;
1256
d46c5b12
KH
1257 case '0': case '2': /* start composing */
1258 /* Before processing composing, we must be sure that all
1259 characters being composed are supported by CODING.
88993dfd
KH
1260 If not, we must give up composing. */
1261 if (check_composing_code (coding, src, src_end) == 0)
1262 {
1263 /* We are looking at a valid composition sequence. */
1264 coding->composing = (c1 == '0'
1265 ? COMPOSING_NO_RULE_HEAD
1266 : COMPOSING_WITH_RULE_HEAD);
1267 coding->composed_chars = 0;
1268 }
1269 else
1270 {
1271 *dst++ = ISO_CODE_ESC;
1272 *dst++ = c1;
1273 coding->produced_char += 2;
1274 }
4ed46869
KH
1275 break;
1276
1277 case '1': /* end composing */
88993dfd
KH
1278 if (!coding->composing)
1279 {
1280 *dst++ = ISO_CODE_ESC;
1281 *dst++ = c1;
1282 coding->produced_char += 2;
1283 break;
1284 }
1285
de79a6a5
KH
1286 if (coding->composed_chars > 0)
1287 {
1288 if (coding->composed_chars == 1)
1289 {
1290 unsigned char *this_char_start = dst;
1291 int this_bytes;
1292
1293 /* Only one character is in the composing
1294 sequence. Make it a normal character. */
1295 while (*--this_char_start != LEADING_CODE_COMPOSITION);
1296 dst = (this_char_start
1297 + (coding->composing == COMPOSING_NO_RULE_TAIL
1298 ? 1 : 2));
1299 *dst -= 0x20;
1300 if (*dst == 0x80)
1301 *++dst &= 0x7F;
1302 this_bytes = BYTES_BY_CHAR_HEAD (*dst);
1303 while (this_bytes--) *this_char_start++ = *dst++;
1304 dst = this_char_start;
1305 }
1306 coding->produced_char++;
1307 }
4ed46869 1308 coding->composing = COMPOSING_NO;
4ed46869
KH
1309 break;
1310
1311 case '[': /* specification of direction */
d46c5b12
KH
1312 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1313 goto label_invalid_code;
4ed46869 1314 /* For the moment, nested direction is not supported.
d46c5b12
KH
1315 So, `coding->mode & CODING_MODE_DIRECTION' zero means
1316 left-to-right, and nozero means right-to-left. */
4ed46869
KH
1317 ONE_MORE_BYTE (c1);
1318 switch (c1)
1319 {
1320 case ']': /* end of the current direction */
d46c5b12 1321 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869
KH
1322
1323 case '0': /* end of the current direction */
1324 case '1': /* start of left-to-right direction */
1325 ONE_MORE_BYTE (c1);
1326 if (c1 == ']')
d46c5b12 1327 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869 1328 else
d46c5b12 1329 goto label_invalid_code;
4ed46869
KH
1330 break;
1331
1332 case '2': /* start of right-to-left direction */
1333 ONE_MORE_BYTE (c1);
1334 if (c1 == ']')
d46c5b12 1335 coding->mode |= CODING_MODE_DIRECTION;
4ed46869 1336 else
d46c5b12 1337 goto label_invalid_code;
4ed46869
KH
1338 break;
1339
1340 default:
d46c5b12 1341 goto label_invalid_code;
4ed46869
KH
1342 }
1343 break;
1344
1345 default:
d46c5b12
KH
1346 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1347 goto label_invalid_code;
4ed46869
KH
1348 if (c1 >= 0x28 && c1 <= 0x2B)
1349 { /* designation of DIMENSION1_CHARS94 character set */
1350 ONE_MORE_BYTE (c2);
1351 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1352 }
1353 else if (c1 >= 0x2C && c1 <= 0x2F)
1354 { /* designation of DIMENSION1_CHARS96 character set */
1355 ONE_MORE_BYTE (c2);
1356 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1357 }
1358 else
1359 {
d46c5b12 1360 goto label_invalid_code;
4ed46869
KH
1361 }
1362 }
1363 /* We must update these variables now. */
1364 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1365 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1366 break;
1367
d46c5b12 1368 label_invalid_code:
d46c5b12
KH
1369 while (src_base < src)
1370 *dst++ = *src_base++;
fb88bf2d 1371 coding->fake_multibyte = 1;
4ed46869
KH
1372 }
1373 continue;
1374
1375 label_end_of_loop:
d46c5b12
KH
1376 result = CODING_FINISH_INSUFFICIENT_SRC;
1377 label_end_of_loop_2:
4ed46869
KH
1378 src = src_base;
1379 break;
1380 }
1381
fb88bf2d 1382 if (src < src_end)
4ed46869 1383 {
fb88bf2d
KH
1384 if (result == CODING_FINISH_NORMAL)
1385 result = CODING_FINISH_INSUFFICIENT_DST;
1386 else if (result != CODING_FINISH_INCONSISTENT_EOL
1387 && coding->mode & CODING_MODE_LAST_BLOCK)
1388 {
1389 /* This is the last block of the text to be decoded. We had
1390 better just flush out all remaining codes in the text
1391 although they are not valid characters. */
1392 src_bytes = src_end - src;
1393 if (dst_bytes && (dst_end - dst < src_bytes))
1394 src_bytes = dst_end - dst;
1395 bcopy (src, dst, src_bytes);
1396 dst += src_bytes;
1397 src += src_bytes;
1398 coding->fake_multibyte = 1;
1399 }
4ed46869 1400 }
fb88bf2d 1401
d46c5b12
KH
1402 coding->consumed = coding->consumed_char = src - source;
1403 coding->produced = dst - destination;
1404 return result;
4ed46869
KH
1405}
1406
/* ISO2022 encoding stuff.  */

/*
   It is not enough to say just "ISO2022" on encoding, we have to
   specify more details.  In Emacs, each coding system of ISO2022
   variant has the following specifications:
	1. Initial designation to G0 thru G3.
	2. Allows short-form designation?
	3. ASCII should be designated to G0 before control characters?
	4. ASCII should be designated to G0 at end of line?
	5. 7-bit environment or 8-bit environment?
	6. Use locking-shift?
	7. Use Single-shift?
   And the following two are only for Japanese:
	8. Use ASCII in place of JIS0201-1976-Roman?
	9. Use JISX0208-1983 in place of JISX0208-1978?
   These specifications are encoded in `coding->flags' as flag bits
   defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
   details.
*/
1427
/* Produce codes (escape sequence) for designating CHARSET to graphic
   register REG.  If <final-char> of CHARSET is '@', 'A', or 'B' and
   the coding system CODING allows, produce designation sequence of
   short-form.

   If the revision number of CHARSET in CODING is less than 255, the
   sequence is preceded by "ESC & <'@' + revision>".  The bytes are
   written through the variable `dst' of the expanding function, and
   the designation is recorded in CODING.  */

#define ENCODE_DESIGNATION(charset, reg, coding) \
  do { \
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \
    char *intermediate_char_94 = "()*+"; \
    char *intermediate_char_96 = ",-./"; \
    int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset); \
    if (revision < 255) \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = '&'; \
        *dst++ = '@' + revision; \
      } \
    *dst++ = ISO_CODE_ESC; \
    if (CHARSET_DIMENSION (charset) == 1) \
      { \
        if (CHARSET_CHARS (charset) == 94) \
          *dst++ = (unsigned char) (intermediate_char_94[reg]); \
        else \
          *dst++ = (unsigned char) (intermediate_char_96[reg]); \
      } \
    else \
      { \
        *dst++ = '$'; \
        if (CHARSET_CHARS (charset) == 94) \
          { \
            /* The intermediate character is omitted only for the \
               short form.  */ \
            if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
                || (reg) != 0 \
                || final_char < '@' || final_char > 'B') \
              *dst++ = (unsigned char) (intermediate_char_94[reg]); \
          } \
        else \
          *dst++ = (unsigned char) (intermediate_char_96[reg]); \
      } \
    *dst++ = final_char; \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
  } while (0)
1469
/* The following two macros produce codes (control character or escape
   sequence) for ISO2022 single-shift functions (single-shift-2 and
   single-shift-3).  In a 7-bit coding system the escape sequence form
   is used; otherwise the 8-bit control character is written and
   `coding->fake_multibyte' is set.  Both macros refer to the variables
   `coding' and `dst' of the expanding function and mark CODING as
   being in single-shifting state.  */

#define ENCODE_SINGLE_SHIFT_2 \
  do { \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
      *dst++ = ISO_CODE_ESC, *dst++ = 'N'; \
    else \
      { \
        *dst++ = ISO_CODE_SS2; \
        coding->fake_multibyte = 1; \
      } \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3 \
  do { \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
      *dst++ = ISO_CODE_ESC, *dst++ = 'O'; \
    else \
      { \
        *dst++ = ISO_CODE_SS3; \
        coding->fake_multibyte = 1; \
      } \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
  } while (0)
1497
/* The following four macros produce codes (control character or
   escape sequence) for ISO2022 locking-shift functions (shift-in,
   shift-out, locking-shift-2, and locking-shift-3).  Each one writes
   through the variable `dst' of the expanding function and records in
   `coding' which graphic register is now invoked to graphic plane 0.  */

#define ENCODE_SHIFT_IN \
  do { \
    *dst++ = ISO_CODE_SI; \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
  } while (0)

#define ENCODE_SHIFT_OUT \
  do { \
    *dst++ = ISO_CODE_SO; \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2 \
  do { \
    *dst++ = ISO_CODE_ESC, *dst++ = 'n'; \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3 \
  do { \
    *dst++ = ISO_CODE_ESC, *dst++ = 'o'; \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
  } while (0)
1525
f4dee582
RS
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   The macro loops until the character has been written: when CHARSET
   is invoked to neither graphic plane, encode_invocation_designation
   is called and the tests are repeated.  If CODING has
   CODING_FLAG_ISO_SAFE and CHARSET is not marked in
   `coding->safe_charsets', CODING_INHIBIT_CHARACTER_SUBSTITUTION is
   written instead (twice when CHARSET_WIDTH is 2).  The variables
   `coding' and `dst' of the expanding function are used.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
  do { \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
      { \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
          *dst++ = (c1) & 0x7F; \
        else \
          *dst++ = (c1) | 0x80; \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
        break; \
      } \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
      { \
        *dst++ = (c1) & 0x7F; \
        break; \
      } \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
      { \
        *dst++ = (c1) | 0x80; \
        break; \
      } \
    else if (coding->flags & CODING_FLAG_ISO_SAFE \
             && !coding->safe_charsets[charset]) \
      { \
        /* We should not encode this character, instead produce one or \
           two `?'s.  */ \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
        if (CHARSET_WIDTH (charset) == 2) \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
        break; \
      } \
    else \
      /* Since CHARSET is not yet invoked to any graphic planes, we \
         must invoke it, or, at first, designate it to some graphic \
         register.  Then repeat the loop to actually produce the \
         character.  */ \
      dst = encode_invocation_designation (charset, coding, dst); \
  } while (1)
1569
f4dee582
RS
/* Produce codes for a DIMENSION2 character whose character set is
   CHARSET and whose position-codes are C1 and C2.  Designation and
   invocation codes are also produced in advance if necessary.

   The macro loops until the character has been written: when CHARSET
   is invoked to neither graphic plane, encode_invocation_designation
   is called and the tests are repeated.  If CODING has
   CODING_FLAG_ISO_SAFE and CHARSET is not marked in
   `coding->safe_charsets', CODING_INHIBIT_CHARACTER_SUBSTITUTION is
   written instead (twice when CHARSET_WIDTH is 2).  The variables
   `coding' and `dst' of the expanding function are used.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
  do { \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
      { \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
          *dst++ = (c1) & 0x7F, *dst++ = (c2) & 0x7F; \
        else \
          *dst++ = (c1) | 0x80, *dst++ = (c2) | 0x80; \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
        break; \
      } \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
      { \
        *dst++ = (c1) & 0x7F, *dst++ = (c2) & 0x7F; \
        break; \
      } \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
      { \
        *dst++ = (c1) | 0x80, *dst++ = (c2) | 0x80; \
        break; \
      } \
    else if (coding->flags & CODING_FLAG_ISO_SAFE \
             && !coding->safe_charsets[charset]) \
      { \
        /* We should not encode this character, instead produce one or \
           two `?'s.  */ \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
        if (CHARSET_WIDTH (charset) == 2) \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
        break; \
      } \
    else \
      /* Since CHARSET is not yet invoked to any graphic planes, we \
         must invoke it, or, at first, designate it to some graphic \
         register.  Then repeat the loop to actually produce the \
         character.  */ \
      dst = encode_invocation_designation (charset, coding, dst); \
  } while (1)
1612
6f551029
KH
/* Produce codes for the character whose charset is CHARSET and whose
   position-codes are C1 and C2, after translating it through
   `translation_table' when that is non-nil.  C1 and C2 must be
   lvalues because a successful translation overwrites them.

   If the resulting charset is not defined, the charset value and the
   position codes themselves are written as raw bytes (masked to 7
   bits in a 7-bit coding system; C2 only if it is nonzero).
   `coding->consumed_char' is incremented unless a composition is in
   progress.  The variables `coding', `dst' and `translation_table' of
   the expanding function are used.  */

#define ENCODE_ISO_CHARACTER(charset, c1, c2) \
  do { \
    int c_alt, charset_alt; \
    if (!NILP (translation_table) \
        && ((c_alt = translate_char (translation_table, -1, \
                                     charset, c1, c2)) \
            >= 0)) \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
    else \
      charset_alt = charset; \
    if (CHARSET_DEFINED_P (charset_alt)) \
      { \
        if (CHARSET_DIMENSION (charset_alt) == 1) \
          { \
            if (charset == CHARSET_ASCII \
                && coding->flags & CODING_FLAG_ISO_USE_ROMAN) \
              charset_alt = charset_latin_jisx0201; \
            ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1); \
          } \
        else \
          { \
            if (charset == charset_jisx0208 \
                && coding->flags & CODING_FLAG_ISO_USE_OLDJIS) \
              charset_alt = charset_jisx0208_1978; \
            ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
          } \
      } \
    else \
      { \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
          { \
            *dst++ = charset & 0x7f; \
            *dst++ = c1 & 0x7f; \
            if (c2) \
              *dst++ = c2 & 0x7f; \
          } \
        else \
          { \
            *dst++ = charset; \
            *dst++ = c1; \
            if (c2) \
              *dst++ = c2; \
          } \
      } \
    if (! COMPOSING_P (coding->composing)) \
      coding->consumed_char++; \
  } while (0)
bdd9fb48 1660
4ed46869
KH
1661/* Produce designation and invocation codes at a place pointed by DST
1662 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1663 Return new DST. */
1664
1665unsigned char *
1666encode_invocation_designation (charset, coding, dst)
1667 int charset;
1668 struct coding_system *coding;
1669 unsigned char *dst;
1670{
1671 int reg; /* graphic register number */
1672
1673 /* At first, check designations. */
1674 for (reg = 0; reg < 4; reg++)
1675 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1676 break;
1677
1678 if (reg >= 4)
1679 {
1680 /* CHARSET is not yet designated to any graphic registers. */
1681 /* At first check the requested designation. */
1682 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1ba9e4ab
KH
1683 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1684 /* Since CHARSET requests no special designation, designate it
1685 to graphic register 0. */
4ed46869
KH
1686 reg = 0;
1687
1688 ENCODE_DESIGNATION (charset, reg, coding);
1689 }
1690
1691 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1692 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1693 {
1694 /* Since the graphic register REG is not invoked to any graphic
1695 planes, invoke it to graphic plane 0. */
1696 switch (reg)
1697 {
1698 case 0: /* graphic register 0 */
1699 ENCODE_SHIFT_IN;
1700 break;
1701
1702 case 1: /* graphic register 1 */
1703 ENCODE_SHIFT_OUT;
1704 break;
1705
1706 case 2: /* graphic register 2 */
1707 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1708 ENCODE_SINGLE_SHIFT_2;
1709 else
1710 ENCODE_LOCKING_SHIFT_2;
1711 break;
1712
1713 case 3: /* graphic register 3 */
1714 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1715 ENCODE_SINGLE_SHIFT_3;
1716 else
1717 ENCODE_LOCKING_SHIFT_3;
1718 break;
1719 }
1720 }
1721 return dst;
1722}
1723
/* The following three macros produce codes for indicating composition:
   "ESC 0" starts a composition without rules, "ESC 2" starts a
   composition with rules, and "ESC 1" ends a composition.  They write
   through the variable `dst' of the expanding function.  Each
   expansion is parenthesized so that it is a single expression.  */
#define ENCODE_COMPOSITION_NO_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
#define ENCODE_COMPOSITION_WITH_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '2')
#define ENCODE_COMPOSITION_END \
  (*dst++ = ISO_CODE_ESC, *dst++ = '1')
1728
/* The following three macros produce codes for indicating direction
   of text.  They refer to the variables `coding' and `dst' of the
   expanding function.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER writes "ESC [" when CODING has
   CODING_FLAG_ISO_SEVEN_BITS set (a bit test, since `coding->flags'
   holds other flag bits too), and the single byte ISO_CODE_CSI
   otherwise.  ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R follow
   the introducer with "2 ]" and "0 ]" respectively.  All three are
   complete statements and are used with a trailing semicolon.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
  do { \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
      *dst++ = ISO_CODE_ESC, *dst++ = '['; \
    else \
      *dst++ = ISO_CODE_CSI; \
  } while (0)

#define ENCODE_DIRECTION_R2L \
  do { \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER; \
    *dst++ = '2', *dst++ = ']'; \
  } while (0)

#define ENCODE_DIRECTION_L2R \
  do { \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER; \
    *dst++ = '0', *dst++ = ']'; \
  } while (0)
1744
/* Produce codes for designation and invocation to reset the graphic
   planes and registers to initial state.

   A shift-in is produced if graphic plane 0 does not currently invoke
   register 0.  Then every register that has an initial designation
   different from its current one is designated again.  The variables
   `coding' and `dst' of the expanding function are used.  */
#define ENCODE_RESET_PLANE_AND_REGISTER \
  do { \
    int reg; \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
      ENCODE_SHIFT_IN; \
    for (reg = 0; reg < 4; reg++) \
      { \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0 \
            && (CODING_SPEC_ISO_DESIGNATION (coding, reg) \
                != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))) \
          ENCODE_DESIGNATION \
            (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
      } \
  } while (0)
1759
bdd9fb48 1760/* Produce designation sequences of charsets in the line started from
d46c5b12 1761 SRC to a place pointed by *DSTP, and update DSTP.
bdd9fb48
KH
1762
1763 If the current block ends before any end-of-line, we may fail to
d46c5b12
KH
1764 find all the necessary designations. */
1765
dfcf069d 1766void
bdd9fb48 1767encode_designation_at_bol (coding, table, src, src_end, dstp)
e0e989f6 1768 struct coding_system *coding;
bdd9fb48 1769 Lisp_Object table;
e0e989f6
KH
1770 unsigned char *src, *src_end, **dstp;
1771{
bdd9fb48
KH
1772 int charset, c, found = 0, reg;
1773 /* Table of charsets to be designated to each graphic register. */
1774 int r[4];
1775 unsigned char *dst = *dstp;
1776
1777 for (reg = 0; reg < 4; reg++)
1778 r[reg] = -1;
1779
1780 while (src < src_end && *src != '\n' && found < 4)
e0e989f6 1781 {
bdd9fb48
KH
1782 int bytes = BYTES_BY_CHAR_HEAD (*src);
1783
1784 if (NILP (table))
1785 charset = CHARSET_AT (src);
1786 else
e0e989f6 1787 {
35cb8686
RS
1788 int c_alt;
1789 unsigned char c1, c2;
bdd9fb48
KH
1790
1791 SPLIT_STRING(src, bytes, charset, c1, c2);
84fbb8a0 1792 if ((c_alt = translate_char (table, -1, charset, c1, c2)) >= 0)
bdd9fb48 1793 charset = CHAR_CHARSET (c_alt);
e0e989f6 1794 }
bdd9fb48 1795
e0e989f6 1796 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
d46c5b12 1797 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
bdd9fb48
KH
1798 {
1799 found++;
1800 r[reg] = charset;
1801 }
1802
1803 src += bytes;
1804 }
1805
1806 if (found)
1807 {
1808 for (reg = 0; reg < 4; reg++)
1809 if (r[reg] >= 0
1810 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1811 ENCODE_DESIGNATION (r[reg], reg, coding);
1812 *dstp = dst;
e0e989f6 1813 }
e0e989f6
KH
1814}
1815
4ed46869
KH
1816/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1817
1818int
d46c5b12 1819encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
1820 struct coding_system *coding;
1821 unsigned char *source, *destination;
1822 int src_bytes, dst_bytes;
4ed46869
KH
1823{
1824 unsigned char *src = source;
1825 unsigned char *src_end = source + src_bytes;
1826 unsigned char *dst = destination;
1827 unsigned char *dst_end = destination + dst_bytes;
e0e989f6 1828 /* Since the maximum bytes produced by each loop is 20, we subtract 19
4ed46869
KH
1829 from DST_END to assure overflow checking is necessary only at the
1830 head of loop. */
e0e989f6 1831 unsigned char *adjusted_dst_end = dst_end - 19;
84fbb8a0 1832 Lisp_Object translation_table
f967223b 1833 = coding->translation_table_for_encode;
d46c5b12 1834 int result = CODING_FINISH_NORMAL;
bdd9fb48 1835
84fbb8a0 1836 if (!NILP (Venable_character_translation) && NILP (translation_table))
f967223b 1837 translation_table = Vstandard_translation_table_for_encode;
4ed46869 1838
d46c5b12 1839 coding->consumed_char = 0;
fb88bf2d 1840 coding->fake_multibyte = 0;
d46c5b12
KH
1841 while (src < src_end && (dst_bytes
1842 ? (dst < adjusted_dst_end)
1843 : (dst < src - 19)))
4ed46869
KH
1844 {
1845 /* SRC_BASE remembers the start position in source in each loop.
1846 The loop will be exited when there's not enough source text
1847 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1848 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1849 reset to SRC_BASE before exiting. */
1850 unsigned char *src_base = src;
bdd9fb48 1851 int charset, c1, c2, c3, c4;
4ed46869 1852
e0e989f6
KH
1853 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1854 && CODING_SPEC_ISO_BOL (coding))
1855 {
bdd9fb48 1856 /* We have to produce designation sequences if any now. */
84fbb8a0 1857 encode_designation_at_bol (coding, translation_table,
bdd9fb48 1858 src, src_end, &dst);
e0e989f6
KH
1859 CODING_SPEC_ISO_BOL (coding) = 0;
1860 }
1861
1862 c1 = *src++;
4ed46869 1863 /* If we are seeing a component of a composite character, we are
d46c5b12
KH
1864 seeing a leading-code encoded irregularly for composition, or
1865 a composition rule if composing with rule. We must set C1 to
1866 a normal leading-code or an ASCII code. If we are not seeing
1867 a composite character, we must reset composition,
1868 designation, and invocation states. */
4ed46869
KH
1869 if (COMPOSING_P (coding->composing))
1870 {
1871 if (c1 < 0xA0)
1872 {
1873 /* We are not in a composite character any longer. */
1874 coding->composing = COMPOSING_NO;
d46c5b12 1875 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1876 ENCODE_COMPOSITION_END;
1877 }
1878 else
1879 {
1880 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1881 {
1882 *dst++ = c1 & 0x7F;
1883 coding->composing = COMPOSING_WITH_RULE_HEAD;
1884 continue;
1885 }
1886 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1887 coding->composing = COMPOSING_WITH_RULE_RULE;
1888 if (c1 == 0xA0)
1889 {
1890 /* This is an ASCII component. */
1891 ONE_MORE_BYTE (c1);
1892 c1 &= 0x7F;
1893 }
1894 else
1895 /* This is a leading-code of non ASCII component. */
1896 c1 -= 0x20;
1897 }
1898 }
1899
1900 /* Now encode one character. C1 is a control character, an
1901 ASCII character, or a leading-code of multi-byte character. */
1902 switch (emacs_code_class[c1])
1903 {
1904 case EMACS_ascii_code:
8dbb769e 1905 c2 = 0;
bdd9fb48 1906 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
4ed46869
KH
1907 break;
1908
1909 case EMACS_control_code:
1910 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1911 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869 1912 *dst++ = c1;
d46c5b12 1913 coding->consumed_char++;
4ed46869
KH
1914 break;
1915
1916 case EMACS_carriage_return_code:
d46c5b12 1917 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
4ed46869
KH
1918 {
1919 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1920 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869 1921 *dst++ = c1;
d46c5b12 1922 coding->consumed_char++;
4ed46869
KH
1923 break;
1924 }
1925 /* fall down to treat '\r' as '\n' ... */
1926
1927 case EMACS_linefeed_code:
1928 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
e0e989f6
KH
1929 ENCODE_RESET_PLANE_AND_REGISTER;
1930 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1931 bcopy (coding->spec.iso2022.initial_designation,
1932 coding->spec.iso2022.current_designation,
1933 sizeof coding->spec.iso2022.initial_designation);
4ed46869 1934 if (coding->eol_type == CODING_EOL_LF
0ef69138 1935 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
1936 *dst++ = ISO_CODE_LF;
1937 else if (coding->eol_type == CODING_EOL_CRLF)
1938 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1939 else
1940 *dst++ = ISO_CODE_CR;
e0e989f6 1941 CODING_SPEC_ISO_BOL (coding) = 1;
d46c5b12 1942 coding->consumed_char++;
4ed46869
KH
1943 break;
1944
1945 case EMACS_leading_code_2:
1946 ONE_MORE_BYTE (c2);
8dbb769e 1947 c3 = 0;
19a8d9e0
KH
1948 if (c2 < 0xA0)
1949 {
1950 /* invalid sequence */
1951 *dst++ = c1;
38cf95df
RS
1952 src--;
1953 coding->consumed_char++;
19a8d9e0
KH
1954 }
1955 else
1956 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
1957 break;
1958
1959 case EMACS_leading_code_3:
1960 TWO_MORE_BYTES (c2, c3);
8dbb769e 1961 c4 = 0;
19a8d9e0
KH
1962 if (c2 < 0xA0 || c3 < 0xA0)
1963 {
1964 /* invalid sequence */
1965 *dst++ = c1;
38cf95df
RS
1966 src -= 2;
1967 coding->consumed_char++;
19a8d9e0
KH
1968 }
1969 else if (c1 < LEADING_CODE_PRIVATE_11)
bdd9fb48 1970 ENCODE_ISO_CHARACTER (c1, c2, c3);
4ed46869 1971 else
bdd9fb48 1972 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
4ed46869
KH
1973 break;
1974
1975 case EMACS_leading_code_4:
1976 THREE_MORE_BYTES (c2, c3, c4);
19a8d9e0
KH
1977 if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1978 {
1979 /* invalid sequence */
1980 *dst++ = c1;
38cf95df
RS
1981 src -= 3;
1982 coding->consumed_char++;
19a8d9e0
KH
1983 }
1984 else
1985 ENCODE_ISO_CHARACTER (c2, c3, c4);
4ed46869
KH
1986 break;
1987
1988 case EMACS_leading_code_composition:
19a8d9e0
KH
1989 ONE_MORE_BYTE (c2);
1990 if (c2 < 0xA0)
1991 {
1992 /* invalid sequence */
1993 *dst++ = c1;
38cf95df
RS
1994 src--;
1995 coding->consumed_char++;
19a8d9e0
KH
1996 }
1997 else if (c2 == 0xFF)
4ed46869 1998 {
d46c5b12 1999 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
2000 coding->composing = COMPOSING_WITH_RULE_HEAD;
2001 ENCODE_COMPOSITION_WITH_RULE_START;
d46c5b12 2002 coding->consumed_char++;
4ed46869
KH
2003 }
2004 else
2005 {
d46c5b12 2006 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
2007 /* Rewind one byte because it is a character code of
2008 composition elements. */
2009 src--;
2010 coding->composing = COMPOSING_NO_RULE_HEAD;
2011 ENCODE_COMPOSITION_NO_RULE_START;
d46c5b12 2012 coding->consumed_char++;
4ed46869
KH
2013 }
2014 break;
2015
2016 case EMACS_invalid_code:
3efbce95
KH
2017 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2018 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869 2019 *dst++ = c1;
d46c5b12 2020 coding->consumed_char++;
4ed46869
KH
2021 break;
2022 }
2023 continue;
2024 label_end_of_loop:
d46c5b12
KH
2025 result = CODING_FINISH_INSUFFICIENT_SRC;
2026 src = src_base;
4ed46869
KH
2027 break;
2028 }
2029
49cb52b4
KH
2030 if (src < src_end && result == CODING_FINISH_NORMAL)
2031 result = CODING_FINISH_INSUFFICIENT_DST;
2032
2033 /* If this is the last block of the text to be encoded, we must
2034 reset graphic planes and registers to the initial state, and
2035 flush out the carryover if any. */
2036 if (coding->mode & CODING_MODE_LAST_BLOCK)
84fbb8a0
KH
2037 {
2038 ENCODE_RESET_PLANE_AND_REGISTER;
2039 if (COMPOSING_P (coding->composing))
2040 ENCODE_COMPOSITION_END;
88993dfd
KH
2041 if (result == CODING_FINISH_INSUFFICIENT_SRC)
2042 {
2043 while (src < src_end && dst < dst_end)
2044 *dst++ = *src++;
2045 }
84fbb8a0 2046 }
d46c5b12
KH
2047 coding->consumed = src - source;
2048 coding->produced = coding->produced_char = dst - destination;
2049 return result;
4ed46869
KH
2050}
2051
2052\f
2053/*** 4. SJIS and BIG5 handlers ***/
2054
f4dee582 2055/* Although SJIS and BIG5 are not ISO's coding system, they are used
4ed46869
KH
2056 quite widely. So, for the moment, Emacs supports them in the bare
2057 C code. But, in the future, they may be supported only by CCL. */
2058
2059/* SJIS is a coding system encoding three character sets: ASCII, right
2060 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2061 as is. A character of charset katakana-jisx0201 is encoded by
2062 "position-code + 0x80". A character of charset japanese-jisx0208
2063 is encoded in 2 bytes, but its two position-codes are divided and shifted
2064 so that they fit in the ranges below.
2065
2066 --- CODE RANGE of SJIS ---
2067 (character set) (range)
2068 ASCII 0x00 .. 0x7F
2069 KATAKANA-JISX0201 0xA0 .. 0xDF
c28a9453 2070 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
d14d03ac 2071 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4ed46869
KH
2072 -------------------------------
2073
2074*/
2075
2076/* BIG5 is a coding system encoding two character sets: ASCII and
2077 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2078 character set and is encoded in two-byte.
2079
2080 --- CODE RANGE of BIG5 ---
2081 (character set) (range)
2082 ASCII 0x00 .. 0x7F
2083 Big5 (1st byte) 0xA1 .. 0xFE
2084 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2085 --------------------------
2086
2087 Since the number of characters in Big5 is larger than maximum
2088 characters in Emacs' charset (96x96), it can't be handled as one
2089 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2090 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2091 contains frequently used characters and the latter contains less
2092 frequently used characters. */
2093
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Input arguments are parenthesized in the expansions so that
   expressions such as `p[0] & 0xFF' may be passed.  They are still
   evaluated more than once, so they must not have side effects.
   Output arguments (CHARSET, C1, C2 of DECODE_BIG5; B1, B2 of
   ENCODE_BIG5) must be lvalues.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Flatten (B1, B2) into a serial number, pick the charset by the
   first byte (below 0xC9 is `charset_big5_1'), and split the serial
   number into two position-codes starting at 0x21.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

/* Inverse of DECODE_BIG5: rebuild the serial number from (C1, C2),
   add the offset of the second charset if needed, and split it into
   the two BIG5 bytes.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2126
a5d301df
KH
/* Emit one decoded character of CHARSET with position-codes C1 and
   C2.  If TRANSLATION_TABLE (taken from the scope of use) is non-nil
   and translate_char yields a character, CHARSET_ALT, C1 and C2 are
   replaced by the parts of that character; otherwise CHARSET is used
   as is.  The result goes through DECODE_CHARACTER_ASCII,
   DECODE_CHARACTER_DIMENSION1 or DECODE_CHARACTER_DIMENSION2
   depending on the final charset.  */
#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
  do {                                                                  \
    int c_alt, charset_alt = (charset);                                 \
    if (!NILP (translation_table))                                      \
      {                                                                 \
        c_alt = translate_char (translation_table,                      \
                                -1, (charset), c1, c2);                 \
        if (c_alt >= 0)                                                 \
          SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
      }                                                                 \
    if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
      {                                                                 \
        DECODE_CHARACTER_ASCII (c1);                                    \
      }                                                                 \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
      {                                                                 \
        DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                  \
      }                                                                 \
    else                                                                \
      {                                                                 \
        DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);              \
      }                                                                 \
  } while (0)
2141
84fbb8a0
KH
/* Encode one character of CHARSET with position-codes C1 and C2 and
   store the result at DST.  DST, CODING, SJIS_P and TRANSLATION_TABLE
   are taken from the scope of use.

   If TRANSLATION_TABLE is non-nil and translate_char yields a
   character, that character is encoded instead.  ASCII is stored as
   one byte.  With SJIS_P set, katakana-jisx0201 is stored as one byte
   and jisx0208 as two bytes via ENCODE_SJIS; with SJIS_P clear, the
   two Big5 charsets are stored as two bytes via ENCODE_BIG5.  Any
   other charset is stored unencoded as its charset byte followed by
   its position-codes (two or three bytes in total), and
   CODING->fake_multibyte is set.  CODING->consumed_char is always
   incremented.

   The definition deliberately does not end in a semicolon, so that
   `ENCODE_SJIS_BIG5_CHARACTER (...);' is exactly one statement and
   can be used in an unbraced if/else.  */
#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                       \
  do {                                                                    \
    int c_alt, charset_alt;                                               \
    if (!NILP (translation_table)                                         \
        && ((c_alt = translate_char (translation_table, -1,               \
                                     charset, c1, c2))                    \
            >= 0))                                                        \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                            \
    else                                                                  \
      charset_alt = charset;                                              \
    if (charset_alt == charset_ascii)                                     \
      *dst++ = c1;                                                        \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                        \
      {                                                                   \
        if (sjis_p && charset_alt == charset_katakana_jisx0201)           \
          *dst++ = c1;                                                    \
        else                                                              \
          {                                                               \
            *dst++ = charset_alt, *dst++ = c1;                            \
            coding->fake_multibyte = 1;                                   \
          }                                                               \
      }                                                                   \
    else                                                                  \
      {                                                                   \
        c1 &= 0x7F, c2 &= 0x7F;                                           \
        if (sjis_p && charset_alt == charset_jisx0208)                    \
          {                                                               \
            unsigned char s1, s2;                                         \
                                                                          \
            ENCODE_SJIS (c1, c2, s1, s2);                                 \
            *dst++ = s1, *dst++ = s2;                                     \
            coding->fake_multibyte = 1;                                   \
          }                                                               \
        else if (!sjis_p                                                  \
                 && (charset_alt == charset_big5_1                        \
                     || charset_alt == charset_big5_2))                   \
          {                                                               \
            unsigned char b1, b2;                                         \
                                                                          \
            ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);                    \
            *dst++ = b1, *dst++ = b2;                                     \
          }                                                               \
        else                                                              \
          {                                                               \
            *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;               \
            coding->fake_multibyte = 1;                                   \
          }                                                               \
      }                                                                   \
    coding->consumed_char++;                                              \
  } while (0)
2192
4ed46869
KH
2193/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2194 Check if a text is encoded in SJIS. If it is, return
2195 CODING_CATEGORY_MASK_SJIS, else return 0. */
2196
2197int
2198detect_coding_sjis (src, src_end)
2199 unsigned char *src, *src_end;
2200{
2201 unsigned char c;
2202
2203 while (src < src_end)
2204 {
2205 c = *src++;
4ed46869
KH
2206 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2207 {
2208 if (src < src_end && *src++ < 0x40)
2209 return 0;
2210 }
2211 }
2212 return CODING_CATEGORY_MASK_SJIS;
2213}
2214
2215/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2216 Check if a text is encoded in BIG5. If it is, return
2217 CODING_CATEGORY_MASK_BIG5, else return 0. */
2218
2219int
2220detect_coding_big5 (src, src_end)
2221 unsigned char *src, *src_end;
2222{
2223 unsigned char c;
2224
2225 while (src < src_end)
2226 {
2227 c = *src++;
4ed46869
KH
2228 if (c >= 0xA1)
2229 {
2230 if (src >= src_end)
2231 break;
2232 c = *src++;
2233 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2234 return 0;
2235 }
2236 }
2237 return CODING_CATEGORY_MASK_BIG5;
2238}
2239
2240/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2241 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
2242
2243int
2244decode_coding_sjis_big5 (coding, source, destination,
d46c5b12 2245 src_bytes, dst_bytes, sjis_p)
4ed46869
KH
2246 struct coding_system *coding;
2247 unsigned char *source, *destination;
2248 int src_bytes, dst_bytes;
4ed46869
KH
2249 int sjis_p;
2250{
2251 unsigned char *src = source;
2252 unsigned char *src_end = source + src_bytes;
2253 unsigned char *dst = destination;
2254 unsigned char *dst_end = destination + dst_bytes;
2255 /* Since the maximum bytes produced by each loop is 4, we subtract 3
2256 from DST_END to assure overflow checking is necessary only at the
2257 head of loop. */
2258 unsigned char *adjusted_dst_end = dst_end - 3;
84fbb8a0 2259 Lisp_Object translation_table
f967223b 2260 = coding->translation_table_for_decode;
d46c5b12 2261 int result = CODING_FINISH_NORMAL;
a5d301df 2262
84fbb8a0 2263 if (!NILP (Venable_character_translation) && NILP (translation_table))
f967223b 2264 translation_table = Vstandard_translation_table_for_decode;
4ed46869 2265
d46c5b12 2266 coding->produced_char = 0;
fb88bf2d 2267 coding->fake_multibyte = 0;
d46c5b12
KH
2268 while (src < src_end && (dst_bytes
2269 ? (dst < adjusted_dst_end)
2270 : (dst < src - 3)))
4ed46869
KH
2271 {
2272 /* SRC_BASE remembers the start position in source in each loop.
2273 The loop will be exited when there's not enough source text
2274 to analyze two-byte character (within macro ONE_MORE_BYTE).
2275 In that case, SRC is reset to SRC_BASE before exiting. */
2276 unsigned char *src_base = src;
2277 unsigned char c1 = *src++, c2, c3, c4;
2278
d46c5b12 2279 if (c1 < 0x20)
4ed46869 2280 {
d46c5b12 2281 if (c1 == '\r')
4ed46869 2282 {
d46c5b12
KH
2283 if (coding->eol_type == CODING_EOL_CRLF)
2284 {
2285 ONE_MORE_BYTE (c2);
2286 if (c2 == '\n')
2287 *dst++ = c2;
2288 else if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2289 {
2290 result = CODING_FINISH_INCONSISTENT_EOL;
2291 goto label_end_of_loop_2;
2292 }
2293 else
2294 /* To process C2 again, SRC is subtracted by 1. */
2295 *dst++ = c1, src--;
2296 }
2297 else if (coding->eol_type == CODING_EOL_CR)
2298 *dst++ = '\n';
4ed46869 2299 else
d46c5b12
KH
2300 *dst++ = c1;
2301 }
2302 else if (c1 == '\n'
2303 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2304 && (coding->eol_type == CODING_EOL_CR
2305 || coding->eol_type == CODING_EOL_CRLF))
2306 {
2307 result = CODING_FINISH_INCONSISTENT_EOL;
2308 goto label_end_of_loop_2;
4ed46869
KH
2309 }
2310 else
2311 *dst++ = c1;
d46c5b12 2312 coding->produced_char++;
4ed46869 2313 }
a5d301df
KH
2314 else if (c1 < 0x80)
2315 DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
54f78171 2316 else
4ed46869 2317 {
4ed46869
KH
2318 if (sjis_p)
2319 {
54f78171 2320 if (c1 < 0xA0 || (c1 >= 0xE0 && c1 < 0xF0))
fb88bf2d 2321 {
54f78171
KH
2322 /* SJIS -> JISX0208 */
2323 ONE_MORE_BYTE (c2);
d14d03ac 2324 if (c2 >= 0x40 && c2 != 0x7F && c2 <= 0xFC)
54f78171
KH
2325 {
2326 DECODE_SJIS (c1, c2, c3, c4);
2327 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
2328 }
2329 else
2330 goto label_invalid_code_2;
fb88bf2d 2331 }
54f78171
KH
2332 else if (c1 < 0xE0)
2333 /* SJIS -> JISX0201-Kana */
2334 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1,
2335 /* dummy */ c2);
fb88bf2d 2336 else
54f78171 2337 goto label_invalid_code_1;
4ed46869 2338 }
fb88bf2d 2339 else
fb88bf2d 2340 {
54f78171
KH
2341 /* BIG5 -> Big5 */
2342 if (c1 >= 0xA1 && c1 <= 0xFE)
fb88bf2d 2343 {
54f78171
KH
2344 ONE_MORE_BYTE (c2);
2345 if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE))
2346 {
2347 int charset;
4ed46869 2348
54f78171
KH
2349 DECODE_BIG5 (c1, c2, charset, c3, c4);
2350 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
2351 }
2352 else
2353 goto label_invalid_code_2;
fb88bf2d
KH
2354 }
2355 else
54f78171 2356 goto label_invalid_code_1;
4ed46869
KH
2357 }
2358 }
2359 continue;
2360
fb88bf2d
KH
2361 label_invalid_code_1:
2362 *dst++ = c1;
2363 coding->produced_char++;
2364 coding->fake_multibyte = 1;
2365 continue;
2366
2367 label_invalid_code_2:
2368 *dst++ = c1; *dst++= c2;
2369 coding->produced_char += 2;
2370 coding->fake_multibyte = 1;
2371 continue;
2372
4ed46869 2373 label_end_of_loop:
d46c5b12
KH
2374 result = CODING_FINISH_INSUFFICIENT_SRC;
2375 label_end_of_loop_2:
4ed46869
KH
2376 src = src_base;
2377 break;
2378 }
2379
fb88bf2d
KH
2380 if (src < src_end)
2381 {
2382 if (result == CODING_FINISH_NORMAL)
2383 result = CODING_FINISH_INSUFFICIENT_DST;
2384 else if (result != CODING_FINISH_INCONSISTENT_EOL
2385 && coding->mode & CODING_MODE_LAST_BLOCK)
2386 {
2387 src_bytes = src_end - src;
2388 if (dst_bytes && (dst_end - dst < src_bytes))
2389 src_bytes = dst_end - dst;
2390 bcopy (dst, src, src_bytes);
2391 src += src_bytes;
2392 dst += src_bytes;
2393 coding->fake_multibyte = 1;
2394 }
2395 }
d46c5b12
KH
2396
2397 coding->consumed = coding->consumed_char = src - source;
2398 coding->produced = dst - destination;
2399 return result;
4ed46869
KH
2400}
2401
2402/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2403 This function can encode `charset_ascii', `charset_katakana_jisx0201',
2404 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
2405 sure that all these charsets are registered as official charset
2406 (i.e. do not have extended leading-codes). Characters of other
2407 charsets are produced without any encoding. If SJIS_P is 1, encode
2408 SJIS text, else encode BIG5 text. */
2409
2410int
2411encode_coding_sjis_big5 (coding, source, destination,
d46c5b12 2412 src_bytes, dst_bytes, sjis_p)
4ed46869
KH
2413 struct coding_system *coding;
2414 unsigned char *source, *destination;
2415 int src_bytes, dst_bytes;
4ed46869
KH
2416 int sjis_p;
2417{
2418 unsigned char *src = source;
2419 unsigned char *src_end = source + src_bytes;
2420 unsigned char *dst = destination;
2421 unsigned char *dst_end = destination + dst_bytes;
2422 /* Since the maximum bytes produced by each loop is 2, we subtract 1
2423 from DST_END to assure overflow checking is necessary only at the
2424 head of loop. */
2425 unsigned char *adjusted_dst_end = dst_end - 1;
84fbb8a0 2426 Lisp_Object translation_table
f967223b 2427 = coding->translation_table_for_encode;
d46c5b12 2428 int result = CODING_FINISH_NORMAL;
a5d301df 2429
84fbb8a0 2430 if (!NILP (Venable_character_translation) && NILP (translation_table))
f967223b 2431 translation_table = Vstandard_translation_table_for_encode;
4ed46869 2432
d46c5b12 2433 coding->consumed_char = 0;
fb88bf2d 2434 coding->fake_multibyte = 0;
d46c5b12
KH
2435 while (src < src_end && (dst_bytes
2436 ? (dst < adjusted_dst_end)
2437 : (dst < src - 1)))
4ed46869
KH
2438 {
2439 /* SRC_BASE remembers the start position in source in each loop.
2440 The loop will be exited when there's not enough source text
2441 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2442 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
2443 before exiting. */
2444 unsigned char *src_base = src;
2445 unsigned char c1 = *src++, c2, c3, c4;
2446
2447 if (coding->composing)
2448 {
2449 if (c1 == 0xA0)
2450 {
2451 ONE_MORE_BYTE (c1);
2452 c1 &= 0x7F;
2453 }
2454 else if (c1 >= 0xA0)
2455 c1 -= 0x20;
2456 else
2457 coding->composing = 0;
2458 }
2459
2460 switch (emacs_code_class[c1])
2461 {
2462 case EMACS_ascii_code:
a5d301df
KH
2463 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2464 break;
2465
4ed46869
KH
2466 case EMACS_control_code:
2467 *dst++ = c1;
d46c5b12 2468 coding->consumed_char++;
4ed46869
KH
2469 break;
2470
2471 case EMACS_carriage_return_code:
d46c5b12 2472 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
4ed46869
KH
2473 {
2474 *dst++ = c1;
d46c5b12 2475 coding->consumed_char++;
4ed46869
KH
2476 break;
2477 }
2478 /* fall down to treat '\r' as '\n' ... */
2479
2480 case EMACS_linefeed_code:
2481 if (coding->eol_type == CODING_EOL_LF
0ef69138 2482 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2483 *dst++ = '\n';
2484 else if (coding->eol_type == CODING_EOL_CRLF)
2485 *dst++ = '\r', *dst++ = '\n';
2486 else
2487 *dst++ = '\r';
d46c5b12 2488 coding->consumed_char++;
4ed46869
KH
2489 break;
2490
2491 case EMACS_leading_code_2:
2492 ONE_MORE_BYTE (c2);
a5d301df 2493 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
2494 break;
2495
2496 case EMACS_leading_code_3:
2497 TWO_MORE_BYTES (c2, c3);
a5d301df 2498 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
4ed46869
KH
2499 break;
2500
2501 case EMACS_leading_code_4:
2502 THREE_MORE_BYTES (c2, c3, c4);
a5d301df 2503 ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
4ed46869
KH
2504 break;
2505
2506 case EMACS_leading_code_composition:
2507 coding->composing = 1;
2508 break;
2509
2510 default: /* i.e. case EMACS_invalid_code: */
2511 *dst++ = c1;
d46c5b12 2512 coding->consumed_char++;
4ed46869
KH
2513 }
2514 continue;
2515
2516 label_end_of_loop:
d46c5b12
KH
2517 result = CODING_FINISH_INSUFFICIENT_SRC;
2518 src = src_base;
4ed46869
KH
2519 break;
2520 }
2521
d46c5b12
KH
2522 if (result == CODING_FINISH_NORMAL
2523 && src < src_end)
2524 result = CODING_FINISH_INSUFFICIENT_DST;
2525 coding->consumed = src - source;
2526 coding->produced = coding->produced_char = dst - destination;
2527 return result;
4ed46869
KH
2528}
2529
2530\f
1397dc18
KH
2531/*** 5. CCL handlers ***/
2532
2533/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2534 Check if a text is encoded in a coding system of which
2535 encoder/decoder are written in CCL program. If it is, return
2536 CODING_CATEGORY_MASK_CCL, else return 0. */
2537
2538int
2539detect_coding_ccl (src, src_end)
2540 unsigned char *src, *src_end;
2541{
2542 unsigned char *valid;
2543
2544 /* No coding system is assigned to coding-category-ccl. */
2545 if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
2546 return 0;
2547
2548 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
2549 while (src < src_end)
2550 {
2551 if (! valid[*src]) return 0;
2552 src++;
2553 }
2554 return CODING_CATEGORY_MASK_CCL;
2555}
2556
2557\f
2558/*** 6. End-of-line handlers ***/
4ed46869
KH
2559
2560/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2561 This function is called only when `coding->eol_type' is
2562 CODING_EOL_CRLF or CODING_EOL_CR. */
2563
dfcf069d 2564int
d46c5b12 2565decode_eol (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
2566 struct coding_system *coding;
2567 unsigned char *source, *destination;
2568 int src_bytes, dst_bytes;
4ed46869
KH
2569{
2570 unsigned char *src = source;
2571 unsigned char *src_end = source + src_bytes;
2572 unsigned char *dst = destination;
2573 unsigned char *dst_end = destination + dst_bytes;
fb88bf2d 2574 unsigned char c;
d46c5b12
KH
2575 int result = CODING_FINISH_NORMAL;
2576
fb88bf2d
KH
2577 coding->fake_multibyte = 0;
2578
d46c5b12 2579 if (src_bytes <= 0)
716e0b0a
AI
2580 {
2581 coding->produced = coding->produced_char = 0;
2582 coding->consumed = coding->consumed_char = 0;
2583 return result;
2584 }
4ed46869
KH
2585
2586 switch (coding->eol_type)
2587 {
2588 case CODING_EOL_CRLF:
2589 {
2590 /* Since the maximum bytes produced by each loop is 2, we
2591 subtract 1 from DST_END to assure overflow checking is
2592 necessary only at the head of loop. */
2593 unsigned char *adjusted_dst_end = dst_end - 1;
2594
d46c5b12
KH
2595 while (src < src_end && (dst_bytes
2596 ? (dst < adjusted_dst_end)
2597 : (dst < src - 1)))
4ed46869
KH
2598 {
2599 unsigned char *src_base = src;
fb88bf2d
KH
2600
2601 c = *src++;
4ed46869
KH
2602 if (c == '\r')
2603 {
2604 ONE_MORE_BYTE (c);
fdfcf19d
KH
2605 if (c == '\n')
2606 *dst++ = c;
2607 else
d46c5b12
KH
2608 {
2609 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2610 {
2611 result = CODING_FINISH_INCONSISTENT_EOL;
2612 goto label_end_of_loop_2;
2613 }
fdfcf19d 2614 src--;
d46c5b12 2615 *dst++ = '\r';
fb88bf2d
KH
2616 if (BASE_LEADING_CODE_P (c))
2617 coding->fake_multibyte = 1;
d46c5b12 2618 }
4ed46869 2619 }
d46c5b12
KH
2620 else if (c == '\n'
2621 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2622 {
2623 result = CODING_FINISH_INCONSISTENT_EOL;
2624 goto label_end_of_loop_2;
2625 }
4ed46869 2626 else
fb88bf2d
KH
2627 {
2628 *dst++ = c;
2629 if (BASE_LEADING_CODE_P (c))
2630 coding->fake_multibyte = 1;
2631 }
4ed46869
KH
2632 continue;
2633
2634 label_end_of_loop:
d46c5b12
KH
2635 result = CODING_FINISH_INSUFFICIENT_SRC;
2636 label_end_of_loop_2:
4ed46869
KH
2637 src = src_base;
2638 break;
2639 }
fdfcf19d
KH
2640 if (src < src_end)
2641 {
2642 if (result == CODING_FINISH_NORMAL)
2643 result = CODING_FINISH_INSUFFICIENT_DST;
2644 else if (result != CODING_FINISH_INCONSISTENT_EOL
2645 && coding->mode & CODING_MODE_LAST_BLOCK)
2646 {
2647 /* This is the last block of the text to be decoded.
2648 We flush out all remaining codes. */
2649 src_bytes = src_end - src;
2650 if (dst_bytes && (dst_end - dst < src_bytes))
2651 src_bytes = dst_end - dst;
2652 bcopy (src, dst, src_bytes);
2653 dst += src_bytes;
2654 src += src_bytes;
2655 }
2656 }
4ed46869 2657 }
d46c5b12 2658 break;
4ed46869
KH
2659
2660 case CODING_EOL_CR:
d46c5b12
KH
2661 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2662 {
fb88bf2d
KH
2663 while (src < src_end)
2664 {
2665 if ((c = *src++) == '\n')
2666 break;
2667 if (BASE_LEADING_CODE_P (c))
2668 coding->fake_multibyte = 1;
2669 }
d46c5b12
KH
2670 if (*--src == '\n')
2671 {
2672 src_bytes = src - source;
2673 result = CODING_FINISH_INCONSISTENT_EOL;
2674 }
2675 }
2676 if (dst_bytes && src_bytes > dst_bytes)
2677 {
2678 result = CODING_FINISH_INSUFFICIENT_DST;
2679 src_bytes = dst_bytes;
2680 }
2681 if (dst_bytes)
2682 bcopy (source, destination, src_bytes);
2683 else
2684 safe_bcopy (source, destination, src_bytes);
2685 src = source + src_bytes;
2686 while (src_bytes--) if (*dst++ == '\r') dst[-1] = '\n';
4ed46869
KH
2687 break;
2688
2689 default: /* i.e. case: CODING_EOL_LF */
d46c5b12
KH
2690 if (dst_bytes && src_bytes > dst_bytes)
2691 {
2692 result = CODING_FINISH_INSUFFICIENT_DST;
2693 src_bytes = dst_bytes;
2694 }
2695 if (dst_bytes)
2696 bcopy (source, destination, src_bytes);
2697 else
2698 safe_bcopy (source, destination, src_bytes);
2699 src += src_bytes;
993824c9 2700 dst += src_bytes;
fb88bf2d 2701 coding->fake_multibyte = 1;
4ed46869
KH
2702 break;
2703 }
2704
d46c5b12
KH
2705 coding->consumed = coding->consumed_char = src - source;
2706 coding->produced = coding->produced_char = dst - destination;
2707 return result;
4ed46869
KH
2708}
2709
2710/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2711 format of end-of-line according to `coding->eol_type'. If
d46c5b12
KH
2712 `coding->mode & CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code
2713 '\r' in source text also means end-of-line. */
4ed46869 2714
dfcf069d 2715int
d46c5b12 2716encode_eol (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
2717 struct coding_system *coding;
2718 unsigned char *source, *destination;
2719 int src_bytes, dst_bytes;
4ed46869
KH
2720{
2721 unsigned char *src = source;
2722 unsigned char *dst = destination;
d46c5b12 2723 int result = CODING_FINISH_NORMAL;
4ed46869 2724
fb88bf2d
KH
2725 coding->fake_multibyte = 0;
2726
d46c5b12
KH
2727 if (coding->eol_type == CODING_EOL_CRLF)
2728 {
2729 unsigned char c;
2730 unsigned char *src_end = source + src_bytes;
2731 unsigned char *dst_end = destination + dst_bytes;
2732 /* Since the maximum bytes produced by each loop is 2, we
2733 subtract 1 from DST_END to assure overflow checking is
2734 necessary only at the head of loop. */
2735 unsigned char *adjusted_dst_end = dst_end - 1;
2736
2737 while (src < src_end && (dst_bytes
2738 ? (dst < adjusted_dst_end)
2739 : (dst < src - 1)))
2740 {
2741 c = *src++;
2742 if (c == '\n'
2743 || (c == '\r' && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)))
2744 *dst++ = '\r', *dst++ = '\n';
2745 else
fb88bf2d
KH
2746 {
2747 *dst++ = c;
2748 if (BASE_LEADING_CODE_P (c))
2749 coding->fake_multibyte = 1;
2750 }
d46c5b12
KH
2751 }
2752 if (src < src_end)
2753 result = CODING_FINISH_INSUFFICIENT_DST;
2754 }
2755 else
4ed46869 2756 {
fb88bf2d
KH
2757 unsigned char c;
2758
d46c5b12 2759 if (dst_bytes && src_bytes > dst_bytes)
4ed46869 2760 {
d46c5b12
KH
2761 src_bytes = dst_bytes;
2762 result = CODING_FINISH_INSUFFICIENT_DST;
2763 }
2764 if (dst_bytes)
2765 bcopy (source, destination, src_bytes);
2766 else
993824c9
RS
2767 safe_bcopy (source, destination, src_bytes);
2768 dst_bytes = src_bytes;
2769 if (coding->eol_type == CODING_EOL_CR)
d46c5b12
KH
2770 {
2771 while (src_bytes--)
fb88bf2d
KH
2772 {
2773 if ((c = *dst++) == '\n')
2774 dst[-1] = '\r';
2775 else if (BASE_LEADING_CODE_P (c))
993824c9 2776 coding->fake_multibyte = 1;
fb88bf2d 2777 }
d46c5b12 2778 }
fb88bf2d 2779 else
d46c5b12 2780 {
fb88bf2d
KH
2781 if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2782 {
2783 while (src_bytes--)
2784 if (*dst++ == '\r') dst[-1] = '\n';
2785 }
2786 coding->fake_multibyte = 1;
4ed46869 2787 }
fb88bf2d
KH
2788 src = source + dst_bytes;
2789 dst = destination + dst_bytes;
4ed46869
KH
2790 }
2791
d46c5b12
KH
2792 coding->consumed = coding->consumed_char = src - source;
2793 coding->produced = coding->produced_char = dst - destination;
2794 return result;
4ed46869
KH
2795}
2796
2797\f
1397dc18 2798/*** 7. C library functions ***/
4ed46869
KH
2799
2800/* In Emacs Lisp, coding system is represented by a Lisp symbol which
2801 has a property `coding-system'. The value of this property is a
2802 vector of length 5 (called as coding-vector). Among elements of
2803 this vector, the first (element[0]) and the fifth (element[4])
2804 carry important information for decoding/encoding. Before
2805 decoding/encoding, this information should be set in fields of a
2806 structure of type `coding_system'.
2807
2808 A value of property `coding-system' can be a symbol of another
2809 subsidiary coding-system. In that case, Emacs gets coding-vector
2810 from that symbol.
2811
2812 `element[0]' contains information to be set in `coding->type'. The
2813 value and its meaning is as follows:
2814
0ef69138
KH
2815 0 -- coding_type_emacs_mule
2816 1 -- coding_type_sjis
2817 2 -- coding_type_iso2022
2818 3 -- coding_type_big5
2819 4 -- coding_type_ccl encoder/decoder written in CCL
 5 -- coding_type_raw_text
2820 nil -- coding_type_no_conversion
2821 t -- coding_type_undecided (automatic conversion on decoding,
2822 no-conversion on encoding)
4ed46869
KH
2823
2824 `element[4]' contains information to be set in `coding->flags' and
2825 `coding->spec'. The meaning varies by `coding->type'.
2826
2827 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2828 of length 32 (of which the first 17 sub-elements are used now).
2829 Meanings of these sub-elements are:
2830
2831 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2832 If the value is an integer of valid charset, the charset is
2833 assumed to be designated to graphic register N initially.
2834
2835 If the value is minus, it is a minus value of charset which
2836 reserves graphic register N, which means that the charset is
2837 not designated initially but should be designated to graphic
2838 register N just before encoding a character in that charset.
2839
2840 If the value is nil, graphic register N is never used on
2841 encoding.
2842
2843 sub-element[N] where N is 4 through 16: to be set in `coding->flags'
2844 Each value takes t or nil. See the section ISO2022 of
2845 `coding.h' for more information.
2846
2847 If `coding->type' is `coding_type_big5', element[4] is t to denote
2848 BIG5-ETen or nil to denote BIG5-HKU.
2849
2850 If `coding->type' takes the other value, element[4] is ignored.
2851
2852 Emacs Lisp's coding system also carries information about format of
2853 end-of-line in a value of property `eol-type'. If the value is
2854 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2855 means CODING_EOL_CR. If it is not integer, it should be a vector
2856 of subsidiary coding systems of which property `eol-type' has one
2857 of above values.
2858
2859*/
2860
2861/* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2862 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2863 is setup so that no conversion is necessary and return -1, else
2864 return 0. */
2865
2866int
e0e989f6
KH
2867setup_coding_system (coding_system, coding)
2868 Lisp_Object coding_system;
4ed46869
KH
2869 struct coding_system *coding;
2870{
d46c5b12 2871 Lisp_Object coding_spec, coding_type, eol_type, plist;
4608c386 2872 Lisp_Object val;
70c22245 2873 int i;
4ed46869 2874
d46c5b12 2875 /* Initialize some fields required for all kinds of coding systems. */
774324d6 2876 coding->symbol = coding_system;
d46c5b12
KH
2877 coding->common_flags = 0;
2878 coding->mode = 0;
2879 coding->heading_ascii = -1;
2880 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
1f5dbf34
KH
2881
2882 if (NILP (coding_system))
2883 goto label_invalid_coding_system;
2884
4608c386 2885 coding_spec = Fget (coding_system, Qcoding_system);
1f5dbf34 2886
4608c386
KH
2887 if (!VECTORP (coding_spec)
2888 || XVECTOR (coding_spec)->size != 5
2889 || !CONSP (XVECTOR (coding_spec)->contents[3]))
4ed46869 2890 goto label_invalid_coding_system;
4608c386 2891
d46c5b12
KH
2892 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
2893 if (VECTORP (eol_type))
2894 {
2895 coding->eol_type = CODING_EOL_UNDECIDED;
2896 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2897 }
2898 else if (XFASTINT (eol_type) == 1)
2899 {
2900 coding->eol_type = CODING_EOL_CRLF;
2901 coding->common_flags
2902 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2903 }
2904 else if (XFASTINT (eol_type) == 2)
2905 {
2906 coding->eol_type = CODING_EOL_CR;
2907 coding->common_flags
2908 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2909 }
2910 else
2911 coding->eol_type = CODING_EOL_LF;
2912
2913 coding_type = XVECTOR (coding_spec)->contents[0];
2914 /* Try short cut. */
2915 if (SYMBOLP (coding_type))
2916 {
2917 if (EQ (coding_type, Qt))
2918 {
2919 coding->type = coding_type_undecided;
2920 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
2921 }
2922 else
2923 coding->type = coding_type_no_conversion;
2924 return 0;
2925 }
2926
2927 /* Initialize remaining fields. */
2928 coding->composing = 0;
a63063ae 2929 coding->composed_chars = 0;
d46c5b12
KH
2930
2931 /* Get values of coding system properties:
2932 `post-read-conversion', `pre-write-conversion',
f967223b 2933 `translation-table-for-decode', `translation-table-for-encode'. */
4608c386
KH
2934 plist = XVECTOR (coding_spec)->contents[3];
2935 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
2936 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
f967223b 2937 val = Fplist_get (plist, Qtranslation_table_for_decode);
4608c386 2938 if (SYMBOLP (val))
f967223b
KH
2939 val = Fget (val, Qtranslation_table_for_decode);
2940 coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
2941 val = Fplist_get (plist, Qtranslation_table_for_encode);
4608c386 2942 if (SYMBOLP (val))
f967223b
KH
2943 val = Fget (val, Qtranslation_table_for_encode);
2944 coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
d46c5b12
KH
2945 val = Fplist_get (plist, Qcoding_category);
2946 if (!NILP (val))
2947 {
2948 val = Fget (val, Qcoding_category_index);
2949 if (INTEGERP (val))
2950 coding->category_idx = XINT (val);
2951 else
2952 goto label_invalid_coding_system;
2953 }
2954 else
2955 goto label_invalid_coding_system;
4608c386 2956
70c22245
KH
2957 val = Fplist_get (plist, Qsafe_charsets);
2958 if (EQ (val, Qt))
2959 {
2960 for (i = 0; i <= MAX_CHARSET; i++)
2961 coding->safe_charsets[i] = 1;
2962 }
2963 else
2964 {
2965 bzero (coding->safe_charsets, MAX_CHARSET + 1);
2966 while (CONSP (val))
2967 {
2968 if ((i = get_charset_id (XCONS (val)->car)) >= 0)
2969 coding->safe_charsets[i] = 1;
2970 val = XCONS (val)->cdr;
2971 }
2972 }
2973
d46c5b12 2974 switch (XFASTINT (coding_type))
4ed46869
KH
2975 {
2976 case 0:
0ef69138 2977 coding->type = coding_type_emacs_mule;
c952af22
KH
2978 if (!NILP (coding->post_read_conversion))
2979 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
2980 if (!NILP (coding->pre_write_conversion))
2981 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4ed46869
KH
2982 break;
2983
2984 case 1:
2985 coding->type = coding_type_sjis;
c952af22
KH
2986 coding->common_flags
2987 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869
KH
2988 break;
2989
2990 case 2:
2991 coding->type = coding_type_iso2022;
c952af22
KH
2992 coding->common_flags
2993 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 2994 {
70c22245 2995 Lisp_Object val, temp;
4ed46869 2996 Lisp_Object *flags;
d46c5b12 2997 int i, charset, reg_bits = 0;
4ed46869 2998
4608c386 2999 val = XVECTOR (coding_spec)->contents[4];
f44d27ce 3000
4ed46869
KH
3001 if (!VECTORP (val) || XVECTOR (val)->size != 32)
3002 goto label_invalid_coding_system;
3003
3004 flags = XVECTOR (val)->contents;
3005 coding->flags
3006 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3007 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3008 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3009 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3010 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3011 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3012 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3013 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
e0e989f6
KH
3014 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3015 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
c4825358
KH
3016 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3017 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3f003981 3018 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
c4825358 3019 );
4ed46869
KH
3020
3021 /* Invoke graphic register 0 to plane 0. */
3022 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3023 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
3024 CODING_SPEC_ISO_INVOCATION (coding, 1)
3025 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3026 /* Not single shifting at first. */
6e85d753 3027 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
e0e989f6 3028 /* Beginning of buffer should also be regarded as bol. */
6e85d753 3029 CODING_SPEC_ISO_BOL (coding) = 1;
4ed46869 3030
70c22245
KH
3031 for (charset = 0; charset <= MAX_CHARSET; charset++)
3032 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3033 val = Vcharset_revision_alist;
3034 while (CONSP (val))
3035 {
3036 charset = get_charset_id (Fcar_safe (XCONS (val)->car));
3037 if (charset >= 0
3038 && (temp = Fcdr_safe (XCONS (val)->car), INTEGERP (temp))
3039 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3040 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3041 val = XCONS (val)->cdr;
3042 }
3043
4ed46869
KH
3044 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3045 FLAGS[REG] can be one of below:
3046 integer CHARSET: CHARSET occupies register I,
3047 t: designate nothing to REG initially, but can be used
3048 by any charsets,
3049 list of integer, nil, or t: designate the first
3050 element (if integer) to REG initially, the remaining
3051 elements (if integer) is designated to REG on request,
d46c5b12 3052 if an element is t, REG can be used by any charsets,
4ed46869 3053 nil: REG is never used. */
467e7675 3054 for (charset = 0; charset <= MAX_CHARSET; charset++)
1ba9e4ab
KH
3055 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3056 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
4ed46869
KH
3057 for (i = 0; i < 4; i++)
3058 {
3059 if (INTEGERP (flags[i])
e0e989f6
KH
3060 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
3061 || (charset = get_charset_id (flags[i])) >= 0)
4ed46869
KH
3062 {
3063 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3064 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3065 }
3066 else if (EQ (flags[i], Qt))
3067 {
3068 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
d46c5b12
KH
3069 reg_bits |= 1 << i;
3070 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
4ed46869
KH
3071 }
3072 else if (CONSP (flags[i]))
3073 {
84d60297
RS
3074 Lisp_Object tail;
3075 tail = flags[i];
4ed46869 3076
d46c5b12 3077 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
4ed46869
KH
3078 if (INTEGERP (XCONS (tail)->car)
3079 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
3080 CHARSET_VALID_P (charset))
3081 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
4ed46869
KH
3082 {
3083 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3084 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3085 }
3086 else
3087 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3088 tail = XCONS (tail)->cdr;
3089 while (CONSP (tail))
3090 {
3091 if (INTEGERP (XCONS (tail)->car)
3092 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
3093 CHARSET_VALID_P (charset))
3094 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
70c22245
KH
3095 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3096 = i;
4ed46869 3097 else if (EQ (XCONS (tail)->car, Qt))
d46c5b12 3098 reg_bits |= 1 << i;
4ed46869
KH
3099 tail = XCONS (tail)->cdr;
3100 }
3101 }
3102 else
3103 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3104
3105 CODING_SPEC_ISO_DESIGNATION (coding, i)
3106 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3107 }
3108
d46c5b12 3109 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
4ed46869
KH
3110 {
3111 /* REG 1 can be used only by locking shift in 7-bit env. */
3112 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
d46c5b12 3113 reg_bits &= ~2;
4ed46869
KH
3114 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3115 /* Without any shifting, only REG 0 and 1 can be used. */
d46c5b12 3116 reg_bits &= 3;
4ed46869
KH
3117 }
3118
d46c5b12
KH
3119 if (reg_bits)
3120 for (charset = 0; charset <= MAX_CHARSET; charset++)
6e85d753 3121 {
d46c5b12
KH
3122 if (CHARSET_VALID_P (charset))
3123 {
3124 /* There exist some default graphic registers to be
3125 used CHARSET. */
3126
3127 /* We had better avoid designating a charset of
3128 CHARS96 to REG 0 as far as possible. */
3129 if (CHARSET_CHARS (charset) == 96)
3130 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3131 = (reg_bits & 2
3132 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3133 else
3134 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3135 = (reg_bits & 1
3136 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3137 }
6e85d753 3138 }
4ed46869 3139 }
c952af22 3140 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
d46c5b12 3141 coding->spec.iso2022.last_invalid_designation_register = -1;
4ed46869
KH
3142 break;
3143
3144 case 3:
3145 coding->type = coding_type_big5;
c952af22
KH
3146 coding->common_flags
3147 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 3148 coding->flags
4608c386 3149 = (NILP (XVECTOR (coding_spec)->contents[4])
4ed46869
KH
3150 ? CODING_FLAG_BIG5_HKU
3151 : CODING_FLAG_BIG5_ETEN);
3152 break;
3153
3154 case 4:
3155 coding->type = coding_type_ccl;
c952af22
KH
3156 coding->common_flags
3157 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 3158 {
84d60297 3159 Lisp_Object val;
d21ca14d
KH
3160 Lisp_Object decoder, encoder;
3161
84d60297 3162 val = XVECTOR (coding_spec)->contents[4];
4ed46869 3163 if (CONSP (val)
d21ca14d
KH
3164 && SYMBOLP (XCONS (val)->car)
3165 && !NILP (decoder = Fget (XCONS (val)->car, Qccl_program_idx))
f82423d7 3166 && !NILP (decoder = Fcdr (Faref (Vccl_program_table, decoder)))
d21ca14d
KH
3167 && SYMBOLP (XCONS (val)->cdr)
3168 && !NILP (encoder = Fget (XCONS (val)->cdr, Qccl_program_idx))
f82423d7 3169 && !NILP (encoder = Fcdr (Faref (Vccl_program_table, encoder))))
4ed46869 3170 {
d21ca14d
KH
3171 setup_ccl_program (&(coding->spec.ccl.decoder), decoder);
3172 setup_ccl_program (&(coding->spec.ccl.encoder), encoder);
4ed46869
KH
3173 }
3174 else
3175 goto label_invalid_coding_system;
1397dc18
KH
3176
3177 bzero (coding->spec.ccl.valid_codes, 256);
3178 val = Fplist_get (plist, Qvalid_codes);
3179 if (CONSP (val))
3180 {
3181 Lisp_Object this;
3182
7b179c2d 3183 for (; CONSP (val); val = XCONS (val)->cdr)
1397dc18 3184 {
7b179c2d 3185 this = XCONS (val)->car;
1397dc18
KH
3186 if (INTEGERP (this)
3187 && XINT (this) >= 0 && XINT (this) < 256)
3188 coding->spec.ccl.valid_codes[XINT (this)] = 1;
3189 else if (CONSP (this)
3190 && INTEGERP (XCONS (this)->car)
3191 && INTEGERP (XCONS (this)->cdr))
3192 {
3193 int start = XINT (XCONS (this)->car);
3194 int end = XINT (XCONS (this)->cdr);
3195
3196 if (start >= 0 && start <= end && end < 256)
e133c8fa 3197 while (start <= end)
1397dc18
KH
3198 coding->spec.ccl.valid_codes[start++] = 1;
3199 }
3200 }
3201 }
4ed46869 3202 }
c952af22 3203 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
4ed46869
KH
3204 break;
3205
27901516
KH
3206 case 5:
3207 coding->type = coding_type_raw_text;
3208 break;
3209
4ed46869 3210 default:
d46c5b12 3211 goto label_invalid_coding_system;
4ed46869
KH
3212 }
3213 return 0;
3214
3215 label_invalid_coding_system:
3216 coding->type = coding_type_no_conversion;
d46c5b12 3217 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
c952af22 3218 coding->common_flags = 0;
dec137e5 3219 coding->eol_type = CODING_EOL_LF;
d46c5b12 3220 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
4ed46869
KH
3221 return -1;
3222}
3223
54f78171
KH
3224/* Setup raw-text or one of its subsidiaries in the structure
3225 coding_system CODING according to the already setup value eol_type
3226 in CODING. CODING should be setup for some coding system in
3227 advance. */
3228
3229void
3230setup_raw_text_coding_system (coding)
3231 struct coding_system *coding;
3232{
3233 if (coding->type != coding_type_raw_text)
3234 {
3235 coding->symbol = Qraw_text;
3236 coding->type = coding_type_raw_text;
3237 if (coding->eol_type != CODING_EOL_UNDECIDED)
3238 {
84d60297
RS
3239 Lisp_Object subsidiaries;
3240 subsidiaries = Fget (Qraw_text, Qeol_type);
54f78171
KH
3241
3242 if (VECTORP (subsidiaries)
3243 && XVECTOR (subsidiaries)->size == 3)
3244 coding->symbol
3245 = XVECTOR (subsidiaries)->contents[coding->eol_type];
3246 }
716e0b0a 3247 setup_coding_system (coding->symbol, coding);
54f78171
KH
3248 }
3249 return;
3250}
3251
4ed46869
KH
3252/* Emacs has a mechanism to automatically detect a coding system if it
3253 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
3254 it's impossible to distinguish some coding systems accurately
3255 because they use the same range of codes. So, at first, coding
3256 systems are categorized into the following categories:
3257
0ef69138 3258 o coding-category-emacs-mule
4ed46869
KH
3259
3260 The category for a coding system which has the same code range
3261 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 3262 symbol) `emacs-mule' by default.
4ed46869
KH
3263
3264 o coding-category-sjis
3265
3266 The category for a coding system which has the same code range
3267 as SJIS. Assigned the coding-system (Lisp
7717c392 3268 symbol) `japanese-shift-jis' by default.
4ed46869
KH
3269
3270 o coding-category-iso-7
3271
3272 The category for a coding system which has the same code range
7717c392 3273 as ISO2022 of 7-bit environment. This doesn't use any locking
d46c5b12
KH
3274 shift and single shift functions. This can encode/decode all
3275 charsets. Assigned the coding-system (Lisp symbol)
3276 `iso-2022-7bit' by default.
3277
3278 o coding-category-iso-7-tight
3279
3280 Same as coding-category-iso-7 except that this can
3281 encode/decode only the specified charsets.
4ed46869
KH
3282
3283 o coding-category-iso-8-1
3284
3285 The category for a coding system which has the same code range
3286 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
3287 for DIMENSION1 charset. This doesn't use any locking shift
3288 and single shift functions. Assigned the coding-system (Lisp
3289 symbol) `iso-latin-1' by default.
4ed46869
KH
3290
3291 o coding-category-iso-8-2
3292
3293 The category for a coding system which has the same code range
3294 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
3295 for DIMENSION2 charset. This doesn't use any locking shift
3296 and single shift functions. Assigned the coding-system (Lisp
3297 symbol) `japanese-iso-8bit' by default.
4ed46869 3298
7717c392 3299 o coding-category-iso-7-else
4ed46869
KH
3300
3301 The category for a coding system which has the same code range
7717c392
KH
3302 as ISO2022 of 7-bit environment but uses locking shift or
3303 single shift functions. Assigned the coding-system (Lisp
3304 symbol) `iso-2022-7bit-lock' by default.
3305
3306 o coding-category-iso-8-else
3307
3308 The category for a coding system which has the same code range
3309 as ISO2022 of 8-bit environment but uses locking shift or
3310 single shift functions. Assigned the coding-system (Lisp
3311 symbol) `iso-2022-8bit-ss2' by default.
4ed46869
KH
3312
3313 o coding-category-big5
3314
3315 The category for a coding system which has the same code range
3316 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 3317 `cn-big5' by default.
4ed46869 3318
1397dc18
KH
3319 o coding-category-ccl
3320
3321 The category for a coding system of which encoder/decoder is
3322 written in CCL programs. The default value is nil, i.e., no
3323 coding system is assigned.
3324
4ed46869
KH
3325 o coding-category-binary
3326
3327 The category for a coding system not categorized in any of the
3328 above. Assigned the coding-system (Lisp symbol)
e0e989f6 3329 `no-conversion' by default.
4ed46869
KH
3330
3331 Each of them is a Lisp symbol and the value is an actual
3332 `coding-system's (this is also a Lisp symbol) assigned by a user.
3333 What Emacs does actually is to detect a category of coding system.
3334 Then, it uses a `coding-system' assigned to it. If Emacs can't
3335 decide only one possible category, it selects a category of the
3336 highest priority. Priorities of categories are also specified by a
3337 user in a Lisp variable `coding-category-list'.
3338
3339*/
3340
66cfb530
KH
3341static
3342int ascii_skip_code[256];
3343
d46c5b12 3344/* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4ed46869
KH
3345 If it detects possible coding systems, return an integer in which
3346 appropriate flag bits are set. Flag bits are defined by macros
d46c5b12 3347 CODING_CATEGORY_MASK_XXX in `coding.h'.
4ed46869 3348
d46c5b12
KH
3349 How many ASCII characters are at the head is returned as *SKIP. */
3350
3351static int
3352detect_coding_mask (source, src_bytes, priorities, skip)
3353 unsigned char *source;
3354 int src_bytes, *priorities, *skip;
4ed46869
KH
3355{
3356 register unsigned char c;
d46c5b12 3357 unsigned char *src = source, *src_end = source + src_bytes;
66cfb530 3358 unsigned int mask;
d46c5b12 3359 int i;
4ed46869
KH
3360
3361 /* At first, skip all ASCII characters and control characters except
3362 for three ISO2022 specific control characters. */
66cfb530
KH
3363 ascii_skip_code[ISO_CODE_SO] = 0;
3364 ascii_skip_code[ISO_CODE_SI] = 0;
3365 ascii_skip_code[ISO_CODE_ESC] = 0;
3366
bcf26d6a 3367 label_loop_detect_coding:
66cfb530 3368 while (src < src_end && ascii_skip_code[*src]) src++;
d46c5b12 3369 *skip = src - source;
4ed46869
KH
3370
3371 if (src >= src_end)
3372 /* We found nothing other than ASCII. There's nothing to do. */
d46c5b12 3373 return 0;
4ed46869 3374
8a8147d6 3375 c = *src;
4ed46869
KH
3376 /* The text seems to be encoded in some multilingual coding system.
3377 Now, try to find in which coding system the text is encoded. */
3378 if (c < 0x80)
bcf26d6a
KH
3379 {
3380 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3381 /* C is an ISO2022 specific control code of C0. */
3382 mask = detect_coding_iso2022 (src, src_end);
1b2af4b0 3383 if (mask == 0)
d46c5b12
KH
3384 {
3385 /* No valid ISO2022 code follows C. Try again. */
3386 src++;
66cfb530
KH
3387 if (c == ISO_CODE_ESC)
3388 ascii_skip_code[ISO_CODE_ESC] = 1;
3389 else
3390 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
d46c5b12
KH
3391 goto label_loop_detect_coding;
3392 }
3393 if (priorities)
3394 goto label_return_highest_only;
bcf26d6a 3395 }
d46c5b12 3396 else
c4825358 3397 {
d46c5b12 3398 int try;
4ed46869 3399
d46c5b12
KH
3400 if (c < 0xA0)
3401 {
3402 /* C is the first byte of SJIS character code,
3403 or a leading-code of Emacs' internal format (emacs-mule). */
3404 try = CODING_CATEGORY_MASK_SJIS | CODING_CATEGORY_MASK_EMACS_MULE;
3405
3406 /* Or, if C is a special latin extra code,
3407 or is an ISO2022 specific control code of C1 (SS2 or SS3),
3408 or is an ISO2022 control-sequence-introducer (CSI),
3409 we should also consider the possibility of ISO2022 codings. */
3410 if ((VECTORP (Vlatin_extra_code_table)
3411 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3412 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3413 || (c == ISO_CODE_CSI
3414 && (src < src_end
3415 && (*src == ']'
3416 || ((*src == '0' || *src == '1' || *src == '2')
3417 && src + 1 < src_end
3418 && src[1] == ']')))))
3419 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3420 | CODING_CATEGORY_MASK_ISO_8BIT);
3421 }
c4825358 3422 else
d46c5b12
KH
3423 /* C is a character of ISO2022 in graphic plane right,
3424 or a SJIS's 1-byte character code (i.e. JISX0201),
3425 or the first byte of BIG5's 2-byte code. */
3426 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3427 | CODING_CATEGORY_MASK_ISO_8BIT
3428 | CODING_CATEGORY_MASK_SJIS
3429 | CODING_CATEGORY_MASK_BIG5);
3430
1397dc18
KH
3431 /* Or, we may have to consider the possibility of CCL. */
3432 if (coding_system_table[CODING_CATEGORY_IDX_CCL]
3433 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
3434 ->spec.ccl.valid_codes)[c])
3435 try |= CODING_CATEGORY_MASK_CCL;
3436
d46c5b12
KH
3437 mask = 0;
3438 if (priorities)
3439 {
3440 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3441 {
5ab13dd0 3442 if (priorities[i] & try & CODING_CATEGORY_MASK_ISO)
d46c5b12 3443 mask = detect_coding_iso2022 (src, src_end);
5ab13dd0 3444 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
d46c5b12 3445 mask = detect_coding_sjis (src, src_end);
5ab13dd0 3446 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
d46c5b12 3447 mask = detect_coding_big5 (src, src_end);
5ab13dd0 3448 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
d46c5b12 3449 mask = detect_coding_emacs_mule (src, src_end);
89fa8b36 3450 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
1397dc18 3451 mask = detect_coding_ccl (src, src_end);
5ab13dd0
RS
3452 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
3453 mask = CODING_CATEGORY_MASK_RAW_TEXT;
3454 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
3455 mask = CODING_CATEGORY_MASK_BINARY;
d46c5b12
KH
3456 if (mask)
3457 goto label_return_highest_only;
3458 }
3459 return CODING_CATEGORY_MASK_RAW_TEXT;
3460 }
3461 if (try & CODING_CATEGORY_MASK_ISO)
3462 mask |= detect_coding_iso2022 (src, src_end);
3463 if (try & CODING_CATEGORY_MASK_SJIS)
3464 mask |= detect_coding_sjis (src, src_end);
3465 if (try & CODING_CATEGORY_MASK_BIG5)
3466 mask |= detect_coding_big5 (src, src_end);
3467 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
1397dc18
KH
3468 mask |= detect_coding_emacs_mule (src, src_end);
3469 if (try & CODING_CATEGORY_MASK_CCL)
3470 mask |= detect_coding_ccl (src, src_end);
c4825358 3471 }
5ab13dd0 3472 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
d46c5b12
KH
3473
3474 label_return_highest_only:
3475 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3476 {
3477 if (mask & priorities[i])
3478 return priorities[i];
3479 }
3480 return CODING_CATEGORY_MASK_RAW_TEXT;
4ed46869
KH
3481}
3482
3483/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3484 The information of the detected coding system is set in CODING. */
3485
3486void
3487detect_coding (coding, src, src_bytes)
3488 struct coding_system *coding;
3489 unsigned char *src;
3490 int src_bytes;
3491{
d46c5b12
KH
3492 unsigned int idx;
3493 int skip, mask, i;
84d60297 3494 Lisp_Object val;
4ed46869 3495
84d60297 3496 val = Vcoding_category_list;
66cfb530 3497 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip);
d46c5b12 3498 coding->heading_ascii = skip;
4ed46869 3499
d46c5b12
KH
3500 if (!mask) return;
3501
3502 /* We found a single coding system of the highest priority in MASK. */
3503 idx = 0;
3504 while (mask && ! (mask & 1)) mask >>= 1, idx++;
3505 if (! mask)
3506 idx = CODING_CATEGORY_IDX_RAW_TEXT;
4ed46869 3507
d46c5b12
KH
3508 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3509
3510 if (coding->eol_type != CODING_EOL_UNDECIDED)
27901516 3511 {
84d60297 3512 Lisp_Object tmp;
d46c5b12 3513
84d60297 3514 tmp = Fget (val, Qeol_type);
d46c5b12
KH
3515 if (VECTORP (tmp))
3516 val = XVECTOR (tmp)->contents[coding->eol_type];
4ed46869 3517 }
d46c5b12
KH
3518 setup_coding_system (val, coding);
3519 /* Set this again because setup_coding_system reset this member. */
3520 coding->heading_ascii = skip;
4ed46869
KH
3521}
3522
d46c5b12
KH
3523/* Detect how end-of-line of a text of length SRC_BYTES pointed by
3524 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3525 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3526
3527 How many non-eol characters are at the head is returned as *SKIP. */
4ed46869 3528
bc4bc72a
RS
3529#define MAX_EOL_CHECK_COUNT 3
3530
d46c5b12
KH
3531static int
3532detect_eol_type (source, src_bytes, skip)
3533 unsigned char *source;
3534 int src_bytes, *skip;
4ed46869 3535{
d46c5b12 3536 unsigned char *src = source, *src_end = src + src_bytes;
4ed46869 3537 unsigned char c;
bc4bc72a
RS
3538 int total = 0; /* How many end-of-lines are found so far. */
3539 int eol_type = CODING_EOL_UNDECIDED;
3540 int this_eol_type;
4ed46869 3541
d46c5b12
KH
3542 *skip = 0;
3543
bc4bc72a 3544 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4ed46869
KH
3545 {
3546 c = *src++;
bc4bc72a 3547 if (c == '\n' || c == '\r')
4ed46869 3548 {
d46c5b12
KH
3549 if (*skip == 0)
3550 *skip = src - 1 - source;
bc4bc72a
RS
3551 total++;
3552 if (c == '\n')
3553 this_eol_type = CODING_EOL_LF;
3554 else if (src >= src_end || *src != '\n')
3555 this_eol_type = CODING_EOL_CR;
4ed46869 3556 else
bc4bc72a
RS
3557 this_eol_type = CODING_EOL_CRLF, src++;
3558
3559 if (eol_type == CODING_EOL_UNDECIDED)
3560 /* This is the first end-of-line. */
3561 eol_type = this_eol_type;
3562 else if (eol_type != this_eol_type)
d46c5b12
KH
3563 {
3564 /* The found type is different from what found before. */
3565 eol_type = CODING_EOL_INCONSISTENT;
3566 break;
3567 }
4ed46869
KH
3568 }
3569 }
bc4bc72a 3570
d46c5b12
KH
3571 if (*skip == 0)
3572 *skip = src_end - source;
85a02ca4 3573 return eol_type;
4ed46869
KH
3574}
3575
3576/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3577 is encoded. If it detects an appropriate format of end-of-line, it
3578 sets the information in *CODING. */
3579
3580void
3581detect_eol (coding, src, src_bytes)
3582 struct coding_system *coding;
3583 unsigned char *src;
3584 int src_bytes;
3585{
4608c386 3586 Lisp_Object val;
d46c5b12
KH
3587 int skip;
3588 int eol_type = detect_eol_type (src, src_bytes, &skip);
3589
3590 if (coding->heading_ascii > skip)
3591 coding->heading_ascii = skip;
3592 else
3593 skip = coding->heading_ascii;
4ed46869 3594
0ef69138 3595 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869 3596 return;
27901516
KH
3597 if (eol_type == CODING_EOL_INCONSISTENT)
3598 {
3599#if 0
3600 /* This code is suppressed until we find a better way to
992f23f2 3601 distinguish raw text file and binary file. */
27901516
KH
3602
3603 /* If we have already detected that the coding is raw-text, the
3604 coding should actually be no-conversion. */
3605 if (coding->type == coding_type_raw_text)
3606 {
3607 setup_coding_system (Qno_conversion, coding);
3608 return;
3609 }
3610 /* Else, let's decode only text code anyway. */
3611#endif /* 0 */
1b2af4b0 3612 eol_type = CODING_EOL_LF;
27901516
KH
3613 }
3614
4608c386 3615 val = Fget (coding->symbol, Qeol_type);
4ed46869 3616 if (VECTORP (val) && XVECTOR (val)->size == 3)
d46c5b12
KH
3617 {
3618 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
3619 coding->heading_ascii = skip;
3620 }
3621}
3622
/* Number of spare bytes added to every conversion buffer size
   estimate.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which the byte count of a text may grow when it is
   decoded by CODING (a pointer to a struct coding_system): 3 for
   ISO-2022, 2 for SJIS and BIG5, 1 for raw text, the decoder's own
   buf_magnification for CCL, and 2 for everything else.

   CODING is parenthesized at each use so that any pointer expression
   (e.g. `&cs' or `p + 1') is accepted.  Note that it is evaluated
   more than once; do not pass an expression with side effects.  */
#define DECODING_BUFFER_MAG(coding) \
  ((coding)->type == coding_type_iso2022 \
   ? 3 \
   : (((coding)->type == coding_type_sjis \
       || (coding)->type == coding_type_big5) \
      ? 2 \
      : ((coding)->type == coding_type_raw_text \
         ? 1 \
         : ((coding)->type == coding_type_ccl \
            ? (coding)->spec.ccl.decoder.buf_magnification \
            : 2))))
3635
3636/* Return maximum size (bytes) of a buffer enough for decoding
3637 SRC_BYTES of text encoded in CODING. */
3638
3639int
3640decoding_buffer_size (coding, src_bytes)
3641 struct coding_system *coding;
3642 int src_bytes;
3643{
3644 return (src_bytes * DECODING_BUFFER_MAG (coding)
3645 + CONVERSION_BUFFER_EXTRA_ROOM);
3646}
3647
3648/* Return maximum size (bytes) of a buffer enough for encoding
3649 SRC_BYTES of text to CODING. */
3650
3651int
3652encoding_buffer_size (coding, src_bytes)
3653 struct coding_system *coding;
3654 int src_bytes;
3655{
3656 int magnification;
3657
3658 if (coding->type == coding_type_ccl)
3659 magnification = coding->spec.ccl.encoder.buf_magnification;
3660 else
3661 magnification = 3;
3662
3663 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3664}
3665
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* Scratch buffer handed out by get_conversion_buffer, and its
   current capacity in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.  The buffer is shared between calls; when it
   has to grow, a new one is allocated with xmalloc, the old one is
   freed, and its contents are NOT carried over.

   This function does not itself check for allocation failure; running
   out of memory is handled by whatever xmalloc does in that case.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      /* Grow by doubling.  If no capacity has been recorded yet,
         start from the minimum size; doubling zero would never reach
         SIZE and the loop below would not terminate.  */
      int real_size = (conversion_buffer_size > 0
                       ? conversion_buffer_size * 2
                       : MINIMUM_CONVERSION_BUFFER_SIZE);

      while (real_size < size) real_size *= 2;
      buf = (char *) xmalloc (real_size);
      if (conversion_buffer)
        xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
3694
3695int
3696ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
3697 struct coding_system *coding;
3698 unsigned char *source, *destination;
3699 int src_bytes, dst_bytes, encodep;
3700{
3701 struct ccl_program *ccl
3702 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
3703 int result;
3704
ae9ff118 3705 ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
7b179c2d 3706
d46c5b12
KH
3707 coding->produced = ccl_driver (ccl, source, destination,
3708 src_bytes, dst_bytes, &(coding->consumed));
69f76525 3709 coding->produced_char
48942766
KH
3710 = (encodep
3711 ? coding->produced
3712 : multibyte_chars_in_text (destination, coding->produced));
69f76525
KH
3713 coding->consumed_char
3714 = multibyte_chars_in_text (source, coding->consumed);
3715
d46c5b12
KH
3716 switch (ccl->status)
3717 {
3718 case CCL_STAT_SUSPEND_BY_SRC:
3719 result = CODING_FINISH_INSUFFICIENT_SRC;
3720 break;
3721 case CCL_STAT_SUSPEND_BY_DST:
3722 result = CODING_FINISH_INSUFFICIENT_DST;
3723 break;
9864ebce
KH
3724 case CCL_STAT_QUIT:
3725 case CCL_STAT_INVALID_CMD:
3726 result = CODING_FINISH_INTERRUPT;
3727 break;
d46c5b12
KH
3728 default:
3729 result = CODING_FINISH_NORMAL;
3730 break;
3731 }
3732 return result;
4ed46869
KH
3733}
3734
3735/* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
3736 decoding, it may detect coding system and format of end-of-line if
52d41803
KH
3737 those are not yet decided.
3738
3739 This function does not make full use of DESTINATION buffer. For
3740 instance, if coding->type is coding_type_iso2022, it uses only
3741 (DST_BYTES - 7) bytes of DESTINATION buffer. In the case that
3742 DST_BYTES is decided by the function decoding_buffer_size, it
3743 contains extra 256 bytes (defined by CONVERSION_BUFFER_EXTRA_ROOM).
3744 So, this function can decode the full SOURCE. But, in the other
3745 case, if you want to avoid carry over, you must supply at least 7
3746 bytes more area in DESTINATION buffer than expected maximum bytes
3747 that will be produced by this function. */
4ed46869
KH
3748
3749int
d46c5b12 3750decode_coding (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
3751 struct coding_system *coding;
3752 unsigned char *source, *destination;
3753 int src_bytes, dst_bytes;
4ed46869 3754{
d46c5b12 3755 int result;
4ed46869 3756
d4e57bcd 3757 if (src_bytes <= 0
944bd420 3758 && coding->type != coding_type_ccl
d4e57bcd
KH
3759 && ! (coding->mode & CODING_MODE_LAST_BLOCK
3760 && CODING_REQUIRE_FLUSHING (coding)))
4ed46869 3761 {
d46c5b12
KH
3762 coding->produced = coding->produced_char = 0;
3763 coding->consumed = coding->consumed_char = 0;
fb88bf2d 3764 coding->fake_multibyte = 0;
d46c5b12 3765 return CODING_FINISH_NORMAL;
4ed46869
KH
3766 }
3767
0ef69138 3768 if (coding->type == coding_type_undecided)
4ed46869
KH
3769 detect_coding (coding, source, src_bytes);
3770
0ef69138 3771 if (coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
3772 detect_eol (coding, source, src_bytes);
3773
4ed46869
KH
3774 switch (coding->type)
3775 {
0ef69138
KH
3776 case coding_type_emacs_mule:
3777 case coding_type_undecided:
27901516 3778 case coding_type_raw_text:
4ed46869 3779 if (coding->eol_type == CODING_EOL_LF
0ef69138 3780 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869 3781 goto label_no_conversion;
d46c5b12 3782 result = decode_eol (coding, source, destination, src_bytes, dst_bytes);
4ed46869
KH
3783 break;
3784
3785 case coding_type_sjis:
d46c5b12
KH
3786 result = decode_coding_sjis_big5 (coding, source, destination,
3787 src_bytes, dst_bytes, 1);
4ed46869
KH
3788 break;
3789
3790 case coding_type_iso2022:
d46c5b12
KH
3791 result = decode_coding_iso2022 (coding, source, destination,
3792 src_bytes, dst_bytes);
4ed46869
KH
3793 break;
3794
3795 case coding_type_big5:
d46c5b12
KH
3796 result = decode_coding_sjis_big5 (coding, source, destination,
3797 src_bytes, dst_bytes, 0);
4ed46869
KH
3798 break;
3799
3800 case coding_type_ccl:
d46c5b12
KH
3801 result = ccl_coding_driver (coding, source, destination,
3802 src_bytes, dst_bytes, 0);
3803 break;
3804
3805 default: /* i.e. case coding_type_no_conversion: */
3806 label_no_conversion:
3807 if (dst_bytes && src_bytes > dst_bytes)
3808 {
3809 coding->produced = dst_bytes;
3810 result = CODING_FINISH_INSUFFICIENT_DST;
3811 }
3812 else
3813 {
3814 coding->produced = src_bytes;
3815 result = CODING_FINISH_NORMAL;
3816 }
3817 if (dst_bytes)
3818 bcopy (source, destination, coding->produced);
3819 else
3820 safe_bcopy (source, destination, coding->produced);
fb88bf2d 3821 coding->fake_multibyte = 1;
d46c5b12
KH
3822 coding->consumed
3823 = coding->consumed_char = coding->produced_char = coding->produced;
4ed46869
KH
3824 break;
3825 }
3826
d46c5b12 3827 return result;
4ed46869
KH
3828}
3829
52d41803
KH
3830/* See "GENERAL NOTES about `encode_coding_XXX ()' functions".
3831
3832 This function does not make full use of DESTINATION buffer. For
3833 instance, if coding->type is coding_type_iso2022, it uses only
3834 (DST_BYTES - 20) bytes of DESTINATION buffer. In the case that
3835 DST_BYTES is decided by the function encoding_buffer_size, it
3836 contains extra 256 bytes (defined by CONVERSION_BUFFER_EXTRA_ROOM).
3837 So, this function can encode the full SOURCE. But, in the other
3838 case, if you want to avoid carry over, you must supply at least 20
3839 bytes more area in DESTINATION buffer than expected maximum bytes
3840 that will be produced by this function. */
4ed46869
KH
3841
3842int
d46c5b12 3843encode_coding (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
3844 struct coding_system *coding;
3845 unsigned char *source, *destination;
3846 int src_bytes, dst_bytes;
4ed46869 3847{
d46c5b12 3848 int result;
4ed46869 3849
d4e57bcd
KH
3850 if (src_bytes <= 0
3851 && ! (coding->mode & CODING_MODE_LAST_BLOCK
3852 && CODING_REQUIRE_FLUSHING (coding)))
4ed46869 3853 {
d46c5b12
KH
3854 coding->produced = coding->produced_char = 0;
3855 coding->consumed = coding->consumed_char = 0;
fb88bf2d 3856 coding->fake_multibyte = 0;
d46c5b12
KH
3857 return CODING_FINISH_NORMAL;
3858 }
4ed46869 3859
d46c5b12
KH
3860 switch (coding->type)
3861 {
0ef69138
KH
3862 case coding_type_emacs_mule:
3863 case coding_type_undecided:
27901516 3864 case coding_type_raw_text:
4ed46869 3865 if (coding->eol_type == CODING_EOL_LF
0ef69138 3866 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869 3867 goto label_no_conversion;
d46c5b12 3868 result = encode_eol (coding, source, destination, src_bytes, dst_bytes);
4ed46869
KH
3869 break;
3870
3871 case coding_type_sjis:
d46c5b12
KH
3872 result = encode_coding_sjis_big5 (coding, source, destination,
3873 src_bytes, dst_bytes, 1);
4ed46869
KH
3874 break;
3875
3876 case coding_type_iso2022:
d46c5b12
KH
3877 result = encode_coding_iso2022 (coding, source, destination,
3878 src_bytes, dst_bytes);
4ed46869
KH
3879 break;
3880
3881 case coding_type_big5:
d46c5b12
KH
3882 result = encode_coding_sjis_big5 (coding, source, destination,
3883 src_bytes, dst_bytes, 0);
4ed46869
KH
3884 break;
3885
3886 case coding_type_ccl:
d46c5b12
KH
3887 result = ccl_coding_driver (coding, source, destination,
3888 src_bytes, dst_bytes, 1);
3889 break;
3890
3891 default: /* i.e. case coding_type_no_conversion: */
3892 label_no_conversion:
3893 if (dst_bytes && src_bytes > dst_bytes)
3894 {
3895 coding->produced = dst_bytes;
3896 result = CODING_FINISH_INSUFFICIENT_DST;
3897 }
3898 else
3899 {
3900 coding->produced = src_bytes;
3901 result = CODING_FINISH_NORMAL;
3902 }
3903 if (dst_bytes)
3904 bcopy (source, destination, coding->produced);
3905 else
3906 safe_bcopy (source, destination, coding->produced);
3907 if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
3908 {
3909 unsigned char *p = destination, *pend = p + coding->produced;
3910 while (p < pend)
3911 if (*p++ == '\015') p[-1] = '\n';
3912 }
fb88bf2d 3913 coding->fake_multibyte = 1;
d46c5b12
KH
3914 coding->consumed
3915 = coding->consumed_char = coding->produced_char = coding->produced;
4ed46869
KH
3916 break;
3917 }
3918
d46c5b12 3919 return result;
4ed46869
KH
3920}
3921
fb88bf2d
KH
3922/* Scan text in the region between *BEG and *END (byte positions),
3923 skip characters which we don't have to decode by coding system
3924 CODING at the head and tail, then set *BEG and *END to the region
3925 of the text we actually have to convert. The caller should move
3926 the gap out of the region in advance.
4ed46869 3927
d46c5b12
KH
3928 If STR is not NULL, *BEG and *END are indices into STR. */
3929
3930static void
3931shrink_decoding_region (beg, end, coding, str)
3932 int *beg, *end;
3933 struct coding_system *coding;
3934 unsigned char *str;
3935{
fb88bf2d 3936 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
d46c5b12 3937 int eol_conversion;
88993dfd 3938 Lisp_Object translation_table;
d46c5b12
KH
3939
3940 if (coding->type == coding_type_ccl
3941 || coding->type == coding_type_undecided
3942 || !NILP (coding->post_read_conversion))
3943 {
3944 /* We can't skip any data. */
3945 return;
3946 }
3947 else if (coding->type == coding_type_no_conversion)
3948 {
fb88bf2d
KH
3949 /* We need no conversion, but don't have to skip any data here.
3950 Decoding routine handles them effectively anyway. */
d46c5b12
KH
3951 return;
3952 }
3953
88993dfd
KH
3954 translation_table = coding->translation_table_for_decode;
3955 if (NILP (translation_table) && !NILP (Venable_character_translation))
3956 translation_table = Vstandard_translation_table_for_decode;
3957 if (CHAR_TABLE_P (translation_table))
3958 {
3959 int i;
3960 for (i = 0; i < 128; i++)
3961 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
3962 break;
3963 if (i < 128)
3964 /* Some ASCII character should be tranlsated. We give up
3965 shrinking. */
3966 return;
3967 }
3968
aa60dea6
KH
3969 eol_conversion = (coding->eol_type != CODING_EOL_LF);
3970
3971 if ((! eol_conversion) && (coding->heading_ascii >= 0))
d46c5b12
KH
3972 /* Detection routine has already found how much we can skip at the
3973 head. */
3974 *beg += coding->heading_ascii;
3975
3976 if (str)
3977 {
3978 begp_orig = begp = str + *beg;
3979 endp_orig = endp = str + *end;
3980 }
3981 else
3982 {
fb88bf2d 3983 begp_orig = begp = BYTE_POS_ADDR (*beg);
d46c5b12
KH
3984 endp_orig = endp = begp + *end - *beg;
3985 }
3986
d46c5b12
KH
3987 switch (coding->type)
3988 {
3989 case coding_type_emacs_mule:
3990 case coding_type_raw_text:
3991 if (eol_conversion)
3992 {
3993 if (coding->heading_ascii < 0)
fb88bf2d 3994 while (begp < endp && *begp != '\r' && *begp < 0x80) begp++;
ee59c65f 3995 while (begp < endp && endp[-1] != '\r' && endp[-1] < 0x80)
fb88bf2d 3996 endp--;
ee59c65f
RS
3997 /* Do not consider LF as ascii if preceded by CR, since that
3998 confuses eol decoding. */
3999 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4000 endp++;
d46c5b12
KH
4001 }
4002 else
4003 begp = endp;
4004 break;
4005
4006 case coding_type_sjis:
4007 case coding_type_big5:
4008 /* We can skip all ASCII characters at the head. */
4009 if (coding->heading_ascii < 0)
4010 {
4011 if (eol_conversion)
de9d083c 4012 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
d46c5b12
KH
4013 else
4014 while (begp < endp && *begp < 0x80) begp++;
4015 }
4016 /* We can skip all ASCII characters at the tail except for the
4017 second byte of SJIS or BIG5 code. */
4018 if (eol_conversion)
de9d083c 4019 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
d46c5b12
KH
4020 else
4021 while (begp < endp && endp[-1] < 0x80) endp--;
ee59c65f
RS
4022 /* Do not consider LF as ascii if preceded by CR, since that
4023 confuses eol decoding. */
4024 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4025 endp++;
d46c5b12
KH
4026 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
4027 endp++;
4028 break;
4029
4030 default: /* i.e. case coding_type_iso2022: */
622fece5
KH
4031 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4032 /* We can't skip any data. */
4033 break;
d46c5b12
KH
4034 if (coding->heading_ascii < 0)
4035 {
d46c5b12
KH
4036 /* We can skip all ASCII characters at the head except for a
4037 few control codes. */
4038 while (begp < endp && (c = *begp) < 0x80
4039 && c != ISO_CODE_CR && c != ISO_CODE_SO
4040 && c != ISO_CODE_SI && c != ISO_CODE_ESC
4041 && (!eol_conversion || c != ISO_CODE_LF))
4042 begp++;
4043 }
4044 switch (coding->category_idx)
4045 {
4046 case CODING_CATEGORY_IDX_ISO_8_1:
4047 case CODING_CATEGORY_IDX_ISO_8_2:
4048 /* We can skip all ASCII characters at the tail. */
4049 if (eol_conversion)
de9d083c 4050 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
d46c5b12
KH
4051 else
4052 while (begp < endp && endp[-1] < 0x80) endp--;
ee59c65f
RS
4053 /* Do not consider LF as ascii if preceded by CR, since that
4054 confuses eol decoding. */
4055 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4056 endp++;
d46c5b12
KH
4057 break;
4058
4059 case CODING_CATEGORY_IDX_ISO_7:
4060 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
de79a6a5
KH
4061 {
4062 /* We can skip all charactes at the tail except for 8-bit
4063 codes and ESC and the following 2-byte at the tail. */
4064 unsigned char *eight_bit = NULL;
4065
4066 if (eol_conversion)
4067 while (begp < endp
4068 && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4069 {
4070 if (!eight_bit && c & 0x80) eight_bit = endp;
4071 endp--;
4072 }
4073 else
4074 while (begp < endp
4075 && (c = endp[-1]) != ISO_CODE_ESC)
4076 {
4077 if (!eight_bit && c & 0x80) eight_bit = endp;
4078 endp--;
4079 }
4080 /* Do not consider LF as ascii if preceded by CR, since that
4081 confuses eol decoding. */
4082 if (begp < endp && endp < endp_orig
4083 && endp[-1] == '\r' && endp[0] == '\n')
4084 endp++;
4085 if (begp < endp && endp[-1] == ISO_CODE_ESC)
4086 {
4087 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
4088 /* This is an ASCII designation sequence. We can
4089 surely skip the tail. But, if we have
4090 encountered an 8-bit code, skip only the codes
4091 after that. */
4092 endp = eight_bit ? eight_bit : endp + 2;
4093 else
4094 /* Hmmm, we can't skip the tail. */
4095 endp = endp_orig;
4096 }
4097 else if (eight_bit)
4098 endp = eight_bit;
4099 }
d46c5b12
KH
4100 }
4101 }
4102 *beg += begp - begp_orig;
4103 *end += endp - endp_orig;
4104 return;
4105}
4106
4107/* Like shrink_decoding_region but for encoding. */
4108
4109static void
4110shrink_encoding_region (beg, end, coding, str)
4111 int *beg, *end;
4112 struct coding_system *coding;
4113 unsigned char *str;
4114{
4115 unsigned char *begp_orig, *begp, *endp_orig, *endp;
4116 int eol_conversion;
88993dfd 4117 Lisp_Object translation_table;
d46c5b12
KH
4118
4119 if (coding->type == coding_type_ccl)
4120 /* We can't skip any data. */
4121 return;
4122 else if (coding->type == coding_type_no_conversion)
4123 {
4124 /* We need no conversion. */
4125 *beg = *end;
4126 return;
4127 }
4128
88993dfd
KH
4129 translation_table = coding->translation_table_for_encode;
4130 if (NILP (translation_table) && !NILP (Venable_character_translation))
4131 translation_table = Vstandard_translation_table_for_encode;
4132 if (CHAR_TABLE_P (translation_table))
4133 {
4134 int i;
4135 for (i = 0; i < 128; i++)
4136 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4137 break;
4138 if (i < 128)
4139 /* Some ASCII character should be tranlsated. We give up
4140 shrinking. */
4141 return;
4142 }
4143
d46c5b12
KH
4144 if (str)
4145 {
4146 begp_orig = begp = str + *beg;
4147 endp_orig = endp = str + *end;
4148 }
4149 else
4150 {
fb88bf2d 4151 begp_orig = begp = BYTE_POS_ADDR (*beg);
d46c5b12
KH
4152 endp_orig = endp = begp + *end - *beg;
4153 }
4154
4155 eol_conversion = (coding->eol_type == CODING_EOL_CR
4156 || coding->eol_type == CODING_EOL_CRLF);
4157
4158 /* Here, we don't have to check coding->pre_write_conversion because
4159 the caller is expected to have handled it already. */
4160 switch (coding->type)
4161 {
4162 case coding_type_undecided:
4163 case coding_type_emacs_mule:
4164 case coding_type_raw_text:
4165 if (eol_conversion)
4166 {
4167 while (begp < endp && *begp != '\n') begp++;
4168 while (begp < endp && endp[-1] != '\n') endp--;
4169 }
4170 else
4171 begp = endp;
4172 break;
4173
4174 case coding_type_iso2022:
622fece5
KH
4175 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4176 /* We can't skip any data. */
4177 break;
d46c5b12
KH
4178 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
4179 {
4180 unsigned char *bol = begp;
4181 while (begp < endp && *begp < 0x80)
4182 {
4183 begp++;
4184 if (begp[-1] == '\n')
4185 bol = begp;
4186 }
4187 begp = bol;
4188 goto label_skip_tail;
4189 }
4190 /* fall down ... */
4191
4192 default:
4193 /* We can skip all ASCII characters at the head and tail. */
4194 if (eol_conversion)
4195 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
4196 else
4197 while (begp < endp && *begp < 0x80) begp++;
4198 label_skip_tail:
4199 if (eol_conversion)
4200 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
4201 else
4202 while (begp < endp && *(endp - 1) < 0x80) endp--;
4203 break;
4204 }
4205
4206 *beg += begp - begp_orig;
4207 *end += endp - endp_orig;
4208 return;
4209}
4210
88993dfd
KH
/* Shrinking a conversion region has some overhead of its own, so it
   is attempted only for regions longer than this many bytes.  */
static int shrink_conversion_region_threshhold = 1024;

/* Narrow the region between the byte positions *BEG and *END to the
   part that needs conversion by CODING, using shrink_encoding_region
   if ENCODEP is nonzero and shrink_decoding_region otherwise.  STR
   is passed through to those functions.  Regions not longer than
   shrink_conversion_region_threshhold are left untouched.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep) \
  do \
    { \
      if (shrink_conversion_region_threshhold < *(end) - *(beg)) \
        { \
          if (! (encodep)) \
            shrink_decoding_region (beg, end, coding, str); \
          else \
            shrink_encoding_region (beg, end, coding, str); \
        } \
    } \
  while (0)
4224
d46c5b12 4225/* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
fb88bf2d
KH
4226 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4227 coding system CODING, and return the status code of code conversion
4228 (currently, this value has no meaning).
4229
4230 How many characters (and bytes) are converted to how many
4231 characters (and bytes) are recorded in members of the structure
4232 CODING.
d46c5b12 4233
6e44253b 4234 If REPLACE is nonzero, we do various things as if the original text
d46c5b12 4235 is deleted and a new text is inserted. See the comments in
6e44253b 4236 replace_range (insdel.c) to know what we are doing. */
4ed46869
KH
4237
4238int
6e44253b
KH
4239code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
4240 int from, from_byte, to, to_byte, encodep, replace;
4ed46869 4241 struct coding_system *coding;
4ed46869 4242{
fb88bf2d
KH
4243 int len = to - from, len_byte = to_byte - from_byte;
4244 int require, inserted, inserted_byte;
12410ef1 4245 int head_skip, tail_skip, total_skip;
84d60297 4246 Lisp_Object saved_coding_symbol;
fb88bf2d
KH
4247 int multibyte = !NILP (current_buffer->enable_multibyte_characters);
4248 int first = 1;
4249 int fake_multibyte = 0;
4250 unsigned char *src, *dst;
84d60297 4251 Lisp_Object deletion;
e133c8fa 4252 int orig_point = PT, orig_len = len;
6abb9bd9 4253 int prev_Z;
84d60297
RS
4254
4255 deletion = Qnil;
4256 saved_coding_symbol = Qnil;
d46c5b12 4257
83fa074f 4258 if (from < PT && PT < to)
e133c8fa
KH
4259 {
4260 TEMP_SET_PT_BOTH (from, from_byte);
4261 orig_point = from;
4262 }
83fa074f 4263
6e44253b 4264 if (replace)
d46c5b12 4265 {
fb88bf2d
KH
4266 int saved_from = from;
4267
d46c5b12 4268 prepare_to_modify_buffer (from, to, &from);
fb88bf2d
KH
4269 if (saved_from != from)
4270 {
4271 to = from + len;
4272 if (multibyte)
4273 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
4274 else
4275 from_byte = from, to_byte = to;
4276 len_byte = to_byte - from_byte;
4277 }
d46c5b12 4278 }
d46c5b12
KH
4279
4280 if (! encodep && CODING_REQUIRE_DETECTION (coding))
4281 {
12410ef1 4282 /* We must detect encoding of text and eol format. */
d46c5b12
KH
4283
4284 if (from < GPT && to > GPT)
4285 move_gap_both (from, from_byte);
4286 if (coding->type == coding_type_undecided)
4287 {
fb88bf2d 4288 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
d46c5b12 4289 if (coding->type == coding_type_undecided)
12410ef1
KH
4290 /* It seems that the text contains only ASCII, but we
4291 should not left it undecided because the deeper
4292 decoding routine (decode_coding) tries to detect the
4293 encodings again in vain. */
d46c5b12
KH
4294 coding->type = coding_type_emacs_mule;
4295 }
4296 if (coding->eol_type == CODING_EOL_UNDECIDED)
4297 {
4298 saved_coding_symbol = coding->symbol;
4299 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
4300 if (coding->eol_type == CODING_EOL_UNDECIDED)
4301 coding->eol_type = CODING_EOL_LF;
4302 /* We had better recover the original eol format if we
4303 encounter an inconsitent eol format while decoding. */
4304 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4305 }
4306 }
4307
fb88bf2d
KH
4308 coding->consumed_char = len, coding->consumed = len_byte;
4309
d46c5b12
KH
4310 if (encodep
4311 ? ! CODING_REQUIRE_ENCODING (coding)
4312 : ! CODING_REQUIRE_DECODING (coding))
fb88bf2d
KH
4313 {
4314 coding->produced = len_byte;
12410ef1
KH
4315 if (multibyte
4316 && ! replace
4317 /* See the comment of the member heading_ascii in coding.h. */
4318 && coding->heading_ascii < len_byte)
fb88bf2d 4319 {
6e44253b
KH
4320 /* We still may have to combine byte at the head and the
4321 tail of the text in the region. */
12410ef1 4322 if (from < GPT && GPT < to)
6e44253b 4323 move_gap_both (to, to_byte);
12410ef1
KH
4324 len = multibyte_chars_in_text (BYTE_POS_ADDR (from_byte), len_byte);
4325 adjust_after_insert (from, from_byte, to, to_byte, len);
4326 coding->produced_char = len;
fb88bf2d
KH
4327 }
4328 else
68e3a8f1
AS
4329 {
4330 if (!replace)
4331 adjust_after_insert (from, from_byte, to, to_byte, len_byte);
4332 coding->produced_char = len_byte;
4333 }
fb88bf2d
KH
4334 return 0;
4335 }
d46c5b12
KH
4336
4337 /* Now we convert the text. */
4338
4339 /* For encoding, we must process pre-write-conversion in advance. */
4340 if (encodep
d46c5b12
KH
4341 && ! NILP (coding->pre_write_conversion)
4342 && SYMBOLP (coding->pre_write_conversion)
4343 && ! NILP (Ffboundp (coding->pre_write_conversion)))
4344 {
2b4f9037
KH
4345 /* The function in pre-write-conversion may put a new text in a
4346 new buffer. */
0007bdd0
KH
4347 struct buffer *prev = current_buffer;
4348 Lisp_Object new;
d46c5b12 4349
b39f748c
AS
4350 call2 (coding->pre_write_conversion,
4351 make_number (from), make_number (to));
d46c5b12
KH
4352 if (current_buffer != prev)
4353 {
4354 len = ZV - BEGV;
0007bdd0 4355 new = Fcurrent_buffer ();
d46c5b12 4356 set_buffer_internal_1 (prev);
ddbc19ff 4357 del_range_2 (from, from_byte, to, to_byte);
e133c8fa 4358 TEMP_SET_PT_BOTH (from, from_byte);
0007bdd0
KH
4359 insert_from_buffer (XBUFFER (new), 1, len, 0);
4360 Fkill_buffer (new);
e133c8fa
KH
4361 if (orig_point >= to)
4362 orig_point += len - orig_len;
4363 else if (orig_point > from)
4364 orig_point = from;
4365 orig_len = len;
d46c5b12 4366 to = from + len;
e133c8fa 4367 from_byte = multibyte ? CHAR_TO_BYTE (from) : from_byte;
fb88bf2d 4368 to_byte = multibyte ? CHAR_TO_BYTE (to) : to;
d46c5b12 4369 len_byte = to_byte - from_byte;
e133c8fa 4370 TEMP_SET_PT_BOTH (from, from_byte);
d46c5b12
KH
4371 }
4372 }
4373
12410ef1
KH
4374 if (replace)
4375 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
4376
d46c5b12 4377 /* Try to skip the heading and tailing ASCIIs. */
12410ef1
KH
4378 {
4379 int from_byte_orig = from_byte, to_byte_orig = to_byte;
4380
4381 if (from < GPT && GPT < to)
4382 move_gap_both (from, from_byte);
88993dfd 4383 SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
d4e57bcd 4384 if (from_byte == to_byte
944bd420 4385 && coding->type != coding_type_ccl
d4e57bcd
KH
4386 && ! (coding->mode & CODING_MODE_LAST_BLOCK
4387 && CODING_REQUIRE_FLUSHING (coding)))
12410ef1
KH
4388 {
4389 coding->produced = len_byte;
4390 coding->produced_char = multibyte ? len : len_byte;
4391 if (!replace)
4392 /* We must record and adjust for this new text now. */
4393 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
4394 return 0;
4395 }
fb88bf2d 4396
12410ef1
KH
4397 head_skip = from_byte - from_byte_orig;
4398 tail_skip = to_byte_orig - to_byte;
4399 total_skip = head_skip + tail_skip;
4400 from += head_skip;
4401 to -= tail_skip;
4402 len -= total_skip; len_byte -= total_skip;
4403 }
d46c5b12 4404
88993dfd 4405 /* The code conversion routine can not preserve text properties for
55d8d769
KH
4406 now. So, we must remove all text properties in the region.
4407 Here, we must suppress all modification hooks. */
88993dfd 4408 if (replace)
55d8d769
KH
4409 {
4410 int saved_inhibit_modification_hooks = inhibit_modification_hooks;
4411 inhibit_modification_hooks = 1;
4412 Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
4413 inhibit_modification_hooks = saved_inhibit_modification_hooks;
4414 }
88993dfd 4415
fb88bf2d
KH
4416 /* For converion, we must put the gap before the text in addition to
4417 making the gap larger for efficient decoding. The required gap
4418 size starts from 2000 which is the magic number used in make_gap.
4419 But, after one batch of conversion, it will be incremented if we
4420 find that it is not enough . */
d46c5b12
KH
4421 require = 2000;
4422
4423 if (GAP_SIZE < require)
4424 make_gap (require - GAP_SIZE);
4425 move_gap_both (from, from_byte);
4426
d46c5b12 4427 inserted = inserted_byte = 0;
fb88bf2d
KH
4428 src = GAP_END_ADDR, dst = GPT_ADDR;
4429
4430 GAP_SIZE += len_byte;
4431 ZV -= len;
4432 Z -= len;
4433 ZV_BYTE -= len_byte;
4434 Z_BYTE -= len_byte;
4435
f2558efd
KH
4436 if (GPT - BEG < beg_unchanged)
4437 beg_unchanged = GPT - BEG;
4438 if (Z - GPT < end_unchanged)
4439 end_unchanged = Z - GPT;
4440
d46c5b12
KH
4441 for (;;)
4442 {
fb88bf2d 4443 int result;
d46c5b12
KH
4444
4445 /* The buffer memory is changed from:
fb88bf2d
KH
4446 +--------+converted-text+---------+-------original-text------+---+
4447 |<-from->|<--inserted-->|---------|<-----------len---------->|---|
4448 |<------------------- GAP_SIZE -------------------->| */
d46c5b12 4449 if (encodep)
fb88bf2d 4450 result = encode_coding (coding, src, dst, len_byte, 0);
d46c5b12 4451 else
fb88bf2d 4452 result = decode_coding (coding, src, dst, len_byte, 0);
d46c5b12
KH
4453 /* to:
4454 +--------+-------converted-text--------+--+---original-text--+---+
fb88bf2d
KH
4455 |<-from->|<--inserted-->|<--produced-->|--|<-(len-consumed)->|---|
4456 |<------------------- GAP_SIZE -------------------->| */
4457 if (coding->fake_multibyte)
4458 fake_multibyte = 1;
d46c5b12 4459
fb88bf2d
KH
4460 if (!encodep && !multibyte)
4461 coding->produced_char = coding->produced;
d46c5b12
KH
4462 inserted += coding->produced_char;
4463 inserted_byte += coding->produced;
d46c5b12 4464 len_byte -= coding->consumed;
fb88bf2d
KH
4465 src += coding->consumed;
4466 dst += inserted_byte;
d46c5b12 4467
9864ebce
KH
4468 if (result == CODING_FINISH_NORMAL)
4469 {
4470 src += len_byte;
4471 break;
4472 }
d46c5b12
KH
4473 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4474 {
fb88bf2d 4475 unsigned char *pend = dst, *p = pend - inserted_byte;
d46c5b12
KH
4476
4477 /* Encode LFs back to the original eol format (CR or CRLF). */
4478 if (coding->eol_type == CODING_EOL_CR)
4479 {
4480 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
4481 }
4482 else
4483 {
d46c5b12
KH
4484 int count = 0;
4485
fb88bf2d
KH
4486 while (p < pend) if (*p++ == '\n') count++;
4487 if (src - dst < count)
d46c5b12 4488 {
fb88bf2d
KH
4489 /* We don't have sufficient room for putting LFs
4490 back to CRLF. We must record converted and
4491 not-yet-converted text back to the buffer
4492 content, enlarge the gap, then record them out of
4493 the buffer contents again. */
4494 int add = len_byte + inserted_byte;
4495
4496 GAP_SIZE -= add;
4497 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4498 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4499 make_gap (count - GAP_SIZE);
4500 GAP_SIZE += add;
4501 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4502 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4503 /* Don't forget to update SRC, DST, and PEND. */
4504 src = GAP_END_ADDR - len_byte;
4505 dst = GPT_ADDR + inserted_byte;
4506 pend = dst;
d46c5b12 4507 }
d46c5b12
KH
4508 inserted += count;
4509 inserted_byte += count;
fb88bf2d
KH
4510 coding->produced += count;
4511 p = dst = pend + count;
4512 while (count)
4513 {
4514 *--p = *--pend;
4515 if (*p == '\n') count--, *--p = '\r';
4516 }
d46c5b12
KH
4517 }
4518
4519 /* Suppress eol-format conversion in the further conversion. */
4520 coding->eol_type = CODING_EOL_LF;
4521
4522 /* Restore the original symbol. */
4523 coding->symbol = saved_coding_symbol;
fb88bf2d
KH
4524
4525 continue;
d46c5b12
KH
4526 }
4527 if (len_byte <= 0)
944bd420
KH
4528 {
4529 if (coding->type != coding_type_ccl
4530 || coding->mode & CODING_MODE_LAST_BLOCK)
4531 break;
4532 coding->mode |= CODING_MODE_LAST_BLOCK;
4533 continue;
4534 }
d46c5b12
KH
4535 if (result == CODING_FINISH_INSUFFICIENT_SRC)
4536 {
4537 /* The source text ends in invalid codes. Let's just
4538 make them valid buffer contents, and finish conversion. */
fb88bf2d 4539 inserted += len_byte;
d46c5b12 4540 inserted_byte += len_byte;
fb88bf2d 4541 while (len_byte--)
ee59c65f 4542 *dst++ = *src++;
fb88bf2d 4543 fake_multibyte = 1;
d46c5b12
KH
4544 break;
4545 }
9864ebce
KH
4546 if (result == CODING_FINISH_INTERRUPT)
4547 {
4548 /* The conversion procedure was interrupted by a user. */
4549 fake_multibyte = 1;
4550 break;
4551 }
4552 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
4553 if (coding->consumed < 1)
4554 {
4555 /* It's quite strange to require more memory without
4556 consuming any bytes. Perhaps CCL program bug. */
4557 fake_multibyte = 1;
4558 break;
4559 }
fb88bf2d
KH
4560 if (first)
4561 {
4562 /* We have just done the first batch of conversion which was
4563 stopped because of insufficient gap. Let's reconsider the
4564 required gap size (i.e. SRC - DST) now.
4565
4566 We have converted ORIG bytes (== coding->consumed) into
4567 NEW bytes (coding->produced). To convert the remaining
4568 LEN bytes, we may need REQUIRE bytes of gap, where:
4569 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
4570 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
4571 Here, we are sure that NEW >= ORIG. */
6e44253b
KH
4572 float ratio = coding->produced - coding->consumed;
4573 ratio /= coding->consumed;
4574 require = len_byte * ratio;
fb88bf2d
KH
4575 first = 0;
4576 }
4577 if ((src - dst) < (require + 2000))
4578 {
4579 /* See the comment above the previous call of make_gap. */
4580 int add = len_byte + inserted_byte;
4581
4582 GAP_SIZE -= add;
4583 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4584 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4585 make_gap (require + 2000);
4586 GAP_SIZE += add;
4587 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4588 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4589 /* Don't forget to update SRC, DST. */
4590 src = GAP_END_ADDR - len_byte;
4591 dst = GPT_ADDR + inserted_byte;
4592 }
d46c5b12 4593 }
fb88bf2d
KH
4594 if (src - dst > 0) *dst = 0; /* Put an anchor. */
4595
2b4f9037 4596 if (multibyte
88993dfd
KH
4597 && (encodep
4598 || fake_multibyte
4599 || (to - from) != (to_byte - from_byte)))
2b4f9037 4600 inserted = multibyte_chars_in_text (GPT_ADDR, inserted_byte);
7553d0e1 4601
12410ef1
KH
4602 /* If we have shrinked the conversion area, adjust it now. */
4603 if (total_skip > 0)
4604 {
4605 if (tail_skip > 0)
4606 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
4607 inserted += total_skip; inserted_byte += total_skip;
4608 GAP_SIZE += total_skip;
4609 GPT -= head_skip; GPT_BYTE -= head_skip;
4610 ZV -= total_skip; ZV_BYTE -= total_skip;
4611 Z -= total_skip; Z_BYTE -= total_skip;
4612 from -= head_skip; from_byte -= head_skip;
4613 to += tail_skip; to_byte += tail_skip;
4614 }
4615
6abb9bd9 4616 prev_Z = Z;
12410ef1 4617 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
6abb9bd9 4618 inserted = Z - prev_Z;
4ed46869 4619
2b4f9037 4620 if (! encodep && ! NILP (coding->post_read_conversion))
d46c5b12 4621 {
2b4f9037 4622 Lisp_Object val;
4ed46869 4623
e133c8fa
KH
4624 if (from != PT)
4625 TEMP_SET_PT_BOTH (from, from_byte);
6abb9bd9 4626 prev_Z = Z;
2b4f9037 4627 val = call1 (coding->post_read_conversion, make_number (inserted));
6abb9bd9 4628 CHECK_NUMBER (val, 0);
944bd420 4629 inserted += Z - prev_Z;
e133c8fa
KH
4630 }
4631
4632 if (orig_point >= from)
4633 {
4634 if (orig_point >= from + orig_len)
4635 orig_point += inserted - orig_len;
4636 else
4637 orig_point = from;
4638 TEMP_SET_PT (orig_point);
d46c5b12 4639 }
4ed46869 4640
2b4f9037
KH
4641 signal_after_change (from, to - from, inserted);
4642
fb88bf2d 4643 {
12410ef1
KH
4644 coding->consumed = to_byte - from_byte;
4645 coding->consumed_char = to - from;
4646 coding->produced = inserted_byte;
4647 coding->produced_char = inserted;
fb88bf2d 4648 }
7553d0e1 4649
fb88bf2d 4650 return 0;
d46c5b12
KH
4651}
4652
4653Lisp_Object
4654code_convert_string (str, coding, encodep, nocopy)
4655 Lisp_Object str;
4ed46869 4656 struct coding_system *coding;
d46c5b12 4657 int encodep, nocopy;
4ed46869 4658{
d46c5b12
KH
4659 int len;
4660 char *buf;
fc932ac6
RS
4661 int from = 0, to = XSTRING (str)->size;
4662 int to_byte = STRING_BYTES (XSTRING (str));
d46c5b12 4663 struct gcpro gcpro1;
84d60297 4664 Lisp_Object saved_coding_symbol;
d46c5b12 4665 int result;
4ed46869 4666
84d60297 4667 saved_coding_symbol = Qnil;
d46c5b12
KH
4668 if (encodep && !NILP (coding->pre_write_conversion)
4669 || !encodep && !NILP (coding->post_read_conversion))
4670 {
4671 /* Since we have to call Lisp functions which assume target text
4672 is in a buffer, after setting a temporary buffer, call
4673 code_convert_region. */
4674 int count = specpdl_ptr - specpdl;
4675 struct buffer *prev = current_buffer;
e133c8fa 4676
d46c5b12
KH
4677 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
4678 temp_output_buffer_setup (" *code-converting-work*");
4679 set_buffer_internal (XBUFFER (Vstandard_output));
4680 if (encodep)
4681 insert_from_string (str, 0, 0, to, to_byte, 0);
4682 else
4683 {
4684 /* We must insert the contents of STR as is without
4685 unibyte<->multibyte conversion. */
4686 current_buffer->enable_multibyte_characters = Qnil;
4687 insert_from_string (str, 0, 0, to_byte, to_byte, 0);
4688 current_buffer->enable_multibyte_characters = Qt;
4689 }
fb88bf2d 4690 code_convert_region (BEGV, BEGV_BYTE, ZV, ZV_BYTE, coding, encodep, 1);
d46c5b12
KH
4691 if (encodep)
4692 /* We must return the buffer contents as unibyte string. */
4693 current_buffer->enable_multibyte_characters = Qnil;
4694 str = make_buffer_string (BEGV, ZV, 0);
4695 set_buffer_internal (prev);
4696 return unbind_to (count, str);
4697 }
4ed46869 4698
d46c5b12
KH
4699 if (! encodep && CODING_REQUIRE_DETECTION (coding))
4700 {
4701 /* See the comments in code_convert_region. */
4702 if (coding->type == coding_type_undecided)
4703 {
4704 detect_coding (coding, XSTRING (str)->data, to_byte);
4705 if (coding->type == coding_type_undecided)
4706 coding->type = coding_type_emacs_mule;
4707 }
4708 if (coding->eol_type == CODING_EOL_UNDECIDED)
4709 {
4710 saved_coding_symbol = coding->symbol;
4711 detect_eol (coding, XSTRING (str)->data, to_byte);
4712 if (coding->eol_type == CODING_EOL_UNDECIDED)
4713 coding->eol_type = CODING_EOL_LF;
4714 /* We had better recover the original eol format if we
4715 encounter an inconsitent eol format while decoding. */
4716 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4717 }
4718 }
4ed46869 4719
d46c5b12
KH
4720 if (encodep
4721 ? ! CODING_REQUIRE_ENCODING (coding)
4722 : ! CODING_REQUIRE_DECODING (coding))
4723 from = to_byte;
4724 else
4725 {
4726 /* Try to skip the heading and tailing ASCIIs. */
88993dfd
KH
4727 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
4728 encodep);
d46c5b12 4729 }
e133c8fa
KH
4730 if (from == to_byte
4731 && coding->type != coding_type_ccl)
d46c5b12 4732 return (nocopy ? str : Fcopy_sequence (str));
4ed46869 4733
d46c5b12
KH
4734 if (encodep)
4735 len = encoding_buffer_size (coding, to_byte - from);
4736 else
4737 len = decoding_buffer_size (coding, to_byte - from);
fc932ac6 4738 len += from + STRING_BYTES (XSTRING (str)) - to_byte;
d46c5b12
KH
4739 GCPRO1 (str);
4740 buf = get_conversion_buffer (len);
4741 UNGCPRO;
4ed46869 4742
d46c5b12
KH
4743 if (from > 0)
4744 bcopy (XSTRING (str)->data, buf, from);
4745 result = (encodep
4746 ? encode_coding (coding, XSTRING (str)->data + from,
4747 buf + from, to_byte - from, len)
4748 : decode_coding (coding, XSTRING (str)->data + from,
f30cc612 4749 buf + from, to_byte - from, len));
d46c5b12 4750 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4ed46869 4751 {
d46c5b12
KH
4752 /* We simple try to decode the whole string again but without
4753 eol-conversion this time. */
4754 coding->eol_type = CODING_EOL_LF;
4755 coding->symbol = saved_coding_symbol;
4756 return code_convert_string (str, coding, encodep, nocopy);
4ed46869 4757 }
d46c5b12
KH
4758
4759 bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
fc932ac6 4760 STRING_BYTES (XSTRING (str)) - to_byte);
d46c5b12 4761
fc932ac6 4762 len = from + STRING_BYTES (XSTRING (str)) - to_byte;
d46c5b12
KH
4763 if (encodep)
4764 str = make_unibyte_string (buf, len + coding->produced);
4765 else
826bfb8b
KH
4766 {
4767 int chars= (coding->fake_multibyte
4768 ? multibyte_chars_in_text (buf + from, coding->produced)
4769 : coding->produced_char);
4770 str = make_multibyte_string (buf, len + chars, len + coding->produced);
4771 }
4772
d46c5b12 4773 return str;
4ed46869
KH
4774}
4775
4776\f
4777#ifdef emacs
1397dc18 4778/*** 8. Emacs Lisp library functions ***/
4ed46869 4779
4ed46869
KH
4780DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
4781 "Return t if OBJECT is nil or a coding-system.\n\
3a73fa5d
RS
4782See the documentation of `make-coding-system' for information\n\
4783about coding-system objects.")
4ed46869
KH
4784 (obj)
4785 Lisp_Object obj;
4786{
4608c386
KH
4787 if (NILP (obj))
4788 return Qt;
4789 if (!SYMBOLP (obj))
4790 return Qnil;
4791 /* Get coding-spec vector for OBJ. */
4792 obj = Fget (obj, Qcoding_system);
4793 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
4794 ? Qt : Qnil);
4ed46869
KH
4795}
4796
9d991de8
RS
4797DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
4798 Sread_non_nil_coding_system, 1, 1, 0,
e0e989f6 4799 "Read a coding system from the minibuffer, prompting with string PROMPT.")
4ed46869
KH
4800 (prompt)
4801 Lisp_Object prompt;
4802{
e0e989f6 4803 Lisp_Object val;
9d991de8
RS
4804 do
4805 {
4608c386
KH
4806 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
4807 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
9d991de8
RS
4808 }
4809 while (XSTRING (val)->size == 0);
e0e989f6 4810 return (Fintern (val, Qnil));
4ed46869
KH
4811}
4812
9b787f3e
RS
4813DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
4814 "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
4815If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
4816 (prompt, default_coding_system)
4817 Lisp_Object prompt, default_coding_system;
4ed46869 4818{
f44d27ce 4819 Lisp_Object val;
9b787f3e
RS
4820 if (SYMBOLP (default_coding_system))
4821 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
4608c386 4822 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
9b787f3e
RS
4823 Qt, Qnil, Qcoding_system_history,
4824 default_coding_system, Qnil);
e0e989f6 4825 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
4826}
4827
4828DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
4829 1, 1, 0,
4830 "Check validity of CODING-SYSTEM.\n\
3a73fa5d
RS
4831If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
4832It is valid if it is a symbol with a non-nil `coding-system' property.\n\
4ed46869
KH
4833The value of property should be a vector of length 5.")
4834 (coding_system)
4835 Lisp_Object coding_system;
4836{
4837 CHECK_SYMBOL (coding_system, 0);
4838 if (!NILP (Fcoding_system_p (coding_system)))
4839 return coding_system;
4840 while (1)
02ba4723 4841 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4ed46869 4842}
3a73fa5d 4843\f
d46c5b12
KH
4844Lisp_Object
4845detect_coding_system (src, src_bytes, highest)
4846 unsigned char *src;
4847 int src_bytes, highest;
4ed46869
KH
4848{
4849 int coding_mask, eol_type;
d46c5b12
KH
4850 Lisp_Object val, tmp;
4851 int dummy;
4ed46869 4852
d46c5b12
KH
4853 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy);
4854 eol_type = detect_eol_type (src, src_bytes, &dummy);
4855 if (eol_type == CODING_EOL_INCONSISTENT)
25b02698 4856 eol_type = CODING_EOL_UNDECIDED;
4ed46869 4857
d46c5b12 4858 if (!coding_mask)
4ed46869 4859 {
27901516 4860 val = Qundecided;
d46c5b12 4861 if (eol_type != CODING_EOL_UNDECIDED)
4ed46869 4862 {
f44d27ce
RS
4863 Lisp_Object val2;
4864 val2 = Fget (Qundecided, Qeol_type);
4ed46869
KH
4865 if (VECTORP (val2))
4866 val = XVECTOR (val2)->contents[eol_type];
4867 }
80e803b4 4868 return (highest ? val : Fcons (val, Qnil));
4ed46869 4869 }
4ed46869 4870
d46c5b12
KH
4871 /* At first, gather possible coding systems in VAL. */
4872 val = Qnil;
4873 for (tmp = Vcoding_category_list; !NILP (tmp); tmp = XCONS (tmp)->cdr)
4ed46869 4874 {
d46c5b12
KH
4875 int idx
4876 = XFASTINT (Fget (XCONS (tmp)->car, Qcoding_category_index));
4877 if (coding_mask & (1 << idx))
4ed46869 4878 {
d46c5b12
KH
4879 val = Fcons (Fsymbol_value (XCONS (tmp)->car), val);
4880 if (highest)
4881 break;
4ed46869
KH
4882 }
4883 }
d46c5b12
KH
4884 if (!highest)
4885 val = Fnreverse (val);
4ed46869 4886
65059037 4887 /* Then, replace the elements with subsidiary coding systems. */
d46c5b12 4888 for (tmp = val; !NILP (tmp); tmp = XCONS (tmp)->cdr)
4ed46869 4889 {
65059037
RS
4890 if (eol_type != CODING_EOL_UNDECIDED
4891 && eol_type != CODING_EOL_INCONSISTENT)
4ed46869 4892 {
d46c5b12
KH
4893 Lisp_Object eol;
4894 eol = Fget (XCONS (tmp)->car, Qeol_type);
4895 if (VECTORP (eol))
4896 XCONS (tmp)->car = XVECTOR (eol)->contents[eol_type];
4ed46869
KH
4897 }
4898 }
d46c5b12
KH
4899 return (highest ? XCONS (val)->car : val);
4900}
4ed46869 4901
d46c5b12
KH
4902DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
4903 2, 3, 0,
4904 "Detect coding system of the text in the region between START and END.\n\
4905Return a list of possible coding systems ordered by priority.\n\
4906\n\
80e803b4
KH
4907If only ASCII characters are found, it returns a list of single element\n\
4908`undecided' or its subsidiary coding system according to a detected\n\
4909end-of-line format.\n\
d46c5b12
KH
4910\n\
4911If optional argument HIGHEST is non-nil, return the coding system of\n\
4912highest priority.")
4913 (start, end, highest)
4914 Lisp_Object start, end, highest;
4915{
4916 int from, to;
4917 int from_byte, to_byte;
6289dd10 4918
d46c5b12
KH
4919 CHECK_NUMBER_COERCE_MARKER (start, 0);
4920 CHECK_NUMBER_COERCE_MARKER (end, 1);
4ed46869 4921
d46c5b12
KH
4922 validate_region (&start, &end);
4923 from = XINT (start), to = XINT (end);
4924 from_byte = CHAR_TO_BYTE (from);
4925 to_byte = CHAR_TO_BYTE (to);
6289dd10 4926
d46c5b12
KH
4927 if (from < GPT && to >= GPT)
4928 move_gap_both (to, to_byte);
4ed46869 4929
d46c5b12
KH
4930 return detect_coding_system (BYTE_POS_ADDR (from_byte),
4931 to_byte - from_byte,
4932 !NILP (highest));
4933}
6289dd10 4934
d46c5b12
KH
4935DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
4936 1, 2, 0,
4937 "Detect coding system of the text in STRING.\n\
4938Return a list of possible coding systems ordered by priority.\n\
4939\n\
80e803b4
KH
4940If only ASCII characters are found, it returns a list of single element\n\
4941`undecided' or its subsidiary coding system according to a detected\n\
4942end-of-line format.\n\
d46c5b12
KH
4943\n\
4944If optional argument HIGHEST is non-nil, return the coding system of\n\
4945highest priority.")
4946 (string, highest)
4947 Lisp_Object string, highest;
4948{
4949 CHECK_STRING (string, 0);
4ed46869 4950
d46c5b12 4951 return detect_coding_system (XSTRING (string)->data,
fc932ac6 4952 STRING_BYTES (XSTRING (string)),
d46c5b12 4953 !NILP (highest));
4ed46869
KH
4954}
4955
4031e2bf
KH
4956Lisp_Object
4957code_convert_region1 (start, end, coding_system, encodep)
d46c5b12 4958 Lisp_Object start, end, coding_system;
4031e2bf 4959 int encodep;
3a73fa5d
RS
4960{
4961 struct coding_system coding;
4031e2bf 4962 int from, to, len;
3a73fa5d 4963
d46c5b12
KH
4964 CHECK_NUMBER_COERCE_MARKER (start, 0);
4965 CHECK_NUMBER_COERCE_MARKER (end, 1);
3a73fa5d
RS
4966 CHECK_SYMBOL (coding_system, 2);
4967
d46c5b12
KH
4968 validate_region (&start, &end);
4969 from = XFASTINT (start);
4970 to = XFASTINT (end);
4971
3a73fa5d 4972 if (NILP (coding_system))
d46c5b12
KH
4973 return make_number (to - from);
4974
3a73fa5d 4975 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
d46c5b12 4976 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
3a73fa5d 4977
d46c5b12 4978 coding.mode |= CODING_MODE_LAST_BLOCK;
fb88bf2d
KH
4979 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
4980 &coding, encodep, 1);
f072a3e8 4981 Vlast_coding_system_used = coding.symbol;
fb88bf2d 4982 return make_number (coding.produced_char);
4031e2bf
KH
4983}
4984
4985DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
4986 3, 3, "r\nzCoding system: ",
4987 "Decode the current region by specified coding system.\n\
4988When called from a program, takes three arguments:\n\
4989START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
f072a3e8
RS
4990This function sets `last-coding-system-used' to the precise coding system\n\
4991used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
4992not fully specified.)\n\
4993It returns the length of the decoded text.")
4031e2bf
KH
4994 (start, end, coding_system)
4995 Lisp_Object start, end, coding_system;
4996{
4997 return code_convert_region1 (start, end, coding_system, 0);
3a73fa5d
RS
4998}
4999
5000DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
5001 3, 3, "r\nzCoding system: ",
d46c5b12 5002 "Encode the current region by specified coding system.\n\
3a73fa5d 5003When called from a program, takes three arguments:\n\
d46c5b12 5004START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
f072a3e8
RS
5005This function sets `last-coding-system-used' to the precise coding system\n\
5006used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5007not fully specified.)\n\
5008It returns the length of the encoded text.")
d46c5b12
KH
5009 (start, end, coding_system)
5010 Lisp_Object start, end, coding_system;
3a73fa5d 5011{
4031e2bf
KH
5012 return code_convert_region1 (start, end, coding_system, 1);
5013}
3a73fa5d 5014
4031e2bf
KH
5015Lisp_Object
5016code_convert_string1 (string, coding_system, nocopy, encodep)
5017 Lisp_Object string, coding_system, nocopy;
5018 int encodep;
5019{
5020 struct coding_system coding;
3a73fa5d 5021
4031e2bf
KH
5022 CHECK_STRING (string, 0);
5023 CHECK_SYMBOL (coding_system, 1);
4ed46869 5024
d46c5b12 5025 if (NILP (coding_system))
4031e2bf 5026 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869 5027
d46c5b12
KH
5028 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5029 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5f1cd180 5030
d46c5b12 5031 coding.mode |= CODING_MODE_LAST_BLOCK;
f072a3e8 5032 Vlast_coding_system_used = coding.symbol;
4031e2bf 5033 return code_convert_string (string, &coding, encodep, !NILP (nocopy));
4ed46869
KH
5034}
5035
4ed46869 5036DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
e0e989f6
KH
5037 2, 3, 0,
5038 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
fe487a71 5039Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
f072a3e8
RS
5040if the decoding operation is trivial.\n\
5041This function sets `last-coding-system-used' to the precise coding system\n\
5042used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5043not fully specified.)")
e0e989f6
KH
5044 (string, coding_system, nocopy)
5045 Lisp_Object string, coding_system, nocopy;
4ed46869 5046{
f072a3e8 5047 return code_convert_string1 (string, coding_system, nocopy, 0);
4ed46869
KH
5048}
5049
5050DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
e0e989f6
KH
5051 2, 3, 0,
5052 "Encode STRING to CODING-SYSTEM, and return the result.\n\
fe487a71 5053Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
f072a3e8
RS
5054if the encoding operation is trivial.\n\
5055This function sets `last-coding-system-used' to the precise coding system\n\
5056used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5057not fully specified.)")
e0e989f6
KH
5058 (string, coding_system, nocopy)
5059 Lisp_Object string, coding_system, nocopy;
4ed46869 5060{
f072a3e8 5061 return code_convert_string1 (string, coding_system, nocopy, 1);
4ed46869 5062}
4031e2bf 5063
ecec61c1
KH
5064/* Encode or decode STRING according to CODING_SYSTEM.
5065 Do not set Vlast_coding_system_used. */
5066
5067Lisp_Object
5068code_convert_string_norecord (string, coding_system, encodep)
5069 Lisp_Object string, coding_system;
5070 int encodep;
5071{
5072 struct coding_system coding;
5073
5074 CHECK_STRING (string, 0);
5075 CHECK_SYMBOL (coding_system, 1);
5076
5077 if (NILP (coding_system))
5078 return string;
5079
5080 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5081 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5082
5083 coding.mode |= CODING_MODE_LAST_BLOCK;
5084 return code_convert_string (string, &coding, encodep, Qt);
5085}
3a73fa5d 5086\f
4ed46869 5087DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
55ab7be3 5088 "Decode a Japanese character which has CODE in shift_jis encoding.\n\
4ed46869
KH
5089Return the corresponding character.")
5090 (code)
5091 Lisp_Object code;
5092{
5093 unsigned char c1, c2, s1, s2;
5094 Lisp_Object val;
5095
5096 CHECK_NUMBER (code, 0);
5097 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
55ab7be3
KH
5098 if (s1 == 0)
5099 {
c28a9453
KH
5100 if (s2 < 0x80)
5101 XSETFASTINT (val, s2);
5102 else if (s2 >= 0xA0 || s2 <= 0xDF)
5103 XSETFASTINT (val,
5104 MAKE_NON_ASCII_CHAR (charset_katakana_jisx0201, s2, 0));
5105 else
9da8350f 5106 error ("Invalid Shift JIS code: %x", XFASTINT (code));
55ab7be3
KH
5107 }
5108 else
5109 {
5110 if ((s1 < 0x80 || s1 > 0x9F && s1 < 0xE0 || s1 > 0xEF)
5111 || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
9da8350f 5112 error ("Invalid Shift JIS code: %x", XFASTINT (code));
55ab7be3
KH
5113 DECODE_SJIS (s1, s2, c1, c2);
5114 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
5115 }
4ed46869
KH
5116 return val;
5117}
5118
5119DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
55ab7be3
KH
5120 "Encode a Japanese character CHAR to shift_jis encoding.\n\
5121Return the corresponding code in SJIS.")
4ed46869
KH
5122 (ch)
5123 Lisp_Object ch;
5124{
bcf26d6a 5125 int charset, c1, c2, s1, s2;
4ed46869
KH
5126 Lisp_Object val;
5127
5128 CHECK_NUMBER (ch, 0);
5129 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
c28a9453
KH
5130 if (charset == CHARSET_ASCII)
5131 {
5132 val = ch;
5133 }
5134 else if (charset == charset_jisx0208
5135 && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
4ed46869
KH
5136 {
5137 ENCODE_SJIS (c1, c2, s1, s2);
bcf26d6a 5138 XSETFASTINT (val, (s1 << 8) | s2);
4ed46869 5139 }
55ab7be3
KH
5140 else if (charset == charset_katakana_jisx0201
5141 && c1 > 0x20 && c2 < 0xE0)
5142 {
5143 XSETFASTINT (val, c1 | 0x80);
5144 }
4ed46869 5145 else
55ab7be3 5146 error ("Can't encode to shift_jis: %d", XFASTINT (ch));
4ed46869
KH
5147 return val;
5148}
5149
5150DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
c28a9453 5151 "Decode a Big5 character which has CODE in BIG5 coding system.\n\
4ed46869
KH
5152Return the corresponding character.")
5153 (code)
5154 Lisp_Object code;
5155{
5156 int charset;
5157 unsigned char b1, b2, c1, c2;
5158 Lisp_Object val;
5159
5160 CHECK_NUMBER (code, 0);
5161 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
c28a9453
KH
5162 if (b1 == 0)
5163 {
5164 if (b2 >= 0x80)
9da8350f 5165 error ("Invalid BIG5 code: %x", XFASTINT (code));
c28a9453
KH
5166 val = code;
5167 }
5168 else
5169 {
5170 if ((b1 < 0xA1 || b1 > 0xFE)
5171 || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
9da8350f 5172 error ("Invalid BIG5 code: %x", XFASTINT (code));
c28a9453
KH
5173 DECODE_BIG5 (b1, b2, charset, c1, c2);
5174 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
5175 }
4ed46869
KH
5176 return val;
5177}
5178
5179DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
d46c5b12 5180 "Encode the Big5 character CHAR to BIG5 coding system.\n\
4ed46869
KH
5181Return the corresponding character code in Big5.")
5182 (ch)
5183 Lisp_Object ch;
5184{
bcf26d6a 5185 int charset, c1, c2, b1, b2;
4ed46869
KH
5186 Lisp_Object val;
5187
5188 CHECK_NUMBER (ch, 0);
5189 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
c28a9453
KH
5190 if (charset == CHARSET_ASCII)
5191 {
5192 val = ch;
5193 }
5194 else if ((charset == charset_big5_1
5195 && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
5196 || (charset == charset_big5_2
5197 && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
4ed46869
KH
5198 {
5199 ENCODE_BIG5 (charset, c1, c2, b1, b2);
bcf26d6a 5200 XSETFASTINT (val, (b1 << 8) | b2);
4ed46869
KH
5201 }
5202 else
c28a9453 5203 error ("Can't encode to Big5: %d", XFASTINT (ch));
4ed46869
KH
5204 return val;
5205}
3a73fa5d 5206\f
1ba9e4ab
KH
5207DEFUN ("set-terminal-coding-system-internal",
5208 Fset_terminal_coding_system_internal,
5209 Sset_terminal_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
5210 (coding_system)
5211 Lisp_Object coding_system;
5212{
5213 CHECK_SYMBOL (coding_system, 0);
5214 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
70c22245 5215 /* We had better not send unsafe characters to terminal. */
6e85d753
KH
5216 terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
5217
4ed46869
KH
5218 return Qnil;
5219}
5220
c4825358
KH
5221DEFUN ("set-safe-terminal-coding-system-internal",
5222 Fset_safe_terminal_coding_system_internal,
5223 Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
5224 (coding_system)
5225 Lisp_Object coding_system;
5226{
5227 CHECK_SYMBOL (coding_system, 0);
5228 setup_coding_system (Fcheck_coding_system (coding_system),
5229 &safe_terminal_coding);
5230 return Qnil;
5231}
5232
4ed46869
KH
5233DEFUN ("terminal-coding-system",
5234 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3a73fa5d 5235 "Return coding system specified for terminal output.")
4ed46869
KH
5236 ()
5237{
5238 return terminal_coding.symbol;
5239}
5240
1ba9e4ab
KH
5241DEFUN ("set-keyboard-coding-system-internal",
5242 Fset_keyboard_coding_system_internal,
5243 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
5244 (coding_system)
5245 Lisp_Object coding_system;
5246{
5247 CHECK_SYMBOL (coding_system, 0);
5248 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
5249 return Qnil;
5250}
5251
5252DEFUN ("keyboard-coding-system",
5253 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3a73fa5d 5254 "Return coding system specified for decoding keyboard input.")
4ed46869
KH
5255 ()
5256{
5257 return keyboard_coding.symbol;
5258}
5259
5260\f
a5d301df
KH
5261DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
5262 Sfind_operation_coding_system, 1, MANY, 0,
5263 "Choose a coding system for an operation based on the target name.\n\
69f76525 5264The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
9ce27fde
KH
5265DECODING-SYSTEM is the coding system to use for decoding\n\
5266\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
5267for encoding (in case OPERATION does encoding).\n\
ccdb79f5
RS
5268\n\
5269The first argument OPERATION specifies an I/O primitive:\n\
5270 For file I/O, `insert-file-contents' or `write-region'.\n\
5271 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
5272 For network I/O, `open-network-stream'.\n\
5273\n\
5274The remaining arguments should be the same arguments that were passed\n\
5275to the primitive. Depending on which primitive, one of those arguments\n\
5276is selected as the TARGET. For example, if OPERATION does file I/O,\n\
5277whichever argument specifies the file name is TARGET.\n\
5278\n\
5279TARGET has a meaning which depends on OPERATION:\n\
4ed46869
KH
5280 For file I/O, TARGET is a file name.\n\
5281 For process I/O, TARGET is a process name.\n\
5282 For network I/O, TARGET is a service name or a port number\n\
5283\n\
02ba4723
KH
5284This function looks up what specified for TARGET in,\n\
5285`file-coding-system-alist', `process-coding-system-alist',\n\
5286or `network-coding-system-alist' depending on OPERATION.\n\
5287They may specify a coding system, a cons of coding systems,\n\
5288or a function symbol to call.\n\
5289In the last case, we call the function with one argument,\n\
9ce27fde 5290which is a list of all the arguments given to this function.")
4ed46869
KH
5291 (nargs, args)
5292 int nargs;
5293 Lisp_Object *args;
5294{
5295 Lisp_Object operation, target_idx, target, val;
5296 register Lisp_Object chain;
5297
5298 if (nargs < 2)
5299 error ("Too few arguments");
5300 operation = args[0];
5301 if (!SYMBOLP (operation)
5302 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
5303 error ("Invalid first arguement");
5304 if (nargs < 1 + XINT (target_idx))
5305 error ("Too few arguments for operation: %s",
5306 XSYMBOL (operation)->name->data);
5307 target = args[XINT (target_idx) + 1];
5308 if (!(STRINGP (target)
5309 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
5310 error ("Invalid %dth argument", XINT (target_idx) + 1);
5311
2e34157c
RS
5312 chain = ((EQ (operation, Qinsert_file_contents)
5313 || EQ (operation, Qwrite_region))
02ba4723 5314 ? Vfile_coding_system_alist
2e34157c 5315 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
5316 ? Vnetwork_coding_system_alist
5317 : Vprocess_coding_system_alist));
4ed46869
KH
5318 if (NILP (chain))
5319 return Qnil;
5320
02ba4723 5321 for (; CONSP (chain); chain = XCONS (chain)->cdr)
4ed46869 5322 {
f44d27ce
RS
5323 Lisp_Object elt;
5324 elt = XCONS (chain)->car;
4ed46869
KH
5325
5326 if (CONSP (elt)
5327 && ((STRINGP (target)
5328 && STRINGP (XCONS (elt)->car)
5329 && fast_string_match (XCONS (elt)->car, target) >= 0)
5330 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
02ba4723
KH
5331 {
5332 val = XCONS (elt)->cdr;
b19fd4c5
KH
5333 /* Here, if VAL is both a valid coding system and a valid
5334 function symbol, we return VAL as a coding system. */
02ba4723
KH
5335 if (CONSP (val))
5336 return val;
5337 if (! SYMBOLP (val))
5338 return Qnil;
5339 if (! NILP (Fcoding_system_p (val)))
5340 return Fcons (val, val);
b19fd4c5
KH
5341 if (! NILP (Ffboundp (val)))
5342 {
5343 val = call1 (val, Flist (nargs, args));
5344 if (CONSP (val))
5345 return val;
5346 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
5347 return Fcons (val, val);
5348 }
02ba4723
KH
5349 return Qnil;
5350 }
4ed46869
KH
5351 }
5352 return Qnil;
5353}
5354
1397dc18
KH
5355DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal,
5356 Supdate_coding_systems_internal, 0, 0, 0,
5357 "Update internal database for ISO2022 and CCL based coding systems.\n\
d46c5b12
KH
5358When values of the following coding categories are changed, you must\n\
5359call this function:\n\
5360 coding-category-iso-7, coding-category-iso-7-tight,\n\
5361 coding-category-iso-8-1, coding-category-iso-8-2,\n\
1397dc18
KH
5362 coding-category-iso-7-else, coding-category-iso-8-else,\n\
5363 coding-category-ccl")
d46c5b12
KH
5364 ()
5365{
5366 int i;
5367
1397dc18 5368 for (i = CODING_CATEGORY_IDX_ISO_7; i <= CODING_CATEGORY_IDX_CCL; i++)
d46c5b12 5369 {
1397dc18
KH
5370 Lisp_Object val;
5371
5372 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value;
5373 if (!NILP (val))
5374 {
5375 if (! coding_system_table[i])
5376 coding_system_table[i] = ((struct coding_system *)
5377 xmalloc (sizeof (struct coding_system)));
5378 setup_coding_system (val, coding_system_table[i]);
5379 }
5380 else if (coding_system_table[i])
5381 {
5382 xfree (coding_system_table[i]);
5383 coding_system_table[i] = NULL;
5384 }
d46c5b12 5385 }
1397dc18 5386
d46c5b12
KH
5387 return Qnil;
5388}
5389
66cfb530
KH
5390DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
5391 Sset_coding_priority_internal, 0, 0, 0,
5392 "Update internal database for the current value of `coding-category-list'.\n\
5393This function is internal use only.")
5394 ()
5395{
5396 int i = 0, idx;
84d60297
RS
5397 Lisp_Object val;
5398
5399 val = Vcoding_category_list;
66cfb530
KH
5400
5401 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
5402 {
5403 if (! SYMBOLP (XCONS (val)->car))
5404 break;
5405 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
5406 if (idx >= CODING_CATEGORY_IDX_MAX)
5407 break;
5408 coding_priorities[i++] = (1 << idx);
5409 val = XCONS (val)->cdr;
5410 }
5411 /* If coding-category-list is valid and contains all coding
5412 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
5413 the following code saves Emacs from craching. */
5414 while (i < CODING_CATEGORY_IDX_MAX)
5415 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
5416
5417 return Qnil;
5418}
5419
4ed46869
KH
5420#endif /* emacs */
5421
5422\f
1397dc18 5423/*** 9. Post-amble ***/
4ed46869 5424
6d74c3aa
KH
5425void
5426init_coding ()
5427{
5428 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
5429}
5430
dfcf069d 5431void
4ed46869
KH
5432init_coding_once ()
5433{
5434 int i;
5435
0ef69138 5436 /* Emacs' internal format specific initialize routine. */
4ed46869
KH
5437 for (i = 0; i <= 0x20; i++)
5438 emacs_code_class[i] = EMACS_control_code;
5439 emacs_code_class[0x0A] = EMACS_linefeed_code;
5440 emacs_code_class[0x0D] = EMACS_carriage_return_code;
5441 for (i = 0x21 ; i < 0x7F; i++)
5442 emacs_code_class[i] = EMACS_ascii_code;
5443 emacs_code_class[0x7F] = EMACS_control_code;
5444 emacs_code_class[0x80] = EMACS_leading_code_composition;
5445 for (i = 0x81; i < 0xFF; i++)
5446 emacs_code_class[i] = EMACS_invalid_code;
5447 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
5448 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
5449 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
5450 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
5451
5452 /* ISO2022 specific initialize routine. */
5453 for (i = 0; i < 0x20; i++)
5454 iso_code_class[i] = ISO_control_code;
5455 for (i = 0x21; i < 0x7F; i++)
5456 iso_code_class[i] = ISO_graphic_plane_0;
5457 for (i = 0x80; i < 0xA0; i++)
5458 iso_code_class[i] = ISO_control_code;
5459 for (i = 0xA1; i < 0xFF; i++)
5460 iso_code_class[i] = ISO_graphic_plane_1;
5461 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
5462 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
5463 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
5464 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
5465 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
5466 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
5467 iso_code_class[ISO_CODE_ESC] = ISO_escape;
5468 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
5469 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
5470 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
5471
e0e989f6 5472 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
e0e989f6
KH
5473
5474 setup_coding_system (Qnil, &keyboard_coding);
5475 setup_coding_system (Qnil, &terminal_coding);
c4825358 5476 setup_coding_system (Qnil, &safe_terminal_coding);
6bc51348 5477 setup_coding_system (Qnil, &default_buffer_file_coding);
9ce27fde 5478
d46c5b12
KH
5479 bzero (coding_system_table, sizeof coding_system_table);
5480
66cfb530
KH
5481 bzero (ascii_skip_code, sizeof ascii_skip_code);
5482 for (i = 0; i < 128; i++)
5483 ascii_skip_code[i] = 1;
5484
9ce27fde
KH
5485#if defined (MSDOS) || defined (WINDOWSNT)
5486 system_eol_type = CODING_EOL_CRLF;
5487#else
5488 system_eol_type = CODING_EOL_LF;
5489#endif
e0e989f6
KH
5490}
5491
5492#ifdef emacs
5493
dfcf069d 5494void
e0e989f6
KH
5495syms_of_coding ()
5496{
5497 Qtarget_idx = intern ("target-idx");
5498 staticpro (&Qtarget_idx);
5499
bb0115a2
RS
5500 Qcoding_system_history = intern ("coding-system-history");
5501 staticpro (&Qcoding_system_history);
5502 Fset (Qcoding_system_history, Qnil);
5503
9ce27fde 5504 /* Target FILENAME is the first argument. */
e0e989f6 5505 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 5506 /* Target FILENAME is the third argument. */
e0e989f6
KH
5507 Fput (Qwrite_region, Qtarget_idx, make_number (2));
5508
5509 Qcall_process = intern ("call-process");
5510 staticpro (&Qcall_process);
9ce27fde 5511 /* Target PROGRAM is the first argument. */
e0e989f6
KH
5512 Fput (Qcall_process, Qtarget_idx, make_number (0));
5513
5514 Qcall_process_region = intern ("call-process-region");
5515 staticpro (&Qcall_process_region);
9ce27fde 5516 /* Target PROGRAM is the third argument. */
e0e989f6
KH
5517 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
5518
5519 Qstart_process = intern ("start-process");
5520 staticpro (&Qstart_process);
9ce27fde 5521 /* Target PROGRAM is the third argument. */
e0e989f6
KH
5522 Fput (Qstart_process, Qtarget_idx, make_number (2));
5523
5524 Qopen_network_stream = intern ("open-network-stream");
5525 staticpro (&Qopen_network_stream);
9ce27fde 5526 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
5527 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
5528
4ed46869
KH
5529 Qcoding_system = intern ("coding-system");
5530 staticpro (&Qcoding_system);
5531
5532 Qeol_type = intern ("eol-type");
5533 staticpro (&Qeol_type);
5534
5535 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
5536 staticpro (&Qbuffer_file_coding_system);
5537
5538 Qpost_read_conversion = intern ("post-read-conversion");
5539 staticpro (&Qpost_read_conversion);
5540
5541 Qpre_write_conversion = intern ("pre-write-conversion");
5542 staticpro (&Qpre_write_conversion);
5543
27901516
KH
5544 Qno_conversion = intern ("no-conversion");
5545 staticpro (&Qno_conversion);
5546
5547 Qundecided = intern ("undecided");
5548 staticpro (&Qundecided);
5549
4ed46869
KH
5550 Qcoding_system_p = intern ("coding-system-p");
5551 staticpro (&Qcoding_system_p);
5552
5553 Qcoding_system_error = intern ("coding-system-error");
5554 staticpro (&Qcoding_system_error);
5555
5556 Fput (Qcoding_system_error, Qerror_conditions,
5557 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
5558 Fput (Qcoding_system_error, Qerror_message,
9ce27fde 5559 build_string ("Invalid coding system"));
4ed46869 5560
d46c5b12
KH
5561 Qcoding_category = intern ("coding-category");
5562 staticpro (&Qcoding_category);
4ed46869
KH
5563 Qcoding_category_index = intern ("coding-category-index");
5564 staticpro (&Qcoding_category_index);
5565
d46c5b12
KH
5566 Vcoding_category_table
5567 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
5568 staticpro (&Vcoding_category_table);
4ed46869
KH
5569 {
5570 int i;
5571 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
5572 {
d46c5b12
KH
5573 XVECTOR (Vcoding_category_table)->contents[i]
5574 = intern (coding_category_name[i]);
5575 Fput (XVECTOR (Vcoding_category_table)->contents[i],
5576 Qcoding_category_index, make_number (i));
4ed46869
KH
5577 }
5578 }
5579
f967223b
KH
5580 Qtranslation_table = intern ("translation-table");
5581 staticpro (&Qtranslation_table);
1397dc18 5582 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
bdd9fb48 5583
f967223b
KH
5584 Qtranslation_table_id = intern ("translation-table-id");
5585 staticpro (&Qtranslation_table_id);
84fbb8a0 5586
f967223b
KH
5587 Qtranslation_table_for_decode = intern ("translation-table-for-decode");
5588 staticpro (&Qtranslation_table_for_decode);
a5d301df 5589
f967223b
KH
5590 Qtranslation_table_for_encode = intern ("translation-table-for-encode");
5591 staticpro (&Qtranslation_table_for_encode);
a5d301df 5592
70c22245
KH
5593 Qsafe_charsets = intern ("safe-charsets");
5594 staticpro (&Qsafe_charsets);
5595
1397dc18
KH
5596 Qvalid_codes = intern ("valid-codes");
5597 staticpro (&Qvalid_codes);
5598
9ce27fde
KH
5599 Qemacs_mule = intern ("emacs-mule");
5600 staticpro (&Qemacs_mule);
5601
d46c5b12
KH
5602 Qraw_text = intern ("raw-text");
5603 staticpro (&Qraw_text);
5604
4ed46869
KH
5605 defsubr (&Scoding_system_p);
5606 defsubr (&Sread_coding_system);
5607 defsubr (&Sread_non_nil_coding_system);
5608 defsubr (&Scheck_coding_system);
5609 defsubr (&Sdetect_coding_region);
d46c5b12 5610 defsubr (&Sdetect_coding_string);
4ed46869
KH
5611 defsubr (&Sdecode_coding_region);
5612 defsubr (&Sencode_coding_region);
5613 defsubr (&Sdecode_coding_string);
5614 defsubr (&Sencode_coding_string);
5615 defsubr (&Sdecode_sjis_char);
5616 defsubr (&Sencode_sjis_char);
5617 defsubr (&Sdecode_big5_char);
5618 defsubr (&Sencode_big5_char);
1ba9e4ab 5619 defsubr (&Sset_terminal_coding_system_internal);
c4825358 5620 defsubr (&Sset_safe_terminal_coding_system_internal);
4ed46869 5621 defsubr (&Sterminal_coding_system);
1ba9e4ab 5622 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 5623 defsubr (&Skeyboard_coding_system);
a5d301df 5624 defsubr (&Sfind_operation_coding_system);
1397dc18 5625 defsubr (&Supdate_coding_systems_internal);
66cfb530 5626 defsubr (&Sset_coding_priority_internal);
4ed46869 5627
4608c386
KH
5628 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
5629 "List of coding systems.\n\
5630\n\
5631Do not alter the value of this variable manually. This variable should be\n\
5632updated by the functions `make-coding-system' and\n\
5633`define-coding-system-alias'.");
5634 Vcoding_system_list = Qnil;
5635
5636 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
5637 "Alist of coding system names.\n\
5638Each element is one element list of coding system name.\n\
5639This variable is given to `completing-read' as TABLE argument.\n\
5640\n\
5641Do not alter the value of this variable manually. This variable should be\n\
5642updated by the functions `make-coding-system' and\n\
5643`define-coding-system-alias'.");
5644 Vcoding_system_alist = Qnil;
5645
4ed46869
KH
5646 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
5647 "List of coding-categories (symbols) ordered by priority.");
5648 {
5649 int i;
5650
5651 Vcoding_category_list = Qnil;
5652 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
5653 Vcoding_category_list
d46c5b12
KH
5654 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
5655 Vcoding_category_list);
4ed46869
KH
5656 }
5657
5658 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
10bff6f1 5659 "Specify the coding system for read operations.\n\
2ebb362d 5660It is useful to bind this variable with `let', but do not set it globally.\n\
4ed46869 5661If the value is a coding system, it is used for decoding on read operation.\n\
a67a9c66 5662If not, an appropriate element is used from one of the coding system alists:\n\
10bff6f1 5663There are three such tables, `file-coding-system-alist',\n\
a67a9c66 5664`process-coding-system-alist', and `network-coding-system-alist'.");
4ed46869
KH
5665 Vcoding_system_for_read = Qnil;
5666
5667 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
10bff6f1 5668 "Specify the coding system for write operations.\n\
2ebb362d 5669It is useful to bind this variable with `let', but do not set it globally.\n\
4ed46869 5670If the value is a coding system, it is used for encoding on write operation.\n\
a67a9c66 5671If not, an appropriate element is used from one of the coding system alists:\n\
10bff6f1 5672There are three such tables, `file-coding-system-alist',\n\
a67a9c66 5673`process-coding-system-alist', and `network-coding-system-alist'.");
4ed46869
KH
5674 Vcoding_system_for_write = Qnil;
5675
5676 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
a67a9c66 5677 "Coding system used in the latest file or process I/O.");
4ed46869
KH
5678 Vlast_coding_system_used = Qnil;
5679
9ce27fde 5680 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
f07f4a24 5681 "*Non-nil means always inhibit code conversion of end-of-line format.\n\
94c7a214
DL
5682See info node `Coding Systems' and info node `Text and Binary' concerning\n\
5683such conversion.");
9ce27fde
KH
5684 inhibit_eol_conversion = 0;
5685
ed29121d
EZ
5686 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
5687 "Non-nil means process buffer inherits coding system of process output.\n\
5688Bind it to t if the process output is to be treated as if it were a file\n\
5689read from some filesystem.");
5690 inherit_process_coding_system = 0;
5691
02ba4723
KH
5692 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
5693 "Alist to decide a coding system to use for a file I/O operation.\n\
5694The format is ((PATTERN . VAL) ...),\n\
5695where PATTERN is a regular expression matching a file name,\n\
5696VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5697If VAL is a coding system, it is used for both decoding and encoding\n\
5698the file contents.\n\
5699If VAL is a cons of coding systems, the car part is used for decoding,\n\
5700and the cdr part is used for encoding.\n\
5701If VAL is a function symbol, the function must return a coding system\n\
5702or a cons of coding systems which are used as above.\n\
e0e989f6 5703\n\
a85a871a 5704See also the function `find-operation-coding-system'\n\
eda284ac 5705and the variable `auto-coding-alist'.");
02ba4723
KH
5706 Vfile_coding_system_alist = Qnil;
5707
5708 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
5709 "Alist to decide a coding system to use for a process I/O operation.\n\
5710The format is ((PATTERN . VAL) ...),\n\
5711where PATTERN is a regular expression matching a program name,\n\
5712VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5713If VAL is a coding system, it is used for both decoding what received\n\
5714from the program and encoding what sent to the program.\n\
5715If VAL is a cons of coding systems, the car part is used for decoding,\n\
5716and the cdr part is used for encoding.\n\
5717If VAL is a function symbol, the function must return a coding system\n\
5718or a cons of coding systems which are used as above.\n\
4ed46869 5719\n\
9ce27fde 5720See also the function `find-operation-coding-system'.");
02ba4723
KH
5721 Vprocess_coding_system_alist = Qnil;
5722
5723 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
5724 "Alist to decide a coding system to use for a network I/O operation.\n\
5725The format is ((PATTERN . VAL) ...),\n\
5726where PATTERN is a regular expression matching a network service name\n\
5727or is a port number to connect to,\n\
5728VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5729If VAL is a coding system, it is used for both decoding what received\n\
5730from the network stream and encoding what sent to the network stream.\n\
5731If VAL is a cons of coding systems, the car part is used for decoding,\n\
5732and the cdr part is used for encoding.\n\
5733If VAL is a function symbol, the function must return a coding system\n\
5734or a cons of coding systems which are used as above.\n\
4ed46869 5735\n\
9ce27fde 5736See also the function `find-operation-coding-system'.");
02ba4723 5737 Vnetwork_coding_system_alist = Qnil;
4ed46869 5738
7722baf9
EZ
5739 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
5740 "*String displayed in mode line for UNIX-like (LF) end-of-line format.");
5741 eol_mnemonic_unix = build_string (":");
4ed46869 5742
7722baf9
EZ
5743 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
5744 "*String displayed in mode line for DOS-like (CRLF) end-of-line format.");
5745 eol_mnemonic_dos = build_string ("\\");
4ed46869 5746
7722baf9
EZ
5747 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
5748 "*String displayed in mode line for MAC-like (CR) end-of-line format.");
5749 eol_mnemonic_mac = build_string ("/");
4ed46869 5750
7722baf9
EZ
5751 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
5752 "*String displayed in mode line when end-of-line format is not yet determined.");
5753 eol_mnemonic_undecided = build_string (":");
4ed46869 5754
84fbb8a0 5755 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
f967223b 5756 "*Non-nil enables character translation while encoding and decoding.");
84fbb8a0 5757 Venable_character_translation = Qt;
bdd9fb48 5758
f967223b
KH
5759 DEFVAR_LISP ("standard-translation-table-for-decode",
5760 &Vstandard_translation_table_for_decode,
84fbb8a0 5761 "Table for translating characters while decoding.");
f967223b 5762 Vstandard_translation_table_for_decode = Qnil;
bdd9fb48 5763
f967223b
KH
5764 DEFVAR_LISP ("standard-translation-table-for-encode",
5765 &Vstandard_translation_table_for_encode,
84fbb8a0 5766 "Table for translationg characters while encoding.");
f967223b 5767 Vstandard_translation_table_for_encode = Qnil;
4ed46869
KH
5768
5769 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
5770 "Alist of charsets vs revision numbers.\n\
5771While encoding, if a charset (car part of an element) is found,\n\
5772designate it with the escape sequence identifing revision (cdr part of the element).");
5773 Vcharset_revision_alist = Qnil;
02ba4723
KH
5774
5775 DEFVAR_LISP ("default-process-coding-system",
5776 &Vdefault_process_coding_system,
5777 "Cons of coding systems used for process I/O by default.\n\
5778The car part is used for decoding a process output,\n\
5779the cdr part is used for encoding a text to be sent to a process.");
5780 Vdefault_process_coding_system = Qnil;
c4825358 5781
3f003981
KH
5782 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
5783 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
c4825358
KH
5784This is a vector of length 256.\n\
5785If Nth element is non-nil, the existence of code N in a file\n\
bb0115a2 5786\(or output of subprocess) doesn't prevent it to be detected as\n\
3f003981
KH
5787a coding system of ISO 2022 variant which has a flag\n\
5788`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
c4825358
KH
5789or reading output of a subprocess.\n\
5790Only 128th through 159th elements has a meaning.");
3f003981 5791 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
d46c5b12
KH
5792
5793 DEFVAR_LISP ("select-safe-coding-system-function",
5794 &Vselect_safe_coding_system_function,
5795 "Function to call to select safe coding system for encoding a text.\n\
5796\n\
5797If set, this function is called to force a user to select a proper\n\
5798coding system which can encode the text in the case that a default\n\
5799coding system used in each operation can't encode the text.\n\
5800\n\
a85a871a 5801The default value is `select-safe-coding-system' (which see).");
d46c5b12
KH
5802 Vselect_safe_coding_system_function = Qnil;
5803
4ed46869
KH
5804}
5805
5806#endif /* emacs */