(help-make-xrefs): Default info references to an
[bpt/emacs.git] / src / coding.c
CommitLineData
4ed46869 1/* Coding system handler (conversion, detection, and etc).
4a2f9c6a 2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
203cb916 3 Licensed to the Free Software Foundation.
4ed46869 4
369314dc
KH
5This file is part of GNU Emacs.
6
7GNU Emacs is free software; you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation; either version 2, or (at your option)
10any later version.
4ed46869 11
369314dc
KH
12GNU Emacs is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
4ed46869 16
369314dc
KH
17You should have received a copy of the GNU General Public License
18along with GNU Emacs; see the file COPYING. If not, write to
19the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20Boston, MA 02111-1307, USA. */
4ed46869
KH
21
22/*** TABLE OF CONTENTS ***
23
24 1. Preamble
0ef69138 25 2. Emacs' internal format (emacs-mule) handlers
4ed46869
KH
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
1397dc18
KH
28 5. CCL handlers
29 6. End-of-line handlers
30 7. C library functions
31 8. Emacs Lisp library functions
32 9. Post-amble
4ed46869
KH
33
34*/
35
36/*** GENERAL NOTE on CODING SYSTEM ***
37
  A coding system is a mechanism for encoding one or more character
  sets.  Here's a list of coding systems which Emacs can handle.  When
  we say "decode", it means converting text from some other coding
  system to Emacs' internal format (emacs-mule), and when we say
  "encode", it means converting text from emacs-mule to some other
  coding system.
4ed46869 44
0ef69138 45 0. Emacs' internal format (emacs-mule)
4ed46869
KH
46
47 Emacs itself holds a multi-lingual character in a buffer and a string
f4dee582 48 in a special format. Details are described in section 2.
4ed46869
KH
49
50 1. ISO2022
51
52 The most famous coding system for multiple character sets. X's
f4dee582
RS
53 Compound Text, various EUCs (Extended Unix Code), and coding
54 systems used in Internet communication such as ISO-2022-JP are
55 all variants of ISO2022. Details are described in section 3.
4ed46869
KH
56
57 2. SJIS (or Shift-JIS or MS-Kanji-Code)
58
59 A coding system to encode character sets: ASCII, JISX0201, and
60 JISX0208. Widely used for PC's in Japan. Details are described in
f4dee582 61 section 4.
4ed46869
KH
62
63 3. BIG5
64
65 A coding system to encode character sets: ASCII and Big5. Widely
66 used by Chinese (mainly in Taiwan and Hong Kong). Details are
f4dee582
RS
67 described in section 4. In this file, when we write "BIG5"
68 (all uppercase), we mean the coding system, and when we write
69 "Big5" (capitalized), we mean the character set.
4ed46869 70
27901516
KH
71 4. Raw text
72
4608c386
KH
73 A coding system for a text containing random 8-bit code. Emacs does
74 no code conversion on such a text except for end-of-line format.
27901516
KH
75
76 5. Other
4ed46869 77
f4dee582 78 If a user wants to read/write a text encoded in a coding system not
4ed46869
KH
79 listed above, he can supply a decoder and an encoder for it in CCL
80 (Code Conversion Language) programs. Emacs executes the CCL program
81 while reading/writing.
82
d46c5b12
KH
83 Emacs represents a coding system by a Lisp symbol that has a property
84 `coding-system'. But, before actually using the coding system, the
4ed46869 85 information about it is set in a structure of type `struct
f4dee582 86 coding_system' for rapid processing. See section 6 for more details.
4ed46869
KH
87
88*/
89
90/*** GENERAL NOTES on END-OF-LINE FORMAT ***
91
92 How end-of-line of a text is encoded depends on a system. For
93 instance, Unix's format is just one byte of `line-feed' code,
f4dee582 94 whereas DOS's format is two-byte sequence of `carriage-return' and
d46c5b12
KH
95 `line-feed' codes. MacOS's format is usually one byte of
96 `carriage-return'.
4ed46869 97
f4dee582
RS
98 Since text characters encoding and end-of-line encoding are
99 independent, any coding system described above can take
4ed46869 100 any format of end-of-line. So, Emacs has information of format of
f4dee582 101 end-of-line in each coding-system. See section 6 for more details.
4ed46869
KH
102
103*/
104
105/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
106
  These functions check if a text between SRC and SRC_END is encoded
  in the coding system category XXX.  Each returns an integer value in
  which the appropriate flag bits for the category XXX are set.  The
  flag bits are defined by the macros CODING_CATEGORY_MASK_XXX.  Below
  is the template of these functions.  */
112#if 0
113int
0ef69138 114detect_coding_emacs_mule (src, src_end)
4ed46869
KH
115 unsigned char *src, *src_end;
116{
117 ...
118}
119#endif
120
121/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
122
123 These functions decode SRC_BYTES length text at SOURCE encoded in
0ef69138 124 CODING to Emacs' internal format (emacs-mule). The resulting text
d46c5b12
KH
125 goes to a place pointed to by DESTINATION, the length of which
126 should not exceed DST_BYTES. These functions set the information of
127 original and decoded texts in the members produced, produced_char,
128 consumed, and consumed_char of the structure *CODING.
129
130 The return value is an integer (CODING_FINISH_XXX) indicating how
131 the decoding finished.
132
133 DST_BYTES zero means that source area and destination area are
134 overlapped, which means that we can produce a decoded text until it
135 reaches at the head of not-yet-decoded source text.
136
137 Below is a template of these functions. */
4ed46869 138#if 0
d46c5b12 139decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
140 struct coding_system *coding;
141 unsigned char *source, *destination;
142 int src_bytes, dst_bytes;
4ed46869
KH
143{
144 ...
145}
146#endif
147
148/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
149
0ef69138
KH
150 These functions encode SRC_BYTES length text at SOURCE of Emacs'
151 internal format (emacs-mule) to CODING. The resulting text goes to
f4dee582 152 a place pointed to by DESTINATION, the length of which should not
d46c5b12
KH
153 exceed DST_BYTES. These functions set the information of
154 original and encoded texts in the members produced, produced_char,
155 consumed, and consumed_char of the structure *CODING.
156
157 The return value is an integer (CODING_FINISH_XXX) indicating how
158 the encoding finished.
159
  DST_BYTES zero means that source area and destination area are
  overlapped, which means that we can produce encoded text only until
  it reaches the head of the not-yet-encoded source text.
163
164 Below is a template of these functions. */
4ed46869 165#if 0
d46c5b12 166encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
167 struct coding_system *coding;
168 unsigned char *source, *destination;
169 int src_bytes, dst_bytes;
4ed46869
KH
170{
171 ...
172}
173#endif
174
175/*** COMMONLY USED MACROS ***/
176
/* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
   THREE_MORE_BYTES get one, two, and three bytes respectively from
   the source text, storing them in their arguments and advancing
   `src' past them.  If fewer bytes than requested remain before
   `src_end', they consume nothing and jump to `label_end_of_loop'.
   The caller should set variables `src' and `src_end' to appropriate
   areas in advance and must provide that label.

   The number of remaining bytes is computed as `src_end - src'
   instead of comparing `src + N' with `src_end', because forming a
   pointer more than one element past the end of the source area is
   undefined behavior.  */

#define ONE_MORE_BYTE(c1)                       \
  do {                                          \
    if (src_end - src < 1)                      \
      goto label_end_of_loop;                   \
    (c1) = *src++;                              \
  } while (0)

#define TWO_MORE_BYTES(c1, c2)                  \
  do {                                          \
    if (src_end - src < 2)                      \
      goto label_end_of_loop;                   \
    (c1) = *src++;                              \
    (c2) = *src++;                              \
  } while (0)

#define THREE_MORE_BYTES(c1, c2, c3)            \
  do {                                          \
    if (src_end - src < 3)                      \
      goto label_end_of_loop;                   \
    (c1) = *src++;                              \
    (c2) = *src++;                              \
    (c3) = *src++;                              \
  } while (0)
206
207/* The following three macros DECODE_CHARACTER_ASCII,
208 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
209 the multi-byte form of a character of each class at the place
210 pointed by `dst'. The caller should set the variable `dst' to
211 point to an appropriate area and the variable `coding' to point to
212 the coding-system of the currently decoding text in advance. */
213
/* Decode one ASCII character C: store it at `dst' and advance `dst'.

   While composing (COMPOSING_P (coding->composing) is non-zero), C is
   stored as the two-byte sequence 0xA0, C | 0x80 and
   coding->composed_chars is incremented.  Otherwise C is stored as a
   single byte and coding->produced_char is incremented.

   coding->fake_multibyte is set to 1 when the composing form
   C | 0x80 is below 0xA0, or when the non-composing byte C is 0x80 or
   above.

   C is evaluated exactly once, so an argument with side effects
   (e.g. `*src++') is safe.  The caller must have `dst' and `coding'
   in scope.  */

#define DECODE_CHARACTER_ASCII(c)                               \
  do {                                                          \
    int decode_ascii_c_ = (c);                                  \
    if (COMPOSING_P (coding->composing))                        \
      {                                                         \
        *dst++ = 0xA0;                                          \
        *dst++ = decode_ascii_c_ | 0x80;                        \
        coding->composed_chars++;                               \
        if ((decode_ascii_c_ | 0x80) < 0xA0)                    \
          coding->fake_multibyte = 1;                           \
      }                                                         \
    else                                                        \
      {                                                         \
        *dst++ = decode_ascii_c_;                               \
        coding->produced_char++;                                \
        if (decode_ascii_c_ >= 0x80)                            \
          coding->fake_multibyte = 1;                           \
      }                                                         \
  } while (0)
233
/* Decode one DIMENSION1 character whose charset is CHARSET and whose
   position-code is C, storing its multi-byte form at `dst' and
   advancing `dst'.

   The bytes written are: the base leading-code of CHARSET (with 0x20
   added while composing), then the extended leading-code of CHARSET
   if that is non-zero, then C | 0x80.  coding->composed_chars is
   incremented while composing, coding->produced_char otherwise.
   coding->fake_multibyte is set to 1 when C | 0x80 is below 0xA0.

   C is evaluated exactly once, after the leading codes have been
   written.  CHARSET is evaluated twice.  The caller must have `dst'
   and `coding' in scope.  */

#define DECODE_CHARACTER_DIMENSION1(charset, c)                              \
  do {                                                                       \
    unsigned char dim1_leading_code_ = CHARSET_LEADING_CODE_BASE (charset);  \
    if (COMPOSING_P (coding->composing))                                     \
      {                                                                      \
        *dst++ = dim1_leading_code_ + 0x20;                                  \
        coding->composed_chars++;                                            \
      }                                                                      \
    else                                                                     \
      {                                                                      \
        *dst++ = dim1_leading_code_;                                         \
        coding->produced_char++;                                             \
      }                                                                      \
    /* Emit the extended leading-code only when the charset has one.  */    \
    dim1_leading_code_ = CHARSET_LEADING_CODE_EXT (charset);                 \
    if (dim1_leading_code_ != 0)                                             \
      *dst++ = dim1_leading_code_;                                           \
    {                                                                        \
      int dim1_position_code_ = (c) | 0x80;                                  \
      *dst++ = dim1_position_code_;                                          \
      if (dim1_position_code_ < 0xA0)                                        \
        coding->fake_multibyte = 1;                                          \
    }                                                                        \
  } while (0)
256
/* Decode one DIMENSION2 character whose charset is CHARSET and whose
   position-codes are C1 and C2.

   The leading code(s) and first position-code are produced by
   DECODE_CHARACTER_DIMENSION1 (CHARSET, C1); then C2 | 0x80 is
   appended at `dst'.  coding->fake_multibyte is set to 1 when
   C2 | 0x80 is below 0xA0.

   C2 is evaluated exactly once, after C1 has been handled.  The
   caller must have `dst' and `coding' in scope.  */

#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
  do {                                                  \
    DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
    {                                                   \
      int dim2_position_code_ = (c2) | 0x80;            \
      *dst++ = dim2_position_code_;                     \
      if (dim2_position_code_ < 0xA0)                   \
        coding->fake_multibyte = 1;                     \
    }                                                   \
  } while (0)
267
268\f
269/*** 1. Preamble ***/
270
271#include <stdio.h>
272
273#ifdef emacs
274
275#include <config.h>
276#include "lisp.h"
277#include "buffer.h"
278#include "charset.h"
279#include "ccl.h"
280#include "coding.h"
281#include "window.h"
282
283#else /* not emacs */
284
285#include "mulelib.h"
286
287#endif /* not emacs */
288
289Lisp_Object Qcoding_system, Qeol_type;
290Lisp_Object Qbuffer_file_coding_system;
291Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
27901516 292Lisp_Object Qno_conversion, Qundecided;
bb0115a2 293Lisp_Object Qcoding_system_history;
70c22245 294Lisp_Object Qsafe_charsets;
1397dc18 295Lisp_Object Qvalid_codes;
4ed46869
KH
296
297extern Lisp_Object Qinsert_file_contents, Qwrite_region;
298Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
299Lisp_Object Qstart_process, Qopen_network_stream;
300Lisp_Object Qtarget_idx;
301
d46c5b12
KH
302Lisp_Object Vselect_safe_coding_system_function;
303
4ed46869
KH
304/* Mnemonic character of each format of end-of-line. */
305int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
306/* Mnemonic character to indicate format of end-of-line is not yet
307 decided. */
308int eol_mnemonic_undecided;
309
9ce27fde
KH
310/* Format of end-of-line decided by system. This is CODING_EOL_LF on
311 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
312int system_eol_type;
313
4ed46869
KH
#ifdef emacs

Lisp_Object Vcoding_system_list, Vcoding_system_alist;

Lisp_Object Qcoding_system_p, Qcoding_system_error;

/* Coding system emacs-mule and raw-text are for converting only
   end-of-line format.  */
Lisp_Object Qemacs_mule, Qraw_text;

/* Coding-systems are handed between Emacs Lisp programs and C internal
   routines by the following three variables.  */
/* Coding-system for reading files and receiving data from process.  */
Lisp_Object Vcoding_system_for_read;
/* Coding-system for writing files and sending data to process.  */
Lisp_Object Vcoding_system_for_write;
/* Coding-system actually used in the latest I/O.  */
Lisp_Object Vlast_coding_system_used;

/* A vector of length 256 which contains information about special
   Latin codes (especially for dealing with Microsoft codes).  */
Lisp_Object Vlatin_extra_code_table;

/* Flag to inhibit code conversion of end-of-line format.  */
int inhibit_eol_conversion;

/* Flag to make buffer-file-coding-system inherit from process-coding.  */
int inherit_process_coding_system;

/* Coding system to be used to encode text for terminal display.  */
struct coding_system terminal_coding;

/* Coding system to be used to encode text for terminal display when
   terminal coding system is nil.  */
struct coding_system safe_terminal_coding;

/* Coding system of what is sent from terminal keyboard.  */
struct coding_system keyboard_coding;

/* Default coding system to be used to write a file.  */
struct coding_system default_buffer_file_coding;

/* Alists consulted when choosing a coding system for file, process,
   and network I/O respectively (judging by the names; the lookup code
   is not visible in this part of the file).  */
Lisp_Object Vfile_coding_system_alist;
Lisp_Object Vprocess_coding_system_alist;
Lisp_Object Vnetwork_coding_system_alist;

#endif /* emacs */
361
d46c5b12 362Lisp_Object Qcoding_category, Qcoding_category_index;
4ed46869
KH
363
364/* List of symbols `coding-category-xxx' ordered by priority. */
365Lisp_Object Vcoding_category_list;
366
d46c5b12
KH
367/* Table of coding categories (Lisp symbols). */
368Lisp_Object Vcoding_category_table;
4ed46869
KH
369
370/* Table of names of symbol for each coding-category. */
371char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
0ef69138 372 "coding-category-emacs-mule",
4ed46869
KH
373 "coding-category-sjis",
374 "coding-category-iso-7",
d46c5b12 375 "coding-category-iso-7-tight",
4ed46869
KH
376 "coding-category-iso-8-1",
377 "coding-category-iso-8-2",
7717c392
KH
378 "coding-category-iso-7-else",
379 "coding-category-iso-8-else",
89fa8b36 380 "coding-category-ccl",
4ed46869 381 "coding-category-big5",
27901516 382 "coding-category-raw-text",
89fa8b36 383 "coding-category-binary"
4ed46869
KH
384};
385
66cfb530 386/* Table of pointers to coding systems corresponding to each coding
d46c5b12
KH
387 categories. */
388struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
389
66cfb530
KH
390/* Table of coding category masks. Nth element is a mask for a coding
391 cateogry of which priority is Nth. */
392static
393int coding_priorities[CODING_CATEGORY_IDX_MAX];
394
f967223b
KH
395/* Flag to tell if we look up translation table on character code
396 conversion. */
84fbb8a0 397Lisp_Object Venable_character_translation;
f967223b
KH
398/* Standard translation table to look up on decoding (reading). */
399Lisp_Object Vstandard_translation_table_for_decode;
400/* Standard translation table to look up on encoding (writing). */
401Lisp_Object Vstandard_translation_table_for_encode;
84fbb8a0 402
f967223b
KH
403Lisp_Object Qtranslation_table;
404Lisp_Object Qtranslation_table_id;
405Lisp_Object Qtranslation_table_for_decode;
406Lisp_Object Qtranslation_table_for_encode;
4ed46869
KH
407
408/* Alist of charsets vs revision number. */
409Lisp_Object Vcharset_revision_alist;
410
02ba4723
KH
411/* Default coding systems used for process I/O. */
412Lisp_Object Vdefault_process_coding_system;
413
4ed46869 414\f
0ef69138 415/*** 2. Emacs internal format (emacs-mule) handlers ***/
4ed46869
KH
416
417/* Emacs' internal format for encoding multiple character sets is a
f4dee582
RS
418 kind of multi-byte encoding, i.e. characters are encoded by
419 variable-length sequences of one-byte codes. ASCII characters
420 and control characters (e.g. `tab', `newline') are represented by
421 one-byte sequences which are their ASCII codes, in the range 0x00
422 through 0x7F. The other characters are represented by a sequence
423 of `base leading-code', optional `extended leading-code', and one
424 or two `position-code's. The length of the sequence is determined
425 by the base leading-code. Leading-code takes the range 0x80
426 through 0x9F, whereas extended leading-code and position-code take
427 the range 0xA0 through 0xFF. See `charset.h' for more details
428 about leading-code and position-code.
429
430 There's one exception to this rule. Special leading-code
4ed46869
KH
431 `leading-code-composition' denotes that the following several
432 characters should be composed into one character. Leading-codes of
433 components (except for ASCII) are added 0x20. An ASCII character
434 component is represented by a 2-byte sequence of `0xA0' and
f4dee582
RS
435 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
436 details of composite character. Hence, we can summarize the code
4ed46869
KH
437 range as follows:
438
439 --- CODE RANGE of Emacs' internal format ---
440 (character set) (range)
441 ASCII 0x00 .. 0x7F
442 ELSE (1st byte) 0x80 .. 0x9F
443 (rest bytes) 0xA0 .. 0xFF
444 ---------------------------------------------
445
446 */
447
448enum emacs_code_class_type emacs_code_class[256];
449
/* Consume one byte at `src' and go on to the next statement only if
   that byte is in the range 0xA0..0xFF (0xA0 itself is accepted).
   If `src' has already reached `src_end', jump to
   `label_end_of_switch' without consuming anything.  If the byte is
   less than 0xA0, make the enclosing function return 0.  The caller
   must have `src' and `src_end' in scope and must define the
   label.  */
#define CHECK_CODE_RANGE_A0_FF          \
  do {                                  \
    if (src >= src_end)                 \
      goto label_end_of_switch;         \
    if (*src++ < 0xA0)                  \
      return 0;                         \
  } while (0)
459
460/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
461 Check if a text is encoded in Emacs' internal format. If it is,
d46c5b12 462 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
4ed46869
KH
463
464int
0ef69138 465detect_coding_emacs_mule (src, src_end)
4ed46869
KH
466 unsigned char *src, *src_end;
467{
468 unsigned char c;
469 int composing = 0;
470
471 while (src < src_end)
472 {
473 c = *src++;
474
475 if (composing)
476 {
477 if (c < 0xA0)
478 composing = 0;
479 else
480 c -= 0x20;
481 }
482
483 switch (emacs_code_class[c])
484 {
485 case EMACS_ascii_code:
486 case EMACS_linefeed_code:
487 break;
488
489 case EMACS_control_code:
490 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
491 return 0;
492 break;
493
494 case EMACS_invalid_code:
495 return 0;
496
497 case EMACS_leading_code_composition: /* c == 0x80 */
498 if (composing)
499 CHECK_CODE_RANGE_A0_FF;
500 else
501 composing = 1;
502 break;
503
504 case EMACS_leading_code_4:
505 CHECK_CODE_RANGE_A0_FF;
506 /* fall down to check it two more times ... */
507
508 case EMACS_leading_code_3:
509 CHECK_CODE_RANGE_A0_FF;
510 /* fall down to check it one more time ... */
511
512 case EMACS_leading_code_2:
513 CHECK_CODE_RANGE_A0_FF;
514 break;
515
516 default:
517 label_end_of_switch:
518 break;
519 }
520 }
0ef69138 521 return CODING_CATEGORY_MASK_EMACS_MULE;
4ed46869
KH
522}
523
524\f
525/*** 3. ISO2022 handlers ***/
526
527/* The following note describes the coding system ISO2022 briefly.
f4dee582
RS
528 Since the intention of this note is to help in understanding of
529 the programs in this file, some parts are NOT ACCURATE or OVERLY
4ed46869
KH
530 SIMPLIFIED. For the thorough understanding, please refer to the
531 original document of ISO2022.
532
533 ISO2022 provides many mechanisms to encode several character sets
   in 7-bit and 8-bit environments.  If one chooses a 7-bit environment,
4ed46869 535 all text is encoded by codes of less than 128. This may make the
f4dee582
RS
536 encoded text a little bit longer, but the text gets more stability
537 to pass through several gateways (some of them strip off the MSB).
4ed46869 538
f4dee582 539 There are two kinds of character set: control character set and
4ed46869
KH
540 graphic character set. The former contains control characters such
541 as `newline' and `escape' to provide control functions (control
f4dee582 542 functions are provided also by escape sequences). The latter
4ed46869
KH
543 contains graphic characters such as ' A' and '-'. Emacs recognizes
544 two control character sets and many graphic character sets.
545
546 Graphic character sets are classified into one of the following
547 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
548 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
549 bytes (DIMENSION) and the number of characters in one dimension
550 (CHARS) of the set. In addition, each character set is assigned an
551 identification tag (called "final character" and denoted as <F>
552 here after) which is unique in each class. <F> of each character
553 set is decided by ECMA(*) when it is registered in ISO. Code range
554 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
555
556 Note (*): ECMA = European Computer Manufacturers Association
557
558 Here are examples of graphic character set [NAME(<F>)]:
559 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
560 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
561 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
562 o DIMENSION2_CHARS96 -- none for the moment
563
564 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
565 C0 [0x00..0x1F] -- control character plane 0
566 GL [0x20..0x7F] -- graphic character plane 0
567 C1 [0x80..0x9F] -- control character plane 1
568 GR [0xA0..0xFF] -- graphic character plane 1
569
570 A control character set is directly designated and invoked to C0 or
571 C1 by an escape sequence. The most common case is that ISO646's
572 control character set is designated/invoked to C0 and ISO6429's
573 control character set is designated/invoked to C1, and usually
574 these designations/invocations are omitted in a coded text. With
575 7-bit environment, only C0 can be used, and a control character for
576 C1 is encoded by an appropriate escape sequence to fit in the
   environment.  Every control character of C1 has a corresponding
   escape sequence defined.
579
580 A graphic character set is at first designated to one of four
581 graphic registers (G0 through G3), then these graphic registers are
582 invoked to GL or GR. These designations and invocations can be
583 done independently. The most common case is that G0 is invoked to
584 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
585 these invocations and designations are omitted in a coded text.
586 With 7-bit environment, only GL can be used.
587
588 When a graphic character set of CHARS94 is invoked to GL, code 0x20
589 and 0x7F of GL area work as control characters SPACE and DEL
590 respectively, and code 0xA0 and 0xFF of GR area should not be used.
591
592 There are two ways of invocation: locking-shift and single-shift.
593 With locking-shift, the invocation lasts until the next different
594 invocation, whereas with single-shift, the invocation works only
595 for the following character and doesn't affect locking-shift.
596 Invocations are done by the following control characters or escape
597 sequences.
598
599 ----------------------------------------------------------------------
600 function control char escape sequence description
601 ----------------------------------------------------------------------
602 SI (shift-in) 0x0F none invoke G0 to GL
10bff6f1 603 SO (shift-out) 0x0E none invoke G1 to GL
4ed46869
KH
604 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
605 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
606 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
607 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
608 ----------------------------------------------------------------------
609 The first four are for locking-shift. Control characters for these
610 functions are defined by macros ISO_CODE_XXX in `coding.h'.
611
612 Designations are done by the following escape sequences.
613 ----------------------------------------------------------------------
614 escape sequence description
615 ----------------------------------------------------------------------
616 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
617 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
618 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
619 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
620 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
621 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
622 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
623 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
624 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
625 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
626 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
627 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
628 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
629 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
630 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
631 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
632 ----------------------------------------------------------------------
633
634 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
635 of dimension 1, chars 94, and final character <F>, and etc.
636
637 Note (*): Although these designations are not allowed in ISO2022,
638 Emacs accepts them on decoding, and produces them on encoding
639 CHARS96 character set in a coding system which is characterized as
640 7-bit environment, non-locking-shift, and non-single-shift.
641
642 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
643 '(' can be omitted. We call this as "short-form" here after.
644
   Now you may notice that there are a lot of ways of encoding the
   same multilingual text in ISO2022.  Actually, there exist many
   coding systems such as Compound Text (used in X's inter-client
   communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
   (used in Korean Internet), and EUC (Extended UNIX Code, used in
   Asian localized platforms), and all of these are variants of
   ISO2022.
651
652 In addition to the above, Emacs handles two more kinds of escape
653 sequences: ISO6429's direction specification and Emacs' private
654 sequence for specifying character composition.
655
656 ISO6429's direction specification takes the following format:
657 o CSI ']' -- end of the current direction
658 o CSI '0' ']' -- end of the current direction
659 o CSI '1' ']' -- start of left-to-right text
660 o CSI '2' ']' -- start of right-to-left text
661 The control character CSI (0x9B: control sequence introducer) is
662 abbreviated to the escape sequence ESC '[' in 7-bit environment.
663
664 Character composition specification takes the following format:
665 o ESC '0' -- start character composition
666 o ESC '1' -- end character composition
667 Since these are not standard escape sequences of any ISO, the use
668 of them for these meaning is restricted to Emacs only. */
669
670enum iso_code_class_type iso_code_class[256];
671
f024b6aa
RS
/* Non-zero if coding_system_table[IDX] is non-null and that coding
   system accepts CHARSET: either its safe_charsets[CHARSET] entry is
   non-zero, or CODING_SPEC_ISO_REQUESTED_DESIGNATION for CHARSET is
   something other than CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION.
   IDX and CHARSET are evaluated more than once.  */
#define CHARSET_OK(idx, charset)                                \
  (coding_system_table[idx]                                     \
   && (coding_system_table[idx]->safe_charsets[charset]         \
       || (CODING_SPEC_ISO_REQUESTED_DESIGNATION                \
           (coding_system_table[idx], charset)                  \
           != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)))
d46c5b12
KH
678
/* Non-zero if coding_system_table[IDX] is non-null and its
   CODING_SPEC_ISO_INITIAL_DESIGNATION for register 1 is non-negative
   (presumably meaning a charset is initially designated to G1, so a
   shift-out is meaningful -- confirm against `coding.h').  The null
   check mirrors CHARSET_OK, so an unset table entry yields zero
   instead of being passed on to the designation macro.  IDX is
   evaluated more than once.  */
#define SHIFT_OUT_OK(idx)                                               \
  (coding_system_table[idx]                                             \
   && (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) \
       >= 0))
681
4ed46869
KH
682/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
683 Check if a text is encoded in ISO2022. If it is, returns an
684 integer in which appropriate flag bits any of:
685 CODING_CATEGORY_MASK_ISO_7
d46c5b12 686 CODING_CATEGORY_MASK_ISO_7_TIGHT
4ed46869
KH
687 CODING_CATEGORY_MASK_ISO_8_1
688 CODING_CATEGORY_MASK_ISO_8_2
7717c392
KH
689 CODING_CATEGORY_MASK_ISO_7_ELSE
690 CODING_CATEGORY_MASK_ISO_8_ELSE
4ed46869
KH
691 are set. If a code which should never appear in ISO2022 is found,
692 returns 0. */
693
694int
695detect_coding_iso2022 (src, src_end)
696 unsigned char *src, *src_end;
697{
d46c5b12
KH
698 int mask = CODING_CATEGORY_MASK_ISO;
699 int mask_found = 0;
f46869e4 700 int reg[4], shift_out = 0, single_shifting = 0;
d46c5b12 701 int c, c1, i, charset;
3f003981 702
d46c5b12 703 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
3f003981 704 while (mask && src < src_end)
4ed46869
KH
705 {
706 c = *src++;
707 switch (c)
708 {
709 case ISO_CODE_ESC:
f46869e4 710 single_shifting = 0;
e0e989f6 711 if (src >= src_end)
4ed46869
KH
712 break;
713 c = *src++;
d46c5b12 714 if (c >= '(' && c <= '/')
4ed46869 715 {
bf9cdd4e
KH
716 /* Designation sequence for a charset of dimension 1. */
717 if (src >= src_end)
718 break;
d46c5b12
KH
719 c1 = *src++;
720 if (c1 < ' ' || c1 >= 0x80
721 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
722 /* Invalid designation sequence. Just ignore. */
723 break;
724 reg[(c - '(') % 4] = charset;
bf9cdd4e
KH
725 }
726 else if (c == '$')
727 {
728 /* Designation sequence for a charset of dimension 2. */
729 if (src >= src_end)
730 break;
731 c = *src++;
732 if (c >= '@' && c <= 'B')
733 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
d46c5b12 734 reg[0] = charset = iso_charset_table[1][0][c];
bf9cdd4e 735 else if (c >= '(' && c <= '/')
bcf26d6a 736 {
bf9cdd4e
KH
737 if (src >= src_end)
738 break;
d46c5b12
KH
739 c1 = *src++;
740 if (c1 < ' ' || c1 >= 0x80
741 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
742 /* Invalid designation sequence. Just ignore. */
743 break;
744 reg[(c - '(') % 4] = charset;
bcf26d6a 745 }
bf9cdd4e 746 else
d46c5b12
KH
747 /* Invalid designation sequence. Just ignore. */
748 break;
749 }
ae9ff118 750 else if (c == 'N' || c == 'O')
d46c5b12 751 {
ae9ff118
KH
752 /* ESC <Fe> for SS2 or SS3. */
753 mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
d46c5b12 754 break;
4ed46869 755 }
bf9cdd4e 756 else if (c == '0' || c == '1' || c == '2')
ae9ff118 757 /* ESC <Fp> for start/end composition. Just ignore. */
d46c5b12 758 break;
bf9cdd4e 759 else
d46c5b12
KH
760 /* Invalid escape sequence. Just ignore. */
761 break;
762
763 /* We found a valid designation sequence for CHARSET. */
764 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
765 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset))
766 mask_found |= CODING_CATEGORY_MASK_ISO_7;
767 else
768 mask &= ~CODING_CATEGORY_MASK_ISO_7;
769 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset))
770 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
771 else
772 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
ae9ff118
KH
773 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset))
774 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
775 else
d46c5b12 776 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
ae9ff118
KH
777 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset))
778 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
779 else
d46c5b12 780 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
4ed46869
KH
781 break;
782
4ed46869 783 case ISO_CODE_SO:
f46869e4 784 single_shifting = 0;
d46c5b12
KH
785 if (shift_out == 0
786 && (reg[1] >= 0
787 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
788 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
789 {
790 /* Locking shift out. */
791 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
792 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
793 }
e0e989f6
KH
794 break;
795
d46c5b12 796 case ISO_CODE_SI:
f46869e4 797 single_shifting = 0;
d46c5b12
KH
798 if (shift_out == 1)
799 {
800 /* Locking shift in. */
801 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
802 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
803 }
804 break;
805
4ed46869 806 case ISO_CODE_CSI:
f46869e4 807 single_shifting = 0;
4ed46869
KH
808 case ISO_CODE_SS2:
809 case ISO_CODE_SS3:
3f003981
KH
810 {
811 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
812
70c22245
KH
813 if (c != ISO_CODE_CSI)
814 {
d46c5b12
KH
815 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
816 & CODING_FLAG_ISO_SINGLE_SHIFT)
70c22245 817 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
818 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
819 & CODING_FLAG_ISO_SINGLE_SHIFT)
70c22245 820 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
f46869e4 821 single_shifting = 1;
70c22245 822 }
3f003981
KH
823 if (VECTORP (Vlatin_extra_code_table)
824 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
825 {
d46c5b12
KH
826 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
827 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981 828 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
829 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
830 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981
KH
831 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
832 }
833 mask &= newmask;
d46c5b12 834 mask_found |= newmask;
3f003981
KH
835 }
836 break;
4ed46869
KH
837
838 default:
839 if (c < 0x80)
f46869e4
KH
840 {
841 single_shifting = 0;
842 break;
843 }
4ed46869 844 else if (c < 0xA0)
c4825358 845 {
f46869e4 846 single_shifting = 0;
3f003981
KH
847 if (VECTORP (Vlatin_extra_code_table)
848 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
c4825358 849 {
3f003981
KH
850 int newmask = 0;
851
d46c5b12
KH
852 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
853 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981 854 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
855 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
856 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981
KH
857 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
858 mask &= newmask;
d46c5b12 859 mask_found |= newmask;
c4825358 860 }
3f003981
KH
861 else
862 return 0;
c4825358 863 }
4ed46869
KH
864 else
865 {
7717c392 866 unsigned char *src_begin = src;
4ed46869 867
d46c5b12 868 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
7717c392 869 | CODING_CATEGORY_MASK_ISO_7_ELSE);
d46c5b12 870 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
f46869e4
KH
871 /* Check the length of succeeding codes of the range
872 0xA0..0xFF. If the byte length is odd, we exclude
873 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
874 when we are not single shifting. */
875 if (!single_shifting)
876 {
877 while (src < src_end && *src >= 0xA0)
878 src++;
879 if ((src - src_begin - 1) & 1 && src < src_end)
880 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
881 else
882 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
883 }
4ed46869
KH
884 }
885 break;
886 }
887 }
888
d46c5b12 889 return (mask & mask_found);
4ed46869
KH
890}
891
/* Decode one character whose charset is CHARSET and whose 1st
   position code is C1, storing the result at DST.  If the dimension
   of CHARSET is 2, the 2nd position code is fetched from SRC and set
   to C2.  A negative CHARSET means that we are decoding ill formed
   text, and all we can do is to read C1 as is.

   The expansion refers to `coding', `src', `dst', `c2' and
   `translation_table' in the scope of the caller.  */

#define DECODE_ISO_CHARACTER(charset, c1)				\
  do {									\
    int charset_alt = (charset);					\
    int c_alt;								\
									\
    if (COMPOSING_HEAD_P (coding->composing))				\
      {									\
	/* Head of a composition: emit the leading code, followed by	\
	   0xFF when composition rules are embedded.  */		\
	*dst++ = LEADING_CODE_COMPOSITION;				\
	if (COMPOSING_WITH_RULE_P (coding->composing))			\
	  {								\
	    *dst++ = 0xFF;						\
	  }								\
	coding->composing += 2;						\
      }									\
    if (charset_alt >= 0)						\
      {									\
	if (CHARSET_DIMENSION (charset_alt) == 2)			\
	  {								\
	    ONE_MORE_BYTE (c2);						\
	    if (! (iso_code_class[(c2) & 0x7F] == ISO_0x20_or_0x7F	\
		   || iso_code_class[(c2) & 0x7F] == ISO_graphic_plane_0)) \
	      {								\
		/* The 2nd byte is not a graphic code: give it back	\
		   and treat C1 as ASCII.  */				\
		src--;							\
		charset_alt = CHARSET_ASCII;				\
	      }								\
	  }								\
	if (!NILP (translation_table))					\
	  {								\
	    c_alt = translate_char (translation_table,			\
				    -1, charset_alt, c1, c2);		\
	    if (c_alt >= 0)						\
	      SPLIT_CHAR (c_alt, charset_alt, c1, c2);			\
	  }								\
      }									\
    if (charset_alt < 0 || charset_alt == CHARSET_ASCII)		\
      {									\
	DECODE_CHARACTER_ASCII (c1);					\
      }									\
    else if (CHARSET_DIMENSION (charset_alt) == 1)			\
      {									\
	DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);			\
      }									\
    else								\
      {									\
	DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);		\
      }									\
    if (COMPOSING_WITH_RULE_P (coding->composing))			\
      {									\
	/* To tell a composition rule follows.  */			\
	coding->composing = COMPOSING_WITH_RULE_RULE;			\
      }									\
  } while (0)
936
/* Set designation state into CODING: the charset identified by
   DIMENSION, CHARS and FINAL_CHAR becomes the one designated to
   graphic register REG.  When the designation is not accepted,
   control jumps to `label_invalid_code' in the caller.  */

#define DECODE_DESIGNATION(reg, dimension, chars, final_char)		\
  do {									\
    int charset;							\
									\
    if (final_char < '0' || final_char >= 128)				\
      goto label_invalid_code;						\
    charset = ISO_CHARSET_TABLE (make_number (dimension),		\
				 make_number (chars),			\
				 make_number (final_char));		\
    if (charset < 0							\
	|| (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) != reg \
	    && !coding->safe_charsets[charset]))			\
      {									\
	/* No such charset, or one that CODING neither requests for	\
	   REG nor marks as safe.  Remember the register.  */		\
	coding->spec.iso2022.last_invalid_designation_register = reg;	\
	goto label_invalid_code;					\
      }									\
    if (coding->spec.iso2022.last_invalid_designation_register == 0	\
	&& reg == 0							\
	&& charset == CHARSET_ASCII)					\
      {									\
	/* We should insert this designation sequence as is so		\
	   that it is surely written back to a file.  */		\
	coding->spec.iso2022.last_invalid_designation_register = -1;	\
	goto label_invalid_code;					\
      }									\
    coding->spec.iso2022.last_invalid_designation_register = -1;	\
    if ((coding->mode & CODING_MODE_DIRECTION)				\
	&& CHARSET_REVERSE_CHARSET (charset) >= 0)			\
      {									\
	charset = CHARSET_REVERSE_CHARSET (charset);			\
      }									\
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;		\
  } while (0)
972
88993dfd
KH
973/* Return 0 if there's a valid composing sequence starting at SRC and
974 ending before SRC_END, else return -1. */
d46c5b12 975
84fbb8a0
KH
976int
977check_composing_code (coding, src, src_end)
d46c5b12
KH
978 struct coding_system *coding;
979 unsigned char *src, *src_end;
980{
d46c5b12
KH
981 int charset, c, c1, dim;
982
983 while (src < src_end)
984 {
88993dfd
KH
985 c = *src++;
986 if (c >= 0x20)
987 continue;
988 if (c != ISO_CODE_ESC || src >= src_end)
989 return -1;
990 c = *src++;
991 if (c == '1') /* end of compsition */
992 return 0;
993 if (src + 2 >= src_end
994 || !coding->flags & CODING_FLAG_ISO_DESIGNATION)
995 return -1;
996
997 dim = (c == '$');
998 if (dim == 1)
999 c = (*src >= '@' && *src <= 'B') ? '(' : *src++;
1000 if (c >= '(' && c <= '/')
d46c5b12 1001 {
88993dfd
KH
1002 c1 = *src++;
1003 if ((c1 < ' ' || c1 >= 0x80)
1004 || (charset = iso_charset_table[dim][c >= ','][c1]) < 0
1005 || ! coding->safe_charsets[charset]
1006 || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
1007 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1008 return -1;
d46c5b12 1009 }
88993dfd
KH
1010 else
1011 return -1;
d46c5b12 1012 }
88993dfd
KH
1013
1014 /* We have not found the sequence "ESC 1". */
1015 return -1;
d46c5b12
KH
1016}
1017
4ed46869
KH
1018/* Decode the SRC_BYTES bytes at SOURCE, encoded in the ISO2022
   variant described by CODING, and store the result at DESTINATION.

   On return CODING->consumed and CODING->consumed_char are the number
   of source bytes processed and CODING->produced is the number of
   bytes stored.  CODING->produced_char, CODING->composed_chars and
   CODING->fake_multibyte are reset at entry and updated while
   decoding.  The value is CODING_FINISH_NORMAL,
   CODING_FINISH_INSUFFICIENT_SRC, CODING_FINISH_INSUFFICIENT_DST or
   CODING_FINISH_INCONSISTENT_EOL.

   See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1019
1020int
d46c5b12 1021decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
1022 struct coding_system *coding;
1023 unsigned char *source, *destination;
1024 int src_bytes, dst_bytes;
4ed46869
KH
1025{
1026 unsigned char *src = source;
1027 unsigned char *src_end = source + src_bytes;
1028 unsigned char *dst = destination;
1029 unsigned char *dst_end = destination + dst_bytes;
1030 /* Since the maximum bytes produced by each loop is 7, we subtract 6
1031 from DST_END to assure that overflow checking is necessary only
1032 at the head of loop. */
1033 unsigned char *adjusted_dst_end = dst_end - 6;
1034 int charset;
1035 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1036 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1037 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
84fbb8a0 1038 Lisp_Object translation_table
f967223b 1039 = coding->translation_table_for_decode;
d46c5b12 1040 int result = CODING_FINISH_NORMAL;
bdd9fb48 1041
/* Fall back to the standard decode translation table when CODING has
   none of its own and character translation is enabled. */
84fbb8a0 1042 if (!NILP (Venable_character_translation) && NILP (translation_table))
f967223b 1043 translation_table = Vstandard_translation_table_for_decode;
4ed46869 1044
d46c5b12 1045 coding->produced_char = 0;
de79a6a5 1046 coding->composed_chars = 0;
fb88bf2d 1047 coding->fake_multibyte = 0;
/* When DST_BYTES is zero the output bound is taken from SRC rather
   than from DST_END.  NOTE(review): presumably this is the case of
   decoding within a single buffer -- confirm against the callers. */
d46c5b12
KH
1048 while (src < src_end && (dst_bytes
1049 ? (dst < adjusted_dst_end)
1050 : (dst < src - 6)))
4ed46869
KH
1051 {
1052 /* SRC_BASE remembers the start position in source in each loop.
1053 The loop will be exited when there's not enough source text
1054 to analyze long escape sequence or 2-byte code (within macros
1055 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
1056 to SRC_BASE before exiting. */
1057 unsigned char *src_base = src;
bdd9fb48 1058 int c1 = *src++, c2;
4ed46869
KH
1059
1060 switch (iso_code_class [c1])
1061 {
1062 case ISO_0x20_or_0x7F:
1063 if (!coding->composing
1064 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
1065 {
1066 /* This is SPACE or DEL. */
1067 *dst++ = c1;
d46c5b12 1068 coding->produced_char++;
4ed46869
KH
1069 break;
1070 }
1071 /* This is a graphic character, we fall down ... */
1072
1073 case ISO_graphic_plane_0:
1074 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1075 {
1076 /* This is a composition rule. */
1077 *dst++ = c1 | 0x80;
1078 coding->composing = COMPOSING_WITH_RULE_TAIL;
1079 }
1080 else
1081 DECODE_ISO_CHARACTER (charset0, c1);
1082 break;
1083
1084 case ISO_0xA0_or_0xFF:
d46c5b12
KH
1085 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1086 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
fb88bf2d 1087 goto label_invalid_code;
4ed46869
KH
1088 /* This is a graphic character, we fall down ... */
1089
1090 case ISO_graphic_plane_1:
d46c5b12 1091 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
fb88bf2d 1092 goto label_invalid_code;
d46c5b12
KH
1093 else
1094 DECODE_ISO_CHARACTER (charset1, c1);
4ed46869
KH
1095 break;
1096
1097 case ISO_control_code:
1098 /* All ISO2022 control characters in this class have the
1099 same representation in Emacs internal format. */
d46c5b12
KH
1100 if (c1 == '\n'
1101 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1102 && (coding->eol_type == CODING_EOL_CR
1103 || coding->eol_type == CODING_EOL_CRLF))
1104 {
1105 result = CODING_FINISH_INCONSISTENT_EOL;
1106 goto label_end_of_loop_2;
1107 }
4ed46869 1108 *dst++ = c1;
d46c5b12 1109 coding->produced_char++;
174a4cbe
KH
1110 if (c1 >= 0x80)
1111 coding->fake_multibyte = 1;
4ed46869
KH
1112 break;
1113
1114 case ISO_carriage_return:
1115 if (coding->eol_type == CODING_EOL_CR)
d46c5b12 1116 *dst++ = '\n';
4ed46869
KH
1117 else if (coding->eol_type == CODING_EOL_CRLF)
1118 {
1119 ONE_MORE_BYTE (c1);
1120 if (c1 == ISO_CODE_LF)
1121 *dst++ = '\n';
1122 else
1123 {
d46c5b12
KH
1124 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1125 {
1126 result = CODING_FINISH_INCONSISTENT_EOL;
1127 goto label_end_of_loop_2;
1128 }
/* A lone CR: give back the byte just read and emit the CR itself. */
4ed46869 1129 src--;
d46c5b12 1130 *dst++ = '\r';
4ed46869
KH
1131 }
1132 }
1133 else
d46c5b12
KH
1134 *dst++ = c1;
1135 coding->produced_char++;
4ed46869
KH
1136 break;
1137
1138 case ISO_shift_out:
d46c5b12
KH
1139 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1140 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1141 goto label_invalid_code;
4ed46869
KH
1142 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1143 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1144 break;
1145
1146 case ISO_shift_in:
d46c5b12
KH
1147 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1148 goto label_invalid_code;
4ed46869
KH
1149 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1150 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1151 break;
1152
1153 case ISO_single_shift_2_7:
1154 case ISO_single_shift_2:
d46c5b12
KH
1155 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1156 goto label_invalid_code;
4ed46869
KH
1157 /* SS2 is handled as an escape sequence of ESC 'N' */
1158 c1 = 'N';
1159 goto label_escape_sequence;
1160
1161 case ISO_single_shift_3:
d46c5b12
KH
1162 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1163 goto label_invalid_code;
4ed46869
KH
1164 /* SS3 is handled as an escape sequence of ESC 'O' */
1165 c1 = 'O';
1166 goto label_escape_sequence;
1167
1168 case ISO_control_sequence_introducer:
1169 /* CSI is handled as an escape sequence of ESC '[' ... */
1170 c1 = '[';
1171 goto label_escape_sequence;
1172
1173 case ISO_escape:
1174 ONE_MORE_BYTE (c1);
1175 label_escape_sequence:
1176 /* Escape sequences handled by Emacs are invocation,
1177 designation, direction specification, and character
1178 composition specification. */
1179 switch (c1)
1180 {
1181 case '&': /* revision of following character set */
1182 ONE_MORE_BYTE (c1);
1183 if (!(c1 >= '@' && c1 <= '~'))
d46c5b12 1184 goto label_invalid_code;
4ed46869
KH
1185 ONE_MORE_BYTE (c1);
1186 if (c1 != ISO_CODE_ESC)
d46c5b12 1187 goto label_invalid_code;
4ed46869
KH
1188 ONE_MORE_BYTE (c1);
1189 goto label_escape_sequence;
1190
1191 case '$': /* designation of 2-byte character set */
d46c5b12
KH
1192 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1193 goto label_invalid_code;
4ed46869
KH
1194 ONE_MORE_BYTE (c1);
1195 if (c1 >= '@' && c1 <= 'B')
1196 { /* designation of JISX0208.1978, GB2312.1980,
88993dfd 1197 or JISX0208.1980 */
4ed46869
KH
1198 DECODE_DESIGNATION (0, 2, 94, c1);
1199 }
1200 else if (c1 >= 0x28 && c1 <= 0x2B)
1201 { /* designation of DIMENSION2_CHARS94 character set */
1202 ONE_MORE_BYTE (c2);
1203 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1204 }
1205 else if (c1 >= 0x2C && c1 <= 0x2F)
1206 { /* designation of DIMENSION2_CHARS96 character set */
1207 ONE_MORE_BYTE (c2);
1208 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1209 }
1210 else
d46c5b12 1211 goto label_invalid_code;
4ed46869
KH
1212 break;
1213
1214 case 'n': /* invocation of locking-shift-2 */
d46c5b12
KH
1215 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1216 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1217 goto label_invalid_code;
4ed46869 1218 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
e0e989f6 1219 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
1220 break;
1221
1222 case 'o': /* invocation of locking-shift-3 */
d46c5b12
KH
1223 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1224 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1225 goto label_invalid_code;
4ed46869 1226 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
e0e989f6 1227 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
1228 break;
1229
1230 case 'N': /* invocation of single-shift-2 */
d46c5b12
KH
1231 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1232 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1233 goto label_invalid_code;
4ed46869
KH
1234 ONE_MORE_BYTE (c1);
1235 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1236 DECODE_ISO_CHARACTER (charset, c1);
1237 break;
1238
1239 case 'O': /* invocation of single-shift-3 */
d46c5b12
KH
1240 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1241 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1242 goto label_invalid_code;
4ed46869
KH
1243 ONE_MORE_BYTE (c1);
1244 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1245 DECODE_ISO_CHARACTER (charset, c1);
1246 break;
1247
d46c5b12
KH
1248 case '0': case '2': /* start composing */
1249 /* Before processing composing, we must be sure that all
1250 characters being composed are supported by CODING.
88993dfd
KH
1251 If not, we must give up composing. */
1252 if (check_composing_code (coding, src, src_end) == 0)
1253 {
1254 /* We are looking at a valid composition sequence. */
1255 coding->composing = (c1 == '0'
1256 ? COMPOSING_NO_RULE_HEAD
1257 : COMPOSING_WITH_RULE_HEAD);
1258 coding->composed_chars = 0;
1259 }
1260 else
1261 {
/* Not a usable composition: pass the two bytes of the escape
   sequence through unchanged. */
1262 *dst++ = ISO_CODE_ESC;
1263 *dst++ = c1;
1264 coding->produced_char += 2;
1265 }
4ed46869
KH
1266 break;
1267
1268 case '1': /* end composing */
88993dfd
KH
1269 if (!coding->composing)
1270 {
1271 *dst++ = ISO_CODE_ESC;
1272 *dst++ = c1;
1273 coding->produced_char += 2;
1274 break;
1275 }
1276
de79a6a5
KH
1277 if (coding->composed_chars > 0)
1278 {
1279 if (coding->composed_chars == 1)
1280 {
1281 unsigned char *this_char_start = dst;
1282 int this_bytes;
1283
1284 /* Only one character is in the composing
1285 sequence. Make it a normal character. */
1286 while (*--this_char_start != LEADING_CODE_COMPOSITION);
1287 dst = (this_char_start
1288 + (coding->composing == COMPOSING_NO_RULE_TAIL
1289 ? 1 : 2));
1290 *dst -= 0x20;
1291 if (*dst == 0x80)
1292 *++dst &= 0x7F;
1293 this_bytes = BYTES_BY_CHAR_HEAD (*dst);
1294 while (this_bytes--) *this_char_start++ = *dst++;
1295 dst = this_char_start;
1296 }
1297 coding->produced_char++;
1298 }
4ed46869 1299 coding->composing = COMPOSING_NO;
4ed46869
KH
1300 break;
1301
1302 case '[': /* specification of direction */
d46c5b12
KH
1303 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1304 goto label_invalid_code;
4ed46869 1305 /* For the moment, nested direction is not supported.
d46c5b12
KH
1306 So, `coding->mode & CODING_MODE_DIRECTION' zero means
1307 left-to-right, and nonzero means right-to-left. */
4ed46869
KH
1308 ONE_MORE_BYTE (c1);
1309 switch (c1)
1310 {
1311 case ']': /* end of the current direction */
d46c5b12 1312 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869
KH
1313
/* NOTE(review): there is no `break' above, so after "ESC [ ]" control
   falls into the '0'/'1' case, which reads one more byte and treats
   the sequence as invalid unless that byte is ']' -- confirm that
   this fall-through is intended. */
1314 case '0': /* end of the current direction */
1315 case '1': /* start of left-to-right direction */
1316 ONE_MORE_BYTE (c1);
1317 if (c1 == ']')
d46c5b12 1318 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869 1319 else
d46c5b12 1320 goto label_invalid_code;
4ed46869
KH
1321 break;
1322
1323 case '2': /* start of right-to-left direction */
1324 ONE_MORE_BYTE (c1);
1325 if (c1 == ']')
d46c5b12 1326 coding->mode |= CODING_MODE_DIRECTION;
4ed46869 1327 else
d46c5b12 1328 goto label_invalid_code;
4ed46869
KH
1329 break;
1330
1331 default:
d46c5b12 1332 goto label_invalid_code;
4ed46869
KH
1333 }
1334 break;
1335
1336 default:
d46c5b12
KH
1337 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1338 goto label_invalid_code;
4ed46869
KH
1339 if (c1 >= 0x28 && c1 <= 0x2B)
1340 { /* designation of DIMENSION1_CHARS94 character set */
1341 ONE_MORE_BYTE (c2);
1342 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1343 }
1344 else if (c1 >= 0x2C && c1 <= 0x2F)
1345 { /* designation of DIMENSION1_CHARS96 character set */
1346 ONE_MORE_BYTE (c2);
1347 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1348 }
1349 else
1350 {
d46c5b12 1351 goto label_invalid_code;
4ed46869
KH
1352 }
1353 }
1354 /* We must update these variables now. */
1355 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1356 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1357 break;
1358
/* Invalid code: copy the bytes read in this iteration (SRC_BASE up to
   SRC) to the destination unchanged. */
d46c5b12 1359 label_invalid_code:
d46c5b12
KH
1360 while (src_base < src)
1361 *dst++ = *src_base++;
fb88bf2d 1362 coding->fake_multibyte = 1;
4ed46869
KH
1363 }
1364 continue;
1365
/* Reached only by `goto': rewind SRC to the start of the unfinished
   sequence and leave the loop. */
1366 label_end_of_loop:
d46c5b12
KH
1367 result = CODING_FINISH_INSUFFICIENT_SRC;
1368 label_end_of_loop_2:
4ed46869
KH
1369 src = src_base;
1370 break;
1371 }
1372
fb88bf2d 1373 if (src < src_end)
4ed46869 1374 {
fb88bf2d
KH
1375 if (result == CODING_FINISH_NORMAL)
1376 result = CODING_FINISH_INSUFFICIENT_DST;
1377 else if (result != CODING_FINISH_INCONSISTENT_EOL
1378 && coding->mode & CODING_MODE_LAST_BLOCK)
1379 {
1380 /* This is the last block of the text to be decoded. We had
1381 better just flush out all remaining codes in the text
1382 although they are not valid characters. */
1383 src_bytes = src_end - src;
1384 if (dst_bytes && (dst_end - dst < src_bytes))
1385 src_bytes = dst_end - dst;
1386 bcopy (src, dst, src_bytes);
1387 dst += src_bytes;
1388 src += src_bytes;
1389 coding->fake_multibyte = 1;
1390 }
4ed46869 1391 }
fb88bf2d 1392
d46c5b12
KH
1393 coding->consumed = coding->consumed_char = src - source;
1394 coding->produced = dst - destination;
1395 return result;
4ed46869
KH
1396}
1397
f4dee582 1398/* ISO2022 encoding stuff. */
4ed46869
KH
1399
1400/*
f4dee582 1401 It is not enough to say just "ISO2022" on encoding, we have to
d46c5b12 1402 specify more details. In Emacs, each coding system of ISO2022
4ed46869
KH
1403 variant has the following specifications:
1404 1. Initial designation to G0 thru G3.
1405 2. Allows short-form designation?
1406 3. ASCII should be designated to G0 before control characters?
1407 4. ASCII should be designated to G0 at end of line?
1408 5. 7-bit environment or 8-bit environment?
1409 6. Use locking-shift?
1410 7. Use Single-shift?
1411 And the following two are only for Japanese:
1412 8. Use ASCII in place of JIS0201-1976-Roman?
1413 9. Use JISX0208-1983 in place of JISX0208-1978?
1414 These specifications are encoded in `coding->flags' as flag bits
1415 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
f4dee582 1416 details.
4ed46869
KH
1417*/
1418
/* Produce codes (escape sequence) for designating CHARSET to graphic
   register REG, and record the designation in CODING.  If
   <final-char> of CHARSET is '@', 'A', or 'B' and the coding system
   CODING allows, produce designation sequence of short-form.  A
   revision number below 255 is announced first with "ESC & <rev>".
   The codes are stored through `dst' of the caller.  */

#define ENCODE_DESIGNATION(charset, reg, coding)			\
  do {									\
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);	\
    /* Intermediate characters, indexed by REG, for 94-charsets	\
       and for 96-charsets respectively.  */				\
    char *intermediate_char_94 = "()*+";				\
    char *intermediate_char_96 = ",-./";				\
    int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);	\
									\
    if (revision < 255)							\
      {									\
	*dst++ = ISO_CODE_ESC;						\
	*dst++ = '&';							\
	*dst++ = '@' + revision;					\
      }									\
    *dst++ = ISO_CODE_ESC;						\
    if (CHARSET_DIMENSION (charset) != 1)				\
      {									\
	*dst++ = '$';							\
	if (CHARSET_CHARS (charset) != 94)				\
	  {								\
	    *dst++ = (unsigned char) (intermediate_char_96[reg]);	\
	  }								\
	else if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)	\
		 || reg != 0						\
		 || final_char < '@' || final_char > 'B')		\
	  {								\
	    /* Short form (no intermediate) is not applicable.  */	\
	    *dst++ = (unsigned char) (intermediate_char_94[reg]);	\
	  }								\
      }									\
    else if (CHARSET_CHARS (charset) == 94)				\
      {									\
	*dst++ = (unsigned char) (intermediate_char_94[reg]);		\
      }									\
    else								\
      {									\
	*dst++ = (unsigned char) (intermediate_char_96[reg]);		\
      }									\
    *dst++ = final_char;						\
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;		\
  } while (0)
1460
/* The following two macros produce codes (control character or escape
   sequence) for ISO2022 single-shift functions (single-shift-2 and
   single-shift-3).  With CODING_FLAG_ISO_SEVEN_BITS the two-byte
   escape sequence is used; otherwise the single control code is
   stored and `coding->fake_multibyte' is set.  Both mark CODING as
   being in single-shifting state.  */

#define ENCODE_SINGLE_SHIFT_2				\
  do {							\
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))	\
      {							\
	*dst++ = ISO_CODE_SS2;				\
	coding->fake_multibyte = 1;			\
      }							\
    else						\
      {							\
	*dst++ = ISO_CODE_ESC;				\
	*dst++ = 'N';					\
      }							\
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;	\
  } while (0)

#define ENCODE_SINGLE_SHIFT_3				\
  do {							\
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))	\
      {							\
	*dst++ = ISO_CODE_SS3;				\
	coding->fake_multibyte = 1;			\
      }							\
    else						\
      {							\
	*dst++ = ISO_CODE_ESC;				\
	*dst++ = 'O';					\
      }							\
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;	\
  } while (0)
1488
/* The following four macros produce codes (control character or
   escape sequence) for ISO2022 locking-shift functions (shift-in,
   shift-out, locking-shift-2, and locking-shift-3).  Each one also
   records in CODING which graphic register (0, 1, 2 or 3) is now
   invoked to graphic plane 0.  */

#define ENCODE_SHIFT_IN					\
  do {							\
    *dst++ = ISO_CODE_SI;				\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;		\
  } while (0)

#define ENCODE_SHIFT_OUT				\
  do {							\
    *dst++ = ISO_CODE_SO;				\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;		\
  } while (0)

#define ENCODE_LOCKING_SHIFT_2				\
  do {							\
    *dst++ = ISO_CODE_ESC;				\
    *dst++ = 'n';					\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;		\
  } while (0)

#define ENCODE_LOCKING_SHIFT_3				\
  do {							\
    *dst++ = ISO_CODE_ESC;				\
    *dst++ = 'o';					\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;		\
  } while (0)
1516
f4dee582
RS
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   The body is a loop: when CHARSET is not yet invoked to graphic
   plane 0 or 1, encode_invocation_designation is called and the
   tests are repeated.  Uses `coding' and `dst' of the caller.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)			\
  do {									\
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))			\
      {									\
	/* A single-shift code has just been produced.  */		\
	if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))		\
	  {								\
	    *dst++ = c1 | 0x80;						\
	  }								\
	else								\
	  {								\
	    *dst++ = c1 & 0x7F;						\
	  }								\
	CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;			\
	break;								\
      }									\
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))		\
      {									\
	*dst++ = c1 & 0x7F;						\
	break;								\
      }									\
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))		\
      {									\
	*dst++ = c1 | 0x80;						\
	break;								\
      }									\
    if (coding->flags & CODING_FLAG_ISO_SAFE				\
	&& !coding->safe_charsets[charset])				\
      {									\
	/* We should not encode this character, instead produce one	\
	   or two substitution characters, according to the width	\
	   of CHARSET.  */						\
	*dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;			\
	if (CHARSET_WIDTH (charset) == 2)				\
	  {								\
	    *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;		\
	  }								\
	break;								\
      }									\
    /* Since CHARSET is not yet invoked to any graphic planes, we	\
       must invoke it, or, at first, designate it to some graphic	\
       register.  Then repeat the loop to actually produce the		\
       character.  */							\
    dst = encode_invocation_designation (charset, coding, dst);	\
  } while (1)
1560
f4dee582
RS
/* Produce codes for a DIMENSION2 character whose character set is
   CHARSET and whose position-codes are C1 and C2.  Designation and
   invocation codes are also produced in advance if necessary.

   The body is a loop: when CHARSET is not yet invoked to graphic
   plane 0 or 1, encode_invocation_designation is called and the
   tests are repeated.  Uses `coding' and `dst' of the caller.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)		\
  do {									\
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))			\
      {									\
	/* A single-shift code has just been produced.  */		\
	if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))		\
	  {								\
	    *dst++ = c1 | 0x80;						\
	    *dst++ = c2 | 0x80;						\
	  }								\
	else								\
	  {								\
	    *dst++ = c1 & 0x7F;						\
	    *dst++ = c2 & 0x7F;						\
	  }								\
	CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;			\
	break;								\
      }									\
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))		\
      {									\
	*dst++ = c1 & 0x7F;						\
	*dst++ = c2 & 0x7F;						\
	break;								\
      }									\
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))		\
      {									\
	*dst++ = c1 | 0x80;						\
	*dst++ = c2 | 0x80;						\
	break;								\
      }									\
    if (coding->flags & CODING_FLAG_ISO_SAFE				\
	&& !coding->safe_charsets[charset])				\
      {									\
	/* We should not encode this character, instead produce one	\
	   or two substitution characters, according to the width	\
	   of CHARSET.  */						\
	*dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;			\
	if (CHARSET_WIDTH (charset) == 2)				\
	  {								\
	    *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;		\
	  }								\
	break;								\
      }									\
    /* Since CHARSET is not yet invoked to any graphic planes, we	\
       must invoke it, or, at first, designate it to some graphic	\
       register.  Then repeat the loop to actually produce the		\
       character.  */							\
    dst = encode_invocation_designation (charset, coding, dst);	\
  } while (1)
1603
84fbb8a0
KH
/* Produce codes for the character of CHARSET whose position-codes are
   C1 and C2, after applying `translation_table' of the caller when it
   is non-nil.  ASCII is replaced by charset_latin_jisx0201 under
   CODING_FLAG_ISO_USE_ROMAN, and charset_jisx0208 by
   charset_jisx0208_1978 under CODING_FLAG_ISO_USE_OLDJIS.
   `coding->consumed_char' is incremented unless a composition is in
   progress.  */

#define ENCODE_ISO_CHARACTER(charset, c1, c2)				\
  do {									\
    int c_alt, charset_alt;						\
									\
    if (!NILP (translation_table)					\
	&& ((c_alt = translate_char (translation_table, -1,		\
				     charset, c1, c2))			\
	    >= 0))							\
      {									\
	SPLIT_CHAR (c_alt, charset_alt, c1, c2);			\
      }									\
    else								\
      {									\
	charset_alt = charset;						\
      }									\
    if (CHARSET_DIMENSION (charset_alt) == 1)				\
      {									\
	if (charset == CHARSET_ASCII					\
	    && (coding->flags & CODING_FLAG_ISO_USE_ROMAN))		\
	  {								\
	    charset_alt = charset_latin_jisx0201;			\
	  }								\
	ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1);		\
      }									\
    else								\
      {									\
	if (charset == charset_jisx0208					\
	    && (coding->flags & CODING_FLAG_ISO_USE_OLDJIS))		\
	  {								\
	    charset_alt = charset_jisx0208_1978;			\
	  }								\
	ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2);		\
      }									\
    if (! COMPOSING_P (coding->composing))				\
      {									\
	coding->consumed_char++;					\
      }									\
  } while (0)
bdd9fb48 1631
4ed46869
KH
1632/* Produce designation and invocation codes at a place pointed by DST
1633 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1634 Return new DST. */
1635
1636unsigned char *
1637encode_invocation_designation (charset, coding, dst)
1638 int charset;
1639 struct coding_system *coding;
1640 unsigned char *dst;
1641{
1642 int reg; /* graphic register number */
1643
1644 /* At first, check designations. */
1645 for (reg = 0; reg < 4; reg++)
1646 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1647 break;
1648
1649 if (reg >= 4)
1650 {
1651 /* CHARSET is not yet designated to any graphic registers. */
1652 /* At first check the requested designation. */
1653 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1ba9e4ab
KH
1654 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1655 /* Since CHARSET requests no special designation, designate it
1656 to graphic register 0. */
4ed46869
KH
1657 reg = 0;
1658
1659 ENCODE_DESIGNATION (charset, reg, coding);
1660 }
1661
1662 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1663 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1664 {
1665 /* Since the graphic register REG is not invoked to any graphic
1666 planes, invoke it to graphic plane 0. */
1667 switch (reg)
1668 {
1669 case 0: /* graphic register 0 */
1670 ENCODE_SHIFT_IN;
1671 break;
1672
1673 case 1: /* graphic register 1 */
1674 ENCODE_SHIFT_OUT;
1675 break;
1676
1677 case 2: /* graphic register 2 */
1678 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1679 ENCODE_SINGLE_SHIFT_2;
1680 else
1681 ENCODE_LOCKING_SHIFT_2;
1682 break;
1683
1684 case 3: /* graphic register 3 */
1685 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1686 ENCODE_SINGLE_SHIFT_3;
1687 else
1688 ENCODE_LOCKING_SHIFT_3;
1689 break;
1690 }
1691 }
1692 return dst;
1693}
1694
/* The following three macros produce codes for indicating
   composition: start of a composition without rules (ESC '0'), start
   of a composition with rules (ESC '2'), and end of a composition
   (ESC '1').  Each stores two bytes through `dst' of the caller.

   Each expansion is one parenthesized expression, so the macro
   behaves as a unit wherever an expression is allowed (its value is
   the last byte stored).  */
#define ENCODE_COMPOSITION_NO_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
#define ENCODE_COMPOSITION_WITH_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '2')
#define ENCODE_COMPOSITION_END \
  (*dst++ = ISO_CODE_ESC, *dst++ = '1')
1699
/* The following three macros produce codes for indicating direction
   of text.  They store through `dst' and read `coding->flags' of the
   caller.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER produces ESC '[' when
   CODING_FLAG_ISO_SEVEN_BITS is set in the flags (a bit test: other
   flags may be set as well), else the single code ISO_CODE_CSI.
   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R follow the introducer
   with "2]" and "0]" respectively.

   All three are `do ... while (0)' statements, to be used as
   `MACRO;'.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER		\
  do {							\
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)	\
      {							\
	*dst++ = ISO_CODE_ESC;				\
	*dst++ = '[';					\
      }							\
    else						\
      *dst++ = ISO_CODE_CSI;				\
  } while (0)

#define ENCODE_DIRECTION_R2L			\
  do {						\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;		\
    *dst++ = '2';				\
    *dst++ = ']';				\
  } while (0)

#define ENCODE_DIRECTION_L2R			\
  do {						\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;		\
    *dst++ = '0';				\
    *dst++ = ']';				\
  } while (0)
1715
/* Produce codes for designation and invocation to reset the graphic
   planes and registers to initial state: shift-in if graphic plane 0
   does not have register 0 invoked, then re-designate every register
   whose initial designation is set (non-negative) and differs from
   the current one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER					\
  do {									\
    int reg;								\
									\
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)			\
      {									\
	ENCODE_SHIFT_IN;						\
      }									\
    for (reg = 0; reg < 4; reg++)					\
      {									\
	if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) < 0)	\
	  continue;							\
	if (CODING_SPEC_ISO_DESIGNATION (coding, reg)			\
	    == CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))	\
	  continue;							\
	ENCODE_DESIGNATION						\
	  (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
      }									\
  } while (0)
1730
bdd9fb48 1731/* Produce designation sequences of charsets in the line started from
d46c5b12 1732 SRC to a place pointed by *DSTP, and update DSTP.
bdd9fb48
KH
1733
1734 If the current block ends before any end-of-line, we may fail to
d46c5b12
KH
1735 find all the necessary designations. */
1736
dfcf069d 1737void
bdd9fb48 1738encode_designation_at_bol (coding, table, src, src_end, dstp)
e0e989f6 1739 struct coding_system *coding;
bdd9fb48 1740 Lisp_Object table;
e0e989f6
KH
1741 unsigned char *src, *src_end, **dstp;
1742{
bdd9fb48
KH
1743 int charset, c, found = 0, reg;
1744 /* Table of charsets to be designated to each graphic register. */
1745 int r[4];
1746 unsigned char *dst = *dstp;
1747
1748 for (reg = 0; reg < 4; reg++)
1749 r[reg] = -1;
1750
1751 while (src < src_end && *src != '\n' && found < 4)
e0e989f6 1752 {
bdd9fb48
KH
1753 int bytes = BYTES_BY_CHAR_HEAD (*src);
1754
1755 if (NILP (table))
1756 charset = CHARSET_AT (src);
1757 else
e0e989f6 1758 {
35cb8686
RS
1759 int c_alt;
1760 unsigned char c1, c2;
bdd9fb48
KH
1761
1762 SPLIT_STRING(src, bytes, charset, c1, c2);
84fbb8a0 1763 if ((c_alt = translate_char (table, -1, charset, c1, c2)) >= 0)
bdd9fb48 1764 charset = CHAR_CHARSET (c_alt);
e0e989f6 1765 }
bdd9fb48 1766
e0e989f6 1767 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
d46c5b12 1768 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
bdd9fb48
KH
1769 {
1770 found++;
1771 r[reg] = charset;
1772 }
1773
1774 src += bytes;
1775 }
1776
1777 if (found)
1778 {
1779 for (reg = 0; reg < 4; reg++)
1780 if (r[reg] >= 0
1781 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1782 ENCODE_DESIGNATION (r[reg], reg, coding);
1783 *dstp = dst;
e0e989f6 1784 }
e0e989f6
KH
1785}
1786
4ed46869
KH
1787/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1788
1789int
d46c5b12 1790encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
1791 struct coding_system *coding;
1792 unsigned char *source, *destination;
1793 int src_bytes, dst_bytes;
4ed46869
KH
1794{
1795 unsigned char *src = source;
1796 unsigned char *src_end = source + src_bytes;
1797 unsigned char *dst = destination;
1798 unsigned char *dst_end = destination + dst_bytes;
e0e989f6 1799 /* Since the maximum bytes produced by each loop is 20, we subtract 19
4ed46869
KH
1800 from DST_END to assure overflow checking is necessary only at the
1801 head of loop. */
e0e989f6 1802 unsigned char *adjusted_dst_end = dst_end - 19;
84fbb8a0 1803 Lisp_Object translation_table
f967223b 1804 = coding->translation_table_for_encode;
d46c5b12 1805 int result = CODING_FINISH_NORMAL;
bdd9fb48 1806
84fbb8a0 1807 if (!NILP (Venable_character_translation) && NILP (translation_table))
f967223b 1808 translation_table = Vstandard_translation_table_for_encode;
4ed46869 1809
d46c5b12 1810 coding->consumed_char = 0;
fb88bf2d 1811 coding->fake_multibyte = 0;
d46c5b12
KH
1812 while (src < src_end && (dst_bytes
1813 ? (dst < adjusted_dst_end)
1814 : (dst < src - 19)))
4ed46869
KH
1815 {
1816 /* SRC_BASE remembers the start position in source in each loop.
1817 The loop will be exited when there's not enough source text
1818 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1819 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1820 reset to SRC_BASE before exiting. */
1821 unsigned char *src_base = src;
bdd9fb48 1822 int charset, c1, c2, c3, c4;
4ed46869 1823
e0e989f6
KH
1824 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1825 && CODING_SPEC_ISO_BOL (coding))
1826 {
bdd9fb48 1827 /* We have to produce designation sequences if any now. */
84fbb8a0 1828 encode_designation_at_bol (coding, translation_table,
bdd9fb48 1829 src, src_end, &dst);
e0e989f6
KH
1830 CODING_SPEC_ISO_BOL (coding) = 0;
1831 }
1832
1833 c1 = *src++;
4ed46869 1834 /* If we are seeing a component of a composite character, we are
d46c5b12
KH
1835 seeing a leading-code encoded irregularly for composition, or
1836 a composition rule if composing with rule. We must set C1 to
1837 a normal leading-code or an ASCII code. If we are not seeing
1838 a composite character, we must reset composition,
1839 designation, and invocation states. */
4ed46869
KH
1840 if (COMPOSING_P (coding->composing))
1841 {
1842 if (c1 < 0xA0)
1843 {
1844 /* We are not in a composite character any longer. */
1845 coding->composing = COMPOSING_NO;
d46c5b12 1846 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1847 ENCODE_COMPOSITION_END;
1848 }
1849 else
1850 {
1851 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1852 {
1853 *dst++ = c1 & 0x7F;
1854 coding->composing = COMPOSING_WITH_RULE_HEAD;
1855 continue;
1856 }
1857 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1858 coding->composing = COMPOSING_WITH_RULE_RULE;
1859 if (c1 == 0xA0)
1860 {
1861 /* This is an ASCII component. */
1862 ONE_MORE_BYTE (c1);
1863 c1 &= 0x7F;
1864 }
1865 else
1866 /* This is a leading-code of non ASCII component. */
1867 c1 -= 0x20;
1868 }
1869 }
1870
1871 /* Now encode one character. C1 is a control character, an
1872 ASCII character, or a leading-code of multi-byte character. */
1873 switch (emacs_code_class[c1])
1874 {
1875 case EMACS_ascii_code:
bdd9fb48 1876 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
4ed46869
KH
1877 break;
1878
1879 case EMACS_control_code:
1880 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1881 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869 1882 *dst++ = c1;
d46c5b12 1883 coding->consumed_char++;
4ed46869
KH
1884 break;
1885
1886 case EMACS_carriage_return_code:
d46c5b12 1887 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
4ed46869
KH
1888 {
1889 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1890 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869 1891 *dst++ = c1;
d46c5b12 1892 coding->consumed_char++;
4ed46869
KH
1893 break;
1894 }
1895 /* fall down to treat '\r' as '\n' ... */
1896
1897 case EMACS_linefeed_code:
1898 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
e0e989f6
KH
1899 ENCODE_RESET_PLANE_AND_REGISTER;
1900 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1901 bcopy (coding->spec.iso2022.initial_designation,
1902 coding->spec.iso2022.current_designation,
1903 sizeof coding->spec.iso2022.initial_designation);
4ed46869 1904 if (coding->eol_type == CODING_EOL_LF
0ef69138 1905 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
1906 *dst++ = ISO_CODE_LF;
1907 else if (coding->eol_type == CODING_EOL_CRLF)
1908 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1909 else
1910 *dst++ = ISO_CODE_CR;
e0e989f6 1911 CODING_SPEC_ISO_BOL (coding) = 1;
d46c5b12 1912 coding->consumed_char++;
4ed46869
KH
1913 break;
1914
1915 case EMACS_leading_code_2:
1916 ONE_MORE_BYTE (c2);
19a8d9e0
KH
1917 if (c2 < 0xA0)
1918 {
1919 /* invalid sequence */
1920 *dst++ = c1;
38cf95df
RS
1921 src--;
1922 coding->consumed_char++;
19a8d9e0
KH
1923 }
1924 else
1925 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
1926 break;
1927
1928 case EMACS_leading_code_3:
1929 TWO_MORE_BYTES (c2, c3);
19a8d9e0
KH
1930 if (c2 < 0xA0 || c3 < 0xA0)
1931 {
1932 /* invalid sequence */
1933 *dst++ = c1;
38cf95df
RS
1934 src -= 2;
1935 coding->consumed_char++;
19a8d9e0
KH
1936 }
1937 else if (c1 < LEADING_CODE_PRIVATE_11)
bdd9fb48 1938 ENCODE_ISO_CHARACTER (c1, c2, c3);
4ed46869 1939 else
bdd9fb48 1940 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
4ed46869
KH
1941 break;
1942
1943 case EMACS_leading_code_4:
1944 THREE_MORE_BYTES (c2, c3, c4);
19a8d9e0
KH
1945 if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1946 {
1947 /* invalid sequence */
1948 *dst++ = c1;
38cf95df
RS
1949 src -= 3;
1950 coding->consumed_char++;
19a8d9e0
KH
1951 }
1952 else
1953 ENCODE_ISO_CHARACTER (c2, c3, c4);
4ed46869
KH
1954 break;
1955
1956 case EMACS_leading_code_composition:
19a8d9e0
KH
1957 ONE_MORE_BYTE (c2);
1958 if (c2 < 0xA0)
1959 {
1960 /* invalid sequence */
1961 *dst++ = c1;
38cf95df
RS
1962 src--;
1963 coding->consumed_char++;
19a8d9e0
KH
1964 }
1965 else if (c2 == 0xFF)
4ed46869 1966 {
d46c5b12 1967 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1968 coding->composing = COMPOSING_WITH_RULE_HEAD;
1969 ENCODE_COMPOSITION_WITH_RULE_START;
d46c5b12 1970 coding->consumed_char++;
4ed46869
KH
1971 }
1972 else
1973 {
d46c5b12 1974 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1975 /* Rewind one byte because it is a character code of
1976 composition elements. */
1977 src--;
1978 coding->composing = COMPOSING_NO_RULE_HEAD;
1979 ENCODE_COMPOSITION_NO_RULE_START;
d46c5b12 1980 coding->consumed_char++;
4ed46869
KH
1981 }
1982 break;
1983
1984 case EMACS_invalid_code:
1985 *dst++ = c1;
d46c5b12 1986 coding->consumed_char++;
4ed46869
KH
1987 break;
1988 }
1989 continue;
1990 label_end_of_loop:
d46c5b12
KH
1991 result = CODING_FINISH_INSUFFICIENT_SRC;
1992 src = src_base;
4ed46869
KH
1993 break;
1994 }
1995
49cb52b4
KH
1996 if (src < src_end && result == CODING_FINISH_NORMAL)
1997 result = CODING_FINISH_INSUFFICIENT_DST;
1998
1999 /* If this is the last block of the text to be encoded, we must
2000 reset graphic planes and registers to the initial state, and
2001 flush out the carryover if any. */
2002 if (coding->mode & CODING_MODE_LAST_BLOCK)
84fbb8a0
KH
2003 {
2004 ENCODE_RESET_PLANE_AND_REGISTER;
2005 if (COMPOSING_P (coding->composing))
2006 ENCODE_COMPOSITION_END;
88993dfd
KH
2007 if (result == CODING_FINISH_INSUFFICIENT_SRC)
2008 {
2009 while (src < src_end && dst < dst_end)
2010 *dst++ = *src++;
2011 }
84fbb8a0 2012 }
d46c5b12
KH
2013 coding->consumed = src - source;
2014 coding->produced = coding->produced_char = dst - destination;
2015 return result;
4ed46869
KH
2016}
2017
2018\f
2019/*** 4. SJIS and BIG5 handlers ***/
2020
f4dee582 2021/* Although SJIS and BIG5 are not ISO's coding system, they are used
4ed46869
KH
2022 quite widely. So, for the moment, Emacs supports them in the bare
2023 C code. But, in the future, they may be supported only by CCL. */
2024
2025/* SJIS is a coding system encoding three character sets: ASCII, right
2026 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2027 as is. A character of charset katakana-jisx0201 is encoded by
2028 "position-code + 0x80". A character of charset japanese-jisx0208
2029 is encoded in 2-byte but two position-codes are divided and shifted
2030 so that they fit in the range below.
2031
2032 --- CODE RANGE of SJIS ---
2033 (character set) (range)
2034 ASCII 0x00 .. 0x7F
2035 KATAKANA-JISX0201 0xA0 .. 0xDF
54f78171 2036 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
d14d03ac 2037 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4ed46869
KH
2038 -------------------------------
2039
2040*/
2041
2042/* BIG5 is a coding system encoding two character sets: ASCII and
2043 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2044 character set and is encoded in two-byte.
2045
2046 --- CODE RANGE of BIG5 ---
2047 (character set) (range)
2048 ASCII 0x00 .. 0x7F
2049 Big5 (1st byte) 0xA1 .. 0xFE
2050 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2051 --------------------------
2052
2053 Since the number of characters in Big5 is larger than maximum
2054 characters in Emacs' charset (96x96), it can't be handled as one
2055 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2056 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2057 contains frequently used characters and the latter contains less
2058 frequently used characters. */
2059
2060/* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
2061 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
2062 C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
2063 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
2064
/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Convert the BIG5 byte pair B1, B2 into CHARSET (set to
   `charset_big5_1' when B1 is below 0xC9, else `charset_big5_2') and
   the position-codes C1 and C2.

   Every argument is parenthesized in the expansion so that callers
   may pass arbitrary expressions.  B1 and B2 are evaluated more than
   once, so they must not have side effects.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

/* Inverse of DECODE_BIG5: convert CHARSET and the position-codes C1
   and C2 into the BIG5 byte pair B1, B2.

   Every argument is parenthesized in the expansion; without that, an
   argument such as `x & 0x7F' would be combined with the following
   `- 0x21' before the `&' is applied.  B2 is used as a temporary and
   must be a modifiable lvalue.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2092
a5d301df
KH
/* Produce one decoded character of CHARSET with position-codes C1
   and C2.  If the local TRANSLATION_TABLE is non-nil and
   translate_char yields a non-negative character, that character's
   charset and codes (obtained with SPLIT_CHAR, which stores into C1
   and C2) are used instead.  The character is then handed to
   DECODE_CHARACTER_ASCII, DECODE_CHARACTER_DIMENSION1 or
   DECODE_CHARACTER_DIMENSION2 according to the resulting charset.  */
#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
  do {                                                                  \
    int charset_alt = (charset);                                        \
    int c_alt;                                                          \
                                                                        \
    if (!NILP (translation_table))                                      \
      {                                                                 \
        c_alt = translate_char (translation_table,                      \
                                -1, (charset), c1, c2);                 \
        if (c_alt >= 0)                                                 \
          {                                                             \
            SPLIT_CHAR (c_alt, charset_alt, c1, c2);                    \
          }                                                             \
      }                                                                 \
    if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
      {                                                                 \
        DECODE_CHARACTER_ASCII (c1);                                    \
      }                                                                 \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
      {                                                                 \
        DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                  \
      }                                                                 \
    else                                                                \
      {                                                                 \
        DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);              \
      }                                                                 \
  } while (0)
2107
84fbb8a0
KH
/* Encode one character of CHARSET with position-codes C1 and C2 at
   the local output pointer DST, and count it in
   CODING->consumed_char.

   If the local TRANSLATION_TABLE is non-nil and translate_char yields
   a non-negative character, that character's charset and codes are
   used instead (SPLIT_CHAR stores them into C1 and C2).  Then:
     - an ASCII character is written as the single byte C1;
     - a dimension-1 character is written as C1 alone when encoding
       SJIS and the charset is katakana-jisx0201, otherwise as the
       charset followed by C1;
     - a dimension-2 character is written through ENCODE_SJIS
       (jisx0208 when SJIS_P) or ENCODE_BIG5 (big5-1/big5-2 when not
       SJIS_P), otherwise as the charset followed by C1 and C2.
   At most three bytes are written.  C1 and C2 must be modifiable
   lvalues; the dimension-2 branch masks them with 0x7F.

   The expansion deliberately has no trailing semicolon, so that the
   macro followed by `;' is exactly one statement and may be used as
   the body of an `if' that has an `else'.  */
#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
  do {                                                                  \
    int c_alt, charset_alt;                                             \
    if (!NILP (translation_table)                                       \
        && ((c_alt = translate_char (translation_table, -1,             \
                                     charset, c1, c2))                  \
            >= 0))                                                      \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                          \
    else                                                                \
      charset_alt = charset;                                            \
    if (charset_alt == charset_ascii)                                   \
      *dst++ = c1;                                                      \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
      {                                                                 \
        if (sjis_p && charset_alt == charset_katakana_jisx0201)         \
          *dst++ = c1;                                                  \
        else                                                            \
          {                                                             \
            *dst++ = charset_alt, *dst++ = c1;                          \
            coding->fake_multibyte = 1;                                 \
          }                                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        c1 &= 0x7F, c2 &= 0x7F;                                         \
        if (sjis_p && charset_alt == charset_jisx0208)                  \
          {                                                             \
            unsigned char s1, s2;                                       \
                                                                        \
            ENCODE_SJIS (c1, c2, s1, s2);                               \
            *dst++ = s1, *dst++ = s2;                                   \
            coding->fake_multibyte = 1;                                 \
          }                                                             \
        else if (!sjis_p                                                \
                 && (charset_alt == charset_big5_1                      \
                     || charset_alt == charset_big5_2))                 \
          {                                                             \
            unsigned char b1, b2;                                       \
                                                                        \
            ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);                  \
            *dst++ = b1, *dst++ = b2;                                   \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;             \
            coding->fake_multibyte = 1;                                 \
          }                                                             \
      }                                                                 \
    coding->consumed_char++;                                            \
  } while (0)
2158
4ed46869
KH
2159/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2160 Check if a text is encoded in SJIS. If it is, return
2161 CODING_CATEGORY_MASK_SJIS, else return 0. */
2162
2163int
2164detect_coding_sjis (src, src_end)
2165 unsigned char *src, *src_end;
2166{
2167 unsigned char c;
2168
2169 while (src < src_end)
2170 {
2171 c = *src++;
4ed46869
KH
2172 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2173 {
2174 if (src < src_end && *src++ < 0x40)
2175 return 0;
2176 }
2177 }
2178 return CODING_CATEGORY_MASK_SJIS;
2179}
2180
2181/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2182 Check if a text is encoded in BIG5. If it is, return
2183 CODING_CATEGORY_MASK_BIG5, else return 0. */
2184
2185int
2186detect_coding_big5 (src, src_end)
2187 unsigned char *src, *src_end;
2188{
2189 unsigned char c;
2190
2191 while (src < src_end)
2192 {
2193 c = *src++;
4ed46869
KH
2194 if (c >= 0xA1)
2195 {
2196 if (src >= src_end)
2197 break;
2198 c = *src++;
2199 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2200 return 0;
2201 }
2202 }
2203 return CODING_CATEGORY_MASK_BIG5;
2204}
2205
2206/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2207 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
2208
2209int
2210decode_coding_sjis_big5 (coding, source, destination,
d46c5b12 2211 src_bytes, dst_bytes, sjis_p)
4ed46869
KH
2212 struct coding_system *coding;
2213 unsigned char *source, *destination;
2214 int src_bytes, dst_bytes;
4ed46869
KH
2215 int sjis_p;
2216{
2217 unsigned char *src = source;
2218 unsigned char *src_end = source + src_bytes;
2219 unsigned char *dst = destination;
2220 unsigned char *dst_end = destination + dst_bytes;
2221 /* Since the maximum bytes produced by each loop is 4, we subtract 3
2222 from DST_END to assure overflow checking is necessary only at the
2223 head of loop. */
2224 unsigned char *adjusted_dst_end = dst_end - 3;
84fbb8a0 2225 Lisp_Object translation_table
f967223b 2226 = coding->translation_table_for_decode;
d46c5b12 2227 int result = CODING_FINISH_NORMAL;
a5d301df 2228
84fbb8a0 2229 if (!NILP (Venable_character_translation) && NILP (translation_table))
f967223b 2230 translation_table = Vstandard_translation_table_for_decode;
4ed46869 2231
d46c5b12 2232 coding->produced_char = 0;
fb88bf2d 2233 coding->fake_multibyte = 0;
d46c5b12
KH
2234 while (src < src_end && (dst_bytes
2235 ? (dst < adjusted_dst_end)
2236 : (dst < src - 3)))
4ed46869
KH
2237 {
2238 /* SRC_BASE remembers the start position in source in each loop.
2239 The loop will be exited when there's not enough source text
2240 to analyze two-byte character (within macro ONE_MORE_BYTE).
2241 In that case, SRC is reset to SRC_BASE before exiting. */
2242 unsigned char *src_base = src;
2243 unsigned char c1 = *src++, c2, c3, c4;
2244
d46c5b12 2245 if (c1 < 0x20)
4ed46869 2246 {
d46c5b12 2247 if (c1 == '\r')
4ed46869 2248 {
d46c5b12
KH
2249 if (coding->eol_type == CODING_EOL_CRLF)
2250 {
2251 ONE_MORE_BYTE (c2);
2252 if (c2 == '\n')
2253 *dst++ = c2;
2254 else if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2255 {
2256 result = CODING_FINISH_INCONSISTENT_EOL;
2257 goto label_end_of_loop_2;
2258 }
2259 else
2260 /* To process C2 again, SRC is subtracted by 1. */
2261 *dst++ = c1, src--;
2262 }
2263 else if (coding->eol_type == CODING_EOL_CR)
2264 *dst++ = '\n';
4ed46869 2265 else
d46c5b12
KH
2266 *dst++ = c1;
2267 }
2268 else if (c1 == '\n'
2269 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2270 && (coding->eol_type == CODING_EOL_CR
2271 || coding->eol_type == CODING_EOL_CRLF))
2272 {
2273 result = CODING_FINISH_INCONSISTENT_EOL;
2274 goto label_end_of_loop_2;
4ed46869
KH
2275 }
2276 else
2277 *dst++ = c1;
d46c5b12 2278 coding->produced_char++;
4ed46869 2279 }
a5d301df
KH
2280 else if (c1 < 0x80)
2281 DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
54f78171 2282 else
4ed46869 2283 {
4ed46869
KH
2284 if (sjis_p)
2285 {
54f78171 2286 if (c1 < 0xA0 || (c1 >= 0xE0 && c1 < 0xF0))
fb88bf2d 2287 {
54f78171
KH
2288 /* SJIS -> JISX0208 */
2289 ONE_MORE_BYTE (c2);
d14d03ac 2290 if (c2 >= 0x40 && c2 != 0x7F && c2 <= 0xFC)
54f78171
KH
2291 {
2292 DECODE_SJIS (c1, c2, c3, c4);
2293 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
2294 }
2295 else
2296 goto label_invalid_code_2;
fb88bf2d 2297 }
54f78171
KH
2298 else if (c1 < 0xE0)
2299 /* SJIS -> JISX0201-Kana */
2300 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1,
2301 /* dummy */ c2);
fb88bf2d 2302 else
54f78171 2303 goto label_invalid_code_1;
4ed46869 2304 }
fb88bf2d 2305 else
fb88bf2d 2306 {
54f78171
KH
2307 /* BIG5 -> Big5 */
2308 if (c1 >= 0xA1 && c1 <= 0xFE)
fb88bf2d 2309 {
54f78171
KH
2310 ONE_MORE_BYTE (c2);
2311 if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE))
2312 {
2313 int charset;
4ed46869 2314
54f78171
KH
2315 DECODE_BIG5 (c1, c2, charset, c3, c4);
2316 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
2317 }
2318 else
2319 goto label_invalid_code_2;
fb88bf2d
KH
2320 }
2321 else
54f78171 2322 goto label_invalid_code_1;
4ed46869
KH
2323 }
2324 }
2325 continue;
2326
fb88bf2d
KH
2327 label_invalid_code_1:
2328 *dst++ = c1;
2329 coding->produced_char++;
2330 coding->fake_multibyte = 1;
2331 continue;
2332
2333 label_invalid_code_2:
2334 *dst++ = c1; *dst++= c2;
2335 coding->produced_char += 2;
2336 coding->fake_multibyte = 1;
2337 continue;
2338
4ed46869 2339 label_end_of_loop:
d46c5b12
KH
2340 result = CODING_FINISH_INSUFFICIENT_SRC;
2341 label_end_of_loop_2:
4ed46869
KH
2342 src = src_base;
2343 break;
2344 }
2345
fb88bf2d
KH
2346 if (src < src_end)
2347 {
2348 if (result == CODING_FINISH_NORMAL)
2349 result = CODING_FINISH_INSUFFICIENT_DST;
2350 else if (result != CODING_FINISH_INCONSISTENT_EOL
2351 && coding->mode & CODING_MODE_LAST_BLOCK)
2352 {
2353 src_bytes = src_end - src;
2354 if (dst_bytes && (dst_end - dst < src_bytes))
2355 src_bytes = dst_end - dst;
2356 bcopy (dst, src, src_bytes);
2357 src += src_bytes;
2358 dst += src_bytes;
2359 coding->fake_multibyte = 1;
2360 }
2361 }
d46c5b12
KH
2362
2363 coding->consumed = coding->consumed_char = src - source;
2364 coding->produced = dst - destination;
2365 return result;
4ed46869
KH
2366}
2367
2368/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2369 This function can encode `charset_ascii', `charset_katakana_jisx0201',
2370 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
2371 sure that all these charsets are registered as official charset
2372 (i.e. do not have extended leading-codes). Characters of other
2373 charsets are produced without any encoding. If SJIS_P is 1, encode
2374 SJIS text, else encode BIG5 text. */
2375
2376int
2377encode_coding_sjis_big5 (coding, source, destination,
d46c5b12 2378 src_bytes, dst_bytes, sjis_p)
4ed46869
KH
2379 struct coding_system *coding;
2380 unsigned char *source, *destination;
2381 int src_bytes, dst_bytes;
4ed46869
KH
2382 int sjis_p;
2383{
2384 unsigned char *src = source;
2385 unsigned char *src_end = source + src_bytes;
2386 unsigned char *dst = destination;
2387 unsigned char *dst_end = destination + dst_bytes;
2388 /* Since the maximum bytes produced by each loop is 2, we subtract 1
2389 from DST_END to assure overflow checking is necessary only at the
2390 head of loop. */
2391 unsigned char *adjusted_dst_end = dst_end - 1;
84fbb8a0 2392 Lisp_Object translation_table
f967223b 2393 = coding->translation_table_for_encode;
d46c5b12 2394 int result = CODING_FINISH_NORMAL;
a5d301df 2395
84fbb8a0 2396 if (!NILP (Venable_character_translation) && NILP (translation_table))
f967223b 2397 translation_table = Vstandard_translation_table_for_encode;
4ed46869 2398
d46c5b12 2399 coding->consumed_char = 0;
fb88bf2d 2400 coding->fake_multibyte = 0;
d46c5b12
KH
2401 while (src < src_end && (dst_bytes
2402 ? (dst < adjusted_dst_end)
2403 : (dst < src - 1)))
4ed46869
KH
2404 {
2405 /* SRC_BASE remembers the start position in source in each loop.
2406 The loop will be exited when there's not enough source text
2407 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2408 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
2409 before exiting. */
2410 unsigned char *src_base = src;
2411 unsigned char c1 = *src++, c2, c3, c4;
2412
2413 if (coding->composing)
2414 {
2415 if (c1 == 0xA0)
2416 {
2417 ONE_MORE_BYTE (c1);
2418 c1 &= 0x7F;
2419 }
2420 else if (c1 >= 0xA0)
2421 c1 -= 0x20;
2422 else
2423 coding->composing = 0;
2424 }
2425
2426 switch (emacs_code_class[c1])
2427 {
2428 case EMACS_ascii_code:
a5d301df
KH
2429 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2430 break;
2431
4ed46869
KH
2432 case EMACS_control_code:
2433 *dst++ = c1;
d46c5b12 2434 coding->consumed_char++;
4ed46869
KH
2435 break;
2436
2437 case EMACS_carriage_return_code:
d46c5b12 2438 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
4ed46869
KH
2439 {
2440 *dst++ = c1;
d46c5b12 2441 coding->consumed_char++;
4ed46869
KH
2442 break;
2443 }
2444 /* fall down to treat '\r' as '\n' ... */
2445
2446 case EMACS_linefeed_code:
2447 if (coding->eol_type == CODING_EOL_LF
0ef69138 2448 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2449 *dst++ = '\n';
2450 else if (coding->eol_type == CODING_EOL_CRLF)
2451 *dst++ = '\r', *dst++ = '\n';
2452 else
2453 *dst++ = '\r';
d46c5b12 2454 coding->consumed_char++;
4ed46869
KH
2455 break;
2456
2457 case EMACS_leading_code_2:
2458 ONE_MORE_BYTE (c2);
a5d301df 2459 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
2460 break;
2461
2462 case EMACS_leading_code_3:
2463 TWO_MORE_BYTES (c2, c3);
a5d301df 2464 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
4ed46869
KH
2465 break;
2466
2467 case EMACS_leading_code_4:
2468 THREE_MORE_BYTES (c2, c3, c4);
a5d301df 2469 ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
4ed46869
KH
2470 break;
2471
2472 case EMACS_leading_code_composition:
2473 coding->composing = 1;
2474 break;
2475
2476 default: /* i.e. case EMACS_invalid_code: */
2477 *dst++ = c1;
d46c5b12 2478 coding->consumed_char++;
4ed46869
KH
2479 }
2480 continue;
2481
2482 label_end_of_loop:
d46c5b12
KH
2483 result = CODING_FINISH_INSUFFICIENT_SRC;
2484 src = src_base;
4ed46869
KH
2485 break;
2486 }
2487
d46c5b12
KH
2488 if (result == CODING_FINISH_NORMAL
2489 && src < src_end)
2490 result = CODING_FINISH_INSUFFICIENT_DST;
2491 coding->consumed = src - source;
2492 coding->produced = coding->produced_char = dst - destination;
2493 return result;
4ed46869
KH
2494}
2495
2496\f
1397dc18
KH
2497/*** 5. CCL handlers ***/
2498
2499/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2500 Check if a text is encoded in a coding system of which
2501 encoder/decoder are written in CCL program. If it is, return
2502 CODING_CATEGORY_MASK_CCL, else return 0. */
2503
2504int
2505detect_coding_ccl (src, src_end)
2506 unsigned char *src, *src_end;
2507{
2508 unsigned char *valid;
2509
2510 /* No coding system is assigned to coding-category-ccl. */
2511 if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
2512 return 0;
2513
2514 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
2515 while (src < src_end)
2516 {
2517 if (! valid[*src]) return 0;
2518 src++;
2519 }
2520 return CODING_CATEGORY_MASK_CCL;
2521}
2522
2523\f
2524/*** 6. End-of-line handlers ***/
4ed46869
KH
2525
2526/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2527 This function is called only when `coding->eol_type' is
2528 CODING_EOL_CRLF or CODING_EOL_CR. */
2529
dfcf069d 2530int
d46c5b12 2531decode_eol (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
2532 struct coding_system *coding;
2533 unsigned char *source, *destination;
2534 int src_bytes, dst_bytes;
4ed46869
KH
2535{
2536 unsigned char *src = source;
2537 unsigned char *src_end = source + src_bytes;
2538 unsigned char *dst = destination;
2539 unsigned char *dst_end = destination + dst_bytes;
fb88bf2d 2540 unsigned char c;
d46c5b12
KH
2541 int result = CODING_FINISH_NORMAL;
2542
fb88bf2d
KH
2543 coding->fake_multibyte = 0;
2544
d46c5b12
KH
2545 if (src_bytes <= 0)
2546 return result;
4ed46869
KH
2547
2548 switch (coding->eol_type)
2549 {
2550 case CODING_EOL_CRLF:
2551 {
2552 /* Since the maximum bytes produced by each loop is 2, we
2553 subtract 1 from DST_END to assure overflow checking is
2554 necessary only at the head of loop. */
2555 unsigned char *adjusted_dst_end = dst_end - 1;
2556
d46c5b12
KH
2557 while (src < src_end && (dst_bytes
2558 ? (dst < adjusted_dst_end)
2559 : (dst < src - 1)))
4ed46869
KH
2560 {
2561 unsigned char *src_base = src;
fb88bf2d
KH
2562
2563 c = *src++;
4ed46869
KH
2564 if (c == '\r')
2565 {
2566 ONE_MORE_BYTE (c);
fdfcf19d
KH
2567 if (c == '\n')
2568 *dst++ = c;
2569 else
d46c5b12
KH
2570 {
2571 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2572 {
2573 result = CODING_FINISH_INCONSISTENT_EOL;
2574 goto label_end_of_loop_2;
2575 }
fdfcf19d 2576 src--;
d46c5b12 2577 *dst++ = '\r';
fb88bf2d
KH
2578 if (BASE_LEADING_CODE_P (c))
2579 coding->fake_multibyte = 1;
d46c5b12 2580 }
4ed46869 2581 }
d46c5b12
KH
2582 else if (c == '\n'
2583 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2584 {
2585 result = CODING_FINISH_INCONSISTENT_EOL;
2586 goto label_end_of_loop_2;
2587 }
4ed46869 2588 else
fb88bf2d
KH
2589 {
2590 *dst++ = c;
2591 if (BASE_LEADING_CODE_P (c))
2592 coding->fake_multibyte = 1;
2593 }
4ed46869
KH
2594 continue;
2595
2596 label_end_of_loop:
d46c5b12
KH
2597 result = CODING_FINISH_INSUFFICIENT_SRC;
2598 label_end_of_loop_2:
4ed46869
KH
2599 src = src_base;
2600 break;
2601 }
fdfcf19d
KH
2602 if (src < src_end)
2603 {
2604 if (result == CODING_FINISH_NORMAL)
2605 result = CODING_FINISH_INSUFFICIENT_DST;
2606 else if (result != CODING_FINISH_INCONSISTENT_EOL
2607 && coding->mode & CODING_MODE_LAST_BLOCK)
2608 {
2609 /* This is the last block of the text to be decoded.
2610 We flush out all remaining codes. */
2611 src_bytes = src_end - src;
2612 if (dst_bytes && (dst_end - dst < src_bytes))
2613 src_bytes = dst_end - dst;
2614 bcopy (src, dst, src_bytes);
2615 dst += src_bytes;
2616 src += src_bytes;
2617 }
2618 }
4ed46869 2619 }
d46c5b12 2620 break;
4ed46869
KH
2621
2622 case CODING_EOL_CR:
d46c5b12
KH
2623 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2624 {
fb88bf2d
KH
2625 while (src < src_end)
2626 {
2627 if ((c = *src++) == '\n')
2628 break;
2629 if (BASE_LEADING_CODE_P (c))
2630 coding->fake_multibyte = 1;
2631 }
d46c5b12
KH
2632 if (*--src == '\n')
2633 {
2634 src_bytes = src - source;
2635 result = CODING_FINISH_INCONSISTENT_EOL;
2636 }
2637 }
2638 if (dst_bytes && src_bytes > dst_bytes)
2639 {
2640 result = CODING_FINISH_INSUFFICIENT_DST;
2641 src_bytes = dst_bytes;
2642 }
2643 if (dst_bytes)
2644 bcopy (source, destination, src_bytes);
2645 else
2646 safe_bcopy (source, destination, src_bytes);
2647 src = source + src_bytes;
2648 while (src_bytes--) if (*dst++ == '\r') dst[-1] = '\n';
4ed46869
KH
2649 break;
2650
2651 default: /* i.e. case: CODING_EOL_LF */
d46c5b12
KH
2652 if (dst_bytes && src_bytes > dst_bytes)
2653 {
2654 result = CODING_FINISH_INSUFFICIENT_DST;
2655 src_bytes = dst_bytes;
2656 }
2657 if (dst_bytes)
2658 bcopy (source, destination, src_bytes);
2659 else
2660 safe_bcopy (source, destination, src_bytes);
2661 src += src_bytes;
993824c9 2662 dst += src_bytes;
fb88bf2d 2663 coding->fake_multibyte = 1;
4ed46869
KH
2664 break;
2665 }
2666
d46c5b12
KH
2667 coding->consumed = coding->consumed_char = src - source;
2668 coding->produced = coding->produced_char = dst - destination;
2669 return result;
4ed46869
KH
2670}
2671
2672/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2673 format of end-of-line according to `coding->eol_type'. If
d46c5b12
KH
2674 `coding->mode & CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code
2675 '\r' in source text also means end-of-line. */
4ed46869 2676
dfcf069d 2677int
d46c5b12 2678encode_eol (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
2679 struct coding_system *coding;
2680 unsigned char *source, *destination;
2681 int src_bytes, dst_bytes;
4ed46869
KH
2682{
2683 unsigned char *src = source;
2684 unsigned char *dst = destination;
d46c5b12 2685 int result = CODING_FINISH_NORMAL;
4ed46869 2686
fb88bf2d
KH
2687 coding->fake_multibyte = 0;
2688
d46c5b12
KH
2689 if (coding->eol_type == CODING_EOL_CRLF)
2690 {
2691 unsigned char c;
2692 unsigned char *src_end = source + src_bytes;
2693 unsigned char *dst_end = destination + dst_bytes;
2694 /* Since the maximum bytes produced by each loop is 2, we
2695 subtract 1 from DST_END to assure overflow checking is
2696 necessary only at the head of loop. */
2697 unsigned char *adjusted_dst_end = dst_end - 1;
2698
2699 while (src < src_end && (dst_bytes
2700 ? (dst < adjusted_dst_end)
2701 : (dst < src - 1)))
2702 {
2703 c = *src++;
2704 if (c == '\n'
2705 || (c == '\r' && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)))
2706 *dst++ = '\r', *dst++ = '\n';
2707 else
fb88bf2d
KH
2708 {
2709 *dst++ = c;
2710 if (BASE_LEADING_CODE_P (c))
2711 coding->fake_multibyte = 1;
2712 }
d46c5b12
KH
2713 }
2714 if (src < src_end)
2715 result = CODING_FINISH_INSUFFICIENT_DST;
2716 }
2717 else
4ed46869 2718 {
fb88bf2d
KH
2719 unsigned char c;
2720
d46c5b12 2721 if (dst_bytes && src_bytes > dst_bytes)
4ed46869 2722 {
d46c5b12
KH
2723 src_bytes = dst_bytes;
2724 result = CODING_FINISH_INSUFFICIENT_DST;
2725 }
2726 if (dst_bytes)
2727 bcopy (source, destination, src_bytes);
2728 else
993824c9
RS
2729 safe_bcopy (source, destination, src_bytes);
2730 dst_bytes = src_bytes;
2731 if (coding->eol_type == CODING_EOL_CR)
d46c5b12
KH
2732 {
2733 while (src_bytes--)
fb88bf2d
KH
2734 {
2735 if ((c = *dst++) == '\n')
2736 dst[-1] = '\r';
2737 else if (BASE_LEADING_CODE_P (c))
993824c9 2738 coding->fake_multibyte = 1;
fb88bf2d 2739 }
d46c5b12 2740 }
fb88bf2d 2741 else
d46c5b12 2742 {
fb88bf2d
KH
2743 if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2744 {
2745 while (src_bytes--)
2746 if (*dst++ == '\r') dst[-1] = '\n';
2747 }
2748 coding->fake_multibyte = 1;
4ed46869 2749 }
fb88bf2d
KH
2750 src = source + dst_bytes;
2751 dst = destination + dst_bytes;
4ed46869
KH
2752 }
2753
d46c5b12
KH
2754 coding->consumed = coding->consumed_char = src - source;
2755 coding->produced = coding->produced_char = dst - destination;
2756 return result;
4ed46869
KH
2757}
2758
2759\f
1397dc18 2760/*** 7. C library functions ***/
4ed46869
KH
2761
2762/* In Emacs Lisp, coding system is represented by a Lisp symbol which
2763 has a property `coding-system'. The value of this property is a
2764 vector of length 5 (called the coding-vector). Among the elements of
2765 this vector, the first (element[0]) and the fifth (element[4])
2766 carry important information for decoding/encoding. Before
2767 decoding/encoding, this information should be set in fields of a
2768 structure of type `coding_system'.
2769
2770 A value of property `coding-system' can be a symbol of another
2771 subsidiary coding-system. In that case, Emacs gets coding-vector
2772 from that symbol.
2773
2774 `element[0]' contains information to be set in `coding->type'. The
2775 value and its meaning is as follows:
2776
0ef69138
KH
2777 0 -- coding_type_emacs_mule
2778 1 -- coding_type_sjis
2779 2 -- coding_type_iso2022
2780 3 -- coding_type_big5
2781 4 -- coding_type_ccl encoder/decoder written in CCL
2782 nil -- coding_type_no_conversion
2783 t -- coding_type_undecided (automatic conversion on decoding,
2784 no-conversion on encoding)
4ed46869
KH
2785
2786 `element[4]' contains information to be set in `coding->flags' and
2787 `coding->spec'. The meaning varies by `coding->type'.
2788
2789 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2790 of length 32 (of which the first 17 sub-elements are used now).
2791 Meanings of these sub-elements are:
2792
2793 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2794 If the value is an integer of valid charset, the charset is
2795 assumed to be designated to graphic register N initially.
2796
2797 If the value is negative, it is the negated value of a charset which
2798 reserves graphic register N, which means that the charset is
2799 not designated initially but should be designated to graphic
2800 register N just before encoding a character in that charset.
2801
2802 If the value is nil, graphic register N is never used on
2803 encoding.
2804
2805 sub-element[N] where N is 4 through 16: to be set in `coding->flags'
2806 Each value takes t or nil. See the section ISO2022 of
2807 `coding.h' for more information.
2808
2809 If `coding->type' is `coding_type_big5', element[4] is t to denote
2810 BIG5-ETen or nil to denote BIG5-HKU.
2811
2812 If `coding->type' takes the other value, element[4] is ignored.
2813
2814 Emacs Lisp's coding system also carries information about format of
2815 end-of-line in a value of property `eol-type'. If the value is
2816 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2817 means CODING_EOL_CR. If it is not integer, it should be a vector
2818 of subsidiary coding systems of which property `eol-type' has one
2819 of above values.
2820
2821*/
2822
2823/* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2824 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2825 is setup so that no conversion is necessary and return -1, else
2826 return 0. */
2827
2828int
e0e989f6
KH
2829setup_coding_system (coding_system, coding)
2830 Lisp_Object coding_system;
4ed46869
KH
2831 struct coding_system *coding;
2832{
d46c5b12 2833 Lisp_Object coding_spec, coding_type, eol_type, plist;
4608c386 2834 Lisp_Object val;
70c22245 2835 int i;
4ed46869 2836
d46c5b12 2837 /* Initialize some fields required for all kinds of coding systems. */
774324d6 2838 coding->symbol = coding_system;
d46c5b12
KH
2839 coding->common_flags = 0;
2840 coding->mode = 0;
2841 coding->heading_ascii = -1;
2842 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
4608c386
KH
2843 coding_spec = Fget (coding_system, Qcoding_system);
2844 if (!VECTORP (coding_spec)
2845 || XVECTOR (coding_spec)->size != 5
2846 || !CONSP (XVECTOR (coding_spec)->contents[3]))
4ed46869 2847 goto label_invalid_coding_system;
4608c386 2848
d46c5b12
KH
2849 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
2850 if (VECTORP (eol_type))
2851 {
2852 coding->eol_type = CODING_EOL_UNDECIDED;
2853 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2854 }
2855 else if (XFASTINT (eol_type) == 1)
2856 {
2857 coding->eol_type = CODING_EOL_CRLF;
2858 coding->common_flags
2859 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2860 }
2861 else if (XFASTINT (eol_type) == 2)
2862 {
2863 coding->eol_type = CODING_EOL_CR;
2864 coding->common_flags
2865 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2866 }
2867 else
2868 coding->eol_type = CODING_EOL_LF;
2869
2870 coding_type = XVECTOR (coding_spec)->contents[0];
2871 /* Try short cut. */
2872 if (SYMBOLP (coding_type))
2873 {
2874 if (EQ (coding_type, Qt))
2875 {
2876 coding->type = coding_type_undecided;
2877 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
2878 }
2879 else
2880 coding->type = coding_type_no_conversion;
2881 return 0;
2882 }
2883
2884 /* Initialize remaining fields. */
2885 coding->composing = 0;
d46c5b12
KH
2886
2887 /* Get values of coding system properties:
2888 `post-read-conversion', `pre-write-conversion',
f967223b 2889 `translation-table-for-decode', `translation-table-for-encode'. */
4608c386
KH
2890 plist = XVECTOR (coding_spec)->contents[3];
2891 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
2892 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
f967223b 2893 val = Fplist_get (plist, Qtranslation_table_for_decode);
4608c386 2894 if (SYMBOLP (val))
f967223b
KH
2895 val = Fget (val, Qtranslation_table_for_decode);
2896 coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
2897 val = Fplist_get (plist, Qtranslation_table_for_encode);
4608c386 2898 if (SYMBOLP (val))
f967223b
KH
2899 val = Fget (val, Qtranslation_table_for_encode);
2900 coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
d46c5b12
KH
2901 val = Fplist_get (plist, Qcoding_category);
2902 if (!NILP (val))
2903 {
2904 val = Fget (val, Qcoding_category_index);
2905 if (INTEGERP (val))
2906 coding->category_idx = XINT (val);
2907 else
2908 goto label_invalid_coding_system;
2909 }
2910 else
2911 goto label_invalid_coding_system;
4608c386 2912
70c22245
KH
2913 val = Fplist_get (plist, Qsafe_charsets);
2914 if (EQ (val, Qt))
2915 {
2916 for (i = 0; i <= MAX_CHARSET; i++)
2917 coding->safe_charsets[i] = 1;
2918 }
2919 else
2920 {
2921 bzero (coding->safe_charsets, MAX_CHARSET + 1);
2922 while (CONSP (val))
2923 {
2924 if ((i = get_charset_id (XCONS (val)->car)) >= 0)
2925 coding->safe_charsets[i] = 1;
2926 val = XCONS (val)->cdr;
2927 }
2928 }
2929
d46c5b12 2930 switch (XFASTINT (coding_type))
4ed46869
KH
2931 {
2932 case 0:
0ef69138 2933 coding->type = coding_type_emacs_mule;
c952af22
KH
2934 if (!NILP (coding->post_read_conversion))
2935 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
2936 if (!NILP (coding->pre_write_conversion))
2937 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4ed46869
KH
2938 break;
2939
2940 case 1:
2941 coding->type = coding_type_sjis;
c952af22
KH
2942 coding->common_flags
2943 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869
KH
2944 break;
2945
2946 case 2:
2947 coding->type = coding_type_iso2022;
c952af22
KH
2948 coding->common_flags
2949 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 2950 {
70c22245 2951 Lisp_Object val, temp;
4ed46869 2952 Lisp_Object *flags;
d46c5b12 2953 int i, charset, reg_bits = 0;
4ed46869 2954
4608c386 2955 val = XVECTOR (coding_spec)->contents[4];
f44d27ce 2956
4ed46869
KH
2957 if (!VECTORP (val) || XVECTOR (val)->size != 32)
2958 goto label_invalid_coding_system;
2959
2960 flags = XVECTOR (val)->contents;
2961 coding->flags
2962 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2963 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2964 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2965 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2966 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2967 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2968 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2969 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
e0e989f6
KH
2970 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2971 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
c4825358
KH
2972 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2973 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3f003981 2974 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
c4825358 2975 );
4ed46869
KH
2976
2977 /* Invoke graphic register 0 to plane 0. */
2978 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2979 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2980 CODING_SPEC_ISO_INVOCATION (coding, 1)
2981 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2982 /* Not single shifting at first. */
6e85d753 2983 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
e0e989f6 2984 /* Beginning of buffer should also be regarded as bol. */
6e85d753 2985 CODING_SPEC_ISO_BOL (coding) = 1;
4ed46869 2986
70c22245
KH
2987 for (charset = 0; charset <= MAX_CHARSET; charset++)
2988 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
2989 val = Vcharset_revision_alist;
2990 while (CONSP (val))
2991 {
2992 charset = get_charset_id (Fcar_safe (XCONS (val)->car));
2993 if (charset >= 0
2994 && (temp = Fcdr_safe (XCONS (val)->car), INTEGERP (temp))
2995 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
2996 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
2997 val = XCONS (val)->cdr;
2998 }
2999
4ed46869
KH
3000 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3001 FLAGS[REG] can be one of below:
3002 integer CHARSET: CHARSET occupies register I,
3003 t: designate nothing to REG initially, but can be used
3004 by any charsets,
3005 list of integer, nil, or t: designate the first
3006 element (if integer) to REG initially, the remaining
3007 elements (if integer) is designated to REG on request,
d46c5b12 3008 if an element is t, REG can be used by any charsets,
4ed46869 3009 nil: REG is never used. */
467e7675 3010 for (charset = 0; charset <= MAX_CHARSET; charset++)
1ba9e4ab
KH
3011 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3012 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
4ed46869
KH
3013 for (i = 0; i < 4; i++)
3014 {
3015 if (INTEGERP (flags[i])
e0e989f6
KH
3016 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
3017 || (charset = get_charset_id (flags[i])) >= 0)
4ed46869
KH
3018 {
3019 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3020 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3021 }
3022 else if (EQ (flags[i], Qt))
3023 {
3024 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
d46c5b12
KH
3025 reg_bits |= 1 << i;
3026 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
4ed46869
KH
3027 }
3028 else if (CONSP (flags[i]))
3029 {
84d60297
RS
3030 Lisp_Object tail;
3031 tail = flags[i];
4ed46869 3032
d46c5b12 3033 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
4ed46869
KH
3034 if (INTEGERP (XCONS (tail)->car)
3035 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
3036 CHARSET_VALID_P (charset))
3037 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
4ed46869
KH
3038 {
3039 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3040 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3041 }
3042 else
3043 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3044 tail = XCONS (tail)->cdr;
3045 while (CONSP (tail))
3046 {
3047 if (INTEGERP (XCONS (tail)->car)
3048 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
3049 CHARSET_VALID_P (charset))
3050 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
70c22245
KH
3051 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3052 = i;
4ed46869 3053 else if (EQ (XCONS (tail)->car, Qt))
d46c5b12 3054 reg_bits |= 1 << i;
4ed46869
KH
3055 tail = XCONS (tail)->cdr;
3056 }
3057 }
3058 else
3059 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3060
3061 CODING_SPEC_ISO_DESIGNATION (coding, i)
3062 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3063 }
3064
d46c5b12 3065 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
4ed46869
KH
3066 {
3067 /* REG 1 can be used only by locking shift in 7-bit env. */
3068 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
d46c5b12 3069 reg_bits &= ~2;
4ed46869
KH
3070 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3071 /* Without any shifting, only REG 0 and 1 can be used. */
d46c5b12 3072 reg_bits &= 3;
4ed46869
KH
3073 }
3074
d46c5b12
KH
3075 if (reg_bits)
3076 for (charset = 0; charset <= MAX_CHARSET; charset++)
6e85d753 3077 {
d46c5b12
KH
3078 if (CHARSET_VALID_P (charset))
3079 {
3080 /* There exist some default graphic registers to be
3081 used CHARSET. */
3082
3083 /* We had better avoid designating a charset of
3084 CHARS96 to REG 0 as far as possible. */
3085 if (CHARSET_CHARS (charset) == 96)
3086 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3087 = (reg_bits & 2
3088 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3089 else
3090 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3091 = (reg_bits & 1
3092 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3093 }
6e85d753 3094 }
4ed46869 3095 }
c952af22 3096 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
d46c5b12 3097 coding->spec.iso2022.last_invalid_designation_register = -1;
4ed46869
KH
3098 break;
3099
3100 case 3:
3101 coding->type = coding_type_big5;
c952af22
KH
3102 coding->common_flags
3103 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 3104 coding->flags
4608c386 3105 = (NILP (XVECTOR (coding_spec)->contents[4])
4ed46869
KH
3106 ? CODING_FLAG_BIG5_HKU
3107 : CODING_FLAG_BIG5_ETEN);
3108 break;
3109
3110 case 4:
3111 coding->type = coding_type_ccl;
c952af22
KH
3112 coding->common_flags
3113 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 3114 {
84d60297 3115 Lisp_Object val;
d21ca14d
KH
3116 Lisp_Object decoder, encoder;
3117
84d60297 3118 val = XVECTOR (coding_spec)->contents[4];
4ed46869 3119 if (CONSP (val)
d21ca14d
KH
3120 && SYMBOLP (XCONS (val)->car)
3121 && !NILP (decoder = Fget (XCONS (val)->car, Qccl_program_idx))
f82423d7 3122 && !NILP (decoder = Fcdr (Faref (Vccl_program_table, decoder)))
d21ca14d
KH
3123 && SYMBOLP (XCONS (val)->cdr)
3124 && !NILP (encoder = Fget (XCONS (val)->cdr, Qccl_program_idx))
f82423d7 3125 && !NILP (encoder = Fcdr (Faref (Vccl_program_table, encoder))))
4ed46869 3126 {
d21ca14d
KH
3127 setup_ccl_program (&(coding->spec.ccl.decoder), decoder);
3128 setup_ccl_program (&(coding->spec.ccl.encoder), encoder);
4ed46869
KH
3129 }
3130 else
3131 goto label_invalid_coding_system;
1397dc18
KH
3132
3133 bzero (coding->spec.ccl.valid_codes, 256);
3134 val = Fplist_get (plist, Qvalid_codes);
3135 if (CONSP (val))
3136 {
3137 Lisp_Object this;
3138
7b179c2d 3139 for (; CONSP (val); val = XCONS (val)->cdr)
1397dc18 3140 {
7b179c2d 3141 this = XCONS (val)->car;
1397dc18
KH
3142 if (INTEGERP (this)
3143 && XINT (this) >= 0 && XINT (this) < 256)
3144 coding->spec.ccl.valid_codes[XINT (this)] = 1;
3145 else if (CONSP (this)
3146 && INTEGERP (XCONS (this)->car)
3147 && INTEGERP (XCONS (this)->cdr))
3148 {
3149 int start = XINT (XCONS (this)->car);
3150 int end = XINT (XCONS (this)->cdr);
3151
3152 if (start >= 0 && start <= end && end < 256)
e133c8fa 3153 while (start <= end)
1397dc18
KH
3154 coding->spec.ccl.valid_codes[start++] = 1;
3155 }
3156 }
3157 }
4ed46869 3158 }
c952af22 3159 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
4ed46869
KH
3160 break;
3161
27901516
KH
3162 case 5:
3163 coding->type = coding_type_raw_text;
3164 break;
3165
4ed46869 3166 default:
d46c5b12 3167 goto label_invalid_coding_system;
4ed46869
KH
3168 }
3169 return 0;
3170
3171 label_invalid_coding_system:
3172 coding->type = coding_type_no_conversion;
d46c5b12 3173 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
c952af22 3174 coding->common_flags = 0;
dec137e5 3175 coding->eol_type = CODING_EOL_LF;
d46c5b12 3176 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
4ed46869
KH
3177 return -1;
3178}
3179
54f78171
KH
3180/* Setup raw-text or one of its subsidiaries in the structure
3181 coding_system CODING according to the already setup value eol_type
3182 in CODING. CODING should be setup for some coding system in
3183 advance. */
3184
3185void
3186setup_raw_text_coding_system (coding)
3187 struct coding_system *coding;
3188{
3189 if (coding->type != coding_type_raw_text)
3190 {
3191 coding->symbol = Qraw_text;
3192 coding->type = coding_type_raw_text;
3193 if (coding->eol_type != CODING_EOL_UNDECIDED)
3194 {
84d60297
RS
3195 Lisp_Object subsidiaries;
3196 subsidiaries = Fget (Qraw_text, Qeol_type);
54f78171
KH
3197
3198 if (VECTORP (subsidiaries)
3199 && XVECTOR (subsidiaries)->size == 3)
3200 coding->symbol
3201 = XVECTOR (subsidiaries)->contents[coding->eol_type];
3202 }
3203 }
3204 return;
3205}
3206
4ed46869
KH
3207/* Emacs has a mechanism to automatically detect a coding system if it
3208 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
3209 it's impossible to distinguish some coding systems accurately
3210 because they use the same range of codes. So, at first, coding
3211 systems are categorized as follows:
3212
0ef69138 3213 o coding-category-emacs-mule
4ed46869
KH
3214
3215 The category for a coding system which has the same code range
3216 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 3217 symbol) `emacs-mule' by default.
4ed46869
KH
3218
3219 o coding-category-sjis
3220
3221 The category for a coding system which has the same code range
3222 as SJIS. Assigned the coding-system (Lisp
7717c392 3223 symbol) `japanese-shift-jis' by default.
4ed46869
KH
3224
3225 o coding-category-iso-7
3226
3227 The category for a coding system which has the same code range
7717c392 3228 as ISO2022 of 7-bit environment. This doesn't use any locking
d46c5b12
KH
3229 shift and single shift functions. This can encode/decode all
3230 charsets. Assigned the coding-system (Lisp symbol)
3231 `iso-2022-7bit' by default.
3232
3233 o coding-category-iso-7-tight
3234
3235 Same as coding-category-iso-7 except that this can
3236 encode/decode only the specified charsets.
4ed46869
KH
3237
3238 o coding-category-iso-8-1
3239
3240 The category for a coding system which has the same code range
3241 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
3242 for DIMENSION1 charset. This doesn't use any locking shift
3243 and single shift functions. Assigned the coding-system (Lisp
3244 symbol) `iso-latin-1' by default.
4ed46869
KH
3245
3246 o coding-category-iso-8-2
3247
3248 The category for a coding system which has the same code range
3249 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
3250 for DIMENSION2 charset. This doesn't use any locking shift
3251 and single shift functions. Assigned the coding-system (Lisp
3252 symbol) `japanese-iso-8bit' by default.
4ed46869 3253
7717c392 3254 o coding-category-iso-7-else
4ed46869
KH
3255
3256 The category for a coding system which has the same code range
7717c392
KH
3257 as ISO2022 of 7-bit environment but uses locking shift or
3258 single shift functions. Assigned the coding-system (Lisp
3259 symbol) `iso-2022-7bit-lock' by default.
3260
3261 o coding-category-iso-8-else
3262
3263 The category for a coding system which has the same code range
3264 as ISO2022 of 8-bit environment but uses locking shift or
3265 single shift functions. Assigned the coding-system (Lisp
3266 symbol) `iso-2022-8bit-ss2' by default.
4ed46869
KH
3267
3268 o coding-category-big5
3269
3270 The category for a coding system which has the same code range
3271 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 3272 `cn-big5' by default.
4ed46869 3273
1397dc18
KH
3274 o coding-category-ccl
3275
3276 The category for a coding system of which encoder/decoder is
3277 written in CCL programs. The default value is nil, i.e., no
3278 coding system is assigned.
3279
4ed46869
KH
3280 o coding-category-binary
3281
3282 The category for a coding system not categorized in any of the
3283 above. Assigned the coding-system (Lisp symbol)
e0e989f6 3284 `no-conversion' by default.
4ed46869
KH
3285
3286 Each of them is a Lisp symbol and the value is an actual
3287 `coding-system' (this is also a Lisp symbol) assigned by a user.
3288 What Emacs does actually is to detect a category of coding system.
3289 Then, it uses a `coding-system' assigned to it. If Emacs can't
3290 decide only one possible category, it selects a category of the
3291 highest priority. Priorities of categories are also specified by a
3292 user in a Lisp variable `coding-category-list'.
3293
3294*/
3295
66cfb530
KH
3296static
3297int ascii_skip_code[256];
3298
d46c5b12 3299/* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4ed46869
KH
3300 If it detects possible coding systems, return an integer in which
3301 appropriate flag bits are set. Flag bits are defined by macros
d46c5b12 3302 CODING_CATEGORY_MASK_XXX in `coding.h'.
4ed46869 3303
d46c5b12
KH
3304 How many ASCII characters are at the head is returned as *SKIP. */
3305
3306static int
3307detect_coding_mask (source, src_bytes, priorities, skip)
3308 unsigned char *source;
3309 int src_bytes, *priorities, *skip;
4ed46869
KH
3310{
3311 register unsigned char c;
d46c5b12 3312 unsigned char *src = source, *src_end = source + src_bytes;
66cfb530 3313 unsigned int mask;
d46c5b12 3314 int i;
4ed46869
KH
3315
3316 /* At first, skip all ASCII characters and control characters except
3317 for three ISO2022 specific control characters. */
66cfb530
KH
3318 ascii_skip_code[ISO_CODE_SO] = 0;
3319 ascii_skip_code[ISO_CODE_SI] = 0;
3320 ascii_skip_code[ISO_CODE_ESC] = 0;
3321
bcf26d6a 3322 label_loop_detect_coding:
66cfb530 3323 while (src < src_end && ascii_skip_code[*src]) src++;
d46c5b12 3324 *skip = src - source;
4ed46869
KH
3325
3326 if (src >= src_end)
3327 /* We found nothing other than ASCII. There's nothing to do. */
d46c5b12 3328 return 0;
4ed46869 3329
8a8147d6 3330 c = *src;
4ed46869
KH
3331 /* The text seems to be encoded in some multilingual coding system.
3332 Now, try to find in which coding system the text is encoded. */
3333 if (c < 0x80)
bcf26d6a
KH
3334 {
3335 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3336 /* C is an ISO2022 specific control code of C0. */
3337 mask = detect_coding_iso2022 (src, src_end);
1b2af4b0 3338 if (mask == 0)
d46c5b12
KH
3339 {
3340 /* No valid ISO2022 code follows C. Try again. */
3341 src++;
66cfb530
KH
3342 if (c == ISO_CODE_ESC)
3343 ascii_skip_code[ISO_CODE_ESC] = 1;
3344 else
3345 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
d46c5b12
KH
3346 goto label_loop_detect_coding;
3347 }
3348 if (priorities)
3349 goto label_return_highest_only;
bcf26d6a 3350 }
d46c5b12 3351 else
c4825358 3352 {
d46c5b12 3353 int try;
4ed46869 3354
d46c5b12
KH
3355 if (c < 0xA0)
3356 {
3357 /* C is the first byte of SJIS character code,
3358 or a leading-code of Emacs' internal format (emacs-mule). */
3359 try = CODING_CATEGORY_MASK_SJIS | CODING_CATEGORY_MASK_EMACS_MULE;
3360
3361 /* Or, if C is a special latin extra code,
3362 or is an ISO2022 specific control code of C1 (SS2 or SS3),
3363 or is an ISO2022 control-sequence-introducer (CSI),
3364 we should also consider the possibility of ISO2022 codings. */
3365 if ((VECTORP (Vlatin_extra_code_table)
3366 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3367 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3368 || (c == ISO_CODE_CSI
3369 && (src < src_end
3370 && (*src == ']'
3371 || ((*src == '0' || *src == '1' || *src == '2')
3372 && src + 1 < src_end
3373 && src[1] == ']')))))
3374 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3375 | CODING_CATEGORY_MASK_ISO_8BIT);
3376 }
c4825358 3377 else
d46c5b12
KH
3378 /* C is a character of ISO2022 in graphic plane right,
3379 or a SJIS's 1-byte character code (i.e. JISX0201),
3380 or the first byte of BIG5's 2-byte code. */
3381 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3382 | CODING_CATEGORY_MASK_ISO_8BIT
3383 | CODING_CATEGORY_MASK_SJIS
3384 | CODING_CATEGORY_MASK_BIG5);
3385
1397dc18
KH
3386 /* Or, we may have to consider the possibility of CCL. */
3387 if (coding_system_table[CODING_CATEGORY_IDX_CCL]
3388 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
3389 ->spec.ccl.valid_codes)[c])
3390 try |= CODING_CATEGORY_MASK_CCL;
3391
d46c5b12
KH
3392 mask = 0;
3393 if (priorities)
3394 {
3395 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3396 {
5ab13dd0 3397 if (priorities[i] & try & CODING_CATEGORY_MASK_ISO)
d46c5b12 3398 mask = detect_coding_iso2022 (src, src_end);
5ab13dd0 3399 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
d46c5b12 3400 mask = detect_coding_sjis (src, src_end);
5ab13dd0 3401 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
d46c5b12 3402 mask = detect_coding_big5 (src, src_end);
5ab13dd0 3403 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
d46c5b12 3404 mask = detect_coding_emacs_mule (src, src_end);
89fa8b36 3405 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
1397dc18 3406 mask = detect_coding_ccl (src, src_end);
5ab13dd0
RS
3407 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
3408 mask = CODING_CATEGORY_MASK_RAW_TEXT;
3409 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
3410 mask = CODING_CATEGORY_MASK_BINARY;
d46c5b12
KH
3411 if (mask)
3412 goto label_return_highest_only;
3413 }
3414 return CODING_CATEGORY_MASK_RAW_TEXT;
3415 }
3416 if (try & CODING_CATEGORY_MASK_ISO)
3417 mask |= detect_coding_iso2022 (src, src_end);
3418 if (try & CODING_CATEGORY_MASK_SJIS)
3419 mask |= detect_coding_sjis (src, src_end);
3420 if (try & CODING_CATEGORY_MASK_BIG5)
3421 mask |= detect_coding_big5 (src, src_end);
3422 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
1397dc18
KH
3423 mask |= detect_coding_emacs_mule (src, src_end);
3424 if (try & CODING_CATEGORY_MASK_CCL)
3425 mask |= detect_coding_ccl (src, src_end);
c4825358 3426 }
5ab13dd0 3427 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
d46c5b12
KH
3428
3429 label_return_highest_only:
3430 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3431 {
3432 if (mask & priorities[i])
3433 return priorities[i];
3434 }
3435 return CODING_CATEGORY_MASK_RAW_TEXT;
4ed46869
KH
3436}
3437
3438/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3439 The information of the detected coding system is set in CODING. */
3440
3441void
3442detect_coding (coding, src, src_bytes)
3443 struct coding_system *coding;
3444 unsigned char *src;
3445 int src_bytes;
3446{
d46c5b12
KH
3447 unsigned int idx;
3448 int skip, mask, i;
84d60297 3449 Lisp_Object val;
4ed46869 3450
84d60297 3451 val = Vcoding_category_list;
66cfb530 3452 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip);
d46c5b12 3453 coding->heading_ascii = skip;
4ed46869 3454
d46c5b12
KH
3455 if (!mask) return;
3456
3457 /* We found a single coding system of the highest priority in MASK. */
3458 idx = 0;
3459 while (mask && ! (mask & 1)) mask >>= 1, idx++;
3460 if (! mask)
3461 idx = CODING_CATEGORY_IDX_RAW_TEXT;
4ed46869 3462
d46c5b12
KH
3463 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3464
3465 if (coding->eol_type != CODING_EOL_UNDECIDED)
27901516 3466 {
84d60297 3467 Lisp_Object tmp;
d46c5b12 3468
84d60297 3469 tmp = Fget (val, Qeol_type);
d46c5b12
KH
3470 if (VECTORP (tmp))
3471 val = XVECTOR (tmp)->contents[coding->eol_type];
4ed46869 3472 }
d46c5b12
KH
3473 setup_coding_system (val, coding);
3474 /* Set this again because setup_coding_system reset this member. */
3475 coding->heading_ascii = skip;
4ed46869
KH
3476}
3477
d46c5b12
KH
/* Detect how end-of-line of a text of length SRC_BYTES pointed by
   SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
   CODING_EOL_CR, CODING_EOL_INCONSISTENT (different formats are
   mixed), and CODING_EOL_UNDECIDED (no end-of-line was found).

   At most MAX_EOL_CHECK_COUNT end-of-lines are examined.

   How many non-eol characters are at the head is returned as *SKIP,
   i.e. the offset of the first end-of-line, or SRC_BYTES if there is
   none.  */

#define MAX_EOL_CHECK_COUNT 3

static int
detect_eol_type (source, src_bytes, skip)
     unsigned char *source;
     int src_bytes, *skip;
{
  unsigned char *src = source, *src_end = src + src_bytes;
  unsigned char c;
  int total = 0;                /* How many end-of-lines are found so far.  */
  int eol_type = CODING_EOL_UNDECIDED;
  int this_eol_type;

  /* Until an end-of-line shows up, the whole text is the heading
     part.  Zero must not be used as a "not found yet" marker because
     it is also the legitimate offset of an end-of-line at the very
     beginning of the text.  */
  *skip = src_bytes;

  while (src < src_end && total < MAX_EOL_CHECK_COUNT)
    {
      c = *src++;
      if (c == '\n' || c == '\r')
        {
          if (total == 0)
            /* Record the position of the first end-of-line only.  */
            *skip = src - 1 - source;
          total++;
          if (c == '\n')
            this_eol_type = CODING_EOL_LF;
          else if (src >= src_end || *src != '\n')
            this_eol_type = CODING_EOL_CR;
          else
            /* CR followed by LF; consume the LF as well.  */
            this_eol_type = CODING_EOL_CRLF, src++;

          if (eol_type == CODING_EOL_UNDECIDED)
            /* This is the first end-of-line.  */
            eol_type = this_eol_type;
          else if (eol_type != this_eol_type)
            {
              /* The found type is different from what found before.  */
              eol_type = CODING_EOL_INCONSISTENT;
              break;
            }
        }
    }

  return eol_type;
}
3530
3531/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3532 is encoded. If it detects an appropriate format of end-of-line, it
3533 sets the information in *CODING. */
3534
3535void
3536detect_eol (coding, src, src_bytes)
3537 struct coding_system *coding;
3538 unsigned char *src;
3539 int src_bytes;
3540{
4608c386 3541 Lisp_Object val;
d46c5b12
KH
3542 int skip;
3543 int eol_type = detect_eol_type (src, src_bytes, &skip);
3544
3545 if (coding->heading_ascii > skip)
3546 coding->heading_ascii = skip;
3547 else
3548 skip = coding->heading_ascii;
4ed46869 3549
0ef69138 3550 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869 3551 return;
27901516
KH
3552 if (eol_type == CODING_EOL_INCONSISTENT)
3553 {
3554#if 0
3555 /* This code is suppressed until we find a better way to
992f23f2 3556 distinguish raw text file and binary file. */
27901516
KH
3557
3558 /* If we have already detected that the coding is raw-text, the
3559 coding should actually be no-conversion. */
3560 if (coding->type == coding_type_raw_text)
3561 {
3562 setup_coding_system (Qno_conversion, coding);
3563 return;
3564 }
3565 /* Else, let's decode only text code anyway. */
3566#endif /* 0 */
1b2af4b0 3567 eol_type = CODING_EOL_LF;
27901516
KH
3568 }
3569
4608c386 3570 val = Fget (coding->symbol, Qeol_type);
4ed46869 3571 if (VECTORP (val) && XVECTOR (val)->size == 3)
d46c5b12
KH
3572 {
3573 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
3574 coding->heading_ascii = skip;
3575 }
3576}
3577
/* Extra bytes added to every conversion buffer size estimate.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which decoding may enlarge text encoded in CODING (a
   pointer to struct coding_system): 3 for ISO2022, 2 for SJIS and
   BIG5, 1 for raw-text, the decoder's own buf_magnification for CCL,
   and 2 for anything else.  CODING is evaluated more than once, so
   it must not have side effects.  */
#define DECODING_BUFFER_MAG(coding)                                          \
  ((coding)->type == coding_type_iso2022                                     \
   ? 3                                                                       \
   : (((coding)->type == coding_type_sjis                                    \
       || (coding)->type == coding_type_big5)                                \
      ? 2                                                                    \
      : ((coding)->type == coding_type_raw_text                              \
         ? 1                                                                 \
         : ((coding)->type == coding_type_ccl                                \
            ? (coding)->spec.ccl.decoder.buf_magnification                   \
            : 2))))
3590
3591/* Return maximum size (bytes) of a buffer enough for decoding
3592 SRC_BYTES of text encoded in CODING. */
3593
3594int
3595decoding_buffer_size (coding, src_bytes)
3596 struct coding_system *coding;
3597 int src_bytes;
3598{
3599 return (src_bytes * DECODING_BUFFER_MAG (coding)
3600 + CONVERSION_BUFFER_EXTRA_ROOM);
3601}
3602
3603/* Return maximum size (bytes) of a buffer enough for encoding
3604 SRC_BYTES of text to CODING. */
3605
3606int
3607encoding_buffer_size (coding, src_bytes)
3608 struct coding_system *coding;
3609 int src_bytes;
3610{
3611 int magnification;
3612
3613 if (coding->type == coding_type_ccl)
3614 magnification = coding->spec.ccl.encoder.buf_magnification;
3615 else
3616 magnification = 3;
3617
3618 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3619}
3620
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* Scratch buffer shared by the conversion routines, and its current
   capacity in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.

   When the current buffer is too small it is replaced by a new one
   whose capacity is the old capacity doubled until it reaches SIZE.
   The old contents are NOT copied, and pointers previously returned by
   this function become invalid.

   NOTE(review): this function has no path that returns NULL; what
   happens on memory exhaustion is whatever xmalloc does -- confirm
   against xmalloc before relying on a NULL return.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size * 2;

      /* If the recorded capacity is still zero (or otherwise not
         positive), doubling it can never reach SIZE and the loop
         below would not terminate.  Start from the minimum instead.  */
      if (real_size <= 0)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;

      while (real_size < size)
        real_size *= 2;

      /* Allocate before freeing so that the globals keep describing a
         valid buffer until the new one exists.  */
      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
3649
3650int
3651ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
3652 struct coding_system *coding;
3653 unsigned char *source, *destination;
3654 int src_bytes, dst_bytes, encodep;
3655{
3656 struct ccl_program *ccl
3657 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
3658 int result;
3659
ae9ff118 3660 ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
7b179c2d 3661
d46c5b12
KH
3662 coding->produced = ccl_driver (ccl, source, destination,
3663 src_bytes, dst_bytes, &(coding->consumed));
69f76525
KH
3664 coding->produced_char
3665 = multibyte_chars_in_text (destination, coding->produced);
3666 coding->consumed_char
3667 = multibyte_chars_in_text (source, coding->consumed);
3668
d46c5b12
KH
3669 switch (ccl->status)
3670 {
3671 case CCL_STAT_SUSPEND_BY_SRC:
3672 result = CODING_FINISH_INSUFFICIENT_SRC;
3673 break;
3674 case CCL_STAT_SUSPEND_BY_DST:
3675 result = CODING_FINISH_INSUFFICIENT_DST;
3676 break;
9864ebce
KH
3677 case CCL_STAT_QUIT:
3678 case CCL_STAT_INVALID_CMD:
3679 result = CODING_FINISH_INTERRUPT;
3680 break;
d46c5b12
KH
3681 default:
3682 result = CODING_FINISH_NORMAL;
3683 break;
3684 }
3685 return result;
4ed46869
KH
3686}
3687
3688/* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
3689 decoding, it may detect coding system and format of end-of-line if
3690 those are not yet decided. */
3691
3692int
d46c5b12 3693decode_coding (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
3694 struct coding_system *coding;
3695 unsigned char *source, *destination;
3696 int src_bytes, dst_bytes;
4ed46869 3697{
d46c5b12 3698 int result;
4ed46869 3699
d4e57bcd 3700 if (src_bytes <= 0
944bd420 3701 && coding->type != coding_type_ccl
d4e57bcd
KH
3702 && ! (coding->mode & CODING_MODE_LAST_BLOCK
3703 && CODING_REQUIRE_FLUSHING (coding)))
4ed46869 3704 {
d46c5b12
KH
3705 coding->produced = coding->produced_char = 0;
3706 coding->consumed = coding->consumed_char = 0;
fb88bf2d 3707 coding->fake_multibyte = 0;
d46c5b12 3708 return CODING_FINISH_NORMAL;
4ed46869
KH
3709 }
3710
0ef69138 3711 if (coding->type == coding_type_undecided)
4ed46869
KH
3712 detect_coding (coding, source, src_bytes);
3713
0ef69138 3714 if (coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
3715 detect_eol (coding, source, src_bytes);
3716
4ed46869
KH
3717 switch (coding->type)
3718 {
0ef69138
KH
3719 case coding_type_emacs_mule:
3720 case coding_type_undecided:
27901516 3721 case coding_type_raw_text:
4ed46869 3722 if (coding->eol_type == CODING_EOL_LF
0ef69138 3723 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869 3724 goto label_no_conversion;
d46c5b12 3725 result = decode_eol (coding, source, destination, src_bytes, dst_bytes);
4ed46869
KH
3726 break;
3727
3728 case coding_type_sjis:
d46c5b12
KH
3729 result = decode_coding_sjis_big5 (coding, source, destination,
3730 src_bytes, dst_bytes, 1);
4ed46869
KH
3731 break;
3732
3733 case coding_type_iso2022:
d46c5b12
KH
3734 result = decode_coding_iso2022 (coding, source, destination,
3735 src_bytes, dst_bytes);
4ed46869
KH
3736 break;
3737
3738 case coding_type_big5:
d46c5b12
KH
3739 result = decode_coding_sjis_big5 (coding, source, destination,
3740 src_bytes, dst_bytes, 0);
4ed46869
KH
3741 break;
3742
3743 case coding_type_ccl:
d46c5b12
KH
3744 result = ccl_coding_driver (coding, source, destination,
3745 src_bytes, dst_bytes, 0);
3746 break;
3747
3748 default: /* i.e. case coding_type_no_conversion: */
3749 label_no_conversion:
3750 if (dst_bytes && src_bytes > dst_bytes)
3751 {
3752 coding->produced = dst_bytes;
3753 result = CODING_FINISH_INSUFFICIENT_DST;
3754 }
3755 else
3756 {
3757 coding->produced = src_bytes;
3758 result = CODING_FINISH_NORMAL;
3759 }
3760 if (dst_bytes)
3761 bcopy (source, destination, coding->produced);
3762 else
3763 safe_bcopy (source, destination, coding->produced);
fb88bf2d 3764 coding->fake_multibyte = 1;
d46c5b12
KH
3765 coding->consumed
3766 = coding->consumed_char = coding->produced_char = coding->produced;
4ed46869
KH
3767 break;
3768 }
3769
d46c5b12 3770 return result;
4ed46869
KH
3771}
3772
3773/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
3774
3775int
d46c5b12 3776encode_coding (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
3777 struct coding_system *coding;
3778 unsigned char *source, *destination;
3779 int src_bytes, dst_bytes;
4ed46869 3780{
d46c5b12 3781 int result;
4ed46869 3782
d4e57bcd
KH
3783 if (src_bytes <= 0
3784 && ! (coding->mode & CODING_MODE_LAST_BLOCK
3785 && CODING_REQUIRE_FLUSHING (coding)))
4ed46869 3786 {
d46c5b12
KH
3787 coding->produced = coding->produced_char = 0;
3788 coding->consumed = coding->consumed_char = 0;
fb88bf2d 3789 coding->fake_multibyte = 0;
d46c5b12
KH
3790 return CODING_FINISH_NORMAL;
3791 }
4ed46869 3792
d46c5b12
KH
3793 switch (coding->type)
3794 {
0ef69138
KH
3795 case coding_type_emacs_mule:
3796 case coding_type_undecided:
27901516 3797 case coding_type_raw_text:
4ed46869 3798 if (coding->eol_type == CODING_EOL_LF
0ef69138 3799 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869 3800 goto label_no_conversion;
d46c5b12 3801 result = encode_eol (coding, source, destination, src_bytes, dst_bytes);
4ed46869
KH
3802 break;
3803
3804 case coding_type_sjis:
d46c5b12
KH
3805 result = encode_coding_sjis_big5 (coding, source, destination,
3806 src_bytes, dst_bytes, 1);
4ed46869
KH
3807 break;
3808
3809 case coding_type_iso2022:
d46c5b12
KH
3810 result = encode_coding_iso2022 (coding, source, destination,
3811 src_bytes, dst_bytes);
4ed46869
KH
3812 break;
3813
3814 case coding_type_big5:
d46c5b12
KH
3815 result = encode_coding_sjis_big5 (coding, source, destination,
3816 src_bytes, dst_bytes, 0);
4ed46869
KH
3817 break;
3818
3819 case coding_type_ccl:
d46c5b12
KH
3820 result = ccl_coding_driver (coding, source, destination,
3821 src_bytes, dst_bytes, 1);
3822 break;
3823
3824 default: /* i.e. case coding_type_no_conversion: */
3825 label_no_conversion:
3826 if (dst_bytes && src_bytes > dst_bytes)
3827 {
3828 coding->produced = dst_bytes;
3829 result = CODING_FINISH_INSUFFICIENT_DST;
3830 }
3831 else
3832 {
3833 coding->produced = src_bytes;
3834 result = CODING_FINISH_NORMAL;
3835 }
3836 if (dst_bytes)
3837 bcopy (source, destination, coding->produced);
3838 else
3839 safe_bcopy (source, destination, coding->produced);
3840 if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
3841 {
3842 unsigned char *p = destination, *pend = p + coding->produced;
3843 while (p < pend)
3844 if (*p++ == '\015') p[-1] = '\n';
3845 }
fb88bf2d 3846 coding->fake_multibyte = 1;
d46c5b12
KH
3847 coding->consumed
3848 = coding->consumed_char = coding->produced_char = coding->produced;
4ed46869
KH
3849 break;
3850 }
3851
d46c5b12 3852 return result;
4ed46869
KH
3853}
3854
fb88bf2d
KH
3855/* Scan text in the region between *BEG and *END (byte positions),
3856 skip characters which we don't have to decode by coding system
3857 CODING at the head and tail, then set *BEG and *END to the region
3858 of the text we actually have to convert. The caller should move
3859 the gap out of the region in advance.
4ed46869 3860
d46c5b12
KH
3861 If STR is not NULL, *BEG and *END are indices into STR. */
3862
3863static void
3864shrink_decoding_region (beg, end, coding, str)
3865 int *beg, *end;
3866 struct coding_system *coding;
3867 unsigned char *str;
3868{
fb88bf2d 3869 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
d46c5b12 3870 int eol_conversion;
88993dfd 3871 Lisp_Object translation_table;
d46c5b12
KH
3872
3873 if (coding->type == coding_type_ccl
3874 || coding->type == coding_type_undecided
3875 || !NILP (coding->post_read_conversion))
3876 {
3877 /* We can't skip any data. */
3878 return;
3879 }
3880 else if (coding->type == coding_type_no_conversion)
3881 {
fb88bf2d
KH
3882 /* We need no conversion, but don't have to skip any data here.
3883 Decoding routine handles them effectively anyway. */
d46c5b12
KH
3884 return;
3885 }
3886
88993dfd
KH
3887 translation_table = coding->translation_table_for_decode;
3888 if (NILP (translation_table) && !NILP (Venable_character_translation))
3889 translation_table = Vstandard_translation_table_for_decode;
3890 if (CHAR_TABLE_P (translation_table))
3891 {
3892 int i;
3893 for (i = 0; i < 128; i++)
3894 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
3895 break;
3896 if (i < 128)
3897 /* Some ASCII character should be tranlsated. We give up
3898 shrinking. */
3899 return;
3900 }
3901
aa60dea6
KH
3902 eol_conversion = (coding->eol_type != CODING_EOL_LF);
3903
3904 if ((! eol_conversion) && (coding->heading_ascii >= 0))
d46c5b12
KH
3905 /* Detection routine has already found how much we can skip at the
3906 head. */
3907 *beg += coding->heading_ascii;
3908
3909 if (str)
3910 {
3911 begp_orig = begp = str + *beg;
3912 endp_orig = endp = str + *end;
3913 }
3914 else
3915 {
fb88bf2d 3916 begp_orig = begp = BYTE_POS_ADDR (*beg);
d46c5b12
KH
3917 endp_orig = endp = begp + *end - *beg;
3918 }
3919
d46c5b12
KH
3920 switch (coding->type)
3921 {
3922 case coding_type_emacs_mule:
3923 case coding_type_raw_text:
3924 if (eol_conversion)
3925 {
3926 if (coding->heading_ascii < 0)
fb88bf2d 3927 while (begp < endp && *begp != '\r' && *begp < 0x80) begp++;
ee59c65f 3928 while (begp < endp && endp[-1] != '\r' && endp[-1] < 0x80)
fb88bf2d 3929 endp--;
ee59c65f
RS
3930 /* Do not consider LF as ascii if preceded by CR, since that
3931 confuses eol decoding. */
3932 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
3933 endp++;
d46c5b12
KH
3934 }
3935 else
3936 begp = endp;
3937 break;
3938
3939 case coding_type_sjis:
3940 case coding_type_big5:
3941 /* We can skip all ASCII characters at the head. */
3942 if (coding->heading_ascii < 0)
3943 {
3944 if (eol_conversion)
de9d083c 3945 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
d46c5b12
KH
3946 else
3947 while (begp < endp && *begp < 0x80) begp++;
3948 }
3949 /* We can skip all ASCII characters at the tail except for the
3950 second byte of SJIS or BIG5 code. */
3951 if (eol_conversion)
de9d083c 3952 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
d46c5b12
KH
3953 else
3954 while (begp < endp && endp[-1] < 0x80) endp--;
ee59c65f
RS
3955 /* Do not consider LF as ascii if preceded by CR, since that
3956 confuses eol decoding. */
3957 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
3958 endp++;
d46c5b12
KH
3959 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
3960 endp++;
3961 break;
3962
3963 default: /* i.e. case coding_type_iso2022: */
622fece5
KH
3964 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
3965 /* We can't skip any data. */
3966 break;
d46c5b12
KH
3967 if (coding->heading_ascii < 0)
3968 {
d46c5b12
KH
3969 /* We can skip all ASCII characters at the head except for a
3970 few control codes. */
3971 while (begp < endp && (c = *begp) < 0x80
3972 && c != ISO_CODE_CR && c != ISO_CODE_SO
3973 && c != ISO_CODE_SI && c != ISO_CODE_ESC
3974 && (!eol_conversion || c != ISO_CODE_LF))
3975 begp++;
3976 }
3977 switch (coding->category_idx)
3978 {
3979 case CODING_CATEGORY_IDX_ISO_8_1:
3980 case CODING_CATEGORY_IDX_ISO_8_2:
3981 /* We can skip all ASCII characters at the tail. */
3982 if (eol_conversion)
de9d083c 3983 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
d46c5b12
KH
3984 else
3985 while (begp < endp && endp[-1] < 0x80) endp--;
ee59c65f
RS
3986 /* Do not consider LF as ascii if preceded by CR, since that
3987 confuses eol decoding. */
3988 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
3989 endp++;
d46c5b12
KH
3990 break;
3991
3992 case CODING_CATEGORY_IDX_ISO_7:
3993 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
de79a6a5
KH
3994 {
3995 /* We can skip all charactes at the tail except for 8-bit
3996 codes and ESC and the following 2-byte at the tail. */
3997 unsigned char *eight_bit = NULL;
3998
3999 if (eol_conversion)
4000 while (begp < endp
4001 && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4002 {
4003 if (!eight_bit && c & 0x80) eight_bit = endp;
4004 endp--;
4005 }
4006 else
4007 while (begp < endp
4008 && (c = endp[-1]) != ISO_CODE_ESC)
4009 {
4010 if (!eight_bit && c & 0x80) eight_bit = endp;
4011 endp--;
4012 }
4013 /* Do not consider LF as ascii if preceded by CR, since that
4014 confuses eol decoding. */
4015 if (begp < endp && endp < endp_orig
4016 && endp[-1] == '\r' && endp[0] == '\n')
4017 endp++;
4018 if (begp < endp && endp[-1] == ISO_CODE_ESC)
4019 {
4020 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
4021 /* This is an ASCII designation sequence. We can
4022 surely skip the tail. But, if we have
4023 encountered an 8-bit code, skip only the codes
4024 after that. */
4025 endp = eight_bit ? eight_bit : endp + 2;
4026 else
4027 /* Hmmm, we can't skip the tail. */
4028 endp = endp_orig;
4029 }
4030 else if (eight_bit)
4031 endp = eight_bit;
4032 }
d46c5b12
KH
4033 }
4034 }
4035 *beg += begp - begp_orig;
4036 *end += endp - endp_orig;
4037 return;
4038}
4039
4040/* Like shrink_decoding_region but for encoding. */
4041
4042static void
4043shrink_encoding_region (beg, end, coding, str)
4044 int *beg, *end;
4045 struct coding_system *coding;
4046 unsigned char *str;
4047{
4048 unsigned char *begp_orig, *begp, *endp_orig, *endp;
4049 int eol_conversion;
88993dfd 4050 Lisp_Object translation_table;
d46c5b12
KH
4051
4052 if (coding->type == coding_type_ccl)
4053 /* We can't skip any data. */
4054 return;
4055 else if (coding->type == coding_type_no_conversion)
4056 {
4057 /* We need no conversion. */
4058 *beg = *end;
4059 return;
4060 }
4061
88993dfd
KH
4062 translation_table = coding->translation_table_for_encode;
4063 if (NILP (translation_table) && !NILP (Venable_character_translation))
4064 translation_table = Vstandard_translation_table_for_encode;
4065 if (CHAR_TABLE_P (translation_table))
4066 {
4067 int i;
4068 for (i = 0; i < 128; i++)
4069 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4070 break;
4071 if (i < 128)
4072 /* Some ASCII character should be tranlsated. We give up
4073 shrinking. */
4074 return;
4075 }
4076
d46c5b12
KH
4077 if (str)
4078 {
4079 begp_orig = begp = str + *beg;
4080 endp_orig = endp = str + *end;
4081 }
4082 else
4083 {
fb88bf2d 4084 begp_orig = begp = BYTE_POS_ADDR (*beg);
d46c5b12
KH
4085 endp_orig = endp = begp + *end - *beg;
4086 }
4087
4088 eol_conversion = (coding->eol_type == CODING_EOL_CR
4089 || coding->eol_type == CODING_EOL_CRLF);
4090
4091 /* Here, we don't have to check coding->pre_write_conversion because
4092 the caller is expected to have handled it already. */
4093 switch (coding->type)
4094 {
4095 case coding_type_undecided:
4096 case coding_type_emacs_mule:
4097 case coding_type_raw_text:
4098 if (eol_conversion)
4099 {
4100 while (begp < endp && *begp != '\n') begp++;
4101 while (begp < endp && endp[-1] != '\n') endp--;
4102 }
4103 else
4104 begp = endp;
4105 break;
4106
4107 case coding_type_iso2022:
622fece5
KH
4108 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4109 /* We can't skip any data. */
4110 break;
d46c5b12
KH
4111 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
4112 {
4113 unsigned char *bol = begp;
4114 while (begp < endp && *begp < 0x80)
4115 {
4116 begp++;
4117 if (begp[-1] == '\n')
4118 bol = begp;
4119 }
4120 begp = bol;
4121 goto label_skip_tail;
4122 }
4123 /* fall down ... */
4124
4125 default:
4126 /* We can skip all ASCII characters at the head and tail. */
4127 if (eol_conversion)
4128 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
4129 else
4130 while (begp < endp && *begp < 0x80) begp++;
4131 label_skip_tail:
4132 if (eol_conversion)
4133 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
4134 else
4135 while (begp < endp && *(endp - 1) < 0x80) endp--;
4136 break;
4137 }
4138
4139 *beg += begp - begp_orig;
4140 *end += endp - endp_orig;
4141 return;
4142}
4143
88993dfd
KH
/* Shrinking the conversion region has some overhead of its own, so it
   is attempted only for regions longer than this many bytes.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the byte region *BEG .. *END for CODING with
   shrink_encoding_region (if ENCODEP is nonzero) or
   shrink_decoding_region (otherwise), provided the region is longer
   than shrink_conversion_region_threshhold.  BEG and END are pointers
   to the positions; STR is passed through to the shrinking function.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do                                                                    \
    {                                                                   \
      if (shrink_conversion_region_threshhold < *(end) - *(beg))        \
        {                                                               \
          if (encodep)                                                  \
            shrink_encoding_region (beg, end, coding, str);             \
          else                                                          \
            shrink_decoding_region (beg, end, coding, str);             \
        }                                                               \
    }                                                                   \
  while (0)
4157
d46c5b12 4158/* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
fb88bf2d
KH
4159 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4160 coding system CODING, and return the status code of code conversion
4161 (currently, this value has no meaning).
4162
4163 How many characters (and bytes) are converted to how many
4164 characters (and bytes) are recorded in members of the structure
4165 CODING.
d46c5b12 4166
6e44253b 4167 If REPLACE is nonzero, we do various things as if the original text
d46c5b12 4168 is deleted and a new text is inserted. See the comments in
6e44253b 4169 replace_range (insdel.c) to know what we are doing. */
4ed46869
KH
4170
4171int
6e44253b
KH
4172code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
4173 int from, from_byte, to, to_byte, encodep, replace;
4ed46869 4174 struct coding_system *coding;
4ed46869 4175{
fb88bf2d
KH
4176 int len = to - from, len_byte = to_byte - from_byte;
4177 int require, inserted, inserted_byte;
12410ef1 4178 int head_skip, tail_skip, total_skip;
84d60297 4179 Lisp_Object saved_coding_symbol;
fb88bf2d
KH
4180 int multibyte = !NILP (current_buffer->enable_multibyte_characters);
4181 int first = 1;
4182 int fake_multibyte = 0;
4183 unsigned char *src, *dst;
84d60297 4184 Lisp_Object deletion;
e133c8fa 4185 int orig_point = PT, orig_len = len;
6abb9bd9 4186 int prev_Z;
84d60297
RS
4187
4188 deletion = Qnil;
4189 saved_coding_symbol = Qnil;
d46c5b12 4190
83fa074f 4191 if (from < PT && PT < to)
e133c8fa
KH
4192 {
4193 TEMP_SET_PT_BOTH (from, from_byte);
4194 orig_point = from;
4195 }
83fa074f 4196
6e44253b 4197 if (replace)
d46c5b12 4198 {
fb88bf2d
KH
4199 int saved_from = from;
4200
d46c5b12 4201 prepare_to_modify_buffer (from, to, &from);
fb88bf2d
KH
4202 if (saved_from != from)
4203 {
4204 to = from + len;
4205 if (multibyte)
4206 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
4207 else
4208 from_byte = from, to_byte = to;
4209 len_byte = to_byte - from_byte;
4210 }
d46c5b12 4211 }
d46c5b12
KH
4212
4213 if (! encodep && CODING_REQUIRE_DETECTION (coding))
4214 {
12410ef1 4215 /* We must detect encoding of text and eol format. */
d46c5b12
KH
4216
4217 if (from < GPT && to > GPT)
4218 move_gap_both (from, from_byte);
4219 if (coding->type == coding_type_undecided)
4220 {
fb88bf2d 4221 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
d46c5b12 4222 if (coding->type == coding_type_undecided)
12410ef1
KH
4223 /* It seems that the text contains only ASCII, but we
4224 should not left it undecided because the deeper
4225 decoding routine (decode_coding) tries to detect the
4226 encodings again in vain. */
d46c5b12
KH
4227 coding->type = coding_type_emacs_mule;
4228 }
4229 if (coding->eol_type == CODING_EOL_UNDECIDED)
4230 {
4231 saved_coding_symbol = coding->symbol;
4232 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
4233 if (coding->eol_type == CODING_EOL_UNDECIDED)
4234 coding->eol_type = CODING_EOL_LF;
4235 /* We had better recover the original eol format if we
4236 encounter an inconsitent eol format while decoding. */
4237 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4238 }
4239 }
4240
fb88bf2d
KH
4241 coding->consumed_char = len, coding->consumed = len_byte;
4242
d46c5b12
KH
4243 if (encodep
4244 ? ! CODING_REQUIRE_ENCODING (coding)
4245 : ! CODING_REQUIRE_DECODING (coding))
fb88bf2d
KH
4246 {
4247 coding->produced = len_byte;
12410ef1
KH
4248 if (multibyte
4249 && ! replace
4250 /* See the comment of the member heading_ascii in coding.h. */
4251 && coding->heading_ascii < len_byte)
fb88bf2d 4252 {
6e44253b
KH
4253 /* We still may have to combine byte at the head and the
4254 tail of the text in the region. */
12410ef1 4255 if (from < GPT && GPT < to)
6e44253b 4256 move_gap_both (to, to_byte);
12410ef1
KH
4257 len = multibyte_chars_in_text (BYTE_POS_ADDR (from_byte), len_byte);
4258 adjust_after_insert (from, from_byte, to, to_byte, len);
4259 coding->produced_char = len;
fb88bf2d
KH
4260 }
4261 else
68e3a8f1
AS
4262 {
4263 if (!replace)
4264 adjust_after_insert (from, from_byte, to, to_byte, len_byte);
4265 coding->produced_char = len_byte;
4266 }
fb88bf2d
KH
4267 return 0;
4268 }
d46c5b12
KH
4269
4270 /* Now we convert the text. */
4271
4272 /* For encoding, we must process pre-write-conversion in advance. */
4273 if (encodep
d46c5b12
KH
4274 && ! NILP (coding->pre_write_conversion)
4275 && SYMBOLP (coding->pre_write_conversion)
4276 && ! NILP (Ffboundp (coding->pre_write_conversion)))
4277 {
2b4f9037
KH
4278 /* The function in pre-write-conversion may put a new text in a
4279 new buffer. */
0007bdd0
KH
4280 struct buffer *prev = current_buffer;
4281 Lisp_Object new;
d46c5b12 4282
b39f748c
AS
4283 call2 (coding->pre_write_conversion,
4284 make_number (from), make_number (to));
d46c5b12
KH
4285 if (current_buffer != prev)
4286 {
4287 len = ZV - BEGV;
0007bdd0 4288 new = Fcurrent_buffer ();
d46c5b12 4289 set_buffer_internal_1 (prev);
ddbc19ff 4290 del_range_2 (from, from_byte, to, to_byte);
e133c8fa 4291 TEMP_SET_PT_BOTH (from, from_byte);
0007bdd0
KH
4292 insert_from_buffer (XBUFFER (new), 1, len, 0);
4293 Fkill_buffer (new);
e133c8fa
KH
4294 if (orig_point >= to)
4295 orig_point += len - orig_len;
4296 else if (orig_point > from)
4297 orig_point = from;
4298 orig_len = len;
d46c5b12 4299 to = from + len;
e133c8fa 4300 from_byte = multibyte ? CHAR_TO_BYTE (from) : from_byte;
fb88bf2d 4301 to_byte = multibyte ? CHAR_TO_BYTE (to) : to;
d46c5b12 4302 len_byte = to_byte - from_byte;
e133c8fa 4303 TEMP_SET_PT_BOTH (from, from_byte);
d46c5b12
KH
4304 }
4305 }
4306
12410ef1
KH
4307 if (replace)
4308 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
4309
d46c5b12 4310 /* Try to skip the heading and tailing ASCIIs. */
12410ef1
KH
4311 {
4312 int from_byte_orig = from_byte, to_byte_orig = to_byte;
4313
4314 if (from < GPT && GPT < to)
4315 move_gap_both (from, from_byte);
88993dfd 4316 SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
d4e57bcd 4317 if (from_byte == to_byte
944bd420 4318 && coding->type != coding_type_ccl
d4e57bcd
KH
4319 && ! (coding->mode & CODING_MODE_LAST_BLOCK
4320 && CODING_REQUIRE_FLUSHING (coding)))
12410ef1
KH
4321 {
4322 coding->produced = len_byte;
4323 coding->produced_char = multibyte ? len : len_byte;
4324 if (!replace)
4325 /* We must record and adjust for this new text now. */
4326 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
4327 return 0;
4328 }
fb88bf2d 4329
12410ef1
KH
4330 head_skip = from_byte - from_byte_orig;
4331 tail_skip = to_byte_orig - to_byte;
4332 total_skip = head_skip + tail_skip;
4333 from += head_skip;
4334 to -= tail_skip;
4335 len -= total_skip; len_byte -= total_skip;
4336 }
d46c5b12 4337
88993dfd 4338 /* The code conversion routine can not preserve text properties for
55d8d769
KH
4339 now. So, we must remove all text properties in the region.
4340 Here, we must suppress all modification hooks. */
88993dfd 4341 if (replace)
55d8d769
KH
4342 {
4343 int saved_inhibit_modification_hooks = inhibit_modification_hooks;
4344 inhibit_modification_hooks = 1;
4345 Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
4346 inhibit_modification_hooks = saved_inhibit_modification_hooks;
4347 }
88993dfd 4348
fb88bf2d
KH
4349 /* For converion, we must put the gap before the text in addition to
4350 making the gap larger for efficient decoding. The required gap
4351 size starts from 2000 which is the magic number used in make_gap.
4352 But, after one batch of conversion, it will be incremented if we
4353 find that it is not enough . */
d46c5b12
KH
4354 require = 2000;
4355
4356 if (GAP_SIZE < require)
4357 make_gap (require - GAP_SIZE);
4358 move_gap_both (from, from_byte);
4359
d46c5b12 4360 inserted = inserted_byte = 0;
fb88bf2d
KH
4361 src = GAP_END_ADDR, dst = GPT_ADDR;
4362
4363 GAP_SIZE += len_byte;
4364 ZV -= len;
4365 Z -= len;
4366 ZV_BYTE -= len_byte;
4367 Z_BYTE -= len_byte;
4368
f2558efd
KH
4369 if (GPT - BEG < beg_unchanged)
4370 beg_unchanged = GPT - BEG;
4371 if (Z - GPT < end_unchanged)
4372 end_unchanged = Z - GPT;
4373
d46c5b12
KH
4374 for (;;)
4375 {
fb88bf2d 4376 int result;
d46c5b12
KH
4377
4378 /* The buffer memory is changed from:
fb88bf2d
KH
4379 +--------+converted-text+---------+-------original-text------+---+
4380 |<-from->|<--inserted-->|---------|<-----------len---------->|---|
4381 |<------------------- GAP_SIZE -------------------->| */
d46c5b12 4382 if (encodep)
fb88bf2d 4383 result = encode_coding (coding, src, dst, len_byte, 0);
d46c5b12 4384 else
fb88bf2d 4385 result = decode_coding (coding, src, dst, len_byte, 0);
d46c5b12
KH
4386 /* to:
4387 +--------+-------converted-text--------+--+---original-text--+---+
fb88bf2d
KH
4388 |<-from->|<--inserted-->|<--produced-->|--|<-(len-consumed)->|---|
4389 |<------------------- GAP_SIZE -------------------->| */
4390 if (coding->fake_multibyte)
4391 fake_multibyte = 1;
d46c5b12 4392
fb88bf2d
KH
4393 if (!encodep && !multibyte)
4394 coding->produced_char = coding->produced;
d46c5b12
KH
4395 inserted += coding->produced_char;
4396 inserted_byte += coding->produced;
d46c5b12 4397 len_byte -= coding->consumed;
fb88bf2d
KH
4398 src += coding->consumed;
4399 dst += inserted_byte;
d46c5b12 4400
9864ebce
KH
4401 if (result == CODING_FINISH_NORMAL)
4402 {
4403 src += len_byte;
4404 break;
4405 }
d46c5b12
KH
4406 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4407 {
fb88bf2d 4408 unsigned char *pend = dst, *p = pend - inserted_byte;
d46c5b12
KH
4409
4410 /* Encode LFs back to the original eol format (CR or CRLF). */
4411 if (coding->eol_type == CODING_EOL_CR)
4412 {
4413 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
4414 }
4415 else
4416 {
d46c5b12
KH
4417 int count = 0;
4418
fb88bf2d
KH
4419 while (p < pend) if (*p++ == '\n') count++;
4420 if (src - dst < count)
d46c5b12 4421 {
fb88bf2d
KH
4422 /* We don't have sufficient room for putting LFs
4423 back to CRLF. We must record converted and
4424 not-yet-converted text back to the buffer
4425 content, enlarge the gap, then record them out of
4426 the buffer contents again. */
4427 int add = len_byte + inserted_byte;
4428
4429 GAP_SIZE -= add;
4430 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4431 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4432 make_gap (count - GAP_SIZE);
4433 GAP_SIZE += add;
4434 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4435 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4436 /* Don't forget to update SRC, DST, and PEND. */
4437 src = GAP_END_ADDR - len_byte;
4438 dst = GPT_ADDR + inserted_byte;
4439 pend = dst;
d46c5b12 4440 }
d46c5b12
KH
4441 inserted += count;
4442 inserted_byte += count;
fb88bf2d
KH
4443 coding->produced += count;
4444 p = dst = pend + count;
4445 while (count)
4446 {
4447 *--p = *--pend;
4448 if (*p == '\n') count--, *--p = '\r';
4449 }
d46c5b12
KH
4450 }
4451
4452 /* Suppress eol-format conversion in the further conversion. */
4453 coding->eol_type = CODING_EOL_LF;
4454
4455 /* Restore the original symbol. */
4456 coding->symbol = saved_coding_symbol;
fb88bf2d
KH
4457
4458 continue;
d46c5b12
KH
4459 }
4460 if (len_byte <= 0)
944bd420
KH
4461 {
4462 if (coding->type != coding_type_ccl
4463 || coding->mode & CODING_MODE_LAST_BLOCK)
4464 break;
4465 coding->mode |= CODING_MODE_LAST_BLOCK;
4466 continue;
4467 }
d46c5b12
KH
4468 if (result == CODING_FINISH_INSUFFICIENT_SRC)
4469 {
4470 /* The source text ends in invalid codes. Let's just
4471 make them valid buffer contents, and finish conversion. */
fb88bf2d 4472 inserted += len_byte;
d46c5b12 4473 inserted_byte += len_byte;
fb88bf2d 4474 while (len_byte--)
ee59c65f 4475 *dst++ = *src++;
fb88bf2d 4476 fake_multibyte = 1;
d46c5b12
KH
4477 break;
4478 }
9864ebce
KH
4479 if (result == CODING_FINISH_INTERRUPT)
4480 {
4481 /* The conversion procedure was interrupted by a user. */
4482 fake_multibyte = 1;
4483 break;
4484 }
4485 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
4486 if (coding->consumed < 1)
4487 {
4488 /* It's quite strange to require more memory without
4489 consuming any bytes. Perhaps CCL program bug. */
4490 fake_multibyte = 1;
4491 break;
4492 }
fb88bf2d
KH
4493 if (first)
4494 {
4495 /* We have just done the first batch of conversion which was
4496 stoped because of insufficient gap. Let's reconsider the
4497 required gap size (i.e. SRT - DST) now.
4498
4499 We have converted ORIG bytes (== coding->consumed) into
4500 NEW bytes (coding->produced). To convert the remaining
4501 LEN bytes, we may need REQUIRE bytes of gap, where:
4502 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
4503 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
4504 Here, we are sure that NEW >= ORIG. */
6e44253b
KH
4505 float ratio = coding->produced - coding->consumed;
4506 ratio /= coding->consumed;
4507 require = len_byte * ratio;
fb88bf2d
KH
4508 first = 0;
4509 }
4510 if ((src - dst) < (require + 2000))
4511 {
4512 /* See the comment above the previous call of make_gap. */
4513 int add = len_byte + inserted_byte;
4514
4515 GAP_SIZE -= add;
4516 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4517 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4518 make_gap (require + 2000);
4519 GAP_SIZE += add;
4520 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4521 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4522 /* Don't forget to update SRC, DST. */
4523 src = GAP_END_ADDR - len_byte;
4524 dst = GPT_ADDR + inserted_byte;
4525 }
d46c5b12 4526 }
fb88bf2d
KH
4527 if (src - dst > 0) *dst = 0; /* Put an anchor. */
4528
2b4f9037 4529 if (multibyte
88993dfd
KH
4530 && (encodep
4531 || fake_multibyte
4532 || (to - from) != (to_byte - from_byte)))
2b4f9037 4533 inserted = multibyte_chars_in_text (GPT_ADDR, inserted_byte);
7553d0e1 4534
12410ef1
KH
4535 /* If we have shrinked the conversion area, adjust it now. */
4536 if (total_skip > 0)
4537 {
4538 if (tail_skip > 0)
4539 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
4540 inserted += total_skip; inserted_byte += total_skip;
4541 GAP_SIZE += total_skip;
4542 GPT -= head_skip; GPT_BYTE -= head_skip;
4543 ZV -= total_skip; ZV_BYTE -= total_skip;
4544 Z -= total_skip; Z_BYTE -= total_skip;
4545 from -= head_skip; from_byte -= head_skip;
4546 to += tail_skip; to_byte += tail_skip;
4547 }
4548
6abb9bd9 4549 prev_Z = Z;
12410ef1 4550 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
6abb9bd9 4551 inserted = Z - prev_Z;
4ed46869 4552
2b4f9037 4553 if (! encodep && ! NILP (coding->post_read_conversion))
d46c5b12 4554 {
2b4f9037 4555 Lisp_Object val;
4ed46869 4556
e133c8fa
KH
4557 if (from != PT)
4558 TEMP_SET_PT_BOTH (from, from_byte);
6abb9bd9 4559 prev_Z = Z;
2b4f9037 4560 val = call1 (coding->post_read_conversion, make_number (inserted));
6abb9bd9 4561 CHECK_NUMBER (val, 0);
944bd420 4562 inserted += Z - prev_Z;
e133c8fa
KH
4563 }
4564
4565 if (orig_point >= from)
4566 {
4567 if (orig_point >= from + orig_len)
4568 orig_point += inserted - orig_len;
4569 else
4570 orig_point = from;
4571 TEMP_SET_PT (orig_point);
d46c5b12 4572 }
4ed46869 4573
2b4f9037
KH
4574 signal_after_change (from, to - from, inserted);
4575
fb88bf2d 4576 {
12410ef1
KH
4577 coding->consumed = to_byte - from_byte;
4578 coding->consumed_char = to - from;
4579 coding->produced = inserted_byte;
4580 coding->produced_char = inserted;
fb88bf2d 4581 }
7553d0e1 4582
fb88bf2d 4583 return 0;
d46c5b12
KH
4584}
4585
/* Encode (ENCODEP nonzero) or decode (ENCODEP zero) the Lisp string
   STR according to CODING and return the resulting Lisp string.

   If CODING has a pre-write-conversion (when encoding) or a
   post-read-conversion (when decoding), the text is copied into a
   temporary buffer and converted there with code_convert_region.
   Otherwise the conversion is done into a conversion buffer, and a
   unibyte (encoding) or multibyte (decoding) string is made from it.

   If nothing in STR needs conversion and CODING is not of CCL type,
   STR itself is returned when NOCOPY is nonzero, else a copy of it.  */
4586Lisp_Object
4587code_convert_string (str, coding, encodep, nocopy)
4588 Lisp_Object str;
4ed46869 4589 struct coding_system *coding;
d46c5b12 4590 int encodep, nocopy;
4ed46869 4591{
d46c5b12
KH
4592 int len;
4593 char *buf;
fc932ac6
RS
/* FROM and TO_BYTE later become the byte offsets that delimit the
   part of STR actually handed to the converter.  */
4594 int from = 0, to = XSTRING (str)->size;
4595 int to_byte = STRING_BYTES (XSTRING (str));
d46c5b12 4596 struct gcpro gcpro1;
84d60297 4597 Lisp_Object saved_coding_symbol;
d46c5b12 4598 int result;
4ed46869 4599
84d60297 4600 saved_coding_symbol = Qnil;
d46c5b12
KH
4601 if (encodep && !NILP (coding->pre_write_conversion)
4602 || !encodep && !NILP (coding->post_read_conversion))
4603 {
4604 /* Since we have to call Lisp functions which assume target text
4605 is in a buffer, after setting a temporary buffer, call
4606 code_convert_region. */
4607 int count = specpdl_ptr - specpdl;
4608 struct buffer *prev = current_buffer;
e133c8fa 4609
d46c5b12
KH
/* Arrange for the current buffer to be restored when unwinding.  */
4610 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
4611 temp_output_buffer_setup (" *code-converting-work*");
4612 set_buffer_internal (XBUFFER (Vstandard_output));
4613 if (encodep)
4614 insert_from_string (str, 0, 0, to, to_byte, 0);
4615 else
4616 {
4617 /* We must insert the contents of STR as is without
4618 unibyte<->multibyte conversion. */
4619 current_buffer->enable_multibyte_characters = Qnil;
4620 insert_from_string (str, 0, 0, to_byte, to_byte, 0);
4621 current_buffer->enable_multibyte_characters = Qt;
4622 }
fb88bf2d 4623 code_convert_region (BEGV, BEGV_BYTE, ZV, ZV_BYTE, coding, encodep, 1);
d46c5b12
KH
4624 if (encodep)
4625 /* We must return the buffer contents as unibyte string. */
4626 current_buffer->enable_multibyte_characters = Qnil;
4627 str = make_buffer_string (BEGV, ZV, 0);
4628 set_buffer_internal (prev);
4629 return unbind_to (count, str);
4630 }
4ed46869 4631
d46c5b12
KH
4632 if (! encodep && CODING_REQUIRE_DETECTION (coding))
4633 {
4634 /* See the comments in code_convert_region. */
4635 if (coding->type == coding_type_undecided)
4636 {
4637 detect_coding (coding, XSTRING (str)->data, to_byte);
/* If detection left the type undecided, fall back to emacs-mule.  */
4638 if (coding->type == coding_type_undecided)
4639 coding->type = coding_type_emacs_mule;
4640 }
4641 if (coding->eol_type == CODING_EOL_UNDECIDED)
4642 {
/* Remember the symbol so that it can be restored if decoding reports
   an inconsistent eol format (see the retry below).  */
4643 saved_coding_symbol = coding->symbol;
4644 detect_eol (coding, XSTRING (str)->data, to_byte);
4645 if (coding->eol_type == CODING_EOL_UNDECIDED)
4646 coding->eol_type = CODING_EOL_LF;
4647 /* We had better recover the original eol format if we
4648 encounter an inconsistent eol format while decoding. */
4649 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4650 }
4651 }
4ed46869 4652
d46c5b12
KH
/* When CODING requires no conversion in this direction, make the
   region to convert empty (FROM == TO_BYTE).  */
4653 if (encodep
4654 ? ! CODING_REQUIRE_ENCODING (coding)
4655 : ! CODING_REQUIRE_DECODING (coding))
4656 from = to_byte;
4657 else
4658 {
4659 /* Try to skip the heading and tailing ASCIIs. */
88993dfd
KH
4660 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
4661 encodep);
d46c5b12 4662 }
e133c8fa
KH
/* Nothing is left to convert; CCL coding systems are still run.  */
4663 if (from == to_byte
4664 && coding->type != coding_type_ccl)
d46c5b12 4665 return (nocopy ? str : Fcopy_sequence (str));
4ed46869 4666
d46c5b12
KH
4667 if (encodep)
4668 len = encoding_buffer_size (coding, to_byte - from);
4669 else
4670 len = decoding_buffer_size (coding, to_byte - from);
/* Add room for the head and tail of STR that are copied unconverted.  */
fc932ac6 4671 len += from + STRING_BYTES (XSTRING (str)) - to_byte;
d46c5b12
KH
4672 GCPRO1 (str);
4673 buf = get_conversion_buffer (len);
4674 UNGCPRO;
4ed46869 4675
d46c5b12
KH
/* Copy the skipped head verbatim, then convert the middle part.  */
4676 if (from > 0)
4677 bcopy (XSTRING (str)->data, buf, from);
4678 result = (encodep
4679 ? encode_coding (coding, XSTRING (str)->data + from,
4680 buf + from, to_byte - from, len)
4681 : decode_coding (coding, XSTRING (str)->data + from,
f30cc612 4682 buf + from, to_byte - from, len));
d46c5b12 4683 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4ed46869 4684 {
d46c5b12
KH
4685 /* We simply try to decode the whole string again but without
4686 eol-conversion this time. */
4687 coding->eol_type = CODING_EOL_LF;
4688 coding->symbol = saved_coding_symbol;
4689 return code_convert_string (str, coding, encodep, nocopy);
4ed46869 4690 }
d46c5b12
KH
4691
/* Append the skipped tail verbatim after the converted bytes.  */
4692 bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
fc932ac6 4693 STRING_BYTES (XSTRING (str)) - to_byte);
d46c5b12 4694
/* LEN is now the number of bytes copied unconverted (head + tail).  */
fc932ac6 4695 len = from + STRING_BYTES (XSTRING (str)) - to_byte;
d46c5b12
KH
4696 if (encodep)
4697 str = make_unibyte_string (buf, len + coding->produced);
4698 else
826bfb8b
KH
4699 {
/* With fake_multibyte set, the character count is recomputed from the
   produced bytes instead of taken from coding->produced_char.  */
4700 int chars= (coding->fake_multibyte
4701 ? multibyte_chars_in_text (buf + from, coding->produced)
4702 : coding->produced_char);
4703 str = make_multibyte_string (buf, len + chars, len + coding->produced);
4704 }
4705
d46c5b12 4706 return str;
4ed46869
KH
4707}
4708
4709\f
4710#ifdef emacs
1397dc18 4711/*** 8. Emacs Lisp library functions ***/
4ed46869 4712
4ed46869
KH
4713DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
4714 "Return t if OBJECT is nil or a coding-system.\n\
3a73fa5d
RS
4715See the documentation of `make-coding-system' for information\n\
4716about coding-system objects.")
4ed46869
KH
4717 (obj)
4718 Lisp_Object obj;
4719{
4608c386
KH
4720 if (NILP (obj))
4721 return Qt;
4722 if (!SYMBOLP (obj))
4723 return Qnil;
4724 /* Get coding-spec vector for OBJ. */
4725 obj = Fget (obj, Qcoding_system);
4726 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
4727 ? Qt : Qnil);
4ed46869
KH
4728}
4729
9d991de8
RS
4730DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
4731 Sread_non_nil_coding_system, 1, 1, 0,
e0e989f6 4732 "Read a coding system from the minibuffer, prompting with string PROMPT.")
4ed46869
KH
4733 (prompt)
4734 Lisp_Object prompt;
4735{
e0e989f6 4736 Lisp_Object val;
9d991de8
RS
4737 do
4738 {
4608c386
KH
4739 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
4740 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
9d991de8
RS
4741 }
4742 while (XSTRING (val)->size == 0);
e0e989f6 4743 return (Fintern (val, Qnil));
4ed46869
KH
4744}
4745
9b787f3e
RS
4746DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
4747 "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
4748If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
4749 (prompt, default_coding_system)
4750 Lisp_Object prompt, default_coding_system;
4ed46869 4751{
f44d27ce 4752 Lisp_Object val;
9b787f3e
RS
4753 if (SYMBOLP (default_coding_system))
4754 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
4608c386 4755 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
9b787f3e
RS
4756 Qt, Qnil, Qcoding_system_history,
4757 default_coding_system, Qnil);
e0e989f6 4758 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
4759}
4760
4761DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
4762 1, 1, 0,
4763 "Check validity of CODING-SYSTEM.\n\
3a73fa5d
RS
4764If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
4765It is valid if it is a symbol with a non-nil `coding-system' property.\n\
4ed46869
KH
4766The value of property should be a vector of length 5.")
4767 (coding_system)
4768 Lisp_Object coding_system;
4769{
4770 CHECK_SYMBOL (coding_system, 0);
4771 if (!NILP (Fcoding_system_p (coding_system)))
4772 return coding_system;
4773 while (1)
02ba4723 4774 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4ed46869 4775}
3a73fa5d 4776\f
d46c5b12
KH
4777Lisp_Object
4778detect_coding_system (src, src_bytes, highest)
4779 unsigned char *src;
4780 int src_bytes, highest;
4ed46869
KH
4781{
4782 int coding_mask, eol_type;
d46c5b12
KH
4783 Lisp_Object val, tmp;
4784 int dummy;
4ed46869 4785
d46c5b12
KH
4786 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy);
4787 eol_type = detect_eol_type (src, src_bytes, &dummy);
4788 if (eol_type == CODING_EOL_INCONSISTENT)
25b02698 4789 eol_type = CODING_EOL_UNDECIDED;
4ed46869 4790
d46c5b12 4791 if (!coding_mask)
4ed46869 4792 {
27901516 4793 val = Qundecided;
d46c5b12 4794 if (eol_type != CODING_EOL_UNDECIDED)
4ed46869 4795 {
f44d27ce
RS
4796 Lisp_Object val2;
4797 val2 = Fget (Qundecided, Qeol_type);
4ed46869
KH
4798 if (VECTORP (val2))
4799 val = XVECTOR (val2)->contents[eol_type];
4800 }
80e803b4 4801 return (highest ? val : Fcons (val, Qnil));
4ed46869 4802 }
4ed46869 4803
d46c5b12
KH
4804 /* At first, gather possible coding systems in VAL. */
4805 val = Qnil;
4806 for (tmp = Vcoding_category_list; !NILP (tmp); tmp = XCONS (tmp)->cdr)
4ed46869 4807 {
d46c5b12
KH
4808 int idx
4809 = XFASTINT (Fget (XCONS (tmp)->car, Qcoding_category_index));
4810 if (coding_mask & (1 << idx))
4ed46869 4811 {
d46c5b12
KH
4812 val = Fcons (Fsymbol_value (XCONS (tmp)->car), val);
4813 if (highest)
4814 break;
4ed46869
KH
4815 }
4816 }
d46c5b12
KH
4817 if (!highest)
4818 val = Fnreverse (val);
4ed46869 4819
65059037 4820 /* Then, replace the elements with subsidiary coding systems. */
d46c5b12 4821 for (tmp = val; !NILP (tmp); tmp = XCONS (tmp)->cdr)
4ed46869 4822 {
65059037
RS
4823 if (eol_type != CODING_EOL_UNDECIDED
4824 && eol_type != CODING_EOL_INCONSISTENT)
4ed46869 4825 {
d46c5b12
KH
4826 Lisp_Object eol;
4827 eol = Fget (XCONS (tmp)->car, Qeol_type);
4828 if (VECTORP (eol))
4829 XCONS (tmp)->car = XVECTOR (eol)->contents[eol_type];
4ed46869
KH
4830 }
4831 }
d46c5b12
KH
4832 return (highest ? XCONS (val)->car : val);
4833}
4ed46869 4834
d46c5b12
KH
4835DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
4836 2, 3, 0,
4837 "Detect coding system of the text in the region between START and END.\n\
4838Return a list of possible coding systems ordered by priority.\n\
4839\n\
80e803b4
KH
4840If only ASCII characters are found, it returns a list of single element\n\
4841`undecided' or its subsidiary coding system according to a detected\n\
4842end-of-line format.\n\
d46c5b12
KH
4843\n\
4844If optional argument HIGHEST is non-nil, return the coding system of\n\
4845highest priority.")
4846 (start, end, highest)
4847 Lisp_Object start, end, highest;
4848{
4849 int from, to;
4850 int from_byte, to_byte;
6289dd10 4851
d46c5b12
KH
4852 CHECK_NUMBER_COERCE_MARKER (start, 0);
4853 CHECK_NUMBER_COERCE_MARKER (end, 1);
4ed46869 4854
d46c5b12
KH
4855 validate_region (&start, &end);
4856 from = XINT (start), to = XINT (end);
4857 from_byte = CHAR_TO_BYTE (from);
4858 to_byte = CHAR_TO_BYTE (to);
6289dd10 4859
d46c5b12
KH
4860 if (from < GPT && to >= GPT)
4861 move_gap_both (to, to_byte);
4ed46869 4862
d46c5b12
KH
4863 return detect_coding_system (BYTE_POS_ADDR (from_byte),
4864 to_byte - from_byte,
4865 !NILP (highest));
4866}
6289dd10 4867
d46c5b12
KH
4868DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
4869 1, 2, 0,
4870 "Detect coding system of the text in STRING.\n\
4871Return a list of possible coding systems ordered by priority.\n\
4872\n\
80e803b4
KH
4873If only ASCII characters are found, it returns a list of single element\n\
4874`undecided' or its subsidiary coding system according to a detected\n\
4875end-of-line format.\n\
d46c5b12
KH
4876\n\
4877If optional argument HIGHEST is non-nil, return the coding system of\n\
4878highest priority.")
4879 (string, highest)
4880 Lisp_Object string, highest;
4881{
4882 CHECK_STRING (string, 0);
4ed46869 4883
d46c5b12 4884 return detect_coding_system (XSTRING (string)->data,
fc932ac6 4885 STRING_BYTES (XSTRING (string)),
d46c5b12 4886 !NILP (highest));
4ed46869
KH
4887}
4888
4031e2bf
KH
4889Lisp_Object
4890code_convert_region1 (start, end, coding_system, encodep)
d46c5b12 4891 Lisp_Object start, end, coding_system;
4031e2bf 4892 int encodep;
3a73fa5d
RS
4893{
4894 struct coding_system coding;
4031e2bf 4895 int from, to, len;
3a73fa5d 4896
d46c5b12
KH
4897 CHECK_NUMBER_COERCE_MARKER (start, 0);
4898 CHECK_NUMBER_COERCE_MARKER (end, 1);
3a73fa5d
RS
4899 CHECK_SYMBOL (coding_system, 2);
4900
d46c5b12
KH
4901 validate_region (&start, &end);
4902 from = XFASTINT (start);
4903 to = XFASTINT (end);
4904
3a73fa5d 4905 if (NILP (coding_system))
d46c5b12
KH
4906 return make_number (to - from);
4907
3a73fa5d 4908 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
d46c5b12 4909 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
3a73fa5d 4910
d46c5b12 4911 coding.mode |= CODING_MODE_LAST_BLOCK;
fb88bf2d
KH
4912 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
4913 &coding, encodep, 1);
f072a3e8 4914 Vlast_coding_system_used = coding.symbol;
fb88bf2d 4915 return make_number (coding.produced_char);
4031e2bf
KH
4916}
4917
4918DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
4919 3, 3, "r\nzCoding system: ",
4920 "Decode the current region by specified coding system.\n\
4921When called from a program, takes three arguments:\n\
4922START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
f072a3e8
RS
4923This function sets `last-coding-system-used' to the precise coding system\n\
4924used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
4925not fully specified.)\n\
4926It returns the length of the decoded text.")
4031e2bf
KH
4927 (start, end, coding_system)
4928 Lisp_Object start, end, coding_system;
4929{
4930 return code_convert_region1 (start, end, coding_system, 0);
3a73fa5d
RS
4931}
4932
4933DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
4934 3, 3, "r\nzCoding system: ",
d46c5b12 4935 "Encode the current region by specified coding system.\n\
3a73fa5d 4936When called from a program, takes three arguments:\n\
d46c5b12 4937START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
f072a3e8
RS
4938This function sets `last-coding-system-used' to the precise coding system\n\
4939used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
4940not fully specified.)\n\
4941It returns the length of the encoded text.")
d46c5b12
KH
4942 (start, end, coding_system)
4943 Lisp_Object start, end, coding_system;
3a73fa5d 4944{
4031e2bf
KH
4945 return code_convert_region1 (start, end, coding_system, 1);
4946}
3a73fa5d 4947
4031e2bf
KH
4948Lisp_Object
4949code_convert_string1 (string, coding_system, nocopy, encodep)
4950 Lisp_Object string, coding_system, nocopy;
4951 int encodep;
4952{
4953 struct coding_system coding;
3a73fa5d 4954
4031e2bf
KH
4955 CHECK_STRING (string, 0);
4956 CHECK_SYMBOL (coding_system, 1);
4ed46869 4957
d46c5b12 4958 if (NILP (coding_system))
4031e2bf 4959 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869 4960
d46c5b12
KH
4961 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
4962 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5f1cd180 4963
d46c5b12 4964 coding.mode |= CODING_MODE_LAST_BLOCK;
f072a3e8 4965 Vlast_coding_system_used = coding.symbol;
4031e2bf 4966 return code_convert_string (string, &coding, encodep, !NILP (nocopy));
4ed46869
KH
4967}
4968
4ed46869 4969DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
e0e989f6
KH
4970 2, 3, 0,
4971 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
fe487a71 4972Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
f072a3e8
RS
4973if the decoding operation is trivial.\n\
4974This function sets `last-coding-system-used' to the precise coding system\n\
4975used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
4976not fully specified.)")
e0e989f6
KH
4977 (string, coding_system, nocopy)
4978 Lisp_Object string, coding_system, nocopy;
4ed46869 4979{
f072a3e8 4980 return code_convert_string1 (string, coding_system, nocopy, 0);
4ed46869
KH
4981}
4982
4983DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
e0e989f6
KH
4984 2, 3, 0,
4985 "Encode STRING to CODING-SYSTEM, and return the result.\n\
fe487a71 4986Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
f072a3e8
RS
4987if the encoding operation is trivial.\n\
4988This function sets `last-coding-system-used' to the precise coding system\n\
4989used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
4990not fully specified.)")
e0e989f6
KH
4991 (string, coding_system, nocopy)
4992 Lisp_Object string, coding_system, nocopy;
4ed46869 4993{
f072a3e8 4994 return code_convert_string1 (string, coding_system, nocopy, 1);
4ed46869 4995}
4031e2bf 4996
ecec61c1
KH
4997/* Encode or decode STRING according to CODING_SYSTEM.
4998 Do not set Vlast_coding_system_used. */
4999
5000Lisp_Object
5001code_convert_string_norecord (string, coding_system, encodep)
5002 Lisp_Object string, coding_system;
5003 int encodep;
5004{
5005 struct coding_system coding;
5006
5007 CHECK_STRING (string, 0);
5008 CHECK_SYMBOL (coding_system, 1);
5009
5010 if (NILP (coding_system))
5011 return string;
5012
5013 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5014 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5015
5016 coding.mode |= CODING_MODE_LAST_BLOCK;
5017 return code_convert_string (string, &coding, encodep, Qt);
5018}
3a73fa5d 5019\f
4ed46869 5020DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
55ab7be3 5021 "Decode a Japanese character which has CODE in shift_jis encoding.\n\
4ed46869
KH
5022Return the corresponding character.")
5023 (code)
5024 Lisp_Object code;
5025{
5026 unsigned char c1, c2, s1, s2;
5027 Lisp_Object val;
5028
5029 CHECK_NUMBER (code, 0);
5030 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
55ab7be3
KH
5031 if (s1 == 0)
5032 {
5033 if (s2 < 0xA0 || s2 > 0xDF)
5034 error ("Invalid Shift JIS code: %s", XFASTINT (code));
5035 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_katakana_jisx0201, s2, 0));
5036 }
5037 else
5038 {
5039 if ((s1 < 0x80 || s1 > 0x9F && s1 < 0xE0 || s1 > 0xEF)
5040 || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
5041 error ("Invalid Shift JIS code: %s", XFASTINT (code));
5042 DECODE_SJIS (s1, s2, c1, c2);
5043 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
5044 }
4ed46869
KH
5045 return val;
5046}
5047
5048DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
55ab7be3
KH
5049 "Encode a Japanese character CHAR to shift_jis encoding.\n\
5050Return the corresponding code in SJIS.")
4ed46869
KH
5051 (ch)
5052 Lisp_Object ch;
5053{
bcf26d6a 5054 int charset, c1, c2, s1, s2;
4ed46869
KH
5055 Lisp_Object val;
5056
5057 CHECK_NUMBER (ch, 0);
5058 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
55ab7be3
KH
5059 if (charset == charset_jisx0208
5060 && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
4ed46869
KH
5061 {
5062 ENCODE_SJIS (c1, c2, s1, s2);
bcf26d6a 5063 XSETFASTINT (val, (s1 << 8) | s2);
4ed46869 5064 }
55ab7be3
KH
5065 else if (charset == charset_katakana_jisx0201
5066 && c1 > 0x20 && c2 < 0xE0)
5067 {
5068 XSETFASTINT (val, c1 | 0x80);
5069 }
4ed46869 5070 else
55ab7be3
KH
5071 error ("Can't encode to shift_jis: %d", XFASTINT (ch));
5072
4ed46869
KH
5073 return val;
5074}
5075
5076DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
d46c5b12 5077 "Decode a Big5 character CODE of BIG5 coding system.\n\
4ed46869
KH
5078CODE is the character code in BIG5.\n\
5079Return the corresponding character.")
5080 (code)
5081 Lisp_Object code;
5082{
5083 int charset;
5084 unsigned char b1, b2, c1, c2;
5085 Lisp_Object val;
5086
5087 CHECK_NUMBER (code, 0);
5088 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
5089 DECODE_BIG5 (b1, b2, charset, c1, c2);
5090 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
5091 return val;
5092}
5093
5094DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
d46c5b12 5095 "Encode the Big5 character CHAR to BIG5 coding system.\n\
4ed46869
KH
5096Return the corresponding character code in Big5.")
5097 (ch)
5098 Lisp_Object ch;
5099{
bcf26d6a 5100 int charset, c1, c2, b1, b2;
4ed46869
KH
5101 Lisp_Object val;
5102
5103 CHECK_NUMBER (ch, 0);
5104 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5105 if (charset == charset_big5_1 || charset == charset_big5_2)
5106 {
5107 ENCODE_BIG5 (charset, c1, c2, b1, b2);
bcf26d6a 5108 XSETFASTINT (val, (b1 << 8) | b2);
4ed46869
KH
5109 }
5110 else
5111 XSETFASTINT (val, 0);
5112 return val;
5113}
3a73fa5d 5114\f
1ba9e4ab
KH
5115DEFUN ("set-terminal-coding-system-internal",
5116 Fset_terminal_coding_system_internal,
5117 Sset_terminal_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
5118 (coding_system)
5119 Lisp_Object coding_system;
5120{
5121 CHECK_SYMBOL (coding_system, 0);
5122 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
70c22245 5123 /* We had better not send unsafe characters to terminal. */
6e85d753
KH
5124 terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
5125
4ed46869
KH
5126 return Qnil;
5127}
5128
c4825358
KH
5129DEFUN ("set-safe-terminal-coding-system-internal",
5130 Fset_safe_terminal_coding_system_internal,
5131 Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
5132 (coding_system)
5133 Lisp_Object coding_system;
5134{
5135 CHECK_SYMBOL (coding_system, 0);
5136 setup_coding_system (Fcheck_coding_system (coding_system),
5137 &safe_terminal_coding);
5138 return Qnil;
5139}
5140
4ed46869
KH
5141DEFUN ("terminal-coding-system",
5142 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3a73fa5d 5143 "Return coding system specified for terminal output.")
4ed46869
KH
5144 ()
5145{
5146 return terminal_coding.symbol;
5147}
5148
1ba9e4ab
KH
5149DEFUN ("set-keyboard-coding-system-internal",
5150 Fset_keyboard_coding_system_internal,
5151 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
5152 (coding_system)
5153 Lisp_Object coding_system;
5154{
5155 CHECK_SYMBOL (coding_system, 0);
5156 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
5157 return Qnil;
5158}
5159
5160DEFUN ("keyboard-coding-system",
5161 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3a73fa5d 5162 "Return coding system specified for decoding keyboard input.")
4ed46869
KH
5163 ()
5164{
5165 return keyboard_coding.symbol;
5166}
5167
5168\f
a5d301df
KH
5169DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
5170 Sfind_operation_coding_system, 1, MANY, 0,
5171 "Choose a coding system for an operation based on the target name.\n\
69f76525 5172The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
9ce27fde
KH
5173DECODING-SYSTEM is the coding system to use for decoding\n\
5174\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
5175for encoding (in case OPERATION does encoding).\n\
ccdb79f5
RS
5176\n\
5177The first argument OPERATION specifies an I/O primitive:\n\
5178 For file I/O, `insert-file-contents' or `write-region'.\n\
5179 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
5180 For network I/O, `open-network-stream'.\n\
5181\n\
5182The remaining arguments should be the same arguments that were passed\n\
5183to the primitive. Depending on which primitive, one of those arguments\n\
5184is selected as the TARGET. For example, if OPERATION does file I/O,\n\
5185whichever argument specifies the file name is TARGET.\n\
5186\n\
5187TARGET has a meaning which depends on OPERATION:\n\
4ed46869
KH
5188 For file I/O, TARGET is a file name.\n\
5189 For process I/O, TARGET is a process name.\n\
5190 For network I/O, TARGET is a service name or a port number\n\
5191\n\
02ba4723
KH
5192This function looks up what specified for TARGET in,\n\
5193`file-coding-system-alist', `process-coding-system-alist',\n\
5194or `network-coding-system-alist' depending on OPERATION.\n\
5195They may specify a coding system, a cons of coding systems,\n\
5196or a function symbol to call.\n\
5197In the last case, we call the function with one argument,\n\
9ce27fde 5198which is a list of all the arguments given to this function.")
4ed46869
KH
5199 (nargs, args)
5200 int nargs;
5201 Lisp_Object *args;
5202{
5203 Lisp_Object operation, target_idx, target, val;
5204 register Lisp_Object chain;
5205
5206 if (nargs < 2)
5207 error ("Too few arguments");
5208 operation = args[0];
5209 if (!SYMBOLP (operation)
5210 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
5211 error ("Invalid first arguement");
5212 if (nargs < 1 + XINT (target_idx))
5213 error ("Too few arguments for operation: %s",
5214 XSYMBOL (operation)->name->data);
5215 target = args[XINT (target_idx) + 1];
5216 if (!(STRINGP (target)
5217 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
5218 error ("Invalid %dth argument", XINT (target_idx) + 1);
5219
2e34157c
RS
5220 chain = ((EQ (operation, Qinsert_file_contents)
5221 || EQ (operation, Qwrite_region))
02ba4723 5222 ? Vfile_coding_system_alist
2e34157c 5223 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
5224 ? Vnetwork_coding_system_alist
5225 : Vprocess_coding_system_alist));
4ed46869
KH
5226 if (NILP (chain))
5227 return Qnil;
5228
02ba4723 5229 for (; CONSP (chain); chain = XCONS (chain)->cdr)
4ed46869 5230 {
f44d27ce
RS
5231 Lisp_Object elt;
5232 elt = XCONS (chain)->car;
4ed46869
KH
5233
5234 if (CONSP (elt)
5235 && ((STRINGP (target)
5236 && STRINGP (XCONS (elt)->car)
5237 && fast_string_match (XCONS (elt)->car, target) >= 0)
5238 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
02ba4723
KH
5239 {
5240 val = XCONS (elt)->cdr;
b19fd4c5
KH
5241 /* Here, if VAL is both a valid coding system and a valid
5242 function symbol, we return VAL as a coding system. */
02ba4723
KH
5243 if (CONSP (val))
5244 return val;
5245 if (! SYMBOLP (val))
5246 return Qnil;
5247 if (! NILP (Fcoding_system_p (val)))
5248 return Fcons (val, val);
b19fd4c5
KH
5249 if (! NILP (Ffboundp (val)))
5250 {
5251 val = call1 (val, Flist (nargs, args));
5252 if (CONSP (val))
5253 return val;
5254 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
5255 return Fcons (val, val);
5256 }
02ba4723
KH
5257 return Qnil;
5258 }
4ed46869
KH
5259 }
5260 return Qnil;
5261}
5262
1397dc18
KH
5263DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal,
5264 Supdate_coding_systems_internal, 0, 0, 0,
5265 "Update internal database for ISO2022 and CCL based coding systems.\n\
d46c5b12
KH
5266When values of the following coding categories are changed, you must\n\
5267call this function:\n\
5268 coding-category-iso-7, coding-category-iso-7-tight,\n\
5269 coding-category-iso-8-1, coding-category-iso-8-2,\n\
1397dc18
KH
5270 coding-category-iso-7-else, coding-category-iso-8-else,\n\
5271 coding-category-ccl")
d46c5b12
KH
5272 ()
5273{
5274 int i;
5275
1397dc18 5276 for (i = CODING_CATEGORY_IDX_ISO_7; i <= CODING_CATEGORY_IDX_CCL; i++)
d46c5b12 5277 {
1397dc18
KH
5278 Lisp_Object val;
5279
5280 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value;
5281 if (!NILP (val))
5282 {
5283 if (! coding_system_table[i])
5284 coding_system_table[i] = ((struct coding_system *)
5285 xmalloc (sizeof (struct coding_system)));
5286 setup_coding_system (val, coding_system_table[i]);
5287 }
5288 else if (coding_system_table[i])
5289 {
5290 xfree (coding_system_table[i]);
5291 coding_system_table[i] = NULL;
5292 }
d46c5b12 5293 }
1397dc18 5294
d46c5b12
KH
5295 return Qnil;
5296}
5297
66cfb530
KH
5298DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
5299 Sset_coding_priority_internal, 0, 0, 0,
5300 "Update internal database for the current value of `coding-category-list'.\n\
5301This function is internal use only.")
5302 ()
5303{
5304 int i = 0, idx;
84d60297
RS
5305 Lisp_Object val;
5306
5307 val = Vcoding_category_list;
66cfb530
KH
5308
5309 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
5310 {
5311 if (! SYMBOLP (XCONS (val)->car))
5312 break;
5313 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
5314 if (idx >= CODING_CATEGORY_IDX_MAX)
5315 break;
5316 coding_priorities[i++] = (1 << idx);
5317 val = XCONS (val)->cdr;
5318 }
5319 /* If coding-category-list is valid and contains all coding
5320 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
5321 the following code saves Emacs from craching. */
5322 while (i < CODING_CATEGORY_IDX_MAX)
5323 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
5324
5325 return Qnil;
5326}
5327
4ed46869
KH
5328#endif /* emacs */
5329
5330\f
1397dc18 5331/*** 9. Post-amble ***/
4ed46869 5332
6d74c3aa
KH
5333void
5334init_coding ()
5335{
5336 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
5337}
5338
dfcf069d 5339void
4ed46869
KH
5340init_coding_once ()
5341{
5342 int i;
5343
0ef69138 5344 /* Emacs' internal format specific initialize routine. */
4ed46869
KH
5345 for (i = 0; i <= 0x20; i++)
5346 emacs_code_class[i] = EMACS_control_code;
5347 emacs_code_class[0x0A] = EMACS_linefeed_code;
5348 emacs_code_class[0x0D] = EMACS_carriage_return_code;
5349 for (i = 0x21 ; i < 0x7F; i++)
5350 emacs_code_class[i] = EMACS_ascii_code;
5351 emacs_code_class[0x7F] = EMACS_control_code;
5352 emacs_code_class[0x80] = EMACS_leading_code_composition;
5353 for (i = 0x81; i < 0xFF; i++)
5354 emacs_code_class[i] = EMACS_invalid_code;
5355 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
5356 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
5357 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
5358 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
5359
5360 /* ISO2022 specific initialize routine. */
5361 for (i = 0; i < 0x20; i++)
5362 iso_code_class[i] = ISO_control_code;
5363 for (i = 0x21; i < 0x7F; i++)
5364 iso_code_class[i] = ISO_graphic_plane_0;
5365 for (i = 0x80; i < 0xA0; i++)
5366 iso_code_class[i] = ISO_control_code;
5367 for (i = 0xA1; i < 0xFF; i++)
5368 iso_code_class[i] = ISO_graphic_plane_1;
5369 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
5370 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
5371 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
5372 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
5373 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
5374 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
5375 iso_code_class[ISO_CODE_ESC] = ISO_escape;
5376 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
5377 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
5378 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
5379
e0e989f6 5380 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
e0e989f6
KH
5381
5382 setup_coding_system (Qnil, &keyboard_coding);
5383 setup_coding_system (Qnil, &terminal_coding);
c4825358 5384 setup_coding_system (Qnil, &safe_terminal_coding);
6bc51348 5385 setup_coding_system (Qnil, &default_buffer_file_coding);
9ce27fde 5386
d46c5b12
KH
5387 bzero (coding_system_table, sizeof coding_system_table);
5388
66cfb530
KH
5389 bzero (ascii_skip_code, sizeof ascii_skip_code);
5390 for (i = 0; i < 128; i++)
5391 ascii_skip_code[i] = 1;
5392
9ce27fde
KH
5393#if defined (MSDOS) || defined (WINDOWSNT)
5394 system_eol_type = CODING_EOL_CRLF;
5395#else
5396 system_eol_type = CODING_EOL_LF;
5397#endif
e0e989f6
KH
5398}
5399
5400#ifdef emacs
5401
dfcf069d 5402void
e0e989f6
KH
5403syms_of_coding ()
5404{
5405 Qtarget_idx = intern ("target-idx");
5406 staticpro (&Qtarget_idx);
5407
bb0115a2
RS
5408 Qcoding_system_history = intern ("coding-system-history");
5409 staticpro (&Qcoding_system_history);
5410 Fset (Qcoding_system_history, Qnil);
5411
9ce27fde 5412 /* Target FILENAME is the first argument. */
e0e989f6 5413 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 5414 /* Target FILENAME is the third argument. */
e0e989f6
KH
5415 Fput (Qwrite_region, Qtarget_idx, make_number (2));
5416
5417 Qcall_process = intern ("call-process");
5418 staticpro (&Qcall_process);
9ce27fde 5419 /* Target PROGRAM is the first argument. */
e0e989f6
KH
5420 Fput (Qcall_process, Qtarget_idx, make_number (0));
5421
5422 Qcall_process_region = intern ("call-process-region");
5423 staticpro (&Qcall_process_region);
9ce27fde 5424 /* Target PROGRAM is the third argument. */
e0e989f6
KH
5425 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
5426
5427 Qstart_process = intern ("start-process");
5428 staticpro (&Qstart_process);
9ce27fde 5429 /* Target PROGRAM is the third argument. */
e0e989f6
KH
5430 Fput (Qstart_process, Qtarget_idx, make_number (2));
5431
5432 Qopen_network_stream = intern ("open-network-stream");
5433 staticpro (&Qopen_network_stream);
9ce27fde 5434 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
5435 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
5436
4ed46869
KH
5437 Qcoding_system = intern ("coding-system");
5438 staticpro (&Qcoding_system);
5439
5440 Qeol_type = intern ("eol-type");
5441 staticpro (&Qeol_type);
5442
5443 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
5444 staticpro (&Qbuffer_file_coding_system);
5445
5446 Qpost_read_conversion = intern ("post-read-conversion");
5447 staticpro (&Qpost_read_conversion);
5448
5449 Qpre_write_conversion = intern ("pre-write-conversion");
5450 staticpro (&Qpre_write_conversion);
5451
27901516
KH
5452 Qno_conversion = intern ("no-conversion");
5453 staticpro (&Qno_conversion);
5454
5455 Qundecided = intern ("undecided");
5456 staticpro (&Qundecided);
5457
4ed46869
KH
5458 Qcoding_system_p = intern ("coding-system-p");
5459 staticpro (&Qcoding_system_p);
5460
5461 Qcoding_system_error = intern ("coding-system-error");
5462 staticpro (&Qcoding_system_error);
5463
5464 Fput (Qcoding_system_error, Qerror_conditions,
5465 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
5466 Fput (Qcoding_system_error, Qerror_message,
9ce27fde 5467 build_string ("Invalid coding system"));
4ed46869 5468
d46c5b12
KH
5469 Qcoding_category = intern ("coding-category");
5470 staticpro (&Qcoding_category);
4ed46869
KH
5471 Qcoding_category_index = intern ("coding-category-index");
5472 staticpro (&Qcoding_category_index);
5473
d46c5b12
KH
5474 Vcoding_category_table
5475 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
5476 staticpro (&Vcoding_category_table);
4ed46869
KH
5477 {
5478 int i;
5479 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
5480 {
d46c5b12
KH
5481 XVECTOR (Vcoding_category_table)->contents[i]
5482 = intern (coding_category_name[i]);
5483 Fput (XVECTOR (Vcoding_category_table)->contents[i],
5484 Qcoding_category_index, make_number (i));
4ed46869
KH
5485 }
5486 }
5487
f967223b
KH
5488 Qtranslation_table = intern ("translation-table");
5489 staticpro (&Qtranslation_table);
1397dc18 5490 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
bdd9fb48 5491
f967223b
KH
5492 Qtranslation_table_id = intern ("translation-table-id");
5493 staticpro (&Qtranslation_table_id);
84fbb8a0 5494
f967223b
KH
5495 Qtranslation_table_for_decode = intern ("translation-table-for-decode");
5496 staticpro (&Qtranslation_table_for_decode);
a5d301df 5497
f967223b
KH
5498 Qtranslation_table_for_encode = intern ("translation-table-for-encode");
5499 staticpro (&Qtranslation_table_for_encode);
a5d301df 5500
70c22245
KH
5501 Qsafe_charsets = intern ("safe-charsets");
5502 staticpro (&Qsafe_charsets);
5503
1397dc18
KH
5504 Qvalid_codes = intern ("valid-codes");
5505 staticpro (&Qvalid_codes);
5506
9ce27fde
KH
5507 Qemacs_mule = intern ("emacs-mule");
5508 staticpro (&Qemacs_mule);
5509
d46c5b12
KH
5510 Qraw_text = intern ("raw-text");
5511 staticpro (&Qraw_text);
5512
4ed46869
KH
5513 defsubr (&Scoding_system_p);
5514 defsubr (&Sread_coding_system);
5515 defsubr (&Sread_non_nil_coding_system);
5516 defsubr (&Scheck_coding_system);
5517 defsubr (&Sdetect_coding_region);
d46c5b12 5518 defsubr (&Sdetect_coding_string);
4ed46869
KH
5519 defsubr (&Sdecode_coding_region);
5520 defsubr (&Sencode_coding_region);
5521 defsubr (&Sdecode_coding_string);
5522 defsubr (&Sencode_coding_string);
5523 defsubr (&Sdecode_sjis_char);
5524 defsubr (&Sencode_sjis_char);
5525 defsubr (&Sdecode_big5_char);
5526 defsubr (&Sencode_big5_char);
1ba9e4ab 5527 defsubr (&Sset_terminal_coding_system_internal);
c4825358 5528 defsubr (&Sset_safe_terminal_coding_system_internal);
4ed46869 5529 defsubr (&Sterminal_coding_system);
1ba9e4ab 5530 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 5531 defsubr (&Skeyboard_coding_system);
a5d301df 5532 defsubr (&Sfind_operation_coding_system);
1397dc18 5533 defsubr (&Supdate_coding_systems_internal);
66cfb530 5534 defsubr (&Sset_coding_priority_internal);
4ed46869 5535
4608c386
KH
5536 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
5537 "List of coding systems.\n\
5538\n\
5539Do not alter the value of this variable manually. This variable should be\n\
5540updated by the functions `make-coding-system' and\n\
5541`define-coding-system-alias'.");
5542 Vcoding_system_list = Qnil;
5543
5544 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
5545 "Alist of coding system names.\n\
5546Each element is one element list of coding system name.\n\
5547This variable is given to `completing-read' as TABLE argument.\n\
5548\n\
5549Do not alter the value of this variable manually. This variable should be\n\
5550updated by the functions `make-coding-system' and\n\
5551`define-coding-system-alias'.");
5552 Vcoding_system_alist = Qnil;
5553
4ed46869
KH
5554 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
5555 "List of coding-categories (symbols) ordered by priority.");
5556 {
5557 int i;
5558
5559 Vcoding_category_list = Qnil;
5560 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
5561 Vcoding_category_list
d46c5b12
KH
5562 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
5563 Vcoding_category_list);
4ed46869
KH
5564 }
5565
5566 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
10bff6f1 5567 "Specify the coding system for read operations.\n\
2ebb362d 5568It is useful to bind this variable with `let', but do not set it globally.\n\
4ed46869 5569If the value is a coding system, it is used for decoding on read operation.\n\
a67a9c66 5570If not, an appropriate element is used from one of the coding system alists:\n\
10bff6f1 5571There are three such tables, `file-coding-system-alist',\n\
a67a9c66 5572`process-coding-system-alist', and `network-coding-system-alist'.");
4ed46869
KH
5573 Vcoding_system_for_read = Qnil;
5574
5575 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
10bff6f1 5576 "Specify the coding system for write operations.\n\
2ebb362d 5577It is useful to bind this variable with `let', but do not set it globally.\n\
4ed46869 5578If the value is a coding system, it is used for encoding on write operation.\n\
a67a9c66 5579If not, an appropriate element is used from one of the coding system alists:\n\
10bff6f1 5580There are three such tables, `file-coding-system-alist',\n\
a67a9c66 5581`process-coding-system-alist', and `network-coding-system-alist'.");
4ed46869
KH
5582 Vcoding_system_for_write = Qnil;
5583
5584 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
a67a9c66 5585 "Coding system used in the latest file or process I/O.");
4ed46869
KH
5586 Vlast_coding_system_used = Qnil;
5587
9ce27fde 5588 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
f07f4a24
DL
5589 "*Non-nil means always inhibit code conversion of end-of-line format.\n\
5590See info node `Text and Binary' concerning such conversion.");
9ce27fde
KH
5591 inhibit_eol_conversion = 0;
5592
ed29121d
EZ
5593 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
5594 "Non-nil means process buffer inherits coding system of process output.\n\
5595Bind it to t if the process output is to be treated as if it were a file\n\
5596read from some filesystem.");
5597 inherit_process_coding_system = 0;
5598
02ba4723
KH
5599 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
5600 "Alist to decide a coding system to use for a file I/O operation.\n\
5601The format is ((PATTERN . VAL) ...),\n\
5602where PATTERN is a regular expression matching a file name,\n\
5603VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5604If VAL is a coding system, it is used for both decoding and encoding\n\
5605the file contents.\n\
5606If VAL is a cons of coding systems, the car part is used for decoding,\n\
5607and the cdr part is used for encoding.\n\
5608If VAL is a function symbol, the function must return a coding system\n\
5609or a cons of coding systems which are used as above.\n\
e0e989f6 5610\n\
a85a871a 5611See also the function `find-operation-coding-system'\n\
eda284ac 5612and the variable `auto-coding-alist'.");
02ba4723
KH
5613 Vfile_coding_system_alist = Qnil;
5614
5615 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
5616 "Alist to decide a coding system to use for a process I/O operation.\n\
5617The format is ((PATTERN . VAL) ...),\n\
5618where PATTERN is a regular expression matching a program name,\n\
5619VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5620If VAL is a coding system, it is used for both decoding what received\n\
5621from the program and encoding what sent to the program.\n\
5622If VAL is a cons of coding systems, the car part is used for decoding,\n\
5623and the cdr part is used for encoding.\n\
5624If VAL is a function symbol, the function must return a coding system\n\
5625or a cons of coding systems which are used as above.\n\
4ed46869 5626\n\
9ce27fde 5627See also the function `find-operation-coding-system'.");
02ba4723
KH
5628 Vprocess_coding_system_alist = Qnil;
5629
5630 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
5631 "Alist to decide a coding system to use for a network I/O operation.\n\
5632The format is ((PATTERN . VAL) ...),\n\
5633where PATTERN is a regular expression matching a network service name\n\
5634or is a port number to connect to,\n\
5635VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5636If VAL is a coding system, it is used for both decoding what received\n\
5637from the network stream and encoding what sent to the network stream.\n\
5638If VAL is a cons of coding systems, the car part is used for decoding,\n\
5639and the cdr part is used for encoding.\n\
5640If VAL is a function symbol, the function must return a coding system\n\
5641or a cons of coding systems which are used as above.\n\
4ed46869 5642\n\
9ce27fde 5643See also the function `find-operation-coding-system'.");
02ba4723 5644 Vnetwork_coding_system_alist = Qnil;
4ed46869
KH
5645
5646 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
5647 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
458822a0 5648 eol_mnemonic_unix = ':';
4ed46869
KH
5649
5650 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
5651 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
458822a0 5652 eol_mnemonic_dos = '\\';
4ed46869
KH
5653
5654 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
5655 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
458822a0 5656 eol_mnemonic_mac = '/';
4ed46869
KH
5657
5658 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
5659 "Mnemonic character indicating end-of-line format is not yet decided.");
458822a0 5660 eol_mnemonic_undecided = ':';
4ed46869 5661
84fbb8a0 5662 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
f967223b 5663 "*Non-nil enables character translation while encoding and decoding.");
84fbb8a0 5664 Venable_character_translation = Qt;
bdd9fb48 5665
f967223b
KH
5666 DEFVAR_LISP ("standard-translation-table-for-decode",
5667 &Vstandard_translation_table_for_decode,
84fbb8a0 5668 "Table for translating characters while decoding.");
f967223b 5669 Vstandard_translation_table_for_decode = Qnil;
bdd9fb48 5670
f967223b
KH
5671 DEFVAR_LISP ("standard-translation-table-for-encode",
5672 &Vstandard_translation_table_for_encode,
84fbb8a0 5673 "Table for translationg characters while encoding.");
f967223b 5674 Vstandard_translation_table_for_encode = Qnil;
4ed46869
KH
5675
5676 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
5677 "Alist of charsets vs revision numbers.\n\
5678While encoding, if a charset (car part of an element) is found,\n\
5679designate it with the escape sequence identifing revision (cdr part of the element).");
5680 Vcharset_revision_alist = Qnil;
02ba4723
KH
5681
5682 DEFVAR_LISP ("default-process-coding-system",
5683 &Vdefault_process_coding_system,
5684 "Cons of coding systems used for process I/O by default.\n\
5685The car part is used for decoding a process output,\n\
5686the cdr part is used for encoding a text to be sent to a process.");
5687 Vdefault_process_coding_system = Qnil;
c4825358 5688
3f003981
KH
5689 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
5690 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
c4825358
KH
5691This is a vector of length 256.\n\
5692If Nth element is non-nil, the existence of code N in a file\n\
bb0115a2 5693\(or output of subprocess) doesn't prevent it to be detected as\n\
3f003981
KH
5694a coding system of ISO 2022 variant which has a flag\n\
5695`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
c4825358
KH
5696or reading output of a subprocess.\n\
5697Only 128th through 159th elements has a meaning.");
3f003981 5698 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
d46c5b12
KH
5699
5700 DEFVAR_LISP ("select-safe-coding-system-function",
5701 &Vselect_safe_coding_system_function,
5702 "Function to call to select safe coding system for encoding a text.\n\
5703\n\
5704If set, this function is called to force a user to select a proper\n\
5705coding system which can encode the text in the case that a default\n\
5706coding system used in each operation can't encode the text.\n\
5707\n\
a85a871a 5708The default value is `select-safe-coding-system' (which see).");
d46c5b12
KH
5709 Vselect_safe_coding_system_function = Qnil;
5710
4ed46869
KH
5711}
5712
5713#endif /* emacs */