(format-replace-strings): Fix value of TO in REVERSE case.
[bpt/emacs.git] / src / coding.c
CommitLineData
4ed46869 1/* Coding system handler (conversion, detection, etc.).
4a2f9c6a 2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
203cb916 3 Licensed to the Free Software Foundation.
4ed46869 4
369314dc
KH
5This file is part of GNU Emacs.
6
7GNU Emacs is free software; you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation; either version 2, or (at your option)
10any later version.
4ed46869 11
369314dc
KH
12GNU Emacs is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
4ed46869 16
369314dc
KH
17You should have received a copy of the GNU General Public License
18along with GNU Emacs; see the file COPYING. If not, write to
19the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20Boston, MA 02111-1307, USA. */
4ed46869
KH
21
22/*** TABLE OF CONTENTS ***
23
24 1. Preamble
0ef69138 25 2. Emacs' internal format (emacs-mule) handlers
4ed46869
KH
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
1397dc18
KH
28 5. CCL handlers
29 6. End-of-line handlers
30 7. C library functions
31 8. Emacs Lisp library functions
32 9. Post-amble
4ed46869
KH
33
34*/
35
36/*** GENERAL NOTE on CODING SYSTEM ***
37
38 Coding system is an encoding mechanism of one or more character
39 sets. Here's a list of coding systems which Emacs can handle. When
40 we say "decode", it means converting some other coding system to
0ef69138
KH
41 Emacs' internal format (emacs-mule), and when we say "encode",
42 it means converting the coding system emacs-mule to some other
43 coding system.
4ed46869 44
0ef69138 45 0. Emacs' internal format (emacs-mule)
4ed46869
KH
46
47 Emacs itself holds a multi-lingual character in a buffer and a string
f4dee582 48 in a special format. Details are described in section 2.
4ed46869
KH
49
50 1. ISO2022
51
52 The most famous coding system for multiple character sets. X's
f4dee582
RS
53 Compound Text, various EUCs (Extended Unix Code), and coding
54 systems used in Internet communication such as ISO-2022-JP are
55 all variants of ISO2022. Details are described in section 3.
4ed46869
KH
56
57 2. SJIS (or Shift-JIS or MS-Kanji-Code)
58
59 A coding system to encode character sets: ASCII, JISX0201, and
60 JISX0208. Widely used for PC's in Japan. Details are described in
f4dee582 61 section 4.
4ed46869
KH
62
63 3. BIG5
64
65 A coding system to encode character sets: ASCII and Big5. Widely
66 used by Chinese (mainly in Taiwan and Hong Kong). Details are
f4dee582
RS
67 described in section 4. In this file, when we write "BIG5"
68 (all uppercase), we mean the coding system, and when we write
69 "Big5" (capitalized), we mean the character set.
4ed46869 70
27901516
KH
71 4. Raw text
72
4608c386
KH
73 A coding system for a text containing random 8-bit code. Emacs does
74 no code conversion on such a text except for end-of-line format.
27901516
KH
75
76 5. Other
4ed46869 77
f4dee582 78 If a user wants to read/write a text encoded in a coding system not
4ed46869
KH
79 listed above, he can supply a decoder and an encoder for it in CCL
80 (Code Conversion Language) programs. Emacs executes the CCL program
81 while reading/writing.
82
d46c5b12
KH
83 Emacs represents a coding system by a Lisp symbol that has a property
84 `coding-system'. But, before actually using the coding system, the
4ed46869 85 information about it is set in a structure of type `struct
f4dee582 86 coding_system' for rapid processing. See section 6 for more details.
4ed46869
KH
87
88*/
89
90/*** GENERAL NOTES on END-OF-LINE FORMAT ***
91
92 How end-of-line of a text is encoded depends on a system. For
93 instance, Unix's format is just one byte of `line-feed' code,
f4dee582 94 whereas DOS's format is two-byte sequence of `carriage-return' and
d46c5b12
KH
95 `line-feed' codes. MacOS's format is usually one byte of
96 `carriage-return'.
4ed46869 97
f4dee582
RS
98 Since text characters encoding and end-of-line encoding are
99 independent, any coding system described above can take
4ed46869 100 any format of end-of-line. So, Emacs has information of format of
f4dee582 101 end-of-line in each coding-system. See section 6 for more details.
4ed46869
KH
102
103*/
104
105/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
106
107 These functions check if a text between SRC and SRC_END is encoded
108 in the coding system category XXX. Each returns an integer value in
109 which appropriate flag bits for the category XXX is set. The flag
110 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
111 template of these functions. */
112#if 0
113int
0ef69138 114detect_coding_emacs_mule (src, src_end)
4ed46869
KH
115 unsigned char *src, *src_end;
116{
117 ...
118}
119#endif
120
121/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
122
123 These functions decode SRC_BYTES length text at SOURCE encoded in
0ef69138 124 CODING to Emacs' internal format (emacs-mule). The resulting text
d46c5b12
KH
125 goes to a place pointed to by DESTINATION, the length of which
126 should not exceed DST_BYTES. These functions set the information of
127 original and decoded texts in the members produced, produced_char,
128 consumed, and consumed_char of the structure *CODING.
129
130 The return value is an integer (CODING_FINISH_XXX) indicating how
131 the decoding finished.
132
133 DST_BYTES zero means that source area and destination area are
134 overlapped, which means that we can produce a decoded text until it
135 reaches at the head of not-yet-decoded source text.
136
137 Below is a template of these functions. */
4ed46869 138#if 0
d46c5b12 139decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
140 struct coding_system *coding;
141 unsigned char *source, *destination;
142 int src_bytes, dst_bytes;
4ed46869
KH
143{
144 ...
145}
146#endif
147
148/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
149
0ef69138
KH
150 These functions encode SRC_BYTES length text at SOURCE of Emacs'
151 internal format (emacs-mule) to CODING. The resulting text goes to
f4dee582 152 a place pointed to by DESTINATION, the length of which should not
d46c5b12
KH
153 exceed DST_BYTES. These functions set the information of
154 original and encoded texts in the members produced, produced_char,
155 consumed, and consumed_char of the structure *CODING.
156
157 The return value is an integer (CODING_FINISH_XXX) indicating how
158 the encoding finished.
159
160 DST_BYTES zero means that source area and destination area are
161 overlapped, which means that we can produce an encoded text until it
162 reaches the head of the not-yet-encoded source text.
163
164 Below is a template of these functions. */
4ed46869 165#if 0
d46c5b12 166encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
167 struct coding_system *coding;
168 unsigned char *source, *destination;
169 int src_bytes, dst_bytes;
4ed46869
KH
170{
171 ...
172}
173#endif
174
175/*** COMMONLY USED MACROS ***/
176
177/* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
178 THREE_MORE_BYTES safely get one, two, and three bytes from the
179 source text respectively, advancing `src' past the bytes read. If
180 there are not enough bytes in the source, they jump to
181 `label_end_of_loop' without consuming any byte. The caller should
 set variables `src' and `src_end' to appropriate areas in advance,
 and must provide the label `label_end_of_loop'. */
182
183#define ONE_MORE_BYTE(c1) \
184 do { \
185 if (src < src_end) \
186 c1 = *src++; \
187 else \
188 goto label_end_of_loop; \
189 } while (0)
190
191#define TWO_MORE_BYTES(c1, c2) \
192 do { \
193 if (src + 1 < src_end) \
194 c1 = *src++, c2 = *src++; \
195 else \
196 goto label_end_of_loop; \
197 } while (0)
198
199#define THREE_MORE_BYTES(c1, c2, c3) \
200 do { \
201 if (src + 2 < src_end) \
202 c1 = *src++, c2 = *src++, c3 = *src++; \
203 else \
204 goto label_end_of_loop; \
205 } while (0)
206
207/* The following three macros DECODE_CHARACTER_ASCII,
208 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
209 the multi-byte form of a character of each class at the place
210 pointed by `dst'. The caller should set the variable `dst' to
211 point to an appropriate area and the variable `coding' to point to
212 the coding-system of the currently decoding text in advance. */
213
214/* Decode one ASCII character C. While composing, C is written as
 the two bytes 0xA0 and (C | 0x80) and counted in
 coding->composed_chars; otherwise C is written as is and counted in
 coding->produced_char. coding->fake_multibyte is set when the
 second byte written while composing is less than 0xA0, or when C
 written as is is 0x80 or greater. */
215
de79a6a5
KH
216#define DECODE_CHARACTER_ASCII(c) \
217 do { \
218 if (COMPOSING_P (coding->composing)) \
219 { \
220 *dst++ = 0xA0, *dst++ = (c) | 0x80; \
221 coding->composed_chars++; \
d14d03ac
KH
222 if (((c) | 0x80) < 0xA0) \
223 coding->fake_multibyte = 1; \
de79a6a5
KH
224 } \
225 else \
226 { \
227 *dst++ = (c); \
228 coding->produced_char++; \
d14d03ac
KH
229 if ((c) >= 0x80) \
230 coding->fake_multibyte = 1; \
de79a6a5 231 } \
4ed46869
KH
232 } while (0)
233
f4dee582 234/* Decode one DIMENSION1 character whose charset is CHARSET and whose
4ed46869
KH
235 position-code is C. The base leading-code of CHARSET is written
 first (with 0x20 added while composing), then the extended
 leading-code if CHARSET has a nonzero one, then (C | 0x80).
 coding->composed_chars is incremented while composing, otherwise
 coding->produced_char is. coding->fake_multibyte is set when
 (C | 0x80) is less than 0xA0. The assignment used as the condition
 of the second `if' is intentional: it fetches the extended
 leading-code and tests it for nonzero in one step. */
236
237#define DECODE_CHARACTER_DIMENSION1(charset, c) \
238 do { \
239 unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset); \
240 if (COMPOSING_P (coding->composing)) \
de79a6a5
KH
241 { \
242 *dst++ = leading_code + 0x20; \
243 coding->composed_chars++; \
244 } \
4ed46869 245 else \
d46c5b12
KH
246 { \
247 *dst++ = leading_code; \
248 coding->produced_char++; \
249 } \
4ed46869
KH
250 if (leading_code = CHARSET_LEADING_CODE_EXT (charset)) \
251 *dst++ = leading_code; \
252 *dst++ = (c) | 0x80; \
d14d03ac
KH
253 if (((c) | 0x80) < 0xA0) \
254 coding->fake_multibyte = 1; \
4ed46869
KH
255 } while (0)
256
f4dee582 257/* Decode one DIMENSION2 character whose charset is CHARSET and whose
4ed46869
KH
258 position-codes are C1 and C2. The leading-code(s) and first
 position-code are produced by DECODE_CHARACTER_DIMENSION1 (which
 also updates the character counters); then (C2 | 0x80) is written.
 coding->fake_multibyte is set when (C2 | 0x80) is less than 0xA0. */
259
260#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2) \
261 do { \
262 DECODE_CHARACTER_DIMENSION1 (charset, c1); \
263 *dst++ = (c2) | 0x80; \
d14d03ac
KH
264 if (((c2) | 0x80) < 0xA0) \
265 coding->fake_multibyte = 1; \
4ed46869
KH
266 } while (0)
267
268\f
269/*** 1. Preamble ***/
270
271#include <stdio.h>
272
273#ifdef emacs
274
275#include <config.h>
276#include "lisp.h"
277#include "buffer.h"
278#include "charset.h"
279#include "ccl.h"
280#include "coding.h"
281#include "window.h"
282
283#else /* not emacs */
284
285#include "mulelib.h"
286
287#endif /* not emacs */
288
289Lisp_Object Qcoding_system, Qeol_type;
290Lisp_Object Qbuffer_file_coding_system;
291Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
27901516 292Lisp_Object Qno_conversion, Qundecided;
bb0115a2 293Lisp_Object Qcoding_system_history;
70c22245 294Lisp_Object Qsafe_charsets;
1397dc18 295Lisp_Object Qvalid_codes;
4ed46869
KH
296
297extern Lisp_Object Qinsert_file_contents, Qwrite_region;
298Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
299Lisp_Object Qstart_process, Qopen_network_stream;
300Lisp_Object Qtarget_idx;
301
d46c5b12
KH
302Lisp_Object Vselect_safe_coding_system_function;
303
7722baf9
EZ
304/* Mnemonic string for each format of end-of-line. */
305Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
306/* Mnemonic string to indicate format of end-of-line is not yet
4ed46869 307 decided. */
7722baf9 308Lisp_Object eol_mnemonic_undecided;
4ed46869 309
9ce27fde
KH
310/* Format of end-of-line decided by system. This is CODING_EOL_LF on
311 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
312int system_eol_type;
313
4ed46869
KH
314#ifdef emacs
315
4608c386
KH
316Lisp_Object Vcoding_system_list, Vcoding_system_alist;
317
318Lisp_Object Qcoding_system_p, Qcoding_system_error;
4ed46869 319
d46c5b12
KH
320/* Coding system emacs-mule and raw-text are for converting only
321 end-of-line format. */
322Lisp_Object Qemacs_mule, Qraw_text;
9ce27fde 323
4ed46869
KH
324/* Coding-systems are handed between Emacs Lisp programs and C internal
325 routines by the following three variables. */
326/* Coding-system for reading files and receiving data from process. */
327Lisp_Object Vcoding_system_for_read;
328/* Coding-system for writing files and sending data to process. */
329Lisp_Object Vcoding_system_for_write;
330/* Coding-system actually used in the latest I/O. */
331Lisp_Object Vlast_coding_system_used;
332
c4825358 333/* A vector of length 256 which contains information about special
94487c4e 334 Latin codes (especially for dealing with Microsoft codes). */
3f003981 335Lisp_Object Vlatin_extra_code_table;
c4825358 336
9ce27fde
KH
337/* Flag to inhibit code conversion of end-of-line format. */
338int inhibit_eol_conversion;
339
ed29121d
EZ
340/* Flag to make buffer-file-coding-system inherit from process-coding. */
341int inherit_process_coding_system;
342
c4825358 343/* Coding system to be used to encode text for terminal display. */
4ed46869
KH
344struct coding_system terminal_coding;
345
c4825358
KH
346/* Coding system to be used to encode text for terminal display when
347 terminal coding system is nil. */
348struct coding_system safe_terminal_coding;
349
350/* Coding system of what is sent from terminal keyboard. */
4ed46869
KH
351struct coding_system keyboard_coding;
352
6bc51348
KH
353/* Default coding system to be used to write a file. */
354struct coding_system default_buffer_file_coding;
355
02ba4723
KH
356Lisp_Object Vfile_coding_system_alist;
357Lisp_Object Vprocess_coding_system_alist;
358Lisp_Object Vnetwork_coding_system_alist;
4ed46869
KH
359
360#endif /* emacs */
361
d46c5b12 362Lisp_Object Qcoding_category, Qcoding_category_index;
4ed46869
KH
363
364/* List of symbols `coding-category-xxx' ordered by priority. */
365Lisp_Object Vcoding_category_list;
366
d46c5b12
KH
367/* Table of coding categories (Lisp symbols). */
368Lisp_Object Vcoding_category_table;
4ed46869
KH
369
370/* Table of names of symbol for each coding-category. */
371char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
0ef69138 372 "coding-category-emacs-mule",
4ed46869
KH
373 "coding-category-sjis",
374 "coding-category-iso-7",
d46c5b12 375 "coding-category-iso-7-tight",
4ed46869
KH
376 "coding-category-iso-8-1",
377 "coding-category-iso-8-2",
7717c392
KH
378 "coding-category-iso-7-else",
379 "coding-category-iso-8-else",
89fa8b36 380 "coding-category-ccl",
4ed46869 381 "coding-category-big5",
27901516 382 "coding-category-raw-text",
89fa8b36 383 "coding-category-binary"
4ed46869
KH
384};
385
66cfb530 386/* Table of pointers to coding systems corresponding to each coding
d46c5b12
KH
387 categories. */
388struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
389
66cfb530
KH
390/* Table of coding category masks. Nth element is a mask for a coding
391 category of which priority is Nth. */
392static
393int coding_priorities[CODING_CATEGORY_IDX_MAX];
394
f967223b
KH
395/* Flag to tell if we look up translation table on character code
396 conversion. */
84fbb8a0 397Lisp_Object Venable_character_translation;
f967223b
KH
398/* Standard translation table to look up on decoding (reading). */
399Lisp_Object Vstandard_translation_table_for_decode;
400/* Standard translation table to look up on encoding (writing). */
401Lisp_Object Vstandard_translation_table_for_encode;
84fbb8a0 402
f967223b
KH
403Lisp_Object Qtranslation_table;
404Lisp_Object Qtranslation_table_id;
405Lisp_Object Qtranslation_table_for_decode;
406Lisp_Object Qtranslation_table_for_encode;
4ed46869
KH
407
408/* Alist of charsets vs revision number. */
409Lisp_Object Vcharset_revision_alist;
410
02ba4723
KH
411/* Default coding systems used for process I/O. */
412Lisp_Object Vdefault_process_coding_system;
413
4ed46869 414\f
0ef69138 415/*** 2. Emacs internal format (emacs-mule) handlers ***/
4ed46869
KH
416
417/* Emacs' internal format for encoding multiple character sets is a
f4dee582
RS
418 kind of multi-byte encoding, i.e. characters are encoded by
419 variable-length sequences of one-byte codes. ASCII characters
420 and control characters (e.g. `tab', `newline') are represented by
421 one-byte sequences which are their ASCII codes, in the range 0x00
422 through 0x7F. The other characters are represented by a sequence
423 of `base leading-code', optional `extended leading-code', and one
424 or two `position-code's. The length of the sequence is determined
425 by the base leading-code. Leading-code takes the range 0x80
426 through 0x9F, whereas extended leading-code and position-code take
427 the range 0xA0 through 0xFF. See `charset.h' for more details
428 about leading-code and position-code.
429
430 There's one exception to this rule. Special leading-code
4ed46869
KH
431 `leading-code-composition' denotes that the following several
432 characters should be composed into one character. Leading-codes of
433 components (except for ASCII) are added 0x20. An ASCII character
434 component is represented by a 2-byte sequence of `0xA0' and
f4dee582
RS
435 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
436 details of composite character. Hence, we can summarize the code
4ed46869
KH
437 range as follows:
438
439 --- CODE RANGE of Emacs' internal format ---
440 (character set) (range)
441 ASCII 0x00 .. 0x7F
442 ELSE (1st byte) 0x80 .. 0x9F
443 (rest bytes) 0xA0 .. 0xFF
444 ---------------------------------------------
445
446 */
447
448enum emacs_code_class_type emacs_code_class[256];
449
450/* Go to the next statement only if *SRC is accessible and the code is
451 0xA0 or greater (i.e. in the range 0xA0..0xFF); that byte is consumed.
 If SRC has already reached SRC_END, jump to `label_end_of_switch'.
 If the code is less than 0xA0, return 0 from the enclosing function. */
452#define CHECK_CODE_RANGE_A0_FF \
453 do { \
454 if (src >= src_end) \
455 goto label_end_of_switch; \
456 else if (*src++ < 0xA0) \
457 return 0; \
458 } while (0)
459
460/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
461 Check if a text is encoded in Emacs' internal format. If it is,
d46c5b12 462 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
4ed46869
KH
463
/* Scan the bytes from SRC up to (not including) SRC_END. Return 0 as
 soon as a byte is found that cannot belong to Emacs' internal format;
 otherwise return CODING_CATEGORY_MASK_EMACS_MULE. A multi-byte
 sequence cut short by SRC_END is not treated as an error. */
464int
0ef69138 465detect_coding_emacs_mule (src, src_end)
4ed46869
KH
466 unsigned char *src, *src_end; /* start and end (exclusive) of the text to examine */
467{
468 unsigned char c; /* the byte currently being classified */
469 int composing = 0; /* nonzero while inside a composition sequence */
470
471 while (src < src_end)
472 {
473 c = *src++;
474
475 if (composing)
476 {
477 if (c < 0xA0) /* a byte below 0xA0 ends the composition sequence */
478 composing = 0;
479 else /* undo the 0x20 added to leading-codes of components */
480 c -= 0x20;
481 }
482
483 switch (emacs_code_class[c])
484 {
485 case EMACS_ascii_code:
486 case EMACS_linefeed_code:
487 break;
488
489 case EMACS_control_code:
490 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) /* presumably treated as a sign of ISO2022 text -- confirm */
491 return 0;
492 break;
493
494 case EMACS_invalid_code:
495 return 0;
496
497 case EMACS_leading_code_composition: /* c == 0x80 */
498 if (composing) /* the byte was 0xA0 before the adjustment above: an ASCII component, so one byte of 0xA0..0xFF must follow */
499 CHECK_CODE_RANGE_A0_FF;
500 else /* start of a composition sequence */
501 composing = 1;
502 break;
503
504 case EMACS_leading_code_4:
505 CHECK_CODE_RANGE_A0_FF;
506 /* fall through to check two more bytes ... */
507
508 case EMACS_leading_code_3:
509 CHECK_CODE_RANGE_A0_FF;
510 /* fall through to check one more byte ... */
511
512 case EMACS_leading_code_2:
513 CHECK_CODE_RANGE_A0_FF;
514 break;
515
516 default:
517 label_end_of_switch: /* CHECK_CODE_RANGE_A0_FF jumps here when the source is exhausted */
518 break;
519 }
520 }
0ef69138 521 return CODING_CATEGORY_MASK_EMACS_MULE;
4ed46869
KH
522}
523
524\f
525/*** 3. ISO2022 handlers ***/
526
527/* The following note describes the coding system ISO2022 briefly.
f4dee582
RS
528 Since the intention of this note is to help in understanding of
529 the programs in this file, some parts are NOT ACCURATE or OVERLY
4ed46869
KH
530 SIMPLIFIED. For the thorough understanding, please refer to the
531 original document of ISO2022.
532
533 ISO2022 provides many mechanisms to encode several character sets
f4dee582 534 in 7-bit and 8-bit environments. If one chooses a 7-bit environment,
4ed46869 535 all text is encoded by codes of less than 128. This may make the
f4dee582
RS
536 encoded text a little bit longer, but the text gets more stability
537 to pass through several gateways (some of them strip off the MSB).
4ed46869 538
f4dee582 539 There are two kinds of character set: control character set and
4ed46869
KH
540 graphic character set. The former contains control characters such
541 as `newline' and `escape' to provide control functions (control
f4dee582 542 functions are provided also by escape sequences). The latter
4ed46869
KH
543 contains graphic characters such as ' A' and '-'. Emacs recognizes
544 two control character sets and many graphic character sets.
545
546 Graphic character sets are classified into one of the following
547 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
548 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
549 bytes (DIMENSION) and the number of characters in one dimension
550 (CHARS) of the set. In addition, each character set is assigned an
551 identification tag (called "final character" and denoted as <F>
552 here after) which is unique in each class. <F> of each character
553 set is decided by ECMA(*) when it is registered in ISO. Code range
554 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
555
556 Note (*): ECMA = European Computer Manufacturers Association
557
558 Here are examples of graphic character set [NAME(<F>)]:
559 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
560 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
561 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
562 o DIMENSION2_CHARS96 -- none for the moment
563
564 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
565 C0 [0x00..0x1F] -- control character plane 0
566 GL [0x20..0x7F] -- graphic character plane 0
567 C1 [0x80..0x9F] -- control character plane 1
568 GR [0xA0..0xFF] -- graphic character plane 1
569
570 A control character set is directly designated and invoked to C0 or
571 C1 by an escape sequence. The most common case is that ISO646's
572 control character set is designated/invoked to C0 and ISO6429's
573 control character set is designated/invoked to C1, and usually
574 these designations/invocations are omitted in a coded text. With
575 7-bit environment, only C0 can be used, and a control character for
576 C1 is encoded by an appropriate escape sequence to fit in the
577 environment. Every control character of C1 has a
578 corresponding escape sequence defined for this purpose.
579
580 A graphic character set is at first designated to one of four
581 graphic registers (G0 through G3), then these graphic registers are
582 invoked to GL or GR. These designations and invocations can be
583 done independently. The most common case is that G0 is invoked to
584 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
585 these invocations and designations are omitted in a coded text.
586 With 7-bit environment, only GL can be used.
587
588 When a graphic character set of CHARS94 is invoked to GL, code 0x20
589 and 0x7F of GL area work as control characters SPACE and DEL
590 respectively, and code 0xA0 and 0xFF of GR area should not be used.
591
592 There are two ways of invocation: locking-shift and single-shift.
593 With locking-shift, the invocation lasts until the next different
594 invocation, whereas with single-shift, the invocation works only
595 for the following character and doesn't affect locking-shift.
596 Invocations are done by the following control characters or escape
597 sequences.
598
599 ----------------------------------------------------------------------
600 function control char escape sequence description
601 ----------------------------------------------------------------------
602 SI (shift-in) 0x0F none invoke G0 to GL
10bff6f1 603 SO (shift-out) 0x0E none invoke G1 to GL
4ed46869
KH
604 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
605 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
606 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
607 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
608 ----------------------------------------------------------------------
609 The first four are for locking-shift. Control characters for these
610 functions are defined by macros ISO_CODE_XXX in `coding.h'.
611
612 Designations are done by the following escape sequences.
613 ----------------------------------------------------------------------
614 escape sequence description
615 ----------------------------------------------------------------------
616 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
617 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
618 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
619 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
620 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
621 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
622 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
623 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
624 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
625 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
626 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
627 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
628 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
629 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
630 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
631 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
632 ----------------------------------------------------------------------
633
634 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
635 of dimension 1, chars 94, and final character <F>, and etc.
636
637 Note (*): Although these designations are not allowed in ISO2022,
638 Emacs accepts them on decoding, and produces them on encoding
639 CHARS96 character set in a coding system which is characterized as
640 7-bit environment, non-locking-shift, and non-single-shift.
641
642 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
643 '(' can be omitted. We call this as "short-form" here after.
644
645 Now you may notice that there are a lot of ways for encoding the
f4dee582 646 same multilingual text in ISO2022. Actually, there exists many
4ed46869
KH
647 coding systems such as Compound Text (used in X's inter client
648 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
649 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
650 localized platforms), and all of these are variants of ISO2022.
651
652 In addition to the above, Emacs handles two more kinds of escape
653 sequences: ISO6429's direction specification and Emacs' private
654 sequence for specifying character composition.
655
656 ISO6429's direction specification takes the following format:
657 o CSI ']' -- end of the current direction
658 o CSI '0' ']' -- end of the current direction
659 o CSI '1' ']' -- start of left-to-right text
660 o CSI '2' ']' -- start of right-to-left text
661 The control character CSI (0x9B: control sequence introducer) is
662 abbreviated to the escape sequence ESC '[' in 7-bit environment.
663
664 Character composition specification takes the following format:
665 o ESC '0' -- start character composition
666 o ESC '1' -- end character composition
667 Since these are not standard escape sequences of any ISO, the use
668 of them for these meaning is restricted to Emacs only. */
669
670enum iso_code_class_type iso_code_class[256];
671
f024b6aa
RS
/* Nonzero if a coding system is registered for the coding category
 index IDX (i.e. coding_system_table[IDX] is non-null) and that coding
 system either marks CHARSET in its safe_charsets or has a requested
 designation for CHARSET. */
672#define CHARSET_OK(idx, charset) \
673 (coding_system_table[idx] \
674 && (coding_system_table[idx]->safe_charsets[charset] \
675 || (CODING_SPEC_ISO_REQUESTED_DESIGNATION \
676 (coding_system_table[idx], charset) \
677 != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)))
d46c5b12
KH
678
/* Nonzero if the coding system registered for the coding category
 index IDX has an initial designation of zero or greater for index 1
 (presumably graphic register G1 -- confirm against `coding.h').
 NOTE(review): unlike CHARSET_OK, this does not first check that
 coding_system_table[IDX] is non-null. */
679#define SHIFT_OUT_OK(idx) \
680 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
681
4ed46869
KH
682/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
683 Check if a text is encoded in ISO2022. If it is, returns an
684 integer in which appropriate flag bits any of:
685 CODING_CATEGORY_MASK_ISO_7
d46c5b12 686 CODING_CATEGORY_MASK_ISO_7_TIGHT
4ed46869
KH
687 CODING_CATEGORY_MASK_ISO_8_1
688 CODING_CATEGORY_MASK_ISO_8_2
7717c392
KH
689 CODING_CATEGORY_MASK_ISO_7_ELSE
690 CODING_CATEGORY_MASK_ISO_8_ELSE
4ed46869
KH
691 are set. If a code which should never appear in ISO2022 is found,
692 returns 0. */
693
694int
695detect_coding_iso2022 (src, src_end)
696 unsigned char *src, *src_end;
697{
d46c5b12
KH
698 int mask = CODING_CATEGORY_MASK_ISO;
699 int mask_found = 0;
f46869e4 700 int reg[4], shift_out = 0, single_shifting = 0;
d46c5b12 701 int c, c1, i, charset;
3f003981 702
d46c5b12 703 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
3f003981 704 while (mask && src < src_end)
4ed46869
KH
705 {
706 c = *src++;
707 switch (c)
708 {
709 case ISO_CODE_ESC:
f46869e4 710 single_shifting = 0;
e0e989f6 711 if (src >= src_end)
4ed46869
KH
712 break;
713 c = *src++;
d46c5b12 714 if (c >= '(' && c <= '/')
4ed46869 715 {
bf9cdd4e
KH
716 /* Designation sequence for a charset of dimension 1. */
717 if (src >= src_end)
718 break;
d46c5b12
KH
719 c1 = *src++;
720 if (c1 < ' ' || c1 >= 0x80
721 || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
722 /* Invalid designation sequence. Just ignore. */
723 break;
724 reg[(c - '(') % 4] = charset;
bf9cdd4e
KH
725 }
726 else if (c == '$')
727 {
728 /* Designation sequence for a charset of dimension 2. */
729 if (src >= src_end)
730 break;
731 c = *src++;
732 if (c >= '@' && c <= 'B')
733 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
d46c5b12 734 reg[0] = charset = iso_charset_table[1][0][c];
bf9cdd4e 735 else if (c >= '(' && c <= '/')
bcf26d6a 736 {
bf9cdd4e
KH
737 if (src >= src_end)
738 break;
d46c5b12
KH
739 c1 = *src++;
740 if (c1 < ' ' || c1 >= 0x80
741 || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
742 /* Invalid designation sequence. Just ignore. */
743 break;
744 reg[(c - '(') % 4] = charset;
bcf26d6a 745 }
bf9cdd4e 746 else
d46c5b12
KH
747 /* Invalid designation sequence. Just ignore. */
748 break;
749 }
ae9ff118 750 else if (c == 'N' || c == 'O')
d46c5b12 751 {
ae9ff118
KH
752 /* ESC <Fe> for SS2 or SS3. */
753 mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
d46c5b12 754 break;
4ed46869 755 }
bf9cdd4e 756 else if (c == '0' || c == '1' || c == '2')
ae9ff118 757 /* ESC <Fp> for start/end composition. Just ignore. */
d46c5b12 758 break;
bf9cdd4e 759 else
d46c5b12
KH
760 /* Invalid escape sequence. Just ignore. */
761 break;
762
763 /* We found a valid designation sequence for CHARSET. */
764 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
765 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset))
766 mask_found |= CODING_CATEGORY_MASK_ISO_7;
767 else
768 mask &= ~CODING_CATEGORY_MASK_ISO_7;
769 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset))
770 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
771 else
772 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
ae9ff118
KH
773 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset))
774 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
775 else
d46c5b12 776 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
ae9ff118
KH
777 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset))
778 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
779 else
d46c5b12 780 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
4ed46869
KH
781 break;
782
4ed46869 783 case ISO_CODE_SO:
f46869e4 784 single_shifting = 0;
d46c5b12
KH
785 if (shift_out == 0
786 && (reg[1] >= 0
787 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
788 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
789 {
790 /* Locking shift out. */
791 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
792 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
793 }
e0e989f6
KH
794 break;
795
d46c5b12 796 case ISO_CODE_SI:
f46869e4 797 single_shifting = 0;
d46c5b12
KH
798 if (shift_out == 1)
799 {
800 /* Locking shift in. */
801 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
802 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
803 }
804 break;
805
4ed46869 806 case ISO_CODE_CSI:
f46869e4 807 single_shifting = 0;
4ed46869
KH
808 case ISO_CODE_SS2:
809 case ISO_CODE_SS3:
3f003981
KH
810 {
811 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
812
70c22245
KH
813 if (c != ISO_CODE_CSI)
814 {
d46c5b12
KH
815 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
816 & CODING_FLAG_ISO_SINGLE_SHIFT)
70c22245 817 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
818 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
819 & CODING_FLAG_ISO_SINGLE_SHIFT)
70c22245 820 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
f46869e4 821 single_shifting = 1;
70c22245 822 }
3f003981
KH
823 if (VECTORP (Vlatin_extra_code_table)
824 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
825 {
d46c5b12
KH
826 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
827 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981 828 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
829 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
830 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981
KH
831 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
832 }
833 mask &= newmask;
d46c5b12 834 mask_found |= newmask;
3f003981
KH
835 }
836 break;
4ed46869
KH
837
838 default:
839 if (c < 0x80)
f46869e4
KH
840 {
841 single_shifting = 0;
842 break;
843 }
4ed46869 844 else if (c < 0xA0)
c4825358 845 {
f46869e4 846 single_shifting = 0;
3f003981
KH
847 if (VECTORP (Vlatin_extra_code_table)
848 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
c4825358 849 {
3f003981
KH
850 int newmask = 0;
851
d46c5b12
KH
852 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
853 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981 854 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
d46c5b12
KH
855 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
856 & CODING_FLAG_ISO_LATIN_EXTRA)
3f003981
KH
857 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
858 mask &= newmask;
d46c5b12 859 mask_found |= newmask;
c4825358 860 }
3f003981
KH
861 else
862 return 0;
c4825358 863 }
4ed46869
KH
864 else
865 {
7717c392 866 unsigned char *src_begin = src;
4ed46869 867
d46c5b12 868 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
7717c392 869 | CODING_CATEGORY_MASK_ISO_7_ELSE);
d46c5b12 870 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
f46869e4
KH
871 /* Check the length of succeeding codes of the range
872 0xA0..0FF. If the byte length is odd, we exclude
873 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
874 when we are not single shifting. */
875 if (!single_shifting)
876 {
877 while (src < src_end && *src >= 0xA0)
878 src++;
879 if ((src - src_begin - 1) & 1 && src < src_end)
880 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
881 else
882 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
883 }
4ed46869
KH
884 }
885 break;
886 }
887 }
888
d46c5b12 889 return (mask & mask_found);
4ed46869
KH
890}
891
/* Decode one character whose charset is CHARSET and whose first
   position code is C1, and store the result at DST.

   The macro relies on these names from the expansion site: `coding',
   `src', `dst', `c2' and `translation_table'.  If CHARSET has
   dimension 2, the second position code is fetched from SRC into C2;
   when that byte is not a graphic-plane-0 code (nor 0x20/0x7F) it is
   pushed back and C1 is decoded as ASCII instead.  If CHARSET is
   negative we are decoding ill-formed text, and all we can do is
   emit C1 as is.

   C1 (and the caller's C2) must be lvalues: SPLIT_CHAR may overwrite
   them when a translation table maps the character.  */

#define DECODE_ISO_CHARACTER(charset, c1) \
  do { \
    int c_alt, charset_alt = (charset); \
    if (COMPOSING_HEAD_P (coding->composing)) \
      { \
        /* First character of a composition: emit the header.  */ \
        *dst++ = LEADING_CODE_COMPOSITION; \
        if (COMPOSING_WITH_RULE_P (coding->composing)) \
          { \
            /* To tell composition rules are embedded.  */ \
            *dst++ = 0xFF; \
          } \
        coding->composing += 2; \
      } \
    if (charset_alt >= 0) \
      { \
        if (CHARSET_DIMENSION (charset_alt) == 2) \
          { \
            ONE_MORE_BYTE (c2); \
            if (! (iso_code_class[(c2) & 0x7F] == ISO_0x20_or_0x7F \
                   || iso_code_class[(c2) & 0x7F] == ISO_graphic_plane_0)) \
              { \
                /* Not a valid second byte: give it back.  */ \
                src--; \
                charset_alt = CHARSET_ASCII; \
              } \
          } \
        if (!NILP (translation_table) \
            && ((c_alt = translate_char (translation_table, \
                                         -1, charset_alt, c1, c2)) >= 0)) \
          { \
            SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
          } \
      } \
    if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
      { \
        DECODE_CHARACTER_ASCII (c1); \
      } \
    else if (CHARSET_DIMENSION (charset_alt) == 1) \
      { \
        DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
      } \
    else \
      { \
        DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
      } \
    if (COMPOSING_WITH_RULE_P (coding->composing)) \
      { \
        /* To tell a composition rule follows.  */ \
        coding->composing = COMPOSING_WITH_RULE_RULE; \
      } \
  } while (0)
936
/* Set designation state into CODING: designate the charset identified
   by DIMENSION, CHARS and FINAL_CHAR to graphic register REG.

   The macro relies on `coding' and on the label `label_invalid_code'
   at the expansion site.  Control jumps to that label when FINAL_CHAR
   is outside '0'..127, when no charset matches, or when the charset
   is neither requested for REG nor listed in coding->safe_charsets.

   Arguments are parenthesized wherever they appear next to an
   operator, so expressions such as `c1 - 0x28' can be passed
   safely.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
  do { \
    int charset; \
 \
    if ((final_char) < '0' || (final_char) >= 128) \
      goto label_invalid_code; \
    charset = ISO_CHARSET_TABLE (make_number (dimension), \
                                 make_number (chars), \
                                 make_number (final_char)); \
    if (charset >= 0 \
        && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == (reg) \
            || coding->safe_charsets[charset])) \
      { \
        if (coding->spec.iso2022.last_invalid_designation_register == 0 \
            && (reg) == 0 \
            && charset == CHARSET_ASCII) \
          { \
            /* We should insert this designation sequence as is so \
               that it is surely written back to a file.  */ \
            coding->spec.iso2022.last_invalid_designation_register = -1; \
            goto label_invalid_code; \
          } \
        coding->spec.iso2022.last_invalid_designation_register = -1; \
        /* In right-to-left mode use the reverse charset if any.  */ \
        if ((coding->mode & CODING_MODE_DIRECTION) \
            && CHARSET_REVERSE_CHARSET (charset) >= 0) \
          charset = CHARSET_REVERSE_CHARSET (charset); \
        CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
      } \
    else \
      { \
        /* Remember which register received the bad designation.  */ \
        coding->spec.iso2022.last_invalid_designation_register = (reg); \
        goto label_invalid_code; \
      } \
  } while (0)
972
88993dfd
KH
973/* Return 0 if there's a valid composing sequence starting at SRC and
974 ending before SRC_END, else return -1. */
d46c5b12 975
84fbb8a0
KH
976int
977check_composing_code (coding, src, src_end)
d46c5b12
KH
978 struct coding_system *coding;
979 unsigned char *src, *src_end;
980{
d46c5b12
KH
981 int charset, c, c1, dim;
982
983 while (src < src_end)
984 {
88993dfd
KH
985 c = *src++;
986 if (c >= 0x20)
987 continue;
988 if (c != ISO_CODE_ESC || src >= src_end)
989 return -1;
990 c = *src++;
991 if (c == '1') /* end of compsition */
992 return 0;
993 if (src + 2 >= src_end
994 || !coding->flags & CODING_FLAG_ISO_DESIGNATION)
995 return -1;
996
997 dim = (c == '$');
998 if (dim == 1)
999 c = (*src >= '@' && *src <= 'B') ? '(' : *src++;
1000 if (c >= '(' && c <= '/')
d46c5b12 1001 {
88993dfd
KH
1002 c1 = *src++;
1003 if ((c1 < ' ' || c1 >= 0x80)
1004 || (charset = iso_charset_table[dim][c >= ','][c1]) < 0
1005 || ! coding->safe_charsets[charset]
1006 || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
1007 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1008 return -1;
d46c5b12 1009 }
88993dfd
KH
1010 else
1011 return -1;
d46c5b12 1012 }
88993dfd
KH
1013
1014 /* We have not found the sequence "ESC 1". */
1015 return -1;
d46c5b12
KH
1016}
1017
4ed46869
KH
1018/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1019
1020int
d46c5b12 1021decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
1022 struct coding_system *coding;
1023 unsigned char *source, *destination;
1024 int src_bytes, dst_bytes;
4ed46869
KH
1025{
1026 unsigned char *src = source;
1027 unsigned char *src_end = source + src_bytes;
1028 unsigned char *dst = destination;
1029 unsigned char *dst_end = destination + dst_bytes;
1030 /* Since the maximum bytes produced by each loop is 7, we subtract 6
1031 from DST_END to assure that overflow checking is necessary only
1032 at the head of loop. */
1033 unsigned char *adjusted_dst_end = dst_end - 6;
1034 int charset;
1035 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1036 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1037 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
84fbb8a0 1038 Lisp_Object translation_table
f967223b 1039 = coding->translation_table_for_decode;
d46c5b12 1040 int result = CODING_FINISH_NORMAL;
bdd9fb48 1041
84fbb8a0 1042 if (!NILP (Venable_character_translation) && NILP (translation_table))
f967223b 1043 translation_table = Vstandard_translation_table_for_decode;
4ed46869 1044
d46c5b12 1045 coding->produced_char = 0;
de79a6a5 1046 coding->composed_chars = 0;
fb88bf2d 1047 coding->fake_multibyte = 0;
d46c5b12
KH
1048 while (src < src_end && (dst_bytes
1049 ? (dst < adjusted_dst_end)
1050 : (dst < src - 6)))
4ed46869
KH
1051 {
1052 /* SRC_BASE remembers the start position in source in each loop.
1053 The loop will be exited when there's not enough source text
1054 to analyze long escape sequence or 2-byte code (within macros
1055 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
1056 to SRC_BASE before exiting. */
1057 unsigned char *src_base = src;
bdd9fb48 1058 int c1 = *src++, c2;
4ed46869
KH
1059
1060 switch (iso_code_class [c1])
1061 {
1062 case ISO_0x20_or_0x7F:
1063 if (!coding->composing
1064 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
1065 {
1066 /* This is SPACE or DEL. */
1067 *dst++ = c1;
d46c5b12 1068 coding->produced_char++;
4ed46869
KH
1069 break;
1070 }
1071 /* This is a graphic character, we fall down ... */
1072
1073 case ISO_graphic_plane_0:
1074 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1075 {
1076 /* This is a composition rule. */
1077 *dst++ = c1 | 0x80;
1078 coding->composing = COMPOSING_WITH_RULE_TAIL;
1079 }
1080 else
1081 DECODE_ISO_CHARACTER (charset0, c1);
1082 break;
1083
1084 case ISO_0xA0_or_0xFF:
d46c5b12
KH
1085 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1086 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
fb88bf2d 1087 goto label_invalid_code;
4ed46869
KH
1088 /* This is a graphic character, we fall down ... */
1089
1090 case ISO_graphic_plane_1:
d46c5b12 1091 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
fb88bf2d 1092 goto label_invalid_code;
d46c5b12
KH
1093 else
1094 DECODE_ISO_CHARACTER (charset1, c1);
4ed46869
KH
1095 break;
1096
1097 case ISO_control_code:
1098 /* All ISO2022 control characters in this class have the
1099 same representation in Emacs internal format. */
d46c5b12
KH
1100 if (c1 == '\n'
1101 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1102 && (coding->eol_type == CODING_EOL_CR
1103 || coding->eol_type == CODING_EOL_CRLF))
1104 {
1105 result = CODING_FINISH_INCONSISTENT_EOL;
1106 goto label_end_of_loop_2;
1107 }
4ed46869 1108 *dst++ = c1;
d46c5b12 1109 coding->produced_char++;
174a4cbe
KH
1110 if (c1 >= 0x80)
1111 coding->fake_multibyte = 1;
4ed46869
KH
1112 break;
1113
1114 case ISO_carriage_return:
1115 if (coding->eol_type == CODING_EOL_CR)
d46c5b12 1116 *dst++ = '\n';
4ed46869
KH
1117 else if (coding->eol_type == CODING_EOL_CRLF)
1118 {
1119 ONE_MORE_BYTE (c1);
1120 if (c1 == ISO_CODE_LF)
1121 *dst++ = '\n';
1122 else
1123 {
d46c5b12
KH
1124 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1125 {
1126 result = CODING_FINISH_INCONSISTENT_EOL;
1127 goto label_end_of_loop_2;
1128 }
4ed46869 1129 src--;
d46c5b12 1130 *dst++ = '\r';
4ed46869
KH
1131 }
1132 }
1133 else
d46c5b12
KH
1134 *dst++ = c1;
1135 coding->produced_char++;
4ed46869
KH
1136 break;
1137
1138 case ISO_shift_out:
d46c5b12
KH
1139 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1140 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1141 goto label_invalid_code;
4ed46869
KH
1142 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1143 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1144 break;
1145
1146 case ISO_shift_in:
d46c5b12
KH
1147 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1148 goto label_invalid_code;
4ed46869
KH
1149 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1150 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1151 break;
1152
1153 case ISO_single_shift_2_7:
1154 case ISO_single_shift_2:
d46c5b12
KH
1155 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1156 goto label_invalid_code;
4ed46869
KH
1157 /* SS2 is handled as an escape sequence of ESC 'N' */
1158 c1 = 'N';
1159 goto label_escape_sequence;
1160
1161 case ISO_single_shift_3:
d46c5b12
KH
1162 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1163 goto label_invalid_code;
4ed46869
KH
1164 /* SS2 is handled as an escape sequence of ESC 'O' */
1165 c1 = 'O';
1166 goto label_escape_sequence;
1167
1168 case ISO_control_sequence_introducer:
1169 /* CSI is handled as an escape sequence of ESC '[' ... */
1170 c1 = '[';
1171 goto label_escape_sequence;
1172
1173 case ISO_escape:
1174 ONE_MORE_BYTE (c1);
1175 label_escape_sequence:
1176 /* Escape sequences handled by Emacs are invocation,
1177 designation, direction specification, and character
1178 composition specification. */
1179 switch (c1)
1180 {
1181 case '&': /* revision of following character set */
1182 ONE_MORE_BYTE (c1);
1183 if (!(c1 >= '@' && c1 <= '~'))
d46c5b12 1184 goto label_invalid_code;
4ed46869
KH
1185 ONE_MORE_BYTE (c1);
1186 if (c1 != ISO_CODE_ESC)
d46c5b12 1187 goto label_invalid_code;
4ed46869
KH
1188 ONE_MORE_BYTE (c1);
1189 goto label_escape_sequence;
1190
1191 case '$': /* designation of 2-byte character set */
d46c5b12
KH
1192 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1193 goto label_invalid_code;
4ed46869
KH
1194 ONE_MORE_BYTE (c1);
1195 if (c1 >= '@' && c1 <= 'B')
1196 { /* designation of JISX0208.1978, GB2312.1980,
88993dfd 1197 or JISX0208.1980 */
4ed46869
KH
1198 DECODE_DESIGNATION (0, 2, 94, c1);
1199 }
1200 else if (c1 >= 0x28 && c1 <= 0x2B)
1201 { /* designation of DIMENSION2_CHARS94 character set */
1202 ONE_MORE_BYTE (c2);
1203 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1204 }
1205 else if (c1 >= 0x2C && c1 <= 0x2F)
1206 { /* designation of DIMENSION2_CHARS96 character set */
1207 ONE_MORE_BYTE (c2);
1208 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1209 }
1210 else
d46c5b12 1211 goto label_invalid_code;
4ed46869
KH
1212 break;
1213
1214 case 'n': /* invocation of locking-shift-2 */
d46c5b12
KH
1215 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1216 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1217 goto label_invalid_code;
4ed46869 1218 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
e0e989f6 1219 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
1220 break;
1221
1222 case 'o': /* invocation of locking-shift-3 */
d46c5b12
KH
1223 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1224 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1225 goto label_invalid_code;
4ed46869 1226 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
e0e989f6 1227 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
1228 break;
1229
1230 case 'N': /* invocation of single-shift-2 */
d46c5b12
KH
1231 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1232 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1233 goto label_invalid_code;
4ed46869
KH
1234 ONE_MORE_BYTE (c1);
1235 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1236 DECODE_ISO_CHARACTER (charset, c1);
1237 break;
1238
1239 case 'O': /* invocation of single-shift-3 */
d46c5b12
KH
1240 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1241 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1242 goto label_invalid_code;
4ed46869
KH
1243 ONE_MORE_BYTE (c1);
1244 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1245 DECODE_ISO_CHARACTER (charset, c1);
1246 break;
1247
d46c5b12
KH
1248 case '0': case '2': /* start composing */
1249 /* Before processing composing, we must be sure that all
1250 characters being composed are supported by CODING.
88993dfd
KH
1251 If not, we must give up composing. */
1252 if (check_composing_code (coding, src, src_end) == 0)
1253 {
1254 /* We are looking at a valid composition sequence. */
1255 coding->composing = (c1 == '0'
1256 ? COMPOSING_NO_RULE_HEAD
1257 : COMPOSING_WITH_RULE_HEAD);
1258 coding->composed_chars = 0;
1259 }
1260 else
1261 {
1262 *dst++ = ISO_CODE_ESC;
1263 *dst++ = c1;
1264 coding->produced_char += 2;
1265 }
4ed46869
KH
1266 break;
1267
1268 case '1': /* end composing */
88993dfd
KH
1269 if (!coding->composing)
1270 {
1271 *dst++ = ISO_CODE_ESC;
1272 *dst++ = c1;
1273 coding->produced_char += 2;
1274 break;
1275 }
1276
de79a6a5
KH
1277 if (coding->composed_chars > 0)
1278 {
1279 if (coding->composed_chars == 1)
1280 {
1281 unsigned char *this_char_start = dst;
1282 int this_bytes;
1283
1284 /* Only one character is in the composing
1285 sequence. Make it a normal character. */
1286 while (*--this_char_start != LEADING_CODE_COMPOSITION);
1287 dst = (this_char_start
1288 + (coding->composing == COMPOSING_NO_RULE_TAIL
1289 ? 1 : 2));
1290 *dst -= 0x20;
1291 if (*dst == 0x80)
1292 *++dst &= 0x7F;
1293 this_bytes = BYTES_BY_CHAR_HEAD (*dst);
1294 while (this_bytes--) *this_char_start++ = *dst++;
1295 dst = this_char_start;
1296 }
1297 coding->produced_char++;
1298 }
4ed46869 1299 coding->composing = COMPOSING_NO;
4ed46869
KH
1300 break;
1301
1302 case '[': /* specification of direction */
d46c5b12
KH
1303 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1304 goto label_invalid_code;
4ed46869 1305 /* For the moment, nested direction is not supported.
d46c5b12
KH
1306 So, `coding->mode & CODING_MODE_DIRECTION' zero means
1307 left-to-right, and nozero means right-to-left. */
4ed46869
KH
1308 ONE_MORE_BYTE (c1);
1309 switch (c1)
1310 {
1311 case ']': /* end of the current direction */
d46c5b12 1312 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869
KH
1313
1314 case '0': /* end of the current direction */
1315 case '1': /* start of left-to-right direction */
1316 ONE_MORE_BYTE (c1);
1317 if (c1 == ']')
d46c5b12 1318 coding->mode &= ~CODING_MODE_DIRECTION;
4ed46869 1319 else
d46c5b12 1320 goto label_invalid_code;
4ed46869
KH
1321 break;
1322
1323 case '2': /* start of right-to-left direction */
1324 ONE_MORE_BYTE (c1);
1325 if (c1 == ']')
d46c5b12 1326 coding->mode |= CODING_MODE_DIRECTION;
4ed46869 1327 else
d46c5b12 1328 goto label_invalid_code;
4ed46869
KH
1329 break;
1330
1331 default:
d46c5b12 1332 goto label_invalid_code;
4ed46869
KH
1333 }
1334 break;
1335
1336 default:
d46c5b12
KH
1337 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1338 goto label_invalid_code;
4ed46869
KH
1339 if (c1 >= 0x28 && c1 <= 0x2B)
1340 { /* designation of DIMENSION1_CHARS94 character set */
1341 ONE_MORE_BYTE (c2);
1342 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1343 }
1344 else if (c1 >= 0x2C && c1 <= 0x2F)
1345 { /* designation of DIMENSION1_CHARS96 character set */
1346 ONE_MORE_BYTE (c2);
1347 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1348 }
1349 else
1350 {
d46c5b12 1351 goto label_invalid_code;
4ed46869
KH
1352 }
1353 }
1354 /* We must update these variables now. */
1355 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1356 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1357 break;
1358
d46c5b12 1359 label_invalid_code:
d46c5b12
KH
1360 while (src_base < src)
1361 *dst++ = *src_base++;
fb88bf2d 1362 coding->fake_multibyte = 1;
4ed46869
KH
1363 }
1364 continue;
1365
1366 label_end_of_loop:
d46c5b12
KH
1367 result = CODING_FINISH_INSUFFICIENT_SRC;
1368 label_end_of_loop_2:
4ed46869
KH
1369 src = src_base;
1370 break;
1371 }
1372
fb88bf2d 1373 if (src < src_end)
4ed46869 1374 {
fb88bf2d
KH
1375 if (result == CODING_FINISH_NORMAL)
1376 result = CODING_FINISH_INSUFFICIENT_DST;
1377 else if (result != CODING_FINISH_INCONSISTENT_EOL
1378 && coding->mode & CODING_MODE_LAST_BLOCK)
1379 {
1380 /* This is the last block of the text to be decoded. We had
1381 better just flush out all remaining codes in the text
1382 although they are not valid characters. */
1383 src_bytes = src_end - src;
1384 if (dst_bytes && (dst_end - dst < src_bytes))
1385 src_bytes = dst_end - dst;
1386 bcopy (src, dst, src_bytes);
1387 dst += src_bytes;
1388 src += src_bytes;
1389 coding->fake_multibyte = 1;
1390 }
4ed46869 1391 }
fb88bf2d 1392
d46c5b12
KH
1393 coding->consumed = coding->consumed_char = src - source;
1394 coding->produced = dst - destination;
1395 return result;
4ed46869
KH
1396}
1397
f4dee582 1398/* ISO2022 encoding stuff. */
4ed46869
KH
1399
1400/*
f4dee582 1401 It is not enough to say just "ISO2022" on encoding, we have to
d46c5b12 1402 specify more details. In Emacs, each coding system of ISO2022
4ed46869
KH
1403 variant has the following specifications:
1404 1. Initial designation to G0 thru G3.
1405 2. Allows short-form designation?
1406 3. ASCII should be designated to G0 before control characters?
1407 4. ASCII should be designated to G0 at end of line?
1408 5. 7-bit environment or 8-bit environment?
1409 6. Use locking-shift?
1410 7. Use Single-shift?
1411 And the following two are only for Japanese:
1412 8. Use ASCII in place of JIS0201-1976-Roman?
1413 9. Use JISX0208-1983 in place of JISX0208-1978?
1414 These specifications are encoded in `coding->flags' as flag bits
1415 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
f4dee582 1416 details.
4ed46869
KH
1417*/
1418
/* Produce codes (escape sequence) for designating CHARSET to graphic
   register REG, writing them at `dst' of the expansion site and
   recording the new designation in CODING.  If <final-char> of
   CHARSET is '@', 'A', or 'B', REG is 0, and the coding system
   CODING allows, produce designation sequence of short-form.  A
   revision number below 255 is announced first with "ESC & <rev>".  */

#define ENCODE_DESIGNATION(charset, reg, coding) \
  do { \
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \
    char *inter_94 = "()*+"; \
    char *inter_96 = ",-./"; \
    int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset); \
    if (revision < 255) \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = '&'; \
        *dst++ = '@' + revision; \
      } \
    *dst++ = ISO_CODE_ESC; \
    if (CHARSET_DIMENSION (charset) == 1) \
      { \
        if (CHARSET_CHARS (charset) == 94) \
          *dst++ = (unsigned char) (inter_94[reg]); \
        else \
          *dst++ = (unsigned char) (inter_96[reg]); \
      } \
    else \
      { \
        *dst++ = '$'; \
        if (CHARSET_CHARS (charset) != 94) \
          *dst++ = (unsigned char) (inter_96[reg]); \
        else if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
                 || reg != 0 \
                 || final_char < '@' || final_char > 'B') \
          /* Long form: the intermediate character is required.  */ \
          *dst++ = (unsigned char) (inter_94[reg]); \
      } \
    *dst++ = final_char; \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
  } while (0)
1460
/* The following two macros produce codes (control character or escape
   sequence) for ISO2022 single-shift functions (single-shift-2 and
   single-shift-3).  In a 7-bit environment the escape-sequence form
   is written; otherwise the 8-bit control code is written and
   coding->fake_multibyte is set.  Both use `coding' and `dst' of the
   expansion site and mark CODING as single-shifting.  */

#define ENCODE_SINGLE_SHIFT_2 \
  do { \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = 'N'; \
      } \
    else \
      { \
        *dst++ = ISO_CODE_SS2; \
        coding->fake_multibyte = 1; \
      } \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3 \
  do { \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = 'O'; \
      } \
    else \
      { \
        *dst++ = ISO_CODE_SS3; \
        coding->fake_multibyte = 1; \
      } \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
  } while (0)
1488
/* The following four macros produce codes (control character or
   escape sequence) for ISO2022 locking-shift functions (shift-in,
   shift-out, locking-shift-2, and locking-shift-3).  Each writes at
   `dst' of the expansion site and records in `coding' which graphic
   register is now invoked to graphic plane 0.  */

#define ENCODE_SHIFT_IN \
  do { \
    *dst++ = ISO_CODE_SI; \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
  } while (0)

#define ENCODE_SHIFT_OUT \
  do { \
    *dst++ = ISO_CODE_SO; \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2 \
  do { \
    *dst++ = ISO_CODE_ESC; \
    *dst++ = 'n'; \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3 \
  do { \
    *dst++ = ISO_CODE_ESC; \
    *dst++ = 'o'; \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
  } while (0)
1516
f4dee582
RS
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   The body is a `do ... while (1)' loop: every branch that emits the
   character leaves it with `break', while the last branch emits the
   needed designation/invocation and loops to try again.  Uses
   `coding' and `dst' of the expansion site.  CHARSET and C1 are
   parenthesized so that arbitrary expressions may be passed.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
  do { \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
      { \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
          *dst++ = (c1) & 0x7F; \
        else \
          *dst++ = (c1) | 0x80; \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
        break; \
      } \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
      { \
        *dst++ = (c1) & 0x7F; \
        break; \
      } \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
      { \
        *dst++ = (c1) | 0x80; \
        break; \
      } \
    else if (coding->flags & CODING_FLAG_ISO_SAFE \
             && !coding->safe_charsets[charset]) \
      { \
        /* We should not encode this character, instead produce one or \
           two `?'s.  */ \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
        if (CHARSET_WIDTH (charset) == 2) \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
        break; \
      } \
    else \
      /* Since CHARSET is not yet invoked to any graphic planes, we \
         must invoke it, or, at first, designate it to some graphic \
         register.  Then repeat the loop to actually produce the \
         character.  */ \
      dst = encode_invocation_designation (charset, coding, dst); \
  } while (1)
1560
f4dee582
RS
/* Produce codes for a DIMENSION2 character whose character set is
   CHARSET and whose position-codes are C1 and C2.  Designation and
   invocation codes are also produced in advance if necessary.

   The body is a `do ... while (1)' loop: every branch that emits the
   character leaves it with `break', while the last branch emits the
   needed designation/invocation and loops to try again.  Uses
   `coding' and `dst' of the expansion site.  CHARSET, C1 and C2 are
   parenthesized so that arbitrary expressions may be passed.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
  do { \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
      { \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
          *dst++ = (c1) & 0x7F, *dst++ = (c2) & 0x7F; \
        else \
          *dst++ = (c1) | 0x80, *dst++ = (c2) | 0x80; \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
        break; \
      } \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
      { \
        *dst++ = (c1) & 0x7F, *dst++ = (c2) & 0x7F; \
        break; \
      } \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
      { \
        *dst++ = (c1) | 0x80, *dst++ = (c2) | 0x80; \
        break; \
      } \
    else if (coding->flags & CODING_FLAG_ISO_SAFE \
             && !coding->safe_charsets[charset]) \
      { \
        /* We should not encode this character, instead produce one or \
           two `?'s.  */ \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
        if (CHARSET_WIDTH (charset) == 2) \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
        break; \
      } \
    else \
      /* Since CHARSET is not yet invoked to any graphic planes, we \
         must invoke it, or, at first, designate it to some graphic \
         register.  Then repeat the loop to actually produce the \
         character.  */ \
      dst = encode_invocation_designation (charset, coding, dst); \
  } while (1)
1603
84fbb8a0
KH
/* Encode the character of CHARSET with position codes C1 and C2,
   after mapping it through `translation_table' of the expansion site
   when that table is non-nil.  C1 and C2 must be lvalues because
   SPLIT_CHAR may overwrite them.  coding->consumed_char is advanced
   unless a composition is in progress.

   NOTE(review): the USE_ROMAN / USE_OLDJIS substitutions test the
   untranslated CHARSET, not the charset obtained from the translation
   table -- confirm this is intended.  */
#define ENCODE_ISO_CHARACTER(charset, c1, c2) \
  do { \
    int c_alt, charset_alt; \
    if (!NILP (translation_table) \
        && ((c_alt = translate_char (translation_table, -1, \
                                     charset, c1, c2)) \
            >= 0)) \
      { \
        SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
      } \
    else \
      { \
        charset_alt = charset; \
      } \
    if (CHARSET_DIMENSION (charset_alt) == 1) \
      { \
        if (charset == CHARSET_ASCII \
            && coding->flags & CODING_FLAG_ISO_USE_ROMAN) \
          { \
            charset_alt = charset_latin_jisx0201; \
          } \
        ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1); \
      } \
    else \
      { \
        if (charset == charset_jisx0208 \
            && coding->flags & CODING_FLAG_ISO_USE_OLDJIS) \
          { \
            charset_alt = charset_jisx0208_1978; \
          } \
        ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
      } \
    if (! COMPOSING_P (coding->composing)) \
      { \
        coding->consumed_char++; \
      } \
  } while (0)
bdd9fb48 1631
4ed46869
KH
1632/* Produce designation and invocation codes at a place pointed by DST
1633 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1634 Return new DST. */
1635
1636unsigned char *
1637encode_invocation_designation (charset, coding, dst)
1638 int charset;
1639 struct coding_system *coding;
1640 unsigned char *dst;
1641{
1642 int reg; /* graphic register number */
1643
1644 /* At first, check designations. */
1645 for (reg = 0; reg < 4; reg++)
1646 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1647 break;
1648
1649 if (reg >= 4)
1650 {
1651 /* CHARSET is not yet designated to any graphic registers. */
1652 /* At first check the requested designation. */
1653 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1ba9e4ab
KH
1654 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1655 /* Since CHARSET requests no special designation, designate it
1656 to graphic register 0. */
4ed46869
KH
1657 reg = 0;
1658
1659 ENCODE_DESIGNATION (charset, reg, coding);
1660 }
1661
1662 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1663 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1664 {
1665 /* Since the graphic register REG is not invoked to any graphic
1666 planes, invoke it to graphic plane 0. */
1667 switch (reg)
1668 {
1669 case 0: /* graphic register 0 */
1670 ENCODE_SHIFT_IN;
1671 break;
1672
1673 case 1: /* graphic register 1 */
1674 ENCODE_SHIFT_OUT;
1675 break;
1676
1677 case 2: /* graphic register 2 */
1678 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1679 ENCODE_SINGLE_SHIFT_2;
1680 else
1681 ENCODE_LOCKING_SHIFT_2;
1682 break;
1683
1684 case 3: /* graphic register 3 */
1685 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1686 ENCODE_SINGLE_SHIFT_3;
1687 else
1688 ENCODE_LOCKING_SHIFT_3;
1689 break;
1690 }
1691 }
1692 return dst;
1693}
1694
/* The following three macros produce codes for indicating
   composition.  They write at `dst' of the expansion site.  */
#define ENCODE_COMPOSITION_NO_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '0'
#define ENCODE_COMPOSITION_WITH_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '2'
#define ENCODE_COMPOSITION_END *dst++ = ISO_CODE_ESC, *dst++ = '1'

/* The following three macros produce codes for indicating direction
   of text.  They use `coding' and `dst' of the expansion site and are
   to be used as statements.

   The 7-bit test is a bit test (`&'), like the one in the
   single-shift macros: comparing the whole flag word with `==' would
   select the escape-sequence form only when no other flag is set.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
  do { \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
      *dst++ = ISO_CODE_ESC, *dst++ = '['; \
    else \
      *dst++ = ISO_CODE_CSI; \
  } while (0)

/* A `do ... while (0)' cannot be followed by a comma operator, so the
   two direction macros are complete statements of their own.  */
#define ENCODE_DIRECTION_R2L \
  do { \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER; \
    *dst++ = '2'; \
    *dst++ = ']'; \
  } while (0)

#define ENCODE_DIRECTION_L2R \
  do { \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER; \
    *dst++ = '0'; \
    *dst++ = ']'; \
  } while (0)
1715
/* Produce codes for designation and invocation to reset the graphic
   planes and registers to initial state: shift in if graphic plane 0
   does not have register 0 invoked, then re-designate every register
   whose current designation differs from its (non-negative) initial
   one.  Uses `coding' and `dst' of the expansion site.  */
#define ENCODE_RESET_PLANE_AND_REGISTER \
  do { \
    int reg; \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
      { \
        ENCODE_SHIFT_IN; \
      } \
    for (reg = 0; reg < 4; reg++) \
      { \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0 \
            && (CODING_SPEC_ISO_DESIGNATION (coding, reg) \
                != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))) \
          { \
            ENCODE_DESIGNATION \
              (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
          } \
      } \
  } while (0)
1730
bdd9fb48 1731/* Produce designation sequences of charsets in the line started from
d46c5b12 1732 SRC to a place pointed by *DSTP, and update DSTP.
bdd9fb48
KH
1733
1734 If the current block ends before any end-of-line, we may fail to
d46c5b12
KH
1735 find all the necessary designations. */
1736
dfcf069d 1737void
bdd9fb48 1738encode_designation_at_bol (coding, table, src, src_end, dstp)
e0e989f6 1739 struct coding_system *coding;
bdd9fb48 1740 Lisp_Object table;
e0e989f6
KH
1741 unsigned char *src, *src_end, **dstp;
1742{
bdd9fb48
KH
1743 int charset, c, found = 0, reg;
1744 /* Table of charsets to be designated to each graphic register. */
1745 int r[4];
1746 unsigned char *dst = *dstp;
1747
1748 for (reg = 0; reg < 4; reg++)
1749 r[reg] = -1;
1750
1751 while (src < src_end && *src != '\n' && found < 4)
e0e989f6 1752 {
bdd9fb48
KH
1753 int bytes = BYTES_BY_CHAR_HEAD (*src);
1754
1755 if (NILP (table))
1756 charset = CHARSET_AT (src);
1757 else
e0e989f6 1758 {
35cb8686
RS
1759 int c_alt;
1760 unsigned char c1, c2;
bdd9fb48
KH
1761
1762 SPLIT_STRING(src, bytes, charset, c1, c2);
84fbb8a0 1763 if ((c_alt = translate_char (table, -1, charset, c1, c2)) >= 0)
bdd9fb48 1764 charset = CHAR_CHARSET (c_alt);
e0e989f6 1765 }
bdd9fb48 1766
e0e989f6 1767 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
d46c5b12 1768 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
bdd9fb48
KH
1769 {
1770 found++;
1771 r[reg] = charset;
1772 }
1773
1774 src += bytes;
1775 }
1776
1777 if (found)
1778 {
1779 for (reg = 0; reg < 4; reg++)
1780 if (r[reg] >= 0
1781 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1782 ENCODE_DESIGNATION (r[reg], reg, coding);
1783 *dstp = dst;
e0e989f6 1784 }
e0e989f6
KH
1785}
1786
4ed46869
KH
1787/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1788
1789int
d46c5b12 1790encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
1791 struct coding_system *coding;
1792 unsigned char *source, *destination;
1793 int src_bytes, dst_bytes;
4ed46869
KH
1794{
1795 unsigned char *src = source;
1796 unsigned char *src_end = source + src_bytes;
1797 unsigned char *dst = destination;
1798 unsigned char *dst_end = destination + dst_bytes;
e0e989f6 1799 /* Since the maximum bytes produced by each loop is 20, we subtract 19
4ed46869
KH
1800 from DST_END to assure overflow checking is necessary only at the
1801 head of loop. */
e0e989f6 1802 unsigned char *adjusted_dst_end = dst_end - 19;
84fbb8a0 1803 Lisp_Object translation_table
f967223b 1804 = coding->translation_table_for_encode;
d46c5b12 1805 int result = CODING_FINISH_NORMAL;
bdd9fb48 1806
84fbb8a0 1807 if (!NILP (Venable_character_translation) && NILP (translation_table))
f967223b 1808 translation_table = Vstandard_translation_table_for_encode;
4ed46869 1809
d46c5b12 1810 coding->consumed_char = 0;
fb88bf2d 1811 coding->fake_multibyte = 0;
d46c5b12
KH
1812 while (src < src_end && (dst_bytes
1813 ? (dst < adjusted_dst_end)
1814 : (dst < src - 19)))
4ed46869
KH
1815 {
1816 /* SRC_BASE remembers the start position in source in each loop.
1817 The loop will be exited when there's not enough source text
1818 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1819 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1820 reset to SRC_BASE before exiting. */
1821 unsigned char *src_base = src;
bdd9fb48 1822 int charset, c1, c2, c3, c4;
4ed46869 1823
e0e989f6
KH
1824 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1825 && CODING_SPEC_ISO_BOL (coding))
1826 {
bdd9fb48 1827 /* We have to produce designation sequences if any now. */
84fbb8a0 1828 encode_designation_at_bol (coding, translation_table,
bdd9fb48 1829 src, src_end, &dst);
e0e989f6
KH
1830 CODING_SPEC_ISO_BOL (coding) = 0;
1831 }
1832
1833 c1 = *src++;
4ed46869 1834 /* If we are seeing a component of a composite character, we are
d46c5b12
KH
1835 seeing a leading-code encoded irregularly for composition, or
1836 a composition rule if composing with rule. We must set C1 to
1837 a normal leading-code or an ASCII code. If we are not seeing
1838 a composite character, we must reset composition,
1839 designation, and invocation states. */
4ed46869
KH
1840 if (COMPOSING_P (coding->composing))
1841 {
1842 if (c1 < 0xA0)
1843 {
1844 /* We are not in a composite character any longer. */
1845 coding->composing = COMPOSING_NO;
d46c5b12 1846 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1847 ENCODE_COMPOSITION_END;
1848 }
1849 else
1850 {
1851 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1852 {
1853 *dst++ = c1 & 0x7F;
1854 coding->composing = COMPOSING_WITH_RULE_HEAD;
1855 continue;
1856 }
1857 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1858 coding->composing = COMPOSING_WITH_RULE_RULE;
1859 if (c1 == 0xA0)
1860 {
1861 /* This is an ASCII component. */
1862 ONE_MORE_BYTE (c1);
1863 c1 &= 0x7F;
1864 }
1865 else
1866 /* This is a leading-code of non ASCII component. */
1867 c1 -= 0x20;
1868 }
1869 }
1870
1871 /* Now encode one character. C1 is a control character, an
1872 ASCII character, or a leading-code of multi-byte character. */
1873 switch (emacs_code_class[c1])
1874 {
1875 case EMACS_ascii_code:
bdd9fb48 1876 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
4ed46869
KH
1877 break;
1878
1879 case EMACS_control_code:
1880 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1881 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869 1882 *dst++ = c1;
d46c5b12 1883 coding->consumed_char++;
4ed46869
KH
1884 break;
1885
1886 case EMACS_carriage_return_code:
d46c5b12 1887 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
4ed46869
KH
1888 {
1889 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1890 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869 1891 *dst++ = c1;
d46c5b12 1892 coding->consumed_char++;
4ed46869
KH
1893 break;
1894 }
1895 /* fall down to treat '\r' as '\n' ... */
1896
1897 case EMACS_linefeed_code:
1898 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
e0e989f6
KH
1899 ENCODE_RESET_PLANE_AND_REGISTER;
1900 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1901 bcopy (coding->spec.iso2022.initial_designation,
1902 coding->spec.iso2022.current_designation,
1903 sizeof coding->spec.iso2022.initial_designation);
4ed46869 1904 if (coding->eol_type == CODING_EOL_LF
0ef69138 1905 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
1906 *dst++ = ISO_CODE_LF;
1907 else if (coding->eol_type == CODING_EOL_CRLF)
1908 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1909 else
1910 *dst++ = ISO_CODE_CR;
e0e989f6 1911 CODING_SPEC_ISO_BOL (coding) = 1;
d46c5b12 1912 coding->consumed_char++;
4ed46869
KH
1913 break;
1914
1915 case EMACS_leading_code_2:
1916 ONE_MORE_BYTE (c2);
19a8d9e0
KH
1917 if (c2 < 0xA0)
1918 {
1919 /* invalid sequence */
1920 *dst++ = c1;
38cf95df
RS
1921 src--;
1922 coding->consumed_char++;
19a8d9e0
KH
1923 }
1924 else
1925 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
1926 break;
1927
1928 case EMACS_leading_code_3:
1929 TWO_MORE_BYTES (c2, c3);
19a8d9e0
KH
1930 if (c2 < 0xA0 || c3 < 0xA0)
1931 {
1932 /* invalid sequence */
1933 *dst++ = c1;
38cf95df
RS
1934 src -= 2;
1935 coding->consumed_char++;
19a8d9e0
KH
1936 }
1937 else if (c1 < LEADING_CODE_PRIVATE_11)
bdd9fb48 1938 ENCODE_ISO_CHARACTER (c1, c2, c3);
4ed46869 1939 else
bdd9fb48 1940 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
4ed46869
KH
1941 break;
1942
1943 case EMACS_leading_code_4:
1944 THREE_MORE_BYTES (c2, c3, c4);
19a8d9e0
KH
1945 if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1946 {
1947 /* invalid sequence */
1948 *dst++ = c1;
38cf95df
RS
1949 src -= 3;
1950 coding->consumed_char++;
19a8d9e0
KH
1951 }
1952 else
1953 ENCODE_ISO_CHARACTER (c2, c3, c4);
4ed46869
KH
1954 break;
1955
1956 case EMACS_leading_code_composition:
19a8d9e0
KH
1957 ONE_MORE_BYTE (c2);
1958 if (c2 < 0xA0)
1959 {
1960 /* invalid sequence */
1961 *dst++ = c1;
38cf95df
RS
1962 src--;
1963 coding->consumed_char++;
19a8d9e0
KH
1964 }
1965 else if (c2 == 0xFF)
4ed46869 1966 {
d46c5b12 1967 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1968 coding->composing = COMPOSING_WITH_RULE_HEAD;
1969 ENCODE_COMPOSITION_WITH_RULE_START;
d46c5b12 1970 coding->consumed_char++;
4ed46869
KH
1971 }
1972 else
1973 {
d46c5b12 1974 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1975 /* Rewind one byte because it is a character code of
1976 composition elements. */
1977 src--;
1978 coding->composing = COMPOSING_NO_RULE_HEAD;
1979 ENCODE_COMPOSITION_NO_RULE_START;
d46c5b12 1980 coding->consumed_char++;
4ed46869
KH
1981 }
1982 break;
1983
1984 case EMACS_invalid_code:
3efbce95
KH
1985 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1986 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869 1987 *dst++ = c1;
d46c5b12 1988 coding->consumed_char++;
4ed46869
KH
1989 break;
1990 }
1991 continue;
1992 label_end_of_loop:
d46c5b12
KH
1993 result = CODING_FINISH_INSUFFICIENT_SRC;
1994 src = src_base;
4ed46869
KH
1995 break;
1996 }
1997
49cb52b4
KH
1998 if (src < src_end && result == CODING_FINISH_NORMAL)
1999 result = CODING_FINISH_INSUFFICIENT_DST;
2000
2001 /* If this is the last block of the text to be encoded, we must
2002 reset graphic planes and registers to the initial state, and
2003 flush out the carryover if any. */
2004 if (coding->mode & CODING_MODE_LAST_BLOCK)
84fbb8a0
KH
2005 {
2006 ENCODE_RESET_PLANE_AND_REGISTER;
2007 if (COMPOSING_P (coding->composing))
2008 ENCODE_COMPOSITION_END;
88993dfd
KH
2009 if (result == CODING_FINISH_INSUFFICIENT_SRC)
2010 {
2011 while (src < src_end && dst < dst_end)
2012 *dst++ = *src++;
2013 }
84fbb8a0 2014 }
d46c5b12
KH
2015 coding->consumed = src - source;
2016 coding->produced = coding->produced_char = dst - destination;
2017 return result;
4ed46869
KH
2018}
2019
2020\f
2021/*** 4. SJIS and BIG5 handlers ***/
2022
f4dee582 2023/* Although SJIS and BIG5 are not ISO's coding system, they are used
4ed46869
KH
2024 quite widely. So, for the moment, Emacs supports them in the bare
2025 C code. But, in the future, they may be supported only by CCL. */
2026
2027/* SJIS is a coding system encoding three character sets: ASCII, right
2028 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2029 as is. A character of charset katakana-jisx0201 is encoded by
2030 "position-code + 0x80". A character of charset japanese-jisx0208
2031 is encoded in 2-byte but two position-codes are divided and shifted
   so that they fit in the range below.
2033
2034 --- CODE RANGE of SJIS ---
2035 (character set) (range)
2036 ASCII 0x00 .. 0x7F
2037 KATAKANA-JISX0201 0xA0 .. 0xDF
54f78171 2038 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
d14d03ac 2039 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
4ed46869
KH
2040 -------------------------------
2041
2042*/
2043
2044/* BIG5 is a coding system encoding two character sets: ASCII and
2045 Big5. An ASCII character is encoded as is. Big5 is a two-byte
   character set and is encoded in two bytes.
2047
2048 --- CODE RANGE of BIG5 ---
2049 (character set) (range)
2050 ASCII 0x00 .. 0x7F
2051 Big5 (1st byte) 0xA1 .. 0xFE
2052 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2053 --------------------------
2054
2055 Since the number of characters in Big5 is larger than maximum
2056 characters in Emacs' charset (96x96), it can't be handled as one
2057 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2058 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2059 contains frequently used characters and the latter contains less
2060 frequently used characters. */
2061
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Every macro argument is parenthesized in the expansions so that an
   expression such as `x & 0xFF' can be passed safely.  Arguments are
   still evaluated more than once, so they must be free of side
   effects.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Convert the BIG5 byte pair B1, B2 into CHARSET, C1 and C2.  The
   pair is first turned into a linear index; first bytes from 0xC9
   upward belong to `charset_big5_2', whose index restarts at zero.
   The index is then split into two position codes of 0xFF - 0xA1
   values each, starting at 0x21.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)				\
  do {									\
    unsigned int temp							\
      = (((b1) - 0xA1) * BIG5_SAME_ROW					\
	 + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));				\
    if ((b1) < 0xC9)							\
      (charset) = charset_big5_1;					\
    else								\
      {									\
	(charset) = charset_big5_2;					\
	temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;				\
      }									\
    (c1) = temp / (0xFF - 0xA1) + 0x21;					\
    (c2) = temp % (0xFF - 0xA1) + 0x21;					\
  } while (0)

/* Inverse of DECODE_BIG5: convert CHARSET, C1 and C2 (position codes
   starting at 0x21) into the BIG5 byte pair B1, B2.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)				\
  do {									\
    unsigned int temp							\
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);			\
    if ((charset) == charset_big5_2)					\
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);				\
    (b1) = temp / BIG5_SAME_ROW + 0xA1;					\
    (b2) = temp % BIG5_SAME_ROW;					\
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;					\
  } while (0)
2094
/* Produce one decoded character of CHARSET whose position codes are
   C1 and C2.  When the variable `translation_table' of the expansion
   site is non-nil the character is first looked up with
   `translate_char'; a non-negative result replaces CHARSET, C1 and C2
   (via SPLIT_CHAR).  The character is then handed to the
   DECODE_CHARACTER_* macro matching the resulting charset: the ASCII
   one for CHARSET_ASCII or a negative charset, otherwise the one for
   the charset's dimension.  C1 and C2 must be assignable.  */
#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)			\
  do {									\
    int c_alt, charset_alt = (charset);					\
									\
    if (!NILP (translation_table))					\
      {									\
	c_alt = translate_char (translation_table,			\
				-1, (charset), c1, c2);			\
	if (c_alt >= 0)							\
	  {								\
	    SPLIT_CHAR (c_alt, charset_alt, c1, c2);			\
	  }								\
      }									\
    if (charset_alt == CHARSET_ASCII || charset_alt < 0)		\
      {									\
	DECODE_CHARACTER_ASCII (c1);					\
      }									\
    else if (CHARSET_DIMENSION (charset_alt) == 1)			\
      {									\
	DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);			\
      }									\
    else								\
      {									\
	DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);		\
      }									\
  } while (0)
2109
/* Encode one character of CHARSET with position codes C1 and C2 and
   append the result through DST.  Uses the variables
   `translation_table', `sjis_p', `coding' and `dst' of the expansion
   site; C1 and C2 must be assignable.

   When `translation_table' is non-nil and `translate_char' returns a
   non-negative character, that character replaces CHARSET, C1 and C2.
   Output by resulting charset:

     ASCII                                   1 byte  (C1)
     katakana-jisx0201 when SJIS_P           1 byte  (C1)
     any other 1-dimensional charset         2 bytes (charset, C1)
     jisx0208 when SJIS_P                    2 bytes (ENCODE_SJIS)
     big5-1 or big5-2 when not SJIS_P        2 bytes (ENCODE_BIG5)
     any other 2-dimensional charset         3 bytes (charset, C1, C2)

   `coding->fake_multibyte' is set on every path except ASCII,
   katakana-jisx0201 and Big5, and `coding->consumed_char' is always
   incremented.

   The expansion deliberately does not end in a semicolon, so that
   `if (x) ENCODE_SJIS_BIG5_CHARACTER (...); else ...' is valid.  */
#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)			\
  do {									\
    int c_alt, charset_alt;						\
									\
    if (!NILP (translation_table)					\
	&& ((c_alt = translate_char (translation_table, -1,		\
				     (charset), c1, c2))		\
	    >= 0))							\
      {									\
	SPLIT_CHAR (c_alt, charset_alt, c1, c2);			\
      }									\
    else								\
      charset_alt = (charset);						\
    if (charset_alt == charset_ascii)					\
      *dst++ = c1;							\
    else if (CHARSET_DIMENSION (charset_alt) == 1)			\
      {									\
	if (sjis_p && charset_alt == charset_katakana_jisx0201)		\
	  *dst++ = c1;							\
	else								\
	  {								\
	    *dst++ = charset_alt, *dst++ = c1;				\
	    coding->fake_multibyte = 1;					\
	  }								\
      }									\
    else								\
      {									\
	c1 &= 0x7F, c2 &= 0x7F;						\
	if (sjis_p && charset_alt == charset_jisx0208)			\
	  {								\
	    unsigned char s1, s2;					\
									\
	    ENCODE_SJIS (c1, c2, s1, s2);				\
	    *dst++ = s1, *dst++ = s2;					\
	    coding->fake_multibyte = 1;					\
	  }								\
	else if (!sjis_p						\
		 && (charset_alt == charset_big5_1			\
		     || charset_alt == charset_big5_2))			\
	  {								\
	    unsigned char b1, b2;					\
									\
	    ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);			\
	    *dst++ = b1, *dst++ = b2;					\
	  }								\
	else								\
	  {								\
	    *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;		\
	    coding->fake_multibyte = 1;					\
	  }								\
      }									\
    coding->consumed_char++;						\
  } while (0)
2160
4ed46869
KH
2161/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2162 Check if a text is encoded in SJIS. If it is, return
2163 CODING_CATEGORY_MASK_SJIS, else return 0. */
2164
2165int
2166detect_coding_sjis (src, src_end)
2167 unsigned char *src, *src_end;
2168{
2169 unsigned char c;
2170
2171 while (src < src_end)
2172 {
2173 c = *src++;
4ed46869
KH
2174 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2175 {
2176 if (src < src_end && *src++ < 0x40)
2177 return 0;
2178 }
2179 }
2180 return CODING_CATEGORY_MASK_SJIS;
2181}
2182
2183/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2184 Check if a text is encoded in BIG5. If it is, return
2185 CODING_CATEGORY_MASK_BIG5, else return 0. */
2186
2187int
2188detect_coding_big5 (src, src_end)
2189 unsigned char *src, *src_end;
2190{
2191 unsigned char c;
2192
2193 while (src < src_end)
2194 {
2195 c = *src++;
4ed46869
KH
2196 if (c >= 0xA1)
2197 {
2198 if (src >= src_end)
2199 break;
2200 c = *src++;
2201 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2202 return 0;
2203 }
2204 }
2205 return CODING_CATEGORY_MASK_BIG5;
2206}
2207
2208/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2209 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
2210
2211int
2212decode_coding_sjis_big5 (coding, source, destination,
d46c5b12 2213 src_bytes, dst_bytes, sjis_p)
4ed46869
KH
2214 struct coding_system *coding;
2215 unsigned char *source, *destination;
2216 int src_bytes, dst_bytes;
4ed46869
KH
2217 int sjis_p;
2218{
2219 unsigned char *src = source;
2220 unsigned char *src_end = source + src_bytes;
2221 unsigned char *dst = destination;
2222 unsigned char *dst_end = destination + dst_bytes;
2223 /* Since the maximum bytes produced by each loop is 4, we subtract 3
2224 from DST_END to assure overflow checking is necessary only at the
2225 head of loop. */
2226 unsigned char *adjusted_dst_end = dst_end - 3;
84fbb8a0 2227 Lisp_Object translation_table
f967223b 2228 = coding->translation_table_for_decode;
d46c5b12 2229 int result = CODING_FINISH_NORMAL;
a5d301df 2230
84fbb8a0 2231 if (!NILP (Venable_character_translation) && NILP (translation_table))
f967223b 2232 translation_table = Vstandard_translation_table_for_decode;
4ed46869 2233
d46c5b12 2234 coding->produced_char = 0;
fb88bf2d 2235 coding->fake_multibyte = 0;
d46c5b12
KH
2236 while (src < src_end && (dst_bytes
2237 ? (dst < adjusted_dst_end)
2238 : (dst < src - 3)))
4ed46869
KH
2239 {
2240 /* SRC_BASE remembers the start position in source in each loop.
2241 The loop will be exited when there's not enough source text
2242 to analyze two-byte character (within macro ONE_MORE_BYTE).
2243 In that case, SRC is reset to SRC_BASE before exiting. */
2244 unsigned char *src_base = src;
2245 unsigned char c1 = *src++, c2, c3, c4;
2246
d46c5b12 2247 if (c1 < 0x20)
4ed46869 2248 {
d46c5b12 2249 if (c1 == '\r')
4ed46869 2250 {
d46c5b12
KH
2251 if (coding->eol_type == CODING_EOL_CRLF)
2252 {
2253 ONE_MORE_BYTE (c2);
2254 if (c2 == '\n')
2255 *dst++ = c2;
2256 else if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2257 {
2258 result = CODING_FINISH_INCONSISTENT_EOL;
2259 goto label_end_of_loop_2;
2260 }
2261 else
2262 /* To process C2 again, SRC is subtracted by 1. */
2263 *dst++ = c1, src--;
2264 }
2265 else if (coding->eol_type == CODING_EOL_CR)
2266 *dst++ = '\n';
4ed46869 2267 else
d46c5b12
KH
2268 *dst++ = c1;
2269 }
2270 else if (c1 == '\n'
2271 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2272 && (coding->eol_type == CODING_EOL_CR
2273 || coding->eol_type == CODING_EOL_CRLF))
2274 {
2275 result = CODING_FINISH_INCONSISTENT_EOL;
2276 goto label_end_of_loop_2;
4ed46869
KH
2277 }
2278 else
2279 *dst++ = c1;
d46c5b12 2280 coding->produced_char++;
4ed46869 2281 }
a5d301df
KH
2282 else if (c1 < 0x80)
2283 DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
54f78171 2284 else
4ed46869 2285 {
4ed46869
KH
2286 if (sjis_p)
2287 {
54f78171 2288 if (c1 < 0xA0 || (c1 >= 0xE0 && c1 < 0xF0))
fb88bf2d 2289 {
54f78171
KH
2290 /* SJIS -> JISX0208 */
2291 ONE_MORE_BYTE (c2);
d14d03ac 2292 if (c2 >= 0x40 && c2 != 0x7F && c2 <= 0xFC)
54f78171
KH
2293 {
2294 DECODE_SJIS (c1, c2, c3, c4);
2295 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
2296 }
2297 else
2298 goto label_invalid_code_2;
fb88bf2d 2299 }
54f78171
KH
2300 else if (c1 < 0xE0)
2301 /* SJIS -> JISX0201-Kana */
2302 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1,
2303 /* dummy */ c2);
fb88bf2d 2304 else
54f78171 2305 goto label_invalid_code_1;
4ed46869 2306 }
fb88bf2d 2307 else
fb88bf2d 2308 {
54f78171
KH
2309 /* BIG5 -> Big5 */
2310 if (c1 >= 0xA1 && c1 <= 0xFE)
fb88bf2d 2311 {
54f78171
KH
2312 ONE_MORE_BYTE (c2);
2313 if ((c2 >= 0x40 && c2 <= 0x7E) || (c2 >= 0xA1 && c2 <= 0xFE))
2314 {
2315 int charset;
4ed46869 2316
54f78171
KH
2317 DECODE_BIG5 (c1, c2, charset, c3, c4);
2318 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
2319 }
2320 else
2321 goto label_invalid_code_2;
fb88bf2d
KH
2322 }
2323 else
54f78171 2324 goto label_invalid_code_1;
4ed46869
KH
2325 }
2326 }
2327 continue;
2328
fb88bf2d
KH
2329 label_invalid_code_1:
2330 *dst++ = c1;
2331 coding->produced_char++;
2332 coding->fake_multibyte = 1;
2333 continue;
2334
2335 label_invalid_code_2:
2336 *dst++ = c1; *dst++= c2;
2337 coding->produced_char += 2;
2338 coding->fake_multibyte = 1;
2339 continue;
2340
4ed46869 2341 label_end_of_loop:
d46c5b12
KH
2342 result = CODING_FINISH_INSUFFICIENT_SRC;
2343 label_end_of_loop_2:
4ed46869
KH
2344 src = src_base;
2345 break;
2346 }
2347
fb88bf2d
KH
2348 if (src < src_end)
2349 {
2350 if (result == CODING_FINISH_NORMAL)
2351 result = CODING_FINISH_INSUFFICIENT_DST;
2352 else if (result != CODING_FINISH_INCONSISTENT_EOL
2353 && coding->mode & CODING_MODE_LAST_BLOCK)
2354 {
2355 src_bytes = src_end - src;
2356 if (dst_bytes && (dst_end - dst < src_bytes))
2357 src_bytes = dst_end - dst;
2358 bcopy (dst, src, src_bytes);
2359 src += src_bytes;
2360 dst += src_bytes;
2361 coding->fake_multibyte = 1;
2362 }
2363 }
d46c5b12
KH
2364
2365 coding->consumed = coding->consumed_char = src - source;
2366 coding->produced = dst - destination;
2367 return result;
4ed46869
KH
2368}
2369
2370/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2371 This function can encode `charset_ascii', `charset_katakana_jisx0201',
2372 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
2373 sure that all these charsets are registered as official charset
2374 (i.e. do not have extended leading-codes). Characters of other
2375 charsets are produced without any encoding. If SJIS_P is 1, encode
2376 SJIS text, else encode BIG5 text. */
2377
2378int
2379encode_coding_sjis_big5 (coding, source, destination,
d46c5b12 2380 src_bytes, dst_bytes, sjis_p)
4ed46869
KH
2381 struct coding_system *coding;
2382 unsigned char *source, *destination;
2383 int src_bytes, dst_bytes;
4ed46869
KH
2384 int sjis_p;
2385{
2386 unsigned char *src = source;
2387 unsigned char *src_end = source + src_bytes;
2388 unsigned char *dst = destination;
2389 unsigned char *dst_end = destination + dst_bytes;
2390 /* Since the maximum bytes produced by each loop is 2, we subtract 1
2391 from DST_END to assure overflow checking is necessary only at the
2392 head of loop. */
2393 unsigned char *adjusted_dst_end = dst_end - 1;
84fbb8a0 2394 Lisp_Object translation_table
f967223b 2395 = coding->translation_table_for_encode;
d46c5b12 2396 int result = CODING_FINISH_NORMAL;
a5d301df 2397
84fbb8a0 2398 if (!NILP (Venable_character_translation) && NILP (translation_table))
f967223b 2399 translation_table = Vstandard_translation_table_for_encode;
4ed46869 2400
d46c5b12 2401 coding->consumed_char = 0;
fb88bf2d 2402 coding->fake_multibyte = 0;
d46c5b12
KH
2403 while (src < src_end && (dst_bytes
2404 ? (dst < adjusted_dst_end)
2405 : (dst < src - 1)))
4ed46869
KH
2406 {
2407 /* SRC_BASE remembers the start position in source in each loop.
2408 The loop will be exited when there's not enough source text
2409 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2410 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
2411 before exiting. */
2412 unsigned char *src_base = src;
2413 unsigned char c1 = *src++, c2, c3, c4;
2414
2415 if (coding->composing)
2416 {
2417 if (c1 == 0xA0)
2418 {
2419 ONE_MORE_BYTE (c1);
2420 c1 &= 0x7F;
2421 }
2422 else if (c1 >= 0xA0)
2423 c1 -= 0x20;
2424 else
2425 coding->composing = 0;
2426 }
2427
2428 switch (emacs_code_class[c1])
2429 {
2430 case EMACS_ascii_code:
a5d301df
KH
2431 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2432 break;
2433
4ed46869
KH
2434 case EMACS_control_code:
2435 *dst++ = c1;
d46c5b12 2436 coding->consumed_char++;
4ed46869
KH
2437 break;
2438
2439 case EMACS_carriage_return_code:
d46c5b12 2440 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
4ed46869
KH
2441 {
2442 *dst++ = c1;
d46c5b12 2443 coding->consumed_char++;
4ed46869
KH
2444 break;
2445 }
2446 /* fall down to treat '\r' as '\n' ... */
2447
2448 case EMACS_linefeed_code:
2449 if (coding->eol_type == CODING_EOL_LF
0ef69138 2450 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2451 *dst++ = '\n';
2452 else if (coding->eol_type == CODING_EOL_CRLF)
2453 *dst++ = '\r', *dst++ = '\n';
2454 else
2455 *dst++ = '\r';
d46c5b12 2456 coding->consumed_char++;
4ed46869
KH
2457 break;
2458
2459 case EMACS_leading_code_2:
2460 ONE_MORE_BYTE (c2);
a5d301df 2461 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
2462 break;
2463
2464 case EMACS_leading_code_3:
2465 TWO_MORE_BYTES (c2, c3);
a5d301df 2466 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
4ed46869
KH
2467 break;
2468
2469 case EMACS_leading_code_4:
2470 THREE_MORE_BYTES (c2, c3, c4);
a5d301df 2471 ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
4ed46869
KH
2472 break;
2473
2474 case EMACS_leading_code_composition:
2475 coding->composing = 1;
2476 break;
2477
2478 default: /* i.e. case EMACS_invalid_code: */
2479 *dst++ = c1;
d46c5b12 2480 coding->consumed_char++;
4ed46869
KH
2481 }
2482 continue;
2483
2484 label_end_of_loop:
d46c5b12
KH
2485 result = CODING_FINISH_INSUFFICIENT_SRC;
2486 src = src_base;
4ed46869
KH
2487 break;
2488 }
2489
d46c5b12
KH
2490 if (result == CODING_FINISH_NORMAL
2491 && src < src_end)
2492 result = CODING_FINISH_INSUFFICIENT_DST;
2493 coding->consumed = src - source;
2494 coding->produced = coding->produced_char = dst - destination;
2495 return result;
4ed46869
KH
2496}
2497
2498\f
1397dc18
KH
2499/*** 5. CCL handlers ***/
2500
2501/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2502 Check if a text is encoded in a coding system of which
2503 encoder/decoder are written in CCL program. If it is, return
2504 CODING_CATEGORY_MASK_CCL, else return 0. */
2505
2506int
2507detect_coding_ccl (src, src_end)
2508 unsigned char *src, *src_end;
2509{
2510 unsigned char *valid;
2511
2512 /* No coding system is assigned to coding-category-ccl. */
2513 if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
2514 return 0;
2515
2516 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
2517 while (src < src_end)
2518 {
2519 if (! valid[*src]) return 0;
2520 src++;
2521 }
2522 return CODING_CATEGORY_MASK_CCL;
2523}
2524
2525\f
2526/*** 6. End-of-line handlers ***/
4ed46869
KH
2527
2528/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2529 This function is called only when `coding->eol_type' is
2530 CODING_EOL_CRLF or CODING_EOL_CR. */
2531
dfcf069d 2532int
d46c5b12 2533decode_eol (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
2534 struct coding_system *coding;
2535 unsigned char *source, *destination;
2536 int src_bytes, dst_bytes;
4ed46869
KH
2537{
2538 unsigned char *src = source;
2539 unsigned char *src_end = source + src_bytes;
2540 unsigned char *dst = destination;
2541 unsigned char *dst_end = destination + dst_bytes;
fb88bf2d 2542 unsigned char c;
d46c5b12
KH
2543 int result = CODING_FINISH_NORMAL;
2544
fb88bf2d
KH
2545 coding->fake_multibyte = 0;
2546
d46c5b12
KH
2547 if (src_bytes <= 0)
2548 return result;
4ed46869
KH
2549
2550 switch (coding->eol_type)
2551 {
2552 case CODING_EOL_CRLF:
2553 {
2554 /* Since the maximum bytes produced by each loop is 2, we
2555 subtract 1 from DST_END to assure overflow checking is
2556 necessary only at the head of loop. */
2557 unsigned char *adjusted_dst_end = dst_end - 1;
2558
d46c5b12
KH
2559 while (src < src_end && (dst_bytes
2560 ? (dst < adjusted_dst_end)
2561 : (dst < src - 1)))
4ed46869
KH
2562 {
2563 unsigned char *src_base = src;
fb88bf2d
KH
2564
2565 c = *src++;
4ed46869
KH
2566 if (c == '\r')
2567 {
2568 ONE_MORE_BYTE (c);
fdfcf19d
KH
2569 if (c == '\n')
2570 *dst++ = c;
2571 else
d46c5b12
KH
2572 {
2573 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2574 {
2575 result = CODING_FINISH_INCONSISTENT_EOL;
2576 goto label_end_of_loop_2;
2577 }
fdfcf19d 2578 src--;
d46c5b12 2579 *dst++ = '\r';
fb88bf2d
KH
2580 if (BASE_LEADING_CODE_P (c))
2581 coding->fake_multibyte = 1;
d46c5b12 2582 }
4ed46869 2583 }
d46c5b12
KH
2584 else if (c == '\n'
2585 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2586 {
2587 result = CODING_FINISH_INCONSISTENT_EOL;
2588 goto label_end_of_loop_2;
2589 }
4ed46869 2590 else
fb88bf2d
KH
2591 {
2592 *dst++ = c;
2593 if (BASE_LEADING_CODE_P (c))
2594 coding->fake_multibyte = 1;
2595 }
4ed46869
KH
2596 continue;
2597
2598 label_end_of_loop:
d46c5b12
KH
2599 result = CODING_FINISH_INSUFFICIENT_SRC;
2600 label_end_of_loop_2:
4ed46869
KH
2601 src = src_base;
2602 break;
2603 }
fdfcf19d
KH
2604 if (src < src_end)
2605 {
2606 if (result == CODING_FINISH_NORMAL)
2607 result = CODING_FINISH_INSUFFICIENT_DST;
2608 else if (result != CODING_FINISH_INCONSISTENT_EOL
2609 && coding->mode & CODING_MODE_LAST_BLOCK)
2610 {
2611 /* This is the last block of the text to be decoded.
2612 We flush out all remaining codes. */
2613 src_bytes = src_end - src;
2614 if (dst_bytes && (dst_end - dst < src_bytes))
2615 src_bytes = dst_end - dst;
2616 bcopy (src, dst, src_bytes);
2617 dst += src_bytes;
2618 src += src_bytes;
2619 }
2620 }
4ed46869 2621 }
d46c5b12 2622 break;
4ed46869
KH
2623
2624 case CODING_EOL_CR:
d46c5b12
KH
2625 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2626 {
fb88bf2d
KH
2627 while (src < src_end)
2628 {
2629 if ((c = *src++) == '\n')
2630 break;
2631 if (BASE_LEADING_CODE_P (c))
2632 coding->fake_multibyte = 1;
2633 }
d46c5b12
KH
2634 if (*--src == '\n')
2635 {
2636 src_bytes = src - source;
2637 result = CODING_FINISH_INCONSISTENT_EOL;
2638 }
2639 }
2640 if (dst_bytes && src_bytes > dst_bytes)
2641 {
2642 result = CODING_FINISH_INSUFFICIENT_DST;
2643 src_bytes = dst_bytes;
2644 }
2645 if (dst_bytes)
2646 bcopy (source, destination, src_bytes);
2647 else
2648 safe_bcopy (source, destination, src_bytes);
2649 src = source + src_bytes;
2650 while (src_bytes--) if (*dst++ == '\r') dst[-1] = '\n';
4ed46869
KH
2651 break;
2652
2653 default: /* i.e. case: CODING_EOL_LF */
d46c5b12
KH
2654 if (dst_bytes && src_bytes > dst_bytes)
2655 {
2656 result = CODING_FINISH_INSUFFICIENT_DST;
2657 src_bytes = dst_bytes;
2658 }
2659 if (dst_bytes)
2660 bcopy (source, destination, src_bytes);
2661 else
2662 safe_bcopy (source, destination, src_bytes);
2663 src += src_bytes;
993824c9 2664 dst += src_bytes;
fb88bf2d 2665 coding->fake_multibyte = 1;
4ed46869
KH
2666 break;
2667 }
2668
d46c5b12
KH
2669 coding->consumed = coding->consumed_char = src - source;
2670 coding->produced = coding->produced_char = dst - destination;
2671 return result;
4ed46869
KH
2672}
2673
2674/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2675 format of end-of-line according to `coding->eol_type'. If
d46c5b12
KH
2676 `coding->mode & CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code
2677 '\r' in source text also means end-of-line. */
4ed46869 2678
dfcf069d 2679int
d46c5b12 2680encode_eol (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
2681 struct coding_system *coding;
2682 unsigned char *source, *destination;
2683 int src_bytes, dst_bytes;
4ed46869
KH
2684{
2685 unsigned char *src = source;
2686 unsigned char *dst = destination;
d46c5b12 2687 int result = CODING_FINISH_NORMAL;
4ed46869 2688
fb88bf2d
KH
2689 coding->fake_multibyte = 0;
2690
d46c5b12
KH
2691 if (coding->eol_type == CODING_EOL_CRLF)
2692 {
2693 unsigned char c;
2694 unsigned char *src_end = source + src_bytes;
2695 unsigned char *dst_end = destination + dst_bytes;
2696 /* Since the maximum bytes produced by each loop is 2, we
2697 subtract 1 from DST_END to assure overflow checking is
2698 necessary only at the head of loop. */
2699 unsigned char *adjusted_dst_end = dst_end - 1;
2700
2701 while (src < src_end && (dst_bytes
2702 ? (dst < adjusted_dst_end)
2703 : (dst < src - 1)))
2704 {
2705 c = *src++;
2706 if (c == '\n'
2707 || (c == '\r' && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)))
2708 *dst++ = '\r', *dst++ = '\n';
2709 else
fb88bf2d
KH
2710 {
2711 *dst++ = c;
2712 if (BASE_LEADING_CODE_P (c))
2713 coding->fake_multibyte = 1;
2714 }
d46c5b12
KH
2715 }
2716 if (src < src_end)
2717 result = CODING_FINISH_INSUFFICIENT_DST;
2718 }
2719 else
4ed46869 2720 {
fb88bf2d
KH
2721 unsigned char c;
2722
d46c5b12 2723 if (dst_bytes && src_bytes > dst_bytes)
4ed46869 2724 {
d46c5b12
KH
2725 src_bytes = dst_bytes;
2726 result = CODING_FINISH_INSUFFICIENT_DST;
2727 }
2728 if (dst_bytes)
2729 bcopy (source, destination, src_bytes);
2730 else
993824c9
RS
2731 safe_bcopy (source, destination, src_bytes);
2732 dst_bytes = src_bytes;
2733 if (coding->eol_type == CODING_EOL_CR)
d46c5b12
KH
2734 {
2735 while (src_bytes--)
fb88bf2d
KH
2736 {
2737 if ((c = *dst++) == '\n')
2738 dst[-1] = '\r';
2739 else if (BASE_LEADING_CODE_P (c))
993824c9 2740 coding->fake_multibyte = 1;
fb88bf2d 2741 }
d46c5b12 2742 }
fb88bf2d 2743 else
d46c5b12 2744 {
fb88bf2d
KH
2745 if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2746 {
2747 while (src_bytes--)
2748 if (*dst++ == '\r') dst[-1] = '\n';
2749 }
2750 coding->fake_multibyte = 1;
4ed46869 2751 }
fb88bf2d
KH
2752 src = source + dst_bytes;
2753 dst = destination + dst_bytes;
4ed46869
KH
2754 }
2755
d46c5b12
KH
2756 coding->consumed = coding->consumed_char = src - source;
2757 coding->produced = coding->produced_char = dst - destination;
2758 return result;
4ed46869
KH
2759}
2760
2761\f
1397dc18 2762/*** 7. C library functions ***/
4ed46869
KH
2763
2764/* In Emacs Lisp, coding system is represented by a Lisp symbol which
2765 has a property `coding-system'. The value of this property is a
2766 vector of length 5 (called as coding-vector). Among elements of
2767 this vector, the first (element[0]) and the fifth (element[4])
2768 carry important information for decoding/encoding. Before
2769 decoding/encoding, this information should be set in fields of a
2770 structure of type `coding_system'.
2771
2772 A value of property `coding-system' can be a symbol of another
2773 subsidiary coding-system. In that case, Emacs gets coding-vector
2774 from that symbol.
2775
2776 `element[0]' contains information to be set in `coding->type'. The
2777 value and its meaning is as follows:
2778
0ef69138
KH
2779 0 -- coding_type_emacs_mule
2780 1 -- coding_type_sjis
2781 2 -- coding_type_iso2022
2782 3 -- coding_type_big5
2783 4 -- coding_type_ccl encoder/decoder written in CCL
2784 nil -- coding_type_no_conversion
2785 t -- coding_type_undecided (automatic conversion on decoding,
2786 no-conversion on encoding)
4ed46869
KH
2787
2788 `element[4]' contains information to be set in `coding->flags' and
2789 `coding->spec'. The meaning varies by `coding->type'.
2790
2791 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2792 of length 32 (of which the first 13 sub-elements are used now).
2793 Meanings of these sub-elements are:
2794
2795 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2796 If the value is an integer of valid charset, the charset is
2797 assumed to be designated to graphic register N initially.
2798
2799 If the value is minus, it is a minus value of charset which
2800 reserves graphic register N, which means that the charset is
2801 not designated initially but should be designated to graphic
2802 register N just before encoding a character in that charset.
2803
2804 If the value is nil, graphic register N is never used on
2805 encoding.
2806
2807 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2808 Each value takes t or nil. See the section ISO2022 of
2809 `coding.h' for more information.
2810
2811 If `coding->type' is `coding_type_big5', element[4] is t to denote
2812 BIG5-ETen or nil to denote BIG5-HKU.
2813
2814 If `coding->type' takes the other value, element[4] is ignored.
2815
2816 Emacs Lisp's coding system also carries information about format of
2817 end-of-line in a value of property `eol-type'. If the value is
2818 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2819 means CODING_EOL_CR. If it is not integer, it should be a vector
2820 of subsidiary coding systems of which property `eol-type' has one
2821 of above values.
2822
2823*/
2824
2825/* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2826 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2827 is setup so that no conversion is necessary and return -1, else
2828 return 0. */
2829
2830int
e0e989f6
KH
2831setup_coding_system (coding_system, coding)
2832 Lisp_Object coding_system;
4ed46869
KH
2833 struct coding_system *coding;
2834{
d46c5b12 2835 Lisp_Object coding_spec, coding_type, eol_type, plist;
4608c386 2836 Lisp_Object val;
70c22245 2837 int i;
4ed46869 2838
d46c5b12 2839 /* Initialize some fields required for all kinds of coding systems. */
774324d6 2840 coding->symbol = coding_system;
d46c5b12
KH
2841 coding->common_flags = 0;
2842 coding->mode = 0;
2843 coding->heading_ascii = -1;
2844 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
4608c386
KH
2845 coding_spec = Fget (coding_system, Qcoding_system);
2846 if (!VECTORP (coding_spec)
2847 || XVECTOR (coding_spec)->size != 5
2848 || !CONSP (XVECTOR (coding_spec)->contents[3]))
4ed46869 2849 goto label_invalid_coding_system;
4608c386 2850
d46c5b12
KH
2851 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
2852 if (VECTORP (eol_type))
2853 {
2854 coding->eol_type = CODING_EOL_UNDECIDED;
2855 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2856 }
2857 else if (XFASTINT (eol_type) == 1)
2858 {
2859 coding->eol_type = CODING_EOL_CRLF;
2860 coding->common_flags
2861 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2862 }
2863 else if (XFASTINT (eol_type) == 2)
2864 {
2865 coding->eol_type = CODING_EOL_CR;
2866 coding->common_flags
2867 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2868 }
2869 else
2870 coding->eol_type = CODING_EOL_LF;
2871
2872 coding_type = XVECTOR (coding_spec)->contents[0];
2873 /* Try short cut. */
2874 if (SYMBOLP (coding_type))
2875 {
2876 if (EQ (coding_type, Qt))
2877 {
2878 coding->type = coding_type_undecided;
2879 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
2880 }
2881 else
2882 coding->type = coding_type_no_conversion;
2883 return 0;
2884 }
2885
2886 /* Initialize remaining fields. */
2887 coding->composing = 0;
d46c5b12
KH
2888
2889 /* Get values of coding system properties:
2890 `post-read-conversion', `pre-write-conversion',
f967223b 2891 `translation-table-for-decode', `translation-table-for-encode'. */
4608c386
KH
2892 plist = XVECTOR (coding_spec)->contents[3];
2893 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
2894 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
f967223b 2895 val = Fplist_get (plist, Qtranslation_table_for_decode);
4608c386 2896 if (SYMBOLP (val))
f967223b
KH
2897 val = Fget (val, Qtranslation_table_for_decode);
2898 coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
2899 val = Fplist_get (plist, Qtranslation_table_for_encode);
4608c386 2900 if (SYMBOLP (val))
f967223b
KH
2901 val = Fget (val, Qtranslation_table_for_encode);
2902 coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
d46c5b12
KH
2903 val = Fplist_get (plist, Qcoding_category);
2904 if (!NILP (val))
2905 {
2906 val = Fget (val, Qcoding_category_index);
2907 if (INTEGERP (val))
2908 coding->category_idx = XINT (val);
2909 else
2910 goto label_invalid_coding_system;
2911 }
2912 else
2913 goto label_invalid_coding_system;
4608c386 2914
70c22245
KH
2915 val = Fplist_get (plist, Qsafe_charsets);
2916 if (EQ (val, Qt))
2917 {
2918 for (i = 0; i <= MAX_CHARSET; i++)
2919 coding->safe_charsets[i] = 1;
2920 }
2921 else
2922 {
2923 bzero (coding->safe_charsets, MAX_CHARSET + 1);
2924 while (CONSP (val))
2925 {
2926 if ((i = get_charset_id (XCONS (val)->car)) >= 0)
2927 coding->safe_charsets[i] = 1;
2928 val = XCONS (val)->cdr;
2929 }
2930 }
2931
d46c5b12 2932 switch (XFASTINT (coding_type))
4ed46869
KH
2933 {
2934 case 0:
0ef69138 2935 coding->type = coding_type_emacs_mule;
c952af22
KH
2936 if (!NILP (coding->post_read_conversion))
2937 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
2938 if (!NILP (coding->pre_write_conversion))
2939 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4ed46869
KH
2940 break;
2941
2942 case 1:
2943 coding->type = coding_type_sjis;
c952af22
KH
2944 coding->common_flags
2945 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869
KH
2946 break;
2947
2948 case 2:
2949 coding->type = coding_type_iso2022;
c952af22
KH
2950 coding->common_flags
2951 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 2952 {
70c22245 2953 Lisp_Object val, temp;
4ed46869 2954 Lisp_Object *flags;
d46c5b12 2955 int i, charset, reg_bits = 0;
4ed46869 2956
4608c386 2957 val = XVECTOR (coding_spec)->contents[4];
f44d27ce 2958
4ed46869
KH
2959 if (!VECTORP (val) || XVECTOR (val)->size != 32)
2960 goto label_invalid_coding_system;
2961
2962 flags = XVECTOR (val)->contents;
2963 coding->flags
2964 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2965 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2966 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2967 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2968 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2969 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2970 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2971 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
e0e989f6
KH
2972 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2973 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
c4825358
KH
2974 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2975 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3f003981 2976 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
c4825358 2977 );
4ed46869
KH
2978
2979 /* Invoke graphic register 0 to plane 0. */
2980 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2981 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2982 CODING_SPEC_ISO_INVOCATION (coding, 1)
2983 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2984 /* Not single shifting at first. */
6e85d753 2985 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
e0e989f6 2986 /* Beginning of buffer should also be regarded as bol. */
6e85d753 2987 CODING_SPEC_ISO_BOL (coding) = 1;
4ed46869 2988
70c22245
KH
2989 for (charset = 0; charset <= MAX_CHARSET; charset++)
2990 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
2991 val = Vcharset_revision_alist;
2992 while (CONSP (val))
2993 {
2994 charset = get_charset_id (Fcar_safe (XCONS (val)->car));
2995 if (charset >= 0
2996 && (temp = Fcdr_safe (XCONS (val)->car), INTEGERP (temp))
2997 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
2998 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
2999 val = XCONS (val)->cdr;
3000 }
3001
4ed46869
KH
3002 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3003 FLAGS[REG] can be one of below:
3004 integer CHARSET: CHARSET occupies register I,
3005 t: designate nothing to REG initially, but can be used
3006 by any charsets,
3007 list of integer, nil, or t: designate the first
3008 element (if integer) to REG initially, the remaining
3009 elements (if integer) is designated to REG on request,
d46c5b12 3010 if an element is t, REG can be used by any charsets,
4ed46869 3011 nil: REG is never used. */
467e7675 3012 for (charset = 0; charset <= MAX_CHARSET; charset++)
1ba9e4ab
KH
3013 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3014 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
4ed46869
KH
3015 for (i = 0; i < 4; i++)
3016 {
3017 if (INTEGERP (flags[i])
e0e989f6
KH
3018 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
3019 || (charset = get_charset_id (flags[i])) >= 0)
4ed46869
KH
3020 {
3021 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3022 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3023 }
3024 else if (EQ (flags[i], Qt))
3025 {
3026 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
d46c5b12
KH
3027 reg_bits |= 1 << i;
3028 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
4ed46869
KH
3029 }
3030 else if (CONSP (flags[i]))
3031 {
84d60297
RS
3032 Lisp_Object tail;
3033 tail = flags[i];
4ed46869 3034
d46c5b12 3035 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
4ed46869
KH
3036 if (INTEGERP (XCONS (tail)->car)
3037 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
3038 CHARSET_VALID_P (charset))
3039 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
4ed46869
KH
3040 {
3041 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3042 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3043 }
3044 else
3045 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3046 tail = XCONS (tail)->cdr;
3047 while (CONSP (tail))
3048 {
3049 if (INTEGERP (XCONS (tail)->car)
3050 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
3051 CHARSET_VALID_P (charset))
3052 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
70c22245
KH
3053 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3054 = i;
4ed46869 3055 else if (EQ (XCONS (tail)->car, Qt))
d46c5b12 3056 reg_bits |= 1 << i;
4ed46869
KH
3057 tail = XCONS (tail)->cdr;
3058 }
3059 }
3060 else
3061 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3062
3063 CODING_SPEC_ISO_DESIGNATION (coding, i)
3064 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3065 }
3066
d46c5b12 3067 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
4ed46869
KH
3068 {
3069 /* REG 1 can be used only by locking shift in 7-bit env. */
3070 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
d46c5b12 3071 reg_bits &= ~2;
4ed46869
KH
3072 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3073 /* Without any shifting, only REG 0 and 1 can be used. */
d46c5b12 3074 reg_bits &= 3;
4ed46869
KH
3075 }
3076
d46c5b12
KH
3077 if (reg_bits)
3078 for (charset = 0; charset <= MAX_CHARSET; charset++)
6e85d753 3079 {
d46c5b12
KH
3080 if (CHARSET_VALID_P (charset))
3081 {
3082 /* There exist some default graphic registers to be
3083 used CHARSET. */
3084
3085 /* We had better avoid designating a charset of
3086 CHARS96 to REG 0 as far as possible. */
3087 if (CHARSET_CHARS (charset) == 96)
3088 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3089 = (reg_bits & 2
3090 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3091 else
3092 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3093 = (reg_bits & 1
3094 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3095 }
6e85d753 3096 }
4ed46869 3097 }
c952af22 3098 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
d46c5b12 3099 coding->spec.iso2022.last_invalid_designation_register = -1;
4ed46869
KH
3100 break;
3101
3102 case 3:
3103 coding->type = coding_type_big5;
c952af22
KH
3104 coding->common_flags
3105 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 3106 coding->flags
4608c386 3107 = (NILP (XVECTOR (coding_spec)->contents[4])
4ed46869
KH
3108 ? CODING_FLAG_BIG5_HKU
3109 : CODING_FLAG_BIG5_ETEN);
3110 break;
3111
3112 case 4:
3113 coding->type = coding_type_ccl;
c952af22
KH
3114 coding->common_flags
3115 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 3116 {
84d60297 3117 Lisp_Object val;
d21ca14d
KH
3118 Lisp_Object decoder, encoder;
3119
84d60297 3120 val = XVECTOR (coding_spec)->contents[4];
4ed46869 3121 if (CONSP (val)
d21ca14d
KH
3122 && SYMBOLP (XCONS (val)->car)
3123 && !NILP (decoder = Fget (XCONS (val)->car, Qccl_program_idx))
f82423d7 3124 && !NILP (decoder = Fcdr (Faref (Vccl_program_table, decoder)))
d21ca14d
KH
3125 && SYMBOLP (XCONS (val)->cdr)
3126 && !NILP (encoder = Fget (XCONS (val)->cdr, Qccl_program_idx))
f82423d7 3127 && !NILP (encoder = Fcdr (Faref (Vccl_program_table, encoder))))
4ed46869 3128 {
d21ca14d
KH
3129 setup_ccl_program (&(coding->spec.ccl.decoder), decoder);
3130 setup_ccl_program (&(coding->spec.ccl.encoder), encoder);
4ed46869
KH
3131 }
3132 else
3133 goto label_invalid_coding_system;
1397dc18
KH
3134
3135 bzero (coding->spec.ccl.valid_codes, 256);
3136 val = Fplist_get (plist, Qvalid_codes);
3137 if (CONSP (val))
3138 {
3139 Lisp_Object this;
3140
7b179c2d 3141 for (; CONSP (val); val = XCONS (val)->cdr)
1397dc18 3142 {
7b179c2d 3143 this = XCONS (val)->car;
1397dc18
KH
3144 if (INTEGERP (this)
3145 && XINT (this) >= 0 && XINT (this) < 256)
3146 coding->spec.ccl.valid_codes[XINT (this)] = 1;
3147 else if (CONSP (this)
3148 && INTEGERP (XCONS (this)->car)
3149 && INTEGERP (XCONS (this)->cdr))
3150 {
3151 int start = XINT (XCONS (this)->car);
3152 int end = XINT (XCONS (this)->cdr);
3153
3154 if (start >= 0 && start <= end && end < 256)
e133c8fa 3155 while (start <= end)
1397dc18
KH
3156 coding->spec.ccl.valid_codes[start++] = 1;
3157 }
3158 }
3159 }
4ed46869 3160 }
c952af22 3161 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
4ed46869
KH
3162 break;
3163
27901516
KH
3164 case 5:
3165 coding->type = coding_type_raw_text;
3166 break;
3167
4ed46869 3168 default:
d46c5b12 3169 goto label_invalid_coding_system;
4ed46869
KH
3170 }
3171 return 0;
3172
3173 label_invalid_coding_system:
3174 coding->type = coding_type_no_conversion;
d46c5b12 3175 coding->category_idx = CODING_CATEGORY_IDX_BINARY;
c952af22 3176 coding->common_flags = 0;
dec137e5 3177 coding->eol_type = CODING_EOL_LF;
d46c5b12 3178 coding->pre_write_conversion = coding->post_read_conversion = Qnil;
4ed46869
KH
3179 return -1;
3180}
3181
54f78171
KH
3182/* Setup raw-text or one of its subsidiaries in the structure
3183 coding_system CODING according to the already setup value eol_type
3184 in CODING. CODING should be setup for some coding system in
3185 advance. */
3186
3187void
3188setup_raw_text_coding_system (coding)
3189 struct coding_system *coding;
3190{
3191 if (coding->type != coding_type_raw_text)
3192 {
3193 coding->symbol = Qraw_text;
3194 coding->type = coding_type_raw_text;
3195 if (coding->eol_type != CODING_EOL_UNDECIDED)
3196 {
84d60297
RS
3197 Lisp_Object subsidiaries;
3198 subsidiaries = Fget (Qraw_text, Qeol_type);
54f78171
KH
3199
3200 if (VECTORP (subsidiaries)
3201 && XVECTOR (subsidiaries)->size == 3)
3202 coding->symbol
3203 = XVECTOR (subsidiaries)->contents[coding->eol_type];
3204 }
3205 }
3206 return;
3207}
3208
4ed46869
KH
3209/* Emacs has a mechanism to automatically detect a coding system if it
3210 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
3211 it's impossible to distinguish some coding systems accurately
3212 because they use the same range of codes. So, at first, coding
3213 systems are categorized as follows:
3214
0ef69138 3215 o coding-category-emacs-mule
4ed46869
KH
3216
3217 The category for a coding system which has the same code range
3218 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 3219 symbol) `emacs-mule' by default.
4ed46869
KH
3220
3221 o coding-category-sjis
3222
3223 The category for a coding system which has the same code range
3224 as SJIS. Assigned the coding-system (Lisp
7717c392 3225 symbol) `japanese-shift-jis' by default.
4ed46869
KH
3226
3227 o coding-category-iso-7
3228
3229 The category for a coding system which has the same code range
7717c392 3230 as ISO2022 of 7-bit environment. This doesn't use any locking
d46c5b12
KH
3231 shift and single shift functions. This can encode/decode all
3232 charsets. Assigned the coding-system (Lisp symbol)
3233 `iso-2022-7bit' by default.
3234
3235 o coding-category-iso-7-tight
3236
3237 Same as coding-category-iso-7 except that this can
3238 encode/decode only the specified charsets.
4ed46869
KH
3239
3240 o coding-category-iso-8-1
3241
3242 The category for a coding system which has the same code range
3243 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
3244 for DIMENSION1 charset. This doesn't use any locking shift
3245 and single shift functions. Assigned the coding-system (Lisp
3246 symbol) `iso-latin-1' by default.
4ed46869
KH
3247
3248 o coding-category-iso-8-2
3249
3250 The category for a coding system which has the same code range
3251 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
3252 for DIMENSION2 charset. This doesn't use any locking shift
3253 and single shift functions. Assigned the coding-system (Lisp
3254 symbol) `japanese-iso-8bit' by default.
4ed46869 3255
7717c392 3256 o coding-category-iso-7-else
4ed46869
KH
3257
3258 The category for a coding system which has the same code range
7717c392
KH
3259 as ISO2022 of 7-bit environment but uses locking shift or
3260 single shift functions. Assigned the coding-system (Lisp
3261 symbol) `iso-2022-7bit-lock' by default.
3262
3263 o coding-category-iso-8-else
3264
3265 The category for a coding system which has the same code range
3266 as ISO2022 of 8-bit environment but uses locking shift or
3267 single shift functions. Assigned the coding-system (Lisp
3268 symbol) `iso-2022-8bit-ss2' by default.
4ed46869
KH
3269
3270 o coding-category-big5
3271
3272 The category for a coding system which has the same code range
3273 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 3274 `cn-big5' by default.
4ed46869 3275
1397dc18
KH
3276 o coding-category-ccl
3277
3278 The category for a coding system of which encoder/decoder is
3279 written in CCL programs. The default value is nil, i.e., no
3280 coding system is assigned.
3281
4ed46869
KH
3282 o coding-category-binary
3283
3284 The category for a coding system not categorized in any of the
3285 above. Assigned the coding-system (Lisp symbol)
e0e989f6 3286 `no-conversion' by default.
4ed46869
KH
3287
3288 Each of them is a Lisp symbol and the value is an actual
3289 `coding-system's (this is also a Lisp symbol) assigned by a user.
3290 What Emacs does actually is to detect a category of coding system.
3291 Then, it uses a `coding-system' assigned to it. If Emacs can't
3292 decide only one possible category, it selects a category of the
3293 highest priority. Priorities of categories are also specified by a
3294 user in a Lisp variable `coding-category-list'.
3295
3296*/
3297
66cfb530
KH
3298static
3299int ascii_skip_code[256];
3300
d46c5b12 3301/* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4ed46869
KH
3302 If it detects possible coding systems, return an integer in which
3303 appropriate flag bits are set. Flag bits are defined by macros
d46c5b12 3304 CODING_CATEGORY_MASK_XXX in `coding.h'.
4ed46869 3305
d46c5b12
KH
3306 How many ASCII characters are at the head is returned as *SKIP. */
3307
3308static int
3309detect_coding_mask (source, src_bytes, priorities, skip)
3310 unsigned char *source;
3311 int src_bytes, *priorities, *skip;
4ed46869
KH
3312{
3313 register unsigned char c;
d46c5b12 3314 unsigned char *src = source, *src_end = source + src_bytes;
66cfb530 3315 unsigned int mask;
d46c5b12 3316 int i;
4ed46869
KH
3317
3318 /* At first, skip all ASCII characters and control characters except
3319 for three ISO2022 specific control characters. */
66cfb530
KH
3320 ascii_skip_code[ISO_CODE_SO] = 0;
3321 ascii_skip_code[ISO_CODE_SI] = 0;
3322 ascii_skip_code[ISO_CODE_ESC] = 0;
3323
bcf26d6a 3324 label_loop_detect_coding:
66cfb530 3325 while (src < src_end && ascii_skip_code[*src]) src++;
d46c5b12 3326 *skip = src - source;
4ed46869
KH
3327
3328 if (src >= src_end)
3329 /* We found nothing other than ASCII. There's nothing to do. */
d46c5b12 3330 return 0;
4ed46869 3331
8a8147d6 3332 c = *src;
4ed46869
KH
3333 /* The text seems to be encoded in some multilingual coding system.
3334 Now, try to find in which coding system the text is encoded. */
3335 if (c < 0x80)
bcf26d6a
KH
3336 {
3337 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3338 /* C is an ISO2022 specific control code of C0. */
3339 mask = detect_coding_iso2022 (src, src_end);
1b2af4b0 3340 if (mask == 0)
d46c5b12
KH
3341 {
3342 /* No valid ISO2022 code follows C. Try again. */
3343 src++;
66cfb530
KH
3344 if (c == ISO_CODE_ESC)
3345 ascii_skip_code[ISO_CODE_ESC] = 1;
3346 else
3347 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
d46c5b12
KH
3348 goto label_loop_detect_coding;
3349 }
3350 if (priorities)
3351 goto label_return_highest_only;
bcf26d6a 3352 }
d46c5b12 3353 else
c4825358 3354 {
d46c5b12 3355 int try;
4ed46869 3356
d46c5b12
KH
3357 if (c < 0xA0)
3358 {
3359 /* C is the first byte of SJIS character code,
3360 or a leading-code of Emacs' internal format (emacs-mule). */
3361 try = CODING_CATEGORY_MASK_SJIS | CODING_CATEGORY_MASK_EMACS_MULE;
3362
3363 /* Or, if C is a special latin extra code,
3364 or is an ISO2022 specific control code of C1 (SS2 or SS3),
3365 or is an ISO2022 control-sequence-introducer (CSI),
3366 we should also consider the possibility of ISO2022 codings. */
3367 if ((VECTORP (Vlatin_extra_code_table)
3368 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3369 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3370 || (c == ISO_CODE_CSI
3371 && (src < src_end
3372 && (*src == ']'
3373 || ((*src == '0' || *src == '1' || *src == '2')
3374 && src + 1 < src_end
3375 && src[1] == ']')))))
3376 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3377 | CODING_CATEGORY_MASK_ISO_8BIT);
3378 }
c4825358 3379 else
d46c5b12
KH
3380 /* C is a character of ISO2022 in graphic plane right,
3381 or a SJIS's 1-byte character code (i.e. JISX0201),
3382 or the first byte of BIG5's 2-byte code. */
3383 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3384 | CODING_CATEGORY_MASK_ISO_8BIT
3385 | CODING_CATEGORY_MASK_SJIS
3386 | CODING_CATEGORY_MASK_BIG5);
3387
1397dc18
KH
3388 /* Or, we may have to consider the possibility of CCL. */
3389 if (coding_system_table[CODING_CATEGORY_IDX_CCL]
3390 && (coding_system_table[CODING_CATEGORY_IDX_CCL]
3391 ->spec.ccl.valid_codes)[c])
3392 try |= CODING_CATEGORY_MASK_CCL;
3393
d46c5b12
KH
3394 mask = 0;
3395 if (priorities)
3396 {
3397 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3398 {
5ab13dd0 3399 if (priorities[i] & try & CODING_CATEGORY_MASK_ISO)
d46c5b12 3400 mask = detect_coding_iso2022 (src, src_end);
5ab13dd0 3401 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
d46c5b12 3402 mask = detect_coding_sjis (src, src_end);
5ab13dd0 3403 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
d46c5b12 3404 mask = detect_coding_big5 (src, src_end);
5ab13dd0 3405 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
d46c5b12 3406 mask = detect_coding_emacs_mule (src, src_end);
89fa8b36 3407 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
1397dc18 3408 mask = detect_coding_ccl (src, src_end);
5ab13dd0
RS
3409 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
3410 mask = CODING_CATEGORY_MASK_RAW_TEXT;
3411 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
3412 mask = CODING_CATEGORY_MASK_BINARY;
d46c5b12
KH
3413 if (mask)
3414 goto label_return_highest_only;
3415 }
3416 return CODING_CATEGORY_MASK_RAW_TEXT;
3417 }
3418 if (try & CODING_CATEGORY_MASK_ISO)
3419 mask |= detect_coding_iso2022 (src, src_end);
3420 if (try & CODING_CATEGORY_MASK_SJIS)
3421 mask |= detect_coding_sjis (src, src_end);
3422 if (try & CODING_CATEGORY_MASK_BIG5)
3423 mask |= detect_coding_big5 (src, src_end);
3424 if (try & CODING_CATEGORY_MASK_EMACS_MULE)
1397dc18
KH
3425 mask |= detect_coding_emacs_mule (src, src_end);
3426 if (try & CODING_CATEGORY_MASK_CCL)
3427 mask |= detect_coding_ccl (src, src_end);
c4825358 3428 }
5ab13dd0 3429 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
d46c5b12
KH
3430
3431 label_return_highest_only:
3432 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3433 {
3434 if (mask & priorities[i])
3435 return priorities[i];
3436 }
3437 return CODING_CATEGORY_MASK_RAW_TEXT;
4ed46869
KH
3438}
3439
3440/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3441 The information of the detected coding system is set in CODING. */
3442
3443void
3444detect_coding (coding, src, src_bytes)
3445 struct coding_system *coding;
3446 unsigned char *src;
3447 int src_bytes;
3448{
d46c5b12
KH
3449 unsigned int idx;
3450 int skip, mask, i;
84d60297 3451 Lisp_Object val;
4ed46869 3452
84d60297 3453 val = Vcoding_category_list;
66cfb530 3454 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip);
d46c5b12 3455 coding->heading_ascii = skip;
4ed46869 3456
d46c5b12
KH
3457 if (!mask) return;
3458
3459 /* We found a single coding system of the highest priority in MASK. */
3460 idx = 0;
3461 while (mask && ! (mask & 1)) mask >>= 1, idx++;
3462 if (! mask)
3463 idx = CODING_CATEGORY_IDX_RAW_TEXT;
4ed46869 3464
d46c5b12
KH
3465 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3466
3467 if (coding->eol_type != CODING_EOL_UNDECIDED)
27901516 3468 {
84d60297 3469 Lisp_Object tmp;
d46c5b12 3470
84d60297 3471 tmp = Fget (val, Qeol_type);
d46c5b12
KH
3472 if (VECTORP (tmp))
3473 val = XVECTOR (tmp)->contents[coding->eol_type];
4ed46869 3474 }
d46c5b12
KH
3475 setup_coding_system (val, coding);
3476 /* Set this again because setup_coding_system reset this member. */
3477 coding->heading_ascii = skip;
4ed46869
KH
3478}
3479
d46c5b12
KH
3480/* Detect how end-of-line of a text of length SRC_BYTES pointed by
3481 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3482 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3483
3484 How many non-eol characters are at the head is returned as *SKIP. */
4ed46869 3485
bc4bc72a
RS
3486#define MAX_EOL_CHECK_COUNT 3
3487
d46c5b12
KH
3488static int
3489detect_eol_type (source, src_bytes, skip)
3490 unsigned char *source;
3491 int src_bytes, *skip;
4ed46869 3492{
d46c5b12 3493 unsigned char *src = source, *src_end = src + src_bytes;
4ed46869 3494 unsigned char c;
bc4bc72a
RS
3495 int total = 0; /* How many end-of-lines are found so far. */
3496 int eol_type = CODING_EOL_UNDECIDED;
3497 int this_eol_type;
4ed46869 3498
d46c5b12
KH
3499 *skip = 0;
3500
bc4bc72a 3501 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4ed46869
KH
3502 {
3503 c = *src++;
bc4bc72a 3504 if (c == '\n' || c == '\r')
4ed46869 3505 {
d46c5b12
KH
3506 if (*skip == 0)
3507 *skip = src - 1 - source;
bc4bc72a
RS
3508 total++;
3509 if (c == '\n')
3510 this_eol_type = CODING_EOL_LF;
3511 else if (src >= src_end || *src != '\n')
3512 this_eol_type = CODING_EOL_CR;
4ed46869 3513 else
bc4bc72a
RS
3514 this_eol_type = CODING_EOL_CRLF, src++;
3515
3516 if (eol_type == CODING_EOL_UNDECIDED)
3517 /* This is the first end-of-line. */
3518 eol_type = this_eol_type;
3519 else if (eol_type != this_eol_type)
d46c5b12
KH
3520 {
3521 /* The found type is different from what found before. */
3522 eol_type = CODING_EOL_INCONSISTENT;
3523 break;
3524 }
4ed46869
KH
3525 }
3526 }
bc4bc72a 3527
d46c5b12
KH
3528 if (*skip == 0)
3529 *skip = src_end - source;
85a02ca4 3530 return eol_type;
4ed46869
KH
3531}
3532
3533/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3534 is encoded. If it detects an appropriate format of end-of-line, it
3535 sets the information in *CODING. */
3536
3537void
3538detect_eol (coding, src, src_bytes)
3539 struct coding_system *coding;
3540 unsigned char *src;
3541 int src_bytes;
3542{
4608c386 3543 Lisp_Object val;
d46c5b12
KH
3544 int skip;
3545 int eol_type = detect_eol_type (src, src_bytes, &skip);
3546
3547 if (coding->heading_ascii > skip)
3548 coding->heading_ascii = skip;
3549 else
3550 skip = coding->heading_ascii;
4ed46869 3551
0ef69138 3552 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869 3553 return;
27901516
KH
3554 if (eol_type == CODING_EOL_INCONSISTENT)
3555 {
3556#if 0
3557 /* This code is suppressed until we find a better way to
992f23f2 3558 distinguish raw text file and binary file. */
27901516
KH
3559
3560 /* If we have already detected that the coding is raw-text, the
3561 coding should actually be no-conversion. */
3562 if (coding->type == coding_type_raw_text)
3563 {
3564 setup_coding_system (Qno_conversion, coding);
3565 return;
3566 }
3567 /* Else, let's decode only text code anyway. */
3568#endif /* 0 */
1b2af4b0 3569 eol_type = CODING_EOL_LF;
27901516
KH
3570 }
3571
4608c386 3572 val = Fget (coding->symbol, Qeol_type);
4ed46869 3573 if (VECTORP (val) && XVECTOR (val)->size == 3)
d46c5b12
KH
3574 {
3575 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
3576 coding->heading_ascii = skip;
3577 }
3578}
3579
3580#define CONVERSION_BUFFER_EXTRA_ROOM 256
3581
/* Magnification factor applied to the source size to get the size of
   a decoding buffer for CODING (a pointer to struct coding_system):
   3 for ISO2022, 2 for SJIS and BIG5, 1 for raw-text, the decoder's
   own buf_magnification for CCL, and 2 for anything else.  CODING is
   evaluated several times, so it must have no side effects.  */
#define DECODING_BUFFER_MAG(coding)                                          \
  ((coding)->type == coding_type_iso2022                                     \
   ? 3                                                                       \
   : (((coding)->type == coding_type_sjis                                    \
       || (coding)->type == coding_type_big5)                                \
      ? 2                                                                    \
      : ((coding)->type == coding_type_raw_text                              \
         ? 1                                                                 \
         : ((coding)->type == coding_type_ccl                                \
            ? (coding)->spec.ccl.decoder.buf_magnification                   \
            : 2))))
3592
3593/* Return maximum size (bytes) of a buffer enough for decoding
3594 SRC_BYTES of text encoded in CODING. */
3595
3596int
3597decoding_buffer_size (coding, src_bytes)
3598 struct coding_system *coding;
3599 int src_bytes;
3600{
3601 return (src_bytes * DECODING_BUFFER_MAG (coding)
3602 + CONVERSION_BUFFER_EXTRA_ROOM);
3603}
3604
3605/* Return maximum size (bytes) of a buffer enough for encoding
3606 SRC_BYTES of text to CODING. */
3607
3608int
3609encoding_buffer_size (coding, src_bytes)
3610 struct coding_system *coding;
3611 int src_bytes;
3612{
3613 int magnification;
3614
3615 if (coding->type == coding_type_ccl)
3616 magnification = coding->spec.ccl.encoder.buf_magnification;
3617 else
3618 magnification = 3;
3619
3620 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3621}
3622
3623#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
3624#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
3625#endif
3626
3627char *conversion_buffer;
3628int conversion_buffer_size;
3629
3630/* Return a pointer to a SIZE bytes of buffer to be used for encoding
3631 or decoding. Sufficient memory is allocated automatically. If we
3632 run out of memory, return NULL. */
3633
3634char *
3635get_conversion_buffer (size)
3636 int size;
3637{
3638 if (size > conversion_buffer_size)
3639 {
3640 char *buf;
3641 int real_size = conversion_buffer_size * 2;
3642
3643 while (real_size < size) real_size *= 2;
3644 buf = (char *) xmalloc (real_size);
3645 xfree (conversion_buffer);
3646 conversion_buffer = buf;
3647 conversion_buffer_size = real_size;
3648 }
3649 return conversion_buffer;
3650}
3651
3652int
3653ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
3654 struct coding_system *coding;
3655 unsigned char *source, *destination;
3656 int src_bytes, dst_bytes, encodep;
3657{
3658 struct ccl_program *ccl
3659 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
3660 int result;
3661
ae9ff118 3662 ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
7b179c2d 3663
d46c5b12
KH
3664 coding->produced = ccl_driver (ccl, source, destination,
3665 src_bytes, dst_bytes, &(coding->consumed));
69f76525 3666 coding->produced_char
48942766
KH
3667 = (encodep
3668 ? coding->produced
3669 : multibyte_chars_in_text (destination, coding->produced));
69f76525
KH
3670 coding->consumed_char
3671 = multibyte_chars_in_text (source, coding->consumed);
3672
d46c5b12
KH
3673 switch (ccl->status)
3674 {
3675 case CCL_STAT_SUSPEND_BY_SRC:
3676 result = CODING_FINISH_INSUFFICIENT_SRC;
3677 break;
3678 case CCL_STAT_SUSPEND_BY_DST:
3679 result = CODING_FINISH_INSUFFICIENT_DST;
3680 break;
9864ebce
KH
3681 case CCL_STAT_QUIT:
3682 case CCL_STAT_INVALID_CMD:
3683 result = CODING_FINISH_INTERRUPT;
3684 break;
d46c5b12
KH
3685 default:
3686 result = CODING_FINISH_NORMAL;
3687 break;
3688 }
3689 return result;
4ed46869
KH
3690}
3691
3692/* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
3693 decoding, it may detect coding system and format of end-of-line if
3694 those are not yet decided. */
3695
3696int
d46c5b12 3697decode_coding (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
3698 struct coding_system *coding;
3699 unsigned char *source, *destination;
3700 int src_bytes, dst_bytes;
4ed46869 3701{
d46c5b12 3702 int result;
4ed46869 3703
d4e57bcd 3704 if (src_bytes <= 0
944bd420 3705 && coding->type != coding_type_ccl
d4e57bcd
KH
3706 && ! (coding->mode & CODING_MODE_LAST_BLOCK
3707 && CODING_REQUIRE_FLUSHING (coding)))
4ed46869 3708 {
d46c5b12
KH
3709 coding->produced = coding->produced_char = 0;
3710 coding->consumed = coding->consumed_char = 0;
fb88bf2d 3711 coding->fake_multibyte = 0;
d46c5b12 3712 return CODING_FINISH_NORMAL;
4ed46869
KH
3713 }
3714
0ef69138 3715 if (coding->type == coding_type_undecided)
4ed46869
KH
3716 detect_coding (coding, source, src_bytes);
3717
0ef69138 3718 if (coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
3719 detect_eol (coding, source, src_bytes);
3720
4ed46869
KH
3721 switch (coding->type)
3722 {
0ef69138
KH
3723 case coding_type_emacs_mule:
3724 case coding_type_undecided:
27901516 3725 case coding_type_raw_text:
4ed46869 3726 if (coding->eol_type == CODING_EOL_LF
0ef69138 3727 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869 3728 goto label_no_conversion;
d46c5b12 3729 result = decode_eol (coding, source, destination, src_bytes, dst_bytes);
4ed46869
KH
3730 break;
3731
3732 case coding_type_sjis:
d46c5b12
KH
3733 result = decode_coding_sjis_big5 (coding, source, destination,
3734 src_bytes, dst_bytes, 1);
4ed46869
KH
3735 break;
3736
3737 case coding_type_iso2022:
d46c5b12
KH
3738 result = decode_coding_iso2022 (coding, source, destination,
3739 src_bytes, dst_bytes);
4ed46869
KH
3740 break;
3741
3742 case coding_type_big5:
d46c5b12
KH
3743 result = decode_coding_sjis_big5 (coding, source, destination,
3744 src_bytes, dst_bytes, 0);
4ed46869
KH
3745 break;
3746
3747 case coding_type_ccl:
d46c5b12
KH
3748 result = ccl_coding_driver (coding, source, destination,
3749 src_bytes, dst_bytes, 0);
3750 break;
3751
3752 default: /* i.e. case coding_type_no_conversion: */
3753 label_no_conversion:
3754 if (dst_bytes && src_bytes > dst_bytes)
3755 {
3756 coding->produced = dst_bytes;
3757 result = CODING_FINISH_INSUFFICIENT_DST;
3758 }
3759 else
3760 {
3761 coding->produced = src_bytes;
3762 result = CODING_FINISH_NORMAL;
3763 }
3764 if (dst_bytes)
3765 bcopy (source, destination, coding->produced);
3766 else
3767 safe_bcopy (source, destination, coding->produced);
fb88bf2d 3768 coding->fake_multibyte = 1;
d46c5b12
KH
3769 coding->consumed
3770 = coding->consumed_char = coding->produced_char = coding->produced;
4ed46869
KH
3771 break;
3772 }
3773
d46c5b12 3774 return result;
4ed46869
KH
3775}
3776
3777/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
3778
3779int
d46c5b12 3780encode_coding (coding, source, destination, src_bytes, dst_bytes)
4ed46869
KH
3781 struct coding_system *coding;
3782 unsigned char *source, *destination;
3783 int src_bytes, dst_bytes;
4ed46869 3784{
d46c5b12 3785 int result;
4ed46869 3786
d4e57bcd
KH
3787 if (src_bytes <= 0
3788 && ! (coding->mode & CODING_MODE_LAST_BLOCK
3789 && CODING_REQUIRE_FLUSHING (coding)))
4ed46869 3790 {
d46c5b12
KH
3791 coding->produced = coding->produced_char = 0;
3792 coding->consumed = coding->consumed_char = 0;
fb88bf2d 3793 coding->fake_multibyte = 0;
d46c5b12
KH
3794 return CODING_FINISH_NORMAL;
3795 }
4ed46869 3796
d46c5b12
KH
3797 switch (coding->type)
3798 {
0ef69138
KH
3799 case coding_type_emacs_mule:
3800 case coding_type_undecided:
27901516 3801 case coding_type_raw_text:
4ed46869 3802 if (coding->eol_type == CODING_EOL_LF
0ef69138 3803 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869 3804 goto label_no_conversion;
d46c5b12 3805 result = encode_eol (coding, source, destination, src_bytes, dst_bytes);
4ed46869
KH
3806 break;
3807
3808 case coding_type_sjis:
d46c5b12
KH
3809 result = encode_coding_sjis_big5 (coding, source, destination,
3810 src_bytes, dst_bytes, 1);
4ed46869
KH
3811 break;
3812
3813 case coding_type_iso2022:
d46c5b12
KH
3814 result = encode_coding_iso2022 (coding, source, destination,
3815 src_bytes, dst_bytes);
4ed46869
KH
3816 break;
3817
3818 case coding_type_big5:
d46c5b12
KH
3819 result = encode_coding_sjis_big5 (coding, source, destination,
3820 src_bytes, dst_bytes, 0);
4ed46869
KH
3821 break;
3822
3823 case coding_type_ccl:
d46c5b12
KH
3824 result = ccl_coding_driver (coding, source, destination,
3825 src_bytes, dst_bytes, 1);
3826 break;
3827
3828 default: /* i.e. case coding_type_no_conversion: */
3829 label_no_conversion:
3830 if (dst_bytes && src_bytes > dst_bytes)
3831 {
3832 coding->produced = dst_bytes;
3833 result = CODING_FINISH_INSUFFICIENT_DST;
3834 }
3835 else
3836 {
3837 coding->produced = src_bytes;
3838 result = CODING_FINISH_NORMAL;
3839 }
3840 if (dst_bytes)
3841 bcopy (source, destination, coding->produced);
3842 else
3843 safe_bcopy (source, destination, coding->produced);
3844 if (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
3845 {
3846 unsigned char *p = destination, *pend = p + coding->produced;
3847 while (p < pend)
3848 if (*p++ == '\015') p[-1] = '\n';
3849 }
fb88bf2d 3850 coding->fake_multibyte = 1;
d46c5b12
KH
3851 coding->consumed
3852 = coding->consumed_char = coding->produced_char = coding->produced;
4ed46869
KH
3853 break;
3854 }
3855
d46c5b12 3856 return result;
4ed46869
KH
3857}
3858
fb88bf2d
KH
3859/* Scan text in the region between *BEG and *END (byte positions),
3860 skip characters which we don't have to decode by coding system
3861 CODING at the head and tail, then set *BEG and *END to the region
3862 of the text we actually have to convert. The caller should move
3863 the gap out of the region in advance.
4ed46869 3864
d46c5b12
KH
3865 If STR is not NULL, *BEG and *END are indices into STR. */
3866
3867static void
3868shrink_decoding_region (beg, end, coding, str)
3869 int *beg, *end;
3870 struct coding_system *coding;
3871 unsigned char *str;
3872{
fb88bf2d 3873 unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
d46c5b12 3874 int eol_conversion;
88993dfd 3875 Lisp_Object translation_table;
d46c5b12
KH
3876
3877 if (coding->type == coding_type_ccl
3878 || coding->type == coding_type_undecided
3879 || !NILP (coding->post_read_conversion))
3880 {
3881 /* We can't skip any data. */
3882 return;
3883 }
3884 else if (coding->type == coding_type_no_conversion)
3885 {
fb88bf2d
KH
3886 /* We need no conversion, but don't have to skip any data here.
3887 Decoding routine handles them effectively anyway. */
d46c5b12
KH
3888 return;
3889 }
3890
88993dfd
KH
3891 translation_table = coding->translation_table_for_decode;
3892 if (NILP (translation_table) && !NILP (Venable_character_translation))
3893 translation_table = Vstandard_translation_table_for_decode;
3894 if (CHAR_TABLE_P (translation_table))
3895 {
3896 int i;
3897 for (i = 0; i < 128; i++)
3898 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
3899 break;
3900 if (i < 128)
3901 /* Some ASCII character should be tranlsated. We give up
3902 shrinking. */
3903 return;
3904 }
3905
aa60dea6
KH
3906 eol_conversion = (coding->eol_type != CODING_EOL_LF);
3907
3908 if ((! eol_conversion) && (coding->heading_ascii >= 0))
d46c5b12
KH
3909 /* Detection routine has already found how much we can skip at the
3910 head. */
3911 *beg += coding->heading_ascii;
3912
3913 if (str)
3914 {
3915 begp_orig = begp = str + *beg;
3916 endp_orig = endp = str + *end;
3917 }
3918 else
3919 {
fb88bf2d 3920 begp_orig = begp = BYTE_POS_ADDR (*beg);
d46c5b12
KH
3921 endp_orig = endp = begp + *end - *beg;
3922 }
3923
d46c5b12
KH
3924 switch (coding->type)
3925 {
3926 case coding_type_emacs_mule:
3927 case coding_type_raw_text:
3928 if (eol_conversion)
3929 {
3930 if (coding->heading_ascii < 0)
fb88bf2d 3931 while (begp < endp && *begp != '\r' && *begp < 0x80) begp++;
ee59c65f 3932 while (begp < endp && endp[-1] != '\r' && endp[-1] < 0x80)
fb88bf2d 3933 endp--;
ee59c65f
RS
3934 /* Do not consider LF as ascii if preceded by CR, since that
3935 confuses eol decoding. */
3936 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
3937 endp++;
d46c5b12
KH
3938 }
3939 else
3940 begp = endp;
3941 break;
3942
3943 case coding_type_sjis:
3944 case coding_type_big5:
3945 /* We can skip all ASCII characters at the head. */
3946 if (coding->heading_ascii < 0)
3947 {
3948 if (eol_conversion)
de9d083c 3949 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
d46c5b12
KH
3950 else
3951 while (begp < endp && *begp < 0x80) begp++;
3952 }
3953 /* We can skip all ASCII characters at the tail except for the
3954 second byte of SJIS or BIG5 code. */
3955 if (eol_conversion)
de9d083c 3956 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
d46c5b12
KH
3957 else
3958 while (begp < endp && endp[-1] < 0x80) endp--;
ee59c65f
RS
3959 /* Do not consider LF as ascii if preceded by CR, since that
3960 confuses eol decoding. */
3961 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
3962 endp++;
d46c5b12
KH
3963 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
3964 endp++;
3965 break;
3966
3967 default: /* i.e. case coding_type_iso2022: */
622fece5
KH
3968 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
3969 /* We can't skip any data. */
3970 break;
d46c5b12
KH
3971 if (coding->heading_ascii < 0)
3972 {
d46c5b12
KH
3973 /* We can skip all ASCII characters at the head except for a
3974 few control codes. */
3975 while (begp < endp && (c = *begp) < 0x80
3976 && c != ISO_CODE_CR && c != ISO_CODE_SO
3977 && c != ISO_CODE_SI && c != ISO_CODE_ESC
3978 && (!eol_conversion || c != ISO_CODE_LF))
3979 begp++;
3980 }
3981 switch (coding->category_idx)
3982 {
3983 case CODING_CATEGORY_IDX_ISO_8_1:
3984 case CODING_CATEGORY_IDX_ISO_8_2:
3985 /* We can skip all ASCII characters at the tail. */
3986 if (eol_conversion)
de9d083c 3987 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
d46c5b12
KH
3988 else
3989 while (begp < endp && endp[-1] < 0x80) endp--;
ee59c65f
RS
3990 /* Do not consider LF as ascii if preceded by CR, since that
3991 confuses eol decoding. */
3992 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
3993 endp++;
d46c5b12
KH
3994 break;
3995
3996 case CODING_CATEGORY_IDX_ISO_7:
3997 case CODING_CATEGORY_IDX_ISO_7_TIGHT:
de79a6a5
KH
3998 {
3999 /* We can skip all charactes at the tail except for 8-bit
4000 codes and ESC and the following 2-byte at the tail. */
4001 unsigned char *eight_bit = NULL;
4002
4003 if (eol_conversion)
4004 while (begp < endp
4005 && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4006 {
4007 if (!eight_bit && c & 0x80) eight_bit = endp;
4008 endp--;
4009 }
4010 else
4011 while (begp < endp
4012 && (c = endp[-1]) != ISO_CODE_ESC)
4013 {
4014 if (!eight_bit && c & 0x80) eight_bit = endp;
4015 endp--;
4016 }
4017 /* Do not consider LF as ascii if preceded by CR, since that
4018 confuses eol decoding. */
4019 if (begp < endp && endp < endp_orig
4020 && endp[-1] == '\r' && endp[0] == '\n')
4021 endp++;
4022 if (begp < endp && endp[-1] == ISO_CODE_ESC)
4023 {
4024 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
4025 /* This is an ASCII designation sequence. We can
4026 surely skip the tail. But, if we have
4027 encountered an 8-bit code, skip only the codes
4028 after that. */
4029 endp = eight_bit ? eight_bit : endp + 2;
4030 else
4031 /* Hmmm, we can't skip the tail. */
4032 endp = endp_orig;
4033 }
4034 else if (eight_bit)
4035 endp = eight_bit;
4036 }
d46c5b12
KH
4037 }
4038 }
4039 *beg += begp - begp_orig;
4040 *end += endp - endp_orig;
4041 return;
4042}
4043
4044/* Like shrink_decoding_region but for encoding. */
4045
4046static void
4047shrink_encoding_region (beg, end, coding, str)
4048 int *beg, *end;
4049 struct coding_system *coding;
4050 unsigned char *str;
4051{
4052 unsigned char *begp_orig, *begp, *endp_orig, *endp;
4053 int eol_conversion;
88993dfd 4054 Lisp_Object translation_table;
d46c5b12
KH
4055
4056 if (coding->type == coding_type_ccl)
4057 /* We can't skip any data. */
4058 return;
4059 else if (coding->type == coding_type_no_conversion)
4060 {
4061 /* We need no conversion. */
4062 *beg = *end;
4063 return;
4064 }
4065
88993dfd
KH
4066 translation_table = coding->translation_table_for_encode;
4067 if (NILP (translation_table) && !NILP (Venable_character_translation))
4068 translation_table = Vstandard_translation_table_for_encode;
4069 if (CHAR_TABLE_P (translation_table))
4070 {
4071 int i;
4072 for (i = 0; i < 128; i++)
4073 if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4074 break;
4075 if (i < 128)
4076 /* Some ASCII character should be tranlsated. We give up
4077 shrinking. */
4078 return;
4079 }
4080
d46c5b12
KH
4081 if (str)
4082 {
4083 begp_orig = begp = str + *beg;
4084 endp_orig = endp = str + *end;
4085 }
4086 else
4087 {
fb88bf2d 4088 begp_orig = begp = BYTE_POS_ADDR (*beg);
d46c5b12
KH
4089 endp_orig = endp = begp + *end - *beg;
4090 }
4091
4092 eol_conversion = (coding->eol_type == CODING_EOL_CR
4093 || coding->eol_type == CODING_EOL_CRLF);
4094
4095 /* Here, we don't have to check coding->pre_write_conversion because
4096 the caller is expected to have handled it already. */
4097 switch (coding->type)
4098 {
4099 case coding_type_undecided:
4100 case coding_type_emacs_mule:
4101 case coding_type_raw_text:
4102 if (eol_conversion)
4103 {
4104 while (begp < endp && *begp != '\n') begp++;
4105 while (begp < endp && endp[-1] != '\n') endp--;
4106 }
4107 else
4108 begp = endp;
4109 break;
4110
4111 case coding_type_iso2022:
622fece5
KH
4112 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4113 /* We can't skip any data. */
4114 break;
d46c5b12
KH
4115 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
4116 {
4117 unsigned char *bol = begp;
4118 while (begp < endp && *begp < 0x80)
4119 {
4120 begp++;
4121 if (begp[-1] == '\n')
4122 bol = begp;
4123 }
4124 begp = bol;
4125 goto label_skip_tail;
4126 }
4127 /* fall down ... */
4128
4129 default:
4130 /* We can skip all ASCII characters at the head and tail. */
4131 if (eol_conversion)
4132 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
4133 else
4134 while (begp < endp && *begp < 0x80) begp++;
4135 label_skip_tail:
4136 if (eol_conversion)
4137 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
4138 else
4139 while (begp < endp && *(endp - 1) < 0x80) endp--;
4140 break;
4141 }
4142
4143 *beg += begp - begp_orig;
4144 *end += endp - endp_orig;
4145 return;
4146}
4147
88993dfd
KH
/* Shrinking the conversion region has some overhead of its own, so
   it is attempted only for regions longer than this many bytes.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the region *BEG .. *END for encoding (if ENCODEP is nonzero)
   or for decoding (otherwise) by CODING, provided the region is
   longer than the threshold above.  BEG and END are pointers to the
   positions; STR is passed on to the shrinking function as is.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)	\
  do									\
    {									\
      if (*(end) - *(beg) > shrink_conversion_region_threshhold)	\
	{								\
	  if (encodep)							\
	    shrink_encoding_region (beg, end, coding, str);		\
	  else								\
	    shrink_decoding_region (beg, end, coding, str);		\
	}								\
    }									\
  while (0)
4161
d46c5b12 4162/* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
fb88bf2d
KH
4163 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4164 coding system CODING, and return the status code of code conversion
4165 (currently, this value has no meaning).
4166
4167 How many characters (and bytes) are converted to how many
4168 characters (and bytes) are recorded in members of the structure
4169 CODING.
d46c5b12 4170
6e44253b 4171 If REPLACE is nonzero, we do various things as if the original text
d46c5b12 4172 is deleted and a new text is inserted. See the comments in
6e44253b 4173 replace_range (insdel.c) to know what we are doing. */
4ed46869
KH
4174
4175int
6e44253b
KH
4176code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
4177 int from, from_byte, to, to_byte, encodep, replace;
4ed46869 4178 struct coding_system *coding;
4ed46869 4179{
fb88bf2d
KH
4180 int len = to - from, len_byte = to_byte - from_byte;
4181 int require, inserted, inserted_byte;
12410ef1 4182 int head_skip, tail_skip, total_skip;
84d60297 4183 Lisp_Object saved_coding_symbol;
fb88bf2d
KH
4184 int multibyte = !NILP (current_buffer->enable_multibyte_characters);
4185 int first = 1;
4186 int fake_multibyte = 0;
4187 unsigned char *src, *dst;
84d60297 4188 Lisp_Object deletion;
e133c8fa 4189 int orig_point = PT, orig_len = len;
6abb9bd9 4190 int prev_Z;
84d60297
RS
4191
4192 deletion = Qnil;
4193 saved_coding_symbol = Qnil;
d46c5b12 4194
83fa074f 4195 if (from < PT && PT < to)
e133c8fa
KH
4196 {
4197 TEMP_SET_PT_BOTH (from, from_byte);
4198 orig_point = from;
4199 }
83fa074f 4200
6e44253b 4201 if (replace)
d46c5b12 4202 {
fb88bf2d
KH
4203 int saved_from = from;
4204
d46c5b12 4205 prepare_to_modify_buffer (from, to, &from);
fb88bf2d
KH
4206 if (saved_from != from)
4207 {
4208 to = from + len;
4209 if (multibyte)
4210 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
4211 else
4212 from_byte = from, to_byte = to;
4213 len_byte = to_byte - from_byte;
4214 }
d46c5b12 4215 }
d46c5b12
KH
4216
4217 if (! encodep && CODING_REQUIRE_DETECTION (coding))
4218 {
12410ef1 4219 /* We must detect encoding of text and eol format. */
d46c5b12
KH
4220
4221 if (from < GPT && to > GPT)
4222 move_gap_both (from, from_byte);
4223 if (coding->type == coding_type_undecided)
4224 {
fb88bf2d 4225 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
d46c5b12 4226 if (coding->type == coding_type_undecided)
12410ef1
KH
4227 /* It seems that the text contains only ASCII, but we
4228 should not left it undecided because the deeper
4229 decoding routine (decode_coding) tries to detect the
4230 encodings again in vain. */
d46c5b12
KH
4231 coding->type = coding_type_emacs_mule;
4232 }
4233 if (coding->eol_type == CODING_EOL_UNDECIDED)
4234 {
4235 saved_coding_symbol = coding->symbol;
4236 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
4237 if (coding->eol_type == CODING_EOL_UNDECIDED)
4238 coding->eol_type = CODING_EOL_LF;
4239 /* We had better recover the original eol format if we
4240 encounter an inconsitent eol format while decoding. */
4241 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4242 }
4243 }
4244
fb88bf2d
KH
4245 coding->consumed_char = len, coding->consumed = len_byte;
4246
d46c5b12
KH
4247 if (encodep
4248 ? ! CODING_REQUIRE_ENCODING (coding)
4249 : ! CODING_REQUIRE_DECODING (coding))
fb88bf2d
KH
4250 {
4251 coding->produced = len_byte;
12410ef1
KH
4252 if (multibyte
4253 && ! replace
4254 /* See the comment of the member heading_ascii in coding.h. */
4255 && coding->heading_ascii < len_byte)
fb88bf2d 4256 {
6e44253b
KH
4257 /* We still may have to combine byte at the head and the
4258 tail of the text in the region. */
12410ef1 4259 if (from < GPT && GPT < to)
6e44253b 4260 move_gap_both (to, to_byte);
12410ef1
KH
4261 len = multibyte_chars_in_text (BYTE_POS_ADDR (from_byte), len_byte);
4262 adjust_after_insert (from, from_byte, to, to_byte, len);
4263 coding->produced_char = len;
fb88bf2d
KH
4264 }
4265 else
68e3a8f1
AS
4266 {
4267 if (!replace)
4268 adjust_after_insert (from, from_byte, to, to_byte, len_byte);
4269 coding->produced_char = len_byte;
4270 }
fb88bf2d
KH
4271 return 0;
4272 }
d46c5b12
KH
4273
4274 /* Now we convert the text. */
4275
4276 /* For encoding, we must process pre-write-conversion in advance. */
4277 if (encodep
d46c5b12
KH
4278 && ! NILP (coding->pre_write_conversion)
4279 && SYMBOLP (coding->pre_write_conversion)
4280 && ! NILP (Ffboundp (coding->pre_write_conversion)))
4281 {
2b4f9037
KH
4282 /* The function in pre-write-conversion may put a new text in a
4283 new buffer. */
0007bdd0
KH
4284 struct buffer *prev = current_buffer;
4285 Lisp_Object new;
d46c5b12 4286
b39f748c
AS
4287 call2 (coding->pre_write_conversion,
4288 make_number (from), make_number (to));
d46c5b12
KH
4289 if (current_buffer != prev)
4290 {
4291 len = ZV - BEGV;
0007bdd0 4292 new = Fcurrent_buffer ();
d46c5b12 4293 set_buffer_internal_1 (prev);
ddbc19ff 4294 del_range_2 (from, from_byte, to, to_byte);
e133c8fa 4295 TEMP_SET_PT_BOTH (from, from_byte);
0007bdd0
KH
4296 insert_from_buffer (XBUFFER (new), 1, len, 0);
4297 Fkill_buffer (new);
e133c8fa
KH
4298 if (orig_point >= to)
4299 orig_point += len - orig_len;
4300 else if (orig_point > from)
4301 orig_point = from;
4302 orig_len = len;
d46c5b12 4303 to = from + len;
e133c8fa 4304 from_byte = multibyte ? CHAR_TO_BYTE (from) : from_byte;
fb88bf2d 4305 to_byte = multibyte ? CHAR_TO_BYTE (to) : to;
d46c5b12 4306 len_byte = to_byte - from_byte;
e133c8fa 4307 TEMP_SET_PT_BOTH (from, from_byte);
d46c5b12
KH
4308 }
4309 }
4310
12410ef1
KH
4311 if (replace)
4312 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
4313
d46c5b12 4314 /* Try to skip the heading and tailing ASCIIs. */
12410ef1
KH
4315 {
4316 int from_byte_orig = from_byte, to_byte_orig = to_byte;
4317
4318 if (from < GPT && GPT < to)
4319 move_gap_both (from, from_byte);
88993dfd 4320 SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
d4e57bcd 4321 if (from_byte == to_byte
944bd420 4322 && coding->type != coding_type_ccl
d4e57bcd
KH
4323 && ! (coding->mode & CODING_MODE_LAST_BLOCK
4324 && CODING_REQUIRE_FLUSHING (coding)))
12410ef1
KH
4325 {
4326 coding->produced = len_byte;
4327 coding->produced_char = multibyte ? len : len_byte;
4328 if (!replace)
4329 /* We must record and adjust for this new text now. */
4330 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
4331 return 0;
4332 }
fb88bf2d 4333
12410ef1
KH
4334 head_skip = from_byte - from_byte_orig;
4335 tail_skip = to_byte_orig - to_byte;
4336 total_skip = head_skip + tail_skip;
4337 from += head_skip;
4338 to -= tail_skip;
4339 len -= total_skip; len_byte -= total_skip;
4340 }
d46c5b12 4341
88993dfd 4342 /* The code conversion routine can not preserve text properties for
55d8d769
KH
4343 now. So, we must remove all text properties in the region.
4344 Here, we must suppress all modification hooks. */
88993dfd 4345 if (replace)
55d8d769
KH
4346 {
4347 int saved_inhibit_modification_hooks = inhibit_modification_hooks;
4348 inhibit_modification_hooks = 1;
4349 Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
4350 inhibit_modification_hooks = saved_inhibit_modification_hooks;
4351 }
88993dfd 4352
fb88bf2d
KH
4353 /* For converion, we must put the gap before the text in addition to
4354 making the gap larger for efficient decoding. The required gap
4355 size starts from 2000 which is the magic number used in make_gap.
4356 But, after one batch of conversion, it will be incremented if we
4357 find that it is not enough . */
d46c5b12
KH
4358 require = 2000;
4359
4360 if (GAP_SIZE < require)
4361 make_gap (require - GAP_SIZE);
4362 move_gap_both (from, from_byte);
4363
d46c5b12 4364 inserted = inserted_byte = 0;
fb88bf2d
KH
4365 src = GAP_END_ADDR, dst = GPT_ADDR;
4366
4367 GAP_SIZE += len_byte;
4368 ZV -= len;
4369 Z -= len;
4370 ZV_BYTE -= len_byte;
4371 Z_BYTE -= len_byte;
4372
f2558efd
KH
4373 if (GPT - BEG < beg_unchanged)
4374 beg_unchanged = GPT - BEG;
4375 if (Z - GPT < end_unchanged)
4376 end_unchanged = Z - GPT;
4377
d46c5b12
KH
4378 for (;;)
4379 {
fb88bf2d 4380 int result;
d46c5b12
KH
4381
4382 /* The buffer memory is changed from:
fb88bf2d
KH
4383 +--------+converted-text+---------+-------original-text------+---+
4384 |<-from->|<--inserted-->|---------|<-----------len---------->|---|
4385 |<------------------- GAP_SIZE -------------------->| */
d46c5b12 4386 if (encodep)
fb88bf2d 4387 result = encode_coding (coding, src, dst, len_byte, 0);
d46c5b12 4388 else
fb88bf2d 4389 result = decode_coding (coding, src, dst, len_byte, 0);
d46c5b12
KH
4390 /* to:
4391 +--------+-------converted-text--------+--+---original-text--+---+
fb88bf2d
KH
4392 |<-from->|<--inserted-->|<--produced-->|--|<-(len-consumed)->|---|
4393 |<------------------- GAP_SIZE -------------------->| */
4394 if (coding->fake_multibyte)
4395 fake_multibyte = 1;
d46c5b12 4396
fb88bf2d
KH
4397 if (!encodep && !multibyte)
4398 coding->produced_char = coding->produced;
d46c5b12
KH
4399 inserted += coding->produced_char;
4400 inserted_byte += coding->produced;
d46c5b12 4401 len_byte -= coding->consumed;
fb88bf2d
KH
4402 src += coding->consumed;
4403 dst += inserted_byte;
d46c5b12 4404
9864ebce
KH
4405 if (result == CODING_FINISH_NORMAL)
4406 {
4407 src += len_byte;
4408 break;
4409 }
d46c5b12
KH
4410 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4411 {
fb88bf2d 4412 unsigned char *pend = dst, *p = pend - inserted_byte;
d46c5b12
KH
4413
4414 /* Encode LFs back to the original eol format (CR or CRLF). */
4415 if (coding->eol_type == CODING_EOL_CR)
4416 {
4417 while (p < pend) if (*p++ == '\n') p[-1] = '\r';
4418 }
4419 else
4420 {
d46c5b12
KH
4421 int count = 0;
4422
fb88bf2d
KH
4423 while (p < pend) if (*p++ == '\n') count++;
4424 if (src - dst < count)
d46c5b12 4425 {
fb88bf2d
KH
4426 /* We don't have sufficient room for putting LFs
4427 back to CRLF. We must record converted and
4428 not-yet-converted text back to the buffer
4429 content, enlarge the gap, then record them out of
4430 the buffer contents again. */
4431 int add = len_byte + inserted_byte;
4432
4433 GAP_SIZE -= add;
4434 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4435 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4436 make_gap (count - GAP_SIZE);
4437 GAP_SIZE += add;
4438 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4439 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4440 /* Don't forget to update SRC, DST, and PEND. */
4441 src = GAP_END_ADDR - len_byte;
4442 dst = GPT_ADDR + inserted_byte;
4443 pend = dst;
d46c5b12 4444 }
d46c5b12
KH
4445 inserted += count;
4446 inserted_byte += count;
fb88bf2d
KH
4447 coding->produced += count;
4448 p = dst = pend + count;
4449 while (count)
4450 {
4451 *--p = *--pend;
4452 if (*p == '\n') count--, *--p = '\r';
4453 }
d46c5b12
KH
4454 }
4455
4456 /* Suppress eol-format conversion in the further conversion. */
4457 coding->eol_type = CODING_EOL_LF;
4458
4459 /* Restore the original symbol. */
4460 coding->symbol = saved_coding_symbol;
fb88bf2d
KH
4461
4462 continue;
d46c5b12
KH
4463 }
4464 if (len_byte <= 0)
944bd420
KH
4465 {
4466 if (coding->type != coding_type_ccl
4467 || coding->mode & CODING_MODE_LAST_BLOCK)
4468 break;
4469 coding->mode |= CODING_MODE_LAST_BLOCK;
4470 continue;
4471 }
d46c5b12
KH
4472 if (result == CODING_FINISH_INSUFFICIENT_SRC)
4473 {
4474 /* The source text ends in invalid codes. Let's just
4475 make them valid buffer contents, and finish conversion. */
fb88bf2d 4476 inserted += len_byte;
d46c5b12 4477 inserted_byte += len_byte;
fb88bf2d 4478 while (len_byte--)
ee59c65f 4479 *dst++ = *src++;
fb88bf2d 4480 fake_multibyte = 1;
d46c5b12
KH
4481 break;
4482 }
9864ebce
KH
4483 if (result == CODING_FINISH_INTERRUPT)
4484 {
4485 /* The conversion procedure was interrupted by a user. */
4486 fake_multibyte = 1;
4487 break;
4488 }
4489 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
4490 if (coding->consumed < 1)
4491 {
4492 /* It's quite strange to require more memory without
4493 consuming any bytes. Perhaps CCL program bug. */
4494 fake_multibyte = 1;
4495 break;
4496 }
fb88bf2d
KH
4497 if (first)
4498 {
4499 /* We have just done the first batch of conversion which was
4500 stoped because of insufficient gap. Let's reconsider the
4501 required gap size (i.e. SRT - DST) now.
4502
4503 We have converted ORIG bytes (== coding->consumed) into
4504 NEW bytes (coding->produced). To convert the remaining
4505 LEN bytes, we may need REQUIRE bytes of gap, where:
4506 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
4507 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
4508 Here, we are sure that NEW >= ORIG. */
6e44253b
KH
4509 float ratio = coding->produced - coding->consumed;
4510 ratio /= coding->consumed;
4511 require = len_byte * ratio;
fb88bf2d
KH
4512 first = 0;
4513 }
4514 if ((src - dst) < (require + 2000))
4515 {
4516 /* See the comment above the previous call of make_gap. */
4517 int add = len_byte + inserted_byte;
4518
4519 GAP_SIZE -= add;
4520 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4521 GPT += inserted_byte; GPT_BYTE += inserted_byte;
4522 make_gap (require + 2000);
4523 GAP_SIZE += add;
4524 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4525 GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4526 /* Don't forget to update SRC, DST. */
4527 src = GAP_END_ADDR - len_byte;
4528 dst = GPT_ADDR + inserted_byte;
4529 }
d46c5b12 4530 }
fb88bf2d
KH
4531 if (src - dst > 0) *dst = 0; /* Put an anchor. */
4532
2b4f9037 4533 if (multibyte
88993dfd
KH
4534 && (encodep
4535 || fake_multibyte
4536 || (to - from) != (to_byte - from_byte)))
2b4f9037 4537 inserted = multibyte_chars_in_text (GPT_ADDR, inserted_byte);
7553d0e1 4538
12410ef1
KH
4539 /* If we have shrinked the conversion area, adjust it now. */
4540 if (total_skip > 0)
4541 {
4542 if (tail_skip > 0)
4543 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
4544 inserted += total_skip; inserted_byte += total_skip;
4545 GAP_SIZE += total_skip;
4546 GPT -= head_skip; GPT_BYTE -= head_skip;
4547 ZV -= total_skip; ZV_BYTE -= total_skip;
4548 Z -= total_skip; Z_BYTE -= total_skip;
4549 from -= head_skip; from_byte -= head_skip;
4550 to += tail_skip; to_byte += tail_skip;
4551 }
4552
6abb9bd9 4553 prev_Z = Z;
12410ef1 4554 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
6abb9bd9 4555 inserted = Z - prev_Z;
4ed46869 4556
2b4f9037 4557 if (! encodep && ! NILP (coding->post_read_conversion))
d46c5b12 4558 {
2b4f9037 4559 Lisp_Object val;
4ed46869 4560
e133c8fa
KH
4561 if (from != PT)
4562 TEMP_SET_PT_BOTH (from, from_byte);
6abb9bd9 4563 prev_Z = Z;
2b4f9037 4564 val = call1 (coding->post_read_conversion, make_number (inserted));
6abb9bd9 4565 CHECK_NUMBER (val, 0);
944bd420 4566 inserted += Z - prev_Z;
e133c8fa
KH
4567 }
4568
4569 if (orig_point >= from)
4570 {
4571 if (orig_point >= from + orig_len)
4572 orig_point += inserted - orig_len;
4573 else
4574 orig_point = from;
4575 TEMP_SET_PT (orig_point);
d46c5b12 4576 }
4ed46869 4577
2b4f9037
KH
4578 signal_after_change (from, to - from, inserted);
4579
fb88bf2d 4580 {
12410ef1
KH
4581 coding->consumed = to_byte - from_byte;
4582 coding->consumed_char = to - from;
4583 coding->produced = inserted_byte;
4584 coding->produced_char = inserted;
fb88bf2d 4585 }
7553d0e1 4586
fb88bf2d 4587 return 0;
d46c5b12
KH
4588}
4589
4590Lisp_Object
4591code_convert_string (str, coding, encodep, nocopy)
4592 Lisp_Object str;
4ed46869 4593 struct coding_system *coding;
d46c5b12 4594 int encodep, nocopy;
4ed46869 4595{
d46c5b12
KH
4596 int len;
4597 char *buf;
fc932ac6
RS
4598 int from = 0, to = XSTRING (str)->size;
4599 int to_byte = STRING_BYTES (XSTRING (str));
d46c5b12 4600 struct gcpro gcpro1;
84d60297 4601 Lisp_Object saved_coding_symbol;
d46c5b12 4602 int result;
4ed46869 4603
84d60297 4604 saved_coding_symbol = Qnil;
d46c5b12
KH
4605 if (encodep && !NILP (coding->pre_write_conversion)
4606 || !encodep && !NILP (coding->post_read_conversion))
4607 {
4608 /* Since we have to call Lisp functions which assume target text
4609 is in a buffer, after setting a temporary buffer, call
4610 code_convert_region. */
4611 int count = specpdl_ptr - specpdl;
4612 struct buffer *prev = current_buffer;
e133c8fa 4613
d46c5b12
KH
4614 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
4615 temp_output_buffer_setup (" *code-converting-work*");
4616 set_buffer_internal (XBUFFER (Vstandard_output));
4617 if (encodep)
4618 insert_from_string (str, 0, 0, to, to_byte, 0);
4619 else
4620 {
4621 /* We must insert the contents of STR as is without
4622 unibyte<->multibyte conversion. */
4623 current_buffer->enable_multibyte_characters = Qnil;
4624 insert_from_string (str, 0, 0, to_byte, to_byte, 0);
4625 current_buffer->enable_multibyte_characters = Qt;
4626 }
fb88bf2d 4627 code_convert_region (BEGV, BEGV_BYTE, ZV, ZV_BYTE, coding, encodep, 1);
d46c5b12
KH
4628 if (encodep)
4629 /* We must return the buffer contents as unibyte string. */
4630 current_buffer->enable_multibyte_characters = Qnil;
4631 str = make_buffer_string (BEGV, ZV, 0);
4632 set_buffer_internal (prev);
4633 return unbind_to (count, str);
4634 }
4ed46869 4635
d46c5b12
KH
4636 if (! encodep && CODING_REQUIRE_DETECTION (coding))
4637 {
4638 /* See the comments in code_convert_region. */
4639 if (coding->type == coding_type_undecided)
4640 {
4641 detect_coding (coding, XSTRING (str)->data, to_byte);
4642 if (coding->type == coding_type_undecided)
4643 coding->type = coding_type_emacs_mule;
4644 }
4645 if (coding->eol_type == CODING_EOL_UNDECIDED)
4646 {
4647 saved_coding_symbol = coding->symbol;
4648 detect_eol (coding, XSTRING (str)->data, to_byte);
4649 if (coding->eol_type == CODING_EOL_UNDECIDED)
4650 coding->eol_type = CODING_EOL_LF;
4651 /* We had better recover the original eol format if we
4652 encounter an inconsitent eol format while decoding. */
4653 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4654 }
4655 }
4ed46869 4656
d46c5b12
KH
4657 if (encodep
4658 ? ! CODING_REQUIRE_ENCODING (coding)
4659 : ! CODING_REQUIRE_DECODING (coding))
4660 from = to_byte;
4661 else
4662 {
4663 /* Try to skip the heading and tailing ASCIIs. */
88993dfd
KH
4664 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
4665 encodep);
d46c5b12 4666 }
e133c8fa
KH
4667 if (from == to_byte
4668 && coding->type != coding_type_ccl)
d46c5b12 4669 return (nocopy ? str : Fcopy_sequence (str));
4ed46869 4670
d46c5b12
KH
4671 if (encodep)
4672 len = encoding_buffer_size (coding, to_byte - from);
4673 else
4674 len = decoding_buffer_size (coding, to_byte - from);
fc932ac6 4675 len += from + STRING_BYTES (XSTRING (str)) - to_byte;
d46c5b12
KH
4676 GCPRO1 (str);
4677 buf = get_conversion_buffer (len);
4678 UNGCPRO;
4ed46869 4679
d46c5b12
KH
4680 if (from > 0)
4681 bcopy (XSTRING (str)->data, buf, from);
4682 result = (encodep
4683 ? encode_coding (coding, XSTRING (str)->data + from,
4684 buf + from, to_byte - from, len)
4685 : decode_coding (coding, XSTRING (str)->data + from,
f30cc612 4686 buf + from, to_byte - from, len));
d46c5b12 4687 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4ed46869 4688 {
d46c5b12
KH
4689 /* We simple try to decode the whole string again but without
4690 eol-conversion this time. */
4691 coding->eol_type = CODING_EOL_LF;
4692 coding->symbol = saved_coding_symbol;
4693 return code_convert_string (str, coding, encodep, nocopy);
4ed46869 4694 }
d46c5b12
KH
4695
4696 bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
fc932ac6 4697 STRING_BYTES (XSTRING (str)) - to_byte);
d46c5b12 4698
fc932ac6 4699 len = from + STRING_BYTES (XSTRING (str)) - to_byte;
d46c5b12
KH
4700 if (encodep)
4701 str = make_unibyte_string (buf, len + coding->produced);
4702 else
826bfb8b
KH
4703 {
4704 int chars= (coding->fake_multibyte
4705 ? multibyte_chars_in_text (buf + from, coding->produced)
4706 : coding->produced_char);
4707 str = make_multibyte_string (buf, len + chars, len + coding->produced);
4708 }
4709
d46c5b12 4710 return str;
4ed46869
KH
4711}
4712
4713\f
4714#ifdef emacs
1397dc18 4715/*** 8. Emacs Lisp library functions ***/
4ed46869 4716
4ed46869
KH
4717DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
4718 "Return t if OBJECT is nil or a coding-system.\n\
3a73fa5d
RS
4719See the documentation of `make-coding-system' for information\n\
4720about coding-system objects.")
4ed46869
KH
4721 (obj)
4722 Lisp_Object obj;
4723{
4608c386
KH
4724 if (NILP (obj))
4725 return Qt;
4726 if (!SYMBOLP (obj))
4727 return Qnil;
4728 /* Get coding-spec vector for OBJ. */
4729 obj = Fget (obj, Qcoding_system);
4730 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
4731 ? Qt : Qnil);
4ed46869
KH
4732}
4733
9d991de8
RS
4734DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
4735 Sread_non_nil_coding_system, 1, 1, 0,
e0e989f6 4736 "Read a coding system from the minibuffer, prompting with string PROMPT.")
4ed46869
KH
4737 (prompt)
4738 Lisp_Object prompt;
4739{
e0e989f6 4740 Lisp_Object val;
9d991de8
RS
4741 do
4742 {
4608c386
KH
4743 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
4744 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
9d991de8
RS
4745 }
4746 while (XSTRING (val)->size == 0);
e0e989f6 4747 return (Fintern (val, Qnil));
4ed46869
KH
4748}
4749
9b787f3e
RS
4750DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
4751 "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
4752If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
4753 (prompt, default_coding_system)
4754 Lisp_Object prompt, default_coding_system;
4ed46869 4755{
f44d27ce 4756 Lisp_Object val;
9b787f3e
RS
4757 if (SYMBOLP (default_coding_system))
4758 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
4608c386 4759 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
9b787f3e
RS
4760 Qt, Qnil, Qcoding_system_history,
4761 default_coding_system, Qnil);
e0e989f6 4762 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
4763}
4764
4765DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
4766 1, 1, 0,
4767 "Check validity of CODING-SYSTEM.\n\
3a73fa5d
RS
4768If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
4769It is valid if it is a symbol with a non-nil `coding-system' property.\n\
4ed46869
KH
4770The value of property should be a vector of length 5.")
4771 (coding_system)
4772 Lisp_Object coding_system;
4773{
4774 CHECK_SYMBOL (coding_system, 0);
4775 if (!NILP (Fcoding_system_p (coding_system)))
4776 return coding_system;
4777 while (1)
02ba4723 4778 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4ed46869 4779}
3a73fa5d 4780\f
d46c5b12
KH
4781Lisp_Object
4782detect_coding_system (src, src_bytes, highest)
4783 unsigned char *src;
4784 int src_bytes, highest;
4ed46869
KH
4785{
4786 int coding_mask, eol_type;
d46c5b12
KH
4787 Lisp_Object val, tmp;
4788 int dummy;
4ed46869 4789
d46c5b12
KH
4790 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy);
4791 eol_type = detect_eol_type (src, src_bytes, &dummy);
4792 if (eol_type == CODING_EOL_INCONSISTENT)
25b02698 4793 eol_type = CODING_EOL_UNDECIDED;
4ed46869 4794
d46c5b12 4795 if (!coding_mask)
4ed46869 4796 {
27901516 4797 val = Qundecided;
d46c5b12 4798 if (eol_type != CODING_EOL_UNDECIDED)
4ed46869 4799 {
f44d27ce
RS
4800 Lisp_Object val2;
4801 val2 = Fget (Qundecided, Qeol_type);
4ed46869
KH
4802 if (VECTORP (val2))
4803 val = XVECTOR (val2)->contents[eol_type];
4804 }
80e803b4 4805 return (highest ? val : Fcons (val, Qnil));
4ed46869 4806 }
4ed46869 4807
d46c5b12
KH
4808 /* At first, gather possible coding systems in VAL. */
4809 val = Qnil;
4810 for (tmp = Vcoding_category_list; !NILP (tmp); tmp = XCONS (tmp)->cdr)
4ed46869 4811 {
d46c5b12
KH
4812 int idx
4813 = XFASTINT (Fget (XCONS (tmp)->car, Qcoding_category_index));
4814 if (coding_mask & (1 << idx))
4ed46869 4815 {
d46c5b12
KH
4816 val = Fcons (Fsymbol_value (XCONS (tmp)->car), val);
4817 if (highest)
4818 break;
4ed46869
KH
4819 }
4820 }
d46c5b12
KH
4821 if (!highest)
4822 val = Fnreverse (val);
4ed46869 4823
65059037 4824 /* Then, replace the elements with subsidiary coding systems. */
d46c5b12 4825 for (tmp = val; !NILP (tmp); tmp = XCONS (tmp)->cdr)
4ed46869 4826 {
65059037
RS
4827 if (eol_type != CODING_EOL_UNDECIDED
4828 && eol_type != CODING_EOL_INCONSISTENT)
4ed46869 4829 {
d46c5b12
KH
4830 Lisp_Object eol;
4831 eol = Fget (XCONS (tmp)->car, Qeol_type);
4832 if (VECTORP (eol))
4833 XCONS (tmp)->car = XVECTOR (eol)->contents[eol_type];
4ed46869
KH
4834 }
4835 }
d46c5b12
KH
4836 return (highest ? XCONS (val)->car : val);
4837}
4ed46869 4838
d46c5b12
KH
4839DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
4840 2, 3, 0,
4841 "Detect coding system of the text in the region between START and END.\n\
4842Return a list of possible coding systems ordered by priority.\n\
4843\n\
80e803b4
KH
4844If only ASCII characters are found, it returns a list of single element\n\
4845`undecided' or its subsidiary coding system according to a detected\n\
4846end-of-line format.\n\
d46c5b12
KH
4847\n\
4848If optional argument HIGHEST is non-nil, return the coding system of\n\
4849highest priority.")
4850 (start, end, highest)
4851 Lisp_Object start, end, highest;
4852{
4853 int from, to;
4854 int from_byte, to_byte;
6289dd10 4855
d46c5b12
KH
4856 CHECK_NUMBER_COERCE_MARKER (start, 0);
4857 CHECK_NUMBER_COERCE_MARKER (end, 1);
4ed46869 4858
d46c5b12
KH
4859 validate_region (&start, &end);
4860 from = XINT (start), to = XINT (end);
4861 from_byte = CHAR_TO_BYTE (from);
4862 to_byte = CHAR_TO_BYTE (to);
6289dd10 4863
d46c5b12
KH
4864 if (from < GPT && to >= GPT)
4865 move_gap_both (to, to_byte);
4ed46869 4866
d46c5b12
KH
4867 return detect_coding_system (BYTE_POS_ADDR (from_byte),
4868 to_byte - from_byte,
4869 !NILP (highest));
4870}
6289dd10 4871
d46c5b12
KH
4872DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
4873 1, 2, 0,
4874 "Detect coding system of the text in STRING.\n\
4875Return a list of possible coding systems ordered by priority.\n\
4876\n\
80e803b4
KH
4877If only ASCII characters are found, it returns a list of single element\n\
4878`undecided' or its subsidiary coding system according to a detected\n\
4879end-of-line format.\n\
d46c5b12
KH
4880\n\
4881If optional argument HIGHEST is non-nil, return the coding system of\n\
4882highest priority.")
4883 (string, highest)
4884 Lisp_Object string, highest;
4885{
4886 CHECK_STRING (string, 0);
4ed46869 4887
d46c5b12 4888 return detect_coding_system (XSTRING (string)->data,
fc932ac6 4889 STRING_BYTES (XSTRING (string)),
d46c5b12 4890 !NILP (highest));
4ed46869
KH
4891}
4892
4031e2bf
KH
4893Lisp_Object
4894code_convert_region1 (start, end, coding_system, encodep)
d46c5b12 4895 Lisp_Object start, end, coding_system;
4031e2bf 4896 int encodep;
3a73fa5d
RS
4897{
4898 struct coding_system coding;
4031e2bf 4899 int from, to, len;
3a73fa5d 4900
d46c5b12
KH
4901 CHECK_NUMBER_COERCE_MARKER (start, 0);
4902 CHECK_NUMBER_COERCE_MARKER (end, 1);
3a73fa5d
RS
4903 CHECK_SYMBOL (coding_system, 2);
4904
d46c5b12
KH
4905 validate_region (&start, &end);
4906 from = XFASTINT (start);
4907 to = XFASTINT (end);
4908
3a73fa5d 4909 if (NILP (coding_system))
d46c5b12
KH
4910 return make_number (to - from);
4911
3a73fa5d 4912 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
d46c5b12 4913 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
3a73fa5d 4914
d46c5b12 4915 coding.mode |= CODING_MODE_LAST_BLOCK;
fb88bf2d
KH
4916 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
4917 &coding, encodep, 1);
f072a3e8 4918 Vlast_coding_system_used = coding.symbol;
fb88bf2d 4919 return make_number (coding.produced_char);
4031e2bf
KH
4920}
4921
4922DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
4923 3, 3, "r\nzCoding system: ",
4924 "Decode the current region by specified coding system.\n\
4925When called from a program, takes three arguments:\n\
4926START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
f072a3e8
RS
4927This function sets `last-coding-system-used' to the precise coding system\n\
4928used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
4929not fully specified.)\n\
4930It returns the length of the decoded text.")
4031e2bf
KH
4931 (start, end, coding_system)
4932 Lisp_Object start, end, coding_system;
4933{
4934 return code_convert_region1 (start, end, coding_system, 0);
3a73fa5d
RS
4935}
4936
4937DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
4938 3, 3, "r\nzCoding system: ",
d46c5b12 4939 "Encode the current region by specified coding system.\n\
3a73fa5d 4940When called from a program, takes three arguments:\n\
d46c5b12 4941START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
f072a3e8
RS
4942This function sets `last-coding-system-used' to the precise coding system\n\
4943used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
4944not fully specified.)\n\
4945It returns the length of the encoded text.")
d46c5b12
KH
4946 (start, end, coding_system)
4947 Lisp_Object start, end, coding_system;
3a73fa5d 4948{
4031e2bf
KH
4949 return code_convert_region1 (start, end, coding_system, 1);
4950}
3a73fa5d 4951
4031e2bf
KH
4952Lisp_Object
4953code_convert_string1 (string, coding_system, nocopy, encodep)
4954 Lisp_Object string, coding_system, nocopy;
4955 int encodep;
4956{
4957 struct coding_system coding;
3a73fa5d 4958
4031e2bf
KH
4959 CHECK_STRING (string, 0);
4960 CHECK_SYMBOL (coding_system, 1);
4ed46869 4961
d46c5b12 4962 if (NILP (coding_system))
4031e2bf 4963 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869 4964
d46c5b12
KH
4965 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
4966 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5f1cd180 4967
d46c5b12 4968 coding.mode |= CODING_MODE_LAST_BLOCK;
f072a3e8 4969 Vlast_coding_system_used = coding.symbol;
4031e2bf 4970 return code_convert_string (string, &coding, encodep, !NILP (nocopy));
4ed46869
KH
4971}
4972
4ed46869 4973DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
e0e989f6
KH
4974 2, 3, 0,
4975 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
fe487a71 4976Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
f072a3e8
RS
4977if the decoding operation is trivial.\n\
4978This function sets `last-coding-system-used' to the precise coding system\n\
4979used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
4980not fully specified.)")
e0e989f6
KH
4981 (string, coding_system, nocopy)
4982 Lisp_Object string, coding_system, nocopy;
4ed46869 4983{
f072a3e8 4984 return code_convert_string1 (string, coding_system, nocopy, 0);
4ed46869
KH
4985}
4986
4987DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
e0e989f6
KH
4988 2, 3, 0,
4989 "Encode STRING to CODING-SYSTEM, and return the result.\n\
fe487a71 4990Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
f072a3e8
RS
4991if the encoding operation is trivial.\n\
4992This function sets `last-coding-system-used' to the precise coding system\n\
4993used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
4994not fully specified.)")
e0e989f6
KH
4995 (string, coding_system, nocopy)
4996 Lisp_Object string, coding_system, nocopy;
4ed46869 4997{
f072a3e8 4998 return code_convert_string1 (string, coding_system, nocopy, 1);
4ed46869 4999}
4031e2bf 5000
ecec61c1
KH
5001/* Encode or decode STRING according to CODING_SYSTEM.
5002 Do not set Vlast_coding_system_used. */
5003
5004Lisp_Object
5005code_convert_string_norecord (string, coding_system, encodep)
5006 Lisp_Object string, coding_system;
5007 int encodep;
5008{
5009 struct coding_system coding;
5010
5011 CHECK_STRING (string, 0);
5012 CHECK_SYMBOL (coding_system, 1);
5013
5014 if (NILP (coding_system))
5015 return string;
5016
5017 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5018 error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5019
5020 coding.mode |= CODING_MODE_LAST_BLOCK;
5021 return code_convert_string (string, &coding, encodep, Qt);
5022}
3a73fa5d 5023\f
4ed46869 5024DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
55ab7be3 5025 "Decode a Japanese character which has CODE in shift_jis encoding.\n\
4ed46869
KH
5026Return the corresponding character.")
5027 (code)
5028 Lisp_Object code;
5029{
5030 unsigned char c1, c2, s1, s2;
5031 Lisp_Object val;
5032
5033 CHECK_NUMBER (code, 0);
5034 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
55ab7be3
KH
5035 if (s1 == 0)
5036 {
5037 if (s2 < 0xA0 || s2 > 0xDF)
5038 error ("Invalid Shift JIS code: %s", XFASTINT (code));
5039 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_katakana_jisx0201, s2, 0));
5040 }
5041 else
5042 {
5043 if ((s1 < 0x80 || s1 > 0x9F && s1 < 0xE0 || s1 > 0xEF)
5044 || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
5045 error ("Invalid Shift JIS code: %s", XFASTINT (code));
5046 DECODE_SJIS (s1, s2, c1, c2);
5047 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
5048 }
4ed46869
KH
5049 return val;
5050}
5051
5052DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
55ab7be3
KH
5053 "Encode a Japanese character CHAR to shift_jis encoding.\n\
5054Return the corresponding code in SJIS.")
4ed46869
KH
5055 (ch)
5056 Lisp_Object ch;
5057{
bcf26d6a 5058 int charset, c1, c2, s1, s2;
4ed46869
KH
5059 Lisp_Object val;
5060
5061 CHECK_NUMBER (ch, 0);
5062 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
55ab7be3
KH
5063 if (charset == charset_jisx0208
5064 && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
4ed46869
KH
5065 {
5066 ENCODE_SJIS (c1, c2, s1, s2);
bcf26d6a 5067 XSETFASTINT (val, (s1 << 8) | s2);
4ed46869 5068 }
55ab7be3
KH
5069 else if (charset == charset_katakana_jisx0201
5070 && c1 > 0x20 && c2 < 0xE0)
5071 {
5072 XSETFASTINT (val, c1 | 0x80);
5073 }
4ed46869 5074 else
55ab7be3
KH
5075 error ("Can't encode to shift_jis: %d", XFASTINT (ch));
5076
4ed46869
KH
5077 return val;
5078}
5079
5080DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
d46c5b12 5081 "Decode a Big5 character CODE of BIG5 coding system.\n\
4ed46869
KH
5082CODE is the character code in BIG5.\n\
5083Return the corresponding character.")
5084 (code)
5085 Lisp_Object code;
5086{
5087 int charset;
5088 unsigned char b1, b2, c1, c2;
5089 Lisp_Object val;
5090
5091 CHECK_NUMBER (code, 0);
5092 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
5093 DECODE_BIG5 (b1, b2, charset, c1, c2);
5094 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
5095 return val;
5096}
5097
5098DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
d46c5b12 5099 "Encode the Big5 character CHAR to BIG5 coding system.\n\
4ed46869
KH
5100Return the corresponding character code in Big5.")
5101 (ch)
5102 Lisp_Object ch;
5103{
bcf26d6a 5104 int charset, c1, c2, b1, b2;
4ed46869
KH
5105 Lisp_Object val;
5106
5107 CHECK_NUMBER (ch, 0);
5108 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5109 if (charset == charset_big5_1 || charset == charset_big5_2)
5110 {
5111 ENCODE_BIG5 (charset, c1, c2, b1, b2);
bcf26d6a 5112 XSETFASTINT (val, (b1 << 8) | b2);
4ed46869
KH
5113 }
5114 else
5115 XSETFASTINT (val, 0);
5116 return val;
5117}
3a73fa5d 5118\f
1ba9e4ab
KH
5119DEFUN ("set-terminal-coding-system-internal",
5120 Fset_terminal_coding_system_internal,
5121 Sset_terminal_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
5122 (coding_system)
5123 Lisp_Object coding_system;
5124{
5125 CHECK_SYMBOL (coding_system, 0);
5126 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
70c22245 5127 /* We had better not send unsafe characters to terminal. */
6e85d753
KH
5128 terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
5129
4ed46869
KH
5130 return Qnil;
5131}
5132
c4825358
KH
5133DEFUN ("set-safe-terminal-coding-system-internal",
5134 Fset_safe_terminal_coding_system_internal,
5135 Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
5136 (coding_system)
5137 Lisp_Object coding_system;
5138{
5139 CHECK_SYMBOL (coding_system, 0);
5140 setup_coding_system (Fcheck_coding_system (coding_system),
5141 &safe_terminal_coding);
5142 return Qnil;
5143}
5144
4ed46869
KH
5145DEFUN ("terminal-coding-system",
5146 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3a73fa5d 5147 "Return coding system specified for terminal output.")
4ed46869
KH
5148 ()
5149{
5150 return terminal_coding.symbol;
5151}
5152
1ba9e4ab
KH
5153DEFUN ("set-keyboard-coding-system-internal",
5154 Fset_keyboard_coding_system_internal,
5155 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
5156 (coding_system)
5157 Lisp_Object coding_system;
5158{
5159 CHECK_SYMBOL (coding_system, 0);
5160 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
5161 return Qnil;
5162}
5163
5164DEFUN ("keyboard-coding-system",
5165 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3a73fa5d 5166 "Return coding system specified for decoding keyboard input.")
4ed46869
KH
5167 ()
5168{
5169 return keyboard_coding.symbol;
5170}
5171
5172\f
a5d301df
KH
5173DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
5174 Sfind_operation_coding_system, 1, MANY, 0,
5175 "Choose a coding system for an operation based on the target name.\n\
69f76525 5176The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
9ce27fde
KH
5177DECODING-SYSTEM is the coding system to use for decoding\n\
5178\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
5179for encoding (in case OPERATION does encoding).\n\
ccdb79f5
RS
5180\n\
5181The first argument OPERATION specifies an I/O primitive:\n\
5182 For file I/O, `insert-file-contents' or `write-region'.\n\
5183 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
5184 For network I/O, `open-network-stream'.\n\
5185\n\
5186The remaining arguments should be the same arguments that were passed\n\
5187to the primitive. Depending on which primitive, one of those arguments\n\
5188is selected as the TARGET. For example, if OPERATION does file I/O,\n\
5189whichever argument specifies the file name is TARGET.\n\
5190\n\
5191TARGET has a meaning which depends on OPERATION:\n\
4ed46869
KH
5192 For file I/O, TARGET is a file name.\n\
5193 For process I/O, TARGET is a process name.\n\
5194 For network I/O, TARGET is a service name or a port number\n\
5195\n\
02ba4723
KH
5196This function looks up what specified for TARGET in,\n\
5197`file-coding-system-alist', `process-coding-system-alist',\n\
5198or `network-coding-system-alist' depending on OPERATION.\n\
5199They may specify a coding system, a cons of coding systems,\n\
5200or a function symbol to call.\n\
5201In the last case, we call the function with one argument,\n\
9ce27fde 5202which is a list of all the arguments given to this function.")
4ed46869
KH
5203 (nargs, args)
5204 int nargs;
5205 Lisp_Object *args;
5206{
5207 Lisp_Object operation, target_idx, target, val;
5208 register Lisp_Object chain;
5209
5210 if (nargs < 2)
5211 error ("Too few arguments");
5212 operation = args[0];
5213 if (!SYMBOLP (operation)
5214 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
5215 error ("Invalid first arguement");
5216 if (nargs < 1 + XINT (target_idx))
5217 error ("Too few arguments for operation: %s",
5218 XSYMBOL (operation)->name->data);
5219 target = args[XINT (target_idx) + 1];
5220 if (!(STRINGP (target)
5221 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
5222 error ("Invalid %dth argument", XINT (target_idx) + 1);
5223
2e34157c
RS
5224 chain = ((EQ (operation, Qinsert_file_contents)
5225 || EQ (operation, Qwrite_region))
02ba4723 5226 ? Vfile_coding_system_alist
2e34157c 5227 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
5228 ? Vnetwork_coding_system_alist
5229 : Vprocess_coding_system_alist));
4ed46869
KH
5230 if (NILP (chain))
5231 return Qnil;
5232
02ba4723 5233 for (; CONSP (chain); chain = XCONS (chain)->cdr)
4ed46869 5234 {
f44d27ce
RS
5235 Lisp_Object elt;
5236 elt = XCONS (chain)->car;
4ed46869
KH
5237
5238 if (CONSP (elt)
5239 && ((STRINGP (target)
5240 && STRINGP (XCONS (elt)->car)
5241 && fast_string_match (XCONS (elt)->car, target) >= 0)
5242 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
02ba4723
KH
5243 {
5244 val = XCONS (elt)->cdr;
b19fd4c5
KH
5245 /* Here, if VAL is both a valid coding system and a valid
5246 function symbol, we return VAL as a coding system. */
02ba4723
KH
5247 if (CONSP (val))
5248 return val;
5249 if (! SYMBOLP (val))
5250 return Qnil;
5251 if (! NILP (Fcoding_system_p (val)))
5252 return Fcons (val, val);
b19fd4c5
KH
5253 if (! NILP (Ffboundp (val)))
5254 {
5255 val = call1 (val, Flist (nargs, args));
5256 if (CONSP (val))
5257 return val;
5258 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
5259 return Fcons (val, val);
5260 }
02ba4723
KH
5261 return Qnil;
5262 }
4ed46869
KH
5263 }
5264 return Qnil;
5265}
5266
1397dc18
KH
5267DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal,
5268 Supdate_coding_systems_internal, 0, 0, 0,
5269 "Update internal database for ISO2022 and CCL based coding systems.\n\
d46c5b12
KH
5270When values of the following coding categories are changed, you must\n\
5271call this function:\n\
5272 coding-category-iso-7, coding-category-iso-7-tight,\n\
5273 coding-category-iso-8-1, coding-category-iso-8-2,\n\
1397dc18
KH
5274 coding-category-iso-7-else, coding-category-iso-8-else,\n\
5275 coding-category-ccl")
d46c5b12
KH
5276 ()
5277{
5278 int i;
5279
1397dc18 5280 for (i = CODING_CATEGORY_IDX_ISO_7; i <= CODING_CATEGORY_IDX_CCL; i++)
d46c5b12 5281 {
1397dc18
KH
5282 Lisp_Object val;
5283
5284 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value;
5285 if (!NILP (val))
5286 {
5287 if (! coding_system_table[i])
5288 coding_system_table[i] = ((struct coding_system *)
5289 xmalloc (sizeof (struct coding_system)));
5290 setup_coding_system (val, coding_system_table[i]);
5291 }
5292 else if (coding_system_table[i])
5293 {
5294 xfree (coding_system_table[i]);
5295 coding_system_table[i] = NULL;
5296 }
d46c5b12 5297 }
1397dc18 5298
d46c5b12
KH
5299 return Qnil;
5300}
5301
66cfb530
KH
5302DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
5303 Sset_coding_priority_internal, 0, 0, 0,
5304 "Update internal database for the current value of `coding-category-list'.\n\
5305This function is internal use only.")
5306 ()
5307{
5308 int i = 0, idx;
84d60297
RS
5309 Lisp_Object val;
5310
5311 val = Vcoding_category_list;
66cfb530
KH
5312
5313 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
5314 {
5315 if (! SYMBOLP (XCONS (val)->car))
5316 break;
5317 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
5318 if (idx >= CODING_CATEGORY_IDX_MAX)
5319 break;
5320 coding_priorities[i++] = (1 << idx);
5321 val = XCONS (val)->cdr;
5322 }
5323 /* If coding-category-list is valid and contains all coding
5324 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
5325 the following code saves Emacs from craching. */
5326 while (i < CODING_CATEGORY_IDX_MAX)
5327 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
5328
5329 return Qnil;
5330}
5331
4ed46869
KH
5332#endif /* emacs */
5333
5334\f
1397dc18 5335/*** 9. Post-amble ***/
4ed46869 5336
6d74c3aa
KH
5337void
5338init_coding ()
5339{
5340 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
5341}
5342
dfcf069d 5343void
4ed46869
KH
5344init_coding_once ()
5345{
5346 int i;
5347
0ef69138 5348 /* Emacs' internal format specific initialize routine. */
4ed46869
KH
5349 for (i = 0; i <= 0x20; i++)
5350 emacs_code_class[i] = EMACS_control_code;
5351 emacs_code_class[0x0A] = EMACS_linefeed_code;
5352 emacs_code_class[0x0D] = EMACS_carriage_return_code;
5353 for (i = 0x21 ; i < 0x7F; i++)
5354 emacs_code_class[i] = EMACS_ascii_code;
5355 emacs_code_class[0x7F] = EMACS_control_code;
5356 emacs_code_class[0x80] = EMACS_leading_code_composition;
5357 for (i = 0x81; i < 0xFF; i++)
5358 emacs_code_class[i] = EMACS_invalid_code;
5359 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
5360 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
5361 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
5362 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
5363
5364 /* ISO2022 specific initialize routine. */
5365 for (i = 0; i < 0x20; i++)
5366 iso_code_class[i] = ISO_control_code;
5367 for (i = 0x21; i < 0x7F; i++)
5368 iso_code_class[i] = ISO_graphic_plane_0;
5369 for (i = 0x80; i < 0xA0; i++)
5370 iso_code_class[i] = ISO_control_code;
5371 for (i = 0xA1; i < 0xFF; i++)
5372 iso_code_class[i] = ISO_graphic_plane_1;
5373 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
5374 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
5375 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
5376 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
5377 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
5378 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
5379 iso_code_class[ISO_CODE_ESC] = ISO_escape;
5380 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
5381 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
5382 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
5383
e0e989f6 5384 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
e0e989f6
KH
5385
5386 setup_coding_system (Qnil, &keyboard_coding);
5387 setup_coding_system (Qnil, &terminal_coding);
c4825358 5388 setup_coding_system (Qnil, &safe_terminal_coding);
6bc51348 5389 setup_coding_system (Qnil, &default_buffer_file_coding);
9ce27fde 5390
d46c5b12
KH
5391 bzero (coding_system_table, sizeof coding_system_table);
5392
66cfb530
KH
5393 bzero (ascii_skip_code, sizeof ascii_skip_code);
5394 for (i = 0; i < 128; i++)
5395 ascii_skip_code[i] = 1;
5396
9ce27fde
KH
5397#if defined (MSDOS) || defined (WINDOWSNT)
5398 system_eol_type = CODING_EOL_CRLF;
5399#else
5400 system_eol_type = CODING_EOL_LF;
5401#endif
e0e989f6
KH
5402}
5403
5404#ifdef emacs
5405
dfcf069d 5406void
e0e989f6
KH
5407syms_of_coding ()
5408{
5409 Qtarget_idx = intern ("target-idx");
5410 staticpro (&Qtarget_idx);
5411
bb0115a2
RS
5412 Qcoding_system_history = intern ("coding-system-history");
5413 staticpro (&Qcoding_system_history);
5414 Fset (Qcoding_system_history, Qnil);
5415
9ce27fde 5416 /* Target FILENAME is the first argument. */
e0e989f6 5417 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 5418 /* Target FILENAME is the third argument. */
e0e989f6
KH
5419 Fput (Qwrite_region, Qtarget_idx, make_number (2));
5420
5421 Qcall_process = intern ("call-process");
5422 staticpro (&Qcall_process);
9ce27fde 5423 /* Target PROGRAM is the first argument. */
e0e989f6
KH
5424 Fput (Qcall_process, Qtarget_idx, make_number (0));
5425
5426 Qcall_process_region = intern ("call-process-region");
5427 staticpro (&Qcall_process_region);
9ce27fde 5428 /* Target PROGRAM is the third argument. */
e0e989f6
KH
5429 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
5430
5431 Qstart_process = intern ("start-process");
5432 staticpro (&Qstart_process);
9ce27fde 5433 /* Target PROGRAM is the third argument. */
e0e989f6
KH
5434 Fput (Qstart_process, Qtarget_idx, make_number (2));
5435
5436 Qopen_network_stream = intern ("open-network-stream");
5437 staticpro (&Qopen_network_stream);
9ce27fde 5438 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
5439 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
5440
4ed46869
KH
5441 Qcoding_system = intern ("coding-system");
5442 staticpro (&Qcoding_system);
5443
5444 Qeol_type = intern ("eol-type");
5445 staticpro (&Qeol_type);
5446
5447 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
5448 staticpro (&Qbuffer_file_coding_system);
5449
5450 Qpost_read_conversion = intern ("post-read-conversion");
5451 staticpro (&Qpost_read_conversion);
5452
5453 Qpre_write_conversion = intern ("pre-write-conversion");
5454 staticpro (&Qpre_write_conversion);
5455
27901516
KH
5456 Qno_conversion = intern ("no-conversion");
5457 staticpro (&Qno_conversion);
5458
5459 Qundecided = intern ("undecided");
5460 staticpro (&Qundecided);
5461
4ed46869
KH
5462 Qcoding_system_p = intern ("coding-system-p");
5463 staticpro (&Qcoding_system_p);
5464
5465 Qcoding_system_error = intern ("coding-system-error");
5466 staticpro (&Qcoding_system_error);
5467
5468 Fput (Qcoding_system_error, Qerror_conditions,
5469 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
5470 Fput (Qcoding_system_error, Qerror_message,
9ce27fde 5471 build_string ("Invalid coding system"));
4ed46869 5472
d46c5b12
KH
5473 Qcoding_category = intern ("coding-category");
5474 staticpro (&Qcoding_category);
4ed46869
KH
5475 Qcoding_category_index = intern ("coding-category-index");
5476 staticpro (&Qcoding_category_index);
5477
d46c5b12
KH
5478 Vcoding_category_table
5479 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
5480 staticpro (&Vcoding_category_table);
4ed46869
KH
5481 {
5482 int i;
5483 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
5484 {
d46c5b12
KH
5485 XVECTOR (Vcoding_category_table)->contents[i]
5486 = intern (coding_category_name[i]);
5487 Fput (XVECTOR (Vcoding_category_table)->contents[i],
5488 Qcoding_category_index, make_number (i));
4ed46869
KH
5489 }
5490 }
5491
f967223b
KH
5492 Qtranslation_table = intern ("translation-table");
5493 staticpro (&Qtranslation_table);
1397dc18 5494 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
bdd9fb48 5495
f967223b
KH
5496 Qtranslation_table_id = intern ("translation-table-id");
5497 staticpro (&Qtranslation_table_id);
84fbb8a0 5498
f967223b
KH
5499 Qtranslation_table_for_decode = intern ("translation-table-for-decode");
5500 staticpro (&Qtranslation_table_for_decode);
a5d301df 5501
f967223b
KH
5502 Qtranslation_table_for_encode = intern ("translation-table-for-encode");
5503 staticpro (&Qtranslation_table_for_encode);
a5d301df 5504
70c22245
KH
5505 Qsafe_charsets = intern ("safe-charsets");
5506 staticpro (&Qsafe_charsets);
5507
1397dc18
KH
5508 Qvalid_codes = intern ("valid-codes");
5509 staticpro (&Qvalid_codes);
5510
9ce27fde
KH
5511 Qemacs_mule = intern ("emacs-mule");
5512 staticpro (&Qemacs_mule);
5513
d46c5b12
KH
5514 Qraw_text = intern ("raw-text");
5515 staticpro (&Qraw_text);
5516
4ed46869
KH
5517 defsubr (&Scoding_system_p);
5518 defsubr (&Sread_coding_system);
5519 defsubr (&Sread_non_nil_coding_system);
5520 defsubr (&Scheck_coding_system);
5521 defsubr (&Sdetect_coding_region);
d46c5b12 5522 defsubr (&Sdetect_coding_string);
4ed46869
KH
5523 defsubr (&Sdecode_coding_region);
5524 defsubr (&Sencode_coding_region);
5525 defsubr (&Sdecode_coding_string);
5526 defsubr (&Sencode_coding_string);
5527 defsubr (&Sdecode_sjis_char);
5528 defsubr (&Sencode_sjis_char);
5529 defsubr (&Sdecode_big5_char);
5530 defsubr (&Sencode_big5_char);
1ba9e4ab 5531 defsubr (&Sset_terminal_coding_system_internal);
c4825358 5532 defsubr (&Sset_safe_terminal_coding_system_internal);
4ed46869 5533 defsubr (&Sterminal_coding_system);
1ba9e4ab 5534 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 5535 defsubr (&Skeyboard_coding_system);
a5d301df 5536 defsubr (&Sfind_operation_coding_system);
1397dc18 5537 defsubr (&Supdate_coding_systems_internal);
66cfb530 5538 defsubr (&Sset_coding_priority_internal);
4ed46869 5539
4608c386
KH
5540 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
5541 "List of coding systems.\n\
5542\n\
5543Do not alter the value of this variable manually. This variable should be\n\
5544updated by the functions `make-coding-system' and\n\
5545`define-coding-system-alias'.");
5546 Vcoding_system_list = Qnil;
5547
5548 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
5549 "Alist of coding system names.\n\
5550Each element is one element list of coding system name.\n\
5551This variable is given to `completing-read' as TABLE argument.\n\
5552\n\
5553Do not alter the value of this variable manually. This variable should be\n\
5554updated by the functions `make-coding-system' and\n\
5555`define-coding-system-alias'.");
5556 Vcoding_system_alist = Qnil;
5557
4ed46869
KH
5558 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
5559 "List of coding-categories (symbols) ordered by priority.");
5560 {
5561 int i;
5562
5563 Vcoding_category_list = Qnil;
5564 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
5565 Vcoding_category_list
d46c5b12
KH
5566 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
5567 Vcoding_category_list);
4ed46869
KH
5568 }
5569
5570 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
10bff6f1 5571 "Specify the coding system for read operations.\n\
2ebb362d 5572It is useful to bind this variable with `let', but do not set it globally.\n\
4ed46869 5573If the value is a coding system, it is used for decoding on read operation.\n\
a67a9c66 5574If not, an appropriate element is used from one of the coding system alists:\n\
10bff6f1 5575There are three such tables, `file-coding-system-alist',\n\
a67a9c66 5576`process-coding-system-alist', and `network-coding-system-alist'.");
4ed46869
KH
5577 Vcoding_system_for_read = Qnil;
5578
5579 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
10bff6f1 5580 "Specify the coding system for write operations.\n\
2ebb362d 5581It is useful to bind this variable with `let', but do not set it globally.\n\
4ed46869 5582If the value is a coding system, it is used for encoding on write operation.\n\
a67a9c66 5583If not, an appropriate element is used from one of the coding system alists:\n\
10bff6f1 5584There are three such tables, `file-coding-system-alist',\n\
a67a9c66 5585`process-coding-system-alist', and `network-coding-system-alist'.");
4ed46869
KH
5586 Vcoding_system_for_write = Qnil;
5587
5588 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
a67a9c66 5589 "Coding system used in the latest file or process I/O.");
4ed46869
KH
5590 Vlast_coding_system_used = Qnil;
5591
9ce27fde 5592 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
f07f4a24 5593 "*Non-nil means always inhibit code conversion of end-of-line format.\n\
94c7a214
DL
5594See info node `Coding Systems' and info node `Text and Binary' concerning\n\
5595such conversion.");
9ce27fde
KH
5596 inhibit_eol_conversion = 0;
5597
ed29121d
EZ
5598 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
5599 "Non-nil means process buffer inherits coding system of process output.\n\
5600Bind it to t if the process output is to be treated as if it were a file\n\
5601read from some filesystem.");
5602 inherit_process_coding_system = 0;
5603
02ba4723
KH
5604 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
5605 "Alist to decide a coding system to use for a file I/O operation.\n\
5606The format is ((PATTERN . VAL) ...),\n\
5607where PATTERN is a regular expression matching a file name,\n\
5608VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5609If VAL is a coding system, it is used for both decoding and encoding\n\
5610the file contents.\n\
5611If VAL is a cons of coding systems, the car part is used for decoding,\n\
5612and the cdr part is used for encoding.\n\
5613If VAL is a function symbol, the function must return a coding system\n\
5614or a cons of coding systems which are used as above.\n\
e0e989f6 5615\n\
a85a871a 5616See also the function `find-operation-coding-system'\n\
eda284ac 5617and the variable `auto-coding-alist'.");
02ba4723
KH
5618 Vfile_coding_system_alist = Qnil;
5619
5620 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
5621 "Alist to decide a coding system to use for a process I/O operation.\n\
5622The format is ((PATTERN . VAL) ...),\n\
5623where PATTERN is a regular expression matching a program name,\n\
5624VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5625If VAL is a coding system, it is used for both decoding what received\n\
5626from the program and encoding what sent to the program.\n\
5627If VAL is a cons of coding systems, the car part is used for decoding,\n\
5628and the cdr part is used for encoding.\n\
5629If VAL is a function symbol, the function must return a coding system\n\
5630or a cons of coding systems which are used as above.\n\
4ed46869 5631\n\
9ce27fde 5632See also the function `find-operation-coding-system'.");
02ba4723
KH
5633 Vprocess_coding_system_alist = Qnil;
5634
5635 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
5636 "Alist to decide a coding system to use for a network I/O operation.\n\
5637The format is ((PATTERN . VAL) ...),\n\
5638where PATTERN is a regular expression matching a network service name\n\
5639or is a port number to connect to,\n\
5640VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5641If VAL is a coding system, it is used for both decoding what received\n\
5642from the network stream and encoding what sent to the network stream.\n\
5643If VAL is a cons of coding systems, the car part is used for decoding,\n\
5644and the cdr part is used for encoding.\n\
5645If VAL is a function symbol, the function must return a coding system\n\
5646or a cons of coding systems which are used as above.\n\
4ed46869 5647\n\
9ce27fde 5648See also the function `find-operation-coding-system'.");
02ba4723 5649 Vnetwork_coding_system_alist = Qnil;
4ed46869 5650
7722baf9
EZ
5651 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
5652 "*String displayed in mode line for UNIX-like (LF) end-of-line format.");
5653 eol_mnemonic_unix = build_string (":");
4ed46869 5654
7722baf9
EZ
5655 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
5656 "*String displayed in mode line for DOS-like (CRLF) end-of-line format.");
5657 eol_mnemonic_dos = build_string ("\\");
4ed46869 5658
7722baf9
EZ
5659 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
5660 "*String displayed in mode line for MAC-like (CR) end-of-line format.");
5661 eol_mnemonic_mac = build_string ("/");
4ed46869 5662
7722baf9
EZ
5663 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
5664 "*String displayed in mode line when end-of-line format is not yet determined.");
5665 eol_mnemonic_undecided = build_string (":");
4ed46869 5666
84fbb8a0 5667 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
f967223b 5668 "*Non-nil enables character translation while encoding and decoding.");
84fbb8a0 5669 Venable_character_translation = Qt;
bdd9fb48 5670
f967223b
KH
5671 DEFVAR_LISP ("standard-translation-table-for-decode",
5672 &Vstandard_translation_table_for_decode,
84fbb8a0 5673 "Table for translating characters while decoding.");
f967223b 5674 Vstandard_translation_table_for_decode = Qnil;
bdd9fb48 5675
f967223b
KH
5676 DEFVAR_LISP ("standard-translation-table-for-encode",
5677 &Vstandard_translation_table_for_encode,
84fbb8a0 5678 "Table for translationg characters while encoding.");
f967223b 5679 Vstandard_translation_table_for_encode = Qnil;
4ed46869
KH
5680
5681 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
5682 "Alist of charsets vs revision numbers.\n\
5683While encoding, if a charset (car part of an element) is found,\n\
5684designate it with the escape sequence identifing revision (cdr part of the element).");
5685 Vcharset_revision_alist = Qnil;
02ba4723
KH
5686
5687 DEFVAR_LISP ("default-process-coding-system",
5688 &Vdefault_process_coding_system,
5689 "Cons of coding systems used for process I/O by default.\n\
5690The car part is used for decoding a process output,\n\
5691the cdr part is used for encoding a text to be sent to a process.");
5692 Vdefault_process_coding_system = Qnil;
c4825358 5693
3f003981
KH
5694 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
5695 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
c4825358
KH
5696This is a vector of length 256.\n\
5697If Nth element is non-nil, the existence of code N in a file\n\
bb0115a2 5698\(or output of subprocess) doesn't prevent it to be detected as\n\
3f003981
KH
5699a coding system of ISO 2022 variant which has a flag\n\
5700`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
c4825358
KH
5701or reading output of a subprocess.\n\
5702Only 128th through 159th elements has a meaning.");
3f003981 5703 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
d46c5b12
KH
5704
5705 DEFVAR_LISP ("select-safe-coding-system-function",
5706 &Vselect_safe_coding_system_function,
5707 "Function to call to select safe coding system for encoding a text.\n\
5708\n\
5709If set, this function is called to force a user to select a proper\n\
5710coding system which can encode the text in the case that a default\n\
5711coding system used in each operation can't encode the text.\n\
5712\n\
a85a871a 5713The default value is `select-safe-coding-system' (which see).");
d46c5b12
KH
5714 Vselect_safe_coding_system_function = Qnil;
5715
4ed46869
KH
5716}
5717
5718#endif /* emacs */