(CODING_FLAG_ISO_LATIN_EXTRA): New macro.
[bpt/emacs.git] / src / coding.c
CommitLineData
4ed46869 1/* Coding system handler (conversion, detection, and etc).
203cb916
RS
2 Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4ed46869 4
369314dc
KH
5This file is part of GNU Emacs.
6
7GNU Emacs is free software; you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation; either version 2, or (at your option)
10any later version.
4ed46869 11
369314dc
KH
12GNU Emacs is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
4ed46869 16
369314dc
KH
17You should have received a copy of the GNU General Public License
18along with GNU Emacs; see the file COPYING. If not, write to
19the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20Boston, MA 02111-1307, USA. */
4ed46869
KH
21
22/*** TABLE OF CONTENTS ***
23
24 1. Preamble
0ef69138 25 2. Emacs' internal format (emacs-mule) handlers
4ed46869
KH
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
28 5. End-of-line handlers
29 6. C library functions
30 7. Emacs Lisp library functions
31 8. Post-amble
32
33*/
34
35/*** GENERAL NOTE on CODING SYSTEM ***
36
37 Coding system is an encoding mechanism of one or more character
38 sets. Here's a list of coding systems which Emacs can handle. When
39 we say "decode", it means converting some other coding system to
0ef69138
KH
40 Emacs' internal format (emacs-internal), and when we say "encode",
41 it means converting the coding system emacs-mule to some other
42 coding system.
4ed46869 43
0ef69138 44 0. Emacs' internal format (emacs-mule)
4ed46869
KH
45
46 Emacs itself holds a multi-lingual character in a buffer and a string
f4dee582 47 in a special format. Details are described in section 2.
4ed46869
KH
48
49 1. ISO2022
50
51 The most famous coding system for multiple character sets. X's
f4dee582
RS
52 Compound Text, various EUCs (Extended Unix Code), and coding
53 systems used in Internet communication such as ISO-2022-JP are
54 all variants of ISO2022. Details are described in section 3.
4ed46869
KH
55
56 2. SJIS (or Shift-JIS or MS-Kanji-Code)
57
58 A coding system to encode character sets: ASCII, JISX0201, and
59 JISX0208. Widely used for PC's in Japan. Details are described in
f4dee582 60 section 4.
4ed46869
KH
61
62 3. BIG5
63
64 A coding system to encode character sets: ASCII and Big5. Widely
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
f4dee582
RS
66 described in section 4. In this file, when we write "BIG5"
67 (all uppercase), we mean the coding system, and when we write
68 "Big5" (capitalized), we mean the character set.
4ed46869 69
f4dee582 70 4. Other
4ed46869 71
f4dee582 72 If a user wants to read/write a text encoded in a coding system not
4ed46869
KH
73 listed above, he can supply a decoder and an encoder for it in CCL
74 (Code Conversion Language) programs. Emacs executes the CCL program
75 while reading/writing.
76
f4dee582 77 Emacs represents a coding-system by a Lisp symbol that has a property
4ed46869
KH
78 `coding-system'. But, before actually using the coding-system, the
79 information about it is set in a structure of type `struct
f4dee582 80 coding_system' for rapid processing. See section 6 for more details.
4ed46869
KH
81
82*/
83
84/*** GENERAL NOTES on END-OF-LINE FORMAT ***
85
86 How end-of-line of a text is encoded depends on a system. For
87 instance, Unix's format is just one byte of `line-feed' code,
f4dee582 88 whereas DOS's format is two-byte sequence of `carriage-return' and
4ed46869
KH
89 `line-feed' codes. MacOS's format is one byte of `carriage-return'.
90
f4dee582
RS
91 Since text characters encoding and end-of-line encoding are
92 independent, any coding system described above can take
4ed46869 93 any format of end-of-line. So, Emacs has information of format of
f4dee582 94 end-of-line in each coding-system. See section 6 for more details.
4ed46869
KH
95
96*/
97
98/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
99
100 These functions check if a text between SRC and SRC_END is encoded
101 in the coding system category XXX. Each returns an integer value in
102 which appropriate flag bits for the category XXX is set. The flag
103 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
104 template of these functions. */
105#if 0
106int
0ef69138 107detect_coding_emacs_mule (src, src_end)
4ed46869
KH
108 unsigned char *src, *src_end;
109{
110 ...
111}
112#endif
113
114/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
115
116 These functions decode SRC_BYTES length text at SOURCE encoded in
0ef69138 117 CODING to Emacs' internal format (emacs-mule). The resulting text
f4dee582
RS
118 goes to a place pointed to by DESTINATION, the length of which should
119 not exceed DST_BYTES. The number of bytes actually processed is
120 returned as *CONSUMED. The return value is the length of the decoded
121 text. Below is a template of these functions. */
4ed46869
KH
122#if 0
123decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
124 struct coding_system *coding;
125 unsigned char *source, *destination;
126 int src_bytes, dst_bytes;
127 int *consumed;
128{
129 ...
130}
131#endif
132
133/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
134
0ef69138
KH
135 These functions encode SRC_BYTES length text at SOURCE of Emacs'
136 internal format (emacs-mule) to CODING. The resulting text goes to
f4dee582
RS
137 a place pointed to by DESTINATION, the length of which should not
138 exceed DST_BYTES. The number of bytes actually processed is
139 returned as *CONSUMED. The return value is the length of the
140 encoded text. Below is a template of these functions. */
4ed46869
KH
141#if 0
142encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
143 struct coding_system *coding;
144 unsigned char *source, *destination;
145 int src_bytes, dst_bytes;
146 int *consumed;
147{
148 ...
149}
150#endif
151
152/*** COMMONLY USED MACROS ***/
153
154/* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
155 THREE_MORE_BYTES safely get one, two, and three bytes from the
156 source text respectively. If there are not enough bytes in the
157 source, they jump to `label_end_of_loop'. The caller should set
158 variables `src' and `src_end' to appropriate areas in advance. */
159
160#define ONE_MORE_BYTE(c1) \
161 do { \
162 if (src < src_end) \
163 c1 = *src++; \
164 else \
165 goto label_end_of_loop; \
166 } while (0)
167
168#define TWO_MORE_BYTES(c1, c2) \
169 do { \
170 if (src + 1 < src_end) \
171 c1 = *src++, c2 = *src++; \
172 else \
173 goto label_end_of_loop; \
174 } while (0)
175
176#define THREE_MORE_BYTES(c1, c2, c3) \
177 do { \
178 if (src + 2 < src_end) \
179 c1 = *src++, c2 = *src++, c3 = *src++; \
180 else \
181 goto label_end_of_loop; \
182 } while (0)
183
184/* The following three macros DECODE_CHARACTER_ASCII,
185 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
186 the multi-byte form of a character of each class at the place
187 pointed by `dst'. The caller should set the variable `dst' to
188 point to an appropriate area and the variable `coding' to point to
189 the coding-system of the currently decoding text in advance. */
190
191/* Decode one ASCII character C. */
192
193#define DECODE_CHARACTER_ASCII(c) \
194 do { \
195 if (COMPOSING_P (coding->composing)) \
196 *dst++ = 0xA0, *dst++ = (c) | 0x80; \
197 else \
198 *dst++ = (c); \
199 } while (0)
200
f4dee582 201/* Decode one DIMENSION1 character whose charset is CHARSET and whose
4ed46869
KH
202 position-code is C. */
203
204#define DECODE_CHARACTER_DIMENSION1(charset, c) \
205 do { \
206 unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset); \
207 if (COMPOSING_P (coding->composing)) \
208 *dst++ = leading_code + 0x20; \
209 else \
210 *dst++ = leading_code; \
211 if (leading_code = CHARSET_LEADING_CODE_EXT (charset)) \
212 *dst++ = leading_code; \
213 *dst++ = (c) | 0x80; \
214 } while (0)
215
f4dee582 216/* Decode one DIMENSION2 character whose charset is CHARSET and whose
4ed46869
KH
217 position-codes are C1 and C2. */
218
219#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2) \
220 do { \
221 DECODE_CHARACTER_DIMENSION1 (charset, c1); \
222 *dst++ = (c2) | 0x80; \
223 } while (0)
224
225\f
226/*** 1. Preamble ***/
227
228#include <stdio.h>
229
230#ifdef emacs
231
232#include <config.h>
233#include "lisp.h"
234#include "buffer.h"
235#include "charset.h"
236#include "ccl.h"
237#include "coding.h"
238#include "window.h"
239
240#else /* not emacs */
241
242#include "mulelib.h"
243
244#endif /* not emacs */
245
246Lisp_Object Qcoding_system, Qeol_type;
247Lisp_Object Qbuffer_file_coding_system;
248Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
249
250extern Lisp_Object Qinsert_file_contents, Qwrite_region;
251Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
252Lisp_Object Qstart_process, Qopen_network_stream;
253Lisp_Object Qtarget_idx;
254
255/* Mnemonic character of each format of end-of-line. */
256int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
257/* Mnemonic character to indicate format of end-of-line is not yet
258 decided. */
259int eol_mnemonic_undecided;
260
9ce27fde
KH
261/* Format of end-of-line decided by system. This is CODING_EOL_LF on
262 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
263int system_eol_type;
264
4ed46869
KH
265#ifdef emacs
266
02ba4723 267Lisp_Object Qcoding_system_spec, Qcoding_system_p, Qcoding_system_error;
4ed46869 268
9ce27fde
KH
269/* Coding system emacs-mule is for converting only end-of-line format. */
270Lisp_Object Qemacs_mule;
271
4ed46869
KH
272/* Coding-systems are handed between Emacs Lisp programs and C internal
273 routines by the following three variables. */
274/* Coding-system for reading files and receiving data from process. */
275Lisp_Object Vcoding_system_for_read;
276/* Coding-system for writing files and sending data to process. */
277Lisp_Object Vcoding_system_for_write;
278/* Coding-system actually used in the latest I/O. */
279Lisp_Object Vlast_coding_system_used;
280
c4825358 281/* A vector of length 256 which contains information about special
3f003981
KH
282 Latin codes (espepcially for dealing with Microsoft code). */
283Lisp_Object Vlatin_extra_code_table;
c4825358 284
9ce27fde
KH
285/* Flag to inhibit code conversion of end-of-line format. */
286int inhibit_eol_conversion;
287
c4825358 288/* Coding system to be used to encode text for terminal display. */
4ed46869
KH
289struct coding_system terminal_coding;
290
c4825358
KH
291/* Coding system to be used to encode text for terminal display when
292 terminal coding system is nil. */
293struct coding_system safe_terminal_coding;
294
295/* Coding system of what is sent from terminal keyboard. */
4ed46869
KH
296struct coding_system keyboard_coding;
297
02ba4723
KH
298Lisp_Object Vfile_coding_system_alist;
299Lisp_Object Vprocess_coding_system_alist;
300Lisp_Object Vnetwork_coding_system_alist;
4ed46869
KH
301
302#endif /* emacs */
303
304Lisp_Object Qcoding_category_index;
305
306/* List of symbols `coding-category-xxx' ordered by priority. */
307Lisp_Object Vcoding_category_list;
308
309/* Table of coding-systems currently assigned to each coding-category. */
310Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
311
312/* Table of names of symbol for each coding-category. */
313char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
0ef69138 314 "coding-category-emacs-mule",
4ed46869
KH
315 "coding-category-sjis",
316 "coding-category-iso-7",
317 "coding-category-iso-8-1",
318 "coding-category-iso-8-2",
7717c392
KH
319 "coding-category-iso-7-else",
320 "coding-category-iso-8-else",
4ed46869
KH
321 "coding-category-big5",
322 "coding-category-binary"
323};
324
bdd9fb48
KH
325/* Flag to tell if we look up unification table on character code
326 conversion. */
327Lisp_Object Venable_character_unification;
a5d301df
KH
328/* Standard unification table to look up on decoding (reading). */
329Lisp_Object Vstandard_character_unification_table_for_decode;
330/* Standard unification table to look up on encoding (writing). */
331Lisp_Object Vstandard_character_unification_table_for_encode;
bdd9fb48
KH
332
333Lisp_Object Qcharacter_unification_table;
a5d301df
KH
334Lisp_Object Qcharacter_unification_table_for_decode;
335Lisp_Object Qcharacter_unification_table_for_encode;
4ed46869
KH
336
337/* Alist of charsets vs revision number. */
338Lisp_Object Vcharset_revision_alist;
339
02ba4723
KH
340/* Default coding systems used for process I/O. */
341Lisp_Object Vdefault_process_coding_system;
342
4ed46869 343\f
0ef69138 344/*** 2. Emacs internal format (emacs-mule) handlers ***/
4ed46869
KH
345
346/* Emacs' internal format for encoding multiple character sets is a
f4dee582
RS
347 kind of multi-byte encoding, i.e. characters are encoded by
348 variable-length sequences of one-byte codes. ASCII characters
349 and control characters (e.g. `tab', `newline') are represented by
350 one-byte sequences which are their ASCII codes, in the range 0x00
351 through 0x7F. The other characters are represented by a sequence
352 of `base leading-code', optional `extended leading-code', and one
353 or two `position-code's. The length of the sequence is determined
354 by the base leading-code. Leading-code takes the range 0x80
355 through 0x9F, whereas extended leading-code and position-code take
356 the range 0xA0 through 0xFF. See `charset.h' for more details
357 about leading-code and position-code.
358
359 There's one exception to this rule. Special leading-code
4ed46869
KH
360 `leading-code-composition' denotes that the following several
361 characters should be composed into one character. Leading-codes of
362 components (except for ASCII) are added 0x20. An ASCII character
363 component is represented by a 2-byte sequence of `0xA0' and
f4dee582
RS
364 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
365 details of composite character. Hence, we can summarize the code
4ed46869
KH
366 range as follows:
367
368 --- CODE RANGE of Emacs' internal format ---
369 (character set) (range)
370 ASCII 0x00 .. 0x7F
371 ELSE (1st byte) 0x80 .. 0x9F
372 (rest bytes) 0xA0 .. 0xFF
373 ---------------------------------------------
374
375 */
376
377enum emacs_code_class_type emacs_code_class[256];
378
379/* Go to the next statement only if *SRC is accessible and the code is
380 greater than 0xA0. */
381#define CHECK_CODE_RANGE_A0_FF \
382 do { \
383 if (src >= src_end) \
384 goto label_end_of_switch; \
385 else if (*src++ < 0xA0) \
386 return 0; \
387 } while (0)
388
389/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
390 Check if a text is encoded in Emacs' internal format. If it is,
0ef69138 391 return CODING_CATEGORY_MASK_EMASC_MULE, else return 0. */
4ed46869
KH
392
393int
0ef69138 394detect_coding_emacs_mule (src, src_end)
4ed46869
KH
395 unsigned char *src, *src_end;
396{
397 unsigned char c;
398 int composing = 0;
399
400 while (src < src_end)
401 {
402 c = *src++;
403
404 if (composing)
405 {
406 if (c < 0xA0)
407 composing = 0;
408 else
409 c -= 0x20;
410 }
411
412 switch (emacs_code_class[c])
413 {
414 case EMACS_ascii_code:
415 case EMACS_linefeed_code:
416 break;
417
418 case EMACS_control_code:
419 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
420 return 0;
421 break;
422
423 case EMACS_invalid_code:
424 return 0;
425
426 case EMACS_leading_code_composition: /* c == 0x80 */
427 if (composing)
428 CHECK_CODE_RANGE_A0_FF;
429 else
430 composing = 1;
431 break;
432
433 case EMACS_leading_code_4:
434 CHECK_CODE_RANGE_A0_FF;
435 /* fall down to check it two more times ... */
436
437 case EMACS_leading_code_3:
438 CHECK_CODE_RANGE_A0_FF;
439 /* fall down to check it one more time ... */
440
441 case EMACS_leading_code_2:
442 CHECK_CODE_RANGE_A0_FF;
443 break;
444
445 default:
446 label_end_of_switch:
447 break;
448 }
449 }
0ef69138 450 return CODING_CATEGORY_MASK_EMACS_MULE;
4ed46869
KH
451}
452
453\f
454/*** 3. ISO2022 handlers ***/
455
456/* The following note describes the coding system ISO2022 briefly.
f4dee582
RS
457 Since the intention of this note is to help in understanding of
458 the programs in this file, some parts are NOT ACCURATE or OVERLY
4ed46869
KH
459 SIMPLIFIED. For the thorough understanding, please refer to the
460 original document of ISO2022.
461
462 ISO2022 provides many mechanisms to encode several character sets
f4dee582 463 in 7-bit and 8-bit environment. If one chooses 7-bite environment,
4ed46869 464 all text is encoded by codes of less than 128. This may make the
f4dee582
RS
465 encoded text a little bit longer, but the text gets more stability
466 to pass through several gateways (some of them strip off the MSB).
4ed46869 467
f4dee582 468 There are two kinds of character set: control character set and
4ed46869
KH
469 graphic character set. The former contains control characters such
470 as `newline' and `escape' to provide control functions (control
f4dee582 471 functions are provided also by escape sequences). The latter
4ed46869
KH
472 contains graphic characters such as ' A' and '-'. Emacs recognizes
473 two control character sets and many graphic character sets.
474
475 Graphic character sets are classified into one of the following
476 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
477 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
478 bytes (DIMENSION) and the number of characters in one dimension
479 (CHARS) of the set. In addition, each character set is assigned an
480 identification tag (called "final character" and denoted as <F>
481 here after) which is unique in each class. <F> of each character
482 set is decided by ECMA(*) when it is registered in ISO. Code range
483 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
484
485 Note (*): ECMA = European Computer Manufacturers Association
486
487 Here are examples of graphic character set [NAME(<F>)]:
488 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
489 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
490 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
491 o DIMENSION2_CHARS96 -- none for the moment
492
493 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
494 C0 [0x00..0x1F] -- control character plane 0
495 GL [0x20..0x7F] -- graphic character plane 0
496 C1 [0x80..0x9F] -- control character plane 1
497 GR [0xA0..0xFF] -- graphic character plane 1
498
499 A control character set is directly designated and invoked to C0 or
500 C1 by an escape sequence. The most common case is that ISO646's
501 control character set is designated/invoked to C0 and ISO6429's
502 control character set is designated/invoked to C1, and usually
503 these designations/invocations are omitted in a coded text. With
504 7-bit environment, only C0 can be used, and a control character for
505 C1 is encoded by an appropriate escape sequence to fit in the
506 environment. All control characters for C1 are defined the
507 corresponding escape sequences.
508
509 A graphic character set is at first designated to one of four
510 graphic registers (G0 through G3), then these graphic registers are
511 invoked to GL or GR. These designations and invocations can be
512 done independently. The most common case is that G0 is invoked to
513 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
514 these invocations and designations are omitted in a coded text.
515 With 7-bit environment, only GL can be used.
516
517 When a graphic character set of CHARS94 is invoked to GL, code 0x20
518 and 0x7F of GL area work as control characters SPACE and DEL
519 respectively, and code 0xA0 and 0xFF of GR area should not be used.
520
521 There are two ways of invocation: locking-shift and single-shift.
522 With locking-shift, the invocation lasts until the next different
523 invocation, whereas with single-shift, the invocation works only
524 for the following character and doesn't affect locking-shift.
525 Invocations are done by the following control characters or escape
526 sequences.
527
528 ----------------------------------------------------------------------
529 function control char escape sequence description
530 ----------------------------------------------------------------------
531 SI (shift-in) 0x0F none invoke G0 to GL
10bff6f1 532 SO (shift-out) 0x0E none invoke G1 to GL
4ed46869
KH
533 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
534 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
535 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
536 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
537 ----------------------------------------------------------------------
538 The first four are for locking-shift. Control characters for these
539 functions are defined by macros ISO_CODE_XXX in `coding.h'.
540
541 Designations are done by the following escape sequences.
542 ----------------------------------------------------------------------
543 escape sequence description
544 ----------------------------------------------------------------------
545 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
546 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
547 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
548 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
549 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
550 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
551 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
552 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
553 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
554 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
555 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
556 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
557 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
558 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
559 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
560 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
561 ----------------------------------------------------------------------
562
563 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
564 of dimension 1, chars 94, and final character <F>, and etc.
565
566 Note (*): Although these designations are not allowed in ISO2022,
567 Emacs accepts them on decoding, and produces them on encoding
568 CHARS96 character set in a coding system which is characterized as
569 7-bit environment, non-locking-shift, and non-single-shift.
570
571 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
572 '(' can be omitted. We call this as "short-form" here after.
573
574 Now you may notice that there are a lot of ways for encoding the
f4dee582 575 same multilingual text in ISO2022. Actually, there exists many
4ed46869
KH
576 coding systems such as Compound Text (used in X's inter client
577 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
578 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
579 localized platforms), and all of these are variants of ISO2022.
580
581 In addition to the above, Emacs handles two more kinds of escape
582 sequences: ISO6429's direction specification and Emacs' private
583 sequence for specifying character composition.
584
585 ISO6429's direction specification takes the following format:
586 o CSI ']' -- end of the current direction
587 o CSI '0' ']' -- end of the current direction
588 o CSI '1' ']' -- start of left-to-right text
589 o CSI '2' ']' -- start of right-to-left text
590 The control character CSI (0x9B: control sequence introducer) is
591 abbreviated to the escape sequence ESC '[' in 7-bit environment.
592
593 Character composition specification takes the following format:
594 o ESC '0' -- start character composition
595 o ESC '1' -- end character composition
596 Since these are not standard escape sequences of any ISO, the use
597 of them for these meaning is restricted to Emacs only. */
598
599enum iso_code_class_type iso_code_class[256];
600
601/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
602 Check if a text is encoded in ISO2022. If it is, returns an
603 integer in which appropriate flag bits any of:
604 CODING_CATEGORY_MASK_ISO_7
605 CODING_CATEGORY_MASK_ISO_8_1
606 CODING_CATEGORY_MASK_ISO_8_2
7717c392
KH
607 CODING_CATEGORY_MASK_ISO_7_ELSE
608 CODING_CATEGORY_MASK_ISO_8_ELSE
4ed46869
KH
609 are set. If a code which should never appear in ISO2022 is found,
610 returns 0. */
611
612int
613detect_coding_iso2022 (src, src_end)
614 unsigned char *src, *src_end;
615{
765a2ca5
KH
616 int mask = (CODING_CATEGORY_MASK_ISO_7
617 | CODING_CATEGORY_MASK_ISO_8_1
618 | CODING_CATEGORY_MASK_ISO_8_2
7717c392
KH
619 | CODING_CATEGORY_MASK_ISO_7_ELSE
620 | CODING_CATEGORY_MASK_ISO_8_ELSE
621 );
bcf26d6a
KH
622 int g1 = 0; /* 1 iff designating to G1. */
623 int c, i;
3f003981 624 struct coding_system coding_iso_8_1, coding_iso_8_2;
4ed46869 625
3f003981
KH
626 /* Coding systems of these categories may accept latin extra codes. */
627 setup_coding_system
628 (XSYMBOL (coding_category_table[CODING_CATEGORY_IDX_ISO_8_1])->value,
629 &coding_iso_8_1);
630 setup_coding_system
631 (XSYMBOL (coding_category_table[CODING_CATEGORY_IDX_ISO_8_2])->value,
632 &coding_iso_8_2);
633
634 while (mask && src < src_end)
4ed46869
KH
635 {
636 c = *src++;
637 switch (c)
638 {
639 case ISO_CODE_ESC:
e0e989f6 640 if (src >= src_end)
4ed46869
KH
641 break;
642 c = *src++;
bf9cdd4e 643 if ((c >= '(' && c <= '/'))
4ed46869 644 {
bf9cdd4e
KH
645 /* Designation sequence for a charset of dimension 1. */
646 if (src >= src_end)
647 break;
648 c = *src++;
649 if (c < ' ' || c >= 0x80)
650 /* Invalid designation sequence. */
651 return 0;
652 }
653 else if (c == '$')
654 {
655 /* Designation sequence for a charset of dimension 2. */
656 if (src >= src_end)
657 break;
658 c = *src++;
659 if (c >= '@' && c <= 'B')
660 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
661 ;
662 else if (c >= '(' && c <= '/')
bcf26d6a 663 {
bf9cdd4e
KH
664 if (src >= src_end)
665 break;
666 c = *src++;
667 if (c < ' ' || c >= 0x80)
668 /* Invalid designation sequence. */
669 return 0;
bcf26d6a 670 }
bf9cdd4e
KH
671 else
672 /* Invalid designation sequence. */
673 return 0;
4ed46869 674 }
4ed46869 675 else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
bf9cdd4e 676 /* Locking shift. */
7717c392
KH
677 mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
678 | CODING_CATEGORY_MASK_ISO_8_ELSE);
bf9cdd4e
KH
679 else if (c == '0' || c == '1' || c == '2')
680 /* Start/end composition. */
681 ;
682 else
683 /* Invalid escape sequence. */
684 return 0;
4ed46869
KH
685 break;
686
4ed46869 687 case ISO_CODE_SO:
bf9cdd4e
KH
688 mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
689 | CODING_CATEGORY_MASK_ISO_8_ELSE);
e0e989f6
KH
690 break;
691
4ed46869
KH
692 case ISO_CODE_CSI:
693 case ISO_CODE_SS2:
694 case ISO_CODE_SS3:
3f003981
KH
695 {
696 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
697
698 if (VECTORP (Vlatin_extra_code_table)
699 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
700 {
701 if (coding_iso_8_1.flags & CODING_FLAG_ISO_LATIN_EXTRA)
702 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
703 if (coding_iso_8_2.flags & CODING_FLAG_ISO_LATIN_EXTRA)
704 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
705 }
706 mask &= newmask;
707 }
708 break;
4ed46869
KH
709
710 default:
711 if (c < 0x80)
712 break;
713 else if (c < 0xA0)
c4825358 714 {
3f003981
KH
715 if (VECTORP (Vlatin_extra_code_table)
716 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
c4825358 717 {
3f003981
KH
718 int newmask = 0;
719
720 if (coding_iso_8_1.flags & CODING_FLAG_ISO_LATIN_EXTRA)
721 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
722 if (coding_iso_8_2.flags & CODING_FLAG_ISO_LATIN_EXTRA)
723 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
724 mask &= newmask;
c4825358 725 }
3f003981
KH
726 else
727 return 0;
c4825358 728 }
4ed46869
KH
729 else
730 {
7717c392 731 unsigned char *src_begin = src;
4ed46869 732
7717c392
KH
733 mask &= ~(CODING_CATEGORY_MASK_ISO_7
734 | CODING_CATEGORY_MASK_ISO_7_ELSE);
e0e989f6 735 while (src < src_end && *src >= 0xA0)
7717c392
KH
736 src++;
737 if ((src - src_begin - 1) & 1 && src < src_end)
4ed46869
KH
738 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
739 }
740 break;
741 }
742 }
743
744 return mask;
745}
746
747/* Decode a character of which charset is CHARSET and the 1st position
bdd9fb48 748 code is C1. If dimension of CHARSET is 2, the 2nd position code is
4ed46869
KH
749 fetched from SRC and set to C2. If CHARSET is negative, it means
750 that we are decoding ill formed text, and what we can do is just to
751 read C1 as is. */
752
bdd9fb48
KH
753#define DECODE_ISO_CHARACTER(charset, c1) \
754 do { \
755 int c_alt, charset_alt = (charset); \
756 if (COMPOSING_HEAD_P (coding->composing)) \
757 { \
758 *dst++ = LEADING_CODE_COMPOSITION; \
759 if (COMPOSING_WITH_RULE_P (coding->composing)) \
760 /* To tell composition rules are embeded. */ \
761 *dst++ = 0xFF; \
762 coding->composing += 2; \
763 } \
764 if ((charset) >= 0) \
765 { \
766 if (CHARSET_DIMENSION (charset) == 2) \
767 ONE_MORE_BYTE (c2); \
768 if (!NILP (unification_table) \
769 && ((c_alt = unify_char (unification_table, \
770 -1, (charset), c1, c2)) >= 0)) \
771 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
772 } \
773 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
774 DECODE_CHARACTER_ASCII (c1); \
775 else if (CHARSET_DIMENSION (charset_alt) == 1) \
776 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
777 else \
778 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
779 if (COMPOSING_WITH_RULE_P (coding->composing)) \
780 /* To tell a composition rule follows. */ \
781 coding->composing = COMPOSING_WITH_RULE_RULE; \
4ed46869
KH
782 } while (0)
783
784/* Set designation state into CODING. */
785#define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
786 do { \
2e34157c
RS
787 int charset = ISO_CHARSET_TABLE (make_number (dimension), \
788 make_number (chars), \
789 make_number (final_char)); \
4ed46869
KH
790 if (charset >= 0) \
791 { \
792 if (coding->direction == 1 \
793 && CHARSET_REVERSE_CHARSET (charset) >= 0) \
794 charset = CHARSET_REVERSE_CHARSET (charset); \
795 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
796 } \
797 } while (0)
798
799/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
800
801int
802decode_coding_iso2022 (coding, source, destination,
803 src_bytes, dst_bytes, consumed)
804 struct coding_system *coding;
805 unsigned char *source, *destination;
806 int src_bytes, dst_bytes;
807 int *consumed;
808{
809 unsigned char *src = source;
810 unsigned char *src_end = source + src_bytes;
811 unsigned char *dst = destination;
812 unsigned char *dst_end = destination + dst_bytes;
813 /* Since the maximum bytes produced by each loop is 7, we subtract 6
814 from DST_END to assure that overflow checking is necessary only
815 at the head of loop. */
816 unsigned char *adjusted_dst_end = dst_end - 6;
817 int charset;
818 /* Charsets invoked to graphic plane 0 and 1 respectively. */
819 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
820 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
a5d301df
KH
821 Lisp_Object unification_table
822 = coding->character_unification_table_for_decode;
bdd9fb48
KH
823
824 if (!NILP (Venable_character_unification) && NILP (unification_table))
a5d301df 825 unification_table = Vstandard_character_unification_table_for_decode;
4ed46869
KH
826
827 while (src < src_end && dst < adjusted_dst_end)
828 {
829 /* SRC_BASE remembers the start position in source in each loop.
830 The loop will be exited when there's not enough source text
831 to analyze long escape sequence or 2-byte code (within macros
832 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
833 to SRC_BASE before exiting. */
834 unsigned char *src_base = src;
bdd9fb48 835 int c1 = *src++, c2;
4ed46869
KH
836
837 switch (iso_code_class [c1])
838 {
839 case ISO_0x20_or_0x7F:
840 if (!coding->composing
841 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
842 {
843 /* This is SPACE or DEL. */
844 *dst++ = c1;
845 break;
846 }
847 /* This is a graphic character, we fall down ... */
848
849 case ISO_graphic_plane_0:
850 if (coding->composing == COMPOSING_WITH_RULE_RULE)
851 {
852 /* This is a composition rule. */
853 *dst++ = c1 | 0x80;
854 coding->composing = COMPOSING_WITH_RULE_TAIL;
855 }
856 else
857 DECODE_ISO_CHARACTER (charset0, c1);
858 break;
859
860 case ISO_0xA0_or_0xFF:
861 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
862 {
863 /* Invalid code. */
864 *dst++ = c1;
865 break;
866 }
867 /* This is a graphic character, we fall down ... */
868
869 case ISO_graphic_plane_1:
870 DECODE_ISO_CHARACTER (charset1, c1);
871 break;
872
873 case ISO_control_code:
874 /* All ISO2022 control characters in this class have the
875 same representation in Emacs internal format. */
876 *dst++ = c1;
877 break;
878
879 case ISO_carriage_return:
880 if (coding->eol_type == CODING_EOL_CR)
881 {
882 *dst++ = '\n';
883 }
884 else if (coding->eol_type == CODING_EOL_CRLF)
885 {
886 ONE_MORE_BYTE (c1);
887 if (c1 == ISO_CODE_LF)
888 *dst++ = '\n';
889 else
890 {
891 src--;
892 *dst++ = c1;
893 }
894 }
895 else
896 {
897 *dst++ = c1;
898 }
899 break;
900
901 case ISO_shift_out:
e0e989f6
KH
902 if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
903 goto label_invalid_escape_sequence;
4ed46869
KH
904 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
905 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
906 break;
907
908 case ISO_shift_in:
909 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
910 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
911 break;
912
913 case ISO_single_shift_2_7:
914 case ISO_single_shift_2:
915 /* SS2 is handled as an escape sequence of ESC 'N' */
916 c1 = 'N';
917 goto label_escape_sequence;
918
919 case ISO_single_shift_3:
920 /* SS2 is handled as an escape sequence of ESC 'O' */
921 c1 = 'O';
922 goto label_escape_sequence;
923
924 case ISO_control_sequence_introducer:
925 /* CSI is handled as an escape sequence of ESC '[' ... */
926 c1 = '[';
927 goto label_escape_sequence;
928
929 case ISO_escape:
930 ONE_MORE_BYTE (c1);
931 label_escape_sequence:
932 /* Escape sequences handled by Emacs are invocation,
933 designation, direction specification, and character
934 composition specification. */
935 switch (c1)
936 {
937 case '&': /* revision of following character set */
938 ONE_MORE_BYTE (c1);
939 if (!(c1 >= '@' && c1 <= '~'))
e0e989f6 940 goto label_invalid_escape_sequence;
4ed46869
KH
941 ONE_MORE_BYTE (c1);
942 if (c1 != ISO_CODE_ESC)
e0e989f6 943 goto label_invalid_escape_sequence;
4ed46869
KH
944 ONE_MORE_BYTE (c1);
945 goto label_escape_sequence;
946
947 case '$': /* designation of 2-byte character set */
948 ONE_MORE_BYTE (c1);
949 if (c1 >= '@' && c1 <= 'B')
950 { /* designation of JISX0208.1978, GB2312.1980,
951 or JISX0208.1980 */
952 DECODE_DESIGNATION (0, 2, 94, c1);
953 }
954 else if (c1 >= 0x28 && c1 <= 0x2B)
955 { /* designation of DIMENSION2_CHARS94 character set */
956 ONE_MORE_BYTE (c2);
957 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
958 }
959 else if (c1 >= 0x2C && c1 <= 0x2F)
960 { /* designation of DIMENSION2_CHARS96 character set */
961 ONE_MORE_BYTE (c2);
962 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
963 }
964 else
e0e989f6 965 goto label_invalid_escape_sequence;
4ed46869
KH
966 break;
967
968 case 'n': /* invocation of locking-shift-2 */
e0e989f6
KH
969 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
970 goto label_invalid_escape_sequence;
4ed46869 971 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
e0e989f6 972 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
973 break;
974
975 case 'o': /* invocation of locking-shift-3 */
e0e989f6
KH
976 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
977 goto label_invalid_escape_sequence;
4ed46869 978 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
e0e989f6 979 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
980 break;
981
982 case 'N': /* invocation of single-shift-2 */
e0e989f6
KH
983 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
984 goto label_invalid_escape_sequence;
4ed46869
KH
985 ONE_MORE_BYTE (c1);
986 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
987 DECODE_ISO_CHARACTER (charset, c1);
988 break;
989
990 case 'O': /* invocation of single-shift-3 */
e0e989f6
KH
991 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
992 goto label_invalid_escape_sequence;
4ed46869
KH
993 ONE_MORE_BYTE (c1);
994 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
995 DECODE_ISO_CHARACTER (charset, c1);
996 break;
997
998 case '0': /* start composing without embeded rules */
999 coding->composing = COMPOSING_NO_RULE_HEAD;
1000 break;
1001
1002 case '1': /* end composing */
1003 coding->composing = COMPOSING_NO;
1004 break;
1005
1006 case '2': /* start composing with embeded rules */
1007 coding->composing = COMPOSING_WITH_RULE_HEAD;
1008 break;
1009
1010 case '[': /* specification of direction */
1011 /* For the moment, nested direction is not supported.
1012 So, the value of `coding->direction' is 0 or 1: 0
1013 means left-to-right, 1 means right-to-left. */
1014 ONE_MORE_BYTE (c1);
1015 switch (c1)
1016 {
1017 case ']': /* end of the current direction */
1018 coding->direction = 0;
1019
1020 case '0': /* end of the current direction */
1021 case '1': /* start of left-to-right direction */
1022 ONE_MORE_BYTE (c1);
1023 if (c1 == ']')
1024 coding->direction = 0;
1025 else
1026 goto label_invalid_escape_sequence;
1027 break;
1028
1029 case '2': /* start of right-to-left direction */
1030 ONE_MORE_BYTE (c1);
1031 if (c1 == ']')
1032 coding->direction= 1;
1033 else
1034 goto label_invalid_escape_sequence;
1035 break;
1036
1037 default:
1038 goto label_invalid_escape_sequence;
1039 }
1040 break;
1041
1042 default:
1043 if (c1 >= 0x28 && c1 <= 0x2B)
1044 { /* designation of DIMENSION1_CHARS94 character set */
1045 ONE_MORE_BYTE (c2);
1046 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1047 }
1048 else if (c1 >= 0x2C && c1 <= 0x2F)
1049 { /* designation of DIMENSION1_CHARS96 character set */
1050 ONE_MORE_BYTE (c2);
1051 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1052 }
1053 else
1054 {
1055 goto label_invalid_escape_sequence;
1056 }
1057 }
1058 /* We must update these variables now. */
1059 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1060 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1061 break;
1062
1063 label_invalid_escape_sequence:
1064 {
1065 int length = src - src_base;
1066
1067 bcopy (src_base, dst, length);
1068 dst += length;
1069 }
1070 }
1071 continue;
1072
1073 label_end_of_loop:
1074 coding->carryover_size = src - src_base;
1075 bcopy (src_base, coding->carryover, coding->carryover_size);
1076 src = src_base;
1077 break;
1078 }
1079
1080 /* If this is the last block of the text to be decoded, we had
1081 better just flush out all remaining codes in the text although
1082 they are not valid characters. */
1083 if (coding->last_block)
1084 {
1085 bcopy (src, dst, src_end - src);
1086 dst += (src_end - src);
1087 src = src_end;
1088 }
1089 *consumed = src - source;
1090 return dst - destination;
1091}
1092
f4dee582 1093/* ISO2022 encoding stuff. */
4ed46869
KH
1094
1095/*
f4dee582 1096 It is not enough to say just "ISO2022" on encoding, we have to
4ed46869
KH
1097 specify more details. In Emacs, each coding-system of ISO2022
1098 variant has the following specifications:
1099 1. Initial designation to G0 thru G3.
1100 2. Allows short-form designation?
1101 3. ASCII should be designated to G0 before control characters?
1102 4. ASCII should be designated to G0 at end of line?
1103 5. 7-bit environment or 8-bit environment?
1104 6. Use locking-shift?
1105 7. Use Single-shift?
1106 And the following two are only for Japanese:
1107 8. Use ASCII in place of JIS0201-1976-Roman?
1108 9. Use JISX0208-1983 in place of JISX0208-1978?
1109 These specifications are encoded in `coding->flags' as flag bits
1110 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
f4dee582 1111 details.
4ed46869
KH
1112*/
1113
1114/* Produce codes (escape sequence) for designating CHARSET to graphic
1115 register REG. If <final-char> of CHARSET is '@', 'A', or 'B' and
1116 the coding system CODING allows, produce designation sequence of
1117 short-form. */
1118
1119#define ENCODE_DESIGNATION(charset, reg, coding) \
1120 do { \
1121 unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \
1122 char *intermediate_char_94 = "()*+"; \
1123 char *intermediate_char_96 = ",-./"; \
1124 Lisp_Object temp \
1125 = Fassq (make_number (charset), Vcharset_revision_alist); \
1126 if (! NILP (temp)) \
1127 { \
1128 *dst++ = ISO_CODE_ESC; \
1129 *dst++ = '&'; \
1130 *dst++ = XINT (XCONS (temp)->cdr) + '@'; \
1131 } \
1132 *dst++ = ISO_CODE_ESC; \
1133 if (CHARSET_DIMENSION (charset) == 1) \
1134 { \
1135 if (CHARSET_CHARS (charset) == 94) \
1136 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1137 else \
1138 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1139 } \
1140 else \
1141 { \
1142 *dst++ = '$'; \
1143 if (CHARSET_CHARS (charset) == 94) \
1144 { \
1145 if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
1146 || reg != 0 \
1147 || final_char < '@' || final_char > 'B') \
1148 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1149 } \
1150 else \
1151 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1152 } \
1153 *dst++ = final_char; \
1154 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
1155 } while (0)
1156
1157/* The following two macros produce codes (control character or escape
1158 sequence) for ISO2022 single-shift functions (single-shift-2 and
1159 single-shift-3). */
1160
1161#define ENCODE_SINGLE_SHIFT_2 \
1162 do { \
1163 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1164 *dst++ = ISO_CODE_ESC, *dst++ = 'N'; \
1165 else \
1166 *dst++ = ISO_CODE_SS2; \
1167 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1168 } while (0)
1169
1170#define ENCODE_SINGLE_SHIFT_3 \
1171 do { \
1172 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1173 *dst++ = ISO_CODE_ESC, *dst++ = 'O'; \
1174 else \
1175 *dst++ = ISO_CODE_SS3; \
1176 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1177 } while (0)
1178
1179/* The following four macros produce codes (control character or
1180 escape sequence) for ISO2022 locking-shift functions (shift-in,
1181 shift-out, locking-shift-2, and locking-shift-3). */
1182
1183#define ENCODE_SHIFT_IN \
1184 do { \
1185 *dst++ = ISO_CODE_SI; \
1186 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1187 } while (0)
1188
1189#define ENCODE_SHIFT_OUT \
1190 do { \
1191 *dst++ = ISO_CODE_SO; \
1192 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1193 } while (0)
1194
1195#define ENCODE_LOCKING_SHIFT_2 \
1196 do { \
1197 *dst++ = ISO_CODE_ESC, *dst++ = 'n'; \
1198 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1199 } while (0)
1200
1201#define ENCODE_LOCKING_SHIFT_3 \
1202 do { \
1203 *dst++ = ISO_CODE_ESC, *dst++ = 'o'; \
1204 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1205 } while (0)
1206
f4dee582
RS
1207/* Produce codes for a DIMENSION1 character whose character set is
1208 CHARSET and whose position-code is C1. Designation and invocation
4ed46869
KH
1209 sequences are also produced in advance if necessary. */
1210
1211
6e85d753
KH
1212#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
1213 do { \
1214 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1215 { \
1216 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1217 *dst++ = c1 & 0x7F; \
1218 else \
1219 *dst++ = c1 | 0x80; \
1220 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1221 break; \
1222 } \
1223 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1224 { \
1225 *dst++ = c1 & 0x7F; \
1226 break; \
1227 } \
1228 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1229 { \
1230 *dst++ = c1 | 0x80; \
1231 break; \
1232 } \
1233 else if (coding->flags & CODING_FLAG_ISO_SAFE \
1234 && !CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset]) \
1235 { \
1236 /* We should not encode this character, instead produce one or \
1237 two `?'s. */ \
1238 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1239 if (CHARSET_WIDTH (charset) == 2) \
1240 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1241 break; \
1242 } \
1243 else \
1244 /* Since CHARSET is not yet invoked to any graphic planes, we \
1245 must invoke it, or, at first, designate it to some graphic \
1246 register. Then repeat the loop to actually produce the \
1247 character. */ \
1248 dst = encode_invocation_designation (charset, coding, dst); \
4ed46869
KH
1249 } while (1)
1250
f4dee582
RS
1251/* Produce codes for a DIMENSION2 character whose character set is
1252 CHARSET and whose position-codes are C1 and C2. Designation and
4ed46869
KH
1253 invocation codes are also produced in advance if necessary. */
1254
6e85d753
KH
1255#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
1256 do { \
1257 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1258 { \
1259 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1260 *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F; \
1261 else \
1262 *dst++ = c1 | 0x80, *dst++ = c2 | 0x80; \
1263 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1264 break; \
1265 } \
1266 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1267 { \
1268 *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F; \
1269 break; \
1270 } \
1271 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1272 { \
1273 *dst++ = c1 | 0x80, *dst++= c2 | 0x80; \
1274 break; \
1275 } \
1276 else if (coding->flags & CODING_FLAG_ISO_SAFE \
1277 && !CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset]) \
1278 { \
1279 /* We should not encode this character, instead produce one or \
1280 two `?'s. */ \
1281 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1282 if (CHARSET_WIDTH (charset) == 2) \
1283 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1284 break; \
1285 } \
1286 else \
1287 /* Since CHARSET is not yet invoked to any graphic planes, we \
1288 must invoke it, or, at first, designate it to some graphic \
1289 register. Then repeat the loop to actually produce the \
1290 character. */ \
1291 dst = encode_invocation_designation (charset, coding, dst); \
4ed46869
KH
1292 } while (1)
1293
bdd9fb48
KH
1294#define ENCODE_ISO_CHARACTER(charset, c1, c2) \
1295 do { \
1296 int c_alt, charset_alt; \
1297 if (!NILP (unification_table) \
1298 && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
a5d301df 1299 >= 0)) \
bdd9fb48
KH
1300 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1301 else \
1302 charset_alt = charset; \
1303 if (CHARSET_DIMENSION (charset_alt) == 1) \
1304 ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1); \
1305 else \
1306 ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
1307 } while (0)
1308
4ed46869
KH
1309/* Produce designation and invocation codes at a place pointed by DST
1310 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1311 Return new DST. */
1312
1313unsigned char *
1314encode_invocation_designation (charset, coding, dst)
1315 int charset;
1316 struct coding_system *coding;
1317 unsigned char *dst;
1318{
1319 int reg; /* graphic register number */
1320
1321 /* At first, check designations. */
1322 for (reg = 0; reg < 4; reg++)
1323 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1324 break;
1325
1326 if (reg >= 4)
1327 {
1328 /* CHARSET is not yet designated to any graphic registers. */
1329 /* At first check the requested designation. */
1330 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1ba9e4ab
KH
1331 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1332 /* Since CHARSET requests no special designation, designate it
1333 to graphic register 0. */
4ed46869
KH
1334 reg = 0;
1335
1336 ENCODE_DESIGNATION (charset, reg, coding);
1337 }
1338
1339 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1340 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1341 {
1342 /* Since the graphic register REG is not invoked to any graphic
1343 planes, invoke it to graphic plane 0. */
1344 switch (reg)
1345 {
1346 case 0: /* graphic register 0 */
1347 ENCODE_SHIFT_IN;
1348 break;
1349
1350 case 1: /* graphic register 1 */
1351 ENCODE_SHIFT_OUT;
1352 break;
1353
1354 case 2: /* graphic register 2 */
1355 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1356 ENCODE_SINGLE_SHIFT_2;
1357 else
1358 ENCODE_LOCKING_SHIFT_2;
1359 break;
1360
1361 case 3: /* graphic register 3 */
1362 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1363 ENCODE_SINGLE_SHIFT_3;
1364 else
1365 ENCODE_LOCKING_SHIFT_3;
1366 break;
1367 }
1368 }
1369 return dst;
1370}
1371
1372/* The following two macros produce codes for indicating composition. */
1373#define ENCODE_COMPOSITION_NO_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '0'
1374#define ENCODE_COMPOSITION_WITH_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '2'
1375#define ENCODE_COMPOSITION_END *dst++ = ISO_CODE_ESC, *dst++ = '1'
1376
1377/* The following three macros produce codes for indicating direction
1378 of text. */
1379#define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
1380 do { \
1381 if (coding->flags == CODING_FLAG_ISO_SEVEN_BITS) \
1382 *dst++ = ISO_CODE_ESC, *dst++ = '['; \
1383 else \
1384 *dst++ = ISO_CODE_CSI; \
1385 } while (0)
1386
1387#define ENCODE_DIRECTION_R2L \
1388 ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '2', *dst++ = ']'
1389
1390#define ENCODE_DIRECTION_L2R \
1391 ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '0', *dst++ = ']'
1392
1393/* Produce codes for designation and invocation to reset the graphic
1394 planes and registers to initial state. */
e0e989f6
KH
1395#define ENCODE_RESET_PLANE_AND_REGISTER \
1396 do { \
1397 int reg; \
1398 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
1399 ENCODE_SHIFT_IN; \
1400 for (reg = 0; reg < 4; reg++) \
1401 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0 \
1402 && (CODING_SPEC_ISO_DESIGNATION (coding, reg) \
1403 != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))) \
1404 ENCODE_DESIGNATION \
1405 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
4ed46869
KH
1406 } while (0)
1407
bdd9fb48
KH
1408/* Produce designation sequences of charsets in the line started from
1409 *SRC to a place pointed by DSTP.
1410
1411 If the current block ends before any end-of-line, we may fail to
1412 find all the necessary *designations. */
1413encode_designation_at_bol (coding, table, src, src_end, dstp)
e0e989f6 1414 struct coding_system *coding;
bdd9fb48 1415 Lisp_Object table;
e0e989f6
KH
1416 unsigned char *src, *src_end, **dstp;
1417{
bdd9fb48
KH
1418 int charset, c, found = 0, reg;
1419 /* Table of charsets to be designated to each graphic register. */
1420 int r[4];
1421 unsigned char *dst = *dstp;
1422
1423 for (reg = 0; reg < 4; reg++)
1424 r[reg] = -1;
1425
1426 while (src < src_end && *src != '\n' && found < 4)
e0e989f6 1427 {
bdd9fb48
KH
1428 int bytes = BYTES_BY_CHAR_HEAD (*src);
1429
1430 if (NILP (table))
1431 charset = CHARSET_AT (src);
1432 else
e0e989f6 1433 {
bdd9fb48
KH
1434 int c_alt, c1, c2;
1435
1436 SPLIT_STRING(src, bytes, charset, c1, c2);
1437 if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1438 charset = CHAR_CHARSET (c_alt);
e0e989f6 1439 }
bdd9fb48 1440
e0e989f6 1441 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1ba9e4ab 1442 if (r[reg] == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
bdd9fb48
KH
1443 {
1444 found++;
1445 r[reg] = charset;
1446 }
1447
1448 src += bytes;
1449 }
1450
1451 if (found)
1452 {
1453 for (reg = 0; reg < 4; reg++)
1454 if (r[reg] >= 0
1455 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1456 ENCODE_DESIGNATION (r[reg], reg, coding);
1457 *dstp = dst;
e0e989f6 1458 }
e0e989f6
KH
1459}
1460
4ed46869
KH
1461/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1462
1463int
1464encode_coding_iso2022 (coding, source, destination,
1465 src_bytes, dst_bytes, consumed)
1466 struct coding_system *coding;
1467 unsigned char *source, *destination;
1468 int src_bytes, dst_bytes;
1469 int *consumed;
1470{
1471 unsigned char *src = source;
1472 unsigned char *src_end = source + src_bytes;
1473 unsigned char *dst = destination;
1474 unsigned char *dst_end = destination + dst_bytes;
e0e989f6 1475 /* Since the maximum bytes produced by each loop is 20, we subtract 19
4ed46869
KH
1476 from DST_END to assure overflow checking is necessary only at the
1477 head of loop. */
e0e989f6 1478 unsigned char *adjusted_dst_end = dst_end - 19;
a5d301df
KH
1479 Lisp_Object unification_table
1480 = coding->character_unification_table_for_encode;
bdd9fb48
KH
1481
1482 if (!NILP (Venable_character_unification) && NILP (unification_table))
a5d301df 1483 unification_table = Vstandard_character_unification_table_for_encode;
4ed46869
KH
1484
1485 while (src < src_end && dst < adjusted_dst_end)
1486 {
1487 /* SRC_BASE remembers the start position in source in each loop.
1488 The loop will be exited when there's not enough source text
1489 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1490 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1491 reset to SRC_BASE before exiting. */
1492 unsigned char *src_base = src;
bdd9fb48 1493 int charset, c1, c2, c3, c4;
4ed46869 1494
e0e989f6
KH
1495 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1496 && CODING_SPEC_ISO_BOL (coding))
1497 {
bdd9fb48
KH
1498 /* We have to produce designation sequences if any now. */
1499 encode_designation_at_bol (coding, unification_table,
1500 src, src_end, &dst);
e0e989f6
KH
1501 CODING_SPEC_ISO_BOL (coding) = 0;
1502 }
1503
1504 c1 = *src++;
4ed46869
KH
1505 /* If we are seeing a component of a composite character, we are
1506 seeing a leading-code specially encoded for composition, or a
1507 composition rule if composing with rule. We must set C1
1508 to a normal leading-code or an ASCII code. If we are not at
1509 a composed character, we must reset the composition state. */
1510 if (COMPOSING_P (coding->composing))
1511 {
1512 if (c1 < 0xA0)
1513 {
1514 /* We are not in a composite character any longer. */
1515 coding->composing = COMPOSING_NO;
1516 ENCODE_COMPOSITION_END;
1517 }
1518 else
1519 {
1520 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1521 {
1522 *dst++ = c1 & 0x7F;
1523 coding->composing = COMPOSING_WITH_RULE_HEAD;
1524 continue;
1525 }
1526 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1527 coding->composing = COMPOSING_WITH_RULE_RULE;
1528 if (c1 == 0xA0)
1529 {
1530 /* This is an ASCII component. */
1531 ONE_MORE_BYTE (c1);
1532 c1 &= 0x7F;
1533 }
1534 else
1535 /* This is a leading-code of non ASCII component. */
1536 c1 -= 0x20;
1537 }
1538 }
1539
1540 /* Now encode one character. C1 is a control character, an
1541 ASCII character, or a leading-code of multi-byte character. */
1542 switch (emacs_code_class[c1])
1543 {
1544 case EMACS_ascii_code:
bdd9fb48 1545 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
4ed46869
KH
1546 break;
1547
1548 case EMACS_control_code:
1549 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1550 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1551 *dst++ = c1;
1552 break;
1553
1554 case EMACS_carriage_return_code:
1555 if (!coding->selective)
1556 {
1557 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1558 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1559 *dst++ = c1;
1560 break;
1561 }
1562 /* fall down to treat '\r' as '\n' ... */
1563
1564 case EMACS_linefeed_code:
1565 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
e0e989f6
KH
1566 ENCODE_RESET_PLANE_AND_REGISTER;
1567 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1568 bcopy (coding->spec.iso2022.initial_designation,
1569 coding->spec.iso2022.current_designation,
1570 sizeof coding->spec.iso2022.initial_designation);
4ed46869 1571 if (coding->eol_type == CODING_EOL_LF
0ef69138 1572 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
1573 *dst++ = ISO_CODE_LF;
1574 else if (coding->eol_type == CODING_EOL_CRLF)
1575 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1576 else
1577 *dst++ = ISO_CODE_CR;
e0e989f6 1578 CODING_SPEC_ISO_BOL (coding) = 1;
4ed46869
KH
1579 break;
1580
1581 case EMACS_leading_code_2:
1582 ONE_MORE_BYTE (c2);
19a8d9e0
KH
1583 if (c2 < 0xA0)
1584 {
1585 /* invalid sequence */
1586 *dst++ = c1;
1587 *dst++ = c2;
1588 }
1589 else
1590 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
1591 break;
1592
1593 case EMACS_leading_code_3:
1594 TWO_MORE_BYTES (c2, c3);
19a8d9e0
KH
1595 if (c2 < 0xA0 || c3 < 0xA0)
1596 {
1597 /* invalid sequence */
1598 *dst++ = c1;
1599 *dst++ = c2;
1600 *dst++ = c3;
1601 }
1602 else if (c1 < LEADING_CODE_PRIVATE_11)
bdd9fb48 1603 ENCODE_ISO_CHARACTER (c1, c2, c3);
4ed46869 1604 else
bdd9fb48 1605 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
4ed46869
KH
1606 break;
1607
1608 case EMACS_leading_code_4:
1609 THREE_MORE_BYTES (c2, c3, c4);
19a8d9e0
KH
1610 if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1611 {
1612 /* invalid sequence */
1613 *dst++ = c1;
1614 *dst++ = c2;
1615 *dst++ = c3;
1616 *dst++ = c4;
1617 }
1618 else
1619 ENCODE_ISO_CHARACTER (c2, c3, c4);
4ed46869
KH
1620 break;
1621
1622 case EMACS_leading_code_composition:
19a8d9e0
KH
1623 ONE_MORE_BYTE (c2);
1624 if (c2 < 0xA0)
1625 {
1626 /* invalid sequence */
1627 *dst++ = c1;
1628 *dst++ = c2;
1629 }
1630 else if (c2 == 0xFF)
4ed46869
KH
1631 {
1632 coding->composing = COMPOSING_WITH_RULE_HEAD;
1633 ENCODE_COMPOSITION_WITH_RULE_START;
1634 }
1635 else
1636 {
1637 /* Rewind one byte because it is a character code of
1638 composition elements. */
1639 src--;
1640 coding->composing = COMPOSING_NO_RULE_HEAD;
1641 ENCODE_COMPOSITION_NO_RULE_START;
1642 }
1643 break;
1644
1645 case EMACS_invalid_code:
1646 *dst++ = c1;
1647 break;
1648 }
1649 continue;
1650 label_end_of_loop:
76376439
KH
1651 /* We reach here because the source date ends not at character
1652 boundary. */
1653 coding->carryover_size = src_end - src_base;
4ed46869 1654 bcopy (src_base, coding->carryover, coding->carryover_size);
76376439 1655 src = src_end;
4ed46869
KH
1656 break;
1657 }
1658
1659 /* If this is the last block of the text to be encoded, we must
bdd9fb48
KH
1660 reset graphic planes and registers to the initial state. */
1661 if (src >= src_end && coding->last_block)
4ed46869 1662 {
e0e989f6 1663 ENCODE_RESET_PLANE_AND_REGISTER;
bdd9fb48
KH
1664 if (coding->carryover_size > 0
1665 && coding->carryover_size < (dst_end - dst))
1666 {
1667 bcopy (coding->carryover, dst, coding->carryover_size);
1668 dst += coding->carryover_size;
1669 coding->carryover_size = 0;
1670 }
4ed46869
KH
1671 }
1672 *consumed = src - source;
1673 return dst - destination;
1674}
1675
1676\f
1677/*** 4. SJIS and BIG5 handlers ***/
1678
f4dee582 1679/* Although SJIS and BIG5 are not ISO's coding system, they are used
4ed46869
KH
1680 quite widely. So, for the moment, Emacs supports them in the bare
1681 C code. But, in the future, they may be supported only by CCL. */
1682
1683/* SJIS is a coding system encoding three character sets: ASCII, right
1684 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
1685 as is. A character of charset katakana-jisx0201 is encoded by
1686 "position-code + 0x80". A character of charset japanese-jisx0208
1687 is encoded in 2-byte but two position-codes are divided and shifted
1688 so that it fit in the range below.
1689
1690 --- CODE RANGE of SJIS ---
1691 (character set) (range)
1692 ASCII 0x00 .. 0x7F
1693 KATAKANA-JISX0201 0xA0 .. 0xDF
1694 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1695 (2nd byte) 0x40 .. 0xFF
1696 -------------------------------
1697
1698*/
1699
1700/* BIG5 is a coding system encoding two character sets: ASCII and
1701 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1702 character set and is encoded in two-byte.
1703
1704 --- CODE RANGE of BIG5 ---
1705 (character set) (range)
1706 ASCII 0x00 .. 0x7F
1707 Big5 (1st byte) 0xA1 .. 0xFE
1708 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1709 --------------------------
1710
1711 Since the number of characters in Big5 is larger than maximum
1712 characters in Emacs' charset (96x96), it can't be handled as one
1713 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
1714 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
1715 contains frequently used characters and the latter contains less
1716 frequently used characters. */
1717
1718/* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
1719 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
1720 C1 and C2 are the 1st and 2nd position-codes of of Emacs' internal
1721 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
1722
1723/* Number of Big5 characters which have the same code in 1st byte. */
1724#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
1725
1726#define DECODE_BIG5(b1, b2, charset, c1, c2) \
1727 do { \
1728 unsigned int temp \
1729 = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62); \
1730 if (b1 < 0xC9) \
1731 charset = charset_big5_1; \
1732 else \
1733 { \
1734 charset = charset_big5_2; \
1735 temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW; \
1736 } \
1737 c1 = temp / (0xFF - 0xA1) + 0x21; \
1738 c2 = temp % (0xFF - 0xA1) + 0x21; \
1739 } while (0)
1740
1741#define ENCODE_BIG5(charset, c1, c2, b1, b2) \
1742 do { \
1743 unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21); \
1744 if (charset == charset_big5_2) \
1745 temp += BIG5_SAME_ROW * (0xC9 - 0xA1); \
1746 b1 = temp / BIG5_SAME_ROW + 0xA1; \
1747 b2 = temp % BIG5_SAME_ROW; \
1748 b2 += b2 < 0x3F ? 0x40 : 0x62; \
1749 } while (0)
1750
a5d301df
KH
1751#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
1752 do { \
1753 int c_alt, charset_alt = (charset); \
1754 if (!NILP (unification_table) \
1755 && ((c_alt = unify_char (unification_table, \
1756 -1, (charset), c1, c2)) >= 0)) \
1757 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1758 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
1759 DECODE_CHARACTER_ASCII (c1); \
1760 else if (CHARSET_DIMENSION (charset_alt) == 1) \
1761 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
1762 else \
1763 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
1764 } while (0)
1765
1766#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
1767 do { \
1768 int c_alt, charset_alt; \
1769 if (!NILP (unification_table) \
1770 && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1771 >= 0)) \
1772 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1773 else \
1774 charset_alt = charset; \
1775 if (charset_alt == charset_ascii) \
1776 *dst++ = c1; \
1777 else if (CHARSET_DIMENSION (charset_alt) == 1) \
1778 { \
1779 if (sjis_p && charset_alt == charset_katakana_jisx0201) \
1780 *dst++ = c1; \
1781 else \
1782 *dst++ = charset_alt, *dst++ = c1; \
1783 } \
1784 else \
1785 { \
1786 c1 &= 0x7F, c2 &= 0x7F; \
1787 if (sjis_p && charset_alt == charset_jisx0208) \
1788 { \
1789 unsigned char s1, s2; \
1790 \
1791 ENCODE_SJIS (c1, c2, s1, s2); \
1792 *dst++ = s1, *dst++ = s2; \
1793 } \
1794 else if (!sjis_p \
1795 && (charset_alt == charset_big5_1 \
1796 || charset_alt == charset_big5_2)) \
1797 { \
1798 unsigned char b1, b2; \
1799 \
9ce27fde 1800 ENCODE_BIG5 (charset_alt, c1, c2, b1, b2); \
a5d301df
KH
1801 *dst++ = b1, *dst++ = b2; \
1802 } \
1803 else \
1804 *dst++ = charset_alt, *dst++ = c1, *dst++ = c2; \
1805 } \
1806 } while (0);
1807
4ed46869
KH
1808/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1809 Check if a text is encoded in SJIS. If it is, return
1810 CODING_CATEGORY_MASK_SJIS, else return 0. */
1811
1812int
1813detect_coding_sjis (src, src_end)
1814 unsigned char *src, *src_end;
1815{
1816 unsigned char c;
1817
1818 while (src < src_end)
1819 {
1820 c = *src++;
1821 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1822 return 0;
1823 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1824 {
1825 if (src < src_end && *src++ < 0x40)
1826 return 0;
1827 }
1828 }
1829 return CODING_CATEGORY_MASK_SJIS;
1830}
1831
1832/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1833 Check if a text is encoded in BIG5. If it is, return
1834 CODING_CATEGORY_MASK_BIG5, else return 0. */
1835
1836int
1837detect_coding_big5 (src, src_end)
1838 unsigned char *src, *src_end;
1839{
1840 unsigned char c;
1841
1842 while (src < src_end)
1843 {
1844 c = *src++;
1845 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1846 return 0;
1847 if (c >= 0xA1)
1848 {
1849 if (src >= src_end)
1850 break;
1851 c = *src++;
1852 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1853 return 0;
1854 }
1855 }
1856 return CODING_CATEGORY_MASK_BIG5;
1857}
1858
1859/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1860 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
1861
1862int
1863decode_coding_sjis_big5 (coding, source, destination,
1864 src_bytes, dst_bytes, consumed, sjis_p)
1865 struct coding_system *coding;
1866 unsigned char *source, *destination;
1867 int src_bytes, dst_bytes;
1868 int *consumed;
1869 int sjis_p;
1870{
1871 unsigned char *src = source;
1872 unsigned char *src_end = source + src_bytes;
1873 unsigned char *dst = destination;
1874 unsigned char *dst_end = destination + dst_bytes;
1875 /* Since the maximum bytes produced by each loop is 4, we subtract 3
1876 from DST_END to assure overflow checking is necessary only at the
1877 head of loop. */
1878 unsigned char *adjusted_dst_end = dst_end - 3;
a5d301df
KH
1879 Lisp_Object unification_table
1880 = coding->character_unification_table_for_decode;
1881
1882 if (!NILP (Venable_character_unification) && NILP (unification_table))
1883 unification_table = Vstandard_character_unification_table_for_decode;
4ed46869
KH
1884
1885 while (src < src_end && dst < adjusted_dst_end)
1886 {
1887 /* SRC_BASE remembers the start position in source in each loop.
1888 The loop will be exited when there's not enough source text
1889 to analyze two-byte character (within macro ONE_MORE_BYTE).
1890 In that case, SRC is reset to SRC_BASE before exiting. */
1891 unsigned char *src_base = src;
1892 unsigned char c1 = *src++, c2, c3, c4;
1893
1894 if (c1 == '\r')
1895 {
1896 if (coding->eol_type == CODING_EOL_CRLF)
1897 {
1898 ONE_MORE_BYTE (c2);
1899 if (c2 == '\n')
1900 *dst++ = c2;
1901 else
1902 /* To process C2 again, SRC is subtracted by 1. */
1903 *dst++ = c1, src--;
1904 }
1905 else
1906 *dst++ = c1;
1907 }
a5d301df 1908 else if (c1 < 0x20)
4ed46869 1909 *dst++ = c1;
a5d301df
KH
1910 else if (c1 < 0x80)
1911 DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
4ed46869
KH
1912 else if (c1 < 0xA0 || c1 >= 0xE0)
1913 {
1914 /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1915 if (sjis_p)
1916 {
1917 ONE_MORE_BYTE (c2);
1918 DECODE_SJIS (c1, c2, c3, c4);
a5d301df 1919 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
4ed46869
KH
1920 }
1921 else if (c1 >= 0xE0 && c1 < 0xFF)
1922 {
1923 int charset;
1924
1925 ONE_MORE_BYTE (c2);
1926 DECODE_BIG5 (c1, c2, charset, c3, c4);
a5d301df 1927 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
4ed46869
KH
1928 }
1929 else /* Invalid code */
1930 *dst++ = c1;
1931 }
1932 else
1933 {
1934 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1935 if (sjis_p)
a5d301df 1936 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1, /* dummy */ c2);
4ed46869
KH
1937 else
1938 {
1939 int charset;
1940
1941 ONE_MORE_BYTE (c2);
1942 DECODE_BIG5 (c1, c2, charset, c3, c4);
a5d301df 1943 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
4ed46869
KH
1944 }
1945 }
1946 continue;
1947
1948 label_end_of_loop:
1949 coding->carryover_size = src - src_base;
1950 bcopy (src_base, coding->carryover, coding->carryover_size);
1951 src = src_base;
1952 break;
1953 }
1954
1955 *consumed = src - source;
1956 return dst - destination;
1957}
1958
1959/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1960 This function can encode `charset_ascii', `charset_katakana_jisx0201',
1961 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
1962 sure that all these charsets are registered as official charset
1963 (i.e. do not have extended leading-codes). Characters of other
1964 charsets are produced without any encoding. If SJIS_P is 1, encode
1965 SJIS text, else encode BIG5 text. */
1966
1967int
1968encode_coding_sjis_big5 (coding, source, destination,
1969 src_bytes, dst_bytes, consumed, sjis_p)
1970 struct coding_system *coding;
1971 unsigned char *source, *destination;
1972 int src_bytes, dst_bytes;
1973 int *consumed;
1974 int sjis_p;
1975{
1976 unsigned char *src = source;
1977 unsigned char *src_end = source + src_bytes;
1978 unsigned char *dst = destination;
1979 unsigned char *dst_end = destination + dst_bytes;
1980 /* Since the maximum bytes produced by each loop is 2, we subtract 1
1981 from DST_END to assure overflow checking is necessary only at the
1982 head of loop. */
1983 unsigned char *adjusted_dst_end = dst_end - 1;
a5d301df
KH
1984 Lisp_Object unification_table
1985 = coding->character_unification_table_for_encode;
1986
1987 if (!NILP (Venable_character_unification) && NILP (unification_table))
1988 unification_table = Vstandard_character_unification_table_for_encode;
4ed46869
KH
1989
1990 while (src < src_end && dst < adjusted_dst_end)
1991 {
1992 /* SRC_BASE remembers the start position in source in each loop.
1993 The loop will be exited when there's not enough source text
1994 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1995 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
1996 before exiting. */
1997 unsigned char *src_base = src;
1998 unsigned char c1 = *src++, c2, c3, c4;
1999
2000 if (coding->composing)
2001 {
2002 if (c1 == 0xA0)
2003 {
2004 ONE_MORE_BYTE (c1);
2005 c1 &= 0x7F;
2006 }
2007 else if (c1 >= 0xA0)
2008 c1 -= 0x20;
2009 else
2010 coding->composing = 0;
2011 }
2012
2013 switch (emacs_code_class[c1])
2014 {
2015 case EMACS_ascii_code:
a5d301df
KH
2016 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2017 break;
2018
4ed46869
KH
2019 case EMACS_control_code:
2020 *dst++ = c1;
2021 break;
2022
2023 case EMACS_carriage_return_code:
2024 if (!coding->selective)
2025 {
2026 *dst++ = c1;
2027 break;
2028 }
2029 /* fall down to treat '\r' as '\n' ... */
2030
2031 case EMACS_linefeed_code:
2032 if (coding->eol_type == CODING_EOL_LF
0ef69138 2033 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2034 *dst++ = '\n';
2035 else if (coding->eol_type == CODING_EOL_CRLF)
2036 *dst++ = '\r', *dst++ = '\n';
2037 else
2038 *dst++ = '\r';
2039 break;
2040
2041 case EMACS_leading_code_2:
2042 ONE_MORE_BYTE (c2);
a5d301df 2043 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
2044 break;
2045
2046 case EMACS_leading_code_3:
2047 TWO_MORE_BYTES (c2, c3);
a5d301df 2048 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
4ed46869
KH
2049 break;
2050
2051 case EMACS_leading_code_4:
2052 THREE_MORE_BYTES (c2, c3, c4);
a5d301df 2053 ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
4ed46869
KH
2054 break;
2055
2056 case EMACS_leading_code_composition:
2057 coding->composing = 1;
2058 break;
2059
2060 default: /* i.e. case EMACS_invalid_code: */
2061 *dst++ = c1;
2062 }
2063 continue;
2064
2065 label_end_of_loop:
76376439 2066 coding->carryover_size = src_end - src_base;
4ed46869 2067 bcopy (src_base, coding->carryover, coding->carryover_size);
76376439 2068 src = src_end;
4ed46869
KH
2069 break;
2070 }
2071
2072 *consumed = src - source;
2073 return dst - destination;
2074}
2075
2076\f
2077/*** 5. End-of-line handlers ***/
2078
2079/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2080 This function is called only when `coding->eol_type' is
2081 CODING_EOL_CRLF or CODING_EOL_CR. */
2082
2083decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2084 struct coding_system *coding;
2085 unsigned char *source, *destination;
2086 int src_bytes, dst_bytes;
2087 int *consumed;
2088{
2089 unsigned char *src = source;
2090 unsigned char *src_end = source + src_bytes;
2091 unsigned char *dst = destination;
2092 unsigned char *dst_end = destination + dst_bytes;
2093 int produced;
2094
2095 switch (coding->eol_type)
2096 {
2097 case CODING_EOL_CRLF:
2098 {
2099 /* Since the maximum bytes produced by each loop is 2, we
2100 subtract 1 from DST_END to assure overflow checking is
2101 necessary only at the head of loop. */
2102 unsigned char *adjusted_dst_end = dst_end - 1;
2103
2104 while (src < src_end && dst < adjusted_dst_end)
2105 {
2106 unsigned char *src_base = src;
2107 unsigned char c = *src++;
2108 if (c == '\r')
2109 {
2110 ONE_MORE_BYTE (c);
2111 if (c != '\n')
2112 *dst++ = '\r';
bfd99048 2113 *dst++ = c;
4ed46869
KH
2114 }
2115 else
2116 *dst++ = c;
2117 continue;
2118
2119 label_end_of_loop:
2120 coding->carryover_size = src - src_base;
2121 bcopy (src_base, coding->carryover, coding->carryover_size);
2122 src = src_base;
2123 break;
2124 }
2125 *consumed = src - source;
2126 produced = dst - destination;
2127 break;
2128 }
2129
2130 case CODING_EOL_CR:
2131 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2132 bcopy (source, destination, produced);
2133 dst_end = destination + produced;
2134 while (dst < dst_end)
2135 if (*dst++ == '\r') dst[-1] = '\n';
2136 *consumed = produced;
2137 break;
2138
2139 default: /* i.e. case: CODING_EOL_LF */
2140 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2141 bcopy (source, destination, produced);
2142 *consumed = produced;
2143 break;
2144 }
2145
2146 return produced;
2147}
2148
2149/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2150 format of end-of-line according to `coding->eol_type'. If
2151 `coding->selective' is 1, code '\r' in source text also means
2152 end-of-line. */
2153
2154encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2155 struct coding_system *coding;
2156 unsigned char *source, *destination;
2157 int src_bytes, dst_bytes;
2158 int *consumed;
2159{
2160 unsigned char *src = source;
2161 unsigned char *dst = destination;
2162 int produced;
2163
2164 if (src_bytes <= 0)
2165 return 0;
2166
2167 switch (coding->eol_type)
2168 {
2169 case CODING_EOL_LF:
0ef69138 2170 case CODING_EOL_UNDECIDED:
4ed46869
KH
2171 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2172 bcopy (source, destination, produced);
2173 if (coding->selective)
2174 {
2175 int i = produced;
2176 while (i--)
2177 if (*dst++ == '\r') dst[-1] = '\n';
2178 }
2179 *consumed = produced;
2180
2181 case CODING_EOL_CRLF:
2182 {
2183 unsigned char c;
2184 unsigned char *src_end = source + src_bytes;
2185 unsigned char *dst_end = destination + dst_bytes;
2186 /* Since the maximum bytes produced by each loop is 2, we
2187 subtract 1 from DST_END to assure overflow checking is
2188 necessary only at the head of loop. */
2189 unsigned char *adjusted_dst_end = dst_end - 1;
2190
2191 while (src < src_end && dst < adjusted_dst_end)
2192 {
2193 c = *src++;
2194 if (c == '\n' || (c == '\r' && coding->selective))
2195 *dst++ = '\r', *dst++ = '\n';
2196 else
2197 *dst++ = c;
2198 }
2199 produced = dst - destination;
2200 *consumed = src - source;
2201 break;
2202 }
2203
2204 default: /* i.e. case CODING_EOL_CR: */
2205 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2206 bcopy (source, destination, produced);
2207 {
2208 int i = produced;
2209 while (i--)
2210 if (*dst++ == '\n') dst[-1] = '\r';
2211 }
2212 *consumed = produced;
2213 }
2214
2215 return produced;
2216}
2217
2218\f
2219/*** 6. C library functions ***/
2220
2221/* In Emacs Lisp, coding system is represented by a Lisp symbol which
2222 has a property `coding-system'. The value of this property is a
2223 vector of length 5 (called as coding-vector). Among elements of
2224 this vector, the first (element[0]) and the fifth (element[4])
2225 carry important information for decoding/encoding. Before
2226 decoding/encoding, this information should be set in fields of a
2227 structure of type `coding_system'.
2228
2229 A value of property `coding-system' can be a symbol of another
2230 subsidiary coding-system. In that case, Emacs gets coding-vector
2231 from that symbol.
2232
2233 `element[0]' contains information to be set in `coding->type'. The
2234 value and its meaning is as follows:
2235
0ef69138
KH
2236 0 -- coding_type_emacs_mule
2237 1 -- coding_type_sjis
2238 2 -- coding_type_iso2022
2239 3 -- coding_type_big5
2240 4 -- coding_type_ccl encoder/decoder written in CCL
2241 nil -- coding_type_no_conversion
2242 t -- coding_type_undecided (automatic conversion on decoding,
2243 no-conversion on encoding)
4ed46869
KH
2244
2245 `element[4]' contains information to be set in `coding->flags' and
2246 `coding->spec'. The meaning varies by `coding->type'.
2247
2248 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2249 of length 32 (of which the first 13 sub-elements are used now).
2250 Meanings of these sub-elements are:
2251
2252 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2253 If the value is an integer of valid charset, the charset is
2254 assumed to be designated to graphic register N initially.
2255
2256 If the value is minus, it is a minus value of charset which
2257 reserves graphic register N, which means that the charset is
2258 not designated initially but should be designated to graphic
2259 register N just before encoding a character in that charset.
2260
2261 If the value is nil, graphic register N is never used on
2262 encoding.
2263
2264 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2265 Each value takes t or nil. See the section ISO2022 of
2266 `coding.h' for more information.
2267
2268 If `coding->type' is `coding_type_big5', element[4] is t to denote
2269 BIG5-ETen or nil to denote BIG5-HKU.
2270
2271 If `coding->type' takes the other value, element[4] is ignored.
2272
2273 Emacs Lisp's coding system also carries information about format of
2274 end-of-line in a value of property `eol-type'. If the value is
2275 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2276 means CODING_EOL_CR. If it is not integer, it should be a vector
2277 of subsidiary coding systems of which property `eol-type' has one
2278 of above values.
2279
2280*/
2281
2282/* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2283 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2284 is setup so that no conversion is necessary and return -1, else
2285 return 0. */
2286
2287int
e0e989f6
KH
2288setup_coding_system (coding_system, coding)
2289 Lisp_Object coding_system;
4ed46869
KH
2290 struct coding_system *coding;
2291{
4ed46869
KH
2292 Lisp_Object type, eol_type;
2293
f4dee582 2294 /* At first, set several fields to default values. */
4ed46869
KH
2295 coding->require_flushing = 0;
2296 coding->last_block = 0;
2297 coding->selective = 0;
2298 coding->composing = 0;
2299 coding->direction = 0;
2300 coding->carryover_size = 0;
4ed46869 2301 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
a5d301df
KH
2302 coding->character_unification_table_for_decode = Qnil;
2303 coding->character_unification_table_for_encode = Qnil;
4ed46869 2304
e0e989f6
KH
2305 Vlast_coding_system_used = coding->symbol = coding_system;
2306 eol_type = Qnil;
2307 /* Get value of property `coding-system' until we get a vector.
2308 While doing that, also get values of properties
a5d301df
KH
2309 `post-read-conversion', `pre-write-conversion',
2310 `character-unification-table-for-decode',
2311 `character-unification-table-for-encode' and `eol-type'. */
e0e989f6 2312 while (!NILP (coding_system) && SYMBOLP (coding_system))
4ed46869 2313 {
4ed46869 2314 if (NILP (coding->post_read_conversion))
e0e989f6 2315 coding->post_read_conversion = Fget (coding_system,
4ed46869 2316 Qpost_read_conversion);
e0e989f6
KH
2317 if (NILP (coding->pre_write_conversion))
2318 coding->pre_write_conversion = Fget (coding_system,
4ed46869 2319 Qpre_write_conversion);
9ce27fde 2320 if (!inhibit_eol_conversion && NILP (eol_type))
e0e989f6 2321 eol_type = Fget (coding_system, Qeol_type);
a5d301df
KH
2322
2323 if (NILP (coding->character_unification_table_for_decode))
2324 coding->character_unification_table_for_decode
2325 = Fget (coding_system, Qcharacter_unification_table_for_decode);
2326
2327 if (NILP (coding->character_unification_table_for_encode))
2328 coding->character_unification_table_for_encode
2329 = Fget (coding_system, Qcharacter_unification_table_for_encode);
2330
e0e989f6 2331 coding_system = Fget (coding_system, Qcoding_system);
4ed46869 2332 }
a5d301df
KH
2333
2334 while (!NILP (coding->character_unification_table_for_decode)
2335 && SYMBOLP (coding->character_unification_table_for_decode))
2336 coding->character_unification_table_for_decode
2337 = Fget (coding->character_unification_table_for_decode,
2338 Qcharacter_unification_table_for_decode);
2339 if (!NILP (coding->character_unification_table_for_decode)
2340 && !CHAR_TABLE_P (coding->character_unification_table_for_decode))
2341 coding->character_unification_table_for_decode = Qnil;
2342
2343 while (!NILP (coding->character_unification_table_for_encode)
2344 && SYMBOLP (coding->character_unification_table_for_encode))
2345 coding->character_unification_table_for_encode
2346 = Fget (coding->character_unification_table_for_encode,
2347 Qcharacter_unification_table_for_encode);
2348 if (!NILP (coding->character_unification_table_for_encode)
2349 && !CHAR_TABLE_P (coding->character_unification_table_for_encode))
2350 coding->character_unification_table_for_encode = Qnil;
2351
e0e989f6
KH
2352 if (!VECTORP (coding_system)
2353 || XVECTOR (coding_system)->size != 5)
4ed46869
KH
2354 goto label_invalid_coding_system;
2355
4ed46869 2356 if (VECTORP (eol_type))
0ef69138 2357 coding->eol_type = CODING_EOL_UNDECIDED;
4ed46869
KH
2358 else if (XFASTINT (eol_type) == 1)
2359 coding->eol_type = CODING_EOL_CRLF;
2360 else if (XFASTINT (eol_type) == 2)
2361 coding->eol_type = CODING_EOL_CR;
2362 else
2363 coding->eol_type = CODING_EOL_LF;
2364
e0e989f6 2365 type = XVECTOR (coding_system)->contents[0];
4ed46869
KH
2366 switch (XFASTINT (type))
2367 {
2368 case 0:
0ef69138 2369 coding->type = coding_type_emacs_mule;
4ed46869
KH
2370 break;
2371
2372 case 1:
2373 coding->type = coding_type_sjis;
2374 break;
2375
2376 case 2:
2377 coding->type = coding_type_iso2022;
2378 {
e0e989f6 2379 Lisp_Object val = XVECTOR (coding_system)->contents[4];
4ed46869
KH
2380 Lisp_Object *flags;
2381 int i, charset, default_reg_bits = 0;
2382
2383 if (!VECTORP (val) || XVECTOR (val)->size != 32)
2384 goto label_invalid_coding_system;
2385
2386 flags = XVECTOR (val)->contents;
2387 coding->flags
2388 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2389 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2390 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2391 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2392 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2393 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2394 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2395 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
e0e989f6
KH
2396 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2397 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
c4825358
KH
2398 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2399 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3f003981 2400 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
c4825358 2401 );
4ed46869
KH
2402
2403 /* Invoke graphic register 0 to plane 0. */
2404 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2405 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2406 CODING_SPEC_ISO_INVOCATION (coding, 1)
2407 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2408 /* Not single shifting at first. */
6e85d753 2409 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
e0e989f6 2410 /* Beginning of buffer should also be regarded as bol. */
6e85d753 2411 CODING_SPEC_ISO_BOL (coding) = 1;
4ed46869
KH
2412
2413 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2414 FLAGS[REG] can be one of below:
2415 integer CHARSET: CHARSET occupies register I,
2416 t: designate nothing to REG initially, but can be used
2417 by any charsets,
2418 list of integer, nil, or t: designate the first
2419 element (if integer) to REG initially, the remaining
2420 elements (if integer) is designated to REG on request,
2421 if an element is t, REG can be used by any charset,
2422 nil: REG is never used. */
467e7675 2423 for (charset = 0; charset <= MAX_CHARSET; charset++)
1ba9e4ab
KH
2424 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2425 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
6e85d753 2426 bzero (CODING_SPEC_ISO_EXPECTED_CHARSETS (coding), MAX_CHARSET + 1);
4ed46869
KH
2427 for (i = 0; i < 4; i++)
2428 {
2429 if (INTEGERP (flags[i])
e0e989f6
KH
2430 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2431 || (charset = get_charset_id (flags[i])) >= 0)
4ed46869
KH
2432 {
2433 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2434 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
6e85d753 2435 CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset] = 1;
4ed46869
KH
2436 }
2437 else if (EQ (flags[i], Qt))
2438 {
2439 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2440 default_reg_bits |= 1 << i;
2441 }
2442 else if (CONSP (flags[i]))
2443 {
2444 Lisp_Object tail = flags[i];
2445
2446 if (INTEGERP (XCONS (tail)->car)
2447 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
2448 CHARSET_VALID_P (charset))
2449 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
4ed46869
KH
2450 {
2451 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2452 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
6e85d753 2453 CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset] = 1;
4ed46869
KH
2454 }
2455 else
2456 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2457 tail = XCONS (tail)->cdr;
2458 while (CONSP (tail))
2459 {
2460 if (INTEGERP (XCONS (tail)->car)
2461 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
2462 CHARSET_VALID_P (charset))
2463 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
6e85d753
KH
2464 {
2465 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2466 = i;
2467 CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset]
2468 = 1;
2469 }
4ed46869
KH
2470 else if (EQ (XCONS (tail)->car, Qt))
2471 default_reg_bits |= 1 << i;
2472 tail = XCONS (tail)->cdr;
2473 }
2474 }
2475 else
2476 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2477
2478 CODING_SPEC_ISO_DESIGNATION (coding, i)
2479 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2480 }
2481
2482 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2483 {
2484 /* REG 1 can be used only by locking shift in 7-bit env. */
2485 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2486 default_reg_bits &= ~2;
2487 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2488 /* Without any shifting, only REG 0 and 1 can be used. */
2489 default_reg_bits &= 3;
2490 }
2491
6e85d753
KH
2492 for (charset = 0; charset <= MAX_CHARSET; charset++)
2493 if (CHARSET_VALID_P (charset)
2494 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2495 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
2496 {
2497 /* We have not yet decided where to designate CHARSET. */
2498 int reg_bits = default_reg_bits;
2499
2500 if (CHARSET_CHARS (charset) == 96)
2501 /* A charset of CHARS96 can't be designated to REG 0. */
2502 reg_bits &= ~1;
2503
2504 if (reg_bits)
2505 /* There exist some default graphic register. */
2506 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2507 = (reg_bits & 1
2508 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2509 else
2510 /* We anyway have to designate CHARSET to somewhere. */
2511 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2512 = (CHARSET_CHARS (charset) == 94
2513 ? 0
2514 : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2515 || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2516 ? 1
2517 : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2518 ? 2 : 0)));
2519 }
4ed46869
KH
2520 }
2521 coding->require_flushing = 1;
2522 break;
2523
2524 case 3:
2525 coding->type = coding_type_big5;
2526 coding->flags
e0e989f6 2527 = (NILP (XVECTOR (coding_system)->contents[4])
4ed46869
KH
2528 ? CODING_FLAG_BIG5_HKU
2529 : CODING_FLAG_BIG5_ETEN);
2530 break;
2531
2532 case 4:
2533 coding->type = coding_type_ccl;
2534 {
e0e989f6 2535 Lisp_Object val = XVECTOR (coding_system)->contents[4];
4ed46869
KH
2536 if (CONSP (val)
2537 && VECTORP (XCONS (val)->car)
2538 && VECTORP (XCONS (val)->cdr))
2539 {
2540 setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2541 setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2542 }
2543 else
2544 goto label_invalid_coding_system;
2545 }
2546 coding->require_flushing = 1;
2547 break;
2548
2549 default:
2550 if (EQ (type, Qt))
0ef69138 2551 coding->type = coding_type_undecided;
4ed46869
KH
2552 else
2553 coding->type = coding_type_no_conversion;
2554 break;
2555 }
2556 return 0;
2557
2558 label_invalid_coding_system:
2559 coding->type = coding_type_no_conversion;
dec137e5 2560 coding->eol_type = CODING_EOL_LF;
e0e989f6
KH
2561 coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2562 = Qnil;
4ed46869
KH
2563 return -1;
2564}
2565
2566/* Emacs has a mechanism to automatically detect a coding system if it
2567 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
2568 it's impossible to distinguish some coding systems accurately
2569 because they use the same range of codes. So, at first, coding
2570 systems are categorized into 7, those are:
2571
0ef69138 2572 o coding-category-emacs-mule
4ed46869
KH
2573
2574 The category for a coding system which has the same code range
2575 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 2576 symbol) `emacs-mule' by default.
4ed46869
KH
2577
2578 o coding-category-sjis
2579
2580 The category for a coding system which has the same code range
2581 as SJIS. Assigned the coding-system (Lisp
7717c392 2582 symbol) `japanese-shift-jis' by default.
4ed46869
KH
2583
2584 o coding-category-iso-7
2585
2586 The category for a coding system which has the same code range
7717c392
KH
2587 as ISO2022 of 7-bit environment. This doesn't use any locking
2588 shift and single shift functions. Assigned the coding-system
2589 (Lisp symbol) `iso-2022-7bit' by default.
4ed46869
KH
2590
2591 o coding-category-iso-8-1
2592
2593 The category for a coding system which has the same code range
2594 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
2595 for DIMENSION1 charset. This doesn't use any locking shift
2596 and single shift functions. Assigned the coding-system (Lisp
2597 symbol) `iso-latin-1' by default.
4ed46869
KH
2598
2599 o coding-category-iso-8-2
2600
2601 The category for a coding system which has the same code range
2602 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
2603 for DIMENSION2 charset. This doesn't use any locking shift
2604 and single shift functions. Assigned the coding-system (Lisp
2605 symbol) `japanese-iso-8bit' by default.
4ed46869 2606
7717c392 2607 o coding-category-iso-7-else
4ed46869
KH
2608
2609 The category for a coding system which has the same code range
7717c392
KH
2610 as ISO2022 of 7-bit environemnt but uses locking shift or
2611 single shift functions. Assigned the coding-system (Lisp
2612 symbol) `iso-2022-7bit-lock' by default.
2613
2614 o coding-category-iso-8-else
2615
2616 The category for a coding system which has the same code range
2617 as ISO2022 of 8-bit environemnt but uses locking shift or
2618 single shift functions. Assigned the coding-system (Lisp
2619 symbol) `iso-2022-8bit-ss2' by default.
4ed46869
KH
2620
2621 o coding-category-big5
2622
2623 The category for a coding system which has the same code range
2624 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 2625 `cn-big5' by default.
4ed46869
KH
2626
2627 o coding-category-binary
2628
2629 The category for a coding system not categorized in any of the
2630 above. Assigned the coding-system (Lisp symbol)
e0e989f6 2631 `no-conversion' by default.
4ed46869
KH
2632
2633 Each of them is a Lisp symbol and the value is an actual
2634 `coding-system's (this is also a Lisp symbol) assigned by a user.
2635 What Emacs does actually is to detect a category of coding system.
2636 Then, it uses a `coding-system' assigned to it. If Emacs can't
2637 decide only one possible category, it selects a category of the
2638 highest priority. Priorities of categories are also specified by a
2639 user in a Lisp variable `coding-category-list'.
2640
2641*/
2642
2643/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2644 If it detects possible coding systems, return an integer in which
2645 appropriate flag bits are set. Flag bits are defined by macros
2646 CODING_CATEGORY_MASK_XXX in `coding.h'. */
2647
2648int
2649detect_coding_mask (src, src_bytes)
2650 unsigned char *src;
2651 int src_bytes;
2652{
2653 register unsigned char c;
2654 unsigned char *src_end = src + src_bytes;
2655 int mask;
2656
2657 /* At first, skip all ASCII characters and control characters except
2658 for three ISO2022 specific control characters. */
bcf26d6a 2659 label_loop_detect_coding:
4ed46869
KH
2660 while (src < src_end)
2661 {
2662 c = *src;
2663 if (c >= 0x80
2664 || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2665 break;
2666 src++;
2667 }
2668
2669 if (src >= src_end)
2670 /* We found nothing other than ASCII. There's nothing to do. */
2671 return CODING_CATEGORY_MASK_ANY;
2672
2673 /* The text seems to be encoded in some multilingual coding system.
2674 Now, try to find in which coding system the text is encoded. */
2675 if (c < 0x80)
bcf26d6a
KH
2676 {
2677 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2678 /* C is an ISO2022 specific control code of C0. */
2679 mask = detect_coding_iso2022 (src, src_end);
2680 src++;
2681 if (mask == CODING_CATEGORY_MASK_ANY)
2682 /* No valid ISO2022 code follows C. Try again. */
2683 goto label_loop_detect_coding;
2684 }
4ed46869 2685 else if (c < 0xA0)
c4825358 2686 {
3f003981 2687 /* If C is a special latin extra code,
c4825358
KH
2688 or is an ISO2022 specific control code of C1 (SS2 or SS3),
2689 or is an ISO2022 control-sequence-introducer (CSI),
2690 we should also consider the possibility of someof ISO2022 codings. */
3f003981
KH
2691 if ((VECTORP (Vlatin_extra_code_table)
2692 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
c4825358
KH
2693 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
2694 || (c == ISO_CODE_CSI
2695 && (src < src_end
2696 && (*src == ']'
2697 || (src + 1 < src_end
2698 && src[1] == ']'
2699 && (*src == '0' || *src == '1' || *src == '2'))))))
2700 mask = (detect_coding_iso2022 (src, src_end)
2701 | detect_coding_sjis (src, src_end)
2702 | detect_coding_emacs_mule (src, src_end)
2703 | CODING_CATEGORY_MASK_BINARY);
4ed46869 2704
c4825358
KH
2705 else
2706 /* C is the first byte of SJIS character code, or a
2707 leading-code of Emacs. */
2708 mask = (detect_coding_sjis (src, src_end)
2709 | detect_coding_emacs_mule (src, src_end)
2710 | CODING_CATEGORY_MASK_BINARY);
2711 }
4ed46869
KH
2712 else
2713 /* C is a character of ISO2022 in graphic plane right,
2714 or a SJIS's 1-byte character code (i.e. JISX0201),
2715 or the first byte of BIG5's 2-byte code. */
2716 mask = (detect_coding_iso2022 (src, src_end)
2717 | detect_coding_sjis (src, src_end)
10bff6f1
RS
2718 | detect_coding_big5 (src, src_end)
2719 | CODING_CATEGORY_MASK_BINARY);
4ed46869
KH
2720
2721 return mask;
2722}
2723
2724/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2725 The information of the detected coding system is set in CODING. */
2726
2727void
2728detect_coding (coding, src, src_bytes)
2729 struct coding_system *coding;
2730 unsigned char *src;
2731 int src_bytes;
2732{
2733 int mask = detect_coding_mask (src, src_bytes);
2734 int idx;
2735
2736 if (mask == CODING_CATEGORY_MASK_ANY)
2737 /* We found nothing other than ASCII. There's nothing to do. */
2738 return;
2739
2740 if (!mask)
2741 /* The source text seems to be encoded in unknown coding system.
2742 Emacs regards the category of such a kind of coding system as
2743 `coding-category-binary'. We assume that a user has assigned
2744 an appropriate coding system for a `coding-category-binary'. */
2745 idx = CODING_CATEGORY_IDX_BINARY;
2746 else
2747 {
2748 /* We found some plausible coding systems. Let's use a coding
2749 system of the highest priority. */
2750 Lisp_Object val = Vcoding_category_list;
2751
2752 if (CONSP (val))
2753 while (!NILP (val))
2754 {
2755 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2756 if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2757 break;
2758 val = XCONS (val)->cdr;
2759 }
2760 else
2761 val = Qnil;
2762
2763 if (NILP (val))
2764 {
2765 /* For unknown reason, `Vcoding_category_list' contains none
2766 of found categories. Let's use any of them. */
2767 for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2768 if (mask & (1 << idx))
2769 break;
2770 }
2771 }
2772 setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2773}
2774
2775/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2776 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
0ef69138 2777 CODING_EOL_CR, and CODING_EOL_UNDECIDED. */
4ed46869 2778
bc4bc72a
RS
2779#define MAX_EOL_CHECK_COUNT 3
2780
4ed46869
KH
2781int
2782detect_eol_type (src, src_bytes)
2783 unsigned char *src;
2784 int src_bytes;
2785{
2786 unsigned char *src_end = src + src_bytes;
2787 unsigned char c;
bc4bc72a
RS
2788 int total = 0; /* How many end-of-lines are found so far. */
2789 int eol_type = CODING_EOL_UNDECIDED;
2790 int this_eol_type;
4ed46869 2791
bc4bc72a 2792 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4ed46869
KH
2793 {
2794 c = *src++;
bc4bc72a 2795 if (c == '\n' || c == '\r')
4ed46869 2796 {
bc4bc72a
RS
2797 total++;
2798 if (c == '\n')
2799 this_eol_type = CODING_EOL_LF;
2800 else if (src >= src_end || *src != '\n')
2801 this_eol_type = CODING_EOL_CR;
4ed46869 2802 else
bc4bc72a
RS
2803 this_eol_type = CODING_EOL_CRLF, src++;
2804
2805 if (eol_type == CODING_EOL_UNDECIDED)
2806 /* This is the first end-of-line. */
2807 eol_type = this_eol_type;
2808 else if (eol_type != this_eol_type)
2809 /* The found type is different from what found before.
2810 We had better not decode end-of-line. */
2811 return CODING_EOL_LF;
4ed46869
KH
2812 }
2813 }
bc4bc72a 2814
85a02ca4 2815 return eol_type;
4ed46869
KH
2816}
2817
2818/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2819 is encoded. If it detects an appropriate format of end-of-line, it
2820 sets the information in *CODING. */
2821
2822void
2823detect_eol (coding, src, src_bytes)
2824 struct coding_system *coding;
2825 unsigned char *src;
2826 int src_bytes;
2827{
2828 Lisp_Object val;
2829 int eol_type = detect_eol_type (src, src_bytes);
2830
0ef69138 2831 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2832 /* We found no end-of-line in the source text. */
2833 return;
2834
2835 val = Fget (coding->symbol, Qeol_type);
2836 if (VECTORP (val) && XVECTOR (val)->size == 3)
2837 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2838}
2839
2840/* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
2841 decoding, it may detect coding system and format of end-of-line if
2842 those are not yet decided. */
2843
2844int
2845decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2846 struct coding_system *coding;
2847 unsigned char *source, *destination;
2848 int src_bytes, dst_bytes;
2849 int *consumed;
2850{
2851 int produced;
2852
2853 if (src_bytes <= 0)
2854 {
2855 *consumed = 0;
2856 return 0;
2857 }
2858
0ef69138 2859 if (coding->type == coding_type_undecided)
4ed46869
KH
2860 detect_coding (coding, source, src_bytes);
2861
0ef69138 2862 if (coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2863 detect_eol (coding, source, src_bytes);
2864
2865 coding->carryover_size = 0;
2866 switch (coding->type)
2867 {
2868 case coding_type_no_conversion:
2869 label_no_conversion:
2870 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2871 bcopy (source, destination, produced);
2872 *consumed = produced;
2873 break;
2874
0ef69138
KH
2875 case coding_type_emacs_mule:
2876 case coding_type_undecided:
4ed46869 2877 if (coding->eol_type == CODING_EOL_LF
0ef69138 2878 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2879 goto label_no_conversion;
2880 produced = decode_eol (coding, source, destination,
2881 src_bytes, dst_bytes, consumed);
2882 break;
2883
2884 case coding_type_sjis:
2885 produced = decode_coding_sjis_big5 (coding, source, destination,
2886 src_bytes, dst_bytes, consumed,
2887 1);
2888 break;
2889
2890 case coding_type_iso2022:
2891 produced = decode_coding_iso2022 (coding, source, destination,
2892 src_bytes, dst_bytes, consumed);
2893 break;
2894
2895 case coding_type_big5:
2896 produced = decode_coding_sjis_big5 (coding, source, destination,
2897 src_bytes, dst_bytes, consumed,
2898 0);
2899 break;
2900
2901 case coding_type_ccl:
2902 produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2903 src_bytes, dst_bytes, consumed);
2904 break;
2905 }
2906
2907 return produced;
2908}
2909
2910/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
2911
2912int
2913encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2914 struct coding_system *coding;
2915 unsigned char *source, *destination;
2916 int src_bytes, dst_bytes;
2917 int *consumed;
2918{
2919 int produced;
2920
4ed46869
KH
2921 switch (coding->type)
2922 {
2923 case coding_type_no_conversion:
2924 label_no_conversion:
2925 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2926 if (produced > 0)
2927 {
2928 bcopy (source, destination, produced);
2929 if (coding->selective)
2930 {
2931 unsigned char *p = destination, *pend = destination + produced;
2932 while (p < pend)
e0e989f6 2933 if (*p++ == '\015') p[-1] = '\n';
4ed46869
KH
2934 }
2935 }
2936 *consumed = produced;
2937 break;
2938
0ef69138
KH
2939 case coding_type_emacs_mule:
2940 case coding_type_undecided:
4ed46869 2941 if (coding->eol_type == CODING_EOL_LF
0ef69138 2942 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2943 goto label_no_conversion;
2944 produced = encode_eol (coding, source, destination,
2945 src_bytes, dst_bytes, consumed);
2946 break;
2947
2948 case coding_type_sjis:
2949 produced = encode_coding_sjis_big5 (coding, source, destination,
2950 src_bytes, dst_bytes, consumed,
2951 1);
2952 break;
2953
2954 case coding_type_iso2022:
2955 produced = encode_coding_iso2022 (coding, source, destination,
2956 src_bytes, dst_bytes, consumed);
2957 break;
2958
2959 case coding_type_big5:
2960 produced = encode_coding_sjis_big5 (coding, source, destination,
2961 src_bytes, dst_bytes, consumed,
2962 0);
2963 break;
2964
2965 case coding_type_ccl:
2966 produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2967 src_bytes, dst_bytes, consumed);
2968 break;
2969 }
2970
2971 return produced;
2972}
2973
2974#define CONVERSION_BUFFER_EXTRA_ROOM 256
2975
2976/* Return maximum size (bytes) of a buffer enough for decoding
2977 SRC_BYTES of text encoded in CODING. */
2978
2979int
2980decoding_buffer_size (coding, src_bytes)
2981 struct coding_system *coding;
2982 int src_bytes;
2983{
2984 int magnification;
2985
2986 if (coding->type == coding_type_iso2022)
2987 magnification = 3;
2988 else if (coding->type == coding_type_ccl)
2989 magnification = coding->spec.ccl.decoder.buf_magnification;
2990 else
2991 magnification = 2;
2992
2993 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2994}
2995
2996/* Return maximum size (bytes) of a buffer enough for encoding
2997 SRC_BYTES of text to CODING. */
2998
2999int
3000encoding_buffer_size (coding, src_bytes)
3001 struct coding_system *coding;
3002 int src_bytes;
3003{
3004 int magnification;
3005
3006 if (coding->type == coding_type_ccl)
3007 magnification = coding->spec.ccl.encoder.buf_magnification;
3008 else
3009 magnification = 3;
3010
3011 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3012}
3013
3014#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
3015#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
3016#endif
3017
3018char *conversion_buffer;
3019int conversion_buffer_size;
3020
3021/* Return a pointer to a SIZE bytes of buffer to be used for encoding
3022 or decoding. Sufficient memory is allocated automatically. If we
3023 run out of memory, return NULL. */
3024
3025char *
3026get_conversion_buffer (size)
3027 int size;
3028{
3029 if (size > conversion_buffer_size)
3030 {
3031 char *buf;
3032 int real_size = conversion_buffer_size * 2;
3033
3034 while (real_size < size) real_size *= 2;
3035 buf = (char *) xmalloc (real_size);
3036 xfree (conversion_buffer);
3037 conversion_buffer = buf;
3038 conversion_buffer_size = real_size;
3039 }
3040 return conversion_buffer;
3041}
3042
3043\f
3044#ifdef emacs
3045/*** 7. Emacs Lisp library functions ***/
3046
02ba4723 3047DEFUN ("coding-system-spec", Fcoding_system_spec, Scoding_system_spec,
4ed46869 3048 1, 1, 0,
02ba4723 3049 "Return coding-spec of CODING-SYSTEM.\n\
4ed46869
KH
3050If CODING-SYSTEM is not a valid coding-system, return nil.")
3051 (obj)
3052 Lisp_Object obj;
3053{
3054 while (SYMBOLP (obj) && !NILP (obj))
3055 obj = Fget (obj, Qcoding_system);
3056 return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
3057 ? Qnil : obj);
3058}
3059
3060DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
3061 "Return t if OBJECT is nil or a coding-system.\n\
3062See document of make-coding-system for coding-system object.")
3063 (obj)
3064 Lisp_Object obj;
3065{
02ba4723 3066 return ((NILP (obj) || !NILP (Fcoding_system_spec (obj))) ? Qt : Qnil);
4ed46869
KH
3067}
3068
9d991de8
RS
3069DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
3070 Sread_non_nil_coding_system, 1, 1, 0,
e0e989f6 3071 "Read a coding system from the minibuffer, prompting with string PROMPT.")
4ed46869
KH
3072 (prompt)
3073 Lisp_Object prompt;
3074{
e0e989f6 3075 Lisp_Object val;
9d991de8
RS
3076 do
3077 {
02ba4723 3078 val = Fcompleting_read (prompt, Vobarray, Qcoding_system_spec,
9d991de8
RS
3079 Qt, Qnil, Qnil, Qnil);
3080 }
3081 while (XSTRING (val)->size == 0);
e0e989f6 3082 return (Fintern (val, Qnil));
4ed46869
KH
3083}
3084
3085DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
e0e989f6 3086 "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
4ed46869
KH
3087 (prompt)
3088 Lisp_Object prompt;
3089{
e0e989f6 3090 Lisp_Object val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
9d991de8 3091 Qt, Qnil, Qnil, Qnil);
e0e989f6 3092 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
3093}
3094
3095DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
3096 1, 1, 0,
3097 "Check validity of CODING-SYSTEM.\n\
3098If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
3099CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
3100The value of property should be a vector of length 5.")
3101 (coding_system)
3102 Lisp_Object coding_system;
3103{
3104 CHECK_SYMBOL (coding_system, 0);
3105 if (!NILP (Fcoding_system_p (coding_system)))
3106 return coding_system;
3107 while (1)
02ba4723 3108 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4ed46869
KH
3109}
3110
3111DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
3112 2, 2, 0,
bf9cdd4e
KH
3113 "Detect coding system of the text in the region between START and END.\n\
3114Return a list of possible coding systems ordered by priority.\n\
0ef69138 3115If only ASCII characters are found, it returns `undecided'\n\
bf9cdd4e 3116 or its subsidiary coding system according to a detected end-of-line format.")
4ed46869
KH
3117 (b, e)
3118 Lisp_Object b, e;
3119{
3120 int coding_mask, eol_type;
3121 Lisp_Object val;
3122 int beg, end;
3123
3124 validate_region (&b, &e);
3125 beg = XINT (b), end = XINT (e);
3126 if (beg < GPT && end >= GPT) move_gap (end);
3127
3128 coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
3129 eol_type = detect_eol_type (POS_ADDR (beg), end - beg);
3130
3131 if (coding_mask == CODING_CATEGORY_MASK_ANY)
3132 {
0ef69138
KH
3133 val = intern ("undecided");
3134 if (eol_type != CODING_EOL_UNDECIDED)
4ed46869
KH
3135 {
3136 Lisp_Object val2 = Fget (val, Qeol_type);
3137 if (VECTORP (val2))
3138 val = XVECTOR (val2)->contents[eol_type];
3139 }
3140 }
3141 else
3142 {
3143 Lisp_Object val2;
3144
3145 /* At first, gather possible coding-systems in VAL in a reverse
3146 order. */
3147 val = Qnil;
3148 for (val2 = Vcoding_category_list;
3149 !NILP (val2);
3150 val2 = XCONS (val2)->cdr)
3151 {
3152 int idx
3153 = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
3154 if (coding_mask & (1 << idx))
3155 val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
3156 }
3157
3158 /* Then, change the order of the list, while getting subsidiary
3159 coding-systems. */
3160 val2 = val;
3161 val = Qnil;
3162 for (; !NILP (val2); val2 = XCONS (val2)->cdr)
3163 {
0ef69138 3164 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
3165 val = Fcons (XCONS (val2)->car, val);
3166 else
3167 {
3168 Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
3169 if (VECTORP (val3))
3170 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
3171 else
3172 val = Fcons (XCONS (val2)->car, val);
3173 }
3174 }
3175 }
3176
3177 return val;
3178}
3179
3180/* Scan text in the region between *BEGP and *ENDP, skip characters
3181 which we never have to encode to (iff ENCODEP is 1) or decode from
3182 coding system CODING at the head and tail, then set BEGP and ENDP
3183 to the addresses of start and end of the text we actually convert. */
3184
3185void
3186shrink_conversion_area (begp, endp, coding, encodep)
3187 unsigned char **begp, **endp;
3188 struct coding_system *coding;
3189 int encodep;
3190{
3191 register unsigned char *beg_addr = *begp, *end_addr = *endp;
3192
3193 if (coding->eol_type != CODING_EOL_LF
0ef69138 3194 && coding->eol_type != CODING_EOL_UNDECIDED)
4ed46869
KH
3195 /* Since we anyway have to convert end-of-line format, it is not
3196 worth skipping at most 100 bytes or so. */
3197 return;
3198
3199 if (encodep) /* for encoding */
3200 {
3201 switch (coding->type)
3202 {
3203 case coding_type_no_conversion:
0ef69138
KH
3204 case coding_type_emacs_mule:
3205 case coding_type_undecided:
4ed46869
KH
3206 /* We need no conversion. */
3207 *begp = *endp;
3208 return;
3209 case coding_type_ccl:
3210 /* We can't skip any data. */
3211 return;
e0e989f6
KH
3212 case coding_type_iso2022:
3213 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3214 {
3215 unsigned char *bol = beg_addr;
3216 while (beg_addr < end_addr && *beg_addr < 0x80)
3217 {
3218 beg_addr++;
3219 if (*(beg_addr - 1) == '\n')
3220 bol = beg_addr;
3221 }
3222 beg_addr = bol;
3223 goto label_skip_tail;
3224 }
3225 /* fall down ... */
4ed46869
KH
3226 default:
3227 /* We can skip all ASCII characters at the head and tail. */
3228 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
e0e989f6 3229 label_skip_tail:
4ed46869
KH
3230 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3231 break;
3232 }
3233 }
3234 else /* for decoding */
3235 {
3236 switch (coding->type)
3237 {
3238 case coding_type_no_conversion:
3239 /* We need no conversion. */
3240 *begp = *endp;
3241 return;
0ef69138 3242 case coding_type_emacs_mule:
4ed46869
KH
3243 if (coding->eol_type == CODING_EOL_LF)
3244 {
3245 /* We need no conversion. */
3246 *begp = *endp;
3247 return;
3248 }
3249 /* We can skip all but carriage-return. */
3250 while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
3251 while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
3252 break;
3253 case coding_type_sjis:
3254 case coding_type_big5:
3255 /* We can skip all ASCII characters at the head. */
3256 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3257 /* We can skip all ASCII characters at the tail except for
3258 the second byte of SJIS or BIG5 code. */
3259 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3260 if (end_addr != *endp)
3261 end_addr++;
3262 break;
3263 case coding_type_ccl:
3264 /* We can't skip any data. */
3265 return;
3266 default: /* i.e. case coding_type_iso2022: */
3267 {
3268 unsigned char c;
3269
3270 /* We can skip all ASCII characters except for a few
3271 control codes at the head. */
3272 while (beg_addr < end_addr && (c = *beg_addr) < 0x80
3273 && c != ISO_CODE_CR && c != ISO_CODE_SO
3274 && c != ISO_CODE_SI && c != ISO_CODE_ESC)
3275 beg_addr++;
3276 }
3277 break;
3278 }
3279 }
3280 *begp = beg_addr;
3281 *endp = end_addr;
3282 return;
3283}
3284
3285/* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3286 text between B and E. B and E are buffer position. */
3287
3288Lisp_Object
3289code_convert_region (b, e, coding, encodep)
3290 Lisp_Object b, e;
3291 struct coding_system *coding;
3292 int encodep;
3293{
3294 int beg, end, len, consumed, produced;
3295 char *buf;
3296 unsigned char *begp, *endp;
3297 int pos = PT;
3298
3299 validate_region (&b, &e);
3300 beg = XINT (b), end = XINT (e);
3301 if (beg < GPT && end >= GPT)
3302 move_gap (end);
3303
3304 if (encodep && !NILP (coding->pre_write_conversion))
3305 {
3306 /* We must call a pre-conversion function which may put a new
3307 text to be converted in a new buffer. */
3308 struct buffer *old = current_buffer, *new;
3309
3310 TEMP_SET_PT (beg);
3311 call2 (coding->pre_write_conversion, b, e);
3312 if (old != current_buffer)
3313 {
3314 /* Replace the original text by the text just generated. */
3315 len = ZV - BEGV;
3316 new = current_buffer;
3317 set_buffer_internal (old);
3318 del_range (beg, end);
3319 insert_from_buffer (new, 1, len, 0);
3320 end = beg + len;
3321 }
3322 }
3323
3324 /* We may be able to shrink the conversion region. */
3325 begp = POS_ADDR (beg); endp = begp + (end - beg);
3326 shrink_conversion_area (&begp, &endp, coding, encodep);
3327
3328 if (begp == endp)
3329 /* We need no conversion. */
3330 len = end - beg;
3331 else
3332 {
3333 beg += begp - POS_ADDR (beg);
3334 end = beg + (endp - begp);
3335
3336 if (encodep)
3337 len = encoding_buffer_size (coding, end - beg);
3338 else
3339 len = decoding_buffer_size (coding, end - beg);
3340 buf = get_conversion_buffer (len);
3341
3342 coding->last_block = 1;
3343 produced = (encodep
3344 ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3345 &consumed)
3346 : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3347 &consumed));
3348
3349 len = produced + (beg - XINT (b)) + (XINT (e) - end);
3350
3351 TEMP_SET_PT (beg);
3352 insert (buf, produced);
3353 del_range (PT, PT + end - beg);
3354 if (pos >= end)
3355 pos = PT + (pos - end);
3356 else if (pos > beg)
3357 pos = beg;
3358 TEMP_SET_PT (pos);
3359 }
3360
3361 if (!encodep && !NILP (coding->post_read_conversion))
3362 {
3363 /* We must call a post-conversion function which may alter
3364 the text just converted. */
3365 Lisp_Object insval;
3366
3367 beg = XINT (b);
3368 TEMP_SET_PT (beg);
3369 insval = call1 (coding->post_read_conversion, make_number (len));
3370 CHECK_NUMBER (insval, 0);
3371 len = XINT (insval);
3372 }
3373
3374 return make_number (len);
3375}
3376
3377Lisp_Object
e0e989f6
KH
3378code_convert_string (str, coding, encodep, nocopy)
3379 Lisp_Object str, nocopy;
4ed46869
KH
3380 struct coding_system *coding;
3381 int encodep;
3382{
3383 int len, consumed, produced;
3384 char *buf;
3385 unsigned char *begp, *endp;
3386 int head_skip, tail_skip;
3387 struct gcpro gcpro1;
3388
3389 if (encodep && !NILP (coding->pre_write_conversion)
3390 || !encodep && !NILP (coding->post_read_conversion))
3391 {
3392 /* Since we have to call Lisp functions which assume target text
3393 is in a buffer, after setting a temporary buffer, call
3394 code_convert_region. */
3395 int count = specpdl_ptr - specpdl;
3396 int len = XSTRING (str)->size;
3397 Lisp_Object result;
3398 struct buffer *old = current_buffer;
3399
3400 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3401 temp_output_buffer_setup (" *code-converting-work*");
3402 set_buffer_internal (XBUFFER (Vstandard_output));
3403 insert_from_string (str, 0, len, 0);
3404 code_convert_region (make_number (BEGV), make_number (ZV),
3405 coding, encodep);
3406 result = make_buffer_string (BEGV, ZV, 0);
3407 set_buffer_internal (old);
3408 return unbind_to (count, result);
3409 }
3410
3411 /* We may be able to shrink the conversion region. */
3412 begp = XSTRING (str)->data;
3413 endp = begp + XSTRING (str)->size;
3414 shrink_conversion_area (&begp, &endp, coding, encodep);
3415
3416 if (begp == endp)
3417 /* We need no conversion. */
e0e989f6 3418 return (NILP (nocopy) ? Fcopy_sequence (str) : str);
4ed46869
KH
3419
3420 head_skip = begp - XSTRING (str)->data;
3421 tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3422
3423 GCPRO1 (str);
3424
3425 if (encodep)
3426 len = encoding_buffer_size (coding, endp - begp);
3427 else
3428 len = decoding_buffer_size (coding, endp - begp);
3429 buf = get_conversion_buffer (len + head_skip + tail_skip);
3430
3431 bcopy (XSTRING (str)->data, buf, head_skip);
3432 coding->last_block = 1;
3433 produced = (encodep
3434 ? encode_coding (coding, XSTRING (str)->data + head_skip,
3435 buf + head_skip, endp - begp, len, &consumed)
3436 : decode_coding (coding, XSTRING (str)->data + head_skip,
3437 buf + head_skip, endp - begp, len, &consumed));
3438 bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3439 buf + head_skip + produced,
3440 tail_skip);
3441
3442 UNGCPRO;
3443
3444 return make_string (buf, head_skip + produced + tail_skip);
3445}
3446
3447DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
e0e989f6
KH
3448 3, 3, "r\nzCoding system: ",
3449 "Decode current region by specified coding system.\n\
3450When called from a program, takes three arguments:\n\
3451START, END, and CODING-SYSTEM. START END are buffer positions.\n\
4ed46869
KH
3452Return length of decoded text.")
3453 (b, e, coding_system)
3454 Lisp_Object b, e, coding_system;
3455{
3456 struct coding_system coding;
3457
3458 CHECK_NUMBER_COERCE_MARKER (b, 0);
3459 CHECK_NUMBER_COERCE_MARKER (e, 1);
3460 CHECK_SYMBOL (coding_system, 2);
3461
e0e989f6
KH
3462 if (NILP (coding_system))
3463 return make_number (XFASTINT (e) - XFASTINT (b));
4ed46869
KH
3464 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3465 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3466
3467 return code_convert_region (b, e, &coding, 0);
3468}
3469
3470DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
e0e989f6
KH
3471 3, 3, "r\nzCoding system: ",
3472 "Encode current region by specified coding system.\n\
3473When called from a program, takes three arguments:\n\
3474START, END, and CODING-SYSTEM. START END are buffer positions.\n\
4ed46869
KH
3475Return length of encoded text.")
3476 (b, e, coding_system)
3477 Lisp_Object b, e, coding_system;
3478{
3479 struct coding_system coding;
3480
3481 CHECK_NUMBER_COERCE_MARKER (b, 0);
3482 CHECK_NUMBER_COERCE_MARKER (e, 1);
3483 CHECK_SYMBOL (coding_system, 2);
3484
e0e989f6
KH
3485 if (NILP (coding_system))
3486 return make_number (XFASTINT (e) - XFASTINT (b));
4ed46869
KH
3487 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3488 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3489
3490 return code_convert_region (b, e, &coding, 1);
3491}
3492
3493DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
e0e989f6
KH
3494 2, 3, 0,
3495 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3496Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3497of decoding.")
3498 (string, coding_system, nocopy)
3499 Lisp_Object string, coding_system, nocopy;
4ed46869
KH
3500{
3501 struct coding_system coding;
3502
3503 CHECK_STRING (string, 0);
3504 CHECK_SYMBOL (coding_system, 1);
3505
e0e989f6
KH
3506 if (NILP (coding_system))
3507 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869
KH
3508 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3509 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3510
e0e989f6 3511 return code_convert_string (string, &coding, 0, nocopy);
4ed46869
KH
3512}
3513
3514DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
e0e989f6
KH
3515 2, 3, 0,
3516 "Encode STRING to CODING-SYSTEM, and return the result.\n\
3517Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3518of encoding.")
3519 (string, coding_system, nocopy)
3520 Lisp_Object string, coding_system, nocopy;
4ed46869
KH
3521{
3522 struct coding_system coding;
3523
3524 CHECK_STRING (string, 0);
3525 CHECK_SYMBOL (coding_system, 1);
3526
e0e989f6
KH
3527 if (NILP (coding_system))
3528 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869
KH
3529 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3530 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3531
e0e989f6 3532 return code_convert_string (string, &coding, 1, nocopy);
4ed46869
KH
3533}
3534
3535DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
e0e989f6 3536 "Decode a JISX0208 character of shift-jis encoding.\n\
4ed46869
KH
3537CODE is the character code in SJIS.\n\
3538Return the corresponding character.")
3539 (code)
3540 Lisp_Object code;
3541{
3542 unsigned char c1, c2, s1, s2;
3543 Lisp_Object val;
3544
3545 CHECK_NUMBER (code, 0);
3546 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3547 DECODE_SJIS (s1, s2, c1, c2);
3548 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3549 return val;
3550}
3551
3552DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3553 "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3554Return the corresponding character code in SJIS.")
3555 (ch)
3556 Lisp_Object ch;
3557{
bcf26d6a 3558 int charset, c1, c2, s1, s2;
4ed46869
KH
3559 Lisp_Object val;
3560
3561 CHECK_NUMBER (ch, 0);
3562 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3563 if (charset == charset_jisx0208)
3564 {
3565 ENCODE_SJIS (c1, c2, s1, s2);
bcf26d6a 3566 XSETFASTINT (val, (s1 << 8) | s2);
4ed46869
KH
3567 }
3568 else
3569 XSETFASTINT (val, 0);
3570 return val;
3571}
3572
3573DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3574 "Decode a Big5 character CODE of BIG5 coding-system.\n\
3575CODE is the character code in BIG5.\n\
3576Return the corresponding character.")
3577 (code)
3578 Lisp_Object code;
3579{
3580 int charset;
3581 unsigned char b1, b2, c1, c2;
3582 Lisp_Object val;
3583
3584 CHECK_NUMBER (code, 0);
3585 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3586 DECODE_BIG5 (b1, b2, charset, c1, c2);
3587 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3588 return val;
3589}
3590
3591DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3592 "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3593Return the corresponding character code in Big5.")
3594 (ch)
3595 Lisp_Object ch;
3596{
bcf26d6a 3597 int charset, c1, c2, b1, b2;
4ed46869
KH
3598 Lisp_Object val;
3599
3600 CHECK_NUMBER (ch, 0);
3601 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3602 if (charset == charset_big5_1 || charset == charset_big5_2)
3603 {
3604 ENCODE_BIG5 (charset, c1, c2, b1, b2);
bcf26d6a 3605 XSETFASTINT (val, (b1 << 8) | b2);
4ed46869
KH
3606 }
3607 else
3608 XSETFASTINT (val, 0);
3609 return val;
3610}
3611
1ba9e4ab
KH
3612DEFUN ("set-terminal-coding-system-internal",
3613 Fset_terminal_coding_system_internal,
3614 Sset_terminal_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
3615 (coding_system)
3616 Lisp_Object coding_system;
3617{
3618 CHECK_SYMBOL (coding_system, 0);
3619 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
6e85d753
KH
3620 /* We had better not send unexpected characters to terminal. */
3621 terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
3622
4ed46869
KH
3623 return Qnil;
3624}
3625
c4825358
KH
3626DEFUN ("set-safe-terminal-coding-system-internal",
3627 Fset_safe_terminal_coding_system_internal,
3628 Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
3629 (coding_system)
3630 Lisp_Object coding_system;
3631{
3632 CHECK_SYMBOL (coding_system, 0);
3633 setup_coding_system (Fcheck_coding_system (coding_system),
3634 &safe_terminal_coding);
3635 return Qnil;
3636}
3637
4ed46869
KH
3638DEFUN ("terminal-coding-system",
3639 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3640 "Return coding-system of your terminal.")
3641 ()
3642{
3643 return terminal_coding.symbol;
3644}
3645
1ba9e4ab
KH
3646DEFUN ("set-keyboard-coding-system-internal",
3647 Fset_keyboard_coding_system_internal,
3648 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
3649 (coding_system)
3650 Lisp_Object coding_system;
3651{
3652 CHECK_SYMBOL (coding_system, 0);
3653 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3654 return Qnil;
3655}
3656
3657DEFUN ("keyboard-coding-system",
3658 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3659 "Return coding-system of what is sent from terminal keyboard.")
3660 ()
3661{
3662 return keyboard_coding.symbol;
3663}
3664
3665\f
a5d301df
KH
3666DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
3667 Sfind_operation_coding_system, 1, MANY, 0,
3668 "Choose a coding system for an operation based on the target name.\n\
9ce27fde
KH
3669The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
3670DECODING-SYSTEM is the coding system to use for decoding\n\
3671\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
3672for encoding (in case OPERATION does encoding).\n\
ccdb79f5
RS
3673\n\
3674The first argument OPERATION specifies an I/O primitive:\n\
3675 For file I/O, `insert-file-contents' or `write-region'.\n\
3676 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3677 For network I/O, `open-network-stream'.\n\
3678\n\
3679The remaining arguments should be the same arguments that were passed\n\
3680to the primitive. Depending on which primitive, one of those arguments\n\
3681is selected as the TARGET. For example, if OPERATION does file I/O,\n\
3682whichever argument specifies the file name is TARGET.\n\
3683\n\
3684TARGET has a meaning which depends on OPERATION:\n\
4ed46869
KH
3685 For file I/O, TARGET is a file name.\n\
3686 For process I/O, TARGET is a process name.\n\
3687 For network I/O, TARGET is a service name or a port number\n\
3688\n\
02ba4723
KH
3689This function looks up what specified for TARGET in,\n\
3690`file-coding-system-alist', `process-coding-system-alist',\n\
3691or `network-coding-system-alist' depending on OPERATION.\n\
3692They may specify a coding system, a cons of coding systems,\n\
3693or a function symbol to call.\n\
3694In the last case, we call the function with one argument,\n\
9ce27fde 3695which is a list of all the arguments given to this function.")
4ed46869
KH
3696 (nargs, args)
3697 int nargs;
3698 Lisp_Object *args;
3699{
3700 Lisp_Object operation, target_idx, target, val;
3701 register Lisp_Object chain;
3702
3703 if (nargs < 2)
3704 error ("Too few arguments");
3705 operation = args[0];
3706 if (!SYMBOLP (operation)
3707 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3708 error ("Invalid first arguement");
3709 if (nargs < 1 + XINT (target_idx))
3710 error ("Too few arguments for operation: %s",
3711 XSYMBOL (operation)->name->data);
3712 target = args[XINT (target_idx) + 1];
3713 if (!(STRINGP (target)
3714 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3715 error ("Invalid %dth argument", XINT (target_idx) + 1);
3716
2e34157c
RS
3717 chain = ((EQ (operation, Qinsert_file_contents)
3718 || EQ (operation, Qwrite_region))
02ba4723 3719 ? Vfile_coding_system_alist
2e34157c 3720 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
3721 ? Vnetwork_coding_system_alist
3722 : Vprocess_coding_system_alist));
4ed46869
KH
3723 if (NILP (chain))
3724 return Qnil;
3725
02ba4723 3726 for (; CONSP (chain); chain = XCONS (chain)->cdr)
4ed46869
KH
3727 {
3728 Lisp_Object elt = XCONS (chain)->car;
3729
3730 if (CONSP (elt)
3731 && ((STRINGP (target)
3732 && STRINGP (XCONS (elt)->car)
3733 && fast_string_match (XCONS (elt)->car, target) >= 0)
3734 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
02ba4723
KH
3735 {
3736 val = XCONS (elt)->cdr;
3737 if (CONSP (val))
3738 return val;
3739 if (! SYMBOLP (val))
3740 return Qnil;
3741 if (! NILP (Fcoding_system_p (val)))
3742 return Fcons (val, val);
465edc86 3743 if (!NILP (Ffboundp (val)))
5d632ccf 3744 return call1 (val, Flist (nargs, args));
02ba4723
KH
3745 return Qnil;
3746 }
4ed46869
KH
3747 }
3748 return Qnil;
3749}
3750
3751#endif /* emacs */
3752
3753\f
3754/*** 8. Post-amble ***/
3755
3756init_coding_once ()
3757{
3758 int i;
3759
0ef69138 3760 /* Emacs' internal format specific initialize routine. */
4ed46869
KH
3761 for (i = 0; i <= 0x20; i++)
3762 emacs_code_class[i] = EMACS_control_code;
3763 emacs_code_class[0x0A] = EMACS_linefeed_code;
3764 emacs_code_class[0x0D] = EMACS_carriage_return_code;
3765 for (i = 0x21 ; i < 0x7F; i++)
3766 emacs_code_class[i] = EMACS_ascii_code;
3767 emacs_code_class[0x7F] = EMACS_control_code;
3768 emacs_code_class[0x80] = EMACS_leading_code_composition;
3769 for (i = 0x81; i < 0xFF; i++)
3770 emacs_code_class[i] = EMACS_invalid_code;
3771 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3772 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3773 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3774 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3775
3776 /* ISO2022 specific initialize routine. */
3777 for (i = 0; i < 0x20; i++)
3778 iso_code_class[i] = ISO_control_code;
3779 for (i = 0x21; i < 0x7F; i++)
3780 iso_code_class[i] = ISO_graphic_plane_0;
3781 for (i = 0x80; i < 0xA0; i++)
3782 iso_code_class[i] = ISO_control_code;
3783 for (i = 0xA1; i < 0xFF; i++)
3784 iso_code_class[i] = ISO_graphic_plane_1;
3785 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3786 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3787 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3788 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3789 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3790 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3791 iso_code_class[ISO_CODE_ESC] = ISO_escape;
3792 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3793 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3794 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3795
e0e989f6
KH
3796 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3797 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3798
3799 setup_coding_system (Qnil, &keyboard_coding);
3800 setup_coding_system (Qnil, &terminal_coding);
c4825358 3801 setup_coding_system (Qnil, &safe_terminal_coding);
9ce27fde
KH
3802
3803#if defined (MSDOS) || defined (WINDOWSNT)
3804 system_eol_type = CODING_EOL_CRLF;
3805#else
3806 system_eol_type = CODING_EOL_LF;
3807#endif
e0e989f6
KH
3808}
3809
3810#ifdef emacs
3811
3812syms_of_coding ()
3813{
3814 Qtarget_idx = intern ("target-idx");
3815 staticpro (&Qtarget_idx);
3816
9ce27fde 3817 /* Target FILENAME is the first argument. */
e0e989f6 3818 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 3819 /* Target FILENAME is the third argument. */
e0e989f6
KH
3820 Fput (Qwrite_region, Qtarget_idx, make_number (2));
3821
3822 Qcall_process = intern ("call-process");
3823 staticpro (&Qcall_process);
9ce27fde 3824 /* Target PROGRAM is the first argument. */
e0e989f6
KH
3825 Fput (Qcall_process, Qtarget_idx, make_number (0));
3826
3827 Qcall_process_region = intern ("call-process-region");
3828 staticpro (&Qcall_process_region);
9ce27fde 3829 /* Target PROGRAM is the third argument. */
e0e989f6
KH
3830 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3831
3832 Qstart_process = intern ("start-process");
3833 staticpro (&Qstart_process);
9ce27fde 3834 /* Target PROGRAM is the third argument. */
e0e989f6
KH
3835 Fput (Qstart_process, Qtarget_idx, make_number (2));
3836
3837 Qopen_network_stream = intern ("open-network-stream");
3838 staticpro (&Qopen_network_stream);
9ce27fde 3839 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
3840 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3841
4ed46869
KH
3842 Qcoding_system = intern ("coding-system");
3843 staticpro (&Qcoding_system);
3844
3845 Qeol_type = intern ("eol-type");
3846 staticpro (&Qeol_type);
3847
3848 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3849 staticpro (&Qbuffer_file_coding_system);
3850
3851 Qpost_read_conversion = intern ("post-read-conversion");
3852 staticpro (&Qpost_read_conversion);
3853
3854 Qpre_write_conversion = intern ("pre-write-conversion");
3855 staticpro (&Qpre_write_conversion);
3856
02ba4723
KH
3857 Qcoding_system_spec = intern ("coding-system-spec");
3858 staticpro (&Qcoding_system_spec);
4ed46869
KH
3859
3860 Qcoding_system_p = intern ("coding-system-p");
3861 staticpro (&Qcoding_system_p);
3862
3863 Qcoding_system_error = intern ("coding-system-error");
3864 staticpro (&Qcoding_system_error);
3865
3866 Fput (Qcoding_system_error, Qerror_conditions,
3867 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3868 Fput (Qcoding_system_error, Qerror_message,
9ce27fde 3869 build_string ("Invalid coding system"));
4ed46869
KH
3870
3871 Qcoding_category_index = intern ("coding-category-index");
3872 staticpro (&Qcoding_category_index);
3873
3874 {
3875 int i;
3876 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3877 {
3878 coding_category_table[i] = intern (coding_category_name[i]);
3879 staticpro (&coding_category_table[i]);
3880 Fput (coding_category_table[i], Qcoding_category_index,
3881 make_number (i));
3882 }
3883 }
3884
bdd9fb48
KH
3885 Qcharacter_unification_table = intern ("character-unification-table");
3886 staticpro (&Qcharacter_unification_table);
3887 Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
3888 make_number (0));
3889
a5d301df
KH
3890 Qcharacter_unification_table_for_decode
3891 = intern ("character-unification-table-for-decode");
3892 staticpro (&Qcharacter_unification_table_for_decode);
3893
3894 Qcharacter_unification_table_for_encode
3895 = intern ("character-unification-table-for-encode");
3896 staticpro (&Qcharacter_unification_table_for_encode);
3897
9ce27fde
KH
3898 Qemacs_mule = intern ("emacs-mule");
3899 staticpro (&Qemacs_mule);
3900
02ba4723 3901 defsubr (&Scoding_system_spec);
4ed46869
KH
3902 defsubr (&Scoding_system_p);
3903 defsubr (&Sread_coding_system);
3904 defsubr (&Sread_non_nil_coding_system);
3905 defsubr (&Scheck_coding_system);
3906 defsubr (&Sdetect_coding_region);
3907 defsubr (&Sdecode_coding_region);
3908 defsubr (&Sencode_coding_region);
3909 defsubr (&Sdecode_coding_string);
3910 defsubr (&Sencode_coding_string);
3911 defsubr (&Sdecode_sjis_char);
3912 defsubr (&Sencode_sjis_char);
3913 defsubr (&Sdecode_big5_char);
3914 defsubr (&Sencode_big5_char);
1ba9e4ab 3915 defsubr (&Sset_terminal_coding_system_internal);
c4825358 3916 defsubr (&Sset_safe_terminal_coding_system_internal);
4ed46869 3917 defsubr (&Sterminal_coding_system);
1ba9e4ab 3918 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 3919 defsubr (&Skeyboard_coding_system);
a5d301df 3920 defsubr (&Sfind_operation_coding_system);
4ed46869
KH
3921
3922 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3923 "List of coding-categories (symbols) ordered by priority.");
3924 {
3925 int i;
3926
3927 Vcoding_category_list = Qnil;
3928 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3929 Vcoding_category_list
3930 = Fcons (coding_category_table[i], Vcoding_category_list);
3931 }
3932
3933 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
10bff6f1 3934 "Specify the coding system for read operations.\n\
2ebb362d 3935It is useful to bind this variable with `let', but do not set it globally.\n\
4ed46869 3936If the value is a coding system, it is used for decoding on read operation.\n\
a67a9c66 3937If not, an appropriate element is used from one of the coding system alists:\n\
10bff6f1 3938There are three such tables, `file-coding-system-alist',\n\
a67a9c66 3939`process-coding-system-alist', and `network-coding-system-alist'.");
4ed46869
KH
3940 Vcoding_system_for_read = Qnil;
3941
3942 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
10bff6f1 3943 "Specify the coding system for write operations.\n\
2ebb362d 3944It is useful to bind this variable with `let', but do not set it globally.\n\
4ed46869 3945If the value is a coding system, it is used for encoding on write operation.\n\
a67a9c66 3946If not, an appropriate element is used from one of the coding system alists:\n\
10bff6f1 3947There are three such tables, `file-coding-system-alist',\n\
a67a9c66 3948`process-coding-system-alist', and `network-coding-system-alist'.");
4ed46869
KH
3949 Vcoding_system_for_write = Qnil;
3950
3951 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
a67a9c66 3952 "Coding system used in the latest file or process I/O.");
4ed46869
KH
3953 Vlast_coding_system_used = Qnil;
3954
9ce27fde
KH
3955 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
3956 "*Non-nil inhibit code conversion of end-of-line format in any cases.");
3957 inhibit_eol_conversion = 0;
3958
02ba4723
KH
3959 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
3960 "Alist to decide a coding system to use for a file I/O operation.\n\
3961The format is ((PATTERN . VAL) ...),\n\
3962where PATTERN is a regular expression matching a file name,\n\
3963VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3964If VAL is a coding system, it is used for both decoding and encoding\n\
3965the file contents.\n\
3966If VAL is a cons of coding systems, the car part is used for decoding,\n\
3967and the cdr part is used for encoding.\n\
3968If VAL is a function symbol, the function must return a coding system\n\
3969or a cons of coding systems which are used as above.\n\
e0e989f6 3970\n\
9ce27fde 3971See also the function `find-operation-coding-system'.");
02ba4723
KH
3972 Vfile_coding_system_alist = Qnil;
3973
3974 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
3975 "Alist to decide a coding system to use for a process I/O operation.\n\
3976The format is ((PATTERN . VAL) ...),\n\
3977where PATTERN is a regular expression matching a program name,\n\
3978VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3979If VAL is a coding system, it is used for both decoding what received\n\
3980from the program and encoding what sent to the program.\n\
3981If VAL is a cons of coding systems, the car part is used for decoding,\n\
3982and the cdr part is used for encoding.\n\
3983If VAL is a function symbol, the function must return a coding system\n\
3984or a cons of coding systems which are used as above.\n\
4ed46869 3985\n\
9ce27fde 3986See also the function `find-operation-coding-system'.");
02ba4723
KH
3987 Vprocess_coding_system_alist = Qnil;
3988
3989 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
3990 "Alist to decide a coding system to use for a network I/O operation.\n\
3991The format is ((PATTERN . VAL) ...),\n\
3992where PATTERN is a regular expression matching a network service name\n\
3993or is a port number to connect to,\n\
3994VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3995If VAL is a coding system, it is used for both decoding what received\n\
3996from the network stream and encoding what sent to the network stream.\n\
3997If VAL is a cons of coding systems, the car part is used for decoding,\n\
3998and the cdr part is used for encoding.\n\
3999If VAL is a function symbol, the function must return a coding system\n\
4000or a cons of coding systems which are used as above.\n\
4ed46869 4001\n\
9ce27fde 4002See also the function `find-operation-coding-system'.");
02ba4723 4003 Vnetwork_coding_system_alist = Qnil;
4ed46869
KH
4004
4005 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
4006 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
458822a0 4007 eol_mnemonic_unix = ':';
4ed46869
KH
4008
4009 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
4010 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
458822a0 4011 eol_mnemonic_dos = '\\';
4ed46869
KH
4012
4013 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
4014 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
458822a0 4015 eol_mnemonic_mac = '/';
4ed46869
KH
4016
4017 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
4018 "Mnemonic character indicating end-of-line format is not yet decided.");
458822a0 4019 eol_mnemonic_undecided = ':';
4ed46869 4020
bdd9fb48
KH
4021 DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
4022 "Non-nil means ISO 2022 encoder/decoder do character unification.");
4023 Venable_character_unification = Qt;
4024
a5d301df
KH
4025 DEFVAR_LISP ("standard-character-unification-table-for-decode",
4026 &Vstandard_character_unification_table_for_decode,
bdd9fb48 4027 "Table for unifying characters when reading.");
a5d301df 4028 Vstandard_character_unification_table_for_decode = Qnil;
bdd9fb48 4029
a5d301df
KH
4030 DEFVAR_LISP ("standard-character-unification-table-for-encode",
4031 &Vstandard_character_unification_table_for_encode,
bdd9fb48 4032 "Table for unifying characters when writing.");
a5d301df 4033 Vstandard_character_unification_table_for_encode = Qnil;
4ed46869
KH
4034
4035 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
4036 "Alist of charsets vs revision numbers.\n\
4037While encoding, if a charset (car part of an element) is found,\n\
4038designate it with the escape sequence identifing revision (cdr part of the element).");
4039 Vcharset_revision_alist = Qnil;
02ba4723
KH
4040
4041 DEFVAR_LISP ("default-process-coding-system",
4042 &Vdefault_process_coding_system,
4043 "Cons of coding systems used for process I/O by default.\n\
4044The car part is used for decoding a process output,\n\
4045the cdr part is used for encoding a text to be sent to a process.");
4046 Vdefault_process_coding_system = Qnil;
c4825358 4047
3f003981
KH
4048 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
4049 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
c4825358
KH
4050This is a vector of length 256.\n\
4051If Nth element is non-nil, the existence of code N in a file\n\
4052(or output of subprocess) doesn't prevent it to be detected as\n\
3f003981
KH
4053a coding system of ISO 2022 variant which has a flag\n\
4054`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
c4825358
KH
4055or reading output of a subprocess.\n\
4056Only 128th through 159th elements has a meaning.");
3f003981 4057 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
4ed46869
KH
4058}
4059
4060#endif /* emacs */