(install): Fix previous change.
[bpt/emacs.git] / src / coding.c
CommitLineData
4ed46869 1/* Coding system handler (conversion, detection, and etc).
203cb916
RS
2 Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4ed46869 4
369314dc
KH
5This file is part of GNU Emacs.
6
7GNU Emacs is free software; you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation; either version 2, or (at your option)
10any later version.
4ed46869 11
369314dc
KH
12GNU Emacs is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
4ed46869 16
369314dc
KH
17You should have received a copy of the GNU General Public License
18along with GNU Emacs; see the file COPYING. If not, write to
19the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20Boston, MA 02111-1307, USA. */
4ed46869
KH
21
22/*** TABLE OF CONTENTS ***
23
24 1. Preamble
0ef69138 25 2. Emacs' internal format (emacs-mule) handlers
4ed46869
KH
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
28 5. End-of-line handlers
29 6. C library functions
30 7. Emacs Lisp library functions
31 8. Post-amble
32
33*/
34
35/*** GENERAL NOTE on CODING SYSTEM ***
36
37 Coding system is an encoding mechanism of one or more character
38 sets. Here's a list of coding systems which Emacs can handle. When
39 we say "decode", it means converting some other coding system to
0ef69138
KH
40 Emacs' internal format (emacs-mule), and when we say "encode",
41 it means converting the coding system emacs-mule to some other
42 coding system.
4ed46869 43
0ef69138 44 0. Emacs' internal format (emacs-mule)
4ed46869
KH
45
46 Emacs itself holds a multi-lingual character in a buffer and a string
f4dee582 47 in a special format. Details are described in section 2.
4ed46869
KH
48
49 1. ISO2022
50
51 The most famous coding system for multiple character sets. X's
f4dee582
RS
52 Compound Text, various EUCs (Extended Unix Code), and coding
53 systems used in Internet communication such as ISO-2022-JP are
54 all variants of ISO2022. Details are described in section 3.
4ed46869
KH
55
56 2. SJIS (or Shift-JIS or MS-Kanji-Code)
57
58 A coding system to encode character sets: ASCII, JISX0201, and
59 JISX0208. Widely used for PC's in Japan. Details are described in
f4dee582 60 section 4.
4ed46869
KH
61
62 3. BIG5
63
64 A coding system to encode character sets: ASCII and Big5. Widely
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
f4dee582
RS
66 described in section 4. In this file, when we write "BIG5"
67 (all uppercase), we mean the coding system, and when we write
68 "Big5" (capitalized), we mean the character set.
4ed46869 69
27901516
KH
70 4. Raw text
71
72 A coding system for a text containing random 8-bit code. Emacs
73 does no code conversion on such a text except for end-of-line
74 format.
75
76 5. Other
4ed46869 77
f4dee582 78 If a user wants to read/write a text encoded in a coding system not
4ed46869
KH
79 listed above, he can supply a decoder and an encoder for it in CCL
80 (Code Conversion Language) programs. Emacs executes the CCL program
81 while reading/writing.
82
f4dee582 83 Emacs represents a coding-system by a Lisp symbol that has a property
4ed46869
KH
84 `coding-system'. But, before actually using the coding-system, the
85 information about it is set in a structure of type `struct
f4dee582 86 coding_system' for rapid processing. See section 6 for more details.
4ed46869
KH
87
88*/
89
90/*** GENERAL NOTES on END-OF-LINE FORMAT ***
91
92 How end-of-line of a text is encoded depends on a system. For
93 instance, Unix's format is just one byte of `line-feed' code,
f4dee582 94 whereas DOS's format is two-byte sequence of `carriage-return' and
4ed46869
KH
95 `line-feed' codes. MacOS's format is one byte of `carriage-return'.
96
f4dee582
RS
97 Since text characters encoding and end-of-line encoding are
98 independent, any coding system described above can take
4ed46869 99 any format of end-of-line. So, Emacs has information of format of
f4dee582 100 end-of-line in each coding-system. See section 6 for more details.
4ed46869
KH
101
102*/
103
104/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
105
106 These functions check if a text between SRC and SRC_END is encoded
107 in the coding system category XXX. Each returns an integer value in
108 which appropriate flag bits for the category XXX are set. The flag
109 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
110 template of these functions. */
111#if 0
112int
0ef69138 113detect_coding_emacs_mule (src, src_end)
4ed46869
KH
114 unsigned char *src, *src_end;
115{
116 ...
117}
118#endif
119
120/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
121
122 These functions decode SRC_BYTES length text at SOURCE encoded in
0ef69138 123 CODING to Emacs' internal format (emacs-mule). The resulting text
f4dee582
RS
124 goes to a place pointed to by DESTINATION, the length of which should
125 not exceed DST_BYTES. The number of bytes actually processed is
126 returned as *CONSUMED. The return value is the length of the decoded
127 text. Below is a template of these functions. */
4ed46869
KH
128#if 0
129decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
130 struct coding_system *coding;
131 unsigned char *source, *destination;
132 int src_bytes, dst_bytes;
133 int *consumed;
134{
135 ...
136}
137#endif
138
139/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
140
0ef69138
KH
141 These functions encode SRC_BYTES length text at SOURCE of Emacs'
142 internal format (emacs-mule) to CODING. The resulting text goes to
f4dee582
RS
143 a place pointed to by DESTINATION, the length of which should not
144 exceed DST_BYTES. The number of bytes actually processed is
145 returned as *CONSUMED. The return value is the length of the
146 encoded text. Below is a template of these functions. */
4ed46869
KH
147#if 0
148encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
149 struct coding_system *coding;
150 unsigned char *source, *destination;
151 int src_bytes, dst_bytes;
152 int *consumed;
153{
154 ...
155}
156#endif
157
158/*** COMMONLY USED MACROS ***/
159
160/* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
161 THREE_MORE_BYTES safely get one, two, and three bytes from the
162 source text respectively. If there are not enough bytes in the
163 source, they jump to `label_end_of_loop'. The caller should set
164 variables `src' and `src_end' to appropriate areas in advance. */
165
/* Fetch the next source byte into C1 and advance `src'.  If `src' has
   already reached `src_end', jump to `label_end_of_loop' without
   consuming anything.  */
#define ONE_MORE_BYTE(c1)               \
  do {                                  \
    if (src >= src_end)                 \
      goto label_end_of_loop;           \
    (c1) = *src++;                      \
  } while (0)
173
/* Fetch the next two source bytes into C1 and C2 (in that order) and
   advance `src' past them.  If fewer than two bytes remain, jump to
   `label_end_of_loop' without consuming anything.  The remaining
   length is computed as `src_end - src' so that no pointer beyond one
   past the end of the source is ever formed.  */
#define TWO_MORE_BYTES(c1, c2)          \
  do {                                  \
    if (src_end - src < 2)              \
      goto label_end_of_loop;           \
    (c1) = *src++;                      \
    (c2) = *src++;                      \
  } while (0)
181
/* Fetch the next three source bytes into C1, C2 and C3 (in that
   order) and advance `src' past them.  If fewer than three bytes
   remain, jump to `label_end_of_loop' without consuming anything.
   The remaining length is computed as `src_end - src' so that no
   pointer beyond one past the end of the source is ever formed.  */
#define THREE_MORE_BYTES(c1, c2, c3)    \
  do {                                  \
    if (src_end - src < 3)              \
      goto label_end_of_loop;           \
    (c1) = *src++;                      \
    (c2) = *src++;                      \
    (c3) = *src++;                      \
  } while (0)
189
190/* The following three macros DECODE_CHARACTER_ASCII,
191 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
192 the multi-byte form of a character of each class at the place
193 pointed by `dst'. The caller should set the variable `dst' to
194 point to an appropriate area and the variable `coding' to point to
195 the coding-system of the currently decoding text in advance. */
196
/* Decode one ASCII character C: store its internal representation at
   `dst' and advance `dst'.  Outside a composition the byte is stored
   as is; while COMPOSING_P (coding->composing) holds it is stored as
   the 2-byte sequence 0xA0, C | 0x80.  */

#define DECODE_CHARACTER_ASCII(c)               \
  do {                                          \
    if (! COMPOSING_P (coding->composing))      \
      *dst++ = (c);                             \
    else                                        \
      {                                         \
        *dst++ = 0xA0;                          \
        *dst++ = (c) | 0x80;                    \
      }                                         \
  } while (0)
206
/* Decode one DIMENSION1 character whose charset is CHARSET and whose
   position-code is C.  The bytes stored at `dst' are, in order: the
   base leading-code of CHARSET (with 0x20 added while composing), the
   extended leading-code of CHARSET if it is nonzero, and C | 0x80.
   The caller must provide `dst' and `coding'.  */

#define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
  do {                                                                  \
    unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
    if (COMPOSING_P (coding->composing))                                \
      leading_code += 0x20;                                             \
    *dst++ = leading_code;                                              \
    /* Emit the extended leading-code only when the charset has one. */ \
    leading_code = CHARSET_LEADING_CODE_EXT (charset);                  \
    if (leading_code != 0)                                              \
      *dst++ = leading_code;                                            \
    *dst++ = (c) | 0x80;                                                \
  } while (0)
221
/* Decode one DIMENSION2 character whose charset is CHARSET and whose
   position-codes are C1 and C2.  The leading-code(s) and C1 are
   emitted by DECODE_CHARACTER_DIMENSION1; C2 | 0x80 is then appended
   at `dst'.  */

#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
  do {                                                  \
    DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
    *dst++ = (c2) | 0x80;                               \
  } while (0)
230
231\f
232/*** 1. Preamble ***/
233
234#include <stdio.h>
235
236#ifdef emacs
237
238#include <config.h>
239#include "lisp.h"
240#include "buffer.h"
241#include "charset.h"
242#include "ccl.h"
243#include "coding.h"
244#include "window.h"
245
246#else /* not emacs */
247
248#include "mulelib.h"
249
250#endif /* not emacs */
251
252Lisp_Object Qcoding_system, Qeol_type;
253Lisp_Object Qbuffer_file_coding_system;
254Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
27901516 255Lisp_Object Qno_conversion, Qundecided;
4ed46869
KH
256
257extern Lisp_Object Qinsert_file_contents, Qwrite_region;
258Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
259Lisp_Object Qstart_process, Qopen_network_stream;
260Lisp_Object Qtarget_idx;
261
262/* Mnemonic character of each format of end-of-line. */
263int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
264/* Mnemonic character to indicate format of end-of-line is not yet
265 decided. */
266int eol_mnemonic_undecided;
267
9ce27fde
KH
268/* Format of end-of-line decided by system. This is CODING_EOL_LF on
269 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
270int system_eol_type;
271
4ed46869
KH
#ifdef emacs

Lisp_Object Qcoding_system_spec, Qcoding_system_p, Qcoding_system_error;

/* Coding system emacs-mule is for converting only end-of-line format.  */
Lisp_Object Qemacs_mule;

/* Coding-systems are handed between Emacs Lisp programs and C internal
   routines by the following three variables.  */
/* Coding-system for reading files and receiving data from process.  */
Lisp_Object Vcoding_system_for_read;
/* Coding-system for writing files and sending data to process.  */
Lisp_Object Vcoding_system_for_write;
/* Coding-system actually used in the latest I/O.  */
Lisp_Object Vlast_coding_system_used;

/* A vector of length 256 which contains information about special
   Latin codes (especially for dealing with Microsoft code).  */
Lisp_Object Vlatin_extra_code_table;

/* Flag to inhibit code conversion of end-of-line format.  */
int inhibit_eol_conversion;

/* Coding system to be used to encode text for terminal display.  */
struct coding_system terminal_coding;

/* Coding system to be used to encode text for terminal display when
   terminal coding system is nil.  */
struct coding_system safe_terminal_coding;

/* Coding system of what is sent from terminal keyboard.  */
struct coding_system keyboard_coding;

/* NOTE(review): judging by their names, these are alists selecting a
   coding system for file, process and network I/O respectively --
   their structure is not visible here.  */
Lisp_Object Vfile_coding_system_alist;
Lisp_Object Vprocess_coding_system_alist;
Lisp_Object Vnetwork_coding_system_alist;

#endif /* emacs */
310
311Lisp_Object Qcoding_category_index;
312
313/* List of symbols `coding-category-xxx' ordered by priority. */
314Lisp_Object Vcoding_category_list;
315
316/* Table of coding-systems currently assigned to each coding-category. */
317Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
318
319/* Table of names of symbol for each coding-category. */
320char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
0ef69138 321 "coding-category-emacs-mule",
4ed46869
KH
322 "coding-category-sjis",
323 "coding-category-iso-7",
324 "coding-category-iso-8-1",
325 "coding-category-iso-8-2",
7717c392
KH
326 "coding-category-iso-7-else",
327 "coding-category-iso-8-else",
4ed46869 328 "coding-category-big5",
27901516 329 "coding-category-raw-text",
4ed46869
KH
330 "coding-category-binary"
331};
332
bdd9fb48
KH
333/* Flag to tell if we look up unification table on character code
334 conversion. */
335Lisp_Object Venable_character_unification;
a5d301df
KH
336/* Standard unification table to look up on decoding (reading). */
337Lisp_Object Vstandard_character_unification_table_for_decode;
338/* Standard unification table to look up on encoding (writing). */
339Lisp_Object Vstandard_character_unification_table_for_encode;
bdd9fb48
KH
340
341Lisp_Object Qcharacter_unification_table;
a5d301df
KH
342Lisp_Object Qcharacter_unification_table_for_decode;
343Lisp_Object Qcharacter_unification_table_for_encode;
4ed46869
KH
344
345/* Alist of charsets vs revision number. */
346Lisp_Object Vcharset_revision_alist;
347
02ba4723
KH
348/* Default coding systems used for process I/O. */
349Lisp_Object Vdefault_process_coding_system;
350
4ed46869 351\f
0ef69138 352/*** 2. Emacs internal format (emacs-mule) handlers ***/
4ed46869
KH
353
354/* Emacs' internal format for encoding multiple character sets is a
f4dee582
RS
355 kind of multi-byte encoding, i.e. characters are encoded by
356 variable-length sequences of one-byte codes. ASCII characters
357 and control characters (e.g. `tab', `newline') are represented by
358 one-byte sequences which are their ASCII codes, in the range 0x00
359 through 0x7F. The other characters are represented by a sequence
360 of `base leading-code', optional `extended leading-code', and one
361 or two `position-code's. The length of the sequence is determined
362 by the base leading-code. Leading-code takes the range 0x80
363 through 0x9F, whereas extended leading-code and position-code take
364 the range 0xA0 through 0xFF. See `charset.h' for more details
365 about leading-code and position-code.
366
367 There's one exception to this rule. Special leading-code
4ed46869
KH
368 `leading-code-composition' denotes that the following several
369 characters should be composed into one character. Leading-codes of
370 components (except for ASCII) are added 0x20. An ASCII character
371 component is represented by a 2-byte sequence of `0xA0' and
f4dee582
RS
372 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
373 details of composite character. Hence, we can summarize the code
4ed46869
KH
374 range as follows:
375
376 --- CODE RANGE of Emacs' internal format ---
377 (character set) (range)
378 ASCII 0x00 .. 0x7F
379 ELSE (1st byte) 0x80 .. 0x9F
380 (rest bytes) 0xA0 .. 0xFF
381 ---------------------------------------------
382
383 */
384
385enum emacs_code_class_type emacs_code_class[256];
386
/* Consume one byte at *SRC and go to the next statement only if that
   byte is in the range 0xA0 through 0xFF.  If the byte is less than
   0xA0, make the enclosing function return 0.  If SRC has already
   reached SRC_END, jump to `label_end_of_switch' without consuming
   anything.  */
#define CHECK_CODE_RANGE_A0_FF          \
  do {                                  \
    if (src >= src_end)                 \
      goto label_end_of_switch;         \
    if (*src++ < 0xA0)                  \
      return 0;                         \
  } while (0)
396
397/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
398 Check if a text is encoded in Emacs' internal format. If it is,
0ef69138 399 return CODING_CATEGORY_MASK_EMASC_MULE, else return 0. */
4ed46869
KH
400
401int
0ef69138 402detect_coding_emacs_mule (src, src_end)
4ed46869
KH
403 unsigned char *src, *src_end;
404{
405 unsigned char c;
406 int composing = 0;
407
408 while (src < src_end)
409 {
410 c = *src++;
411
412 if (composing)
413 {
414 if (c < 0xA0)
415 composing = 0;
416 else
417 c -= 0x20;
418 }
419
420 switch (emacs_code_class[c])
421 {
422 case EMACS_ascii_code:
423 case EMACS_linefeed_code:
424 break;
425
426 case EMACS_control_code:
427 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
428 return 0;
429 break;
430
431 case EMACS_invalid_code:
432 return 0;
433
434 case EMACS_leading_code_composition: /* c == 0x80 */
435 if (composing)
436 CHECK_CODE_RANGE_A0_FF;
437 else
438 composing = 1;
439 break;
440
441 case EMACS_leading_code_4:
442 CHECK_CODE_RANGE_A0_FF;
443 /* fall down to check it two more times ... */
444
445 case EMACS_leading_code_3:
446 CHECK_CODE_RANGE_A0_FF;
447 /* fall down to check it one more time ... */
448
449 case EMACS_leading_code_2:
450 CHECK_CODE_RANGE_A0_FF;
451 break;
452
453 default:
454 label_end_of_switch:
455 break;
456 }
457 }
0ef69138 458 return CODING_CATEGORY_MASK_EMACS_MULE;
4ed46869
KH
459}
460
461\f
462/*** 3. ISO2022 handlers ***/
463
464/* The following note describes the coding system ISO2022 briefly.
f4dee582
RS
465 Since the intention of this note is to help in understanding of
466 the programs in this file, some parts are NOT ACCURATE or OVERLY
4ed46869
KH
467 SIMPLIFIED. For the thorough understanding, please refer to the
468 original document of ISO2022.
469
470 ISO2022 provides many mechanisms to encode several character sets
f4dee582 471 in 7-bit and 8-bit environment. If one chooses 7-bit environment,
4ed46869 472 all text is encoded by codes of less than 128. This may make the
f4dee582
RS
473 encoded text a little bit longer, but the text gets more stability
474 to pass through several gateways (some of them strip off the MSB).
4ed46869 475
f4dee582 476 There are two kinds of character set: control character set and
4ed46869
KH
477 graphic character set. The former contains control characters such
478 as `newline' and `escape' to provide control functions (control
f4dee582 479 functions are provided also by escape sequences). The latter
4ed46869
KH
480 contains graphic characters such as 'A' and '-'. Emacs recognizes
481 two control character sets and many graphic character sets.
482
483 Graphic character sets are classified into one of the following
484 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
485 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
486 bytes (DIMENSION) and the number of characters in one dimension
487 (CHARS) of the set. In addition, each character set is assigned an
488 identification tag (called "final character" and denoted as <F>
489 here after) which is unique in each class. <F> of each character
490 set is decided by ECMA(*) when it is registered in ISO. Code range
491 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
492
493 Note (*): ECMA = European Computer Manufacturers Association
494
495 Here are examples of graphic character set [NAME(<F>)]:
496 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
497 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
498 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
499 o DIMENSION2_CHARS96 -- none for the moment
500
501 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
502 C0 [0x00..0x1F] -- control character plane 0
503 GL [0x20..0x7F] -- graphic character plane 0
504 C1 [0x80..0x9F] -- control character plane 1
505 GR [0xA0..0xFF] -- graphic character plane 1
506
507 A control character set is directly designated and invoked to C0 or
508 C1 by an escape sequence. The most common case is that ISO646's
509 control character set is designated/invoked to C0 and ISO6429's
510 control character set is designated/invoked to C1, and usually
511 these designations/invocations are omitted in a coded text. With
512 7-bit environment, only C0 can be used, and a control character for
513 C1 is encoded by an appropriate escape sequence to fit in the
514 environment. Every control character for C1 has a
515 corresponding escape sequence defined.
516
517 A graphic character set is at first designated to one of four
518 graphic registers (G0 through G3), then these graphic registers are
519 invoked to GL or GR. These designations and invocations can be
520 done independently. The most common case is that G0 is invoked to
521 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
522 these invocations and designations are omitted in a coded text.
523 With 7-bit environment, only GL can be used.
524
525 When a graphic character set of CHARS94 is invoked to GL, code 0x20
526 and 0x7F of GL area work as control characters SPACE and DEL
527 respectively, and code 0xA0 and 0xFF of GR area should not be used.
528
529 There are two ways of invocation: locking-shift and single-shift.
530 With locking-shift, the invocation lasts until the next different
531 invocation, whereas with single-shift, the invocation works only
532 for the following character and doesn't affect locking-shift.
533 Invocations are done by the following control characters or escape
534 sequences.
535
536 ----------------------------------------------------------------------
537 function control char escape sequence description
538 ----------------------------------------------------------------------
539 SI (shift-in) 0x0F none invoke G0 to GL
10bff6f1 540 SO (shift-out) 0x0E none invoke G1 to GL
4ed46869
KH
541 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
542 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
543 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
544 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
545 ----------------------------------------------------------------------
546 The first four are for locking-shift. Control characters for these
547 functions are defined by macros ISO_CODE_XXX in `coding.h'.
548
549 Designations are done by the following escape sequences.
550 ----------------------------------------------------------------------
551 escape sequence description
552 ----------------------------------------------------------------------
553 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
554 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
555 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
556 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
557 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
558 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
559 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
560 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
561 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
562 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
563 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
564 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
565 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
566 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
567 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
568 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
569 ----------------------------------------------------------------------
570
571 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
572 of dimension 1, chars 94, and final character <F>, and etc.
573
574 Note (*): Although these designations are not allowed in ISO2022,
575 Emacs accepts them on decoding, and produces them on encoding
576 CHARS96 character set in a coding system which is characterized as
577 7-bit environment, non-locking-shift, and non-single-shift.
578
579 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
580 '(' can be omitted. We call this as "short-form" here after.
581
582 Now you may notice that there are a lot of ways for encoding the
f4dee582 583 same multilingual text in ISO2022. Actually, there exists many
4ed46869
KH
584 coding systems such as Compound Text (used in X's inter client
585 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
586 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
587 localized platforms), and all of these are variants of ISO2022.
588
589 In addition to the above, Emacs handles two more kinds of escape
590 sequences: ISO6429's direction specification and Emacs' private
591 sequence for specifying character composition.
592
593 ISO6429's direction specification takes the following format:
594 o CSI ']' -- end of the current direction
595 o CSI '0' ']' -- end of the current direction
596 o CSI '1' ']' -- start of left-to-right text
597 o CSI '2' ']' -- start of right-to-left text
598 The control character CSI (0x9B: control sequence introducer) is
599 abbreviated to the escape sequence ESC '[' in 7-bit environment.
600
601 Character composition specification takes the following format:
602 o ESC '0' -- start character composition
603 o ESC '1' -- end character composition
604 Since these are not standard escape sequences of any ISO, the use
605 of them for these meaning is restricted to Emacs only. */
606
607enum iso_code_class_type iso_code_class[256];
608
609/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
610 Check if a text is encoded in ISO2022. If it is, returns an
611 integer in which appropriate flag bits any of:
612 CODING_CATEGORY_MASK_ISO_7
613 CODING_CATEGORY_MASK_ISO_8_1
614 CODING_CATEGORY_MASK_ISO_8_2
7717c392
KH
615 CODING_CATEGORY_MASK_ISO_7_ELSE
616 CODING_CATEGORY_MASK_ISO_8_ELSE
4ed46869
KH
617 are set. If a code which should never appear in ISO2022 is found,
618 returns 0. */
619
620int
621detect_coding_iso2022 (src, src_end)
622 unsigned char *src, *src_end;
623{
765a2ca5
KH
624 int mask = (CODING_CATEGORY_MASK_ISO_7
625 | CODING_CATEGORY_MASK_ISO_8_1
626 | CODING_CATEGORY_MASK_ISO_8_2
7717c392
KH
627 | CODING_CATEGORY_MASK_ISO_7_ELSE
628 | CODING_CATEGORY_MASK_ISO_8_ELSE
629 );
bcf26d6a
KH
630 int g1 = 0; /* 1 iff designating to G1. */
631 int c, i;
3f003981 632 struct coding_system coding_iso_8_1, coding_iso_8_2;
4ed46869 633
3f003981
KH
634 /* Coding systems of these categories may accept latin extra codes. */
635 setup_coding_system
636 (XSYMBOL (coding_category_table[CODING_CATEGORY_IDX_ISO_8_1])->value,
637 &coding_iso_8_1);
638 setup_coding_system
639 (XSYMBOL (coding_category_table[CODING_CATEGORY_IDX_ISO_8_2])->value,
640 &coding_iso_8_2);
641
642 while (mask && src < src_end)
4ed46869
KH
643 {
644 c = *src++;
645 switch (c)
646 {
647 case ISO_CODE_ESC:
e0e989f6 648 if (src >= src_end)
4ed46869
KH
649 break;
650 c = *src++;
bf9cdd4e 651 if ((c >= '(' && c <= '/'))
4ed46869 652 {
bf9cdd4e
KH
653 /* Designation sequence for a charset of dimension 1. */
654 if (src >= src_end)
655 break;
656 c = *src++;
657 if (c < ' ' || c >= 0x80)
658 /* Invalid designation sequence. */
659 return 0;
660 }
661 else if (c == '$')
662 {
663 /* Designation sequence for a charset of dimension 2. */
664 if (src >= src_end)
665 break;
666 c = *src++;
667 if (c >= '@' && c <= 'B')
668 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
669 ;
670 else if (c >= '(' && c <= '/')
bcf26d6a 671 {
bf9cdd4e
KH
672 if (src >= src_end)
673 break;
674 c = *src++;
675 if (c < ' ' || c >= 0x80)
676 /* Invalid designation sequence. */
677 return 0;
bcf26d6a 678 }
bf9cdd4e
KH
679 else
680 /* Invalid designation sequence. */
681 return 0;
4ed46869 682 }
4ed46869 683 else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
bf9cdd4e 684 /* Locking shift. */
7717c392
KH
685 mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
686 | CODING_CATEGORY_MASK_ISO_8_ELSE);
bf9cdd4e
KH
687 else if (c == '0' || c == '1' || c == '2')
688 /* Start/end composition. */
689 ;
690 else
691 /* Invalid escape sequence. */
692 return 0;
4ed46869
KH
693 break;
694
4ed46869 695 case ISO_CODE_SO:
bf9cdd4e
KH
696 mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
697 | CODING_CATEGORY_MASK_ISO_8_ELSE);
e0e989f6
KH
698 break;
699
4ed46869
KH
700 case ISO_CODE_CSI:
701 case ISO_CODE_SS2:
702 case ISO_CODE_SS3:
3f003981
KH
703 {
704 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
705
706 if (VECTORP (Vlatin_extra_code_table)
707 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
708 {
709 if (coding_iso_8_1.flags & CODING_FLAG_ISO_LATIN_EXTRA)
710 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
711 if (coding_iso_8_2.flags & CODING_FLAG_ISO_LATIN_EXTRA)
712 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
713 }
714 mask &= newmask;
715 }
716 break;
4ed46869
KH
717
718 default:
719 if (c < 0x80)
720 break;
721 else if (c < 0xA0)
c4825358 722 {
3f003981
KH
723 if (VECTORP (Vlatin_extra_code_table)
724 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
c4825358 725 {
3f003981
KH
726 int newmask = 0;
727
728 if (coding_iso_8_1.flags & CODING_FLAG_ISO_LATIN_EXTRA)
729 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
730 if (coding_iso_8_2.flags & CODING_FLAG_ISO_LATIN_EXTRA)
731 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
732 mask &= newmask;
c4825358 733 }
3f003981
KH
734 else
735 return 0;
c4825358 736 }
4ed46869
KH
737 else
738 {
7717c392 739 unsigned char *src_begin = src;
4ed46869 740
7717c392
KH
741 mask &= ~(CODING_CATEGORY_MASK_ISO_7
742 | CODING_CATEGORY_MASK_ISO_7_ELSE);
e0e989f6 743 while (src < src_end && *src >= 0xA0)
7717c392
KH
744 src++;
745 if ((src - src_begin - 1) & 1 && src < src_end)
4ed46869
KH
746 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
747 }
748 break;
749 }
750 }
751
752 return mask;
753}
754
/* Decode a character of which charset is CHARSET and the 1st position
   code is C1.  If dimension of CHARSET is 2, the 2nd position code is
   fetched from SRC and set to C2.  If CHARSET is negative, it means
   that we are decoding ill formed text, and what we can do is just to
   read C1 as is.

   Besides its arguments, the macro uses the caller's variables
   `coding', `src', `src_end', `dst', `c2' and `unification_table',
   and (through ONE_MORE_BYTE) the label `label_end_of_loop'.  At the
   head of a composition it first emits LEADING_CODE_COMPOSITION.  If
   `unification_table' is non-nil and unify_char returns a
   non-negative character, that character is decoded instead of the
   original one.

   NOTE(review): for a charset of dimension 1, `c2' is passed to
   unify_char without having been set by this macro -- confirm that
   this is harmless.  */

#define DECODE_ISO_CHARACTER(charset, c1)                               \
  do {                                                                  \
    int c_alt, charset_alt = (charset);                                 \
    if (COMPOSING_HEAD_P (coding->composing))                           \
      {                                                                 \
        *dst++ = LEADING_CODE_COMPOSITION;                              \
        if (COMPOSING_WITH_RULE_P (coding->composing))                  \
          /* To tell composition rules are embedded.  */                \
          *dst++ = 0xFF;                                                \
        coding->composing += 2;                                         \
      }                                                                 \
    if ((charset) >= 0)                                                 \
      {                                                                 \
        if (CHARSET_DIMENSION (charset) == 2)                           \
          ONE_MORE_BYTE (c2);                                           \
        if (!NILP (unification_table)                                   \
            && ((c_alt = unify_char (unification_table,                 \
                                     -1, (charset), c1, c2)) >= 0))     \
          SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
      }                                                                 \
    if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
      DECODE_CHARACTER_ASCII (c1);                                      \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
      DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
    else                                                                \
      DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
    if (COMPOSING_WITH_RULE_P (coding->composing))                      \
      /* To tell a composition rule follows.  */                        \
      coding->composing = COMPOSING_WITH_RULE_RULE;                     \
  } while (0)
791
/* Set designation state into CODING: designate to graphic register
   REG the charset that ISO_CHARSET_TABLE gives for DIMENSION, CHARS
   and FINAL_CHAR.  If that lookup yields a negative value, the
   designation is left unchanged.  When coding->direction is 1 and the
   charset has a reverse charset, the reverse charset is designated
   instead.  The local `charset' hides any variable of that name in
   the caller.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
  do {                                                                     \
    int charset = ISO_CHARSET_TABLE (make_number (dimension),              \
                                     make_number (chars),                  \
                                     make_number (final_char));            \
    if (charset >= 0)                                                      \
      {                                                                    \
        if (coding->direction == 1                                         \
            && CHARSET_REVERSE_CHARSET (charset) >= 0)                     \
          charset = CHARSET_REVERSE_CHARSET (charset);                     \
        CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;               \
      }                                                                    \
  } while (0)
806
807/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
808
809int
810decode_coding_iso2022 (coding, source, destination,
811 src_bytes, dst_bytes, consumed)
812 struct coding_system *coding;
813 unsigned char *source, *destination;
814 int src_bytes, dst_bytes;
815 int *consumed;
816{
817 unsigned char *src = source;
818 unsigned char *src_end = source + src_bytes;
819 unsigned char *dst = destination;
820 unsigned char *dst_end = destination + dst_bytes;
821 /* Since the maximum bytes produced by each loop is 7, we subtract 6
822 from DST_END to assure that overflow checking is necessary only
823 at the head of loop. */
824 unsigned char *adjusted_dst_end = dst_end - 6;
825 int charset;
826 /* Charsets invoked to graphic plane 0 and 1 respectively. */
827 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
828 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
a5d301df
KH
829 Lisp_Object unification_table
830 = coding->character_unification_table_for_decode;
bdd9fb48
KH
831
832 if (!NILP (Venable_character_unification) && NILP (unification_table))
a5d301df 833 unification_table = Vstandard_character_unification_table_for_decode;
4ed46869
KH
834
835 while (src < src_end && dst < adjusted_dst_end)
836 {
837 /* SRC_BASE remembers the start position in source in each loop.
838 The loop will be exited when there's not enough source text
839 to analyze long escape sequence or 2-byte code (within macros
840 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
841 to SRC_BASE before exiting. */
842 unsigned char *src_base = src;
bdd9fb48 843 int c1 = *src++, c2;
4ed46869
KH
844
845 switch (iso_code_class [c1])
846 {
847 case ISO_0x20_or_0x7F:
848 if (!coding->composing
849 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
850 {
851 /* This is SPACE or DEL. */
852 *dst++ = c1;
853 break;
854 }
855 /* This is a graphic character, we fall down ... */
856
857 case ISO_graphic_plane_0:
858 if (coding->composing == COMPOSING_WITH_RULE_RULE)
859 {
860 /* This is a composition rule. */
861 *dst++ = c1 | 0x80;
862 coding->composing = COMPOSING_WITH_RULE_TAIL;
863 }
864 else
865 DECODE_ISO_CHARACTER (charset0, c1);
866 break;
867
868 case ISO_0xA0_or_0xFF:
869 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
870 {
871 /* Invalid code. */
872 *dst++ = c1;
873 break;
874 }
875 /* This is a graphic character, we fall down ... */
876
877 case ISO_graphic_plane_1:
878 DECODE_ISO_CHARACTER (charset1, c1);
879 break;
880
881 case ISO_control_code:
882 /* All ISO2022 control characters in this class have the
883 same representation in Emacs internal format. */
884 *dst++ = c1;
885 break;
886
887 case ISO_carriage_return:
888 if (coding->eol_type == CODING_EOL_CR)
889 {
890 *dst++ = '\n';
891 }
892 else if (coding->eol_type == CODING_EOL_CRLF)
893 {
894 ONE_MORE_BYTE (c1);
895 if (c1 == ISO_CODE_LF)
896 *dst++ = '\n';
897 else
898 {
899 src--;
900 *dst++ = c1;
901 }
902 }
903 else
904 {
905 *dst++ = c1;
906 }
907 break;
908
909 case ISO_shift_out:
e0e989f6
KH
910 if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
911 goto label_invalid_escape_sequence;
4ed46869
KH
912 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
913 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
914 break;
915
916 case ISO_shift_in:
917 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
918 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
919 break;
920
921 case ISO_single_shift_2_7:
922 case ISO_single_shift_2:
923 /* SS2 is handled as an escape sequence of ESC 'N' */
924 c1 = 'N';
925 goto label_escape_sequence;
926
927 case ISO_single_shift_3:
928 /* SS2 is handled as an escape sequence of ESC 'O' */
929 c1 = 'O';
930 goto label_escape_sequence;
931
932 case ISO_control_sequence_introducer:
933 /* CSI is handled as an escape sequence of ESC '[' ... */
934 c1 = '[';
935 goto label_escape_sequence;
936
937 case ISO_escape:
938 ONE_MORE_BYTE (c1);
939 label_escape_sequence:
940 /* Escape sequences handled by Emacs are invocation,
941 designation, direction specification, and character
942 composition specification. */
943 switch (c1)
944 {
945 case '&': /* revision of following character set */
946 ONE_MORE_BYTE (c1);
947 if (!(c1 >= '@' && c1 <= '~'))
e0e989f6 948 goto label_invalid_escape_sequence;
4ed46869
KH
949 ONE_MORE_BYTE (c1);
950 if (c1 != ISO_CODE_ESC)
e0e989f6 951 goto label_invalid_escape_sequence;
4ed46869
KH
952 ONE_MORE_BYTE (c1);
953 goto label_escape_sequence;
954
955 case '$': /* designation of 2-byte character set */
956 ONE_MORE_BYTE (c1);
957 if (c1 >= '@' && c1 <= 'B')
958 { /* designation of JISX0208.1978, GB2312.1980,
959 or JISX0208.1980 */
960 DECODE_DESIGNATION (0, 2, 94, c1);
961 }
962 else if (c1 >= 0x28 && c1 <= 0x2B)
963 { /* designation of DIMENSION2_CHARS94 character set */
964 ONE_MORE_BYTE (c2);
965 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
966 }
967 else if (c1 >= 0x2C && c1 <= 0x2F)
968 { /* designation of DIMENSION2_CHARS96 character set */
969 ONE_MORE_BYTE (c2);
970 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
971 }
972 else
e0e989f6 973 goto label_invalid_escape_sequence;
4ed46869
KH
974 break;
975
976 case 'n': /* invocation of locking-shift-2 */
e0e989f6
KH
977 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
978 goto label_invalid_escape_sequence;
4ed46869 979 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
e0e989f6 980 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
981 break;
982
983 case 'o': /* invocation of locking-shift-3 */
e0e989f6
KH
984 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
985 goto label_invalid_escape_sequence;
4ed46869 986 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
e0e989f6 987 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
988 break;
989
990 case 'N': /* invocation of single-shift-2 */
e0e989f6
KH
991 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
992 goto label_invalid_escape_sequence;
4ed46869
KH
993 ONE_MORE_BYTE (c1);
994 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
995 DECODE_ISO_CHARACTER (charset, c1);
996 break;
997
998 case 'O': /* invocation of single-shift-3 */
e0e989f6
KH
999 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1000 goto label_invalid_escape_sequence;
4ed46869
KH
1001 ONE_MORE_BYTE (c1);
1002 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1003 DECODE_ISO_CHARACTER (charset, c1);
1004 break;
1005
1006 case '0': /* start composing without embeded rules */
1007 coding->composing = COMPOSING_NO_RULE_HEAD;
1008 break;
1009
1010 case '1': /* end composing */
1011 coding->composing = COMPOSING_NO;
1012 break;
1013
1014 case '2': /* start composing with embeded rules */
1015 coding->composing = COMPOSING_WITH_RULE_HEAD;
1016 break;
1017
1018 case '[': /* specification of direction */
1019 /* For the moment, nested direction is not supported.
1020 So, the value of `coding->direction' is 0 or 1: 0
1021 means left-to-right, 1 means right-to-left. */
1022 ONE_MORE_BYTE (c1);
1023 switch (c1)
1024 {
1025 case ']': /* end of the current direction */
1026 coding->direction = 0;
1027
1028 case '0': /* end of the current direction */
1029 case '1': /* start of left-to-right direction */
1030 ONE_MORE_BYTE (c1);
1031 if (c1 == ']')
1032 coding->direction = 0;
1033 else
1034 goto label_invalid_escape_sequence;
1035 break;
1036
1037 case '2': /* start of right-to-left direction */
1038 ONE_MORE_BYTE (c1);
1039 if (c1 == ']')
1040 coding->direction= 1;
1041 else
1042 goto label_invalid_escape_sequence;
1043 break;
1044
1045 default:
1046 goto label_invalid_escape_sequence;
1047 }
1048 break;
1049
1050 default:
1051 if (c1 >= 0x28 && c1 <= 0x2B)
1052 { /* designation of DIMENSION1_CHARS94 character set */
1053 ONE_MORE_BYTE (c2);
1054 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1055 }
1056 else if (c1 >= 0x2C && c1 <= 0x2F)
1057 { /* designation of DIMENSION1_CHARS96 character set */
1058 ONE_MORE_BYTE (c2);
1059 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1060 }
1061 else
1062 {
1063 goto label_invalid_escape_sequence;
1064 }
1065 }
1066 /* We must update these variables now. */
1067 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1068 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1069 break;
1070
1071 label_invalid_escape_sequence:
1072 {
1073 int length = src - src_base;
1074
1075 bcopy (src_base, dst, length);
1076 dst += length;
1077 }
1078 }
1079 continue;
1080
1081 label_end_of_loop:
1082 coding->carryover_size = src - src_base;
1083 bcopy (src_base, coding->carryover, coding->carryover_size);
1084 src = src_base;
1085 break;
1086 }
1087
1088 /* If this is the last block of the text to be decoded, we had
1089 better just flush out all remaining codes in the text although
1090 they are not valid characters. */
1091 if (coding->last_block)
1092 {
1093 bcopy (src, dst, src_end - src);
1094 dst += (src_end - src);
1095 src = src_end;
1096 }
1097 *consumed = src - source;
1098 return dst - destination;
1099}
1100
f4dee582 1101/* ISO2022 encoding stuff. */
4ed46869
KH
1102
1103/*
f4dee582 1104 It is not enough to say just "ISO2022" on encoding, we have to
4ed46869
KH
1105 specify more details. In Emacs, each coding-system of ISO2022
1106 variant has the following specifications:
1107 1. Initial designation to G0 thru G3.
1108 2. Allows short-form designation?
1109 3. ASCII should be designated to G0 before control characters?
1110 4. ASCII should be designated to G0 at end of line?
1111 5. 7-bit environment or 8-bit environment?
1112 6. Use locking-shift?
1113 7. Use Single-shift?
1114 And the following two are only for Japanese:
1115 8. Use ASCII in place of JIS0201-1976-Roman?
1116 9. Use JISX0208-1983 in place of JISX0208-1978?
1117 These specifications are encoded in `coding->flags' as flag bits
1118 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
f4dee582 1119 details.
4ed46869
KH
1120*/
1121
/* Emit at DST the escape sequence designating CHARSET to graphic
   register REG, and record that designation in CODING.

   If Vcharset_revision_alist has an entry for CHARSET, the sequence
   `ESC & <revision>' is emitted first.  For a two-dimensional 94-char
   set designated to register 0 whose <final-char> is '@', 'A' or 'B',
   the intermediate character is omitted (short form) when CODING has
   CODING_FLAG_ISO_SHORT_FORM set.  A variable `dst' must be in scope
   where the macro is used.  */

#define ENCODE_DESIGNATION(charset, reg, coding) \
  do { \
    unsigned char desig_final = CHARSET_ISO_FINAL_CHAR (charset); \
    int desig_is_94 = CHARSET_CHARS (charset) == 94; \
    /* Intermediate characters, indexed by graphic register.  */ \
    char *desig_intermediates = desig_is_94 ? "()*+" : ",-./"; \
    int desig_short_form = 0; \
    Lisp_Object desig_revision \
      = Fassq (make_number (charset), Vcharset_revision_alist); \
    if (! NILP (desig_revision)) \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = '&'; \
        *dst++ = XINT (XCONS (desig_revision)->cdr) + '@'; \
      } \
    *dst++ = ISO_CODE_ESC; \
    if (CHARSET_DIMENSION (charset) != 1) \
      { \
        *dst++ = '$'; \
        desig_short_form \
          = (desig_is_94 \
             && (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
             && reg == 0 \
             && desig_final >= '@' && desig_final <= 'B'); \
      } \
    if (! desig_short_form) \
      *dst++ = (unsigned char) (desig_intermediates[reg]); \
    *dst++ = desig_final; \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
  } while (0)
1164
/* Single-shift-2 and single-shift-3.  Each macro writes the shift
   code at DST -- `ESC N' / `ESC O' when CODING has
   CODING_FLAG_ISO_SEVEN_BITS set, otherwise the single control code
   ISO_CODE_SS2 / ISO_CODE_SS3 -- and then marks CODING as being in
   the single-shifting state.  */

#define ENCODE_SINGLE_SHIFT_2 \
  do { \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = 'N'; \
      } \
    else \
      { \
        *dst++ = ISO_CODE_SS2; \
      } \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3 \
  do { \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = 'O'; \
      } \
    else \
      { \
        *dst++ = ISO_CODE_SS3; \
      } \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
  } while (0)
1186
/* Locking shifts.  Each of the four macros below writes one shift
   code at DST (SI, SO, `ESC n' or `ESC o') and records in CODING
   which graphic register (0, 1, 2 or 3 respectively) is now invoked
   to graphic plane 0.  */

#define ENCODE_SHIFT_IN \
  do { \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
    *dst++ = ISO_CODE_SI; \
  } while (0)

#define ENCODE_SHIFT_OUT \
  do { \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
    *dst++ = ISO_CODE_SO; \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2 \
  do { \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
    *dst++ = ISO_CODE_ESC; \
    *dst++ = 'n'; \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3 \
  do { \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
    *dst++ = ISO_CODE_ESC; \
    *dst++ = 'o'; \
  } while (0)
1214
f4dee582
RS
/* Write at DST the code for a one-byte character of CHARSET whose
   position-code is C1, emitting any designation or invocation
   sequence that is needed first.

   The body is a retry loop: as long as CHARSET is neither
   single-shifted nor invoked to graphic plane 0 or 1 (and is not
   rejected by CODING_FLAG_ISO_SAFE), encode_invocation_designation is
   called and the checks are made again.  Variables `coding' and `dst'
   must be in scope where the macro is used.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
  do { \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
      { \
        /* A single-shift code has just been produced.  */ \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
                  ? (c1 & 0x7F) : (c1 | 0x80)); \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
        break; \
      } \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
      { \
        *dst++ = c1 & 0x7F; \
        break; \
      } \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
      { \
        *dst++ = c1 | 0x80; \
        break; \
      } \
    if ((coding->flags & CODING_FLAG_ISO_SAFE) \
        && !CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset]) \
      { \
        /* This character must not be encoded; write one substitution \
           code, or two when the charset's width is 2.  */ \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
        if (CHARSET_WIDTH (charset) == 2) \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
        break; \
      } \
    /* CHARSET is not invoked to any graphic plane yet: designate \
       and/or invoke it, then go round again to write the character.  */ \
    dst = encode_invocation_designation (charset, coding, dst); \
  } while (1)
1258
f4dee582
RS
/* Write at DST the codes for a two-byte character of CHARSET whose
   position-codes are C1 and C2, emitting any designation or
   invocation sequence that is needed first.

   The body is a retry loop: as long as CHARSET is neither
   single-shifted nor invoked to graphic plane 0 or 1 (and is not
   rejected by CODING_FLAG_ISO_SAFE), encode_invocation_designation is
   called and the checks are made again.  Variables `coding' and `dst'
   must be in scope where the macro is used.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
  do { \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
      { \
        /* A single-shift code has just been produced.  */ \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
          { \
            *dst++ = c1 & 0x7F; \
            *dst++ = c2 & 0x7F; \
          } \
        else \
          { \
            *dst++ = c1 | 0x80; \
            *dst++ = c2 | 0x80; \
          } \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
        break; \
      } \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
      { \
        *dst++ = c1 & 0x7F; \
        *dst++ = c2 & 0x7F; \
        break; \
      } \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
      { \
        *dst++ = c1 | 0x80; \
        *dst++ = c2 | 0x80; \
        break; \
      } \
    if ((coding->flags & CODING_FLAG_ISO_SAFE) \
        && !CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset]) \
      { \
        /* This character must not be encoded; write one substitution \
           code, or two when the charset's width is 2.  */ \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
        if (CHARSET_WIDTH (charset) == 2) \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
        break; \
      } \
    /* CHARSET is not invoked to any graphic plane yet: designate \
       and/or invoke it, then go round again to write the character.  */ \
    dst = encode_invocation_designation (charset, coding, dst); \
  } while (1)
1301
bdd9fb48
KH
/* Encode one character of CHARSET with position-codes C1 and C2 (C2
   is a dummy for a one-byte charset).  When `unification_table' is
   non-nil and unify_char returns a non-negative character for it,
   that character's charset and position-codes (written back into C1
   and C2 by SPLIT_CHAR) are encoded instead.  The work is then handed
   to the DIMENSION1 or DIMENSION2 macro according to the dimension of
   the resulting charset.  */
#define ENCODE_ISO_CHARACTER(charset, c1, c2) \
  do { \
    int effective_charset = charset; \
    int unified_char = -1; \
    if (!NILP (unification_table)) \
      unified_char = unify_char (unification_table, -1, charset, c1, c2); \
    if (unified_char >= 0) \
      SPLIT_CHAR (unified_char, effective_charset, c1, c2); \
    if (CHARSET_DIMENSION (effective_charset) == 1) \
      ENCODE_ISO_CHARACTER_DIMENSION1 (effective_charset, c1); \
    else \
      ENCODE_ISO_CHARACTER_DIMENSION2 (effective_charset, c1, c2); \
  } while (0)
1316
4ed46869
KH
1317/* Produce designation and invocation codes at a place pointed by DST
1318 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1319 Return new DST. */
1320
1321unsigned char *
1322encode_invocation_designation (charset, coding, dst)
1323 int charset;
1324 struct coding_system *coding;
1325 unsigned char *dst;
1326{
1327 int reg; /* graphic register number */
1328
1329 /* At first, check designations. */
1330 for (reg = 0; reg < 4; reg++)
1331 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1332 break;
1333
1334 if (reg >= 4)
1335 {
1336 /* CHARSET is not yet designated to any graphic registers. */
1337 /* At first check the requested designation. */
1338 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1ba9e4ab
KH
1339 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1340 /* Since CHARSET requests no special designation, designate it
1341 to graphic register 0. */
4ed46869
KH
1342 reg = 0;
1343
1344 ENCODE_DESIGNATION (charset, reg, coding);
1345 }
1346
1347 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1348 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1349 {
1350 /* Since the graphic register REG is not invoked to any graphic
1351 planes, invoke it to graphic plane 0. */
1352 switch (reg)
1353 {
1354 case 0: /* graphic register 0 */
1355 ENCODE_SHIFT_IN;
1356 break;
1357
1358 case 1: /* graphic register 1 */
1359 ENCODE_SHIFT_OUT;
1360 break;
1361
1362 case 2: /* graphic register 2 */
1363 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1364 ENCODE_SINGLE_SHIFT_2;
1365 else
1366 ENCODE_LOCKING_SHIFT_2;
1367 break;
1368
1369 case 3: /* graphic register 3 */
1370 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1371 ENCODE_SINGLE_SHIFT_3;
1372 else
1373 ENCODE_LOCKING_SHIFT_3;
1374 break;
1375 }
1376 }
1377 return dst;
1378}
1379
/* The following three macros produce codes for indicating
   composition: start without embedded rules (`ESC 0'), start with
   embedded rules (`ESC 2'), and end (`ESC 1').  */
#define ENCODE_COMPOSITION_NO_RULE_START  *dst++ = ISO_CODE_ESC, *dst++ = '0'
#define ENCODE_COMPOSITION_WITH_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '2'
#define ENCODE_COMPOSITION_END    *dst++ = ISO_CODE_ESC, *dst++ = '1'

/* The following three macros produce codes for indicating direction
   of text.  Variables `coding' and `dst' must be in scope where they
   are used.  */

/* Write a control sequence introducer at DST: `ESC [' when CODING has
   CODING_FLAG_ISO_SEVEN_BITS set, otherwise the single code
   ISO_CODE_CSI.  The flag is tested as a bit of `coding->flags'; a
   comparison with `==' would fail as soon as any other flag is set.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
  do { \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
      *dst++ = ISO_CODE_ESC, *dst++ = '['; \
    else \
      *dst++ = ISO_CODE_CSI; \
  } while (0)

/* Write `CSI 2 ]' (start of right-to-left text) at DST.  This is a
   statement, because ENCODE_CONTROL_SEQUENCE_INTRODUCER is a
   `do ... while (0)' statement and cannot be an operand of the comma
   operator.  */
#define ENCODE_DIRECTION_R2L \
  do { \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER; \
    *dst++ = '2'; \
    *dst++ = ']'; \
  } while (0)

/* Write `CSI 0 ]' (end of the current direction) at DST.  */
#define ENCODE_DIRECTION_L2R \
  do { \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER; \
    *dst++ = '0'; \
    *dst++ = ']'; \
  } while (0)
1400
/* Write at DST the codes that bring the graphic plane and registers
   of CODING back to their initial state: a shift-in if register 0 is
   not the one invoked to plane 0, then a designation for every
   register whose initial charset is non-negative and differs from
   the charset currently designated to it.  */
#define ENCODE_RESET_PLANE_AND_REGISTER \
  do { \
    int reg = 0; \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
      ENCODE_SHIFT_IN; \
    while (reg < 4) \
      { \
        int initial = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg); \
        if (initial >= 0 \
            && initial != CODING_SPEC_ISO_DESIGNATION (coding, reg)) \
          ENCODE_DESIGNATION (initial, reg, coding); \
        reg++; \
      } \
  } while (0)
1415
bdd9fb48
KH
1416/* Produce designation sequences of charsets in the line started from
1417 *SRC to a place pointed by DSTP.
1418
1419 If the current block ends before any end-of-line, we may fail to
1420 find all the necessary *designations. */
1421encode_designation_at_bol (coding, table, src, src_end, dstp)
e0e989f6 1422 struct coding_system *coding;
bdd9fb48 1423 Lisp_Object table;
e0e989f6
KH
1424 unsigned char *src, *src_end, **dstp;
1425{
bdd9fb48
KH
1426 int charset, c, found = 0, reg;
1427 /* Table of charsets to be designated to each graphic register. */
1428 int r[4];
1429 unsigned char *dst = *dstp;
1430
1431 for (reg = 0; reg < 4; reg++)
1432 r[reg] = -1;
1433
1434 while (src < src_end && *src != '\n' && found < 4)
e0e989f6 1435 {
bdd9fb48
KH
1436 int bytes = BYTES_BY_CHAR_HEAD (*src);
1437
1438 if (NILP (table))
1439 charset = CHARSET_AT (src);
1440 else
e0e989f6 1441 {
bdd9fb48
KH
1442 int c_alt, c1, c2;
1443
1444 SPLIT_STRING(src, bytes, charset, c1, c2);
1445 if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1446 charset = CHAR_CHARSET (c_alt);
e0e989f6 1447 }
bdd9fb48 1448
e0e989f6 1449 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1ba9e4ab 1450 if (r[reg] == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
bdd9fb48
KH
1451 {
1452 found++;
1453 r[reg] = charset;
1454 }
1455
1456 src += bytes;
1457 }
1458
1459 if (found)
1460 {
1461 for (reg = 0; reg < 4; reg++)
1462 if (r[reg] >= 0
1463 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1464 ENCODE_DESIGNATION (r[reg], reg, coding);
1465 *dstp = dst;
e0e989f6 1466 }
e0e989f6
KH
1467}
1468
4ed46869
KH
1469/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1470
1471int
1472encode_coding_iso2022 (coding, source, destination,
1473 src_bytes, dst_bytes, consumed)
1474 struct coding_system *coding;
1475 unsigned char *source, *destination;
1476 int src_bytes, dst_bytes;
1477 int *consumed;
1478{
1479 unsigned char *src = source;
1480 unsigned char *src_end = source + src_bytes;
1481 unsigned char *dst = destination;
1482 unsigned char *dst_end = destination + dst_bytes;
e0e989f6 1483 /* Since the maximum bytes produced by each loop is 20, we subtract 19
4ed46869
KH
1484 from DST_END to assure overflow checking is necessary only at the
1485 head of loop. */
e0e989f6 1486 unsigned char *adjusted_dst_end = dst_end - 19;
a5d301df
KH
1487 Lisp_Object unification_table
1488 = coding->character_unification_table_for_encode;
bdd9fb48
KH
1489
1490 if (!NILP (Venable_character_unification) && NILP (unification_table))
a5d301df 1491 unification_table = Vstandard_character_unification_table_for_encode;
4ed46869
KH
1492
1493 while (src < src_end && dst < adjusted_dst_end)
1494 {
1495 /* SRC_BASE remembers the start position in source in each loop.
1496 The loop will be exited when there's not enough source text
1497 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1498 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1499 reset to SRC_BASE before exiting. */
1500 unsigned char *src_base = src;
bdd9fb48 1501 int charset, c1, c2, c3, c4;
4ed46869 1502
e0e989f6
KH
1503 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1504 && CODING_SPEC_ISO_BOL (coding))
1505 {
bdd9fb48
KH
1506 /* We have to produce designation sequences if any now. */
1507 encode_designation_at_bol (coding, unification_table,
1508 src, src_end, &dst);
e0e989f6
KH
1509 CODING_SPEC_ISO_BOL (coding) = 0;
1510 }
1511
1512 c1 = *src++;
4ed46869
KH
1513 /* If we are seeing a component of a composite character, we are
1514 seeing a leading-code specially encoded for composition, or a
1515 composition rule if composing with rule. We must set C1
1516 to a normal leading-code or an ASCII code. If we are not at
1517 a composed character, we must reset the composition state. */
1518 if (COMPOSING_P (coding->composing))
1519 {
1520 if (c1 < 0xA0)
1521 {
1522 /* We are not in a composite character any longer. */
1523 coding->composing = COMPOSING_NO;
1524 ENCODE_COMPOSITION_END;
1525 }
1526 else
1527 {
1528 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1529 {
1530 *dst++ = c1 & 0x7F;
1531 coding->composing = COMPOSING_WITH_RULE_HEAD;
1532 continue;
1533 }
1534 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1535 coding->composing = COMPOSING_WITH_RULE_RULE;
1536 if (c1 == 0xA0)
1537 {
1538 /* This is an ASCII component. */
1539 ONE_MORE_BYTE (c1);
1540 c1 &= 0x7F;
1541 }
1542 else
1543 /* This is a leading-code of non ASCII component. */
1544 c1 -= 0x20;
1545 }
1546 }
1547
1548 /* Now encode one character. C1 is a control character, an
1549 ASCII character, or a leading-code of multi-byte character. */
1550 switch (emacs_code_class[c1])
1551 {
1552 case EMACS_ascii_code:
bdd9fb48 1553 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
4ed46869
KH
1554 break;
1555
1556 case EMACS_control_code:
1557 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1558 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1559 *dst++ = c1;
1560 break;
1561
1562 case EMACS_carriage_return_code:
1563 if (!coding->selective)
1564 {
1565 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1566 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1567 *dst++ = c1;
1568 break;
1569 }
1570 /* fall down to treat '\r' as '\n' ... */
1571
1572 case EMACS_linefeed_code:
1573 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
e0e989f6
KH
1574 ENCODE_RESET_PLANE_AND_REGISTER;
1575 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1576 bcopy (coding->spec.iso2022.initial_designation,
1577 coding->spec.iso2022.current_designation,
1578 sizeof coding->spec.iso2022.initial_designation);
4ed46869 1579 if (coding->eol_type == CODING_EOL_LF
0ef69138 1580 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
1581 *dst++ = ISO_CODE_LF;
1582 else if (coding->eol_type == CODING_EOL_CRLF)
1583 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1584 else
1585 *dst++ = ISO_CODE_CR;
e0e989f6 1586 CODING_SPEC_ISO_BOL (coding) = 1;
4ed46869
KH
1587 break;
1588
1589 case EMACS_leading_code_2:
1590 ONE_MORE_BYTE (c2);
19a8d9e0
KH
1591 if (c2 < 0xA0)
1592 {
1593 /* invalid sequence */
1594 *dst++ = c1;
1595 *dst++ = c2;
1596 }
1597 else
1598 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
1599 break;
1600
1601 case EMACS_leading_code_3:
1602 TWO_MORE_BYTES (c2, c3);
19a8d9e0
KH
1603 if (c2 < 0xA0 || c3 < 0xA0)
1604 {
1605 /* invalid sequence */
1606 *dst++ = c1;
1607 *dst++ = c2;
1608 *dst++ = c3;
1609 }
1610 else if (c1 < LEADING_CODE_PRIVATE_11)
bdd9fb48 1611 ENCODE_ISO_CHARACTER (c1, c2, c3);
4ed46869 1612 else
bdd9fb48 1613 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
4ed46869
KH
1614 break;
1615
1616 case EMACS_leading_code_4:
1617 THREE_MORE_BYTES (c2, c3, c4);
19a8d9e0
KH
1618 if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1619 {
1620 /* invalid sequence */
1621 *dst++ = c1;
1622 *dst++ = c2;
1623 *dst++ = c3;
1624 *dst++ = c4;
1625 }
1626 else
1627 ENCODE_ISO_CHARACTER (c2, c3, c4);
4ed46869
KH
1628 break;
1629
1630 case EMACS_leading_code_composition:
19a8d9e0
KH
1631 ONE_MORE_BYTE (c2);
1632 if (c2 < 0xA0)
1633 {
1634 /* invalid sequence */
1635 *dst++ = c1;
1636 *dst++ = c2;
1637 }
1638 else if (c2 == 0xFF)
4ed46869
KH
1639 {
1640 coding->composing = COMPOSING_WITH_RULE_HEAD;
1641 ENCODE_COMPOSITION_WITH_RULE_START;
1642 }
1643 else
1644 {
1645 /* Rewind one byte because it is a character code of
1646 composition elements. */
1647 src--;
1648 coding->composing = COMPOSING_NO_RULE_HEAD;
1649 ENCODE_COMPOSITION_NO_RULE_START;
1650 }
1651 break;
1652
1653 case EMACS_invalid_code:
1654 *dst++ = c1;
1655 break;
1656 }
1657 continue;
1658 label_end_of_loop:
76376439
KH
1659 /* We reach here because the source date ends not at character
1660 boundary. */
1661 coding->carryover_size = src_end - src_base;
4ed46869 1662 bcopy (src_base, coding->carryover, coding->carryover_size);
76376439 1663 src = src_end;
4ed46869
KH
1664 break;
1665 }
1666
1667 /* If this is the last block of the text to be encoded, we must
bdd9fb48
KH
1668 reset graphic planes and registers to the initial state. */
1669 if (src >= src_end && coding->last_block)
4ed46869 1670 {
e0e989f6 1671 ENCODE_RESET_PLANE_AND_REGISTER;
bdd9fb48
KH
1672 if (coding->carryover_size > 0
1673 && coding->carryover_size < (dst_end - dst))
1674 {
1675 bcopy (coding->carryover, dst, coding->carryover_size);
1676 dst += coding->carryover_size;
1677 coding->carryover_size = 0;
1678 }
4ed46869
KH
1679 }
1680 *consumed = src - source;
1681 return dst - destination;
1682}
1683
1684\f
1685/*** 4. SJIS and BIG5 handlers ***/
1686
f4dee582 1687/* Although SJIS and BIG5 are not ISO's coding system, they are used
4ed46869
KH
1688 quite widely. So, for the moment, Emacs supports them in the bare
1689 C code. But, in the future, they may be supported only by CCL. */
1690
1691/* SJIS is a coding system encoding three character sets: ASCII, right
1692 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
1693 as is. A character of charset katakana-jisx0201 is encoded by
1694 "position-code + 0x80". A character of charset japanese-jisx0208
1695 is encoded in 2-byte but two position-codes are divided and shifted
1696 so that they fit in the range below.
1697
1698 --- CODE RANGE of SJIS ---
1699 (character set) (range)
1700 ASCII 0x00 .. 0x7F
1701 KATAKANA-JISX0201 0xA0 .. 0xDF
1702 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1703 (2nd byte) 0x40 .. 0xFF
1704 -------------------------------
1705
1706*/
1707
1708/* BIG5 is a coding system encoding two character sets: ASCII and
1709 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1710 character set and is encoded in two-byte.
1711
1712 --- CODE RANGE of BIG5 ---
1713 (character set) (range)
1714 ASCII 0x00 .. 0x7F
1715 Big5 (1st byte) 0xA1 .. 0xFE
1716 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1717 --------------------------
1718
1719 Since the number of characters in Big5 is larger than maximum
1720 characters in Emacs' charset (96x96), it can't be handled as one
1721 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
1722 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
1723 contains frequently used characters and the latter contains less
1724 frequently used characters. */
1725
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Every macro argument is parenthesized in the expansions, so an
   argument may be an arbitrary expression (for example `x & 0x7F' or
   a conditional expression).  The output arguments -- CHARSET, C1, C2
   of DECODE_BIG5 and B1, B2 of ENCODE_BIG5 -- must be lvalues.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Set CHARSET, C1 and C2 from the BIG5 byte pair B1, B2.  A first
   byte below 0xC9 selects `charset_big5_1'; any other first byte
   selects `charset_big5_2'.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2) \
  do { \
    unsigned int temp \
      = (((b1) - 0xA1) * BIG5_SAME_ROW \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62)); \
    if ((b1) < 0xC9) \
      (charset) = charset_big5_1; \
    else \
      { \
        (charset) = charset_big5_2; \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW; \
      } \
    (c1) = temp / (0xFF - 0xA1) + 0x21; \
    (c2) = temp % (0xFF - 0xA1) + 0x21; \
  } while (0)

/* Set the BIG5 byte pair B1, B2 from CHARSET and the position-codes
   C1, C2.  This is the inverse of DECODE_BIG5.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2) \
  do { \
    unsigned int temp = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21); \
    if ((charset) == charset_big5_2) \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1); \
    (b1) = temp / BIG5_SAME_ROW + 0xA1; \
    (b2) = temp % BIG5_SAME_ROW; \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62; \
  } while (0)
1758
a5d301df
KH
/* Produce the internal representation of one decoded character of
   CHARSET with position-codes C1 and C2.  When `unification_table' is
   non-nil and unify_char returns a non-negative character for it,
   that character's charset and position-codes (written back into C1
   and C2 by SPLIT_CHAR) are used instead.  The result is handed to
   the ASCII, DIMENSION1 or DIMENSION2 decoding macro; a negative
   charset is treated like ASCII.  */
#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
  do { \
    int decoded_charset = (charset); \
    if (!NILP (unification_table)) \
      { \
        int unified = unify_char (unification_table, \
                                  -1, (charset), c1, c2); \
        if (unified >= 0) \
          SPLIT_CHAR (unified, decoded_charset, c1, c2); \
      } \
    if (decoded_charset < 0 || decoded_charset == CHARSET_ASCII) \
      { \
        DECODE_CHARACTER_ASCII (c1); \
      } \
    else if (CHARSET_DIMENSION (decoded_charset) == 1) \
      { \
        DECODE_CHARACTER_DIMENSION1 (decoded_charset, c1); \
      } \
    else \
      { \
        DECODE_CHARACTER_DIMENSION2 (decoded_charset, c1, c2); \
      } \
  } while (0)
1773
/* Write at DST the SJIS or BIG5 code (depending on `sjis_p') of one
   character of CHARSET with position-codes C1 and C2.  When
   `unification_table' is non-nil and unify_char returns a
   non-negative character for it, that character is encoded instead.

   ASCII is written as is.  With `sjis_p' non-zero, katakana-jisx0201
   is written as the single byte C1 and jisx0208 is converted by
   ENCODE_SJIS; with `sjis_p' zero, the two Big5 charsets are
   converted by ENCODE_BIG5.  A character of any other charset is
   written as the charset byte followed by its position-codes.

   C1 and C2 must be lvalues; `dst', `sjis_p' and `unification_table'
   must be in scope where the macro is used.  The expansion
   deliberately does not end in a semicolon, so that
   `if (x) ENCODE_SJIS_BIG5_CHARACTER (...); else ...' is valid.  */
#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
  do { \
    int c_alt, charset_alt; \
    if (!NILP (unification_table) \
        && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
            >= 0)) \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
    else \
      charset_alt = charset; \
    if (charset_alt == charset_ascii) \
      *dst++ = c1; \
    else if (CHARSET_DIMENSION (charset_alt) == 1) \
      { \
        if (sjis_p && charset_alt == charset_katakana_jisx0201) \
          *dst++ = c1; \
        else \
          *dst++ = charset_alt, *dst++ = c1; \
      } \
    else \
      { \
        /* The conversions below work on 7-bit position-codes.  */ \
        c1 &= 0x7F, c2 &= 0x7F; \
        if (sjis_p && charset_alt == charset_jisx0208) \
          { \
            unsigned char s1, s2; \
 \
            ENCODE_SJIS (c1, c2, s1, s2); \
            *dst++ = s1, *dst++ = s2; \
          } \
        else if (!sjis_p \
                 && (charset_alt == charset_big5_1 \
                     || charset_alt == charset_big5_2)) \
          { \
            unsigned char b1, b2; \
 \
            ENCODE_BIG5 (charset_alt, c1, c2, b1, b2); \
            *dst++ = b1, *dst++ = b2; \
          } \
        else \
          *dst++ = charset_alt, *dst++ = c1, *dst++ = c2; \
      } \
  } while (0)
1815
4ed46869
KH
1816/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1817 Check if a text is encoded in SJIS. If it is, return
1818 CODING_CATEGORY_MASK_SJIS, else return 0. */
1819
1820int
1821detect_coding_sjis (src, src_end)
1822 unsigned char *src, *src_end;
1823{
1824 unsigned char c;
1825
1826 while (src < src_end)
1827 {
1828 c = *src++;
1829 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1830 return 0;
1831 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1832 {
1833 if (src < src_end && *src++ < 0x40)
1834 return 0;
1835 }
1836 }
1837 return CODING_CATEGORY_MASK_SJIS;
1838}
1839
1840/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1841 Check if a text is encoded in BIG5. If it is, return
1842 CODING_CATEGORY_MASK_BIG5, else return 0. */
1843
1844int
1845detect_coding_big5 (src, src_end)
1846 unsigned char *src, *src_end;
1847{
1848 unsigned char c;
1849
1850 while (src < src_end)
1851 {
1852 c = *src++;
1853 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1854 return 0;
1855 if (c >= 0xA1)
1856 {
1857 if (src >= src_end)
1858 break;
1859 c = *src++;
1860 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1861 return 0;
1862 }
1863 }
1864 return CODING_CATEGORY_MASK_BIG5;
1865}
1866
1867/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1868 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
1869
1870int
1871decode_coding_sjis_big5 (coding, source, destination,
1872 src_bytes, dst_bytes, consumed, sjis_p)
1873 struct coding_system *coding;
1874 unsigned char *source, *destination;
1875 int src_bytes, dst_bytes;
1876 int *consumed;
1877 int sjis_p;
1878{
1879 unsigned char *src = source;
1880 unsigned char *src_end = source + src_bytes;
1881 unsigned char *dst = destination;
1882 unsigned char *dst_end = destination + dst_bytes;
1883 /* Since the maximum bytes produced by each loop is 4, we subtract 3
1884 from DST_END to assure overflow checking is necessary only at the
1885 head of loop. */
1886 unsigned char *adjusted_dst_end = dst_end - 3;
a5d301df
KH
1887 Lisp_Object unification_table
1888 = coding->character_unification_table_for_decode;
1889
1890 if (!NILP (Venable_character_unification) && NILP (unification_table))
1891 unification_table = Vstandard_character_unification_table_for_decode;
4ed46869
KH
1892
1893 while (src < src_end && dst < adjusted_dst_end)
1894 {
1895 /* SRC_BASE remembers the start position in source in each loop.
1896 The loop will be exited when there's not enough source text
1897 to analyze two-byte character (within macro ONE_MORE_BYTE).
1898 In that case, SRC is reset to SRC_BASE before exiting. */
1899 unsigned char *src_base = src;
1900 unsigned char c1 = *src++, c2, c3, c4;
1901
1902 if (c1 == '\r')
1903 {
1904 if (coding->eol_type == CODING_EOL_CRLF)
1905 {
1906 ONE_MORE_BYTE (c2);
1907 if (c2 == '\n')
1908 *dst++ = c2;
1909 else
1910 /* To process C2 again, SRC is subtracted by 1. */
1911 *dst++ = c1, src--;
1912 }
1913 else
1914 *dst++ = c1;
1915 }
a5d301df 1916 else if (c1 < 0x20)
4ed46869 1917 *dst++ = c1;
a5d301df
KH
1918 else if (c1 < 0x80)
1919 DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
4ed46869
KH
1920 else if (c1 < 0xA0 || c1 >= 0xE0)
1921 {
1922 /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1923 if (sjis_p)
1924 {
1925 ONE_MORE_BYTE (c2);
1926 DECODE_SJIS (c1, c2, c3, c4);
a5d301df 1927 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
4ed46869
KH
1928 }
1929 else if (c1 >= 0xE0 && c1 < 0xFF)
1930 {
1931 int charset;
1932
1933 ONE_MORE_BYTE (c2);
1934 DECODE_BIG5 (c1, c2, charset, c3, c4);
a5d301df 1935 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
4ed46869
KH
1936 }
1937 else /* Invalid code */
1938 *dst++ = c1;
1939 }
1940 else
1941 {
1942 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1943 if (sjis_p)
a5d301df 1944 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1, /* dummy */ c2);
4ed46869
KH
1945 else
1946 {
1947 int charset;
1948
1949 ONE_MORE_BYTE (c2);
1950 DECODE_BIG5 (c1, c2, charset, c3, c4);
a5d301df 1951 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
4ed46869
KH
1952 }
1953 }
1954 continue;
1955
1956 label_end_of_loop:
1957 coding->carryover_size = src - src_base;
1958 bcopy (src_base, coding->carryover, coding->carryover_size);
1959 src = src_base;
1960 break;
1961 }
1962
1963 *consumed = src - source;
1964 return dst - destination;
1965}
1966
1967/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1968 This function can encode `charset_ascii', `charset_katakana_jisx0201',
1969 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
1970 sure that all these charsets are registered as official charset
1971 (i.e. do not have extended leading-codes). Characters of other
1972 charsets are produced without any encoding. If SJIS_P is 1, encode
1973 SJIS text, else encode BIG5 text. */
1974
1975int
1976encode_coding_sjis_big5 (coding, source, destination,
1977 src_bytes, dst_bytes, consumed, sjis_p)
1978 struct coding_system *coding;
1979 unsigned char *source, *destination;
1980 int src_bytes, dst_bytes;
1981 int *consumed;
1982 int sjis_p;
1983{
1984 unsigned char *src = source;
1985 unsigned char *src_end = source + src_bytes;
1986 unsigned char *dst = destination;
1987 unsigned char *dst_end = destination + dst_bytes;
1988 /* Since the maximum bytes produced by each loop is 2, we subtract 1
1989 from DST_END to assure overflow checking is necessary only at the
1990 head of loop. */
1991 unsigned char *adjusted_dst_end = dst_end - 1;
a5d301df
KH
1992 Lisp_Object unification_table
1993 = coding->character_unification_table_for_encode;
1994
1995 if (!NILP (Venable_character_unification) && NILP (unification_table))
1996 unification_table = Vstandard_character_unification_table_for_encode;
4ed46869
KH
1997
1998 while (src < src_end && dst < adjusted_dst_end)
1999 {
2000 /* SRC_BASE remembers the start position in source in each loop.
2001 The loop will be exited when there's not enough source text
2002 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2003 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
2004 before exiting. */
2005 unsigned char *src_base = src;
2006 unsigned char c1 = *src++, c2, c3, c4;
2007
2008 if (coding->composing)
2009 {
2010 if (c1 == 0xA0)
2011 {
2012 ONE_MORE_BYTE (c1);
2013 c1 &= 0x7F;
2014 }
2015 else if (c1 >= 0xA0)
2016 c1 -= 0x20;
2017 else
2018 coding->composing = 0;
2019 }
2020
2021 switch (emacs_code_class[c1])
2022 {
2023 case EMACS_ascii_code:
a5d301df
KH
2024 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2025 break;
2026
4ed46869
KH
2027 case EMACS_control_code:
2028 *dst++ = c1;
2029 break;
2030
2031 case EMACS_carriage_return_code:
2032 if (!coding->selective)
2033 {
2034 *dst++ = c1;
2035 break;
2036 }
2037 /* fall down to treat '\r' as '\n' ... */
2038
2039 case EMACS_linefeed_code:
2040 if (coding->eol_type == CODING_EOL_LF
0ef69138 2041 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2042 *dst++ = '\n';
2043 else if (coding->eol_type == CODING_EOL_CRLF)
2044 *dst++ = '\r', *dst++ = '\n';
2045 else
2046 *dst++ = '\r';
2047 break;
2048
2049 case EMACS_leading_code_2:
2050 ONE_MORE_BYTE (c2);
a5d301df 2051 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
2052 break;
2053
2054 case EMACS_leading_code_3:
2055 TWO_MORE_BYTES (c2, c3);
a5d301df 2056 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
4ed46869
KH
2057 break;
2058
2059 case EMACS_leading_code_4:
2060 THREE_MORE_BYTES (c2, c3, c4);
a5d301df 2061 ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
4ed46869
KH
2062 break;
2063
2064 case EMACS_leading_code_composition:
2065 coding->composing = 1;
2066 break;
2067
2068 default: /* i.e. case EMACS_invalid_code: */
2069 *dst++ = c1;
2070 }
2071 continue;
2072
2073 label_end_of_loop:
76376439 2074 coding->carryover_size = src_end - src_base;
4ed46869 2075 bcopy (src_base, coding->carryover, coding->carryover_size);
76376439 2076 src = src_end;
4ed46869
KH
2077 break;
2078 }
2079
2080 *consumed = src - source;
2081 return dst - destination;
2082}
2083
2084\f
2085/*** 5. End-of-line handlers ***/
2086
2087/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2088 This function is called only when `coding->eol_type' is
2089 CODING_EOL_CRLF or CODING_EOL_CR. */
2090
2091decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2092 struct coding_system *coding;
2093 unsigned char *source, *destination;
2094 int src_bytes, dst_bytes;
2095 int *consumed;
2096{
2097 unsigned char *src = source;
2098 unsigned char *src_end = source + src_bytes;
2099 unsigned char *dst = destination;
2100 unsigned char *dst_end = destination + dst_bytes;
2101 int produced;
2102
2103 switch (coding->eol_type)
2104 {
2105 case CODING_EOL_CRLF:
2106 {
2107 /* Since the maximum bytes produced by each loop is 2, we
2108 subtract 1 from DST_END to assure overflow checking is
2109 necessary only at the head of loop. */
2110 unsigned char *adjusted_dst_end = dst_end - 1;
2111
2112 while (src < src_end && dst < adjusted_dst_end)
2113 {
2114 unsigned char *src_base = src;
2115 unsigned char c = *src++;
2116 if (c == '\r')
2117 {
2118 ONE_MORE_BYTE (c);
2119 if (c != '\n')
2120 *dst++ = '\r';
bfd99048 2121 *dst++ = c;
4ed46869
KH
2122 }
2123 else
2124 *dst++ = c;
2125 continue;
2126
2127 label_end_of_loop:
2128 coding->carryover_size = src - src_base;
2129 bcopy (src_base, coding->carryover, coding->carryover_size);
2130 src = src_base;
2131 break;
2132 }
2133 *consumed = src - source;
2134 produced = dst - destination;
2135 break;
2136 }
2137
2138 case CODING_EOL_CR:
2139 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2140 bcopy (source, destination, produced);
2141 dst_end = destination + produced;
2142 while (dst < dst_end)
2143 if (*dst++ == '\r') dst[-1] = '\n';
2144 *consumed = produced;
2145 break;
2146
2147 default: /* i.e. case: CODING_EOL_LF */
2148 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2149 bcopy (source, destination, produced);
2150 *consumed = produced;
2151 break;
2152 }
2153
2154 return produced;
2155}
2156
2157/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2158 format of end-of-line according to `coding->eol_type'. If
2159 `coding->selective' is 1, code '\r' in source text also means
2160 end-of-line. */
2161
2162encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2163 struct coding_system *coding;
2164 unsigned char *source, *destination;
2165 int src_bytes, dst_bytes;
2166 int *consumed;
2167{
2168 unsigned char *src = source;
2169 unsigned char *dst = destination;
2170 int produced;
2171
2172 if (src_bytes <= 0)
2173 return 0;
2174
2175 switch (coding->eol_type)
2176 {
2177 case CODING_EOL_LF:
0ef69138 2178 case CODING_EOL_UNDECIDED:
4ed46869
KH
2179 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2180 bcopy (source, destination, produced);
2181 if (coding->selective)
2182 {
2183 int i = produced;
2184 while (i--)
2185 if (*dst++ == '\r') dst[-1] = '\n';
2186 }
2187 *consumed = produced;
2188
2189 case CODING_EOL_CRLF:
2190 {
2191 unsigned char c;
2192 unsigned char *src_end = source + src_bytes;
2193 unsigned char *dst_end = destination + dst_bytes;
2194 /* Since the maximum bytes produced by each loop is 2, we
2195 subtract 1 from DST_END to assure overflow checking is
2196 necessary only at the head of loop. */
2197 unsigned char *adjusted_dst_end = dst_end - 1;
2198
2199 while (src < src_end && dst < adjusted_dst_end)
2200 {
2201 c = *src++;
2202 if (c == '\n' || (c == '\r' && coding->selective))
2203 *dst++ = '\r', *dst++ = '\n';
2204 else
2205 *dst++ = c;
2206 }
2207 produced = dst - destination;
2208 *consumed = src - source;
2209 break;
2210 }
2211
2212 default: /* i.e. case CODING_EOL_CR: */
2213 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2214 bcopy (source, destination, produced);
2215 {
2216 int i = produced;
2217 while (i--)
2218 if (*dst++ == '\n') dst[-1] = '\r';
2219 }
2220 *consumed = produced;
2221 }
2222
2223 return produced;
2224}
2225
2226\f
2227/*** 6. C library functions ***/
2228
2229/* In Emacs Lisp, coding system is represented by a Lisp symbol which
2230 has a property `coding-system'. The value of this property is a
2231 vector of length 5 (called as coding-vector). Among elements of
2232 this vector, the first (element[0]) and the fifth (element[4])
2233 carry important information for decoding/encoding. Before
2234 decoding/encoding, this information should be set in fields of a
2235 structure of type `coding_system'.
2236
2237 A value of property `coding-system' can be a symbol of another
2238 subsidiary coding-system. In that case, Emacs gets coding-vector
2239 from that symbol.
2240
2241 `element[0]' contains information to be set in `coding->type'. The
2242 value and its meaning is as follows:
2243
0ef69138
KH
2244 0 -- coding_type_emacs_mule
2245 1 -- coding_type_sjis
2246 2 -- coding_type_iso2022
2247 3 -- coding_type_big5
2248 4 -- coding_type_ccl encoder/decoder written in CCL
 5 -- coding_type_raw_text
2249 nil -- coding_type_no_conversion
2250 t -- coding_type_undecided (automatic conversion on decoding,
2251 no-conversion on encoding)
4ed46869
KH
2252
2253 `element[4]' contains information to be set in `coding->flags' and
2254 `coding->spec'. The meaning varies by `coding->type'.
2255
2256 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2257 of length 32 (of which the first 17 sub-elements are used now).
2258 Meanings of these sub-elements are:
2259
2260 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2261 If the value is an integer of valid charset, the charset is
2262 assumed to be designated to graphic register N initially.
2263
2264 If the value is minus, it is a minus value of charset which
2265 reserves graphic register N, which means that the charset is
2266 not designated initially but should be designated to graphic
2267 register N just before encoding a character in that charset.
2268
2269 If the value is nil, graphic register N is never used on
2270 encoding.
2271
2272 sub-element[N] where N is 4 through 16: to be set in `coding->flags'
2273 Each value takes t or nil. See the section ISO2022 of
2274 `coding.h' for more information.
2275
2276 If `coding->type' is `coding_type_big5', element[4] is t to denote
2277 BIG5-ETen or nil to denote BIG5-HKU.
2278
2279 If `coding->type' takes the other value, element[4] is ignored.
2280
2281 Emacs Lisp's coding system also carries information about format of
2282 end-of-line in a value of property `eol-type'. If the value is
2283 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2284 means CODING_EOL_CR. If it is not integer, it should be a vector
2285 of subsidiary coding systems of which property `eol-type' has one
2286 of above values.
2287
2288*/
2289
2290/* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2291 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2292 is setup so that no conversion is necessary and return -1, else
2293 return 0. */
2294
2295int
e0e989f6
KH
2296setup_coding_system (coding_system, coding)
2297 Lisp_Object coding_system;
4ed46869
KH
2298 struct coding_system *coding;
2299{
4ed46869
KH
2300 Lisp_Object type, eol_type;
2301
f4dee582 2302 /* At first, set several fields to default values. */
4ed46869
KH
2303 coding->require_flushing = 0;
2304 coding->last_block = 0;
2305 coding->selective = 0;
2306 coding->composing = 0;
2307 coding->direction = 0;
2308 coding->carryover_size = 0;
4ed46869 2309 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
a5d301df
KH
2310 coding->character_unification_table_for_decode = Qnil;
2311 coding->character_unification_table_for_encode = Qnil;
4ed46869 2312
e0e989f6
KH
2313 Vlast_coding_system_used = coding->symbol = coding_system;
2314 eol_type = Qnil;
2315 /* Get value of property `coding-system' until we get a vector.
2316 While doing that, also get values of properties
a5d301df
KH
2317 `post-read-conversion', `pre-write-conversion',
2318 `character-unification-table-for-decode',
2319 `character-unification-table-for-encode' and `eol-type'. */
e0e989f6 2320 while (!NILP (coding_system) && SYMBOLP (coding_system))
4ed46869 2321 {
4ed46869 2322 if (NILP (coding->post_read_conversion))
e0e989f6 2323 coding->post_read_conversion = Fget (coding_system,
4ed46869 2324 Qpost_read_conversion);
e0e989f6
KH
2325 if (NILP (coding->pre_write_conversion))
2326 coding->pre_write_conversion = Fget (coding_system,
4ed46869 2327 Qpre_write_conversion);
9ce27fde 2328 if (!inhibit_eol_conversion && NILP (eol_type))
e0e989f6 2329 eol_type = Fget (coding_system, Qeol_type);
a5d301df
KH
2330
2331 if (NILP (coding->character_unification_table_for_decode))
2332 coding->character_unification_table_for_decode
2333 = Fget (coding_system, Qcharacter_unification_table_for_decode);
2334
2335 if (NILP (coding->character_unification_table_for_encode))
2336 coding->character_unification_table_for_encode
2337 = Fget (coding_system, Qcharacter_unification_table_for_encode);
2338
e0e989f6 2339 coding_system = Fget (coding_system, Qcoding_system);
4ed46869 2340 }
a5d301df
KH
2341
2342 while (!NILP (coding->character_unification_table_for_decode)
2343 && SYMBOLP (coding->character_unification_table_for_decode))
2344 coding->character_unification_table_for_decode
2345 = Fget (coding->character_unification_table_for_decode,
2346 Qcharacter_unification_table_for_decode);
2347 if (!NILP (coding->character_unification_table_for_decode)
2348 && !CHAR_TABLE_P (coding->character_unification_table_for_decode))
2349 coding->character_unification_table_for_decode = Qnil;
2350
2351 while (!NILP (coding->character_unification_table_for_encode)
2352 && SYMBOLP (coding->character_unification_table_for_encode))
2353 coding->character_unification_table_for_encode
2354 = Fget (coding->character_unification_table_for_encode,
2355 Qcharacter_unification_table_for_encode);
2356 if (!NILP (coding->character_unification_table_for_encode)
2357 && !CHAR_TABLE_P (coding->character_unification_table_for_encode))
2358 coding->character_unification_table_for_encode = Qnil;
2359
e0e989f6
KH
2360 if (!VECTORP (coding_system)
2361 || XVECTOR (coding_system)->size != 5)
4ed46869
KH
2362 goto label_invalid_coding_system;
2363
4ed46869 2364 if (VECTORP (eol_type))
0ef69138 2365 coding->eol_type = CODING_EOL_UNDECIDED;
4ed46869
KH
2366 else if (XFASTINT (eol_type) == 1)
2367 coding->eol_type = CODING_EOL_CRLF;
2368 else if (XFASTINT (eol_type) == 2)
2369 coding->eol_type = CODING_EOL_CR;
2370 else
2371 coding->eol_type = CODING_EOL_LF;
2372
e0e989f6 2373 type = XVECTOR (coding_system)->contents[0];
4ed46869
KH
2374 switch (XFASTINT (type))
2375 {
2376 case 0:
0ef69138 2377 coding->type = coding_type_emacs_mule;
4ed46869
KH
2378 break;
2379
2380 case 1:
2381 coding->type = coding_type_sjis;
2382 break;
2383
2384 case 2:
2385 coding->type = coding_type_iso2022;
2386 {
e0e989f6 2387 Lisp_Object val = XVECTOR (coding_system)->contents[4];
4ed46869
KH
2388 Lisp_Object *flags;
2389 int i, charset, default_reg_bits = 0;
2390
2391 if (!VECTORP (val) || XVECTOR (val)->size != 32)
2392 goto label_invalid_coding_system;
2393
2394 flags = XVECTOR (val)->contents;
2395 coding->flags
2396 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2397 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2398 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2399 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2400 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2401 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2402 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2403 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
e0e989f6
KH
2404 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2405 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
c4825358
KH
2406 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2407 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3f003981 2408 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
c4825358 2409 );
4ed46869
KH
2410
2411 /* Invoke graphic register 0 to plane 0. */
2412 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2413 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2414 CODING_SPEC_ISO_INVOCATION (coding, 1)
2415 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2416 /* Not single shifting at first. */
6e85d753 2417 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
e0e989f6 2418 /* Beginning of buffer should also be regarded as bol. */
6e85d753 2419 CODING_SPEC_ISO_BOL (coding) = 1;
4ed46869
KH
2420
2421 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2422 FLAGS[REG] can be one of below:
2423 integer CHARSET: CHARSET occupies register I,
2424 t: designate nothing to REG initially, but can be used
2425 by any charsets,
2426 list of integer, nil, or t: designate the first
2427 element (if integer) to REG initially, the remaining
2428 elements (if integer) is designated to REG on request,
2429 if an element is t, REG can be used by any charset,
2430 nil: REG is never used. */
467e7675 2431 for (charset = 0; charset <= MAX_CHARSET; charset++)
1ba9e4ab
KH
2432 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2433 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
6e85d753 2434 bzero (CODING_SPEC_ISO_EXPECTED_CHARSETS (coding), MAX_CHARSET + 1);
4ed46869
KH
2435 for (i = 0; i < 4; i++)
2436 {
2437 if (INTEGERP (flags[i])
e0e989f6
KH
2438 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2439 || (charset = get_charset_id (flags[i])) >= 0)
4ed46869
KH
2440 {
2441 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2442 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
6e85d753 2443 CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset] = 1;
4ed46869
KH
2444 }
2445 else if (EQ (flags[i], Qt))
2446 {
2447 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2448 default_reg_bits |= 1 << i;
2449 }
2450 else if (CONSP (flags[i]))
2451 {
2452 Lisp_Object tail = flags[i];
2453
2454 if (INTEGERP (XCONS (tail)->car)
2455 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
2456 CHARSET_VALID_P (charset))
2457 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
4ed46869
KH
2458 {
2459 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2460 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
6e85d753 2461 CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset] = 1;
4ed46869
KH
2462 }
2463 else
2464 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2465 tail = XCONS (tail)->cdr;
2466 while (CONSP (tail))
2467 {
2468 if (INTEGERP (XCONS (tail)->car)
2469 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
2470 CHARSET_VALID_P (charset))
2471 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
6e85d753
KH
2472 {
2473 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2474 = i;
2475 CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset]
2476 = 1;
2477 }
4ed46869
KH
2478 else if (EQ (XCONS (tail)->car, Qt))
2479 default_reg_bits |= 1 << i;
2480 tail = XCONS (tail)->cdr;
2481 }
2482 }
2483 else
2484 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2485
2486 CODING_SPEC_ISO_DESIGNATION (coding, i)
2487 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2488 }
2489
2490 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2491 {
2492 /* REG 1 can be used only by locking shift in 7-bit env. */
2493 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2494 default_reg_bits &= ~2;
2495 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2496 /* Without any shifting, only REG 0 and 1 can be used. */
2497 default_reg_bits &= 3;
2498 }
2499
6e85d753
KH
2500 for (charset = 0; charset <= MAX_CHARSET; charset++)
2501 if (CHARSET_VALID_P (charset)
2502 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2503 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
2504 {
2505 /* We have not yet decided where to designate CHARSET. */
2506 int reg_bits = default_reg_bits;
2507
2508 if (CHARSET_CHARS (charset) == 96)
2509 /* A charset of CHARS96 can't be designated to REG 0. */
2510 reg_bits &= ~1;
2511
2512 if (reg_bits)
2513 /* There exist some default graphic register. */
2514 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2515 = (reg_bits & 1
2516 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2517 else
2518 /* We anyway have to designate CHARSET to somewhere. */
2519 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2520 = (CHARSET_CHARS (charset) == 94
2521 ? 0
2522 : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2523 || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2524 ? 1
2525 : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2526 ? 2 : 0)));
2527 }
4ed46869
KH
2528 }
2529 coding->require_flushing = 1;
2530 break;
2531
2532 case 3:
2533 coding->type = coding_type_big5;
2534 coding->flags
e0e989f6 2535 = (NILP (XVECTOR (coding_system)->contents[4])
4ed46869
KH
2536 ? CODING_FLAG_BIG5_HKU
2537 : CODING_FLAG_BIG5_ETEN);
2538 break;
2539
2540 case 4:
2541 coding->type = coding_type_ccl;
2542 {
e0e989f6 2543 Lisp_Object val = XVECTOR (coding_system)->contents[4];
4ed46869
KH
2544 if (CONSP (val)
2545 && VECTORP (XCONS (val)->car)
2546 && VECTORP (XCONS (val)->cdr))
2547 {
2548 setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2549 setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2550 }
2551 else
2552 goto label_invalid_coding_system;
2553 }
2554 coding->require_flushing = 1;
2555 break;
2556
27901516
KH
2557 case 5:
2558 coding->type = coding_type_raw_text;
2559 break;
2560
4ed46869
KH
2561 default:
2562 if (EQ (type, Qt))
0ef69138 2563 coding->type = coding_type_undecided;
4ed46869
KH
2564 else
2565 coding->type = coding_type_no_conversion;
2566 break;
2567 }
2568 return 0;
2569
2570 label_invalid_coding_system:
2571 coding->type = coding_type_no_conversion;
dec137e5 2572 coding->eol_type = CODING_EOL_LF;
e0e989f6
KH
2573 coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2574 = Qnil;
4ed46869
KH
2575 return -1;
2576}
2577
2578/* Emacs has a mechanism to automatically detect a coding system if it
2579 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
2580 it's impossible to distinguish some coding systems accurately
2581 because they use the same range of codes. So, at first, coding
2582 systems are categorized into the categories listed below:
2583
0ef69138 2584 o coding-category-emacs-mule
4ed46869
KH
2585
2586 The category for a coding system which has the same code range
2587 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 2588 symbol) `emacs-mule' by default.
4ed46869
KH
2589
2590 o coding-category-sjis
2591
2592 The category for a coding system which has the same code range
2593 as SJIS. Assigned the coding-system (Lisp
7717c392 2594 symbol) `japanese-shift-jis' by default.
4ed46869
KH
2595
2596 o coding-category-iso-7
2597
2598 The category for a coding system which has the same code range
7717c392
KH
2599 as ISO2022 of 7-bit environment. This doesn't use any locking
2600 shift and single shift functions. Assigned the coding-system
2601 (Lisp symbol) `iso-2022-7bit' by default.
4ed46869
KH
2602
2603 o coding-category-iso-8-1
2604
2605 The category for a coding system which has the same code range
2606 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
2607 for DIMENSION1 charset. This doesn't use any locking shift
2608 and single shift functions. Assigned the coding-system (Lisp
2609 symbol) `iso-latin-1' by default.
4ed46869
KH
2610
2611 o coding-category-iso-8-2
2612
2613 The category for a coding system which has the same code range
2614 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
2615 for DIMENSION2 charset. This doesn't use any locking shift
2616 and single shift functions. Assigned the coding-system (Lisp
2617 symbol) `japanese-iso-8bit' by default.
4ed46869 2618
7717c392 2619 o coding-category-iso-7-else
4ed46869
KH
2620
2621 The category for a coding system which has the same code range
7717c392
KH
2622 as ISO2022 of 7-bit environment but uses locking shift or
2623 single shift functions. Assigned the coding-system (Lisp
2624 symbol) `iso-2022-7bit-lock' by default.
2625
2626 o coding-category-iso-8-else
2627
2628 The category for a coding system which has the same code range
2629 as ISO2022 of 8-bit environment but uses locking shift or
2630 single shift functions. Assigned the coding-system (Lisp
2631 symbol) `iso-2022-8bit-ss2' by default.
4ed46869
KH
2632
2633 o coding-category-big5
2634
2635 The category for a coding system which has the same code range
2636 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 2637 `cn-big5' by default.
4ed46869
KH
2638
2639 o coding-category-binary
2640
2641 The category for a coding system not categorized in any of the
2642 above. Assigned the coding-system (Lisp symbol)
e0e989f6 2643 `no-conversion' by default.
4ed46869
KH
2644
2645 Each of them is a Lisp symbol and the value is an actual
2646 `coding-system's (this is also a Lisp symbol) assigned by a user.
2647 What Emacs does actually is to detect a category of coding system.
2648 Then, it uses a `coding-system' assigned to it. If Emacs can't
2649 decide only one possible category, it selects a category of the
2650 highest priority. Priorities of categories are also specified by a
2651 user in a Lisp variable `coding-category-list'.
2652
2653*/
2654
2655/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2656 If it detects possible coding systems, return an integer in which
2657 appropriate flag bits are set. Flag bits are defined by macros
2658 CODING_CATEGORY_MASK_XXX in `coding.h'. */
2659
2660int
2661detect_coding_mask (src, src_bytes)
2662 unsigned char *src;
2663 int src_bytes;
2664{
2665 register unsigned char c;
2666 unsigned char *src_end = src + src_bytes;
2667 int mask;
2668
2669 /* At first, skip all ASCII characters and control characters except
2670 for three ISO2022 specific control characters. */
bcf26d6a 2671 label_loop_detect_coding:
4ed46869
KH
2672 while (src < src_end)
2673 {
2674 c = *src;
2675 if (c >= 0x80
2676 || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2677 break;
2678 src++;
2679 }
2680
2681 if (src >= src_end)
2682 /* We found nothing other than ASCII. There's nothing to do. */
2683 return CODING_CATEGORY_MASK_ANY;
2684
2685 /* The text seems to be encoded in some multilingual coding system.
2686 Now, try to find in which coding system the text is encoded. */
2687 if (c < 0x80)
bcf26d6a
KH
2688 {
2689 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2690 /* C is an ISO2022 specific control code of C0. */
2691 mask = detect_coding_iso2022 (src, src_end);
2692 src++;
2693 if (mask == CODING_CATEGORY_MASK_ANY)
2694 /* No valid ISO2022 code follows C. Try again. */
2695 goto label_loop_detect_coding;
5d648571 2696 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
bcf26d6a 2697 }
4ed46869 2698 else if (c < 0xA0)
c4825358 2699 {
3f003981 2700 /* If C is a special latin extra code,
c4825358
KH
2701 or is an ISO2022 specific control code of C1 (SS2 or SS3),
2702 or is an ISO2022 control-sequence-introducer (CSI),
27901516 2703 we should also consider the possibility of ISO2022 codings. */
3f003981
KH
2704 if ((VECTORP (Vlatin_extra_code_table)
2705 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
c4825358
KH
2706 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
2707 || (c == ISO_CODE_CSI
2708 && (src < src_end
2709 && (*src == ']'
2710 || (src + 1 < src_end
2711 && src[1] == ']'
2712 && (*src == '0' || *src == '1' || *src == '2'))))))
2713 mask = (detect_coding_iso2022 (src, src_end)
2714 | detect_coding_sjis (src, src_end)
2715 | detect_coding_emacs_mule (src, src_end)
27901516 2716 | CODING_CATEGORY_MASK_RAW_TEXT);
4ed46869 2717
c4825358 2718 else
27901516
KH
2719 /* C is the first byte of SJIS character code,
2720 or a leading-code of Emacs' internal format (emacs-mule). */
c4825358
KH
2721 mask = (detect_coding_sjis (src, src_end)
2722 | detect_coding_emacs_mule (src, src_end)
27901516 2723 | CODING_CATEGORY_MASK_RAW_TEXT);
c4825358 2724 }
4ed46869
KH
2725 else
2726 /* C is a character of ISO2022 in graphic plane right,
2727 or a SJIS's 1-byte character code (i.e. JISX0201),
2728 or the first byte of BIG5's 2-byte code. */
2729 mask = (detect_coding_iso2022 (src, src_end)
2730 | detect_coding_sjis (src, src_end)
10bff6f1 2731 | detect_coding_big5 (src, src_end)
27901516 2732 | CODING_CATEGORY_MASK_RAW_TEXT);
4ed46869
KH
2733
2734 return mask;
2735}
2736
2737/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2738 The information of the detected coding system is set in CODING. */
2739
2740void
2741detect_coding (coding, src, src_bytes)
2742 struct coding_system *coding;
2743 unsigned char *src;
2744 int src_bytes;
2745{
2746 int mask = detect_coding_mask (src, src_bytes);
2747 int idx;
27901516 2748 Lisp_Object val = Vcoding_category_list;
4ed46869
KH
2749
2750 if (mask == CODING_CATEGORY_MASK_ANY)
2751 /* We found nothing other than ASCII. There's nothing to do. */
2752 return;
2753
27901516
KH
2754 /* We found some plausible coding systems. Let's use a coding
2755 system of the highest priority. */
4ed46869 2756
27901516
KH
2757 if (CONSP (val))
2758 while (!NILP (val))
2759 {
2760 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2761 if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2762 break;
2763 val = XCONS (val)->cdr;
2764 }
2765 else
2766 val = Qnil;
4ed46869 2767
27901516
KH
2768 if (NILP (val))
2769 {
2770 /* For unknown reason, `Vcoding_category_list' contains none of
2771 found categories. Let's use any of them. */
2772 for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2773 if (mask & (1 << idx))
2774 break;
4ed46869
KH
2775 }
2776 setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2777}
2778
2779/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2780 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
0ef69138 2781 CODING_EOL_CR, and CODING_EOL_UNDECIDED. */
4ed46869 2782
bc4bc72a
RS
2783#define MAX_EOL_CHECK_COUNT 3
2784
4ed46869
KH
2785int
2786detect_eol_type (src, src_bytes)
2787 unsigned char *src;
2788 int src_bytes;
2789{
2790 unsigned char *src_end = src + src_bytes;
2791 unsigned char c;
bc4bc72a
RS
2792 int total = 0; /* How many end-of-lines are found so far. */
2793 int eol_type = CODING_EOL_UNDECIDED;
2794 int this_eol_type;
4ed46869 2795
bc4bc72a 2796 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4ed46869
KH
2797 {
2798 c = *src++;
bc4bc72a 2799 if (c == '\n' || c == '\r')
4ed46869 2800 {
bc4bc72a
RS
2801 total++;
2802 if (c == '\n')
2803 this_eol_type = CODING_EOL_LF;
2804 else if (src >= src_end || *src != '\n')
2805 this_eol_type = CODING_EOL_CR;
4ed46869 2806 else
bc4bc72a
RS
2807 this_eol_type = CODING_EOL_CRLF, src++;
2808
2809 if (eol_type == CODING_EOL_UNDECIDED)
2810 /* This is the first end-of-line. */
2811 eol_type = this_eol_type;
2812 else if (eol_type != this_eol_type)
2813 /* The found type is different from what found before.
27901516
KH
2814 Let's notice the caller about this inconsistency. */
2815 return CODING_EOL_INCONSISTENT;
4ed46869
KH
2816 }
2817 }
bc4bc72a 2818
85a02ca4 2819 return eol_type;
4ed46869
KH
2820}
2821
2822/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2823 is encoded. If it detects an appropriate format of end-of-line, it
2824 sets the information in *CODING. */
2825
2826void
2827detect_eol (coding, src, src_bytes)
2828 struct coding_system *coding;
2829 unsigned char *src;
2830 int src_bytes;
2831{
fb3903d3 2832 Lisp_Object val, coding_system;
4ed46869
KH
2833 int eol_type = detect_eol_type (src, src_bytes);
2834
0ef69138 2835 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2836 /* We found no end-of-line in the source text. */
2837 return;
2838
27901516
KH
2839 if (eol_type == CODING_EOL_INCONSISTENT)
2840 {
2841#if 0
2842 /* This code is suppressed until we find a better way to
992f23f2 2843 distinguish raw text file and binary file. */
27901516
KH
2844
2845 /* If we have already detected that the coding is raw-text, the
2846 coding should actually be no-conversion. */
2847 if (coding->type == coding_type_raw_text)
2848 {
2849 setup_coding_system (Qno_conversion, coding);
2850 return;
2851 }
2852 /* Else, let's decode only text code anyway. */
2853#endif /* 0 */
2854 eol_type == CODING_EOL_LF;
2855 }
2856
fb3903d3
KH
2857 coding_system = coding->symbol;
2858 while (!NILP (coding_system)
2859 && NILP (val = Fget (coding_system, Qeol_type)))
2860 coding_system = Fget (coding_system, Qcoding_system);
4ed46869
KH
2861 if (VECTORP (val) && XVECTOR (val)->size == 3)
2862 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2863}
2864
2865/* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
2866 decoding, it may detect coding system and format of end-of-line if
2867 those are not yet decided. */
2868
2869int
2870decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2871 struct coding_system *coding;
2872 unsigned char *source, *destination;
2873 int src_bytes, dst_bytes;
2874 int *consumed;
2875{
2876 int produced;
2877
2878 if (src_bytes <= 0)
2879 {
2880 *consumed = 0;
2881 return 0;
2882 }
2883
0ef69138 2884 if (coding->type == coding_type_undecided)
4ed46869
KH
2885 detect_coding (coding, source, src_bytes);
2886
0ef69138 2887 if (coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2888 detect_eol (coding, source, src_bytes);
2889
2890 coding->carryover_size = 0;
2891 switch (coding->type)
2892 {
2893 case coding_type_no_conversion:
2894 label_no_conversion:
2895 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2896 bcopy (source, destination, produced);
2897 *consumed = produced;
2898 break;
2899
0ef69138
KH
2900 case coding_type_emacs_mule:
2901 case coding_type_undecided:
27901516 2902 case coding_type_raw_text:
4ed46869 2903 if (coding->eol_type == CODING_EOL_LF
0ef69138 2904 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2905 goto label_no_conversion;
2906 produced = decode_eol (coding, source, destination,
2907 src_bytes, dst_bytes, consumed);
2908 break;
2909
2910 case coding_type_sjis:
2911 produced = decode_coding_sjis_big5 (coding, source, destination,
2912 src_bytes, dst_bytes, consumed,
2913 1);
2914 break;
2915
2916 case coding_type_iso2022:
2917 produced = decode_coding_iso2022 (coding, source, destination,
2918 src_bytes, dst_bytes, consumed);
2919 break;
2920
2921 case coding_type_big5:
2922 produced = decode_coding_sjis_big5 (coding, source, destination,
2923 src_bytes, dst_bytes, consumed,
2924 0);
2925 break;
2926
2927 case coding_type_ccl:
2928 produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2929 src_bytes, dst_bytes, consumed);
2930 break;
2931 }
2932
2933 return produced;
2934}
2935
2936/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
2937
2938int
2939encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2940 struct coding_system *coding;
2941 unsigned char *source, *destination;
2942 int src_bytes, dst_bytes;
2943 int *consumed;
2944{
2945 int produced;
2946
4ed46869
KH
2947 switch (coding->type)
2948 {
2949 case coding_type_no_conversion:
2950 label_no_conversion:
2951 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2952 if (produced > 0)
2953 {
2954 bcopy (source, destination, produced);
2955 if (coding->selective)
2956 {
2957 unsigned char *p = destination, *pend = destination + produced;
2958 while (p < pend)
e0e989f6 2959 if (*p++ == '\015') p[-1] = '\n';
4ed46869
KH
2960 }
2961 }
2962 *consumed = produced;
2963 break;
2964
0ef69138
KH
2965 case coding_type_emacs_mule:
2966 case coding_type_undecided:
27901516 2967 case coding_type_raw_text:
4ed46869 2968 if (coding->eol_type == CODING_EOL_LF
0ef69138 2969 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2970 goto label_no_conversion;
2971 produced = encode_eol (coding, source, destination,
2972 src_bytes, dst_bytes, consumed);
2973 break;
2974
2975 case coding_type_sjis:
2976 produced = encode_coding_sjis_big5 (coding, source, destination,
2977 src_bytes, dst_bytes, consumed,
2978 1);
2979 break;
2980
2981 case coding_type_iso2022:
2982 produced = encode_coding_iso2022 (coding, source, destination,
2983 src_bytes, dst_bytes, consumed);
2984 break;
2985
2986 case coding_type_big5:
2987 produced = encode_coding_sjis_big5 (coding, source, destination,
2988 src_bytes, dst_bytes, consumed,
2989 0);
2990 break;
2991
2992 case coding_type_ccl:
2993 produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2994 src_bytes, dst_bytes, consumed);
2995 break;
2996 }
2997
2998 return produced;
2999}
3000
3001#define CONVERSION_BUFFER_EXTRA_ROOM 256
3002
3003/* Return maximum size (bytes) of a buffer enough for decoding
3004 SRC_BYTES of text encoded in CODING. */
3005
3006int
3007decoding_buffer_size (coding, src_bytes)
3008 struct coding_system *coding;
3009 int src_bytes;
3010{
3011 int magnification;
3012
3013 if (coding->type == coding_type_iso2022)
3014 magnification = 3;
3015 else if (coding->type == coding_type_ccl)
3016 magnification = coding->spec.ccl.decoder.buf_magnification;
3017 else
3018 magnification = 2;
3019
3020 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3021}
3022
3023/* Return maximum size (bytes) of a buffer enough for encoding
3024 SRC_BYTES of text to CODING. */
3025
3026int
3027encoding_buffer_size (coding, src_bytes)
3028 struct coding_system *coding;
3029 int src_bytes;
3030{
3031 int magnification;
3032
3033 if (coding->type == coding_type_ccl)
3034 magnification = coding->spec.ccl.encoder.buf_magnification;
3035 else
3036 magnification = 3;
3037
3038 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3039}
3040
3041#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
3042#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
3043#endif
3044
3045char *conversion_buffer;
3046int conversion_buffer_size;
3047
3048/* Return a pointer to a SIZE bytes of buffer to be used for encoding
3049 or decoding. Sufficient memory is allocated automatically. If we
3050 run out of memory, return NULL. */
3051
3052char *
3053get_conversion_buffer (size)
3054 int size;
3055{
3056 if (size > conversion_buffer_size)
3057 {
3058 char *buf;
3059 int real_size = conversion_buffer_size * 2;
3060
3061 while (real_size < size) real_size *= 2;
3062 buf = (char *) xmalloc (real_size);
3063 xfree (conversion_buffer);
3064 conversion_buffer = buf;
3065 conversion_buffer_size = real_size;
3066 }
3067 return conversion_buffer;
3068}
3069
3070\f
3071#ifdef emacs
3072/*** 7. Emacs Lisp library functions ***/
3073
02ba4723 3074DEFUN ("coding-system-spec", Fcoding_system_spec, Scoding_system_spec,
4ed46869 3075 1, 1, 0,
02ba4723 3076 "Return coding-spec of CODING-SYSTEM.\n\
4ed46869
KH
3077If CODING-SYSTEM is not a valid coding-system, return nil.")
3078 (obj)
3079 Lisp_Object obj;
3080{
3081 while (SYMBOLP (obj) && !NILP (obj))
3082 obj = Fget (obj, Qcoding_system);
3083 return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
3084 ? Qnil : obj);
3085}
3086
3087DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
3088 "Return t if OBJECT is nil or a coding-system.\n\
3089See document of make-coding-system for coding-system object.")
3090 (obj)
3091 Lisp_Object obj;
3092{
02ba4723 3093 return ((NILP (obj) || !NILP (Fcoding_system_spec (obj))) ? Qt : Qnil);
4ed46869
KH
3094}
3095
9d991de8
RS
3096DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
3097 Sread_non_nil_coding_system, 1, 1, 0,
e0e989f6 3098 "Read a coding system from the minibuffer, prompting with string PROMPT.")
4ed46869
KH
3099 (prompt)
3100 Lisp_Object prompt;
3101{
e0e989f6 3102 Lisp_Object val;
9d991de8
RS
3103 do
3104 {
02ba4723 3105 val = Fcompleting_read (prompt, Vobarray, Qcoding_system_spec,
61e011d9 3106 Qt, Qnil, Qnil, Qnil, Qnil);
9d991de8
RS
3107 }
3108 while (XSTRING (val)->size == 0);
e0e989f6 3109 return (Fintern (val, Qnil));
4ed46869
KH
3110}
3111
3112DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
e0e989f6 3113 "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
4ed46869
KH
3114 (prompt)
3115 Lisp_Object prompt;
3116{
e0e989f6 3117 Lisp_Object val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
61e011d9 3118 Qt, Qnil, Qnil, Qnil, Qnil);
e0e989f6 3119 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
3120}
3121
3122DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
3123 1, 1, 0,
3124 "Check validity of CODING-SYSTEM.\n\
3125If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
3126CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
3127The value of property should be a vector of length 5.")
3128 (coding_system)
3129 Lisp_Object coding_system;
3130{
3131 CHECK_SYMBOL (coding_system, 0);
3132 if (!NILP (Fcoding_system_p (coding_system)))
3133 return coding_system;
3134 while (1)
02ba4723 3135 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4ed46869
KH
3136}
3137
3138DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
3139 2, 2, 0,
bf9cdd4e
KH
3140 "Detect coding system of the text in the region between START and END.\n\
3141Return a list of possible coding systems ordered by priority.\n\
0ef69138 3142If only ASCII characters are found, it returns `undecided'\n\
bf9cdd4e 3143 or its subsidiary coding system according to a detected end-of-line format.")
4ed46869
KH
3144 (b, e)
3145 Lisp_Object b, e;
3146{
3147 int coding_mask, eol_type;
3148 Lisp_Object val;
3149 int beg, end;
3150
3151 validate_region (&b, &e);
3152 beg = XINT (b), end = XINT (e);
3153 if (beg < GPT && end >= GPT) move_gap (end);
3154
3155 coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
3156 eol_type = detect_eol_type (POS_ADDR (beg), end - beg);
3157
3158 if (coding_mask == CODING_CATEGORY_MASK_ANY)
3159 {
27901516
KH
3160 val = Qundecided;
3161 if (eol_type != CODING_EOL_UNDECIDED
3162 && eol_type != CODING_EOL_INCONSISTENT)
4ed46869 3163 {
27901516 3164 Lisp_Object val2 = Fget (Qundecided, Qeol_type);
4ed46869
KH
3165 if (VECTORP (val2))
3166 val = XVECTOR (val2)->contents[eol_type];
3167 }
3168 }
3169 else
3170 {
3171 Lisp_Object val2;
3172
3173 /* At first, gather possible coding-systems in VAL in a reverse
3174 order. */
3175 val = Qnil;
3176 for (val2 = Vcoding_category_list;
3177 !NILP (val2);
3178 val2 = XCONS (val2)->cdr)
3179 {
3180 int idx
3181 = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
3182 if (coding_mask & (1 << idx))
27901516
KH
3183 {
3184#if 0
3185 /* This code is suppressed until we find a better way to
992f23f2 3186 distinguish raw text file and binary file. */
27901516
KH
3187
3188 if (idx == CODING_CATEGORY_IDX_RAW_TEXT
3189 && eol_type == CODING_EOL_INCONSISTENT)
3190 val = Fcons (Qno_conversion, val);
3191 else
3192#endif /* 0 */
3193 val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
3194 }
4ed46869
KH
3195 }
3196
3197 /* Then, change the order of the list, while getting subsidiary
3198 coding-systems. */
3199 val2 = val;
3200 val = Qnil;
27901516
KH
3201 if (eol_type == CODING_EOL_INCONSISTENT)
3202 eol_type == CODING_EOL_UNDECIDED;
4ed46869
KH
3203 for (; !NILP (val2); val2 = XCONS (val2)->cdr)
3204 {
0ef69138 3205 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
3206 val = Fcons (XCONS (val2)->car, val);
3207 else
3208 {
3209 Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
3210 if (VECTORP (val3))
3211 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
3212 else
3213 val = Fcons (XCONS (val2)->car, val);
3214 }
3215 }
3216 }
3217
3218 return val;
3219}
3220
3221/* Scan text in the region between *BEGP and *ENDP, skip characters
3222 which we never have to encode to (iff ENCODEP is 1) or decode from
3223 coding system CODING at the head and tail, then set BEGP and ENDP
3224 to the addresses of start and end of the text we actually convert. */
3225
3226void
3227shrink_conversion_area (begp, endp, coding, encodep)
3228 unsigned char **begp, **endp;
3229 struct coding_system *coding;
3230 int encodep;
3231{
3232 register unsigned char *beg_addr = *begp, *end_addr = *endp;
3233
3234 if (coding->eol_type != CODING_EOL_LF
0ef69138 3235 && coding->eol_type != CODING_EOL_UNDECIDED)
4ed46869
KH
3236 /* Since we anyway have to convert end-of-line format, it is not
3237 worth skipping at most 100 bytes or so. */
3238 return;
3239
3240 if (encodep) /* for encoding */
3241 {
3242 switch (coding->type)
3243 {
3244 case coding_type_no_conversion:
0ef69138
KH
3245 case coding_type_emacs_mule:
3246 case coding_type_undecided:
27901516 3247 case coding_type_raw_text:
4ed46869
KH
3248 /* We need no conversion. */
3249 *begp = *endp;
3250 return;
3251 case coding_type_ccl:
3252 /* We can't skip any data. */
3253 return;
e0e989f6
KH
3254 case coding_type_iso2022:
3255 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3256 {
3257 unsigned char *bol = beg_addr;
3258 while (beg_addr < end_addr && *beg_addr < 0x80)
3259 {
3260 beg_addr++;
3261 if (*(beg_addr - 1) == '\n')
3262 bol = beg_addr;
3263 }
3264 beg_addr = bol;
3265 goto label_skip_tail;
3266 }
3267 /* fall down ... */
4ed46869
KH
3268 default:
3269 /* We can skip all ASCII characters at the head and tail. */
3270 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
e0e989f6 3271 label_skip_tail:
4ed46869
KH
3272 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3273 break;
3274 }
3275 }
3276 else /* for decoding */
3277 {
3278 switch (coding->type)
3279 {
3280 case coding_type_no_conversion:
3281 /* We need no conversion. */
3282 *begp = *endp;
3283 return;
0ef69138 3284 case coding_type_emacs_mule:
27901516 3285 case coding_type_raw_text:
4ed46869
KH
3286 if (coding->eol_type == CODING_EOL_LF)
3287 {
3288 /* We need no conversion. */
3289 *begp = *endp;
3290 return;
3291 }
3292 /* We can skip all but carriage-return. */
3293 while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
3294 while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
3295 break;
3296 case coding_type_sjis:
3297 case coding_type_big5:
3298 /* We can skip all ASCII characters at the head. */
3299 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3300 /* We can skip all ASCII characters at the tail except for
3301 the second byte of SJIS or BIG5 code. */
3302 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3303 if (end_addr != *endp)
3304 end_addr++;
3305 break;
3306 case coding_type_ccl:
3307 /* We can't skip any data. */
3308 return;
3309 default: /* i.e. case coding_type_iso2022: */
3310 {
3311 unsigned char c;
3312
3313 /* We can skip all ASCII characters except for a few
3314 control codes at the head. */
3315 while (beg_addr < end_addr && (c = *beg_addr) < 0x80
3316 && c != ISO_CODE_CR && c != ISO_CODE_SO
3317 && c != ISO_CODE_SI && c != ISO_CODE_ESC)
3318 beg_addr++;
3319 }
3320 break;
3321 }
3322 }
3323 *begp = beg_addr;
3324 *endp = end_addr;
3325 return;
3326}
3327
3328/* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3329 text between B and E. B and E are buffer position. */
3330
3331Lisp_Object
3332code_convert_region (b, e, coding, encodep)
3333 Lisp_Object b, e;
3334 struct coding_system *coding;
3335 int encodep;
3336{
3337 int beg, end, len, consumed, produced;
3338 char *buf;
3339 unsigned char *begp, *endp;
3340 int pos = PT;
3341
3342 validate_region (&b, &e);
3343 beg = XINT (b), end = XINT (e);
3344 if (beg < GPT && end >= GPT)
3345 move_gap (end);
3346
3347 if (encodep && !NILP (coding->pre_write_conversion))
3348 {
3349 /* We must call a pre-conversion function which may put a new
3350 text to be converted in a new buffer. */
3351 struct buffer *old = current_buffer, *new;
3352
3353 TEMP_SET_PT (beg);
3354 call2 (coding->pre_write_conversion, b, e);
3355 if (old != current_buffer)
3356 {
3357 /* Replace the original text by the text just generated. */
3358 len = ZV - BEGV;
3359 new = current_buffer;
3360 set_buffer_internal (old);
3361 del_range (beg, end);
3362 insert_from_buffer (new, 1, len, 0);
3363 end = beg + len;
3364 }
3365 }
3366
3367 /* We may be able to shrink the conversion region. */
3368 begp = POS_ADDR (beg); endp = begp + (end - beg);
3369 shrink_conversion_area (&begp, &endp, coding, encodep);
3370
3371 if (begp == endp)
3372 /* We need no conversion. */
3373 len = end - beg;
3374 else
3375 {
3376 beg += begp - POS_ADDR (beg);
3377 end = beg + (endp - begp);
3378
3379 if (encodep)
3380 len = encoding_buffer_size (coding, end - beg);
3381 else
3382 len = decoding_buffer_size (coding, end - beg);
3383 buf = get_conversion_buffer (len);
3384
3385 coding->last_block = 1;
3386 produced = (encodep
3387 ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3388 &consumed)
3389 : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3390 &consumed));
3391
3392 len = produced + (beg - XINT (b)) + (XINT (e) - end);
3393
3394 TEMP_SET_PT (beg);
3395 insert (buf, produced);
3396 del_range (PT, PT + end - beg);
3397 if (pos >= end)
3398 pos = PT + (pos - end);
3399 else if (pos > beg)
3400 pos = beg;
3401 TEMP_SET_PT (pos);
3402 }
3403
3404 if (!encodep && !NILP (coding->post_read_conversion))
3405 {
3406 /* We must call a post-conversion function which may alter
3407 the text just converted. */
3408 Lisp_Object insval;
3409
3410 beg = XINT (b);
3411 TEMP_SET_PT (beg);
3412 insval = call1 (coding->post_read_conversion, make_number (len));
3413 CHECK_NUMBER (insval, 0);
3414 len = XINT (insval);
3415 }
3416
3417 return make_number (len);
3418}
3419
3420Lisp_Object
e0e989f6
KH
3421code_convert_string (str, coding, encodep, nocopy)
3422 Lisp_Object str, nocopy;
4ed46869
KH
3423 struct coding_system *coding;
3424 int encodep;
3425{
3426 int len, consumed, produced;
3427 char *buf;
3428 unsigned char *begp, *endp;
3429 int head_skip, tail_skip;
3430 struct gcpro gcpro1;
3431
3432 if (encodep && !NILP (coding->pre_write_conversion)
3433 || !encodep && !NILP (coding->post_read_conversion))
3434 {
3435 /* Since we have to call Lisp functions which assume target text
3436 is in a buffer, after setting a temporary buffer, call
3437 code_convert_region. */
3438 int count = specpdl_ptr - specpdl;
3439 int len = XSTRING (str)->size;
3440 Lisp_Object result;
3441 struct buffer *old = current_buffer;
3442
3443 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3444 temp_output_buffer_setup (" *code-converting-work*");
3445 set_buffer_internal (XBUFFER (Vstandard_output));
3446 insert_from_string (str, 0, len, 0);
3447 code_convert_region (make_number (BEGV), make_number (ZV),
3448 coding, encodep);
3449 result = make_buffer_string (BEGV, ZV, 0);
3450 set_buffer_internal (old);
3451 return unbind_to (count, result);
3452 }
3453
3454 /* We may be able to shrink the conversion region. */
3455 begp = XSTRING (str)->data;
3456 endp = begp + XSTRING (str)->size;
3457 shrink_conversion_area (&begp, &endp, coding, encodep);
3458
3459 if (begp == endp)
3460 /* We need no conversion. */
e0e989f6 3461 return (NILP (nocopy) ? Fcopy_sequence (str) : str);
4ed46869
KH
3462
3463 head_skip = begp - XSTRING (str)->data;
3464 tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3465
3466 GCPRO1 (str);
3467
3468 if (encodep)
3469 len = encoding_buffer_size (coding, endp - begp);
3470 else
3471 len = decoding_buffer_size (coding, endp - begp);
3472 buf = get_conversion_buffer (len + head_skip + tail_skip);
3473
3474 bcopy (XSTRING (str)->data, buf, head_skip);
3475 coding->last_block = 1;
3476 produced = (encodep
3477 ? encode_coding (coding, XSTRING (str)->data + head_skip,
3478 buf + head_skip, endp - begp, len, &consumed)
3479 : decode_coding (coding, XSTRING (str)->data + head_skip,
3480 buf + head_skip, endp - begp, len, &consumed));
3481 bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3482 buf + head_skip + produced,
3483 tail_skip);
3484
3485 UNGCPRO;
3486
3487 return make_string (buf, head_skip + produced + tail_skip);
3488}
3489
3490DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
e0e989f6
KH
3491 3, 3, "r\nzCoding system: ",
3492 "Decode current region by specified coding system.\n\
3493When called from a program, takes three arguments:\n\
3494START, END, and CODING-SYSTEM. START END are buffer positions.\n\
4ed46869
KH
3495Return length of decoded text.")
3496 (b, e, coding_system)
3497 Lisp_Object b, e, coding_system;
3498{
3499 struct coding_system coding;
3500
3501 CHECK_NUMBER_COERCE_MARKER (b, 0);
3502 CHECK_NUMBER_COERCE_MARKER (e, 1);
3503 CHECK_SYMBOL (coding_system, 2);
3504
e0e989f6
KH
3505 if (NILP (coding_system))
3506 return make_number (XFASTINT (e) - XFASTINT (b));
4ed46869
KH
3507 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3508 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3509
3510 return code_convert_region (b, e, &coding, 0);
3511}
3512
3513DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
e0e989f6
KH
3514 3, 3, "r\nzCoding system: ",
3515 "Encode current region by specified coding system.\n\
3516When called from a program, takes three arguments:\n\
3517START, END, and CODING-SYSTEM. START END are buffer positions.\n\
4ed46869
KH
3518Return length of encoded text.")
3519 (b, e, coding_system)
3520 Lisp_Object b, e, coding_system;
3521{
3522 struct coding_system coding;
3523
3524 CHECK_NUMBER_COERCE_MARKER (b, 0);
3525 CHECK_NUMBER_COERCE_MARKER (e, 1);
3526 CHECK_SYMBOL (coding_system, 2);
3527
e0e989f6
KH
3528 if (NILP (coding_system))
3529 return make_number (XFASTINT (e) - XFASTINT (b));
4ed46869
KH
3530 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3531 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3532
3533 return code_convert_region (b, e, &coding, 1);
3534}
3535
3536DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
e0e989f6
KH
3537 2, 3, 0,
3538 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3539Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3540of decoding.")
3541 (string, coding_system, nocopy)
3542 Lisp_Object string, coding_system, nocopy;
4ed46869
KH
3543{
3544 struct coding_system coding;
3545
3546 CHECK_STRING (string, 0);
3547 CHECK_SYMBOL (coding_system, 1);
3548
e0e989f6
KH
3549 if (NILP (coding_system))
3550 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869
KH
3551 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3552 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3553
e0e989f6 3554 return code_convert_string (string, &coding, 0, nocopy);
4ed46869
KH
3555}
3556
3557DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
e0e989f6
KH
3558 2, 3, 0,
3559 "Encode STRING to CODING-SYSTEM, and return the result.\n\
3560Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3561of encoding.")
3562 (string, coding_system, nocopy)
3563 Lisp_Object string, coding_system, nocopy;
4ed46869
KH
3564{
3565 struct coding_system coding;
3566
3567 CHECK_STRING (string, 0);
3568 CHECK_SYMBOL (coding_system, 1);
3569
e0e989f6
KH
3570 if (NILP (coding_system))
3571 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869
KH
3572 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3573 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3574
e0e989f6 3575 return code_convert_string (string, &coding, 1, nocopy);
4ed46869
KH
3576}
3577
3578DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
e0e989f6 3579 "Decode a JISX0208 character of shift-jis encoding.\n\
4ed46869
KH
3580CODE is the character code in SJIS.\n\
3581Return the corresponding character.")
3582 (code)
3583 Lisp_Object code;
3584{
3585 unsigned char c1, c2, s1, s2;
3586 Lisp_Object val;
3587
3588 CHECK_NUMBER (code, 0);
3589 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3590 DECODE_SJIS (s1, s2, c1, c2);
3591 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3592 return val;
3593}
3594
3595DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3596 "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3597Return the corresponding character code in SJIS.")
3598 (ch)
3599 Lisp_Object ch;
3600{
bcf26d6a 3601 int charset, c1, c2, s1, s2;
4ed46869
KH
3602 Lisp_Object val;
3603
3604 CHECK_NUMBER (ch, 0);
3605 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3606 if (charset == charset_jisx0208)
3607 {
3608 ENCODE_SJIS (c1, c2, s1, s2);
bcf26d6a 3609 XSETFASTINT (val, (s1 << 8) | s2);
4ed46869
KH
3610 }
3611 else
3612 XSETFASTINT (val, 0);
3613 return val;
3614}
3615
3616DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3617 "Decode a Big5 character CODE of BIG5 coding-system.\n\
3618CODE is the character code in BIG5.\n\
3619Return the corresponding character.")
3620 (code)
3621 Lisp_Object code;
3622{
3623 int charset;
3624 unsigned char b1, b2, c1, c2;
3625 Lisp_Object val;
3626
3627 CHECK_NUMBER (code, 0);
3628 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3629 DECODE_BIG5 (b1, b2, charset, c1, c2);
3630 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3631 return val;
3632}
3633
3634DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3635 "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3636Return the corresponding character code in Big5.")
3637 (ch)
3638 Lisp_Object ch;
3639{
bcf26d6a 3640 int charset, c1, c2, b1, b2;
4ed46869
KH
3641 Lisp_Object val;
3642
3643 CHECK_NUMBER (ch, 0);
3644 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3645 if (charset == charset_big5_1 || charset == charset_big5_2)
3646 {
3647 ENCODE_BIG5 (charset, c1, c2, b1, b2);
bcf26d6a 3648 XSETFASTINT (val, (b1 << 8) | b2);
4ed46869
KH
3649 }
3650 else
3651 XSETFASTINT (val, 0);
3652 return val;
3653}
3654
1ba9e4ab
KH
3655DEFUN ("set-terminal-coding-system-internal",
3656 Fset_terminal_coding_system_internal,
3657 Sset_terminal_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
3658 (coding_system)
3659 Lisp_Object coding_system;
3660{
3661 CHECK_SYMBOL (coding_system, 0);
3662 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
6e85d753
KH
3663 /* We had better not send unexpected characters to terminal. */
3664 terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
3665
4ed46869
KH
3666 return Qnil;
3667}
3668
c4825358
KH
3669DEFUN ("set-safe-terminal-coding-system-internal",
3670 Fset_safe_terminal_coding_system_internal,
3671 Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
3672 (coding_system)
3673 Lisp_Object coding_system;
3674{
3675 CHECK_SYMBOL (coding_system, 0);
3676 setup_coding_system (Fcheck_coding_system (coding_system),
3677 &safe_terminal_coding);
3678 return Qnil;
3679}
3680
4ed46869
KH
3681DEFUN ("terminal-coding-system",
3682 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3683 "Return coding-system of your terminal.")
3684 ()
3685{
3686 return terminal_coding.symbol;
3687}
3688
1ba9e4ab
KH
3689DEFUN ("set-keyboard-coding-system-internal",
3690 Fset_keyboard_coding_system_internal,
3691 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
3692 (coding_system)
3693 Lisp_Object coding_system;
3694{
3695 CHECK_SYMBOL (coding_system, 0);
3696 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3697 return Qnil;
3698}
3699
3700DEFUN ("keyboard-coding-system",
3701 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3702 "Return coding-system of what is sent from terminal keyboard.")
3703 ()
3704{
3705 return keyboard_coding.symbol;
3706}
3707
3708\f
a5d301df
KH
3709DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
3710 Sfind_operation_coding_system, 1, MANY, 0,
3711 "Choose a coding system for an operation based on the target name.\n\
9ce27fde
KH
3712The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
3713DECODING-SYSTEM is the coding system to use for decoding\n\
3714\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
3715for encoding (in case OPERATION does encoding).\n\
ccdb79f5
RS
3716\n\
3717The first argument OPERATION specifies an I/O primitive:\n\
3718 For file I/O, `insert-file-contents' or `write-region'.\n\
3719 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3720 For network I/O, `open-network-stream'.\n\
3721\n\
3722The remaining arguments should be the same arguments that were passed\n\
3723to the primitive. Depending on which primitive, one of those arguments\n\
3724is selected as the TARGET. For example, if OPERATION does file I/O,\n\
3725whichever argument specifies the file name is TARGET.\n\
3726\n\
3727TARGET has a meaning which depends on OPERATION:\n\
4ed46869
KH
3728 For file I/O, TARGET is a file name.\n\
3729 For process I/O, TARGET is a process name.\n\
3730 For network I/O, TARGET is a service name or a port number\n\
3731\n\
02ba4723
KH
3732This function looks up what specified for TARGET in,\n\
3733`file-coding-system-alist', `process-coding-system-alist',\n\
3734or `network-coding-system-alist' depending on OPERATION.\n\
3735They may specify a coding system, a cons of coding systems,\n\
3736or a function symbol to call.\n\
3737In the last case, we call the function with one argument,\n\
9ce27fde 3738which is a list of all the arguments given to this function.")
4ed46869
KH
3739 (nargs, args)
3740 int nargs;
3741 Lisp_Object *args;
3742{
3743 Lisp_Object operation, target_idx, target, val;
3744 register Lisp_Object chain;
3745
3746 if (nargs < 2)
3747 error ("Too few arguments");
3748 operation = args[0];
3749 if (!SYMBOLP (operation)
3750 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3751 error ("Invalid first arguement");
3752 if (nargs < 1 + XINT (target_idx))
3753 error ("Too few arguments for operation: %s",
3754 XSYMBOL (operation)->name->data);
3755 target = args[XINT (target_idx) + 1];
3756 if (!(STRINGP (target)
3757 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3758 error ("Invalid %dth argument", XINT (target_idx) + 1);
3759
2e34157c
RS
3760 chain = ((EQ (operation, Qinsert_file_contents)
3761 || EQ (operation, Qwrite_region))
02ba4723 3762 ? Vfile_coding_system_alist
2e34157c 3763 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
3764 ? Vnetwork_coding_system_alist
3765 : Vprocess_coding_system_alist));
4ed46869
KH
3766 if (NILP (chain))
3767 return Qnil;
3768
02ba4723 3769 for (; CONSP (chain); chain = XCONS (chain)->cdr)
4ed46869
KH
3770 {
3771 Lisp_Object elt = XCONS (chain)->car;
3772
3773 if (CONSP (elt)
3774 && ((STRINGP (target)
3775 && STRINGP (XCONS (elt)->car)
3776 && fast_string_match (XCONS (elt)->car, target) >= 0)
3777 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
02ba4723
KH
3778 {
3779 val = XCONS (elt)->cdr;
3780 if (CONSP (val))
3781 return val;
3782 if (! SYMBOLP (val))
3783 return Qnil;
3784 if (! NILP (Fcoding_system_p (val)))
3785 return Fcons (val, val);
465edc86 3786 if (!NILP (Ffboundp (val)))
5d632ccf 3787 return call1 (val, Flist (nargs, args));
02ba4723
KH
3788 return Qnil;
3789 }
4ed46869
KH
3790 }
3791 return Qnil;
3792}
3793
3794#endif /* emacs */
3795
3796\f
3797/*** 8. Post-amble ***/
3798
3799init_coding_once ()
3800{
3801 int i;
3802
0ef69138 3803 /* Emacs' internal format specific initialize routine. */
4ed46869
KH
3804 for (i = 0; i <= 0x20; i++)
3805 emacs_code_class[i] = EMACS_control_code;
3806 emacs_code_class[0x0A] = EMACS_linefeed_code;
3807 emacs_code_class[0x0D] = EMACS_carriage_return_code;
3808 for (i = 0x21 ; i < 0x7F; i++)
3809 emacs_code_class[i] = EMACS_ascii_code;
3810 emacs_code_class[0x7F] = EMACS_control_code;
3811 emacs_code_class[0x80] = EMACS_leading_code_composition;
3812 for (i = 0x81; i < 0xFF; i++)
3813 emacs_code_class[i] = EMACS_invalid_code;
3814 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3815 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3816 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3817 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3818
3819 /* ISO2022 specific initialize routine. */
3820 for (i = 0; i < 0x20; i++)
3821 iso_code_class[i] = ISO_control_code;
3822 for (i = 0x21; i < 0x7F; i++)
3823 iso_code_class[i] = ISO_graphic_plane_0;
3824 for (i = 0x80; i < 0xA0; i++)
3825 iso_code_class[i] = ISO_control_code;
3826 for (i = 0xA1; i < 0xFF; i++)
3827 iso_code_class[i] = ISO_graphic_plane_1;
3828 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3829 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3830 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3831 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3832 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3833 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3834 iso_code_class[ISO_CODE_ESC] = ISO_escape;
3835 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3836 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3837 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3838
e0e989f6
KH
3839 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3840 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3841
3842 setup_coding_system (Qnil, &keyboard_coding);
3843 setup_coding_system (Qnil, &terminal_coding);
c4825358 3844 setup_coding_system (Qnil, &safe_terminal_coding);
9ce27fde
KH
3845
3846#if defined (MSDOS) || defined (WINDOWSNT)
3847 system_eol_type = CODING_EOL_CRLF;
3848#else
3849 system_eol_type = CODING_EOL_LF;
3850#endif
e0e989f6
KH
3851}
3852
3853#ifdef emacs
3854
3855syms_of_coding ()
3856{
3857 Qtarget_idx = intern ("target-idx");
3858 staticpro (&Qtarget_idx);
3859
9ce27fde 3860 /* Target FILENAME is the first argument. */
e0e989f6 3861 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 3862 /* Target FILENAME is the third argument. */
e0e989f6
KH
3863 Fput (Qwrite_region, Qtarget_idx, make_number (2));
3864
3865 Qcall_process = intern ("call-process");
3866 staticpro (&Qcall_process);
9ce27fde 3867 /* Target PROGRAM is the first argument. */
e0e989f6
KH
3868 Fput (Qcall_process, Qtarget_idx, make_number (0));
3869
3870 Qcall_process_region = intern ("call-process-region");
3871 staticpro (&Qcall_process_region);
9ce27fde 3872 /* Target PROGRAM is the third argument. */
e0e989f6
KH
3873 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3874
3875 Qstart_process = intern ("start-process");
3876 staticpro (&Qstart_process);
9ce27fde 3877 /* Target PROGRAM is the third argument. */
e0e989f6
KH
3878 Fput (Qstart_process, Qtarget_idx, make_number (2));
3879
3880 Qopen_network_stream = intern ("open-network-stream");
3881 staticpro (&Qopen_network_stream);
9ce27fde 3882 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
3883 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3884
4ed46869
KH
3885 Qcoding_system = intern ("coding-system");
3886 staticpro (&Qcoding_system);
3887
3888 Qeol_type = intern ("eol-type");
3889 staticpro (&Qeol_type);
3890
3891 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3892 staticpro (&Qbuffer_file_coding_system);
3893
3894 Qpost_read_conversion = intern ("post-read-conversion");
3895 staticpro (&Qpost_read_conversion);
3896
3897 Qpre_write_conversion = intern ("pre-write-conversion");
3898 staticpro (&Qpre_write_conversion);
3899
27901516
KH
3900 Qno_conversion = intern ("no-conversion");
3901 staticpro (&Qno_conversion);
3902
3903 Qundecided = intern ("undecided");
3904 staticpro (&Qundecided);
3905
02ba4723
KH
3906 Qcoding_system_spec = intern ("coding-system-spec");
3907 staticpro (&Qcoding_system_spec);
4ed46869
KH
3908
3909 Qcoding_system_p = intern ("coding-system-p");
3910 staticpro (&Qcoding_system_p);
3911
3912 Qcoding_system_error = intern ("coding-system-error");
3913 staticpro (&Qcoding_system_error);
3914
3915 Fput (Qcoding_system_error, Qerror_conditions,
3916 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3917 Fput (Qcoding_system_error, Qerror_message,
9ce27fde 3918 build_string ("Invalid coding system"));
4ed46869
KH
3919
3920 Qcoding_category_index = intern ("coding-category-index");
3921 staticpro (&Qcoding_category_index);
3922
3923 {
3924 int i;
3925 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3926 {
3927 coding_category_table[i] = intern (coding_category_name[i]);
3928 staticpro (&coding_category_table[i]);
3929 Fput (coding_category_table[i], Qcoding_category_index,
3930 make_number (i));
3931 }
3932 }
3933
bdd9fb48
KH
3934 Qcharacter_unification_table = intern ("character-unification-table");
3935 staticpro (&Qcharacter_unification_table);
3936 Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
3937 make_number (0));
3938
a5d301df
KH
3939 Qcharacter_unification_table_for_decode
3940 = intern ("character-unification-table-for-decode");
3941 staticpro (&Qcharacter_unification_table_for_decode);
3942
3943 Qcharacter_unification_table_for_encode
3944 = intern ("character-unification-table-for-encode");
3945 staticpro (&Qcharacter_unification_table_for_encode);
3946
9ce27fde
KH
3947 Qemacs_mule = intern ("emacs-mule");
3948 staticpro (&Qemacs_mule);
3949
02ba4723 3950 defsubr (&Scoding_system_spec);
4ed46869
KH
3951 defsubr (&Scoding_system_p);
3952 defsubr (&Sread_coding_system);
3953 defsubr (&Sread_non_nil_coding_system);
3954 defsubr (&Scheck_coding_system);
3955 defsubr (&Sdetect_coding_region);
3956 defsubr (&Sdecode_coding_region);
3957 defsubr (&Sencode_coding_region);
3958 defsubr (&Sdecode_coding_string);
3959 defsubr (&Sencode_coding_string);
3960 defsubr (&Sdecode_sjis_char);
3961 defsubr (&Sencode_sjis_char);
3962 defsubr (&Sdecode_big5_char);
3963 defsubr (&Sencode_big5_char);
1ba9e4ab 3964 defsubr (&Sset_terminal_coding_system_internal);
c4825358 3965 defsubr (&Sset_safe_terminal_coding_system_internal);
4ed46869 3966 defsubr (&Sterminal_coding_system);
1ba9e4ab 3967 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 3968 defsubr (&Skeyboard_coding_system);
a5d301df 3969 defsubr (&Sfind_operation_coding_system);
4ed46869
KH
3970
3971 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3972 "List of coding-categories (symbols) ordered by priority.");
3973 {
3974 int i;
3975
3976 Vcoding_category_list = Qnil;
3977 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3978 Vcoding_category_list
3979 = Fcons (coding_category_table[i], Vcoding_category_list);
3980 }
3981
3982 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
10bff6f1 3983 "Specify the coding system for read operations.\n\
2ebb362d 3984It is useful to bind this variable with `let', but do not set it globally.\n\
4ed46869 3985If the value is a coding system, it is used for decoding on read operation.\n\
a67a9c66 3986If not, an appropriate element is used from one of the coding system alists:\n\
10bff6f1 3987There are three such tables, `file-coding-system-alist',\n\
a67a9c66 3988`process-coding-system-alist', and `network-coding-system-alist'.");
4ed46869
KH
3989 Vcoding_system_for_read = Qnil;
3990
3991 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
10bff6f1 3992 "Specify the coding system for write operations.\n\
2ebb362d 3993It is useful to bind this variable with `let', but do not set it globally.\n\
4ed46869 3994If the value is a coding system, it is used for encoding on write operation.\n\
a67a9c66 3995If not, an appropriate element is used from one of the coding system alists:\n\
10bff6f1 3996There are three such tables, `file-coding-system-alist',\n\
a67a9c66 3997`process-coding-system-alist', and `network-coding-system-alist'.");
4ed46869
KH
3998 Vcoding_system_for_write = Qnil;
3999
4000 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
a67a9c66 4001 "Coding system used in the latest file or process I/O.");
4ed46869
KH
4002 Vlast_coding_system_used = Qnil;
4003
9ce27fde
KH
4004 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
4005 "*Non-nil inhibit code conversion of end-of-line format in any cases.");
4006 inhibit_eol_conversion = 0;
4007
02ba4723
KH
4008 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
4009 "Alist to decide a coding system to use for a file I/O operation.\n\
4010The format is ((PATTERN . VAL) ...),\n\
4011where PATTERN is a regular expression matching a file name,\n\
4012VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4013If VAL is a coding system, it is used for both decoding and encoding\n\
4014the file contents.\n\
4015If VAL is a cons of coding systems, the car part is used for decoding,\n\
4016and the cdr part is used for encoding.\n\
4017If VAL is a function symbol, the function must return a coding system\n\
4018or a cons of coding systems which are used as above.\n\
e0e989f6 4019\n\
9ce27fde 4020See also the function `find-operation-coding-system'.");
02ba4723
KH
4021 Vfile_coding_system_alist = Qnil;
4022
4023 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
4024 "Alist to decide a coding system to use for a process I/O operation.\n\
4025The format is ((PATTERN . VAL) ...),\n\
4026where PATTERN is a regular expression matching a program name,\n\
4027VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4028If VAL is a coding system, it is used for both decoding what received\n\
4029from the program and encoding what sent to the program.\n\
4030If VAL is a cons of coding systems, the car part is used for decoding,\n\
4031and the cdr part is used for encoding.\n\
4032If VAL is a function symbol, the function must return a coding system\n\
4033or a cons of coding systems which are used as above.\n\
4ed46869 4034\n\
9ce27fde 4035See also the function `find-operation-coding-system'.");
02ba4723
KH
4036 Vprocess_coding_system_alist = Qnil;
4037
4038 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
4039 "Alist to decide a coding system to use for a network I/O operation.\n\
4040The format is ((PATTERN . VAL) ...),\n\
4041where PATTERN is a regular expression matching a network service name\n\
4042or is a port number to connect to,\n\
4043VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4044If VAL is a coding system, it is used for both decoding what received\n\
4045from the network stream and encoding what sent to the network stream.\n\
4046If VAL is a cons of coding systems, the car part is used for decoding,\n\
4047and the cdr part is used for encoding.\n\
4048If VAL is a function symbol, the function must return a coding system\n\
4049or a cons of coding systems which are used as above.\n\
4ed46869 4050\n\
9ce27fde 4051See also the function `find-operation-coding-system'.");
02ba4723 4052 Vnetwork_coding_system_alist = Qnil;
4ed46869
KH
4053
4054 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
4055 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
458822a0 4056 eol_mnemonic_unix = ':';
4ed46869
KH
4057
4058 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
4059 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
458822a0 4060 eol_mnemonic_dos = '\\';
4ed46869
KH
4061
4062 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
4063 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
458822a0 4064 eol_mnemonic_mac = '/';
4ed46869
KH
4065
4066 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
4067 "Mnemonic character indicating end-of-line format is not yet decided.");
458822a0 4068 eol_mnemonic_undecided = ':';
4ed46869 4069
bdd9fb48
KH
4070 DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
4071 "Non-nil means ISO 2022 encoder/decoder do character unification.");
4072 Venable_character_unification = Qt;
4073
a5d301df
KH
4074 DEFVAR_LISP ("standard-character-unification-table-for-decode",
4075 &Vstandard_character_unification_table_for_decode,
bdd9fb48 4076 "Table for unifying characters when reading.");
a5d301df 4077 Vstandard_character_unification_table_for_decode = Qnil;
bdd9fb48 4078
a5d301df
KH
4079 DEFVAR_LISP ("standard-character-unification-table-for-encode",
4080 &Vstandard_character_unification_table_for_encode,
bdd9fb48 4081 "Table for unifying characters when writing.");
a5d301df 4082 Vstandard_character_unification_table_for_encode = Qnil;
4ed46869
KH
4083
4084 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
4085 "Alist of charsets vs revision numbers.\n\
4086While encoding, if a charset (car part of an element) is found,\n\
4087designate it with the escape sequence identifing revision (cdr part of the element).");
4088 Vcharset_revision_alist = Qnil;
02ba4723
KH
4089
4090 DEFVAR_LISP ("default-process-coding-system",
4091 &Vdefault_process_coding_system,
4092 "Cons of coding systems used for process I/O by default.\n\
4093The car part is used for decoding a process output,\n\
4094the cdr part is used for encoding a text to be sent to a process.");
4095 Vdefault_process_coding_system = Qnil;
c4825358 4096
3f003981
KH
4097 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
4098 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
c4825358
KH
4099This is a vector of length 256.\n\
4100If Nth element is non-nil, the existence of code N in a file\n\
4101(or output of subprocess) doesn't prevent it to be detected as\n\
3f003981
KH
4102a coding system of ISO 2022 variant which has a flag\n\
4103`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
c4825358
KH
4104or reading output of a subprocess.\n\
4105Only 128th through 159th elements has a meaning.");
3f003981 4106 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
4ed46869
KH
4107}
4108
4109#endif /* emacs */