(struct iso2022_spec): Member requested_designation is
[bpt/emacs.git] / src / coding.c
CommitLineData
4ed46869
KH
1/* Coding system handler (conversion, detection, and etc).
2 Ver.1.0.
4ed46869
KH
3 Copyright (C) 1995 Free Software Foundation, Inc.
4 Copyright (C) 1995 Electrotechnical Laboratory, JAPAN.
5
369314dc
KH
6This file is part of GNU Emacs.
7
8GNU Emacs is free software; you can redistribute it and/or modify
9it under the terms of the GNU General Public License as published by
10the Free Software Foundation; either version 2, or (at your option)
11any later version.
4ed46869 12
369314dc
KH
13GNU Emacs is distributed in the hope that it will be useful,
14but WITHOUT ANY WARRANTY; without even the implied warranty of
15MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16GNU General Public License for more details.
4ed46869 17
369314dc
KH
18You should have received a copy of the GNU General Public License
19along with GNU Emacs; see the file COPYING. If not, write to
20the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21Boston, MA 02111-1307, USA. */
4ed46869
KH
22
23/*** TABLE OF CONTENTS ***
24
25 1. Preamble
0ef69138 26 2. Emacs' internal format (emacs-mule) handlers
4ed46869
KH
27 3. ISO2022 handlers
28 4. Shift-JIS and BIG5 handlers
29 5. End-of-line handlers
30 6. C library functions
31 7. Emacs Lisp library functions
32 8. Post-amble
33
34*/
35
36/*** GENERAL NOTE on CODING SYSTEM ***
37
38 Coding system is an encoding mechanism of one or more character
39 sets. Here's a list of coding systems which Emacs can handle. When
40 we say "decode", it means converting some other coding system to
0ef69138
KH
41 Emacs' internal format (emacs-mule), and when we say "encode",
42 it means converting the coding system emacs-mule to some other
43 coding system.
4ed46869 44
0ef69138 45 0. Emacs' internal format (emacs-mule)
4ed46869
KH
46
47 Emacs itself holds a multi-lingual character in a buffer and a string
48 in a special format. Details are described in the section 2.
49
50 1. ISO2022
51
52 The most famous coding system for multiple character sets. X's
53 Compound Text, various EUCs (Extended Unix Code), and such coding
54 systems used in Internet communication as ISO-2022-JP are all
55 variants of ISO2022. Details are described in the section 3.
56
57 2. SJIS (or Shift-JIS or MS-Kanji-Code)
58
59 A coding system to encode character sets: ASCII, JISX0201, and
60 JISX0208. Widely used for PC's in Japan. Details are described in
61 the section 4.
62
63 3. BIG5
64
65 A coding system to encode character sets: ASCII and Big5. Widely
66 used by Chinese (mainly in Taiwan and Hong Kong). Details are
67 described in the section 4. In this file, when written as "BIG5"
68 (all uppercase), it means the coding system, and when written as
69 "Big5" (capitalized), it means the character set.
70
71 4. Else
72
73 If a user wants to read/write a text encoded in a coding system not
74 listed above, he can supply a decoder and an encoder for it in CCL
75 (Code Conversion Language) programs. Emacs executes the CCL program
76 while reading/writing.
77
78 Emacs represents a coding-system by a Lisp symbol that has a property
79 `coding-system'. But, before actually using the coding-system, the
80 information about it is set in a structure of type `struct
81 coding_system' for rapid processing. See the section 6 for more
82 detail.
83
84*/
85
86/*** GENERAL NOTES on END-OF-LINE FORMAT ***
87
88 How end-of-line of a text is encoded depends on a system. For
89 instance, Unix's format is just one byte of `line-feed' code,
90 whereas DOS's format is two bytes sequence of `carriage-return' and
91 `line-feed' codes. MacOS's format is one byte of `carriage-return'.
92
93 Since how characters in a text are encoded and how end-of-line is
94 encoded are independent, any coding system described above can take
95 any format of end-of-line. So, Emacs has information of format of
96 end-of-line in each coding-system. See the section 6 for more
97 detail.
98
99*/
100
101/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
102
103 These functions check if a text between SRC and SRC_END is encoded
104 in the coding system category XXX. Each returns an integer value in
105 which appropriate flag bits for the category XXX are set. The flag
106 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
107 template of these functions. */
108#if 0
109int
0ef69138 110detect_coding_emacs_mule (src, src_end)
4ed46869
KH
111 unsigned char *src, *src_end;
112{
113 ...
114}
115#endif
116
117/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
118
119 These functions decode SRC_BYTES length text at SOURCE encoded in
0ef69138
KH
120 CODING to Emacs' internal format (emacs-mule). The resulting text
121 goes to a place pointed by DESTINATION, the length of which should
122 not exceed DST_BYTES. The number of bytes actually processed is returned as
123 *CONSUMED. The return value is the length of the decoded text.
124 Below is a template of these functions. */
4ed46869
KH
125#if 0
126decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
127 struct coding_system *coding;
128 unsigned char *source, *destination;
129 int src_bytes, dst_bytes;
130 int *consumed;
131{
132 ...
133}
134#endif
135
136/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
137
0ef69138
KH
138 These functions encode SRC_BYTES length text at SOURCE of Emacs'
139 internal format (emacs-mule) to CODING. The resulting text goes to
140 a place pointed by DESTINATION, the length of which should not
141 exceed DST_BYTES. The number of bytes actually processed is returned as
142 *CONSUMED. The return value is the length of the encoded text.
143 Below is a template of these functions. */
4ed46869
KH
144#if 0
145encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
146 struct coding_system *coding;
147 unsigned char *source, *destination;
148 int src_bytes, dst_bytes;
149 int *consumed;
150{
151 ...
152}
153#endif
154
155/*** COMMONLY USED MACROS ***/
156
157/* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
158 THREE_MORE_BYTES safely get one, two, and three bytes from the
159 source text respectively. If there are not enough bytes in the
160 source, they jump to `label_end_of_loop'. The caller should set
161 variables `src' and `src_end' to appropriate areas in advance. */
162
/* Fetch the next source byte into C1 and advance `src' by one.  If
   `src' has already reached `src_end', jump to `label_end_of_loop'
   without touching C1 or `src'.  */
#define ONE_MORE_BYTE(c1)               \
  do {                                  \
    if (src < src_end)                  \
      (c1) = *src++;                    \
    else                                \
      goto label_end_of_loop;           \
  } while (0)
170
/* Fetch the next two source bytes into C1 and C2 (in that order) and
   advance `src' by two.  If fewer than two bytes remain before
   `src_end', jump to `label_end_of_loop' without consuming anything.
   The remaining length is computed as a pointer difference so that no
   pointer beyond `src_end' is ever formed.  */
#define TWO_MORE_BYTES(c1, c2)          \
  do {                                  \
    if (src_end - src >= 2)             \
      {                                 \
        (c1) = *src++;                  \
        (c2) = *src++;                  \
      }                                 \
    else                                \
      goto label_end_of_loop;           \
  } while (0)
178
/* Fetch the next three source bytes into C1, C2 and C3 (in that order)
   and advance `src' by three.  If fewer than three bytes remain before
   `src_end', jump to `label_end_of_loop' without consuming anything.
   The remaining length is computed as a pointer difference so that no
   pointer beyond `src_end' is ever formed.  */
#define THREE_MORE_BYTES(c1, c2, c3)    \
  do {                                  \
    if (src_end - src >= 3)             \
      {                                 \
        (c1) = *src++;                  \
        (c2) = *src++;                  \
        (c3) = *src++;                  \
      }                                 \
    else                                \
      goto label_end_of_loop;           \
  } while (0)
186
187/* The following three macros DECODE_CHARACTER_ASCII,
188 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
189 the multi-byte form of a character of each class at the place
190 pointed by `dst'. The caller should set the variable `dst' to
191 point to an appropriate area and the variable `coding' to point to
192 the coding-system of the currently decoding text in advance. */
193
/* Store one ASCII character C at `dst', advancing `dst'.  Outside a
   composition the single byte C is written as is.  While
   COMPOSING_P (coding->composing) holds, the two bytes 0xA0 and
   C | 0x80 are written instead.  */

#define DECODE_CHARACTER_ASCII(c)               \
  do {                                          \
    if (!COMPOSING_P (coding->composing))       \
      *dst++ = (c);                             \
    else                                        \
      {                                         \
        *dst++ = 0xA0;                          \
        *dst++ = (c) | 0x80;                    \
      }                                         \
  } while (0)
203
/* Store at `dst' one DIMENSION1 character whose charset is CHARSET and
   whose position-code is C, advancing `dst'.  The bytes written are:
     - the base leading-code of CHARSET (plus 0x20 while
       COMPOSING_P (coding->composing) holds),
     - the extended leading-code of CHARSET, only if it is nonzero,
     - C | 0x80.  */

#define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
  do {                                                                  \
    unsigned char base_code = CHARSET_LEADING_CODE_BASE (charset);      \
    unsigned char ext_code;                                             \
    if (COMPOSING_P (coding->composing))                                \
      *dst++ = base_code + 0x20;                                        \
    else                                                                \
      *dst++ = base_code;                                               \
    /* The extended leading-code is looked up after the base code has   \
       been written, as before.  */                                     \
    ext_code = CHARSET_LEADING_CODE_EXT (charset);                      \
    if (ext_code != 0)                                                  \
      *dst++ = ext_code;                                                \
    *dst++ = (c) | 0x80;                                                \
  } while (0)
218
/* Store at `dst' one DIMENSION2 character whose charset is CHARSET and
   whose position-codes are C1 and C2.  Everything up to and including
   the first position-code is emitted by DECODE_CHARACTER_DIMENSION1;
   the second position-code follows as C2 | 0x80.  */

#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
  do {                                                  \
    DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
    *dst++ = (c2) | 0x80;                               \
  } while (0)
227
228\f
229/*** 1. Preamble ***/
230
231#include <stdio.h>
232
233#ifdef emacs
234
235#include <config.h>
236#include "lisp.h"
237#include "buffer.h"
238#include "charset.h"
239#include "ccl.h"
240#include "coding.h"
241#include "window.h"
242
243#else /* not emacs */
244
245#include "mulelib.h"
246
247#endif /* not emacs */
248
249Lisp_Object Qcoding_system, Qeol_type;
250Lisp_Object Qbuffer_file_coding_system;
251Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
252
253extern Lisp_Object Qinsert_file_contents, Qwrite_region;
254Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
255Lisp_Object Qstart_process, Qopen_network_stream;
256Lisp_Object Qtarget_idx;
257
258/* Mnemonic character of each format of end-of-line. */
259int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
260/* Mnemonic character to indicate format of end-of-line is not yet
261 decided. */
262int eol_mnemonic_undecided;
263
264#ifdef emacs
265
266Lisp_Object Qcoding_system_vector, Qcoding_system_p, Qcoding_system_error;
267
268/* Coding-systems are handed between Emacs Lisp programs and C internal
269 routines by the following three variables. */
270/* Coding-system for reading files and receiving data from process. */
271Lisp_Object Vcoding_system_for_read;
272/* Coding-system for writing files and sending data to process. */
273Lisp_Object Vcoding_system_for_write;
274/* Coding-system actually used in the latest I/O. */
275Lisp_Object Vlast_coding_system_used;
276
277/* Coding-system of what terminal accept for displaying. */
278struct coding_system terminal_coding;
279
280/* Coding-system of what is sent from terminal keyboard. */
281struct coding_system keyboard_coding;
282
283Lisp_Object Vcoding_system_alist;
284
285#endif /* emacs */
286
287Lisp_Object Qcoding_category_index;
288
289/* List of symbols `coding-category-xxx' ordered by priority. */
290Lisp_Object Vcoding_category_list;
291
292/* Table of coding-systems currently assigned to each coding-category. */
293Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
294
295/* Table of names of symbol for each coding-category. */
296char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
0ef69138 297 "coding-category-emacs-mule",
4ed46869
KH
298 "coding-category-sjis",
299 "coding-category-iso-7",
300 "coding-category-iso-8-1",
301 "coding-category-iso-8-2",
302 "coding-category-iso-else",
303 "coding-category-big5",
304 "coding-category-binary"
305};
306
bdd9fb48
KH
307/* Flag to tell if we look up unification table on character code
308 conversion. */
309Lisp_Object Venable_character_unification;
310/* Standard unification table to look up on reading (decoding). */
311Lisp_Object Vstandard_character_unification_table_for_read;
312/* Standard unification table to look up on writing (encoding). */
313Lisp_Object Vstandard_character_unification_table_for_write;
314
315Lisp_Object Qcharacter_unification_table;
4ed46869
KH
316
317/* Alist of charsets vs revision number. */
318Lisp_Object Vcharset_revision_alist;
319
320\f
0ef69138 321/*** 2. Emacs internal format (emacs-mule) handlers ***/
4ed46869
KH
322
323/* Emacs' internal format for encoding multiple character sets is a
324 kind of multi-byte encoding, i.e. encoding a character by a sequence
325 of one-byte codes of variable length. ASCII characters and control
326 characters (e.g. `tab', `newline') are represented by one-byte as
327 is. It takes the range 0x00 through 0x7F. The other characters
328 are represented by a sequence of `base leading-code', optional
329 `extended leading-code', and one or two `position-code's. Length
330 of the sequence is decided by the base leading-code. Leading-code
331 takes the range 0x80 through 0x9F, whereas extended leading-code
332 and position-code take the range 0xA0 through 0xFF. See the
333 document of `charset.h' for more detail about leading-code and
334 position-code.
335
336 There's one exception in this rule. Special leading-code
337 `leading-code-composition' denotes that the following several
338 characters should be composed into one character. Leading-codes of
339 components (except for ASCII) are added 0x20. An ASCII character
340 component is represented by a 2-byte sequence of `0xA0' and
341 `ASCII-code + 0x80'. See also the document in `charset.h' for the
342 detail of composite character. Hence, we can summarize the code
343 range as follows:
344
345 --- CODE RANGE of Emacs' internal format ---
346 (character set) (range)
347 ASCII 0x00 .. 0x7F
348 ELSE (1st byte) 0x80 .. 0x9F
349 (rest bytes) 0xA0 .. 0xFF
350 ---------------------------------------------
351
352 */
353
354enum emacs_code_class_type emacs_code_class[256];
355
/* Consume one byte from `src' and check that it lies in 0xA0..0xFF.
   - If `src' has already reached `src_end', jump to
     `label_end_of_switch' without consuming anything.
   - If the consumed byte is less than 0xA0, make the enclosing
     function return 0.
   - Otherwise (the byte is 0xA0 or greater) fall through to the next
     statement.  */
#define CHECK_CODE_RANGE_A0_FF                  \
  do {                                          \
    if (src >= src_end)                         \
      goto label_end_of_switch;                 \
    if (*src++ < 0xA0)                          \
      return 0;                                 \
  } while (0)
365
366/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
367 Check if a text is encoded in Emacs' internal format. If it is,
0ef69138 368 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
4ed46869
KH
369
370int
0ef69138 371detect_coding_emacs_mule (src, src_end)
4ed46869
KH
372 unsigned char *src, *src_end;
373{
374 unsigned char c;
375 int composing = 0;
376
377 while (src < src_end)
378 {
379 c = *src++;
380
381 if (composing)
382 {
383 if (c < 0xA0)
384 composing = 0;
385 else
386 c -= 0x20;
387 }
388
389 switch (emacs_code_class[c])
390 {
391 case EMACS_ascii_code:
392 case EMACS_linefeed_code:
393 break;
394
395 case EMACS_control_code:
396 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
397 return 0;
398 break;
399
400 case EMACS_invalid_code:
401 return 0;
402
403 case EMACS_leading_code_composition: /* c == 0x80 */
404 if (composing)
405 CHECK_CODE_RANGE_A0_FF;
406 else
407 composing = 1;
408 break;
409
410 case EMACS_leading_code_4:
411 CHECK_CODE_RANGE_A0_FF;
412 /* fall down to check it two more times ... */
413
414 case EMACS_leading_code_3:
415 CHECK_CODE_RANGE_A0_FF;
416 /* fall down to check it one more time ... */
417
418 case EMACS_leading_code_2:
419 CHECK_CODE_RANGE_A0_FF;
420 break;
421
422 default:
423 label_end_of_switch:
424 break;
425 }
426 }
0ef69138 427 return CODING_CATEGORY_MASK_EMACS_MULE;
4ed46869
KH
428}
429
430\f
431/*** 3. ISO2022 handlers ***/
432
433/* The following note describes the coding system ISO2022 briefly.
434 Since the intention of this note is to help understanding of the
435 programs in this file, some parts are NOT ACCURATE or OVERLY
436 SIMPLIFIED. For the thorough understanding, please refer to the
437 original document of ISO2022.
438
439 ISO2022 provides many mechanisms to encode several character sets
440 in 7-bit and 8-bit environment. If one chooses the 7-bit environment,
441 all text is encoded by codes of less than 128. This may make the
442 encoded text a little bit longer, but the text gets more stability
443 to pass through several gateways (some of them split MSB off).
444
445 There are two kinds of character set: control character set and
446 graphic character set. The former contains control characters such
447 as `newline' and `escape' to provide control functions (control
448 functions are provided also by escape sequence). The latter
449 contains graphic characters such as 'A' and '-'. Emacs recognizes
450 two control character sets and many graphic character sets.
451
452 Graphic character sets are classified into one of the following
453 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
454 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
455 bytes (DIMENSION) and the number of characters in one dimension
456 (CHARS) of the set. In addition, each character set is assigned an
457 identification tag (called "final character" and denoted as <F>
458 here after) which is unique in each class. <F> of each character
459 set is decided by ECMA(*) when it is registered in ISO. Code range
460 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
461
462 Note (*): ECMA = European Computer Manufacturers Association
463
464 Here are examples of graphic character set [NAME(<F>)]:
465 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
466 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
467 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
468 o DIMENSION2_CHARS96 -- none for the moment
469
470 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
471 C0 [0x00..0x1F] -- control character plane 0
472 GL [0x20..0x7F] -- graphic character plane 0
473 C1 [0x80..0x9F] -- control character plane 1
474 GR [0xA0..0xFF] -- graphic character plane 1
475
476 A control character set is directly designated and invoked to C0 or
477 C1 by an escape sequence. The most common case is that ISO646's
478 control character set is designated/invoked to C0 and ISO6429's
479 control character set is designated/invoked to C1, and usually
480 these designations/invocations are omitted in a coded text. With
481 7-bit environment, only C0 can be used, and a control character for
482 C1 is encoded by an appropriate escape sequence to fit in the
483 environment. All control characters for C1 have
484 corresponding escape sequences defined.
485
486 A graphic character set is at first designated to one of four
487 graphic registers (G0 through G3), then these graphic registers are
488 invoked to GL or GR. These designations and invocations can be
489 done independently. The most common case is that G0 is invoked to
490 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
491 these invocations and designations are omitted in a coded text.
492 With 7-bit environment, only GL can be used.
493
494 When a graphic character set of CHARS94 is invoked to GL, code 0x20
495 and 0x7F of GL area work as control characters SPACE and DEL
496 respectively, and code 0xA0 and 0xFF of GR area should not be used.
497
498 There are two ways of invocation: locking-shift and single-shift.
499 With locking-shift, the invocation lasts until the next different
500 invocation, whereas with single-shift, the invocation works only
501 for the following character and doesn't affect locking-shift.
502 Invocations are done by the following control characters or escape
503 sequences.
504
505 ----------------------------------------------------------------------
506 function control char escape sequence description
507 ----------------------------------------------------------------------
508 SI (shift-in) 0x0F none invoke G0 to GL
509 SO (shift-out) 0x0E none invoke G1 to GL
510 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
511 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
512 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
513 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
514 ----------------------------------------------------------------------
515 The first four are for locking-shift. Control characters for these
516 functions are defined by macros ISO_CODE_XXX in `coding.h'.
517
518 Designations are done by the following escape sequences.
519 ----------------------------------------------------------------------
520 escape sequence description
521 ----------------------------------------------------------------------
522 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
523 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
524 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
525 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
526 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
527 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
528 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
529 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
530 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
531 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
532 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
533 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
534 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
535 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
536 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
537 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
538 ----------------------------------------------------------------------
539
540 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
541 of dimension 1, chars 94, and final character <F>, and etc.
542
543 Note (*): Although these designations are not allowed in ISO2022,
544 Emacs accepts them on decoding, and produces them on encoding
545 CHARS96 character set in a coding system which is characterized as
546 7-bit environment, non-locking-shift, and non-single-shift.
547
548 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
549 '(' can be omitted. We call this as "short-form" here after.
550
551 Now you may notice that there are a lot of ways for encoding the
552 same multilingual text in ISO2022. Actually, there exist many
553 coding systems such as Compound Text (used in X's inter-client
554 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
555 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
556 localized platforms), and all of these are variants of ISO2022.
557
558 In addition to the above, Emacs handles two more kinds of escape
559 sequences: ISO6429's direction specification and Emacs' private
560 sequence for specifying character composition.
561
562 ISO6429's direction specification takes the following format:
563 o CSI ']' -- end of the current direction
564 o CSI '0' ']' -- end of the current direction
565 o CSI '1' ']' -- start of left-to-right text
566 o CSI '2' ']' -- start of right-to-left text
567 The control character CSI (0x9B: control sequence introducer) is
568 abbreviated to the escape sequence ESC '[' in 7-bit environment.
569
570 Character composition specification takes the following format:
571 o ESC '0' -- start character composition
572 o ESC '1' -- end character composition
573 Since these are not standard escape sequences of any ISO, the use
574 of them for these meaning is restricted to Emacs only. */
575
576enum iso_code_class_type iso_code_class[256];
577
578/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
579 Check if a text is encoded in ISO2022. If it is, return an
580 integer in which the appropriate flag bits, any of:
581 CODING_CATEGORY_MASK_ISO_7
582 CODING_CATEGORY_MASK_ISO_8_1
583 CODING_CATEGORY_MASK_ISO_8_2
584 CODING_CATEGORY_MASK_ISO_ELSE
585 are set. If a code which should never appear in ISO2022 is found,
586 returns 0. */
587
588int
589detect_coding_iso2022 (src, src_end)
590 unsigned char *src, *src_end;
591{
bcf26d6a
KH
592 int mask = CODING_CATEGORY_MASK_ANY;
593 int g1 = 0; /* 1 iff designating to G1. */
594 int c, i;
4ed46869 595
e0e989f6 596 while (src < src_end)
4ed46869
KH
597 {
598 c = *src++;
599 switch (c)
600 {
601 case ISO_CODE_ESC:
e0e989f6 602 if (src >= src_end)
4ed46869
KH
603 break;
604 c = *src++;
bcf26d6a 605 if (src < src_end
e0e989f6
KH
606 && ((c >= '(' && c <= '/')
607 || c == '$' && ((*src >= '(' && *src <= '/')
608 || (*src >= '@' && *src <= 'B'))))
4ed46869 609 {
e0e989f6 610 /* Valid designation sequence. */
bcf26d6a
KH
611 mask &= (CODING_CATEGORY_MASK_ISO_7
612 | CODING_CATEGORY_MASK_ISO_8_1
613 | CODING_CATEGORY_MASK_ISO_8_2
614 | CODING_CATEGORY_MASK_ISO_ELSE);
e0e989f6 615 if (c == ')' || (c == '$' && *src == ')'))
bcf26d6a
KH
616 {
617 g1 = 1;
618 mask &= ~CODING_CATEGORY_MASK_ISO_7;
619 }
e0e989f6
KH
620 src++;
621 break;
4ed46869 622 }
4ed46869
KH
623 else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
624 return CODING_CATEGORY_MASK_ISO_ELSE;
625 break;
626
4ed46869 627 case ISO_CODE_SO:
e0e989f6
KH
628 if (g1)
629 return CODING_CATEGORY_MASK_ISO_ELSE;
630 break;
631
4ed46869
KH
632 case ISO_CODE_CSI:
633 case ISO_CODE_SS2:
634 case ISO_CODE_SS3:
635 mask &= ~CODING_CATEGORY_MASK_ISO_7;
636 break;
637
638 default:
639 if (c < 0x80)
640 break;
641 else if (c < 0xA0)
642 return 0;
643 else
644 {
645 int count = 1;
646
647 mask &= ~CODING_CATEGORY_MASK_ISO_7;
e0e989f6 648 while (src < src_end && *src >= 0xA0)
4ed46869 649 count++, src++;
e0e989f6 650 if (count & 1 && src < src_end)
4ed46869
KH
651 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
652 }
653 break;
654 }
655 }
656
657 return mask;
658}
659
/* Decode one character whose charset is CHARSET and whose first
   position-code is C1, writing its multi-byte form at `dst'.

   The expansion relies on these names from the enclosing scope:
   `coding', `dst', `src', `src_end', `c2', `unification_table' and the
   label `label_end_of_loop' (through ONE_MORE_BYTE).

   - At the head of a composition, LEADING_CODE_COMPOSITION is written
     first (followed by 0xFF when composition rules are embedded) and
     `coding->composing' is advanced by 2.
   - If CHARSET is of dimension 2, the second position-code is fetched
     from `src' into `c2'.
   - If `unification_table' is non-nil and unify_char yields a
     non-negative character, that character replaces the charset and
     position-codes.
   - A negative CHARSET means ill-formed text is being decoded; C1 is
     then stored like an ASCII character.  */

#define DECODE_ISO_CHARACTER(charset, c1)                               \
  do {                                                                  \
    int charset_alt = (charset);                                        \
    int c_alt;                                                          \
    if (COMPOSING_HEAD_P (coding->composing))                           \
      {                                                                 \
        *dst++ = LEADING_CODE_COMPOSITION;                              \
        /* 0xFF tells that composition rules are embedded.  */          \
        if (COMPOSING_WITH_RULE_P (coding->composing))                  \
          *dst++ = 0xFF;                                                \
        coding->composing += 2;                                         \
      }                                                                 \
    if ((charset) >= 0)                                                 \
      {                                                                 \
        if (CHARSET_DIMENSION (charset) == 2)                           \
          ONE_MORE_BYTE (c2);                                           \
        if (!NILP (unification_table)                                   \
            && ((c_alt = unify_char (unification_table,                 \
                                     -1, (charset), c1, c2)) >= 0))     \
          SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
      }                                                                 \
    if (charset_alt < 0 || charset_alt == CHARSET_ASCII)                \
      DECODE_CHARACTER_ASCII (c1);                                      \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
      DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
    else                                                                \
      DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
    /* With embedded rules, a composition rule follows next.  */        \
    if (COMPOSING_WITH_RULE_P (coding->composing))                      \
      coding->composing = COMPOSING_WITH_RULE_RULE;                     \
  } while (0)
696
/* Record in CODING that the charset identified by DIMENSION, CHARS and
   FINAL_CHAR (looked up through ISO_CHARSET_TABLE) is designated to
   graphic register REG.  Nothing is recorded when the lookup yields a
   negative value.  When `coding->direction' is 1 and the charset has a
   non-negative reverse charset, the reverse charset is recorded
   instead.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)             \
  do {                                                                    \
    int charset = ISO_CHARSET_TABLE (dimension, chars, final_char);       \
    if (charset >= 0)                                                     \
      CODING_SPEC_ISO_DESIGNATION (coding, reg)                           \
        = ((coding->direction == 1                                        \
            && CHARSET_REVERSE_CHARSET (charset) >= 0)                    \
           ? CHARSET_REVERSE_CHARSET (charset)                            \
           : charset);                                                    \
  } while (0)
709
710/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
711
712int
713decode_coding_iso2022 (coding, source, destination,
714 src_bytes, dst_bytes, consumed)
715 struct coding_system *coding;
716 unsigned char *source, *destination;
717 int src_bytes, dst_bytes;
718 int *consumed;
719{
720 unsigned char *src = source;
721 unsigned char *src_end = source + src_bytes;
722 unsigned char *dst = destination;
723 unsigned char *dst_end = destination + dst_bytes;
724 /* Since the maximum bytes produced by each loop is 7, we subtract 6
725 from DST_END to assure that overflow checking is necessary only
726 at the head of loop. */
727 unsigned char *adjusted_dst_end = dst_end - 6;
728 int charset;
729 /* Charsets invoked to graphic plane 0 and 1 respectively. */
730 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
731 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
bdd9fb48
KH
732 Lisp_Object unification_table = coding->character_unification_table;
733
734 if (!NILP (Venable_character_unification) && NILP (unification_table))
735 unification_table = Vstandard_character_unification_table_for_read;
4ed46869
KH
736
737 while (src < src_end && dst < adjusted_dst_end)
738 {
739 /* SRC_BASE remembers the start position in source in each loop.
740 The loop will be exited when there's not enough source text
741 to analyze long escape sequence or 2-byte code (within macros
742 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
743 to SRC_BASE before exiting. */
744 unsigned char *src_base = src;
bdd9fb48 745 int c1 = *src++, c2;
4ed46869
KH
746
747 switch (iso_code_class [c1])
748 {
749 case ISO_0x20_or_0x7F:
750 if (!coding->composing
751 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
752 {
753 /* This is SPACE or DEL. */
754 *dst++ = c1;
755 break;
756 }
757 /* This is a graphic character, we fall down ... */
758
759 case ISO_graphic_plane_0:
760 if (coding->composing == COMPOSING_WITH_RULE_RULE)
761 {
762 /* This is a composition rule. */
763 *dst++ = c1 | 0x80;
764 coding->composing = COMPOSING_WITH_RULE_TAIL;
765 }
766 else
767 DECODE_ISO_CHARACTER (charset0, c1);
768 break;
769
770 case ISO_0xA0_or_0xFF:
771 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
772 {
773 /* Invalid code. */
774 *dst++ = c1;
775 break;
776 }
777 /* This is a graphic character, we fall down ... */
778
779 case ISO_graphic_plane_1:
780 DECODE_ISO_CHARACTER (charset1, c1);
781 break;
782
783 case ISO_control_code:
784 /* All ISO2022 control characters in this class have the
785 same representation in Emacs internal format. */
786 *dst++ = c1;
787 break;
788
789 case ISO_carriage_return:
790 if (coding->eol_type == CODING_EOL_CR)
791 {
792 *dst++ = '\n';
793 }
794 else if (coding->eol_type == CODING_EOL_CRLF)
795 {
796 ONE_MORE_BYTE (c1);
797 if (c1 == ISO_CODE_LF)
798 *dst++ = '\n';
799 else
800 {
801 src--;
802 *dst++ = c1;
803 }
804 }
805 else
806 {
807 *dst++ = c1;
808 }
809 break;
810
811 case ISO_shift_out:
e0e989f6
KH
812 if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
813 goto label_invalid_escape_sequence;
4ed46869
KH
814 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
815 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
816 break;
817
818 case ISO_shift_in:
819 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
820 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
821 break;
822
823 case ISO_single_shift_2_7:
824 case ISO_single_shift_2:
825 /* SS2 is handled as an escape sequence of ESC 'N' */
826 c1 = 'N';
827 goto label_escape_sequence;
828
829 case ISO_single_shift_3:
830 /* SS3 is handled as an escape sequence of ESC 'O' */
831 c1 = 'O';
832 goto label_escape_sequence;
833
834 case ISO_control_sequence_introducer:
835 /* CSI is handled as an escape sequence of ESC '[' ... */
836 c1 = '[';
837 goto label_escape_sequence;
838
839 case ISO_escape:
840 ONE_MORE_BYTE (c1);
841 label_escape_sequence:
842 /* Escape sequences handled by Emacs are invocation,
843 designation, direction specification, and character
844 composition specification. */
845 switch (c1)
846 {
847 case '&': /* revision of following character set */
848 ONE_MORE_BYTE (c1);
849 if (!(c1 >= '@' && c1 <= '~'))
e0e989f6 850 goto label_invalid_escape_sequence;
4ed46869
KH
851 ONE_MORE_BYTE (c1);
852 if (c1 != ISO_CODE_ESC)
e0e989f6 853 goto label_invalid_escape_sequence;
4ed46869
KH
854 ONE_MORE_BYTE (c1);
855 goto label_escape_sequence;
856
857 case '$': /* designation of 2-byte character set */
858 ONE_MORE_BYTE (c1);
859 if (c1 >= '@' && c1 <= 'B')
860 { /* designation of JISX0208.1978, GB2312.1980,
861 or JISX0208.1980 */
862 DECODE_DESIGNATION (0, 2, 94, c1);
863 }
864 else if (c1 >= 0x28 && c1 <= 0x2B)
865 { /* designation of DIMENSION2_CHARS94 character set */
866 ONE_MORE_BYTE (c2);
867 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
868 }
869 else if (c1 >= 0x2C && c1 <= 0x2F)
870 { /* designation of DIMENSION2_CHARS96 character set */
871 ONE_MORE_BYTE (c2);
872 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
873 }
874 else
e0e989f6 875 goto label_invalid_escape_sequence;
4ed46869
KH
876 break;
877
878 case 'n': /* invocation of locking-shift-2 */
e0e989f6
KH
879 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
880 goto label_invalid_escape_sequence;
4ed46869 881 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
e0e989f6 882 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
883 break;
884
885 case 'o': /* invocation of locking-shift-3 */
e0e989f6
KH
886 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
887 goto label_invalid_escape_sequence;
4ed46869 888 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
e0e989f6 889 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
890 break;
891
892 case 'N': /* invocation of single-shift-2 */
e0e989f6
KH
893 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
894 goto label_invalid_escape_sequence;
4ed46869
KH
895 ONE_MORE_BYTE (c1);
896 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
897 DECODE_ISO_CHARACTER (charset, c1);
898 break;
899
900 case 'O': /* invocation of single-shift-3 */
e0e989f6
KH
901 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
902 goto label_invalid_escape_sequence;
4ed46869
KH
903 ONE_MORE_BYTE (c1);
904 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
905 DECODE_ISO_CHARACTER (charset, c1);
906 break;
907
908 case '0': /* start composing without embedded rules */
909 coding->composing = COMPOSING_NO_RULE_HEAD;
910 break;
911
912 case '1': /* end composing */
913 coding->composing = COMPOSING_NO;
914 break;
915
916 case '2': /* start composing with embedded rules */
917 coding->composing = COMPOSING_WITH_RULE_HEAD;
918 break;
919
920 case '[': /* specification of direction */
921 /* For the moment, nested direction is not supported.
922 So, the value of `coding->direction' is 0 or 1: 0
923 means left-to-right, 1 means right-to-left. */
924 ONE_MORE_BYTE (c1);
925 switch (c1)
926 {
927 case ']': /* end of the current direction */
928 coding->direction = 0;
929
930 case '0': /* end of the current direction */
931 case '1': /* start of left-to-right direction */
932 ONE_MORE_BYTE (c1);
933 if (c1 == ']')
934 coding->direction = 0;
935 else
936 goto label_invalid_escape_sequence;
937 break;
938
939 case '2': /* start of right-to-left direction */
940 ONE_MORE_BYTE (c1);
941 if (c1 == ']')
942 coding->direction= 1;
943 else
944 goto label_invalid_escape_sequence;
945 break;
946
947 default:
948 goto label_invalid_escape_sequence;
949 }
950 break;
951
952 default:
953 if (c1 >= 0x28 && c1 <= 0x2B)
954 { /* designation of DIMENSION1_CHARS94 character set */
955 ONE_MORE_BYTE (c2);
956 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
957 }
958 else if (c1 >= 0x2C && c1 <= 0x2F)
959 { /* designation of DIMENSION1_CHARS96 character set */
960 ONE_MORE_BYTE (c2);
961 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
962 }
963 else
964 {
965 goto label_invalid_escape_sequence;
966 }
967 }
968 /* We must update these variables now. */
969 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
970 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
971 break;
972
973 label_invalid_escape_sequence:
974 {
975 int length = src - src_base;
976
977 bcopy (src_base, dst, length);
978 dst += length;
979 }
980 }
981 continue;
982
983 label_end_of_loop:
984 coding->carryover_size = src - src_base;
985 bcopy (src_base, coding->carryover, coding->carryover_size);
986 src = src_base;
987 break;
988 }
989
990 /* If this is the last block of the text to be decoded, we had
991 better just flush out all remaining codes in the text although
992 they are not valid characters. */
993 if (coding->last_block)
994 {
995 bcopy (src, dst, src_end - src);
996 dst += (src_end - src);
997 src = src_end;
998 }
999 *consumed = src - source;
1000 return dst - destination;
1001}
1002
1003/* ISO2022 encoding stuff. */
1004
1005/*
1006 It is not enough to say just "ISO2022" on encoding, but we have to
1007 specify more details. In Emacs, each coding-system of ISO2022
1008 variant has the following specifications:
1009 1. Initial designation to G0 thru G3.
1010 2. Allows short-form designation?
1011 3. ASCII should be designated to G0 before control characters?
1012 4. ASCII should be designated to G0 at end of line?
1013 5. 7-bit environment or 8-bit environment?
1014 6. Use locking-shift?
1015 7. Use Single-shift?
1016 And the following two are only for Japanese:
1017 8. Use ASCII in place of JIS0201-1976-Roman?
1018 9. Use JISX0208-1983 in place of JISX0208-1978?
1019 These specifications are encoded in `coding->flags' as flag bits
1020 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1021 detail.
1022*/
1023
/* Emit, at DST, the escape sequence that designates CHARSET to graphic
   register REG, and record the designation in CODING.

   If CHARSET has an entry in Vcharset_revision_alist, the sequence is
   preceded by `ESC & <revision>'.  For a 2-byte 94-character set
   designated to register 0 whose final character is '@', 'A' or 'B',
   the intermediate character is omitted (short form) when CODING has
   CODING_FLAG_ISO_SHORT_FORM set.

   The macro reads and advances the caller's variable `dst'.  CHARSET
   and REG are evaluated more than once.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char desig_final = CHARSET_ISO_FINAL_CHAR (charset);       \
    Lisp_Object desig_rev                                               \
      = Fassq (make_number (charset), Vcharset_revision_alist);         \
    int desig_two_byte = CHARSET_DIMENSION (charset) != 1;              \
    int desig_94 = CHARSET_CHARS (charset) == 94;                       \
                                                                        \
    /* Optional revision announcement.  */                              \
    if (! NILP (desig_rev))                                             \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = '&';                                                   \
        *dst++ = XINT (XCONS (desig_rev)->cdr) + '@';                   \
      }                                                                 \
                                                                        \
    *dst++ = ISO_CODE_ESC;                                              \
    if (desig_two_byte)                                                 \
      *dst++ = '$';                                                     \
                                                                        \
    /* Intermediate character selecting the register; dropped only     \
       in the short-form case described above.  */                      \
    if (! desig_94)                                                     \
      *dst++ = (unsigned char) (",-./"[(reg)]);                         \
    else if (! desig_two_byte                                           \
             || ! ((coding)->flags & CODING_FLAG_ISO_SHORT_FORM)        \
             || (reg) != 0                                              \
             || desig_final < '@' || desig_final > 'B')                 \
      *dst++ = (unsigned char) ("()*+"[(reg)]);                         \
                                                                        \
    *dst++ = desig_final;                                               \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = (charset);              \
  } while (0)
1066
/* Single-shift functions SS2 and SS3.  In a 7-bit coding system
   (CODING_FLAG_ISO_SEVEN_BITS) they are written as the escape
   sequences ESC 'N' and ESC 'O'; otherwise as the single control
   codes ISO_CODE_SS2 and ISO_CODE_SS3.  Both macros write at the
   caller's `dst' and mark `coding' as being in single-shift state.  */

#define ENCODE_SINGLE_SHIFT_2                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS2;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'N';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS3;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'O';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)
1088
/* Locking-shift functions.  Each macro writes the control code or
   escape sequence at the caller's `dst' and records in `coding' which
   graphic register is now invoked to graphic plane 0:

     ENCODE_SHIFT_IN          SI        register 0
     ENCODE_SHIFT_OUT         SO        register 1
     ENCODE_LOCKING_SHIFT_2   ESC 'n'   register 2
     ENCODE_LOCKING_SHIFT_3   ESC 'o'   register 3  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
    *dst++ = ISO_CODE_SI;                               \
  } while (0)

#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
    *dst++ = ISO_CODE_SO;                               \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'n';                                       \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'o';                                       \
  } while (0)
1116
/* Write at `dst' the one-byte code C1 of a DIMENSION1 character that
   belongs to CHARSET.

   The loop body tries, in order: the pending single-shift state, the
   charset invoked to graphic plane 0 (7-bit form), and the charset
   invoked to graphic plane 1 (8-bit form).  If none matches,
   encode_invocation_designation is called to designate and/or invoke
   CHARSET, and the loop runs again to emit the character.

   Uses the caller's variables `coding' and `dst'.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* The character follows a single-shift code; the shift        \
           applies to this one character only.  */                      \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)          \
                  ? ((c1) & 0x7F)                                       \
                  : ((c1) | 0x80));                                     \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))         \
      {                                                                 \
        *dst++ = (c1) & 0x7F;                                           \
        break;                                                          \
      }                                                                 \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))         \
      {                                                                 \
        *dst++ = (c1) | 0x80;                                           \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to either graphic plane yet.  */          \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
1150
/* Write at `dst' the two-byte code C1 C2 of a DIMENSION2 character
   that belongs to CHARSET.

   The loop body tries, in order: the pending single-shift state, the
   charset invoked to graphic plane 0 (7-bit form), and the charset
   invoked to graphic plane 1 (8-bit form).  If none matches,
   encode_invocation_designation is called to designate and/or invoke
   CHARSET, and the loop runs again to emit the character.

   Uses the caller's variables `coding' and `dst'.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* The character follows a single-shift code; the shift        \
           applies to this one character only.  */                      \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            *dst++ = (c1) & 0x7F;                                       \
            *dst++ = (c2) & 0x7F;                                       \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = (c1) | 0x80;                                       \
            *dst++ = (c2) | 0x80;                                       \
          }                                                             \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))         \
      {                                                                 \
        *dst++ = (c1) & 0x7F;                                           \
        *dst++ = (c2) & 0x7F;                                           \
        break;                                                          \
      }                                                                 \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))         \
      {                                                                 \
        *dst++ = (c1) | 0x80;                                           \
        *dst++ = (c2) | 0x80;                                           \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to either graphic plane yet.  */          \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
1183
bdd9fb48
KH
/* Encode one character of CHARSET whose position codes are C1 and C2
   (C2 is a dummy for a DIMENSION1 charset).

   If the caller's `unification_table' is non-nil, the character is
   first looked up with unify_char.  A non-negative result is the
   unified character: it is split into its own charset and position
   codes (overwriting C1 and C2), and that charset is encoded instead.
   A negative result means there is no unified character, so CHARSET
   is used as is.  The test is `>= 0', as in
   encode_designation_at_bol; testing `< 0' would split a failed
   lookup and ignore every successful one.

   C1 and C2 must be lvalues.  */

#define ENCODE_ISO_CHARACTER(charset, c1, c2)                             \
  do {                                                                    \
    int c_alt, charset_alt;                                               \
    if (!NILP (unification_table)                                         \
        && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
            >= 0))                                                        \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                            \
    else                                                                  \
      charset_alt = charset;                                              \
    if (CHARSET_DIMENSION (charset_alt) == 1)                             \
      ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1);                  \
    else                                                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2);              \
  } while (0)
1198
4ed46869
KH
1199/* Produce designation and invocation codes at a place pointed by DST
1200 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1201 Return new DST. */
1202
1203unsigned char *
1204encode_invocation_designation (charset, coding, dst)
1205 int charset;
1206 struct coding_system *coding;
1207 unsigned char *dst;
1208{
1209 int reg; /* graphic register number */
1210
1211 /* At first, check designations. */
1212 for (reg = 0; reg < 4; reg++)
1213 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1214 break;
1215
1216 if (reg >= 4)
1217 {
1218 /* CHARSET is not yet designated to any graphic registers. */
1219 /* At first check the requested designation. */
1220 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1221 if (reg < 0)
1222 /* Since CHARSET requests no special designation, designate to
1223 graphic register 0. */
1224 reg = 0;
1225
1226 ENCODE_DESIGNATION (charset, reg, coding);
1227 }
1228
1229 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1230 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1231 {
1232 /* Since the graphic register REG is not invoked to any graphic
1233 planes, invoke it to graphic plane 0. */
1234 switch (reg)
1235 {
1236 case 0: /* graphic register 0 */
1237 ENCODE_SHIFT_IN;
1238 break;
1239
1240 case 1: /* graphic register 1 */
1241 ENCODE_SHIFT_OUT;
1242 break;
1243
1244 case 2: /* graphic register 2 */
1245 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1246 ENCODE_SINGLE_SHIFT_2;
1247 else
1248 ENCODE_LOCKING_SHIFT_2;
1249 break;
1250
1251 case 3: /* graphic register 3 */
1252 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1253 ENCODE_SINGLE_SHIFT_3;
1254 else
1255 ENCODE_LOCKING_SHIFT_3;
1256 break;
1257 }
1258 }
1259 return dst;
1260}
1261
/* The following three macros produce, at `dst', the escape sequences
   that start a composition without rules (ESC '0'), start a
   composition with rules (ESC '2'), and end a composition (ESC '1').  */
#define ENCODE_COMPOSITION_NO_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
#define ENCODE_COMPOSITION_WITH_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '2')
#define ENCODE_COMPOSITION_END \
  (*dst++ = ISO_CODE_ESC, *dst++ = '1')
1266
/* The following three macros produce codes for indicating direction
   of text.  They write at the caller's `dst' and read `coding'.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER writes CSI: as ESC '[' when the
   coding system is 7-bit, otherwise as the single code ISO_CODE_CSI.
   The 7-bit check is a bit test on `coding->flags', like every other
   CODING_FLAG_ISO_SEVEN_BITS check in this file; comparing the whole
   flag word for equality would fail whenever another flag is set.

   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R are statement macros:
   CSI is itself a `do ... while (0)' statement, so it cannot be
   joined to the following bytes with the comma operator.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

/* Start of right-to-left text: CSI '2' ']'.  */
#define ENCODE_DIRECTION_R2L                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '2';                               \
    *dst++ = ']';                               \
  } while (0)

/* End of the current direction (back to left-to-right): CSI '0' ']'.  */
#define ENCODE_DIRECTION_L2R                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '0';                               \
    *dst++ = ']';                               \
  } while (0)
1282
/* Produce, at `dst', the codes that bring the graphic planes and
   registers of `coding' back to their initial state: shift in if
   graphic plane 0 does not hold register 0, then re-designate every
   register whose current charset differs from its initial one.
   Registers with no initial designation (a negative value) are left
   alone.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                    \
  do {                                                                     \
    int reset_reg;                                                         \
                                                                           \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                       \
      ENCODE_SHIFT_IN;                                                     \
                                                                           \
    for (reset_reg = 0; reset_reg < 4; reset_reg++)                        \
      {                                                                    \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reset_reg) < 0)   \
          continue;                                                        \
        if (CODING_SPEC_ISO_DESIGNATION (coding, reset_reg)                \
            == CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reset_reg))    \
          continue;                                                        \
        ENCODE_DESIGNATION                                                 \
          (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reset_reg),        \
           reset_reg, coding);                                             \
      }                                                                    \
  } while (0)
1297
bdd9fb48
KH
1298/* Produce designation sequences of charsets in the line started from
1299 *SRC to a place pointed by DSTP.
1300
1301 If the current block ends before any end-of-line, we may fail to
1302 find all the necessary *designations. */
1303encode_designation_at_bol (coding, table, src, src_end, dstp)
e0e989f6 1304 struct coding_system *coding;
bdd9fb48 1305 Lisp_Object table;
e0e989f6
KH
1306 unsigned char *src, *src_end, **dstp;
1307{
bdd9fb48
KH
1308 int charset, c, found = 0, reg;
1309 /* Table of charsets to be designated to each graphic register. */
1310 int r[4];
1311 unsigned char *dst = *dstp;
1312
1313 for (reg = 0; reg < 4; reg++)
1314 r[reg] = -1;
1315
1316 while (src < src_end && *src != '\n' && found < 4)
e0e989f6 1317 {
bdd9fb48
KH
1318 int bytes = BYTES_BY_CHAR_HEAD (*src);
1319
1320 if (NILP (table))
1321 charset = CHARSET_AT (src);
1322 else
e0e989f6 1323 {
bdd9fb48
KH
1324 int c_alt, c1, c2;
1325
1326 SPLIT_STRING(src, bytes, charset, c1, c2);
1327 if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1328 charset = CHAR_CHARSET (c_alt);
e0e989f6 1329 }
bdd9fb48 1330
e0e989f6 1331 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
bdd9fb48
KH
1332 if (r[reg] < 0)
1333 {
1334 found++;
1335 r[reg] = charset;
1336 }
1337
1338 src += bytes;
1339 }
1340
1341 if (found)
1342 {
1343 for (reg = 0; reg < 4; reg++)
1344 if (r[reg] >= 0
1345 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1346 ENCODE_DESIGNATION (r[reg], reg, coding);
1347 *dstp = dst;
e0e989f6 1348 }
e0e989f6
KH
1349}
1350
4ed46869
KH
1351/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1352
/* Encode SRC_BYTES bytes of text at SOURCE into an ISO2022 variant
   described by CODING, writing at most DST_BYTES bytes at DESTINATION.
   Store in *CONSUMED the number of source bytes processed (SRC minus
   SOURCE on exit) and return the number of bytes written.

   Designation, invocation and composition state is kept in *CODING
   across calls.  When the source ends inside a multi-byte sequence,
   the partial sequence is saved in CODING->carryover.  */
1353int
1354encode_coding_iso2022 (coding, source, destination,
1355 src_bytes, dst_bytes, consumed)
1356 struct coding_system *coding;
1357 unsigned char *source, *destination;
1358 int src_bytes, dst_bytes;
1359 int *consumed;
1360{
1361 unsigned char *src = source;
1362 unsigned char *src_end = source + src_bytes;
1363 unsigned char *dst = destination;
1364 unsigned char *dst_end = destination + dst_bytes;
e0e989f6 1365 /* Since the maximum bytes produced by each loop is 20, we subtract 19
4ed46869
KH
1366 from DST_END to assure overflow checking is necessary only at the
1367 head of loop. */
e0e989f6 1368 unsigned char *adjusted_dst_end = dst_end - 19;
bdd9fb48
KH
1369 Lisp_Object unification_table = coding->character_unification_table;
1370
/* When unification is enabled but CODING has no table of its own,
   use the standard table for writing.  */
1371 if (!NILP (Venable_character_unification) && NILP (unification_table))
1372 unification_table = Vstandard_character_unification_table_for_write;
4ed46869
KH
1373
1374 while (src < src_end && dst < adjusted_dst_end)
1375 {
1376 /* SRC_BASE remembers the start position in source in each loop.
1377 The loop will be exited when there's not enough source text
1378 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1379 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1380 reset to SRC_BASE before exiting. */
1381 unsigned char *src_base = src;
bdd9fb48 1382 int charset, c1, c2, c3, c4;
4ed46869 1383
e0e989f6
KH
1384 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1385 && CODING_SPEC_ISO_BOL (coding))
1386 {
bdd9fb48
KH
1387 /* We have to produce designation sequences if any now. */
1388 encode_designation_at_bol (coding, unification_table,
1389 src, src_end, &dst);
e0e989f6
KH
1390 CODING_SPEC_ISO_BOL (coding) = 0;
1391 }
1392
1393 c1 = *src++;
4ed46869
KH
1394 /* If we are seeing a component of a composite character, we are
1395 seeing a leading-code specially encoded for composition, or a
1396 composition rule if composing with rule. We must set C1
1397 to a normal leading-code or an ASCII code. If we are not at
1398 a composed character, we must reset the composition state. */
1399 if (COMPOSING_P (coding->composing))
1400 {
1401 if (c1 < 0xA0)
1402 {
1403 /* We are not in a composite character any longer. */
1404 coding->composing = COMPOSING_NO;
1405 ENCODE_COMPOSITION_END;
1406 }
1407 else
1408 {
1409 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1410 {
/* This byte is a composition rule: emit it with the high bit
   cleared and expect a component head next.  */
1411 *dst++ = c1 & 0x7F;
1412 coding->composing = COMPOSING_WITH_RULE_HEAD;
1413 continue;
1414 }
1415 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1416 coding->composing = COMPOSING_WITH_RULE_RULE;
1417 if (c1 == 0xA0)
1418 {
1419 /* This is an ASCII component. */
1420 ONE_MORE_BYTE (c1);
1421 c1 &= 0x7F;
1422 }
1423 else
1424 /* This is a leading-code of non ASCII component. */
1425 c1 -= 0x20;
1426 }
1427 }
1428
1429 /* Now encode one character. C1 is a control character, an
1430 ASCII character, or a leading-code of multi-byte character. */
1431 switch (emacs_code_class[c1])
1432 {
1433 case EMACS_ascii_code:
bdd9fb48 1434 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
4ed46869
KH
1435 break;
1436
1437 case EMACS_control_code:
1438 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1439 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1440 *dst++ = c1;
1441 break;
1442
1443 case EMACS_carriage_return_code:
1444 if (!coding->selective)
1445 {
1446 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1447 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1448 *dst++ = c1;
1449 break;
1450 }
1451 /* fall down to treat '\r' as '\n' ... */
1452
1453 case EMACS_linefeed_code:
1454 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
e0e989f6
KH
1455 ENCODE_RESET_PLANE_AND_REGISTER;
/* With CODING_FLAG_ISO_INIT_AT_BOL, the recorded current designations
   are overwritten with the initial ones; no bytes are written for
   this.  */
1456 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1457 bcopy (coding->spec.iso2022.initial_designation,
1458 coding->spec.iso2022.current_designation,
1459 sizeof coding->spec.iso2022.initial_designation);
/* Write the end-of-line in the format selected by CODING->eol_type:
   LF (also when undecided), CR LF, or CR.  */
4ed46869 1460 if (coding->eol_type == CODING_EOL_LF
0ef69138 1461 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
1462 *dst++ = ISO_CODE_LF;
1463 else if (coding->eol_type == CODING_EOL_CRLF)
1464 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1465 else
1466 *dst++ = ISO_CODE_CR;
e0e989f6 1467 CODING_SPEC_ISO_BOL (coding) = 1;
4ed46869
KH
1468 break;
1469
1470 case EMACS_leading_code_2:
1471 ONE_MORE_BYTE (c2);
bdd9fb48 1472 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
1473 break;
1474
1475 case EMACS_leading_code_3:
1476 TWO_MORE_BYTES (c2, c3);
/* Below LEADING_CODE_PRIVATE_11, C1 is passed as the charset with
   position codes C2 and C3; otherwise C2 is passed as the charset
   and C3 as the only position code.  */
1477 if (c1 < LEADING_CODE_PRIVATE_11)
bdd9fb48 1478 ENCODE_ISO_CHARACTER (c1, c2, c3);
4ed46869 1479 else
bdd9fb48 1480 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
4ed46869
KH
1481 break;
1482
1483 case EMACS_leading_code_4:
1484 THREE_MORE_BYTES (c2, c3, c4);
bdd9fb48 1485 ENCODE_ISO_CHARACTER (c2, c3, c4);
4ed46869
KH
1486 break;
1487
1488 case EMACS_leading_code_composition:
1489 ONE_MORE_BYTE (c1);
1490 if (c1 == 0xFF)
1491 {
1492 coding->composing = COMPOSING_WITH_RULE_HEAD;
1493 ENCODE_COMPOSITION_WITH_RULE_START;
1494 }
1495 else
1496 {
1497 /* Rewind one byte because it is a character code of
1498 composition elements. */
1499 src--;
1500 coding->composing = COMPOSING_NO_RULE_HEAD;
1501 ENCODE_COMPOSITION_NO_RULE_START;
1502 }
1503 break;
1504
1505 case EMACS_invalid_code:
1506 *dst++ = c1;
1507 break;
1508 }
1509 continue;
/* Presumably the jump target of ONE_MORE_BYTE and friends (defined
   elsewhere) when the source runs out mid-character.  The bytes from
   SRC_BASE up to SRC are saved in CODING->carryover.
   NOTE(review): SRC is not rewound to SRC_BASE here, although the
   comment on SRC_BASE above says it is and encode_coding_sjis_big5
   does rewind; confirm this is intended.  */
1510 label_end_of_loop:
1511 coding->carryover_size = src - src_base;
1512 bcopy (src_base, coding->carryover, coding->carryover_size);
4ed46869
KH
1513 break;
1514 }
1515
1516 /* If this is the last block of the text to be encoded, we must
bdd9fb48
KH
1517 reset graphic planes and registers to the initial state. */
1518 if (src >= src_end && coding->last_block)
4ed46869 1519 {
e0e989f6 1520 ENCODE_RESET_PLANE_AND_REGISTER;
bdd9fb48
KH
/* Copy any saved partial sequence to the output unchanged, provided
   it fits in the remaining space.  */
1521 if (coding->carryover_size > 0
1522 && coding->carryover_size < (dst_end - dst))
1523 {
1524 bcopy (coding->carryover, dst, coding->carryover_size);
1525 dst += coding->carryover_size;
1526 coding->carryover_size = 0;
1527 }
4ed46869
KH
1528 }
1529 *consumed = src - source;
1530 return dst - destination;
1531}
1532
1533\f
1534/*** 4. SJIS and BIG5 handlers ***/
1535
1536/* Although SJIS and BIG5 are not ISO coding systems, they are used
1537 quite widely. So, for the moment, Emacs supports them in the bare
1538 C code. But, in the future, they may be supported only by CCL. */
1539
1540/* SJIS is a coding system encoding three character sets: ASCII, right
1541 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
1542 as is. A character of charset katakana-jisx0201 is encoded by
1543 "position-code + 0x80". A character of charset japanese-jisx0208
1544 is encoded in 2-byte but two position-codes are divided and shifted
1545 so that it fit in the range below.
1546
1547 --- CODE RANGE of SJIS ---
1548 (character set) (range)
1549 ASCII 0x00 .. 0x7F
1550 KATAKANA-JISX0201 0xA0 .. 0xDF
1551 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1552 (2nd byte) 0x40 .. 0xFF
1553 -------------------------------
1554
1555*/
1556
1557/* BIG5 is a coding system encoding two character sets: ASCII and
1558 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1559 character set and is encoded in two-byte.
1560
1561 --- CODE RANGE of BIG5 ---
1562 (character set) (range)
1563 ASCII 0x00 .. 0x7F
1564 Big5 (1st byte) 0xA1 .. 0xFE
1565 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1566 --------------------------
1567
1568 Since the number of characters in Big5 is larger than maximum
1569 characters in Emacs' charset (96x96), it can't be handled as one
1570 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
1571 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
1572 contains frequently used characters and the latter contains less
1573 frequently used characters. */
1574
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Both macros go through a linear index: BIG5 rows hold
   BIG5_SAME_ROW characters each, Emacs rows hold 0xFF - 0xA1 (94).
   Rows whose 1st byte is 0xC9 or above belong to `charset_big5_2',
   whose index restarts at zero.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int big5_index                                             \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) >= 0xC9)                                                   \
      {                                                                 \
        charset = charset_big5_2;                                       \
        big5_index -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                    \
      }                                                                 \
    else                                                                \
      charset = charset_big5_1;                                         \
    c1 = big5_index / (0xFF - 0xA1) + 0x21;                             \
    c2 = big5_index % (0xFF - 0xA1) + 0x21;                             \
  } while (0)

#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int big5_index                                             \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      big5_index += BIG5_SAME_ROW * (0xC9 - 0xA1);                      \
    b1 = big5_index / BIG5_SAME_ROW + 0xA1;                             \
    b2 = big5_index % BIG5_SAME_ROW;                                    \
    /* The 2nd byte has a gap between 0x7E and 0xA1.  */                \
    if (b2 < 0x3F)                                                      \
      b2 += 0x40;                                                       \
    else                                                                \
      b2 += 0x62;                                                       \
  } while (0)
1607
1608/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1609 Check if a text is encoded in SJIS. If it is, return
1610 CODING_CATEGORY_MASK_SJIS, else return 0. */
1611
1612int
1613detect_coding_sjis (src, src_end)
1614 unsigned char *src, *src_end;
1615{
1616 unsigned char c;
1617
1618 while (src < src_end)
1619 {
1620 c = *src++;
1621 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1622 return 0;
1623 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1624 {
1625 if (src < src_end && *src++ < 0x40)
1626 return 0;
1627 }
1628 }
1629 return CODING_CATEGORY_MASK_SJIS;
1630}
1631
1632/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1633 Check if a text is encoded in BIG5. If it is, return
1634 CODING_CATEGORY_MASK_BIG5, else return 0. */
1635
1636int
1637detect_coding_big5 (src, src_end)
1638 unsigned char *src, *src_end;
1639{
1640 unsigned char c;
1641
1642 while (src < src_end)
1643 {
1644 c = *src++;
1645 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1646 return 0;
1647 if (c >= 0xA1)
1648 {
1649 if (src >= src_end)
1650 break;
1651 c = *src++;
1652 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1653 return 0;
1654 }
1655 }
1656 return CODING_CATEGORY_MASK_BIG5;
1657}
1658
1659/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1660 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */
/* Decode SRC_BYTES bytes at SOURCE into DESTINATION, which has room
   for DST_BYTES bytes.  Store in *CONSUMED the number of source bytes
   processed and return the number of bytes written.  If the source
   ends in the middle of a two-byte sequence, the leftover bytes are
   saved in CODING->carryover and are not counted as consumed.  */
1661
1662int
1663decode_coding_sjis_big5 (coding, source, destination,
1664 src_bytes, dst_bytes, consumed, sjis_p)
1665 struct coding_system *coding;
1666 unsigned char *source, *destination;
1667 int src_bytes, dst_bytes;
1668 int *consumed;
1669 int sjis_p;
1670{
1671 unsigned char *src = source;
1672 unsigned char *src_end = source + src_bytes;
1673 unsigned char *dst = destination;
1674 unsigned char *dst_end = destination + dst_bytes;
1675 /* Since the maximum bytes produced by each loop is 4, we subtract 3
1676 from DST_END to assure overflow checking is necessary only at the
1677 head of loop. */
1678 unsigned char *adjusted_dst_end = dst_end - 3;
1679
1680 while (src < src_end && dst < adjusted_dst_end)
1681 {
1682 /* SRC_BASE remembers the start position in source in each loop.
1683 The loop will be exited when there's not enough source text
1684 to analyze two-byte character (within macro ONE_MORE_BYTE).
1685 In that case, SRC is reset to SRC_BASE before exiting. */
1686 unsigned char *src_base = src;
1687 unsigned char c1 = *src++, c2, c3, c4;
1688
/* With CRLF end-of-line, CR LF is written as a single LF; a CR not
   followed by LF is written as is and the following byte is
   processed again.  With any other end-of-line type CR is copied
   unchanged.  */
1689 if (c1 == '\r')
1690 {
1691 if (coding->eol_type == CODING_EOL_CRLF)
1692 {
1693 ONE_MORE_BYTE (c2);
1694 if (c2 == '\n')
1695 *dst++ = c2;
1696 else
1697 /* To process C2 again, SRC is subtracted by 1. */
1698 *dst++ = c1, src--;
1699 }
1700 else
1701 *dst++ = c1;
1702 }
/* Bytes below 0x80 are copied unchanged.  */
1703 else if (c1 < 0x80)
1704 *dst++ = c1;
1705 else if (c1 < 0xA0 || c1 >= 0xE0)
1706 {
1707 /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1708 if (sjis_p)
1709 {
1710 ONE_MORE_BYTE (c2);
1711 DECODE_SJIS (c1, c2, c3, c4);
1712 DECODE_CHARACTER_DIMENSION2 (charset_jisx0208, c3, c4);
1713 }
1714 else if (c1 >= 0xE0 && c1 < 0xFF)
1715 {
1716 int charset;
1717
1718 ONE_MORE_BYTE (c2);
1719 DECODE_BIG5 (c1, c2, charset, c3, c4);
1720 DECODE_CHARACTER_DIMENSION2 (charset, c3, c4);
1721 }
1722 else /* Invalid code */
1723 *dst++ = c1;
1724 }
1725 else
1726 {
/* Here 0xA0 <= C1 < 0xE0.  */
1727 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1728 if (sjis_p)
1729 DECODE_CHARACTER_DIMENSION1 (charset_katakana_jisx0201, c1);
1730 else
1731 {
1732 int charset;
1733
1734 ONE_MORE_BYTE (c2);
1735 DECODE_BIG5 (c1, c2, charset, c3, c4);
1736 DECODE_CHARACTER_DIMENSION2 (charset, c3, c4);
1737 }
1738 }
1739 continue;
1740
/* Presumably the jump target of ONE_MORE_BYTE (defined elsewhere)
   when the source runs out mid-character: save the partial sequence
   in CODING->carryover and rewind SRC so it is not reported as
   consumed.  */
1741 label_end_of_loop:
1742 coding->carryover_size = src - src_base;
1743 bcopy (src_base, coding->carryover, coding->carryover_size);
1744 src = src_base;
1745 break;
1746 }
1747
1748 *consumed = src - source;
1749 return dst - destination;
1750}
1751
1752/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1753 This function can encode `charset_ascii', `charset_katakana_jisx0201',
1754 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
1755 sure that all these charsets are registered as official charset
1756 (i.e. do not have extended leading-codes). Characters of other
1757 charsets are produced without any encoding. If SJIS_P is 1, encode
1758 SJIS text, else encode BIG5 text. */
1759
1760int
1761encode_coding_sjis_big5 (coding, source, destination,
1762 src_bytes, dst_bytes, consumed, sjis_p)
1763 struct coding_system *coding;
1764 unsigned char *source, *destination;
1765 int src_bytes, dst_bytes;
1766 int *consumed;
1767 int sjis_p;
1768{
1769 unsigned char *src = source;
1770 unsigned char *src_end = source + src_bytes;
1771 unsigned char *dst = destination;
1772 unsigned char *dst_end = destination + dst_bytes;
1773 /* Since the maximum bytes produced by each loop is 2, we subtract 1
1774 from DST_END to assure overflow checking is necessary only at the
1775 head of loop. */
1776 unsigned char *adjusted_dst_end = dst_end - 1;
1777
1778 while (src < src_end && dst < adjusted_dst_end)
1779 {
1780 /* SRC_BASE remembers the start position in source in each loop.
1781 The loop will be exited when there's not enough source text
1782 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1783 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
1784 before exiting. */
1785 unsigned char *src_base = src;
1786 unsigned char c1 = *src++, c2, c3, c4;
1787
1788 if (coding->composing)
1789 {
1790 if (c1 == 0xA0)
1791 {
1792 ONE_MORE_BYTE (c1);
1793 c1 &= 0x7F;
1794 }
1795 else if (c1 >= 0xA0)
1796 c1 -= 0x20;
1797 else
1798 coding->composing = 0;
1799 }
1800
1801 switch (emacs_code_class[c1])
1802 {
1803 case EMACS_ascii_code:
1804 case EMACS_control_code:
1805 *dst++ = c1;
1806 break;
1807
1808 case EMACS_carriage_return_code:
1809 if (!coding->selective)
1810 {
1811 *dst++ = c1;
1812 break;
1813 }
1814 /* fall down to treat '\r' as '\n' ... */
1815
1816 case EMACS_linefeed_code:
1817 if (coding->eol_type == CODING_EOL_LF
0ef69138 1818 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
1819 *dst++ = '\n';
1820 else if (coding->eol_type == CODING_EOL_CRLF)
1821 *dst++ = '\r', *dst++ = '\n';
1822 else
1823 *dst++ = '\r';
1824 break;
1825
1826 case EMACS_leading_code_2:
1827 ONE_MORE_BYTE (c2);
1828 if (sjis_p && c1 == charset_katakana_jisx0201)
1829 *dst++ = c2;
1830 else
1831 *dst++ = c1, *dst++ = c2;
1832 break;
1833
1834 case EMACS_leading_code_3:
1835 TWO_MORE_BYTES (c2, c3);
1836 c2 &= 0x7F, c3 &= 0x7F;
1837 if (sjis_p && c1 == charset_jisx0208)
1838 {
1839 unsigned char s1, s2;
1840
1841 ENCODE_SJIS (c2, c3, s1, s2);
1842 *dst++ = s1, *dst++ = s2;
1843 }
1844 else if (!sjis_p && (c1 == charset_big5_1 || c1 == charset_big5_2))
1845 {
1846 unsigned char b1, b2;
1847
1848 ENCODE_BIG5 (c1, c2, c3, b1, b2);
1849 *dst++ = b1, *dst++ = b2;
1850 }
1851 else
1852 *dst++ = c1, *dst++ = c2, *dst++ = c3;
1853 break;
1854
1855 case EMACS_leading_code_4:
1856 THREE_MORE_BYTES (c2, c3, c4);
1857 *dst++ = c1, *dst++ = c2, *dst++ = c3, *dst++ = c4;
1858 break;
1859
1860 case EMACS_leading_code_composition:
1861 coding->composing = 1;
1862 break;
1863
1864 default: /* i.e. case EMACS_invalid_code: */
1865 *dst++ = c1;
1866 }
1867 continue;
1868
1869 label_end_of_loop:
1870 coding->carryover_size = src - src_base;
1871 bcopy (src_base, coding->carryover, coding->carryover_size);
1872 src = src_base;
1873 break;
1874 }
1875
1876 *consumed = src - source;
1877 return dst - destination;
1878}
1879
1880\f
1881/*** 5. End-of-line handlers ***/
1882
1883/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1884 This function is called only when `coding->eol_type' is
1885 CODING_EOL_CRLF or CODING_EOL_CR. */
1886
1887decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1888 struct coding_system *coding;
1889 unsigned char *source, *destination;
1890 int src_bytes, dst_bytes;
1891 int *consumed;
1892{
1893 unsigned char *src = source;
1894 unsigned char *src_end = source + src_bytes;
1895 unsigned char *dst = destination;
1896 unsigned char *dst_end = destination + dst_bytes;
1897 int produced;
1898
1899 switch (coding->eol_type)
1900 {
1901 case CODING_EOL_CRLF:
1902 {
1903 /* Since the maximum bytes produced by each loop is 2, we
1904 subtract 1 from DST_END to assure overflow checking is
1905 necessary only at the head of loop. */
1906 unsigned char *adjusted_dst_end = dst_end - 1;
1907
1908 while (src < src_end && dst < adjusted_dst_end)
1909 {
1910 unsigned char *src_base = src;
1911 unsigned char c = *src++;
1912 if (c == '\r')
1913 {
1914 ONE_MORE_BYTE (c);
1915 if (c != '\n')
1916 *dst++ = '\r';
bfd99048 1917 *dst++ = c;
4ed46869
KH
1918 }
1919 else
1920 *dst++ = c;
1921 continue;
1922
1923 label_end_of_loop:
1924 coding->carryover_size = src - src_base;
1925 bcopy (src_base, coding->carryover, coding->carryover_size);
1926 src = src_base;
1927 break;
1928 }
1929 *consumed = src - source;
1930 produced = dst - destination;
1931 break;
1932 }
1933
1934 case CODING_EOL_CR:
1935 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1936 bcopy (source, destination, produced);
1937 dst_end = destination + produced;
1938 while (dst < dst_end)
1939 if (*dst++ == '\r') dst[-1] = '\n';
1940 *consumed = produced;
1941 break;
1942
1943 default: /* i.e. case: CODING_EOL_LF */
1944 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1945 bcopy (source, destination, produced);
1946 *consumed = produced;
1947 break;
1948 }
1949
1950 return produced;
1951}
1952
1953/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
1954 format of end-of-line according to `coding->eol_type'. If
1955 `coding->selective' is 1, code '\r' in source text also means
1956 end-of-line. */
1957
1958encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1959 struct coding_system *coding;
1960 unsigned char *source, *destination;
1961 int src_bytes, dst_bytes;
1962 int *consumed;
1963{
1964 unsigned char *src = source;
1965 unsigned char *dst = destination;
1966 int produced;
1967
1968 if (src_bytes <= 0)
1969 return 0;
1970
1971 switch (coding->eol_type)
1972 {
1973 case CODING_EOL_LF:
0ef69138 1974 case CODING_EOL_UNDECIDED:
4ed46869
KH
1975 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1976 bcopy (source, destination, produced);
1977 if (coding->selective)
1978 {
1979 int i = produced;
1980 while (i--)
1981 if (*dst++ == '\r') dst[-1] = '\n';
1982 }
1983 *consumed = produced;
1984
1985 case CODING_EOL_CRLF:
1986 {
1987 unsigned char c;
1988 unsigned char *src_end = source + src_bytes;
1989 unsigned char *dst_end = destination + dst_bytes;
1990 /* Since the maximum bytes produced by each loop is 2, we
1991 subtract 1 from DST_END to assure overflow checking is
1992 necessary only at the head of loop. */
1993 unsigned char *adjusted_dst_end = dst_end - 1;
1994
1995 while (src < src_end && dst < adjusted_dst_end)
1996 {
1997 c = *src++;
1998 if (c == '\n' || (c == '\r' && coding->selective))
1999 *dst++ = '\r', *dst++ = '\n';
2000 else
2001 *dst++ = c;
2002 }
2003 produced = dst - destination;
2004 *consumed = src - source;
2005 break;
2006 }
2007
2008 default: /* i.e. case CODING_EOL_CR: */
2009 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2010 bcopy (source, destination, produced);
2011 {
2012 int i = produced;
2013 while (i--)
2014 if (*dst++ == '\n') dst[-1] = '\r';
2015 }
2016 *consumed = produced;
2017 }
2018
2019 return produced;
2020}
2021
2022\f
2023/*** 6. C library functions ***/
2024
2025/* In Emacs Lisp, coding system is represented by a Lisp symbol which
2026 has a property `coding-system'. The value of this property is a
2027 vector of length 5 (called as coding-vector). Among elements of
2028 this vector, the first (element[0]) and the fifth (element[4])
2029 carry important information for decoding/encoding. Before
2030 decoding/encoding, this information should be set in fields of a
2031 structure of type `coding_system'.
2032
2033 A value of property `coding-system' can be a symbol of another
2034 subsidiary coding-system. In that case, Emacs gets coding-vector
2035 from that symbol.
2036
2037 `element[0]' contains information to be set in `coding->type'. The
2038 value and its meaning is as follows:
2039
0ef69138
KH
2040 0 -- coding_type_emacs_mule
2041 1 -- coding_type_sjis
2042 2 -- coding_type_iso2022
2043 3 -- coding_type_big5
2044 4 -- coding_type_ccl encoder/decoder written in CCL
2045 nil -- coding_type_no_conversion
2046 t -- coding_type_undecided (automatic conversion on decoding,
2047 no-conversion on encoding)
4ed46869
KH
2048
2049 `element[4]' contains information to be set in `coding->flags' and
2050 `coding->spec'. The meaning varies by `coding->type'.
2051
2052 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2053 of length 32 (of which the first 15 sub-elements are used now).
2054 Meanings of these sub-elements are:
2055
2056 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2057 If the value is an integer of valid charset, the charset is
2058 assumed to be designated to graphic register N initially.
2059
2060 If the value is minus, it is a minus value of charset which
2061 reserves graphic register N, which means that the charset is
2062 not designated initially but should be designated to graphic
2063 register N just before encoding a character in that charset.
2064
2065 If the value is nil, graphic register N is never used on
2066 encoding.
2067
2068 sub-element[N] where N is 4 through 14: to be set in `coding->flags'
2069 Each value takes t or nil. See the section ISO2022 of
2070 `coding.h' for more information.
2071
2072 If `coding->type' is `coding_type_big5', element[4] is t to denote
2073 BIG5-ETen or nil to denote BIG5-HKU.
2074
2075 If `coding->type' takes the other value, element[4] is ignored.
2076
2077 Emacs Lisp's coding system also carries information about format of
2078 end-of-line in a value of property `eol-type'. If the value is
2079 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2080 means CODING_EOL_CR. If it is not integer, it should be a vector
2081 of subsidiary coding systems of which property `eol-type' has one
2082 of above values.
2083
2084*/
2085
2086/* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2087 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2088 is setup so that no conversion is necessary and return -1, else
2089 return 0. */
2090
2091int
e0e989f6
KH
2092setup_coding_system (coding_system, coding)
2093 Lisp_Object coding_system;
4ed46869
KH
2094 struct coding_system *coding;
2095{
4ed46869
KH
2096 Lisp_Object type, eol_type;
2097
2098 /* At first, set several fields default values. */
2099 coding->require_flushing = 0;
2100 coding->last_block = 0;
2101 coding->selective = 0;
2102 coding->composing = 0;
2103 coding->direction = 0;
2104 coding->carryover_size = 0;
4ed46869 2105 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
bdd9fb48
KH
2106 /* We have not yet implemented a way to specify unification table in
2107 a coding system. */
2108 coding->character_unification_table = Qnil;
4ed46869 2109
e0e989f6
KH
2110 Vlast_coding_system_used = coding->symbol = coding_system;
2111 eol_type = Qnil;
2112 /* Get value of property `coding-system' until we get a vector.
2113 While doing that, also get values of properties
2114 `post-read-conversion', `pre-write-conversion', and `eol-type'. */
2115 while (!NILP (coding_system) && SYMBOLP (coding_system))
4ed46869 2116 {
4ed46869 2117 if (NILP (coding->post_read_conversion))
e0e989f6 2118 coding->post_read_conversion = Fget (coding_system,
4ed46869 2119 Qpost_read_conversion);
e0e989f6
KH
2120 if (NILP (coding->pre_write_conversion))
2121 coding->pre_write_conversion = Fget (coding_system,
4ed46869 2122 Qpre_write_conversion);
e0e989f6
KH
2123 if (NILP (eol_type))
2124 eol_type = Fget (coding_system, Qeol_type);
2125 coding_system = Fget (coding_system, Qcoding_system);
4ed46869 2126 }
e0e989f6
KH
2127 if (!VECTORP (coding_system)
2128 || XVECTOR (coding_system)->size != 5)
4ed46869
KH
2129 goto label_invalid_coding_system;
2130
4ed46869 2131 if (VECTORP (eol_type))
0ef69138 2132 coding->eol_type = CODING_EOL_UNDECIDED;
4ed46869
KH
2133 else if (XFASTINT (eol_type) == 1)
2134 coding->eol_type = CODING_EOL_CRLF;
2135 else if (XFASTINT (eol_type) == 2)
2136 coding->eol_type = CODING_EOL_CR;
2137 else
2138 coding->eol_type = CODING_EOL_LF;
2139
e0e989f6 2140 type = XVECTOR (coding_system)->contents[0];
4ed46869
KH
2141 switch (XFASTINT (type))
2142 {
2143 case 0:
0ef69138 2144 coding->type = coding_type_emacs_mule;
4ed46869
KH
2145 break;
2146
2147 case 1:
2148 coding->type = coding_type_sjis;
2149 break;
2150
2151 case 2:
2152 coding->type = coding_type_iso2022;
2153 {
e0e989f6 2154 Lisp_Object val = XVECTOR (coding_system)->contents[4];
4ed46869
KH
2155 Lisp_Object *flags;
2156 int i, charset, default_reg_bits = 0;
2157
2158 if (!VECTORP (val) || XVECTOR (val)->size != 32)
2159 goto label_invalid_coding_system;
2160
2161 flags = XVECTOR (val)->contents;
2162 coding->flags
2163 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2164 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2165 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2166 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2167 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2168 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2169 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2170 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
e0e989f6
KH
2171 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2172 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2173 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL));
4ed46869
KH
2174
2175 /* Invoke graphic register 0 to plane 0. */
2176 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2177 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2178 CODING_SPEC_ISO_INVOCATION (coding, 1)
2179 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2180 /* Not single shifting at first. */
2181 CODING_SPEC_ISO_SINGLE_SHIFTING(coding) = 0;
e0e989f6
KH
2182 /* Beginning of buffer should also be regarded as bol. */
2183 CODING_SPEC_ISO_BOL(coding) = 1;
4ed46869
KH
2184
2185 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2186 FLAGS[REG] can be one of below:
2187 integer CHARSET: CHARSET occupies register I,
2188 t: designate nothing to REG initially, but can be used
2189 by any charsets,
2190 list of integer, nil, or t: designate the first
2191 element (if integer) to REG initially, the remaining
2192 elements (if integer) is designated to REG on request,
2193 if an element is t, REG can be used by any charset,
2194 nil: REG is never used. */
467e7675 2195 for (charset = 0; charset <= MAX_CHARSET; charset++)
4ed46869
KH
2196 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = -1;
2197 for (i = 0; i < 4; i++)
2198 {
2199 if (INTEGERP (flags[i])
e0e989f6
KH
2200 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2201 || (charset = get_charset_id (flags[i])) >= 0)
4ed46869
KH
2202 {
2203 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2204 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2205 }
2206 else if (EQ (flags[i], Qt))
2207 {
2208 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2209 default_reg_bits |= 1 << i;
2210 }
2211 else if (CONSP (flags[i]))
2212 {
2213 Lisp_Object tail = flags[i];
2214
2215 if (INTEGERP (XCONS (tail)->car)
2216 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
2217 CHARSET_VALID_P (charset))
2218 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
4ed46869
KH
2219 {
2220 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2221 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2222 }
2223 else
2224 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2225 tail = XCONS (tail)->cdr;
2226 while (CONSP (tail))
2227 {
2228 if (INTEGERP (XCONS (tail)->car)
2229 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
2230 CHARSET_VALID_P (charset))
2231 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
4ed46869
KH
2232 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2233 = i;
2234 else if (EQ (XCONS (tail)->car, Qt))
2235 default_reg_bits |= 1 << i;
2236 tail = XCONS (tail)->cdr;
2237 }
2238 }
2239 else
2240 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2241
2242 CODING_SPEC_ISO_DESIGNATION (coding, i)
2243 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2244 }
2245
2246 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2247 {
2248 /* REG 1 can be used only by locking shift in 7-bit env. */
2249 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2250 default_reg_bits &= ~2;
2251 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2252 /* Without any shifting, only REG 0 and 1 can be used. */
2253 default_reg_bits &= 3;
2254 }
2255
467e7675 2256 for (charset = 0; charset <= MAX_CHARSET; charset++)
4ed46869
KH
2257 if (CHARSET_VALID_P (charset)
2258 && CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) < 0)
2259 {
2260 /* We have not yet decided where to designate CHARSET. */
2261 int reg_bits = default_reg_bits;
2262
2263 if (CHARSET_CHARS (charset) == 96)
2264 /* A charset of CHARS96 can't be designated to REG 0. */
2265 reg_bits &= ~1;
2266
2267 if (reg_bits)
2268 /* There exist some default graphic register. */
2269 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2270 = (reg_bits & 1
2271 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2272 else
2273 /* We anyway have to designate CHARSET to somewhere. */
2274 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2275 = (CHARSET_CHARS (charset) == 94
2276 ? 0
2277 : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2278 || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2279 ? 1
2280 : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2281 ? 2 : 0)));
2282 }
2283 }
2284 coding->require_flushing = 1;
2285 break;
2286
2287 case 3:
2288 coding->type = coding_type_big5;
2289 coding->flags
e0e989f6 2290 = (NILP (XVECTOR (coding_system)->contents[4])
4ed46869
KH
2291 ? CODING_FLAG_BIG5_HKU
2292 : CODING_FLAG_BIG5_ETEN);
2293 break;
2294
2295 case 4:
2296 coding->type = coding_type_ccl;
2297 {
e0e989f6 2298 Lisp_Object val = XVECTOR (coding_system)->contents[4];
4ed46869
KH
2299 if (CONSP (val)
2300 && VECTORP (XCONS (val)->car)
2301 && VECTORP (XCONS (val)->cdr))
2302 {
2303 setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2304 setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2305 }
2306 else
2307 goto label_invalid_coding_system;
2308 }
2309 coding->require_flushing = 1;
2310 break;
2311
2312 default:
2313 if (EQ (type, Qt))
0ef69138 2314 coding->type = coding_type_undecided;
4ed46869
KH
2315 else
2316 coding->type = coding_type_no_conversion;
2317 break;
2318 }
2319 return 0;
2320
2321 label_invalid_coding_system:
2322 coding->type = coding_type_no_conversion;
dec137e5 2323 coding->eol_type = CODING_EOL_LF;
e0e989f6
KH
2324 coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2325 = Qnil;
4ed46869
KH
2326 return -1;
2327}
2328
2329/* Emacs has a mechanism to automatically detect a coding system if it
2330 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
2331 it's impossible to distinguish some coding systems accurately
2332 because they use the same range of codes. So, at first, coding
2333 systems are categorized into 8, those are:
2334
0ef69138 2335 o coding-category-emacs-mule
4ed46869
KH
2336
2337 The category for a coding system which has the same code range
2338 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 2339 symbol) `emacs-mule' by default.
4ed46869
KH
2340
2341 o coding-category-sjis
2342
2343 The category for a coding system which has the same code range
2344 as SJIS. Assigned the coding-system (Lisp
e0e989f6 2345 symbol) `shift-jis' by default.
4ed46869
KH
2346
2347 o coding-category-iso-7
2348
2349 The category for a coding system which has the same code range
2350 as ISO2022 of 7-bit environment. Assigned the coding-system
e0e989f6 2351 (Lisp symbol) `iso-2022-7' by default.
4ed46869
KH
2352
2353 o coding-category-iso-8-1
2354
2355 The category for a coding system which has the same code range
2356 as ISO2022 of 8-bit environment and graphic plane 1 used only
2357 for DIMENSION1 charset. Assigned the coding-system (Lisp
e0e989f6 2358 symbol) `iso-8859-1' by default.
4ed46869
KH
2359
2360 o coding-category-iso-8-2
2361
2362 The category for a coding system which has the same code range
2363 as ISO2022 of 8-bit environment and graphic plane 1 used only
2364 for DIMENSION2 charset. Assigned the coding-system (Lisp
e0e989f6 2365 symbol) `euc-japan' by default.
4ed46869
KH
2366
2367 o coding-category-iso-else
2368
2369 The category for a coding system which has the same code range
2370 as ISO2022 but not belongs to any of the above three
2371 categories. Assigned the coding-system (Lisp symbol)
e0e989f6 2372 `iso-2022-ss2-7' by default.
4ed46869
KH
2373
2374 o coding-category-big5
2375
2376 The category for a coding system which has the same code range
2377 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 2378 `cn-big5' by default.
4ed46869
KH
2379
2380 o coding-category-binary
2381
2382 The category for a coding system not categorized in any of the
2383 above. Assigned the coding-system (Lisp symbol)
e0e989f6 2384 `no-conversion' by default.
4ed46869
KH
2385
2386 Each of them is a Lisp symbol and the value is an actual
2387 `coding-system's (this is also a Lisp symbol) assigned by a user.
2388 What Emacs does actually is to detect a category of coding system.
2389 Then, it uses a `coding-system' assigned to it. If Emacs can't
2390 decide only one possible category, it selects a category of the
2391 highest priority. Priorities of categories are also specified by a
2392 user in a Lisp variable `coding-category-list'.
2393
2394*/
2395
2396/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2397 If it detects possible coding systems, return an integer in which
2398 appropriate flag bits are set. Flag bits are defined by macros
2399 CODING_CATEGORY_MASK_XXX in `coding.h'. */
2400
2401int
2402detect_coding_mask (src, src_bytes)
2403 unsigned char *src;
2404 int src_bytes;
2405{
2406 register unsigned char c;
2407 unsigned char *src_end = src + src_bytes;
2408 int mask;
2409
2410 /* At first, skip all ASCII characters and control characters except
2411 for three ISO2022 specific control characters. */
bcf26d6a 2412 label_loop_detect_coding:
4ed46869
KH
2413 while (src < src_end)
2414 {
2415 c = *src;
2416 if (c >= 0x80
2417 || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2418 break;
2419 src++;
2420 }
2421
2422 if (src >= src_end)
2423 /* We found nothing other than ASCII. There's nothing to do. */
2424 return CODING_CATEGORY_MASK_ANY;
2425
2426 /* The text seems to be encoded in some multilingual coding system.
2427 Now, try to find in which coding system the text is encoded. */
2428 if (c < 0x80)
bcf26d6a
KH
2429 {
2430 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2431 /* C is an ISO2022 specific control code of C0. */
2432 mask = detect_coding_iso2022 (src, src_end);
2433 src++;
2434 if (mask == CODING_CATEGORY_MASK_ANY)
2435 /* No valid ISO2022 code follows C. Try again. */
2436 goto label_loop_detect_coding;
2437 }
4ed46869
KH
2438 else if (c == ISO_CODE_SS2 || c == ISO_CODE_SS3 || c == ISO_CODE_CSI)
2439 /* C is an ISO2022 specific control code of C1,
2440 or the first byte of SJIS's 2-byte character code,
2441 or a leading code of Emacs. */
2442 mask = (detect_coding_iso2022 (src, src_end)
2443 | detect_coding_sjis (src, src_end)
0ef69138 2444 | detect_coding_emacs_mule (src, src_end));
4ed46869
KH
2445
2446 else if (c < 0xA0)
2447 /* C is the first byte of SJIS character code,
2448 or a leading-code of Emacs. */
2449 mask = (detect_coding_sjis (src, src_end)
0ef69138 2450 | detect_coding_emacs_mule (src, src_end));
4ed46869
KH
2451
2452 else
2453 /* C is a character of ISO2022 in graphic plane right,
2454 or a SJIS's 1-byte character code (i.e. JISX0201),
2455 or the first byte of BIG5's 2-byte code. */
2456 mask = (detect_coding_iso2022 (src, src_end)
2457 | detect_coding_sjis (src, src_end)
2458 | detect_coding_big5 (src, src_end));
2459
2460 return mask;
2461}
2462
2463/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2464 The information of the detected coding system is set in CODING. */
2465
2466void
2467detect_coding (coding, src, src_bytes)
2468 struct coding_system *coding;
2469 unsigned char *src;
2470 int src_bytes;
2471{
2472 int mask = detect_coding_mask (src, src_bytes);
2473 int idx;
2474
2475 if (mask == CODING_CATEGORY_MASK_ANY)
2476 /* We found nothing other than ASCII. There's nothing to do. */
2477 return;
2478
2479 if (!mask)
2480 /* The source text seems to be encoded in unknown coding system.
2481 Emacs regards the category of such a kind of coding system as
2482 `coding-category-binary'. We assume that a user has assigned
2483 an appropriate coding system for a `coding-category-binary'. */
2484 idx = CODING_CATEGORY_IDX_BINARY;
2485 else
2486 {
2487 /* We found some plausible coding systems. Let's use a coding
2488 system of the highest priority. */
2489 Lisp_Object val = Vcoding_category_list;
2490
2491 if (CONSP (val))
2492 while (!NILP (val))
2493 {
2494 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2495 if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2496 break;
2497 val = XCONS (val)->cdr;
2498 }
2499 else
2500 val = Qnil;
2501
2502 if (NILP (val))
2503 {
2504 /* For unknown reason, `Vcoding_category_list' contains none
2505 of found categories. Let's use any of them. */
2506 for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2507 if (mask & (1 << idx))
2508 break;
2509 }
2510 }
2511 setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2512}
2513
2514/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2515 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
0ef69138 2516 CODING_EOL_CR, and CODING_EOL_UNDECIDED. */
4ed46869
KH
2517
2518int
2519detect_eol_type (src, src_bytes)
2520 unsigned char *src;
2521 int src_bytes;
2522{
2523 unsigned char *src_end = src + src_bytes;
2524 unsigned char c;
2525
2526 while (src < src_end)
2527 {
2528 c = *src++;
2529 if (c == '\n')
2530 return CODING_EOL_LF;
2531 else if (c == '\r')
2532 {
2533 if (src < src_end && *src == '\n')
2534 return CODING_EOL_CRLF;
2535 else
2536 return CODING_EOL_CR;
2537 }
2538 }
0ef69138 2539 return CODING_EOL_UNDECIDED;
4ed46869
KH
2540}
2541
2542/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2543 is encoded. If it detects an appropriate format of end-of-line, it
2544 sets the information in *CODING. */
2545
2546void
2547detect_eol (coding, src, src_bytes)
2548 struct coding_system *coding;
2549 unsigned char *src;
2550 int src_bytes;
2551{
2552 Lisp_Object val;
2553 int eol_type = detect_eol_type (src, src_bytes);
2554
0ef69138 2555 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2556 /* We found no end-of-line in the source text. */
2557 return;
2558
2559 val = Fget (coding->symbol, Qeol_type);
2560 if (VECTORP (val) && XVECTOR (val)->size == 3)
2561 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2562}
2563
2564/* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
2565 decoding, it may detect coding system and format of end-of-line if
2566 those are not yet decided. */
2567
2568int
2569decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2570 struct coding_system *coding;
2571 unsigned char *source, *destination;
2572 int src_bytes, dst_bytes;
2573 int *consumed;
2574{
2575 int produced;
2576
2577 if (src_bytes <= 0)
2578 {
2579 *consumed = 0;
2580 return 0;
2581 }
2582
0ef69138 2583 if (coding->type == coding_type_undecided)
4ed46869
KH
2584 detect_coding (coding, source, src_bytes);
2585
0ef69138 2586 if (coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2587 detect_eol (coding, source, src_bytes);
2588
2589 coding->carryover_size = 0;
2590 switch (coding->type)
2591 {
2592 case coding_type_no_conversion:
2593 label_no_conversion:
2594 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2595 bcopy (source, destination, produced);
2596 *consumed = produced;
2597 break;
2598
0ef69138
KH
2599 case coding_type_emacs_mule:
2600 case coding_type_undecided:
4ed46869 2601 if (coding->eol_type == CODING_EOL_LF
0ef69138 2602 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2603 goto label_no_conversion;
2604 produced = decode_eol (coding, source, destination,
2605 src_bytes, dst_bytes, consumed);
2606 break;
2607
2608 case coding_type_sjis:
2609 produced = decode_coding_sjis_big5 (coding, source, destination,
2610 src_bytes, dst_bytes, consumed,
2611 1);
2612 break;
2613
2614 case coding_type_iso2022:
2615 produced = decode_coding_iso2022 (coding, source, destination,
2616 src_bytes, dst_bytes, consumed);
2617 break;
2618
2619 case coding_type_big5:
2620 produced = decode_coding_sjis_big5 (coding, source, destination,
2621 src_bytes, dst_bytes, consumed,
2622 0);
2623 break;
2624
2625 case coding_type_ccl:
2626 produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2627 src_bytes, dst_bytes, consumed);
2628 break;
2629 }
2630
2631 return produced;
2632}
2633
2634/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
2635
2636int
2637encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2638 struct coding_system *coding;
2639 unsigned char *source, *destination;
2640 int src_bytes, dst_bytes;
2641 int *consumed;
2642{
2643 int produced;
2644
2645 coding->carryover_size = 0;
2646 switch (coding->type)
2647 {
2648 case coding_type_no_conversion:
2649 label_no_conversion:
2650 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2651 if (produced > 0)
2652 {
2653 bcopy (source, destination, produced);
2654 if (coding->selective)
2655 {
2656 unsigned char *p = destination, *pend = destination + produced;
2657 while (p < pend)
e0e989f6 2658 if (*p++ == '\015') p[-1] = '\n';
4ed46869
KH
2659 }
2660 }
2661 *consumed = produced;
2662 break;
2663
0ef69138
KH
2664 case coding_type_emacs_mule:
2665 case coding_type_undecided:
4ed46869 2666 if (coding->eol_type == CODING_EOL_LF
0ef69138 2667 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2668 goto label_no_conversion;
2669 produced = encode_eol (coding, source, destination,
2670 src_bytes, dst_bytes, consumed);
2671 break;
2672
2673 case coding_type_sjis:
2674 produced = encode_coding_sjis_big5 (coding, source, destination,
2675 src_bytes, dst_bytes, consumed,
2676 1);
2677 break;
2678
2679 case coding_type_iso2022:
2680 produced = encode_coding_iso2022 (coding, source, destination,
2681 src_bytes, dst_bytes, consumed);
2682 break;
2683
2684 case coding_type_big5:
2685 produced = encode_coding_sjis_big5 (coding, source, destination,
2686 src_bytes, dst_bytes, consumed,
2687 0);
2688 break;
2689
2690 case coding_type_ccl:
2691 produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2692 src_bytes, dst_bytes, consumed);
2693 break;
2694 }
2695
2696 return produced;
2697}
2698
2699#define CONVERSION_BUFFER_EXTRA_ROOM 256
2700
2701/* Return maximum size (bytes) of a buffer enough for decoding
2702 SRC_BYTES of text encoded in CODING. */
2703
2704int
2705decoding_buffer_size (coding, src_bytes)
2706 struct coding_system *coding;
2707 int src_bytes;
2708{
2709 int magnification;
2710
2711 if (coding->type == coding_type_iso2022)
2712 magnification = 3;
2713 else if (coding->type == coding_type_ccl)
2714 magnification = coding->spec.ccl.decoder.buf_magnification;
2715 else
2716 magnification = 2;
2717
2718 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2719}
2720
2721/* Return maximum size (bytes) of a buffer enough for encoding
2722 SRC_BYTES of text to CODING. */
2723
2724int
2725encoding_buffer_size (coding, src_bytes)
2726 struct coding_system *coding;
2727 int src_bytes;
2728{
2729 int magnification;
2730
2731 if (coding->type == coding_type_ccl)
2732 magnification = coding->spec.ccl.encoder.buf_magnification;
2733 else
2734 magnification = 3;
2735
2736 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2737}
2738
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* The shared conversion buffer and its current size in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a SIZE bytes of buffer to be used for encoding
   or decoding.  The shared buffer is reused when it is already big
   enough.  Otherwise a larger one is allocated with xmalloc and the
   old one is freed; its contents are not copied.

   NOTE(review): an earlier comment said NULL is returned when memory
   runs out.  That depends on xmalloc, which is defined elsewhere --
   confirm.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size * 2;

      /* Doubling zero never reaches SIZE, so start from the minimum
         size when no buffer has been allocated yet.  */
      if (real_size < MINIMUM_CONVERSION_BUFFER_SIZE)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;
      while (real_size < size) real_size *= 2;
      buf = (char *) xmalloc (real_size);
      if (conversion_buffer)
        xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
2767
2768\f
2769#ifdef emacs
2770/*** 7. Emacs Lisp library functions ***/
2771
2772DEFUN ("coding-system-vector", Fcoding_system_vector, Scoding_system_vector,
2773 1, 1, 0,
2774 "Return coding-vector of CODING-SYSTEM.\n\
2775If CODING-SYSTEM is not a valid coding-system, return nil.")
2776 (obj)
2777 Lisp_Object obj;
2778{
2779 while (SYMBOLP (obj) && !NILP (obj))
2780 obj = Fget (obj, Qcoding_system);
2781 return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
2782 ? Qnil : obj);
2783}
2784
2785DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
2786 "Return t if OBJECT is nil or a coding-system.\n\
2787See document of make-coding-system for coding-system object.")
2788 (obj)
2789 Lisp_Object obj;
2790{
2791 return ((NILP (obj) || !NILP (Fcoding_system_vector (obj))) ? Qt : Qnil);
2792}
2793
9d991de8
RS
2794DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
2795 Sread_non_nil_coding_system, 1, 1, 0,
e0e989f6 2796 "Read a coding system from the minibuffer, prompting with string PROMPT.")
4ed46869
KH
2797 (prompt)
2798 Lisp_Object prompt;
2799{
e0e989f6 2800 Lisp_Object val;
9d991de8
RS
2801 do
2802 {
2803 val = Fcompleting_read (prompt, Vobarray, Qcoding_system_vector,
2804 Qt, Qnil, Qnil, Qnil);
2805 }
2806 while (XSTRING (val)->size == 0);
e0e989f6 2807 return (Fintern (val, Qnil));
4ed46869
KH
2808}
2809
2810DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
e0e989f6 2811 "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
4ed46869
KH
2812 (prompt)
2813 Lisp_Object prompt;
2814{
e0e989f6 2815 Lisp_Object val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
9d991de8 2816 Qt, Qnil, Qnil, Qnil);
e0e989f6 2817 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
2818}
2819
2820DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
2821 1, 1, 0,
2822 "Check validity of CODING-SYSTEM.\n\
2823If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
2824CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
2825The value of property should be a vector of length 5.")
2826 (coding_system)
2827 Lisp_Object coding_system;
2828{
2829 CHECK_SYMBOL (coding_system, 0);
2830 if (!NILP (Fcoding_system_p (coding_system)))
2831 return coding_system;
2832 while (1)
2833 Fsignal (Qcoding_system_error, coding_system);
2834}
2835
2836DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
2837 2, 2, 0,
2838 "Detect coding-system of the text in the region between START and END.\n\
2839Return a list of possible coding-systems ordered by priority.\n\
0ef69138 2840If only ASCII characters are found, it returns `undecided'\n\
4ed46869
KH
2841 or its subsidiary coding-system according to a detected end-of-line format.")
2842 (b, e)
2843 Lisp_Object b, e;
2844{
2845 int coding_mask, eol_type;
2846 Lisp_Object val;
2847 int beg, end;
2848
2849 validate_region (&b, &e);
2850 beg = XINT (b), end = XINT (e);
2851 if (beg < GPT && end >= GPT) move_gap (end);
2852
2853 coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
2854 eol_type = detect_eol_type (POS_ADDR (beg), end - beg);
2855
2856 if (coding_mask == CODING_CATEGORY_MASK_ANY)
2857 {
0ef69138
KH
2858 val = intern ("undecided");
2859 if (eol_type != CODING_EOL_UNDECIDED)
4ed46869
KH
2860 {
2861 Lisp_Object val2 = Fget (val, Qeol_type);
2862 if (VECTORP (val2))
2863 val = XVECTOR (val2)->contents[eol_type];
2864 }
2865 }
2866 else
2867 {
2868 Lisp_Object val2;
2869
2870 /* At first, gather possible coding-systems in VAL in a reverse
2871 order. */
2872 val = Qnil;
2873 for (val2 = Vcoding_category_list;
2874 !NILP (val2);
2875 val2 = XCONS (val2)->cdr)
2876 {
2877 int idx
2878 = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
2879 if (coding_mask & (1 << idx))
2880 val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
2881 }
2882
2883 /* Then, change the order of the list, while getting subsidiary
2884 coding-systems. */
2885 val2 = val;
2886 val = Qnil;
2887 for (; !NILP (val2); val2 = XCONS (val2)->cdr)
2888 {
0ef69138 2889 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2890 val = Fcons (XCONS (val2)->car, val);
2891 else
2892 {
2893 Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
2894 if (VECTORP (val3))
2895 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
2896 else
2897 val = Fcons (XCONS (val2)->car, val);
2898 }
2899 }
2900 }
2901
2902 return val;
2903}
2904
2905/* Scan text in the region between *BEGP and *ENDP, skip characters
2906 which we never have to encode to (iff ENCODEP is 1) or decode from
2907 coding system CODING at the head and tail, then set BEGP and ENDP
2908 to the addresses of start and end of the text we actually convert. */
2909
2910void
2911shrink_conversion_area (begp, endp, coding, encodep)
2912 unsigned char **begp, **endp;
2913 struct coding_system *coding;
2914 int encodep;
2915{
2916 register unsigned char *beg_addr = *begp, *end_addr = *endp;
2917
2918 if (coding->eol_type != CODING_EOL_LF
0ef69138 2919 && coding->eol_type != CODING_EOL_UNDECIDED)
4ed46869
KH
2920 /* Since we anyway have to convert end-of-line format, it is not
2921 worth skipping at most 100 bytes or so. */
2922 return;
2923
2924 if (encodep) /* for encoding */
2925 {
2926 switch (coding->type)
2927 {
2928 case coding_type_no_conversion:
0ef69138
KH
2929 case coding_type_emacs_mule:
2930 case coding_type_undecided:
4ed46869
KH
2931 /* We need no conversion. */
2932 *begp = *endp;
2933 return;
2934 case coding_type_ccl:
2935 /* We can't skip any data. */
2936 return;
e0e989f6
KH
2937 case coding_type_iso2022:
2938 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2939 {
2940 unsigned char *bol = beg_addr;
2941 while (beg_addr < end_addr && *beg_addr < 0x80)
2942 {
2943 beg_addr++;
2944 if (*(beg_addr - 1) == '\n')
2945 bol = beg_addr;
2946 }
2947 beg_addr = bol;
2948 goto label_skip_tail;
2949 }
2950 /* fall down ... */
4ed46869
KH
2951 default:
2952 /* We can skip all ASCII characters at the head and tail. */
2953 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
e0e989f6 2954 label_skip_tail:
4ed46869
KH
2955 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
2956 break;
2957 }
2958 }
2959 else /* for decoding */
2960 {
2961 switch (coding->type)
2962 {
2963 case coding_type_no_conversion:
2964 /* We need no conversion. */
2965 *begp = *endp;
2966 return;
0ef69138 2967 case coding_type_emacs_mule:
4ed46869
KH
2968 if (coding->eol_type == CODING_EOL_LF)
2969 {
2970 /* We need no conversion. */
2971 *begp = *endp;
2972 return;
2973 }
2974 /* We can skip all but carriage-return. */
2975 while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
2976 while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
2977 break;
2978 case coding_type_sjis:
2979 case coding_type_big5:
2980 /* We can skip all ASCII characters at the head. */
2981 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
2982 /* We can skip all ASCII characters at the tail except for
2983 the second byte of SJIS or BIG5 code. */
2984 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
2985 if (end_addr != *endp)
2986 end_addr++;
2987 break;
2988 case coding_type_ccl:
2989 /* We can't skip any data. */
2990 return;
2991 default: /* i.e. case coding_type_iso2022: */
2992 {
2993 unsigned char c;
2994
2995 /* We can skip all ASCII characters except for a few
2996 control codes at the head. */
2997 while (beg_addr < end_addr && (c = *beg_addr) < 0x80
2998 && c != ISO_CODE_CR && c != ISO_CODE_SO
2999 && c != ISO_CODE_SI && c != ISO_CODE_ESC)
3000 beg_addr++;
3001 }
3002 break;
3003 }
3004 }
3005 *begp = beg_addr;
3006 *endp = end_addr;
3007 return;
3008}
3009
3010/* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3011 text between B and E. B and E are buffer position. */
3012
3013Lisp_Object
3014code_convert_region (b, e, coding, encodep)
3015 Lisp_Object b, e;
3016 struct coding_system *coding;
3017 int encodep;
3018{
3019 int beg, end, len, consumed, produced;
3020 char *buf;
3021 unsigned char *begp, *endp;
3022 int pos = PT;
3023
3024 validate_region (&b, &e);
3025 beg = XINT (b), end = XINT (e);
3026 if (beg < GPT && end >= GPT)
3027 move_gap (end);
3028
3029 if (encodep && !NILP (coding->pre_write_conversion))
3030 {
3031 /* We must call a pre-conversion function which may put a new
3032 text to be converted in a new buffer. */
3033 struct buffer *old = current_buffer, *new;
3034
3035 TEMP_SET_PT (beg);
3036 call2 (coding->pre_write_conversion, b, e);
3037 if (old != current_buffer)
3038 {
3039 /* Replace the original text by the text just generated. */
3040 len = ZV - BEGV;
3041 new = current_buffer;
3042 set_buffer_internal (old);
3043 del_range (beg, end);
3044 insert_from_buffer (new, 1, len, 0);
3045 end = beg + len;
3046 }
3047 }
3048
3049 /* We may be able to shrink the conversion region. */
3050 begp = POS_ADDR (beg); endp = begp + (end - beg);
3051 shrink_conversion_area (&begp, &endp, coding, encodep);
3052
3053 if (begp == endp)
3054 /* We need no conversion. */
3055 len = end - beg;
3056 else
3057 {
3058 beg += begp - POS_ADDR (beg);
3059 end = beg + (endp - begp);
3060
3061 if (encodep)
3062 len = encoding_buffer_size (coding, end - beg);
3063 else
3064 len = decoding_buffer_size (coding, end - beg);
3065 buf = get_conversion_buffer (len);
3066
3067 coding->last_block = 1;
3068 produced = (encodep
3069 ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3070 &consumed)
3071 : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3072 &consumed));
3073
3074 len = produced + (beg - XINT (b)) + (XINT (e) - end);
3075
3076 TEMP_SET_PT (beg);
3077 insert (buf, produced);
3078 del_range (PT, PT + end - beg);
3079 if (pos >= end)
3080 pos = PT + (pos - end);
3081 else if (pos > beg)
3082 pos = beg;
3083 TEMP_SET_PT (pos);
3084 }
3085
3086 if (!encodep && !NILP (coding->post_read_conversion))
3087 {
3088 /* We must call a post-conversion function which may alter
3089 the text just converted. */
3090 Lisp_Object insval;
3091
3092 beg = XINT (b);
3093 TEMP_SET_PT (beg);
3094 insval = call1 (coding->post_read_conversion, make_number (len));
3095 CHECK_NUMBER (insval, 0);
3096 len = XINT (insval);
3097 }
3098
3099 return make_number (len);
3100}
3101
3102Lisp_Object
e0e989f6
KH
3103code_convert_string (str, coding, encodep, nocopy)
3104 Lisp_Object str, nocopy;
4ed46869
KH
3105 struct coding_system *coding;
3106 int encodep;
3107{
3108 int len, consumed, produced;
3109 char *buf;
3110 unsigned char *begp, *endp;
3111 int head_skip, tail_skip;
3112 struct gcpro gcpro1;
3113
3114 if (encodep && !NILP (coding->pre_write_conversion)
3115 || !encodep && !NILP (coding->post_read_conversion))
3116 {
3117 /* Since we have to call Lisp functions which assume target text
3118 is in a buffer, after setting a temporary buffer, call
3119 code_convert_region. */
3120 int count = specpdl_ptr - specpdl;
3121 int len = XSTRING (str)->size;
3122 Lisp_Object result;
3123 struct buffer *old = current_buffer;
3124
3125 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3126 temp_output_buffer_setup (" *code-converting-work*");
3127 set_buffer_internal (XBUFFER (Vstandard_output));
3128 insert_from_string (str, 0, len, 0);
3129 code_convert_region (make_number (BEGV), make_number (ZV),
3130 coding, encodep);
3131 result = make_buffer_string (BEGV, ZV, 0);
3132 set_buffer_internal (old);
3133 return unbind_to (count, result);
3134 }
3135
3136 /* We may be able to shrink the conversion region. */
3137 begp = XSTRING (str)->data;
3138 endp = begp + XSTRING (str)->size;
3139 shrink_conversion_area (&begp, &endp, coding, encodep);
3140
3141 if (begp == endp)
3142 /* We need no conversion. */
e0e989f6 3143 return (NILP (nocopy) ? Fcopy_sequence (str) : str);
4ed46869
KH
3144
3145 head_skip = begp - XSTRING (str)->data;
3146 tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3147
3148 GCPRO1 (str);
3149
3150 if (encodep)
3151 len = encoding_buffer_size (coding, endp - begp);
3152 else
3153 len = decoding_buffer_size (coding, endp - begp);
3154 buf = get_conversion_buffer (len + head_skip + tail_skip);
3155
3156 bcopy (XSTRING (str)->data, buf, head_skip);
3157 coding->last_block = 1;
3158 produced = (encodep
3159 ? encode_coding (coding, XSTRING (str)->data + head_skip,
3160 buf + head_skip, endp - begp, len, &consumed)
3161 : decode_coding (coding, XSTRING (str)->data + head_skip,
3162 buf + head_skip, endp - begp, len, &consumed));
3163 bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3164 buf + head_skip + produced,
3165 tail_skip);
3166
3167 UNGCPRO;
3168
3169 return make_string (buf, head_skip + produced + tail_skip);
3170}
3171
3172DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
e0e989f6
KH
3173 3, 3, "r\nzCoding system: ",
3174 "Decode current region by specified coding system.\n\
3175When called from a program, takes three arguments:\n\
3176START, END, and CODING-SYSTEM. START END are buffer positions.\n\
4ed46869
KH
3177Return length of decoded text.")
3178 (b, e, coding_system)
3179 Lisp_Object b, e, coding_system;
3180{
3181 struct coding_system coding;
3182
3183 CHECK_NUMBER_COERCE_MARKER (b, 0);
3184 CHECK_NUMBER_COERCE_MARKER (e, 1);
3185 CHECK_SYMBOL (coding_system, 2);
3186
e0e989f6
KH
3187 if (NILP (coding_system))
3188 return make_number (XFASTINT (e) - XFASTINT (b));
4ed46869
KH
3189 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3190 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3191
3192 return code_convert_region (b, e, &coding, 0);
3193}
3194
3195DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
e0e989f6
KH
3196 3, 3, "r\nzCoding system: ",
3197 "Encode current region by specified coding system.\n\
3198When called from a program, takes three arguments:\n\
3199START, END, and CODING-SYSTEM. START END are buffer positions.\n\
4ed46869
KH
3200Return length of encoded text.")
3201 (b, e, coding_system)
3202 Lisp_Object b, e, coding_system;
3203{
3204 struct coding_system coding;
3205
3206 CHECK_NUMBER_COERCE_MARKER (b, 0);
3207 CHECK_NUMBER_COERCE_MARKER (e, 1);
3208 CHECK_SYMBOL (coding_system, 2);
3209
e0e989f6
KH
3210 if (NILP (coding_system))
3211 return make_number (XFASTINT (e) - XFASTINT (b));
4ed46869
KH
3212 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3213 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3214
3215 return code_convert_region (b, e, &coding, 1);
3216}
3217
3218DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
e0e989f6
KH
3219 2, 3, 0,
3220 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3221Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3222of decoding.")
3223 (string, coding_system, nocopy)
3224 Lisp_Object string, coding_system, nocopy;
4ed46869
KH
3225{
3226 struct coding_system coding;
3227
3228 CHECK_STRING (string, 0);
3229 CHECK_SYMBOL (coding_system, 1);
3230
e0e989f6
KH
3231 if (NILP (coding_system))
3232 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869
KH
3233 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3234 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3235
e0e989f6 3236 return code_convert_string (string, &coding, 0, nocopy);
4ed46869
KH
3237}
3238
3239DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
e0e989f6
KH
3240 2, 3, 0,
3241 "Encode STRING to CODING-SYSTEM, and return the result.\n\
3242Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3243of encoding.")
3244 (string, coding_system, nocopy)
3245 Lisp_Object string, coding_system, nocopy;
4ed46869
KH
3246{
3247 struct coding_system coding;
3248
3249 CHECK_STRING (string, 0);
3250 CHECK_SYMBOL (coding_system, 1);
3251
e0e989f6
KH
3252 if (NILP (coding_system))
3253 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869
KH
3254 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3255 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3256
e0e989f6 3257 return code_convert_string (string, &coding, 1, nocopy);
4ed46869
KH
3258}
3259
3260DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
e0e989f6 3261 "Decode a JISX0208 character of shift-jis encoding.\n\
4ed46869
KH
3262CODE is the character code in SJIS.\n\
3263Return the corresponding character.")
3264 (code)
3265 Lisp_Object code;
3266{
3267 unsigned char c1, c2, s1, s2;
3268 Lisp_Object val;
3269
3270 CHECK_NUMBER (code, 0);
3271 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3272 DECODE_SJIS (s1, s2, c1, c2);
3273 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3274 return val;
3275}
3276
3277DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3278 "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3279Return the corresponding character code in SJIS.")
3280 (ch)
3281 Lisp_Object ch;
3282{
bcf26d6a 3283 int charset, c1, c2, s1, s2;
4ed46869
KH
3284 Lisp_Object val;
3285
3286 CHECK_NUMBER (ch, 0);
3287 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3288 if (charset == charset_jisx0208)
3289 {
3290 ENCODE_SJIS (c1, c2, s1, s2);
bcf26d6a 3291 XSETFASTINT (val, (s1 << 8) | s2);
4ed46869
KH
3292 }
3293 else
3294 XSETFASTINT (val, 0);
3295 return val;
3296}
3297
3298DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3299 "Decode a Big5 character CODE of BIG5 coding-system.\n\
3300CODE is the character code in BIG5.\n\
3301Return the corresponding character.")
3302 (code)
3303 Lisp_Object code;
3304{
3305 int charset;
3306 unsigned char b1, b2, c1, c2;
3307 Lisp_Object val;
3308
3309 CHECK_NUMBER (code, 0);
3310 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3311 DECODE_BIG5 (b1, b2, charset, c1, c2);
3312 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3313 return val;
3314}
3315
3316DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3317 "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3318Return the corresponding character code in Big5.")
3319 (ch)
3320 Lisp_Object ch;
3321{
bcf26d6a 3322 int charset, c1, c2, b1, b2;
4ed46869
KH
3323 Lisp_Object val;
3324
3325 CHECK_NUMBER (ch, 0);
3326 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3327 if (charset == charset_big5_1 || charset == charset_big5_2)
3328 {
3329 ENCODE_BIG5 (charset, c1, c2, b1, b2);
bcf26d6a 3330 XSETFASTINT (val, (b1 << 8) | b2);
4ed46869
KH
3331 }
3332 else
3333 XSETFASTINT (val, 0);
3334 return val;
3335}
3336
3337DEFUN ("set-terminal-coding-system",
3338 Fset_terminal_coding_system, Sset_terminal_coding_system, 1, 1,
3339 "zCoding-system for terminal display: ",
3340 "Set coding-system of your terminal to CODING-SYSTEM.\n\
3341All outputs to terminal are encoded to this coding-system.")
3342 (coding_system)
3343 Lisp_Object coding_system;
3344{
3345 CHECK_SYMBOL (coding_system, 0);
3346 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
3347 update_mode_lines++;
3348 if (!NILP (Finteractive_p ()))
3349 Fredraw_display ();
3350 return Qnil;
3351}
3352
3353DEFUN ("terminal-coding-system",
3354 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3355 "Return coding-system of your terminal.")
3356 ()
3357{
3358 return terminal_coding.symbol;
3359}
3360
3361DEFUN ("set-keyboard-coding-system",
bdd9fb48
KH
3362 Fset_keyboard_coding_system, Sset_keyboard_coding_system, 1, 1, 0,
3363 "Set coding-system of codes sent from terminal keyboard to CODING-SYSTEM.\n\
3364In Encoded-kbd minor mode, user inputs are decoded\n\
3365accoding to CODING-SYSTEM.\n\
3366Do not call this function directly, but use the command\n\
3367encoded-kbd-set-coding-system to activate Encoded-kbd mode\n\
3368with a specific coding system.")
4ed46869
KH
3369 (coding_system)
3370 Lisp_Object coding_system;
3371{
3372 CHECK_SYMBOL (coding_system, 0);
3373 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3374 return Qnil;
3375}
3376
3377DEFUN ("keyboard-coding-system",
3378 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3379 "Return coding-system of what is sent from terminal keyboard.")
3380 ()
3381{
3382 return keyboard_coding.symbol;
3383}
3384
3385\f
3386DEFUN ("find-coding-system", Ffind_coding_system, Sfind_coding_system,
3387 1, MANY, 0,
ccdb79f5
RS
3388 "Choose a coding system for a file operation based on file name.\n\
3389The value names a pair of coding systems: (ENCODING-SYSTEM DECODING-SYSTEM).\n\
3390ENCODING-SYSTEM is the coding system to use for encoding\n\
3391\(in case OPERATION does encoding), and DECODING-SYSTEM is the coding system\n\
3392for decoding (in case OPERATION does decoding).\n\
3393\n\
3394The first argument OPERATION specifies an I/O primitive:\n\
3395 For file I/O, `insert-file-contents' or `write-region'.\n\
3396 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3397 For network I/O, `open-network-stream'.\n\
3398\n\
3399The remaining arguments should be the same arguments that were passed\n\
3400to the primitive. Depending on which primitive, one of those arguments\n\
3401is selected as the TARGET. For example, if OPERATION does file I/O,\n\
3402whichever argument specifies the file name is TARGET.\n\
3403\n\
3404TARGET has a meaning which depends on OPERATION:\n\
4ed46869
KH
3405 For file I/O, TARGET is a file name.\n\
3406 For process I/O, TARGET is a process name.\n\
3407 For network I/O, TARGET is a service name or a port number\n\
3408\n\
ccdb79f5
RS
3409This function looks up what `coding-system-alist' specifies for\n\
3410OPERATION and TARGET. It may specify a cons cell which represents\n\
3411a particular coding system or it may have a function to call.\n\
3412In the latter case, we call the function with one argument,\n\
3413which is a list of all the arguments given to `find-coding-system'.")
4ed46869
KH
3414 (nargs, args)
3415 int nargs;
3416 Lisp_Object *args;
3417{
3418 Lisp_Object operation, target_idx, target, val;
3419 register Lisp_Object chain;
3420
3421 if (nargs < 2)
3422 error ("Too few arguments");
3423 operation = args[0];
3424 if (!SYMBOLP (operation)
3425 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3426 error ("Invalid first arguement");
3427 if (nargs < 1 + XINT (target_idx))
3428 error ("Too few arguments for operation: %s",
3429 XSYMBOL (operation)->name->data);
3430 target = args[XINT (target_idx) + 1];
3431 if (!(STRINGP (target)
3432 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3433 error ("Invalid %dth argument", XINT (target_idx) + 1);
3434
3435 chain = Fassq (operation, Vcoding_system_alist);
3436 if (NILP (chain))
3437 return Qnil;
3438
3439 for (chain = XCONS (chain)->cdr; CONSP (chain); chain = XCONS (chain)->cdr)
3440 {
3441 Lisp_Object elt = XCONS (chain)->car;
3442
3443 if (CONSP (elt)
3444 && ((STRINGP (target)
3445 && STRINGP (XCONS (elt)->car)
3446 && fast_string_match (XCONS (elt)->car, target) >= 0)
3447 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
efee6861 3448 return (val = XCONS (elt)->cdr, CONSP (val)
4ed46869 3449 ? val
efee6861 3450 : ((SYMBOLP (val) && !NILP (Fboundp (val))
4ed46869
KH
3451 ? call2 (val, Flist (nargs, args))
3452 : Qnil)));
3453 }
3454 return Qnil;
3455}
3456
3457#endif /* emacs */
3458
3459\f
3460/*** 8. Post-amble ***/
3461
3462init_coding_once ()
3463{
3464 int i;
3465
0ef69138 3466 /* Emacs' internal format specific initialize routine. */
4ed46869
KH
3467 for (i = 0; i <= 0x20; i++)
3468 emacs_code_class[i] = EMACS_control_code;
3469 emacs_code_class[0x0A] = EMACS_linefeed_code;
3470 emacs_code_class[0x0D] = EMACS_carriage_return_code;
3471 for (i = 0x21 ; i < 0x7F; i++)
3472 emacs_code_class[i] = EMACS_ascii_code;
3473 emacs_code_class[0x7F] = EMACS_control_code;
3474 emacs_code_class[0x80] = EMACS_leading_code_composition;
3475 for (i = 0x81; i < 0xFF; i++)
3476 emacs_code_class[i] = EMACS_invalid_code;
3477 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3478 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3479 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3480 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3481
3482 /* ISO2022 specific initialize routine. */
3483 for (i = 0; i < 0x20; i++)
3484 iso_code_class[i] = ISO_control_code;
3485 for (i = 0x21; i < 0x7F; i++)
3486 iso_code_class[i] = ISO_graphic_plane_0;
3487 for (i = 0x80; i < 0xA0; i++)
3488 iso_code_class[i] = ISO_control_code;
3489 for (i = 0xA1; i < 0xFF; i++)
3490 iso_code_class[i] = ISO_graphic_plane_1;
3491 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3492 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3493 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3494 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3495 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3496 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3497 iso_code_class[ISO_CODE_ESC] = ISO_escape;
3498 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3499 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3500 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3501
e0e989f6
KH
3502 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3503 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3504
3505 setup_coding_system (Qnil, &keyboard_coding);
3506 setup_coding_system (Qnil, &terminal_coding);
3507}
3508
3509#ifdef emacs
3510
3511syms_of_coding ()
3512{
3513 Qtarget_idx = intern ("target-idx");
3514 staticpro (&Qtarget_idx);
3515
3516 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3517 Fput (Qwrite_region, Qtarget_idx, make_number (2));
3518
3519 Qcall_process = intern ("call-process");
3520 staticpro (&Qcall_process);
3521 Fput (Qcall_process, Qtarget_idx, make_number (0));
3522
3523 Qcall_process_region = intern ("call-process-region");
3524 staticpro (&Qcall_process_region);
3525 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3526
3527 Qstart_process = intern ("start-process");
3528 staticpro (&Qstart_process);
3529 Fput (Qstart_process, Qtarget_idx, make_number (2));
3530
3531 Qopen_network_stream = intern ("open-network-stream");
3532 staticpro (&Qopen_network_stream);
3533 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3534
4ed46869
KH
3535 Qcoding_system = intern ("coding-system");
3536 staticpro (&Qcoding_system);
3537
3538 Qeol_type = intern ("eol-type");
3539 staticpro (&Qeol_type);
3540
3541 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3542 staticpro (&Qbuffer_file_coding_system);
3543
3544 Qpost_read_conversion = intern ("post-read-conversion");
3545 staticpro (&Qpost_read_conversion);
3546
3547 Qpre_write_conversion = intern ("pre-write-conversion");
3548 staticpro (&Qpre_write_conversion);
3549
3550 Qcoding_system_vector = intern ("coding-system-vector");
3551 staticpro (&Qcoding_system_vector);
3552
3553 Qcoding_system_p = intern ("coding-system-p");
3554 staticpro (&Qcoding_system_p);
3555
3556 Qcoding_system_error = intern ("coding-system-error");
3557 staticpro (&Qcoding_system_error);
3558
3559 Fput (Qcoding_system_error, Qerror_conditions,
3560 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3561 Fput (Qcoding_system_error, Qerror_message,
3562 build_string ("Coding-system error"));
3563
3564 Qcoding_category_index = intern ("coding-category-index");
3565 staticpro (&Qcoding_category_index);
3566
3567 {
3568 int i;
3569 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3570 {
3571 coding_category_table[i] = intern (coding_category_name[i]);
3572 staticpro (&coding_category_table[i]);
3573 Fput (coding_category_table[i], Qcoding_category_index,
3574 make_number (i));
3575 }
3576 }
3577
bdd9fb48
KH
3578 Qcharacter_unification_table = intern ("character-unification-table");
3579 staticpro (&Qcharacter_unification_table);
3580 Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
3581 make_number (0));
3582
4ed46869
KH
3583 defsubr (&Scoding_system_vector);
3584 defsubr (&Scoding_system_p);
3585 defsubr (&Sread_coding_system);
3586 defsubr (&Sread_non_nil_coding_system);
3587 defsubr (&Scheck_coding_system);
3588 defsubr (&Sdetect_coding_region);
3589 defsubr (&Sdecode_coding_region);
3590 defsubr (&Sencode_coding_region);
3591 defsubr (&Sdecode_coding_string);
3592 defsubr (&Sencode_coding_string);
3593 defsubr (&Sdecode_sjis_char);
3594 defsubr (&Sencode_sjis_char);
3595 defsubr (&Sdecode_big5_char);
3596 defsubr (&Sencode_big5_char);
3597 defsubr (&Sset_terminal_coding_system);
3598 defsubr (&Sterminal_coding_system);
3599 defsubr (&Sset_keyboard_coding_system);
3600 defsubr (&Skeyboard_coding_system);
3601 defsubr (&Sfind_coding_system);
3602
3603 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3604 "List of coding-categories (symbols) ordered by priority.");
3605 {
3606 int i;
3607
3608 Vcoding_category_list = Qnil;
3609 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3610 Vcoding_category_list
3611 = Fcons (coding_category_table[i], Vcoding_category_list);
3612 }
3613
3614 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
3615 "A variable of internal use only.\n\
3616If the value is a coding system, it is used for decoding on read operation.\n\
3617If not, an appropriate element in `coding-system-alist' (which see) is used.");
3618 Vcoding_system_for_read = Qnil;
3619
3620 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
3621 "A variable of internal use only.\n\
3622If the value is a coding system, it is used for encoding on write operation.\n\
3623If not, an appropriate element in `coding-system-alist' (which see) is used.");
3624 Vcoding_system_for_write = Qnil;
3625
3626 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
3627 "Coding-system used in the latest file or process I/O.");
3628 Vlast_coding_system_used = Qnil;
3629
3630 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
3631 "Nested alist to decide a coding system for a specific I/O operation.\n\
3632The format is ((OPERATION . ((REGEXP . CODING-SYSTEMS) ...)) ...).\n\
e0e989f6 3633\n\
4ed46869
KH
3634OPERATION is one of the following Emacs I/O primitives:\n\
3635 For file I/O, insert-file-contents and write-region.\n\
3636 For process I/O, call-process, call-process-region, and start-process.\n\
3637 For network I/O, open-network-stream.\n\
3638In addition, for process I/O, `process-argument' can be specified for\n\
3639encoding arguments of the process.\n\
3640\n\
3641REGEXP is a regular expression matching a target of OPERATION, where\n\
3642target is a file name for file I/O operations, a process name for\n\
3643process I/O operations, or a service name for network I/O\n\
3644operations. REGEXP might be a port number for network I/O operation.\n\
3645\n\
3646CODING-SYSTEMS is a cons of coding systems to encode and decode\n\
3647character code on OPERATION, or a function symbol returning the cons.\n\
3648See the documentation of `find-coding-system' for more detail.");
3649 Vcoding_system_alist = Qnil;
3650
3651 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
3652 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
3653 eol_mnemonic_unix = '.';
3654
3655 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
3656 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
3657 eol_mnemonic_dos = ':';
3658
3659 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
3660 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
3661 eol_mnemonic_mac = '\'';
3662
3663 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
3664 "Mnemonic character indicating end-of-line format is not yet decided.");
3665 eol_mnemonic_undecided = '-';
3666
bdd9fb48
KH
3667 DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
3668 "Non-nil means ISO 2022 encoder/decoder do character unification.");
3669 Venable_character_unification = Qt;
3670
3671 DEFVAR_LISP ("standard-character-unification-table-for-read",
3672 &Vstandard_character_unification_table_for_read,
3673 "Table for unifying characters when reading.");
3674 Vstandard_character_unification_table_for_read = Qnil;
3675
3676 DEFVAR_LISP ("standard-character-unification-table-for-write",
3677 &Vstandard_character_unification_table_for_write,
3678 "Table for unifying characters when writing.");
3679 Vstandard_character_unification_table_for_write = Qnil;
4ed46869
KH
3680
3681 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
3682 "Alist of charsets vs revision numbers.\n\
3683While encoding, if a charset (car part of an element) is found,\n\
3684designate it with the escape sequence identifing revision (cdr part of the element).");
3685 Vcharset_revision_alist = Qnil;
3686}
3687
3688#endif /* emacs */