Don't globally add to change-major-mode-hook.
[bpt/emacs.git] / src / coding.c
CommitLineData
4ed46869
KH
1/* Coding system handler (conversion, detection, and etc).
2 Ver.1.0.
4ed46869
KH
3 Copyright (C) 1995 Free Software Foundation, Inc.
4 Copyright (C) 1995 Electrotechnical Laboratory, JAPAN.
5
369314dc
KH
6This file is part of GNU Emacs.
7
8GNU Emacs is free software; you can redistribute it and/or modify
9it under the terms of the GNU General Public License as published by
10the Free Software Foundation; either version 2, or (at your option)
11any later version.
4ed46869 12
369314dc
KH
13GNU Emacs is distributed in the hope that it will be useful,
14but WITHOUT ANY WARRANTY; without even the implied warranty of
15MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16GNU General Public License for more details.
4ed46869 17
369314dc
KH
18You should have received a copy of the GNU General Public License
19along with GNU Emacs; see the file COPYING. If not, write to
20the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21Boston, MA 02111-1307, USA. */
4ed46869
KH
22
23/*** TABLE OF CONTENTS ***
24
25 1. Preamble
0ef69138 26 2. Emacs' internal format (emacs-mule) handlers
4ed46869
KH
27 3. ISO2022 handlers
28 4. Shift-JIS and BIG5 handlers
29 5. End-of-line handlers
30 6. C library functions
31 7. Emacs Lisp library functions
32 8. Post-amble
33
34*/
35
36/*** GENERAL NOTE on CODING SYSTEM ***
37
38 Coding system is an encoding mechanism of one or more character
39 sets. Here's a list of coding systems which Emacs can handle. When
40 we say "decode", it means converting some other coding system to
0ef69138
KH
41 Emacs' internal format (emacs-internal), and when we say "encode",
42 it means converting the coding system emacs-mule to some other
43 coding system.
4ed46869 44
0ef69138 45 0. Emacs' internal format (emacs-mule)
4ed46869
KH
46
47 Emacs itself holds a multi-lingual character in a buffer and a string
48 in a special format. Details are described in the section 2.
49
50 1. ISO2022
51
52 The most famous coding system for multiple character sets. X's
53 Compound Text, various EUCs (Extended Unix Code), and such coding
54 systems used in Internet communication as ISO-2022-JP are all
55 variants of ISO2022. Details are described in the section 3.
56
57 2. SJIS (or Shift-JIS or MS-Kanji-Code)
58
59 A coding system to encode character sets: ASCII, JISX0201, and
60 JISX0208. Widely used for PC's in Japan. Details are described in
61 the section 4.
62
63 3. BIG5
64
65 A coding system to encode character sets: ASCII and Big5. Widely
66 used by Chinese (mainly in Taiwan and Hong Kong). Details are
67 described in the section 4. In this file, when written as "BIG5"
68 (all uppercase), it means the coding system, and when written as
69 "Big5" (capitalized), it means the character set.
70
71 4. Else
72
73 If a user want to read/write a text encoded in a coding system not
74 listed above, he can supply a decoder and an encoder for it in CCL
75 (Code Conversion Language) programs. Emacs executes the CCL program
76 while reading/writing.
77
78 Emacs represent a coding-system by a Lisp symbol that has a property
79 `coding-system'. But, before actually using the coding-system, the
80 information about it is set in a structure of type `struct
81 coding_system' for rapid processing. See the section 6 for more
82 detail.
83
84*/
85
86/*** GENERAL NOTES on END-OF-LINE FORMAT ***
87
88 How end-of-line of a text is encoded depends on a system. For
89 instance, Unix's format is just one byte of `line-feed' code,
90 whereas DOS's format is two bytes sequence of `carriage-return' and
91 `line-feed' codes. MacOS's format is one byte of `carriage-return'.
92
93 Since how characters in a text is encoded and how end-of-line is
94 encoded is independent, any coding system described above can take
95 any format of end-of-line. So, Emacs has information of format of
96 end-of-line in each coding-system. See the section 6 for more
97 detail.
98
99*/
100
101/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
102
103 These functions check if a text between SRC and SRC_END is encoded
104 in the coding system category XXX. Each returns an integer value in
105 which appropriate flag bits for the category XXX is set. The flag
106 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
107 template of these functions. */
108#if 0
109int
0ef69138 110detect_coding_emacs_mule (src, src_end)
4ed46869
KH
111 unsigned char *src, *src_end;
112{
113 ...
114}
115#endif
116
117/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
118
119 These functions decode SRC_BYTES length text at SOURCE encoded in
0ef69138
KH
120 CODING to Emacs' internal format (emacs-mule). The resulting text
121 goes to a place pointed by DESTINATION, the length of which should
122 not exceed DST_BYTES. The bytes actually processed is returned as
123 *CONSUMED. The return value is the length of the decoded text.
124 Below is a template of these functions. */
4ed46869
KH
125#if 0
126decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
127 struct coding_system *coding;
128 unsigned char *source, *destination;
129 int src_bytes, dst_bytes;
130 int *consumed;
131{
132 ...
133}
134#endif
135
136/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
137
0ef69138
KH
138 These functions encode SRC_BYTES length text at SOURCE of Emacs'
139 internal format (emacs-mule) to CODING. The resulting text goes to
140 a place pointed by DESTINATION, the length of which should not
141 exceed DST_BYTES. The bytes actually processed is returned as
142 *CONSUMED. The return value is the length of the encoded text.
143 Below is a template of these functions. */
4ed46869
KH
144#if 0
145encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
146 struct coding_system *coding;
147 unsigned char *source, *destination;
148 int src_bytes, dst_bytes;
149 int *consumed;
150{
151 ...
152}
153#endif
154
155/*** COMMONLY USED MACROS ***/
156
157/* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
158 THREE_MORE_BYTES safely get one, two, and three bytes from the
159 source text respectively. If there are not enough bytes in the
160 source, they jump to `label_end_of_loop'. The caller should set
161 variables `src' and `src_end' to appropriate areas in advance. */
162
163#define ONE_MORE_BYTE(c1) \
164 do { \
165 if (src < src_end) \
166 c1 = *src++; \
167 else \
168 goto label_end_of_loop; \
169 } while (0)
170
171#define TWO_MORE_BYTES(c1, c2) \
172 do { \
173 if (src + 1 < src_end) \
174 c1 = *src++, c2 = *src++; \
175 else \
176 goto label_end_of_loop; \
177 } while (0)
178
179#define THREE_MORE_BYTES(c1, c2, c3) \
180 do { \
181 if (src + 2 < src_end) \
182 c1 = *src++, c2 = *src++, c3 = *src++; \
183 else \
184 goto label_end_of_loop; \
185 } while (0)
186
187/* The following three macros DECODE_CHARACTER_ASCII,
188 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
189 the multi-byte form of a character of each class at the place
190 pointed by `dst'. The caller should set the variable `dst' to
191 point to an appropriate area and the variable `coding' to point to
192 the coding-system of the currently decoding text in advance. */
193
194/* Decode one ASCII character C. */
195
196#define DECODE_CHARACTER_ASCII(c) \
197 do { \
198 if (COMPOSING_P (coding->composing)) \
199 *dst++ = 0xA0, *dst++ = (c) | 0x80; \
200 else \
201 *dst++ = (c); \
202 } while (0)
203
204/* Decode one DIMENSION1 character of which charset is CHARSET and
205 position-code is C. */
206
207#define DECODE_CHARACTER_DIMENSION1(charset, c) \
208 do { \
209 unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset); \
210 if (COMPOSING_P (coding->composing)) \
211 *dst++ = leading_code + 0x20; \
212 else \
213 *dst++ = leading_code; \
214 if (leading_code = CHARSET_LEADING_CODE_EXT (charset)) \
215 *dst++ = leading_code; \
216 *dst++ = (c) | 0x80; \
217 } while (0)
218
219/* Decode one DIMENSION2 character of which charset is CHARSET and
220 position-codes are C1 and C2. */
221
222#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2) \
223 do { \
224 DECODE_CHARACTER_DIMENSION1 (charset, c1); \
225 *dst++ = (c2) | 0x80; \
226 } while (0)
227
228\f
229/*** 1. Preamble ***/
230
231#include <stdio.h>
232
233#ifdef emacs
234
235#include <config.h>
236#include "lisp.h"
237#include "buffer.h"
238#include "charset.h"
239#include "ccl.h"
240#include "coding.h"
241#include "window.h"
242
243#else /* not emacs */
244
245#include "mulelib.h"
246
247#endif /* not emacs */
248
249Lisp_Object Qcoding_system, Qeol_type;
250Lisp_Object Qbuffer_file_coding_system;
251Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
252
253extern Lisp_Object Qinsert_file_contents, Qwrite_region;
254Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
255Lisp_Object Qstart_process, Qopen_network_stream;
256Lisp_Object Qtarget_idx;
257
258/* Mnemonic character of each format of end-of-line. */
259int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
260/* Mnemonic character to indicate format of end-of-line is not yet
261 decided. */
262int eol_mnemonic_undecided;
263
264#ifdef emacs
265
02ba4723 266Lisp_Object Qcoding_system_spec, Qcoding_system_p, Qcoding_system_error;
4ed46869
KH
267
268/* Coding-systems are handed between Emacs Lisp programs and C internal
269 routines by the following three variables. */
270/* Coding-system for reading files and receiving data from process. */
271Lisp_Object Vcoding_system_for_read;
272/* Coding-system for writing files and sending data to process. */
273Lisp_Object Vcoding_system_for_write;
274/* Coding-system actually used in the latest I/O. */
275Lisp_Object Vlast_coding_system_used;
276
277/* Coding-system of what terminal accept for displaying. */
278struct coding_system terminal_coding;
279
280/* Coding-system of what is sent from terminal keyboard. */
281struct coding_system keyboard_coding;
282
02ba4723
KH
283Lisp_Object Vfile_coding_system_alist;
284Lisp_Object Vprocess_coding_system_alist;
285Lisp_Object Vnetwork_coding_system_alist;
4ed46869
KH
286
287#endif /* emacs */
288
289Lisp_Object Qcoding_category_index;
290
291/* List of symbols `coding-category-xxx' ordered by priority. */
292Lisp_Object Vcoding_category_list;
293
294/* Table of coding-systems currently assigned to each coding-category. */
295Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
296
297/* Table of names of symbol for each coding-category. */
298char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
0ef69138 299 "coding-category-emacs-mule",
4ed46869
KH
300 "coding-category-sjis",
301 "coding-category-iso-7",
302 "coding-category-iso-8-1",
303 "coding-category-iso-8-2",
304 "coding-category-iso-else",
305 "coding-category-big5",
306 "coding-category-binary"
307};
308
bdd9fb48
KH
309/* Flag to tell if we look up unification table on character code
310 conversion. */
311Lisp_Object Venable_character_unification;
312/* Standard unification table to look up on reading (decoding). */
313Lisp_Object Vstandard_character_unification_table_for_read;
314/* Standard unification table to look up on writing (encoding). */
315Lisp_Object Vstandard_character_unification_table_for_write;
316
317Lisp_Object Qcharacter_unification_table;
4ed46869
KH
318
319/* Alist of charsets vs revision number. */
320Lisp_Object Vcharset_revision_alist;
321
02ba4723
KH
322/* Default coding systems used for process I/O. */
323Lisp_Object Vdefault_process_coding_system;
324
4ed46869 325\f
0ef69138 326/*** 2. Emacs internal format (emacs-mule) handlers ***/
4ed46869
KH
327
328/* Emacs' internal format for encoding multiple character sets is a
329 kind of multi-byte encoding, i.e. encoding a character by a sequence
330 of one-byte codes of variable length. ASCII characters and control
331 characters (e.g. `tab', `newline') are represented by one-byte as
332 is. It takes the range 0x00 through 0x7F. The other characters
333 are represented by a sequence of `base leading-code', optional
334 `extended leading-code', and one or two `position-code's. Length
335 of the sequence is decided by the base leading-code. Leading-code
336 takes the range 0x80 through 0x9F, whereas extended leading-code
337 and position-code take the range 0xA0 through 0xFF. See the
338 document of `charset.h' for more detail about leading-code and
339 position-code.
340
341 There's one exception in this rule. Special leading-code
342 `leading-code-composition' denotes that the following several
343 characters should be composed into one character. Leading-codes of
344 components (except for ASCII) are added 0x20. An ASCII character
345 component is represented by a 2-byte sequence of `0xA0' and
346 `ASCII-code + 0x80'. See also the document in `charset.h' for the
347 detail of composite character. Hence, we can summarize the code
348 range as follows:
349
350 --- CODE RANGE of Emacs' internal format ---
351 (character set) (range)
352 ASCII 0x00 .. 0x7F
353 ELSE (1st byte) 0x80 .. 0x9F
354 (rest bytes) 0xA0 .. 0xFF
355 ---------------------------------------------
356
357 */
358
359enum emacs_code_class_type emacs_code_class[256];
360
361/* Go to the next statement only if *SRC is accessible and the code is
362 greater than 0xA0. */
363#define CHECK_CODE_RANGE_A0_FF \
364 do { \
365 if (src >= src_end) \
366 goto label_end_of_switch; \
367 else if (*src++ < 0xA0) \
368 return 0; \
369 } while (0)
370
371/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
372 Check if a text is encoded in Emacs' internal format. If it is,
0ef69138 373 return CODING_CATEGORY_MASK_EMASC_MULE, else return 0. */
4ed46869
KH
374
375int
0ef69138 376detect_coding_emacs_mule (src, src_end)
4ed46869
KH
377 unsigned char *src, *src_end;
378{
379 unsigned char c;
380 int composing = 0;
381
382 while (src < src_end)
383 {
384 c = *src++;
385
386 if (composing)
387 {
388 if (c < 0xA0)
389 composing = 0;
390 else
391 c -= 0x20;
392 }
393
394 switch (emacs_code_class[c])
395 {
396 case EMACS_ascii_code:
397 case EMACS_linefeed_code:
398 break;
399
400 case EMACS_control_code:
401 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
402 return 0;
403 break;
404
405 case EMACS_invalid_code:
406 return 0;
407
408 case EMACS_leading_code_composition: /* c == 0x80 */
409 if (composing)
410 CHECK_CODE_RANGE_A0_FF;
411 else
412 composing = 1;
413 break;
414
415 case EMACS_leading_code_4:
416 CHECK_CODE_RANGE_A0_FF;
417 /* fall down to check it two more times ... */
418
419 case EMACS_leading_code_3:
420 CHECK_CODE_RANGE_A0_FF;
421 /* fall down to check it one more time ... */
422
423 case EMACS_leading_code_2:
424 CHECK_CODE_RANGE_A0_FF;
425 break;
426
427 default:
428 label_end_of_switch:
429 break;
430 }
431 }
0ef69138 432 return CODING_CATEGORY_MASK_EMACS_MULE;
4ed46869
KH
433}
434
435\f
436/*** 3. ISO2022 handlers ***/
437
438/* The following note describes the coding system ISO2022 briefly.
439 Since the intension of this note is to help understanding of the
440 programs in this file, some parts are NOT ACCURATE or OVERLY
441 SIMPLIFIED. For the thorough understanding, please refer to the
442 original document of ISO2022.
443
444 ISO2022 provides many mechanisms to encode several character sets
445 in 7-bit and 8-bit environment. If one choose 7-bite environment,
446 all text is encoded by codes of less than 128. This may make the
447 encoded text a little bit longer, but the text get more stability
448 to pass through several gateways (some of them split MSB off).
449
450 There are two kind of character set: control character set and
451 graphic character set. The former contains control characters such
452 as `newline' and `escape' to provide control functions (control
453 functions are provided also by escape sequence). The latter
454 contains graphic characters such as ' A' and '-'. Emacs recognizes
455 two control character sets and many graphic character sets.
456
457 Graphic character sets are classified into one of the following
458 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
459 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
460 bytes (DIMENSION) and the number of characters in one dimension
461 (CHARS) of the set. In addition, each character set is assigned an
462 identification tag (called "final character" and denoted as <F>
463 here after) which is unique in each class. <F> of each character
464 set is decided by ECMA(*) when it is registered in ISO. Code range
465 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
466
467 Note (*): ECMA = European Computer Manufacturers Association
468
469 Here are examples of graphic character set [NAME(<F>)]:
470 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
471 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
472 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
473 o DIMENSION2_CHARS96 -- none for the moment
474
475 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
476 C0 [0x00..0x1F] -- control character plane 0
477 GL [0x20..0x7F] -- graphic character plane 0
478 C1 [0x80..0x9F] -- control character plane 1
479 GR [0xA0..0xFF] -- graphic character plane 1
480
481 A control character set is directly designated and invoked to C0 or
482 C1 by an escape sequence. The most common case is that ISO646's
483 control character set is designated/invoked to C0 and ISO6429's
484 control character set is designated/invoked to C1, and usually
485 these designations/invocations are omitted in a coded text. With
486 7-bit environment, only C0 can be used, and a control character for
487 C1 is encoded by an appropriate escape sequence to fit in the
488 environment. All control characters for C1 are defined the
489 corresponding escape sequences.
490
491 A graphic character set is at first designated to one of four
492 graphic registers (G0 through G3), then these graphic registers are
493 invoked to GL or GR. These designations and invocations can be
494 done independently. The most common case is that G0 is invoked to
495 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
496 these invocations and designations are omitted in a coded text.
497 With 7-bit environment, only GL can be used.
498
499 When a graphic character set of CHARS94 is invoked to GL, code 0x20
500 and 0x7F of GL area work as control characters SPACE and DEL
501 respectively, and code 0xA0 and 0xFF of GR area should not be used.
502
503 There are two ways of invocation: locking-shift and single-shift.
504 With locking-shift, the invocation lasts until the next different
505 invocation, whereas with single-shift, the invocation works only
506 for the following character and doesn't affect locking-shift.
507 Invocations are done by the following control characters or escape
508 sequences.
509
510 ----------------------------------------------------------------------
511 function control char escape sequence description
512 ----------------------------------------------------------------------
513 SI (shift-in) 0x0F none invoke G0 to GL
514 SI (shift-out) 0x0E none invoke G1 to GL
515 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
516 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
517 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
518 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
519 ----------------------------------------------------------------------
520 The first four are for locking-shift. Control characters for these
521 functions are defined by macros ISO_CODE_XXX in `coding.h'.
522
523 Designations are done by the following escape sequences.
524 ----------------------------------------------------------------------
525 escape sequence description
526 ----------------------------------------------------------------------
527 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
528 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
529 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
530 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
531 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
532 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
533 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
534 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
535 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
536 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
537 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
538 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
539 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
540 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
541 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
542 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
543 ----------------------------------------------------------------------
544
545 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
546 of dimension 1, chars 94, and final character <F>, and etc.
547
548 Note (*): Although these designations are not allowed in ISO2022,
549 Emacs accepts them on decoding, and produces them on encoding
550 CHARS96 character set in a coding system which is characterized as
551 7-bit environment, non-locking-shift, and non-single-shift.
552
553 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
554 '(' can be omitted. We call this as "short-form" here after.
555
556 Now you may notice that there are a lot of ways for encoding the
557 same multilingual text in ISO2022. Actually, there exist many
558 coding systems such as Compound Text (used in X's inter client
559 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
560 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
561 localized platforms), and all of these are variants of ISO2022.
562
563 In addition to the above, Emacs handles two more kinds of escape
564 sequences: ISO6429's direction specification and Emacs' private
565 sequence for specifying character composition.
566
567 ISO6429's direction specification takes the following format:
568 o CSI ']' -- end of the current direction
569 o CSI '0' ']' -- end of the current direction
570 o CSI '1' ']' -- start of left-to-right text
571 o CSI '2' ']' -- start of right-to-left text
572 The control character CSI (0x9B: control sequence introducer) is
573 abbreviated to the escape sequence ESC '[' in 7-bit environment.
574
575 Character composition specification takes the following format:
576 o ESC '0' -- start character composition
577 o ESC '1' -- end character composition
578 Since these are not standard escape sequences of any ISO, the use
579 of them for these meaning is restricted to Emacs only. */
580
581enum iso_code_class_type iso_code_class[256];
582
583/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
584 Check if a text is encoded in ISO2022. If it is, returns an
585 integer in which appropriate flag bits any of:
586 CODING_CATEGORY_MASK_ISO_7
587 CODING_CATEGORY_MASK_ISO_8_1
588 CODING_CATEGORY_MASK_ISO_8_2
589 CODING_CATEGORY_MASK_ISO_ELSE
590 are set. If a code which should never appear in ISO2022 is found,
591 returns 0. */
592
593int
594detect_coding_iso2022 (src, src_end)
595 unsigned char *src, *src_end;
596{
bcf26d6a
KH
597 int mask = CODING_CATEGORY_MASK_ANY;
598 int g1 = 0; /* 1 iff designating to G1. */
599 int c, i;
4ed46869 600
e0e989f6 601 while (src < src_end)
4ed46869
KH
602 {
603 c = *src++;
604 switch (c)
605 {
606 case ISO_CODE_ESC:
e0e989f6 607 if (src >= src_end)
4ed46869
KH
608 break;
609 c = *src++;
bcf26d6a 610 if (src < src_end
e0e989f6
KH
611 && ((c >= '(' && c <= '/')
612 || c == '$' && ((*src >= '(' && *src <= '/')
613 || (*src >= '@' && *src <= 'B'))))
4ed46869 614 {
e0e989f6 615 /* Valid designation sequence. */
bcf26d6a
KH
616 mask &= (CODING_CATEGORY_MASK_ISO_7
617 | CODING_CATEGORY_MASK_ISO_8_1
618 | CODING_CATEGORY_MASK_ISO_8_2
619 | CODING_CATEGORY_MASK_ISO_ELSE);
e0e989f6 620 if (c == ')' || (c == '$' && *src == ')'))
bcf26d6a
KH
621 {
622 g1 = 1;
623 mask &= ~CODING_CATEGORY_MASK_ISO_7;
624 }
e0e989f6
KH
625 src++;
626 break;
4ed46869 627 }
4ed46869
KH
628 else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
629 return CODING_CATEGORY_MASK_ISO_ELSE;
630 break;
631
4ed46869 632 case ISO_CODE_SO:
e0e989f6
KH
633 if (g1)
634 return CODING_CATEGORY_MASK_ISO_ELSE;
635 break;
636
4ed46869
KH
637 case ISO_CODE_CSI:
638 case ISO_CODE_SS2:
639 case ISO_CODE_SS3:
640 mask &= ~CODING_CATEGORY_MASK_ISO_7;
641 break;
642
643 default:
644 if (c < 0x80)
645 break;
646 else if (c < 0xA0)
647 return 0;
648 else
649 {
650 int count = 1;
651
652 mask &= ~CODING_CATEGORY_MASK_ISO_7;
e0e989f6 653 while (src < src_end && *src >= 0xA0)
4ed46869 654 count++, src++;
e0e989f6 655 if (count & 1 && src < src_end)
4ed46869
KH
656 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
657 }
658 break;
659 }
660 }
661
662 return mask;
663}
664
665/* Decode a character of which charset is CHARSET and the 1st position
bdd9fb48 666 code is C1. If dimension of CHARSET is 2, the 2nd position code is
4ed46869
KH
667 fetched from SRC and set to C2. If CHARSET is negative, it means
668 that we are decoding ill formed text, and what we can do is just to
669 read C1 as is. */
670
bdd9fb48
KH
671#define DECODE_ISO_CHARACTER(charset, c1) \
672 do { \
673 int c_alt, charset_alt = (charset); \
674 if (COMPOSING_HEAD_P (coding->composing)) \
675 { \
676 *dst++ = LEADING_CODE_COMPOSITION; \
677 if (COMPOSING_WITH_RULE_P (coding->composing)) \
678 /* To tell composition rules are embeded. */ \
679 *dst++ = 0xFF; \
680 coding->composing += 2; \
681 } \
682 if ((charset) >= 0) \
683 { \
684 if (CHARSET_DIMENSION (charset) == 2) \
685 ONE_MORE_BYTE (c2); \
686 if (!NILP (unification_table) \
687 && ((c_alt = unify_char (unification_table, \
688 -1, (charset), c1, c2)) >= 0)) \
689 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
690 } \
691 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
692 DECODE_CHARACTER_ASCII (c1); \
693 else if (CHARSET_DIMENSION (charset_alt) == 1) \
694 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
695 else \
696 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
697 if (COMPOSING_WITH_RULE_P (coding->composing)) \
698 /* To tell a composition rule follows. */ \
699 coding->composing = COMPOSING_WITH_RULE_RULE; \
4ed46869
KH
700 } while (0)
701
702/* Set designation state into CODING. */
703#define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
704 do { \
705 int charset = ISO_CHARSET_TABLE (dimension, chars, final_char); \
4ed46869
KH
706 if (charset >= 0) \
707 { \
708 if (coding->direction == 1 \
709 && CHARSET_REVERSE_CHARSET (charset) >= 0) \
710 charset = CHARSET_REVERSE_CHARSET (charset); \
711 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
712 } \
713 } while (0)
714
715/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
716
717int
718decode_coding_iso2022 (coding, source, destination,
719 src_bytes, dst_bytes, consumed)
720 struct coding_system *coding;
721 unsigned char *source, *destination;
722 int src_bytes, dst_bytes;
723 int *consumed;
724{
725 unsigned char *src = source;
726 unsigned char *src_end = source + src_bytes;
727 unsigned char *dst = destination;
728 unsigned char *dst_end = destination + dst_bytes;
729 /* Since the maximum bytes produced by each loop is 7, we subtract 6
730 from DST_END to assure that overflow checking is necessary only
731 at the head of loop. */
732 unsigned char *adjusted_dst_end = dst_end - 6;
733 int charset;
734 /* Charsets invoked to graphic plane 0 and 1 respectively. */
735 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
736 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
bdd9fb48
KH
737 Lisp_Object unification_table = coding->character_unification_table;
738
739 if (!NILP (Venable_character_unification) && NILP (unification_table))
740 unification_table = Vstandard_character_unification_table_for_read;
4ed46869
KH
741
742 while (src < src_end && dst < adjusted_dst_end)
743 {
744 /* SRC_BASE remembers the start position in source in each loop.
745 The loop will be exited when there's not enough source text
746 to analyze long escape sequence or 2-byte code (within macros
747 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
748 to SRC_BASE before exiting. */
749 unsigned char *src_base = src;
bdd9fb48 750 int c1 = *src++, c2;
4ed46869
KH
751
752 switch (iso_code_class [c1])
753 {
754 case ISO_0x20_or_0x7F:
755 if (!coding->composing
756 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
757 {
758 /* This is SPACE or DEL. */
759 *dst++ = c1;
760 break;
761 }
762 /* This is a graphic character, we fall down ... */
763
764 case ISO_graphic_plane_0:
765 if (coding->composing == COMPOSING_WITH_RULE_RULE)
766 {
767 /* This is a composition rule. */
768 *dst++ = c1 | 0x80;
769 coding->composing = COMPOSING_WITH_RULE_TAIL;
770 }
771 else
772 DECODE_ISO_CHARACTER (charset0, c1);
773 break;
774
775 case ISO_0xA0_or_0xFF:
776 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
777 {
778 /* Invalid code. */
779 *dst++ = c1;
780 break;
781 }
782 /* This is a graphic character, we fall down ... */
783
784 case ISO_graphic_plane_1:
785 DECODE_ISO_CHARACTER (charset1, c1);
786 break;
787
788 case ISO_control_code:
789 /* All ISO2022 control characters in this class have the
790 same representation in Emacs internal format. */
791 *dst++ = c1;
792 break;
793
794 case ISO_carriage_return:
795 if (coding->eol_type == CODING_EOL_CR)
796 {
797 *dst++ = '\n';
798 }
799 else if (coding->eol_type == CODING_EOL_CRLF)
800 {
801 ONE_MORE_BYTE (c1);
802 if (c1 == ISO_CODE_LF)
803 *dst++ = '\n';
804 else
805 {
806 src--;
807 *dst++ = c1;
808 }
809 }
810 else
811 {
812 *dst++ = c1;
813 }
814 break;
815
816 case ISO_shift_out:
e0e989f6
KH
817 if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
818 goto label_invalid_escape_sequence;
4ed46869
KH
819 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
820 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
821 break;
822
823 case ISO_shift_in:
824 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
825 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
826 break;
827
828 case ISO_single_shift_2_7:
829 case ISO_single_shift_2:
830 /* SS2 is handled as an escape sequence of ESC 'N' */
831 c1 = 'N';
832 goto label_escape_sequence;
833
834 case ISO_single_shift_3:
835 /* SS2 is handled as an escape sequence of ESC 'O' */
836 c1 = 'O';
837 goto label_escape_sequence;
838
839 case ISO_control_sequence_introducer:
840 /* CSI is handled as an escape sequence of ESC '[' ... */
841 c1 = '[';
842 goto label_escape_sequence;
843
844 case ISO_escape:
845 ONE_MORE_BYTE (c1);
846 label_escape_sequence:
847 /* Escape sequences handled by Emacs are invocation,
848 designation, direction specification, and character
849 composition specification. */
850 switch (c1)
851 {
852 case '&': /* revision of following character set */
853 ONE_MORE_BYTE (c1);
854 if (!(c1 >= '@' && c1 <= '~'))
e0e989f6 855 goto label_invalid_escape_sequence;
4ed46869
KH
856 ONE_MORE_BYTE (c1);
857 if (c1 != ISO_CODE_ESC)
e0e989f6 858 goto label_invalid_escape_sequence;
4ed46869
KH
859 ONE_MORE_BYTE (c1);
860 goto label_escape_sequence;
861
862 case '$': /* designation of 2-byte character set */
863 ONE_MORE_BYTE (c1);
864 if (c1 >= '@' && c1 <= 'B')
865 { /* designation of JISX0208.1978, GB2312.1980,
866 or JISX0208.1980 */
867 DECODE_DESIGNATION (0, 2, 94, c1);
868 }
869 else if (c1 >= 0x28 && c1 <= 0x2B)
870 { /* designation of DIMENSION2_CHARS94 character set */
871 ONE_MORE_BYTE (c2);
872 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
873 }
874 else if (c1 >= 0x2C && c1 <= 0x2F)
875 { /* designation of DIMENSION2_CHARS96 character set */
876 ONE_MORE_BYTE (c2);
877 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
878 }
879 else
e0e989f6 880 goto label_invalid_escape_sequence;
4ed46869
KH
881 break;
882
883 case 'n': /* invocation of locking-shift-2 */
e0e989f6
KH
884 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
885 goto label_invalid_escape_sequence;
4ed46869 886 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
e0e989f6 887 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
888 break;
889
890 case 'o': /* invocation of locking-shift-3 */
e0e989f6
KH
891 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
892 goto label_invalid_escape_sequence;
4ed46869 893 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
e0e989f6 894 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
895 break;
896
897 case 'N': /* invocation of single-shift-2 */
e0e989f6
KH
898 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
899 goto label_invalid_escape_sequence;
4ed46869
KH
900 ONE_MORE_BYTE (c1);
901 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
902 DECODE_ISO_CHARACTER (charset, c1);
903 break;
904
905 case 'O': /* invocation of single-shift-3 */
e0e989f6
KH
906 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
907 goto label_invalid_escape_sequence;
4ed46869
KH
908 ONE_MORE_BYTE (c1);
909 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
910 DECODE_ISO_CHARACTER (charset, c1);
911 break;
912
913 case '0': /* start composing without embeded rules */
914 coding->composing = COMPOSING_NO_RULE_HEAD;
915 break;
916
917 case '1': /* end composing */
918 coding->composing = COMPOSING_NO;
919 break;
920
921 case '2': /* start composing with embeded rules */
922 coding->composing = COMPOSING_WITH_RULE_HEAD;
923 break;
924
925 case '[': /* specification of direction */
926 /* For the moment, nested direction is not supported.
927 So, the value of `coding->direction' is 0 or 1: 0
928 means left-to-right, 1 means right-to-left. */
929 ONE_MORE_BYTE (c1);
930 switch (c1)
931 {
932 case ']': /* end of the current direction */
933 coding->direction = 0;
934
935 case '0': /* end of the current direction */
936 case '1': /* start of left-to-right direction */
937 ONE_MORE_BYTE (c1);
938 if (c1 == ']')
939 coding->direction = 0;
940 else
941 goto label_invalid_escape_sequence;
942 break;
943
944 case '2': /* start of right-to-left direction */
945 ONE_MORE_BYTE (c1);
946 if (c1 == ']')
947 coding->direction= 1;
948 else
949 goto label_invalid_escape_sequence;
950 break;
951
952 default:
953 goto label_invalid_escape_sequence;
954 }
955 break;
956
957 default:
958 if (c1 >= 0x28 && c1 <= 0x2B)
959 { /* designation of DIMENSION1_CHARS94 character set */
960 ONE_MORE_BYTE (c2);
961 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
962 }
963 else if (c1 >= 0x2C && c1 <= 0x2F)
964 { /* designation of DIMENSION1_CHARS96 character set */
965 ONE_MORE_BYTE (c2);
966 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
967 }
968 else
969 {
970 goto label_invalid_escape_sequence;
971 }
972 }
973 /* We must update these variables now. */
974 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
975 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
976 break;
977
978 label_invalid_escape_sequence:
979 {
980 int length = src - src_base;
981
982 bcopy (src_base, dst, length);
983 dst += length;
984 }
985 }
986 continue;
987
988 label_end_of_loop:
989 coding->carryover_size = src - src_base;
990 bcopy (src_base, coding->carryover, coding->carryover_size);
991 src = src_base;
992 break;
993 }
994
995 /* If this is the last block of the text to be decoded, we had
996 better just flush out all remaining codes in the text although
997 they are not valid characters. */
998 if (coding->last_block)
999 {
1000 bcopy (src, dst, src_end - src);
1001 dst += (src_end - src);
1002 src = src_end;
1003 }
1004 *consumed = src - source;
1005 return dst - destination;
1006}
1007
1008/* ISO2022 encoding staffs. */
1009
1010/*
1011 It is not enough to say just "ISO2022" on encoding, but we have to
1012 specify more details. In Emacs, each coding-system of ISO2022
1013 variant has the following specifications:
1014 1. Initial designation to G0 thru G3.
1015 2. Allows short-form designation?
1016 3. ASCII should be designated to G0 before control characters?
1017 4. ASCII should be designated to G0 at end of line?
1018 5. 7-bit environment or 8-bit environment?
1019 6. Use locking-shift?
1020 7. Use Single-shift?
1021 And the following two are only for Japanese:
1022 8. Use ASCII in place of JIS0201-1976-Roman?
1023 9. Use JISX0208-1983 in place of JISX0208-1978?
1024 These specifications are encoded in `coding->flags' as flag bits
1025 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1026 detail.
1027*/
1028
1029/* Produce codes (escape sequence) for designating CHARSET to graphic
1030 register REG. If <final-char> of CHARSET is '@', 'A', or 'B' and
1031 the coding system CODING allows, produce designation sequence of
1032 short-form. */
1033
1034#define ENCODE_DESIGNATION(charset, reg, coding) \
1035 do { \
1036 unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \
1037 char *intermediate_char_94 = "()*+"; \
1038 char *intermediate_char_96 = ",-./"; \
1039 Lisp_Object temp \
1040 = Fassq (make_number (charset), Vcharset_revision_alist); \
1041 if (! NILP (temp)) \
1042 { \
1043 *dst++ = ISO_CODE_ESC; \
1044 *dst++ = '&'; \
1045 *dst++ = XINT (XCONS (temp)->cdr) + '@'; \
1046 } \
1047 *dst++ = ISO_CODE_ESC; \
1048 if (CHARSET_DIMENSION (charset) == 1) \
1049 { \
1050 if (CHARSET_CHARS (charset) == 94) \
1051 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1052 else \
1053 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1054 } \
1055 else \
1056 { \
1057 *dst++ = '$'; \
1058 if (CHARSET_CHARS (charset) == 94) \
1059 { \
1060 if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
1061 || reg != 0 \
1062 || final_char < '@' || final_char > 'B') \
1063 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1064 } \
1065 else \
1066 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1067 } \
1068 *dst++ = final_char; \
1069 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
1070 } while (0)
1071
1072/* The following two macros produce codes (control character or escape
1073 sequence) for ISO2022 single-shift functions (single-shift-2 and
1074 single-shift-3). */
1075
1076#define ENCODE_SINGLE_SHIFT_2 \
1077 do { \
1078 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1079 *dst++ = ISO_CODE_ESC, *dst++ = 'N'; \
1080 else \
1081 *dst++ = ISO_CODE_SS2; \
1082 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1083 } while (0)
1084
1085#define ENCODE_SINGLE_SHIFT_3 \
1086 do { \
1087 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1088 *dst++ = ISO_CODE_ESC, *dst++ = 'O'; \
1089 else \
1090 *dst++ = ISO_CODE_SS3; \
1091 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1092 } while (0)
1093
1094/* The following four macros produce codes (control character or
1095 escape sequence) for ISO2022 locking-shift functions (shift-in,
1096 shift-out, locking-shift-2, and locking-shift-3). */
1097
1098#define ENCODE_SHIFT_IN \
1099 do { \
1100 *dst++ = ISO_CODE_SI; \
1101 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1102 } while (0)
1103
1104#define ENCODE_SHIFT_OUT \
1105 do { \
1106 *dst++ = ISO_CODE_SO; \
1107 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1108 } while (0)
1109
1110#define ENCODE_LOCKING_SHIFT_2 \
1111 do { \
1112 *dst++ = ISO_CODE_ESC, *dst++ = 'n'; \
1113 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1114 } while (0)
1115
1116#define ENCODE_LOCKING_SHIFT_3 \
1117 do { \
1118 *dst++ = ISO_CODE_ESC, *dst++ = 'o'; \
1119 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1120 } while (0)
1121
1122/* Produce codes for a DIMENSION1 character of which character set is
1123 CHARSET and position-code is C1. Designation and invocation
1124 sequences are also produced in advance if necessary. */
1125
1126
1127#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
1128 do { \
1129 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1130 { \
1131 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1132 *dst++ = c1 & 0x7F; \
1133 else \
1134 *dst++ = c1 | 0x80; \
1135 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1136 break; \
1137 } \
1138 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1139 { \
1140 *dst++ = c1 & 0x7F; \
1141 break; \
1142 } \
1143 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1144 { \
1145 *dst++ = c1 | 0x80; \
1146 break; \
1147 } \
1148 else \
1149 /* Since CHARSET is not yet invoked to any graphic planes, we \
1150 must invoke it, or, at first, designate it to some graphic \
1151 register. Then repeat the loop to actually produce the \
1152 character. */ \
1153 dst = encode_invocation_designation (charset, coding, dst); \
1154 } while (1)
1155
1156/* Produce codes for a DIMENSION2 character of which character set is
1157 CHARSET and position-codes are C1 and C2. Designation and
1158 invocation codes are also produced in advance if necessary. */
1159
1160#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
1161 do { \
1162 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1163 { \
1164 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1165 *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F; \
1166 else \
1167 *dst++ = c1 | 0x80, *dst++ = c2 | 0x80; \
1168 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1169 break; \
1170 } \
1171 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1172 { \
1173 *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F; \
1174 break; \
1175 } \
1176 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1177 { \
1178 *dst++ = c1 | 0x80, *dst++= c2 | 0x80; \
1179 break; \
1180 } \
1181 else \
1182 /* Since CHARSET is not yet invoked to any graphic planes, we \
1183 must invoke it, or, at first, designate it to some graphic \
1184 register. Then repeat the loop to actually produce the \
1185 character. */ \
1186 dst = encode_invocation_designation (charset, coding, dst); \
1187 } while (1)
1188
bdd9fb48
KH
1189#define ENCODE_ISO_CHARACTER(charset, c1, c2) \
1190 do { \
1191 int c_alt, charset_alt; \
1192 if (!NILP (unification_table) \
1193 && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1194 < 0)) \
1195 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1196 else \
1197 charset_alt = charset; \
1198 if (CHARSET_DIMENSION (charset_alt) == 1) \
1199 ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1); \
1200 else \
1201 ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
1202 } while (0)
1203
4ed46869
KH
1204/* Produce designation and invocation codes at a place pointed by DST
1205 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1206 Return new DST. */
1207
1208unsigned char *
1209encode_invocation_designation (charset, coding, dst)
1210 int charset;
1211 struct coding_system *coding;
1212 unsigned char *dst;
1213{
1214 int reg; /* graphic register number */
1215
1216 /* At first, check designations. */
1217 for (reg = 0; reg < 4; reg++)
1218 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1219 break;
1220
1221 if (reg >= 4)
1222 {
1223 /* CHARSET is not yet designated to any graphic registers. */
1224 /* At first check the requested designation. */
1225 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1ba9e4ab
KH
1226 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1227 /* Since CHARSET requests no special designation, designate it
1228 to graphic register 0. */
4ed46869
KH
1229 reg = 0;
1230
1231 ENCODE_DESIGNATION (charset, reg, coding);
1232 }
1233
1234 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1235 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1236 {
1237 /* Since the graphic register REG is not invoked to any graphic
1238 planes, invoke it to graphic plane 0. */
1239 switch (reg)
1240 {
1241 case 0: /* graphic register 0 */
1242 ENCODE_SHIFT_IN;
1243 break;
1244
1245 case 1: /* graphic register 1 */
1246 ENCODE_SHIFT_OUT;
1247 break;
1248
1249 case 2: /* graphic register 2 */
1250 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1251 ENCODE_SINGLE_SHIFT_2;
1252 else
1253 ENCODE_LOCKING_SHIFT_2;
1254 break;
1255
1256 case 3: /* graphic register 3 */
1257 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1258 ENCODE_SINGLE_SHIFT_3;
1259 else
1260 ENCODE_LOCKING_SHIFT_3;
1261 break;
1262 }
1263 }
1264 return dst;
1265}
1266
1267/* The following two macros produce codes for indicating composition. */
1268#define ENCODE_COMPOSITION_NO_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '0'
1269#define ENCODE_COMPOSITION_WITH_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '2'
1270#define ENCODE_COMPOSITION_END *dst++ = ISO_CODE_ESC, *dst++ = '1'
1271
1272/* The following three macros produce codes for indicating direction
1273 of text. */
1274#define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
1275 do { \
1276 if (coding->flags == CODING_FLAG_ISO_SEVEN_BITS) \
1277 *dst++ = ISO_CODE_ESC, *dst++ = '['; \
1278 else \
1279 *dst++ = ISO_CODE_CSI; \
1280 } while (0)
1281
1282#define ENCODE_DIRECTION_R2L \
1283 ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '2', *dst++ = ']'
1284
1285#define ENCODE_DIRECTION_L2R \
1286 ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '0', *dst++ = ']'
1287
1288/* Produce codes for designation and invocation to reset the graphic
1289 planes and registers to initial state. */
e0e989f6
KH
1290#define ENCODE_RESET_PLANE_AND_REGISTER \
1291 do { \
1292 int reg; \
1293 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
1294 ENCODE_SHIFT_IN; \
1295 for (reg = 0; reg < 4; reg++) \
1296 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0 \
1297 && (CODING_SPEC_ISO_DESIGNATION (coding, reg) \
1298 != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))) \
1299 ENCODE_DESIGNATION \
1300 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
4ed46869
KH
1301 } while (0)
1302
bdd9fb48
KH
1303/* Produce designation sequences of charsets in the line started from
1304 *SRC to a place pointed by DSTP.
1305
1306 If the current block ends before any end-of-line, we may fail to
1307 find all the necessary *designations. */
1308encode_designation_at_bol (coding, table, src, src_end, dstp)
e0e989f6 1309 struct coding_system *coding;
bdd9fb48 1310 Lisp_Object table;
e0e989f6
KH
1311 unsigned char *src, *src_end, **dstp;
1312{
bdd9fb48
KH
1313 int charset, c, found = 0, reg;
1314 /* Table of charsets to be designated to each graphic register. */
1315 int r[4];
1316 unsigned char *dst = *dstp;
1317
1318 for (reg = 0; reg < 4; reg++)
1319 r[reg] = -1;
1320
1321 while (src < src_end && *src != '\n' && found < 4)
e0e989f6 1322 {
bdd9fb48
KH
1323 int bytes = BYTES_BY_CHAR_HEAD (*src);
1324
1325 if (NILP (table))
1326 charset = CHARSET_AT (src);
1327 else
e0e989f6 1328 {
bdd9fb48
KH
1329 int c_alt, c1, c2;
1330
1331 SPLIT_STRING(src, bytes, charset, c1, c2);
1332 if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1333 charset = CHAR_CHARSET (c_alt);
e0e989f6 1334 }
bdd9fb48 1335
e0e989f6 1336 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1ba9e4ab 1337 if (r[reg] == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
bdd9fb48
KH
1338 {
1339 found++;
1340 r[reg] = charset;
1341 }
1342
1343 src += bytes;
1344 }
1345
1346 if (found)
1347 {
1348 for (reg = 0; reg < 4; reg++)
1349 if (r[reg] >= 0
1350 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1351 ENCODE_DESIGNATION (r[reg], reg, coding);
1352 *dstp = dst;
e0e989f6 1353 }
e0e989f6
KH
1354}
1355
4ed46869
KH
1356/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1357
1358int
1359encode_coding_iso2022 (coding, source, destination,
1360 src_bytes, dst_bytes, consumed)
1361 struct coding_system *coding;
1362 unsigned char *source, *destination;
1363 int src_bytes, dst_bytes;
1364 int *consumed;
1365{
1366 unsigned char *src = source;
1367 unsigned char *src_end = source + src_bytes;
1368 unsigned char *dst = destination;
1369 unsigned char *dst_end = destination + dst_bytes;
e0e989f6 1370 /* Since the maximum bytes produced by each loop is 20, we subtract 19
4ed46869
KH
1371 from DST_END to assure overflow checking is necessary only at the
1372 head of loop. */
e0e989f6 1373 unsigned char *adjusted_dst_end = dst_end - 19;
bdd9fb48
KH
1374 Lisp_Object unification_table = coding->character_unification_table;
1375
1376 if (!NILP (Venable_character_unification) && NILP (unification_table))
1377 unification_table = Vstandard_character_unification_table_for_write;
4ed46869
KH
1378
1379 while (src < src_end && dst < adjusted_dst_end)
1380 {
1381 /* SRC_BASE remembers the start position in source in each loop.
1382 The loop will be exited when there's not enough source text
1383 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1384 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1385 reset to SRC_BASE before exiting. */
1386 unsigned char *src_base = src;
bdd9fb48 1387 int charset, c1, c2, c3, c4;
4ed46869 1388
e0e989f6
KH
1389 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1390 && CODING_SPEC_ISO_BOL (coding))
1391 {
bdd9fb48
KH
1392 /* We have to produce designation sequences if any now. */
1393 encode_designation_at_bol (coding, unification_table,
1394 src, src_end, &dst);
e0e989f6
KH
1395 CODING_SPEC_ISO_BOL (coding) = 0;
1396 }
1397
1398 c1 = *src++;
4ed46869
KH
1399 /* If we are seeing a component of a composite character, we are
1400 seeing a leading-code specially encoded for composition, or a
1401 composition rule if composing with rule. We must set C1
1402 to a normal leading-code or an ASCII code. If we are not at
1403 a composed character, we must reset the composition state. */
1404 if (COMPOSING_P (coding->composing))
1405 {
1406 if (c1 < 0xA0)
1407 {
1408 /* We are not in a composite character any longer. */
1409 coding->composing = COMPOSING_NO;
1410 ENCODE_COMPOSITION_END;
1411 }
1412 else
1413 {
1414 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1415 {
1416 *dst++ = c1 & 0x7F;
1417 coding->composing = COMPOSING_WITH_RULE_HEAD;
1418 continue;
1419 }
1420 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1421 coding->composing = COMPOSING_WITH_RULE_RULE;
1422 if (c1 == 0xA0)
1423 {
1424 /* This is an ASCII component. */
1425 ONE_MORE_BYTE (c1);
1426 c1 &= 0x7F;
1427 }
1428 else
1429 /* This is a leading-code of non ASCII component. */
1430 c1 -= 0x20;
1431 }
1432 }
1433
1434 /* Now encode one character. C1 is a control character, an
1435 ASCII character, or a leading-code of multi-byte character. */
1436 switch (emacs_code_class[c1])
1437 {
1438 case EMACS_ascii_code:
bdd9fb48 1439 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
4ed46869
KH
1440 break;
1441
1442 case EMACS_control_code:
1443 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1444 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1445 *dst++ = c1;
1446 break;
1447
1448 case EMACS_carriage_return_code:
1449 if (!coding->selective)
1450 {
1451 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1452 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1453 *dst++ = c1;
1454 break;
1455 }
1456 /* fall down to treat '\r' as '\n' ... */
1457
1458 case EMACS_linefeed_code:
1459 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
e0e989f6
KH
1460 ENCODE_RESET_PLANE_AND_REGISTER;
1461 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1462 bcopy (coding->spec.iso2022.initial_designation,
1463 coding->spec.iso2022.current_designation,
1464 sizeof coding->spec.iso2022.initial_designation);
4ed46869 1465 if (coding->eol_type == CODING_EOL_LF
0ef69138 1466 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
1467 *dst++ = ISO_CODE_LF;
1468 else if (coding->eol_type == CODING_EOL_CRLF)
1469 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1470 else
1471 *dst++ = ISO_CODE_CR;
e0e989f6 1472 CODING_SPEC_ISO_BOL (coding) = 1;
4ed46869
KH
1473 break;
1474
1475 case EMACS_leading_code_2:
1476 ONE_MORE_BYTE (c2);
bdd9fb48 1477 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
1478 break;
1479
1480 case EMACS_leading_code_3:
1481 TWO_MORE_BYTES (c2, c3);
1482 if (c1 < LEADING_CODE_PRIVATE_11)
bdd9fb48 1483 ENCODE_ISO_CHARACTER (c1, c2, c3);
4ed46869 1484 else
bdd9fb48 1485 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
4ed46869
KH
1486 break;
1487
1488 case EMACS_leading_code_4:
1489 THREE_MORE_BYTES (c2, c3, c4);
bdd9fb48 1490 ENCODE_ISO_CHARACTER (c2, c3, c4);
4ed46869
KH
1491 break;
1492
1493 case EMACS_leading_code_composition:
1494 ONE_MORE_BYTE (c1);
1495 if (c1 == 0xFF)
1496 {
1497 coding->composing = COMPOSING_WITH_RULE_HEAD;
1498 ENCODE_COMPOSITION_WITH_RULE_START;
1499 }
1500 else
1501 {
1502 /* Rewind one byte because it is a character code of
1503 composition elements. */
1504 src--;
1505 coding->composing = COMPOSING_NO_RULE_HEAD;
1506 ENCODE_COMPOSITION_NO_RULE_START;
1507 }
1508 break;
1509
1510 case EMACS_invalid_code:
1511 *dst++ = c1;
1512 break;
1513 }
1514 continue;
1515 label_end_of_loop:
1516 coding->carryover_size = src - src_base;
1517 bcopy (src_base, coding->carryover, coding->carryover_size);
4ed46869
KH
1518 break;
1519 }
1520
1521 /* If this is the last block of the text to be encoded, we must
bdd9fb48
KH
1522 reset graphic planes and registers to the initial state. */
1523 if (src >= src_end && coding->last_block)
4ed46869 1524 {
e0e989f6 1525 ENCODE_RESET_PLANE_AND_REGISTER;
bdd9fb48
KH
1526 if (coding->carryover_size > 0
1527 && coding->carryover_size < (dst_end - dst))
1528 {
1529 bcopy (coding->carryover, dst, coding->carryover_size);
1530 dst += coding->carryover_size;
1531 coding->carryover_size = 0;
1532 }
4ed46869
KH
1533 }
1534 *consumed = src - source;
1535 return dst - destination;
1536}
1537
1538\f
1539/*** 4. SJIS and BIG5 handlers ***/
1540
1541/* Although SJIS and BIG5 are not ISO's coding system, They are used
1542 quite widely. So, for the moment, Emacs supports them in the bare
1543 C code. But, in the future, they may be supported only by CCL. */
1544
1545/* SJIS is a coding system encoding three character sets: ASCII, right
1546 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
1547 as is. A character of charset katakana-jisx0201 is encoded by
1548 "position-code + 0x80". A character of charset japanese-jisx0208
1549 is encoded in 2-byte but two position-codes are divided and shifted
1550 so that it fit in the range below.
1551
1552 --- CODE RANGE of SJIS ---
1553 (character set) (range)
1554 ASCII 0x00 .. 0x7F
1555 KATAKANA-JISX0201 0xA0 .. 0xDF
1556 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1557 (2nd byte) 0x40 .. 0xFF
1558 -------------------------------
1559
1560*/
1561
1562/* BIG5 is a coding system encoding two character sets: ASCII and
1563 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1564 character set and is encoded in two-byte.
1565
1566 --- CODE RANGE of BIG5 ---
1567 (character set) (range)
1568 ASCII 0x00 .. 0x7F
1569 Big5 (1st byte) 0xA1 .. 0xFE
1570 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1571 --------------------------
1572
1573 Since the number of characters in Big5 is larger than maximum
1574 characters in Emacs' charset (96x96), it can't be handled as one
1575 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
1576 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
1577 contains frequently used characters and the latter contains less
1578 frequently used characters. */
1579
1580/* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
1581 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
1582 C1 and C2 are the 1st and 2nd position-codes of of Emacs' internal
1583 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
1584
1585/* Number of Big5 characters which have the same code in 1st byte. */
1586#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
1587
1588#define DECODE_BIG5(b1, b2, charset, c1, c2) \
1589 do { \
1590 unsigned int temp \
1591 = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62); \
1592 if (b1 < 0xC9) \
1593 charset = charset_big5_1; \
1594 else \
1595 { \
1596 charset = charset_big5_2; \
1597 temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW; \
1598 } \
1599 c1 = temp / (0xFF - 0xA1) + 0x21; \
1600 c2 = temp % (0xFF - 0xA1) + 0x21; \
1601 } while (0)
1602
1603#define ENCODE_BIG5(charset, c1, c2, b1, b2) \
1604 do { \
1605 unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21); \
1606 if (charset == charset_big5_2) \
1607 temp += BIG5_SAME_ROW * (0xC9 - 0xA1); \
1608 b1 = temp / BIG5_SAME_ROW + 0xA1; \
1609 b2 = temp % BIG5_SAME_ROW; \
1610 b2 += b2 < 0x3F ? 0x40 : 0x62; \
1611 } while (0)
1612
1613/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1614 Check if a text is encoded in SJIS. If it is, return
1615 CODING_CATEGORY_MASK_SJIS, else return 0. */
1616
1617int
1618detect_coding_sjis (src, src_end)
1619 unsigned char *src, *src_end;
1620{
1621 unsigned char c;
1622
1623 while (src < src_end)
1624 {
1625 c = *src++;
1626 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1627 return 0;
1628 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1629 {
1630 if (src < src_end && *src++ < 0x40)
1631 return 0;
1632 }
1633 }
1634 return CODING_CATEGORY_MASK_SJIS;
1635}
1636
1637/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1638 Check if a text is encoded in BIG5. If it is, return
1639 CODING_CATEGORY_MASK_BIG5, else return 0. */
1640
1641int
1642detect_coding_big5 (src, src_end)
1643 unsigned char *src, *src_end;
1644{
1645 unsigned char c;
1646
1647 while (src < src_end)
1648 {
1649 c = *src++;
1650 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1651 return 0;
1652 if (c >= 0xA1)
1653 {
1654 if (src >= src_end)
1655 break;
1656 c = *src++;
1657 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1658 return 0;
1659 }
1660 }
1661 return CODING_CATEGORY_MASK_BIG5;
1662}
1663
1664/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1665 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
1666
1667int
1668decode_coding_sjis_big5 (coding, source, destination,
1669 src_bytes, dst_bytes, consumed, sjis_p)
1670 struct coding_system *coding;
1671 unsigned char *source, *destination;
1672 int src_bytes, dst_bytes;
1673 int *consumed;
1674 int sjis_p;
1675{
1676 unsigned char *src = source;
1677 unsigned char *src_end = source + src_bytes;
1678 unsigned char *dst = destination;
1679 unsigned char *dst_end = destination + dst_bytes;
1680 /* Since the maximum bytes produced by each loop is 4, we subtract 3
1681 from DST_END to assure overflow checking is necessary only at the
1682 head of loop. */
1683 unsigned char *adjusted_dst_end = dst_end - 3;
1684
1685 while (src < src_end && dst < adjusted_dst_end)
1686 {
1687 /* SRC_BASE remembers the start position in source in each loop.
1688 The loop will be exited when there's not enough source text
1689 to analyze two-byte character (within macro ONE_MORE_BYTE).
1690 In that case, SRC is reset to SRC_BASE before exiting. */
1691 unsigned char *src_base = src;
1692 unsigned char c1 = *src++, c2, c3, c4;
1693
1694 if (c1 == '\r')
1695 {
1696 if (coding->eol_type == CODING_EOL_CRLF)
1697 {
1698 ONE_MORE_BYTE (c2);
1699 if (c2 == '\n')
1700 *dst++ = c2;
1701 else
1702 /* To process C2 again, SRC is subtracted by 1. */
1703 *dst++ = c1, src--;
1704 }
1705 else
1706 *dst++ = c1;
1707 }
1708 else if (c1 < 0x80)
1709 *dst++ = c1;
1710 else if (c1 < 0xA0 || c1 >= 0xE0)
1711 {
1712 /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1713 if (sjis_p)
1714 {
1715 ONE_MORE_BYTE (c2);
1716 DECODE_SJIS (c1, c2, c3, c4);
1717 DECODE_CHARACTER_DIMENSION2 (charset_jisx0208, c3, c4);
1718 }
1719 else if (c1 >= 0xE0 && c1 < 0xFF)
1720 {
1721 int charset;
1722
1723 ONE_MORE_BYTE (c2);
1724 DECODE_BIG5 (c1, c2, charset, c3, c4);
1725 DECODE_CHARACTER_DIMENSION2 (charset, c3, c4);
1726 }
1727 else /* Invalid code */
1728 *dst++ = c1;
1729 }
1730 else
1731 {
1732 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1733 if (sjis_p)
1734 DECODE_CHARACTER_DIMENSION1 (charset_katakana_jisx0201, c1);
1735 else
1736 {
1737 int charset;
1738
1739 ONE_MORE_BYTE (c2);
1740 DECODE_BIG5 (c1, c2, charset, c3, c4);
1741 DECODE_CHARACTER_DIMENSION2 (charset, c3, c4);
1742 }
1743 }
1744 continue;
1745
1746 label_end_of_loop:
1747 coding->carryover_size = src - src_base;
1748 bcopy (src_base, coding->carryover, coding->carryover_size);
1749 src = src_base;
1750 break;
1751 }
1752
1753 *consumed = src - source;
1754 return dst - destination;
1755}
1756
1757/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1758 This function can encode `charset_ascii', `charset_katakana_jisx0201',
1759 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
1760 sure that all these charsets are registered as official charset
1761 (i.e. do not have extended leading-codes). Characters of other
1762 charsets are produced without any encoding. If SJIS_P is 1, encode
1763 SJIS text, else encode BIG5 text. */
1764
1765int
1766encode_coding_sjis_big5 (coding, source, destination,
1767 src_bytes, dst_bytes, consumed, sjis_p)
1768 struct coding_system *coding;
1769 unsigned char *source, *destination;
1770 int src_bytes, dst_bytes;
1771 int *consumed;
1772 int sjis_p;
1773{
1774 unsigned char *src = source;
1775 unsigned char *src_end = source + src_bytes;
1776 unsigned char *dst = destination;
1777 unsigned char *dst_end = destination + dst_bytes;
1778 /* Since the maximum bytes produced by each loop is 2, we subtract 1
1779 from DST_END to assure overflow checking is necessary only at the
1780 head of loop. */
1781 unsigned char *adjusted_dst_end = dst_end - 1;
1782
1783 while (src < src_end && dst < adjusted_dst_end)
1784 {
1785 /* SRC_BASE remembers the start position in source in each loop.
1786 The loop will be exited when there's not enough source text
1787 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1788 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
1789 before exiting. */
1790 unsigned char *src_base = src;
1791 unsigned char c1 = *src++, c2, c3, c4;
1792
1793 if (coding->composing)
1794 {
1795 if (c1 == 0xA0)
1796 {
1797 ONE_MORE_BYTE (c1);
1798 c1 &= 0x7F;
1799 }
1800 else if (c1 >= 0xA0)
1801 c1 -= 0x20;
1802 else
1803 coding->composing = 0;
1804 }
1805
1806 switch (emacs_code_class[c1])
1807 {
1808 case EMACS_ascii_code:
1809 case EMACS_control_code:
1810 *dst++ = c1;
1811 break;
1812
1813 case EMACS_carriage_return_code:
1814 if (!coding->selective)
1815 {
1816 *dst++ = c1;
1817 break;
1818 }
1819 /* fall down to treat '\r' as '\n' ... */
1820
1821 case EMACS_linefeed_code:
1822 if (coding->eol_type == CODING_EOL_LF
0ef69138 1823 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
1824 *dst++ = '\n';
1825 else if (coding->eol_type == CODING_EOL_CRLF)
1826 *dst++ = '\r', *dst++ = '\n';
1827 else
1828 *dst++ = '\r';
1829 break;
1830
1831 case EMACS_leading_code_2:
1832 ONE_MORE_BYTE (c2);
1833 if (sjis_p && c1 == charset_katakana_jisx0201)
1834 *dst++ = c2;
1835 else
1836 *dst++ = c1, *dst++ = c2;
1837 break;
1838
1839 case EMACS_leading_code_3:
1840 TWO_MORE_BYTES (c2, c3);
1841 c2 &= 0x7F, c3 &= 0x7F;
1842 if (sjis_p && c1 == charset_jisx0208)
1843 {
1844 unsigned char s1, s2;
1845
1846 ENCODE_SJIS (c2, c3, s1, s2);
1847 *dst++ = s1, *dst++ = s2;
1848 }
1849 else if (!sjis_p && (c1 == charset_big5_1 || c1 == charset_big5_2))
1850 {
1851 unsigned char b1, b2;
1852
1853 ENCODE_BIG5 (c1, c2, c3, b1, b2);
1854 *dst++ = b1, *dst++ = b2;
1855 }
1856 else
1857 *dst++ = c1, *dst++ = c2, *dst++ = c3;
1858 break;
1859
1860 case EMACS_leading_code_4:
1861 THREE_MORE_BYTES (c2, c3, c4);
1862 *dst++ = c1, *dst++ = c2, *dst++ = c3, *dst++ = c4;
1863 break;
1864
1865 case EMACS_leading_code_composition:
1866 coding->composing = 1;
1867 break;
1868
1869 default: /* i.e. case EMACS_invalid_code: */
1870 *dst++ = c1;
1871 }
1872 continue;
1873
1874 label_end_of_loop:
1875 coding->carryover_size = src - src_base;
1876 bcopy (src_base, coding->carryover, coding->carryover_size);
1877 src = src_base;
1878 break;
1879 }
1880
1881 *consumed = src - source;
1882 return dst - destination;
1883}
1884
1885\f
1886/*** 5. End-of-line handlers ***/
1887
1888/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1889 This function is called only when `coding->eol_type' is
1890 CODING_EOL_CRLF or CODING_EOL_CR. */
1891
1892decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1893 struct coding_system *coding;
1894 unsigned char *source, *destination;
1895 int src_bytes, dst_bytes;
1896 int *consumed;
1897{
1898 unsigned char *src = source;
1899 unsigned char *src_end = source + src_bytes;
1900 unsigned char *dst = destination;
1901 unsigned char *dst_end = destination + dst_bytes;
1902 int produced;
1903
1904 switch (coding->eol_type)
1905 {
1906 case CODING_EOL_CRLF:
1907 {
1908 /* Since the maximum bytes produced by each loop is 2, we
1909 subtract 1 from DST_END to assure overflow checking is
1910 necessary only at the head of loop. */
1911 unsigned char *adjusted_dst_end = dst_end - 1;
1912
1913 while (src < src_end && dst < adjusted_dst_end)
1914 {
1915 unsigned char *src_base = src;
1916 unsigned char c = *src++;
1917 if (c == '\r')
1918 {
1919 ONE_MORE_BYTE (c);
1920 if (c != '\n')
1921 *dst++ = '\r';
bfd99048 1922 *dst++ = c;
4ed46869
KH
1923 }
1924 else
1925 *dst++ = c;
1926 continue;
1927
1928 label_end_of_loop:
1929 coding->carryover_size = src - src_base;
1930 bcopy (src_base, coding->carryover, coding->carryover_size);
1931 src = src_base;
1932 break;
1933 }
1934 *consumed = src - source;
1935 produced = dst - destination;
1936 break;
1937 }
1938
1939 case CODING_EOL_CR:
1940 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1941 bcopy (source, destination, produced);
1942 dst_end = destination + produced;
1943 while (dst < dst_end)
1944 if (*dst++ == '\r') dst[-1] = '\n';
1945 *consumed = produced;
1946 break;
1947
1948 default: /* i.e. case: CODING_EOL_LF */
1949 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1950 bcopy (source, destination, produced);
1951 *consumed = produced;
1952 break;
1953 }
1954
1955 return produced;
1956}
1957
1958/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
1959 format of end-of-line according to `coding->eol_type'. If
1960 `coding->selective' is 1, code '\r' in source text also means
1961 end-of-line. */
1962
1963encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1964 struct coding_system *coding;
1965 unsigned char *source, *destination;
1966 int src_bytes, dst_bytes;
1967 int *consumed;
1968{
1969 unsigned char *src = source;
1970 unsigned char *dst = destination;
1971 int produced;
1972
1973 if (src_bytes <= 0)
1974 return 0;
1975
1976 switch (coding->eol_type)
1977 {
1978 case CODING_EOL_LF:
0ef69138 1979 case CODING_EOL_UNDECIDED:
4ed46869
KH
1980 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1981 bcopy (source, destination, produced);
1982 if (coding->selective)
1983 {
1984 int i = produced;
1985 while (i--)
1986 if (*dst++ == '\r') dst[-1] = '\n';
1987 }
1988 *consumed = produced;
1989
1990 case CODING_EOL_CRLF:
1991 {
1992 unsigned char c;
1993 unsigned char *src_end = source + src_bytes;
1994 unsigned char *dst_end = destination + dst_bytes;
1995 /* Since the maximum bytes produced by each loop is 2, we
1996 subtract 1 from DST_END to assure overflow checking is
1997 necessary only at the head of loop. */
1998 unsigned char *adjusted_dst_end = dst_end - 1;
1999
2000 while (src < src_end && dst < adjusted_dst_end)
2001 {
2002 c = *src++;
2003 if (c == '\n' || (c == '\r' && coding->selective))
2004 *dst++ = '\r', *dst++ = '\n';
2005 else
2006 *dst++ = c;
2007 }
2008 produced = dst - destination;
2009 *consumed = src - source;
2010 break;
2011 }
2012
2013 default: /* i.e. case CODING_EOL_CR: */
2014 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2015 bcopy (source, destination, produced);
2016 {
2017 int i = produced;
2018 while (i--)
2019 if (*dst++ == '\n') dst[-1] = '\r';
2020 }
2021 *consumed = produced;
2022 }
2023
2024 return produced;
2025}
2026
2027\f
2028/*** 6. C library functions ***/
2029
2030/* In Emacs Lisp, coding system is represented by a Lisp symbol which
2031 has a property `coding-system'. The value of this property is a
2032 vector of length 5 (called as coding-vector). Among elements of
2033 this vector, the first (element[0]) and the fifth (element[4])
2034 carry important information for decoding/encoding. Before
2035 decoding/encoding, this information should be set in fields of a
2036 structure of type `coding_system'.
2037
2038 A value of property `coding-system' can be a symbol of another
2039 subsidiary coding-system. In that case, Emacs gets coding-vector
2040 from that symbol.
2041
2042 `element[0]' contains information to be set in `coding->type'. The
2043 value and its meaning is as follows:
2044
0ef69138
KH
2045 0 -- coding_type_emacs_mule
2046 1 -- coding_type_sjis
2047 2 -- coding_type_iso2022
2048 3 -- coding_type_big5
2049 4 -- coding_type_ccl encoder/decoder written in CCL
2050 nil -- coding_type_no_conversion
2051 t -- coding_type_undecided (automatic conversion on decoding,
2052 no-conversion on encoding)
4ed46869
KH
2053
2054 `element[4]' contains information to be set in `coding->flags' and
2055 `coding->spec'. The meaning varies by `coding->type'.
2056
2057 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2058 of length 32 (of which the first 13 sub-elements are used now).
2059 Meanings of these sub-elements are:
2060
2061 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2062 If the value is an integer of valid charset, the charset is
2063 assumed to be designated to graphic register N initially.
2064
2065 If the value is minus, it is a minus value of charset which
2066 reserves graphic register N, which means that the charset is
2067 not designated initially but should be designated to graphic
2068 register N just before encoding a character in that charset.
2069
2070 If the value is nil, graphic register N is never used on
2071 encoding.
2072
2073 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2074 Each value takes t or nil. See the section ISO2022 of
2075 `coding.h' for more information.
2076
2077 If `coding->type' is `coding_type_big5', element[4] is t to denote
2078 BIG5-ETen or nil to denote BIG5-HKU.
2079
2080 If `coding->type' takes the other value, element[4] is ignored.
2081
2082 Emacs Lisp's coding system also carries information about format of
2083 end-of-line in a value of property `eol-type'. If the value is
2084 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2085 means CODING_EOL_CR. If it is not integer, it should be a vector
2086 of subsidiary coding systems of which property `eol-type' has one
2087 of above values.
2088
2089*/
2090
2091/* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2092 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2093 is setup so that no conversion is necessary and return -1, else
2094 return 0. */
2095
2096int
e0e989f6
KH
2097setup_coding_system (coding_system, coding)
2098 Lisp_Object coding_system;
4ed46869
KH
2099 struct coding_system *coding;
2100{
4ed46869
KH
2101 Lisp_Object type, eol_type;
2102
2103 /* At first, set several fields default values. */
2104 coding->require_flushing = 0;
2105 coding->last_block = 0;
2106 coding->selective = 0;
2107 coding->composing = 0;
2108 coding->direction = 0;
2109 coding->carryover_size = 0;
4ed46869 2110 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
bdd9fb48
KH
2111 /* We have not yet implemented a way to specify unification table in
2112 a coding system. */
2113 coding->character_unification_table = Qnil;
4ed46869 2114
e0e989f6
KH
2115 Vlast_coding_system_used = coding->symbol = coding_system;
2116 eol_type = Qnil;
2117 /* Get value of property `coding-system' until we get a vector.
2118 While doing that, also get values of properties
2119 `post-read-conversion', `pre-write-conversion', and `eol-type'. */
2120 while (!NILP (coding_system) && SYMBOLP (coding_system))
4ed46869 2121 {
4ed46869 2122 if (NILP (coding->post_read_conversion))
e0e989f6 2123 coding->post_read_conversion = Fget (coding_system,
4ed46869 2124 Qpost_read_conversion);
e0e989f6
KH
2125 if (NILP (coding->pre_write_conversion))
2126 coding->pre_write_conversion = Fget (coding_system,
4ed46869 2127 Qpre_write_conversion);
e0e989f6
KH
2128 if (NILP (eol_type))
2129 eol_type = Fget (coding_system, Qeol_type);
2130 coding_system = Fget (coding_system, Qcoding_system);
4ed46869 2131 }
e0e989f6
KH
2132 if (!VECTORP (coding_system)
2133 || XVECTOR (coding_system)->size != 5)
4ed46869
KH
2134 goto label_invalid_coding_system;
2135
4ed46869 2136 if (VECTORP (eol_type))
0ef69138 2137 coding->eol_type = CODING_EOL_UNDECIDED;
4ed46869
KH
2138 else if (XFASTINT (eol_type) == 1)
2139 coding->eol_type = CODING_EOL_CRLF;
2140 else if (XFASTINT (eol_type) == 2)
2141 coding->eol_type = CODING_EOL_CR;
2142 else
2143 coding->eol_type = CODING_EOL_LF;
2144
e0e989f6 2145 type = XVECTOR (coding_system)->contents[0];
4ed46869
KH
2146 switch (XFASTINT (type))
2147 {
2148 case 0:
0ef69138 2149 coding->type = coding_type_emacs_mule;
4ed46869
KH
2150 break;
2151
2152 case 1:
2153 coding->type = coding_type_sjis;
2154 break;
2155
2156 case 2:
2157 coding->type = coding_type_iso2022;
2158 {
e0e989f6 2159 Lisp_Object val = XVECTOR (coding_system)->contents[4];
4ed46869
KH
2160 Lisp_Object *flags;
2161 int i, charset, default_reg_bits = 0;
2162
2163 if (!VECTORP (val) || XVECTOR (val)->size != 32)
2164 goto label_invalid_coding_system;
2165
2166 flags = XVECTOR (val)->contents;
2167 coding->flags
2168 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2169 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2170 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2171 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2172 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2173 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2174 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2175 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
e0e989f6
KH
2176 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2177 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2178 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL));
4ed46869
KH
2179
2180 /* Invoke graphic register 0 to plane 0. */
2181 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2182 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2183 CODING_SPEC_ISO_INVOCATION (coding, 1)
2184 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2185 /* Not single shifting at first. */
2186 CODING_SPEC_ISO_SINGLE_SHIFTING(coding) = 0;
e0e989f6
KH
2187 /* Beginning of buffer should also be regarded as bol. */
2188 CODING_SPEC_ISO_BOL(coding) = 1;
4ed46869
KH
2189
2190 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2191 FLAGS[REG] can be one of below:
2192 integer CHARSET: CHARSET occupies register I,
2193 t: designate nothing to REG initially, but can be used
2194 by any charsets,
2195 list of integer, nil, or t: designate the first
2196 element (if integer) to REG initially, the remaining
2197 elements (if integer) is designated to REG on request,
2198 if an element is t, REG can be used by any charset,
2199 nil: REG is never used. */
467e7675 2200 for (charset = 0; charset <= MAX_CHARSET; charset++)
1ba9e4ab
KH
2201 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2202 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
4ed46869
KH
2203 for (i = 0; i < 4; i++)
2204 {
2205 if (INTEGERP (flags[i])
e0e989f6
KH
2206 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2207 || (charset = get_charset_id (flags[i])) >= 0)
4ed46869
KH
2208 {
2209 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2210 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2211 }
2212 else if (EQ (flags[i], Qt))
2213 {
2214 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2215 default_reg_bits |= 1 << i;
2216 }
2217 else if (CONSP (flags[i]))
2218 {
2219 Lisp_Object tail = flags[i];
2220
2221 if (INTEGERP (XCONS (tail)->car)
2222 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
2223 CHARSET_VALID_P (charset))
2224 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
4ed46869
KH
2225 {
2226 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2227 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2228 }
2229 else
2230 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2231 tail = XCONS (tail)->cdr;
2232 while (CONSP (tail))
2233 {
2234 if (INTEGERP (XCONS (tail)->car)
2235 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
2236 CHARSET_VALID_P (charset))
2237 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
4ed46869
KH
2238 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2239 = i;
2240 else if (EQ (XCONS (tail)->car, Qt))
2241 default_reg_bits |= 1 << i;
2242 tail = XCONS (tail)->cdr;
2243 }
2244 }
2245 else
2246 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2247
2248 CODING_SPEC_ISO_DESIGNATION (coding, i)
2249 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2250 }
2251
2252 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2253 {
2254 /* REG 1 can be used only by locking shift in 7-bit env. */
2255 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2256 default_reg_bits &= ~2;
2257 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2258 /* Without any shifting, only REG 0 and 1 can be used. */
2259 default_reg_bits &= 3;
2260 }
2261
467e7675 2262 for (charset = 0; charset <= MAX_CHARSET; charset++)
4ed46869 2263 if (CHARSET_VALID_P (charset)
1ba9e4ab
KH
2264 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2265 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
4ed46869
KH
2266 {
2267 /* We have not yet decided where to designate CHARSET. */
2268 int reg_bits = default_reg_bits;
2269
2270 if (CHARSET_CHARS (charset) == 96)
2271 /* A charset of CHARS96 can't be designated to REG 0. */
2272 reg_bits &= ~1;
2273
2274 if (reg_bits)
2275 /* There exist some default graphic register. */
2276 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2277 = (reg_bits & 1
2278 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2279 else
2280 /* We anyway have to designate CHARSET to somewhere. */
2281 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2282 = (CHARSET_CHARS (charset) == 94
2283 ? 0
2284 : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2285 || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2286 ? 1
2287 : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2288 ? 2 : 0)));
2289 }
2290 }
2291 coding->require_flushing = 1;
2292 break;
2293
2294 case 3:
2295 coding->type = coding_type_big5;
2296 coding->flags
e0e989f6 2297 = (NILP (XVECTOR (coding_system)->contents[4])
4ed46869
KH
2298 ? CODING_FLAG_BIG5_HKU
2299 : CODING_FLAG_BIG5_ETEN);
2300 break;
2301
2302 case 4:
2303 coding->type = coding_type_ccl;
2304 {
e0e989f6 2305 Lisp_Object val = XVECTOR (coding_system)->contents[4];
4ed46869
KH
2306 if (CONSP (val)
2307 && VECTORP (XCONS (val)->car)
2308 && VECTORP (XCONS (val)->cdr))
2309 {
2310 setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2311 setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2312 }
2313 else
2314 goto label_invalid_coding_system;
2315 }
2316 coding->require_flushing = 1;
2317 break;
2318
2319 default:
2320 if (EQ (type, Qt))
0ef69138 2321 coding->type = coding_type_undecided;
4ed46869
KH
2322 else
2323 coding->type = coding_type_no_conversion;
2324 break;
2325 }
2326 return 0;
2327
2328 label_invalid_coding_system:
2329 coding->type = coding_type_no_conversion;
dec137e5 2330 coding->eol_type = CODING_EOL_LF;
e0e989f6
KH
2331 coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2332 = Qnil;
4ed46869
KH
2333 return -1;
2334}
2335
2336/* Emacs has a mechanism to automatically detect a coding system if it
2337 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
2338 it's impossible to distinguish some coding systems accurately
2339 because they use the same range of codes. So, at first, coding
2340 systems are categorized into 7, those are:
2341
0ef69138 2342 o coding-category-emacs-mule
4ed46869
KH
2343
2344 The category for a coding system which has the same code range
2345 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 2346 symbol) `emacs-mule' by default.
4ed46869
KH
2347
2348 o coding-category-sjis
2349
2350 The category for a coding system which has the same code range
2351 as SJIS. Assigned the coding-system (Lisp
e0e989f6 2352 symbol) `shift-jis' by default.
4ed46869
KH
2353
2354 o coding-category-iso-7
2355
2356 The category for a coding system which has the same code range
2357 as ISO2022 of 7-bit environment. Assigned the coding-system
e0e989f6 2358 (Lisp symbol) `iso-2022-7' by default.
4ed46869
KH
2359
2360 o coding-category-iso-8-1
2361
2362 The category for a coding system which has the same code range
2363 as ISO2022 of 8-bit environment and graphic plane 1 used only
2364 for DIMENSION1 charset. Assigned the coding-system (Lisp
e0e989f6 2365 symbol) `iso-8859-1' by default.
4ed46869
KH
2366
2367 o coding-category-iso-8-2
2368
2369 The category for a coding system which has the same code range
2370 as ISO2022 of 8-bit environment and graphic plane 1 used only
2371 for DIMENSION2 charset. Assigned the coding-system (Lisp
e0e989f6 2372 symbol) `euc-japan' by default.
4ed46869
KH
2373
2374 o coding-category-iso-else
2375
2376 The category for a coding system which has the same code range
2377 as ISO2022 but not belongs to any of the above three
2378 categories. Assigned the coding-system (Lisp symbol)
e0e989f6 2379 `iso-2022-ss2-7' by default.
4ed46869
KH
2380
2381 o coding-category-big5
2382
2383 The category for a coding system which has the same code range
2384 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 2385 `cn-big5' by default.
4ed46869
KH
2386
2387 o coding-category-binary
2388
2389 The category for a coding system not categorized in any of the
2390 above. Assigned the coding-system (Lisp symbol)
e0e989f6 2391 `no-conversion' by default.
4ed46869
KH
2392
2393 Each of them is a Lisp symbol and the value is an actual
2394 `coding-system's (this is also a Lisp symbol) assigned by a user.
2395 What Emacs does actually is to detect a category of coding system.
2396 Then, it uses a `coding-system' assigned to it. If Emacs can't
2397 decide only one possible category, it selects a category of the
2398 highest priority. Priorities of categories are also specified by a
2399 user in a Lisp variable `coding-category-list'.
2400
2401*/
2402
2403/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2404 If it detects possible coding systems, return an integer in which
2405 appropriate flag bits are set. Flag bits are defined by macros
2406 CODING_CATEGORY_MASK_XXX in `coding.h'. */
2407
2408int
2409detect_coding_mask (src, src_bytes)
2410 unsigned char *src;
2411 int src_bytes;
2412{
2413 register unsigned char c;
2414 unsigned char *src_end = src + src_bytes;
2415 int mask;
2416
2417 /* At first, skip all ASCII characters and control characters except
2418 for three ISO2022 specific control characters. */
bcf26d6a 2419 label_loop_detect_coding:
4ed46869
KH
2420 while (src < src_end)
2421 {
2422 c = *src;
2423 if (c >= 0x80
2424 || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2425 break;
2426 src++;
2427 }
2428
2429 if (src >= src_end)
2430 /* We found nothing other than ASCII. There's nothing to do. */
2431 return CODING_CATEGORY_MASK_ANY;
2432
2433 /* The text seems to be encoded in some multilingual coding system.
2434 Now, try to find in which coding system the text is encoded. */
2435 if (c < 0x80)
bcf26d6a
KH
2436 {
2437 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2438 /* C is an ISO2022 specific control code of C0. */
2439 mask = detect_coding_iso2022 (src, src_end);
2440 src++;
2441 if (mask == CODING_CATEGORY_MASK_ANY)
2442 /* No valid ISO2022 code follows C. Try again. */
2443 goto label_loop_detect_coding;
2444 }
4ed46869
KH
2445 else if (c == ISO_CODE_SS2 || c == ISO_CODE_SS3 || c == ISO_CODE_CSI)
2446 /* C is an ISO2022 specific control code of C1,
2447 or the first byte of SJIS's 2-byte character code,
2448 or a leading code of Emacs. */
2449 mask = (detect_coding_iso2022 (src, src_end)
2450 | detect_coding_sjis (src, src_end)
0ef69138 2451 | detect_coding_emacs_mule (src, src_end));
4ed46869
KH
2452
2453 else if (c < 0xA0)
2454 /* C is the first byte of SJIS character code,
2455 or a leading-code of Emacs. */
2456 mask = (detect_coding_sjis (src, src_end)
0ef69138 2457 | detect_coding_emacs_mule (src, src_end));
4ed46869
KH
2458
2459 else
2460 /* C is a character of ISO2022 in graphic plane right,
2461 or a SJIS's 1-byte character code (i.e. JISX0201),
2462 or the first byte of BIG5's 2-byte code. */
2463 mask = (detect_coding_iso2022 (src, src_end)
2464 | detect_coding_sjis (src, src_end)
2465 | detect_coding_big5 (src, src_end));
2466
2467 return mask;
2468}
2469
2470/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2471 The information of the detected coding system is set in CODING. */
2472
2473void
2474detect_coding (coding, src, src_bytes)
2475 struct coding_system *coding;
2476 unsigned char *src;
2477 int src_bytes;
2478{
2479 int mask = detect_coding_mask (src, src_bytes);
2480 int idx;
2481
2482 if (mask == CODING_CATEGORY_MASK_ANY)
2483 /* We found nothing other than ASCII. There's nothing to do. */
2484 return;
2485
2486 if (!mask)
2487 /* The source text seems to be encoded in unknown coding system.
2488 Emacs regards the category of such a kind of coding system as
2489 `coding-category-binary'. We assume that a user has assigned
2490 an appropriate coding system for a `coding-category-binary'. */
2491 idx = CODING_CATEGORY_IDX_BINARY;
2492 else
2493 {
2494 /* We found some plausible coding systems. Let's use a coding
2495 system of the highest priority. */
2496 Lisp_Object val = Vcoding_category_list;
2497
2498 if (CONSP (val))
2499 while (!NILP (val))
2500 {
2501 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2502 if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2503 break;
2504 val = XCONS (val)->cdr;
2505 }
2506 else
2507 val = Qnil;
2508
2509 if (NILP (val))
2510 {
2511 /* For unknown reason, `Vcoding_category_list' contains none
2512 of found categories. Let's use any of them. */
2513 for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2514 if (mask & (1 << idx))
2515 break;
2516 }
2517 }
2518 setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2519}
2520
2521/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2522 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
0ef69138 2523 CODING_EOL_CR, and CODING_EOL_UNDECIDED. */
4ed46869
KH
2524
2525int
2526detect_eol_type (src, src_bytes)
2527 unsigned char *src;
2528 int src_bytes;
2529{
2530 unsigned char *src_end = src + src_bytes;
2531 unsigned char c;
2532
2533 while (src < src_end)
2534 {
2535 c = *src++;
2536 if (c == '\n')
2537 return CODING_EOL_LF;
2538 else if (c == '\r')
2539 {
2540 if (src < src_end && *src == '\n')
2541 return CODING_EOL_CRLF;
2542 else
2543 return CODING_EOL_CR;
2544 }
2545 }
0ef69138 2546 return CODING_EOL_UNDECIDED;
4ed46869
KH
2547}
2548
2549/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2550 is encoded. If it detects an appropriate format of end-of-line, it
2551 sets the information in *CODING. */
2552
2553void
2554detect_eol (coding, src, src_bytes)
2555 struct coding_system *coding;
2556 unsigned char *src;
2557 int src_bytes;
2558{
2559 Lisp_Object val;
2560 int eol_type = detect_eol_type (src, src_bytes);
2561
0ef69138 2562 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2563 /* We found no end-of-line in the source text. */
2564 return;
2565
2566 val = Fget (coding->symbol, Qeol_type);
2567 if (VECTORP (val) && XVECTOR (val)->size == 3)
2568 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2569}
2570
2571/* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
2572 decoding, it may detect coding system and format of end-of-line if
2573 those are not yet decided. */
2574
2575int
2576decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2577 struct coding_system *coding;
2578 unsigned char *source, *destination;
2579 int src_bytes, dst_bytes;
2580 int *consumed;
2581{
2582 int produced;
2583
2584 if (src_bytes <= 0)
2585 {
2586 *consumed = 0;
2587 return 0;
2588 }
2589
0ef69138 2590 if (coding->type == coding_type_undecided)
4ed46869
KH
2591 detect_coding (coding, source, src_bytes);
2592
0ef69138 2593 if (coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2594 detect_eol (coding, source, src_bytes);
2595
2596 coding->carryover_size = 0;
2597 switch (coding->type)
2598 {
2599 case coding_type_no_conversion:
2600 label_no_conversion:
2601 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2602 bcopy (source, destination, produced);
2603 *consumed = produced;
2604 break;
2605
0ef69138
KH
2606 case coding_type_emacs_mule:
2607 case coding_type_undecided:
4ed46869 2608 if (coding->eol_type == CODING_EOL_LF
0ef69138 2609 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2610 goto label_no_conversion;
2611 produced = decode_eol (coding, source, destination,
2612 src_bytes, dst_bytes, consumed);
2613 break;
2614
2615 case coding_type_sjis:
2616 produced = decode_coding_sjis_big5 (coding, source, destination,
2617 src_bytes, dst_bytes, consumed,
2618 1);
2619 break;
2620
2621 case coding_type_iso2022:
2622 produced = decode_coding_iso2022 (coding, source, destination,
2623 src_bytes, dst_bytes, consumed);
2624 break;
2625
2626 case coding_type_big5:
2627 produced = decode_coding_sjis_big5 (coding, source, destination,
2628 src_bytes, dst_bytes, consumed,
2629 0);
2630 break;
2631
2632 case coding_type_ccl:
2633 produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2634 src_bytes, dst_bytes, consumed);
2635 break;
2636 }
2637
2638 return produced;
2639}
2640
2641/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
2642
2643int
2644encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2645 struct coding_system *coding;
2646 unsigned char *source, *destination;
2647 int src_bytes, dst_bytes;
2648 int *consumed;
2649{
2650 int produced;
2651
2652 coding->carryover_size = 0;
2653 switch (coding->type)
2654 {
2655 case coding_type_no_conversion:
2656 label_no_conversion:
2657 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2658 if (produced > 0)
2659 {
2660 bcopy (source, destination, produced);
2661 if (coding->selective)
2662 {
2663 unsigned char *p = destination, *pend = destination + produced;
2664 while (p < pend)
e0e989f6 2665 if (*p++ == '\015') p[-1] = '\n';
4ed46869
KH
2666 }
2667 }
2668 *consumed = produced;
2669 break;
2670
0ef69138
KH
2671 case coding_type_emacs_mule:
2672 case coding_type_undecided:
4ed46869 2673 if (coding->eol_type == CODING_EOL_LF
0ef69138 2674 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2675 goto label_no_conversion;
2676 produced = encode_eol (coding, source, destination,
2677 src_bytes, dst_bytes, consumed);
2678 break;
2679
2680 case coding_type_sjis:
2681 produced = encode_coding_sjis_big5 (coding, source, destination,
2682 src_bytes, dst_bytes, consumed,
2683 1);
2684 break;
2685
2686 case coding_type_iso2022:
2687 produced = encode_coding_iso2022 (coding, source, destination,
2688 src_bytes, dst_bytes, consumed);
2689 break;
2690
2691 case coding_type_big5:
2692 produced = encode_coding_sjis_big5 (coding, source, destination,
2693 src_bytes, dst_bytes, consumed,
2694 0);
2695 break;
2696
2697 case coding_type_ccl:
2698 produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2699 src_bytes, dst_bytes, consumed);
2700 break;
2701 }
2702
2703 return produced;
2704}
2705
2706#define CONVERSION_BUFFER_EXTRA_ROOM 256
2707
2708/* Return maximum size (bytes) of a buffer enough for decoding
2709 SRC_BYTES of text encoded in CODING. */
2710
2711int
2712decoding_buffer_size (coding, src_bytes)
2713 struct coding_system *coding;
2714 int src_bytes;
2715{
2716 int magnification;
2717
2718 if (coding->type == coding_type_iso2022)
2719 magnification = 3;
2720 else if (coding->type == coding_type_ccl)
2721 magnification = coding->spec.ccl.decoder.buf_magnification;
2722 else
2723 magnification = 2;
2724
2725 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2726}
2727
2728/* Return maximum size (bytes) of a buffer enough for encoding
2729 SRC_BYTES of text to CODING. */
2730
2731int
2732encoding_buffer_size (coding, src_bytes)
2733 struct coding_system *coding;
2734 int src_bytes;
2735{
2736 int magnification;
2737
2738 if (coding->type == coding_type_ccl)
2739 magnification = coding->spec.ccl.encoder.buf_magnification;
2740 else
2741 magnification = 3;
2742
2743 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2744}
2745
2746#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
2747#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
2748#endif
2749
2750char *conversion_buffer;
2751int conversion_buffer_size;
2752
2753/* Return a pointer to a SIZE bytes of buffer to be used for encoding
2754 or decoding. Sufficient memory is allocated automatically. If we
2755 run out of memory, return NULL. */
2756
2757char *
2758get_conversion_buffer (size)
2759 int size;
2760{
2761 if (size > conversion_buffer_size)
2762 {
2763 char *buf;
2764 int real_size = conversion_buffer_size * 2;
2765
2766 while (real_size < size) real_size *= 2;
2767 buf = (char *) xmalloc (real_size);
2768 xfree (conversion_buffer);
2769 conversion_buffer = buf;
2770 conversion_buffer_size = real_size;
2771 }
2772 return conversion_buffer;
2773}
2774
2775\f
2776#ifdef emacs
2777/*** 7. Emacs Lisp library functions ***/
2778
02ba4723 2779DEFUN ("coding-system-spec", Fcoding_system_spec, Scoding_system_spec,
4ed46869 2780 1, 1, 0,
02ba4723 2781 "Return coding-spec of CODING-SYSTEM.\n\
4ed46869
KH
2782If CODING-SYSTEM is not a valid coding-system, return nil.")
2783 (obj)
2784 Lisp_Object obj;
2785{
2786 while (SYMBOLP (obj) && !NILP (obj))
2787 obj = Fget (obj, Qcoding_system);
2788 return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
2789 ? Qnil : obj);
2790}
2791
2792DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
2793 "Return t if OBJECT is nil or a coding-system.\n\
2794See document of make-coding-system for coding-system object.")
2795 (obj)
2796 Lisp_Object obj;
2797{
02ba4723 2798 return ((NILP (obj) || !NILP (Fcoding_system_spec (obj))) ? Qt : Qnil);
4ed46869
KH
2799}
2800
9d991de8
RS
2801DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
2802 Sread_non_nil_coding_system, 1, 1, 0,
e0e989f6 2803 "Read a coding system from the minibuffer, prompting with string PROMPT.")
4ed46869
KH
2804 (prompt)
2805 Lisp_Object prompt;
2806{
e0e989f6 2807 Lisp_Object val;
9d991de8
RS
2808 do
2809 {
02ba4723 2810 val = Fcompleting_read (prompt, Vobarray, Qcoding_system_spec,
9d991de8
RS
2811 Qt, Qnil, Qnil, Qnil);
2812 }
2813 while (XSTRING (val)->size == 0);
e0e989f6 2814 return (Fintern (val, Qnil));
4ed46869
KH
2815}
2816
2817DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
e0e989f6 2818 "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
4ed46869
KH
2819 (prompt)
2820 Lisp_Object prompt;
2821{
e0e989f6 2822 Lisp_Object val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
9d991de8 2823 Qt, Qnil, Qnil, Qnil);
e0e989f6 2824 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
2825}
2826
2827DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
2828 1, 1, 0,
2829 "Check validity of CODING-SYSTEM.\n\
2830If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
2831CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
2832The value of property should be a vector of length 5.")
2833 (coding_system)
2834 Lisp_Object coding_system;
2835{
2836 CHECK_SYMBOL (coding_system, 0);
2837 if (!NILP (Fcoding_system_p (coding_system)))
2838 return coding_system;
2839 while (1)
02ba4723 2840 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4ed46869
KH
2841}
2842
2843DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
2844 2, 2, 0,
2845 "Detect coding-system of the text in the region between START and END.\n\
2846Return a list of possible coding-systems ordered by priority.\n\
0ef69138 2847If only ASCII characters are found, it returns `undecided'\n\
4ed46869
KH
2848 or its subsidiary coding-system according to a detected end-of-line format.")
2849 (b, e)
2850 Lisp_Object b, e;
2851{
2852 int coding_mask, eol_type;
2853 Lisp_Object val;
2854 int beg, end;
2855
2856 validate_region (&b, &e);
2857 beg = XINT (b), end = XINT (e);
2858 if (beg < GPT && end >= GPT) move_gap (end);
2859
2860 coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
2861 eol_type = detect_eol_type (POS_ADDR (beg), end - beg);
2862
2863 if (coding_mask == CODING_CATEGORY_MASK_ANY)
2864 {
0ef69138
KH
2865 val = intern ("undecided");
2866 if (eol_type != CODING_EOL_UNDECIDED)
4ed46869
KH
2867 {
2868 Lisp_Object val2 = Fget (val, Qeol_type);
2869 if (VECTORP (val2))
2870 val = XVECTOR (val2)->contents[eol_type];
2871 }
2872 }
2873 else
2874 {
2875 Lisp_Object val2;
2876
2877 /* At first, gather possible coding-systems in VAL in a reverse
2878 order. */
2879 val = Qnil;
2880 for (val2 = Vcoding_category_list;
2881 !NILP (val2);
2882 val2 = XCONS (val2)->cdr)
2883 {
2884 int idx
2885 = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
2886 if (coding_mask & (1 << idx))
2887 val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
2888 }
2889
2890 /* Then, change the order of the list, while getting subsidiary
2891 coding-systems. */
2892 val2 = val;
2893 val = Qnil;
2894 for (; !NILP (val2); val2 = XCONS (val2)->cdr)
2895 {
0ef69138 2896 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2897 val = Fcons (XCONS (val2)->car, val);
2898 else
2899 {
2900 Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
2901 if (VECTORP (val3))
2902 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
2903 else
2904 val = Fcons (XCONS (val2)->car, val);
2905 }
2906 }
2907 }
2908
2909 return val;
2910}
2911
2912/* Scan text in the region between *BEGP and *ENDP, skip characters
2913 which we never have to encode to (iff ENCODEP is 1) or decode from
2914 coding system CODING at the head and tail, then set BEGP and ENDP
2915 to the addresses of start and end of the text we actually convert. */
2916
2917void
2918shrink_conversion_area (begp, endp, coding, encodep)
2919 unsigned char **begp, **endp;
2920 struct coding_system *coding;
2921 int encodep;
2922{
2923 register unsigned char *beg_addr = *begp, *end_addr = *endp;
2924
2925 if (coding->eol_type != CODING_EOL_LF
0ef69138 2926 && coding->eol_type != CODING_EOL_UNDECIDED)
4ed46869
KH
2927 /* Since we anyway have to convert end-of-line format, it is not
2928 worth skipping at most 100 bytes or so. */
2929 return;
2930
2931 if (encodep) /* for encoding */
2932 {
2933 switch (coding->type)
2934 {
2935 case coding_type_no_conversion:
0ef69138
KH
2936 case coding_type_emacs_mule:
2937 case coding_type_undecided:
4ed46869
KH
2938 /* We need no conversion. */
2939 *begp = *endp;
2940 return;
2941 case coding_type_ccl:
2942 /* We can't skip any data. */
2943 return;
e0e989f6
KH
2944 case coding_type_iso2022:
2945 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2946 {
2947 unsigned char *bol = beg_addr;
2948 while (beg_addr < end_addr && *beg_addr < 0x80)
2949 {
2950 beg_addr++;
2951 if (*(beg_addr - 1) == '\n')
2952 bol = beg_addr;
2953 }
2954 beg_addr = bol;
2955 goto label_skip_tail;
2956 }
2957 /* fall down ... */
4ed46869
KH
2958 default:
2959 /* We can skip all ASCII characters at the head and tail. */
2960 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
e0e989f6 2961 label_skip_tail:
4ed46869
KH
2962 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
2963 break;
2964 }
2965 }
2966 else /* for decoding */
2967 {
2968 switch (coding->type)
2969 {
2970 case coding_type_no_conversion:
2971 /* We need no conversion. */
2972 *begp = *endp;
2973 return;
0ef69138 2974 case coding_type_emacs_mule:
4ed46869
KH
2975 if (coding->eol_type == CODING_EOL_LF)
2976 {
2977 /* We need no conversion. */
2978 *begp = *endp;
2979 return;
2980 }
2981 /* We can skip all but carriage-return. */
2982 while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
2983 while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
2984 break;
2985 case coding_type_sjis:
2986 case coding_type_big5:
2987 /* We can skip all ASCII characters at the head. */
2988 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
2989 /* We can skip all ASCII characters at the tail except for
2990 the second byte of SJIS or BIG5 code. */
2991 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
2992 if (end_addr != *endp)
2993 end_addr++;
2994 break;
2995 case coding_type_ccl:
2996 /* We can't skip any data. */
2997 return;
2998 default: /* i.e. case coding_type_iso2022: */
2999 {
3000 unsigned char c;
3001
3002 /* We can skip all ASCII characters except for a few
3003 control codes at the head. */
3004 while (beg_addr < end_addr && (c = *beg_addr) < 0x80
3005 && c != ISO_CODE_CR && c != ISO_CODE_SO
3006 && c != ISO_CODE_SI && c != ISO_CODE_ESC)
3007 beg_addr++;
3008 }
3009 break;
3010 }
3011 }
3012 *begp = beg_addr;
3013 *endp = end_addr;
3014 return;
3015}
3016
3017/* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3018 text between B and E. B and E are buffer position. */
3019
3020Lisp_Object
3021code_convert_region (b, e, coding, encodep)
3022 Lisp_Object b, e;
3023 struct coding_system *coding;
3024 int encodep;
3025{
3026 int beg, end, len, consumed, produced;
3027 char *buf;
3028 unsigned char *begp, *endp;
3029 int pos = PT;
3030
3031 validate_region (&b, &e);
3032 beg = XINT (b), end = XINT (e);
3033 if (beg < GPT && end >= GPT)
3034 move_gap (end);
3035
3036 if (encodep && !NILP (coding->pre_write_conversion))
3037 {
3038 /* We must call a pre-conversion function which may put a new
3039 text to be converted in a new buffer. */
3040 struct buffer *old = current_buffer, *new;
3041
3042 TEMP_SET_PT (beg);
3043 call2 (coding->pre_write_conversion, b, e);
3044 if (old != current_buffer)
3045 {
3046 /* Replace the original text by the text just generated. */
3047 len = ZV - BEGV;
3048 new = current_buffer;
3049 set_buffer_internal (old);
3050 del_range (beg, end);
3051 insert_from_buffer (new, 1, len, 0);
3052 end = beg + len;
3053 }
3054 }
3055
3056 /* We may be able to shrink the conversion region. */
3057 begp = POS_ADDR (beg); endp = begp + (end - beg);
3058 shrink_conversion_area (&begp, &endp, coding, encodep);
3059
3060 if (begp == endp)
3061 /* We need no conversion. */
3062 len = end - beg;
3063 else
3064 {
3065 beg += begp - POS_ADDR (beg);
3066 end = beg + (endp - begp);
3067
3068 if (encodep)
3069 len = encoding_buffer_size (coding, end - beg);
3070 else
3071 len = decoding_buffer_size (coding, end - beg);
3072 buf = get_conversion_buffer (len);
3073
3074 coding->last_block = 1;
3075 produced = (encodep
3076 ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3077 &consumed)
3078 : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3079 &consumed));
3080
3081 len = produced + (beg - XINT (b)) + (XINT (e) - end);
3082
3083 TEMP_SET_PT (beg);
3084 insert (buf, produced);
3085 del_range (PT, PT + end - beg);
3086 if (pos >= end)
3087 pos = PT + (pos - end);
3088 else if (pos > beg)
3089 pos = beg;
3090 TEMP_SET_PT (pos);
3091 }
3092
3093 if (!encodep && !NILP (coding->post_read_conversion))
3094 {
3095 /* We must call a post-conversion function which may alter
3096 the text just converted. */
3097 Lisp_Object insval;
3098
3099 beg = XINT (b);
3100 TEMP_SET_PT (beg);
3101 insval = call1 (coding->post_read_conversion, make_number (len));
3102 CHECK_NUMBER (insval, 0);
3103 len = XINT (insval);
3104 }
3105
3106 return make_number (len);
3107}
3108
3109Lisp_Object
e0e989f6
KH
3110code_convert_string (str, coding, encodep, nocopy)
3111 Lisp_Object str, nocopy;
4ed46869
KH
3112 struct coding_system *coding;
3113 int encodep;
3114{
3115 int len, consumed, produced;
3116 char *buf;
3117 unsigned char *begp, *endp;
3118 int head_skip, tail_skip;
3119 struct gcpro gcpro1;
3120
3121 if (encodep && !NILP (coding->pre_write_conversion)
3122 || !encodep && !NILP (coding->post_read_conversion))
3123 {
3124 /* Since we have to call Lisp functions which assume target text
3125 is in a buffer, after setting a temporary buffer, call
3126 code_convert_region. */
3127 int count = specpdl_ptr - specpdl;
3128 int len = XSTRING (str)->size;
3129 Lisp_Object result;
3130 struct buffer *old = current_buffer;
3131
3132 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3133 temp_output_buffer_setup (" *code-converting-work*");
3134 set_buffer_internal (XBUFFER (Vstandard_output));
3135 insert_from_string (str, 0, len, 0);
3136 code_convert_region (make_number (BEGV), make_number (ZV),
3137 coding, encodep);
3138 result = make_buffer_string (BEGV, ZV, 0);
3139 set_buffer_internal (old);
3140 return unbind_to (count, result);
3141 }
3142
3143 /* We may be able to shrink the conversion region. */
3144 begp = XSTRING (str)->data;
3145 endp = begp + XSTRING (str)->size;
3146 shrink_conversion_area (&begp, &endp, coding, encodep);
3147
3148 if (begp == endp)
3149 /* We need no conversion. */
e0e989f6 3150 return (NILP (nocopy) ? Fcopy_sequence (str) : str);
4ed46869
KH
3151
3152 head_skip = begp - XSTRING (str)->data;
3153 tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3154
3155 GCPRO1 (str);
3156
3157 if (encodep)
3158 len = encoding_buffer_size (coding, endp - begp);
3159 else
3160 len = decoding_buffer_size (coding, endp - begp);
3161 buf = get_conversion_buffer (len + head_skip + tail_skip);
3162
3163 bcopy (XSTRING (str)->data, buf, head_skip);
3164 coding->last_block = 1;
3165 produced = (encodep
3166 ? encode_coding (coding, XSTRING (str)->data + head_skip,
3167 buf + head_skip, endp - begp, len, &consumed)
3168 : decode_coding (coding, XSTRING (str)->data + head_skip,
3169 buf + head_skip, endp - begp, len, &consumed));
3170 bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3171 buf + head_skip + produced,
3172 tail_skip);
3173
3174 UNGCPRO;
3175
3176 return make_string (buf, head_skip + produced + tail_skip);
3177}
3178
3179DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
e0e989f6
KH
3180 3, 3, "r\nzCoding system: ",
3181 "Decode current region by specified coding system.\n\
3182When called from a program, takes three arguments:\n\
3183START, END, and CODING-SYSTEM. START END are buffer positions.\n\
4ed46869
KH
3184Return length of decoded text.")
3185 (b, e, coding_system)
3186 Lisp_Object b, e, coding_system;
3187{
3188 struct coding_system coding;
3189
3190 CHECK_NUMBER_COERCE_MARKER (b, 0);
3191 CHECK_NUMBER_COERCE_MARKER (e, 1);
3192 CHECK_SYMBOL (coding_system, 2);
3193
e0e989f6
KH
3194 if (NILP (coding_system))
3195 return make_number (XFASTINT (e) - XFASTINT (b));
4ed46869
KH
3196 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3197 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3198
3199 return code_convert_region (b, e, &coding, 0);
3200}
3201
3202DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
e0e989f6
KH
3203 3, 3, "r\nzCoding system: ",
3204 "Encode current region by specified coding system.\n\
3205When called from a program, takes three arguments:\n\
3206START, END, and CODING-SYSTEM. START END are buffer positions.\n\
4ed46869
KH
3207Return length of encoded text.")
3208 (b, e, coding_system)
3209 Lisp_Object b, e, coding_system;
3210{
3211 struct coding_system coding;
3212
3213 CHECK_NUMBER_COERCE_MARKER (b, 0);
3214 CHECK_NUMBER_COERCE_MARKER (e, 1);
3215 CHECK_SYMBOL (coding_system, 2);
3216
e0e989f6
KH
3217 if (NILP (coding_system))
3218 return make_number (XFASTINT (e) - XFASTINT (b));
4ed46869
KH
3219 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3220 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3221
3222 return code_convert_region (b, e, &coding, 1);
3223}
3224
3225DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
e0e989f6
KH
3226 2, 3, 0,
3227 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3228Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3229of decoding.")
3230 (string, coding_system, nocopy)
3231 Lisp_Object string, coding_system, nocopy;
4ed46869
KH
3232{
3233 struct coding_system coding;
3234
3235 CHECK_STRING (string, 0);
3236 CHECK_SYMBOL (coding_system, 1);
3237
e0e989f6
KH
3238 if (NILP (coding_system))
3239 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869
KH
3240 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3241 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3242
e0e989f6 3243 return code_convert_string (string, &coding, 0, nocopy);
4ed46869
KH
3244}
3245
3246DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
e0e989f6
KH
3247 2, 3, 0,
3248 "Encode STRING to CODING-SYSTEM, and return the result.\n\
3249Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3250of encoding.")
3251 (string, coding_system, nocopy)
3252 Lisp_Object string, coding_system, nocopy;
4ed46869
KH
3253{
3254 struct coding_system coding;
3255
3256 CHECK_STRING (string, 0);
3257 CHECK_SYMBOL (coding_system, 1);
3258
e0e989f6
KH
3259 if (NILP (coding_system))
3260 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869
KH
3261 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3262 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3263
e0e989f6 3264 return code_convert_string (string, &coding, 1, nocopy);
4ed46869
KH
3265}
3266
3267DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
e0e989f6 3268 "Decode a JISX0208 character of shift-jis encoding.\n\
4ed46869
KH
3269CODE is the character code in SJIS.\n\
3270Return the corresponding character.")
3271 (code)
3272 Lisp_Object code;
3273{
3274 unsigned char c1, c2, s1, s2;
3275 Lisp_Object val;
3276
3277 CHECK_NUMBER (code, 0);
3278 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3279 DECODE_SJIS (s1, s2, c1, c2);
3280 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3281 return val;
3282}
3283
3284DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3285 "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3286Return the corresponding character code in SJIS.")
3287 (ch)
3288 Lisp_Object ch;
3289{
bcf26d6a 3290 int charset, c1, c2, s1, s2;
4ed46869
KH
3291 Lisp_Object val;
3292
3293 CHECK_NUMBER (ch, 0);
3294 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3295 if (charset == charset_jisx0208)
3296 {
3297 ENCODE_SJIS (c1, c2, s1, s2);
bcf26d6a 3298 XSETFASTINT (val, (s1 << 8) | s2);
4ed46869
KH
3299 }
3300 else
3301 XSETFASTINT (val, 0);
3302 return val;
3303}
3304
3305DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3306 "Decode a Big5 character CODE of BIG5 coding-system.\n\
3307CODE is the character code in BIG5.\n\
3308Return the corresponding character.")
3309 (code)
3310 Lisp_Object code;
3311{
3312 int charset;
3313 unsigned char b1, b2, c1, c2;
3314 Lisp_Object val;
3315
3316 CHECK_NUMBER (code, 0);
3317 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3318 DECODE_BIG5 (b1, b2, charset, c1, c2);
3319 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3320 return val;
3321}
3322
3323DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3324 "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3325Return the corresponding character code in Big5.")
3326 (ch)
3327 Lisp_Object ch;
3328{
bcf26d6a 3329 int charset, c1, c2, b1, b2;
4ed46869
KH
3330 Lisp_Object val;
3331
3332 CHECK_NUMBER (ch, 0);
3333 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3334 if (charset == charset_big5_1 || charset == charset_big5_2)
3335 {
3336 ENCODE_BIG5 (charset, c1, c2, b1, b2);
bcf26d6a 3337 XSETFASTINT (val, (b1 << 8) | b2);
4ed46869
KH
3338 }
3339 else
3340 XSETFASTINT (val, 0);
3341 return val;
3342}
3343
1ba9e4ab
KH
3344DEFUN ("set-terminal-coding-system-internal",
3345 Fset_terminal_coding_system_internal,
3346 Sset_terminal_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
3347 (coding_system)
3348 Lisp_Object coding_system;
3349{
3350 CHECK_SYMBOL (coding_system, 0);
3351 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
4ed46869
KH
3352 return Qnil;
3353}
3354
3355DEFUN ("terminal-coding-system",
3356 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3357 "Return coding-system of your terminal.")
3358 ()
3359{
3360 return terminal_coding.symbol;
3361}
3362
1ba9e4ab
KH
3363DEFUN ("set-keyboard-coding-system-internal",
3364 Fset_keyboard_coding_system_internal,
3365 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
3366 (coding_system)
3367 Lisp_Object coding_system;
3368{
3369 CHECK_SYMBOL (coding_system, 0);
3370 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3371 return Qnil;
3372}
3373
3374DEFUN ("keyboard-coding-system",
3375 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3376 "Return coding-system of what is sent from terminal keyboard.")
3377 ()
3378{
3379 return keyboard_coding.symbol;
3380}
3381
3382\f
3383DEFUN ("find-coding-system", Ffind_coding_system, Sfind_coding_system,
3384 1, MANY, 0,
ccdb79f5
RS
3385 "Choose a coding system for a file operation based on file name.\n\
3386The value names a pair of coding systems: (ENCODING-SYSTEM DECODING-SYSTEM).\n\
3387ENCODING-SYSTEM is the coding system to use for encoding\n\
3388\(in case OPERATION does encoding), and DECODING-SYSTEM is the coding system\n\
3389for decoding (in case OPERATION does decoding).\n\
3390\n\
3391The first argument OPERATION specifies an I/O primitive:\n\
3392 For file I/O, `insert-file-contents' or `write-region'.\n\
3393 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3394 For network I/O, `open-network-stream'.\n\
3395\n\
3396The remaining arguments should be the same arguments that were passed\n\
3397to the primitive. Depending on which primitive, one of those arguments\n\
3398is selected as the TARGET. For example, if OPERATION does file I/O,\n\
3399whichever argument specifies the file name is TARGET.\n\
3400\n\
3401TARGET has a meaning which depends on OPERATION:\n\
4ed46869
KH
3402 For file I/O, TARGET is a file name.\n\
3403 For process I/O, TARGET is a process name.\n\
3404 For network I/O, TARGET is a service name or a port number\n\
3405\n\
02ba4723
KH
3406This function looks up what specified for TARGET in,\n\
3407`file-coding-system-alist', `process-coding-system-alist',\n\
3408or `network-coding-system-alist' depending on OPERATION.\n\
3409They may specify a coding system, a cons of coding systems,\n\
3410or a function symbol to call.\n\
3411In the last case, we call the function with one argument,\n\
ccdb79f5 3412which is a list of all the arguments given to `find-coding-system'.")
4ed46869
KH
3413 (nargs, args)
3414 int nargs;
3415 Lisp_Object *args;
3416{
3417 Lisp_Object operation, target_idx, target, val;
3418 register Lisp_Object chain;
3419
3420 if (nargs < 2)
3421 error ("Too few arguments");
3422 operation = args[0];
3423 if (!SYMBOLP (operation)
3424 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3425 error ("Invalid first arguement");
3426 if (nargs < 1 + XINT (target_idx))
3427 error ("Too few arguments for operation: %s",
3428 XSYMBOL (operation)->name->data);
3429 target = args[XINT (target_idx) + 1];
3430 if (!(STRINGP (target)
3431 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3432 error ("Invalid %dth argument", XINT (target_idx) + 1);
3433
02ba4723
KH
3434 chain = (operation == Qinsert_file_contents || operation == Qwrite_region
3435 ? Vfile_coding_system_alist
3436 : (operation == Qopen_network_stream
3437 ? Vnetwork_coding_system_alist
3438 : Vprocess_coding_system_alist));
4ed46869
KH
3439 if (NILP (chain))
3440 return Qnil;
3441
02ba4723 3442 for (; CONSP (chain); chain = XCONS (chain)->cdr)
4ed46869
KH
3443 {
3444 Lisp_Object elt = XCONS (chain)->car;
3445
3446 if (CONSP (elt)
3447 && ((STRINGP (target)
3448 && STRINGP (XCONS (elt)->car)
3449 && fast_string_match (XCONS (elt)->car, target) >= 0)
3450 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
02ba4723
KH
3451 {
3452 val = XCONS (elt)->cdr;
3453 if (CONSP (val))
3454 return val;
3455 if (! SYMBOLP (val))
3456 return Qnil;
3457 if (! NILP (Fcoding_system_p (val)))
3458 return Fcons (val, val);
3459 if (!NILP (Fboundp (val)))
3460 return call2 (val, Flist (nargs, args));
3461 return Qnil;
3462 }
4ed46869
KH
3463 }
3464 return Qnil;
3465}
3466
3467#endif /* emacs */
3468
3469\f
3470/*** 8. Post-amble ***/
3471
3472init_coding_once ()
3473{
3474 int i;
3475
0ef69138 3476 /* Emacs' internal format specific initialize routine. */
4ed46869
KH
3477 for (i = 0; i <= 0x20; i++)
3478 emacs_code_class[i] = EMACS_control_code;
3479 emacs_code_class[0x0A] = EMACS_linefeed_code;
3480 emacs_code_class[0x0D] = EMACS_carriage_return_code;
3481 for (i = 0x21 ; i < 0x7F; i++)
3482 emacs_code_class[i] = EMACS_ascii_code;
3483 emacs_code_class[0x7F] = EMACS_control_code;
3484 emacs_code_class[0x80] = EMACS_leading_code_composition;
3485 for (i = 0x81; i < 0xFF; i++)
3486 emacs_code_class[i] = EMACS_invalid_code;
3487 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3488 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3489 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3490 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3491
3492 /* ISO2022 specific initialize routine. */
3493 for (i = 0; i < 0x20; i++)
3494 iso_code_class[i] = ISO_control_code;
3495 for (i = 0x21; i < 0x7F; i++)
3496 iso_code_class[i] = ISO_graphic_plane_0;
3497 for (i = 0x80; i < 0xA0; i++)
3498 iso_code_class[i] = ISO_control_code;
3499 for (i = 0xA1; i < 0xFF; i++)
3500 iso_code_class[i] = ISO_graphic_plane_1;
3501 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3502 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3503 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3504 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3505 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3506 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3507 iso_code_class[ISO_CODE_ESC] = ISO_escape;
3508 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3509 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3510 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3511
e0e989f6
KH
3512 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3513 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3514
3515 setup_coding_system (Qnil, &keyboard_coding);
3516 setup_coding_system (Qnil, &terminal_coding);
3517}
3518
3519#ifdef emacs
3520
3521syms_of_coding ()
3522{
3523 Qtarget_idx = intern ("target-idx");
3524 staticpro (&Qtarget_idx);
3525
3526 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3527 Fput (Qwrite_region, Qtarget_idx, make_number (2));
3528
3529 Qcall_process = intern ("call-process");
3530 staticpro (&Qcall_process);
3531 Fput (Qcall_process, Qtarget_idx, make_number (0));
3532
3533 Qcall_process_region = intern ("call-process-region");
3534 staticpro (&Qcall_process_region);
3535 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3536
3537 Qstart_process = intern ("start-process");
3538 staticpro (&Qstart_process);
3539 Fput (Qstart_process, Qtarget_idx, make_number (2));
3540
3541 Qopen_network_stream = intern ("open-network-stream");
3542 staticpro (&Qopen_network_stream);
3543 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3544
4ed46869
KH
3545 Qcoding_system = intern ("coding-system");
3546 staticpro (&Qcoding_system);
3547
3548 Qeol_type = intern ("eol-type");
3549 staticpro (&Qeol_type);
3550
3551 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3552 staticpro (&Qbuffer_file_coding_system);
3553
3554 Qpost_read_conversion = intern ("post-read-conversion");
3555 staticpro (&Qpost_read_conversion);
3556
3557 Qpre_write_conversion = intern ("pre-write-conversion");
3558 staticpro (&Qpre_write_conversion);
3559
02ba4723
KH
3560 Qcoding_system_spec = intern ("coding-system-spec");
3561 staticpro (&Qcoding_system_spec);
4ed46869
KH
3562
3563 Qcoding_system_p = intern ("coding-system-p");
3564 staticpro (&Qcoding_system_p);
3565
3566 Qcoding_system_error = intern ("coding-system-error");
3567 staticpro (&Qcoding_system_error);
3568
3569 Fput (Qcoding_system_error, Qerror_conditions,
3570 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3571 Fput (Qcoding_system_error, Qerror_message,
3572 build_string ("Coding-system error"));
3573
3574 Qcoding_category_index = intern ("coding-category-index");
3575 staticpro (&Qcoding_category_index);
3576
3577 {
3578 int i;
3579 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3580 {
3581 coding_category_table[i] = intern (coding_category_name[i]);
3582 staticpro (&coding_category_table[i]);
3583 Fput (coding_category_table[i], Qcoding_category_index,
3584 make_number (i));
3585 }
3586 }
3587
bdd9fb48
KH
3588 Qcharacter_unification_table = intern ("character-unification-table");
3589 staticpro (&Qcharacter_unification_table);
3590 Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
3591 make_number (0));
3592
02ba4723 3593 defsubr (&Scoding_system_spec);
4ed46869
KH
3594 defsubr (&Scoding_system_p);
3595 defsubr (&Sread_coding_system);
3596 defsubr (&Sread_non_nil_coding_system);
3597 defsubr (&Scheck_coding_system);
3598 defsubr (&Sdetect_coding_region);
3599 defsubr (&Sdecode_coding_region);
3600 defsubr (&Sencode_coding_region);
3601 defsubr (&Sdecode_coding_string);
3602 defsubr (&Sencode_coding_string);
3603 defsubr (&Sdecode_sjis_char);
3604 defsubr (&Sencode_sjis_char);
3605 defsubr (&Sdecode_big5_char);
3606 defsubr (&Sencode_big5_char);
1ba9e4ab 3607 defsubr (&Sset_terminal_coding_system_internal);
4ed46869 3608 defsubr (&Sterminal_coding_system);
1ba9e4ab 3609 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869
KH
3610 defsubr (&Skeyboard_coding_system);
3611 defsubr (&Sfind_coding_system);
3612
3613 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3614 "List of coding-categories (symbols) ordered by priority.");
3615 {
3616 int i;
3617
3618 Vcoding_category_list = Qnil;
3619 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3620 Vcoding_category_list
3621 = Fcons (coding_category_table[i], Vcoding_category_list);
3622 }
3623
3624 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
3625 "A variable of internal use only.\n\
3626If the value is a coding system, it is used for decoding on read operation.\n\
3627If not, an appropriate element in `coding-system-alist' (which see) is used.");
3628 Vcoding_system_for_read = Qnil;
3629
3630 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
3631 "A variable of internal use only.\n\
3632If the value is a coding system, it is used for encoding on write operation.\n\
3633If not, an appropriate element in `coding-system-alist' (which see) is used.");
3634 Vcoding_system_for_write = Qnil;
3635
3636 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
3637 "Coding-system used in the latest file or process I/O.");
3638 Vlast_coding_system_used = Qnil;
3639
02ba4723
KH
3640 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
3641 "Alist to decide a coding system to use for a file I/O operation.\n\
3642The format is ((PATTERN . VAL) ...),\n\
3643where PATTERN is a regular expression matching a file name,\n\
3644VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3645If VAL is a coding system, it is used for both decoding and encoding\n\
3646the file contents.\n\
3647If VAL is a cons of coding systems, the car part is used for decoding,\n\
3648and the cdr part is used for encoding.\n\
3649If VAL is a function symbol, the function must return a coding system\n\
3650or a cons of coding systems which are used as above.\n\
e0e989f6 3651\n\
02ba4723
KH
3652See also the function `find-coding-system'.");
3653 Vfile_coding_system_alist = Qnil;
3654
3655 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
3656 "Alist to decide a coding system to use for a process I/O operation.\n\
3657The format is ((PATTERN . VAL) ...),\n\
3658where PATTERN is a regular expression matching a program name,\n\
3659VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3660If VAL is a coding system, it is used for both decoding what received\n\
3661from the program and encoding what sent to the program.\n\
3662If VAL is a cons of coding systems, the car part is used for decoding,\n\
3663and the cdr part is used for encoding.\n\
3664If VAL is a function symbol, the function must return a coding system\n\
3665or a cons of coding systems which are used as above.\n\
4ed46869 3666\n\
02ba4723
KH
3667See also the function `find-coding-system'.");
3668 Vprocess_coding_system_alist = Qnil;
3669
3670 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
3671 "Alist to decide a coding system to use for a network I/O operation.\n\
3672The format is ((PATTERN . VAL) ...),\n\
3673where PATTERN is a regular expression matching a network service name\n\
3674or is a port number to connect to,\n\
3675VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3676If VAL is a coding system, it is used for both decoding what received\n\
3677from the network stream and encoding what sent to the network stream.\n\
3678If VAL is a cons of coding systems, the car part is used for decoding,\n\
3679and the cdr part is used for encoding.\n\
3680If VAL is a function symbol, the function must return a coding system\n\
3681or a cons of coding systems which are used as above.\n\
4ed46869 3682\n\
02ba4723
KH
3683See also the function `find-coding-system'.");
3684 Vnetwork_coding_system_alist = Qnil;
4ed46869
KH
3685
3686 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
3687 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
3688 eol_mnemonic_unix = '.';
3689
3690 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
3691 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
3692 eol_mnemonic_dos = ':';
3693
3694 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
3695 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
3696 eol_mnemonic_mac = '\'';
3697
3698 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
3699 "Mnemonic character indicating end-of-line format is not yet decided.");
3700 eol_mnemonic_undecided = '-';
3701
bdd9fb48
KH
3702 DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
3703 "Non-nil means ISO 2022 encoder/decoder do character unification.");
3704 Venable_character_unification = Qt;
3705
3706 DEFVAR_LISP ("standard-character-unification-table-for-read",
3707 &Vstandard_character_unification_table_for_read,
3708 "Table for unifying characters when reading.");
3709 Vstandard_character_unification_table_for_read = Qnil;
3710
3711 DEFVAR_LISP ("standard-character-unification-table-for-write",
3712 &Vstandard_character_unification_table_for_write,
3713 "Table for unifying characters when writing.");
3714 Vstandard_character_unification_table_for_write = Qnil;
4ed46869
KH
3715
3716 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
3717 "Alist of charsets vs revision numbers.\n\
3718While encoding, if a charset (car part of an element) is found,\n\
3719designate it with the escape sequence identifing revision (cdr part of the element).");
3720 Vcharset_revision_alist = Qnil;
02ba4723
KH
3721
3722 DEFVAR_LISP ("default-process-coding-system",
3723 &Vdefault_process_coding_system,
3724 "Cons of coding systems used for process I/O by default.\n\
3725The car part is used for decoding a process output,\n\
3726the cdr part is used for encoding a text to be sent to a process.");
3727 Vdefault_process_coding_system = Qnil;
4ed46869
KH
3728}
3729
3730#endif /* emacs */