(Qemacs_mule): Extern it.
[bpt/emacs.git] / src / coding.c
CommitLineData
4ed46869 1/* Coding system handler (conversion, detection, and etc).
203cb916
RS
2 Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4ed46869 4
369314dc
KH
5This file is part of GNU Emacs.
6
7GNU Emacs is free software; you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation; either version 2, or (at your option)
10any later version.
4ed46869 11
369314dc
KH
12GNU Emacs is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
4ed46869 16
369314dc
KH
17You should have received a copy of the GNU General Public License
18along with GNU Emacs; see the file COPYING. If not, write to
19the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20Boston, MA 02111-1307, USA. */
4ed46869
KH
21
22/*** TABLE OF CONTENTS ***
23
24 1. Preamble
0ef69138 25 2. Emacs' internal format (emacs-mule) handlers
4ed46869
KH
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
28 5. End-of-line handlers
29 6. C library functions
30 7. Emacs Lisp library functions
31 8. Post-amble
32
33*/
34
35/*** GENERAL NOTE on CODING SYSTEM ***
36
37 Coding system is an encoding mechanism of one or more character
38 sets. Here's a list of coding systems which Emacs can handle. When
39 we say "decode", it means converting some other coding system to
0ef69138
KH
40 Emacs' internal format (emacs-mule), and when we say "encode",
41 it means converting the coding system emacs-mule to some other
42 coding system.
4ed46869 43
0ef69138 44 0. Emacs' internal format (emacs-mule)
4ed46869
KH
45
46 Emacs itself holds a multi-lingual character in a buffer and a string
47 in a special format. Details are described in the section 2.
48
49 1. ISO2022
50
51 The most famous coding system for multiple character sets. X's
52 Compound Text, various EUCs (Extended Unix Code), and such coding
53 systems used in Internet communication as ISO-2022-JP are all
54 variants of ISO2022. Details are described in the section 3.
55
56 2. SJIS (or Shift-JIS or MS-Kanji-Code)
57
58 A coding system to encode character sets: ASCII, JISX0201, and
59 JISX0208. Widely used for PC's in Japan. Details are described in
60 the section 4.
61
62 3. BIG5
63
64 A coding system to encode character sets: ASCII and Big5. Widely
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
66 described in the section 4. In this file, when written as "BIG5"
67 (all uppercase), it means the coding system, and when written as
68 "Big5" (capitalized), it means the character set.
69
70 4. Else
71
72 If a user wants to read/write a text encoded in a coding system not
73 listed above, he can supply a decoder and an encoder for it in CCL
74 (Code Conversion Language) programs. Emacs executes the CCL program
75 while reading/writing.
76
77 Emacs represents a coding-system by a Lisp symbol that has a property
78 `coding-system'. But, before actually using the coding-system, the
79 information about it is set in a structure of type `struct
80 coding_system' for rapid processing. See the section 6 for more
81 detail.
82
83*/
84
85/*** GENERAL NOTES on END-OF-LINE FORMAT ***
86
87 How end-of-line of a text is encoded depends on a system. For
88 instance, Unix's format is just one byte of `line-feed' code,
89 whereas DOS's format is two bytes sequence of `carriage-return' and
90 `line-feed' codes. MacOS's format is one byte of `carriage-return'.
91
92 Since how characters in a text are encoded and how end-of-line is
93 encoded are independent, any coding system described above can take
94 any format of end-of-line. So, Emacs has information of format of
95 end-of-line in each coding-system. See the section 6 for more
96 detail.
97
98*/
99
100/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
101
102 These functions check if a text between SRC and SRC_END is encoded
103 in the coding system category XXX. Each returns an integer value in
104 which appropriate flag bits for the category XXX are set. The flag
105 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
106 template of these functions. */
107#if 0
108int
0ef69138 109detect_coding_emacs_mule (src, src_end)
4ed46869
KH
110 unsigned char *src, *src_end;
111{
112 ...
113}
114#endif
115
116/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
117
118 These functions decode SRC_BYTES length text at SOURCE encoded in
0ef69138
KH
119 CODING to Emacs' internal format (emacs-mule). The resulting text
120 goes to a place pointed by DESTINATION, the length of which should
121 not exceed DST_BYTES. The number of bytes actually processed is returned as
122 *CONSUMED. The return value is the length of the decoded text.
123 Below is a template of these functions. */
4ed46869
KH
124#if 0
125decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
126 struct coding_system *coding;
127 unsigned char *source, *destination;
128 int src_bytes, dst_bytes;
129 int *consumed;
130{
131 ...
132}
133#endif
134
135/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
136
0ef69138
KH
137 These functions encode SRC_BYTES length text at SOURCE of Emacs'
138 internal format (emacs-mule) to CODING. The resulting text goes to
139 a place pointed by DESTINATION, the length of which should not
140 exceed DST_BYTES. The number of bytes actually processed is returned as
141 *CONSUMED. The return value is the length of the encoded text.
142 Below is a template of these functions. */
4ed46869
KH
143#if 0
144encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
145 struct coding_system *coding;
146 unsigned char *source, *destination;
147 int src_bytes, dst_bytes;
148 int *consumed;
149{
150 ...
151}
152#endif
153
154/*** COMMONLY USED MACROS ***/
155
/* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
   THREE_MORE_BYTES safely get one, two, and three bytes from the
   source text respectively.  If there are not enough bytes in the
   source, they jump to `label_end_of_loop' without consuming any
   byte and without storing into their arguments.  The caller should
   set variables `src' and `src_end' to appropriate areas in advance
   and must provide the label `label_end_of_loop'.

   The number of remaining bytes is computed as `src_end - src'
   instead of comparing `src + N' against `src_end': when fewer than
   N bytes remain, `src + N' would point more than one element past
   the end of the buffer, and merely forming such a pointer is
   undefined behavior in C.  */

#define ONE_MORE_BYTE(c1)                       \
  do {                                          \
    if (src < src_end)                          \
      (c1) = *src++;                            \
    else                                        \
      goto label_end_of_loop;                   \
  } while (0)

#define TWO_MORE_BYTES(c1, c2)                  \
  do {                                          \
    if (src_end - src >= 2)                     \
      (c1) = *src++, (c2) = *src++;             \
    else                                        \
      goto label_end_of_loop;                   \
  } while (0)

#define THREE_MORE_BYTES(c1, c2, c3)                    \
  do {                                                  \
    if (src_end - src >= 3)                             \
      (c1) = *src++, (c2) = *src++, (c3) = *src++;      \
    else                                                \
      goto label_end_of_loop;                           \
  } while (0)
185
/* The following three macros DECODE_CHARACTER_ASCII,
   DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
   the multi-byte form of a character of each class at the place
   pointed by `dst' and advance `dst' past the bytes written.  The
   caller should set the variable `dst' to point to an appropriate
   area and the variable `coding' to point to the coding-system of
   the currently decoding text in advance.  */

/* Decode one ASCII character C.  While composing, C is written as
   the two bytes 0xA0 and C | 0x80; otherwise C is written as is.  */

#define DECODE_CHARACTER_ASCII(c)               \
  do {                                          \
    if (COMPOSING_P (coding->composing))        \
      *dst++ = 0xA0, *dst++ = (c) | 0x80;       \
    else                                        \
      *dst++ = (c);                             \
  } while (0)

/* Decode one DIMENSION1 character of which charset is CHARSET and
   position-code is C.  The base leading-code is written first (plus
   0x20 while composing), then the extended leading-code if CHARSET
   has a non-zero one, then C | 0x80.  */

#define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
  do {                                                                  \
    unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
    if (COMPOSING_P (coding->composing))                                \
      *dst++ = leading_code + 0x20;                                     \
    else                                                                \
      *dst++ = leading_code;                                            \
    /* Fetch the extended leading-code; zero means there is none.  */   \
    leading_code = CHARSET_LEADING_CODE_EXT (charset);                  \
    if (leading_code != 0)                                              \
      *dst++ = leading_code;                                            \
    *dst++ = (c) | 0x80;                                                \
  } while (0)

/* Decode one DIMENSION2 character of which charset is CHARSET and
   position-codes are C1 and C2.  */

#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
  do {                                                  \
    DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
    *dst++ = (c2) | 0x80;                               \
  } while (0)
226
227\f
228/*** 1. Preamble ***/
229
230#include <stdio.h>
231
232#ifdef emacs
233
234#include <config.h>
235#include "lisp.h"
236#include "buffer.h"
237#include "charset.h"
238#include "ccl.h"
239#include "coding.h"
240#include "window.h"
241
242#else /* not emacs */
243
244#include "mulelib.h"
245
246#endif /* not emacs */
247
/* Lisp_Object variables named after the Lisp symbols this file uses.
   NOTE(review): by Emacs naming convention `Q...' variables hold
   interned symbols; where they are initialized is not visible in this
   part of the file -- confirm.  */
248Lisp_Object Qcoding_system, Qeol_type;
249Lisp_Object Qbuffer_file_coding_system;
250Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
251
/* Qinsert_file_contents and Qwrite_region are only declared here
   (extern); their definitions live outside this file.  */
252extern Lisp_Object Qinsert_file_contents, Qwrite_region;
253Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
254Lisp_Object Qstart_process, Qopen_network_stream;
255Lisp_Object Qtarget_idx;
256
257/* Mnemonic character for each end-of-line format (Unix, DOS, Mac). */
258int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
259/* Mnemonic character indicating that the end-of-line format is not
260 yet decided. */
261int eol_mnemonic_undecided;
262
/* The declarations below exist only when building Emacs itself.  */
263#ifdef emacs
264
02ba4723 265Lisp_Object Qcoding_system_spec, Qcoding_system_p, Qcoding_system_error;
4ed46869
KH
266
267/* Coding systems are passed between Emacs Lisp programs and C internal
268 routines through the following three variables. */
269/* Coding system for reading files and receiving data from a process. */
270Lisp_Object Vcoding_system_for_read;
271/* Coding system for writing files and sending data to a process. */
272Lisp_Object Vcoding_system_for_write;
273/* Coding system actually used in the latest I/O. */
274Lisp_Object Vlast_coding_system_used;
275
276/* Coding system that the terminal accepts for display. */
277struct coding_system terminal_coding;
278
279/* Coding system of what is sent from the terminal keyboard. */
280struct coding_system keyboard_coding;
281
02ba4723
KH
/* NOTE(review): judging by the names only, these are alists used to
   select coding systems for files, processes, and network streams;
   the code that consults them is not visible here -- confirm.  */
282Lisp_Object Vfile_coding_system_alist;
283Lisp_Object Vprocess_coding_system_alist;
284Lisp_Object Vnetwork_coding_system_alist;
4ed46869
KH
285
286#endif /* emacs */
287
288Lisp_Object Qcoding_category_index;
289
290/* List of symbols `coding-category-xxx' ordered by priority. */
291Lisp_Object Vcoding_category_list;
292
293/* Table of coding systems currently assigned to each coding category;
   it has CODING_CATEGORY_IDX_MAX entries. */
294Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
295
296/* Table of the symbol name of each coding category; it has
   CODING_CATEGORY_IDX_MAX entries, of which eight are initialized
   here.  NOTE(review): the order presumably has to match the
   category index values defined in `coding.h' -- confirm. */
297char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
0ef69138 298 "coding-category-emacs-mule",
4ed46869
KH
299 "coding-category-sjis",
300 "coding-category-iso-7",
301 "coding-category-iso-8-1",
302 "coding-category-iso-8-2",
303 "coding-category-iso-else",
304 "coding-category-big5",
305 "coding-category-binary"
306};
307
bdd9fb48
KH
308/* Flag telling whether a unification table is looked up on character
309 code conversion.  When it is non-nil and a coding system has no
   table of its own, decode_coding_iso2022 falls back to the standard
   decode table below. */
310Lisp_Object Venable_character_unification;
a5d301df
KH
311/* Standard unification table to look up on decoding (reading). */
312Lisp_Object Vstandard_character_unification_table_for_decode;
313/* Standard unification table to look up on encoding (writing). */
314Lisp_Object Vstandard_character_unification_table_for_encode;
bdd9fb48
KH
315
316Lisp_Object Qcharacter_unification_table;
a5d301df
KH
317Lisp_Object Qcharacter_unification_table_for_decode;
318Lisp_Object Qcharacter_unification_table_for_encode;
4ed46869
KH
319
320/* Alist of charsets vs. revision numbers. */
321Lisp_Object Vcharset_revision_alist;
322
02ba4723
KH
323/* Default coding systems used for process I/O. */
324Lisp_Object Vdefault_process_coding_system;
325
4ed46869 326\f
0ef69138 327/*** 2. Emacs internal format (emacs-mule) handlers ***/
4ed46869
KH
328
329/* Emacs' internal format for encoding multiple character sets is a
330 kind of multi-byte encoding, i.e. encoding a character by a sequence
331 of one-byte codes of variable length. ASCII characters and control
332 characters (e.g. `tab', `newline') are represented by one-byte as
333 is. It takes the range 0x00 through 0x7F. The other characters
334 are represented by a sequence of `base leading-code', optional
335 `extended leading-code', and one or two `position-code's. Length
336 of the sequence is decided by the base leading-code. Leading-code
337 takes the range 0x80 through 0x9F, whereas extended leading-code
338 and position-code take the range 0xA0 through 0xFF. See the
339 document of `charset.h' for more detail about leading-code and
340 position-code.
341
342 There's one exception in this rule. Special leading-code
343 `leading-code-composition' denotes that the following several
344 characters should be composed into one character. Leading-codes of
345 components (except for ASCII) are added 0x20. An ASCII character
346 component is represented by a 2-byte sequence of `0xA0' and
347 `ASCII-code + 0x80'. See also the document in `charset.h' for the
348 detail of composite character. Hence, we can summarize the code
349 range as follows:
350
351 --- CODE RANGE of Emacs' internal format ---
352 (character set) (range)
353 ASCII 0x00 .. 0x7F
354 ELSE (1st byte) 0x80 .. 0x9F
355 (rest bytes) 0xA0 .. 0xFF
356 ---------------------------------------------
357
358 */
359
/* Class of each byte value in Emacs' internal format, indexed by the
   byte value (detect_coding_emacs_mule dispatches on it).  */
360enum emacs_code_class_type emacs_code_class[256];
361
362/* Go to the next statement only if *SRC is accessible and the code is
363 in the range 0xA0..0xFF (i.e. not less than 0xA0); SRC is advanced
   past the code.  If SRC has already reached SRC_END, jump to
   `label_end_of_switch' instead.  If the code is less than 0xA0,
   make the enclosing function return 0.  The user of this macro must
   therefore provide the variables `src' and `src_end' and the label
   `label_end_of_switch'. */
364#define CHECK_CODE_RANGE_A0_FF \
365 do { \
366 if (src >= src_end) \
367 goto label_end_of_switch; \
368 else if (*src++ < 0xA0) \
369 return 0; \
370 } while (0)
371
372/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
373 Check if a text is encoded in Emacs' internal format. If it is,
0ef69138 374 return CODING_CATEGORY_MASK_EMASC_MULE, else return 0. */
4ed46869
KH
375
376int
0ef69138 377detect_coding_emacs_mule (src, src_end)
4ed46869
KH
378 unsigned char *src, *src_end;
379{
380 unsigned char c;
381 int composing = 0;
382
383 while (src < src_end)
384 {
385 c = *src++;
386
387 if (composing)
388 {
389 if (c < 0xA0)
390 composing = 0;
391 else
392 c -= 0x20;
393 }
394
395 switch (emacs_code_class[c])
396 {
397 case EMACS_ascii_code:
398 case EMACS_linefeed_code:
399 break;
400
401 case EMACS_control_code:
402 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
403 return 0;
404 break;
405
406 case EMACS_invalid_code:
407 return 0;
408
409 case EMACS_leading_code_composition: /* c == 0x80 */
410 if (composing)
411 CHECK_CODE_RANGE_A0_FF;
412 else
413 composing = 1;
414 break;
415
416 case EMACS_leading_code_4:
417 CHECK_CODE_RANGE_A0_FF;
418 /* fall down to check it two more times ... */
419
420 case EMACS_leading_code_3:
421 CHECK_CODE_RANGE_A0_FF;
422 /* fall down to check it one more time ... */
423
424 case EMACS_leading_code_2:
425 CHECK_CODE_RANGE_A0_FF;
426 break;
427
428 default:
429 label_end_of_switch:
430 break;
431 }
432 }
0ef69138 433 return CODING_CATEGORY_MASK_EMACS_MULE;
4ed46869
KH
434}
435
436\f
437/*** 3. ISO2022 handlers ***/
438
439/* The following note describes the coding system ISO2022 briefly.
440 Since the intention of this note is to help understanding of the
441 programs in this file, some parts are NOT ACCURATE or OVERLY
442 SIMPLIFIED. For the thorough understanding, please refer to the
443 original document of ISO2022.
444
445 ISO2022 provides many mechanisms to encode several character sets
446 in 7-bit and 8-bit environment. If one chooses the 7-bit environment,
447 all text is encoded by codes of less than 128. This may make the
448 encoded text a little bit longer, but the text gets more stability
449 to pass through several gateways (some of them split MSB off).
450
451 There are two kinds of character sets: control character set and
452 graphic character set. The former contains control characters such
453 as `newline' and `escape' to provide control functions (control
454 functions are provided also by escape sequence). The latter
455 contains graphic characters such as 'A' and '-'. Emacs recognizes
456 two control character sets and many graphic character sets.
457
458 Graphic character sets are classified into one of the following
459 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
460 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
461 bytes (DIMENSION) and the number of characters in one dimension
462 (CHARS) of the set. In addition, each character set is assigned an
463 identification tag (called "final character" and denoted as <F>
464 here after) which is unique in each class. <F> of each character
465 set is decided by ECMA(*) when it is registered in ISO. Code range
466 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
467
468 Note (*): ECMA = European Computer Manufacturers Association
469
470 Here are examples of graphic character set [NAME(<F>)]:
471 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
472 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
473 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
474 o DIMENSION2_CHARS96 -- none for the moment
475
476 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
477 C0 [0x00..0x1F] -- control character plane 0
478 GL [0x20..0x7F] -- graphic character plane 0
479 C1 [0x80..0x9F] -- control character plane 1
480 GR [0xA0..0xFF] -- graphic character plane 1
481
482 A control character set is directly designated and invoked to C0 or
483 C1 by an escape sequence. The most common case is that ISO646's
484 control character set is designated/invoked to C0 and ISO6429's
485 control character set is designated/invoked to C1, and usually
486 these designations/invocations are omitted in a coded text. With
487 7-bit environment, only C0 can be used, and a control character for
488 C1 is encoded by an appropriate escape sequence to fit in the
489 environment. All control characters for C1 have
490 corresponding escape sequences defined.
491
492 A graphic character set is at first designated to one of four
493 graphic registers (G0 through G3), then these graphic registers are
494 invoked to GL or GR. These designations and invocations can be
495 done independently. The most common case is that G0 is invoked to
496 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
497 these invocations and designations are omitted in a coded text.
498 With 7-bit environment, only GL can be used.
499
500 When a graphic character set of CHARS94 is invoked to GL, code 0x20
501 and 0x7F of GL area work as control characters SPACE and DEL
502 respectively, and code 0xA0 and 0xFF of GR area should not be used.
503
504 There are two ways of invocation: locking-shift and single-shift.
505 With locking-shift, the invocation lasts until the next different
506 invocation, whereas with single-shift, the invocation works only
507 for the following character and doesn't affect locking-shift.
508 Invocations are done by the following control characters or escape
509 sequences.
510
511 ----------------------------------------------------------------------
512 function control char escape sequence description
513 ----------------------------------------------------------------------
514 SI (shift-in) 0x0F none invoke G0 to GL
515 SO (shift-out) 0x0E none invoke G1 to GL
516 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
517 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
518 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
519 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
520 ----------------------------------------------------------------------
521 The first four are for locking-shift. Control characters for these
522 functions are defined by macros ISO_CODE_XXX in `coding.h'.
523
524 Designations are done by the following escape sequences.
525 ----------------------------------------------------------------------
526 escape sequence description
527 ----------------------------------------------------------------------
528 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
529 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
530 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
531 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
532 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
533 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
534 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
535 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
536 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
537 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
538 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
539 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
540 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
541 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
542 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
543 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
544 ----------------------------------------------------------------------
545
546 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
547 of dimension 1, chars 94, and final character <F>, and etc.
548
549 Note (*): Although these designations are not allowed in ISO2022,
550 Emacs accepts them on decoding, and produces them on encoding
551 CHARS96 character set in a coding system which is characterized as
552 7-bit environment, non-locking-shift, and non-single-shift.
553
554 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
555 '(' can be omitted. We call this as "short-form" here after.
556
557 Now you may notice that there are a lot of ways for encoding the
558 same multilingual text in ISO2022. Actually, there exist many
559 coding systems such as Compound Text (used in X's inter client
560 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
561 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
562 localized platforms), and all of these are variants of ISO2022.
563
564 In addition to the above, Emacs handles two more kinds of escape
565 sequences: ISO6429's direction specification and Emacs' private
566 sequence for specifying character composition.
567
568 ISO6429's direction specification takes the following format:
569 o CSI ']' -- end of the current direction
570 o CSI '0' ']' -- end of the current direction
571 o CSI '1' ']' -- start of left-to-right text
572 o CSI '2' ']' -- start of right-to-left text
573 The control character CSI (0x9B: control sequence introducer) is
574 abbreviated to the escape sequence ESC '[' in 7-bit environment.
575
576 Character composition specification takes the following format:
577 o ESC '0' -- start character composition
578 o ESC '1' -- end character composition
579 Since these are not standard escape sequences of any ISO, the use
580 of them for these meaning is restricted to Emacs only. */
581
/* Class of each byte value for ISO2022 processing, indexed by the
   byte value (decode_coding_iso2022 dispatches on iso_code_class[c1]).  */
582enum iso_code_class_type iso_code_class[256];
583
584/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
585 Check if a text is encoded in ISO2022. If it is, returns an
586 integer in which appropriate flag bits any of:
587 CODING_CATEGORY_MASK_ISO_7
588 CODING_CATEGORY_MASK_ISO_8_1
589 CODING_CATEGORY_MASK_ISO_8_2
590 CODING_CATEGORY_MASK_ISO_ELSE
591 are set. If a code which should never appear in ISO2022 is found,
592 returns 0. */
593
594int
595detect_coding_iso2022 (src, src_end)
596 unsigned char *src, *src_end;
597{
765a2ca5
KH
598 int mask = (CODING_CATEGORY_MASK_ISO_7
599 | CODING_CATEGORY_MASK_ISO_8_1
600 | CODING_CATEGORY_MASK_ISO_8_2
601 | CODING_CATEGORY_MASK_ISO_ELSE);
bcf26d6a
KH
602 int g1 = 0; /* 1 iff designating to G1. */
603 int c, i;
4ed46869 604
e0e989f6 605 while (src < src_end)
4ed46869
KH
606 {
607 c = *src++;
608 switch (c)
609 {
610 case ISO_CODE_ESC:
e0e989f6 611 if (src >= src_end)
4ed46869
KH
612 break;
613 c = *src++;
bcf26d6a 614 if (src < src_end
e0e989f6
KH
615 && ((c >= '(' && c <= '/')
616 || c == '$' && ((*src >= '(' && *src <= '/')
617 || (*src >= '@' && *src <= 'B'))))
4ed46869 618 {
e0e989f6
KH
619 /* Valid designation sequence. */
620 if (c == ')' || (c == '$' && *src == ')'))
bcf26d6a
KH
621 {
622 g1 = 1;
623 mask &= ~CODING_CATEGORY_MASK_ISO_7;
624 }
e0e989f6
KH
625 src++;
626 break;
4ed46869 627 }
4ed46869
KH
628 else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
629 return CODING_CATEGORY_MASK_ISO_ELSE;
630 break;
631
4ed46869 632 case ISO_CODE_SO:
e0e989f6
KH
633 if (g1)
634 return CODING_CATEGORY_MASK_ISO_ELSE;
635 break;
636
4ed46869
KH
637 case ISO_CODE_CSI:
638 case ISO_CODE_SS2:
639 case ISO_CODE_SS3:
640 mask &= ~CODING_CATEGORY_MASK_ISO_7;
641 break;
642
643 default:
644 if (c < 0x80)
645 break;
646 else if (c < 0xA0)
647 return 0;
648 else
649 {
650 int count = 1;
651
652 mask &= ~CODING_CATEGORY_MASK_ISO_7;
e0e989f6 653 while (src < src_end && *src >= 0xA0)
4ed46869 654 count++, src++;
e0e989f6 655 if (count & 1 && src < src_end)
4ed46869
KH
656 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
657 }
658 break;
659 }
660 }
661
662 return mask;
663}
664
665/* Decode a character of which charset is CHARSET and the 1st position
   code is C1.  If dimension of CHARSET is 2, the 2nd position code is
   fetched from SRC and set to C2.  If CHARSET is negative, it means
   that we are decoding ill formed text, and what we can do is just to
   read C1 as is.

   This macro uses the following names from the scope where it is
   expanded: `coding', `dst', `c2', `unification_table', and (through
   ONE_MORE_BYTE) `src', `src_end' and the label `label_end_of_loop',
   to which it jumps when the 2nd position code is not available.
   At the head of a composition it first emits
   LEADING_CODE_COMPOSITION (followed by 0xFF when rules are embedded)
   and adds 2 to `coding->composing'.  When `unification_table' is
   non-nil and unify_char returns a non-negative character, that
   character is decoded in place of the original one; C1 and C2 are
   overwritten by SPLIT_CHAR in that case.
   NOTE(review): C2 is passed to unify_char even when CHARSET has
   dimension 1, in which case it holds whatever the caller's `c2'
   contained -- confirm that unify_char ignores it then. */
670
bdd9fb48
KH
671#define DECODE_ISO_CHARACTER(charset, c1) \
672 do { \
673 int c_alt, charset_alt = (charset); \
674 if (COMPOSING_HEAD_P (coding->composing)) \
675 { \
676 *dst++ = LEADING_CODE_COMPOSITION; \
677 if (COMPOSING_WITH_RULE_P (coding->composing)) \
678 /* To tell composition rules are embedded. */ \
679 *dst++ = 0xFF; \
680 coding->composing += 2; \
681 } \
682 if ((charset) >= 0) \
683 { \
684 if (CHARSET_DIMENSION (charset) == 2) \
685 ONE_MORE_BYTE (c2); \
686 if (!NILP (unification_table) \
687 && ((c_alt = unify_char (unification_table, \
688 -1, (charset), c1, c2)) >= 0)) \
689 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
690 } \
691 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
692 DECODE_CHARACTER_ASCII (c1); \
693 else if (CHARSET_DIMENSION (charset_alt) == 1) \
694 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
695 else \
696 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
697 if (COMPOSING_WITH_RULE_P (coding->composing)) \
698 /* To tell a composition rule follows. */ \
699 coding->composing = COMPOSING_WITH_RULE_RULE; \
4ed46869
KH
700 } while (0)
701
702/* Set designation state into CODING.  The charset identified by
   DIMENSION, CHARS and FINAL_CHAR is looked up with
   ISO_CHARSET_TABLE; if one is found (the result is non-negative),
   it is designated to graphic register REG of `coding'.  When
   `coding->direction' is 1 and the charset has a reverse charset
   (CHARSET_REVERSE_CHARSET is non-negative), the reverse charset is
   designated instead.  If no charset is found, the designation of
   REG is left unchanged.  The variable `coding' is taken from the
   scope where the macro is expanded. */
703#define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
704 do { \
2e34157c
RS
705 int charset = ISO_CHARSET_TABLE (make_number (dimension), \
706 make_number (chars), \
707 make_number (final_char)); \
4ed46869
KH
708 if (charset >= 0) \
709 { \
710 if (coding->direction == 1 \
711 && CHARSET_REVERSE_CHARSET (charset) >= 0) \
712 charset = CHARSET_REVERSE_CHARSET (charset); \
713 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
714 } \
715 } while (0)
716
717/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
718
719int
720decode_coding_iso2022 (coding, source, destination,
721 src_bytes, dst_bytes, consumed)
722 struct coding_system *coding;
723 unsigned char *source, *destination;
724 int src_bytes, dst_bytes;
725 int *consumed;
726{
727 unsigned char *src = source;
728 unsigned char *src_end = source + src_bytes;
729 unsigned char *dst = destination;
730 unsigned char *dst_end = destination + dst_bytes;
731 /* Since the maximum bytes produced by each loop is 7, we subtract 6
732 from DST_END to assure that overflow checking is necessary only
733 at the head of loop. */
734 unsigned char *adjusted_dst_end = dst_end - 6;
735 int charset;
736 /* Charsets invoked to graphic plane 0 and 1 respectively. */
737 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
738 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
a5d301df
KH
739 Lisp_Object unification_table
740 = coding->character_unification_table_for_decode;
bdd9fb48
KH
741
742 if (!NILP (Venable_character_unification) && NILP (unification_table))
a5d301df 743 unification_table = Vstandard_character_unification_table_for_decode;
4ed46869
KH
744
745 while (src < src_end && dst < adjusted_dst_end)
746 {
747 /* SRC_BASE remembers the start position in source in each loop.
748 The loop will be exited when there's not enough source text
749 to analyze long escape sequence or 2-byte code (within macros
750 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
751 to SRC_BASE before exiting. */
752 unsigned char *src_base = src;
bdd9fb48 753 int c1 = *src++, c2;
4ed46869
KH
754
755 switch (iso_code_class [c1])
756 {
757 case ISO_0x20_or_0x7F:
758 if (!coding->composing
759 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
760 {
761 /* This is SPACE or DEL. */
762 *dst++ = c1;
763 break;
764 }
765 /* This is a graphic character, we fall down ... */
766
767 case ISO_graphic_plane_0:
768 if (coding->composing == COMPOSING_WITH_RULE_RULE)
769 {
770 /* This is a composition rule. */
771 *dst++ = c1 | 0x80;
772 coding->composing = COMPOSING_WITH_RULE_TAIL;
773 }
774 else
775 DECODE_ISO_CHARACTER (charset0, c1);
776 break;
777
778 case ISO_0xA0_or_0xFF:
779 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
780 {
781 /* Invalid code. */
782 *dst++ = c1;
783 break;
784 }
785 /* This is a graphic character, we fall down ... */
786
787 case ISO_graphic_plane_1:
788 DECODE_ISO_CHARACTER (charset1, c1);
789 break;
790
791 case ISO_control_code:
792 /* All ISO2022 control characters in this class have the
793 same representation in Emacs internal format. */
794 *dst++ = c1;
795 break;
796
797 case ISO_carriage_return:
798 if (coding->eol_type == CODING_EOL_CR)
799 {
800 *dst++ = '\n';
801 }
802 else if (coding->eol_type == CODING_EOL_CRLF)
803 {
804 ONE_MORE_BYTE (c1);
805 if (c1 == ISO_CODE_LF)
806 *dst++ = '\n';
807 else
808 {
809 src--;
810 *dst++ = c1;
811 }
812 }
813 else
814 {
815 *dst++ = c1;
816 }
817 break;
818
819 case ISO_shift_out:
e0e989f6
KH
820 if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
821 goto label_invalid_escape_sequence;
4ed46869
KH
822 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
823 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
824 break;
825
826 case ISO_shift_in:
827 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
828 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
829 break;
830
831 case ISO_single_shift_2_7:
832 case ISO_single_shift_2:
833 /* SS2 is handled as an escape sequence of ESC 'N' */
834 c1 = 'N';
835 goto label_escape_sequence;
836
837 case ISO_single_shift_3:
838 /* SS3 is handled as an escape sequence of ESC 'O' */
839 c1 = 'O';
840 goto label_escape_sequence;
841
842 case ISO_control_sequence_introducer:
843 /* CSI is handled as an escape sequence of ESC '[' ... */
844 c1 = '[';
845 goto label_escape_sequence;
846
847 case ISO_escape:
848 ONE_MORE_BYTE (c1);
849 label_escape_sequence:
850 /* Escape sequences handled by Emacs are invocation,
851 designation, direction specification, and character
852 composition specification. */
853 switch (c1)
854 {
855 case '&': /* revision of following character set */
856 ONE_MORE_BYTE (c1);
857 if (!(c1 >= '@' && c1 <= '~'))
e0e989f6 858 goto label_invalid_escape_sequence;
4ed46869
KH
859 ONE_MORE_BYTE (c1);
860 if (c1 != ISO_CODE_ESC)
e0e989f6 861 goto label_invalid_escape_sequence;
4ed46869
KH
862 ONE_MORE_BYTE (c1);
863 goto label_escape_sequence;
864
865 case '$': /* designation of 2-byte character set */
866 ONE_MORE_BYTE (c1);
867 if (c1 >= '@' && c1 <= 'B')
868 { /* designation of JISX0208.1978, GB2312.1980,
869 or JISX0208.1980 */
870 DECODE_DESIGNATION (0, 2, 94, c1);
871 }
872 else if (c1 >= 0x28 && c1 <= 0x2B)
873 { /* designation of DIMENSION2_CHARS94 character set */
874 ONE_MORE_BYTE (c2);
875 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
876 }
877 else if (c1 >= 0x2C && c1 <= 0x2F)
878 { /* designation of DIMENSION2_CHARS96 character set */
879 ONE_MORE_BYTE (c2);
880 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
881 }
882 else
e0e989f6 883 goto label_invalid_escape_sequence;
4ed46869
KH
884 break;
885
886 case 'n': /* invocation of locking-shift-2 */
e0e989f6
KH
887 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
888 goto label_invalid_escape_sequence;
4ed46869 889 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
e0e989f6 890 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
891 break;
892
893 case 'o': /* invocation of locking-shift-3 */
e0e989f6
KH
894 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
895 goto label_invalid_escape_sequence;
4ed46869 896 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
e0e989f6 897 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
898 break;
899
900 case 'N': /* invocation of single-shift-2 */
e0e989f6
KH
901 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
902 goto label_invalid_escape_sequence;
4ed46869
KH
903 ONE_MORE_BYTE (c1);
904 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
905 DECODE_ISO_CHARACTER (charset, c1);
906 break;
907
908 case 'O': /* invocation of single-shift-3 */
e0e989f6
KH
909 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
910 goto label_invalid_escape_sequence;
4ed46869
KH
911 ONE_MORE_BYTE (c1);
912 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
913 DECODE_ISO_CHARACTER (charset, c1);
914 break;
915
916 case '0': /* start composing without embeded rules */
917 coding->composing = COMPOSING_NO_RULE_HEAD;
918 break;
919
920 case '1': /* end composing */
921 coding->composing = COMPOSING_NO;
922 break;
923
924 case '2': /* start composing with embeded rules */
925 coding->composing = COMPOSING_WITH_RULE_HEAD;
926 break;
927
928 case '[': /* specification of direction */
929 /* For the moment, nested direction is not supported.
930 So, the value of `coding->direction' is 0 or 1: 0
931 means left-to-right, 1 means right-to-left. */
932 ONE_MORE_BYTE (c1);
933 switch (c1)
934 {
935 case ']': /* end of the current direction */
936 coding->direction = 0;
937
938 case '0': /* end of the current direction */
939 case '1': /* start of left-to-right direction */
940 ONE_MORE_BYTE (c1);
941 if (c1 == ']')
942 coding->direction = 0;
943 else
944 goto label_invalid_escape_sequence;
945 break;
946
947 case '2': /* start of right-to-left direction */
948 ONE_MORE_BYTE (c1);
949 if (c1 == ']')
950 coding->direction= 1;
951 else
952 goto label_invalid_escape_sequence;
953 break;
954
955 default:
956 goto label_invalid_escape_sequence;
957 }
958 break;
959
960 default:
961 if (c1 >= 0x28 && c1 <= 0x2B)
962 { /* designation of DIMENSION1_CHARS94 character set */
963 ONE_MORE_BYTE (c2);
964 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
965 }
966 else if (c1 >= 0x2C && c1 <= 0x2F)
967 { /* designation of DIMENSION1_CHARS96 character set */
968 ONE_MORE_BYTE (c2);
969 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
970 }
971 else
972 {
973 goto label_invalid_escape_sequence;
974 }
975 }
976 /* We must update these variables now. */
977 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
978 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
979 break;
980
981 label_invalid_escape_sequence:
982 {
983 int length = src - src_base;
984
985 bcopy (src_base, dst, length);
986 dst += length;
987 }
988 }
989 continue;
990
991 label_end_of_loop:
992 coding->carryover_size = src - src_base;
993 bcopy (src_base, coding->carryover, coding->carryover_size);
994 src = src_base;
995 break;
996 }
997
998 /* If this is the last block of the text to be decoded, we had
999 better just flush out all remaining codes in the text although
1000 they are not valid characters. */
1001 if (coding->last_block)
1002 {
1003 bcopy (src, dst, src_end - src);
1004 dst += (src_end - src);
1005 src = src_end;
1006 }
1007 *consumed = src - source;
1008 return dst - destination;
1009}
1010
1011/* ISO2022 encoding stuff. */
1012
1013/*
1014 It is not enough to say just "ISO2022" on encoding, but we have to
1015 specify more details. In Emacs, each coding-system of ISO2022
1016 variant has the following specifications:
1017 1. Initial designation to G0 thru G3.
1018 2. Allows short-form designation?
1019 3. ASCII should be designated to G0 before control characters?
1020 4. ASCII should be designated to G0 at end of line?
1021 5. 7-bit environment or 8-bit environment?
1022 6. Use locking-shift?
1023 7. Use Single-shift?
1024 And the following two are only for Japanese:
1025 8. Use ASCII in place of JIS0201-1976-Roman?
1026 9. Use JISX0208-1983 in place of JISX0208-1978?
1027 These specifications are encoded in `coding->flags' as flag bits
1028 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1029 detail.
1030*/
1031
/* Emit, at the place pointed to by `dst' of the expansion site, the
   escape sequence designating CHARSET to graphic register REG, and
   record the new designation in CODING.

   If CHARSET has an entry in Vcharset_revision_alist, the sequence is
   preceded by ESC '&' <revision>.  The intermediate character that
   selects REG is omitted (short form) only for a 2-byte 94-char set
   designated to register 0 whose <final-char> is '@', 'A', or 'B',
   and only when CODING has CODING_FLAG_ISO_SHORT_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
    Lisp_Object revision                                                \
      = Fassq (make_number (charset), Vcharset_revision_alist);         \
    int chars94_p = (CHARSET_CHARS (charset) == 94);                    \
    int multibyte_p = (CHARSET_DIMENSION (charset) != 1);               \
    /* Intermediate character selecting REG for 94- or 96-char sets.  */ \
    unsigned char intermediate_char                                     \
      = (unsigned char) (chars94_p ? "()*+"[reg] : ",-./"[reg]);        \
    int short_form_p                                                    \
      = (multibyte_p && chars94_p                                       \
         && (coding->flags & CODING_FLAG_ISO_SHORT_FORM)                \
         && reg == 0                                                    \
         && final_char >= '@' && final_char <= 'B');                    \
                                                                        \
    if (! NILP (revision))                                              \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = '&';                                                   \
        *dst++ = XINT (XCONS (revision)->cdr) + '@';                    \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    if (multibyte_p)                                                    \
      *dst++ = '$';                                                     \
    if (! short_form_p)                                                 \
      *dst++ = intermediate_char;                                       \
    *dst++ = final_char;                                                \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
1074
/* Single-shift functions of ISO2022.  Each macro emits the code for
   single-shift-2 or single-shift-3 at `dst' -- an escape sequence
   (ESC 'N' / ESC 'O') when `coding' has CODING_FLAG_ISO_SEVEN_BITS
   set, otherwise the control character ISO_CODE_SS2 / ISO_CODE_SS3 --
   and then marks `coding' as being in single-shifting state.  */

#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS) == 0)              \
      *dst++ = ISO_CODE_SS2;                                            \
    else                                                                \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = 'N';                                                   \
      }                                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;                       \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS) == 0)              \
      *dst++ = ISO_CODE_SS3;                                            \
    else                                                                \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = 'O';                                                   \
      }                                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;                       \
  } while (0)
1096
/* Locking-shift functions of ISO2022 (shift-in, shift-out,
   locking-shift-2, and locking-shift-3).  Each macro records in
   `coding' which graphic register (0 thru 3 respectively) is now
   invoked to graphic plane 0, and emits the corresponding control
   character or escape sequence at `dst'.  */

#define ENCODE_SHIFT_IN                                                 \
  do {                                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;                         \
    *dst++ = ISO_CODE_SI;                                               \
  } while (0)

#define ENCODE_SHIFT_OUT                                                \
  do {                                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;                         \
    *dst++ = ISO_CODE_SO;                                               \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                                          \
  do {                                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;                         \
    *dst++ = ISO_CODE_ESC;                                              \
    *dst++ = 'n';                                                       \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                                          \
  do {                                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;                         \
    *dst++ = ISO_CODE_ESC;                                              \
    *dst++ = 'o';                                                       \
  } while (0)
1124
/* Emit at `dst' the code of a DIMENSION1 character whose character
   set is CHARSET and whose position-code is C1.  When CHARSET is not
   reachable yet, encode_invocation_designation is called to produce
   the needed designation and invocation codes first, and the loop is
   run again to emit the character itself.

   This must expand to a single `do ... while' statement: it is used
   as the body of an `if' that is followed by `else'.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* A single-shift code has just been produced; this one        \
           character uses it up.  */                                    \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)          \
                  ? ((c1) & 0x7F) : ((c1) | 0x80));                     \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))         \
      {                                                                 \
        /* CHARSET is invoked to graphic plane 0.  */                   \
        *dst++ = (c1) & 0x7F;                                           \
        break;                                                          \
      }                                                                 \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))         \
      {                                                                 \
        /* CHARSET is invoked to graphic plane 1.  */                   \
        *dst++ = (c1) | 0x80;                                           \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to any graphic plane.  Invoke it         \
       (designating it to a graphic register first if needed), then     \
       go around once more to produce the character.  */                \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
1158
/* Emit at `dst' the codes of a DIMENSION2 character whose character
   set is CHARSET and whose position-codes are C1 and C2.  When
   CHARSET is not reachable yet, encode_invocation_designation is
   called to produce the needed designation and invocation codes
   first, and the loop is run again to emit the character itself.

   This must expand to a single `do ... while' statement: it is used
   as the body of an `else' inside another macro.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* A single-shift code has just been produced; this one        \
           character uses it up.  */                                    \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            *dst++ = (c1) & 0x7F;                                       \
            *dst++ = (c2) & 0x7F;                                       \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = (c1) | 0x80;                                       \
            *dst++ = (c2) | 0x80;                                       \
          }                                                             \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))         \
      {                                                                 \
        /* CHARSET is invoked to graphic plane 0.  */                   \
        *dst++ = (c1) & 0x7F;                                           \
        *dst++ = (c2) & 0x7F;                                           \
        break;                                                          \
      }                                                                 \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))         \
      {                                                                 \
        /* CHARSET is invoked to graphic plane 1.  */                   \
        *dst++ = (c1) | 0x80;                                           \
        *dst++ = (c2) | 0x80;                                           \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to any graphic plane.  Invoke it         \
       (designating it to a graphic register first if needed), then     \
       go around once more to produce the character.  */                \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
1191
bdd9fb48
KH
/* Encode one character given as CHARSET and position-codes C1 and C2
   (C2 is a dummy for a DIMENSION1 charset).  When `unification_table'
   is non-nil and unify_char yields a non-negative character for the
   input, that character is split back into charset and position-codes
   (overwriting C1 and C2) and encoded instead.  */

#define ENCODE_ISO_CHARACTER(charset, c1, c2)                           \
  do {                                                                  \
    int charset_alt = (charset);                                        \
    if (!NILP (unification_table))                                      \
      {                                                                 \
        int c_alt = unify_char (unification_table, -1, charset, c1, c2); \
        if (c_alt >= 0)                                                 \
          {                                                             \
            SPLIT_CHAR (c_alt, charset_alt, c1, c2);                    \
          }                                                             \
      }                                                                 \
    if (CHARSET_DIMENSION (charset_alt) != 1)                           \
      ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2);            \
    else                                                                \
      ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1);                \
  } while (0)
1206
4ed46869
KH
1207/* Produce designation and invocation codes at a place pointed by DST
1208 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1209 Return new DST. */
1210
1211unsigned char *
1212encode_invocation_designation (charset, coding, dst)
1213 int charset;
1214 struct coding_system *coding;
1215 unsigned char *dst;
1216{
1217 int reg; /* graphic register number */
1218
1219 /* At first, check designations. */
1220 for (reg = 0; reg < 4; reg++)
1221 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1222 break;
1223
1224 if (reg >= 4)
1225 {
1226 /* CHARSET is not yet designated to any graphic registers. */
1227 /* At first check the requested designation. */
1228 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1ba9e4ab
KH
1229 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1230 /* Since CHARSET requests no special designation, designate it
1231 to graphic register 0. */
4ed46869
KH
1232 reg = 0;
1233
1234 ENCODE_DESIGNATION (charset, reg, coding);
1235 }
1236
1237 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1238 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1239 {
1240 /* Since the graphic register REG is not invoked to any graphic
1241 planes, invoke it to graphic plane 0. */
1242 switch (reg)
1243 {
1244 case 0: /* graphic register 0 */
1245 ENCODE_SHIFT_IN;
1246 break;
1247
1248 case 1: /* graphic register 1 */
1249 ENCODE_SHIFT_OUT;
1250 break;
1251
1252 case 2: /* graphic register 2 */
1253 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1254 ENCODE_SINGLE_SHIFT_2;
1255 else
1256 ENCODE_LOCKING_SHIFT_2;
1257 break;
1258
1259 case 3: /* graphic register 3 */
1260 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1261 ENCODE_SINGLE_SHIFT_3;
1262 else
1263 ENCODE_LOCKING_SHIFT_3;
1264 break;
1265 }
1266 }
1267 return dst;
1268}
1269
/* The following three macros produce, at `dst', the escape sequences
   indicating composition: start without rules (ESC '0'), start with
   rules (ESC '2'), and end (ESC '1').  Each expands to a single
   parenthesized expression.  */
#define ENCODE_COMPOSITION_NO_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
#define ENCODE_COMPOSITION_WITH_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '2')
#define ENCODE_COMPOSITION_END \
  (*dst++ = ISO_CODE_ESC, *dst++ = '1')
1274
/* The following three macros produce, at `dst', codes indicating the
   direction of text.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER emits CSI: the escape sequence
   ESC '[' when `coding' has CODING_FLAG_ISO_SEVEN_BITS set, otherwise
   the control character ISO_CODE_CSI.  The flag is tested as a bit
   (like the single-shift macros do), not by comparing the whole of
   `coding->flags' against it, so other flags being set does not
   disable the 7-bit form.

   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R emit CSI '2' ']' and
   CSI '0' ']' respectively.  All three expand to one statement and
   must be followed by a semicolon.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
  do {                                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                     \
      *dst++ = ISO_CODE_ESC, *dst++ = '[';                              \
    else                                                                \
      *dst++ = ISO_CODE_CSI;                                            \
  } while (0)

#define ENCODE_DIRECTION_R2L                                            \
  do {                                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                                 \
    *dst++ = '2';                                                       \
    *dst++ = ']';                                                       \
  } while (0)

#define ENCODE_DIRECTION_L2R                                            \
  do {                                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                                 \
    *dst++ = '0';                                                       \
    *dst++ = ']';                                                       \
  } while (0)
1290
/* Produce, at `dst', the codes that bring the graphic planes and
   registers of `coding' back to the initial state: a shift-in if
   graphic plane 0 does not currently hold graphic register 0, then a
   designation sequence for every register whose initial designation
   is non-negative and differs from its current designation.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                 \
  do {                                                                  \
    int reg = 0;                                                        \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                    \
      ENCODE_SHIFT_IN;                                                  \
    while (reg < 4)                                                     \
      {                                                                 \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0      \
            && (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)       \
                != CODING_SPEC_ISO_DESIGNATION (coding, reg)))          \
          {                                                             \
            ENCODE_DESIGNATION                                          \
              (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg),       \
               reg, coding);                                            \
          }                                                             \
        reg++;                                                          \
      }                                                                 \
  } while (0)
1305
bdd9fb48
KH
1306/* Produce designation sequences of charsets in the line started from
1307 *SRC to a place pointed by DSTP.
1308
1309 If the current block ends before any end-of-line, we may fail to
1310 find all the necessary *designations. */
1311encode_designation_at_bol (coding, table, src, src_end, dstp)
e0e989f6 1312 struct coding_system *coding;
bdd9fb48 1313 Lisp_Object table;
e0e989f6
KH
1314 unsigned char *src, *src_end, **dstp;
1315{
bdd9fb48
KH
1316 int charset, c, found = 0, reg;
1317 /* Table of charsets to be designated to each graphic register. */
1318 int r[4];
1319 unsigned char *dst = *dstp;
1320
1321 for (reg = 0; reg < 4; reg++)
1322 r[reg] = -1;
1323
1324 while (src < src_end && *src != '\n' && found < 4)
e0e989f6 1325 {
bdd9fb48
KH
1326 int bytes = BYTES_BY_CHAR_HEAD (*src);
1327
1328 if (NILP (table))
1329 charset = CHARSET_AT (src);
1330 else
e0e989f6 1331 {
bdd9fb48
KH
1332 int c_alt, c1, c2;
1333
1334 SPLIT_STRING(src, bytes, charset, c1, c2);
1335 if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1336 charset = CHAR_CHARSET (c_alt);
e0e989f6 1337 }
bdd9fb48 1338
e0e989f6 1339 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1ba9e4ab 1340 if (r[reg] == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
bdd9fb48
KH
1341 {
1342 found++;
1343 r[reg] = charset;
1344 }
1345
1346 src += bytes;
1347 }
1348
1349 if (found)
1350 {
1351 for (reg = 0; reg < 4; reg++)
1352 if (r[reg] >= 0
1353 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1354 ENCODE_DESIGNATION (r[reg], reg, coding);
1355 *dstp = dst;
e0e989f6 1356 }
e0e989f6
KH
1357}
1358
4ed46869
KH
1359/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1360
1361int
1362encode_coding_iso2022 (coding, source, destination,
1363 src_bytes, dst_bytes, consumed)
1364 struct coding_system *coding;
1365 unsigned char *source, *destination;
1366 int src_bytes, dst_bytes;
1367 int *consumed;
1368{
1369 unsigned char *src = source;
1370 unsigned char *src_end = source + src_bytes;
1371 unsigned char *dst = destination;
1372 unsigned char *dst_end = destination + dst_bytes;
e0e989f6 1373 /* Since the maximum bytes produced by each loop is 20, we subtract 19
4ed46869
KH
1374 from DST_END to assure overflow checking is necessary only at the
1375 head of loop. */
e0e989f6 1376 unsigned char *adjusted_dst_end = dst_end - 19;
a5d301df
KH
1377 Lisp_Object unification_table
1378 = coding->character_unification_table_for_encode;
bdd9fb48
KH
1379
1380 if (!NILP (Venable_character_unification) && NILP (unification_table))
a5d301df 1381 unification_table = Vstandard_character_unification_table_for_encode;
4ed46869
KH
1382
1383 while (src < src_end && dst < adjusted_dst_end)
1384 {
1385 /* SRC_BASE remembers the start position in source in each loop.
1386 The loop will be exited when there's not enough source text
1387 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1388 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1389 reset to SRC_BASE before exiting. */
1390 unsigned char *src_base = src;
bdd9fb48 1391 int charset, c1, c2, c3, c4;
4ed46869 1392
e0e989f6
KH
1393 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1394 && CODING_SPEC_ISO_BOL (coding))
1395 {
bdd9fb48
KH
1396 /* We have to produce designation sequences if any now. */
1397 encode_designation_at_bol (coding, unification_table,
1398 src, src_end, &dst);
e0e989f6
KH
1399 CODING_SPEC_ISO_BOL (coding) = 0;
1400 }
1401
1402 c1 = *src++;
4ed46869
KH
1403 /* If we are seeing a component of a composite character, we are
1404 seeing a leading-code specially encoded for composition, or a
1405 composition rule if composing with rule. We must set C1
1406 to a normal leading-code or an ASCII code. If we are not at
1407 a composed character, we must reset the composition state. */
1408 if (COMPOSING_P (coding->composing))
1409 {
1410 if (c1 < 0xA0)
1411 {
1412 /* We are not in a composite character any longer. */
1413 coding->composing = COMPOSING_NO;
1414 ENCODE_COMPOSITION_END;
1415 }
1416 else
1417 {
1418 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1419 {
1420 *dst++ = c1 & 0x7F;
1421 coding->composing = COMPOSING_WITH_RULE_HEAD;
1422 continue;
1423 }
1424 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1425 coding->composing = COMPOSING_WITH_RULE_RULE;
1426 if (c1 == 0xA0)
1427 {
1428 /* This is an ASCII component. */
1429 ONE_MORE_BYTE (c1);
1430 c1 &= 0x7F;
1431 }
1432 else
1433 /* This is a leading-code of non ASCII component. */
1434 c1 -= 0x20;
1435 }
1436 }
1437
1438 /* Now encode one character. C1 is a control character, an
1439 ASCII character, or a leading-code of multi-byte character. */
1440 switch (emacs_code_class[c1])
1441 {
1442 case EMACS_ascii_code:
bdd9fb48 1443 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
4ed46869
KH
1444 break;
1445
1446 case EMACS_control_code:
1447 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1448 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1449 *dst++ = c1;
1450 break;
1451
1452 case EMACS_carriage_return_code:
1453 if (!coding->selective)
1454 {
1455 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1456 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1457 *dst++ = c1;
1458 break;
1459 }
1460 /* fall down to treat '\r' as '\n' ... */
1461
1462 case EMACS_linefeed_code:
1463 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
e0e989f6
KH
1464 ENCODE_RESET_PLANE_AND_REGISTER;
1465 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1466 bcopy (coding->spec.iso2022.initial_designation,
1467 coding->spec.iso2022.current_designation,
1468 sizeof coding->spec.iso2022.initial_designation);
4ed46869 1469 if (coding->eol_type == CODING_EOL_LF
0ef69138 1470 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
1471 *dst++ = ISO_CODE_LF;
1472 else if (coding->eol_type == CODING_EOL_CRLF)
1473 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1474 else
1475 *dst++ = ISO_CODE_CR;
e0e989f6 1476 CODING_SPEC_ISO_BOL (coding) = 1;
4ed46869
KH
1477 break;
1478
1479 case EMACS_leading_code_2:
1480 ONE_MORE_BYTE (c2);
bdd9fb48 1481 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
1482 break;
1483
1484 case EMACS_leading_code_3:
1485 TWO_MORE_BYTES (c2, c3);
1486 if (c1 < LEADING_CODE_PRIVATE_11)
bdd9fb48 1487 ENCODE_ISO_CHARACTER (c1, c2, c3);
4ed46869 1488 else
bdd9fb48 1489 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
4ed46869
KH
1490 break;
1491
1492 case EMACS_leading_code_4:
1493 THREE_MORE_BYTES (c2, c3, c4);
bdd9fb48 1494 ENCODE_ISO_CHARACTER (c2, c3, c4);
4ed46869
KH
1495 break;
1496
1497 case EMACS_leading_code_composition:
1498 ONE_MORE_BYTE (c1);
1499 if (c1 == 0xFF)
1500 {
1501 coding->composing = COMPOSING_WITH_RULE_HEAD;
1502 ENCODE_COMPOSITION_WITH_RULE_START;
1503 }
1504 else
1505 {
1506 /* Rewind one byte because it is a character code of
1507 composition elements. */
1508 src--;
1509 coding->composing = COMPOSING_NO_RULE_HEAD;
1510 ENCODE_COMPOSITION_NO_RULE_START;
1511 }
1512 break;
1513
1514 case EMACS_invalid_code:
1515 *dst++ = c1;
1516 break;
1517 }
1518 continue;
1519 label_end_of_loop:
1520 coding->carryover_size = src - src_base;
1521 bcopy (src_base, coding->carryover, coding->carryover_size);
4ed46869
KH
1522 break;
1523 }
1524
1525 /* If this is the last block of the text to be encoded, we must
bdd9fb48
KH
1526 reset graphic planes and registers to the initial state. */
1527 if (src >= src_end && coding->last_block)
4ed46869 1528 {
e0e989f6 1529 ENCODE_RESET_PLANE_AND_REGISTER;
bdd9fb48
KH
1530 if (coding->carryover_size > 0
1531 && coding->carryover_size < (dst_end - dst))
1532 {
1533 bcopy (coding->carryover, dst, coding->carryover_size);
1534 dst += coding->carryover_size;
1535 coding->carryover_size = 0;
1536 }
4ed46869
KH
1537 }
1538 *consumed = src - source;
1539 return dst - destination;
1540}
1541
1542\f
1543/*** 4. SJIS and BIG5 handlers ***/
1544
1545/* Although SJIS and BIG5 are not ISO's coding system, They are used
1546 quite widely. So, for the moment, Emacs supports them in the bare
1547 C code. But, in the future, they may be supported only by CCL. */
1548
1549/* SJIS is a coding system encoding three character sets: ASCII, right
1550 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
1551 as is. A character of charset katakana-jisx0201 is encoded by
1552 "position-code + 0x80". A character of charset japanese-jisx0208
1553 is encoded in 2-byte but two position-codes are divided and shifted
1554 so that it fit in the range below.
1555
1556 --- CODE RANGE of SJIS ---
1557 (character set) (range)
1558 ASCII 0x00 .. 0x7F
1559 KATAKANA-JISX0201 0xA0 .. 0xDF
1560 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1561 (2nd byte) 0x40 .. 0xFF
1562 -------------------------------
1563
1564*/
1565
1566/* BIG5 is a coding system encoding two character sets: ASCII and
1567 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1568 character set and is encoded in two-byte.
1569
1570 --- CODE RANGE of BIG5 ---
1571 (character set) (range)
1572 ASCII 0x00 .. 0x7F
1573 Big5 (1st byte) 0xA1 .. 0xFE
1574 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1575 --------------------------
1576
1577 Since the number of characters in Big5 is larger than maximum
1578 characters in Emacs' charset (96x96), it can't be handled as one
1579 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
1580 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
1581 contains frequently used characters and the latter contains less
1582 frequently used characters. */
1583
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Both macros go through a linear index: BIG5 codes are numbered row
   by row (BIG5_SAME_ROW codes per 1st byte, starting at 0xA1), and
   the index is then split into rows of 0xFF - 0xA1 position-codes
   starting at 0x21.  Codes whose 1st byte is 0xC9 or above belong to
   `charset_big5_2', whose index restarts from zero.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW ((0xFF - 0xA1) + (0x7F - 0x40))

#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int big5_index                                             \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) >= 0xC9)                                                   \
      {                                                                 \
        big5_index -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                    \
        charset = charset_big5_2;                                       \
      }                                                                 \
    else                                                                \
      charset = charset_big5_1;                                         \
    c1 = big5_index / (0xFF - 0xA1) + 0x21;                             \
    c2 = big5_index % (0xFF - 0xA1) + 0x21;                             \
  } while (0)

#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int big5_index                                             \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      big5_index += BIG5_SAME_ROW * (0xC9 - 0xA1);                      \
    b1 = big5_index / BIG5_SAME_ROW + 0xA1;                             \
    b2 = big5_index % BIG5_SAME_ROW;                                    \
    /* The 2nd byte skips the gap between 0x7E and 0xA1.  */            \
    if (b2 < 0x3F)                                                      \
      b2 += 0x40;                                                       \
    else                                                                \
      b2 += 0x62;                                                       \
  } while (0)
1616
a5d301df
KH
/* Produce the internal representation of one decoded character given
   as CHARSET and position-codes C1 and C2 (C2 is a dummy for a
   1-byte charset).  When `unification_table' is non-nil and
   unify_char yields a non-negative character for the input, that
   character is split back into charset and position-codes
   (overwriting C1 and C2) and produced instead.  A resulting charset
   that is ASCII or negative is produced as an ASCII character.  */
#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
  do {                                                                  \
    int charset_alt = (charset);                                        \
    if (!NILP (unification_table))                                      \
      {                                                                 \
        int c_alt = unify_char (unification_table,                      \
                                -1, (charset), c1, c2);                 \
        if (c_alt >= 0)                                                 \
          {                                                             \
            SPLIT_CHAR (c_alt, charset_alt, c1, c2);                    \
          }                                                             \
      }                                                                 \
    if (charset_alt < 0 || charset_alt == CHARSET_ASCII)                \
      {                                                                 \
        DECODE_CHARACTER_ASCII (c1);                                    \
      }                                                                 \
    else if (CHARSET_DIMENSION (charset_alt) != 1)                      \
      {                                                                 \
        DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);              \
      }                                                                 \
    else                                                                \
      {                                                                 \
        DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                  \
      }                                                                 \
  } while (0)
1631
/* Encode, at `dst', one character given as CHARSET and position-codes
   C1 and C2 (C2 is a dummy for a 1-byte charset) in SJIS if `sjis_p'
   is nonzero, else in BIG5.  When `unification_table' is non-nil and
   unify_char yields a non-negative character for the input, that
   character is split back into charset and position-codes
   (overwriting C1 and C2) and encoded instead.

   ASCII is produced as is; katakana-jisx0201 (SJIS only) as its
   position-code alone; jisx0208 (SJIS only) and big5-1/big5-2 (BIG5
   only) through ENCODE_SJIS and ENCODE_BIG5.  A character of any
   other charset is produced as the charset followed by its
   position-codes, without any encoding.

   The Big5 conversion must be given the resulting charset and the
   macro's own position-codes: ENCODE_BIG5 needs the charset to tell
   `charset_big5_1' from `charset_big5_2'.

   This expands to one statement and must be followed by a
   semicolon.  */
#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
  do {                                                                  \
    int c_alt, charset_alt;                                             \
    if (!NILP (unification_table)                                       \
        && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
            >= 0))                                                      \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                          \
    else                                                                \
      charset_alt = charset;                                            \
    if (charset_alt == charset_ascii)                                   \
      *dst++ = c1;                                                      \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
      {                                                                 \
        if (sjis_p && charset_alt == charset_katakana_jisx0201)         \
          *dst++ = c1;                                                  \
        else                                                            \
          *dst++ = charset_alt, *dst++ = c1;                            \
      }                                                                 \
    else                                                                \
      {                                                                 \
        c1 &= 0x7F, c2 &= 0x7F;                                         \
        if (sjis_p && charset_alt == charset_jisx0208)                  \
          {                                                             \
            unsigned char s1, s2;                                       \
                                                                        \
            ENCODE_SJIS (c1, c2, s1, s2);                               \
            *dst++ = s1, *dst++ = s2;                                   \
          }                                                             \
        else if (!sjis_p                                                \
                 && (charset_alt == charset_big5_1                      \
                     || charset_alt == charset_big5_2))                 \
          {                                                             \
            unsigned char b1, b2;                                       \
                                                                        \
            ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);                  \
            *dst++ = b1, *dst++ = b2;                                   \
          }                                                             \
        else                                                            \
          *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;               \
      }                                                                 \
  } while (0)
1673
4ed46869
KH
1674/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1675 Check if a text is encoded in SJIS. If it is, return
1676 CODING_CATEGORY_MASK_SJIS, else return 0. */
1677
1678int
1679detect_coding_sjis (src, src_end)
1680 unsigned char *src, *src_end;
1681{
1682 unsigned char c;
1683
1684 while (src < src_end)
1685 {
1686 c = *src++;
1687 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1688 return 0;
1689 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1690 {
1691 if (src < src_end && *src++ < 0x40)
1692 return 0;
1693 }
1694 }
1695 return CODING_CATEGORY_MASK_SJIS;
1696}
1697
1698/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1699 Check if a text is encoded in BIG5. If it is, return
1700 CODING_CATEGORY_MASK_BIG5, else return 0. */
1701
1702int
1703detect_coding_big5 (src, src_end)
1704 unsigned char *src, *src_end;
1705{
1706 unsigned char c;
1707
1708 while (src < src_end)
1709 {
1710 c = *src++;
1711 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1712 return 0;
1713 if (c >= 0xA1)
1714 {
1715 if (src >= src_end)
1716 break;
1717 c = *src++;
1718 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1719 return 0;
1720 }
1721 }
1722 return CODING_CATEGORY_MASK_BIG5;
1723}
1724
1725/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1726 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
1727
1728int
1729decode_coding_sjis_big5 (coding, source, destination,
1730 src_bytes, dst_bytes, consumed, sjis_p)
1731 struct coding_system *coding;
1732 unsigned char *source, *destination;
1733 int src_bytes, dst_bytes;
1734 int *consumed;
1735 int sjis_p;
1736{
1737 unsigned char *src = source;
1738 unsigned char *src_end = source + src_bytes;
1739 unsigned char *dst = destination;
1740 unsigned char *dst_end = destination + dst_bytes;
1741 /* Since the maximum bytes produced by each loop is 4, we subtract 3
1742 from DST_END to assure overflow checking is necessary only at the
1743 head of loop. */
1744 unsigned char *adjusted_dst_end = dst_end - 3;
a5d301df
KH
1745 Lisp_Object unification_table
1746 = coding->character_unification_table_for_decode;
1747
1748 if (!NILP (Venable_character_unification) && NILP (unification_table))
1749 unification_table = Vstandard_character_unification_table_for_decode;
4ed46869
KH
1750
1751 while (src < src_end && dst < adjusted_dst_end)
1752 {
1753 /* SRC_BASE remembers the start position in source in each loop.
1754 The loop will be exited when there's not enough source text
1755 to analyze two-byte character (within macro ONE_MORE_BYTE).
1756 In that case, SRC is reset to SRC_BASE before exiting. */
1757 unsigned char *src_base = src;
1758 unsigned char c1 = *src++, c2, c3, c4;
1759
1760 if (c1 == '\r')
1761 {
1762 if (coding->eol_type == CODING_EOL_CRLF)
1763 {
1764 ONE_MORE_BYTE (c2);
1765 if (c2 == '\n')
1766 *dst++ = c2;
1767 else
1768 /* To process C2 again, SRC is subtracted by 1. */
1769 *dst++ = c1, src--;
1770 }
1771 else
1772 *dst++ = c1;
1773 }
a5d301df 1774 else if (c1 < 0x20)
4ed46869 1775 *dst++ = c1;
a5d301df
KH
1776 else if (c1 < 0x80)
1777 DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
4ed46869
KH
1778 else if (c1 < 0xA0 || c1 >= 0xE0)
1779 {
1780 /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1781 if (sjis_p)
1782 {
1783 ONE_MORE_BYTE (c2);
1784 DECODE_SJIS (c1, c2, c3, c4);
a5d301df 1785 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
4ed46869
KH
1786 }
1787 else if (c1 >= 0xE0 && c1 < 0xFF)
1788 {
1789 int charset;
1790
1791 ONE_MORE_BYTE (c2);
1792 DECODE_BIG5 (c1, c2, charset, c3, c4);
a5d301df 1793 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
4ed46869
KH
1794 }
1795 else /* Invalid code */
1796 *dst++ = c1;
1797 }
1798 else
1799 {
1800 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1801 if (sjis_p)
a5d301df 1802 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1, /* dummy */ c2);
4ed46869
KH
1803 else
1804 {
1805 int charset;
1806
1807 ONE_MORE_BYTE (c2);
1808 DECODE_BIG5 (c1, c2, charset, c3, c4);
a5d301df 1809 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
4ed46869
KH
1810 }
1811 }
1812 continue;
1813
1814 label_end_of_loop:
1815 coding->carryover_size = src - src_base;
1816 bcopy (src_base, coding->carryover, coding->carryover_size);
1817 src = src_base;
1818 break;
1819 }
1820
1821 *consumed = src - source;
1822 return dst - destination;
1823}
1824
1825/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1826 This function can encode `charset_ascii', `charset_katakana_jisx0201',
1827 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
1828 sure that all these charsets are registered as official charset
1829 (i.e. do not have extended leading-codes). Characters of other
1830 charsets are produced without any encoding. If SJIS_P is 1, encode
1831 SJIS text, else encode BIG5 text. */
1832
1833int
1834encode_coding_sjis_big5 (coding, source, destination,
1835 src_bytes, dst_bytes, consumed, sjis_p)
1836 struct coding_system *coding;
1837 unsigned char *source, *destination;
1838 int src_bytes, dst_bytes;
1839 int *consumed;
1840 int sjis_p;
1841{
1842 unsigned char *src = source;
1843 unsigned char *src_end = source + src_bytes;
1844 unsigned char *dst = destination;
1845 unsigned char *dst_end = destination + dst_bytes;
1846 /* Since the maximum bytes produced by each loop is 2, we subtract 1
1847 from DST_END to assure overflow checking is necessary only at the
1848 head of loop. */
1849 unsigned char *adjusted_dst_end = dst_end - 1;
a5d301df
KH
1850 Lisp_Object unification_table
1851 = coding->character_unification_table_for_encode;
1852
1853 if (!NILP (Venable_character_unification) && NILP (unification_table))
1854 unification_table = Vstandard_character_unification_table_for_encode;
4ed46869
KH
1855
1856 while (src < src_end && dst < adjusted_dst_end)
1857 {
1858 /* SRC_BASE remembers the start position in source in each loop.
1859 The loop will be exited when there's not enough source text
1860 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1861 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
1862 before exiting. */
1863 unsigned char *src_base = src;
1864 unsigned char c1 = *src++, c2, c3, c4;
1865
1866 if (coding->composing)
1867 {
1868 if (c1 == 0xA0)
1869 {
1870 ONE_MORE_BYTE (c1);
1871 c1 &= 0x7F;
1872 }
1873 else if (c1 >= 0xA0)
1874 c1 -= 0x20;
1875 else
1876 coding->composing = 0;
1877 }
1878
1879 switch (emacs_code_class[c1])
1880 {
1881 case EMACS_ascii_code:
a5d301df
KH
1882 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1883 break;
1884
4ed46869
KH
1885 case EMACS_control_code:
1886 *dst++ = c1;
1887 break;
1888
1889 case EMACS_carriage_return_code:
1890 if (!coding->selective)
1891 {
1892 *dst++ = c1;
1893 break;
1894 }
1895 /* fall down to treat '\r' as '\n' ... */
1896
1897 case EMACS_linefeed_code:
1898 if (coding->eol_type == CODING_EOL_LF
0ef69138 1899 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
1900 *dst++ = '\n';
1901 else if (coding->eol_type == CODING_EOL_CRLF)
1902 *dst++ = '\r', *dst++ = '\n';
1903 else
1904 *dst++ = '\r';
1905 break;
1906
1907 case EMACS_leading_code_2:
1908 ONE_MORE_BYTE (c2);
a5d301df 1909 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
1910 break;
1911
1912 case EMACS_leading_code_3:
1913 TWO_MORE_BYTES (c2, c3);
a5d301df 1914 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
4ed46869
KH
1915 break;
1916
1917 case EMACS_leading_code_4:
1918 THREE_MORE_BYTES (c2, c3, c4);
a5d301df 1919 ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
4ed46869
KH
1920 break;
1921
1922 case EMACS_leading_code_composition:
1923 coding->composing = 1;
1924 break;
1925
1926 default: /* i.e. case EMACS_invalid_code: */
1927 *dst++ = c1;
1928 }
1929 continue;
1930
1931 label_end_of_loop:
1932 coding->carryover_size = src - src_base;
1933 bcopy (src_base, coding->carryover, coding->carryover_size);
1934 src = src_base;
1935 break;
1936 }
1937
1938 *consumed = src - source;
1939 return dst - destination;
1940}
1941
1942\f
1943/*** 5. End-of-line handlers ***/
1944
1945/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1946 This function is called only when `coding->eol_type' is
1947 CODING_EOL_CRLF or CODING_EOL_CR. */
1948
1949decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1950 struct coding_system *coding;
1951 unsigned char *source, *destination;
1952 int src_bytes, dst_bytes;
1953 int *consumed;
1954{
1955 unsigned char *src = source;
1956 unsigned char *src_end = source + src_bytes;
1957 unsigned char *dst = destination;
1958 unsigned char *dst_end = destination + dst_bytes;
1959 int produced;
1960
1961 switch (coding->eol_type)
1962 {
1963 case CODING_EOL_CRLF:
1964 {
1965 /* Since the maximum bytes produced by each loop is 2, we
1966 subtract 1 from DST_END to assure overflow checking is
1967 necessary only at the head of loop. */
1968 unsigned char *adjusted_dst_end = dst_end - 1;
1969
1970 while (src < src_end && dst < adjusted_dst_end)
1971 {
1972 unsigned char *src_base = src;
1973 unsigned char c = *src++;
1974 if (c == '\r')
1975 {
1976 ONE_MORE_BYTE (c);
1977 if (c != '\n')
1978 *dst++ = '\r';
bfd99048 1979 *dst++ = c;
4ed46869
KH
1980 }
1981 else
1982 *dst++ = c;
1983 continue;
1984
1985 label_end_of_loop:
1986 coding->carryover_size = src - src_base;
1987 bcopy (src_base, coding->carryover, coding->carryover_size);
1988 src = src_base;
1989 break;
1990 }
1991 *consumed = src - source;
1992 produced = dst - destination;
1993 break;
1994 }
1995
1996 case CODING_EOL_CR:
1997 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1998 bcopy (source, destination, produced);
1999 dst_end = destination + produced;
2000 while (dst < dst_end)
2001 if (*dst++ == '\r') dst[-1] = '\n';
2002 *consumed = produced;
2003 break;
2004
2005 default: /* i.e. case: CODING_EOL_LF */
2006 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2007 bcopy (source, destination, produced);
2008 *consumed = produced;
2009 break;
2010 }
2011
2012 return produced;
2013}
2014
2015/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2016 format of end-of-line according to `coding->eol_type'. If
2017 `coding->selective' is 1, code '\r' in source text also means
2018 end-of-line. */
2019
2020encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2021 struct coding_system *coding;
2022 unsigned char *source, *destination;
2023 int src_bytes, dst_bytes;
2024 int *consumed;
2025{
2026 unsigned char *src = source;
2027 unsigned char *dst = destination;
2028 int produced;
2029
2030 if (src_bytes <= 0)
2031 return 0;
2032
2033 switch (coding->eol_type)
2034 {
2035 case CODING_EOL_LF:
0ef69138 2036 case CODING_EOL_UNDECIDED:
4ed46869
KH
2037 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2038 bcopy (source, destination, produced);
2039 if (coding->selective)
2040 {
2041 int i = produced;
2042 while (i--)
2043 if (*dst++ == '\r') dst[-1] = '\n';
2044 }
2045 *consumed = produced;
2046
2047 case CODING_EOL_CRLF:
2048 {
2049 unsigned char c;
2050 unsigned char *src_end = source + src_bytes;
2051 unsigned char *dst_end = destination + dst_bytes;
2052 /* Since the maximum bytes produced by each loop is 2, we
2053 subtract 1 from DST_END to assure overflow checking is
2054 necessary only at the head of loop. */
2055 unsigned char *adjusted_dst_end = dst_end - 1;
2056
2057 while (src < src_end && dst < adjusted_dst_end)
2058 {
2059 c = *src++;
2060 if (c == '\n' || (c == '\r' && coding->selective))
2061 *dst++ = '\r', *dst++ = '\n';
2062 else
2063 *dst++ = c;
2064 }
2065 produced = dst - destination;
2066 *consumed = src - source;
2067 break;
2068 }
2069
2070 default: /* i.e. case CODING_EOL_CR: */
2071 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2072 bcopy (source, destination, produced);
2073 {
2074 int i = produced;
2075 while (i--)
2076 if (*dst++ == '\n') dst[-1] = '\r';
2077 }
2078 *consumed = produced;
2079 }
2080
2081 return produced;
2082}
2083
2084\f
2085/*** 6. C library functions ***/
2086
2087/* In Emacs Lisp, coding system is represented by a Lisp symbol which
2088 has a property `coding-system'. The value of this property is a
2089 vector of length 5 (called as coding-vector). Among elements of
2090 this vector, the first (element[0]) and the fifth (element[4])
2091 carry important information for decoding/encoding. Before
2092 decoding/encoding, this information should be set in fields of a
2093 structure of type `coding_system'.
2094
2095 A value of property `coding-system' can be a symbol of another
2096 subsidiary coding-system. In that case, Emacs gets coding-vector
2097 from that symbol.
2098
2099 `element[0]' contains information to be set in `coding->type'. The
2100 value and its meaning is as follows:
2101
0ef69138
KH
2102 0 -- coding_type_emacs_mule
2103 1 -- coding_type_sjis
2104 2 -- coding_type_iso2022
2105 3 -- coding_type_big5
2106 4 -- coding_type_ccl encoder/decoder written in CCL
2107 nil -- coding_type_no_conversion
2108 t -- coding_type_undecided (automatic conversion on decoding,
2109 no-conversion on encoding)
4ed46869
KH
2110
2111 `element[4]' contains information to be set in `coding->flags' and
2112 `coding->spec'. The meaning varies by `coding->type'.
2113
2114 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2115 of length 32 (of which the first 15 sub-elements are used now).
2116 Meanings of these sub-elements are:
2117
2118 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2119 If the value is an integer of valid charset, the charset is
2120 assumed to be designated to graphic register N initially.
2121
2122 If the value is minus, it is a minus value of charset which
2123 reserves graphic register N, which means that the charset is
2124 not designated initially but should be designated to graphic
2125 register N just before encoding a character in that charset.
2126
2127 If the value is nil, graphic register N is never used on
2128 encoding.
2129
2130 sub-element[N] where N is 4 through 14: to be set in `coding->flags'
2131 Each value takes t or nil. See the section ISO2022 of
2132 `coding.h' for more information.
2133
2134 If `coding->type' is `coding_type_big5', element[4] is t to denote
2135 BIG5-ETen or nil to denote BIG5-HKU.
2136
2137 If `coding->type' takes the other value, element[4] is ignored.
2138
2139 Emacs Lisp's coding system also carries information about format of
2140 end-of-line in a value of property `eol-type'. If the value is
2141 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2142 means CODING_EOL_CR. If it is not integer, it should be a vector
2143 of subsidiary coding systems of which property `eol-type' has one
2144 of above values.
2145
2146*/
2147
2148/* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2149 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2150 is setup so that no conversion is necessary and return -1, else
2151 return 0. */
2152
2153int
e0e989f6
KH
2154setup_coding_system (coding_system, coding)
2155 Lisp_Object coding_system;
4ed46869
KH
2156 struct coding_system *coding;
2157{
4ed46869
KH
2158 Lisp_Object type, eol_type;
2159
2160 /* At first, set several fields default values. */
2161 coding->require_flushing = 0;
2162 coding->last_block = 0;
2163 coding->selective = 0;
2164 coding->composing = 0;
2165 coding->direction = 0;
2166 coding->carryover_size = 0;
4ed46869 2167 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
a5d301df
KH
2168 coding->character_unification_table_for_decode = Qnil;
2169 coding->character_unification_table_for_encode = Qnil;
4ed46869 2170
e0e989f6
KH
2171 Vlast_coding_system_used = coding->symbol = coding_system;
2172 eol_type = Qnil;
2173 /* Get value of property `coding-system' until we get a vector.
2174 While doing that, also get values of properties
a5d301df
KH
2175 `post-read-conversion', `pre-write-conversion',
2176 `character-unification-table-for-decode',
2177 `character-unification-table-for-encode' and `eol-type'. */
e0e989f6 2178 while (!NILP (coding_system) && SYMBOLP (coding_system))
4ed46869 2179 {
4ed46869 2180 if (NILP (coding->post_read_conversion))
e0e989f6 2181 coding->post_read_conversion = Fget (coding_system,
4ed46869 2182 Qpost_read_conversion);
e0e989f6
KH
2183 if (NILP (coding->pre_write_conversion))
2184 coding->pre_write_conversion = Fget (coding_system,
4ed46869 2185 Qpre_write_conversion);
e0e989f6
KH
2186 if (NILP (eol_type))
2187 eol_type = Fget (coding_system, Qeol_type);
a5d301df
KH
2188
2189 if (NILP (coding->character_unification_table_for_decode))
2190 coding->character_unification_table_for_decode
2191 = Fget (coding_system, Qcharacter_unification_table_for_decode);
2192
2193 if (NILP (coding->character_unification_table_for_encode))
2194 coding->character_unification_table_for_encode
2195 = Fget (coding_system, Qcharacter_unification_table_for_encode);
2196
e0e989f6 2197 coding_system = Fget (coding_system, Qcoding_system);
4ed46869 2198 }
a5d301df
KH
2199
2200 while (!NILP (coding->character_unification_table_for_decode)
2201 && SYMBOLP (coding->character_unification_table_for_decode))
2202 coding->character_unification_table_for_decode
2203 = Fget (coding->character_unification_table_for_decode,
2204 Qcharacter_unification_table_for_decode);
2205 if (!NILP (coding->character_unification_table_for_decode)
2206 && !CHAR_TABLE_P (coding->character_unification_table_for_decode))
2207 coding->character_unification_table_for_decode = Qnil;
2208
2209 while (!NILP (coding->character_unification_table_for_encode)
2210 && SYMBOLP (coding->character_unification_table_for_encode))
2211 coding->character_unification_table_for_encode
2212 = Fget (coding->character_unification_table_for_encode,
2213 Qcharacter_unification_table_for_encode);
2214 if (!NILP (coding->character_unification_table_for_encode)
2215 && !CHAR_TABLE_P (coding->character_unification_table_for_encode))
2216 coding->character_unification_table_for_encode = Qnil;
2217
e0e989f6
KH
2218 if (!VECTORP (coding_system)
2219 || XVECTOR (coding_system)->size != 5)
4ed46869
KH
2220 goto label_invalid_coding_system;
2221
4ed46869 2222 if (VECTORP (eol_type))
0ef69138 2223 coding->eol_type = CODING_EOL_UNDECIDED;
4ed46869
KH
2224 else if (XFASTINT (eol_type) == 1)
2225 coding->eol_type = CODING_EOL_CRLF;
2226 else if (XFASTINT (eol_type) == 2)
2227 coding->eol_type = CODING_EOL_CR;
2228 else
2229 coding->eol_type = CODING_EOL_LF;
2230
e0e989f6 2231 type = XVECTOR (coding_system)->contents[0];
4ed46869
KH
2232 switch (XFASTINT (type))
2233 {
2234 case 0:
0ef69138 2235 coding->type = coding_type_emacs_mule;
4ed46869
KH
2236 break;
2237
2238 case 1:
2239 coding->type = coding_type_sjis;
2240 break;
2241
2242 case 2:
2243 coding->type = coding_type_iso2022;
2244 {
e0e989f6 2245 Lisp_Object val = XVECTOR (coding_system)->contents[4];
4ed46869
KH
2246 Lisp_Object *flags;
2247 int i, charset, default_reg_bits = 0;
2248
2249 if (!VECTORP (val) || XVECTOR (val)->size != 32)
2250 goto label_invalid_coding_system;
2251
2252 flags = XVECTOR (val)->contents;
2253 coding->flags
2254 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2255 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2256 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2257 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2258 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2259 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2260 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2261 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
e0e989f6
KH
2262 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2263 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2264 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL));
4ed46869
KH
2265
2266 /* Invoke graphic register 0 to plane 0. */
2267 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2268 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2269 CODING_SPEC_ISO_INVOCATION (coding, 1)
2270 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2271 /* Not single shifting at first. */
2272 CODING_SPEC_ISO_SINGLE_SHIFTING(coding) = 0;
e0e989f6
KH
2273 /* Beginning of buffer should also be regarded as bol. */
2274 CODING_SPEC_ISO_BOL(coding) = 1;
4ed46869
KH
2275
2276 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2277 FLAGS[REG] can be one of below:
2278 integer CHARSET: CHARSET occupies register I,
2279 t: designate nothing to REG initially, but can be used
2280 by any charsets,
2281 list of integer, nil, or t: designate the first
2282 element (if integer) to REG initially, the remaining
2283 elements (if integer) is designated to REG on request,
2284 if an element is t, REG can be used by any charset,
2285 nil: REG is never used. */
467e7675 2286 for (charset = 0; charset <= MAX_CHARSET; charset++)
1ba9e4ab
KH
2287 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2288 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
4ed46869
KH
2289 for (i = 0; i < 4; i++)
2290 {
2291 if (INTEGERP (flags[i])
e0e989f6
KH
2292 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2293 || (charset = get_charset_id (flags[i])) >= 0)
4ed46869
KH
2294 {
2295 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2296 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2297 }
2298 else if (EQ (flags[i], Qt))
2299 {
2300 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2301 default_reg_bits |= 1 << i;
2302 }
2303 else if (CONSP (flags[i]))
2304 {
2305 Lisp_Object tail = flags[i];
2306
2307 if (INTEGERP (XCONS (tail)->car)
2308 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
2309 CHARSET_VALID_P (charset))
2310 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
4ed46869
KH
2311 {
2312 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2313 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2314 }
2315 else
2316 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2317 tail = XCONS (tail)->cdr;
2318 while (CONSP (tail))
2319 {
2320 if (INTEGERP (XCONS (tail)->car)
2321 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
2322 CHARSET_VALID_P (charset))
2323 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
4ed46869
KH
2324 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2325 = i;
2326 else if (EQ (XCONS (tail)->car, Qt))
2327 default_reg_bits |= 1 << i;
2328 tail = XCONS (tail)->cdr;
2329 }
2330 }
2331 else
2332 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2333
2334 CODING_SPEC_ISO_DESIGNATION (coding, i)
2335 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2336 }
2337
2338 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2339 {
2340 /* REG 1 can be used only by locking shift in 7-bit env. */
2341 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2342 default_reg_bits &= ~2;
2343 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2344 /* Without any shifting, only REG 0 and 1 can be used. */
2345 default_reg_bits &= 3;
2346 }
2347
467e7675 2348 for (charset = 0; charset <= MAX_CHARSET; charset++)
4ed46869 2349 if (CHARSET_VALID_P (charset)
1ba9e4ab
KH
2350 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2351 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
4ed46869
KH
2352 {
2353 /* We have not yet decided where to designate CHARSET. */
2354 int reg_bits = default_reg_bits;
2355
2356 if (CHARSET_CHARS (charset) == 96)
2357 /* A charset of CHARS96 can't be designated to REG 0. */
2358 reg_bits &= ~1;
2359
2360 if (reg_bits)
2361 /* There exist some default graphic register. */
2362 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2363 = (reg_bits & 1
2364 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2365 else
2366 /* We anyway have to designate CHARSET to somewhere. */
2367 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2368 = (CHARSET_CHARS (charset) == 94
2369 ? 0
2370 : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2371 || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2372 ? 1
2373 : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2374 ? 2 : 0)));
2375 }
2376 }
2377 coding->require_flushing = 1;
2378 break;
2379
2380 case 3:
2381 coding->type = coding_type_big5;
2382 coding->flags
e0e989f6 2383 = (NILP (XVECTOR (coding_system)->contents[4])
4ed46869
KH
2384 ? CODING_FLAG_BIG5_HKU
2385 : CODING_FLAG_BIG5_ETEN);
2386 break;
2387
2388 case 4:
2389 coding->type = coding_type_ccl;
2390 {
e0e989f6 2391 Lisp_Object val = XVECTOR (coding_system)->contents[4];
4ed46869
KH
2392 if (CONSP (val)
2393 && VECTORP (XCONS (val)->car)
2394 && VECTORP (XCONS (val)->cdr))
2395 {
2396 setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2397 setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2398 }
2399 else
2400 goto label_invalid_coding_system;
2401 }
2402 coding->require_flushing = 1;
2403 break;
2404
2405 default:
2406 if (EQ (type, Qt))
0ef69138 2407 coding->type = coding_type_undecided;
4ed46869
KH
2408 else
2409 coding->type = coding_type_no_conversion;
2410 break;
2411 }
2412 return 0;
2413
2414 label_invalid_coding_system:
2415 coding->type = coding_type_no_conversion;
dec137e5 2416 coding->eol_type = CODING_EOL_LF;
e0e989f6
KH
2417 coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2418 = Qnil;
4ed46869
KH
2419 return -1;
2420}
2421
2422/* Emacs has a mechanism to automatically detect a coding system if it
2423 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
2424 it's impossible to distinguish some coding systems accurately
2425 because they use the same range of codes. So, at first, coding
2426 systems are categorized into 8 categories, which are:
2427
0ef69138 2428 o coding-category-emacs-mule
4ed46869
KH
2429
2430 The category for a coding system which has the same code range
2431 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 2432 symbol) `emacs-mule' by default.
4ed46869
KH
2433
2434 o coding-category-sjis
2435
2436 The category for a coding system which has the same code range
2437 as SJIS. Assigned the coding-system (Lisp
e0e989f6 2438 symbol) `shift-jis' by default.
4ed46869
KH
2439
2440 o coding-category-iso-7
2441
2442 The category for a coding system which has the same code range
2443 as ISO2022 of 7-bit environment. Assigned the coding-system
e0e989f6 2444 (Lisp symbol) `iso-2022-7' by default.
4ed46869
KH
2445
2446 o coding-category-iso-8-1
2447
2448 The category for a coding system which has the same code range
2449 as ISO2022 of 8-bit environment and graphic plane 1 used only
2450 for DIMENSION1 charset. Assigned the coding-system (Lisp
e0e989f6 2451 symbol) `iso-8859-1' by default.
4ed46869
KH
2452
2453 o coding-category-iso-8-2
2454
2455 The category for a coding system which has the same code range
2456 as ISO2022 of 8-bit environment and graphic plane 1 used only
2457 for DIMENSION2 charset. Assigned the coding-system (Lisp
e0e989f6 2458 symbol) `euc-japan' by default.
4ed46869
KH
2459
2460 o coding-category-iso-else
2461
2462 The category for a coding system which has the same code range
2463 as ISO2022 but not belongs to any of the above three
2464 categories. Assigned the coding-system (Lisp symbol)
e0e989f6 2465 `iso-2022-ss2-7' by default.
4ed46869
KH
2466
2467 o coding-category-big5
2468
2469 The category for a coding system which has the same code range
2470 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 2471 `cn-big5' by default.
4ed46869
KH
2472
2473 o coding-category-binary
2474
2475 The category for a coding system not categorized in any of the
2476 above. Assigned the coding-system (Lisp symbol)
e0e989f6 2477 `no-conversion' by default.
4ed46869
KH
2478
2479 Each of them is a Lisp symbol and the value is an actual
2480 `coding-system's (this is also a Lisp symbol) assigned by a user.
2481 What Emacs does actually is to detect a category of coding system.
2482 Then, it uses a `coding-system' assigned to it. If Emacs can't
2483 decide only one possible category, it selects a category of the
2484 highest priority. Priorities of categories are also specified by a
2485 user in a Lisp variable `coding-category-list'.
2486
2487*/
2488
2489/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2490 If it detects possible coding systems, return an integer in which
2491 appropriate flag bits are set. Flag bits are defined by macros
2492 CODING_CATEGORY_MASK_XXX in `coding.h'. */
2493
2494int
2495detect_coding_mask (src, src_bytes)
2496 unsigned char *src;
2497 int src_bytes;
2498{
2499 register unsigned char c;
2500 unsigned char *src_end = src + src_bytes;
2501 int mask;
2502
2503 /* At first, skip all ASCII characters and control characters except
2504 for three ISO2022 specific control characters. */
bcf26d6a 2505 label_loop_detect_coding:
4ed46869
KH
2506 while (src < src_end)
2507 {
2508 c = *src;
2509 if (c >= 0x80
2510 || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2511 break;
2512 src++;
2513 }
2514
2515 if (src >= src_end)
2516 /* We found nothing other than ASCII. There's nothing to do. */
2517 return CODING_CATEGORY_MASK_ANY;
2518
2519 /* The text seems to be encoded in some multilingual coding system.
2520 Now, try to find in which coding system the text is encoded. */
2521 if (c < 0x80)
bcf26d6a
KH
2522 {
2523 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2524 /* C is an ISO2022 specific control code of C0. */
2525 mask = detect_coding_iso2022 (src, src_end);
2526 src++;
2527 if (mask == CODING_CATEGORY_MASK_ANY)
2528 /* No valid ISO2022 code follows C. Try again. */
2529 goto label_loop_detect_coding;
2530 }
4ed46869
KH
2531 else if (c == ISO_CODE_SS2 || c == ISO_CODE_SS3 || c == ISO_CODE_CSI)
2532 /* C is an ISO2022 specific control code of C1,
2533 or the first byte of SJIS's 2-byte character code,
2534 or a leading code of Emacs. */
2535 mask = (detect_coding_iso2022 (src, src_end)
2536 | detect_coding_sjis (src, src_end)
0ef69138 2537 | detect_coding_emacs_mule (src, src_end));
4ed46869
KH
2538
2539 else if (c < 0xA0)
2540 /* C is the first byte of SJIS character code,
2541 or a leading-code of Emacs. */
2542 mask = (detect_coding_sjis (src, src_end)
0ef69138 2543 | detect_coding_emacs_mule (src, src_end));
4ed46869
KH
2544
2545 else
2546 /* C is a character of ISO2022 in graphic plane right,
2547 or a SJIS's 1-byte character code (i.e. JISX0201),
2548 or the first byte of BIG5's 2-byte code. */
2549 mask = (detect_coding_iso2022 (src, src_end)
2550 | detect_coding_sjis (src, src_end)
2551 | detect_coding_big5 (src, src_end));
2552
2553 return mask;
2554}
2555
2556/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2557 The information of the detected coding system is set in CODING. */
2558
2559void
2560detect_coding (coding, src, src_bytes)
2561 struct coding_system *coding;
2562 unsigned char *src;
2563 int src_bytes;
2564{
2565 int mask = detect_coding_mask (src, src_bytes);
2566 int idx;
2567
2568 if (mask == CODING_CATEGORY_MASK_ANY)
2569 /* We found nothing other than ASCII. There's nothing to do. */
2570 return;
2571
2572 if (!mask)
2573 /* The source text seems to be encoded in unknown coding system.
2574 Emacs regards the category of such a kind of coding system as
2575 `coding-category-binary'. We assume that a user has assigned
2576 an appropriate coding system for a `coding-category-binary'. */
2577 idx = CODING_CATEGORY_IDX_BINARY;
2578 else
2579 {
2580 /* We found some plausible coding systems. Let's use a coding
2581 system of the highest priority. */
2582 Lisp_Object val = Vcoding_category_list;
2583
2584 if (CONSP (val))
2585 while (!NILP (val))
2586 {
2587 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2588 if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2589 break;
2590 val = XCONS (val)->cdr;
2591 }
2592 else
2593 val = Qnil;
2594
2595 if (NILP (val))
2596 {
2597 /* For unknown reason, `Vcoding_category_list' contains none
2598 of found categories. Let's use any of them. */
2599 for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2600 if (mask & (1 << idx))
2601 break;
2602 }
2603 }
2604 setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2605}
2606
2607/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2608 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
0ef69138 2609 CODING_EOL_CR, and CODING_EOL_UNDECIDED. */
4ed46869
KH
2610
2611int
2612detect_eol_type (src, src_bytes)
2613 unsigned char *src;
2614 int src_bytes;
2615{
2616 unsigned char *src_end = src + src_bytes;
2617 unsigned char c;
2618
2619 while (src < src_end)
2620 {
2621 c = *src++;
2622 if (c == '\n')
2623 return CODING_EOL_LF;
2624 else if (c == '\r')
2625 {
2626 if (src < src_end && *src == '\n')
2627 return CODING_EOL_CRLF;
2628 else
2629 return CODING_EOL_CR;
2630 }
2631 }
0ef69138 2632 return CODING_EOL_UNDECIDED;
4ed46869
KH
2633}
2634
2635/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2636 is encoded. If it detects an appropriate format of end-of-line, it
2637 sets the information in *CODING. */
2638
2639void
2640detect_eol (coding, src, src_bytes)
2641 struct coding_system *coding;
2642 unsigned char *src;
2643 int src_bytes;
2644{
2645 Lisp_Object val;
2646 int eol_type = detect_eol_type (src, src_bytes);
2647
0ef69138 2648 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2649 /* We found no end-of-line in the source text. */
2650 return;
2651
2652 val = Fget (coding->symbol, Qeol_type);
2653 if (VECTORP (val) && XVECTOR (val)->size == 3)
2654 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2655}
2656
2657/* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
2658 decoding, it may detect coding system and format of end-of-line if
2659 those are not yet decided. */
2660
2661int
2662decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2663 struct coding_system *coding;
2664 unsigned char *source, *destination;
2665 int src_bytes, dst_bytes;
2666 int *consumed;
2667{
2668 int produced;
2669
2670 if (src_bytes <= 0)
2671 {
2672 *consumed = 0;
2673 return 0;
2674 }
2675
0ef69138 2676 if (coding->type == coding_type_undecided)
4ed46869
KH
2677 detect_coding (coding, source, src_bytes);
2678
0ef69138 2679 if (coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2680 detect_eol (coding, source, src_bytes);
2681
2682 coding->carryover_size = 0;
2683 switch (coding->type)
2684 {
2685 case coding_type_no_conversion:
2686 label_no_conversion:
2687 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2688 bcopy (source, destination, produced);
2689 *consumed = produced;
2690 break;
2691
0ef69138
KH
2692 case coding_type_emacs_mule:
2693 case coding_type_undecided:
4ed46869 2694 if (coding->eol_type == CODING_EOL_LF
0ef69138 2695 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2696 goto label_no_conversion;
2697 produced = decode_eol (coding, source, destination,
2698 src_bytes, dst_bytes, consumed);
2699 break;
2700
2701 case coding_type_sjis:
2702 produced = decode_coding_sjis_big5 (coding, source, destination,
2703 src_bytes, dst_bytes, consumed,
2704 1);
2705 break;
2706
2707 case coding_type_iso2022:
2708 produced = decode_coding_iso2022 (coding, source, destination,
2709 src_bytes, dst_bytes, consumed);
2710 break;
2711
2712 case coding_type_big5:
2713 produced = decode_coding_sjis_big5 (coding, source, destination,
2714 src_bytes, dst_bytes, consumed,
2715 0);
2716 break;
2717
2718 case coding_type_ccl:
2719 produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2720 src_bytes, dst_bytes, consumed);
2721 break;
2722 }
2723
2724 return produced;
2725}
2726
2727/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
2728
2729int
2730encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2731 struct coding_system *coding;
2732 unsigned char *source, *destination;
2733 int src_bytes, dst_bytes;
2734 int *consumed;
2735{
2736 int produced;
2737
2738 coding->carryover_size = 0;
2739 switch (coding->type)
2740 {
2741 case coding_type_no_conversion:
2742 label_no_conversion:
2743 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2744 if (produced > 0)
2745 {
2746 bcopy (source, destination, produced);
2747 if (coding->selective)
2748 {
2749 unsigned char *p = destination, *pend = destination + produced;
2750 while (p < pend)
e0e989f6 2751 if (*p++ == '\015') p[-1] = '\n';
4ed46869
KH
2752 }
2753 }
2754 *consumed = produced;
2755 break;
2756
0ef69138
KH
2757 case coding_type_emacs_mule:
2758 case coding_type_undecided:
4ed46869 2759 if (coding->eol_type == CODING_EOL_LF
0ef69138 2760 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2761 goto label_no_conversion;
2762 produced = encode_eol (coding, source, destination,
2763 src_bytes, dst_bytes, consumed);
2764 break;
2765
2766 case coding_type_sjis:
2767 produced = encode_coding_sjis_big5 (coding, source, destination,
2768 src_bytes, dst_bytes, consumed,
2769 1);
2770 break;
2771
2772 case coding_type_iso2022:
2773 produced = encode_coding_iso2022 (coding, source, destination,
2774 src_bytes, dst_bytes, consumed);
2775 break;
2776
2777 case coding_type_big5:
2778 produced = encode_coding_sjis_big5 (coding, source, destination,
2779 src_bytes, dst_bytes, consumed,
2780 0);
2781 break;
2782
2783 case coding_type_ccl:
2784 produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2785 src_bytes, dst_bytes, consumed);
2786 break;
2787 }
2788
2789 return produced;
2790}
2791
2792#define CONVERSION_BUFFER_EXTRA_ROOM 256
2793
2794/* Return maximum size (bytes) of a buffer enough for decoding
2795 SRC_BYTES of text encoded in CODING. */
2796
2797int
2798decoding_buffer_size (coding, src_bytes)
2799 struct coding_system *coding;
2800 int src_bytes;
2801{
2802 int magnification;
2803
2804 if (coding->type == coding_type_iso2022)
2805 magnification = 3;
2806 else if (coding->type == coding_type_ccl)
2807 magnification = coding->spec.ccl.decoder.buf_magnification;
2808 else
2809 magnification = 2;
2810
2811 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2812}
2813
2814/* Return maximum size (bytes) of a buffer enough for encoding
2815 SRC_BYTES of text to CODING. */
2816
2817int
2818encoding_buffer_size (coding, src_bytes)
2819 struct coding_system *coding;
2820 int src_bytes;
2821{
2822 int magnification;
2823
2824 if (coding->type == coding_type_ccl)
2825 magnification = coding->spec.ccl.encoder.buf_magnification;
2826 else
2827 magnification = 3;
2828
2829 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2830}
2831
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* Shared work buffer used for encoding and decoding, and its current
   size in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.  The shared buffer is reused when it is
   already big enough; otherwise it is replaced by a larger one (the
   old contents are NOT preserved).  If the allocation yields a null
   pointer, return a null pointer and leave the existing buffer and
   its recorded size untouched.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size * 2;

      /* Never start doubling from zero (or less): that would loop
         forever when we are called before the buffer has been set
         up.  */
      if (real_size < MINIMUM_CONVERSION_BUFFER_SIZE)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;
      while (real_size < size)
        real_size *= 2;

      buf = (char *) xmalloc (real_size);
      if (!buf)
        /* Keep the old buffer and size so that the recorded size
           never describes a buffer we do not have.  */
        return (char *) 0;

      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
2860
2861\f
2862#ifdef emacs
2863/*** 7. Emacs Lisp library functions ***/
2864
02ba4723 2865DEFUN ("coding-system-spec", Fcoding_system_spec, Scoding_system_spec,
4ed46869 2866 1, 1, 0,
02ba4723 2867 "Return coding-spec of CODING-SYSTEM.\n\
4ed46869
KH
2868If CODING-SYSTEM is not a valid coding-system, return nil.")
2869 (obj)
2870 Lisp_Object obj;
2871{
2872 while (SYMBOLP (obj) && !NILP (obj))
2873 obj = Fget (obj, Qcoding_system);
2874 return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
2875 ? Qnil : obj);
2876}
2877
2878DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
2879 "Return t if OBJECT is nil or a coding-system.\n\
2880See document of make-coding-system for coding-system object.")
2881 (obj)
2882 Lisp_Object obj;
2883{
02ba4723 2884 return ((NILP (obj) || !NILP (Fcoding_system_spec (obj))) ? Qt : Qnil);
4ed46869
KH
2885}
2886
9d991de8
RS
2887DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
2888 Sread_non_nil_coding_system, 1, 1, 0,
e0e989f6 2889 "Read a coding system from the minibuffer, prompting with string PROMPT.")
4ed46869
KH
2890 (prompt)
2891 Lisp_Object prompt;
2892{
e0e989f6 2893 Lisp_Object val;
9d991de8
RS
2894 do
2895 {
02ba4723 2896 val = Fcompleting_read (prompt, Vobarray, Qcoding_system_spec,
9d991de8
RS
2897 Qt, Qnil, Qnil, Qnil);
2898 }
2899 while (XSTRING (val)->size == 0);
e0e989f6 2900 return (Fintern (val, Qnil));
4ed46869
KH
2901}
2902
2903DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
e0e989f6 2904 "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
4ed46869
KH
2905 (prompt)
2906 Lisp_Object prompt;
2907{
e0e989f6 2908 Lisp_Object val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
9d991de8 2909 Qt, Qnil, Qnil, Qnil);
e0e989f6 2910 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
2911}
2912
2913DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
2914 1, 1, 0,
2915 "Check validity of CODING-SYSTEM.\n\
2916If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
2917CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
2918The value of property should be a vector of length 5.")
2919 (coding_system)
2920 Lisp_Object coding_system;
2921{
2922 CHECK_SYMBOL (coding_system, 0);
2923 if (!NILP (Fcoding_system_p (coding_system)))
2924 return coding_system;
2925 while (1)
02ba4723 2926 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4ed46869
KH
2927}
2928
2929DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
2930 2, 2, 0,
2931 "Detect coding-system of the text in the region between START and END.\n\
2932Return a list of possible coding-systems ordered by priority.\n\
0ef69138 2933If only ASCII characters are found, it returns `undecided'\n\
4ed46869
KH
2934 or its subsidiary coding-system according to a detected end-of-line format.")
2935 (b, e)
2936 Lisp_Object b, e;
2937{
2938 int coding_mask, eol_type;
2939 Lisp_Object val;
2940 int beg, end;
2941
2942 validate_region (&b, &e);
2943 beg = XINT (b), end = XINT (e);
2944 if (beg < GPT && end >= GPT) move_gap (end);
2945
2946 coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
2947 eol_type = detect_eol_type (POS_ADDR (beg), end - beg);
2948
2949 if (coding_mask == CODING_CATEGORY_MASK_ANY)
2950 {
0ef69138
KH
2951 val = intern ("undecided");
2952 if (eol_type != CODING_EOL_UNDECIDED)
4ed46869
KH
2953 {
2954 Lisp_Object val2 = Fget (val, Qeol_type);
2955 if (VECTORP (val2))
2956 val = XVECTOR (val2)->contents[eol_type];
2957 }
2958 }
2959 else
2960 {
2961 Lisp_Object val2;
2962
2963 /* At first, gather possible coding-systems in VAL in a reverse
2964 order. */
2965 val = Qnil;
2966 for (val2 = Vcoding_category_list;
2967 !NILP (val2);
2968 val2 = XCONS (val2)->cdr)
2969 {
2970 int idx
2971 = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
2972 if (coding_mask & (1 << idx))
2973 val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
2974 }
2975
2976 /* Then, change the order of the list, while getting subsidiary
2977 coding-systems. */
2978 val2 = val;
2979 val = Qnil;
2980 for (; !NILP (val2); val2 = XCONS (val2)->cdr)
2981 {
0ef69138 2982 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2983 val = Fcons (XCONS (val2)->car, val);
2984 else
2985 {
2986 Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
2987 if (VECTORP (val3))
2988 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
2989 else
2990 val = Fcons (XCONS (val2)->car, val);
2991 }
2992 }
2993 }
2994
2995 return val;
2996}
2997
2998/* Scan text in the region between *BEGP and *ENDP, skip characters
2999 which we never have to encode to (iff ENCODEP is 1) or decode from
3000 coding system CODING at the head and tail, then set BEGP and ENDP
3001 to the addresses of start and end of the text we actually convert. */
3002
3003void
3004shrink_conversion_area (begp, endp, coding, encodep)
3005 unsigned char **begp, **endp;
3006 struct coding_system *coding;
3007 int encodep;
3008{
3009 register unsigned char *beg_addr = *begp, *end_addr = *endp;
3010
3011 if (coding->eol_type != CODING_EOL_LF
0ef69138 3012 && coding->eol_type != CODING_EOL_UNDECIDED)
4ed46869
KH
3013 /* Since we anyway have to convert end-of-line format, it is not
3014 worth skipping at most 100 bytes or so. */
3015 return;
3016
3017 if (encodep) /* for encoding */
3018 {
3019 switch (coding->type)
3020 {
3021 case coding_type_no_conversion:
0ef69138
KH
3022 case coding_type_emacs_mule:
3023 case coding_type_undecided:
4ed46869
KH
3024 /* We need no conversion. */
3025 *begp = *endp;
3026 return;
3027 case coding_type_ccl:
3028 /* We can't skip any data. */
3029 return;
e0e989f6
KH
3030 case coding_type_iso2022:
3031 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3032 {
3033 unsigned char *bol = beg_addr;
3034 while (beg_addr < end_addr && *beg_addr < 0x80)
3035 {
3036 beg_addr++;
3037 if (*(beg_addr - 1) == '\n')
3038 bol = beg_addr;
3039 }
3040 beg_addr = bol;
3041 goto label_skip_tail;
3042 }
3043 /* fall down ... */
4ed46869
KH
3044 default:
3045 /* We can skip all ASCII characters at the head and tail. */
3046 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
e0e989f6 3047 label_skip_tail:
4ed46869
KH
3048 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3049 break;
3050 }
3051 }
3052 else /* for decoding */
3053 {
3054 switch (coding->type)
3055 {
3056 case coding_type_no_conversion:
3057 /* We need no conversion. */
3058 *begp = *endp;
3059 return;
0ef69138 3060 case coding_type_emacs_mule:
4ed46869
KH
3061 if (coding->eol_type == CODING_EOL_LF)
3062 {
3063 /* We need no conversion. */
3064 *begp = *endp;
3065 return;
3066 }
3067 /* We can skip all but carriage-return. */
3068 while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
3069 while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
3070 break;
3071 case coding_type_sjis:
3072 case coding_type_big5:
3073 /* We can skip all ASCII characters at the head. */
3074 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3075 /* We can skip all ASCII characters at the tail except for
3076 the second byte of SJIS or BIG5 code. */
3077 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3078 if (end_addr != *endp)
3079 end_addr++;
3080 break;
3081 case coding_type_ccl:
3082 /* We can't skip any data. */
3083 return;
3084 default: /* i.e. case coding_type_iso2022: */
3085 {
3086 unsigned char c;
3087
3088 /* We can skip all ASCII characters except for a few
3089 control codes at the head. */
3090 while (beg_addr < end_addr && (c = *beg_addr) < 0x80
3091 && c != ISO_CODE_CR && c != ISO_CODE_SO
3092 && c != ISO_CODE_SI && c != ISO_CODE_ESC)
3093 beg_addr++;
3094 }
3095 break;
3096 }
3097 }
3098 *begp = beg_addr;
3099 *endp = end_addr;
3100 return;
3101}
3102
3103/* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3104 text between B and E. B and E are buffer position. */
3105
3106Lisp_Object
3107code_convert_region (b, e, coding, encodep)
3108 Lisp_Object b, e;
3109 struct coding_system *coding;
3110 int encodep;
3111{
3112 int beg, end, len, consumed, produced;
3113 char *buf;
3114 unsigned char *begp, *endp;
3115 int pos = PT;
3116
3117 validate_region (&b, &e);
3118 beg = XINT (b), end = XINT (e);
3119 if (beg < GPT && end >= GPT)
3120 move_gap (end);
3121
3122 if (encodep && !NILP (coding->pre_write_conversion))
3123 {
3124 /* We must call a pre-conversion function which may put a new
3125 text to be converted in a new buffer. */
3126 struct buffer *old = current_buffer, *new;
3127
3128 TEMP_SET_PT (beg);
3129 call2 (coding->pre_write_conversion, b, e);
3130 if (old != current_buffer)
3131 {
3132 /* Replace the original text by the text just generated. */
3133 len = ZV - BEGV;
3134 new = current_buffer;
3135 set_buffer_internal (old);
3136 del_range (beg, end);
3137 insert_from_buffer (new, 1, len, 0);
3138 end = beg + len;
3139 }
3140 }
3141
3142 /* We may be able to shrink the conversion region. */
3143 begp = POS_ADDR (beg); endp = begp + (end - beg);
3144 shrink_conversion_area (&begp, &endp, coding, encodep);
3145
3146 if (begp == endp)
3147 /* We need no conversion. */
3148 len = end - beg;
3149 else
3150 {
3151 beg += begp - POS_ADDR (beg);
3152 end = beg + (endp - begp);
3153
3154 if (encodep)
3155 len = encoding_buffer_size (coding, end - beg);
3156 else
3157 len = decoding_buffer_size (coding, end - beg);
3158 buf = get_conversion_buffer (len);
3159
3160 coding->last_block = 1;
3161 produced = (encodep
3162 ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3163 &consumed)
3164 : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3165 &consumed));
3166
3167 len = produced + (beg - XINT (b)) + (XINT (e) - end);
3168
3169 TEMP_SET_PT (beg);
3170 insert (buf, produced);
3171 del_range (PT, PT + end - beg);
3172 if (pos >= end)
3173 pos = PT + (pos - end);
3174 else if (pos > beg)
3175 pos = beg;
3176 TEMP_SET_PT (pos);
3177 }
3178
3179 if (!encodep && !NILP (coding->post_read_conversion))
3180 {
3181 /* We must call a post-conversion function which may alter
3182 the text just converted. */
3183 Lisp_Object insval;
3184
3185 beg = XINT (b);
3186 TEMP_SET_PT (beg);
3187 insval = call1 (coding->post_read_conversion, make_number (len));
3188 CHECK_NUMBER (insval, 0);
3189 len = XINT (insval);
3190 }
3191
3192 return make_number (len);
3193}
3194
3195Lisp_Object
e0e989f6
KH
3196code_convert_string (str, coding, encodep, nocopy)
3197 Lisp_Object str, nocopy;
4ed46869
KH
3198 struct coding_system *coding;
3199 int encodep;
3200{
3201 int len, consumed, produced;
3202 char *buf;
3203 unsigned char *begp, *endp;
3204 int head_skip, tail_skip;
3205 struct gcpro gcpro1;
3206
3207 if (encodep && !NILP (coding->pre_write_conversion)
3208 || !encodep && !NILP (coding->post_read_conversion))
3209 {
3210 /* Since we have to call Lisp functions which assume target text
3211 is in a buffer, after setting a temporary buffer, call
3212 code_convert_region. */
3213 int count = specpdl_ptr - specpdl;
3214 int len = XSTRING (str)->size;
3215 Lisp_Object result;
3216 struct buffer *old = current_buffer;
3217
3218 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3219 temp_output_buffer_setup (" *code-converting-work*");
3220 set_buffer_internal (XBUFFER (Vstandard_output));
3221 insert_from_string (str, 0, len, 0);
3222 code_convert_region (make_number (BEGV), make_number (ZV),
3223 coding, encodep);
3224 result = make_buffer_string (BEGV, ZV, 0);
3225 set_buffer_internal (old);
3226 return unbind_to (count, result);
3227 }
3228
3229 /* We may be able to shrink the conversion region. */
3230 begp = XSTRING (str)->data;
3231 endp = begp + XSTRING (str)->size;
3232 shrink_conversion_area (&begp, &endp, coding, encodep);
3233
3234 if (begp == endp)
3235 /* We need no conversion. */
e0e989f6 3236 return (NILP (nocopy) ? Fcopy_sequence (str) : str);
4ed46869
KH
3237
3238 head_skip = begp - XSTRING (str)->data;
3239 tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3240
3241 GCPRO1 (str);
3242
3243 if (encodep)
3244 len = encoding_buffer_size (coding, endp - begp);
3245 else
3246 len = decoding_buffer_size (coding, endp - begp);
3247 buf = get_conversion_buffer (len + head_skip + tail_skip);
3248
3249 bcopy (XSTRING (str)->data, buf, head_skip);
3250 coding->last_block = 1;
3251 produced = (encodep
3252 ? encode_coding (coding, XSTRING (str)->data + head_skip,
3253 buf + head_skip, endp - begp, len, &consumed)
3254 : decode_coding (coding, XSTRING (str)->data + head_skip,
3255 buf + head_skip, endp - begp, len, &consumed));
3256 bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3257 buf + head_skip + produced,
3258 tail_skip);
3259
3260 UNGCPRO;
3261
3262 return make_string (buf, head_skip + produced + tail_skip);
3263}
3264
3265DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
e0e989f6
KH
3266 3, 3, "r\nzCoding system: ",
3267 "Decode current region by specified coding system.\n\
3268When called from a program, takes three arguments:\n\
3269START, END, and CODING-SYSTEM. START END are buffer positions.\n\
4ed46869
KH
3270Return length of decoded text.")
3271 (b, e, coding_system)
3272 Lisp_Object b, e, coding_system;
3273{
3274 struct coding_system coding;
3275
3276 CHECK_NUMBER_COERCE_MARKER (b, 0);
3277 CHECK_NUMBER_COERCE_MARKER (e, 1);
3278 CHECK_SYMBOL (coding_system, 2);
3279
e0e989f6
KH
3280 if (NILP (coding_system))
3281 return make_number (XFASTINT (e) - XFASTINT (b));
4ed46869
KH
3282 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3283 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3284
3285 return code_convert_region (b, e, &coding, 0);
3286}
3287
3288DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
e0e989f6
KH
3289 3, 3, "r\nzCoding system: ",
3290 "Encode current region by specified coding system.\n\
3291When called from a program, takes three arguments:\n\
3292START, END, and CODING-SYSTEM. START END are buffer positions.\n\
4ed46869
KH
3293Return length of encoded text.")
3294 (b, e, coding_system)
3295 Lisp_Object b, e, coding_system;
3296{
3297 struct coding_system coding;
3298
3299 CHECK_NUMBER_COERCE_MARKER (b, 0);
3300 CHECK_NUMBER_COERCE_MARKER (e, 1);
3301 CHECK_SYMBOL (coding_system, 2);
3302
e0e989f6
KH
3303 if (NILP (coding_system))
3304 return make_number (XFASTINT (e) - XFASTINT (b));
4ed46869
KH
3305 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3306 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3307
3308 return code_convert_region (b, e, &coding, 1);
3309}
3310
3311DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
e0e989f6
KH
3312 2, 3, 0,
3313 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3314Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3315of decoding.")
3316 (string, coding_system, nocopy)
3317 Lisp_Object string, coding_system, nocopy;
4ed46869
KH
3318{
3319 struct coding_system coding;
3320
3321 CHECK_STRING (string, 0);
3322 CHECK_SYMBOL (coding_system, 1);
3323
e0e989f6
KH
3324 if (NILP (coding_system))
3325 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869
KH
3326 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3327 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3328
e0e989f6 3329 return code_convert_string (string, &coding, 0, nocopy);
4ed46869
KH
3330}
3331
3332DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
e0e989f6
KH
3333 2, 3, 0,
3334 "Encode STRING to CODING-SYSTEM, and return the result.\n\
3335Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3336of encoding.")
3337 (string, coding_system, nocopy)
3338 Lisp_Object string, coding_system, nocopy;
4ed46869
KH
3339{
3340 struct coding_system coding;
3341
3342 CHECK_STRING (string, 0);
3343 CHECK_SYMBOL (coding_system, 1);
3344
e0e989f6
KH
3345 if (NILP (coding_system))
3346 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869
KH
3347 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3348 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3349
e0e989f6 3350 return code_convert_string (string, &coding, 1, nocopy);
4ed46869
KH
3351}
3352
3353DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
e0e989f6 3354 "Decode a JISX0208 character of shift-jis encoding.\n\
4ed46869
KH
3355CODE is the character code in SJIS.\n\
3356Return the corresponding character.")
3357 (code)
3358 Lisp_Object code;
3359{
3360 unsigned char c1, c2, s1, s2;
3361 Lisp_Object val;
3362
3363 CHECK_NUMBER (code, 0);
3364 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3365 DECODE_SJIS (s1, s2, c1, c2);
3366 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3367 return val;
3368}
3369
3370DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3371 "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3372Return the corresponding character code in SJIS.")
3373 (ch)
3374 Lisp_Object ch;
3375{
bcf26d6a 3376 int charset, c1, c2, s1, s2;
4ed46869
KH
3377 Lisp_Object val;
3378
3379 CHECK_NUMBER (ch, 0);
3380 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3381 if (charset == charset_jisx0208)
3382 {
3383 ENCODE_SJIS (c1, c2, s1, s2);
bcf26d6a 3384 XSETFASTINT (val, (s1 << 8) | s2);
4ed46869
KH
3385 }
3386 else
3387 XSETFASTINT (val, 0);
3388 return val;
3389}
3390
3391DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3392 "Decode a Big5 character CODE of BIG5 coding-system.\n\
3393CODE is the character code in BIG5.\n\
3394Return the corresponding character.")
3395 (code)
3396 Lisp_Object code;
3397{
3398 int charset;
3399 unsigned char b1, b2, c1, c2;
3400 Lisp_Object val;
3401
3402 CHECK_NUMBER (code, 0);
3403 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3404 DECODE_BIG5 (b1, b2, charset, c1, c2);
3405 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3406 return val;
3407}
3408
3409DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3410 "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3411Return the corresponding character code in Big5.")
3412 (ch)
3413 Lisp_Object ch;
3414{
bcf26d6a 3415 int charset, c1, c2, b1, b2;
4ed46869
KH
3416 Lisp_Object val;
3417
3418 CHECK_NUMBER (ch, 0);
3419 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3420 if (charset == charset_big5_1 || charset == charset_big5_2)
3421 {
3422 ENCODE_BIG5 (charset, c1, c2, b1, b2);
bcf26d6a 3423 XSETFASTINT (val, (b1 << 8) | b2);
4ed46869
KH
3424 }
3425 else
3426 XSETFASTINT (val, 0);
3427 return val;
3428}
3429
1ba9e4ab
KH
3430DEFUN ("set-terminal-coding-system-internal",
3431 Fset_terminal_coding_system_internal,
3432 Sset_terminal_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
3433 (coding_system)
3434 Lisp_Object coding_system;
3435{
3436 CHECK_SYMBOL (coding_system, 0);
3437 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
4ed46869
KH
3438 return Qnil;
3439}
3440
3441DEFUN ("terminal-coding-system",
3442 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3443 "Return coding-system of your terminal.")
3444 ()
3445{
3446 return terminal_coding.symbol;
3447}
3448
1ba9e4ab
KH
3449DEFUN ("set-keyboard-coding-system-internal",
3450 Fset_keyboard_coding_system_internal,
3451 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
3452 (coding_system)
3453 Lisp_Object coding_system;
3454{
3455 CHECK_SYMBOL (coding_system, 0);
3456 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3457 return Qnil;
3458}
3459
3460DEFUN ("keyboard-coding-system",
3461 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3462 "Return coding-system of what is sent from terminal keyboard.")
3463 ()
3464{
3465 return keyboard_coding.symbol;
3466}
3467
3468\f
a5d301df
KH
3469DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
3470 Sfind_operation_coding_system, 1, MANY, 0,
3471 "Choose a coding system for an operation based on the target name.\n\
ccdb79f5
RS
3472The value names a pair of coding systems: (ENCODING-SYSTEM DECODING-SYSTEM).\n\
3473ENCODING-SYSTEM is the coding system to use for encoding\n\
3474\(in case OPERATION does encoding), and DECODING-SYSTEM is the coding system\n\
3475for decoding (in case OPERATION does decoding).\n\
3476\n\
3477The first argument OPERATION specifies an I/O primitive:\n\
3478 For file I/O, `insert-file-contents' or `write-region'.\n\
3479 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3480 For network I/O, `open-network-stream'.\n\
3481\n\
3482The remaining arguments should be the same arguments that were passed\n\
3483to the primitive. Depending on which primitive, one of those arguments\n\
3484is selected as the TARGET. For example, if OPERATION does file I/O,\n\
3485whichever argument specifies the file name is TARGET.\n\
3486\n\
3487TARGET has a meaning which depends on OPERATION:\n\
4ed46869
KH
3488 For file I/O, TARGET is a file name.\n\
3489 For process I/O, TARGET is a process name.\n\
3490 For network I/O, TARGET is a service name or a port number\n\
3491\n\
02ba4723
KH
3492This function looks up what specified for TARGET in,\n\
3493`file-coding-system-alist', `process-coding-system-alist',\n\
3494or `network-coding-system-alist' depending on OPERATION.\n\
3495They may specify a coding system, a cons of coding systems,\n\
3496or a function symbol to call.\n\
3497In the last case, we call the function with one argument,\n\
ccdb79f5 3498which is a list of all the arguments given to `find-coding-system'.")
4ed46869
KH
3499 (nargs, args)
3500 int nargs;
3501 Lisp_Object *args;
3502{
3503 Lisp_Object operation, target_idx, target, val;
3504 register Lisp_Object chain;
3505
3506 if (nargs < 2)
3507 error ("Too few arguments");
3508 operation = args[0];
3509 if (!SYMBOLP (operation)
3510 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3511 error ("Invalid first arguement");
3512 if (nargs < 1 + XINT (target_idx))
3513 error ("Too few arguments for operation: %s",
3514 XSYMBOL (operation)->name->data);
3515 target = args[XINT (target_idx) + 1];
3516 if (!(STRINGP (target)
3517 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3518 error ("Invalid %dth argument", XINT (target_idx) + 1);
3519
2e34157c
RS
3520 chain = ((EQ (operation, Qinsert_file_contents)
3521 || EQ (operation, Qwrite_region))
02ba4723 3522 ? Vfile_coding_system_alist
2e34157c 3523 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
3524 ? Vnetwork_coding_system_alist
3525 : Vprocess_coding_system_alist));
4ed46869
KH
3526 if (NILP (chain))
3527 return Qnil;
3528
02ba4723 3529 for (; CONSP (chain); chain = XCONS (chain)->cdr)
4ed46869
KH
3530 {
3531 Lisp_Object elt = XCONS (chain)->car;
3532
3533 if (CONSP (elt)
3534 && ((STRINGP (target)
3535 && STRINGP (XCONS (elt)->car)
3536 && fast_string_match (XCONS (elt)->car, target) >= 0)
3537 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
02ba4723
KH
3538 {
3539 val = XCONS (elt)->cdr;
3540 if (CONSP (val))
3541 return val;
3542 if (! SYMBOLP (val))
3543 return Qnil;
3544 if (! NILP (Fcoding_system_p (val)))
3545 return Fcons (val, val);
3546 if (!NILP (Fboundp (val)))
3547 return call2 (val, Flist (nargs, args));
3548 return Qnil;
3549 }
4ed46869
KH
3550 }
3551 return Qnil;
3552}
3553
3554#endif /* emacs */
3555
3556\f
3557/*** 8. Post-amble ***/
3558
3559init_coding_once ()
3560{
3561 int i;
3562
0ef69138 3563 /* Emacs' internal format specific initialize routine. */
4ed46869
KH
3564 for (i = 0; i <= 0x20; i++)
3565 emacs_code_class[i] = EMACS_control_code;
3566 emacs_code_class[0x0A] = EMACS_linefeed_code;
3567 emacs_code_class[0x0D] = EMACS_carriage_return_code;
3568 for (i = 0x21 ; i < 0x7F; i++)
3569 emacs_code_class[i] = EMACS_ascii_code;
3570 emacs_code_class[0x7F] = EMACS_control_code;
3571 emacs_code_class[0x80] = EMACS_leading_code_composition;
3572 for (i = 0x81; i < 0xFF; i++)
3573 emacs_code_class[i] = EMACS_invalid_code;
3574 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3575 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3576 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3577 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3578
3579 /* ISO2022 specific initialize routine. */
3580 for (i = 0; i < 0x20; i++)
3581 iso_code_class[i] = ISO_control_code;
3582 for (i = 0x21; i < 0x7F; i++)
3583 iso_code_class[i] = ISO_graphic_plane_0;
3584 for (i = 0x80; i < 0xA0; i++)
3585 iso_code_class[i] = ISO_control_code;
3586 for (i = 0xA1; i < 0xFF; i++)
3587 iso_code_class[i] = ISO_graphic_plane_1;
3588 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3589 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3590 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3591 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3592 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3593 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3594 iso_code_class[ISO_CODE_ESC] = ISO_escape;
3595 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3596 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3597 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3598
e0e989f6
KH
3599 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3600 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3601
3602 setup_coding_system (Qnil, &keyboard_coding);
3603 setup_coding_system (Qnil, &terminal_coding);
3604}
3605
3606#ifdef emacs
3607
3608syms_of_coding ()
3609{
3610 Qtarget_idx = intern ("target-idx");
3611 staticpro (&Qtarget_idx);
3612
3613 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3614 Fput (Qwrite_region, Qtarget_idx, make_number (2));
3615
3616 Qcall_process = intern ("call-process");
3617 staticpro (&Qcall_process);
3618 Fput (Qcall_process, Qtarget_idx, make_number (0));
3619
3620 Qcall_process_region = intern ("call-process-region");
3621 staticpro (&Qcall_process_region);
3622 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3623
3624 Qstart_process = intern ("start-process");
3625 staticpro (&Qstart_process);
3626 Fput (Qstart_process, Qtarget_idx, make_number (2));
3627
3628 Qopen_network_stream = intern ("open-network-stream");
3629 staticpro (&Qopen_network_stream);
3630 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3631
4ed46869
KH
3632 Qcoding_system = intern ("coding-system");
3633 staticpro (&Qcoding_system);
3634
3635 Qeol_type = intern ("eol-type");
3636 staticpro (&Qeol_type);
3637
3638 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3639 staticpro (&Qbuffer_file_coding_system);
3640
3641 Qpost_read_conversion = intern ("post-read-conversion");
3642 staticpro (&Qpost_read_conversion);
3643
3644 Qpre_write_conversion = intern ("pre-write-conversion");
3645 staticpro (&Qpre_write_conversion);
3646
02ba4723
KH
3647 Qcoding_system_spec = intern ("coding-system-spec");
3648 staticpro (&Qcoding_system_spec);
4ed46869
KH
3649
3650 Qcoding_system_p = intern ("coding-system-p");
3651 staticpro (&Qcoding_system_p);
3652
3653 Qcoding_system_error = intern ("coding-system-error");
3654 staticpro (&Qcoding_system_error);
3655
3656 Fput (Qcoding_system_error, Qerror_conditions,
3657 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3658 Fput (Qcoding_system_error, Qerror_message,
3659 build_string ("Coding-system error"));
3660
3661 Qcoding_category_index = intern ("coding-category-index");
3662 staticpro (&Qcoding_category_index);
3663
3664 {
3665 int i;
3666 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3667 {
3668 coding_category_table[i] = intern (coding_category_name[i]);
3669 staticpro (&coding_category_table[i]);
3670 Fput (coding_category_table[i], Qcoding_category_index,
3671 make_number (i));
3672 }
3673 }
3674
bdd9fb48
KH
3675 Qcharacter_unification_table = intern ("character-unification-table");
3676 staticpro (&Qcharacter_unification_table);
3677 Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
3678 make_number (0));
3679
a5d301df
KH
3680 Qcharacter_unification_table_for_decode
3681 = intern ("character-unification-table-for-decode");
3682 staticpro (&Qcharacter_unification_table_for_decode);
3683
3684 Qcharacter_unification_table_for_encode
3685 = intern ("character-unification-table-for-encode");
3686 staticpro (&Qcharacter_unification_table_for_encode);
3687
02ba4723 3688 defsubr (&Scoding_system_spec);
4ed46869
KH
3689 defsubr (&Scoding_system_p);
3690 defsubr (&Sread_coding_system);
3691 defsubr (&Sread_non_nil_coding_system);
3692 defsubr (&Scheck_coding_system);
3693 defsubr (&Sdetect_coding_region);
3694 defsubr (&Sdecode_coding_region);
3695 defsubr (&Sencode_coding_region);
3696 defsubr (&Sdecode_coding_string);
3697 defsubr (&Sencode_coding_string);
3698 defsubr (&Sdecode_sjis_char);
3699 defsubr (&Sencode_sjis_char);
3700 defsubr (&Sdecode_big5_char);
3701 defsubr (&Sencode_big5_char);
1ba9e4ab 3702 defsubr (&Sset_terminal_coding_system_internal);
4ed46869 3703 defsubr (&Sterminal_coding_system);
1ba9e4ab 3704 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 3705 defsubr (&Skeyboard_coding_system);
a5d301df 3706 defsubr (&Sfind_operation_coding_system);
4ed46869
KH
3707
3708 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3709 "List of coding-categories (symbols) ordered by priority.");
3710 {
3711 int i;
3712
3713 Vcoding_category_list = Qnil;
3714 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3715 Vcoding_category_list
3716 = Fcons (coding_category_table[i], Vcoding_category_list);
3717 }
3718
3719 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
3720 "A variable of internal use only.\n\
3721If the value is a coding system, it is used for decoding on read operation.\n\
3722If not, an appropriate element in `coding-system-alist' (which see) is used.");
3723 Vcoding_system_for_read = Qnil;
3724
3725 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
3726 "A variable of internal use only.\n\
3727If the value is a coding system, it is used for encoding on write operation.\n\
3728If not, an appropriate element in `coding-system-alist' (which see) is used.");
3729 Vcoding_system_for_write = Qnil;
3730
3731 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
3732 "Coding-system used in the latest file or process I/O.");
3733 Vlast_coding_system_used = Qnil;
3734
02ba4723
KH
3735 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
3736 "Alist to decide a coding system to use for a file I/O operation.\n\
3737The format is ((PATTERN . VAL) ...),\n\
3738where PATTERN is a regular expression matching a file name,\n\
3739VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3740If VAL is a coding system, it is used for both decoding and encoding\n\
3741the file contents.\n\
3742If VAL is a cons of coding systems, the car part is used for decoding,\n\
3743and the cdr part is used for encoding.\n\
3744If VAL is a function symbol, the function must return a coding system\n\
3745or a cons of coding systems which are used as above.\n\
e0e989f6 3746\n\
02ba4723
KH
3747See also the function `find-coding-system'.");
3748 Vfile_coding_system_alist = Qnil;
3749
3750 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
3751 "Alist to decide a coding system to use for a process I/O operation.\n\
3752The format is ((PATTERN . VAL) ...),\n\
3753where PATTERN is a regular expression matching a program name,\n\
3754VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3755If VAL is a coding system, it is used for both decoding what received\n\
3756from the program and encoding what sent to the program.\n\
3757If VAL is a cons of coding systems, the car part is used for decoding,\n\
3758and the cdr part is used for encoding.\n\
3759If VAL is a function symbol, the function must return a coding system\n\
3760or a cons of coding systems which are used as above.\n\
4ed46869 3761\n\
02ba4723
KH
3762See also the function `find-coding-system'.");
3763 Vprocess_coding_system_alist = Qnil;
3764
3765 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
3766 "Alist to decide a coding system to use for a network I/O operation.\n\
3767The format is ((PATTERN . VAL) ...),\n\
3768where PATTERN is a regular expression matching a network service name\n\
3769or is a port number to connect to,\n\
3770VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3771If VAL is a coding system, it is used for both decoding what received\n\
3772from the network stream and encoding what sent to the network stream.\n\
3773If VAL is a cons of coding systems, the car part is used for decoding,\n\
3774and the cdr part is used for encoding.\n\
3775If VAL is a function symbol, the function must return a coding system\n\
3776or a cons of coding systems which are used as above.\n\
4ed46869 3777\n\
02ba4723
KH
3778See also the function `find-coding-system'.");
3779 Vnetwork_coding_system_alist = Qnil;
4ed46869
KH
3780
3781 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
3782 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF).");
458822a0 3783 eol_mnemonic_unix = ':';
4ed46869
KH
3784
3785 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
3786 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
458822a0 3787 eol_mnemonic_dos = '\\';
4ed46869
KH
3788
3789 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
3790 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
458822a0 3791 eol_mnemonic_mac = '/';
4ed46869
KH
3792
3793 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
3794 "Mnemonic character indicating end-of-line format is not yet decided.");
458822a0 3795 eol_mnemonic_undecided = ':';
4ed46869 3796
bdd9fb48
KH
3797 DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
3798 "Non-nil means ISO 2022 encoder/decoder do character unification.");
3799 Venable_character_unification = Qt;
3800
a5d301df
KH
3801 DEFVAR_LISP ("standard-character-unification-table-for-decode",
3802 &Vstandard_character_unification_table_for_decode,
bdd9fb48 3803 "Table for unifying characters when reading.");
a5d301df 3804 Vstandard_character_unification_table_for_decode = Qnil;
bdd9fb48 3805
a5d301df
KH
3806 DEFVAR_LISP ("standard-character-unification-table-for-encode",
3807 &Vstandard_character_unification_table_for_encode,
bdd9fb48 3808 "Table for unifying characters when writing.");
a5d301df 3809 Vstandard_character_unification_table_for_encode = Qnil;
4ed46869
KH
3810
3811 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
3812 "Alist of charsets vs revision numbers.\n\
3813While encoding, if a charset (car part of an element) is found,\n\
3814designate it with the escape sequence identifying revision (cdr part of the element).");
3815 Vcharset_revision_alist = Qnil;
02ba4723
KH
3816
3817 DEFVAR_LISP ("default-process-coding-system",
3818 &Vdefault_process_coding_system,
3819 "Cons of coding systems used for process I/O by default.\n\
3820The car part is used for decoding a process output,\n\
3821the cdr part is used for encoding a text to be sent to a process.");
3822 Vdefault_process_coding_system = Qnil;
4ed46869
KH
3823}
3824
3825#endif /* emacs */