(Ffind_coding_system): Use call1, not call2.
[bpt/emacs.git] / src / coding.c
CommitLineData
4ed46869 1/* Coding system handler (conversion, detection, and etc).
203cb916
RS
2 Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4ed46869 4
369314dc
KH
5This file is part of GNU Emacs.
6
7GNU Emacs is free software; you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation; either version 2, or (at your option)
10any later version.
4ed46869 11
369314dc
KH
12GNU Emacs is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
4ed46869 16
369314dc
KH
17You should have received a copy of the GNU General Public License
18along with GNU Emacs; see the file COPYING. If not, write to
19the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20Boston, MA 02111-1307, USA. */
4ed46869
KH
21
22/*** TABLE OF CONTENTS ***
23
24 1. Preamble
0ef69138 25 2. Emacs' internal format (emacs-mule) handlers
4ed46869
KH
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
28 5. End-of-line handlers
29 6. C library functions
30 7. Emacs Lisp library functions
31 8. Post-amble
32
33*/
34
35/*** GENERAL NOTE on CODING SYSTEM ***
36
37 Coding system is an encoding mechanism of one or more character
38 sets. Here's a list of coding systems which Emacs can handle. When
39 we say "decode", it means converting some other coding system to
0ef69138
KH
40 Emacs' internal format (emacs-internal), and when we say "encode",
41 it means converting the coding system emacs-mule to some other
42 coding system.
4ed46869 43
0ef69138 44 0. Emacs' internal format (emacs-mule)
4ed46869
KH
45
46 Emacs itself holds a multi-lingual character in a buffer and a string
47 in a special format. Details are described in the section 2.
48
49 1. ISO2022
50
51 The most famous coding system for multiple character sets. X's
52 Compound Text, various EUCs (Extended Unix Code), and such coding
53 systems used in Internet communication as ISO-2022-JP are all
54 variants of ISO2022. Details are described in the section 3.
55
56 2. SJIS (or Shift-JIS or MS-Kanji-Code)
57
58 A coding system to encode character sets: ASCII, JISX0201, and
59 JISX0208. Widely used for PC's in Japan. Details are described in
60 the section 4.
61
62 3. BIG5
63
64 A coding system to encode character sets: ASCII and Big5. Widely
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
66 described in the section 4. In this file, when written as "BIG5"
67 (all uppercase), it means the coding system, and when written as
68 "Big5" (capitalized), it means the character set.
69
70 4. Else
71
72 If a user want to read/write a text encoded in a coding system not
73 listed above, he can supply a decoder and an encoder for it in CCL
74 (Code Conversion Language) programs. Emacs executes the CCL program
75 while reading/writing.
76
77 Emacs represent a coding-system by a Lisp symbol that has a property
78 `coding-system'. But, before actually using the coding-system, the
79 information about it is set in a structure of type `struct
80 coding_system' for rapid processing. See the section 6 for more
81 detail.
82
83*/
84
85/*** GENERAL NOTES on END-OF-LINE FORMAT ***
86
87 How end-of-line of a text is encoded depends on a system. For
88 instance, Unix's format is just one byte of `line-feed' code,
89 whereas DOS's format is two bytes sequence of `carriage-return' and
90 `line-feed' codes. MacOS's format is one byte of `carriage-return'.
91
92 Since how characters in a text is encoded and how end-of-line is
93 encoded is independent, any coding system described above can take
94 any format of end-of-line. So, Emacs has information of format of
95 end-of-line in each coding-system. See the section 6 for more
96 detail.
97
98*/
99
100/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
101
102 These functions check if a text between SRC and SRC_END is encoded
103 in the coding system category XXX. Each returns an integer value in
104 which appropriate flag bits for the category XXX is set. The flag
105 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
106 template of these functions. */
107#if 0
108int
0ef69138 109detect_coding_emacs_mule (src, src_end)
4ed46869
KH
110 unsigned char *src, *src_end;
111{
112 ...
113}
114#endif
115
116/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
117
118 These functions decode SRC_BYTES length text at SOURCE encoded in
0ef69138
KH
119 CODING to Emacs' internal format (emacs-mule). The resulting text
120 goes to a place pointed by DESTINATION, the length of which should
121 not exceed DST_BYTES. The bytes actually processed is returned as
122 *CONSUMED. The return value is the length of the decoded text.
123 Below is a template of these functions. */
4ed46869
KH
124#if 0
125decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
126 struct coding_system *coding;
127 unsigned char *source, *destination;
128 int src_bytes, dst_bytes;
129 int *consumed;
130{
131 ...
132}
133#endif
134
135/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
136
0ef69138
KH
137 These functions encode SRC_BYTES length text at SOURCE of Emacs'
138 internal format (emacs-mule) to CODING. The resulting text goes to
139 a place pointed by DESTINATION, the length of which should not
140 exceed DST_BYTES. The bytes actually processed is returned as
141 *CONSUMED. The return value is the length of the encoded text.
142 Below is a template of these functions. */
4ed46869
KH
143#if 0
144encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
145 struct coding_system *coding;
146 unsigned char *source, *destination;
147 int src_bytes, dst_bytes;
148 int *consumed;
149{
150 ...
151}
152#endif
153
154/*** COMMONLY USED MACROS ***/
155
156/* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
157 THREE_MORE_BYTES safely get one, two, and three bytes from the
158 source text respectively. If there are not enough bytes in the
159 source, they jump to `label_end_of_loop'. The caller should set
160 variables `src' and `src_end' to appropriate areas in advance. */
161
162#define ONE_MORE_BYTE(c1) \
163 do { \
164 if (src < src_end) \
165 c1 = *src++; \
166 else \
167 goto label_end_of_loop; \
168 } while (0)
169
170#define TWO_MORE_BYTES(c1, c2) \
171 do { \
172 if (src + 1 < src_end) \
173 c1 = *src++, c2 = *src++; \
174 else \
175 goto label_end_of_loop; \
176 } while (0)
177
178#define THREE_MORE_BYTES(c1, c2, c3) \
179 do { \
180 if (src + 2 < src_end) \
181 c1 = *src++, c2 = *src++, c3 = *src++; \
182 else \
183 goto label_end_of_loop; \
184 } while (0)
185
186/* The following three macros DECODE_CHARACTER_ASCII,
187 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
188 the multi-byte form of a character of each class at the place
189 pointed by `dst'. The caller should set the variable `dst' to
190 point to an appropriate area and the variable `coding' to point to
191 the coding-system of the currently decoding text in advance. */
192
193/* Decode one ASCII character C. */
194
195#define DECODE_CHARACTER_ASCII(c) \
196 do { \
197 if (COMPOSING_P (coding->composing)) \
198 *dst++ = 0xA0, *dst++ = (c) | 0x80; \
199 else \
200 *dst++ = (c); \
201 } while (0)
202
203/* Decode one DIMENSION1 character of which charset is CHARSET and
204 position-code is C. */
205
206#define DECODE_CHARACTER_DIMENSION1(charset, c) \
207 do { \
208 unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset); \
209 if (COMPOSING_P (coding->composing)) \
210 *dst++ = leading_code + 0x20; \
211 else \
212 *dst++ = leading_code; \
213 if (leading_code = CHARSET_LEADING_CODE_EXT (charset)) \
214 *dst++ = leading_code; \
215 *dst++ = (c) | 0x80; \
216 } while (0)
217
218/* Decode one DIMENSION2 character of which charset is CHARSET and
219 position-codes are C1 and C2. */
220
221#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2) \
222 do { \
223 DECODE_CHARACTER_DIMENSION1 (charset, c1); \
224 *dst++ = (c2) | 0x80; \
225 } while (0)
226
227\f
228/*** 1. Preamble ***/
229
230#include <stdio.h>
231
232#ifdef emacs
233
234#include <config.h>
235#include "lisp.h"
236#include "buffer.h"
237#include "charset.h"
238#include "ccl.h"
239#include "coding.h"
240#include "window.h"
241
242#else /* not emacs */
243
244#include "mulelib.h"
245
246#endif /* not emacs */
247
248Lisp_Object Qcoding_system, Qeol_type;
249Lisp_Object Qbuffer_file_coding_system;
250Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
251
252extern Lisp_Object Qinsert_file_contents, Qwrite_region;
253Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
254Lisp_Object Qstart_process, Qopen_network_stream;
255Lisp_Object Qtarget_idx;
256
257/* Mnemonic character of each format of end-of-line. */
258int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
259/* Mnemonic character to indicate format of end-of-line is not yet
260 decided. */
261int eol_mnemonic_undecided;
262
9ce27fde
KH
263/* Format of end-of-line decided by system. This is CODING_EOL_LF on
264 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
265int system_eol_type;
266
4ed46869
KH
267#ifdef emacs
268
02ba4723 269Lisp_Object Qcoding_system_spec, Qcoding_system_p, Qcoding_system_error;
4ed46869 270
9ce27fde
KH
271/* Coding system emacs-mule is for converting only end-of-line format. */
272Lisp_Object Qemacs_mule;
273
4ed46869
KH
274/* Coding-systems are handed between Emacs Lisp programs and C internal
275 routines by the following three variables. */
276/* Coding-system for reading files and receiving data from process. */
277Lisp_Object Vcoding_system_for_read;
278/* Coding-system for writing files and sending data to process. */
279Lisp_Object Vcoding_system_for_write;
280/* Coding-system actually used in the latest I/O. */
281Lisp_Object Vlast_coding_system_used;
282
9ce27fde
KH
283/* Flag to inhibit code conversion of end-of-line format. */
284int inhibit_eol_conversion;
285
4ed46869
KH
286/* Coding-system of what terminal accept for displaying. */
287struct coding_system terminal_coding;
288
289/* Coding-system of what is sent from terminal keyboard. */
290struct coding_system keyboard_coding;
291
02ba4723
KH
292Lisp_Object Vfile_coding_system_alist;
293Lisp_Object Vprocess_coding_system_alist;
294Lisp_Object Vnetwork_coding_system_alist;
4ed46869
KH
295
296#endif /* emacs */
297
298Lisp_Object Qcoding_category_index;
299
300/* List of symbols `coding-category-xxx' ordered by priority. */
301Lisp_Object Vcoding_category_list;
302
303/* Table of coding-systems currently assigned to each coding-category. */
304Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
305
306/* Table of names of symbol for each coding-category. */
307char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
0ef69138 308 "coding-category-emacs-mule",
4ed46869
KH
309 "coding-category-sjis",
310 "coding-category-iso-7",
311 "coding-category-iso-8-1",
312 "coding-category-iso-8-2",
313 "coding-category-iso-else",
314 "coding-category-big5",
315 "coding-category-binary"
316};
317
bdd9fb48
KH
318/* Flag to tell if we look up unification table on character code
319 conversion. */
320Lisp_Object Venable_character_unification;
a5d301df
KH
321/* Standard unification table to look up on decoding (reading). */
322Lisp_Object Vstandard_character_unification_table_for_decode;
323/* Standard unification table to look up on encoding (writing). */
324Lisp_Object Vstandard_character_unification_table_for_encode;
bdd9fb48
KH
325
326Lisp_Object Qcharacter_unification_table;
a5d301df
KH
327Lisp_Object Qcharacter_unification_table_for_decode;
328Lisp_Object Qcharacter_unification_table_for_encode;
4ed46869
KH
329
330/* Alist of charsets vs revision number. */
331Lisp_Object Vcharset_revision_alist;
332
02ba4723
KH
333/* Default coding systems used for process I/O. */
334Lisp_Object Vdefault_process_coding_system;
335
4ed46869 336\f
0ef69138 337/*** 2. Emacs internal format (emacs-mule) handlers ***/
4ed46869
KH
338
339/* Emacs' internal format for encoding multiple character sets is a
340 kind of multi-byte encoding, i.e. encoding a character by a sequence
341 of one-byte codes of variable length. ASCII characters and control
342 characters (e.g. `tab', `newline') are represented by one-byte as
343 is. It takes the range 0x00 through 0x7F. The other characters
344 are represented by a sequence of `base leading-code', optional
345 `extended leading-code', and one or two `position-code's. Length
346 of the sequence is decided by the base leading-code. Leading-code
347 takes the range 0x80 through 0x9F, whereas extended leading-code
348 and position-code take the range 0xA0 through 0xFF. See the
349 document of `charset.h' for more detail about leading-code and
350 position-code.
351
352 There's one exception in this rule. Special leading-code
353 `leading-code-composition' denotes that the following several
354 characters should be composed into one character. Leading-codes of
355 components (except for ASCII) are added 0x20. An ASCII character
356 component is represented by a 2-byte sequence of `0xA0' and
357 `ASCII-code + 0x80'. See also the document in `charset.h' for the
358 detail of composite character. Hence, we can summarize the code
359 range as follows:
360
361 --- CODE RANGE of Emacs' internal format ---
362 (character set) (range)
363 ASCII 0x00 .. 0x7F
364 ELSE (1st byte) 0x80 .. 0x9F
365 (rest bytes) 0xA0 .. 0xFF
366 ---------------------------------------------
367
368 */
369
370enum emacs_code_class_type emacs_code_class[256];
371
372/* Go to the next statement only if *SRC is accessible and the code is
373 greater than 0xA0. */
374#define CHECK_CODE_RANGE_A0_FF \
375 do { \
376 if (src >= src_end) \
377 goto label_end_of_switch; \
378 else if (*src++ < 0xA0) \
379 return 0; \
380 } while (0)
381
382/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
383 Check if a text is encoded in Emacs' internal format. If it is,
0ef69138 384 return CODING_CATEGORY_MASK_EMASC_MULE, else return 0. */
4ed46869
KH
385
386int
0ef69138 387detect_coding_emacs_mule (src, src_end)
4ed46869
KH
388 unsigned char *src, *src_end;
389{
390 unsigned char c;
391 int composing = 0;
392
393 while (src < src_end)
394 {
395 c = *src++;
396
397 if (composing)
398 {
399 if (c < 0xA0)
400 composing = 0;
401 else
402 c -= 0x20;
403 }
404
405 switch (emacs_code_class[c])
406 {
407 case EMACS_ascii_code:
408 case EMACS_linefeed_code:
409 break;
410
411 case EMACS_control_code:
412 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
413 return 0;
414 break;
415
416 case EMACS_invalid_code:
417 return 0;
418
419 case EMACS_leading_code_composition: /* c == 0x80 */
420 if (composing)
421 CHECK_CODE_RANGE_A0_FF;
422 else
423 composing = 1;
424 break;
425
426 case EMACS_leading_code_4:
427 CHECK_CODE_RANGE_A0_FF;
428 /* fall down to check it two more times ... */
429
430 case EMACS_leading_code_3:
431 CHECK_CODE_RANGE_A0_FF;
432 /* fall down to check it one more time ... */
433
434 case EMACS_leading_code_2:
435 CHECK_CODE_RANGE_A0_FF;
436 break;
437
438 default:
439 label_end_of_switch:
440 break;
441 }
442 }
0ef69138 443 return CODING_CATEGORY_MASK_EMACS_MULE;
4ed46869
KH
444}
445
446\f
447/*** 3. ISO2022 handlers ***/
448
449/* The following note describes the coding system ISO2022 briefly.
450 Since the intension of this note is to help understanding of the
451 programs in this file, some parts are NOT ACCURATE or OVERLY
452 SIMPLIFIED. For the thorough understanding, please refer to the
453 original document of ISO2022.
454
455 ISO2022 provides many mechanisms to encode several character sets
456 in 7-bit and 8-bit environment. If one choose 7-bite environment,
457 all text is encoded by codes of less than 128. This may make the
458 encoded text a little bit longer, but the text get more stability
459 to pass through several gateways (some of them split MSB off).
460
461 There are two kind of character set: control character set and
462 graphic character set. The former contains control characters such
463 as `newline' and `escape' to provide control functions (control
464 functions are provided also by escape sequence). The latter
465 contains graphic characters such as ' A' and '-'. Emacs recognizes
466 two control character sets and many graphic character sets.
467
468 Graphic character sets are classified into one of the following
469 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
470 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
471 bytes (DIMENSION) and the number of characters in one dimension
472 (CHARS) of the set. In addition, each character set is assigned an
473 identification tag (called "final character" and denoted as <F>
474 here after) which is unique in each class. <F> of each character
475 set is decided by ECMA(*) when it is registered in ISO. Code range
476 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
477
478 Note (*): ECMA = European Computer Manufacturers Association
479
480 Here are examples of graphic character set [NAME(<F>)]:
481 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
482 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
483 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
484 o DIMENSION2_CHARS96 -- none for the moment
485
486 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
487 C0 [0x00..0x1F] -- control character plane 0
488 GL [0x20..0x7F] -- graphic character plane 0
489 C1 [0x80..0x9F] -- control character plane 1
490 GR [0xA0..0xFF] -- graphic character plane 1
491
492 A control character set is directly designated and invoked to C0 or
493 C1 by an escape sequence. The most common case is that ISO646's
494 control character set is designated/invoked to C0 and ISO6429's
495 control character set is designated/invoked to C1, and usually
496 these designations/invocations are omitted in a coded text. With
497 7-bit environment, only C0 can be used, and a control character for
498 C1 is encoded by an appropriate escape sequence to fit in the
499 environment. All control characters for C1 are defined the
500 corresponding escape sequences.
501
502 A graphic character set is at first designated to one of four
503 graphic registers (G0 through G3), then these graphic registers are
504 invoked to GL or GR. These designations and invocations can be
505 done independently. The most common case is that G0 is invoked to
506 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
507 these invocations and designations are omitted in a coded text.
508 With 7-bit environment, only GL can be used.
509
510 When a graphic character set of CHARS94 is invoked to GL, code 0x20
511 and 0x7F of GL area work as control characters SPACE and DEL
512 respectively, and code 0xA0 and 0xFF of GR area should not be used.
513
514 There are two ways of invocation: locking-shift and single-shift.
515 With locking-shift, the invocation lasts until the next different
516 invocation, whereas with single-shift, the invocation works only
517 for the following character and doesn't affect locking-shift.
518 Invocations are done by the following control characters or escape
519 sequences.
520
521 ----------------------------------------------------------------------
522 function control char escape sequence description
523 ----------------------------------------------------------------------
524 SI (shift-in) 0x0F none invoke G0 to GL
525 SI (shift-out) 0x0E none invoke G1 to GL
526 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
527 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
528 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
529 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
530 ----------------------------------------------------------------------
531 The first four are for locking-shift. Control characters for these
532 functions are defined by macros ISO_CODE_XXX in `coding.h'.
533
534 Designations are done by the following escape sequences.
535 ----------------------------------------------------------------------
536 escape sequence description
537 ----------------------------------------------------------------------
538 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
539 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
540 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
541 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
542 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
543 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
544 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
545 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
546 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
547 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
548 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
549 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
550 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
551 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
552 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
553 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
554 ----------------------------------------------------------------------
555
556 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
557 of dimension 1, chars 94, and final character <F>, and etc.
558
559 Note (*): Although these designations are not allowed in ISO2022,
560 Emacs accepts them on decoding, and produces them on encoding
561 CHARS96 character set in a coding system which is characterized as
562 7-bit environment, non-locking-shift, and non-single-shift.
563
564 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
565 '(' can be omitted. We call this as "short-form" here after.
566
567 Now you may notice that there are a lot of ways for encoding the
568 same multilingual text in ISO2022. Actually, there exist many
569 coding systems such as Compound Text (used in X's inter client
570 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
571 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
572 localized platforms), and all of these are variants of ISO2022.
573
574 In addition to the above, Emacs handles two more kinds of escape
575 sequences: ISO6429's direction specification and Emacs' private
576 sequence for specifying character composition.
577
578 ISO6429's direction specification takes the following format:
579 o CSI ']' -- end of the current direction
580 o CSI '0' ']' -- end of the current direction
581 o CSI '1' ']' -- start of left-to-right text
582 o CSI '2' ']' -- start of right-to-left text
583 The control character CSI (0x9B: control sequence introducer) is
584 abbreviated to the escape sequence ESC '[' in 7-bit environment.
585
586 Character composition specification takes the following format:
587 o ESC '0' -- start character composition
588 o ESC '1' -- end character composition
589 Since these are not standard escape sequences of any ISO, the use
590 of them for these meaning is restricted to Emacs only. */
591
592enum iso_code_class_type iso_code_class[256];
593
594/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
595 Check if a text is encoded in ISO2022. If it is, returns an
596 integer in which appropriate flag bits any of:
597 CODING_CATEGORY_MASK_ISO_7
598 CODING_CATEGORY_MASK_ISO_8_1
599 CODING_CATEGORY_MASK_ISO_8_2
600 CODING_CATEGORY_MASK_ISO_ELSE
601 are set. If a code which should never appear in ISO2022 is found,
602 returns 0. */
603
604int
605detect_coding_iso2022 (src, src_end)
606 unsigned char *src, *src_end;
607{
765a2ca5
KH
608 int mask = (CODING_CATEGORY_MASK_ISO_7
609 | CODING_CATEGORY_MASK_ISO_8_1
610 | CODING_CATEGORY_MASK_ISO_8_2
611 | CODING_CATEGORY_MASK_ISO_ELSE);
bcf26d6a
KH
612 int g1 = 0; /* 1 iff designating to G1. */
613 int c, i;
4ed46869 614
e0e989f6 615 while (src < src_end)
4ed46869
KH
616 {
617 c = *src++;
618 switch (c)
619 {
620 case ISO_CODE_ESC:
e0e989f6 621 if (src >= src_end)
4ed46869
KH
622 break;
623 c = *src++;
bcf26d6a 624 if (src < src_end
e0e989f6
KH
625 && ((c >= '(' && c <= '/')
626 || c == '$' && ((*src >= '(' && *src <= '/')
627 || (*src >= '@' && *src <= 'B'))))
4ed46869 628 {
e0e989f6
KH
629 /* Valid designation sequence. */
630 if (c == ')' || (c == '$' && *src == ')'))
bcf26d6a
KH
631 {
632 g1 = 1;
633 mask &= ~CODING_CATEGORY_MASK_ISO_7;
634 }
e0e989f6
KH
635 src++;
636 break;
4ed46869 637 }
4ed46869
KH
638 else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
639 return CODING_CATEGORY_MASK_ISO_ELSE;
640 break;
641
4ed46869 642 case ISO_CODE_SO:
e0e989f6
KH
643 if (g1)
644 return CODING_CATEGORY_MASK_ISO_ELSE;
645 break;
646
4ed46869
KH
647 case ISO_CODE_CSI:
648 case ISO_CODE_SS2:
649 case ISO_CODE_SS3:
650 mask &= ~CODING_CATEGORY_MASK_ISO_7;
651 break;
652
653 default:
654 if (c < 0x80)
655 break;
656 else if (c < 0xA0)
657 return 0;
658 else
659 {
660 int count = 1;
661
662 mask &= ~CODING_CATEGORY_MASK_ISO_7;
e0e989f6 663 while (src < src_end && *src >= 0xA0)
4ed46869 664 count++, src++;
e0e989f6 665 if (count & 1 && src < src_end)
4ed46869
KH
666 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
667 }
668 break;
669 }
670 }
671
672 return mask;
673}
674
675/* Decode a character of which charset is CHARSET and the 1st position
bdd9fb48 676 code is C1. If dimension of CHARSET is 2, the 2nd position code is
4ed46869
KH
677 fetched from SRC and set to C2. If CHARSET is negative, it means
678 that we are decoding ill formed text, and what we can do is just to
679 read C1 as is. */
680
bdd9fb48
KH
681#define DECODE_ISO_CHARACTER(charset, c1) \
682 do { \
683 int c_alt, charset_alt = (charset); \
684 if (COMPOSING_HEAD_P (coding->composing)) \
685 { \
686 *dst++ = LEADING_CODE_COMPOSITION; \
687 if (COMPOSING_WITH_RULE_P (coding->composing)) \
688 /* To tell composition rules are embeded. */ \
689 *dst++ = 0xFF; \
690 coding->composing += 2; \
691 } \
692 if ((charset) >= 0) \
693 { \
694 if (CHARSET_DIMENSION (charset) == 2) \
695 ONE_MORE_BYTE (c2); \
696 if (!NILP (unification_table) \
697 && ((c_alt = unify_char (unification_table, \
698 -1, (charset), c1, c2)) >= 0)) \
699 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
700 } \
701 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
702 DECODE_CHARACTER_ASCII (c1); \
703 else if (CHARSET_DIMENSION (charset_alt) == 1) \
704 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
705 else \
706 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
707 if (COMPOSING_WITH_RULE_P (coding->composing)) \
708 /* To tell a composition rule follows. */ \
709 coding->composing = COMPOSING_WITH_RULE_RULE; \
4ed46869
KH
710 } while (0)
711
712/* Set designation state into CODING. */
713#define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
714 do { \
2e34157c
RS
715 int charset = ISO_CHARSET_TABLE (make_number (dimension), \
716 make_number (chars), \
717 make_number (final_char)); \
4ed46869
KH
718 if (charset >= 0) \
719 { \
720 if (coding->direction == 1 \
721 && CHARSET_REVERSE_CHARSET (charset) >= 0) \
722 charset = CHARSET_REVERSE_CHARSET (charset); \
723 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
724 } \
725 } while (0)
726
727/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
728
729int
730decode_coding_iso2022 (coding, source, destination,
731 src_bytes, dst_bytes, consumed)
732 struct coding_system *coding;
733 unsigned char *source, *destination;
734 int src_bytes, dst_bytes;
735 int *consumed;
736{
737 unsigned char *src = source;
738 unsigned char *src_end = source + src_bytes;
739 unsigned char *dst = destination;
740 unsigned char *dst_end = destination + dst_bytes;
741 /* Since the maximum bytes produced by each loop is 7, we subtract 6
742 from DST_END to assure that overflow checking is necessary only
743 at the head of loop. */
744 unsigned char *adjusted_dst_end = dst_end - 6;
745 int charset;
746 /* Charsets invoked to graphic plane 0 and 1 respectively. */
747 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
748 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
a5d301df
KH
749 Lisp_Object unification_table
750 = coding->character_unification_table_for_decode;
bdd9fb48
KH
751
752 if (!NILP (Venable_character_unification) && NILP (unification_table))
a5d301df 753 unification_table = Vstandard_character_unification_table_for_decode;
4ed46869
KH
754
755 while (src < src_end && dst < adjusted_dst_end)
756 {
757 /* SRC_BASE remembers the start position in source in each loop.
758 The loop will be exited when there's not enough source text
759 to analyze long escape sequence or 2-byte code (within macros
760 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
761 to SRC_BASE before exiting. */
762 unsigned char *src_base = src;
bdd9fb48 763 int c1 = *src++, c2;
4ed46869
KH
764
765 switch (iso_code_class [c1])
766 {
767 case ISO_0x20_or_0x7F:
768 if (!coding->composing
769 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
770 {
771 /* This is SPACE or DEL. */
772 *dst++ = c1;
773 break;
774 }
775 /* This is a graphic character, we fall down ... */
776
777 case ISO_graphic_plane_0:
778 if (coding->composing == COMPOSING_WITH_RULE_RULE)
779 {
780 /* This is a composition rule. */
781 *dst++ = c1 | 0x80;
782 coding->composing = COMPOSING_WITH_RULE_TAIL;
783 }
784 else
785 DECODE_ISO_CHARACTER (charset0, c1);
786 break;
787
788 case ISO_0xA0_or_0xFF:
789 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
790 {
791 /* Invalid code. */
792 *dst++ = c1;
793 break;
794 }
795 /* This is a graphic character, we fall down ... */
796
797 case ISO_graphic_plane_1:
798 DECODE_ISO_CHARACTER (charset1, c1);
799 break;
800
801 case ISO_control_code:
802 /* All ISO2022 control characters in this class have the
803 same representation in Emacs internal format. */
804 *dst++ = c1;
805 break;
806
807 case ISO_carriage_return:
808 if (coding->eol_type == CODING_EOL_CR)
809 {
810 *dst++ = '\n';
811 }
812 else if (coding->eol_type == CODING_EOL_CRLF)
813 {
814 ONE_MORE_BYTE (c1);
815 if (c1 == ISO_CODE_LF)
816 *dst++ = '\n';
817 else
818 {
819 src--;
820 *dst++ = c1;
821 }
822 }
823 else
824 {
825 *dst++ = c1;
826 }
827 break;
828
829 case ISO_shift_out:
e0e989f6
KH
830 if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
831 goto label_invalid_escape_sequence;
4ed46869
KH
832 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
833 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
834 break;
835
836 case ISO_shift_in:
837 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
838 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
839 break;
840
841 case ISO_single_shift_2_7:
842 case ISO_single_shift_2:
843 /* SS2 is handled as an escape sequence of ESC 'N' */
844 c1 = 'N';
845 goto label_escape_sequence;
846
847 case ISO_single_shift_3:
848 /* SS2 is handled as an escape sequence of ESC 'O' */
849 c1 = 'O';
850 goto label_escape_sequence;
851
852 case ISO_control_sequence_introducer:
853 /* CSI is handled as an escape sequence of ESC '[' ... */
854 c1 = '[';
855 goto label_escape_sequence;
856
857 case ISO_escape:
858 ONE_MORE_BYTE (c1);
859 label_escape_sequence:
860 /* Escape sequences handled by Emacs are invocation,
861 designation, direction specification, and character
862 composition specification. */
863 switch (c1)
864 {
865 case '&': /* revision of following character set */
866 ONE_MORE_BYTE (c1);
867 if (!(c1 >= '@' && c1 <= '~'))
e0e989f6 868 goto label_invalid_escape_sequence;
4ed46869
KH
869 ONE_MORE_BYTE (c1);
870 if (c1 != ISO_CODE_ESC)
e0e989f6 871 goto label_invalid_escape_sequence;
4ed46869
KH
872 ONE_MORE_BYTE (c1);
873 goto label_escape_sequence;
874
875 case '$': /* designation of 2-byte character set */
876 ONE_MORE_BYTE (c1);
877 if (c1 >= '@' && c1 <= 'B')
878 { /* designation of JISX0208.1978, GB2312.1980,
879 or JISX0208.1980 */
880 DECODE_DESIGNATION (0, 2, 94, c1);
881 }
882 else if (c1 >= 0x28 && c1 <= 0x2B)
883 { /* designation of DIMENSION2_CHARS94 character set */
884 ONE_MORE_BYTE (c2);
885 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
886 }
887 else if (c1 >= 0x2C && c1 <= 0x2F)
888 { /* designation of DIMENSION2_CHARS96 character set */
889 ONE_MORE_BYTE (c2);
890 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
891 }
892 else
e0e989f6 893 goto label_invalid_escape_sequence;
4ed46869
KH
894 break;
895
896 case 'n': /* invocation of locking-shift-2 */
e0e989f6
KH
897 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
898 goto label_invalid_escape_sequence;
4ed46869 899 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
e0e989f6 900 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
901 break;
902
903 case 'o': /* invocation of locking-shift-3 */
e0e989f6
KH
904 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
905 goto label_invalid_escape_sequence;
4ed46869 906 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
e0e989f6 907 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
908 break;
909
910 case 'N': /* invocation of single-shift-2 */
e0e989f6
KH
911 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
912 goto label_invalid_escape_sequence;
4ed46869
KH
913 ONE_MORE_BYTE (c1);
914 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
915 DECODE_ISO_CHARACTER (charset, c1);
916 break;
917
918 case 'O': /* invocation of single-shift-3 */
e0e989f6
KH
919 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
920 goto label_invalid_escape_sequence;
4ed46869
KH
921 ONE_MORE_BYTE (c1);
922 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
923 DECODE_ISO_CHARACTER (charset, c1);
924 break;
925
926 case '0': /* start composing without embeded rules */
927 coding->composing = COMPOSING_NO_RULE_HEAD;
928 break;
929
930 case '1': /* end composing */
931 coding->composing = COMPOSING_NO;
932 break;
933
934 case '2': /* start composing with embeded rules */
935 coding->composing = COMPOSING_WITH_RULE_HEAD;
936 break;
937
938 case '[': /* specification of direction */
939 /* For the moment, nested direction is not supported.
940 So, the value of `coding->direction' is 0 or 1: 0
941 means left-to-right, 1 means right-to-left. */
942 ONE_MORE_BYTE (c1);
943 switch (c1)
944 {
945 case ']': /* end of the current direction */
946 coding->direction = 0;
947
948 case '0': /* end of the current direction */
949 case '1': /* start of left-to-right direction */
950 ONE_MORE_BYTE (c1);
951 if (c1 == ']')
952 coding->direction = 0;
953 else
954 goto label_invalid_escape_sequence;
955 break;
956
957 case '2': /* start of right-to-left direction */
958 ONE_MORE_BYTE (c1);
959 if (c1 == ']')
960 coding->direction= 1;
961 else
962 goto label_invalid_escape_sequence;
963 break;
964
965 default:
966 goto label_invalid_escape_sequence;
967 }
968 break;
969
970 default:
971 if (c1 >= 0x28 && c1 <= 0x2B)
972 { /* designation of DIMENSION1_CHARS94 character set */
973 ONE_MORE_BYTE (c2);
974 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
975 }
976 else if (c1 >= 0x2C && c1 <= 0x2F)
977 { /* designation of DIMENSION1_CHARS96 character set */
978 ONE_MORE_BYTE (c2);
979 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
980 }
981 else
982 {
983 goto label_invalid_escape_sequence;
984 }
985 }
986 /* We must update these variables now. */
987 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
988 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
989 break;
990
991 label_invalid_escape_sequence:
992 {
993 int length = src - src_base;
994
995 bcopy (src_base, dst, length);
996 dst += length;
997 }
998 }
999 continue;
1000
1001 label_end_of_loop:
1002 coding->carryover_size = src - src_base;
1003 bcopy (src_base, coding->carryover, coding->carryover_size);
1004 src = src_base;
1005 break;
1006 }
1007
1008 /* If this is the last block of the text to be decoded, we had
1009 better just flush out all remaining codes in the text although
1010 they are not valid characters. */
1011 if (coding->last_block)
1012 {
1013 bcopy (src, dst, src_end - src);
1014 dst += (src_end - src);
1015 src = src_end;
1016 }
1017 *consumed = src - source;
1018 return dst - destination;
1019}
1020
1021/* ISO2022 encoding staffs. */
1022
1023/*
1024 It is not enough to say just "ISO2022" on encoding, but we have to
1025 specify more details. In Emacs, each coding-system of ISO2022
1026 variant has the following specifications:
1027 1. Initial designation to G0 thru G3.
1028 2. Allows short-form designation?
1029 3. ASCII should be designated to G0 before control characters?
1030 4. ASCII should be designated to G0 at end of line?
1031 5. 7-bit environment or 8-bit environment?
1032 6. Use locking-shift?
1033 7. Use Single-shift?
1034 And the following two are only for Japanese:
1035 8. Use ASCII in place of JIS0201-1976-Roman?
1036 9. Use JISX0208-1983 in place of JISX0208-1978?
1037 These specifications are encoded in `coding->flags' as flag bits
1038 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1039 detail.
1040*/
1041
1042/* Produce codes (escape sequence) for designating CHARSET to graphic
1043 register REG. If <final-char> of CHARSET is '@', 'A', or 'B' and
1044 the coding system CODING allows, produce designation sequence of
1045 short-form. */
1046
1047#define ENCODE_DESIGNATION(charset, reg, coding) \
1048 do { \
1049 unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \
1050 char *intermediate_char_94 = "()*+"; \
1051 char *intermediate_char_96 = ",-./"; \
1052 Lisp_Object temp \
1053 = Fassq (make_number (charset), Vcharset_revision_alist); \
1054 if (! NILP (temp)) \
1055 { \
1056 *dst++ = ISO_CODE_ESC; \
1057 *dst++ = '&'; \
1058 *dst++ = XINT (XCONS (temp)->cdr) + '@'; \
1059 } \
1060 *dst++ = ISO_CODE_ESC; \
1061 if (CHARSET_DIMENSION (charset) == 1) \
1062 { \
1063 if (CHARSET_CHARS (charset) == 94) \
1064 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1065 else \
1066 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1067 } \
1068 else \
1069 { \
1070 *dst++ = '$'; \
1071 if (CHARSET_CHARS (charset) == 94) \
1072 { \
1073 if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
1074 || reg != 0 \
1075 || final_char < '@' || final_char > 'B') \
1076 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1077 } \
1078 else \
1079 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1080 } \
1081 *dst++ = final_char; \
1082 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
1083 } while (0)
1084
1085/* The following two macros produce codes (control character or escape
1086 sequence) for ISO2022 single-shift functions (single-shift-2 and
1087 single-shift-3). */
1088
1089#define ENCODE_SINGLE_SHIFT_2 \
1090 do { \
1091 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1092 *dst++ = ISO_CODE_ESC, *dst++ = 'N'; \
1093 else \
1094 *dst++ = ISO_CODE_SS2; \
1095 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1096 } while (0)
1097
1098#define ENCODE_SINGLE_SHIFT_3 \
1099 do { \
1100 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1101 *dst++ = ISO_CODE_ESC, *dst++ = 'O'; \
1102 else \
1103 *dst++ = ISO_CODE_SS3; \
1104 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1105 } while (0)
1106
1107/* The following four macros produce codes (control character or
1108 escape sequence) for ISO2022 locking-shift functions (shift-in,
1109 shift-out, locking-shift-2, and locking-shift-3). */
1110
1111#define ENCODE_SHIFT_IN \
1112 do { \
1113 *dst++ = ISO_CODE_SI; \
1114 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1115 } while (0)
1116
1117#define ENCODE_SHIFT_OUT \
1118 do { \
1119 *dst++ = ISO_CODE_SO; \
1120 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1121 } while (0)
1122
1123#define ENCODE_LOCKING_SHIFT_2 \
1124 do { \
1125 *dst++ = ISO_CODE_ESC, *dst++ = 'n'; \
1126 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1127 } while (0)
1128
1129#define ENCODE_LOCKING_SHIFT_3 \
1130 do { \
1131 *dst++ = ISO_CODE_ESC, *dst++ = 'o'; \
1132 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1133 } while (0)
1134
1135/* Produce codes for a DIMENSION1 character of which character set is
1136 CHARSET and position-code is C1. Designation and invocation
1137 sequences are also produced in advance if necessary. */
1138
1139
1140#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
1141 do { \
1142 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1143 { \
1144 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1145 *dst++ = c1 & 0x7F; \
1146 else \
1147 *dst++ = c1 | 0x80; \
1148 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1149 break; \
1150 } \
1151 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1152 { \
1153 *dst++ = c1 & 0x7F; \
1154 break; \
1155 } \
1156 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1157 { \
1158 *dst++ = c1 | 0x80; \
1159 break; \
1160 } \
1161 else \
1162 /* Since CHARSET is not yet invoked to any graphic planes, we \
1163 must invoke it, or, at first, designate it to some graphic \
1164 register. Then repeat the loop to actually produce the \
1165 character. */ \
1166 dst = encode_invocation_designation (charset, coding, dst); \
1167 } while (1)
1168
1169/* Produce codes for a DIMENSION2 character of which character set is
1170 CHARSET and position-codes are C1 and C2. Designation and
1171 invocation codes are also produced in advance if necessary. */
1172
1173#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
1174 do { \
1175 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1176 { \
1177 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1178 *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F; \
1179 else \
1180 *dst++ = c1 | 0x80, *dst++ = c2 | 0x80; \
1181 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1182 break; \
1183 } \
1184 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1185 { \
1186 *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F; \
1187 break; \
1188 } \
1189 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1190 { \
1191 *dst++ = c1 | 0x80, *dst++= c2 | 0x80; \
1192 break; \
1193 } \
1194 else \
1195 /* Since CHARSET is not yet invoked to any graphic planes, we \
1196 must invoke it, or, at first, designate it to some graphic \
1197 register. Then repeat the loop to actually produce the \
1198 character. */ \
1199 dst = encode_invocation_designation (charset, coding, dst); \
1200 } while (1)
1201
bdd9fb48
KH
1202#define ENCODE_ISO_CHARACTER(charset, c1, c2) \
1203 do { \
1204 int c_alt, charset_alt; \
1205 if (!NILP (unification_table) \
1206 && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
a5d301df 1207 >= 0)) \
bdd9fb48
KH
1208 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1209 else \
1210 charset_alt = charset; \
1211 if (CHARSET_DIMENSION (charset_alt) == 1) \
1212 ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1); \
1213 else \
1214 ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
1215 } while (0)
1216
4ed46869
KH
1217/* Produce designation and invocation codes at a place pointed by DST
1218 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1219 Return new DST. */
1220
1221unsigned char *
1222encode_invocation_designation (charset, coding, dst)
1223 int charset;
1224 struct coding_system *coding;
1225 unsigned char *dst;
1226{
1227 int reg; /* graphic register number */
1228
1229 /* At first, check designations. */
1230 for (reg = 0; reg < 4; reg++)
1231 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1232 break;
1233
1234 if (reg >= 4)
1235 {
1236 /* CHARSET is not yet designated to any graphic registers. */
1237 /* At first check the requested designation. */
1238 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1ba9e4ab
KH
1239 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1240 /* Since CHARSET requests no special designation, designate it
1241 to graphic register 0. */
4ed46869
KH
1242 reg = 0;
1243
1244 ENCODE_DESIGNATION (charset, reg, coding);
1245 }
1246
1247 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1248 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1249 {
1250 /* Since the graphic register REG is not invoked to any graphic
1251 planes, invoke it to graphic plane 0. */
1252 switch (reg)
1253 {
1254 case 0: /* graphic register 0 */
1255 ENCODE_SHIFT_IN;
1256 break;
1257
1258 case 1: /* graphic register 1 */
1259 ENCODE_SHIFT_OUT;
1260 break;
1261
1262 case 2: /* graphic register 2 */
1263 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1264 ENCODE_SINGLE_SHIFT_2;
1265 else
1266 ENCODE_LOCKING_SHIFT_2;
1267 break;
1268
1269 case 3: /* graphic register 3 */
1270 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1271 ENCODE_SINGLE_SHIFT_3;
1272 else
1273 ENCODE_LOCKING_SHIFT_3;
1274 break;
1275 }
1276 }
1277 return dst;
1278}
1279
1280/* The following two macros produce codes for indicating composition. */
1281#define ENCODE_COMPOSITION_NO_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '0'
1282#define ENCODE_COMPOSITION_WITH_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '2'
1283#define ENCODE_COMPOSITION_END *dst++ = ISO_CODE_ESC, *dst++ = '1'
1284
1285/* The following three macros produce codes for indicating direction
1286 of text. */
1287#define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
1288 do { \
1289 if (coding->flags == CODING_FLAG_ISO_SEVEN_BITS) \
1290 *dst++ = ISO_CODE_ESC, *dst++ = '['; \
1291 else \
1292 *dst++ = ISO_CODE_CSI; \
1293 } while (0)
1294
1295#define ENCODE_DIRECTION_R2L \
1296 ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '2', *dst++ = ']'
1297
1298#define ENCODE_DIRECTION_L2R \
1299 ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '0', *dst++ = ']'
1300
1301/* Produce codes for designation and invocation to reset the graphic
1302 planes and registers to initial state. */
e0e989f6
KH
1303#define ENCODE_RESET_PLANE_AND_REGISTER \
1304 do { \
1305 int reg; \
1306 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
1307 ENCODE_SHIFT_IN; \
1308 for (reg = 0; reg < 4; reg++) \
1309 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0 \
1310 && (CODING_SPEC_ISO_DESIGNATION (coding, reg) \
1311 != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))) \
1312 ENCODE_DESIGNATION \
1313 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
4ed46869
KH
1314 } while (0)
1315
bdd9fb48
KH
1316/* Produce designation sequences of charsets in the line started from
1317 *SRC to a place pointed by DSTP.
1318
1319 If the current block ends before any end-of-line, we may fail to
1320 find all the necessary *designations. */
1321encode_designation_at_bol (coding, table, src, src_end, dstp)
e0e989f6 1322 struct coding_system *coding;
bdd9fb48 1323 Lisp_Object table;
e0e989f6
KH
1324 unsigned char *src, *src_end, **dstp;
1325{
bdd9fb48
KH
1326 int charset, c, found = 0, reg;
1327 /* Table of charsets to be designated to each graphic register. */
1328 int r[4];
1329 unsigned char *dst = *dstp;
1330
1331 for (reg = 0; reg < 4; reg++)
1332 r[reg] = -1;
1333
1334 while (src < src_end && *src != '\n' && found < 4)
e0e989f6 1335 {
bdd9fb48
KH
1336 int bytes = BYTES_BY_CHAR_HEAD (*src);
1337
1338 if (NILP (table))
1339 charset = CHARSET_AT (src);
1340 else
e0e989f6 1341 {
bdd9fb48
KH
1342 int c_alt, c1, c2;
1343
1344 SPLIT_STRING(src, bytes, charset, c1, c2);
1345 if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1346 charset = CHAR_CHARSET (c_alt);
e0e989f6 1347 }
bdd9fb48 1348
e0e989f6 1349 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1ba9e4ab 1350 if (r[reg] == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
bdd9fb48
KH
1351 {
1352 found++;
1353 r[reg] = charset;
1354 }
1355
1356 src += bytes;
1357 }
1358
1359 if (found)
1360 {
1361 for (reg = 0; reg < 4; reg++)
1362 if (r[reg] >= 0
1363 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1364 ENCODE_DESIGNATION (r[reg], reg, coding);
1365 *dstp = dst;
e0e989f6 1366 }
e0e989f6
KH
1367}
1368
4ed46869
KH
1369/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1370
1371int
1372encode_coding_iso2022 (coding, source, destination,
1373 src_bytes, dst_bytes, consumed)
1374 struct coding_system *coding;
1375 unsigned char *source, *destination;
1376 int src_bytes, dst_bytes;
1377 int *consumed;
1378{
1379 unsigned char *src = source;
1380 unsigned char *src_end = source + src_bytes;
1381 unsigned char *dst = destination;
1382 unsigned char *dst_end = destination + dst_bytes;
e0e989f6 1383 /* Since the maximum bytes produced by each loop is 20, we subtract 19
4ed46869
KH
1384 from DST_END to assure overflow checking is necessary only at the
1385 head of loop. */
e0e989f6 1386 unsigned char *adjusted_dst_end = dst_end - 19;
a5d301df
KH
1387 Lisp_Object unification_table
1388 = coding->character_unification_table_for_encode;
bdd9fb48
KH
1389
1390 if (!NILP (Venable_character_unification) && NILP (unification_table))
a5d301df 1391 unification_table = Vstandard_character_unification_table_for_encode;
4ed46869
KH
1392
1393 while (src < src_end && dst < adjusted_dst_end)
1394 {
1395 /* SRC_BASE remembers the start position in source in each loop.
1396 The loop will be exited when there's not enough source text
1397 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1398 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1399 reset to SRC_BASE before exiting. */
1400 unsigned char *src_base = src;
bdd9fb48 1401 int charset, c1, c2, c3, c4;
4ed46869 1402
e0e989f6
KH
1403 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1404 && CODING_SPEC_ISO_BOL (coding))
1405 {
bdd9fb48
KH
1406 /* We have to produce designation sequences if any now. */
1407 encode_designation_at_bol (coding, unification_table,
1408 src, src_end, &dst);
e0e989f6
KH
1409 CODING_SPEC_ISO_BOL (coding) = 0;
1410 }
1411
1412 c1 = *src++;
4ed46869
KH
1413 /* If we are seeing a component of a composite character, we are
1414 seeing a leading-code specially encoded for composition, or a
1415 composition rule if composing with rule. We must set C1
1416 to a normal leading-code or an ASCII code. If we are not at
1417 a composed character, we must reset the composition state. */
1418 if (COMPOSING_P (coding->composing))
1419 {
1420 if (c1 < 0xA0)
1421 {
1422 /* We are not in a composite character any longer. */
1423 coding->composing = COMPOSING_NO;
1424 ENCODE_COMPOSITION_END;
1425 }
1426 else
1427 {
1428 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1429 {
1430 *dst++ = c1 & 0x7F;
1431 coding->composing = COMPOSING_WITH_RULE_HEAD;
1432 continue;
1433 }
1434 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1435 coding->composing = COMPOSING_WITH_RULE_RULE;
1436 if (c1 == 0xA0)
1437 {
1438 /* This is an ASCII component. */
1439 ONE_MORE_BYTE (c1);
1440 c1 &= 0x7F;
1441 }
1442 else
1443 /* This is a leading-code of non ASCII component. */
1444 c1 -= 0x20;
1445 }
1446 }
1447
1448 /* Now encode one character. C1 is a control character, an
1449 ASCII character, or a leading-code of multi-byte character. */
1450 switch (emacs_code_class[c1])
1451 {
1452 case EMACS_ascii_code:
bdd9fb48 1453 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
4ed46869
KH
1454 break;
1455
1456 case EMACS_control_code:
1457 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1458 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1459 *dst++ = c1;
1460 break;
1461
1462 case EMACS_carriage_return_code:
1463 if (!coding->selective)
1464 {
1465 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1466 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1467 *dst++ = c1;
1468 break;
1469 }
1470 /* fall down to treat '\r' as '\n' ... */
1471
1472 case EMACS_linefeed_code:
1473 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
e0e989f6
KH
1474 ENCODE_RESET_PLANE_AND_REGISTER;
1475 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1476 bcopy (coding->spec.iso2022.initial_designation,
1477 coding->spec.iso2022.current_designation,
1478 sizeof coding->spec.iso2022.initial_designation);
4ed46869 1479 if (coding->eol_type == CODING_EOL_LF
0ef69138 1480 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
1481 *dst++ = ISO_CODE_LF;
1482 else if (coding->eol_type == CODING_EOL_CRLF)
1483 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1484 else
1485 *dst++ = ISO_CODE_CR;
e0e989f6 1486 CODING_SPEC_ISO_BOL (coding) = 1;
4ed46869
KH
1487 break;
1488
1489 case EMACS_leading_code_2:
1490 ONE_MORE_BYTE (c2);
bdd9fb48 1491 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
1492 break;
1493
1494 case EMACS_leading_code_3:
1495 TWO_MORE_BYTES (c2, c3);
1496 if (c1 < LEADING_CODE_PRIVATE_11)
bdd9fb48 1497 ENCODE_ISO_CHARACTER (c1, c2, c3);
4ed46869 1498 else
bdd9fb48 1499 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
4ed46869
KH
1500 break;
1501
1502 case EMACS_leading_code_4:
1503 THREE_MORE_BYTES (c2, c3, c4);
bdd9fb48 1504 ENCODE_ISO_CHARACTER (c2, c3, c4);
4ed46869
KH
1505 break;
1506
1507 case EMACS_leading_code_composition:
1508 ONE_MORE_BYTE (c1);
1509 if (c1 == 0xFF)
1510 {
1511 coding->composing = COMPOSING_WITH_RULE_HEAD;
1512 ENCODE_COMPOSITION_WITH_RULE_START;
1513 }
1514 else
1515 {
1516 /* Rewind one byte because it is a character code of
1517 composition elements. */
1518 src--;
1519 coding->composing = COMPOSING_NO_RULE_HEAD;
1520 ENCODE_COMPOSITION_NO_RULE_START;
1521 }
1522 break;
1523
1524 case EMACS_invalid_code:
1525 *dst++ = c1;
1526 break;
1527 }
1528 continue;
1529 label_end_of_loop:
1530 coding->carryover_size = src - src_base;
1531 bcopy (src_base, coding->carryover, coding->carryover_size);
4ed46869
KH
1532 break;
1533 }
1534
1535 /* If this is the last block of the text to be encoded, we must
bdd9fb48
KH
1536 reset graphic planes and registers to the initial state. */
1537 if (src >= src_end && coding->last_block)
4ed46869 1538 {
e0e989f6 1539 ENCODE_RESET_PLANE_AND_REGISTER;
bdd9fb48
KH
1540 if (coding->carryover_size > 0
1541 && coding->carryover_size < (dst_end - dst))
1542 {
1543 bcopy (coding->carryover, dst, coding->carryover_size);
1544 dst += coding->carryover_size;
1545 coding->carryover_size = 0;
1546 }
4ed46869
KH
1547 }
1548 *consumed = src - source;
1549 return dst - destination;
1550}
1551
1552\f
1553/*** 4. SJIS and BIG5 handlers ***/
1554
1555/* Although SJIS and BIG5 are not ISO's coding system, They are used
1556 quite widely. So, for the moment, Emacs supports them in the bare
1557 C code. But, in the future, they may be supported only by CCL. */
1558
1559/* SJIS is a coding system encoding three character sets: ASCII, right
1560 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
1561 as is. A character of charset katakana-jisx0201 is encoded by
1562 "position-code + 0x80". A character of charset japanese-jisx0208
1563 is encoded in 2-byte but two position-codes are divided and shifted
1564 so that it fit in the range below.
1565
1566 --- CODE RANGE of SJIS ---
1567 (character set) (range)
1568 ASCII 0x00 .. 0x7F
1569 KATAKANA-JISX0201 0xA0 .. 0xDF
1570 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1571 (2nd byte) 0x40 .. 0xFF
1572 -------------------------------
1573
1574*/
1575
1576/* BIG5 is a coding system encoding two character sets: ASCII and
1577 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1578 character set and is encoded in two-byte.
1579
1580 --- CODE RANGE of BIG5 ---
1581 (character set) (range)
1582 ASCII 0x00 .. 0x7F
1583 Big5 (1st byte) 0xA1 .. 0xFE
1584 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1585 --------------------------
1586
1587 Since the number of characters in Big5 is larger than maximum
1588 characters in Emacs' charset (96x96), it can't be handled as one
1589 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
1590 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
1591 contains frequently used characters and the latter contains less
1592 frequently used characters. */
1593
1594/* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
1595 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
1596 C1 and C2 are the 1st and 2nd position-codes of of Emacs' internal
1597 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
1598
1599/* Number of Big5 characters which have the same code in 1st byte. */
1600#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
1601
1602#define DECODE_BIG5(b1, b2, charset, c1, c2) \
1603 do { \
1604 unsigned int temp \
1605 = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62); \
1606 if (b1 < 0xC9) \
1607 charset = charset_big5_1; \
1608 else \
1609 { \
1610 charset = charset_big5_2; \
1611 temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW; \
1612 } \
1613 c1 = temp / (0xFF - 0xA1) + 0x21; \
1614 c2 = temp % (0xFF - 0xA1) + 0x21; \
1615 } while (0)
1616
1617#define ENCODE_BIG5(charset, c1, c2, b1, b2) \
1618 do { \
1619 unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21); \
1620 if (charset == charset_big5_2) \
1621 temp += BIG5_SAME_ROW * (0xC9 - 0xA1); \
1622 b1 = temp / BIG5_SAME_ROW + 0xA1; \
1623 b2 = temp % BIG5_SAME_ROW; \
1624 b2 += b2 < 0x3F ? 0x40 : 0x62; \
1625 } while (0)
1626
a5d301df
KH
1627#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
1628 do { \
1629 int c_alt, charset_alt = (charset); \
1630 if (!NILP (unification_table) \
1631 && ((c_alt = unify_char (unification_table, \
1632 -1, (charset), c1, c2)) >= 0)) \
1633 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1634 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
1635 DECODE_CHARACTER_ASCII (c1); \
1636 else if (CHARSET_DIMENSION (charset_alt) == 1) \
1637 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
1638 else \
1639 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
1640 } while (0)
1641
1642#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
1643 do { \
1644 int c_alt, charset_alt; \
1645 if (!NILP (unification_table) \
1646 && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1647 >= 0)) \
1648 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1649 else \
1650 charset_alt = charset; \
1651 if (charset_alt == charset_ascii) \
1652 *dst++ = c1; \
1653 else if (CHARSET_DIMENSION (charset_alt) == 1) \
1654 { \
1655 if (sjis_p && charset_alt == charset_katakana_jisx0201) \
1656 *dst++ = c1; \
1657 else \
1658 *dst++ = charset_alt, *dst++ = c1; \
1659 } \
1660 else \
1661 { \
1662 c1 &= 0x7F, c2 &= 0x7F; \
1663 if (sjis_p && charset_alt == charset_jisx0208) \
1664 { \
1665 unsigned char s1, s2; \
1666 \
1667 ENCODE_SJIS (c1, c2, s1, s2); \
1668 *dst++ = s1, *dst++ = s2; \
1669 } \
1670 else if (!sjis_p \
1671 && (charset_alt == charset_big5_1 \
1672 || charset_alt == charset_big5_2)) \
1673 { \
1674 unsigned char b1, b2; \
1675 \
9ce27fde 1676 ENCODE_BIG5 (charset_alt, c1, c2, b1, b2); \
a5d301df
KH
1677 *dst++ = b1, *dst++ = b2; \
1678 } \
1679 else \
1680 *dst++ = charset_alt, *dst++ = c1, *dst++ = c2; \
1681 } \
1682 } while (0);
1683
4ed46869
KH
1684/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1685 Check if a text is encoded in SJIS. If it is, return
1686 CODING_CATEGORY_MASK_SJIS, else return 0. */
1687
1688int
1689detect_coding_sjis (src, src_end)
1690 unsigned char *src, *src_end;
1691{
1692 unsigned char c;
1693
1694 while (src < src_end)
1695 {
1696 c = *src++;
1697 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1698 return 0;
1699 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1700 {
1701 if (src < src_end && *src++ < 0x40)
1702 return 0;
1703 }
1704 }
1705 return CODING_CATEGORY_MASK_SJIS;
1706}
1707
1708/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1709 Check if a text is encoded in BIG5. If it is, return
1710 CODING_CATEGORY_MASK_BIG5, else return 0. */
1711
1712int
1713detect_coding_big5 (src, src_end)
1714 unsigned char *src, *src_end;
1715{
1716 unsigned char c;
1717
1718 while (src < src_end)
1719 {
1720 c = *src++;
1721 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1722 return 0;
1723 if (c >= 0xA1)
1724 {
1725 if (src >= src_end)
1726 break;
1727 c = *src++;
1728 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1729 return 0;
1730 }
1731 }
1732 return CODING_CATEGORY_MASK_BIG5;
1733}
1734
1735/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1736 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
1737
1738int
1739decode_coding_sjis_big5 (coding, source, destination,
1740 src_bytes, dst_bytes, consumed, sjis_p)
1741 struct coding_system *coding;
1742 unsigned char *source, *destination;
1743 int src_bytes, dst_bytes;
1744 int *consumed;
1745 int sjis_p;
1746{
1747 unsigned char *src = source;
1748 unsigned char *src_end = source + src_bytes;
1749 unsigned char *dst = destination;
1750 unsigned char *dst_end = destination + dst_bytes;
1751 /* Since the maximum bytes produced by each loop is 4, we subtract 3
1752 from DST_END to assure overflow checking is necessary only at the
1753 head of loop. */
1754 unsigned char *adjusted_dst_end = dst_end - 3;
a5d301df
KH
1755 Lisp_Object unification_table
1756 = coding->character_unification_table_for_decode;
1757
1758 if (!NILP (Venable_character_unification) && NILP (unification_table))
1759 unification_table = Vstandard_character_unification_table_for_decode;
4ed46869
KH
1760
1761 while (src < src_end && dst < adjusted_dst_end)
1762 {
1763 /* SRC_BASE remembers the start position in source in each loop.
1764 The loop will be exited when there's not enough source text
1765 to analyze two-byte character (within macro ONE_MORE_BYTE).
1766 In that case, SRC is reset to SRC_BASE before exiting. */
1767 unsigned char *src_base = src;
1768 unsigned char c1 = *src++, c2, c3, c4;
1769
1770 if (c1 == '\r')
1771 {
1772 if (coding->eol_type == CODING_EOL_CRLF)
1773 {
1774 ONE_MORE_BYTE (c2);
1775 if (c2 == '\n')
1776 *dst++ = c2;
1777 else
1778 /* To process C2 again, SRC is subtracted by 1. */
1779 *dst++ = c1, src--;
1780 }
1781 else
1782 *dst++ = c1;
1783 }
a5d301df 1784 else if (c1 < 0x20)
4ed46869 1785 *dst++ = c1;
a5d301df
KH
1786 else if (c1 < 0x80)
1787 DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
4ed46869
KH
1788 else if (c1 < 0xA0 || c1 >= 0xE0)
1789 {
1790 /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1791 if (sjis_p)
1792 {
1793 ONE_MORE_BYTE (c2);
1794 DECODE_SJIS (c1, c2, c3, c4);
a5d301df 1795 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
4ed46869
KH
1796 }
1797 else if (c1 >= 0xE0 && c1 < 0xFF)
1798 {
1799 int charset;
1800
1801 ONE_MORE_BYTE (c2);
1802 DECODE_BIG5 (c1, c2, charset, c3, c4);
a5d301df 1803 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
4ed46869
KH
1804 }
1805 else /* Invalid code */
1806 *dst++ = c1;
1807 }
1808 else
1809 {
1810 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1811 if (sjis_p)
a5d301df 1812 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1, /* dummy */ c2);
4ed46869
KH
1813 else
1814 {
1815 int charset;
1816
1817 ONE_MORE_BYTE (c2);
1818 DECODE_BIG5 (c1, c2, charset, c3, c4);
a5d301df 1819 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
4ed46869
KH
1820 }
1821 }
1822 continue;
1823
1824 label_end_of_loop:
1825 coding->carryover_size = src - src_base;
1826 bcopy (src_base, coding->carryover, coding->carryover_size);
1827 src = src_base;
1828 break;
1829 }
1830
1831 *consumed = src - source;
1832 return dst - destination;
1833}
1834
1835/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1836 This function can encode `charset_ascii', `charset_katakana_jisx0201',
1837 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
1838 sure that all these charsets are registered as official charset
1839 (i.e. do not have extended leading-codes). Characters of other
1840 charsets are produced without any encoding. If SJIS_P is 1, encode
1841 SJIS text, else encode BIG5 text. */
1842
1843int
1844encode_coding_sjis_big5 (coding, source, destination,
1845 src_bytes, dst_bytes, consumed, sjis_p)
1846 struct coding_system *coding;
1847 unsigned char *source, *destination;
1848 int src_bytes, dst_bytes;
1849 int *consumed;
1850 int sjis_p;
1851{
1852 unsigned char *src = source;
1853 unsigned char *src_end = source + src_bytes;
1854 unsigned char *dst = destination;
1855 unsigned char *dst_end = destination + dst_bytes;
1856 /* Since the maximum bytes produced by each loop is 2, we subtract 1
1857 from DST_END to assure overflow checking is necessary only at the
1858 head of loop. */
1859 unsigned char *adjusted_dst_end = dst_end - 1;
a5d301df
KH
1860 Lisp_Object unification_table
1861 = coding->character_unification_table_for_encode;
1862
1863 if (!NILP (Venable_character_unification) && NILP (unification_table))
1864 unification_table = Vstandard_character_unification_table_for_encode;
4ed46869
KH
1865
1866 while (src < src_end && dst < adjusted_dst_end)
1867 {
1868 /* SRC_BASE remembers the start position in source in each loop.
1869 The loop will be exited when there's not enough source text
1870 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1871 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
1872 before exiting. */
1873 unsigned char *src_base = src;
1874 unsigned char c1 = *src++, c2, c3, c4;
1875
1876 if (coding->composing)
1877 {
1878 if (c1 == 0xA0)
1879 {
1880 ONE_MORE_BYTE (c1);
1881 c1 &= 0x7F;
1882 }
1883 else if (c1 >= 0xA0)
1884 c1 -= 0x20;
1885 else
1886 coding->composing = 0;
1887 }
1888
1889 switch (emacs_code_class[c1])
1890 {
1891 case EMACS_ascii_code:
a5d301df
KH
1892 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1893 break;
1894
4ed46869
KH
1895 case EMACS_control_code:
1896 *dst++ = c1;
1897 break;
1898
1899 case EMACS_carriage_return_code:
1900 if (!coding->selective)
1901 {
1902 *dst++ = c1;
1903 break;
1904 }
1905 /* fall down to treat '\r' as '\n' ... */
1906
1907 case EMACS_linefeed_code:
1908 if (coding->eol_type == CODING_EOL_LF
0ef69138 1909 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
1910 *dst++ = '\n';
1911 else if (coding->eol_type == CODING_EOL_CRLF)
1912 *dst++ = '\r', *dst++ = '\n';
1913 else
1914 *dst++ = '\r';
1915 break;
1916
1917 case EMACS_leading_code_2:
1918 ONE_MORE_BYTE (c2);
a5d301df 1919 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
1920 break;
1921
1922 case EMACS_leading_code_3:
1923 TWO_MORE_BYTES (c2, c3);
a5d301df 1924 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
4ed46869
KH
1925 break;
1926
1927 case EMACS_leading_code_4:
1928 THREE_MORE_BYTES (c2, c3, c4);
a5d301df 1929 ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
4ed46869
KH
1930 break;
1931
1932 case EMACS_leading_code_composition:
1933 coding->composing = 1;
1934 break;
1935
1936 default: /* i.e. case EMACS_invalid_code: */
1937 *dst++ = c1;
1938 }
1939 continue;
1940
1941 label_end_of_loop:
1942 coding->carryover_size = src - src_base;
1943 bcopy (src_base, coding->carryover, coding->carryover_size);
1944 src = src_base;
1945 break;
1946 }
1947
1948 *consumed = src - source;
1949 return dst - destination;
1950}
1951
1952\f
1953/*** 5. End-of-line handlers ***/
1954
1955/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1956 This function is called only when `coding->eol_type' is
1957 CODING_EOL_CRLF or CODING_EOL_CR. */
1958
1959decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1960 struct coding_system *coding;
1961 unsigned char *source, *destination;
1962 int src_bytes, dst_bytes;
1963 int *consumed;
1964{
1965 unsigned char *src = source;
1966 unsigned char *src_end = source + src_bytes;
1967 unsigned char *dst = destination;
1968 unsigned char *dst_end = destination + dst_bytes;
1969 int produced;
1970
1971 switch (coding->eol_type)
1972 {
1973 case CODING_EOL_CRLF:
1974 {
1975 /* Since the maximum bytes produced by each loop is 2, we
1976 subtract 1 from DST_END to assure overflow checking is
1977 necessary only at the head of loop. */
1978 unsigned char *adjusted_dst_end = dst_end - 1;
1979
1980 while (src < src_end && dst < adjusted_dst_end)
1981 {
1982 unsigned char *src_base = src;
1983 unsigned char c = *src++;
1984 if (c == '\r')
1985 {
1986 ONE_MORE_BYTE (c);
1987 if (c != '\n')
1988 *dst++ = '\r';
bfd99048 1989 *dst++ = c;
4ed46869
KH
1990 }
1991 else
1992 *dst++ = c;
1993 continue;
1994
1995 label_end_of_loop:
1996 coding->carryover_size = src - src_base;
1997 bcopy (src_base, coding->carryover, coding->carryover_size);
1998 src = src_base;
1999 break;
2000 }
2001 *consumed = src - source;
2002 produced = dst - destination;
2003 break;
2004 }
2005
2006 case CODING_EOL_CR:
2007 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2008 bcopy (source, destination, produced);
2009 dst_end = destination + produced;
2010 while (dst < dst_end)
2011 if (*dst++ == '\r') dst[-1] = '\n';
2012 *consumed = produced;
2013 break;
2014
2015 default: /* i.e. case: CODING_EOL_LF */
2016 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2017 bcopy (source, destination, produced);
2018 *consumed = produced;
2019 break;
2020 }
2021
2022 return produced;
2023}
2024
2025/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2026 format of end-of-line according to `coding->eol_type'. If
2027 `coding->selective' is 1, code '\r' in source text also means
2028 end-of-line. */
2029
2030encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2031 struct coding_system *coding;
2032 unsigned char *source, *destination;
2033 int src_bytes, dst_bytes;
2034 int *consumed;
2035{
2036 unsigned char *src = source;
2037 unsigned char *dst = destination;
2038 int produced;
2039
2040 if (src_bytes <= 0)
2041 return 0;
2042
2043 switch (coding->eol_type)
2044 {
2045 case CODING_EOL_LF:
0ef69138 2046 case CODING_EOL_UNDECIDED:
4ed46869
KH
2047 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2048 bcopy (source, destination, produced);
2049 if (coding->selective)
2050 {
2051 int i = produced;
2052 while (i--)
2053 if (*dst++ == '\r') dst[-1] = '\n';
2054 }
2055 *consumed = produced;
2056
2057 case CODING_EOL_CRLF:
2058 {
2059 unsigned char c;
2060 unsigned char *src_end = source + src_bytes;
2061 unsigned char *dst_end = destination + dst_bytes;
2062 /* Since the maximum bytes produced by each loop is 2, we
2063 subtract 1 from DST_END to assure overflow checking is
2064 necessary only at the head of loop. */
2065 unsigned char *adjusted_dst_end = dst_end - 1;
2066
2067 while (src < src_end && dst < adjusted_dst_end)
2068 {
2069 c = *src++;
2070 if (c == '\n' || (c == '\r' && coding->selective))
2071 *dst++ = '\r', *dst++ = '\n';
2072 else
2073 *dst++ = c;
2074 }
2075 produced = dst - destination;
2076 *consumed = src - source;
2077 break;
2078 }
2079
2080 default: /* i.e. case CODING_EOL_CR: */
2081 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2082 bcopy (source, destination, produced);
2083 {
2084 int i = produced;
2085 while (i--)
2086 if (*dst++ == '\n') dst[-1] = '\r';
2087 }
2088 *consumed = produced;
2089 }
2090
2091 return produced;
2092}
2093
2094\f
2095/*** 6. C library functions ***/
2096
2097/* In Emacs Lisp, coding system is represented by a Lisp symbol which
2098 has a property `coding-system'. The value of this property is a
2099 vector of length 5 (called as coding-vector). Among elements of
2100 this vector, the first (element[0]) and the fifth (element[4])
2101 carry important information for decoding/encoding. Before
2102 decoding/encoding, this information should be set in fields of a
2103 structure of type `coding_system'.
2104
2105 A value of property `coding-system' can be a symbol of another
2106 subsidiary coding-system. In that case, Emacs gets coding-vector
2107 from that symbol.
2108
2109 `element[0]' contains information to be set in `coding->type'. The
2110 value and its meaning is as follows:
2111
0ef69138
KH
2112 0 -- coding_type_emacs_mule
2113 1 -- coding_type_sjis
2114 2 -- coding_type_iso2022
2115 3 -- coding_type_big5
2116 4 -- coding_type_ccl encoder/decoder written in CCL
2117 nil -- coding_type_no_conversion
2118 t -- coding_type_undecided (automatic conversion on decoding,
2119 no-conversion on encoding)
4ed46869
KH
2120
2121 `element[4]' contains information to be set in `coding->flags' and
2122 `coding->spec'. The meaning varies by `coding->type'.
2123
2124 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2125 of length 32 (of which the first 13 sub-elements are used now).
2126 Meanings of these sub-elements are:
2127
2128 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2129 If the value is an integer of valid charset, the charset is
2130 assumed to be designated to graphic register N initially.
2131
2132 If the value is minus, it is a minus value of charset which
2133 reserves graphic register N, which means that the charset is
2134 not designated initially but should be designated to graphic
2135 register N just before encoding a character in that charset.
2136
2137 If the value is nil, graphic register N is never used on
2138 encoding.
2139
2140 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2141 Each value takes t or nil. See the section ISO2022 of
2142 `coding.h' for more information.
2143
2144 If `coding->type' is `coding_type_big5', element[4] is t to denote
2145 BIG5-ETen or nil to denote BIG5-HKU.
2146
2147 If `coding->type' takes the other value, element[4] is ignored.
2148
2149 Emacs Lisp's coding system also carries information about format of
2150 end-of-line in a value of property `eol-type'. If the value is
2151 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2152 means CODING_EOL_CR. If it is not integer, it should be a vector
2153 of subsidiary coding systems of which property `eol-type' has one
2154 of above values.
2155
2156*/
2157
2158/* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2159 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2160 is setup so that no conversion is necessary and return -1, else
2161 return 0. */
2162
2163int
e0e989f6
KH
2164setup_coding_system (coding_system, coding)
2165 Lisp_Object coding_system;
4ed46869
KH
2166 struct coding_system *coding;
2167{
4ed46869
KH
2168 Lisp_Object type, eol_type;
2169
2170 /* At first, set several fields default values. */
2171 coding->require_flushing = 0;
2172 coding->last_block = 0;
2173 coding->selective = 0;
2174 coding->composing = 0;
2175 coding->direction = 0;
2176 coding->carryover_size = 0;
4ed46869 2177 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
a5d301df
KH
2178 coding->character_unification_table_for_decode = Qnil;
2179 coding->character_unification_table_for_encode = Qnil;
4ed46869 2180
e0e989f6
KH
2181 Vlast_coding_system_used = coding->symbol = coding_system;
2182 eol_type = Qnil;
2183 /* Get value of property `coding-system' until we get a vector.
2184 While doing that, also get values of properties
a5d301df
KH
2185 `post-read-conversion', `pre-write-conversion',
2186 `character-unification-table-for-decode',
2187 `character-unification-table-for-encode' and `eol-type'. */
e0e989f6 2188 while (!NILP (coding_system) && SYMBOLP (coding_system))
4ed46869 2189 {
4ed46869 2190 if (NILP (coding->post_read_conversion))
e0e989f6 2191 coding->post_read_conversion = Fget (coding_system,
4ed46869 2192 Qpost_read_conversion);
e0e989f6
KH
2193 if (NILP (coding->pre_write_conversion))
2194 coding->pre_write_conversion = Fget (coding_system,
4ed46869 2195 Qpre_write_conversion);
9ce27fde 2196 if (!inhibit_eol_conversion && NILP (eol_type))
e0e989f6 2197 eol_type = Fget (coding_system, Qeol_type);
a5d301df
KH
2198
2199 if (NILP (coding->character_unification_table_for_decode))
2200 coding->character_unification_table_for_decode
2201 = Fget (coding_system, Qcharacter_unification_table_for_decode);
2202
2203 if (NILP (coding->character_unification_table_for_encode))
2204 coding->character_unification_table_for_encode
2205 = Fget (coding_system, Qcharacter_unification_table_for_encode);
2206
e0e989f6 2207 coding_system = Fget (coding_system, Qcoding_system);
4ed46869 2208 }
a5d301df
KH
2209
2210 while (!NILP (coding->character_unification_table_for_decode)
2211 && SYMBOLP (coding->character_unification_table_for_decode))
2212 coding->character_unification_table_for_decode
2213 = Fget (coding->character_unification_table_for_decode,
2214 Qcharacter_unification_table_for_decode);
2215 if (!NILP (coding->character_unification_table_for_decode)
2216 && !CHAR_TABLE_P (coding->character_unification_table_for_decode))
2217 coding->character_unification_table_for_decode = Qnil;
2218
2219 while (!NILP (coding->character_unification_table_for_encode)
2220 && SYMBOLP (coding->character_unification_table_for_encode))
2221 coding->character_unification_table_for_encode
2222 = Fget (coding->character_unification_table_for_encode,
2223 Qcharacter_unification_table_for_encode);
2224 if (!NILP (coding->character_unification_table_for_encode)
2225 && !CHAR_TABLE_P (coding->character_unification_table_for_encode))
2226 coding->character_unification_table_for_encode = Qnil;
2227
e0e989f6
KH
2228 if (!VECTORP (coding_system)
2229 || XVECTOR (coding_system)->size != 5)
4ed46869
KH
2230 goto label_invalid_coding_system;
2231
4ed46869 2232 if (VECTORP (eol_type))
0ef69138 2233 coding->eol_type = CODING_EOL_UNDECIDED;
4ed46869
KH
2234 else if (XFASTINT (eol_type) == 1)
2235 coding->eol_type = CODING_EOL_CRLF;
2236 else if (XFASTINT (eol_type) == 2)
2237 coding->eol_type = CODING_EOL_CR;
2238 else
2239 coding->eol_type = CODING_EOL_LF;
2240
e0e989f6 2241 type = XVECTOR (coding_system)->contents[0];
4ed46869
KH
2242 switch (XFASTINT (type))
2243 {
2244 case 0:
0ef69138 2245 coding->type = coding_type_emacs_mule;
4ed46869
KH
2246 break;
2247
2248 case 1:
2249 coding->type = coding_type_sjis;
2250 break;
2251
2252 case 2:
2253 coding->type = coding_type_iso2022;
2254 {
e0e989f6 2255 Lisp_Object val = XVECTOR (coding_system)->contents[4];
4ed46869
KH
2256 Lisp_Object *flags;
2257 int i, charset, default_reg_bits = 0;
2258
2259 if (!VECTORP (val) || XVECTOR (val)->size != 32)
2260 goto label_invalid_coding_system;
2261
2262 flags = XVECTOR (val)->contents;
2263 coding->flags
2264 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2265 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2266 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2267 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2268 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2269 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2270 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2271 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
e0e989f6
KH
2272 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2273 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2274 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL));
4ed46869
KH
2275
2276 /* Invoke graphic register 0 to plane 0. */
2277 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2278 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2279 CODING_SPEC_ISO_INVOCATION (coding, 1)
2280 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2281 /* Not single shifting at first. */
2282 CODING_SPEC_ISO_SINGLE_SHIFTING(coding) = 0;
e0e989f6
KH
2283 /* Beginning of buffer should also be regarded as bol. */
2284 CODING_SPEC_ISO_BOL(coding) = 1;
4ed46869
KH
2285
2286 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2287 FLAGS[REG] can be one of below:
2288 integer CHARSET: CHARSET occupies register I,
2289 t: designate nothing to REG initially, but can be used
2290 by any charsets,
2291 list of integer, nil, or t: designate the first
2292 element (if integer) to REG initially, the remaining
2293 elements (if integer) is designated to REG on request,
2294 if an element is t, REG can be used by any charset,
2295 nil: REG is never used. */
467e7675 2296 for (charset = 0; charset <= MAX_CHARSET; charset++)
1ba9e4ab
KH
2297 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2298 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
4ed46869
KH
2299 for (i = 0; i < 4; i++)
2300 {
2301 if (INTEGERP (flags[i])
e0e989f6
KH
2302 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2303 || (charset = get_charset_id (flags[i])) >= 0)
4ed46869
KH
2304 {
2305 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2306 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2307 }
2308 else if (EQ (flags[i], Qt))
2309 {
2310 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2311 default_reg_bits |= 1 << i;
2312 }
2313 else if (CONSP (flags[i]))
2314 {
2315 Lisp_Object tail = flags[i];
2316
2317 if (INTEGERP (XCONS (tail)->car)
2318 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
2319 CHARSET_VALID_P (charset))
2320 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
4ed46869
KH
2321 {
2322 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2323 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2324 }
2325 else
2326 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2327 tail = XCONS (tail)->cdr;
2328 while (CONSP (tail))
2329 {
2330 if (INTEGERP (XCONS (tail)->car)
2331 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
2332 CHARSET_VALID_P (charset))
2333 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
4ed46869
KH
2334 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2335 = i;
2336 else if (EQ (XCONS (tail)->car, Qt))
2337 default_reg_bits |= 1 << i;
2338 tail = XCONS (tail)->cdr;
2339 }
2340 }
2341 else
2342 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2343
2344 CODING_SPEC_ISO_DESIGNATION (coding, i)
2345 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2346 }
2347
2348 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2349 {
2350 /* REG 1 can be used only by locking shift in 7-bit env. */
2351 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2352 default_reg_bits &= ~2;
2353 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2354 /* Without any shifting, only REG 0 and 1 can be used. */
2355 default_reg_bits &= 3;
2356 }
2357
467e7675 2358 for (charset = 0; charset <= MAX_CHARSET; charset++)
4ed46869 2359 if (CHARSET_VALID_P (charset)
1ba9e4ab
KH
2360 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2361 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
4ed46869
KH
2362 {
2363 /* We have not yet decided where to designate CHARSET. */
2364 int reg_bits = default_reg_bits;
2365
2366 if (CHARSET_CHARS (charset) == 96)
2367 /* A charset of CHARS96 can't be designated to REG 0. */
2368 reg_bits &= ~1;
2369
2370 if (reg_bits)
2371 /* There exist some default graphic register. */
2372 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2373 = (reg_bits & 1
2374 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2375 else
2376 /* We anyway have to designate CHARSET to somewhere. */
2377 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2378 = (CHARSET_CHARS (charset) == 94
2379 ? 0
2380 : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2381 || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2382 ? 1
2383 : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2384 ? 2 : 0)));
2385 }
2386 }
2387 coding->require_flushing = 1;
2388 break;
2389
2390 case 3:
2391 coding->type = coding_type_big5;
2392 coding->flags
e0e989f6 2393 = (NILP (XVECTOR (coding_system)->contents[4])
4ed46869
KH
2394 ? CODING_FLAG_BIG5_HKU
2395 : CODING_FLAG_BIG5_ETEN);
2396 break;
2397
2398 case 4:
2399 coding->type = coding_type_ccl;
2400 {
e0e989f6 2401 Lisp_Object val = XVECTOR (coding_system)->contents[4];
4ed46869
KH
2402 if (CONSP (val)
2403 && VECTORP (XCONS (val)->car)
2404 && VECTORP (XCONS (val)->cdr))
2405 {
2406 setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2407 setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2408 }
2409 else
2410 goto label_invalid_coding_system;
2411 }
2412 coding->require_flushing = 1;
2413 break;
2414
2415 default:
2416 if (EQ (type, Qt))
0ef69138 2417 coding->type = coding_type_undecided;
4ed46869
KH
2418 else
2419 coding->type = coding_type_no_conversion;
2420 break;
2421 }
2422 return 0;
2423
2424 label_invalid_coding_system:
2425 coding->type = coding_type_no_conversion;
dec137e5 2426 coding->eol_type = CODING_EOL_LF;
e0e989f6
KH
2427 coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2428 = Qnil;
4ed46869
KH
2429 return -1;
2430}
2431
2432/* Emacs has a mechanism to automatically detect a coding system if it
2433 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
2434 it's impossible to distinguish some coding systems accurately
2435 because they use the same range of codes. So, at first, coding
2436 systems are categorized into 7, those are:
2437
0ef69138 2438 o coding-category-emacs-mule
4ed46869
KH
2439
2440 The category for a coding system which has the same code range
2441 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 2442 symbol) `emacs-mule' by default.
4ed46869
KH
2443
2444 o coding-category-sjis
2445
2446 The category for a coding system which has the same code range
2447 as SJIS. Assigned the coding-system (Lisp
e0e989f6 2448 symbol) `shift-jis' by default.
4ed46869
KH
2449
2450 o coding-category-iso-7
2451
2452 The category for a coding system which has the same code range
2453 as ISO2022 of 7-bit environment. Assigned the coding-system
e0e989f6 2454 (Lisp symbol) `iso-2022-7' by default.
4ed46869
KH
2455
2456 o coding-category-iso-8-1
2457
2458 The category for a coding system which has the same code range
2459 as ISO2022 of 8-bit environment and graphic plane 1 used only
2460 for DIMENSION1 charset. Assigned the coding-system (Lisp
e0e989f6 2461 symbol) `iso-8859-1' by default.
4ed46869
KH
2462
2463 o coding-category-iso-8-2
2464
2465 The category for a coding system which has the same code range
2466 as ISO2022 of 8-bit environment and graphic plane 1 used only
2467 for DIMENSION2 charset. Assigned the coding-system (Lisp
e0e989f6 2468 symbol) `euc-japan' by default.
4ed46869
KH
2469
2470 o coding-category-iso-else
2471
2472 The category for a coding system which has the same code range
2473 as ISO2022 but not belongs to any of the above three
2474 categories. Assigned the coding-system (Lisp symbol)
e0e989f6 2475 `iso-2022-ss2-7' by default.
4ed46869
KH
2476
2477 o coding-category-big5
2478
2479 The category for a coding system which has the same code range
2480 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 2481 `cn-big5' by default.
4ed46869
KH
2482
2483 o coding-category-binary
2484
2485 The category for a coding system not categorized in any of the
2486 above. Assigned the coding-system (Lisp symbol)
e0e989f6 2487 `no-conversion' by default.
4ed46869
KH
2488
2489 Each of them is a Lisp symbol and the value is an actual
2490 `coding-system's (this is also a Lisp symbol) assigned by a user.
2491 What Emacs does actually is to detect a category of coding system.
2492 Then, it uses a `coding-system' assigned to it. If Emacs can't
2493 decide only one possible category, it selects a category of the
2494 highest priority. Priorities of categories are also specified by a
2495 user in a Lisp variable `coding-category-list'.
2496
2497*/
2498
2499/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2500 If it detects possible coding systems, return an integer in which
2501 appropriate flag bits are set. Flag bits are defined by macros
2502 CODING_CATEGORY_MASK_XXX in `coding.h'. */
2503
2504int
2505detect_coding_mask (src, src_bytes)
2506 unsigned char *src;
2507 int src_bytes;
2508{
2509 register unsigned char c;
2510 unsigned char *src_end = src + src_bytes;
2511 int mask;
2512
2513 /* At first, skip all ASCII characters and control characters except
2514 for three ISO2022 specific control characters. */
bcf26d6a 2515 label_loop_detect_coding:
4ed46869
KH
2516 while (src < src_end)
2517 {
2518 c = *src;
2519 if (c >= 0x80
2520 || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2521 break;
2522 src++;
2523 }
2524
2525 if (src >= src_end)
2526 /* We found nothing other than ASCII. There's nothing to do. */
2527 return CODING_CATEGORY_MASK_ANY;
2528
2529 /* The text seems to be encoded in some multilingual coding system.
2530 Now, try to find in which coding system the text is encoded. */
2531 if (c < 0x80)
bcf26d6a
KH
2532 {
2533 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2534 /* C is an ISO2022 specific control code of C0. */
2535 mask = detect_coding_iso2022 (src, src_end);
2536 src++;
2537 if (mask == CODING_CATEGORY_MASK_ANY)
2538 /* No valid ISO2022 code follows C. Try again. */
2539 goto label_loop_detect_coding;
2540 }
4ed46869
KH
2541 else if (c == ISO_CODE_SS2 || c == ISO_CODE_SS3 || c == ISO_CODE_CSI)
2542 /* C is an ISO2022 specific control code of C1,
2543 or the first byte of SJIS's 2-byte character code,
2544 or a leading code of Emacs. */
2545 mask = (detect_coding_iso2022 (src, src_end)
2546 | detect_coding_sjis (src, src_end)
0ef69138 2547 | detect_coding_emacs_mule (src, src_end));
4ed46869
KH
2548
2549 else if (c < 0xA0)
2550 /* C is the first byte of SJIS character code,
2551 or a leading-code of Emacs. */
2552 mask = (detect_coding_sjis (src, src_end)
0ef69138 2553 | detect_coding_emacs_mule (src, src_end));
4ed46869
KH
2554
2555 else
2556 /* C is a character of ISO2022 in graphic plane right,
2557 or a SJIS's 1-byte character code (i.e. JISX0201),
2558 or the first byte of BIG5's 2-byte code. */
2559 mask = (detect_coding_iso2022 (src, src_end)
2560 | detect_coding_sjis (src, src_end)
2561 | detect_coding_big5 (src, src_end));
2562
2563 return mask;
2564}
2565
2566/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2567 The information of the detected coding system is set in CODING. */
2568
2569void
2570detect_coding (coding, src, src_bytes)
2571 struct coding_system *coding;
2572 unsigned char *src;
2573 int src_bytes;
2574{
2575 int mask = detect_coding_mask (src, src_bytes);
2576 int idx;
2577
2578 if (mask == CODING_CATEGORY_MASK_ANY)
2579 /* We found nothing other than ASCII. There's nothing to do. */
2580 return;
2581
2582 if (!mask)
2583 /* The source text seems to be encoded in unknown coding system.
2584 Emacs regards the category of such a kind of coding system as
2585 `coding-category-binary'. We assume that a user has assigned
2586 an appropriate coding system for a `coding-category-binary'. */
2587 idx = CODING_CATEGORY_IDX_BINARY;
2588 else
2589 {
2590 /* We found some plausible coding systems. Let's use a coding
2591 system of the highest priority. */
2592 Lisp_Object val = Vcoding_category_list;
2593
2594 if (CONSP (val))
2595 while (!NILP (val))
2596 {
2597 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2598 if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2599 break;
2600 val = XCONS (val)->cdr;
2601 }
2602 else
2603 val = Qnil;
2604
2605 if (NILP (val))
2606 {
2607 /* For unknown reason, `Vcoding_category_list' contains none
2608 of found categories. Let's use any of them. */
2609 for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2610 if (mask & (1 << idx))
2611 break;
2612 }
2613 }
2614 setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2615}
2616
2617/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2618 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
0ef69138 2619 CODING_EOL_CR, and CODING_EOL_UNDECIDED. */
4ed46869
KH
2620
2621int
2622detect_eol_type (src, src_bytes)
2623 unsigned char *src;
2624 int src_bytes;
2625{
2626 unsigned char *src_end = src + src_bytes;
2627 unsigned char c;
2628
2629 while (src < src_end)
2630 {
2631 c = *src++;
2632 if (c == '\n')
2633 return CODING_EOL_LF;
2634 else if (c == '\r')
2635 {
2636 if (src < src_end && *src == '\n')
2637 return CODING_EOL_CRLF;
2638 else
2639 return CODING_EOL_CR;
2640 }
2641 }
0ef69138 2642 return CODING_EOL_UNDECIDED;
4ed46869
KH
2643}
2644
2645/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2646 is encoded. If it detects an appropriate format of end-of-line, it
2647 sets the information in *CODING. */
2648
2649void
2650detect_eol (coding, src, src_bytes)
2651 struct coding_system *coding;
2652 unsigned char *src;
2653 int src_bytes;
2654{
2655 Lisp_Object val;
2656 int eol_type = detect_eol_type (src, src_bytes);
2657
0ef69138 2658 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2659 /* We found no end-of-line in the source text. */
2660 return;
2661
2662 val = Fget (coding->symbol, Qeol_type);
2663 if (VECTORP (val) && XVECTOR (val)->size == 3)
2664 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2665}
2666
2667/* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
2668 decoding, it may detect coding system and format of end-of-line if
2669 those are not yet decided. */
2670
2671int
2672decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2673 struct coding_system *coding;
2674 unsigned char *source, *destination;
2675 int src_bytes, dst_bytes;
2676 int *consumed;
2677{
2678 int produced;
2679
2680 if (src_bytes <= 0)
2681 {
2682 *consumed = 0;
2683 return 0;
2684 }
2685
0ef69138 2686 if (coding->type == coding_type_undecided)
4ed46869
KH
2687 detect_coding (coding, source, src_bytes);
2688
0ef69138 2689 if (coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2690 detect_eol (coding, source, src_bytes);
2691
2692 coding->carryover_size = 0;
2693 switch (coding->type)
2694 {
2695 case coding_type_no_conversion:
2696 label_no_conversion:
2697 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2698 bcopy (source, destination, produced);
2699 *consumed = produced;
2700 break;
2701
0ef69138
KH
2702 case coding_type_emacs_mule:
2703 case coding_type_undecided:
4ed46869 2704 if (coding->eol_type == CODING_EOL_LF
0ef69138 2705 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2706 goto label_no_conversion;
2707 produced = decode_eol (coding, source, destination,
2708 src_bytes, dst_bytes, consumed);
2709 break;
2710
2711 case coding_type_sjis:
2712 produced = decode_coding_sjis_big5 (coding, source, destination,
2713 src_bytes, dst_bytes, consumed,
2714 1);
2715 break;
2716
2717 case coding_type_iso2022:
2718 produced = decode_coding_iso2022 (coding, source, destination,
2719 src_bytes, dst_bytes, consumed);
2720 break;
2721
2722 case coding_type_big5:
2723 produced = decode_coding_sjis_big5 (coding, source, destination,
2724 src_bytes, dst_bytes, consumed,
2725 0);
2726 break;
2727
2728 case coding_type_ccl:
2729 produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2730 src_bytes, dst_bytes, consumed);
2731 break;
2732 }
2733
2734 return produced;
2735}
2736
2737/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
2738
2739int
2740encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2741 struct coding_system *coding;
2742 unsigned char *source, *destination;
2743 int src_bytes, dst_bytes;
2744 int *consumed;
2745{
2746 int produced;
2747
2748 coding->carryover_size = 0;
2749 switch (coding->type)
2750 {
2751 case coding_type_no_conversion:
2752 label_no_conversion:
2753 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2754 if (produced > 0)
2755 {
2756 bcopy (source, destination, produced);
2757 if (coding->selective)
2758 {
2759 unsigned char *p = destination, *pend = destination + produced;
2760 while (p < pend)
e0e989f6 2761 if (*p++ == '\015') p[-1] = '\n';
4ed46869
KH
2762 }
2763 }
2764 *consumed = produced;
2765 break;
2766
0ef69138
KH
2767 case coding_type_emacs_mule:
2768 case coding_type_undecided:
4ed46869 2769 if (coding->eol_type == CODING_EOL_LF
0ef69138 2770 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2771 goto label_no_conversion;
2772 produced = encode_eol (coding, source, destination,
2773 src_bytes, dst_bytes, consumed);
2774 break;
2775
2776 case coding_type_sjis:
2777 produced = encode_coding_sjis_big5 (coding, source, destination,
2778 src_bytes, dst_bytes, consumed,
2779 1);
2780 break;
2781
2782 case coding_type_iso2022:
2783 produced = encode_coding_iso2022 (coding, source, destination,
2784 src_bytes, dst_bytes, consumed);
2785 break;
2786
2787 case coding_type_big5:
2788 produced = encode_coding_sjis_big5 (coding, source, destination,
2789 src_bytes, dst_bytes, consumed,
2790 0);
2791 break;
2792
2793 case coding_type_ccl:
2794 produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2795 src_bytes, dst_bytes, consumed);
2796 break;
2797 }
2798
2799 return produced;
2800}
2801
2802#define CONVERSION_BUFFER_EXTRA_ROOM 256
2803
2804/* Return maximum size (bytes) of a buffer enough for decoding
2805 SRC_BYTES of text encoded in CODING. */
2806
2807int
2808decoding_buffer_size (coding, src_bytes)
2809 struct coding_system *coding;
2810 int src_bytes;
2811{
2812 int magnification;
2813
2814 if (coding->type == coding_type_iso2022)
2815 magnification = 3;
2816 else if (coding->type == coding_type_ccl)
2817 magnification = coding->spec.ccl.decoder.buf_magnification;
2818 else
2819 magnification = 2;
2820
2821 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2822}
2823
2824/* Return maximum size (bytes) of a buffer enough for encoding
2825 SRC_BYTES of text to CODING. */
2826
2827int
2828encoding_buffer_size (coding, src_bytes)
2829 struct coding_system *coding;
2830 int src_bytes;
2831{
2832 int magnification;
2833
2834 if (coding->type == coding_type_ccl)
2835 magnification = coding->spec.ccl.encoder.buf_magnification;
2836 else
2837 magnification = 3;
2838
2839 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2840}
2841
2842#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
2843#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
2844#endif
2845
2846char *conversion_buffer;
2847int conversion_buffer_size;
2848
2849/* Return a pointer to a SIZE bytes of buffer to be used for encoding
2850 or decoding. Sufficient memory is allocated automatically. If we
2851 run out of memory, return NULL. */
2852
2853char *
2854get_conversion_buffer (size)
2855 int size;
2856{
2857 if (size > conversion_buffer_size)
2858 {
2859 char *buf;
2860 int real_size = conversion_buffer_size * 2;
2861
2862 while (real_size < size) real_size *= 2;
2863 buf = (char *) xmalloc (real_size);
2864 xfree (conversion_buffer);
2865 conversion_buffer = buf;
2866 conversion_buffer_size = real_size;
2867 }
2868 return conversion_buffer;
2869}
2870
2871\f
2872#ifdef emacs
2873/*** 7. Emacs Lisp library functions ***/
2874
02ba4723 2875DEFUN ("coding-system-spec", Fcoding_system_spec, Scoding_system_spec,
4ed46869 2876 1, 1, 0,
02ba4723 2877 "Return coding-spec of CODING-SYSTEM.\n\
4ed46869
KH
2878If CODING-SYSTEM is not a valid coding-system, return nil.")
2879 (obj)
2880 Lisp_Object obj;
2881{
2882 while (SYMBOLP (obj) && !NILP (obj))
2883 obj = Fget (obj, Qcoding_system);
2884 return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
2885 ? Qnil : obj);
2886}
2887
2888DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
2889 "Return t if OBJECT is nil or a coding-system.\n\
2890See document of make-coding-system for coding-system object.")
2891 (obj)
2892 Lisp_Object obj;
2893{
02ba4723 2894 return ((NILP (obj) || !NILP (Fcoding_system_spec (obj))) ? Qt : Qnil);
4ed46869
KH
2895}
2896
9d991de8
RS
2897DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
2898 Sread_non_nil_coding_system, 1, 1, 0,
e0e989f6 2899 "Read a coding system from the minibuffer, prompting with string PROMPT.")
4ed46869
KH
2900 (prompt)
2901 Lisp_Object prompt;
2902{
e0e989f6 2903 Lisp_Object val;
9d991de8
RS
2904 do
2905 {
02ba4723 2906 val = Fcompleting_read (prompt, Vobarray, Qcoding_system_spec,
9d991de8
RS
2907 Qt, Qnil, Qnil, Qnil);
2908 }
2909 while (XSTRING (val)->size == 0);
e0e989f6 2910 return (Fintern (val, Qnil));
4ed46869
KH
2911}
2912
2913DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
e0e989f6 2914 "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
4ed46869
KH
2915 (prompt)
2916 Lisp_Object prompt;
2917{
e0e989f6 2918 Lisp_Object val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
9d991de8 2919 Qt, Qnil, Qnil, Qnil);
e0e989f6 2920 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
2921}
2922
2923DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
2924 1, 1, 0,
2925 "Check validity of CODING-SYSTEM.\n\
2926If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
2927CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
2928The value of property should be a vector of length 5.")
2929 (coding_system)
2930 Lisp_Object coding_system;
2931{
2932 CHECK_SYMBOL (coding_system, 0);
2933 if (!NILP (Fcoding_system_p (coding_system)))
2934 return coding_system;
2935 while (1)
02ba4723 2936 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4ed46869
KH
2937}
2938
2939DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
2940 2, 2, 0,
2941 "Detect coding-system of the text in the region between START and END.\n\
2942Return a list of possible coding-systems ordered by priority.\n\
0ef69138 2943If only ASCII characters are found, it returns `undecided'\n\
4ed46869
KH
2944 or its subsidiary coding-system according to a detected end-of-line format.")
2945 (b, e)
2946 Lisp_Object b, e;
2947{
2948 int coding_mask, eol_type;
2949 Lisp_Object val;
2950 int beg, end;
2951
2952 validate_region (&b, &e);
2953 beg = XINT (b), end = XINT (e);
2954 if (beg < GPT && end >= GPT) move_gap (end);
2955
2956 coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
2957 eol_type = detect_eol_type (POS_ADDR (beg), end - beg);
2958
2959 if (coding_mask == CODING_CATEGORY_MASK_ANY)
2960 {
0ef69138
KH
2961 val = intern ("undecided");
2962 if (eol_type != CODING_EOL_UNDECIDED)
4ed46869
KH
2963 {
2964 Lisp_Object val2 = Fget (val, Qeol_type);
2965 if (VECTORP (val2))
2966 val = XVECTOR (val2)->contents[eol_type];
2967 }
2968 }
2969 else
2970 {
2971 Lisp_Object val2;
2972
2973 /* At first, gather possible coding-systems in VAL in a reverse
2974 order. */
2975 val = Qnil;
2976 for (val2 = Vcoding_category_list;
2977 !NILP (val2);
2978 val2 = XCONS (val2)->cdr)
2979 {
2980 int idx
2981 = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
2982 if (coding_mask & (1 << idx))
2983 val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
2984 }
2985
2986 /* Then, change the order of the list, while getting subsidiary
2987 coding-systems. */
2988 val2 = val;
2989 val = Qnil;
2990 for (; !NILP (val2); val2 = XCONS (val2)->cdr)
2991 {
0ef69138 2992 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2993 val = Fcons (XCONS (val2)->car, val);
2994 else
2995 {
2996 Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
2997 if (VECTORP (val3))
2998 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
2999 else
3000 val = Fcons (XCONS (val2)->car, val);
3001 }
3002 }
3003 }
3004
3005 return val;
3006}
3007
3008/* Scan text in the region between *BEGP and *ENDP, skip characters
3009 which we never have to encode to (iff ENCODEP is 1) or decode from
3010 coding system CODING at the head and tail, then set BEGP and ENDP
3011 to the addresses of start and end of the text we actually convert. */
3012
3013void
3014shrink_conversion_area (begp, endp, coding, encodep)
3015 unsigned char **begp, **endp;
3016 struct coding_system *coding;
3017 int encodep;
3018{
3019 register unsigned char *beg_addr = *begp, *end_addr = *endp;
3020
3021 if (coding->eol_type != CODING_EOL_LF
0ef69138 3022 && coding->eol_type != CODING_EOL_UNDECIDED)
4ed46869
KH
3023 /* Since we anyway have to convert end-of-line format, it is not
3024 worth skipping at most 100 bytes or so. */
3025 return;
3026
3027 if (encodep) /* for encoding */
3028 {
3029 switch (coding->type)
3030 {
3031 case coding_type_no_conversion:
0ef69138
KH
3032 case coding_type_emacs_mule:
3033 case coding_type_undecided:
4ed46869
KH
3034 /* We need no conversion. */
3035 *begp = *endp;
3036 return;
3037 case coding_type_ccl:
3038 /* We can't skip any data. */
3039 return;
e0e989f6
KH
3040 case coding_type_iso2022:
3041 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3042 {
3043 unsigned char *bol = beg_addr;
3044 while (beg_addr < end_addr && *beg_addr < 0x80)
3045 {
3046 beg_addr++;
3047 if (*(beg_addr - 1) == '\n')
3048 bol = beg_addr;
3049 }
3050 beg_addr = bol;
3051 goto label_skip_tail;
3052 }
3053 /* fall down ... */
4ed46869
KH
3054 default:
3055 /* We can skip all ASCII characters at the head and tail. */
3056 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
e0e989f6 3057 label_skip_tail:
4ed46869
KH
3058 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3059 break;
3060 }
3061 }
3062 else /* for decoding */
3063 {
3064 switch (coding->type)
3065 {
3066 case coding_type_no_conversion:
3067 /* We need no conversion. */
3068 *begp = *endp;
3069 return;
0ef69138 3070 case coding_type_emacs_mule:
4ed46869
KH
3071 if (coding->eol_type == CODING_EOL_LF)
3072 {
3073 /* We need no conversion. */
3074 *begp = *endp;
3075 return;
3076 }
3077 /* We can skip all but carriage-return. */
3078 while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
3079 while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
3080 break;
3081 case coding_type_sjis:
3082 case coding_type_big5:
3083 /* We can skip all ASCII characters at the head. */
3084 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3085 /* We can skip all ASCII characters at the tail except for
3086 the second byte of SJIS or BIG5 code. */
3087 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3088 if (end_addr != *endp)
3089 end_addr++;
3090 break;
3091 case coding_type_ccl:
3092 /* We can't skip any data. */
3093 return;
3094 default: /* i.e. case coding_type_iso2022: */
3095 {
3096 unsigned char c;
3097
3098 /* We can skip all ASCII characters except for a few
3099 control codes at the head. */
3100 while (beg_addr < end_addr && (c = *beg_addr) < 0x80
3101 && c != ISO_CODE_CR && c != ISO_CODE_SO
3102 && c != ISO_CODE_SI && c != ISO_CODE_ESC)
3103 beg_addr++;
3104 }
3105 break;
3106 }
3107 }
3108 *begp = beg_addr;
3109 *endp = end_addr;
3110 return;
3111}
3112
3113/* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3114 text between B and E. B and E are buffer position. */
3115
3116Lisp_Object
3117code_convert_region (b, e, coding, encodep)
3118 Lisp_Object b, e;
3119 struct coding_system *coding;
3120 int encodep;
3121{
3122 int beg, end, len, consumed, produced;
3123 char *buf;
3124 unsigned char *begp, *endp;
3125 int pos = PT;
3126
3127 validate_region (&b, &e);
3128 beg = XINT (b), end = XINT (e);
3129 if (beg < GPT && end >= GPT)
3130 move_gap (end);
3131
3132 if (encodep && !NILP (coding->pre_write_conversion))
3133 {
3134 /* We must call a pre-conversion function which may put a new
3135 text to be converted in a new buffer. */
3136 struct buffer *old = current_buffer, *new;
3137
3138 TEMP_SET_PT (beg);
3139 call2 (coding->pre_write_conversion, b, e);
3140 if (old != current_buffer)
3141 {
3142 /* Replace the original text by the text just generated. */
3143 len = ZV - BEGV;
3144 new = current_buffer;
3145 set_buffer_internal (old);
3146 del_range (beg, end);
3147 insert_from_buffer (new, 1, len, 0);
3148 end = beg + len;
3149 }
3150 }
3151
3152 /* We may be able to shrink the conversion region. */
3153 begp = POS_ADDR (beg); endp = begp + (end - beg);
3154 shrink_conversion_area (&begp, &endp, coding, encodep);
3155
3156 if (begp == endp)
3157 /* We need no conversion. */
3158 len = end - beg;
3159 else
3160 {
3161 beg += begp - POS_ADDR (beg);
3162 end = beg + (endp - begp);
3163
3164 if (encodep)
3165 len = encoding_buffer_size (coding, end - beg);
3166 else
3167 len = decoding_buffer_size (coding, end - beg);
3168 buf = get_conversion_buffer (len);
3169
3170 coding->last_block = 1;
3171 produced = (encodep
3172 ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3173 &consumed)
3174 : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3175 &consumed));
3176
3177 len = produced + (beg - XINT (b)) + (XINT (e) - end);
3178
3179 TEMP_SET_PT (beg);
3180 insert (buf, produced);
3181 del_range (PT, PT + end - beg);
3182 if (pos >= end)
3183 pos = PT + (pos - end);
3184 else if (pos > beg)
3185 pos = beg;
3186 TEMP_SET_PT (pos);
3187 }
3188
3189 if (!encodep && !NILP (coding->post_read_conversion))
3190 {
3191 /* We must call a post-conversion function which may alter
3192 the text just converted. */
3193 Lisp_Object insval;
3194
3195 beg = XINT (b);
3196 TEMP_SET_PT (beg);
3197 insval = call1 (coding->post_read_conversion, make_number (len));
3198 CHECK_NUMBER (insval, 0);
3199 len = XINT (insval);
3200 }
3201
3202 return make_number (len);
3203}
3204
3205Lisp_Object
e0e989f6
KH
3206code_convert_string (str, coding, encodep, nocopy)
3207 Lisp_Object str, nocopy;
4ed46869
KH
3208 struct coding_system *coding;
3209 int encodep;
3210{
3211 int len, consumed, produced;
3212 char *buf;
3213 unsigned char *begp, *endp;
3214 int head_skip, tail_skip;
3215 struct gcpro gcpro1;
3216
3217 if (encodep && !NILP (coding->pre_write_conversion)
3218 || !encodep && !NILP (coding->post_read_conversion))
3219 {
3220 /* Since we have to call Lisp functions which assume target text
3221 is in a buffer, after setting a temporary buffer, call
3222 code_convert_region. */
3223 int count = specpdl_ptr - specpdl;
3224 int len = XSTRING (str)->size;
3225 Lisp_Object result;
3226 struct buffer *old = current_buffer;
3227
3228 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3229 temp_output_buffer_setup (" *code-converting-work*");
3230 set_buffer_internal (XBUFFER (Vstandard_output));
3231 insert_from_string (str, 0, len, 0);
3232 code_convert_region (make_number (BEGV), make_number (ZV),
3233 coding, encodep);
3234 result = make_buffer_string (BEGV, ZV, 0);
3235 set_buffer_internal (old);
3236 return unbind_to (count, result);
3237 }
3238
3239 /* We may be able to shrink the conversion region. */
3240 begp = XSTRING (str)->data;
3241 endp = begp + XSTRING (str)->size;
3242 shrink_conversion_area (&begp, &endp, coding, encodep);
3243
3244 if (begp == endp)
3245 /* We need no conversion. */
e0e989f6 3246 return (NILP (nocopy) ? Fcopy_sequence (str) : str);
4ed46869
KH
3247
3248 head_skip = begp - XSTRING (str)->data;
3249 tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3250
3251 GCPRO1 (str);
3252
3253 if (encodep)
3254 len = encoding_buffer_size (coding, endp - begp);
3255 else
3256 len = decoding_buffer_size (coding, endp - begp);
3257 buf = get_conversion_buffer (len + head_skip + tail_skip);
3258
3259 bcopy (XSTRING (str)->data, buf, head_skip);
3260 coding->last_block = 1;
3261 produced = (encodep
3262 ? encode_coding (coding, XSTRING (str)->data + head_skip,
3263 buf + head_skip, endp - begp, len, &consumed)
3264 : decode_coding (coding, XSTRING (str)->data + head_skip,
3265 buf + head_skip, endp - begp, len, &consumed));
3266 bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3267 buf + head_skip + produced,
3268 tail_skip);
3269
3270 UNGCPRO;
3271
3272 return make_string (buf, head_skip + produced + tail_skip);
3273}
3274
3275DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
e0e989f6
KH
3276 3, 3, "r\nzCoding system: ",
3277 "Decode current region by specified coding system.\n\
3278When called from a program, takes three arguments:\n\
3279START, END, and CODING-SYSTEM. START END are buffer positions.\n\
4ed46869
KH
3280Return length of decoded text.")
3281 (b, e, coding_system)
3282 Lisp_Object b, e, coding_system;
3283{
3284 struct coding_system coding;
3285
3286 CHECK_NUMBER_COERCE_MARKER (b, 0);
3287 CHECK_NUMBER_COERCE_MARKER (e, 1);
3288 CHECK_SYMBOL (coding_system, 2);
3289
e0e989f6
KH
3290 if (NILP (coding_system))
3291 return make_number (XFASTINT (e) - XFASTINT (b));
4ed46869
KH
3292 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3293 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3294
3295 return code_convert_region (b, e, &coding, 0);
3296}
3297
3298DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
e0e989f6
KH
3299 3, 3, "r\nzCoding system: ",
3300 "Encode current region by specified coding system.\n\
3301When called from a program, takes three arguments:\n\
3302START, END, and CODING-SYSTEM. START END are buffer positions.\n\
4ed46869
KH
3303Return length of encoded text.")
3304 (b, e, coding_system)
3305 Lisp_Object b, e, coding_system;
3306{
3307 struct coding_system coding;
3308
3309 CHECK_NUMBER_COERCE_MARKER (b, 0);
3310 CHECK_NUMBER_COERCE_MARKER (e, 1);
3311 CHECK_SYMBOL (coding_system, 2);
3312
e0e989f6
KH
3313 if (NILP (coding_system))
3314 return make_number (XFASTINT (e) - XFASTINT (b));
4ed46869
KH
3315 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3316 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3317
3318 return code_convert_region (b, e, &coding, 1);
3319}
3320
3321DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
e0e989f6
KH
3322 2, 3, 0,
3323 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3324Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3325of decoding.")
3326 (string, coding_system, nocopy)
3327 Lisp_Object string, coding_system, nocopy;
4ed46869
KH
3328{
3329 struct coding_system coding;
3330
3331 CHECK_STRING (string, 0);
3332 CHECK_SYMBOL (coding_system, 1);
3333
e0e989f6
KH
3334 if (NILP (coding_system))
3335 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869
KH
3336 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3337 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3338
e0e989f6 3339 return code_convert_string (string, &coding, 0, nocopy);
4ed46869
KH
3340}
3341
3342DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
e0e989f6
KH
3343 2, 3, 0,
3344 "Encode STRING to CODING-SYSTEM, and return the result.\n\
3345Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3346of encoding.")
3347 (string, coding_system, nocopy)
3348 Lisp_Object string, coding_system, nocopy;
4ed46869
KH
3349{
3350 struct coding_system coding;
3351
3352 CHECK_STRING (string, 0);
3353 CHECK_SYMBOL (coding_system, 1);
3354
e0e989f6
KH
3355 if (NILP (coding_system))
3356 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869
KH
3357 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3358 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3359
e0e989f6 3360 return code_convert_string (string, &coding, 1, nocopy);
4ed46869
KH
3361}
3362
3363DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
e0e989f6 3364 "Decode a JISX0208 character of shift-jis encoding.\n\
4ed46869
KH
3365CODE is the character code in SJIS.\n\
3366Return the corresponding character.")
3367 (code)
3368 Lisp_Object code;
3369{
3370 unsigned char c1, c2, s1, s2;
3371 Lisp_Object val;
3372
3373 CHECK_NUMBER (code, 0);
3374 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3375 DECODE_SJIS (s1, s2, c1, c2);
3376 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3377 return val;
3378}
3379
3380DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3381 "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3382Return the corresponding character code in SJIS.")
3383 (ch)
3384 Lisp_Object ch;
3385{
bcf26d6a 3386 int charset, c1, c2, s1, s2;
4ed46869
KH
3387 Lisp_Object val;
3388
3389 CHECK_NUMBER (ch, 0);
3390 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3391 if (charset == charset_jisx0208)
3392 {
3393 ENCODE_SJIS (c1, c2, s1, s2);
bcf26d6a 3394 XSETFASTINT (val, (s1 << 8) | s2);
4ed46869
KH
3395 }
3396 else
3397 XSETFASTINT (val, 0);
3398 return val;
3399}
3400
3401DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3402 "Decode a Big5 character CODE of BIG5 coding-system.\n\
3403CODE is the character code in BIG5.\n\
3404Return the corresponding character.")
3405 (code)
3406 Lisp_Object code;
3407{
3408 int charset;
3409 unsigned char b1, b2, c1, c2;
3410 Lisp_Object val;
3411
3412 CHECK_NUMBER (code, 0);
3413 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3414 DECODE_BIG5 (b1, b2, charset, c1, c2);
3415 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3416 return val;
3417}
3418
3419DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3420 "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3421Return the corresponding character code in Big5.")
3422 (ch)
3423 Lisp_Object ch;
3424{
bcf26d6a 3425 int charset, c1, c2, b1, b2;
4ed46869
KH
3426 Lisp_Object val;
3427
3428 CHECK_NUMBER (ch, 0);
3429 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3430 if (charset == charset_big5_1 || charset == charset_big5_2)
3431 {
3432 ENCODE_BIG5 (charset, c1, c2, b1, b2);
bcf26d6a 3433 XSETFASTINT (val, (b1 << 8) | b2);
4ed46869
KH
3434 }
3435 else
3436 XSETFASTINT (val, 0);
3437 return val;
3438}
3439
1ba9e4ab
KH
3440DEFUN ("set-terminal-coding-system-internal",
3441 Fset_terminal_coding_system_internal,
3442 Sset_terminal_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
3443 (coding_system)
3444 Lisp_Object coding_system;
3445{
3446 CHECK_SYMBOL (coding_system, 0);
3447 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
4ed46869
KH
3448 return Qnil;
3449}
3450
3451DEFUN ("terminal-coding-system",
3452 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3453 "Return coding-system of your terminal.")
3454 ()
3455{
3456 return terminal_coding.symbol;
3457}
3458
1ba9e4ab
KH
3459DEFUN ("set-keyboard-coding-system-internal",
3460 Fset_keyboard_coding_system_internal,
3461 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
3462 (coding_system)
3463 Lisp_Object coding_system;
3464{
3465 CHECK_SYMBOL (coding_system, 0);
3466 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3467 return Qnil;
3468}
3469
3470DEFUN ("keyboard-coding-system",
3471 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3472 "Return coding-system of what is sent from terminal keyboard.")
3473 ()
3474{
3475 return keyboard_coding.symbol;
3476}
3477
3478\f
a5d301df
KH
3479DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
3480 Sfind_operation_coding_system, 1, MANY, 0,
3481 "Choose a coding system for an operation based on the target name.\n\
9ce27fde
KH
3482The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
3483DECODING-SYSTEM is the coding system to use for decoding\n\
3484\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
3485for encoding (in case OPERATION does encoding).\n\
ccdb79f5
RS
3486\n\
3487The first argument OPERATION specifies an I/O primitive:\n\
3488 For file I/O, `insert-file-contents' or `write-region'.\n\
3489 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3490 For network I/O, `open-network-stream'.\n\
3491\n\
3492The remaining arguments should be the same arguments that were passed\n\
3493to the primitive. Depending on which primitive, one of those arguments\n\
3494is selected as the TARGET. For example, if OPERATION does file I/O,\n\
3495whichever argument specifies the file name is TARGET.\n\
3496\n\
3497TARGET has a meaning which depends on OPERATION:\n\
4ed46869
KH
3498 For file I/O, TARGET is a file name.\n\
3499 For process I/O, TARGET is a process name.\n\
3500 For network I/O, TARGET is a service name or a port number\n\
3501\n\
02ba4723
KH
3502This function looks up what specified for TARGET in,\n\
3503`file-coding-system-alist', `process-coding-system-alist',\n\
3504or `network-coding-system-alist' depending on OPERATION.\n\
3505They may specify a coding system, a cons of coding systems,\n\
3506or a function symbol to call.\n\
3507In the last case, we call the function with one argument,\n\
9ce27fde 3508which is a list of all the arguments given to this function.")
4ed46869
KH
3509 (nargs, args)
3510 int nargs;
3511 Lisp_Object *args;
3512{
3513 Lisp_Object operation, target_idx, target, val;
3514 register Lisp_Object chain;
3515
3516 if (nargs < 2)
3517 error ("Too few arguments");
3518 operation = args[0];
3519 if (!SYMBOLP (operation)
3520 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3521 error ("Invalid first arguement");
3522 if (nargs < 1 + XINT (target_idx))
3523 error ("Too few arguments for operation: %s",
3524 XSYMBOL (operation)->name->data);
3525 target = args[XINT (target_idx) + 1];
3526 if (!(STRINGP (target)
3527 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3528 error ("Invalid %dth argument", XINT (target_idx) + 1);
3529
2e34157c
RS
3530 chain = ((EQ (operation, Qinsert_file_contents)
3531 || EQ (operation, Qwrite_region))
02ba4723 3532 ? Vfile_coding_system_alist
2e34157c 3533 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
3534 ? Vnetwork_coding_system_alist
3535 : Vprocess_coding_system_alist));
4ed46869
KH
3536 if (NILP (chain))
3537 return Qnil;
3538
02ba4723 3539 for (; CONSP (chain); chain = XCONS (chain)->cdr)
4ed46869
KH
3540 {
3541 Lisp_Object elt = XCONS (chain)->car;
3542
3543 if (CONSP (elt)
3544 && ((STRINGP (target)
3545 && STRINGP (XCONS (elt)->car)
3546 && fast_string_match (XCONS (elt)->car, target) >= 0)
3547 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
02ba4723
KH
3548 {
3549 val = XCONS (elt)->cdr;
3550 if (CONSP (val))
3551 return val;
3552 if (! SYMBOLP (val))
3553 return Qnil;
3554 if (! NILP (Fcoding_system_p (val)))
3555 return Fcons (val, val);
3556 if (!NILP (Fboundp (val)))
5d632ccf 3557 return call1 (val, Flist (nargs, args));
02ba4723
KH
3558 return Qnil;
3559 }
4ed46869
KH
3560 }
3561 return Qnil;
3562}
3563
3564#endif /* emacs */
3565
3566\f
3567/*** 8. Post-amble ***/
3568
3569init_coding_once ()
3570{
3571 int i;
3572
0ef69138 3573 /* Emacs' internal format specific initialize routine. */
4ed46869
KH
3574 for (i = 0; i <= 0x20; i++)
3575 emacs_code_class[i] = EMACS_control_code;
3576 emacs_code_class[0x0A] = EMACS_linefeed_code;
3577 emacs_code_class[0x0D] = EMACS_carriage_return_code;
3578 for (i = 0x21 ; i < 0x7F; i++)
3579 emacs_code_class[i] = EMACS_ascii_code;
3580 emacs_code_class[0x7F] = EMACS_control_code;
3581 emacs_code_class[0x80] = EMACS_leading_code_composition;
3582 for (i = 0x81; i < 0xFF; i++)
3583 emacs_code_class[i] = EMACS_invalid_code;
3584 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3585 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3586 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3587 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3588
3589 /* ISO2022 specific initialize routine. */
3590 for (i = 0; i < 0x20; i++)
3591 iso_code_class[i] = ISO_control_code;
3592 for (i = 0x21; i < 0x7F; i++)
3593 iso_code_class[i] = ISO_graphic_plane_0;
3594 for (i = 0x80; i < 0xA0; i++)
3595 iso_code_class[i] = ISO_control_code;
3596 for (i = 0xA1; i < 0xFF; i++)
3597 iso_code_class[i] = ISO_graphic_plane_1;
3598 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3599 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3600 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3601 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3602 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3603 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3604 iso_code_class[ISO_CODE_ESC] = ISO_escape;
3605 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3606 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3607 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3608
e0e989f6
KH
3609 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3610 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3611
3612 setup_coding_system (Qnil, &keyboard_coding);
3613 setup_coding_system (Qnil, &terminal_coding);
9ce27fde
KH
3614
3615#if defined (MSDOS) || defined (WINDOWSNT)
3616 system_eol_type = CODING_EOL_CRLF;
3617#else
3618 system_eol_type = CODING_EOL_LF;
3619#endif
e0e989f6
KH
3620}
3621
3622#ifdef emacs
3623
3624syms_of_coding ()
3625{
3626 Qtarget_idx = intern ("target-idx");
3627 staticpro (&Qtarget_idx);
3628
9ce27fde 3629 /* Target FILENAME is the first argument. */
e0e989f6 3630 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 3631 /* Target FILENAME is the third argument. */
e0e989f6
KH
3632 Fput (Qwrite_region, Qtarget_idx, make_number (2));
3633
3634 Qcall_process = intern ("call-process");
3635 staticpro (&Qcall_process);
9ce27fde 3636 /* Target PROGRAM is the first argument. */
e0e989f6
KH
3637 Fput (Qcall_process, Qtarget_idx, make_number (0));
3638
3639 Qcall_process_region = intern ("call-process-region");
3640 staticpro (&Qcall_process_region);
9ce27fde 3641 /* Target PROGRAM is the third argument. */
e0e989f6
KH
3642 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3643
3644 Qstart_process = intern ("start-process");
3645 staticpro (&Qstart_process);
9ce27fde 3646 /* Target PROGRAM is the third argument. */
e0e989f6
KH
3647 Fput (Qstart_process, Qtarget_idx, make_number (2));
3648
3649 Qopen_network_stream = intern ("open-network-stream");
3650 staticpro (&Qopen_network_stream);
9ce27fde 3651 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
3652 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3653
4ed46869
KH
3654 Qcoding_system = intern ("coding-system");
3655 staticpro (&Qcoding_system);
3656
3657 Qeol_type = intern ("eol-type");
3658 staticpro (&Qeol_type);
3659
3660 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3661 staticpro (&Qbuffer_file_coding_system);
3662
3663 Qpost_read_conversion = intern ("post-read-conversion");
3664 staticpro (&Qpost_read_conversion);
3665
3666 Qpre_write_conversion = intern ("pre-write-conversion");
3667 staticpro (&Qpre_write_conversion);
3668
02ba4723
KH
3669 Qcoding_system_spec = intern ("coding-system-spec");
3670 staticpro (&Qcoding_system_spec);
4ed46869
KH
3671
3672 Qcoding_system_p = intern ("coding-system-p");
3673 staticpro (&Qcoding_system_p);
3674
3675 Qcoding_system_error = intern ("coding-system-error");
3676 staticpro (&Qcoding_system_error);
3677
3678 Fput (Qcoding_system_error, Qerror_conditions,
3679 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3680 Fput (Qcoding_system_error, Qerror_message,
9ce27fde 3681 build_string ("Invalid coding system"));
4ed46869
KH
3682
3683 Qcoding_category_index = intern ("coding-category-index");
3684 staticpro (&Qcoding_category_index);
3685
3686 {
3687 int i;
3688 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3689 {
3690 coding_category_table[i] = intern (coding_category_name[i]);
3691 staticpro (&coding_category_table[i]);
3692 Fput (coding_category_table[i], Qcoding_category_index,
3693 make_number (i));
3694 }
3695 }
3696
bdd9fb48
KH
3697 Qcharacter_unification_table = intern ("character-unification-table");
3698 staticpro (&Qcharacter_unification_table);
3699 Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
3700 make_number (0));
3701
a5d301df
KH
3702 Qcharacter_unification_table_for_decode
3703 = intern ("character-unification-table-for-decode");
3704 staticpro (&Qcharacter_unification_table_for_decode);
3705
3706 Qcharacter_unification_table_for_encode
3707 = intern ("character-unification-table-for-encode");
3708 staticpro (&Qcharacter_unification_table_for_encode);
3709
9ce27fde
KH
3710 Qemacs_mule = intern ("emacs-mule");
3711 staticpro (&Qemacs_mule);
3712
02ba4723 3713 defsubr (&Scoding_system_spec);
4ed46869
KH
3714 defsubr (&Scoding_system_p);
3715 defsubr (&Sread_coding_system);
3716 defsubr (&Sread_non_nil_coding_system);
3717 defsubr (&Scheck_coding_system);
3718 defsubr (&Sdetect_coding_region);
3719 defsubr (&Sdecode_coding_region);
3720 defsubr (&Sencode_coding_region);
3721 defsubr (&Sdecode_coding_string);
3722 defsubr (&Sencode_coding_string);
3723 defsubr (&Sdecode_sjis_char);
3724 defsubr (&Sencode_sjis_char);
3725 defsubr (&Sdecode_big5_char);
3726 defsubr (&Sencode_big5_char);
1ba9e4ab 3727 defsubr (&Sset_terminal_coding_system_internal);
4ed46869 3728 defsubr (&Sterminal_coding_system);
1ba9e4ab 3729 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 3730 defsubr (&Skeyboard_coding_system);
a5d301df 3731 defsubr (&Sfind_operation_coding_system);
4ed46869
KH
3732
3733 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3734 "List of coding-categories (symbols) ordered by priority.");
3735 {
3736 int i;
3737
3738 Vcoding_category_list = Qnil;
3739 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3740 Vcoding_category_list
3741 = Fcons (coding_category_table[i], Vcoding_category_list);
3742 }
3743
3744 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
3745 "A variable of internal use only.\n\
3746If the value is a coding system, it is used for decoding on read operation.\n\
3747If not, an appropriate element in `coding-system-alist' (which see) is used.");
3748 Vcoding_system_for_read = Qnil;
3749
3750 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
3751 "A variable of internal use only.\n\
3752If the value is a coding system, it is used for encoding on write operation.\n\
3753If not, an appropriate element in `coding-system-alist' (which see) is used.");
3754 Vcoding_system_for_write = Qnil;
3755
3756 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
3757 "Coding-system used in the latest file or process I/O.");
3758 Vlast_coding_system_used = Qnil;
3759
9ce27fde
KH
3760 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
3761 "*Non-nil inhibit code conversion of end-of-line format in any cases.");
3762 inhibit_eol_conversion = 0;
3763
02ba4723
KH
3764 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
3765 "Alist to decide a coding system to use for a file I/O operation.\n\
3766The format is ((PATTERN . VAL) ...),\n\
3767where PATTERN is a regular expression matching a file name,\n\
3768VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3769If VAL is a coding system, it is used for both decoding and encoding\n\
3770the file contents.\n\
3771If VAL is a cons of coding systems, the car part is used for decoding,\n\
3772and the cdr part is used for encoding.\n\
3773If VAL is a function symbol, the function must return a coding system\n\
3774or a cons of coding systems which are used as above.\n\
e0e989f6 3775\n\
9ce27fde 3776See also the function `find-operation-coding-system'.");
02ba4723
KH
3777 Vfile_coding_system_alist = Qnil;
3778
3779 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
3780 "Alist to decide a coding system to use for a process I/O operation.\n\
3781The format is ((PATTERN . VAL) ...),\n\
3782where PATTERN is a regular expression matching a program name,\n\
3783VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3784If VAL is a coding system, it is used for both decoding what received\n\
3785from the program and encoding what sent to the program.\n\
3786If VAL is a cons of coding systems, the car part is used for decoding,\n\
3787and the cdr part is used for encoding.\n\
3788If VAL is a function symbol, the function must return a coding system\n\
3789or a cons of coding systems which are used as above.\n\
4ed46869 3790\n\
9ce27fde 3791See also the function `find-operation-coding-system'.");
02ba4723
KH
3792 Vprocess_coding_system_alist = Qnil;
3793
3794 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
3795 "Alist to decide a coding system to use for a network I/O operation.\n\
3796The format is ((PATTERN . VAL) ...),\n\
3797where PATTERN is a regular expression matching a network service name\n\
3798or is a port number to connect to,\n\
3799VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3800If VAL is a coding system, it is used for both decoding what received\n\
3801from the network stream and encoding what sent to the network stream.\n\
3802If VAL is a cons of coding systems, the car part is used for decoding,\n\
3803and the cdr part is used for encoding.\n\
3804If VAL is a function symbol, the function must return a coding system\n\
3805or a cons of coding systems which are used as above.\n\
4ed46869 3806\n\
9ce27fde 3807See also the function `find-operation-coding-system'.");
02ba4723 3808 Vnetwork_coding_system_alist = Qnil;
4ed46869
KH
3809
3810 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
3811 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
458822a0 3812 eol_mnemonic_unix = ':';
4ed46869
KH
3813
3814 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
3815 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
458822a0 3816 eol_mnemonic_dos = '\\';
4ed46869
KH
3817
3818 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
3819 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
458822a0 3820 eol_mnemonic_mac = '/';
4ed46869
KH
3821
3822 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
3823 "Mnemonic character indicating end-of-line format is not yet decided.");
458822a0 3824 eol_mnemonic_undecided = ':';
4ed46869 3825
bdd9fb48
KH
3826 DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
3827 "Non-nil means ISO 2022 encoder/decoder do character unification.");
3828 Venable_character_unification = Qt;
3829
a5d301df
KH
3830 DEFVAR_LISP ("standard-character-unification-table-for-decode",
3831 &Vstandard_character_unification_table_for_decode,
bdd9fb48 3832 "Table for unifying characters when reading.");
a5d301df 3833 Vstandard_character_unification_table_for_decode = Qnil;
bdd9fb48 3834
a5d301df
KH
3835 DEFVAR_LISP ("standard-character-unification-table-for-encode",
3836 &Vstandard_character_unification_table_for_encode,
bdd9fb48 3837 "Table for unifying characters when writing.");
a5d301df 3838 Vstandard_character_unification_table_for_encode = Qnil;
4ed46869
KH
3839
3840 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
3841 "Alist of charsets vs revision numbers.\n\
3842While encoding, if a charset (car part of an element) is found,\n\
3843designate it with the escape sequence identifing revision (cdr part of the element).");
3844 Vcharset_revision_alist = Qnil;
02ba4723
KH
3845
3846 DEFVAR_LISP ("default-process-coding-system",
3847 &Vdefault_process_coding_system,
3848 "Cons of coding systems used for process I/O by default.\n\
3849The car part is used for decoding a process output,\n\
3850the cdr part is used for encoding a text to be sent to a process.");
3851 Vdefault_process_coding_system = Qnil;
4ed46869
KH
3852}
3853
3854#endif /* emacs */