Include leim/ChangeLog in leim distribution.
[bpt/emacs.git] / src / coding.c
CommitLineData
4ed46869 1/* Coding system handler (conversion, detection, and etc).
203cb916
RS
2 Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4ed46869 4
369314dc
KH
5This file is part of GNU Emacs.
6
7GNU Emacs is free software; you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation; either version 2, or (at your option)
10any later version.
4ed46869 11
369314dc
KH
12GNU Emacs is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
4ed46869 16
369314dc
KH
17You should have received a copy of the GNU General Public License
18along with GNU Emacs; see the file COPYING. If not, write to
19the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20Boston, MA 02111-1307, USA. */
4ed46869
KH
21
22/*** TABLE OF CONTENTS ***
23
24 1. Preamble
0ef69138 25 2. Emacs' internal format (emacs-mule) handlers
4ed46869
KH
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
28 5. End-of-line handlers
29 6. C library functions
30 7. Emacs Lisp library functions
31 8. Post-amble
32
33*/
34
35/*** GENERAL NOTE on CODING SYSTEM ***
36
37 Coding system is an encoding mechanism of one or more character
38 sets. Here's a list of coding systems which Emacs can handle. When
39 we say "decode", it means converting some other coding system to
0ef69138
KH
40 Emacs' internal format (emacs-mule), and when we say "encode",
41 it means converting the coding system emacs-mule to some other
42 coding system.
4ed46869 43
0ef69138 44 0. Emacs' internal format (emacs-mule)
4ed46869
KH
45
46 Emacs itself holds a multi-lingual character in a buffer and a string
f4dee582 47 in a special format. Details are described in section 2.
4ed46869
KH
48
49 1. ISO2022
50
51 The most famous coding system for multiple character sets. X's
f4dee582
RS
52 Compound Text, various EUCs (Extended Unix Code), and coding
53 systems used in Internet communication such as ISO-2022-JP are
54 all variants of ISO2022. Details are described in section 3.
4ed46869
KH
55
56 2. SJIS (or Shift-JIS or MS-Kanji-Code)
57
58 A coding system to encode character sets: ASCII, JISX0201, and
59 JISX0208. Widely used for PC's in Japan. Details are described in
f4dee582 60 section 4.
4ed46869
KH
61
62 3. BIG5
63
64 A coding system to encode character sets: ASCII and Big5. Widely
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
f4dee582
RS
66 described in section 4. In this file, when we write "BIG5"
67 (all uppercase), we mean the coding system, and when we write
68 "Big5" (capitalized), we mean the character set.
4ed46869 69
f4dee582 70 4. Other
4ed46869 71
f4dee582 72 If a user wants to read/write a text encoded in a coding system not
4ed46869
KH
73 listed above, he can supply a decoder and an encoder for it in CCL
74 (Code Conversion Language) programs. Emacs executes the CCL program
75 while reading/writing.
76
f4dee582 77 Emacs represents a coding-system by a Lisp symbol that has a property
4ed46869
KH
78 `coding-system'. But, before actually using the coding-system, the
79 information about it is set in a structure of type `struct
f4dee582 80 coding_system' for rapid processing. See section 6 for more details.
4ed46869
KH
81
82*/
83
84/*** GENERAL NOTES on END-OF-LINE FORMAT ***
85
86 How end-of-line of a text is encoded depends on a system. For
87 instance, Unix's format is just one byte of `line-feed' code,
f4dee582 88 whereas DOS's format is two-byte sequence of `carriage-return' and
4ed46869
KH
89 `line-feed' codes. MacOS's format is one byte of `carriage-return'.
90
f4dee582
RS
91 Since text characters encoding and end-of-line encoding are
92 independent, any coding system described above can take
4ed46869 93 any format of end-of-line. So, Emacs has information of format of
f4dee582 94 end-of-line in each coding-system. See section 6 for more details.
4ed46869
KH
95
96*/
97
98/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
99
100 These functions check if a text between SRC and SRC_END is encoded
101 in the coding system category XXX. Each returns an integer value in
102 which the appropriate flag bits for the category XXX are set. The flag
103 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
104 template of these functions. */
105#if 0
106int
0ef69138 107detect_coding_emacs_mule (src, src_end)
4ed46869
KH
108 unsigned char *src, *src_end;
109{
110 ...
111}
112#endif
113
114/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
115
116 These functions decode SRC_BYTES length text at SOURCE encoded in
0ef69138 117 CODING to Emacs' internal format (emacs-mule). The resulting text
f4dee582
RS
118 goes to a place pointed to by DESTINATION, the length of which should
119 not exceed DST_BYTES. The number of bytes actually processed is
120 returned as *CONSUMED. The return value is the length of the decoded
121 text. Below is a template of these functions. */
4ed46869
KH
122#if 0
123decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
124 struct coding_system *coding;
125 unsigned char *source, *destination;
126 int src_bytes, dst_bytes;
127 int *consumed;
128{
129 ...
130}
131#endif
132
133/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
134
0ef69138
KH
135 These functions encode SRC_BYTES length text at SOURCE of Emacs'
136 internal format (emacs-mule) to CODING. The resulting text goes to
f4dee582
RS
137 a place pointed to by DESTINATION, the length of which should not
138 exceed DST_BYTES. The number of bytes actually processed is
139 returned as *CONSUMED. The return value is the length of the
140 encoded text. Below is a template of these functions. */
4ed46869
KH
141#if 0
142encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
143 struct coding_system *coding;
144 unsigned char *source, *destination;
145 int src_bytes, dst_bytes;
146 int *consumed;
147{
148 ...
149}
150#endif
151
152/*** COMMONLY USED MACROS ***/
153
154/* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
155 THREE_MORE_BYTES safely get one, two, and three bytes from the
156 source text respectively. If there are not enough bytes in the
157 source, they jump to `label_end_of_loop'. The caller should set
158 variables `src' and `src_end' to appropriate areas in advance. */
159
/* Fetch the next source byte into C1 and advance `src' past it.  When
   `src' has already reached `src_end', jump to `label_end_of_loop'
   instead, leaving C1 and `src' untouched.  */
#define ONE_MORE_BYTE(c1)               \
  do {                                  \
    if (src >= src_end)                 \
      goto label_end_of_loop;           \
    c1 = *src++;                        \
  } while (0)
167
/* Fetch the next two source bytes into C1 and C2 and advance `src'
   past them.  When fewer than two bytes remain, jump to
   `label_end_of_loop' without consuming anything.

   The remaining length is computed as `src_end - src' rather than by
   comparing `src + 1' with `src_end': when `src' already equals
   `src_end', `src + 1' would point beyond one-past-the-end of the
   source, which the C standard leaves undefined.  */
#define TWO_MORE_BYTES(c1, c2)          \
  do {                                  \
    if (src_end - src >= 2)             \
      c1 = *src++, c2 = *src++;         \
    else                                \
      goto label_end_of_loop;           \
  } while (0)
175
/* Fetch the next three source bytes into C1, C2 and C3 and advance
   `src' past them.  When fewer than three bytes remain, jump to
   `label_end_of_loop' without consuming anything.

   The remaining length is computed as `src_end - src' rather than by
   comparing `src + 2' with `src_end': near the end of the source,
   `src + 2' would point beyond one-past-the-end, which the C standard
   leaves undefined.  */
#define THREE_MORE_BYTES(c1, c2, c3)            \
  do {                                          \
    if (src_end - src >= 3)                     \
      c1 = *src++, c2 = *src++, c3 = *src++;    \
    else                                        \
      goto label_end_of_loop;                   \
  } while (0)
183
184/* The following three macros DECODE_CHARACTER_ASCII,
185 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
186 the multi-byte form of a character of each class at the place
187 pointed by `dst'. The caller should set the variable `dst' to
188 point to an appropriate area and the variable `coding' to point to
189 the coding-system of the currently decoding text in advance. */
190
/* Store one ASCII character C at `dst' and advance `dst'.  While a
   composition is in progress (COMPOSING_P holds for
   coding->composing), C is written as the 2-byte sequence 0xA0,
   C | 0x80; otherwise it is written as the single byte C.  */

#define DECODE_CHARACTER_ASCII(c)               \
  do {                                          \
    if (!COMPOSING_P (coding->composing))       \
      *dst++ = (c);                             \
    else                                        \
      {                                         \
        *dst++ = 0xA0;                          \
        *dst++ = (c) | 0x80;                    \
      }                                         \
  } while (0)
200
/* Store at `dst' one DIMENSION1 character whose charset is CHARSET
   and whose position-code is C, advancing `dst'.

   The bytes written are: the base leading-code of CHARSET (with 0x20
   added while a composition is in progress), then the extended
   leading-code of CHARSET if that is nonzero, then C | 0x80.  */

#define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
  do {                                                                  \
    unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
    if (COMPOSING_P (coding->composing))                                \
      *dst++ = leading_code + 0x20;                                     \
    else                                                                \
      *dst++ = leading_code;                                            \
    /* Fetch the extended leading-code first, then test it, so the     \
       condition cannot be mistaken for a mistyped comparison.  */      \
    leading_code = CHARSET_LEADING_CODE_EXT (charset);                  \
    if (leading_code != 0)                                              \
      *dst++ = leading_code;                                            \
    *dst++ = (c) | 0x80;                                                \
  } while (0)
215
/* Store at `dst' one DIMENSION2 character whose charset is CHARSET
   and whose position-codes are C1 and C2.  Everything up to and
   including the first position-code is emitted by
   DECODE_CHARACTER_DIMENSION1; the second position-code follows with
   its high bit set.  */

#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
  do {                                                  \
    DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
    *dst++ = 0x80 | (c2);                               \
  } while (0)
224
225\f
226/*** 1. Preamble ***/
227
228#include <stdio.h>
229
230#ifdef emacs
231
232#include <config.h>
233#include "lisp.h"
234#include "buffer.h"
235#include "charset.h"
236#include "ccl.h"
237#include "coding.h"
238#include "window.h"
239
240#else /* not emacs */
241
242#include "mulelib.h"
243
244#endif /* not emacs */
245
246Lisp_Object Qcoding_system, Qeol_type;
247Lisp_Object Qbuffer_file_coding_system;
248Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
249
250extern Lisp_Object Qinsert_file_contents, Qwrite_region;
251Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
252Lisp_Object Qstart_process, Qopen_network_stream;
253Lisp_Object Qtarget_idx;
254
255/* Mnemonic character of each format of end-of-line. */
256int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
257/* Mnemonic character to indicate format of end-of-line is not yet
258 decided. */
259int eol_mnemonic_undecided;
260
9ce27fde
KH
261/* Format of end-of-line decided by system. This is CODING_EOL_LF on
262 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
263int system_eol_type;
264
4ed46869
KH
265#ifdef emacs
266
02ba4723 267Lisp_Object Qcoding_system_spec, Qcoding_system_p, Qcoding_system_error;
4ed46869 268
9ce27fde
KH
269/* Coding system emacs-mule is for converting only end-of-line format. */
270Lisp_Object Qemacs_mule;
271
4ed46869
KH
272/* Coding-systems are handed between Emacs Lisp programs and C internal
273 routines by the following three variables. */
274/* Coding-system for reading files and receiving data from process. */
275Lisp_Object Vcoding_system_for_read;
276/* Coding-system for writing files and sending data to process. */
277Lisp_Object Vcoding_system_for_write;
278/* Coding-system actually used in the latest I/O. */
279Lisp_Object Vlast_coding_system_used;
280
9ce27fde
KH
281/* Flag to inhibit code conversion of end-of-line format. */
282int inhibit_eol_conversion;
283
4ed46869
KH
284/* Coding-system that the terminal accepts for display. */
285struct coding_system terminal_coding;
286
287/* Coding-system of what is sent from terminal keyboard. */
288struct coding_system keyboard_coding;
289
02ba4723
KH
290Lisp_Object Vfile_coding_system_alist;
291Lisp_Object Vprocess_coding_system_alist;
292Lisp_Object Vnetwork_coding_system_alist;
4ed46869
KH
293
294#endif /* emacs */
295
296Lisp_Object Qcoding_category_index;
297
298/* List of symbols `coding-category-xxx' ordered by priority. */
299Lisp_Object Vcoding_category_list;
300
301/* Table of coding-systems currently assigned to each coding-category. */
302Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
303
304/* Table of names of symbol for each coding-category. */
305char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
0ef69138 306 "coding-category-emacs-mule",
4ed46869
KH
307 "coding-category-sjis",
308 "coding-category-iso-7",
309 "coding-category-iso-8-1",
310 "coding-category-iso-8-2",
7717c392
KH
311 "coding-category-iso-7-else",
312 "coding-category-iso-8-else",
4ed46869
KH
313 "coding-category-big5",
314 "coding-category-binary"
315};
316
bdd9fb48
KH
317/* Flag to tell if we look up unification table on character code
318 conversion. */
319Lisp_Object Venable_character_unification;
a5d301df
KH
320/* Standard unification table to look up on decoding (reading). */
321Lisp_Object Vstandard_character_unification_table_for_decode;
322/* Standard unification table to look up on encoding (writing). */
323Lisp_Object Vstandard_character_unification_table_for_encode;
bdd9fb48
KH
324
325Lisp_Object Qcharacter_unification_table;
a5d301df
KH
326Lisp_Object Qcharacter_unification_table_for_decode;
327Lisp_Object Qcharacter_unification_table_for_encode;
4ed46869
KH
328
329/* Alist of charsets vs revision number. */
330Lisp_Object Vcharset_revision_alist;
331
02ba4723
KH
332/* Default coding systems used for process I/O. */
333Lisp_Object Vdefault_process_coding_system;
334
4ed46869 335\f
0ef69138 336/*** 2. Emacs internal format (emacs-mule) handlers ***/
4ed46869
KH
337
338/* Emacs' internal format for encoding multiple character sets is a
f4dee582
RS
339 kind of multi-byte encoding, i.e. characters are encoded by
340 variable-length sequences of one-byte codes. ASCII characters
341 and control characters (e.g. `tab', `newline') are represented by
342 one-byte sequences which are their ASCII codes, in the range 0x00
343 through 0x7F. The other characters are represented by a sequence
344 of `base leading-code', optional `extended leading-code', and one
345 or two `position-code's. The length of the sequence is determined
346 by the base leading-code. Leading-code takes the range 0x80
347 through 0x9F, whereas extended leading-code and position-code take
348 the range 0xA0 through 0xFF. See `charset.h' for more details
349 about leading-code and position-code.
350
351 There's one exception to this rule. Special leading-code
4ed46869
KH
352 `leading-code-composition' denotes that the following several
353 characters should be composed into one character. Leading-codes of
354 components (except for ASCII) are added 0x20. An ASCII character
355 component is represented by a 2-byte sequence of `0xA0' and
f4dee582
RS
356 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
357 details of composite character. Hence, we can summarize the code
4ed46869
KH
358 range as follows:
359
360 --- CODE RANGE of Emacs' internal format ---
361 (character set) (range)
362 ASCII 0x00 .. 0x7F
363 ELSE (1st byte) 0x80 .. 0x9F
364 (rest bytes) 0xA0 .. 0xFF
365 ---------------------------------------------
366
367 */
368
369enum emacs_code_class_type emacs_code_class[256];
370
/* Consume one byte at *SRC and go on to the next statement only if
   that byte is 0xA0 or greater.  If the byte is less than 0xA0, make
   the enclosing function return 0.  If SRC has already reached
   SRC_END, jump to `label_end_of_switch' without reading anything.  */
#define CHECK_CODE_RANGE_A0_FF          \
  do {                                  \
    if (src < src_end)                  \
      {                                 \
        if (*src++ < 0xA0)              \
          return 0;                     \
      }                                 \
    else                                \
      goto label_end_of_switch;         \
  } while (0)
380
381/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
382 Check if a text is encoded in Emacs' internal format. If it is,
0ef69138 383 return CODING_CATEGORY_MASK_EMASC_MULE, else return 0. */
4ed46869
KH
384
385int
0ef69138 386detect_coding_emacs_mule (src, src_end)
4ed46869
KH
387 unsigned char *src, *src_end;
388{
389 unsigned char c;
390 int composing = 0;
391
392 while (src < src_end)
393 {
394 c = *src++;
395
396 if (composing)
397 {
398 if (c < 0xA0)
399 composing = 0;
400 else
401 c -= 0x20;
402 }
403
404 switch (emacs_code_class[c])
405 {
406 case EMACS_ascii_code:
407 case EMACS_linefeed_code:
408 break;
409
410 case EMACS_control_code:
411 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
412 return 0;
413 break;
414
415 case EMACS_invalid_code:
416 return 0;
417
418 case EMACS_leading_code_composition: /* c == 0x80 */
419 if (composing)
420 CHECK_CODE_RANGE_A0_FF;
421 else
422 composing = 1;
423 break;
424
425 case EMACS_leading_code_4:
426 CHECK_CODE_RANGE_A0_FF;
427 /* fall down to check it two more times ... */
428
429 case EMACS_leading_code_3:
430 CHECK_CODE_RANGE_A0_FF;
431 /* fall down to check it one more time ... */
432
433 case EMACS_leading_code_2:
434 CHECK_CODE_RANGE_A0_FF;
435 break;
436
437 default:
438 label_end_of_switch:
439 break;
440 }
441 }
0ef69138 442 return CODING_CATEGORY_MASK_EMACS_MULE;
4ed46869
KH
443}
444
445\f
446/*** 3. ISO2022 handlers ***/
447
448/* The following note describes the coding system ISO2022 briefly.
f4dee582
RS
449 Since the intention of this note is to help in understanding of
450 the programs in this file, some parts are NOT ACCURATE or OVERLY
4ed46869
KH
451 SIMPLIFIED. For the thorough understanding, please refer to the
452 original document of ISO2022.
453
454 ISO2022 provides many mechanisms to encode several character sets
f4dee582 455 in 7-bit and 8-bit environment. If one chooses 7-bit environment,
4ed46869 456 all text is encoded by codes of less than 128. This may make the
f4dee582
RS
457 encoded text a little bit longer, but the text gets more stability
458 to pass through several gateways (some of them strip off the MSB).
4ed46869 459
f4dee582 460 There are two kinds of character set: control character set and
4ed46869
KH
461 graphic character set. The former contains control characters such
462 as `newline' and `escape' to provide control functions (control
f4dee582 463 functions are provided also by escape sequences). The latter
4ed46869
KH
464 contains graphic characters such as 'A' and '-'. Emacs recognizes
465 two control character sets and many graphic character sets.
466
467 Graphic character sets are classified into one of the following
468 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
469 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
470 bytes (DIMENSION) and the number of characters in one dimension
471 (CHARS) of the set. In addition, each character set is assigned an
472 identification tag (called "final character" and denoted as <F>
473 here after) which is unique in each class. <F> of each character
474 set is decided by ECMA(*) when it is registered in ISO. Code range
475 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
476
477 Note (*): ECMA = European Computer Manufacturers Association
478
479 Here are examples of graphic character set [NAME(<F>)]:
480 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
481 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
482 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
483 o DIMENSION2_CHARS96 -- none for the moment
484
485 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
486 C0 [0x00..0x1F] -- control character plane 0
487 GL [0x20..0x7F] -- graphic character plane 0
488 C1 [0x80..0x9F] -- control character plane 1
489 GR [0xA0..0xFF] -- graphic character plane 1
490
491 A control character set is directly designated and invoked to C0 or
492 C1 by an escape sequence. The most common case is that ISO646's
493 control character set is designated/invoked to C0 and ISO6429's
494 control character set is designated/invoked to C1, and usually
495 these designations/invocations are omitted in a coded text. With
496 7-bit environment, only C0 can be used, and a control character for
497 C1 is encoded by an appropriate escape sequence to fit in the
498 environment. Corresponding escape sequences are defined for all
499 control characters of C1.
500
501 A graphic character set is at first designated to one of four
502 graphic registers (G0 through G3), then these graphic registers are
503 invoked to GL or GR. These designations and invocations can be
504 done independently. The most common case is that G0 is invoked to
505 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
506 these invocations and designations are omitted in a coded text.
507 With 7-bit environment, only GL can be used.
508
509 When a graphic character set of CHARS94 is invoked to GL, code 0x20
510 and 0x7F of GL area work as control characters SPACE and DEL
511 respectively, and code 0xA0 and 0xFF of GR area should not be used.
512
513 There are two ways of invocation: locking-shift and single-shift.
514 With locking-shift, the invocation lasts until the next different
515 invocation, whereas with single-shift, the invocation works only
516 for the following character and doesn't affect locking-shift.
517 Invocations are done by the following control characters or escape
518 sequences.
519
520 ----------------------------------------------------------------------
521 function control char escape sequence description
522 ----------------------------------------------------------------------
523 SI (shift-in) 0x0F none invoke G0 to GL
10bff6f1 524 SO (shift-out) 0x0E none invoke G1 to GL
4ed46869
KH
525 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
526 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
527 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
528 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
529 ----------------------------------------------------------------------
530 The first four are for locking-shift. Control characters for these
531 functions are defined by macros ISO_CODE_XXX in `coding.h'.
532
533 Designations are done by the following escape sequences.
534 ----------------------------------------------------------------------
535 escape sequence description
536 ----------------------------------------------------------------------
537 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
538 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
539 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
540 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
541 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
542 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
543 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
544 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
545 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
546 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
547 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
548 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
549 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
550 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
551 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
552 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
553 ----------------------------------------------------------------------
554
555 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
556 of dimension 1, chars 94, and final character <F>, and etc.
557
558 Note (*): Although these designations are not allowed in ISO2022,
559 Emacs accepts them on decoding, and produces them on encoding
560 CHARS96 character set in a coding system which is characterized as
561 7-bit environment, non-locking-shift, and non-single-shift.
562
563 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
564 '(' can be omitted. We call this as "short-form" here after.
565
566 Now you may notice that there are a lot of ways for encoding the
f4dee582 567 same multilingual text in ISO2022. Actually, there exist many
4ed46869
KH
568 coding systems such as Compound Text (used in X's inter client
569 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
570 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
571 localized platforms), and all of these are variants of ISO2022.
572
573 In addition to the above, Emacs handles two more kinds of escape
574 sequences: ISO6429's direction specification and Emacs' private
575 sequence for specifying character composition.
576
577 ISO6429's direction specification takes the following format:
578 o CSI ']' -- end of the current direction
579 o CSI '0' ']' -- end of the current direction
580 o CSI '1' ']' -- start of left-to-right text
581 o CSI '2' ']' -- start of right-to-left text
582 The control character CSI (0x9B: control sequence introducer) is
583 abbreviated to the escape sequence ESC '[' in 7-bit environment.
584
585 Character composition specification takes the following format:
586 o ESC '0' -- start character composition
587 o ESC '1' -- end character composition
588 Since these are not standard escape sequences of any ISO, the use
589 of them for these meaning is restricted to Emacs only. */
590
591enum iso_code_class_type iso_code_class[256];
592
593/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
594 Check if a text is encoded in ISO2022. If it is, returns an
595 integer in which appropriate flag bits any of:
596 CODING_CATEGORY_MASK_ISO_7
597 CODING_CATEGORY_MASK_ISO_8_1
598 CODING_CATEGORY_MASK_ISO_8_2
7717c392
KH
599 CODING_CATEGORY_MASK_ISO_7_ELSE
600 CODING_CATEGORY_MASK_ISO_8_ELSE
4ed46869
KH
601 are set. If a code which should never appear in ISO2022 is found,
602 returns 0. */
603
604int
605detect_coding_iso2022 (src, src_end)
606 unsigned char *src, *src_end;
607{
765a2ca5
KH
608 int mask = (CODING_CATEGORY_MASK_ISO_7
609 | CODING_CATEGORY_MASK_ISO_8_1
610 | CODING_CATEGORY_MASK_ISO_8_2
7717c392
KH
611 | CODING_CATEGORY_MASK_ISO_7_ELSE
612 | CODING_CATEGORY_MASK_ISO_8_ELSE
613 );
bcf26d6a
KH
614 int g1 = 0; /* 1 iff designating to G1. */
615 int c, i;
4ed46869 616
e0e989f6 617 while (src < src_end)
4ed46869
KH
618 {
619 c = *src++;
620 switch (c)
621 {
622 case ISO_CODE_ESC:
e0e989f6 623 if (src >= src_end)
4ed46869
KH
624 break;
625 c = *src++;
bf9cdd4e 626 if ((c >= '(' && c <= '/'))
4ed46869 627 {
bf9cdd4e
KH
628 /* Designation sequence for a charset of dimension 1. */
629 if (src >= src_end)
630 break;
631 c = *src++;
632 if (c < ' ' || c >= 0x80)
633 /* Invalid designation sequence. */
634 return 0;
635 }
636 else if (c == '$')
637 {
638 /* Designation sequence for a charset of dimension 2. */
639 if (src >= src_end)
640 break;
641 c = *src++;
642 if (c >= '@' && c <= 'B')
643 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
644 ;
645 else if (c >= '(' && c <= '/')
bcf26d6a 646 {
bf9cdd4e
KH
647 if (src >= src_end)
648 break;
649 c = *src++;
650 if (c < ' ' || c >= 0x80)
651 /* Invalid designation sequence. */
652 return 0;
bcf26d6a 653 }
bf9cdd4e
KH
654 else
655 /* Invalid designation sequence. */
656 return 0;
4ed46869 657 }
4ed46869 658 else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
bf9cdd4e 659 /* Locking shift. */
7717c392
KH
660 mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
661 | CODING_CATEGORY_MASK_ISO_8_ELSE);
bf9cdd4e
KH
662 else if (c == '0' || c == '1' || c == '2')
663 /* Start/end composition. */
664 ;
665 else
666 /* Invalid escape sequence. */
667 return 0;
4ed46869
KH
668 break;
669
4ed46869 670 case ISO_CODE_SO:
bf9cdd4e
KH
671 mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
672 | CODING_CATEGORY_MASK_ISO_8_ELSE);
e0e989f6
KH
673 break;
674
4ed46869
KH
675 case ISO_CODE_CSI:
676 case ISO_CODE_SS2:
677 case ISO_CODE_SS3:
bf9cdd4e 678 return CODING_CATEGORY_MASK_ISO_8_ELSE;
4ed46869
KH
679
680 default:
681 if (c < 0x80)
682 break;
683 else if (c < 0xA0)
684 return 0;
685 else
686 {
7717c392 687 unsigned char *src_begin = src;
4ed46869 688
7717c392
KH
689 mask &= ~(CODING_CATEGORY_MASK_ISO_7
690 | CODING_CATEGORY_MASK_ISO_7_ELSE);
e0e989f6 691 while (src < src_end && *src >= 0xA0)
7717c392
KH
692 src++;
693 if ((src - src_begin - 1) & 1 && src < src_end)
4ed46869
KH
694 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
695 }
696 break;
697 }
698 }
699
700 return mask;
701}
702
/* Decode a character whose charset is CHARSET and whose 1st position
   code is C1, and store its multi-byte form at `dst'.  If the
   dimension of CHARSET is 2, the 2nd position code is fetched from
   `src' into C2 (jumping to `label_end_of_loop' if no byte is left).
   If CHARSET is negative, it means that we are decoding ill-formed
   text, and all we can do is to emit C1 as is.

   The caller must have `coding', `src', `src_end', `dst', `c2' and
   `unification_table' in scope.  When `unification_table' is non-nil
   and unify_char returns a non-negative character for the code read,
   that character's charset and position codes are emitted instead.

   NOTE(review): for a charset of dimension 1, `c2' is handed to
   unify_char without being assigned by this macro -- confirm that the
   caller or unify_char tolerates that.  */

#define DECODE_ISO_CHARACTER(charset, c1)                               \
  do {                                                                  \
    int c_alt, charset_alt = (charset);                                 \
    if (COMPOSING_HEAD_P (coding->composing))                           \
      {                                                                 \
        /* First component: emit the composition leading code.  */      \
        *dst++ = LEADING_CODE_COMPOSITION;                              \
        if (COMPOSING_WITH_RULE_P (coding->composing))                  \
          /* To tell composition rules are embedded.  */                \
          *dst++ = 0xFF;                                                \
        coding->composing += 2;                                         \
      }                                                                 \
    if ((charset) >= 0)                                                 \
      {                                                                 \
        if (CHARSET_DIMENSION (charset) == 2)                           \
          ONE_MORE_BYTE (c2);                                           \
        if (!NILP (unification_table))                                  \
          {                                                             \
            c_alt = unify_char (unification_table,                      \
                                -1, (charset), c1, c2);                 \
            if (c_alt >= 0)                                             \
              SPLIT_CHAR (c_alt, charset_alt, c1, c2);                  \
          }                                                             \
      }                                                                 \
    if (charset_alt < 0 || charset_alt == CHARSET_ASCII)                \
      DECODE_CHARACTER_ASCII (c1);                                      \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
      DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
    else                                                                \
      DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
    if (COMPOSING_WITH_RULE_P (coding->composing))                      \
      /* To tell a composition rule follows.  */                        \
      coding->composing = COMPOSING_WITH_RULE_RULE;                     \
  } while (0)
739
/* Set designation state into CODING: record that the charset looked
   up in ISO_CHARSET_TABLE for DIMENSION, CHARS and FINAL_CHAR is
   designated to graphic register REG.  Nothing is recorded when the
   lookup yields a negative value.  When coding->direction is 1 and
   the charset has a non-negative reverse charset, the reverse
   charset is designated instead.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)           \
  do {                                                                  \
    int charset = ISO_CHARSET_TABLE (make_number (dimension),           \
                                     make_number (chars),               \
                                     make_number (final_char));         \
    if (charset >= 0)                                                   \
      {                                                                 \
        if (coding->direction == 1)                                     \
          {                                                             \
            if (CHARSET_REVERSE_CHARSET (charset) >= 0)                 \
              charset = CHARSET_REVERSE_CHARSET (charset);              \
          }                                                             \
        CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;            \
      }                                                                 \
  } while (0)
754
755/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
756
757int
758decode_coding_iso2022 (coding, source, destination,
759 src_bytes, dst_bytes, consumed)
760 struct coding_system *coding;
761 unsigned char *source, *destination;
762 int src_bytes, dst_bytes;
763 int *consumed;
764{
765 unsigned char *src = source;
766 unsigned char *src_end = source + src_bytes;
767 unsigned char *dst = destination;
768 unsigned char *dst_end = destination + dst_bytes;
769 /* Since the maximum bytes produced by each loop is 7, we subtract 6
770 from DST_END to assure that overflow checking is necessary only
771 at the head of loop. */
772 unsigned char *adjusted_dst_end = dst_end - 6;
773 int charset;
774 /* Charsets invoked to graphic plane 0 and 1 respectively. */
775 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
776 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
a5d301df
KH
777 Lisp_Object unification_table
778 = coding->character_unification_table_for_decode;
bdd9fb48
KH
779
780 if (!NILP (Venable_character_unification) && NILP (unification_table))
a5d301df 781 unification_table = Vstandard_character_unification_table_for_decode;
4ed46869
KH
782
783 while (src < src_end && dst < adjusted_dst_end)
784 {
785 /* SRC_BASE remembers the start position in source in each loop.
786 The loop will be exited when there's not enough source text
787 to analyze long escape sequence or 2-byte code (within macros
788 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
789 to SRC_BASE before exiting. */
790 unsigned char *src_base = src;
bdd9fb48 791 int c1 = *src++, c2;
4ed46869
KH
792
793 switch (iso_code_class [c1])
794 {
795 case ISO_0x20_or_0x7F:
796 if (!coding->composing
797 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
798 {
799 /* This is SPACE or DEL. */
800 *dst++ = c1;
801 break;
802 }
803 /* This is a graphic character, we fall down ... */
804
805 case ISO_graphic_plane_0:
806 if (coding->composing == COMPOSING_WITH_RULE_RULE)
807 {
808 /* This is a composition rule. */
809 *dst++ = c1 | 0x80;
810 coding->composing = COMPOSING_WITH_RULE_TAIL;
811 }
812 else
813 DECODE_ISO_CHARACTER (charset0, c1);
814 break;
815
816 case ISO_0xA0_or_0xFF:
817 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
818 {
819 /* Invalid code. */
820 *dst++ = c1;
821 break;
822 }
823 /* This is a graphic character, we fall down ... */
824
825 case ISO_graphic_plane_1:
826 DECODE_ISO_CHARACTER (charset1, c1);
827 break;
828
829 case ISO_control_code:
830 /* All ISO2022 control characters in this class have the
831 same representation in Emacs internal format. */
832 *dst++ = c1;
833 break;
834
835 case ISO_carriage_return:
836 if (coding->eol_type == CODING_EOL_CR)
837 {
838 *dst++ = '\n';
839 }
840 else if (coding->eol_type == CODING_EOL_CRLF)
841 {
842 ONE_MORE_BYTE (c1);
843 if (c1 == ISO_CODE_LF)
844 *dst++ = '\n';
845 else
846 {
847 src--;
848 *dst++ = c1;
849 }
850 }
851 else
852 {
853 *dst++ = c1;
854 }
855 break;
856
857 case ISO_shift_out:
e0e989f6
KH
858 if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
859 goto label_invalid_escape_sequence;
4ed46869
KH
860 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
861 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
862 break;
863
864 case ISO_shift_in:
865 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
866 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
867 break;
868
869 case ISO_single_shift_2_7:
870 case ISO_single_shift_2:
871 /* SS2 is handled as an escape sequence of ESC 'N' */
872 c1 = 'N';
873 goto label_escape_sequence;
874
875 case ISO_single_shift_3:
876 /* SS2 is handled as an escape sequence of ESC 'O' */
877 c1 = 'O';
878 goto label_escape_sequence;
879
880 case ISO_control_sequence_introducer:
881 /* CSI is handled as an escape sequence of ESC '[' ... */
882 c1 = '[';
883 goto label_escape_sequence;
884
885 case ISO_escape:
886 ONE_MORE_BYTE (c1);
887 label_escape_sequence:
888 /* Escape sequences handled by Emacs are invocation,
889 designation, direction specification, and character
890 composition specification. */
891 switch (c1)
892 {
893 case '&': /* revision of following character set */
894 ONE_MORE_BYTE (c1);
895 if (!(c1 >= '@' && c1 <= '~'))
e0e989f6 896 goto label_invalid_escape_sequence;
4ed46869
KH
897 ONE_MORE_BYTE (c1);
898 if (c1 != ISO_CODE_ESC)
e0e989f6 899 goto label_invalid_escape_sequence;
4ed46869
KH
900 ONE_MORE_BYTE (c1);
901 goto label_escape_sequence;
902
903 case '$': /* designation of 2-byte character set */
904 ONE_MORE_BYTE (c1);
905 if (c1 >= '@' && c1 <= 'B')
906 { /* designation of JISX0208.1978, GB2312.1980,
907 or JISX0208.1980 */
908 DECODE_DESIGNATION (0, 2, 94, c1);
909 }
910 else if (c1 >= 0x28 && c1 <= 0x2B)
911 { /* designation of DIMENSION2_CHARS94 character set */
912 ONE_MORE_BYTE (c2);
913 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
914 }
915 else if (c1 >= 0x2C && c1 <= 0x2F)
916 { /* designation of DIMENSION2_CHARS96 character set */
917 ONE_MORE_BYTE (c2);
918 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
919 }
920 else
e0e989f6 921 goto label_invalid_escape_sequence;
4ed46869
KH
922 break;
923
924 case 'n': /* invocation of locking-shift-2 */
e0e989f6
KH
925 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
926 goto label_invalid_escape_sequence;
4ed46869 927 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
e0e989f6 928 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
929 break;
930
931 case 'o': /* invocation of locking-shift-3 */
e0e989f6
KH
932 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
933 goto label_invalid_escape_sequence;
4ed46869 934 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
e0e989f6 935 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
936 break;
937
938 case 'N': /* invocation of single-shift-2 */
e0e989f6
KH
939 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
940 goto label_invalid_escape_sequence;
4ed46869
KH
941 ONE_MORE_BYTE (c1);
942 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
943 DECODE_ISO_CHARACTER (charset, c1);
944 break;
945
946 case 'O': /* invocation of single-shift-3 */
e0e989f6
KH
947 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
948 goto label_invalid_escape_sequence;
4ed46869
KH
949 ONE_MORE_BYTE (c1);
950 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
951 DECODE_ISO_CHARACTER (charset, c1);
952 break;
953
954 case '0': /* start composing without embeded rules */
955 coding->composing = COMPOSING_NO_RULE_HEAD;
956 break;
957
958 case '1': /* end composing */
959 coding->composing = COMPOSING_NO;
960 break;
961
962 case '2': /* start composing with embeded rules */
963 coding->composing = COMPOSING_WITH_RULE_HEAD;
964 break;
965
966 case '[': /* specification of direction */
967 /* For the moment, nested direction is not supported.
968 So, the value of `coding->direction' is 0 or 1: 0
969 means left-to-right, 1 means right-to-left. */
970 ONE_MORE_BYTE (c1);
971 switch (c1)
972 {
973 case ']': /* end of the current direction */
974 coding->direction = 0;
975
976 case '0': /* end of the current direction */
977 case '1': /* start of left-to-right direction */
978 ONE_MORE_BYTE (c1);
979 if (c1 == ']')
980 coding->direction = 0;
981 else
982 goto label_invalid_escape_sequence;
983 break;
984
985 case '2': /* start of right-to-left direction */
986 ONE_MORE_BYTE (c1);
987 if (c1 == ']')
988 coding->direction= 1;
989 else
990 goto label_invalid_escape_sequence;
991 break;
992
993 default:
994 goto label_invalid_escape_sequence;
995 }
996 break;
997
998 default:
999 if (c1 >= 0x28 && c1 <= 0x2B)
1000 { /* designation of DIMENSION1_CHARS94 character set */
1001 ONE_MORE_BYTE (c2);
1002 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1003 }
1004 else if (c1 >= 0x2C && c1 <= 0x2F)
1005 { /* designation of DIMENSION1_CHARS96 character set */
1006 ONE_MORE_BYTE (c2);
1007 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1008 }
1009 else
1010 {
1011 goto label_invalid_escape_sequence;
1012 }
1013 }
1014 /* We must update these variables now. */
1015 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1016 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1017 break;
1018
1019 label_invalid_escape_sequence:
1020 {
1021 int length = src - src_base;
1022
1023 bcopy (src_base, dst, length);
1024 dst += length;
1025 }
1026 }
1027 continue;
1028
1029 label_end_of_loop:
1030 coding->carryover_size = src - src_base;
1031 bcopy (src_base, coding->carryover, coding->carryover_size);
1032 src = src_base;
1033 break;
1034 }
1035
1036 /* If this is the last block of the text to be decoded, we had
1037 better just flush out all remaining codes in the text although
1038 they are not valid characters. */
1039 if (coding->last_block)
1040 {
1041 bcopy (src, dst, src_end - src);
1042 dst += (src_end - src);
1043 src = src_end;
1044 }
1045 *consumed = src - source;
1046 return dst - destination;
1047}
1048
f4dee582 1049/* ISO2022 encoding stuff. */
4ed46869
KH
1050
/*
   Saying just "ISO2022" is not enough to describe an encoding; more
   details have to be specified.  In Emacs, each coding-system of the
   ISO2022 variant has the following specifications:
	1. Initial designation to G0 thru G3.
	2. Allows short-form designation?
	3. ASCII should be designated to G0 before control characters?
	4. ASCII should be designated to G0 at end of line?
	5. 7-bit environment or 8-bit environment?
	6. Use locking-shift?
	7. Use Single-shift?
   And the following two are only for Japanese:
	8. Use ASCII in place of JIS0201-1976-Roman?
	9. Use JISX0208-1983 in place of JISX0208-1978?
   These specifications are encoded in `coding->flags' as flag bits
   defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
   details.
*/
1069
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, then record that designation in CODING.

   If Vcharset_revision_alist has an entry for CHARSET, the sequence
   `ESC & <revision + '@'>' is emitted first.  The designation itself
   is `ESC', then `$' for a charset whose dimension is not 1, then an
   intermediate character selected by REG, then the <final-char>.

   The intermediate character is left out (short form) only for a
   94-char charset of dimension other than 1 that goes to register 0,
   whose <final-char> is '@', 'A', or 'B', and only when CODING has
   CODING_FLAG_ISO_SHORT_FORM set.

   `dst' is taken from the scope in which the macro is expanded.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                           \
  do {                                                                     \
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);           \
    char *intermediate_char_94 = "()*+";                                   \
    char *intermediate_char_96 = ",-./";                                   \
    Lisp_Object temp                                                       \
      = Fassq (make_number (charset), Vcharset_revision_alist);            \
                                                                           \
    if (! NILP (temp))                                                     \
      {                                                                    \
        *dst++ = ISO_CODE_ESC;                                             \
        *dst++ = '&';                                                      \
        *dst++ = XINT (XCONS (temp)->cdr) + '@';                           \
      }                                                                    \
                                                                           \
    *dst++ = ISO_CODE_ESC;                                                 \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      *dst++ = '$';                                                        \
                                                                           \
    if (CHARSET_CHARS (charset) != 94)                                     \
      *dst++ = (unsigned char) (intermediate_char_96[reg]);                \
    else if (CHARSET_DIMENSION (charset) == 1                              \
             || ! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)             \
             || reg != 0                                                   \
             || final_char < '@' || final_char > 'B')                      \
      *dst++ = (unsigned char) (intermediate_char_94[reg]);                \
                                                                           \
    *dst++ = final_char;                                                   \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                   \
  } while (0)
1112
/* Emit the ISO2022 single-shift-2 / single-shift-3 function at DST and
   flag CODING as being in single-shifting state.  When `coding->flags'
   has CODING_FLAG_ISO_SEVEN_BITS set, the two-byte escape form
   (`ESC N' / `ESC O') is written; otherwise the one-byte control code
   (ISO_CODE_SS2 / ISO_CODE_SS3) is written.  `coding' and `dst' come
   from the expansion site.  */

#define ENCODE_SINGLE_SHIFT_2                           \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = 'N';                                   \
      }                                                 \
    else                                                \
      {                                                 \
        *dst++ = ISO_CODE_SS2;                          \
      }                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                           \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = 'O';                                   \
      }                                                 \
    else                                                \
      {                                                 \
        *dst++ = ISO_CODE_SS3;                          \
      }                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
  } while (0)
1134
/* Emit one of the four ISO2022 locking-shift functions at DST and
   record in CODING which graphic register is now invoked to graphic
   plane 0:

     ENCODE_SHIFT_IN          ISO_CODE_SI   register 0
     ENCODE_SHIFT_OUT         ISO_CODE_SO   register 1
     ENCODE_LOCKING_SHIFT_2   ESC n         register 2
     ENCODE_LOCKING_SHIFT_3   ESC o         register 3

   `coding' and `dst' come from the expansion site.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
    *dst++ = ISO_CODE_SI;                               \
  } while (0)

#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
    *dst++ = ISO_CODE_SO;                               \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'n';                                       \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'o';                                       \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
  } while (0)
1162
f4dee582
RS
/* Emit at DST the one-byte code of a DIMENSION1 character whose
   charset is CHARSET and whose position-code is C1.

   The body is a retry loop.  Each pass writes the byte and leaves the
   loop if a single shift is pending or CHARSET is currently invoked
   to graphic plane 0 (byte with the 8th bit cleared) or plane 1 (byte
   with the 8th bit set).  Otherwise it calls
   encode_invocation_designation to emit the designation and/or
   invocation that is still missing, and goes around again.

   `coding' and `dst' come from the expansion site.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* A single shift covers exactly this one character.  */        \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)          \
                  ? (c1 & 0x7F) : (c1 | 0x80));                         \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        *dst++ = c1 & 0x7F;                                             \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        *dst++ = c1 | 0x80;                                             \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to any graphic plane yet: designate      \
       and/or invoke it, then retry.  */                                \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
1196
f4dee582
RS
/* Emit at DST the two-byte code of a DIMENSION2 character whose
   charset is CHARSET and whose position-codes are C1 and C2.

   The body is a retry loop with the same shape as the DIMENSION1
   variant: the two bytes are written (8th bit cleared for graphic
   plane 0, set for graphic plane 1, chosen by the 7-bit flag when a
   single shift is pending) and the loop is left; if CHARSET is not
   reachable yet, encode_invocation_designation is called and the loop
   goes around again.

   `coding' and `dst' come from the expansion site.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            *dst++ = c1 & 0x7F;                                         \
            *dst++ = c2 & 0x7F;                                         \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = c1 | 0x80;                                         \
            *dst++ = c2 | 0x80;                                         \
          }                                                             \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        *dst++ = c1 & 0x7F;                                             \
        *dst++ = c2 & 0x7F;                                             \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        *dst++ = c1 | 0x80;                                             \
        *dst++ = c2 | 0x80;                                             \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to any graphic plane yet: designate      \
       and/or invoke it, then retry.  */                                \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
1229
bdd9fb48
KH
/* Encode one character given as CHARSET and position-codes C1, C2.
   If `unification_table' (from the expansion site) is non-nil and
   unify_char yields a non-negative character for it, that character
   is split back into charset and position-codes, overwriting C1 and
   C2, and is encoded instead.  The DIMENSION1 or DIMENSION2 encoder
   is then chosen from the dimension of the resulting charset.  C1 and
   C2 must therefore be lvalues.  */

#define ENCODE_ISO_CHARACTER(charset, c1, c2)                             \
  do {                                                                    \
    int c_alt, charset_alt = charset;                                     \
                                                                          \
    if (!NILP (unification_table))                                        \
      {                                                                   \
        c_alt = unify_char (unification_table, -1, charset, c1, c2);      \
        if (c_alt >= 0)                                                   \
          SPLIT_CHAR (c_alt, charset_alt, c1, c2);                        \
      }                                                                   \
    if (CHARSET_DIMENSION (charset_alt) == 1)                             \
      ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1);                  \
    else                                                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2);              \
  } while (0)
1244
4ed46869
KH
1245/* Produce designation and invocation codes at a place pointed by DST
1246 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1247 Return new DST. */
1248
1249unsigned char *
1250encode_invocation_designation (charset, coding, dst)
1251 int charset;
1252 struct coding_system *coding;
1253 unsigned char *dst;
1254{
1255 int reg; /* graphic register number */
1256
1257 /* At first, check designations. */
1258 for (reg = 0; reg < 4; reg++)
1259 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1260 break;
1261
1262 if (reg >= 4)
1263 {
1264 /* CHARSET is not yet designated to any graphic registers. */
1265 /* At first check the requested designation. */
1266 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1ba9e4ab
KH
1267 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1268 /* Since CHARSET requests no special designation, designate it
1269 to graphic register 0. */
4ed46869
KH
1270 reg = 0;
1271
1272 ENCODE_DESIGNATION (charset, reg, coding);
1273 }
1274
1275 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1276 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1277 {
1278 /* Since the graphic register REG is not invoked to any graphic
1279 planes, invoke it to graphic plane 0. */
1280 switch (reg)
1281 {
1282 case 0: /* graphic register 0 */
1283 ENCODE_SHIFT_IN;
1284 break;
1285
1286 case 1: /* graphic register 1 */
1287 ENCODE_SHIFT_OUT;
1288 break;
1289
1290 case 2: /* graphic register 2 */
1291 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1292 ENCODE_SINGLE_SHIFT_2;
1293 else
1294 ENCODE_LOCKING_SHIFT_2;
1295 break;
1296
1297 case 3: /* graphic register 3 */
1298 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1299 ENCODE_SINGLE_SHIFT_3;
1300 else
1301 ENCODE_LOCKING_SHIFT_3;
1302 break;
1303 }
1304 }
1305 return dst;
1306}
1307
/* The following three macros produce the escape sequences that mark
   composition: `ESC 0' starts composing without rules, `ESC 2' starts
   composing with rules, and `ESC 1' ends composing.  Each writes two
   bytes at `dst', which comes from the expansion site.  */
#define ENCODE_COMPOSITION_NO_RULE_START   (*dst++ = ISO_CODE_ESC, *dst++ = '0')
#define ENCODE_COMPOSITION_WITH_RULE_START (*dst++ = ISO_CODE_ESC, *dst++ = '2')
#define ENCODE_COMPOSITION_END             (*dst++ = ISO_CODE_ESC, *dst++ = '1')
1312
/* The following three macros produce codes for indicating direction
   of text.  `coding' and `dst' come from the expansion site.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER writes `ESC [' when the 7-bit
   flag is set in `coding->flags', else the one-byte ISO_CODE_CSI.
   The flag is tested with `&', like every other 7-bit test in this
   file; comparing the whole flag word with `==' would pick the 8-bit
   form whenever any other flag is set as well.

   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R append `2 ]' and
   `0 ]' respectively.  All three are written as `do ... while (0)'
   statements, because a `do' statement cannot be an operand of the
   comma operator.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = '[';                                   \
      }                                                 \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

#define ENCODE_DIRECTION_R2L                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '2';                                       \
    *dst++ = ']';                                       \
  } while (0)

#define ENCODE_DIRECTION_L2R                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '0';                                       \
    *dst++ = ']';                                       \
  } while (0)
1328
/* Bring the graphic planes and registers of CODING back to their
   initial state, writing the necessary codes at DST: a shift-in if
   graphic plane 0 does not currently invoke register 0, and then, for
   each of the four registers that has a non-negative initial
   designation different from its current one, the sequence that
   designates the initial charset again.  `coding' and `dst' come from
   the expansion site.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                     \
  do {                                                                      \
    int reg;                                                                \
                                                                            \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
      ENCODE_SHIFT_IN;                                                      \
    for (reg = 0; reg < 4; reg++)                                           \
      {                                                                     \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) < 0)          \
          continue;                                                         \
        if (CODING_SPEC_ISO_DESIGNATION (coding, reg)                       \
            == CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))           \
          continue;                                                         \
        ENCODE_DESIGNATION                                                  \
          (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
      }                                                                     \
  } while (0)
1343
bdd9fb48
KH
1344/* Produce designation sequences of charsets in the line started from
1345 *SRC to a place pointed by DSTP.
1346
1347 If the current block ends before any end-of-line, we may fail to
1348 find all the necessary *designations. */
1349encode_designation_at_bol (coding, table, src, src_end, dstp)
e0e989f6 1350 struct coding_system *coding;
bdd9fb48 1351 Lisp_Object table;
e0e989f6
KH
1352 unsigned char *src, *src_end, **dstp;
1353{
bdd9fb48
KH
1354 int charset, c, found = 0, reg;
1355 /* Table of charsets to be designated to each graphic register. */
1356 int r[4];
1357 unsigned char *dst = *dstp;
1358
1359 for (reg = 0; reg < 4; reg++)
1360 r[reg] = -1;
1361
1362 while (src < src_end && *src != '\n' && found < 4)
e0e989f6 1363 {
bdd9fb48
KH
1364 int bytes = BYTES_BY_CHAR_HEAD (*src);
1365
1366 if (NILP (table))
1367 charset = CHARSET_AT (src);
1368 else
e0e989f6 1369 {
bdd9fb48
KH
1370 int c_alt, c1, c2;
1371
1372 SPLIT_STRING(src, bytes, charset, c1, c2);
1373 if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1374 charset = CHAR_CHARSET (c_alt);
e0e989f6 1375 }
bdd9fb48 1376
e0e989f6 1377 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1ba9e4ab 1378 if (r[reg] == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
bdd9fb48
KH
1379 {
1380 found++;
1381 r[reg] = charset;
1382 }
1383
1384 src += bytes;
1385 }
1386
1387 if (found)
1388 {
1389 for (reg = 0; reg < 4; reg++)
1390 if (r[reg] >= 0
1391 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1392 ENCODE_DESIGNATION (r[reg], reg, coding);
1393 *dstp = dst;
e0e989f6 1394 }
e0e989f6
KH
1395}
1396
4ed46869
KH
/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".

   Encode SRC_BYTES bytes of text at SOURCE into an ISO2022 variant
   described by CODING, writing at most DST_BYTES bytes at
   DESTINATION.  The number of source bytes taken is stored in
   *CONSUMED, and the number of bytes written is returned.  The
   ISO2022 and composition state kept in *CODING is updated as codes
   are produced.  */

int
encode_coding_iso2022 (coding, source, destination,
                       src_bytes, dst_bytes, consumed)
     struct coding_system *coding;
     unsigned char *source, *destination;
     int src_bytes, dst_bytes;
     int *consumed;
{
  unsigned char *src = source;
  unsigned char *src_end = source + src_bytes;
  unsigned char *dst = destination;
  unsigned char *dst_end = destination + dst_bytes;
  /* Since the maximum bytes produced by each loop is 20, we subtract 19
     from DST_END to assure overflow checking is necessary only at the
     head of loop.  */
  unsigned char *adjusted_dst_end = dst_end - 19;
  Lisp_Object unification_table
    = coding->character_unification_table_for_encode;

  /* Fall back to the standard table when unification is enabled but
     CODING has no table of its own.  */
  if (!NILP (Venable_character_unification) && NILP (unification_table))
    unification_table = Vstandard_character_unification_table_for_encode;

  while (src < src_end && dst < adjusted_dst_end)
    {
      /* SRC_BASE remembers the start position in source in each loop.
         The loop will be exited when there's not enough source text
         to analyze multi-byte codes (within macros ONE_MORE_BYTE,
         TWO_MORE_BYTES, and THREE_MORE_BYTES).  In that case, the
         bytes from SRC_BASE to SRC_END are saved in
         `coding->carryover' (see label_end_of_loop below).  */
      unsigned char *src_base = src;
      int charset, c1, c2, c3, c4;

      if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
          && CODING_SPEC_ISO_BOL (coding))
        {
          /* We have to produce designation sequences if any now.  */
          encode_designation_at_bol (coding, unification_table,
                                     src, src_end, &dst);
          CODING_SPEC_ISO_BOL (coding) = 0;
        }

      c1 = *src++;
      /* If we are seeing a component of a composite character, we are
         seeing a leading-code specially encoded for composition, or a
         composition rule if composing with rule.  We must set C1
         to a normal leading-code or an ASCII code.  If we are not at
         a composed character, we must reset the composition state.  */
      if (COMPOSING_P (coding->composing))
        {
          if (c1 < 0xA0)
            {
              /* We are not in a composite character any longer.  */
              coding->composing = COMPOSING_NO;
              ENCODE_COMPOSITION_END;
            }
          else
            {
              if (coding->composing == COMPOSING_WITH_RULE_RULE)
                {
                  /* C1 is a composition rule: emit it with the 8th
                     bit cleared and expect a component next.  */
                  *dst++ = c1 & 0x7F;
                  coding->composing = COMPOSING_WITH_RULE_HEAD;
                  continue;
                }
              else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
                coding->composing = COMPOSING_WITH_RULE_RULE;
              if (c1 == 0xA0)
                {
                  /* This is an ASCII component.  */
                  ONE_MORE_BYTE (c1);
                  c1 &= 0x7F;
                }
              else
                /* This is a leading-code of non ASCII component.  */
                c1 -= 0x20;
            }
        }

      /* Now encode one character.  C1 is a control character, an
         ASCII character, or a leading-code of multi-byte character.  */
      switch (emacs_code_class[c1])
        {
        case EMACS_ascii_code:
          ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
          break;

        case EMACS_control_code:
          if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
            ENCODE_RESET_PLANE_AND_REGISTER;
          *dst++ = c1;
          break;

        case EMACS_carriage_return_code:
          if (!coding->selective)
            {
              if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
                ENCODE_RESET_PLANE_AND_REGISTER;
              *dst++ = c1;
              break;
            }
          /* fall down to treat '\r' as '\n' ...  */

        case EMACS_linefeed_code:
          if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
            ENCODE_RESET_PLANE_AND_REGISTER;
          /* Forget the current designations without emitting any
             code: they are simply overwritten by the initial ones.  */
          if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
            bcopy (coding->spec.iso2022.initial_designation,
                   coding->spec.iso2022.current_designation,
                   sizeof coding->spec.iso2022.initial_designation);
          /* Emit the end-of-line in the format CODING asks for.  */
          if (coding->eol_type == CODING_EOL_LF
              || coding->eol_type == CODING_EOL_UNDECIDED)
            *dst++ = ISO_CODE_LF;
          else if (coding->eol_type == CODING_EOL_CRLF)
            *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
          else
            *dst++ = ISO_CODE_CR;
          CODING_SPEC_ISO_BOL (coding) = 1;
          break;

        case EMACS_leading_code_2:
          ONE_MORE_BYTE (c2);
          if (c2 < 0xA0)
            {
              /* invalid sequence: copy the bytes through unchanged */
              *dst++ = c1;
              *dst++ = c2;
            }
          else
            ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
          break;

        case EMACS_leading_code_3:
          TWO_MORE_BYTES (c2, c3);
          if (c2 < 0xA0 || c3 < 0xA0)
            {
              /* invalid sequence: copy the bytes through unchanged */
              *dst++ = c1;
              *dst++ = c2;
              *dst++ = c3;
            }
          else if (c1 < LEADING_CODE_PRIVATE_11)
            /* C1 itself is the charset; C2 and C3 are position-codes.  */
            ENCODE_ISO_CHARACTER (c1, c2, c3);
          else
            /* C2 is the charset and C3 the only position-code.  */
            ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
          break;

        case EMACS_leading_code_4:
          THREE_MORE_BYTES (c2, c3, c4);
          if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
            {
              /* invalid sequence: copy the bytes through unchanged */
              *dst++ = c1;
              *dst++ = c2;
              *dst++ = c3;
              *dst++ = c4;
            }
          else
            ENCODE_ISO_CHARACTER (c2, c3, c4);
          break;

        case EMACS_leading_code_composition:
          ONE_MORE_BYTE (c2);
          if (c2 < 0xA0)
            {
              /* invalid sequence: copy the bytes through unchanged */
              *dst++ = c1;
              *dst++ = c2;
            }
          else if (c2 == 0xFF)
            {
              /* 0xFF right after the leading-code starts a
                 composition with rules.  */
              coding->composing = COMPOSING_WITH_RULE_HEAD;
              ENCODE_COMPOSITION_WITH_RULE_START;
            }
          else
            {
              /* Rewind one byte because it is a character code of
                 composition elements.  */
              src--;
              coding->composing = COMPOSING_NO_RULE_HEAD;
              ENCODE_COMPOSITION_NO_RULE_START;
            }
          break;

        case EMACS_invalid_code:
          *dst++ = c1;
          break;
        }
      continue;
    label_end_of_loop:
      /* We reach here because the source data ends not at character
         boundary.  Keep the incomplete tail in `coding->carryover'
         and treat the whole source as consumed.  */
      coding->carryover_size = src_end - src_base;
      bcopy (src_base, coding->carryover, coding->carryover_size);
      src = src_end;
      break;
    }

  /* If this is the last block of the text to be encoded, we must
     reset graphic planes and registers to the initial state.  Any
     carried-over bytes are then flushed as they are, provided they
     fit in what is left of the destination.  */
  if (src >= src_end && coding->last_block)
    {
      ENCODE_RESET_PLANE_AND_REGISTER;
      if (coding->carryover_size > 0
          && coding->carryover_size < (dst_end - dst))
        {
          bcopy (coding->carryover, dst, coding->carryover_size);
          dst += coding->carryover_size;
          coding->carryover_size = 0;
        }
    }
  *consumed = src - source;
  return dst - destination;
}
1611
1612\f
1613/*** 4. SJIS and BIG5 handlers ***/
1614
/* Although SJIS and BIG5 are not ISO coding systems, they are used
   quite widely.  So, for the moment, Emacs supports them in the bare
   C code.  But, in the future, they may be supported only by CCL.  */
1618
/* SJIS is a coding system encoding three character sets: ASCII, right
   half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
   as is.  A character of charset katakana-jisx0201 is encoded by
   "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, but the two position-codes are divided and
   shifted so that they fit in the ranges below.

   --- CODE RANGE of SJIS ---
   (character set)	(range)
   ASCII		0x00 .. 0x7F
   KATAKANA-JISX0201	0xA0 .. 0xDF
   JISX0208 (1st byte)	0x80 .. 0x9F and 0xE0 .. 0xFF
	    (2nd byte)	0x40 .. 0xFF
   -------------------------------

*/
1635
1636/* BIG5 is a coding system encoding two character sets: ASCII and
1637 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1638 character set and is encoded in two-byte.
1639
1640 --- CODE RANGE of BIG5 ---
1641 (character set) (range)
1642 ASCII 0x00 .. 0x7F
1643 Big5 (1st byte) 0xA1 .. 0xFE
1644 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1645 --------------------------
1646
1647 Since the number of characters in Big5 is larger than maximum
1648 characters in Emacs' charset (96x96), it can't be handled as one
1649 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
1650 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
1651 contains frequently used characters and the latter contains less
1652 frequently used characters. */
1653
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Both macros go through a linear index: the Big5 row (B1 - 0xA1)
   times BIG5_SAME_ROW plus the column, where the column skips the gap
   between the two valid ranges of B2.  Rows from 0xC9 on belong to
   `charset_big5_2', whose index restarts at zero.  The index is then
   re-split in base (0xFF - 0xA1) with an offset of 0x21 to give C1
   and C2.

   Every parameter is parenthesized in the expansions so that
   expression arguments are handled correctly.  Arguments may still be
   evaluated more than once.  CHARSET, C1 and C2 of DECODE_BIG5 and B1
   and B2 of ENCODE_BIG5 must be lvalues.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
1686
a5d301df
KH
/* Store at DST the internal representation of the character given by
   CHARSET and position-codes C1, C2.  If `unification_table' (from
   the expansion site) is non-nil and unify_char yields a non-negative
   character, that character replaces the original one, overwriting C1
   and C2.  The result is stored with DECODE_CHARACTER_ASCII when the
   final charset is CHARSET_ASCII or negative, and otherwise with the
   DIMENSION1 or DIMENSION2 variant according to the charset's
   dimension.  */
#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
  do {                                                                  \
    int c_alt, charset_alt = (charset);                                 \
                                                                        \
    if (!NILP (unification_table))                                      \
      {                                                                 \
        c_alt = unify_char (unification_table, -1, (charset), c1, c2);  \
        if (c_alt >= 0)                                                 \
          SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
      }                                                                 \
    if (charset_alt < 0 || charset_alt == CHARSET_ASCII)                \
      DECODE_CHARACTER_ASCII (c1);                                      \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
      DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
    else                                                                \
      DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
  } while (0)
1701
/* Encode at DST the character given by CHARSET and position-codes C1,
   C2 in SJIS (when `sjis_p' is non-zero) or BIG5 (otherwise).
   `unification_table', `sjis_p' and `dst' come from the expansion
   site; C1 and C2 must be lvalues because they are overwritten.

   If `unification_table' is non-nil and unify_char yields a
   non-negative character, that character is encoded instead.  Then:
     - an ASCII character is written as the single byte C1;
     - a DIMENSION1 character is written as C1 alone when encoding
       SJIS and the charset is katakana-jisx0201, else as the charset
       byte followed by C1;
     - a DIMENSION2 character has the 8th bit of C1 and C2 cleared and
       is converted with ENCODE_SJIS (SJIS and jisx0208) or
       ENCODE_BIG5 (BIG5 and either Big5 charset); any other charset
       is written as the charset byte followed by C1 and C2.

   The definition deliberately does not end in a semicolon: the caller
   supplies it, so the macro can be the body of an `if' that has an
   `else'.  */
#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                       \
  do {                                                                    \
    int c_alt, charset_alt;                                               \
    if (!NILP (unification_table)                                         \
        && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
            >= 0))                                                        \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                            \
    else                                                                  \
      charset_alt = charset;                                              \
    if (charset_alt == charset_ascii)                                     \
      *dst++ = c1;                                                        \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                        \
      {                                                                   \
        if (sjis_p && charset_alt == charset_katakana_jisx0201)           \
          *dst++ = c1;                                                    \
        else                                                              \
          *dst++ = charset_alt, *dst++ = c1;                              \
      }                                                                   \
    else                                                                  \
      {                                                                   \
        c1 &= 0x7F, c2 &= 0x7F;                                           \
        if (sjis_p && charset_alt == charset_jisx0208)                    \
          {                                                               \
            unsigned char s1, s2;                                         \
                                                                          \
            ENCODE_SJIS (c1, c2, s1, s2);                                 \
            *dst++ = s1, *dst++ = s2;                                     \
          }                                                               \
        else if (!sjis_p                                                  \
                 && (charset_alt == charset_big5_1                        \
                     || charset_alt == charset_big5_2))                   \
          {                                                               \
            unsigned char b1, b2;                                         \
                                                                          \
            ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);                    \
            *dst++ = b1, *dst++ = b2;                                     \
          }                                                               \
        else                                                              \
          *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;                 \
      }                                                                   \
  } while (0)
1743
4ed46869
KH
1744/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1745 Check if a text is encoded in SJIS. If it is, return
1746 CODING_CATEGORY_MASK_SJIS, else return 0. */
1747
1748int
1749detect_coding_sjis (src, src_end)
1750 unsigned char *src, *src_end;
1751{
1752 unsigned char c;
1753
1754 while (src < src_end)
1755 {
1756 c = *src++;
1757 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1758 return 0;
1759 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1760 {
1761 if (src < src_end && *src++ < 0x40)
1762 return 0;
1763 }
1764 }
1765 return CODING_CATEGORY_MASK_SJIS;
1766}
1767
1768/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1769 Check if a text is encoded in BIG5. If it is, return
1770 CODING_CATEGORY_MASK_BIG5, else return 0. */
1771
1772int
1773detect_coding_big5 (src, src_end)
1774 unsigned char *src, *src_end;
1775{
1776 unsigned char c;
1777
1778 while (src < src_end)
1779 {
1780 c = *src++;
1781 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1782 return 0;
1783 if (c >= 0xA1)
1784 {
1785 if (src >= src_end)
1786 break;
1787 c = *src++;
1788 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1789 return 0;
1790 }
1791 }
1792 return CODING_CATEGORY_MASK_BIG5;
1793}
1794
1795/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1796 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
1797
1798int
1799decode_coding_sjis_big5 (coding, source, destination,
1800 src_bytes, dst_bytes, consumed, sjis_p)
1801 struct coding_system *coding;
1802 unsigned char *source, *destination;
1803 int src_bytes, dst_bytes;
1804 int *consumed;
1805 int sjis_p;
1806{
1807 unsigned char *src = source;
1808 unsigned char *src_end = source + src_bytes;
1809 unsigned char *dst = destination;
1810 unsigned char *dst_end = destination + dst_bytes;
1811 /* Since the maximum bytes produced by each loop is 4, we subtract 3
1812 from DST_END to assure overflow checking is necessary only at the
1813 head of loop. */
1814 unsigned char *adjusted_dst_end = dst_end - 3;
a5d301df
KH
1815 Lisp_Object unification_table
1816 = coding->character_unification_table_for_decode;
1817
1818 if (!NILP (Venable_character_unification) && NILP (unification_table))
1819 unification_table = Vstandard_character_unification_table_for_decode;
4ed46869
KH
1820
1821 while (src < src_end && dst < adjusted_dst_end)
1822 {
1823 /* SRC_BASE remembers the start position in source in each loop.
1824 The loop will be exited when there's not enough source text
1825 to analyze two-byte character (within macro ONE_MORE_BYTE).
1826 In that case, SRC is reset to SRC_BASE before exiting. */
1827 unsigned char *src_base = src;
1828 unsigned char c1 = *src++, c2, c3, c4;
1829
1830 if (c1 == '\r')
1831 {
1832 if (coding->eol_type == CODING_EOL_CRLF)
1833 {
1834 ONE_MORE_BYTE (c2);
1835 if (c2 == '\n')
1836 *dst++ = c2;
1837 else
1838 /* To process C2 again, SRC is subtracted by 1. */
1839 *dst++ = c1, src--;
1840 }
1841 else
1842 *dst++ = c1;
1843 }
a5d301df 1844 else if (c1 < 0x20)
4ed46869 1845 *dst++ = c1;
a5d301df
KH
1846 else if (c1 < 0x80)
1847 DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
4ed46869
KH
1848 else if (c1 < 0xA0 || c1 >= 0xE0)
1849 {
1850 /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1851 if (sjis_p)
1852 {
1853 ONE_MORE_BYTE (c2);
1854 DECODE_SJIS (c1, c2, c3, c4);
a5d301df 1855 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
4ed46869
KH
1856 }
1857 else if (c1 >= 0xE0 && c1 < 0xFF)
1858 {
1859 int charset;
1860
1861 ONE_MORE_BYTE (c2);
1862 DECODE_BIG5 (c1, c2, charset, c3, c4);
a5d301df 1863 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
4ed46869
KH
1864 }
1865 else /* Invalid code */
1866 *dst++ = c1;
1867 }
1868 else
1869 {
1870 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1871 if (sjis_p)
a5d301df 1872 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1, /* dummy */ c2);
4ed46869
KH
1873 else
1874 {
1875 int charset;
1876
1877 ONE_MORE_BYTE (c2);
1878 DECODE_BIG5 (c1, c2, charset, c3, c4);
a5d301df 1879 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
4ed46869
KH
1880 }
1881 }
1882 continue;
1883
1884 label_end_of_loop:
1885 coding->carryover_size = src - src_base;
1886 bcopy (src_base, coding->carryover, coding->carryover_size);
1887 src = src_base;
1888 break;
1889 }
1890
1891 *consumed = src - source;
1892 return dst - destination;
1893}
1894
1895/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1896 This function can encode `charset_ascii', `charset_katakana_jisx0201',
1897 `charset_jisx0208', `charset_big5_1', and `charset_big5_2'. We are
1898 sure that all these charsets are registered as official charset
1899 (i.e. do not have extended leading-codes). Characters of other
1900 charsets are produced without any encoding. If SJIS_P is 1, encode
1901 SJIS text, else encode BIG5 text. */
1902
1903int
1904encode_coding_sjis_big5 (coding, source, destination,
1905 src_bytes, dst_bytes, consumed, sjis_p)
1906 struct coding_system *coding;
1907 unsigned char *source, *destination;
1908 int src_bytes, dst_bytes;
1909 int *consumed;
1910 int sjis_p;
1911{
1912 unsigned char *src = source;
1913 unsigned char *src_end = source + src_bytes;
1914 unsigned char *dst = destination;
1915 unsigned char *dst_end = destination + dst_bytes;
1916 /* Since the maximum bytes produced by each loop is 2, we subtract 1
1917 from DST_END to assure overflow checking is necessary only at the
1918 head of loop. */
1919 unsigned char *adjusted_dst_end = dst_end - 1;
a5d301df
KH
1920 Lisp_Object unification_table
1921 = coding->character_unification_table_for_encode;
1922
1923 if (!NILP (Venable_character_unification) && NILP (unification_table))
1924 unification_table = Vstandard_character_unification_table_for_encode;
4ed46869
KH
1925
1926 while (src < src_end && dst < adjusted_dst_end)
1927 {
1928 /* SRC_BASE remembers the start position in source in each loop.
1929 The loop will be exited when there's not enough source text
1930 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1931 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
1932 before exiting. */
1933 unsigned char *src_base = src;
1934 unsigned char c1 = *src++, c2, c3, c4;
1935
1936 if (coding->composing)
1937 {
1938 if (c1 == 0xA0)
1939 {
1940 ONE_MORE_BYTE (c1);
1941 c1 &= 0x7F;
1942 }
1943 else if (c1 >= 0xA0)
1944 c1 -= 0x20;
1945 else
1946 coding->composing = 0;
1947 }
1948
1949 switch (emacs_code_class[c1])
1950 {
1951 case EMACS_ascii_code:
a5d301df
KH
1952 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1953 break;
1954
4ed46869
KH
1955 case EMACS_control_code:
1956 *dst++ = c1;
1957 break;
1958
1959 case EMACS_carriage_return_code:
1960 if (!coding->selective)
1961 {
1962 *dst++ = c1;
1963 break;
1964 }
1965 /* fall down to treat '\r' as '\n' ... */
1966
1967 case EMACS_linefeed_code:
1968 if (coding->eol_type == CODING_EOL_LF
0ef69138 1969 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
1970 *dst++ = '\n';
1971 else if (coding->eol_type == CODING_EOL_CRLF)
1972 *dst++ = '\r', *dst++ = '\n';
1973 else
1974 *dst++ = '\r';
1975 break;
1976
1977 case EMACS_leading_code_2:
1978 ONE_MORE_BYTE (c2);
a5d301df 1979 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
1980 break;
1981
1982 case EMACS_leading_code_3:
1983 TWO_MORE_BYTES (c2, c3);
a5d301df 1984 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
4ed46869
KH
1985 break;
1986
1987 case EMACS_leading_code_4:
1988 THREE_MORE_BYTES (c2, c3, c4);
a5d301df 1989 ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
4ed46869
KH
1990 break;
1991
1992 case EMACS_leading_code_composition:
1993 coding->composing = 1;
1994 break;
1995
1996 default: /* i.e. case EMACS_invalid_code: */
1997 *dst++ = c1;
1998 }
1999 continue;
2000
2001 label_end_of_loop:
76376439 2002 coding->carryover_size = src_end - src_base;
4ed46869 2003 bcopy (src_base, coding->carryover, coding->carryover_size);
76376439 2004 src = src_end;
4ed46869
KH
2005 break;
2006 }
2007
2008 *consumed = src - source;
2009 return dst - destination;
2010}
2011
2012\f
2013/*** 5. End-of-line handlers ***/
2014
2015/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2016 This function is called only when `coding->eol_type' is
2017 CODING_EOL_CRLF or CODING_EOL_CR. */
2018
2019decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2020 struct coding_system *coding;
2021 unsigned char *source, *destination;
2022 int src_bytes, dst_bytes;
2023 int *consumed;
2024{
2025 unsigned char *src = source;
2026 unsigned char *src_end = source + src_bytes;
2027 unsigned char *dst = destination;
2028 unsigned char *dst_end = destination + dst_bytes;
2029 int produced;
2030
2031 switch (coding->eol_type)
2032 {
2033 case CODING_EOL_CRLF:
2034 {
2035 /* Since the maximum bytes produced by each loop is 2, we
2036 subtract 1 from DST_END to assure overflow checking is
2037 necessary only at the head of loop. */
2038 unsigned char *adjusted_dst_end = dst_end - 1;
2039
2040 while (src < src_end && dst < adjusted_dst_end)
2041 {
2042 unsigned char *src_base = src;
2043 unsigned char c = *src++;
2044 if (c == '\r')
2045 {
2046 ONE_MORE_BYTE (c);
2047 if (c != '\n')
2048 *dst++ = '\r';
bfd99048 2049 *dst++ = c;
4ed46869
KH
2050 }
2051 else
2052 *dst++ = c;
2053 continue;
2054
2055 label_end_of_loop:
2056 coding->carryover_size = src - src_base;
2057 bcopy (src_base, coding->carryover, coding->carryover_size);
2058 src = src_base;
2059 break;
2060 }
2061 *consumed = src - source;
2062 produced = dst - destination;
2063 break;
2064 }
2065
2066 case CODING_EOL_CR:
2067 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2068 bcopy (source, destination, produced);
2069 dst_end = destination + produced;
2070 while (dst < dst_end)
2071 if (*dst++ == '\r') dst[-1] = '\n';
2072 *consumed = produced;
2073 break;
2074
2075 default: /* i.e. case: CODING_EOL_LF */
2076 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2077 bcopy (source, destination, produced);
2078 *consumed = produced;
2079 break;
2080 }
2081
2082 return produced;
2083}
2084
2085/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2086 format of end-of-line according to `coding->eol_type'. If
2087 `coding->selective' is 1, code '\r' in source text also means
2088 end-of-line. */
2089
2090encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2091 struct coding_system *coding;
2092 unsigned char *source, *destination;
2093 int src_bytes, dst_bytes;
2094 int *consumed;
2095{
2096 unsigned char *src = source;
2097 unsigned char *dst = destination;
2098 int produced;
2099
2100 if (src_bytes <= 0)
2101 return 0;
2102
2103 switch (coding->eol_type)
2104 {
2105 case CODING_EOL_LF:
0ef69138 2106 case CODING_EOL_UNDECIDED:
4ed46869
KH
2107 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2108 bcopy (source, destination, produced);
2109 if (coding->selective)
2110 {
2111 int i = produced;
2112 while (i--)
2113 if (*dst++ == '\r') dst[-1] = '\n';
2114 }
2115 *consumed = produced;
2116
2117 case CODING_EOL_CRLF:
2118 {
2119 unsigned char c;
2120 unsigned char *src_end = source + src_bytes;
2121 unsigned char *dst_end = destination + dst_bytes;
2122 /* Since the maximum bytes produced by each loop is 2, we
2123 subtract 1 from DST_END to assure overflow checking is
2124 necessary only at the head of loop. */
2125 unsigned char *adjusted_dst_end = dst_end - 1;
2126
2127 while (src < src_end && dst < adjusted_dst_end)
2128 {
2129 c = *src++;
2130 if (c == '\n' || (c == '\r' && coding->selective))
2131 *dst++ = '\r', *dst++ = '\n';
2132 else
2133 *dst++ = c;
2134 }
2135 produced = dst - destination;
2136 *consumed = src - source;
2137 break;
2138 }
2139
2140 default: /* i.e. case CODING_EOL_CR: */
2141 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2142 bcopy (source, destination, produced);
2143 {
2144 int i = produced;
2145 while (i--)
2146 if (*dst++ == '\n') dst[-1] = '\r';
2147 }
2148 *consumed = produced;
2149 }
2150
2151 return produced;
2152}
2153
2154\f
2155/*** 6. C library functions ***/
2156
2157/* In Emacs Lisp, coding system is represented by a Lisp symbol which
2158 has a property `coding-system'. The value of this property is a
2159 vector of length 5 (called as coding-vector). Among elements of
2160 this vector, the first (element[0]) and the fifth (element[4])
2161 carry important information for decoding/encoding. Before
2162 decoding/encoding, this information should be set in fields of a
2163 structure of type `coding_system'.
2164
2165 A value of property `coding-system' can be a symbol of another
2166 subsidiary coding-system. In that case, Emacs gets coding-vector
2167 from that symbol.
2168
2169 `element[0]' contains information to be set in `coding->type'. The
2170 value and its meaning is as follows:
2171
0ef69138
KH
2172 0 -- coding_type_emacs_mule
2173 1 -- coding_type_sjis
2174 2 -- coding_type_iso2022
2175 3 -- coding_type_big5
2176 4 -- coding_type_ccl encoder/decoder written in CCL
2177 nil -- coding_type_no_conversion
2178 t -- coding_type_undecided (automatic conversion on decoding,
2179 no-conversion on encoding)
4ed46869
KH
2180
2181 `element[4]' contains information to be set in `coding->flags' and
2182 `coding->spec'. The meaning varies by `coding->type'.
2183
2184 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2185 of length 32 (of which the first 15 sub-elements are used now).
2186 Meanings of these sub-elements are:
2187
2188 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2189 If the value is an integer of valid charset, the charset is
2190 assumed to be designated to graphic register N initially.
2191
2192 If the value is minus, it is a minus value of charset which
2193 reserves graphic register N, which means that the charset is
2194 not designated initially but should be designated to graphic
2195 register N just before encoding a character in that charset.
2196
2197 If the value is nil, graphic register N is never used on
2198 encoding.
2199
2200 sub-element[N] where N is 4 through 14: to be set in `coding->flags'
2201 Each value takes t or nil. See the section ISO2022 of
2202 `coding.h' for more information.
2203
2204 If `coding->type' is `coding_type_big5', element[4] is t to denote
2205 BIG5-ETen or nil to denote BIG5-HKU.
2206
2207 If `coding->type' takes the other value, element[4] is ignored.
2208
2209 Emacs Lisp's coding system also carries information about format of
2210 end-of-line in a value of property `eol-type'. If the value is
2211 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2212 means CODING_EOL_CR. If it is not integer, it should be a vector
2213 of subsidiary coding systems of which property `eol-type' has one
2214 of above values.
2215
2216*/
2217
2218/* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2219 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2220 is setup so that no conversion is necessary and return -1, else
2221 return 0. */
2222
2223int
e0e989f6
KH
2224setup_coding_system (coding_system, coding)
2225 Lisp_Object coding_system;
4ed46869
KH
2226 struct coding_system *coding;
2227{
4ed46869
KH
2228 Lisp_Object type, eol_type;
2229
f4dee582 2230 /* At first, set several fields to default values. */
4ed46869
KH
2231 coding->require_flushing = 0;
2232 coding->last_block = 0;
2233 coding->selective = 0;
2234 coding->composing = 0;
2235 coding->direction = 0;
2236 coding->carryover_size = 0;
4ed46869 2237 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
a5d301df
KH
2238 coding->character_unification_table_for_decode = Qnil;
2239 coding->character_unification_table_for_encode = Qnil;
4ed46869 2240
e0e989f6
KH
2241 Vlast_coding_system_used = coding->symbol = coding_system;
2242 eol_type = Qnil;
2243 /* Get value of property `coding-system' until we get a vector.
2244 While doing that, also get values of properties
a5d301df
KH
2245 `post-read-conversion', `pre-write-conversion',
2246 `character-unification-table-for-decode',
2247 `character-unification-table-for-encode' and `eol-type'. */
e0e989f6 2248 while (!NILP (coding_system) && SYMBOLP (coding_system))
4ed46869 2249 {
4ed46869 2250 if (NILP (coding->post_read_conversion))
e0e989f6 2251 coding->post_read_conversion = Fget (coding_system,
4ed46869 2252 Qpost_read_conversion);
e0e989f6
KH
2253 if (NILP (coding->pre_write_conversion))
2254 coding->pre_write_conversion = Fget (coding_system,
4ed46869 2255 Qpre_write_conversion);
9ce27fde 2256 if (!inhibit_eol_conversion && NILP (eol_type))
e0e989f6 2257 eol_type = Fget (coding_system, Qeol_type);
a5d301df
KH
2258
2259 if (NILP (coding->character_unification_table_for_decode))
2260 coding->character_unification_table_for_decode
2261 = Fget (coding_system, Qcharacter_unification_table_for_decode);
2262
2263 if (NILP (coding->character_unification_table_for_encode))
2264 coding->character_unification_table_for_encode
2265 = Fget (coding_system, Qcharacter_unification_table_for_encode);
2266
e0e989f6 2267 coding_system = Fget (coding_system, Qcoding_system);
4ed46869 2268 }
a5d301df
KH
2269
2270 while (!NILP (coding->character_unification_table_for_decode)
2271 && SYMBOLP (coding->character_unification_table_for_decode))
2272 coding->character_unification_table_for_decode
2273 = Fget (coding->character_unification_table_for_decode,
2274 Qcharacter_unification_table_for_decode);
2275 if (!NILP (coding->character_unification_table_for_decode)
2276 && !CHAR_TABLE_P (coding->character_unification_table_for_decode))
2277 coding->character_unification_table_for_decode = Qnil;
2278
2279 while (!NILP (coding->character_unification_table_for_encode)
2280 && SYMBOLP (coding->character_unification_table_for_encode))
2281 coding->character_unification_table_for_encode
2282 = Fget (coding->character_unification_table_for_encode,
2283 Qcharacter_unification_table_for_encode);
2284 if (!NILP (coding->character_unification_table_for_encode)
2285 && !CHAR_TABLE_P (coding->character_unification_table_for_encode))
2286 coding->character_unification_table_for_encode = Qnil;
2287
e0e989f6
KH
2288 if (!VECTORP (coding_system)
2289 || XVECTOR (coding_system)->size != 5)
4ed46869
KH
2290 goto label_invalid_coding_system;
2291
4ed46869 2292 if (VECTORP (eol_type))
0ef69138 2293 coding->eol_type = CODING_EOL_UNDECIDED;
4ed46869
KH
2294 else if (XFASTINT (eol_type) == 1)
2295 coding->eol_type = CODING_EOL_CRLF;
2296 else if (XFASTINT (eol_type) == 2)
2297 coding->eol_type = CODING_EOL_CR;
2298 else
2299 coding->eol_type = CODING_EOL_LF;
2300
e0e989f6 2301 type = XVECTOR (coding_system)->contents[0];
4ed46869
KH
2302 switch (XFASTINT (type))
2303 {
2304 case 0:
0ef69138 2305 coding->type = coding_type_emacs_mule;
4ed46869
KH
2306 break;
2307
2308 case 1:
2309 coding->type = coding_type_sjis;
2310 break;
2311
2312 case 2:
2313 coding->type = coding_type_iso2022;
2314 {
e0e989f6 2315 Lisp_Object val = XVECTOR (coding_system)->contents[4];
4ed46869
KH
2316 Lisp_Object *flags;
2317 int i, charset, default_reg_bits = 0;
2318
2319 if (!VECTORP (val) || XVECTOR (val)->size != 32)
2320 goto label_invalid_coding_system;
2321
2322 flags = XVECTOR (val)->contents;
2323 coding->flags
2324 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2325 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2326 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2327 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2328 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2329 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2330 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2331 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
e0e989f6
KH
2332 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2333 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2334 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL));
4ed46869
KH
2335
2336 /* Invoke graphic register 0 to plane 0. */
2337 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2338 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2339 CODING_SPEC_ISO_INVOCATION (coding, 1)
2340 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2341 /* Not single shifting at first. */
2342 CODING_SPEC_ISO_SINGLE_SHIFTING(coding) = 0;
e0e989f6
KH
2343 /* Beginning of buffer should also be regarded as bol. */
2344 CODING_SPEC_ISO_BOL(coding) = 1;
4ed46869
KH
2345
2346 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2347 FLAGS[REG] can be one of below:
2348 integer CHARSET: CHARSET occupies register I,
2349 t: designate nothing to REG initially, but can be used
2350 by any charsets,
2351 list of integer, nil, or t: designate the first
2352 element (if integer) to REG initially, the remaining
2353 elements (if integer) is designated to REG on request,
2354 if an element is t, REG can be used by any charset,
2355 nil: REG is never used. */
467e7675 2356 for (charset = 0; charset <= MAX_CHARSET; charset++)
1ba9e4ab
KH
2357 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2358 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
4ed46869
KH
2359 for (i = 0; i < 4; i++)
2360 {
2361 if (INTEGERP (flags[i])
e0e989f6
KH
2362 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2363 || (charset = get_charset_id (flags[i])) >= 0)
4ed46869
KH
2364 {
2365 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2366 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2367 }
2368 else if (EQ (flags[i], Qt))
2369 {
2370 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2371 default_reg_bits |= 1 << i;
2372 }
2373 else if (CONSP (flags[i]))
2374 {
2375 Lisp_Object tail = flags[i];
2376
2377 if (INTEGERP (XCONS (tail)->car)
2378 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
2379 CHARSET_VALID_P (charset))
2380 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
4ed46869
KH
2381 {
2382 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2383 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2384 }
2385 else
2386 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2387 tail = XCONS (tail)->cdr;
2388 while (CONSP (tail))
2389 {
2390 if (INTEGERP (XCONS (tail)->car)
2391 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
2392 CHARSET_VALID_P (charset))
2393 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
4ed46869
KH
2394 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2395 = i;
2396 else if (EQ (XCONS (tail)->car, Qt))
2397 default_reg_bits |= 1 << i;
2398 tail = XCONS (tail)->cdr;
2399 }
2400 }
2401 else
2402 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2403
2404 CODING_SPEC_ISO_DESIGNATION (coding, i)
2405 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2406 }
2407
2408 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2409 {
2410 /* REG 1 can be used only by locking shift in 7-bit env. */
2411 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2412 default_reg_bits &= ~2;
2413 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2414 /* Without any shifting, only REG 0 and 1 can be used. */
2415 default_reg_bits &= 3;
2416 }
2417
467e7675 2418 for (charset = 0; charset <= MAX_CHARSET; charset++)
4ed46869 2419 if (CHARSET_VALID_P (charset)
1ba9e4ab
KH
2420 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2421 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
4ed46869
KH
2422 {
2423 /* We have not yet decided where to designate CHARSET. */
2424 int reg_bits = default_reg_bits;
2425
2426 if (CHARSET_CHARS (charset) == 96)
2427 /* A charset of CHARS96 can't be designated to REG 0. */
2428 reg_bits &= ~1;
2429
2430 if (reg_bits)
2431 /* There exist some default graphic register. */
2432 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2433 = (reg_bits & 1
2434 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2435 else
2436 /* We anyway have to designate CHARSET to somewhere. */
2437 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2438 = (CHARSET_CHARS (charset) == 94
2439 ? 0
2440 : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2441 || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2442 ? 1
2443 : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2444 ? 2 : 0)));
2445 }
2446 }
2447 coding->require_flushing = 1;
2448 break;
2449
2450 case 3:
2451 coding->type = coding_type_big5;
2452 coding->flags
e0e989f6 2453 = (NILP (XVECTOR (coding_system)->contents[4])
4ed46869
KH
2454 ? CODING_FLAG_BIG5_HKU
2455 : CODING_FLAG_BIG5_ETEN);
2456 break;
2457
2458 case 4:
2459 coding->type = coding_type_ccl;
2460 {
e0e989f6 2461 Lisp_Object val = XVECTOR (coding_system)->contents[4];
4ed46869
KH
2462 if (CONSP (val)
2463 && VECTORP (XCONS (val)->car)
2464 && VECTORP (XCONS (val)->cdr))
2465 {
2466 setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2467 setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2468 }
2469 else
2470 goto label_invalid_coding_system;
2471 }
2472 coding->require_flushing = 1;
2473 break;
2474
2475 default:
2476 if (EQ (type, Qt))
0ef69138 2477 coding->type = coding_type_undecided;
4ed46869
KH
2478 else
2479 coding->type = coding_type_no_conversion;
2480 break;
2481 }
2482 return 0;
2483
2484 label_invalid_coding_system:
2485 coding->type = coding_type_no_conversion;
dec137e5 2486 coding->eol_type = CODING_EOL_LF;
e0e989f6
KH
2487 coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2488 = Qnil;
4ed46869
KH
2489 return -1;
2490}
2491
2492/* Emacs has a mechanism to automatically detect a coding system if it
2493 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
2494 it's impossible to distinguish some coding systems accurately
2495 because they use the same range of codes. So, at first, coding
2496 systems are categorized into 9, those are:
2497
0ef69138 2498 o coding-category-emacs-mule
4ed46869
KH
2499
2500 The category for a coding system which has the same code range
2501 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 2502 symbol) `emacs-mule' by default.
4ed46869
KH
2503
2504 o coding-category-sjis
2505
2506 The category for a coding system which has the same code range
2507 as SJIS. Assigned the coding-system (Lisp
7717c392 2508 symbol) `japanese-shift-jis' by default.
4ed46869
KH
2509
2510 o coding-category-iso-7
2511
2512 The category for a coding system which has the same code range
7717c392
KH
2513 as ISO2022 of 7-bit environment. This doesn't use any locking
2514 shift and single shift functions. Assigned the coding-system
2515 (Lisp symbol) `iso-2022-7bit' by default.
4ed46869
KH
2516
2517 o coding-category-iso-8-1
2518
2519 The category for a coding system which has the same code range
2520 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
2521 for DIMENSION1 charset. This doesn't use any locking shift
2522 and single shift functions. Assigned the coding-system (Lisp
2523 symbol) `iso-latin-1' by default.
4ed46869
KH
2524
2525 o coding-category-iso-8-2
2526
2527 The category for a coding system which has the same code range
2528 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
2529 for DIMENSION2 charset. This doesn't use any locking shift
2530 and single shift functions. Assigned the coding-system (Lisp
2531 symbol) `japanese-iso-8bit' by default.
4ed46869 2532
7717c392 2533 o coding-category-iso-7-else
4ed46869
KH
2534
2535 The category for a coding system which has the same code range
7717c392
KH
2536 as ISO2022 of 7-bit environment but uses locking shift or
2537 single shift functions. Assigned the coding-system (Lisp
2538 symbol) `iso-2022-7bit-lock' by default.
2539
2540 o coding-category-iso-8-else
2541
2542 The category for a coding system which has the same code range
2543 as ISO2022 of 8-bit environment but uses locking shift or
2544 single shift functions. Assigned the coding-system (Lisp
2545 symbol) `iso-2022-8bit-ss2' by default.
4ed46869
KH
2546
2547 o coding-category-big5
2548
2549 The category for a coding system which has the same code range
2550 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 2551 `cn-big5' by default.
4ed46869
KH
2552
2553 o coding-category-binary
2554
2555 The category for a coding system not categorized in any of the
2556 above. Assigned the coding-system (Lisp symbol)
e0e989f6 2557 `no-conversion' by default.
4ed46869
KH
2558
2559 Each of them is a Lisp symbol and the value is an actual
2560 `coding-system's (this is also a Lisp symbol) assigned by a user.
2561 What Emacs does actually is to detect a category of coding system.
2562 Then, it uses a `coding-system' assigned to it. If Emacs can't
2563 decide only one possible category, it selects a category of the
2564 highest priority. Priorities of categories are also specified by a
2565 user in a Lisp variable `coding-category-list'.
2566
2567*/
2568
2569/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2570 If it detects possible coding systems, return an integer in which
2571 appropriate flag bits are set. Flag bits are defined by macros
2572 CODING_CATEGORY_MASK_XXX in `coding.h'. */
2573
2574int
2575detect_coding_mask (src, src_bytes)
2576 unsigned char *src;
2577 int src_bytes;
2578{
2579 register unsigned char c;
2580 unsigned char *src_end = src + src_bytes;
2581 int mask;
2582
2583 /* At first, skip all ASCII characters and control characters except
2584 for three ISO2022 specific control characters. */
bcf26d6a 2585 label_loop_detect_coding:
4ed46869
KH
2586 while (src < src_end)
2587 {
2588 c = *src;
2589 if (c >= 0x80
2590 || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2591 break;
2592 src++;
2593 }
2594
2595 if (src >= src_end)
2596 /* We found nothing other than ASCII. There's nothing to do. */
2597 return CODING_CATEGORY_MASK_ANY;
2598
2599 /* The text seems to be encoded in some multilingual coding system.
2600 Now, try to find in which coding system the text is encoded. */
2601 if (c < 0x80)
bcf26d6a
KH
2602 {
2603 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2604 /* C is an ISO2022 specific control code of C0. */
2605 mask = detect_coding_iso2022 (src, src_end);
2606 src++;
2607 if (mask == CODING_CATEGORY_MASK_ANY)
2608 /* No valid ISO2022 code follows C. Try again. */
2609 goto label_loop_detect_coding;
2610 }
19a8d9e0 2611 else if (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
4ed46869
KH
2612 /* C is an ISO2022 specific control code of C1,
2613 or the first byte of SJIS's 2-byte character code,
2614 or a leading code of Emacs. */
2615 mask = (detect_coding_iso2022 (src, src_end)
2616 | detect_coding_sjis (src, src_end)
10bff6f1
RS
2617 | detect_coding_emacs_mule (src, src_end)
2618 | CODING_CATEGORY_MASK_BINARY);
4ed46869 2619
19a8d9e0
KH
2620 else if (c == ISO_CODE_CSI
2621 && (src < src_end
2622 && (*src == ']'
2623 || (src + 1 < src_end
2624 && src[1] == ']'
2625 && (*src == '0' || *src == '1' || *src == '2')))))
2626 /* C is an ISO2022's control-sequence-introducer. */
2627 mask = (detect_coding_iso2022 (src, src_end)
2628 | detect_coding_sjis (src, src_end)
10bff6f1
RS
2629 | detect_coding_emacs_mule (src, src_end)
2630 | CODING_CATEGORY_MASK_BINARY);
19a8d9e0 2631
4ed46869
KH
2632 else if (c < 0xA0)
2633 /* C is the first byte of SJIS character code,
2634 or a leading-code of Emacs. */
2635 mask = (detect_coding_sjis (src, src_end)
10bff6f1
RS
2636 | detect_coding_emacs_mule (src, src_end)
2637 | CODING_CATEGORY_MASK_BINARY);
4ed46869
KH
2638
2639 else
2640 /* C is a character of ISO2022 in graphic plane right,
2641 or a SJIS's 1-byte character code (i.e. JISX0201),
2642 or the first byte of BIG5's 2-byte code. */
2643 mask = (detect_coding_iso2022 (src, src_end)
2644 | detect_coding_sjis (src, src_end)
10bff6f1
RS
2645 | detect_coding_big5 (src, src_end)
2646 | CODING_CATEGORY_MASK_BINARY);
4ed46869
KH
2647
2648 return mask;
2649}
2650
2651/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2652 The information of the detected coding system is set in CODING. */
2653
2654void
2655detect_coding (coding, src, src_bytes)
2656 struct coding_system *coding;
2657 unsigned char *src;
2658 int src_bytes;
2659{
2660 int mask = detect_coding_mask (src, src_bytes);
2661 int idx;
2662
2663 if (mask == CODING_CATEGORY_MASK_ANY)
2664 /* We found nothing other than ASCII. There's nothing to do. */
2665 return;
2666
2667 if (!mask)
2668 /* The source text seems to be encoded in unknown coding system.
2669 Emacs regards the category of such a kind of coding system as
2670 `coding-category-binary'. We assume that a user has assigned
2671 an appropriate coding system for a `coding-category-binary'. */
2672 idx = CODING_CATEGORY_IDX_BINARY;
2673 else
2674 {
2675 /* We found some plausible coding systems. Let's use a coding
2676 system of the highest priority. */
2677 Lisp_Object val = Vcoding_category_list;
2678
2679 if (CONSP (val))
2680 while (!NILP (val))
2681 {
2682 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2683 if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2684 break;
2685 val = XCONS (val)->cdr;
2686 }
2687 else
2688 val = Qnil;
2689
2690 if (NILP (val))
2691 {
2692 /* For unknown reason, `Vcoding_category_list' contains none
2693 of found categories. Let's use any of them. */
2694 for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2695 if (mask & (1 << idx))
2696 break;
2697 }
2698 }
2699 setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2700}
2701
2702/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2703 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
0ef69138 2704 CODING_EOL_CR, and CODING_EOL_UNDECIDED. */
4ed46869 2705
bc4bc72a
RS
2706#define MAX_EOL_CHECK_COUNT 3
2707
4ed46869
KH
2708int
2709detect_eol_type (src, src_bytes)
2710 unsigned char *src;
2711 int src_bytes;
2712{
2713 unsigned char *src_end = src + src_bytes;
2714 unsigned char c;
bc4bc72a
RS
2715 int total = 0; /* How many end-of-lines are found so far. */
2716 int eol_type = CODING_EOL_UNDECIDED;
2717 int this_eol_type;
4ed46869 2718
bc4bc72a 2719 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4ed46869
KH
2720 {
2721 c = *src++;
bc4bc72a 2722 if (c == '\n' || c == '\r')
4ed46869 2723 {
bc4bc72a
RS
2724 total++;
2725 if (c == '\n')
2726 this_eol_type = CODING_EOL_LF;
2727 else if (src >= src_end || *src != '\n')
2728 this_eol_type = CODING_EOL_CR;
4ed46869 2729 else
bc4bc72a
RS
2730 this_eol_type = CODING_EOL_CRLF, src++;
2731
2732 if (eol_type == CODING_EOL_UNDECIDED)
2733 /* This is the first end-of-line. */
2734 eol_type = this_eol_type;
2735 else if (eol_type != this_eol_type)
2736 /* The found type is different from what found before.
2737 We had better not decode end-of-line. */
2738 return CODING_EOL_LF;
4ed46869
KH
2739 }
2740 }
bc4bc72a 2741
85a02ca4 2742 return eol_type;
4ed46869
KH
2743}
2744
2745/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2746 is encoded. If it detects an appropriate format of end-of-line, it
2747 sets the information in *CODING. */
2748
2749void
2750detect_eol (coding, src, src_bytes)
2751 struct coding_system *coding;
2752 unsigned char *src;
2753 int src_bytes;
2754{
2755 Lisp_Object val;
2756 int eol_type = detect_eol_type (src, src_bytes);
2757
0ef69138 2758 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2759 /* We found no end-of-line in the source text. */
2760 return;
2761
2762 val = Fget (coding->symbol, Qeol_type);
2763 if (VECTORP (val) && XVECTOR (val)->size == 3)
2764 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2765}
2766
2767/* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
2768 decoding, it may detect coding system and format of end-of-line if
2769 those are not yet decided. */
2770
2771int
2772decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2773 struct coding_system *coding;
2774 unsigned char *source, *destination;
2775 int src_bytes, dst_bytes;
2776 int *consumed;
2777{
2778 int produced;
2779
2780 if (src_bytes <= 0)
2781 {
2782 *consumed = 0;
2783 return 0;
2784 }
2785
0ef69138 2786 if (coding->type == coding_type_undecided)
4ed46869
KH
2787 detect_coding (coding, source, src_bytes);
2788
0ef69138 2789 if (coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2790 detect_eol (coding, source, src_bytes);
2791
2792 coding->carryover_size = 0;
2793 switch (coding->type)
2794 {
2795 case coding_type_no_conversion:
2796 label_no_conversion:
2797 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2798 bcopy (source, destination, produced);
2799 *consumed = produced;
2800 break;
2801
0ef69138
KH
2802 case coding_type_emacs_mule:
2803 case coding_type_undecided:
4ed46869 2804 if (coding->eol_type == CODING_EOL_LF
0ef69138 2805 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2806 goto label_no_conversion;
2807 produced = decode_eol (coding, source, destination,
2808 src_bytes, dst_bytes, consumed);
2809 break;
2810
2811 case coding_type_sjis:
2812 produced = decode_coding_sjis_big5 (coding, source, destination,
2813 src_bytes, dst_bytes, consumed,
2814 1);
2815 break;
2816
2817 case coding_type_iso2022:
2818 produced = decode_coding_iso2022 (coding, source, destination,
2819 src_bytes, dst_bytes, consumed);
2820 break;
2821
2822 case coding_type_big5:
2823 produced = decode_coding_sjis_big5 (coding, source, destination,
2824 src_bytes, dst_bytes, consumed,
2825 0);
2826 break;
2827
2828 case coding_type_ccl:
2829 produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2830 src_bytes, dst_bytes, consumed);
2831 break;
2832 }
2833
2834 return produced;
2835}
2836
2837/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
2838
2839int
2840encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2841 struct coding_system *coding;
2842 unsigned char *source, *destination;
2843 int src_bytes, dst_bytes;
2844 int *consumed;
2845{
2846 int produced;
2847
4ed46869
KH
2848 switch (coding->type)
2849 {
2850 case coding_type_no_conversion:
2851 label_no_conversion:
2852 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2853 if (produced > 0)
2854 {
2855 bcopy (source, destination, produced);
2856 if (coding->selective)
2857 {
2858 unsigned char *p = destination, *pend = destination + produced;
2859 while (p < pend)
e0e989f6 2860 if (*p++ == '\015') p[-1] = '\n';
4ed46869
KH
2861 }
2862 }
2863 *consumed = produced;
2864 break;
2865
0ef69138
KH
2866 case coding_type_emacs_mule:
2867 case coding_type_undecided:
4ed46869 2868 if (coding->eol_type == CODING_EOL_LF
0ef69138 2869 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2870 goto label_no_conversion;
2871 produced = encode_eol (coding, source, destination,
2872 src_bytes, dst_bytes, consumed);
2873 break;
2874
2875 case coding_type_sjis:
2876 produced = encode_coding_sjis_big5 (coding, source, destination,
2877 src_bytes, dst_bytes, consumed,
2878 1);
2879 break;
2880
2881 case coding_type_iso2022:
2882 produced = encode_coding_iso2022 (coding, source, destination,
2883 src_bytes, dst_bytes, consumed);
2884 break;
2885
2886 case coding_type_big5:
2887 produced = encode_coding_sjis_big5 (coding, source, destination,
2888 src_bytes, dst_bytes, consumed,
2889 0);
2890 break;
2891
2892 case coding_type_ccl:
2893 produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2894 src_bytes, dst_bytes, consumed);
2895 break;
2896 }
2897
2898 return produced;
2899}
2900
2901#define CONVERSION_BUFFER_EXTRA_ROOM 256
2902
2903/* Return maximum size (bytes) of a buffer enough for decoding
2904 SRC_BYTES of text encoded in CODING. */
2905
2906int
2907decoding_buffer_size (coding, src_bytes)
2908 struct coding_system *coding;
2909 int src_bytes;
2910{
2911 int magnification;
2912
2913 if (coding->type == coding_type_iso2022)
2914 magnification = 3;
2915 else if (coding->type == coding_type_ccl)
2916 magnification = coding->spec.ccl.decoder.buf_magnification;
2917 else
2918 magnification = 2;
2919
2920 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2921}
2922
2923/* Return maximum size (bytes) of a buffer enough for encoding
2924 SRC_BYTES of text to CODING. */
2925
2926int
2927encoding_buffer_size (coding, src_bytes)
2928 struct coding_system *coding;
2929 int src_bytes;
2930{
2931 int magnification;
2932
2933 if (coding->type == coding_type_ccl)
2934 magnification = coding->spec.ccl.encoder.buf_magnification;
2935 else
2936 magnification = 3;
2937
2938 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2939}
2940
2941#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
2942#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
2943#endif
2944
2945char *conversion_buffer;
2946int conversion_buffer_size;
2947
2948/* Return a pointer to a SIZE bytes of buffer to be used for encoding
2949 or decoding. Sufficient memory is allocated automatically. If we
2950 run out of memory, return NULL. */
2951
2952char *
2953get_conversion_buffer (size)
2954 int size;
2955{
2956 if (size > conversion_buffer_size)
2957 {
2958 char *buf;
2959 int real_size = conversion_buffer_size * 2;
2960
2961 while (real_size < size) real_size *= 2;
2962 buf = (char *) xmalloc (real_size);
2963 xfree (conversion_buffer);
2964 conversion_buffer = buf;
2965 conversion_buffer_size = real_size;
2966 }
2967 return conversion_buffer;
2968}
2969
2970\f
2971#ifdef emacs
2972/*** 7. Emacs Lisp library functions ***/
2973
02ba4723 2974DEFUN ("coding-system-spec", Fcoding_system_spec, Scoding_system_spec,
4ed46869 2975 1, 1, 0,
02ba4723 2976 "Return coding-spec of CODING-SYSTEM.\n\
4ed46869
KH
2977If CODING-SYSTEM is not a valid coding-system, return nil.")
2978 (obj)
2979 Lisp_Object obj;
2980{
2981 while (SYMBOLP (obj) && !NILP (obj))
2982 obj = Fget (obj, Qcoding_system);
2983 return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
2984 ? Qnil : obj);
2985}
2986
2987DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
2988 "Return t if OBJECT is nil or a coding-system.\n\
2989See document of make-coding-system for coding-system object.")
2990 (obj)
2991 Lisp_Object obj;
2992{
02ba4723 2993 return ((NILP (obj) || !NILP (Fcoding_system_spec (obj))) ? Qt : Qnil);
4ed46869
KH
2994}
2995
9d991de8
RS
2996DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
2997 Sread_non_nil_coding_system, 1, 1, 0,
e0e989f6 2998 "Read a coding system from the minibuffer, prompting with string PROMPT.")
4ed46869
KH
2999 (prompt)
3000 Lisp_Object prompt;
3001{
e0e989f6 3002 Lisp_Object val;
9d991de8
RS
3003 do
3004 {
02ba4723 3005 val = Fcompleting_read (prompt, Vobarray, Qcoding_system_spec,
9d991de8
RS
3006 Qt, Qnil, Qnil, Qnil);
3007 }
3008 while (XSTRING (val)->size == 0);
e0e989f6 3009 return (Fintern (val, Qnil));
4ed46869
KH
3010}
3011
3012DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
e0e989f6 3013 "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
4ed46869
KH
3014 (prompt)
3015 Lisp_Object prompt;
3016{
e0e989f6 3017 Lisp_Object val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
9d991de8 3018 Qt, Qnil, Qnil, Qnil);
e0e989f6 3019 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
3020}
3021
3022DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
3023 1, 1, 0,
3024 "Check validity of CODING-SYSTEM.\n\
3025If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
3026CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
3027The value of property should be a vector of length 5.")
3028 (coding_system)
3029 Lisp_Object coding_system;
3030{
3031 CHECK_SYMBOL (coding_system, 0);
3032 if (!NILP (Fcoding_system_p (coding_system)))
3033 return coding_system;
3034 while (1)
02ba4723 3035 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4ed46869
KH
3036}
3037
3038DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
3039 2, 2, 0,
bf9cdd4e
KH
3040 "Detect coding system of the text in the region between START and END.\n\
3041Return a list of possible coding systems ordered by priority.\n\
0ef69138 3042If only ASCII characters are found, it returns `undecided'\n\
bf9cdd4e 3043 or its subsidiary coding system according to a detected end-of-line format.")
4ed46869
KH
3044 (b, e)
3045 Lisp_Object b, e;
3046{
3047 int coding_mask, eol_type;
3048 Lisp_Object val;
3049 int beg, end;
3050
3051 validate_region (&b, &e);
3052 beg = XINT (b), end = XINT (e);
3053 if (beg < GPT && end >= GPT) move_gap (end);
3054
3055 coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
3056 eol_type = detect_eol_type (POS_ADDR (beg), end - beg);
3057
3058 if (coding_mask == CODING_CATEGORY_MASK_ANY)
3059 {
0ef69138
KH
3060 val = intern ("undecided");
3061 if (eol_type != CODING_EOL_UNDECIDED)
4ed46869
KH
3062 {
3063 Lisp_Object val2 = Fget (val, Qeol_type);
3064 if (VECTORP (val2))
3065 val = XVECTOR (val2)->contents[eol_type];
3066 }
3067 }
3068 else
3069 {
3070 Lisp_Object val2;
3071
3072 /* At first, gather possible coding-systems in VAL in a reverse
3073 order. */
3074 val = Qnil;
3075 for (val2 = Vcoding_category_list;
3076 !NILP (val2);
3077 val2 = XCONS (val2)->cdr)
3078 {
3079 int idx
3080 = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
3081 if (coding_mask & (1 << idx))
3082 val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
3083 }
3084
3085 /* Then, change the order of the list, while getting subsidiary
3086 coding-systems. */
3087 val2 = val;
3088 val = Qnil;
3089 for (; !NILP (val2); val2 = XCONS (val2)->cdr)
3090 {
0ef69138 3091 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
3092 val = Fcons (XCONS (val2)->car, val);
3093 else
3094 {
3095 Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
3096 if (VECTORP (val3))
3097 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
3098 else
3099 val = Fcons (XCONS (val2)->car, val);
3100 }
3101 }
3102 }
3103
3104 return val;
3105}
3106
3107/* Scan text in the region between *BEGP and *ENDP, skip characters
3108 which we never have to encode to (iff ENCODEP is 1) or decode from
3109 coding system CODING at the head and tail, then set BEGP and ENDP
3110 to the addresses of start and end of the text we actually convert. */
3111
3112void
3113shrink_conversion_area (begp, endp, coding, encodep)
3114 unsigned char **begp, **endp;
3115 struct coding_system *coding;
3116 int encodep;
3117{
3118 register unsigned char *beg_addr = *begp, *end_addr = *endp;
3119
3120 if (coding->eol_type != CODING_EOL_LF
0ef69138 3121 && coding->eol_type != CODING_EOL_UNDECIDED)
4ed46869
KH
3122 /* Since we anyway have to convert end-of-line format, it is not
3123 worth skipping at most 100 bytes or so. */
3124 return;
3125
3126 if (encodep) /* for encoding */
3127 {
3128 switch (coding->type)
3129 {
3130 case coding_type_no_conversion:
0ef69138
KH
3131 case coding_type_emacs_mule:
3132 case coding_type_undecided:
4ed46869
KH
3133 /* We need no conversion. */
3134 *begp = *endp;
3135 return;
3136 case coding_type_ccl:
3137 /* We can't skip any data. */
3138 return;
e0e989f6
KH
3139 case coding_type_iso2022:
3140 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3141 {
3142 unsigned char *bol = beg_addr;
3143 while (beg_addr < end_addr && *beg_addr < 0x80)
3144 {
3145 beg_addr++;
3146 if (*(beg_addr - 1) == '\n')
3147 bol = beg_addr;
3148 }
3149 beg_addr = bol;
3150 goto label_skip_tail;
3151 }
3152 /* fall down ... */
4ed46869
KH
3153 default:
3154 /* We can skip all ASCII characters at the head and tail. */
3155 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
e0e989f6 3156 label_skip_tail:
4ed46869
KH
3157 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3158 break;
3159 }
3160 }
3161 else /* for decoding */
3162 {
3163 switch (coding->type)
3164 {
3165 case coding_type_no_conversion:
3166 /* We need no conversion. */
3167 *begp = *endp;
3168 return;
0ef69138 3169 case coding_type_emacs_mule:
4ed46869
KH
3170 if (coding->eol_type == CODING_EOL_LF)
3171 {
3172 /* We need no conversion. */
3173 *begp = *endp;
3174 return;
3175 }
3176 /* We can skip all but carriage-return. */
3177 while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
3178 while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
3179 break;
3180 case coding_type_sjis:
3181 case coding_type_big5:
3182 /* We can skip all ASCII characters at the head. */
3183 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3184 /* We can skip all ASCII characters at the tail except for
3185 the second byte of SJIS or BIG5 code. */
3186 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3187 if (end_addr != *endp)
3188 end_addr++;
3189 break;
3190 case coding_type_ccl:
3191 /* We can't skip any data. */
3192 return;
3193 default: /* i.e. case coding_type_iso2022: */
3194 {
3195 unsigned char c;
3196
3197 /* We can skip all ASCII characters except for a few
3198 control codes at the head. */
3199 while (beg_addr < end_addr && (c = *beg_addr) < 0x80
3200 && c != ISO_CODE_CR && c != ISO_CODE_SO
3201 && c != ISO_CODE_SI && c != ISO_CODE_ESC)
3202 beg_addr++;
3203 }
3204 break;
3205 }
3206 }
3207 *begp = beg_addr;
3208 *endp = end_addr;
3209 return;
3210}
3211
3212/* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3213 text between B and E. B and E are buffer position. */
3214
3215Lisp_Object
3216code_convert_region (b, e, coding, encodep)
3217 Lisp_Object b, e;
3218 struct coding_system *coding;
3219 int encodep;
3220{
3221 int beg, end, len, consumed, produced;
3222 char *buf;
3223 unsigned char *begp, *endp;
3224 int pos = PT;
3225
3226 validate_region (&b, &e);
3227 beg = XINT (b), end = XINT (e);
3228 if (beg < GPT && end >= GPT)
3229 move_gap (end);
3230
3231 if (encodep && !NILP (coding->pre_write_conversion))
3232 {
3233 /* We must call a pre-conversion function which may put a new
3234 text to be converted in a new buffer. */
3235 struct buffer *old = current_buffer, *new;
3236
3237 TEMP_SET_PT (beg);
3238 call2 (coding->pre_write_conversion, b, e);
3239 if (old != current_buffer)
3240 {
3241 /* Replace the original text by the text just generated. */
3242 len = ZV - BEGV;
3243 new = current_buffer;
3244 set_buffer_internal (old);
3245 del_range (beg, end);
3246 insert_from_buffer (new, 1, len, 0);
3247 end = beg + len;
3248 }
3249 }
3250
3251 /* We may be able to shrink the conversion region. */
3252 begp = POS_ADDR (beg); endp = begp + (end - beg);
3253 shrink_conversion_area (&begp, &endp, coding, encodep);
3254
3255 if (begp == endp)
3256 /* We need no conversion. */
3257 len = end - beg;
3258 else
3259 {
3260 beg += begp - POS_ADDR (beg);
3261 end = beg + (endp - begp);
3262
3263 if (encodep)
3264 len = encoding_buffer_size (coding, end - beg);
3265 else
3266 len = decoding_buffer_size (coding, end - beg);
3267 buf = get_conversion_buffer (len);
3268
3269 coding->last_block = 1;
3270 produced = (encodep
3271 ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3272 &consumed)
3273 : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3274 &consumed));
3275
3276 len = produced + (beg - XINT (b)) + (XINT (e) - end);
3277
3278 TEMP_SET_PT (beg);
3279 insert (buf, produced);
3280 del_range (PT, PT + end - beg);
3281 if (pos >= end)
3282 pos = PT + (pos - end);
3283 else if (pos > beg)
3284 pos = beg;
3285 TEMP_SET_PT (pos);
3286 }
3287
3288 if (!encodep && !NILP (coding->post_read_conversion))
3289 {
3290 /* We must call a post-conversion function which may alter
3291 the text just converted. */
3292 Lisp_Object insval;
3293
3294 beg = XINT (b);
3295 TEMP_SET_PT (beg);
3296 insval = call1 (coding->post_read_conversion, make_number (len));
3297 CHECK_NUMBER (insval, 0);
3298 len = XINT (insval);
3299 }
3300
3301 return make_number (len);
3302}
3303
3304Lisp_Object
e0e989f6
KH
3305code_convert_string (str, coding, encodep, nocopy)
3306 Lisp_Object str, nocopy;
4ed46869
KH
3307 struct coding_system *coding;
3308 int encodep;
3309{
3310 int len, consumed, produced;
3311 char *buf;
3312 unsigned char *begp, *endp;
3313 int head_skip, tail_skip;
3314 struct gcpro gcpro1;
3315
3316 if (encodep && !NILP (coding->pre_write_conversion)
3317 || !encodep && !NILP (coding->post_read_conversion))
3318 {
3319 /* Since we have to call Lisp functions which assume target text
3320 is in a buffer, after setting a temporary buffer, call
3321 code_convert_region. */
3322 int count = specpdl_ptr - specpdl;
3323 int len = XSTRING (str)->size;
3324 Lisp_Object result;
3325 struct buffer *old = current_buffer;
3326
3327 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3328 temp_output_buffer_setup (" *code-converting-work*");
3329 set_buffer_internal (XBUFFER (Vstandard_output));
3330 insert_from_string (str, 0, len, 0);
3331 code_convert_region (make_number (BEGV), make_number (ZV),
3332 coding, encodep);
3333 result = make_buffer_string (BEGV, ZV, 0);
3334 set_buffer_internal (old);
3335 return unbind_to (count, result);
3336 }
3337
3338 /* We may be able to shrink the conversion region. */
3339 begp = XSTRING (str)->data;
3340 endp = begp + XSTRING (str)->size;
3341 shrink_conversion_area (&begp, &endp, coding, encodep);
3342
3343 if (begp == endp)
3344 /* We need no conversion. */
e0e989f6 3345 return (NILP (nocopy) ? Fcopy_sequence (str) : str);
4ed46869
KH
3346
3347 head_skip = begp - XSTRING (str)->data;
3348 tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3349
3350 GCPRO1 (str);
3351
3352 if (encodep)
3353 len = encoding_buffer_size (coding, endp - begp);
3354 else
3355 len = decoding_buffer_size (coding, endp - begp);
3356 buf = get_conversion_buffer (len + head_skip + tail_skip);
3357
3358 bcopy (XSTRING (str)->data, buf, head_skip);
3359 coding->last_block = 1;
3360 produced = (encodep
3361 ? encode_coding (coding, XSTRING (str)->data + head_skip,
3362 buf + head_skip, endp - begp, len, &consumed)
3363 : decode_coding (coding, XSTRING (str)->data + head_skip,
3364 buf + head_skip, endp - begp, len, &consumed));
3365 bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3366 buf + head_skip + produced,
3367 tail_skip);
3368
3369 UNGCPRO;
3370
3371 return make_string (buf, head_skip + produced + tail_skip);
3372}
3373
3374DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
e0e989f6
KH
3375 3, 3, "r\nzCoding system: ",
3376 "Decode current region by specified coding system.\n\
3377When called from a program, takes three arguments:\n\
3378START, END, and CODING-SYSTEM. START END are buffer positions.\n\
4ed46869
KH
3379Return length of decoded text.")
3380 (b, e, coding_system)
3381 Lisp_Object b, e, coding_system;
3382{
3383 struct coding_system coding;
3384
3385 CHECK_NUMBER_COERCE_MARKER (b, 0);
3386 CHECK_NUMBER_COERCE_MARKER (e, 1);
3387 CHECK_SYMBOL (coding_system, 2);
3388
e0e989f6
KH
3389 if (NILP (coding_system))
3390 return make_number (XFASTINT (e) - XFASTINT (b));
4ed46869
KH
3391 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3392 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3393
3394 return code_convert_region (b, e, &coding, 0);
3395}
3396
3397DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
e0e989f6
KH
3398 3, 3, "r\nzCoding system: ",
3399 "Encode current region by specified coding system.\n\
3400When called from a program, takes three arguments:\n\
3401START, END, and CODING-SYSTEM. START END are buffer positions.\n\
4ed46869
KH
3402Return length of encoded text.")
3403 (b, e, coding_system)
3404 Lisp_Object b, e, coding_system;
3405{
3406 struct coding_system coding;
3407
3408 CHECK_NUMBER_COERCE_MARKER (b, 0);
3409 CHECK_NUMBER_COERCE_MARKER (e, 1);
3410 CHECK_SYMBOL (coding_system, 2);
3411
e0e989f6
KH
3412 if (NILP (coding_system))
3413 return make_number (XFASTINT (e) - XFASTINT (b));
4ed46869
KH
3414 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3415 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3416
3417 return code_convert_region (b, e, &coding, 1);
3418}
3419
3420DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
e0e989f6
KH
3421 2, 3, 0,
3422 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3423Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3424of decoding.")
3425 (string, coding_system, nocopy)
3426 Lisp_Object string, coding_system, nocopy;
4ed46869
KH
3427{
3428 struct coding_system coding;
3429
3430 CHECK_STRING (string, 0);
3431 CHECK_SYMBOL (coding_system, 1);
3432
e0e989f6
KH
3433 if (NILP (coding_system))
3434 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869
KH
3435 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3436 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3437
e0e989f6 3438 return code_convert_string (string, &coding, 0, nocopy);
4ed46869
KH
3439}
3440
3441DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
e0e989f6
KH
3442 2, 3, 0,
3443 "Encode STRING to CODING-SYSTEM, and return the result.\n\
3444Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3445of encoding.")
3446 (string, coding_system, nocopy)
3447 Lisp_Object string, coding_system, nocopy;
4ed46869
KH
3448{
3449 struct coding_system coding;
3450
3451 CHECK_STRING (string, 0);
3452 CHECK_SYMBOL (coding_system, 1);
3453
e0e989f6
KH
3454 if (NILP (coding_system))
3455 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869
KH
3456 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3457 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3458
e0e989f6 3459 return code_convert_string (string, &coding, 1, nocopy);
4ed46869
KH
3460}
3461
3462DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
e0e989f6 3463 "Decode a JISX0208 character of shift-jis encoding.\n\
4ed46869
KH
3464CODE is the character code in SJIS.\n\
3465Return the corresponding character.")
3466 (code)
3467 Lisp_Object code;
3468{
3469 unsigned char c1, c2, s1, s2;
3470 Lisp_Object val;
3471
3472 CHECK_NUMBER (code, 0);
3473 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3474 DECODE_SJIS (s1, s2, c1, c2);
3475 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3476 return val;
3477}
3478
3479DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3480 "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3481Return the corresponding character code in SJIS.")
3482 (ch)
3483 Lisp_Object ch;
3484{
bcf26d6a 3485 int charset, c1, c2, s1, s2;
4ed46869
KH
3486 Lisp_Object val;
3487
3488 CHECK_NUMBER (ch, 0);
3489 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3490 if (charset == charset_jisx0208)
3491 {
3492 ENCODE_SJIS (c1, c2, s1, s2);
bcf26d6a 3493 XSETFASTINT (val, (s1 << 8) | s2);
4ed46869
KH
3494 }
3495 else
3496 XSETFASTINT (val, 0);
3497 return val;
3498}
3499
3500DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3501 "Decode a Big5 character CODE of BIG5 coding-system.\n\
3502CODE is the character code in BIG5.\n\
3503Return the corresponding character.")
3504 (code)
3505 Lisp_Object code;
3506{
3507 int charset;
3508 unsigned char b1, b2, c1, c2;
3509 Lisp_Object val;
3510
3511 CHECK_NUMBER (code, 0);
3512 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3513 DECODE_BIG5 (b1, b2, charset, c1, c2);
3514 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3515 return val;
3516}
3517
3518DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3519 "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3520Return the corresponding character code in Big5.")
3521 (ch)
3522 Lisp_Object ch;
3523{
bcf26d6a 3524 int charset, c1, c2, b1, b2;
4ed46869
KH
3525 Lisp_Object val;
3526
3527 CHECK_NUMBER (ch, 0);
3528 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3529 if (charset == charset_big5_1 || charset == charset_big5_2)
3530 {
3531 ENCODE_BIG5 (charset, c1, c2, b1, b2);
bcf26d6a 3532 XSETFASTINT (val, (b1 << 8) | b2);
4ed46869
KH
3533 }
3534 else
3535 XSETFASTINT (val, 0);
3536 return val;
3537}
3538
1ba9e4ab
KH
3539DEFUN ("set-terminal-coding-system-internal",
3540 Fset_terminal_coding_system_internal,
3541 Sset_terminal_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
3542 (coding_system)
3543 Lisp_Object coding_system;
3544{
3545 CHECK_SYMBOL (coding_system, 0);
3546 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
4ed46869
KH
3547 return Qnil;
3548}
3549
3550DEFUN ("terminal-coding-system",
3551 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3552 "Return coding-system of your terminal.")
3553 ()
3554{
3555 return terminal_coding.symbol;
3556}
3557
1ba9e4ab
KH
3558DEFUN ("set-keyboard-coding-system-internal",
3559 Fset_keyboard_coding_system_internal,
3560 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
3561 (coding_system)
3562 Lisp_Object coding_system;
3563{
3564 CHECK_SYMBOL (coding_system, 0);
3565 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3566 return Qnil;
3567}
3568
3569DEFUN ("keyboard-coding-system",
3570 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3571 "Return coding-system of what is sent from terminal keyboard.")
3572 ()
3573{
3574 return keyboard_coding.symbol;
3575}
3576
3577\f
a5d301df
KH
3578DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
3579 Sfind_operation_coding_system, 1, MANY, 0,
3580 "Choose a coding system for an operation based on the target name.\n\
9ce27fde
KH
3581The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
3582DECODING-SYSTEM is the coding system to use for decoding\n\
3583\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
3584for encoding (in case OPERATION does encoding).\n\
ccdb79f5
RS
3585\n\
3586The first argument OPERATION specifies an I/O primitive:\n\
3587 For file I/O, `insert-file-contents' or `write-region'.\n\
3588 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3589 For network I/O, `open-network-stream'.\n\
3590\n\
3591The remaining arguments should be the same arguments that were passed\n\
3592to the primitive. Depending on which primitive, one of those arguments\n\
3593is selected as the TARGET. For example, if OPERATION does file I/O,\n\
3594whichever argument specifies the file name is TARGET.\n\
3595\n\
3596TARGET has a meaning which depends on OPERATION:\n\
4ed46869
KH
3597 For file I/O, TARGET is a file name.\n\
3598 For process I/O, TARGET is a process name.\n\
3599 For network I/O, TARGET is a service name or a port number\n\
3600\n\
02ba4723
KH
3601This function looks up what specified for TARGET in,\n\
3602`file-coding-system-alist', `process-coding-system-alist',\n\
3603or `network-coding-system-alist' depending on OPERATION.\n\
3604They may specify a coding system, a cons of coding systems,\n\
3605or a function symbol to call.\n\
3606In the last case, we call the function with one argument,\n\
9ce27fde 3607which is a list of all the arguments given to this function.")
4ed46869
KH
3608 (nargs, args)
3609 int nargs;
3610 Lisp_Object *args;
3611{
3612 Lisp_Object operation, target_idx, target, val;
3613 register Lisp_Object chain;
3614
3615 if (nargs < 2)
3616 error ("Too few arguments");
3617 operation = args[0];
3618 if (!SYMBOLP (operation)
3619 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3620 error ("Invalid first arguement");
3621 if (nargs < 1 + XINT (target_idx))
3622 error ("Too few arguments for operation: %s",
3623 XSYMBOL (operation)->name->data);
3624 target = args[XINT (target_idx) + 1];
3625 if (!(STRINGP (target)
3626 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3627 error ("Invalid %dth argument", XINT (target_idx) + 1);
3628
2e34157c
RS
3629 chain = ((EQ (operation, Qinsert_file_contents)
3630 || EQ (operation, Qwrite_region))
02ba4723 3631 ? Vfile_coding_system_alist
2e34157c 3632 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
3633 ? Vnetwork_coding_system_alist
3634 : Vprocess_coding_system_alist));
4ed46869
KH
3635 if (NILP (chain))
3636 return Qnil;
3637
02ba4723 3638 for (; CONSP (chain); chain = XCONS (chain)->cdr)
4ed46869
KH
3639 {
3640 Lisp_Object elt = XCONS (chain)->car;
3641
3642 if (CONSP (elt)
3643 && ((STRINGP (target)
3644 && STRINGP (XCONS (elt)->car)
3645 && fast_string_match (XCONS (elt)->car, target) >= 0)
3646 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
02ba4723
KH
3647 {
3648 val = XCONS (elt)->cdr;
3649 if (CONSP (val))
3650 return val;
3651 if (! SYMBOLP (val))
3652 return Qnil;
3653 if (! NILP (Fcoding_system_p (val)))
3654 return Fcons (val, val);
465edc86 3655 if (!NILP (Ffboundp (val)))
5d632ccf 3656 return call1 (val, Flist (nargs, args));
02ba4723
KH
3657 return Qnil;
3658 }
4ed46869
KH
3659 }
3660 return Qnil;
3661}
3662
3663#endif /* emacs */
3664
3665\f
3666/*** 8. Post-amble ***/
3667
3668init_coding_once ()
3669{
3670 int i;
3671
0ef69138 3672 /* Emacs' internal format specific initialize routine. */
4ed46869
KH
3673 for (i = 0; i <= 0x20; i++)
3674 emacs_code_class[i] = EMACS_control_code;
3675 emacs_code_class[0x0A] = EMACS_linefeed_code;
3676 emacs_code_class[0x0D] = EMACS_carriage_return_code;
3677 for (i = 0x21 ; i < 0x7F; i++)
3678 emacs_code_class[i] = EMACS_ascii_code;
3679 emacs_code_class[0x7F] = EMACS_control_code;
3680 emacs_code_class[0x80] = EMACS_leading_code_composition;
3681 for (i = 0x81; i < 0xFF; i++)
3682 emacs_code_class[i] = EMACS_invalid_code;
3683 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3684 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3685 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3686 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3687
3688 /* ISO2022 specific initialize routine. */
3689 for (i = 0; i < 0x20; i++)
3690 iso_code_class[i] = ISO_control_code;
3691 for (i = 0x21; i < 0x7F; i++)
3692 iso_code_class[i] = ISO_graphic_plane_0;
3693 for (i = 0x80; i < 0xA0; i++)
3694 iso_code_class[i] = ISO_control_code;
3695 for (i = 0xA1; i < 0xFF; i++)
3696 iso_code_class[i] = ISO_graphic_plane_1;
3697 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3698 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3699 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3700 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3701 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3702 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3703 iso_code_class[ISO_CODE_ESC] = ISO_escape;
3704 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3705 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3706 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3707
e0e989f6
KH
3708 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3709 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3710
3711 setup_coding_system (Qnil, &keyboard_coding);
3712 setup_coding_system (Qnil, &terminal_coding);
9ce27fde
KH
3713
3714#if defined (MSDOS) || defined (WINDOWSNT)
3715 system_eol_type = CODING_EOL_CRLF;
3716#else
3717 system_eol_type = CODING_EOL_LF;
3718#endif
e0e989f6
KH
3719}
3720
3721#ifdef emacs
3722
3723syms_of_coding ()
3724{
3725 Qtarget_idx = intern ("target-idx");
3726 staticpro (&Qtarget_idx);
3727
9ce27fde 3728 /* Target FILENAME is the first argument. */
e0e989f6 3729 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 3730 /* Target FILENAME is the third argument. */
e0e989f6
KH
3731 Fput (Qwrite_region, Qtarget_idx, make_number (2));
3732
3733 Qcall_process = intern ("call-process");
3734 staticpro (&Qcall_process);
9ce27fde 3735 /* Target PROGRAM is the first argument. */
e0e989f6
KH
3736 Fput (Qcall_process, Qtarget_idx, make_number (0));
3737
3738 Qcall_process_region = intern ("call-process-region");
3739 staticpro (&Qcall_process_region);
9ce27fde 3740 /* Target PROGRAM is the third argument. */
e0e989f6
KH
3741 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3742
3743 Qstart_process = intern ("start-process");
3744 staticpro (&Qstart_process);
9ce27fde 3745 /* Target PROGRAM is the third argument. */
e0e989f6
KH
3746 Fput (Qstart_process, Qtarget_idx, make_number (2));
3747
3748 Qopen_network_stream = intern ("open-network-stream");
3749 staticpro (&Qopen_network_stream);
9ce27fde 3750 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
3751 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3752
4ed46869
KH
3753 Qcoding_system = intern ("coding-system");
3754 staticpro (&Qcoding_system);
3755
3756 Qeol_type = intern ("eol-type");
3757 staticpro (&Qeol_type);
3758
3759 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3760 staticpro (&Qbuffer_file_coding_system);
3761
3762 Qpost_read_conversion = intern ("post-read-conversion");
3763 staticpro (&Qpost_read_conversion);
3764
3765 Qpre_write_conversion = intern ("pre-write-conversion");
3766 staticpro (&Qpre_write_conversion);
3767
02ba4723
KH
3768 Qcoding_system_spec = intern ("coding-system-spec");
3769 staticpro (&Qcoding_system_spec);
4ed46869
KH
3770
3771 Qcoding_system_p = intern ("coding-system-p");
3772 staticpro (&Qcoding_system_p);
3773
3774 Qcoding_system_error = intern ("coding-system-error");
3775 staticpro (&Qcoding_system_error);
3776
3777 Fput (Qcoding_system_error, Qerror_conditions,
3778 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3779 Fput (Qcoding_system_error, Qerror_message,
9ce27fde 3780 build_string ("Invalid coding system"));
4ed46869
KH
3781
3782 Qcoding_category_index = intern ("coding-category-index");
3783 staticpro (&Qcoding_category_index);
3784
3785 {
3786 int i;
3787 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3788 {
3789 coding_category_table[i] = intern (coding_category_name[i]);
3790 staticpro (&coding_category_table[i]);
3791 Fput (coding_category_table[i], Qcoding_category_index,
3792 make_number (i));
3793 }
3794 }
3795
bdd9fb48
KH
3796 Qcharacter_unification_table = intern ("character-unification-table");
3797 staticpro (&Qcharacter_unification_table);
3798 Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
3799 make_number (0));
3800
a5d301df
KH
3801 Qcharacter_unification_table_for_decode
3802 = intern ("character-unification-table-for-decode");
3803 staticpro (&Qcharacter_unification_table_for_decode);
3804
3805 Qcharacter_unification_table_for_encode
3806 = intern ("character-unification-table-for-encode");
3807 staticpro (&Qcharacter_unification_table_for_encode);
3808
9ce27fde
KH
3809 Qemacs_mule = intern ("emacs-mule");
3810 staticpro (&Qemacs_mule);
3811
02ba4723 3812 defsubr (&Scoding_system_spec);
4ed46869
KH
3813 defsubr (&Scoding_system_p);
3814 defsubr (&Sread_coding_system);
3815 defsubr (&Sread_non_nil_coding_system);
3816 defsubr (&Scheck_coding_system);
3817 defsubr (&Sdetect_coding_region);
3818 defsubr (&Sdecode_coding_region);
3819 defsubr (&Sencode_coding_region);
3820 defsubr (&Sdecode_coding_string);
3821 defsubr (&Sencode_coding_string);
3822 defsubr (&Sdecode_sjis_char);
3823 defsubr (&Sencode_sjis_char);
3824 defsubr (&Sdecode_big5_char);
3825 defsubr (&Sencode_big5_char);
1ba9e4ab 3826 defsubr (&Sset_terminal_coding_system_internal);
4ed46869 3827 defsubr (&Sterminal_coding_system);
1ba9e4ab 3828 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 3829 defsubr (&Skeyboard_coding_system);
a5d301df 3830 defsubr (&Sfind_operation_coding_system);
4ed46869
KH
3831
3832 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3833 "List of coding-categories (symbols) ordered by priority.");
3834 {
3835 int i;
3836
3837 Vcoding_category_list = Qnil;
3838 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3839 Vcoding_category_list
3840 = Fcons (coding_category_table[i], Vcoding_category_list);
3841 }
3842
3843 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
10bff6f1
RS
3844 "Specify the coding system for read operations.\n\
3845It is useful to bind this variable with `let', but do not set it globally.
4ed46869 3846If the value is a coding system, it is used for decoding on read operation.\n\
a67a9c66 3847If not, an appropriate element is used from one of the coding system alists:\n\
10bff6f1 3848There are three such tables, `file-coding-system-alist',\n\
a67a9c66 3849`process-coding-system-alist', and `network-coding-system-alist'.");
4ed46869
KH
3850 Vcoding_system_for_read = Qnil;
3851
3852 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
10bff6f1
RS
3853 "Specify the coding system for write operations.\n\
3854It is useful to bind this variable with `let', but do not set it globally.
4ed46869 3855If the value is a coding system, it is used for encoding on write operation.\n\
a67a9c66 3856If not, an appropriate element is used from one of the coding system alists:\n\
10bff6f1 3857There are three such tables, `file-coding-system-alist',\n\
a67a9c66 3858`process-coding-system-alist', and `network-coding-system-alist'.");
4ed46869
KH
3859 Vcoding_system_for_write = Qnil;
3860
3861 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
a67a9c66 3862 "Coding system used in the latest file or process I/O.");
4ed46869
KH
3863 Vlast_coding_system_used = Qnil;
3864
9ce27fde
KH
3865 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
3866 "*Non-nil inhibit code conversion of end-of-line format in any cases.");
3867 inhibit_eol_conversion = 0;
3868
02ba4723
KH
3869 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
3870 "Alist to decide a coding system to use for a file I/O operation.\n\
3871The format is ((PATTERN . VAL) ...),\n\
3872where PATTERN is a regular expression matching a file name,\n\
3873VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3874If VAL is a coding system, it is used for both decoding and encoding\n\
3875the file contents.\n\
3876If VAL is a cons of coding systems, the car part is used for decoding,\n\
3877and the cdr part is used for encoding.\n\
3878If VAL is a function symbol, the function must return a coding system\n\
3879or a cons of coding systems which are used as above.\n\
e0e989f6 3880\n\
9ce27fde 3881See also the function `find-operation-coding-system'.");
02ba4723
KH
3882 Vfile_coding_system_alist = Qnil;
3883
3884 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
3885 "Alist to decide a coding system to use for a process I/O operation.\n\
3886The format is ((PATTERN . VAL) ...),\n\
3887where PATTERN is a regular expression matching a program name,\n\
3888VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3889If VAL is a coding system, it is used for both decoding what received\n\
3890from the program and encoding what sent to the program.\n\
3891If VAL is a cons of coding systems, the car part is used for decoding,\n\
3892and the cdr part is used for encoding.\n\
3893If VAL is a function symbol, the function must return a coding system\n\
3894or a cons of coding systems which are used as above.\n\
4ed46869 3895\n\
9ce27fde 3896See also the function `find-operation-coding-system'.");
02ba4723
KH
3897 Vprocess_coding_system_alist = Qnil;
3898
3899 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
3900 "Alist to decide a coding system to use for a network I/O operation.\n\
3901The format is ((PATTERN . VAL) ...),\n\
3902where PATTERN is a regular expression matching a network service name\n\
3903or is a port number to connect to,\n\
3904VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3905If VAL is a coding system, it is used for both decoding what received\n\
3906from the network stream and encoding what sent to the network stream.\n\
3907If VAL is a cons of coding systems, the car part is used for decoding,\n\
3908and the cdr part is used for encoding.\n\
3909If VAL is a function symbol, the function must return a coding system\n\
3910or a cons of coding systems which are used as above.\n\
4ed46869 3911\n\
9ce27fde 3912See also the function `find-operation-coding-system'.");
02ba4723 3913 Vnetwork_coding_system_alist = Qnil;
4ed46869
KH
3914
3915 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
3916 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
458822a0 3917 eol_mnemonic_unix = ':';
4ed46869
KH
3918
3919 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
3920 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
458822a0 3921 eol_mnemonic_dos = '\\';
4ed46869
KH
3922
3923 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
3924 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
458822a0 3925 eol_mnemonic_mac = '/';
4ed46869
KH
3926
3927 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
3928 "Mnemonic character indicating end-of-line format is not yet decided.");
458822a0 3929 eol_mnemonic_undecided = ':';
4ed46869 3930
bdd9fb48
KH
3931 DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
3932 "Non-nil means ISO 2022 encoder/decoder do character unification.");
3933 Venable_character_unification = Qt;
3934
a5d301df
KH
3935 DEFVAR_LISP ("standard-character-unification-table-for-decode",
3936 &Vstandard_character_unification_table_for_decode,
bdd9fb48 3937 "Table for unifying characters when reading.");
a5d301df 3938 Vstandard_character_unification_table_for_decode = Qnil;
bdd9fb48 3939
a5d301df
KH
3940 DEFVAR_LISP ("standard-character-unification-table-for-encode",
3941 &Vstandard_character_unification_table_for_encode,
bdd9fb48 3942 "Table for unifying characters when writing.");
a5d301df 3943 Vstandard_character_unification_table_for_encode = Qnil;
4ed46869
KH
3944
3945 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
3946 "Alist of charsets vs revision numbers.\n\
3947While encoding, if a charset (car part of an element) is found,\n\
3948designate it with the escape sequence identifing revision (cdr part of the element).");
3949 Vcharset_revision_alist = Qnil;
02ba4723
KH
3950
3951 DEFVAR_LISP ("default-process-coding-system",
3952 &Vdefault_process_coding_system,
3953 "Cons of coding systems used for process I/O by default.\n\
3954The car part is used for decoding a process output,\n\
3955the cdr part is used for encoding a text to be sent to a process.");
3956 Vdefault_process_coding_system = Qnil;
4ed46869
KH
3957}
3958
3959#endif /* emacs */