(internal_self_insert): Simplify handling of c2;
[bpt/emacs.git] / src / coding.c
CommitLineData
4ed46869 1/* Coding system handler (conversion, detection, and etc).
203cb916
RS
2 Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4ed46869 4
369314dc
KH
5This file is part of GNU Emacs.
6
7GNU Emacs is free software; you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation; either version 2, or (at your option)
10any later version.
4ed46869 11
369314dc
KH
12GNU Emacs is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
4ed46869 16
369314dc
KH
17You should have received a copy of the GNU General Public License
18along with GNU Emacs; see the file COPYING. If not, write to
19the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20Boston, MA 02111-1307, USA. */
4ed46869
KH
21
22/*** TABLE OF CONTENTS ***
23
24 1. Preamble
0ef69138 25 2. Emacs' internal format (emacs-mule) handlers
4ed46869
KH
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
28 5. End-of-line handlers
29 6. C library functions
30 7. Emacs Lisp library functions
31 8. Post-amble
32
33*/
34
35/*** GENERAL NOTE on CODING SYSTEM ***
36
37 Coding system is an encoding mechanism of one or more character
38 sets. Here's a list of coding systems which Emacs can handle. When
39 we say "decode", it means converting some other coding system to
0ef69138
KH
40 Emacs' internal format (emacs-mule), and when we say "encode",
41 it means converting the coding system emacs-mule to some other
42 coding system.
4ed46869 43
0ef69138 44 0. Emacs' internal format (emacs-mule)
4ed46869
KH
45
46 Emacs itself holds a multi-lingual character in a buffer and a string
f4dee582 47 in a special format. Details are described in section 2.
4ed46869
KH
48
49 1. ISO2022
50
51 The most famous coding system for multiple character sets. X's
f4dee582
RS
52 Compound Text, various EUCs (Extended Unix Code), and coding
53 systems used in Internet communication such as ISO-2022-JP are
54 all variants of ISO2022. Details are described in section 3.
4ed46869
KH
55
56 2. SJIS (or Shift-JIS or MS-Kanji-Code)
57
58 A coding system to encode character sets: ASCII, JISX0201, and
59 JISX0208. Widely used for PC's in Japan. Details are described in
f4dee582 60 section 4.
4ed46869
KH
61
62 3. BIG5
63
64 A coding system to encode character sets: ASCII and Big5. Widely
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
f4dee582
RS
66 described in section 4. In this file, when we write "BIG5"
67 (all uppercase), we mean the coding system, and when we write
68 "Big5" (capitalized), we mean the character set.
4ed46869 69
27901516
KH
70 4. Raw text
71
4608c386
KH
72 A coding system for a text containing random 8-bit code. Emacs does
73 no code conversion on such a text except for end-of-line format.
27901516
KH
74
75 5. Other
4ed46869 76
f4dee582 77 If a user wants to read/write a text encoded in a coding system not
4ed46869
KH
78 listed above, he can supply a decoder and an encoder for it in CCL
79 (Code Conversion Language) programs. Emacs executes the CCL program
80 while reading/writing.
81
f4dee582 82 Emacs represents a coding-system by a Lisp symbol that has a property
4ed46869
KH
83 `coding-system'. But, before actually using the coding-system, the
84 information about it is set in a structure of type `struct
f4dee582 85 coding_system' for rapid processing. See section 6 for more details.
4ed46869
KH
86
87*/
88
89/*** GENERAL NOTES on END-OF-LINE FORMAT ***
90
91 How end-of-line of a text is encoded depends on a system. For
92 instance, Unix's format is just one byte of `line-feed' code,
f4dee582 93 whereas DOS's format is two-byte sequence of `carriage-return' and
4ed46869
KH
94 `line-feed' codes. MacOS's format is one byte of `carriage-return'.
95
f4dee582
RS
96 Since text characters encoding and end-of-line encoding are
97 independent, any coding system described above can take
4ed46869 98 any format of end-of-line. So, Emacs has information of format of
f4dee582 99 end-of-line in each coding-system. See section 6 for more details.
4ed46869
KH
100
101*/
102
103/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
104
105 These functions check if a text between SRC and SRC_END is encoded
106 in the coding system category XXX. Each returns an integer value in
107 which appropriate flag bits for the category XXX are set. The flag
108 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
109 template of these functions. */
110#if 0
111int
0ef69138 112detect_coding_emacs_mule (src, src_end)
4ed46869
KH
113 unsigned char *src, *src_end;
114{
115 ...
116}
117#endif
118
119/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
120
121 These functions decode SRC_BYTES length text at SOURCE encoded in
0ef69138 122 CODING to Emacs' internal format (emacs-mule). The resulting text
f4dee582
RS
123 goes to a place pointed to by DESTINATION, the length of which should
124 not exceed DST_BYTES. The number of bytes actually processed is
125 returned as *CONSUMED. The return value is the length of the decoded
126 text. Below is a template of these functions. */
4ed46869
KH
127#if 0
128decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
129 struct coding_system *coding;
130 unsigned char *source, *destination;
131 int src_bytes, dst_bytes;
132 int *consumed;
133{
134 ...
135}
136#endif
137
138/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
139
0ef69138
KH
140 These functions encode SRC_BYTES length text at SOURCE of Emacs'
141 internal format (emacs-mule) to CODING. The resulting text goes to
f4dee582
RS
142 a place pointed to by DESTINATION, the length of which should not
143 exceed DST_BYTES. The number of bytes actually processed is
144 returned as *CONSUMED. The return value is the length of the
145 encoded text. Below is a template of these functions. */
4ed46869
KH
146#if 0
147encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
148 struct coding_system *coding;
149 unsigned char *source, *destination;
150 int src_bytes, dst_bytes;
151 int *consumed;
152{
153 ...
154}
155#endif
156
157/*** COMMONLY USED MACROS ***/
158
159/* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
160 THREE_MORE_BYTES safely get one, two, and three bytes from the
161 source text respectively. If there are not enough bytes in the
162 source, they jump to `label_end_of_loop'. The caller should set
163 variables `src' and `src_end' to appropriate areas in advance. */
164
/* Each macro below first checks how many bytes remain by computing
   `src_end - src'.  Comparing `src + N' against `src_end' instead
   would form a pointer beyond one-past-the-end of the source when
   fewer than N bytes remain, which is undefined behaviour.  When the
   check fails, `src' and the output variables are left untouched.  */

/* Fetch one byte into C1, or jump to `label_end_of_loop' if the
   source is exhausted.  */
#define ONE_MORE_BYTE(c1)                       \
  do {                                          \
    if (src_end - src < 1)                      \
      goto label_end_of_loop;                   \
    c1 = *src++;                                \
  } while (0)

/* Fetch two bytes into C1 and C2, or jump to `label_end_of_loop' if
   fewer than two bytes remain.  */
#define TWO_MORE_BYTES(c1, c2)                  \
  do {                                          \
    if (src_end - src < 2)                      \
      goto label_end_of_loop;                   \
    c1 = *src++;                                \
    c2 = *src++;                                \
  } while (0)

/* Fetch three bytes into C1, C2 and C3, or jump to
   `label_end_of_loop' if fewer than three bytes remain.  */
#define THREE_MORE_BYTES(c1, c2, c3)            \
  do {                                          \
    if (src_end - src < 3)                      \
      goto label_end_of_loop;                   \
    c1 = *src++;                                \
    c2 = *src++;                                \
    c3 = *src++;                                \
  } while (0)
188
189/* The following three macros DECODE_CHARACTER_ASCII,
190 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
191 the multi-byte form of a character of each class at the place
192 pointed by `dst'. The caller should set the variable `dst' to
193 point to an appropriate area and the variable `coding' to point to
194 the coding-system of the currently decoding text in advance. */
195
196/* Decode one ASCII character C. */
197
/* Outside a composition the byte C is stored as is.  Inside one, it
   is stored as the two bytes 0xA0 and C with the high bit set.  */
#define DECODE_CHARACTER_ASCII(c)               \
  do {                                          \
    if (!COMPOSING_P (coding->composing))       \
      *dst++ = (c);                             \
    else                                        \
      {                                         \
        *dst++ = 0xA0;                          \
        *dst++ = (c) | 0x80;                    \
      }                                         \
  } while (0)
205
f4dee582 206/* Decode one DIMENSION1 character whose charset is CHARSET and whose
4ed46869
KH
207 position-code is C. */
208
/* Store the base leading-code of CHARSET (raised by 0x20 while
   composing), then the extended leading-code if CHARSET has a
   non-zero one, then the position-code C with the high bit set.  */
#define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
  do {                                                                  \
    unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
    *dst++ = (COMPOSING_P (coding->composing)                           \
              ? leading_code + 0x20                                     \
              : leading_code);                                          \
    leading_code = CHARSET_LEADING_CODE_EXT (charset);                  \
    if (leading_code != 0)                                              \
      *dst++ = leading_code;                                            \
    *dst++ = (c) | 0x80;                                                \
  } while (0)

/* Decode one DIMENSION2 character whose charset is CHARSET and whose
   position-codes are C1 and C2: everything DIMENSION1 stores for C1,
   followed by C2 with the high bit set.  */
#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
  do {                                                  \
    DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
    *dst++ = 0x80 | (c2);                               \
  } while (0)
229
230\f
231/*** 1. Preamble ***/
232
233#include <stdio.h>
234
235#ifdef emacs
236
237#include <config.h>
238#include "lisp.h"
239#include "buffer.h"
240#include "charset.h"
241#include "ccl.h"
242#include "coding.h"
243#include "window.h"
244
245#else /* not emacs */
246
247#include "mulelib.h"
248
249#endif /* not emacs */
250
251Lisp_Object Qcoding_system, Qeol_type;
252Lisp_Object Qbuffer_file_coding_system;
253Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
27901516 254Lisp_Object Qno_conversion, Qundecided;
bb0115a2 255Lisp_Object Qcoding_system_history;
70c22245 256Lisp_Object Qsafe_charsets;
4ed46869
KH
257
258extern Lisp_Object Qinsert_file_contents, Qwrite_region;
259Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
260Lisp_Object Qstart_process, Qopen_network_stream;
261Lisp_Object Qtarget_idx;
262
263/* Mnemonic character of each format of end-of-line. */
264int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
265/* Mnemonic character to indicate format of end-of-line is not yet
266 decided. */
267int eol_mnemonic_undecided;
268
9ce27fde
KH
269/* Format of end-of-line decided by system. This is CODING_EOL_LF on
270 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
271int system_eol_type;
272
4ed46869
KH
#ifdef emacs

Lisp_Object Vcoding_system_list;
Lisp_Object Vcoding_system_alist;

Lisp_Object Qcoding_system_p;
Lisp_Object Qcoding_system_error;

/* The coding system emacs-mule converts only the end-of-line
   format.  */
Lisp_Object Qemacs_mule;

/* The next three variables are how Emacs Lisp programs and the C
   internal routines hand coding-systems to each other.  */

/* Coding-system used when reading files and receiving data from a
   process.  */
Lisp_Object Vcoding_system_for_read;

/* Coding-system used when writing files and sending data to a
   process.  */
Lisp_Object Vcoding_system_for_write;

/* Coding-system that the most recent I/O actually used.  */
Lisp_Object Vlast_coding_system_used;

/* A vector of length 256 holding information about special Latin
   codes (especially for dealing with Microsoft code).  */
Lisp_Object Vlatin_extra_code_table;

/* Flag to inhibit code conversion of the end-of-line format.  */
int inhibit_eol_conversion;

/* Coding system used to encode text for terminal display.  */
struct coding_system terminal_coding;

/* Coding system used to encode text for terminal display when the
   terminal coding system is nil.  */
struct coding_system safe_terminal_coding;

/* Coding system of what the terminal keyboard sends.  */
struct coding_system keyboard_coding;

Lisp_Object Vfile_coding_system_alist;
Lisp_Object Vprocess_coding_system_alist;
Lisp_Object Vnetwork_coding_system_alist;

#endif /* emacs */
313
314Lisp_Object Qcoding_category_index;
315
316/* List of symbols `coding-category-xxx' ordered by priority. */
317Lisp_Object Vcoding_category_list;
318
319/* Table of coding-systems currently assigned to each coding-category. */
320Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
321
322/* Table of names of symbol for each coding-category. */
323char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
0ef69138 324 "coding-category-emacs-mule",
4ed46869
KH
325 "coding-category-sjis",
326 "coding-category-iso-7",
327 "coding-category-iso-8-1",
328 "coding-category-iso-8-2",
7717c392
KH
329 "coding-category-iso-7-else",
330 "coding-category-iso-8-else",
4ed46869 331 "coding-category-big5",
27901516 332 "coding-category-raw-text",
4ed46869
KH
333 "coding-category-binary"
334};
335
bdd9fb48
KH
336/* Flag to tell if we look up unification table on character code
337 conversion. */
338Lisp_Object Venable_character_unification;
a5d301df
KH
339/* Standard unification table to look up on decoding (reading). */
340Lisp_Object Vstandard_character_unification_table_for_decode;
341/* Standard unification table to look up on encoding (writing). */
342Lisp_Object Vstandard_character_unification_table_for_encode;
bdd9fb48
KH
343
344Lisp_Object Qcharacter_unification_table;
a5d301df
KH
345Lisp_Object Qcharacter_unification_table_for_decode;
346Lisp_Object Qcharacter_unification_table_for_encode;
4ed46869
KH
347
348/* Alist of charsets vs revision number. */
349Lisp_Object Vcharset_revision_alist;
350
02ba4723
KH
351/* Default coding systems used for process I/O. */
352Lisp_Object Vdefault_process_coding_system;
353
4ed46869 354\f
0ef69138 355/*** 2. Emacs internal format (emacs-mule) handlers ***/
4ed46869
KH
356
357/* Emacs' internal format for encoding multiple character sets is a
f4dee582
RS
358 kind of multi-byte encoding, i.e. characters are encoded by
359 variable-length sequences of one-byte codes. ASCII characters
360 and control characters (e.g. `tab', `newline') are represented by
361 one-byte sequences which are their ASCII codes, in the range 0x00
362 through 0x7F. The other characters are represented by a sequence
363 of `base leading-code', optional `extended leading-code', and one
364 or two `position-code's. The length of the sequence is determined
365 by the base leading-code. Leading-code takes the range 0x80
366 through 0x9F, whereas extended leading-code and position-code take
367 the range 0xA0 through 0xFF. See `charset.h' for more details
368 about leading-code and position-code.
369
370 There's one exception to this rule. Special leading-code
4ed46869
KH
371 `leading-code-composition' denotes that the following several
372 characters should be composed into one character. Leading-codes of
373 components (except for ASCII) are added 0x20. An ASCII character
374 component is represented by a 2-byte sequence of `0xA0' and
f4dee582
RS
375 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
376 details of composite character. Hence, we can summarize the code
4ed46869
KH
377 range as follows:
378
379 --- CODE RANGE of Emacs' internal format ---
380 (character set) (range)
381 ASCII 0x00 .. 0x7F
382 ELSE (1st byte) 0x80 .. 0x9F
383 (rest bytes) 0xA0 .. 0xFF
384 ---------------------------------------------
385
386 */
387
/* Class of each of the 256 possible byte values in Emacs' internal
   format, indexed by the byte.  detect_coding_emacs_mule switches on
   these entries.  The table is not filled in here; presumably it is
   initialized elsewhere in this file.  */
enum emacs_code_class_type emacs_code_class[256];
389
390/* Go to the next statement only if *SRC is accessible and the code is
391 not less than 0xA0 (i.e. in the range 0xA0 through 0xFF). */
/* Consume one byte from `src'.  Falls through when that byte is in
   0xA0..0xFF, makes the enclosing function return 0 when it is below
   0xA0, and jumps to `label_end_of_switch' without consuming anything
   when `src' has already reached `src_end'.  */
#define CHECK_CODE_RANGE_A0_FF          \
  do {                                  \
    if (src < src_end)                  \
      {                                 \
        if (*src++ < 0xA0)              \
          return 0;                     \
      }                                 \
    else                                \
      goto label_end_of_switch;         \
  } while (0)
399
400/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
401 Check if a text is encoded in Emacs' internal format. If it is,
0ef69138 402 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
4ed46869
KH
403
404int
0ef69138 405detect_coding_emacs_mule (src, src_end)
4ed46869
KH
406 unsigned char *src, *src_end;
407{
408 unsigned char c;
409 int composing = 0;
410
411 while (src < src_end)
412 {
413 c = *src++;
414
415 if (composing)
416 {
417 if (c < 0xA0)
418 composing = 0;
419 else
420 c -= 0x20;
421 }
422
423 switch (emacs_code_class[c])
424 {
425 case EMACS_ascii_code:
426 case EMACS_linefeed_code:
427 break;
428
429 case EMACS_control_code:
430 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
431 return 0;
432 break;
433
434 case EMACS_invalid_code:
435 return 0;
436
437 case EMACS_leading_code_composition: /* c == 0x80 */
438 if (composing)
439 CHECK_CODE_RANGE_A0_FF;
440 else
441 composing = 1;
442 break;
443
444 case EMACS_leading_code_4:
445 CHECK_CODE_RANGE_A0_FF;
446 /* fall down to check it two more times ... */
447
448 case EMACS_leading_code_3:
449 CHECK_CODE_RANGE_A0_FF;
450 /* fall down to check it one more time ... */
451
452 case EMACS_leading_code_2:
453 CHECK_CODE_RANGE_A0_FF;
454 break;
455
456 default:
457 label_end_of_switch:
458 break;
459 }
460 }
0ef69138 461 return CODING_CATEGORY_MASK_EMACS_MULE;
4ed46869
KH
462}
463
464\f
465/*** 3. ISO2022 handlers ***/
466
467/* The following note describes the coding system ISO2022 briefly.
f4dee582
RS
468 Since the intention of this note is to help in understanding of
469 the programs in this file, some parts are NOT ACCURATE or OVERLY
4ed46869
KH
470 SIMPLIFIED. For the thorough understanding, please refer to the
471 original document of ISO2022.
472
473 ISO2022 provides many mechanisms to encode several character sets
f4dee582 474 in 7-bit and 8-bit environment. If one chooses 7-bit environment,
4ed46869 475 all text is encoded by codes of less than 128. This may make the
f4dee582
RS
476 encoded text a little bit longer, but the text gets more stability
477 to pass through several gateways (some of them strip off the MSB).
4ed46869 478
f4dee582 479 There are two kinds of character set: control character set and
4ed46869
KH
480 graphic character set. The former contains control characters such
481 as `newline' and `escape' to provide control functions (control
f4dee582 482 functions are provided also by escape sequences). The latter
4ed46869
KH
483 contains graphic characters such as 'A' and '-'. Emacs recognizes
484 two control character sets and many graphic character sets.
485
486 Graphic character sets are classified into one of the following
487 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
488 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
489 bytes (DIMENSION) and the number of characters in one dimension
490 (CHARS) of the set. In addition, each character set is assigned an
491 identification tag (called "final character" and denoted as <F>
492 here after) which is unique in each class. <F> of each character
493 set is decided by ECMA(*) when it is registered in ISO. Code range
494 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
495
496 Note (*): ECMA = European Computer Manufacturers Association
497
498 Here are examples of graphic character set [NAME(<F>)]:
499 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
500 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
501 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
502 o DIMENSION2_CHARS96 -- none for the moment
503
504 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
505 C0 [0x00..0x1F] -- control character plane 0
506 GL [0x20..0x7F] -- graphic character plane 0
507 C1 [0x80..0x9F] -- control character plane 1
508 GR [0xA0..0xFF] -- graphic character plane 1
509
510 A control character set is directly designated and invoked to C0 or
511 C1 by an escape sequence. The most common case is that ISO646's
512 control character set is designated/invoked to C0 and ISO6429's
513 control character set is designated/invoked to C1, and usually
514 these designations/invocations are omitted in a coded text. With
515 7-bit environment, only C0 can be used, and a control character for
516 C1 is encoded by an appropriate escape sequence to fit in the
517 environment. All control characters for C1 have
518 corresponding escape sequences defined.
519
520 A graphic character set is at first designated to one of four
521 graphic registers (G0 through G3), then these graphic registers are
522 invoked to GL or GR. These designations and invocations can be
523 done independently. The most common case is that G0 is invoked to
524 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
525 these invocations and designations are omitted in a coded text.
526 With 7-bit environment, only GL can be used.
527
528 When a graphic character set of CHARS94 is invoked to GL, code 0x20
529 and 0x7F of GL area work as control characters SPACE and DEL
530 respectively, and code 0xA0 and 0xFF of GR area should not be used.
531
532 There are two ways of invocation: locking-shift and single-shift.
533 With locking-shift, the invocation lasts until the next different
534 invocation, whereas with single-shift, the invocation works only
535 for the following character and doesn't affect locking-shift.
536 Invocations are done by the following control characters or escape
537 sequences.
538
539 ----------------------------------------------------------------------
540 function control char escape sequence description
541 ----------------------------------------------------------------------
542 SI (shift-in) 0x0F none invoke G0 to GL
10bff6f1 543 SO (shift-out) 0x0E none invoke G1 to GL
4ed46869
KH
544 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
545 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
546 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
547 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
548 ----------------------------------------------------------------------
549 The first four are for locking-shift. Control characters for these
550 functions are defined by macros ISO_CODE_XXX in `coding.h'.
551
552 Designations are done by the following escape sequences.
553 ----------------------------------------------------------------------
554 escape sequence description
555 ----------------------------------------------------------------------
556 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
557 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
558 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
559 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
560 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
561 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
562 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
563 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
564 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
565 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
566 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
567 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
568 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
569 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
570 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
571 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
572 ----------------------------------------------------------------------
573
574 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
575 of dimension 1, chars 94, and final character <F>, and etc.
576
577 Note (*): Although these designations are not allowed in ISO2022,
578 Emacs accepts them on decoding, and produces them on encoding
579 CHARS96 character set in a coding system which is characterized as
580 7-bit environment, non-locking-shift, and non-single-shift.
581
582 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
583 '(' can be omitted. We call this as "short-form" here after.
584
585 Now you may notice that there are a lot of ways for encoding the
f4dee582 586 same multilingual text in ISO2022. Actually, there exists many
4ed46869
KH
587 coding systems such as Compound Text (used in X's inter client
588 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
589 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
590 localized platforms), and all of these are variants of ISO2022.
591
592 In addition to the above, Emacs handles two more kinds of escape
593 sequences: ISO6429's direction specification and Emacs' private
594 sequence for specifying character composition.
595
596 ISO6429's direction specification takes the following format:
597 o CSI ']' -- end of the current direction
598 o CSI '0' ']' -- end of the current direction
599 o CSI '1' ']' -- start of left-to-right text
600 o CSI '2' ']' -- start of right-to-left text
601 The control character CSI (0x9B: control sequence introducer) is
602 abbreviated to the escape sequence ESC '[' in 7-bit environment.
603
604 Character composition specification takes the following format:
605 o ESC '0' -- start character composition
606 o ESC '1' -- end character composition
607 Since these are not standard escape sequences of any ISO, the use
608 of them for these meaning is restricted to Emacs only. */
609
/* Class of each of the 256 possible byte values in ISO2022, indexed
   by the byte.  The ISO2022 decoder switches on these entries.  The
   table is not filled in here; presumably it is initialized elsewhere
   in this file.  */
enum iso_code_class_type iso_code_class[256];
611
612/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
613 Check if a text is encoded in ISO2022. If it is, returns an
614 integer in which appropriate flag bits any of:
615 CODING_CATEGORY_MASK_ISO_7
616 CODING_CATEGORY_MASK_ISO_8_1
617 CODING_CATEGORY_MASK_ISO_8_2
7717c392
KH
618 CODING_CATEGORY_MASK_ISO_7_ELSE
619 CODING_CATEGORY_MASK_ISO_8_ELSE
4ed46869
KH
620 are set. If a code which should never appear in ISO2022 is found,
621 returns 0. */
622
623int
624detect_coding_iso2022 (src, src_end)
625 unsigned char *src, *src_end;
626{
765a2ca5
KH
627 int mask = (CODING_CATEGORY_MASK_ISO_7
628 | CODING_CATEGORY_MASK_ISO_8_1
629 | CODING_CATEGORY_MASK_ISO_8_2
7717c392
KH
630 | CODING_CATEGORY_MASK_ISO_7_ELSE
631 | CODING_CATEGORY_MASK_ISO_8_ELSE
632 );
bcf26d6a
KH
633 int g1 = 0; /* 1 iff designating to G1. */
634 int c, i;
3f003981 635 struct coding_system coding_iso_8_1, coding_iso_8_2;
4ed46869 636
3f003981
KH
637 /* Coding systems of these categories may accept latin extra codes. */
638 setup_coding_system
639 (XSYMBOL (coding_category_table[CODING_CATEGORY_IDX_ISO_8_1])->value,
640 &coding_iso_8_1);
641 setup_coding_system
642 (XSYMBOL (coding_category_table[CODING_CATEGORY_IDX_ISO_8_2])->value,
643 &coding_iso_8_2);
644
645 while (mask && src < src_end)
4ed46869
KH
646 {
647 c = *src++;
648 switch (c)
649 {
650 case ISO_CODE_ESC:
e0e989f6 651 if (src >= src_end)
4ed46869
KH
652 break;
653 c = *src++;
bf9cdd4e 654 if ((c >= '(' && c <= '/'))
4ed46869 655 {
bf9cdd4e
KH
656 /* Designation sequence for a charset of dimension 1. */
657 if (src >= src_end)
658 break;
659 c = *src++;
660 if (c < ' ' || c >= 0x80)
661 /* Invalid designation sequence. */
662 return 0;
663 }
664 else if (c == '$')
665 {
666 /* Designation sequence for a charset of dimension 2. */
667 if (src >= src_end)
668 break;
669 c = *src++;
670 if (c >= '@' && c <= 'B')
671 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
672 ;
673 else if (c >= '(' && c <= '/')
bcf26d6a 674 {
bf9cdd4e
KH
675 if (src >= src_end)
676 break;
677 c = *src++;
678 if (c < ' ' || c >= 0x80)
679 /* Invalid designation sequence. */
680 return 0;
bcf26d6a 681 }
bf9cdd4e
KH
682 else
683 /* Invalid designation sequence. */
684 return 0;
4ed46869 685 }
4ed46869 686 else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
bf9cdd4e 687 /* Locking shift. */
7717c392
KH
688 mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
689 | CODING_CATEGORY_MASK_ISO_8_ELSE);
bf9cdd4e
KH
690 else if (c == '0' || c == '1' || c == '2')
691 /* Start/end composition. */
692 ;
693 else
694 /* Invalid escape sequence. */
695 return 0;
4ed46869
KH
696 break;
697
4ed46869 698 case ISO_CODE_SO:
bf9cdd4e
KH
699 mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
700 | CODING_CATEGORY_MASK_ISO_8_ELSE);
e0e989f6
KH
701 break;
702
4ed46869
KH
703 case ISO_CODE_CSI:
704 case ISO_CODE_SS2:
705 case ISO_CODE_SS3:
3f003981
KH
706 {
707 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
708
70c22245
KH
709 if (c != ISO_CODE_CSI)
710 {
711 if (coding_iso_8_1.flags & CODING_FLAG_ISO_SINGLE_SHIFT)
712 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
713 if (coding_iso_8_2.flags & CODING_FLAG_ISO_SINGLE_SHIFT)
714 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
715 }
3f003981
KH
716 if (VECTORP (Vlatin_extra_code_table)
717 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
718 {
719 if (coding_iso_8_1.flags & CODING_FLAG_ISO_LATIN_EXTRA)
720 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
721 if (coding_iso_8_2.flags & CODING_FLAG_ISO_LATIN_EXTRA)
722 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
723 }
724 mask &= newmask;
725 }
726 break;
4ed46869
KH
727
728 default:
729 if (c < 0x80)
730 break;
731 else if (c < 0xA0)
c4825358 732 {
3f003981
KH
733 if (VECTORP (Vlatin_extra_code_table)
734 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
c4825358 735 {
3f003981
KH
736 int newmask = 0;
737
738 if (coding_iso_8_1.flags & CODING_FLAG_ISO_LATIN_EXTRA)
739 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
740 if (coding_iso_8_2.flags & CODING_FLAG_ISO_LATIN_EXTRA)
741 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
742 mask &= newmask;
c4825358 743 }
3f003981
KH
744 else
745 return 0;
c4825358 746 }
4ed46869
KH
747 else
748 {
7717c392 749 unsigned char *src_begin = src;
4ed46869 750
7717c392
KH
751 mask &= ~(CODING_CATEGORY_MASK_ISO_7
752 | CODING_CATEGORY_MASK_ISO_7_ELSE);
e0e989f6 753 while (src < src_end && *src >= 0xA0)
7717c392
KH
754 src++;
755 if ((src - src_begin - 1) & 1 && src < src_end)
4ed46869
KH
756 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
757 }
758 break;
759 }
760 }
761
762 return mask;
763}
764
765/* Decode a character of which charset is CHARSET and the 1st position
bdd9fb48 766 code is C1. If dimension of CHARSET is 2, the 2nd position code is
4ed46869
KH
767 fetched from SRC and set to C2. If CHARSET is negative, it means
768 that we are decoding ill formed text, and what we can do is just to
769 read C1 as is. */
770
bdd9fb48
KH
/* This macro relies on names from the expanding function's scope:
   `coding', `dst', `src' and `src_end', the variable `c2', the
   Lisp object `unification_table', and (through ONE_MORE_BYTE) the
   label `label_end_of_loop', which is jumped to when the second
   byte of a dimension-2 character is not available.

   Steps, in order:
     1. At the head of a composition, emit LEADING_CODE_COMPOSITION
        (followed by 0xFF when rules are embedded) and advance
        coding->composing by 2.
     2. For a non-negative CHARSET of dimension 2, fetch C2; if its
        low seven bits are not in a graphic class, push the byte back
        and use a space as C2.
     3. If a unification table is in effect and unify_char returns a
        non-negative character, split it into charset_alt, C1 and C2.
     4. Emit the character in ASCII, dimension-1 or dimension-2 form.
     5. When composing with rules, record that a rule comes next.

   NOTE(review): for a dimension-1 CHARSET, step 3 passes `c2' to
   unify_char although this macro has not assigned it -- confirm that
   the caller initializes `c2' or that unify_char ignores it in that
   case.  */
#define DECODE_ISO_CHARACTER(charset, c1)                               \
  do {                                                                  \
    int c_alt, charset_alt = (charset);                                 \
    if (COMPOSING_HEAD_P (coding->composing))                           \
      {                                                                 \
        *dst++ = LEADING_CODE_COMPOSITION;                              \
        if (COMPOSING_WITH_RULE_P (coding->composing))                  \
          /* To tell composition rules are embedded.  */                \
          *dst++ = 0xFF;                                                \
        coding->composing += 2;                                         \
      }                                                                 \
    if ((charset) >= 0)                                                 \
      {                                                                 \
        if (CHARSET_DIMENSION (charset) == 2)                           \
          {                                                             \
            ONE_MORE_BYTE (c2);                                         \
            if (iso_code_class[(c2) & 0x7F] != ISO_0x20_or_0x7F         \
                && iso_code_class[(c2) & 0x7F] != ISO_graphic_plane_0)  \
              {                                                         \
                src--;                                                  \
                c2 = ' ';                                               \
              }                                                         \
          }                                                             \
        if (!NILP (unification_table)                                   \
            && ((c_alt = unify_char (unification_table,                 \
                                     -1, (charset), c1, c2)) >= 0))     \
          SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
      }                                                                 \
    if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
      DECODE_CHARACTER_ASCII (c1);                                      \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
      DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
    else                                                                \
      DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
    if (COMPOSING_WITH_RULE_P (coding->composing))                      \
      /* To tell a composition rule follows.  */                        \
      coding->composing = COMPOSING_WITH_RULE_RULE;                     \
  } while (0)
809
810/* Set designation state into CODING. */
/* Look up the charset identified by DIMENSION, CHARS and FINAL_CHAR
   in ISO_CHARSET_TABLE and, if the result is non-negative, record it
   as the charset designated to graphic register REG of `coding'
   (taken from the expanding function's scope).  A negative lookup
   result leaves the designation unchanged.  When coding->direction
   is 1 and the charset has a non-negative reverse charset, the
   reverse charset is recorded instead.

   The macro declares its own local `charset', which hides any
   variable of that name in the caller for the duration of the
   block.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)           \
  do {                                                                  \
    int charset = ISO_CHARSET_TABLE (make_number (dimension),           \
                                     make_number (chars),               \
                                     make_number (final_char));         \
    if (charset >= 0)                                                   \
      {                                                                 \
        if (coding->direction == 1                                      \
            && CHARSET_REVERSE_CHARSET (charset) >= 0)                  \
          charset = CHARSET_REVERSE_CHARSET (charset);                  \
        CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;            \
      }                                                                 \
  } while (0)
824
825/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
826
827int
828decode_coding_iso2022 (coding, source, destination,
829 src_bytes, dst_bytes, consumed)
830 struct coding_system *coding;
831 unsigned char *source, *destination;
832 int src_bytes, dst_bytes;
833 int *consumed;
834{
835 unsigned char *src = source;
836 unsigned char *src_end = source + src_bytes;
837 unsigned char *dst = destination;
838 unsigned char *dst_end = destination + dst_bytes;
839 /* Since the maximum bytes produced by each loop is 7, we subtract 6
840 from DST_END to assure that overflow checking is necessary only
841 at the head of loop. */
842 unsigned char *adjusted_dst_end = dst_end - 6;
843 int charset;
844 /* Charsets invoked to graphic plane 0 and 1 respectively. */
845 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
846 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
a5d301df
KH
847 Lisp_Object unification_table
848 = coding->character_unification_table_for_decode;
bdd9fb48
KH
849
850 if (!NILP (Venable_character_unification) && NILP (unification_table))
a5d301df 851 unification_table = Vstandard_character_unification_table_for_decode;
4ed46869
KH
852
853 while (src < src_end && dst < adjusted_dst_end)
854 {
855 /* SRC_BASE remembers the start position in source in each loop.
856 The loop will be exited when there's not enough source text
857 to analyze long escape sequence or 2-byte code (within macros
858 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
859 to SRC_BASE before exiting. */
860 unsigned char *src_base = src;
bdd9fb48 861 int c1 = *src++, c2;
4ed46869
KH
862
863 switch (iso_code_class [c1])
864 {
865 case ISO_0x20_or_0x7F:
866 if (!coding->composing
867 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
868 {
869 /* This is SPACE or DEL. */
870 *dst++ = c1;
871 break;
872 }
873 /* This is a graphic character, we fall down ... */
874
875 case ISO_graphic_plane_0:
876 if (coding->composing == COMPOSING_WITH_RULE_RULE)
877 {
878 /* This is a composition rule. */
879 *dst++ = c1 | 0x80;
880 coding->composing = COMPOSING_WITH_RULE_TAIL;
881 }
882 else
883 DECODE_ISO_CHARACTER (charset0, c1);
884 break;
885
886 case ISO_0xA0_or_0xFF:
887 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
888 {
889 /* Invalid code. */
890 *dst++ = c1;
891 break;
892 }
893 /* This is a graphic character, we fall down ... */
894
895 case ISO_graphic_plane_1:
896 DECODE_ISO_CHARACTER (charset1, c1);
897 break;
898
899 case ISO_control_code:
900 /* All ISO2022 control characters in this class have the
901 same representation in Emacs internal format. */
902 *dst++ = c1;
903 break;
904
905 case ISO_carriage_return:
906 if (coding->eol_type == CODING_EOL_CR)
907 {
908 *dst++ = '\n';
909 }
910 else if (coding->eol_type == CODING_EOL_CRLF)
911 {
912 ONE_MORE_BYTE (c1);
913 if (c1 == ISO_CODE_LF)
914 *dst++ = '\n';
915 else
916 {
917 src--;
918 *dst++ = c1;
919 }
920 }
921 else
922 {
923 *dst++ = c1;
924 }
925 break;
926
927 case ISO_shift_out:
e0e989f6
KH
928 if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
929 goto label_invalid_escape_sequence;
4ed46869
KH
930 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
931 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
932 break;
933
934 case ISO_shift_in:
935 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
936 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
937 break;
938
939 case ISO_single_shift_2_7:
940 case ISO_single_shift_2:
941 /* SS2 is handled as an escape sequence of ESC 'N' */
942 c1 = 'N';
943 goto label_escape_sequence;
944
945 case ISO_single_shift_3:
946 /* SS2 is handled as an escape sequence of ESC 'O' */
947 c1 = 'O';
948 goto label_escape_sequence;
949
950 case ISO_control_sequence_introducer:
951 /* CSI is handled as an escape sequence of ESC '[' ... */
952 c1 = '[';
953 goto label_escape_sequence;
954
955 case ISO_escape:
956 ONE_MORE_BYTE (c1);
957 label_escape_sequence:
958 /* Escape sequences handled by Emacs are invocation,
959 designation, direction specification, and character
960 composition specification. */
961 switch (c1)
962 {
963 case '&': /* revision of following character set */
964 ONE_MORE_BYTE (c1);
965 if (!(c1 >= '@' && c1 <= '~'))
e0e989f6 966 goto label_invalid_escape_sequence;
4ed46869
KH
967 ONE_MORE_BYTE (c1);
968 if (c1 != ISO_CODE_ESC)
e0e989f6 969 goto label_invalid_escape_sequence;
4ed46869
KH
970 ONE_MORE_BYTE (c1);
971 goto label_escape_sequence;
972
973 case '$': /* designation of 2-byte character set */
974 ONE_MORE_BYTE (c1);
975 if (c1 >= '@' && c1 <= 'B')
976 { /* designation of JISX0208.1978, GB2312.1980,
977 or JISX0208.1980 */
978 DECODE_DESIGNATION (0, 2, 94, c1);
979 }
980 else if (c1 >= 0x28 && c1 <= 0x2B)
981 { /* designation of DIMENSION2_CHARS94 character set */
982 ONE_MORE_BYTE (c2);
983 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
984 }
985 else if (c1 >= 0x2C && c1 <= 0x2F)
986 { /* designation of DIMENSION2_CHARS96 character set */
987 ONE_MORE_BYTE (c2);
988 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
989 }
990 else
e0e989f6 991 goto label_invalid_escape_sequence;
4ed46869
KH
992 break;
993
994 case 'n': /* invocation of locking-shift-2 */
e0e989f6
KH
995 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
996 goto label_invalid_escape_sequence;
4ed46869 997 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
e0e989f6 998 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
999 break;
1000
1001 case 'o': /* invocation of locking-shift-3 */
e0e989f6
KH
1002 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1003 goto label_invalid_escape_sequence;
4ed46869 1004 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
e0e989f6 1005 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
1006 break;
1007
1008 case 'N': /* invocation of single-shift-2 */
e0e989f6
KH
1009 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1010 goto label_invalid_escape_sequence;
4ed46869
KH
1011 ONE_MORE_BYTE (c1);
1012 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1013 DECODE_ISO_CHARACTER (charset, c1);
1014 break;
1015
1016 case 'O': /* invocation of single-shift-3 */
e0e989f6
KH
1017 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1018 goto label_invalid_escape_sequence;
4ed46869
KH
1019 ONE_MORE_BYTE (c1);
1020 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1021 DECODE_ISO_CHARACTER (charset, c1);
1022 break;
1023
1024 case '0': /* start composing without embeded rules */
1025 coding->composing = COMPOSING_NO_RULE_HEAD;
1026 break;
1027
1028 case '1': /* end composing */
1029 coding->composing = COMPOSING_NO;
1030 break;
1031
1032 case '2': /* start composing with embeded rules */
1033 coding->composing = COMPOSING_WITH_RULE_HEAD;
1034 break;
1035
1036 case '[': /* specification of direction */
1037 /* For the moment, nested direction is not supported.
1038 So, the value of `coding->direction' is 0 or 1: 0
1039 means left-to-right, 1 means right-to-left. */
1040 ONE_MORE_BYTE (c1);
1041 switch (c1)
1042 {
1043 case ']': /* end of the current direction */
1044 coding->direction = 0;
1045
1046 case '0': /* end of the current direction */
1047 case '1': /* start of left-to-right direction */
1048 ONE_MORE_BYTE (c1);
1049 if (c1 == ']')
1050 coding->direction = 0;
1051 else
1052 goto label_invalid_escape_sequence;
1053 break;
1054
1055 case '2': /* start of right-to-left direction */
1056 ONE_MORE_BYTE (c1);
1057 if (c1 == ']')
1058 coding->direction= 1;
1059 else
1060 goto label_invalid_escape_sequence;
1061 break;
1062
1063 default:
1064 goto label_invalid_escape_sequence;
1065 }
1066 break;
1067
1068 default:
1069 if (c1 >= 0x28 && c1 <= 0x2B)
1070 { /* designation of DIMENSION1_CHARS94 character set */
1071 ONE_MORE_BYTE (c2);
1072 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1073 }
1074 else if (c1 >= 0x2C && c1 <= 0x2F)
1075 { /* designation of DIMENSION1_CHARS96 character set */
1076 ONE_MORE_BYTE (c2);
1077 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1078 }
1079 else
1080 {
1081 goto label_invalid_escape_sequence;
1082 }
1083 }
1084 /* We must update these variables now. */
1085 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1086 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1087 break;
1088
1089 label_invalid_escape_sequence:
1090 {
1091 int length = src - src_base;
1092
1093 bcopy (src_base, dst, length);
1094 dst += length;
1095 }
1096 }
1097 continue;
1098
1099 label_end_of_loop:
1100 coding->carryover_size = src - src_base;
1101 bcopy (src_base, coding->carryover, coding->carryover_size);
1102 src = src_base;
1103 break;
1104 }
1105
1106 /* If this is the last block of the text to be decoded, we had
1107 better just flush out all remaining codes in the text although
1108 they are not valid characters. */
1109 if (coding->last_block)
1110 {
1111 bcopy (src, dst, src_end - src);
1112 dst += (src_end - src);
1113 src = src_end;
1114 }
1115 *consumed = src - source;
1116 return dst - destination;
1117}
1118
f4dee582 1119/* ISO2022 encoding stuff. */
4ed46869
KH
1120
1121/*
f4dee582 1122 It is not enough to say just "ISO2022" on encoding, we have to
4ed46869
KH
1123 specify more details. In Emacs, each coding-system of ISO2022
1124 variant has the following specifications:
1125 1. Initial designation to G0 thru G3.
1126 2. Allows short-form designation?
1127 3. ASCII should be designated to G0 before control characters?
1128 4. ASCII should be designated to G0 at end of line?
1129 5. 7-bit environment or 8-bit environment?
1130 6. Use locking-shift?
1131 7. Use Single-shift?
1132 And the following two are only for Japanese:
1133 8. Use ASCII in place of JIS0201-1976-Roman?
1134 9. Use JISX0208-1983 in place of JISX0208-1978?
1135 These specifications are encoded in `coding->flags' as flag bits
1136 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
f4dee582 1137 details.
4ed46869
KH
1138*/
1139
/* Write at DST the escape sequence that designates CHARSET to graphic
   register REG, then record the designation in CODING.

   If the revision number that CODING holds for CHARSET is below 255,
   an `ESC & <'@' + revision>' sequence is written first.  For a
   2-byte 94-char set whose <final-char> is '@', 'A' or 'B', REG is 0
   and CODING has CODING_FLAG_ISO_SHORT_FORM set, the intermediate
   character is omitted (short-form designation).  */

#define ENCODE_DESIGNATION(charset, reg, coding)                           \
  do {                                                                     \
    unsigned char final_byte = CHARSET_ISO_FINAL_CHAR (charset);           \
    int rev = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);            \
    int is_94 = CHARSET_CHARS (charset) == 94;                             \
    /* Intermediate characters indexed by graphic register number.  */    \
    const char *inter = is_94 ? "()*+" : ",-./";                           \
    if (rev < 255)                                                         \
      {                                                                    \
        *dst++ = ISO_CODE_ESC;                                             \
        *dst++ = '&';                                                      \
        *dst++ = '@' + rev;                                                \
      }                                                                    \
    *dst++ = ISO_CODE_ESC;                                                 \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      {                                                                    \
        *dst++ = '$';                                                      \
        /* Only the short form drops the intermediate character.  */      \
        if (! is_94                                                        \
            || ! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)              \
            || reg != 0                                                    \
            || final_byte < '@' || final_byte > 'B')                       \
          *dst++ = (unsigned char) (inter[reg]);                           \
      }                                                                    \
    else                                                                   \
      *dst++ = (unsigned char) (inter[reg]);                               \
    *dst++ = final_byte;                                                   \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                   \
  } while (0)
1181
/* Single-shift producers.  Each writes the single-shift-2 or
   single-shift-3 function at DST -- as `ESC N' / `ESC O' when CODING
   has CODING_FLAG_ISO_SEVEN_BITS set, otherwise as the one-byte
   control code -- and then marks CODING as being in single-shifting
   state.  */

#define ENCODE_SINGLE_SHIFT_2                                   \
  do {                                                          \
    if (!(coding->flags & CODING_FLAG_ISO_SEVEN_BITS))          \
      *dst++ = ISO_CODE_SS2;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'N';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                                   \
  do {                                                          \
    if (!(coding->flags & CODING_FLAG_ISO_SEVEN_BITS))          \
      *dst++ = ISO_CODE_SS3;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'O';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)
1203
/* Locking-shift producers (shift-in, shift-out, locking-shift-2 and
   locking-shift-3).  Each writes the control character or escape
   sequence at DST and records in CODING which graphic register is
   now invoked to graphic plane 0.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
    *dst++ = ISO_CODE_SI;                               \
  } while (0)

#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
    *dst++ = ISO_CODE_SO;                               \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'n';                                       \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'o';                                       \
  } while (0)
1231
f4dee582
RS
/* Write at DST the code of a DIMENSION1 character of CHARSET whose
   position-code is C1.  The macro loops: when CHARSET is invoked to
   neither graphic plane, encode_invocation_designation is called to
   produce the needed designation/invocation and the checks are then
   repeated.  Every branch that actually emits the character leaves
   the loop with `break'.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* A single-shift was just produced; this character uses it   \
           up.  */                                                      \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)          \
                  ? (c1 & 0x7F) : (c1 | 0x80));                         \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        *dst++ = c1 & 0x7F;                                             \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        *dst++ = c1 | 0x80;                                             \
        break;                                                          \
      }                                                                 \
    if (coding->flags & CODING_FLAG_ISO_SAFE                            \
        && !coding->safe_charsets[charset])                             \
      {                                                                 \
        /* We should not encode this character, instead produce one or \
           two `?'s.  */                                                \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
        if (CHARSET_WIDTH (charset) == 2)                               \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
        break;                                                          \
      }                                                                 \
    /* Since CHARSET is not yet invoked to any graphic planes, we      \
       must invoke it, or, at first, designate it to some graphic      \
       register.  Then repeat the loop to actually produce the         \
       character.  */                                                   \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
1275
f4dee582
RS
/* Write at DST the two-byte code of a DIMENSION2 character of CHARSET
   whose position-codes are C1 and C2.  The macro loops: when CHARSET
   is invoked to neither graphic plane, encode_invocation_designation
   is called to produce the needed designation/invocation and the
   checks are then repeated.  Every branch that actually emits the
   character leaves the loop with `break'.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* A single-shift was just produced; this character uses it   \
           up.  */                                                      \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            *dst++ = c1 & 0x7F;                                         \
            *dst++ = c2 & 0x7F;                                         \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = c1 | 0x80;                                         \
            *dst++ = c2 | 0x80;                                         \
          }                                                             \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        *dst++ = c1 & 0x7F;                                             \
        *dst++ = c2 & 0x7F;                                             \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        *dst++ = c1 | 0x80;                                             \
        *dst++ = c2 | 0x80;                                             \
        break;                                                          \
      }                                                                 \
    if (coding->flags & CODING_FLAG_ISO_SAFE                            \
        && !coding->safe_charsets[charset])                             \
      {                                                                 \
        /* We should not encode this character, instead produce one or \
           two `?'s.  */                                                \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
        if (CHARSET_WIDTH (charset) == 2)                               \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
        break;                                                          \
      }                                                                 \
    /* Since CHARSET is not yet invoked to any graphic planes, we      \
       must invoke it, or, at first, designate it to some graphic      \
       register.  Then repeat the loop to actually produce the         \
       character.  */                                                   \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
1318
bdd9fb48
KH
/* Encode one character of CHARSET with position-codes C1 and C2.  If
   a unification table is in effect and unify_char yields a
   non-negative character for it, that character is split back into
   charset and position-codes (overwriting C1 and C2) and encoded
   instead.  The result is dispatched on the dimension of the charset
   finally chosen.  */
#define ENCODE_ISO_CHARACTER(charset, c1, c2)                             \
  do {                                                                    \
    int c_alt = -1, charset_alt = charset;                                \
    if (!NILP (unification_table))                                        \
      c_alt = unify_char (unification_table, -1, charset, c1, c2);        \
    if (c_alt >= 0)                                                       \
      {                                                                   \
        SPLIT_CHAR (c_alt, charset_alt, c1, c2);                          \
      }                                                                   \
    if (CHARSET_DIMENSION (charset_alt) == 1)                             \
      ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1);                  \
    else                                                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2);              \
  } while (0)
1333
4ed46869
KH
1334/* Produce designation and invocation codes at a place pointed by DST
1335 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1336 Return new DST. */
1337
1338unsigned char *
1339encode_invocation_designation (charset, coding, dst)
1340 int charset;
1341 struct coding_system *coding;
1342 unsigned char *dst;
1343{
1344 int reg; /* graphic register number */
1345
1346 /* At first, check designations. */
1347 for (reg = 0; reg < 4; reg++)
1348 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1349 break;
1350
1351 if (reg >= 4)
1352 {
1353 /* CHARSET is not yet designated to any graphic registers. */
1354 /* At first check the requested designation. */
1355 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1ba9e4ab
KH
1356 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1357 /* Since CHARSET requests no special designation, designate it
1358 to graphic register 0. */
4ed46869
KH
1359 reg = 0;
1360
1361 ENCODE_DESIGNATION (charset, reg, coding);
1362 }
1363
1364 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1365 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1366 {
1367 /* Since the graphic register REG is not invoked to any graphic
1368 planes, invoke it to graphic plane 0. */
1369 switch (reg)
1370 {
1371 case 0: /* graphic register 0 */
1372 ENCODE_SHIFT_IN;
1373 break;
1374
1375 case 1: /* graphic register 1 */
1376 ENCODE_SHIFT_OUT;
1377 break;
1378
1379 case 2: /* graphic register 2 */
1380 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1381 ENCODE_SINGLE_SHIFT_2;
1382 else
1383 ENCODE_LOCKING_SHIFT_2;
1384 break;
1385
1386 case 3: /* graphic register 3 */
1387 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1388 ENCODE_SINGLE_SHIFT_3;
1389 else
1390 ENCODE_LOCKING_SHIFT_3;
1391 break;
1392 }
1393 }
1394 return dst;
1395}
1396
/* The following three macros write at DST the escape sequences that
   indicate composition: `ESC 0' starts a composition without rules,
   `ESC 2' starts a composition with rules, and `ESC 1' ends a
   composition.  */
#define ENCODE_COMPOSITION_NO_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
#define ENCODE_COMPOSITION_WITH_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '2')
#define ENCODE_COMPOSITION_END \
  (*dst++ = ISO_CODE_ESC, *dst++ = '1')
1401
/* The following three macros produce codes for indicating direction
   of text.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER writes CSI at DST: as `ESC ['
   when CODING has the CODING_FLAG_ISO_SEVEN_BITS bit set (tested as a
   bit, so other flags may be set too), otherwise as the one-byte
   control code.  ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R append
   `2 ]' and `0 ]' respectively.  All three are single statements and
   must be followed by a semicolon.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = '[';                                   \
      }                                                 \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

#define ENCODE_DIRECTION_R2L                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '2';                                       \
    *dst++ = ']';                                       \
  } while (0)

#define ENCODE_DIRECTION_L2R                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '0';                                       \
    *dst++ = ']';                                       \
  } while (0)
1417
/* Bring the graphic planes and registers back to their initial state:
   write shift-in at DST unless graphic register 0 is already invoked
   to graphic plane 0, then re-designate every graphic register whose
   current charset differs from its (non-negative) initial one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                     \
  do {                                                                      \
    int reset_reg;                                                          \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
      ENCODE_SHIFT_IN;                                                      \
    for (reset_reg = 0; reset_reg < 4; reset_reg++)                         \
      {                                                                     \
        int initial_charset                                                 \
          = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reset_reg);        \
        /* Skip registers with no initial charset or already reset.  */    \
        if (initial_charset < 0                                             \
            || (CODING_SPEC_ISO_DESIGNATION (coding, reset_reg)             \
                == initial_charset))                                        \
          continue;                                                         \
        ENCODE_DESIGNATION (initial_charset, reset_reg, coding);            \
      }                                                                     \
  } while (0)
1432
bdd9fb48
KH
1433/* Produce designation sequences of charsets in the line started from
1434 *SRC to a place pointed by DSTP.
1435
1436 If the current block ends before any end-of-line, we may fail to
1437 find all the necessary *designations. */
1438encode_designation_at_bol (coding, table, src, src_end, dstp)
e0e989f6 1439 struct coding_system *coding;
bdd9fb48 1440 Lisp_Object table;
e0e989f6
KH
1441 unsigned char *src, *src_end, **dstp;
1442{
bdd9fb48
KH
1443 int charset, c, found = 0, reg;
1444 /* Table of charsets to be designated to each graphic register. */
1445 int r[4];
1446 unsigned char *dst = *dstp;
1447
1448 for (reg = 0; reg < 4; reg++)
1449 r[reg] = -1;
1450
1451 while (src < src_end && *src != '\n' && found < 4)
e0e989f6 1452 {
bdd9fb48
KH
1453 int bytes = BYTES_BY_CHAR_HEAD (*src);
1454
1455 if (NILP (table))
1456 charset = CHARSET_AT (src);
1457 else
e0e989f6 1458 {
35cb8686
RS
1459 int c_alt;
1460 unsigned char c1, c2;
bdd9fb48
KH
1461
1462 SPLIT_STRING(src, bytes, charset, c1, c2);
1463 if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1464 charset = CHAR_CHARSET (c_alt);
e0e989f6 1465 }
bdd9fb48 1466
e0e989f6 1467 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
70c22245 1468 if (r[reg] < 0)
bdd9fb48
KH
1469 {
1470 found++;
1471 r[reg] = charset;
1472 }
1473
1474 src += bytes;
1475 }
1476
1477 if (found)
1478 {
1479 for (reg = 0; reg < 4; reg++)
1480 if (r[reg] >= 0
1481 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1482 ENCODE_DESIGNATION (r[reg], reg, coding);
1483 *dstp = dst;
e0e989f6 1484 }
e0e989f6
KH
1485}
1486
4ed46869
KH
1487/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1488
1489int
1490encode_coding_iso2022 (coding, source, destination,
1491 src_bytes, dst_bytes, consumed)
1492 struct coding_system *coding;
1493 unsigned char *source, *destination;
1494 int src_bytes, dst_bytes;
1495 int *consumed;
1496{
1497 unsigned char *src = source;
1498 unsigned char *src_end = source + src_bytes;
1499 unsigned char *dst = destination;
1500 unsigned char *dst_end = destination + dst_bytes;
e0e989f6 1501 /* Since the maximum bytes produced by each loop is 20, we subtract 19
4ed46869
KH
1502 from DST_END to assure overflow checking is necessary only at the
1503 head of loop. */
e0e989f6 1504 unsigned char *adjusted_dst_end = dst_end - 19;
a5d301df
KH
1505 Lisp_Object unification_table
1506 = coding->character_unification_table_for_encode;
bdd9fb48
KH
1507
1508 if (!NILP (Venable_character_unification) && NILP (unification_table))
a5d301df 1509 unification_table = Vstandard_character_unification_table_for_encode;
4ed46869
KH
1510
1511 while (src < src_end && dst < adjusted_dst_end)
1512 {
1513 /* SRC_BASE remembers the start position in source in each loop.
1514 The loop will be exited when there's not enough source text
1515 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1516 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1517 reset to SRC_BASE before exiting. */
1518 unsigned char *src_base = src;
bdd9fb48 1519 int charset, c1, c2, c3, c4;
4ed46869 1520
e0e989f6
KH
1521 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1522 && CODING_SPEC_ISO_BOL (coding))
1523 {
bdd9fb48
KH
1524 /* We have to produce designation sequences if any now. */
1525 encode_designation_at_bol (coding, unification_table,
1526 src, src_end, &dst);
e0e989f6
KH
1527 CODING_SPEC_ISO_BOL (coding) = 0;
1528 }
1529
1530 c1 = *src++;
4ed46869
KH
1531 /* If we are seeing a component of a composite character, we are
1532 seeing a leading-code specially encoded for composition, or a
1533 composition rule if composing with rule. We must set C1
1534 to a normal leading-code or an ASCII code. If we are not at
1535 a composed character, we must reset the composition state. */
1536 if (COMPOSING_P (coding->composing))
1537 {
1538 if (c1 < 0xA0)
1539 {
1540 /* We are not in a composite character any longer. */
1541 coding->composing = COMPOSING_NO;
1542 ENCODE_COMPOSITION_END;
1543 }
1544 else
1545 {
1546 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1547 {
1548 *dst++ = c1 & 0x7F;
1549 coding->composing = COMPOSING_WITH_RULE_HEAD;
1550 continue;
1551 }
1552 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1553 coding->composing = COMPOSING_WITH_RULE_RULE;
1554 if (c1 == 0xA0)
1555 {
1556 /* This is an ASCII component. */
1557 ONE_MORE_BYTE (c1);
1558 c1 &= 0x7F;
1559 }
1560 else
1561 /* This is a leading-code of non ASCII component. */
1562 c1 -= 0x20;
1563 }
1564 }
1565
1566 /* Now encode one character. C1 is a control character, an
1567 ASCII character, or a leading-code of multi-byte character. */
1568 switch (emacs_code_class[c1])
1569 {
1570 case EMACS_ascii_code:
bdd9fb48 1571 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
4ed46869
KH
1572 break;
1573
1574 case EMACS_control_code:
1575 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1576 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1577 *dst++ = c1;
1578 break;
1579
1580 case EMACS_carriage_return_code:
1581 if (!coding->selective)
1582 {
1583 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1584 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1585 *dst++ = c1;
1586 break;
1587 }
1588 /* fall down to treat '\r' as '\n' ... */
1589
1590 case EMACS_linefeed_code:
1591 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
e0e989f6
KH
1592 ENCODE_RESET_PLANE_AND_REGISTER;
1593 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1594 bcopy (coding->spec.iso2022.initial_designation,
1595 coding->spec.iso2022.current_designation,
1596 sizeof coding->spec.iso2022.initial_designation);
4ed46869 1597 if (coding->eol_type == CODING_EOL_LF
0ef69138 1598 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
1599 *dst++ = ISO_CODE_LF;
1600 else if (coding->eol_type == CODING_EOL_CRLF)
1601 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1602 else
1603 *dst++ = ISO_CODE_CR;
e0e989f6 1604 CODING_SPEC_ISO_BOL (coding) = 1;
4ed46869
KH
1605 break;
1606
1607 case EMACS_leading_code_2:
1608 ONE_MORE_BYTE (c2);
19a8d9e0
KH
1609 if (c2 < 0xA0)
1610 {
1611 /* invalid sequence */
1612 *dst++ = c1;
1613 *dst++ = c2;
1614 }
1615 else
1616 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
1617 break;
1618
1619 case EMACS_leading_code_3:
1620 TWO_MORE_BYTES (c2, c3);
19a8d9e0
KH
1621 if (c2 < 0xA0 || c3 < 0xA0)
1622 {
1623 /* invalid sequence */
1624 *dst++ = c1;
1625 *dst++ = c2;
1626 *dst++ = c3;
1627 }
1628 else if (c1 < LEADING_CODE_PRIVATE_11)
bdd9fb48 1629 ENCODE_ISO_CHARACTER (c1, c2, c3);
4ed46869 1630 else
bdd9fb48 1631 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
4ed46869
KH
1632 break;
1633
1634 case EMACS_leading_code_4:
1635 THREE_MORE_BYTES (c2, c3, c4);
19a8d9e0
KH
1636 if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1637 {
1638 /* invalid sequence */
1639 *dst++ = c1;
1640 *dst++ = c2;
1641 *dst++ = c3;
1642 *dst++ = c4;
1643 }
1644 else
1645 ENCODE_ISO_CHARACTER (c2, c3, c4);
4ed46869
KH
1646 break;
1647
1648 case EMACS_leading_code_composition:
19a8d9e0
KH
1649 ONE_MORE_BYTE (c2);
1650 if (c2 < 0xA0)
1651 {
1652 /* invalid sequence */
1653 *dst++ = c1;
1654 *dst++ = c2;
1655 }
1656 else if (c2 == 0xFF)
4ed46869
KH
1657 {
1658 coding->composing = COMPOSING_WITH_RULE_HEAD;
1659 ENCODE_COMPOSITION_WITH_RULE_START;
1660 }
1661 else
1662 {
1663 /* Rewind one byte because it is a character code of
1664 composition elements. */
1665 src--;
1666 coding->composing = COMPOSING_NO_RULE_HEAD;
1667 ENCODE_COMPOSITION_NO_RULE_START;
1668 }
1669 break;
1670
1671 case EMACS_invalid_code:
1672 *dst++ = c1;
1673 break;
1674 }
1675 continue;
1676 label_end_of_loop:
76376439
KH
1677 /* We reach here because the source date ends not at character
1678 boundary. */
1679 coding->carryover_size = src_end - src_base;
4ed46869 1680 bcopy (src_base, coding->carryover, coding->carryover_size);
76376439 1681 src = src_end;
4ed46869
KH
1682 break;
1683 }
1684
1685 /* If this is the last block of the text to be encoded, we must
bdd9fb48
KH
1686 reset graphic planes and registers to the initial state. */
1687 if (src >= src_end && coding->last_block)
4ed46869 1688 {
e0e989f6 1689 ENCODE_RESET_PLANE_AND_REGISTER;
bdd9fb48
KH
1690 if (coding->carryover_size > 0
1691 && coding->carryover_size < (dst_end - dst))
1692 {
1693 bcopy (coding->carryover, dst, coding->carryover_size);
1694 dst += coding->carryover_size;
1695 coding->carryover_size = 0;
1696 }
4ed46869
KH
1697 }
1698 *consumed = src - source;
1699 return dst - destination;
1700}
1701
1702\f
1703/*** 4. SJIS and BIG5 handlers ***/
1704
f4dee582 1705/* Although SJIS and BIG5 are not ISO's coding system, they are used
4ed46869
KH
1706 quite widely. So, for the moment, Emacs supports them in the bare
1707 C code. But, in the future, they may be supported only by CCL. */
1708
1709/* SJIS is a coding system encoding three character sets: ASCII, right
1710 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
1711 as is. A character of charset katakana-jisx0201 is encoded by
1712 "position-code + 0x80". A character of charset japanese-jisx0208
1713 is encoded in 2-byte but two position-codes are divided and shifted
1714 so that they fit in the range below.
1715
1716 --- CODE RANGE of SJIS ---
1717 (character set) (range)
1718 ASCII 0x00 .. 0x7F
1719 KATAKANA-JISX0201 0xA0 .. 0xDF
1720 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1721 (2nd byte) 0x40 .. 0xFF
1722 -------------------------------
1723
1724*/
1725
1726/* BIG5 is a coding system encoding two character sets: ASCII and
1727 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1728 character set and is encoded in two-byte.
1729
1730 --- CODE RANGE of BIG5 ---
1731 (character set) (range)
1732 ASCII 0x00 .. 0x7F
1733 Big5 (1st byte) 0xA1 .. 0xFE
1734 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1735 --------------------------
1736
1737 Since the number of characters in Big5 is larger than maximum
1738 characters in Emacs' charset (96x96), it can't be handled as one
1739 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
1740 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
1741 contains frequently used characters and the latter contains less
1742 frequently used characters. */
1743
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   DECODE_BIG5 reads B1 and B2 and stores into CHARSET, C1 and C2;
   ENCODE_BIG5 reads CHARSET, C1 and C2 and stores into B1 and B2.
   Input arguments may be arbitrary expressions (they are
   parenthesized in the expansion) but are evaluated more than once;
   output arguments must be lvalues.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    /* Linear index of the character counted from (0xA1, 0x40); the    \
       gap 0x7F..0xA0 in the 2nd byte is skipped.  */                   \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    /* Map 0..62 to 0x40..0x7E and 63..156 to 0xA1..0xFE.  */           \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
1776
a5d301df
KH
/* Produce at DST the internal representation of the character of
   CHARSET whose position-codes are C1 and C2.  If a unification table
   is in effect and unify_char yields a non-negative character, that
   character is split back into charset and position-codes
   (overwriting C1 and C2) and used instead.  A resulting charset that
   is ASCII or negative is emitted through DECODE_CHARACTER_ASCII.  */
#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
  do {                                                                  \
    int c_alt, charset_alt = (charset);                                 \
    if (!NILP (unification_table))                                      \
      {                                                                 \
        c_alt = unify_char (unification_table, -1, (charset), c1, c2);  \
        if (c_alt >= 0)                                                 \
          {                                                             \
            SPLIT_CHAR (c_alt, charset_alt, c1, c2);                    \
          }                                                             \
      }                                                                 \
    if (charset_alt != CHARSET_ASCII && charset_alt >= 0)               \
      {                                                                 \
        if (CHARSET_DIMENSION (charset_alt) == 1)                       \
          DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                \
        else                                                            \
          DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);            \
      }                                                                 \
    else                                                                \
      DECODE_CHARACTER_ASCII (c1);                                      \
  } while (0)
1791
/* Produce at DST the SJIS (if `sjis_p' is nonzero) or BIG5 code of
   the character of CHARSET whose position-codes are C1 and C2.  If a
   unification table is in effect and unify_char yields a non-negative
   character, that character is split back into charset and
   position-codes (overwriting C1 and C2) and encoded instead.

   ASCII is written as is.  katakana-jisx0201 (SJIS only) is written
   as the single byte C1.  jisx0208 (SJIS only) and the two Big5
   charsets (BIG5 only) are converted with ENCODE_SJIS / ENCODE_BIG5.
   Any other charset is written as its charset number followed by the
   position-code(s).

   The expansion is a single statement with no trailing semicolon, so
   the macro can be used safely as the body of an unbraced `if' that
   has an `else'.  */
#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                       \
  do {                                                                    \
    int c_alt, charset_alt;                                               \
    if (!NILP (unification_table)                                         \
        && ((c_alt = unify_char (unification_table, -1, (charset), c1, c2)) \
            >= 0))                                                        \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                            \
    else                                                                  \
      charset_alt = (charset);                                            \
    if (charset_alt == charset_ascii)                                     \
      *dst++ = c1;                                                        \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                        \
      {                                                                   \
        if (sjis_p && charset_alt == charset_katakana_jisx0201)           \
          *dst++ = c1;                                                    \
        else                                                              \
          *dst++ = charset_alt, *dst++ = c1;                              \
      }                                                                   \
    else                                                                  \
      {                                                                   \
        /* ENCODE_SJIS and ENCODE_BIG5 take 7-bit position-codes.  */     \
        c1 &= 0x7F, c2 &= 0x7F;                                           \
        if (sjis_p && charset_alt == charset_jisx0208)                    \
          {                                                               \
            unsigned char s1, s2;                                         \
                                                                          \
            ENCODE_SJIS (c1, c2, s1, s2);                                 \
            *dst++ = s1, *dst++ = s2;                                     \
          }                                                               \
        else if (!sjis_p                                                  \
                 && (charset_alt == charset_big5_1                        \
                     || charset_alt == charset_big5_2))                   \
          {                                                               \
            unsigned char b1, b2;                                         \
                                                                          \
            ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);                    \
            *dst++ = b1, *dst++ = b2;                                     \
          }                                                               \
        else                                                              \
          *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;                 \
      }                                                                   \
  } while (0)
1833
4ed46869
KH
1834/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1835 Check if a text is encoded in SJIS. If it is, return
1836 CODING_CATEGORY_MASK_SJIS, else return 0. */
1837
1838int
1839detect_coding_sjis (src, src_end)
1840 unsigned char *src, *src_end;
1841{
1842 unsigned char c;
1843
1844 while (src < src_end)
1845 {
1846 c = *src++;
1847 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1848 return 0;
1849 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1850 {
1851 if (src < src_end && *src++ < 0x40)
1852 return 0;
1853 }
1854 }
1855 return CODING_CATEGORY_MASK_SJIS;
1856}
1857
1858/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1859 Check if a text is encoded in BIG5. If it is, return
1860 CODING_CATEGORY_MASK_BIG5, else return 0. */
1861
1862int
1863detect_coding_big5 (src, src_end)
1864 unsigned char *src, *src_end;
1865{
1866 unsigned char c;
1867
1868 while (src < src_end)
1869 {
1870 c = *src++;
1871 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1872 return 0;
1873 if (c >= 0xA1)
1874 {
1875 if (src >= src_end)
1876 break;
1877 c = *src++;
1878 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1879 return 0;
1880 }
1881 }
1882 return CODING_CATEGORY_MASK_BIG5;
1883}
1884
1885/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1886 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
1887
1888int
1889decode_coding_sjis_big5 (coding, source, destination,
1890 src_bytes, dst_bytes, consumed, sjis_p)
1891 struct coding_system *coding;
1892 unsigned char *source, *destination;
1893 int src_bytes, dst_bytes;
1894 int *consumed;
1895 int sjis_p;
1896{
1897 unsigned char *src = source;
1898 unsigned char *src_end = source + src_bytes;
1899 unsigned char *dst = destination;
1900 unsigned char *dst_end = destination + dst_bytes;
1901 /* Since the maximum bytes produced by each loop is 4, we subtract 3
1902 from DST_END to assure overflow checking is necessary only at the
1903 head of loop. */
1904 unsigned char *adjusted_dst_end = dst_end - 3;
a5d301df
KH
1905 Lisp_Object unification_table
1906 = coding->character_unification_table_for_decode;
1907
1908 if (!NILP (Venable_character_unification) && NILP (unification_table))
1909 unification_table = Vstandard_character_unification_table_for_decode;
4ed46869
KH
1910
1911 while (src < src_end && dst < adjusted_dst_end)
1912 {
1913 /* SRC_BASE remembers the start position in source in each loop.
1914 The loop will be exited when there's not enough source text
1915 to analyze two-byte character (within macro ONE_MORE_BYTE).
1916 In that case, SRC is reset to SRC_BASE before exiting. */
1917 unsigned char *src_base = src;
1918 unsigned char c1 = *src++, c2, c3, c4;
1919
1920 if (c1 == '\r')
1921 {
1922 if (coding->eol_type == CODING_EOL_CRLF)
1923 {
1924 ONE_MORE_BYTE (c2);
1925 if (c2 == '\n')
1926 *dst++ = c2;
1927 else
1928 /* To process C2 again, SRC is subtracted by 1. */
1929 *dst++ = c1, src--;
1930 }
774324d6
KH
1931 else if (coding->eol_type == CODING_EOL_CR)
1932 *dst++ = '\n';
4ed46869
KH
1933 else
1934 *dst++ = c1;
1935 }
a5d301df 1936 else if (c1 < 0x20)
4ed46869 1937 *dst++ = c1;
a5d301df
KH
1938 else if (c1 < 0x80)
1939 DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
4ed46869
KH
1940 else if (c1 < 0xA0 || c1 >= 0xE0)
1941 {
1942 /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1943 if (sjis_p)
1944 {
1945 ONE_MORE_BYTE (c2);
1946 DECODE_SJIS (c1, c2, c3, c4);
a5d301df 1947 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
4ed46869
KH
1948 }
1949 else if (c1 >= 0xE0 && c1 < 0xFF)
1950 {
1951 int charset;
1952
1953 ONE_MORE_BYTE (c2);
1954 DECODE_BIG5 (c1, c2, charset, c3, c4);
a5d301df 1955 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
4ed46869
KH
1956 }
1957 else /* Invalid code */
1958 *dst++ = c1;
1959 }
1960 else
1961 {
1962 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1963 if (sjis_p)
a5d301df 1964 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1, /* dummy */ c2);
4ed46869
KH
1965 else
1966 {
1967 int charset;
1968
1969 ONE_MORE_BYTE (c2);
1970 DECODE_BIG5 (c1, c2, charset, c3, c4);
a5d301df 1971 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
4ed46869
KH
1972 }
1973 }
1974 continue;
1975
1976 label_end_of_loop:
1977 coding->carryover_size = src - src_base;
1978 bcopy (src_base, coding->carryover, coding->carryover_size);
1979 src = src_base;
1980 break;
1981 }
1982
1983 *consumed = src - source;
1984 return dst - destination;
1985}
1986
1987/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1988 This function can encode `charset_ascii', `charset_katakana_jisx0201',
1989 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
1990 sure that all these charsets are registered as official charset
1991 (i.e. do not have extended leading-codes). Characters of other
1992 charsets are produced without any encoding. If SJIS_P is 1, encode
1993 SJIS text, else encode BIG5 text. */
1994
1995int
1996encode_coding_sjis_big5 (coding, source, destination,
1997 src_bytes, dst_bytes, consumed, sjis_p)
1998 struct coding_system *coding;
1999 unsigned char *source, *destination;
2000 int src_bytes, dst_bytes;
2001 int *consumed;
2002 int sjis_p;
2003{
2004 unsigned char *src = source;
2005 unsigned char *src_end = source + src_bytes;
2006 unsigned char *dst = destination;
2007 unsigned char *dst_end = destination + dst_bytes;
2008 /* Since the maximum bytes produced by each loop is 2, we subtract 1
2009 from DST_END to assure overflow checking is necessary only at the
2010 head of loop. */
2011 unsigned char *adjusted_dst_end = dst_end - 1;
a5d301df
KH
2012 Lisp_Object unification_table
2013 = coding->character_unification_table_for_encode;
2014
2015 if (!NILP (Venable_character_unification) && NILP (unification_table))
2016 unification_table = Vstandard_character_unification_table_for_encode;
4ed46869
KH
2017
2018 while (src < src_end && dst < adjusted_dst_end)
2019 {
2020 /* SRC_BASE remembers the start position in source in each loop.
2021 The loop will be exited when there's not enough source text
2022 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2023 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
2024 before exiting. */
2025 unsigned char *src_base = src;
2026 unsigned char c1 = *src++, c2, c3, c4;
2027
2028 if (coding->composing)
2029 {
2030 if (c1 == 0xA0)
2031 {
2032 ONE_MORE_BYTE (c1);
2033 c1 &= 0x7F;
2034 }
2035 else if (c1 >= 0xA0)
2036 c1 -= 0x20;
2037 else
2038 coding->composing = 0;
2039 }
2040
2041 switch (emacs_code_class[c1])
2042 {
2043 case EMACS_ascii_code:
a5d301df
KH
2044 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2045 break;
2046
4ed46869
KH
2047 case EMACS_control_code:
2048 *dst++ = c1;
2049 break;
2050
2051 case EMACS_carriage_return_code:
2052 if (!coding->selective)
2053 {
2054 *dst++ = c1;
2055 break;
2056 }
2057 /* fall down to treat '\r' as '\n' ... */
2058
2059 case EMACS_linefeed_code:
2060 if (coding->eol_type == CODING_EOL_LF
0ef69138 2061 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2062 *dst++ = '\n';
2063 else if (coding->eol_type == CODING_EOL_CRLF)
2064 *dst++ = '\r', *dst++ = '\n';
2065 else
2066 *dst++ = '\r';
2067 break;
2068
2069 case EMACS_leading_code_2:
2070 ONE_MORE_BYTE (c2);
a5d301df 2071 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
2072 break;
2073
2074 case EMACS_leading_code_3:
2075 TWO_MORE_BYTES (c2, c3);
a5d301df 2076 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
4ed46869
KH
2077 break;
2078
2079 case EMACS_leading_code_4:
2080 THREE_MORE_BYTES (c2, c3, c4);
a5d301df 2081 ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
4ed46869
KH
2082 break;
2083
2084 case EMACS_leading_code_composition:
2085 coding->composing = 1;
2086 break;
2087
2088 default: /* i.e. case EMACS_invalid_code: */
2089 *dst++ = c1;
2090 }
2091 continue;
2092
2093 label_end_of_loop:
76376439 2094 coding->carryover_size = src_end - src_base;
4ed46869 2095 bcopy (src_base, coding->carryover, coding->carryover_size);
76376439 2096 src = src_end;
4ed46869
KH
2097 break;
2098 }
2099
2100 *consumed = src - source;
2101 return dst - destination;
2102}
2103
2104\f
2105/*** 5. End-of-line handlers ***/
2106
2107/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2108 This function is called only when `coding->eol_type' is
2109 CODING_EOL_CRLF or CODING_EOL_CR. */
2110
2111decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2112 struct coding_system *coding;
2113 unsigned char *source, *destination;
2114 int src_bytes, dst_bytes;
2115 int *consumed;
2116{
2117 unsigned char *src = source;
2118 unsigned char *src_end = source + src_bytes;
2119 unsigned char *dst = destination;
2120 unsigned char *dst_end = destination + dst_bytes;
2121 int produced;
2122
2123 switch (coding->eol_type)
2124 {
2125 case CODING_EOL_CRLF:
2126 {
2127 /* Since the maximum bytes produced by each loop is 2, we
2128 subtract 1 from DST_END to assure overflow checking is
2129 necessary only at the head of loop. */
2130 unsigned char *adjusted_dst_end = dst_end - 1;
2131
2132 while (src < src_end && dst < adjusted_dst_end)
2133 {
2134 unsigned char *src_base = src;
2135 unsigned char c = *src++;
2136 if (c == '\r')
2137 {
2138 ONE_MORE_BYTE (c);
2139 if (c != '\n')
2140 *dst++ = '\r';
bfd99048 2141 *dst++ = c;
4ed46869
KH
2142 }
2143 else
2144 *dst++ = c;
2145 continue;
2146
2147 label_end_of_loop:
2148 coding->carryover_size = src - src_base;
2149 bcopy (src_base, coding->carryover, coding->carryover_size);
2150 src = src_base;
2151 break;
2152 }
2153 *consumed = src - source;
2154 produced = dst - destination;
2155 break;
2156 }
2157
2158 case CODING_EOL_CR:
2159 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2160 bcopy (source, destination, produced);
2161 dst_end = destination + produced;
2162 while (dst < dst_end)
2163 if (*dst++ == '\r') dst[-1] = '\n';
2164 *consumed = produced;
2165 break;
2166
2167 default: /* i.e. case: CODING_EOL_LF */
2168 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2169 bcopy (source, destination, produced);
2170 *consumed = produced;
2171 break;
2172 }
2173
2174 return produced;
2175}
2176
2177/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2178 format of end-of-line according to `coding->eol_type'. If
2179 `coding->selective' is 1, code '\r' in source text also means
2180 end-of-line. */
2181
2182encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2183 struct coding_system *coding;
2184 unsigned char *source, *destination;
2185 int src_bytes, dst_bytes;
2186 int *consumed;
2187{
2188 unsigned char *src = source;
2189 unsigned char *dst = destination;
2190 int produced;
2191
2192 if (src_bytes <= 0)
2193 return 0;
2194
2195 switch (coding->eol_type)
2196 {
2197 case CODING_EOL_LF:
0ef69138 2198 case CODING_EOL_UNDECIDED:
4ed46869
KH
2199 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2200 bcopy (source, destination, produced);
2201 if (coding->selective)
2202 {
2203 int i = produced;
2204 while (i--)
2205 if (*dst++ == '\r') dst[-1] = '\n';
2206 }
2207 *consumed = produced;
2208
2209 case CODING_EOL_CRLF:
2210 {
2211 unsigned char c;
2212 unsigned char *src_end = source + src_bytes;
2213 unsigned char *dst_end = destination + dst_bytes;
2214 /* Since the maximum bytes produced by each loop is 2, we
2215 subtract 1 from DST_END to assure overflow checking is
2216 necessary only at the head of loop. */
2217 unsigned char *adjusted_dst_end = dst_end - 1;
2218
2219 while (src < src_end && dst < adjusted_dst_end)
2220 {
2221 c = *src++;
2222 if (c == '\n' || (c == '\r' && coding->selective))
2223 *dst++ = '\r', *dst++ = '\n';
2224 else
2225 *dst++ = c;
2226 }
2227 produced = dst - destination;
2228 *consumed = src - source;
2229 break;
2230 }
2231
2232 default: /* i.e. case CODING_EOL_CR: */
2233 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2234 bcopy (source, destination, produced);
2235 {
2236 int i = produced;
2237 while (i--)
2238 if (*dst++ == '\n') dst[-1] = '\r';
2239 }
2240 *consumed = produced;
2241 }
2242
2243 return produced;
2244}
2245
2246\f
2247/*** 6. C library functions ***/
2248
2249/* In Emacs Lisp, coding system is represented by a Lisp symbol which
2250 has a property `coding-system'. The value of this property is a
2251 vector of length 5 (called as coding-vector). Among elements of
2252 this vector, the first (element[0]) and the fifth (element[4])
2253 carry important information for decoding/encoding. Before
2254 decoding/encoding, this information should be set in fields of a
2255 structure of type `coding_system'.
2256
2257 A value of property `coding-system' can be a symbol of another
2258 subsidiary coding-system. In that case, Emacs gets coding-vector
2259 from that symbol.
2260
2261 `element[0]' contains information to be set in `coding->type'. The
2262 value and its meaning is as follows:
2263
0ef69138
KH
2264 0 -- coding_type_emacs_mule
2265 1 -- coding_type_sjis
2266 2 -- coding_type_iso2022
2267 3 -- coding_type_big5
2268 4 -- coding_type_ccl encoder/decoder written in CCL
2269 nil -- coding_type_no_conversion
2270 t -- coding_type_undecided (automatic conversion on decoding,
2271 no-conversion on encoding)
4ed46869
KH
2272
2273 `element[4]' contains information to be set in `coding->flags' and
2274 `coding->spec'. The meaning varies by `coding->type'.
2275
2276 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2277 of length 32 (of which the first 17 sub-elements are used now).
2278 Meanings of these sub-elements are:
2279
2280 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2281 If the value is an integer of valid charset, the charset is
2282 assumed to be designated to graphic register N initially.
2283
2284 If the value is minus, it is a minus value of charset which
2285 reserves graphic register N, which means that the charset is
2286 not designated initially but should be designated to graphic
2287 register N just before encoding a character in that charset.
2288
2289 If the value is nil, graphic register N is never used on
2290 encoding.
2291
2292 sub-element[N] where N is 4 through 16: to be set in `coding->flags'
2293 Each value takes t or nil. See the section ISO2022 of
2294 `coding.h' for more information.
2295
2296 If `coding->type' is `coding_type_big5', element[4] is t to denote
2297 BIG5-ETen or nil to denote BIG5-HKU.
2298
2299 If `coding->type' takes the other value, element[4] is ignored.
2300
2301 Emacs Lisp's coding system also carries information about format of
2302 end-of-line in a value of property `eol-type'. If the value is
2303 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2304 means CODING_EOL_CR. If it is not integer, it should be a vector
2305 of subsidiary coding systems of which property `eol-type' has one
2306 of above values.
2307
2308*/
2309
2310/* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2311 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2312 is setup so that no conversion is necessary and return -1, else
2313 return 0. */
2314
2315int
e0e989f6
KH
2316setup_coding_system (coding_system, coding)
2317 Lisp_Object coding_system;
4ed46869
KH
2318 struct coding_system *coding;
2319{
4608c386
KH
2320 Lisp_Object coding_spec, plist, type, eol_type;
2321 Lisp_Object val;
70c22245 2322 int i;
4ed46869 2323
f4dee582 2324 /* At first, set several fields to default values. */
4ed46869
KH
2325 coding->last_block = 0;
2326 coding->selective = 0;
2327 coding->composing = 0;
2328 coding->direction = 0;
2329 coding->carryover_size = 0;
4ed46869 2330 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
a5d301df
KH
2331 coding->character_unification_table_for_decode = Qnil;
2332 coding->character_unification_table_for_encode = Qnil;
4ed46869 2333
774324d6 2334 coding->symbol = coding_system;
e0e989f6 2335 eol_type = Qnil;
4608c386
KH
2336
2337 /* Get values of property `coding-system' and `eol-type'.
2338 Also get values of coding system properties:
a5d301df
KH
2339 `post-read-conversion', `pre-write-conversion',
2340 `character-unification-table-for-decode',
4608c386
KH
2341 `character-unification-table-for-encode'. */
2342 coding_spec = Fget (coding_system, Qcoding_system);
2343 if (!VECTORP (coding_spec)
2344 || XVECTOR (coding_spec)->size != 5
2345 || !CONSP (XVECTOR (coding_spec)->contents[3]))
4ed46869 2346 goto label_invalid_coding_system;
4608c386
KH
2347 if (!inhibit_eol_conversion)
2348 eol_type = Fget (coding_system, Qeol_type);
2349
2350 plist = XVECTOR (coding_spec)->contents[3];
2351 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
2352 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
2353 val = Fplist_get (plist, Qcharacter_unification_table_for_decode);
2354 if (SYMBOLP (val))
2355 val = Fget (val, Qcharacter_unification_table_for_decode);
2356 coding->character_unification_table_for_decode
2357 = CHAR_TABLE_P (val) ? val : Qnil;
2358 val = Fplist_get (plist, Qcharacter_unification_table_for_encode);
2359 if (SYMBOLP (val))
2360 val = Fget (val, Qcharacter_unification_table_for_encode);
2361 coding->character_unification_table_for_encode
2362 = CHAR_TABLE_P (val) ? val : Qnil;
2363
70c22245
KH
2364 val = Fplist_get (plist, Qsafe_charsets);
2365 if (EQ (val, Qt))
2366 {
2367 for (i = 0; i <= MAX_CHARSET; i++)
2368 coding->safe_charsets[i] = 1;
2369 }
2370 else
2371 {
2372 bzero (coding->safe_charsets, MAX_CHARSET + 1);
2373 while (CONSP (val))
2374 {
2375 if ((i = get_charset_id (XCONS (val)->car)) >= 0)
2376 coding->safe_charsets[i] = 1;
2377 val = XCONS (val)->cdr;
2378 }
2379 }
2380
4ed46869 2381 if (VECTORP (eol_type))
c952af22
KH
2382 {
2383 coding->eol_type = CODING_EOL_UNDECIDED;
2384 coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2385 }
4ed46869 2386 else if (XFASTINT (eol_type) == 1)
c952af22
KH
2387 {
2388 coding->eol_type = CODING_EOL_CRLF;
2389 coding->common_flags
2390 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2391 }
4ed46869 2392 else if (XFASTINT (eol_type) == 2)
c952af22
KH
2393 {
2394 coding->eol_type = CODING_EOL_CR;
2395 coding->common_flags
2396 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2397 }
4ed46869 2398 else
c952af22
KH
2399 {
2400 coding->eol_type = CODING_EOL_LF;
2401 coding->common_flags = 0;
2402 }
4ed46869 2403
4608c386 2404 type = XVECTOR (coding_spec)->contents[0];
4ed46869
KH
2405 switch (XFASTINT (type))
2406 {
2407 case 0:
0ef69138 2408 coding->type = coding_type_emacs_mule;
c952af22
KH
2409 if (!NILP (coding->post_read_conversion))
2410 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
2411 if (!NILP (coding->pre_write_conversion))
2412 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4ed46869
KH
2413 break;
2414
2415 case 1:
2416 coding->type = coding_type_sjis;
c952af22
KH
2417 coding->common_flags
2418 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869
KH
2419 break;
2420
2421 case 2:
2422 coding->type = coding_type_iso2022;
c952af22
KH
2423 coding->common_flags
2424 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 2425 {
70c22245 2426 Lisp_Object val, temp;
4ed46869
KH
2427 Lisp_Object *flags;
2428 int i, charset, default_reg_bits = 0;
2429
4608c386 2430 val = XVECTOR (coding_spec)->contents[4];
f44d27ce 2431
4ed46869
KH
2432 if (!VECTORP (val) || XVECTOR (val)->size != 32)
2433 goto label_invalid_coding_system;
2434
2435 flags = XVECTOR (val)->contents;
2436 coding->flags
2437 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2438 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2439 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2440 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2441 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2442 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2443 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2444 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
e0e989f6
KH
2445 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2446 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
c4825358
KH
2447 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2448 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3f003981 2449 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
c4825358 2450 );
4ed46869
KH
2451
2452 /* Invoke graphic register 0 to plane 0. */
2453 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2454 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2455 CODING_SPEC_ISO_INVOCATION (coding, 1)
2456 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2457 /* Not single shifting at first. */
6e85d753 2458 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
e0e989f6 2459 /* Beginning of buffer should also be regarded as bol. */
6e85d753 2460 CODING_SPEC_ISO_BOL (coding) = 1;
4ed46869 2461
70c22245
KH
2462 for (charset = 0; charset <= MAX_CHARSET; charset++)
2463 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
2464 val = Vcharset_revision_alist;
2465 while (CONSP (val))
2466 {
2467 charset = get_charset_id (Fcar_safe (XCONS (val)->car));
2468 if (charset >= 0
2469 && (temp = Fcdr_safe (XCONS (val)->car), INTEGERP (temp))
2470 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
2471 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
2472 val = XCONS (val)->cdr;
2473 }
2474
4ed46869
KH
2475 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2476 FLAGS[REG] can be one of below:
2477 integer CHARSET: CHARSET occupies register I,
2478 t: designate nothing to REG initially, but can be used
2479 by any charsets,
2480 list of integer, nil, or t: designate the first
2481 element (if integer) to REG initially, the remaining
2482 elements (if integer) is designated to REG on request,
2483 if an element is t, REG can be used by any charset,
2484 nil: REG is never used. */
467e7675 2485 for (charset = 0; charset <= MAX_CHARSET; charset++)
1ba9e4ab
KH
2486 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2487 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
4ed46869
KH
2488 for (i = 0; i < 4; i++)
2489 {
2490 if (INTEGERP (flags[i])
e0e989f6
KH
2491 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2492 || (charset = get_charset_id (flags[i])) >= 0)
4ed46869
KH
2493 {
2494 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2495 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2496 }
2497 else if (EQ (flags[i], Qt))
2498 {
2499 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2500 default_reg_bits |= 1 << i;
2501 }
2502 else if (CONSP (flags[i]))
2503 {
2504 Lisp_Object tail = flags[i];
2505
2506 if (INTEGERP (XCONS (tail)->car)
2507 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
2508 CHARSET_VALID_P (charset))
2509 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
4ed46869
KH
2510 {
2511 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2512 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2513 }
2514 else
2515 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2516 tail = XCONS (tail)->cdr;
2517 while (CONSP (tail))
2518 {
2519 if (INTEGERP (XCONS (tail)->car)
2520 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
2521 CHARSET_VALID_P (charset))
2522 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
70c22245
KH
2523 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2524 = i;
4ed46869
KH
2525 else if (EQ (XCONS (tail)->car, Qt))
2526 default_reg_bits |= 1 << i;
2527 tail = XCONS (tail)->cdr;
2528 }
2529 }
2530 else
2531 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2532
2533 CODING_SPEC_ISO_DESIGNATION (coding, i)
2534 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2535 }
2536
2537 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2538 {
2539 /* REG 1 can be used only by locking shift in 7-bit env. */
2540 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2541 default_reg_bits &= ~2;
2542 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2543 /* Without any shifting, only REG 0 and 1 can be used. */
2544 default_reg_bits &= 3;
2545 }
2546
6e85d753
KH
2547 for (charset = 0; charset <= MAX_CHARSET; charset++)
2548 if (CHARSET_VALID_P (charset)
2549 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2550 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
2551 {
2552 /* We have not yet decided where to designate CHARSET. */
2553 int reg_bits = default_reg_bits;
2554
2555 if (CHARSET_CHARS (charset) == 96)
2556 /* A charset of CHARS96 can't be designated to REG 0. */
2557 reg_bits &= ~1;
2558
2559 if (reg_bits)
2560 /* There exist some default graphic register. */
2561 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2562 = (reg_bits & 1
2563 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2564 else
2565 /* We anyway have to designate CHARSET to somewhere. */
2566 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2567 = (CHARSET_CHARS (charset) == 94
2568 ? 0
2569 : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2570 || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2571 ? 1
2572 : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2573 ? 2 : 0)));
2574 }
4ed46869 2575 }
c952af22 2576 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
4ed46869
KH
2577 break;
2578
2579 case 3:
2580 coding->type = coding_type_big5;
c952af22
KH
2581 coding->common_flags
2582 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 2583 coding->flags
4608c386 2584 = (NILP (XVECTOR (coding_spec)->contents[4])
4ed46869
KH
2585 ? CODING_FLAG_BIG5_HKU
2586 : CODING_FLAG_BIG5_ETEN);
2587 break;
2588
2589 case 4:
2590 coding->type = coding_type_ccl;
c952af22
KH
2591 coding->common_flags
2592 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
4ed46869 2593 {
4608c386 2594 Lisp_Object val = XVECTOR (coding_spec)->contents[4];
4ed46869
KH
2595 if (CONSP (val)
2596 && VECTORP (XCONS (val)->car)
2597 && VECTORP (XCONS (val)->cdr))
2598 {
2599 setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2600 setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2601 }
2602 else
2603 goto label_invalid_coding_system;
2604 }
c952af22 2605 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
4ed46869
KH
2606 break;
2607
27901516
KH
2608 case 5:
2609 coding->type = coding_type_raw_text;
2610 break;
2611
4ed46869
KH
2612 default:
2613 if (EQ (type, Qt))
c952af22
KH
2614 {
2615 coding->type = coding_type_undecided;
2616 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
2617 }
4ed46869
KH
2618 else
2619 coding->type = coding_type_no_conversion;
2620 break;
2621 }
2622 return 0;
2623
2624 label_invalid_coding_system:
2625 coding->type = coding_type_no_conversion;
c952af22 2626 coding->common_flags = 0;
dec137e5 2627 coding->eol_type = CODING_EOL_LF;
e0e989f6
KH
2628 coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2629 = Qnil;
4ed46869
KH
2630 return -1;
2631}
2632
2633/* Emacs has a mechanism to automatically detect a coding system if it
2634 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
2635 it's impossible to distinguish some coding systems accurately
2636 because they use the same range of codes. So, at first, coding
2637 systems are categorized as follows:
2638
0ef69138 2639 o coding-category-emacs-mule
4ed46869
KH
2640
2641 The category for a coding system which has the same code range
2642 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 2643 symbol) `emacs-mule' by default.
4ed46869
KH
2644
2645 o coding-category-sjis
2646
2647 The category for a coding system which has the same code range
2648 as SJIS. Assigned the coding-system (Lisp
7717c392 2649 symbol) `japanese-shift-jis' by default.
4ed46869
KH
2650
2651 o coding-category-iso-7
2652
2653 The category for a coding system which has the same code range
7717c392
KH
2654 as ISO2022 of 7-bit environment. This doesn't use any locking
2655 shift and single shift functions. Assigned the coding-system
2656 (Lisp symbol) `iso-2022-7bit' by default.
4ed46869
KH
2657
2658 o coding-category-iso-8-1
2659
2660 The category for a coding system which has the same code range
2661 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
2662 for DIMENSION1 charset. This doesn't use any locking shift
2663 and single shift functions. Assigned the coding-system (Lisp
2664 symbol) `iso-latin-1' by default.
4ed46869
KH
2665
2666 o coding-category-iso-8-2
2667
2668 The category for a coding system which has the same code range
2669 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
2670 for DIMENSION2 charset. This doesn't use any locking shift
2671 and single shift functions. Assigned the coding-system (Lisp
2672 symbol) `japanese-iso-8bit' by default.
4ed46869 2673
7717c392 2674 o coding-category-iso-7-else
4ed46869
KH
2675
2676 The category for a coding system which has the same code range
7717c392
KH
2677 as ISO2022 of 7-bit environment but uses locking shift or
2678 single shift functions. Assigned the coding-system (Lisp
2679 symbol) `iso-2022-7bit-lock' by default.
2680
2681 o coding-category-iso-8-else
2682
2683 The category for a coding system which has the same code range
2684 as ISO2022 of 8-bit environment but uses locking shift or
2685 single shift functions. Assigned the coding-system (Lisp
2686 symbol) `iso-2022-8bit-ss2' by default.
4ed46869
KH
2687
2688 o coding-category-big5
2689
2690 The category for a coding system which has the same code range
2691 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 2692 `cn-big5' by default.
4ed46869
KH
2693
2694 o coding-category-binary
2695
2696 The category for a coding system not categorized in any of the
2697 above. Assigned the coding-system (Lisp symbol)
e0e989f6 2698 `no-conversion' by default.
4ed46869
KH
2699
2700 Each of them is a Lisp symbol and the value is an actual
2701 `coding-system's (this is also a Lisp symbol) assigned by a user.
2702 What Emacs does actually is to detect a category of coding system.
2703 Then, it uses a `coding-system' assigned to it. If Emacs can't
2704 decide only one possible category, it selects a category of the
2705 highest priority. Priorities of categories are also specified by a
2706 user in a Lisp variable `coding-category-list'.
2707
2708*/
2709
2710/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2711 If it detects possible coding systems, return an integer in which
2712 appropriate flag bits are set. Flag bits are defined by macros
2713 CODING_CATEGORY_MASK_XXX in `coding.h'. */
2714
2715int
2716detect_coding_mask (src, src_bytes)
2717 unsigned char *src;
2718 int src_bytes;
2719{
2720 register unsigned char c;
2721 unsigned char *src_end = src + src_bytes;
2722 int mask;
2723
2724 /* At first, skip all ASCII characters and control characters except
2725 for three ISO2022 specific control characters. */
bcf26d6a 2726 label_loop_detect_coding:
4ed46869
KH
2727 while (src < src_end)
2728 {
2729 c = *src;
2730 if (c >= 0x80
2731 || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2732 break;
2733 src++;
2734 }
2735
2736 if (src >= src_end)
2737 /* We found nothing other than ASCII. There's nothing to do. */
2738 return CODING_CATEGORY_MASK_ANY;
2739
2740 /* The text seems to be encoded in some multilingual coding system.
2741 Now, try to find in which coding system the text is encoded. */
2742 if (c < 0x80)
bcf26d6a
KH
2743 {
2744 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2745 /* C is an ISO2022 specific control code of C0. */
2746 mask = detect_coding_iso2022 (src, src_end);
2747 src++;
1b2af4b0 2748 if (mask == 0)
bcf26d6a
KH
2749 /* No valid ISO2022 code follows C. Try again. */
2750 goto label_loop_detect_coding;
5d648571 2751 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
bcf26d6a 2752 }
4ed46869 2753 else if (c < 0xA0)
c4825358 2754 {
3f003981 2755 /* If C is a special latin extra code,
c4825358
KH
2756 or is an ISO2022 specific control code of C1 (SS2 or SS3),
2757 or is an ISO2022 control-sequence-introducer (CSI),
27901516 2758 we should also consider the possibility of ISO2022 codings. */
3f003981
KH
2759 if ((VECTORP (Vlatin_extra_code_table)
2760 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
c4825358
KH
2761 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
2762 || (c == ISO_CODE_CSI
2763 && (src < src_end
2764 && (*src == ']'
2765 || (src + 1 < src_end
2766 && src[1] == ']'
2767 && (*src == '0' || *src == '1' || *src == '2'))))))
2768 mask = (detect_coding_iso2022 (src, src_end)
2769 | detect_coding_sjis (src, src_end)
2770 | detect_coding_emacs_mule (src, src_end)
27901516 2771 | CODING_CATEGORY_MASK_RAW_TEXT);
4ed46869 2772
c4825358 2773 else
27901516
KH
2774 /* C is the first byte of SJIS character code,
2775 or a leading-code of Emacs' internal format (emacs-mule). */
c4825358
KH
2776 mask = (detect_coding_sjis (src, src_end)
2777 | detect_coding_emacs_mule (src, src_end)
27901516 2778 | CODING_CATEGORY_MASK_RAW_TEXT);
c4825358 2779 }
4ed46869
KH
2780 else
2781 /* C is a character of ISO2022 in graphic plane right,
2782 or a SJIS's 1-byte character code (i.e. JISX0201),
2783 or the first byte of BIG5's 2-byte code. */
2784 mask = (detect_coding_iso2022 (src, src_end)
2785 | detect_coding_sjis (src, src_end)
10bff6f1 2786 | detect_coding_big5 (src, src_end)
27901516 2787 | CODING_CATEGORY_MASK_RAW_TEXT);
4ed46869
KH
2788
2789 return mask;
2790}
2791
2792/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2793 The information of the detected coding system is set in CODING. */
2794
2795void
2796detect_coding (coding, src, src_bytes)
2797 struct coding_system *coding;
2798 unsigned char *src;
2799 int src_bytes;
2800{
2801 int mask = detect_coding_mask (src, src_bytes);
2802 int idx;
27901516 2803 Lisp_Object val = Vcoding_category_list;
4ed46869
KH
2804
2805 if (mask == CODING_CATEGORY_MASK_ANY)
2806 /* We found nothing other than ASCII. There's nothing to do. */
2807 return;
2808
27901516
KH
2809 /* We found some plausible coding systems. Let's use a coding
2810 system of the highest priority. */
4ed46869 2811
27901516
KH
2812 if (CONSP (val))
2813 while (!NILP (val))
2814 {
2815 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2816 if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2817 break;
2818 val = XCONS (val)->cdr;
2819 }
2820 else
2821 val = Qnil;
4ed46869 2822
27901516
KH
2823 if (NILP (val))
2824 {
2825 /* For unknown reason, `Vcoding_category_list' contains none of
2826 found categories. Let's use any of them. */
2827 for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2828 if (mask & (1 << idx))
2829 break;
4ed46869
KH
2830 }
2831 setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2832}
2833
2834/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2835 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
0ef69138 2836 CODING_EOL_CR, and CODING_EOL_UNDECIDED. */
4ed46869 2837
bc4bc72a
RS
2838#define MAX_EOL_CHECK_COUNT 3
2839
4ed46869
KH
2840int
2841detect_eol_type (src, src_bytes)
2842 unsigned char *src;
2843 int src_bytes;
2844{
2845 unsigned char *src_end = src + src_bytes;
2846 unsigned char c;
bc4bc72a
RS
2847 int total = 0; /* How many end-of-lines are found so far. */
2848 int eol_type = CODING_EOL_UNDECIDED;
2849 int this_eol_type;
4ed46869 2850
bc4bc72a 2851 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4ed46869
KH
2852 {
2853 c = *src++;
bc4bc72a 2854 if (c == '\n' || c == '\r')
4ed46869 2855 {
bc4bc72a
RS
2856 total++;
2857 if (c == '\n')
2858 this_eol_type = CODING_EOL_LF;
2859 else if (src >= src_end || *src != '\n')
2860 this_eol_type = CODING_EOL_CR;
4ed46869 2861 else
bc4bc72a
RS
2862 this_eol_type = CODING_EOL_CRLF, src++;
2863
2864 if (eol_type == CODING_EOL_UNDECIDED)
2865 /* This is the first end-of-line. */
2866 eol_type = this_eol_type;
2867 else if (eol_type != this_eol_type)
2868 /* The found type is different from what found before.
27901516
KH
2869 Let's notice the caller about this inconsistency. */
2870 return CODING_EOL_INCONSISTENT;
4ed46869
KH
2871 }
2872 }
bc4bc72a 2873
85a02ca4 2874 return eol_type;
4ed46869
KH
2875}
2876
2877/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2878 is encoded. If it detects an appropriate format of end-of-line, it
2879 sets the information in *CODING. */
2880
2881void
2882detect_eol (coding, src, src_bytes)
2883 struct coding_system *coding;
2884 unsigned char *src;
2885 int src_bytes;
2886{
4608c386 2887 Lisp_Object val;
4ed46869
KH
2888 int eol_type = detect_eol_type (src, src_bytes);
2889
0ef69138 2890 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2891 /* We found no end-of-line in the source text. */
2892 return;
2893
27901516
KH
2894 if (eol_type == CODING_EOL_INCONSISTENT)
2895 {
2896#if 0
2897 /* This code is suppressed until we find a better way to
992f23f2 2898 distinguish raw text file and binary file. */
27901516
KH
2899
2900 /* If we have already detected that the coding is raw-text, the
2901 coding should actually be no-conversion. */
2902 if (coding->type == coding_type_raw_text)
2903 {
2904 setup_coding_system (Qno_conversion, coding);
2905 return;
2906 }
2907 /* Else, let's decode only text code anyway. */
2908#endif /* 0 */
1b2af4b0 2909 eol_type = CODING_EOL_LF;
27901516
KH
2910 }
2911
4608c386 2912 val = Fget (coding->symbol, Qeol_type);
4ed46869
KH
2913 if (VECTORP (val) && XVECTOR (val)->size == 3)
2914 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2915}
2916
2917/* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
2918 decoding, it may detect coding system and format of end-of-line if
2919 those are not yet decided. */
2920
2921int
2922decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2923 struct coding_system *coding;
2924 unsigned char *source, *destination;
2925 int src_bytes, dst_bytes;
2926 int *consumed;
2927{
2928 int produced;
2929
2930 if (src_bytes <= 0)
2931 {
2932 *consumed = 0;
2933 return 0;
2934 }
2935
0ef69138 2936 if (coding->type == coding_type_undecided)
4ed46869
KH
2937 detect_coding (coding, source, src_bytes);
2938
0ef69138 2939 if (coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2940 detect_eol (coding, source, src_bytes);
2941
2942 coding->carryover_size = 0;
2943 switch (coding->type)
2944 {
2945 case coding_type_no_conversion:
2946 label_no_conversion:
2947 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2948 bcopy (source, destination, produced);
2949 *consumed = produced;
2950 break;
2951
0ef69138
KH
2952 case coding_type_emacs_mule:
2953 case coding_type_undecided:
27901516 2954 case coding_type_raw_text:
4ed46869 2955 if (coding->eol_type == CODING_EOL_LF
0ef69138 2956 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2957 goto label_no_conversion;
2958 produced = decode_eol (coding, source, destination,
2959 src_bytes, dst_bytes, consumed);
2960 break;
2961
2962 case coding_type_sjis:
2963 produced = decode_coding_sjis_big5 (coding, source, destination,
2964 src_bytes, dst_bytes, consumed,
2965 1);
2966 break;
2967
2968 case coding_type_iso2022:
2969 produced = decode_coding_iso2022 (coding, source, destination,
2970 src_bytes, dst_bytes, consumed);
2971 break;
2972
2973 case coding_type_big5:
2974 produced = decode_coding_sjis_big5 (coding, source, destination,
2975 src_bytes, dst_bytes, consumed,
2976 0);
2977 break;
2978
2979 case coding_type_ccl:
2980 produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2981 src_bytes, dst_bytes, consumed);
2982 break;
2983 }
2984
2985 return produced;
2986}
2987
2988/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
2989
2990int
2991encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2992 struct coding_system *coding;
2993 unsigned char *source, *destination;
2994 int src_bytes, dst_bytes;
2995 int *consumed;
2996{
2997 int produced;
2998
4ed46869
KH
2999 switch (coding->type)
3000 {
3001 case coding_type_no_conversion:
3002 label_no_conversion:
3003 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
3004 if (produced > 0)
3005 {
3006 bcopy (source, destination, produced);
3007 if (coding->selective)
3008 {
3009 unsigned char *p = destination, *pend = destination + produced;
3010 while (p < pend)
e0e989f6 3011 if (*p++ == '\015') p[-1] = '\n';
4ed46869
KH
3012 }
3013 }
3014 *consumed = produced;
3015 break;
3016
0ef69138
KH
3017 case coding_type_emacs_mule:
3018 case coding_type_undecided:
27901516 3019 case coding_type_raw_text:
4ed46869 3020 if (coding->eol_type == CODING_EOL_LF
0ef69138 3021 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
3022 goto label_no_conversion;
3023 produced = encode_eol (coding, source, destination,
3024 src_bytes, dst_bytes, consumed);
3025 break;
3026
3027 case coding_type_sjis:
3028 produced = encode_coding_sjis_big5 (coding, source, destination,
3029 src_bytes, dst_bytes, consumed,
3030 1);
3031 break;
3032
3033 case coding_type_iso2022:
3034 produced = encode_coding_iso2022 (coding, source, destination,
3035 src_bytes, dst_bytes, consumed);
3036 break;
3037
3038 case coding_type_big5:
3039 produced = encode_coding_sjis_big5 (coding, source, destination,
3040 src_bytes, dst_bytes, consumed,
3041 0);
3042 break;
3043
3044 case coding_type_ccl:
3045 produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
3046 src_bytes, dst_bytes, consumed);
3047 break;
3048 }
3049
3050 return produced;
3051}
3052
3053#define CONVERSION_BUFFER_EXTRA_ROOM 256
3054
3055/* Return maximum size (bytes) of a buffer enough for decoding
3056 SRC_BYTES of text encoded in CODING. */
3057
3058int
3059decoding_buffer_size (coding, src_bytes)
3060 struct coding_system *coding;
3061 int src_bytes;
3062{
3063 int magnification;
3064
3065 if (coding->type == coding_type_iso2022)
3066 magnification = 3;
3067 else if (coding->type == coding_type_ccl)
3068 magnification = coding->spec.ccl.decoder.buf_magnification;
3069 else
3070 magnification = 2;
3071
3072 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3073}
3074
3075/* Return maximum size (bytes) of a buffer enough for encoding
3076 SRC_BYTES of text to CODING. */
3077
3078int
3079encoding_buffer_size (coding, src_bytes)
3080 struct coding_system *coding;
3081 int src_bytes;
3082{
3083 int magnification;
3084
3085 if (coding->type == coding_type_ccl)
3086 magnification = coding->spec.ccl.encoder.buf_magnification;
3087 else
3088 magnification = 3;
3089
3090 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3091}
3092
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* The shared work buffer handed out by get_conversion_buffer, and its
   current size in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.  The buffer is shared: each call returns the
   same storage or a replacement for it, and the previous contents are
   not preserved when it is replaced.  Memory is obtained with
   xmalloc; this function itself never checks for or returns NULL.
   NOTE(review): xmalloc presumably does not return on failure --
   confirm against its definition.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      /* Grow by doubling.  If the buffer has not been set up yet
         (size 0), start from the minimum size instead: doubling 0
         would never reach SIZE and the loop below would not end.  */
      int real_size = (conversion_buffer_size > 0
                       ? conversion_buffer_size * 2
                       : MINIMUM_CONVERSION_BUFFER_SIZE);

      while (real_size < size) real_size *= 2;
      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
3121
3122\f
3123#ifdef emacs
3124/*** 7. Emacs Lisp library functions ***/
3125
4ed46869
KH
3126DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
3127 "Return t if OBJECT is nil or a coding-system.\n\
3128See document of make-coding-system for coding-system object.")
3129 (obj)
3130 Lisp_Object obj;
3131{
4608c386
KH
3132 if (NILP (obj))
3133 return Qt;
3134 if (!SYMBOLP (obj))
3135 return Qnil;
3136 /* Get coding-spec vector for OBJ. */
3137 obj = Fget (obj, Qcoding_system);
3138 return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
3139 ? Qt : Qnil);
4ed46869
KH
3140}
3141
9d991de8
RS
3142DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
3143 Sread_non_nil_coding_system, 1, 1, 0,
e0e989f6 3144 "Read a coding system from the minibuffer, prompting with string PROMPT.")
4ed46869
KH
3145 (prompt)
3146 Lisp_Object prompt;
3147{
e0e989f6 3148 Lisp_Object val;
9d991de8
RS
3149 do
3150 {
4608c386
KH
3151 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
3152 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
9d991de8
RS
3153 }
3154 while (XSTRING (val)->size == 0);
e0e989f6 3155 return (Fintern (val, Qnil));
4ed46869
KH
3156}
3157
9b787f3e
RS
3158DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
3159 "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
3160If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
3161 (prompt, default_coding_system)
3162 Lisp_Object prompt, default_coding_system;
4ed46869 3163{
f44d27ce 3164 Lisp_Object val;
9b787f3e
RS
3165 if (SYMBOLP (default_coding_system))
3166 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
4608c386 3167 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
9b787f3e
RS
3168 Qt, Qnil, Qcoding_system_history,
3169 default_coding_system, Qnil);
e0e989f6 3170 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
3171}
3172
3173DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
3174 1, 1, 0,
3175 "Check validity of CODING-SYSTEM.\n\
3176If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
3177CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
3178The value of property should be a vector of length 5.")
3179 (coding_system)
3180 Lisp_Object coding_system;
3181{
3182 CHECK_SYMBOL (coding_system, 0);
3183 if (!NILP (Fcoding_system_p (coding_system)))
3184 return coding_system;
3185 while (1)
02ba4723 3186 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4ed46869
KH
3187}
3188
3189DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
3190 2, 2, 0,
bf9cdd4e
KH
3191 "Detect coding system of the text in the region between START and END.\n\
3192Return a list of possible coding systems ordered by priority.\n\
0ef69138 3193If only ASCII characters are found, it returns `undecided'\n\
bf9cdd4e 3194 or its subsidiary coding system according to a detected end-of-line format.")
4ed46869
KH
3195 (b, e)
3196 Lisp_Object b, e;
3197{
3198 int coding_mask, eol_type;
3199 Lisp_Object val;
3200 int beg, end;
6289dd10 3201 int beg_byte, end_byte;
4ed46869
KH
3202
3203 validate_region (&b, &e);
3204 beg = XINT (b), end = XINT (e);
6289dd10
RS
3205 beg_byte = CHAR_TO_BYTE (beg);
3206 end_byte = CHAR_TO_BYTE (end);
4ed46869 3207
6289dd10
RS
3208 if (beg < GPT && end >= GPT)
3209 move_gap_both (end, end_byte);
3210
3211 coding_mask = detect_coding_mask (BYTE_POS_ADDR (beg_byte),
3212 end_byte - beg_byte);
3213 eol_type = detect_eol_type (BYTE_POS_ADDR (beg_byte), end_byte - beg_byte);
4ed46869
KH
3214
3215 if (coding_mask == CODING_CATEGORY_MASK_ANY)
3216 {
27901516
KH
3217 val = Qundecided;
3218 if (eol_type != CODING_EOL_UNDECIDED
3219 && eol_type != CODING_EOL_INCONSISTENT)
4ed46869 3220 {
f44d27ce
RS
3221 Lisp_Object val2;
3222 val2 = Fget (Qundecided, Qeol_type);
4ed46869
KH
3223 if (VECTORP (val2))
3224 val = XVECTOR (val2)->contents[eol_type];
3225 }
3226 }
3227 else
3228 {
3229 Lisp_Object val2;
3230
3231 /* At first, gather possible coding-systems in VAL in a reverse
3232 order. */
3233 val = Qnil;
3234 for (val2 = Vcoding_category_list;
3235 !NILP (val2);
3236 val2 = XCONS (val2)->cdr)
3237 {
3238 int idx
3239 = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
3240 if (coding_mask & (1 << idx))
27901516
KH
3241 {
3242#if 0
3243 /* This code is suppressed until we find a better way to
992f23f2 3244 distinguish raw text file and binary file. */
27901516
KH
3245
3246 if (idx == CODING_CATEGORY_IDX_RAW_TEXT
3247 && eol_type == CODING_EOL_INCONSISTENT)
3248 val = Fcons (Qno_conversion, val);
3249 else
3250#endif /* 0 */
3251 val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
3252 }
4ed46869
KH
3253 }
3254
3255 /* Then, change the order of the list, while getting subsidiary
3256 coding-systems. */
3257 val2 = val;
3258 val = Qnil;
27901516
KH
3259 if (eol_type == CODING_EOL_INCONSISTENT)
3260 eol_type == CODING_EOL_UNDECIDED;
4ed46869
KH
3261 for (; !NILP (val2); val2 = XCONS (val2)->cdr)
3262 {
0ef69138 3263 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
3264 val = Fcons (XCONS (val2)->car, val);
3265 else
3266 {
f44d27ce
RS
3267 Lisp_Object val3;
3268 val3 = Fget (XCONS (val2)->car, Qeol_type);
4ed46869
KH
3269 if (VECTORP (val3))
3270 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
3271 else
3272 val = Fcons (XCONS (val2)->car, val);
3273 }
3274 }
3275 }
3276
3277 return val;
3278}
3279
3280/* Scan text in the region between *BEGP and *ENDP, skip characters
3281 which we never have to encode to (iff ENCODEP is 1) or decode from
3282 coding system CODING at the head and tail, then set BEGP and ENDP
3283 to the addresses of start and end of the text we actually convert. */
3284
3285void
3286shrink_conversion_area (begp, endp, coding, encodep)
3287 unsigned char **begp, **endp;
3288 struct coding_system *coding;
3289 int encodep;
3290{
3291 register unsigned char *beg_addr = *begp, *end_addr = *endp;
3292
3293 if (coding->eol_type != CODING_EOL_LF
0ef69138 3294 && coding->eol_type != CODING_EOL_UNDECIDED)
4ed46869
KH
3295 /* Since we anyway have to convert end-of-line format, it is not
3296 worth skipping at most 100 bytes or so. */
3297 return;
3298
3299 if (encodep) /* for encoding */
3300 {
3301 switch (coding->type)
3302 {
3303 case coding_type_no_conversion:
0ef69138
KH
3304 case coding_type_emacs_mule:
3305 case coding_type_undecided:
27901516 3306 case coding_type_raw_text:
4ed46869
KH
3307 /* We need no conversion. */
3308 *begp = *endp;
3309 return;
3310 case coding_type_ccl:
3311 /* We can't skip any data. */
3312 return;
e0e989f6
KH
3313 case coding_type_iso2022:
3314 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3315 {
3316 unsigned char *bol = beg_addr;
3317 while (beg_addr < end_addr && *beg_addr < 0x80)
3318 {
3319 beg_addr++;
3320 if (*(beg_addr - 1) == '\n')
3321 bol = beg_addr;
3322 }
3323 beg_addr = bol;
3324 goto label_skip_tail;
3325 }
3326 /* fall down ... */
4ed46869
KH
3327 default:
3328 /* We can skip all ASCII characters at the head and tail. */
3329 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
e0e989f6 3330 label_skip_tail:
4ed46869
KH
3331 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3332 break;
3333 }
3334 }
3335 else /* for decoding */
3336 {
3337 switch (coding->type)
3338 {
3339 case coding_type_no_conversion:
3340 /* We need no conversion. */
3341 *begp = *endp;
3342 return;
0ef69138 3343 case coding_type_emacs_mule:
27901516 3344 case coding_type_raw_text:
4ed46869
KH
3345 if (coding->eol_type == CODING_EOL_LF)
3346 {
3347 /* We need no conversion. */
3348 *begp = *endp;
3349 return;
3350 }
3351 /* We can skip all but carriage-return. */
3352 while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
3353 while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
3354 break;
3355 case coding_type_sjis:
3356 case coding_type_big5:
3357 /* We can skip all ASCII characters at the head. */
3358 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3359 /* We can skip all ASCII characters at the tail except for
3360 the second byte of SJIS or BIG5 code. */
3361 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3362 if (end_addr != *endp)
3363 end_addr++;
3364 break;
3365 case coding_type_ccl:
3366 /* We can't skip any data. */
3367 return;
3368 default: /* i.e. case coding_type_iso2022: */
3369 {
3370 unsigned char c;
3371
3372 /* We can skip all ASCII characters except for a few
3373 control codes at the head. */
3374 while (beg_addr < end_addr && (c = *beg_addr) < 0x80
3375 && c != ISO_CODE_CR && c != ISO_CODE_SO
3376 && c != ISO_CODE_SI && c != ISO_CODE_ESC)
3377 beg_addr++;
3378 }
3379 break;
3380 }
3381 }
3382 *begp = beg_addr;
3383 *endp = end_addr;
3384 return;
3385}
3386
6289dd10
RS
3387/* Encode into or decode from (according to ENCODEP) coding system CODING
3388 the text between char positions B and E. */
4ed46869
KH
3389
3390Lisp_Object
3391code_convert_region (b, e, coding, encodep)
3392 Lisp_Object b, e;
3393 struct coding_system *coding;
3394 int encodep;
3395{
3396 int beg, end, len, consumed, produced;
3397 char *buf;
3398 unsigned char *begp, *endp;
6289dd10
RS
3399 int opoint = PT, opoint_byte = PT_BYTE;
3400 int beg_byte, end_byte, len_byte;
3401 int zv_before = ZV;
3402 int zv_byte_before = ZV_BYTE;
4ed46869
KH
3403
3404 validate_region (&b, &e);
3405 beg = XINT (b), end = XINT (e);
6289dd10
RS
3406 beg_byte = CHAR_TO_BYTE (beg);
3407 end_byte = CHAR_TO_BYTE (end);
3408
4ed46869 3409 if (beg < GPT && end >= GPT)
6289dd10 3410 move_gap_both (end, end_byte);
4ed46869
KH
3411
3412 if (encodep && !NILP (coding->pre_write_conversion))
3413 {
3414 /* We must call a pre-conversion function which may put a new
3415 text to be converted in a new buffer. */
3416 struct buffer *old = current_buffer, *new;
3417
6289dd10 3418 TEMP_SET_PT_BOTH (beg, beg_byte);
4ed46869
KH
3419 call2 (coding->pre_write_conversion, b, e);
3420 if (old != current_buffer)
3421 {
3422 /* Replace the original text by the text just generated. */
3423 len = ZV - BEGV;
6289dd10 3424 len_byte = ZV_BYTE - BEGV_BYTE;
4ed46869
KH
3425 new = current_buffer;
3426 set_buffer_internal (old);
6289dd10 3427 del_range_both (beg, end, beg_byte, end_byte, 1);
4ed46869
KH
3428 insert_from_buffer (new, 1, len, 0);
3429 end = beg + len;
6289dd10 3430 end_byte = len_byte;
4ed46869
KH
3431 }
3432 }
3433
3434 /* We may be able to shrink the conversion region. */
6289dd10
RS
3435 begp = BYTE_POS_ADDR (beg_byte);
3436 endp = begp + (end_byte - beg_byte);
4ed46869
KH
3437 shrink_conversion_area (&begp, &endp, coding, encodep);
3438
3439 if (begp == endp)
3440 /* We need no conversion. */
3441 len = end - beg;
3442 else
3443 {
6289dd10
RS
3444 int shrunk_beg_byte, shrunk_end_byte;
3445 int shrunk_beg;
3446 int shrunk_len_byte;
3447 int new_len_byte;
3448 int buflen;
3449 int zv_before;
3450
3451 shrunk_beg_byte = PTR_BYTE_POS (begp);
3452 shrunk_beg = BYTE_TO_CHAR (shrunk_beg_byte);
3453 shrunk_end_byte = PTR_BYTE_POS (endp);
3454 shrunk_len_byte = shrunk_end_byte - shrunk_beg_byte;
4ed46869
KH
3455
3456 if (encodep)
6289dd10 3457 buflen = encoding_buffer_size (coding, shrunk_len_byte);
4ed46869 3458 else
6289dd10
RS
3459 buflen = decoding_buffer_size (coding, shrunk_len_byte);
3460 buf = get_conversion_buffer (buflen);
4ed46869
KH
3461
3462 coding->last_block = 1;
3463 produced = (encodep
6289dd10 3464 ? encode_coding (coding, begp, buf, shrunk_len_byte, buflen,
4ed46869 3465 &consumed)
6289dd10 3466 : decode_coding (coding, begp, buf, shrunk_len_byte, buflen,
4ed46869
KH
3467 &consumed));
3468
6289dd10 3469 TEMP_SET_PT_BOTH (shrunk_beg, shrunk_beg_byte);
4ed46869 3470 insert (buf, produced);
6289dd10
RS
3471 del_range_byte (PT_BYTE, PT_BYTE + shrunk_len_byte, 1);
3472
3473 if (opoint >= end)
3474 {
3475 opoint += ZV - zv_before;
3476 opoint_byte += ZV_BYTE - zv_byte_before;
3477 }
3478 else if (opoint > beg)
3479 {
3480 opoint = beg;
3481 opoint_byte = beg_byte;
3482 }
3483 TEMP_SET_PT_BOTH (opoint, opoint_byte);
3484
3485 end += ZV - zv_before;
3486 }
4ed46869
KH
3487
3488 if (!encodep && !NILP (coding->post_read_conversion))
3489 {
6289dd10
RS
3490 Lisp_Object insval;
3491
4ed46869
KH
3492 /* We must call a post-conversion function which may alter
3493 the text just converted. */
6289dd10
RS
3494 zv_before = ZV;
3495 zv_byte_before = ZV_BYTE;
4ed46869 3496
6289dd10
RS
3497 TEMP_SET_PT_BOTH (beg, beg_byte);
3498 insval = call1 (coding->post_read_conversion, make_number (end - beg));
4ed46869 3499 CHECK_NUMBER (insval, 0);
6289dd10
RS
3500
3501 if (opoint >= beg + ZV - zv_before)
3502 {
3503 opoint += ZV - zv_before;
3504 opoint_byte += ZV_BYTE - zv_byte_before;
3505 }
3506 else if (opoint > beg)
3507 {
3508 opoint = beg;
3509 opoint_byte = beg_byte;
3510 }
3511 TEMP_SET_PT_BOTH (opoint, opoint_byte);
4ed46869
KH
3512 len = XINT (insval);
3513 }
3514
3515 return make_number (len);
3516}
3517
6289dd10
RS
3518/* Encode or decode (according to ENCODEP) the text of string STR
3519 using coding CODING. If NOCOPY is nil, we never return STR
3520 itself, but always a copy. If NOCOPY is non-nil, we return STR
3521 if no change is needed. */
3522
4ed46869 3523Lisp_Object
e0e989f6
KH
3524code_convert_string (str, coding, encodep, nocopy)
3525 Lisp_Object str, nocopy;
4ed46869
KH
3526 struct coding_system *coding;
3527 int encodep;
3528{
3529 int len, consumed, produced;
3530 char *buf;
3531 unsigned char *begp, *endp;
3532 int head_skip, tail_skip;
3533 struct gcpro gcpro1;
3534
3535 if (encodep && !NILP (coding->pre_write_conversion)
3536 || !encodep && !NILP (coding->post_read_conversion))
3537 {
3538 /* Since we have to call Lisp functions which assume target text
3539 is in a buffer, after setting a temporary buffer, call
3540 code_convert_region. */
3541 int count = specpdl_ptr - specpdl;
3542 int len = XSTRING (str)->size;
3543 Lisp_Object result;
3544 struct buffer *old = current_buffer;
3545
3546 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3547 temp_output_buffer_setup (" *code-converting-work*");
3548 set_buffer_internal (XBUFFER (Vstandard_output));
3549 insert_from_string (str, 0, len, 0);
3550 code_convert_region (make_number (BEGV), make_number (ZV),
3551 coding, encodep);
3552 result = make_buffer_string (BEGV, ZV, 0);
3553 set_buffer_internal (old);
3554 return unbind_to (count, result);
3555 }
3556
3557 /* We may be able to shrink the conversion region. */
3558 begp = XSTRING (str)->data;
3559 endp = begp + XSTRING (str)->size;
3560 shrink_conversion_area (&begp, &endp, coding, encodep);
3561
3562 if (begp == endp)
3563 /* We need no conversion. */
e0e989f6 3564 return (NILP (nocopy) ? Fcopy_sequence (str) : str);
4ed46869
KH
3565
3566 head_skip = begp - XSTRING (str)->data;
3567 tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3568
3569 GCPRO1 (str);
3570
3571 if (encodep)
3572 len = encoding_buffer_size (coding, endp - begp);
3573 else
3574 len = decoding_buffer_size (coding, endp - begp);
3575 buf = get_conversion_buffer (len + head_skip + tail_skip);
3576
3577 bcopy (XSTRING (str)->data, buf, head_skip);
3578 coding->last_block = 1;
3579 produced = (encodep
3580 ? encode_coding (coding, XSTRING (str)->data + head_skip,
3581 buf + head_skip, endp - begp, len, &consumed)
3582 : decode_coding (coding, XSTRING (str)->data + head_skip,
3583 buf + head_skip, endp - begp, len, &consumed));
3584 bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3585 buf + head_skip + produced,
3586 tail_skip);
3587
3588 UNGCPRO;
3589
3590 return make_string (buf, head_skip + produced + tail_skip);
3591}
3592
3593DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
e0e989f6
KH
3594 3, 3, "r\nzCoding system: ",
3595 "Decode current region by specified coding system.\n\
3596When called from a program, takes three arguments:\n\
3597START, END, and CODING-SYSTEM. START END are buffer positions.\n\
4ed46869
KH
3598Return length of decoded text.")
3599 (b, e, coding_system)
3600 Lisp_Object b, e, coding_system;
3601{
3602 struct coding_system coding;
3603
3604 CHECK_NUMBER_COERCE_MARKER (b, 0);
3605 CHECK_NUMBER_COERCE_MARKER (e, 1);
3606 CHECK_SYMBOL (coding_system, 2);
3607
e0e989f6
KH
3608 if (NILP (coding_system))
3609 return make_number (XFASTINT (e) - XFASTINT (b));
4ed46869
KH
3610 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3611 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3612
3613 return code_convert_region (b, e, &coding, 0);
3614}
3615
3616DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
e0e989f6
KH
3617 3, 3, "r\nzCoding system: ",
3618 "Encode current region by specified coding system.\n\
3619When called from a program, takes three arguments:\n\
3620START, END, and CODING-SYSTEM. START END are buffer positions.\n\
4ed46869
KH
3621Return length of encoded text.")
3622 (b, e, coding_system)
3623 Lisp_Object b, e, coding_system;
3624{
3625 struct coding_system coding;
3626
3627 CHECK_NUMBER_COERCE_MARKER (b, 0);
3628 CHECK_NUMBER_COERCE_MARKER (e, 1);
3629 CHECK_SYMBOL (coding_system, 2);
3630
e0e989f6
KH
3631 if (NILP (coding_system))
3632 return make_number (XFASTINT (e) - XFASTINT (b));
4ed46869
KH
3633 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3634 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3635
3636 return code_convert_region (b, e, &coding, 1);
3637}
3638
3639DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
e0e989f6
KH
3640 2, 3, 0,
3641 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
fe487a71
RS
3642Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
3643if the decoding operation is trivial.")
e0e989f6
KH
3644 (string, coding_system, nocopy)
3645 Lisp_Object string, coding_system, nocopy;
4ed46869
KH
3646{
3647 struct coding_system coding;
3648
3649 CHECK_STRING (string, 0);
3650 CHECK_SYMBOL (coding_system, 1);
3651
e0e989f6
KH
3652 if (NILP (coding_system))
3653 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869
KH
3654 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3655 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3656
e0e989f6 3657 return code_convert_string (string, &coding, 0, nocopy);
4ed46869
KH
3658}
3659
3660DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
e0e989f6
KH
3661 2, 3, 0,
3662 "Encode STRING to CODING-SYSTEM, and return the result.\n\
fe487a71
RS
3663Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
3664if the encoding operation is trivial.")
e0e989f6
KH
3665 (string, coding_system, nocopy)
3666 Lisp_Object string, coding_system, nocopy;
4ed46869
KH
3667{
3668 struct coding_system coding;
3669
3670 CHECK_STRING (string, 0);
3671 CHECK_SYMBOL (coding_system, 1);
3672
e0e989f6
KH
3673 if (NILP (coding_system))
3674 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869
KH
3675 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3676 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3677
e0e989f6 3678 return code_convert_string (string, &coding, 1, nocopy);
4ed46869
KH
3679}
3680
3681DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
e0e989f6 3682 "Decode a JISX0208 character of shift-jis encoding.\n\
4ed46869
KH
3683CODE is the character code in SJIS.\n\
3684Return the corresponding character.")
3685 (code)
3686 Lisp_Object code;
3687{
3688 unsigned char c1, c2, s1, s2;
3689 Lisp_Object val;
3690
3691 CHECK_NUMBER (code, 0);
3692 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3693 DECODE_SJIS (s1, s2, c1, c2);
3694 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3695 return val;
3696}
3697
3698DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3699 "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3700Return the corresponding character code in SJIS.")
3701 (ch)
3702 Lisp_Object ch;
3703{
bcf26d6a 3704 int charset, c1, c2, s1, s2;
4ed46869
KH
3705 Lisp_Object val;
3706
3707 CHECK_NUMBER (ch, 0);
3708 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3709 if (charset == charset_jisx0208)
3710 {
3711 ENCODE_SJIS (c1, c2, s1, s2);
bcf26d6a 3712 XSETFASTINT (val, (s1 << 8) | s2);
4ed46869
KH
3713 }
3714 else
3715 XSETFASTINT (val, 0);
3716 return val;
3717}
3718
3719DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3720 "Decode a Big5 character CODE of BIG5 coding-system.\n\
3721CODE is the character code in BIG5.\n\
3722Return the corresponding character.")
3723 (code)
3724 Lisp_Object code;
3725{
3726 int charset;
3727 unsigned char b1, b2, c1, c2;
3728 Lisp_Object val;
3729
3730 CHECK_NUMBER (code, 0);
3731 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3732 DECODE_BIG5 (b1, b2, charset, c1, c2);
3733 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3734 return val;
3735}
3736
3737DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3738 "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3739Return the corresponding character code in Big5.")
3740 (ch)
3741 Lisp_Object ch;
3742{
bcf26d6a 3743 int charset, c1, c2, b1, b2;
4ed46869
KH
3744 Lisp_Object val;
3745
3746 CHECK_NUMBER (ch, 0);
3747 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3748 if (charset == charset_big5_1 || charset == charset_big5_2)
3749 {
3750 ENCODE_BIG5 (charset, c1, c2, b1, b2);
bcf26d6a 3751 XSETFASTINT (val, (b1 << 8) | b2);
4ed46869
KH
3752 }
3753 else
3754 XSETFASTINT (val, 0);
3755 return val;
3756}
3757
1ba9e4ab
KH
3758DEFUN ("set-terminal-coding-system-internal",
3759 Fset_terminal_coding_system_internal,
3760 Sset_terminal_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
3761 (coding_system)
3762 Lisp_Object coding_system;
3763{
3764 CHECK_SYMBOL (coding_system, 0);
3765 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
70c22245 3766 /* We had better not send unsafe characters to terminal. */
6e85d753
KH
3767 terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
3768
4ed46869
KH
3769 return Qnil;
3770}
3771
c4825358
KH
3772DEFUN ("set-safe-terminal-coding-system-internal",
3773 Fset_safe_terminal_coding_system_internal,
3774 Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
3775 (coding_system)
3776 Lisp_Object coding_system;
3777{
3778 CHECK_SYMBOL (coding_system, 0);
3779 setup_coding_system (Fcheck_coding_system (coding_system),
3780 &safe_terminal_coding);
3781 return Qnil;
3782}
3783
4ed46869
KH
3784DEFUN ("terminal-coding-system",
3785 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3786 "Return coding-system of your terminal.")
3787 ()
3788{
3789 return terminal_coding.symbol;
3790}
3791
1ba9e4ab
KH
3792DEFUN ("set-keyboard-coding-system-internal",
3793 Fset_keyboard_coding_system_internal,
3794 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
3795 (coding_system)
3796 Lisp_Object coding_system;
3797{
3798 CHECK_SYMBOL (coding_system, 0);
3799 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3800 return Qnil;
3801}
3802
3803DEFUN ("keyboard-coding-system",
3804 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3805 "Return coding-system of what is sent from terminal keyboard.")
3806 ()
3807{
3808 return keyboard_coding.symbol;
3809}
3810
3811\f
a5d301df
KH
3812DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
3813 Sfind_operation_coding_system, 1, MANY, 0,
3814 "Choose a coding system for an operation based on the target name.\n\
9ce27fde
KH
3815The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
3816DECODING-SYSTEM is the coding system to use for decoding\n\
3817\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
3818for encoding (in case OPERATION does encoding).\n\
ccdb79f5
RS
3819\n\
3820The first argument OPERATION specifies an I/O primitive:\n\
3821 For file I/O, `insert-file-contents' or `write-region'.\n\
3822 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3823 For network I/O, `open-network-stream'.\n\
3824\n\
3825The remaining arguments should be the same arguments that were passed\n\
3826to the primitive. Depending on which primitive, one of those arguments\n\
3827is selected as the TARGET. For example, if OPERATION does file I/O,\n\
3828whichever argument specifies the file name is TARGET.\n\
3829\n\
3830TARGET has a meaning which depends on OPERATION:\n\
4ed46869
KH
3831 For file I/O, TARGET is a file name.\n\
3832 For process I/O, TARGET is a process name.\n\
3833 For network I/O, TARGET is a service name or a port number\n\
3834\n\
02ba4723
KH
3835This function looks up what specified for TARGET in,\n\
3836`file-coding-system-alist', `process-coding-system-alist',\n\
3837or `network-coding-system-alist' depending on OPERATION.\n\
3838They may specify a coding system, a cons of coding systems,\n\
3839or a function symbol to call.\n\
3840In the last case, we call the function with one argument,\n\
9ce27fde 3841which is a list of all the arguments given to this function.")
4ed46869
KH
3842 (nargs, args)
3843 int nargs;
3844 Lisp_Object *args;
3845{
3846 Lisp_Object operation, target_idx, target, val;
3847 register Lisp_Object chain;
3848
3849 if (nargs < 2)
3850 error ("Too few arguments");
3851 operation = args[0];
3852 if (!SYMBOLP (operation)
3853 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3854 error ("Invalid first arguement");
3855 if (nargs < 1 + XINT (target_idx))
3856 error ("Too few arguments for operation: %s",
3857 XSYMBOL (operation)->name->data);
3858 target = args[XINT (target_idx) + 1];
3859 if (!(STRINGP (target)
3860 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3861 error ("Invalid %dth argument", XINT (target_idx) + 1);
3862
2e34157c
RS
3863 chain = ((EQ (operation, Qinsert_file_contents)
3864 || EQ (operation, Qwrite_region))
02ba4723 3865 ? Vfile_coding_system_alist
2e34157c 3866 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
3867 ? Vnetwork_coding_system_alist
3868 : Vprocess_coding_system_alist));
4ed46869
KH
3869 if (NILP (chain))
3870 return Qnil;
3871
02ba4723 3872 for (; CONSP (chain); chain = XCONS (chain)->cdr)
4ed46869 3873 {
f44d27ce
RS
3874 Lisp_Object elt;
3875 elt = XCONS (chain)->car;
4ed46869
KH
3876
3877 if (CONSP (elt)
3878 && ((STRINGP (target)
3879 && STRINGP (XCONS (elt)->car)
3880 && fast_string_match (XCONS (elt)->car, target) >= 0)
3881 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
02ba4723
KH
3882 {
3883 val = XCONS (elt)->cdr;
b19fd4c5
KH
3884 /* Here, if VAL is both a valid coding system and a valid
3885 function symbol, we return VAL as a coding system. */
02ba4723
KH
3886 if (CONSP (val))
3887 return val;
3888 if (! SYMBOLP (val))
3889 return Qnil;
3890 if (! NILP (Fcoding_system_p (val)))
3891 return Fcons (val, val);
b19fd4c5
KH
3892 if (! NILP (Ffboundp (val)))
3893 {
3894 val = call1 (val, Flist (nargs, args));
3895 if (CONSP (val))
3896 return val;
3897 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
3898 return Fcons (val, val);
3899 }
02ba4723
KH
3900 return Qnil;
3901 }
4ed46869
KH
3902 }
3903 return Qnil;
3904}
3905
3906#endif /* emacs */
3907
3908\f
3909/*** 8. Post-amble ***/
3910
3911init_coding_once ()
3912{
3913 int i;
3914
0ef69138 3915 /* Emacs' internal format specific initialize routine. */
4ed46869
KH
3916 for (i = 0; i <= 0x20; i++)
3917 emacs_code_class[i] = EMACS_control_code;
3918 emacs_code_class[0x0A] = EMACS_linefeed_code;
3919 emacs_code_class[0x0D] = EMACS_carriage_return_code;
3920 for (i = 0x21 ; i < 0x7F; i++)
3921 emacs_code_class[i] = EMACS_ascii_code;
3922 emacs_code_class[0x7F] = EMACS_control_code;
3923 emacs_code_class[0x80] = EMACS_leading_code_composition;
3924 for (i = 0x81; i < 0xFF; i++)
3925 emacs_code_class[i] = EMACS_invalid_code;
3926 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3927 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3928 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3929 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3930
3931 /* ISO2022 specific initialize routine. */
3932 for (i = 0; i < 0x20; i++)
3933 iso_code_class[i] = ISO_control_code;
3934 for (i = 0x21; i < 0x7F; i++)
3935 iso_code_class[i] = ISO_graphic_plane_0;
3936 for (i = 0x80; i < 0xA0; i++)
3937 iso_code_class[i] = ISO_control_code;
3938 for (i = 0xA1; i < 0xFF; i++)
3939 iso_code_class[i] = ISO_graphic_plane_1;
3940 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3941 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3942 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3943 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3944 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3945 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3946 iso_code_class[ISO_CODE_ESC] = ISO_escape;
3947 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3948 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3949 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3950
e0e989f6
KH
3951 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3952 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3953
3954 setup_coding_system (Qnil, &keyboard_coding);
3955 setup_coding_system (Qnil, &terminal_coding);
c4825358 3956 setup_coding_system (Qnil, &safe_terminal_coding);
9ce27fde
KH
3957
3958#if defined (MSDOS) || defined (WINDOWSNT)
3959 system_eol_type = CODING_EOL_CRLF;
3960#else
3961 system_eol_type = CODING_EOL_LF;
3962#endif
e0e989f6
KH
3963}
3964
3965#ifdef emacs
3966
3967syms_of_coding ()
3968{
3969 Qtarget_idx = intern ("target-idx");
3970 staticpro (&Qtarget_idx);
3971
bb0115a2
RS
3972 Qcoding_system_history = intern ("coding-system-history");
3973 staticpro (&Qcoding_system_history);
3974 Fset (Qcoding_system_history, Qnil);
3975
9ce27fde 3976 /* Target FILENAME is the first argument. */
e0e989f6 3977 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 3978 /* Target FILENAME is the third argument. */
e0e989f6
KH
3979 Fput (Qwrite_region, Qtarget_idx, make_number (2));
3980
3981 Qcall_process = intern ("call-process");
3982 staticpro (&Qcall_process);
9ce27fde 3983 /* Target PROGRAM is the first argument. */
e0e989f6
KH
3984 Fput (Qcall_process, Qtarget_idx, make_number (0));
3985
3986 Qcall_process_region = intern ("call-process-region");
3987 staticpro (&Qcall_process_region);
9ce27fde 3988 /* Target PROGRAM is the third argument. */
e0e989f6
KH
3989 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3990
3991 Qstart_process = intern ("start-process");
3992 staticpro (&Qstart_process);
9ce27fde 3993 /* Target PROGRAM is the third argument. */
e0e989f6
KH
3994 Fput (Qstart_process, Qtarget_idx, make_number (2));
3995
3996 Qopen_network_stream = intern ("open-network-stream");
3997 staticpro (&Qopen_network_stream);
9ce27fde 3998 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
3999 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
4000
4ed46869
KH
4001 Qcoding_system = intern ("coding-system");
4002 staticpro (&Qcoding_system);
4003
4004 Qeol_type = intern ("eol-type");
4005 staticpro (&Qeol_type);
4006
4007 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
4008 staticpro (&Qbuffer_file_coding_system);
4009
4010 Qpost_read_conversion = intern ("post-read-conversion");
4011 staticpro (&Qpost_read_conversion);
4012
4013 Qpre_write_conversion = intern ("pre-write-conversion");
4014 staticpro (&Qpre_write_conversion);
4015
27901516
KH
4016 Qno_conversion = intern ("no-conversion");
4017 staticpro (&Qno_conversion);
4018
4019 Qundecided = intern ("undecided");
4020 staticpro (&Qundecided);
4021
4ed46869
KH
4022 Qcoding_system_p = intern ("coding-system-p");
4023 staticpro (&Qcoding_system_p);
4024
4025 Qcoding_system_error = intern ("coding-system-error");
4026 staticpro (&Qcoding_system_error);
4027
4028 Fput (Qcoding_system_error, Qerror_conditions,
4029 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
4030 Fput (Qcoding_system_error, Qerror_message,
9ce27fde 4031 build_string ("Invalid coding system"));
4ed46869
KH
4032
4033 Qcoding_category_index = intern ("coding-category-index");
4034 staticpro (&Qcoding_category_index);
4035
4036 {
4037 int i;
4038 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4039 {
4040 coding_category_table[i] = intern (coding_category_name[i]);
4041 staticpro (&coding_category_table[i]);
4042 Fput (coding_category_table[i], Qcoding_category_index,
4043 make_number (i));
4044 }
4045 }
4046
bdd9fb48
KH
4047 Qcharacter_unification_table = intern ("character-unification-table");
4048 staticpro (&Qcharacter_unification_table);
4049 Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
4050 make_number (0));
4051
a5d301df
KH
4052 Qcharacter_unification_table_for_decode
4053 = intern ("character-unification-table-for-decode");
4054 staticpro (&Qcharacter_unification_table_for_decode);
4055
4056 Qcharacter_unification_table_for_encode
4057 = intern ("character-unification-table-for-encode");
4058 staticpro (&Qcharacter_unification_table_for_encode);
4059
70c22245
KH
4060 Qsafe_charsets = intern ("safe-charsets");
4061 staticpro (&Qsafe_charsets);
4062
9ce27fde
KH
4063 Qemacs_mule = intern ("emacs-mule");
4064 staticpro (&Qemacs_mule);
4065
4ed46869
KH
4066 defsubr (&Scoding_system_p);
4067 defsubr (&Sread_coding_system);
4068 defsubr (&Sread_non_nil_coding_system);
4069 defsubr (&Scheck_coding_system);
4070 defsubr (&Sdetect_coding_region);
4071 defsubr (&Sdecode_coding_region);
4072 defsubr (&Sencode_coding_region);
4073 defsubr (&Sdecode_coding_string);
4074 defsubr (&Sencode_coding_string);
4075 defsubr (&Sdecode_sjis_char);
4076 defsubr (&Sencode_sjis_char);
4077 defsubr (&Sdecode_big5_char);
4078 defsubr (&Sencode_big5_char);
1ba9e4ab 4079 defsubr (&Sset_terminal_coding_system_internal);
c4825358 4080 defsubr (&Sset_safe_terminal_coding_system_internal);
4ed46869 4081 defsubr (&Sterminal_coding_system);
1ba9e4ab 4082 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 4083 defsubr (&Skeyboard_coding_system);
a5d301df 4084 defsubr (&Sfind_operation_coding_system);
4ed46869 4085
4608c386
KH
4086 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
4087 "List of coding systems.\n\
4088\n\
4089Do not alter the value of this variable manually. This variable should be\n\
4090updated by the functions `make-coding-system' and\n\
4091`define-coding-system-alias'.");
4092 Vcoding_system_list = Qnil;
4093
4094 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
4095 "Alist of coding system names.\n\
4096Each element is one element list of coding system name.\n\
4097This variable is given to `completing-read' as TABLE argument.\n\
4098\n\
4099Do not alter the value of this variable manually. This variable should be\n\
4100updated by the functions `make-coding-system' and\n\
4101`define-coding-system-alias'.");
4102 Vcoding_system_alist = Qnil;
4103
4ed46869
KH
4104 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
4105 "List of coding-categories (symbols) ordered by priority.");
4106 {
4107 int i;
4108
4109 Vcoding_category_list = Qnil;
4110 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
4111 Vcoding_category_list
4112 = Fcons (coding_category_table[i], Vcoding_category_list);
4113 }
4114
4115 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
10bff6f1 4116 "Specify the coding system for read operations.\n\
2ebb362d 4117It is useful to bind this variable with `let', but do not set it globally.\n\
4ed46869 4118If the value is a coding system, it is used for decoding on read operation.\n\
a67a9c66 4119If not, an appropriate element is used from one of the coding system alists:\n\
10bff6f1 4120There are three such tables, `file-coding-system-alist',\n\
a67a9c66 4121`process-coding-system-alist', and `network-coding-system-alist'.");
4ed46869
KH
4122 Vcoding_system_for_read = Qnil;
4123
4124 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
10bff6f1 4125 "Specify the coding system for write operations.\n\
2ebb362d 4126It is useful to bind this variable with `let', but do not set it globally.\n\
4ed46869 4127If the value is a coding system, it is used for encoding on write operation.\n\
a67a9c66 4128If not, an appropriate element is used from one of the coding system alists:\n\
10bff6f1 4129There are three such tables, `file-coding-system-alist',\n\
a67a9c66 4130`process-coding-system-alist', and `network-coding-system-alist'.");
4ed46869
KH
4131 Vcoding_system_for_write = Qnil;
4132
4133 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
a67a9c66 4134 "Coding system used in the latest file or process I/O.");
4ed46869
KH
4135 Vlast_coding_system_used = Qnil;
4136
9ce27fde
KH
4137 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
4138 "*Non-nil inhibit code conversion of end-of-line format in any cases.");
4139 inhibit_eol_conversion = 0;
4140
02ba4723
KH
4141 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
4142 "Alist to decide a coding system to use for a file I/O operation.\n\
4143The format is ((PATTERN . VAL) ...),\n\
4144where PATTERN is a regular expression matching a file name,\n\
4145VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4146If VAL is a coding system, it is used for both decoding and encoding\n\
4147the file contents.\n\
4148If VAL is a cons of coding systems, the car part is used for decoding,\n\
4149and the cdr part is used for encoding.\n\
4150If VAL is a function symbol, the function must return a coding system\n\
4151or a cons of coding systems which are used as above.\n\
e0e989f6 4152\n\
9ce27fde 4153See also the function `find-operation-coding-system'.");
02ba4723
KH
4154 Vfile_coding_system_alist = Qnil;
4155
4156 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
4157 "Alist to decide a coding system to use for a process I/O operation.\n\
4158The format is ((PATTERN . VAL) ...),\n\
4159where PATTERN is a regular expression matching a program name,\n\
4160VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4161If VAL is a coding system, it is used for both decoding what received\n\
4162from the program and encoding what sent to the program.\n\
4163If VAL is a cons of coding systems, the car part is used for decoding,\n\
4164and the cdr part is used for encoding.\n\
4165If VAL is a function symbol, the function must return a coding system\n\
4166or a cons of coding systems which are used as above.\n\
4ed46869 4167\n\
9ce27fde 4168See also the function `find-operation-coding-system'.");
02ba4723
KH
4169 Vprocess_coding_system_alist = Qnil;
4170
4171 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
4172 "Alist to decide a coding system to use for a network I/O operation.\n\
4173The format is ((PATTERN . VAL) ...),\n\
4174where PATTERN is a regular expression matching a network service name\n\
4175or is a port number to connect to,\n\
4176VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4177If VAL is a coding system, it is used for both decoding what received\n\
4178from the network stream and encoding what sent to the network stream.\n\
4179If VAL is a cons of coding systems, the car part is used for decoding,\n\
4180and the cdr part is used for encoding.\n\
4181If VAL is a function symbol, the function must return a coding system\n\
4182or a cons of coding systems which are used as above.\n\
4ed46869 4183\n\
9ce27fde 4184See also the function `find-operation-coding-system'.");
02ba4723 4185 Vnetwork_coding_system_alist = Qnil;
4ed46869
KH
4186
4187 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
4188 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
458822a0 4189 eol_mnemonic_unix = ':';
4ed46869
KH
4190
4191 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
4192 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
458822a0 4193 eol_mnemonic_dos = '\\';
4ed46869
KH
4194
4195 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
4196 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
458822a0 4197 eol_mnemonic_mac = '/';
4ed46869
KH
4198
4199 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
4200 "Mnemonic character indicating end-of-line format is not yet decided.");
458822a0 4201 eol_mnemonic_undecided = ':';
4ed46869 4202
bdd9fb48
KH
4203 DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
4204 "Non-nil means ISO 2022 encoder/decoder do character unification.");
4205 Venable_character_unification = Qt;
4206
a5d301df
KH
4207 DEFVAR_LISP ("standard-character-unification-table-for-decode",
4208 &Vstandard_character_unification_table_for_decode,
bdd9fb48 4209 "Table for unifying characters when reading.");
a5d301df 4210 Vstandard_character_unification_table_for_decode = Qnil;
bdd9fb48 4211
a5d301df
KH
4212 DEFVAR_LISP ("standard-character-unification-table-for-encode",
4213 &Vstandard_character_unification_table_for_encode,
bdd9fb48 4214 "Table for unifying characters when writing.");
a5d301df 4215 Vstandard_character_unification_table_for_encode = Qnil;
4ed46869
KH
4216
4217 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
4218 "Alist of charsets vs revision numbers.\n\
4219While encoding, if a charset (car part of an element) is found,\n\
4220designate it with the escape sequence identifing revision (cdr part of the element).");
4221 Vcharset_revision_alist = Qnil;
02ba4723
KH
4222
4223 DEFVAR_LISP ("default-process-coding-system",
4224 &Vdefault_process_coding_system,
4225 "Cons of coding systems used for process I/O by default.\n\
4226The car part is used for decoding a process output,\n\
4227the cdr part is used for encoding a text to be sent to a process.");
4228 Vdefault_process_coding_system = Qnil;
c4825358 4229
3f003981
KH
4230 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
4231 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
c4825358
KH
4232This is a vector of length 256.\n\
4233If Nth element is non-nil, the existence of code N in a file\n\
bb0115a2 4234\(or output of subprocess) doesn't prevent it to be detected as\n\
3f003981
KH
4235a coding system of ISO 2022 variant which has a flag\n\
4236`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
c4825358
KH
4237or reading output of a subprocess.\n\
4238Only 128th through 159th elements has a meaning.");
3f003981 4239 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
4ed46869
KH
4240}
4241
4242#endif /* emacs */