(message-unix-mail-delimiter): Initialize
[bpt/emacs.git] / src / coding.c
CommitLineData
4ed46869 1/* Coding system handler (conversion, detection, and etc).
203cb916
RS
2 Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4ed46869 4
369314dc
KH
5This file is part of GNU Emacs.
6
7GNU Emacs is free software; you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation; either version 2, or (at your option)
10any later version.
4ed46869 11
369314dc
KH
12GNU Emacs is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
4ed46869 16
369314dc
KH
17You should have received a copy of the GNU General Public License
18along with GNU Emacs; see the file COPYING. If not, write to
19the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20Boston, MA 02111-1307, USA. */
4ed46869
KH
21
22/*** TABLE OF CONTENTS ***
23
24 1. Preamble
0ef69138 25 2. Emacs' internal format (emacs-mule) handlers
4ed46869
KH
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
28 5. End-of-line handlers
29 6. C library functions
30 7. Emacs Lisp library functions
31 8. Post-amble
32
33*/
34
35/*** GENERAL NOTE on CODING SYSTEM ***
36
37 Coding system is an encoding mechanism of one or more character
38 sets. Here's a list of coding systems which Emacs can handle. When
39 we say "decode", it means converting some other coding system to
0ef69138
KH
40 Emacs' internal format (emacs-mule), and when we say "encode",
41 it means converting the coding system emacs-mule to some other
42 coding system.
4ed46869 43
0ef69138 44 0. Emacs' internal format (emacs-mule)
4ed46869
KH
45
46 Emacs itself holds a multi-lingual character in a buffer and a string
f4dee582 47 in a special format. Details are described in section 2.
4ed46869
KH
48
49 1. ISO2022
50
51 The most famous coding system for multiple character sets. X's
f4dee582
RS
52 Compound Text, various EUCs (Extended Unix Code), and coding
53 systems used in Internet communication such as ISO-2022-JP are
54 all variants of ISO2022. Details are described in section 3.
4ed46869
KH
55
56 2. SJIS (or Shift-JIS or MS-Kanji-Code)
57
58 A coding system to encode character sets: ASCII, JISX0201, and
59 JISX0208. Widely used for PC's in Japan. Details are described in
f4dee582 60 section 4.
4ed46869
KH
61
62 3. BIG5
63
64 A coding system to encode character sets: ASCII and Big5. Widely
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
f4dee582
RS
66 described in section 4. In this file, when we write "BIG5"
67 (all uppercase), we mean the coding system, and when we write
68 "Big5" (capitalized), we mean the character set.
4ed46869 69
27901516
KH
70 4. Raw text
71
72 A coding system for a text containing random 8-bit code. Emacs
73 does no code conversion on such a text except for end-of-line
74 format.
75
76 5. Other
4ed46869 77
f4dee582 78 If a user wants to read/write a text encoded in a coding system not
4ed46869
KH
79 listed above, he can supply a decoder and an encoder for it in CCL
80 (Code Conversion Language) programs. Emacs executes the CCL program
81 while reading/writing.
82
f4dee582 83 Emacs represents a coding-system by a Lisp symbol that has a property
4ed46869
KH
84 `coding-system'. But, before actually using the coding-system, the
85 information about it is set in a structure of type `struct
f4dee582 86 coding_system' for rapid processing. See section 6 for more details.
4ed46869
KH
87
88*/
89
90/*** GENERAL NOTES on END-OF-LINE FORMAT ***
91
92 How end-of-line of a text is encoded depends on a system. For
93 instance, Unix's format is just one byte of `line-feed' code,
f4dee582 94 whereas DOS's format is two-byte sequence of `carriage-return' and
4ed46869
KH
95 `line-feed' codes. MacOS's format is one byte of `carriage-return'.
96
f4dee582
RS
97 Since text characters encoding and end-of-line encoding are
98 independent, any coding system described above can take
4ed46869 99 any format of end-of-line. So, Emacs has information of format of
f4dee582 100 end-of-line in each coding-system. See section 6 for more details.
4ed46869
KH
101
102*/
103
104/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
105
106 These functions check if a text between SRC and SRC_END is encoded
107 in the coding system category XXX. Each returns an integer value in
108 which appropriate flag bits for the category XXX is set. The flag
109 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
110 template of these functions. */
111#if 0
112int
0ef69138 113detect_coding_emacs_mule (src, src_end)
4ed46869
KH
114 unsigned char *src, *src_end;
115{
116 ...
117}
118#endif
119
120/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
121
122 These functions decode SRC_BYTES length text at SOURCE encoded in
0ef69138 123 CODING to Emacs' internal format (emacs-mule). The resulting text
f4dee582
RS
124 goes to a place pointed to by DESTINATION, the length of which should
125 not exceed DST_BYTES. The number of bytes actually processed is
126 returned as *CONSUMED. The return value is the length of the decoded
127 text. Below is a template of these functions. */
4ed46869
KH
128#if 0
129decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
130 struct coding_system *coding;
131 unsigned char *source, *destination;
132 int src_bytes, dst_bytes;
133 int *consumed;
134{
135 ...
136}
137#endif
138
139/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
140
0ef69138
KH
141 These functions encode SRC_BYTES length text at SOURCE of Emacs'
142 internal format (emacs-mule) to CODING. The resulting text goes to
f4dee582
RS
143 a place pointed to by DESTINATION, the length of which should not
144 exceed DST_BYTES. The number of bytes actually processed is
145 returned as *CONSUMED. The return value is the length of the
146 encoded text. Below is a template of these functions. */
4ed46869
KH
147#if 0
148encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
149 struct coding_system *coding;
150 unsigned char *source, *destination;
151 int src_bytes, dst_bytes;
152 int *consumed;
153{
154 ...
155}
156#endif
157
158/*** COMMONLY USED MACROS ***/
159
/* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
   THREE_MORE_BYTES fetch one, two, and three bytes from the source
   text respectively, storing them in their arguments and advancing
   `src' past them.  If fewer bytes than requested remain before
   `src_end', nothing is consumed and control jumps to
   `label_end_of_loop'.  The caller must have the variables `src' and
   `src_end' and the label `label_end_of_loop' in scope.

   The number of remaining bytes is computed as `src_end - src', so
   no pointer beyond `src_end' is ever formed (forming one would be
   undefined behavior when `src_end' is the end of the buffer).  */

#define ONE_MORE_BYTE(c1)                       \
  do {                                          \
    if (src_end - src < 1)                      \
      goto label_end_of_loop;                   \
    (c1) = *src++;                              \
  } while (0)

#define TWO_MORE_BYTES(c1, c2)                  \
  do {                                          \
    if (src_end - src < 2)                      \
      goto label_end_of_loop;                   \
    (c1) = *src++;                              \
    (c2) = *src++;                              \
  } while (0)

#define THREE_MORE_BYTES(c1, c2, c3)            \
  do {                                          \
    if (src_end - src < 3)                      \
      goto label_end_of_loop;                   \
    (c1) = *src++;                              \
    (c2) = *src++;                              \
    (c3) = *src++;                              \
  } while (0)
189
190/* The following three macros DECODE_CHARACTER_ASCII,
191 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
192 the multi-byte form of a character of each class at the place
193 pointed by `dst'. The caller should set the variable `dst' to
194 point to an appropriate area and the variable `coding' to point to
195 the coding-system of the currently decoding text in advance. */
196
/* Decode one ASCII character C and store it at `dst', advancing
   `dst'.  When COMPOSING_P (coding->composing) holds, the character
   is written as the two bytes 0xA0 and C | 0x80 (the form used for an
   ASCII component of a composite character); otherwise it is written
   as the single byte C.  */

#define DECODE_CHARACTER_ASCII(c)               \
  do {                                          \
    if (!COMPOSING_P (coding->composing))       \
      *dst++ = (c);                             \
    else                                        \
      {                                         \
        *dst++ = 0xA0;                          \
        *dst++ = (c) | 0x80;                    \
      }                                         \
  } while (0)
206
/* Decode one DIMENSION1 character whose charset is CHARSET and whose
   position-code is C, storing its multi-byte form at `dst'.

   The base leading-code of CHARSET is written first (with 0x20 added
   when COMPOSING_P (coding->composing) holds), then the extended
   leading-code if CHARSET has a non-zero one, and finally C with the
   high bit set.  */

#define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
  do {                                                                  \
    unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
    if (COMPOSING_P (coding->composing))                                \
      *dst++ = leading_code + 0x20;                                     \
    else                                                                \
      *dst++ = leading_code;                                            \
    /* Intentional assignment: fetch the extended leading-code and      \
       emit it only when it is non-zero.  */                            \
    if ((leading_code = CHARSET_LEADING_CODE_EXT (charset)) != 0)       \
      *dst++ = leading_code;                                            \
    *dst++ = (c) | 0x80;                                                \
  } while (0)

/* Decode one DIMENSION2 character whose charset is CHARSET and whose
   position-codes are C1 and C2.  The leading-code(s) and C1 are
   emitted exactly as for a DIMENSION1 character, followed by C2 with
   the high bit set.  */

#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
  do {                                                  \
    DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
    *dst++ = (c2) | 0x80;                               \
  } while (0)
230
231\f
232/*** 1. Preamble ***/
233
234#include <stdio.h>
235
236#ifdef emacs
237
238#include <config.h>
239#include "lisp.h"
240#include "buffer.h"
241#include "charset.h"
242#include "ccl.h"
243#include "coding.h"
244#include "window.h"
245
246#else /* not emacs */
247
248#include "mulelib.h"
249
250#endif /* not emacs */
251
252Lisp_Object Qcoding_system, Qeol_type;
253Lisp_Object Qbuffer_file_coding_system;
254Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
27901516 255Lisp_Object Qno_conversion, Qundecided;
bb0115a2 256Lisp_Object Qcoding_system_history;
4ed46869
KH
257
258extern Lisp_Object Qinsert_file_contents, Qwrite_region;
259Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
260Lisp_Object Qstart_process, Qopen_network_stream;
261Lisp_Object Qtarget_idx;
262
263/* Mnemonic character of each format of end-of-line. */
264int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
265/* Mnemonic character to indicate format of end-of-line is not yet
266 decided. */
267int eol_mnemonic_undecided;
268
9ce27fde
KH
269/* Format of end-of-line decided by system. This is CODING_EOL_LF on
270 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
271int system_eol_type;
272
4ed46869
KH
273#ifdef emacs
274
02ba4723 275Lisp_Object Qcoding_system_spec, Qcoding_system_p, Qcoding_system_error;
4ed46869 276
9ce27fde
KH
277/* Coding system emacs-mule is for converting only end-of-line format. */
278Lisp_Object Qemacs_mule;
279
4ed46869
KH
280/* Coding-systems are handed between Emacs Lisp programs and C internal
281 routines by the following three variables. */
282/* Coding-system for reading files and receiving data from process. */
283Lisp_Object Vcoding_system_for_read;
284/* Coding-system for writing files and sending data to process. */
285Lisp_Object Vcoding_system_for_write;
286/* Coding-system actually used in the latest I/O. */
287Lisp_Object Vlast_coding_system_used;
288
c4825358 289/* A vector of length 256 which contains information about special
3f003981
KH
290 Latin codes (especially for dealing with Microsoft code). */
291Lisp_Object Vlatin_extra_code_table;
c4825358 292
9ce27fde
KH
293/* Flag to inhibit code conversion of end-of-line format. */
294int inhibit_eol_conversion;
295
c4825358 296/* Coding system to be used to encode text for terminal display. */
4ed46869
KH
297struct coding_system terminal_coding;
298
c4825358
KH
299/* Coding system to be used to encode text for terminal display when
300 terminal coding system is nil. */
301struct coding_system safe_terminal_coding;
302
303/* Coding system of what is sent from terminal keyboard. */
4ed46869
KH
304struct coding_system keyboard_coding;
305
02ba4723
KH
306Lisp_Object Vfile_coding_system_alist;
307Lisp_Object Vprocess_coding_system_alist;
308Lisp_Object Vnetwork_coding_system_alist;
4ed46869
KH
309
310#endif /* emacs */
311
312Lisp_Object Qcoding_category_index;
313
314/* List of symbols `coding-category-xxx' ordered by priority. */
315Lisp_Object Vcoding_category_list;
316
317/* Table of coding-systems currently assigned to each coding-category. */
318Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
319
320/* Table of names of symbol for each coding-category. */
321char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
0ef69138 322 "coding-category-emacs-mule",
4ed46869
KH
323 "coding-category-sjis",
324 "coding-category-iso-7",
325 "coding-category-iso-8-1",
326 "coding-category-iso-8-2",
7717c392
KH
327 "coding-category-iso-7-else",
328 "coding-category-iso-8-else",
4ed46869 329 "coding-category-big5",
27901516 330 "coding-category-raw-text",
4ed46869
KH
331 "coding-category-binary"
332};
333
bdd9fb48
KH
334/* Flag to tell if we look up unification table on character code
335 conversion. */
336Lisp_Object Venable_character_unification;
a5d301df
KH
337/* Standard unification table to look up on decoding (reading). */
338Lisp_Object Vstandard_character_unification_table_for_decode;
339/* Standard unification table to look up on encoding (writing). */
340Lisp_Object Vstandard_character_unification_table_for_encode;
bdd9fb48
KH
341
342Lisp_Object Qcharacter_unification_table;
a5d301df
KH
343Lisp_Object Qcharacter_unification_table_for_decode;
344Lisp_Object Qcharacter_unification_table_for_encode;
4ed46869
KH
345
346/* Alist of charsets vs revision number. */
347Lisp_Object Vcharset_revision_alist;
348
02ba4723
KH
349/* Default coding systems used for process I/O. */
350Lisp_Object Vdefault_process_coding_system;
351
4ed46869 352\f
0ef69138 353/*** 2. Emacs internal format (emacs-mule) handlers ***/
4ed46869
KH
354
355/* Emacs' internal format for encoding multiple character sets is a
f4dee582
RS
356 kind of multi-byte encoding, i.e. characters are encoded by
357 variable-length sequences of one-byte codes. ASCII characters
358 and control characters (e.g. `tab', `newline') are represented by
359 one-byte sequences which are their ASCII codes, in the range 0x00
360 through 0x7F. The other characters are represented by a sequence
361 of `base leading-code', optional `extended leading-code', and one
362 or two `position-code's. The length of the sequence is determined
363 by the base leading-code. Leading-code takes the range 0x80
364 through 0x9F, whereas extended leading-code and position-code take
365 the range 0xA0 through 0xFF. See `charset.h' for more details
366 about leading-code and position-code.
367
368 There's one exception to this rule. Special leading-code
4ed46869
KH
369 `leading-code-composition' denotes that the following several
370 characters should be composed into one character. Leading-codes of
371 components (except for ASCII) are added 0x20. An ASCII character
372 component is represented by a 2-byte sequence of `0xA0' and
f4dee582
RS
373 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
374 details of composite character. Hence, we can summarize the code
4ed46869
KH
375 range as follows:
376
377 --- CODE RANGE of Emacs' internal format ---
378 (character set) (range)
379 ASCII 0x00 .. 0x7F
380 ELSE (1st byte) 0x80 .. 0x9F
381 (rest bytes) 0xA0 .. 0xFF
382 ---------------------------------------------
383
384 */
385
386enum emacs_code_class_type emacs_code_class[256];
387
/* Consume one byte at SRC and go on to the next statement only if
   that byte is 0xA0 or greater.  If the byte is less than 0xA0, make
   the enclosing function return 0.  If no byte is available (SRC has
   reached SRC_END), jump to `label_end_of_switch' without consuming
   anything.  */
#define CHECK_CODE_RANGE_A0_FF          \
  do {                                  \
    if (src < src_end)                  \
      {                                 \
        if (*src++ < 0xA0)              \
          return 0;                     \
      }                                 \
    else                                \
      goto label_end_of_switch;         \
  } while (0)
397
398/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
399 Check if a text is encoded in Emacs' internal format. If it is,
0ef69138 400 return CODING_CATEGORY_MASK_EMASC_MULE, else return 0. */
4ed46869
KH
401
402int
0ef69138 403detect_coding_emacs_mule (src, src_end)
4ed46869
KH
404 unsigned char *src, *src_end;
405{
406 unsigned char c;
407 int composing = 0;
408
409 while (src < src_end)
410 {
411 c = *src++;
412
413 if (composing)
414 {
415 if (c < 0xA0)
416 composing = 0;
417 else
418 c -= 0x20;
419 }
420
421 switch (emacs_code_class[c])
422 {
423 case EMACS_ascii_code:
424 case EMACS_linefeed_code:
425 break;
426
427 case EMACS_control_code:
428 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
429 return 0;
430 break;
431
432 case EMACS_invalid_code:
433 return 0;
434
435 case EMACS_leading_code_composition: /* c == 0x80 */
436 if (composing)
437 CHECK_CODE_RANGE_A0_FF;
438 else
439 composing = 1;
440 break;
441
442 case EMACS_leading_code_4:
443 CHECK_CODE_RANGE_A0_FF;
444 /* fall down to check it two more times ... */
445
446 case EMACS_leading_code_3:
447 CHECK_CODE_RANGE_A0_FF;
448 /* fall down to check it one more time ... */
449
450 case EMACS_leading_code_2:
451 CHECK_CODE_RANGE_A0_FF;
452 break;
453
454 default:
455 label_end_of_switch:
456 break;
457 }
458 }
0ef69138 459 return CODING_CATEGORY_MASK_EMACS_MULE;
4ed46869
KH
460}
461
462\f
463/*** 3. ISO2022 handlers ***/
464
465/* The following note describes the coding system ISO2022 briefly.
f4dee582
RS
466 Since the intention of this note is to help in understanding of
467 the programs in this file, some parts are NOT ACCURATE or OVERLY
4ed46869
KH
468 SIMPLIFIED. For the thorough understanding, please refer to the
469 original document of ISO2022.
470
471 ISO2022 provides many mechanisms to encode several character sets
f4dee582 472 in 7-bit and 8-bit environment. If one chooses 7-bit environment,
4ed46869 473 all text is encoded by codes of less than 128. This may make the
f4dee582
RS
474 encoded text a little bit longer, but the text gets more stability
475 to pass through several gateways (some of them strip off the MSB).
4ed46869 476
f4dee582 477 There are two kinds of character set: control character set and
4ed46869
KH
478 graphic character set. The former contains control characters such
479 as `newline' and `escape' to provide control functions (control
f4dee582 480 functions are provided also by escape sequences). The latter
4ed46869
KH
481 contains graphic characters such as 'A' and '-'. Emacs recognizes
482 two control character sets and many graphic character sets.
483
484 Graphic character sets are classified into one of the following
485 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
486 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
487 bytes (DIMENSION) and the number of characters in one dimension
488 (CHARS) of the set. In addition, each character set is assigned an
489 identification tag (called "final character" and denoted as <F>
490 here after) which is unique in each class. <F> of each character
491 set is decided by ECMA(*) when it is registered in ISO. Code range
492 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
493
494 Note (*): ECMA = European Computer Manufacturers Association
495
496 Here are examples of graphic character set [NAME(<F>)]:
497 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
498 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
499 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
500 o DIMENSION2_CHARS96 -- none for the moment
501
502 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
503 C0 [0x00..0x1F] -- control character plane 0
504 GL [0x20..0x7F] -- graphic character plane 0
505 C1 [0x80..0x9F] -- control character plane 1
506 GR [0xA0..0xFF] -- graphic character plane 1
507
508 A control character set is directly designated and invoked to C0 or
509 C1 by an escape sequence. The most common case is that ISO646's
510 control character set is designated/invoked to C0 and ISO6429's
511 control character set is designated/invoked to C1, and usually
512 these designations/invocations are omitted in a coded text. With
513 7-bit environment, only C0 can be used, and a control character for
514 C1 is encoded by an appropriate escape sequence to fit in the
515 environment. All control characters for C1 are defined the
516 corresponding escape sequences.
517
518 A graphic character set is at first designated to one of four
519 graphic registers (G0 through G3), then these graphic registers are
520 invoked to GL or GR. These designations and invocations can be
521 done independently. The most common case is that G0 is invoked to
522 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
523 these invocations and designations are omitted in a coded text.
524 With 7-bit environment, only GL can be used.
525
526 When a graphic character set of CHARS94 is invoked to GL, code 0x20
527 and 0x7F of GL area work as control characters SPACE and DEL
528 respectively, and code 0xA0 and 0xFF of GR area should not be used.
529
530 There are two ways of invocation: locking-shift and single-shift.
531 With locking-shift, the invocation lasts until the next different
532 invocation, whereas with single-shift, the invocation works only
533 for the following character and doesn't affect locking-shift.
534 Invocations are done by the following control characters or escape
535 sequences.
536
537 ----------------------------------------------------------------------
538 function control char escape sequence description
539 ----------------------------------------------------------------------
540 SI (shift-in) 0x0F none invoke G0 to GL
10bff6f1 541 SO (shift-out) 0x0E none invoke G1 to GL
4ed46869
KH
542 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
543 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
544 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
545 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
546 ----------------------------------------------------------------------
547 The first four are for locking-shift. Control characters for these
548 functions are defined by macros ISO_CODE_XXX in `coding.h'.
549
550 Designations are done by the following escape sequences.
551 ----------------------------------------------------------------------
552 escape sequence description
553 ----------------------------------------------------------------------
554 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
555 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
556 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
557 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
558 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
559 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
560 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
561 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
562 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
563 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
564 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
565 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
566 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
567 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
568 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
569 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
570 ----------------------------------------------------------------------
571
572 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
573 of dimension 1, chars 94, and final character <F>, and etc.
574
575 Note (*): Although these designations are not allowed in ISO2022,
576 Emacs accepts them on decoding, and produces them on encoding
577 CHARS96 character set in a coding system which is characterized as
578 7-bit environment, non-locking-shift, and non-single-shift.
579
580 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
581 '(' can be omitted. We call this as "short-form" here after.
582
583 Now you may notice that there are a lot of ways for encoding the
f4dee582 584 same multilingual text in ISO2022. Actually, there exist many
4ed46869
KH
585 coding systems such as Compound Text (used in X's inter client
586 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
587 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
588 localized platforms), and all of these are variants of ISO2022.
589
590 In addition to the above, Emacs handles two more kinds of escape
591 sequences: ISO6429's direction specification and Emacs' private
592 sequence for specifying character composition.
593
594 ISO6429's direction specification takes the following format:
595 o CSI ']' -- end of the current direction
596 o CSI '0' ']' -- end of the current direction
597 o CSI '1' ']' -- start of left-to-right text
598 o CSI '2' ']' -- start of right-to-left text
599 The control character CSI (0x9B: control sequence introducer) is
600 abbreviated to the escape sequence ESC '[' in 7-bit environment.
601
602 Character composition specification takes the following format:
603 o ESC '0' -- start character composition
604 o ESC '1' -- end character composition
605 Since these are not standard escape sequences of any ISO, the use
606 of them for these meaning is restricted to Emacs only. */
607
608enum iso_code_class_type iso_code_class[256];
609
610/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
611 Check if a text is encoded in ISO2022. If it is, returns an
612 integer in which appropriate flag bits any of:
613 CODING_CATEGORY_MASK_ISO_7
614 CODING_CATEGORY_MASK_ISO_8_1
615 CODING_CATEGORY_MASK_ISO_8_2
7717c392
KH
616 CODING_CATEGORY_MASK_ISO_7_ELSE
617 CODING_CATEGORY_MASK_ISO_8_ELSE
4ed46869
KH
618 are set. If a code which should never appear in ISO2022 is found,
619 returns 0. */
620
621int
622detect_coding_iso2022 (src, src_end)
623 unsigned char *src, *src_end;
624{
765a2ca5
KH
625 int mask = (CODING_CATEGORY_MASK_ISO_7
626 | CODING_CATEGORY_MASK_ISO_8_1
627 | CODING_CATEGORY_MASK_ISO_8_2
7717c392
KH
628 | CODING_CATEGORY_MASK_ISO_7_ELSE
629 | CODING_CATEGORY_MASK_ISO_8_ELSE
630 );
bcf26d6a
KH
631 int g1 = 0; /* 1 iff designating to G1. */
632 int c, i;
3f003981 633 struct coding_system coding_iso_8_1, coding_iso_8_2;
4ed46869 634
3f003981
KH
635 /* Coding systems of these categories may accept latin extra codes. */
636 setup_coding_system
637 (XSYMBOL (coding_category_table[CODING_CATEGORY_IDX_ISO_8_1])->value,
638 &coding_iso_8_1);
639 setup_coding_system
640 (XSYMBOL (coding_category_table[CODING_CATEGORY_IDX_ISO_8_2])->value,
641 &coding_iso_8_2);
642
643 while (mask && src < src_end)
4ed46869
KH
644 {
645 c = *src++;
646 switch (c)
647 {
648 case ISO_CODE_ESC:
e0e989f6 649 if (src >= src_end)
4ed46869
KH
650 break;
651 c = *src++;
bf9cdd4e 652 if ((c >= '(' && c <= '/'))
4ed46869 653 {
bf9cdd4e
KH
654 /* Designation sequence for a charset of dimension 1. */
655 if (src >= src_end)
656 break;
657 c = *src++;
658 if (c < ' ' || c >= 0x80)
659 /* Invalid designation sequence. */
660 return 0;
661 }
662 else if (c == '$')
663 {
664 /* Designation sequence for a charset of dimension 2. */
665 if (src >= src_end)
666 break;
667 c = *src++;
668 if (c >= '@' && c <= 'B')
669 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
670 ;
671 else if (c >= '(' && c <= '/')
bcf26d6a 672 {
bf9cdd4e
KH
673 if (src >= src_end)
674 break;
675 c = *src++;
676 if (c < ' ' || c >= 0x80)
677 /* Invalid designation sequence. */
678 return 0;
bcf26d6a 679 }
bf9cdd4e
KH
680 else
681 /* Invalid designation sequence. */
682 return 0;
4ed46869 683 }
4ed46869 684 else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
bf9cdd4e 685 /* Locking shift. */
7717c392
KH
686 mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
687 | CODING_CATEGORY_MASK_ISO_8_ELSE);
bf9cdd4e
KH
688 else if (c == '0' || c == '1' || c == '2')
689 /* Start/end composition. */
690 ;
691 else
692 /* Invalid escape sequence. */
693 return 0;
4ed46869
KH
694 break;
695
4ed46869 696 case ISO_CODE_SO:
bf9cdd4e
KH
697 mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
698 | CODING_CATEGORY_MASK_ISO_8_ELSE);
e0e989f6
KH
699 break;
700
4ed46869
KH
701 case ISO_CODE_CSI:
702 case ISO_CODE_SS2:
703 case ISO_CODE_SS3:
3f003981
KH
704 {
705 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
706
707 if (VECTORP (Vlatin_extra_code_table)
708 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
709 {
710 if (coding_iso_8_1.flags & CODING_FLAG_ISO_LATIN_EXTRA)
711 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
712 if (coding_iso_8_2.flags & CODING_FLAG_ISO_LATIN_EXTRA)
713 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
714 }
715 mask &= newmask;
716 }
717 break;
4ed46869
KH
718
719 default:
720 if (c < 0x80)
721 break;
722 else if (c < 0xA0)
c4825358 723 {
3f003981
KH
724 if (VECTORP (Vlatin_extra_code_table)
725 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
c4825358 726 {
3f003981
KH
727 int newmask = 0;
728
729 if (coding_iso_8_1.flags & CODING_FLAG_ISO_LATIN_EXTRA)
730 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
731 if (coding_iso_8_2.flags & CODING_FLAG_ISO_LATIN_EXTRA)
732 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
733 mask &= newmask;
c4825358 734 }
3f003981
KH
735 else
736 return 0;
c4825358 737 }
4ed46869
KH
738 else
739 {
7717c392 740 unsigned char *src_begin = src;
4ed46869 741
7717c392
KH
742 mask &= ~(CODING_CATEGORY_MASK_ISO_7
743 | CODING_CATEGORY_MASK_ISO_7_ELSE);
e0e989f6 744 while (src < src_end && *src >= 0xA0)
7717c392
KH
745 src++;
746 if ((src - src_begin - 1) & 1 && src < src_end)
4ed46869
KH
747 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
748 }
749 break;
750 }
751 }
752
753 return mask;
754}
755
/* Decode a character of which charset is CHARSET and the 1st position
   code is C1, storing the result at `dst'.  If dimension of CHARSET
   is 2, the 2nd position code is fetched from SRC and set to C2 (so
   this may jump to `label_end_of_loop' via ONE_MORE_BYTE).  If
   CHARSET is negative, it means that we are decoding ill formed text,
   and what we can do is just to read C1 as is.

   When `unification_table' is non-nil and unify_char yields a
   non-negative character, that character is decoded in place of the
   original one.  The caller must have `coding', `dst', `src',
   `src_end', `c2', and `unification_table' in scope.  */

#define DECODE_ISO_CHARACTER(charset, c1)                               \
  do {                                                                  \
    int c_unified, charset_unified = (charset);                         \
    if (COMPOSING_HEAD_P (coding->composing))                           \
      {                                                                 \
        *dst++ = LEADING_CODE_COMPOSITION;                              \
        if (COMPOSING_WITH_RULE_P (coding->composing))                  \
          {                                                             \
            /* To tell composition rules are embedded.  */              \
            *dst++ = 0xFF;                                              \
          }                                                             \
        coding->composing += 2;                                         \
      }                                                                 \
    if ((charset) >= 0)                                                 \
      {                                                                 \
        if (CHARSET_DIMENSION (charset) == 2)                           \
          ONE_MORE_BYTE (c2);                                           \
        if (!NILP (unification_table))                                  \
          {                                                             \
            c_unified = unify_char (unification_table,                  \
                                    -1, (charset), c1, c2);             \
            if (c_unified >= 0)                                         \
              SPLIT_CHAR (c_unified, charset_unified, c1, c2);          \
          }                                                             \
      }                                                                 \
    if (charset_unified < 0 || charset_unified == CHARSET_ASCII)        \
      DECODE_CHARACTER_ASCII (c1);                                      \
    else if (CHARSET_DIMENSION (charset_unified) == 1)                  \
      DECODE_CHARACTER_DIMENSION1 (charset_unified, c1);                \
    else                                                                \
      DECODE_CHARACTER_DIMENSION2 (charset_unified, c1, c2);            \
    if (COMPOSING_WITH_RULE_P (coding->composing))                      \
      {                                                                 \
        /* To tell a composition rule follows.  */                      \
        coding->composing = COMPOSING_WITH_RULE_RULE;                   \
      }                                                                 \
  } while (0)
792
/* Set designation state into CODING: look up the charset identified
   by DIMENSION, CHARS, and FINAL_CHAR in ISO_CHARSET_TABLE and record
   it as the charset designated to graphic register REG.  When
   coding->direction is 1 and the charset has a reverse charset, the
   reverse charset is recorded instead.  If the lookup yields a
   negative value, the designation is left untouched.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)           \
  do {                                                                  \
    int designated = ISO_CHARSET_TABLE (make_number (dimension),        \
                                        make_number (chars),            \
                                        make_number (final_char));      \
    if (designated >= 0)                                                \
      {                                                                 \
        CODING_SPEC_ISO_DESIGNATION (coding, reg)                       \
          = ((coding->direction == 1                                    \
              && CHARSET_REVERSE_CHARSET (designated) >= 0)             \
             ? CHARSET_REVERSE_CHARSET (designated)                     \
             : designated);                                             \
      }                                                                 \
  } while (0)
807
808/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
809
810int
811decode_coding_iso2022 (coding, source, destination,
812 src_bytes, dst_bytes, consumed)
813 struct coding_system *coding;
814 unsigned char *source, *destination;
815 int src_bytes, dst_bytes;
816 int *consumed;
817{
818 unsigned char *src = source;
819 unsigned char *src_end = source + src_bytes;
820 unsigned char *dst = destination;
821 unsigned char *dst_end = destination + dst_bytes;
822 /* Since the maximum bytes produced by each loop is 7, we subtract 6
823 from DST_END to assure that overflow checking is necessary only
824 at the head of loop. */
825 unsigned char *adjusted_dst_end = dst_end - 6;
826 int charset;
827 /* Charsets invoked to graphic plane 0 and 1 respectively. */
828 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
829 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
a5d301df
KH
830 Lisp_Object unification_table
831 = coding->character_unification_table_for_decode;
bdd9fb48
KH
832
833 if (!NILP (Venable_character_unification) && NILP (unification_table))
a5d301df 834 unification_table = Vstandard_character_unification_table_for_decode;
4ed46869
KH
835
836 while (src < src_end && dst < adjusted_dst_end)
837 {
838 /* SRC_BASE remembers the start position in source in each loop.
839 The loop will be exited when there's not enough source text
840 to analyze long escape sequence or 2-byte code (within macros
841 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
842 to SRC_BASE before exiting. */
843 unsigned char *src_base = src;
bdd9fb48 844 int c1 = *src++, c2;
4ed46869
KH
845
846 switch (iso_code_class [c1])
847 {
848 case ISO_0x20_or_0x7F:
849 if (!coding->composing
850 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
851 {
852 /* This is SPACE or DEL. */
853 *dst++ = c1;
854 break;
855 }
856 /* This is a graphic character, we fall down ... */
857
858 case ISO_graphic_plane_0:
859 if (coding->composing == COMPOSING_WITH_RULE_RULE)
860 {
861 /* This is a composition rule. */
862 *dst++ = c1 | 0x80;
863 coding->composing = COMPOSING_WITH_RULE_TAIL;
864 }
865 else
866 DECODE_ISO_CHARACTER (charset0, c1);
867 break;
868
869 case ISO_0xA0_or_0xFF:
870 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
871 {
872 /* Invalid code. */
873 *dst++ = c1;
874 break;
875 }
876 /* This is a graphic character, we fall down ... */
877
878 case ISO_graphic_plane_1:
879 DECODE_ISO_CHARACTER (charset1, c1);
880 break;
881
882 case ISO_control_code:
883 /* All ISO2022 control characters in this class have the
884 same representation in Emacs internal format. */
885 *dst++ = c1;
886 break;
887
888 case ISO_carriage_return:
889 if (coding->eol_type == CODING_EOL_CR)
890 {
891 *dst++ = '\n';
892 }
893 else if (coding->eol_type == CODING_EOL_CRLF)
894 {
895 ONE_MORE_BYTE (c1);
896 if (c1 == ISO_CODE_LF)
897 *dst++ = '\n';
898 else
899 {
900 src--;
901 *dst++ = c1;
902 }
903 }
904 else
905 {
906 *dst++ = c1;
907 }
908 break;
909
910 case ISO_shift_out:
e0e989f6
KH
911 if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
912 goto label_invalid_escape_sequence;
4ed46869
KH
913 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
914 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
915 break;
916
917 case ISO_shift_in:
918 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
919 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
920 break;
921
922 case ISO_single_shift_2_7:
923 case ISO_single_shift_2:
924 /* SS2 is handled as an escape sequence of ESC 'N' */
925 c1 = 'N';
926 goto label_escape_sequence;
927
928 case ISO_single_shift_3:
929 /* SS2 is handled as an escape sequence of ESC 'O' */
930 c1 = 'O';
931 goto label_escape_sequence;
932
933 case ISO_control_sequence_introducer:
934 /* CSI is handled as an escape sequence of ESC '[' ... */
935 c1 = '[';
936 goto label_escape_sequence;
937
938 case ISO_escape:
939 ONE_MORE_BYTE (c1);
940 label_escape_sequence:
941 /* Escape sequences handled by Emacs are invocation,
942 designation, direction specification, and character
943 composition specification. */
944 switch (c1)
945 {
946 case '&': /* revision of following character set */
947 ONE_MORE_BYTE (c1);
948 if (!(c1 >= '@' && c1 <= '~'))
e0e989f6 949 goto label_invalid_escape_sequence;
4ed46869
KH
950 ONE_MORE_BYTE (c1);
951 if (c1 != ISO_CODE_ESC)
e0e989f6 952 goto label_invalid_escape_sequence;
4ed46869
KH
953 ONE_MORE_BYTE (c1);
954 goto label_escape_sequence;
955
956 case '$': /* designation of 2-byte character set */
957 ONE_MORE_BYTE (c1);
958 if (c1 >= '@' && c1 <= 'B')
959 { /* designation of JISX0208.1978, GB2312.1980,
960 or JISX0208.1980 */
961 DECODE_DESIGNATION (0, 2, 94, c1);
962 }
963 else if (c1 >= 0x28 && c1 <= 0x2B)
964 { /* designation of DIMENSION2_CHARS94 character set */
965 ONE_MORE_BYTE (c2);
966 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
967 }
968 else if (c1 >= 0x2C && c1 <= 0x2F)
969 { /* designation of DIMENSION2_CHARS96 character set */
970 ONE_MORE_BYTE (c2);
971 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
972 }
973 else
e0e989f6 974 goto label_invalid_escape_sequence;
4ed46869
KH
975 break;
976
977 case 'n': /* invocation of locking-shift-2 */
e0e989f6
KH
978 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
979 goto label_invalid_escape_sequence;
4ed46869 980 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
e0e989f6 981 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
982 break;
983
984 case 'o': /* invocation of locking-shift-3 */
e0e989f6
KH
985 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
986 goto label_invalid_escape_sequence;
4ed46869 987 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
e0e989f6 988 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
989 break;
990
991 case 'N': /* invocation of single-shift-2 */
e0e989f6
KH
992 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
993 goto label_invalid_escape_sequence;
4ed46869
KH
994 ONE_MORE_BYTE (c1);
995 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
996 DECODE_ISO_CHARACTER (charset, c1);
997 break;
998
999 case 'O': /* invocation of single-shift-3 */
e0e989f6
KH
1000 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1001 goto label_invalid_escape_sequence;
4ed46869
KH
1002 ONE_MORE_BYTE (c1);
1003 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1004 DECODE_ISO_CHARACTER (charset, c1);
1005 break;
1006
1007 case '0': /* start composing without embeded rules */
1008 coding->composing = COMPOSING_NO_RULE_HEAD;
1009 break;
1010
1011 case '1': /* end composing */
1012 coding->composing = COMPOSING_NO;
1013 break;
1014
1015 case '2': /* start composing with embeded rules */
1016 coding->composing = COMPOSING_WITH_RULE_HEAD;
1017 break;
1018
1019 case '[': /* specification of direction */
1020 /* For the moment, nested direction is not supported.
1021 So, the value of `coding->direction' is 0 or 1: 0
1022 means left-to-right, 1 means right-to-left. */
1023 ONE_MORE_BYTE (c1);
1024 switch (c1)
1025 {
1026 case ']': /* end of the current direction */
1027 coding->direction = 0;
1028
1029 case '0': /* end of the current direction */
1030 case '1': /* start of left-to-right direction */
1031 ONE_MORE_BYTE (c1);
1032 if (c1 == ']')
1033 coding->direction = 0;
1034 else
1035 goto label_invalid_escape_sequence;
1036 break;
1037
1038 case '2': /* start of right-to-left direction */
1039 ONE_MORE_BYTE (c1);
1040 if (c1 == ']')
1041 coding->direction= 1;
1042 else
1043 goto label_invalid_escape_sequence;
1044 break;
1045
1046 default:
1047 goto label_invalid_escape_sequence;
1048 }
1049 break;
1050
1051 default:
1052 if (c1 >= 0x28 && c1 <= 0x2B)
1053 { /* designation of DIMENSION1_CHARS94 character set */
1054 ONE_MORE_BYTE (c2);
1055 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1056 }
1057 else if (c1 >= 0x2C && c1 <= 0x2F)
1058 { /* designation of DIMENSION1_CHARS96 character set */
1059 ONE_MORE_BYTE (c2);
1060 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1061 }
1062 else
1063 {
1064 goto label_invalid_escape_sequence;
1065 }
1066 }
1067 /* We must update these variables now. */
1068 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1069 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1070 break;
1071
1072 label_invalid_escape_sequence:
1073 {
1074 int length = src - src_base;
1075
1076 bcopy (src_base, dst, length);
1077 dst += length;
1078 }
1079 }
1080 continue;
1081
1082 label_end_of_loop:
1083 coding->carryover_size = src - src_base;
1084 bcopy (src_base, coding->carryover, coding->carryover_size);
1085 src = src_base;
1086 break;
1087 }
1088
1089 /* If this is the last block of the text to be decoded, we had
1090 better just flush out all remaining codes in the text although
1091 they are not valid characters. */
1092 if (coding->last_block)
1093 {
1094 bcopy (src, dst, src_end - src);
1095 dst += (src_end - src);
1096 src = src_end;
1097 }
1098 *consumed = src - source;
1099 return dst - destination;
1100}
1101
f4dee582 1102/* ISO2022 encoding stuff. */
4ed46869
KH
1103
1104/*
f4dee582 1105 It is not enough to say just "ISO2022" on encoding, we have to
4ed46869
KH
1106 specify more details. In Emacs, each coding-system of ISO2022
1107 variant has the following specifications:
1108 1. Initial designation to G0 thru G3.
1109 2. Allows short-form designation?
1110 3. ASCII should be designated to G0 before control characters?
1111 4. ASCII should be designated to G0 at end of line?
1112 5. 7-bit environment or 8-bit environment?
1113 6. Use locking-shift?
1114 7. Use Single-shift?
1115 And the following two are only for Japanese:
1116 8. Use ASCII in place of JIS0201-1976-Roman?
1117 9. Use JISX0208-1983 in place of JISX0208-1978?
1118 These specifications are encoded in `coding->flags' as flag bits
1119 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
f4dee582 1120 details.
4ed46869
KH
1121*/
1122
/* Store at DST the escape sequence which designates CHARSET to
   graphic register REG, and record the designation in CODING.

   If Vcharset_revision_alist has an entry for CHARSET, the sequence
   is preceded by `ESC & <revision>'.  For a 2-byte, 94-char charset
   whose <final-char> is '@', 'A', or 'B', designated to register 0
   of a coding system having CODING_FLAG_ISO_SHORT_FORM, the
   intermediate character is omitted (short-form).  */

#define ENCODE_DESIGNATION(charset, reg, coding)			\
  do {									\
    unsigned char final_byte = CHARSET_ISO_FINAL_CHAR (charset);	\
    Lisp_Object revision_entry						\
      = Fassq (make_number (charset), Vcharset_revision_alist);	\
    int multibyte = CHARSET_DIMENSION (charset) != 1;			\
    int has_94_chars = CHARSET_CHARS (charset) == 94;			\
    /* Intermediate characters for registers 0..3.  */			\
    char *intermediates = has_94_chars ? "()*+" : ",-./";		\
    int short_form							\
      = (multibyte && has_94_chars					\
	 && (coding->flags & CODING_FLAG_ISO_SHORT_FORM)		\
	 && reg == 0							\
	 && final_byte >= '@' && final_byte <= 'B');			\
									\
    if (! NILP (revision_entry))					\
      {									\
	*dst++ = ISO_CODE_ESC;						\
	*dst++ = '&';							\
	*dst++ = XINT (XCONS (revision_entry)->cdr) + '@';		\
      }									\
    *dst++ = ISO_CODE_ESC;						\
    if (multibyte)							\
      *dst++ = '$';							\
    if (! short_form)							\
      *dst++ = (unsigned char) (intermediates[reg]);			\
    *dst++ = final_byte;						\
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;		\
  } while (0)
1165
/* Store at DST the code for ISO2022 single-shift-2 or single-shift-3
   and mark CODING as single-shifting.  With
   CODING_FLAG_ISO_SEVEN_BITS the escape sequence form (ESC 'N' or
   ESC 'O') is produced, otherwise the control character ISO_CODE_SS2
   or ISO_CODE_SS3.  */

#define ENCODE_SINGLE_SHIFT_2				\
  do {							\
    if (!(coding->flags & CODING_FLAG_ISO_SEVEN_BITS))	\
      *dst++ = ISO_CODE_SS2;				\
    else						\
      {							\
	*dst++ = ISO_CODE_ESC;				\
	*dst++ = 'N';					\
      }							\
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;	\
  } while (0)

#define ENCODE_SINGLE_SHIFT_3				\
  do {							\
    if (!(coding->flags & CODING_FLAG_ISO_SEVEN_BITS))	\
      *dst++ = ISO_CODE_SS3;				\
    else						\
      {							\
	*dst++ = ISO_CODE_ESC;				\
	*dst++ = 'O';					\
      }							\
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;	\
  } while (0)
1187
/* Store at DST the code for an ISO2022 locking-shift function and
   record in CODING which graphic register is now invoked to graphic
   plane 0: shift-in invokes register 0, shift-out register 1,
   locking-shift-2 register 2, and locking-shift-3 register 3.  */

#define ENCODE_SHIFT_IN					\
  do {							\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;		\
    *dst++ = ISO_CODE_SI;				\
  } while (0)

#define ENCODE_SHIFT_OUT				\
  do {							\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;		\
    *dst++ = ISO_CODE_SO;				\
  } while (0)

#define ENCODE_LOCKING_SHIFT_2				\
  do {							\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;		\
    *dst++ = ISO_CODE_ESC;				\
    *dst++ = 'n';					\
  } while (0)

#define ENCODE_LOCKING_SHIFT_3				\
  do {							\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;		\
    *dst++ = ISO_CODE_ESC;				\
    *dst++ = 'o';					\
  } while (0)
1215
f4dee582
RS
/* Store at DST the code for a DIMENSION1 character of CHARSET whose
   position-code is C1.  If CHARSET is not yet invoked to a graphic
   plane, designation and invocation sequences are produced first by
   encode_invocation_designation.  If CODING has CODING_FLAG_ISO_SAFE
   and CHARSET is not among its expected charsets, substitution
   characters are produced instead.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)			\
  do {									\
    /* As long as CHARSET can't be encoded in the current state (and	\
       must not be replaced by a substitution), invoke it, or, at	\
       first, designate it to some graphic register.  */		\
    while (!CODING_SPEC_ISO_SINGLE_SHIFTING (coding)			\
	   && charset != CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)	\
	   && charset != CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)	\
	   && !(coding->flags & CODING_FLAG_ISO_SAFE			\
		&& !CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset])) \
      dst = encode_invocation_designation (charset, coding, dst);	\
									\
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))			\
      {									\
	/* A single-shift code was just produced.  */			\
	if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)			\
	  *dst++ = c1 & 0x7F;						\
	else								\
	  *dst++ = c1 | 0x80;						\
	CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;			\
      }									\
    else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))	\
      *dst++ = c1 & 0x7F;						\
    else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))	\
      *dst++ = c1 | 0x80;						\
    else								\
      {									\
	/* We should not encode this character, instead produce one	\
	   or two substitution characters.  */				\
	*dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;			\
	if (CHARSET_WIDTH (charset) == 2)				\
	  *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;		\
      }									\
  } while (0)
1259
f4dee582
RS
/* Store at DST the codes for a DIMENSION2 character of CHARSET whose
   position-codes are C1 and C2.  If CHARSET is not yet invoked to a
   graphic plane, designation and invocation sequences are produced
   first by encode_invocation_designation.  If CODING has
   CODING_FLAG_ISO_SAFE and CHARSET is not among its expected
   charsets, substitution characters are produced instead.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)		\
  do {									\
    /* As long as CHARSET can't be encoded in the current state (and	\
       must not be replaced by a substitution), invoke it, or, at	\
       first, designate it to some graphic register.  */		\
    while (!CODING_SPEC_ISO_SINGLE_SHIFTING (coding)			\
	   && charset != CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)	\
	   && charset != CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)	\
	   && !(coding->flags & CODING_FLAG_ISO_SAFE			\
		&& !CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset])) \
      dst = encode_invocation_designation (charset, coding, dst);	\
									\
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))			\
      {									\
	/* A single-shift code was just produced.  */			\
	if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)			\
	  {								\
	    *dst++ = c1 & 0x7F;						\
	    *dst++ = c2 & 0x7F;						\
	  }								\
	else								\
	  {								\
	    *dst++ = c1 | 0x80;						\
	    *dst++ = c2 | 0x80;						\
	  }								\
	CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;			\
      }									\
    else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))	\
      {									\
	*dst++ = c1 & 0x7F;						\
	*dst++ = c2 & 0x7F;						\
      }									\
    else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))	\
      {									\
	*dst++ = c1 | 0x80;						\
	*dst++ = c2 | 0x80;						\
      }									\
    else								\
      {									\
	/* We should not encode this character, instead produce one	\
	   or two substitution characters.  */				\
	*dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;			\
	if (CHARSET_WIDTH (charset) == 2)				\
	  *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;		\
      }									\
  } while (0)
1302
bdd9fb48
KH
/* Encode the character of CHARSET whose position-codes are C1 and C2
   (C2 is a dummy for a DIMENSION1 charset).  If `unification_table'
   is non-nil and unify_char maps the character to another one, C1 and
   C2 are overwritten by SPLIT_CHAR and the unified character is
   encoded instead.  */
#define ENCODE_ISO_CHARACTER(charset, c1, c2)				\
  do {									\
    int target_charset = charset;					\
    if (!NILP (unification_table))					\
      {									\
	int unified							\
	  = unify_char (unification_table, -1, charset, c1, c2);	\
	if (unified >= 0)						\
	  {								\
	    SPLIT_CHAR (unified, target_charset, c1, c2);		\
	  }								\
      }									\
    if (CHARSET_DIMENSION (target_charset) != 1)			\
      ENCODE_ISO_CHARACTER_DIMENSION2 (target_charset, c1, c2);		\
    else								\
      ENCODE_ISO_CHARACTER_DIMENSION1 (target_charset, c1);		\
  } while (0)
1317
4ed46869
KH
1318/* Produce designation and invocation codes at a place pointed by DST
1319 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1320 Return new DST. */
1321
1322unsigned char *
1323encode_invocation_designation (charset, coding, dst)
1324 int charset;
1325 struct coding_system *coding;
1326 unsigned char *dst;
1327{
1328 int reg; /* graphic register number */
1329
1330 /* At first, check designations. */
1331 for (reg = 0; reg < 4; reg++)
1332 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1333 break;
1334
1335 if (reg >= 4)
1336 {
1337 /* CHARSET is not yet designated to any graphic registers. */
1338 /* At first check the requested designation. */
1339 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1ba9e4ab
KH
1340 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1341 /* Since CHARSET requests no special designation, designate it
1342 to graphic register 0. */
4ed46869
KH
1343 reg = 0;
1344
1345 ENCODE_DESIGNATION (charset, reg, coding);
1346 }
1347
1348 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1349 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1350 {
1351 /* Since the graphic register REG is not invoked to any graphic
1352 planes, invoke it to graphic plane 0. */
1353 switch (reg)
1354 {
1355 case 0: /* graphic register 0 */
1356 ENCODE_SHIFT_IN;
1357 break;
1358
1359 case 1: /* graphic register 1 */
1360 ENCODE_SHIFT_OUT;
1361 break;
1362
1363 case 2: /* graphic register 2 */
1364 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1365 ENCODE_SINGLE_SHIFT_2;
1366 else
1367 ENCODE_LOCKING_SHIFT_2;
1368 break;
1369
1370 case 3: /* graphic register 3 */
1371 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1372 ENCODE_SINGLE_SHIFT_3;
1373 else
1374 ENCODE_LOCKING_SHIFT_3;
1375 break;
1376 }
1377 }
1378 return dst;
1379}
1380
/* The following three macros store at DST the escape sequences which
   indicate composition: start without rules (ESC '0'), start with
   rules (ESC '2'), and end (ESC '1').  */
#define ENCODE_COMPOSITION_NO_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
#define ENCODE_COMPOSITION_WITH_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '2')
#define ENCODE_COMPOSITION_END \
  (*dst++ = ISO_CODE_ESC, *dst++ = '1')
1385
/* The following three macros produce codes for indicating direction
   of text.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER stores at DST the 7-bit form
   ESC '[' when CODING has the flag bit CODING_FLAG_ISO_SEVEN_BITS,
   otherwise the control character ISO_CODE_CSI.  The flag is tested
   with `&' because `coding->flags' holds several flag bits at once;
   comparing the whole word with `==' would select the 7-bit form
   only when no other flag is set.

   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R are wrapped in
   `do { ... } while (0)' so that each of them is one statement; a
   `do' statement can't be an operand of the comma operator.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER		\
  do {							\
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)	\
      *dst++ = ISO_CODE_ESC, *dst++ = '[';		\
    else						\
      *dst++ = ISO_CODE_CSI;				\
  } while (0)

#define ENCODE_DIRECTION_R2L			\
  do {						\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;		\
    *dst++ = '2', *dst++ = ']';			\
  } while (0)

#define ENCODE_DIRECTION_L2R			\
  do {						\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;		\
    *dst++ = '0', *dst++ = ']';			\
  } while (0)
1401
/* Store at DST the codes which bring the graphic planes and registers
   back to the initial state: a shift-in if graphic register 0 is not
   invoked to graphic plane 0, then a designation sequence for each
   graphic register that has an initial designation different from
   its current one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER					\
  do {									\
    int regno;								\
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)			\
      {									\
	ENCODE_SHIFT_IN;						\
      }									\
    for (regno = 0; regno < 4; regno++)					\
      {									\
	int initial							\
	  = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, regno);	\
	/* Skip registers without an initial designation and those	\
	   which already hold it.  */					\
	if (initial < 0							\
	    || initial == CODING_SPEC_ISO_DESIGNATION (coding, regno))	\
	  continue;							\
	ENCODE_DESIGNATION (initial, regno, coding);			\
      }									\
  } while (0)
1416
bdd9fb48
KH
1417/* Produce designation sequences of charsets in the line started from
1418 *SRC to a place pointed by DSTP.
1419
1420 If the current block ends before any end-of-line, we may fail to
1421 find all the necessary *designations. */
1422encode_designation_at_bol (coding, table, src, src_end, dstp)
e0e989f6 1423 struct coding_system *coding;
bdd9fb48 1424 Lisp_Object table;
e0e989f6
KH
1425 unsigned char *src, *src_end, **dstp;
1426{
bdd9fb48
KH
1427 int charset, c, found = 0, reg;
1428 /* Table of charsets to be designated to each graphic register. */
1429 int r[4];
1430 unsigned char *dst = *dstp;
1431
1432 for (reg = 0; reg < 4; reg++)
1433 r[reg] = -1;
1434
1435 while (src < src_end && *src != '\n' && found < 4)
e0e989f6 1436 {
bdd9fb48
KH
1437 int bytes = BYTES_BY_CHAR_HEAD (*src);
1438
1439 if (NILP (table))
1440 charset = CHARSET_AT (src);
1441 else
e0e989f6 1442 {
bdd9fb48
KH
1443 int c_alt, c1, c2;
1444
1445 SPLIT_STRING(src, bytes, charset, c1, c2);
1446 if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1447 charset = CHAR_CHARSET (c_alt);
e0e989f6 1448 }
bdd9fb48 1449
e0e989f6 1450 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1ba9e4ab 1451 if (r[reg] == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
bdd9fb48
KH
1452 {
1453 found++;
1454 r[reg] = charset;
1455 }
1456
1457 src += bytes;
1458 }
1459
1460 if (found)
1461 {
1462 for (reg = 0; reg < 4; reg++)
1463 if (r[reg] >= 0
1464 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1465 ENCODE_DESIGNATION (r[reg], reg, coding);
1466 *dstp = dst;
e0e989f6 1467 }
e0e989f6
KH
1468}
1469
4ed46869
KH
1470/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1471
1472int
1473encode_coding_iso2022 (coding, source, destination,
1474 src_bytes, dst_bytes, consumed)
1475 struct coding_system *coding;
1476 unsigned char *source, *destination;
1477 int src_bytes, dst_bytes;
1478 int *consumed;
1479{
1480 unsigned char *src = source;
1481 unsigned char *src_end = source + src_bytes;
1482 unsigned char *dst = destination;
1483 unsigned char *dst_end = destination + dst_bytes;
e0e989f6 1484 /* Since the maximum bytes produced by each loop is 20, we subtract 19
4ed46869
KH
1485 from DST_END to assure overflow checking is necessary only at the
1486 head of loop. */
e0e989f6 1487 unsigned char *adjusted_dst_end = dst_end - 19;
a5d301df
KH
1488 Lisp_Object unification_table
1489 = coding->character_unification_table_for_encode;
bdd9fb48
KH
1490
1491 if (!NILP (Venable_character_unification) && NILP (unification_table))
a5d301df 1492 unification_table = Vstandard_character_unification_table_for_encode;
4ed46869
KH
1493
1494 while (src < src_end && dst < adjusted_dst_end)
1495 {
1496 /* SRC_BASE remembers the start position in source in each loop.
1497 The loop will be exited when there's not enough source text
1498 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1499 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1500 reset to SRC_BASE before exiting. */
1501 unsigned char *src_base = src;
bdd9fb48 1502 int charset, c1, c2, c3, c4;
4ed46869 1503
e0e989f6
KH
1504 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1505 && CODING_SPEC_ISO_BOL (coding))
1506 {
bdd9fb48
KH
1507 /* We have to produce designation sequences if any now. */
1508 encode_designation_at_bol (coding, unification_table,
1509 src, src_end, &dst);
e0e989f6
KH
1510 CODING_SPEC_ISO_BOL (coding) = 0;
1511 }
1512
1513 c1 = *src++;
4ed46869
KH
1514 /* If we are seeing a component of a composite character, we are
1515 seeing a leading-code specially encoded for composition, or a
1516 composition rule if composing with rule. We must set C1
1517 to a normal leading-code or an ASCII code. If we are not at
1518 a composed character, we must reset the composition state. */
1519 if (COMPOSING_P (coding->composing))
1520 {
1521 if (c1 < 0xA0)
1522 {
1523 /* We are not in a composite character any longer. */
1524 coding->composing = COMPOSING_NO;
1525 ENCODE_COMPOSITION_END;
1526 }
1527 else
1528 {
1529 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1530 {
1531 *dst++ = c1 & 0x7F;
1532 coding->composing = COMPOSING_WITH_RULE_HEAD;
1533 continue;
1534 }
1535 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1536 coding->composing = COMPOSING_WITH_RULE_RULE;
1537 if (c1 == 0xA0)
1538 {
1539 /* This is an ASCII component. */
1540 ONE_MORE_BYTE (c1);
1541 c1 &= 0x7F;
1542 }
1543 else
1544 /* This is a leading-code of non ASCII component. */
1545 c1 -= 0x20;
1546 }
1547 }
1548
1549 /* Now encode one character. C1 is a control character, an
1550 ASCII character, or a leading-code of multi-byte character. */
1551 switch (emacs_code_class[c1])
1552 {
1553 case EMACS_ascii_code:
bdd9fb48 1554 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
4ed46869
KH
1555 break;
1556
1557 case EMACS_control_code:
1558 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1559 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1560 *dst++ = c1;
1561 break;
1562
1563 case EMACS_carriage_return_code:
1564 if (!coding->selective)
1565 {
1566 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1567 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1568 *dst++ = c1;
1569 break;
1570 }
1571 /* fall down to treat '\r' as '\n' ... */
1572
1573 case EMACS_linefeed_code:
1574 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
e0e989f6
KH
1575 ENCODE_RESET_PLANE_AND_REGISTER;
1576 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1577 bcopy (coding->spec.iso2022.initial_designation,
1578 coding->spec.iso2022.current_designation,
1579 sizeof coding->spec.iso2022.initial_designation);
4ed46869 1580 if (coding->eol_type == CODING_EOL_LF
0ef69138 1581 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
1582 *dst++ = ISO_CODE_LF;
1583 else if (coding->eol_type == CODING_EOL_CRLF)
1584 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1585 else
1586 *dst++ = ISO_CODE_CR;
e0e989f6 1587 CODING_SPEC_ISO_BOL (coding) = 1;
4ed46869
KH
1588 break;
1589
1590 case EMACS_leading_code_2:
1591 ONE_MORE_BYTE (c2);
19a8d9e0
KH
1592 if (c2 < 0xA0)
1593 {
1594 /* invalid sequence */
1595 *dst++ = c1;
1596 *dst++ = c2;
1597 }
1598 else
1599 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
1600 break;
1601
1602 case EMACS_leading_code_3:
1603 TWO_MORE_BYTES (c2, c3);
19a8d9e0
KH
1604 if (c2 < 0xA0 || c3 < 0xA0)
1605 {
1606 /* invalid sequence */
1607 *dst++ = c1;
1608 *dst++ = c2;
1609 *dst++ = c3;
1610 }
1611 else if (c1 < LEADING_CODE_PRIVATE_11)
bdd9fb48 1612 ENCODE_ISO_CHARACTER (c1, c2, c3);
4ed46869 1613 else
bdd9fb48 1614 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
4ed46869
KH
1615 break;
1616
1617 case EMACS_leading_code_4:
1618 THREE_MORE_BYTES (c2, c3, c4);
19a8d9e0
KH
1619 if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1620 {
1621 /* invalid sequence */
1622 *dst++ = c1;
1623 *dst++ = c2;
1624 *dst++ = c3;
1625 *dst++ = c4;
1626 }
1627 else
1628 ENCODE_ISO_CHARACTER (c2, c3, c4);
4ed46869
KH
1629 break;
1630
1631 case EMACS_leading_code_composition:
19a8d9e0
KH
1632 ONE_MORE_BYTE (c2);
1633 if (c2 < 0xA0)
1634 {
1635 /* invalid sequence */
1636 *dst++ = c1;
1637 *dst++ = c2;
1638 }
1639 else if (c2 == 0xFF)
4ed46869
KH
1640 {
1641 coding->composing = COMPOSING_WITH_RULE_HEAD;
1642 ENCODE_COMPOSITION_WITH_RULE_START;
1643 }
1644 else
1645 {
1646 /* Rewind one byte because it is a character code of
1647 composition elements. */
1648 src--;
1649 coding->composing = COMPOSING_NO_RULE_HEAD;
1650 ENCODE_COMPOSITION_NO_RULE_START;
1651 }
1652 break;
1653
1654 case EMACS_invalid_code:
1655 *dst++ = c1;
1656 break;
1657 }
1658 continue;
1659 label_end_of_loop:
76376439
KH
1660 /* We reach here because the source date ends not at character
1661 boundary. */
1662 coding->carryover_size = src_end - src_base;
4ed46869 1663 bcopy (src_base, coding->carryover, coding->carryover_size);
76376439 1664 src = src_end;
4ed46869
KH
1665 break;
1666 }
1667
1668 /* If this is the last block of the text to be encoded, we must
bdd9fb48
KH
1669 reset graphic planes and registers to the initial state. */
1670 if (src >= src_end && coding->last_block)
4ed46869 1671 {
e0e989f6 1672 ENCODE_RESET_PLANE_AND_REGISTER;
bdd9fb48
KH
1673 if (coding->carryover_size > 0
1674 && coding->carryover_size < (dst_end - dst))
1675 {
1676 bcopy (coding->carryover, dst, coding->carryover_size);
1677 dst += coding->carryover_size;
1678 coding->carryover_size = 0;
1679 }
4ed46869
KH
1680 }
1681 *consumed = src - source;
1682 return dst - destination;
1683}
1684
1685\f
1686/*** 4. SJIS and BIG5 handlers ***/
1687
f4dee582 1688/* Although SJIS and BIG5 are not ISO's coding system, they are used
4ed46869
KH
1689 quite widely. So, for the moment, Emacs supports them in the bare
1690 C code. But, in the future, they may be supported only by CCL. */
1691
1692/* SJIS is a coding system encoding three character sets: ASCII, right
1693 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
1694 as is. A character of charset katakana-jisx0201 is encoded by
1695 "position-code + 0x80". A character of charset japanese-jisx0208
1696 is encoded in 2-byte but two position-codes are divided and shifted
1697 so that it fit in the range below.
1698
1699 --- CODE RANGE of SJIS ---
1700 (character set) (range)
1701 ASCII 0x00 .. 0x7F
1702 KATAKANA-JISX0201 0xA0 .. 0xDF
1703 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1704 (2nd byte) 0x40 .. 0xFF
1705 -------------------------------
1706
1707*/
1708
1709/* BIG5 is a coding system encoding two character sets: ASCII and
1710 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1711 character set and is encoded in two-byte.
1712
1713 --- CODE RANGE of BIG5 ---
1714 (character set) (range)
1715 ASCII 0x00 .. 0x7F
1716 Big5 (1st byte) 0xA1 .. 0xFE
1717 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1718 --------------------------
1719
1720 Since the number of characters in Big5 is larger than maximum
1721 characters in Emacs' charset (96x96), it can't be handled as one
1722 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
1723 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
1724 contains frequently used characters and the latter contains less
1725 frequently used characters. */
1726
1727/* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
1728 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
1729 C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
1730 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
1731
/* Number of Big5 characters which have the same code in 1st byte:
   the 2nd byte ranges over 0x40..0x7E and 0xA1..0xFE.  */
#define BIG5_SAME_ROW ((0xFF - 0xA1) + (0x7F - 0x40))

/* Convert the Big5 code B1 B2 to an Emacs character: set CHARSET to
   `charset_big5_1' (B1 below 0xC9) or `charset_big5_2', and set C1
   and C2 to the position-codes in that charset.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)				\
  do {									\
    /* Serial number of the character, counted from code 0xA140.  */	\
    unsigned int big5_index						\
      = (((b1) - 0xA1) * BIG5_SAME_ROW					\
	 + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));				\
    charset = charset_big5_1;						\
    if ((b1) >= 0xC9)							\
      {									\
	/* Renumber from the first character of the second half.  */	\
	charset = charset_big5_2;					\
	big5_index -= (0xC9 - 0xA1) * BIG5_SAME_ROW;			\
      }									\
    c1 = big5_index / (0xFF - 0xA1) + 0x21;				\
    c2 = big5_index % (0xFF - 0xA1) + 0x21;				\
  } while (0)
1749
/* Convert the character of CHARSET (`charset_big5_1' or
   `charset_big5_2') whose position-codes are C1 and C2 to the Big5
   code B1 B2.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)				\
  do {									\
    /* Serial number of the character within CHARSET.  */		\
    unsigned int big5_index						\
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);			\
    unsigned int column;						\
    if (charset == charset_big5_2)					\
      big5_index += BIG5_SAME_ROW * (0xC9 - 0xA1);			\
    column = big5_index % BIG5_SAME_ROW;				\
    b1 = big5_index / BIG5_SAME_ROW + 0xA1;				\
    /* The 2nd byte skips the codes 0x7F..0xA0.  */			\
    b2 = column + (column < 0x3F ? 0x40 : 0x62);			\
  } while (0)
1759
a5d301df
KH
/* Produce the Emacs internal code of the character of CHARSET whose
   position-codes are C1 and C2.  If `unification_table' is non-nil
   and unify_char maps the character to another one, C1 and C2 are
   overwritten by SPLIT_CHAR and the unified character is produced
   instead.  A negative charset is handled like ASCII.  */
#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)			\
  do {									\
    int decoded_charset = (charset);					\
    if (!NILP (unification_table))					\
      {									\
	int unified = unify_char (unification_table,			\
				  -1, (charset), c1, c2);		\
	if (unified >= 0)						\
	  {								\
	    SPLIT_CHAR (unified, decoded_charset, c1, c2);		\
	  }								\
      }									\
    if (decoded_charset < 0 || decoded_charset == CHARSET_ASCII)	\
      DECODE_CHARACTER_ASCII (c1);					\
    else if (CHARSET_DIMENSION (decoded_charset) != 1)			\
      DECODE_CHARACTER_DIMENSION2 (decoded_charset, c1, c2);		\
    else								\
      DECODE_CHARACTER_DIMENSION1 (decoded_charset, c1);		\
  } while (0)
1774
/* Store at DST the SJIS (if `sjis_p' is nonzero) or BIG5 code of the
   character of CHARSET whose position-codes are C1 and C2.  If
   `unification_table' is non-nil and unify_char maps the character to
   another one, C1 and C2 are overwritten by SPLIT_CHAR and the
   unified character is encoded instead.  C1 and C2 are also modified
   (masked with 0x7F) for a DIMENSION2 charset.  A character which
   belongs to no charset of the coding system is stored as its
   charset followed by its position-codes.

   The definition doesn't end with a semicolon, so that
   `ENCODE_SJIS_BIG5_CHARACTER (...);' is exactly one statement and
   can be the body of an `if' which has an `else' clause.  */
#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)			\
  do {									\
    int c_alt, charset_alt;						\
    if (!NILP (unification_table)					\
        && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
	    >= 0))							\
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);				\
    else								\
      charset_alt = charset;						\
    if (charset_alt == charset_ascii)					\
      *dst++ = c1;							\
    else if (CHARSET_DIMENSION (charset_alt) == 1)			\
      {									\
	if (sjis_p && charset_alt == charset_katakana_jisx0201)		\
	  *dst++ = c1;							\
	else								\
	  *dst++ = charset_alt, *dst++ = c1;				\
      }									\
    else								\
      {									\
	c1 &= 0x7F, c2 &= 0x7F;						\
	if (sjis_p && charset_alt == charset_jisx0208)			\
	  {								\
	    unsigned char s1, s2;					\
									\
	    ENCODE_SJIS (c1, c2, s1, s2);				\
	    *dst++ = s1, *dst++ = s2;					\
	  }								\
	else if (!sjis_p						\
		 && (charset_alt == charset_big5_1			\
		     || charset_alt == charset_big5_2))			\
	  {								\
	    unsigned char b1, b2;					\
									\
	    ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);			\
	    *dst++ = b1, *dst++ = b2;					\
	  }								\
	else								\
	  *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;		\
      }									\
  } while (0)
1816
4ed46869
KH
1817/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1818 Check if a text is encoded in SJIS. If it is, return
1819 CODING_CATEGORY_MASK_SJIS, else return 0. */
1820
1821int
1822detect_coding_sjis (src, src_end)
1823 unsigned char *src, *src_end;
1824{
1825 unsigned char c;
1826
1827 while (src < src_end)
1828 {
1829 c = *src++;
1830 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1831 return 0;
1832 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1833 {
1834 if (src < src_end && *src++ < 0x40)
1835 return 0;
1836 }
1837 }
1838 return CODING_CATEGORY_MASK_SJIS;
1839}
1840
1841/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1842 Check if a text is encoded in BIG5. If it is, return
1843 CODING_CATEGORY_MASK_BIG5, else return 0. */
1844
1845int
1846detect_coding_big5 (src, src_end)
1847 unsigned char *src, *src_end;
1848{
1849 unsigned char c;
1850
1851 while (src < src_end)
1852 {
1853 c = *src++;
1854 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1855 return 0;
1856 if (c >= 0xA1)
1857 {
1858 if (src >= src_end)
1859 break;
1860 c = *src++;
1861 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1862 return 0;
1863 }
1864 }
1865 return CODING_CATEGORY_MASK_BIG5;
1866}
1867
1868/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1869 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
1870
1871int
1872decode_coding_sjis_big5 (coding, source, destination,
1873 src_bytes, dst_bytes, consumed, sjis_p)
1874 struct coding_system *coding;
1875 unsigned char *source, *destination;
1876 int src_bytes, dst_bytes;
1877 int *consumed;
1878 int sjis_p;
1879{
1880 unsigned char *src = source;
1881 unsigned char *src_end = source + src_bytes;
1882 unsigned char *dst = destination;
1883 unsigned char *dst_end = destination + dst_bytes;
1884 /* Since the maximum bytes produced by each loop is 4, we subtract 3
1885 from DST_END to assure overflow checking is necessary only at the
1886 head of loop. */
1887 unsigned char *adjusted_dst_end = dst_end - 3;
a5d301df
KH
1888 Lisp_Object unification_table
1889 = coding->character_unification_table_for_decode;
1890
1891 if (!NILP (Venable_character_unification) && NILP (unification_table))
1892 unification_table = Vstandard_character_unification_table_for_decode;
4ed46869
KH
1893
1894 while (src < src_end && dst < adjusted_dst_end)
1895 {
1896 /* SRC_BASE remembers the start position in source in each loop.
1897 The loop will be exited when there's not enough source text
1898 to analyze two-byte character (within macro ONE_MORE_BYTE).
1899 In that case, SRC is reset to SRC_BASE before exiting. */
1900 unsigned char *src_base = src;
1901 unsigned char c1 = *src++, c2, c3, c4;
1902
1903 if (c1 == '\r')
1904 {
1905 if (coding->eol_type == CODING_EOL_CRLF)
1906 {
1907 ONE_MORE_BYTE (c2);
1908 if (c2 == '\n')
1909 *dst++ = c2;
1910 else
1911 /* To process C2 again, SRC is subtracted by 1. */
1912 *dst++ = c1, src--;
1913 }
1914 else
1915 *dst++ = c1;
1916 }
a5d301df 1917 else if (c1 < 0x20)
4ed46869 1918 *dst++ = c1;
a5d301df
KH
1919 else if (c1 < 0x80)
1920 DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
4ed46869
KH
1921 else if (c1 < 0xA0 || c1 >= 0xE0)
1922 {
1923 /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1924 if (sjis_p)
1925 {
1926 ONE_MORE_BYTE (c2);
1927 DECODE_SJIS (c1, c2, c3, c4);
a5d301df 1928 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
4ed46869
KH
1929 }
1930 else if (c1 >= 0xE0 && c1 < 0xFF)
1931 {
1932 int charset;
1933
1934 ONE_MORE_BYTE (c2);
1935 DECODE_BIG5 (c1, c2, charset, c3, c4);
a5d301df 1936 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
4ed46869
KH
1937 }
1938 else /* Invalid code */
1939 *dst++ = c1;
1940 }
1941 else
1942 {
1943 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1944 if (sjis_p)
a5d301df 1945 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1, /* dummy */ c2);
4ed46869
KH
1946 else
1947 {
1948 int charset;
1949
1950 ONE_MORE_BYTE (c2);
1951 DECODE_BIG5 (c1, c2, charset, c3, c4);
a5d301df 1952 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
4ed46869
KH
1953 }
1954 }
1955 continue;
1956
1957 label_end_of_loop:
1958 coding->carryover_size = src - src_base;
1959 bcopy (src_base, coding->carryover, coding->carryover_size);
1960 src = src_base;
1961 break;
1962 }
1963
1964 *consumed = src - source;
1965 return dst - destination;
1966}
1967
1968/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1969 This function can encode `charset_ascii', `charset_katakana_jisx0201',
1970 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
1971 sure that all these charsets are registered as official charset
1972 (i.e. do not have extended leading-codes). Characters of other
1973 charsets are produced without any encoding. If SJIS_P is 1, encode
1974 SJIS text, else encode BIG5 text. */
1975
1976int
1977encode_coding_sjis_big5 (coding, source, destination,
1978 src_bytes, dst_bytes, consumed, sjis_p)
1979 struct coding_system *coding;
1980 unsigned char *source, *destination;
1981 int src_bytes, dst_bytes;
1982 int *consumed;
1983 int sjis_p;
1984{
1985 unsigned char *src = source;
1986 unsigned char *src_end = source + src_bytes;
1987 unsigned char *dst = destination;
1988 unsigned char *dst_end = destination + dst_bytes;
1989 /* Since the maximum bytes produced by each loop is 2, we subtract 1
1990 from DST_END to assure overflow checking is necessary only at the
1991 head of loop. */
1992 unsigned char *adjusted_dst_end = dst_end - 1;
a5d301df
KH
1993 Lisp_Object unification_table
1994 = coding->character_unification_table_for_encode;
1995
1996 if (!NILP (Venable_character_unification) && NILP (unification_table))
1997 unification_table = Vstandard_character_unification_table_for_encode;
4ed46869
KH
1998
1999 while (src < src_end && dst < adjusted_dst_end)
2000 {
2001 /* SRC_BASE remembers the start position in source in each loop.
2002 The loop will be exited when there's not enough source text
2003 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2004 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
2005 before exiting. */
2006 unsigned char *src_base = src;
2007 unsigned char c1 = *src++, c2, c3, c4;
2008
2009 if (coding->composing)
2010 {
2011 if (c1 == 0xA0)
2012 {
2013 ONE_MORE_BYTE (c1);
2014 c1 &= 0x7F;
2015 }
2016 else if (c1 >= 0xA0)
2017 c1 -= 0x20;
2018 else
2019 coding->composing = 0;
2020 }
2021
2022 switch (emacs_code_class[c1])
2023 {
2024 case EMACS_ascii_code:
a5d301df
KH
2025 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2026 break;
2027
4ed46869
KH
2028 case EMACS_control_code:
2029 *dst++ = c1;
2030 break;
2031
2032 case EMACS_carriage_return_code:
2033 if (!coding->selective)
2034 {
2035 *dst++ = c1;
2036 break;
2037 }
2038 /* fall down to treat '\r' as '\n' ... */
2039
2040 case EMACS_linefeed_code:
2041 if (coding->eol_type == CODING_EOL_LF
0ef69138 2042 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2043 *dst++ = '\n';
2044 else if (coding->eol_type == CODING_EOL_CRLF)
2045 *dst++ = '\r', *dst++ = '\n';
2046 else
2047 *dst++ = '\r';
2048 break;
2049
2050 case EMACS_leading_code_2:
2051 ONE_MORE_BYTE (c2);
a5d301df 2052 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
2053 break;
2054
2055 case EMACS_leading_code_3:
2056 TWO_MORE_BYTES (c2, c3);
a5d301df 2057 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
4ed46869
KH
2058 break;
2059
2060 case EMACS_leading_code_4:
2061 THREE_MORE_BYTES (c2, c3, c4);
a5d301df 2062 ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
4ed46869
KH
2063 break;
2064
2065 case EMACS_leading_code_composition:
2066 coding->composing = 1;
2067 break;
2068
2069 default: /* i.e. case EMACS_invalid_code: */
2070 *dst++ = c1;
2071 }
2072 continue;
2073
2074 label_end_of_loop:
76376439 2075 coding->carryover_size = src_end - src_base;
4ed46869 2076 bcopy (src_base, coding->carryover, coding->carryover_size);
76376439 2077 src = src_end;
4ed46869
KH
2078 break;
2079 }
2080
2081 *consumed = src - source;
2082 return dst - destination;
2083}
2084
2085\f
2086/*** 5. End-of-line handlers ***/
2087
2088/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2089 This function is called only when `coding->eol_type' is
2090 CODING_EOL_CRLF or CODING_EOL_CR. */
2091
2092decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2093 struct coding_system *coding;
2094 unsigned char *source, *destination;
2095 int src_bytes, dst_bytes;
2096 int *consumed;
2097{
2098 unsigned char *src = source;
2099 unsigned char *src_end = source + src_bytes;
2100 unsigned char *dst = destination;
2101 unsigned char *dst_end = destination + dst_bytes;
2102 int produced;
2103
2104 switch (coding->eol_type)
2105 {
2106 case CODING_EOL_CRLF:
2107 {
2108 /* Since the maximum bytes produced by each loop is 2, we
2109 subtract 1 from DST_END to assure overflow checking is
2110 necessary only at the head of loop. */
2111 unsigned char *adjusted_dst_end = dst_end - 1;
2112
2113 while (src < src_end && dst < adjusted_dst_end)
2114 {
2115 unsigned char *src_base = src;
2116 unsigned char c = *src++;
2117 if (c == '\r')
2118 {
2119 ONE_MORE_BYTE (c);
2120 if (c != '\n')
2121 *dst++ = '\r';
bfd99048 2122 *dst++ = c;
4ed46869
KH
2123 }
2124 else
2125 *dst++ = c;
2126 continue;
2127
2128 label_end_of_loop:
2129 coding->carryover_size = src - src_base;
2130 bcopy (src_base, coding->carryover, coding->carryover_size);
2131 src = src_base;
2132 break;
2133 }
2134 *consumed = src - source;
2135 produced = dst - destination;
2136 break;
2137 }
2138
2139 case CODING_EOL_CR:
2140 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2141 bcopy (source, destination, produced);
2142 dst_end = destination + produced;
2143 while (dst < dst_end)
2144 if (*dst++ == '\r') dst[-1] = '\n';
2145 *consumed = produced;
2146 break;
2147
2148 default: /* i.e. case: CODING_EOL_LF */
2149 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2150 bcopy (source, destination, produced);
2151 *consumed = produced;
2152 break;
2153 }
2154
2155 return produced;
2156}
2157
2158/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2159 format of end-of-line according to `coding->eol_type'. If
2160 `coding->selective' is 1, code '\r' in source text also means
2161 end-of-line. */
2162
2163encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2164 struct coding_system *coding;
2165 unsigned char *source, *destination;
2166 int src_bytes, dst_bytes;
2167 int *consumed;
2168{
2169 unsigned char *src = source;
2170 unsigned char *dst = destination;
2171 int produced;
2172
2173 if (src_bytes <= 0)
2174 return 0;
2175
2176 switch (coding->eol_type)
2177 {
2178 case CODING_EOL_LF:
0ef69138 2179 case CODING_EOL_UNDECIDED:
4ed46869
KH
2180 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2181 bcopy (source, destination, produced);
2182 if (coding->selective)
2183 {
2184 int i = produced;
2185 while (i--)
2186 if (*dst++ == '\r') dst[-1] = '\n';
2187 }
2188 *consumed = produced;
2189
2190 case CODING_EOL_CRLF:
2191 {
2192 unsigned char c;
2193 unsigned char *src_end = source + src_bytes;
2194 unsigned char *dst_end = destination + dst_bytes;
2195 /* Since the maximum bytes produced by each loop is 2, we
2196 subtract 1 from DST_END to assure overflow checking is
2197 necessary only at the head of loop. */
2198 unsigned char *adjusted_dst_end = dst_end - 1;
2199
2200 while (src < src_end && dst < adjusted_dst_end)
2201 {
2202 c = *src++;
2203 if (c == '\n' || (c == '\r' && coding->selective))
2204 *dst++ = '\r', *dst++ = '\n';
2205 else
2206 *dst++ = c;
2207 }
2208 produced = dst - destination;
2209 *consumed = src - source;
2210 break;
2211 }
2212
2213 default: /* i.e. case CODING_EOL_CR: */
2214 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2215 bcopy (source, destination, produced);
2216 {
2217 int i = produced;
2218 while (i--)
2219 if (*dst++ == '\n') dst[-1] = '\r';
2220 }
2221 *consumed = produced;
2222 }
2223
2224 return produced;
2225}
2226
2227\f
2228/*** 6. C library functions ***/
2229
2230/* In Emacs Lisp, coding system is represented by a Lisp symbol which
2231 has a property `coding-system'. The value of this property is a
2232 vector of length 5 (called as coding-vector). Among elements of
2233 this vector, the first (element[0]) and the fifth (element[4])
2234 carry important information for decoding/encoding. Before
2235 decoding/encoding, this information should be set in fields of a
2236 structure of type `coding_system'.
2237
2238 A value of property `coding-system' can be a symbol of another
2239 subsidiary coding-system. In that case, Emacs gets coding-vector
2240 from that symbol.
2241
2242 `element[0]' contains information to be set in `coding->type'. The
2243 value and its meaning is as follows:
2244
0ef69138
KH
2245 0 -- coding_type_emacs_mule
2246 1 -- coding_type_sjis
2247 2 -- coding_type_iso2022
2248 3 -- coding_type_big5
2249 4 -- coding_type_ccl encoder/decoder written in CCL
     5 -- coding_type_raw_text
2250 nil -- coding_type_no_conversion
2251 t -- coding_type_undecided (automatic conversion on decoding,
2252 no-conversion on encoding)
4ed46869
KH
2253
2254 `element[4]' contains information to be set in `coding->flags' and
2255 `coding->spec'. The meaning varies by `coding->type'.
2256
2257 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2258 of length 32 (of which the first 17 sub-elements are used now).
2259 Meanings of these sub-elements are:
2260
2261 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2262 If the value is an integer of valid charset, the charset is
2263 assumed to be designated to graphic register N initially.
2264
2265 If the value is minus, it is a minus value of charset which
2266 reserves graphic register N, which means that the charset is
2267 not designated initially but should be designated to graphic
2268 register N just before encoding a character in that charset.
2269
2270 If the value is nil, graphic register N is never used on
2271 encoding.
2272
2273 sub-element[N] where N is 4 through 16: to be set in `coding->flags'
2274 Each value takes t or nil. See the section ISO2022 of
2275 `coding.h' for more information.
2276
2277 If `coding->type' is `coding_type_big5', element[4] is t to denote
2278 BIG5-ETen or nil to denote BIG5-HKU.
2279
2280 If `coding->type' takes the other value, element[4] is ignored.
2281
2282 Emacs Lisp's coding system also carries information about format of
2283 end-of-line in a value of property `eol-type'. If the value is
2284 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2285 means CODING_EOL_CR. If it is not integer, it should be a vector
2286 of subsidiary coding systems of which property `eol-type' has one
2287 of above values.
2288
2289*/
2290
2291/* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2292 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2293 is setup so that no conversion is necessary and return -1, else
2294 return 0. */
2295
2296int
e0e989f6
KH
2297setup_coding_system (coding_system, coding)
2298 Lisp_Object coding_system;
4ed46869
KH
2299 struct coding_system *coding;
2300{
4ed46869
KH
2301 Lisp_Object type, eol_type;
2302
f4dee582 2303 /* At first, set several fields to default values. */
4ed46869
KH
2304 coding->require_flushing = 0;
2305 coding->last_block = 0;
2306 coding->selective = 0;
2307 coding->composing = 0;
2308 coding->direction = 0;
2309 coding->carryover_size = 0;
4ed46869 2310 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
a5d301df
KH
2311 coding->character_unification_table_for_decode = Qnil;
2312 coding->character_unification_table_for_encode = Qnil;
4ed46869 2313
e0e989f6
KH
2314 Vlast_coding_system_used = coding->symbol = coding_system;
2315 eol_type = Qnil;
2316 /* Get value of property `coding-system' until we get a vector.
2317 While doing that, also get values of properties
a5d301df
KH
2318 `post-read-conversion', `pre-write-conversion',
2319 `character-unification-table-for-decode',
2320 `character-unification-table-for-encode' and `eol-type'. */
e0e989f6 2321 while (!NILP (coding_system) && SYMBOLP (coding_system))
4ed46869 2322 {
4ed46869 2323 if (NILP (coding->post_read_conversion))
e0e989f6 2324 coding->post_read_conversion = Fget (coding_system,
4ed46869 2325 Qpost_read_conversion);
e0e989f6
KH
2326 if (NILP (coding->pre_write_conversion))
2327 coding->pre_write_conversion = Fget (coding_system,
4ed46869 2328 Qpre_write_conversion);
9ce27fde 2329 if (!inhibit_eol_conversion && NILP (eol_type))
e0e989f6 2330 eol_type = Fget (coding_system, Qeol_type);
a5d301df
KH
2331
2332 if (NILP (coding->character_unification_table_for_decode))
2333 coding->character_unification_table_for_decode
2334 = Fget (coding_system, Qcharacter_unification_table_for_decode);
2335
2336 if (NILP (coding->character_unification_table_for_encode))
2337 coding->character_unification_table_for_encode
2338 = Fget (coding_system, Qcharacter_unification_table_for_encode);
2339
e0e989f6 2340 coding_system = Fget (coding_system, Qcoding_system);
4ed46869 2341 }
a5d301df
KH
2342
2343 while (!NILP (coding->character_unification_table_for_decode)
2344 && SYMBOLP (coding->character_unification_table_for_decode))
2345 coding->character_unification_table_for_decode
2346 = Fget (coding->character_unification_table_for_decode,
2347 Qcharacter_unification_table_for_decode);
2348 if (!NILP (coding->character_unification_table_for_decode)
2349 && !CHAR_TABLE_P (coding->character_unification_table_for_decode))
2350 coding->character_unification_table_for_decode = Qnil;
2351
2352 while (!NILP (coding->character_unification_table_for_encode)
2353 && SYMBOLP (coding->character_unification_table_for_encode))
2354 coding->character_unification_table_for_encode
2355 = Fget (coding->character_unification_table_for_encode,
2356 Qcharacter_unification_table_for_encode);
2357 if (!NILP (coding->character_unification_table_for_encode)
2358 && !CHAR_TABLE_P (coding->character_unification_table_for_encode))
2359 coding->character_unification_table_for_encode = Qnil;
2360
e0e989f6
KH
2361 if (!VECTORP (coding_system)
2362 || XVECTOR (coding_system)->size != 5)
4ed46869
KH
2363 goto label_invalid_coding_system;
2364
4ed46869 2365 if (VECTORP (eol_type))
0ef69138 2366 coding->eol_type = CODING_EOL_UNDECIDED;
4ed46869
KH
2367 else if (XFASTINT (eol_type) == 1)
2368 coding->eol_type = CODING_EOL_CRLF;
2369 else if (XFASTINT (eol_type) == 2)
2370 coding->eol_type = CODING_EOL_CR;
2371 else
2372 coding->eol_type = CODING_EOL_LF;
2373
e0e989f6 2374 type = XVECTOR (coding_system)->contents[0];
4ed46869
KH
2375 switch (XFASTINT (type))
2376 {
2377 case 0:
0ef69138 2378 coding->type = coding_type_emacs_mule;
4ed46869
KH
2379 break;
2380
2381 case 1:
2382 coding->type = coding_type_sjis;
2383 break;
2384
2385 case 2:
2386 coding->type = coding_type_iso2022;
2387 {
f44d27ce 2388 Lisp_Object val;
4ed46869
KH
2389 Lisp_Object *flags;
2390 int i, charset, default_reg_bits = 0;
2391
f44d27ce
RS
2392 val = XVECTOR (coding_system)->contents[4];
2393
4ed46869
KH
2394 if (!VECTORP (val) || XVECTOR (val)->size != 32)
2395 goto label_invalid_coding_system;
2396
2397 flags = XVECTOR (val)->contents;
2398 coding->flags
2399 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2400 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2401 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2402 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2403 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2404 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2405 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2406 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
e0e989f6
KH
2407 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2408 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
c4825358
KH
2409 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2410 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3f003981 2411 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
c4825358 2412 );
4ed46869
KH
2413
2414 /* Invoke graphic register 0 to plane 0. */
2415 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2416 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2417 CODING_SPEC_ISO_INVOCATION (coding, 1)
2418 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2419 /* Not single shifting at first. */
6e85d753 2420 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
e0e989f6 2421 /* Beginning of buffer should also be regarded as bol. */
6e85d753 2422 CODING_SPEC_ISO_BOL (coding) = 1;
4ed46869
KH
2423
2424 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2425 FLAGS[REG] can be one of below:
2426 integer CHARSET: CHARSET occupies register I,
2427 t: designate nothing to REG initially, but can be used
2428 by any charsets,
2429 list of integer, nil, or t: designate the first
2430 element (if integer) to REG initially, the remaining
2431 elements (if integer) is designated to REG on request,
2432 if an element is t, REG can be used by any charset,
2433 nil: REG is never used. */
467e7675 2434 for (charset = 0; charset <= MAX_CHARSET; charset++)
1ba9e4ab
KH
2435 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2436 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
6e85d753 2437 bzero (CODING_SPEC_ISO_EXPECTED_CHARSETS (coding), MAX_CHARSET + 1);
4ed46869
KH
2438 for (i = 0; i < 4; i++)
2439 {
2440 if (INTEGERP (flags[i])
e0e989f6
KH
2441 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2442 || (charset = get_charset_id (flags[i])) >= 0)
4ed46869
KH
2443 {
2444 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2445 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
6e85d753 2446 CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset] = 1;
4ed46869
KH
2447 }
2448 else if (EQ (flags[i], Qt))
2449 {
2450 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2451 default_reg_bits |= 1 << i;
2452 }
2453 else if (CONSP (flags[i]))
2454 {
2455 Lisp_Object tail = flags[i];
2456
2457 if (INTEGERP (XCONS (tail)->car)
2458 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
2459 CHARSET_VALID_P (charset))
2460 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
4ed46869
KH
2461 {
2462 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2463 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
6e85d753 2464 CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset] = 1;
4ed46869
KH
2465 }
2466 else
2467 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2468 tail = XCONS (tail)->cdr;
2469 while (CONSP (tail))
2470 {
2471 if (INTEGERP (XCONS (tail)->car)
2472 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
2473 CHARSET_VALID_P (charset))
2474 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
6e85d753
KH
2475 {
2476 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2477 = i;
2478 CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset]
2479 = 1;
2480 }
4ed46869
KH
2481 else if (EQ (XCONS (tail)->car, Qt))
2482 default_reg_bits |= 1 << i;
2483 tail = XCONS (tail)->cdr;
2484 }
2485 }
2486 else
2487 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2488
2489 CODING_SPEC_ISO_DESIGNATION (coding, i)
2490 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2491 }
2492
2493 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2494 {
2495 /* REG 1 can be used only by locking shift in 7-bit env. */
2496 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2497 default_reg_bits &= ~2;
2498 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2499 /* Without any shifting, only REG 0 and 1 can be used. */
2500 default_reg_bits &= 3;
2501 }
2502
6e85d753
KH
2503 for (charset = 0; charset <= MAX_CHARSET; charset++)
2504 if (CHARSET_VALID_P (charset)
2505 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2506 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
2507 {
2508 /* We have not yet decided where to designate CHARSET. */
2509 int reg_bits = default_reg_bits;
2510
2511 if (CHARSET_CHARS (charset) == 96)
2512 /* A charset of CHARS96 can't be designated to REG 0. */
2513 reg_bits &= ~1;
2514
2515 if (reg_bits)
2516 /* There exist some default graphic register. */
2517 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2518 = (reg_bits & 1
2519 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2520 else
2521 /* We anyway have to designate CHARSET to somewhere. */
2522 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2523 = (CHARSET_CHARS (charset) == 94
2524 ? 0
2525 : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2526 || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2527 ? 1
2528 : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2529 ? 2 : 0)));
2530 }
4ed46869
KH
2531 }
2532 coding->require_flushing = 1;
2533 break;
2534
2535 case 3:
2536 coding->type = coding_type_big5;
2537 coding->flags
e0e989f6 2538 = (NILP (XVECTOR (coding_system)->contents[4])
4ed46869
KH
2539 ? CODING_FLAG_BIG5_HKU
2540 : CODING_FLAG_BIG5_ETEN);
2541 break;
2542
2543 case 4:
2544 coding->type = coding_type_ccl;
2545 {
e0e989f6 2546 Lisp_Object val = XVECTOR (coding_system)->contents[4];
4ed46869
KH
2547 if (CONSP (val)
2548 && VECTORP (XCONS (val)->car)
2549 && VECTORP (XCONS (val)->cdr))
2550 {
2551 setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2552 setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2553 }
2554 else
2555 goto label_invalid_coding_system;
2556 }
2557 coding->require_flushing = 1;
2558 break;
2559
27901516
KH
2560 case 5:
2561 coding->type = coding_type_raw_text;
2562 break;
2563
4ed46869
KH
2564 default:
2565 if (EQ (type, Qt))
0ef69138 2566 coding->type = coding_type_undecided;
4ed46869
KH
2567 else
2568 coding->type = coding_type_no_conversion;
2569 break;
2570 }
2571 return 0;
2572
2573 label_invalid_coding_system:
2574 coding->type = coding_type_no_conversion;
dec137e5 2575 coding->eol_type = CODING_EOL_LF;
e0e989f6
KH
2576 coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2577 = Qnil;
4ed46869
KH
2578 return -1;
2579}
2580
2581/* Emacs has a mechanism to automatically detect a coding system if it
2582 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
2583 it's impossible to distinguish some coding systems accurately
2584 because they use the same range of codes. So, at first, coding
2585 systems are grouped into the categories listed below:
2586
0ef69138 2587 o coding-category-emacs-mule
4ed46869
KH
2588
2589 The category for a coding system which has the same code range
2590 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 2591 symbol) `emacs-mule' by default.
4ed46869
KH
2592
2593 o coding-category-sjis
2594
2595 The category for a coding system which has the same code range
2596 as SJIS. Assigned the coding-system (Lisp
7717c392 2597 symbol) `japanese-shift-jis' by default.
4ed46869
KH
2598
2599 o coding-category-iso-7
2600
2601 The category for a coding system which has the same code range
7717c392
KH
2602 as ISO2022 of 7-bit environment. This doesn't use any locking
2603 shift and single shift functions. Assigned the coding-system
2604 (Lisp symbol) `iso-2022-7bit' by default.
4ed46869
KH
2605
2606 o coding-category-iso-8-1
2607
2608 The category for a coding system which has the same code range
2609 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
2610 for DIMENSION1 charset. This doesn't use any locking shift
2611 and single shift functions. Assigned the coding-system (Lisp
2612 symbol) `iso-latin-1' by default.
4ed46869
KH
2613
2614 o coding-category-iso-8-2
2615
2616 The category for a coding system which has the same code range
2617 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
2618 for DIMENSION2 charset. This doesn't use any locking shift
2619 and single shift functions. Assigned the coding-system (Lisp
2620 symbol) `japanese-iso-8bit' by default.
4ed46869 2621
7717c392 2622 o coding-category-iso-7-else
4ed46869
KH
2623
2624 The category for a coding system which has the same code range
7717c392
KH
2625 as ISO2022 of 7-bit environment but uses locking shift or
2626 single shift functions. Assigned the coding-system (Lisp
2627 symbol) `iso-2022-7bit-lock' by default.
2628
2629 o coding-category-iso-8-else
2630
2631 The category for a coding system which has the same code range
2632 as ISO2022 of 8-bit environment but uses locking shift or
2633 single shift functions. Assigned the coding-system (Lisp
2634 symbol) `iso-2022-8bit-ss2' by default.
4ed46869
KH
2635
2636 o coding-category-big5
2637
2638 The category for a coding system which has the same code range
2639 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 2640 `cn-big5' by default.
4ed46869
KH
2641
2642 o coding-category-binary
2643
2644 The category for a coding system not categorized in any of the
2645 above. Assigned the coding-system (Lisp symbol)
e0e989f6 2646 `no-conversion' by default.
4ed46869
KH
2647
2648 Each of them is a Lisp symbol and the value is an actual
2649 `coding-system's (this is also a Lisp symbol) assigned by a user.
2650 What Emacs does actually is to detect a category of coding system.
2651 Then, it uses a `coding-system' assigned to it. If Emacs can't
2652 decide only one possible category, it selects a category of the
2653 highest priority. Priorities of categories are also specified by a
2654 user in a Lisp variable `coding-category-list'.
2655
2656*/
2657
2658/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2659 If it detects possible coding systems, return an integer in which
2660 appropriate flag bits are set. Flag bits are defined by macros
2661 CODING_CATEGORY_MASK_XXX in `coding.h'. */
2662
2663int
2664detect_coding_mask (src, src_bytes)
2665 unsigned char *src;
2666 int src_bytes;
2667{
2668 register unsigned char c;
2669 unsigned char *src_end = src + src_bytes;
2670 int mask;
2671
2672 /* At first, skip all ASCII characters and control characters except
2673 for three ISO2022 specific control characters. */
bcf26d6a 2674 label_loop_detect_coding:
4ed46869
KH
2675 while (src < src_end)
2676 {
2677 c = *src;
2678 if (c >= 0x80
2679 || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2680 break;
2681 src++;
2682 }
2683
2684 if (src >= src_end)
2685 /* We found nothing other than ASCII. There's nothing to do. */
2686 return CODING_CATEGORY_MASK_ANY;
2687
2688 /* The text seems to be encoded in some multilingual coding system.
2689 Now, try to find in which coding system the text is encoded. */
2690 if (c < 0x80)
bcf26d6a
KH
2691 {
2692 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2693 /* C is an ISO2022 specific control code of C0. */
2694 mask = detect_coding_iso2022 (src, src_end);
2695 src++;
1b2af4b0 2696 if (mask == 0)
bcf26d6a
KH
2697 /* No valid ISO2022 code follows C. Try again. */
2698 goto label_loop_detect_coding;
5d648571 2699 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
bcf26d6a 2700 }
4ed46869 2701 else if (c < 0xA0)
c4825358 2702 {
3f003981 2703 /* If C is a special latin extra code,
c4825358
KH
2704 or is an ISO2022 specific control code of C1 (SS2 or SS3),
2705 or is an ISO2022 control-sequence-introducer (CSI),
27901516 2706 we should also consider the possibility of ISO2022 codings. */
3f003981
KH
2707 if ((VECTORP (Vlatin_extra_code_table)
2708 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
c4825358
KH
2709 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
2710 || (c == ISO_CODE_CSI
2711 && (src < src_end
2712 && (*src == ']'
2713 || (src + 1 < src_end
2714 && src[1] == ']'
2715 && (*src == '0' || *src == '1' || *src == '2'))))))
2716 mask = (detect_coding_iso2022 (src, src_end)
2717 | detect_coding_sjis (src, src_end)
2718 | detect_coding_emacs_mule (src, src_end)
27901516 2719 | CODING_CATEGORY_MASK_RAW_TEXT);
4ed46869 2720
c4825358 2721 else
27901516
KH
2722 /* C is the first byte of SJIS character code,
2723 or a leading-code of Emacs' internal format (emacs-mule). */
c4825358
KH
2724 mask = (detect_coding_sjis (src, src_end)
2725 | detect_coding_emacs_mule (src, src_end)
27901516 2726 | CODING_CATEGORY_MASK_RAW_TEXT);
c4825358 2727 }
4ed46869
KH
2728 else
2729 /* C is a character of ISO2022 in graphic plane right,
2730 or a SJIS's 1-byte character code (i.e. JISX0201),
2731 or the first byte of BIG5's 2-byte code. */
2732 mask = (detect_coding_iso2022 (src, src_end)
2733 | detect_coding_sjis (src, src_end)
10bff6f1 2734 | detect_coding_big5 (src, src_end)
27901516 2735 | CODING_CATEGORY_MASK_RAW_TEXT);
4ed46869
KH
2736
2737 return mask;
2738}
2739
2740/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2741 The information of the detected coding system is set in CODING. */
2742
2743void
2744detect_coding (coding, src, src_bytes)
2745 struct coding_system *coding;
2746 unsigned char *src;
2747 int src_bytes;
2748{
2749 int mask = detect_coding_mask (src, src_bytes);
2750 int idx;
27901516 2751 Lisp_Object val = Vcoding_category_list;
4ed46869
KH
2752
2753 if (mask == CODING_CATEGORY_MASK_ANY)
2754 /* We found nothing other than ASCII. There's nothing to do. */
2755 return;
2756
27901516
KH
2757 /* We found some plausible coding systems. Let's use a coding
2758 system of the highest priority. */
4ed46869 2759
27901516
KH
2760 if (CONSP (val))
2761 while (!NILP (val))
2762 {
2763 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2764 if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2765 break;
2766 val = XCONS (val)->cdr;
2767 }
2768 else
2769 val = Qnil;
4ed46869 2770
27901516
KH
2771 if (NILP (val))
2772 {
2773 /* For unknown reason, `Vcoding_category_list' contains none of
2774 found categories. Let's use any of them. */
2775 for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2776 if (mask & (1 << idx))
2777 break;
4ed46869
KH
2778 }
2779 setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2780}
2781
2782/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2783 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
0ef69138 2784 CODING_EOL_CR, and CODING_EOL_UNDECIDED. */
4ed46869 2785
bc4bc72a
RS
2786#define MAX_EOL_CHECK_COUNT 3
2787
4ed46869
KH
2788int
2789detect_eol_type (src, src_bytes)
2790 unsigned char *src;
2791 int src_bytes;
2792{
2793 unsigned char *src_end = src + src_bytes;
2794 unsigned char c;
bc4bc72a
RS
2795 int total = 0; /* How many end-of-lines are found so far. */
2796 int eol_type = CODING_EOL_UNDECIDED;
2797 int this_eol_type;
4ed46869 2798
bc4bc72a 2799 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4ed46869
KH
2800 {
2801 c = *src++;
bc4bc72a 2802 if (c == '\n' || c == '\r')
4ed46869 2803 {
bc4bc72a
RS
2804 total++;
2805 if (c == '\n')
2806 this_eol_type = CODING_EOL_LF;
2807 else if (src >= src_end || *src != '\n')
2808 this_eol_type = CODING_EOL_CR;
4ed46869 2809 else
bc4bc72a
RS
2810 this_eol_type = CODING_EOL_CRLF, src++;
2811
2812 if (eol_type == CODING_EOL_UNDECIDED)
2813 /* This is the first end-of-line. */
2814 eol_type = this_eol_type;
2815 else if (eol_type != this_eol_type)
2816 /* The found type is different from what found before.
27901516
KH
2817 Let's notice the caller about this inconsistency. */
2818 return CODING_EOL_INCONSISTENT;
4ed46869
KH
2819 }
2820 }
bc4bc72a 2821
85a02ca4 2822 return eol_type;
4ed46869
KH
2823}
2824
2825/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2826 is encoded. If it detects an appropriate format of end-of-line, it
2827 sets the information in *CODING. */
2828
2829void
2830detect_eol (coding, src, src_bytes)
2831 struct coding_system *coding;
2832 unsigned char *src;
2833 int src_bytes;
2834{
fb3903d3 2835 Lisp_Object val, coding_system;
4ed46869
KH
2836 int eol_type = detect_eol_type (src, src_bytes);
2837
0ef69138 2838 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2839 /* We found no end-of-line in the source text. */
2840 return;
2841
27901516
KH
2842 if (eol_type == CODING_EOL_INCONSISTENT)
2843 {
2844#if 0
2845 /* This code is suppressed until we find a better way to
992f23f2 2846 distinguish raw text file and binary file. */
27901516
KH
2847
2848 /* If we have already detected that the coding is raw-text, the
2849 coding should actually be no-conversion. */
2850 if (coding->type == coding_type_raw_text)
2851 {
2852 setup_coding_system (Qno_conversion, coding);
2853 return;
2854 }
2855 /* Else, let's decode only text code anyway. */
2856#endif /* 0 */
1b2af4b0 2857 eol_type = CODING_EOL_LF;
27901516
KH
2858 }
2859
fb3903d3
KH
2860 coding_system = coding->symbol;
2861 while (!NILP (coding_system)
2862 && NILP (val = Fget (coding_system, Qeol_type)))
2863 coding_system = Fget (coding_system, Qcoding_system);
4ed46869
KH
2864 if (VECTORP (val) && XVECTOR (val)->size == 3)
2865 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2866}
2867
2868/* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
2869 decoding, it may detect coding system and format of end-of-line if
2870 those are not yet decided. */
2871
2872int
2873decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2874 struct coding_system *coding;
2875 unsigned char *source, *destination;
2876 int src_bytes, dst_bytes;
2877 int *consumed;
2878{
2879 int produced;
2880
2881 if (src_bytes <= 0)
2882 {
2883 *consumed = 0;
2884 return 0;
2885 }
2886
0ef69138 2887 if (coding->type == coding_type_undecided)
4ed46869
KH
2888 detect_coding (coding, source, src_bytes);
2889
0ef69138 2890 if (coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2891 detect_eol (coding, source, src_bytes);
2892
2893 coding->carryover_size = 0;
2894 switch (coding->type)
2895 {
2896 case coding_type_no_conversion:
2897 label_no_conversion:
2898 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2899 bcopy (source, destination, produced);
2900 *consumed = produced;
2901 break;
2902
0ef69138
KH
2903 case coding_type_emacs_mule:
2904 case coding_type_undecided:
27901516 2905 case coding_type_raw_text:
4ed46869 2906 if (coding->eol_type == CODING_EOL_LF
0ef69138 2907 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2908 goto label_no_conversion;
2909 produced = decode_eol (coding, source, destination,
2910 src_bytes, dst_bytes, consumed);
2911 break;
2912
2913 case coding_type_sjis:
2914 produced = decode_coding_sjis_big5 (coding, source, destination,
2915 src_bytes, dst_bytes, consumed,
2916 1);
2917 break;
2918
2919 case coding_type_iso2022:
2920 produced = decode_coding_iso2022 (coding, source, destination,
2921 src_bytes, dst_bytes, consumed);
2922 break;
2923
2924 case coding_type_big5:
2925 produced = decode_coding_sjis_big5 (coding, source, destination,
2926 src_bytes, dst_bytes, consumed,
2927 0);
2928 break;
2929
2930 case coding_type_ccl:
2931 produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2932 src_bytes, dst_bytes, consumed);
2933 break;
2934 }
2935
2936 return produced;
2937}
2938
2939/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
2940
2941int
2942encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2943 struct coding_system *coding;
2944 unsigned char *source, *destination;
2945 int src_bytes, dst_bytes;
2946 int *consumed;
2947{
2948 int produced;
2949
4ed46869
KH
2950 switch (coding->type)
2951 {
2952 case coding_type_no_conversion:
2953 label_no_conversion:
2954 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2955 if (produced > 0)
2956 {
2957 bcopy (source, destination, produced);
2958 if (coding->selective)
2959 {
2960 unsigned char *p = destination, *pend = destination + produced;
2961 while (p < pend)
e0e989f6 2962 if (*p++ == '\015') p[-1] = '\n';
4ed46869
KH
2963 }
2964 }
2965 *consumed = produced;
2966 break;
2967
0ef69138
KH
2968 case coding_type_emacs_mule:
2969 case coding_type_undecided:
27901516 2970 case coding_type_raw_text:
4ed46869 2971 if (coding->eol_type == CODING_EOL_LF
0ef69138 2972 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2973 goto label_no_conversion;
2974 produced = encode_eol (coding, source, destination,
2975 src_bytes, dst_bytes, consumed);
2976 break;
2977
2978 case coding_type_sjis:
2979 produced = encode_coding_sjis_big5 (coding, source, destination,
2980 src_bytes, dst_bytes, consumed,
2981 1);
2982 break;
2983
2984 case coding_type_iso2022:
2985 produced = encode_coding_iso2022 (coding, source, destination,
2986 src_bytes, dst_bytes, consumed);
2987 break;
2988
2989 case coding_type_big5:
2990 produced = encode_coding_sjis_big5 (coding, source, destination,
2991 src_bytes, dst_bytes, consumed,
2992 0);
2993 break;
2994
2995 case coding_type_ccl:
2996 produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2997 src_bytes, dst_bytes, consumed);
2998 break;
2999 }
3000
3001 return produced;
3002}
3003
3004#define CONVERSION_BUFFER_EXTRA_ROOM 256
3005
3006/* Return maximum size (bytes) of a buffer enough for decoding
3007 SRC_BYTES of text encoded in CODING. */
3008
3009int
3010decoding_buffer_size (coding, src_bytes)
3011 struct coding_system *coding;
3012 int src_bytes;
3013{
3014 int magnification;
3015
3016 if (coding->type == coding_type_iso2022)
3017 magnification = 3;
3018 else if (coding->type == coding_type_ccl)
3019 magnification = coding->spec.ccl.decoder.buf_magnification;
3020 else
3021 magnification = 2;
3022
3023 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3024}
3025
3026/* Return maximum size (bytes) of a buffer enough for encoding
3027 SRC_BYTES of text to CODING. */
3028
3029int
3030encoding_buffer_size (coding, src_bytes)
3031 struct coding_system *coding;
3032 int src_bytes;
3033{
3034 int magnification;
3035
3036 if (coding->type == coding_type_ccl)
3037 magnification = coding->spec.ccl.encoder.buf_magnification;
3038 else
3039 magnification = 3;
3040
3041 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3042}
3043
3044#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
3045#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
3046#endif
3047
3048char *conversion_buffer;
3049int conversion_buffer_size;
3050
3051/* Return a pointer to a SIZE bytes of buffer to be used for encoding
3052 or decoding. Sufficient memory is allocated automatically. If we
3053 run out of memory, return NULL. */
3054
3055char *
3056get_conversion_buffer (size)
3057 int size;
3058{
3059 if (size > conversion_buffer_size)
3060 {
3061 char *buf;
3062 int real_size = conversion_buffer_size * 2;
3063
3064 while (real_size < size) real_size *= 2;
3065 buf = (char *) xmalloc (real_size);
3066 xfree (conversion_buffer);
3067 conversion_buffer = buf;
3068 conversion_buffer_size = real_size;
3069 }
3070 return conversion_buffer;
3071}
3072
3073\f
3074#ifdef emacs
3075/*** 7. Emacs Lisp library functions ***/
3076
02ba4723 3077DEFUN ("coding-system-spec", Fcoding_system_spec, Scoding_system_spec,
4ed46869 3078 1, 1, 0,
02ba4723 3079 "Return coding-spec of CODING-SYSTEM.\n\
4ed46869
KH
3080If CODING-SYSTEM is not a valid coding-system, return nil.")
3081 (obj)
3082 Lisp_Object obj;
3083{
3084 while (SYMBOLP (obj) && !NILP (obj))
3085 obj = Fget (obj, Qcoding_system);
3086 return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
3087 ? Qnil : obj);
3088}
3089
3090DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
3091 "Return t if OBJECT is nil or a coding-system.\n\
3092See document of make-coding-system for coding-system object.")
3093 (obj)
3094 Lisp_Object obj;
3095{
02ba4723 3096 return ((NILP (obj) || !NILP (Fcoding_system_spec (obj))) ? Qt : Qnil);
4ed46869
KH
3097}
3098
9d991de8
RS
3099DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
3100 Sread_non_nil_coding_system, 1, 1, 0,
e0e989f6 3101 "Read a coding system from the minibuffer, prompting with string PROMPT.")
4ed46869
KH
3102 (prompt)
3103 Lisp_Object prompt;
3104{
e0e989f6 3105 Lisp_Object val;
9d991de8
RS
3106 do
3107 {
02ba4723 3108 val = Fcompleting_read (prompt, Vobarray, Qcoding_system_spec,
61e011d9 3109 Qt, Qnil, Qnil, Qnil, Qnil);
9d991de8
RS
3110 }
3111 while (XSTRING (val)->size == 0);
e0e989f6 3112 return (Fintern (val, Qnil));
4ed46869
KH
3113}
3114
9b787f3e
RS
3115DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
3116 "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
3117If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
3118 (prompt, default_coding_system)
3119 Lisp_Object prompt, default_coding_system;
4ed46869 3120{
f44d27ce 3121 Lisp_Object val;
9b787f3e
RS
3122 if (SYMBOLP (default_coding_system))
3123 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
f44d27ce 3124 val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
9b787f3e
RS
3125 Qt, Qnil, Qcoding_system_history,
3126 default_coding_system, Qnil);
e0e989f6 3127 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
3128}
3129
3130DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
3131 1, 1, 0,
3132 "Check validity of CODING-SYSTEM.\n\
3133If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
3134CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
3135The value of property should be a vector of length 5.")
3136 (coding_system)
3137 Lisp_Object coding_system;
3138{
3139 CHECK_SYMBOL (coding_system, 0);
3140 if (!NILP (Fcoding_system_p (coding_system)))
3141 return coding_system;
3142 while (1)
02ba4723 3143 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4ed46869
KH
3144}
3145
3146DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
3147 2, 2, 0,
bf9cdd4e
KH
3148 "Detect coding system of the text in the region between START and END.\n\
3149Return a list of possible coding systems ordered by priority.\n\
0ef69138 3150If only ASCII characters are found, it returns `undecided'\n\
bf9cdd4e 3151 or its subsidiary coding system according to a detected end-of-line format.")
4ed46869
KH
3152 (b, e)
3153 Lisp_Object b, e;
3154{
3155 int coding_mask, eol_type;
3156 Lisp_Object val;
3157 int beg, end;
3158
3159 validate_region (&b, &e);
3160 beg = XINT (b), end = XINT (e);
3161 if (beg < GPT && end >= GPT) move_gap (end);
3162
3163 coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
3164 eol_type = detect_eol_type (POS_ADDR (beg), end - beg);
3165
3166 if (coding_mask == CODING_CATEGORY_MASK_ANY)
3167 {
27901516
KH
3168 val = Qundecided;
3169 if (eol_type != CODING_EOL_UNDECIDED
3170 && eol_type != CODING_EOL_INCONSISTENT)
4ed46869 3171 {
f44d27ce
RS
3172 Lisp_Object val2;
3173 val2 = Fget (Qundecided, Qeol_type);
4ed46869
KH
3174 if (VECTORP (val2))
3175 val = XVECTOR (val2)->contents[eol_type];
3176 }
3177 }
3178 else
3179 {
3180 Lisp_Object val2;
3181
3182 /* At first, gather possible coding-systems in VAL in a reverse
3183 order. */
3184 val = Qnil;
3185 for (val2 = Vcoding_category_list;
3186 !NILP (val2);
3187 val2 = XCONS (val2)->cdr)
3188 {
3189 int idx
3190 = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
3191 if (coding_mask & (1 << idx))
27901516
KH
3192 {
3193#if 0
3194 /* This code is suppressed until we find a better way to
992f23f2 3195 distinguish raw text file and binary file. */
27901516
KH
3196
3197 if (idx == CODING_CATEGORY_IDX_RAW_TEXT
3198 && eol_type == CODING_EOL_INCONSISTENT)
3199 val = Fcons (Qno_conversion, val);
3200 else
3201#endif /* 0 */
3202 val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
3203 }
4ed46869
KH
3204 }
3205
3206 /* Then, change the order of the list, while getting subsidiary
3207 coding-systems. */
3208 val2 = val;
3209 val = Qnil;
27901516
KH
3210 if (eol_type == CODING_EOL_INCONSISTENT)
3211 eol_type == CODING_EOL_UNDECIDED;
4ed46869
KH
3212 for (; !NILP (val2); val2 = XCONS (val2)->cdr)
3213 {
0ef69138 3214 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
3215 val = Fcons (XCONS (val2)->car, val);
3216 else
3217 {
f44d27ce
RS
3218 Lisp_Object val3;
3219 val3 = Fget (XCONS (val2)->car, Qeol_type);
4ed46869
KH
3220 if (VECTORP (val3))
3221 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
3222 else
3223 val = Fcons (XCONS (val2)->car, val);
3224 }
3225 }
3226 }
3227
3228 return val;
3229}
3230
3231/* Scan text in the region between *BEGP and *ENDP, skip characters
3232 which we never have to encode to (iff ENCODEP is 1) or decode from
3233 coding system CODING at the head and tail, then set BEGP and ENDP
3234 to the addresses of start and end of the text we actually convert. */
3235
3236void
3237shrink_conversion_area (begp, endp, coding, encodep)
3238 unsigned char **begp, **endp;
3239 struct coding_system *coding;
3240 int encodep;
3241{
3242 register unsigned char *beg_addr = *begp, *end_addr = *endp;
3243
3244 if (coding->eol_type != CODING_EOL_LF
0ef69138 3245 && coding->eol_type != CODING_EOL_UNDECIDED)
4ed46869
KH
3246 /* Since we anyway have to convert end-of-line format, it is not
3247 worth skipping at most 100 bytes or so. */
3248 return;
3249
3250 if (encodep) /* for encoding */
3251 {
3252 switch (coding->type)
3253 {
3254 case coding_type_no_conversion:
0ef69138
KH
3255 case coding_type_emacs_mule:
3256 case coding_type_undecided:
27901516 3257 case coding_type_raw_text:
4ed46869
KH
3258 /* We need no conversion. */
3259 *begp = *endp;
3260 return;
3261 case coding_type_ccl:
3262 /* We can't skip any data. */
3263 return;
e0e989f6
KH
3264 case coding_type_iso2022:
3265 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3266 {
3267 unsigned char *bol = beg_addr;
3268 while (beg_addr < end_addr && *beg_addr < 0x80)
3269 {
3270 beg_addr++;
3271 if (*(beg_addr - 1) == '\n')
3272 bol = beg_addr;
3273 }
3274 beg_addr = bol;
3275 goto label_skip_tail;
3276 }
3277 /* fall down ... */
4ed46869
KH
3278 default:
3279 /* We can skip all ASCII characters at the head and tail. */
3280 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
e0e989f6 3281 label_skip_tail:
4ed46869
KH
3282 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3283 break;
3284 }
3285 }
3286 else /* for decoding */
3287 {
3288 switch (coding->type)
3289 {
3290 case coding_type_no_conversion:
3291 /* We need no conversion. */
3292 *begp = *endp;
3293 return;
0ef69138 3294 case coding_type_emacs_mule:
27901516 3295 case coding_type_raw_text:
4ed46869
KH
3296 if (coding->eol_type == CODING_EOL_LF)
3297 {
3298 /* We need no conversion. */
3299 *begp = *endp;
3300 return;
3301 }
3302 /* We can skip all but carriage-return. */
3303 while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
3304 while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
3305 break;
3306 case coding_type_sjis:
3307 case coding_type_big5:
3308 /* We can skip all ASCII characters at the head. */
3309 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3310 /* We can skip all ASCII characters at the tail except for
3311 the second byte of SJIS or BIG5 code. */
3312 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3313 if (end_addr != *endp)
3314 end_addr++;
3315 break;
3316 case coding_type_ccl:
3317 /* We can't skip any data. */
3318 return;
3319 default: /* i.e. case coding_type_iso2022: */
3320 {
3321 unsigned char c;
3322
3323 /* We can skip all ASCII characters except for a few
3324 control codes at the head. */
3325 while (beg_addr < end_addr && (c = *beg_addr) < 0x80
3326 && c != ISO_CODE_CR && c != ISO_CODE_SO
3327 && c != ISO_CODE_SI && c != ISO_CODE_ESC)
3328 beg_addr++;
3329 }
3330 break;
3331 }
3332 }
3333 *begp = beg_addr;
3334 *endp = end_addr;
3335 return;
3336}
3337
3338/* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3339 text between B and E. B and E are buffer position. */
3340
3341Lisp_Object
3342code_convert_region (b, e, coding, encodep)
3343 Lisp_Object b, e;
3344 struct coding_system *coding;
3345 int encodep;
3346{
3347 int beg, end, len, consumed, produced;
3348 char *buf;
3349 unsigned char *begp, *endp;
3350 int pos = PT;
3351
3352 validate_region (&b, &e);
3353 beg = XINT (b), end = XINT (e);
3354 if (beg < GPT && end >= GPT)
3355 move_gap (end);
3356
3357 if (encodep && !NILP (coding->pre_write_conversion))
3358 {
3359 /* We must call a pre-conversion function which may put a new
3360 text to be converted in a new buffer. */
3361 struct buffer *old = current_buffer, *new;
3362
3363 TEMP_SET_PT (beg);
3364 call2 (coding->pre_write_conversion, b, e);
3365 if (old != current_buffer)
3366 {
3367 /* Replace the original text by the text just generated. */
3368 len = ZV - BEGV;
3369 new = current_buffer;
3370 set_buffer_internal (old);
3371 del_range (beg, end);
3372 insert_from_buffer (new, 1, len, 0);
3373 end = beg + len;
3374 }
3375 }
3376
3377 /* We may be able to shrink the conversion region. */
3378 begp = POS_ADDR (beg); endp = begp + (end - beg);
3379 shrink_conversion_area (&begp, &endp, coding, encodep);
3380
3381 if (begp == endp)
3382 /* We need no conversion. */
3383 len = end - beg;
3384 else
3385 {
3386 beg += begp - POS_ADDR (beg);
3387 end = beg + (endp - begp);
3388
3389 if (encodep)
3390 len = encoding_buffer_size (coding, end - beg);
3391 else
3392 len = decoding_buffer_size (coding, end - beg);
3393 buf = get_conversion_buffer (len);
3394
3395 coding->last_block = 1;
3396 produced = (encodep
3397 ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3398 &consumed)
3399 : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3400 &consumed));
3401
3402 len = produced + (beg - XINT (b)) + (XINT (e) - end);
3403
3404 TEMP_SET_PT (beg);
3405 insert (buf, produced);
3406 del_range (PT, PT + end - beg);
3407 if (pos >= end)
3408 pos = PT + (pos - end);
3409 else if (pos > beg)
3410 pos = beg;
3411 TEMP_SET_PT (pos);
3412 }
3413
3414 if (!encodep && !NILP (coding->post_read_conversion))
3415 {
3416 /* We must call a post-conversion function which may alter
3417 the text just converted. */
3418 Lisp_Object insval;
3419
3420 beg = XINT (b);
3421 TEMP_SET_PT (beg);
3422 insval = call1 (coding->post_read_conversion, make_number (len));
3423 CHECK_NUMBER (insval, 0);
3424 len = XINT (insval);
3425 }
3426
3427 return make_number (len);
3428}
3429
3430Lisp_Object
e0e989f6
KH
3431code_convert_string (str, coding, encodep, nocopy)
3432 Lisp_Object str, nocopy;
4ed46869
KH
3433 struct coding_system *coding;
3434 int encodep;
3435{
3436 int len, consumed, produced;
3437 char *buf;
3438 unsigned char *begp, *endp;
3439 int head_skip, tail_skip;
3440 struct gcpro gcpro1;
3441
3442 if (encodep && !NILP (coding->pre_write_conversion)
3443 || !encodep && !NILP (coding->post_read_conversion))
3444 {
3445 /* Since we have to call Lisp functions which assume target text
3446 is in a buffer, after setting a temporary buffer, call
3447 code_convert_region. */
3448 int count = specpdl_ptr - specpdl;
3449 int len = XSTRING (str)->size;
3450 Lisp_Object result;
3451 struct buffer *old = current_buffer;
3452
3453 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3454 temp_output_buffer_setup (" *code-converting-work*");
3455 set_buffer_internal (XBUFFER (Vstandard_output));
3456 insert_from_string (str, 0, len, 0);
3457 code_convert_region (make_number (BEGV), make_number (ZV),
3458 coding, encodep);
3459 result = make_buffer_string (BEGV, ZV, 0);
3460 set_buffer_internal (old);
3461 return unbind_to (count, result);
3462 }
3463
3464 /* We may be able to shrink the conversion region. */
3465 begp = XSTRING (str)->data;
3466 endp = begp + XSTRING (str)->size;
3467 shrink_conversion_area (&begp, &endp, coding, encodep);
3468
3469 if (begp == endp)
3470 /* We need no conversion. */
e0e989f6 3471 return (NILP (nocopy) ? Fcopy_sequence (str) : str);
4ed46869
KH
3472
3473 head_skip = begp - XSTRING (str)->data;
3474 tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3475
3476 GCPRO1 (str);
3477
3478 if (encodep)
3479 len = encoding_buffer_size (coding, endp - begp);
3480 else
3481 len = decoding_buffer_size (coding, endp - begp);
3482 buf = get_conversion_buffer (len + head_skip + tail_skip);
3483
3484 bcopy (XSTRING (str)->data, buf, head_skip);
3485 coding->last_block = 1;
3486 produced = (encodep
3487 ? encode_coding (coding, XSTRING (str)->data + head_skip,
3488 buf + head_skip, endp - begp, len, &consumed)
3489 : decode_coding (coding, XSTRING (str)->data + head_skip,
3490 buf + head_skip, endp - begp, len, &consumed));
3491 bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3492 buf + head_skip + produced,
3493 tail_skip);
3494
3495 UNGCPRO;
3496
3497 return make_string (buf, head_skip + produced + tail_skip);
3498}
3499
3500DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
e0e989f6
KH
3501 3, 3, "r\nzCoding system: ",
3502 "Decode current region by specified coding system.\n\
3503When called from a program, takes three arguments:\n\
3504START, END, and CODING-SYSTEM. START END are buffer positions.\n\
4ed46869
KH
3505Return length of decoded text.")
3506 (b, e, coding_system)
3507 Lisp_Object b, e, coding_system;
3508{
3509 struct coding_system coding;
3510
3511 CHECK_NUMBER_COERCE_MARKER (b, 0);
3512 CHECK_NUMBER_COERCE_MARKER (e, 1);
3513 CHECK_SYMBOL (coding_system, 2);
3514
e0e989f6
KH
3515 if (NILP (coding_system))
3516 return make_number (XFASTINT (e) - XFASTINT (b));
4ed46869
KH
3517 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3518 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3519
3520 return code_convert_region (b, e, &coding, 0);
3521}
3522
3523DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
e0e989f6
KH
3524 3, 3, "r\nzCoding system: ",
3525 "Encode current region by specified coding system.\n\
3526When called from a program, takes three arguments:\n\
3527START, END, and CODING-SYSTEM. START END are buffer positions.\n\
4ed46869
KH
3528Return length of encoded text.")
3529 (b, e, coding_system)
3530 Lisp_Object b, e, coding_system;
3531{
3532 struct coding_system coding;
3533
3534 CHECK_NUMBER_COERCE_MARKER (b, 0);
3535 CHECK_NUMBER_COERCE_MARKER (e, 1);
3536 CHECK_SYMBOL (coding_system, 2);
3537
e0e989f6
KH
3538 if (NILP (coding_system))
3539 return make_number (XFASTINT (e) - XFASTINT (b));
4ed46869
KH
3540 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3541 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3542
3543 return code_convert_region (b, e, &coding, 1);
3544}
3545
3546DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
e0e989f6
KH
3547 2, 3, 0,
3548 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3549Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3550of decoding.")
3551 (string, coding_system, nocopy)
3552 Lisp_Object string, coding_system, nocopy;
4ed46869
KH
3553{
3554 struct coding_system coding;
3555
3556 CHECK_STRING (string, 0);
3557 CHECK_SYMBOL (coding_system, 1);
3558
e0e989f6
KH
3559 if (NILP (coding_system))
3560 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869
KH
3561 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3562 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3563
e0e989f6 3564 return code_convert_string (string, &coding, 0, nocopy);
4ed46869
KH
3565}
3566
3567DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
e0e989f6
KH
3568 2, 3, 0,
3569 "Encode STRING to CODING-SYSTEM, and return the result.\n\
3570Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3571of encoding.")
3572 (string, coding_system, nocopy)
3573 Lisp_Object string, coding_system, nocopy;
4ed46869
KH
3574{
3575 struct coding_system coding;
3576
3577 CHECK_STRING (string, 0);
3578 CHECK_SYMBOL (coding_system, 1);
3579
e0e989f6
KH
3580 if (NILP (coding_system))
3581 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869
KH
3582 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3583 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3584
e0e989f6 3585 return code_convert_string (string, &coding, 1, nocopy);
4ed46869
KH
3586}
3587
3588DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
e0e989f6 3589 "Decode a JISX0208 character of shift-jis encoding.\n\
4ed46869
KH
3590CODE is the character code in SJIS.\n\
3591Return the corresponding character.")
3592 (code)
3593 Lisp_Object code;
3594{
3595 unsigned char c1, c2, s1, s2;
3596 Lisp_Object val;
3597
3598 CHECK_NUMBER (code, 0);
3599 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3600 DECODE_SJIS (s1, s2, c1, c2);
3601 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3602 return val;
3603}
3604
3605DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3606 "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3607Return the corresponding character code in SJIS.")
3608 (ch)
3609 Lisp_Object ch;
3610{
bcf26d6a 3611 int charset, c1, c2, s1, s2;
4ed46869
KH
3612 Lisp_Object val;
3613
3614 CHECK_NUMBER (ch, 0);
3615 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3616 if (charset == charset_jisx0208)
3617 {
3618 ENCODE_SJIS (c1, c2, s1, s2);
bcf26d6a 3619 XSETFASTINT (val, (s1 << 8) | s2);
4ed46869
KH
3620 }
3621 else
3622 XSETFASTINT (val, 0);
3623 return val;
3624}
3625
3626DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3627 "Decode a Big5 character CODE of BIG5 coding-system.\n\
3628CODE is the character code in BIG5.\n\
3629Return the corresponding character.")
3630 (code)
3631 Lisp_Object code;
3632{
3633 int charset;
3634 unsigned char b1, b2, c1, c2;
3635 Lisp_Object val;
3636
3637 CHECK_NUMBER (code, 0);
3638 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3639 DECODE_BIG5 (b1, b2, charset, c1, c2);
3640 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3641 return val;
3642}
3643
3644DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3645 "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3646Return the corresponding character code in Big5.")
3647 (ch)
3648 Lisp_Object ch;
3649{
bcf26d6a 3650 int charset, c1, c2, b1, b2;
4ed46869
KH
3651 Lisp_Object val;
3652
3653 CHECK_NUMBER (ch, 0);
3654 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3655 if (charset == charset_big5_1 || charset == charset_big5_2)
3656 {
3657 ENCODE_BIG5 (charset, c1, c2, b1, b2);
bcf26d6a 3658 XSETFASTINT (val, (b1 << 8) | b2);
4ed46869
KH
3659 }
3660 else
3661 XSETFASTINT (val, 0);
3662 return val;
3663}
3664
1ba9e4ab
KH
3665DEFUN ("set-terminal-coding-system-internal",
3666 Fset_terminal_coding_system_internal,
3667 Sset_terminal_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
3668 (coding_system)
3669 Lisp_Object coding_system;
3670{
3671 CHECK_SYMBOL (coding_system, 0);
3672 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
6e85d753
KH
3673 /* We had better not send unexpected characters to terminal. */
3674 terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
3675
4ed46869
KH
3676 return Qnil;
3677}
3678
c4825358
KH
3679DEFUN ("set-safe-terminal-coding-system-internal",
3680 Fset_safe_terminal_coding_system_internal,
3681 Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
3682 (coding_system)
3683 Lisp_Object coding_system;
3684{
3685 CHECK_SYMBOL (coding_system, 0);
3686 setup_coding_system (Fcheck_coding_system (coding_system),
3687 &safe_terminal_coding);
3688 return Qnil;
3689}
3690
4ed46869
KH
3691DEFUN ("terminal-coding-system",
3692 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3693 "Return coding-system of your terminal.")
3694 ()
3695{
3696 return terminal_coding.symbol;
3697}
3698
1ba9e4ab
KH
3699DEFUN ("set-keyboard-coding-system-internal",
3700 Fset_keyboard_coding_system_internal,
3701 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
3702 (coding_system)
3703 Lisp_Object coding_system;
3704{
3705 CHECK_SYMBOL (coding_system, 0);
3706 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3707 return Qnil;
3708}
3709
3710DEFUN ("keyboard-coding-system",
3711 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3712 "Return coding-system of what is sent from terminal keyboard.")
3713 ()
3714{
3715 return keyboard_coding.symbol;
3716}
3717
3718\f
a5d301df
KH
3719DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
3720 Sfind_operation_coding_system, 1, MANY, 0,
3721 "Choose a coding system for an operation based on the target name.\n\
9ce27fde
KH
3722The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
3723DECODING-SYSTEM is the coding system to use for decoding\n\
3724\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
3725for encoding (in case OPERATION does encoding).\n\
ccdb79f5
RS
3726\n\
3727The first argument OPERATION specifies an I/O primitive:\n\
3728 For file I/O, `insert-file-contents' or `write-region'.\n\
3729 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3730 For network I/O, `open-network-stream'.\n\
3731\n\
3732The remaining arguments should be the same arguments that were passed\n\
3733to the primitive. Depending on which primitive, one of those arguments\n\
3734is selected as the TARGET. For example, if OPERATION does file I/O,\n\
3735whichever argument specifies the file name is TARGET.\n\
3736\n\
3737TARGET has a meaning which depends on OPERATION:\n\
4ed46869
KH
3738 For file I/O, TARGET is a file name.\n\
3739 For process I/O, TARGET is a process name.\n\
3740 For network I/O, TARGET is a service name or a port number\n\
3741\n\
02ba4723
KH
3742This function looks up what specified for TARGET in,\n\
3743`file-coding-system-alist', `process-coding-system-alist',\n\
3744or `network-coding-system-alist' depending on OPERATION.\n\
3745They may specify a coding system, a cons of coding systems,\n\
3746or a function symbol to call.\n\
3747In the last case, we call the function with one argument,\n\
9ce27fde 3748which is a list of all the arguments given to this function.")
4ed46869
KH
3749 (nargs, args)
3750 int nargs;
3751 Lisp_Object *args;
3752{
3753 Lisp_Object operation, target_idx, target, val;
3754 register Lisp_Object chain;
3755
3756 if (nargs < 2)
3757 error ("Too few arguments");
3758 operation = args[0];
3759 if (!SYMBOLP (operation)
3760 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3761 error ("Invalid first arguement");
3762 if (nargs < 1 + XINT (target_idx))
3763 error ("Too few arguments for operation: %s",
3764 XSYMBOL (operation)->name->data);
3765 target = args[XINT (target_idx) + 1];
3766 if (!(STRINGP (target)
3767 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3768 error ("Invalid %dth argument", XINT (target_idx) + 1);
3769
2e34157c
RS
3770 chain = ((EQ (operation, Qinsert_file_contents)
3771 || EQ (operation, Qwrite_region))
02ba4723 3772 ? Vfile_coding_system_alist
2e34157c 3773 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
3774 ? Vnetwork_coding_system_alist
3775 : Vprocess_coding_system_alist));
4ed46869
KH
3776 if (NILP (chain))
3777 return Qnil;
3778
02ba4723 3779 for (; CONSP (chain); chain = XCONS (chain)->cdr)
4ed46869 3780 {
f44d27ce
RS
3781 Lisp_Object elt;
3782 elt = XCONS (chain)->car;
4ed46869
KH
3783
3784 if (CONSP (elt)
3785 && ((STRINGP (target)
3786 && STRINGP (XCONS (elt)->car)
3787 && fast_string_match (XCONS (elt)->car, target) >= 0)
3788 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
02ba4723
KH
3789 {
3790 val = XCONS (elt)->cdr;
3791 if (CONSP (val))
3792 return val;
3793 if (! SYMBOLP (val))
3794 return Qnil;
3795 if (! NILP (Fcoding_system_p (val)))
3796 return Fcons (val, val);
465edc86 3797 if (!NILP (Ffboundp (val)))
5d632ccf 3798 return call1 (val, Flist (nargs, args));
02ba4723
KH
3799 return Qnil;
3800 }
4ed46869
KH
3801 }
3802 return Qnil;
3803}
3804
3805#endif /* emacs */
3806
3807\f
3808/*** 8. Post-amble ***/
3809
3810init_coding_once ()
3811{
3812 int i;
3813
0ef69138 3814 /* Emacs' internal format specific initialize routine. */
4ed46869
KH
3815 for (i = 0; i <= 0x20; i++)
3816 emacs_code_class[i] = EMACS_control_code;
3817 emacs_code_class[0x0A] = EMACS_linefeed_code;
3818 emacs_code_class[0x0D] = EMACS_carriage_return_code;
3819 for (i = 0x21 ; i < 0x7F; i++)
3820 emacs_code_class[i] = EMACS_ascii_code;
3821 emacs_code_class[0x7F] = EMACS_control_code;
3822 emacs_code_class[0x80] = EMACS_leading_code_composition;
3823 for (i = 0x81; i < 0xFF; i++)
3824 emacs_code_class[i] = EMACS_invalid_code;
3825 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3826 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3827 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3828 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3829
3830 /* ISO2022 specific initialize routine. */
3831 for (i = 0; i < 0x20; i++)
3832 iso_code_class[i] = ISO_control_code;
3833 for (i = 0x21; i < 0x7F; i++)
3834 iso_code_class[i] = ISO_graphic_plane_0;
3835 for (i = 0x80; i < 0xA0; i++)
3836 iso_code_class[i] = ISO_control_code;
3837 for (i = 0xA1; i < 0xFF; i++)
3838 iso_code_class[i] = ISO_graphic_plane_1;
3839 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3840 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3841 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3842 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3843 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3844 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3845 iso_code_class[ISO_CODE_ESC] = ISO_escape;
3846 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3847 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3848 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3849
e0e989f6
KH
3850 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3851 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3852
3853 setup_coding_system (Qnil, &keyboard_coding);
3854 setup_coding_system (Qnil, &terminal_coding);
c4825358 3855 setup_coding_system (Qnil, &safe_terminal_coding);
9ce27fde
KH
3856
3857#if defined (MSDOS) || defined (WINDOWSNT)
3858 system_eol_type = CODING_EOL_CRLF;
3859#else
3860 system_eol_type = CODING_EOL_LF;
3861#endif
e0e989f6
KH
3862}
3863
3864#ifdef emacs
3865
3866syms_of_coding ()
3867{
3868 Qtarget_idx = intern ("target-idx");
3869 staticpro (&Qtarget_idx);
3870
bb0115a2
RS
3871 Qcoding_system_history = intern ("coding-system-history");
3872 staticpro (&Qcoding_system_history);
3873 Fset (Qcoding_system_history, Qnil);
3874
9ce27fde 3875 /* Target FILENAME is the first argument. */
e0e989f6 3876 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 3877 /* Target FILENAME is the third argument. */
e0e989f6
KH
3878 Fput (Qwrite_region, Qtarget_idx, make_number (2));
3879
3880 Qcall_process = intern ("call-process");
3881 staticpro (&Qcall_process);
9ce27fde 3882 /* Target PROGRAM is the first argument. */
e0e989f6
KH
3883 Fput (Qcall_process, Qtarget_idx, make_number (0));
3884
3885 Qcall_process_region = intern ("call-process-region");
3886 staticpro (&Qcall_process_region);
9ce27fde 3887 /* Target PROGRAM is the third argument. */
e0e989f6
KH
3888 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3889
3890 Qstart_process = intern ("start-process");
3891 staticpro (&Qstart_process);
9ce27fde 3892 /* Target PROGRAM is the third argument. */
e0e989f6
KH
3893 Fput (Qstart_process, Qtarget_idx, make_number (2));
3894
3895 Qopen_network_stream = intern ("open-network-stream");
3896 staticpro (&Qopen_network_stream);
9ce27fde 3897 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
3898 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3899
4ed46869
KH
3900 Qcoding_system = intern ("coding-system");
3901 staticpro (&Qcoding_system);
3902
3903 Qeol_type = intern ("eol-type");
3904 staticpro (&Qeol_type);
3905
3906 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3907 staticpro (&Qbuffer_file_coding_system);
3908
3909 Qpost_read_conversion = intern ("post-read-conversion");
3910 staticpro (&Qpost_read_conversion);
3911
3912 Qpre_write_conversion = intern ("pre-write-conversion");
3913 staticpro (&Qpre_write_conversion);
3914
27901516
KH
3915 Qno_conversion = intern ("no-conversion");
3916 staticpro (&Qno_conversion);
3917
3918 Qundecided = intern ("undecided");
3919 staticpro (&Qundecided);
3920
02ba4723
KH
3921 Qcoding_system_spec = intern ("coding-system-spec");
3922 staticpro (&Qcoding_system_spec);
4ed46869
KH
3923
3924 Qcoding_system_p = intern ("coding-system-p");
3925 staticpro (&Qcoding_system_p);
3926
3927 Qcoding_system_error = intern ("coding-system-error");
3928 staticpro (&Qcoding_system_error);
3929
3930 Fput (Qcoding_system_error, Qerror_conditions,
3931 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3932 Fput (Qcoding_system_error, Qerror_message,
9ce27fde 3933 build_string ("Invalid coding system"));
4ed46869
KH
3934
3935 Qcoding_category_index = intern ("coding-category-index");
3936 staticpro (&Qcoding_category_index);
3937
3938 {
3939 int i;
3940 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3941 {
3942 coding_category_table[i] = intern (coding_category_name[i]);
3943 staticpro (&coding_category_table[i]);
3944 Fput (coding_category_table[i], Qcoding_category_index,
3945 make_number (i));
3946 }
3947 }
3948
bdd9fb48
KH
3949 Qcharacter_unification_table = intern ("character-unification-table");
3950 staticpro (&Qcharacter_unification_table);
3951 Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
3952 make_number (0));
3953
a5d301df
KH
3954 Qcharacter_unification_table_for_decode
3955 = intern ("character-unification-table-for-decode");
3956 staticpro (&Qcharacter_unification_table_for_decode);
3957
3958 Qcharacter_unification_table_for_encode
3959 = intern ("character-unification-table-for-encode");
3960 staticpro (&Qcharacter_unification_table_for_encode);
3961
9ce27fde
KH
3962 Qemacs_mule = intern ("emacs-mule");
3963 staticpro (&Qemacs_mule);
3964
02ba4723 3965 defsubr (&Scoding_system_spec);
4ed46869
KH
3966 defsubr (&Scoding_system_p);
3967 defsubr (&Sread_coding_system);
3968 defsubr (&Sread_non_nil_coding_system);
3969 defsubr (&Scheck_coding_system);
3970 defsubr (&Sdetect_coding_region);
3971 defsubr (&Sdecode_coding_region);
3972 defsubr (&Sencode_coding_region);
3973 defsubr (&Sdecode_coding_string);
3974 defsubr (&Sencode_coding_string);
3975 defsubr (&Sdecode_sjis_char);
3976 defsubr (&Sencode_sjis_char);
3977 defsubr (&Sdecode_big5_char);
3978 defsubr (&Sencode_big5_char);
1ba9e4ab 3979 defsubr (&Sset_terminal_coding_system_internal);
c4825358 3980 defsubr (&Sset_safe_terminal_coding_system_internal);
4ed46869 3981 defsubr (&Sterminal_coding_system);
1ba9e4ab 3982 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 3983 defsubr (&Skeyboard_coding_system);
a5d301df 3984 defsubr (&Sfind_operation_coding_system);
4ed46869
KH
3985
3986 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3987 "List of coding-categories (symbols) ordered by priority.");
3988 {
3989 int i;
3990
3991 Vcoding_category_list = Qnil;
3992 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3993 Vcoding_category_list
3994 = Fcons (coding_category_table[i], Vcoding_category_list);
3995 }
3996
3997 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
10bff6f1 3998 "Specify the coding system for read operations.\n\
2ebb362d 3999It is useful to bind this variable with `let', but do not set it globally.\n\
4ed46869 4000If the value is a coding system, it is used for decoding on read operation.\n\
a67a9c66 4001If not, an appropriate element is used from one of the coding system alists:\n\
10bff6f1 4002There are three such tables, `file-coding-system-alist',\n\
a67a9c66 4003`process-coding-system-alist', and `network-coding-system-alist'.");
4ed46869
KH
4004 Vcoding_system_for_read = Qnil;
4005
4006 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
10bff6f1 4007 "Specify the coding system for write operations.\n\
2ebb362d 4008It is useful to bind this variable with `let', but do not set it globally.\n\
4ed46869 4009If the value is a coding system, it is used for encoding on write operation.\n\
a67a9c66 4010If not, an appropriate element is used from one of the coding system alists:\n\
10bff6f1 4011There are three such tables, `file-coding-system-alist',\n\
a67a9c66 4012`process-coding-system-alist', and `network-coding-system-alist'.");
4ed46869
KH
4013 Vcoding_system_for_write = Qnil;
4014
4015 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
a67a9c66 4016 "Coding system used in the latest file or process I/O.");
4ed46869
KH
4017 Vlast_coding_system_used = Qnil;
4018
9ce27fde
KH
4019 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
4020 "*Non-nil inhibit code conversion of end-of-line format in any cases.");
4021 inhibit_eol_conversion = 0;
4022
02ba4723
KH
4023 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
4024 "Alist to decide a coding system to use for a file I/O operation.\n\
4025The format is ((PATTERN . VAL) ...),\n\
4026where PATTERN is a regular expression matching a file name,\n\
4027VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4028If VAL is a coding system, it is used for both decoding and encoding\n\
4029the file contents.\n\
4030If VAL is a cons of coding systems, the car part is used for decoding,\n\
4031and the cdr part is used for encoding.\n\
4032If VAL is a function symbol, the function must return a coding system\n\
4033or a cons of coding systems which are used as above.\n\
e0e989f6 4034\n\
9ce27fde 4035See also the function `find-operation-coding-system'.");
02ba4723
KH
4036 Vfile_coding_system_alist = Qnil;
4037
4038 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
4039 "Alist to decide a coding system to use for a process I/O operation.\n\
4040The format is ((PATTERN . VAL) ...),\n\
4041where PATTERN is a regular expression matching a program name,\n\
4042VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4043If VAL is a coding system, it is used for both decoding what received\n\
4044from the program and encoding what sent to the program.\n\
4045If VAL is a cons of coding systems, the car part is used for decoding,\n\
4046and the cdr part is used for encoding.\n\
4047If VAL is a function symbol, the function must return a coding system\n\
4048or a cons of coding systems which are used as above.\n\
4ed46869 4049\n\
9ce27fde 4050See also the function `find-operation-coding-system'.");
02ba4723
KH
4051 Vprocess_coding_system_alist = Qnil;
4052
4053 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
4054 "Alist to decide a coding system to use for a network I/O operation.\n\
4055The format is ((PATTERN . VAL) ...),\n\
4056where PATTERN is a regular expression matching a network service name\n\
4057or is a port number to connect to,\n\
4058VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4059If VAL is a coding system, it is used for both decoding what received\n\
4060from the network stream and encoding what sent to the network stream.\n\
4061If VAL is a cons of coding systems, the car part is used for decoding,\n\
4062and the cdr part is used for encoding.\n\
4063If VAL is a function symbol, the function must return a coding system\n\
4064or a cons of coding systems which are used as above.\n\
4ed46869 4065\n\
9ce27fde 4066See also the function `find-operation-coding-system'.");
02ba4723 4067 Vnetwork_coding_system_alist = Qnil;
4ed46869
KH
4068
4069 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
4070 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
458822a0 4071 eol_mnemonic_unix = ':';
4ed46869
KH
4072
4073 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
4074 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
458822a0 4075 eol_mnemonic_dos = '\\';
4ed46869
KH
4076
4077 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
4078 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
458822a0 4079 eol_mnemonic_mac = '/';
4ed46869
KH
4080
4081 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
4082 "Mnemonic character indicating end-of-line format is not yet decided.");
458822a0 4083 eol_mnemonic_undecided = ':';
4ed46869 4084
bdd9fb48
KH
4085 DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
4086 "Non-nil means ISO 2022 encoder/decoder do character unification.");
4087 Venable_character_unification = Qt;
4088
a5d301df
KH
4089 DEFVAR_LISP ("standard-character-unification-table-for-decode",
4090 &Vstandard_character_unification_table_for_decode,
bdd9fb48 4091 "Table for unifying characters when reading.");
a5d301df 4092 Vstandard_character_unification_table_for_decode = Qnil;
bdd9fb48 4093
a5d301df
KH
4094 DEFVAR_LISP ("standard-character-unification-table-for-encode",
4095 &Vstandard_character_unification_table_for_encode,
bdd9fb48 4096 "Table for unifying characters when writing.");
a5d301df 4097 Vstandard_character_unification_table_for_encode = Qnil;
4ed46869
KH
4098
4099 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
4100 "Alist of charsets vs revision numbers.\n\
4101While encoding, if a charset (car part of an element) is found,\n\
4102designate it with the escape sequence identifing revision (cdr part of the element).");
4103 Vcharset_revision_alist = Qnil;
02ba4723
KH
4104
4105 DEFVAR_LISP ("default-process-coding-system",
4106 &Vdefault_process_coding_system,
4107 "Cons of coding systems used for process I/O by default.\n\
4108The car part is used for decoding a process output,\n\
4109the cdr part is used for encoding a text to be sent to a process.");
4110 Vdefault_process_coding_system = Qnil;
c4825358 4111
3f003981
KH
4112 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
4113 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
c4825358
KH
4114This is a vector of length 256.\n\
4115If Nth element is non-nil, the existence of code N in a file\n\
bb0115a2 4116\(or output of subprocess) doesn't prevent it to be detected as\n\
3f003981
KH
4117a coding system of ISO 2022 variant which has a flag\n\
4118`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
c4825358
KH
4119or reading output of a subprocess.\n\
4120Only 128th through 159th elements has a meaning.");
3f003981 4121 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
4ed46869
KH
4122}
4123
4124#endif /* emacs */