(encode_designation_at_bol): Fix type of local vars C1, C2.
[bpt/emacs.git] / src / coding.c
CommitLineData
4ed46869 1/* Coding system handler (conversion, detection, and etc).
203cb916
RS
2 Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4ed46869 4
369314dc
KH
5This file is part of GNU Emacs.
6
7GNU Emacs is free software; you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation; either version 2, or (at your option)
10any later version.
4ed46869 11
369314dc
KH
12GNU Emacs is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
4ed46869 16
369314dc
KH
17You should have received a copy of the GNU General Public License
18along with GNU Emacs; see the file COPYING. If not, write to
19the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20Boston, MA 02111-1307, USA. */
4ed46869
KH
21
22/*** TABLE OF CONTENTS ***
23
24 1. Preamble
0ef69138 25 2. Emacs' internal format (emacs-mule) handlers
4ed46869
KH
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
28 5. End-of-line handlers
29 6. C library functions
30 7. Emacs Lisp library functions
31 8. Post-amble
32
33*/
34
35/*** GENERAL NOTE on CODING SYSTEM ***
36
37 Coding system is an encoding mechanism of one or more character
38 sets. Here's a list of coding systems which Emacs can handle. When
39 we say "decode", it means converting some other coding system to
0ef69138
KH
  Emacs' internal format (emacs-mule), and when we say "encode",
41 it means converting the coding system emacs-mule to some other
42 coding system.
4ed46869 43
0ef69138 44 0. Emacs' internal format (emacs-mule)
4ed46869
KH
45
46 Emacs itself holds a multi-lingual character in a buffer and a string
f4dee582 47 in a special format. Details are described in section 2.
4ed46869
KH
48
49 1. ISO2022
50
51 The most famous coding system for multiple character sets. X's
f4dee582
RS
52 Compound Text, various EUCs (Extended Unix Code), and coding
53 systems used in Internet communication such as ISO-2022-JP are
54 all variants of ISO2022. Details are described in section 3.
4ed46869
KH
55
56 2. SJIS (or Shift-JIS or MS-Kanji-Code)
57
58 A coding system to encode character sets: ASCII, JISX0201, and
59 JISX0208. Widely used for PC's in Japan. Details are described in
f4dee582 60 section 4.
4ed46869
KH
61
62 3. BIG5
63
64 A coding system to encode character sets: ASCII and Big5. Widely
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
f4dee582
RS
66 described in section 4. In this file, when we write "BIG5"
67 (all uppercase), we mean the coding system, and when we write
68 "Big5" (capitalized), we mean the character set.
4ed46869 69
27901516
KH
70 4. Raw text
71
  A coding system for text containing random 8-bit codes.  Emacs
73 does no code conversion on such a text except for end-of-line
74 format.
75
76 5. Other
4ed46869 77
f4dee582 78 If a user wants to read/write a text encoded in a coding system not
4ed46869
KH
79 listed above, he can supply a decoder and an encoder for it in CCL
80 (Code Conversion Language) programs. Emacs executes the CCL program
81 while reading/writing.
82
f4dee582 83 Emacs represents a coding-system by a Lisp symbol that has a property
4ed46869
KH
84 `coding-system'. But, before actually using the coding-system, the
85 information about it is set in a structure of type `struct
f4dee582 86 coding_system' for rapid processing. See section 6 for more details.
4ed46869
KH
87
88*/
89
90/*** GENERAL NOTES on END-OF-LINE FORMAT ***
91
92 How end-of-line of a text is encoded depends on a system. For
93 instance, Unix's format is just one byte of `line-feed' code,
f4dee582 94 whereas DOS's format is two-byte sequence of `carriage-return' and
4ed46869
KH
95 `line-feed' codes. MacOS's format is one byte of `carriage-return'.
96
f4dee582
RS
97 Since text characters encoding and end-of-line encoding are
98 independent, any coding system described above can take
4ed46869 99 any format of end-of-line. So, Emacs has information of format of
f4dee582 100 end-of-line in each coding-system. See section 6 for more details.
4ed46869
KH
101
102*/
103
104/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
105
106 These functions check if a text between SRC and SRC_END is encoded
107 in the coding system category XXX. Each returns an integer value in
  which appropriate flag bits for the category XXX are set.  The flag
109 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
110 template of these functions. */
111#if 0
112int
0ef69138 113detect_coding_emacs_mule (src, src_end)
4ed46869
KH
114 unsigned char *src, *src_end;
115{
116 ...
117}
118#endif
119
120/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
121
122 These functions decode SRC_BYTES length text at SOURCE encoded in
0ef69138 123 CODING to Emacs' internal format (emacs-mule). The resulting text
f4dee582
RS
124 goes to a place pointed to by DESTINATION, the length of which should
125 not exceed DST_BYTES. The number of bytes actually processed is
126 returned as *CONSUMED. The return value is the length of the decoded
127 text. Below is a template of these functions. */
4ed46869
KH
128#if 0
129decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
130 struct coding_system *coding;
131 unsigned char *source, *destination;
132 int src_bytes, dst_bytes;
133 int *consumed;
134{
135 ...
136}
137#endif
138
139/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
140
0ef69138
KH
141 These functions encode SRC_BYTES length text at SOURCE of Emacs'
142 internal format (emacs-mule) to CODING. The resulting text goes to
f4dee582
RS
143 a place pointed to by DESTINATION, the length of which should not
144 exceed DST_BYTES. The number of bytes actually processed is
145 returned as *CONSUMED. The return value is the length of the
146 encoded text. Below is a template of these functions. */
4ed46869
KH
147#if 0
148encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
149 struct coding_system *coding;
150 unsigned char *source, *destination;
151 int src_bytes, dst_bytes;
152 int *consumed;
153{
154 ...
155}
156#endif
157
158/*** COMMONLY USED MACROS ***/
159
160/* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
161 THREE_MORE_BYTES safely get one, two, and three bytes from the
162 source text respectively. If there are not enough bytes in the
163 source, they jump to `label_end_of_loop'. The caller should set
164 variables `src' and `src_end' to appropriate areas in advance. */
165
/* Fetch the next source byte into C1 and advance `src'.  If no byte
   is left before `src_end', jump to `label_end_of_loop' instead.  */
#define ONE_MORE_BYTE(c1)                       \
  do {                                          \
    if (src >= src_end)                         \
      goto label_end_of_loop;                   \
    c1 = *src++;                                \
  } while (0)
173
/* Fetch the next two source bytes into C1 and C2 and advance `src'
   past them.  If fewer than two bytes are left before `src_end',
   nothing is consumed and control jumps to `label_end_of_loop'.

   The remaining length is compared as `src_end - src' rather than
   forming `src + 1', because `src' may already be equal to `src_end'
   and a pointer beyond one-past-the-end must not be computed.  */
#define TWO_MORE_BYTES(c1, c2)                  \
  do {                                          \
    if (src_end - src >= 2)                     \
      c1 = *src++, c2 = *src++;                 \
    else                                        \
      goto label_end_of_loop;                   \
  } while (0)
181
/* Fetch the next three source bytes into C1, C2 and C3 and advance
   `src' past them.  If fewer than three bytes are left before
   `src_end', nothing is consumed and control jumps to
   `label_end_of_loop'.

   The remaining length is compared as `src_end - src' rather than
   forming `src + 2', because that pointer could lie beyond
   one-past-the-end of the source buffer.  */
#define THREE_MORE_BYTES(c1, c2, c3)            \
  do {                                          \
    if (src_end - src >= 3)                     \
      c1 = *src++, c2 = *src++, c3 = *src++;    \
    else                                        \
      goto label_end_of_loop;                   \
  } while (0)
189
190/* The following three macros DECODE_CHARACTER_ASCII,
191 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
192 the multi-byte form of a character of each class at the place
193 pointed by `dst'. The caller should set the variable `dst' to
194 point to an appropriate area and the variable `coding' to point to
195 the coding-system of the currently decoding text in advance. */
196
/* Decode one ASCII character C and store it at `dst'.  Outside a
   composition the byte is stored as is; while composing, it is stored
   as the two-byte sequence 0xA0, C | 0x80.  */

#define DECODE_CHARACTER_ASCII(c)               \
  do {                                          \
    if (!COMPOSING_P (coding->composing))       \
      *dst++ = (c);                             \
    else                                        \
      {                                         \
        *dst++ = 0xA0;                          \
        *dst++ = (c) | 0x80;                    \
      }                                         \
  } while (0)
206
/* Decode one DIMENSION1 character whose charset is CHARSET and whose
   position-code is C.  The base leading-code is stored first (with
   0x20 added while composing), then the extended leading-code if
   CHARSET has a non-zero one, then C with the high bit set.  */

#define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
  do {                                                                  \
    unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
    *dst++ = (COMPOSING_P (coding->composing)                           \
              ? leading_code + 0x20                                     \
              : leading_code);                                          \
    leading_code = CHARSET_LEADING_CODE_EXT (charset);                  \
    if (leading_code != 0)                                              \
      *dst++ = leading_code;                                            \
    *dst++ = (c) | 0x80;                                                \
  } while (0)
221
/* Decode one DIMENSION2 character whose charset is CHARSET and whose
   position-codes are C1 and C2.  The leading-code(s) and C1 are
   stored exactly as for a DIMENSION1 character, followed by C2 with
   the high bit set.  */

#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
  do {                                                  \
    DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
    *dst++ = 0x80 | (c2);                               \
  } while (0)
230
231\f
232/*** 1. Preamble ***/
233
234#include <stdio.h>
235
236#ifdef emacs
237
238#include <config.h>
239#include "lisp.h"
240#include "buffer.h"
241#include "charset.h"
242#include "ccl.h"
243#include "coding.h"
244#include "window.h"
245
246#else /* not emacs */
247
248#include "mulelib.h"
249
250#endif /* not emacs */
251
252Lisp_Object Qcoding_system, Qeol_type;
253Lisp_Object Qbuffer_file_coding_system;
254Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
27901516 255Lisp_Object Qno_conversion, Qundecided;
bb0115a2 256Lisp_Object Qcoding_system_history;
4ed46869
KH
257
258extern Lisp_Object Qinsert_file_contents, Qwrite_region;
259Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
260Lisp_Object Qstart_process, Qopen_network_stream;
261Lisp_Object Qtarget_idx;
262
263/* Mnemonic character of each format of end-of-line. */
264int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
265/* Mnemonic character to indicate format of end-of-line is not yet
266 decided. */
267int eol_mnemonic_undecided;
268
9ce27fde
KH
269/* Format of end-of-line decided by system. This is CODING_EOL_LF on
270 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
271int system_eol_type;
272
4ed46869
KH
273#ifdef emacs
274
02ba4723 275Lisp_Object Qcoding_system_spec, Qcoding_system_p, Qcoding_system_error;
4ed46869 276
9ce27fde
KH
277/* Coding system emacs-mule is for converting only end-of-line format. */
278Lisp_Object Qemacs_mule;
279
4ed46869
KH
280/* Coding-systems are handed between Emacs Lisp programs and C internal
281 routines by the following three variables. */
282/* Coding-system for reading files and receiving data from process. */
283Lisp_Object Vcoding_system_for_read;
284/* Coding-system for writing files and sending data to process. */
285Lisp_Object Vcoding_system_for_write;
286/* Coding-system actually used in the latest I/O. */
287Lisp_Object Vlast_coding_system_used;
288
c4825358 289/* A vector of length 256 which contains information about special
3f003981
KH
   Latin codes (especially for dealing with Microsoft code).  */
291Lisp_Object Vlatin_extra_code_table;
c4825358 292
9ce27fde
KH
293/* Flag to inhibit code conversion of end-of-line format. */
294int inhibit_eol_conversion;
295
c4825358 296/* Coding system to be used to encode text for terminal display. */
4ed46869
KH
297struct coding_system terminal_coding;
298
c4825358
KH
299/* Coding system to be used to encode text for terminal display when
300 terminal coding system is nil. */
301struct coding_system safe_terminal_coding;
302
303/* Coding system of what is sent from terminal keyboard. */
4ed46869
KH
304struct coding_system keyboard_coding;
305
02ba4723
KH
306Lisp_Object Vfile_coding_system_alist;
307Lisp_Object Vprocess_coding_system_alist;
308Lisp_Object Vnetwork_coding_system_alist;
4ed46869
KH
309
310#endif /* emacs */
311
312Lisp_Object Qcoding_category_index;
313
314/* List of symbols `coding-category-xxx' ordered by priority. */
315Lisp_Object Vcoding_category_list;
316
317/* Table of coding-systems currently assigned to each coding-category. */
318Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
319
/* Table of names of symbol for each coding-category.  The table has
   CODING_CATEGORY_IDX_MAX slots; the order of the entries is
   significant.  NOTE(review): presumably each entry sits at the index
   given by the matching CODING_CATEGORY_IDX_XXX constant -- confirm
   against `coding.h' before reordering or inserting entries.  */
char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
  "coding-category-emacs-mule",
  "coding-category-sjis",
  "coding-category-iso-7",
  "coding-category-iso-8-1",
  "coding-category-iso-8-2",
  "coding-category-iso-7-else",
  "coding-category-iso-8-else",
  "coding-category-big5",
  "coding-category-raw-text",
  "coding-category-binary"
};
333
bdd9fb48
KH
334/* Flag to tell if we look up unification table on character code
335 conversion. */
336Lisp_Object Venable_character_unification;
a5d301df
KH
337/* Standard unification table to look up on decoding (reading). */
338Lisp_Object Vstandard_character_unification_table_for_decode;
339/* Standard unification table to look up on encoding (writing). */
340Lisp_Object Vstandard_character_unification_table_for_encode;
bdd9fb48
KH
341
342Lisp_Object Qcharacter_unification_table;
a5d301df
KH
343Lisp_Object Qcharacter_unification_table_for_decode;
344Lisp_Object Qcharacter_unification_table_for_encode;
4ed46869
KH
345
346/* Alist of charsets vs revision number. */
347Lisp_Object Vcharset_revision_alist;
348
02ba4723
KH
349/* Default coding systems used for process I/O. */
350Lisp_Object Vdefault_process_coding_system;
351
4ed46869 352\f
0ef69138 353/*** 2. Emacs internal format (emacs-mule) handlers ***/
4ed46869
KH
354
355/* Emacs' internal format for encoding multiple character sets is a
f4dee582
RS
356 kind of multi-byte encoding, i.e. characters are encoded by
357 variable-length sequences of one-byte codes. ASCII characters
358 and control characters (e.g. `tab', `newline') are represented by
359 one-byte sequences which are their ASCII codes, in the range 0x00
360 through 0x7F. The other characters are represented by a sequence
361 of `base leading-code', optional `extended leading-code', and one
362 or two `position-code's. The length of the sequence is determined
363 by the base leading-code. Leading-code takes the range 0x80
364 through 0x9F, whereas extended leading-code and position-code take
365 the range 0xA0 through 0xFF. See `charset.h' for more details
366 about leading-code and position-code.
367
368 There's one exception to this rule. Special leading-code
4ed46869
KH
369 `leading-code-composition' denotes that the following several
370 characters should be composed into one character. Leading-codes of
371 components (except for ASCII) are added 0x20. An ASCII character
372 component is represented by a 2-byte sequence of `0xA0' and
f4dee582
RS
373 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
374 details of composite character. Hence, we can summarize the code
4ed46869
KH
375 range as follows:
376
377 --- CODE RANGE of Emacs' internal format ---
378 (character set) (range)
379 ASCII 0x00 .. 0x7F
380 ELSE (1st byte) 0x80 .. 0x9F
381 (rest bytes) 0xA0 .. 0xFF
382 ---------------------------------------------
383
384 */
385
386enum emacs_code_class_type emacs_code_class[256];
387
/* Go to the next statement only if *SRC is accessible and the code is
   not less than 0xA0 (i.e. it lies in the range 0xA0..0xFF).  The byte
   is consumed in that case.  If the source is exhausted, jump to
   `label_end_of_switch'; if the byte is less than 0xA0, make the
   enclosing function return 0.  */
#define CHECK_CODE_RANGE_A0_FF                  \
  do {                                          \
    if (src < src_end)                          \
      {                                         \
        if (*src++ < 0xA0)                      \
          return 0;                             \
      }                                         \
    else                                        \
      goto label_end_of_switch;                 \
  } while (0)
397
398/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
399 Check if a text is encoded in Emacs' internal format. If it is,
0ef69138 400 return CODING_CATEGORY_MASK_EMASC_MULE, else return 0. */
4ed46869
KH
401
402int
0ef69138 403detect_coding_emacs_mule (src, src_end)
4ed46869
KH
404 unsigned char *src, *src_end;
405{
406 unsigned char c;
407 int composing = 0;
408
409 while (src < src_end)
410 {
411 c = *src++;
412
413 if (composing)
414 {
415 if (c < 0xA0)
416 composing = 0;
417 else
418 c -= 0x20;
419 }
420
421 switch (emacs_code_class[c])
422 {
423 case EMACS_ascii_code:
424 case EMACS_linefeed_code:
425 break;
426
427 case EMACS_control_code:
428 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
429 return 0;
430 break;
431
432 case EMACS_invalid_code:
433 return 0;
434
435 case EMACS_leading_code_composition: /* c == 0x80 */
436 if (composing)
437 CHECK_CODE_RANGE_A0_FF;
438 else
439 composing = 1;
440 break;
441
442 case EMACS_leading_code_4:
443 CHECK_CODE_RANGE_A0_FF;
444 /* fall down to check it two more times ... */
445
446 case EMACS_leading_code_3:
447 CHECK_CODE_RANGE_A0_FF;
448 /* fall down to check it one more time ... */
449
450 case EMACS_leading_code_2:
451 CHECK_CODE_RANGE_A0_FF;
452 break;
453
454 default:
455 label_end_of_switch:
456 break;
457 }
458 }
0ef69138 459 return CODING_CATEGORY_MASK_EMACS_MULE;
4ed46869
KH
460}
461
462\f
463/*** 3. ISO2022 handlers ***/
464
465/* The following note describes the coding system ISO2022 briefly.
f4dee582
RS
466 Since the intention of this note is to help in understanding of
467 the programs in this file, some parts are NOT ACCURATE or OVERLY
4ed46869
KH
468 SIMPLIFIED. For the thorough understanding, please refer to the
469 original document of ISO2022.
470
471 ISO2022 provides many mechanisms to encode several character sets
  in 7-bit and 8-bit environments.  If one chooses a 7-bit environment,
4ed46869 473 all text is encoded by codes of less than 128. This may make the
f4dee582
RS
474 encoded text a little bit longer, but the text gets more stability
475 to pass through several gateways (some of them strip off the MSB).
4ed46869 476
f4dee582 477 There are two kinds of character set: control character set and
4ed46869
KH
478 graphic character set. The former contains control characters such
479 as `newline' and `escape' to provide control functions (control
f4dee582 480 functions are provided also by escape sequences). The latter
4ed46869
KH
  contains graphic characters such as 'A' and '-'.  Emacs recognizes
482 two control character sets and many graphic character sets.
483
484 Graphic character sets are classified into one of the following
485 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
486 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
487 bytes (DIMENSION) and the number of characters in one dimension
488 (CHARS) of the set. In addition, each character set is assigned an
489 identification tag (called "final character" and denoted as <F>
490 here after) which is unique in each class. <F> of each character
491 set is decided by ECMA(*) when it is registered in ISO. Code range
492 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
493
494 Note (*): ECMA = European Computer Manufacturers Association
495
496 Here are examples of graphic character set [NAME(<F>)]:
497 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
498 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
499 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
500 o DIMENSION2_CHARS96 -- none for the moment
501
502 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
503 C0 [0x00..0x1F] -- control character plane 0
504 GL [0x20..0x7F] -- graphic character plane 0
505 C1 [0x80..0x9F] -- control character plane 1
506 GR [0xA0..0xFF] -- graphic character plane 1
507
508 A control character set is directly designated and invoked to C0 or
509 C1 by an escape sequence. The most common case is that ISO646's
510 control character set is designated/invoked to C0 and ISO6429's
511 control character set is designated/invoked to C1, and usually
512 these designations/invocations are omitted in a coded text. With
513 7-bit environment, only C0 can be used, and a control character for
514 C1 is encoded by an appropriate escape sequence to fit in the
  environment.  Every control character for C1 has a corresponding
  escape sequence defined.
517
518 A graphic character set is at first designated to one of four
519 graphic registers (G0 through G3), then these graphic registers are
520 invoked to GL or GR. These designations and invocations can be
521 done independently. The most common case is that G0 is invoked to
522 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
523 these invocations and designations are omitted in a coded text.
524 With 7-bit environment, only GL can be used.
525
526 When a graphic character set of CHARS94 is invoked to GL, code 0x20
527 and 0x7F of GL area work as control characters SPACE and DEL
528 respectively, and code 0xA0 and 0xFF of GR area should not be used.
529
530 There are two ways of invocation: locking-shift and single-shift.
531 With locking-shift, the invocation lasts until the next different
532 invocation, whereas with single-shift, the invocation works only
533 for the following character and doesn't affect locking-shift.
534 Invocations are done by the following control characters or escape
535 sequences.
536
537 ----------------------------------------------------------------------
538 function control char escape sequence description
539 ----------------------------------------------------------------------
540 SI (shift-in) 0x0F none invoke G0 to GL
10bff6f1 541 SO (shift-out) 0x0E none invoke G1 to GL
4ed46869
KH
542 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
543 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
544 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
545 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
546 ----------------------------------------------------------------------
547 The first four are for locking-shift. Control characters for these
548 functions are defined by macros ISO_CODE_XXX in `coding.h'.
549
550 Designations are done by the following escape sequences.
551 ----------------------------------------------------------------------
552 escape sequence description
553 ----------------------------------------------------------------------
554 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
555 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
556 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
557 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
558 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
559 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
560 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
561 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
562 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
563 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
564 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
565 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
566 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
567 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
568 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
569 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
570 ----------------------------------------------------------------------
571
572 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
573 of dimension 1, chars 94, and final character <F>, and etc.
574
575 Note (*): Although these designations are not allowed in ISO2022,
576 Emacs accepts them on decoding, and produces them on encoding
577 CHARS96 character set in a coding system which is characterized as
578 7-bit environment, non-locking-shift, and non-single-shift.
579
580 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
581 '(' can be omitted. We call this as "short-form" here after.
582
583 Now you may notice that there are a lot of ways for encoding the
  same multilingual text in ISO2022.  Actually, there exist many
4ed46869
KH
  coding systems such as Compound Text (used in X's inter-client
  communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
587 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
588 localized platforms), and all of these are variants of ISO2022.
589
590 In addition to the above, Emacs handles two more kinds of escape
591 sequences: ISO6429's direction specification and Emacs' private
592 sequence for specifying character composition.
593
594 ISO6429's direction specification takes the following format:
595 o CSI ']' -- end of the current direction
596 o CSI '0' ']' -- end of the current direction
597 o CSI '1' ']' -- start of left-to-right text
598 o CSI '2' ']' -- start of right-to-left text
599 The control character CSI (0x9B: control sequence introducer) is
600 abbreviated to the escape sequence ESC '[' in 7-bit environment.
601
602 Character composition specification takes the following format:
603 o ESC '0' -- start character composition
604 o ESC '1' -- end character composition
605 Since these are not standard escape sequences of any ISO, the use
606 of them for these meaning is restricted to Emacs only. */
607
608enum iso_code_class_type iso_code_class[256];
609
610/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
611 Check if a text is encoded in ISO2022. If it is, returns an
612 integer in which appropriate flag bits any of:
613 CODING_CATEGORY_MASK_ISO_7
614 CODING_CATEGORY_MASK_ISO_8_1
615 CODING_CATEGORY_MASK_ISO_8_2
7717c392
KH
616 CODING_CATEGORY_MASK_ISO_7_ELSE
617 CODING_CATEGORY_MASK_ISO_8_ELSE
4ed46869
KH
618 are set. If a code which should never appear in ISO2022 is found,
619 returns 0. */
620
621int
622detect_coding_iso2022 (src, src_end)
623 unsigned char *src, *src_end;
624{
765a2ca5
KH
625 int mask = (CODING_CATEGORY_MASK_ISO_7
626 | CODING_CATEGORY_MASK_ISO_8_1
627 | CODING_CATEGORY_MASK_ISO_8_2
7717c392
KH
628 | CODING_CATEGORY_MASK_ISO_7_ELSE
629 | CODING_CATEGORY_MASK_ISO_8_ELSE
630 );
bcf26d6a
KH
631 int g1 = 0; /* 1 iff designating to G1. */
632 int c, i;
3f003981 633 struct coding_system coding_iso_8_1, coding_iso_8_2;
4ed46869 634
3f003981
KH
635 /* Coding systems of these categories may accept latin extra codes. */
636 setup_coding_system
637 (XSYMBOL (coding_category_table[CODING_CATEGORY_IDX_ISO_8_1])->value,
638 &coding_iso_8_1);
639 setup_coding_system
640 (XSYMBOL (coding_category_table[CODING_CATEGORY_IDX_ISO_8_2])->value,
641 &coding_iso_8_2);
642
643 while (mask && src < src_end)
4ed46869
KH
644 {
645 c = *src++;
646 switch (c)
647 {
648 case ISO_CODE_ESC:
e0e989f6 649 if (src >= src_end)
4ed46869
KH
650 break;
651 c = *src++;
bf9cdd4e 652 if ((c >= '(' && c <= '/'))
4ed46869 653 {
bf9cdd4e
KH
654 /* Designation sequence for a charset of dimension 1. */
655 if (src >= src_end)
656 break;
657 c = *src++;
658 if (c < ' ' || c >= 0x80)
659 /* Invalid designation sequence. */
660 return 0;
661 }
662 else if (c == '$')
663 {
664 /* Designation sequence for a charset of dimension 2. */
665 if (src >= src_end)
666 break;
667 c = *src++;
668 if (c >= '@' && c <= 'B')
669 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
670 ;
671 else if (c >= '(' && c <= '/')
bcf26d6a 672 {
bf9cdd4e
KH
673 if (src >= src_end)
674 break;
675 c = *src++;
676 if (c < ' ' || c >= 0x80)
677 /* Invalid designation sequence. */
678 return 0;
bcf26d6a 679 }
bf9cdd4e
KH
680 else
681 /* Invalid designation sequence. */
682 return 0;
4ed46869 683 }
4ed46869 684 else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
bf9cdd4e 685 /* Locking shift. */
7717c392
KH
686 mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
687 | CODING_CATEGORY_MASK_ISO_8_ELSE);
bf9cdd4e
KH
688 else if (c == '0' || c == '1' || c == '2')
689 /* Start/end composition. */
690 ;
691 else
692 /* Invalid escape sequence. */
693 return 0;
4ed46869
KH
694 break;
695
4ed46869 696 case ISO_CODE_SO:
bf9cdd4e
KH
697 mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
698 | CODING_CATEGORY_MASK_ISO_8_ELSE);
e0e989f6
KH
699 break;
700
4ed46869
KH
701 case ISO_CODE_CSI:
702 case ISO_CODE_SS2:
703 case ISO_CODE_SS3:
3f003981
KH
704 {
705 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
706
707 if (VECTORP (Vlatin_extra_code_table)
708 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
709 {
710 if (coding_iso_8_1.flags & CODING_FLAG_ISO_LATIN_EXTRA)
711 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
712 if (coding_iso_8_2.flags & CODING_FLAG_ISO_LATIN_EXTRA)
713 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
714 }
715 mask &= newmask;
716 }
717 break;
4ed46869
KH
718
719 default:
720 if (c < 0x80)
721 break;
722 else if (c < 0xA0)
c4825358 723 {
3f003981
KH
724 if (VECTORP (Vlatin_extra_code_table)
725 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
c4825358 726 {
3f003981
KH
727 int newmask = 0;
728
729 if (coding_iso_8_1.flags & CODING_FLAG_ISO_LATIN_EXTRA)
730 newmask |= CODING_CATEGORY_MASK_ISO_8_1;
731 if (coding_iso_8_2.flags & CODING_FLAG_ISO_LATIN_EXTRA)
732 newmask |= CODING_CATEGORY_MASK_ISO_8_2;
733 mask &= newmask;
c4825358 734 }
3f003981
KH
735 else
736 return 0;
c4825358 737 }
4ed46869
KH
738 else
739 {
7717c392 740 unsigned char *src_begin = src;
4ed46869 741
7717c392
KH
742 mask &= ~(CODING_CATEGORY_MASK_ISO_7
743 | CODING_CATEGORY_MASK_ISO_7_ELSE);
e0e989f6 744 while (src < src_end && *src >= 0xA0)
7717c392
KH
745 src++;
746 if ((src - src_begin - 1) & 1 && src < src_end)
4ed46869
KH
747 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
748 }
749 break;
750 }
751 }
752
753 return mask;
754}
755
/* Decode a character of which charset is CHARSET and the 1st position
   code is C1.  If dimension of CHARSET is 2, the 2nd position code is
   fetched from SRC and set to C2.  If CHARSET is negative, it means
   that we are decoding ill formed text, and what we can do is just to
   read C1 as is.

   The macro uses the variables `coding', `src', `src_end', `dst',
   `c2' and `unification_table' of the enclosing function, and C1 must
   be a variable because it may be overwritten when the character is
   unified.  */

#define DECODE_ISO_CHARACTER(charset, c1)                               \
  do {                                                                  \
    int charset_alt = (charset);                                        \
    int c_alt;                                                          \
    if (COMPOSING_HEAD_P (coding->composing))                           \
      {                                                                 \
        /* This is the first component: emit the composition head.  */  \
        *dst++ = LEADING_CODE_COMPOSITION;                              \
        if (COMPOSING_WITH_RULE_P (coding->composing))                  \
          /* To tell composition rules are embedded.  */                \
          *dst++ = 0xFF;                                                \
        coding->composing += 2;                                         \
      }                                                                 \
    if ((charset) >= 0)                                                 \
      {                                                                 \
        if (CHARSET_DIMENSION (charset) == 2)                           \
          ONE_MORE_BYTE (c2);                                           \
        if (!NILP (unification_table))                                  \
          {                                                             \
            c_alt = unify_char (unification_table,                      \
                                -1, (charset), c1, c2);                 \
            if (c_alt >= 0)                                             \
              SPLIT_CHAR (c_alt, charset_alt, c1, c2);                  \
          }                                                             \
      }                                                                 \
    if (charset_alt < 0 || charset_alt == CHARSET_ASCII)                \
      DECODE_CHARACTER_ASCII (c1);                                      \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
      DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
    else                                                                \
      DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
    if (COMPOSING_WITH_RULE_P (coding->composing))                      \
      /* To tell a composition rule follows.  */                        \
      coding->composing = COMPOSING_WITH_RULE_RULE;                     \
  } while (0)
792
/* Set designation state into CODING: record that the charset looked
   up in ISO_CHARSET_TABLE by DIMENSION, CHARS and FINAL_CHAR is
   designated to graphic register REG.  If the lookup yields a
   negative value, the designation is left unchanged.  When the
   direction of CODING is 1 and the charset has a reverse charset, the
   reverse charset is designated instead.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)           \
  do {                                                                  \
    int charset = ISO_CHARSET_TABLE (make_number (dimension),           \
                                     make_number (chars),               \
                                     make_number (final_char));         \
    if (charset >= 0)                                                   \
      {                                                                 \
        if (coding->direction == 1)                                     \
          {                                                             \
            if (CHARSET_REVERSE_CHARSET (charset) >= 0)                 \
              charset = CHARSET_REVERSE_CHARSET (charset);              \
          }                                                             \
        CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;            \
      }                                                                 \
  } while (0)
807
808/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
809
810int
811decode_coding_iso2022 (coding, source, destination,
812 src_bytes, dst_bytes, consumed)
813 struct coding_system *coding;
814 unsigned char *source, *destination;
815 int src_bytes, dst_bytes;
816 int *consumed;
817{
818 unsigned char *src = source;
819 unsigned char *src_end = source + src_bytes;
820 unsigned char *dst = destination;
821 unsigned char *dst_end = destination + dst_bytes;
822 /* Since the maximum bytes produced by each loop is 7, we subtract 6
823 from DST_END to assure that overflow checking is necessary only
824 at the head of loop. */
825 unsigned char *adjusted_dst_end = dst_end - 6;
826 int charset;
827 /* Charsets invoked to graphic plane 0 and 1 respectively. */
828 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
829 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
a5d301df
KH
830 Lisp_Object unification_table
831 = coding->character_unification_table_for_decode;
bdd9fb48
KH
832
833 if (!NILP (Venable_character_unification) && NILP (unification_table))
a5d301df 834 unification_table = Vstandard_character_unification_table_for_decode;
4ed46869
KH
835
836 while (src < src_end && dst < adjusted_dst_end)
837 {
838 /* SRC_BASE remembers the start position in source in each loop.
839 The loop will be exited when there's not enough source text
840 to analyze long escape sequence or 2-byte code (within macros
841 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
842 to SRC_BASE before exiting. */
843 unsigned char *src_base = src;
bdd9fb48 844 int c1 = *src++, c2;
4ed46869
KH
845
846 switch (iso_code_class [c1])
847 {
848 case ISO_0x20_or_0x7F:
849 if (!coding->composing
850 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
851 {
852 /* This is SPACE or DEL. */
853 *dst++ = c1;
854 break;
855 }
856 /* This is a graphic character, we fall down ... */
857
858 case ISO_graphic_plane_0:
859 if (coding->composing == COMPOSING_WITH_RULE_RULE)
860 {
861 /* This is a composition rule. */
862 *dst++ = c1 | 0x80;
863 coding->composing = COMPOSING_WITH_RULE_TAIL;
864 }
865 else
866 DECODE_ISO_CHARACTER (charset0, c1);
867 break;
868
869 case ISO_0xA0_or_0xFF:
870 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
871 {
872 /* Invalid code. */
873 *dst++ = c1;
874 break;
875 }
876 /* This is a graphic character, we fall down ... */
877
878 case ISO_graphic_plane_1:
879 DECODE_ISO_CHARACTER (charset1, c1);
880 break;
881
882 case ISO_control_code:
883 /* All ISO2022 control characters in this class have the
884 same representation in Emacs internal format. */
885 *dst++ = c1;
886 break;
887
888 case ISO_carriage_return:
889 if (coding->eol_type == CODING_EOL_CR)
890 {
891 *dst++ = '\n';
892 }
893 else if (coding->eol_type == CODING_EOL_CRLF)
894 {
895 ONE_MORE_BYTE (c1);
896 if (c1 == ISO_CODE_LF)
897 *dst++ = '\n';
898 else
899 {
900 src--;
901 *dst++ = c1;
902 }
903 }
904 else
905 {
906 *dst++ = c1;
907 }
908 break;
909
910 case ISO_shift_out:
e0e989f6
KH
911 if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
912 goto label_invalid_escape_sequence;
4ed46869
KH
913 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
914 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
915 break;
916
917 case ISO_shift_in:
918 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
919 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
920 break;
921
922 case ISO_single_shift_2_7:
923 case ISO_single_shift_2:
924 /* SS2 is handled as an escape sequence of ESC 'N' */
925 c1 = 'N';
926 goto label_escape_sequence;
927
928 case ISO_single_shift_3:
929 /* SS2 is handled as an escape sequence of ESC 'O' */
930 c1 = 'O';
931 goto label_escape_sequence;
932
933 case ISO_control_sequence_introducer:
934 /* CSI is handled as an escape sequence of ESC '[' ... */
935 c1 = '[';
936 goto label_escape_sequence;
937
938 case ISO_escape:
939 ONE_MORE_BYTE (c1);
940 label_escape_sequence:
941 /* Escape sequences handled by Emacs are invocation,
942 designation, direction specification, and character
943 composition specification. */
944 switch (c1)
945 {
946 case '&': /* revision of following character set */
947 ONE_MORE_BYTE (c1);
948 if (!(c1 >= '@' && c1 <= '~'))
e0e989f6 949 goto label_invalid_escape_sequence;
4ed46869
KH
950 ONE_MORE_BYTE (c1);
951 if (c1 != ISO_CODE_ESC)
e0e989f6 952 goto label_invalid_escape_sequence;
4ed46869
KH
953 ONE_MORE_BYTE (c1);
954 goto label_escape_sequence;
955
956 case '$': /* designation of 2-byte character set */
957 ONE_MORE_BYTE (c1);
958 if (c1 >= '@' && c1 <= 'B')
959 { /* designation of JISX0208.1978, GB2312.1980,
960 or JISX0208.1980 */
961 DECODE_DESIGNATION (0, 2, 94, c1);
962 }
963 else if (c1 >= 0x28 && c1 <= 0x2B)
964 { /* designation of DIMENSION2_CHARS94 character set */
965 ONE_MORE_BYTE (c2);
966 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
967 }
968 else if (c1 >= 0x2C && c1 <= 0x2F)
969 { /* designation of DIMENSION2_CHARS96 character set */
970 ONE_MORE_BYTE (c2);
971 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
972 }
973 else
e0e989f6 974 goto label_invalid_escape_sequence;
4ed46869
KH
975 break;
976
977 case 'n': /* invocation of locking-shift-2 */
e0e989f6
KH
978 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
979 goto label_invalid_escape_sequence;
4ed46869 980 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
e0e989f6 981 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
982 break;
983
984 case 'o': /* invocation of locking-shift-3 */
e0e989f6
KH
985 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
986 goto label_invalid_escape_sequence;
4ed46869 987 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
e0e989f6 988 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
989 break;
990
991 case 'N': /* invocation of single-shift-2 */
e0e989f6
KH
992 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
993 goto label_invalid_escape_sequence;
4ed46869
KH
994 ONE_MORE_BYTE (c1);
995 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
996 DECODE_ISO_CHARACTER (charset, c1);
997 break;
998
999 case 'O': /* invocation of single-shift-3 */
e0e989f6
KH
1000 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1001 goto label_invalid_escape_sequence;
4ed46869
KH
1002 ONE_MORE_BYTE (c1);
1003 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1004 DECODE_ISO_CHARACTER (charset, c1);
1005 break;
1006
1007 case '0': /* start composing without embeded rules */
1008 coding->composing = COMPOSING_NO_RULE_HEAD;
1009 break;
1010
1011 case '1': /* end composing */
1012 coding->composing = COMPOSING_NO;
1013 break;
1014
1015 case '2': /* start composing with embeded rules */
1016 coding->composing = COMPOSING_WITH_RULE_HEAD;
1017 break;
1018
1019 case '[': /* specification of direction */
1020 /* For the moment, nested direction is not supported.
1021 So, the value of `coding->direction' is 0 or 1: 0
1022 means left-to-right, 1 means right-to-left. */
1023 ONE_MORE_BYTE (c1);
1024 switch (c1)
1025 {
1026 case ']': /* end of the current direction */
1027 coding->direction = 0;
1028
1029 case '0': /* end of the current direction */
1030 case '1': /* start of left-to-right direction */
1031 ONE_MORE_BYTE (c1);
1032 if (c1 == ']')
1033 coding->direction = 0;
1034 else
1035 goto label_invalid_escape_sequence;
1036 break;
1037
1038 case '2': /* start of right-to-left direction */
1039 ONE_MORE_BYTE (c1);
1040 if (c1 == ']')
1041 coding->direction= 1;
1042 else
1043 goto label_invalid_escape_sequence;
1044 break;
1045
1046 default:
1047 goto label_invalid_escape_sequence;
1048 }
1049 break;
1050
1051 default:
1052 if (c1 >= 0x28 && c1 <= 0x2B)
1053 { /* designation of DIMENSION1_CHARS94 character set */
1054 ONE_MORE_BYTE (c2);
1055 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1056 }
1057 else if (c1 >= 0x2C && c1 <= 0x2F)
1058 { /* designation of DIMENSION1_CHARS96 character set */
1059 ONE_MORE_BYTE (c2);
1060 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1061 }
1062 else
1063 {
1064 goto label_invalid_escape_sequence;
1065 }
1066 }
1067 /* We must update these variables now. */
1068 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1069 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1070 break;
1071
1072 label_invalid_escape_sequence:
1073 {
1074 int length = src - src_base;
1075
1076 bcopy (src_base, dst, length);
1077 dst += length;
1078 }
1079 }
1080 continue;
1081
1082 label_end_of_loop:
1083 coding->carryover_size = src - src_base;
1084 bcopy (src_base, coding->carryover, coding->carryover_size);
1085 src = src_base;
1086 break;
1087 }
1088
1089 /* If this is the last block of the text to be decoded, we had
1090 better just flush out all remaining codes in the text although
1091 they are not valid characters. */
1092 if (coding->last_block)
1093 {
1094 bcopy (src, dst, src_end - src);
1095 dst += (src_end - src);
1096 src = src_end;
1097 }
1098 *consumed = src - source;
1099 return dst - destination;
1100}
1101
f4dee582 1102/* ISO2022 encoding stuff. */
4ed46869
KH
1103
1104/*
f4dee582 1105 It is not enough to say just "ISO2022" on encoding, we have to
4ed46869
KH
1106 specify more details. In Emacs, each coding-system of ISO2022
1107 variant has the following specifications:
1108 1. Initial designation to G0 thru G3.
1109 2. Allows short-form designation?
1110 3. ASCII should be designated to G0 before control characters?
1111 4. ASCII should be designated to G0 at end of line?
1112 5. 7-bit environment or 8-bit environment?
1113 6. Use locking-shift?
1114 7. Use Single-shift?
1115 And the following two are only for Japanese:
1116 8. Use ASCII in place of JIS0201-1976-Roman?
1117 9. Use JISX0208-1983 in place of JISX0208-1978?
1118 These specifications are encoded in `coding->flags' as flag bits
1119 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
f4dee582 1120 details.
4ed46869
KH
1121*/
1122
/* Write at DST the escape sequence that designates CHARSET to graphic
   register REG, and record the designation in CODING.  When CHARSET
   has an entry in Vcharset_revision_alist, the sequence is preceded by
   ESC '&' and a revision byte.  For a 2-byte 94-charset designated to
   register 0 whose <final-char> is '@', 'A', or 'B', the intermediate
   character is omitted (short form) if CODING allows it.  */

#define ENCODE_DESIGNATION(charset, reg, coding) \
  do { \
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \
    /* Intermediate characters indexed by register number.  */ \
    char *intermediates \
      = (CHARSET_CHARS (charset) == 94 ? "()*+" : ",-./"); \
    Lisp_Object revision_entry \
      = Fassq (make_number (charset), Vcharset_revision_alist); \
    if (! NILP (revision_entry)) \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = '&'; \
        *dst++ = XINT (XCONS (revision_entry)->cdr) + '@'; \
      } \
    *dst++ = ISO_CODE_ESC; \
    if (CHARSET_DIMENSION (charset) == 1) \
      *dst++ = (unsigned char) (intermediates[reg]); \
    else \
      { \
        *dst++ = '$'; \
        /* Only the short form drops the intermediate character.  */ \
        if (CHARSET_CHARS (charset) != 94 \
            || ! (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
            || reg != 0 \
            || final_char < '@' || final_char > 'B') \
          *dst++ = (unsigned char) (intermediates[reg]); \
      } \
    *dst++ = final_char; \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
  } while (0)
1165
/* Single-shift producers.  Each writes the single-shift function
   (SS2 or SS3) at DST -- as ESC 'N' / ESC 'O' when CODING is flagged
   as 7-bit, otherwise as the one-byte control code -- and then marks
   CODING as being in single-shifting state.  */

#define ENCODE_SINGLE_SHIFT_2 \
  do { \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      *dst++ = ISO_CODE_SS2; \
    else \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = 'N'; \
      } \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3 \
  do { \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      *dst++ = ISO_CODE_SS3; \
    else \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = 'O'; \
      } \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
  } while (0)
1187
/* Locking-shift producers (shift-in, shift-out, locking-shift-2 and
   locking-shift-3).  Each records in CODING which graphic register is
   now invoked to graphic plane 0 and writes the corresponding control
   character or escape sequence at DST.  */

#define ENCODE_SHIFT_IN \
  do { \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
    *dst++ = ISO_CODE_SI; \
  } while (0)

#define ENCODE_SHIFT_OUT \
  do { \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
    *dst++ = ISO_CODE_SO; \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2 \
  do { \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
    *dst++ = ISO_CODE_ESC; \
    *dst++ = 'n'; \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3 \
  do { \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
    *dst++ = ISO_CODE_ESC; \
    *dst++ = 'o'; \
  } while (0)
1215
f4dee582
RS
/* Write at DST the code for a DIMENSION1 character of CHARSET whose
   position-code is C1.  The macro loops: while CHARSET is not invoked
   to either graphic plane, the needed designation and invocation
   sequences are produced first and the checks are tried again.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
  do { \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
      { \
        /* A single-shift precedes this character; it covers only \
           this one character, so leave that state afterwards.  */ \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
                  ? c1 & 0x7F : c1 | 0x80); \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
        break; \
      } \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
      { \
        *dst++ = c1 & 0x7F; \
        break; \
      } \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
      { \
        *dst++ = c1 | 0x80; \
        break; \
      } \
    if (coding->flags & CODING_FLAG_ISO_SAFE \
        && !CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset]) \
      { \
        /* This character must not be encoded; substitute one \
           character per column of its width.  */ \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
        if (CHARSET_WIDTH (charset) == 2) \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
        break; \
      } \
    /* CHARSET is not invoked to any graphic plane yet: invoke it \
       (designating it first if needed) and go round again to \
       actually produce the character.  */ \
    dst = encode_invocation_designation (charset, coding, dst); \
  } while (1)
1259
f4dee582
RS
/* Write at DST the codes for a DIMENSION2 character of CHARSET whose
   position-codes are C1 and C2.  The macro loops: while CHARSET is
   not invoked to either graphic plane, the needed designation and
   invocation sequences are produced first and the checks are tried
   again.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
  do { \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
      { \
        /* A single-shift precedes this character; it covers only \
           this one character, so leave that state afterwards.  */ \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
          { \
            *dst++ = c1 & 0x7F; \
            *dst++ = c2 & 0x7F; \
          } \
        else \
          { \
            *dst++ = c1 | 0x80; \
            *dst++ = c2 | 0x80; \
          } \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
        break; \
      } \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
      { \
        *dst++ = c1 & 0x7F; \
        *dst++ = c2 & 0x7F; \
        break; \
      } \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
      { \
        *dst++ = c1 | 0x80; \
        *dst++ = c2 | 0x80; \
        break; \
      } \
    if (coding->flags & CODING_FLAG_ISO_SAFE \
        && !CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset]) \
      { \
        /* This character must not be encoded; substitute one \
           character per column of its width.  */ \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
        if (CHARSET_WIDTH (charset) == 2) \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
        break; \
      } \
    /* CHARSET is not invoked to any graphic plane yet: invoke it \
       (designating it first if needed) and go round again to \
       actually produce the character.  */ \
    dst = encode_invocation_designation (charset, coding, dst); \
  } while (1)
1302
bdd9fb48
KH
/* Encode the character of CHARSET whose position-codes are C1 and C2
   (C2 is a dummy for a DIMENSION1 charset).  When a unification table
   is in effect and maps the character to another one, the charset and
   position-codes of that other character are used instead; C1 and C2
   are overwritten in that case.  */
#define ENCODE_ISO_CHARACTER(charset, c1, c2) \
  do { \
    int charset_alt = charset; \
    if (!NILP (unification_table)) \
      { \
        int c_alt = unify_char (unification_table, -1, charset, c1, c2); \
        if (c_alt >= 0) \
          SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
      } \
    if (CHARSET_DIMENSION (charset_alt) != 1) \
      ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
    else \
      ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1); \
  } while (0)
1317
4ed46869
KH
1318/* Produce designation and invocation codes at a place pointed by DST
1319 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1320 Return new DST. */
1321
1322unsigned char *
1323encode_invocation_designation (charset, coding, dst)
1324 int charset;
1325 struct coding_system *coding;
1326 unsigned char *dst;
1327{
1328 int reg; /* graphic register number */
1329
1330 /* At first, check designations. */
1331 for (reg = 0; reg < 4; reg++)
1332 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1333 break;
1334
1335 if (reg >= 4)
1336 {
1337 /* CHARSET is not yet designated to any graphic registers. */
1338 /* At first check the requested designation. */
1339 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1ba9e4ab
KH
1340 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1341 /* Since CHARSET requests no special designation, designate it
1342 to graphic register 0. */
4ed46869
KH
1343 reg = 0;
1344
1345 ENCODE_DESIGNATION (charset, reg, coding);
1346 }
1347
1348 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1349 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1350 {
1351 /* Since the graphic register REG is not invoked to any graphic
1352 planes, invoke it to graphic plane 0. */
1353 switch (reg)
1354 {
1355 case 0: /* graphic register 0 */
1356 ENCODE_SHIFT_IN;
1357 break;
1358
1359 case 1: /* graphic register 1 */
1360 ENCODE_SHIFT_OUT;
1361 break;
1362
1363 case 2: /* graphic register 2 */
1364 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1365 ENCODE_SINGLE_SHIFT_2;
1366 else
1367 ENCODE_LOCKING_SHIFT_2;
1368 break;
1369
1370 case 3: /* graphic register 3 */
1371 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1372 ENCODE_SINGLE_SHIFT_3;
1373 else
1374 ENCODE_LOCKING_SHIFT_3;
1375 break;
1376 }
1377 }
1378 return dst;
1379}
1380
/* The following three macros produce codes for indicating
   composition: the start of a composition without rules, the start of
   a composition with rules, and the end of a composition.  Each
   expands to a single parenthesized expression that writes two bytes
   at DST, so it stays one unit wherever an expression is allowed.  */
#define ENCODE_COMPOSITION_NO_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
#define ENCODE_COMPOSITION_WITH_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '2')
#define ENCODE_COMPOSITION_END \
  (*dst++ = ISO_CODE_ESC, *dst++ = '1')
1385
/* The following three macros produce codes for indicating direction
   of text.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER writes CSI at DST: as ESC '['
   when the CODING_FLAG_ISO_SEVEN_BITS bit is set in `coding->flags',
   otherwise as the one-byte control code.  The flag is tested as a
   bit, like in the other ENCODE_ macros, because `coding->flags'
   carries several flag bits at once.

   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R write CSI followed by
   "2]" and "0]" respectively.  They are statement macros; use them
   with a trailing semicolon.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
  do { \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = '['; \
      } \
    else \
      *dst++ = ISO_CODE_CSI; \
  } while (0)

#define ENCODE_DIRECTION_R2L \
  do { \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER; \
    *dst++ = '2'; \
    *dst++ = ']'; \
  } while (0)

#define ENCODE_DIRECTION_L2R \
  do { \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER; \
    *dst++ = '0'; \
    *dst++ = ']'; \
  } while (0)
1401
/* Bring the graphic plane and registers back to their initial state:
   produce shift-in unless graphic register 0 is the one invoked to
   graphic plane 0, then re-designate every register whose current
   designation differs from its (non-negative) initial designation.  */
#define ENCODE_RESET_PLANE_AND_REGISTER \
  do { \
    int reg; \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
      ENCODE_SHIFT_IN; \
    for (reg = 0; reg < 4; reg++) \
      { \
        /* No initial designation for this register.  */ \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) < 0) \
          continue; \
        /* Already in the initial state.  */ \
        if (CODING_SPEC_ISO_DESIGNATION (coding, reg) \
            == CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)) \
          continue; \
        ENCODE_DESIGNATION \
          (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
      } \
  } while (0)
1416
bdd9fb48
KH
1417/* Produce designation sequences of charsets in the line started from
1418 *SRC to a place pointed by DSTP.
1419
1420 If the current block ends before any end-of-line, we may fail to
1421 find all the necessary *designations. */
1422encode_designation_at_bol (coding, table, src, src_end, dstp)
e0e989f6 1423 struct coding_system *coding;
bdd9fb48 1424 Lisp_Object table;
e0e989f6
KH
1425 unsigned char *src, *src_end, **dstp;
1426{
bdd9fb48
KH
1427 int charset, c, found = 0, reg;
1428 /* Table of charsets to be designated to each graphic register. */
1429 int r[4];
1430 unsigned char *dst = *dstp;
1431
1432 for (reg = 0; reg < 4; reg++)
1433 r[reg] = -1;
1434
1435 while (src < src_end && *src != '\n' && found < 4)
e0e989f6 1436 {
bdd9fb48
KH
1437 int bytes = BYTES_BY_CHAR_HEAD (*src);
1438
1439 if (NILP (table))
1440 charset = CHARSET_AT (src);
1441 else
e0e989f6 1442 {
35cb8686
RS
1443 int c_alt;
1444 unsigned char c1, c2;
bdd9fb48
KH
1445
1446 SPLIT_STRING(src, bytes, charset, c1, c2);
1447 if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1448 charset = CHAR_CHARSET (c_alt);
e0e989f6 1449 }
bdd9fb48 1450
e0e989f6 1451 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1ba9e4ab 1452 if (r[reg] == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
bdd9fb48
KH
1453 {
1454 found++;
1455 r[reg] = charset;
1456 }
1457
1458 src += bytes;
1459 }
1460
1461 if (found)
1462 {
1463 for (reg = 0; reg < 4; reg++)
1464 if (r[reg] >= 0
1465 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1466 ENCODE_DESIGNATION (r[reg], reg, coding);
1467 *dstp = dst;
e0e989f6 1468 }
e0e989f6
KH
1469}
1470
4ed46869
KH
1471/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1472
/* Encode the SRC_BYTES bytes of Emacs internal format text at SOURCE
   into ISO2022 at DESTINATION, which has room for DST_BYTES bytes.
   The number of source bytes consumed is stored in *CONSUMED and the
   number of bytes produced is returned.  The ISO2022 state kept in
   CODING (designation, invocation, composition, beginning-of-line
   flag, carryover) is updated while encoding.  */
1473int
1474encode_coding_iso2022 (coding, source, destination,
1475 src_bytes, dst_bytes, consumed)
1476 struct coding_system *coding;
1477 unsigned char *source, *destination;
1478 int src_bytes, dst_bytes;
1479 int *consumed;
1480{
1481 unsigned char *src = source;
1482 unsigned char *src_end = source + src_bytes;
1483 unsigned char *dst = destination;
1484 unsigned char *dst_end = destination + dst_bytes;
e0e989f6 1485 /* Since the maximum bytes produced by each loop is 20, we subtract 19
4ed46869
KH
1486 from DST_END to assure overflow checking is necessary only at the
1487 head of loop. */
e0e989f6 1488 unsigned char *adjusted_dst_end = dst_end - 19;
a5d301df
KH
1489 Lisp_Object unification_table
1490 = coding->character_unification_table_for_encode;
bdd9fb48
KH
1491
/* Fall back to the standard table when unification is enabled but
   CODING has no table of its own.  */
1492 if (!NILP (Venable_character_unification) && NILP (unification_table))
a5d301df 1493 unification_table = Vstandard_character_unification_table_for_encode;
4ed46869
KH
1494
1495 while (src < src_end && dst < adjusted_dst_end)
1496 {
1497 /* SRC_BASE remembers the start position in source in each loop.
1498 The loop will be exited when there's not enough source text
1499 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1500 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1501 reset to SRC_BASE before exiting. */
1502 unsigned char *src_base = src;
bdd9fb48 1503 int charset, c1, c2, c3, c4;
4ed46869 1504
e0e989f6
KH
1505 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1506 && CODING_SPEC_ISO_BOL (coding))
1507 {
bdd9fb48
KH
1508 /* We have to produce designation sequences if any now. */
1509 encode_designation_at_bol (coding, unification_table,
1510 src, src_end, &dst);
e0e989f6
KH
1511 CODING_SPEC_ISO_BOL (coding) = 0;
1512 }
1513
1514 c1 = *src++;
4ed46869
KH
1515 /* If we are seeing a component of a composite character, we are
1516 seeing a leading-code specially encoded for composition, or a
1517 composition rule if composing with rule. We must set C1
1518 to a normal leading-code or an ASCII code. If we are not at
1519 a composed character, we must reset the composition state. */
1520 if (COMPOSING_P (coding->composing))
1521 {
1522 if (c1 < 0xA0)
1523 {
1524 /* We are not in a composite character any longer. */
1525 coding->composing = COMPOSING_NO;
1526 ENCODE_COMPOSITION_END;
1527 }
1528 else
1529 {
1530 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1531 {
/* C1 is a composition rule: emit it with the high bit cleared and
   expect a component next.  */
1532 *dst++ = c1 & 0x7F;
1533 coding->composing = COMPOSING_WITH_RULE_HEAD;
1534 continue;
1535 }
1536 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1537 coding->composing = COMPOSING_WITH_RULE_RULE;
1538 if (c1 == 0xA0)
1539 {
1540 /* This is an ASCII component. */
1541 ONE_MORE_BYTE (c1);
1542 c1 &= 0x7F;
1543 }
1544 else
1545 /* This is a leading-code of non ASCII component. */
1546 c1 -= 0x20;
1547 }
1548 }
1549
1550 /* Now encode one character. C1 is a control character, an
1551 ASCII character, or a leading-code of multi-byte character. */
1552 switch (emacs_code_class[c1])
1553 {
1554 case EMACS_ascii_code:
bdd9fb48 1555 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
4ed46869
KH
1556 break;
1557
1558 case EMACS_control_code:
1559 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1560 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1561 *dst++ = c1;
1562 break;
1563
1564 case EMACS_carriage_return_code:
1565 if (!coding->selective)
1566 {
1567 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1568 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1569 *dst++ = c1;
1570 break;
1571 }
1572 /* fall down to treat '\r' as '\n' ... */
1573
1574 case EMACS_linefeed_code:
1575 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
e0e989f6
KH
1576 ENCODE_RESET_PLANE_AND_REGISTER;
/* With CODING_FLAG_ISO_INIT_AT_BOL, the recorded current designations
   are set back to the initial ones without producing any code.  */
1577 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1578 bcopy (coding->spec.iso2022.initial_designation,
1579 coding->spec.iso2022.current_designation,
1580 sizeof coding->spec.iso2022.initial_designation);
4ed46869 1581 if (coding->eol_type == CODING_EOL_LF
0ef69138 1582 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
1583 *dst++ = ISO_CODE_LF;
1584 else if (coding->eol_type == CODING_EOL_CRLF)
1585 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1586 else
1587 *dst++ = ISO_CODE_CR;
e0e989f6 1588 CODING_SPEC_ISO_BOL (coding) = 1;
4ed46869
KH
1589 break;
1590
1591 case EMACS_leading_code_2:
1592 ONE_MORE_BYTE (c2);
19a8d9e0
KH
1593 if (c2 < 0xA0)
1594 {
1595 /* invalid sequence */
1596 *dst++ = c1;
1597 *dst++ = c2;
1598 }
1599 else
1600 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
1601 break;
1602
1603 case EMACS_leading_code_3:
1604 TWO_MORE_BYTES (c2, c3);
19a8d9e0
KH
1605 if (c2 < 0xA0 || c3 < 0xA0)
1606 {
1607 /* invalid sequence */
1608 *dst++ = c1;
1609 *dst++ = c2;
1610 *dst++ = c3;
1611 }
/* NOTE(review): presumably for leading codes from
   LEADING_CODE_PRIVATE_11 up the charset is in the second byte, so C2
   is passed as the charset below -- confirm against charset.h.  */
1612 else if (c1 < LEADING_CODE_PRIVATE_11)
bdd9fb48 1613 ENCODE_ISO_CHARACTER (c1, c2, c3);
4ed46869 1614 else
bdd9fb48 1615 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
4ed46869
KH
1616 break;
1617
1618 case EMACS_leading_code_4:
1619 THREE_MORE_BYTES (c2, c3, c4);
19a8d9e0
KH
1620 if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1621 {
1622 /* invalid sequence */
1623 *dst++ = c1;
1624 *dst++ = c2;
1625 *dst++ = c3;
1626 *dst++ = c4;
1627 }
1628 else
1629 ENCODE_ISO_CHARACTER (c2, c3, c4);
4ed46869
KH
1630 break;
1631
1632 case EMACS_leading_code_composition:
19a8d9e0
KH
1633 ONE_MORE_BYTE (c2);
1634 if (c2 < 0xA0)
1635 {
1636 /* invalid sequence */
1637 *dst++ = c1;
1638 *dst++ = c2;
1639 }
1640 else if (c2 == 0xFF)
4ed46869
KH
1641 {
1642 coding->composing = COMPOSING_WITH_RULE_HEAD;
1643 ENCODE_COMPOSITION_WITH_RULE_START;
1644 }
1645 else
1646 {
1647 /* Rewind one byte because it is a character code of
1648 composition elements. */
1649 src--;
1650 coding->composing = COMPOSING_NO_RULE_HEAD;
1651 ENCODE_COMPOSITION_NO_RULE_START;
1652 }
1653 break;
1654
1655 case EMACS_invalid_code:
1656 *dst++ = c1;
1657 break;
1658 }
1659 continue;
1660 label_end_of_loop:
76376439
KH
1661 /* We reach here because the source data ends not at character
1662 boundary. */
1663 coding->carryover_size = src_end - src_base;
4ed46869 1664 bcopy (src_base, coding->carryover, coding->carryover_size);
76376439 1665 src = src_end;
4ed46869
KH
1666 break;
1667 }
1668
1669 /* If this is the last block of the text to be encoded, we must
bdd9fb48
KH
1670 reset graphic planes and registers to the initial state. */
1671 if (src >= src_end && coding->last_block)
4ed46869 1672 {
e0e989f6 1673 ENCODE_RESET_PLANE_AND_REGISTER;
/* Any bytes carried over from an incomplete character are copied out
   as they are, provided they fit in the rest of the destination.  */
bdd9fb48
KH
1674 if (coding->carryover_size > 0
1675 && coding->carryover_size < (dst_end - dst))
1676 {
1677 bcopy (coding->carryover, dst, coding->carryover_size);
1678 dst += coding->carryover_size;
1679 coding->carryover_size = 0;
1680 }
4ed46869
KH
1681 }
/* Report the number of source bytes used and of bytes produced.  */
1682 *consumed = src - source;
1683 return dst - destination;
1684}
1685
1686\f
1687/*** 4. SJIS and BIG5 handlers ***/
1688
f4dee582 1689/* Although SJIS and BIG5 are not ISO's coding system, they are used
4ed46869
KH
1690 quite widely. So, for the moment, Emacs supports them in the bare
1691 C code. But, in the future, they may be supported only by CCL. */
1692
1693/* SJIS is a coding system encoding three character sets: ASCII, right
1694 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
1695 as is. A character of charset katakana-jisx0201 is encoded by
1696 "position-code + 0x80". A character of charset japanese-jisx0208
1697 is encoded in 2-byte but two position-codes are divided and shifted
1698 so that it fit in the range below.
1699
1700 --- CODE RANGE of SJIS ---
1701 (character set) (range)
1702 ASCII 0x00 .. 0x7F
1703 KATAKANA-JISX0201 0xA0 .. 0xDF
1704 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1705 (2nd byte) 0x40 .. 0xFF
1706 -------------------------------
1707
1708*/
1709
1710/* BIG5 is a coding system encoding two character sets: ASCII and
1711 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1712 character set and is encoded in two-byte.
1713
1714 --- CODE RANGE of BIG5 ---
1715 (character set) (range)
1716 ASCII 0x00 .. 0x7F
1717 Big5 (1st byte) 0xA1 .. 0xFE
1718 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1719 --------------------------
1720
1721 Since the number of characters in Big5 is larger than maximum
1722 characters in Emacs' charset (96x96), it can't be handled as one
1723 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
1724 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
1725 contains frequently used characters and the latter contains less
1726 frequently used characters. */
1727
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Every macro argument is parenthesized in the expansions so that an
   argument which is itself an expression (e.g. `x & 0xFF') is
   evaluated as a unit.  B1/B2 (when decoding) and C1/C2 (when
   encoding) are evaluated more than once, so they must not have side
   effects.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Set CHARSET, C1 and C2 from the BIG5 bytes B1 and B2.  Rows whose
   first byte is below 0xC9 belong to `charset_big5_1', the others to
   `charset_big5_2'.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2) \
  do { \
    unsigned int temp \
      = (((b1) - 0xA1) * BIG5_SAME_ROW \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62)); \
    if ((b1) < 0xC9) \
      (charset) = charset_big5_1; \
    else \
      { \
        (charset) = charset_big5_2; \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW; \
      } \
    (c1) = temp / (0xFF - 0xA1) + 0x21; \
    (c2) = temp % (0xFF - 0xA1) + 0x21; \
  } while (0)

/* Set the BIG5 bytes B1 and B2 from CHARSET, C1 and C2.  This is the
   inverse of DECODE_BIG5.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2) \
  do { \
    unsigned int temp \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21); \
    if ((charset) == charset_big5_2) \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1); \
    (b1) = temp / BIG5_SAME_ROW + 0xA1; \
    (b2) = temp % BIG5_SAME_ROW; \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62; \
  } while (0)
1760
a5d301df
KH
/* Produce the internal representation of the character of CHARSET
   whose position-codes are C1 and C2.  When a unification table is in
   effect and maps the character to another one, that other character
   is produced instead (C1 and C2 are overwritten).  A resulting
   charset that is ASCII or negative is produced as ASCII.  */
#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
  do { \
    int charset_alt = (charset); \
    if (!NILP (unification_table)) \
      { \
        int c_alt = unify_char (unification_table, \
                                -1, (charset), c1, c2); \
        if (c_alt >= 0) \
          SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
      } \
    if (charset_alt != CHARSET_ASCII && charset_alt >= 0) \
      { \
        if (CHARSET_DIMENSION (charset_alt) == 1) \
          DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
        else \
          DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
      } \
    else \
      DECODE_CHARACTER_ASCII (c1); \
  } while (0)
1775
/* Write at DST the SJIS (if SJIS_P is nonzero) or BIG5 (otherwise)
   code of the character of CHARSET whose position-codes are C1 and
   C2.  When a unification table is in effect and maps the character
   to another one, that other character is encoded instead.  C1 and C2
   may be overwritten.  A character that the target coding system
   cannot express is written as its charset followed by its
   position-codes.

   The expansion has no trailing semicolon; write
   `ENCODE_SJIS_BIG5_CHARACTER (...);' so that the macro can be the
   body of an unbraced `if' that has an `else'.  */
#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
  do { \
    int c_alt, charset_alt; \
    if (!NILP (unification_table) \
        && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
            >= 0)) \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
    else \
      charset_alt = charset; \
    if (charset_alt == charset_ascii) \
      *dst++ = c1; \
    else if (CHARSET_DIMENSION (charset_alt) == 1) \
      { \
        if (sjis_p && charset_alt == charset_katakana_jisx0201) \
          *dst++ = c1; \
        else \
          *dst++ = charset_alt, *dst++ = c1; \
      } \
    else \
      { \
        /* ENCODE_SJIS and ENCODE_BIG5 take 7-bit position-codes.  */ \
        c1 &= 0x7F, c2 &= 0x7F; \
        if (sjis_p && charset_alt == charset_jisx0208) \
          { \
            unsigned char s1, s2; \
 \
            ENCODE_SJIS (c1, c2, s1, s2); \
            *dst++ = s1, *dst++ = s2; \
          } \
        else if (!sjis_p \
                 && (charset_alt == charset_big5_1 \
                     || charset_alt == charset_big5_2)) \
          { \
            unsigned char b1, b2; \
 \
            ENCODE_BIG5 (charset_alt, c1, c2, b1, b2); \
            *dst++ = b1, *dst++ = b2; \
          } \
        else \
          *dst++ = charset_alt, *dst++ = c1, *dst++ = c2; \
      } \
  } while (0)
1817
4ed46869
KH
1818/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1819 Check if a text is encoded in SJIS. If it is, return
1820 CODING_CATEGORY_MASK_SJIS, else return 0. */
1821
1822int
1823detect_coding_sjis (src, src_end)
1824 unsigned char *src, *src_end;
1825{
1826 unsigned char c;
1827
1828 while (src < src_end)
1829 {
1830 c = *src++;
1831 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1832 return 0;
1833 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1834 {
1835 if (src < src_end && *src++ < 0x40)
1836 return 0;
1837 }
1838 }
1839 return CODING_CATEGORY_MASK_SJIS;
1840}
1841
1842/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1843 Check if a text is encoded in BIG5. If it is, return
1844 CODING_CATEGORY_MASK_BIG5, else return 0. */
1845
1846int
1847detect_coding_big5 (src, src_end)
1848 unsigned char *src, *src_end;
1849{
1850 unsigned char c;
1851
1852 while (src < src_end)
1853 {
1854 c = *src++;
1855 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1856 return 0;
1857 if (c >= 0xA1)
1858 {
1859 if (src >= src_end)
1860 break;
1861 c = *src++;
1862 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1863 return 0;
1864 }
1865 }
1866 return CODING_CATEGORY_MASK_BIG5;
1867}
1868
1869/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1870 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
1871
1872int
1873decode_coding_sjis_big5 (coding, source, destination,
1874 src_bytes, dst_bytes, consumed, sjis_p)
1875 struct coding_system *coding;
1876 unsigned char *source, *destination;
1877 int src_bytes, dst_bytes;
1878 int *consumed;
1879 int sjis_p;
1880{
1881 unsigned char *src = source;
1882 unsigned char *src_end = source + src_bytes;
1883 unsigned char *dst = destination;
1884 unsigned char *dst_end = destination + dst_bytes;
1885 /* Since the maximum bytes produced by each loop is 4, we subtract 3
1886 from DST_END to assure overflow checking is necessary only at the
1887 head of loop. */
1888 unsigned char *adjusted_dst_end = dst_end - 3;
a5d301df
KH
1889 Lisp_Object unification_table
1890 = coding->character_unification_table_for_decode;
1891
1892 if (!NILP (Venable_character_unification) && NILP (unification_table))
1893 unification_table = Vstandard_character_unification_table_for_decode;
4ed46869
KH
1894
1895 while (src < src_end && dst < adjusted_dst_end)
1896 {
1897 /* SRC_BASE remembers the start position in source in each loop.
1898 The loop will be exited when there's not enough source text
1899 to analyze two-byte character (within macro ONE_MORE_BYTE).
1900 In that case, SRC is reset to SRC_BASE before exiting. */
1901 unsigned char *src_base = src;
1902 unsigned char c1 = *src++, c2, c3, c4;
1903
1904 if (c1 == '\r')
1905 {
1906 if (coding->eol_type == CODING_EOL_CRLF)
1907 {
1908 ONE_MORE_BYTE (c2);
1909 if (c2 == '\n')
1910 *dst++ = c2;
1911 else
1912 /* To process C2 again, SRC is subtracted by 1. */
1913 *dst++ = c1, src--;
1914 }
1915 else
1916 *dst++ = c1;
1917 }
a5d301df 1918 else if (c1 < 0x20)
4ed46869 1919 *dst++ = c1;
a5d301df
KH
1920 else if (c1 < 0x80)
1921 DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
4ed46869
KH
1922 else if (c1 < 0xA0 || c1 >= 0xE0)
1923 {
1924 /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1925 if (sjis_p)
1926 {
1927 ONE_MORE_BYTE (c2);
1928 DECODE_SJIS (c1, c2, c3, c4);
a5d301df 1929 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
4ed46869
KH
1930 }
1931 else if (c1 >= 0xE0 && c1 < 0xFF)
1932 {
1933 int charset;
1934
1935 ONE_MORE_BYTE (c2);
1936 DECODE_BIG5 (c1, c2, charset, c3, c4);
a5d301df 1937 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
4ed46869
KH
1938 }
1939 else /* Invalid code */
1940 *dst++ = c1;
1941 }
1942 else
1943 {
1944 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1945 if (sjis_p)
a5d301df 1946 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1, /* dummy */ c2);
4ed46869
KH
1947 else
1948 {
1949 int charset;
1950
1951 ONE_MORE_BYTE (c2);
1952 DECODE_BIG5 (c1, c2, charset, c3, c4);
a5d301df 1953 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
4ed46869
KH
1954 }
1955 }
1956 continue;
1957
1958 label_end_of_loop:
1959 coding->carryover_size = src - src_base;
1960 bcopy (src_base, coding->carryover, coding->carryover_size);
1961 src = src_base;
1962 break;
1963 }
1964
1965 *consumed = src - source;
1966 return dst - destination;
1967}
1968
1969/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1970 This function can encode `charset_ascii', `charset_katakana_jisx0201',
1971 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
1972 sure that all these charsets are registered as official charset
1973 (i.e. do not have extended leading-codes). Characters of other
1974 charsets are produced without any encoding. If SJIS_P is 1, encode
1975 SJIS text, else encode BIG5 text. */
1976
1977int
1978encode_coding_sjis_big5 (coding, source, destination,
1979 src_bytes, dst_bytes, consumed, sjis_p)
1980 struct coding_system *coding;
1981 unsigned char *source, *destination;
1982 int src_bytes, dst_bytes;
1983 int *consumed;
1984 int sjis_p;
1985{
1986 unsigned char *src = source;
1987 unsigned char *src_end = source + src_bytes;
1988 unsigned char *dst = destination;
1989 unsigned char *dst_end = destination + dst_bytes;
1990 /* Since the maximum bytes produced by each loop is 2, we subtract 1
1991 from DST_END to assure overflow checking is necessary only at the
1992 head of loop. */
1993 unsigned char *adjusted_dst_end = dst_end - 1;
a5d301df
KH
1994 Lisp_Object unification_table
1995 = coding->character_unification_table_for_encode;
1996
1997 if (!NILP (Venable_character_unification) && NILP (unification_table))
1998 unification_table = Vstandard_character_unification_table_for_encode;
4ed46869
KH
1999
2000 while (src < src_end && dst < adjusted_dst_end)
2001 {
2002 /* SRC_BASE remembers the start position in source in each loop.
2003 The loop will be exited when there's not enough source text
2004 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2005 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
2006 before exiting. */
2007 unsigned char *src_base = src;
2008 unsigned char c1 = *src++, c2, c3, c4;
2009
2010 if (coding->composing)
2011 {
2012 if (c1 == 0xA0)
2013 {
2014 ONE_MORE_BYTE (c1);
2015 c1 &= 0x7F;
2016 }
2017 else if (c1 >= 0xA0)
2018 c1 -= 0x20;
2019 else
2020 coding->composing = 0;
2021 }
2022
2023 switch (emacs_code_class[c1])
2024 {
2025 case EMACS_ascii_code:
a5d301df
KH
2026 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2027 break;
2028
4ed46869
KH
2029 case EMACS_control_code:
2030 *dst++ = c1;
2031 break;
2032
2033 case EMACS_carriage_return_code:
2034 if (!coding->selective)
2035 {
2036 *dst++ = c1;
2037 break;
2038 }
2039 /* fall down to treat '\r' as '\n' ... */
2040
2041 case EMACS_linefeed_code:
2042 if (coding->eol_type == CODING_EOL_LF
0ef69138 2043 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2044 *dst++ = '\n';
2045 else if (coding->eol_type == CODING_EOL_CRLF)
2046 *dst++ = '\r', *dst++ = '\n';
2047 else
2048 *dst++ = '\r';
2049 break;
2050
2051 case EMACS_leading_code_2:
2052 ONE_MORE_BYTE (c2);
a5d301df 2053 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
2054 break;
2055
2056 case EMACS_leading_code_3:
2057 TWO_MORE_BYTES (c2, c3);
a5d301df 2058 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
4ed46869
KH
2059 break;
2060
2061 case EMACS_leading_code_4:
2062 THREE_MORE_BYTES (c2, c3, c4);
a5d301df 2063 ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
4ed46869
KH
2064 break;
2065
2066 case EMACS_leading_code_composition:
2067 coding->composing = 1;
2068 break;
2069
2070 default: /* i.e. case EMACS_invalid_code: */
2071 *dst++ = c1;
2072 }
2073 continue;
2074
2075 label_end_of_loop:
76376439 2076 coding->carryover_size = src_end - src_base;
4ed46869 2077 bcopy (src_base, coding->carryover, coding->carryover_size);
76376439 2078 src = src_end;
4ed46869
KH
2079 break;
2080 }
2081
2082 *consumed = src - source;
2083 return dst - destination;
2084}
2085
2086\f
2087/*** 5. End-of-line handlers ***/
2088
2089/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2090 This function is called only when `coding->eol_type' is
2091 CODING_EOL_CRLF or CODING_EOL_CR. */
2092
2093decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2094 struct coding_system *coding;
2095 unsigned char *source, *destination;
2096 int src_bytes, dst_bytes;
2097 int *consumed;
2098{
2099 unsigned char *src = source;
2100 unsigned char *src_end = source + src_bytes;
2101 unsigned char *dst = destination;
2102 unsigned char *dst_end = destination + dst_bytes;
2103 int produced;
2104
2105 switch (coding->eol_type)
2106 {
2107 case CODING_EOL_CRLF:
2108 {
2109 /* Since the maximum bytes produced by each loop is 2, we
2110 subtract 1 from DST_END to assure overflow checking is
2111 necessary only at the head of loop. */
2112 unsigned char *adjusted_dst_end = dst_end - 1;
2113
2114 while (src < src_end && dst < adjusted_dst_end)
2115 {
2116 unsigned char *src_base = src;
2117 unsigned char c = *src++;
2118 if (c == '\r')
2119 {
2120 ONE_MORE_BYTE (c);
2121 if (c != '\n')
2122 *dst++ = '\r';
bfd99048 2123 *dst++ = c;
4ed46869
KH
2124 }
2125 else
2126 *dst++ = c;
2127 continue;
2128
2129 label_end_of_loop:
2130 coding->carryover_size = src - src_base;
2131 bcopy (src_base, coding->carryover, coding->carryover_size);
2132 src = src_base;
2133 break;
2134 }
2135 *consumed = src - source;
2136 produced = dst - destination;
2137 break;
2138 }
2139
2140 case CODING_EOL_CR:
2141 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2142 bcopy (source, destination, produced);
2143 dst_end = destination + produced;
2144 while (dst < dst_end)
2145 if (*dst++ == '\r') dst[-1] = '\n';
2146 *consumed = produced;
2147 break;
2148
2149 default: /* i.e. case: CODING_EOL_LF */
2150 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2151 bcopy (source, destination, produced);
2152 *consumed = produced;
2153 break;
2154 }
2155
2156 return produced;
2157}
2158
2159/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2160 format of end-of-line according to `coding->eol_type'. If
2161 `coding->selective' is 1, code '\r' in source text also means
2162 end-of-line. */
2163
2164encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2165 struct coding_system *coding;
2166 unsigned char *source, *destination;
2167 int src_bytes, dst_bytes;
2168 int *consumed;
2169{
2170 unsigned char *src = source;
2171 unsigned char *dst = destination;
2172 int produced;
2173
2174 if (src_bytes <= 0)
2175 return 0;
2176
2177 switch (coding->eol_type)
2178 {
2179 case CODING_EOL_LF:
0ef69138 2180 case CODING_EOL_UNDECIDED:
4ed46869
KH
2181 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2182 bcopy (source, destination, produced);
2183 if (coding->selective)
2184 {
2185 int i = produced;
2186 while (i--)
2187 if (*dst++ == '\r') dst[-1] = '\n';
2188 }
2189 *consumed = produced;
2190
2191 case CODING_EOL_CRLF:
2192 {
2193 unsigned char c;
2194 unsigned char *src_end = source + src_bytes;
2195 unsigned char *dst_end = destination + dst_bytes;
2196 /* Since the maximum bytes produced by each loop is 2, we
2197 subtract 1 from DST_END to assure overflow checking is
2198 necessary only at the head of loop. */
2199 unsigned char *adjusted_dst_end = dst_end - 1;
2200
2201 while (src < src_end && dst < adjusted_dst_end)
2202 {
2203 c = *src++;
2204 if (c == '\n' || (c == '\r' && coding->selective))
2205 *dst++ = '\r', *dst++ = '\n';
2206 else
2207 *dst++ = c;
2208 }
2209 produced = dst - destination;
2210 *consumed = src - source;
2211 break;
2212 }
2213
2214 default: /* i.e. case CODING_EOL_CR: */
2215 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2216 bcopy (source, destination, produced);
2217 {
2218 int i = produced;
2219 while (i--)
2220 if (*dst++ == '\n') dst[-1] = '\r';
2221 }
2222 *consumed = produced;
2223 }
2224
2225 return produced;
2226}
2227
2228\f
2229/*** 6. C library functions ***/
2230
2231/* In Emacs Lisp, coding system is represented by a Lisp symbol which
2232 has a property `coding-system'. The value of this property is a
2233 vector of length 5 (called as coding-vector). Among elements of
2234 this vector, the first (element[0]) and the fifth (element[4])
2235 carry important information for decoding/encoding. Before
2236 decoding/encoding, this information should be set in fields of a
2237 structure of type `coding_system'.
2238
2239 A value of property `coding-system' can be a symbol of another
2240 subsidiary coding-system. In that case, Emacs gets coding-vector
2241 from that symbol.
2242
2243 `element[0]' contains information to be set in `coding->type'. The
2244 value and its meaning is as follows:
2245
0ef69138
KH
2246 0 -- coding_type_emacs_mule
2247 1 -- coding_type_sjis
2248 2 -- coding_type_iso2022
2249 3 -- coding_type_big5
2250 4 -- coding_type_ccl encoder/decoder written in CCL
2251 nil -- coding_type_no_conversion
2252 t -- coding_type_undecided (automatic conversion on decoding,
2253 no-conversion on encoding)
4ed46869
KH
2254
2255 `element[4]' contains information to be set in `coding->flags' and
2256 `coding->spec'. The meaning varies by `coding->type'.
2257
2258 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2259 of length 32 (of which the first 17 sub-elements are used now).
2260 Meanings of these sub-elements are:
2261
2262 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2263 If the value is an integer of valid charset, the charset is
2264 assumed to be designated to graphic register N initially.
2265
2266 If the value is minus, it is a minus value of charset which
2267 reserves graphic register N, which means that the charset is
2268 not designated initially but should be designated to graphic
2269 register N just before encoding a character in that charset.
2270
2271 If the value is nil, graphic register N is never used on
2272 encoding.
2273
2274 sub-element[N] where N is 4 through 16: to be set in `coding->flags'
2275 Each value takes t or nil. See the section ISO2022 of
2276 `coding.h' for more information.
2277
2278 If `coding->type' is `coding_type_big5', element[4] is t to denote
2279 BIG5-ETen or nil to denote BIG5-HKU.
2280
2281 If `coding->type' takes the other value, element[4] is ignored.
2282
2283 Emacs Lisp's coding system also carries information about format of
2284 end-of-line in a value of property `eol-type'. If the value is
2285 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2286 means CODING_EOL_CR. If it is not integer, it should be a vector
2287 of subsidiary coding systems of which property `eol-type' has one
2288 of above values.
2289
2290*/
2291
2292/* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2293 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2294 is setup so that no conversion is necessary and return -1, else
2295 return 0. */
2296
2297int
e0e989f6
KH
2298setup_coding_system (coding_system, coding)
2299 Lisp_Object coding_system;
4ed46869
KH
2300 struct coding_system *coding;
2301{
4ed46869
KH
2302 Lisp_Object type, eol_type;
2303
f4dee582 2304 /* At first, set several fields to default values. */
4ed46869
KH
2305 coding->require_flushing = 0;
2306 coding->last_block = 0;
2307 coding->selective = 0;
2308 coding->composing = 0;
2309 coding->direction = 0;
2310 coding->carryover_size = 0;
4ed46869 2311 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
a5d301df
KH
2312 coding->character_unification_table_for_decode = Qnil;
2313 coding->character_unification_table_for_encode = Qnil;
4ed46869 2314
e0e989f6
KH
2315 Vlast_coding_system_used = coding->symbol = coding_system;
2316 eol_type = Qnil;
2317 /* Get value of property `coding-system' until we get a vector.
2318 While doing that, also get values of properties
a5d301df
KH
2319 `post-read-conversion', `pre-write-conversion',
2320 `character-unification-table-for-decode',
2321 `character-unification-table-for-encode' and `eol-type'. */
e0e989f6 2322 while (!NILP (coding_system) && SYMBOLP (coding_system))
4ed46869 2323 {
4ed46869 2324 if (NILP (coding->post_read_conversion))
e0e989f6 2325 coding->post_read_conversion = Fget (coding_system,
4ed46869 2326 Qpost_read_conversion);
e0e989f6
KH
2327 if (NILP (coding->pre_write_conversion))
2328 coding->pre_write_conversion = Fget (coding_system,
4ed46869 2329 Qpre_write_conversion);
9ce27fde 2330 if (!inhibit_eol_conversion && NILP (eol_type))
e0e989f6 2331 eol_type = Fget (coding_system, Qeol_type);
a5d301df
KH
2332
2333 if (NILP (coding->character_unification_table_for_decode))
2334 coding->character_unification_table_for_decode
2335 = Fget (coding_system, Qcharacter_unification_table_for_decode);
2336
2337 if (NILP (coding->character_unification_table_for_encode))
2338 coding->character_unification_table_for_encode
2339 = Fget (coding_system, Qcharacter_unification_table_for_encode);
2340
e0e989f6 2341 coding_system = Fget (coding_system, Qcoding_system);
4ed46869 2342 }
a5d301df
KH
2343
2344 while (!NILP (coding->character_unification_table_for_decode)
2345 && SYMBOLP (coding->character_unification_table_for_decode))
2346 coding->character_unification_table_for_decode
2347 = Fget (coding->character_unification_table_for_decode,
2348 Qcharacter_unification_table_for_decode);
2349 if (!NILP (coding->character_unification_table_for_decode)
2350 && !CHAR_TABLE_P (coding->character_unification_table_for_decode))
2351 coding->character_unification_table_for_decode = Qnil;
2352
2353 while (!NILP (coding->character_unification_table_for_encode)
2354 && SYMBOLP (coding->character_unification_table_for_encode))
2355 coding->character_unification_table_for_encode
2356 = Fget (coding->character_unification_table_for_encode,
2357 Qcharacter_unification_table_for_encode);
2358 if (!NILP (coding->character_unification_table_for_encode)
2359 && !CHAR_TABLE_P (coding->character_unification_table_for_encode))
2360 coding->character_unification_table_for_encode = Qnil;
2361
e0e989f6
KH
2362 if (!VECTORP (coding_system)
2363 || XVECTOR (coding_system)->size != 5)
4ed46869
KH
2364 goto label_invalid_coding_system;
2365
4ed46869 2366 if (VECTORP (eol_type))
0ef69138 2367 coding->eol_type = CODING_EOL_UNDECIDED;
4ed46869
KH
2368 else if (XFASTINT (eol_type) == 1)
2369 coding->eol_type = CODING_EOL_CRLF;
2370 else if (XFASTINT (eol_type) == 2)
2371 coding->eol_type = CODING_EOL_CR;
2372 else
2373 coding->eol_type = CODING_EOL_LF;
2374
e0e989f6 2375 type = XVECTOR (coding_system)->contents[0];
4ed46869
KH
2376 switch (XFASTINT (type))
2377 {
2378 case 0:
0ef69138 2379 coding->type = coding_type_emacs_mule;
4ed46869
KH
2380 break;
2381
2382 case 1:
2383 coding->type = coding_type_sjis;
2384 break;
2385
2386 case 2:
2387 coding->type = coding_type_iso2022;
2388 {
f44d27ce 2389 Lisp_Object val;
4ed46869
KH
2390 Lisp_Object *flags;
2391 int i, charset, default_reg_bits = 0;
2392
f44d27ce
RS
2393 val = XVECTOR (coding_system)->contents[4];
2394
4ed46869
KH
2395 if (!VECTORP (val) || XVECTOR (val)->size != 32)
2396 goto label_invalid_coding_system;
2397
2398 flags = XVECTOR (val)->contents;
2399 coding->flags
2400 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2401 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2402 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2403 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2404 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2405 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2406 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2407 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
e0e989f6
KH
2408 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2409 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
c4825358
KH
2410 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2411 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3f003981 2412 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
c4825358 2413 );
4ed46869
KH
2414
2415 /* Invoke graphic register 0 to plane 0. */
2416 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2417 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2418 CODING_SPEC_ISO_INVOCATION (coding, 1)
2419 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2420 /* Not single shifting at first. */
6e85d753 2421 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
e0e989f6 2422 /* Beginning of buffer should also be regarded as bol. */
6e85d753 2423 CODING_SPEC_ISO_BOL (coding) = 1;
4ed46869
KH
2424
2425 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2426 FLAGS[REG] can be one of below:
2427 integer CHARSET: CHARSET occupies register I,
2428 t: designate nothing to REG initially, but can be used
2429 by any charsets,
2430 list of integer, nil, or t: designate the first
2431 element (if integer) to REG initially, the remaining
2432 elements (if integer) is designated to REG on request,
2433 if an element is t, REG can be used by any charset,
2434 nil: REG is never used. */
467e7675 2435 for (charset = 0; charset <= MAX_CHARSET; charset++)
1ba9e4ab
KH
2436 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2437 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
6e85d753 2438 bzero (CODING_SPEC_ISO_EXPECTED_CHARSETS (coding), MAX_CHARSET + 1);
4ed46869
KH
2439 for (i = 0; i < 4; i++)
2440 {
2441 if (INTEGERP (flags[i])
e0e989f6
KH
2442 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2443 || (charset = get_charset_id (flags[i])) >= 0)
4ed46869
KH
2444 {
2445 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2446 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
6e85d753 2447 CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset] = 1;
4ed46869
KH
2448 }
2449 else if (EQ (flags[i], Qt))
2450 {
2451 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2452 default_reg_bits |= 1 << i;
2453 }
2454 else if (CONSP (flags[i]))
2455 {
2456 Lisp_Object tail = flags[i];
2457
2458 if (INTEGERP (XCONS (tail)->car)
2459 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
2460 CHARSET_VALID_P (charset))
2461 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
4ed46869
KH
2462 {
2463 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2464 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
6e85d753 2465 CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset] = 1;
4ed46869
KH
2466 }
2467 else
2468 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2469 tail = XCONS (tail)->cdr;
2470 while (CONSP (tail))
2471 {
2472 if (INTEGERP (XCONS (tail)->car)
2473 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
2474 CHARSET_VALID_P (charset))
2475 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
6e85d753
KH
2476 {
2477 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2478 = i;
2479 CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset]
2480 = 1;
2481 }
4ed46869
KH
2482 else if (EQ (XCONS (tail)->car, Qt))
2483 default_reg_bits |= 1 << i;
2484 tail = XCONS (tail)->cdr;
2485 }
2486 }
2487 else
2488 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2489
2490 CODING_SPEC_ISO_DESIGNATION (coding, i)
2491 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2492 }
2493
2494 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2495 {
2496 /* REG 1 can be used only by locking shift in 7-bit env. */
2497 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2498 default_reg_bits &= ~2;
2499 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2500 /* Without any shifting, only REG 0 and 1 can be used. */
2501 default_reg_bits &= 3;
2502 }
2503
6e85d753
KH
2504 for (charset = 0; charset <= MAX_CHARSET; charset++)
2505 if (CHARSET_VALID_P (charset)
2506 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2507 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
2508 {
2509 /* We have not yet decided where to designate CHARSET. */
2510 int reg_bits = default_reg_bits;
2511
2512 if (CHARSET_CHARS (charset) == 96)
2513 /* A charset of CHARS96 can't be designated to REG 0. */
2514 reg_bits &= ~1;
2515
2516 if (reg_bits)
2517 /* There exist some default graphic register. */
2518 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2519 = (reg_bits & 1
2520 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2521 else
2522 /* We anyway have to designate CHARSET to somewhere. */
2523 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2524 = (CHARSET_CHARS (charset) == 94
2525 ? 0
2526 : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2527 || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2528 ? 1
2529 : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2530 ? 2 : 0)));
2531 }
4ed46869
KH
2532 }
2533 coding->require_flushing = 1;
2534 break;
2535
2536 case 3:
2537 coding->type = coding_type_big5;
2538 coding->flags
e0e989f6 2539 = (NILP (XVECTOR (coding_system)->contents[4])
4ed46869
KH
2540 ? CODING_FLAG_BIG5_HKU
2541 : CODING_FLAG_BIG5_ETEN);
2542 break;
2543
2544 case 4:
2545 coding->type = coding_type_ccl;
2546 {
e0e989f6 2547 Lisp_Object val = XVECTOR (coding_system)->contents[4];
4ed46869
KH
2548 if (CONSP (val)
2549 && VECTORP (XCONS (val)->car)
2550 && VECTORP (XCONS (val)->cdr))
2551 {
2552 setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2553 setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2554 }
2555 else
2556 goto label_invalid_coding_system;
2557 }
2558 coding->require_flushing = 1;
2559 break;
2560
27901516
KH
2561 case 5:
2562 coding->type = coding_type_raw_text;
2563 break;
2564
4ed46869
KH
2565 default:
2566 if (EQ (type, Qt))
0ef69138 2567 coding->type = coding_type_undecided;
4ed46869
KH
2568 else
2569 coding->type = coding_type_no_conversion;
2570 break;
2571 }
2572 return 0;
2573
2574 label_invalid_coding_system:
2575 coding->type = coding_type_no_conversion;
dec137e5 2576 coding->eol_type = CODING_EOL_LF;
e0e989f6
KH
2577 coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2578 = Qnil;
4ed46869
KH
2579 return -1;
2580}
2581
2582/* Emacs has a mechanism to automatically detect a coding system if it
2583 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
2584 it's impossible to distinguish some coding systems accurately
2585 because they use the same range of codes. So, at first, coding
2586 systems are categorized as follows:
2587
0ef69138 2588 o coding-category-emacs-mule
4ed46869
KH
2589
2590 The category for a coding system which has the same code range
2591 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 2592 symbol) `emacs-mule' by default.
4ed46869
KH
2593
2594 o coding-category-sjis
2595
2596 The category for a coding system which has the same code range
2597 as SJIS. Assigned the coding-system (Lisp
7717c392 2598 symbol) `japanese-shift-jis' by default.
4ed46869
KH
2599
2600 o coding-category-iso-7
2601
2602 The category for a coding system which has the same code range
7717c392
KH
2603 as ISO2022 of 7-bit environment. This doesn't use any locking
2604 shift and single shift functions. Assigned the coding-system
2605 (Lisp symbol) `iso-2022-7bit' by default.
4ed46869
KH
2606
2607 o coding-category-iso-8-1
2608
2609 The category for a coding system which has the same code range
2610 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
2611 for DIMENSION1 charset. This doesn't use any locking shift
2612 and single shift functions. Assigned the coding-system (Lisp
2613 symbol) `iso-latin-1' by default.
4ed46869
KH
2614
2615 o coding-category-iso-8-2
2616
2617 The category for a coding system which has the same code range
2618 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
2619 for DIMENSION2 charset. This doesn't use any locking shift
2620 and single shift functions. Assigned the coding-system (Lisp
2621 symbol) `japanese-iso-8bit' by default.
4ed46869 2622
7717c392 2623 o coding-category-iso-7-else
4ed46869
KH
2624
2625 The category for a coding system which has the same code range
7717c392
KH
2626 as ISO2022 of 7-bit environment but uses locking shift or
2627 single shift functions. Assigned the coding-system (Lisp
2628 symbol) `iso-2022-7bit-lock' by default.
2629
2630 o coding-category-iso-8-else
2631
2632 The category for a coding system which has the same code range
2633 as ISO2022 of 8-bit environment but uses locking shift or
2634 single shift functions. Assigned the coding-system (Lisp
2635 symbol) `iso-2022-8bit-ss2' by default.
4ed46869
KH
2636
2637 o coding-category-big5
2638
2639 The category for a coding system which has the same code range
2640 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 2641 `cn-big5' by default.
4ed46869
KH
2642
2643 o coding-category-binary
2644
2645 The category for a coding system not categorized in any of the
2646 above. Assigned the coding-system (Lisp symbol)
e0e989f6 2647 `no-conversion' by default.
4ed46869
KH
2648
2649 Each of them is a Lisp symbol and the value is an actual
2650 `coding-system's (this is also a Lisp symbol) assigned by a user.
2651 What Emacs does actually is to detect a category of coding system.
2652 Then, it uses a `coding-system' assigned to it. If Emacs can't
2653 decide only one possible category, it selects a category of the
2654 highest priority. Priorities of categories are also specified by a
2655 user in a Lisp variable `coding-category-list'.
2656
2657*/
2658
2659/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2660 If it detects possible coding systems, return an integer in which
2661 appropriate flag bits are set. Flag bits are defined by macros
2662 CODING_CATEGORY_MASK_XXX in `coding.h'. */
2663
2664int
2665detect_coding_mask (src, src_bytes)
2666 unsigned char *src;
2667 int src_bytes;
2668{
2669 register unsigned char c;
2670 unsigned char *src_end = src + src_bytes;
2671 int mask;
2672
2673 /* At first, skip all ASCII characters and control characters except
2674 for three ISO2022 specific control characters. */
bcf26d6a 2675 label_loop_detect_coding:
4ed46869
KH
2676 while (src < src_end)
2677 {
2678 c = *src;
2679 if (c >= 0x80
2680 || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2681 break;
2682 src++;
2683 }
2684
2685 if (src >= src_end)
2686 /* We found nothing other than ASCII. There's nothing to do. */
2687 return CODING_CATEGORY_MASK_ANY;
2688
2689 /* The text seems to be encoded in some multilingual coding system.
2690 Now, try to find in which coding system the text is encoded. */
2691 if (c < 0x80)
bcf26d6a
KH
2692 {
2693 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2694 /* C is an ISO2022 specific control code of C0. */
2695 mask = detect_coding_iso2022 (src, src_end);
2696 src++;
1b2af4b0 2697 if (mask == 0)
bcf26d6a
KH
2698 /* No valid ISO2022 code follows C. Try again. */
2699 goto label_loop_detect_coding;
5d648571 2700 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
bcf26d6a 2701 }
4ed46869 2702 else if (c < 0xA0)
c4825358 2703 {
3f003981 2704 /* If C is a special latin extra code,
c4825358
KH
2705 or is an ISO2022 specific control code of C1 (SS2 or SS3),
2706 or is an ISO2022 control-sequence-introducer (CSI),
27901516 2707 we should also consider the possibility of ISO2022 codings. */
3f003981
KH
2708 if ((VECTORP (Vlatin_extra_code_table)
2709 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
c4825358
KH
2710 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
2711 || (c == ISO_CODE_CSI
2712 && (src < src_end
2713 && (*src == ']'
2714 || (src + 1 < src_end
2715 && src[1] == ']'
2716 && (*src == '0' || *src == '1' || *src == '2'))))))
2717 mask = (detect_coding_iso2022 (src, src_end)
2718 | detect_coding_sjis (src, src_end)
2719 | detect_coding_emacs_mule (src, src_end)
27901516 2720 | CODING_CATEGORY_MASK_RAW_TEXT);
4ed46869 2721
c4825358 2722 else
27901516
KH
2723 /* C is the first byte of SJIS character code,
2724 or a leading-code of Emacs' internal format (emacs-mule). */
c4825358
KH
2725 mask = (detect_coding_sjis (src, src_end)
2726 | detect_coding_emacs_mule (src, src_end)
27901516 2727 | CODING_CATEGORY_MASK_RAW_TEXT);
c4825358 2728 }
4ed46869
KH
2729 else
2730 /* C is a character of ISO2022 in graphic plane right,
2731 or a SJIS's 1-byte character code (i.e. JISX0201),
2732 or the first byte of BIG5's 2-byte code. */
2733 mask = (detect_coding_iso2022 (src, src_end)
2734 | detect_coding_sjis (src, src_end)
10bff6f1 2735 | detect_coding_big5 (src, src_end)
27901516 2736 | CODING_CATEGORY_MASK_RAW_TEXT);
4ed46869
KH
2737
2738 return mask;
2739}
2740
2741/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2742 The information of the detected coding system is set in CODING. */
2743
2744void
2745detect_coding (coding, src, src_bytes)
2746 struct coding_system *coding;
2747 unsigned char *src;
2748 int src_bytes;
2749{
2750 int mask = detect_coding_mask (src, src_bytes);
2751 int idx;
27901516 2752 Lisp_Object val = Vcoding_category_list;
4ed46869
KH
2753
2754 if (mask == CODING_CATEGORY_MASK_ANY)
2755 /* We found nothing other than ASCII. There's nothing to do. */
2756 return;
2757
27901516
KH
2758 /* We found some plausible coding systems. Let's use a coding
2759 system of the highest priority. */
4ed46869 2760
27901516
KH
2761 if (CONSP (val))
2762 while (!NILP (val))
2763 {
2764 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2765 if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2766 break;
2767 val = XCONS (val)->cdr;
2768 }
2769 else
2770 val = Qnil;
4ed46869 2771
27901516
KH
2772 if (NILP (val))
2773 {
2774 /* For unknown reason, `Vcoding_category_list' contains none of
2775 found categories. Let's use any of them. */
2776 for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2777 if (mask & (1 << idx))
2778 break;
4ed46869
KH
2779 }
2780 setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2781}
2782
2783/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2784 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
0ef69138 2785 CODING_EOL_CR, and CODING_EOL_UNDECIDED. */
4ed46869 2786
bc4bc72a
RS
2787#define MAX_EOL_CHECK_COUNT 3
2788
4ed46869
KH
2789int
2790detect_eol_type (src, src_bytes)
2791 unsigned char *src;
2792 int src_bytes;
2793{
2794 unsigned char *src_end = src + src_bytes;
2795 unsigned char c;
bc4bc72a
RS
2796 int total = 0; /* How many end-of-lines are found so far. */
2797 int eol_type = CODING_EOL_UNDECIDED;
2798 int this_eol_type;
4ed46869 2799
bc4bc72a 2800 while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4ed46869
KH
2801 {
2802 c = *src++;
bc4bc72a 2803 if (c == '\n' || c == '\r')
4ed46869 2804 {
bc4bc72a
RS
2805 total++;
2806 if (c == '\n')
2807 this_eol_type = CODING_EOL_LF;
2808 else if (src >= src_end || *src != '\n')
2809 this_eol_type = CODING_EOL_CR;
4ed46869 2810 else
bc4bc72a
RS
2811 this_eol_type = CODING_EOL_CRLF, src++;
2812
2813 if (eol_type == CODING_EOL_UNDECIDED)
2814 /* This is the first end-of-line. */
2815 eol_type = this_eol_type;
2816 else if (eol_type != this_eol_type)
2817 /* The found type is different from what found before.
27901516
KH
2818 Let's notice the caller about this inconsistency. */
2819 return CODING_EOL_INCONSISTENT;
4ed46869
KH
2820 }
2821 }
bc4bc72a 2822
85a02ca4 2823 return eol_type;
4ed46869
KH
2824}
2825
2826/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2827 is encoded. If it detects an appropriate format of end-of-line, it
2828 sets the information in *CODING. */
2829
2830void
2831detect_eol (coding, src, src_bytes)
2832 struct coding_system *coding;
2833 unsigned char *src;
2834 int src_bytes;
2835{
fb3903d3 2836 Lisp_Object val, coding_system;
4ed46869
KH
2837 int eol_type = detect_eol_type (src, src_bytes);
2838
0ef69138 2839 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2840 /* We found no end-of-line in the source text. */
2841 return;
2842
27901516
KH
2843 if (eol_type == CODING_EOL_INCONSISTENT)
2844 {
2845#if 0
2846 /* This code is suppressed until we find a better way to
992f23f2 2847 distinguish raw text file and binary file. */
27901516
KH
2848
2849 /* If we have already detected that the coding is raw-text, the
2850 coding should actually be no-conversion. */
2851 if (coding->type == coding_type_raw_text)
2852 {
2853 setup_coding_system (Qno_conversion, coding);
2854 return;
2855 }
2856 /* Else, let's decode only text code anyway. */
2857#endif /* 0 */
1b2af4b0 2858 eol_type = CODING_EOL_LF;
27901516
KH
2859 }
2860
fb3903d3
KH
2861 coding_system = coding->symbol;
2862 while (!NILP (coding_system)
2863 && NILP (val = Fget (coding_system, Qeol_type)))
2864 coding_system = Fget (coding_system, Qcoding_system);
4ed46869
KH
2865 if (VECTORP (val) && XVECTOR (val)->size == 3)
2866 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2867}
2868
2869/* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
2870 decoding, it may detect coding system and format of end-of-line if
2871 those are not yet decided. */
2872
2873int
2874decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2875 struct coding_system *coding;
2876 unsigned char *source, *destination;
2877 int src_bytes, dst_bytes;
2878 int *consumed;
2879{
2880 int produced;
2881
2882 if (src_bytes <= 0)
2883 {
2884 *consumed = 0;
2885 return 0;
2886 }
2887
0ef69138 2888 if (coding->type == coding_type_undecided)
4ed46869
KH
2889 detect_coding (coding, source, src_bytes);
2890
0ef69138 2891 if (coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2892 detect_eol (coding, source, src_bytes);
2893
2894 coding->carryover_size = 0;
2895 switch (coding->type)
2896 {
2897 case coding_type_no_conversion:
2898 label_no_conversion:
2899 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2900 bcopy (source, destination, produced);
2901 *consumed = produced;
2902 break;
2903
0ef69138
KH
2904 case coding_type_emacs_mule:
2905 case coding_type_undecided:
27901516 2906 case coding_type_raw_text:
4ed46869 2907 if (coding->eol_type == CODING_EOL_LF
0ef69138 2908 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2909 goto label_no_conversion;
2910 produced = decode_eol (coding, source, destination,
2911 src_bytes, dst_bytes, consumed);
2912 break;
2913
2914 case coding_type_sjis:
2915 produced = decode_coding_sjis_big5 (coding, source, destination,
2916 src_bytes, dst_bytes, consumed,
2917 1);
2918 break;
2919
2920 case coding_type_iso2022:
2921 produced = decode_coding_iso2022 (coding, source, destination,
2922 src_bytes, dst_bytes, consumed);
2923 break;
2924
2925 case coding_type_big5:
2926 produced = decode_coding_sjis_big5 (coding, source, destination,
2927 src_bytes, dst_bytes, consumed,
2928 0);
2929 break;
2930
2931 case coding_type_ccl:
2932 produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2933 src_bytes, dst_bytes, consumed);
2934 break;
2935 }
2936
2937 return produced;
2938}
2939
2940/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
2941
2942int
2943encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2944 struct coding_system *coding;
2945 unsigned char *source, *destination;
2946 int src_bytes, dst_bytes;
2947 int *consumed;
2948{
2949 int produced;
2950
4ed46869
KH
2951 switch (coding->type)
2952 {
2953 case coding_type_no_conversion:
2954 label_no_conversion:
2955 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2956 if (produced > 0)
2957 {
2958 bcopy (source, destination, produced);
2959 if (coding->selective)
2960 {
2961 unsigned char *p = destination, *pend = destination + produced;
2962 while (p < pend)
e0e989f6 2963 if (*p++ == '\015') p[-1] = '\n';
4ed46869
KH
2964 }
2965 }
2966 *consumed = produced;
2967 break;
2968
0ef69138
KH
2969 case coding_type_emacs_mule:
2970 case coding_type_undecided:
27901516 2971 case coding_type_raw_text:
4ed46869 2972 if (coding->eol_type == CODING_EOL_LF
0ef69138 2973 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2974 goto label_no_conversion;
2975 produced = encode_eol (coding, source, destination,
2976 src_bytes, dst_bytes, consumed);
2977 break;
2978
2979 case coding_type_sjis:
2980 produced = encode_coding_sjis_big5 (coding, source, destination,
2981 src_bytes, dst_bytes, consumed,
2982 1);
2983 break;
2984
2985 case coding_type_iso2022:
2986 produced = encode_coding_iso2022 (coding, source, destination,
2987 src_bytes, dst_bytes, consumed);
2988 break;
2989
2990 case coding_type_big5:
2991 produced = encode_coding_sjis_big5 (coding, source, destination,
2992 src_bytes, dst_bytes, consumed,
2993 0);
2994 break;
2995
2996 case coding_type_ccl:
2997 produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2998 src_bytes, dst_bytes, consumed);
2999 break;
3000 }
3001
3002 return produced;
3003}
3004
3005#define CONVERSION_BUFFER_EXTRA_ROOM 256
3006
3007/* Return maximum size (bytes) of a buffer enough for decoding
3008 SRC_BYTES of text encoded in CODING. */
3009
3010int
3011decoding_buffer_size (coding, src_bytes)
3012 struct coding_system *coding;
3013 int src_bytes;
3014{
3015 int magnification;
3016
3017 if (coding->type == coding_type_iso2022)
3018 magnification = 3;
3019 else if (coding->type == coding_type_ccl)
3020 magnification = coding->spec.ccl.decoder.buf_magnification;
3021 else
3022 magnification = 2;
3023
3024 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3025}
3026
3027/* Return maximum size (bytes) of a buffer enough for encoding
3028 SRC_BYTES of text to CODING. */
3029
3030int
3031encoding_buffer_size (coding, src_bytes)
3032 struct coding_system *coding;
3033 int src_bytes;
3034{
3035 int magnification;
3036
3037 if (coding->type == coding_type_ccl)
3038 magnification = coding->spec.ccl.encoder.buf_magnification;
3039 else
3040 magnification = 3;
3041
3042 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3043}
3044
3045#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
3046#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
3047#endif
3048
3049char *conversion_buffer;
3050int conversion_buffer_size;
3051
3052/* Return a pointer to a SIZE bytes of buffer to be used for encoding
3053 or decoding. Sufficient memory is allocated automatically. If we
3054 run out of memory, return NULL. */
3055
3056char *
3057get_conversion_buffer (size)
3058 int size;
3059{
3060 if (size > conversion_buffer_size)
3061 {
3062 char *buf;
3063 int real_size = conversion_buffer_size * 2;
3064
3065 while (real_size < size) real_size *= 2;
3066 buf = (char *) xmalloc (real_size);
3067 xfree (conversion_buffer);
3068 conversion_buffer = buf;
3069 conversion_buffer_size = real_size;
3070 }
3071 return conversion_buffer;
3072}
3073
3074\f
3075#ifdef emacs
3076/*** 7. Emacs Lisp library functions ***/
3077
02ba4723 3078DEFUN ("coding-system-spec", Fcoding_system_spec, Scoding_system_spec,
4ed46869 3079 1, 1, 0,
02ba4723 3080 "Return coding-spec of CODING-SYSTEM.\n\
4ed46869
KH
3081If CODING-SYSTEM is not a valid coding-system, return nil.")
3082 (obj)
3083 Lisp_Object obj;
3084{
3085 while (SYMBOLP (obj) && !NILP (obj))
3086 obj = Fget (obj, Qcoding_system);
3087 return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
3088 ? Qnil : obj);
3089}
3090
3091DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
3092 "Return t if OBJECT is nil or a coding-system.\n\
3093See document of make-coding-system for coding-system object.")
3094 (obj)
3095 Lisp_Object obj;
3096{
02ba4723 3097 return ((NILP (obj) || !NILP (Fcoding_system_spec (obj))) ? Qt : Qnil);
4ed46869
KH
3098}
3099
9d991de8
RS
3100DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
3101 Sread_non_nil_coding_system, 1, 1, 0,
e0e989f6 3102 "Read a coding system from the minibuffer, prompting with string PROMPT.")
4ed46869
KH
3103 (prompt)
3104 Lisp_Object prompt;
3105{
e0e989f6 3106 Lisp_Object val;
9d991de8
RS
3107 do
3108 {
02ba4723 3109 val = Fcompleting_read (prompt, Vobarray, Qcoding_system_spec,
61e011d9 3110 Qt, Qnil, Qnil, Qnil, Qnil);
9d991de8
RS
3111 }
3112 while (XSTRING (val)->size == 0);
e0e989f6 3113 return (Fintern (val, Qnil));
4ed46869
KH
3114}
3115
9b787f3e
RS
3116DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
3117 "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
3118If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
3119 (prompt, default_coding_system)
3120 Lisp_Object prompt, default_coding_system;
4ed46869 3121{
f44d27ce 3122 Lisp_Object val;
9b787f3e
RS
3123 if (SYMBOLP (default_coding_system))
3124 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
f44d27ce 3125 val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
9b787f3e
RS
3126 Qt, Qnil, Qcoding_system_history,
3127 default_coding_system, Qnil);
e0e989f6 3128 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
3129}
3130
3131DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
3132 1, 1, 0,
3133 "Check validity of CODING-SYSTEM.\n\
3134If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
3135CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
3136The value of property should be a vector of length 5.")
3137 (coding_system)
3138 Lisp_Object coding_system;
3139{
3140 CHECK_SYMBOL (coding_system, 0);
3141 if (!NILP (Fcoding_system_p (coding_system)))
3142 return coding_system;
3143 while (1)
02ba4723 3144 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4ed46869
KH
3145}
3146
3147DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
3148 2, 2, 0,
bf9cdd4e
KH
3149 "Detect coding system of the text in the region between START and END.\n\
3150Return a list of possible coding systems ordered by priority.\n\
0ef69138 3151If only ASCII characters are found, it returns `undecided'\n\
bf9cdd4e 3152 or its subsidiary coding system according to a detected end-of-line format.")
4ed46869
KH
3153 (b, e)
3154 Lisp_Object b, e;
3155{
3156 int coding_mask, eol_type;
3157 Lisp_Object val;
3158 int beg, end;
3159
3160 validate_region (&b, &e);
3161 beg = XINT (b), end = XINT (e);
3162 if (beg < GPT && end >= GPT) move_gap (end);
3163
3164 coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
3165 eol_type = detect_eol_type (POS_ADDR (beg), end - beg);
3166
3167 if (coding_mask == CODING_CATEGORY_MASK_ANY)
3168 {
27901516
KH
3169 val = Qundecided;
3170 if (eol_type != CODING_EOL_UNDECIDED
3171 && eol_type != CODING_EOL_INCONSISTENT)
4ed46869 3172 {
f44d27ce
RS
3173 Lisp_Object val2;
3174 val2 = Fget (Qundecided, Qeol_type);
4ed46869
KH
3175 if (VECTORP (val2))
3176 val = XVECTOR (val2)->contents[eol_type];
3177 }
3178 }
3179 else
3180 {
3181 Lisp_Object val2;
3182
3183 /* At first, gather possible coding-systems in VAL in a reverse
3184 order. */
3185 val = Qnil;
3186 for (val2 = Vcoding_category_list;
3187 !NILP (val2);
3188 val2 = XCONS (val2)->cdr)
3189 {
3190 int idx
3191 = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
3192 if (coding_mask & (1 << idx))
27901516
KH
3193 {
3194#if 0
3195 /* This code is suppressed until we find a better way to
992f23f2 3196 distinguish raw text file and binary file. */
27901516
KH
3197
3198 if (idx == CODING_CATEGORY_IDX_RAW_TEXT
3199 && eol_type == CODING_EOL_INCONSISTENT)
3200 val = Fcons (Qno_conversion, val);
3201 else
3202#endif /* 0 */
3203 val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
3204 }
4ed46869
KH
3205 }
3206
3207 /* Then, change the order of the list, while getting subsidiary
3208 coding-systems. */
3209 val2 = val;
3210 val = Qnil;
27901516
KH
3211 if (eol_type == CODING_EOL_INCONSISTENT)
3212 eol_type == CODING_EOL_UNDECIDED;
4ed46869
KH
3213 for (; !NILP (val2); val2 = XCONS (val2)->cdr)
3214 {
0ef69138 3215 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
3216 val = Fcons (XCONS (val2)->car, val);
3217 else
3218 {
f44d27ce
RS
3219 Lisp_Object val3;
3220 val3 = Fget (XCONS (val2)->car, Qeol_type);
4ed46869
KH
3221 if (VECTORP (val3))
3222 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
3223 else
3224 val = Fcons (XCONS (val2)->car, val);
3225 }
3226 }
3227 }
3228
3229 return val;
3230}
3231
3232/* Scan text in the region between *BEGP and *ENDP, skip characters
3233 which we never have to encode to (iff ENCODEP is 1) or decode from
3234 coding system CODING at the head and tail, then set BEGP and ENDP
3235 to the addresses of start and end of the text we actually convert. */
3236
3237void
3238shrink_conversion_area (begp, endp, coding, encodep)
3239 unsigned char **begp, **endp;
3240 struct coding_system *coding;
3241 int encodep;
3242{
3243 register unsigned char *beg_addr = *begp, *end_addr = *endp;
3244
3245 if (coding->eol_type != CODING_EOL_LF
0ef69138 3246 && coding->eol_type != CODING_EOL_UNDECIDED)
4ed46869
KH
3247 /* Since we anyway have to convert end-of-line format, it is not
3248 worth skipping at most 100 bytes or so. */
3249 return;
3250
3251 if (encodep) /* for encoding */
3252 {
3253 switch (coding->type)
3254 {
3255 case coding_type_no_conversion:
0ef69138
KH
3256 case coding_type_emacs_mule:
3257 case coding_type_undecided:
27901516 3258 case coding_type_raw_text:
4ed46869
KH
3259 /* We need no conversion. */
3260 *begp = *endp;
3261 return;
3262 case coding_type_ccl:
3263 /* We can't skip any data. */
3264 return;
e0e989f6
KH
3265 case coding_type_iso2022:
3266 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3267 {
3268 unsigned char *bol = beg_addr;
3269 while (beg_addr < end_addr && *beg_addr < 0x80)
3270 {
3271 beg_addr++;
3272 if (*(beg_addr - 1) == '\n')
3273 bol = beg_addr;
3274 }
3275 beg_addr = bol;
3276 goto label_skip_tail;
3277 }
3278 /* fall down ... */
4ed46869
KH
3279 default:
3280 /* We can skip all ASCII characters at the head and tail. */
3281 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
e0e989f6 3282 label_skip_tail:
4ed46869
KH
3283 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3284 break;
3285 }
3286 }
3287 else /* for decoding */
3288 {
3289 switch (coding->type)
3290 {
3291 case coding_type_no_conversion:
3292 /* We need no conversion. */
3293 *begp = *endp;
3294 return;
0ef69138 3295 case coding_type_emacs_mule:
27901516 3296 case coding_type_raw_text:
4ed46869
KH
3297 if (coding->eol_type == CODING_EOL_LF)
3298 {
3299 /* We need no conversion. */
3300 *begp = *endp;
3301 return;
3302 }
3303 /* We can skip all but carriage-return. */
3304 while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
3305 while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
3306 break;
3307 case coding_type_sjis:
3308 case coding_type_big5:
3309 /* We can skip all ASCII characters at the head. */
3310 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3311 /* We can skip all ASCII characters at the tail except for
3312 the second byte of SJIS or BIG5 code. */
3313 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3314 if (end_addr != *endp)
3315 end_addr++;
3316 break;
3317 case coding_type_ccl:
3318 /* We can't skip any data. */
3319 return;
3320 default: /* i.e. case coding_type_iso2022: */
3321 {
3322 unsigned char c;
3323
3324 /* We can skip all ASCII characters except for a few
3325 control codes at the head. */
3326 while (beg_addr < end_addr && (c = *beg_addr) < 0x80
3327 && c != ISO_CODE_CR && c != ISO_CODE_SO
3328 && c != ISO_CODE_SI && c != ISO_CODE_ESC)
3329 beg_addr++;
3330 }
3331 break;
3332 }
3333 }
3334 *begp = beg_addr;
3335 *endp = end_addr;
3336 return;
3337}
3338
3339/* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3340 text between B and E. B and E are buffer position. */
3341
3342Lisp_Object
3343code_convert_region (b, e, coding, encodep)
3344 Lisp_Object b, e;
3345 struct coding_system *coding;
3346 int encodep;
3347{
3348 int beg, end, len, consumed, produced;
3349 char *buf;
3350 unsigned char *begp, *endp;
3351 int pos = PT;
3352
3353 validate_region (&b, &e);
3354 beg = XINT (b), end = XINT (e);
3355 if (beg < GPT && end >= GPT)
3356 move_gap (end);
3357
3358 if (encodep && !NILP (coding->pre_write_conversion))
3359 {
3360 /* We must call a pre-conversion function which may put a new
3361 text to be converted in a new buffer. */
3362 struct buffer *old = current_buffer, *new;
3363
3364 TEMP_SET_PT (beg);
3365 call2 (coding->pre_write_conversion, b, e);
3366 if (old != current_buffer)
3367 {
3368 /* Replace the original text by the text just generated. */
3369 len = ZV - BEGV;
3370 new = current_buffer;
3371 set_buffer_internal (old);
3372 del_range (beg, end);
3373 insert_from_buffer (new, 1, len, 0);
3374 end = beg + len;
3375 }
3376 }
3377
3378 /* We may be able to shrink the conversion region. */
3379 begp = POS_ADDR (beg); endp = begp + (end - beg);
3380 shrink_conversion_area (&begp, &endp, coding, encodep);
3381
3382 if (begp == endp)
3383 /* We need no conversion. */
3384 len = end - beg;
3385 else
3386 {
3387 beg += begp - POS_ADDR (beg);
3388 end = beg + (endp - begp);
3389
3390 if (encodep)
3391 len = encoding_buffer_size (coding, end - beg);
3392 else
3393 len = decoding_buffer_size (coding, end - beg);
3394 buf = get_conversion_buffer (len);
3395
3396 coding->last_block = 1;
3397 produced = (encodep
3398 ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3399 &consumed)
3400 : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3401 &consumed));
3402
3403 len = produced + (beg - XINT (b)) + (XINT (e) - end);
3404
3405 TEMP_SET_PT (beg);
3406 insert (buf, produced);
3407 del_range (PT, PT + end - beg);
3408 if (pos >= end)
3409 pos = PT + (pos - end);
3410 else if (pos > beg)
3411 pos = beg;
3412 TEMP_SET_PT (pos);
3413 }
3414
3415 if (!encodep && !NILP (coding->post_read_conversion))
3416 {
3417 /* We must call a post-conversion function which may alter
3418 the text just converted. */
3419 Lisp_Object insval;
3420
3421 beg = XINT (b);
3422 TEMP_SET_PT (beg);
3423 insval = call1 (coding->post_read_conversion, make_number (len));
3424 CHECK_NUMBER (insval, 0);
3425 len = XINT (insval);
3426 }
3427
3428 return make_number (len);
3429}
3430
3431Lisp_Object
e0e989f6
KH
3432code_convert_string (str, coding, encodep, nocopy)
3433 Lisp_Object str, nocopy;
4ed46869
KH
3434 struct coding_system *coding;
3435 int encodep;
3436{
3437 int len, consumed, produced;
3438 char *buf;
3439 unsigned char *begp, *endp;
3440 int head_skip, tail_skip;
3441 struct gcpro gcpro1;
3442
3443 if (encodep && !NILP (coding->pre_write_conversion)
3444 || !encodep && !NILP (coding->post_read_conversion))
3445 {
3446 /* Since we have to call Lisp functions which assume target text
3447 is in a buffer, after setting a temporary buffer, call
3448 code_convert_region. */
3449 int count = specpdl_ptr - specpdl;
3450 int len = XSTRING (str)->size;
3451 Lisp_Object result;
3452 struct buffer *old = current_buffer;
3453
3454 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3455 temp_output_buffer_setup (" *code-converting-work*");
3456 set_buffer_internal (XBUFFER (Vstandard_output));
3457 insert_from_string (str, 0, len, 0);
3458 code_convert_region (make_number (BEGV), make_number (ZV),
3459 coding, encodep);
3460 result = make_buffer_string (BEGV, ZV, 0);
3461 set_buffer_internal (old);
3462 return unbind_to (count, result);
3463 }
3464
3465 /* We may be able to shrink the conversion region. */
3466 begp = XSTRING (str)->data;
3467 endp = begp + XSTRING (str)->size;
3468 shrink_conversion_area (&begp, &endp, coding, encodep);
3469
3470 if (begp == endp)
3471 /* We need no conversion. */
e0e989f6 3472 return (NILP (nocopy) ? Fcopy_sequence (str) : str);
4ed46869
KH
3473
3474 head_skip = begp - XSTRING (str)->data;
3475 tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3476
3477 GCPRO1 (str);
3478
3479 if (encodep)
3480 len = encoding_buffer_size (coding, endp - begp);
3481 else
3482 len = decoding_buffer_size (coding, endp - begp);
3483 buf = get_conversion_buffer (len + head_skip + tail_skip);
3484
3485 bcopy (XSTRING (str)->data, buf, head_skip);
3486 coding->last_block = 1;
3487 produced = (encodep
3488 ? encode_coding (coding, XSTRING (str)->data + head_skip,
3489 buf + head_skip, endp - begp, len, &consumed)
3490 : decode_coding (coding, XSTRING (str)->data + head_skip,
3491 buf + head_skip, endp - begp, len, &consumed));
3492 bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3493 buf + head_skip + produced,
3494 tail_skip);
3495
3496 UNGCPRO;
3497
3498 return make_string (buf, head_skip + produced + tail_skip);
3499}
3500
3501DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
e0e989f6
KH
3502 3, 3, "r\nzCoding system: ",
3503 "Decode current region by specified coding system.\n\
3504When called from a program, takes three arguments:\n\
3505START, END, and CODING-SYSTEM. START END are buffer positions.\n\
4ed46869
KH
3506Return length of decoded text.")
3507 (b, e, coding_system)
3508 Lisp_Object b, e, coding_system;
3509{
3510 struct coding_system coding;
3511
3512 CHECK_NUMBER_COERCE_MARKER (b, 0);
3513 CHECK_NUMBER_COERCE_MARKER (e, 1);
3514 CHECK_SYMBOL (coding_system, 2);
3515
e0e989f6
KH
3516 if (NILP (coding_system))
3517 return make_number (XFASTINT (e) - XFASTINT (b));
4ed46869
KH
3518 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3519 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3520
3521 return code_convert_region (b, e, &coding, 0);
3522}
3523
3524DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
e0e989f6
KH
3525 3, 3, "r\nzCoding system: ",
3526 "Encode current region by specified coding system.\n\
3527When called from a program, takes three arguments:\n\
3528START, END, and CODING-SYSTEM. START END are buffer positions.\n\
4ed46869
KH
3529Return length of encoded text.")
3530 (b, e, coding_system)
3531 Lisp_Object b, e, coding_system;
3532{
3533 struct coding_system coding;
3534
3535 CHECK_NUMBER_COERCE_MARKER (b, 0);
3536 CHECK_NUMBER_COERCE_MARKER (e, 1);
3537 CHECK_SYMBOL (coding_system, 2);
3538
e0e989f6
KH
3539 if (NILP (coding_system))
3540 return make_number (XFASTINT (e) - XFASTINT (b));
4ed46869
KH
3541 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3542 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3543
3544 return code_convert_region (b, e, &coding, 1);
3545}
3546
3547DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
e0e989f6
KH
3548 2, 3, 0,
3549 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
fe487a71
RS
3550Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
3551if the decoding operation is trivial.")
e0e989f6
KH
3552 (string, coding_system, nocopy)
3553 Lisp_Object string, coding_system, nocopy;
4ed46869
KH
3554{
3555 struct coding_system coding;
3556
3557 CHECK_STRING (string, 0);
3558 CHECK_SYMBOL (coding_system, 1);
3559
e0e989f6
KH
3560 if (NILP (coding_system))
3561 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869
KH
3562 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3563 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3564
e0e989f6 3565 return code_convert_string (string, &coding, 0, nocopy);
4ed46869
KH
3566}
3567
3568DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
e0e989f6
KH
3569 2, 3, 0,
3570 "Encode STRING to CODING-SYSTEM, and return the result.\n\
fe487a71
RS
3571Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
3572if the encoding operation is trivial.")
e0e989f6
KH
3573 (string, coding_system, nocopy)
3574 Lisp_Object string, coding_system, nocopy;
4ed46869
KH
3575{
3576 struct coding_system coding;
3577
3578 CHECK_STRING (string, 0);
3579 CHECK_SYMBOL (coding_system, 1);
3580
e0e989f6
KH
3581 if (NILP (coding_system))
3582 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869
KH
3583 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3584 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3585
e0e989f6 3586 return code_convert_string (string, &coding, 1, nocopy);
4ed46869
KH
3587}
3588
3589DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
e0e989f6 3590 "Decode a JISX0208 character of shift-jis encoding.\n\
4ed46869
KH
3591CODE is the character code in SJIS.\n\
3592Return the corresponding character.")
3593 (code)
3594 Lisp_Object code;
3595{
3596 unsigned char c1, c2, s1, s2;
3597 Lisp_Object val;
3598
3599 CHECK_NUMBER (code, 0);
3600 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3601 DECODE_SJIS (s1, s2, c1, c2);
3602 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3603 return val;
3604}
3605
3606DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3607 "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3608Return the corresponding character code in SJIS.")
3609 (ch)
3610 Lisp_Object ch;
3611{
bcf26d6a 3612 int charset, c1, c2, s1, s2;
4ed46869
KH
3613 Lisp_Object val;
3614
3615 CHECK_NUMBER (ch, 0);
3616 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3617 if (charset == charset_jisx0208)
3618 {
3619 ENCODE_SJIS (c1, c2, s1, s2);
bcf26d6a 3620 XSETFASTINT (val, (s1 << 8) | s2);
4ed46869
KH
3621 }
3622 else
3623 XSETFASTINT (val, 0);
3624 return val;
3625}
3626
3627DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3628 "Decode a Big5 character CODE of BIG5 coding-system.\n\
3629CODE is the character code in BIG5.\n\
3630Return the corresponding character.")
3631 (code)
3632 Lisp_Object code;
3633{
3634 int charset;
3635 unsigned char b1, b2, c1, c2;
3636 Lisp_Object val;
3637
3638 CHECK_NUMBER (code, 0);
3639 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3640 DECODE_BIG5 (b1, b2, charset, c1, c2);
3641 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3642 return val;
3643}
3644
3645DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3646 "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3647Return the corresponding character code in Big5.")
3648 (ch)
3649 Lisp_Object ch;
3650{
bcf26d6a 3651 int charset, c1, c2, b1, b2;
4ed46869
KH
3652 Lisp_Object val;
3653
3654 CHECK_NUMBER (ch, 0);
3655 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3656 if (charset == charset_big5_1 || charset == charset_big5_2)
3657 {
3658 ENCODE_BIG5 (charset, c1, c2, b1, b2);
bcf26d6a 3659 XSETFASTINT (val, (b1 << 8) | b2);
4ed46869
KH
3660 }
3661 else
3662 XSETFASTINT (val, 0);
3663 return val;
3664}
3665
1ba9e4ab
KH
3666DEFUN ("set-terminal-coding-system-internal",
3667 Fset_terminal_coding_system_internal,
3668 Sset_terminal_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
3669 (coding_system)
3670 Lisp_Object coding_system;
3671{
3672 CHECK_SYMBOL (coding_system, 0);
3673 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
6e85d753
KH
3674 /* We had better not send unexpected characters to terminal. */
3675 terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
3676
4ed46869
KH
3677 return Qnil;
3678}
3679
c4825358
KH
3680DEFUN ("set-safe-terminal-coding-system-internal",
3681 Fset_safe_terminal_coding_system_internal,
3682 Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
3683 (coding_system)
3684 Lisp_Object coding_system;
3685{
3686 CHECK_SYMBOL (coding_system, 0);
3687 setup_coding_system (Fcheck_coding_system (coding_system),
3688 &safe_terminal_coding);
3689 return Qnil;
3690}
3691
4ed46869
KH
3692DEFUN ("terminal-coding-system",
3693 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3694 "Return coding-system of your terminal.")
3695 ()
3696{
3697 return terminal_coding.symbol;
3698}
3699
1ba9e4ab
KH
3700DEFUN ("set-keyboard-coding-system-internal",
3701 Fset_keyboard_coding_system_internal,
3702 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
3703 (coding_system)
3704 Lisp_Object coding_system;
3705{
3706 CHECK_SYMBOL (coding_system, 0);
3707 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3708 return Qnil;
3709}
3710
3711DEFUN ("keyboard-coding-system",
3712 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3713 "Return coding-system of what is sent from terminal keyboard.")
3714 ()
3715{
3716 return keyboard_coding.symbol;
3717}
3718
3719\f
a5d301df
KH
3720DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
3721 Sfind_operation_coding_system, 1, MANY, 0,
3722 "Choose a coding system for an operation based on the target name.\n\
9ce27fde
KH
3723The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
3724DECODING-SYSTEM is the coding system to use for decoding\n\
3725\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
3726for encoding (in case OPERATION does encoding).\n\
ccdb79f5
RS
3727\n\
3728The first argument OPERATION specifies an I/O primitive:\n\
3729 For file I/O, `insert-file-contents' or `write-region'.\n\
3730 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3731 For network I/O, `open-network-stream'.\n\
3732\n\
3733The remaining arguments should be the same arguments that were passed\n\
3734to the primitive. Depending on which primitive, one of those arguments\n\
3735is selected as the TARGET. For example, if OPERATION does file I/O,\n\
3736whichever argument specifies the file name is TARGET.\n\
3737\n\
3738TARGET has a meaning which depends on OPERATION:\n\
4ed46869
KH
3739 For file I/O, TARGET is a file name.\n\
3740 For process I/O, TARGET is a process name.\n\
3741 For network I/O, TARGET is a service name or a port number\n\
3742\n\
02ba4723
KH
3743This function looks up what specified for TARGET in,\n\
3744`file-coding-system-alist', `process-coding-system-alist',\n\
3745or `network-coding-system-alist' depending on OPERATION.\n\
3746They may specify a coding system, a cons of coding systems,\n\
3747or a function symbol to call.\n\
3748In the last case, we call the function with one argument,\n\
9ce27fde 3749which is a list of all the arguments given to this function.")
4ed46869
KH
3750 (nargs, args)
3751 int nargs;
3752 Lisp_Object *args;
3753{
3754 Lisp_Object operation, target_idx, target, val;
3755 register Lisp_Object chain;
3756
3757 if (nargs < 2)
3758 error ("Too few arguments");
3759 operation = args[0];
3760 if (!SYMBOLP (operation)
3761 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3762 error ("Invalid first arguement");
3763 if (nargs < 1 + XINT (target_idx))
3764 error ("Too few arguments for operation: %s",
3765 XSYMBOL (operation)->name->data);
3766 target = args[XINT (target_idx) + 1];
3767 if (!(STRINGP (target)
3768 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3769 error ("Invalid %dth argument", XINT (target_idx) + 1);
3770
2e34157c
RS
3771 chain = ((EQ (operation, Qinsert_file_contents)
3772 || EQ (operation, Qwrite_region))
02ba4723 3773 ? Vfile_coding_system_alist
2e34157c 3774 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
3775 ? Vnetwork_coding_system_alist
3776 : Vprocess_coding_system_alist));
4ed46869
KH
3777 if (NILP (chain))
3778 return Qnil;
3779
02ba4723 3780 for (; CONSP (chain); chain = XCONS (chain)->cdr)
4ed46869 3781 {
f44d27ce
RS
3782 Lisp_Object elt;
3783 elt = XCONS (chain)->car;
4ed46869
KH
3784
3785 if (CONSP (elt)
3786 && ((STRINGP (target)
3787 && STRINGP (XCONS (elt)->car)
3788 && fast_string_match (XCONS (elt)->car, target) >= 0)
3789 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
02ba4723
KH
3790 {
3791 val = XCONS (elt)->cdr;
b19fd4c5
KH
3792 /* Here, if VAL is both a valid coding system and a valid
3793 function symbol, we return VAL as a coding system. */
02ba4723
KH
3794 if (CONSP (val))
3795 return val;
3796 if (! SYMBOLP (val))
3797 return Qnil;
3798 if (! NILP (Fcoding_system_p (val)))
3799 return Fcons (val, val);
b19fd4c5
KH
3800 if (! NILP (Ffboundp (val)))
3801 {
3802 val = call1 (val, Flist (nargs, args));
3803 if (CONSP (val))
3804 return val;
3805 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
3806 return Fcons (val, val);
3807 }
02ba4723
KH
3808 return Qnil;
3809 }
4ed46869
KH
3810 }
3811 return Qnil;
3812}
3813
3814#endif /* emacs */
3815
3816\f
3817/*** 8. Post-amble ***/
3818
3819init_coding_once ()
3820{
3821 int i;
3822
0ef69138 3823 /* Emacs' internal format specific initialize routine. */
4ed46869
KH
3824 for (i = 0; i <= 0x20; i++)
3825 emacs_code_class[i] = EMACS_control_code;
3826 emacs_code_class[0x0A] = EMACS_linefeed_code;
3827 emacs_code_class[0x0D] = EMACS_carriage_return_code;
3828 for (i = 0x21 ; i < 0x7F; i++)
3829 emacs_code_class[i] = EMACS_ascii_code;
3830 emacs_code_class[0x7F] = EMACS_control_code;
3831 emacs_code_class[0x80] = EMACS_leading_code_composition;
3832 for (i = 0x81; i < 0xFF; i++)
3833 emacs_code_class[i] = EMACS_invalid_code;
3834 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3835 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3836 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3837 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3838
3839 /* ISO2022 specific initialize routine. */
3840 for (i = 0; i < 0x20; i++)
3841 iso_code_class[i] = ISO_control_code;
3842 for (i = 0x21; i < 0x7F; i++)
3843 iso_code_class[i] = ISO_graphic_plane_0;
3844 for (i = 0x80; i < 0xA0; i++)
3845 iso_code_class[i] = ISO_control_code;
3846 for (i = 0xA1; i < 0xFF; i++)
3847 iso_code_class[i] = ISO_graphic_plane_1;
3848 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3849 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3850 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3851 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3852 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3853 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3854 iso_code_class[ISO_CODE_ESC] = ISO_escape;
3855 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3856 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3857 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3858
e0e989f6
KH
3859 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3860 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3861
3862 setup_coding_system (Qnil, &keyboard_coding);
3863 setup_coding_system (Qnil, &terminal_coding);
c4825358 3864 setup_coding_system (Qnil, &safe_terminal_coding);
9ce27fde
KH
3865
3866#if defined (MSDOS) || defined (WINDOWSNT)
3867 system_eol_type = CODING_EOL_CRLF;
3868#else
3869 system_eol_type = CODING_EOL_LF;
3870#endif
e0e989f6
KH
3871}
3872
3873#ifdef emacs
3874
3875syms_of_coding ()
3876{
3877 Qtarget_idx = intern ("target-idx");
3878 staticpro (&Qtarget_idx);
3879
bb0115a2
RS
3880 Qcoding_system_history = intern ("coding-system-history");
3881 staticpro (&Qcoding_system_history);
3882 Fset (Qcoding_system_history, Qnil);
3883
9ce27fde 3884 /* Target FILENAME is the first argument. */
e0e989f6 3885 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 3886 /* Target FILENAME is the third argument. */
e0e989f6
KH
3887 Fput (Qwrite_region, Qtarget_idx, make_number (2));
3888
3889 Qcall_process = intern ("call-process");
3890 staticpro (&Qcall_process);
9ce27fde 3891 /* Target PROGRAM is the first argument. */
e0e989f6
KH
3892 Fput (Qcall_process, Qtarget_idx, make_number (0));
3893
3894 Qcall_process_region = intern ("call-process-region");
3895 staticpro (&Qcall_process_region);
9ce27fde 3896 /* Target PROGRAM is the third argument. */
e0e989f6
KH
3897 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3898
3899 Qstart_process = intern ("start-process");
3900 staticpro (&Qstart_process);
9ce27fde 3901 /* Target PROGRAM is the third argument. */
e0e989f6
KH
3902 Fput (Qstart_process, Qtarget_idx, make_number (2));
3903
3904 Qopen_network_stream = intern ("open-network-stream");
3905 staticpro (&Qopen_network_stream);
9ce27fde 3906 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
3907 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3908
4ed46869
KH
3909 Qcoding_system = intern ("coding-system");
3910 staticpro (&Qcoding_system);
3911
3912 Qeol_type = intern ("eol-type");
3913 staticpro (&Qeol_type);
3914
3915 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3916 staticpro (&Qbuffer_file_coding_system);
3917
3918 Qpost_read_conversion = intern ("post-read-conversion");
3919 staticpro (&Qpost_read_conversion);
3920
3921 Qpre_write_conversion = intern ("pre-write-conversion");
3922 staticpro (&Qpre_write_conversion);
3923
27901516
KH
3924 Qno_conversion = intern ("no-conversion");
3925 staticpro (&Qno_conversion);
3926
3927 Qundecided = intern ("undecided");
3928 staticpro (&Qundecided);
3929
02ba4723
KH
3930 Qcoding_system_spec = intern ("coding-system-spec");
3931 staticpro (&Qcoding_system_spec);
4ed46869
KH
3932
3933 Qcoding_system_p = intern ("coding-system-p");
3934 staticpro (&Qcoding_system_p);
3935
3936 Qcoding_system_error = intern ("coding-system-error");
3937 staticpro (&Qcoding_system_error);
3938
3939 Fput (Qcoding_system_error, Qerror_conditions,
3940 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3941 Fput (Qcoding_system_error, Qerror_message,
9ce27fde 3942 build_string ("Invalid coding system"));
4ed46869
KH
3943
3944 Qcoding_category_index = intern ("coding-category-index");
3945 staticpro (&Qcoding_category_index);
3946
3947 {
3948 int i;
3949 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3950 {
3951 coding_category_table[i] = intern (coding_category_name[i]);
3952 staticpro (&coding_category_table[i]);
3953 Fput (coding_category_table[i], Qcoding_category_index,
3954 make_number (i));
3955 }
3956 }
3957
bdd9fb48
KH
3958 Qcharacter_unification_table = intern ("character-unification-table");
3959 staticpro (&Qcharacter_unification_table);
3960 Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
3961 make_number (0));
3962
a5d301df
KH
3963 Qcharacter_unification_table_for_decode
3964 = intern ("character-unification-table-for-decode");
3965 staticpro (&Qcharacter_unification_table_for_decode);
3966
3967 Qcharacter_unification_table_for_encode
3968 = intern ("character-unification-table-for-encode");
3969 staticpro (&Qcharacter_unification_table_for_encode);
3970
9ce27fde
KH
3971 Qemacs_mule = intern ("emacs-mule");
3972 staticpro (&Qemacs_mule);
3973
02ba4723 3974 defsubr (&Scoding_system_spec);
4ed46869
KH
3975 defsubr (&Scoding_system_p);
3976 defsubr (&Sread_coding_system);
3977 defsubr (&Sread_non_nil_coding_system);
3978 defsubr (&Scheck_coding_system);
3979 defsubr (&Sdetect_coding_region);
3980 defsubr (&Sdecode_coding_region);
3981 defsubr (&Sencode_coding_region);
3982 defsubr (&Sdecode_coding_string);
3983 defsubr (&Sencode_coding_string);
3984 defsubr (&Sdecode_sjis_char);
3985 defsubr (&Sencode_sjis_char);
3986 defsubr (&Sdecode_big5_char);
3987 defsubr (&Sencode_big5_char);
1ba9e4ab 3988 defsubr (&Sset_terminal_coding_system_internal);
c4825358 3989 defsubr (&Sset_safe_terminal_coding_system_internal);
4ed46869 3990 defsubr (&Sterminal_coding_system);
1ba9e4ab 3991 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 3992 defsubr (&Skeyboard_coding_system);
a5d301df 3993 defsubr (&Sfind_operation_coding_system);
4ed46869
KH
3994
3995 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3996 "List of coding-categories (symbols) ordered by priority.");
3997 {
3998 int i;
3999
4000 Vcoding_category_list = Qnil;
4001 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
4002 Vcoding_category_list
4003 = Fcons (coding_category_table[i], Vcoding_category_list);
4004 }
4005
4006 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
10bff6f1 4007 "Specify the coding system for read operations.\n\
2ebb362d 4008It is useful to bind this variable with `let', but do not set it globally.\n\
4ed46869 4009If the value is a coding system, it is used for decoding on read operation.\n\
a67a9c66 4010If not, an appropriate element is used from one of the coding system alists:\n\
10bff6f1 4011There are three such tables, `file-coding-system-alist',\n\
a67a9c66 4012`process-coding-system-alist', and `network-coding-system-alist'.");
4ed46869
KH
4013 Vcoding_system_for_read = Qnil;
4014
4015 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
10bff6f1 4016 "Specify the coding system for write operations.\n\
2ebb362d 4017It is useful to bind this variable with `let', but do not set it globally.\n\
4ed46869 4018If the value is a coding system, it is used for encoding on write operation.\n\
a67a9c66 4019If not, an appropriate element is used from one of the coding system alists:\n\
10bff6f1 4020There are three such tables, `file-coding-system-alist',\n\
a67a9c66 4021`process-coding-system-alist', and `network-coding-system-alist'.");
4ed46869
KH
4022 Vcoding_system_for_write = Qnil;
4023
4024 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
a67a9c66 4025 "Coding system used in the latest file or process I/O.");
4ed46869
KH
4026 Vlast_coding_system_used = Qnil;
4027
9ce27fde
KH
4028 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
4029 "*Non-nil inhibit code conversion of end-of-line format in any cases.");
4030 inhibit_eol_conversion = 0;
4031
02ba4723
KH
4032 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
4033 "Alist to decide a coding system to use for a file I/O operation.\n\
4034The format is ((PATTERN . VAL) ...),\n\
4035where PATTERN is a regular expression matching a file name,\n\
4036VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4037If VAL is a coding system, it is used for both decoding and encoding\n\
4038the file contents.\n\
4039If VAL is a cons of coding systems, the car part is used for decoding,\n\
4040and the cdr part is used for encoding.\n\
4041If VAL is a function symbol, the function must return a coding system\n\
4042or a cons of coding systems which are used as above.\n\
e0e989f6 4043\n\
9ce27fde 4044See also the function `find-operation-coding-system'.");
02ba4723
KH
4045 Vfile_coding_system_alist = Qnil;
4046
4047 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
4048 "Alist to decide a coding system to use for a process I/O operation.\n\
4049The format is ((PATTERN . VAL) ...),\n\
4050where PATTERN is a regular expression matching a program name,\n\
4051VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4052If VAL is a coding system, it is used for both decoding what received\n\
4053from the program and encoding what sent to the program.\n\
4054If VAL is a cons of coding systems, the car part is used for decoding,\n\
4055and the cdr part is used for encoding.\n\
4056If VAL is a function symbol, the function must return a coding system\n\
4057or a cons of coding systems which are used as above.\n\
4ed46869 4058\n\
9ce27fde 4059See also the function `find-operation-coding-system'.");
02ba4723
KH
4060 Vprocess_coding_system_alist = Qnil;
4061
4062 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
4063 "Alist to decide a coding system to use for a network I/O operation.\n\
4064The format is ((PATTERN . VAL) ...),\n\
4065where PATTERN is a regular expression matching a network service name\n\
4066or is a port number to connect to,\n\
4067VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4068If VAL is a coding system, it is used for both decoding what received\n\
4069from the network stream and encoding what sent to the network stream.\n\
4070If VAL is a cons of coding systems, the car part is used for decoding,\n\
4071and the cdr part is used for encoding.\n\
4072If VAL is a function symbol, the function must return a coding system\n\
4073or a cons of coding systems which are used as above.\n\
4ed46869 4074\n\
9ce27fde 4075See also the function `find-operation-coding-system'.");
02ba4723 4076 Vnetwork_coding_system_alist = Qnil;
4ed46869
KH
4077
4078 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
4079 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
458822a0 4080 eol_mnemonic_unix = ':';
4ed46869
KH
4081
4082 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
4083 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
458822a0 4084 eol_mnemonic_dos = '\\';
4ed46869
KH
4085
4086 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
4087 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
458822a0 4088 eol_mnemonic_mac = '/';
4ed46869
KH
4089
4090 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
4091 "Mnemonic character indicating end-of-line format is not yet decided.");
458822a0 4092 eol_mnemonic_undecided = ':';
4ed46869 4093
bdd9fb48
KH
4094 DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
4095 "Non-nil means ISO 2022 encoder/decoder do character unification.");
4096 Venable_character_unification = Qt;
4097
a5d301df
KH
4098 DEFVAR_LISP ("standard-character-unification-table-for-decode",
4099 &Vstandard_character_unification_table_for_decode,
bdd9fb48 4100 "Table for unifying characters when reading.");
a5d301df 4101 Vstandard_character_unification_table_for_decode = Qnil;
bdd9fb48 4102
a5d301df
KH
4103 DEFVAR_LISP ("standard-character-unification-table-for-encode",
4104 &Vstandard_character_unification_table_for_encode,
bdd9fb48 4105 "Table for unifying characters when writing.");
a5d301df 4106 Vstandard_character_unification_table_for_encode = Qnil;
4ed46869
KH
4107
4108 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
4109 "Alist of charsets vs revision numbers.\n\
4110While encoding, if a charset (car part of an element) is found,\n\
4111designate it with the escape sequence identifing revision (cdr part of the element).");
4112 Vcharset_revision_alist = Qnil;
02ba4723
KH
4113
4114 DEFVAR_LISP ("default-process-coding-system",
4115 &Vdefault_process_coding_system,
4116 "Cons of coding systems used for process I/O by default.\n\
4117The car part is used for decoding a process output,\n\
4118the cdr part is used for encoding a text to be sent to a process.");
4119 Vdefault_process_coding_system = Qnil;
c4825358 4120
3f003981
KH
4121 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
4122 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
c4825358
KH
4123This is a vector of length 256.\n\
4124If Nth element is non-nil, the existence of code N in a file\n\
bb0115a2 4125\(or output of subprocess) doesn't prevent it to be detected as\n\
3f003981
KH
4126a coding system of ISO 2022 variant which has a flag\n\
4127`accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
c4825358
KH
4128or reading output of a subprocess.\n\
4129Only 128th through 159th elements has a meaning.");
3f003981 4130 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
4ed46869
KH
4131}
4132
4133#endif /* emacs */