(Frequire): Pass t for the MUST-SUFFIX arg to Fload.
[bpt/emacs.git] / src / coding.c
CommitLineData
4ed46869 1/* Coding system handler (conversion, detection, and etc).
203cb916
RS
2 Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4ed46869 4
369314dc
KH
5This file is part of GNU Emacs.
6
7GNU Emacs is free software; you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation; either version 2, or (at your option)
10any later version.
4ed46869 11
369314dc
KH
12GNU Emacs is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
4ed46869 16
369314dc
KH
17You should have received a copy of the GNU General Public License
18along with GNU Emacs; see the file COPYING. If not, write to
19the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20Boston, MA 02111-1307, USA. */
4ed46869
KH
21
22/*** TABLE OF CONTENTS ***
23
24 1. Preamble
0ef69138 25 2. Emacs' internal format (emacs-mule) handlers
4ed46869
KH
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
28 5. End-of-line handlers
29 6. C library functions
30 7. Emacs Lisp library functions
31 8. Post-amble
32
33*/
34
35/*** GENERAL NOTE on CODING SYSTEM ***
36
37 Coding system is an encoding mechanism of one or more character
38 sets. Here's a list of coding systems which Emacs can handle. When
39 we say "decode", it means converting some other coding system to
0ef69138
KH
40 Emacs' internal format (emacs-internal), and when we say "encode",
41 it means converting the coding system emacs-mule to some other
42 coding system.
4ed46869 43
0ef69138 44 0. Emacs' internal format (emacs-mule)
4ed46869
KH
45
46 Emacs itself holds a multi-lingual character in a buffer and a string
f4dee582 47 in a special format. Details are described in section 2.
4ed46869
KH
48
49 1. ISO2022
50
51 The most famous coding system for multiple character sets. X's
f4dee582
RS
52 Compound Text, various EUCs (Extended Unix Code), and coding
53 systems used in Internet communication such as ISO-2022-JP are
54 all variants of ISO2022. Details are described in section 3.
4ed46869
KH
55
56 2. SJIS (or Shift-JIS or MS-Kanji-Code)
57
58 A coding system to encode character sets: ASCII, JISX0201, and
59 JISX0208. Widely used for PC's in Japan. Details are described in
f4dee582 60 section 4.
4ed46869
KH
61
62 3. BIG5
63
64 A coding system to encode character sets: ASCII and Big5. Widely
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
f4dee582
RS
66 described in section 4. In this file, when we write "BIG5"
67 (all uppercase), we mean the coding system, and when we write
68 "Big5" (capitalized), we mean the character set.
4ed46869 69
f4dee582 70 4. Other
4ed46869 71
f4dee582 72 If a user wants to read/write a text encoded in a coding system not
4ed46869
KH
73 listed above, he can supply a decoder and an encoder for it in CCL
74 (Code Conversion Language) programs. Emacs executes the CCL program
75 while reading/writing.
76
f4dee582 77 Emacs represents a coding-system by a Lisp symbol that has a property
4ed46869
KH
78 `coding-system'. But, before actually using the coding-system, the
79 information about it is set in a structure of type `struct
f4dee582 80 coding_system' for rapid processing. See section 6 for more details.
4ed46869
KH
81
82*/
83
84/*** GENERAL NOTES on END-OF-LINE FORMAT ***
85
86 How end-of-line of a text is encoded depends on a system. For
87 instance, Unix's format is just one byte of `line-feed' code,
f4dee582 88 whereas DOS's format is two-byte sequence of `carriage-return' and
4ed46869
KH
89 `line-feed' codes. MacOS's format is one byte of `carriage-return'.
90
f4dee582
RS
91 Since text characters encoding and end-of-line encoding are
92 independent, any coding system described above can take
4ed46869 93 any format of end-of-line. So, Emacs has information of format of
f4dee582 94 end-of-line in each coding-system. See section 6 for more details.
4ed46869
KH
95
96*/
97
98/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
99
100 These functions check if a text between SRC and SRC_END is encoded
101 in the coding system category XXX. Each returns an integer value in
102 which appropriate flag bits for the category XXX is set. The flag
103 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
104 template of these functions. */
105#if 0
106int
0ef69138 107detect_coding_emacs_mule (src, src_end)
4ed46869
KH
108 unsigned char *src, *src_end;
109{
110 ...
111}
112#endif
113
114/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
115
116 These functions decode SRC_BYTES length text at SOURCE encoded in
0ef69138 117 CODING to Emacs' internal format (emacs-mule). The resulting text
f4dee582
RS
118 goes to a place pointed to by DESTINATION, the length of which should
119 not exceed DST_BYTES. The number of bytes actually processed is
120 returned as *CONSUMED. The return value is the length of the decoded
121 text. Below is a template of these functions. */
4ed46869
KH
122#if 0
123decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
124 struct coding_system *coding;
125 unsigned char *source, *destination;
126 int src_bytes, dst_bytes;
127 int *consumed;
128{
129 ...
130}
131#endif
132
133/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
134
0ef69138
KH
135 These functions encode SRC_BYTES length text at SOURCE of Emacs'
136 internal format (emacs-mule) to CODING. The resulting text goes to
f4dee582
RS
137 a place pointed to by DESTINATION, the length of which should not
138 exceed DST_BYTES. The number of bytes actually processed is
139 returned as *CONSUMED. The return value is the length of the
140 encoded text. Below is a template of these functions. */
4ed46869
KH
141#if 0
142encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
143 struct coding_system *coding;
144 unsigned char *source, *destination;
145 int src_bytes, dst_bytes;
146 int *consumed;
147{
148 ...
149}
150#endif
151
152/*** COMMONLY USED MACROS ***/
153
154/* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
155 THREE_MORE_BYTES safely get one, two, and three bytes from the
156 source text respectively. If there are not enough bytes in the
157 source, they jump to `label_end_of_loop'. The caller should set
158 variables `src' and `src_end' to appropriate areas in advance. */
159
160#define ONE_MORE_BYTE(c1) \
161 do { \
162 if (src < src_end) \
163 c1 = *src++; \
164 else \
165 goto label_end_of_loop; \
166 } while (0)
167
168#define TWO_MORE_BYTES(c1, c2) \
169 do { \
170 if (src + 1 < src_end) \
171 c1 = *src++, c2 = *src++; \
172 else \
173 goto label_end_of_loop; \
174 } while (0)
175
176#define THREE_MORE_BYTES(c1, c2, c3) \
177 do { \
178 if (src + 2 < src_end) \
179 c1 = *src++, c2 = *src++, c3 = *src++; \
180 else \
181 goto label_end_of_loop; \
182 } while (0)
183
184/* The following three macros DECODE_CHARACTER_ASCII,
185 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
186 the multi-byte form of a character of each class at the place
187 pointed by `dst'. The caller should set the variable `dst' to
188 point to an appropriate area and the variable `coding' to point to
189 the coding-system of the currently decoding text in advance. */
190
191/* Decode one ASCII character C. */
192
193#define DECODE_CHARACTER_ASCII(c) \
194 do { \
195 if (COMPOSING_P (coding->composing)) \
196 *dst++ = 0xA0, *dst++ = (c) | 0x80; \
197 else \
198 *dst++ = (c); \
199 } while (0)
200
f4dee582 201/* Decode one DIMENSION1 character whose charset is CHARSET and whose
4ed46869
KH
202 position-code is C. */
203
204#define DECODE_CHARACTER_DIMENSION1(charset, c) \
205 do { \
206 unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset); \
207 if (COMPOSING_P (coding->composing)) \
208 *dst++ = leading_code + 0x20; \
209 else \
210 *dst++ = leading_code; \
211 if (leading_code = CHARSET_LEADING_CODE_EXT (charset)) \
212 *dst++ = leading_code; \
213 *dst++ = (c) | 0x80; \
214 } while (0)
215
f4dee582 216/* Decode one DIMENSION2 character whose charset is CHARSET and whose
4ed46869
KH
217 position-codes are C1 and C2. */
218
219#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2) \
220 do { \
221 DECODE_CHARACTER_DIMENSION1 (charset, c1); \
222 *dst++ = (c2) | 0x80; \
223 } while (0)
224
225\f
226/*** 1. Preamble ***/
227
228#include <stdio.h>
229
230#ifdef emacs
231
232#include <config.h>
233#include "lisp.h"
234#include "buffer.h"
235#include "charset.h"
236#include "ccl.h"
237#include "coding.h"
238#include "window.h"
239
240#else /* not emacs */
241
242#include "mulelib.h"
243
244#endif /* not emacs */
245
246Lisp_Object Qcoding_system, Qeol_type;
247Lisp_Object Qbuffer_file_coding_system;
248Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
249
250extern Lisp_Object Qinsert_file_contents, Qwrite_region;
251Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
252Lisp_Object Qstart_process, Qopen_network_stream;
253Lisp_Object Qtarget_idx;
254
255/* Mnemonic character of each format of end-of-line. */
256int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
257/* Mnemonic character to indicate format of end-of-line is not yet
258 decided. */
259int eol_mnemonic_undecided;
260
9ce27fde
KH
261/* Format of end-of-line decided by system. This is CODING_EOL_LF on
262 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
263int system_eol_type;
264
4ed46869
KH
265#ifdef emacs
266
02ba4723 267Lisp_Object Qcoding_system_spec, Qcoding_system_p, Qcoding_system_error;
4ed46869 268
9ce27fde
KH
269/* Coding system emacs-mule is for converting only end-of-line format. */
270Lisp_Object Qemacs_mule;
271
4ed46869
KH
272/* Coding-systems are handed between Emacs Lisp programs and C internal
273 routines by the following three variables. */
274/* Coding-system for reading files and receiving data from process. */
275Lisp_Object Vcoding_system_for_read;
276/* Coding-system for writing files and sending data to process. */
277Lisp_Object Vcoding_system_for_write;
278/* Coding-system actually used in the latest I/O. */
279Lisp_Object Vlast_coding_system_used;
280
9ce27fde
KH
281/* Flag to inhibit code conversion of end-of-line format. */
282int inhibit_eol_conversion;
283
4ed46869
KH
284/* Coding-system of what terminal accept for displaying. */
285struct coding_system terminal_coding;
286
287/* Coding-system of what is sent from terminal keyboard. */
288struct coding_system keyboard_coding;
289
02ba4723
KH
290Lisp_Object Vfile_coding_system_alist;
291Lisp_Object Vprocess_coding_system_alist;
292Lisp_Object Vnetwork_coding_system_alist;
4ed46869
KH
293
294#endif /* emacs */
295
296Lisp_Object Qcoding_category_index;
297
298/* List of symbols `coding-category-xxx' ordered by priority. */
299Lisp_Object Vcoding_category_list;
300
301/* Table of coding-systems currently assigned to each coding-category. */
302Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
303
304/* Table of names of symbol for each coding-category. */
305char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
0ef69138 306 "coding-category-emacs-mule",
4ed46869
KH
307 "coding-category-sjis",
308 "coding-category-iso-7",
309 "coding-category-iso-8-1",
310 "coding-category-iso-8-2",
7717c392
KH
311 "coding-category-iso-7-else",
312 "coding-category-iso-8-else",
4ed46869
KH
313 "coding-category-big5",
314 "coding-category-binary"
315};
316
bdd9fb48
KH
317/* Flag to tell if we look up unification table on character code
318 conversion. */
319Lisp_Object Venable_character_unification;
a5d301df
KH
320/* Standard unification table to look up on decoding (reading). */
321Lisp_Object Vstandard_character_unification_table_for_decode;
322/* Standard unification table to look up on encoding (writing). */
323Lisp_Object Vstandard_character_unification_table_for_encode;
bdd9fb48
KH
324
325Lisp_Object Qcharacter_unification_table;
a5d301df
KH
326Lisp_Object Qcharacter_unification_table_for_decode;
327Lisp_Object Qcharacter_unification_table_for_encode;
4ed46869
KH
328
329/* Alist of charsets vs revision number. */
330Lisp_Object Vcharset_revision_alist;
331
02ba4723
KH
332/* Default coding systems used for process I/O. */
333Lisp_Object Vdefault_process_coding_system;
334
4ed46869 335\f
0ef69138 336/*** 2. Emacs internal format (emacs-mule) handlers ***/
4ed46869
KH
337
338/* Emacs' internal format for encoding multiple character sets is a
f4dee582
RS
339 kind of multi-byte encoding, i.e. characters are encoded by
340 variable-length sequences of one-byte codes. ASCII characters
341 and control characters (e.g. `tab', `newline') are represented by
342 one-byte sequences which are their ASCII codes, in the range 0x00
343 through 0x7F. The other characters are represented by a sequence
344 of `base leading-code', optional `extended leading-code', and one
345 or two `position-code's. The length of the sequence is determined
346 by the base leading-code. Leading-code takes the range 0x80
347 through 0x9F, whereas extended leading-code and position-code take
348 the range 0xA0 through 0xFF. See `charset.h' for more details
349 about leading-code and position-code.
350
351 There's one exception to this rule. Special leading-code
4ed46869
KH
352 `leading-code-composition' denotes that the following several
353 characters should be composed into one character. Leading-codes of
354 components (except for ASCII) are added 0x20. An ASCII character
355 component is represented by a 2-byte sequence of `0xA0' and
f4dee582
RS
356 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
357 details of composite character. Hence, we can summarize the code
4ed46869
KH
358 range as follows:
359
360 --- CODE RANGE of Emacs' internal format ---
361 (character set) (range)
362 ASCII 0x00 .. 0x7F
363 ELSE (1st byte) 0x80 .. 0x9F
364 (rest bytes) 0xA0 .. 0xFF
365 ---------------------------------------------
366
367 */
368
369enum emacs_code_class_type emacs_code_class[256];
370
371/* Go to the next statement only if *SRC is accessible and the code is
372 greater than 0xA0. */
373#define CHECK_CODE_RANGE_A0_FF \
374 do { \
375 if (src >= src_end) \
376 goto label_end_of_switch; \
377 else if (*src++ < 0xA0) \
378 return 0; \
379 } while (0)
380
381/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
382 Check if a text is encoded in Emacs' internal format. If it is,
0ef69138 383 return CODING_CATEGORY_MASK_EMASC_MULE, else return 0. */
4ed46869
KH
384
385int
0ef69138 386detect_coding_emacs_mule (src, src_end)
4ed46869
KH
387 unsigned char *src, *src_end;
388{
389 unsigned char c;
390 int composing = 0;
391
392 while (src < src_end)
393 {
394 c = *src++;
395
396 if (composing)
397 {
398 if (c < 0xA0)
399 composing = 0;
400 else
401 c -= 0x20;
402 }
403
404 switch (emacs_code_class[c])
405 {
406 case EMACS_ascii_code:
407 case EMACS_linefeed_code:
408 break;
409
410 case EMACS_control_code:
411 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
412 return 0;
413 break;
414
415 case EMACS_invalid_code:
416 return 0;
417
418 case EMACS_leading_code_composition: /* c == 0x80 */
419 if (composing)
420 CHECK_CODE_RANGE_A0_FF;
421 else
422 composing = 1;
423 break;
424
425 case EMACS_leading_code_4:
426 CHECK_CODE_RANGE_A0_FF;
427 /* fall down to check it two more times ... */
428
429 case EMACS_leading_code_3:
430 CHECK_CODE_RANGE_A0_FF;
431 /* fall down to check it one more time ... */
432
433 case EMACS_leading_code_2:
434 CHECK_CODE_RANGE_A0_FF;
435 break;
436
437 default:
438 label_end_of_switch:
439 break;
440 }
441 }
0ef69138 442 return CODING_CATEGORY_MASK_EMACS_MULE;
4ed46869
KH
443}
444
445\f
446/*** 3. ISO2022 handlers ***/
447
448/* The following note describes the coding system ISO2022 briefly.
f4dee582
RS
449 Since the intention of this note is to help in understanding of
450 the programs in this file, some parts are NOT ACCURATE or OVERLY
4ed46869
KH
451 SIMPLIFIED. For the thorough understanding, please refer to the
452 original document of ISO2022.
453
454 ISO2022 provides many mechanisms to encode several character sets
f4dee582 455 in 7-bit and 8-bit environment. If one chooses 7-bite environment,
4ed46869 456 all text is encoded by codes of less than 128. This may make the
f4dee582
RS
457 encoded text a little bit longer, but the text gets more stability
458 to pass through several gateways (some of them strip off the MSB).
4ed46869 459
f4dee582 460 There are two kinds of character set: control character set and
4ed46869
KH
461 graphic character set. The former contains control characters such
462 as `newline' and `escape' to provide control functions (control
f4dee582 463 functions are provided also by escape sequences). The latter
4ed46869
KH
464 contains graphic characters such as ' A' and '-'. Emacs recognizes
465 two control character sets and many graphic character sets.
466
467 Graphic character sets are classified into one of the following
468 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
469 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
470 bytes (DIMENSION) and the number of characters in one dimension
471 (CHARS) of the set. In addition, each character set is assigned an
472 identification tag (called "final character" and denoted as <F>
473 here after) which is unique in each class. <F> of each character
474 set is decided by ECMA(*) when it is registered in ISO. Code range
475 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
476
477 Note (*): ECMA = European Computer Manufacturers Association
478
479 Here are examples of graphic character set [NAME(<F>)]:
480 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
481 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
482 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
483 o DIMENSION2_CHARS96 -- none for the moment
484
485 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
486 C0 [0x00..0x1F] -- control character plane 0
487 GL [0x20..0x7F] -- graphic character plane 0
488 C1 [0x80..0x9F] -- control character plane 1
489 GR [0xA0..0xFF] -- graphic character plane 1
490
491 A control character set is directly designated and invoked to C0 or
492 C1 by an escape sequence. The most common case is that ISO646's
493 control character set is designated/invoked to C0 and ISO6429's
494 control character set is designated/invoked to C1, and usually
495 these designations/invocations are omitted in a coded text. With
496 7-bit environment, only C0 can be used, and a control character for
497 C1 is encoded by an appropriate escape sequence to fit in the
498 environment. All control characters for C1 are defined the
499 corresponding escape sequences.
500
501 A graphic character set is at first designated to one of four
502 graphic registers (G0 through G3), then these graphic registers are
503 invoked to GL or GR. These designations and invocations can be
504 done independently. The most common case is that G0 is invoked to
505 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
506 these invocations and designations are omitted in a coded text.
507 With 7-bit environment, only GL can be used.
508
509 When a graphic character set of CHARS94 is invoked to GL, code 0x20
510 and 0x7F of GL area work as control characters SPACE and DEL
511 respectively, and code 0xA0 and 0xFF of GR area should not be used.
512
513 There are two ways of invocation: locking-shift and single-shift.
514 With locking-shift, the invocation lasts until the next different
515 invocation, whereas with single-shift, the invocation works only
516 for the following character and doesn't affect locking-shift.
517 Invocations are done by the following control characters or escape
518 sequences.
519
520 ----------------------------------------------------------------------
521 function control char escape sequence description
522 ----------------------------------------------------------------------
523 SI (shift-in) 0x0F none invoke G0 to GL
524 SI (shift-out) 0x0E none invoke G1 to GL
525 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
526 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
527 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
528 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
529 ----------------------------------------------------------------------
530 The first four are for locking-shift. Control characters for these
531 functions are defined by macros ISO_CODE_XXX in `coding.h'.
532
533 Designations are done by the following escape sequences.
534 ----------------------------------------------------------------------
535 escape sequence description
536 ----------------------------------------------------------------------
537 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
538 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
539 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
540 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
541 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
542 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
543 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
544 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
545 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
546 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
547 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
548 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
549 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
550 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
551 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
552 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
553 ----------------------------------------------------------------------
554
555 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
556 of dimension 1, chars 94, and final character <F>, and etc.
557
558 Note (*): Although these designations are not allowed in ISO2022,
559 Emacs accepts them on decoding, and produces them on encoding
560 CHARS96 character set in a coding system which is characterized as
561 7-bit environment, non-locking-shift, and non-single-shift.
562
563 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
564 '(' can be omitted. We call this as "short-form" here after.
565
566 Now you may notice that there are a lot of ways for encoding the
f4dee582 567 same multilingual text in ISO2022. Actually, there exists many
4ed46869
KH
568 coding systems such as Compound Text (used in X's inter client
569 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
570 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
571 localized platforms), and all of these are variants of ISO2022.
572
573 In addition to the above, Emacs handles two more kinds of escape
574 sequences: ISO6429's direction specification and Emacs' private
575 sequence for specifying character composition.
576
577 ISO6429's direction specification takes the following format:
578 o CSI ']' -- end of the current direction
579 o CSI '0' ']' -- end of the current direction
580 o CSI '1' ']' -- start of left-to-right text
581 o CSI '2' ']' -- start of right-to-left text
582 The control character CSI (0x9B: control sequence introducer) is
583 abbreviated to the escape sequence ESC '[' in 7-bit environment.
584
585 Character composition specification takes the following format:
586 o ESC '0' -- start character composition
587 o ESC '1' -- end character composition
588 Since these are not standard escape sequences of any ISO, the use
589 of them for these meaning is restricted to Emacs only. */
590
591enum iso_code_class_type iso_code_class[256];
592
593/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
594 Check if a text is encoded in ISO2022. If it is, returns an
595 integer in which appropriate flag bits any of:
596 CODING_CATEGORY_MASK_ISO_7
597 CODING_CATEGORY_MASK_ISO_8_1
598 CODING_CATEGORY_MASK_ISO_8_2
7717c392
KH
599 CODING_CATEGORY_MASK_ISO_7_ELSE
600 CODING_CATEGORY_MASK_ISO_8_ELSE
4ed46869
KH
601 are set. If a code which should never appear in ISO2022 is found,
602 returns 0. */
603
604int
605detect_coding_iso2022 (src, src_end)
606 unsigned char *src, *src_end;
607{
765a2ca5
KH
608 int mask = (CODING_CATEGORY_MASK_ISO_7
609 | CODING_CATEGORY_MASK_ISO_8_1
610 | CODING_CATEGORY_MASK_ISO_8_2
7717c392
KH
611 | CODING_CATEGORY_MASK_ISO_7_ELSE
612 | CODING_CATEGORY_MASK_ISO_8_ELSE
613 );
bcf26d6a
KH
614 int g1 = 0; /* 1 iff designating to G1. */
615 int c, i;
4ed46869 616
e0e989f6 617 while (src < src_end)
4ed46869
KH
618 {
619 c = *src++;
620 switch (c)
621 {
622 case ISO_CODE_ESC:
e0e989f6 623 if (src >= src_end)
4ed46869
KH
624 break;
625 c = *src++;
bcf26d6a 626 if (src < src_end
e0e989f6
KH
627 && ((c >= '(' && c <= '/')
628 || c == '$' && ((*src >= '(' && *src <= '/')
629 || (*src >= '@' && *src <= 'B'))))
4ed46869 630 {
e0e989f6
KH
631 /* Valid designation sequence. */
632 if (c == ')' || (c == '$' && *src == ')'))
bcf26d6a
KH
633 {
634 g1 = 1;
7717c392
KH
635 mask &= ~(CODING_CATEGORY_MASK_ISO_7
636 | CODING_CATEGORY_MASK_ISO_7_ELSE);
bcf26d6a 637 }
e0e989f6
KH
638 src++;
639 break;
4ed46869 640 }
4ed46869 641 else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
7717c392
KH
642 mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
643 | CODING_CATEGORY_MASK_ISO_8_ELSE);
4ed46869
KH
644 break;
645
4ed46869 646 case ISO_CODE_SO:
e0e989f6 647 if (g1)
7717c392
KH
648 mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
649 | CODING_CATEGORY_MASK_ISO_8_ELSE);
e0e989f6
KH
650 break;
651
4ed46869
KH
652 case ISO_CODE_CSI:
653 case ISO_CODE_SS2:
654 case ISO_CODE_SS3:
7717c392
KH
655 mask &= ~(CODING_CATEGORY_MASK_ISO_7
656 | CODING_CATEGORY_MASK_ISO_7_ELSE);
4ed46869
KH
657 break;
658
659 default:
660 if (c < 0x80)
661 break;
662 else if (c < 0xA0)
663 return 0;
664 else
665 {
7717c392 666 unsigned char *src_begin = src;
4ed46869 667
7717c392
KH
668 mask &= ~(CODING_CATEGORY_MASK_ISO_7
669 | CODING_CATEGORY_MASK_ISO_7_ELSE);
e0e989f6 670 while (src < src_end && *src >= 0xA0)
7717c392
KH
671 src++;
672 if ((src - src_begin - 1) & 1 && src < src_end)
4ed46869
KH
673 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
674 }
675 break;
676 }
677 }
678
679 return mask;
680}
681
682/* Decode a character of which charset is CHARSET and the 1st position
bdd9fb48 683 code is C1. If dimension of CHARSET is 2, the 2nd position code is
4ed46869
KH
684 fetched from SRC and set to C2. If CHARSET is negative, it means
685 that we are decoding ill formed text, and what we can do is just to
686 read C1 as is. */
687
bdd9fb48
KH
688#define DECODE_ISO_CHARACTER(charset, c1) \
689 do { \
690 int c_alt, charset_alt = (charset); \
691 if (COMPOSING_HEAD_P (coding->composing)) \
692 { \
693 *dst++ = LEADING_CODE_COMPOSITION; \
694 if (COMPOSING_WITH_RULE_P (coding->composing)) \
695 /* To tell composition rules are embeded. */ \
696 *dst++ = 0xFF; \
697 coding->composing += 2; \
698 } \
699 if ((charset) >= 0) \
700 { \
701 if (CHARSET_DIMENSION (charset) == 2) \
702 ONE_MORE_BYTE (c2); \
703 if (!NILP (unification_table) \
704 && ((c_alt = unify_char (unification_table, \
705 -1, (charset), c1, c2)) >= 0)) \
706 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
707 } \
708 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
709 DECODE_CHARACTER_ASCII (c1); \
710 else if (CHARSET_DIMENSION (charset_alt) == 1) \
711 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
712 else \
713 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
714 if (COMPOSING_WITH_RULE_P (coding->composing)) \
715 /* To tell a composition rule follows. */ \
716 coding->composing = COMPOSING_WITH_RULE_RULE; \
4ed46869
KH
717 } while (0)
718
719/* Set designation state into CODING. */
720#define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
721 do { \
2e34157c
RS
722 int charset = ISO_CHARSET_TABLE (make_number (dimension), \
723 make_number (chars), \
724 make_number (final_char)); \
4ed46869
KH
725 if (charset >= 0) \
726 { \
727 if (coding->direction == 1 \
728 && CHARSET_REVERSE_CHARSET (charset) >= 0) \
729 charset = CHARSET_REVERSE_CHARSET (charset); \
730 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
731 } \
732 } while (0)
733
734/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
735
736int
737decode_coding_iso2022 (coding, source, destination,
738 src_bytes, dst_bytes, consumed)
739 struct coding_system *coding;
740 unsigned char *source, *destination;
741 int src_bytes, dst_bytes;
742 int *consumed;
743{
744 unsigned char *src = source;
745 unsigned char *src_end = source + src_bytes;
746 unsigned char *dst = destination;
747 unsigned char *dst_end = destination + dst_bytes;
748 /* Since the maximum bytes produced by each loop is 7, we subtract 6
749 from DST_END to assure that overflow checking is necessary only
750 at the head of loop. */
751 unsigned char *adjusted_dst_end = dst_end - 6;
752 int charset;
753 /* Charsets invoked to graphic plane 0 and 1 respectively. */
754 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
755 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
a5d301df
KH
756 Lisp_Object unification_table
757 = coding->character_unification_table_for_decode;
bdd9fb48
KH
758
759 if (!NILP (Venable_character_unification) && NILP (unification_table))
a5d301df 760 unification_table = Vstandard_character_unification_table_for_decode;
4ed46869
KH
761
762 while (src < src_end && dst < adjusted_dst_end)
763 {
764 /* SRC_BASE remembers the start position in source in each loop.
765 The loop will be exited when there's not enough source text
766 to analyze long escape sequence or 2-byte code (within macros
767 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
768 to SRC_BASE before exiting. */
769 unsigned char *src_base = src;
bdd9fb48 770 int c1 = *src++, c2;
4ed46869
KH
771
772 switch (iso_code_class [c1])
773 {
774 case ISO_0x20_or_0x7F:
775 if (!coding->composing
776 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
777 {
778 /* This is SPACE or DEL. */
779 *dst++ = c1;
780 break;
781 }
782 /* This is a graphic character, we fall down ... */
783
784 case ISO_graphic_plane_0:
785 if (coding->composing == COMPOSING_WITH_RULE_RULE)
786 {
787 /* This is a composition rule. */
788 *dst++ = c1 | 0x80;
789 coding->composing = COMPOSING_WITH_RULE_TAIL;
790 }
791 else
792 DECODE_ISO_CHARACTER (charset0, c1);
793 break;
794
795 case ISO_0xA0_or_0xFF:
796 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
797 {
798 /* Invalid code. */
799 *dst++ = c1;
800 break;
801 }
802 /* This is a graphic character, we fall down ... */
803
804 case ISO_graphic_plane_1:
805 DECODE_ISO_CHARACTER (charset1, c1);
806 break;
807
808 case ISO_control_code:
809 /* All ISO2022 control characters in this class have the
810 same representation in Emacs internal format. */
811 *dst++ = c1;
812 break;
813
814 case ISO_carriage_return:
815 if (coding->eol_type == CODING_EOL_CR)
816 {
817 *dst++ = '\n';
818 }
819 else if (coding->eol_type == CODING_EOL_CRLF)
820 {
821 ONE_MORE_BYTE (c1);
822 if (c1 == ISO_CODE_LF)
823 *dst++ = '\n';
824 else
825 {
826 src--;
827 *dst++ = c1;
828 }
829 }
830 else
831 {
832 *dst++ = c1;
833 }
834 break;
835
836 case ISO_shift_out:
e0e989f6
KH
837 if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
838 goto label_invalid_escape_sequence;
4ed46869
KH
839 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
840 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
841 break;
842
843 case ISO_shift_in:
844 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
845 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
846 break;
847
848 case ISO_single_shift_2_7:
849 case ISO_single_shift_2:
850 /* SS2 is handled as an escape sequence of ESC 'N' */
851 c1 = 'N';
852 goto label_escape_sequence;
853
854 case ISO_single_shift_3:
855 /* SS2 is handled as an escape sequence of ESC 'O' */
856 c1 = 'O';
857 goto label_escape_sequence;
858
859 case ISO_control_sequence_introducer:
860 /* CSI is handled as an escape sequence of ESC '[' ... */
861 c1 = '[';
862 goto label_escape_sequence;
863
864 case ISO_escape:
865 ONE_MORE_BYTE (c1);
866 label_escape_sequence:
867 /* Escape sequences handled by Emacs are invocation,
868 designation, direction specification, and character
869 composition specification. */
870 switch (c1)
871 {
872 case '&': /* revision of following character set */
873 ONE_MORE_BYTE (c1);
874 if (!(c1 >= '@' && c1 <= '~'))
e0e989f6 875 goto label_invalid_escape_sequence;
4ed46869
KH
876 ONE_MORE_BYTE (c1);
877 if (c1 != ISO_CODE_ESC)
e0e989f6 878 goto label_invalid_escape_sequence;
4ed46869
KH
879 ONE_MORE_BYTE (c1);
880 goto label_escape_sequence;
881
882 case '$': /* designation of 2-byte character set */
883 ONE_MORE_BYTE (c1);
884 if (c1 >= '@' && c1 <= 'B')
885 { /* designation of JISX0208.1978, GB2312.1980,
886 or JISX0208.1980 */
887 DECODE_DESIGNATION (0, 2, 94, c1);
888 }
889 else if (c1 >= 0x28 && c1 <= 0x2B)
890 { /* designation of DIMENSION2_CHARS94 character set */
891 ONE_MORE_BYTE (c2);
892 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
893 }
894 else if (c1 >= 0x2C && c1 <= 0x2F)
895 { /* designation of DIMENSION2_CHARS96 character set */
896 ONE_MORE_BYTE (c2);
897 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
898 }
899 else
e0e989f6 900 goto label_invalid_escape_sequence;
4ed46869
KH
901 break;
902
903 case 'n': /* invocation of locking-shift-2 */
e0e989f6
KH
904 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
905 goto label_invalid_escape_sequence;
4ed46869 906 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
e0e989f6 907 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
908 break;
909
910 case 'o': /* invocation of locking-shift-3 */
e0e989f6
KH
911 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
912 goto label_invalid_escape_sequence;
4ed46869 913 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
e0e989f6 914 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
915 break;
916
917 case 'N': /* invocation of single-shift-2 */
e0e989f6
KH
918 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
919 goto label_invalid_escape_sequence;
4ed46869
KH
920 ONE_MORE_BYTE (c1);
921 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
922 DECODE_ISO_CHARACTER (charset, c1);
923 break;
924
925 case 'O': /* invocation of single-shift-3 */
e0e989f6
KH
926 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
927 goto label_invalid_escape_sequence;
4ed46869
KH
928 ONE_MORE_BYTE (c1);
929 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
930 DECODE_ISO_CHARACTER (charset, c1);
931 break;
932
933 case '0': /* start composing without embeded rules */
934 coding->composing = COMPOSING_NO_RULE_HEAD;
935 break;
936
937 case '1': /* end composing */
938 coding->composing = COMPOSING_NO;
939 break;
940
941 case '2': /* start composing with embeded rules */
942 coding->composing = COMPOSING_WITH_RULE_HEAD;
943 break;
944
945 case '[': /* specification of direction */
946 /* For the moment, nested direction is not supported.
947 So, the value of `coding->direction' is 0 or 1: 0
948 means left-to-right, 1 means right-to-left. */
949 ONE_MORE_BYTE (c1);
950 switch (c1)
951 {
952 case ']': /* end of the current direction */
953 coding->direction = 0;
954
955 case '0': /* end of the current direction */
956 case '1': /* start of left-to-right direction */
957 ONE_MORE_BYTE (c1);
958 if (c1 == ']')
959 coding->direction = 0;
960 else
961 goto label_invalid_escape_sequence;
962 break;
963
964 case '2': /* start of right-to-left direction */
965 ONE_MORE_BYTE (c1);
966 if (c1 == ']')
967 coding->direction= 1;
968 else
969 goto label_invalid_escape_sequence;
970 break;
971
972 default:
973 goto label_invalid_escape_sequence;
974 }
975 break;
976
977 default:
978 if (c1 >= 0x28 && c1 <= 0x2B)
979 { /* designation of DIMENSION1_CHARS94 character set */
980 ONE_MORE_BYTE (c2);
981 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
982 }
983 else if (c1 >= 0x2C && c1 <= 0x2F)
984 { /* designation of DIMENSION1_CHARS96 character set */
985 ONE_MORE_BYTE (c2);
986 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
987 }
988 else
989 {
990 goto label_invalid_escape_sequence;
991 }
992 }
993 /* We must update these variables now. */
994 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
995 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
996 break;
997
998 label_invalid_escape_sequence:
999 {
1000 int length = src - src_base;
1001
1002 bcopy (src_base, dst, length);
1003 dst += length;
1004 }
1005 }
1006 continue;
1007
1008 label_end_of_loop:
1009 coding->carryover_size = src - src_base;
1010 bcopy (src_base, coding->carryover, coding->carryover_size);
1011 src = src_base;
1012 break;
1013 }
1014
1015 /* If this is the last block of the text to be decoded, we had
1016 better just flush out all remaining codes in the text although
1017 they are not valid characters. */
1018 if (coding->last_block)
1019 {
1020 bcopy (src, dst, src_end - src);
1021 dst += (src_end - src);
1022 src = src_end;
1023 }
1024 *consumed = src - source;
1025 return dst - destination;
1026}
1027
f4dee582 1028/* ISO2022 encoding stuff. */
4ed46869
KH
1029
1030/*
f4dee582 1031 It is not enough to say just "ISO2022" on encoding, we have to
4ed46869
KH
1032 specify more details. In Emacs, each coding-system of ISO2022
1033 variant has the following specifications:
1034 1. Initial designation to G0 thru G3.
1035 2. Allows short-form designation?
1036 3. ASCII should be designated to G0 before control characters?
1037 4. ASCII should be designated to G0 at end of line?
1038 5. 7-bit environment or 8-bit environment?
1039 6. Use locking-shift?
1040 7. Use Single-shift?
1041 And the following two are only for Japanese:
1042 8. Use ASCII in place of JIS0201-1976-Roman?
1043 9. Use JISX0208-1983 in place of JISX0208-1978?
1044 These specifications are encoded in `coding->flags' as flag bits
1045 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
f4dee582 1046 details.
4ed46869
KH
1047*/
1048
1049/* Produce codes (escape sequence) for designating CHARSET to graphic
1050 register REG. If <final-char> of CHARSET is '@', 'A', or 'B' and
1051 the coding system CODING allows, produce designation sequence of
1052 short-form. */
1053
1054#define ENCODE_DESIGNATION(charset, reg, coding) \
1055 do { \
1056 unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \
1057 char *intermediate_char_94 = "()*+"; \
1058 char *intermediate_char_96 = ",-./"; \
1059 Lisp_Object temp \
1060 = Fassq (make_number (charset), Vcharset_revision_alist); \
1061 if (! NILP (temp)) \
1062 { \
1063 *dst++ = ISO_CODE_ESC; \
1064 *dst++ = '&'; \
1065 *dst++ = XINT (XCONS (temp)->cdr) + '@'; \
1066 } \
1067 *dst++ = ISO_CODE_ESC; \
1068 if (CHARSET_DIMENSION (charset) == 1) \
1069 { \
1070 if (CHARSET_CHARS (charset) == 94) \
1071 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1072 else \
1073 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1074 } \
1075 else \
1076 { \
1077 *dst++ = '$'; \
1078 if (CHARSET_CHARS (charset) == 94) \
1079 { \
1080 if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
1081 || reg != 0 \
1082 || final_char < '@' || final_char > 'B') \
1083 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1084 } \
1085 else \
1086 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1087 } \
1088 *dst++ = final_char; \
1089 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
1090 } while (0)
1091
1092/* The following two macros produce codes (control character or escape
1093 sequence) for ISO2022 single-shift functions (single-shift-2 and
1094 single-shift-3). */
1095
1096#define ENCODE_SINGLE_SHIFT_2 \
1097 do { \
1098 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1099 *dst++ = ISO_CODE_ESC, *dst++ = 'N'; \
1100 else \
1101 *dst++ = ISO_CODE_SS2; \
1102 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1103 } while (0)
1104
1105#define ENCODE_SINGLE_SHIFT_3 \
1106 do { \
1107 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1108 *dst++ = ISO_CODE_ESC, *dst++ = 'O'; \
1109 else \
1110 *dst++ = ISO_CODE_SS3; \
1111 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1112 } while (0)
1113
1114/* The following four macros produce codes (control character or
1115 escape sequence) for ISO2022 locking-shift functions (shift-in,
1116 shift-out, locking-shift-2, and locking-shift-3). */
1117
1118#define ENCODE_SHIFT_IN \
1119 do { \
1120 *dst++ = ISO_CODE_SI; \
1121 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1122 } while (0)
1123
1124#define ENCODE_SHIFT_OUT \
1125 do { \
1126 *dst++ = ISO_CODE_SO; \
1127 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1128 } while (0)
1129
1130#define ENCODE_LOCKING_SHIFT_2 \
1131 do { \
1132 *dst++ = ISO_CODE_ESC, *dst++ = 'n'; \
1133 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1134 } while (0)
1135
1136#define ENCODE_LOCKING_SHIFT_3 \
1137 do { \
1138 *dst++ = ISO_CODE_ESC, *dst++ = 'o'; \
1139 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1140 } while (0)
1141
f4dee582
RS
1142/* Produce codes for a DIMENSION1 character whose character set is
1143 CHARSET and whose position-code is C1. Designation and invocation
4ed46869
KH
1144 sequences are also produced in advance if necessary. */
1145
1146
1147#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
1148 do { \
1149 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1150 { \
1151 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1152 *dst++ = c1 & 0x7F; \
1153 else \
1154 *dst++ = c1 | 0x80; \
1155 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1156 break; \
1157 } \
1158 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1159 { \
1160 *dst++ = c1 & 0x7F; \
1161 break; \
1162 } \
1163 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1164 { \
1165 *dst++ = c1 | 0x80; \
1166 break; \
1167 } \
1168 else \
1169 /* Since CHARSET is not yet invoked to any graphic planes, we \
1170 must invoke it, or, at first, designate it to some graphic \
1171 register. Then repeat the loop to actually produce the \
1172 character. */ \
1173 dst = encode_invocation_designation (charset, coding, dst); \
1174 } while (1)
1175
f4dee582
RS
1176/* Produce codes for a DIMENSION2 character whose character set is
1177 CHARSET and whose position-codes are C1 and C2. Designation and
4ed46869
KH
1178 invocation codes are also produced in advance if necessary. */
1179
1180#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
1181 do { \
1182 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1183 { \
1184 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1185 *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F; \
1186 else \
1187 *dst++ = c1 | 0x80, *dst++ = c2 | 0x80; \
1188 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1189 break; \
1190 } \
1191 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1192 { \
1193 *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F; \
1194 break; \
1195 } \
1196 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1197 { \
1198 *dst++ = c1 | 0x80, *dst++= c2 | 0x80; \
1199 break; \
1200 } \
1201 else \
1202 /* Since CHARSET is not yet invoked to any graphic planes, we \
1203 must invoke it, or, at first, designate it to some graphic \
1204 register. Then repeat the loop to actually produce the \
1205 character. */ \
1206 dst = encode_invocation_designation (charset, coding, dst); \
1207 } while (1)
1208
bdd9fb48
KH
1209#define ENCODE_ISO_CHARACTER(charset, c1, c2) \
1210 do { \
1211 int c_alt, charset_alt; \
1212 if (!NILP (unification_table) \
1213 && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
a5d301df 1214 >= 0)) \
bdd9fb48
KH
1215 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1216 else \
1217 charset_alt = charset; \
1218 if (CHARSET_DIMENSION (charset_alt) == 1) \
1219 ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1); \
1220 else \
1221 ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
1222 } while (0)
1223
4ed46869
KH
1224/* Produce designation and invocation codes at a place pointed by DST
1225 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1226 Return new DST. */
1227
1228unsigned char *
1229encode_invocation_designation (charset, coding, dst)
1230 int charset;
1231 struct coding_system *coding;
1232 unsigned char *dst;
1233{
1234 int reg; /* graphic register number */
1235
1236 /* At first, check designations. */
1237 for (reg = 0; reg < 4; reg++)
1238 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1239 break;
1240
1241 if (reg >= 4)
1242 {
1243 /* CHARSET is not yet designated to any graphic registers. */
1244 /* At first check the requested designation. */
1245 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1ba9e4ab
KH
1246 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1247 /* Since CHARSET requests no special designation, designate it
1248 to graphic register 0. */
4ed46869
KH
1249 reg = 0;
1250
1251 ENCODE_DESIGNATION (charset, reg, coding);
1252 }
1253
1254 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1255 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1256 {
1257 /* Since the graphic register REG is not invoked to any graphic
1258 planes, invoke it to graphic plane 0. */
1259 switch (reg)
1260 {
1261 case 0: /* graphic register 0 */
1262 ENCODE_SHIFT_IN;
1263 break;
1264
1265 case 1: /* graphic register 1 */
1266 ENCODE_SHIFT_OUT;
1267 break;
1268
1269 case 2: /* graphic register 2 */
1270 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1271 ENCODE_SINGLE_SHIFT_2;
1272 else
1273 ENCODE_LOCKING_SHIFT_2;
1274 break;
1275
1276 case 3: /* graphic register 3 */
1277 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1278 ENCODE_SINGLE_SHIFT_3;
1279 else
1280 ENCODE_LOCKING_SHIFT_3;
1281 break;
1282 }
1283 }
1284 return dst;
1285}
1286
1287/* The following two macros produce codes for indicating composition. */
1288#define ENCODE_COMPOSITION_NO_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '0'
1289#define ENCODE_COMPOSITION_WITH_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '2'
1290#define ENCODE_COMPOSITION_END *dst++ = ISO_CODE_ESC, *dst++ = '1'
1291
1292/* The following three macros produce codes for indicating direction
1293 of text. */
1294#define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
1295 do { \
1296 if (coding->flags == CODING_FLAG_ISO_SEVEN_BITS) \
1297 *dst++ = ISO_CODE_ESC, *dst++ = '['; \
1298 else \
1299 *dst++ = ISO_CODE_CSI; \
1300 } while (0)
1301
1302#define ENCODE_DIRECTION_R2L \
1303 ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '2', *dst++ = ']'
1304
1305#define ENCODE_DIRECTION_L2R \
1306 ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '0', *dst++ = ']'
1307
1308/* Produce codes for designation and invocation to reset the graphic
1309 planes and registers to initial state. */
e0e989f6
KH
1310#define ENCODE_RESET_PLANE_AND_REGISTER \
1311 do { \
1312 int reg; \
1313 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
1314 ENCODE_SHIFT_IN; \
1315 for (reg = 0; reg < 4; reg++) \
1316 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0 \
1317 && (CODING_SPEC_ISO_DESIGNATION (coding, reg) \
1318 != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))) \
1319 ENCODE_DESIGNATION \
1320 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
4ed46869
KH
1321 } while (0)
1322
bdd9fb48
KH
1323/* Produce designation sequences of charsets in the line started from
1324 *SRC to a place pointed by DSTP.
1325
1326 If the current block ends before any end-of-line, we may fail to
1327 find all the necessary *designations. */
1328encode_designation_at_bol (coding, table, src, src_end, dstp)
e0e989f6 1329 struct coding_system *coding;
bdd9fb48 1330 Lisp_Object table;
e0e989f6
KH
1331 unsigned char *src, *src_end, **dstp;
1332{
bdd9fb48
KH
1333 int charset, c, found = 0, reg;
1334 /* Table of charsets to be designated to each graphic register. */
1335 int r[4];
1336 unsigned char *dst = *dstp;
1337
1338 for (reg = 0; reg < 4; reg++)
1339 r[reg] = -1;
1340
1341 while (src < src_end && *src != '\n' && found < 4)
e0e989f6 1342 {
bdd9fb48
KH
1343 int bytes = BYTES_BY_CHAR_HEAD (*src);
1344
1345 if (NILP (table))
1346 charset = CHARSET_AT (src);
1347 else
e0e989f6 1348 {
bdd9fb48
KH
1349 int c_alt, c1, c2;
1350
1351 SPLIT_STRING(src, bytes, charset, c1, c2);
1352 if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1353 charset = CHAR_CHARSET (c_alt);
e0e989f6 1354 }
bdd9fb48 1355
e0e989f6 1356 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1ba9e4ab 1357 if (r[reg] == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
bdd9fb48
KH
1358 {
1359 found++;
1360 r[reg] = charset;
1361 }
1362
1363 src += bytes;
1364 }
1365
1366 if (found)
1367 {
1368 for (reg = 0; reg < 4; reg++)
1369 if (r[reg] >= 0
1370 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1371 ENCODE_DESIGNATION (r[reg], reg, coding);
1372 *dstp = dst;
e0e989f6 1373 }
e0e989f6
KH
1374}
1375
4ed46869
KH
1376/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1377
1378int
1379encode_coding_iso2022 (coding, source, destination,
1380 src_bytes, dst_bytes, consumed)
1381 struct coding_system *coding;
1382 unsigned char *source, *destination;
1383 int src_bytes, dst_bytes;
1384 int *consumed;
1385{
1386 unsigned char *src = source;
1387 unsigned char *src_end = source + src_bytes;
1388 unsigned char *dst = destination;
1389 unsigned char *dst_end = destination + dst_bytes;
e0e989f6 1390 /* Since the maximum bytes produced by each loop is 20, we subtract 19
4ed46869
KH
1391 from DST_END to assure overflow checking is necessary only at the
1392 head of loop. */
e0e989f6 1393 unsigned char *adjusted_dst_end = dst_end - 19;
a5d301df
KH
1394 Lisp_Object unification_table
1395 = coding->character_unification_table_for_encode;
bdd9fb48
KH
1396
1397 if (!NILP (Venable_character_unification) && NILP (unification_table))
a5d301df 1398 unification_table = Vstandard_character_unification_table_for_encode;
4ed46869
KH
1399
1400 while (src < src_end && dst < adjusted_dst_end)
1401 {
1402 /* SRC_BASE remembers the start position in source in each loop.
1403 The loop will be exited when there's not enough source text
1404 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1405 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1406 reset to SRC_BASE before exiting. */
1407 unsigned char *src_base = src;
bdd9fb48 1408 int charset, c1, c2, c3, c4;
4ed46869 1409
e0e989f6
KH
1410 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1411 && CODING_SPEC_ISO_BOL (coding))
1412 {
bdd9fb48
KH
1413 /* We have to produce designation sequences if any now. */
1414 encode_designation_at_bol (coding, unification_table,
1415 src, src_end, &dst);
e0e989f6
KH
1416 CODING_SPEC_ISO_BOL (coding) = 0;
1417 }
1418
1419 c1 = *src++;
4ed46869
KH
1420 /* If we are seeing a component of a composite character, we are
1421 seeing a leading-code specially encoded for composition, or a
1422 composition rule if composing with rule. We must set C1
1423 to a normal leading-code or an ASCII code. If we are not at
1424 a composed character, we must reset the composition state. */
1425 if (COMPOSING_P (coding->composing))
1426 {
1427 if (c1 < 0xA0)
1428 {
1429 /* We are not in a composite character any longer. */
1430 coding->composing = COMPOSING_NO;
1431 ENCODE_COMPOSITION_END;
1432 }
1433 else
1434 {
1435 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1436 {
1437 *dst++ = c1 & 0x7F;
1438 coding->composing = COMPOSING_WITH_RULE_HEAD;
1439 continue;
1440 }
1441 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1442 coding->composing = COMPOSING_WITH_RULE_RULE;
1443 if (c1 == 0xA0)
1444 {
1445 /* This is an ASCII component. */
1446 ONE_MORE_BYTE (c1);
1447 c1 &= 0x7F;
1448 }
1449 else
1450 /* This is a leading-code of non ASCII component. */
1451 c1 -= 0x20;
1452 }
1453 }
1454
1455 /* Now encode one character. C1 is a control character, an
1456 ASCII character, or a leading-code of multi-byte character. */
1457 switch (emacs_code_class[c1])
1458 {
1459 case EMACS_ascii_code:
bdd9fb48 1460 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
4ed46869
KH
1461 break;
1462
1463 case EMACS_control_code:
1464 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1465 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1466 *dst++ = c1;
1467 break;
1468
1469 case EMACS_carriage_return_code:
1470 if (!coding->selective)
1471 {
1472 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1473 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1474 *dst++ = c1;
1475 break;
1476 }
1477 /* fall down to treat '\r' as '\n' ... */
1478
1479 case EMACS_linefeed_code:
1480 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
e0e989f6
KH
1481 ENCODE_RESET_PLANE_AND_REGISTER;
1482 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1483 bcopy (coding->spec.iso2022.initial_designation,
1484 coding->spec.iso2022.current_designation,
1485 sizeof coding->spec.iso2022.initial_designation);
4ed46869 1486 if (coding->eol_type == CODING_EOL_LF
0ef69138 1487 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
1488 *dst++ = ISO_CODE_LF;
1489 else if (coding->eol_type == CODING_EOL_CRLF)
1490 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1491 else
1492 *dst++ = ISO_CODE_CR;
e0e989f6 1493 CODING_SPEC_ISO_BOL (coding) = 1;
4ed46869
KH
1494 break;
1495
1496 case EMACS_leading_code_2:
1497 ONE_MORE_BYTE (c2);
19a8d9e0
KH
1498 if (c2 < 0xA0)
1499 {
1500 /* invalid sequence */
1501 *dst++ = c1;
1502 *dst++ = c2;
1503 }
1504 else
1505 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
1506 break;
1507
1508 case EMACS_leading_code_3:
1509 TWO_MORE_BYTES (c2, c3);
19a8d9e0
KH
1510 if (c2 < 0xA0 || c3 < 0xA0)
1511 {
1512 /* invalid sequence */
1513 *dst++ = c1;
1514 *dst++ = c2;
1515 *dst++ = c3;
1516 }
1517 else if (c1 < LEADING_CODE_PRIVATE_11)
bdd9fb48 1518 ENCODE_ISO_CHARACTER (c1, c2, c3);
4ed46869 1519 else
bdd9fb48 1520 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
4ed46869
KH
1521 break;
1522
1523 case EMACS_leading_code_4:
1524 THREE_MORE_BYTES (c2, c3, c4);
19a8d9e0
KH
1525 if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1526 {
1527 /* invalid sequence */
1528 *dst++ = c1;
1529 *dst++ = c2;
1530 *dst++ = c3;
1531 *dst++ = c4;
1532 }
1533 else
1534 ENCODE_ISO_CHARACTER (c2, c3, c4);
4ed46869
KH
1535 break;
1536
1537 case EMACS_leading_code_composition:
19a8d9e0
KH
1538 ONE_MORE_BYTE (c2);
1539 if (c2 < 0xA0)
1540 {
1541 /* invalid sequence */
1542 *dst++ = c1;
1543 *dst++ = c2;
1544 }
1545 else if (c2 == 0xFF)
4ed46869
KH
1546 {
1547 coding->composing = COMPOSING_WITH_RULE_HEAD;
1548 ENCODE_COMPOSITION_WITH_RULE_START;
1549 }
1550 else
1551 {
1552 /* Rewind one byte because it is a character code of
1553 composition elements. */
1554 src--;
1555 coding->composing = COMPOSING_NO_RULE_HEAD;
1556 ENCODE_COMPOSITION_NO_RULE_START;
1557 }
1558 break;
1559
1560 case EMACS_invalid_code:
1561 *dst++ = c1;
1562 break;
1563 }
1564 continue;
1565 label_end_of_loop:
76376439
KH
1566 /* We reach here because the source date ends not at character
1567 boundary. */
1568 coding->carryover_size = src_end - src_base;
4ed46869 1569 bcopy (src_base, coding->carryover, coding->carryover_size);
76376439 1570 src = src_end;
4ed46869
KH
1571 break;
1572 }
1573
1574 /* If this is the last block of the text to be encoded, we must
bdd9fb48
KH
1575 reset graphic planes and registers to the initial state. */
1576 if (src >= src_end && coding->last_block)
4ed46869 1577 {
e0e989f6 1578 ENCODE_RESET_PLANE_AND_REGISTER;
bdd9fb48
KH
1579 if (coding->carryover_size > 0
1580 && coding->carryover_size < (dst_end - dst))
1581 {
1582 bcopy (coding->carryover, dst, coding->carryover_size);
1583 dst += coding->carryover_size;
1584 coding->carryover_size = 0;
1585 }
4ed46869
KH
1586 }
1587 *consumed = src - source;
1588 return dst - destination;
1589}
1590
1591\f
1592/*** 4. SJIS and BIG5 handlers ***/
1593
f4dee582 1594/* Although SJIS and BIG5 are not ISO's coding system, they are used
4ed46869
KH
1595 quite widely. So, for the moment, Emacs supports them in the bare
1596 C code. But, in the future, they may be supported only by CCL. */
1597
1598/* SJIS is a coding system encoding three character sets: ASCII, right
1599 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
1600 as is. A character of charset katakana-jisx0201 is encoded by
1601 "position-code + 0x80". A character of charset japanese-jisx0208
1602 is encoded in 2-byte but two position-codes are divided and shifted
1603 so that it fit in the range below.
1604
1605 --- CODE RANGE of SJIS ---
1606 (character set) (range)
1607 ASCII 0x00 .. 0x7F
1608 KATAKANA-JISX0201 0xA0 .. 0xDF
1609 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1610 (2nd byte) 0x40 .. 0xFF
1611 -------------------------------
1612
1613*/
1614
1615/* BIG5 is a coding system encoding two character sets: ASCII and
1616 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1617 character set and is encoded in two-byte.
1618
1619 --- CODE RANGE of BIG5 ---
1620 (character set) (range)
1621 ASCII 0x00 .. 0x7F
1622 Big5 (1st byte) 0xA1 .. 0xFE
1623 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1624 --------------------------
1625
1626 Since the number of characters in Big5 is larger than maximum
1627 characters in Emacs' charset (96x96), it can't be handled as one
1628 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
1629 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
1630 contains frequently used characters and the latter contains less
1631 frequently used characters. */
1632
1633/* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
1634 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
1635 C1 and C2 are the 1st and 2nd position-codes of of Emacs' internal
1636 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
1637
1638/* Number of Big5 characters which have the same code in 1st byte. */
1639#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
1640
1641#define DECODE_BIG5(b1, b2, charset, c1, c2) \
1642 do { \
1643 unsigned int temp \
1644 = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62); \
1645 if (b1 < 0xC9) \
1646 charset = charset_big5_1; \
1647 else \
1648 { \
1649 charset = charset_big5_2; \
1650 temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW; \
1651 } \
1652 c1 = temp / (0xFF - 0xA1) + 0x21; \
1653 c2 = temp % (0xFF - 0xA1) + 0x21; \
1654 } while (0)
1655
1656#define ENCODE_BIG5(charset, c1, c2, b1, b2) \
1657 do { \
1658 unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21); \
1659 if (charset == charset_big5_2) \
1660 temp += BIG5_SAME_ROW * (0xC9 - 0xA1); \
1661 b1 = temp / BIG5_SAME_ROW + 0xA1; \
1662 b2 = temp % BIG5_SAME_ROW; \
1663 b2 += b2 < 0x3F ? 0x40 : 0x62; \
1664 } while (0)
1665
a5d301df
KH
1666#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
1667 do { \
1668 int c_alt, charset_alt = (charset); \
1669 if (!NILP (unification_table) \
1670 && ((c_alt = unify_char (unification_table, \
1671 -1, (charset), c1, c2)) >= 0)) \
1672 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1673 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
1674 DECODE_CHARACTER_ASCII (c1); \
1675 else if (CHARSET_DIMENSION (charset_alt) == 1) \
1676 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
1677 else \
1678 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
1679 } while (0)
1680
1681#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
1682 do { \
1683 int c_alt, charset_alt; \
1684 if (!NILP (unification_table) \
1685 && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1686 >= 0)) \
1687 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1688 else \
1689 charset_alt = charset; \
1690 if (charset_alt == charset_ascii) \
1691 *dst++ = c1; \
1692 else if (CHARSET_DIMENSION (charset_alt) == 1) \
1693 { \
1694 if (sjis_p && charset_alt == charset_katakana_jisx0201) \
1695 *dst++ = c1; \
1696 else \
1697 *dst++ = charset_alt, *dst++ = c1; \
1698 } \
1699 else \
1700 { \
1701 c1 &= 0x7F, c2 &= 0x7F; \
1702 if (sjis_p && charset_alt == charset_jisx0208) \
1703 { \
1704 unsigned char s1, s2; \
1705 \
1706 ENCODE_SJIS (c1, c2, s1, s2); \
1707 *dst++ = s1, *dst++ = s2; \
1708 } \
1709 else if (!sjis_p \
1710 && (charset_alt == charset_big5_1 \
1711 || charset_alt == charset_big5_2)) \
1712 { \
1713 unsigned char b1, b2; \
1714 \
9ce27fde 1715 ENCODE_BIG5 (charset_alt, c1, c2, b1, b2); \
a5d301df
KH
1716 *dst++ = b1, *dst++ = b2; \
1717 } \
1718 else \
1719 *dst++ = charset_alt, *dst++ = c1, *dst++ = c2; \
1720 } \
1721 } while (0);
1722
4ed46869
KH
1723/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1724 Check if a text is encoded in SJIS. If it is, return
1725 CODING_CATEGORY_MASK_SJIS, else return 0. */
1726
1727int
1728detect_coding_sjis (src, src_end)
1729 unsigned char *src, *src_end;
1730{
1731 unsigned char c;
1732
1733 while (src < src_end)
1734 {
1735 c = *src++;
1736 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1737 return 0;
1738 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1739 {
1740 if (src < src_end && *src++ < 0x40)
1741 return 0;
1742 }
1743 }
1744 return CODING_CATEGORY_MASK_SJIS;
1745}
1746
1747/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1748 Check if a text is encoded in BIG5. If it is, return
1749 CODING_CATEGORY_MASK_BIG5, else return 0. */
1750
1751int
1752detect_coding_big5 (src, src_end)
1753 unsigned char *src, *src_end;
1754{
1755 unsigned char c;
1756
1757 while (src < src_end)
1758 {
1759 c = *src++;
1760 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1761 return 0;
1762 if (c >= 0xA1)
1763 {
1764 if (src >= src_end)
1765 break;
1766 c = *src++;
1767 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1768 return 0;
1769 }
1770 }
1771 return CODING_CATEGORY_MASK_BIG5;
1772}
1773
1774/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1775 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
1776
1777int
1778decode_coding_sjis_big5 (coding, source, destination,
1779 src_bytes, dst_bytes, consumed, sjis_p)
1780 struct coding_system *coding;
1781 unsigned char *source, *destination;
1782 int src_bytes, dst_bytes;
1783 int *consumed;
1784 int sjis_p;
1785{
1786 unsigned char *src = source;
1787 unsigned char *src_end = source + src_bytes;
1788 unsigned char *dst = destination;
1789 unsigned char *dst_end = destination + dst_bytes;
1790 /* Since the maximum bytes produced by each loop is 4, we subtract 3
1791 from DST_END to assure overflow checking is necessary only at the
1792 head of loop. */
1793 unsigned char *adjusted_dst_end = dst_end - 3;
a5d301df
KH
1794 Lisp_Object unification_table
1795 = coding->character_unification_table_for_decode;
1796
1797 if (!NILP (Venable_character_unification) && NILP (unification_table))
1798 unification_table = Vstandard_character_unification_table_for_decode;
4ed46869
KH
1799
1800 while (src < src_end && dst < adjusted_dst_end)
1801 {
1802 /* SRC_BASE remembers the start position in source in each loop.
1803 The loop will be exited when there's not enough source text
1804 to analyze two-byte character (within macro ONE_MORE_BYTE).
1805 In that case, SRC is reset to SRC_BASE before exiting. */
1806 unsigned char *src_base = src;
1807 unsigned char c1 = *src++, c2, c3, c4;
1808
1809 if (c1 == '\r')
1810 {
1811 if (coding->eol_type == CODING_EOL_CRLF)
1812 {
1813 ONE_MORE_BYTE (c2);
1814 if (c2 == '\n')
1815 *dst++ = c2;
1816 else
1817 /* To process C2 again, SRC is subtracted by 1. */
1818 *dst++ = c1, src--;
1819 }
1820 else
1821 *dst++ = c1;
1822 }
a5d301df 1823 else if (c1 < 0x20)
4ed46869 1824 *dst++ = c1;
a5d301df
KH
1825 else if (c1 < 0x80)
1826 DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
4ed46869
KH
1827 else if (c1 < 0xA0 || c1 >= 0xE0)
1828 {
1829 /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1830 if (sjis_p)
1831 {
1832 ONE_MORE_BYTE (c2);
1833 DECODE_SJIS (c1, c2, c3, c4);
a5d301df 1834 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
4ed46869
KH
1835 }
1836 else if (c1 >= 0xE0 && c1 < 0xFF)
1837 {
1838 int charset;
1839
1840 ONE_MORE_BYTE (c2);
1841 DECODE_BIG5 (c1, c2, charset, c3, c4);
a5d301df 1842 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
4ed46869
KH
1843 }
1844 else /* Invalid code */
1845 *dst++ = c1;
1846 }
1847 else
1848 {
1849 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1850 if (sjis_p)
a5d301df 1851 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1, /* dummy */ c2);
4ed46869
KH
1852 else
1853 {
1854 int charset;
1855
1856 ONE_MORE_BYTE (c2);
1857 DECODE_BIG5 (c1, c2, charset, c3, c4);
a5d301df 1858 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
4ed46869
KH
1859 }
1860 }
1861 continue;
1862
1863 label_end_of_loop:
1864 coding->carryover_size = src - src_base;
1865 bcopy (src_base, coding->carryover, coding->carryover_size);
1866 src = src_base;
1867 break;
1868 }
1869
1870 *consumed = src - source;
1871 return dst - destination;
1872}
1873
1874/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1875 This function can encode `charset_ascii', `charset_katakana_jisx0201',
1876 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
1877 sure that all these charsets are registered as official charset
1878 (i.e. do not have extended leading-codes). Characters of other
1879 charsets are produced without any encoding. If SJIS_P is 1, encode
1880 SJIS text, else encode BIG5 text. */
1881
1882int
1883encode_coding_sjis_big5 (coding, source, destination,
1884 src_bytes, dst_bytes, consumed, sjis_p)
1885 struct coding_system *coding;
1886 unsigned char *source, *destination;
1887 int src_bytes, dst_bytes;
1888 int *consumed;
1889 int sjis_p;
1890{
1891 unsigned char *src = source;
1892 unsigned char *src_end = source + src_bytes;
1893 unsigned char *dst = destination;
1894 unsigned char *dst_end = destination + dst_bytes;
1895 /* Since the maximum bytes produced by each loop is 2, we subtract 1
1896 from DST_END to assure overflow checking is necessary only at the
1897 head of loop. */
1898 unsigned char *adjusted_dst_end = dst_end - 1;
a5d301df
KH
1899 Lisp_Object unification_table
1900 = coding->character_unification_table_for_encode;
1901
1902 if (!NILP (Venable_character_unification) && NILP (unification_table))
1903 unification_table = Vstandard_character_unification_table_for_encode;
4ed46869
KH
1904
1905 while (src < src_end && dst < adjusted_dst_end)
1906 {
1907 /* SRC_BASE remembers the start position in source in each loop.
1908 The loop will be exited when there's not enough source text
1909 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1910 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
1911 before exiting. */
1912 unsigned char *src_base = src;
1913 unsigned char c1 = *src++, c2, c3, c4;
1914
1915 if (coding->composing)
1916 {
1917 if (c1 == 0xA0)
1918 {
1919 ONE_MORE_BYTE (c1);
1920 c1 &= 0x7F;
1921 }
1922 else if (c1 >= 0xA0)
1923 c1 -= 0x20;
1924 else
1925 coding->composing = 0;
1926 }
1927
1928 switch (emacs_code_class[c1])
1929 {
1930 case EMACS_ascii_code:
a5d301df
KH
1931 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1932 break;
1933
4ed46869
KH
1934 case EMACS_control_code:
1935 *dst++ = c1;
1936 break;
1937
1938 case EMACS_carriage_return_code:
1939 if (!coding->selective)
1940 {
1941 *dst++ = c1;
1942 break;
1943 }
1944 /* fall down to treat '\r' as '\n' ... */
1945
1946 case EMACS_linefeed_code:
1947 if (coding->eol_type == CODING_EOL_LF
0ef69138 1948 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
1949 *dst++ = '\n';
1950 else if (coding->eol_type == CODING_EOL_CRLF)
1951 *dst++ = '\r', *dst++ = '\n';
1952 else
1953 *dst++ = '\r';
1954 break;
1955
1956 case EMACS_leading_code_2:
1957 ONE_MORE_BYTE (c2);
a5d301df 1958 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
1959 break;
1960
1961 case EMACS_leading_code_3:
1962 TWO_MORE_BYTES (c2, c3);
a5d301df 1963 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
4ed46869
KH
1964 break;
1965
1966 case EMACS_leading_code_4:
1967 THREE_MORE_BYTES (c2, c3, c4);
a5d301df 1968 ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
4ed46869
KH
1969 break;
1970
1971 case EMACS_leading_code_composition:
1972 coding->composing = 1;
1973 break;
1974
1975 default: /* i.e. case EMACS_invalid_code: */
1976 *dst++ = c1;
1977 }
1978 continue;
1979
1980 label_end_of_loop:
76376439 1981 coding->carryover_size = src_end - src_base;
4ed46869 1982 bcopy (src_base, coding->carryover, coding->carryover_size);
76376439 1983 src = src_end;
4ed46869
KH
1984 break;
1985 }
1986
1987 *consumed = src - source;
1988 return dst - destination;
1989}
1990
1991\f
1992/*** 5. End-of-line handlers ***/
1993
1994/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1995 This function is called only when `coding->eol_type' is
1996 CODING_EOL_CRLF or CODING_EOL_CR. */
1997
1998decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1999 struct coding_system *coding;
2000 unsigned char *source, *destination;
2001 int src_bytes, dst_bytes;
2002 int *consumed;
2003{
2004 unsigned char *src = source;
2005 unsigned char *src_end = source + src_bytes;
2006 unsigned char *dst = destination;
2007 unsigned char *dst_end = destination + dst_bytes;
2008 int produced;
2009
2010 switch (coding->eol_type)
2011 {
2012 case CODING_EOL_CRLF:
2013 {
2014 /* Since the maximum bytes produced by each loop is 2, we
2015 subtract 1 from DST_END to assure overflow checking is
2016 necessary only at the head of loop. */
2017 unsigned char *adjusted_dst_end = dst_end - 1;
2018
2019 while (src < src_end && dst < adjusted_dst_end)
2020 {
2021 unsigned char *src_base = src;
2022 unsigned char c = *src++;
2023 if (c == '\r')
2024 {
2025 ONE_MORE_BYTE (c);
2026 if (c != '\n')
2027 *dst++ = '\r';
bfd99048 2028 *dst++ = c;
4ed46869
KH
2029 }
2030 else
2031 *dst++ = c;
2032 continue;
2033
2034 label_end_of_loop:
2035 coding->carryover_size = src - src_base;
2036 bcopy (src_base, coding->carryover, coding->carryover_size);
2037 src = src_base;
2038 break;
2039 }
2040 *consumed = src - source;
2041 produced = dst - destination;
2042 break;
2043 }
2044
2045 case CODING_EOL_CR:
2046 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2047 bcopy (source, destination, produced);
2048 dst_end = destination + produced;
2049 while (dst < dst_end)
2050 if (*dst++ == '\r') dst[-1] = '\n';
2051 *consumed = produced;
2052 break;
2053
2054 default: /* i.e. case: CODING_EOL_LF */
2055 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2056 bcopy (source, destination, produced);
2057 *consumed = produced;
2058 break;
2059 }
2060
2061 return produced;
2062}
2063
2064/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2065 format of end-of-line according to `coding->eol_type'. If
2066 `coding->selective' is 1, code '\r' in source text also means
2067 end-of-line. */
2068
2069encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2070 struct coding_system *coding;
2071 unsigned char *source, *destination;
2072 int src_bytes, dst_bytes;
2073 int *consumed;
2074{
2075 unsigned char *src = source;
2076 unsigned char *dst = destination;
2077 int produced;
2078
2079 if (src_bytes <= 0)
2080 return 0;
2081
2082 switch (coding->eol_type)
2083 {
2084 case CODING_EOL_LF:
0ef69138 2085 case CODING_EOL_UNDECIDED:
4ed46869
KH
2086 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2087 bcopy (source, destination, produced);
2088 if (coding->selective)
2089 {
2090 int i = produced;
2091 while (i--)
2092 if (*dst++ == '\r') dst[-1] = '\n';
2093 }
2094 *consumed = produced;
2095
2096 case CODING_EOL_CRLF:
2097 {
2098 unsigned char c;
2099 unsigned char *src_end = source + src_bytes;
2100 unsigned char *dst_end = destination + dst_bytes;
2101 /* Since the maximum bytes produced by each loop is 2, we
2102 subtract 1 from DST_END to assure overflow checking is
2103 necessary only at the head of loop. */
2104 unsigned char *adjusted_dst_end = dst_end - 1;
2105
2106 while (src < src_end && dst < adjusted_dst_end)
2107 {
2108 c = *src++;
2109 if (c == '\n' || (c == '\r' && coding->selective))
2110 *dst++ = '\r', *dst++ = '\n';
2111 else
2112 *dst++ = c;
2113 }
2114 produced = dst - destination;
2115 *consumed = src - source;
2116 break;
2117 }
2118
2119 default: /* i.e. case CODING_EOL_CR: */
2120 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2121 bcopy (source, destination, produced);
2122 {
2123 int i = produced;
2124 while (i--)
2125 if (*dst++ == '\n') dst[-1] = '\r';
2126 }
2127 *consumed = produced;
2128 }
2129
2130 return produced;
2131}
2132
2133\f
2134/*** 6. C library functions ***/
2135
2136/* In Emacs Lisp, coding system is represented by a Lisp symbol which
2137 has a property `coding-system'. The value of this property is a
2138 vector of length 5 (called as coding-vector). Among elements of
2139 this vector, the first (element[0]) and the fifth (element[4])
2140 carry important information for decoding/encoding. Before
2141 decoding/encoding, this information should be set in fields of a
2142 structure of type `coding_system'.
2143
2144 A value of property `coding-system' can be a symbol of another
2145 subsidiary coding-system. In that case, Emacs gets coding-vector
2146 from that symbol.
2147
2148 `element[0]' contains information to be set in `coding->type'. The
2149 value and its meaning is as follows:
2150
0ef69138
KH
2151 0 -- coding_type_emacs_mule
2152 1 -- coding_type_sjis
2153 2 -- coding_type_iso2022
2154 3 -- coding_type_big5
2155 4 -- coding_type_ccl encoder/decoder written in CCL
2156 nil -- coding_type_no_conversion
2157 t -- coding_type_undecided (automatic conversion on decoding,
2158 no-conversion on encoding)
4ed46869
KH
2159
2160 `element[4]' contains information to be set in `coding->flags' and
2161 `coding->spec'. The meaning varies by `coding->type'.
2162
2163 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2164 of length 32 (of which the first 13 sub-elements are used now).
2165 Meanings of these sub-elements are:
2166
2167 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2168 If the value is an integer of valid charset, the charset is
2169 assumed to be designated to graphic register N initially.
2170
2171 If the value is minus, it is a minus value of charset which
2172 reserves graphic register N, which means that the charset is
2173 not designated initially but should be designated to graphic
2174 register N just before encoding a character in that charset.
2175
2176 If the value is nil, graphic register N is never used on
2177 encoding.
2178
2179 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2180 Each value takes t or nil. See the section ISO2022 of
2181 `coding.h' for more information.
2182
2183 If `coding->type' is `coding_type_big5', element[4] is t to denote
2184 BIG5-ETen or nil to denote BIG5-HKU.
2185
2186 If `coding->type' takes the other value, element[4] is ignored.
2187
2188 Emacs Lisp's coding system also carries information about format of
2189 end-of-line in a value of property `eol-type'. If the value is
2190 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2191 means CODING_EOL_CR. If it is not integer, it should be a vector
2192 of subsidiary coding systems of which property `eol-type' has one
2193 of above values.
2194
2195*/
2196
2197/* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2198 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2199 is setup so that no conversion is necessary and return -1, else
2200 return 0. */
2201
2202int
e0e989f6
KH
2203setup_coding_system (coding_system, coding)
2204 Lisp_Object coding_system;
4ed46869
KH
2205 struct coding_system *coding;
2206{
4ed46869
KH
2207 Lisp_Object type, eol_type;
2208
f4dee582 2209 /* At first, set several fields to default values. */
4ed46869
KH
2210 coding->require_flushing = 0;
2211 coding->last_block = 0;
2212 coding->selective = 0;
2213 coding->composing = 0;
2214 coding->direction = 0;
2215 coding->carryover_size = 0;
4ed46869 2216 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
a5d301df
KH
2217 coding->character_unification_table_for_decode = Qnil;
2218 coding->character_unification_table_for_encode = Qnil;
4ed46869 2219
e0e989f6
KH
2220 Vlast_coding_system_used = coding->symbol = coding_system;
2221 eol_type = Qnil;
2222 /* Get value of property `coding-system' until we get a vector.
2223 While doing that, also get values of properties
a5d301df
KH
2224 `post-read-conversion', `pre-write-conversion',
2225 `character-unification-table-for-decode',
2226 `character-unification-table-for-encode' and `eol-type'. */
e0e989f6 2227 while (!NILP (coding_system) && SYMBOLP (coding_system))
4ed46869 2228 {
4ed46869 2229 if (NILP (coding->post_read_conversion))
e0e989f6 2230 coding->post_read_conversion = Fget (coding_system,
4ed46869 2231 Qpost_read_conversion);
e0e989f6
KH
2232 if (NILP (coding->pre_write_conversion))
2233 coding->pre_write_conversion = Fget (coding_system,
4ed46869 2234 Qpre_write_conversion);
9ce27fde 2235 if (!inhibit_eol_conversion && NILP (eol_type))
e0e989f6 2236 eol_type = Fget (coding_system, Qeol_type);
a5d301df
KH
2237
2238 if (NILP (coding->character_unification_table_for_decode))
2239 coding->character_unification_table_for_decode
2240 = Fget (coding_system, Qcharacter_unification_table_for_decode);
2241
2242 if (NILP (coding->character_unification_table_for_encode))
2243 coding->character_unification_table_for_encode
2244 = Fget (coding_system, Qcharacter_unification_table_for_encode);
2245
e0e989f6 2246 coding_system = Fget (coding_system, Qcoding_system);
4ed46869 2247 }
a5d301df
KH
2248
2249 while (!NILP (coding->character_unification_table_for_decode)
2250 && SYMBOLP (coding->character_unification_table_for_decode))
2251 coding->character_unification_table_for_decode
2252 = Fget (coding->character_unification_table_for_decode,
2253 Qcharacter_unification_table_for_decode);
2254 if (!NILP (coding->character_unification_table_for_decode)
2255 && !CHAR_TABLE_P (coding->character_unification_table_for_decode))
2256 coding->character_unification_table_for_decode = Qnil;
2257
2258 while (!NILP (coding->character_unification_table_for_encode)
2259 && SYMBOLP (coding->character_unification_table_for_encode))
2260 coding->character_unification_table_for_encode
2261 = Fget (coding->character_unification_table_for_encode,
2262 Qcharacter_unification_table_for_encode);
2263 if (!NILP (coding->character_unification_table_for_encode)
2264 && !CHAR_TABLE_P (coding->character_unification_table_for_encode))
2265 coding->character_unification_table_for_encode = Qnil;
2266
e0e989f6
KH
2267 if (!VECTORP (coding_system)
2268 || XVECTOR (coding_system)->size != 5)
4ed46869
KH
2269 goto label_invalid_coding_system;
2270
4ed46869 2271 if (VECTORP (eol_type))
0ef69138 2272 coding->eol_type = CODING_EOL_UNDECIDED;
4ed46869
KH
2273 else if (XFASTINT (eol_type) == 1)
2274 coding->eol_type = CODING_EOL_CRLF;
2275 else if (XFASTINT (eol_type) == 2)
2276 coding->eol_type = CODING_EOL_CR;
2277 else
2278 coding->eol_type = CODING_EOL_LF;
2279
e0e989f6 2280 type = XVECTOR (coding_system)->contents[0];
4ed46869
KH
2281 switch (XFASTINT (type))
2282 {
2283 case 0:
0ef69138 2284 coding->type = coding_type_emacs_mule;
4ed46869
KH
2285 break;
2286
2287 case 1:
2288 coding->type = coding_type_sjis;
2289 break;
2290
2291 case 2:
2292 coding->type = coding_type_iso2022;
2293 {
e0e989f6 2294 Lisp_Object val = XVECTOR (coding_system)->contents[4];
4ed46869
KH
2295 Lisp_Object *flags;
2296 int i, charset, default_reg_bits = 0;
2297
2298 if (!VECTORP (val) || XVECTOR (val)->size != 32)
2299 goto label_invalid_coding_system;
2300
2301 flags = XVECTOR (val)->contents;
2302 coding->flags
2303 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2304 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2305 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2306 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2307 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2308 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2309 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2310 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
e0e989f6
KH
2311 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2312 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2313 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL));
4ed46869
KH
2314
2315 /* Invoke graphic register 0 to plane 0. */
2316 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2317 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2318 CODING_SPEC_ISO_INVOCATION (coding, 1)
2319 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2320 /* Not single shifting at first. */
2321 CODING_SPEC_ISO_SINGLE_SHIFTING(coding) = 0;
e0e989f6
KH
2322 /* Beginning of buffer should also be regarded as bol. */
2323 CODING_SPEC_ISO_BOL(coding) = 1;
4ed46869
KH
2324
2325 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2326 FLAGS[REG] can be one of below:
2327 integer CHARSET: CHARSET occupies register I,
2328 t: designate nothing to REG initially, but can be used
2329 by any charsets,
2330 list of integer, nil, or t: designate the first
2331 element (if integer) to REG initially, the remaining
2332 elements (if integer) is designated to REG on request,
2333 if an element is t, REG can be used by any charset,
2334 nil: REG is never used. */
467e7675 2335 for (charset = 0; charset <= MAX_CHARSET; charset++)
1ba9e4ab
KH
2336 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2337 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
4ed46869
KH
2338 for (i = 0; i < 4; i++)
2339 {
2340 if (INTEGERP (flags[i])
e0e989f6
KH
2341 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2342 || (charset = get_charset_id (flags[i])) >= 0)
4ed46869
KH
2343 {
2344 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2345 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2346 }
2347 else if (EQ (flags[i], Qt))
2348 {
2349 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2350 default_reg_bits |= 1 << i;
2351 }
2352 else if (CONSP (flags[i]))
2353 {
2354 Lisp_Object tail = flags[i];
2355
2356 if (INTEGERP (XCONS (tail)->car)
2357 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
2358 CHARSET_VALID_P (charset))
2359 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
4ed46869
KH
2360 {
2361 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2362 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2363 }
2364 else
2365 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2366 tail = XCONS (tail)->cdr;
2367 while (CONSP (tail))
2368 {
2369 if (INTEGERP (XCONS (tail)->car)
2370 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
2371 CHARSET_VALID_P (charset))
2372 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
4ed46869
KH
2373 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2374 = i;
2375 else if (EQ (XCONS (tail)->car, Qt))
2376 default_reg_bits |= 1 << i;
2377 tail = XCONS (tail)->cdr;
2378 }
2379 }
2380 else
2381 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2382
2383 CODING_SPEC_ISO_DESIGNATION (coding, i)
2384 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2385 }
2386
2387 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2388 {
2389 /* REG 1 can be used only by locking shift in 7-bit env. */
2390 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2391 default_reg_bits &= ~2;
2392 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2393 /* Without any shifting, only REG 0 and 1 can be used. */
2394 default_reg_bits &= 3;
2395 }
2396
467e7675 2397 for (charset = 0; charset <= MAX_CHARSET; charset++)
4ed46869 2398 if (CHARSET_VALID_P (charset)
1ba9e4ab
KH
2399 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2400 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
4ed46869
KH
2401 {
2402 /* We have not yet decided where to designate CHARSET. */
2403 int reg_bits = default_reg_bits;
2404
2405 if (CHARSET_CHARS (charset) == 96)
2406 /* A charset of CHARS96 can't be designated to REG 0. */
2407 reg_bits &= ~1;
2408
2409 if (reg_bits)
2410 /* There exist some default graphic register. */
2411 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2412 = (reg_bits & 1
2413 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2414 else
2415 /* We anyway have to designate CHARSET to somewhere. */
2416 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2417 = (CHARSET_CHARS (charset) == 94
2418 ? 0
2419 : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2420 || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2421 ? 1
2422 : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2423 ? 2 : 0)));
2424 }
2425 }
2426 coding->require_flushing = 1;
2427 break;
2428
2429 case 3:
2430 coding->type = coding_type_big5;
2431 coding->flags
e0e989f6 2432 = (NILP (XVECTOR (coding_system)->contents[4])
4ed46869
KH
2433 ? CODING_FLAG_BIG5_HKU
2434 : CODING_FLAG_BIG5_ETEN);
2435 break;
2436
2437 case 4:
2438 coding->type = coding_type_ccl;
2439 {
e0e989f6 2440 Lisp_Object val = XVECTOR (coding_system)->contents[4];
4ed46869
KH
2441 if (CONSP (val)
2442 && VECTORP (XCONS (val)->car)
2443 && VECTORP (XCONS (val)->cdr))
2444 {
2445 setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2446 setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2447 }
2448 else
2449 goto label_invalid_coding_system;
2450 }
2451 coding->require_flushing = 1;
2452 break;
2453
2454 default:
2455 if (EQ (type, Qt))
0ef69138 2456 coding->type = coding_type_undecided;
4ed46869
KH
2457 else
2458 coding->type = coding_type_no_conversion;
2459 break;
2460 }
2461 return 0;
2462
2463 label_invalid_coding_system:
2464 coding->type = coding_type_no_conversion;
dec137e5 2465 coding->eol_type = CODING_EOL_LF;
e0e989f6
KH
2466 coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2467 = Qnil;
4ed46869
KH
2468 return -1;
2469}
2470
2471/* Emacs has a mechanism to automatically detect a coding system if it
2472 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
2473 it's impossible to distinguish some coding systems accurately
2474 because they use the same range of codes. So, at first, coding
2475 systems are categorized into 7, those are:
2476
0ef69138 2477 o coding-category-emacs-mule
4ed46869
KH
2478
2479 The category for a coding system which has the same code range
2480 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 2481 symbol) `emacs-mule' by default.
4ed46869
KH
2482
2483 o coding-category-sjis
2484
2485 The category for a coding system which has the same code range
2486 as SJIS. Assigned the coding-system (Lisp
7717c392 2487 symbol) `japanese-shift-jis' by default.
4ed46869
KH
2488
2489 o coding-category-iso-7
2490
2491 The category for a coding system which has the same code range
7717c392
KH
2492 as ISO2022 of 7-bit environment. This doesn't use any locking
2493 shift and single shift functions. Assigned the coding-system
2494 (Lisp symbol) `iso-2022-7bit' by default.
4ed46869
KH
2495
2496 o coding-category-iso-8-1
2497
2498 The category for a coding system which has the same code range
2499 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
2500 for DIMENSION1 charset. This doesn't use any locking shift
2501 and single shift functions. Assigned the coding-system (Lisp
2502 symbol) `iso-latin-1' by default.
4ed46869
KH
2503
2504 o coding-category-iso-8-2
2505
2506 The category for a coding system which has the same code range
2507 as ISO2022 of 8-bit environment and graphic plane 1 used only
7717c392
KH
2508 for DIMENSION2 charset. This doesn't use any locking shift
2509 and single shift functions. Assigned the coding-system (Lisp
2510 symbol) `japanese-iso-8bit' by default.
4ed46869 2511
7717c392 2512 o coding-category-iso-7-else
4ed46869
KH
2513
2514 The category for a coding system which has the same code range
7717c392
KH
2515 as ISO2022 of 7-bit environemnt but uses locking shift or
2516 single shift functions. Assigned the coding-system (Lisp
2517 symbol) `iso-2022-7bit-lock' by default.
2518
2519 o coding-category-iso-8-else
2520
2521 The category for a coding system which has the same code range
2522 as ISO2022 of 8-bit environemnt but uses locking shift or
2523 single shift functions. Assigned the coding-system (Lisp
2524 symbol) `iso-2022-8bit-ss2' by default.
4ed46869
KH
2525
2526 o coding-category-big5
2527
2528 The category for a coding system which has the same code range
2529 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 2530 `cn-big5' by default.
4ed46869
KH
2531
2532 o coding-category-binary
2533
2534 The category for a coding system not categorized in any of the
2535 above. Assigned the coding-system (Lisp symbol)
e0e989f6 2536 `no-conversion' by default.
4ed46869
KH
2537
2538 Each of them is a Lisp symbol and the value is an actual
2539 `coding-system's (this is also a Lisp symbol) assigned by a user.
2540 What Emacs does actually is to detect a category of coding system.
2541 Then, it uses a `coding-system' assigned to it. If Emacs can't
2542 decide only one possible category, it selects a category of the
2543 highest priority. Priorities of categories are also specified by a
2544 user in a Lisp variable `coding-category-list'.
2545
2546*/
2547
2548/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2549 If it detects possible coding systems, return an integer in which
2550 appropriate flag bits are set. Flag bits are defined by macros
2551 CODING_CATEGORY_MASK_XXX in `coding.h'. */
2552
2553int
2554detect_coding_mask (src, src_bytes)
2555 unsigned char *src;
2556 int src_bytes;
2557{
2558 register unsigned char c;
2559 unsigned char *src_end = src + src_bytes;
2560 int mask;
2561
2562 /* At first, skip all ASCII characters and control characters except
2563 for three ISO2022 specific control characters. */
bcf26d6a 2564 label_loop_detect_coding:
4ed46869
KH
2565 while (src < src_end)
2566 {
2567 c = *src;
2568 if (c >= 0x80
2569 || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2570 break;
2571 src++;
2572 }
2573
2574 if (src >= src_end)
2575 /* We found nothing other than ASCII. There's nothing to do. */
2576 return CODING_CATEGORY_MASK_ANY;
2577
2578 /* The text seems to be encoded in some multilingual coding system.
2579 Now, try to find in which coding system the text is encoded. */
2580 if (c < 0x80)
bcf26d6a
KH
2581 {
2582 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2583 /* C is an ISO2022 specific control code of C0. */
2584 mask = detect_coding_iso2022 (src, src_end);
2585 src++;
2586 if (mask == CODING_CATEGORY_MASK_ANY)
2587 /* No valid ISO2022 code follows C. Try again. */
2588 goto label_loop_detect_coding;
2589 }
19a8d9e0 2590 else if (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
4ed46869
KH
2591 /* C is an ISO2022 specific control code of C1,
2592 or the first byte of SJIS's 2-byte character code,
2593 or a leading code of Emacs. */
2594 mask = (detect_coding_iso2022 (src, src_end)
2595 | detect_coding_sjis (src, src_end)
0ef69138 2596 | detect_coding_emacs_mule (src, src_end));
4ed46869 2597
19a8d9e0
KH
2598 else if (c == ISO_CODE_CSI
2599 && (src < src_end
2600 && (*src == ']'
2601 || (src + 1 < src_end
2602 && src[1] == ']'
2603 && (*src == '0' || *src == '1' || *src == '2')))))
2604 /* C is an ISO2022's control-sequence-introducer. */
2605 mask = (detect_coding_iso2022 (src, src_end)
2606 | detect_coding_sjis (src, src_end)
2607 | detect_coding_emacs_mule (src, src_end));
2608
4ed46869
KH
2609 else if (c < 0xA0)
2610 /* C is the first byte of SJIS character code,
2611 or a leading-code of Emacs. */
2612 mask = (detect_coding_sjis (src, src_end)
0ef69138 2613 | detect_coding_emacs_mule (src, src_end));
4ed46869
KH
2614
2615 else
2616 /* C is a character of ISO2022 in graphic plane right,
2617 or a SJIS's 1-byte character code (i.e. JISX0201),
2618 or the first byte of BIG5's 2-byte code. */
2619 mask = (detect_coding_iso2022 (src, src_end)
2620 | detect_coding_sjis (src, src_end)
2621 | detect_coding_big5 (src, src_end));
2622
2623 return mask;
2624}
2625
2626/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2627 The information of the detected coding system is set in CODING. */
2628
2629void
2630detect_coding (coding, src, src_bytes)
2631 struct coding_system *coding;
2632 unsigned char *src;
2633 int src_bytes;
2634{
2635 int mask = detect_coding_mask (src, src_bytes);
2636 int idx;
2637
2638 if (mask == CODING_CATEGORY_MASK_ANY)
2639 /* We found nothing other than ASCII. There's nothing to do. */
2640 return;
2641
2642 if (!mask)
2643 /* The source text seems to be encoded in unknown coding system.
2644 Emacs regards the category of such a kind of coding system as
2645 `coding-category-binary'. We assume that a user has assigned
2646 an appropriate coding system for a `coding-category-binary'. */
2647 idx = CODING_CATEGORY_IDX_BINARY;
2648 else
2649 {
2650 /* We found some plausible coding systems. Let's use a coding
2651 system of the highest priority. */
2652 Lisp_Object val = Vcoding_category_list;
2653
2654 if (CONSP (val))
2655 while (!NILP (val))
2656 {
2657 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2658 if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2659 break;
2660 val = XCONS (val)->cdr;
2661 }
2662 else
2663 val = Qnil;
2664
2665 if (NILP (val))
2666 {
2667 /* For unknown reason, `Vcoding_category_list' contains none
2668 of found categories. Let's use any of them. */
2669 for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2670 if (mask & (1 << idx))
2671 break;
2672 }
2673 }
2674 setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2675}
2676
2677/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2678 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
0ef69138 2679 CODING_EOL_CR, and CODING_EOL_UNDECIDED. */
4ed46869
KH
2680
2681int
2682detect_eol_type (src, src_bytes)
2683 unsigned char *src;
2684 int src_bytes;
2685{
2686 unsigned char *src_end = src + src_bytes;
2687 unsigned char c;
2688
2689 while (src < src_end)
2690 {
2691 c = *src++;
2692 if (c == '\n')
2693 return CODING_EOL_LF;
2694 else if (c == '\r')
2695 {
2696 if (src < src_end && *src == '\n')
2697 return CODING_EOL_CRLF;
2698 else
2699 return CODING_EOL_CR;
2700 }
2701 }
0ef69138 2702 return CODING_EOL_UNDECIDED;
4ed46869
KH
2703}
2704
2705/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2706 is encoded. If it detects an appropriate format of end-of-line, it
2707 sets the information in *CODING. */
2708
2709void
2710detect_eol (coding, src, src_bytes)
2711 struct coding_system *coding;
2712 unsigned char *src;
2713 int src_bytes;
2714{
2715 Lisp_Object val;
2716 int eol_type = detect_eol_type (src, src_bytes);
2717
0ef69138 2718 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2719 /* We found no end-of-line in the source text. */
2720 return;
2721
2722 val = Fget (coding->symbol, Qeol_type);
2723 if (VECTORP (val) && XVECTOR (val)->size == 3)
2724 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2725}
2726
2727/* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
2728 decoding, it may detect coding system and format of end-of-line if
2729 those are not yet decided. */
2730
2731int
2732decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2733 struct coding_system *coding;
2734 unsigned char *source, *destination;
2735 int src_bytes, dst_bytes;
2736 int *consumed;
2737{
2738 int produced;
2739
2740 if (src_bytes <= 0)
2741 {
2742 *consumed = 0;
2743 return 0;
2744 }
2745
0ef69138 2746 if (coding->type == coding_type_undecided)
4ed46869
KH
2747 detect_coding (coding, source, src_bytes);
2748
0ef69138 2749 if (coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2750 detect_eol (coding, source, src_bytes);
2751
2752 coding->carryover_size = 0;
2753 switch (coding->type)
2754 {
2755 case coding_type_no_conversion:
2756 label_no_conversion:
2757 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2758 bcopy (source, destination, produced);
2759 *consumed = produced;
2760 break;
2761
0ef69138
KH
2762 case coding_type_emacs_mule:
2763 case coding_type_undecided:
4ed46869 2764 if (coding->eol_type == CODING_EOL_LF
0ef69138 2765 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2766 goto label_no_conversion;
2767 produced = decode_eol (coding, source, destination,
2768 src_bytes, dst_bytes, consumed);
2769 break;
2770
2771 case coding_type_sjis:
2772 produced = decode_coding_sjis_big5 (coding, source, destination,
2773 src_bytes, dst_bytes, consumed,
2774 1);
2775 break;
2776
2777 case coding_type_iso2022:
2778 produced = decode_coding_iso2022 (coding, source, destination,
2779 src_bytes, dst_bytes, consumed);
2780 break;
2781
2782 case coding_type_big5:
2783 produced = decode_coding_sjis_big5 (coding, source, destination,
2784 src_bytes, dst_bytes, consumed,
2785 0);
2786 break;
2787
2788 case coding_type_ccl:
2789 produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2790 src_bytes, dst_bytes, consumed);
2791 break;
2792 }
2793
2794 return produced;
2795}
2796
2797/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
2798
2799int
2800encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2801 struct coding_system *coding;
2802 unsigned char *source, *destination;
2803 int src_bytes, dst_bytes;
2804 int *consumed;
2805{
2806 int produced;
2807
4ed46869
KH
2808 switch (coding->type)
2809 {
2810 case coding_type_no_conversion:
2811 label_no_conversion:
2812 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2813 if (produced > 0)
2814 {
2815 bcopy (source, destination, produced);
2816 if (coding->selective)
2817 {
2818 unsigned char *p = destination, *pend = destination + produced;
2819 while (p < pend)
e0e989f6 2820 if (*p++ == '\015') p[-1] = '\n';
4ed46869
KH
2821 }
2822 }
2823 *consumed = produced;
2824 break;
2825
0ef69138
KH
2826 case coding_type_emacs_mule:
2827 case coding_type_undecided:
4ed46869 2828 if (coding->eol_type == CODING_EOL_LF
0ef69138 2829 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2830 goto label_no_conversion;
2831 produced = encode_eol (coding, source, destination,
2832 src_bytes, dst_bytes, consumed);
2833 break;
2834
2835 case coding_type_sjis:
2836 produced = encode_coding_sjis_big5 (coding, source, destination,
2837 src_bytes, dst_bytes, consumed,
2838 1);
2839 break;
2840
2841 case coding_type_iso2022:
2842 produced = encode_coding_iso2022 (coding, source, destination,
2843 src_bytes, dst_bytes, consumed);
2844 break;
2845
2846 case coding_type_big5:
2847 produced = encode_coding_sjis_big5 (coding, source, destination,
2848 src_bytes, dst_bytes, consumed,
2849 0);
2850 break;
2851
2852 case coding_type_ccl:
2853 produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2854 src_bytes, dst_bytes, consumed);
2855 break;
2856 }
2857
2858 return produced;
2859}
2860
2861#define CONVERSION_BUFFER_EXTRA_ROOM 256
2862
2863/* Return maximum size (bytes) of a buffer enough for decoding
2864 SRC_BYTES of text encoded in CODING. */
2865
2866int
2867decoding_buffer_size (coding, src_bytes)
2868 struct coding_system *coding;
2869 int src_bytes;
2870{
2871 int magnification;
2872
2873 if (coding->type == coding_type_iso2022)
2874 magnification = 3;
2875 else if (coding->type == coding_type_ccl)
2876 magnification = coding->spec.ccl.decoder.buf_magnification;
2877 else
2878 magnification = 2;
2879
2880 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2881}
2882
2883/* Return maximum size (bytes) of a buffer enough for encoding
2884 SRC_BYTES of text to CODING. */
2885
2886int
2887encoding_buffer_size (coding, src_bytes)
2888 struct coding_system *coding;
2889 int src_bytes;
2890{
2891 int magnification;
2892
2893 if (coding->type == coding_type_ccl)
2894 magnification = coding->spec.ccl.encoder.buf_magnification;
2895 else
2896 magnification = 3;
2897
2898 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2899}
2900
2901#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
2902#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
2903#endif
2904
2905char *conversion_buffer;
2906int conversion_buffer_size;
2907
2908/* Return a pointer to a SIZE bytes of buffer to be used for encoding
2909 or decoding. Sufficient memory is allocated automatically. If we
2910 run out of memory, return NULL. */
2911
2912char *
2913get_conversion_buffer (size)
2914 int size;
2915{
2916 if (size > conversion_buffer_size)
2917 {
2918 char *buf;
2919 int real_size = conversion_buffer_size * 2;
2920
2921 while (real_size < size) real_size *= 2;
2922 buf = (char *) xmalloc (real_size);
2923 xfree (conversion_buffer);
2924 conversion_buffer = buf;
2925 conversion_buffer_size = real_size;
2926 }
2927 return conversion_buffer;
2928}
2929
2930\f
2931#ifdef emacs
2932/*** 7. Emacs Lisp library functions ***/
2933
02ba4723 2934DEFUN ("coding-system-spec", Fcoding_system_spec, Scoding_system_spec,
4ed46869 2935 1, 1, 0,
02ba4723 2936 "Return coding-spec of CODING-SYSTEM.\n\
4ed46869
KH
2937If CODING-SYSTEM is not a valid coding-system, return nil.")
2938 (obj)
2939 Lisp_Object obj;
2940{
2941 while (SYMBOLP (obj) && !NILP (obj))
2942 obj = Fget (obj, Qcoding_system);
2943 return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
2944 ? Qnil : obj);
2945}
2946
2947DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
2948 "Return t if OBJECT is nil or a coding-system.\n\
2949See document of make-coding-system for coding-system object.")
2950 (obj)
2951 Lisp_Object obj;
2952{
02ba4723 2953 return ((NILP (obj) || !NILP (Fcoding_system_spec (obj))) ? Qt : Qnil);
4ed46869
KH
2954}
2955
9d991de8
RS
2956DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
2957 Sread_non_nil_coding_system, 1, 1, 0,
e0e989f6 2958 "Read a coding system from the minibuffer, prompting with string PROMPT.")
4ed46869
KH
2959 (prompt)
2960 Lisp_Object prompt;
2961{
e0e989f6 2962 Lisp_Object val;
9d991de8
RS
2963 do
2964 {
02ba4723 2965 val = Fcompleting_read (prompt, Vobarray, Qcoding_system_spec,
9d991de8
RS
2966 Qt, Qnil, Qnil, Qnil);
2967 }
2968 while (XSTRING (val)->size == 0);
e0e989f6 2969 return (Fintern (val, Qnil));
4ed46869
KH
2970}
2971
2972DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
e0e989f6 2973 "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
4ed46869
KH
2974 (prompt)
2975 Lisp_Object prompt;
2976{
e0e989f6 2977 Lisp_Object val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
9d991de8 2978 Qt, Qnil, Qnil, Qnil);
e0e989f6 2979 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
2980}
2981
2982DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
2983 1, 1, 0,
2984 "Check validity of CODING-SYSTEM.\n\
2985If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
2986CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
2987The value of property should be a vector of length 5.")
2988 (coding_system)
2989 Lisp_Object coding_system;
2990{
2991 CHECK_SYMBOL (coding_system, 0);
2992 if (!NILP (Fcoding_system_p (coding_system)))
2993 return coding_system;
2994 while (1)
02ba4723 2995 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4ed46869
KH
2996}
2997
2998DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
2999 2, 2, 0,
3000 "Detect coding-system of the text in the region between START and END.\n\
3001Return a list of possible coding-systems ordered by priority.\n\
0ef69138 3002If only ASCII characters are found, it returns `undecided'\n\
4ed46869
KH
3003 or its subsidiary coding-system according to a detected end-of-line format.")
3004 (b, e)
3005 Lisp_Object b, e;
3006{
3007 int coding_mask, eol_type;
3008 Lisp_Object val;
3009 int beg, end;
3010
3011 validate_region (&b, &e);
3012 beg = XINT (b), end = XINT (e);
3013 if (beg < GPT && end >= GPT) move_gap (end);
3014
3015 coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
3016 eol_type = detect_eol_type (POS_ADDR (beg), end - beg);
3017
3018 if (coding_mask == CODING_CATEGORY_MASK_ANY)
3019 {
0ef69138
KH
3020 val = intern ("undecided");
3021 if (eol_type != CODING_EOL_UNDECIDED)
4ed46869
KH
3022 {
3023 Lisp_Object val2 = Fget (val, Qeol_type);
3024 if (VECTORP (val2))
3025 val = XVECTOR (val2)->contents[eol_type];
3026 }
3027 }
3028 else
3029 {
3030 Lisp_Object val2;
3031
3032 /* At first, gather possible coding-systems in VAL in a reverse
3033 order. */
3034 val = Qnil;
3035 for (val2 = Vcoding_category_list;
3036 !NILP (val2);
3037 val2 = XCONS (val2)->cdr)
3038 {
3039 int idx
3040 = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
3041 if (coding_mask & (1 << idx))
3042 val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
3043 }
3044
3045 /* Then, change the order of the list, while getting subsidiary
3046 coding-systems. */
3047 val2 = val;
3048 val = Qnil;
3049 for (; !NILP (val2); val2 = XCONS (val2)->cdr)
3050 {
0ef69138 3051 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
3052 val = Fcons (XCONS (val2)->car, val);
3053 else
3054 {
3055 Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
3056 if (VECTORP (val3))
3057 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
3058 else
3059 val = Fcons (XCONS (val2)->car, val);
3060 }
3061 }
3062 }
3063
3064 return val;
3065}
3066
3067/* Scan text in the region between *BEGP and *ENDP, skip characters
3068 which we never have to encode to (iff ENCODEP is 1) or decode from
3069 coding system CODING at the head and tail, then set BEGP and ENDP
3070 to the addresses of start and end of the text we actually convert. */
3071
3072void
3073shrink_conversion_area (begp, endp, coding, encodep)
3074 unsigned char **begp, **endp;
3075 struct coding_system *coding;
3076 int encodep;
3077{
3078 register unsigned char *beg_addr = *begp, *end_addr = *endp;
3079
3080 if (coding->eol_type != CODING_EOL_LF
0ef69138 3081 && coding->eol_type != CODING_EOL_UNDECIDED)
4ed46869
KH
3082 /* Since we anyway have to convert end-of-line format, it is not
3083 worth skipping at most 100 bytes or so. */
3084 return;
3085
3086 if (encodep) /* for encoding */
3087 {
3088 switch (coding->type)
3089 {
3090 case coding_type_no_conversion:
0ef69138
KH
3091 case coding_type_emacs_mule:
3092 case coding_type_undecided:
4ed46869
KH
3093 /* We need no conversion. */
3094 *begp = *endp;
3095 return;
3096 case coding_type_ccl:
3097 /* We can't skip any data. */
3098 return;
e0e989f6
KH
3099 case coding_type_iso2022:
3100 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3101 {
3102 unsigned char *bol = beg_addr;
3103 while (beg_addr < end_addr && *beg_addr < 0x80)
3104 {
3105 beg_addr++;
3106 if (*(beg_addr - 1) == '\n')
3107 bol = beg_addr;
3108 }
3109 beg_addr = bol;
3110 goto label_skip_tail;
3111 }
3112 /* fall down ... */
4ed46869
KH
3113 default:
3114 /* We can skip all ASCII characters at the head and tail. */
3115 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
e0e989f6 3116 label_skip_tail:
4ed46869
KH
3117 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3118 break;
3119 }
3120 }
3121 else /* for decoding */
3122 {
3123 switch (coding->type)
3124 {
3125 case coding_type_no_conversion:
3126 /* We need no conversion. */
3127 *begp = *endp;
3128 return;
0ef69138 3129 case coding_type_emacs_mule:
4ed46869
KH
3130 if (coding->eol_type == CODING_EOL_LF)
3131 {
3132 /* We need no conversion. */
3133 *begp = *endp;
3134 return;
3135 }
3136 /* We can skip all but carriage-return. */
3137 while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
3138 while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
3139 break;
3140 case coding_type_sjis:
3141 case coding_type_big5:
3142 /* We can skip all ASCII characters at the head. */
3143 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3144 /* We can skip all ASCII characters at the tail except for
3145 the second byte of SJIS or BIG5 code. */
3146 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3147 if (end_addr != *endp)
3148 end_addr++;
3149 break;
3150 case coding_type_ccl:
3151 /* We can't skip any data. */
3152 return;
3153 default: /* i.e. case coding_type_iso2022: */
3154 {
3155 unsigned char c;
3156
3157 /* We can skip all ASCII characters except for a few
3158 control codes at the head. */
3159 while (beg_addr < end_addr && (c = *beg_addr) < 0x80
3160 && c != ISO_CODE_CR && c != ISO_CODE_SO
3161 && c != ISO_CODE_SI && c != ISO_CODE_ESC)
3162 beg_addr++;
3163 }
3164 break;
3165 }
3166 }
3167 *begp = beg_addr;
3168 *endp = end_addr;
3169 return;
3170}
3171
3172/* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3173 text between B and E. B and E are buffer position. */
3174
3175Lisp_Object
3176code_convert_region (b, e, coding, encodep)
3177 Lisp_Object b, e;
3178 struct coding_system *coding;
3179 int encodep;
3180{
3181 int beg, end, len, consumed, produced;
3182 char *buf;
3183 unsigned char *begp, *endp;
3184 int pos = PT;
3185
3186 validate_region (&b, &e);
3187 beg = XINT (b), end = XINT (e);
3188 if (beg < GPT && end >= GPT)
3189 move_gap (end);
3190
3191 if (encodep && !NILP (coding->pre_write_conversion))
3192 {
3193 /* We must call a pre-conversion function which may put a new
3194 text to be converted in a new buffer. */
3195 struct buffer *old = current_buffer, *new;
3196
3197 TEMP_SET_PT (beg);
3198 call2 (coding->pre_write_conversion, b, e);
3199 if (old != current_buffer)
3200 {
3201 /* Replace the original text by the text just generated. */
3202 len = ZV - BEGV;
3203 new = current_buffer;
3204 set_buffer_internal (old);
3205 del_range (beg, end);
3206 insert_from_buffer (new, 1, len, 0);
3207 end = beg + len;
3208 }
3209 }
3210
3211 /* We may be able to shrink the conversion region. */
3212 begp = POS_ADDR (beg); endp = begp + (end - beg);
3213 shrink_conversion_area (&begp, &endp, coding, encodep);
3214
3215 if (begp == endp)
3216 /* We need no conversion. */
3217 len = end - beg;
3218 else
3219 {
3220 beg += begp - POS_ADDR (beg);
3221 end = beg + (endp - begp);
3222
3223 if (encodep)
3224 len = encoding_buffer_size (coding, end - beg);
3225 else
3226 len = decoding_buffer_size (coding, end - beg);
3227 buf = get_conversion_buffer (len);
3228
3229 coding->last_block = 1;
3230 produced = (encodep
3231 ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3232 &consumed)
3233 : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3234 &consumed));
3235
3236 len = produced + (beg - XINT (b)) + (XINT (e) - end);
3237
3238 TEMP_SET_PT (beg);
3239 insert (buf, produced);
3240 del_range (PT, PT + end - beg);
3241 if (pos >= end)
3242 pos = PT + (pos - end);
3243 else if (pos > beg)
3244 pos = beg;
3245 TEMP_SET_PT (pos);
3246 }
3247
3248 if (!encodep && !NILP (coding->post_read_conversion))
3249 {
3250 /* We must call a post-conversion function which may alter
3251 the text just converted. */
3252 Lisp_Object insval;
3253
3254 beg = XINT (b);
3255 TEMP_SET_PT (beg);
3256 insval = call1 (coding->post_read_conversion, make_number (len));
3257 CHECK_NUMBER (insval, 0);
3258 len = XINT (insval);
3259 }
3260
3261 return make_number (len);
3262}
3263
3264Lisp_Object
e0e989f6
KH
3265code_convert_string (str, coding, encodep, nocopy)
3266 Lisp_Object str, nocopy;
4ed46869
KH
3267 struct coding_system *coding;
3268 int encodep;
3269{
3270 int len, consumed, produced;
3271 char *buf;
3272 unsigned char *begp, *endp;
3273 int head_skip, tail_skip;
3274 struct gcpro gcpro1;
3275
3276 if (encodep && !NILP (coding->pre_write_conversion)
3277 || !encodep && !NILP (coding->post_read_conversion))
3278 {
3279 /* Since we have to call Lisp functions which assume target text
3280 is in a buffer, after setting a temporary buffer, call
3281 code_convert_region. */
3282 int count = specpdl_ptr - specpdl;
3283 int len = XSTRING (str)->size;
3284 Lisp_Object result;
3285 struct buffer *old = current_buffer;
3286
3287 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3288 temp_output_buffer_setup (" *code-converting-work*");
3289 set_buffer_internal (XBUFFER (Vstandard_output));
3290 insert_from_string (str, 0, len, 0);
3291 code_convert_region (make_number (BEGV), make_number (ZV),
3292 coding, encodep);
3293 result = make_buffer_string (BEGV, ZV, 0);
3294 set_buffer_internal (old);
3295 return unbind_to (count, result);
3296 }
3297
3298 /* We may be able to shrink the conversion region. */
3299 begp = XSTRING (str)->data;
3300 endp = begp + XSTRING (str)->size;
3301 shrink_conversion_area (&begp, &endp, coding, encodep);
3302
3303 if (begp == endp)
3304 /* We need no conversion. */
e0e989f6 3305 return (NILP (nocopy) ? Fcopy_sequence (str) : str);
4ed46869
KH
3306
3307 head_skip = begp - XSTRING (str)->data;
3308 tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3309
3310 GCPRO1 (str);
3311
3312 if (encodep)
3313 len = encoding_buffer_size (coding, endp - begp);
3314 else
3315 len = decoding_buffer_size (coding, endp - begp);
3316 buf = get_conversion_buffer (len + head_skip + tail_skip);
3317
3318 bcopy (XSTRING (str)->data, buf, head_skip);
3319 coding->last_block = 1;
3320 produced = (encodep
3321 ? encode_coding (coding, XSTRING (str)->data + head_skip,
3322 buf + head_skip, endp - begp, len, &consumed)
3323 : decode_coding (coding, XSTRING (str)->data + head_skip,
3324 buf + head_skip, endp - begp, len, &consumed));
3325 bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3326 buf + head_skip + produced,
3327 tail_skip);
3328
3329 UNGCPRO;
3330
3331 return make_string (buf, head_skip + produced + tail_skip);
3332}
3333
3334DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
e0e989f6
KH
3335 3, 3, "r\nzCoding system: ",
3336 "Decode current region by specified coding system.\n\
3337When called from a program, takes three arguments:\n\
3338START, END, and CODING-SYSTEM. START END are buffer positions.\n\
4ed46869
KH
3339Return length of decoded text.")
3340 (b, e, coding_system)
3341 Lisp_Object b, e, coding_system;
3342{
3343 struct coding_system coding;
3344
3345 CHECK_NUMBER_COERCE_MARKER (b, 0);
3346 CHECK_NUMBER_COERCE_MARKER (e, 1);
3347 CHECK_SYMBOL (coding_system, 2);
3348
e0e989f6
KH
3349 if (NILP (coding_system))
3350 return make_number (XFASTINT (e) - XFASTINT (b));
4ed46869
KH
3351 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3352 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3353
3354 return code_convert_region (b, e, &coding, 0);
3355}
3356
3357DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
e0e989f6
KH
3358 3, 3, "r\nzCoding system: ",
3359 "Encode current region by specified coding system.\n\
3360When called from a program, takes three arguments:\n\
3361START, END, and CODING-SYSTEM. START END are buffer positions.\n\
4ed46869
KH
3362Return length of encoded text.")
3363 (b, e, coding_system)
3364 Lisp_Object b, e, coding_system;
3365{
3366 struct coding_system coding;
3367
3368 CHECK_NUMBER_COERCE_MARKER (b, 0);
3369 CHECK_NUMBER_COERCE_MARKER (e, 1);
3370 CHECK_SYMBOL (coding_system, 2);
3371
e0e989f6
KH
3372 if (NILP (coding_system))
3373 return make_number (XFASTINT (e) - XFASTINT (b));
4ed46869
KH
3374 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3375 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3376
3377 return code_convert_region (b, e, &coding, 1);
3378}
3379
3380DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
e0e989f6
KH
3381 2, 3, 0,
3382 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3383Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3384of decoding.")
3385 (string, coding_system, nocopy)
3386 Lisp_Object string, coding_system, nocopy;
4ed46869
KH
3387{
3388 struct coding_system coding;
3389
3390 CHECK_STRING (string, 0);
3391 CHECK_SYMBOL (coding_system, 1);
3392
e0e989f6
KH
3393 if (NILP (coding_system))
3394 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869
KH
3395 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3396 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3397
e0e989f6 3398 return code_convert_string (string, &coding, 0, nocopy);
4ed46869
KH
3399}
3400
3401DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
e0e989f6
KH
3402 2, 3, 0,
3403 "Encode STRING to CODING-SYSTEM, and return the result.\n\
3404Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3405of encoding.")
3406 (string, coding_system, nocopy)
3407 Lisp_Object string, coding_system, nocopy;
4ed46869
KH
3408{
3409 struct coding_system coding;
3410
3411 CHECK_STRING (string, 0);
3412 CHECK_SYMBOL (coding_system, 1);
3413
e0e989f6
KH
3414 if (NILP (coding_system))
3415 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869
KH
3416 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3417 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3418
e0e989f6 3419 return code_convert_string (string, &coding, 1, nocopy);
4ed46869
KH
3420}
3421
3422DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
e0e989f6 3423 "Decode a JISX0208 character of shift-jis encoding.\n\
4ed46869
KH
3424CODE is the character code in SJIS.\n\
3425Return the corresponding character.")
3426 (code)
3427 Lisp_Object code;
3428{
3429 unsigned char c1, c2, s1, s2;
3430 Lisp_Object val;
3431
3432 CHECK_NUMBER (code, 0);
3433 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3434 DECODE_SJIS (s1, s2, c1, c2);
3435 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3436 return val;
3437}
3438
3439DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3440 "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3441Return the corresponding character code in SJIS.")
3442 (ch)
3443 Lisp_Object ch;
3444{
bcf26d6a 3445 int charset, c1, c2, s1, s2;
4ed46869
KH
3446 Lisp_Object val;
3447
3448 CHECK_NUMBER (ch, 0);
3449 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3450 if (charset == charset_jisx0208)
3451 {
3452 ENCODE_SJIS (c1, c2, s1, s2);
bcf26d6a 3453 XSETFASTINT (val, (s1 << 8) | s2);
4ed46869
KH
3454 }
3455 else
3456 XSETFASTINT (val, 0);
3457 return val;
3458}
3459
3460DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3461 "Decode a Big5 character CODE of BIG5 coding-system.\n\
3462CODE is the character code in BIG5.\n\
3463Return the corresponding character.")
3464 (code)
3465 Lisp_Object code;
3466{
3467 int charset;
3468 unsigned char b1, b2, c1, c2;
3469 Lisp_Object val;
3470
3471 CHECK_NUMBER (code, 0);
3472 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3473 DECODE_BIG5 (b1, b2, charset, c1, c2);
3474 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3475 return val;
3476}
3477
3478DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3479 "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3480Return the corresponding character code in Big5.")
3481 (ch)
3482 Lisp_Object ch;
3483{
bcf26d6a 3484 int charset, c1, c2, b1, b2;
4ed46869
KH
3485 Lisp_Object val;
3486
3487 CHECK_NUMBER (ch, 0);
3488 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3489 if (charset == charset_big5_1 || charset == charset_big5_2)
3490 {
3491 ENCODE_BIG5 (charset, c1, c2, b1, b2);
bcf26d6a 3492 XSETFASTINT (val, (b1 << 8) | b2);
4ed46869
KH
3493 }
3494 else
3495 XSETFASTINT (val, 0);
3496 return val;
3497}
3498
1ba9e4ab
KH
3499DEFUN ("set-terminal-coding-system-internal",
3500 Fset_terminal_coding_system_internal,
3501 Sset_terminal_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
3502 (coding_system)
3503 Lisp_Object coding_system;
3504{
3505 CHECK_SYMBOL (coding_system, 0);
3506 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
4ed46869
KH
3507 return Qnil;
3508}
3509
3510DEFUN ("terminal-coding-system",
3511 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3512 "Return coding-system of your terminal.")
3513 ()
3514{
3515 return terminal_coding.symbol;
3516}
3517
1ba9e4ab
KH
3518DEFUN ("set-keyboard-coding-system-internal",
3519 Fset_keyboard_coding_system_internal,
3520 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
3521 (coding_system)
3522 Lisp_Object coding_system;
3523{
3524 CHECK_SYMBOL (coding_system, 0);
3525 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3526 return Qnil;
3527}
3528
3529DEFUN ("keyboard-coding-system",
3530 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3531 "Return coding-system of what is sent from terminal keyboard.")
3532 ()
3533{
3534 return keyboard_coding.symbol;
3535}
3536
3537\f
a5d301df
KH
3538DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
3539 Sfind_operation_coding_system, 1, MANY, 0,
3540 "Choose a coding system for an operation based on the target name.\n\
9ce27fde
KH
3541The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
3542DECODING-SYSTEM is the coding system to use for decoding\n\
3543\(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
3544for encoding (in case OPERATION does encoding).\n\
ccdb79f5
RS
3545\n\
3546The first argument OPERATION specifies an I/O primitive:\n\
3547 For file I/O, `insert-file-contents' or `write-region'.\n\
3548 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3549 For network I/O, `open-network-stream'.\n\
3550\n\
3551The remaining arguments should be the same arguments that were passed\n\
3552to the primitive. Depending on which primitive, one of those arguments\n\
3553is selected as the TARGET. For example, if OPERATION does file I/O,\n\
3554whichever argument specifies the file name is TARGET.\n\
3555\n\
3556TARGET has a meaning which depends on OPERATION:\n\
4ed46869
KH
3557 For file I/O, TARGET is a file name.\n\
3558 For process I/O, TARGET is a process name.\n\
3559 For network I/O, TARGET is a service name or a port number\n\
3560\n\
02ba4723
KH
3561This function looks up what specified for TARGET in,\n\
3562`file-coding-system-alist', `process-coding-system-alist',\n\
3563or `network-coding-system-alist' depending on OPERATION.\n\
3564They may specify a coding system, a cons of coding systems,\n\
3565or a function symbol to call.\n\
3566In the last case, we call the function with one argument,\n\
9ce27fde 3567which is a list of all the arguments given to this function.")
4ed46869
KH
3568 (nargs, args)
3569 int nargs;
3570 Lisp_Object *args;
3571{
3572 Lisp_Object operation, target_idx, target, val;
3573 register Lisp_Object chain;
3574
3575 if (nargs < 2)
3576 error ("Too few arguments");
3577 operation = args[0];
3578 if (!SYMBOLP (operation)
3579 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3580 error ("Invalid first arguement");
3581 if (nargs < 1 + XINT (target_idx))
3582 error ("Too few arguments for operation: %s",
3583 XSYMBOL (operation)->name->data);
3584 target = args[XINT (target_idx) + 1];
3585 if (!(STRINGP (target)
3586 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3587 error ("Invalid %dth argument", XINT (target_idx) + 1);
3588
2e34157c
RS
3589 chain = ((EQ (operation, Qinsert_file_contents)
3590 || EQ (operation, Qwrite_region))
02ba4723 3591 ? Vfile_coding_system_alist
2e34157c 3592 : (EQ (operation, Qopen_network_stream)
02ba4723
KH
3593 ? Vnetwork_coding_system_alist
3594 : Vprocess_coding_system_alist));
4ed46869
KH
3595 if (NILP (chain))
3596 return Qnil;
3597
02ba4723 3598 for (; CONSP (chain); chain = XCONS (chain)->cdr)
4ed46869
KH
3599 {
3600 Lisp_Object elt = XCONS (chain)->car;
3601
3602 if (CONSP (elt)
3603 && ((STRINGP (target)
3604 && STRINGP (XCONS (elt)->car)
3605 && fast_string_match (XCONS (elt)->car, target) >= 0)
3606 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
02ba4723
KH
3607 {
3608 val = XCONS (elt)->cdr;
3609 if (CONSP (val))
3610 return val;
3611 if (! SYMBOLP (val))
3612 return Qnil;
3613 if (! NILP (Fcoding_system_p (val)))
3614 return Fcons (val, val);
465edc86 3615 if (!NILP (Ffboundp (val)))
5d632ccf 3616 return call1 (val, Flist (nargs, args));
02ba4723
KH
3617 return Qnil;
3618 }
4ed46869
KH
3619 }
3620 return Qnil;
3621}
3622
3623#endif /* emacs */
3624
3625\f
3626/*** 8. Post-amble ***/
3627
3628init_coding_once ()
3629{
3630 int i;
3631
0ef69138 3632 /* Emacs' internal format specific initialize routine. */
4ed46869
KH
3633 for (i = 0; i <= 0x20; i++)
3634 emacs_code_class[i] = EMACS_control_code;
3635 emacs_code_class[0x0A] = EMACS_linefeed_code;
3636 emacs_code_class[0x0D] = EMACS_carriage_return_code;
3637 for (i = 0x21 ; i < 0x7F; i++)
3638 emacs_code_class[i] = EMACS_ascii_code;
3639 emacs_code_class[0x7F] = EMACS_control_code;
3640 emacs_code_class[0x80] = EMACS_leading_code_composition;
3641 for (i = 0x81; i < 0xFF; i++)
3642 emacs_code_class[i] = EMACS_invalid_code;
3643 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3644 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3645 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3646 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3647
3648 /* ISO2022 specific initialize routine. */
3649 for (i = 0; i < 0x20; i++)
3650 iso_code_class[i] = ISO_control_code;
3651 for (i = 0x21; i < 0x7F; i++)
3652 iso_code_class[i] = ISO_graphic_plane_0;
3653 for (i = 0x80; i < 0xA0; i++)
3654 iso_code_class[i] = ISO_control_code;
3655 for (i = 0xA1; i < 0xFF; i++)
3656 iso_code_class[i] = ISO_graphic_plane_1;
3657 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3658 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3659 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3660 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3661 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3662 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3663 iso_code_class[ISO_CODE_ESC] = ISO_escape;
3664 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3665 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3666 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3667
e0e989f6
KH
3668 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3669 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3670
3671 setup_coding_system (Qnil, &keyboard_coding);
3672 setup_coding_system (Qnil, &terminal_coding);
9ce27fde
KH
3673
3674#if defined (MSDOS) || defined (WINDOWSNT)
3675 system_eol_type = CODING_EOL_CRLF;
3676#else
3677 system_eol_type = CODING_EOL_LF;
3678#endif
e0e989f6
KH
3679}
3680
3681#ifdef emacs
3682
3683syms_of_coding ()
3684{
3685 Qtarget_idx = intern ("target-idx");
3686 staticpro (&Qtarget_idx);
3687
9ce27fde 3688 /* Target FILENAME is the first argument. */
e0e989f6 3689 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9ce27fde 3690 /* Target FILENAME is the third argument. */
e0e989f6
KH
3691 Fput (Qwrite_region, Qtarget_idx, make_number (2));
3692
3693 Qcall_process = intern ("call-process");
3694 staticpro (&Qcall_process);
9ce27fde 3695 /* Target PROGRAM is the first argument. */
e0e989f6
KH
3696 Fput (Qcall_process, Qtarget_idx, make_number (0));
3697
3698 Qcall_process_region = intern ("call-process-region");
3699 staticpro (&Qcall_process_region);
9ce27fde 3700 /* Target PROGRAM is the third argument. */
e0e989f6
KH
3701 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3702
3703 Qstart_process = intern ("start-process");
3704 staticpro (&Qstart_process);
9ce27fde 3705 /* Target PROGRAM is the third argument. */
e0e989f6
KH
3706 Fput (Qstart_process, Qtarget_idx, make_number (2));
3707
3708 Qopen_network_stream = intern ("open-network-stream");
3709 staticpro (&Qopen_network_stream);
9ce27fde 3710 /* Target SERVICE is the fourth argument. */
e0e989f6
KH
3711 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3712
4ed46869
KH
3713 Qcoding_system = intern ("coding-system");
3714 staticpro (&Qcoding_system);
3715
3716 Qeol_type = intern ("eol-type");
3717 staticpro (&Qeol_type);
3718
3719 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3720 staticpro (&Qbuffer_file_coding_system);
3721
3722 Qpost_read_conversion = intern ("post-read-conversion");
3723 staticpro (&Qpost_read_conversion);
3724
3725 Qpre_write_conversion = intern ("pre-write-conversion");
3726 staticpro (&Qpre_write_conversion);
3727
02ba4723
KH
3728 Qcoding_system_spec = intern ("coding-system-spec");
3729 staticpro (&Qcoding_system_spec);
4ed46869
KH
3730
3731 Qcoding_system_p = intern ("coding-system-p");
3732 staticpro (&Qcoding_system_p);
3733
3734 Qcoding_system_error = intern ("coding-system-error");
3735 staticpro (&Qcoding_system_error);
3736
3737 Fput (Qcoding_system_error, Qerror_conditions,
3738 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3739 Fput (Qcoding_system_error, Qerror_message,
9ce27fde 3740 build_string ("Invalid coding system"));
4ed46869
KH
3741
3742 Qcoding_category_index = intern ("coding-category-index");
3743 staticpro (&Qcoding_category_index);
3744
3745 {
3746 int i;
3747 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3748 {
3749 coding_category_table[i] = intern (coding_category_name[i]);
3750 staticpro (&coding_category_table[i]);
3751 Fput (coding_category_table[i], Qcoding_category_index,
3752 make_number (i));
3753 }
3754 }
3755
bdd9fb48
KH
3756 Qcharacter_unification_table = intern ("character-unification-table");
3757 staticpro (&Qcharacter_unification_table);
3758 Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
3759 make_number (0));
3760
a5d301df
KH
3761 Qcharacter_unification_table_for_decode
3762 = intern ("character-unification-table-for-decode");
3763 staticpro (&Qcharacter_unification_table_for_decode);
3764
3765 Qcharacter_unification_table_for_encode
3766 = intern ("character-unification-table-for-encode");
3767 staticpro (&Qcharacter_unification_table_for_encode);
3768
9ce27fde
KH
3769 Qemacs_mule = intern ("emacs-mule");
3770 staticpro (&Qemacs_mule);
3771
02ba4723 3772 defsubr (&Scoding_system_spec);
4ed46869
KH
3773 defsubr (&Scoding_system_p);
3774 defsubr (&Sread_coding_system);
3775 defsubr (&Sread_non_nil_coding_system);
3776 defsubr (&Scheck_coding_system);
3777 defsubr (&Sdetect_coding_region);
3778 defsubr (&Sdecode_coding_region);
3779 defsubr (&Sencode_coding_region);
3780 defsubr (&Sdecode_coding_string);
3781 defsubr (&Sencode_coding_string);
3782 defsubr (&Sdecode_sjis_char);
3783 defsubr (&Sencode_sjis_char);
3784 defsubr (&Sdecode_big5_char);
3785 defsubr (&Sencode_big5_char);
1ba9e4ab 3786 defsubr (&Sset_terminal_coding_system_internal);
4ed46869 3787 defsubr (&Sterminal_coding_system);
1ba9e4ab 3788 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869 3789 defsubr (&Skeyboard_coding_system);
a5d301df 3790 defsubr (&Sfind_operation_coding_system);
4ed46869
KH
3791
3792 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3793 "List of coding-categories (symbols) ordered by priority.");
3794 {
3795 int i;
3796
3797 Vcoding_category_list = Qnil;
3798 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3799 Vcoding_category_list
3800 = Fcons (coding_category_table[i], Vcoding_category_list);
3801 }
3802
3803 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
3804 "A variable of internal use only.\n\
3805If the value is a coding system, it is used for decoding on read operation.\n\
a67a9c66
RS
3806If not, an appropriate element is used from one of the coding system alists:\n\
3807There are three of such tables, `file-coding-system-alist',\n\
3808`process-coding-system-alist', and `network-coding-system-alist'.");
4ed46869
KH
3809 Vcoding_system_for_read = Qnil;
3810
3811 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
3812 "A variable of internal use only.\n\
3813If the value is a coding system, it is used for encoding on write operation.\n\
a67a9c66
RS
3814If not, an appropriate element is used from one of the coding system alists:\n\
3815There are three of such tables, `file-coding-system-alist',\n\
3816`process-coding-system-alist', and `network-coding-system-alist'.");
4ed46869
KH
3817 Vcoding_system_for_write = Qnil;
3818
3819 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
a67a9c66 3820 "Coding system used in the latest file or process I/O.");
4ed46869
KH
3821 Vlast_coding_system_used = Qnil;
3822
9ce27fde
KH
3823 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
3824 "*Non-nil inhibit code conversion of end-of-line format in any cases.");
3825 inhibit_eol_conversion = 0;
3826
02ba4723
KH
3827 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
3828 "Alist to decide a coding system to use for a file I/O operation.\n\
3829The format is ((PATTERN . VAL) ...),\n\
3830where PATTERN is a regular expression matching a file name,\n\
3831VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3832If VAL is a coding system, it is used for both decoding and encoding\n\
3833the file contents.\n\
3834If VAL is a cons of coding systems, the car part is used for decoding,\n\
3835and the cdr part is used for encoding.\n\
3836If VAL is a function symbol, the function must return a coding system\n\
3837or a cons of coding systems which are used as above.\n\
e0e989f6 3838\n\
9ce27fde 3839See also the function `find-operation-coding-system'.");
02ba4723
KH
3840 Vfile_coding_system_alist = Qnil;
3841
3842 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
3843 "Alist to decide a coding system to use for a process I/O operation.\n\
3844The format is ((PATTERN . VAL) ...),\n\
3845where PATTERN is a regular expression matching a program name,\n\
3846VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3847If VAL is a coding system, it is used for both decoding what received\n\
3848from the program and encoding what sent to the program.\n\
3849If VAL is a cons of coding systems, the car part is used for decoding,\n\
3850and the cdr part is used for encoding.\n\
3851If VAL is a function symbol, the function must return a coding system\n\
3852or a cons of coding systems which are used as above.\n\
4ed46869 3853\n\
9ce27fde 3854See also the function `find-operation-coding-system'.");
02ba4723
KH
3855 Vprocess_coding_system_alist = Qnil;
3856
3857 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
3858 "Alist to decide a coding system to use for a network I/O operation.\n\
3859The format is ((PATTERN . VAL) ...),\n\
3860where PATTERN is a regular expression matching a network service name\n\
3861or is a port number to connect to,\n\
3862VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3863If VAL is a coding system, it is used for both decoding what received\n\
3864from the network stream and encoding what sent to the network stream.\n\
3865If VAL is a cons of coding systems, the car part is used for decoding,\n\
3866and the cdr part is used for encoding.\n\
3867If VAL is a function symbol, the function must return a coding system\n\
3868or a cons of coding systems which are used as above.\n\
4ed46869 3869\n\
9ce27fde 3870See also the function `find-operation-coding-system'.");
02ba4723 3871 Vnetwork_coding_system_alist = Qnil;
4ed46869
KH
3872
3873 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
3874 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
458822a0 3875 eol_mnemonic_unix = ':';
4ed46869
KH
3876
3877 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
3878 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
458822a0 3879 eol_mnemonic_dos = '\\';
4ed46869
KH
3880
3881 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
3882 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
458822a0 3883 eol_mnemonic_mac = '/';
4ed46869
KH
3884
3885 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
3886 "Mnemonic character indicating end-of-line format is not yet decided.");
458822a0 3887 eol_mnemonic_undecided = ':';
4ed46869 3888
bdd9fb48
KH
3889 DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
3890 "Non-nil means ISO 2022 encoder/decoder do character unification.");
3891 Venable_character_unification = Qt;
3892
a5d301df
KH
3893 DEFVAR_LISP ("standard-character-unification-table-for-decode",
3894 &Vstandard_character_unification_table_for_decode,
bdd9fb48 3895 "Table for unifying characters when reading.");
a5d301df 3896 Vstandard_character_unification_table_for_decode = Qnil;
bdd9fb48 3897
a5d301df
KH
3898 DEFVAR_LISP ("standard-character-unification-table-for-encode",
3899 &Vstandard_character_unification_table_for_encode,
bdd9fb48 3900 "Table for unifying characters when writing.");
a5d301df 3901 Vstandard_character_unification_table_for_encode = Qnil;
4ed46869
KH
3902
3903 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
3904 "Alist of charsets vs revision numbers.\n\
3905While encoding, if a charset (car part of an element) is found,\n\
3906designate it with the escape sequence identifing revision (cdr part of the element).");
3907 Vcharset_revision_alist = Qnil;
02ba4723
KH
3908
3909 DEFVAR_LISP ("default-process-coding-system",
3910 &Vdefault_process_coding_system,
3911 "Cons of coding systems used for process I/O by default.\n\
3912The car part is used for decoding a process output,\n\
3913the cdr part is used for encoding a text to be sent to a process.");
3914 Vdefault_process_coding_system = Qnil;
4ed46869
KH
3915}
3916
3917#endif /* emacs */