(struct coding_system): New members
[bpt/emacs.git] / src / coding.c
CommitLineData
4ed46869 1/* Coding system handler (conversion, detection, and etc).
203cb916
RS
2 Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4ed46869 4
369314dc
KH
5This file is part of GNU Emacs.
6
7GNU Emacs is free software; you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation; either version 2, or (at your option)
10any later version.
4ed46869 11
369314dc
KH
12GNU Emacs is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
4ed46869 16
369314dc
KH
17You should have received a copy of the GNU General Public License
18along with GNU Emacs; see the file COPYING. If not, write to
19the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20Boston, MA 02111-1307, USA. */
4ed46869
KH
21
22/*** TABLE OF CONTENTS ***
23
24 1. Preamble
0ef69138 25 2. Emacs' internal format (emacs-mule) handlers
4ed46869
KH
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
28 5. End-of-line handlers
29 6. C library functions
30 7. Emacs Lisp library functions
31 8. Post-amble
32
33*/
34
35/*** GENERAL NOTE on CODING SYSTEM ***
36
37 Coding system is an encoding mechanism of one or more character
38 sets. Here's a list of coding systems which Emacs can handle. When
39 we say "decode", it means converting some other coding system to
0ef69138
KH
40 Emacs' internal format (emacs-mule), and when we say "encode",
41 it means converting the coding system emacs-mule to some other
42 coding system.
4ed46869 43
0ef69138 44 0. Emacs' internal format (emacs-mule)
4ed46869
KH
45
46 Emacs itself holds a multi-lingual character in a buffer and a string
47 in a special format. Details are described in the section 2.
48
49 1. ISO2022
50
51 The most famous coding system for multiple character sets. X's
52 Compound Text, various EUCs (Extended Unix Code), and such coding
53 systems used in Internet communication as ISO-2022-JP are all
54 variants of ISO2022. Details are described in the section 3.
55
56 2. SJIS (or Shift-JIS or MS-Kanji-Code)
57
58 A coding system to encode character sets: ASCII, JISX0201, and
59 JISX0208. Widely used for PC's in Japan. Details are described in
60 the section 4.
61
62 3. BIG5
63
64 A coding system to encode character sets: ASCII and Big5. Widely
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
66 described in the section 4. In this file, when written as "BIG5"
67 (all uppercase), it means the coding system, and when written as
68 "Big5" (capitalized), it means the character set.
69
70 4. Else
71
72 If a user wants to read/write a text encoded in a coding system not
73 listed above, he can supply a decoder and an encoder for it in CCL
74 (Code Conversion Language) programs. Emacs executes the CCL program
75 while reading/writing.
76
77 Emacs represents a coding-system by a Lisp symbol that has a property
78 `coding-system'. But, before actually using the coding-system, the
79 information about it is set in a structure of type `struct
80 coding_system' for rapid processing. See the section 6 for more
81 detail.
82
83*/
84
85/*** GENERAL NOTES on END-OF-LINE FORMAT ***
86
87 How end-of-line of a text is encoded depends on a system. For
88 instance, Unix's format is just one byte of `line-feed' code,
89 whereas DOS's format is two bytes sequence of `carriage-return' and
90 `line-feed' codes. MacOS's format is one byte of `carriage-return'.
91
92 Since how characters in a text are encoded and how end-of-line is
93 encoded are independent, any coding system described above can take
94 any format of end-of-line. So, Emacs has information of format of
95 end-of-line in each coding-system. See the section 6 for more
96 detail.
97
98*/
99
100/*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
101
102 These functions check if a text between SRC and SRC_END is encoded
103 in the coding system category XXX. Each returns an integer value in
104 which appropriate flag bits for the category XXX are set. The flag
105 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
106 template of these functions. */
107#if 0
108int
0ef69138 109detect_coding_emacs_mule (src, src_end)
4ed46869
KH
110 unsigned char *src, *src_end;
111{
112 ...
113}
114#endif
115
116/*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
117
118 These functions decode SRC_BYTES length text at SOURCE encoded in
0ef69138
KH
119 CODING to Emacs' internal format (emacs-mule). The resulting text
120 goes to a place pointed by DESTINATION, the length of which should
121 not exceed DST_BYTES. The number of bytes actually processed is returned as
122 *CONSUMED. The return value is the length of the decoded text.
123 Below is a template of these functions. */
4ed46869
KH
124#if 0
125decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
126 struct coding_system *coding;
127 unsigned char *source, *destination;
128 int src_bytes, dst_bytes;
129 int *consumed;
130{
131 ...
132}
133#endif
134
135/*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
136
0ef69138
KH
137 These functions encode SRC_BYTES length text at SOURCE of Emacs'
138 internal format (emacs-mule) to CODING. The resulting text goes to
139 a place pointed by DESTINATION, the length of which should not
140 exceed DST_BYTES. The number of bytes actually processed is returned as
141 *CONSUMED. The return value is the length of the encoded text.
142 Below is a template of these functions. */
4ed46869
KH
143#if 0
144encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
145 struct coding_system *coding;
146 unsigned char *source, *destination;
147 int src_bytes, dst_bytes;
148 int *consumed;
149{
150 ...
151}
152#endif
153
154/*** COMMONLY USED MACROS ***/
155
156/* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
157 THREE_MORE_BYTES safely get one, two, and three bytes from the
158 source text respectively. If there are not enough bytes in the
159 source, they jump to `label_end_of_loop'. The caller should set
160 variables `src' and `src_end' to appropriate areas in advance. */
161
/* Fetch the next source byte into C1 and advance `src' past it.  If
   no byte is left (`src' has reached `src_end'), jump to
   `label_end_of_loop' without touching C1.  */
#define ONE_MORE_BYTE(c1)                       \
  do {                                          \
    if (src >= src_end)                         \
      goto label_end_of_loop;                   \
    c1 = *src++;                                \
  } while (0)
169
/* Fetch the next two source bytes into C1 and C2 (in that order) and
   advance `src' past them.  If fewer than two bytes are left, jump to
   `label_end_of_loop' without consuming anything.

   The remaining length is computed as `src_end - src' rather than by
   comparing `src + 1' with `src_end', so that no pointer beyond one
   past the end of the source is ever formed (that would be undefined
   behavior when `src' already equals `src_end').  */
#define TWO_MORE_BYTES(c1, c2)                  \
  do {                                          \
    if (src_end - src >= 2)                     \
      c1 = *src++, c2 = *src++;                 \
    else                                        \
      goto label_end_of_loop;                   \
  } while (0)
177
/* Fetch the next three source bytes into C1, C2 and C3 (in that
   order) and advance `src' past them.  If fewer than three bytes are
   left, jump to `label_end_of_loop' without consuming anything.

   The remaining length is computed as `src_end - src' rather than by
   comparing `src + 2' with `src_end', so that no pointer beyond one
   past the end of the source is ever formed (that would be undefined
   behavior when fewer than two bytes remain).  */
#define THREE_MORE_BYTES(c1, c2, c3)            \
  do {                                          \
    if (src_end - src >= 3)                     \
      c1 = *src++, c2 = *src++, c3 = *src++;    \
    else                                        \
      goto label_end_of_loop;                   \
  } while (0)
185
186/* The following three macros DECODE_CHARACTER_ASCII,
187 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
188 the multi-byte form of a character of each class at the place
189 pointed by `dst'. The caller should set the variable `dst' to
190 point to an appropriate area and the variable `coding' to point to
191 the coding-system of the currently decoding text in advance. */
192
/* Store one ASCII character C at `dst' and advance `dst'.  Outside a
   composition the byte C is stored as is; while composing
   (COMPOSING_P (coding->composing) is true) the two bytes 0xA0 and
   C | 0x80 are stored instead.  */

#define DECODE_CHARACTER_ASCII(c)               \
  do {                                          \
    if (COMPOSING_P (coding->composing))        \
      {                                         \
        *dst++ = 0xA0;                          \
        *dst++ = (c) | 0x80;                    \
      }                                         \
    else                                        \
      {                                         \
        *dst++ = (c);                           \
      }                                         \
  } while (0)
202
/* Store at `dst' the multi-byte form of one DIMENSION1 character
   whose charset is CHARSET and whose position-code is C, advancing
   `dst'.  The bytes written are: the base leading-code of CHARSET
   (plus 0x20 while composing), then the extended leading-code of
   CHARSET if that is nonzero, then C | 0x80.  */

#define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
  do {                                                                  \
    unsigned char lc_base = CHARSET_LEADING_CODE_BASE (charset);        \
    unsigned char lc_ext;                                               \
                                                                        \
    *dst++ = ((COMPOSING_P (coding->composing))                         \
              ? lc_base + 0x20 : lc_base);                              \
    /* Looked up only after the base byte is stored, as before.  */     \
    lc_ext = CHARSET_LEADING_CODE_EXT (charset);                        \
    if (lc_ext != 0)                                                    \
      *dst++ = lc_ext;                                                  \
    *dst++ = (c) | 0x80;                                                \
  } while (0)
217
/* Store at `dst' the multi-byte form of one DIMENSION2 character
   whose charset is CHARSET and whose position-codes are C1 and C2:
   everything DECODE_CHARACTER_DIMENSION1 emits for (CHARSET, C1),
   followed by the byte C2 | 0x80.  */

#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
  do {                                                  \
    DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
    *dst = (c2) | 0x80;                                 \
    dst++;                                              \
  } while (0)
226
227\f
228/*** 1. Preamble ***/
229
230#include <stdio.h>
231
232#ifdef emacs
233
234#include <config.h>
235#include "lisp.h"
236#include "buffer.h"
237#include "charset.h"
238#include "ccl.h"
239#include "coding.h"
240#include "window.h"
241
242#else /* not emacs */
243
244#include "mulelib.h"
245
246#endif /* not emacs */
247
248Lisp_Object Qcoding_system, Qeol_type;
249Lisp_Object Qbuffer_file_coding_system;
250Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
251
252extern Lisp_Object Qinsert_file_contents, Qwrite_region;
253Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
254Lisp_Object Qstart_process, Qopen_network_stream;
255Lisp_Object Qtarget_idx;
256
257/* Mnemonic character of each format of end-of-line. */
258int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
259/* Mnemonic character to indicate format of end-of-line is not yet
260 decided. */
261int eol_mnemonic_undecided;
262
#ifdef emacs

Lisp_Object Qcoding_system_spec;
Lisp_Object Qcoding_system_p;
Lisp_Object Qcoding_system_error;

/* Coding-systems are handed between Emacs Lisp programs and C
   internal routines by the following three variables.  */

/* Coding-system for reading files and receiving data from process.  */
Lisp_Object Vcoding_system_for_read;

/* Coding-system for writing files and sending data to process.  */
Lisp_Object Vcoding_system_for_write;

/* Coding-system actually used in the latest I/O.  */
Lisp_Object Vlast_coding_system_used;

/* Coding-system that the terminal accepts for display.  */
struct coding_system terminal_coding;

/* Coding-system of what is sent from the terminal keyboard.  */
struct coding_system keyboard_coding;

/* NOTE(review): by their names, alists used to choose a coding-system
   for file, process and network I/O respectively -- their format is
   not visible here; confirm.  */
Lisp_Object Vfile_coding_system_alist;
Lisp_Object Vprocess_coding_system_alist;
Lisp_Object Vnetwork_coding_system_alist;

#endif /* emacs */
287
288Lisp_Object Qcoding_category_index;
289
290/* List of symbols `coding-category-xxx' ordered by priority. */
291Lisp_Object Vcoding_category_list;
292
293/* Table of coding-systems currently assigned to each coding-category. */
294Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
295
296/* Table of names of symbol for each coding-category. */
297char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
0ef69138 298 "coding-category-emacs-mule",
4ed46869
KH
299 "coding-category-sjis",
300 "coding-category-iso-7",
301 "coding-category-iso-8-1",
302 "coding-category-iso-8-2",
303 "coding-category-iso-else",
304 "coding-category-big5",
305 "coding-category-binary"
306};
307
bdd9fb48
KH
308/* Flag to tell if we look up unification table on character code
309 conversion. */
310Lisp_Object Venable_character_unification;
311/* Standard unification table to look up on reading (decoding). */
312Lisp_Object Vstandard_character_unification_table_for_read;
313/* Standard unification table to look up on writing (encoding). */
314Lisp_Object Vstandard_character_unification_table_for_write;
315
316Lisp_Object Qcharacter_unification_table;
4ed46869
KH
317
318/* Alist of charsets vs revision number. */
319Lisp_Object Vcharset_revision_alist;
320
02ba4723
KH
321/* Default coding systems used for process I/O. */
322Lisp_Object Vdefault_process_coding_system;
323
4ed46869 324\f
0ef69138 325/*** 2. Emacs internal format (emacs-mule) handlers ***/
4ed46869
KH
326
327/* Emacs' internal format for encoding multiple character sets is a
328 kind of multi-byte encoding, i.e. encoding a character by a sequence
329 of one-byte codes of variable length. ASCII characters and control
330 characters (e.g. `tab', `newline') are represented by one-byte as
331 is. It takes the range 0x00 through 0x7F. The other characters
332 are represented by a sequence of `base leading-code', optional
333 `extended leading-code', and one or two `position-code's. Length
334 of the sequence is decided by the base leading-code. Leading-code
335 takes the range 0x80 through 0x9F, whereas extended leading-code
336 and position-code take the range 0xA0 through 0xFF. See the
337 document of `charset.h' for more detail about leading-code and
338 position-code.
339
340 There's one exception in this rule. Special leading-code
341 `leading-code-composition' denotes that the following several
342 characters should be composed into one character. Leading-codes of
343 components (except for ASCII) are added 0x20. An ASCII character
344 component is represented by a 2-byte sequence of `0xA0' and
345 `ASCII-code + 0x80'. See also the document in `charset.h' for the
346 detail of composite character. Hence, we can summarize the code
347 range as follows:
348
349 --- CODE RANGE of Emacs' internal format ---
350 (character set) (range)
351 ASCII 0x00 .. 0x7F
352 ELSE (1st byte) 0x80 .. 0x9F
353 (rest bytes) 0xA0 .. 0xFF
354 ---------------------------------------------
355
356 */
357
358enum emacs_code_class_type emacs_code_class[256];
359
/* Consume one source byte and go on to the next statement only if
   that byte exists and is in the range 0xA0..0xFF (0xA0 itself is
   accepted).  If the source is exhausted, jump to
   `label_end_of_switch'; if the byte is less than 0xA0, make the
   enclosing function return 0.  */
#define CHECK_CODE_RANGE_A0_FF                  \
  do {                                          \
    if (src < src_end)                          \
      {                                         \
        if (*src++ < 0xA0)                      \
          return 0;                             \
      }                                         \
    else                                        \
      goto label_end_of_switch;                 \
  } while (0)
369
370/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
371 Check if a text is encoded in Emacs' internal format. If it is,
0ef69138 372 return CODING_CATEGORY_MASK_EMASC_MULE, else return 0. */
4ed46869
KH
373
374int
0ef69138 375detect_coding_emacs_mule (src, src_end)
4ed46869
KH
376 unsigned char *src, *src_end;
377{
378 unsigned char c;
379 int composing = 0;
380
381 while (src < src_end)
382 {
383 c = *src++;
384
385 if (composing)
386 {
387 if (c < 0xA0)
388 composing = 0;
389 else
390 c -= 0x20;
391 }
392
393 switch (emacs_code_class[c])
394 {
395 case EMACS_ascii_code:
396 case EMACS_linefeed_code:
397 break;
398
399 case EMACS_control_code:
400 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
401 return 0;
402 break;
403
404 case EMACS_invalid_code:
405 return 0;
406
407 case EMACS_leading_code_composition: /* c == 0x80 */
408 if (composing)
409 CHECK_CODE_RANGE_A0_FF;
410 else
411 composing = 1;
412 break;
413
414 case EMACS_leading_code_4:
415 CHECK_CODE_RANGE_A0_FF;
416 /* fall down to check it two more times ... */
417
418 case EMACS_leading_code_3:
419 CHECK_CODE_RANGE_A0_FF;
420 /* fall down to check it one more time ... */
421
422 case EMACS_leading_code_2:
423 CHECK_CODE_RANGE_A0_FF;
424 break;
425
426 default:
427 label_end_of_switch:
428 break;
429 }
430 }
0ef69138 431 return CODING_CATEGORY_MASK_EMACS_MULE;
4ed46869
KH
432}
433
434\f
435/*** 3. ISO2022 handlers ***/
436
437/* The following note describes the coding system ISO2022 briefly.
438 Since the intention of this note is to help understanding of the
439 programs in this file, some parts are NOT ACCURATE or OVERLY
440 SIMPLIFIED. For the thorough understanding, please refer to the
441 original document of ISO2022.
442
443 ISO2022 provides many mechanisms to encode several character sets
444 in 7-bit and 8-bit environment. If one chooses a 7-bit environment,
445 all text is encoded by codes of less than 128. This may make the
446 encoded text a little bit longer, but the text gets more stability
447 to pass through several gateways (some of them split MSB off).
448
449 There are two kind of character set: control character set and
450 graphic character set. The former contains control characters such
451 as `newline' and `escape' to provide control functions (control
452 functions are provided also by escape sequence). The latter
453 contains graphic characters such as 'A' and '-'. Emacs recognizes
454 two control character sets and many graphic character sets.
455
456 Graphic character sets are classified into one of the following
457 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
458 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
459 bytes (DIMENSION) and the number of characters in one dimension
460 (CHARS) of the set. In addition, each character set is assigned an
461 identification tag (called "final character" and denoted as <F>
462 here after) which is unique in each class. <F> of each character
463 set is decided by ECMA(*) when it is registered in ISO. Code range
464 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
465
466 Note (*): ECMA = European Computer Manufacturers Association
467
468 Here are examples of graphic character set [NAME(<F>)]:
469 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
470 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
471 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
472 o DIMENSION2_CHARS96 -- none for the moment
473
474 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
475 C0 [0x00..0x1F] -- control character plane 0
476 GL [0x20..0x7F] -- graphic character plane 0
477 C1 [0x80..0x9F] -- control character plane 1
478 GR [0xA0..0xFF] -- graphic character plane 1
479
480 A control character set is directly designated and invoked to C0 or
481 C1 by an escape sequence. The most common case is that ISO646's
482 control character set is designated/invoked to C0 and ISO6429's
483 control character set is designated/invoked to C1, and usually
484 these designations/invocations are omitted in a coded text. With
485 7-bit environment, only C0 can be used, and a control character for
486 C1 is encoded by an appropriate escape sequence to fit in the
487 environment. Every C1 control character has a corresponding
488 escape sequence defined for this purpose.
489
490 A graphic character set is at first designated to one of four
491 graphic registers (G0 through G3), then these graphic registers are
492 invoked to GL or GR. These designations and invocations can be
493 done independently. The most common case is that G0 is invoked to
494 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
495 these invocations and designations are omitted in a coded text.
496 With 7-bit environment, only GL can be used.
497
498 When a graphic character set of CHARS94 is invoked to GL, code 0x20
499 and 0x7F of GL area work as control characters SPACE and DEL
500 respectively, and code 0xA0 and 0xFF of GR area should not be used.
501
502 There are two ways of invocation: locking-shift and single-shift.
503 With locking-shift, the invocation lasts until the next different
504 invocation, whereas with single-shift, the invocation works only
505 for the following character and doesn't affect locking-shift.
506 Invocations are done by the following control characters or escape
507 sequences.
508
509 ----------------------------------------------------------------------
510 function control char escape sequence description
511 ----------------------------------------------------------------------
512 SI (shift-in) 0x0F none invoke G0 to GL
513 SO (shift-out) 0x0E none invoke G1 to GL
514 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
515 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
516 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
517 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
518 ----------------------------------------------------------------------
519 The first four are for locking-shift. Control characters for these
520 functions are defined by macros ISO_CODE_XXX in `coding.h'.
521
522 Designations are done by the following escape sequences.
523 ----------------------------------------------------------------------
524 escape sequence description
525 ----------------------------------------------------------------------
526 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
527 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
528 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
529 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
530 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
531 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
532 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
533 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
534 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
535 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
536 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
537 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
538 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
539 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
540 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
541 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
542 ----------------------------------------------------------------------
543
544 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
545 of dimension 1, chars 94, and final character <F>, and etc.
546
547 Note (*): Although these designations are not allowed in ISO2022,
548 Emacs accepts them on decoding, and produces them on encoding
549 CHARS96 character set in a coding system which is characterized as
550 7-bit environment, non-locking-shift, and non-single-shift.
551
552 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
553 '(' can be omitted. We call this as "short-form" here after.
554
555 Now you may notice that there are a lot of ways for encoding the
556 same multilingual text in ISO2022. Actually, there exist many
557 coding systems such as Compound Text (used in X's inter-client
558 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
559 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
560 localized platforms), and all of these are variants of ISO2022.
561
562 In addition to the above, Emacs handles two more kinds of escape
563 sequences: ISO6429's direction specification and Emacs' private
564 sequence for specifying character composition.
565
566 ISO6429's direction specification takes the following format:
567 o CSI ']' -- end of the current direction
568 o CSI '0' ']' -- end of the current direction
569 o CSI '1' ']' -- start of left-to-right text
570 o CSI '2' ']' -- start of right-to-left text
571 The control character CSI (0x9B: control sequence introducer) is
572 abbreviated to the escape sequence ESC '[' in 7-bit environment.
573
574 Character composition specification takes the following format:
575 o ESC '0' -- start character composition
576 o ESC '1' -- end character composition
577 Since these are not standard escape sequences of any ISO, the use
578 of them for these meaning is restricted to Emacs only. */
579
580enum iso_code_class_type iso_code_class[256];
581
582/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
583 Check if a text is encoded in ISO2022. If it is, returns an
584 integer in which appropriate flag bits any of:
585 CODING_CATEGORY_MASK_ISO_7
586 CODING_CATEGORY_MASK_ISO_8_1
587 CODING_CATEGORY_MASK_ISO_8_2
588 CODING_CATEGORY_MASK_ISO_ELSE
589 are set. If a code which should never appear in ISO2022 is found,
590 returns 0. */
591
592int
593detect_coding_iso2022 (src, src_end)
594 unsigned char *src, *src_end;
595{
765a2ca5
KH
596 int mask = (CODING_CATEGORY_MASK_ISO_7
597 | CODING_CATEGORY_MASK_ISO_8_1
598 | CODING_CATEGORY_MASK_ISO_8_2
599 | CODING_CATEGORY_MASK_ISO_ELSE);
bcf26d6a
KH
600 int g1 = 0; /* 1 iff designating to G1. */
601 int c, i;
4ed46869 602
e0e989f6 603 while (src < src_end)
4ed46869
KH
604 {
605 c = *src++;
606 switch (c)
607 {
608 case ISO_CODE_ESC:
e0e989f6 609 if (src >= src_end)
4ed46869
KH
610 break;
611 c = *src++;
bcf26d6a 612 if (src < src_end
e0e989f6
KH
613 && ((c >= '(' && c <= '/')
614 || c == '$' && ((*src >= '(' && *src <= '/')
615 || (*src >= '@' && *src <= 'B'))))
4ed46869 616 {
e0e989f6
KH
617 /* Valid designation sequence. */
618 if (c == ')' || (c == '$' && *src == ')'))
bcf26d6a
KH
619 {
620 g1 = 1;
621 mask &= ~CODING_CATEGORY_MASK_ISO_7;
622 }
e0e989f6
KH
623 src++;
624 break;
4ed46869 625 }
4ed46869
KH
626 else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
627 return CODING_CATEGORY_MASK_ISO_ELSE;
628 break;
629
4ed46869 630 case ISO_CODE_SO:
e0e989f6
KH
631 if (g1)
632 return CODING_CATEGORY_MASK_ISO_ELSE;
633 break;
634
4ed46869
KH
635 case ISO_CODE_CSI:
636 case ISO_CODE_SS2:
637 case ISO_CODE_SS3:
638 mask &= ~CODING_CATEGORY_MASK_ISO_7;
639 break;
640
641 default:
642 if (c < 0x80)
643 break;
644 else if (c < 0xA0)
645 return 0;
646 else
647 {
648 int count = 1;
649
650 mask &= ~CODING_CATEGORY_MASK_ISO_7;
e0e989f6 651 while (src < src_end && *src >= 0xA0)
4ed46869 652 count++, src++;
e0e989f6 653 if (count & 1 && src < src_end)
4ed46869
KH
654 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
655 }
656 break;
657 }
658 }
659
660 return mask;
661}
662
/* Decode a character whose charset is CHARSET and whose 1st position
   code is C1, storing its multi-byte form at `dst'.  If the dimension
   of CHARSET is 2, the 2nd position code is fetched from `src' into
   the caller's variable `c2' (jumping to `label_end_of_loop' when the
   source is exhausted).  If CHARSET is negative, it means that we are
   decoding ill formed text, and what we can do is just to read C1 as
   is.

   When the caller's `unification_table' is non-nil, the character is
   looked up with `unify_char' first and, if a non-negative result
   comes back, that character is stored instead.  `coding->composing'
   is updated as a side effect.  */

#define DECODE_ISO_CHARACTER(charset, c1)                               \
  do {                                                                  \
    int unified_charset = (charset);                                    \
    int unified_char;                                                   \
                                                                        \
    if (COMPOSING_HEAD_P (coding->composing))                           \
      {                                                                 \
        *dst++ = LEADING_CODE_COMPOSITION;                              \
        if (COMPOSING_WITH_RULE_P (coding->composing))                  \
          /* To tell composition rules are embedded.  */                \
          *dst++ = 0xFF;                                                \
        coding->composing += 2;                                         \
      }                                                                 \
    if ((charset) >= 0)                                                 \
      {                                                                 \
        if (CHARSET_DIMENSION (charset) == 2)                           \
          ONE_MORE_BYTE (c2);                                           \
        if (!NILP (unification_table))                                  \
          {                                                             \
            unified_char = unify_char (unification_table,               \
                                       -1, (charset), c1, c2);          \
            if (unified_char >= 0)                                      \
              SPLIT_CHAR (unified_char, unified_charset, c1, c2);       \
          }                                                             \
      }                                                                 \
    if (unified_charset < 0 || unified_charset == CHARSET_ASCII)        \
      DECODE_CHARACTER_ASCII (c1);                                      \
    else if (CHARSET_DIMENSION (unified_charset) == 1)                  \
      DECODE_CHARACTER_DIMENSION1 (unified_charset, c1);                \
    else                                                                \
      DECODE_CHARACTER_DIMENSION2 (unified_charset, c1, c2);            \
    if (COMPOSING_WITH_RULE_P (coding->composing))                      \
      /* To tell a composition rule follows.  */                        \
      coding->composing = COMPOSING_WITH_RULE_RULE;                     \
  } while (0)
699
/* Record in CODING that the charset identified by DIMENSION, CHARS
   and FINAL_CHAR is designated to graphic register REG.  Nothing is
   changed when ISO_CHARSET_TABLE yields a negative value for that
   combination.  When `coding->direction' is 1 and the charset has a
   (non-negative) reverse charset, the reverse charset is designated
   instead.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)             \
  do {                                                                    \
    int designated = ISO_CHARSET_TABLE (dimension, chars, final_char);    \
                                                                          \
    if (designated < 0)                                                   \
      break;            /* Leaves only this do-while.  */                 \
    if (coding->direction == 1                                            \
        && CHARSET_REVERSE_CHARSET (designated) >= 0)                     \
      designated = CHARSET_REVERSE_CHARSET (designated);                  \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = designated;               \
  } while (0)
712
713/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
714
715int
716decode_coding_iso2022 (coding, source, destination,
717 src_bytes, dst_bytes, consumed)
718 struct coding_system *coding;
719 unsigned char *source, *destination;
720 int src_bytes, dst_bytes;
721 int *consumed;
722{
723 unsigned char *src = source;
724 unsigned char *src_end = source + src_bytes;
725 unsigned char *dst = destination;
726 unsigned char *dst_end = destination + dst_bytes;
727 /* Since the maximum bytes produced by each loop is 7, we subtract 6
728 from DST_END to assure that overflow checking is necessary only
729 at the head of loop. */
730 unsigned char *adjusted_dst_end = dst_end - 6;
731 int charset;
732 /* Charsets invoked to graphic plane 0 and 1 respectively. */
733 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
734 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
bdd9fb48
KH
735 Lisp_Object unification_table = coding->character_unification_table;
736
737 if (!NILP (Venable_character_unification) && NILP (unification_table))
738 unification_table = Vstandard_character_unification_table_for_read;
4ed46869
KH
739
740 while (src < src_end && dst < adjusted_dst_end)
741 {
742 /* SRC_BASE remembers the start position in source in each loop.
743 The loop will be exited when there's not enough source text
744 to analyze long escape sequence or 2-byte code (within macros
745 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
746 to SRC_BASE before exiting. */
747 unsigned char *src_base = src;
bdd9fb48 748 int c1 = *src++, c2;
4ed46869
KH
749
750 switch (iso_code_class [c1])
751 {
752 case ISO_0x20_or_0x7F:
753 if (!coding->composing
754 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
755 {
756 /* This is SPACE or DEL. */
757 *dst++ = c1;
758 break;
759 }
760 /* This is a graphic character, we fall down ... */
761
762 case ISO_graphic_plane_0:
763 if (coding->composing == COMPOSING_WITH_RULE_RULE)
764 {
765 /* This is a composition rule. */
766 *dst++ = c1 | 0x80;
767 coding->composing = COMPOSING_WITH_RULE_TAIL;
768 }
769 else
770 DECODE_ISO_CHARACTER (charset0, c1);
771 break;
772
773 case ISO_0xA0_or_0xFF:
774 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
775 {
776 /* Invalid code. */
777 *dst++ = c1;
778 break;
779 }
780 /* This is a graphic character, we fall down ... */
781
782 case ISO_graphic_plane_1:
783 DECODE_ISO_CHARACTER (charset1, c1);
784 break;
785
786 case ISO_control_code:
787 /* All ISO2022 control characters in this class have the
788 same representation in Emacs internal format. */
789 *dst++ = c1;
790 break;
791
792 case ISO_carriage_return:
793 if (coding->eol_type == CODING_EOL_CR)
794 {
795 *dst++ = '\n';
796 }
797 else if (coding->eol_type == CODING_EOL_CRLF)
798 {
799 ONE_MORE_BYTE (c1);
800 if (c1 == ISO_CODE_LF)
801 *dst++ = '\n';
802 else
803 {
804 src--;
805 *dst++ = c1;
806 }
807 }
808 else
809 {
810 *dst++ = c1;
811 }
812 break;
813
814 case ISO_shift_out:
e0e989f6
KH
815 if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
816 goto label_invalid_escape_sequence;
4ed46869
KH
817 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
818 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
819 break;
820
821 case ISO_shift_in:
822 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
823 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
824 break;
825
826 case ISO_single_shift_2_7:
827 case ISO_single_shift_2:
828 /* SS2 is handled as an escape sequence of ESC 'N' */
829 c1 = 'N';
830 goto label_escape_sequence;
831
832 case ISO_single_shift_3:
833 /* SS2 is handled as an escape sequence of ESC 'O' */
834 c1 = 'O';
835 goto label_escape_sequence;
836
837 case ISO_control_sequence_introducer:
838 /* CSI is handled as an escape sequence of ESC '[' ... */
839 c1 = '[';
840 goto label_escape_sequence;
841
842 case ISO_escape:
843 ONE_MORE_BYTE (c1);
844 label_escape_sequence:
845 /* Escape sequences handled by Emacs are invocation,
846 designation, direction specification, and character
847 composition specification. */
848 switch (c1)
849 {
850 case '&': /* revision of following character set */
851 ONE_MORE_BYTE (c1);
852 if (!(c1 >= '@' && c1 <= '~'))
e0e989f6 853 goto label_invalid_escape_sequence;
4ed46869
KH
854 ONE_MORE_BYTE (c1);
855 if (c1 != ISO_CODE_ESC)
e0e989f6 856 goto label_invalid_escape_sequence;
4ed46869
KH
857 ONE_MORE_BYTE (c1);
858 goto label_escape_sequence;
859
860 case '$': /* designation of 2-byte character set */
861 ONE_MORE_BYTE (c1);
862 if (c1 >= '@' && c1 <= 'B')
863 { /* designation of JISX0208.1978, GB2312.1980,
864 or JISX0208.1980 */
865 DECODE_DESIGNATION (0, 2, 94, c1);
866 }
867 else if (c1 >= 0x28 && c1 <= 0x2B)
868 { /* designation of DIMENSION2_CHARS94 character set */
869 ONE_MORE_BYTE (c2);
870 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
871 }
872 else if (c1 >= 0x2C && c1 <= 0x2F)
873 { /* designation of DIMENSION2_CHARS96 character set */
874 ONE_MORE_BYTE (c2);
875 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
876 }
877 else
e0e989f6 878 goto label_invalid_escape_sequence;
4ed46869
KH
879 break;
880
881 case 'n': /* invocation of locking-shift-2 */
e0e989f6
KH
882 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
883 goto label_invalid_escape_sequence;
4ed46869 884 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
e0e989f6 885 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
886 break;
887
888 case 'o': /* invocation of locking-shift-3 */
e0e989f6
KH
889 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
890 goto label_invalid_escape_sequence;
4ed46869 891 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
e0e989f6 892 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
4ed46869
KH
893 break;
894
895 case 'N': /* invocation of single-shift-2 */
e0e989f6
KH
896 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
897 goto label_invalid_escape_sequence;
4ed46869
KH
898 ONE_MORE_BYTE (c1);
899 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
900 DECODE_ISO_CHARACTER (charset, c1);
901 break;
902
903 case 'O': /* invocation of single-shift-3 */
e0e989f6
KH
904 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
905 goto label_invalid_escape_sequence;
4ed46869
KH
906 ONE_MORE_BYTE (c1);
907 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
908 DECODE_ISO_CHARACTER (charset, c1);
909 break;
910
911 case '0': /* start composing without embeded rules */
912 coding->composing = COMPOSING_NO_RULE_HEAD;
913 break;
914
915 case '1': /* end composing */
916 coding->composing = COMPOSING_NO;
917 break;
918
919 case '2': /* start composing with embeded rules */
920 coding->composing = COMPOSING_WITH_RULE_HEAD;
921 break;
922
923 case '[': /* specification of direction */
924 /* For the moment, nested direction is not supported.
925 So, the value of `coding->direction' is 0 or 1: 0
926 means left-to-right, 1 means right-to-left. */
927 ONE_MORE_BYTE (c1);
928 switch (c1)
929 {
930 case ']': /* end of the current direction */
931 coding->direction = 0;
932
933 case '0': /* end of the current direction */
934 case '1': /* start of left-to-right direction */
935 ONE_MORE_BYTE (c1);
936 if (c1 == ']')
937 coding->direction = 0;
938 else
939 goto label_invalid_escape_sequence;
940 break;
941
942 case '2': /* start of right-to-left direction */
943 ONE_MORE_BYTE (c1);
944 if (c1 == ']')
945 coding->direction= 1;
946 else
947 goto label_invalid_escape_sequence;
948 break;
949
950 default:
951 goto label_invalid_escape_sequence;
952 }
953 break;
954
955 default:
956 if (c1 >= 0x28 && c1 <= 0x2B)
957 { /* designation of DIMENSION1_CHARS94 character set */
958 ONE_MORE_BYTE (c2);
959 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
960 }
961 else if (c1 >= 0x2C && c1 <= 0x2F)
962 { /* designation of DIMENSION1_CHARS96 character set */
963 ONE_MORE_BYTE (c2);
964 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
965 }
966 else
967 {
968 goto label_invalid_escape_sequence;
969 }
970 }
971 /* We must update these variables now. */
972 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
973 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
974 break;
975
976 label_invalid_escape_sequence:
977 {
978 int length = src - src_base;
979
980 bcopy (src_base, dst, length);
981 dst += length;
982 }
983 }
984 continue;
985
986 label_end_of_loop:
987 coding->carryover_size = src - src_base;
988 bcopy (src_base, coding->carryover, coding->carryover_size);
989 src = src_base;
990 break;
991 }
992
993 /* If this is the last block of the text to be decoded, we had
994 better just flush out all remaining codes in the text although
995 they are not valid characters. */
996 if (coding->last_block)
997 {
998 bcopy (src, dst, src_end - src);
999 dst += (src_end - src);
1000 src = src_end;
1001 }
1002 *consumed = src - source;
1003 return dst - destination;
1004}
1005
1006/* ISO2022 encoding stuff. */
1007
1008/*
1009 It is not enough to say just "ISO2022" on encoding, but we have to
1010 specify more details. In Emacs, each coding-system of ISO2022
1011 variant has the following specifications:
1012 1. Initial designation to G0 thru G3.
1013 2. Allows short-form designation?
1014 3. ASCII should be designated to G0 before control characters?
1015 4. ASCII should be designated to G0 at end of line?
1016 5. 7-bit environment or 8-bit environment?
1017 6. Use locking-shift?
1018 7. Use Single-shift?
1019 And the following two are only for Japanese:
1020 8. Use ASCII in place of JIS0201-1976-Roman?
1021 9. Use JISX0208-1983 in place of JISX0208-1978?
1022 These specifications are encoded in `coding->flags' as flag bits
1023 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1024 detail.
1025*/
1026
/* Emit at DST the escape sequence designating CHARSET to graphic
   register REG (0..3), and record the new designation in CODING.

   If CHARSET has an entry in Vcharset_revision_alist, the revision
   announcement `ESC & <rev>' is emitted first.  The designation itself
   is `ESC [$] [<intermediate>] <final-char>': `$' appears only for
   2-byte charsets, and <intermediate> selects both REG and whether the
   charset has 94 or 96 characters.  <intermediate> is left out
   (short-form) only for a 2-byte 94-charset whose <final-char> is '@',
   'A', or 'B', designated to register 0, when CODING has
   CODING_FLAG_ISO_SHORT_FORM set.

   Uses the variable `dst' of the invoking function.  The arguments are
   evaluated more than once.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                          \
  do {                                                                    \
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);          \
    Lisp_Object temp                                                      \
      = Fassq (make_number (charset), Vcharset_revision_alist);           \
    if (! NILP (temp))                                                    \
      {                                                                   \
        /* Announce the revision of the charset first.  */                \
        *dst++ = ISO_CODE_ESC;                                            \
        *dst++ = '&';                                                     \
        *dst++ = XINT (XCONS (temp)->cdr) + '@';                          \
      }                                                                   \
    *dst++ = ISO_CODE_ESC;                                                \
    if (CHARSET_DIMENSION (charset) != 1)                                 \
      *dst++ = '$';                                                       \
    if (CHARSET_CHARS (charset) != 94)                                    \
      /* 96-charsets always need their intermediate character.  */       \
      *dst++ = (unsigned char) (",-./"[(reg)]);                           \
    else if (CHARSET_DIMENSION (charset) == 1                             \
             || ! ((coding)->flags & CODING_FLAG_ISO_SHORT_FORM)          \
             || (reg) != 0                                                \
             || final_char < '@' || final_char > 'B')                     \
      /* Not eligible for the short-form.  */                             \
      *dst++ = (unsigned char) ("()*+"[(reg)]);                           \
    *dst++ = final_char;                                                  \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                  \
  } while (0)
1069
/* Emit at `dst' the ISO2022 single-shift function SS2 (respectively
   SS3): the escape sequence ESC 'N' (respectively ESC 'O') when
   `coding' has CODING_FLAG_ISO_SEVEN_BITS set, the one-byte control
   code ISO_CODE_SS2 (respectively ISO_CODE_SS3) otherwise.  In both
   cases `coding' is marked as single-shifting.

   Both macros use the variables `coding' and `dst' of the invoking
   function.  */

#define ENCODE_SINGLE_SHIFT_2                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS2;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'N';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS3;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'O';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)
1091
/* Emit at `dst' an ISO2022 locking-shift function and record in
   `coding' which graphic register is now invoked to graphic plane 0:

     ENCODE_SHIFT_IN          SI        register 0
     ENCODE_SHIFT_OUT         SO        register 1
     ENCODE_LOCKING_SHIFT_2   ESC 'n'   register 2
     ENCODE_LOCKING_SHIFT_3   ESC 'o'   register 3

   All four use the variables `coding' and `dst' of the invoking
   function.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
    *dst++ = ISO_CODE_SI;                               \
  } while (0)

#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
    *dst++ = ISO_CODE_SO;                               \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'n';                                       \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'o';                                       \
  } while (0)
1119
/* Emit at `dst' the single byte encoding a DIMENSION1 character whose
   charset is CHARSET and whose position-code is C1.

   The loop runs until the character has actually been emitted:
     - right after a single-shift, C1 is emitted with the 8th bit
       cleared (7-bit environment) or set (8-bit environment) and the
       single-shifting state is cleared;
     - if CHARSET is invoked to graphic plane 0, C1 is emitted with the
       8th bit cleared;
     - if CHARSET is invoked to graphic plane 1, C1 is emitted with the
       8th bit set;
     - otherwise encode_invocation_designation is called to produce
       the necessary designation and/or invocation, and we try again.

   Uses the variables `coding' and `dst' of the invoking function.
   The arguments are evaluated more than once.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)          \
                  ? (c1) & 0x7F                                         \
                  : (c1) | 0x80);                                       \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        *dst++ = (c1) & 0x7F;                                           \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        *dst++ = (c1) | 0x80;                                           \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not yet invoked to any graphic plane.  Invoke it      \
       (designating it to some graphic register first if needed),      \
       then go round the loop again to emit the character.  */         \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
1153
/* Emit at `dst' the two bytes encoding a DIMENSION2 character whose
   charset is CHARSET and whose position-codes are C1 and C2.

   The loop runs until the character has actually been emitted:
     - right after a single-shift, both bytes are emitted with the 8th
       bit cleared (7-bit environment) or set (8-bit environment) and
       the single-shifting state is cleared;
     - if CHARSET is invoked to graphic plane 0, both bytes are emitted
       with the 8th bit cleared;
     - if CHARSET is invoked to graphic plane 1, both bytes are emitted
       with the 8th bit set;
     - otherwise encode_invocation_designation is called to produce
       the necessary designation and/or invocation, and we try again.

   Uses the variables `coding' and `dst' of the invoking function.
   The arguments are evaluated more than once.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            *dst++ = (c1) & 0x7F;                                       \
            *dst++ = (c2) & 0x7F;                                       \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = (c1) | 0x80;                                       \
            *dst++ = (c2) | 0x80;                                       \
          }                                                             \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        *dst++ = (c1) & 0x7F;                                           \
        *dst++ = (c2) & 0x7F;                                           \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        *dst++ = (c1) | 0x80;                                           \
        *dst++ = (c2) | 0x80;                                           \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not yet invoked to any graphic plane.  Invoke it      \
       (designating it to some graphic register first if needed),      \
       then go round the loop again to emit the character.  */         \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
1186
bdd9fb48
KH
/* Produce codes for the character whose charset is CHARSET and whose
   position-codes are C1 and C2 (C2 is a dummy for a DIMENSION1
   charset).

   If a unification table is in effect (the variable
   `unification_table' of the invoking function is non-nil) and
   unify_char finds an alternative character in it, the alternative is
   encoded instead; C1 and C2 are then overwritten with the
   position-codes of the alternative, so both must be lvalues.
   unify_char is taken to return a negative value when the table has
   no alternative, which is how encode_designation_at_bol treats it.

   Uses the variables `unification_table', `coding' and `dst' of the
   invoking function.  */

#define ENCODE_ISO_CHARACTER(charset, c1, c2)                             \
  do {                                                                    \
    int c_alt, charset_alt;                                               \
    if (!NILP (unification_table)                                         \
        && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
            >= 0))                                                        \
      /* An alternative character exists: encode that one.  */            \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                            \
    else                                                                  \
      charset_alt = charset;                                              \
    if (CHARSET_DIMENSION (charset_alt) == 1)                             \
      ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1);                  \
    else                                                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2);              \
  } while (0)
1201
4ed46869
KH
1202/* Produce designation and invocation codes at a place pointed by DST
1203 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1204 Return new DST. */
1205
1206unsigned char *
1207encode_invocation_designation (charset, coding, dst)
1208 int charset;
1209 struct coding_system *coding;
1210 unsigned char *dst;
1211{
1212 int reg; /* graphic register number */
1213
1214 /* At first, check designations. */
1215 for (reg = 0; reg < 4; reg++)
1216 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1217 break;
1218
1219 if (reg >= 4)
1220 {
1221 /* CHARSET is not yet designated to any graphic registers. */
1222 /* At first check the requested designation. */
1223 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1ba9e4ab
KH
1224 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1225 /* Since CHARSET requests no special designation, designate it
1226 to graphic register 0. */
4ed46869
KH
1227 reg = 0;
1228
1229 ENCODE_DESIGNATION (charset, reg, coding);
1230 }
1231
1232 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1233 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1234 {
1235 /* Since the graphic register REG is not invoked to any graphic
1236 planes, invoke it to graphic plane 0. */
1237 switch (reg)
1238 {
1239 case 0: /* graphic register 0 */
1240 ENCODE_SHIFT_IN;
1241 break;
1242
1243 case 1: /* graphic register 1 */
1244 ENCODE_SHIFT_OUT;
1245 break;
1246
1247 case 2: /* graphic register 2 */
1248 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1249 ENCODE_SINGLE_SHIFT_2;
1250 else
1251 ENCODE_LOCKING_SHIFT_2;
1252 break;
1253
1254 case 3: /* graphic register 3 */
1255 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1256 ENCODE_SINGLE_SHIFT_3;
1257 else
1258 ENCODE_LOCKING_SHIFT_3;
1259 break;
1260 }
1261 }
1262 return dst;
1263}
1264
/* The following three macros emit at `dst' the escape sequences that
   delimit a composite character: ESC '0' starts a composition without
   embedded rules, ESC '2' starts one with embedded rules, and ESC '1'
   ends a composition.  */
#define ENCODE_COMPOSITION_NO_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
#define ENCODE_COMPOSITION_WITH_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '2')
#define ENCODE_COMPOSITION_END \
  (*dst++ = ISO_CODE_ESC, *dst++ = '1')
1269
/* The following three macros produce codes for indicating direction
   of text.  All of them use the variables `coding' and `dst' of the
   invoking function and expand to a single statement.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER emits CSI: the escape sequence
   ESC '[' when CODING_FLAG_ISO_SEVEN_BITS is set in `coding->flags',
   the one-byte control code ISO_CODE_CSI otherwise.  The flag is
   tested as a bit, since `coding->flags' carries other flags too.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = '[';                                   \
      }                                                 \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

/* Emit `CSI 2 ]', the start of right-to-left text.  */
#define ENCODE_DIRECTION_R2L                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '2';                                       \
    *dst++ = ']';                                       \
  } while (0)

/* Emit `CSI 0 ]', the end of the current direction, i.e. the return
   to left-to-right text.  */
#define ENCODE_DIRECTION_L2R                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '0';                                       \
    *dst++ = ']';                                       \
  } while (0)
1285
/* Produce the codes that bring the graphic planes and registers back
   to their initial state: a shift-in if graphic plane 0 does not
   currently hold graphic register 0, then, for each graphic register
   that has an initial designation (a non-negative charset) differing
   from its current one, the sequence designating the initial charset
   again.

   Uses the variables `coding' and `dst' of the invoking function.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                   \
  do {                                                                    \
    int reg = 0;                                                          \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                      \
      ENCODE_SHIFT_IN;                                                    \
    while (reg < 4)                                                       \
      {                                                                   \
        int initial_charset                                               \
          = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg);            \
        if (initial_charset >= 0                                          \
            && (CODING_SPEC_ISO_DESIGNATION (coding, reg)                 \
                != initial_charset))                                      \
          ENCODE_DESIGNATION (initial_charset, reg, coding);              \
        reg++;                                                            \
      }                                                                   \
  } while (0)
1300
bdd9fb48
KH
1301/* Produce designation sequences of charsets in the line started from
1302 *SRC to a place pointed by DSTP.
1303
1304 If the current block ends before any end-of-line, we may fail to
1305 find all the necessary *designations. */
1306encode_designation_at_bol (coding, table, src, src_end, dstp)
e0e989f6 1307 struct coding_system *coding;
bdd9fb48 1308 Lisp_Object table;
e0e989f6
KH
1309 unsigned char *src, *src_end, **dstp;
1310{
bdd9fb48
KH
1311 int charset, c, found = 0, reg;
1312 /* Table of charsets to be designated to each graphic register. */
1313 int r[4];
1314 unsigned char *dst = *dstp;
1315
1316 for (reg = 0; reg < 4; reg++)
1317 r[reg] = -1;
1318
1319 while (src < src_end && *src != '\n' && found < 4)
e0e989f6 1320 {
bdd9fb48
KH
1321 int bytes = BYTES_BY_CHAR_HEAD (*src);
1322
1323 if (NILP (table))
1324 charset = CHARSET_AT (src);
1325 else
e0e989f6 1326 {
bdd9fb48
KH
1327 int c_alt, c1, c2;
1328
1329 SPLIT_STRING(src, bytes, charset, c1, c2);
1330 if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1331 charset = CHAR_CHARSET (c_alt);
e0e989f6 1332 }
bdd9fb48 1333
e0e989f6 1334 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1ba9e4ab 1335 if (r[reg] == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
bdd9fb48
KH
1336 {
1337 found++;
1338 r[reg] = charset;
1339 }
1340
1341 src += bytes;
1342 }
1343
1344 if (found)
1345 {
1346 for (reg = 0; reg < 4; reg++)
1347 if (r[reg] >= 0
1348 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1349 ENCODE_DESIGNATION (r[reg], reg, coding);
1350 *dstp = dst;
e0e989f6 1351 }
e0e989f6
KH
1352}
1353
4ed46869
KH
1354/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1355
1356int
1357encode_coding_iso2022 (coding, source, destination,
1358 src_bytes, dst_bytes, consumed)
1359 struct coding_system *coding;
1360 unsigned char *source, *destination;
1361 int src_bytes, dst_bytes;
1362 int *consumed;
1363{
1364 unsigned char *src = source;
1365 unsigned char *src_end = source + src_bytes;
1366 unsigned char *dst = destination;
1367 unsigned char *dst_end = destination + dst_bytes;
e0e989f6 1368 /* Since the maximum bytes produced by each loop is 20, we subtract 19
4ed46869
KH
1369 from DST_END to assure overflow checking is necessary only at the
1370 head of loop. */
e0e989f6 1371 unsigned char *adjusted_dst_end = dst_end - 19;
bdd9fb48
KH
1372 Lisp_Object unification_table = coding->character_unification_table;
1373
1374 if (!NILP (Venable_character_unification) && NILP (unification_table))
1375 unification_table = Vstandard_character_unification_table_for_write;
4ed46869
KH
1376
1377 while (src < src_end && dst < adjusted_dst_end)
1378 {
1379 /* SRC_BASE remembers the start position in source in each loop.
1380 The loop will be exited when there's not enough source text
1381 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1382 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1383 reset to SRC_BASE before exiting. */
1384 unsigned char *src_base = src;
bdd9fb48 1385 int charset, c1, c2, c3, c4;
4ed46869 1386
e0e989f6
KH
1387 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1388 && CODING_SPEC_ISO_BOL (coding))
1389 {
bdd9fb48
KH
1390 /* We have to produce designation sequences if any now. */
1391 encode_designation_at_bol (coding, unification_table,
1392 src, src_end, &dst);
e0e989f6
KH
1393 CODING_SPEC_ISO_BOL (coding) = 0;
1394 }
1395
1396 c1 = *src++;
4ed46869
KH
1397 /* If we are seeing a component of a composite character, we are
1398 seeing a leading-code specially encoded for composition, or a
1399 composition rule if composing with rule. We must set C1
1400 to a normal leading-code or an ASCII code. If we are not at
1401 a composed character, we must reset the composition state. */
1402 if (COMPOSING_P (coding->composing))
1403 {
1404 if (c1 < 0xA0)
1405 {
1406 /* We are not in a composite character any longer. */
1407 coding->composing = COMPOSING_NO;
1408 ENCODE_COMPOSITION_END;
1409 }
1410 else
1411 {
1412 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1413 {
1414 *dst++ = c1 & 0x7F;
1415 coding->composing = COMPOSING_WITH_RULE_HEAD;
1416 continue;
1417 }
1418 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1419 coding->composing = COMPOSING_WITH_RULE_RULE;
1420 if (c1 == 0xA0)
1421 {
1422 /* This is an ASCII component. */
1423 ONE_MORE_BYTE (c1);
1424 c1 &= 0x7F;
1425 }
1426 else
1427 /* This is a leading-code of non ASCII component. */
1428 c1 -= 0x20;
1429 }
1430 }
1431
1432 /* Now encode one character. C1 is a control character, an
1433 ASCII character, or a leading-code of multi-byte character. */
1434 switch (emacs_code_class[c1])
1435 {
1436 case EMACS_ascii_code:
bdd9fb48 1437 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
4ed46869
KH
1438 break;
1439
1440 case EMACS_control_code:
1441 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1442 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1443 *dst++ = c1;
1444 break;
1445
1446 case EMACS_carriage_return_code:
1447 if (!coding->selective)
1448 {
1449 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
e0e989f6 1450 ENCODE_RESET_PLANE_AND_REGISTER;
4ed46869
KH
1451 *dst++ = c1;
1452 break;
1453 }
1454 /* fall down to treat '\r' as '\n' ... */
1455
1456 case EMACS_linefeed_code:
1457 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
e0e989f6
KH
1458 ENCODE_RESET_PLANE_AND_REGISTER;
1459 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1460 bcopy (coding->spec.iso2022.initial_designation,
1461 coding->spec.iso2022.current_designation,
1462 sizeof coding->spec.iso2022.initial_designation);
4ed46869 1463 if (coding->eol_type == CODING_EOL_LF
0ef69138 1464 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
1465 *dst++ = ISO_CODE_LF;
1466 else if (coding->eol_type == CODING_EOL_CRLF)
1467 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1468 else
1469 *dst++ = ISO_CODE_CR;
e0e989f6 1470 CODING_SPEC_ISO_BOL (coding) = 1;
4ed46869
KH
1471 break;
1472
1473 case EMACS_leading_code_2:
1474 ONE_MORE_BYTE (c2);
bdd9fb48 1475 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
4ed46869
KH
1476 break;
1477
1478 case EMACS_leading_code_3:
1479 TWO_MORE_BYTES (c2, c3);
1480 if (c1 < LEADING_CODE_PRIVATE_11)
bdd9fb48 1481 ENCODE_ISO_CHARACTER (c1, c2, c3);
4ed46869 1482 else
bdd9fb48 1483 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
4ed46869
KH
1484 break;
1485
1486 case EMACS_leading_code_4:
1487 THREE_MORE_BYTES (c2, c3, c4);
bdd9fb48 1488 ENCODE_ISO_CHARACTER (c2, c3, c4);
4ed46869
KH
1489 break;
1490
1491 case EMACS_leading_code_composition:
1492 ONE_MORE_BYTE (c1);
1493 if (c1 == 0xFF)
1494 {
1495 coding->composing = COMPOSING_WITH_RULE_HEAD;
1496 ENCODE_COMPOSITION_WITH_RULE_START;
1497 }
1498 else
1499 {
1500 /* Rewind one byte because it is a character code of
1501 composition elements. */
1502 src--;
1503 coding->composing = COMPOSING_NO_RULE_HEAD;
1504 ENCODE_COMPOSITION_NO_RULE_START;
1505 }
1506 break;
1507
1508 case EMACS_invalid_code:
1509 *dst++ = c1;
1510 break;
1511 }
1512 continue;
1513 label_end_of_loop:
1514 coding->carryover_size = src - src_base;
1515 bcopy (src_base, coding->carryover, coding->carryover_size);
4ed46869
KH
1516 break;
1517 }
1518
1519 /* If this is the last block of the text to be encoded, we must
bdd9fb48
KH
1520 reset graphic planes and registers to the initial state. */
1521 if (src >= src_end && coding->last_block)
4ed46869 1522 {
e0e989f6 1523 ENCODE_RESET_PLANE_AND_REGISTER;
bdd9fb48
KH
1524 if (coding->carryover_size > 0
1525 && coding->carryover_size < (dst_end - dst))
1526 {
1527 bcopy (coding->carryover, dst, coding->carryover_size);
1528 dst += coding->carryover_size;
1529 coding->carryover_size = 0;
1530 }
4ed46869
KH
1531 }
1532 *consumed = src - source;
1533 return dst - destination;
1534}
1535
1536\f
1537/*** 4. SJIS and BIG5 handlers ***/
1538
1539/* Although SJIS and BIG5 are not ISO coding systems, they are used
1540 quite widely. So, for the moment, Emacs supports them in the bare
1541 C code. But, in the future, they may be supported only by CCL. */
1542
1543/* SJIS is a coding system encoding three character sets: ASCII, right
1544 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
1545 as is. A character of charset katakana-jisx0201 is encoded by
1546 "position-code + 0x80". A character of charset japanese-jisx0208
1547 is encoded in 2-byte but two position-codes are divided and shifted
1548 so that they fit in the range below.
1549
1550 --- CODE RANGE of SJIS ---
1551 (character set) (range)
1552 ASCII 0x00 .. 0x7F
1553 KATAKANA-JISX0201 0xA0 .. 0xDF
1554 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1555 (2nd byte) 0x40 .. 0xFF
1556 -------------------------------
1557
1558*/
1559
1560/* BIG5 is a coding system encoding two character sets: ASCII and
1561 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1562 character set and is encoded in two-byte.
1563
1564 --- CODE RANGE of BIG5 ---
1565 (character set) (range)
1566 ASCII 0x00 .. 0x7F
1567 Big5 (1st byte) 0xA1 .. 0xFE
1568 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1569 --------------------------
1570
1571 Since the number of characters in Big5 is larger than maximum
1572 characters in Emacs' charset (96x96), it can't be handled as one
1573 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
1574 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
1575 contains frequently used characters and the latter contains less
1576 frequently used characters. */
1577
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Both macros number the characters of a charset consecutively and
   re-split that number: BIG5 has BIG5_SAME_ROW characters per 1st
   byte, Emacs' charsets have 94 (0xFF - 0xA1).  1st bytes below 0xC9
   belong to `charset_big5_1', the rest to `charset_big5_2'.

   Arguments may be arbitrary expressions, but they are evaluated more
   than once, so they must not have side effects.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Set CHARSET, C1 and C2 (all lvalues) from the BIG5 bytes B1, B2.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

/* Set the BIG5 bytes B1 and B2 (both lvalues) from CHARSET, C1, C2.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    /* The 2nd byte skips the gap 0x7F .. 0xA0.  */                     \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
1610
1611/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1612 Check if a text is encoded in SJIS. If it is, return
1613 CODING_CATEGORY_MASK_SJIS, else return 0. */
1614
1615int
1616detect_coding_sjis (src, src_end)
1617 unsigned char *src, *src_end;
1618{
1619 unsigned char c;
1620
1621 while (src < src_end)
1622 {
1623 c = *src++;
1624 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1625 return 0;
1626 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1627 {
1628 if (src < src_end && *src++ < 0x40)
1629 return 0;
1630 }
1631 }
1632 return CODING_CATEGORY_MASK_SJIS;
1633}
1634
1635/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1636 Check if a text is encoded in BIG5. If it is, return
1637 CODING_CATEGORY_MASK_BIG5, else return 0. */
1638
1639int
1640detect_coding_big5 (src, src_end)
1641 unsigned char *src, *src_end;
1642{
1643 unsigned char c;
1644
1645 while (src < src_end)
1646 {
1647 c = *src++;
1648 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1649 return 0;
1650 if (c >= 0xA1)
1651 {
1652 if (src >= src_end)
1653 break;
1654 c = *src++;
1655 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1656 return 0;
1657 }
1658 }
1659 return CODING_CATEGORY_MASK_BIG5;
1660}
1661
1662/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1663 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
1664
1665int
1666decode_coding_sjis_big5 (coding, source, destination,
1667 src_bytes, dst_bytes, consumed, sjis_p)
1668 struct coding_system *coding;
1669 unsigned char *source, *destination;
1670 int src_bytes, dst_bytes;
1671 int *consumed;
1672 int sjis_p;
1673{
1674 unsigned char *src = source;
1675 unsigned char *src_end = source + src_bytes;
1676 unsigned char *dst = destination;
1677 unsigned char *dst_end = destination + dst_bytes;
1678 /* Since the maximum bytes produced by each loop is 4, we subtract 3
1679 from DST_END to assure overflow checking is necessary only at the
1680 head of loop. */
1681 unsigned char *adjusted_dst_end = dst_end - 3;
1682
1683 while (src < src_end && dst < adjusted_dst_end)
1684 {
1685 /* SRC_BASE remembers the start position in source in each loop.
1686 The loop will be exited when there's not enough source text
1687 to analyze two-byte character (within macro ONE_MORE_BYTE).
1688 In that case, SRC is reset to SRC_BASE before exiting. */
1689 unsigned char *src_base = src;
1690 unsigned char c1 = *src++, c2, c3, c4;
1691
1692 if (c1 == '\r')
1693 {
1694 if (coding->eol_type == CODING_EOL_CRLF)
1695 {
1696 ONE_MORE_BYTE (c2);
1697 if (c2 == '\n')
1698 *dst++ = c2;
1699 else
1700 /* To process C2 again, SRC is subtracted by 1. */
1701 *dst++ = c1, src--;
1702 }
1703 else
1704 *dst++ = c1;
1705 }
1706 else if (c1 < 0x80)
1707 *dst++ = c1;
1708 else if (c1 < 0xA0 || c1 >= 0xE0)
1709 {
1710 /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1711 if (sjis_p)
1712 {
1713 ONE_MORE_BYTE (c2);
1714 DECODE_SJIS (c1, c2, c3, c4);
1715 DECODE_CHARACTER_DIMENSION2 (charset_jisx0208, c3, c4);
1716 }
1717 else if (c1 >= 0xE0 && c1 < 0xFF)
1718 {
1719 int charset;
1720
1721 ONE_MORE_BYTE (c2);
1722 DECODE_BIG5 (c1, c2, charset, c3, c4);
1723 DECODE_CHARACTER_DIMENSION2 (charset, c3, c4);
1724 }
1725 else /* Invalid code */
1726 *dst++ = c1;
1727 }
1728 else
1729 {
1730 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1731 if (sjis_p)
1732 DECODE_CHARACTER_DIMENSION1 (charset_katakana_jisx0201, c1);
1733 else
1734 {
1735 int charset;
1736
1737 ONE_MORE_BYTE (c2);
1738 DECODE_BIG5 (c1, c2, charset, c3, c4);
1739 DECODE_CHARACTER_DIMENSION2 (charset, c3, c4);
1740 }
1741 }
1742 continue;
1743
1744 label_end_of_loop:
1745 coding->carryover_size = src - src_base;
1746 bcopy (src_base, coding->carryover, coding->carryover_size);
1747 src = src_base;
1748 break;
1749 }
1750
1751 *consumed = src - source;
1752 return dst - destination;
1753}
1754
1755/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1756 This function can encode `charset_ascii', `charset_katakana_jisx0201',
1757 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
1758 sure that all these charsets are registered as official charset
1759 (i.e. do not have extended leading-codes). Characters of other
1760 charsets are produced without any encoding. If SJIS_P is 1, encode
1761 SJIS text, else encode BIG5 text. */
1762
1763int
1764encode_coding_sjis_big5 (coding, source, destination,
1765 src_bytes, dst_bytes, consumed, sjis_p)
1766 struct coding_system *coding;
1767 unsigned char *source, *destination;
1768 int src_bytes, dst_bytes;
1769 int *consumed;
1770 int sjis_p;
1771{
1772 unsigned char *src = source;
1773 unsigned char *src_end = source + src_bytes;
1774 unsigned char *dst = destination;
1775 unsigned char *dst_end = destination + dst_bytes;
1776 /* Since the maximum bytes produced by each loop is 2, we subtract 1
1777 from DST_END to assure overflow checking is necessary only at the
1778 head of loop. */
1779 unsigned char *adjusted_dst_end = dst_end - 1;
1780
1781 while (src < src_end && dst < adjusted_dst_end)
1782 {
1783 /* SRC_BASE remembers the start position in source in each loop.
1784 The loop will be exited when there's not enough source text
1785 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1786 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
1787 before exiting. */
1788 unsigned char *src_base = src;
1789 unsigned char c1 = *src++, c2, c3, c4;
1790
1791 if (coding->composing)
1792 {
1793 if (c1 == 0xA0)
1794 {
1795 ONE_MORE_BYTE (c1);
1796 c1 &= 0x7F;
1797 }
1798 else if (c1 >= 0xA0)
1799 c1 -= 0x20;
1800 else
1801 coding->composing = 0;
1802 }
1803
1804 switch (emacs_code_class[c1])
1805 {
1806 case EMACS_ascii_code:
1807 case EMACS_control_code:
1808 *dst++ = c1;
1809 break;
1810
1811 case EMACS_carriage_return_code:
1812 if (!coding->selective)
1813 {
1814 *dst++ = c1;
1815 break;
1816 }
1817 /* fall down to treat '\r' as '\n' ... */
1818
1819 case EMACS_linefeed_code:
1820 if (coding->eol_type == CODING_EOL_LF
0ef69138 1821 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
1822 *dst++ = '\n';
1823 else if (coding->eol_type == CODING_EOL_CRLF)
1824 *dst++ = '\r', *dst++ = '\n';
1825 else
1826 *dst++ = '\r';
1827 break;
1828
1829 case EMACS_leading_code_2:
1830 ONE_MORE_BYTE (c2);
1831 if (sjis_p && c1 == charset_katakana_jisx0201)
1832 *dst++ = c2;
1833 else
1834 *dst++ = c1, *dst++ = c2;
1835 break;
1836
1837 case EMACS_leading_code_3:
1838 TWO_MORE_BYTES (c2, c3);
1839 c2 &= 0x7F, c3 &= 0x7F;
1840 if (sjis_p && c1 == charset_jisx0208)
1841 {
1842 unsigned char s1, s2;
1843
1844 ENCODE_SJIS (c2, c3, s1, s2);
1845 *dst++ = s1, *dst++ = s2;
1846 }
1847 else if (!sjis_p && (c1 == charset_big5_1 || c1 == charset_big5_2))
1848 {
1849 unsigned char b1, b2;
1850
1851 ENCODE_BIG5 (c1, c2, c3, b1, b2);
1852 *dst++ = b1, *dst++ = b2;
1853 }
1854 else
1855 *dst++ = c1, *dst++ = c2, *dst++ = c3;
1856 break;
1857
1858 case EMACS_leading_code_4:
1859 THREE_MORE_BYTES (c2, c3, c4);
1860 *dst++ = c1, *dst++ = c2, *dst++ = c3, *dst++ = c4;
1861 break;
1862
1863 case EMACS_leading_code_composition:
1864 coding->composing = 1;
1865 break;
1866
1867 default: /* i.e. case EMACS_invalid_code: */
1868 *dst++ = c1;
1869 }
1870 continue;
1871
1872 label_end_of_loop:
1873 coding->carryover_size = src - src_base;
1874 bcopy (src_base, coding->carryover, coding->carryover_size);
1875 src = src_base;
1876 break;
1877 }
1878
1879 *consumed = src - source;
1880 return dst - destination;
1881}
1882
1883\f
1884/*** 5. End-of-line handlers ***/
1885
1886/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1887 This function is called only when `coding->eol_type' is
1888 CODING_EOL_CRLF or CODING_EOL_CR. */
1889
1890decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1891 struct coding_system *coding;
1892 unsigned char *source, *destination;
1893 int src_bytes, dst_bytes;
1894 int *consumed;
1895{
1896 unsigned char *src = source;
1897 unsigned char *src_end = source + src_bytes;
1898 unsigned char *dst = destination;
1899 unsigned char *dst_end = destination + dst_bytes;
1900 int produced;
1901
1902 switch (coding->eol_type)
1903 {
1904 case CODING_EOL_CRLF:
1905 {
1906 /* Since the maximum bytes produced by each loop is 2, we
1907 subtract 1 from DST_END to assure overflow checking is
1908 necessary only at the head of loop. */
1909 unsigned char *adjusted_dst_end = dst_end - 1;
1910
1911 while (src < src_end && dst < adjusted_dst_end)
1912 {
1913 unsigned char *src_base = src;
1914 unsigned char c = *src++;
1915 if (c == '\r')
1916 {
1917 ONE_MORE_BYTE (c);
1918 if (c != '\n')
1919 *dst++ = '\r';
bfd99048 1920 *dst++ = c;
4ed46869
KH
1921 }
1922 else
1923 *dst++ = c;
1924 continue;
1925
1926 label_end_of_loop:
1927 coding->carryover_size = src - src_base;
1928 bcopy (src_base, coding->carryover, coding->carryover_size);
1929 src = src_base;
1930 break;
1931 }
1932 *consumed = src - source;
1933 produced = dst - destination;
1934 break;
1935 }
1936
1937 case CODING_EOL_CR:
1938 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1939 bcopy (source, destination, produced);
1940 dst_end = destination + produced;
1941 while (dst < dst_end)
1942 if (*dst++ == '\r') dst[-1] = '\n';
1943 *consumed = produced;
1944 break;
1945
1946 default: /* i.e. case: CODING_EOL_LF */
1947 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1948 bcopy (source, destination, produced);
1949 *consumed = produced;
1950 break;
1951 }
1952
1953 return produced;
1954}
1955
1956/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
1957 format of end-of-line according to `coding->eol_type'. If
1958 `coding->selective' is 1, code '\r' in source text also means
1959 end-of-line. */
1960
1961encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1962 struct coding_system *coding;
1963 unsigned char *source, *destination;
1964 int src_bytes, dst_bytes;
1965 int *consumed;
1966{
1967 unsigned char *src = source;
1968 unsigned char *dst = destination;
1969 int produced;
1970
1971 if (src_bytes <= 0)
1972 return 0;
1973
1974 switch (coding->eol_type)
1975 {
1976 case CODING_EOL_LF:
0ef69138 1977 case CODING_EOL_UNDECIDED:
4ed46869
KH
1978 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1979 bcopy (source, destination, produced);
1980 if (coding->selective)
1981 {
1982 int i = produced;
1983 while (i--)
1984 if (*dst++ == '\r') dst[-1] = '\n';
1985 }
1986 *consumed = produced;
1987
1988 case CODING_EOL_CRLF:
1989 {
1990 unsigned char c;
1991 unsigned char *src_end = source + src_bytes;
1992 unsigned char *dst_end = destination + dst_bytes;
1993 /* Since the maximum bytes produced by each loop is 2, we
1994 subtract 1 from DST_END to assure overflow checking is
1995 necessary only at the head of loop. */
1996 unsigned char *adjusted_dst_end = dst_end - 1;
1997
1998 while (src < src_end && dst < adjusted_dst_end)
1999 {
2000 c = *src++;
2001 if (c == '\n' || (c == '\r' && coding->selective))
2002 *dst++ = '\r', *dst++ = '\n';
2003 else
2004 *dst++ = c;
2005 }
2006 produced = dst - destination;
2007 *consumed = src - source;
2008 break;
2009 }
2010
2011 default: /* i.e. case CODING_EOL_CR: */
2012 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2013 bcopy (source, destination, produced);
2014 {
2015 int i = produced;
2016 while (i--)
2017 if (*dst++ == '\n') dst[-1] = '\r';
2018 }
2019 *consumed = produced;
2020 }
2021
2022 return produced;
2023}
2024
2025\f
2026/*** 6. C library functions ***/
2027
2028/* In Emacs Lisp, coding system is represented by a Lisp symbol which
2029 has a property `coding-system'. The value of this property is a
2030 vector of length 5 (called as coding-vector). Among elements of
2031 this vector, the first (element[0]) and the fifth (element[4])
2032 carry important information for decoding/encoding. Before
2033 decoding/encoding, this information should be set in fields of a
2034 structure of type `coding_system'.
2035
2036 A value of property `coding-system' can be a symbol of another
2037 subsidiary coding-system. In that case, Emacs gets coding-vector
2038 from that symbol.
2039
2040 `element[0]' contains information to be set in `coding->type'. The
2041 value and its meaning is as follows:
2042
0ef69138
KH
2043 0 -- coding_type_emacs_mule
2044 1 -- coding_type_sjis
2045 2 -- coding_type_iso2022
2046 3 -- coding_type_big5
2047 4 -- coding_type_ccl encoder/decoder written in CCL
2048 nil -- coding_type_no_conversion
2049 t -- coding_type_undecided (automatic conversion on decoding,
2050 no-conversion on encoding)
4ed46869
KH
2051
2052 `element[4]' contains information to be set in `coding->flags' and
2053 `coding->spec'. The meaning varies by `coding->type'.
2054
2055 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2056 of length 32 (of which the first 15 sub-elements are used now).
2057 Meanings of these sub-elements are:
2058
2059 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2060 If the value is an integer of valid charset, the charset is
2061 assumed to be designated to graphic register N initially.
2062
2063 If the value is minus, it is a minus value of charset which
2064 reserves graphic register N, which means that the charset is
2065 not designated initially but should be designated to graphic
2066 register N just before encoding a character in that charset.
2067
2068 If the value is nil, graphic register N is never used on
2069 encoding.
2070
2071 sub-element[N] where N is 4 through 14: to be set in `coding->flags'
2072 Each value takes t or nil. See the section ISO2022 of
2073 `coding.h' for more information.
2074
2075 If `coding->type' is `coding_type_big5', element[4] is t to denote
2076 BIG5-ETen or nil to denote BIG5-HKU.
2077
2078 If `coding->type' takes the other value, element[4] is ignored.
2079
2080 Emacs Lisp's coding system also carries information about format of
2081 end-of-line in a value of property `eol-type'. If the value is
2082 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2083 means CODING_EOL_CR. If it is not integer, it should be a vector
2084 of subsidiary coding systems of which property `eol-type' has one
2085 of above values.
2086
2087*/
2088
2089/* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2090 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2091 is setup so that no conversion is necessary and return -1, else
2092 return 0. */
2093
2094int
e0e989f6
KH
2095setup_coding_system (coding_system, coding)
2096 Lisp_Object coding_system;
4ed46869
KH
2097 struct coding_system *coding;
2098{
4ed46869
KH
2099 Lisp_Object type, eol_type;
2100
2101 /* At first, set several fields default values. */
2102 coding->require_flushing = 0;
2103 coding->last_block = 0;
2104 coding->selective = 0;
2105 coding->composing = 0;
2106 coding->direction = 0;
2107 coding->carryover_size = 0;
4ed46869 2108 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
bdd9fb48
KH
2109 /* We have not yet implemented a way to specify unification table in
2110 a coding system. */
2111 coding->character_unification_table = Qnil;
4ed46869 2112
e0e989f6
KH
2113 Vlast_coding_system_used = coding->symbol = coding_system;
2114 eol_type = Qnil;
2115 /* Get value of property `coding-system' until we get a vector.
2116 While doing that, also get values of properties
2117 `post-read-conversion', `pre-write-conversion', and `eol-type'. */
2118 while (!NILP (coding_system) && SYMBOLP (coding_system))
4ed46869 2119 {
4ed46869 2120 if (NILP (coding->post_read_conversion))
e0e989f6 2121 coding->post_read_conversion = Fget (coding_system,
4ed46869 2122 Qpost_read_conversion);
e0e989f6
KH
2123 if (NILP (coding->pre_write_conversion))
2124 coding->pre_write_conversion = Fget (coding_system,
4ed46869 2125 Qpre_write_conversion);
e0e989f6
KH
2126 if (NILP (eol_type))
2127 eol_type = Fget (coding_system, Qeol_type);
2128 coding_system = Fget (coding_system, Qcoding_system);
4ed46869 2129 }
e0e989f6
KH
2130 if (!VECTORP (coding_system)
2131 || XVECTOR (coding_system)->size != 5)
4ed46869
KH
2132 goto label_invalid_coding_system;
2133
4ed46869 2134 if (VECTORP (eol_type))
0ef69138 2135 coding->eol_type = CODING_EOL_UNDECIDED;
4ed46869
KH
2136 else if (XFASTINT (eol_type) == 1)
2137 coding->eol_type = CODING_EOL_CRLF;
2138 else if (XFASTINT (eol_type) == 2)
2139 coding->eol_type = CODING_EOL_CR;
2140 else
2141 coding->eol_type = CODING_EOL_LF;
2142
e0e989f6 2143 type = XVECTOR (coding_system)->contents[0];
4ed46869
KH
2144 switch (XFASTINT (type))
2145 {
2146 case 0:
0ef69138 2147 coding->type = coding_type_emacs_mule;
4ed46869
KH
2148 break;
2149
2150 case 1:
2151 coding->type = coding_type_sjis;
2152 break;
2153
2154 case 2:
2155 coding->type = coding_type_iso2022;
2156 {
e0e989f6 2157 Lisp_Object val = XVECTOR (coding_system)->contents[4];
4ed46869
KH
2158 Lisp_Object *flags;
2159 int i, charset, default_reg_bits = 0;
2160
2161 if (!VECTORP (val) || XVECTOR (val)->size != 32)
2162 goto label_invalid_coding_system;
2163
2164 flags = XVECTOR (val)->contents;
2165 coding->flags
2166 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2167 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2168 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2169 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2170 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2171 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2172 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2173 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
e0e989f6
KH
2174 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2175 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2176 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL));
4ed46869
KH
2177
2178 /* Invoke graphic register 0 to plane 0. */
2179 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2180 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2181 CODING_SPEC_ISO_INVOCATION (coding, 1)
2182 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2183 /* Not single shifting at first. */
2184 CODING_SPEC_ISO_SINGLE_SHIFTING(coding) = 0;
e0e989f6
KH
2185 /* Beginning of buffer should also be regarded as bol. */
2186 CODING_SPEC_ISO_BOL(coding) = 1;
4ed46869
KH
2187
2188 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2189 FLAGS[REG] can be one of below:
2190 integer CHARSET: CHARSET occupies register I,
2191 t: designate nothing to REG initially, but can be used
2192 by any charsets,
2193 list of integer, nil, or t: designate the first
2194 element (if integer) to REG initially, the remaining
2195 elements (if integer) is designated to REG on request,
2196 if an element is t, REG can be used by any charset,
2197 nil: REG is never used. */
467e7675 2198 for (charset = 0; charset <= MAX_CHARSET; charset++)
1ba9e4ab
KH
2199 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2200 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
4ed46869
KH
2201 for (i = 0; i < 4; i++)
2202 {
2203 if (INTEGERP (flags[i])
e0e989f6
KH
2204 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2205 || (charset = get_charset_id (flags[i])) >= 0)
4ed46869
KH
2206 {
2207 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2208 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2209 }
2210 else if (EQ (flags[i], Qt))
2211 {
2212 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2213 default_reg_bits |= 1 << i;
2214 }
2215 else if (CONSP (flags[i]))
2216 {
2217 Lisp_Object tail = flags[i];
2218
2219 if (INTEGERP (XCONS (tail)->car)
2220 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
2221 CHARSET_VALID_P (charset))
2222 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
4ed46869
KH
2223 {
2224 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2225 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2226 }
2227 else
2228 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2229 tail = XCONS (tail)->cdr;
2230 while (CONSP (tail))
2231 {
2232 if (INTEGERP (XCONS (tail)->car)
2233 && (charset = XINT (XCONS (tail)->car),
e0e989f6
KH
2234 CHARSET_VALID_P (charset))
2235 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
4ed46869
KH
2236 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2237 = i;
2238 else if (EQ (XCONS (tail)->car, Qt))
2239 default_reg_bits |= 1 << i;
2240 tail = XCONS (tail)->cdr;
2241 }
2242 }
2243 else
2244 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2245
2246 CODING_SPEC_ISO_DESIGNATION (coding, i)
2247 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2248 }
2249
2250 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2251 {
2252 /* REG 1 can be used only by locking shift in 7-bit env. */
2253 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2254 default_reg_bits &= ~2;
2255 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2256 /* Without any shifting, only REG 0 and 1 can be used. */
2257 default_reg_bits &= 3;
2258 }
2259
467e7675 2260 for (charset = 0; charset <= MAX_CHARSET; charset++)
4ed46869 2261 if (CHARSET_VALID_P (charset)
1ba9e4ab
KH
2262 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2263 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
4ed46869
KH
2264 {
2265 /* We have not yet decided where to designate CHARSET. */
2266 int reg_bits = default_reg_bits;
2267
2268 if (CHARSET_CHARS (charset) == 96)
2269 /* A charset of CHARS96 can't be designated to REG 0. */
2270 reg_bits &= ~1;
2271
2272 if (reg_bits)
2273 /* There exist some default graphic register. */
2274 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2275 = (reg_bits & 1
2276 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2277 else
2278 /* We anyway have to designate CHARSET to somewhere. */
2279 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2280 = (CHARSET_CHARS (charset) == 94
2281 ? 0
2282 : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2283 || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2284 ? 1
2285 : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2286 ? 2 : 0)));
2287 }
2288 }
2289 coding->require_flushing = 1;
2290 break;
2291
2292 case 3:
2293 coding->type = coding_type_big5;
2294 coding->flags
e0e989f6 2295 = (NILP (XVECTOR (coding_system)->contents[4])
4ed46869
KH
2296 ? CODING_FLAG_BIG5_HKU
2297 : CODING_FLAG_BIG5_ETEN);
2298 break;
2299
2300 case 4:
2301 coding->type = coding_type_ccl;
2302 {
e0e989f6 2303 Lisp_Object val = XVECTOR (coding_system)->contents[4];
4ed46869
KH
2304 if (CONSP (val)
2305 && VECTORP (XCONS (val)->car)
2306 && VECTORP (XCONS (val)->cdr))
2307 {
2308 setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2309 setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2310 }
2311 else
2312 goto label_invalid_coding_system;
2313 }
2314 coding->require_flushing = 1;
2315 break;
2316
2317 default:
2318 if (EQ (type, Qt))
0ef69138 2319 coding->type = coding_type_undecided;
4ed46869
KH
2320 else
2321 coding->type = coding_type_no_conversion;
2322 break;
2323 }
2324 return 0;
2325
2326 label_invalid_coding_system:
2327 coding->type = coding_type_no_conversion;
dec137e5 2328 coding->eol_type = CODING_EOL_LF;
e0e989f6
KH
2329 coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2330 = Qnil;
4ed46869
KH
2331 return -1;
2332}
2333
2334/* Emacs has a mechanism to automatically detect a coding system if it
2335 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
2336 it's impossible to distinguish some coding systems accurately
2337 because they use the same range of codes. So, at first, coding
2338 systems are categorized into 8, those are:
2339
0ef69138 2340 o coding-category-emacs-mule
4ed46869
KH
2341
2342 The category for a coding system which has the same code range
2343 as Emacs' internal format. Assigned the coding-system (Lisp
0ef69138 2344 symbol) `emacs-mule' by default.
4ed46869
KH
2345
2346 o coding-category-sjis
2347
2348 The category for a coding system which has the same code range
2349 as SJIS. Assigned the coding-system (Lisp
e0e989f6 2350 symbol) `shift-jis' by default.
4ed46869
KH
2351
2352 o coding-category-iso-7
2353
2354 The category for a coding system which has the same code range
2355 as ISO2022 of 7-bit environment. Assigned the coding-system
e0e989f6 2356 (Lisp symbol) `iso-2022-7' by default.
4ed46869
KH
2357
2358 o coding-category-iso-8-1
2359
2360 The category for a coding system which has the same code range
2361 as ISO2022 of 8-bit environment and graphic plane 1 used only
2362 for DIMENSION1 charset. Assigned the coding-system (Lisp
e0e989f6 2363 symbol) `iso-8859-1' by default.
4ed46869
KH
2364
2365 o coding-category-iso-8-2
2366
2367 The category for a coding system which has the same code range
2368 as ISO2022 of 8-bit environment and graphic plane 1 used only
2369 for DIMENSION2 charset. Assigned the coding-system (Lisp
e0e989f6 2370 symbol) `euc-japan' by default.
4ed46869
KH
2371
2372 o coding-category-iso-else
2373
2374 The category for a coding system which has the same code range
2375 as ISO2022 but not belongs to any of the above three
2376 categories. Assigned the coding-system (Lisp symbol)
e0e989f6 2377 `iso-2022-ss2-7' by default.
4ed46869
KH
2378
2379 o coding-category-big5
2380
2381 The category for a coding system which has the same code range
2382 as BIG5. Assigned the coding-system (Lisp symbol)
e0e989f6 2383 `cn-big5' by default.
4ed46869
KH
2384
2385 o coding-category-binary
2386
2387 The category for a coding system not categorized in any of the
2388 above. Assigned the coding-system (Lisp symbol)
e0e989f6 2389 `no-conversion' by default.
4ed46869
KH
2390
2391 Each of them is a Lisp symbol and the value is an actual
2392 `coding-system's (this is also a Lisp symbol) assigned by a user.
2393 What Emacs does actually is to detect a category of coding system.
2394 Then, it uses a `coding-system' assigned to it. If Emacs can't
2395 decide only one possible category, it selects a category of the
2396 highest priority. Priorities of categories are also specified by a
2397 user in a Lisp variable `coding-category-list'.
2398
2399*/
2400
2401/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2402 If it detects possible coding systems, return an integer in which
2403 appropriate flag bits are set. Flag bits are defined by macros
2404 CODING_CATEGORY_MASK_XXX in `coding.h'. */
2405
2406int
2407detect_coding_mask (src, src_bytes)
2408 unsigned char *src;
2409 int src_bytes;
2410{
2411 register unsigned char c;
2412 unsigned char *src_end = src + src_bytes;
2413 int mask;
2414
2415 /* At first, skip all ASCII characters and control characters except
2416 for three ISO2022 specific control characters. */
bcf26d6a 2417 label_loop_detect_coding:
4ed46869
KH
2418 while (src < src_end)
2419 {
2420 c = *src;
2421 if (c >= 0x80
2422 || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2423 break;
2424 src++;
2425 }
2426
2427 if (src >= src_end)
2428 /* We found nothing other than ASCII. There's nothing to do. */
2429 return CODING_CATEGORY_MASK_ANY;
2430
2431 /* The text seems to be encoded in some multilingual coding system.
2432 Now, try to find in which coding system the text is encoded. */
2433 if (c < 0x80)
bcf26d6a
KH
2434 {
2435 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2436 /* C is an ISO2022 specific control code of C0. */
2437 mask = detect_coding_iso2022 (src, src_end);
2438 src++;
2439 if (mask == CODING_CATEGORY_MASK_ANY)
2440 /* No valid ISO2022 code follows C. Try again. */
2441 goto label_loop_detect_coding;
2442 }
4ed46869
KH
2443 else if (c == ISO_CODE_SS2 || c == ISO_CODE_SS3 || c == ISO_CODE_CSI)
2444 /* C is an ISO2022 specific control code of C1,
2445 or the first byte of SJIS's 2-byte character code,
2446 or a leading code of Emacs. */
2447 mask = (detect_coding_iso2022 (src, src_end)
2448 | detect_coding_sjis (src, src_end)
0ef69138 2449 | detect_coding_emacs_mule (src, src_end));
4ed46869
KH
2450
2451 else if (c < 0xA0)
2452 /* C is the first byte of SJIS character code,
2453 or a leading-code of Emacs. */
2454 mask = (detect_coding_sjis (src, src_end)
0ef69138 2455 | detect_coding_emacs_mule (src, src_end));
4ed46869
KH
2456
2457 else
2458 /* C is a character of ISO2022 in graphic plane right,
2459 or a SJIS's 1-byte character code (i.e. JISX0201),
2460 or the first byte of BIG5's 2-byte code. */
2461 mask = (detect_coding_iso2022 (src, src_end)
2462 | detect_coding_sjis (src, src_end)
2463 | detect_coding_big5 (src, src_end));
2464
2465 return mask;
2466}
2467
2468/* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2469 The information of the detected coding system is set in CODING. */
2470
2471void
2472detect_coding (coding, src, src_bytes)
2473 struct coding_system *coding;
2474 unsigned char *src;
2475 int src_bytes;
2476{
2477 int mask = detect_coding_mask (src, src_bytes);
2478 int idx;
2479
2480 if (mask == CODING_CATEGORY_MASK_ANY)
2481 /* We found nothing other than ASCII. There's nothing to do. */
2482 return;
2483
2484 if (!mask)
2485 /* The source text seems to be encoded in unknown coding system.
2486 Emacs regards the category of such a kind of coding system as
2487 `coding-category-binary'. We assume that a user has assigned
2488 an appropriate coding system for a `coding-category-binary'. */
2489 idx = CODING_CATEGORY_IDX_BINARY;
2490 else
2491 {
2492 /* We found some plausible coding systems. Let's use a coding
2493 system of the highest priority. */
2494 Lisp_Object val = Vcoding_category_list;
2495
2496 if (CONSP (val))
2497 while (!NILP (val))
2498 {
2499 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2500 if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2501 break;
2502 val = XCONS (val)->cdr;
2503 }
2504 else
2505 val = Qnil;
2506
2507 if (NILP (val))
2508 {
2509 /* For unknown reason, `Vcoding_category_list' contains none
2510 of found categories. Let's use any of them. */
2511 for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2512 if (mask & (1 << idx))
2513 break;
2514 }
2515 }
2516 setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2517}
2518
2519/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2520 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
0ef69138 2521 CODING_EOL_CR, and CODING_EOL_UNDECIDED. */
4ed46869
KH
2522
2523int
2524detect_eol_type (src, src_bytes)
2525 unsigned char *src;
2526 int src_bytes;
2527{
2528 unsigned char *src_end = src + src_bytes;
2529 unsigned char c;
2530
2531 while (src < src_end)
2532 {
2533 c = *src++;
2534 if (c == '\n')
2535 return CODING_EOL_LF;
2536 else if (c == '\r')
2537 {
2538 if (src < src_end && *src == '\n')
2539 return CODING_EOL_CRLF;
2540 else
2541 return CODING_EOL_CR;
2542 }
2543 }
0ef69138 2544 return CODING_EOL_UNDECIDED;
4ed46869
KH
2545}
2546
2547/* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2548 is encoded. If it detects an appropriate format of end-of-line, it
2549 sets the information in *CODING. */
2550
2551void
2552detect_eol (coding, src, src_bytes)
2553 struct coding_system *coding;
2554 unsigned char *src;
2555 int src_bytes;
2556{
2557 Lisp_Object val;
2558 int eol_type = detect_eol_type (src, src_bytes);
2559
0ef69138 2560 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2561 /* We found no end-of-line in the source text. */
2562 return;
2563
2564 val = Fget (coding->symbol, Qeol_type);
2565 if (VECTORP (val) && XVECTOR (val)->size == 3)
2566 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2567}
2568
2569/* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
2570 decoding, it may detect coding system and format of end-of-line if
2571 those are not yet decided. */
2572
2573int
2574decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2575 struct coding_system *coding;
2576 unsigned char *source, *destination;
2577 int src_bytes, dst_bytes;
2578 int *consumed;
2579{
2580 int produced;
2581
2582 if (src_bytes <= 0)
2583 {
2584 *consumed = 0;
2585 return 0;
2586 }
2587
0ef69138 2588 if (coding->type == coding_type_undecided)
4ed46869
KH
2589 detect_coding (coding, source, src_bytes);
2590
0ef69138 2591 if (coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2592 detect_eol (coding, source, src_bytes);
2593
2594 coding->carryover_size = 0;
2595 switch (coding->type)
2596 {
2597 case coding_type_no_conversion:
2598 label_no_conversion:
2599 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2600 bcopy (source, destination, produced);
2601 *consumed = produced;
2602 break;
2603
0ef69138
KH
2604 case coding_type_emacs_mule:
2605 case coding_type_undecided:
4ed46869 2606 if (coding->eol_type == CODING_EOL_LF
0ef69138 2607 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2608 goto label_no_conversion;
2609 produced = decode_eol (coding, source, destination,
2610 src_bytes, dst_bytes, consumed);
2611 break;
2612
2613 case coding_type_sjis:
2614 produced = decode_coding_sjis_big5 (coding, source, destination,
2615 src_bytes, dst_bytes, consumed,
2616 1);
2617 break;
2618
2619 case coding_type_iso2022:
2620 produced = decode_coding_iso2022 (coding, source, destination,
2621 src_bytes, dst_bytes, consumed);
2622 break;
2623
2624 case coding_type_big5:
2625 produced = decode_coding_sjis_big5 (coding, source, destination,
2626 src_bytes, dst_bytes, consumed,
2627 0);
2628 break;
2629
2630 case coding_type_ccl:
2631 produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2632 src_bytes, dst_bytes, consumed);
2633 break;
2634 }
2635
2636 return produced;
2637}
2638
2639/* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
2640
2641int
2642encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2643 struct coding_system *coding;
2644 unsigned char *source, *destination;
2645 int src_bytes, dst_bytes;
2646 int *consumed;
2647{
2648 int produced;
2649
2650 coding->carryover_size = 0;
2651 switch (coding->type)
2652 {
2653 case coding_type_no_conversion:
2654 label_no_conversion:
2655 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2656 if (produced > 0)
2657 {
2658 bcopy (source, destination, produced);
2659 if (coding->selective)
2660 {
2661 unsigned char *p = destination, *pend = destination + produced;
2662 while (p < pend)
e0e989f6 2663 if (*p++ == '\015') p[-1] = '\n';
4ed46869
KH
2664 }
2665 }
2666 *consumed = produced;
2667 break;
2668
0ef69138
KH
2669 case coding_type_emacs_mule:
2670 case coding_type_undecided:
4ed46869 2671 if (coding->eol_type == CODING_EOL_LF
0ef69138 2672 || coding->eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2673 goto label_no_conversion;
2674 produced = encode_eol (coding, source, destination,
2675 src_bytes, dst_bytes, consumed);
2676 break;
2677
2678 case coding_type_sjis:
2679 produced = encode_coding_sjis_big5 (coding, source, destination,
2680 src_bytes, dst_bytes, consumed,
2681 1);
2682 break;
2683
2684 case coding_type_iso2022:
2685 produced = encode_coding_iso2022 (coding, source, destination,
2686 src_bytes, dst_bytes, consumed);
2687 break;
2688
2689 case coding_type_big5:
2690 produced = encode_coding_sjis_big5 (coding, source, destination,
2691 src_bytes, dst_bytes, consumed,
2692 0);
2693 break;
2694
2695 case coding_type_ccl:
2696 produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2697 src_bytes, dst_bytes, consumed);
2698 break;
2699 }
2700
2701 return produced;
2702}
2703
2704#define CONVERSION_BUFFER_EXTRA_ROOM 256
2705
2706/* Return maximum size (bytes) of a buffer enough for decoding
2707 SRC_BYTES of text encoded in CODING. */
2708
2709int
2710decoding_buffer_size (coding, src_bytes)
2711 struct coding_system *coding;
2712 int src_bytes;
2713{
2714 int magnification;
2715
2716 if (coding->type == coding_type_iso2022)
2717 magnification = 3;
2718 else if (coding->type == coding_type_ccl)
2719 magnification = coding->spec.ccl.decoder.buf_magnification;
2720 else
2721 magnification = 2;
2722
2723 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2724}
2725
2726/* Return maximum size (bytes) of a buffer enough for encoding
2727 SRC_BYTES of text to CODING. */
2728
2729int
2730encoding_buffer_size (coding, src_bytes)
2731 struct coding_system *coding;
2732 int src_bytes;
2733{
2734 int magnification;
2735
2736 if (coding->type == coding_type_ccl)
2737 magnification = coding->spec.ccl.encoder.buf_magnification;
2738 else
2739 magnification = 3;
2740
2741 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2742}
2743
2744#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
2745#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
2746#endif
2747
2748char *conversion_buffer;
2749int conversion_buffer_size;
2750
2751/* Return a pointer to a SIZE bytes of buffer to be used for encoding
2752 or decoding. Sufficient memory is allocated automatically. If we
2753 run out of memory, return NULL. */
2754
2755char *
2756get_conversion_buffer (size)
2757 int size;
2758{
2759 if (size > conversion_buffer_size)
2760 {
2761 char *buf;
2762 int real_size = conversion_buffer_size * 2;
2763
2764 while (real_size < size) real_size *= 2;
2765 buf = (char *) xmalloc (real_size);
2766 xfree (conversion_buffer);
2767 conversion_buffer = buf;
2768 conversion_buffer_size = real_size;
2769 }
2770 return conversion_buffer;
2771}
2772
2773\f
2774#ifdef emacs
2775/*** 7. Emacs Lisp library functions ***/
2776
02ba4723 2777DEFUN ("coding-system-spec", Fcoding_system_spec, Scoding_system_spec,
4ed46869 2778 1, 1, 0,
02ba4723 2779 "Return coding-spec of CODING-SYSTEM.\n\
4ed46869
KH
2780If CODING-SYSTEM is not a valid coding-system, return nil.")
2781 (obj)
2782 Lisp_Object obj;
2783{
2784 while (SYMBOLP (obj) && !NILP (obj))
2785 obj = Fget (obj, Qcoding_system);
2786 return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
2787 ? Qnil : obj);
2788}
2789
2790DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
2791 "Return t if OBJECT is nil or a coding-system.\n\
2792See document of make-coding-system for coding-system object.")
2793 (obj)
2794 Lisp_Object obj;
2795{
02ba4723 2796 return ((NILP (obj) || !NILP (Fcoding_system_spec (obj))) ? Qt : Qnil);
4ed46869
KH
2797}
2798
9d991de8
RS
2799DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
2800 Sread_non_nil_coding_system, 1, 1, 0,
e0e989f6 2801 "Read a coding system from the minibuffer, prompting with string PROMPT.")
4ed46869
KH
2802 (prompt)
2803 Lisp_Object prompt;
2804{
e0e989f6 2805 Lisp_Object val;
9d991de8
RS
2806 do
2807 {
02ba4723 2808 val = Fcompleting_read (prompt, Vobarray, Qcoding_system_spec,
9d991de8
RS
2809 Qt, Qnil, Qnil, Qnil);
2810 }
2811 while (XSTRING (val)->size == 0);
e0e989f6 2812 return (Fintern (val, Qnil));
4ed46869
KH
2813}
2814
2815DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
e0e989f6 2816 "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
4ed46869
KH
2817 (prompt)
2818 Lisp_Object prompt;
2819{
e0e989f6 2820 Lisp_Object val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
9d991de8 2821 Qt, Qnil, Qnil, Qnil);
e0e989f6 2822 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
4ed46869
KH
2823}
2824
2825DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
2826 1, 1, 0,
2827 "Check validity of CODING-SYSTEM.\n\
2828If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
2829CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
2830The value of property should be a vector of length 5.")
2831 (coding_system)
2832 Lisp_Object coding_system;
2833{
2834 CHECK_SYMBOL (coding_system, 0);
2835 if (!NILP (Fcoding_system_p (coding_system)))
2836 return coding_system;
2837 while (1)
02ba4723 2838 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
4ed46869
KH
2839}
2840
2841DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
2842 2, 2, 0,
2843 "Detect coding-system of the text in the region between START and END.\n\
2844Return a list of possible coding-systems ordered by priority.\n\
0ef69138 2845If only ASCII characters are found, it returns `undecided'\n\
4ed46869
KH
2846 or its subsidiary coding-system according to a detected end-of-line format.")
2847 (b, e)
2848 Lisp_Object b, e;
2849{
2850 int coding_mask, eol_type;
2851 Lisp_Object val;
2852 int beg, end;
2853
2854 validate_region (&b, &e);
2855 beg = XINT (b), end = XINT (e);
2856 if (beg < GPT && end >= GPT) move_gap (end);
2857
2858 coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
2859 eol_type = detect_eol_type (POS_ADDR (beg), end - beg);
2860
2861 if (coding_mask == CODING_CATEGORY_MASK_ANY)
2862 {
0ef69138
KH
2863 val = intern ("undecided");
2864 if (eol_type != CODING_EOL_UNDECIDED)
4ed46869
KH
2865 {
2866 Lisp_Object val2 = Fget (val, Qeol_type);
2867 if (VECTORP (val2))
2868 val = XVECTOR (val2)->contents[eol_type];
2869 }
2870 }
2871 else
2872 {
2873 Lisp_Object val2;
2874
2875 /* At first, gather possible coding-systems in VAL in a reverse
2876 order. */
2877 val = Qnil;
2878 for (val2 = Vcoding_category_list;
2879 !NILP (val2);
2880 val2 = XCONS (val2)->cdr)
2881 {
2882 int idx
2883 = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
2884 if (coding_mask & (1 << idx))
2885 val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
2886 }
2887
2888 /* Then, change the order of the list, while getting subsidiary
2889 coding-systems. */
2890 val2 = val;
2891 val = Qnil;
2892 for (; !NILP (val2); val2 = XCONS (val2)->cdr)
2893 {
0ef69138 2894 if (eol_type == CODING_EOL_UNDECIDED)
4ed46869
KH
2895 val = Fcons (XCONS (val2)->car, val);
2896 else
2897 {
2898 Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
2899 if (VECTORP (val3))
2900 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
2901 else
2902 val = Fcons (XCONS (val2)->car, val);
2903 }
2904 }
2905 }
2906
2907 return val;
2908}
2909
2910/* Scan text in the region between *BEGP and *ENDP, skip characters
2911 which we never have to encode to (iff ENCODEP is 1) or decode from
2912 coding system CODING at the head and tail, then set BEGP and ENDP
2913 to the addresses of start and end of the text we actually convert. */
2914
2915void
2916shrink_conversion_area (begp, endp, coding, encodep)
2917 unsigned char **begp, **endp;
2918 struct coding_system *coding;
2919 int encodep;
2920{
2921 register unsigned char *beg_addr = *begp, *end_addr = *endp;
2922
2923 if (coding->eol_type != CODING_EOL_LF
0ef69138 2924 && coding->eol_type != CODING_EOL_UNDECIDED)
4ed46869
KH
2925 /* Since we anyway have to convert end-of-line format, it is not
2926 worth skipping at most 100 bytes or so. */
2927 return;
2928
2929 if (encodep) /* for encoding */
2930 {
2931 switch (coding->type)
2932 {
2933 case coding_type_no_conversion:
0ef69138
KH
2934 case coding_type_emacs_mule:
2935 case coding_type_undecided:
4ed46869
KH
2936 /* We need no conversion. */
2937 *begp = *endp;
2938 return;
2939 case coding_type_ccl:
2940 /* We can't skip any data. */
2941 return;
e0e989f6
KH
2942 case coding_type_iso2022:
2943 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2944 {
2945 unsigned char *bol = beg_addr;
2946 while (beg_addr < end_addr && *beg_addr < 0x80)
2947 {
2948 beg_addr++;
2949 if (*(beg_addr - 1) == '\n')
2950 bol = beg_addr;
2951 }
2952 beg_addr = bol;
2953 goto label_skip_tail;
2954 }
2955 /* fall down ... */
4ed46869
KH
2956 default:
2957 /* We can skip all ASCII characters at the head and tail. */
2958 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
e0e989f6 2959 label_skip_tail:
4ed46869
KH
2960 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
2961 break;
2962 }
2963 }
2964 else /* for decoding */
2965 {
2966 switch (coding->type)
2967 {
2968 case coding_type_no_conversion:
2969 /* We need no conversion. */
2970 *begp = *endp;
2971 return;
0ef69138 2972 case coding_type_emacs_mule:
4ed46869
KH
2973 if (coding->eol_type == CODING_EOL_LF)
2974 {
2975 /* We need no conversion. */
2976 *begp = *endp;
2977 return;
2978 }
2979 /* We can skip all but carriage-return. */
2980 while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
2981 while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
2982 break;
2983 case coding_type_sjis:
2984 case coding_type_big5:
2985 /* We can skip all ASCII characters at the head. */
2986 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
2987 /* We can skip all ASCII characters at the tail except for
2988 the second byte of SJIS or BIG5 code. */
2989 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
2990 if (end_addr != *endp)
2991 end_addr++;
2992 break;
2993 case coding_type_ccl:
2994 /* We can't skip any data. */
2995 return;
2996 default: /* i.e. case coding_type_iso2022: */
2997 {
2998 unsigned char c;
2999
3000 /* We can skip all ASCII characters except for a few
3001 control codes at the head. */
3002 while (beg_addr < end_addr && (c = *beg_addr) < 0x80
3003 && c != ISO_CODE_CR && c != ISO_CODE_SO
3004 && c != ISO_CODE_SI && c != ISO_CODE_ESC)
3005 beg_addr++;
3006 }
3007 break;
3008 }
3009 }
3010 *begp = beg_addr;
3011 *endp = end_addr;
3012 return;
3013}
3014
3015/* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3016 text between B and E. B and E are buffer position. */
3017
3018Lisp_Object
3019code_convert_region (b, e, coding, encodep)
3020 Lisp_Object b, e;
3021 struct coding_system *coding;
3022 int encodep;
3023{
3024 int beg, end, len, consumed, produced;
3025 char *buf;
3026 unsigned char *begp, *endp;
3027 int pos = PT;
3028
3029 validate_region (&b, &e);
3030 beg = XINT (b), end = XINT (e);
3031 if (beg < GPT && end >= GPT)
3032 move_gap (end);
3033
3034 if (encodep && !NILP (coding->pre_write_conversion))
3035 {
3036 /* We must call a pre-conversion function which may put a new
3037 text to be converted in a new buffer. */
3038 struct buffer *old = current_buffer, *new;
3039
3040 TEMP_SET_PT (beg);
3041 call2 (coding->pre_write_conversion, b, e);
3042 if (old != current_buffer)
3043 {
3044 /* Replace the original text by the text just generated. */
3045 len = ZV - BEGV;
3046 new = current_buffer;
3047 set_buffer_internal (old);
3048 del_range (beg, end);
3049 insert_from_buffer (new, 1, len, 0);
3050 end = beg + len;
3051 }
3052 }
3053
3054 /* We may be able to shrink the conversion region. */
3055 begp = POS_ADDR (beg); endp = begp + (end - beg);
3056 shrink_conversion_area (&begp, &endp, coding, encodep);
3057
3058 if (begp == endp)
3059 /* We need no conversion. */
3060 len = end - beg;
3061 else
3062 {
3063 beg += begp - POS_ADDR (beg);
3064 end = beg + (endp - begp);
3065
3066 if (encodep)
3067 len = encoding_buffer_size (coding, end - beg);
3068 else
3069 len = decoding_buffer_size (coding, end - beg);
3070 buf = get_conversion_buffer (len);
3071
3072 coding->last_block = 1;
3073 produced = (encodep
3074 ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3075 &consumed)
3076 : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3077 &consumed));
3078
3079 len = produced + (beg - XINT (b)) + (XINT (e) - end);
3080
3081 TEMP_SET_PT (beg);
3082 insert (buf, produced);
3083 del_range (PT, PT + end - beg);
3084 if (pos >= end)
3085 pos = PT + (pos - end);
3086 else if (pos > beg)
3087 pos = beg;
3088 TEMP_SET_PT (pos);
3089 }
3090
3091 if (!encodep && !NILP (coding->post_read_conversion))
3092 {
3093 /* We must call a post-conversion function which may alter
3094 the text just converted. */
3095 Lisp_Object insval;
3096
3097 beg = XINT (b);
3098 TEMP_SET_PT (beg);
3099 insval = call1 (coding->post_read_conversion, make_number (len));
3100 CHECK_NUMBER (insval, 0);
3101 len = XINT (insval);
3102 }
3103
3104 return make_number (len);
3105}
3106
3107Lisp_Object
e0e989f6
KH
3108code_convert_string (str, coding, encodep, nocopy)
3109 Lisp_Object str, nocopy;
4ed46869
KH
3110 struct coding_system *coding;
3111 int encodep;
3112{
3113 int len, consumed, produced;
3114 char *buf;
3115 unsigned char *begp, *endp;
3116 int head_skip, tail_skip;
3117 struct gcpro gcpro1;
3118
3119 if (encodep && !NILP (coding->pre_write_conversion)
3120 || !encodep && !NILP (coding->post_read_conversion))
3121 {
3122 /* Since we have to call Lisp functions which assume target text
3123 is in a buffer, after setting a temporary buffer, call
3124 code_convert_region. */
3125 int count = specpdl_ptr - specpdl;
3126 int len = XSTRING (str)->size;
3127 Lisp_Object result;
3128 struct buffer *old = current_buffer;
3129
3130 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3131 temp_output_buffer_setup (" *code-converting-work*");
3132 set_buffer_internal (XBUFFER (Vstandard_output));
3133 insert_from_string (str, 0, len, 0);
3134 code_convert_region (make_number (BEGV), make_number (ZV),
3135 coding, encodep);
3136 result = make_buffer_string (BEGV, ZV, 0);
3137 set_buffer_internal (old);
3138 return unbind_to (count, result);
3139 }
3140
3141 /* We may be able to shrink the conversion region. */
3142 begp = XSTRING (str)->data;
3143 endp = begp + XSTRING (str)->size;
3144 shrink_conversion_area (&begp, &endp, coding, encodep);
3145
3146 if (begp == endp)
3147 /* We need no conversion. */
e0e989f6 3148 return (NILP (nocopy) ? Fcopy_sequence (str) : str);
4ed46869
KH
3149
3150 head_skip = begp - XSTRING (str)->data;
3151 tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3152
3153 GCPRO1 (str);
3154
3155 if (encodep)
3156 len = encoding_buffer_size (coding, endp - begp);
3157 else
3158 len = decoding_buffer_size (coding, endp - begp);
3159 buf = get_conversion_buffer (len + head_skip + tail_skip);
3160
3161 bcopy (XSTRING (str)->data, buf, head_skip);
3162 coding->last_block = 1;
3163 produced = (encodep
3164 ? encode_coding (coding, XSTRING (str)->data + head_skip,
3165 buf + head_skip, endp - begp, len, &consumed)
3166 : decode_coding (coding, XSTRING (str)->data + head_skip,
3167 buf + head_skip, endp - begp, len, &consumed));
3168 bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3169 buf + head_skip + produced,
3170 tail_skip);
3171
3172 UNGCPRO;
3173
3174 return make_string (buf, head_skip + produced + tail_skip);
3175}
3176
3177DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
e0e989f6
KH
3178 3, 3, "r\nzCoding system: ",
3179 "Decode current region by specified coding system.\n\
3180When called from a program, takes three arguments:\n\
3181START, END, and CODING-SYSTEM. START END are buffer positions.\n\
4ed46869
KH
3182Return length of decoded text.")
3183 (b, e, coding_system)
3184 Lisp_Object b, e, coding_system;
3185{
3186 struct coding_system coding;
3187
3188 CHECK_NUMBER_COERCE_MARKER (b, 0);
3189 CHECK_NUMBER_COERCE_MARKER (e, 1);
3190 CHECK_SYMBOL (coding_system, 2);
3191
e0e989f6
KH
3192 if (NILP (coding_system))
3193 return make_number (XFASTINT (e) - XFASTINT (b));
4ed46869
KH
3194 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3195 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3196
3197 return code_convert_region (b, e, &coding, 0);
3198}
3199
3200DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
e0e989f6
KH
3201 3, 3, "r\nzCoding system: ",
3202 "Encode current region by specified coding system.\n\
3203When called from a program, takes three arguments:\n\
3204START, END, and CODING-SYSTEM. START END are buffer positions.\n\
4ed46869
KH
3205Return length of encoded text.")
3206 (b, e, coding_system)
3207 Lisp_Object b, e, coding_system;
3208{
3209 struct coding_system coding;
3210
3211 CHECK_NUMBER_COERCE_MARKER (b, 0);
3212 CHECK_NUMBER_COERCE_MARKER (e, 1);
3213 CHECK_SYMBOL (coding_system, 2);
3214
e0e989f6
KH
3215 if (NILP (coding_system))
3216 return make_number (XFASTINT (e) - XFASTINT (b));
4ed46869
KH
3217 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3218 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3219
3220 return code_convert_region (b, e, &coding, 1);
3221}
3222
3223DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
e0e989f6
KH
3224 2, 3, 0,
3225 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3226Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3227of decoding.")
3228 (string, coding_system, nocopy)
3229 Lisp_Object string, coding_system, nocopy;
4ed46869
KH
3230{
3231 struct coding_system coding;
3232
3233 CHECK_STRING (string, 0);
3234 CHECK_SYMBOL (coding_system, 1);
3235
e0e989f6
KH
3236 if (NILP (coding_system))
3237 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869
KH
3238 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3239 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3240
e0e989f6 3241 return code_convert_string (string, &coding, 0, nocopy);
4ed46869
KH
3242}
3243
3244DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
e0e989f6
KH
3245 2, 3, 0,
3246 "Encode STRING to CODING-SYSTEM, and return the result.\n\
3247Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3248of encoding.")
3249 (string, coding_system, nocopy)
3250 Lisp_Object string, coding_system, nocopy;
4ed46869
KH
3251{
3252 struct coding_system coding;
3253
3254 CHECK_STRING (string, 0);
3255 CHECK_SYMBOL (coding_system, 1);
3256
e0e989f6
KH
3257 if (NILP (coding_system))
3258 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
4ed46869
KH
3259 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3260 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3261
e0e989f6 3262 return code_convert_string (string, &coding, 1, nocopy);
4ed46869
KH
3263}
3264
3265DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
e0e989f6 3266 "Decode a JISX0208 character of shift-jis encoding.\n\
4ed46869
KH
3267CODE is the character code in SJIS.\n\
3268Return the corresponding character.")
3269 (code)
3270 Lisp_Object code;
3271{
3272 unsigned char c1, c2, s1, s2;
3273 Lisp_Object val;
3274
3275 CHECK_NUMBER (code, 0);
3276 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3277 DECODE_SJIS (s1, s2, c1, c2);
3278 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3279 return val;
3280}
3281
3282DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3283 "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3284Return the corresponding character code in SJIS.")
3285 (ch)
3286 Lisp_Object ch;
3287{
bcf26d6a 3288 int charset, c1, c2, s1, s2;
4ed46869
KH
3289 Lisp_Object val;
3290
3291 CHECK_NUMBER (ch, 0);
3292 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3293 if (charset == charset_jisx0208)
3294 {
3295 ENCODE_SJIS (c1, c2, s1, s2);
bcf26d6a 3296 XSETFASTINT (val, (s1 << 8) | s2);
4ed46869
KH
3297 }
3298 else
3299 XSETFASTINT (val, 0);
3300 return val;
3301}
3302
3303DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3304 "Decode a Big5 character CODE of BIG5 coding-system.\n\
3305CODE is the character code in BIG5.\n\
3306Return the corresponding character.")
3307 (code)
3308 Lisp_Object code;
3309{
3310 int charset;
3311 unsigned char b1, b2, c1, c2;
3312 Lisp_Object val;
3313
3314 CHECK_NUMBER (code, 0);
3315 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3316 DECODE_BIG5 (b1, b2, charset, c1, c2);
3317 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3318 return val;
3319}
3320
3321DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3322 "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3323Return the corresponding character code in Big5.")
3324 (ch)
3325 Lisp_Object ch;
3326{
bcf26d6a 3327 int charset, c1, c2, b1, b2;
4ed46869
KH
3328 Lisp_Object val;
3329
3330 CHECK_NUMBER (ch, 0);
3331 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3332 if (charset == charset_big5_1 || charset == charset_big5_2)
3333 {
3334 ENCODE_BIG5 (charset, c1, c2, b1, b2);
bcf26d6a 3335 XSETFASTINT (val, (b1 << 8) | b2);
4ed46869
KH
3336 }
3337 else
3338 XSETFASTINT (val, 0);
3339 return val;
3340}
3341
1ba9e4ab
KH
3342DEFUN ("set-terminal-coding-system-internal",
3343 Fset_terminal_coding_system_internal,
3344 Sset_terminal_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
3345 (coding_system)
3346 Lisp_Object coding_system;
3347{
3348 CHECK_SYMBOL (coding_system, 0);
3349 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
4ed46869
KH
3350 return Qnil;
3351}
3352
3353DEFUN ("terminal-coding-system",
3354 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3355 "Return coding-system of your terminal.")
3356 ()
3357{
3358 return terminal_coding.symbol;
3359}
3360
1ba9e4ab
KH
3361DEFUN ("set-keyboard-coding-system-internal",
3362 Fset_keyboard_coding_system_internal,
3363 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
4ed46869
KH
3364 (coding_system)
3365 Lisp_Object coding_system;
3366{
3367 CHECK_SYMBOL (coding_system, 0);
3368 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3369 return Qnil;
3370}
3371
3372DEFUN ("keyboard-coding-system",
3373 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3374 "Return coding-system of what is sent from terminal keyboard.")
3375 ()
3376{
3377 return keyboard_coding.symbol;
3378}
3379
3380\f
3381DEFUN ("find-coding-system", Ffind_coding_system, Sfind_coding_system,
3382 1, MANY, 0,
ccdb79f5
RS
3383 "Choose a coding system for a file operation based on file name.\n\
3384The value names a pair of coding systems: (ENCODING-SYSTEM DECODING-SYSTEM).\n\
3385ENCODING-SYSTEM is the coding system to use for encoding\n\
3386\(in case OPERATION does encoding), and DECODING-SYSTEM is the coding system\n\
3387for decoding (in case OPERATION does decoding).\n\
3388\n\
3389The first argument OPERATION specifies an I/O primitive:\n\
3390 For file I/O, `insert-file-contents' or `write-region'.\n\
3391 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3392 For network I/O, `open-network-stream'.\n\
3393\n\
3394The remaining arguments should be the same arguments that were passed\n\
3395to the primitive. Depending on which primitive, one of those arguments\n\
3396is selected as the TARGET. For example, if OPERATION does file I/O,\n\
3397whichever argument specifies the file name is TARGET.\n\
3398\n\
3399TARGET has a meaning which depends on OPERATION:\n\
4ed46869
KH
3400 For file I/O, TARGET is a file name.\n\
3401 For process I/O, TARGET is a process name.\n\
3402 For network I/O, TARGET is a service name or a port number\n\
3403\n\
02ba4723
KH
3404This function looks up what specified for TARGET in,\n\
3405`file-coding-system-alist', `process-coding-system-alist',\n\
3406or `network-coding-system-alist' depending on OPERATION.\n\
3407They may specify a coding system, a cons of coding systems,\n\
3408or a function symbol to call.\n\
3409In the last case, we call the function with one argument,\n\
ccdb79f5 3410which is a list of all the arguments given to `find-coding-system'.")
4ed46869
KH
3411 (nargs, args)
3412 int nargs;
3413 Lisp_Object *args;
3414{
3415 Lisp_Object operation, target_idx, target, val;
3416 register Lisp_Object chain;
3417
3418 if (nargs < 2)
3419 error ("Too few arguments");
3420 operation = args[0];
3421 if (!SYMBOLP (operation)
3422 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3423 error ("Invalid first arguement");
3424 if (nargs < 1 + XINT (target_idx))
3425 error ("Too few arguments for operation: %s",
3426 XSYMBOL (operation)->name->data);
3427 target = args[XINT (target_idx) + 1];
3428 if (!(STRINGP (target)
3429 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3430 error ("Invalid %dth argument", XINT (target_idx) + 1);
3431
02ba4723
KH
3432 chain = (operation == Qinsert_file_contents || operation == Qwrite_region
3433 ? Vfile_coding_system_alist
3434 : (operation == Qopen_network_stream
3435 ? Vnetwork_coding_system_alist
3436 : Vprocess_coding_system_alist));
4ed46869
KH
3437 if (NILP (chain))
3438 return Qnil;
3439
02ba4723 3440 for (; CONSP (chain); chain = XCONS (chain)->cdr)
4ed46869
KH
3441 {
3442 Lisp_Object elt = XCONS (chain)->car;
3443
3444 if (CONSP (elt)
3445 && ((STRINGP (target)
3446 && STRINGP (XCONS (elt)->car)
3447 && fast_string_match (XCONS (elt)->car, target) >= 0)
3448 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
02ba4723
KH
3449 {
3450 val = XCONS (elt)->cdr;
3451 if (CONSP (val))
3452 return val;
3453 if (! SYMBOLP (val))
3454 return Qnil;
3455 if (! NILP (Fcoding_system_p (val)))
3456 return Fcons (val, val);
3457 if (!NILP (Fboundp (val)))
3458 return call2 (val, Flist (nargs, args));
3459 return Qnil;
3460 }
4ed46869
KH
3461 }
3462 return Qnil;
3463}
3464
3465#endif /* emacs */
3466
3467\f
3468/*** 8. Post-amble ***/
3469
3470init_coding_once ()
3471{
3472 int i;
3473
0ef69138 3474 /* Emacs' internal format specific initialize routine. */
4ed46869
KH
3475 for (i = 0; i <= 0x20; i++)
3476 emacs_code_class[i] = EMACS_control_code;
3477 emacs_code_class[0x0A] = EMACS_linefeed_code;
3478 emacs_code_class[0x0D] = EMACS_carriage_return_code;
3479 for (i = 0x21 ; i < 0x7F; i++)
3480 emacs_code_class[i] = EMACS_ascii_code;
3481 emacs_code_class[0x7F] = EMACS_control_code;
3482 emacs_code_class[0x80] = EMACS_leading_code_composition;
3483 for (i = 0x81; i < 0xFF; i++)
3484 emacs_code_class[i] = EMACS_invalid_code;
3485 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3486 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3487 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3488 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3489
3490 /* ISO2022 specific initialize routine. */
3491 for (i = 0; i < 0x20; i++)
3492 iso_code_class[i] = ISO_control_code;
3493 for (i = 0x21; i < 0x7F; i++)
3494 iso_code_class[i] = ISO_graphic_plane_0;
3495 for (i = 0x80; i < 0xA0; i++)
3496 iso_code_class[i] = ISO_control_code;
3497 for (i = 0xA1; i < 0xFF; i++)
3498 iso_code_class[i] = ISO_graphic_plane_1;
3499 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3500 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3501 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3502 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3503 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3504 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3505 iso_code_class[ISO_CODE_ESC] = ISO_escape;
3506 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3507 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3508 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3509
e0e989f6
KH
3510 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3511 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3512
3513 setup_coding_system (Qnil, &keyboard_coding);
3514 setup_coding_system (Qnil, &terminal_coding);
3515}
3516
3517#ifdef emacs
3518
3519syms_of_coding ()
3520{
3521 Qtarget_idx = intern ("target-idx");
3522 staticpro (&Qtarget_idx);
3523
3524 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3525 Fput (Qwrite_region, Qtarget_idx, make_number (2));
3526
3527 Qcall_process = intern ("call-process");
3528 staticpro (&Qcall_process);
3529 Fput (Qcall_process, Qtarget_idx, make_number (0));
3530
3531 Qcall_process_region = intern ("call-process-region");
3532 staticpro (&Qcall_process_region);
3533 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3534
3535 Qstart_process = intern ("start-process");
3536 staticpro (&Qstart_process);
3537 Fput (Qstart_process, Qtarget_idx, make_number (2));
3538
3539 Qopen_network_stream = intern ("open-network-stream");
3540 staticpro (&Qopen_network_stream);
3541 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3542
4ed46869
KH
3543 Qcoding_system = intern ("coding-system");
3544 staticpro (&Qcoding_system);
3545
3546 Qeol_type = intern ("eol-type");
3547 staticpro (&Qeol_type);
3548
3549 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3550 staticpro (&Qbuffer_file_coding_system);
3551
3552 Qpost_read_conversion = intern ("post-read-conversion");
3553 staticpro (&Qpost_read_conversion);
3554
3555 Qpre_write_conversion = intern ("pre-write-conversion");
3556 staticpro (&Qpre_write_conversion);
3557
02ba4723
KH
3558 Qcoding_system_spec = intern ("coding-system-spec");
3559 staticpro (&Qcoding_system_spec);
4ed46869
KH
3560
3561 Qcoding_system_p = intern ("coding-system-p");
3562 staticpro (&Qcoding_system_p);
3563
3564 Qcoding_system_error = intern ("coding-system-error");
3565 staticpro (&Qcoding_system_error);
3566
3567 Fput (Qcoding_system_error, Qerror_conditions,
3568 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3569 Fput (Qcoding_system_error, Qerror_message,
3570 build_string ("Coding-system error"));
3571
3572 Qcoding_category_index = intern ("coding-category-index");
3573 staticpro (&Qcoding_category_index);
3574
3575 {
3576 int i;
3577 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3578 {
3579 coding_category_table[i] = intern (coding_category_name[i]);
3580 staticpro (&coding_category_table[i]);
3581 Fput (coding_category_table[i], Qcoding_category_index,
3582 make_number (i));
3583 }
3584 }
3585
bdd9fb48
KH
3586 Qcharacter_unification_table = intern ("character-unification-table");
3587 staticpro (&Qcharacter_unification_table);
3588 Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
3589 make_number (0));
3590
02ba4723 3591 defsubr (&Scoding_system_spec);
4ed46869
KH
3592 defsubr (&Scoding_system_p);
3593 defsubr (&Sread_coding_system);
3594 defsubr (&Sread_non_nil_coding_system);
3595 defsubr (&Scheck_coding_system);
3596 defsubr (&Sdetect_coding_region);
3597 defsubr (&Sdecode_coding_region);
3598 defsubr (&Sencode_coding_region);
3599 defsubr (&Sdecode_coding_string);
3600 defsubr (&Sencode_coding_string);
3601 defsubr (&Sdecode_sjis_char);
3602 defsubr (&Sencode_sjis_char);
3603 defsubr (&Sdecode_big5_char);
3604 defsubr (&Sencode_big5_char);
1ba9e4ab 3605 defsubr (&Sset_terminal_coding_system_internal);
4ed46869 3606 defsubr (&Sterminal_coding_system);
1ba9e4ab 3607 defsubr (&Sset_keyboard_coding_system_internal);
4ed46869
KH
3608 defsubr (&Skeyboard_coding_system);
3609 defsubr (&Sfind_coding_system);
3610
3611 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3612 "List of coding-categories (symbols) ordered by priority.");
3613 {
3614 int i;
3615
3616 Vcoding_category_list = Qnil;
3617 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3618 Vcoding_category_list
3619 = Fcons (coding_category_table[i], Vcoding_category_list);
3620 }
3621
3622 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
3623 "A variable of internal use only.\n\
3624If the value is a coding system, it is used for decoding on read operation.\n\
3625If not, an appropriate element in `coding-system-alist' (which see) is used.");
3626 Vcoding_system_for_read = Qnil;
3627
3628 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
3629 "A variable of internal use only.\n\
3630If the value is a coding system, it is used for encoding on write operation.\n\
3631If not, an appropriate element in `coding-system-alist' (which see) is used.");
3632 Vcoding_system_for_write = Qnil;
3633
3634 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
3635 "Coding-system used in the latest file or process I/O.");
3636 Vlast_coding_system_used = Qnil;
3637
02ba4723
KH
3638 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
3639 "Alist to decide a coding system to use for a file I/O operation.\n\
3640The format is ((PATTERN . VAL) ...),\n\
3641where PATTERN is a regular expression matching a file name,\n\
3642VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3643If VAL is a coding system, it is used for both decoding and encoding\n\
3644the file contents.\n\
3645If VAL is a cons of coding systems, the car part is used for decoding,\n\
3646and the cdr part is used for encoding.\n\
3647If VAL is a function symbol, the function must return a coding system\n\
3648or a cons of coding systems which are used as above.\n\
e0e989f6 3649\n\
02ba4723
KH
3650See also the function `find-coding-system'.");
3651 Vfile_coding_system_alist = Qnil;
3652
3653 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
3654 "Alist to decide a coding system to use for a process I/O operation.\n\
3655The format is ((PATTERN . VAL) ...),\n\
3656where PATTERN is a regular expression matching a program name,\n\
3657VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3658If VAL is a coding system, it is used for both decoding what received\n\
3659from the program and encoding what sent to the program.\n\
3660If VAL is a cons of coding systems, the car part is used for decoding,\n\
3661and the cdr part is used for encoding.\n\
3662If VAL is a function symbol, the function must return a coding system\n\
3663or a cons of coding systems which are used as above.\n\
4ed46869 3664\n\
02ba4723
KH
3665See also the function `find-coding-system'.");
3666 Vprocess_coding_system_alist = Qnil;
3667
3668 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
3669 "Alist to decide a coding system to use for a network I/O operation.\n\
3670The format is ((PATTERN . VAL) ...),\n\
3671where PATTERN is a regular expression matching a network service name\n\
3672or is a port number to connect to,\n\
3673VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3674If VAL is a coding system, it is used for both decoding what received\n\
3675from the network stream and encoding what sent to the network stream.\n\
3676If VAL is a cons of coding systems, the car part is used for decoding,\n\
3677and the cdr part is used for encoding.\n\
3678If VAL is a function symbol, the function must return a coding system\n\
3679or a cons of coding systems which are used as above.\n\
4ed46869 3680\n\
02ba4723
KH
3681See also the function `find-coding-system'.");
3682 Vnetwork_coding_system_alist = Qnil;
4ed46869
KH
3683
3684 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
3685 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
458822a0 3686 eol_mnemonic_unix = ':';
4ed46869
KH
3687
3688 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
3689 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
458822a0 3690 eol_mnemonic_dos = '\\';
4ed46869
KH
3691
3692 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
3693 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
458822a0 3694 eol_mnemonic_mac = '/';
4ed46869
KH
3695
3696 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
3697 "Mnemonic character indicating end-of-line format is not yet decided.");
458822a0 3698 eol_mnemonic_undecided = ':';
4ed46869 3699
bdd9fb48
KH
3700 DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
3701 "Non-nil means ISO 2022 encoder/decoder do character unification.");
3702 Venable_character_unification = Qt;
3703
3704 DEFVAR_LISP ("standard-character-unification-table-for-read",
3705 &Vstandard_character_unification_table_for_read,
3706 "Table for unifying characters when reading.");
3707 Vstandard_character_unification_table_for_read = Qnil;
3708
3709 DEFVAR_LISP ("standard-character-unification-table-for-write",
3710 &Vstandard_character_unification_table_for_write,
3711 "Table for unifying characters when writing.");
3712 Vstandard_character_unification_table_for_write = Qnil;
4ed46869
KH
3713
3714 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
3715 "Alist of charsets vs revision numbers.\n\
3716While encoding, if a charset (car part of an element) is found,\n\
3717designate it with the escape sequence identifing revision (cdr part of the element).");
3718 Vcharset_revision_alist = Qnil;
02ba4723
KH
3719
3720 DEFVAR_LISP ("default-process-coding-system",
3721 &Vdefault_process_coding_system,
3722 "Cons of coding systems used for process I/O by default.\n\
3723The car part is used for decoding a process output,\n\
3724the cdr part is used for encoding a text to be sent to a process.");
3725 Vdefault_process_coding_system = Qnil;
4ed46869
KH
3726}
3727
3728#endif /* emacs */