Initial revision
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, and etc).
2 Ver.1.0.
3
4 Copyright (C) 1995 Free Software Foundation, Inc.
5 Copyright (C) 1995 Electrotechnical Laboratory, JAPAN.
6
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
20
21 /*** TABLE OF CONTENTS ***
22
23 1. Preamble
24 2. Emacs' internal format handlers
25 3. ISO2022 handlers
26 4. Shift-JIS and BIG5 handlers
27 5. End-of-line handlers
28 6. C library functions
29 7. Emacs Lisp library functions
30 8. Post-amble
31
32 */
33
34 /*** GENERAL NOTE on CODING SYSTEM ***
35
36 Coding system is an encoding mechanism of one or more character
37 sets. Here's a list of coding systems which Emacs can handle. When
38 we say "decode", it means converting some other coding system to
39 Emacs' internal format, and when we say "encode", it means
40 converting Emacs' internal format to some other coding system.
41
42 0. Emacs' internal format
43
44 Emacs itself holds a multi-lingual character in a buffer and a string
45 in a special format. Details are described in the section 2.
46
47 1. ISO2022
48
49 The most famous coding system for multiple character sets. X's
50 Compound Text, various EUCs (Extended Unix Code), and such coding
51 systems used in Internet communication as ISO-2022-JP are all
52 variants of ISO2022. Details are described in the section 3.
53
54 2. SJIS (or Shift-JIS or MS-Kanji-Code)
55
56 A coding system to encode character sets: ASCII, JISX0201, and
57 JISX0208. Widely used for PC's in Japan. Details are described in
58 the section 4.
59
60 3. BIG5
61
62 A coding system to encode character sets: ASCII and Big5. Widely
63 used by Chinese (mainly in Taiwan and Hong Kong). Details are
64 described in the section 4. In this file, when written as "BIG5"
65 (all uppercase), it means the coding system, and when written as
66 "Big5" (capitalized), it means the character set.
67
68 4. Else
69
70 If a user want to read/write a text encoded in a coding system not
71 listed above, he can supply a decoder and an encoder for it in CCL
72 (Code Conversion Language) programs. Emacs executes the CCL program
73 while reading/writing.
74
75 Emacs represent a coding-system by a Lisp symbol that has a property
76 `coding-system'. But, before actually using the coding-system, the
77 information about it is set in a structure of type `struct
78 coding_system' for rapid processing. See the section 6 for more
79 detail.
80
81 */
82
83 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
84
85 How end-of-line of a text is encoded depends on a system. For
86 instance, Unix's format is just one byte of `line-feed' code,
87 whereas DOS's format is two bytes sequence of `carriage-return' and
88 `line-feed' codes. MacOS's format is one byte of `carriage-return'.
89
90 Since how characters in a text is encoded and how end-of-line is
91 encoded is independent, any coding system described above can take
92 any format of end-of-line. So, Emacs has information of format of
93 end-of-line in each coding-system. See the section 6 for more
94 detail.
95
96 */
97
98 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
99
100 These functions check if a text between SRC and SRC_END is encoded
101 in the coding system category XXX. Each returns an integer value in
102 which appropriate flag bits for the category XXX is set. The flag
103 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
104 template of these functions. */
105 #if 0
106 int
107 detect_coding_internal (src, src_end)
108 unsigned char *src, *src_end;
109 {
110 ...
111 }
112 #endif
113
114 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
115
116 These functions decode SRC_BYTES length text at SOURCE encoded in
117 CODING to Emacs' internal format. The resulting text goes to a
118 place pointed by DESTINATION, the length of which should not exceed
119 DST_BYTES. The bytes actually processed is returned as *CONSUMED.
120 The return value is the length of the decoded text. Below is a
121 template of these functions. */
122 #if 0
123 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
124 struct coding_system *coding;
125 unsigned char *source, *destination;
126 int src_bytes, dst_bytes;
127 int *consumed;
128 {
129 ...
130 }
131 #endif
132
133 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
134
135 These functions encode SRC_BYTES length text at SOURCE of Emacs
136 internal format to CODING. The resulting text goes to a place
137 pointed by DESTINATION, the length of which should not exceed
138 DST_BYTES. The bytes actually processed is returned as *CONSUMED.
139 The return value is the length of the encoded text. Below is a
140 template of these functions. */
141 #if 0
142 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
143 struct coding_system *coding;
144 unsigned char *source, *destination;
145 int src_bytes, dst_bytes;
146 int *consumed;
147 {
148 ...
149 }
150 #endif
151
152 /*** COMMONLY USED MACROS ***/
153
154 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
155 THREE_MORE_BYTES safely get one, two, and three bytes from the
156 source text respectively. If there are not enough bytes in the
157 source, they jump to `label_end_of_loop'. The caller should set
158 variables `src' and `src_end' to appropriate areas in advance. */
159
160 #define ONE_MORE_BYTE(c1) \
161 do { \
162 if (src < src_end) \
163 c1 = *src++; \
164 else \
165 goto label_end_of_loop; \
166 } while (0)
167
168 #define TWO_MORE_BYTES(c1, c2) \
169 do { \
170 if (src + 1 < src_end) \
171 c1 = *src++, c2 = *src++; \
172 else \
173 goto label_end_of_loop; \
174 } while (0)
175
176 #define THREE_MORE_BYTES(c1, c2, c3) \
177 do { \
178 if (src + 2 < src_end) \
179 c1 = *src++, c2 = *src++, c3 = *src++; \
180 else \
181 goto label_end_of_loop; \
182 } while (0)
183
184 /* The following three macros DECODE_CHARACTER_ASCII,
185 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
186 the multi-byte form of a character of each class at the place
187 pointed by `dst'. The caller should set the variable `dst' to
188 point to an appropriate area and the variable `coding' to point to
189 the coding-system of the currently decoding text in advance. */
190
191 /* Decode one ASCII character C. */
192
193 #define DECODE_CHARACTER_ASCII(c) \
194 do { \
195 if (COMPOSING_P (coding->composing)) \
196 *dst++ = 0xA0, *dst++ = (c) | 0x80; \
197 else \
198 *dst++ = (c); \
199 } while (0)
200
201 /* Decode one DIMENSION1 character of which charset is CHARSET and
202 position-code is C. */
203
204 #define DECODE_CHARACTER_DIMENSION1(charset, c) \
205 do { \
206 unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset); \
207 if (COMPOSING_P (coding->composing)) \
208 *dst++ = leading_code + 0x20; \
209 else \
210 *dst++ = leading_code; \
211 if (leading_code = CHARSET_LEADING_CODE_EXT (charset)) \
212 *dst++ = leading_code; \
213 *dst++ = (c) | 0x80; \
214 } while (0)
215
216 /* Decode one DIMENSION2 character of which charset is CHARSET and
217 position-codes are C1 and C2. */
218
219 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2) \
220 do { \
221 DECODE_CHARACTER_DIMENSION1 (charset, c1); \
222 *dst++ = (c2) | 0x80; \
223 } while (0)
224
225 \f
226 /*** 1. Preamble ***/
227
228 #include <stdio.h>
229
230 #ifdef emacs
231
232 #include <config.h>
233 #include "lisp.h"
234 #include "buffer.h"
235 #include "charset.h"
236 #include "ccl.h"
237 #include "coding.h"
238 #include "window.h"
239
240 #else /* not emacs */
241
242 #include "mulelib.h"
243
244 #endif /* not emacs */
245
246 Lisp_Object Qcoding_system, Qeol_type;
247 Lisp_Object Qbuffer_file_coding_system;
248 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
249
250 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
251 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
252 Lisp_Object Qstart_process, Qopen_network_stream;
253 Lisp_Object Qtarget_idx;
254
255 /* Mnemonic character of each format of end-of-line. */
256 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
257 /* Mnemonic character to indicate format of end-of-line is not yet
258 decided. */
259 int eol_mnemonic_undecided;
260
261 #ifdef emacs
262
263 Lisp_Object Qcoding_system_vector, Qcoding_system_p, Qcoding_system_error;
264
265 /* Coding-systems are handed between Emacs Lisp programs and C internal
266 routines by the following three variables. */
267 /* Coding-system for reading files and receiving data from process. */
268 Lisp_Object Vcoding_system_for_read;
269 /* Coding-system for writing files and sending data to process. */
270 Lisp_Object Vcoding_system_for_write;
271 /* Coding-system actually used in the latest I/O. */
272 Lisp_Object Vlast_coding_system_used;
273
274 /* Coding-system of what terminal accept for displaying. */
275 struct coding_system terminal_coding;
276
277 /* Coding-system of what is sent from terminal keyboard. */
278 struct coding_system keyboard_coding;
279
280 Lisp_Object Vcoding_system_alist;
281
282 #endif /* emacs */
283
284 Lisp_Object Qcoding_category_index;
285
286 /* List of symbols `coding-category-xxx' ordered by priority. */
287 Lisp_Object Vcoding_category_list;
288
289 /* Table of coding-systems currently assigned to each coding-category. */
290 Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
291
292 /* Table of names of symbol for each coding-category. */
293 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
294 "coding-category-internal",
295 "coding-category-sjis",
296 "coding-category-iso-7",
297 "coding-category-iso-8-1",
298 "coding-category-iso-8-2",
299 "coding-category-iso-else",
300 "coding-category-big5",
301 "coding-category-binary"
302 };
303
304 /* Alist of charsets vs the alternate charsets. */
305 Lisp_Object Valternate_charset_table;
306
307 /* Alist of charsets vs revision number. */
308 Lisp_Object Vcharset_revision_alist;
309
310 \f
311 /*** 2. Emacs internal format handlers ***/
312
313 /* Emacs' internal format for encoding multiple character sets is a
314 kind of multi-byte encoding, i.e. encoding a character by a sequence
315 of one-byte codes of variable length. ASCII characters and control
316 characters (e.g. `tab', `newline') are represented by one-byte as
317 is. It takes the range 0x00 through 0x7F. The other characters
318 are represented by a sequence of `base leading-code', optional
319 `extended leading-code', and one or two `position-code's. Length
320 of the sequence is decided by the base leading-code. Leading-code
321 takes the range 0x80 through 0x9F, whereas extended leading-code
322 and position-code take the range 0xA0 through 0xFF. See the
323 document of `charset.h' for more detail about leading-code and
324 position-code.
325
326 There's one exception in this rule. Special leading-code
327 `leading-code-composition' denotes that the following several
328 characters should be composed into one character. Leading-codes of
329 components (except for ASCII) are added 0x20. An ASCII character
330 component is represented by a 2-byte sequence of `0xA0' and
331 `ASCII-code + 0x80'. See also the document in `charset.h' for the
332 detail of composite character. Hence, we can summarize the code
333 range as follows:
334
335 --- CODE RANGE of Emacs' internal format ---
336 (character set) (range)
337 ASCII 0x00 .. 0x7F
338 ELSE (1st byte) 0x80 .. 0x9F
339 (rest bytes) 0xA0 .. 0xFF
340 ---------------------------------------------
341
342 */
343
344 enum emacs_code_class_type emacs_code_class[256];
345
346 /* Go to the next statement only if *SRC is accessible and the code is
347 greater than 0xA0. */
348 #define CHECK_CODE_RANGE_A0_FF \
349 do { \
350 if (src >= src_end) \
351 goto label_end_of_switch; \
352 else if (*src++ < 0xA0) \
353 return 0; \
354 } while (0)
355
356 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
357 Check if a text is encoded in Emacs' internal format. If it is,
358 return CODING_CATEGORY_MASK_INTERNAL, else return 0. */
359
360 int
361 detect_coding_internal (src, src_end)
362 unsigned char *src, *src_end;
363 {
364 unsigned char c;
365 int composing = 0;
366
367 while (src < src_end)
368 {
369 c = *src++;
370
371 if (composing)
372 {
373 if (c < 0xA0)
374 composing = 0;
375 else
376 c -= 0x20;
377 }
378
379 switch (emacs_code_class[c])
380 {
381 case EMACS_ascii_code:
382 case EMACS_linefeed_code:
383 break;
384
385 case EMACS_control_code:
386 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
387 return 0;
388 break;
389
390 case EMACS_invalid_code:
391 return 0;
392
393 case EMACS_leading_code_composition: /* c == 0x80 */
394 if (composing)
395 CHECK_CODE_RANGE_A0_FF;
396 else
397 composing = 1;
398 break;
399
400 case EMACS_leading_code_4:
401 CHECK_CODE_RANGE_A0_FF;
402 /* fall down to check it two more times ... */
403
404 case EMACS_leading_code_3:
405 CHECK_CODE_RANGE_A0_FF;
406 /* fall down to check it one more time ... */
407
408 case EMACS_leading_code_2:
409 CHECK_CODE_RANGE_A0_FF;
410 break;
411
412 default:
413 label_end_of_switch:
414 break;
415 }
416 }
417 return CODING_CATEGORY_MASK_INTERNAL;
418 }
419
420 \f
421 /*** 3. ISO2022 handlers ***/
422
423 /* The following note describes the coding system ISO2022 briefly.
424 Since the intension of this note is to help understanding of the
425 programs in this file, some parts are NOT ACCURATE or OVERLY
426 SIMPLIFIED. For the thorough understanding, please refer to the
427 original document of ISO2022.
428
429 ISO2022 provides many mechanisms to encode several character sets
430 in 7-bit and 8-bit environment. If one choose 7-bite environment,
431 all text is encoded by codes of less than 128. This may make the
432 encoded text a little bit longer, but the text get more stability
433 to pass through several gateways (some of them split MSB off).
434
435 There are two kind of character set: control character set and
436 graphic character set. The former contains control characters such
437 as `newline' and `escape' to provide control functions (control
438 functions are provided also by escape sequence). The latter
439 contains graphic characters such as ' A' and '-'. Emacs recognizes
440 two control character sets and many graphic character sets.
441
442 Graphic character sets are classified into one of the following
443 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
444 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
445 bytes (DIMENSION) and the number of characters in one dimension
446 (CHARS) of the set. In addition, each character set is assigned an
447 identification tag (called "final character" and denoted as <F>
448 here after) which is unique in each class. <F> of each character
449 set is decided by ECMA(*) when it is registered in ISO. Code range
450 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
451
452 Note (*): ECMA = European Computer Manufacturers Association
453
454 Here are examples of graphic character set [NAME(<F>)]:
455 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
456 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
457 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
458 o DIMENSION2_CHARS96 -- none for the moment
459
460 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
461 C0 [0x00..0x1F] -- control character plane 0
462 GL [0x20..0x7F] -- graphic character plane 0
463 C1 [0x80..0x9F] -- control character plane 1
464 GR [0xA0..0xFF] -- graphic character plane 1
465
466 A control character set is directly designated and invoked to C0 or
467 C1 by an escape sequence. The most common case is that ISO646's
468 control character set is designated/invoked to C0 and ISO6429's
469 control character set is designated/invoked to C1, and usually
470 these designations/invocations are omitted in a coded text. With
471 7-bit environment, only C0 can be used, and a control character for
472 C1 is encoded by an appropriate escape sequence to fit in the
473 environment. All control characters for C1 are defined the
474 corresponding escape sequences.
475
476 A graphic character set is at first designated to one of four
477 graphic registers (G0 through G3), then these graphic registers are
478 invoked to GL or GR. These designations and invocations can be
479 done independently. The most common case is that G0 is invoked to
480 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
481 these invocations and designations are omitted in a coded text.
482 With 7-bit environment, only GL can be used.
483
484 When a graphic character set of CHARS94 is invoked to GL, code 0x20
485 and 0x7F of GL area work as control characters SPACE and DEL
486 respectively, and code 0xA0 and 0xFF of GR area should not be used.
487
488 There are two ways of invocation: locking-shift and single-shift.
489 With locking-shift, the invocation lasts until the next different
490 invocation, whereas with single-shift, the invocation works only
491 for the following character and doesn't affect locking-shift.
492 Invocations are done by the following control characters or escape
493 sequences.
494
495 ----------------------------------------------------------------------
496 function control char escape sequence description
497 ----------------------------------------------------------------------
498 SI (shift-in) 0x0F none invoke G0 to GL
499 SI (shift-out) 0x0E none invoke G1 to GL
500 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
501 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
502 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
503 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
504 ----------------------------------------------------------------------
505 The first four are for locking-shift. Control characters for these
506 functions are defined by macros ISO_CODE_XXX in `coding.h'.
507
508 Designations are done by the following escape sequences.
509 ----------------------------------------------------------------------
510 escape sequence description
511 ----------------------------------------------------------------------
512 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
513 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
514 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
515 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
516 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
517 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
518 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
519 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
520 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
521 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
522 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
523 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
524 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
525 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
526 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
527 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
528 ----------------------------------------------------------------------
529
530 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
531 of dimension 1, chars 94, and final character <F>, and etc.
532
533 Note (*): Although these designations are not allowed in ISO2022,
534 Emacs accepts them on decoding, and produces them on encoding
535 CHARS96 character set in a coding system which is characterized as
536 7-bit environment, non-locking-shift, and non-single-shift.
537
538 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
539 '(' can be omitted. We call this as "short-form" here after.
540
541 Now you may notice that there are a lot of ways for encoding the
542 same multilingual text in ISO2022. Actually, there exist many
543 coding systems such as Compound Text (used in X's inter client
544 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
545 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
546 localized platforms), and all of these are variants of ISO2022.
547
548 In addition to the above, Emacs handles two more kinds of escape
549 sequences: ISO6429's direction specification and Emacs' private
550 sequence for specifying character composition.
551
552 ISO6429's direction specification takes the following format:
553 o CSI ']' -- end of the current direction
554 o CSI '0' ']' -- end of the current direction
555 o CSI '1' ']' -- start of left-to-right text
556 o CSI '2' ']' -- start of right-to-left text
557 The control character CSI (0x9B: control sequence introducer) is
558 abbreviated to the escape sequence ESC '[' in 7-bit environment.
559
560 Character composition specification takes the following format:
561 o ESC '0' -- start character composition
562 o ESC '1' -- end character composition
563 Since these are not standard escape sequences of any ISO, the use
564 of them for these meaning is restricted to Emacs only. */
565
566 enum iso_code_class_type iso_code_class[256];
567
568 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
569 Check if a text is encoded in ISO2022. If it is, returns an
570 integer in which appropriate flag bits any of:
571 CODING_CATEGORY_MASK_ISO_7
572 CODING_CATEGORY_MASK_ISO_8_1
573 CODING_CATEGORY_MASK_ISO_8_2
574 CODING_CATEGORY_MASK_ISO_ELSE
575 are set. If a code which should never appear in ISO2022 is found,
576 returns 0. */
577
578 int
579 detect_coding_iso2022 (src, src_end)
580 unsigned char *src, *src_end;
581 {
582 unsigned char graphic_register[4];
583 unsigned char c, esc_cntl;
584 int mask = (CODING_CATEGORY_MASK_ISO_7
585 | CODING_CATEGORY_MASK_ISO_8_1
586 | CODING_CATEGORY_MASK_ISO_8_2);
587 /* We may look ahead maximum 3 bytes. */
588 unsigned char *adjusted_src_end = src_end - 3;
589 int i;
590
591 for (i = 0; i < 4; i++)
592 graphic_register[i] = CHARSET_ASCII;
593
594 while (src < adjusted_src_end)
595 {
596 c = *src++;
597 switch (c)
598 {
599 case ISO_CODE_ESC:
600 if (src >= adjusted_src_end)
601 break;
602 c = *src++;
603 if (c == '$')
604 {
605 /* Designation of 2-byte character set. */
606 if (src >= adjusted_src_end)
607 break;
608 c = *src++;
609 }
610 if ((c >= ')' && c <= '+') || (c >= '-' && c <= '/'))
611 /* Designation to graphic register 1, 2, or 3. */
612 mask &= ~CODING_CATEGORY_MASK_ISO_7;
613 else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
614 return CODING_CATEGORY_MASK_ISO_ELSE;
615 break;
616
617 case ISO_CODE_SI:
618 case ISO_CODE_SO:
619 return CODING_CATEGORY_MASK_ISO_ELSE;
620
621 case ISO_CODE_CSI:
622 case ISO_CODE_SS2:
623 case ISO_CODE_SS3:
624 mask &= ~CODING_CATEGORY_MASK_ISO_7;
625 break;
626
627 default:
628 if (c < 0x80)
629 break;
630 else if (c < 0xA0)
631 return 0;
632 else
633 {
634 int count = 1;
635
636 mask &= ~CODING_CATEGORY_MASK_ISO_7;
637 while (src < adjusted_src_end && *src >= 0xA0)
638 count++, src++;
639 if (count & 1 && src < adjusted_src_end)
640 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
641 }
642 break;
643 }
644 }
645
646 return mask;
647 }
648
649 /* Decode a character of which charset is CHARSET and the 1st position
650 code is C1. If dimension of CHARSET 2, the 2nd position code is
651 fetched from SRC and set to C2. If CHARSET is negative, it means
652 that we are decoding ill formed text, and what we can do is just to
653 read C1 as is. */
654
655 #define DECODE_ISO_CHARACTER(charset, c1) \
656 do { \
657 if ((charset) >= 0 && CHARSET_DIMENSION (charset) == 2) \
658 ONE_MORE_BYTE (c2); \
659 if (COMPOSING_HEAD_P (coding->composing)) \
660 { \
661 *dst++ = LEADING_CODE_COMPOSITION; \
662 if (COMPOSING_WITH_RULE_P (coding->composing)) \
663 /* To tell composition rules are embeded. */ \
664 *dst++ = 0xFF; \
665 coding->composing += 2; \
666 } \
667 if ((charset) < 0) \
668 *dst++ = c1; \
669 else if ((charset) == CHARSET_ASCII) \
670 DECODE_CHARACTER_ASCII (c1); \
671 else if (CHARSET_DIMENSION (charset) == 1) \
672 DECODE_CHARACTER_DIMENSION1 (charset, c1); \
673 else \
674 DECODE_CHARACTER_DIMENSION2 (charset, c1, c2); \
675 if (COMPOSING_WITH_RULE_P (coding->composing)) \
676 /* To tell a composition rule follows. */ \
677 coding->composing = COMPOSING_WITH_RULE_RULE; \
678 } while (0)
679
680 /* Set designation state into CODING. */
681 #define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
682 do { \
683 int charset = ISO_CHARSET_TABLE (dimension, chars, final_char); \
684 Lisp_Object temp \
685 = Fassq (CHARSET_SYMBOL (charset), Valternate_charset_table); \
686 if (! NILP (temp)) \
687 charset = get_charset_id (XCONS (temp)->cdr); \
688 if (charset >= 0) \
689 { \
690 if (coding->direction == 1 \
691 && CHARSET_REVERSE_CHARSET (charset) >= 0) \
692 charset = CHARSET_REVERSE_CHARSET (charset); \
693 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
694 } \
695 } while (0)
696
697 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
698
699 int
700 decode_coding_iso2022 (coding, source, destination,
701 src_bytes, dst_bytes, consumed)
702 struct coding_system *coding;
703 unsigned char *source, *destination;
704 int src_bytes, dst_bytes;
705 int *consumed;
706 {
707 unsigned char *src = source;
708 unsigned char *src_end = source + src_bytes;
709 unsigned char *dst = destination;
710 unsigned char *dst_end = destination + dst_bytes;
711 /* Since the maximum bytes produced by each loop is 7, we subtract 6
712 from DST_END to assure that overflow checking is necessary only
713 at the head of loop. */
714 unsigned char *adjusted_dst_end = dst_end - 6;
715 int charset;
716 /* Charsets invoked to graphic plane 0 and 1 respectively. */
717 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
718 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
719
720 while (src < src_end && dst < adjusted_dst_end)
721 {
722 /* SRC_BASE remembers the start position in source in each loop.
723 The loop will be exited when there's not enough source text
724 to analyze long escape sequence or 2-byte code (within macros
725 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
726 to SRC_BASE before exiting. */
727 unsigned char *src_base = src;
728 unsigned char c1 = *src++, c2, cmprule;
729
730 switch (iso_code_class [c1])
731 {
732 case ISO_0x20_or_0x7F:
733 if (!coding->composing
734 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
735 {
736 /* This is SPACE or DEL. */
737 *dst++ = c1;
738 break;
739 }
740 /* This is a graphic character, we fall down ... */
741
742 case ISO_graphic_plane_0:
743 if (coding->composing == COMPOSING_WITH_RULE_RULE)
744 {
745 /* This is a composition rule. */
746 *dst++ = c1 | 0x80;
747 coding->composing = COMPOSING_WITH_RULE_TAIL;
748 }
749 else
750 DECODE_ISO_CHARACTER (charset0, c1);
751 break;
752
753 case ISO_0xA0_or_0xFF:
754 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
755 {
756 /* Invalid code. */
757 *dst++ = c1;
758 break;
759 }
760 /* This is a graphic character, we fall down ... */
761
762 case ISO_graphic_plane_1:
763 DECODE_ISO_CHARACTER (charset1, c1);
764 break;
765
766 case ISO_control_code:
767 /* All ISO2022 control characters in this class have the
768 same representation in Emacs internal format. */
769 *dst++ = c1;
770 break;
771
772 case ISO_carriage_return:
773 if (coding->eol_type == CODING_EOL_CR)
774 {
775 *dst++ = '\n';
776 }
777 else if (coding->eol_type == CODING_EOL_CRLF)
778 {
779 ONE_MORE_BYTE (c1);
780 if (c1 == ISO_CODE_LF)
781 *dst++ = '\n';
782 else
783 {
784 src--;
785 *dst++ = c1;
786 }
787 }
788 else
789 {
790 *dst++ = c1;
791 }
792 break;
793
794 case ISO_shift_out:
795 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
796 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
797 break;
798
799 case ISO_shift_in:
800 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
801 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
802 break;
803
804 case ISO_single_shift_2_7:
805 case ISO_single_shift_2:
806 /* SS2 is handled as an escape sequence of ESC 'N' */
807 c1 = 'N';
808 goto label_escape_sequence;
809
810 case ISO_single_shift_3:
811 /* SS2 is handled as an escape sequence of ESC 'O' */
812 c1 = 'O';
813 goto label_escape_sequence;
814
815 case ISO_control_sequence_introducer:
816 /* CSI is handled as an escape sequence of ESC '[' ... */
817 c1 = '[';
818 goto label_escape_sequence;
819
820 case ISO_escape:
821 ONE_MORE_BYTE (c1);
822 label_escape_sequence:
823 /* Escape sequences handled by Emacs are invocation,
824 designation, direction specification, and character
825 composition specification. */
826 switch (c1)
827 {
828 case '&': /* revision of following character set */
829 ONE_MORE_BYTE (c1);
830 if (!(c1 >= '@' && c1 <= '~'))
831 {
832 goto label_invalid_escape_sequence;
833 }
834 ONE_MORE_BYTE (c1);
835 if (c1 != ISO_CODE_ESC)
836 {
837 goto label_invalid_escape_sequence;
838 }
839 ONE_MORE_BYTE (c1);
840 goto label_escape_sequence;
841
842 case '$': /* designation of 2-byte character set */
843 ONE_MORE_BYTE (c1);
844 if (c1 >= '@' && c1 <= 'B')
845 { /* designation of JISX0208.1978, GB2312.1980,
846 or JISX0208.1980 */
847 DECODE_DESIGNATION (0, 2, 94, c1);
848 }
849 else if (c1 >= 0x28 && c1 <= 0x2B)
850 { /* designation of DIMENSION2_CHARS94 character set */
851 ONE_MORE_BYTE (c2);
852 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
853 }
854 else if (c1 >= 0x2C && c1 <= 0x2F)
855 { /* designation of DIMENSION2_CHARS96 character set */
856 ONE_MORE_BYTE (c2);
857 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
858 }
859 else
860 {
861 goto label_invalid_escape_sequence;
862 }
863 break;
864
865 case 'n': /* invocation of locking-shift-2 */
866 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
867 break;
868
869 case 'o': /* invocation of locking-shift-3 */
870 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
871 break;
872
873 case 'N': /* invocation of single-shift-2 */
874 ONE_MORE_BYTE (c1);
875 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
876 DECODE_ISO_CHARACTER (charset, c1);
877 break;
878
879 case 'O': /* invocation of single-shift-3 */
880 ONE_MORE_BYTE (c1);
881 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
882 DECODE_ISO_CHARACTER (charset, c1);
883 break;
884
885 case '0': /* start composing without embeded rules */
886 coding->composing = COMPOSING_NO_RULE_HEAD;
887 break;
888
889 case '1': /* end composing */
890 coding->composing = COMPOSING_NO;
891 break;
892
893 case '2': /* start composing with embeded rules */
894 coding->composing = COMPOSING_WITH_RULE_HEAD;
895 break;
896
897 case '[': /* specification of direction */
898 /* For the moment, nested direction is not supported.
899 So, the value of `coding->direction' is 0 or 1: 0
900 means left-to-right, 1 means right-to-left. */
901 ONE_MORE_BYTE (c1);
902 switch (c1)
903 {
904 case ']': /* end of the current direction */
905 coding->direction = 0;
906
907 case '0': /* end of the current direction */
908 case '1': /* start of left-to-right direction */
909 ONE_MORE_BYTE (c1);
910 if (c1 == ']')
911 coding->direction = 0;
912 else
913 goto label_invalid_escape_sequence;
914 break;
915
916 case '2': /* start of right-to-left direction */
917 ONE_MORE_BYTE (c1);
918 if (c1 == ']')
919 coding->direction= 1;
920 else
921 goto label_invalid_escape_sequence;
922 break;
923
924 default:
925 goto label_invalid_escape_sequence;
926 }
927 break;
928
929 default:
930 if (c1 >= 0x28 && c1 <= 0x2B)
931 { /* designation of DIMENSION1_CHARS94 character set */
932 ONE_MORE_BYTE (c2);
933 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
934 }
935 else if (c1 >= 0x2C && c1 <= 0x2F)
936 { /* designation of DIMENSION1_CHARS96 character set */
937 ONE_MORE_BYTE (c2);
938 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
939 }
940 else
941 {
942 goto label_invalid_escape_sequence;
943 }
944 }
945 /* We must update these variables now. */
946 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
947 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
948 break;
949
950 label_invalid_escape_sequence:
951 {
952 int length = src - src_base;
953
954 bcopy (src_base, dst, length);
955 dst += length;
956 }
957 }
958 continue;
959
960 label_end_of_loop:
961 coding->carryover_size = src - src_base;
962 bcopy (src_base, coding->carryover, coding->carryover_size);
963 src = src_base;
964 break;
965 }
966
967 /* If this is the last block of the text to be decoded, we had
968 better just flush out all remaining codes in the text although
969 they are not valid characters. */
970 if (coding->last_block)
971 {
972 bcopy (src, dst, src_end - src);
973 dst += (src_end - src);
974 src = src_end;
975 }
976 *consumed = src - source;
977 return dst - destination;
978 }
979
980 /* ISO2022 encoding staffs. */
981
982 /*
983 It is not enough to say just "ISO2022" on encoding, but we have to
984 specify more details. In Emacs, each coding-system of ISO2022
985 variant has the following specifications:
986 1. Initial designation to G0 thru G3.
987 2. Allows short-form designation?
988 3. ASCII should be designated to G0 before control characters?
989 4. ASCII should be designated to G0 at end of line?
990 5. 7-bit environment or 8-bit environment?
991 6. Use locking-shift?
992 7. Use Single-shift?
993 And the following two are only for Japanese:
994 8. Use ASCII in place of JIS0201-1976-Roman?
995 9. Use JISX0208-1983 in place of JISX0208-1978?
996 These specifications are encoded in `coding->flags' as flag bits
997 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
998 detail.
999 */
1000
1001 /* Produce codes (escape sequence) for designating CHARSET to graphic
1002 register REG. If <final-char> of CHARSET is '@', 'A', or 'B' and
1003 the coding system CODING allows, produce designation sequence of
1004 short-form. */
1005
1006 #define ENCODE_DESIGNATION(charset, reg, coding) \
1007 do { \
1008 unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \
1009 char *intermediate_char_94 = "()*+"; \
1010 char *intermediate_char_96 = ",-./"; \
1011 Lisp_Object temp \
1012 = Fassq (make_number (charset), Vcharset_revision_alist); \
1013 if (! NILP (temp)) \
1014 { \
1015 *dst++ = ISO_CODE_ESC; \
1016 *dst++ = '&'; \
1017 *dst++ = XINT (XCONS (temp)->cdr) + '@'; \
1018 } \
1019 *dst++ = ISO_CODE_ESC; \
1020 if (CHARSET_DIMENSION (charset) == 1) \
1021 { \
1022 if (CHARSET_CHARS (charset) == 94) \
1023 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1024 else \
1025 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1026 } \
1027 else \
1028 { \
1029 *dst++ = '$'; \
1030 if (CHARSET_CHARS (charset) == 94) \
1031 { \
1032 if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
1033 || reg != 0 \
1034 || final_char < '@' || final_char > 'B') \
1035 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1036 } \
1037 else \
1038 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1039 } \
1040 *dst++ = final_char; \
1041 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
1042 } while (0)
1043
1044 /* The following two macros produce codes (control character or escape
1045 sequence) for ISO2022 single-shift functions (single-shift-2 and
1046 single-shift-3). */
1047
1048 #define ENCODE_SINGLE_SHIFT_2 \
1049 do { \
1050 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1051 *dst++ = ISO_CODE_ESC, *dst++ = 'N'; \
1052 else \
1053 *dst++ = ISO_CODE_SS2; \
1054 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1055 } while (0)
1056
1057 #define ENCODE_SINGLE_SHIFT_3 \
1058 do { \
1059 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1060 *dst++ = ISO_CODE_ESC, *dst++ = 'O'; \
1061 else \
1062 *dst++ = ISO_CODE_SS3; \
1063 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1064 } while (0)
1065
1066 /* The following four macros produce codes (control character or
1067 escape sequence) for ISO2022 locking-shift functions (shift-in,
1068 shift-out, locking-shift-2, and locking-shift-3). */
1069
1070 #define ENCODE_SHIFT_IN \
1071 do { \
1072 *dst++ = ISO_CODE_SI; \
1073 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1074 } while (0)
1075
1076 #define ENCODE_SHIFT_OUT \
1077 do { \
1078 *dst++ = ISO_CODE_SO; \
1079 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1080 } while (0)
1081
1082 #define ENCODE_LOCKING_SHIFT_2 \
1083 do { \
1084 *dst++ = ISO_CODE_ESC, *dst++ = 'n'; \
1085 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1086 } while (0)
1087
1088 #define ENCODE_LOCKING_SHIFT_3 \
1089 do { \
1090 *dst++ = ISO_CODE_ESC, *dst++ = 'o'; \
1091 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1092 } while (0)
1093
1094 /* Produce codes for a DIMENSION1 character of which character set is
1095 CHARSET and position-code is C1. Designation and invocation
1096 sequences are also produced in advance if necessary. */
1097
1098
1099 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
1100 do { \
1101 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1102 { \
1103 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1104 *dst++ = c1 & 0x7F; \
1105 else \
1106 *dst++ = c1 | 0x80; \
1107 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1108 break; \
1109 } \
1110 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1111 { \
1112 *dst++ = c1 & 0x7F; \
1113 break; \
1114 } \
1115 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1116 { \
1117 *dst++ = c1 | 0x80; \
1118 break; \
1119 } \
1120 else \
1121 /* Since CHARSET is not yet invoked to any graphic planes, we \
1122 must invoke it, or, at first, designate it to some graphic \
1123 register. Then repeat the loop to actually produce the \
1124 character. */ \
1125 dst = encode_invocation_designation (charset, coding, dst); \
1126 } while (1)
1127
1128 /* Produce codes for a DIMENSION2 character of which character set is
1129 CHARSET and position-codes are C1 and C2. Designation and
1130 invocation codes are also produced in advance if necessary. */
1131
1132 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
1133 do { \
1134 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1135 { \
1136 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1137 *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F; \
1138 else \
1139 *dst++ = c1 | 0x80, *dst++ = c2 | 0x80; \
1140 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1141 break; \
1142 } \
1143 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1144 { \
1145 *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F; \
1146 break; \
1147 } \
1148 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1149 { \
1150 *dst++ = c1 | 0x80, *dst++= c2 | 0x80; \
1151 break; \
1152 } \
1153 else \
1154 /* Since CHARSET is not yet invoked to any graphic planes, we \
1155 must invoke it, or, at first, designate it to some graphic \
1156 register. Then repeat the loop to actually produce the \
1157 character. */ \
1158 dst = encode_invocation_designation (charset, coding, dst); \
1159 } while (1)
1160
1161 /* Produce designation and invocation codes at a place pointed by DST
1162 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1163 Return new DST. */
1164
1165 unsigned char *
1166 encode_invocation_designation (charset, coding, dst)
1167 int charset;
1168 struct coding_system *coding;
1169 unsigned char *dst;
1170 {
1171 int reg; /* graphic register number */
1172
1173 /* At first, check designations. */
1174 for (reg = 0; reg < 4; reg++)
1175 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1176 break;
1177
1178 if (reg >= 4)
1179 {
1180 /* CHARSET is not yet designated to any graphic registers. */
1181 /* At first check the requested designation. */
1182 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1183 if (reg < 0)
1184 /* Since CHARSET requests no special designation, designate to
1185 graphic register 0. */
1186 reg = 0;
1187
1188 ENCODE_DESIGNATION (charset, reg, coding);
1189 }
1190
1191 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1192 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1193 {
1194 /* Since the graphic register REG is not invoked to any graphic
1195 planes, invoke it to graphic plane 0. */
1196 switch (reg)
1197 {
1198 case 0: /* graphic register 0 */
1199 ENCODE_SHIFT_IN;
1200 break;
1201
1202 case 1: /* graphic register 1 */
1203 ENCODE_SHIFT_OUT;
1204 break;
1205
1206 case 2: /* graphic register 2 */
1207 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1208 ENCODE_SINGLE_SHIFT_2;
1209 else
1210 ENCODE_LOCKING_SHIFT_2;
1211 break;
1212
1213 case 3: /* graphic register 3 */
1214 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1215 ENCODE_SINGLE_SHIFT_3;
1216 else
1217 ENCODE_LOCKING_SHIFT_3;
1218 break;
1219 }
1220 }
1221 return dst;
1222 }
1223
1224 /* The following two macros produce codes for indicating composition. */
1225 #define ENCODE_COMPOSITION_NO_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '0'
1226 #define ENCODE_COMPOSITION_WITH_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '2'
1227 #define ENCODE_COMPOSITION_END *dst++ = ISO_CODE_ESC, *dst++ = '1'
1228
1229 /* The following three macros produce codes for indicating direction
1230 of text. */
1231 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
1232 do { \
1233 if (coding->flags == CODING_FLAG_ISO_SEVEN_BITS) \
1234 *dst++ = ISO_CODE_ESC, *dst++ = '['; \
1235 else \
1236 *dst++ = ISO_CODE_CSI; \
1237 } while (0)
1238
1239 #define ENCODE_DIRECTION_R2L \
1240 ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '2', *dst++ = ']'
1241
1242 #define ENCODE_DIRECTION_L2R \
1243 ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '0', *dst++ = ']'
1244
1245 /* Produce codes for designation and invocation to reset the graphic
1246 planes and registers to initial state. */
1247 #define ENCODE_RESET_PLANE_AND_REGISTER(eol) \
1248 do { \
1249 int reg; \
1250 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
1251 ENCODE_SHIFT_IN; \
1252 for (reg = 0; reg < 4; reg++) \
1253 { \
1254 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) < 0) \
1255 { \
1256 if (eol) CODING_SPEC_ISO_DESIGNATION (coding, reg) = -1; \
1257 } \
1258 else if (CODING_SPEC_ISO_DESIGNATION (coding, reg) \
1259 != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)) \
1260 ENCODE_DESIGNATION \
1261 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1262 } \
1263 } while (0)
1264
1265 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1266
1267 int
1268 encode_coding_iso2022 (coding, source, destination,
1269 src_bytes, dst_bytes, consumed)
1270 struct coding_system *coding;
1271 unsigned char *source, *destination;
1272 int src_bytes, dst_bytes;
1273 int *consumed;
1274 {
1275 unsigned char *src = source;
1276 unsigned char *src_end = source + src_bytes;
1277 unsigned char *dst = destination;
1278 unsigned char *dst_end = destination + dst_bytes;
1279 /* Since the maximum bytes produced by each loop is 6, we subtract 5
1280 from DST_END to assure overflow checking is necessary only at the
1281 head of loop. */
1282 unsigned char *adjusted_dst_end = dst_end - 5;
1283
1284 while (src < src_end && dst < adjusted_dst_end)
1285 {
1286 /* SRC_BASE remembers the start position in source in each loop.
1287 The loop will be exited when there's not enough source text
1288 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1289 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1290 reset to SRC_BASE before exiting. */
1291 unsigned char *src_base = src;
1292 unsigned char c1 = *src++, c2, c3, c4;
1293 int charset;
1294
1295 /* If we are seeing a component of a composite character, we are
1296 seeing a leading-code specially encoded for composition, or a
1297 composition rule if composing with rule. We must set C1
1298 to a normal leading-code or an ASCII code. If we are not at
1299 a composed character, we must reset the composition state. */
1300 if (COMPOSING_P (coding->composing))
1301 {
1302 if (c1 < 0xA0)
1303 {
1304 /* We are not in a composite character any longer. */
1305 coding->composing = COMPOSING_NO;
1306 ENCODE_COMPOSITION_END;
1307 }
1308 else
1309 {
1310 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1311 {
1312 *dst++ = c1 & 0x7F;
1313 coding->composing = COMPOSING_WITH_RULE_HEAD;
1314 continue;
1315 }
1316 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1317 coding->composing = COMPOSING_WITH_RULE_RULE;
1318 if (c1 == 0xA0)
1319 {
1320 /* This is an ASCII component. */
1321 ONE_MORE_BYTE (c1);
1322 c1 &= 0x7F;
1323 }
1324 else
1325 /* This is a leading-code of non ASCII component. */
1326 c1 -= 0x20;
1327 }
1328 }
1329
1330 /* Now encode one character. C1 is a control character, an
1331 ASCII character, or a leading-code of multi-byte character. */
1332 switch (emacs_code_class[c1])
1333 {
1334 case EMACS_ascii_code:
1335 ENCODE_ISO_CHARACTER_DIMENSION1 (CHARSET_ASCII, c1);
1336 break;
1337
1338 case EMACS_control_code:
1339 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1340 ENCODE_RESET_PLANE_AND_REGISTER (0);
1341 *dst++ = c1;
1342 break;
1343
1344 case EMACS_carriage_return_code:
1345 if (!coding->selective)
1346 {
1347 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1348 ENCODE_RESET_PLANE_AND_REGISTER (0);
1349 *dst++ = c1;
1350 break;
1351 }
1352 /* fall down to treat '\r' as '\n' ... */
1353
1354 case EMACS_linefeed_code:
1355 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1356 ENCODE_RESET_PLANE_AND_REGISTER (1);
1357 if (coding->eol_type == CODING_EOL_LF
1358 || coding->eol_type == CODING_EOL_AUTOMATIC)
1359 *dst++ = ISO_CODE_LF;
1360 else if (coding->eol_type == CODING_EOL_CRLF)
1361 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1362 else
1363 *dst++ = ISO_CODE_CR;
1364 break;
1365
1366 case EMACS_leading_code_2:
1367 ONE_MORE_BYTE (c2);
1368 ENCODE_ISO_CHARACTER_DIMENSION1 (c1, c2);
1369 break;
1370
1371 case EMACS_leading_code_3:
1372 TWO_MORE_BYTES (c2, c3);
1373 if (c1 < LEADING_CODE_PRIVATE_11)
1374 ENCODE_ISO_CHARACTER_DIMENSION2 (c1, c2, c3);
1375 else
1376 ENCODE_ISO_CHARACTER_DIMENSION1 (c2, c3);
1377 break;
1378
1379 case EMACS_leading_code_4:
1380 THREE_MORE_BYTES (c2, c3, c4);
1381 ENCODE_ISO_CHARACTER_DIMENSION2 (c2, c3, c4);
1382 break;
1383
1384 case EMACS_leading_code_composition:
1385 ONE_MORE_BYTE (c1);
1386 if (c1 == 0xFF)
1387 {
1388 coding->composing = COMPOSING_WITH_RULE_HEAD;
1389 ENCODE_COMPOSITION_WITH_RULE_START;
1390 }
1391 else
1392 {
1393 /* Rewind one byte because it is a character code of
1394 composition elements. */
1395 src--;
1396 coding->composing = COMPOSING_NO_RULE_HEAD;
1397 ENCODE_COMPOSITION_NO_RULE_START;
1398 }
1399 break;
1400
1401 case EMACS_invalid_code:
1402 *dst++ = c1;
1403 break;
1404 }
1405 continue;
1406 label_end_of_loop:
1407 coding->carryover_size = src - src_base;
1408 bcopy (src_base, coding->carryover, coding->carryover_size);
1409 src = src_base;
1410 break;
1411 }
1412
1413 /* If this is the last block of the text to be encoded, we must
1414 reset the state of graphic planes and registers to initial one.
1415 In addition, we had better just flush out all remaining codes in
1416 the text although they are not valid characters. */
1417 if (coding->last_block)
1418 {
1419 ENCODE_RESET_PLANE_AND_REGISTER (1);
1420 bcopy(src, dst, src_end - src);
1421 dst += (src_end - src);
1422 src = src_end;
1423 }
1424 *consumed = src - source;
1425 return dst - destination;
1426 }
1427
1428 \f
1429 /*** 4. SJIS and BIG5 handlers ***/
1430
1431 /* Although SJIS and BIG5 are not ISO's coding system, They are used
1432 quite widely. So, for the moment, Emacs supports them in the bare
1433 C code. But, in the future, they may be supported only by CCL. */
1434
1435 /* SJIS is a coding system encoding three character sets: ASCII, right
1436 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
1437 as is. A character of charset katakana-jisx0201 is encoded by
1438 "position-code + 0x80". A character of charset japanese-jisx0208
1439 is encoded in 2-byte but two position-codes are divided and shifted
1440 so that it fit in the range below.
1441
1442 --- CODE RANGE of SJIS ---
1443 (character set) (range)
1444 ASCII 0x00 .. 0x7F
1445 KATAKANA-JISX0201 0xA0 .. 0xDF
1446 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1447 (2nd byte) 0x40 .. 0xFF
1448 -------------------------------
1449
1450 */
1451
1452 /* BIG5 is a coding system encoding two character sets: ASCII and
1453 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1454 character set and is encoded in two-byte.
1455
1456 --- CODE RANGE of BIG5 ---
1457 (character set) (range)
1458 ASCII 0x00 .. 0x7F
1459 Big5 (1st byte) 0xA1 .. 0xFE
1460 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1461 --------------------------
1462
1463 Since the number of characters in Big5 is larger than maximum
1464 characters in Emacs' charset (96x96), it can't be handled as one
1465 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
1466 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
1467 contains frequently used characters and the latter contains less
1468 frequently used characters. */
1469
1470 /* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
1471 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
1472 C1 and C2 are the 1st and 2nd position-codes of of Emacs' internal
1473 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
1474
1475 /* Number of Big5 characters which have the same code in 1st byte. */
1476 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
1477
1478 #define DECODE_BIG5(b1, b2, charset, c1, c2) \
1479 do { \
1480 unsigned int temp \
1481 = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62); \
1482 if (b1 < 0xC9) \
1483 charset = charset_big5_1; \
1484 else \
1485 { \
1486 charset = charset_big5_2; \
1487 temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW; \
1488 } \
1489 c1 = temp / (0xFF - 0xA1) + 0x21; \
1490 c2 = temp % (0xFF - 0xA1) + 0x21; \
1491 } while (0)
1492
1493 #define ENCODE_BIG5(charset, c1, c2, b1, b2) \
1494 do { \
1495 unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21); \
1496 if (charset == charset_big5_2) \
1497 temp += BIG5_SAME_ROW * (0xC9 - 0xA1); \
1498 b1 = temp / BIG5_SAME_ROW + 0xA1; \
1499 b2 = temp % BIG5_SAME_ROW; \
1500 b2 += b2 < 0x3F ? 0x40 : 0x62; \
1501 } while (0)
1502
1503 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1504 Check if a text is encoded in SJIS. If it is, return
1505 CODING_CATEGORY_MASK_SJIS, else return 0. */
1506
1507 int
1508 detect_coding_sjis (src, src_end)
1509 unsigned char *src, *src_end;
1510 {
1511 unsigned char c;
1512
1513 while (src < src_end)
1514 {
1515 c = *src++;
1516 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1517 return 0;
1518 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1519 {
1520 if (src < src_end && *src++ < 0x40)
1521 return 0;
1522 }
1523 }
1524 return CODING_CATEGORY_MASK_SJIS;
1525 }
1526
1527 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1528 Check if a text is encoded in BIG5. If it is, return
1529 CODING_CATEGORY_MASK_BIG5, else return 0. */
1530
1531 int
1532 detect_coding_big5 (src, src_end)
1533 unsigned char *src, *src_end;
1534 {
1535 unsigned char c;
1536
1537 while (src < src_end)
1538 {
1539 c = *src++;
1540 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1541 return 0;
1542 if (c >= 0xA1)
1543 {
1544 if (src >= src_end)
1545 break;
1546 c = *src++;
1547 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1548 return 0;
1549 }
1550 }
1551 return CODING_CATEGORY_MASK_BIG5;
1552 }
1553
1554 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1555 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
1556
1557 int
1558 decode_coding_sjis_big5 (coding, source, destination,
1559 src_bytes, dst_bytes, consumed, sjis_p)
1560 struct coding_system *coding;
1561 unsigned char *source, *destination;
1562 int src_bytes, dst_bytes;
1563 int *consumed;
1564 int sjis_p;
1565 {
1566 unsigned char *src = source;
1567 unsigned char *src_end = source + src_bytes;
1568 unsigned char *dst = destination;
1569 unsigned char *dst_end = destination + dst_bytes;
1570 /* Since the maximum bytes produced by each loop is 4, we subtract 3
1571 from DST_END to assure overflow checking is necessary only at the
1572 head of loop. */
1573 unsigned char *adjusted_dst_end = dst_end - 3;
1574
1575 while (src < src_end && dst < adjusted_dst_end)
1576 {
1577 /* SRC_BASE remembers the start position in source in each loop.
1578 The loop will be exited when there's not enough source text
1579 to analyze two-byte character (within macro ONE_MORE_BYTE).
1580 In that case, SRC is reset to SRC_BASE before exiting. */
1581 unsigned char *src_base = src;
1582 unsigned char c1 = *src++, c2, c3, c4;
1583
1584 if (c1 == '\r')
1585 {
1586 if (coding->eol_type == CODING_EOL_CRLF)
1587 {
1588 ONE_MORE_BYTE (c2);
1589 if (c2 == '\n')
1590 *dst++ = c2;
1591 else
1592 /* To process C2 again, SRC is subtracted by 1. */
1593 *dst++ = c1, src--;
1594 }
1595 else
1596 *dst++ = c1;
1597 }
1598 else if (c1 < 0x80)
1599 *dst++ = c1;
1600 else if (c1 < 0xA0 || c1 >= 0xE0)
1601 {
1602 /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1603 if (sjis_p)
1604 {
1605 ONE_MORE_BYTE (c2);
1606 DECODE_SJIS (c1, c2, c3, c4);
1607 DECODE_CHARACTER_DIMENSION2 (charset_jisx0208, c3, c4);
1608 }
1609 else if (c1 >= 0xE0 && c1 < 0xFF)
1610 {
1611 int charset;
1612
1613 ONE_MORE_BYTE (c2);
1614 DECODE_BIG5 (c1, c2, charset, c3, c4);
1615 DECODE_CHARACTER_DIMENSION2 (charset, c3, c4);
1616 }
1617 else /* Invalid code */
1618 *dst++ = c1;
1619 }
1620 else
1621 {
1622 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1623 if (sjis_p)
1624 DECODE_CHARACTER_DIMENSION1 (charset_katakana_jisx0201, c1);
1625 else
1626 {
1627 int charset;
1628
1629 ONE_MORE_BYTE (c2);
1630 DECODE_BIG5 (c1, c2, charset, c3, c4);
1631 DECODE_CHARACTER_DIMENSION2 (charset, c3, c4);
1632 }
1633 }
1634 continue;
1635
1636 label_end_of_loop:
1637 coding->carryover_size = src - src_base;
1638 bcopy (src_base, coding->carryover, coding->carryover_size);
1639 src = src_base;
1640 break;
1641 }
1642
1643 *consumed = src - source;
1644 return dst - destination;
1645 }
1646
1647 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1648 This function can encode `charset_ascii', `charset_katakana_jisx0201',
1649 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
1650 sure that all these charsets are registered as official charset
1651 (i.e. do not have extended leading-codes). Characters of other
1652 charsets are produced without any encoding. If SJIS_P is 1, encode
1653 SJIS text, else encode BIG5 text. */
1654
1655 int
1656 encode_coding_sjis_big5 (coding, source, destination,
1657 src_bytes, dst_bytes, consumed, sjis_p)
1658 struct coding_system *coding;
1659 unsigned char *source, *destination;
1660 int src_bytes, dst_bytes;
1661 int *consumed;
1662 int sjis_p;
1663 {
1664 unsigned char *src = source;
1665 unsigned char *src_end = source + src_bytes;
1666 unsigned char *dst = destination;
1667 unsigned char *dst_end = destination + dst_bytes;
1668 /* Since the maximum bytes produced by each loop is 2, we subtract 1
1669 from DST_END to assure overflow checking is necessary only at the
1670 head of loop. */
1671 unsigned char *adjusted_dst_end = dst_end - 1;
1672
1673 while (src < src_end && dst < adjusted_dst_end)
1674 {
1675 /* SRC_BASE remembers the start position in source in each loop.
1676 The loop will be exited when there's not enough source text
1677 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1678 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
1679 before exiting. */
1680 unsigned char *src_base = src;
1681 unsigned char c1 = *src++, c2, c3, c4;
1682
1683 if (coding->composing)
1684 {
1685 if (c1 == 0xA0)
1686 {
1687 ONE_MORE_BYTE (c1);
1688 c1 &= 0x7F;
1689 }
1690 else if (c1 >= 0xA0)
1691 c1 -= 0x20;
1692 else
1693 coding->composing = 0;
1694 }
1695
1696 switch (emacs_code_class[c1])
1697 {
1698 case EMACS_ascii_code:
1699 case EMACS_control_code:
1700 *dst++ = c1;
1701 break;
1702
1703 case EMACS_carriage_return_code:
1704 if (!coding->selective)
1705 {
1706 *dst++ = c1;
1707 break;
1708 }
1709 /* fall down to treat '\r' as '\n' ... */
1710
1711 case EMACS_linefeed_code:
1712 if (coding->eol_type == CODING_EOL_LF
1713 || coding->eol_type == CODING_EOL_AUTOMATIC)
1714 *dst++ = '\n';
1715 else if (coding->eol_type == CODING_EOL_CRLF)
1716 *dst++ = '\r', *dst++ = '\n';
1717 else
1718 *dst++ = '\r';
1719 break;
1720
1721 case EMACS_leading_code_2:
1722 ONE_MORE_BYTE (c2);
1723 if (sjis_p && c1 == charset_katakana_jisx0201)
1724 *dst++ = c2;
1725 else
1726 *dst++ = c1, *dst++ = c2;
1727 break;
1728
1729 case EMACS_leading_code_3:
1730 TWO_MORE_BYTES (c2, c3);
1731 c2 &= 0x7F, c3 &= 0x7F;
1732 if (sjis_p && c1 == charset_jisx0208)
1733 {
1734 unsigned char s1, s2;
1735
1736 ENCODE_SJIS (c2, c3, s1, s2);
1737 *dst++ = s1, *dst++ = s2;
1738 }
1739 else if (!sjis_p && (c1 == charset_big5_1 || c1 == charset_big5_2))
1740 {
1741 unsigned char b1, b2;
1742
1743 ENCODE_BIG5 (c1, c2, c3, b1, b2);
1744 *dst++ = b1, *dst++ = b2;
1745 }
1746 else
1747 *dst++ = c1, *dst++ = c2, *dst++ = c3;
1748 break;
1749
1750 case EMACS_leading_code_4:
1751 THREE_MORE_BYTES (c2, c3, c4);
1752 *dst++ = c1, *dst++ = c2, *dst++ = c3, *dst++ = c4;
1753 break;
1754
1755 case EMACS_leading_code_composition:
1756 coding->composing = 1;
1757 break;
1758
1759 default: /* i.e. case EMACS_invalid_code: */
1760 *dst++ = c1;
1761 }
1762 continue;
1763
1764 label_end_of_loop:
1765 coding->carryover_size = src - src_base;
1766 bcopy (src_base, coding->carryover, coding->carryover_size);
1767 src = src_base;
1768 break;
1769 }
1770
1771 *consumed = src - source;
1772 return dst - destination;
1773 }
1774
1775 \f
1776 /*** 5. End-of-line handlers ***/
1777
1778 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1779 This function is called only when `coding->eol_type' is
1780 CODING_EOL_CRLF or CODING_EOL_CR. */
1781
1782 decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1783 struct coding_system *coding;
1784 unsigned char *source, *destination;
1785 int src_bytes, dst_bytes;
1786 int *consumed;
1787 {
1788 unsigned char *src = source;
1789 unsigned char *src_end = source + src_bytes;
1790 unsigned char *dst = destination;
1791 unsigned char *dst_end = destination + dst_bytes;
1792 int produced;
1793
1794 switch (coding->eol_type)
1795 {
1796 case CODING_EOL_CRLF:
1797 {
1798 /* Since the maximum bytes produced by each loop is 2, we
1799 subtract 1 from DST_END to assure overflow checking is
1800 necessary only at the head of loop. */
1801 unsigned char *adjusted_dst_end = dst_end - 1;
1802
1803 while (src < src_end && dst < adjusted_dst_end)
1804 {
1805 unsigned char *src_base = src;
1806 unsigned char c = *src++;
1807 if (c == '\r')
1808 {
1809 ONE_MORE_BYTE (c);
1810 if (c != '\n')
1811 *dst++ = '\r';
1812
1813 }
1814 else
1815 *dst++ = c;
1816 continue;
1817
1818 label_end_of_loop:
1819 coding->carryover_size = src - src_base;
1820 bcopy (src_base, coding->carryover, coding->carryover_size);
1821 src = src_base;
1822 break;
1823 }
1824 *consumed = src - source;
1825 produced = dst - destination;
1826 break;
1827 }
1828
1829 case CODING_EOL_CR:
1830 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1831 bcopy (source, destination, produced);
1832 dst_end = destination + produced;
1833 while (dst < dst_end)
1834 if (*dst++ == '\r') dst[-1] = '\n';
1835 *consumed = produced;
1836 break;
1837
1838 default: /* i.e. case: CODING_EOL_LF */
1839 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1840 bcopy (source, destination, produced);
1841 *consumed = produced;
1842 break;
1843 }
1844
1845 return produced;
1846 }
1847
1848 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
1849 format of end-of-line according to `coding->eol_type'. If
1850 `coding->selective' is 1, code '\r' in source text also means
1851 end-of-line. */
1852
1853 encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1854 struct coding_system *coding;
1855 unsigned char *source, *destination;
1856 int src_bytes, dst_bytes;
1857 int *consumed;
1858 {
1859 unsigned char *src = source;
1860 unsigned char *dst = destination;
1861 int produced;
1862
1863 if (src_bytes <= 0)
1864 return 0;
1865
1866 switch (coding->eol_type)
1867 {
1868 case CODING_EOL_LF:
1869 case CODING_EOL_AUTOMATIC:
1870 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1871 bcopy (source, destination, produced);
1872 if (coding->selective)
1873 {
1874 int i = produced;
1875 while (i--)
1876 if (*dst++ == '\r') dst[-1] = '\n';
1877 }
1878 *consumed = produced;
1879
1880 case CODING_EOL_CRLF:
1881 {
1882 unsigned char c;
1883 unsigned char *src_end = source + src_bytes;
1884 unsigned char *dst_end = destination + dst_bytes;
1885 /* Since the maximum bytes produced by each loop is 2, we
1886 subtract 1 from DST_END to assure overflow checking is
1887 necessary only at the head of loop. */
1888 unsigned char *adjusted_dst_end = dst_end - 1;
1889
1890 while (src < src_end && dst < adjusted_dst_end)
1891 {
1892 c = *src++;
1893 if (c == '\n' || (c == '\r' && coding->selective))
1894 *dst++ = '\r', *dst++ = '\n';
1895 else
1896 *dst++ = c;
1897 }
1898 produced = dst - destination;
1899 *consumed = src - source;
1900 break;
1901 }
1902
1903 default: /* i.e. case CODING_EOL_CR: */
1904 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1905 bcopy (source, destination, produced);
1906 {
1907 int i = produced;
1908 while (i--)
1909 if (*dst++ == '\n') dst[-1] = '\r';
1910 }
1911 *consumed = produced;
1912 }
1913
1914 return produced;
1915 }
1916
1917 \f
1918 /*** 6. C library functions ***/
1919
1920 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
1921 has a property `coding-system'. The value of this property is a
1922 vector of length 5 (called as coding-vector). Among elements of
1923 this vector, the first (element[0]) and the fifth (element[4])
1924 carry important information for decoding/encoding. Before
1925 decoding/encoding, this information should be set in fields of a
1926 structure of type `coding_system'.
1927
1928 A value of property `coding-system' can be a symbol of another
1929 subsidiary coding-system. In that case, Emacs gets coding-vector
1930 from that symbol.
1931
1932 `element[0]' contains information to be set in `coding->type'. The
1933 value and its meaning is as follows:
1934
1935 0 -- coding_system_internal
1936 1 -- coding_system_sjis
1937 2 -- coding_system_iso2022
1938 3 -- coding_system_big5
1939 4 -- coding_system_ccl
1940 nil -- coding_system_no_conversion
1941 t -- coding_system_automatic
1942
1943 `element[4]' contains information to be set in `coding->flags' and
1944 `coding->spec'. The meaning varies by `coding->type'.
1945
1946 If `coding->type' is `coding_type_iso2022', element[4] is a vector
1947 of length 32 (of which the first 13 sub-elements are used now).
1948 Meanings of these sub-elements are:
1949
1950 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
1951 If the value is an integer of valid charset, the charset is
1952 assumed to be designated to graphic register N initially.
1953
1954 If the value is minus, it is a minus value of charset which
1955 reserves graphic register N, which means that the charset is
1956 not designated initially but should be designated to graphic
1957 register N just before encoding a character in that charset.
1958
1959 If the value is nil, graphic register N is never used on
1960 encoding.
1961
1962 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
1963 Each value takes t or nil. See the section ISO2022 of
1964 `coding.h' for more information.
1965
1966 If `coding->type' is `coding_type_big5', element[4] is t to denote
1967 BIG5-ETen or nil to denote BIG5-HKU.
1968
1969 If `coding->type' takes the other value, element[4] is ignored.
1970
1971 Emacs Lisp's coding system also carries information about format of
1972 end-of-line in a value of property `eol-type'. If the value is
1973 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
1974 means CODING_EOL_CR. If it is not integer, it should be a vector
1975 of subsidiary coding systems of which property `eol-type' has one
1976 of above values.
1977
1978 */
1979
1980 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
1981 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
1982 is setup so that no conversion is necessary and return -1, else
1983 return 0. */
1984
1985 int
1986 setup_coding_system (coding_system_symbol, coding)
1987 Lisp_Object coding_system_symbol;
1988 struct coding_system *coding;
1989 {
1990 Lisp_Object coding_system_vector = Qnil;
1991 Lisp_Object type, eol_type;
1992
1993 /* At first, set several fields default values. */
1994 coding->require_flushing = 0;
1995 coding->last_block = 0;
1996 coding->selective = 0;
1997 coding->composing = 0;
1998 coding->direction = 0;
1999 coding->carryover_size = 0;
2000 coding->symbol = Qnil;
2001 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2002
2003 /* Get value of property `coding-system'. If it is a Lisp symbol
2004 pointing another coding system, fetch its property until we get a
2005 vector. */
2006 while (!NILP (coding_system_symbol))
2007 {
2008 coding->symbol = coding_system_symbol;
2009 if (NILP (coding->post_read_conversion))
2010 coding->post_read_conversion = Fget (coding_system_symbol,
2011 Qpost_read_conversion);
2012 if (NILP (coding->pre_write_conversion))
2013 coding->pre_write_conversion = Fget (coding_system_symbol,
2014 Qpre_write_conversion);
2015
2016 coding_system_vector = Fget (coding_system_symbol, Qcoding_system);
2017 if (VECTORP (coding_system_vector))
2018 break;
2019 coding_system_symbol = coding_system_vector;
2020 }
2021 Vlast_coding_system_used = coding->symbol;
2022
2023 if (!VECTORP (coding_system_vector)
2024 || XVECTOR (coding_system_vector)->size != 5)
2025 goto label_invalid_coding_system;
2026
2027 /* Get value of property `eol-type' by searching from the root
2028 coding-system. */
2029 coding_system_symbol = coding->symbol;
2030 eol_type = Qnil;
2031 while (SYMBOLP (coding_system_symbol) && !NILP (coding_system_symbol))
2032 {
2033 eol_type = Fget (coding_system_symbol, Qeol_type);
2034 if (!NILP (eol_type))
2035 break;
2036 coding_system_symbol = Fget (coding_system_symbol, Qcoding_system);
2037 }
2038
2039 if (VECTORP (eol_type))
2040 coding->eol_type = CODING_EOL_AUTOMATIC;
2041 else if (XFASTINT (eol_type) == 1)
2042 coding->eol_type = CODING_EOL_CRLF;
2043 else if (XFASTINT (eol_type) == 2)
2044 coding->eol_type = CODING_EOL_CR;
2045 else
2046 coding->eol_type = CODING_EOL_LF;
2047
2048 type = XVECTOR (coding_system_vector)->contents[0];
2049 switch (XFASTINT (type))
2050 {
2051 case 0:
2052 coding->type = coding_type_internal;
2053 break;
2054
2055 case 1:
2056 coding->type = coding_type_sjis;
2057 break;
2058
2059 case 2:
2060 coding->type = coding_type_iso2022;
2061 {
2062 Lisp_Object val = XVECTOR (coding_system_vector)->contents[4];
2063 Lisp_Object *flags;
2064 int i, charset, default_reg_bits = 0;
2065
2066 if (!VECTORP (val) || XVECTOR (val)->size != 32)
2067 goto label_invalid_coding_system;
2068
2069 flags = XVECTOR (val)->contents;
2070 coding->flags
2071 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2072 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2073 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2074 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2075 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2076 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2077 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2078 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2079 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION));
2080
2081 /* Invoke graphic register 0 to plane 0. */
2082 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2083 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2084 CODING_SPEC_ISO_INVOCATION (coding, 1)
2085 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2086 /* Not single shifting at first. */
2087 CODING_SPEC_ISO_SINGLE_SHIFTING(coding) = 0;
2088
2089 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2090 FLAGS[REG] can be one of below:
2091 integer CHARSET: CHARSET occupies register I,
2092 t: designate nothing to REG initially, but can be used
2093 by any charsets,
2094 list of integer, nil, or t: designate the first
2095 element (if integer) to REG initially, the remaining
2096 elements (if integer) is designated to REG on request,
2097 if an element is t, REG can be used by any charset,
2098 nil: REG is never used. */
2099 for (charset = 0; charset < MAX_CHARSET; charset++)
2100 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = -1;
2101 for (i = 0; i < 4; i++)
2102 {
2103 if (INTEGERP (flags[i])
2104 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
2105 {
2106 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2107 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2108 }
2109 else if (EQ (flags[i], Qt))
2110 {
2111 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2112 default_reg_bits |= 1 << i;
2113 }
2114 else if (CONSP (flags[i]))
2115 {
2116 Lisp_Object tail = flags[i];
2117
2118 if (INTEGERP (XCONS (tail)->car)
2119 && (charset = XINT (XCONS (tail)->car),
2120 CHARSET_VALID_P (charset)))
2121 {
2122 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2123 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2124 }
2125 else
2126 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2127 tail = XCONS (tail)->cdr;
2128 while (CONSP (tail))
2129 {
2130 if (INTEGERP (XCONS (tail)->car)
2131 && (charset = XINT (XCONS (tail)->car),
2132 CHARSET_VALID_P (charset)))
2133 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2134 = i;
2135 else if (EQ (XCONS (tail)->car, Qt))
2136 default_reg_bits |= 1 << i;
2137 tail = XCONS (tail)->cdr;
2138 }
2139 }
2140 else
2141 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2142
2143 CODING_SPEC_ISO_DESIGNATION (coding, i)
2144 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2145 }
2146
2147 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2148 {
2149 /* REG 1 can be used only by locking shift in 7-bit env. */
2150 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2151 default_reg_bits &= ~2;
2152 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2153 /* Without any shifting, only REG 0 and 1 can be used. */
2154 default_reg_bits &= 3;
2155 }
2156
2157 for (charset = 0; charset < MAX_CHARSET; charset++)
2158 if (CHARSET_VALID_P (charset)
2159 && CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) < 0)
2160 {
2161 /* We have not yet decided where to designate CHARSET. */
2162 int reg_bits = default_reg_bits;
2163
2164 if (CHARSET_CHARS (charset) == 96)
2165 /* A charset of CHARS96 can't be designated to REG 0. */
2166 reg_bits &= ~1;
2167
2168 if (reg_bits)
2169 /* There exist some default graphic register. */
2170 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2171 = (reg_bits & 1
2172 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2173 else
2174 /* We anyway have to designate CHARSET to somewhere. */
2175 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2176 = (CHARSET_CHARS (charset) == 94
2177 ? 0
2178 : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2179 || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2180 ? 1
2181 : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2182 ? 2 : 0)));
2183 }
2184 }
2185 coding->require_flushing = 1;
2186 break;
2187
2188 case 3:
2189 coding->type = coding_type_big5;
2190 coding->flags
2191 = (NILP (XVECTOR (coding_system_vector)->contents[4])
2192 ? CODING_FLAG_BIG5_HKU
2193 : CODING_FLAG_BIG5_ETEN);
2194 break;
2195
2196 case 4:
2197 coding->type = coding_type_ccl;
2198 {
2199 Lisp_Object val = XVECTOR (coding_system_vector)->contents[4];
2200 if (CONSP (val)
2201 && VECTORP (XCONS (val)->car)
2202 && VECTORP (XCONS (val)->cdr))
2203 {
2204 setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2205 setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2206 }
2207 else
2208 goto label_invalid_coding_system;
2209 }
2210 coding->require_flushing = 1;
2211 break;
2212
2213 default:
2214 if (EQ (type, Qt))
2215 coding->type = coding_type_automatic;
2216 else
2217 coding->type = coding_type_no_conversion;
2218 break;
2219 }
2220 return 0;
2221
2222 label_invalid_coding_system:
2223 coding->type = coding_type_no_conversion;
2224 return -1;
2225 }
2226
2227 /* Emacs has a mechanism to automatically detect a coding system if it
2228 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
2229 it's impossible to distinguish some coding systems accurately
2230 because they use the same range of codes. So, at first, coding
2231 systems are categorized into 7, those are:
2232
2233 o coding-category-internal
2234
2235 The category for a coding system which has the same code range
2236 as Emacs' internal format. Assigned the coding-system (Lisp
2237 symbol) `coding-system-internal' by default.
2238
2239 o coding-category-sjis
2240
2241 The category for a coding system which has the same code range
2242 as SJIS. Assigned the coding-system (Lisp
2243 symbol) `coding-system-sjis' by default.
2244
2245 o coding-category-iso-7
2246
2247 The category for a coding system which has the same code range
2248 as ISO2022 of 7-bit environment. Assigned the coding-system
2249 (Lisp symbol) `coding-system-junet' by default.
2250
2251 o coding-category-iso-8-1
2252
2253 The category for a coding system which has the same code range
2254 as ISO2022 of 8-bit environment and graphic plane 1 used only
2255 for DIMENSION1 charset. Assigned the coding-system (Lisp
2256 symbol) `coding-system-ctext' by default.
2257
2258 o coding-category-iso-8-2
2259
2260 The category for a coding system which has the same code range
2261 as ISO2022 of 8-bit environment and graphic plane 1 used only
2262 for DIMENSION2 charset. Assigned the coding-system (Lisp
2263 symbol) `coding-system-euc-japan' by default.
2264
2265 o coding-category-iso-else
2266
2267 The category for a coding system which has the same code range
2268 as ISO2022 but not belongs to any of the above three
2269 categories. Assigned the coding-system (Lisp symbol)
2270 `coding-system-iso-2022-ss2-7' by default.
2271
2272 o coding-category-big5
2273
2274 The category for a coding system which has the same code range
2275 as BIG5. Assigned the coding-system (Lisp symbol)
2276 `coding-system-big5' by default.
2277
2278 o coding-category-binary
2279
2280 The category for a coding system not categorized in any of the
2281 above. Assigned the coding-system (Lisp symbol)
2282 `coding-system-noconv' by default.
2283
2284 Each of them is a Lisp symbol and the value is an actual
2285 `coding-system's (this is also a Lisp symbol) assigned by a user.
2286 What Emacs does actually is to detect a category of coding system.
2287 Then, it uses a `coding-system' assigned to it. If Emacs can't
2288 decide only one possible category, it selects a category of the
2289 highest priority. Priorities of categories are also specified by a
2290 user in a Lisp variable `coding-category-list'.
2291
2292 */
2293
2294 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2295 If it detects possible coding systems, return an integer in which
2296 appropriate flag bits are set. Flag bits are defined by macros
2297 CODING_CATEGORY_MASK_XXX in `coding.h'. */
2298
2299 int
2300 detect_coding_mask (src, src_bytes)
2301 unsigned char *src;
2302 int src_bytes;
2303 {
2304 register unsigned char c;
2305 unsigned char *src_end = src + src_bytes;
2306 int mask;
2307
2308 /* At first, skip all ASCII characters and control characters except
2309 for three ISO2022 specific control characters. */
2310 while (src < src_end)
2311 {
2312 c = *src;
2313 if (c >= 0x80
2314 || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2315 break;
2316 src++;
2317 }
2318
2319 if (src >= src_end)
2320 /* We found nothing other than ASCII. There's nothing to do. */
2321 return CODING_CATEGORY_MASK_ANY;
2322
2323 /* The text seems to be encoded in some multilingual coding system.
2324 Now, try to find in which coding system the text is encoded. */
2325 if (c < 0x80)
2326 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2327 /* C is an ISO2022 specific control code of C0. */
2328 mask = detect_coding_iso2022 (src, src_end);
2329
2330 else if (c == ISO_CODE_SS2 || c == ISO_CODE_SS3 || c == ISO_CODE_CSI)
2331 /* C is an ISO2022 specific control code of C1,
2332 or the first byte of SJIS's 2-byte character code,
2333 or a leading code of Emacs. */
2334 mask = (detect_coding_iso2022 (src, src_end)
2335 | detect_coding_sjis (src, src_end)
2336 | detect_coding_internal (src, src_end));
2337
2338 else if (c < 0xA0)
2339 /* C is the first byte of SJIS character code,
2340 or a leading-code of Emacs. */
2341 mask = (detect_coding_sjis (src, src_end)
2342 | detect_coding_internal (src, src_end));
2343
2344 else
2345 /* C is a character of ISO2022 in graphic plane right,
2346 or a SJIS's 1-byte character code (i.e. JISX0201),
2347 or the first byte of BIG5's 2-byte code. */
2348 mask = (detect_coding_iso2022 (src, src_end)
2349 | detect_coding_sjis (src, src_end)
2350 | detect_coding_big5 (src, src_end));
2351
2352 return mask;
2353 }
2354
2355 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2356 The information of the detected coding system is set in CODING. */
2357
2358 void
2359 detect_coding (coding, src, src_bytes)
2360 struct coding_system *coding;
2361 unsigned char *src;
2362 int src_bytes;
2363 {
2364 int mask = detect_coding_mask (src, src_bytes);
2365 int idx;
2366
2367 if (mask == CODING_CATEGORY_MASK_ANY)
2368 /* We found nothing other than ASCII. There's nothing to do. */
2369 return;
2370
2371 if (!mask)
2372 /* The source text seems to be encoded in unknown coding system.
2373 Emacs regards the category of such a kind of coding system as
2374 `coding-category-binary'. We assume that a user has assigned
2375 an appropriate coding system for a `coding-category-binary'. */
2376 idx = CODING_CATEGORY_IDX_BINARY;
2377 else
2378 {
2379 /* We found some plausible coding systems. Let's use a coding
2380 system of the highest priority. */
2381 Lisp_Object val = Vcoding_category_list;
2382
2383 if (CONSP (val))
2384 while (!NILP (val))
2385 {
2386 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2387 if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2388 break;
2389 val = XCONS (val)->cdr;
2390 }
2391 else
2392 val = Qnil;
2393
2394 if (NILP (val))
2395 {
2396 /* For unknown reason, `Vcoding_category_list' contains none
2397 of found categories. Let's use any of them. */
2398 for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2399 if (mask & (1 << idx))
2400 break;
2401 }
2402 }
2403 setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2404 }
2405
2406 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2407 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2408 CODING_EOL_CR, and CODING_EOL_AUTOMATIC. */
2409
2410 int
2411 detect_eol_type (src, src_bytes)
2412 unsigned char *src;
2413 int src_bytes;
2414 {
2415 unsigned char *src_end = src + src_bytes;
2416 unsigned char c;
2417
2418 while (src < src_end)
2419 {
2420 c = *src++;
2421 if (c == '\n')
2422 return CODING_EOL_LF;
2423 else if (c == '\r')
2424 {
2425 if (src < src_end && *src == '\n')
2426 return CODING_EOL_CRLF;
2427 else
2428 return CODING_EOL_CR;
2429 }
2430 }
2431 return CODING_EOL_AUTOMATIC;
2432 }
2433
2434 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2435 is encoded. If it detects an appropriate format of end-of-line, it
2436 sets the information in *CODING. */
2437
2438 void
2439 detect_eol (coding, src, src_bytes)
2440 struct coding_system *coding;
2441 unsigned char *src;
2442 int src_bytes;
2443 {
2444 Lisp_Object val;
2445 int eol_type = detect_eol_type (src, src_bytes);
2446
2447 if (eol_type == CODING_EOL_AUTOMATIC)
2448 /* We found no end-of-line in the source text. */
2449 return;
2450
2451 val = Fget (coding->symbol, Qeol_type);
2452 if (VECTORP (val) && XVECTOR (val)->size == 3)
2453 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2454 }
2455
2456 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
2457 decoding, it may detect coding system and format of end-of-line if
2458 those are not yet decided. */
2459
2460 int
2461 decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2462 struct coding_system *coding;
2463 unsigned char *source, *destination;
2464 int src_bytes, dst_bytes;
2465 int *consumed;
2466 {
2467 int produced;
2468
2469 if (src_bytes <= 0)
2470 {
2471 *consumed = 0;
2472 return 0;
2473 }
2474
2475 if (coding->type == coding_type_automatic)
2476 detect_coding (coding, source, src_bytes);
2477
2478 if (coding->eol_type == CODING_EOL_AUTOMATIC)
2479 detect_eol (coding, source, src_bytes);
2480
2481 coding->carryover_size = 0;
2482 switch (coding->type)
2483 {
2484 case coding_type_no_conversion:
2485 label_no_conversion:
2486 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2487 bcopy (source, destination, produced);
2488 *consumed = produced;
2489 break;
2490
2491 case coding_type_internal:
2492 case coding_type_automatic:
2493 if (coding->eol_type == CODING_EOL_LF
2494 || coding->eol_type == CODING_EOL_AUTOMATIC)
2495 goto label_no_conversion;
2496 produced = decode_eol (coding, source, destination,
2497 src_bytes, dst_bytes, consumed);
2498 break;
2499
2500 case coding_type_sjis:
2501 produced = decode_coding_sjis_big5 (coding, source, destination,
2502 src_bytes, dst_bytes, consumed,
2503 1);
2504 break;
2505
2506 case coding_type_iso2022:
2507 produced = decode_coding_iso2022 (coding, source, destination,
2508 src_bytes, dst_bytes, consumed);
2509 break;
2510
2511 case coding_type_big5:
2512 produced = decode_coding_sjis_big5 (coding, source, destination,
2513 src_bytes, dst_bytes, consumed,
2514 0);
2515 break;
2516
2517 case coding_type_ccl:
2518 produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2519 src_bytes, dst_bytes, consumed);
2520 break;
2521 }
2522
2523 return produced;
2524 }
2525
2526 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
2527
2528 int
2529 encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2530 struct coding_system *coding;
2531 unsigned char *source, *destination;
2532 int src_bytes, dst_bytes;
2533 int *consumed;
2534 {
2535 int produced;
2536
2537 coding->carryover_size = 0;
2538 switch (coding->type)
2539 {
2540 case coding_type_no_conversion:
2541 label_no_conversion:
2542 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2543 if (produced > 0)
2544 {
2545 bcopy (source, destination, produced);
2546 if (coding->selective)
2547 {
2548 unsigned char *p = destination, *pend = destination + produced;
2549 while (p < pend)
2550 if (*p++ = '\015') p[-1] = '\n';
2551 }
2552 }
2553 *consumed = produced;
2554 break;
2555
2556 case coding_type_internal:
2557 case coding_type_automatic:
2558 if (coding->eol_type == CODING_EOL_LF
2559 || coding->eol_type == CODING_EOL_AUTOMATIC)
2560 goto label_no_conversion;
2561 produced = encode_eol (coding, source, destination,
2562 src_bytes, dst_bytes, consumed);
2563 break;
2564
2565 case coding_type_sjis:
2566 produced = encode_coding_sjis_big5 (coding, source, destination,
2567 src_bytes, dst_bytes, consumed,
2568 1);
2569 break;
2570
2571 case coding_type_iso2022:
2572 produced = encode_coding_iso2022 (coding, source, destination,
2573 src_bytes, dst_bytes, consumed);
2574 break;
2575
2576 case coding_type_big5:
2577 produced = encode_coding_sjis_big5 (coding, source, destination,
2578 src_bytes, dst_bytes, consumed,
2579 0);
2580 break;
2581
2582 case coding_type_ccl:
2583 produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2584 src_bytes, dst_bytes, consumed);
2585 break;
2586 }
2587
2588 return produced;
2589 }
2590
2591 #define CONVERSION_BUFFER_EXTRA_ROOM 256
2592
2593 /* Return maximum size (bytes) of a buffer enough for decoding
2594 SRC_BYTES of text encoded in CODING. */
2595
2596 int
2597 decoding_buffer_size (coding, src_bytes)
2598 struct coding_system *coding;
2599 int src_bytes;
2600 {
2601 int magnification;
2602
2603 if (coding->type == coding_type_iso2022)
2604 magnification = 3;
2605 else if (coding->type == coding_type_ccl)
2606 magnification = coding->spec.ccl.decoder.buf_magnification;
2607 else
2608 magnification = 2;
2609
2610 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2611 }
2612
2613 /* Return maximum size (bytes) of a buffer enough for encoding
2614 SRC_BYTES of text to CODING. */
2615
2616 int
2617 encoding_buffer_size (coding, src_bytes)
2618 struct coding_system *coding;
2619 int src_bytes;
2620 {
2621 int magnification;
2622
2623 if (coding->type == coding_type_ccl)
2624 magnification = coding->spec.ccl.encoder.buf_magnification;
2625 else
2626 magnification = 3;
2627
2628 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2629 }
2630
2631 #ifndef MINIMUM_CONVERSION_BUFFER_SIZE
2632 #define MINIMUM_CONVERSION_BUFFER_SIZE 1024
2633 #endif
2634
2635 char *conversion_buffer;
2636 int conversion_buffer_size;
2637
2638 /* Return a pointer to a SIZE bytes of buffer to be used for encoding
2639 or decoding. Sufficient memory is allocated automatically. If we
2640 run out of memory, return NULL. */
2641
2642 char *
2643 get_conversion_buffer (size)
2644 int size;
2645 {
2646 if (size > conversion_buffer_size)
2647 {
2648 char *buf;
2649 int real_size = conversion_buffer_size * 2;
2650
2651 while (real_size < size) real_size *= 2;
2652 buf = (char *) xmalloc (real_size);
2653 xfree (conversion_buffer);
2654 conversion_buffer = buf;
2655 conversion_buffer_size = real_size;
2656 }
2657 return conversion_buffer;
2658 }
2659
2660 \f
2661 #ifdef emacs
2662 /*** 7. Emacs Lisp library functions ***/
2663
2664 DEFUN ("coding-system-vector", Fcoding_system_vector, Scoding_system_vector,
2665 1, 1, 0,
2666 "Return coding-vector of CODING-SYSTEM.\n\
2667 If CODING-SYSTEM is not a valid coding-system, return nil.")
2668 (obj)
2669 Lisp_Object obj;
2670 {
2671 while (SYMBOLP (obj) && !NILP (obj))
2672 obj = Fget (obj, Qcoding_system);
2673 return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
2674 ? Qnil : obj);
2675 }
2676
2677 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
2678 "Return t if OBJECT is nil or a coding-system.\n\
2679 See document of make-coding-system for coding-system object.")
2680 (obj)
2681 Lisp_Object obj;
2682 {
2683 return ((NILP (obj) || !NILP (Fcoding_system_vector (obj))) ? Qt : Qnil);
2684 }
2685
2686 DEFUN ("read-non-nil-coding-system",
2687 Fread_non_nil_coding_system, Sread_non_nil_coding_system, 1, 1, 0,
2688 "Read a coding-system from the minibuffer, prompting with string PROMPT.")
2689 (prompt)
2690 Lisp_Object prompt;
2691 {
2692 return Fintern (Fcompleting_read (prompt, Vobarray, Qcoding_system_vector,
2693 Qt, Qnil, Qnil),
2694 Qnil);
2695 }
2696
2697 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
2698 "Read a coding-system or nil from the minibuffer, prompting with string PROMPT.")
2699 (prompt)
2700 Lisp_Object prompt;
2701 {
2702 return Fintern (Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
2703 Qt, Qnil, Qnil),
2704 Qnil);
2705 }
2706
2707 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
2708 1, 1, 0,
2709 "Check validity of CODING-SYSTEM.\n\
2710 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
2711 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
2712 The value of property should be a vector of length 5.")
2713 (coding_system)
2714 Lisp_Object coding_system;
2715 {
2716 CHECK_SYMBOL (coding_system, 0);
2717 if (!NILP (Fcoding_system_p (coding_system)))
2718 return coding_system;
2719 while (1)
2720 Fsignal (Qcoding_system_error, coding_system);
2721 }
2722
2723 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
2724 2, 2, 0,
2725 "Detect coding-system of the text in the region between START and END.\n\
2726 Return a list of possible coding-systems ordered by priority.\n\
2727 If only ASCII characters are found, it returns `coding-system-automatic'\n\
2728 or its subsidiary coding-system according to a detected end-of-line format.")
2729 (b, e)
2730 Lisp_Object b, e;
2731 {
2732 int coding_mask, eol_type;
2733 Lisp_Object val;
2734 int beg, end;
2735
2736 validate_region (&b, &e);
2737 beg = XINT (b), end = XINT (e);
2738 if (beg < GPT && end >= GPT) move_gap (end);
2739
2740 coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
2741 eol_type = detect_eol_type (POS_ADDR (beg), end - beg);
2742
2743 if (coding_mask == CODING_CATEGORY_MASK_ANY)
2744 {
2745 val = intern ("coding-system-automatic");
2746 if (eol_type != CODING_EOL_AUTOMATIC)
2747 {
2748 Lisp_Object val2 = Fget (val, Qeol_type);
2749 if (VECTORP (val2))
2750 val = XVECTOR (val2)->contents[eol_type];
2751 }
2752 }
2753 else
2754 {
2755 Lisp_Object val2;
2756
2757 /* At first, gather possible coding-systems in VAL in a reverse
2758 order. */
2759 val = Qnil;
2760 for (val2 = Vcoding_category_list;
2761 !NILP (val2);
2762 val2 = XCONS (val2)->cdr)
2763 {
2764 int idx
2765 = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
2766 if (coding_mask & (1 << idx))
2767 val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
2768 }
2769
2770 /* Then, change the order of the list, while getting subsidiary
2771 coding-systems. */
2772 val2 = val;
2773 val = Qnil;
2774 for (; !NILP (val2); val2 = XCONS (val2)->cdr)
2775 {
2776 if (eol_type == CODING_EOL_AUTOMATIC)
2777 val = Fcons (XCONS (val2)->car, val);
2778 else
2779 {
2780 Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
2781 if (VECTORP (val3))
2782 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
2783 else
2784 val = Fcons (XCONS (val2)->car, val);
2785 }
2786 }
2787 }
2788
2789 return val;
2790 }
2791
2792 /* Scan text in the region between *BEGP and *ENDP, skip characters
2793 which we never have to encode to (iff ENCODEP is 1) or decode from
2794 coding system CODING at the head and tail, then set BEGP and ENDP
2795 to the addresses of start and end of the text we actually convert. */
2796
2797 void
2798 shrink_conversion_area (begp, endp, coding, encodep)
2799 unsigned char **begp, **endp;
2800 struct coding_system *coding;
2801 int encodep;
2802 {
2803 register unsigned char *beg_addr = *begp, *end_addr = *endp;
2804
2805 if (coding->eol_type != CODING_EOL_LF
2806 && coding->eol_type != CODING_EOL_AUTOMATIC)
2807 /* Since we anyway have to convert end-of-line format, it is not
2808 worth skipping at most 100 bytes or so. */
2809 return;
2810
2811 if (encodep) /* for encoding */
2812 {
2813 switch (coding->type)
2814 {
2815 case coding_type_no_conversion:
2816 case coding_type_internal:
2817 case coding_type_automatic:
2818 /* We need no conversion. */
2819 *begp = *endp;
2820 return;
2821 case coding_type_ccl:
2822 /* We can't skip any data. */
2823 return;
2824 default:
2825 /* We can skip all ASCII characters at the head and tail. */
2826 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
2827 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
2828 break;
2829 }
2830 }
2831 else /* for decoding */
2832 {
2833 switch (coding->type)
2834 {
2835 case coding_type_no_conversion:
2836 /* We need no conversion. */
2837 *begp = *endp;
2838 return;
2839 case coding_type_internal:
2840 if (coding->eol_type == CODING_EOL_LF)
2841 {
2842 /* We need no conversion. */
2843 *begp = *endp;
2844 return;
2845 }
2846 /* We can skip all but carriage-return. */
2847 while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
2848 while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
2849 break;
2850 case coding_type_sjis:
2851 case coding_type_big5:
2852 /* We can skip all ASCII characters at the head. */
2853 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
2854 /* We can skip all ASCII characters at the tail except for
2855 the second byte of SJIS or BIG5 code. */
2856 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
2857 if (end_addr != *endp)
2858 end_addr++;
2859 break;
2860 case coding_type_ccl:
2861 /* We can't skip any data. */
2862 return;
2863 default: /* i.e. case coding_type_iso2022: */
2864 {
2865 unsigned char c;
2866
2867 /* We can skip all ASCII characters except for a few
2868 control codes at the head. */
2869 while (beg_addr < end_addr && (c = *beg_addr) < 0x80
2870 && c != ISO_CODE_CR && c != ISO_CODE_SO
2871 && c != ISO_CODE_SI && c != ISO_CODE_ESC)
2872 beg_addr++;
2873 }
2874 break;
2875 }
2876 }
2877 *begp = beg_addr;
2878 *endp = end_addr;
2879 return;
2880 }
2881
2882 /* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
2883 text between B and E. B and E are buffer position. */
2884
2885 Lisp_Object
2886 code_convert_region (b, e, coding, encodep)
2887 Lisp_Object b, e;
2888 struct coding_system *coding;
2889 int encodep;
2890 {
2891 int beg, end, len, consumed, produced;
2892 char *buf;
2893 unsigned char *begp, *endp;
2894 int pos = PT;
2895
2896 validate_region (&b, &e);
2897 beg = XINT (b), end = XINT (e);
2898 if (beg < GPT && end >= GPT)
2899 move_gap (end);
2900
2901 if (encodep && !NILP (coding->pre_write_conversion))
2902 {
2903 /* We must call a pre-conversion function which may put a new
2904 text to be converted in a new buffer. */
2905 struct buffer *old = current_buffer, *new;
2906
2907 TEMP_SET_PT (beg);
2908 call2 (coding->pre_write_conversion, b, e);
2909 if (old != current_buffer)
2910 {
2911 /* Replace the original text by the text just generated. */
2912 len = ZV - BEGV;
2913 new = current_buffer;
2914 set_buffer_internal (old);
2915 del_range (beg, end);
2916 insert_from_buffer (new, 1, len, 0);
2917 end = beg + len;
2918 }
2919 }
2920
2921 /* We may be able to shrink the conversion region. */
2922 begp = POS_ADDR (beg); endp = begp + (end - beg);
2923 shrink_conversion_area (&begp, &endp, coding, encodep);
2924
2925 if (begp == endp)
2926 /* We need no conversion. */
2927 len = end - beg;
2928 else
2929 {
2930 beg += begp - POS_ADDR (beg);
2931 end = beg + (endp - begp);
2932
2933 if (encodep)
2934 len = encoding_buffer_size (coding, end - beg);
2935 else
2936 len = decoding_buffer_size (coding, end - beg);
2937 buf = get_conversion_buffer (len);
2938
2939 coding->last_block = 1;
2940 produced = (encodep
2941 ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
2942 &consumed)
2943 : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
2944 &consumed));
2945
2946 len = produced + (beg - XINT (b)) + (XINT (e) - end);
2947
2948 TEMP_SET_PT (beg);
2949 insert (buf, produced);
2950 del_range (PT, PT + end - beg);
2951 if (pos >= end)
2952 pos = PT + (pos - end);
2953 else if (pos > beg)
2954 pos = beg;
2955 TEMP_SET_PT (pos);
2956 }
2957
2958 if (!encodep && !NILP (coding->post_read_conversion))
2959 {
2960 /* We must call a post-conversion function which may alter
2961 the text just converted. */
2962 Lisp_Object insval;
2963
2964 beg = XINT (b);
2965 TEMP_SET_PT (beg);
2966 insval = call1 (coding->post_read_conversion, make_number (len));
2967 CHECK_NUMBER (insval, 0);
2968 len = XINT (insval);
2969 }
2970
2971 return make_number (len);
2972 }
2973
2974 Lisp_Object
2975 code_convert_string (str, coding, encodep)
2976 Lisp_Object str;
2977 struct coding_system *coding;
2978 int encodep;
2979 {
2980 int len, consumed, produced;
2981 char *buf;
2982 unsigned char *begp, *endp;
2983 int head_skip, tail_skip;
2984 struct gcpro gcpro1;
2985
2986 if (encodep && !NILP (coding->pre_write_conversion)
2987 || !encodep && !NILP (coding->post_read_conversion))
2988 {
2989 /* Since we have to call Lisp functions which assume target text
2990 is in a buffer, after setting a temporary buffer, call
2991 code_convert_region. */
2992 int count = specpdl_ptr - specpdl;
2993 int len = XSTRING (str)->size;
2994 Lisp_Object result;
2995 struct buffer *old = current_buffer;
2996
2997 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
2998 temp_output_buffer_setup (" *code-converting-work*");
2999 set_buffer_internal (XBUFFER (Vstandard_output));
3000 insert_from_string (str, 0, len, 0);
3001 code_convert_region (make_number (BEGV), make_number (ZV),
3002 coding, encodep);
3003 result = make_buffer_string (BEGV, ZV, 0);
3004 set_buffer_internal (old);
3005 return unbind_to (count, result);
3006 }
3007
3008 /* We may be able to shrink the conversion region. */
3009 begp = XSTRING (str)->data;
3010 endp = begp + XSTRING (str)->size;
3011 shrink_conversion_area (&begp, &endp, coding, encodep);
3012
3013 if (begp == endp)
3014 /* We need no conversion. */
3015 return str;
3016
3017 head_skip = begp - XSTRING (str)->data;
3018 tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3019
3020 GCPRO1 (str);
3021
3022 if (encodep)
3023 len = encoding_buffer_size (coding, endp - begp);
3024 else
3025 len = decoding_buffer_size (coding, endp - begp);
3026 buf = get_conversion_buffer (len + head_skip + tail_skip);
3027
3028 bcopy (XSTRING (str)->data, buf, head_skip);
3029 coding->last_block = 1;
3030 produced = (encodep
3031 ? encode_coding (coding, XSTRING (str)->data + head_skip,
3032 buf + head_skip, endp - begp, len, &consumed)
3033 : decode_coding (coding, XSTRING (str)->data + head_skip,
3034 buf + head_skip, endp - begp, len, &consumed));
3035 bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3036 buf + head_skip + produced,
3037 tail_skip);
3038
3039 UNGCPRO;
3040
3041 return make_string (buf, head_skip + produced + tail_skip);
3042 }
3043
3044 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
3045 3, 3, 0,
3046 "Decode the text between START and END which is encoded in CODING-SYSTEM.\n\
3047 Return length of decoded text.")
3048 (b, e, coding_system)
3049 Lisp_Object b, e, coding_system;
3050 {
3051 struct coding_system coding;
3052
3053 CHECK_NUMBER_COERCE_MARKER (b, 0);
3054 CHECK_NUMBER_COERCE_MARKER (e, 1);
3055 CHECK_SYMBOL (coding_system, 2);
3056
3057 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3058 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3059
3060 return code_convert_region (b, e, &coding, 0);
3061 }
3062
3063 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
3064 3, 3, 0,
3065 "Encode the text between START and END to CODING-SYSTEM.\n\
3066 Return length of encoded text.")
3067 (b, e, coding_system)
3068 Lisp_Object b, e, coding_system;
3069 {
3070 struct coding_system coding;
3071
3072 CHECK_NUMBER_COERCE_MARKER (b, 0);
3073 CHECK_NUMBER_COERCE_MARKER (e, 1);
3074 CHECK_SYMBOL (coding_system, 2);
3075
3076 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3077 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3078
3079 return code_convert_region (b, e, &coding, 1);
3080 }
3081
3082 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
3083 2, 2, 0,
3084 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.")
3085 (string, coding_system)
3086 Lisp_Object string, coding_system;
3087 {
3088 struct coding_system coding;
3089
3090 CHECK_STRING (string, 0);
3091 CHECK_SYMBOL (coding_system, 1);
3092
3093 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3094 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3095
3096 return code_convert_string (string, &coding, 0);
3097 }
3098
3099 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
3100 2, 2, 0,
3101 "Encode STRING to CODING-SYSTEM, and return the result.")
3102 (string, coding_system)
3103 Lisp_Object string, coding_system;
3104 {
3105 struct coding_system coding;
3106
3107 CHECK_STRING (string, 0);
3108 CHECK_SYMBOL (coding_system, 1);
3109
3110 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3111 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3112
3113 return code_convert_string (string, &coding, 1);
3114 }
3115
3116 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
3117 "Decode a JISX0208 character of SJIS coding-system-sjis.\n\
3118 CODE is the character code in SJIS.\n\
3119 Return the corresponding character.")
3120 (code)
3121 Lisp_Object code;
3122 {
3123 unsigned char c1, c2, s1, s2;
3124 Lisp_Object val;
3125
3126 CHECK_NUMBER (code, 0);
3127 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3128 DECODE_SJIS (s1, s2, c1, c2);
3129 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3130 return val;
3131 }
3132
3133 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3134 "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3135 Return the corresponding character code in SJIS.")
3136 (ch)
3137 Lisp_Object ch;
3138 {
3139 int charset;
3140 unsigned char c1, c2, s1, s2;
3141 Lisp_Object val;
3142
3143 CHECK_NUMBER (ch, 0);
3144 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3145 if (charset == charset_jisx0208)
3146 {
3147 ENCODE_SJIS (c1, c2, s1, s2);
3148 XSETFASTINT (val, ((int)s1 << 8) | s2);
3149 }
3150 else
3151 XSETFASTINT (val, 0);
3152 return val;
3153 }
3154
3155 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3156 "Decode a Big5 character CODE of BIG5 coding-system.\n\
3157 CODE is the character code in BIG5.\n\
3158 Return the corresponding character.")
3159 (code)
3160 Lisp_Object code;
3161 {
3162 int charset;
3163 unsigned char b1, b2, c1, c2;
3164 Lisp_Object val;
3165
3166 CHECK_NUMBER (code, 0);
3167 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3168 DECODE_BIG5 (b1, b2, charset, c1, c2);
3169 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3170 return val;
3171 }
3172
3173 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3174 "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3175 Return the corresponding character code in Big5.")
3176 (ch)
3177 Lisp_Object ch;
3178 {
3179 int charset;
3180 unsigned char c1, c2, b1, b2;
3181 Lisp_Object val;
3182
3183 CHECK_NUMBER (ch, 0);
3184 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3185 if (charset == charset_big5_1 || charset == charset_big5_2)
3186 {
3187 ENCODE_BIG5 (charset, c1, c2, b1, b2);
3188 XSETFASTINT (val, ((int)b1 << 8) | b2);
3189 }
3190 else
3191 XSETFASTINT (val, 0);
3192 return val;
3193 }
3194
3195 DEFUN ("set-terminal-coding-system",
3196 Fset_terminal_coding_system, Sset_terminal_coding_system, 1, 1,
3197 "zCoding-system for terminal display: ",
3198 "Set coding-system of your terminal to CODING-SYSTEM.\n\
3199 All outputs to terminal are encoded to this coding-system.")
3200 (coding_system)
3201 Lisp_Object coding_system;
3202 {
3203 CHECK_SYMBOL (coding_system, 0);
3204 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
3205 update_mode_lines++;
3206 if (!NILP (Finteractive_p ()))
3207 Fredraw_display ();
3208 return Qnil;
3209 }
3210
3211 DEFUN ("terminal-coding-system",
3212 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3213 "Return coding-system of your terminal.")
3214 ()
3215 {
3216 return terminal_coding.symbol;
3217 }
3218
3219 DEFUN ("set-keyboard-coding-system",
3220 Fset_keyboard_coding_system, Sset_keyboard_coding_system, 1, 1,
3221 "zCoding-system for keyboard input: ",
3222 "Set coding-system of what is sent from terminal keyboard to CODING-SYSTEM.\n\
3223 All inputs from terminal are decoded from this coding-system.")
3224 (coding_system)
3225 Lisp_Object coding_system;
3226 {
3227 CHECK_SYMBOL (coding_system, 0);
3228 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3229 return Qnil;
3230 }
3231
3232 DEFUN ("keyboard-coding-system",
3233 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3234 "Return coding-system of what is sent from terminal keyboard.")
3235 ()
3236 {
3237 return keyboard_coding.symbol;
3238 }
3239
3240 \f
3241 DEFUN ("find-coding-system", Ffind_coding_system, Sfind_coding_system,
3242 1, MANY, 0,
3243 "Return a cons of coding systems for I/O primitive OPERATION.\n\
3244 Remaining arguments are for OPERATION.\n\
3245 OPERATION is one of the following Emacs I/O primitives:\n\
3246 For file I/O, insert-file-contents or write-region.\n\
3247 For process I/O, call-process, call-process-region, or start-process.\n\
3248 For network I/O, open-network-stream.\n\
3249 For each OPERATION, TARGET is selected from the arguments as below:\n\
3250 For file I/O, TARGET is a file name.\n\
3251 For process I/O, TARGET is a process name.\n\
3252 For network I/O, TARGET is a service name or a port number\n\
3253 \n\
3254 The return value is a cons of coding systems for decoding and encoding\n\
3255 registered in nested alist `coding-system-alist' (which see) at a slot\n\
3256 corresponding to OPERATION and TARGET.
3257 If a function symbol is at the slot, return a result of the function call.\n\
3258 The function is called with one argument, a list of all the arguments.")
3259 (nargs, args)
3260 int nargs;
3261 Lisp_Object *args;
3262 {
3263 Lisp_Object operation, target_idx, target, val;
3264 register Lisp_Object chain;
3265
3266 if (nargs < 2)
3267 error ("Too few arguments");
3268 operation = args[0];
3269 if (!SYMBOLP (operation)
3270 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3271 error ("Invalid first arguement");
3272 if (nargs < 1 + XINT (target_idx))
3273 error ("Too few arguments for operation: %s",
3274 XSYMBOL (operation)->name->data);
3275 target = args[XINT (target_idx) + 1];
3276 if (!(STRINGP (target)
3277 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3278 error ("Invalid %dth argument", XINT (target_idx) + 1);
3279
3280 chain = Fassq (operation, Vcoding_system_alist);
3281 if (NILP (chain))
3282 return Qnil;
3283
3284 for (chain = XCONS (chain)->cdr; CONSP (chain); chain = XCONS (chain)->cdr)
3285 {
3286 Lisp_Object elt = XCONS (chain)->car;
3287
3288 if (CONSP (elt)
3289 && ((STRINGP (target)
3290 && STRINGP (XCONS (elt)->car)
3291 && fast_string_match (XCONS (elt)->car, target) >= 0)
3292 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
3293 return (CONSP (val = XCONS (elt)->cdr)
3294 ? val
3295 : ((SYMBOLP (val) && Fboundp (val)
3296 ? call2 (val, Flist (nargs, args))
3297 : Qnil)));
3298 }
3299 return Qnil;
3300 }
3301
3302 #endif /* emacs */
3303
3304 \f
3305 /*** 8. Post-amble ***/
3306
3307 init_coding_once ()
3308 {
3309 int i;
3310
3311 /* Emacs internal format specific initialize routine. */
3312 for (i = 0; i <= 0x20; i++)
3313 emacs_code_class[i] = EMACS_control_code;
3314 emacs_code_class[0x0A] = EMACS_linefeed_code;
3315 emacs_code_class[0x0D] = EMACS_carriage_return_code;
3316 for (i = 0x21 ; i < 0x7F; i++)
3317 emacs_code_class[i] = EMACS_ascii_code;
3318 emacs_code_class[0x7F] = EMACS_control_code;
3319 emacs_code_class[0x80] = EMACS_leading_code_composition;
3320 for (i = 0x81; i < 0xFF; i++)
3321 emacs_code_class[i] = EMACS_invalid_code;
3322 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3323 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3324 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3325 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3326
3327 /* ISO2022 specific initialize routine. */
3328 for (i = 0; i < 0x20; i++)
3329 iso_code_class[i] = ISO_control_code;
3330 for (i = 0x21; i < 0x7F; i++)
3331 iso_code_class[i] = ISO_graphic_plane_0;
3332 for (i = 0x80; i < 0xA0; i++)
3333 iso_code_class[i] = ISO_control_code;
3334 for (i = 0xA1; i < 0xFF; i++)
3335 iso_code_class[i] = ISO_graphic_plane_1;
3336 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3337 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3338 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3339 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3340 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3341 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3342 iso_code_class[ISO_CODE_ESC] = ISO_escape;
3343 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3344 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3345 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3346
3347 Qcoding_system = intern ("coding-system");
3348 staticpro (&Qcoding_system);
3349
3350 Qeol_type = intern ("eol-type");
3351 staticpro (&Qeol_type);
3352
3353 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3354 staticpro (&Qbuffer_file_coding_system);
3355
3356 Qpost_read_conversion = intern ("post-read-conversion");
3357 staticpro (&Qpost_read_conversion);
3358
3359 Qpre_write_conversion = intern ("pre-write-conversion");
3360 staticpro (&Qpre_write_conversion);
3361
3362 Qcoding_system_vector = intern ("coding-system-vector");
3363 staticpro (&Qcoding_system_vector);
3364
3365 Qcoding_system_p = intern ("coding-system-p");
3366 staticpro (&Qcoding_system_p);
3367
3368 Qcoding_system_error = intern ("coding-system-error");
3369 staticpro (&Qcoding_system_error);
3370
3371 Fput (Qcoding_system_error, Qerror_conditions,
3372 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3373 Fput (Qcoding_system_error, Qerror_message,
3374 build_string ("Coding-system error"));
3375
3376 Qcoding_category_index = intern ("coding-category-index");
3377 staticpro (&Qcoding_category_index);
3378
3379 {
3380 int i;
3381 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3382 {
3383 coding_category_table[i] = intern (coding_category_name[i]);
3384 staticpro (&coding_category_table[i]);
3385 Fput (coding_category_table[i], Qcoding_category_index,
3386 make_number (i));
3387 }
3388 }
3389
3390 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3391 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3392
3393 setup_coding_system (Qnil, &keyboard_coding);
3394 setup_coding_system (Qnil, &terminal_coding);
3395 }
3396
3397 #ifdef emacs
3398
3399 syms_of_coding ()
3400 {
3401 Qtarget_idx = intern ("target-idx");
3402 staticpro (&Qtarget_idx);
3403
3404 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3405 Fput (Qwrite_region, Qtarget_idx, make_number (2));
3406
3407 Qcall_process = intern ("call-process");
3408 staticpro (&Qcall_process);
3409 Fput (Qcall_process, Qtarget_idx, make_number (0));
3410
3411 Qcall_process_region = intern ("call-process-region");
3412 staticpro (&Qcall_process_region);
3413 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3414
3415 Qstart_process = intern ("start-process");
3416 staticpro (&Qstart_process);
3417 Fput (Qstart_process, Qtarget_idx, make_number (2));
3418
3419 Qopen_network_stream = intern ("open-network-stream");
3420 staticpro (&Qopen_network_stream);
3421 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3422
3423 defsubr (&Scoding_system_vector);
3424 defsubr (&Scoding_system_p);
3425 defsubr (&Sread_coding_system);
3426 defsubr (&Sread_non_nil_coding_system);
3427 defsubr (&Scheck_coding_system);
3428 defsubr (&Sdetect_coding_region);
3429 defsubr (&Sdecode_coding_region);
3430 defsubr (&Sencode_coding_region);
3431 defsubr (&Sdecode_coding_string);
3432 defsubr (&Sencode_coding_string);
3433 defsubr (&Sdecode_sjis_char);
3434 defsubr (&Sencode_sjis_char);
3435 defsubr (&Sdecode_big5_char);
3436 defsubr (&Sencode_big5_char);
3437 defsubr (&Sset_terminal_coding_system);
3438 defsubr (&Sterminal_coding_system);
3439 defsubr (&Sset_keyboard_coding_system);
3440 defsubr (&Skeyboard_coding_system);
3441 defsubr (&Sfind_coding_system);
3442
3443 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3444 "List of coding-categories (symbols) ordered by priority.");
3445 {
3446 int i;
3447
3448 Vcoding_category_list = Qnil;
3449 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3450 Vcoding_category_list
3451 = Fcons (coding_category_table[i], Vcoding_category_list);
3452 }
3453
3454 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
3455 "A variable of internal use only.\n\
3456 If the value is a coding system, it is used for decoding on read operation.\n\
3457 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3458 Vcoding_system_for_read = Qnil;
3459
3460 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
3461 "A variable of internal use only.\n\
3462 If the value is a coding system, it is used for encoding on write operation.\n\
3463 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3464 Vcoding_system_for_write = Qnil;
3465
3466 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
3467 "Coding-system used in the latest file or process I/O.");
3468 Vlast_coding_system_used = Qnil;
3469
3470 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
3471 "Nested alist to decide a coding system for a specific I/O operation.\n\
3472 The format is ((OPERATION . ((REGEXP . CODING-SYSTEMS) ...)) ...).\n\
3473
3474 OPERATION is one of the following Emacs I/O primitives:\n\
3475 For file I/O, insert-file-contents and write-region.\n\
3476 For process I/O, call-process, call-process-region, and start-process.\n\
3477 For network I/O, open-network-stream.\n\
3478 In addition, for process I/O, `process-argument' can be specified for\n\
3479 encoding arguments of the process.\n\
3480 \n\
3481 REGEXP is a regular expression matching a target of OPERATION, where\n\
3482 target is a file name for file I/O operations, a process name for\n\
3483 process I/O operations, or a service name for network I/O\n\
3484 operations. REGEXP might be a port number for network I/O operation.\n\
3485 \n\
3486 CODING-SYSTEMS is a cons of coding systems to encode and decode\n\
3487 character code on OPERATION, or a function symbol returning the cons.\n\
3488 See the documentation of `find-coding-system' for more detail.");
3489 Vcoding_system_alist = Qnil;
3490
3491 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
3492 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
3493 eol_mnemonic_unix = '.';
3494
3495 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
3496 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
3497 eol_mnemonic_dos = ':';
3498
3499 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
3500 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
3501 eol_mnemonic_mac = '\'';
3502
3503 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
3504 "Mnemonic character indicating end-of-line format is not yet decided.");
3505 eol_mnemonic_undecided = '-';
3506
3507 DEFVAR_LISP ("alternate-charset-table", &Valternate_charset_table,
3508 "Alist of charsets vs the alternate charsets.\n\
3509 While decoding, if a charset (car part of an element) is found,\n\
3510 decode it as the alternate charset (cdr part of the element).");
3511 Valternate_charset_table = Qnil;
3512
3513 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
3514 "Alist of charsets vs revision numbers.\n\
3515 While encoding, if a charset (car part of an element) is found,\n\
3516 designate it with the escape sequence identifing revision (cdr part of the element).");
3517 Vcharset_revision_alist = Qnil;
3518 }
3519
3520 #endif /* emacs */