(Vstandard_character_unification_table_for_decode):
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4
5 This file is part of GNU Emacs.
6
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
21
22 /*** TABLE OF CONTENTS ***
23
24 1. Preamble
25 2. Emacs' internal format (emacs-mule) handlers
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
28 5. End-of-line handlers
29 6. C library functions
30 7. Emacs Lisp library functions
31 8. Post-amble
32
33 */
34
35 /*** GENERAL NOTE on CODING SYSTEM ***
36
37 Coding system is an encoding mechanism of one or more character
38 sets. Here's a list of coding systems which Emacs can handle. When
39 we say "decode", it means converting some other coding system to
40 Emacs' internal format (emacs-mule), and when we say "encode",
41 it means converting the coding system emacs-mule to some other
42 coding system.
43
44 0. Emacs' internal format (emacs-mule)
45
46 Emacs itself holds a multi-lingual character in a buffer and a string
47 in a special format. Details are described in the section 2.
48
49 1. ISO2022
50
51 The most famous coding system for multiple character sets. X's
52 Compound Text, various EUCs (Extended Unix Code), and such coding
53 systems used in Internet communication as ISO-2022-JP are all
54 variants of ISO2022. Details are described in the section 3.
55
56 2. SJIS (or Shift-JIS or MS-Kanji-Code)
57
58 A coding system to encode character sets: ASCII, JISX0201, and
59 JISX0208. Widely used for PC's in Japan. Details are described in
60 the section 4.
61
62 3. BIG5
63
64 A coding system to encode character sets: ASCII and Big5. Widely
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
66 described in the section 4. In this file, when written as "BIG5"
67 (all uppercase), it means the coding system, and when written as
68 "Big5" (capitalized), it means the character set.
69
70 4. Else
71
72 If a user wants to read/write a text encoded in a coding system not
73 listed above, he can supply a decoder and an encoder for it in CCL
74 (Code Conversion Language) programs. Emacs executes the CCL program
75 while reading/writing.
76
77 Emacs represents a coding-system by a Lisp symbol that has a property
78 `coding-system'. But, before actually using the coding-system, the
79 information about it is set in a structure of type `struct
80 coding_system' for rapid processing. See the section 6 for more
81 detail.
82
83 */
84
85 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
86
87 How end-of-line of a text is encoded depends on a system. For
88 instance, Unix's format is just one byte of `line-feed' code,
89 whereas DOS's format is two bytes sequence of `carriage-return' and
90 `line-feed' codes. MacOS's format is one byte of `carriage-return'.
91
92 Since how characters in a text are encoded and how end-of-line is
93 encoded are independent, any coding system described above can take
94 any format of end-of-line. So, Emacs has information of format of
95 end-of-line in each coding-system. See the section 6 for more
96 detail.
97
98 */
99
100 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
101
102 These functions check if a text between SRC and SRC_END is encoded
103 in the coding system category XXX. Each returns an integer value in
104 which appropriate flag bits for the category XXX are set. The flag
105 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
106 template of these functions. */
107 #if 0
108 int
109 detect_coding_emacs_mule (src, src_end)
110 unsigned char *src, *src_end;
111 {
112 ...
113 }
114 #endif
115
116 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
117
118 These functions decode SRC_BYTES length text at SOURCE encoded in
119 CODING to Emacs' internal format (emacs-mule). The resulting text
120 goes to a place pointed by DESTINATION, the length of which should
121 not exceed DST_BYTES. The number of bytes actually processed is returned as
122 *CONSUMED. The return value is the length of the decoded text.
123 Below is a template of these functions. */
124 #if 0
125 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
126 struct coding_system *coding;
127 unsigned char *source, *destination;
128 int src_bytes, dst_bytes;
129 int *consumed;
130 {
131 ...
132 }
133 #endif
134
135 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
136
137 These functions encode SRC_BYTES length text at SOURCE of Emacs'
138 internal format (emacs-mule) to CODING. The resulting text goes to
139 a place pointed by DESTINATION, the length of which should not
140 exceed DST_BYTES. The number of bytes actually processed is returned as
141 *CONSUMED. The return value is the length of the encoded text.
142 Below is a template of these functions. */
143 #if 0
144 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
145 struct coding_system *coding;
146 unsigned char *source, *destination;
147 int src_bytes, dst_bytes;
148 int *consumed;
149 {
150 ...
151 }
152 #endif
153
154 /*** COMMONLY USED MACROS ***/
155
156 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
157 THREE_MORE_BYTES safely get one, two, and three bytes from the
158 source text respectively. If there are not enough bytes in the
159 source, they jump to `label_end_of_loop'. The caller should set
160 variables `src' and `src_end' to appropriate areas in advance. */
161
/* Fetch the next source byte into C1 and advance `src' past it.  When
   `src' has already reached `src_end', leave C1 untouched and jump to
   `label_end_of_loop' instead.  */
#define ONE_MORE_BYTE(c1)                       \
  do {                                          \
    if (src >= src_end)                         \
      goto label_end_of_loop;                   \
    c1 = *src++;                                \
  } while (0)
169
/* Fetch the next two source bytes into C1 and C2 (in that order) and
   advance `src' past them.  When fewer than two bytes remain before
   `src_end', consume nothing and jump to `label_end_of_loop'.

   The remaining length is computed as `src_end - src' rather than by
   comparing `src + 1' with `src_end': forming a pointer beyond one
   past the end of the source buffer is undefined behavior in C.  */
#define TWO_MORE_BYTES(c1, c2)                  \
  do {                                          \
    if (src_end - src < 2)                      \
      goto label_end_of_loop;                   \
    c1 = *src++;                                \
    c2 = *src++;                                \
  } while (0)
177
/* Fetch the next three source bytes into C1, C2 and C3 (in that order)
   and advance `src' past them.  When fewer than three bytes remain
   before `src_end', consume nothing and jump to `label_end_of_loop'.

   The remaining length is computed as `src_end - src' rather than by
   comparing `src + 2' with `src_end': forming a pointer beyond one
   past the end of the source buffer is undefined behavior in C.  */
#define THREE_MORE_BYTES(c1, c2, c3)            \
  do {                                          \
    if (src_end - src < 3)                      \
      goto label_end_of_loop;                   \
    c1 = *src++;                                \
    c2 = *src++;                                \
    c3 = *src++;                                \
  } while (0)
185
186 /* The following three macros DECODE_CHARACTER_ASCII,
187 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
188 the multi-byte form of a character of each class at the place
189 pointed by `dst'. The caller should set the variable `dst' to
190 point to an appropriate area and the variable `coding' to point to
191 the coding-system of the currently decoding text in advance. */
192
/* Store the ASCII character C at `dst' and advance `dst'.  While
   COMPOSING_P (coding->composing) holds, C is written as the two
   bytes 0xA0 and C | 0x80; otherwise it is written as the single
   byte C.  */

#define DECODE_CHARACTER_ASCII(c)               \
  do {                                          \
    if (!COMPOSING_P (coding->composing))       \
      *dst++ = (c);                             \
    else                                        \
      {                                         \
        *dst++ = 0xA0;                          \
        *dst++ = (c) | 0x80;                    \
      }                                         \
  } while (0)
202
/* Store at `dst' the multi-byte form of the one-dimensional character
   whose charset is CHARSET and whose position-code is C, advancing
   `dst'.  The base leading-code is written first (raised by 0x20
   while COMPOSING_P (coding->composing) holds), then the extended
   leading-code when it is non-zero, and finally C | 0x80.  */

#define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
  do {                                                                  \
    unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
    if (COMPOSING_P (coding->composing))                                \
      leading_code += 0x20;                                             \
    *dst++ = leading_code;                                              \
    leading_code = CHARSET_LEADING_CODE_EXT (charset);                  \
    if (leading_code != 0)                                              \
      *dst++ = leading_code;                                            \
    *dst++ = (c) | 0x80;                                                \
  } while (0)
217
/* Store at `dst' the multi-byte form of the two-dimensional character
   whose charset is CHARSET and whose position-codes are C1 and C2,
   advancing `dst'.  Everything up to and including C1 is produced by
   DECODE_CHARACTER_DIMENSION1; C2 | 0x80 is appended after it.  */

#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
  do {                                                  \
    DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
    *dst = (c2) | 0x80;                                 \
    dst++;                                              \
  } while (0)
226
227 \f
228 /*** 1. Preamble ***/
229
230 #include <stdio.h>
231
232 #ifdef emacs
233
234 #include <config.h>
235 #include "lisp.h"
236 #include "buffer.h"
237 #include "charset.h"
238 #include "ccl.h"
239 #include "coding.h"
240 #include "window.h"
241
242 #else /* not emacs */
243
244 #include "mulelib.h"
245
246 #endif /* not emacs */
247
248 Lisp_Object Qcoding_system, Qeol_type;
249 Lisp_Object Qbuffer_file_coding_system;
250 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
251
252 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
253 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
254 Lisp_Object Qstart_process, Qopen_network_stream;
255 Lisp_Object Qtarget_idx;
256
257 /* Mnemonic character of each format of end-of-line. */
258 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
259 /* Mnemonic character to indicate format of end-of-line is not yet
260 decided. */
261 int eol_mnemonic_undecided;
262
263 #ifdef emacs
264
265 Lisp_Object Qcoding_system_spec, Qcoding_system_p, Qcoding_system_error;
266
267 /* Coding-systems are handed between Emacs Lisp programs and C internal
268 routines by the following three variables. */
269 /* Coding-system for reading files and receiving data from process. */
270 Lisp_Object Vcoding_system_for_read;
271 /* Coding-system for writing files and sending data to process. */
272 Lisp_Object Vcoding_system_for_write;
273 /* Coding-system actually used in the latest I/O. */
274 Lisp_Object Vlast_coding_system_used;
275
276 /* Coding-system of what terminal accept for displaying. */
277 struct coding_system terminal_coding;
278
279 /* Coding-system of what is sent from terminal keyboard. */
280 struct coding_system keyboard_coding;
281
282 Lisp_Object Vfile_coding_system_alist;
283 Lisp_Object Vprocess_coding_system_alist;
284 Lisp_Object Vnetwork_coding_system_alist;
285
286 #endif /* emacs */
287
288 Lisp_Object Qcoding_category_index;
289
290 /* List of symbols `coding-category-xxx' ordered by priority. */
291 Lisp_Object Vcoding_category_list;
292
293 /* Table of coding-systems currently assigned to each coding-category. */
294 Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
295
296 /* Table of names of symbol for each coding-category. */
297 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
298 "coding-category-emacs-mule",
299 "coding-category-sjis",
300 "coding-category-iso-7",
301 "coding-category-iso-8-1",
302 "coding-category-iso-8-2",
303 "coding-category-iso-else",
304 "coding-category-big5",
305 "coding-category-binary"
306 };
307
308 /* Flag to tell if we look up unification table on character code
309 conversion. */
310 Lisp_Object Venable_character_unification;
311 /* Standard unification table to look up on decoding (reading). */
312 Lisp_Object Vstandard_character_unification_table_for_decode;
313 /* Standard unification table to look up on encoding (writing). */
314 Lisp_Object Vstandard_character_unification_table_for_encode;
315
316 Lisp_Object Qcharacter_unification_table;
317 Lisp_Object Qcharacter_unification_table_for_decode;
318 Lisp_Object Qcharacter_unification_table_for_encode;
319
320 /* Alist of charsets vs revision number. */
321 Lisp_Object Vcharset_revision_alist;
322
323 /* Default coding systems used for process I/O. */
324 Lisp_Object Vdefault_process_coding_system;
325
326 \f
327 /*** 2. Emacs internal format (emacs-mule) handlers ***/
328
329 /* Emacs' internal format for encoding multiple character sets is a
330 kind of multi-byte encoding, i.e. encoding a character by a sequence
331 of one-byte codes of variable length. ASCII characters and control
332 characters (e.g. `tab', `newline') are represented by one-byte as
333 is. It takes the range 0x00 through 0x7F. The other characters
334 are represented by a sequence of `base leading-code', optional
335 `extended leading-code', and one or two `position-code's. Length
336 of the sequence is decided by the base leading-code. Leading-code
337 takes the range 0x80 through 0x9F, whereas extended leading-code
338 and position-code take the range 0xA0 through 0xFF. See the
339 document of `charset.h' for more detail about leading-code and
340 position-code.
341
342 There's one exception in this rule. Special leading-code
343 `leading-code-composition' denotes that the following several
344 characters should be composed into one character. Leading-codes of
345 components (except for ASCII) are added 0x20. An ASCII character
346 component is represented by a 2-byte sequence of `0xA0' and
347 `ASCII-code + 0x80'. See also the document in `charset.h' for the
348 detail of composite character. Hence, we can summarize the code
349 range as follows:
350
351 --- CODE RANGE of Emacs' internal format ---
352 (character set) (range)
353 ASCII 0x00 .. 0x7F
354 ELSE (1st byte) 0x80 .. 0x9F
355 (rest bytes) 0xA0 .. 0xFF
356 ---------------------------------------------
357
358 */
359
360 enum emacs_code_class_type emacs_code_class[256];
361
362 /* Go to the next statement only if *SRC is accessible and the code is
363 greater than 0xA0. */
364 #define CHECK_CODE_RANGE_A0_FF \
365 do { \
366 if (src >= src_end) \
367 goto label_end_of_switch; \
368 else if (*src++ < 0xA0) \
369 return 0; \
370 } while (0)
371
372 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
373 Check if a text is encoded in Emacs' internal format. If it is,
374 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
375
376 int
377 detect_coding_emacs_mule (src, src_end)
378 unsigned char *src, *src_end;
379 {
380 unsigned char c;
381 int composing = 0;
382
383 while (src < src_end)
384 {
385 c = *src++;
386
387 if (composing)
388 {
389 if (c < 0xA0)
390 composing = 0;
391 else
392 c -= 0x20;
393 }
394
395 switch (emacs_code_class[c])
396 {
397 case EMACS_ascii_code:
398 case EMACS_linefeed_code:
399 break;
400
401 case EMACS_control_code:
402 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
403 return 0;
404 break;
405
406 case EMACS_invalid_code:
407 return 0;
408
409 case EMACS_leading_code_composition: /* c == 0x80 */
410 if (composing)
411 CHECK_CODE_RANGE_A0_FF;
412 else
413 composing = 1;
414 break;
415
416 case EMACS_leading_code_4:
417 CHECK_CODE_RANGE_A0_FF;
418 /* fall down to check it two more times ... */
419
420 case EMACS_leading_code_3:
421 CHECK_CODE_RANGE_A0_FF;
422 /* fall down to check it one more time ... */
423
424 case EMACS_leading_code_2:
425 CHECK_CODE_RANGE_A0_FF;
426 break;
427
428 default:
429 label_end_of_switch:
430 break;
431 }
432 }
433 return CODING_CATEGORY_MASK_EMACS_MULE;
434 }
435
436 \f
437 /*** 3. ISO2022 handlers ***/
438
439 /* The following note describes the coding system ISO2022 briefly.
440 Since the intention of this note is to help understanding of the
441 programs in this file, some parts are NOT ACCURATE or OVERLY
442 SIMPLIFIED. For the thorough understanding, please refer to the
443 original document of ISO2022.
444
445 ISO2022 provides many mechanisms to encode several character sets
446 in 7-bit and 8-bit environments. If one chooses a 7-bit environment,
447 all text is encoded by codes of less than 128. This may make the
448 encoded text a little bit longer, but the text get more stability
449 to pass through several gateways (some of them split MSB off).
450
451 There are two kinds of character set: control character set and
452 graphic character set. The former contains control characters such
453 as `newline' and `escape' to provide control functions (control
454 functions are provided also by escape sequence). The latter
455 contains graphic characters such as 'A' and '-'. Emacs recognizes
456 two control character sets and many graphic character sets.
457
458 Graphic character sets are classified into one of the following
459 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
460 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
461 bytes (DIMENSION) and the number of characters in one dimension
462 (CHARS) of the set. In addition, each character set is assigned an
463 identification tag (called "final character" and denoted as <F>
464 here after) which is unique in each class. <F> of each character
465 set is decided by ECMA(*) when it is registered in ISO. Code range
466 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
467
468 Note (*): ECMA = European Computer Manufacturers Association
469
470 Here are examples of graphic character set [NAME(<F>)]:
471 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
472 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
473 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
474 o DIMENSION2_CHARS96 -- none for the moment
475
476 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
477 C0 [0x00..0x1F] -- control character plane 0
478 GL [0x20..0x7F] -- graphic character plane 0
479 C1 [0x80..0x9F] -- control character plane 1
480 GR [0xA0..0xFF] -- graphic character plane 1
481
482 A control character set is directly designated and invoked to C0 or
483 C1 by an escape sequence. The most common case is that ISO646's
484 control character set is designated/invoked to C0 and ISO6429's
485 control character set is designated/invoked to C1, and usually
486 these designations/invocations are omitted in a coded text. With
487 7-bit environment, only C0 can be used, and a control character for
488 C1 is encoded by an appropriate escape sequence to fit in the
489 environment. All control characters for C1 have corresponding
490 escape sequences defined for this purpose.
491
492 A graphic character set is at first designated to one of four
493 graphic registers (G0 through G3), then these graphic registers are
494 invoked to GL or GR. These designations and invocations can be
495 done independently. The most common case is that G0 is invoked to
496 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
497 these invocations and designations are omitted in a coded text.
498 With 7-bit environment, only GL can be used.
499
500 When a graphic character set of CHARS94 is invoked to GL, code 0x20
501 and 0x7F of GL area work as control characters SPACE and DEL
502 respectively, and code 0xA0 and 0xFF of GR area should not be used.
503
504 There are two ways of invocation: locking-shift and single-shift.
505 With locking-shift, the invocation lasts until the next different
506 invocation, whereas with single-shift, the invocation works only
507 for the following character and doesn't affect locking-shift.
508 Invocations are done by the following control characters or escape
509 sequences.
510
511 ----------------------------------------------------------------------
512 function control char escape sequence description
513 ----------------------------------------------------------------------
514 SI (shift-in) 0x0F none invoke G0 to GL
515 SO (shift-out) 0x0E none invoke G1 to GL
516 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
517 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
518 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
519 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
520 ----------------------------------------------------------------------
521 The first four are for locking-shift. Control characters for these
522 functions are defined by macros ISO_CODE_XXX in `coding.h'.
523
524 Designations are done by the following escape sequences.
525 ----------------------------------------------------------------------
526 escape sequence description
527 ----------------------------------------------------------------------
528 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
529 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
530 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
531 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
532 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
533 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
534 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
535 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
536 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
537 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
538 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
539 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
540 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
541 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
542 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
543 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
544 ----------------------------------------------------------------------
545
546 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
547 of dimension 1, chars 94, and final character <F>, and etc.
548
549 Note (*): Although these designations are not allowed in ISO2022,
550 Emacs accepts them on decoding, and produces them on encoding
551 CHARS96 character set in a coding system which is characterized as
552 7-bit environment, non-locking-shift, and non-single-shift.
553
554 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
555 '(' can be omitted. We call this as "short-form" here after.
556
557 Now you may notice that there are a lot of ways for encoding the
558 same multilingual text in ISO2022. Actually, there exist many
559 coding systems such as Compound Text (used in X's inter-client
560 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
561 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
562 localized platforms), and all of these are variants of ISO2022.
563
564 In addition to the above, Emacs handles two more kinds of escape
565 sequences: ISO6429's direction specification and Emacs' private
566 sequence for specifying character composition.
567
568 ISO6429's direction specification takes the following format:
569 o CSI ']' -- end of the current direction
570 o CSI '0' ']' -- end of the current direction
571 o CSI '1' ']' -- start of left-to-right text
572 o CSI '2' ']' -- start of right-to-left text
573 The control character CSI (0x9B: control sequence introducer) is
574 abbreviated to the escape sequence ESC '[' in 7-bit environment.
575
576 Character composition specification takes the following format:
577 o ESC '0' -- start character composition
578 o ESC '1' -- end character composition
579 Since these are not standard escape sequences of any ISO, the use
580 of them for these meaning is restricted to Emacs only. */
581
582 enum iso_code_class_type iso_code_class[256];
583
584 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
585 Check if a text is encoded in ISO2022. If it is, returns an
586 integer in which appropriate flag bits any of:
587 CODING_CATEGORY_MASK_ISO_7
588 CODING_CATEGORY_MASK_ISO_8_1
589 CODING_CATEGORY_MASK_ISO_8_2
590 CODING_CATEGORY_MASK_ISO_ELSE
591 are set. If a code which should never appear in ISO2022 is found,
592 returns 0. */
593
594 int
595 detect_coding_iso2022 (src, src_end)
596 unsigned char *src, *src_end;
597 {
598 int mask = (CODING_CATEGORY_MASK_ISO_7
599 | CODING_CATEGORY_MASK_ISO_8_1
600 | CODING_CATEGORY_MASK_ISO_8_2
601 | CODING_CATEGORY_MASK_ISO_ELSE);
602 int g1 = 0; /* 1 iff designating to G1. */
603 int c, i;
604
605 while (src < src_end)
606 {
607 c = *src++;
608 switch (c)
609 {
610 case ISO_CODE_ESC:
611 if (src >= src_end)
612 break;
613 c = *src++;
614 if (src < src_end
615 && ((c >= '(' && c <= '/')
616 || c == '$' && ((*src >= '(' && *src <= '/')
617 || (*src >= '@' && *src <= 'B'))))
618 {
619 /* Valid designation sequence. */
620 if (c == ')' || (c == '$' && *src == ')'))
621 {
622 g1 = 1;
623 mask &= ~CODING_CATEGORY_MASK_ISO_7;
624 }
625 src++;
626 break;
627 }
628 else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
629 return CODING_CATEGORY_MASK_ISO_ELSE;
630 break;
631
632 case ISO_CODE_SO:
633 if (g1)
634 return CODING_CATEGORY_MASK_ISO_ELSE;
635 break;
636
637 case ISO_CODE_CSI:
638 case ISO_CODE_SS2:
639 case ISO_CODE_SS3:
640 mask &= ~CODING_CATEGORY_MASK_ISO_7;
641 break;
642
643 default:
644 if (c < 0x80)
645 break;
646 else if (c < 0xA0)
647 return 0;
648 else
649 {
650 int count = 1;
651
652 mask &= ~CODING_CATEGORY_MASK_ISO_7;
653 while (src < src_end && *src >= 0xA0)
654 count++, src++;
655 if (count & 1 && src < src_end)
656 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
657 }
658 break;
659 }
660 }
661
662 return mask;
663 }
664
/* Decode the character whose charset is CHARSET and whose first
   position code is C1, writing the result at `dst'.  For a charset of
   dimension 2 the second position code is read from `src' into `c2'
   (jumping to `label_end_of_loop' if no byte is left).  A negative
   CHARSET means the text is ill formed; C1 is then emitted as is.

   At the head of a composition, LEADING_CODE_COMPOSITION is emitted
   first, and `coding->composing' is advanced by 2.  When
   `unification_table' is non-nil and unify_char yields a
   non-negative character, that character replaces the original one.

   The expansion uses `coding', `dst', `src', `src_end', `c2' and
   `unification_table' from the surrounding scope.  */

#define DECODE_ISO_CHARACTER(charset, c1)                               \
  do {                                                                  \
    int charset_alt = (charset);                                        \
    int c_alt;                                                          \
                                                                        \
    if (COMPOSING_HEAD_P (coding->composing))                           \
      {                                                                 \
        *dst++ = LEADING_CODE_COMPOSITION;                              \
        if (COMPOSING_WITH_RULE_P (coding->composing))                  \
          /* 0xFF tells that composition rules are embedded.  */       \
          *dst++ = 0xFF;                                                \
        coding->composing += 2;                                         \
      }                                                                 \
    if ((charset) >= 0)                                                 \
      {                                                                 \
        if (CHARSET_DIMENSION (charset) == 2)                           \
          {                                                             \
            ONE_MORE_BYTE (c2);                                         \
          }                                                             \
        if (!NILP (unification_table))                                  \
          {                                                             \
            c_alt = unify_char (unification_table,                      \
                                -1, (charset), c1, c2);                 \
            if (c_alt >= 0)                                             \
              SPLIT_CHAR (c_alt, charset_alt, c1, c2);                  \
          }                                                             \
      }                                                                 \
    if (charset_alt < 0 || charset_alt == CHARSET_ASCII)                \
      {                                                                 \
        DECODE_CHARACTER_ASCII (c1);                                    \
      }                                                                 \
    else if (CHARSET_DIMENSION (charset_alt) != 1)                      \
      {                                                                 \
        DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);              \
      }                                                                 \
    else                                                                \
      {                                                                 \
        DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                  \
      }                                                                 \
    if (COMPOSING_WITH_RULE_P (coding->composing))                      \
      /* The next thing to come is a composition rule.  */             \
      coding->composing = COMPOSING_WITH_RULE_RULE;                     \
  } while (0)
701
/* Record in `coding' that graphic register REG now designates the
   charset found by ISO_CHARSET_TABLE (DIMENSION, CHARS, FINAL_CHAR).
   Nothing is recorded when that lookup yields a negative value.  When
   `coding->direction' is 1 and the charset has a non-negative reverse
   charset, the reverse charset is recorded instead.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)           \
  do {                                                                  \
    int designated = ISO_CHARSET_TABLE (dimension, chars, final_char);  \
                                                                        \
    if (designated < 0)                                                 \
      break;                                                            \
    if (coding->direction == 1)                                         \
      {                                                                 \
        int reversed = CHARSET_REVERSE_CHARSET (designated);            \
        if (reversed >= 0)                                              \
          designated = reversed;                                        \
      }                                                                 \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = designated;             \
  } while (0)
714
715 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
716
717 int
718 decode_coding_iso2022 (coding, source, destination,
719 src_bytes, dst_bytes, consumed)
720 struct coding_system *coding;
721 unsigned char *source, *destination;
722 int src_bytes, dst_bytes;
723 int *consumed;
724 {
725 unsigned char *src = source;
726 unsigned char *src_end = source + src_bytes;
727 unsigned char *dst = destination;
728 unsigned char *dst_end = destination + dst_bytes;
729 /* Since the maximum bytes produced by each loop is 7, we subtract 6
730 from DST_END to assure that overflow checking is necessary only
731 at the head of loop. */
732 unsigned char *adjusted_dst_end = dst_end - 6;
733 int charset;
734 /* Charsets invoked to graphic plane 0 and 1 respectively. */
735 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
736 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
737 Lisp_Object unification_table
738 = coding->character_unification_table_for_decode;
739
740 if (!NILP (Venable_character_unification) && NILP (unification_table))
741 unification_table = Vstandard_character_unification_table_for_decode;
742
743 while (src < src_end && dst < adjusted_dst_end)
744 {
745 /* SRC_BASE remembers the start position in source in each loop.
746 The loop will be exited when there's not enough source text
747 to analyze long escape sequence or 2-byte code (within macros
748 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
749 to SRC_BASE before exiting. */
750 unsigned char *src_base = src;
751 int c1 = *src++, c2;
752
753 switch (iso_code_class [c1])
754 {
755 case ISO_0x20_or_0x7F:
756 if (!coding->composing
757 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
758 {
759 /* This is SPACE or DEL. */
760 *dst++ = c1;
761 break;
762 }
763 /* This is a graphic character, we fall down ... */
764
765 case ISO_graphic_plane_0:
766 if (coding->composing == COMPOSING_WITH_RULE_RULE)
767 {
768 /* This is a composition rule. */
769 *dst++ = c1 | 0x80;
770 coding->composing = COMPOSING_WITH_RULE_TAIL;
771 }
772 else
773 DECODE_ISO_CHARACTER (charset0, c1);
774 break;
775
776 case ISO_0xA0_or_0xFF:
777 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
778 {
779 /* Invalid code. */
780 *dst++ = c1;
781 break;
782 }
783 /* This is a graphic character, we fall down ... */
784
785 case ISO_graphic_plane_1:
786 DECODE_ISO_CHARACTER (charset1, c1);
787 break;
788
789 case ISO_control_code:
790 /* All ISO2022 control characters in this class have the
791 same representation in Emacs internal format. */
792 *dst++ = c1;
793 break;
794
795 case ISO_carriage_return:
796 if (coding->eol_type == CODING_EOL_CR)
797 {
798 *dst++ = '\n';
799 }
800 else if (coding->eol_type == CODING_EOL_CRLF)
801 {
802 ONE_MORE_BYTE (c1);
803 if (c1 == ISO_CODE_LF)
804 *dst++ = '\n';
805 else
806 {
807 src--;
808 *dst++ = c1;
809 }
810 }
811 else
812 {
813 *dst++ = c1;
814 }
815 break;
816
817 case ISO_shift_out:
818 if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
819 goto label_invalid_escape_sequence;
820 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
821 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
822 break;
823
824 case ISO_shift_in:
825 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
826 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
827 break;
828
829 case ISO_single_shift_2_7:
830 case ISO_single_shift_2:
831 /* SS2 is handled as an escape sequence of ESC 'N' */
832 c1 = 'N';
833 goto label_escape_sequence;
834
835 case ISO_single_shift_3:
836 /* SS3 is handled as an escape sequence of ESC 'O' */
837 c1 = 'O';
838 goto label_escape_sequence;
839
840 case ISO_control_sequence_introducer:
841 /* CSI is handled as an escape sequence of ESC '[' ... */
842 c1 = '[';
843 goto label_escape_sequence;
844
845 case ISO_escape:
846 ONE_MORE_BYTE (c1);
847 label_escape_sequence:
848 /* Escape sequences handled by Emacs are invocation,
849 designation, direction specification, and character
850 composition specification. */
851 switch (c1)
852 {
853 case '&': /* revision of following character set */
854 ONE_MORE_BYTE (c1);
855 if (!(c1 >= '@' && c1 <= '~'))
856 goto label_invalid_escape_sequence;
857 ONE_MORE_BYTE (c1);
858 if (c1 != ISO_CODE_ESC)
859 goto label_invalid_escape_sequence;
860 ONE_MORE_BYTE (c1);
861 goto label_escape_sequence;
862
863 case '$': /* designation of 2-byte character set */
864 ONE_MORE_BYTE (c1);
865 if (c1 >= '@' && c1 <= 'B')
866 { /* designation of JISX0208.1978, GB2312.1980,
867 or JISX0208.1980 */
868 DECODE_DESIGNATION (0, 2, 94, c1);
869 }
870 else if (c1 >= 0x28 && c1 <= 0x2B)
871 { /* designation of DIMENSION2_CHARS94 character set */
872 ONE_MORE_BYTE (c2);
873 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
874 }
875 else if (c1 >= 0x2C && c1 <= 0x2F)
876 { /* designation of DIMENSION2_CHARS96 character set */
877 ONE_MORE_BYTE (c2);
878 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
879 }
880 else
881 goto label_invalid_escape_sequence;
882 break;
883
884 case 'n': /* invocation of locking-shift-2 */
885 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
886 goto label_invalid_escape_sequence;
887 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
888 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
889 break;
890
891 case 'o': /* invocation of locking-shift-3 */
892 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
893 goto label_invalid_escape_sequence;
894 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
895 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
896 break;
897
898 case 'N': /* invocation of single-shift-2 */
899 if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
900 goto label_invalid_escape_sequence;
901 ONE_MORE_BYTE (c1);
902 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
903 DECODE_ISO_CHARACTER (charset, c1);
904 break;
905
906 case 'O': /* invocation of single-shift-3 */
907 if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
908 goto label_invalid_escape_sequence;
909 ONE_MORE_BYTE (c1);
910 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
911 DECODE_ISO_CHARACTER (charset, c1);
912 break;
913
914 case '0': /* start composing without embeded rules */
915 coding->composing = COMPOSING_NO_RULE_HEAD;
916 break;
917
918 case '1': /* end composing */
919 coding->composing = COMPOSING_NO;
920 break;
921
922 case '2': /* start composing with embeded rules */
923 coding->composing = COMPOSING_WITH_RULE_HEAD;
924 break;
925
926 case '[': /* specification of direction */
927 /* For the moment, nested direction is not supported.
928 So, the value of `coding->direction' is 0 or 1: 0
929 means left-to-right, 1 means right-to-left. */
930 ONE_MORE_BYTE (c1);
931 switch (c1)
932 {
933 case ']': /* end of the current direction */
934 coding->direction = 0;
935
936 case '0': /* end of the current direction */
937 case '1': /* start of left-to-right direction */
938 ONE_MORE_BYTE (c1);
939 if (c1 == ']')
940 coding->direction = 0;
941 else
942 goto label_invalid_escape_sequence;
943 break;
944
945 case '2': /* start of right-to-left direction */
946 ONE_MORE_BYTE (c1);
947 if (c1 == ']')
948 coding->direction= 1;
949 else
950 goto label_invalid_escape_sequence;
951 break;
952
953 default:
954 goto label_invalid_escape_sequence;
955 }
956 break;
957
958 default:
959 if (c1 >= 0x28 && c1 <= 0x2B)
960 { /* designation of DIMENSION1_CHARS94 character set */
961 ONE_MORE_BYTE (c2);
962 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
963 }
964 else if (c1 >= 0x2C && c1 <= 0x2F)
965 { /* designation of DIMENSION1_CHARS96 character set */
966 ONE_MORE_BYTE (c2);
967 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
968 }
969 else
970 {
971 goto label_invalid_escape_sequence;
972 }
973 }
974 /* We must update these variables now. */
975 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
976 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
977 break;
978
979 label_invalid_escape_sequence:
980 {
981 int length = src - src_base;
982
983 bcopy (src_base, dst, length);
984 dst += length;
985 }
986 }
987 continue;
988
989 label_end_of_loop:
990 coding->carryover_size = src - src_base;
991 bcopy (src_base, coding->carryover, coding->carryover_size);
992 src = src_base;
993 break;
994 }
995
996 /* If this is the last block of the text to be decoded, we had
997 better just flush out all remaining codes in the text although
998 they are not valid characters. */
999 if (coding->last_block)
1000 {
1001 bcopy (src, dst, src_end - src);
1002 dst += (src_end - src);
1003 src = src_end;
1004 }
1005 *consumed = src - source;
1006 return dst - destination;
1007 }
1008
1009 /* ISO2022 encoding stuff. */
1010
1011 /*
1012 It is not enough to say just "ISO2022" on encoding, but we have to
1013 specify more details. In Emacs, each coding-system of ISO2022
1014 variant has the following specifications:
1015 1. Initial designation to G0 thru G3.
1016 2. Allows short-form designation?
1017 3. ASCII should be designated to G0 before control characters?
1018 4. ASCII should be designated to G0 at end of line?
1019 5. 7-bit environment or 8-bit environment?
1020 6. Use locking-shift?
1021 7. Use Single-shift?
1022 And the following two are only for Japanese:
1023 8. Use ASCII in place of JIS0201-1976-Roman?
1024 9. Use JISX0208-1983 in place of JISX0208-1978?
1025 These specifications are encoded in `coding->flags' as flag bits
1026 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1027 detail.
1028 */
1029
/* Emit, at DST, the ISO2022 escape sequence that designates CHARSET
   to graphic register REG, and record the designation in CODING.

   If CHARSET has an entry in `Vcharset_revision_alist', the sequence
   is preceded by the revision announcement ESC '&' <revision>.  The
   short form (ESC '$' <final-char> with no intermediate character) is
   produced only for a DIMENSION2 CHARS94 charset whose <final-char>
   is '@', 'A', or 'B', designated to register 0, and only when CODING
   has CODING_FLAG_ISO_SHORT_FORM set.

   The macro expects a variable `dst' in the scope of the caller and
   advances it past the bytes produced.  */

#define ENCODE_DESIGNATION(charset, reg, coding) \
  do { \
    unsigned char designated_final = CHARSET_ISO_FINAL_CHAR (charset); \
    char *g94_intermediates = "()*+"; \
    char *g96_intermediates = ",-./"; \
    Lisp_Object revision_entry \
      = Fassq (make_number (charset), Vcharset_revision_alist); \
    \
    /* Announce the revision first, if one is registered.  */ \
    if (! NILP (revision_entry)) \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = '&'; \
        *dst++ = XINT (XCONS (revision_entry)->cdr) + '@'; \
      } \
    *dst++ = ISO_CODE_ESC; \
    if (CHARSET_DIMENSION (charset) == 1) \
      *dst++ = (unsigned char) (CHARSET_CHARS (charset) == 94 \
                                ? g94_intermediates[reg] \
                                : g96_intermediates[reg]); \
    else \
      { \
        *dst++ = '$'; \
        if (CHARSET_CHARS (charset) != 94) \
          *dst++ = (unsigned char) (g96_intermediates[reg]); \
        /* The intermediate character is omitted only in short form.  */ \
        else if (reg != 0 \
                 || designated_final < '@' \
                 || designated_final > 'B' \
                 || ! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)) \
          *dst++ = (unsigned char) (g94_intermediates[reg]); \
      } \
    *dst++ = designated_final; \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
  } while (0)
1072
/* The following two macros produce the code for an ISO2022
   single-shift function (single-shift-2 and single-shift-3): the
   escape sequence ESC 'N' / ESC 'O' when CODING_FLAG_ISO_SEVEN_BITS
   is set in `coding->flags', otherwise the single control character
   ISO_CODE_SS2 / ISO_CODE_SS3.  Both mark CODING as being in the
   single-shifting state.  They expect `coding' and `dst' in the scope
   of the caller.  */

#define ENCODE_SINGLE_SHIFT_2 \
  do { \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      *dst++ = ISO_CODE_SS2; \
    else \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = 'N'; \
      } \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3 \
  do { \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      *dst++ = ISO_CODE_SS3; \
    else \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = 'O'; \
      } \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
  } while (0)
1094
/* The following four macros produce the code for an ISO2022
   locking-shift function and record in CODING which graphic register
   is now invoked to graphic plane 0:

     ENCODE_SHIFT_IN          ISO_CODE_SI   register 0
     ENCODE_SHIFT_OUT         ISO_CODE_SO   register 1
     ENCODE_LOCKING_SHIFT_2   ESC 'n'       register 2
     ENCODE_LOCKING_SHIFT_3   ESC 'o'       register 3

   They expect `coding' and `dst' in the scope of the caller.  */

#define ENCODE_SHIFT_IN \
  do { \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
    *dst++ = ISO_CODE_SI; \
  } while (0)

#define ENCODE_SHIFT_OUT \
  do { \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
    *dst++ = ISO_CODE_SO; \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2 \
  do { \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
    *dst++ = ISO_CODE_ESC; \
    *dst++ = 'n'; \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3 \
  do { \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
    *dst++ = ISO_CODE_ESC; \
    *dst++ = 'o'; \
  } while (0)
1122
/* Produce the code for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.

   The loop body is tried until a byte has been produced:
   - right after a single-shift, the byte is written with the high bit
     cleared (7-bit environment) or set (otherwise), and the
     single-shifting state is cleared;
   - if CHARSET is invoked to graphic plane 0, the byte is written
     with the high bit cleared;
   - if CHARSET is invoked to graphic plane 1, the byte is written
     with the high bit set;
   - otherwise encode_invocation_designation is called to produce the
     needed designation and/or invocation, and the loop repeats.

   Expects `coding' and `dst' in the scope of the caller.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
  do { \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
      { \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
                  ? c1 & 0x7F \
                  : c1 | 0x80); \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
        break; \
      } \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
      { \
        *dst++ = c1 & 0x7F; \
        break; \
      } \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
      { \
        *dst++ = c1 | 0x80; \
        break; \
      } \
    /* CHARSET is not yet invoked to any graphic plane: invoke it \
       (designating it to a graphic register first if needed), then \
       go around again to produce the character itself.  */ \
    dst = encode_invocation_designation (charset, coding, dst); \
  } while (1)
1156
/* Produce the codes for a DIMENSION2 character whose character set is
   CHARSET and whose position-codes are C1 and C2.

   The loop body is tried until two bytes have been produced:
   - right after a single-shift, both bytes are written with the high
     bit cleared (7-bit environment) or set (otherwise), and the
     single-shifting state is cleared;
   - if CHARSET is invoked to graphic plane 0, both bytes are written
     with the high bit cleared;
   - if CHARSET is invoked to graphic plane 1, both bytes are written
     with the high bit set;
   - otherwise encode_invocation_designation is called to produce the
     needed designation and/or invocation, and the loop repeats.

   Expects `coding' and `dst' in the scope of the caller.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
  do { \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
      { \
        if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
          { \
            *dst++ = c1 | 0x80; \
            *dst++ = c2 | 0x80; \
          } \
        else \
          { \
            *dst++ = c1 & 0x7F; \
            *dst++ = c2 & 0x7F; \
          } \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
        break; \
      } \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
      { \
        *dst++ = c1 & 0x7F; \
        *dst++ = c2 & 0x7F; \
        break; \
      } \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
      { \
        *dst++ = c1 | 0x80; \
        *dst++ = c2 | 0x80; \
        break; \
      } \
    /* CHARSET is not yet invoked to any graphic plane: invoke it \
       (designating it to a graphic register first if needed), then \
       go around again to produce the character itself.  */ \
    dst = encode_invocation_designation (charset, coding, dst); \
  } while (1)
1189
/* Produce the codes for one character whose character set is CHARSET
   and whose position-codes are C1 and C2.

   If `unification_table' is non-nil and unify_char returns a
   non-negative character for (CHARSET, C1, C2), that character is
   split back into a charset and position-codes, which are encoded
   instead; C1 and C2 are overwritten in that case, so they must be
   variables.  The character is then handed to the DIMENSION1 or
   DIMENSION2 encoder according to the dimension of the charset
   actually encoded.  */

#define ENCODE_ISO_CHARACTER(charset, c1, c2) \
  do { \
    int charset_alt = charset; \
    int c_alt; \
    \
    if (!NILP (unification_table) \
        && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
            >= 0)) \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
    if (CHARSET_DIMENSION (charset_alt) != 1) \
      ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
    else \
      ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1); \
  } while (0)
1204
1205 /* Produce designation and invocation codes at a place pointed by DST
1206 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1207 Return new DST. */
1208
1209 unsigned char *
1210 encode_invocation_designation (charset, coding, dst)
1211 int charset;
1212 struct coding_system *coding;
1213 unsigned char *dst;
1214 {
1215 int reg; /* graphic register number */
1216
1217 /* At first, check designations. */
1218 for (reg = 0; reg < 4; reg++)
1219 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1220 break;
1221
1222 if (reg >= 4)
1223 {
1224 /* CHARSET is not yet designated to any graphic registers. */
1225 /* At first check the requested designation. */
1226 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1227 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1228 /* Since CHARSET requests no special designation, designate it
1229 to graphic register 0. */
1230 reg = 0;
1231
1232 ENCODE_DESIGNATION (charset, reg, coding);
1233 }
1234
1235 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1236 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1237 {
1238 /* Since the graphic register REG is not invoked to any graphic
1239 planes, invoke it to graphic plane 0. */
1240 switch (reg)
1241 {
1242 case 0: /* graphic register 0 */
1243 ENCODE_SHIFT_IN;
1244 break;
1245
1246 case 1: /* graphic register 1 */
1247 ENCODE_SHIFT_OUT;
1248 break;
1249
1250 case 2: /* graphic register 2 */
1251 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1252 ENCODE_SINGLE_SHIFT_2;
1253 else
1254 ENCODE_LOCKING_SHIFT_2;
1255 break;
1256
1257 case 3: /* graphic register 3 */
1258 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1259 ENCODE_SINGLE_SHIFT_3;
1260 else
1261 ENCODE_LOCKING_SHIFT_3;
1262 break;
1263 }
1264 }
1265 return dst;
1266 }
1267
/* The following three macros produce the escape sequences that
   delimit a composite character: ESC '0' starts a composition without
   embedded rules, ESC '2' starts a composition with embedded rules,
   and ESC '1' ends a composition.  Each expects `dst' in the scope of
   the caller and advances it by two bytes.  The expansions are
   parenthesized so that each macro is a single expression wherever it
   appears.  */
#define ENCODE_COMPOSITION_NO_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
#define ENCODE_COMPOSITION_WITH_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '2')
#define ENCODE_COMPOSITION_END \
  (*dst++ = ISO_CODE_ESC, *dst++ = '1')
1272
/* The following three macros produce codes for indicating direction
   of text.  They expect `coding' and `dst' in the scope of the
   caller.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER produces ESC '[' when
   CODING_FLAG_ISO_SEVEN_BITS is set in `coding->flags' (whatever
   other flag bits are set), otherwise the single control character
   ISO_CODE_CSI.  ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R follow
   the introducer with "2]" and "0]" respectively.

   Each macro is one `do { ... } while (0)' statement, so it can be
   used wherever a single statement is allowed.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
  do { \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
      *dst++ = ISO_CODE_ESC, *dst++ = '['; \
    else \
      *dst++ = ISO_CODE_CSI; \
  } while (0)

#define ENCODE_DIRECTION_R2L \
  do { \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER; \
    *dst++ = '2'; \
    *dst++ = ']'; \
  } while (0)

#define ENCODE_DIRECTION_L2R \
  do { \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER; \
    *dst++ = '0'; \
    *dst++ = ']'; \
  } while (0)
1288
/* Produce the codes that bring the graphic planes and registers back
   to their initial state: a shift-in if graphic plane 0 does not
   currently have register 0 invoked, then a designation sequence for
   every register that has an initial designation (a non-negative
   value) different from its current designation.  Expects `coding'
   and `dst' in the scope of the caller.  */
#define ENCODE_RESET_PLANE_AND_REGISTER \
  do { \
    int reg; \
    \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
      ENCODE_SHIFT_IN; \
    for (reg = 0; reg < 4; reg++) \
      { \
        /* No initial designation for this register.  */ \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) < 0) \
          continue; \
        /* Already in its initial state.  */ \
        if (CODING_SPEC_ISO_DESIGNATION (coding, reg) \
            == CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)) \
          continue; \
        ENCODE_DESIGNATION \
          (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
      } \
  } while (0)
1303
1304 /* Produce designation sequences of charsets in the line started from
1305 *SRC to a place pointed by DSTP.
1306
1307 If the current block ends before any end-of-line, we may fail to
1308 find all the necessary *designations. */
1309 encode_designation_at_bol (coding, table, src, src_end, dstp)
1310 struct coding_system *coding;
1311 Lisp_Object table;
1312 unsigned char *src, *src_end, **dstp;
1313 {
1314 int charset, c, found = 0, reg;
1315 /* Table of charsets to be designated to each graphic register. */
1316 int r[4];
1317 unsigned char *dst = *dstp;
1318
1319 for (reg = 0; reg < 4; reg++)
1320 r[reg] = -1;
1321
1322 while (src < src_end && *src != '\n' && found < 4)
1323 {
1324 int bytes = BYTES_BY_CHAR_HEAD (*src);
1325
1326 if (NILP (table))
1327 charset = CHARSET_AT (src);
1328 else
1329 {
1330 int c_alt, c1, c2;
1331
1332 SPLIT_STRING(src, bytes, charset, c1, c2);
1333 if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1334 charset = CHAR_CHARSET (c_alt);
1335 }
1336
1337 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1338 if (r[reg] == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1339 {
1340 found++;
1341 r[reg] = charset;
1342 }
1343
1344 src += bytes;
1345 }
1346
1347 if (found)
1348 {
1349 for (reg = 0; reg < 4; reg++)
1350 if (r[reg] >= 0
1351 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1352 ENCODE_DESIGNATION (r[reg], reg, coding);
1353 *dstp = dst;
1354 }
1355 }
1356
1357 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
/* Encode the SRC_BYTES bytes at SOURCE into the ISO2022 variant
   described by CODING, writing at most DST_BYTES bytes at
   DESTINATION.  The number of source bytes consumed is stored in
   *CONSUMED and the number of bytes produced is returned.  The
   ISO2022 state (designations, invocations, composition state,
   beginning-of-line flag) kept in *CODING is updated as codes are
   produced.  */
1358
1359 int
1360 encode_coding_iso2022 (coding, source, destination,
1361 src_bytes, dst_bytes, consumed)
1362 struct coding_system *coding;
1363 unsigned char *source, *destination;
1364 int src_bytes, dst_bytes;
1365 int *consumed;
1366 {
1367 unsigned char *src = source;
1368 unsigned char *src_end = source + src_bytes;
1369 unsigned char *dst = destination;
1370 unsigned char *dst_end = destination + dst_bytes;
1371 /* Since the maximum bytes produced by each loop is 20, we subtract 19
1372 from DST_END to assure overflow checking is necessary only at the
1373 head of loop. */
1374 unsigned char *adjusted_dst_end = dst_end - 19;
1375 Lisp_Object unification_table
1376 = coding->character_unification_table_for_encode;
1377
/* Fall back to the standard table when unification is enabled but
   CODING has no table of its own.  */
1378 if (!NILP (Venable_character_unification) && NILP (unification_table))
1379 unification_table = Vstandard_character_unification_table_for_encode;
1380
1381 while (src < src_end && dst < adjusted_dst_end)
1382 {
1383 /* SRC_BASE remembers the start position in source in each loop.
1384 The loop will be exited when there's not enough source text
1385 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1386 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1387 reset to SRC_BASE before exiting. */
/* NOTE(review): the code at label_end_of_loop below saves the
   carryover but does not assign SRC_BASE back to SRC, unlike the
   sentence above says -- confirm which is intended.  */
1388 unsigned char *src_base = src;
1389 int charset, c1, c2, c3, c4;
1390
1391 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1392 && CODING_SPEC_ISO_BOL (coding))
1393 {
1394 /* We have to produce designation sequences if any now. */
1395 encode_designation_at_bol (coding, unification_table,
1396 src, src_end, &dst);
1397 CODING_SPEC_ISO_BOL (coding) = 0;
1398 }
1399
1400 c1 = *src++;
1401 /* If we are seeing a component of a composite character, we are
1402 seeing a leading-code specially encoded for composition, or a
1403 composition rule if composing with rule. We must set C1
1404 to a normal leading-code or an ASCII code. If we are not at
1405 a composed character, we must reset the composition state. */
1406 if (COMPOSING_P (coding->composing))
1407 {
1408 if (c1 < 0xA0)
1409 {
1410 /* We are not in a composite character any longer. */
1411 coding->composing = COMPOSING_NO;
1412 ENCODE_COMPOSITION_END;
1413 }
1414 else
1415 {
1416 if (coding->composing == COMPOSING_WITH_RULE_RULE)
1417 {
/* This byte is a composition rule: emit it with the high bit
   cleared and expect a component next.  */
1418 *dst++ = c1 & 0x7F;
1419 coding->composing = COMPOSING_WITH_RULE_HEAD;
1420 continue;
1421 }
1422 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1423 coding->composing = COMPOSING_WITH_RULE_RULE;
1424 if (c1 == 0xA0)
1425 {
1426 /* This is an ASCII component. */
1427 ONE_MORE_BYTE (c1);
1428 c1 &= 0x7F;
1429 }
1430 else
1431 /* This is a leading-code of non ASCII component. */
1432 c1 -= 0x20;
1433 }
1434 }
1435
1436 /* Now encode one character. C1 is a control character, an
1437 ASCII character, or a leading-code of multi-byte character. */
1438 switch (emacs_code_class[c1])
1439 {
1440 case EMACS_ascii_code:
1441 ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
1442 break;
1443
1444 case EMACS_control_code:
1445 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1446 ENCODE_RESET_PLANE_AND_REGISTER;
1447 *dst++ = c1;
1448 break;
1449
1450 case EMACS_carriage_return_code:
1451 if (!coding->selective)
1452 {
1453 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1454 ENCODE_RESET_PLANE_AND_REGISTER;
1455 *dst++ = c1;
1456 break;
1457 }
1458 /* fall down to treat '\r' as '\n' ... */
1459
1460 case EMACS_linefeed_code:
1461 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1462 ENCODE_RESET_PLANE_AND_REGISTER;
/* With CODING_FLAG_ISO_INIT_AT_BOL, the recorded current designations
   are overwritten with the initial ones without producing any code.  */
1463 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1464 bcopy (coding->spec.iso2022.initial_designation,
1465 coding->spec.iso2022.current_designation,
1466 sizeof coding->spec.iso2022.initial_designation);
/* Produce the end-of-line in the form selected by `coding->eol_type':
   LF (also when undecided), CR LF, or CR alone.  */
1467 if (coding->eol_type == CODING_EOL_LF
1468 || coding->eol_type == CODING_EOL_UNDECIDED)
1469 *dst++ = ISO_CODE_LF;
1470 else if (coding->eol_type == CODING_EOL_CRLF)
1471 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1472 else
1473 *dst++ = ISO_CODE_CR;
1474 CODING_SPEC_ISO_BOL (coding) = 1;
1475 break;
1476
1477 case EMACS_leading_code_2:
1478 ONE_MORE_BYTE (c2);
1479 ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
1480 break;
1481
1482 case EMACS_leading_code_3:
1483 TWO_MORE_BYTES (c2, c3);
/* Below LEADING_CODE_PRIVATE_11, C1 itself is passed as the charset
   with two position-codes; otherwise C2 is passed as the charset with
   the single position-code C3.  */
1484 if (c1 < LEADING_CODE_PRIVATE_11)
1485 ENCODE_ISO_CHARACTER (c1, c2, c3);
1486 else
1487 ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
1488 break;
1489
1490 case EMACS_leading_code_4:
1491 THREE_MORE_BYTES (c2, c3, c4);
1492 ENCODE_ISO_CHARACTER (c2, c3, c4);
1493 break;
1494
1495 case EMACS_leading_code_composition:
1496 ONE_MORE_BYTE (c1);
/* A following 0xFF byte selects composition with embedded rules.  */
1497 if (c1 == 0xFF)
1498 {
1499 coding->composing = COMPOSING_WITH_RULE_HEAD;
1500 ENCODE_COMPOSITION_WITH_RULE_START;
1501 }
1502 else
1503 {
1504 /* Rewind one byte because it is a character code of
1505 composition elements. */
1506 src--;
1507 coding->composing = COMPOSING_NO_RULE_HEAD;
1508 ENCODE_COMPOSITION_NO_RULE_START;
1509 }
1510 break;
1511
1512 case EMACS_invalid_code:
1513 *dst++ = c1;
1514 break;
1515 }
1516 continue;
/* Reached by `goto' only (the comment at SRC_BASE names the macros
   that jump here): save the bytes read so far for this character in
   `coding->carryover' and leave the loop.  */
1517 label_end_of_loop:
1518 coding->carryover_size = src - src_base;
1519 bcopy (src_base, coding->carryover, coding->carryover_size);
1520 break;
1521 }
1522
1523 /* If this is the last block of the text to be encoded, we must
1524 reset graphic planes and registers to the initial state. */
1525 if (src >= src_end && coding->last_block)
1526 {
1527 ENCODE_RESET_PLANE_AND_REGISTER;
/* Flush any saved carryover bytes as they are, provided they fit in
   the space left in the destination.  */
1528 if (coding->carryover_size > 0
1529 && coding->carryover_size < (dst_end - dst))
1530 {
1531 bcopy (coding->carryover, dst, coding->carryover_size);
1532 dst += coding->carryover_size;
1533 coding->carryover_size = 0;
1534 }
1535 }
1536 *consumed = src - source;
1537 return dst - destination;
1538 }
1539
1540 \f
1541 /*** 4. SJIS and BIG5 handlers ***/
1542
1543 /* Although SJIS and BIG5 are not ISO coding systems, they are used
1544 quite widely. So, for the moment, Emacs supports them in the bare
1545 C code. But, in the future, they may be supported only by CCL. */
1546
1547 /* SJIS is a coding system encoding three character sets: ASCII, right
1548 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
1549 as is. A character of charset katakana-jisx0201 is encoded by
1550 "position-code + 0x80". A character of charset japanese-jisx0208
1551 is encoded in two bytes, but its two position-codes are divided and
1552 shifted so that they fit in the ranges below.
1553
1554 --- CODE RANGE of SJIS ---
1555 (character set) (range)
1556 ASCII 0x00 .. 0x7F
1557 KATAKANA-JISX0201 0xA0 .. 0xDF
1558 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1559 (2nd byte) 0x40 .. 0xFF
1560 -------------------------------
1561
1562 */
1563
1564 /* BIG5 is a coding system encoding two character sets: ASCII and
1565 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1566 character set and is encoded in two bytes.
1567
1568 --- CODE RANGE of BIG5 ---
1569 (character set) (range)
1570 ASCII 0x00 .. 0x7F
1571 Big5 (1st byte) 0xA1 .. 0xFE
1572 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1573 --------------------------
1574
1575 Since the number of characters in Big5 is larger than maximum
1576 characters in Emacs' charset (96x96), it can't be handled as one
1577 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
1578 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
1579 contains frequently used characters and the latter contains less
1580 frequently used characters. */
1581
1582 /* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
1583 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
1584 C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
1585 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
1586
/* Number of Big5 characters which have the same code in 1st byte:
   the 2nd byte ranges over 0x40..0x7E and 0xA1..0xFE.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Decode the Big5 code (B1, B2) into CHARSET, C1, and C2.  The code
   is turned into a linear index with BIG5_SAME_ROW codes per 1st
   byte.  Codes whose 1st byte is below 0xC9 belong to
   `charset_big5_1'; the others belong to `charset_big5_2' and are
   re-based so that their index also starts at 0.  The index is then
   split into two position-codes of a 94x94 charset.  CHARSET, C1, and
   C2 must be lvalues.  Every argument is parenthesized in the
   expansion, so expressions may be passed for B1 and B2.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2) \
  do { \
    unsigned int temp \
      = (((b1) - 0xA1) * BIG5_SAME_ROW \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62)); \
    if ((b1) < 0xC9) \
      (charset) = charset_big5_1; \
    else \
      { \
        (charset) = charset_big5_2; \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW; \
      } \
    (c1) = temp / (0xFF - 0xA1) + 0x21; \
    (c2) = temp % (0xFF - 0xA1) + 0x21; \
  } while (0)

/* Encode the character of CHARSET (`charset_big5_1' or
   `charset_big5_2') with position-codes C1 and C2 into the Big5 code
   (B1, B2); this is the inverse of DECODE_BIG5.  B1 and B2 must be
   lvalues.  Every argument is parenthesized in the expansion, so
   expressions may be passed for CHARSET, C1, and C2.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2) \
  do { \
    unsigned int temp = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21); \
    if ((charset) == charset_big5_2) \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1); \
    (b1) = temp / BIG5_SAME_ROW + 0xA1; \
    (b2) = temp % BIG5_SAME_ROW; \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62; \
  } while (0)
1614
/* Produce, in Emacs' internal format, the character of CHARSET whose
   position-codes are C1 and C2.

   If `unification_table' is non-nil and unify_char returns a
   non-negative character for (CHARSET, C1, C2), that character is
   split back into a charset and position-codes, which are produced
   instead; C1 and C2 are overwritten in that case, so they must be
   variables.  A resulting charset that is CHARSET_ASCII or negative
   is produced as an ASCII character; otherwise the DIMENSION1 or
   DIMENSION2 producer is chosen by the dimension of the charset.  */
#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
  do { \
    int c_alt, charset_alt; \
    \
    if (!NILP (unification_table) \
        && ((c_alt = unify_char (unification_table, \
                                 -1, (charset), c1, c2)) >= 0)) \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
    else \
      charset_alt = (charset); \
    if (charset_alt < 0 || charset_alt == CHARSET_ASCII) \
      DECODE_CHARACTER_ASCII (c1); \
    else if (CHARSET_DIMENSION (charset_alt) != 1) \
      DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
    else \
      DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
  } while (0)
1629
/* Produce, at `dst', the SJIS or BIG5 code (chosen by `sjis_p') for
   the character of CHARSET whose position-codes are C1 and C2.

   If `unification_table' is non-nil and unify_char returns a
   non-negative character for (CHARSET, C1, C2), that character is
   split back into a charset and position-codes, which are encoded
   instead.  C1 and C2 may be overwritten, so they must be variables.

   Bytes produced:
   - ASCII: C1 as is;
   - a DIMENSION1 charset: C1 alone for katakana-jisx0201 under SJIS,
     otherwise the charset followed by C1, without any encoding;
   - a DIMENSION2 charset: the two bytes given by ENCODE_SJIS for
     jisx0208 under SJIS, the two bytes given by ENCODE_BIG5 for
     big5-1 and big5-2 under BIG5, otherwise the charset followed by
     C1 and C2 (with their high bits cleared), three bytes in all.

   The macro is a single `do { ... } while (0)' statement; the caller
   supplies the terminating semicolon.  */
#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
  do { \
    int c_alt, charset_alt; \
    if (!NILP (unification_table) \
        && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
            >= 0)) \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
    else \
      charset_alt = charset; \
    if (charset_alt == charset_ascii) \
      *dst++ = c1; \
    else if (CHARSET_DIMENSION (charset_alt) == 1) \
      { \
        if (sjis_p && charset_alt == charset_katakana_jisx0201) \
          *dst++ = c1; \
        else \
          *dst++ = charset_alt, *dst++ = c1; \
      } \
    else \
      { \
        c1 &= 0x7F, c2 &= 0x7F; \
        if (sjis_p && charset_alt == charset_jisx0208) \
          { \
            unsigned char s1, s2; \
            \
            ENCODE_SJIS (c1, c2, s1, s2); \
            *dst++ = s1, *dst++ = s2; \
          } \
        else if (!sjis_p \
                 && (charset_alt == charset_big5_1 \
                     || charset_alt == charset_big5_2)) \
          { \
            unsigned char b1, b2; \
            \
            /* ENCODE_BIG5 takes the charset first, then the two \
               position-codes.  */ \
            ENCODE_BIG5 (charset_alt, c1, c2, b1, b2); \
            *dst++ = b1, *dst++ = b2; \
          } \
        else \
          *dst++ = charset_alt, *dst++ = c1, *dst++ = c2; \
      } \
  } while (0)
1671
1672 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1673 Check if a text is encoded in SJIS. If it is, return
1674 CODING_CATEGORY_MASK_SJIS, else return 0. */
1675
1676 int
1677 detect_coding_sjis (src, src_end)
1678 unsigned char *src, *src_end;
1679 {
1680 unsigned char c;
1681
1682 while (src < src_end)
1683 {
1684 c = *src++;
1685 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1686 return 0;
1687 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1688 {
1689 if (src < src_end && *src++ < 0x40)
1690 return 0;
1691 }
1692 }
1693 return CODING_CATEGORY_MASK_SJIS;
1694 }
1695
1696 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1697 Check if a text is encoded in BIG5. If it is, return
1698 CODING_CATEGORY_MASK_BIG5, else return 0. */
1699
1700 int
1701 detect_coding_big5 (src, src_end)
1702 unsigned char *src, *src_end;
1703 {
1704 unsigned char c;
1705
1706 while (src < src_end)
1707 {
1708 c = *src++;
1709 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1710 return 0;
1711 if (c >= 0xA1)
1712 {
1713 if (src >= src_end)
1714 break;
1715 c = *src++;
1716 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1717 return 0;
1718 }
1719 }
1720 return CODING_CATEGORY_MASK_BIG5;
1721 }
1722
1723 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1724 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
1725
1726 int
1727 decode_coding_sjis_big5 (coding, source, destination,
1728 src_bytes, dst_bytes, consumed, sjis_p)
1729 struct coding_system *coding;
1730 unsigned char *source, *destination;
1731 int src_bytes, dst_bytes;
1732 int *consumed;
1733 int sjis_p;
1734 {
1735 unsigned char *src = source;
1736 unsigned char *src_end = source + src_bytes;
1737 unsigned char *dst = destination;
1738 unsigned char *dst_end = destination + dst_bytes;
1739 /* Since the maximum bytes produced by each loop is 4, we subtract 3
1740 from DST_END to assure overflow checking is necessary only at the
1741 head of loop. */
1742 unsigned char *adjusted_dst_end = dst_end - 3;
1743 Lisp_Object unification_table
1744 = coding->character_unification_table_for_decode;
1745
1746 if (!NILP (Venable_character_unification) && NILP (unification_table))
1747 unification_table = Vstandard_character_unification_table_for_decode;
1748
1749 while (src < src_end && dst < adjusted_dst_end)
1750 {
1751 /* SRC_BASE remembers the start position in source in each loop.
1752 The loop will be exited when there's not enough source text
1753 to analyze two-byte character (within macro ONE_MORE_BYTE).
1754 In that case, SRC is reset to SRC_BASE before exiting. */
1755 unsigned char *src_base = src;
1756 unsigned char c1 = *src++, c2, c3, c4;
1757
1758 if (c1 == '\r')
1759 {
1760 if (coding->eol_type == CODING_EOL_CRLF)
1761 {
1762 ONE_MORE_BYTE (c2);
1763 if (c2 == '\n')
1764 *dst++ = c2;
1765 else
1766 /* To process C2 again, SRC is subtracted by 1. */
1767 *dst++ = c1, src--;
1768 }
1769 else
1770 *dst++ = c1;
1771 }
1772 else if (c1 < 0x20)
1773 *dst++ = c1;
1774 else if (c1 < 0x80)
1775 DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1776 else if (c1 < 0xA0 || c1 >= 0xE0)
1777 {
1778 /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1779 if (sjis_p)
1780 {
1781 ONE_MORE_BYTE (c2);
1782 DECODE_SJIS (c1, c2, c3, c4);
1783 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
1784 }
1785 else if (c1 >= 0xE0 && c1 < 0xFF)
1786 {
1787 int charset;
1788
1789 ONE_MORE_BYTE (c2);
1790 DECODE_BIG5 (c1, c2, charset, c3, c4);
1791 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1792 }
1793 else /* Invalid code */
1794 *dst++ = c1;
1795 }
1796 else
1797 {
1798 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1799 if (sjis_p)
1800 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1, /* dummy */ c2);
1801 else
1802 {
1803 int charset;
1804
1805 ONE_MORE_BYTE (c2);
1806 DECODE_BIG5 (c1, c2, charset, c3, c4);
1807 DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1808 }
1809 }
1810 continue;
1811
1812 label_end_of_loop:
1813 coding->carryover_size = src - src_base;
1814 bcopy (src_base, coding->carryover, coding->carryover_size);
1815 src = src_base;
1816 break;
1817 }
1818
1819 *consumed = src - source;
1820 return dst - destination;
1821 }
1822
1823 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1824 This function can encode `charset_ascii', `charset_katakana_jisx0201',
1825 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
1826 sure that all these charsets are registered as official charset
1827 (i.e. do not have extended leading-codes). Characters of other
1828 charsets are produced without any encoding. If SJIS_P is 1, encode
1829 SJIS text, else encode BIG5 text. */
1830
1831 int
1832 encode_coding_sjis_big5 (coding, source, destination,
1833 src_bytes, dst_bytes, consumed, sjis_p)
1834 struct coding_system *coding;
1835 unsigned char *source, *destination;
1836 int src_bytes, dst_bytes;
1837 int *consumed;
1838 int sjis_p;
1839 {
1840 unsigned char *src = source;
1841 unsigned char *src_end = source + src_bytes;
1842 unsigned char *dst = destination;
1843 unsigned char *dst_end = destination + dst_bytes;
1844 /* Since the maximum bytes produced by each loop is 2, we subtract 1
1845 from DST_END to assure overflow checking is necessary only at the
1846 head of loop. */
1847 unsigned char *adjusted_dst_end = dst_end - 1;
1848 Lisp_Object unification_table
1849 = coding->character_unification_table_for_encode;
1850
1851 if (!NILP (Venable_character_unification) && NILP (unification_table))
1852 unification_table = Vstandard_character_unification_table_for_encode;
1853
1854 while (src < src_end && dst < adjusted_dst_end)
1855 {
1856 /* SRC_BASE remembers the start position in source in each loop.
1857 The loop will be exited when there's not enough source text
1858 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1859 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
1860 before exiting. */
1861 unsigned char *src_base = src;
1862 unsigned char c1 = *src++, c2, c3, c4;
1863
1864 if (coding->composing)
1865 {
1866 if (c1 == 0xA0)
1867 {
1868 ONE_MORE_BYTE (c1);
1869 c1 &= 0x7F;
1870 }
1871 else if (c1 >= 0xA0)
1872 c1 -= 0x20;
1873 else
1874 coding->composing = 0;
1875 }
1876
1877 switch (emacs_code_class[c1])
1878 {
1879 case EMACS_ascii_code:
1880 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1881 break;
1882
1883 case EMACS_control_code:
1884 *dst++ = c1;
1885 break;
1886
1887 case EMACS_carriage_return_code:
1888 if (!coding->selective)
1889 {
1890 *dst++ = c1;
1891 break;
1892 }
1893 /* fall down to treat '\r' as '\n' ... */
1894
1895 case EMACS_linefeed_code:
1896 if (coding->eol_type == CODING_EOL_LF
1897 || coding->eol_type == CODING_EOL_UNDECIDED)
1898 *dst++ = '\n';
1899 else if (coding->eol_type == CODING_EOL_CRLF)
1900 *dst++ = '\r', *dst++ = '\n';
1901 else
1902 *dst++ = '\r';
1903 break;
1904
1905 case EMACS_leading_code_2:
1906 ONE_MORE_BYTE (c2);
1907 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
1908 break;
1909
1910 case EMACS_leading_code_3:
1911 TWO_MORE_BYTES (c2, c3);
1912 ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
1913 break;
1914
1915 case EMACS_leading_code_4:
1916 THREE_MORE_BYTES (c2, c3, c4);
1917 ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
1918 break;
1919
1920 case EMACS_leading_code_composition:
1921 coding->composing = 1;
1922 break;
1923
1924 default: /* i.e. case EMACS_invalid_code: */
1925 *dst++ = c1;
1926 }
1927 continue;
1928
1929 label_end_of_loop:
1930 coding->carryover_size = src - src_base;
1931 bcopy (src_base, coding->carryover, coding->carryover_size);
1932 src = src_base;
1933 break;
1934 }
1935
1936 *consumed = src - source;
1937 return dst - destination;
1938 }
1939
1940 \f
1941 /*** 5. End-of-line handlers ***/
1942
1943 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1944 This function is called only when `coding->eol_type' is
1945 CODING_EOL_CRLF or CODING_EOL_CR. */
1946
1947 decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1948 struct coding_system *coding;
1949 unsigned char *source, *destination;
1950 int src_bytes, dst_bytes;
1951 int *consumed;
1952 {
1953 unsigned char *src = source;
1954 unsigned char *src_end = source + src_bytes;
1955 unsigned char *dst = destination;
1956 unsigned char *dst_end = destination + dst_bytes;
1957 int produced;
1958
1959 switch (coding->eol_type)
1960 {
1961 case CODING_EOL_CRLF:
1962 {
1963 /* Since the maximum bytes produced by each loop is 2, we
1964 subtract 1 from DST_END to assure overflow checking is
1965 necessary only at the head of loop. */
1966 unsigned char *adjusted_dst_end = dst_end - 1;
1967
1968 while (src < src_end && dst < adjusted_dst_end)
1969 {
1970 unsigned char *src_base = src;
1971 unsigned char c = *src++;
1972 if (c == '\r')
1973 {
1974 ONE_MORE_BYTE (c);
1975 if (c != '\n')
1976 *dst++ = '\r';
1977 *dst++ = c;
1978 }
1979 else
1980 *dst++ = c;
1981 continue;
1982
1983 label_end_of_loop:
1984 coding->carryover_size = src - src_base;
1985 bcopy (src_base, coding->carryover, coding->carryover_size);
1986 src = src_base;
1987 break;
1988 }
1989 *consumed = src - source;
1990 produced = dst - destination;
1991 break;
1992 }
1993
1994 case CODING_EOL_CR:
1995 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1996 bcopy (source, destination, produced);
1997 dst_end = destination + produced;
1998 while (dst < dst_end)
1999 if (*dst++ == '\r') dst[-1] = '\n';
2000 *consumed = produced;
2001 break;
2002
2003 default: /* i.e. case: CODING_EOL_LF */
2004 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2005 bcopy (source, destination, produced);
2006 *consumed = produced;
2007 break;
2008 }
2009
2010 return produced;
2011 }
2012
2013 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2014 format of end-of-line according to `coding->eol_type'. If
2015 `coding->selective' is 1, code '\r' in source text also means
2016 end-of-line. */
2017
2018 encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2019 struct coding_system *coding;
2020 unsigned char *source, *destination;
2021 int src_bytes, dst_bytes;
2022 int *consumed;
2023 {
2024 unsigned char *src = source;
2025 unsigned char *dst = destination;
2026 int produced;
2027
2028 if (src_bytes <= 0)
2029 return 0;
2030
2031 switch (coding->eol_type)
2032 {
2033 case CODING_EOL_LF:
2034 case CODING_EOL_UNDECIDED:
2035 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2036 bcopy (source, destination, produced);
2037 if (coding->selective)
2038 {
2039 int i = produced;
2040 while (i--)
2041 if (*dst++ == '\r') dst[-1] = '\n';
2042 }
2043 *consumed = produced;
2044
2045 case CODING_EOL_CRLF:
2046 {
2047 unsigned char c;
2048 unsigned char *src_end = source + src_bytes;
2049 unsigned char *dst_end = destination + dst_bytes;
2050 /* Since the maximum bytes produced by each loop is 2, we
2051 subtract 1 from DST_END to assure overflow checking is
2052 necessary only at the head of loop. */
2053 unsigned char *adjusted_dst_end = dst_end - 1;
2054
2055 while (src < src_end && dst < adjusted_dst_end)
2056 {
2057 c = *src++;
2058 if (c == '\n' || (c == '\r' && coding->selective))
2059 *dst++ = '\r', *dst++ = '\n';
2060 else
2061 *dst++ = c;
2062 }
2063 produced = dst - destination;
2064 *consumed = src - source;
2065 break;
2066 }
2067
2068 default: /* i.e. case CODING_EOL_CR: */
2069 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2070 bcopy (source, destination, produced);
2071 {
2072 int i = produced;
2073 while (i--)
2074 if (*dst++ == '\n') dst[-1] = '\r';
2075 }
2076 *consumed = produced;
2077 }
2078
2079 return produced;
2080 }
2081
2082 \f
2083 /*** 6. C library functions ***/
2084
2085 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2086 has a property `coding-system'. The value of this property is a
2087 vector of length 5 (called as coding-vector). Among elements of
2088 this vector, the first (element[0]) and the fifth (element[4])
2089 carry important information for decoding/encoding. Before
2090 decoding/encoding, this information should be set in fields of a
2091 structure of type `coding_system'.
2092
2093 A value of property `coding-system' can be a symbol of another
2094 subsidiary coding-system. In that case, Emacs gets coding-vector
2095 from that symbol.
2096
2097 `element[0]' contains information to be set in `coding->type'. The
2098 value and its meaning is as follows:
2099
2100 0 -- coding_type_emacs_mule
2101 1 -- coding_type_sjis
2102 2 -- coding_type_iso2022
2103 3 -- coding_type_big5
2104 4 -- coding_type_ccl encoder/decoder written in CCL
2105 nil -- coding_type_no_conversion
2106 t -- coding_type_undecided (automatic conversion on decoding,
2107 no-conversion on encoding)
2108
2109 `element[4]' contains information to be set in `coding->flags' and
2110 `coding->spec'. The meaning varies by `coding->type'.
2111
  If `coding->type' is `coding_type_iso2022', element[4] is a vector
  of length 32 (of which the first 15 sub-elements are used now).
  Meanings of these sub-elements are:
2115
2116 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2117 If the value is an integer of valid charset, the charset is
2118 assumed to be designated to graphic register N initially.
2119
2120 If the value is minus, it is a minus value of charset which
2121 reserves graphic register N, which means that the charset is
2122 not designated initially but should be designated to graphic
2123 register N just before encoding a character in that charset.
2124
2125 If the value is nil, graphic register N is never used on
2126 encoding.
2127
  sub-element[N] where N is 4 through 14: to be set in `coding->flags'
	Each value takes t or nil.  See the section ISO2022 of
	`coding.h' for more information.
2131
2132 If `coding->type' is `coding_type_big5', element[4] is t to denote
2133 BIG5-ETen or nil to denote BIG5-HKU.
2134
2135 If `coding->type' takes the other value, element[4] is ignored.
2136
2137 Emacs Lisp's coding system also carries information about format of
2138 end-of-line in a value of property `eol-type'. If the value is
2139 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2140 means CODING_EOL_CR. If it is not integer, it should be a vector
2141 of subsidiary coding systems of which property `eol-type' has one
2142 of above values.
2143
2144 */
2145
2146 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2147 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2148 is setup so that no conversion is necessary and return -1, else
2149 return 0. */
2150
2151 int
2152 setup_coding_system (coding_system, coding)
2153 Lisp_Object coding_system;
2154 struct coding_system *coding;
2155 {
2156 Lisp_Object type, eol_type;
2157
2158 /* At first, set several fields default values. */
2159 coding->require_flushing = 0;
2160 coding->last_block = 0;
2161 coding->selective = 0;
2162 coding->composing = 0;
2163 coding->direction = 0;
2164 coding->carryover_size = 0;
2165 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2166 coding->character_unification_table_for_decode = Qnil;
2167 coding->character_unification_table_for_encode = Qnil;
2168
2169 Vlast_coding_system_used = coding->symbol = coding_system;
2170 eol_type = Qnil;
2171 /* Get value of property `coding-system' until we get a vector.
2172 While doing that, also get values of properties
2173 `post-read-conversion', `pre-write-conversion',
2174 `character-unification-table-for-decode',
2175 `character-unification-table-for-encode' and `eol-type'. */
2176 while (!NILP (coding_system) && SYMBOLP (coding_system))
2177 {
2178 if (NILP (coding->post_read_conversion))
2179 coding->post_read_conversion = Fget (coding_system,
2180 Qpost_read_conversion);
2181 if (NILP (coding->pre_write_conversion))
2182 coding->pre_write_conversion = Fget (coding_system,
2183 Qpre_write_conversion);
2184 if (NILP (eol_type))
2185 eol_type = Fget (coding_system, Qeol_type);
2186
2187 if (NILP (coding->character_unification_table_for_decode))
2188 coding->character_unification_table_for_decode
2189 = Fget (coding_system, Qcharacter_unification_table_for_decode);
2190
2191 if (NILP (coding->character_unification_table_for_encode))
2192 coding->character_unification_table_for_encode
2193 = Fget (coding_system, Qcharacter_unification_table_for_encode);
2194
2195 coding_system = Fget (coding_system, Qcoding_system);
2196 }
2197
2198 while (!NILP (coding->character_unification_table_for_decode)
2199 && SYMBOLP (coding->character_unification_table_for_decode))
2200 coding->character_unification_table_for_decode
2201 = Fget (coding->character_unification_table_for_decode,
2202 Qcharacter_unification_table_for_decode);
2203 if (!NILP (coding->character_unification_table_for_decode)
2204 && !CHAR_TABLE_P (coding->character_unification_table_for_decode))
2205 coding->character_unification_table_for_decode = Qnil;
2206
2207 while (!NILP (coding->character_unification_table_for_encode)
2208 && SYMBOLP (coding->character_unification_table_for_encode))
2209 coding->character_unification_table_for_encode
2210 = Fget (coding->character_unification_table_for_encode,
2211 Qcharacter_unification_table_for_encode);
2212 if (!NILP (coding->character_unification_table_for_encode)
2213 && !CHAR_TABLE_P (coding->character_unification_table_for_encode))
2214 coding->character_unification_table_for_encode = Qnil;
2215
2216 if (!VECTORP (coding_system)
2217 || XVECTOR (coding_system)->size != 5)
2218 goto label_invalid_coding_system;
2219
2220 if (VECTORP (eol_type))
2221 coding->eol_type = CODING_EOL_UNDECIDED;
2222 else if (XFASTINT (eol_type) == 1)
2223 coding->eol_type = CODING_EOL_CRLF;
2224 else if (XFASTINT (eol_type) == 2)
2225 coding->eol_type = CODING_EOL_CR;
2226 else
2227 coding->eol_type = CODING_EOL_LF;
2228
2229 type = XVECTOR (coding_system)->contents[0];
2230 switch (XFASTINT (type))
2231 {
2232 case 0:
2233 coding->type = coding_type_emacs_mule;
2234 break;
2235
2236 case 1:
2237 coding->type = coding_type_sjis;
2238 break;
2239
2240 case 2:
2241 coding->type = coding_type_iso2022;
2242 {
2243 Lisp_Object val = XVECTOR (coding_system)->contents[4];
2244 Lisp_Object *flags;
2245 int i, charset, default_reg_bits = 0;
2246
2247 if (!VECTORP (val) || XVECTOR (val)->size != 32)
2248 goto label_invalid_coding_system;
2249
2250 flags = XVECTOR (val)->contents;
2251 coding->flags
2252 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2253 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2254 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2255 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2256 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2257 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2258 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2259 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2260 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2261 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2262 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL));
2263
2264 /* Invoke graphic register 0 to plane 0. */
2265 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2266 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2267 CODING_SPEC_ISO_INVOCATION (coding, 1)
2268 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2269 /* Not single shifting at first. */
2270 CODING_SPEC_ISO_SINGLE_SHIFTING(coding) = 0;
2271 /* Beginning of buffer should also be regarded as bol. */
2272 CODING_SPEC_ISO_BOL(coding) = 1;
2273
2274 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2275 FLAGS[REG] can be one of below:
2276 integer CHARSET: CHARSET occupies register I,
2277 t: designate nothing to REG initially, but can be used
2278 by any charsets,
2279 list of integer, nil, or t: designate the first
2280 element (if integer) to REG initially, the remaining
2281 elements (if integer) is designated to REG on request,
2282 if an element is t, REG can be used by any charset,
2283 nil: REG is never used. */
2284 for (charset = 0; charset <= MAX_CHARSET; charset++)
2285 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2286 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
2287 for (i = 0; i < 4; i++)
2288 {
2289 if (INTEGERP (flags[i])
2290 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2291 || (charset = get_charset_id (flags[i])) >= 0)
2292 {
2293 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2294 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2295 }
2296 else if (EQ (flags[i], Qt))
2297 {
2298 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2299 default_reg_bits |= 1 << i;
2300 }
2301 else if (CONSP (flags[i]))
2302 {
2303 Lisp_Object tail = flags[i];
2304
2305 if (INTEGERP (XCONS (tail)->car)
2306 && (charset = XINT (XCONS (tail)->car),
2307 CHARSET_VALID_P (charset))
2308 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2309 {
2310 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2311 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2312 }
2313 else
2314 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2315 tail = XCONS (tail)->cdr;
2316 while (CONSP (tail))
2317 {
2318 if (INTEGERP (XCONS (tail)->car)
2319 && (charset = XINT (XCONS (tail)->car),
2320 CHARSET_VALID_P (charset))
2321 || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2322 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2323 = i;
2324 else if (EQ (XCONS (tail)->car, Qt))
2325 default_reg_bits |= 1 << i;
2326 tail = XCONS (tail)->cdr;
2327 }
2328 }
2329 else
2330 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2331
2332 CODING_SPEC_ISO_DESIGNATION (coding, i)
2333 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2334 }
2335
2336 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2337 {
2338 /* REG 1 can be used only by locking shift in 7-bit env. */
2339 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2340 default_reg_bits &= ~2;
2341 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2342 /* Without any shifting, only REG 0 and 1 can be used. */
2343 default_reg_bits &= 3;
2344 }
2345
2346 for (charset = 0; charset <= MAX_CHARSET; charset++)
2347 if (CHARSET_VALID_P (charset)
2348 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2349 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
2350 {
2351 /* We have not yet decided where to designate CHARSET. */
2352 int reg_bits = default_reg_bits;
2353
2354 if (CHARSET_CHARS (charset) == 96)
2355 /* A charset of CHARS96 can't be designated to REG 0. */
2356 reg_bits &= ~1;
2357
2358 if (reg_bits)
2359 /* There exist some default graphic register. */
2360 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2361 = (reg_bits & 1
2362 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2363 else
2364 /* We anyway have to designate CHARSET to somewhere. */
2365 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2366 = (CHARSET_CHARS (charset) == 94
2367 ? 0
2368 : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2369 || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2370 ? 1
2371 : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2372 ? 2 : 0)));
2373 }
2374 }
2375 coding->require_flushing = 1;
2376 break;
2377
2378 case 3:
2379 coding->type = coding_type_big5;
2380 coding->flags
2381 = (NILP (XVECTOR (coding_system)->contents[4])
2382 ? CODING_FLAG_BIG5_HKU
2383 : CODING_FLAG_BIG5_ETEN);
2384 break;
2385
2386 case 4:
2387 coding->type = coding_type_ccl;
2388 {
2389 Lisp_Object val = XVECTOR (coding_system)->contents[4];
2390 if (CONSP (val)
2391 && VECTORP (XCONS (val)->car)
2392 && VECTORP (XCONS (val)->cdr))
2393 {
2394 setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2395 setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2396 }
2397 else
2398 goto label_invalid_coding_system;
2399 }
2400 coding->require_flushing = 1;
2401 break;
2402
2403 default:
2404 if (EQ (type, Qt))
2405 coding->type = coding_type_undecided;
2406 else
2407 coding->type = coding_type_no_conversion;
2408 break;
2409 }
2410 return 0;
2411
2412 label_invalid_coding_system:
2413 coding->type = coding_type_no_conversion;
2414 coding->eol_type = CODING_EOL_LF;
2415 coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2416 = Qnil;
2417 return -1;
2418 }
2419
/* Emacs has a mechanism to automatically detect a coding system if it
   is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But
   it is impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So, at first, coding
   systems are classified into the following 8 categories:
2425
2426 o coding-category-emacs-mule
2427
2428 The category for a coding system which has the same code range
2429 as Emacs' internal format. Assigned the coding-system (Lisp
2430 symbol) `emacs-mule' by default.
2431
2432 o coding-category-sjis
2433
2434 The category for a coding system which has the same code range
2435 as SJIS. Assigned the coding-system (Lisp
2436 symbol) `shift-jis' by default.
2437
2438 o coding-category-iso-7
2439
2440 The category for a coding system which has the same code range
2441 as ISO2022 of 7-bit environment. Assigned the coding-system
2442 (Lisp symbol) `iso-2022-7' by default.
2443
2444 o coding-category-iso-8-1
2445
2446 The category for a coding system which has the same code range
2447 as ISO2022 of 8-bit environment and graphic plane 1 used only
2448 for DIMENSION1 charset. Assigned the coding-system (Lisp
2449 symbol) `iso-8859-1' by default.
2450
2451 o coding-category-iso-8-2
2452
2453 The category for a coding system which has the same code range
2454 as ISO2022 of 8-bit environment and graphic plane 1 used only
2455 for DIMENSION2 charset. Assigned the coding-system (Lisp
2456 symbol) `euc-japan' by default.
2457
2458 o coding-category-iso-else
2459
	The category for a coding system which has the same code range
	as ISO2022 but does not belong to any of the above three
	categories.  Assigned the coding-system (Lisp symbol)
	`iso-2022-ss2-7' by default.
2464
2465 o coding-category-big5
2466
2467 The category for a coding system which has the same code range
2468 as BIG5. Assigned the coding-system (Lisp symbol)
2469 `cn-big5' by default.
2470
2471 o coding-category-binary
2472
2473 The category for a coding system not categorized in any of the
2474 above. Assigned the coding-system (Lisp symbol)
2475 `no-conversion' by default.
2476
2477 Each of them is a Lisp symbol and the value is an actual
2478 `coding-system's (this is also a Lisp symbol) assigned by a user.
2479 What Emacs does actually is to detect a category of coding system.
2480 Then, it uses a `coding-system' assigned to it. If Emacs can't
2481 decide only one possible category, it selects a category of the
2482 highest priority. Priorities of categories are also specified by a
2483 user in a Lisp variable `coding-category-list'.
2484
2485 */
2486
2487 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2488 If it detects possible coding systems, return an integer in which
2489 appropriate flag bits are set. Flag bits are defined by macros
2490 CODING_CATEGORY_MASK_XXX in `coding.h'. */
2491
2492 int
2493 detect_coding_mask (src, src_bytes)
2494 unsigned char *src;
2495 int src_bytes;
2496 {
2497 register unsigned char c;
2498 unsigned char *src_end = src + src_bytes;
2499 int mask;
2500
2501 /* At first, skip all ASCII characters and control characters except
2502 for three ISO2022 specific control characters. */
2503 label_loop_detect_coding:
2504 while (src < src_end)
2505 {
2506 c = *src;
2507 if (c >= 0x80
2508 || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2509 break;
2510 src++;
2511 }
2512
2513 if (src >= src_end)
2514 /* We found nothing other than ASCII. There's nothing to do. */
2515 return CODING_CATEGORY_MASK_ANY;
2516
2517 /* The text seems to be encoded in some multilingual coding system.
2518 Now, try to find in which coding system the text is encoded. */
2519 if (c < 0x80)
2520 {
2521 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2522 /* C is an ISO2022 specific control code of C0. */
2523 mask = detect_coding_iso2022 (src, src_end);
2524 src++;
2525 if (mask == CODING_CATEGORY_MASK_ANY)
2526 /* No valid ISO2022 code follows C. Try again. */
2527 goto label_loop_detect_coding;
2528 }
2529 else if (c == ISO_CODE_SS2 || c == ISO_CODE_SS3 || c == ISO_CODE_CSI)
2530 /* C is an ISO2022 specific control code of C1,
2531 or the first byte of SJIS's 2-byte character code,
2532 or a leading code of Emacs. */
2533 mask = (detect_coding_iso2022 (src, src_end)
2534 | detect_coding_sjis (src, src_end)
2535 | detect_coding_emacs_mule (src, src_end));
2536
2537 else if (c < 0xA0)
2538 /* C is the first byte of SJIS character code,
2539 or a leading-code of Emacs. */
2540 mask = (detect_coding_sjis (src, src_end)
2541 | detect_coding_emacs_mule (src, src_end));
2542
2543 else
2544 /* C is a character of ISO2022 in graphic plane right,
2545 or a SJIS's 1-byte character code (i.e. JISX0201),
2546 or the first byte of BIG5's 2-byte code. */
2547 mask = (detect_coding_iso2022 (src, src_end)
2548 | detect_coding_sjis (src, src_end)
2549 | detect_coding_big5 (src, src_end));
2550
2551 return mask;
2552 }
2553
2554 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2555 The information of the detected coding system is set in CODING. */
2556
2557 void
2558 detect_coding (coding, src, src_bytes)
2559 struct coding_system *coding;
2560 unsigned char *src;
2561 int src_bytes;
2562 {
2563 int mask = detect_coding_mask (src, src_bytes);
2564 int idx;
2565
2566 if (mask == CODING_CATEGORY_MASK_ANY)
2567 /* We found nothing other than ASCII. There's nothing to do. */
2568 return;
2569
2570 if (!mask)
2571 /* The source text seems to be encoded in unknown coding system.
2572 Emacs regards the category of such a kind of coding system as
2573 `coding-category-binary'. We assume that a user has assigned
2574 an appropriate coding system for a `coding-category-binary'. */
2575 idx = CODING_CATEGORY_IDX_BINARY;
2576 else
2577 {
2578 /* We found some plausible coding systems. Let's use a coding
2579 system of the highest priority. */
2580 Lisp_Object val = Vcoding_category_list;
2581
2582 if (CONSP (val))
2583 while (!NILP (val))
2584 {
2585 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2586 if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2587 break;
2588 val = XCONS (val)->cdr;
2589 }
2590 else
2591 val = Qnil;
2592
2593 if (NILP (val))
2594 {
2595 /* For unknown reason, `Vcoding_category_list' contains none
2596 of found categories. Let's use any of them. */
2597 for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2598 if (mask & (1 << idx))
2599 break;
2600 }
2601 }
2602 setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2603 }
2604
2605 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2606 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2607 CODING_EOL_CR, and CODING_EOL_UNDECIDED. */
2608
2609 int
2610 detect_eol_type (src, src_bytes)
2611 unsigned char *src;
2612 int src_bytes;
2613 {
2614 unsigned char *src_end = src + src_bytes;
2615 unsigned char c;
2616
2617 while (src < src_end)
2618 {
2619 c = *src++;
2620 if (c == '\n')
2621 return CODING_EOL_LF;
2622 else if (c == '\r')
2623 {
2624 if (src < src_end && *src == '\n')
2625 return CODING_EOL_CRLF;
2626 else
2627 return CODING_EOL_CR;
2628 }
2629 }
2630 return CODING_EOL_UNDECIDED;
2631 }
2632
2633 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2634 is encoded. If it detects an appropriate format of end-of-line, it
2635 sets the information in *CODING. */
2636
2637 void
2638 detect_eol (coding, src, src_bytes)
2639 struct coding_system *coding;
2640 unsigned char *src;
2641 int src_bytes;
2642 {
2643 Lisp_Object val;
2644 int eol_type = detect_eol_type (src, src_bytes);
2645
2646 if (eol_type == CODING_EOL_UNDECIDED)
2647 /* We found no end-of-line in the source text. */
2648 return;
2649
2650 val = Fget (coding->symbol, Qeol_type);
2651 if (VECTORP (val) && XVECTOR (val)->size == 3)
2652 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2653 }
2654
2655 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
2656 decoding, it may detect coding system and format of end-of-line if
2657 those are not yet decided. */
2658
2659 int
2660 decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2661 struct coding_system *coding;
2662 unsigned char *source, *destination;
2663 int src_bytes, dst_bytes;
2664 int *consumed;
2665 {
2666 int produced;
2667
2668 if (src_bytes <= 0)
2669 {
2670 *consumed = 0;
2671 return 0;
2672 }
2673
2674 if (coding->type == coding_type_undecided)
2675 detect_coding (coding, source, src_bytes);
2676
2677 if (coding->eol_type == CODING_EOL_UNDECIDED)
2678 detect_eol (coding, source, src_bytes);
2679
2680 coding->carryover_size = 0;
2681 switch (coding->type)
2682 {
2683 case coding_type_no_conversion:
2684 label_no_conversion:
2685 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2686 bcopy (source, destination, produced);
2687 *consumed = produced;
2688 break;
2689
2690 case coding_type_emacs_mule:
2691 case coding_type_undecided:
2692 if (coding->eol_type == CODING_EOL_LF
2693 || coding->eol_type == CODING_EOL_UNDECIDED)
2694 goto label_no_conversion;
2695 produced = decode_eol (coding, source, destination,
2696 src_bytes, dst_bytes, consumed);
2697 break;
2698
2699 case coding_type_sjis:
2700 produced = decode_coding_sjis_big5 (coding, source, destination,
2701 src_bytes, dst_bytes, consumed,
2702 1);
2703 break;
2704
2705 case coding_type_iso2022:
2706 produced = decode_coding_iso2022 (coding, source, destination,
2707 src_bytes, dst_bytes, consumed);
2708 break;
2709
2710 case coding_type_big5:
2711 produced = decode_coding_sjis_big5 (coding, source, destination,
2712 src_bytes, dst_bytes, consumed,
2713 0);
2714 break;
2715
2716 case coding_type_ccl:
2717 produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2718 src_bytes, dst_bytes, consumed);
2719 break;
2720 }
2721
2722 return produced;
2723 }
2724
2725 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
2726
2727 int
2728 encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2729 struct coding_system *coding;
2730 unsigned char *source, *destination;
2731 int src_bytes, dst_bytes;
2732 int *consumed;
2733 {
2734 int produced;
2735
2736 coding->carryover_size = 0;
2737 switch (coding->type)
2738 {
2739 case coding_type_no_conversion:
2740 label_no_conversion:
2741 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2742 if (produced > 0)
2743 {
2744 bcopy (source, destination, produced);
2745 if (coding->selective)
2746 {
2747 unsigned char *p = destination, *pend = destination + produced;
2748 while (p < pend)
2749 if (*p++ == '\015') p[-1] = '\n';
2750 }
2751 }
2752 *consumed = produced;
2753 break;
2754
2755 case coding_type_emacs_mule:
2756 case coding_type_undecided:
2757 if (coding->eol_type == CODING_EOL_LF
2758 || coding->eol_type == CODING_EOL_UNDECIDED)
2759 goto label_no_conversion;
2760 produced = encode_eol (coding, source, destination,
2761 src_bytes, dst_bytes, consumed);
2762 break;
2763
2764 case coding_type_sjis:
2765 produced = encode_coding_sjis_big5 (coding, source, destination,
2766 src_bytes, dst_bytes, consumed,
2767 1);
2768 break;
2769
2770 case coding_type_iso2022:
2771 produced = encode_coding_iso2022 (coding, source, destination,
2772 src_bytes, dst_bytes, consumed);
2773 break;
2774
2775 case coding_type_big5:
2776 produced = encode_coding_sjis_big5 (coding, source, destination,
2777 src_bytes, dst_bytes, consumed,
2778 0);
2779 break;
2780
2781 case coding_type_ccl:
2782 produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2783 src_bytes, dst_bytes, consumed);
2784 break;
2785 }
2786
2787 return produced;
2788 }
2789
2790 #define CONVERSION_BUFFER_EXTRA_ROOM 256
2791
2792 /* Return maximum size (bytes) of a buffer enough for decoding
2793 SRC_BYTES of text encoded in CODING. */
2794
2795 int
2796 decoding_buffer_size (coding, src_bytes)
2797 struct coding_system *coding;
2798 int src_bytes;
2799 {
2800 int magnification;
2801
2802 if (coding->type == coding_type_iso2022)
2803 magnification = 3;
2804 else if (coding->type == coding_type_ccl)
2805 magnification = coding->spec.ccl.decoder.buf_magnification;
2806 else
2807 magnification = 2;
2808
2809 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2810 }
2811
2812 /* Return maximum size (bytes) of a buffer enough for encoding
2813 SRC_BYTES of text to CODING. */
2814
2815 int
2816 encoding_buffer_size (coding, src_bytes)
2817 struct coding_system *coding;
2818 int src_bytes;
2819 {
2820 int magnification;
2821
2822 if (coding->type == coding_type_ccl)
2823 magnification = coding->spec.ccl.encoder.buf_magnification;
2824 else
2825 magnification = 3;
2826
2827 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2828 }
2829
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* The shared work buffer handed out by get_conversion_buffer, and
   its current size in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a SIZE bytes of buffer to be used for encoding
   or decoding.  Sufficient memory is allocated automatically: when
   the current buffer is too small it is replaced by a new one whose
   size is the current size doubled until it reaches SIZE, and never
   less than MINIMUM_CONVERSION_BUFFER_SIZE.  The old contents are
   not preserved.  If we run out of memory, return NULL and keep the
   old buffer.
   NOTE(review): xmalloc is defined elsewhere; if it never returns
   NULL the failure branch below is simply never taken.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size * 2;

      /* Doubling a size of 0 (buffer not allocated yet) would never
         reach SIZE, so start from the minimum size in that case.  */
      if (real_size < MINIMUM_CONVERSION_BUFFER_SIZE)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;
      while (real_size < size)
        real_size *= 2;

      buf = (char *) xmalloc (real_size);
      if (!buf)
        /* Leave the old buffer and its recorded size consistent.  */
        return (char *) 0;
      if (conversion_buffer)
        xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
2858
2859 \f
2860 #ifdef emacs
2861 /*** 7. Emacs Lisp library functions ***/
2862
2863 DEFUN ("coding-system-spec", Fcoding_system_spec, Scoding_system_spec,
2864 1, 1, 0,
2865 "Return coding-spec of CODING-SYSTEM.\n\
2866 If CODING-SYSTEM is not a valid coding-system, return nil.")
2867 (obj)
2868 Lisp_Object obj;
2869 {
2870 while (SYMBOLP (obj) && !NILP (obj))
2871 obj = Fget (obj, Qcoding_system);
2872 return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
2873 ? Qnil : obj);
2874 }
2875
2876 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
2877 "Return t if OBJECT is nil or a coding-system.\n\
2878 See document of make-coding-system for coding-system object.")
2879 (obj)
2880 Lisp_Object obj;
2881 {
2882 return ((NILP (obj) || !NILP (Fcoding_system_spec (obj))) ? Qt : Qnil);
2883 }
2884
2885 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
2886 Sread_non_nil_coding_system, 1, 1, 0,
2887 "Read a coding system from the minibuffer, prompting with string PROMPT.")
2888 (prompt)
2889 Lisp_Object prompt;
2890 {
2891 Lisp_Object val;
2892 do
2893 {
2894 val = Fcompleting_read (prompt, Vobarray, Qcoding_system_spec,
2895 Qt, Qnil, Qnil, Qnil);
2896 }
2897 while (XSTRING (val)->size == 0);
2898 return (Fintern (val, Qnil));
2899 }
2900
2901 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
2902 "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
2903 (prompt)
2904 Lisp_Object prompt;
2905 {
2906 Lisp_Object val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
2907 Qt, Qnil, Qnil, Qnil);
2908 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
2909 }
2910
2911 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
2912 1, 1, 0,
2913 "Check validity of CODING-SYSTEM.\n\
2914 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
2915 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
2916 The value of property should be a vector of length 5.")
2917 (coding_system)
2918 Lisp_Object coding_system;
2919 {
2920 CHECK_SYMBOL (coding_system, 0);
2921 if (!NILP (Fcoding_system_p (coding_system)))
2922 return coding_system;
2923 while (1)
2924 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
2925 }
2926
2927 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
2928 2, 2, 0,
2929 "Detect coding-system of the text in the region between START and END.\n\
2930 Return a list of possible coding-systems ordered by priority.\n\
2931 If only ASCII characters are found, it returns `undecided'\n\
2932 or its subsidiary coding-system according to a detected end-of-line format.")
2933 (b, e)
2934 Lisp_Object b, e;
2935 {
2936 int coding_mask, eol_type;
2937 Lisp_Object val;
2938 int beg, end;
2939
2940 validate_region (&b, &e);
2941 beg = XINT (b), end = XINT (e);
2942 if (beg < GPT && end >= GPT) move_gap (end);
2943
2944 coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
2945 eol_type = detect_eol_type (POS_ADDR (beg), end - beg);
2946
2947 if (coding_mask == CODING_CATEGORY_MASK_ANY)
2948 {
2949 val = intern ("undecided");
2950 if (eol_type != CODING_EOL_UNDECIDED)
2951 {
2952 Lisp_Object val2 = Fget (val, Qeol_type);
2953 if (VECTORP (val2))
2954 val = XVECTOR (val2)->contents[eol_type];
2955 }
2956 }
2957 else
2958 {
2959 Lisp_Object val2;
2960
2961 /* At first, gather possible coding-systems in VAL in a reverse
2962 order. */
2963 val = Qnil;
2964 for (val2 = Vcoding_category_list;
2965 !NILP (val2);
2966 val2 = XCONS (val2)->cdr)
2967 {
2968 int idx
2969 = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
2970 if (coding_mask & (1 << idx))
2971 val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
2972 }
2973
2974 /* Then, change the order of the list, while getting subsidiary
2975 coding-systems. */
2976 val2 = val;
2977 val = Qnil;
2978 for (; !NILP (val2); val2 = XCONS (val2)->cdr)
2979 {
2980 if (eol_type == CODING_EOL_UNDECIDED)
2981 val = Fcons (XCONS (val2)->car, val);
2982 else
2983 {
2984 Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
2985 if (VECTORP (val3))
2986 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
2987 else
2988 val = Fcons (XCONS (val2)->car, val);
2989 }
2990 }
2991 }
2992
2993 return val;
2994 }
2995
2996 /* Scan text in the region between *BEGP and *ENDP, skip characters
2997 which we never have to encode to (iff ENCODEP is 1) or decode from
2998 coding system CODING at the head and tail, then set BEGP and ENDP
2999 to the addresses of start and end of the text we actually convert. */
3000
3001 void
3002 shrink_conversion_area (begp, endp, coding, encodep)
3003 unsigned char **begp, **endp;
3004 struct coding_system *coding;
3005 int encodep;
3006 {
3007 register unsigned char *beg_addr = *begp, *end_addr = *endp;
3008
3009 if (coding->eol_type != CODING_EOL_LF
3010 && coding->eol_type != CODING_EOL_UNDECIDED)
3011 /* Since we anyway have to convert end-of-line format, it is not
3012 worth skipping at most 100 bytes or so. */
3013 return;
3014
3015 if (encodep) /* for encoding */
3016 {
3017 switch (coding->type)
3018 {
3019 case coding_type_no_conversion:
3020 case coding_type_emacs_mule:
3021 case coding_type_undecided:
3022 /* We need no conversion. */
3023 *begp = *endp;
3024 return;
3025 case coding_type_ccl:
3026 /* We can't skip any data. */
3027 return;
3028 case coding_type_iso2022:
3029 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3030 {
3031 unsigned char *bol = beg_addr;
3032 while (beg_addr < end_addr && *beg_addr < 0x80)
3033 {
3034 beg_addr++;
3035 if (*(beg_addr - 1) == '\n')
3036 bol = beg_addr;
3037 }
3038 beg_addr = bol;
3039 goto label_skip_tail;
3040 }
3041 /* fall down ... */
3042 default:
3043 /* We can skip all ASCII characters at the head and tail. */
3044 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3045 label_skip_tail:
3046 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3047 break;
3048 }
3049 }
3050 else /* for decoding */
3051 {
3052 switch (coding->type)
3053 {
3054 case coding_type_no_conversion:
3055 /* We need no conversion. */
3056 *begp = *endp;
3057 return;
3058 case coding_type_emacs_mule:
3059 if (coding->eol_type == CODING_EOL_LF)
3060 {
3061 /* We need no conversion. */
3062 *begp = *endp;
3063 return;
3064 }
3065 /* We can skip all but carriage-return. */
3066 while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
3067 while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
3068 break;
3069 case coding_type_sjis:
3070 case coding_type_big5:
3071 /* We can skip all ASCII characters at the head. */
3072 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3073 /* We can skip all ASCII characters at the tail except for
3074 the second byte of SJIS or BIG5 code. */
3075 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3076 if (end_addr != *endp)
3077 end_addr++;
3078 break;
3079 case coding_type_ccl:
3080 /* We can't skip any data. */
3081 return;
3082 default: /* i.e. case coding_type_iso2022: */
3083 {
3084 unsigned char c;
3085
3086 /* We can skip all ASCII characters except for a few
3087 control codes at the head. */
3088 while (beg_addr < end_addr && (c = *beg_addr) < 0x80
3089 && c != ISO_CODE_CR && c != ISO_CODE_SO
3090 && c != ISO_CODE_SI && c != ISO_CODE_ESC)
3091 beg_addr++;
3092 }
3093 break;
3094 }
3095 }
3096 *begp = beg_addr;
3097 *endp = end_addr;
3098 return;
3099 }
3100
3101 /* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3102 text between B and E. B and E are buffer position. */
3103
3104 Lisp_Object
3105 code_convert_region (b, e, coding, encodep)
3106 Lisp_Object b, e;
3107 struct coding_system *coding;
3108 int encodep;
3109 {
3110 int beg, end, len, consumed, produced;
3111 char *buf;
3112 unsigned char *begp, *endp;
3113 int pos = PT;
3114
3115 validate_region (&b, &e);
3116 beg = XINT (b), end = XINT (e);
3117 if (beg < GPT && end >= GPT)
3118 move_gap (end);
3119
3120 if (encodep && !NILP (coding->pre_write_conversion))
3121 {
3122 /* We must call a pre-conversion function which may put a new
3123 text to be converted in a new buffer. */
3124 struct buffer *old = current_buffer, *new;
3125
3126 TEMP_SET_PT (beg);
3127 call2 (coding->pre_write_conversion, b, e);
3128 if (old != current_buffer)
3129 {
3130 /* Replace the original text by the text just generated. */
3131 len = ZV - BEGV;
3132 new = current_buffer;
3133 set_buffer_internal (old);
3134 del_range (beg, end);
3135 insert_from_buffer (new, 1, len, 0);
3136 end = beg + len;
3137 }
3138 }
3139
3140 /* We may be able to shrink the conversion region. */
3141 begp = POS_ADDR (beg); endp = begp + (end - beg);
3142 shrink_conversion_area (&begp, &endp, coding, encodep);
3143
3144 if (begp == endp)
3145 /* We need no conversion. */
3146 len = end - beg;
3147 else
3148 {
3149 beg += begp - POS_ADDR (beg);
3150 end = beg + (endp - begp);
3151
3152 if (encodep)
3153 len = encoding_buffer_size (coding, end - beg);
3154 else
3155 len = decoding_buffer_size (coding, end - beg);
3156 buf = get_conversion_buffer (len);
3157
3158 coding->last_block = 1;
3159 produced = (encodep
3160 ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3161 &consumed)
3162 : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3163 &consumed));
3164
3165 len = produced + (beg - XINT (b)) + (XINT (e) - end);
3166
3167 TEMP_SET_PT (beg);
3168 insert (buf, produced);
3169 del_range (PT, PT + end - beg);
3170 if (pos >= end)
3171 pos = PT + (pos - end);
3172 else if (pos > beg)
3173 pos = beg;
3174 TEMP_SET_PT (pos);
3175 }
3176
3177 if (!encodep && !NILP (coding->post_read_conversion))
3178 {
3179 /* We must call a post-conversion function which may alter
3180 the text just converted. */
3181 Lisp_Object insval;
3182
3183 beg = XINT (b);
3184 TEMP_SET_PT (beg);
3185 insval = call1 (coding->post_read_conversion, make_number (len));
3186 CHECK_NUMBER (insval, 0);
3187 len = XINT (insval);
3188 }
3189
3190 return make_number (len);
3191 }
3192
3193 Lisp_Object
3194 code_convert_string (str, coding, encodep, nocopy)
3195 Lisp_Object str, nocopy;
3196 struct coding_system *coding;
3197 int encodep;
3198 {
3199 int len, consumed, produced;
3200 char *buf;
3201 unsigned char *begp, *endp;
3202 int head_skip, tail_skip;
3203 struct gcpro gcpro1;
3204
3205 if (encodep && !NILP (coding->pre_write_conversion)
3206 || !encodep && !NILP (coding->post_read_conversion))
3207 {
3208 /* Since we have to call Lisp functions which assume target text
3209 is in a buffer, after setting a temporary buffer, call
3210 code_convert_region. */
3211 int count = specpdl_ptr - specpdl;
3212 int len = XSTRING (str)->size;
3213 Lisp_Object result;
3214 struct buffer *old = current_buffer;
3215
3216 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3217 temp_output_buffer_setup (" *code-converting-work*");
3218 set_buffer_internal (XBUFFER (Vstandard_output));
3219 insert_from_string (str, 0, len, 0);
3220 code_convert_region (make_number (BEGV), make_number (ZV),
3221 coding, encodep);
3222 result = make_buffer_string (BEGV, ZV, 0);
3223 set_buffer_internal (old);
3224 return unbind_to (count, result);
3225 }
3226
3227 /* We may be able to shrink the conversion region. */
3228 begp = XSTRING (str)->data;
3229 endp = begp + XSTRING (str)->size;
3230 shrink_conversion_area (&begp, &endp, coding, encodep);
3231
3232 if (begp == endp)
3233 /* We need no conversion. */
3234 return (NILP (nocopy) ? Fcopy_sequence (str) : str);
3235
3236 head_skip = begp - XSTRING (str)->data;
3237 tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3238
3239 GCPRO1 (str);
3240
3241 if (encodep)
3242 len = encoding_buffer_size (coding, endp - begp);
3243 else
3244 len = decoding_buffer_size (coding, endp - begp);
3245 buf = get_conversion_buffer (len + head_skip + tail_skip);
3246
3247 bcopy (XSTRING (str)->data, buf, head_skip);
3248 coding->last_block = 1;
3249 produced = (encodep
3250 ? encode_coding (coding, XSTRING (str)->data + head_skip,
3251 buf + head_skip, endp - begp, len, &consumed)
3252 : decode_coding (coding, XSTRING (str)->data + head_skip,
3253 buf + head_skip, endp - begp, len, &consumed));
3254 bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3255 buf + head_skip + produced,
3256 tail_skip);
3257
3258 UNGCPRO;
3259
3260 return make_string (buf, head_skip + produced + tail_skip);
3261 }
3262
3263 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
3264 3, 3, "r\nzCoding system: ",
3265 "Decode current region by specified coding system.\n\
3266 When called from a program, takes three arguments:\n\
3267 START, END, and CODING-SYSTEM. START END are buffer positions.\n\
3268 Return length of decoded text.")
3269 (b, e, coding_system)
3270 Lisp_Object b, e, coding_system;
3271 {
3272 struct coding_system coding;
3273
3274 CHECK_NUMBER_COERCE_MARKER (b, 0);
3275 CHECK_NUMBER_COERCE_MARKER (e, 1);
3276 CHECK_SYMBOL (coding_system, 2);
3277
3278 if (NILP (coding_system))
3279 return make_number (XFASTINT (e) - XFASTINT (b));
3280 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3281 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3282
3283 return code_convert_region (b, e, &coding, 0);
3284 }
3285
3286 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
3287 3, 3, "r\nzCoding system: ",
3288 "Encode current region by specified coding system.\n\
3289 When called from a program, takes three arguments:\n\
3290 START, END, and CODING-SYSTEM. START END are buffer positions.\n\
3291 Return length of encoded text.")
3292 (b, e, coding_system)
3293 Lisp_Object b, e, coding_system;
3294 {
3295 struct coding_system coding;
3296
3297 CHECK_NUMBER_COERCE_MARKER (b, 0);
3298 CHECK_NUMBER_COERCE_MARKER (e, 1);
3299 CHECK_SYMBOL (coding_system, 2);
3300
3301 if (NILP (coding_system))
3302 return make_number (XFASTINT (e) - XFASTINT (b));
3303 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3304 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3305
3306 return code_convert_region (b, e, &coding, 1);
3307 }
3308
3309 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
3310 2, 3, 0,
3311 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3312 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3313 of decoding.")
3314 (string, coding_system, nocopy)
3315 Lisp_Object string, coding_system, nocopy;
3316 {
3317 struct coding_system coding;
3318
3319 CHECK_STRING (string, 0);
3320 CHECK_SYMBOL (coding_system, 1);
3321
3322 if (NILP (coding_system))
3323 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3324 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3325 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3326
3327 return code_convert_string (string, &coding, 0, nocopy);
3328 }
3329
3330 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
3331 2, 3, 0,
3332 "Encode STRING to CODING-SYSTEM, and return the result.\n\
3333 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3334 of encoding.")
3335 (string, coding_system, nocopy)
3336 Lisp_Object string, coding_system, nocopy;
3337 {
3338 struct coding_system coding;
3339
3340 CHECK_STRING (string, 0);
3341 CHECK_SYMBOL (coding_system, 1);
3342
3343 if (NILP (coding_system))
3344 return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3345 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3346 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3347
3348 return code_convert_string (string, &coding, 1, nocopy);
3349 }
3350
3351 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
3352 "Decode a JISX0208 character of shift-jis encoding.\n\
3353 CODE is the character code in SJIS.\n\
3354 Return the corresponding character.")
3355 (code)
3356 Lisp_Object code;
3357 {
3358 unsigned char c1, c2, s1, s2;
3359 Lisp_Object val;
3360
3361 CHECK_NUMBER (code, 0);
3362 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3363 DECODE_SJIS (s1, s2, c1, c2);
3364 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3365 return val;
3366 }
3367
3368 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3369 "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3370 Return the corresponding character code in SJIS.")
3371 (ch)
3372 Lisp_Object ch;
3373 {
3374 int charset, c1, c2, s1, s2;
3375 Lisp_Object val;
3376
3377 CHECK_NUMBER (ch, 0);
3378 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3379 if (charset == charset_jisx0208)
3380 {
3381 ENCODE_SJIS (c1, c2, s1, s2);
3382 XSETFASTINT (val, (s1 << 8) | s2);
3383 }
3384 else
3385 XSETFASTINT (val, 0);
3386 return val;
3387 }
3388
3389 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3390 "Decode a Big5 character CODE of BIG5 coding-system.\n\
3391 CODE is the character code in BIG5.\n\
3392 Return the corresponding character.")
3393 (code)
3394 Lisp_Object code;
3395 {
3396 int charset;
3397 unsigned char b1, b2, c1, c2;
3398 Lisp_Object val;
3399
3400 CHECK_NUMBER (code, 0);
3401 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3402 DECODE_BIG5 (b1, b2, charset, c1, c2);
3403 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3404 return val;
3405 }
3406
3407 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3408 "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3409 Return the corresponding character code in Big5.")
3410 (ch)
3411 Lisp_Object ch;
3412 {
3413 int charset, c1, c2, b1, b2;
3414 Lisp_Object val;
3415
3416 CHECK_NUMBER (ch, 0);
3417 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3418 if (charset == charset_big5_1 || charset == charset_big5_2)
3419 {
3420 ENCODE_BIG5 (charset, c1, c2, b1, b2);
3421 XSETFASTINT (val, (b1 << 8) | b2);
3422 }
3423 else
3424 XSETFASTINT (val, 0);
3425 return val;
3426 }
3427
3428 DEFUN ("set-terminal-coding-system-internal",
3429 Fset_terminal_coding_system_internal,
3430 Sset_terminal_coding_system_internal, 1, 1, 0, "")
3431 (coding_system)
3432 Lisp_Object coding_system;
3433 {
3434 CHECK_SYMBOL (coding_system, 0);
3435 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
3436 return Qnil;
3437 }
3438
3439 DEFUN ("terminal-coding-system",
3440 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3441 "Return coding-system of your terminal.")
3442 ()
3443 {
3444 return terminal_coding.symbol;
3445 }
3446
3447 DEFUN ("set-keyboard-coding-system-internal",
3448 Fset_keyboard_coding_system_internal,
3449 Sset_keyboard_coding_system_internal, 1, 1, 0, "")
3450 (coding_system)
3451 Lisp_Object coding_system;
3452 {
3453 CHECK_SYMBOL (coding_system, 0);
3454 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3455 return Qnil;
3456 }
3457
3458 DEFUN ("keyboard-coding-system",
3459 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3460 "Return coding-system of what is sent from terminal keyboard.")
3461 ()
3462 {
3463 return keyboard_coding.symbol;
3464 }
3465
3466 \f
3467 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
3468 Sfind_operation_coding_system, 1, MANY, 0,
3469 "Choose a coding system for an operation based on the target name.\n\
3470 The value names a pair of coding systems: (ENCODING-SYSTEM DECODING-SYSTEM).\n\
3471 ENCODING-SYSTEM is the coding system to use for encoding\n\
3472 \(in case OPERATION does encoding), and DECODING-SYSTEM is the coding system\n\
3473 for decoding (in case OPERATION does decoding).\n\
3474 \n\
3475 The first argument OPERATION specifies an I/O primitive:\n\
3476 For file I/O, `insert-file-contents' or `write-region'.\n\
3477 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3478 For network I/O, `open-network-stream'.\n\
3479 \n\
3480 The remaining arguments should be the same arguments that were passed\n\
3481 to the primitive. Depending on which primitive, one of those arguments\n\
3482 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
3483 whichever argument specifies the file name is TARGET.\n\
3484 \n\
3485 TARGET has a meaning which depends on OPERATION:\n\
3486 For file I/O, TARGET is a file name.\n\
3487 For process I/O, TARGET is a process name.\n\
3488 For network I/O, TARGET is a service name or a port number\n\
3489 \n\
3490 This function looks up what specified for TARGET in,\n\
3491 `file-coding-system-alist', `process-coding-system-alist',\n\
3492 or `network-coding-system-alist' depending on OPERATION.\n\
3493 They may specify a coding system, a cons of coding systems,\n\
3494 or a function symbol to call.\n\
3495 In the last case, we call the function with one argument,\n\
3496 which is a list of all the arguments given to `find-coding-system'.")
3497 (nargs, args)
3498 int nargs;
3499 Lisp_Object *args;
3500 {
3501 Lisp_Object operation, target_idx, target, val;
3502 register Lisp_Object chain;
3503
3504 if (nargs < 2)
3505 error ("Too few arguments");
3506 operation = args[0];
3507 if (!SYMBOLP (operation)
3508 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3509 error ("Invalid first arguement");
3510 if (nargs < 1 + XINT (target_idx))
3511 error ("Too few arguments for operation: %s",
3512 XSYMBOL (operation)->name->data);
3513 target = args[XINT (target_idx) + 1];
3514 if (!(STRINGP (target)
3515 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3516 error ("Invalid %dth argument", XINT (target_idx) + 1);
3517
3518 chain = (operation == Qinsert_file_contents || operation == Qwrite_region
3519 ? Vfile_coding_system_alist
3520 : (operation == Qopen_network_stream
3521 ? Vnetwork_coding_system_alist
3522 : Vprocess_coding_system_alist));
3523 if (NILP (chain))
3524 return Qnil;
3525
3526 for (; CONSP (chain); chain = XCONS (chain)->cdr)
3527 {
3528 Lisp_Object elt = XCONS (chain)->car;
3529
3530 if (CONSP (elt)
3531 && ((STRINGP (target)
3532 && STRINGP (XCONS (elt)->car)
3533 && fast_string_match (XCONS (elt)->car, target) >= 0)
3534 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
3535 {
3536 val = XCONS (elt)->cdr;
3537 if (CONSP (val))
3538 return val;
3539 if (! SYMBOLP (val))
3540 return Qnil;
3541 if (! NILP (Fcoding_system_p (val)))
3542 return Fcons (val, val);
3543 if (!NILP (Fboundp (val)))
3544 return call2 (val, Flist (nargs, args));
3545 return Qnil;
3546 }
3547 }
3548 return Qnil;
3549 }
3550
3551 #endif /* emacs */
3552
3553 \f
3554 /*** 8. Post-amble ***/
3555
3556 init_coding_once ()
3557 {
3558 int i;
3559
3560 /* Emacs' internal format specific initialize routine. */
3561 for (i = 0; i <= 0x20; i++)
3562 emacs_code_class[i] = EMACS_control_code;
3563 emacs_code_class[0x0A] = EMACS_linefeed_code;
3564 emacs_code_class[0x0D] = EMACS_carriage_return_code;
3565 for (i = 0x21 ; i < 0x7F; i++)
3566 emacs_code_class[i] = EMACS_ascii_code;
3567 emacs_code_class[0x7F] = EMACS_control_code;
3568 emacs_code_class[0x80] = EMACS_leading_code_composition;
3569 for (i = 0x81; i < 0xFF; i++)
3570 emacs_code_class[i] = EMACS_invalid_code;
3571 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3572 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3573 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3574 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3575
3576 /* ISO2022 specific initialize routine. */
3577 for (i = 0; i < 0x20; i++)
3578 iso_code_class[i] = ISO_control_code;
3579 for (i = 0x21; i < 0x7F; i++)
3580 iso_code_class[i] = ISO_graphic_plane_0;
3581 for (i = 0x80; i < 0xA0; i++)
3582 iso_code_class[i] = ISO_control_code;
3583 for (i = 0xA1; i < 0xFF; i++)
3584 iso_code_class[i] = ISO_graphic_plane_1;
3585 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3586 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3587 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3588 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3589 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3590 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3591 iso_code_class[ISO_CODE_ESC] = ISO_escape;
3592 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3593 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3594 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3595
3596 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3597 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3598
3599 setup_coding_system (Qnil, &keyboard_coding);
3600 setup_coding_system (Qnil, &terminal_coding);
3601 }
3602
3603 #ifdef emacs
3604
3605 syms_of_coding ()
3606 {
3607 Qtarget_idx = intern ("target-idx");
3608 staticpro (&Qtarget_idx);
3609
3610 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3611 Fput (Qwrite_region, Qtarget_idx, make_number (2));
3612
3613 Qcall_process = intern ("call-process");
3614 staticpro (&Qcall_process);
3615 Fput (Qcall_process, Qtarget_idx, make_number (0));
3616
3617 Qcall_process_region = intern ("call-process-region");
3618 staticpro (&Qcall_process_region);
3619 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3620
3621 Qstart_process = intern ("start-process");
3622 staticpro (&Qstart_process);
3623 Fput (Qstart_process, Qtarget_idx, make_number (2));
3624
3625 Qopen_network_stream = intern ("open-network-stream");
3626 staticpro (&Qopen_network_stream);
3627 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3628
3629 Qcoding_system = intern ("coding-system");
3630 staticpro (&Qcoding_system);
3631
3632 Qeol_type = intern ("eol-type");
3633 staticpro (&Qeol_type);
3634
3635 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3636 staticpro (&Qbuffer_file_coding_system);
3637
3638 Qpost_read_conversion = intern ("post-read-conversion");
3639 staticpro (&Qpost_read_conversion);
3640
3641 Qpre_write_conversion = intern ("pre-write-conversion");
3642 staticpro (&Qpre_write_conversion);
3643
3644 Qcoding_system_spec = intern ("coding-system-spec");
3645 staticpro (&Qcoding_system_spec);
3646
3647 Qcoding_system_p = intern ("coding-system-p");
3648 staticpro (&Qcoding_system_p);
3649
3650 Qcoding_system_error = intern ("coding-system-error");
3651 staticpro (&Qcoding_system_error);
3652
3653 Fput (Qcoding_system_error, Qerror_conditions,
3654 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3655 Fput (Qcoding_system_error, Qerror_message,
3656 build_string ("Coding-system error"));
3657
3658 Qcoding_category_index = intern ("coding-category-index");
3659 staticpro (&Qcoding_category_index);
3660
3661 {
3662 int i;
3663 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3664 {
3665 coding_category_table[i] = intern (coding_category_name[i]);
3666 staticpro (&coding_category_table[i]);
3667 Fput (coding_category_table[i], Qcoding_category_index,
3668 make_number (i));
3669 }
3670 }
3671
3672 Qcharacter_unification_table = intern ("character-unification-table");
3673 staticpro (&Qcharacter_unification_table);
3674 Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
3675 make_number (0));
3676
3677 Qcharacter_unification_table_for_decode
3678 = intern ("character-unification-table-for-decode");
3679 staticpro (&Qcharacter_unification_table_for_decode);
3680
3681 Qcharacter_unification_table_for_encode
3682 = intern ("character-unification-table-for-encode");
3683 staticpro (&Qcharacter_unification_table_for_encode);
3684
3685 defsubr (&Scoding_system_spec);
3686 defsubr (&Scoding_system_p);
3687 defsubr (&Sread_coding_system);
3688 defsubr (&Sread_non_nil_coding_system);
3689 defsubr (&Scheck_coding_system);
3690 defsubr (&Sdetect_coding_region);
3691 defsubr (&Sdecode_coding_region);
3692 defsubr (&Sencode_coding_region);
3693 defsubr (&Sdecode_coding_string);
3694 defsubr (&Sencode_coding_string);
3695 defsubr (&Sdecode_sjis_char);
3696 defsubr (&Sencode_sjis_char);
3697 defsubr (&Sdecode_big5_char);
3698 defsubr (&Sencode_big5_char);
3699 defsubr (&Sset_terminal_coding_system_internal);
3700 defsubr (&Sterminal_coding_system);
3701 defsubr (&Sset_keyboard_coding_system_internal);
3702 defsubr (&Skeyboard_coding_system);
3703 defsubr (&Sfind_operation_coding_system);
3704
3705 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3706 "List of coding-categories (symbols) ordered by priority.");
3707 {
3708 int i;
3709
3710 Vcoding_category_list = Qnil;
3711 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3712 Vcoding_category_list
3713 = Fcons (coding_category_table[i], Vcoding_category_list);
3714 }
3715
3716 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
3717 "A variable of internal use only.\n\
3718 If the value is a coding system, it is used for decoding on read operation.\n\
3719 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3720 Vcoding_system_for_read = Qnil;
3721
3722 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
3723 "A variable of internal use only.\n\
3724 If the value is a coding system, it is used for encoding on write operation.\n\
3725 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3726 Vcoding_system_for_write = Qnil;
3727
3728 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
3729 "Coding-system used in the latest file or process I/O.");
3730 Vlast_coding_system_used = Qnil;
3731
3732 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
3733 "Alist to decide a coding system to use for a file I/O operation.\n\
3734 The format is ((PATTERN . VAL) ...),\n\
3735 where PATTERN is a regular expression matching a file name,\n\
3736 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3737 If VAL is a coding system, it is used for both decoding and encoding\n\
3738 the file contents.\n\
3739 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3740 and the cdr part is used for encoding.\n\
3741 If VAL is a function symbol, the function must return a coding system\n\
3742 or a cons of coding systems which are used as above.\n\
3743 \n\
3744 See also the function `find-coding-system'.");
3745 Vfile_coding_system_alist = Qnil;
3746
3747 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
3748 "Alist to decide a coding system to use for a process I/O operation.\n\
3749 The format is ((PATTERN . VAL) ...),\n\
3750 where PATTERN is a regular expression matching a program name,\n\
3751 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3752 If VAL is a coding system, it is used for both decoding what received\n\
3753 from the program and encoding what sent to the program.\n\
3754 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3755 and the cdr part is used for encoding.\n\
3756 If VAL is a function symbol, the function must return a coding system\n\
3757 or a cons of coding systems which are used as above.\n\
3758 \n\
3759 See also the function `find-coding-system'.");
3760 Vprocess_coding_system_alist = Qnil;
3761
3762 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
3763 "Alist to decide a coding system to use for a network I/O operation.\n\
3764 The format is ((PATTERN . VAL) ...),\n\
3765 where PATTERN is a regular expression matching a network service name\n\
3766 or is a port number to connect to,\n\
3767 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3768 If VAL is a coding system, it is used for both decoding what received\n\
3769 from the network stream and encoding what sent to the network stream.\n\
3770 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3771 and the cdr part is used for encoding.\n\
3772 If VAL is a function symbol, the function must return a coding system\n\
3773 or a cons of coding systems which are used as above.\n\
3774 \n\
3775 See also the function `find-coding-system'.");
3776 Vnetwork_coding_system_alist = Qnil;
3777
3778 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
3779 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
3780 eol_mnemonic_unix = ':';
3781
3782 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
3783 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
3784 eol_mnemonic_dos = '\\';
3785
3786 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
3787 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
3788 eol_mnemonic_mac = '/';
3789
3790 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
3791 "Mnemonic character indicating end-of-line format is not yet decided.");
3792 eol_mnemonic_undecided = ':';
3793
3794 DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
3795 "Non-nil means ISO 2022 encoder/decoder do character unification.");
3796 Venable_character_unification = Qt;
3797
3798 DEFVAR_LISP ("standard-character-unification-table-for-decode",
3799 &Vstandard_character_unification_table_for_decode,
3800 "Table for unifying characters when reading.");
3801 Vstandard_character_unification_table_for_decode = Qnil;
3802
3803 DEFVAR_LISP ("standard-character-unification-table-for-encode",
3804 &Vstandard_character_unification_table_for_encode,
3805 "Table for unifying characters when writing.");
3806 Vstandard_character_unification_table_for_encode = Qnil;
3807
3808 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
3809 "Alist of charsets vs revision numbers.\n\
3810 While encoding, if a charset (car part of an element) is found,\n\
3811 designate it with the escape sequence identifing revision (cdr part of the element).");
3812 Vcharset_revision_alist = Qnil;
3813
3814 DEFVAR_LISP ("default-process-coding-system",
3815 &Vdefault_process_coding_system,
3816 "Cons of coding systems used for process I/O by default.\n\
3817 The car part is used for decoding a process output,\n\
3818 the cdr part is used for encoding a text to be sent to a process.");
3819 Vdefault_process_coding_system = Qnil;
3820 }
3821
3822 #endif /* emacs */