1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
5 This file is part of GNU Emacs.
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /*** TABLE OF CONTENTS ***
25 2. Emacs' internal format (emacs-mule) handlers
27 4. Shift-JIS and BIG5 handlers
28 5. End-of-line handlers
29 6. C library functions
30 7. Emacs Lisp library functions
35 /*** GENERAL NOTE on CODING SYSTEM ***
37 Coding system is an encoding mechanism of one or more character
38 sets. Here's a list of coding systems which Emacs can handle. When
39 we say "decode", it means converting some other coding system to
40 Emacs' internal format (emacs-mule), and when we say "encode",
41 it means converting the coding system emacs-mule to some other
44 0. Emacs' internal format (emacs-mule)
46 Emacs itself holds a multi-lingual character in a buffer and a string
47 in a special format. Details are described in the section 2.
51 The most famous coding system for multiple character sets. X's
52 Compound Text, various EUCs (Extended Unix Code), and such coding
53 systems used in Internet communication as ISO-2022-JP are all
54 variants of ISO2022. Details are described in the section 3.
56 2. SJIS (or Shift-JIS or MS-Kanji-Code)
58 A coding system to encode character sets: ASCII, JISX0201, and
59 JISX0208. Widely used for PC's in Japan. Details are described in
64 A coding system to encode character sets: ASCII and Big5. Widely
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
66 described in the section 4. In this file, when written as "BIG5"
67 (all uppercase), it means the coding system, and when written as
68 "Big5" (capitalized), it means the character set.
72 If a user wants to read/write a text encoded in a coding system not
73 listed above, he can supply a decoder and an encoder for it in CCL
74 (Code Conversion Language) programs. Emacs executes the CCL program
75 while reading/writing.
77 Emacs represents a coding-system by a Lisp symbol that has a property
78 `coding-system'. But, before actually using the coding-system, the
79 information about it is set in a structure of type `struct
80 coding_system' for rapid processing. See the section 6 for more
85 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
87 How the end-of-line of a text is encoded depends on the system. For
88 instance, Unix's format is just one byte of `line-feed' code,
89 whereas DOS's format is two bytes sequence of `carriage-return' and
90 `line-feed' codes. MacOS's format is one byte of `carriage-return'.
92 Since how characters in a text are encoded and how end-of-line is
93 encoded are independent, any coding system described above can take
94 any format of end-of-line. So, Emacs has information of format of
95 end-of-line in each coding-system. See the section 6 for more
100 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
102 These functions check if a text between SRC and SRC_END is encoded
103 in the coding system category XXX. Each returns an integer value in
104 which appropriate flag bits for the category XXX are set. The flag
105 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
106 template of these functions. */
109 detect_coding_emacs_mule (src
, src_end
)
110 unsigned char *src
, *src_end
;
116 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
118 These functions decode SRC_BYTES length text at SOURCE encoded in
119 CODING to Emacs' internal format (emacs-mule). The resulting text
120 goes to a place pointed by DESTINATION, the length of which should
121 not exceed DST_BYTES. The number of bytes actually processed is returned as
122 *CONSUMED. The return value is the length of the decoded text.
123 Below is a template of these functions. */
125 decode_coding_XXX (coding
, source
, destination
, src_bytes
, dst_bytes
, consumed
)
126 struct coding_system
*coding
;
127 unsigned char *source
, *destination
;
128 int src_bytes
, dst_bytes
;
135 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
137 These functions encode SRC_BYTES length text at SOURCE of Emacs'
138 internal format (emacs-mule) to CODING. The resulting text goes to
139 a place pointed by DESTINATION, the length of which should not
140 exceed DST_BYTES. The number of bytes actually processed is returned as
141 *CONSUMED. The return value is the length of the encoded text.
142 Below is a template of these functions. */
144 encode_coding_XXX (coding
, source
, destination
, src_bytes
, dst_bytes
, consumed
)
145 struct coding_system
*coding
;
146 unsigned char *source
, *destination
;
147 int src_bytes
, dst_bytes
;
154 /*** COMMONLY USED MACROS ***/
156 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
157 THREE_MORE_BYTES safely get one, two, and three bytes from the
158 source text respectively. If there are not enough bytes in the
159 source, they jump to `label_end_of_loop'. The caller should set
160 variables `src' and `src_end' to appropriate areas in advance. */
162 #define ONE_MORE_BYTE(c1) \
167 goto label_end_of_loop; \
170 #define TWO_MORE_BYTES(c1, c2) \
172 if (src + 1 < src_end) \
173 c1 = *src++, c2 = *src++; \
175 goto label_end_of_loop; \
178 #define THREE_MORE_BYTES(c1, c2, c3) \
180 if (src + 2 < src_end) \
181 c1 = *src++, c2 = *src++, c3 = *src++; \
183 goto label_end_of_loop; \
186 /* The following three macros DECODE_CHARACTER_ASCII,
187 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
188 the multi-byte form of a character of each class at the place
189 pointed by `dst'. The caller should set the variable `dst' to
190 point to an appropriate area and the variable `coding' to point to
191 the coding-system of the currently decoding text in advance. */
193 /* Decode one ASCII character C. */
195 #define DECODE_CHARACTER_ASCII(c) \
197 if (COMPOSING_P (coding->composing)) \
198 *dst++ = 0xA0, *dst++ = (c) | 0x80; \
203 /* Decode one DIMENSION1 character of which charset is CHARSET and
204 position-code is C. */
206 #define DECODE_CHARACTER_DIMENSION1(charset, c) \
208 unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset); \
209 if (COMPOSING_P (coding->composing)) \
210 *dst++ = leading_code + 0x20; \
212 *dst++ = leading_code; \
213 if (leading_code = CHARSET_LEADING_CODE_EXT (charset)) \
214 *dst++ = leading_code; \
215 *dst++ = (c) | 0x80; \
218 /* Decode one DIMENSION2 character of which charset is CHARSET and
219 position-codes are C1 and C2. */
221 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2) \
223 DECODE_CHARACTER_DIMENSION1 (charset, c1); \
224 *dst++ = (c2) | 0x80; \
228 /*** 1. Preamble ***/
242 #else /* not emacs */
246 #endif /* not emacs */
248 Lisp_Object Qcoding_system
, Qeol_type
;
249 Lisp_Object Qbuffer_file_coding_system
;
250 Lisp_Object Qpost_read_conversion
, Qpre_write_conversion
;
252 extern Lisp_Object Qinsert_file_contents
, Qwrite_region
;
253 Lisp_Object Qcall_process
, Qcall_process_region
, Qprocess_argument
;
254 Lisp_Object Qstart_process
, Qopen_network_stream
;
255 Lisp_Object Qtarget_idx
;
257 /* Mnemonic character of each format of end-of-line. */
258 int eol_mnemonic_unix
, eol_mnemonic_dos
, eol_mnemonic_mac
;
259 /* Mnemonic character to indicate format of end-of-line is not yet
261 int eol_mnemonic_undecided
;
265 Lisp_Object Qcoding_system_spec
, Qcoding_system_p
, Qcoding_system_error
;
267 /* Coding-systems are handed between Emacs Lisp programs and C internal
268 routines by the following three variables. */
269 /* Coding-system for reading files and receiving data from process. */
270 Lisp_Object Vcoding_system_for_read
;
271 /* Coding-system for writing files and sending data to process. */
272 Lisp_Object Vcoding_system_for_write
;
273 /* Coding-system actually used in the latest I/O. */
274 Lisp_Object Vlast_coding_system_used
;
276 /* Coding-system that the terminal accepts for displaying. */
277 struct coding_system terminal_coding
;
279 /* Coding-system of what is sent from terminal keyboard. */
280 struct coding_system keyboard_coding
;
282 Lisp_Object Vfile_coding_system_alist
;
283 Lisp_Object Vprocess_coding_system_alist
;
284 Lisp_Object Vnetwork_coding_system_alist
;
288 Lisp_Object Qcoding_category_index
;
290 /* List of symbols `coding-category-xxx' ordered by priority. */
291 Lisp_Object Vcoding_category_list
;
293 /* Table of coding-systems currently assigned to each coding-category. */
294 Lisp_Object coding_category_table
[CODING_CATEGORY_IDX_MAX
];
296 /* Table of names of symbol for each coding-category. */
297 char *coding_category_name
[CODING_CATEGORY_IDX_MAX
] = {
298 "coding-category-emacs-mule",
299 "coding-category-sjis",
300 "coding-category-iso-7",
301 "coding-category-iso-8-1",
302 "coding-category-iso-8-2",
303 "coding-category-iso-else",
304 "coding-category-big5",
305 "coding-category-binary"
308 /* Flag to tell if we look up unification table on character code
310 Lisp_Object Venable_character_unification
;
311 /* Standard unification table to look up on reading (decoding). */
312 Lisp_Object Vstandard_character_unification_table_for_read
;
313 /* Standard unification table to look up on writing (encoding). */
314 Lisp_Object Vstandard_character_unification_table_for_write
;
316 Lisp_Object Qcharacter_unification_table
;
318 /* Alist of charsets vs revision number. */
319 Lisp_Object Vcharset_revision_alist
;
321 /* Default coding systems used for process I/O. */
322 Lisp_Object Vdefault_process_coding_system
;
325 /*** 2. Emacs internal format (emacs-mule) handlers ***/
327 /* Emacs' internal format for encoding multiple character sets is a
328 kind of multi-byte encoding, i.e. encoding a character by a sequence
329 of one-byte codes of variable length. ASCII characters and control
330 characters (e.g. `tab', `newline') are represented by one-byte as
331 is. It takes the range 0x00 through 0x7F. The other characters
332 are represented by a sequence of `base leading-code', optional
333 `extended leading-code', and one or two `position-code's. Length
334 of the sequence is decided by the base leading-code. Leading-code
335 takes the range 0x80 through 0x9F, whereas extended leading-code
336 and position-code take the range 0xA0 through 0xFF. See the
337 document of `charset.h' for more detail about leading-code and
340 There's one exception in this rule. Special leading-code
341 `leading-code-composition' denotes that the following several
342 characters should be composed into one character. Leading-codes of
343 components (except for ASCII) are added 0x20. An ASCII character
344 component is represented by a 2-byte sequence of `0xA0' and
345 `ASCII-code + 0x80'. See also the document in `charset.h' for the
346 detail of composite character. Hence, we can summarize the code
349 --- CODE RANGE of Emacs' internal format ---
350 (character set) (range)
352 ELSE (1st byte) 0x80 .. 0x9F
353 (rest bytes) 0xA0 .. 0xFF
354 ---------------------------------------------
358 enum emacs_code_class_type emacs_code_class
[256];
360 /* Go to the next statement only if *SRC is accessible and the code is
361 greater than or equal to 0xA0. */
362 #define CHECK_CODE_RANGE_A0_FF \
364 if (src >= src_end) \
365 goto label_end_of_switch; \
366 else if (*src++ < 0xA0) \
370 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
371 Check if a text is encoded in Emacs' internal format. If it is,
372 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
375 detect_coding_emacs_mule (src
, src_end
)
376 unsigned char *src
, *src_end
;
381 while (src
< src_end
)
393 switch (emacs_code_class
[c
])
395 case EMACS_ascii_code
:
396 case EMACS_linefeed_code
:
399 case EMACS_control_code
:
400 if (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
)
404 case EMACS_invalid_code
:
407 case EMACS_leading_code_composition
: /* c == 0x80 */
409 CHECK_CODE_RANGE_A0_FF
;
414 case EMACS_leading_code_4
:
415 CHECK_CODE_RANGE_A0_FF
;
416 /* fall down to check it two more times ... */
418 case EMACS_leading_code_3
:
419 CHECK_CODE_RANGE_A0_FF
;
420 /* fall down to check it one more time ... */
422 case EMACS_leading_code_2
:
423 CHECK_CODE_RANGE_A0_FF
;
431 return CODING_CATEGORY_MASK_EMACS_MULE
;
435 /*** 3. ISO2022 handlers ***/
437 /* The following note describes the coding system ISO2022 briefly.
438 Since the intention of this note is to help understanding of the
439 programs in this file, some parts are NOT ACCURATE or OVERLY
440 SIMPLIFIED. For the thorough understanding, please refer to the
441 original document of ISO2022.
443 ISO2022 provides many mechanisms to encode several character sets
444 in 7-bit and 8-bit environments. If one chooses a 7-bit environment,
445 all text is encoded by codes of less than 128. This may make the
446 encoded text a little bit longer, but the text gets more stability
447 to pass through several gateways (some of them split MSB off).
449 There are two kinds of character set: control character set and
450 graphic character set. The former contains control characters such
451 as `newline' and `escape' to provide control functions (control
452 functions are provided also by escape sequence). The latter
453 contains graphic characters such as 'A' and '-'. Emacs recognizes
454 two control character sets and many graphic character sets.
456 Graphic character sets are classified into one of the following
457 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
458 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
459 bytes (DIMENSION) and the number of characters in one dimension
460 (CHARS) of the set. In addition, each character set is assigned an
461 identification tag (called "final character" and denoted as <F>
462 hereafter) which is unique in each class. <F> of each character
463 set is decided by ECMA(*) when it is registered in ISO. Code range
464 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
466 Note (*): ECMA = European Computer Manufacturers Association
468 Here are examples of graphic character set [NAME(<F>)]:
469 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
470 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
471 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
472 o DIMENSION2_CHARS96 -- none for the moment
474 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
475 C0 [0x00..0x1F] -- control character plane 0
476 GL [0x20..0x7F] -- graphic character plane 0
477 C1 [0x80..0x9F] -- control character plane 1
478 GR [0xA0..0xFF] -- graphic character plane 1
480 A control character set is directly designated and invoked to C0 or
481 C1 by an escape sequence. The most common case is that ISO646's
482 control character set is designated/invoked to C0 and ISO6429's
483 control character set is designated/invoked to C1, and usually
484 these designations/invocations are omitted in a coded text. With
485 7-bit environment, only C0 can be used, and a control character for
486 C1 is encoded by an appropriate escape sequence to fit in the
487 environment. All control characters for C1 have
488 corresponding escape sequences defined.
490 A graphic character set is at first designated to one of four
491 graphic registers (G0 through G3), then these graphic registers are
492 invoked to GL or GR. These designations and invocations can be
493 done independently. The most common case is that G0 is invoked to
494 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
495 these invocations and designations are omitted in a coded text.
496 With 7-bit environment, only GL can be used.
498 When a graphic character set of CHARS94 is invoked to GL, code 0x20
499 and 0x7F of GL area work as control characters SPACE and DEL
500 respectively, and code 0xA0 and 0xFF of GR area should not be used.
502 There are two ways of invocation: locking-shift and single-shift.
503 With locking-shift, the invocation lasts until the next different
504 invocation, whereas with single-shift, the invocation works only
505 for the following character and doesn't affect locking-shift.
506 Invocations are done by the following control characters or escape
509 ----------------------------------------------------------------------
510 function control char escape sequence description
511 ----------------------------------------------------------------------
512 SI (shift-in) 0x0F none invoke G0 to GL
513 SO (shift-out) 0x0E none invoke G1 to GL
514 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
515 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
516 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
517 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
518 ----------------------------------------------------------------------
519 The first four are for locking-shift. Control characters for these
520 functions are defined by macros ISO_CODE_XXX in `coding.h'.
522 Designations are done by the following escape sequences.
523 ----------------------------------------------------------------------
524 escape sequence description
525 ----------------------------------------------------------------------
526 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
527 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
528 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
529 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
530 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
531 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
532 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
533 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
534 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
535 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
536 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
537 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
538 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
539 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
540 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
541 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
542 ----------------------------------------------------------------------
544 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
545 of dimension 1, chars 94, and final character <F>, etc.
547 Note (*): Although these designations are not allowed in ISO2022,
548 Emacs accepts them on decoding, and produces them on encoding
549 CHARS96 character set in a coding system which is characterized as
550 7-bit environment, non-locking-shift, and non-single-shift.
552 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
553 '(' can be omitted. We call this "short-form" hereafter.
555 Now you may notice that there are a lot of ways for encoding the
556 same multilingual text in ISO2022. Actually, there exist many
557 coding systems such as Compound Text (used in X's inter-client
558 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
559 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
560 localized platforms), and all of these are variants of ISO2022.
562 In addition to the above, Emacs handles two more kinds of escape
563 sequences: ISO6429's direction specification and Emacs' private
564 sequence for specifying character composition.
566 ISO6429's direction specification takes the following format:
567 o CSI ']' -- end of the current direction
568 o CSI '0' ']' -- end of the current direction
569 o CSI '1' ']' -- start of left-to-right text
570 o CSI '2' ']' -- start of right-to-left text
571 The control character CSI (0x9B: control sequence introducer) is
572 abbreviated to the escape sequence ESC '[' in 7-bit environment.
574 Character composition specification takes the following format:
575 o ESC '0' -- start character composition
576 o ESC '1' -- end character composition
577 Since these are not standard escape sequences of any ISO, the use
578 of them for these meaning is restricted to Emacs only. */
580 enum iso_code_class_type iso_code_class
[256];
582 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
583 Check if a text is encoded in ISO2022. If it is, return an
584 integer in which the appropriate flag bits, any of:
585 CODING_CATEGORY_MASK_ISO_7
586 CODING_CATEGORY_MASK_ISO_8_1
587 CODING_CATEGORY_MASK_ISO_8_2
588 CODING_CATEGORY_MASK_ISO_ELSE
589 are set. If a code which should never appear in ISO2022 is found,
593 detect_coding_iso2022 (src
, src_end
)
594 unsigned char *src
, *src_end
;
596 int mask
= (CODING_CATEGORY_MASK_ISO_7
597 | CODING_CATEGORY_MASK_ISO_8_1
598 | CODING_CATEGORY_MASK_ISO_8_2
599 | CODING_CATEGORY_MASK_ISO_ELSE
);
600 int g1
= 0; /* 1 iff designating to G1. */
603 while (src
< src_end
)
613 && ((c
>= '(' && c
<= '/')
614 || c
== '$' && ((*src
>= '(' && *src
<= '/')
615 || (*src
>= '@' && *src
<= 'B'))))
617 /* Valid designation sequence. */
618 if (c
== ')' || (c
== '$' && *src
== ')'))
621 mask
&= ~CODING_CATEGORY_MASK_ISO_7
;
626 else if (c
== 'N' || c
== 'O' || c
== 'n' || c
== 'o')
627 return CODING_CATEGORY_MASK_ISO_ELSE
;
632 return CODING_CATEGORY_MASK_ISO_ELSE
;
638 mask
&= ~CODING_CATEGORY_MASK_ISO_7
;
650 mask
&= ~CODING_CATEGORY_MASK_ISO_7
;
651 while (src
< src_end
&& *src
>= 0xA0)
653 if (count
& 1 && src
< src_end
)
654 mask
&= ~CODING_CATEGORY_MASK_ISO_8_2
;
663 /* Decode a character of which charset is CHARSET and the 1st position
664 code is C1. If dimension of CHARSET is 2, the 2nd position code is
665 fetched from SRC and set to C2. If CHARSET is negative, it means
666 that we are decoding ill formed text, and what we can do is just to
669 #define DECODE_ISO_CHARACTER(charset, c1) \
671 int c_alt, charset_alt = (charset); \
672 if (COMPOSING_HEAD_P (coding->composing)) \
674 *dst++ = LEADING_CODE_COMPOSITION; \
675 if (COMPOSING_WITH_RULE_P (coding->composing)) \
676 /* To tell composition rules are embeded. */ \
678 coding->composing += 2; \
680 if ((charset) >= 0) \
682 if (CHARSET_DIMENSION (charset) == 2) \
683 ONE_MORE_BYTE (c2); \
684 if (!NILP (unification_table) \
685 && ((c_alt = unify_char (unification_table, \
686 -1, (charset), c1, c2)) >= 0)) \
687 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
689 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
690 DECODE_CHARACTER_ASCII (c1); \
691 else if (CHARSET_DIMENSION (charset_alt) == 1) \
692 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
694 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
695 if (COMPOSING_WITH_RULE_P (coding->composing)) \
696 /* To tell a composition rule follows. */ \
697 coding->composing = COMPOSING_WITH_RULE_RULE; \
700 /* Set designation state into CODING. */
701 #define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
703 int charset = ISO_CHARSET_TABLE (dimension, chars, final_char); \
706 if (coding->direction == 1 \
707 && CHARSET_REVERSE_CHARSET (charset) >= 0) \
708 charset = CHARSET_REVERSE_CHARSET (charset); \
709 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
713 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
716 decode_coding_iso2022 (coding
, source
, destination
,
717 src_bytes
, dst_bytes
, consumed
)
718 struct coding_system
*coding
;
719 unsigned char *source
, *destination
;
720 int src_bytes
, dst_bytes
;
723 unsigned char *src
= source
;
724 unsigned char *src_end
= source
+ src_bytes
;
725 unsigned char *dst
= destination
;
726 unsigned char *dst_end
= destination
+ dst_bytes
;
727 /* Since the maximum bytes produced by each loop is 7, we subtract 6
728 from DST_END to assure that overflow checking is necessary only
729 at the head of loop. */
730 unsigned char *adjusted_dst_end
= dst_end
- 6;
732 /* Charsets invoked to graphic plane 0 and 1 respectively. */
733 int charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
734 int charset1
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 1);
735 Lisp_Object unification_table
= coding
->character_unification_table
;
737 if (!NILP (Venable_character_unification
) && NILP (unification_table
))
738 unification_table
= Vstandard_character_unification_table_for_read
;
740 while (src
< src_end
&& dst
< adjusted_dst_end
)
742 /* SRC_BASE remembers the start position in source in each loop.
743 The loop will be exited when there's not enough source text
744 to analyze long escape sequence or 2-byte code (within macros
745 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
746 to SRC_BASE before exiting. */
747 unsigned char *src_base
= src
;
750 switch (iso_code_class
[c1
])
752 case ISO_0x20_or_0x7F
:
753 if (!coding
->composing
754 && (charset0
< 0 || CHARSET_CHARS (charset0
) == 94))
756 /* This is SPACE or DEL. */
760 /* This is a graphic character, we fall down ... */
762 case ISO_graphic_plane_0
:
763 if (coding
->composing
== COMPOSING_WITH_RULE_RULE
)
765 /* This is a composition rule. */
767 coding
->composing
= COMPOSING_WITH_RULE_TAIL
;
770 DECODE_ISO_CHARACTER (charset0
, c1
);
773 case ISO_0xA0_or_0xFF
:
774 if (charset1
< 0 || CHARSET_CHARS (charset1
) == 94)
780 /* This is a graphic character, we fall down ... */
782 case ISO_graphic_plane_1
:
783 DECODE_ISO_CHARACTER (charset1
, c1
);
786 case ISO_control_code
:
787 /* All ISO2022 control characters in this class have the
788 same representation in Emacs internal format. */
792 case ISO_carriage_return
:
793 if (coding
->eol_type
== CODING_EOL_CR
)
797 else if (coding
->eol_type
== CODING_EOL_CRLF
)
800 if (c1
== ISO_CODE_LF
)
815 if (CODING_SPEC_ISO_DESIGNATION (coding
, 1) < 0)
816 goto label_invalid_escape_sequence
;
817 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 1;
818 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
822 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 0;
823 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
826 case ISO_single_shift_2_7
:
827 case ISO_single_shift_2
:
828 /* SS2 is handled as an escape sequence of ESC 'N' */
830 goto label_escape_sequence
;
832 case ISO_single_shift_3
:
833 /* SS3 is handled as an escape sequence of ESC 'O' */
835 goto label_escape_sequence
;
837 case ISO_control_sequence_introducer
:
838 /* CSI is handled as an escape sequence of ESC '[' ... */
840 goto label_escape_sequence
;
844 label_escape_sequence
:
845 /* Escape sequences handled by Emacs are invocation,
846 designation, direction specification, and character
847 composition specification. */
850 case '&': /* revision of following character set */
852 if (!(c1
>= '@' && c1
<= '~'))
853 goto label_invalid_escape_sequence
;
855 if (c1
!= ISO_CODE_ESC
)
856 goto label_invalid_escape_sequence
;
858 goto label_escape_sequence
;
860 case '$': /* designation of 2-byte character set */
862 if (c1
>= '@' && c1
<= 'B')
863 { /* designation of JISX0208.1978, GB2312.1980,
865 DECODE_DESIGNATION (0, 2, 94, c1
);
867 else if (c1
>= 0x28 && c1
<= 0x2B)
868 { /* designation of DIMENSION2_CHARS94 character set */
870 DECODE_DESIGNATION (c1
- 0x28, 2, 94, c2
);
872 else if (c1
>= 0x2C && c1
<= 0x2F)
873 { /* designation of DIMENSION2_CHARS96 character set */
875 DECODE_DESIGNATION (c1
- 0x2C, 2, 96, c2
);
878 goto label_invalid_escape_sequence
;
881 case 'n': /* invocation of locking-shift-2 */
882 if (CODING_SPEC_ISO_DESIGNATION (coding
, 2) < 0)
883 goto label_invalid_escape_sequence
;
884 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 2;
885 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
888 case 'o': /* invocation of locking-shift-3 */
889 if (CODING_SPEC_ISO_DESIGNATION (coding
, 3) < 0)
890 goto label_invalid_escape_sequence
;
891 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 3;
892 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
895 case 'N': /* invocation of single-shift-2 */
896 if (CODING_SPEC_ISO_DESIGNATION (coding
, 2) < 0)
897 goto label_invalid_escape_sequence
;
899 charset
= CODING_SPEC_ISO_DESIGNATION (coding
, 2);
900 DECODE_ISO_CHARACTER (charset
, c1
);
903 case 'O': /* invocation of single-shift-3 */
904 if (CODING_SPEC_ISO_DESIGNATION (coding
, 3) < 0)
905 goto label_invalid_escape_sequence
;
907 charset
= CODING_SPEC_ISO_DESIGNATION (coding
, 3);
908 DECODE_ISO_CHARACTER (charset
, c1
);
911 case '0': /* start composing without embeded rules */
912 coding
->composing
= COMPOSING_NO_RULE_HEAD
;
915 case '1': /* end composing */
916 coding
->composing
= COMPOSING_NO
;
919 case '2': /* start composing with embeded rules */
920 coding
->composing
= COMPOSING_WITH_RULE_HEAD
;
923 case '[': /* specification of direction */
924 /* For the moment, nested direction is not supported.
925 So, the value of `coding->direction' is 0 or 1: 0
926 means left-to-right, 1 means right-to-left. */
930 case ']': /* end of the current direction */
931 coding
->direction
= 0;
933 case '0': /* end of the current direction */
934 case '1': /* start of left-to-right direction */
937 coding
->direction
= 0;
939 goto label_invalid_escape_sequence
;
942 case '2': /* start of right-to-left direction */
945 coding
->direction
= 1;
947 goto label_invalid_escape_sequence
;
951 goto label_invalid_escape_sequence
;
956 if (c1
>= 0x28 && c1
<= 0x2B)
957 { /* designation of DIMENSION1_CHARS94 character set */
959 DECODE_DESIGNATION (c1
- 0x28, 1, 94, c2
);
961 else if (c1
>= 0x2C && c1
<= 0x2F)
962 { /* designation of DIMENSION1_CHARS96 character set */
964 DECODE_DESIGNATION (c1
- 0x2C, 1, 96, c2
);
968 goto label_invalid_escape_sequence
;
971 /* We must update these variables now. */
972 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
973 charset1
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 1);
976 label_invalid_escape_sequence
:
978 int length
= src
- src_base
;
980 bcopy (src_base
, dst
, length
);
987 coding
->carryover_size
= src
- src_base
;
988 bcopy (src_base
, coding
->carryover
, coding
->carryover_size
);
993 /* If this is the last block of the text to be decoded, we had
994 better just flush out all remaining codes in the text although
995 they are not valid characters. */
996 if (coding
->last_block
)
998 bcopy (src
, dst
, src_end
- src
);
999 dst
+= (src_end
- src
);
1002 *consumed
= src
- source
;
1003 return dst
- destination
;
1006 /* ISO2022 encoding stuff. */
1009 It is not enough to say just "ISO2022" on encoding, but we have to
1010 specify more details. In Emacs, each coding-system of ISO2022
1011 variant has the following specifications:
1012 1. Initial designation to G0 thru G3.
1013 2. Allows short-form designation?
1014 3. ASCII should be designated to G0 before control characters?
1015 4. ASCII should be designated to G0 at end of line?
1016 5. 7-bit environment or 8-bit environment?
1017 6. Use locking-shift?
1018 7. Use Single-shift?
1019 And the following two are only for Japanese:
1020 8. Use ASCII in place of JIS0201-1976-Roman?
1021 9. Use JISX0208-1983 in place of JISX0208-1978?
1022 These specifications are encoded in `coding->flags' as flag bits
1023 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1027 /* Produce codes (escape sequence) for designating CHARSET to graphic
1028 register REG. If <final-char> of CHARSET is '@', 'A', or 'B' and
1029 the coding system CODING allows, produce designation sequence of
/* ENCODE_DESIGNATION (CHARSET, REG, CODING) writes at DST the bytes
   that designate CHARSET to graphic register REG and then records
   CHARSET as the designation of REG in CODING.  It relies on a
   variable named dst being in scope at the point of use.

   Bytes written by the visible lines: when Fassq finds CHARSET in
   Vcharset_revision_alist, ISO_CODE_ESC and the cdr of that entry
   plus '@'; then ISO_CODE_ESC, an intermediate character taken from
   intermediate_char_94 or intermediate_char_96 at index REG, and
   finally the final character of CHARSET.  The 94-set intermediate
   character guarded by CODING_FLAG_ISO_SHORT_FORM is written unless
   that flag is set and the final character lies in '@' .. 'B'.

   NOTE(review): the numbering embedded in these lines skips, so some
   lines of the macro are absent from this copy; the exact nesting of
   the branches should be confirmed against a complete copy.  */
1032 #define ENCODE_DESIGNATION(charset, reg, coding) \
1034 unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \
1035 char *intermediate_char_94 = "()*+"; \
1036 char *intermediate_char_96 = ",-./"; \
1038 = Fassq (make_number (charset), Vcharset_revision_alist); \
1039 if (! NILP (temp)) \
1041 *dst++ = ISO_CODE_ESC; \
1043 *dst++ = XINT (XCONS (temp)->cdr) + '@'; \
1045 *dst++ = ISO_CODE_ESC; \
1046 if (CHARSET_DIMENSION (charset) == 1) \
1048 if (CHARSET_CHARS (charset) == 94) \
1049 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1051 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1056 if (CHARSET_CHARS (charset) == 94) \
1058 if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
1060 || final_char < '@' || final_char > 'B') \
1061 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1064 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1066 *dst++ = final_char; \
1067 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
1070 /* The following two macros produce codes (control character or escape
1071 sequence) for ISO2022 single-shift functions (single-shift-2 and single-shift-3). */
/* ENCODE_SINGLE_SHIFT_2 and ENCODE_SINGLE_SHIFT_3 write a single-shift
   code at DST and set the single-shifting state of CODING to 1.  Both
   use the variables coding and dst of the expansion site.  When
   CODING_FLAG_ISO_SEVEN_BITS is set the two bytes ISO_CODE_ESC 'N'
   (resp. 'O') are written.  NOTE(review): the line between that write
   and the ISO_CODE_SS2 / ISO_CODE_SS3 write is absent from this copy;
   presumably it is an else, making the one-byte code the alternative
   for the non-7-bit case -- confirm.  */
1074 #define ENCODE_SINGLE_SHIFT_2 \
1076 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1077 *dst++ = ISO_CODE_ESC, *dst++ = 'N'; \
1079 *dst++ = ISO_CODE_SS2; \
1080 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1083 #define ENCODE_SINGLE_SHIFT_3 \
1085 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1086 *dst++ = ISO_CODE_ESC, *dst++ = 'O'; \
1088 *dst++ = ISO_CODE_SS3; \
1089 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1092 /* The following four macros produce codes (control character or
1093 escape sequence) for ISO2022 locking-shift functions (shift-in,
1094 shift-out, locking-shift-2, and locking-shift-3). */
/* Each macro below writes its shift code at DST and then records in
   CODING which graphic register is now invoked to plane 0:
     ENCODE_SHIFT_IN         ISO_CODE_SI            register 0
     ENCODE_SHIFT_OUT        ISO_CODE_SO            register 1
     ENCODE_LOCKING_SHIFT_2  ISO_CODE_ESC then 'n'  register 2
     ENCODE_LOCKING_SHIFT_3  ISO_CODE_ESC then 'o'  register 3
   All four use the variables coding and dst of the expansion site.  */
1096 #define ENCODE_SHIFT_IN \
1098 *dst++ = ISO_CODE_SI; \
1099 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1102 #define ENCODE_SHIFT_OUT \
1104 *dst++ = ISO_CODE_SO; \
1105 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1108 #define ENCODE_LOCKING_SHIFT_2 \
1110 *dst++ = ISO_CODE_ESC, *dst++ = 'n'; \
1111 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1114 #define ENCODE_LOCKING_SHIFT_3 \
1116 *dst++ = ISO_CODE_ESC, *dst++ = 'o'; \
1117 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1120 /* Produce codes for a DIMENSION1 character of which character set is
1121 CHARSET and position-code is C1. Designation and invocation
1122 sequences are also produced in advance if necessary. */
/* ENCODE_ISO_CHARACTER_DIMENSION1 (CHARSET, C1) writes the one-byte
   position code C1 at DST, using the variables coding and dst of the
   expansion site.  Visible cases:
     - single-shifting state is set: write C1 with the high bit
       cleared when CODING_FLAG_ISO_SEVEN_BITS is set (the C1 | 0x80
       write is presumably the else branch -- that line is absent
       here), then clear the single-shifting state;
     - CHARSET is the charset of plane 0: write C1 & 0x7F;
     - CHARSET is the charset of plane 1: write C1 | 0x80;
     - otherwise call encode_invocation_designation and store its
       result in dst.
   NOTE(review): C1 is not parenthesized in the expansion, so callers
   should pass a plain variable.  */
1125 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
1127 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1129 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1130 *dst++ = c1 & 0x7F; \
1132 *dst++ = c1 | 0x80; \
1133 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1136 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1138 *dst++ = c1 & 0x7F; \
1141 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1143 *dst++ = c1 | 0x80; \
1147 /* Since CHARSET is not yet invoked to any graphic planes, we \
1148 must invoke it, or, at first, designate it to some graphic \
1149 register. Then repeat the loop to actually produce the \
1151 dst = encode_invocation_designation (charset, coding, dst); \
1154 /* Produce codes for a DIMENSION2 character of which character set is
1155 CHARSET and position-codes are C1 and C2. Designation and
1156 invocation codes are also produced in advance if necessary. */
/* ENCODE_ISO_CHARACTER_DIMENSION2 (CHARSET, C1, C2) is the two-byte
   counterpart of ENCODE_ISO_CHARACTER_DIMENSION1: the same tests are
   made (single-shifting state, charset of plane 0, charset of plane
   1), and both C1 and C2 are written, masked with 0x7F or or-ed with
   0x80 together.  When CHARSET is on neither plane,
   encode_invocation_designation is called and its result stored in
   dst.  Uses the variables coding and dst of the expansion site.
   NOTE(review): as in the one-byte macro, the line separating the
   7-bit write from the 8-bit write in the single-shifting case is
   absent from this copy; presumably an else -- confirm.  */
1158 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
1160 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1162 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1163 *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F; \
1165 *dst++ = c1 | 0x80, *dst++ = c2 | 0x80; \
1166 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1169 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1171 *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F; \
1174 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1176 *dst++ = c1 | 0x80, *dst++= c2 | 0x80; \
1180 /* Since CHARSET is not yet invoked to any graphic planes, we \
1181 must invoke it, or, at first, designate it to some graphic \
1182 register. Then repeat the loop to actually produce the \
1184 dst = encode_invocation_designation (charset, coding, dst); \
/* ENCODE_ISO_CHARACTER (CHARSET, C1, C2) encodes one character given
   as a charset and position codes.  When unification_table (a
   variable of the expansion site) is non-nil, unify_char is consulted
   first and its result is split by SPLIT_CHAR into charset_alt, C1
   and C2, so C1 and C2 must be assignable variables; the visible
   `charset_alt = charset' line is presumably the fallback when no
   unification applies (the else line is absent from this copy).  The
   character is then emitted with ENCODE_ISO_CHARACTER_DIMENSION1 when
   CHARSET_DIMENSION of charset_alt is 1, and with
   ENCODE_ISO_CHARACTER_DIMENSION2 in the other visible call.  */
1187 #define ENCODE_ISO_CHARACTER(charset, c1, c2) \
1189 int c_alt, charset_alt; \
1190 if (!NILP (unification_table) \
1191 && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1193 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1195 charset_alt = charset; \
1196 if (CHARSET_DIMENSION (charset_alt) == 1) \
1197 ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1); \
1199 ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
1202 /* Produce designation and invocation codes at a place pointed by DST
1203 to use CHARSET. The element `spec.iso2022' of *CODING is updated. Return the new DST. */
/* Make CHARSET usable for output at DST under CODING.

   Steps shown by the visible lines:
     1. Look through graphic registers 0..3 for one whose current
        designation is CHARSET.
     2. Otherwise read the register requested for CHARSET from CODING
        and write a designation with ENCODE_DESIGNATION.
     3. If the chosen register is invoked to neither plane 0 nor
        plane 1, write an invocation code chosen by register number.
   The macros expanded here advance the local pointer dst; callers
   store the value returned by this function back into their own dst.

   NOTE(review): the numbering embedded in these lines skips; the
   braces, the switch header, the bodies of cases 0 and 1 and the
   return statement are absent from this copy.  */
1207 encode_invocation_designation (charset
, coding
, dst
)
1209 struct coding_system
*coding
;
1212 int reg
; /* graphic register number */
1214 /* At first, check designations. */
1215 for (reg
= 0; reg
< 4; reg
++)
1216 if (charset
== CODING_SPEC_ISO_DESIGNATION (coding
, reg
))
1221 /* CHARSET is not yet designated to any graphic registers. */
1222 /* At first check the requested designation. */
1223 reg
= CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
);
1224 if (reg
== CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
)
1225 /* Since CHARSET requests no special designation, designate it
1226 to graphic register 0. */
1229 ENCODE_DESIGNATION (charset
, reg
, coding
);
1232 if (CODING_SPEC_ISO_INVOCATION (coding
, 0) != reg
1233 && CODING_SPEC_ISO_INVOCATION (coding
, 1) != reg
)
1235 /* Since the graphic register REG is not invoked to any graphic
1236 planes, invoke it to graphic plane 0. */
1239 case 0: /* graphic register 0 */
1243 case 1: /* graphic register 1 */
1247 case 2: /* graphic register 2 */
/* Registers 2 and 3 use a single-shift when the coding system has
   CODING_FLAG_ISO_SINGLE_SHIFT; the locking-shift below is presumably
   the else branch (that line is absent from this copy).  */
1248 if (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1249 ENCODE_SINGLE_SHIFT_2
;
1251 ENCODE_LOCKING_SHIFT_2
;
1254 case 3: /* graphic register 3 */
1255 if (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1256 ENCODE_SINGLE_SHIFT_3
;
1258 ENCODE_LOCKING_SHIFT_3
;
1265 /* The following three macros produce codes for indicating composition. */
/* Each macro writes two bytes at DST, ISO_CODE_ESC followed by one
   character: '0' starts a composition without rules, '2' starts a
   composition with rules, and '1' ends a composition.  They expand to
   an unparenthesized comma expression that advances the variable dst
   of the expansion site, so use them only as a full statement.  */
1266 #define ENCODE_COMPOSITION_NO_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '0'
1267 #define ENCODE_COMPOSITION_WITH_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '2'
1268 #define ENCODE_COMPOSITION_END *dst++ = ISO_CODE_ESC, *dst++ = '1'
/* The following three macros produce codes for indicating direction
   of text.  Each expands to one parenthesized comma expression that
   advances the variable `dst' of the expansion site and reads
   `coding', so each can be used as a statement (`MACRO;') or as an
   operand of a larger comma expression.  */

/* Write a control sequence introducer at DST: the two bytes
   ISO_CODE_ESC '[' in a 7-bit environment, else the single byte
   ISO_CODE_CSI.  The flag is tested with `&' because `coding->flags'
   holds several independent flag bits; comparing the whole word with
   `==' would select the 7-bit form only when no other flag is set.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER				\
  ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)				\
   ? (*dst++ = ISO_CODE_ESC, *dst++ = '[')				\
   : (*dst++ = ISO_CODE_CSI))

/* Write the sequence announcing right-to-left text:
   introducer, '2', ']'.  */
#define ENCODE_DIRECTION_R2L	\
  (ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '2', *dst++ = ']')

/* Write the sequence announcing left-to-right text:
   introducer, '0', ']'.  */
#define ENCODE_DIRECTION_L2R	\
  (ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '0', *dst++ = ']')
1286 /* Produce codes for designation and invocation to reset the graphic
1287 planes and registers to initial state. */
/* ENCODE_RESET_PLANE_AND_REGISTER uses the variables coding and dst of
   the expansion site, plus a loop variable reg whose declaration is
   not among the visible lines.  Visible steps: test whether plane 0
   currently invokes a register other than 0 (the action taken is on a
   line absent from this copy), then for each register 0..3 that has a
   non-negative initial designation differing from its current one,
   re-designate the initial charset with ENCODE_DESIGNATION.  */
1288 #define ENCODE_RESET_PLANE_AND_REGISTER \
1291 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
1293 for (reg = 0; reg < 4; reg++) \
1294 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0 \
1295 && (CODING_SPEC_ISO_DESIGNATION (coding, reg) \
1296 != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))) \
1297 ENCODE_DESIGNATION \
1298 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1301 /* Produce designation sequences of charsets in the line started from
1302 *SRC to a place pointed by DSTP.
1304 If the current block ends before any end-of-line, we may fail to
1305 find all the necessary *designations. */
/* Scan the text from SRC up to SRC_END, stopping at a newline or once
   four registers have been found, and collect for each graphic
   register the charset to designate to it; then write, through a
   local copy of *DSTP, an ENCODE_DESIGNATION for every register whose
   collected charset differs from its current designation in CODING.
   TABLE is passed to unify_char; a non-negative result replaces the
   charset of the character being examined.

   NOTE(review): the numbering embedded in these lines skips.  The
   declarations of r, c1, c2 and c_alt, the bodies of the first for
   loop and of the `if (r[reg] == ...)' test, the advance of src and
   the store back into *DSTP are absent from this copy.
   NOTE(review): reg is used as an index into r before it is compared
   with anything, and the comparison against
   CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION is made on r[reg], not on
   reg.  If the requested-designation lookup can yield that sentinel,
   confirm that r has a slot for it.  */
1306 encode_designation_at_bol (coding
, table
, src
, src_end
, dstp
)
1307 struct coding_system
*coding
;
1309 unsigned char *src
, *src_end
, **dstp
;
1311 int charset
, c
, found
= 0, reg
;
1312 /* Table of charsets to be designated to each graphic register. */
1314 unsigned char *dst
= *dstp
;
1316 for (reg
= 0; reg
< 4; reg
++)
1319 while (src
< src_end
&& *src
!= '\n' && found
< 4)
1321 int bytes
= BYTES_BY_CHAR_HEAD (*src
);
1324 charset
= CHARSET_AT (src
);
1329 SPLIT_STRING(src
, bytes
, charset
, c1
, c2
);
1330 if ((c_alt
= unify_char (table
, -1, charset
, c1
, c2
)) >= 0)
1331 charset
= CHAR_CHARSET (c_alt
);
1334 reg
= CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
);
1335 if (r
[reg
] == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
)
1346 for (reg
= 0; reg
< 4; reg
++)
1348 && CODING_SPEC_ISO_DESIGNATION (coding
, reg
) != r
[reg
])
1349 ENCODE_DESIGNATION (r
[reg
], reg
, coding
);
1354 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
/* Encode SRC_BYTES bytes at SOURCE into DESTINATION (room for
   DST_BYTES bytes) in the ISO2022 variant described by CODING.
   Stores the number of source bytes used in *CONSUMED and returns the
   number of bytes written (dst - destination).

   Outline of the visible lines:
     - The unification table comes from CODING; when it is nil and
       Venable_character_unification is non-nil,
       Vstandard_character_unification_table_for_write is used.
     - The main loop runs while source remains and dst is below
       dst_end - 19.
     - With CODING_FLAG_ISO_DESIGNATE_AT_BOL and the BOL state set,
       encode_designation_at_bol is called and the BOL state cleared.
     - Composition state in CODING is advanced or reset.
     - The byte C1 is dispatched on emacs_code_class[c1]; a linefeed
       writes the end-of-line bytes chosen by coding->eol_type and
       sets the BOL state again.
     - After the loop, when all source is used and this is the last
       block, planes and registers are reset and any carryover bytes
       that fit are copied to the output.

   NOTE(review): the numbering embedded in these lines skips; braces,
   break statements, several else lines, the reads of C1 and some
   labels are absent from this copy, so the exact control flow should
   be confirmed against a complete copy.  */
1357 encode_coding_iso2022 (coding
, source
, destination
,
1358 src_bytes
, dst_bytes
, consumed
)
1359 struct coding_system
*coding
;
1360 unsigned char *source
, *destination
;
1361 int src_bytes
, dst_bytes
;
1364 unsigned char *src
= source
;
1365 unsigned char *src_end
= source
+ src_bytes
;
1366 unsigned char *dst
= destination
;
1367 unsigned char *dst_end
= destination
+ dst_bytes
;
1368 /* Since the maximum bytes produced by each loop is 20, we subtract 19
1369 from DST_END to assure overflow checking is necessary only at the
1371 unsigned char *adjusted_dst_end
= dst_end
- 19;
1372 Lisp_Object unification_table
= coding
->character_unification_table
;
1374 if (!NILP (Venable_character_unification
) && NILP (unification_table
))
1375 unification_table
= Vstandard_character_unification_table_for_write
;
1377 while (src
< src_end
&& dst
< adjusted_dst_end
)
1379 /* SRC_BASE remembers the start position in source in each loop.
1380 The loop will be exited when there's not enough source text
1381 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1382 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1383 reset to SRC_BASE before exiting. */
1384 unsigned char *src_base
= src
;
1385 int charset
, c1
, c2
, c3
, c4
;
1387 if (coding
->flags
& CODING_FLAG_ISO_DESIGNATE_AT_BOL
1388 && CODING_SPEC_ISO_BOL (coding
))
1390 /* We have to produce designation sequences if any now. */
1391 encode_designation_at_bol (coding
, unification_table
,
1392 src
, src_end
, &dst
);
1393 CODING_SPEC_ISO_BOL (coding
) = 0;
1397 /* If we are seeing a component of a composite character, we are
1398 seeing a leading-code specially encoded for composition, or a
1399 composition rule if composing with rule. We must set C1
1400 to a normal leading-code or an ASCII code. If we are not at
1401 a composed character, we must reset the composition state. */
1402 if (COMPOSING_P (coding
->composing
))
1406 /* We are not in a composite character any longer. */
1407 coding
->composing
= COMPOSING_NO
;
1408 ENCODE_COMPOSITION_END
;
1412 if (coding
->composing
== COMPOSING_WITH_RULE_RULE
)
1415 coding
->composing
= COMPOSING_WITH_RULE_HEAD
;
1418 else if (coding
->composing
== COMPOSING_WITH_RULE_HEAD
)
1419 coding
->composing
= COMPOSING_WITH_RULE_RULE
;
1422 /* This is an ASCII component. */
1427 /* This is a leading-code of non ASCII component. */
1432 /* Now encode one character. C1 is a control character, an
1433 ASCII character, or a leading-code of multi-byte character. */
1434 switch (emacs_code_class
[c1
])
1436 case EMACS_ascii_code
:
1437 ENCODE_ISO_CHARACTER (CHARSET_ASCII
, c1
, /* dummy */ c2
);
1440 case EMACS_control_code
:
1441 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_CNTL
)
1442 ENCODE_RESET_PLANE_AND_REGISTER
;
1446 case EMACS_carriage_return_code
:
1447 if (!coding
->selective
)
1449 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_CNTL
)
1450 ENCODE_RESET_PLANE_AND_REGISTER
;
1454 /* fall down to treat '\r' as '\n' ... */
1456 case EMACS_linefeed_code
:
1457 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_EOL
)
1458 ENCODE_RESET_PLANE_AND_REGISTER
;
1459 if (coding
->flags
& CODING_FLAG_ISO_INIT_AT_BOL
)
1460 bcopy (coding
->spec
.iso2022
.initial_designation
,
1461 coding
->spec
.iso2022
.current_designation
,
1462 sizeof coding
->spec
.iso2022
.initial_designation
);
1463 if (coding
->eol_type
== CODING_EOL_LF
1464 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
1465 *dst
++ = ISO_CODE_LF
;
1466 else if (coding
->eol_type
== CODING_EOL_CRLF
)
1467 *dst
++ = ISO_CODE_CR
, *dst
++ = ISO_CODE_LF
;
1469 *dst
++ = ISO_CODE_CR
;
1470 CODING_SPEC_ISO_BOL (coding
) = 1;
/* In the two-byte form the leading-code C1 is itself passed as the
   charset argument.  */
1473 case EMACS_leading_code_2
:
1475 ENCODE_ISO_CHARACTER (c1
, c2
, /* dummy */ c3
);
1478 case EMACS_leading_code_3
:
1479 TWO_MORE_BYTES (c2
, c3
);
/* Below LEADING_CODE_PRIVATE_11 the leading-code C1 is the charset;
   the second call, which passes C2 as the charset, is presumably the
   else branch (that line is absent from this copy).  */
1480 if (c1
< LEADING_CODE_PRIVATE_11
)
1481 ENCODE_ISO_CHARACTER (c1
, c2
, c3
);
1483 ENCODE_ISO_CHARACTER (c2
, c3
, /* dummy */ c4
);
1486 case EMACS_leading_code_4
:
1487 THREE_MORE_BYTES (c2
, c3
, c4
);
1488 ENCODE_ISO_CHARACTER (c2
, c3
, c4
);
1491 case EMACS_leading_code_composition
:
1495 coding
->composing
= COMPOSING_WITH_RULE_HEAD
;
1496 ENCODE_COMPOSITION_WITH_RULE_START
;
1500 /* Rewind one byte because it is a character code of
1501 composition elements. */
1503 coding
->composing
= COMPOSING_NO_RULE_HEAD
;
1504 ENCODE_COMPOSITION_NO_RULE_START
;
1508 case EMACS_invalid_code
:
/* Save the bytes between SRC_BASE and SRC in the carryover buffer of
   CODING.  */
1514 coding
->carryover_size
= src
- src_base
;
1515 bcopy (src_base
, coding
->carryover
, coding
->carryover_size
);
1519 /* If this is the last block of the text to be encoded, we must
1520 reset graphic planes and registers to the initial state. */
1521 if (src
>= src_end
&& coding
->last_block
)
1523 ENCODE_RESET_PLANE_AND_REGISTER
;
1524 if (coding
->carryover_size
> 0
1525 && coding
->carryover_size
< (dst_end
- dst
))
1527 bcopy (coding
->carryover
, dst
, coding
->carryover_size
);
1528 dst
+= coding
->carryover_size
;
1529 coding
->carryover_size
= 0;
1532 *consumed
= src
- source
;
1533 return dst
- destination
;
1537 /*** 4. SJIS and BIG5 handlers ***/
1539 /* Although SJIS and BIG5 are not ISO's coding system, they are used
1540 quite widely. So, for the moment, Emacs supports them in the bare
1541 C code. But, in the future, they may be supported only by CCL. */
1543 /* SJIS is a coding system encoding three character sets: ASCII, right
1544 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
1545 as is. A character of charset katakana-jisx0201 is encoded by
1546 "position-code + 0x80". A character of charset japanese-jisx0208
1547 is encoded in 2-byte but two position-codes are divided and shifted
1548 so that they fit in the range below.
1550 --- CODE RANGE of SJIS ---
1551 (character set) (range)
1553 KATAKANA-JISX0201 0xA0 .. 0xDF
1554 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1555 (2nd byte) 0x40 .. 0xFF
1556 -------------------------------
1560 /* BIG5 is a coding system encoding two character sets: ASCII and
1561 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1562 character set and is encoded in two-byte.
1564 --- CODE RANGE of BIG5 ---
1565 (character set) (range)
1567 Big5 (1st byte) 0xA1 .. 0xFE
1568 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1569 --------------------------
1571 Since the number of characters in Big5 is larger than maximum
1572 characters in Emacs' charset (96x96), it can't be handled as one
1573 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
1574 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
1575 contains frequently used characters and the latter contains less
1576 frequently used characters. */
1578 /* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
1579 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
1580 C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
1581 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
1583 /* Number of Big5 characters which have the same code in 1st byte. */
/* The value is 157: 0xFF - 0xA1 = 94 second bytes in 0xA1 .. 0xFE plus
   0x7F - 0x40 = 63 second bytes in 0x40 .. 0x7E.  */
1584 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
/* DECODE_BIG5 (B1, B2, CHARSET, C1, C2) converts the BIG5 byte pair
   B1, B2 into CHARSET and the position codes C1, C2, all of which are
   assigned.  The pair is first turned into a linear index:
   (B1 - 0xA1) rows of BIG5_SAME_ROW codes, plus the column of B2,
   which is B2 - 0x40 for B2 below 0x7F and B2 - 0x62 otherwise.  For
   charset_big5_2 the index is rebased by (0xC9 - 0xA1) rows.  The
   index is then split into C1 and C2 with 0xFF - 0xA1 = 94 codes per
   row, each offset by 0x21.

   NOTE(review): the declaration of temp and the test that chooses
   between charset_big5_1 and charset_big5_2 are on lines absent from
   this copy.  The arguments are not parenthesized in the expansion,
   so pass plain variables.  */
1586 #define DECODE_BIG5(b1, b2, charset, c1, c2) \
1589 = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62); \
1591 charset = charset_big5_1; \
1594 charset = charset_big5_2; \
1595 temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW; \
1597 c1 = temp / (0xFF - 0xA1) + 0x21; \
1598 c2 = temp % (0xFF - 0xA1) + 0x21; \
/* ENCODE_BIG5 (CHARSET, C1, C2, B1, B2) converts the position codes
   C1, C2 of CHARSET (`charset_big5_1' or `charset_big5_2') into the
   BIG5 byte pair B1, B2, which must be assignable.

   C1 and C2 are first turned into a linear index with 94 codes per
   row; for `charset_big5_2' the index is offset by (0xC9 - 0xA1) BIG5
   rows.  The index is then split into a BIG5 row (B1, starting at
   0xA1) and a column 0 .. BIG5_SAME_ROW - 1; columns below 0x3F map
   to second bytes 0x40 .. 0x7E and the rest to 0xA1 .. 0xFE.

   Every argument is parenthesized so that an expression such as
   `c & 0x7F' may be passed safely.  The body is wrapped in
   `do ... while (0)' so that the local `temp' has its own scope and
   the macro behaves as a single statement.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)				\
  do {									\
    unsigned int temp = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);	\
    if ((charset) == charset_big5_2)					\
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);				\
    (b1) = temp / BIG5_SAME_ROW + 0xA1;					\
    (b2) = temp % BIG5_SAME_ROW;					\
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;					\
  } while (0)
1611 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1612 Check if a text is encoded in SJIS. If it is, return
1613 CODING_CATEGORY_MASK_SJIS, else return 0. */
/* Scan the bytes from SRC up to SRC_END.  The visible tests are: the
   byte C is ISO_CODE_ESC, ISO_CODE_SI or ISO_CODE_SO; and, for C in
   0x80 .. 0x9F or at least 0xE0, whether a following byte exists and
   is below 0x40 (that byte is consumed by the test).  The only
   visible return yields CODING_CATEGORY_MASK_SJIS.

   NOTE(review): the declaration and the read of C and the statements
   executed when the tests succeed are on lines absent from this copy;
   presumably they return 0 -- confirm.  */
1616 detect_coding_sjis (src
, src_end
)
1617 unsigned char *src
, *src_end
;
1621 while (src
< src_end
)
1624 if (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
)
1626 if ((c
>= 0x80 && c
< 0xA0) || c
>= 0xE0)
1628 if (src
< src_end
&& *src
++ < 0x40)
1632 return CODING_CATEGORY_MASK_SJIS
;
1635 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1636 Check if a text is encoded in BIG5. If it is, return
1637 CODING_CATEGORY_MASK_BIG5, else return 0. */
/* Scan the bytes from SRC up to SRC_END.  The visible tests are: the
   byte C is ISO_CODE_ESC, ISO_CODE_SI or ISO_CODE_SO; and whether C
   is below 0x40 or within 0x7F .. 0xA0.  The only visible return
   yields CODING_CATEGORY_MASK_BIG5.

   NOTE(review): the declaration and the reads of C, the condition
   under which the second test applies and the statements executed
   when the tests succeed are on lines absent from this copy;
   presumably the second test is made on the byte following a lead
   byte and both tests return 0 -- confirm.  */
1640 detect_coding_big5 (src
, src_end
)
1641 unsigned char *src
, *src_end
;
1645 while (src
< src_end
)
1648 if (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
)
1655 if (c
< 0x40 || (c
>= 0x7F && c
<= 0xA0))
1659 return CODING_CATEGORY_MASK_BIG5
;
1662 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1663 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */
/* Decode SRC_BYTES bytes at SOURCE into DESTINATION (room for
   DST_BYTES bytes).  Stores the number of source bytes used in
   *CONSUMED and returns the number of bytes written
   (dst - destination).

   The loop runs while source remains and dst is below dst_end - 3.
   Each round reads one byte C1 and, in the visible lines, produces a
   character with one of:
     DECODE_SJIS followed by DECODE_CHARACTER_DIMENSION2 with
       charset_jisx0208,
     DECODE_BIG5 followed by DECODE_CHARACTER_DIMENSION2 with the
       charset DECODE_BIG5 chose,
     DECODE_CHARACTER_DIMENSION1 with charset_katakana_jisx0201.
   Bytes between SRC_BASE and SRC are saved in the carryover buffer of
   CODING by the visible bcopy.

   NOTE(review): the numbering embedded in these lines skips; the
   tests on sjis_p, the reads of C2, the handling of plain bytes and
   of end-of-line, the braces and the labels are absent from this
   copy, so which branch applies to which input should be confirmed
   against a complete copy.  */
1666 decode_coding_sjis_big5 (coding
, source
, destination
,
1667 src_bytes
, dst_bytes
, consumed
, sjis_p
)
1668 struct coding_system
*coding
;
1669 unsigned char *source
, *destination
;
1670 int src_bytes
, dst_bytes
;
1674 unsigned char *src
= source
;
1675 unsigned char *src_end
= source
+ src_bytes
;
1676 unsigned char *dst
= destination
;
1677 unsigned char *dst_end
= destination
+ dst_bytes
;
1678 /* Since the maximum bytes produced by each loop is 4, we subtract 3
1679 from DST_END to assure overflow checking is necessary only at the
1681 unsigned char *adjusted_dst_end
= dst_end
- 3;
1683 while (src
< src_end
&& dst
< adjusted_dst_end
)
1685 /* SRC_BASE remembers the start position in source in each loop.
1686 The loop will be exited when there's not enough source text
1687 to analyze two-byte character (within macro ONE_MORE_BYTE).
1688 In that case, SRC is reset to SRC_BASE before exiting. */
1689 unsigned char *src_base
= src
;
1690 unsigned char c1
= *src
++, c2
, c3
, c4
;
1694 if (coding
->eol_type
== CODING_EOL_CRLF
)
1700 /* To process C2 again, SRC is subtracted by 1. */
1708 else if (c1
< 0xA0 || c1
>= 0xE0)
1710 /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1714 DECODE_SJIS (c1
, c2
, c3
, c4
);
1715 DECODE_CHARACTER_DIMENSION2 (charset_jisx0208
, c3
, c4
);
1717 else if (c1
>= 0xE0 && c1
< 0xFF)
1722 DECODE_BIG5 (c1
, c2
, charset
, c3
, c4
);
1723 DECODE_CHARACTER_DIMENSION2 (charset
, c3
, c4
);
1725 else /* Invalid code */
1730 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1732 DECODE_CHARACTER_DIMENSION1 (charset_katakana_jisx0201
, c1
);
1738 DECODE_BIG5 (c1
, c2
, charset
, c3
, c4
);
1739 DECODE_CHARACTER_DIMENSION2 (charset
, c3
, c4
);
1745 coding
->carryover_size
= src
- src_base
;
1746 bcopy (src_base
, coding
->carryover
, coding
->carryover_size
);
1751 *consumed
= src
- source
;
1752 return dst
- destination
;
1755 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1756 This function can encode `charset_ascii', `charset_katakana_jisx0201',
1757 `charset_jisx0208', `charset_big5_1', and `charset_big5_2'. We are
1758 sure that all these charsets are registered as official charset
1759 (i.e. do not have extended leading-codes). Characters of other
1760 charsets are produced without any encoding. If SJIS_P is 1, encode
1761 SJIS text, else encode BIG5 text. */
/* Encode SRC_BYTES bytes at SOURCE into DESTINATION (room for
   DST_BYTES bytes) as SJIS or BIG5 text.  Stores the number of source
   bytes used in *CONSUMED and returns the number of bytes written
   (dst - destination).

   The loop runs while source remains and dst is below dst_end - 1.
   Each round reads one byte C1, updates coding->composing, and
   dispatches on emacs_code_class[c1].  Visible behavior per class:
     linefeed        the end-of-line bytes depend on coding->eol_type;
                     "\r\n" is written for CODING_EOL_CRLF;
     leading_code_2  tested against sjis_p and
                     charset_katakana_jisx0201; the visible write
                     copies C1 and C2 unchanged;
     leading_code_3  C2 and C3 are masked with 0x7F, then encoded by
                     ENCODE_SJIS (sjis_p and charset_jisx0208) or by
                     ENCODE_BIG5 (not sjis_p and a Big5 charset); the
                     last visible write copies C1, C2, C3;
     leading_code_4  C1 .. C4 are copied unchanged;
     composition     coding->composing is set to 1.
   Bytes between SRC_BASE and SRC are saved in the carryover buffer of
   CODING by the visible bcopy.

   NOTE(review): the numbering embedded in these lines skips; braces,
   break statements, else lines, the reads of C2 and several writes
   are absent from this copy, so the exact control flow should be
   confirmed against a complete copy.  */
1764 encode_coding_sjis_big5 (coding
, source
, destination
,
1765 src_bytes
, dst_bytes
, consumed
, sjis_p
)
1766 struct coding_system
*coding
;
1767 unsigned char *source
, *destination
;
1768 int src_bytes
, dst_bytes
;
1772 unsigned char *src
= source
;
1773 unsigned char *src_end
= source
+ src_bytes
;
1774 unsigned char *dst
= destination
;
1775 unsigned char *dst_end
= destination
+ dst_bytes
;
1776 /* Since the maximum bytes produced by each loop is 2, we subtract 1
1777 from DST_END to assure overflow checking is necessary only at the
1779 unsigned char *adjusted_dst_end
= dst_end
- 1;
1781 while (src
< src_end
&& dst
< adjusted_dst_end
)
1783 /* SRC_BASE remembers the start position in source in each loop.
1784 The loop will be exited when there's not enough source text
1785 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1786 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
1788 unsigned char *src_base
= src
;
1789 unsigned char c1
= *src
++, c2
, c3
, c4
;
1791 if (coding
->composing
)
1798 else if (c1
>= 0xA0)
1801 coding
->composing
= 0;
1804 switch (emacs_code_class
[c1
])
1806 case EMACS_ascii_code
:
1807 case EMACS_control_code
:
1811 case EMACS_carriage_return_code
:
1812 if (!coding
->selective
)
1817 /* fall down to treat '\r' as '\n' ... */
1819 case EMACS_linefeed_code
:
1820 if (coding
->eol_type
== CODING_EOL_LF
1821 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
1823 else if (coding
->eol_type
== CODING_EOL_CRLF
)
1824 *dst
++ = '\r', *dst
++ = '\n';
1829 case EMACS_leading_code_2
:
1831 if (sjis_p
&& c1
== charset_katakana_jisx0201
)
1834 *dst
++ = c1
, *dst
++ = c2
;
1837 case EMACS_leading_code_3
:
1838 TWO_MORE_BYTES (c2
, c3
);
/* Strip the high bit so that C2 and C3 become 7-bit position codes
   before they are handed to ENCODE_SJIS or ENCODE_BIG5.  */
1839 c2
&= 0x7F, c3
&= 0x7F;
1840 if (sjis_p
&& c1
== charset_jisx0208
)
1842 unsigned char s1
, s2
;
1844 ENCODE_SJIS (c2
, c3
, s1
, s2
);
1845 *dst
++ = s1
, *dst
++ = s2
;
1847 else if (!sjis_p
&& (c1
== charset_big5_1
|| c1
== charset_big5_2
))
1849 unsigned char b1
, b2
;
1851 ENCODE_BIG5 (c1
, c2
, c3
, b1
, b2
);
1852 *dst
++ = b1
, *dst
++ = b2
;
1855 *dst
++ = c1
, *dst
++ = c2
, *dst
++ = c3
;
1858 case EMACS_leading_code_4
:
1859 THREE_MORE_BYTES (c2
, c3
, c4
);
1860 *dst
++ = c1
, *dst
++ = c2
, *dst
++ = c3
, *dst
++ = c4
;
1863 case EMACS_leading_code_composition
:
1864 coding
->composing
= 1;
1867 default: /* i.e. case EMACS_invalid_code: */
1873 coding
->carryover_size
= src
- src_base
;
1874 bcopy (src_base
, coding
->carryover
, coding
->carryover_size
);
1879 *consumed
= src
- source
;
1880 return dst
- destination
;
1884 /*** 5. End-of-line handlers ***/
1886 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1887 This function is called only when `coding->eol_type' is
1888 CODING_EOL_CRLF or CODING_EOL_CR. */
/* Convert the end-of-line format of SRC_BYTES bytes at SOURCE while
   copying them to DESTINATION (room for DST_BYTES bytes), according
   to coding->eol_type.  *CONSUMED receives the number of source bytes
   used; the byte count written is kept in the variable produced.

   Visible branches:
     CODING_EOL_CRLF  a byte-by-byte loop that runs while source
                      remains and dst is below dst_end - 1; bytes
                      between SRC_BASE and SRC can be saved in the
                      carryover buffer of CODING;
     next branch      copy min (SRC_BYTES, DST_BYTES) bytes, then
                      replace every '\r' in the copy by '\n';
     default          copy min (SRC_BYTES, DST_BYTES) bytes unchanged.

   NOTE(review): the numbering embedded in these lines skips; the
   declaration of produced, the body of the CRLF loop, the label of
   the second branch (presumably CODING_EOL_CR), the break statements
   and the return are absent from this copy.  */
1890 decode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
, consumed
)
1891 struct coding_system
*coding
;
1892 unsigned char *source
, *destination
;
1893 int src_bytes
, dst_bytes
;
1896 unsigned char *src
= source
;
1897 unsigned char *src_end
= source
+ src_bytes
;
1898 unsigned char *dst
= destination
;
1899 unsigned char *dst_end
= destination
+ dst_bytes
;
1902 switch (coding
->eol_type
)
1904 case CODING_EOL_CRLF
:
1906 /* Since the maximum bytes produced by each loop is 2, we
1907 subtract 1 from DST_END to assure overflow checking is
1908 necessary only at the head of loop. */
1909 unsigned char *adjusted_dst_end
= dst_end
- 1;
1911 while (src
< src_end
&& dst
< adjusted_dst_end
)
1913 unsigned char *src_base
= src
;
1914 unsigned char c
= *src
++;
1927 coding
->carryover_size
= src
- src_base
;
1928 bcopy (src_base
, coding
->carryover
, coding
->carryover_size
);
1932 *consumed
= src
- source
;
1933 produced
= dst
- destination
;
/* Copy as many bytes as fit, then rewrite the copy in place.  */
1938 produced
= (src_bytes
> dst_bytes
) ? dst_bytes
: src_bytes
;
1939 bcopy (source
, destination
, produced
);
1940 dst_end
= destination
+ produced
;
1941 while (dst
< dst_end
)
1942 if (*dst
++ == '\r') dst
[-1] = '\n';
1943 *consumed
= produced
;
1946 default: /* i.e. case: CODING_EOL_LF */
1947 produced
= (src_bytes
> dst_bytes
) ? dst_bytes
: src_bytes
;
1948 bcopy (source
, destination
, produced
);
1949 *consumed
= produced
;
1956 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
1957 format of end-of-line according to `coding->eol_type'. If
1958 `coding->selective' is 1, code '\r' in source text also means end-of-line. */
/* Convert the end-of-line format of SRC_BYTES bytes at SOURCE while
   copying them to DESTINATION (room for DST_BYTES bytes), according
   to coding->eol_type.  *CONSUMED receives the number of source bytes
   used; the byte count written is kept in the variable produced.

   Visible branches:
     CODING_EOL_UNDECIDED  copy min (SRC_BYTES, DST_BYTES) bytes; the
                           '\r' to '\n' rewrite follows the test of
                           coding->selective;
     CODING_EOL_CRLF       a byte-by-byte loop that runs while source
                           remains and dst is below dst_end - 1; a
                           '\n', or a '\r' when coding->selective is
                           set, is written as "\r\n";
     default               copy min (SRC_BYTES, DST_BYTES) bytes and
                           rewrite '\n' as '\r'.

   NOTE(review): the numbering embedded in these lines skips; the
   declaration of produced, the read of C, the loops around the two
   in-place rewrites, the copy of ordinary bytes in the CRLF loop, the
   break statements and the return are absent from this copy.  */
1961 encode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
, consumed
)
1962 struct coding_system
*coding
;
1963 unsigned char *source
, *destination
;
1964 int src_bytes
, dst_bytes
;
1967 unsigned char *src
= source
;
1968 unsigned char *dst
= destination
;
1974 switch (coding
->eol_type
)
1977 case CODING_EOL_UNDECIDED
:
1978 produced
= (src_bytes
> dst_bytes
) ? dst_bytes
: src_bytes
;
1979 bcopy (source
, destination
, produced
);
1980 if (coding
->selective
)
1984 if (*dst
++ == '\r') dst
[-1] = '\n';
1986 *consumed
= produced
;
1988 case CODING_EOL_CRLF
:
1991 unsigned char *src_end
= source
+ src_bytes
;
1992 unsigned char *dst_end
= destination
+ dst_bytes
;
1993 /* Since the maximum bytes produced by each loop is 2, we
1994 subtract 1 from DST_END to assure overflow checking is
1995 necessary only at the head of loop. */
1996 unsigned char *adjusted_dst_end
= dst_end
- 1;
1998 while (src
< src_end
&& dst
< adjusted_dst_end
)
2001 if (c
== '\n' || (c
== '\r' && coding
->selective
))
2002 *dst
++ = '\r', *dst
++ = '\n';
2006 produced
= dst
- destination
;
2007 *consumed
= src
- source
;
2011 default: /* i.e. case CODING_EOL_CR: */
2012 produced
= (src_bytes
> dst_bytes
) ? dst_bytes
: src_bytes
;
2013 bcopy (source
, destination
, produced
);
2017 if (*dst
++ == '\n') dst
[-1] = '\r';
2019 *consumed
= produced
;
2026 /*** 6. C library functions ***/
2028 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2029 has a property `coding-system'. The value of this property is a
2030 vector of length 5 (called a coding-vector). Among elements of
2031 this vector, the first (element[0]) and the fifth (element[4])
2032 carry important information for decoding/encoding. Before
2033 decoding/encoding, this information should be set in fields of a
2034 structure of type `coding_system'.
2036 A value of property `coding-system' can be a symbol of another
2037 subsidiary coding-system. In that case, Emacs gets coding-vector
2040 `element[0]' contains information to be set in `coding->type'. The
2041 value and its meaning is as follows:
2043 0 -- coding_type_emacs_mule
2044 1 -- coding_type_sjis
2045 2 -- coding_type_iso2022
2046 3 -- coding_type_big5
2047 4 -- coding_type_ccl encoder/decoder written in CCL
2048 nil -- coding_type_no_conversion
2049 t -- coding_type_undecided (automatic conversion on decoding,
2050 no-conversion on encoding)
2052 `element[4]' contains information to be set in `coding->flags' and
2053 `coding->spec'. The meaning varies by `coding->type'.
2055 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2056 of length 32 (of which the first 15 sub-elements are used now).
2057 Meanings of these sub-elements are:
2059 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2060 If the value is an integer of valid charset, the charset is
2061 assumed to be designated to graphic register N initially.
2063 If the value is minus, it is a minus value of charset which
2064 reserves graphic register N, which means that the charset is
2065 not designated initially but should be designated to graphic
2066 register N just before encoding a character in that charset.
2068 If the value is nil, graphic register N is never used on
2071 sub-element[N] where N is 4 through 14: to be set in `coding->flags'
2072 Each value takes t or nil. See the section ISO2022 of
2073 `coding.h' for more information.
2075 If `coding->type' is `coding_type_big5', element[4] is t to denote
2076 BIG5-ETen or nil to denote BIG5-HKU.
2078 If `coding->type' takes the other value, element[4] is ignored.
2080 Emacs Lisp's coding system also carries information about format of
2081 end-of-line in a value of property `eol-type'. If the value is
2082 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2083 means CODING_EOL_CR. If it is not integer, it should be a vector
2084 of subsidiary coding systems of which property `eol-type' has one
2089 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2090 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2091 is setup so that no conversion is necessary and return -1, else
2095 setup_coding_system (coding_system
, coding
)
2096 Lisp_Object coding_system
;
2097 struct coding_system
*coding
;
2099 Lisp_Object type
, eol_type
;
2101 /* At first, set several fields default values. */
2102 coding
->require_flushing
= 0;
2103 coding
->last_block
= 0;
2104 coding
->selective
= 0;
2105 coding
->composing
= 0;
2106 coding
->direction
= 0;
2107 coding
->carryover_size
= 0;
2108 coding
->post_read_conversion
= coding
->pre_write_conversion
= Qnil
;
2109 /* We have not yet implemented a way to specify unification table in
2111 coding
->character_unification_table
= Qnil
;
2113 Vlast_coding_system_used
= coding
->symbol
= coding_system
;
2115 /* Get value of property `coding-system' until we get a vector.
2116 While doing that, also get values of properties
2117 `post-read-conversion', `pre-write-conversion', and `eol-type'. */
2118 while (!NILP (coding_system
) && SYMBOLP (coding_system
))
2120 if (NILP (coding
->post_read_conversion
))
2121 coding
->post_read_conversion
= Fget (coding_system
,
2122 Qpost_read_conversion
);
2123 if (NILP (coding
->pre_write_conversion
))
2124 coding
->pre_write_conversion
= Fget (coding_system
,
2125 Qpre_write_conversion
);
2126 if (NILP (eol_type
))
2127 eol_type
= Fget (coding_system
, Qeol_type
);
2128 coding_system
= Fget (coding_system
, Qcoding_system
);
2130 if (!VECTORP (coding_system
)
2131 || XVECTOR (coding_system
)->size
!= 5)
2132 goto label_invalid_coding_system
;
2134 if (VECTORP (eol_type
))
2135 coding
->eol_type
= CODING_EOL_UNDECIDED
;
2136 else if (XFASTINT (eol_type
) == 1)
2137 coding
->eol_type
= CODING_EOL_CRLF
;
2138 else if (XFASTINT (eol_type
) == 2)
2139 coding
->eol_type
= CODING_EOL_CR
;
2141 coding
->eol_type
= CODING_EOL_LF
;
2143 type
= XVECTOR (coding_system
)->contents
[0];
2144 switch (XFASTINT (type
))
2147 coding
->type
= coding_type_emacs_mule
;
2151 coding
->type
= coding_type_sjis
;
2155 coding
->type
= coding_type_iso2022
;
2157 Lisp_Object val
= XVECTOR (coding_system
)->contents
[4];
2159 int i
, charset
, default_reg_bits
= 0;
2161 if (!VECTORP (val
) || XVECTOR (val
)->size
!= 32)
2162 goto label_invalid_coding_system
;
2164 flags
= XVECTOR (val
)->contents
;
2166 = ((NILP (flags
[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM
)
2167 | (NILP (flags
[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL
)
2168 | (NILP (flags
[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL
)
2169 | (NILP (flags
[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS
)
2170 | (NILP (flags
[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT
)
2171 | (NILP (flags
[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT
)
2172 | (NILP (flags
[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN
)
2173 | (NILP (flags
[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS
)
2174 | (NILP (flags
[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION
)
2175 | (NILP (flags
[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL
)
2176 | (NILP (flags
[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL
));
2178 /* Invoke graphic register 0 to plane 0. */
2179 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 0;
2180 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2181 CODING_SPEC_ISO_INVOCATION (coding
, 1)
2182 = (coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
? -1 : 1);
2183 /* Not single shifting at first. */
2184 CODING_SPEC_ISO_SINGLE_SHIFTING(coding
) = 0;
2185 /* Beginning of buffer should also be regarded as bol. */
2186 CODING_SPEC_ISO_BOL(coding
) = 1;
2188 /* Checks FLAGS[REG] (REG = 0, 1, 2, 3) and decides designations.
2189 FLAGS[REG] can be one of below:
2190 integer CHARSET: CHARSET occupies register REG,
2191 t: designate nothing to REG initially, but can be used
2193 list of integer, nil, or t: designate the first
2194 element (if integer) to REG initially, the remaining
2195 elements (if integer) is designated to REG on request,
2196 if an element is t, REG can be used by any charset,
2197 nil: REG is never used. */
2198 for (charset
= 0; charset
<= MAX_CHARSET
; charset
++)
2199 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
2200 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
;
2201 for (i
= 0; i
< 4; i
++)
2203 if (INTEGERP (flags
[i
])
2204 && (charset
= XINT (flags
[i
]), CHARSET_VALID_P (charset
))
2205 || (charset
= get_charset_id (flags
[i
])) >= 0)
2207 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = charset
;
2208 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
) = i
;
2210 else if (EQ (flags
[i
], Qt
))
2212 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
2213 default_reg_bits
|= 1 << i
;
2215 else if (CONSP (flags
[i
]))
2217 Lisp_Object tail
= flags
[i
];
2219 if (INTEGERP (XCONS (tail
)->car
)
2220 && (charset
= XINT (XCONS (tail
)->car
),
2221 CHARSET_VALID_P (charset
))
2222 || (charset
= get_charset_id (XCONS (tail
)->car
)) >= 0)
2224 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = charset
;
2225 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
) =i
;
2228 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
2229 tail
= XCONS (tail
)->cdr
;
2230 while (CONSP (tail
))
2232 if (INTEGERP (XCONS (tail
)->car
)
2233 && (charset
= XINT (XCONS (tail
)->car
),
2234 CHARSET_VALID_P (charset
))
2235 || (charset
= get_charset_id (XCONS (tail
)->car
)) >= 0)
2236 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
2238 else if (EQ (XCONS (tail
)->car
, Qt
))
2239 default_reg_bits
|= 1 << i
;
2240 tail
= XCONS (tail
)->cdr
;
2244 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
2246 CODING_SPEC_ISO_DESIGNATION (coding
, i
)
2247 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
);
2250 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
))
2252 /* REG 1 can be used only by locking shift in 7-bit env. */
2253 if (coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
)
2254 default_reg_bits
&= ~2;
2255 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
))
2256 /* Without any shifting, only REG 0 and 1 can be used. */
2257 default_reg_bits
&= 3;
2260 for (charset
= 0; charset
<= MAX_CHARSET
; charset
++)
2261 if (CHARSET_VALID_P (charset
)
2262 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
2263 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
))
2265 /* We have not yet decided where to designate CHARSET. */
2266 int reg_bits
= default_reg_bits
;
2268 if (CHARSET_CHARS (charset
) == 96)
2269 /* A charset of CHARS96 can't be designated to REG 0. */
2273 /* There exist some default graphic register. */
2274 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
2276 ? 0 : (reg_bits
& 2 ? 1 : (reg_bits
& 4 ? 2 : 3)));
2278 /* We anyway have to designate CHARSET to somewhere. */
2279 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
2280 = (CHARSET_CHARS (charset
) == 94
2282 : ((coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
2283 || ! coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
)
2285 : (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
2289 coding
->require_flushing
= 1;
2293 coding
->type
= coding_type_big5
;
2295 = (NILP (XVECTOR (coding_system
)->contents
[4])
2296 ? CODING_FLAG_BIG5_HKU
2297 : CODING_FLAG_BIG5_ETEN
);
2301 coding
->type
= coding_type_ccl
;
2303 Lisp_Object val
= XVECTOR (coding_system
)->contents
[4];
2305 && VECTORP (XCONS (val
)->car
)
2306 && VECTORP (XCONS (val
)->cdr
))
2308 setup_ccl_program (&(coding
->spec
.ccl
.decoder
), XCONS (val
)->car
);
2309 setup_ccl_program (&(coding
->spec
.ccl
.encoder
), XCONS (val
)->cdr
);
2312 goto label_invalid_coding_system
;
2314 coding
->require_flushing
= 1;
2319 coding
->type
= coding_type_undecided
;
2321 coding
->type
= coding_type_no_conversion
;
2326 label_invalid_coding_system
:
2327 coding
->type
= coding_type_no_conversion
;
2328 coding
->eol_type
= CODING_EOL_LF
;
2329 coding
->symbol
= coding
->pre_write_conversion
= coding
->post_read_conversion
2334 /* Emacs has a mechanism to automatically detect a coding system if it
2335 is one of Emacs' internal format, ISO2022, SJIS, or BIG5. However,
2336 it's impossible to distinguish some coding systems accurately
2337 because they use the same range of codes. So, at first, coding
2338 systems are categorized into the following 8 categories:
2340 o coding-category-emacs-mule
2342 The category for a coding system which has the same code range
2343 as Emacs' internal format. Assigned the coding-system (Lisp
2344 symbol) `emacs-mule' by default.
2346 o coding-category-sjis
2348 The category for a coding system which has the same code range
2349 as SJIS. Assigned the coding-system (Lisp
2350 symbol) `shift-jis' by default.
2352 o coding-category-iso-7
2354 The category for a coding system which has the same code range
2355 as ISO2022 of 7-bit environment. Assigned the coding-system
2356 (Lisp symbol) `iso-2022-7' by default.
2358 o coding-category-iso-8-1
2360 The category for a coding system which has the same code range
2361 as ISO2022 of 8-bit environment and graphic plane 1 used only
2362 for DIMENSION1 charset. Assigned the coding-system (Lisp
2363 symbol) `iso-8859-1' by default.
2365 o coding-category-iso-8-2
2367 The category for a coding system which has the same code range
2368 as ISO2022 of 8-bit environment and graphic plane 1 used only
2369 for DIMENSION2 charset. Assigned the coding-system (Lisp
2370 symbol) `euc-japan' by default.
2372 o coding-category-iso-else
2374 The category for a coding system which has the same code range
2375 as ISO2022 but does not belong to any of the above three
2376 categories. Assigned the coding-system (Lisp symbol)
2377 `iso-2022-ss2-7' by default.
2379 o coding-category-big5
2381 The category for a coding system which has the same code range
2382 as BIG5. Assigned the coding-system (Lisp symbol)
2383 `cn-big5' by default.
2385 o coding-category-binary
2387 The category for a coding system not categorized in any of the
2388 above. Assigned the coding-system (Lisp symbol)
2389 `no-conversion' by default.
2391 Each of them is a Lisp symbol whose value is an actual
2392 `coding-system' (this is also a Lisp symbol) assigned by a user.
2393 What Emacs actually does is detect the category of a coding system.
2394 Then, it uses the `coding-system' assigned to that category. If Emacs
2395 can't narrow the text down to a single category, it selects the
2396 possible category of the highest priority. Priorities of categories are
2397 also specified by a user in the Lisp variable `coding-category-list'.
2401 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2402 If it detects possible coding systems, return an integer in which
2403 appropriate flag bits are set. Flag bits are defined by macros
2404 CODING_CATEGORY_MASK_XXX in `coding.h'. */
2407 detect_coding_mask (src
, src_bytes
)
2411 register unsigned char c
;
2412 unsigned char *src_end
= src
+ src_bytes
;
2415 /* At first, skip all ASCII characters and control characters except
2416 for three ISO2022 specific control characters. */
2417 label_loop_detect_coding
:
2418 while (src
< src_end
)
2422 || (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
))
2428 /* We found nothing other than ASCII. There's nothing to do. */
2429 return CODING_CATEGORY_MASK_ANY
;
2431 /* The text seems to be encoded in some multilingual coding system.
2432 Now, try to find in which coding system the text is encoded. */
2435 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2436 /* C is an ISO2022 specific control code of C0. */
2437 mask
= detect_coding_iso2022 (src
, src_end
);
2439 if (mask
== CODING_CATEGORY_MASK_ANY
)
2440 /* No valid ISO2022 code follows C. Try again. */
2441 goto label_loop_detect_coding
;
2443 else if (c
== ISO_CODE_SS2
|| c
== ISO_CODE_SS3
|| c
== ISO_CODE_CSI
)
2444 /* C is an ISO2022 specific control code of C1,
2445 or the first byte of SJIS's 2-byte character code,
2446 or a leading code of Emacs. */
2447 mask
= (detect_coding_iso2022 (src
, src_end
)
2448 | detect_coding_sjis (src
, src_end
)
2449 | detect_coding_emacs_mule (src
, src_end
));
2452 /* C is the first byte of SJIS character code,
2453 or a leading-code of Emacs. */
2454 mask
= (detect_coding_sjis (src
, src_end
)
2455 | detect_coding_emacs_mule (src
, src_end
));
2458 /* C is a character of ISO2022 in graphic plane right,
2459 or a SJIS's 1-byte character code (i.e. JISX0201),
2460 or the first byte of BIG5's 2-byte code. */
2461 mask
= (detect_coding_iso2022 (src
, src_end
)
2462 | detect_coding_sjis (src
, src_end
)
2463 | detect_coding_big5 (src
, src_end
));
2468 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2469 The information of the detected coding system is set in CODING. */
2472 detect_coding (coding
, src
, src_bytes
)
2473 struct coding_system
*coding
;
2477 int mask
= detect_coding_mask (src
, src_bytes
);
2480 if (mask
== CODING_CATEGORY_MASK_ANY
)
2481 /* We found nothing other than ASCII. There's nothing to do. */
2485 /* The source text seems to be encoded in an unknown coding system.
2486 Emacs regards the category of such a coding system as
2487 `coding-category-binary'. We assume that a user has assigned
2488 an appropriate coding system to `coding-category-binary'. */
2489 idx
= CODING_CATEGORY_IDX_BINARY
;
2492 /* We found some plausible coding systems. Let's use a coding
2493 system of the highest priority. */
2494 Lisp_Object val
= Vcoding_category_list
;
2499 idx
= XFASTINT (Fget (XCONS (val
)->car
, Qcoding_category_index
));
2500 if ((idx
< CODING_CATEGORY_IDX_MAX
) && (mask
& (1 << idx
)))
2502 val
= XCONS (val
)->cdr
;
2509 /* For unknown reason, `Vcoding_category_list' contains none
2510 of found categories. Let's use any of them. */
2511 for (idx
= 0; idx
< CODING_CATEGORY_IDX_MAX
; idx
++)
2512 if (mask
& (1 << idx
))
2516 setup_coding_system (XSYMBOL (coding_category_table
[idx
])->value
, coding
);
2519 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2520 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2521 CODING_EOL_CR, and CODING_EOL_UNDECIDED. */
2524 detect_eol_type (src
, src_bytes
)
2528 unsigned char *src_end
= src
+ src_bytes
;
2531 while (src
< src_end
)
2535 return CODING_EOL_LF
;
2538 if (src
< src_end
&& *src
== '\n')
2539 return CODING_EOL_CRLF
;
2541 return CODING_EOL_CR
;
2544 return CODING_EOL_UNDECIDED
;
2547 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2548 is encoded. If it detects an appropriate format of end-of-line, it
2549 sets the information in *CODING. */
2552 detect_eol (coding
, src
, src_bytes
)
2553 struct coding_system
*coding
;
2558 int eol_type
= detect_eol_type (src
, src_bytes
);
2560 if (eol_type
== CODING_EOL_UNDECIDED
)
2561 /* We found no end-of-line in the source text. */
2564 val
= Fget (coding
->symbol
, Qeol_type
);
2565 if (VECTORP (val
) && XVECTOR (val
)->size
== 3)
2566 setup_coding_system (XVECTOR (val
)->contents
[eol_type
], coding
);
2569 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
2570 decoding, it may detect coding system and format of end-of-line if
2571 those are not yet decided. */
2574 decode_coding (coding
, source
, destination
, src_bytes
, dst_bytes
, consumed
)
2575 struct coding_system
*coding
;
2576 unsigned char *source
, *destination
;
2577 int src_bytes
, dst_bytes
;
2588 if (coding
->type
== coding_type_undecided
)
2589 detect_coding (coding
, source
, src_bytes
);
2591 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
2592 detect_eol (coding
, source
, src_bytes
);
2594 coding
->carryover_size
= 0;
2595 switch (coding
->type
)
2597 case coding_type_no_conversion
:
2598 label_no_conversion
:
2599 produced
= (src_bytes
> dst_bytes
) ? dst_bytes
: src_bytes
;
2600 bcopy (source
, destination
, produced
);
2601 *consumed
= produced
;
2604 case coding_type_emacs_mule
:
2605 case coding_type_undecided
:
2606 if (coding
->eol_type
== CODING_EOL_LF
2607 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
2608 goto label_no_conversion
;
2609 produced
= decode_eol (coding
, source
, destination
,
2610 src_bytes
, dst_bytes
, consumed
);
2613 case coding_type_sjis
:
2614 produced
= decode_coding_sjis_big5 (coding
, source
, destination
,
2615 src_bytes
, dst_bytes
, consumed
,
2619 case coding_type_iso2022
:
2620 produced
= decode_coding_iso2022 (coding
, source
, destination
,
2621 src_bytes
, dst_bytes
, consumed
);
2624 case coding_type_big5
:
2625 produced
= decode_coding_sjis_big5 (coding
, source
, destination
,
2626 src_bytes
, dst_bytes
, consumed
,
2630 case coding_type_ccl
:
2631 produced
= ccl_driver (&coding
->spec
.ccl
.decoder
, source
, destination
,
2632 src_bytes
, dst_bytes
, consumed
);
2639 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
2642 encode_coding (coding
, source
, destination
, src_bytes
, dst_bytes
, consumed
)
2643 struct coding_system
*coding
;
2644 unsigned char *source
, *destination
;
2645 int src_bytes
, dst_bytes
;
2650 coding
->carryover_size
= 0;
2651 switch (coding
->type
)
2653 case coding_type_no_conversion
:
2654 label_no_conversion
:
2655 produced
= (src_bytes
> dst_bytes
) ? dst_bytes
: src_bytes
;
2658 bcopy (source
, destination
, produced
);
2659 if (coding
->selective
)
2661 unsigned char *p
= destination
, *pend
= destination
+ produced
;
2663 if (*p
++ == '\015') p
[-1] = '\n';
2666 *consumed
= produced
;
2669 case coding_type_emacs_mule
:
2670 case coding_type_undecided
:
2671 if (coding
->eol_type
== CODING_EOL_LF
2672 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
2673 goto label_no_conversion
;
2674 produced
= encode_eol (coding
, source
, destination
,
2675 src_bytes
, dst_bytes
, consumed
);
2678 case coding_type_sjis
:
2679 produced
= encode_coding_sjis_big5 (coding
, source
, destination
,
2680 src_bytes
, dst_bytes
, consumed
,
2684 case coding_type_iso2022
:
2685 produced
= encode_coding_iso2022 (coding
, source
, destination
,
2686 src_bytes
, dst_bytes
, consumed
);
2689 case coding_type_big5
:
2690 produced
= encode_coding_sjis_big5 (coding
, source
, destination
,
2691 src_bytes
, dst_bytes
, consumed
,
2695 case coding_type_ccl
:
2696 produced
= ccl_driver (&coding
->spec
.ccl
.encoder
, source
, destination
,
2697 src_bytes
, dst_bytes
, consumed
);
2704 #define CONVERSION_BUFFER_EXTRA_ROOM 256
2706 /* Return the maximum size (in bytes) of a buffer sufficient for decoding
2707 SRC_BYTES of text encoded in CODING. */
2710 decoding_buffer_size (coding
, src_bytes
)
2711 struct coding_system
*coding
;
2716 if (coding
->type
== coding_type_iso2022
)
2718 else if (coding
->type
== coding_type_ccl
)
2719 magnification
= coding
->spec
.ccl
.decoder
.buf_magnification
;
2723 return (src_bytes
* magnification
+ CONVERSION_BUFFER_EXTRA_ROOM
);
2726 /* Return the maximum size (in bytes) of a buffer sufficient for encoding
2727 SRC_BYTES of text to CODING. */
2730 encoding_buffer_size (coding
, src_bytes
)
2731 struct coding_system
*coding
;
2736 if (coding
->type
== coding_type_ccl
)
2737 magnification
= coding
->spec
.ccl
.encoder
.buf_magnification
;
2741 return (src_bytes
* magnification
+ CONVERSION_BUFFER_EXTRA_ROOM
);
2744 #ifndef MINIMUM_CONVERSION_BUFFER_SIZE
2745 #define MINIMUM_CONVERSION_BUFFER_SIZE 1024
2748 char *conversion_buffer
;
2749 int conversion_buffer_size
;
2751 /* Return a pointer to a buffer of at least SIZE bytes to be used for
2752 encoding or decoding. Sufficient memory is allocated automatically. If we
2753 run out of memory, return NULL. */
2756 get_conversion_buffer (size
)
2759 if (size
> conversion_buffer_size
)
2762 int real_size
= conversion_buffer_size
* 2;
2764 while (real_size
< size
) real_size
*= 2;
2765 buf
= (char *) xmalloc (real_size
);
2766 xfree (conversion_buffer
);
2767 conversion_buffer
= buf
;
2768 conversion_buffer_size
= real_size
;
2770 return conversion_buffer
;
2775 /*** 7. Emacs Lisp library functions ***/
2777 DEFUN ("coding-system-spec", Fcoding_system_spec
, Scoding_system_spec
,
2779 "Return coding-spec of CODING-SYSTEM.\n\
2780 If CODING-SYSTEM is not a valid coding-system, return nil.")
2784 while (SYMBOLP (obj
) && !NILP (obj
))
2785 obj
= Fget (obj
, Qcoding_system
);
2786 return ((NILP (obj
) || !VECTORP (obj
) || XVECTOR (obj
)->size
!= 5)
2790 DEFUN ("coding-system-p", Fcoding_system_p
, Scoding_system_p
, 1, 1, 0,
2791 "Return t if OBJECT is nil or a coding-system.\n\
2792 See document of make-coding-system for coding-system object.")
2796 return ((NILP (obj
) || !NILP (Fcoding_system_spec (obj
))) ? Qt
: Qnil
);
2799 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system
,
2800 Sread_non_nil_coding_system
, 1, 1, 0,
2801 "Read a coding system from the minibuffer, prompting with string PROMPT.")
2808 val
= Fcompleting_read (prompt
, Vobarray
, Qcoding_system_spec
,
2809 Qt
, Qnil
, Qnil
, Qnil
);
2811 while (XSTRING (val
)->size
== 0);
2812 return (Fintern (val
, Qnil
));
2815 DEFUN ("read-coding-system", Fread_coding_system
, Sread_coding_system
, 1, 1, 0,
2816 "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
2820 Lisp_Object val
= Fcompleting_read (prompt
, Vobarray
, Qcoding_system_p
,
2821 Qt
, Qnil
, Qnil
, Qnil
);
2822 return (XSTRING (val
)->size
== 0 ? Qnil
: Fintern (val
, Qnil
));
2825 DEFUN ("check-coding-system", Fcheck_coding_system
, Scheck_coding_system
,
2827 "Check validity of CODING-SYSTEM.\n\
2828 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
2829 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
2830 The value of property should be a vector of length 5.")
2832 Lisp_Object coding_system
;
2834 CHECK_SYMBOL (coding_system
, 0);
2835 if (!NILP (Fcoding_system_p (coding_system
)))
2836 return coding_system
;
2838 Fsignal (Qcoding_system_error
, Fcons (coding_system
, Qnil
));
2841 DEFUN ("detect-coding-region", Fdetect_coding_region
, Sdetect_coding_region
,
2843 "Detect coding-system of the text in the region between START and END.\n\
2844 Return a list of possible coding-systems ordered by priority.\n\
2845 If only ASCII characters are found, it returns `undecided'\n\
2846 or its subsidiary coding-system according to a detected end-of-line format.")
2850 int coding_mask
, eol_type
;
2854 validate_region (&b
, &e
);
2855 beg
= XINT (b
), end
= XINT (e
);
2856 if (beg
< GPT
&& end
>= GPT
) move_gap (end
);
2858 coding_mask
= detect_coding_mask (POS_ADDR (beg
), end
- beg
);
2859 eol_type
= detect_eol_type (POS_ADDR (beg
), end
- beg
);
2861 if (coding_mask
== CODING_CATEGORY_MASK_ANY
)
2863 val
= intern ("undecided");
2864 if (eol_type
!= CODING_EOL_UNDECIDED
)
2866 Lisp_Object val2
= Fget (val
, Qeol_type
);
2868 val
= XVECTOR (val2
)->contents
[eol_type
];
2875 /* At first, gather possible coding-systems in VAL in a reverse
2878 for (val2
= Vcoding_category_list
;
2880 val2
= XCONS (val2
)->cdr
)
2883 = XFASTINT (Fget (XCONS (val2
)->car
, Qcoding_category_index
));
2884 if (coding_mask
& (1 << idx
))
2885 val
= Fcons (Fsymbol_value (XCONS (val2
)->car
), val
);
2888 /* Then, change the order of the list, while getting subsidiary
2892 for (; !NILP (val2
); val2
= XCONS (val2
)->cdr
)
2894 if (eol_type
== CODING_EOL_UNDECIDED
)
2895 val
= Fcons (XCONS (val2
)->car
, val
);
2898 Lisp_Object val3
= Fget (XCONS (val2
)->car
, Qeol_type
);
2900 val
= Fcons (XVECTOR (val3
)->contents
[eol_type
], val
);
2902 val
= Fcons (XCONS (val2
)->car
, val
);
2910 /* Scan text in the region between *BEGP and *ENDP, skip characters
2911 which we never have to encode to (iff ENCODEP is 1) or decode from
2912 coding system CODING at the head and tail, then set BEGP and ENDP
2913 to the addresses of start and end of the text we actually convert. */
2916 shrink_conversion_area (begp
, endp
, coding
, encodep
)
2917 unsigned char **begp
, **endp
;
2918 struct coding_system
*coding
;
2921 register unsigned char *beg_addr
= *begp
, *end_addr
= *endp
;
2923 if (coding
->eol_type
!= CODING_EOL_LF
2924 && coding
->eol_type
!= CODING_EOL_UNDECIDED
)
2925 /* Since we anyway have to convert end-of-line format, it is not
2926 worth skipping at most 100 bytes or so. */
2929 if (encodep
) /* for encoding */
2931 switch (coding
->type
)
2933 case coding_type_no_conversion
:
2934 case coding_type_emacs_mule
:
2935 case coding_type_undecided
:
2936 /* We need no conversion. */
2939 case coding_type_ccl
:
2940 /* We can't skip any data. */
2942 case coding_type_iso2022
:
2943 if (coding
->flags
& CODING_FLAG_ISO_DESIGNATE_AT_BOL
)
2945 unsigned char *bol
= beg_addr
;
2946 while (beg_addr
< end_addr
&& *beg_addr
< 0x80)
2949 if (*(beg_addr
- 1) == '\n')
2953 goto label_skip_tail
;
2957 /* We can skip all ASCII characters at the head and tail. */
2958 while (beg_addr
< end_addr
&& *beg_addr
< 0x80) beg_addr
++;
2960 while (beg_addr
< end_addr
&& *(end_addr
- 1) < 0x80) end_addr
--;
2964 else /* for decoding */
2966 switch (coding
->type
)
2968 case coding_type_no_conversion
:
2969 /* We need no conversion. */
2972 case coding_type_emacs_mule
:
2973 if (coding
->eol_type
== CODING_EOL_LF
)
2975 /* We need no conversion. */
2979 /* We can skip all but carriage-return. */
2980 while (beg_addr
< end_addr
&& *beg_addr
!= '\r') beg_addr
++;
2981 while (beg_addr
< end_addr
&& *(end_addr
- 1) != '\r') end_addr
--;
2983 case coding_type_sjis
:
2984 case coding_type_big5
:
2985 /* We can skip all ASCII characters at the head. */
2986 while (beg_addr
< end_addr
&& *beg_addr
< 0x80) beg_addr
++;
2987 /* We can skip all ASCII characters at the tail except for
2988 the second byte of SJIS or BIG5 code. */
2989 while (beg_addr
< end_addr
&& *(end_addr
- 1) < 0x80) end_addr
--;
2990 if (end_addr
!= *endp
)
2993 case coding_type_ccl
:
2994 /* We can't skip any data. */
2996 default: /* i.e. case coding_type_iso2022: */
3000 /* We can skip all ASCII characters except for a few
3001 control codes at the head. */
3002 while (beg_addr
< end_addr
&& (c
= *beg_addr
) < 0x80
3003 && c
!= ISO_CODE_CR
&& c
!= ISO_CODE_SO
3004 && c
!= ISO_CODE_SI
&& c
!= ISO_CODE_ESC
)
3015 /* Encode to (iff ENCODEP is 1) or decode from coding system CODING the
3016 text between B and E. B and E are buffer positions. */
3019 code_convert_region (b
, e
, coding
, encodep
)
3021 struct coding_system
*coding
;
3024 int beg
, end
, len
, consumed
, produced
;
3026 unsigned char *begp
, *endp
;
3029 validate_region (&b
, &e
);
3030 beg
= XINT (b
), end
= XINT (e
);
3031 if (beg
< GPT
&& end
>= GPT
)
3034 if (encodep
&& !NILP (coding
->pre_write_conversion
))
3036 /* We must call a pre-conversion function which may put a new
3037 text to be converted in a new buffer. */
3038 struct buffer
*old
= current_buffer
, *new;
3041 call2 (coding
->pre_write_conversion
, b
, e
);
3042 if (old
!= current_buffer
)
3044 /* Replace the original text by the text just generated. */
3046 new = current_buffer
;
3047 set_buffer_internal (old
);
3048 del_range (beg
, end
);
3049 insert_from_buffer (new, 1, len
, 0);
3054 /* We may be able to shrink the conversion region. */
3055 begp
= POS_ADDR (beg
); endp
= begp
+ (end
- beg
);
3056 shrink_conversion_area (&begp
, &endp
, coding
, encodep
);
3059 /* We need no conversion. */
3063 beg
+= begp
- POS_ADDR (beg
);
3064 end
= beg
+ (endp
- begp
);
3067 len
= encoding_buffer_size (coding
, end
- beg
);
3069 len
= decoding_buffer_size (coding
, end
- beg
);
3070 buf
= get_conversion_buffer (len
);
3072 coding
->last_block
= 1;
3074 ? encode_coding (coding
, POS_ADDR (beg
), buf
, end
- beg
, len
,
3076 : decode_coding (coding
, POS_ADDR (beg
), buf
, end
- beg
, len
,
3079 len
= produced
+ (beg
- XINT (b
)) + (XINT (e
) - end
);
3082 insert (buf
, produced
);
3083 del_range (PT
, PT
+ end
- beg
);
3085 pos
= PT
+ (pos
- end
);
3091 if (!encodep
&& !NILP (coding
->post_read_conversion
))
3093 /* We must call a post-conversion function which may alter
3094 the text just converted. */
3099 insval
= call1 (coding
->post_read_conversion
, make_number (len
));
3100 CHECK_NUMBER (insval
, 0);
3101 len
= XINT (insval
);
3104 return make_number (len
);
3108 code_convert_string (str
, coding
, encodep
, nocopy
)
3109 Lisp_Object str
, nocopy
;
3110 struct coding_system
*coding
;
3113 int len
, consumed
, produced
;
3115 unsigned char *begp
, *endp
;
3116 int head_skip
, tail_skip
;
3117 struct gcpro gcpro1
;
3119 if (encodep
&& !NILP (coding
->pre_write_conversion
)
3120 || !encodep
&& !NILP (coding
->post_read_conversion
))
3122 /* Since we have to call Lisp functions which assume target text
3123 is in a buffer, after setting a temporary buffer, call
3124 code_convert_region. */
3125 int count
= specpdl_ptr
- specpdl
;
3126 int len
= XSTRING (str
)->size
;
3128 struct buffer
*old
= current_buffer
;
3130 record_unwind_protect (Fset_buffer
, Fcurrent_buffer ());
3131 temp_output_buffer_setup (" *code-converting-work*");
3132 set_buffer_internal (XBUFFER (Vstandard_output
));
3133 insert_from_string (str
, 0, len
, 0);
3134 code_convert_region (make_number (BEGV
), make_number (ZV
),
3136 result
= make_buffer_string (BEGV
, ZV
, 0);
3137 set_buffer_internal (old
);
3138 return unbind_to (count
, result
);
3141 /* We may be able to shrink the conversion region. */
3142 begp
= XSTRING (str
)->data
;
3143 endp
= begp
+ XSTRING (str
)->size
;
3144 shrink_conversion_area (&begp
, &endp
, coding
, encodep
);
3147 /* We need no conversion. */
3148 return (NILP (nocopy
) ? Fcopy_sequence (str
) : str
);
3150 head_skip
= begp
- XSTRING (str
)->data
;
3151 tail_skip
= XSTRING (str
)->size
- head_skip
- (endp
- begp
);
3156 len
= encoding_buffer_size (coding
, endp
- begp
);
3158 len
= decoding_buffer_size (coding
, endp
- begp
);
3159 buf
= get_conversion_buffer (len
+ head_skip
+ tail_skip
);
3161 bcopy (XSTRING (str
)->data
, buf
, head_skip
);
3162 coding
->last_block
= 1;
3164 ? encode_coding (coding
, XSTRING (str
)->data
+ head_skip
,
3165 buf
+ head_skip
, endp
- begp
, len
, &consumed
)
3166 : decode_coding (coding
, XSTRING (str
)->data
+ head_skip
,
3167 buf
+ head_skip
, endp
- begp
, len
, &consumed
));
3168 bcopy (XSTRING (str
)->data
+ head_skip
+ (endp
- begp
),
3169 buf
+ head_skip
+ produced
,
3174 return make_string (buf
, head_skip
+ produced
+ tail_skip
);
3177 DEFUN ("decode-coding-region", Fdecode_coding_region
, Sdecode_coding_region
,
3178 3, 3, "r\nzCoding system: ",
3179 "Decode current region by specified coding system.\n\
3180 When called from a program, takes three arguments:\n\
3181 START, END, and CODING-SYSTEM. START END are buffer positions.\n\
3182 Return length of decoded text.")
3183 (b
, e
, coding_system
)
3184 Lisp_Object b
, e
, coding_system
;
3186 struct coding_system coding
;
3188 CHECK_NUMBER_COERCE_MARKER (b
, 0);
3189 CHECK_NUMBER_COERCE_MARKER (e
, 1);
3190 CHECK_SYMBOL (coding_system
, 2);
3192 if (NILP (coding_system
))
3193 return make_number (XFASTINT (e
) - XFASTINT (b
));
3194 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
3195 error ("Invalid coding-system: %s", XSYMBOL (coding_system
)->name
->data
);
3197 return code_convert_region (b
, e
, &coding
, 0);
3200 DEFUN ("encode-coding-region", Fencode_coding_region
, Sencode_coding_region
,
3201 3, 3, "r\nzCoding system: ",
3202 "Encode current region by specified coding system.\n\
3203 When called from a program, takes three arguments:\n\
3204 START, END, and CODING-SYSTEM. START END are buffer positions.\n\
3205 Return length of encoded text.")
3206 (b
, e
, coding_system
)
3207 Lisp_Object b
, e
, coding_system
;
3209 struct coding_system coding
;
3211 CHECK_NUMBER_COERCE_MARKER (b
, 0);
3212 CHECK_NUMBER_COERCE_MARKER (e
, 1);
3213 CHECK_SYMBOL (coding_system
, 2);
3215 if (NILP (coding_system
))
3216 return make_number (XFASTINT (e
) - XFASTINT (b
));
3217 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
3218 error ("Invalid coding-system: %s", XSYMBOL (coding_system
)->name
->data
);
3220 return code_convert_region (b
, e
, &coding
, 1);
3223 DEFUN ("decode-coding-string", Fdecode_coding_string
, Sdecode_coding_string
,
3225 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3226 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3228 (string
, coding_system
, nocopy
)
3229 Lisp_Object string
, coding_system
, nocopy
;
3231 struct coding_system coding
;
3233 CHECK_STRING (string
, 0);
3234 CHECK_SYMBOL (coding_system
, 1);
3236 if (NILP (coding_system
))
3237 return (NILP (nocopy
) ? Fcopy_sequence (string
) : string
);
3238 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
3239 error ("Invalid coding-system: %s", XSYMBOL (coding_system
)->name
->data
);
3241 return code_convert_string (string
, &coding
, 0, nocopy
);
3244 DEFUN ("encode-coding-string", Fencode_coding_string
, Sencode_coding_string
,
3246 "Encode STRING to CODING-SYSTEM, and return the result.\n\
3247 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3249 (string
, coding_system
, nocopy
)
3250 Lisp_Object string
, coding_system
, nocopy
;
3252 struct coding_system coding
;
3254 CHECK_STRING (string
, 0);
3255 CHECK_SYMBOL (coding_system
, 1);
3257 if (NILP (coding_system
))
3258 return (NILP (nocopy
) ? Fcopy_sequence (string
) : string
);
3259 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
3260 error ("Invalid coding-system: %s", XSYMBOL (coding_system
)->name
->data
);
3262 return code_convert_string (string
, &coding
, 1, nocopy
);
3265 DEFUN ("decode-sjis-char", Fdecode_sjis_char
, Sdecode_sjis_char
, 1, 1, 0,
3266 "Decode a JISX0208 character of shift-jis encoding.\n\
3267 CODE is the character code in SJIS.\n\
3268 Return the corresponding character.")
3272 unsigned char c1
, c2
, s1
, s2
;
3275 CHECK_NUMBER (code
, 0);
3276 s1
= (XFASTINT (code
)) >> 8, s2
= (XFASTINT (code
)) & 0xFF;
3277 DECODE_SJIS (s1
, s2
, c1
, c2
);
3278 XSETFASTINT (val
, MAKE_NON_ASCII_CHAR (charset_jisx0208
, c1
, c2
));
3282 DEFUN ("encode-sjis-char", Fencode_sjis_char
, Sencode_sjis_char
, 1, 1, 0,
3283 "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3284 Return the corresponding character code in SJIS.")
3288 int charset
, c1
, c2
, s1
, s2
;
3291 CHECK_NUMBER (ch
, 0);
3292 SPLIT_CHAR (XFASTINT (ch
), charset
, c1
, c2
);
3293 if (charset
== charset_jisx0208
)
3295 ENCODE_SJIS (c1
, c2
, s1
, s2
);
3296 XSETFASTINT (val
, (s1
<< 8) | s2
);
3299 XSETFASTINT (val
, 0);
3303 DEFUN ("decode-big5-char", Fdecode_big5_char
, Sdecode_big5_char
, 1, 1, 0,
3304 "Decode a Big5 character CODE of BIG5 coding-system.\n\
3305 CODE is the character code in BIG5.\n\
3306 Return the corresponding character.")
3311 unsigned char b1
, b2
, c1
, c2
;
3314 CHECK_NUMBER (code
, 0);
3315 b1
= (XFASTINT (code
)) >> 8, b2
= (XFASTINT (code
)) & 0xFF;
3316 DECODE_BIG5 (b1
, b2
, charset
, c1
, c2
);
3317 XSETFASTINT (val
, MAKE_NON_ASCII_CHAR (charset
, c1
, c2
));
3321 DEFUN ("encode-big5-char", Fencode_big5_char
, Sencode_big5_char
, 1, 1, 0,
3322 "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3323 Return the corresponding character code in Big5.")
3327 int charset
, c1
, c2
, b1
, b2
;
3330 CHECK_NUMBER (ch
, 0);
3331 SPLIT_CHAR (XFASTINT (ch
), charset
, c1
, c2
);
3332 if (charset
== charset_big5_1
|| charset
== charset_big5_2
)
3334 ENCODE_BIG5 (charset
, c1
, c2
, b1
, b2
);
3335 XSETFASTINT (val
, (b1
<< 8) | b2
);
3338 XSETFASTINT (val
, 0);
3342 DEFUN ("set-terminal-coding-system-internal",
3343 Fset_terminal_coding_system_internal
,
3344 Sset_terminal_coding_system_internal
, 1, 1, 0, "")
3346 Lisp_Object coding_system
;
3348 CHECK_SYMBOL (coding_system
, 0);
3349 setup_coding_system (Fcheck_coding_system (coding_system
), &terminal_coding
);
3353 DEFUN ("terminal-coding-system",
3354 Fterminal_coding_system
, Sterminal_coding_system
, 0, 0, 0,
3355 "Return coding-system of your terminal.")
3358 return terminal_coding
.symbol
;
3361 DEFUN ("set-keyboard-coding-system-internal",
3362 Fset_keyboard_coding_system_internal
,
3363 Sset_keyboard_coding_system_internal
, 1, 1, 0, "")
3365 Lisp_Object coding_system
;
3367 CHECK_SYMBOL (coding_system
, 0);
3368 setup_coding_system (Fcheck_coding_system (coding_system
), &keyboard_coding
);
/* Lisp primitive `keyboard-coding-system', taking no arguments.
   Returns the `symbol' member of the global `keyboard_coding'
   structure.  */
3372 DEFUN ("keyboard-coding-system",
3373 Fkeyboard_coding_system
, Skeyboard_coding_system
, 0, 0, 0,
3374 "Return coding-system of what is sent from terminal keyboard.")
3377 return keyboard_coding
.symbol
;
/* Lisp primitive `find-coding-system'.
   args[0] is OPERATION, a symbol whose `target-idx' property (an
   integer) selects TARGET as args[target-idx + 1].  TARGET must be a
   string, or an integer when OPERATION is `open-network-stream'.
   The alist searched is Vfile_coding_system_alist for
   `insert-file-contents' and `write-region',
   Vnetwork_coding_system_alist for `open-network-stream', and
   Vprocess_coding_system_alist otherwise.  An element matches when its
   car is a string that matches a string TARGET as a regular
   expression, or is EQ to an integer TARGET; the cdr of that element
   then determines the result: a coding system symbol yields
   (VAL . VAL), and a function symbol is called with one argument, the
   list of all arguments given to this primitive.
   NOTE(review): the docstring describes the value as
   (ENCODING-SYSTEM DECODING-SYSTEM), while the alist docstrings in
   this file say the car is used for decoding and the cdr for encoding
   -- confirm which order is intended.
   NOTE(review): the parameter list, the braces and several statements
   of this function are not visible in this view.  */
3381 DEFUN ("find-coding-system", Ffind_coding_system
, Sfind_coding_system
,
3383 "Choose a coding system for a file operation based on file name.\n\
3384 The value names a pair of coding systems: (ENCODING-SYSTEM DECODING-SYSTEM).\n\
3385 ENCODING-SYSTEM is the coding system to use for encoding\n\
3386 \(in case OPERATION does encoding), and DECODING-SYSTEM is the coding system\n\
3387 for decoding (in case OPERATION does decoding).\n\
3389 The first argument OPERATION specifies an I/O primitive:\n\
3390 For file I/O, `insert-file-contents' or `write-region'.\n\
3391 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3392 For network I/O, `open-network-stream'.\n\
3394 The remaining arguments should be the same arguments that were passed\n\
3395 to the primitive. Depending on which primitive, one of those arguments\n\
3396 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
3397 whichever argument specifies the file name is TARGET.\n\
3399 TARGET has a meaning which depends on OPERATION:\n\
3400 For file I/O, TARGET is a file name.\n\
3401 For process I/O, TARGET is a process name.\n\
3402 For network I/O, TARGET is a service name or a port number\n\
3404 This function looks up what specified for TARGET in,\n\
3405 `file-coding-system-alist', `process-coding-system-alist',\n\
3406 or `network-coding-system-alist' depending on OPERATION.\n\
3407 They may specify a coding system, a cons of coding systems,\n\
3408 or a function symbol to call.\n\
3409 In the last case, we call the function with one argument,\n\
3410 which is a list of all the arguments given to `find-coding-system'.")
3415 Lisp_Object operation
, target_idx
, target
, val
;
3416 register Lisp_Object chain
;
3419 error ("Too few arguments");
3420 operation
= args
[0];
/* OPERATION must be a symbol carrying an integer `target-idx'
   property.  */
3421 if (!SYMBOLP (operation
)
3422 || !INTEGERP (target_idx
= Fget (operation
, Qtarget_idx
)))
3423 error ("Invalid first argument");
/* TARGET is read from args[target-idx + 1] below, so that index must
   be strictly less than nargs; nargs == target-idx + 1 is therefore
   rejected as well.  */
3424 if (nargs
<= 1 + XINT (target_idx
))
3425 error ("Too few arguments for operation: %s",
3426 XSYMBOL (operation
)->name
->data
);
3427 target
= args
[XINT (target_idx
) + 1];
3428 if (!(STRINGP (target
)
3429 || (EQ (operation
, Qopen_network_stream
) && INTEGERP (target
))))
3430 error ("Invalid %dth argument", XINT (target_idx
) + 1);
/* Pick the alist for this kind of operation.  Lisp objects are
   compared with EQ, as in the rest of this function, rather than with
   the C `==' operator.  */
3432 chain
= (EQ (operation
, Qinsert_file_contents
) || EQ (operation
, Qwrite_region
)
3433 ? Vfile_coding_system_alist
3434 : (EQ (operation
, Qopen_network_stream
)
3435 ? Vnetwork_coding_system_alist
3436 : Vprocess_coding_system_alist
));
3440 for (; CONSP (chain
); chain
= XCONS (chain
)->cdr
)
3442 Lisp_Object elt
= XCONS (chain
)->car
;
/* A string TARGET is matched against a string car with
   fast_string_match; an integer TARGET must be EQ to the car.  */
3445 && ((STRINGP (target
)
3446 && STRINGP (XCONS (elt
)->car
)
3447 && fast_string_match (XCONS (elt
)->car
, target
) >= 0)
3448 || (INTEGERP (target
) && EQ (target
, XCONS (elt
)->car
))))
3450 val
= XCONS (elt
)->cdr
;
3453 if (! SYMBOLP (val
))
/* A coding system symbol is used for both members of the pair.  */
3455 if (! NILP (Fcoding_system_p (val
)))
3456 return Fcons (val
, val
);
/* Otherwise VAL should name a function: test its function binding
   (not its variable binding) and call it with exactly one argument,
   the list of all arguments, as the docstring promises.  */
3457 if (!NILP (Ffboundp (val
)))
3458 return call1 (val
, Flist (nargs
, args
));
3468 /*** 8. Post-amble ***/
/* Setup of the byte-classification tables emacs_code_class and
   iso_code_class, allocation of the conversion buffer, and default
   setup of the keyboard and terminal coding structures.
   NOTE(review): the enclosing function header and the declaration of
   `i' are not visible in this view.  */
3474 /* Emacs' internal format specific initialize routine. */
/* Bytes 0x00..0x20 (space included) start out as control codes; LF
   (0x0A) and CR (0x0D) are then given their own classes.  */
3475 for (i
= 0; i
<= 0x20; i
++)
3476 emacs_code_class
[i
] = EMACS_control_code
;
3477 emacs_code_class
[0x0A] = EMACS_linefeed_code
;
3478 emacs_code_class
[0x0D] = EMACS_carriage_return_code
;
/* 0x21..0x7E are ASCII; 0x7F is a control code.  */
3479 for (i
= 0x21 ; i
< 0x7F; i
++)
3480 emacs_code_class
[i
] = EMACS_ascii_code
;
3481 emacs_code_class
[0x7F] = EMACS_control_code
;
3482 emacs_code_class
[0x80] = EMACS_leading_code_composition
;
/* 0x81..0xFE default to invalid; the four private leading codes are
   overridden just below.  Index 0xFF is not assigned in the lines
   visible here.  */
3483 for (i
= 0x81; i
< 0xFF; i
++)
3484 emacs_code_class
[i
] = EMACS_invalid_code
;
3485 emacs_code_class
[LEADING_CODE_PRIVATE_11
] = EMACS_leading_code_3
;
3486 emacs_code_class
[LEADING_CODE_PRIVATE_12
] = EMACS_leading_code_3
;
3487 emacs_code_class
[LEADING_CODE_PRIVATE_21
] = EMACS_leading_code_4
;
3488 emacs_code_class
[LEADING_CODE_PRIVATE_22
] = EMACS_leading_code_4
;
3490 /* ISO2022 specific initialize routine. */
/* 0x00..0x1F and 0x80..0x9F are control codes; 0x21..0x7E and
   0xA1..0xFE are the two graphic planes.  The range boundaries 0x20,
   0x7F, 0xA0 and 0xFF get their own classes afterwards.  */
3491 for (i
= 0; i
< 0x20; i
++)
3492 iso_code_class
[i
] = ISO_control_code
;
3493 for (i
= 0x21; i
< 0x7F; i
++)
3494 iso_code_class
[i
] = ISO_graphic_plane_0
;
3495 for (i
= 0x80; i
< 0xA0; i
++)
3496 iso_code_class
[i
] = ISO_control_code
;
3497 for (i
= 0xA1; i
< 0xFF; i
++)
3498 iso_code_class
[i
] = ISO_graphic_plane_1
;
3499 iso_code_class
[0x20] = iso_code_class
[0x7F] = ISO_0x20_or_0x7F
;
3500 iso_code_class
[0xA0] = iso_code_class
[0xFF] = ISO_0xA0_or_0xFF
;
/* Individual control bytes with a dedicated class replace the generic
   control-code entries written by the loops above.  */
3501 iso_code_class
[ISO_CODE_CR
] = ISO_carriage_return
;
3502 iso_code_class
[ISO_CODE_SO
] = ISO_shift_out
;
3503 iso_code_class
[ISO_CODE_SI
] = ISO_shift_in
;
3504 iso_code_class
[ISO_CODE_SS2_7
] = ISO_single_shift_2_7
;
3505 iso_code_class
[ISO_CODE_ESC
] = ISO_escape
;
3506 iso_code_class
[ISO_CODE_SS2
] = ISO_single_shift_2
;
3507 iso_code_class
[ISO_CODE_SS3
] = ISO_single_shift_3
;
3508 iso_code_class
[ISO_CODE_CSI
] = ISO_control_sequence_introducer
;
/* The conversion buffer starts at its minimum size.  */
3510 conversion_buffer_size
= MINIMUM_CONVERSION_BUFFER_SIZE
;
3511 conversion_buffer
= (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE
);
/* Both the keyboard and the terminal coding structures are set up
   from Qnil.  */
3513 setup_coding_system (Qnil
, &keyboard_coding
);
3514 setup_coding_system (Qnil
, &terminal_coding
);
/* Interning of the symbols used by this file.  Each symbol is
   protected with staticpro, and the `target-idx' property is set on
   the I/O primitive symbols.  `find-coding-system' reads that property
   and takes args[target-idx + 1] as its TARGET.
   NOTE(review): the enclosing function header and the declaration of
   `i' are not visible in this view; Qinsert_file_contents and
   Qwrite_region are used here but not interned in these lines.  */
3521 Qtarget_idx
= intern ("target-idx");
3522 staticpro (&Qtarget_idx
);
3524 Fput (Qinsert_file_contents
, Qtarget_idx
, make_number (0));
3525 Fput (Qwrite_region
, Qtarget_idx
, make_number (2));
3527 Qcall_process
= intern ("call-process");
3528 staticpro (&Qcall_process
);
3529 Fput (Qcall_process
, Qtarget_idx
, make_number (0));
3531 Qcall_process_region
= intern ("call-process-region");
3532 staticpro (&Qcall_process_region
);
3533 Fput (Qcall_process_region
, Qtarget_idx
, make_number (2));
3535 Qstart_process
= intern ("start-process");
3536 staticpro (&Qstart_process
);
3537 Fput (Qstart_process
, Qtarget_idx
, make_number (2));
3539 Qopen_network_stream
= intern ("open-network-stream");
3540 staticpro (&Qopen_network_stream
);
3541 Fput (Qopen_network_stream
, Qtarget_idx
, make_number (3));
/* Plain symbols with no properties set in the lines visible here.  */
3543 Qcoding_system
= intern ("coding-system");
3544 staticpro (&Qcoding_system
);
3546 Qeol_type
= intern ("eol-type");
3547 staticpro (&Qeol_type
);
3549 Qbuffer_file_coding_system
= intern ("buffer-file-coding-system");
3550 staticpro (&Qbuffer_file_coding_system
);
3552 Qpost_read_conversion
= intern ("post-read-conversion");
3553 staticpro (&Qpost_read_conversion
);
3555 Qpre_write_conversion
= intern ("pre-write-conversion");
3556 staticpro (&Qpre_write_conversion
);
3558 Qcoding_system_spec
= intern ("coding-system-spec");
3559 staticpro (&Qcoding_system_spec
);
3561 Qcoding_system_p
= intern ("coding-system-p");
3562 staticpro (&Qcoding_system_p
);
/* `coding-system-error' gets the error conditions
   (coding-system-error error) and the message "Coding-system error".  */
3564 Qcoding_system_error
= intern ("coding-system-error");
3565 staticpro (&Qcoding_system_error
);
3567 Fput (Qcoding_system_error
, Qerror_conditions
,
3568 Fcons (Qcoding_system_error
, Fcons (Qerror
, Qnil
)));
3569 Fput (Qcoding_system_error
, Qerror_message
,
3570 build_string ("Coding-system error"));
3572 Qcoding_category_index
= intern ("coding-category-index");
3573 staticpro (&Qcoding_category_index
);
/* One symbol per coding category, interned from coding_category_name
   and given a `coding-category-index' property.
   NOTE(review): the property value and the loop braces are on lines
   not visible here.  */
3577 for (i
= 0; i
< CODING_CATEGORY_IDX_MAX
; i
++)
3579 coding_category_table
[i
] = intern (coding_category_name
[i
]);
3580 staticpro (&coding_category_table
[i
]);
3581 Fput (coding_category_table
[i
], Qcoding_category_index
,
3586 Qcharacter_unification_table
= intern ("character-unification-table");
3587 staticpro (&Qcharacter_unification_table
);
/* NOTE(review): the value given to the `char-table-extra-slots'
   property is on a line not visible here.  */
3588 Fput (Qcharacter_unification_table
, Qchar_table_extra_slots
,
/* Registration, via defsubr, of the subr objects for the Lisp
   primitives of this file; one call per primitive.  */
3591 defsubr (&Scoding_system_spec
);
3592 defsubr (&Scoding_system_p
);
3593 defsubr (&Sread_coding_system
);
3594 defsubr (&Sread_non_nil_coding_system
);
3595 defsubr (&Scheck_coding_system
);
3596 defsubr (&Sdetect_coding_region
);
3597 defsubr (&Sdecode_coding_region
);
3598 defsubr (&Sencode_coding_region
);
3599 defsubr (&Sdecode_coding_string
);
3600 defsubr (&Sencode_coding_string
);
3601 defsubr (&Sdecode_sjis_char
);
3602 defsubr (&Sencode_sjis_char
);
3603 defsubr (&Sdecode_big5_char
);
3604 defsubr (&Sencode_big5_char
);
3605 defsubr (&Sset_terminal_coding_system_internal
);
3606 defsubr (&Sterminal_coding_system
);
3607 defsubr (&Sset_keyboard_coding_system_internal
);
3608 defsubr (&Skeyboard_coding_system
);
3609 defsubr (&Sfind_coding_system
);
/* Definition of the Lisp-visible variables of this file, each followed
   by the assignment of its initial value.
   NOTE(review): the enclosing function header and closing brace are
   not visible in this view.  */
3611 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list
,
3612 "List of coding-categories (symbols) ordered by priority.");
/* The list is consed from the highest index down to 0, so the finished
   list holds the category symbols in ascending index order.  */
3616 Vcoding_category_list
= Qnil
;
3617 for (i
= CODING_CATEGORY_IDX_MAX
- 1; i
>= 0; i
--)
3618 Vcoding_category_list
3619 = Fcons (coding_category_table
[i
], Vcoding_category_list
);
/* NOTE(review): the next two docstrings refer to `coding-system-alist';
   the alists defined in this section are named
   `file-coding-system-alist', `process-coding-system-alist' and
   `network-coding-system-alist' -- confirm the intended reference.  */
3622 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read
,
3623 "A variable of internal use only.\n\
3624 If the value is a coding system, it is used for decoding on read operation.\n\
3625 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3626 Vcoding_system_for_read
= Qnil
;
3628 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write
,
3629 "A variable of internal use only.\n\
3630 If the value is a coding system, it is used for encoding on write operation.\n\
3631 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3632 Vcoding_system_for_write
= Qnil
;
3634 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used
,
3635 "Coding-system used in the latest file or process I/O.");
3636 Vlast_coding_system_used
= Qnil
;
/* The three alists below are the ones `find-coding-system' selects
   among, depending on its OPERATION argument.  All start out empty.  */
3638 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist
,
3639 "Alist to decide a coding system to use for a file I/O operation.\n\
3640 The format is ((PATTERN . VAL) ...),\n\
3641 where PATTERN is a regular expression matching a file name,\n\
3642 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3643 If VAL is a coding system, it is used for both decoding and encoding\n\
3644 the file contents.\n\
3645 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3646 and the cdr part is used for encoding.\n\
3647 If VAL is a function symbol, the function must return a coding system\n\
3648 or a cons of coding systems which are used as above.\n\
3650 See also the function `find-coding-system'.");
3651 Vfile_coding_system_alist
= Qnil
;
3653 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist
,
3654 "Alist to decide a coding system to use for a process I/O operation.\n\
3655 The format is ((PATTERN . VAL) ...),\n\
3656 where PATTERN is a regular expression matching a program name,\n\
3657 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3658 If VAL is a coding system, it is used for both decoding what received\n\
3659 from the program and encoding what sent to the program.\n\
3660 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3661 and the cdr part is used for encoding.\n\
3662 If VAL is a function symbol, the function must return a coding system\n\
3663 or a cons of coding systems which are used as above.\n\
3665 See also the function `find-coding-system'.");
3666 Vprocess_coding_system_alist
= Qnil
;
3668 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist
,
3669 "Alist to decide a coding system to use for a network I/O operation.\n\
3670 The format is ((PATTERN . VAL) ...),\n\
3671 where PATTERN is a regular expression matching a network service name\n\
3672 or is a port number to connect to,\n\
3673 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3674 If VAL is a coding system, it is used for both decoding what received\n\
3675 from the network stream and encoding what sent to the network stream.\n\
3676 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3677 and the cdr part is used for encoding.\n\
3678 If VAL is a function symbol, the function must return a coding system\n\
3679 or a cons of coding systems which are used as above.\n\
3681 See also the function `find-coding-system'.");
3682 Vnetwork_coding_system_alist
= Qnil
;
/* End-of-line mnemonics: ':' for UNIX, '\\' for DOS, '/' for MAC.
   NOTE(review): the undecided mnemonic is also ':', the same character
   as the UNIX one -- confirm that this is intended.  */
3684 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix
,
3685 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
3686 eol_mnemonic_unix
= ':';
3688 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos
,
3689 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
3690 eol_mnemonic_dos
= '\\';
3692 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac
,
3693 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
3694 eol_mnemonic_mac
= '/';
3696 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided
,
3697 "Mnemonic character indicating end-of-line format is not yet decided.");
3698 eol_mnemonic_undecided
= ':';
/* Character unification is enabled by default (Qt); both standard
   unification tables start out as nil.  */
3700 DEFVAR_LISP ("enable-character-unification", &Venable_character_unification
,
3701 "Non-nil means ISO 2022 encoder/decoder do character unification.");
3702 Venable_character_unification
= Qt
;
3704 DEFVAR_LISP ("standard-character-unification-table-for-read",
3705 &Vstandard_character_unification_table_for_read
,
3706 "Table for unifying characters when reading.");
3707 Vstandard_character_unification_table_for_read
= Qnil
;
3709 DEFVAR_LISP ("standard-character-unification-table-for-write",
3710 &Vstandard_character_unification_table_for_write
,
3711 "Table for unifying characters when writing.");
3712 Vstandard_character_unification_table_for_write
= Qnil
;
/* NOTE(review): the Lisp name says "table" while the C variable and
   the docstring say "alist"; the docstring also misspells
   "identifying".  Both are user-visible, so they are left unchanged
   here.  */
3714 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist
,
3715 "Alist of charsets vs revision numbers.\n\
3716 While encoding, if a charset (car part of an element) is found,\n\
3717 designate it with the escape sequence identifing revision (cdr part of the element).");
3718 Vcharset_revision_alist
= Qnil
;
3720 DEFVAR_LISP ("default-process-coding-system",
3721 &Vdefault_process_coding_system
,
3722 "Cons of coding systems used for process I/O by default.\n\
3723 The car part is used for decoding a process output,\n\
3724 the cdr part is used for encoding a text to be sent to a process.");
3725 Vdefault_process_coding_system
= Qnil
;