1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
5 This file is part of GNU Emacs.
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /*** TABLE OF CONTENTS ***
26 2. Emacs' internal format (emacs-mule) handlers
28 4. Shift-JIS and BIG5 handlers
30 6. End-of-line handlers
31 7. C library functions
32 8. Emacs Lisp library functions
37 /*** 0. General comments ***/
40 /*** GENERAL NOTE on CODING SYSTEM ***
42 Coding system is an encoding mechanism of one or more character
43 sets. Here's a list of coding systems which Emacs can handle. When
44 we say "decode", it means converting some other coding system to
45 Emacs' internal format (emacs-mule), and when we say "encode",
46 it means converting the coding system emacs-mule to some other
49 0. Emacs' internal format (emacs-mule)
51 Emacs itself holds a multi-lingual character in a buffer and a string
52 in a special format. Details are described in section 2.
56 The most famous coding system for multiple character sets. X's
57 Compound Text, various EUCs (Extended Unix Code), and coding
58 systems used in Internet communication such as ISO-2022-JP are
59 all variants of ISO2022. Details are described in section 3.
61 2. SJIS (or Shift-JIS or MS-Kanji-Code)
63 A coding system to encode character sets: ASCII, JISX0201, and
64 JISX0208. Widely used for PC's in Japan. Details are described in
69 A coding system to encode character sets: ASCII and Big5. Widely
70 used by Chinese (mainly in Taiwan and Hong Kong). Details are
71 described in section 4. In this file, when we write "BIG5"
72 (all uppercase), we mean the coding system, and when we write
73 "Big5" (capitalized), we mean the character set.
77 A coding system for a text containing random 8-bit code. Emacs does
78 no code conversion on such a text except for end-of-line format.
82 If a user wants to read/write a text encoded in a coding system not
83 listed above, he can supply a decoder and an encoder for it in CCL
84 (Code Conversion Language) programs. Emacs executes the CCL program
85 while reading/writing.
87 Emacs represents a coding system by a Lisp symbol that has a property
88 `coding-system'. But, before actually using the coding system, the
89 information about it is set in a structure of type `struct
90 coding_system' for rapid processing. See section 6 for more details.
94 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
96 How end-of-line of a text is encoded depends on a system. For
97 instance, Unix's format is just one byte of `line-feed' code,
98 whereas DOS's format is two-byte sequence of `carriage-return' and
99 `line-feed' codes. MacOS's format is usually one byte of
102 Since text characters encoding and end-of-line encoding are
103 independent, any coding system described above can take
104 any format of end-of-line. So, Emacs has information of format of
105 end-of-line in each coding-system. See section 6 for more details.
109 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
111 These functions check if a text between SRC and SRC_END is encoded
112 in the coding system category XXX. Each returns an integer value in
113 which the appropriate flag bits for the category XXX are set. The flag
114 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
115 template of these functions. */
118 detect_coding_emacs_mule (src
, src_end
)
119 unsigned char *src
, *src_end
;
125 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
127 These functions decode SRC_BYTES length of unibyte text at SOURCE
128 encoded in CODING to Emacs' internal format. The resulting
129 multibyte text goes to a place pointed to by DESTINATION, the length
130 of which should not exceed DST_BYTES.
132 These functions set the information of original and decoded texts in
133 the members produced, produced_char, consumed, and consumed_char of
134 the structure *CODING. They also set the member result to one of
135 CODING_FINISH_XXX indicating how the decoding finished.
137 DST_BYTES zero means that the source area and destination area
138 overlap, which means that we can produce decoded text until it
139 reaches the head of the not-yet-decoded source text.
141 Below is a template of these functions. */
144 decode_coding_XXX (coding
, source
, destination
, src_bytes
, dst_bytes
)
145 struct coding_system
*coding
;
146 unsigned char *source
, *destination
;
147 int src_bytes
, dst_bytes
;
153 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
155 These functions encode SRC_BYTES length text at SOURCE of Emacs'
156 internal multibyte format to CODING. The resulting unibyte text
157 goes to a place pointed to by DESTINATION, the length of which
158 should not exceed DST_BYTES.
160 These functions set the information of original and encoded texts in
161 the members produced, produced_char, consumed, and consumed_char of
162 the structure *CODING. They also set the member result to one of
163 CODING_FINISH_XXX indicating how the encoding finished.
165 DST_BYTES zero means that the source area and destination area
166 overlap, which means that we can produce encoded text until it
167 reaches the head of the not-yet-encoded source text.
169 Below is a template of these functions. */
172 encode_coding_XXX (coding
, source
, destination
, src_bytes
, dst_bytes
)
173 struct coding_system
*coding
;
174 unsigned char *source
, *destination
;
175 int src_bytes
, dst_bytes
;
181 /*** COMMONLY USED MACROS ***/
183 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
184 get one and two bytes from the source text respectively.
185 If there are not enough bytes in the source, they jump to
186 `label_end_of_loop'. The caller should set variables `coding',
187 `src' and `src_end' to appropriate pointer in advance. These
188 macros are called from decoding routines `decode_coding_XXX', thus
189 it is assumed that the source text is unibyte. */
191 #define ONE_MORE_BYTE(c1) \
193 if (src >= src_end) \
195 coding->result = CODING_FINISH_INSUFFICIENT_SRC; \
196 goto label_end_of_loop; \
201 #define TWO_MORE_BYTES(c1, c2) \
203 if (src + 1 >= src_end) \
205 coding->result = CODING_FINISH_INSUFFICIENT_SRC; \
206 goto label_end_of_loop; \
213 /* Set C to the next character at the source text pointed by `src'.
214 If there are not enough characters in the source, jump to
215 `label_end_of_loop'. The caller should set variables `coding',
216 `src', `src_end', and `translation_table' to appropriate pointers
217 in advance. This macro is used in encoding routines
218 `encode_coding_XXX', thus it assumes that the source text is in
219 multibyte form except for 8-bit characters. 8-bit characters are
220 in multibyte form if coding->src_multibyte is nonzero, else they
221 are represented by a single byte. */
223 #define ONE_MORE_CHAR(c) \
225 int len = src_end - src; \
229 coding->result = CODING_FINISH_INSUFFICIENT_SRC; \
230 goto label_end_of_loop; \
232 if (coding->src_multibyte \
233 || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes)) \
234 c = STRING_CHAR_AND_LENGTH (src, len, bytes); \
236 c = *src, bytes = 1; \
237 if (!NILP (translation_table)) \
238 c = translate_char (translation_table, c, 0, 0, 0); \
243 /* Produce a multibyte form of character C to `dst'. Jump to
244 `label_end_of_loop' if there's not enough space at `dst'.
246 If we are now in the middle of composition sequence, the decoded
247 character may be ALTCHAR (for the current composition). In that
248 case, the character goes to coding->cmp_data->data instead of
251 This macro is used in decoding routines. */
253 #define EMIT_CHAR(c) \
255 if (! COMPOSING_P (coding) \
256 || coding->composing == COMPOSITION_RELATIVE \
257 || coding->composing == COMPOSITION_WITH_RULE) \
259 int bytes = CHAR_BYTES (c); \
260 if ((dst + bytes) > (dst_bytes ? dst_end : src)) \
262 coding->result = CODING_FINISH_INSUFFICIENT_DST; \
263 goto label_end_of_loop; \
265 dst += CHAR_STRING (c, dst); \
266 coding->produced_char++; \
269 if (COMPOSING_P (coding) \
270 && coding->composing != COMPOSITION_RELATIVE) \
272 CODING_ADD_COMPOSITION_COMPONENT (coding, c); \
273 coding->composition_rule_follows \
274 = coding->composing != COMPOSITION_WITH_ALTCHARS; \
279 #define EMIT_ONE_BYTE(c) \
281 if (dst >= (dst_bytes ? dst_end : src)) \
283 coding->result = CODING_FINISH_INSUFFICIENT_DST; \
284 goto label_end_of_loop; \
289 #define EMIT_TWO_BYTES(c1, c2) \
291 if (dst + 2 > (dst_bytes ? dst_end : src)) \
293 coding->result = CODING_FINISH_INSUFFICIENT_DST; \
294 goto label_end_of_loop; \
296 *dst++ = c1, *dst++ = c2; \
299 #define EMIT_BYTES(from, to) \
301 if (dst + (to - from) > (dst_bytes ? dst_end : src)) \
303 coding->result = CODING_FINISH_INSUFFICIENT_DST; \
304 goto label_end_of_loop; \
311 /*** 1. Preamble ***/
324 #include "composite.h"
329 #else /* not emacs */
333 #endif /* not emacs */
/* Lisp_Object handles used by the coding-system code.
   NOTE(review): the Q-prefixed names presumably follow the Emacs
   convention of holding the Lisp symbol whose name matches the
   identifier (e.g. `coding-system', `eol-type'); the code that
   initializes them is not visible in this part of the file -- confirm
   there before relying on this.  */
335 Lisp_Object Qcoding_system
, Qeol_type
;
336 Lisp_Object Qbuffer_file_coding_system
;
337 Lisp_Object Qpost_read_conversion
, Qpre_write_conversion
;
338 Lisp_Object Qno_conversion
, Qundecided
;
339 Lisp_Object Qcoding_system_history
;
340 Lisp_Object Qsafe_charsets
;
341 Lisp_Object Qvalid_codes
;
/* These two are only declared here (`extern'); their definitions live
   in another file.  */
343 extern Lisp_Object Qinsert_file_contents
, Qwrite_region
;
/* NOTE(review): presumably symbols naming the I/O primitives for which
   a coding system can be looked up -- verify against the users.  */
344 Lisp_Object Qcall_process
, Qcall_process_region
, Qprocess_argument
;
345 Lisp_Object Qstart_process
, Qopen_network_stream
;
346 Lisp_Object Qtarget_idx
;
/* NOTE(review): V-prefixed, so presumably the value of a Lisp variable
   (here apparently a function used to select a safe coding system) --
   confirm against its DEFVAR.  */
348 Lisp_Object Vselect_safe_coding_system_function
;
350 /* Mnemonic string for each format of end-of-line. */
351 Lisp_Object eol_mnemonic_unix
, eol_mnemonic_dos
, eol_mnemonic_mac
;
352 /* Mnemonic string to indicate format of end-of-line is not yet
354 Lisp_Object eol_mnemonic_undecided
;
356 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
357 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
362 Lisp_Object Vcoding_system_list
, Vcoding_system_alist
;
364 Lisp_Object Qcoding_system_p
, Qcoding_system_error
;
366 /* Coding system emacs-mule and raw-text are for converting only
367 end-of-line format. */
368 Lisp_Object Qemacs_mule
, Qraw_text
;
370 /* Coding-systems are handed between Emacs Lisp programs and C internal
371 routines by the following three variables. */
372 /* Coding-system for reading files and receiving data from process. */
373 Lisp_Object Vcoding_system_for_read
;
374 /* Coding-system for writing files and sending data to process. */
375 Lisp_Object Vcoding_system_for_write
;
376 /* Coding-system actually used in the latest I/O. */
377 Lisp_Object Vlast_coding_system_used
;
379 /* A vector of length 256 which contains information about special
380 Latin codes (especially for dealing with Microsoft codes). */
381 Lisp_Object Vlatin_extra_code_table
;
383 /* Flag to inhibit code conversion of end-of-line format. */
384 int inhibit_eol_conversion
;
386 /* Flag to make buffer-file-coding-system inherit from process-coding. */
387 int inherit_process_coding_system
;
389 /* Coding system to be used to encode text for terminal display. */
390 struct coding_system terminal_coding
;
392 /* Coding system to be used to encode text for terminal display when
393 terminal coding system is nil. */
394 struct coding_system safe_terminal_coding
;
396 /* Coding system of what is sent from terminal keyboard. */
397 struct coding_system keyboard_coding
;
399 /* Default coding system to be used to write a file. */
400 struct coding_system default_buffer_file_coding
;
402 Lisp_Object Vfile_coding_system_alist
;
403 Lisp_Object Vprocess_coding_system_alist
;
404 Lisp_Object Vnetwork_coding_system_alist
;
406 Lisp_Object Vlocale_coding_system
;
410 Lisp_Object Qcoding_category
, Qcoding_category_index
;
412 /* List of symbols `coding-category-xxx' ordered by priority. */
413 Lisp_Object Vcoding_category_list
;
415 /* Table of coding categories (Lisp symbols). */
416 Lisp_Object Vcoding_category_table
;
418 /* Table of names of symbol for each coding-category. */
419 char *coding_category_name
[CODING_CATEGORY_IDX_MAX
] = {
420 "coding-category-emacs-mule",
421 "coding-category-sjis",
422 "coding-category-iso-7",
423 "coding-category-iso-7-tight",
424 "coding-category-iso-8-1",
425 "coding-category-iso-8-2",
426 "coding-category-iso-7-else",
427 "coding-category-iso-8-else",
428 "coding-category-ccl",
429 "coding-category-big5",
430 "coding-category-utf-8",
431 "coding-category-utf-16-be",
432 "coding-category-utf-16-le",
433 "coding-category-raw-text",
434 "coding-category-binary"
437 /* Table of pointers to coding systems corresponding to each coding
439 struct coding_system
*coding_system_table
[CODING_CATEGORY_IDX_MAX
];
441 /* Table of coding category masks. Nth element is a mask for a coding
442 category of which priority is Nth. */
444 int coding_priorities
[CODING_CATEGORY_IDX_MAX
];
446 /* Flag to tell if we look up translation table on character code
448 Lisp_Object Venable_character_translation
;
449 /* Standard translation table to look up on decoding (reading). */
450 Lisp_Object Vstandard_translation_table_for_decode
;
451 /* Standard translation table to look up on encoding (writing). */
452 Lisp_Object Vstandard_translation_table_for_encode
;
454 Lisp_Object Qtranslation_table
;
455 Lisp_Object Qtranslation_table_id
;
456 Lisp_Object Qtranslation_table_for_decode
;
457 Lisp_Object Qtranslation_table_for_encode
;
459 /* Alist of charsets vs revision number. */
460 Lisp_Object Vcharset_revision_alist
;
462 /* Default coding systems used for process I/O. */
463 Lisp_Object Vdefault_process_coding_system
;
465 /* Global flag to tell that we can't call post-read-conversion and
466 pre-write-conversion functions. Usually the value is zero, but it
467 is set to 1 temporarily while such functions are running. This is
468 to avoid infinite recursive call. */
469 static int inhibit_pre_post_conversion
;
472 /*** 2. Emacs internal format (emacs-mule) handlers ***/
474 /* Emacs' internal format for encoding multiple character sets is a
475 kind of multi-byte encoding, i.e. characters are encoded by
476 variable-length sequences of one-byte codes.
478 ASCII characters and control characters (e.g. `tab', `newline') are
479 represented by one-byte sequences which are their ASCII codes, in
480 the range 0x00 through 0x7F.
482 8-bit characters of the range 0x80..0x9F are represented by
483 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
486 8-bit characters of the range 0xA0..0xFF are represented by
487 one-byte sequences which are their 8-bit code.
489 The other characters are represented by a sequence of `base
490 leading-code', optional `extended leading-code', and one or two
491 `position-code's. The length of the sequence is determined by the
492 base leading-code. Leading-code takes the range 0x80 through 0x9F,
493 whereas extended leading-code and position-code take the range 0xA0
494 through 0xFF. See `charset.h' for more details about leading-code
497 --- CODE RANGE of Emacs' internal format ---
501 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
502 eight-bit-graphic 0xA0..0xFF
503 ELSE 0x81..0x9F + [0xA0..0xFF]+
504 ---------------------------------------------
508 enum emacs_code_class_type emacs_code_class
[256];
510 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
511 Check if a text is encoded in Emacs' internal format. If it is,
512 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
515 detect_coding_emacs_mule (src
, src_end
)
516 unsigned char *src
, *src_end
;
520 /* Dummy for ONE_MORE_BYTE. */
521 struct coding_system dummy_coding
;
522 struct coding_system
*coding
= &dummy_coding
;
543 if (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
)
546 else if (c
>= 0x80 && c
< 0xA0)
549 /* Old leading code for a composite character. */
553 unsigned char *src_base
= src
- 1;
556 if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base
, src_end
- src_base
,
559 src
= src_base
+ bytes
;
564 return CODING_CATEGORY_MASK_EMACS_MULE
;
568 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
571 decode_coding_emacs_mule (coding
, source
, destination
, src_bytes
, dst_bytes
)
572 struct coding_system
*coding
;
573 unsigned char *source
, *destination
;
574 int src_bytes
, dst_bytes
;
576 unsigned char *src
= source
;
577 unsigned char *src_end
= source
+ src_bytes
;
578 unsigned char *dst
= destination
;
579 unsigned char *dst_end
= destination
+ dst_bytes
;
580 /* SRC_BASE remembers the start position in source in each loop.
581 The loop will be exited when there's not enough source code, or
582 when there's not enough destination area to produce a
584 unsigned char *src_base
;
586 coding
->produced_char
= 0;
587 while (src
< src_end
)
589 unsigned char tmp
[MAX_MULTIBYTE_LENGTH
], *p
;
593 if (UNIBYTE_STR_AS_MULTIBYTE_P (src
, src_end
- src
, bytes
))
600 bytes
= CHAR_STRING (*src
, tmp
);
604 if (dst
+ bytes
>= (dst_bytes
? dst_end
: src
))
606 coding
->result
= CODING_FINISH_INSUFFICIENT_DST
;
609 while (bytes
--) *dst
++ = *p
++;
610 coding
->produced_char
++;
612 coding
->consumed
= coding
->consumed_char
= src_base
- source
;
613 coding
->produced
= dst
- destination
;
/* Encoding to emacs-mule is delegated entirely to encode_eol, which is
   called with the same five arguments in the same order; this macro
   performs no other conversion of its own.  */
616 #define encode_coding_emacs_mule(coding, source, destination, src_bytes, dst_bytes) \
617 encode_eol (coding, source, destination, src_bytes, dst_bytes)
621 /*** 3. ISO2022 handlers ***/
623 /* The following note describes the coding system ISO2022 briefly.
624 Since the intention of this note is to help understand the
625 functions in this file, some parts are NOT ACCURATE or OVERLY
626 SIMPLIFIED. For thorough understanding, please refer to the
627 original document of ISO2022.
629 ISO2022 provides many mechanisms to encode several character sets
630 in 7-bit and 8-bit environments. For 7-bit environments, all text
631 is encoded using bytes less than 128. This may make the encoded
632 text a little bit longer, but the text passes more easily through
633 several gateways, some of which strip off MSB (Most Significant Bit).
635 There are two kinds of character sets: control character set and
636 graphic character set. The former contains control characters such
637 as `newline' and `escape' to provide control functions (control
638 functions are also provided by escape sequences). The latter
639 contains graphic characters such as 'A' and '-'. Emacs recognizes
640 two control character sets and many graphic character sets.
642 Graphic character sets are classified into one of the following
643 four classes, according to the number of bytes (DIMENSION) and
644 number of characters in one dimension (CHARS) of the set:
650 In addition, each character set is assigned an identification tag,
651 unique for each set, called "final character" (denoted as <F>
652 hereafter). The <F> of each character set is decided by ECMA(*)
653 when it is registered in ISO. The code range of <F> is 0x30..0x7F
654 (0x30..0x3F are for private use only).
656 Note (*): ECMA = European Computer Manufacturers Association
658 Here are examples of graphic character set [NAME(<F>)]:
659 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
660 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
661 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
662 o DIMENSION2_CHARS96 -- none for the moment
664 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
665 C0 [0x00..0x1F] -- control character plane 0
666 GL [0x20..0x7F] -- graphic character plane 0
667 C1 [0x80..0x9F] -- control character plane 1
668 GR [0xA0..0xFF] -- graphic character plane 1
670 A control character set is directly designated and invoked to C0 or
671 C1 by an escape sequence. The most common case is that:
672 - ISO646's control character set is designated/invoked to C0, and
673 - ISO6429's control character set is designated/invoked to C1,
674 and usually these designations/invocations are omitted in encoded
675 text. In a 7-bit environment, only C0 can be used, and a control
676 character for C1 is encoded by an appropriate escape sequence to
677 fit into the environment. All control characters for C1 are
678 defined to have corresponding escape sequences.
680 A graphic character set is at first designated to one of four
681 graphic registers (G0 through G3), then these graphic registers are
682 invoked to GL or GR. These designations and invocations can be
683 done independently. The most common case is that G0 is invoked to
684 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
685 these invocations and designations are omitted in encoded text.
686 In a 7-bit environment, only GL can be used.
688 When a graphic character set of CHARS94 is invoked to GL, codes
689 0x20 and 0x7F of the GL area work as control characters SPACE and
690 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
693 There are two ways of invocation: locking-shift and single-shift.
694 With locking-shift, the invocation lasts until the next different
695 invocation, whereas with single-shift, the invocation affects the
696 following character only and doesn't affect the locking-shift
697 state. Invocations are done by the following control characters or
700 ----------------------------------------------------------------------
701 abbrev function cntrl escape seq description
702 ----------------------------------------------------------------------
703 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
704 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
705 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
706 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
707 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
708 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
709 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
710 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
711 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
712 ----------------------------------------------------------------------
713 (*) These are not used by any known coding system.
715 Control characters for these functions are defined by macros
716 ISO_CODE_XXX in `coding.h'.
718 Designations are done by the following escape sequences:
719 ----------------------------------------------------------------------
720 escape sequence description
721 ----------------------------------------------------------------------
722 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
723 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
724 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
725 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
726 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
727 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
728 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
729 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
730 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
731 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
732 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
733 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
734 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
735 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
736 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
737 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
738 ----------------------------------------------------------------------
740 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
741 of dimension 1, chars 94, and final character <F>, etc...
743 Note (*): Although these designations are not allowed in ISO2022,
744 Emacs accepts them on decoding, and produces them on encoding
745 CHARS96 character sets in a coding system which is characterized as
746 7-bit environment, non-locking-shift, and non-single-shift.
748 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
749 '(' can be omitted. We refer to this as "short-form" hereafter.
751 Now you may notice that there are a lot of ways for encoding the
752 same multilingual text in ISO2022. Actually, there exist many
753 coding systems such as Compound Text (used in X11's inter client
754 communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
755 (used in Korean internet), EUC (Extended UNIX Code, used in Asian
756 localized platforms), and all of these are variants of ISO2022.
758 In addition to the above, Emacs handles two more kinds of escape
759 sequences: ISO6429's direction specification and Emacs' private
760 sequence for specifying character composition.
762 ISO6429's direction specification takes the following form:
763 o CSI ']' -- end of the current direction
764 o CSI '0' ']' -- end of the current direction
765 o CSI '1' ']' -- start of left-to-right text
766 o CSI '2' ']' -- start of right-to-left text
767 The control character CSI (0x9B: control sequence introducer) is
768 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
770 Character composition specification takes the following form:
771 o ESC '0' -- start relative composition
772 o ESC '1' -- end composition
773 o ESC '2' -- start rule-base composition (*)
774 o ESC '3' -- start relative composition with alternate chars (**)
775 o ESC '4' -- start rule-base composition with alternate chars (**)
776 Since these are not standard escape sequences of any ISO standard,
777 the use of them for these meaning is restricted to Emacs only.
779 (*) This form is used only in Emacs 20.5 and the older versions,
780 but the newer versions can safely decode it.
781 (**) This form is used only in Emacs 21.1 and the newer versions,
782 and the older versions can't decode it.
784 Here's a list of example usages of these composition escape
785 sequences (categorized by `enum composition_method').
787 COMPOSITION_RELATIVE:
788 ESC 0 CHAR [ CHAR ] ESC 1
789 COMPOSITION_WITH_RULE:
790 ESC 2 CHAR [ RULE CHAR ] ESC 1
791 COMPOSITION_WITH_ALTCHARS:
792 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
793 COMPOSITION_WITH_RULE_ALTCHARS:
794 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
796 enum iso_code_class_type iso_code_class
[256];
/* Nonzero if CHARSET is acceptable for the coding system stored at
   index IDX of coding_system_table.  The table entry must be non-null,
   and either its safe_charsets[CHARSET] is nonzero or its requested
   designation for CHARSET is something other than
   CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION.
   IDX and CHARSET may each be evaluated more than once, so pass
   expressions without side effects.  */
798 #define CHARSET_OK(idx, charset) \
799 (coding_system_table[idx] \
800 && (coding_system_table[idx]->safe_charsets[charset] \
801 || (CODING_SPEC_ISO_REQUESTED_DESIGNATION \
802 (coding_system_table[idx], charset) \
803 != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)))
/* Nonzero if the coding system stored at index IDX of
   coding_system_table has an initial designation >= 0 for register 1.
   NOTE(review): register 1 is presumably G1, the register that
   shift-out invokes into GL -- confirm against the definition of
   CODING_SPEC_ISO_INITIAL_DESIGNATION.
   NOTE(review): unlike CHARSET_OK, this does not test the table entry
   for null before using it; verify that every caller guarantees a
   non-null entry.  */
805 #define SHIFT_OUT_OK(idx) \
806 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
808 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
809 Check if a text is encoded in ISO2022. If it is, returns an
810 integer in which appropriate flag bits any of:
811 CODING_CATEGORY_MASK_ISO_7
812 CODING_CATEGORY_MASK_ISO_7_TIGHT
813 CODING_CATEGORY_MASK_ISO_8_1
814 CODING_CATEGORY_MASK_ISO_8_2
815 CODING_CATEGORY_MASK_ISO_7_ELSE
816 CODING_CATEGORY_MASK_ISO_8_ELSE
817 are set. If a code which should never appear in ISO2022 is found,
821 detect_coding_iso2022 (src
, src_end
)
822 unsigned char *src
, *src_end
;
824 int mask
= CODING_CATEGORY_MASK_ISO
;
826 int reg
[4], shift_out
= 0, single_shifting
= 0;
827 int c
, c1
, i
, charset
;
828 /* Dummy for ONE_MORE_BYTE. */
829 struct coding_system dummy_coding
;
830 struct coding_system
*coding
= &dummy_coding
;
832 reg
[0] = CHARSET_ASCII
, reg
[1] = reg
[2] = reg
[3] = -1;
833 while (mask
&& src
< src_end
)
841 if (c
>= '(' && c
<= '/')
843 /* Designation sequence for a charset of dimension 1. */
845 if (c1
< ' ' || c1
>= 0x80
846 || (charset
= iso_charset_table
[0][c
>= ','][c1
]) < 0)
847 /* Invalid designation sequence. Just ignore. */
849 reg
[(c
- '(') % 4] = charset
;
853 /* Designation sequence for a charset of dimension 2. */
855 if (c
>= '@' && c
<= 'B')
856 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
857 reg
[0] = charset
= iso_charset_table
[1][0][c
];
858 else if (c
>= '(' && c
<= '/')
861 if (c1
< ' ' || c1
>= 0x80
862 || (charset
= iso_charset_table
[1][c
>= ','][c1
]) < 0)
863 /* Invalid designation sequence. Just ignore. */
865 reg
[(c
- '(') % 4] = charset
;
868 /* Invalid designation sequence. Just ignore. */
871 else if (c
== 'N' || c
== 'O')
873 /* ESC <Fe> for SS2 or SS3. */
874 mask
&= CODING_CATEGORY_MASK_ISO_7_ELSE
;
877 else if (c
>= '0' && c
<= '4')
879 /* ESC <Fp> for start/end composition. */
880 mask_found
|= CODING_CATEGORY_MASK_ISO
;
884 /* Invalid escape sequence. Just ignore. */
887 /* We found a valid designation sequence for CHARSET. */
888 mask
&= ~CODING_CATEGORY_MASK_ISO_8BIT
;
889 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7
, charset
))
890 mask_found
|= CODING_CATEGORY_MASK_ISO_7
;
892 mask
&= ~CODING_CATEGORY_MASK_ISO_7
;
893 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT
, charset
))
894 mask_found
|= CODING_CATEGORY_MASK_ISO_7_TIGHT
;
896 mask
&= ~CODING_CATEGORY_MASK_ISO_7_TIGHT
;
897 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE
, charset
))
898 mask_found
|= CODING_CATEGORY_MASK_ISO_7_ELSE
;
900 mask
&= ~CODING_CATEGORY_MASK_ISO_7_ELSE
;
901 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE
, charset
))
902 mask_found
|= CODING_CATEGORY_MASK_ISO_8_ELSE
;
904 mask
&= ~CODING_CATEGORY_MASK_ISO_8_ELSE
;
911 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE
)
912 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE
)))
914 /* Locking shift out. */
915 mask
&= ~CODING_CATEGORY_MASK_ISO_7BIT
;
916 mask_found
|= CODING_CATEGORY_MASK_ISO_SHIFT
;
924 /* Locking shift in. */
925 mask
&= ~CODING_CATEGORY_MASK_ISO_7BIT
;
926 mask_found
|= CODING_CATEGORY_MASK_ISO_SHIFT
;
935 int newmask
= CODING_CATEGORY_MASK_ISO_8_ELSE
;
937 if (c
!= ISO_CODE_CSI
)
939 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_1
]->flags
940 & CODING_FLAG_ISO_SINGLE_SHIFT
)
941 newmask
|= CODING_CATEGORY_MASK_ISO_8_1
;
942 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_2
]->flags
943 & CODING_FLAG_ISO_SINGLE_SHIFT
)
944 newmask
|= CODING_CATEGORY_MASK_ISO_8_2
;
947 if (VECTORP (Vlatin_extra_code_table
)
948 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
950 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_1
]->flags
951 & CODING_FLAG_ISO_LATIN_EXTRA
)
952 newmask
|= CODING_CATEGORY_MASK_ISO_8_1
;
953 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_2
]->flags
954 & CODING_FLAG_ISO_LATIN_EXTRA
)
955 newmask
|= CODING_CATEGORY_MASK_ISO_8_2
;
958 mask_found
|= newmask
;
971 if (VECTORP (Vlatin_extra_code_table
)
972 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
976 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_1
]->flags
977 & CODING_FLAG_ISO_LATIN_EXTRA
)
978 newmask
|= CODING_CATEGORY_MASK_ISO_8_1
;
979 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_2
]->flags
980 & CODING_FLAG_ISO_LATIN_EXTRA
)
981 newmask
|= CODING_CATEGORY_MASK_ISO_8_2
;
983 mask_found
|= newmask
;
990 unsigned char *src_begin
= src
;
992 mask
&= ~(CODING_CATEGORY_MASK_ISO_7BIT
993 | CODING_CATEGORY_MASK_ISO_7_ELSE
);
994 mask_found
|= CODING_CATEGORY_MASK_ISO_8_1
;
995 /* Check the length of succeeding codes of the range
996 0xA0..0xFF. If the byte length is odd, we exclude
997 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
998 when we are not single shifting. */
1000 && mask
& CODING_CATEGORY_MASK_ISO_8_2
)
1003 while (src
< src_end
)
1011 if (i
& 1 && src
< src_end
)
1012 mask
&= ~CODING_CATEGORY_MASK_ISO_8_2
;
1014 mask_found
|= CODING_CATEGORY_MASK_ISO_8_2
;
1021 return (mask
& mask_found
);
/* Decode a character whose charset is CHARSET, whose 1st position
   code is C1 and whose 2nd position code is C2, and return the decoded
   character code.  If the variable `translation_table' is non-nil,
   return the code translated through that table instead.  */

#define DECODE_ISO_CHARACTER(charset, c1, c2)				\
  (!NILP (translation_table)						\
   ? translate_char (translation_table, -1, charset, c1, c2)		\
   : MAKE_CHAR (charset, c1, c2))
1034 /* Set designation state into CODING. */
1035 #define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
1039 if (final_char < '0' || final_char >= 128) \
1040 goto label_invalid_code; \
1041 charset = ISO_CHARSET_TABLE (make_number (dimension), \
1042 make_number (chars), \
1043 make_number (final_char)); \
1045 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
1046 || coding->safe_charsets[charset])) \
1048 if (coding->spec.iso2022.last_invalid_designation_register == 0 \
1050 && charset == CHARSET_ASCII) \
1052 /* We should insert this designation sequence as is so \
1053 that it is surely written back to a file. */ \
1054 coding->spec.iso2022.last_invalid_designation_register = -1; \
1055 goto label_invalid_code; \
1057 coding->spec.iso2022.last_invalid_designation_register = -1; \
1058 if ((coding->mode & CODING_MODE_DIRECTION) \
1059 && CHARSET_REVERSE_CHARSET (charset) >= 0) \
1060 charset = CHARSET_REVERSE_CHARSET (charset); \
1061 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
1065 coding->spec.iso2022.last_invalid_designation_register = reg; \
1066 goto label_invalid_code; \
1070 /* Allocate a memory block for storing information about compositions.
1071 The block is chained to the already allocated blocks. */
1074 coding_allocate_composition_data (coding
, char_offset
)
1075 struct coding_system
*coding
;
1078 struct composition_data
*cmp_data
1079 = (struct composition_data
*) xmalloc (sizeof *cmp_data
);
1081 cmp_data
->char_offset
= char_offset
;
1083 cmp_data
->prev
= coding
->cmp_data
;
1084 cmp_data
->next
= NULL
;
1085 if (coding
->cmp_data
)
1086 coding
->cmp_data
->next
= cmp_data
;
1087 coding
->cmp_data
= cmp_data
;
1088 coding
->cmp_data_start
= 0;
1091 /* Record the starting position START and METHOD of one composition. */
1093 #define CODING_ADD_COMPOSITION_START(coding, start, method) \
1095 struct composition_data *cmp_data = coding->cmp_data; \
1096 int *data = cmp_data->data + cmp_data->used; \
1097 coding->cmp_data_start = cmp_data->used; \
1099 data[1] = cmp_data->char_offset + start; \
1100 data[3] = (int) method; \
1101 cmp_data->used += 4; \
1104 /* Record the ending position END of the current composition. */
1106 #define CODING_ADD_COMPOSITION_END(coding, end) \
1108 struct composition_data *cmp_data = coding->cmp_data; \
1109 int *data = cmp_data->data + coding->cmp_data_start; \
1110 data[0] = cmp_data->used - coding->cmp_data_start; \
1111 data[2] = cmp_data->char_offset + end; \
/* Record one COMPONENT (alternate character or composition rule):
   store it at index `used' of the `data' array of CODING's current
   cmp_data block and increment `used'.  The value of the macro is the
   stored component.

   Both arguments are parenthesized so that any expression (for
   instance `&cs' or `a | b') can be passed safely.  CODING is
   evaluated twice, so it must not have side effects.  */

#define CODING_ADD_COMPOSITION_COMPONENT(coding, component)		\
  ((coding)->cmp_data->data[(coding)->cmp_data->used++] = (component))
1119 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4. */
1121 #define DECODE_COMPOSITION_START(c1) \
1123 if (coding->composing == COMPOSITION_DISABLED) \
1125 *dst++ = ISO_CODE_ESC; \
1126 *dst++ = c1 & 0x7f; \
1127 coding->produced_char += 2; \
1129 else if (!COMPOSING_P (coding)) \
1131 /* This is surely the start of a composition. We must be sure \
1132 that coding->cmp_data has enough space to store the \
1133 information about the composition. If not, terminate the \
1134 current decoding loop, allocate one more memory block for \
1135 coding->cmp_data in the caller, then start the decoding \
1136 loop again. We can't allocate memory here directly because \
1137 it may cause buffer/string relocation. */ \
1138 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
1139 >= COMPOSITION_DATA_SIZE) \
1141 coding->result = CODING_FINISH_INSUFFICIENT_CMP; \
1142 goto label_end_of_loop; \
1144 coding->composing = (c1 == '0' ? COMPOSITION_RELATIVE \
1145 : c1 == '2' ? COMPOSITION_WITH_RULE \
1146 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
1147 : COMPOSITION_WITH_RULE_ALTCHARS); \
1148 CODING_ADD_COMPOSITION_START (coding, coding->produced_char, \
1149 coding->composing); \
1150 coding->composition_rule_follows = 0; \
1154 /* We are already handling a composition. If the method is \
1155 the following two, the codes following the current escape \
1156 sequence are actual characters stored in a buffer. */ \
1157 if (coding->composing == COMPOSITION_WITH_ALTCHARS \
1158 || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS) \
1160 coding->composing = COMPOSITION_RELATIVE; \
1161 coding->composition_rule_follows = 0; \
1166 /* Handle composition end sequence ESC 1. */
1168 #define DECODE_COMPOSITION_END(c1) \
1170 if (coding->composing == COMPOSITION_DISABLED) \
1172 *dst++ = ISO_CODE_ESC; \
1174 coding->produced_char += 2; \
1178 CODING_ADD_COMPOSITION_END (coding, coding->produced_char); \
1179 coding->composing = COMPOSITION_NO; \
1183 /* Decode a composition rule from the byte C1 (and maybe one more byte
1184 from SRC) and store one encoded composition rule in
1185 coding->cmp_data. */
1187 #define DECODE_COMPOSITION_RULE(c1) \
1191 if (c1 < 81) /* old format (before ver.21) */ \
1193 int gref = (c1) / 9; \
1194 int nref = (c1) % 9; \
1195 if (gref == 4) gref = 10; \
1196 if (nref == 4) nref = 10; \
1197 rule = COMPOSITION_ENCODE_RULE (gref, nref); \
1199 else if (c1 < 93) /* new format (after ver.21) */ \
1201 ONE_MORE_BYTE (c2); \
1202 rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
1204 CODING_ADD_COMPOSITION_COMPONENT (coding, rule); \
1205 coding->composition_rule_follows = 0; \
1209 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1212 decode_coding_iso2022 (coding
, source
, destination
, src_bytes
, dst_bytes
)
1213 struct coding_system
*coding
;
1214 unsigned char *source
, *destination
;
1215 int src_bytes
, dst_bytes
;
1217 unsigned char *src
= source
;
1218 unsigned char *src_end
= source
+ src_bytes
;
1219 unsigned char *dst
= destination
;
1220 unsigned char *dst_end
= destination
+ dst_bytes
;
1221 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1222 int charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1223 int charset1
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 1);
1224 /* SRC_BASE remembers the start position in source in each loop.
1225 The loop will be exited when there's not enough source code
1226 (within macro ONE_MORE_BYTE), or when there's not enough
1227 destination area to produce a character (within macro
1229 unsigned char *src_base
;
1231 Lisp_Object translation_table
;
1233 if (NILP (Venable_character_translation
))
1234 translation_table
= Qnil
;
1237 translation_table
= coding
->translation_table_for_decode
;
1238 if (NILP (translation_table
))
1239 translation_table
= Vstandard_translation_table_for_decode
;
1242 coding
->result
= CODING_FINISH_NORMAL
;
1251 /* We produce no character or one character. */
1252 switch (iso_code_class
[c1
])
1254 case ISO_0x20_or_0x7F
:
1255 if (COMPOSING_P (coding
) && coding
->composition_rule_follows
)
1257 DECODE_COMPOSITION_RULE (c1
);
1260 if (charset0
< 0 || CHARSET_CHARS (charset0
) == 94)
1262 /* This is SPACE or DEL. */
1263 charset
= CHARSET_ASCII
;
1266 /* This is a graphic character, we fall down ... */
1268 case ISO_graphic_plane_0
:
1269 if (COMPOSING_P (coding
) && coding
->composition_rule_follows
)
1271 DECODE_COMPOSITION_RULE (c1
);
1277 case ISO_0xA0_or_0xFF
:
1278 if (charset1
< 0 || CHARSET_CHARS (charset1
) == 94
1279 || coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
)
1280 goto label_invalid_code
;
1281 /* This is a graphic character, we fall down ... */
1283 case ISO_graphic_plane_1
:
1285 goto label_invalid_code
;
1290 if (COMPOSING_P (coding
))
1291 DECODE_COMPOSITION_END ('1');
1293 /* All ISO2022 control characters in this class have the
1294 same representation in Emacs internal format. */
1296 && (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
1297 && (coding
->eol_type
== CODING_EOL_CR
1298 || coding
->eol_type
== CODING_EOL_CRLF
))
1300 coding
->result
= CODING_FINISH_INCONSISTENT_EOL
;
1301 goto label_end_of_loop
;
1303 charset
= CHARSET_ASCII
;
1307 if (COMPOSING_P (coding
))
1308 DECODE_COMPOSITION_END ('1');
1309 goto label_invalid_code
;
1311 case ISO_carriage_return
:
1312 if (COMPOSING_P (coding
))
1313 DECODE_COMPOSITION_END ('1');
1315 if (coding
->eol_type
== CODING_EOL_CR
)
1317 else if (coding
->eol_type
== CODING_EOL_CRLF
)
1320 if (c1
!= ISO_CODE_LF
)
1322 if (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
1324 coding
->result
= CODING_FINISH_INCONSISTENT_EOL
;
1325 goto label_end_of_loop
;
1331 charset
= CHARSET_ASCII
;
1335 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
)
1336 || CODING_SPEC_ISO_DESIGNATION (coding
, 1) < 0)
1337 goto label_invalid_code
;
1338 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 1;
1339 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1343 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
))
1344 goto label_invalid_code
;
1345 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 0;
1346 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1349 case ISO_single_shift_2_7
:
1350 case ISO_single_shift_2
:
1351 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
))
1352 goto label_invalid_code
;
1353 /* SS2 is handled as an escape sequence of ESC 'N' */
1355 goto label_escape_sequence
;
1357 case ISO_single_shift_3
:
1358 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
))
1359 goto label_invalid_code
;
1360 /* SS3 is handled as an escape sequence of ESC 'O' */
1362 goto label_escape_sequence
;
1364 case ISO_control_sequence_introducer
:
1365 /* CSI is handled as an escape sequence of ESC '[' ... */
1367 goto label_escape_sequence
;
1371 label_escape_sequence
:
1372 /* Escape sequences handled by Emacs are invocation,
1373 designation, direction specification, and character
1374 composition specification. */
1377 case '&': /* revision of following character set */
1379 if (!(c1
>= '@' && c1
<= '~'))
1380 goto label_invalid_code
;
1382 if (c1
!= ISO_CODE_ESC
)
1383 goto label_invalid_code
;
1385 goto label_escape_sequence
;
1387 case '$': /* designation of 2-byte character set */
1388 if (! (coding
->flags
& CODING_FLAG_ISO_DESIGNATION
))
1389 goto label_invalid_code
;
1391 if (c1
>= '@' && c1
<= 'B')
1392 { /* designation of JISX0208.1978, GB2312.1980,
1394 DECODE_DESIGNATION (0, 2, 94, c1
);
1396 else if (c1
>= 0x28 && c1
<= 0x2B)
1397 { /* designation of DIMENSION2_CHARS94 character set */
1399 DECODE_DESIGNATION (c1
- 0x28, 2, 94, c2
);
1401 else if (c1
>= 0x2C && c1
<= 0x2F)
1402 { /* designation of DIMENSION2_CHARS96 character set */
1404 DECODE_DESIGNATION (c1
- 0x2C, 2, 96, c2
);
1407 goto label_invalid_code
;
1408 /* We must update these variables now. */
1409 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1410 charset1
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 1);
1413 case 'n': /* invocation of locking-shift-2 */
1414 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
)
1415 || CODING_SPEC_ISO_DESIGNATION (coding
, 2) < 0)
1416 goto label_invalid_code
;
1417 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 2;
1418 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1421 case 'o': /* invocation of locking-shift-3 */
1422 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
)
1423 || CODING_SPEC_ISO_DESIGNATION (coding
, 3) < 0)
1424 goto label_invalid_code
;
1425 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 3;
1426 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1429 case 'N': /* invocation of single-shift-2 */
1430 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1431 || CODING_SPEC_ISO_DESIGNATION (coding
, 2) < 0)
1432 goto label_invalid_code
;
1433 charset
= CODING_SPEC_ISO_DESIGNATION (coding
, 2);
1437 case 'O': /* invocation of single-shift-3 */
1438 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1439 || CODING_SPEC_ISO_DESIGNATION (coding
, 3) < 0)
1440 goto label_invalid_code
;
1441 charset
= CODING_SPEC_ISO_DESIGNATION (coding
, 3);
1445 case '0': case '2': case '3': case '4': /* start composition */
1446 DECODE_COMPOSITION_START (c1
);
1449 case '1': /* end composition */
1450 DECODE_COMPOSITION_END (c1
);
1453 case '[': /* specification of direction */
1454 if (coding
->flags
& CODING_FLAG_ISO_NO_DIRECTION
)
1455 goto label_invalid_code
;
1456 /* For the moment, nested direction is not supported.
1457 So, `coding->mode & CODING_MODE_DIRECTION' zero means
1458 left-to-right, and nonzero means right-to-left. */
1462 case ']': /* end of the current direction */
1463 coding
->mode
&= ~CODING_MODE_DIRECTION
;
1465 case '0': /* end of the current direction */
1466 case '1': /* start of left-to-right direction */
1469 coding
->mode
&= ~CODING_MODE_DIRECTION
;
1471 goto label_invalid_code
;
1474 case '2': /* start of right-to-left direction */
1477 coding
->mode
|= CODING_MODE_DIRECTION
;
1479 goto label_invalid_code
;
1483 goto label_invalid_code
;
1488 if (! (coding
->flags
& CODING_FLAG_ISO_DESIGNATION
))
1489 goto label_invalid_code
;
1490 if (c1
>= 0x28 && c1
<= 0x2B)
1491 { /* designation of DIMENSION1_CHARS94 character set */
1493 DECODE_DESIGNATION (c1
- 0x28, 1, 94, c2
);
1495 else if (c1
>= 0x2C && c1
<= 0x2F)
1496 { /* designation of DIMENSION1_CHARS96 character set */
1498 DECODE_DESIGNATION (c1
- 0x2C, 1, 96, c2
);
1501 goto label_invalid_code
;
1502 /* We must update these variables now. */
1503 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1504 charset1
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 1);
1509 /* Now we know CHARSET and 1st position code C1 of a character.
1510 Produce a multibyte sequence for that character while getting
1511 2nd position code C2 if necessary. */
1512 if (CHARSET_DIMENSION (charset
) == 2)
1515 if (c1
< 0x80 ? c2
< 0x20 || c2
>= 0x80 : c2
< 0xA0)
1516 /* C2 is not in a valid range. */
1517 goto label_invalid_code
;
1519 c
= DECODE_ISO_CHARACTER (charset
, c1
, c2
);
1525 if (COMPOSING_P (coding
))
1526 DECODE_COMPOSITION_END ('1');
1533 coding
->consumed
= coding
->consumed_char
= src_base
- source
;
1534 coding
->produced
= dst
- destination
;
1539 /* ISO2022 encoding stuff. */
1542 It is not enough to say just "ISO2022" on encoding, we have to
1543 specify more details. In Emacs, each coding system of ISO2022
1544 variant has the following specifications:
1545 1. Initial designation to G0 thru G3.
1546 2. Allows short-form designation?
1547 3. ASCII should be designated to G0 before control characters?
1548 4. ASCII should be designated to G0 at end of line?
1549 5. 7-bit environment or 8-bit environment?
1550 6. Use locking-shift?
1551 7. Use Single-shift?
1552 And the following two are only for Japanese:
1553 8. Use ASCII in place of JIS0201-1976-Roman?
1554 9. Use JISX0208-1983 in place of JISX0208-1978?
1555 These specifications are encoded in `coding->flags' as flag bits
1556 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1560 /* Produce codes (escape sequence) for designating CHARSET to graphic
1561 register REG at DST, and increment DST. If <final-char> of CHARSET is
1562 '@', 'A', or 'B' and the coding system CODING allows, produce
1563 designation sequence of short-form. */
1565 #define ENCODE_DESIGNATION(charset, reg, coding) \
1567 unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \
1568 char *intermediate_char_94 = "()*+"; \
1569 char *intermediate_char_96 = ",-./"; \
1570 int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset); \
1572 if (revision < 255) \
1574 *dst++ = ISO_CODE_ESC; \
1576 *dst++ = '@' + revision; \
1578 *dst++ = ISO_CODE_ESC; \
1579 if (CHARSET_DIMENSION (charset) == 1) \
1581 if (CHARSET_CHARS (charset) == 94) \
1582 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1584 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1589 if (CHARSET_CHARS (charset) == 94) \
1591 if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
1593 || final_char < '@' || final_char > 'B') \
1594 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1597 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1599 *dst++ = final_char; \
1600 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
1603 /* The following two macros produce codes (control character or escape
1604 sequence) for ISO2022 single-shift functions (single-shift-2 and
1607 #define ENCODE_SINGLE_SHIFT_2 \
1609 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1610 *dst++ = ISO_CODE_ESC, *dst++ = 'N'; \
1612 *dst++ = ISO_CODE_SS2; \
1613 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1616 #define ENCODE_SINGLE_SHIFT_3 \
1618 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1619 *dst++ = ISO_CODE_ESC, *dst++ = 'O'; \
1621 *dst++ = ISO_CODE_SS3; \
1622 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1625 /* The following four macros produce codes (control character or
1626 escape sequence) for ISO2022 locking-shift functions (shift-in,
1627 shift-out, locking-shift-2, and locking-shift-3). */
1629 #define ENCODE_SHIFT_IN \
1631 *dst++ = ISO_CODE_SI; \
1632 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1635 #define ENCODE_SHIFT_OUT \
1637 *dst++ = ISO_CODE_SO; \
1638 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1641 #define ENCODE_LOCKING_SHIFT_2 \
1643 *dst++ = ISO_CODE_ESC, *dst++ = 'n'; \
1644 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1647 #define ENCODE_LOCKING_SHIFT_3 \
1649 *dst++ = ISO_CODE_ESC, *dst++ = 'o'; \
1650 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1653 /* Produce codes for a DIMENSION1 character whose character set is
1654 CHARSET and whose position-code is C1. Designation and invocation
1655 sequences are also produced in advance if necessary. */
1657 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
1659 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1661 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1662 *dst++ = c1 & 0x7F; \
1664 *dst++ = c1 | 0x80; \
1665 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1668 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1670 *dst++ = c1 & 0x7F; \
1673 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1675 *dst++ = c1 | 0x80; \
1678 else if (coding->flags & CODING_FLAG_ISO_SAFE \
1679 && !coding->safe_charsets[charset]) \
1681 /* We should not encode this character, instead produce one or \
1683 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1684 if (CHARSET_WIDTH (charset) == 2) \
1685 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1689 /* Since CHARSET is not yet invoked to any graphic planes, we \
1690 must invoke it, or, at first, designate it to some graphic \
1691 register. Then repeat the loop to actually produce the \
1693 dst = encode_invocation_designation (charset, coding, dst); \
1696 /* Produce codes for a DIMENSION2 character whose character set is
1697 CHARSET and whose position-codes are C1 and C2. Designation and
1698 invocation codes are also produced in advance if necessary. */
1700 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
1702 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1704 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1705 *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F; \
1707 *dst++ = c1 | 0x80, *dst++ = c2 | 0x80; \
1708 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1711 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1713 *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F; \
1716 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1718 *dst++ = c1 | 0x80, *dst++= c2 | 0x80; \
1721 else if (coding->flags & CODING_FLAG_ISO_SAFE \
1722 && !coding->safe_charsets[charset]) \
1724 /* We should not encode this character, instead produce one or \
1726 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1727 if (CHARSET_WIDTH (charset) == 2) \
1728 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1732 /* Since CHARSET is not yet invoked to any graphic planes, we \
1733 must invoke it, or, at first, designate it to some graphic \
1734 register. Then repeat the loop to actually produce the \
1736 dst = encode_invocation_designation (charset, coding, dst); \
1739 #define ENCODE_ISO_CHARACTER(charset, c1, c2) \
1741 int alt_charset = charset; \
1743 if (CHARSET_DEFINED_P (charset)) \
1745 if (CHARSET_DIMENSION (charset) == 1) \
1747 if (charset == CHARSET_ASCII \
1748 && coding->flags & CODING_FLAG_ISO_USE_ROMAN) \
1749 alt_charset = charset_latin_jisx0201; \
1750 ENCODE_ISO_CHARACTER_DIMENSION1 (alt_charset, c1); \
1754 if (charset == charset_jisx0208 \
1755 && coding->flags & CODING_FLAG_ISO_USE_OLDJIS) \
1756 alt_charset = charset_jisx0208_1978; \
1757 ENCODE_ISO_CHARACTER_DIMENSION2 (alt_charset, c1, c2); \
1768 /* Produce designation and invocation codes at a place pointed by DST
1769 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1773 encode_invocation_designation (charset
, coding
, dst
)
1775 struct coding_system
*coding
;
1778 int reg
; /* graphic register number */
1780 /* At first, check designations. */
1781 for (reg
= 0; reg
< 4; reg
++)
1782 if (charset
== CODING_SPEC_ISO_DESIGNATION (coding
, reg
))
1787 /* CHARSET is not yet designated to any graphic registers. */
1788 /* At first check the requested designation. */
1789 reg
= CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
);
1790 if (reg
== CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
)
1791 /* Since CHARSET requests no special designation, designate it
1792 to graphic register 0. */
1795 ENCODE_DESIGNATION (charset
, reg
, coding
);
1798 if (CODING_SPEC_ISO_INVOCATION (coding
, 0) != reg
1799 && CODING_SPEC_ISO_INVOCATION (coding
, 1) != reg
)
1801 /* Since the graphic register REG is not invoked to any graphic
1802 planes, invoke it to graphic plane 0. */
1805 case 0: /* graphic register 0 */
1809 case 1: /* graphic register 1 */
1813 case 2: /* graphic register 2 */
1814 if (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1815 ENCODE_SINGLE_SHIFT_2
;
1817 ENCODE_LOCKING_SHIFT_2
;
1820 case 3: /* graphic register 3 */
1821 if (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1822 ENCODE_SINGLE_SHIFT_3
;
1824 ENCODE_LOCKING_SHIFT_3
;
1832 /* Produce 2-byte codes for encoded composition rule RULE. */
1834 #define ENCODE_COMPOSITION_RULE(rule) \
1837 COMPOSITION_DECODE_RULE (rule, gref, nref); \
1838 *dst++ = 32 + 81 + gref; \
1839 *dst++ = 32 + nref; \
1842 /* Produce codes for indicating the start of a composition sequence
1843 (ESC 0, ESC 3, or ESC 4). DATA points to an array of integers
1844 which specify information about the composition. See the comment
1845 in coding.h for the format of DATA. */
1847 #define ENCODE_COMPOSITION_START(coding, data) \
1849 coding->composing = data[3]; \
1850 *dst++ = ISO_CODE_ESC; \
1851 if (coding->composing == COMPOSITION_RELATIVE) \
1855 *dst++ = (coding->composing == COMPOSITION_WITH_ALTCHARS \
1857 coding->cmp_data_index = coding->cmp_data_start + 4; \
1858 coding->composition_rule_follows = 0; \
1862 /* Produce codes for indicating the end of the current composition. */
1864 #define ENCODE_COMPOSITION_END(coding, data) \
1866 *dst++ = ISO_CODE_ESC; \
1868 coding->cmp_data_start += data[0]; \
1869 coding->composing = COMPOSITION_NO; \
1870 if (coding->cmp_data_start == coding->cmp_data->used \
1871 && coding->cmp_data->next) \
1873 coding->cmp_data = coding->cmp_data->next; \
1874 coding->cmp_data_start = 0; \
1878 /* Produce composition start sequence ESC 0. Here, this sequence
1879 doesn't mean the start of a new composition but means that we have
1880 just produced components (alternate chars and composition rules) of
1881 the composition and the actual text follows in SRC. */
1883 #define ENCODE_COMPOSITION_FAKE_START(coding) \
1885 *dst++ = ISO_CODE_ESC; \
1887 coding->composing = COMPOSITION_RELATIVE; \
/* The following three macros produce codes for indicating the
   direction of text.  All of them are statement macros that write at
   DST (advancing it) and read the variable `coding' from the
   expansion site.  */

/* Emit a control sequence introducer: the two bytes ESC '[' when
   CODING_FLAG_ISO_SEVEN_BITS is set in coding->flags, the single byte
   ISO_CODE_CSI otherwise.  The flag must be tested with `&', as
   ENCODE_SINGLE_SHIFT_2 and ENCODE_SINGLE_SHIFT_3 do; comparing the
   whole flags word with `==' selects the 7-bit form only when no
   other flag happens to be set.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER				\
  do {									\
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)			\
      *dst++ = ISO_CODE_ESC, *dst++ = '[';				\
    else								\
      *dst++ = ISO_CODE_CSI;						\
  } while (0)

/* Emit CSI '2' ']' (the decoder treats this as the start of
   right-to-left direction).  */
#define ENCODE_DIRECTION_R2L						\
  do {									\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;					\
    *dst++ = '2', *dst++ = ']';						\
  } while (0)

/* Emit CSI '0' ']' (the decoder treats this as the end of the
   current direction).  */
#define ENCODE_DIRECTION_L2R						\
  do {									\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;					\
    *dst++ = '0', *dst++ = ']';						\
  } while (0)
1906 /* Produce codes for designation and invocation to reset the graphic
1907 planes and registers to initial state. */
1908 #define ENCODE_RESET_PLANE_AND_REGISTER \
1911 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
1913 for (reg = 0; reg < 4; reg++) \
1914 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0 \
1915 && (CODING_SPEC_ISO_DESIGNATION (coding, reg) \
1916 != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))) \
1917 ENCODE_DESIGNATION \
1918 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1921 /* Produce designation sequences of charsets in the line started from
1922 SRC to a place pointed by DST, and return updated DST.
1924 If the current block ends before any end-of-line, we may fail to
1925 find all the necessary designations. */
1927 static unsigned char *
1928 encode_designation_at_bol (coding
, translation_table
, src
, src_end
, dst
)
1929 struct coding_system
*coding
;
1930 Lisp_Object translation_table
;
1931 unsigned char *src
, *src_end
, *dst
;
1933 int charset
, c
, found
= 0, reg
;
1934 /* Table of charsets to be designated to each graphic register. */
1937 for (reg
= 0; reg
< 4; reg
++)
1946 charset
= CHAR_CHARSET (c
);
1947 reg
= CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
);
1948 if (reg
!= CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
&& r
[reg
] < 0)
1958 for (reg
= 0; reg
< 4; reg
++)
1960 && CODING_SPEC_ISO_DESIGNATION (coding
, reg
) != r
[reg
])
1961 ENCODE_DESIGNATION (r
[reg
], reg
, coding
);
1967 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1970 encode_coding_iso2022 (coding
, source
, destination
, src_bytes
, dst_bytes
)
1971 struct coding_system
*coding
;
1972 unsigned char *source
, *destination
;
1973 int src_bytes
, dst_bytes
;
1975 unsigned char *src
= source
;
1976 unsigned char *src_end
= source
+ src_bytes
;
1977 unsigned char *dst
= destination
;
1978 unsigned char *dst_end
= destination
+ dst_bytes
;
1979 /* Since the maximum bytes produced by each loop is 20, we subtract 19
1980 from DST_END to assure overflow checking is necessary only at the
1982 unsigned char *adjusted_dst_end
= dst_end
- 19;
1983 /* SRC_BASE remembers the start position in source in each loop.
1984 The loop will be exited when there's not enough source text to
1985 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
1986 there's not enough destination area to produce encoded codes
1987 (within macro EMIT_BYTES). */
1988 unsigned char *src_base
;
1990 Lisp_Object translation_table
;
1992 if (NILP (Venable_character_translation
))
1993 translation_table
= Qnil
;
1996 translation_table
= coding
->translation_table_for_encode
;
1997 if (NILP (translation_table
))
1998 translation_table
= Vstandard_translation_table_for_encode
;
2001 coding
->consumed_char
= 0;
2005 int charset
, c1
, c2
;
2009 if (dst
>= (dst_bytes
? adjusted_dst_end
: (src
- 19)))
2011 coding
->result
= CODING_FINISH_INSUFFICIENT_DST
;
2015 if (coding
->flags
& CODING_FLAG_ISO_DESIGNATE_AT_BOL
2016 && CODING_SPEC_ISO_BOL (coding
))
2018 /* We have to produce designation sequences if any now. */
2019 dst
= encode_designation_at_bol (coding
, translation_table
,
2021 CODING_SPEC_ISO_BOL (coding
) = 0;
2024 /* Check composition start and end. */
2025 if (coding
->composing
!= COMPOSITION_DISABLED
2026 && coding
->cmp_data_start
< coding
->cmp_data
->used
)
2028 struct composition_data
*cmp_data
= coding
->cmp_data
;
2029 int *data
= cmp_data
->data
+ coding
->cmp_data_start
;
2030 int this_pos
= cmp_data
->char_offset
+ coding
->consumed_char
;
2032 if (coding
->composing
== COMPOSITION_RELATIVE
)
2034 if (this_pos
== data
[2])
2036 ENCODE_COMPOSITION_END (coding
, data
);
2037 cmp_data
= coding
->cmp_data
;
2038 data
= cmp_data
->data
+ coding
->cmp_data_start
;
2041 else if (COMPOSING_P (coding
))
2043 /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR */
2044 if (coding
->cmp_data_index
== coding
->cmp_data_start
+ data
[0])
2045 /* We have consumed components of the composition.
2046 What follows in SRC is the composition's base
2048 ENCODE_COMPOSITION_FAKE_START (coding
);
2051 int c
= cmp_data
->data
[coding
->cmp_data_index
++];
2052 if (coding
->composition_rule_follows
)
2054 ENCODE_COMPOSITION_RULE (c
);
2055 coding
->composition_rule_follows
= 0;
2059 SPLIT_CHAR (c
, charset
, c1
, c2
);
2060 ENCODE_ISO_CHARACTER (charset
, c1
, c2
);
2061 if (coding
->composing
== COMPOSITION_WITH_RULE_ALTCHARS
)
2062 coding
->composition_rule_follows
= 1;
2067 if (!COMPOSING_P (coding
))
2069 if (this_pos
== data
[1])
2071 ENCODE_COMPOSITION_START (coding
, data
);
2079 /* Now encode the character C. */
2080 if (c
< 0x20 || c
== 0x7F)
2084 if (! (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
))
2086 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_CNTL
)
2087 ENCODE_RESET_PLANE_AND_REGISTER
;
2091 /* fall down to treat '\r' as '\n' ... */
2096 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_EOL
)
2097 ENCODE_RESET_PLANE_AND_REGISTER
;
2098 if (coding
->flags
& CODING_FLAG_ISO_INIT_AT_BOL
)
2099 bcopy (coding
->spec
.iso2022
.initial_designation
,
2100 coding
->spec
.iso2022
.current_designation
,
2101 sizeof coding
->spec
.iso2022
.initial_designation
);
2102 if (coding
->eol_type
== CODING_EOL_LF
2103 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
2104 *dst
++ = ISO_CODE_LF
;
2105 else if (coding
->eol_type
== CODING_EOL_CRLF
)
2106 *dst
++ = ISO_CODE_CR
, *dst
++ = ISO_CODE_LF
;
2108 *dst
++ = ISO_CODE_CR
;
2109 CODING_SPEC_ISO_BOL (coding
) = 1;
2113 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_CNTL
)
2114 ENCODE_RESET_PLANE_AND_REGISTER
;
2118 else if (ASCII_BYTE_P (c
))
2119 ENCODE_ISO_CHARACTER (CHARSET_ASCII
, c
, /* dummy */ c1
);
2120 else if (SINGLE_BYTE_CHAR_P (c
))
2127 SPLIT_CHAR (c
, charset
, c1
, c2
);
2128 ENCODE_ISO_CHARACTER (charset
, c1
, c2
);
2131 coding
->consumed_char
++;
2135 coding
->consumed
= src_base
- source
;
2136 coding
->produced
= coding
->produced_char
= dst
- destination
;
2140 /*** 4. SJIS and BIG5 handlers ***/
2142 /* Although SJIS and BIG5 are not ISO's coding system, they are used
2143 quite widely. So, for the moment, Emacs supports them in the bare
2144 C code. But, in the future, they may be supported only by CCL. */
2146 /* SJIS is a coding system encoding three character sets: ASCII, right
2147 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2148 as is. A character of charset katakana-jisx0201 is encoded by
2149 "position-code + 0x80". A character of charset japanese-jisx0208
2150 is encoded in 2-byte but two position-codes are divided and shifted
2151 so that they fit in the range below.
2153 --- CODE RANGE of SJIS ---
2154 (character set) (range)
2156 KATAKANA-JISX0201 0xA0 .. 0xDF
2157 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
2158 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2159 -------------------------------
2163 /* BIG5 is a coding system encoding two character sets: ASCII and
2164 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2165 character set and is encoded in two-byte.
2167 --- CODE RANGE of BIG5 ---
2168 (character set) (range)
2170 Big5 (1st byte) 0xA1 .. 0xFE
2171 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2172 --------------------------
2174 Since the number of characters in Big5 is larger than maximum
2175 characters in Emacs' charset (96x96), it can't be handled as one
2176 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2177 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2178 contains frequently used characters and the latter contains less
2179 frequently used characters. */
2181 /* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
2182 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
2183 C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
2184 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
2186 /* Number of Big5 characters which have the same code in 1st byte. */
/* The 2nd byte ranges over 0x40..0x7E (0x7F - 0x40 = 63 codes) and
   0xA1..0xFE (0xFF - 0xA1 = 94 codes), so each 1st byte covers
   94 + 63 = 157 characters.  */
2187 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
2189 #define DECODE_BIG5(b1, b2, charset, c1, c2) \
2192 = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62); \
2194 charset = charset_big5_1; \
2197 charset = charset_big5_2; \
2198 temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW; \
2200 c1 = temp / (0xFF - 0xA1) + 0x21; \
2201 c2 = temp % (0xFF - 0xA1) + 0x21; \
2204 #define ENCODE_BIG5(charset, c1, c2, b1, b2) \
2206 unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21); \
2207 if (charset == charset_big5_2) \
2208 temp += BIG5_SAME_ROW * (0xC9 - 0xA1); \
2209 b1 = temp / BIG5_SAME_ROW + 0xA1; \
2210 b2 = temp % BIG5_SAME_ROW; \
2211 b2 += b2 < 0x3F ? 0x40 : 0x62; \
2214 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2215 Check if a text is encoded in SJIS. If it is, return
2216 CODING_CATEGORY_MASK_SJIS, else return 0. */
2219 detect_coding_sjis (src
, src_end
)
2220 unsigned char *src
, *src_end
;
2223 /* Dummy for ONE_MORE_BYTE. */
2224 struct coding_system dummy_coding
;
2225 struct coding_system
*coding
= &dummy_coding
;
2230 if ((c
>= 0x80 && c
< 0xA0) || c
>= 0xE0)
2238 return CODING_CATEGORY_MASK_SJIS
;
2241 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2242 Check if a text is encoded in BIG5. If it is, return
2243 CODING_CATEGORY_MASK_BIG5, else return 0. */
2246 detect_coding_big5 (src
, src_end
)
2247 unsigned char *src
, *src_end
;
2250 /* Dummy for ONE_MORE_BYTE. */
2251 struct coding_system dummy_coding
;
2252 struct coding_system
*coding
= &dummy_coding
;
2260 if (c
< 0x40 || (c
>= 0x7F && c
<= 0xA0))
2265 return CODING_CATEGORY_MASK_BIG5
;
2268 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2269 Check if a text is encoded in UTF-8. If it is, return
2270 CODING_CATEGORY_MASK_UTF_8, else return 0. */
/* Predicates classifying a single octet C of a UTF-8 byte sequence
   by its high-order bits:

     UTF_8_1_OCTET_P          0xxxxxxx  (C is below 0x80)
     UTF_8_EXTRA_OCTET_P      10xxxxxx  (trailing octet of a sequence)
     UTF_8_2_OCTET_LEADING_P  110xxxxx
     UTF_8_3_OCTET_LEADING_P  1110xxxx
     UTF_8_4_OCTET_LEADING_P  11110xxx
     UTF_8_5_OCTET_LEADING_P  111110xx
     UTF_8_6_OCTET_LEADING_P  1111110x

   C is evaluated once per macro use; each argument is parenthesized
   in the expansion.  */
2272 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
2273 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
2274 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
2275 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
2276 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
2277 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
2278 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)
2281 detect_coding_utf_8 (src
, src_end
)
2282 unsigned char *src
, *src_end
;
2285 int seq_maybe_bytes
;
2286 /* Dummy for ONE_MORE_BYTE. */
2287 struct coding_system dummy_coding
;
2288 struct coding_system
*coding
= &dummy_coding
;
2293 if (UTF_8_1_OCTET_P (c
))
2295 else if (UTF_8_2_OCTET_LEADING_P (c
))
2296 seq_maybe_bytes
= 1;
2297 else if (UTF_8_3_OCTET_LEADING_P (c
))
2298 seq_maybe_bytes
= 2;
2299 else if (UTF_8_4_OCTET_LEADING_P (c
))
2300 seq_maybe_bytes
= 3;
2301 else if (UTF_8_5_OCTET_LEADING_P (c
))
2302 seq_maybe_bytes
= 4;
2303 else if (UTF_8_6_OCTET_LEADING_P (c
))
2304 seq_maybe_bytes
= 5;
2311 if (!UTF_8_EXTRA_OCTET_P (c
))
2315 while (seq_maybe_bytes
> 0);
2319 return CODING_CATEGORY_MASK_UTF_8
;
2322 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2323 Check if a text is encoded in UTF-16 Big Endian (endian == 1) or
2324 Little Endian (otherwise). If it is, return
2325 CODING_CATEGORY_MASK_UTF_16_BE or CODING_CATEGORY_MASK_UTF_16_LE,
/* Nonzero if the 16-bit value VAL is 0xFFFE or 0xFFFF.  */
#define UTF_16_INVALID_P(val)	\
  (((val) == 0xFFFE)		\
   || ((val) == 0xFFFF))

/* Nonzero if the 16-bit value VAL is a high (leading) surrogate,
   i.e. lies in 0xD800..0xDBFF.  The top six bits identify the
   surrogate kind, so the mask must be 0xFC00; masking with 0xD800
   would also accept 0xDC00..0xDFFF, 0xF800..0xFFFF, etc.  */
#define UTF_16_HIGH_SURROGATE_P(val)	\
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if the 16-bit value VAL is a low (trailing) surrogate,
   i.e. lies in 0xDC00..0xDFFF.  As above, the mask must be 0xFC00;
   masking with 0xDC00 would also accept 0xFC00..0xFFFF.  */
#define UTF_16_LOW_SURROGATE_P(val)	\
  (((val) & 0xFC00) == 0xDC00)
2339 detect_coding_utf_16 (src
, src_end
)
2340 unsigned char *src
, *src_end
;
2342 unsigned char c1
, c2
;
2343 /* Dummy for TWO_MORE_BYTES. */
2344 struct coding_system dummy_coding
;
2345 struct coding_system
*coding
= &dummy_coding
;
2347 TWO_MORE_BYTES (c1
, c2
);
2349 if ((c1
== 0xFF) && (c2
== 0xFE))
2350 return CODING_CATEGORY_MASK_UTF_16_LE
;
2351 else if ((c1
== 0xFE) && (c2
== 0xFF))
2352 return CODING_CATEGORY_MASK_UTF_16_BE
;
2358 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2359 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */
2362 decode_coding_sjis_big5 (coding
, source
, destination
,
2363 src_bytes
, dst_bytes
, sjis_p
)
2364 struct coding_system
*coding
;
2365 unsigned char *source
, *destination
;
2366 int src_bytes
, dst_bytes
;
2369 unsigned char *src
= source
;
2370 unsigned char *src_end
= source
+ src_bytes
;
2371 unsigned char *dst
= destination
;
2372 unsigned char *dst_end
= destination
+ dst_bytes
;
2373 /* SRC_BASE remembers the start position in source in each loop.
2374 The loop will be exited when there's not enough source code
2375 (within macro ONE_MORE_BYTE), or when there's not enough
2376 destination area to produce a character (within macro
2378 unsigned char *src_base
;
2379 Lisp_Object translation_table
;
2381 if (NILP (Venable_character_translation
))
2382 translation_table
= Qnil
;
2385 translation_table
= coding
->translation_table_for_decode
;
2386 if (NILP (translation_table
))
2387 translation_table
= Vstandard_translation_table_for_decode
;
2390 coding
->produced_char
= 0;
2393 int c
, charset
, c1
, c2
;
2400 charset
= CHARSET_ASCII
;
2405 if (coding
->eol_type
== CODING_EOL_CRLF
)
2410 else if (coding
->mode
2411 & CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
2413 coding
->result
= CODING_FINISH_INCONSISTENT_EOL
;
2414 goto label_end_of_loop
;
2417 /* To process C2 again, SRC is subtracted by 1. */
2420 else if (coding
->eol_type
== CODING_EOL_CR
)
2424 && (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
2425 && (coding
->eol_type
== CODING_EOL_CR
2426 || coding
->eol_type
== CODING_EOL_CRLF
))
2428 coding
->result
= CODING_FINISH_INCONSISTENT_EOL
;
2429 goto label_end_of_loop
;
2438 goto label_invalid_code
;
2439 if (c1
< 0xA0 || c1
>= 0xE0)
2441 /* SJIS -> JISX0208 */
2443 if (c2
< 0x40 || c2
== 0x7F || c2
> 0xFC)
2444 goto label_invalid_code
;
2445 DECODE_SJIS (c1
, c2
, c1
, c2
);
2446 charset
= charset_jisx0208
;
2449 /* SJIS -> JISX0201-Kana */
2450 charset
= charset_katakana_jisx0201
;
2455 if (c1
< 0xA1 || c1
> 0xFE)
2456 goto label_invalid_code
;
2458 if (c2
< 0x40 || (c2
> 0x7E && c2
< 0xA1) || c2
> 0xFE)
2459 goto label_invalid_code
;
2460 DECODE_BIG5 (c1
, c2
, charset
, c1
, c2
);
2464 c
= DECODE_ISO_CHARACTER (charset
, c1
, c2
);
2476 coding
->consumed
= coding
->consumed_char
= src_base
- source
;
2477 coding
->produced
= dst
- destination
;
2481 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2482 This function can encode charsets `ascii', `katakana-jisx0201',
2483 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
2484 are sure that all these charsets are registered as official charset
2485 (i.e. do not have extended leading-codes). Characters of other
2486 charsets are produced without any encoding. If SJIS_P is 1, encode
2487 SJIS text, else encode BIG5 text. */
2490 encode_coding_sjis_big5 (coding
, source
, destination
,
2491 src_bytes
, dst_bytes
, sjis_p
)
2492 struct coding_system
*coding
;
2493 unsigned char *source
, *destination
;
2494 int src_bytes
, dst_bytes
;
2497 unsigned char *src
= source
;
2498 unsigned char *src_end
= source
+ src_bytes
;
2499 unsigned char *dst
= destination
;
2500 unsigned char *dst_end
= destination
+ dst_bytes
;
2501 /* SRC_BASE remembers the start position in source in each loop.
2502 The loop will be exited when there's not enough source text to
2503 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2504 there's not enough destination area to produce encoded codes
2505 (within macro EMIT_BYTES). */
2506 unsigned char *src_base
;
2507 Lisp_Object translation_table
;
2509 if (NILP (Venable_character_translation
))
2510 translation_table
= Qnil
;
2513 translation_table
= coding
->translation_table_for_decode
;
2514 if (NILP (translation_table
))
2515 translation_table
= Vstandard_translation_table_for_decode
;
2520 int c
, charset
, c1
, c2
;
2525 /* Now encode the character C. */
2526 if (SINGLE_BYTE_CHAR_P (c
))
2531 if (!coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
)
2538 if (coding
->eol_type
== CODING_EOL_CRLF
)
2540 EMIT_TWO_BYTES ('\r', c
);
2543 else if (coding
->eol_type
== CODING_EOL_CR
)
2551 SPLIT_CHAR (c
, charset
, c1
, c2
);
2554 if (charset
== charset_jisx0208
2555 || charset
== charset_jisx0208_1978
)
2557 ENCODE_SJIS (c1
, c2
, c1
, c2
);
2558 EMIT_TWO_BYTES (c1
, c2
);
2560 else if (charset
== charset_latin_jisx0201
)
2563 /* There's no way other than producing the internal
2565 EMIT_BYTES (src_base
, src
);
2569 if (charset
== charset_big5_1
|| charset
== charset_big5_2
)
2571 ENCODE_BIG5 (charset
, c1
, c2
, c1
, c2
);
2572 EMIT_TWO_BYTES (c1
, c2
);
2575 /* There's no way other than producing the internal
2577 EMIT_BYTES (src_base
, src
);
2580 coding
->consumed_char
++;
2584 coding
->consumed
= src_base
- source
;
2585 coding
->produced
= coding
->produced_char
= dst
- destination
;
2589 /*** 5. CCL handlers ***/
2591 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2592 Check if a text is encoded in a coding system of which
2593 encoder/decoder are written in CCL program. If it is, return
2594 CODING_CATEGORY_MASK_CCL, else return 0. */
2597 detect_coding_ccl (src
, src_end
)
2598 unsigned char *src
, *src_end
;
2600 unsigned char *valid
;
2602 /* Dummy for ONE_MORE_BYTE. */
2603 struct coding_system dummy_coding
;
2604 struct coding_system
*coding
= &dummy_coding
;
2606 /* No coding system is assigned to coding-category-ccl. */
2607 if (!coding_system_table
[CODING_CATEGORY_IDX_CCL
])
2610 valid
= coding_system_table
[CODING_CATEGORY_IDX_CCL
]->spec
.ccl
.valid_codes
;
2618 return CODING_CATEGORY_MASK_CCL
;
2622 /*** 6. End-of-line handlers ***/
2624 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2627 decode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
)
2628 struct coding_system
*coding
;
2629 unsigned char *source
, *destination
;
2630 int src_bytes
, dst_bytes
;
2632 unsigned char *src
= source
;
2633 unsigned char *dst
= destination
;
2634 unsigned char *src_end
= src
+ src_bytes
;
2635 unsigned char *dst_end
= dst
+ dst_bytes
;
2636 Lisp_Object translation_table
;
2637 /* SRC_BASE remembers the start position in source in each loop.
2638 The loop will be exited when there's not enough source code
2639 (within macro ONE_MORE_BYTE), or when there's not enough
2640 destination area to produce a character (within macro
2642 unsigned char *src_base
;
2645 translation_table
= Qnil
;
2646 switch (coding
->eol_type
)
2648 case CODING_EOL_CRLF
:
2658 if (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
2660 coding
->result
= CODING_FINISH_INCONSISTENT_EOL
;
2661 goto label_end_of_loop
;
2668 && (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
))
2670 coding
->result
= CODING_FINISH_INCONSISTENT_EOL
;
2671 goto label_end_of_loop
;
2684 if (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
2686 coding
->result
= CODING_FINISH_INCONSISTENT_EOL
;
2687 goto label_end_of_loop
;
2696 default: /* no need for EOL handling */
2706 coding
->consumed
= coding
->consumed_char
= src_base
- source
;
2707 coding
->produced
= dst
- destination
;
2711 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2712 format of end-of-line according to `coding->eol_type'. It also
2713 converts multibyte-form 8-bit characters to unibyte if
2714 CODING->src_multibyte is nonzero. If `coding->mode &
2715 CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
2716 also means end-of-line. */
2719 encode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
)
2720 struct coding_system
*coding
;
2721 unsigned char *source
, *destination
;
2722 int src_bytes
, dst_bytes
;
2724 unsigned char *src
= source
;
2725 unsigned char *dst
= destination
;
2726 unsigned char *src_end
= src
+ src_bytes
;
2727 unsigned char *dst_end
= dst
+ dst_bytes
;
2728 Lisp_Object translation_table
;
2729 /* SRC_BASE remembers the start position in source in each loop.
2730 The loop will be exited when there's not enough source text to
2731 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2732 there's not enough destination area to produce encoded codes
2733 (within macro EMIT_BYTES). */
2734 unsigned char *src_base
;
2736 int selective_display
= coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
;
2738 translation_table
= Qnil
;
2739 if (coding
->src_multibyte
2740 && *(src_end
- 1) == LEADING_CODE_8_BIT_CONTROL
)
2744 coding
->result
= CODING_FINISH_INSUFFICIENT_SRC
;
2747 if (coding
->eol_type
== CODING_EOL_CRLF
)
2749 while (src
< src_end
)
2755 else if (c
== '\n' || (c
== '\r' && selective_display
))
2756 EMIT_TWO_BYTES ('\r', '\n');
2766 if (src_bytes
<= dst_bytes
)
2768 safe_bcopy (src
, dst
, src_bytes
);
2774 if (coding
->src_multibyte
2775 && *(src
+ dst_bytes
- 1) == LEADING_CODE_8_BIT_CONTROL
)
2777 safe_bcopy (src
, dst
, dst_bytes
);
2778 src_base
= src
+ dst_bytes
;
2779 dst
= destination
+ dst_bytes
;
2780 coding
->result
= CODING_FINISH_INSUFFICIENT_DST
;
2782 if (coding
->eol_type
== CODING_EOL_CR
)
2784 for (src
= destination
; src
< dst
; src
++)
2785 if (*src
== '\n') *src
= '\r';
2787 else if (selective_display
)
2789 for (src
= destination
; src
< dst
; src
++)
2790 if (*src
== '\r') *src
= '\n';
2793 if (coding
->src_multibyte
)
2794 dst
= destination
+ str_as_unibyte (destination
, dst
- destination
);
2796 coding
->consumed
= src_base
- source
;
2797 coding
->produced
= dst
- destination
;
2801 /*** 7. C library functions ***/
2803 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2804 has a property `coding-system'. The value of this property is a
2805 vector of length 5 (called a coding-vector). Among elements of
2806 this vector, the first (element[0]) and the fifth (element[4])
2807 carry important information for decoding/encoding. Before
2808 decoding/encoding, this information should be set in fields of a
2809 structure of type `coding_system'.
2811 A value of property `coding-system' can be a symbol of another
2812 subsidiary coding-system. In that case, Emacs gets coding-vector
2815 `element[0]' contains information to be set in `coding->type'. The
2816 values and their meanings are as follows:
2818 0 -- coding_type_emacs_mule
2819 1 -- coding_type_sjis
2820 2 -- coding_type_iso2022
2821 3 -- coding_type_big5
2822 4 -- coding_type_ccl encoder/decoder written in CCL
2823 nil -- coding_type_no_conversion
2824 t -- coding_type_undecided (automatic conversion on decoding,
2825 no-conversion on encoding)
2827 `element[4]' contains information to be set in `coding->flags' and
2828 `coding->spec'. The meaning varies by `coding->type'.
2830 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2831 of length 32 (of which the first 13 sub-elements are used now).
2832 Meanings of these sub-elements are:
2834 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2835 If the value is an integer of valid charset, the charset is
2836 assumed to be designated to graphic register N initially.
2838 If the value is negative, it is the negation of a charset which
2839 reserves graphic register N, which means that the charset is
2840 not designated initially but should be designated to graphic
2841 register N just before encoding a character in that charset.
2843 If the value is nil, graphic register N is never used on
2846 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2847 Each value takes t or nil. See the section ISO2022 of
2848 `coding.h' for more information.
2850 If `coding->type' is `coding_type_big5', element[4] is t to denote
2851 BIG5-ETen or nil to denote BIG5-HKU.
2853 If `coding->type' takes the other value, element[4] is ignored.
2855 Emacs Lisp's coding system also carries information about format of
2856 end-of-line in a value of property `eol-type'. If the value is
2857 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2858 means CODING_EOL_CR. If it is not integer, it should be a vector
2859 of subsidiary coding systems of which property `eol-type' has one
2864 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2865 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2866 is setup so that no conversion is necessary and return -1, else
2870 setup_coding_system (coding_system
, coding
)
2871 Lisp_Object coding_system
;
2872 struct coding_system
*coding
;
2874 Lisp_Object coding_spec
, coding_type
, eol_type
, plist
;
2878 /* Initialize some fields required for all kinds of coding systems. */
2879 coding
->symbol
= coding_system
;
2880 coding
->common_flags
= 0;
2882 coding
->heading_ascii
= -1;
2883 coding
->post_read_conversion
= coding
->pre_write_conversion
= Qnil
;
2884 coding
->composing
= COMPOSITION_DISABLED
;
2885 coding
->cmp_data
= NULL
;
2887 if (NILP (coding_system
))
2888 goto label_invalid_coding_system
;
2890 coding_spec
= Fget (coding_system
, Qcoding_system
);
2892 if (!VECTORP (coding_spec
)
2893 || XVECTOR (coding_spec
)->size
!= 5
2894 || !CONSP (XVECTOR (coding_spec
)->contents
[3]))
2895 goto label_invalid_coding_system
;
2897 eol_type
= inhibit_eol_conversion
? Qnil
: Fget (coding_system
, Qeol_type
);
2898 if (VECTORP (eol_type
))
2900 coding
->eol_type
= CODING_EOL_UNDECIDED
;
2901 coding
->common_flags
= CODING_REQUIRE_DETECTION_MASK
;
2903 else if (XFASTINT (eol_type
) == 1)
2905 coding
->eol_type
= CODING_EOL_CRLF
;
2906 coding
->common_flags
2907 = CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
2909 else if (XFASTINT (eol_type
) == 2)
2911 coding
->eol_type
= CODING_EOL_CR
;
2912 coding
->common_flags
2913 = CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
2916 coding
->eol_type
= CODING_EOL_LF
;
2918 coding_type
= XVECTOR (coding_spec
)->contents
[0];
2919 /* Try short cut. */
2920 if (SYMBOLP (coding_type
))
2922 if (EQ (coding_type
, Qt
))
2924 coding
->type
= coding_type_undecided
;
2925 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
2928 coding
->type
= coding_type_no_conversion
;
2932 /* Get values of coding system properties:
2933 `post-read-conversion', `pre-write-conversion',
2934 `translation-table-for-decode', `translation-table-for-encode'. */
2935 plist
= XVECTOR (coding_spec
)->contents
[3];
2936 /* Pre & post conversion functions should be disabled if
2937 inhibit_pre_post_conversion is nonzero. This is the case when a code
2938 conversion function is called while those functions are running. */
2939 if (! inhibit_pre_post_conversion
)
2941 coding
->post_read_conversion
= Fplist_get (plist
, Qpost_read_conversion
);
2942 coding
->pre_write_conversion
= Fplist_get (plist
, Qpre_write_conversion
);
2944 val
= Fplist_get (plist
, Qtranslation_table_for_decode
);
2946 val
= Fget (val
, Qtranslation_table_for_decode
);
2947 coding
->translation_table_for_decode
= CHAR_TABLE_P (val
) ? val
: Qnil
;
2948 val
= Fplist_get (plist
, Qtranslation_table_for_encode
);
2950 val
= Fget (val
, Qtranslation_table_for_encode
);
2951 coding
->translation_table_for_encode
= CHAR_TABLE_P (val
) ? val
: Qnil
;
2952 val
= Fplist_get (plist
, Qcoding_category
);
2955 val
= Fget (val
, Qcoding_category_index
);
2957 coding
->category_idx
= XINT (val
);
2959 goto label_invalid_coding_system
;
2962 goto label_invalid_coding_system
;
2964 val
= Fplist_get (plist
, Qsafe_charsets
);
2967 for (i
= 0; i
<= MAX_CHARSET
; i
++)
2968 coding
->safe_charsets
[i
] = 1;
2972 bzero (coding
->safe_charsets
, MAX_CHARSET
+ 1);
2975 if ((i
= get_charset_id (XCAR (val
))) >= 0)
2976 coding
->safe_charsets
[i
] = 1;
2981 /* If the coding system has non-nil `composition' property, enable
2982 composition handling. */
2983 val
= Fplist_get (plist
, Qcomposition
);
2985 coding
->composing
= COMPOSITION_NO
;
2987 switch (XFASTINT (coding_type
))
2990 coding
->type
= coding_type_emacs_mule
;
2991 if (!NILP (coding
->post_read_conversion
))
2992 coding
->common_flags
|= CODING_REQUIRE_DECODING_MASK
;
2993 if (!NILP (coding
->pre_write_conversion
))
2994 coding
->common_flags
|= CODING_REQUIRE_ENCODING_MASK
;
2998 coding
->type
= coding_type_sjis
;
2999 coding
->common_flags
3000 |= CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
3004 coding
->type
= coding_type_iso2022
;
3005 coding
->common_flags
3006 |= CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
3008 Lisp_Object val
, temp
;
3010 int i
, charset
, reg_bits
= 0;
3012 val
= XVECTOR (coding_spec
)->contents
[4];
3014 if (!VECTORP (val
) || XVECTOR (val
)->size
!= 32)
3015 goto label_invalid_coding_system
;
3017 flags
= XVECTOR (val
)->contents
;
3019 = ((NILP (flags
[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM
)
3020 | (NILP (flags
[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL
)
3021 | (NILP (flags
[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL
)
3022 | (NILP (flags
[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS
)
3023 | (NILP (flags
[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT
)
3024 | (NILP (flags
[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT
)
3025 | (NILP (flags
[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN
)
3026 | (NILP (flags
[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS
)
3027 | (NILP (flags
[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION
)
3028 | (NILP (flags
[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL
)
3029 | (NILP (flags
[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL
)
3030 | (NILP (flags
[15]) ? 0 : CODING_FLAG_ISO_SAFE
)
3031 | (NILP (flags
[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA
)
3034 /* Invoke graphic register 0 to plane 0. */
3035 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 0;
3036 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
3037 CODING_SPEC_ISO_INVOCATION (coding
, 1)
3038 = (coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
? -1 : 1);
3039 /* Not single shifting at first. */
3040 CODING_SPEC_ISO_SINGLE_SHIFTING (coding
) = 0;
3041 /* Beginning of buffer should also be regarded as bol. */
3042 CODING_SPEC_ISO_BOL (coding
) = 1;
3044 for (charset
= 0; charset
<= MAX_CHARSET
; charset
++)
3045 CODING_SPEC_ISO_REVISION_NUMBER (coding
, charset
) = 255;
3046 val
= Vcharset_revision_alist
;
3049 charset
= get_charset_id (Fcar_safe (XCAR (val
)));
3051 && (temp
= Fcdr_safe (XCAR (val
)), INTEGERP (temp
))
3052 && (i
= XINT (temp
), (i
>= 0 && (i
+ '@') < 128)))
3053 CODING_SPEC_ISO_REVISION_NUMBER (coding
, charset
) = i
;
3057 /* Check FLAGS[REG] (REG = 0, 1, 2, 3) and decide designations.
3058 FLAGS[REG] can be one of the following:
3059 integer CHARSET: CHARSET occupies register REG,
3060 t: designate nothing to REG initially, but can be used
3062 list of integer, nil, or t: designate the first
3063 element (if integer) to REG initially, the remaining
3064 elements (if integer) is designated to REG on request,
3065 if an element is t, REG can be used by any charsets,
3066 nil: REG is never used. */
3067 for (charset
= 0; charset
<= MAX_CHARSET
; charset
++)
3068 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
3069 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
;
3070 for (i
= 0; i
< 4; i
++)
3072 if (INTEGERP (flags
[i
])
3073 && (charset
= XINT (flags
[i
]), CHARSET_VALID_P (charset
))
3074 || (charset
= get_charset_id (flags
[i
])) >= 0)
3076 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = charset
;
3077 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
) = i
;
3079 else if (EQ (flags
[i
], Qt
))
3081 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
3083 coding
->flags
|= CODING_FLAG_ISO_DESIGNATION
;
3085 else if (CONSP (flags
[i
]))
3090 coding
->flags
|= CODING_FLAG_ISO_DESIGNATION
;
3091 if (INTEGERP (XCAR (tail
))
3092 && (charset
= XINT (XCAR (tail
)),
3093 CHARSET_VALID_P (charset
))
3094 || (charset
= get_charset_id (XCAR (tail
))) >= 0)
3096 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = charset
;
3097 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
) =i
;
3100 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
3102 while (CONSP (tail
))
3104 if (INTEGERP (XCAR (tail
))
3105 && (charset
= XINT (XCAR (tail
)),
3106 CHARSET_VALID_P (charset
))
3107 || (charset
= get_charset_id (XCAR (tail
))) >= 0)
3108 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
3110 else if (EQ (XCAR (tail
), Qt
))
3116 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
3118 CODING_SPEC_ISO_DESIGNATION (coding
, i
)
3119 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
);
3122 if (reg_bits
&& ! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
))
3124 /* REG 1 can be used only by locking shift in 7-bit env. */
3125 if (coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
)
3127 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
))
3128 /* Without any shifting, only REG 0 and 1 can be used. */
3133 for (charset
= 0; charset
<= MAX_CHARSET
; charset
++)
3135 if (CHARSET_VALID_P (charset
))
3137 /* There exist some default graphic registers to be
3140 /* We had better avoid designating a charset of
3141 CHARS96 to REG 0 as far as possible. */
3142 if (CHARSET_CHARS (charset
) == 96)
3143 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
3145 ? 1 : (reg_bits
& 4 ? 2 : (reg_bits
& 8 ? 3 : 0)));
3147 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
3149 ? 0 : (reg_bits
& 2 ? 1 : (reg_bits
& 4 ? 2 : 3)));
3153 coding
->common_flags
|= CODING_REQUIRE_FLUSHING_MASK
;
3154 coding
->spec
.iso2022
.last_invalid_designation_register
= -1;
3158 coding
->type
= coding_type_big5
;
3159 coding
->common_flags
3160 |= CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
3162 = (NILP (XVECTOR (coding_spec
)->contents
[4])
3163 ? CODING_FLAG_BIG5_HKU
3164 : CODING_FLAG_BIG5_ETEN
);
3168 coding
->type
= coding_type_ccl
;
3169 coding
->common_flags
3170 |= CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
3172 val
= XVECTOR (coding_spec
)->contents
[4];
3174 || setup_ccl_program (&(coding
->spec
.ccl
.decoder
),
3176 || setup_ccl_program (&(coding
->spec
.ccl
.encoder
),
3178 goto label_invalid_coding_system
;
3180 bzero (coding
->spec
.ccl
.valid_codes
, 256);
3181 val
= Fplist_get (plist
, Qvalid_codes
);
3186 for (; CONSP (val
); val
= XCDR (val
))
3190 && XINT (this) >= 0 && XINT (this) < 256)
3191 coding
->spec
.ccl
.valid_codes
[XINT (this)] = 1;
3192 else if (CONSP (this)
3193 && INTEGERP (XCAR (this))
3194 && INTEGERP (XCDR (this)))
3196 int start
= XINT (XCAR (this));
3197 int end
= XINT (XCDR (this));
3199 if (start
>= 0 && start
<= end
&& end
< 256)
3200 while (start
<= end
)
3201 coding
->spec
.ccl
.valid_codes
[start
++] = 1;
3206 coding
->common_flags
|= CODING_REQUIRE_FLUSHING_MASK
;
3210 coding
->type
= coding_type_raw_text
;
3214 goto label_invalid_coding_system
;
3218 label_invalid_coding_system
:
3219 coding
->type
= coding_type_no_conversion
;
3220 coding
->category_idx
= CODING_CATEGORY_IDX_BINARY
;
3221 coding
->common_flags
= 0;
3222 coding
->eol_type
= CODING_EOL_LF
;
3223 coding
->pre_write_conversion
= coding
->post_read_conversion
= Qnil
;
3227 /* Free memory blocks allocated for storing composition information. */
3230 coding_free_composition_data (coding
)
3231 struct coding_system
*coding
;
3233 struct composition_data
*cmp_data
= coding
->cmp_data
, *next
;
3237 /* Memory blocks are chained. At first, rewind to the first, then,
3238 free blocks one by one. */
3239 while (cmp_data
->prev
)
3240 cmp_data
= cmp_data
->prev
;
3243 next
= cmp_data
->next
;
3247 coding
->cmp_data
= NULL
;
3250 /* Set `char_offset' member of all memory blocks pointed by
3251 coding->cmp_data to POS. */
3254 coding_adjust_composition_offset (coding
, pos
)
3255 struct coding_system
*coding
;
3258 struct composition_data
*cmp_data
;
3260 for (cmp_data
= coding
->cmp_data
; cmp_data
; cmp_data
= cmp_data
->next
)
3261 cmp_data
->char_offset
= pos
;
3264 /* Setup raw-text or one of its subsidiaries in the structure
3265 coding_system CODING according to the already setup value eol_type
3266 in CODING. CODING should be setup for some coding system in
3270 setup_raw_text_coding_system (coding
)
3271 struct coding_system
*coding
;
3273 if (coding
->type
!= coding_type_raw_text
)
3275 coding
->symbol
= Qraw_text
;
3276 coding
->type
= coding_type_raw_text
;
3277 if (coding
->eol_type
!= CODING_EOL_UNDECIDED
)
3279 Lisp_Object subsidiaries
;
3280 subsidiaries
= Fget (Qraw_text
, Qeol_type
);
3282 if (VECTORP (subsidiaries
)
3283 && XVECTOR (subsidiaries
)->size
== 3)
3285 = XVECTOR (subsidiaries
)->contents
[coding
->eol_type
];
3287 setup_coding_system (coding
->symbol
, coding
);
3292 /* Emacs has a mechanism to automatically detect a coding system if it
3293 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
3294 it's impossible to distinguish some coding systems accurately
3295 because they use the same range of codes. So, at first, coding
3296 systems are categorized into the following categories:
3298 o coding-category-emacs-mule
3300 The category for a coding system which has the same code range
3301 as Emacs' internal format. Assigned the coding-system (Lisp
3302 symbol) `emacs-mule' by default.
3304 o coding-category-sjis
3306 The category for a coding system which has the same code range
3307 as SJIS. Assigned the coding-system (Lisp
3308 symbol) `japanese-shift-jis' by default.
3310 o coding-category-iso-7
3312 The category for a coding system which has the same code range
3313 as ISO2022 of 7-bit environment. This doesn't use any locking
3314 shift and single shift functions. This can encode/decode all
3315 charsets. Assigned the coding-system (Lisp symbol)
3316 `iso-2022-7bit' by default.
3318 o coding-category-iso-7-tight
3320 Same as coding-category-iso-7 except that this can
3321 encode/decode only the specified charsets.
3323 o coding-category-iso-8-1
3325 The category for a coding system which has the same code range
3326 as ISO2022 of 8-bit environment and graphic plane 1 used only
3327 for DIMENSION1 charset. This doesn't use any locking shift
3328 and single shift functions. Assigned the coding-system (Lisp
3329 symbol) `iso-latin-1' by default.
3331 o coding-category-iso-8-2
3333 The category for a coding system which has the same code range
3334 as ISO2022 of 8-bit environment and graphic plane 1 used only
3335 for DIMENSION2 charset. This doesn't use any locking shift
3336 and single shift functions. Assigned the coding-system (Lisp
3337 symbol) `japanese-iso-8bit' by default.
3339 o coding-category-iso-7-else
3341 The category for a coding system which has the same code range
3342 as ISO2022 of 7-bit environment but uses locking shift or
3343 single shift functions. Assigned the coding-system (Lisp
3344 symbol) `iso-2022-7bit-lock' by default.
3346 o coding-category-iso-8-else
3348 The category for a coding system which has the same code range
3349 as ISO2022 of 8-bit environment but uses locking shift or
3350 single shift functions. Assigned the coding-system (Lisp
3351 symbol) `iso-2022-8bit-ss2' by default.
3353 o coding-category-big5
3355 The category for a coding system which has the same code range
3356 as BIG5. Assigned the coding-system (Lisp symbol)
3357 `cn-big5' by default.
3359 o coding-category-utf-8
3361 The category for a coding system which has the same code range
3362 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
3363 symbol) `utf-8' by default.
3365 o coding-category-utf-16-be
3367 The category for a coding system in which a text has an
3368 Unicode signature (cf. Unicode Standard) in the order of BIG
3369 endian at the head. Assigned the coding-system (Lisp symbol)
3370 `utf-16-be' by default.
3372 o coding-category-utf-16-le
3374 The category for a coding system in which a text has an
3375 Unicode signature (cf. Unicode Standard) in the order of
3376 LITTLE endian at the head. Assigned the coding-system (Lisp
3377 symbol) `utf-16-le' by default.
3379 o coding-category-ccl
3381 The category for a coding system of which encoder/decoder is
3382 written in CCL programs. The default value is nil, i.e., no
3383 coding system is assigned.
3385 o coding-category-binary
3387 The category for a coding system not categorized in any of the
3388 above. Assigned the coding-system (Lisp symbol)
3389 `no-conversion' by default.
3391 Each of them is a Lisp symbol and the value is an actual
3392 `coding-system's (this is also a Lisp symbol) assigned by a user.
3393 What Emacs does actually is to detect a category of coding system.
3394 Then, it uses a `coding-system' assigned to it. If Emacs can't
3395 decide only one possible category, it selects a category of the
3396 highest priority. Priorities of categories are also specified by a
3397 user in a Lisp variable `coding-category-list'.
3402 int ascii_skip_code
[256];
3404 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3405 If it detects possible coding systems, return an integer in which
3406 appropriate flag bits are set. Flag bits are defined by macros
3407 CODING_CATEGORY_MASK_XXX in `coding.h'. If PRIORITIES is non-NULL,
3408 it should point the table `coding_priorities'. In that case, only
3409 the flag bit for a coding system of the highest priority is set in
3412 How many ASCII characters are at the head is returned as *SKIP. */
3415 detect_coding_mask (source
, src_bytes
, priorities
, skip
)
3416 unsigned char *source
;
3417 int src_bytes
, *priorities
, *skip
;
3419 register unsigned char c
;
3420 unsigned char *src
= source
, *src_end
= source
+ src_bytes
;
3421 unsigned int mask
, utf16_examined_p
, iso2022_examined_p
;
3424 /* At first, skip all ASCII characters and control characters except
3425 for three ISO2022 specific control characters. */
3426 ascii_skip_code
[ISO_CODE_SO
] = 0;
3427 ascii_skip_code
[ISO_CODE_SI
] = 0;
3428 ascii_skip_code
[ISO_CODE_ESC
] = 0;
3430 label_loop_detect_coding
:
3431 while (src
< src_end
&& ascii_skip_code
[*src
]) src
++;
3432 *skip
= src
- source
;
3435 /* We found nothing other than ASCII. There's nothing to do. */
3439 /* The text seems to be encoded in some multilingual coding system.
3440 Now, try to find in which coding system the text is encoded. */
3443 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3444 /* C is an ISO2022 specific control code of C0. */
3445 mask
= detect_coding_iso2022 (src
, src_end
);
3448 /* No valid ISO2022 code follows C. Try again. */
3450 if (c
== ISO_CODE_ESC
)
3451 ascii_skip_code
[ISO_CODE_ESC
] = 1;
3453 ascii_skip_code
[ISO_CODE_SO
] = ascii_skip_code
[ISO_CODE_SI
] = 1;
3454 goto label_loop_detect_coding
;
3458 for (i
= 0; i
< CODING_CATEGORY_IDX_MAX
; i
++)
3460 if (mask
& priorities
[i
])
3461 return priorities
[i
];
3463 return CODING_CATEGORY_MASK_RAW_TEXT
;
3472 /* C is the first byte of SJIS character code,
3473 or a leading-code of Emacs' internal format (emacs-mule),
3474 or the first byte of UTF-16. */
3475 try = (CODING_CATEGORY_MASK_SJIS
3476 | CODING_CATEGORY_MASK_EMACS_MULE
3477 | CODING_CATEGORY_MASK_UTF_16_BE
3478 | CODING_CATEGORY_MASK_UTF_16_LE
);
3480 /* Or, if C is a special latin extra code,
3481 or is an ISO2022 specific control code of C1 (SS2 or SS3),
3482 or is an ISO2022 control-sequence-introducer (CSI),
3483 we should also consider the possibility of ISO2022 codings. */
3484 if ((VECTORP (Vlatin_extra_code_table
)
3485 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
3486 || (c
== ISO_CODE_SS2
|| c
== ISO_CODE_SS3
)
3487 || (c
== ISO_CODE_CSI
3490 || ((*src
== '0' || *src
== '1' || *src
== '2')
3491 && src
+ 1 < src_end
3492 && src
[1] == ']')))))
3493 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3494 | CODING_CATEGORY_MASK_ISO_8BIT
);
3497 /* C is a character of ISO2022 in graphic plane right,
3498 or a SJIS's 1-byte character code (i.e. JISX0201),
3499 or the first byte of BIG5's 2-byte code,
3500 or the first byte of UTF-8/16. */
3501 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3502 | CODING_CATEGORY_MASK_ISO_8BIT
3503 | CODING_CATEGORY_MASK_SJIS
3504 | CODING_CATEGORY_MASK_BIG5
3505 | CODING_CATEGORY_MASK_UTF_8
3506 | CODING_CATEGORY_MASK_UTF_16_BE
3507 | CODING_CATEGORY_MASK_UTF_16_LE
);
3509 /* Or, we may have to consider the possibility of CCL. */
3510 if (coding_system_table
[CODING_CATEGORY_IDX_CCL
]
3511 && (coding_system_table
[CODING_CATEGORY_IDX_CCL
]
3512 ->spec
.ccl
.valid_codes
)[c
])
3513 try |= CODING_CATEGORY_MASK_CCL
;
3516 utf16_examined_p
= iso2022_examined_p
= 0;
3519 for (i
= 0; i
< CODING_CATEGORY_IDX_MAX
; i
++)
3521 if (!iso2022_examined_p
3522 && (priorities
[i
] & try & CODING_CATEGORY_MASK_ISO
))
3524 mask
|= detect_coding_iso2022 (src
, src_end
);
3525 iso2022_examined_p
= 1;
3527 else if (priorities
[i
] & try & CODING_CATEGORY_MASK_SJIS
)
3528 mask
|= detect_coding_sjis (src
, src_end
);
3529 else if (priorities
[i
] & try & CODING_CATEGORY_MASK_UTF_8
)
3530 mask
|= detect_coding_utf_8 (src
, src_end
);
3531 else if (!utf16_examined_p
3532 && (priorities
[i
] & try &
3533 CODING_CATEGORY_MASK_UTF_16_BE_LE
))
3535 mask
|= detect_coding_utf_16 (src
, src_end
);
3536 utf16_examined_p
= 1;
3538 else if (priorities
[i
] & try & CODING_CATEGORY_MASK_BIG5
)
3539 mask
|= detect_coding_big5 (src
, src_end
);
3540 else if (priorities
[i
] & try & CODING_CATEGORY_MASK_EMACS_MULE
)
3541 mask
|= detect_coding_emacs_mule (src
, src_end
);
3542 else if (priorities
[i
] & try & CODING_CATEGORY_MASK_CCL
)
3543 mask
|= detect_coding_ccl (src
, src_end
);
3544 else if (priorities
[i
] & CODING_CATEGORY_MASK_RAW_TEXT
)
3545 mask
|= CODING_CATEGORY_MASK_RAW_TEXT
;
3546 else if (priorities
[i
] & CODING_CATEGORY_MASK_BINARY
)
3547 mask
|= CODING_CATEGORY_MASK_BINARY
;
3548 if (mask
& priorities
[i
])
3549 return priorities
[i
];
3551 return CODING_CATEGORY_MASK_RAW_TEXT
;
3553 if (try & CODING_CATEGORY_MASK_ISO
)
3554 mask
|= detect_coding_iso2022 (src
, src_end
);
3555 if (try & CODING_CATEGORY_MASK_SJIS
)
3556 mask
|= detect_coding_sjis (src
, src_end
);
3557 if (try & CODING_CATEGORY_MASK_BIG5
)
3558 mask
|= detect_coding_big5 (src
, src_end
);
3559 if (try & CODING_CATEGORY_MASK_UTF_8
)
3560 mask
|= detect_coding_utf_8 (src
, src_end
);
3561 if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE
)
3562 mask
|= detect_coding_utf_16 (src
, src_end
);
3563 if (try & CODING_CATEGORY_MASK_EMACS_MULE
)
3564 mask
|= detect_coding_emacs_mule (src
, src_end
);
3565 if (try & CODING_CATEGORY_MASK_CCL
)
3566 mask
|= detect_coding_ccl (src
, src_end
);
3568 return (mask
| CODING_CATEGORY_MASK_RAW_TEXT
| CODING_CATEGORY_MASK_BINARY
);
3571 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3572 The information of the detected coding system is set in CODING. */
3575 detect_coding (coding
, src
, src_bytes
)
3576 struct coding_system
*coding
;
3584 val
= Vcoding_category_list
;
3585 mask
= detect_coding_mask (src
, src_bytes
, coding_priorities
, &skip
);
3586 coding
->heading_ascii
= skip
;
3590 /* We found a single coding system of the highest priority in MASK. */
3592 while (mask
&& ! (mask
& 1)) mask
>>= 1, idx
++;
3594 idx
= CODING_CATEGORY_IDX_RAW_TEXT
;
3596 val
= XSYMBOL (XVECTOR (Vcoding_category_table
)->contents
[idx
])->value
;
3598 if (coding
->eol_type
!= CODING_EOL_UNDECIDED
)
3602 tmp
= Fget (val
, Qeol_type
);
3604 val
= XVECTOR (tmp
)->contents
[coding
->eol_type
];
3607 /* Setup this new coding system while preserving some slots. */
3609 int src_multibyte
= coding
->src_multibyte
;
3610 int dst_multibyte
= coding
->dst_multibyte
;
3612 setup_coding_system (val
, coding
);
3613 coding
->src_multibyte
= src_multibyte
;
3614 coding
->dst_multibyte
= dst_multibyte
;
3615 coding
->heading_ascii
= skip
;
3619 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
3620 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3621 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3623 How many non-eol characters are at the head is returned as *SKIP. */
3625 #define MAX_EOL_CHECK_COUNT 3
3628 detect_eol_type (source
, src_bytes
, skip
)
3629 unsigned char *source
;
3630 int src_bytes
, *skip
;
3632 unsigned char *src
= source
, *src_end
= src
+ src_bytes
;
3634 int total
= 0; /* How many end-of-lines are found so far. */
3635 int eol_type
= CODING_EOL_UNDECIDED
;
3640 while (src
< src_end
&& total
< MAX_EOL_CHECK_COUNT
)
3643 if (c
== '\n' || c
== '\r')
3646 *skip
= src
- 1 - source
;
3649 this_eol_type
= CODING_EOL_LF
;
3650 else if (src
>= src_end
|| *src
!= '\n')
3651 this_eol_type
= CODING_EOL_CR
;
3653 this_eol_type
= CODING_EOL_CRLF
, src
++;
3655 if (eol_type
== CODING_EOL_UNDECIDED
)
3656 /* This is the first end-of-line. */
3657 eol_type
= this_eol_type
;
3658 else if (eol_type
!= this_eol_type
)
3660 /* The found type is different from what found before. */
3661 eol_type
= CODING_EOL_INCONSISTENT
;
3668 *skip
= src_end
- source
;
3672 /* Like detect_eol_type, but detect EOL type in 2-octet
3673 big-endian/little-endian format for coding systems utf-16-be and
3677 detect_eol_type_in_2_octet_form (source
, src_bytes
, skip
, big_endian_p
)
3678 unsigned char *source
;
3679 int src_bytes
, *skip
;
3681 unsigned char *src
= source
, *src_end
= src
+ src_bytes
;
3682 unsigned int c1
, c2
;
3683 int total
= 0; /* How many end-of-lines are found so far. */
3684 int eol_type
= CODING_EOL_UNDECIDED
;
3695 while ((src
+ 1) < src_end
&& total
< MAX_EOL_CHECK_COUNT
)
3697 c1
= (src
[msb
] << 8) | (src
[lsb
]);
3700 if (c1
== '\n' || c1
== '\r')
3703 *skip
= src
- 2 - source
;
3707 this_eol_type
= CODING_EOL_LF
;
3711 if ((src
+ 1) >= src_end
)
3713 this_eol_type
= CODING_EOL_CR
;
3717 c2
= (src
[msb
] << 8) | (src
[lsb
]);
3719 this_eol_type
= CODING_EOL_CRLF
, src
+= 2;
3721 this_eol_type
= CODING_EOL_CR
;
3725 if (eol_type
== CODING_EOL_UNDECIDED
)
3726 /* This is the first end-of-line. */
3727 eol_type
= this_eol_type
;
3728 else if (eol_type
!= this_eol_type
)
3730 /* The found type is different from what found before. */
3731 eol_type
= CODING_EOL_INCONSISTENT
;
3738 *skip
= src_end
- source
;
3742 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3743 is encoded. If it detects an appropriate format of end-of-line, it
3744 sets the information in *CODING. */
3747 detect_eol (coding
, src
, src_bytes
)
3748 struct coding_system
*coding
;
3756 switch (coding
->category_idx
)
3758 case CODING_CATEGORY_IDX_UTF_16_BE
:
3759 eol_type
= detect_eol_type_in_2_octet_form (src
, src_bytes
, &skip
, 1);
3761 case CODING_CATEGORY_IDX_UTF_16_LE
:
3762 eol_type
= detect_eol_type_in_2_octet_form (src
, src_bytes
, &skip
, 0);
3765 eol_type
= detect_eol_type (src
, src_bytes
, &skip
);
3769 if (coding
->heading_ascii
> skip
)
3770 coding
->heading_ascii
= skip
;
3772 skip
= coding
->heading_ascii
;
3774 if (eol_type
== CODING_EOL_UNDECIDED
)
3776 if (eol_type
== CODING_EOL_INCONSISTENT
)
3779 /* This code is suppressed until we find a better way to
3780 distinguish raw text file and binary file. */
3782 /* If we have already detected that the coding is raw-text, the
3783 coding should actually be no-conversion. */
3784 if (coding
->type
== coding_type_raw_text
)
3786 setup_coding_system (Qno_conversion
, coding
);
3789 /* Else, let's decode only text code anyway. */
3791 eol_type
= CODING_EOL_LF
;
3794 val
= Fget (coding
->symbol
, Qeol_type
);
3795 if (VECTORP (val
) && XVECTOR (val
)->size
== 3)
3797 int src_multibyte
= coding
->src_multibyte
;
3798 int dst_multibyte
= coding
->dst_multibyte
;
3800 setup_coding_system (XVECTOR (val
)->contents
[eol_type
], coding
);
3801 coding
->src_multibyte
= src_multibyte
;
3802 coding
->dst_multibyte
= dst_multibyte
;
3803 coding
->heading_ascii
= skip
;
3807 #define CONVERSION_BUFFER_EXTRA_ROOM 256
3809 #define DECODING_BUFFER_MAG(coding) \
3810 (coding->type == coding_type_iso2022 \
3812 : (coding->type == coding_type_ccl \
3813 ? coding->spec.ccl.decoder.buf_magnification \
3816 /* Return maximum size (bytes) of a buffer enough for decoding
3817 SRC_BYTES of text encoded in CODING. */
3820 decoding_buffer_size (coding
, src_bytes
)
3821 struct coding_system
*coding
;
3824 return (src_bytes
* DECODING_BUFFER_MAG (coding
)
3825 + CONVERSION_BUFFER_EXTRA_ROOM
);
3828 /* Return maximum size (bytes) of a buffer enough for encoding
3829 SRC_BYTES of text to CODING. */
3832 encoding_buffer_size (coding
, src_bytes
)
3833 struct coding_system
*coding
;
3838 if (coding
->type
== coding_type_ccl
)
3839 magnification
= coding
->spec
.ccl
.encoder
.buf_magnification
;
3840 else if (CODING_REQUIRE_ENCODING (coding
))
3845 return (src_bytes
* magnification
+ CONVERSION_BUFFER_EXTRA_ROOM
);
3848 #ifndef MINIMUM_CONVERSION_BUFFER_SIZE
3849 #define MINIMUM_CONVERSION_BUFFER_SIZE 1024
3852 char *conversion_buffer
;
3853 int conversion_buffer_size
;
3855 /* Return a pointer to a SIZE bytes of buffer to be used for encoding
3856 or decoding. Sufficient memory is allocated automatically. If we
3857 run out of memory, return NULL. */
3860 get_conversion_buffer (size
)
3863 if (size
> conversion_buffer_size
)
3866 int real_size
= conversion_buffer_size
* 2;
3868 while (real_size
< size
) real_size
*= 2;
3869 buf
= (char *) xmalloc (real_size
);
3870 xfree (conversion_buffer
);
3871 conversion_buffer
= buf
;
3872 conversion_buffer_size
= real_size
;
3874 return conversion_buffer
;
3878 ccl_coding_driver (coding
, source
, destination
, src_bytes
, dst_bytes
, encodep
)
3879 struct coding_system
*coding
;
3880 unsigned char *source
, *destination
;
3881 int src_bytes
, dst_bytes
, encodep
;
3883 struct ccl_program
*ccl
3884 = encodep
? &coding
->spec
.ccl
.encoder
: &coding
->spec
.ccl
.decoder
;
3887 ccl
->last_block
= coding
->mode
& CODING_MODE_LAST_BLOCK
;
3889 coding
->produced
= ccl_driver (ccl
, source
, destination
,
3890 src_bytes
, dst_bytes
, &(coding
->consumed
));
3892 coding
->produced_char
= coding
->produced
;
3896 = dst_bytes
? dst_bytes
: source
+ coding
->consumed
- destination
;
3897 coding
->produced
= str_as_multibyte (destination
, bytes
,
3899 &(coding
->produced_char
));
3902 switch (ccl
->status
)
3904 case CCL_STAT_SUSPEND_BY_SRC
:
3905 result
= CODING_FINISH_INSUFFICIENT_SRC
;
3907 case CCL_STAT_SUSPEND_BY_DST
:
3908 result
= CODING_FINISH_INSUFFICIENT_DST
;
3911 case CCL_STAT_INVALID_CMD
:
3912 result
= CODING_FINISH_INTERRUPT
;
3915 result
= CODING_FINISH_NORMAL
;
3921 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
3922 decoding, it may detect coding system and format of end-of-line if
3923 those are not yet decided. The source should be unibyte, the
3924 result is multibyte if CODING->dst_multibyte is nonzero, else
3928 decode_coding (coding
, source
, destination
, src_bytes
, dst_bytes
)
3929 struct coding_system
*coding
;
3930 unsigned char *source
, *destination
;
3931 int src_bytes
, dst_bytes
;
3933 if (coding
->type
== coding_type_undecided
)
3934 detect_coding (coding
, source
, src_bytes
);
3936 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
3937 detect_eol (coding
, source
, src_bytes
);
3939 coding
->produced
= coding
->produced_char
= 0;
3940 coding
->consumed
= coding
->consumed_char
= 0;
3942 coding
->result
= CODING_FINISH_NORMAL
;
3944 switch (coding
->type
)
3946 case coding_type_sjis
:
3947 decode_coding_sjis_big5 (coding
, source
, destination
,
3948 src_bytes
, dst_bytes
, 1);
3951 case coding_type_iso2022
:
3952 decode_coding_iso2022 (coding
, source
, destination
,
3953 src_bytes
, dst_bytes
);
3956 case coding_type_big5
:
3957 decode_coding_sjis_big5 (coding
, source
, destination
,
3958 src_bytes
, dst_bytes
, 0);
3961 case coding_type_emacs_mule
:
3962 decode_coding_emacs_mule (coding
, source
, destination
,
3963 src_bytes
, dst_bytes
);
3966 case coding_type_ccl
:
3967 ccl_coding_driver (coding
, source
, destination
,
3968 src_bytes
, dst_bytes
, 0);
3972 decode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
);
3975 if (coding
->result
== CODING_FINISH_INSUFFICIENT_SRC
3976 && coding
->consumed
== src_bytes
)
3977 coding
->result
= CODING_FINISH_NORMAL
;
3979 if (coding
->mode
& CODING_MODE_LAST_BLOCK
3980 && coding
->result
== CODING_FINISH_INSUFFICIENT_SRC
)
3982 unsigned char *src
= source
+ coding
->consumed
;
3983 unsigned char *dst
= destination
+ coding
->produced
;
3985 src_bytes
-= coding
->consumed
;
3987 if (COMPOSING_P (coding
))
3988 DECODE_COMPOSITION_END ('1');
3992 dst
+= CHAR_STRING (c
, dst
);
3993 coding
->produced_char
++;
3995 coding
->consumed
= coding
->consumed_char
= src
- source
;
3996 coding
->produced
= dst
- destination
;
3999 if (!coding
->dst_multibyte
)
4001 coding
->produced
= str_as_unibyte (destination
, coding
->produced
);
4002 coding
->produced_char
= coding
->produced
;
4005 return coding
->result
;
4008 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". The
4009 multibyteness of the source is CODING->src_multibyte, the
4010 multibyteness of the result is always unibyte. */
4013 encode_coding (coding
, source
, destination
, src_bytes
, dst_bytes
)
4014 struct coding_system
*coding
;
4015 unsigned char *source
, *destination
;
4016 int src_bytes
, dst_bytes
;
4018 coding
->produced
= coding
->produced_char
= 0;
4019 coding
->consumed
= coding
->consumed_char
= 0;
4021 coding
->result
= CODING_FINISH_NORMAL
;
4023 switch (coding
->type
)
4025 case coding_type_sjis
:
4026 encode_coding_sjis_big5 (coding
, source
, destination
,
4027 src_bytes
, dst_bytes
, 1);
4030 case coding_type_iso2022
:
4031 encode_coding_iso2022 (coding
, source
, destination
,
4032 src_bytes
, dst_bytes
);
4035 case coding_type_big5
:
4036 encode_coding_sjis_big5 (coding
, source
, destination
,
4037 src_bytes
, dst_bytes
, 0);
4040 case coding_type_emacs_mule
:
4041 encode_coding_emacs_mule (coding
, source
, destination
,
4042 src_bytes
, dst_bytes
);
4045 case coding_type_ccl
:
4046 ccl_coding_driver (coding
, source
, destination
,
4047 src_bytes
, dst_bytes
, 1);
4051 encode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
);
4054 if (coding
->result
== CODING_FINISH_INSUFFICIENT_SRC
4055 && coding
->consumed
== src_bytes
)
4056 coding
->result
= CODING_FINISH_NORMAL
;
4058 if (coding
->mode
& CODING_MODE_LAST_BLOCK
)
4060 unsigned char *src
= source
+ coding
->consumed
;
4061 unsigned char *src_end
= src
+ src_bytes
;
4062 unsigned char *dst
= destination
+ coding
->produced
;
4064 if (coding
->type
== coding_type_iso2022
)
4065 ENCODE_RESET_PLANE_AND_REGISTER
;
4066 if (COMPOSING_P (coding
))
4067 *dst
++ = ISO_CODE_ESC
, *dst
++ = '1';
4068 if (coding
->consumed
< src_bytes
)
4070 int len
= src_bytes
- coding
->consumed
;
4072 BCOPY_SHORT (source
+ coding
->consumed
, dst
, len
);
4073 if (coding
->src_multibyte
)
4074 len
= str_as_unibyte (dst
, len
);
4076 coding
->consumed
= src_bytes
;
4078 coding
->produced
= coding
->produced_char
= dst
- destination
;
4081 return coding
->result
;
4084 /* Scan text in the region between *BEG and *END (byte positions),
4085 skip characters which we don't have to decode by coding system
4086 CODING at the head and tail, then set *BEG and *END to the region
4087 of the text we actually have to convert. The caller should move
4088 the gap out of the region in advance if the region is from a
4091 If STR is not NULL, *BEG and *END are indices into STR. */
4094 shrink_decoding_region (beg
, end
, coding
, str
)
4096 struct coding_system
*coding
;
4099 unsigned char *begp_orig
, *begp
, *endp_orig
, *endp
, c
;
4101 Lisp_Object translation_table
;
4103 if (coding
->type
== coding_type_ccl
4104 || coding
->type
== coding_type_undecided
4105 || coding
->eol_type
!= CODING_EOL_LF
4106 || !NILP (coding
->post_read_conversion
)
4107 || coding
->composing
!= COMPOSITION_DISABLED
)
4109 /* We can't skip any data. */
4112 if (coding
->type
== coding_type_no_conversion
4113 || coding
->type
== coding_type_raw_text
4114 || coding
->type
== coding_type_emacs_mule
)
4116 /* We need no conversion, but don't have to skip any data here.
4117 Decoding routine handles them effectively anyway. */
4121 translation_table
= coding
->translation_table_for_decode
;
4122 if (NILP (translation_table
) && !NILP (Venable_character_translation
))
4123 translation_table
= Vstandard_translation_table_for_decode
;
4124 if (CHAR_TABLE_P (translation_table
))
4127 for (i
= 0; i
< 128; i
++)
4128 if (!NILP (CHAR_TABLE_REF (translation_table
, i
)))
4131 /* Some ASCII character should be tranlsated. We give up
4136 if (coding
->heading_ascii
>= 0)
4137 /* Detection routine has already found how much we can skip at the
4139 *beg
+= coding
->heading_ascii
;
4143 begp_orig
= begp
= str
+ *beg
;
4144 endp_orig
= endp
= str
+ *end
;
4148 begp_orig
= begp
= BYTE_POS_ADDR (*beg
);
4149 endp_orig
= endp
= begp
+ *end
- *beg
;
4152 switch (coding
->type
)
4154 case coding_type_sjis
:
4155 case coding_type_big5
:
4156 /* We can skip all ASCII characters at the head. */
4157 if (coding
->heading_ascii
< 0)
4160 while (begp
< endp
&& *begp
< 0x80 && *begp
!= '\r') begp
++;
4162 while (begp
< endp
&& *begp
< 0x80) begp
++;
4164 /* We can skip all ASCII characters at the tail except for the
4165 second byte of SJIS or BIG5 code. */
4167 while (begp
< endp
&& endp
[-1] < 0x80 && endp
[-1] != '\r') endp
--;
4169 while (begp
< endp
&& endp
[-1] < 0x80) endp
--;
4170 /* Do not consider LF as ascii if preceded by CR, since that
4171 confuses eol decoding. */
4172 if (begp
< endp
&& endp
< endp_orig
&& endp
[-1] == '\r' && endp
[0] == '\n')
4174 if (begp
< endp
&& endp
< endp_orig
&& endp
[-1] >= 0x80)
4178 case coding_type_iso2022
:
4179 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, 0) != CHARSET_ASCII
)
4180 /* We can't skip any data. */
4182 if (coding
->heading_ascii
< 0)
4184 /* We can skip all ASCII characters at the head except for a
4185 few control codes. */
4186 while (begp
< endp
&& (c
= *begp
) < 0x80
4187 && c
!= ISO_CODE_CR
&& c
!= ISO_CODE_SO
4188 && c
!= ISO_CODE_SI
&& c
!= ISO_CODE_ESC
4189 && (!eol_conversion
|| c
!= ISO_CODE_LF
))
4192 switch (coding
->category_idx
)
4194 case CODING_CATEGORY_IDX_ISO_8_1
:
4195 case CODING_CATEGORY_IDX_ISO_8_2
:
4196 /* We can skip all ASCII characters at the tail. */
4198 while (begp
< endp
&& (c
= endp
[-1]) < 0x80 && c
!= '\r') endp
--;
4200 while (begp
< endp
&& endp
[-1] < 0x80) endp
--;
4201 /* Do not consider LF as ascii if preceded by CR, since that
4202 confuses eol decoding. */
4203 if (begp
< endp
&& endp
< endp_orig
&& endp
[-1] == '\r' && endp
[0] == '\n')
4207 case CODING_CATEGORY_IDX_ISO_7
:
4208 case CODING_CATEGORY_IDX_ISO_7_TIGHT
:
4210 /* We can skip all charactes at the tail except for 8-bit
4211 codes and ESC and the following 2-byte at the tail. */
4212 unsigned char *eight_bit
= NULL
;
4216 && (c
= endp
[-1]) != ISO_CODE_ESC
&& c
!= '\r')
4218 if (!eight_bit
&& c
& 0x80) eight_bit
= endp
;
4223 && (c
= endp
[-1]) != ISO_CODE_ESC
)
4225 if (!eight_bit
&& c
& 0x80) eight_bit
= endp
;
4228 /* Do not consider LF as ascii if preceded by CR, since that
4229 confuses eol decoding. */
4230 if (begp
< endp
&& endp
< endp_orig
4231 && endp
[-1] == '\r' && endp
[0] == '\n')
4233 if (begp
< endp
&& endp
[-1] == ISO_CODE_ESC
)
4235 if (endp
+ 1 < endp_orig
&& end
[0] == '(' && end
[1] == 'B')
4236 /* This is an ASCII designation sequence. We can
4237 surely skip the tail. But, if we have
4238 encountered an 8-bit code, skip only the codes
4240 endp
= eight_bit
? eight_bit
: endp
+ 2;
4242 /* Hmmm, we can't skip the tail. */
4254 *beg
+= begp
- begp_orig
;
4255 *end
+= endp
- endp_orig
;
4259 /* Like shrink_decoding_region but for encoding. */
4262 shrink_encoding_region (beg
, end
, coding
, str
)
4264 struct coding_system
*coding
;
4267 unsigned char *begp_orig
, *begp
, *endp_orig
, *endp
;
4269 Lisp_Object translation_table
;
4271 if (coding
->type
== coding_type_ccl
4272 || coding
->eol_type
== CODING_EOL_CRLF
4273 || coding
->eol_type
== CODING_EOL_CR
4274 || coding
->cmp_data
&& coding
->cmp_data
->used
> 0)
4276 /* We can't skip any data. */
4279 if (coding
->type
== coding_type_no_conversion
4280 || coding
->type
== coding_type_raw_text
4281 || coding
->type
== coding_type_emacs_mule
4282 || coding
->type
== coding_type_undecided
)
4284 /* We need no conversion, but don't have to skip any data here.
4285 Encoding routine handles them effectively anyway. */
4289 translation_table
= coding
->translation_table_for_encode
;
4290 if (NILP (translation_table
) && !NILP (Venable_character_translation
))
4291 translation_table
= Vstandard_translation_table_for_encode
;
4292 if (CHAR_TABLE_P (translation_table
))
4295 for (i
= 0; i
< 128; i
++)
4296 if (!NILP (CHAR_TABLE_REF (translation_table
, i
)))
4299 /* Some ASCII character should be tranlsated. We give up
4306 begp_orig
= begp
= str
+ *beg
;
4307 endp_orig
= endp
= str
+ *end
;
4311 begp_orig
= begp
= BYTE_POS_ADDR (*beg
);
4312 endp_orig
= endp
= begp
+ *end
- *beg
;
4315 eol_conversion
= (coding
->eol_type
== CODING_EOL_CR
4316 || coding
->eol_type
== CODING_EOL_CRLF
);
4318 /* Here, we don't have to check coding->pre_write_conversion because
4319 the caller is expected to have handled it already. */
4320 switch (coding
->type
)
4322 case coding_type_iso2022
:
4323 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, 0) != CHARSET_ASCII
)
4324 /* We can't skip any data. */
4326 if (coding
->flags
& CODING_FLAG_ISO_DESIGNATE_AT_BOL
)
4328 unsigned char *bol
= begp
;
4329 while (begp
< endp
&& *begp
< 0x80)
4332 if (begp
[-1] == '\n')
4336 goto label_skip_tail
;
4340 case coding_type_sjis
:
4341 case coding_type_big5
:
4342 /* We can skip all ASCII characters at the head and tail. */
4344 while (begp
< endp
&& *begp
< 0x80 && *begp
!= '\n') begp
++;
4346 while (begp
< endp
&& *begp
< 0x80) begp
++;
4349 while (begp
< endp
&& endp
[-1] < 0x80 && endp
[-1] != '\n') endp
--;
4351 while (begp
< endp
&& *(endp
- 1) < 0x80) endp
--;
4358 *beg
+= begp
- begp_orig
;
4359 *end
+= endp
- endp_orig
;
4363 /* As shrinking conversion region requires some overhead, we don't try
4364 shrinking if the length of conversion region is less than this
4366 static int shrink_conversion_region_threshhold
= 1024;
4368 #define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep) \
4370 if (*(end) - *(beg) > shrink_conversion_region_threshhold) \
4372 if (encodep) shrink_encoding_region (beg, end, coding, str); \
4373 else shrink_decoding_region (beg, end, coding, str); \
4378 code_convert_region_unwind (dummy
)
4381 inhibit_pre_post_conversion
= 0;
4385 /* Store information about all compositions in the range FROM and TO
4386 of OBJ in memory blocks pointed by CODING->cmp_data. OBJ is a
4387 buffer or a string, defaults to the current buffer. */
4390 coding_save_composition (coding
, from
, to
, obj
)
4391 struct coding_system
*coding
;
4398 if (coding
->composing
== COMPOSITION_DISABLED
)
4400 if (!coding
->cmp_data
)
4401 coding_allocate_composition_data (coding
, from
);
4402 if (!find_composition (from
, to
, &start
, &end
, &prop
, obj
)
4406 && (!find_composition (end
, to
, &start
, &end
, &prop
, obj
)
4409 coding
->composing
= COMPOSITION_NO
;
4412 if (COMPOSITION_VALID_P (start
, end
, prop
))
4414 enum composition_method method
= COMPOSITION_METHOD (prop
);
4415 if (coding
->cmp_data
->used
+ COMPOSITION_DATA_MAX_BUNCH_LENGTH
4416 >= COMPOSITION_DATA_SIZE
)
4417 coding_allocate_composition_data (coding
, from
);
4418 /* For relative composition, we remember start and end
4419 positions, for the other compositions, we also remember
4421 CODING_ADD_COMPOSITION_START (coding
, start
- from
, method
);
4422 if (method
!= COMPOSITION_RELATIVE
)
4424 /* We must store a*/
4425 Lisp_Object val
, ch
;
4427 val
= COMPOSITION_COMPONENTS (prop
);
4431 ch
= XCAR (val
), val
= XCDR (val
);
4432 CODING_ADD_COMPOSITION_COMPONENT (coding
, XINT (ch
));
4434 else if (VECTORP (val
) || STRINGP (val
))
4436 int len
= (VECTORP (val
)
4437 ? XVECTOR (val
)->size
: XSTRING (val
)->size
);
4439 for (i
= 0; i
< len
; i
++)
4442 ? Faref (val
, make_number (i
))
4443 : XVECTOR (val
)->contents
[i
]);
4444 CODING_ADD_COMPOSITION_COMPONENT (coding
, XINT (ch
));
4447 else /* INTEGERP (val) */
4448 CODING_ADD_COMPOSITION_COMPONENT (coding
, XINT (val
));
4450 CODING_ADD_COMPOSITION_END (coding
, end
- from
);
4455 && find_composition (start
, to
, &start
, &end
, &prop
, obj
)
4458 /* Make coding->cmp_data point to the first memory block. */
4459 while (coding
->cmp_data
->prev
)
4460 coding
->cmp_data
= coding
->cmp_data
->prev
;
4461 coding
->cmp_data_start
= 0;
4464 /* Reflect the saved information about compositions to OBJ.
4465 CODING->cmp_data points to a memory block for the information. OBJ
4466 is a buffer or a string, defaults to the current buffer. */
4469 coding_restore_composition (coding
, obj
)
4470 struct coding_system
*coding
;
4473 struct composition_data
*cmp_data
= coding
->cmp_data
;
4478 while (cmp_data
->prev
)
4479 cmp_data
= cmp_data
->prev
;
4485 for (i
= 0; i
< cmp_data
->used
; i
+= cmp_data
->data
[i
])
4487 int *data
= cmp_data
->data
+ i
;
4488 enum composition_method method
= (enum composition_method
) data
[3];
4489 Lisp_Object components
;
4491 if (method
== COMPOSITION_RELATIVE
)
4495 int len
= data
[0] - 4, j
;
4496 Lisp_Object args
[MAX_COMPOSITION_COMPONENTS
* 2 - 1];
4498 for (j
= 0; j
< len
; j
++)
4499 args
[j
] = make_number (data
[4 + j
]);
4500 components
= (method
== COMPOSITION_WITH_ALTCHARS
4501 ? Fstring (len
, args
) : Fvector (len
, args
));
4503 compose_text (data
[1], data
[2], components
, Qnil
, obj
);
4505 cmp_data
= cmp_data
->next
;
4509 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
4510 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4511 coding system CODING, and return the status code of code conversion
4512 (currently, this value has no meaning).
4514 How many characters (and bytes) are converted to how many
4515 characters (and bytes) are recorded in members of the structure
4518 If REPLACE is nonzero, we do various things as if the original text
4519 is deleted and a new text is inserted. See the comments in
4520 replace_range (insdel.c) to know what we are doing.
4522 If REPLACE is zero, it is assumed that the source text is unibyte.
4523 Otherwise, it is assumed that the source text is multibyte. */
4526 code_convert_region (from
, from_byte
, to
, to_byte
, coding
, encodep
, replace
)
4527 int from
, from_byte
, to
, to_byte
, encodep
, replace
;
4528 struct coding_system
*coding
;
4530 int len
= to
- from
, len_byte
= to_byte
- from_byte
;
4531 int require
, inserted
, inserted_byte
;
4532 int head_skip
, tail_skip
, total_skip
= 0;
4533 Lisp_Object saved_coding_symbol
;
4535 unsigned char *src
, *dst
;
4536 Lisp_Object deletion
;
4537 int orig_point
= PT
, orig_len
= len
;
4539 int multibyte_p
= !NILP (current_buffer
->enable_multibyte_characters
);
4541 coding
->src_multibyte
= replace
&& multibyte_p
;
4542 coding
->dst_multibyte
= multibyte_p
;
4545 saved_coding_symbol
= Qnil
;
4547 if (from
< PT
&& PT
< to
)
4549 TEMP_SET_PT_BOTH (from
, from_byte
);
4555 int saved_from
= from
;
4557 prepare_to_modify_buffer (from
, to
, &from
);
4558 if (saved_from
!= from
)
4561 from_byte
= CHAR_TO_BYTE (from
), to_byte
= CHAR_TO_BYTE (to
);
4562 len_byte
= to_byte
- from_byte
;
4566 if (! encodep
&& CODING_REQUIRE_DETECTION (coding
))
4568 /* We must detect encoding of text and eol format. */
4570 if (from
< GPT
&& to
> GPT
)
4571 move_gap_both (from
, from_byte
);
4572 if (coding
->type
== coding_type_undecided
)
4574 detect_coding (coding
, BYTE_POS_ADDR (from_byte
), len_byte
);
4575 if (coding
->type
== coding_type_undecided
)
4576 /* It seems that the text contains only ASCII, but we
4577 should not leave it undecided because the deeper
4578 decoding routine (decode_coding) tries to detect the
4579 encodings again in vain. */
4580 coding
->type
= coding_type_emacs_mule
;
4582 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
4584 saved_coding_symbol
= coding
->symbol
;
4585 detect_eol (coding
, BYTE_POS_ADDR (from_byte
), len_byte
);
4586 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
4587 coding
->eol_type
= CODING_EOL_LF
;
4588 /* We had better recover the original eol format if we
4589 encounter an inconsistent eol format while decoding. */
4590 coding
->mode
|= CODING_MODE_INHIBIT_INCONSISTENT_EOL
;
4594 /* Now we convert the text. */
4596 /* For encoding, we must process pre-write-conversion in advance. */
4597 if (! inhibit_pre_post_conversion
4599 && SYMBOLP (coding
->pre_write_conversion
)
4600 && ! NILP (Ffboundp (coding
->pre_write_conversion
)))
4602 /* The function in pre-write-conversion may put a new text in a
4604 struct buffer
*prev
= current_buffer
;
4606 int count
= specpdl_ptr
- specpdl
;
4608 record_unwind_protect (code_convert_region_unwind
, Qnil
);
4609 /* We should not call any more pre-write/post-read-conversion
4610 functions while this pre-write-conversion is running. */
4611 inhibit_pre_post_conversion
= 1;
4612 call2 (coding
->pre_write_conversion
,
4613 make_number (from
), make_number (to
));
4614 inhibit_pre_post_conversion
= 0;
4615 /* Discard the unwind protect. */
4618 if (current_buffer
!= prev
)
4621 new = Fcurrent_buffer ();
4622 set_buffer_internal_1 (prev
);
4623 del_range_2 (from
, from_byte
, to
, to_byte
, 0);
4624 TEMP_SET_PT_BOTH (from
, from_byte
);
4625 insert_from_buffer (XBUFFER (new), 1, len
, 0);
4627 if (orig_point
>= to
)
4628 orig_point
+= len
- orig_len
;
4629 else if (orig_point
> from
)
4633 from_byte
= CHAR_TO_BYTE (from
);
4634 to_byte
= CHAR_TO_BYTE (to
);
4635 len_byte
= to_byte
- from_byte
;
4636 TEMP_SET_PT_BOTH (from
, from_byte
);
4641 deletion
= make_buffer_string_both (from
, from_byte
, to
, to_byte
, 1);
4643 if (coding
->composing
!= COMPOSITION_DISABLED
)
4646 coding_save_composition (coding
, from
, to
, Fcurrent_buffer ());
4648 coding_allocate_composition_data (coding
, from
);
4651 /* Try to skip the leading and trailing ASCII characters. */
4653 int from_byte_orig
= from_byte
, to_byte_orig
= to_byte
;
4655 if (from
< GPT
&& GPT
< to
)
4656 move_gap_both (from
, from_byte
);
4657 SHRINK_CONVERSION_REGION (&from_byte
, &to_byte
, coding
, NULL
, encodep
);
4658 if (from_byte
== to_byte
4659 && (encodep
|| NILP (coding
->post_read_conversion
))
4660 && ! CODING_REQUIRE_FLUSHING (coding
))
4662 coding
->produced
= len_byte
;
4663 coding
->produced_char
= len
;
4665 /* We must record and adjust for this new text now. */
4666 adjust_after_insert (from
, from_byte_orig
, to
, to_byte_orig
, len
);
4670 head_skip
= from_byte
- from_byte_orig
;
4671 tail_skip
= to_byte_orig
- to_byte
;
4672 total_skip
= head_skip
+ tail_skip
;
4675 len
-= total_skip
; len_byte
-= total_skip
;
4678 /* The code conversion routine can not preserve text properties for
4679 now. So, we must remove all text properties in the region.
4680 Here, we must suppress all modification hooks. */
4683 int saved_inhibit_modification_hooks
= inhibit_modification_hooks
;
4684 inhibit_modification_hooks
= 1;
4685 Fset_text_properties (make_number (from
), make_number (to
), Qnil
, Qnil
);
4686 inhibit_modification_hooks
= saved_inhibit_modification_hooks
;
4689 /* For conversion, we must put the gap before the text in addition to
4690 making the gap larger for efficient decoding. The required gap
4691 size starts from 2000 which is the magic number used in make_gap.
4692 But, after one batch of conversion, it will be incremented if we
4693 find that it is not enough. */
4696 if (GAP_SIZE
< require
)
4697 make_gap (require
- GAP_SIZE
);
4698 move_gap_both (from
, from_byte
);
4700 inserted
= inserted_byte
= 0;
4702 GAP_SIZE
+= len_byte
;
4705 ZV_BYTE
-= len_byte
;
4708 if (GPT
- BEG
< BEG_UNCHANGED
)
4709 BEG_UNCHANGED
= GPT
- BEG
;
4710 if (Z
- GPT
< END_UNCHANGED
)
4711 END_UNCHANGED
= Z
- GPT
;
4713 if (!encodep
&& coding
->src_multibyte
)
4715 /* Decoding routines expect that the source text is unibyte.
4716 We must convert 8-bit characters of multibyte form to
4718 int len_byte_orig
= len_byte
;
4719 len_byte
= str_as_unibyte (GAP_END_ADDR
- len_byte
, len_byte
);
4720 if (len_byte
< len_byte_orig
)
4721 safe_bcopy (GAP_END_ADDR
- len_byte_orig
, GAP_END_ADDR
- len_byte
,
4723 coding
->src_multibyte
= 0;
4730 /* The buffer memory is now:
4731 +--------+converted-text+---------+-------original-text-------+---+
4732 |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
4733 |<---------------------- GAP ----------------------->| */
4734 src
= GAP_END_ADDR
- len_byte
;
4735 dst
= GPT_ADDR
+ inserted_byte
;
4738 result
= encode_coding (coding
, src
, dst
, len_byte
, 0);
4740 result
= decode_coding (coding
, src
, dst
, len_byte
, 0);
4742 /* The buffer memory is now:
4743 +--------+-------converted-text----+--+------original-text----+---+
4744 |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
4745 |<---------------------- GAP ----------------------->| */
4747 inserted
+= coding
->produced_char
;
4748 inserted_byte
+= coding
->produced
;
4749 len_byte
-= coding
->consumed
;
4751 if (result
== CODING_FINISH_INSUFFICIENT_CMP
)
4753 coding_allocate_composition_data (coding
, from
+ inserted
);
4757 src
+= coding
->consumed
;
4758 dst
+= coding
->produced
;
4760 if (result
== CODING_FINISH_NORMAL
)
4765 if (! encodep
&& result
== CODING_FINISH_INCONSISTENT_EOL
)
4767 unsigned char *pend
= dst
, *p
= pend
- inserted_byte
;
4768 Lisp_Object eol_type
;
4770 /* Encode LFs back to the original eol format (CR or CRLF). */
4771 if (coding
->eol_type
== CODING_EOL_CR
)
4773 while (p
< pend
) if (*p
++ == '\n') p
[-1] = '\r';
4779 while (p
< pend
) if (*p
++ == '\n') count
++;
4780 if (src
- dst
< count
)
4782 /* We don't have sufficient room for encoding LFs
4783 back to CRLF. We must record converted and
4784 not-yet-converted text back to the buffer
4785 content, enlarge the gap, then record them out of
4786 the buffer contents again. */
4787 int add
= len_byte
+ inserted_byte
;
4790 ZV
+= add
; Z
+= add
; ZV_BYTE
+= add
; Z_BYTE
+= add
;
4791 GPT
+= inserted_byte
; GPT_BYTE
+= inserted_byte
;
4792 make_gap (count
- GAP_SIZE
);
4794 ZV
-= add
; Z
-= add
; ZV_BYTE
-= add
; Z_BYTE
-= add
;
4795 GPT
-= inserted_byte
; GPT_BYTE
-= inserted_byte
;
4796 /* Don't forget to update SRC, DST, and PEND. */
4797 src
= GAP_END_ADDR
- len_byte
;
4798 dst
= GPT_ADDR
+ inserted_byte
;
4802 inserted_byte
+= count
;
4803 coding
->produced
+= count
;
4804 p
= dst
= pend
+ count
;
4808 if (*p
== '\n') count
--, *--p
= '\r';
4812 /* Suppress eol-format conversion in the further conversion. */
4813 coding
->eol_type
= CODING_EOL_LF
;
4815 /* Set the coding system symbol to that for Unix-like EOL. */
4816 eol_type
= Fget (saved_coding_symbol
, Qeol_type
);
4817 if (VECTORP (eol_type
)
4818 && XVECTOR (eol_type
)->size
== 3
4819 && SYMBOLP (XVECTOR (eol_type
)->contents
[CODING_EOL_LF
]))
4820 coding
->symbol
= XVECTOR (eol_type
)->contents
[CODING_EOL_LF
];
4822 coding
->symbol
= saved_coding_symbol
;
4828 if (coding
->type
!= coding_type_ccl
4829 || coding
->mode
& CODING_MODE_LAST_BLOCK
)
4831 coding
->mode
|= CODING_MODE_LAST_BLOCK
;
4834 if (result
== CODING_FINISH_INSUFFICIENT_SRC
)
4836 /* The source text ends in invalid codes. Let's just
4837 make them valid buffer contents, and finish conversion. */
4838 inserted
+= len_byte
;
4839 inserted_byte
+= len_byte
;
4844 if (result
== CODING_FINISH_INTERRUPT
)
4846 /* The conversion procedure was interrupted by a user. */
4849 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
4850 if (coding
->consumed
< 1)
4852 /* It's quite strange to require more memory without
4853 consuming any bytes. Perhaps CCL program bug. */
4858 /* We have just done the first batch of conversion which was
4859 stopped because of insufficient gap. Let's reconsider the
4860 required gap size (i.e. SRC - DST) now.
4862 We have converted ORIG bytes (== coding->consumed) into
4863 NEW bytes (coding->produced). To convert the remaining
4864 LEN bytes, we may need REQUIRE bytes of gap, where:
4865 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
4866 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
4867 Here, we are sure that NEW >= ORIG. */
4868 float ratio
= coding
->produced
- coding
->consumed
;
4869 ratio
/= coding
->consumed
;
4870 require
= len_byte
* ratio
;
4873 if ((src
- dst
) < (require
+ 2000))
4875 /* See the comment above the previous call of make_gap. */
4876 int add
= len_byte
+ inserted_byte
;
4879 ZV
+= add
; Z
+= add
; ZV_BYTE
+= add
; Z_BYTE
+= add
;
4880 GPT
+= inserted_byte
; GPT_BYTE
+= inserted_byte
;
4881 make_gap (require
+ 2000);
4883 ZV
-= add
; Z
-= add
; ZV_BYTE
-= add
; Z_BYTE
-= add
;
4884 GPT
-= inserted_byte
; GPT_BYTE
-= inserted_byte
;
4887 if (src
- dst
> 0) *dst
= 0; /* Put an anchor. */
4889 if (encodep
&& coding
->dst_multibyte
)
4891 /* The output is unibyte. We must convert 8-bit characters to
4893 if (inserted_byte
* 2 > GAP_SIZE
)
4895 GAP_SIZE
-= inserted_byte
;
4896 ZV
+= inserted_byte
; Z
+= inserted_byte
;
4897 ZV_BYTE
+= inserted_byte
; Z_BYTE
+= inserted_byte
;
4898 GPT
+= inserted_byte
; GPT_BYTE
+= inserted_byte
;
4899 make_gap (inserted_byte
- GAP_SIZE
);
4900 GAP_SIZE
+= inserted_byte
;
4901 ZV
-= inserted_byte
; Z
-= inserted_byte
;
4902 ZV_BYTE
-= inserted_byte
; Z_BYTE
-= inserted_byte
;
4903 GPT
-= inserted_byte
; GPT_BYTE
-= inserted_byte
;
4905 inserted_byte
= str_to_multibyte (GPT_ADDR
, GAP_SIZE
, inserted_byte
);
4908 /* If we have shrunk the conversion area, adjust it now. */
4912 safe_bcopy (GAP_END_ADDR
, GPT_ADDR
+ inserted_byte
, tail_skip
);
4913 inserted
+= total_skip
; inserted_byte
+= total_skip
;
4914 GAP_SIZE
+= total_skip
;
4915 GPT
-= head_skip
; GPT_BYTE
-= head_skip
;
4916 ZV
-= total_skip
; ZV_BYTE
-= total_skip
;
4917 Z
-= total_skip
; Z_BYTE
-= total_skip
;
4918 from
-= head_skip
; from_byte
-= head_skip
;
4919 to
+= tail_skip
; to_byte
+= tail_skip
;
4923 adjust_after_replace (from
, from_byte
, deletion
, inserted
, inserted_byte
);
4924 inserted
= Z
- prev_Z
;
4926 if (!encodep
&& coding
->cmp_data
&& coding
->cmp_data
->used
)
4927 coding_restore_composition (coding
, Fcurrent_buffer ());
4928 coding_free_composition_data (coding
);
4930 if (! inhibit_pre_post_conversion
4931 && ! encodep
&& ! NILP (coding
->post_read_conversion
))
4934 int count
= specpdl_ptr
- specpdl
;
4937 TEMP_SET_PT_BOTH (from
, from_byte
);
4939 record_unwind_protect (code_convert_region_unwind
, Qnil
);
4940 /* We should not call any more pre-write/post-read-conversion
4941 functions while this post-read-conversion is running. */
4942 inhibit_pre_post_conversion
= 1;
4943 val
= call1 (coding
->post_read_conversion
, make_number (inserted
));
4944 inhibit_pre_post_conversion
= 0;
4945 /* Discard the unwind protect. */
4947 CHECK_NUMBER (val
, 0);
4948 inserted
+= Z
- prev_Z
;
4951 if (orig_point
>= from
)
4953 if (orig_point
>= from
+ orig_len
)
4954 orig_point
+= inserted
- orig_len
;
4957 TEMP_SET_PT (orig_point
);
4962 signal_after_change (from
, to
- from
, inserted
);
4963 update_compositions (from
, from
+ inserted
, CHECK_BORDER
);
4967 coding
->consumed
= to_byte
- from_byte
;
4968 coding
->consumed_char
= to
- from
;
4969 coding
->produced
= inserted_byte
;
4970 coding
->produced_char
= inserted
;
4977 run_pre_post_conversion_on_str (str
, coding
, encodep
)
4979 struct coding_system
*coding
;
4982 int count
= specpdl_ptr
- specpdl
;
4983 struct gcpro gcpro1
;
4984 struct buffer
*prev
= current_buffer
;
4985 int multibyte
= STRING_MULTIBYTE (str
);
4987 record_unwind_protect (Fset_buffer
, Fcurrent_buffer ());
4988 record_unwind_protect (code_convert_region_unwind
, Qnil
);
4990 temp_output_buffer_setup (" *code-converting-work*");
4991 set_buffer_internal (XBUFFER (Vstandard_output
));
4992 /* We must insert the contents of STR as is without
4993 unibyte<->multibyte conversion. For that, we adjust the
4994 multibyteness of the working buffer to that of STR. */
4996 current_buffer
->enable_multibyte_characters
= multibyte
? Qt
: Qnil
;
4997 insert_from_string (str
, 0, 0,
4998 XSTRING (str
)->size
, STRING_BYTES (XSTRING (str
)), 0);
5000 inhibit_pre_post_conversion
= 1;
5002 call2 (coding
->pre_write_conversion
, make_number (BEG
), make_number (Z
));
5005 TEMP_SET_PT_BOTH (BEG
, BEG_BYTE
);
5006 call1 (coding
->post_read_conversion
, make_number (Z
- BEG
));
5008 inhibit_pre_post_conversion
= 0;
5009 str
= make_buffer_string (BEG
, Z
, 0);
5010 return unbind_to (count
, str
);
5014 decode_coding_string (str
, coding
, nocopy
)
5016 struct coding_system
*coding
;
5021 int from
, to
, to_byte
;
5022 struct gcpro gcpro1
;
5023 Lisp_Object saved_coding_symbol
;
5027 to
= XSTRING (str
)->size
;
5028 to_byte
= STRING_BYTES (XSTRING (str
));
5030 saved_coding_symbol
= Qnil
;
5031 if (CODING_REQUIRE_DETECTION (coding
))
5033 /* See the comments in code_convert_region. */
5034 if (coding
->type
== coding_type_undecided
)
5036 detect_coding (coding
, XSTRING (str
)->data
, to_byte
);
5037 if (coding
->type
== coding_type_undecided
)
5038 coding
->type
= coding_type_emacs_mule
;
5040 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
5042 saved_coding_symbol
= coding
->symbol
;
5043 detect_eol (coding
, XSTRING (str
)->data
, to_byte
);
5044 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
5045 coding
->eol_type
= CODING_EOL_LF
;
5046 /* We had better recover the original eol format if we
5047 encounter an inconsistent eol format while decoding. */
5048 coding
->mode
|= CODING_MODE_INHIBIT_INCONSISTENT_EOL
;
5052 if (! CODING_REQUIRE_DECODING (coding
))
5054 if (!STRING_MULTIBYTE (str
))
5056 str
= Fstring_as_multibyte (str
);
5059 return (nocopy
? str
: Fcopy_sequence (str
));
5062 if (STRING_MULTIBYTE (str
))
5064 /* Decoding routines expect the source text to be unibyte. */
5065 str
= Fstring_as_unibyte (str
);
5067 coding
->src_multibyte
= 0;
5069 coding
->dst_multibyte
= 1;
5071 if (coding
->composing
!= COMPOSITION_DISABLED
)
5072 coding_allocate_composition_data (coding
, from
);
5074 /* Try to skip the leading and trailing ASCII characters. */
5076 int from_orig
= from
;
5078 SHRINK_CONVERSION_REGION (&from
, &to_byte
, coding
, XSTRING (str
)->data
,
5080 if (from
== to_byte
)
5081 return (nocopy
? str
: Fcopy_sequence (str
));
5084 len
= decoding_buffer_size (coding
, to_byte
- from
);
5085 len
+= from
+ STRING_BYTES (XSTRING (str
)) - to_byte
;
5087 buf
= get_conversion_buffer (len
);
5091 bcopy (XSTRING (str
)->data
, buf
, from
);
5092 result
= decode_coding (coding
, XSTRING (str
)->data
+ from
,
5093 buf
+ from
, to_byte
- from
, len
);
5094 if (result
== CODING_FINISH_INCONSISTENT_EOL
)
5096 /* We simply try to decode the whole string again but without
5097 eol-conversion this time. */
5098 coding
->eol_type
= CODING_EOL_LF
;
5099 coding
->symbol
= saved_coding_symbol
;
5100 coding_free_composition_data (coding
);
5101 return decode_coding_string (str
, coding
, nocopy
);
5104 bcopy (XSTRING (str
)->data
+ to_byte
, buf
+ from
+ coding
->produced
,
5105 STRING_BYTES (XSTRING (str
)) - to_byte
);
5107 len
= from
+ STRING_BYTES (XSTRING (str
)) - to_byte
;
5108 str
= make_multibyte_string (buf
, len
+ coding
->produced_char
,
5109 len
+ coding
->produced
);
5111 if (coding
->cmp_data
&& coding
->cmp_data
->used
)
5112 coding_restore_composition (coding
, str
);
5113 coding_free_composition_data (coding
);
5115 if (SYMBOLP (coding
->post_read_conversion
)
5116 && !NILP (Ffboundp (coding
->post_read_conversion
)))
5117 str
= run_pre_post_conversion_on_str (str
, coding
, 0);
5123 encode_coding_string (str
, coding
, nocopy
)
5125 struct coding_system
*coding
;
5130 int from
, to
, to_byte
;
5131 struct gcpro gcpro1
;
5132 Lisp_Object saved_coding_symbol
;
5135 if (SYMBOLP (coding
->pre_write_conversion
)
5136 && !NILP (Ffboundp (coding
->pre_write_conversion
)))
5137 str
= run_pre_post_conversion_on_str (str
, coding
, 1);
5140 to
= XSTRING (str
)->size
;
5141 to_byte
= STRING_BYTES (XSTRING (str
));
5143 saved_coding_symbol
= Qnil
;
5144 if (! CODING_REQUIRE_ENCODING (coding
))
5146 if (STRING_MULTIBYTE (str
))
5148 str
= Fstring_as_unibyte (str
);
5151 return (nocopy
? str
: Fcopy_sequence (str
));
5154 /* Encoding routines determine the multibyteness of the source text
5155 by coding->src_multibyte. */
5156 coding
->src_multibyte
= STRING_MULTIBYTE (str
);
5157 coding
->dst_multibyte
= 0;
5159 if (coding
->composing
!= COMPOSITION_DISABLED
)
5160 coding_save_composition (coding
, from
, to
, str
);
5162 /* Try to skip the leading and trailing ASCII characters. */
5164 int from_orig
= from
;
5166 SHRINK_CONVERSION_REGION (&from
, &to_byte
, coding
, XSTRING (str
)->data
,
5168 if (from
== to_byte
)
5169 return (nocopy
? str
: Fcopy_sequence (str
));
5172 len
= encoding_buffer_size (coding
, to_byte
- from
);
5173 len
+= from
+ STRING_BYTES (XSTRING (str
)) - to_byte
;
5175 buf
= get_conversion_buffer (len
);
5179 bcopy (XSTRING (str
)->data
, buf
, from
);
5180 result
= encode_coding (coding
, XSTRING (str
)->data
+ from
,
5181 buf
+ from
, to_byte
- from
, len
);
5182 bcopy (XSTRING (str
)->data
+ to_byte
, buf
+ from
+ coding
->produced
,
5183 STRING_BYTES (XSTRING (str
)) - to_byte
);
5185 len
= from
+ STRING_BYTES (XSTRING (str
)) - to_byte
;
5186 str
= make_unibyte_string (buf
, len
+ coding
->produced
);
5187 coding_free_composition_data (coding
);
5194 /*** 8. Emacs Lisp library functions ***/
5196 DEFUN ("coding-system-p", Fcoding_system_p
, Scoding_system_p
, 1, 1, 0,
5197 "Return t if OBJECT is nil or a coding-system.\n\
5198 See the documentation of `make-coding-system' for information\n\
5199 about coding-system objects.")
5207 /* Get coding-spec vector for OBJ. */
5208 obj
= Fget (obj
, Qcoding_system
);
5209 return ((VECTORP (obj
) && XVECTOR (obj
)->size
== 5)
5213 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system
,
5214 Sread_non_nil_coding_system
, 1, 1, 0,
5215 "Read a coding system from the minibuffer, prompting with string PROMPT.")
5222 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
5223 Qt
, Qnil
, Qcoding_system_history
, Qnil
, Qnil
);
5225 while (XSTRING (val
)->size
== 0);
5226 return (Fintern (val
, Qnil
));
5229 DEFUN ("read-coding-system", Fread_coding_system
, Sread_coding_system
, 1, 2, 0,
5230 "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
5231 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
5232 (prompt
, default_coding_system
)
5233 Lisp_Object prompt
, default_coding_system
;
5236 if (SYMBOLP (default_coding_system
))
5237 XSETSTRING (default_coding_system
, XSYMBOL (default_coding_system
)->name
);
5238 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
5239 Qt
, Qnil
, Qcoding_system_history
,
5240 default_coding_system
, Qnil
);
5241 return (XSTRING (val
)->size
== 0 ? Qnil
: Fintern (val
, Qnil
));
5244 DEFUN ("check-coding-system", Fcheck_coding_system
, Scheck_coding_system
,
5246 "Check validity of CODING-SYSTEM.\n\
5247 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
5248 It is valid if it is a symbol with a non-nil `coding-system' property.\n\
5249 The value of property should be a vector of length 5.")
5251 Lisp_Object coding_system
;
5253 CHECK_SYMBOL (coding_system
, 0);
5254 if (!NILP (Fcoding_system_p (coding_system
)))
5255 return coding_system
;
5257 Fsignal (Qcoding_system_error
, Fcons (coding_system
, Qnil
));
5261 detect_coding_system (src
, src_bytes
, highest
)
5263 int src_bytes
, highest
;
5265 int coding_mask
, eol_type
;
5266 Lisp_Object val
, tmp
;
5269 coding_mask
= detect_coding_mask (src
, src_bytes
, NULL
, &dummy
);
5270 eol_type
= detect_eol_type (src
, src_bytes
, &dummy
);
5271 if (eol_type
== CODING_EOL_INCONSISTENT
)
5272 eol_type
= CODING_EOL_UNDECIDED
;
5277 if (eol_type
!= CODING_EOL_UNDECIDED
)
5280 val2
= Fget (Qundecided
, Qeol_type
);
5282 val
= XVECTOR (val2
)->contents
[eol_type
];
5284 return (highest
? val
: Fcons (val
, Qnil
));
5287 /* At first, gather possible coding systems in VAL. */
5289 for (tmp
= Vcoding_category_list
; CONSP (tmp
); tmp
= XCDR (tmp
))
5291 Lisp_Object category_val
, category_index
;
5293 category_index
= Fget (XCAR (tmp
), Qcoding_category_index
);
5294 category_val
= Fsymbol_value (XCAR (tmp
));
5295 if (!NILP (category_val
)
5296 && NATNUMP (category_index
)
5297 && (coding_mask
& (1 << XFASTINT (category_index
))))
5299 val
= Fcons (category_val
, val
);
5305 val
= Fnreverse (val
);
5307 /* Then, replace the elements with subsidiary coding systems. */
5308 for (tmp
= val
; CONSP (tmp
); tmp
= XCDR (tmp
))
5310 if (eol_type
!= CODING_EOL_UNDECIDED
5311 && eol_type
!= CODING_EOL_INCONSISTENT
)
5314 eol
= Fget (XCAR (tmp
), Qeol_type
);
5316 XCAR (tmp
) = XVECTOR (eol
)->contents
[eol_type
];
5319 return (highest
? XCAR (val
) : val
);
5322 DEFUN ("detect-coding-region", Fdetect_coding_region
, Sdetect_coding_region
,
5324 "Detect coding system of the text in the region between START and END.\n\
5325 Return a list of possible coding systems ordered by priority.\n\
5327 If only ASCII characters are found, it returns a list of single element\n\
5328 `undecided' or its subsidiary coding system according to a detected\n\
5329 end-of-line format.\n\
5331 If optional argument HIGHEST is non-nil, return the coding system of\n\
5333 (start
, end
, highest
)
5334 Lisp_Object start
, end
, highest
;
5337 int from_byte
, to_byte
;
5339 CHECK_NUMBER_COERCE_MARKER (start
, 0);
5340 CHECK_NUMBER_COERCE_MARKER (end
, 1);
5342 validate_region (&start
, &end
);
5343 from
= XINT (start
), to
= XINT (end
);
5344 from_byte
= CHAR_TO_BYTE (from
);
5345 to_byte
= CHAR_TO_BYTE (to
);
5347 if (from
< GPT
&& to
>= GPT
)
5348 move_gap_both (to
, to_byte
);
5350 return detect_coding_system (BYTE_POS_ADDR (from_byte
),
5351 to_byte
- from_byte
,
5355 DEFUN ("detect-coding-string", Fdetect_coding_string
, Sdetect_coding_string
,
5357 "Detect coding system of the text in STRING.\n\
5358 Return a list of possible coding systems ordered by priority.\n\
5360 If only ASCII characters are found, it returns a list of single element\n\
5361 `undecided' or its subsidiary coding system according to a detected\n\
5362 end-of-line format.\n\
5364 If optional argument HIGHEST is non-nil, return the coding system of\n\
5367 Lisp_Object string
, highest
;
5369 CHECK_STRING (string
, 0);
5371 return detect_coding_system (XSTRING (string
)->data
,
5372 STRING_BYTES (XSTRING (string
)),
5377 code_convert_region1 (start
, end
, coding_system
, encodep
)
5378 Lisp_Object start
, end
, coding_system
;
5381 struct coding_system coding
;
5384 CHECK_NUMBER_COERCE_MARKER (start
, 0);
5385 CHECK_NUMBER_COERCE_MARKER (end
, 1);
5386 CHECK_SYMBOL (coding_system
, 2);
5388 validate_region (&start
, &end
);
5389 from
= XFASTINT (start
);
5390 to
= XFASTINT (end
);
5392 if (NILP (coding_system
))
5393 return make_number (to
- from
);
5395 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
5396 error ("Invalid coding system: %s", XSYMBOL (coding_system
)->name
->data
);
5398 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
5399 coding
.src_multibyte
= coding
.dst_multibyte
5400 = !NILP (current_buffer
->enable_multibyte_characters
);
5401 code_convert_region (from
, CHAR_TO_BYTE (from
), to
, CHAR_TO_BYTE (to
),
5402 &coding
, encodep
, 1);
5403 Vlast_coding_system_used
= coding
.symbol
;
5404 return make_number (coding
.produced_char
);
5407 DEFUN ("decode-coding-region", Fdecode_coding_region
, Sdecode_coding_region
,
5408 3, 3, "r\nzCoding system: ",
5409 "Decode the current region by specified coding system.\n\
5410 When called from a program, takes three arguments:\n\
5411 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
5412 This function sets `last-coding-system-used' to the precise coding system\n\
5413 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5414 not fully specified.)\n\
5415 It returns the length of the decoded text.")
5416 (start
, end
, coding_system
)
5417 Lisp_Object start
, end
, coding_system
;
5419 return code_convert_region1 (start
, end
, coding_system
, 0);
5422 DEFUN ("encode-coding-region", Fencode_coding_region
, Sencode_coding_region
,
5423 3, 3, "r\nzCoding system: ",
5424 "Encode the current region by specified coding system.\n\
5425 When called from a program, takes three arguments:\n\
5426 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
5427 This function sets `last-coding-system-used' to the precise coding system\n\
5428 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5429 not fully specified.)\n\
5430 It returns the length of the encoded text.")
5431 (start
, end
, coding_system
)
5432 Lisp_Object start
, end
, coding_system
;
5434 return code_convert_region1 (start
, end
, coding_system
, 1);
5438 code_convert_string1 (string
, coding_system
, nocopy
, encodep
)
5439 Lisp_Object string
, coding_system
, nocopy
;
5442 struct coding_system coding
;
5444 CHECK_STRING (string
, 0);
5445 CHECK_SYMBOL (coding_system
, 1);
5447 if (NILP (coding_system
))
5448 return (NILP (nocopy
) ? Fcopy_sequence (string
) : string
);
5450 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
5451 error ("Invalid coding system: %s", XSYMBOL (coding_system
)->name
->data
);
5453 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
5455 ? encode_coding_string (string
, &coding
, !NILP (nocopy
))
5456 : decode_coding_string (string
, &coding
, !NILP (nocopy
)));
5457 Vlast_coding_system_used
= coding
.symbol
;
5462 DEFUN ("decode-coding-string", Fdecode_coding_string
, Sdecode_coding_string
,
5464 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
5465 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5466 if the decoding operation is trivial.\n\
5467 This function sets `last-coding-system-used' to the precise coding system\n\
5468 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5469 not fully specified.)")
5470 (string
, coding_system
, nocopy
)
5471 Lisp_Object string
, coding_system
, nocopy
;
5473 return code_convert_string1 (string
, coding_system
, nocopy
, 0);
5476 DEFUN ("encode-coding-string", Fencode_coding_string
, Sencode_coding_string
,
5478 "Encode STRING to CODING-SYSTEM, and return the result.\n\
5479 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5480 if the encoding operation is trivial.\n\
5481 This function sets `last-coding-system-used' to the precise coding system\n\
5482 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5483 not fully specified.)")
5484 (string
, coding_system
, nocopy
)
5485 Lisp_Object string
, coding_system
, nocopy
;
5487 return code_convert_string1 (string
, coding_system
, nocopy
, 1);
5490 /* Encode or decode STRING according to CODING_SYSTEM.
5491 Do not set Vlast_coding_system_used.
5493 This function is called only from macros DECODE_FILE and
5494 ENCODE_FILE, thus we ignore character composition. */
5497 code_convert_string_norecord (string
, coding_system
, encodep
)
5498 Lisp_Object string
, coding_system
;
5501 struct coding_system coding
;
5503 CHECK_STRING (string
, 0);
5504 CHECK_SYMBOL (coding_system
, 1);
5506 if (NILP (coding_system
))
5509 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
5510 error ("Invalid coding system: %s", XSYMBOL (coding_system
)->name
->data
);
5512 coding
.composing
= COMPOSITION_DISABLED
;
5513 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
5515 ? encode_coding_string (string
, &coding
, 1)
5516 : decode_coding_string (string
, &coding
, 1));
5519 DEFUN ("decode-sjis-char", Fdecode_sjis_char
, Sdecode_sjis_char
, 1, 1, 0,
5520 "Decode a Japanese character which has CODE in shift_jis encoding.\n\
5521 Return the corresponding character.")
5525 unsigned char c1
, c2
, s1
, s2
;
5528 CHECK_NUMBER (code
, 0);
5529 s1
= (XFASTINT (code
)) >> 8, s2
= (XFASTINT (code
)) & 0xFF;
5533 XSETFASTINT (val
, s2
);
5534 else if (s2
>= 0xA0 || s2
<= 0xDF)
5535 XSETFASTINT (val
, MAKE_CHAR (charset_katakana_jisx0201
, s2
, 0));
5537 error ("Invalid Shift JIS code: %x", XFASTINT (code
));
5541 if ((s1
< 0x80 || s1
> 0x9F && s1
< 0xE0 || s1
> 0xEF)
5542 || (s2
< 0x40 || s2
== 0x7F || s2
> 0xFC))
5543 error ("Invalid Shift JIS code: %x", XFASTINT (code
));
5544 DECODE_SJIS (s1
, s2
, c1
, c2
);
5545 XSETFASTINT (val
, MAKE_CHAR (charset_jisx0208
, c1
, c2
));
5550 DEFUN ("encode-sjis-char", Fencode_sjis_char
, Sencode_sjis_char
, 1, 1, 0,
5551 "Encode a Japanese character CHAR to shift_jis encoding.\n\
5552 Return the corresponding code in SJIS.")
5556 int charset
, c1
, c2
, s1
, s2
;
5559 CHECK_NUMBER (ch
, 0);
5560 SPLIT_CHAR (XFASTINT (ch
), charset
, c1
, c2
);
5561 if (charset
== CHARSET_ASCII
)
5565 else if (charset
== charset_jisx0208
5566 && c1
> 0x20 && c1
< 0x7F && c2
> 0x20 && c2
< 0x7F)
5568 ENCODE_SJIS (c1
, c2
, s1
, s2
);
5569 XSETFASTINT (val
, (s1
<< 8) | s2
);
5571 else if (charset
== charset_katakana_jisx0201
5572 && c1
> 0x20 && c2
< 0xE0)
5574 XSETFASTINT (val
, c1
| 0x80);
5577 error ("Can't encode to shift_jis: %d", XFASTINT (ch
));
5581 DEFUN ("decode-big5-char", Fdecode_big5_char
, Sdecode_big5_char
, 1, 1, 0,
5582 "Decode a Big5 character which has CODE in BIG5 coding system.\n\
5583 Return the corresponding character.")
5588 unsigned char b1
, b2
, c1
, c2
;
5591 CHECK_NUMBER (code
, 0);
5592 b1
= (XFASTINT (code
)) >> 8, b2
= (XFASTINT (code
)) & 0xFF;
5596 error ("Invalid BIG5 code: %x", XFASTINT (code
));
5601 if ((b1
< 0xA1 || b1
> 0xFE)
5602 || (b2
< 0x40 || (b2
> 0x7E && b2
< 0xA1) || b2
> 0xFE))
5603 error ("Invalid BIG5 code: %x", XFASTINT (code
));
5604 DECODE_BIG5 (b1
, b2
, charset
, c1
, c2
);
5605 XSETFASTINT (val
, MAKE_CHAR (charset
, c1
, c2
));
5610 DEFUN ("encode-big5-char", Fencode_big5_char
, Sencode_big5_char
, 1, 1, 0,
5611 "Encode the Big5 character CHAR to BIG5 coding system.\n\
5612 Return the corresponding character code in Big5.")
5616 int charset
, c1
, c2
, b1
, b2
;
5619 CHECK_NUMBER (ch
, 0);
5620 SPLIT_CHAR (XFASTINT (ch
), charset
, c1
, c2
);
5621 if (charset
== CHARSET_ASCII
)
5625 else if ((charset
== charset_big5_1
5626 && (XFASTINT (ch
) >= 0x250a1 && XFASTINT (ch
) <= 0x271ec))
5627 || (charset
== charset_big5_2
5628 && XFASTINT (ch
) >= 0x290a1 && XFASTINT (ch
) <= 0x2bdb2))
5630 ENCODE_BIG5 (charset
, c1
, c2
, b1
, b2
);
5631 XSETFASTINT (val
, (b1
<< 8) | b2
);
5634 error ("Can't encode to Big5: %d", XFASTINT (ch
));
5638 DEFUN ("set-terminal-coding-system-internal",
5639 Fset_terminal_coding_system_internal
,
5640 Sset_terminal_coding_system_internal
, 1, 1, 0, "")
5642 Lisp_Object coding_system
;
5644 CHECK_SYMBOL (coding_system
, 0);
5645 setup_coding_system (Fcheck_coding_system (coding_system
), &terminal_coding
);
5646 /* We had better not send unsafe characters to terminal. */
5647 terminal_coding
.flags
|= CODING_FLAG_ISO_SAFE
;
5648 /* Character composition should be disabled. */
5649 terminal_coding
.composing
= COMPOSITION_DISABLED
;
5650 terminal_coding
.src_multibyte
= 1;
5651 terminal_coding
.dst_multibyte
= 0;
5655 DEFUN ("set-safe-terminal-coding-system-internal",
5656 Fset_safe_terminal_coding_system_internal
,
5657 Sset_safe_terminal_coding_system_internal
, 1, 1, 0, "")
5659 Lisp_Object coding_system
;
5661 CHECK_SYMBOL (coding_system
, 0);
5662 setup_coding_system (Fcheck_coding_system (coding_system
),
5663 &safe_terminal_coding
);
5664 /* Character composition should be disabled. */
5665 safe_terminal_coding
.composing
= COMPOSITION_DISABLED
;
5666 safe_terminal_coding
.src_multibyte
= 1;
5667 safe_terminal_coding
.dst_multibyte
= 0;
5671 DEFUN ("terminal-coding-system",
5672 Fterminal_coding_system
, Sterminal_coding_system
, 0, 0, 0,
5673 "Return coding system specified for terminal output.")
5676 return terminal_coding
.symbol
;
/* Lisp primitive `set-keyboard-coding-system-internal' (exactly one
   argument, empty doc string).  Checks that CODING_SYSTEM is a symbol,
   validates it with Fcheck_coding_system and sets up `keyboard_coding'
   from the result, then disables composition on it.
   NOTE(review): the parameter list, the braces and the return
   statement of this definition are not present in this listing.  */
5679 DEFUN ("set-keyboard-coding-system-internal",
5680 Fset_keyboard_coding_system_internal
,
5681 Sset_keyboard_coding_system_internal
, 1, 1, 0, "")
5683 Lisp_Object coding_system
;
5685 CHECK_SYMBOL (coding_system
, 0);
5686 setup_coding_system (Fcheck_coding_system (coding_system
), &keyboard_coding
);
5687 /* Character composition should be disabled. */
5688 keyboard_coding
.composing
= COMPOSITION_DISABLED
;
/* Lisp primitive `keyboard-coding-system' (no arguments): returns the
   `symbol' member of `keyboard_coding', i.e. the structure that
   `set-keyboard-coding-system-internal' fills in.  */
5692 DEFUN ("keyboard-coding-system",
5693 Fkeyboard_coding_system
, Skeyboard_coding_system
, 0, 0, 0,
5694 "Return coding system specified for decoding keyboard input.")
5697 return keyboard_coding
.symbol
;
/* Lisp primitive `find-operation-coding-system' (one or more
   arguments, passed as NARGS/ARGS).  ARGS[0] is OPERATION; the
   remaining elements are the arguments that were given to that
   primitive.  OPERATION's `target-idx' property selects which of them
   is the TARGET, the alist matching OPERATION is then searched for
   TARGET, and the value found is turned into a
   (DECODING-SYSTEM . ENCODING-SYSTEM) pair.
   NOTE(review): the (nargs, args) parameter declarations, several
   guards, braces and return statements of this definition are not
   present in this listing.  */
5701 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system
,
5702 Sfind_operation_coding_system
, 1, MANY
, 0,
5703 "Choose a coding system for an operation based on the target name.\n\
5704 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
5705 DECODING-SYSTEM is the coding system to use for decoding\n\
5706 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
5707 for encoding (in case OPERATION does encoding).\n\
5709 The first argument OPERATION specifies an I/O primitive:\n\
5710 For file I/O, `insert-file-contents' or `write-region'.\n\
5711 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
5712 For network I/O, `open-network-stream'.\n\
5714 The remaining arguments should be the same arguments that were passed\n\
5715 to the primitive. Depending on which primitive, one of those arguments\n\
5716 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
5717 whichever argument specifies the file name is TARGET.\n\
5719 TARGET has a meaning which depends on OPERATION:\n\
5720 For file I/O, TARGET is a file name.\n\
5721 For process I/O, TARGET is a process name.\n\
5722 For network I/O, TARGET is a service name or a port number\n\
5724 This function looks up what specified for TARGET in,\n\
5725 `file-coding-system-alist', `process-coding-system-alist',\n\
5726 or `network-coding-system-alist' depending on OPERATION.\n\
5727 They may specify a coding system, a cons of coding systems,\n\
5728 or a function symbol to call.\n\
5729 In the last case, we call the function with one argument,\n\
5730 which is a list of all the arguments given to this function.")
5735 Lisp_Object operation
, target_idx
, target
, val
;
5736 register Lisp_Object chain
;
/* NOTE(review): the condition guarding this error is not visible in
   this listing.  */
5739 error ("Too few arguments");
5740 operation
= args
[0];
/* OPERATION must be a symbol whose `target-idx' property is a
   non-negative integer.  The property is used below as an index into
   ARGS, so a negative value (which any Lisp code could `put' on the
   symbol) must be rejected here rather than allowed to index before
   the start of ARGS.  */
5741 if (!SYMBOLP (operation
)
5742 || !INTEGERP (target_idx
= Fget (operation
, Qtarget_idx
))
|| XINT (target_idx
) < 0)
5743 error ("Invalid first argument");
/* TARGET is read from args[XINT (target_idx) + 1], so ARGS must hold
   at least XINT (target_idx) + 2 elements.  Testing against
   1 + XINT (target_idx) would accept a call that is one argument
   short and read past the end of ARGS.  */
5744 if (nargs
< 2 + XINT (target_idx
))
5745 error ("Too few arguments for operation: %s",
5746 XSYMBOL (operation
)->name
->data
);
5747 target
= args
[XINT (target_idx
) + 1];
/* TARGET must be a string; an integer (port number) is accepted only
   for `open-network-stream'.  */
5748 if (!(STRINGP (target
)
5749 || (EQ (operation
, Qopen_network_stream
) && INTEGERP (target
))))
5750 error ("Invalid %dth argument", XINT (target_idx
) + 1);
/* Pick the alist: file operations use the file alist,
   `open-network-stream' the network alist, everything else the
   process alist.  */
5752 chain
= ((EQ (operation
, Qinsert_file_contents
)
5753 || EQ (operation
, Qwrite_region
))
5754 ? Vfile_coding_system_alist
5755 : (EQ (operation
, Qopen_network_stream
)
5756 ? Vnetwork_coding_system_alist
5757 : Vprocess_coding_system_alist
));
/* Walk the alist.  An element matches when its car is a string that
   fast_string_match finds in a string TARGET, or when an integer
   TARGET is EQ to its car.
   NOTE(review): the statements that bind `elt' and `val' are not
   visible in this listing.  */
5761 for (; CONSP (chain
); chain
= XCDR (chain
))
5767 && ((STRINGP (target
)
5768 && STRINGP (XCAR (elt
))
5769 && fast_string_match (XCAR (elt
), target
) >= 0)
5770 || (INTEGERP (target
) && EQ (target
, XCAR (elt
)))))
5773 /* Here, if VAL is both a valid coding system and a valid
5774 function symbol, we return VAL as a coding system. */
5777 if (! SYMBOLP (val
))
/* A coding-system symbol is used for both directions.  */
5779 if (! NILP (Fcoding_system_p (val
)))
5780 return Fcons (val
, val
);
/* Otherwise a function symbol is called with the full argument list
   and its result is interpreted the same way.  */
5781 if (! NILP (Ffboundp (val
)))
5783 val
= call1 (val
, Flist (nargs
, args
));
5786 if (SYMBOLP (val
) && ! NILP (Fcoding_system_p (val
)))
5787 return Fcons (val
, val
);
/* Lisp primitive `update-coding-systems-internal' (no arguments).
   For every coding category index from CODING_CATEGORY_IDX_EMACS_MULE
   up to (not including) CODING_CATEGORY_IDX_MAX, reads the value of
   the category symbol stored in Vcoding_category_table and refreshes
   the matching coding_system_table slot.
   NOTE(review): the local declarations, the `if' that pairs with the
   visible `else if', the braces and the return statement are not
   present in this listing.  */
5795 DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal
,
5796 Supdate_coding_systems_internal
, 0, 0, 0,
5797 "Update internal database for ISO2022 and CCL based coding systems.\n\
5798 When values of any coding categories are changed, you must\n\
5799 call this function")
5804 for (i
= CODING_CATEGORY_IDX_EMACS_MULE
; i
< CODING_CATEGORY_IDX_MAX
; i
++)
/* VAL is the current value cell of the I-th category symbol.  */
5808 val
= XSYMBOL (XVECTOR (Vcoding_category_table
)->contents
[i
])->value
;
/* Allocate the slot on first use, then (re)initialize it from VAL.  */
5811 if (! coding_system_table
[i
])
5812 coding_system_table
[i
] = ((struct coding_system
*)
5813 xmalloc (sizeof (struct coding_system
)));
5814 setup_coding_system (val
, coding_system_table
[i
]);
/* Alternative branch: release an existing slot and clear the pointer
   so it is not freed or used again.  */
5816 else if (coding_system_table
[i
])
5818 xfree (coding_system_table
[i
]);
5819 coding_system_table
[i
] = NULL
;
/* Lisp primitive `set-coding-priority-internal' (no arguments).
   Rebuilds the coding_priorities array from `coding-category-list':
   each list element contributes the bit mask (1 << idx), where idx is
   the element's `coding-category-index' property; any slots left over
   are filled with CODING_CATEGORY_MASK_RAW_TEXT.
   NOTE(review): the local declarations, the initialization of `i',
   the statements controlled by the two `if's, the step that advances
   VAL, the braces and the return statement are not present in this
   listing.  */
5826 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal
,
5827 Sset_coding_priority_internal
, 0, 0, 0,
5828 "Update internal database for the current value of `coding-category-list'.\n\
5829 This function is internal use only.")
5835 val
= Vcoding_category_list
;
/* Consume list elements until the list ends or every priority slot
   has been filled.  */
5837 while (CONSP (val
) && i
< CODING_CATEGORY_IDX_MAX
)
/* Elements that are not symbols are treated specially.  */
5839 if (! SYMBOLP (XCAR (val
)))
5841 idx
= XFASTINT (Fget (XCAR (val
), Qcoding_category_index
));
/* An index outside the known categories is treated specially.  */
5842 if (idx
>= CODING_CATEGORY_IDX_MAX
)
5844 coding_priorities
[i
++] = (1 << idx
);
5847 /* If coding-category-list is valid and contains all coding
5848 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
5849 the following code saves Emacs from crashing. */
5850 while (i
< CODING_CATEGORY_IDX_MAX
)
5851 coding_priorities
[i
++] = CODING_CATEGORY_MASK_RAW_TEXT
;
5859 /*** 9. Post-amble ***/
/* One-time initialization of the coding module's static tables and
   state.
   NOTE(review): the header of the enclosing function, its local
   declarations and its braces are not present in this listing.  */
/* Allocate the conversion buffer at its minimum size; the matching
   conversion_buffer_size is recorded further down.  */
5864 conversion_buffer
= (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE
);
5872 /* Emacs' internal format specific initialize routine. */
/* Byte classification for the emacs-mule handlers: 0x00..0x20 are
   control codes (0x0A and 0x0D get their own classes right after),
   0x21..0x7E are ASCII, 0x7F is a control code and 0x80..0xFE are
   marked invalid; the four LEADING_CODE_PRIVATE_* entries are then
   given leading-code classes.
   NOTE(review): index 0xFF is not assigned anywhere in this run --
   confirm that this is intended.  */
5873 for (i
= 0; i
<= 0x20; i
++)
5874 emacs_code_class
[i
] = EMACS_control_code
;
5875 emacs_code_class
[0x0A] = EMACS_linefeed_code
;
5876 emacs_code_class
[0x0D] = EMACS_carriage_return_code
;
5877 for (i
= 0x21 ; i
< 0x7F; i
++)
5878 emacs_code_class
[i
] = EMACS_ascii_code
;
5879 emacs_code_class
[0x7F] = EMACS_control_code
;
5880 for (i
= 0x80; i
< 0xFF; i
++)
5881 emacs_code_class
[i
] = EMACS_invalid_code
;
5882 emacs_code_class
[LEADING_CODE_PRIVATE_11
] = EMACS_leading_code_3
;
5883 emacs_code_class
[LEADING_CODE_PRIVATE_12
] = EMACS_leading_code_3
;
5884 emacs_code_class
[LEADING_CODE_PRIVATE_21
] = EMACS_leading_code_4
;
5885 emacs_code_class
[LEADING_CODE_PRIVATE_22
] = EMACS_leading_code_4
;
5887 /* ISO2022 specific initialize routine. */
/* The four loops deliberately skip 0x20, 0x7F, 0xA0 and 0xFF; those
   entries are assigned explicitly right after the loops.  The
   individual control codes that follow then replace the generic
   class written by the loops.  */
5888 for (i
= 0; i
< 0x20; i
++)
5889 iso_code_class
[i
] = ISO_control_0
;
5890 for (i
= 0x21; i
< 0x7F; i
++)
5891 iso_code_class
[i
] = ISO_graphic_plane_0
;
5892 for (i
= 0x80; i
< 0xA0; i
++)
5893 iso_code_class
[i
] = ISO_control_1
;
5894 for (i
= 0xA1; i
< 0xFF; i
++)
5895 iso_code_class
[i
] = ISO_graphic_plane_1
;
5896 iso_code_class
[0x20] = iso_code_class
[0x7F] = ISO_0x20_or_0x7F
;
5897 iso_code_class
[0xA0] = iso_code_class
[0xFF] = ISO_0xA0_or_0xFF
;
5898 iso_code_class
[ISO_CODE_CR
] = ISO_carriage_return
;
5899 iso_code_class
[ISO_CODE_SO
] = ISO_shift_out
;
5900 iso_code_class
[ISO_CODE_SI
] = ISO_shift_in
;
5901 iso_code_class
[ISO_CODE_SS2_7
] = ISO_single_shift_2_7
;
5902 iso_code_class
[ISO_CODE_ESC
] = ISO_escape
;
5903 iso_code_class
[ISO_CODE_SS2
] = ISO_single_shift_2
;
5904 iso_code_class
[ISO_CODE_SS3
] = ISO_single_shift_3
;
5905 iso_code_class
[ISO_CODE_CSI
] = ISO_control_sequence_introducer
;
/* Record the size of the buffer allocated at the top of this run.  */
5907 conversion_buffer_size
= MINIMUM_CONVERSION_BUFFER_SIZE
;
/* Give the four static coding structures a defined initial state by
   setting each of them up from nil.  */
5909 setup_coding_system (Qnil
, &keyboard_coding
);
5910 setup_coding_system (Qnil
, &terminal_coding
);
5911 setup_coding_system (Qnil
, &safe_terminal_coding
);
5912 setup_coding_system (Qnil
, &default_buffer_file_coding
);
/* Clear every per-category slot; `update-coding-systems-internal'
   allocates them on demand.  */
5914 bzero (coding_system_table
, sizeof coding_system_table
);
/* Zero the whole table, then flag the first 128 entries (0..127).  */
5916 bzero (ascii_skip_code
, sizeof ascii_skip_code
);
5917 for (i
= 0; i
< 128; i
++)
5918 ascii_skip_code
[i
] = 1;
/* CRLF on MSDOS/WINDOWSNT builds, LF otherwise.
   NOTE(review): only the #if line of this conditional is present in
   this listing; the #else and #endif lines are missing.  */
5920 #if defined (MSDOS) || defined (WINDOWSNT)
5921 system_eol_type
= CODING_EOL_CRLF
;
5923 system_eol_type
= CODING_EOL_LF
;
5926 inhibit_pre_post_conversion
= 0;
/* Intern and GC-protect the symbols used by
   `find-operation-coding-system', and attach a `target-idx' property
   to each supported I/O primitive.  The property is the zero-based
   position of the TARGET among the primitive's own arguments;
   Ffind_operation_coding_system reads it with Fget and fetches the
   target from args[target_idx + 1], since args[0] is OPERATION.
   NOTE(review): the header of the enclosing function is not present
   in this listing, and Qinsert_file_contents / Qwrite_region are not
   interned in this run -- presumably they are defined elsewhere.  */
5934 Qtarget_idx
= intern ("target-idx");
5935 staticpro (&Qtarget_idx
);
/* `coding-system-history' is also given an initial value of nil.  */
5937 Qcoding_system_history
= intern ("coding-system-history");
5938 staticpro (&Qcoding_system_history
);
5939 Fset (Qcoding_system_history
, Qnil
);
5941 /* Target FILENAME is the first argument. */
5942 Fput (Qinsert_file_contents
, Qtarget_idx
, make_number (0));
5943 /* Target FILENAME is the third argument. */
5944 Fput (Qwrite_region
, Qtarget_idx
, make_number (2));
5946 Qcall_process
= intern ("call-process");
5947 staticpro (&Qcall_process
);
5948 /* Target PROGRAM is the first argument. */
5949 Fput (Qcall_process
, Qtarget_idx
, make_number (0));
5951 Qcall_process_region
= intern ("call-process-region");
5952 staticpro (&Qcall_process_region
);
5953 /* Target PROGRAM is the third argument. */
5954 Fput (Qcall_process_region
, Qtarget_idx
, make_number (2));
5956 Qstart_process
= intern ("start-process");
5957 staticpro (&Qstart_process
);
5958 /* Target PROGRAM is the third argument. */
5959 Fput (Qstart_process
, Qtarget_idx
, make_number (2));
5961 Qopen_network_stream
= intern ("open-network-stream");
5962 staticpro (&Qopen_network_stream
);
5963 /* Target SERVICE is the fourth argument. */
5964 Fput (Qopen_network_stream
, Qtarget_idx
, make_number (3));
/* Intern the remaining symbols this file refers to from C and
   register each with staticpro.  The same intern/staticpro pair is
   repeated for every symbol; the few places that do more are
   commented individually.  */
5966 Qcoding_system
= intern ("coding-system");
5967 staticpro (&Qcoding_system
);
5969 Qeol_type
= intern ("eol-type");
5970 staticpro (&Qeol_type
);
5972 Qbuffer_file_coding_system
= intern ("buffer-file-coding-system");
5973 staticpro (&Qbuffer_file_coding_system
);
5975 Qpost_read_conversion
= intern ("post-read-conversion");
5976 staticpro (&Qpost_read_conversion
);
5978 Qpre_write_conversion
= intern ("pre-write-conversion");
5979 staticpro (&Qpre_write_conversion
);
5981 Qno_conversion
= intern ("no-conversion");
5982 staticpro (&Qno_conversion
);
5984 Qundecided
= intern ("undecided");
5985 staticpro (&Qundecided
);
5987 Qcoding_system_p
= intern ("coding-system-p");
5988 staticpro (&Qcoding_system_p
);
5990 Qcoding_system_error
= intern ("coding-system-error");
5991 staticpro (&Qcoding_system_error
);
/* Define `coding-system-error' as an error condition: its condition
   list is (coding-system-error error) and its message is
   "Invalid coding system".  */
5993 Fput (Qcoding_system_error
, Qerror_conditions
,
5994 Fcons (Qcoding_system_error
, Fcons (Qerror
, Qnil
)));
5995 Fput (Qcoding_system_error
, Qerror_message
,
5996 build_string ("Invalid coding system"));
5998 Qcoding_category
= intern ("coding-category");
5999 staticpro (&Qcoding_category
);
6000 Qcoding_category_index
= intern ("coding-category-index");
6001 staticpro (&Qcoding_category_index
);
/* Build the category table: a vector of CODING_CATEGORY_IDX_MAX
   entries whose I-th element is the symbol named
   coding_category_name[I]; each symbol also records its own index in
   its `coding-category-index' property, which
   `set-coding-priority-internal' reads back.  */
6003 Vcoding_category_table
6004 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX
), Qnil
);
6005 staticpro (&Vcoding_category_table
);
6008 for (i
= 0; i
< CODING_CATEGORY_IDX_MAX
; i
++)
6010 XVECTOR (Vcoding_category_table
)->contents
[i
]
6011 = intern (coding_category_name
[i
]);
6012 Fput (XVECTOR (Vcoding_category_table
)->contents
[i
],
6013 Qcoding_category_index
, make_number (i
));
/* `translation-table' additionally gets a `char-table-extra-slots'
   property of 1.  */
6017 Qtranslation_table
= intern ("translation-table");
6018 staticpro (&Qtranslation_table
);
6019 Fput (Qtranslation_table
, Qchar_table_extra_slots
, make_number (1));
6021 Qtranslation_table_id
= intern ("translation-table-id");
6022 staticpro (&Qtranslation_table_id
);
6024 Qtranslation_table_for_decode
= intern ("translation-table-for-decode");
6025 staticpro (&Qtranslation_table_for_decode
);
6027 Qtranslation_table_for_encode
= intern ("translation-table-for-encode");
6028 staticpro (&Qtranslation_table_for_encode
);
6030 Qsafe_charsets
= intern ("safe-charsets");
6031 staticpro (&Qsafe_charsets
);
6033 Qvalid_codes
= intern ("valid-codes");
6034 staticpro (&Qvalid_codes
);
6036 Qemacs_mule
= intern ("emacs-mule");
6037 staticpro (&Qemacs_mule
);
6039 Qraw_text
= intern ("raw-text");
6040 staticpro (&Qraw_text
);
/* Register every Lisp primitive of this file with defsubr, passing
   the address of the S... object that the corresponding DEFUN names
   as its second argument.  */
6042 defsubr (&Scoding_system_p
);
6043 defsubr (&Sread_coding_system
);
6044 defsubr (&Sread_non_nil_coding_system
);
6045 defsubr (&Scheck_coding_system
);
6046 defsubr (&Sdetect_coding_region
);
6047 defsubr (&Sdetect_coding_string
);
6048 defsubr (&Sdecode_coding_region
);
6049 defsubr (&Sencode_coding_region
);
6050 defsubr (&Sdecode_coding_string
);
6051 defsubr (&Sencode_coding_string
);
6052 defsubr (&Sdecode_sjis_char
);
6053 defsubr (&Sencode_sjis_char
);
6054 defsubr (&Sdecode_big5_char
);
6055 defsubr (&Sencode_big5_char
);
6056 defsubr (&Sset_terminal_coding_system_internal
);
6057 defsubr (&Sset_safe_terminal_coding_system_internal
);
6058 defsubr (&Sterminal_coding_system
);
6059 defsubr (&Sset_keyboard_coding_system_internal
);
6060 defsubr (&Skeyboard_coding_system
);
6061 defsubr (&Sfind_operation_coding_system
);
6062 defsubr (&Supdate_coding_systems_internal
);
6063 defsubr (&Sset_coding_priority_internal
);
/* Lisp-visible variables, part 1.  Each DEFVAR_LISP names the Lisp
   variable, the C variable behind it and its doc string; the
   assignment that follows gives the initial value.  */
6065 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list
,
6066 "List of coding systems.\n\
6068 Do not alter the value of this variable manually. This variable should be\n\
6069 updated by the functions `make-coding-system' and\n\
6070 `define-coding-system-alias'.");
6071 Vcoding_system_list
= Qnil
;
6073 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist
,
6074 "Alist of coding system names.\n\
6075 Each element is one element list of coding system name.\n\
6076 This variable is given to `completing-read' as TABLE argument.\n\
6078 Do not alter the value of this variable manually. This variable should be\n\
6079 updated by the functions `make-coding-system' and\n\
6080 `define-coding-system-alias'.");
6081 Vcoding_system_alist
= Qnil
;
6083 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list
,
6084 "List of coding-categories (symbols) ordered by priority.");
/* Initial value: all category symbols in table order.  The loop runs
   from the highest index down and conses onto the front, so the
   finished list starts with the symbol at index 0.  */
6088 Vcoding_category_list
= Qnil
;
6089 for (i
= CODING_CATEGORY_IDX_MAX
- 1; i
>= 0; i
--)
6090 Vcoding_category_list
6091 = Fcons (XVECTOR (Vcoding_category_table
)->contents
[i
],
6092 Vcoding_category_list
);
6095 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read
,
6096 "Specify the coding system for read operations.\n\
6097 It is useful to bind this variable with `let', but do not set it globally.\n\
6098 If the value is a coding system, it is used for decoding on read operation.\n\
6099 If not, an appropriate element is used from one of the coding system alists:\n\
6100 There are three such tables, `file-coding-system-alist',\n\
6101 `process-coding-system-alist', and `network-coding-system-alist'.");
6102 Vcoding_system_for_read
= Qnil
;
6104 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write
,
6105 "Specify the coding system for write operations.\n\
6106 Programs bind this variable with `let', but you should not set it globally.\n\
6107 If the value is a coding system, it is used for encoding of output,\n\
6108 when writing it to a file and when sending it to a file or subprocess.\n\
6110 If this does not specify a coding system, an appropriate element\n\
6111 is used from one of the coding system alists:\n\
6112 There are three such tables, `file-coding-system-alist',\n\
6113 `process-coding-system-alist', and `network-coding-system-alist'.\n\
6114 For output to files, if the above procedure does not specify a coding system,\n\
6115 the value of `buffer-file-coding-system' is used.");
6116 Vcoding_system_for_write
= Qnil
;
6118 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used
,
6119 "Coding system used in the latest file or process I/O.");
6120 Vlast_coding_system_used
= Qnil
;
/* Lisp-visible variables, part 2: the two boolean switches and the
   three alists searched by `find-operation-coding-system', plus
   `locale-coding-system'.  All start out as 0 / nil.
   NOTE(review): the closing line of the `inhibit-eol-conversion' doc
   string is not present in this listing.  */
6122 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion
,
6123 "*Non-nil means always inhibit code conversion of end-of-line format.\n\
6124 See info node `Coding Systems' and info node `Text and Binary' concerning\n\
6126 inhibit_eol_conversion
= 0;
6128 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system
,
6129 "Non-nil means process buffer inherits coding system of process output.\n\
6130 Bind it to t if the process output is to be treated as if it were a file\n\
6131 read from some filesystem.");
6132 inherit_process_coding_system
= 0;
6134 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist
,
6135 "Alist to decide a coding system to use for a file I/O operation.\n\
6136 The format is ((PATTERN . VAL) ...),\n\
6137 where PATTERN is a regular expression matching a file name,\n\
6138 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6139 If VAL is a coding system, it is used for both decoding and encoding\n\
6140 the file contents.\n\
6141 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6142 and the cdr part is used for encoding.\n\
6143 If VAL is a function symbol, the function must return a coding system\n\
6144 or a cons of coding systems which are used as above.\n\
6146 See also the function `find-operation-coding-system'\n\
6147 and the variable `auto-coding-alist'.");
6148 Vfile_coding_system_alist
= Qnil
;
6150 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist
,
6151 "Alist to decide a coding system to use for a process I/O operation.\n\
6152 The format is ((PATTERN . VAL) ...),\n\
6153 where PATTERN is a regular expression matching a program name,\n\
6154 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6155 If VAL is a coding system, it is used for both decoding what received\n\
6156 from the program and encoding what sent to the program.\n\
6157 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6158 and the cdr part is used for encoding.\n\
6159 If VAL is a function symbol, the function must return a coding system\n\
6160 or a cons of coding systems which are used as above.\n\
6162 See also the function `find-operation-coding-system'.");
6163 Vprocess_coding_system_alist
= Qnil
;
6165 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist
,
6166 "Alist to decide a coding system to use for a network I/O operation.\n\
6167 The format is ((PATTERN . VAL) ...),\n\
6168 where PATTERN is a regular expression matching a network service name\n\
6169 or is a port number to connect to,\n\
6170 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6171 If VAL is a coding system, it is used for both decoding what received\n\
6172 from the network stream and encoding what sent to the network stream.\n\
6173 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6174 and the cdr part is used for encoding.\n\
6175 If VAL is a function symbol, the function must return a coding system\n\
6176 or a cons of coding systems which are used as above.\n\
6178 See also the function `find-operation-coding-system'.");
6179 Vnetwork_coding_system_alist
= Qnil
;
6181 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system
,
6182 "Coding system to use with system messages.");
6183 Vlocale_coding_system
= Qnil
;
/* Lisp-visible variables, part 3: the mode-line mnemonics for each
   end-of-line format and the character-translation switches.  The
   defaults set here are ":" for both the LF and the undecided format,
   a single backslash for CRLF and "/" for CR.  */
6185 /* The eol mnemonics are reset in startup.el system-dependently. */
6186 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix
,
6187 "*String displayed in mode line for UNIX-like (LF) end-of-line format.");
6188 eol_mnemonic_unix
= build_string (":");
6190 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos
,
6191 "*String displayed in mode line for DOS-like (CRLF) end-of-line format.");
6192 eol_mnemonic_dos
= build_string ("\\");
6194 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac
,
6195 "*String displayed in mode line for MAC-like (CR) end-of-line format.");
6196 eol_mnemonic_mac
= build_string ("/");
6198 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided
,
6199 "*String displayed in mode line when end-of-line format is not yet determined.");
6200 eol_mnemonic_undecided
= build_string (":");
/* Character translation is enabled by default (initial value t).  */
6202 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation
,
6203 "*Non-nil enables character translation while encoding and decoding.");
6204 Venable_character_translation
= Qt
;
6206 DEFVAR_LISP ("standard-translation-table-for-decode",
6207 &Vstandard_translation_table_for_decode
,
6208 "Table for translating characters while decoding.");
6209 Vstandard_translation_table_for_decode
= Qnil
;
/* Lisp variable `standard-translation-table-for-encode', initially
   nil.  The doc string mirrors the one of the -for-decode variable.  */
6211 DEFVAR_LISP ("standard-translation-table-for-encode",
6212 &Vstandard_translation_table_for_encode
,
6213 "Table for translating characters while encoding.");
6214 Vstandard_translation_table_for_encode
= Qnil
;
/* Lisp variable `charset-revision-table', initially nil.  Note that
   the Lisp name says "table" while the C variable is named
   Vcharset_revision_alist; the Lisp name is part of the public
   interface and is left unchanged.  */
6216 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist
,
6217 "Alist of charsets vs revision numbers.\n\
6218 While encoding, if a charset (car part of an element) is found,\n\
6219 designate it with the escape sequence identifying revision (cdr part of the element).");
6220 Vcharset_revision_alist
= Qnil
;
/* Lisp-visible variables, part 4: process default, the Latin
   extra-code table and the safe-coding-system selection hook.  */
6222 DEFVAR_LISP ("default-process-coding-system",
6223 &Vdefault_process_coding_system
,
6224 "Cons of coding systems used for process I/O by default.\n\
6225 The car part is used for decoding a process output,\n\
6226 the cdr part is used for encoding a text to be sent to a process.");
6227 Vdefault_process_coding_system
= Qnil
;
6229 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table
,
6230 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
6231 This is a vector of length 256.\n\
6232 If Nth element is non-nil, the existence of code N in a file\n\
6233 \(or output of subprocess) doesn't prevent it to be detected as\n\
6234 a coding system of ISO 2022 variant which has a flag\n\
6235 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
6236 or reading output of a subprocess.\n\
6237 Only 128th through 159th elements has a meaning.");
/* Initial value: a 256-element vector with every element nil, which
   matches the length stated in the doc string.  */
6238 Vlatin_extra_code_table
= Fmake_vector (make_number (256), Qnil
);
6240 DEFVAR_LISP ("select-safe-coding-system-function",
6241 &Vselect_safe_coding_system_function
,
6242 "Function to call to select safe coding system for encoding a text.\n\
6244 If set, this function is called to force a user to select a proper\n\
6245 coding system which can encode the text in the case that a default\n\
6246 coding system used in each operation can't encode the text.\n\
6248 The default value is `select-safe-coding-system' (which see).");
/* The C-level initial value is nil; the default named in the doc
   string is presumably installed later from Lisp -- TODO confirm.  */
6249 Vselect_safe_coding_system_function
= Qnil
;
6254 emacs_strerror (error_number
)
6259 synchronize_system_messages_locale ();
6260 str
= strerror (error_number
);
6262 if (! NILP (Vlocale_coding_system
))
6264 Lisp_Object dec
= code_convert_string_norecord (build_string (str
),
6265 Vlocale_coding_system
,
6267 str
= (char *) XSTRING (dec
)->data
;