1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
5 This file is part of GNU Emacs.
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /*** TABLE OF CONTENTS ***
25 2. Emacs' internal format (emacs-mule) handlers
27 4. Shift-JIS and BIG5 handlers
29 6. End-of-line handlers
30 7. C library functions
31 8. Emacs Lisp library functions
36 /*** GENERAL NOTE on CODING SYSTEM ***
38 Coding system is an encoding mechanism of one or more character
39 sets. Here's a list of coding systems which Emacs can handle. When
40 we say "decode", it means converting some other coding system to
41 Emacs' internal format (emacs-mule), and when we say "encode",
42 it means converting the coding system emacs-mule to some other
45 0. Emacs' internal format (emacs-mule)
47 Emacs itself holds a multi-lingual character in a buffer and a string
48 in a special format. Details are described in section 2.
52 The most famous coding system for multiple character sets. X's
53 Compound Text, various EUCs (Extended Unix Code), and coding
54 systems used in Internet communication such as ISO-2022-JP are
55 all variants of ISO2022. Details are described in section 3.
57 2. SJIS (or Shift-JIS or MS-Kanji-Code)
59 A coding system to encode character sets: ASCII, JISX0201, and
60 JISX0208. Widely used for PC's in Japan. Details are described in
65 A coding system to encode character sets: ASCII and Big5. Widely
66 used by Chinese (mainly in Taiwan and Hong Kong). Details are
67 described in section 4. In this file, when we write "BIG5"
68 (all uppercase), we mean the coding system, and when we write
69 "Big5" (capitalized), we mean the character set.
73 A coding system for a text containing random 8-bit code. Emacs does
74 no code conversion on such a text except for end-of-line format.
78 If a user wants to read/write a text encoded in a coding system not
79 listed above, he can supply a decoder and an encoder for it in CCL
80 (Code Conversion Language) programs. Emacs executes the CCL program
81 while reading/writing.
83 Emacs represents a coding system by a Lisp symbol that has a property
84 `coding-system'. But, before actually using the coding system, the
85 information about it is set in a structure of type `struct
86 coding_system' for rapid processing. See section 6 for more details.
90 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
92 How end-of-line of a text is encoded depends on a system. For
93 instance, Unix's format is just one byte of `line-feed' code,
94 whereas DOS's format is two-byte sequence of `carriage-return' and
95 `line-feed' codes. MacOS's format is usually one byte of
98 Since text characters encoding and end-of-line encoding are
99 independent, any coding system described above can take
100 any format of end-of-line. So, Emacs has information of format of
101 end-of-line in each coding-system. See section 6 for more details.
105 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
107 These functions check if a text between SRC and SRC_END is encoded
108 in the coding system category XXX. Each returns an integer value in
109 which the appropriate flag bits for the category XXX are set. The flag
110 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
111 template of these functions. */
114 detect_coding_emacs_mule (src
, src_end
)
115 unsigned char *src
, *src_end
;
121 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
123 These functions decode SRC_BYTES length text at SOURCE encoded in
124 CODING to Emacs' internal format (emacs-mule). The resulting text
125 goes to a place pointed to by DESTINATION, the length of which
126 should not exceed DST_BYTES. These functions set the information of
127 original and decoded texts in the members produced, produced_char,
128 consumed, and consumed_char of the structure *CODING.
130 The return value is an integer (CODING_FINISH_XXX) indicating how
131 the decoding finished.
133 DST_BYTES zero means that source area and destination area are
134 overlapped, which means that we can produce a decoded text until it
135 reaches the head of the not-yet-decoded source text.
137 Below is a template of these functions. */
139 decode_coding_XXX (coding
, source
, destination
, src_bytes
, dst_bytes
)
140 struct coding_system
*coding
;
141 unsigned char *source
, *destination
;
142 int src_bytes
, dst_bytes
;
148 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
150 These functions encode SRC_BYTES length text at SOURCE of Emacs'
151 internal format (emacs-mule) to CODING. The resulting text goes to
152 a place pointed to by DESTINATION, the length of which should not
153 exceed DST_BYTES. These functions set the information of
154 original and encoded texts in the members produced, produced_char,
155 consumed, and consumed_char of the structure *CODING.
157 The return value is an integer (CODING_FINISH_XXX) indicating how
158 the encoding finished.
160 DST_BYTES zero means that source area and destination area are
161 overlapped, which means that we can produce an encoded text until it
162 reaches the head of the not-yet-encoded source text.
164 Below is a template of these functions. */
166 encode_coding_XXX (coding
, source
, destination
, src_bytes
, dst_bytes
)
167 struct coding_system
*coding
;
168 unsigned char *source
, *destination
;
169 int src_bytes
, dst_bytes
;
175 /*** COMMONLY USED MACROS ***/
177 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
178 THREE_MORE_BYTES safely get one, two, and three bytes from the
179 source text respectively. If there are not enough bytes in the
180 source, they jump to `label_end_of_loop'. The caller should set
181 variables `src' and `src_end' to appropriate areas in advance. */
183 #define ONE_MORE_BYTE(c1) \
188 goto label_end_of_loop; \
191 #define TWO_MORE_BYTES(c1, c2) \
193 if (src + 1 < src_end) \
194 c1 = *src++, c2 = *src++; \
196 goto label_end_of_loop; \
199 #define THREE_MORE_BYTES(c1, c2, c3) \
201 if (src + 2 < src_end) \
202 c1 = *src++, c2 = *src++, c3 = *src++; \
204 goto label_end_of_loop; \
207 /* The following three macros DECODE_CHARACTER_ASCII,
208 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
209 the multi-byte form of a character of each class at the place
210 pointed by `dst'. The caller should set the variable `dst' to
211 point to an appropriate area and the variable `coding' to point to
212 the coding-system of the currently decoding text in advance. */
214 /* Decode one ASCII character C. */
216 #define DECODE_CHARACTER_ASCII(c) \
218 if (COMPOSING_P (coding->composing)) \
220 *dst++ = 0xA0, *dst++ = (c) | 0x80; \
221 coding->composed_chars++; \
222 if (((c) | 0x80) < 0xA0) \
223 coding->fake_multibyte = 1; \
228 coding->produced_char++; \
230 coding->fake_multibyte = 1; \
234 /* Decode one DIMENSION1 character whose charset is CHARSET and whose
235 position-code is C. */
237 #define DECODE_CHARACTER_DIMENSION1(charset, c) \
239 unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset); \
240 if (COMPOSING_P (coding->composing)) \
242 *dst++ = leading_code + 0x20; \
243 coding->composed_chars++; \
247 *dst++ = leading_code; \
248 coding->produced_char++; \
250 if (leading_code = CHARSET_LEADING_CODE_EXT (charset)) \
251 *dst++ = leading_code; \
252 *dst++ = (c) | 0x80; \
253 if (((c) | 0x80) < 0xA0) \
254 coding->fake_multibyte = 1; \
257 /* Decode one DIMENSION2 character whose charset is CHARSET and whose
258 position-codes are C1 and C2. */
260 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2) \
262 DECODE_CHARACTER_DIMENSION1 (charset, c1); \
263 *dst++ = (c2) | 0x80; \
264 if (((c2) | 0x80) < 0xA0) \
265 coding->fake_multibyte = 1; \
269 /*** 1. Preamble ***/
283 #else /* not emacs */
287 #endif /* not emacs */
289 Lisp_Object Qcoding_system
, Qeol_type
;
290 Lisp_Object Qbuffer_file_coding_system
;
291 Lisp_Object Qpost_read_conversion
, Qpre_write_conversion
;
292 Lisp_Object Qno_conversion
, Qundecided
;
293 Lisp_Object Qcoding_system_history
;
294 Lisp_Object Qsafe_charsets
;
295 Lisp_Object Qvalid_codes
;
297 extern Lisp_Object Qinsert_file_contents
, Qwrite_region
;
298 Lisp_Object Qcall_process
, Qcall_process_region
, Qprocess_argument
;
299 Lisp_Object Qstart_process
, Qopen_network_stream
;
300 Lisp_Object Qtarget_idx
;
302 Lisp_Object Vselect_safe_coding_system_function
;
304 /* Mnemonic character of each format of end-of-line. */
305 int eol_mnemonic_unix
, eol_mnemonic_dos
, eol_mnemonic_mac
;
306 /* Mnemonic character to indicate format of end-of-line is not yet
308 int eol_mnemonic_undecided
;
310 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
311 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
316 Lisp_Object Vcoding_system_list
, Vcoding_system_alist
;
318 Lisp_Object Qcoding_system_p
, Qcoding_system_error
;
320 /* Coding system emacs-mule and raw-text are for converting only
321 end-of-line format. */
322 Lisp_Object Qemacs_mule
, Qraw_text
;
324 /* Coding-systems are handed between Emacs Lisp programs and C internal
325 routines by the following three variables. */
326 /* Coding-system for reading files and receiving data from process. */
327 Lisp_Object Vcoding_system_for_read
;
328 /* Coding-system for writing files and sending data to process. */
329 Lisp_Object Vcoding_system_for_write
;
330 /* Coding-system actually used in the latest I/O. */
331 Lisp_Object Vlast_coding_system_used
;
333 /* A vector of length 256 which contains information about special
334 Latin codes (especially for dealing with Microsoft codes). */
335 Lisp_Object Vlatin_extra_code_table
;
337 /* Flag to inhibit code conversion of end-of-line format. */
338 int inhibit_eol_conversion
;
340 /* Flag to make buffer-file-coding-system inherit from process-coding. */
341 int inherit_process_coding_system
;
343 /* Coding system to be used to encode text for terminal display. */
344 struct coding_system terminal_coding
;
346 /* Coding system to be used to encode text for terminal display when
347 terminal coding system is nil. */
348 struct coding_system safe_terminal_coding
;
350 /* Coding system of what is sent from terminal keyboard. */
351 struct coding_system keyboard_coding
;
353 /* Default coding system to be used to write a file. */
354 struct coding_system default_buffer_file_coding
;
356 Lisp_Object Vfile_coding_system_alist
;
357 Lisp_Object Vprocess_coding_system_alist
;
358 Lisp_Object Vnetwork_coding_system_alist
;
362 Lisp_Object Qcoding_category
, Qcoding_category_index
;
364 /* List of symbols `coding-category-xxx' ordered by priority. */
365 Lisp_Object Vcoding_category_list
;
367 /* Table of coding categories (Lisp symbols). */
368 Lisp_Object Vcoding_category_table
;
370 /* Table of names of symbol for each coding-category. */
371 char *coding_category_name
[CODING_CATEGORY_IDX_MAX
] = {
372 "coding-category-emacs-mule",
373 "coding-category-sjis",
374 "coding-category-iso-7",
375 "coding-category-iso-7-tight",
376 "coding-category-iso-8-1",
377 "coding-category-iso-8-2",
378 "coding-category-iso-7-else",
379 "coding-category-iso-8-else",
380 "coding-category-ccl",
381 "coding-category-big5",
382 "coding-category-raw-text",
383 "coding-category-binary"
386 /* Table of pointers to coding systems corresponding to each coding
388 struct coding_system
*coding_system_table
[CODING_CATEGORY_IDX_MAX
];
390 /* Table of coding category masks. Nth element is a mask for a coding
391 category whose priority is Nth. */
393 int coding_priorities
[CODING_CATEGORY_IDX_MAX
];
395 /* Flag to tell if we look up translation table on character code
397 Lisp_Object Venable_character_translation
;
398 /* Standard translation table to look up on decoding (reading). */
399 Lisp_Object Vstandard_translation_table_for_decode
;
400 /* Standard translation table to look up on encoding (writing). */
401 Lisp_Object Vstandard_translation_table_for_encode
;
403 Lisp_Object Qtranslation_table
;
404 Lisp_Object Qtranslation_table_id
;
405 Lisp_Object Qtranslation_table_for_decode
;
406 Lisp_Object Qtranslation_table_for_encode
;
408 /* Alist of charsets vs revision number. */
409 Lisp_Object Vcharset_revision_alist
;
411 /* Default coding systems used for process I/O. */
412 Lisp_Object Vdefault_process_coding_system
;
415 /*** 2. Emacs internal format (emacs-mule) handlers ***/
417 /* Emacs' internal format for encoding multiple character sets is a
418 kind of multi-byte encoding, i.e. characters are encoded by
419 variable-length sequences of one-byte codes. ASCII characters
420 and control characters (e.g. `tab', `newline') are represented by
421 one-byte sequences which are their ASCII codes, in the range 0x00
422 through 0x7F. The other characters are represented by a sequence
423 of `base leading-code', optional `extended leading-code', and one
424 or two `position-code's. The length of the sequence is determined
425 by the base leading-code. Leading-code takes the range 0x80
426 through 0x9F, whereas extended leading-code and position-code take
427 the range 0xA0 through 0xFF. See `charset.h' for more details
428 about leading-code and position-code.
430 There's one exception to this rule. Special leading-code
431 `leading-code-composition' denotes that the following several
432 characters should be composed into one character. Leading-codes of
433 components (except for ASCII) are added 0x20. An ASCII character
434 component is represented by a 2-byte sequence of `0xA0' and
435 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
436 details of composite character. Hence, we can summarize the code
439 --- CODE RANGE of Emacs' internal format ---
440 (character set) (range)
442 ELSE (1st byte) 0x80 .. 0x9F
443 (rest bytes) 0xA0 .. 0xFF
444 ---------------------------------------------
448 enum emacs_code_class_type emacs_code_class
[256];
450 /* Go to the next statement only if *SRC is accessible and the code is
451 not less than 0xA0 (i.e. in the range 0xA0..0xFF). */
452 #define CHECK_CODE_RANGE_A0_FF \
454 if (src >= src_end) \
455 goto label_end_of_switch; \
456 else if (*src++ < 0xA0) \
460 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
461 Check if a text is encoded in Emacs' internal format. If it is,
462 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
465 detect_coding_emacs_mule (src
, src_end
)
466 unsigned char *src
, *src_end
;
471 while (src
< src_end
)
483 switch (emacs_code_class
[c
])
485 case EMACS_ascii_code
:
486 case EMACS_linefeed_code
:
489 case EMACS_control_code
:
490 if (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
)
494 case EMACS_invalid_code
:
497 case EMACS_leading_code_composition
: /* c == 0x80 */
499 CHECK_CODE_RANGE_A0_FF
;
504 case EMACS_leading_code_4
:
505 CHECK_CODE_RANGE_A0_FF
;
506 /* fall down to check it two more times ... */
508 case EMACS_leading_code_3
:
509 CHECK_CODE_RANGE_A0_FF
;
510 /* fall down to check it one more time ... */
512 case EMACS_leading_code_2
:
513 CHECK_CODE_RANGE_A0_FF
;
521 return CODING_CATEGORY_MASK_EMACS_MULE
;
525 /*** 3. ISO2022 handlers ***/
527 /* The following note describes the coding system ISO2022 briefly.
528 Since the intention of this note is to help in understanding of
529 the programs in this file, some parts are NOT ACCURATE or OVERLY
530 SIMPLIFIED. For the thorough understanding, please refer to the
531 original document of ISO2022.
533 ISO2022 provides many mechanisms to encode several character sets
534 in 7-bit and 8-bit environment. If one chooses 7-bit environment,
535 all text is encoded by codes of less than 128. This may make the
536 encoded text a little bit longer, but the text gets more stability
537 to pass through several gateways (some of them strip off the MSB).
539 There are two kinds of character set: control character set and
540 graphic character set. The former contains control characters such
541 as `newline' and `escape' to provide control functions (control
542 functions are provided also by escape sequences). The latter
543 contains graphic characters such as 'A' and '-'. Emacs recognizes
544 two control character sets and many graphic character sets.
546 Graphic character sets are classified into one of the following
547 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
548 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
549 bytes (DIMENSION) and the number of characters in one dimension
550 (CHARS) of the set. In addition, each character set is assigned an
551 identification tag (called "final character" and denoted as <F>
552 here after) which is unique in each class. <F> of each character
553 set is decided by ECMA(*) when it is registered in ISO. Code range
554 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
556 Note (*): ECMA = European Computer Manufacturers Association
558 Here are examples of graphic character set [NAME(<F>)]:
559 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
560 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
561 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
562 o DIMENSION2_CHARS96 -- none for the moment
564 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
565 C0 [0x00..0x1F] -- control character plane 0
566 GL [0x20..0x7F] -- graphic character plane 0
567 C1 [0x80..0x9F] -- control character plane 1
568 GR [0xA0..0xFF] -- graphic character plane 1
570 A control character set is directly designated and invoked to C0 or
571 C1 by an escape sequence. The most common case is that ISO646's
572 control character set is designated/invoked to C0 and ISO6429's
573 control character set is designated/invoked to C1, and usually
574 these designations/invocations are omitted in a coded text. With
575 7-bit environment, only C0 can be used, and a control character for
576 C1 is encoded by an appropriate escape sequence to fit in the
577 environment. Every C1 control character has a corresponding
578 escape sequence defined for it.
580 A graphic character set is at first designated to one of four
581 graphic registers (G0 through G3), then these graphic registers are
582 invoked to GL or GR. These designations and invocations can be
583 done independently. The most common case is that G0 is invoked to
584 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
585 these invocations and designations are omitted in a coded text.
586 With 7-bit environment, only GL can be used.
588 When a graphic character set of CHARS94 is invoked to GL, code 0x20
589 and 0x7F of GL area work as control characters SPACE and DEL
590 respectively, and code 0xA0 and 0xFF of GR area should not be used.
592 There are two ways of invocation: locking-shift and single-shift.
593 With locking-shift, the invocation lasts until the next different
594 invocation, whereas with single-shift, the invocation works only
595 for the following character and doesn't affect locking-shift.
596 Invocations are done by the following control characters or escape
599 ----------------------------------------------------------------------
600 function control char escape sequence description
601 ----------------------------------------------------------------------
602 SI (shift-in) 0x0F none invoke G0 to GL
603 SO (shift-out) 0x0E none invoke G1 to GL
604 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
605 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
606 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
607 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
608 ----------------------------------------------------------------------
609 The first four are for locking-shift. Control characters for these
610 functions are defined by macros ISO_CODE_XXX in `coding.h'.
612 Designations are done by the following escape sequences.
613 ----------------------------------------------------------------------
614 escape sequence description
615 ----------------------------------------------------------------------
616 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
617 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
618 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
619 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
620 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
621 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
622 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
623 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
624 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
625 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
626 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
627 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
628 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
629 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
630 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
631 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
632 ----------------------------------------------------------------------
634 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
635 of dimension 1, chars 94, and final character <F>, and etc.
637 Note (*): Although these designations are not allowed in ISO2022,
638 Emacs accepts them on decoding, and produces them on encoding
639 CHARS96 character set in a coding system which is characterized as
640 7-bit environment, non-locking-shift, and non-single-shift.
642 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
643 '(' can be omitted. We call this "short-form" hereafter.
645 Now you may notice that there are a lot of ways for encoding the
646 same multilingual text in ISO2022. Actually, there exist many
647 coding systems such as Compound Text (used in X's inter-client
648 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
649 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
650 localized platforms), and all of these are variants of ISO2022.
652 In addition to the above, Emacs handles two more kinds of escape
653 sequences: ISO6429's direction specification and Emacs' private
654 sequence for specifying character composition.
656 ISO6429's direction specification takes the following format:
657 o CSI ']' -- end of the current direction
658 o CSI '0' ']' -- end of the current direction
659 o CSI '1' ']' -- start of left-to-right text
660 o CSI '2' ']' -- start of right-to-left text
661 The control character CSI (0x9B: control sequence introducer) is
662 abbreviated to the escape sequence ESC '[' in 7-bit environment.
664 Character composition specification takes the following format:
665 o ESC '0' -- start character composition
666 o ESC '1' -- end character composition
667 Since these are not standard escape sequences of any ISO, the use
668 of them for these meaning is restricted to Emacs only. */
670 enum iso_code_class_type iso_code_class
[256];
672 #define CHARSET_OK(idx, charset) \
673 (coding_system_table[idx] \
674 && (coding_system_table[idx]->safe_charsets[charset] \
675 || (CODING_SPEC_ISO_REQUESTED_DESIGNATION \
676 (coding_system_table[idx], charset) \
677 != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)))
679 #define SHIFT_OUT_OK(idx) \
680 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
682 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
683 Check if a text is encoded in ISO2022. If it is, return an
684 integer in which any of the following flag bits:
685 CODING_CATEGORY_MASK_ISO_7
686 CODING_CATEGORY_MASK_ISO_7_TIGHT
687 CODING_CATEGORY_MASK_ISO_8_1
688 CODING_CATEGORY_MASK_ISO_8_2
689 CODING_CATEGORY_MASK_ISO_7_ELSE
690 CODING_CATEGORY_MASK_ISO_8_ELSE
691 are set. If a code which should never appear in ISO2022 is found,
695 detect_coding_iso2022 (src
, src_end
)
696 unsigned char *src
, *src_end
;
698 int mask
= CODING_CATEGORY_MASK_ISO
;
700 int reg
[4], shift_out
= 0, single_shifting
= 0;
701 int c
, c1
, i
, charset
;
703 reg
[0] = CHARSET_ASCII
, reg
[1] = reg
[2] = reg
[3] = -1;
704 while (mask
&& src
< src_end
)
714 if (c
>= '(' && c
<= '/')
716 /* Designation sequence for a charset of dimension 1. */
720 if (c1
< ' ' || c1
>= 0x80
721 || (charset
= iso_charset_table
[0][c
>= ','][c1
]) < 0)
722 /* Invalid designation sequence. Just ignore. */
724 reg
[(c
- '(') % 4] = charset
;
728 /* Designation sequence for a charset of dimension 2. */
732 if (c
>= '@' && c
<= 'B')
733 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
734 reg
[0] = charset
= iso_charset_table
[1][0][c
];
735 else if (c
>= '(' && c
<= '/')
740 if (c1
< ' ' || c1
>= 0x80
741 || (charset
= iso_charset_table
[1][c
>= ','][c1
]) < 0)
742 /* Invalid designation sequence. Just ignore. */
744 reg
[(c
- '(') % 4] = charset
;
747 /* Invalid designation sequence. Just ignore. */
750 else if (c
== 'N' || c
== 'O')
752 /* ESC <Fe> for SS2 or SS3. */
753 mask
&= CODING_CATEGORY_MASK_ISO_7_ELSE
;
756 else if (c
== '0' || c
== '1' || c
== '2')
757 /* ESC <Fp> for start/end composition. Just ignore. */
760 /* Invalid escape sequence. Just ignore. */
763 /* We found a valid designation sequence for CHARSET. */
764 mask
&= ~CODING_CATEGORY_MASK_ISO_8BIT
;
765 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7
, charset
))
766 mask_found
|= CODING_CATEGORY_MASK_ISO_7
;
768 mask
&= ~CODING_CATEGORY_MASK_ISO_7
;
769 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT
, charset
))
770 mask_found
|= CODING_CATEGORY_MASK_ISO_7_TIGHT
;
772 mask
&= ~CODING_CATEGORY_MASK_ISO_7_TIGHT
;
773 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE
, charset
))
774 mask_found
|= CODING_CATEGORY_MASK_ISO_7_ELSE
;
776 mask
&= ~CODING_CATEGORY_MASK_ISO_7_ELSE
;
777 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE
, charset
))
778 mask_found
|= CODING_CATEGORY_MASK_ISO_8_ELSE
;
780 mask
&= ~CODING_CATEGORY_MASK_ISO_8_ELSE
;
787 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE
)
788 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE
)))
790 /* Locking shift out. */
791 mask
&= ~CODING_CATEGORY_MASK_ISO_7BIT
;
792 mask_found
|= CODING_CATEGORY_MASK_ISO_SHIFT
;
800 /* Locking shift in. */
801 mask
&= ~CODING_CATEGORY_MASK_ISO_7BIT
;
802 mask_found
|= CODING_CATEGORY_MASK_ISO_SHIFT
;
811 int newmask
= CODING_CATEGORY_MASK_ISO_8_ELSE
;
813 if (c
!= ISO_CODE_CSI
)
815 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_1
]->flags
816 & CODING_FLAG_ISO_SINGLE_SHIFT
)
817 newmask
|= CODING_CATEGORY_MASK_ISO_8_1
;
818 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_2
]->flags
819 & CODING_FLAG_ISO_SINGLE_SHIFT
)
820 newmask
|= CODING_CATEGORY_MASK_ISO_8_2
;
823 if (VECTORP (Vlatin_extra_code_table
)
824 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
826 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_1
]->flags
827 & CODING_FLAG_ISO_LATIN_EXTRA
)
828 newmask
|= CODING_CATEGORY_MASK_ISO_8_1
;
829 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_2
]->flags
830 & CODING_FLAG_ISO_LATIN_EXTRA
)
831 newmask
|= CODING_CATEGORY_MASK_ISO_8_2
;
834 mask_found
|= newmask
;
847 if (VECTORP (Vlatin_extra_code_table
)
848 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
852 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_1
]->flags
853 & CODING_FLAG_ISO_LATIN_EXTRA
)
854 newmask
|= CODING_CATEGORY_MASK_ISO_8_1
;
855 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_2
]->flags
856 & CODING_FLAG_ISO_LATIN_EXTRA
)
857 newmask
|= CODING_CATEGORY_MASK_ISO_8_2
;
859 mask_found
|= newmask
;
866 unsigned char *src_begin
= src
;
868 mask
&= ~(CODING_CATEGORY_MASK_ISO_7BIT
869 | CODING_CATEGORY_MASK_ISO_7_ELSE
);
870 mask_found
|= CODING_CATEGORY_MASK_ISO_8_1
;
871 /* Check the length of succeeding codes of the range
872 0xA0..0xFF. If the byte length is odd, we exclude
873 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
874 when we are not single shifting. */
875 if (!single_shifting
)
877 while (src
< src_end
&& *src
>= 0xA0)
879 if ((src
- src_begin
- 1) & 1 && src
< src_end
)
880 mask
&= ~CODING_CATEGORY_MASK_ISO_8_2
;
882 mask_found
|= CODING_CATEGORY_MASK_ISO_8_2
;
889 return (mask
& mask_found
);
892 /* Decode a character of which charset is CHARSET and the 1st position
893 code is C1. If dimension of CHARSET is 2, the 2nd position code is
894 fetched from SRC and set to C2. If CHARSET is negative, it means
895 that we are decoding ill formed text, and what we can do is just to
898 #define DECODE_ISO_CHARACTER(charset, c1) \
900 int c_alt, charset_alt = (charset); \
901 if (COMPOSING_HEAD_P (coding->composing)) \
903 *dst++ = LEADING_CODE_COMPOSITION; \
904 if (COMPOSING_WITH_RULE_P (coding->composing)) \
905 /* To tell composition rules are embedded. */ \
907 coding->composing += 2; \
909 if (charset_alt >= 0) \
911 if (CHARSET_DIMENSION (charset_alt) == 2) \
913 ONE_MORE_BYTE (c2); \
914 if (iso_code_class[(c2) & 0x7F] != ISO_0x20_or_0x7F \
915 && iso_code_class[(c2) & 0x7F] != ISO_graphic_plane_0) \
918 charset_alt = CHARSET_ASCII; \
921 if (!NILP (translation_table) \
922 && ((c_alt = translate_char (translation_table, \
923 -1, charset_alt, c1, c2)) >= 0)) \
924 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
926 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
927 DECODE_CHARACTER_ASCII (c1); \
928 else if (CHARSET_DIMENSION (charset_alt) == 1) \
929 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
931 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
932 if (COMPOSING_WITH_RULE_P (coding->composing)) \
933 /* To tell a composition rule follows. */ \
934 coding->composing = COMPOSING_WITH_RULE_RULE; \
937 /* Set designation state into CODING. */
938 #define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
940 int charset = ISO_CHARSET_TABLE (make_number (dimension), \
941 make_number (chars), \
942 make_number (final_char)); \
944 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
945 || coding->safe_charsets[charset])) \
947 if (coding->spec.iso2022.last_invalid_designation_register == 0 \
949 && charset == CHARSET_ASCII) \
951 /* We should insert this designation sequence as is so \
952 that it is surely written back to a file. */ \
953 coding->spec.iso2022.last_invalid_designation_register = -1; \
954 goto label_invalid_code; \
956 coding->spec.iso2022.last_invalid_designation_register = -1; \
957 if ((coding->mode & CODING_MODE_DIRECTION) \
958 && CHARSET_REVERSE_CHARSET (charset) >= 0) \
959 charset = CHARSET_REVERSE_CHARSET (charset); \
960 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
964 coding->spec.iso2022.last_invalid_designation_register = reg; \
965 goto label_invalid_code; \
969 /* Return 0 if there's a valid composing sequence starting at SRC and
970 ending before SRC_END, else return -1. */
/* CODING supplies the flags, the safe_charsets table and the requested
   designations that each designation sequence found inside the
   composing sequence is checked against.  */
973 check_composing_code (coding
, src
, src_end
)
974 struct coding_system
*coding
;
975 unsigned char *src
, *src_end
;
977 int charset
, c
, c1
, dim
;
979 while (src
< src_end
)
984 if (c
!= ISO_CODE_ESC
|| src
>= src_end
)
987 if (c
== '1') /* end of composition */
/* The designation flag must be tested as a whole.  Unary `!' binds
   tighter than binary `&', so without the inner parentheses this
   would evaluate `(!coding->flags) & CODING_FLAG_ISO_DESIGNATION',
   which does not test the flag at all.  */
989 if (src
+ 2 >= src_end
990 || !(coding
->flags
& CODING_FLAG_ISO_DESIGNATION)
)
/* Short form: a final character in '@'..'B' stands for an omitted
   intermediate character '(', so SRC is not advanced in that case.  */
995 c
= (*src
>= '@' && *src
<= 'B') ? '(' : *src
++;
996 if (c
>= '(' && c
<= '/')
/* The condition below holds when C1 is outside 0x20..0x7F, when
   iso_charset_table has no charset for it, when that charset is not
   in CODING's safe_charsets, or when CODING has no requested
   designation for it.  */
999 if ((c1
< ' ' || c1
>= 0x80)
1000 || (charset
= iso_charset_table
[dim
][c
>= ','][c1
]) < 0
1001 || ! coding
->safe_charsets
[charset
]
1002 || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
1003 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
))
1010 /* We have not found the sequence "ESC 1". */
1014 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1017 decode_coding_iso2022 (coding
, source
, destination
, src_bytes
, dst_bytes
)
1018 struct coding_system
*coding
;
1019 unsigned char *source
, *destination
;
1020 int src_bytes
, dst_bytes
;
1022 unsigned char *src
= source
;
1023 unsigned char *src_end
= source
+ src_bytes
;
1024 unsigned char *dst
= destination
;
1025 unsigned char *dst_end
= destination
+ dst_bytes
;
1026 /* Since the maximum bytes produced by each loop is 7, we subtract 6
1027 from DST_END to assure that overflow checking is necessary only
1028 at the head of loop. */
1029 unsigned char *adjusted_dst_end
= dst_end
- 6;
1031 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1032 int charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1033 int charset1
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 1);
1034 Lisp_Object translation_table
1035 = coding
->translation_table_for_decode
;
1036 int result
= CODING_FINISH_NORMAL
;
1038 if (!NILP (Venable_character_translation
) && NILP (translation_table
))
1039 translation_table
= Vstandard_translation_table_for_decode
;
1041 coding
->produced_char
= 0;
1042 coding
->composed_chars
= 0;
1043 coding
->fake_multibyte
= 0;
1044 while (src
< src_end
&& (dst_bytes
1045 ? (dst
< adjusted_dst_end
)
1048 /* SRC_BASE remembers the start position in source in each loop.
1049 The loop will be exited when there's not enough source text
1050 to analyze long escape sequence or 2-byte code (within macros
1051 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
1052 to SRC_BASE before exiting. */
1053 unsigned char *src_base
= src
;
1054 int c1
= *src
++, c2
;
1056 switch (iso_code_class
[c1
])
1058 case ISO_0x20_or_0x7F
:
1059 if (!coding
->composing
1060 && (charset0
< 0 || CHARSET_CHARS (charset0
) == 94))
1062 /* This is SPACE or DEL. */
1064 coding
->produced_char
++;
1067 /* This is a graphic character, we fall through ... */
1069 case ISO_graphic_plane_0
:
1070 if (coding
->composing
== COMPOSING_WITH_RULE_RULE
)
1072 /* This is a composition rule. */
1074 coding
->composing
= COMPOSING_WITH_RULE_TAIL
;
1077 DECODE_ISO_CHARACTER (charset0
, c1
);
1080 case ISO_0xA0_or_0xFF
:
1081 if (charset1
< 0 || CHARSET_CHARS (charset1
) == 94
1082 || coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
)
1083 goto label_invalid_code
;
1084 /* This is a graphic character, we fall through ... */
1086 case ISO_graphic_plane_1
:
1087 if (coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
)
1088 goto label_invalid_code
;
1090 DECODE_ISO_CHARACTER (charset1
, c1
);
1093 case ISO_control_code
:
1094 /* All ISO2022 control characters in this class have the
1095 same representation in Emacs internal format. */
1097 && (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
1098 && (coding
->eol_type
== CODING_EOL_CR
1099 || coding
->eol_type
== CODING_EOL_CRLF
))
1101 result
= CODING_FINISH_INCONSISTENT_EOL
;
1102 goto label_end_of_loop_2
;
1105 coding
->produced_char
++;
1107 coding
->fake_multibyte
= 1;
1110 case ISO_carriage_return
:
1111 if (coding
->eol_type
== CODING_EOL_CR
)
1113 else if (coding
->eol_type
== CODING_EOL_CRLF
)
1116 if (c1
== ISO_CODE_LF
)
1120 if (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
1122 result
= CODING_FINISH_INCONSISTENT_EOL
;
1123 goto label_end_of_loop_2
;
1131 coding
->produced_char
++;
1135 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
)
1136 || CODING_SPEC_ISO_DESIGNATION (coding
, 1) < 0)
1137 goto label_invalid_code
;
1138 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 1;
1139 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1143 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
))
1144 goto label_invalid_code
;
1145 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 0;
1146 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1149 case ISO_single_shift_2_7
:
1150 case ISO_single_shift_2
:
1151 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
))
1152 goto label_invalid_code
;
1153 /* SS2 is handled as an escape sequence of ESC 'N' */
1155 goto label_escape_sequence
;
1157 case ISO_single_shift_3
:
1158 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
))
1159 goto label_invalid_code
;
1160 /* SS3 is handled as an escape sequence of ESC 'O' */
1162 goto label_escape_sequence
;
1164 case ISO_control_sequence_introducer
:
1165 /* CSI is handled as an escape sequence of ESC '[' ... */
1167 goto label_escape_sequence
;
1171 label_escape_sequence
:
1172 /* Escape sequences handled by Emacs are invocation,
1173 designation, direction specification, and character
1174 composition specification. */
1177 case '&': /* revision of following character set */
1179 if (!(c1
>= '@' && c1
<= '~'))
1180 goto label_invalid_code
;
1182 if (c1
!= ISO_CODE_ESC
)
1183 goto label_invalid_code
;
1185 goto label_escape_sequence
;
1187 case '$': /* designation of 2-byte character set */
1188 if (! (coding
->flags
& CODING_FLAG_ISO_DESIGNATION
))
1189 goto label_invalid_code
;
1191 if (c1
>= '@' && c1
<= 'B')
1192 { /* designation of JISX0208.1978, GB2312.1980,
1194 DECODE_DESIGNATION (0, 2, 94, c1
);
1196 else if (c1
>= 0x28 && c1
<= 0x2B)
1197 { /* designation of DIMENSION2_CHARS94 character set */
1199 DECODE_DESIGNATION (c1
- 0x28, 2, 94, c2
);
1201 else if (c1
>= 0x2C && c1
<= 0x2F)
1202 { /* designation of DIMENSION2_CHARS96 character set */
1204 DECODE_DESIGNATION (c1
- 0x2C, 2, 96, c2
);
1207 goto label_invalid_code
;
1210 case 'n': /* invocation of locking-shift-2 */
1211 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
)
1212 || CODING_SPEC_ISO_DESIGNATION (coding
, 2) < 0)
1213 goto label_invalid_code
;
1214 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 2;
1215 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1218 case 'o': /* invocation of locking-shift-3 */
1219 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
)
1220 || CODING_SPEC_ISO_DESIGNATION (coding
, 3) < 0)
1221 goto label_invalid_code
;
1222 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 3;
1223 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1226 case 'N': /* invocation of single-shift-2 */
1227 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1228 || CODING_SPEC_ISO_DESIGNATION (coding
, 2) < 0)
1229 goto label_invalid_code
;
1231 charset
= CODING_SPEC_ISO_DESIGNATION (coding
, 2);
1232 DECODE_ISO_CHARACTER (charset
, c1
);
1235 case 'O': /* invocation of single-shift-3 */
1236 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1237 || CODING_SPEC_ISO_DESIGNATION (coding
, 3) < 0)
1238 goto label_invalid_code
;
1240 charset
= CODING_SPEC_ISO_DESIGNATION (coding
, 3);
1241 DECODE_ISO_CHARACTER (charset
, c1
);
1244 case '0': case '2': /* start composing */
1245 /* Before processing composing, we must be sure that all
1246 characters being composed are supported by CODING.
1247 If not, we must give up composing. */
1248 if (check_composing_code (coding
, src
, src_end
) == 0)
1250 /* We are looking at a valid composition sequence. */
1251 coding
->composing
= (c1
== '0'
1252 ? COMPOSING_NO_RULE_HEAD
1253 : COMPOSING_WITH_RULE_HEAD
);
1254 coding
->composed_chars
= 0;
1258 *dst
++ = ISO_CODE_ESC
;
1260 coding
->produced_char
+= 2;
1264 case '1': /* end composing */
1265 if (!coding
->composing
)
1267 *dst
++ = ISO_CODE_ESC
;
1269 coding
->produced_char
+= 2;
1273 if (coding
->composed_chars
> 0)
1275 if (coding
->composed_chars
== 1)
1277 unsigned char *this_char_start
= dst
;
1280 /* Only one character is in the composing
1281 sequence. Make it a normal character. */
1282 while (*--this_char_start
!= LEADING_CODE_COMPOSITION
);
1283 dst
= (this_char_start
1284 + (coding
->composing
== COMPOSING_NO_RULE_TAIL
1289 this_bytes
= BYTES_BY_CHAR_HEAD (*dst
);
1290 while (this_bytes
--) *this_char_start
++ = *dst
++;
1291 dst
= this_char_start
;
1293 coding
->produced_char
++;
1295 coding
->composing
= COMPOSING_NO
;
1298 case '[': /* specification of direction */
1299 if (coding
->flags
& CODING_FLAG_ISO_NO_DIRECTION
)
1300 goto label_invalid_code
;
1301 /* For the moment, nested direction is not supported.
1302 So, `coding->mode & CODING_MODE_DIRECTION' zero means
1303 left-to-right, and nonzero means right-to-left. */
1307 case ']': /* end of the current direction */
1308 coding
->mode
&= ~CODING_MODE_DIRECTION
;
1310 case '0': /* end of the current direction */
1311 case '1': /* start of left-to-right direction */
1314 coding
->mode
&= ~CODING_MODE_DIRECTION
;
1316 goto label_invalid_code
;
1319 case '2': /* start of right-to-left direction */
1322 coding
->mode
|= CODING_MODE_DIRECTION
;
1324 goto label_invalid_code
;
1328 goto label_invalid_code
;
1333 if (! (coding
->flags
& CODING_FLAG_ISO_DESIGNATION
))
1334 goto label_invalid_code
;
1335 if (c1
>= 0x28 && c1
<= 0x2B)
1336 { /* designation of DIMENSION1_CHARS94 character set */
1338 DECODE_DESIGNATION (c1
- 0x28, 1, 94, c2
);
1340 else if (c1
>= 0x2C && c1
<= 0x2F)
1341 { /* designation of DIMENSION1_CHARS96 character set */
1343 DECODE_DESIGNATION (c1
- 0x2C, 1, 96, c2
);
1347 goto label_invalid_code
;
1350 /* We must update these variables now. */
1351 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1352 charset1
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 1);
1356 while (src_base
< src
)
1357 *dst
++ = *src_base
++;
1358 coding
->fake_multibyte
= 1;
1363 result
= CODING_FINISH_INSUFFICIENT_SRC
;
1364 label_end_of_loop_2
:
1371 if (result
== CODING_FINISH_NORMAL
)
1372 result
= CODING_FINISH_INSUFFICIENT_DST
;
1373 else if (result
!= CODING_FINISH_INCONSISTENT_EOL
1374 && coding
->mode
& CODING_MODE_LAST_BLOCK
)
1376 /* This is the last block of the text to be decoded. We had
1377 better just flush out all remaining codes in the text
1378 although they are not valid characters. */
1379 src_bytes
= src_end
- src
;
1380 if (dst_bytes
&& (dst_end
- dst
< src_bytes
))
1381 src_bytes
= dst_end
- dst
;
1382 bcopy (src
, dst
, src_bytes
);
1385 coding
->fake_multibyte
= 1;
1389 coding
->consumed
= coding
->consumed_char
= src
- source
;
1390 coding
->produced
= dst
- destination
;
1394 /* ISO2022 encoding stuff. */
1397 It is not enough to say just "ISO2022" on encoding, we have to
1398 specify more details. In Emacs, each coding system of ISO2022
1399 variant has the following specifications:
1400 1. Initial designation to G0 thru G3.
1401 2. Allows short-form designation?
1402 3. ASCII should be designated to G0 before control characters?
1403 4. ASCII should be designated to G0 at end of line?
1404 5. 7-bit environment or 8-bit environment?
1405 6. Use locking-shift?
1406 7. Use Single-shift?
1407 And the following two are only for Japanese:
1408 8. Use ASCII in place of JIS0201-1976-Roman?
1409 9. Use JISX0208-1983 in place of JISX0208-1978?
1410 These specifications are encoded in `coding->flags' as flag bits
1411 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1415 /* Produce codes (escape sequence) for designating CHARSET to graphic
1416 register REG. If <final-char> of CHARSET is '@', 'A', or 'B' and
1417 the coding system CODING allows, produce designation sequence of
1420 #define ENCODE_DESIGNATION(charset, reg, coding) \
1422 unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \
1423 char *intermediate_char_94 = "()*+"; \
1424 char *intermediate_char_96 = ",-./"; \
1425 int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset); \
1426 if (revision < 255) \
1428 *dst++ = ISO_CODE_ESC; \
1430 *dst++ = '@' + revision; \
1432 *dst++ = ISO_CODE_ESC; \
1433 if (CHARSET_DIMENSION (charset) == 1) \
1435 if (CHARSET_CHARS (charset) == 94) \
1436 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1438 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1443 if (CHARSET_CHARS (charset) == 94) \
1445 if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
1447 || final_char < '@' || final_char > 'B') \
1448 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1451 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1453 *dst++ = final_char; \
1454 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
1457 /* The following two macros produce codes (control character or escape
1458 sequence) for ISO2022 single-shift functions (single-shift-2 and
1461 #define ENCODE_SINGLE_SHIFT_2 \
1463 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1464 *dst++ = ISO_CODE_ESC, *dst++ = 'N'; \
1467 *dst++ = ISO_CODE_SS2; \
1468 coding->fake_multibyte = 1; \
1470 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1473 #define ENCODE_SINGLE_SHIFT_3 \
1475 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1476 *dst++ = ISO_CODE_ESC, *dst++ = 'O'; \
1479 *dst++ = ISO_CODE_SS3; \
1480 coding->fake_multibyte = 1; \
1482 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1485 /* The following four macros produce codes (control character or
1486 escape sequence) for ISO2022 locking-shift functions (shift-in,
1487 shift-out, locking-shift-2, and locking-shift-3). */
1489 #define ENCODE_SHIFT_IN \
1491 *dst++ = ISO_CODE_SI; \
1492 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1495 #define ENCODE_SHIFT_OUT \
1497 *dst++ = ISO_CODE_SO; \
1498 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1501 #define ENCODE_LOCKING_SHIFT_2 \
1503 *dst++ = ISO_CODE_ESC, *dst++ = 'n'; \
1504 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1507 #define ENCODE_LOCKING_SHIFT_3 \
1509 *dst++ = ISO_CODE_ESC, *dst++ = 'o'; \
1510 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1513 /* Produce codes for a DIMENSION1 character whose character set is
1514 CHARSET and whose position-code is C1. Designation and invocation
1515 sequences are also produced in advance if necessary. */
1518 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
1520 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1522 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1523 *dst++ = c1 & 0x7F; \
1525 *dst++ = c1 | 0x80; \
1526 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1529 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1531 *dst++ = c1 & 0x7F; \
1534 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1536 *dst++ = c1 | 0x80; \
1539 else if (coding->flags & CODING_FLAG_ISO_SAFE \
1540 && !coding->safe_charsets[charset]) \
1542 /* We should not encode this character, instead produce one or \
1544 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1545 if (CHARSET_WIDTH (charset) == 2) \
1546 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1550 /* Since CHARSET is not yet invoked to any graphic planes, we \
1551 must invoke it, or, at first, designate it to some graphic \
1552 register. Then repeat the loop to actually produce the \
1554 dst = encode_invocation_designation (charset, coding, dst); \
1557 /* Produce codes for a DIMENSION2 character whose character set is
1558 CHARSET and whose position-codes are C1 and C2. Designation and
1559 invocation codes are also produced in advance if necessary. */
1561 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
1563 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1565 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1566 *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F; \
1568 *dst++ = c1 | 0x80, *dst++ = c2 | 0x80; \
1569 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1572 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1574 *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F; \
1577 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1579 *dst++ = c1 | 0x80, *dst++= c2 | 0x80; \
1582 else if (coding->flags & CODING_FLAG_ISO_SAFE \
1583 && !coding->safe_charsets[charset]) \
1585 /* We should not encode this character, instead produce one or \
1587 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1588 if (CHARSET_WIDTH (charset) == 2) \
1589 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1593 /* Since CHARSET is not yet invoked to any graphic planes, we \
1594 must invoke it, or, at first, designate it to some graphic \
1595 register. Then repeat the loop to actually produce the \
1597 dst = encode_invocation_designation (charset, coding, dst); \
1600 #define ENCODE_ISO_CHARACTER(charset, c1, c2) \
1602 int c_alt, charset_alt; \
1603 if (!NILP (translation_table) \
1604 && ((c_alt = translate_char (translation_table, -1, \
1607 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1609 charset_alt = charset; \
1610 if (CHARSET_DIMENSION (charset_alt) == 1) \
1612 if (charset == CHARSET_ASCII \
1613 && coding->flags & CODING_FLAG_ISO_USE_ROMAN) \
1614 charset_alt = charset_latin_jisx0201; \
1615 ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1); \
1619 if (charset == charset_jisx0208 \
1620 && coding->flags & CODING_FLAG_ISO_USE_OLDJIS) \
1621 charset_alt = charset_jisx0208_1978; \
1622 ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
1624 if (! COMPOSING_P (coding->composing)) \
1625 coding->consumed_char++; \
1628 /* Produce designation and invocation codes at a place pointed by DST
1629 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1633 encode_invocation_designation (charset
, coding
, dst
)
1635 struct coding_system
*coding
;
1638 int reg
; /* graphic register number */
1640 /* At first, check designations. */
1641 for (reg
= 0; reg
< 4; reg
++)
1642 if (charset
== CODING_SPEC_ISO_DESIGNATION (coding
, reg
))
1647 /* CHARSET is not yet designated to any graphic registers. */
1648 /* At first check the requested designation. */
1649 reg
= CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
);
1650 if (reg
== CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
)
1651 /* Since CHARSET requests no special designation, designate it
1652 to graphic register 0. */
1655 ENCODE_DESIGNATION (charset
, reg
, coding
);
1658 if (CODING_SPEC_ISO_INVOCATION (coding
, 0) != reg
1659 && CODING_SPEC_ISO_INVOCATION (coding
, 1) != reg
)
1661 /* Since the graphic register REG is not invoked to any graphic
1662 planes, invoke it to graphic plane 0. */
1665 case 0: /* graphic register 0 */
1669 case 1: /* graphic register 1 */
1673 case 2: /* graphic register 2 */
1674 if (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1675 ENCODE_SINGLE_SHIFT_2
;
1677 ENCODE_LOCKING_SHIFT_2
;
1680 case 3: /* graphic register 3 */
1681 if (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1682 ENCODE_SINGLE_SHIFT_3
;
1684 ENCODE_LOCKING_SHIFT_3
;
1691 /* The following three macros produce codes for indicating composition. */
1692 #define ENCODE_COMPOSITION_NO_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '0'
1693 #define ENCODE_COMPOSITION_WITH_RULE_START *dst++ = ISO_CODE_ESC, *dst++ = '2'
1694 #define ENCODE_COMPOSITION_END *dst++ = ISO_CODE_ESC, *dst++ = '1'
1696 /* The following three macros produce codes for indicating direction
1698 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
1700 if (coding->flags == CODING_FLAG_ISO_SEVEN_BITS) \
1701 *dst++ = ISO_CODE_ESC, *dst++ = '['; \
1703 *dst++ = ISO_CODE_CSI; \
1706 #define ENCODE_DIRECTION_R2L \
1707 ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '2', *dst++ = ']'
1709 #define ENCODE_DIRECTION_L2R \
1710 ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '0', *dst++ = ']'
1712 /* Produce codes for designation and invocation to reset the graphic
1713 planes and registers to initial state. */
1714 #define ENCODE_RESET_PLANE_AND_REGISTER \
1717 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
1719 for (reg = 0; reg < 4; reg++) \
1720 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0 \
1721 && (CODING_SPEC_ISO_DESIGNATION (coding, reg) \
1722 != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))) \
1723 ENCODE_DESIGNATION \
1724 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1727 /* Produce designation sequences of charsets in the line started from
1728 SRC to a place pointed by *DSTP, and update DSTP.
1730 If the current block ends before any end-of-line, we may fail to
1731 find all the necessary designations. */
1734 encode_designation_at_bol (coding
, table
, src
, src_end
, dstp
)
1735 struct coding_system
*coding
;
1737 unsigned char *src
, *src_end
, **dstp
;
1739 int charset
, c
, found
= 0, reg
;
1740 /* Table of charsets to be designated to each graphic register. */
1742 unsigned char *dst
= *dstp
;
1744 for (reg
= 0; reg
< 4; reg
++)
1747 while (src
< src_end
&& *src
!= '\n' && found
< 4)
1749 int bytes
= BYTES_BY_CHAR_HEAD (*src
);
1752 charset
= CHARSET_AT (src
);
1756 unsigned char c1
, c2
;
1758 SPLIT_STRING(src
, bytes
, charset
, c1
, c2
);
1759 if ((c_alt
= translate_char (table
, -1, charset
, c1
, c2
)) >= 0)
1760 charset
= CHAR_CHARSET (c_alt
);
1763 reg
= CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
);
1764 if (reg
!= CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
&& r
[reg
] < 0)
1775 for (reg
= 0; reg
< 4; reg
++)
1777 && CODING_SPEC_ISO_DESIGNATION (coding
, reg
) != r
[reg
])
1778 ENCODE_DESIGNATION (r
[reg
], reg
, coding
);
1783 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1786 encode_coding_iso2022 (coding
, source
, destination
, src_bytes
, dst_bytes
)
1787 struct coding_system
*coding
;
1788 unsigned char *source
, *destination
;
1789 int src_bytes
, dst_bytes
;
1791 unsigned char *src
= source
;
1792 unsigned char *src_end
= source
+ src_bytes
;
1793 unsigned char *dst
= destination
;
1794 unsigned char *dst_end
= destination
+ dst_bytes
;
1795 /* Since the maximum bytes produced by each loop is 20, we subtract 19
1796 from DST_END to assure overflow checking is necessary only at the
1798 unsigned char *adjusted_dst_end
= dst_end
- 19;
1799 Lisp_Object translation_table
1800 = coding
->translation_table_for_encode
;
1801 int result
= CODING_FINISH_NORMAL
;
1803 if (!NILP (Venable_character_translation
) && NILP (translation_table
))
1804 translation_table
= Vstandard_translation_table_for_encode
;
1806 coding
->consumed_char
= 0;
1807 coding
->fake_multibyte
= 0;
1808 while (src
< src_end
&& (dst_bytes
1809 ? (dst
< adjusted_dst_end
)
1810 : (dst
< src
- 19)))
1812 /* SRC_BASE remembers the start position in source in each loop.
1813 The loop will be exited when there's not enough source text
1814 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1815 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1816 reset to SRC_BASE before exiting. */
1817 unsigned char *src_base
= src
;
1818 int charset
, c1
, c2
, c3
, c4
;
1820 if (coding
->flags
& CODING_FLAG_ISO_DESIGNATE_AT_BOL
1821 && CODING_SPEC_ISO_BOL (coding
))
1823 /* We have to produce designation sequences if any now. */
1824 encode_designation_at_bol (coding
, translation_table
,
1825 src
, src_end
, &dst
);
1826 CODING_SPEC_ISO_BOL (coding
) = 0;
1830 /* If we are seeing a component of a composite character, we are
1831 seeing a leading-code encoded irregularly for composition, or
1832 a composition rule if composing with rule. We must set C1 to
1833 a normal leading-code or an ASCII code. If we are not seeing
1834 a composite character, we must reset composition,
1835 designation, and invocation states. */
1836 if (COMPOSING_P (coding
->composing
))
1840 /* We are not in a composite character any longer. */
1841 coding
->composing
= COMPOSING_NO
;
1842 ENCODE_RESET_PLANE_AND_REGISTER
;
1843 ENCODE_COMPOSITION_END
;
1847 if (coding
->composing
== COMPOSING_WITH_RULE_RULE
)
1850 coding
->composing
= COMPOSING_WITH_RULE_HEAD
;
1853 else if (coding
->composing
== COMPOSING_WITH_RULE_HEAD
)
1854 coding
->composing
= COMPOSING_WITH_RULE_RULE
;
1857 /* This is an ASCII component. */
1862 /* This is a leading-code of non ASCII component. */
1867 /* Now encode one character. C1 is a control character, an
1868 ASCII character, or a leading-code of multi-byte character. */
1869 switch (emacs_code_class
[c1
])
1871 case EMACS_ascii_code
:
1872 ENCODE_ISO_CHARACTER (CHARSET_ASCII
, c1
, /* dummy */ c2
);
1875 case EMACS_control_code
:
1876 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_CNTL
)
1877 ENCODE_RESET_PLANE_AND_REGISTER
;
1879 coding
->consumed_char
++;
1882 case EMACS_carriage_return_code
:
1883 if (! (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
))
1885 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_CNTL
)
1886 ENCODE_RESET_PLANE_AND_REGISTER
;
1888 coding
->consumed_char
++;
1891 /* fall through to treat '\r' as '\n' ... */
1893 case EMACS_linefeed_code
:
1894 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_EOL
)
1895 ENCODE_RESET_PLANE_AND_REGISTER
;
1896 if (coding
->flags
& CODING_FLAG_ISO_INIT_AT_BOL
)
1897 bcopy (coding
->spec
.iso2022
.initial_designation
,
1898 coding
->spec
.iso2022
.current_designation
,
1899 sizeof coding
->spec
.iso2022
.initial_designation
);
1900 if (coding
->eol_type
== CODING_EOL_LF
1901 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
1902 *dst
++ = ISO_CODE_LF
;
1903 else if (coding
->eol_type
== CODING_EOL_CRLF
)
1904 *dst
++ = ISO_CODE_CR
, *dst
++ = ISO_CODE_LF
;
1906 *dst
++ = ISO_CODE_CR
;
1907 CODING_SPEC_ISO_BOL (coding
) = 1;
1908 coding
->consumed_char
++;
1911 case EMACS_leading_code_2
:
1915 /* invalid sequence */
1918 coding
->consumed_char
++;
1921 ENCODE_ISO_CHARACTER (c1
, c2
, /* dummy */ c3
);
1924 case EMACS_leading_code_3
:
1925 TWO_MORE_BYTES (c2
, c3
);
1926 if (c2
< 0xA0 || c3
< 0xA0)
1928 /* invalid sequence */
1931 coding
->consumed_char
++;
1933 else if (c1
< LEADING_CODE_PRIVATE_11
)
1934 ENCODE_ISO_CHARACTER (c1
, c2
, c3
);
1936 ENCODE_ISO_CHARACTER (c2
, c3
, /* dummy */ c4
);
1939 case EMACS_leading_code_4
:
1940 THREE_MORE_BYTES (c2
, c3
, c4
);
1941 if (c2
< 0xA0 || c3
< 0xA0 || c4
< 0xA0)
1943 /* invalid sequence */
1946 coding
->consumed_char
++;
1949 ENCODE_ISO_CHARACTER (c2
, c3
, c4
);
1952 case EMACS_leading_code_composition
:
1956 /* invalid sequence */
1959 coding
->consumed_char
++;
1961 else if (c2
== 0xFF)
1963 ENCODE_RESET_PLANE_AND_REGISTER
;
1964 coding
->composing
= COMPOSING_WITH_RULE_HEAD
;
1965 ENCODE_COMPOSITION_WITH_RULE_START
;
1966 coding
->consumed_char
++;
1970 ENCODE_RESET_PLANE_AND_REGISTER
;
1971 /* Rewind one byte because it is a character code of
1972 composition elements. */
1974 coding
->composing
= COMPOSING_NO_RULE_HEAD
;
1975 ENCODE_COMPOSITION_NO_RULE_START
;
1976 coding
->consumed_char
++;
1980 case EMACS_invalid_code
:
1982 coding
->consumed_char
++;
1987 result
= CODING_FINISH_INSUFFICIENT_SRC
;
1992 if (src
< src_end
&& result
== CODING_FINISH_NORMAL
)
1993 result
= CODING_FINISH_INSUFFICIENT_DST
;
1995 /* If this is the last block of the text to be encoded, we must
1996 reset graphic planes and registers to the initial state, and
1997 flush out the carryover if any. */
1998 if (coding
->mode
& CODING_MODE_LAST_BLOCK
)
2000 ENCODE_RESET_PLANE_AND_REGISTER
;
2001 if (COMPOSING_P (coding
->composing
))
2002 ENCODE_COMPOSITION_END
;
2003 if (result
== CODING_FINISH_INSUFFICIENT_SRC
)
2005 while (src
< src_end
&& dst
< dst_end
)
2009 coding
->consumed
= src
- source
;
2010 coding
->produced
= coding
->produced_char
= dst
- destination
;
2015 /*** 4. SJIS and BIG5 handlers ***/
2017 /* Although SJIS and BIG5 are not ISO's coding system, they are used
2018 quite widely. So, for the moment, Emacs supports them in the bare
2019 C code. But, in the future, they may be supported only by CCL. */
2021 /* SJIS is a coding system encoding three character sets: ASCII, right
2022 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2023 as is. A character of charset katakana-jisx0201 is encoded by
2024 "position-code + 0x80". A character of charset japanese-jisx0208
2025 is encoded in 2 bytes, but the two position-codes are divided and shifted
2026 so that they fit in the ranges below.
2028 --- CODE RANGE of SJIS ---
2029 (character set) (range)
2031 KATAKANA-JISX0201 0xA0 .. 0xDF
2032 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xEF
2033 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2034 -------------------------------
2038 /* BIG5 is a coding system encoding two character sets: ASCII and
2039 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2040 character set and is encoded in two bytes.
2042 --- CODE RANGE of BIG5 ---
2043 (character set) (range)
2045 Big5 (1st byte) 0xA1 .. 0xFE
2046 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2047 --------------------------
2049 Since the number of characters in Big5 is larger than maximum
2050 characters in Emacs' charset (96x96), it can't be handled as one
2051 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2052 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2053 contains frequently used characters and the latter contains less
2054 frequently used characters. */
2056 /* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
2057 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
2058 C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
2059 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
2061 /* Number of Big5 characters which have the same code in 1st byte. */
2062 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
2064 #define DECODE_BIG5(b1, b2, charset, c1, c2) \
2067 = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62); \
2069 charset = charset_big5_1; \
2072 charset = charset_big5_2; \
2073 temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW; \
2075 c1 = temp / (0xFF - 0xA1) + 0x21; \
2076 c2 = temp % (0xFF - 0xA1) + 0x21; \
2079 #define ENCODE_BIG5(charset, c1, c2, b1, b2) \
2081 unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21); \
2082 if (charset == charset_big5_2) \
2083 temp += BIG5_SAME_ROW * (0xC9 - 0xA1); \
2084 b1 = temp / BIG5_SAME_ROW + 0xA1; \
2085 b2 = temp % BIG5_SAME_ROW; \
2086 b2 += b2 < 0x3F ? 0x40 : 0x62; \
2089 #define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
2091 int c_alt, charset_alt = (charset); \
2092 if (!NILP (translation_table) \
2093 && ((c_alt = translate_char (translation_table, \
2094 -1, (charset), c1, c2)) >= 0)) \
2095 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
2096 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
2097 DECODE_CHARACTER_ASCII (c1); \
2098 else if (CHARSET_DIMENSION (charset_alt) == 1) \
2099 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
2101 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
2104 #define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
2106 int c_alt, charset_alt; \
2107 if (!NILP (translation_table) \
2108 && ((c_alt = translate_char (translation_table, -1, \
2111 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
2113 charset_alt = charset; \
2114 if (charset_alt == charset_ascii) \
2116 else if (CHARSET_DIMENSION (charset_alt) == 1) \
2118 if (sjis_p && charset_alt == charset_katakana_jisx0201) \
2122 *dst++ = charset_alt, *dst++ = c1; \
2123 coding->fake_multibyte = 1; \
2128 c1 &= 0x7F, c2 &= 0x7F; \
2129 if (sjis_p && charset_alt == charset_jisx0208) \
2131 unsigned char s1, s2; \
2133 ENCODE_SJIS (c1, c2, s1, s2); \
2134 *dst++ = s1, *dst++ = s2; \
2135 coding->fake_multibyte = 1; \
2138 && (charset_alt == charset_big5_1 \
2139 || charset_alt == charset_big5_2)) \
2141 unsigned char b1, b2; \
2143 ENCODE_BIG5 (charset_alt, c1, c2, b1, b2); \
2144 *dst++ = b1, *dst++ = b2; \
2148 *dst++ = charset_alt, *dst++ = c1, *dst++ = c2; \
2149 coding->fake_multibyte = 1; \
2152 coding->consumed_char++; \
2155 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2156 Check if a text is encoded in SJIS. If it is, return
2157 CODING_CATEGORY_MASK_SJIS, else return 0. */
2160 detect_coding_sjis (src
, src_end
)
2161 unsigned char *src
, *src_end
;
2165 while (src
< src_end
)
2168 if ((c
>= 0x80 && c
< 0xA0) || c
>= 0xE0)
2170 if (src
< src_end
&& *src
++ < 0x40)
2174 return CODING_CATEGORY_MASK_SJIS
;
2177 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2178 Check if a text is encoded in BIG5. If it is, return
2179 CODING_CATEGORY_MASK_BIG5, else return 0. */
2182 detect_coding_big5 (src
, src_end
)
2183 unsigned char *src
, *src_end
;
2187 while (src
< src_end
)
2195 if (c
< 0x40 || (c
>= 0x7F && c
<= 0xA0))
2199 return CODING_CATEGORY_MASK_BIG5
;
2202 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2203 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */
2206 decode_coding_sjis_big5 (coding
, source
, destination
,
2207 src_bytes
, dst_bytes
, sjis_p
)
2208 struct coding_system
*coding
;
2209 unsigned char *source
, *destination
;
2210 int src_bytes
, dst_bytes
;
2213 unsigned char *src
= source
;
2214 unsigned char *src_end
= source
+ src_bytes
;
2215 unsigned char *dst
= destination
;
2216 unsigned char *dst_end
= destination
+ dst_bytes
;
2217 /* Since the maximum bytes produced by each loop is 4, we subtract 3
2218 from DST_END to assure overflow checking is necessary only at the
2220 unsigned char *adjusted_dst_end
= dst_end
- 3;
2221 Lisp_Object translation_table
2222 = coding
->translation_table_for_decode
;
2223 int result
= CODING_FINISH_NORMAL
;
2225 if (!NILP (Venable_character_translation
) && NILP (translation_table
))
2226 translation_table
= Vstandard_translation_table_for_decode
;
2228 coding
->produced_char
= 0;
2229 coding
->fake_multibyte
= 0;
2230 while (src
< src_end
&& (dst_bytes
2231 ? (dst
< adjusted_dst_end
)
2234 /* SRC_BASE remembers the start position in source in each loop.
2235 The loop will be exited when there's not enough source text
2236 to analyze two-byte character (within macro ONE_MORE_BYTE).
2237 In that case, SRC is reset to SRC_BASE before exiting. */
2238 unsigned char *src_base
= src
;
2239 unsigned char c1
= *src
++, c2
, c3
, c4
;
2245 if (coding
->eol_type
== CODING_EOL_CRLF
)
2250 else if (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
2252 result
= CODING_FINISH_INCONSISTENT_EOL
;
2253 goto label_end_of_loop_2
;
2256 /* To process C2 again, SRC is subtracted by 1. */
2259 else if (coding
->eol_type
== CODING_EOL_CR
)
2265 && (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
2266 && (coding
->eol_type
== CODING_EOL_CR
2267 || coding
->eol_type
== CODING_EOL_CRLF
))
2269 result
= CODING_FINISH_INCONSISTENT_EOL
;
2270 goto label_end_of_loop_2
;
2274 coding
->produced_char
++;
2277 DECODE_SJIS_BIG5_CHARACTER (charset_ascii
, c1
, /* dummy */ c2
);
2282 if (c1
< 0xA0 || (c1
>= 0xE0 && c1
< 0xF0))
2284 /* SJIS -> JISX0208 */
2286 if (c2
>= 0x40 && c2
!= 0x7F && c2
<= 0xFC)
2288 DECODE_SJIS (c1
, c2
, c3
, c4
);
2289 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208
, c3
, c4
);
2292 goto label_invalid_code_2
;
2295 /* SJIS -> JISX0201-Kana */
2296 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201
, c1
,
2299 goto label_invalid_code_1
;
2304 if (c1
>= 0xA1 && c1
<= 0xFE)
2307 if ((c2
>= 0x40 && c2
<= 0x7E) || (c2
>= 0xA1 && c2
<= 0xFE))
2311 DECODE_BIG5 (c1
, c2
, charset
, c3
, c4
);
2312 DECODE_SJIS_BIG5_CHARACTER (charset
, c3
, c4
);
2315 goto label_invalid_code_2
;
2318 goto label_invalid_code_1
;
2323 label_invalid_code_1
:
2325 coding
->produced_char
++;
2326 coding
->fake_multibyte
= 1;
2329 label_invalid_code_2
:
2330 *dst
++ = c1
; *dst
++= c2
;
2331 coding
->produced_char
+= 2;
2332 coding
->fake_multibyte
= 1;
2336 result
= CODING_FINISH_INSUFFICIENT_SRC
;
2337 label_end_of_loop_2
:
2344 if (result
== CODING_FINISH_NORMAL
)
2345 result
= CODING_FINISH_INSUFFICIENT_DST
;
2346 else if (result
!= CODING_FINISH_INCONSISTENT_EOL
2347 && coding
->mode
& CODING_MODE_LAST_BLOCK
)
2349 src_bytes
= src_end
- src
;
2350 if (dst_bytes
&& (dst_end
- dst
< src_bytes
))
2351 src_bytes
= dst_end
- dst
;
2352 /* This is the last block of the text to be decoded: flush the
unconsumed source bytes to the output as they are.  bcopy copies
FROM its first argument TO its second, so SRC must come first;
SRC_BYTES has already been clamped to the room left in the
destination.  */
bcopy (src
, dst
, src_bytes
);
2355 coding
->fake_multibyte
= 1;
2359 coding
->consumed
= coding
->consumed_char
= src
- source
;
2360 coding
->produced
= dst
- destination
;
2364 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2365 This function can encode `charset_ascii', `charset_katakana_jisx0201',
2366 `charset_jisx0208', `charset_big5_1', and `charset_big5_2'. We are
2367 sure that all these charsets are registered as official charset
2368 (i.e. do not have extended leading-codes). Characters of other
2369 charsets are produced without any encoding. If SJIS_P is 1, encode
2370 SJIS text, else encode BIG5 text. */
2373 encode_coding_sjis_big5 (coding
, source
, destination
,
2374 src_bytes
, dst_bytes
, sjis_p
)
2375 struct coding_system
*coding
;
2376 unsigned char *source
, *destination
;
2377 int src_bytes
, dst_bytes
;
2380 unsigned char *src
= source
;
2381 unsigned char *src_end
= source
+ src_bytes
;
2382 unsigned char *dst
= destination
;
2383 unsigned char *dst_end
= destination
+ dst_bytes
;
2384 /* Since the maximum bytes produced by each loop is 2, we subtract 1
2385 from DST_END to assure overflow checking is necessary only at the
2387 unsigned char *adjusted_dst_end
= dst_end
- 1;
2388 Lisp_Object translation_table
2389 = coding
->translation_table_for_encode
;
2390 int result
= CODING_FINISH_NORMAL
;
2392 if (!NILP (Venable_character_translation
) && NILP (translation_table
))
2393 translation_table
= Vstandard_translation_table_for_encode
;
2395 coding
->consumed_char
= 0;
2396 coding
->fake_multibyte
= 0;
2397 while (src
< src_end
&& (dst_bytes
2398 ? (dst
< adjusted_dst_end
)
2401 /* SRC_BASE remembers the start position in source in each loop.
2402 The loop will be exited when there's not enough source text
2403 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2404 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
2406 unsigned char *src_base
= src
;
2407 unsigned char c1
= *src
++, c2
, c3
, c4
;
2409 if (coding
->composing
)
2416 else if (c1
>= 0xA0)
2419 coding
->composing
= 0;
2422 switch (emacs_code_class
[c1
])
2424 case EMACS_ascii_code
:
2425 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii
, c1
, /* dummy */ c2
);
2428 case EMACS_control_code
:
2430 coding
->consumed_char
++;
2433 case EMACS_carriage_return_code
:
2434 if (! (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
))
2437 coding
->consumed_char
++;
2440 /* fall down to treat '\r' as '\n' ... */
2442 case EMACS_linefeed_code
:
2443 if (coding
->eol_type
== CODING_EOL_LF
2444 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
2446 else if (coding
->eol_type
== CODING_EOL_CRLF
)
2447 *dst
++ = '\r', *dst
++ = '\n';
2450 coding
->consumed_char
++;
2453 case EMACS_leading_code_2
:
2455 ENCODE_SJIS_BIG5_CHARACTER (c1
, c2
, /* dummy */ c3
);
2458 case EMACS_leading_code_3
:
2459 TWO_MORE_BYTES (c2
, c3
);
2460 ENCODE_SJIS_BIG5_CHARACTER (c1
, c2
, c3
);
2463 case EMACS_leading_code_4
:
2464 THREE_MORE_BYTES (c2
, c3
, c4
);
2465 ENCODE_SJIS_BIG5_CHARACTER (c2
, c3
, c4
);
2468 case EMACS_leading_code_composition
:
2469 coding
->composing
= 1;
2472 default: /* i.e. case EMACS_invalid_code: */
2474 coding
->consumed_char
++;
2479 result
= CODING_FINISH_INSUFFICIENT_SRC
;
2484 if (result
== CODING_FINISH_NORMAL
2486 result
= CODING_FINISH_INSUFFICIENT_DST
;
2487 coding
->consumed
= src
- source
;
2488 coding
->produced
= coding
->produced_char
= dst
- destination
;
2493 /*** 5. CCL handlers ***/
2495 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2496 Check if a text is encoded in a coding system of which
2497 encoder/decoder are written in CCL program. If it is, return
2498 CODING_CATEGORY_MASK_CCL, else return 0. */
2501 detect_coding_ccl (src
, src_end
)
2502 unsigned char *src
, *src_end
;
/* Table of acceptable byte values, indexed by the byte itself.  */
2504 unsigned char *valid
;
2506 /* No coding system is assigned to coding-category-ccl. */
2507 if (!coding_system_table
[CODING_CATEGORY_IDX_CCL
])
/* Use the valid_codes table of the coding system currently assigned
   to the CCL category.  */
2510 valid
= coding_system_table
[CODING_CATEGORY_IDX_CCL
]->spec
.ccl
.valid_codes
;
/* Check bytes in [SRC, SRC_END) against the table.  */
2511 while (src
< src_end
)
/* A byte whose table entry is zero rules this category out.  */
2513 if (! valid
[*src
]) return 0;
/* No checked byte was rejected by the table.  */
2516 return CODING_CATEGORY_MASK_CCL
;
2520 /*** 6. End-of-line handlers ***/
2522 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2523 This function is called only when `coding->eol_type' is
2524 CODING_EOL_CRLF or CODING_EOL_CR. */
2527 decode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
)
2528 struct coding_system
*coding
;
2529 unsigned char *source
, *destination
;
2530 int src_bytes
, dst_bytes
;
2532 unsigned char *src
= source
;
2533 unsigned char *src_end
= source
+ src_bytes
;
2534 unsigned char *dst
= destination
;
2535 unsigned char *dst_end
= destination
+ dst_bytes
;
2537 int result
= CODING_FINISH_NORMAL
;
2539 coding
->fake_multibyte
= 0;
2544 switch (coding
->eol_type
)
2546 case CODING_EOL_CRLF
:
2548 /* Since the maximum bytes produced by each loop is 2, we
2549 subtract 1 from DST_END to assure overflow checking is
2550 necessary only at the head of loop. */
2551 unsigned char *adjusted_dst_end
= dst_end
- 1;
2553 while (src
< src_end
&& (dst_bytes
2554 ? (dst
< adjusted_dst_end
)
2557 unsigned char *src_base
= src
;
2567 if (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
2569 result
= CODING_FINISH_INCONSISTENT_EOL
;
2570 goto label_end_of_loop_2
;
2574 if (BASE_LEADING_CODE_P (c
))
2575 coding
->fake_multibyte
= 1;
2579 && (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
))
2581 result
= CODING_FINISH_INCONSISTENT_EOL
;
2582 goto label_end_of_loop_2
;
2587 if (BASE_LEADING_CODE_P (c
))
2588 coding
->fake_multibyte
= 1;
2593 result
= CODING_FINISH_INSUFFICIENT_SRC
;
2594 label_end_of_loop_2
:
2600 if (result
== CODING_FINISH_NORMAL
)
2601 result
= CODING_FINISH_INSUFFICIENT_DST
;
2602 else if (result
!= CODING_FINISH_INCONSISTENT_EOL
2603 && coding
->mode
& CODING_MODE_LAST_BLOCK
)
2605 /* This is the last block of the text to be decoded.
2606 We flush out all remaining codes. */
2607 src_bytes
= src_end
- src
;
2608 if (dst_bytes
&& (dst_end
- dst
< src_bytes
))
2609 src_bytes
= dst_end
- dst
;
2610 bcopy (src
, dst
, src_bytes
);
2619 if (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
2621 while (src
< src_end
)
2623 if ((c
= *src
++) == '\n')
2625 if (BASE_LEADING_CODE_P (c
))
2626 coding
->fake_multibyte
= 1;
2630 src_bytes
= src
- source
;
2631 result
= CODING_FINISH_INCONSISTENT_EOL
;
2634 if (dst_bytes
&& src_bytes
> dst_bytes
)
2636 result
= CODING_FINISH_INSUFFICIENT_DST
;
2637 src_bytes
= dst_bytes
;
2640 bcopy (source
, destination
, src_bytes
);
2642 safe_bcopy (source
, destination
, src_bytes
);
2643 src
= source
+ src_bytes
;
2644 while (src_bytes
--) if (*dst
++ == '\r') dst
[-1] = '\n';
2647 default: /* i.e. case: CODING_EOL_LF */
2648 if (dst_bytes
&& src_bytes
> dst_bytes
)
2650 result
= CODING_FINISH_INSUFFICIENT_DST
;
2651 src_bytes
= dst_bytes
;
2654 bcopy (source
, destination
, src_bytes
);
2656 safe_bcopy (source
, destination
, src_bytes
);
2659 coding
->fake_multibyte
= 1;
2663 coding
->consumed
= coding
->consumed_char
= src
- source
;
2664 coding
->produced
= coding
->produced_char
= dst
- destination
;
2668 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2669 format of end-of-line according to `coding->eol_type'. If
2670 `coding->mode & CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code
2671 '\r' in source text also means end-of-line. */
2674 encode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
)
2675 struct coding_system
*coding
;
2676 unsigned char *source
, *destination
;
2677 int src_bytes
, dst_bytes
;
2679 unsigned char *src
= source
;
2680 unsigned char *dst
= destination
;
2681 int result
= CODING_FINISH_NORMAL
;
2683 coding
->fake_multibyte
= 0;
2685 if (coding
->eol_type
== CODING_EOL_CRLF
)
2688 unsigned char *src_end
= source
+ src_bytes
;
2689 unsigned char *dst_end
= destination
+ dst_bytes
;
2690 /* Since the maximum bytes produced by each loop is 2, we
2691 subtract 1 from DST_END to assure overflow checking is
2692 necessary only at the head of loop. */
2693 unsigned char *adjusted_dst_end
= dst_end
- 1;
2695 while (src
< src_end
&& (dst_bytes
2696 ? (dst
< adjusted_dst_end
)
2701 || (c
== '\r' && (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
)))
2702 *dst
++ = '\r', *dst
++ = '\n';
2706 if (BASE_LEADING_CODE_P (c
))
2707 coding
->fake_multibyte
= 1;
2711 result
= CODING_FINISH_INSUFFICIENT_DST
;
2717 if (dst_bytes
&& src_bytes
> dst_bytes
)
2719 src_bytes
= dst_bytes
;
2720 result
= CODING_FINISH_INSUFFICIENT_DST
;
2723 bcopy (source
, destination
, src_bytes
);
2725 safe_bcopy (source
, destination
, src_bytes
);
2726 dst_bytes
= src_bytes
;
2727 if (coding
->eol_type
== CODING_EOL_CR
)
2731 if ((c
= *dst
++) == '\n')
2733 else if (BASE_LEADING_CODE_P (c
))
2734 coding
->fake_multibyte
= 1;
2739 if (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
)
2742 if (*dst
++ == '\r') dst
[-1] = '\n';
2744 coding
->fake_multibyte
= 1;
2746 src
= source
+ dst_bytes
;
2747 dst
= destination
+ dst_bytes
;
2750 coding
->consumed
= coding
->consumed_char
= src
- source
;
2751 coding
->produced
= coding
->produced_char
= dst
- destination
;
2756 /*** 7. C library functions ***/
2758 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2759 has a property `coding-system'. The value of this property is a
2760 vector of length 5 (called as coding-vector). Among elements of
2761 this vector, the first (element[0]) and the fifth (element[4])
2762 carry important information for decoding/encoding. Before
2763 decoding/encoding, this information should be set in fields of a
2764 structure of type `coding_system'.
2766 A value of property `coding-system' can be a symbol of another
2767 subsidiary coding-system. In that case, Emacs gets coding-vector
2770 `element[0]' contains information to be set in `coding->type'. The
2771 value and its meaning is as follows:
2773 0 -- coding_type_emacs_mule
2774 1 -- coding_type_sjis
2775 2 -- coding_type_iso2022
2776 3 -- coding_type_big5
2777 4 -- coding_type_ccl encoder/decoder written in CCL
2778 nil -- coding_type_no_conversion
2779 t -- coding_type_undecided (automatic conversion on decoding,
2780 no-conversion on encoding)
2782 `element[4]' contains information to be set in `coding->flags' and
2783 `coding->spec'. The meaning varies by `coding->type'.
2785 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2786 of length 32 (of which the first 13 sub-elements are used now).
2787 Meanings of these sub-elements are:
2789 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2790 If the value is an integer of valid charset, the charset is
2791 assumed to be designated to graphic register N initially.
2793 If the value is minus, it is a minus value of charset which
2794 reserves graphic register N, which means that the charset is
2795 not designated initially but should be designated to graphic
2796 register N just before encoding a character in that charset.
2798 If the value is nil, graphic register N is never used on
2801 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2802 Each value takes t or nil. See the section ISO2022 of
2803 `coding.h' for more information.
2805 If `coding->type' is `coding_type_big5', element[4] is t to denote
2806 BIG5-ETen or nil to denote BIG5-HKU.
2808 If `coding->type' takes the other value, element[4] is ignored.
2810 Emacs Lisp's coding system also carries information about format of
2811 end-of-line in a value of property `eol-type'. If the value is
2812 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2813 means CODING_EOL_CR. If it is not integer, it should be a vector
2814 of subsidiary coding systems of which property `eol-type' has one
2819 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2820 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2821 is setup so that no conversion is necessary and return -1, else
2825 setup_coding_system (coding_system
, coding
)
2826 Lisp_Object coding_system
;
2827 struct coding_system
*coding
;
2829 Lisp_Object coding_spec
, coding_type
, eol_type
, plist
;
2833 /* Initialize some fields required for all kinds of coding systems. */
2834 coding
->symbol
= coding_system
;
2835 coding
->common_flags
= 0;
2837 coding
->heading_ascii
= -1;
2838 coding
->post_read_conversion
= coding
->pre_write_conversion
= Qnil
;
2839 coding_spec
= Fget (coding_system
, Qcoding_system
);
2840 if (!VECTORP (coding_spec
)
2841 || XVECTOR (coding_spec
)->size
!= 5
2842 || !CONSP (XVECTOR (coding_spec
)->contents
[3]))
2843 goto label_invalid_coding_system
;
2845 eol_type
= inhibit_eol_conversion
? Qnil
: Fget (coding_system
, Qeol_type
);
2846 if (VECTORP (eol_type
))
2848 coding
->eol_type
= CODING_EOL_UNDECIDED
;
2849 coding
->common_flags
= CODING_REQUIRE_DETECTION_MASK
;
2851 else if (XFASTINT (eol_type
) == 1)
2853 coding
->eol_type
= CODING_EOL_CRLF
;
2854 coding
->common_flags
2855 = CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
2857 else if (XFASTINT (eol_type
) == 2)
2859 coding
->eol_type
= CODING_EOL_CR
;
2860 coding
->common_flags
2861 = CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
2864 coding
->eol_type
= CODING_EOL_LF
;
2866 coding_type
= XVECTOR (coding_spec
)->contents
[0];
2867 /* Try short cut. */
2868 if (SYMBOLP (coding_type
))
2870 if (EQ (coding_type
, Qt
))
2872 coding
->type
= coding_type_undecided
;
2873 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
2876 coding
->type
= coding_type_no_conversion
;
2880 /* Initialize remaining fields. */
2881 coding
->composing
= 0;
2883 /* Get values of coding system properties:
2884 `post-read-conversion', `pre-write-conversion',
2885 `translation-table-for-decode', `translation-table-for-encode'. */
2886 plist
= XVECTOR (coding_spec
)->contents
[3];
2887 coding
->post_read_conversion
= Fplist_get (plist
, Qpost_read_conversion
);
2888 coding
->pre_write_conversion
= Fplist_get (plist
, Qpre_write_conversion
);
2889 val
= Fplist_get (plist
, Qtranslation_table_for_decode
);
2891 val
= Fget (val
, Qtranslation_table_for_decode
);
2892 coding
->translation_table_for_decode
= CHAR_TABLE_P (val
) ? val
: Qnil
;
2893 val
= Fplist_get (plist
, Qtranslation_table_for_encode
);
2895 val
= Fget (val
, Qtranslation_table_for_encode
);
2896 coding
->translation_table_for_encode
= CHAR_TABLE_P (val
) ? val
: Qnil
;
2897 val
= Fplist_get (plist
, Qcoding_category
);
2900 val
= Fget (val
, Qcoding_category_index
);
2902 coding
->category_idx
= XINT (val
);
2904 goto label_invalid_coding_system
;
2907 goto label_invalid_coding_system
;
2909 val
= Fplist_get (plist
, Qsafe_charsets
);
2912 for (i
= 0; i
<= MAX_CHARSET
; i
++)
2913 coding
->safe_charsets
[i
] = 1;
2917 bzero (coding
->safe_charsets
, MAX_CHARSET
+ 1);
2920 if ((i
= get_charset_id (XCONS (val
)->car
)) >= 0)
2921 coding
->safe_charsets
[i
] = 1;
2922 val
= XCONS (val
)->cdr
;
2926 switch (XFASTINT (coding_type
))
2929 coding
->type
= coding_type_emacs_mule
;
2930 if (!NILP (coding
->post_read_conversion
))
2931 coding
->common_flags
|= CODING_REQUIRE_DECODING_MASK
;
2932 if (!NILP (coding
->pre_write_conversion
))
2933 coding
->common_flags
|= CODING_REQUIRE_ENCODING_MASK
;
2937 coding
->type
= coding_type_sjis
;
2938 coding
->common_flags
2939 |= CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
2943 coding
->type
= coding_type_iso2022
;
2944 coding
->common_flags
2945 |= CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
2947 Lisp_Object val
, temp
;
2949 int i
, charset
, reg_bits
= 0;
2951 val
= XVECTOR (coding_spec
)->contents
[4];
2953 if (!VECTORP (val
) || XVECTOR (val
)->size
!= 32)
2954 goto label_invalid_coding_system
;
2956 flags
= XVECTOR (val
)->contents
;
2958 = ((NILP (flags
[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM
)
2959 | (NILP (flags
[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL
)
2960 | (NILP (flags
[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL
)
2961 | (NILP (flags
[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS
)
2962 | (NILP (flags
[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT
)
2963 | (NILP (flags
[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT
)
2964 | (NILP (flags
[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN
)
2965 | (NILP (flags
[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS
)
2966 | (NILP (flags
[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION
)
2967 | (NILP (flags
[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL
)
2968 | (NILP (flags
[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL
)
2969 | (NILP (flags
[15]) ? 0 : CODING_FLAG_ISO_SAFE
)
2970 | (NILP (flags
[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA
)
2973 /* Invoke graphic register 0 to plane 0. */
2974 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 0;
2975 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2976 CODING_SPEC_ISO_INVOCATION (coding
, 1)
2977 = (coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
? -1 : 1);
2978 /* Not single shifting at first. */
2979 CODING_SPEC_ISO_SINGLE_SHIFTING (coding
) = 0;
2980 /* Beginning of buffer should also be regarded as bol. */
2981 CODING_SPEC_ISO_BOL (coding
) = 1;
2983 for (charset
= 0; charset
<= MAX_CHARSET
; charset
++)
2984 CODING_SPEC_ISO_REVISION_NUMBER (coding
, charset
) = 255;
2985 val
= Vcharset_revision_alist
;
2988 charset
= get_charset_id (Fcar_safe (XCONS (val
)->car
));
2990 && (temp
= Fcdr_safe (XCONS (val
)->car
), INTEGERP (temp
))
2991 && (i
= XINT (temp
), (i
>= 0 && (i
+ '@') < 128)))
2992 CODING_SPEC_ISO_REVISION_NUMBER (coding
, charset
) = i
;
2993 val
= XCONS (val
)->cdr
;
2996 /* Check FLAGS[REG] (REG = 0, 1, 2, 3) and decide designations.
2997 FLAGS[REG] can be one of below:
2998 integer CHARSET: CHARSET occupies register REG,
2999 t: designate nothing to REG initially, but can be used
3001 list of integer, nil, or t: designate the first
3002 element (if integer) to REG initially, the remaining
3003 elements (if integer) is designated to REG on request,
3004 if an element is t, REG can be used by any charsets,
3005 nil: REG is never used. */
3006 for (charset
= 0; charset
<= MAX_CHARSET
; charset
++)
3007 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
3008 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
;
3009 for (i
= 0; i
< 4; i
++)
3011 if (INTEGERP (flags
[i
])
3012 && (charset
= XINT (flags
[i
]), CHARSET_VALID_P (charset
))
3013 || (charset
= get_charset_id (flags
[i
])) >= 0)
3015 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = charset
;
3016 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
) = i
;
3018 else if (EQ (flags
[i
], Qt
))
3020 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
3022 coding
->flags
|= CODING_FLAG_ISO_DESIGNATION
;
3024 else if (CONSP (flags
[i
]))
3029 coding
->flags
|= CODING_FLAG_ISO_DESIGNATION
;
3030 if (INTEGERP (XCONS (tail
)->car
)
3031 && (charset
= XINT (XCONS (tail
)->car
),
3032 CHARSET_VALID_P (charset
))
3033 || (charset
= get_charset_id (XCONS (tail
)->car
)) >= 0)
3035 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = charset
;
3036 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
) =i
;
3039 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
3040 tail
= XCONS (tail
)->cdr
;
3041 while (CONSP (tail
))
3043 if (INTEGERP (XCONS (tail
)->car
)
3044 && (charset
= XINT (XCONS (tail
)->car
),
3045 CHARSET_VALID_P (charset
))
3046 || (charset
= get_charset_id (XCONS (tail
)->car
)) >= 0)
3047 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
3049 else if (EQ (XCONS (tail
)->car
, Qt
))
3051 tail
= XCONS (tail
)->cdr
;
3055 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
3057 CODING_SPEC_ISO_DESIGNATION (coding
, i
)
3058 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
);
3061 if (reg_bits
&& ! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
))
3063 /* REG 1 can be used only by locking shift in 7-bit env. */
3064 if (coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
)
3066 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
))
3067 /* Without any shifting, only REG 0 and 1 can be used. */
3072 for (charset
= 0; charset
<= MAX_CHARSET
; charset
++)
3074 if (CHARSET_VALID_P (charset
))
3076 /* There exist some default graphic registers to be
3079 /* We had better avoid designating a charset of
3080 CHARS96 to REG 0 as far as possible. */
3081 if (CHARSET_CHARS (charset
) == 96)
3082 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
3084 ? 1 : (reg_bits
& 4 ? 2 : (reg_bits
& 8 ? 3 : 0)));
3086 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
3088 ? 0 : (reg_bits
& 2 ? 1 : (reg_bits
& 4 ? 2 : 3)));
3092 coding
->common_flags
|= CODING_REQUIRE_FLUSHING_MASK
;
3093 coding
->spec
.iso2022
.last_invalid_designation_register
= -1;
3097 coding
->type
= coding_type_big5
;
3098 coding
->common_flags
3099 |= CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
3101 = (NILP (XVECTOR (coding_spec
)->contents
[4])
3102 ? CODING_FLAG_BIG5_HKU
3103 : CODING_FLAG_BIG5_ETEN
);
3107 coding
->type
= coding_type_ccl
;
3108 coding
->common_flags
3109 |= CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
3112 Lisp_Object decoder
, encoder
;
3114 val
= XVECTOR (coding_spec
)->contents
[4];
3116 && SYMBOLP (XCONS (val
)->car
)
3117 && !NILP (decoder
= Fget (XCONS (val
)->car
, Qccl_program_idx
))
3118 && !NILP (decoder
= Fcdr (Faref (Vccl_program_table
, decoder
)))
3119 && SYMBOLP (XCONS (val
)->cdr
)
3120 && !NILP (encoder
= Fget (XCONS (val
)->cdr
, Qccl_program_idx
))
3121 && !NILP (encoder
= Fcdr (Faref (Vccl_program_table
, encoder
))))
3123 setup_ccl_program (&(coding
->spec
.ccl
.decoder
), decoder
);
3124 setup_ccl_program (&(coding
->spec
.ccl
.encoder
), encoder
);
3127 goto label_invalid_coding_system
;
3129 bzero (coding
->spec
.ccl
.valid_codes
, 256);
3130 val
= Fplist_get (plist
, Qvalid_codes
);
3135 for (; CONSP (val
); val
= XCONS (val
)->cdr
)
3137 this = XCONS (val
)->car
;
3139 && XINT (this) >= 0 && XINT (this) < 256)
3140 coding
->spec
.ccl
.valid_codes
[XINT (this)] = 1;
3141 else if (CONSP (this)
3142 && INTEGERP (XCONS (this)->car
)
3143 && INTEGERP (XCONS (this)->cdr
))
3145 int start
= XINT (XCONS (this)->car
);
3146 int end
= XINT (XCONS (this)->cdr
);
3148 if (start
>= 0 && start
<= end
&& end
< 256)
3149 while (start
<= end
)
3150 coding
->spec
.ccl
.valid_codes
[start
++] = 1;
3155 coding
->common_flags
|= CODING_REQUIRE_FLUSHING_MASK
;
3159 coding
->type
= coding_type_raw_text
;
3163 goto label_invalid_coding_system
;
3167 label_invalid_coding_system
:
3168 coding
->type
= coding_type_no_conversion
;
3169 coding
->category_idx
= CODING_CATEGORY_IDX_BINARY
;
3170 coding
->common_flags
= 0;
3171 coding
->eol_type
= CODING_EOL_LF
;
3172 coding
->pre_write_conversion
= coding
->post_read_conversion
= Qnil
;
3176 /* Setup raw-text or one of its subsidiaries in the structure
3177 coding_system CODING according to the already setup value eol_type
3178 in CODING. CODING should be setup for some coding system in
3182 setup_raw_text_coding_system (coding
)
3183 struct coding_system
*coding
;
3185 if (coding
->type
!= coding_type_raw_text
)
3187 coding
->symbol
= Qraw_text
;
3188 coding
->type
= coding_type_raw_text
;
3189 if (coding
->eol_type
!= CODING_EOL_UNDECIDED
)
3191 Lisp_Object subsidiaries
;
3192 subsidiaries
= Fget (Qraw_text
, Qeol_type
);
3194 if (VECTORP (subsidiaries
)
3195 && XVECTOR (subsidiaries
)->size
== 3)
3197 = XVECTOR (subsidiaries
)->contents
[coding
->eol_type
];
3203 /* Emacs has a mechanism to automatically detect a coding system if it
3204 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
3205 it's impossible to distinguish some coding systems accurately
3206 because they use the same range of codes. So, at first, coding
3207 systems are categorized into the following categories:
3209 o coding-category-emacs-mule
3211 The category for a coding system which has the same code range
3212 as Emacs' internal format. Assigned the coding-system (Lisp
3213 symbol) `emacs-mule' by default.
3215 o coding-category-sjis
3217 The category for a coding system which has the same code range
3218 as SJIS. Assigned the coding-system (Lisp
3219 symbol) `japanese-shift-jis' by default.
3221 o coding-category-iso-7
3223 The category for a coding system which has the same code range
3224 as ISO2022 of 7-bit environment. This doesn't use any locking
3225 shift and single shift functions. This can encode/decode all
3226 charsets. Assigned the coding-system (Lisp symbol)
3227 `iso-2022-7bit' by default.
3229 o coding-category-iso-7-tight
3231 Same as coding-category-iso-7 except that this can
3232 encode/decode only the specified charsets.
3234 o coding-category-iso-8-1
3236 The category for a coding system which has the same code range
3237 as ISO2022 of 8-bit environment and graphic plane 1 used only
3238 for DIMENSION1 charset. This doesn't use any locking shift
3239 and single shift functions. Assigned the coding-system (Lisp
3240 symbol) `iso-latin-1' by default.
3242 o coding-category-iso-8-2
3244 The category for a coding system which has the same code range
3245 as ISO2022 of 8-bit environment and graphic plane 1 used only
3246 for DIMENSION2 charset. This doesn't use any locking shift
3247 and single shift functions. Assigned the coding-system (Lisp
3248 symbol) `japanese-iso-8bit' by default.
3250 o coding-category-iso-7-else
3252 The category for a coding system which has the same code range
3253 as ISO2022 of 7-bit environment but uses locking shift or
3254 single shift functions. Assigned the coding-system (Lisp
3255 symbol) `iso-2022-7bit-lock' by default.
3257 o coding-category-iso-8-else
3259 The category for a coding system which has the same code range
3260 as ISO2022 of 8-bit environment but uses locking shift or
3261 single shift functions. Assigned the coding-system (Lisp
3262 symbol) `iso-2022-8bit-ss2' by default.
3264 o coding-category-big5
3266 The category for a coding system which has the same code range
3267 as BIG5. Assigned the coding-system (Lisp symbol)
3268 `cn-big5' by default.
3270 o coding-category-ccl
3272 The category for a coding system of which encoder/decoder is
3273 written in CCL programs. The default value is nil, i.e., no
3274 coding system is assigned.
3276 o coding-category-binary
3278 The category for a coding system not categorized in any of the
3279 above. Assigned the coding-system (Lisp symbol)
3280 `no-conversion' by default.
3282 Each of them is a Lisp symbol and the value is an actual
3283 `coding-system's (this is also a Lisp symbol) assigned by a user.
3284 What Emacs does actually is to detect a category of coding system.
3285 Then, it uses a `coding-system' assigned to it. If Emacs can't
3286 decide only one possible category, it selects a category of the
3287 highest priority. Priorities of categories are also specified by a
3288 user in a Lisp variable `coding-category-list'.
3293 int ascii_skip_code
[256];
3295 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3296 If it detects possible coding systems, return an integer in which
3297 appropriate flag bits are set. Flag bits are defined by macros
3298 CODING_CATEGORY_MASK_XXX in `coding.h'.
3300 How many ASCII characters are at the head is returned as *SKIP.

PRIORITIES is read as an array of CODING_CATEGORY_IDX_MAX category
masks. On the paths that consult it, the array is walked from index 0,
the detector matching each entry is run, and only the first entry of
PRIORITIES that intersects the detection result is returned, with
CODING_CATEGORY_MASK_RAW_TEXT as the fallback when nothing matches.
On the other path every candidate detector is run and the union of
their results, plus the raw-text and binary bits, is returned. */
3303 detect_coding_mask (source
, src_bytes
, priorities
, skip
)
3304 unsigned char *source
;
3305 int src_bytes
, *priorities
, *skip
;
3307 register unsigned char c
;
3308 unsigned char *src
= source
, *src_end
= source
+ src_bytes
;
3312 /* At first, skip all ASCII characters and control characters except
3313 for three ISO2022 specific control characters. */
3314 ascii_skip_code
[ISO_CODE_SO
] = 0;
3315 ascii_skip_code
[ISO_CODE_SI
] = 0;
3316 ascii_skip_code
[ISO_CODE_ESC
] = 0;
3318 label_loop_detect_coding
:
3319 while (src
< src_end
&& ascii_skip_code
[*src
]) src
++;
3320 *skip
= src
- source
;
3323 /* We found nothing other than ASCII. There's nothing to do. */
3327 /* The text seems to be encoded in some multilingual coding system.
3328 Now, try to find in which coding system the text is encoded. */
3331 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3332 /* C is an ISO2022 specific control code of C0. */
3333 mask
= detect_coding_iso2022 (src
, src_end
);
3336 /* No valid ISO2022 code follows C. Try again. */
/* Mark the rejected control code as skippable in ascii_skip_code so
that the rescan from label_loop_detect_coding steps over it like any
other ASCII byte. */
3338 if (c
== ISO_CODE_ESC
)
3339 ascii_skip_code
[ISO_CODE_ESC
] = 1;
3341 ascii_skip_code
[ISO_CODE_SO
] = ascii_skip_code
[ISO_CODE_SI
] = 1;
3342 goto label_loop_detect_coding
;
3345 goto label_return_highest_only
;
3353 /* C is the first byte of SJIS character code,
3354 or a leading-code of Emacs' internal format (emacs-mule). */
3355 try = CODING_CATEGORY_MASK_SJIS
| CODING_CATEGORY_MASK_EMACS_MULE
;
3357 /* Or, if C is a special latin extra code,
3358 or is an ISO2022 specific control code of C1 (SS2 or SS3),
3359 or is an ISO2022 control-sequence-introducer (CSI),
3360 we should also consider the possibility of ISO2022 codings. */
3361 if ((VECTORP (Vlatin_extra_code_table
)
3362 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
3363 || (c
== ISO_CODE_SS2
|| c
== ISO_CODE_SS3
)
3364 || (c
== ISO_CODE_CSI
3367 || ((*src
== '0' || *src
== '1' || *src
== '2')
3368 && src
+ 1 < src_end
3369 && src
[1] == ']')))))
3370 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3371 | CODING_CATEGORY_MASK_ISO_8BIT
);
3374 /* C is a character of ISO2022 in graphic plane right,
3375 or a SJIS's 1-byte character code (i.e. JISX0201),
3376 or the first byte of BIG5's 2-byte code. */
3377 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3378 | CODING_CATEGORY_MASK_ISO_8BIT
3379 | CODING_CATEGORY_MASK_SJIS
3380 | CODING_CATEGORY_MASK_BIG5
);
3382 /* Or, we may have to consider the possibility of CCL. */
3383 if (coding_system_table
[CODING_CATEGORY_IDX_CCL
]
3384 && (coding_system_table
[CODING_CATEGORY_IDX_CCL
]
3385 ->spec
.ccl
.valid_codes
)[c
])
3386 try |= CODING_CATEGORY_MASK_CCL
;
/* Walk PRIORITIES from index 0. For each entry run the one detector
whose category is both in that entry and in TRY; raw-text and binary
entries need no scanning and are taken as they are. */
3391 for (i
= 0; i
< CODING_CATEGORY_IDX_MAX
; i
++)
3393 if (priorities
[i
] & try & CODING_CATEGORY_MASK_ISO
)
3394 mask
= detect_coding_iso2022 (src
, src_end
);
3395 else if (priorities
[i
] & try & CODING_CATEGORY_MASK_SJIS
)
3396 mask
= detect_coding_sjis (src
, src_end
);
3397 else if (priorities
[i
] & try & CODING_CATEGORY_MASK_BIG5
)
3398 mask
= detect_coding_big5 (src
, src_end
);
3399 else if (priorities
[i
] & try & CODING_CATEGORY_MASK_EMACS_MULE
)
3400 mask
= detect_coding_emacs_mule (src
, src_end
);
3401 else if (priorities
[i
] & try & CODING_CATEGORY_MASK_CCL
)
3402 mask
= detect_coding_ccl (src
, src_end
);
3403 else if (priorities
[i
] & CODING_CATEGORY_MASK_RAW_TEXT
)
3404 mask
= CODING_CATEGORY_MASK_RAW_TEXT
;
3405 else if (priorities
[i
] & CODING_CATEGORY_MASK_BINARY
)
3406 mask
= CODING_CATEGORY_MASK_BINARY
;
3408 goto label_return_highest_only
;
3410 return CODING_CATEGORY_MASK_RAW_TEXT
;
/* Run every detector enabled in TRY and OR the results together;
raw-text and binary are always reported as possible. */
3412 if (try & CODING_CATEGORY_MASK_ISO
)
3413 mask
|= detect_coding_iso2022 (src
, src_end
);
3414 if (try & CODING_CATEGORY_MASK_SJIS
)
3415 mask
|= detect_coding_sjis (src
, src_end
);
3416 if (try & CODING_CATEGORY_MASK_BIG5
)
3417 mask
|= detect_coding_big5 (src
, src_end
);
3418 if (try & CODING_CATEGORY_MASK_EMACS_MULE
)
3419 mask
|= detect_coding_emacs_mule (src
, src_end
);
3420 if (try & CODING_CATEGORY_MASK_CCL
)
3421 mask
|= detect_coding_ccl (src
, src_end
);
3423 return (mask
| CODING_CATEGORY_MASK_RAW_TEXT
| CODING_CATEGORY_MASK_BINARY
);
/* Reduce MASK to the first entry of PRIORITIES that intersects it;
fall back to raw-text when no entry does. */
3425 label_return_highest_only
:
3426 for (i
= 0; i
< CODING_CATEGORY_IDX_MAX
; i
++)
3428 if (mask
& priorities
[i
])
3429 return priorities
[i
];
3431 return CODING_CATEGORY_MASK_RAW_TEXT
;
3434 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3435 The information of the detected coding system is set in CODING.

Detection is delegated to detect_coding_mask with coding_priorities.
The number of leading bytes it skipped is stored in
CODING->heading_ascii, both before and after setup_coding_system is
called on CODING. */
3438 detect_coding (coding
, src
, src_bytes
)
3439 struct coding_system
*coding
;
3447 val
= Vcoding_category_list
;
3448 mask
= detect_coding_mask (src
, src_bytes
, coding_priorities
, &skip
);
3449 coding
->heading_ascii
= skip
;
3453 /* We found a single coding system of the highest priority in MASK. */
/* Turn the single-bit MASK into the index of that bit. */
3455 while (mask
&& ! (mask
& 1)) mask
>>= 1, idx
++;
3457 idx
= CODING_CATEGORY_IDX_RAW_TEXT
;
/* Fetch the value of the symbol stored at IDX in
Vcoding_category_table. */
3459 val
= XSYMBOL (XVECTOR (Vcoding_category_table
)->contents
[idx
])->value
;
/* When the end-of-line format is already decided, select the element
of the Qeol_type property of VAL that is indexed by that format. */
3461 if (coding
->eol_type
!= CODING_EOL_UNDECIDED
)
3465 tmp
= Fget (val
, Qeol_type
);
3467 val
= XVECTOR (tmp
)->contents
[coding
->eol_type
];
3469 setup_coding_system (val
, coding
);
3470 /* Set this again because setup_coding_system reset this member. */
3471 coding
->heading_ascii
= skip
;
3474 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
3475 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3476 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
The body also sets the result to CODING_EOL_INCONSISTENT when two
end-of-lines of different types are found, and detect_eol tests the
returned value against that constant, so callers must expect it too.
3478 How many non-eol characters are at the head is returned as *SKIP. */
/* Upper bound used in the scan loop condition below: scanning stops
once TOTAL reaches this value. */
3480 #define MAX_EOL_CHECK_COUNT 3
3483 detect_eol_type (source
, src_bytes
, skip
)
3484 unsigned char *source
;
3485 int src_bytes
, *skip
;
3487 unsigned char *src
= source
, *src_end
= src
+ src_bytes
;
3489 int total
= 0; /* How many end-of-lines are found so far. */
3490 int eol_type
= CODING_EOL_UNDECIDED
;
3495 while (src
< src_end
&& total
< MAX_EOL_CHECK_COUNT
)
3498 if (c
== '\n' || c
== '\r')
/* SRC is already one past C here, hence the `- 1'. */
3501 *skip
= src
- 1 - source
;
3504 this_eol_type
= CODING_EOL_LF
;
/* A CR that ends the text or is not followed by LF is a bare CR;
otherwise the LF is consumed as part of a CRLF pair. */
3505 else if (src
>= src_end
|| *src
!= '\n')
3506 this_eol_type
= CODING_EOL_CR
;
3508 this_eol_type
= CODING_EOL_CRLF
, src
++;
3510 if (eol_type
== CODING_EOL_UNDECIDED
)
3511 /* This is the first end-of-line. */
3512 eol_type
= this_eol_type
;
3513 else if (eol_type
!= this_eol_type
)
3515 /* The found type is different from what found before. */
3516 eol_type
= CODING_EOL_INCONSISTENT
;
3523 *skip
= src_end
- source
;
3527 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3528 is encoded. If it detects an appropriate format of end-of-line, it
3529 sets the information in *CODING.

The format comes from detect_eol_type. CODING->heading_ascii is
lowered to the skip count reported by detect_eol_type when that count
is smaller. An inconsistent result is handled as CODING_EOL_LF. */
3532 detect_eol (coding
, src
, src_bytes
)
3533 struct coding_system
*coding
;
3539 int eol_type
= detect_eol_type (src
, src_bytes
, &skip
);
/* After these statements CODING->heading_ascii and SKIP both hold the
smaller of the two counts. */
3541 if (coding
->heading_ascii
> skip
)
3542 coding
->heading_ascii
= skip
;
3544 skip
= coding
->heading_ascii
;
3546 if (eol_type
== CODING_EOL_UNDECIDED
)
3548 if (eol_type
== CODING_EOL_INCONSISTENT
)
3551 /* This code is suppressed until we find a better way to
3552 distinguish raw text file and binary file. */
3554 /* If we have already detected that the coding is raw-text, the
3555 coding should actually be no-conversion. */
3556 if (coding
->type
== coding_type_raw_text
)
3558 setup_coding_system (Qno_conversion
, coding
);
3561 /* Else, let's decode only text code anyway. */
3563 eol_type
= CODING_EOL_LF
;
/* Switch CODING to the variant for EOL_TYPE, provided the Qeol_type
property of CODING->symbol is a 3-element vector. heading_ascii is
stored again afterwards; a comment in detect_coding notes that
setup_coding_system resets that member. */
3566 val
= Fget (coding
->symbol
, Qeol_type
);
3567 if (VECTORP (val
) && XVECTOR (val
)->size
== 3)
3569 setup_coding_system (XVECTOR (val
)->contents
[eol_type
], coding
);
3570 coding
->heading_ascii
= skip
;
/* Slack, in bytes, added to every size estimate computed by
decoding_buffer_size and encoding_buffer_size. */
3574 #define CONVERSION_BUFFER_EXTRA_ROOM 256
/* Multiplier applied to the source byte count by decoding_buffer_size.
It is selected by CODING->type; for a CCL coding system it is the
buf_magnification of the decoder program. */
3576 #define DECODING_BUFFER_MAG(coding) \
3577 (coding->type == coding_type_iso2022 \
3579 : ((coding->type == coding_type_sjis || coding->type == coding_type_big5) \
3581 : (coding->type == coding_type_raw_text \
3583 : (coding->type == coding_type_ccl \
3584 ? coding->spec.ccl.decoder.buf_magnification \
3587 /* Return maximum size (bytes) of a buffer enough for decoding
3588 SRC_BYTES of text encoded in CODING. */
3591 decoding_buffer_size (coding
, src_bytes
)
3592 struct coding_system
*coding
;
3595 return (src_bytes
* DECODING_BUFFER_MAG (coding
)
3596 + CONVERSION_BUFFER_EXTRA_ROOM
);
3599 /* Return maximum size (bytes) of a buffer enough for encoding
3600 SRC_BYTES of text to CODING.

The estimate is SRC_BYTES * MAGNIFICATION + CONVERSION_BUFFER_EXTRA_ROOM.
For a CCL coding system MAGNIFICATION is the buf_magnification of the
encoder program. */
3603 encoding_buffer_size (coding
, src_bytes
)
3604 struct coding_system
*coding
;
3609 if (coding
->type
== coding_type_ccl
)
3610 magnification
= coding
->spec
.ccl
.encoder
.buf_magnification
;
3614 return (src_bytes
* magnification
+ CONVERSION_BUFFER_EXTRA_ROOM
);
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* Scratch buffer shared by the encoding and decoding routines, and
   the number of bytes currently allocated for it.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.  When the current buffer is too small, a
   larger one is allocated (the size grows by repeated doubling) and
   the old one is freed; its previous contents are NOT copied over.

   NOTE(review): the result of xmalloc is used unchecked, so NULL is
   returned on memory exhaustion only if xmalloc itself returns NULL.
   Presumably xmalloc reports the failure by other means -- confirm.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      /* Start doubling from a positive size.  If the buffer has not
	 been set up yet, conversion_buffer_size is zero; doubling
	 zero never reaches SIZE, so the loop below would not
	 terminate.  */
      int real_size = (conversion_buffer_size > 0
		       ? conversion_buffer_size * 2
		       : MINIMUM_CONVERSION_BUFFER_SIZE);

      while (real_size < size) real_size *= 2;
      buf = (char *) xmalloc (real_size);
      /* There is nothing to release before the first allocation.  */
      if (conversion_buffer)
	xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
/* Run the CCL program of CODING on SRC_BYTES bytes at SOURCE: the
encoder program if ENCODEP is nonzero, otherwise the decoder program.
SOURCE, DESTINATION, SRC_BYTES and DST_BYTES are passed through to
ccl_driver unchanged.

On completion CODING->produced and CODING->consumed hold the byte
counts reported by ccl_driver, and CODING->produced_char and
CODING->consumed_char hold the corresponding character counts computed
by multibyte_chars_in_text. The final status of the CCL program is
translated into a CODING_FINISH_XXX code in RESULT. */
3647 ccl_coding_driver (coding
, source
, destination
, src_bytes
, dst_bytes
, encodep
)
3648 struct coding_system
*coding
;
3649 unsigned char *source
, *destination
;
3650 int src_bytes
, dst_bytes
, encodep
;
3652 struct ccl_program
*ccl
3653 = encodep
? &coding
->spec
.ccl
.encoder
: &coding
->spec
.ccl
.decoder
;
/* Tell the CCL program whether this is the last block of input. */
3656 ccl
->last_block
= coding
->mode
& CODING_MODE_LAST_BLOCK
;
3658 coding
->produced
= ccl_driver (ccl
, source
, destination
,
3659 src_bytes
, dst_bytes
, &(coding
->consumed
));
3660 coding
->produced_char
3661 = multibyte_chars_in_text (destination
, coding
->produced
);
3662 coding
->consumed_char
3663 = multibyte_chars_in_text (source
, coding
->consumed
);
/* Map the CCL status onto the generic conversion result codes. */
3665 switch (ccl
->status
)
3667 case CCL_STAT_SUSPEND_BY_SRC
:
3668 result
= CODING_FINISH_INSUFFICIENT_SRC
;
3670 case CCL_STAT_SUSPEND_BY_DST
:
3671 result
= CODING_FINISH_INSUFFICIENT_DST
;
3674 case CCL_STAT_INVALID_CMD
:
3675 result
= CODING_FINISH_INTERRUPT
;
3678 result
= CODING_FINISH_NORMAL
;
3684 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
3685 decoding, it may detect coding system and format of end-of-line if
3686 those are not yet decided.

The work is dispatched on CODING->type to decode_eol,
decode_coding_sjis_big5, decode_coding_iso2022 or ccl_coding_driver.
Text that needs no conversion is copied from SOURCE to DESTINATION
as is. On that path a DST_BYTES of zero disables the destination size
check. RESULT holds a CODING_FINISH_XXX code. */
3689 decode_coding (coding
, source
, destination
, src_bytes
, dst_bytes
)
3690 struct coding_system
*coding
;
3691 unsigned char *source
, *destination
;
3692 int src_bytes
, dst_bytes
;
3697 && ! (coding
->mode
& CODING_MODE_LAST_BLOCK
3698 && CODING_REQUIRE_FLUSHING (coding
)))
/* Early exit: report that nothing was consumed or produced. It is not
taken when this is the last block and CODING requires flushing. */
3700 coding
->produced
= coding
->produced_char
= 0;
3701 coding
->consumed
= coding
->consumed_char
= 0;
3702 coding
->fake_multibyte
= 0;
3703 return CODING_FINISH_NORMAL
;
3706 if (coding
->type
== coding_type_undecided
)
3707 detect_coding (coding
, source
, src_bytes
);
3709 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
3710 detect_eol (coding
, source
, src_bytes
);
3712 switch (coding
->type
)
3714 case coding_type_emacs_mule
:
3715 case coding_type_undecided
:
3716 case coding_type_raw_text
:
/* For these types only the end-of-line format can need decoding;
text whose format is LF or still undecided is copied as is. */
3717 if (coding
->eol_type
== CODING_EOL_LF
3718 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
3719 goto label_no_conversion
;
3720 result
= decode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
);
3723 case coding_type_sjis
:
3724 result
= decode_coding_sjis_big5 (coding
, source
, destination
,
3725 src_bytes
, dst_bytes
, 1);
3728 case coding_type_iso2022
:
3729 result
= decode_coding_iso2022 (coding
, source
, destination
,
3730 src_bytes
, dst_bytes
);
3733 case coding_type_big5
:
3734 result
= decode_coding_sjis_big5 (coding
, source
, destination
,
3735 src_bytes
, dst_bytes
, 0);
3738 case coding_type_ccl
:
3739 result
= ccl_coding_driver (coding
, source
, destination
,
3740 src_bytes
, dst_bytes
, 0);
3743 default: /* i.e. case coding_type_no_conversion: */
3744 label_no_conversion
:
/* A nonzero DST_BYTES smaller than SRC_BYTES limits the copy to
DST_BYTES bytes and is reported as insufficient destination. */
3745 if (dst_bytes
&& src_bytes
> dst_bytes
)
3747 coding
->produced
= dst_bytes
;
3748 result
= CODING_FINISH_INSUFFICIENT_DST
;
3752 coding
->produced
= src_bytes
;
3753 result
= CODING_FINISH_NORMAL
;
3756 bcopy (source
, destination
, coding
->produced
);
3758 safe_bcopy (source
, destination
, coding
->produced
);
3759 coding
->fake_multibyte
= 1;
/* On this path the character counts are set equal to the byte count
that was copied. */
3761 = coding
->consumed_char
= coding
->produced_char
= coding
->produced
;
3768 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".

The work is dispatched on CODING->type to encode_eol,
encode_coding_sjis_big5, encode_coding_iso2022 or ccl_coding_driver.
Text that needs no conversion is copied from SOURCE to DESTINATION.
On that path a DST_BYTES of zero disables the destination size check,
and CR bytes in the copy are replaced by LF when CODING->mode has
CODING_MODE_SELECTIVE_DISPLAY set. RESULT holds a CODING_FINISH_XXX
code. */
3771 encode_coding (coding
, source
, destination
, src_bytes
, dst_bytes
)
3772 struct coding_system
*coding
;
3773 unsigned char *source
, *destination
;
3774 int src_bytes
, dst_bytes
;
3779 && ! (coding
->mode
& CODING_MODE_LAST_BLOCK
3780 && CODING_REQUIRE_FLUSHING (coding
)))
/* Early exit: report that nothing was consumed or produced. It is not
taken when this is the last block and CODING requires flushing. */
3782 coding
->produced
= coding
->produced_char
= 0;
3783 coding
->consumed
= coding
->consumed_char
= 0;
3784 coding
->fake_multibyte
= 0;
3785 return CODING_FINISH_NORMAL
;
3788 switch (coding
->type
)
3790 case coding_type_emacs_mule
:
3791 case coding_type_undecided
:
3792 case coding_type_raw_text
:
/* For these types only the end-of-line format can need encoding;
text whose format is LF or still undecided is copied as is. */
3793 if (coding
->eol_type
== CODING_EOL_LF
3794 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
3795 goto label_no_conversion
;
3796 result
= encode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
);
3799 case coding_type_sjis
:
3800 result
= encode_coding_sjis_big5 (coding
, source
, destination
,
3801 src_bytes
, dst_bytes
, 1);
3804 case coding_type_iso2022
:
3805 result
= encode_coding_iso2022 (coding
, source
, destination
,
3806 src_bytes
, dst_bytes
);
3809 case coding_type_big5
:
3810 result
= encode_coding_sjis_big5 (coding
, source
, destination
,
3811 src_bytes
, dst_bytes
, 0);
3814 case coding_type_ccl
:
3815 result
= ccl_coding_driver (coding
, source
, destination
,
3816 src_bytes
, dst_bytes
, 1);
3819 default: /* i.e. case coding_type_no_conversion: */
3820 label_no_conversion
:
/* A nonzero DST_BYTES smaller than SRC_BYTES limits the copy to
DST_BYTES bytes and is reported as insufficient destination. */
3821 if (dst_bytes
&& src_bytes
> dst_bytes
)
3823 coding
->produced
= dst_bytes
;
3824 result
= CODING_FINISH_INSUFFICIENT_DST
;
3828 coding
->produced
= src_bytes
;
3829 result
= CODING_FINISH_NORMAL
;
3832 bcopy (source
, destination
, coding
->produced
);
3834 safe_bcopy (source
, destination
, coding
->produced
);
/* Under selective display, CR ('\015') bytes in the copied output are
rewritten as LF. */
3835 if (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
)
3837 unsigned char *p
= destination
, *pend
= p
+ coding
->produced
;
3839 if (*p
++ == '\015') p
[-1] = '\n';
3841 coding
->fake_multibyte
= 1;
/* On this path the character counts are set equal to the byte count
that was copied. */
3843 = coding
->consumed_char
= coding
->produced_char
= coding
->produced
;
3850 /* Scan text in the region between *BEG and *END (byte positions),
3851 skip characters which we don't have to decode by coding system
3852 CODING at the head and tail, then set *BEG and *END to the region
3853 of the text we actually have to convert. The caller should move
3854 the gap out of the region in advance.
3856 If STR is not NULL, *BEG and *END are indices into STR. */
3859 shrink_decoding_region (beg
, end
, coding
, str
)
3861 struct coding_system
*coding
;
3864 unsigned char *begp_orig
, *begp
, *endp_orig
, *endp
, c
;
3866 Lisp_Object translation_table
;
3868 if (coding
->type
== coding_type_ccl
3869 || coding
->type
== coding_type_undecided
3870 || !NILP (coding
->post_read_conversion
))
3872 /* We can't skip any data. */
3875 else if (coding
->type
== coding_type_no_conversion
)
3877 /* We need no conversion, but don't have to skip any data here.
3878 Decoding routine handles them effectively anyway. */
3882 translation_table
= coding
->translation_table_for_decode
;
3883 if (NILP (translation_table
) && !NILP (Venable_character_translation
))
3884 translation_table
= Vstandard_translation_table_for_decode
;
3885 if (CHAR_TABLE_P (translation_table
))
3888 for (i
= 0; i
< 128; i
++)
3889 if (!NILP (CHAR_TABLE_REF (translation_table
, i
)))
3892 /* Some ASCII character should be tranlsated. We give up
3897 eol_conversion
= (coding
->eol_type
!= CODING_EOL_LF
);
3899 if ((! eol_conversion
) && (coding
->heading_ascii
>= 0))
3900 /* Detection routine has already found how much we can skip at the
3902 *beg
+= coding
->heading_ascii
;
3906 begp_orig
= begp
= str
+ *beg
;
3907 endp_orig
= endp
= str
+ *end
;
3911 begp_orig
= begp
= BYTE_POS_ADDR (*beg
);
3912 endp_orig
= endp
= begp
+ *end
- *beg
;
3915 switch (coding
->type
)
3917 case coding_type_emacs_mule
:
3918 case coding_type_raw_text
:
3921 if (coding
->heading_ascii
< 0)
3922 while (begp
< endp
&& *begp
!= '\r' && *begp
< 0x80) begp
++;
3923 while (begp
< endp
&& endp
[-1] != '\r' && endp
[-1] < 0x80)
3925 /* Do not consider LF as ascii if preceded by CR, since that
3926 confuses eol decoding. */
3927 if (begp
< endp
&& endp
< endp_orig
&& endp
[-1] == '\r' && endp
[0] == '\n')
3934 case coding_type_sjis
:
3935 case coding_type_big5
:
3936 /* We can skip all ASCII characters at the head. */
3937 if (coding
->heading_ascii
< 0)
3940 while (begp
< endp
&& *begp
< 0x80 && *begp
!= '\r') begp
++;
3942 while (begp
< endp
&& *begp
< 0x80) begp
++;
3944 /* We can skip all ASCII characters at the tail except for the
3945 second byte of SJIS or BIG5 code. */
3947 while (begp
< endp
&& endp
[-1] < 0x80 && endp
[-1] != '\r') endp
--;
3949 while (begp
< endp
&& endp
[-1] < 0x80) endp
--;
3950 /* Do not consider LF as ascii if preceded by CR, since that
3951 confuses eol decoding. */
3952 if (begp
< endp
&& endp
< endp_orig
&& endp
[-1] == '\r' && endp
[0] == '\n')
3954 if (begp
< endp
&& endp
< endp_orig
&& endp
[-1] >= 0x80)
3958 default: /* i.e. case coding_type_iso2022: */
3959 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, 0) != CHARSET_ASCII
)
3960 /* We can't skip any data. */
3962 if (coding
->heading_ascii
< 0)
3964 /* We can skip all ASCII characters at the head except for a
3965 few control codes. */
3966 while (begp
< endp
&& (c
= *begp
) < 0x80
3967 && c
!= ISO_CODE_CR
&& c
!= ISO_CODE_SO
3968 && c
!= ISO_CODE_SI
&& c
!= ISO_CODE_ESC
3969 && (!eol_conversion
|| c
!= ISO_CODE_LF
))
3972 switch (coding
->category_idx
)
3974 case CODING_CATEGORY_IDX_ISO_8_1
:
3975 case CODING_CATEGORY_IDX_ISO_8_2
:
3976 /* We can skip all ASCII characters at the tail. */
3978 while (begp
< endp
&& (c
= endp
[-1]) < 0x80 && c
!= '\r') endp
--;
3980 while (begp
< endp
&& endp
[-1] < 0x80) endp
--;
3981 /* Do not consider LF as ascii if preceded by CR, since that
3982 confuses eol decoding. */
3983 if (begp
< endp
&& endp
< endp_orig
&& endp
[-1] == '\r' && endp
[0] == '\n')
3987 case CODING_CATEGORY_IDX_ISO_7
:
3988 case CODING_CATEGORY_IDX_ISO_7_TIGHT
:
3990 /* We can skip all charactes at the tail except for 8-bit
3991 codes and ESC and the following 2-byte at the tail. */
3992 unsigned char *eight_bit
= NULL
;
3996 && (c
= endp
[-1]) != ISO_CODE_ESC
&& c
!= '\r')
3998 if (!eight_bit
&& c
& 0x80) eight_bit
= endp
;
4003 && (c
= endp
[-1]) != ISO_CODE_ESC
)
4005 if (!eight_bit
&& c
& 0x80) eight_bit
= endp
;
4008 /* Do not consider LF as ascii if preceded by CR, since that
4009 confuses eol decoding. */
4010 if (begp
< endp
&& endp
< endp_orig
4011 && endp
[-1] == '\r' && endp
[0] == '\n')
4013 if (begp
< endp
&& endp
[-1] == ISO_CODE_ESC
)
4015 if (endp
+ 1 < endp_orig
&& end
[0] == '(' && end
[1] == 'B')
4016 /* This is an ASCII designation sequence. We can
4017 surely skip the tail. But, if we have
4018 encountered an 8-bit code, skip only the codes
4020 endp
= eight_bit
? eight_bit
: endp
+ 2;
4022 /* Hmmm, we can't skip the tail. */
4030 *beg
+= begp
- begp_orig
;
4031 *end
+= endp
- endp_orig
;
4035 /* Like shrink_decoding_region but for encoding.

*BEG and *END are adjusted in place by the number of bytes skipped at
the head and at the tail. The text is addressed either as STR + *BEG
or through BYTE_POS_ADDR (*BEG). Nothing is skipped for a CCL coding
system. */
4038 shrink_encoding_region (beg
, end
, coding
, str
)
4040 struct coding_system
*coding
;
4043 unsigned char *begp_orig
, *begp
, *endp_orig
, *endp
;
4045 Lisp_Object translation_table
;
4047 if (coding
->type
== coding_type_ccl
)
4048 /* We can't skip any data. */
4050 else if (coding
->type
== coding_type_no_conversion
)
4052 /* We need no conversion. */
4057 translation_table
= coding
->translation_table_for_encode
;
4058 if (NILP (translation_table
) && !NILP (Venable_character_translation
))
4059 translation_table
= Vstandard_translation_table_for_encode
;
4060 if (CHAR_TABLE_P (translation_table
))
4063 for (i
= 0; i
< 128; i
++)
4064 if (!NILP (CHAR_TABLE_REF (translation_table
, i
)))
4067 /* Some ASCII character should be tranlsated. We give up
4074 begp_orig
= begp
= str
+ *beg
;
4075 endp_orig
= endp
= str
+ *end
;
4079 begp_orig
= begp
= BYTE_POS_ADDR (*beg
);
4080 endp_orig
= endp
= begp
+ *end
- *beg
;
4083 eol_conversion
= (coding
->eol_type
== CODING_EOL_CR
4084 || coding
->eol_type
== CODING_EOL_CRLF
);
4086 /* Here, we don't have to check coding->pre_write_conversion because
4087 the caller is expected to have handled it already. */
4088 switch (coding
->type
)
4090 case coding_type_undecided
:
4091 case coding_type_emacs_mule
:
4092 case coding_type_raw_text
:
/* Skip everything before the first newline and everything after the
last one. */
4095 while (begp
< endp
&& *begp
!= '\n') begp
++;
4096 while (begp
< endp
&& endp
[-1] != '\n') endp
--;
4102 case coding_type_iso2022
:
4103 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, 0) != CHARSET_ASCII
)
4104 /* We can't skip any data. */
4106 if (coding
->flags
& CODING_FLAG_ISO_DESIGNATE_AT_BOL
)
/* BOL starts at the head of the region; the scan below walks over
leading ASCII bytes and checks for newlines as it goes. */
4108 unsigned char *bol
= begp
;
4109 while (begp
< endp
&& *begp
< 0x80)
4112 if (begp
[-1] == '\n')
4116 goto label_skip_tail
;
4121 /* We can skip all ASCII characters at the head and tail. */
4123 while (begp
< endp
&& *begp
< 0x80 && *begp
!= '\n') begp
++;
4125 while (begp
< endp
&& *begp
< 0x80) begp
++;
4128 while (begp
< endp
&& endp
[-1] < 0x80 && endp
[-1] != '\n') endp
--;
4130 while (begp
< endp
&& *(endp
- 1) < 0x80) endp
--;
/* Report the shrunken region back to the caller. */
4134 *beg
+= begp
- begp_orig
;
4135 *end
+= endp
- endp_orig
;
4139 /* As shrinking a conversion region requires some overhead, we don't try
4140 shrinking if the length of the conversion region is less than this value. */
/* Length threshold, in bytes, tested by SHRINK_CONVERSION_REGION. */
4142 static int shrink_conversion_region_threshhold
= 1024;
/* Narrow the region between *BEG and *END with shrink_encoding_region
(ENCODEP nonzero) or shrink_decoding_region (ENCODEP zero), but only
when the region is longer than shrink_conversion_region_threshhold. */
4144 #define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep) \
4146 if (*(end) - *(beg) > shrink_conversion_region_threshhold) \
4148 if (encodep) shrink_encoding_region (beg, end, coding, str); \
4149 else shrink_decoding_region (beg, end, coding, str); \
4153 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
4154 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4155 coding system CODING, and return the status code of code conversion
4156 (currently, this value has no meaning).
4158 How many characters (and bytes) are converted to how many
4159 characters (and bytes) are recorded in members of the structure
CODING (consumed, consumed_char, produced and produced_char).
4162 If REPLACE is nonzero, we do various things as if the original text
4163 is deleted and a new text is inserted. See the comments in
4164 replace_range (insdel.c) to know what we are doing. */
4167 code_convert_region (from
, from_byte
, to
, to_byte
, coding
, encodep
, replace
)
4168 int from
, from_byte
, to
, to_byte
, encodep
, replace
;
4169 struct coding_system
*coding
;
4171 int len
= to
- from
, len_byte
= to_byte
- from_byte
;
4172 int require
, inserted
, inserted_byte
;
4173 int head_skip
, tail_skip
, total_skip
;
4174 Lisp_Object saved_coding_symbol
;
4175 int multibyte
= !NILP (current_buffer
->enable_multibyte_characters
);
4177 int fake_multibyte
= 0;
4178 unsigned char *src
, *dst
;
4179 Lisp_Object deletion
;
4180 int orig_point
= PT
, orig_len
= len
;
4184 saved_coding_symbol
= Qnil
;
/* When point lies strictly inside the region it is moved to FROM;
ORIG_POINT keeps the position it had on entry. */
4186 if (from
< PT
&& PT
< to
)
4188 TEMP_SET_PT_BOTH (from
, from_byte
);
4194 int saved_from
= from
;
/* prepare_to_modify_buffer is handed the address of FROM; when FROM
comes back changed, the byte positions are recomputed below. */
4196 prepare_to_modify_buffer (from
, to
, &from
);
4197 if (saved_from
!= from
)
4201 from_byte
= CHAR_TO_BYTE (from
), to_byte
= CHAR_TO_BYTE (to
);
4203 from_byte
= from
, to_byte
= to
;
4204 len_byte
= to_byte
- from_byte
;
4208 if (! encodep
&& CODING_REQUIRE_DETECTION (coding
))
4210 /* We must detect encoding of text and eol format. */
4212 if (from
< GPT
&& to
> GPT
)
4213 move_gap_both (from
, from_byte
);
4214 if (coding
->type
== coding_type_undecided
)
4216 detect_coding (coding
, BYTE_POS_ADDR (from_byte
), len_byte
);
4217 if (coding
->type
== coding_type_undecided
)
4218 /* It seems that the text contains only ASCII, but we
4219 should not leave it undecided because the deeper
4220 decoding routine (decode_coding) tries to detect the
4221 encodings again in vain. */
4222 coding
->type
= coding_type_emacs_mule
;
4224 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
4226 saved_coding_symbol
= coding
->symbol
;
4227 detect_eol (coding
, BYTE_POS_ADDR (from_byte
), len_byte
);
4228 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
4229 coding
->eol_type
= CODING_EOL_LF
;
4230 /* We had better recover the original eol format if we
4231 encounter an inconsistent eol format while decoding. */
4232 coding
->mode
|= CODING_MODE_INHIBIT_INCONSISTENT_EOL
;
4236 coding
->consumed_char
= len
, coding
->consumed
= len_byte
;
4239 ? ! CODING_REQUIRE_ENCODING (coding
)
4240 : ! CODING_REQUIRE_DECODING (coding
))
4242 coding
->produced
= len_byte
;
4245 /* See the comment of the member heading_ascii in coding.h. */
4246 && coding
->heading_ascii
< len_byte
)
4248 /* We still may have to combine byte at the head and the
4249 tail of the text in the region. */
4250 if (from
< GPT
&& GPT
< to
)
4251 move_gap_both (to
, to_byte
);
4252 len
= multibyte_chars_in_text (BYTE_POS_ADDR (from_byte
), len_byte
);
4253 adjust_after_insert (from
, from_byte
, to
, to_byte
, len
);
4254 coding
->produced_char
= len
;
4259 adjust_after_insert (from
, from_byte
, to
, to_byte
, len_byte
);
4260 coding
->produced_char
= len_byte
;
4265 /* Now we convert the text. */
4267 /* For encoding, we must process pre-write-conversion in advance. */
4269 && ! NILP (coding
->pre_write_conversion
)
4270 && SYMBOLP (coding
->pre_write_conversion
)
4271 && ! NILP (Ffboundp (coding
->pre_write_conversion
)))
4273 /* The function in pre-write-conversion may put a new text in a
4275 struct buffer
*prev
= current_buffer
;
4278 call2 (coding
->pre_write_conversion
,
4279 make_number (from
), make_number (to
));
4280 if (current_buffer
!= prev
)
4283 new = Fcurrent_buffer ();
4284 set_buffer_internal_1 (prev
);
4285 del_range_2 (from
, from_byte
, to
, to_byte
);
4286 TEMP_SET_PT_BOTH (from
, from_byte
);
4287 insert_from_buffer (XBUFFER (new), 1, len
, 0);
4289 if (orig_point
>= to
)
4290 orig_point
+= len
- orig_len
;
4291 else if (orig_point
> from
)
4295 from_byte
= multibyte
? CHAR_TO_BYTE (from
) : from_byte
;
4296 to_byte
= multibyte
? CHAR_TO_BYTE (to
) : to
;
4297 len_byte
= to_byte
- from_byte
;
4298 TEMP_SET_PT_BOTH (from
, from_byte
);
4303 deletion
= make_buffer_string_both (from
, from_byte
, to
, to_byte
, 1);
4305 /* Try to skip the heading and tailing ASCIIs. */
4307 int from_byte_orig
= from_byte
, to_byte_orig
= to_byte
;
4309 if (from
< GPT
&& GPT
< to
)
4310 move_gap_both (from
, from_byte
);
4311 SHRINK_CONVERSION_REGION (&from_byte
, &to_byte
, coding
, NULL
, encodep
);
4312 if (from_byte
== to_byte
4313 && ! (coding
->mode
& CODING_MODE_LAST_BLOCK
4314 && CODING_REQUIRE_FLUSHING (coding
)))
4316 coding
->produced
= len_byte
;
4317 coding
->produced_char
= multibyte
? len
: len_byte
;
4319 /* We must record and adjust for this new text now. */
4320 adjust_after_insert (from
, from_byte_orig
, to
, to_byte_orig
, len
);
/* Number of bytes SHRINK_CONVERSION_REGION trimmed off the head and
off the tail of the region. */
4324 head_skip
= from_byte
- from_byte_orig
;
4325 tail_skip
= to_byte_orig
- to_byte
;
4326 total_skip
= head_skip
+ tail_skip
;
4329 len
-= total_skip
; len_byte
-= total_skip
;
4332 /* The code conversion routine can not preserve text properties for
4333 now. So, we must remove all text properties in the region.
4334 Here, we must suppress all modification hooks. */
4337 int saved_inhibit_modification_hooks
= inhibit_modification_hooks
;
4338 inhibit_modification_hooks
= 1;
4339 Fset_text_properties (make_number (from
), make_number (to
), Qnil
, Qnil
);
4340 inhibit_modification_hooks
= saved_inhibit_modification_hooks
;
4343 /* For conversion, we must put the gap before the text in addition to
4344 making the gap larger for efficient decoding. The required gap
4345 size starts from 2000 which is the magic number used in make_gap.
4346 But, after one batch of conversion, it will be incremented if we
4347 find that it is not enough . */
4350 if (GAP_SIZE
< require
)
4351 make_gap (require
- GAP_SIZE
);
4352 move_gap_both (from
, from_byte
);
4354 inserted
= inserted_byte
= 0;
/* SRC starts at the end of the gap and DST at its beginning; the
diagrams further down show the layout this sets up. */
4355 src
= GAP_END_ADDR
, dst
= GPT_ADDR
;
4357 GAP_SIZE
+= len_byte
;
4360 ZV_BYTE
-= len_byte
;
4363 if (GPT
- BEG
< beg_unchanged
)
4364 beg_unchanged
= GPT
- BEG
;
4365 if (Z
- GPT
< end_unchanged
)
4366 end_unchanged
= Z
- GPT
;
4372 /* The buffer memory is changed from:
4373 +--------+converted-text+---------+-------original-text------+---+
4374 |<-from->|<--inserted-->|---------|<-----------len---------->|---|
4375 |<------------------- GAP_SIZE -------------------->| */
4377 result
= encode_coding (coding
, src
, dst
, len_byte
, 0);
4379 result
= decode_coding (coding
, src
, dst
, len_byte
, 0);
4381 +--------+-------converted-text--------+--+---original-text--+---+
4382 |<-from->|<--inserted-->|<--produced-->|--|<-(len-consumed)->|---|
4383 |<------------------- GAP_SIZE -------------------->| */
4384 if (coding
->fake_multibyte
)
4387 if (!encodep
&& !multibyte
)
4388 coding
->produced_char
= coding
->produced
;
4389 inserted
+= coding
->produced_char
;
4390 inserted_byte
+= coding
->produced
;
4391 len_byte
-= coding
->consumed
;
4392 src
+= coding
->consumed
;
4393 dst
+= inserted_byte
;
4395 if (result
== CODING_FINISH_NORMAL
)
4400 if (! encodep
&& result
== CODING_FINISH_INCONSISTENT_EOL
)
4402 unsigned char *pend
= dst
, *p
= pend
- inserted_byte
;
4404 /* Encode LFs back to the original eol format (CR or CRLF). */
4405 if (coding
->eol_type
== CODING_EOL_CR
)
4407 while (p
< pend
) if (*p
++ == '\n') p
[-1] = '\r';
4413 while (p
< pend
) if (*p
++ == '\n') count
++;
4414 if (src
- dst
< count
)
4416 /* We don't have sufficient room for putting LFs
4417 back to CRLF. We must record converted and
4418 not-yet-converted text back to the buffer
4419 content, enlarge the gap, then record them out of
4420 the buffer contents again. */
4421 int add
= len_byte
+ inserted_byte
;
4424 ZV
+= add
; Z
+= add
; ZV_BYTE
+= add
; Z_BYTE
+= add
;
4425 GPT
+= inserted_byte
; GPT_BYTE
+= inserted_byte
;
4426 make_gap (count
- GAP_SIZE
);
4428 ZV
-= add
; Z
-= add
; ZV_BYTE
-= add
; Z_BYTE
-= add
;
4429 GPT
-= inserted_byte
; GPT_BYTE
-= inserted_byte
;
4430 /* Don't forget to update SRC, DST, and PEND. */
4431 src
= GAP_END_ADDR
- len_byte
;
4432 dst
= GPT_ADDR
+ inserted_byte
;
4436 inserted_byte
+= count
;
4437 coding
->produced
+= count
;
4438 p
= dst
= pend
+ count
;
4442 if (*p
== '\n') count
--, *--p
= '\r';
4446 /* Suppress eol-format conversion in the further conversion. */
4447 coding
->eol_type
= CODING_EOL_LF
;
4449 /* Restore the original symbol. */
4450 coding
->symbol
= saved_coding_symbol
;
4456 if (result
== CODING_FINISH_INSUFFICIENT_SRC
)
4458 /* The source text ends in invalid codes. Let's just
4459 make them valid buffer contents, and finish conversion. */
4460 inserted
+= len_byte
;
4461 inserted_byte
+= len_byte
;
4467 if (result
== CODING_FINISH_INTERRUPT
)
4469 /* The conversion procedure was interrupted by a user. */
4473 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
4474 if (coding
->consumed
< 1)
4476 /* It's quite strange to require more memory without
4477 consuming any bytes. Perhaps CCL program bug. */
4483 /* We have just done the first batch of conversion which was
4484 stopped because of insufficient gap. Let's reconsider the
4485 required gap size (i.e. SRC - DST) now.
4487 We have converted ORIG bytes (== coding->consumed) into
4488 NEW bytes (coding->produced). To convert the remaining
4489 LEN bytes, we may need REQUIRE bytes of gap, where:
4490 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
4491 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
4492 Here, we are sure that NEW >= ORIG. */
4493 float ratio
= coding
->produced
- coding
->consumed
;
4494 ratio
/= coding
->consumed
;
4495 require
= len_byte
* ratio
;
4498 if ((src
- dst
) < (require
+ 2000))
4500 /* See the comment above the previous call of make_gap. */
4501 int add
= len_byte
+ inserted_byte
;
4504 ZV
+= add
; Z
+= add
; ZV_BYTE
+= add
; Z_BYTE
+= add
;
4505 GPT
+= inserted_byte
; GPT_BYTE
+= inserted_byte
;
4506 make_gap (require
+ 2000);
4508 ZV
-= add
; Z
-= add
; ZV_BYTE
-= add
; Z_BYTE
-= add
;
4509 GPT
-= inserted_byte
; GPT_BYTE
-= inserted_byte
;
4510 /* Don't forget to update SRC, DST. */
4511 src
= GAP_END_ADDR
- len_byte
;
4512 dst
= GPT_ADDR
+ inserted_byte
;
4515 if (src
- dst
> 0) *dst
= 0; /* Put an anchor. */
4520 || (to
- from
) != (to_byte
- from_byte
)))
4521 inserted
= multibyte_chars_in_text (GPT_ADDR
, inserted_byte
);
4523 /* If we have shrunk the conversion area, adjust it now. */
4527 safe_bcopy (GAP_END_ADDR
, GPT_ADDR
+ inserted_byte
, tail_skip
);
4528 inserted
+= total_skip
; inserted_byte
+= total_skip
;
4529 GAP_SIZE
+= total_skip
;
4530 GPT
-= head_skip
; GPT_BYTE
-= head_skip
;
4531 ZV
-= total_skip
; ZV_BYTE
-= total_skip
;
4532 Z
-= total_skip
; Z_BYTE
-= total_skip
;
4533 from
-= head_skip
; from_byte
-= head_skip
;
4534 to
+= tail_skip
; to_byte
+= tail_skip
;
4538 adjust_after_replace (from
, from_byte
, deletion
, inserted
, inserted_byte
);
4539 inserted
= Z
- prev_Z
;
4541 if (! encodep
&& ! NILP (coding
->post_read_conversion
))
4546 TEMP_SET_PT_BOTH (from
, from_byte
);
4548 val
= call1 (coding
->post_read_conversion
, make_number (inserted
));
4549 CHECK_NUMBER (val
, 0);
4550 inserted
= Z
- prev_Z
;
/* Restore point. A point that was at or after the end of the original
region is shifted by the change in length of the region. */
4553 if (orig_point
>= from
)
4555 if (orig_point
>= from
+ orig_len
)
4556 orig_point
+= inserted
- orig_len
;
4559 TEMP_SET_PT (orig_point
);
4562 signal_after_change (from
, to
- from
, inserted
);
/* Record the totals for the whole region in CODING. */
4565 coding
->consumed
= to_byte
- from_byte
;
4566 coding
->consumed_char
= to
- from
;
4567 coding
->produced
= inserted_byte
;
4568 coding
->produced_char
= inserted
;
4575 code_convert_string (str
, coding
, encodep
, nocopy
)
4577 struct coding_system
*coding
;
4578 int encodep
, nocopy
;
4582 int from
= 0, to
= XSTRING (str
)->size
;
4583 int to_byte
= STRING_BYTES (XSTRING (str
));
4584 struct gcpro gcpro1
;
4585 Lisp_Object saved_coding_symbol
;
4588 saved_coding_symbol
= Qnil
;
4589 if (encodep
&& !NILP (coding
->pre_write_conversion
)
4590 || !encodep
&& !NILP (coding
->post_read_conversion
))
4592 /* Since we have to call Lisp functions which assume target text
4593 is in a buffer, after setting a temporary buffer, call
4594 code_convert_region. */
4595 int count
= specpdl_ptr
- specpdl
;
4596 struct buffer
*prev
= current_buffer
;
4598 record_unwind_protect (Fset_buffer
, Fcurrent_buffer ());
4599 temp_output_buffer_setup (" *code-converting-work*");
4600 set_buffer_internal (XBUFFER (Vstandard_output
));
4602 insert_from_string (str
, 0, 0, to
, to_byte
, 0);
4605 /* We must insert the contents of STR as is without
4606 unibyte<->multibyte conversion. */
4607 current_buffer
->enable_multibyte_characters
= Qnil
;
4608 insert_from_string (str
, 0, 0, to_byte
, to_byte
, 0);
4609 current_buffer
->enable_multibyte_characters
= Qt
;
4611 code_convert_region (BEGV
, BEGV_BYTE
, ZV
, ZV_BYTE
, coding
, encodep
, 1);
4613 /* We must return the buffer contents as unibyte string. */
4614 current_buffer
->enable_multibyte_characters
= Qnil
;
4615 str
= make_buffer_string (BEGV
, ZV
, 0);
4616 set_buffer_internal (prev
);
4617 return unbind_to (count
, str
);
4620 if (! encodep
&& CODING_REQUIRE_DETECTION (coding
))
4622 /* See the comments in code_convert_region. */
4623 if (coding
->type
== coding_type_undecided
)
4625 detect_coding (coding
, XSTRING (str
)->data
, to_byte
);
4626 if (coding
->type
== coding_type_undecided
)
4627 coding
->type
= coding_type_emacs_mule
;
4629 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
4631 saved_coding_symbol
= coding
->symbol
;
4632 detect_eol (coding
, XSTRING (str
)->data
, to_byte
);
4633 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
4634 coding
->eol_type
= CODING_EOL_LF
;
4635 /* We had better recover the original eol format if we
4636 encounter an inconsistent eol format while decoding. */
4637 coding
->mode
|= CODING_MODE_INHIBIT_INCONSISTENT_EOL
;
4642 ? ! CODING_REQUIRE_ENCODING (coding
)
4643 : ! CODING_REQUIRE_DECODING (coding
))
4647 /* Try to skip the heading and tailing ASCIIs. */
4648 SHRINK_CONVERSION_REGION (&from
, &to_byte
, coding
, XSTRING (str
)->data
,
4652 && coding
->type
!= coding_type_ccl
)
4653 return (nocopy
? str
: Fcopy_sequence (str
));
4656 len
= encoding_buffer_size (coding
, to_byte
- from
);
4658 len
= decoding_buffer_size (coding
, to_byte
- from
);
4659 len
+= from
+ STRING_BYTES (XSTRING (str
)) - to_byte
;
4661 buf
= get_conversion_buffer (len
);
4665 bcopy (XSTRING (str
)->data
, buf
, from
);
4667 ? encode_coding (coding
, XSTRING (str
)->data
+ from
,
4668 buf
+ from
, to_byte
- from
, len
)
4669 : decode_coding (coding
, XSTRING (str
)->data
+ from
,
4670 buf
+ from
, to_byte
- from
, len
));
4671 if (! encodep
&& result
== CODING_FINISH_INCONSISTENT_EOL
)
4673 /* We simply try to decode the whole string again but without
4674 eol-conversion this time. */
4675 coding
->eol_type
= CODING_EOL_LF
;
4676 coding
->symbol
= saved_coding_symbol
;
4677 return code_convert_string (str
, coding
, encodep
, nocopy
);
4680 bcopy (XSTRING (str
)->data
+ to_byte
, buf
+ from
+ coding
->produced
,
4681 STRING_BYTES (XSTRING (str
)) - to_byte
);
4683 len
= from
+ STRING_BYTES (XSTRING (str
)) - to_byte
;
4685 str
= make_unibyte_string (buf
, len
+ coding
->produced
);
4688 int chars
= (coding
->fake_multibyte
4689 ? multibyte_chars_in_text (buf
+ from
, coding
->produced
)
4690 : coding
->produced_char
);
4691 str
= make_multibyte_string (buf
, len
+ chars
, len
+ coding
->produced
);
4699 /*** 8. Emacs Lisp library functions ***/
4701 DEFUN ("coding-system-p", Fcoding_system_p
, Scoding_system_p
, 1, 1, 0,
4702 "Return t if OBJECT is nil or a coding-system.\n\
4703 See the documentation of `make-coding-system' for information\n\
4704 about coding-system objects.")
4712 /* Get coding-spec vector for OBJ. */
4713 obj
= Fget (obj
, Qcoding_system
);
4714 return ((VECTORP (obj
) && XVECTOR (obj
)->size
== 5)
4718 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system
,
4719 Sread_non_nil_coding_system
, 1, 1, 0,
4720 "Read a coding system from the minibuffer, prompting with string PROMPT.")
4727 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
4728 Qt
, Qnil
, Qcoding_system_history
, Qnil
, Qnil
);
4730 while (XSTRING (val
)->size
== 0);
4731 return (Fintern (val
, Qnil
));
4734 DEFUN ("read-coding-system", Fread_coding_system
, Sread_coding_system
, 1, 2, 0,
4735 "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
4736 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
4737 (prompt
, default_coding_system
)
4738 Lisp_Object prompt
, default_coding_system
;
4741 if (SYMBOLP (default_coding_system
))
4742 XSETSTRING (default_coding_system
, XSYMBOL (default_coding_system
)->name
);
4743 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
4744 Qt
, Qnil
, Qcoding_system_history
,
4745 default_coding_system
, Qnil
);
4746 return (XSTRING (val
)->size
== 0 ? Qnil
: Fintern (val
, Qnil
));
4749 DEFUN ("check-coding-system", Fcheck_coding_system
, Scheck_coding_system
,
4751 "Check validity of CODING-SYSTEM.\n\
4752 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
4753 It is valid if it is a symbol with a non-nil `coding-system' property.\n\
4754 The value of property should be a vector of length 5.")
4756 Lisp_Object coding_system
;
4758 CHECK_SYMBOL (coding_system
, 0);
4759 if (!NILP (Fcoding_system_p (coding_system
)))
4760 return coding_system
;
4762 Fsignal (Qcoding_system_error
, Fcons (coding_system
, Qnil
));
4766 detect_coding_system (src
, src_bytes
, highest
)
4768 int src_bytes
, highest
;
4770 int coding_mask
, eol_type
;
4771 Lisp_Object val
, tmp
;
4774 coding_mask
= detect_coding_mask (src
, src_bytes
, NULL
, &dummy
);
4775 eol_type
= detect_eol_type (src
, src_bytes
, &dummy
);
4776 if (eol_type
== CODING_EOL_INCONSISTENT
)
4777 eol_type
= CODING_EOL_UNDECIDED
;
4782 if (eol_type
!= CODING_EOL_UNDECIDED
)
4785 val2
= Fget (Qundecided
, Qeol_type
);
4787 val
= XVECTOR (val2
)->contents
[eol_type
];
4789 return (highest
? val
: Fcons (val
, Qnil
));
4792 /* At first, gather possible coding systems in VAL. */
4794 for (tmp
= Vcoding_category_list
; !NILP (tmp
); tmp
= XCONS (tmp
)->cdr
)
4797 = XFASTINT (Fget (XCONS (tmp
)->car
, Qcoding_category_index
));
4798 if (coding_mask
& (1 << idx
))
4800 val
= Fcons (Fsymbol_value (XCONS (tmp
)->car
), val
);
4806 val
= Fnreverse (val
);
4808 /* Then, replace the elements with subsidiary coding systems. */
4809 for (tmp
= val
; !NILP (tmp
); tmp
= XCONS (tmp
)->cdr
)
4811 if (eol_type
!= CODING_EOL_UNDECIDED
4812 && eol_type
!= CODING_EOL_INCONSISTENT
)
4815 eol
= Fget (XCONS (tmp
)->car
, Qeol_type
);
4817 XCONS (tmp
)->car
= XVECTOR (eol
)->contents
[eol_type
];
4820 return (highest
? XCONS (val
)->car
: val
);
4823 DEFUN ("detect-coding-region", Fdetect_coding_region
, Sdetect_coding_region
,
4825 "Detect coding system of the text in the region between START and END.\n\
4826 Return a list of possible coding systems ordered by priority.\n\
4828 If only ASCII characters are found, it returns a list of single element\n\
4829 `undecided' or its subsidiary coding system according to a detected\n\
4830 end-of-line format.\n\
4832 If optional argument HIGHEST is non-nil, return the coding system of\n\
4834 (start
, end
, highest
)
4835 Lisp_Object start
, end
, highest
;
4838 int from_byte
, to_byte
;
4840 CHECK_NUMBER_COERCE_MARKER (start
, 0);
4841 CHECK_NUMBER_COERCE_MARKER (end
, 1);
4843 validate_region (&start
, &end
);
4844 from
= XINT (start
), to
= XINT (end
);
4845 from_byte
= CHAR_TO_BYTE (from
);
4846 to_byte
= CHAR_TO_BYTE (to
);
4848 if (from
< GPT
&& to
>= GPT
)
4849 move_gap_both (to
, to_byte
);
4851 return detect_coding_system (BYTE_POS_ADDR (from_byte
),
4852 to_byte
- from_byte
,
4856 DEFUN ("detect-coding-string", Fdetect_coding_string
, Sdetect_coding_string
,
4858 "Detect coding system of the text in STRING.\n\
4859 Return a list of possible coding systems ordered by priority.\n\
4861 If only ASCII characters are found, it returns a list of single element\n\
4862 `undecided' or its subsidiary coding system according to a detected\n\
4863 end-of-line format.\n\
4865 If optional argument HIGHEST is non-nil, return the coding system of\n\
4868 Lisp_Object string
, highest
;
4870 CHECK_STRING (string
, 0);
4872 return detect_coding_system (XSTRING (string
)->data
,
4873 STRING_BYTES (XSTRING (string
)),
4878 code_convert_region1 (start
, end
, coding_system
, encodep
)
4879 Lisp_Object start
, end
, coding_system
;
4882 struct coding_system coding
;
4885 CHECK_NUMBER_COERCE_MARKER (start
, 0);
4886 CHECK_NUMBER_COERCE_MARKER (end
, 1);
4887 CHECK_SYMBOL (coding_system
, 2);
4889 validate_region (&start
, &end
);
4890 from
= XFASTINT (start
);
4891 to
= XFASTINT (end
);
4893 if (NILP (coding_system
))
4894 return make_number (to
- from
);
4896 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
4897 error ("Invalid coding system: %s", XSYMBOL (coding_system
)->name
->data
);
4899 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
4900 code_convert_region (from
, CHAR_TO_BYTE (from
), to
, CHAR_TO_BYTE (to
),
4901 &coding
, encodep
, 1);
4902 Vlast_coding_system_used
= coding
.symbol
;
4903 return make_number (coding
.produced_char
);
4906 DEFUN ("decode-coding-region", Fdecode_coding_region
, Sdecode_coding_region
,
4907 3, 3, "r\nzCoding system: ",
4908 "Decode the current region by specified coding system.\n\
4909 When called from a program, takes three arguments:\n\
4910 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
4911 This function sets `last-coding-system-used' to the precise coding system\n\
4912 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
4913 not fully specified.)\n\
4914 It returns the length of the decoded text.")
4915 (start
, end
, coding_system
)
4916 Lisp_Object start
, end
, coding_system
;
4918 return code_convert_region1 (start
, end
, coding_system
, 0);
4921 DEFUN ("encode-coding-region", Fencode_coding_region
, Sencode_coding_region
,
4922 3, 3, "r\nzCoding system: ",
4923 "Encode the current region by specified coding system.\n\
4924 When called from a program, takes three arguments:\n\
4925 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
4926 This function sets `last-coding-system-used' to the precise coding system\n\
4927 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
4928 not fully specified.)\n\
4929 It returns the length of the encoded text.")
4930 (start
, end
, coding_system
)
4931 Lisp_Object start
, end
, coding_system
;
4933 return code_convert_region1 (start
, end
, coding_system
, 1);
4937 code_convert_string1 (string
, coding_system
, nocopy
, encodep
)
4938 Lisp_Object string
, coding_system
, nocopy
;
4941 struct coding_system coding
;
4943 CHECK_STRING (string
, 0);
4944 CHECK_SYMBOL (coding_system
, 1);
4946 if (NILP (coding_system
))
4947 return (NILP (nocopy
) ? Fcopy_sequence (string
) : string
);
4949 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
4950 error ("Invalid coding system: %s", XSYMBOL (coding_system
)->name
->data
);
4952 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
4953 Vlast_coding_system_used
= coding
.symbol
;
4954 return code_convert_string (string
, &coding
, encodep
, !NILP (nocopy
));
4957 DEFUN ("decode-coding-string", Fdecode_coding_string
, Sdecode_coding_string
,
4959 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
4960 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
4961 if the decoding operation is trivial.\n\
4962 This function sets `last-coding-system-used' to the precise coding system\n\
4963 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
4964 not fully specified.)")
4965 (string
, coding_system
, nocopy
)
4966 Lisp_Object string
, coding_system
, nocopy
;
4968 return code_convert_string1 (string
, coding_system
, nocopy
, 0);
4971 DEFUN ("encode-coding-string", Fencode_coding_string
, Sencode_coding_string
,
4973 "Encode STRING to CODING-SYSTEM, and return the result.\n\
4974 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
4975 if the encoding operation is trivial.\n\
4976 This function sets `last-coding-system-used' to the precise coding system\n\
4977 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
4978 not fully specified.)")
4979 (string
, coding_system
, nocopy
)
4980 Lisp_Object string
, coding_system
, nocopy
;
4982 return code_convert_string1 (string
, coding_system
, nocopy
, 1);
4985 /* Encode or decode STRING according to CODING_SYSTEM.
4986 Do not set Vlast_coding_system_used. */
4989 code_convert_string_norecord (string
, coding_system
, encodep
)
4990 Lisp_Object string
, coding_system
;
4993 struct coding_system coding
;
4995 CHECK_STRING (string
, 0);
4996 CHECK_SYMBOL (coding_system
, 1);
4998 if (NILP (coding_system
))
5001 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
5002 error ("Invalid coding system: %s", XSYMBOL (coding_system
)->name
->data
);
5004 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
5005 return code_convert_string (string
, &coding
, encodep
, Qt
);
5008 DEFUN ("decode-sjis-char", Fdecode_sjis_char
, Sdecode_sjis_char
, 1, 1, 0,
5009 "Decode a JISX0208 character of shift-jis encoding.\n\
5010 CODE is the character code in SJIS.\n\
5011 Return the corresponding character.")
5015 unsigned char c1
, c2
, s1
, s2
;
5018 CHECK_NUMBER (code
, 0);
5019 s1
= (XFASTINT (code
)) >> 8, s2
= (XFASTINT (code
)) & 0xFF;
5020 DECODE_SJIS (s1
, s2
, c1
, c2
);
5021 XSETFASTINT (val
, MAKE_NON_ASCII_CHAR (charset_jisx0208
, c1
, c2
));
5025 DEFUN ("encode-sjis-char", Fencode_sjis_char
, Sencode_sjis_char
, 1, 1, 0,
5026 "Encode a JISX0208 character CHAR to SJIS coding system.\n\
5027 Return the corresponding character code in SJIS.")
5031 int charset
, c1
, c2
, s1
, s2
;
5034 CHECK_NUMBER (ch
, 0);
5035 SPLIT_CHAR (XFASTINT (ch
), charset
, c1
, c2
);
5036 if (charset
== charset_jisx0208
)
5038 ENCODE_SJIS (c1
, c2
, s1
, s2
);
5039 XSETFASTINT (val
, (s1
<< 8) | s2
);
5042 XSETFASTINT (val
, 0);
5046 DEFUN ("decode-big5-char", Fdecode_big5_char
, Sdecode_big5_char
, 1, 1, 0,
5047 "Decode a Big5 character CODE of BIG5 coding system.\n\
5048 CODE is the character code in BIG5.\n\
5049 Return the corresponding character.")
5054 unsigned char b1
, b2
, c1
, c2
;
5057 CHECK_NUMBER (code
, 0);
5058 b1
= (XFASTINT (code
)) >> 8, b2
= (XFASTINT (code
)) & 0xFF;
5059 DECODE_BIG5 (b1
, b2
, charset
, c1
, c2
);
5060 XSETFASTINT (val
, MAKE_NON_ASCII_CHAR (charset
, c1
, c2
));
5064 DEFUN ("encode-big5-char", Fencode_big5_char
, Sencode_big5_char
, 1, 1, 0,
5065 "Encode the Big5 character CHAR to BIG5 coding system.\n\
5066 Return the corresponding character code in Big5.")
5070 int charset
, c1
, c2
, b1
, b2
;
5073 CHECK_NUMBER (ch
, 0);
5074 SPLIT_CHAR (XFASTINT (ch
), charset
, c1
, c2
);
5075 if (charset
== charset_big5_1
|| charset
== charset_big5_2
)
5077 ENCODE_BIG5 (charset
, c1
, c2
, b1
, b2
);
5078 XSETFASTINT (val
, (b1
<< 8) | b2
);
5081 XSETFASTINT (val
, 0);
5085 DEFUN ("set-terminal-coding-system-internal",
5086 Fset_terminal_coding_system_internal
,
5087 Sset_terminal_coding_system_internal
, 1, 1, 0, "")
5089 Lisp_Object coding_system
;
5091 CHECK_SYMBOL (coding_system
, 0);
5092 setup_coding_system (Fcheck_coding_system (coding_system
), &terminal_coding
);
5093 /* We had better not send unsafe characters to terminal. */
5094 terminal_coding
.flags
|= CODING_FLAG_ISO_SAFE
;
5099 DEFUN ("set-safe-terminal-coding-system-internal",
5100 Fset_safe_terminal_coding_system_internal
,
5101 Sset_safe_terminal_coding_system_internal
, 1, 1, 0, "")
5103 Lisp_Object coding_system
;
5105 CHECK_SYMBOL (coding_system
, 0);
5106 setup_coding_system (Fcheck_coding_system (coding_system
),
5107 &safe_terminal_coding
);
5111 DEFUN ("terminal-coding-system",
5112 Fterminal_coding_system
, Sterminal_coding_system
, 0, 0, 0,
5113 "Return coding system specified for terminal output.")
5116 return terminal_coding
.symbol
;
5119 DEFUN ("set-keyboard-coding-system-internal",
5120 Fset_keyboard_coding_system_internal
,
5121 Sset_keyboard_coding_system_internal
, 1, 1, 0, "")
5123 Lisp_Object coding_system
;
5125 CHECK_SYMBOL (coding_system
, 0);
5126 setup_coding_system (Fcheck_coding_system (coding_system
), &keyboard_coding
);
5130 DEFUN ("keyboard-coding-system",
5131 Fkeyboard_coding_system
, Skeyboard_coding_system
, 0, 0, 0,
5132 "Return coding system specified for decoding keyboard input.")
5135 return keyboard_coding
.symbol
;
5139 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system
,
5140 Sfind_operation_coding_system
, 1, MANY
, 0,
5141 "Choose a coding system for an operation based on the target name.\n\
5142 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
5143 DECODING-SYSTEM is the coding system to use for decoding\n\
5144 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
5145 for encoding (in case OPERATION does encoding).\n\
5147 The first argument OPERATION specifies an I/O primitive:\n\
5148 For file I/O, `insert-file-contents' or `write-region'.\n\
5149 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
5150 For network I/O, `open-network-stream'.\n\
5152 The remaining arguments should be the same arguments that were passed\n\
5153 to the primitive. Depending on which primitive, one of those arguments\n\
5154 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
5155 whichever argument specifies the file name is TARGET.\n\
5157 TARGET has a meaning which depends on OPERATION:\n\
5158 For file I/O, TARGET is a file name.\n\
5159 For process I/O, TARGET is a process name.\n\
5160 For network I/O, TARGET is a service name or a port number\n\
5162 This function looks up what specified for TARGET in,\n\
5163 `file-coding-system-alist', `process-coding-system-alist',\n\
5164 or `network-coding-system-alist' depending on OPERATION.\n\
5165 They may specify a coding system, a cons of coding systems,\n\
5166 or a function symbol to call.\n\
5167 In the last case, we call the function with one argument,\n\
5168 which is a list of all the arguments given to this function.")
5173 Lisp_Object operation
, target_idx
, target
, val
;
5174 register Lisp_Object chain
;
5177 error ("Too few arguments");
5178 operation
= args
[0];
5179 if (!SYMBOLP (operation
)
5180 || !INTEGERP (target_idx
= Fget (operation
, Qtarget_idx
)))
5181 error ("Invalid first arguement");
5182 if (nargs
< 1 + XINT (target_idx
))
5183 error ("Too few arguments for operation: %s",
5184 XSYMBOL (operation
)->name
->data
);
5185 target
= args
[XINT (target_idx
) + 1];
5186 if (!(STRINGP (target
)
5187 || (EQ (operation
, Qopen_network_stream
) && INTEGERP (target
))))
5188 error ("Invalid %dth argument", XINT (target_idx
) + 1);
5190 chain
= ((EQ (operation
, Qinsert_file_contents
)
5191 || EQ (operation
, Qwrite_region
))
5192 ? Vfile_coding_system_alist
5193 : (EQ (operation
, Qopen_network_stream
)
5194 ? Vnetwork_coding_system_alist
5195 : Vprocess_coding_system_alist
));
5199 for (; CONSP (chain
); chain
= XCONS (chain
)->cdr
)
5202 elt
= XCONS (chain
)->car
;
5205 && ((STRINGP (target
)
5206 && STRINGP (XCONS (elt
)->car
)
5207 && fast_string_match (XCONS (elt
)->car
, target
) >= 0)
5208 || (INTEGERP (target
) && EQ (target
, XCONS (elt
)->car
))))
5210 val
= XCONS (elt
)->cdr
;
5211 /* Here, if VAL is both a valid coding system and a valid
5212 function symbol, we return VAL as a coding system. */
5215 if (! SYMBOLP (val
))
5217 if (! NILP (Fcoding_system_p (val
)))
5218 return Fcons (val
, val
);
5219 if (! NILP (Ffboundp (val
)))
5221 val
= call1 (val
, Flist (nargs
, args
));
5224 if (SYMBOLP (val
) && ! NILP (Fcoding_system_p (val
)))
5225 return Fcons (val
, val
);
5233 DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal
,
5234 Supdate_coding_systems_internal
, 0, 0, 0,
5235 "Update internal database for ISO2022 and CCL based coding systems.\n\
5236 When values of the following coding categories are changed, you must\n\
5237 call this function:\n\
5238 coding-category-iso-7, coding-category-iso-7-tight,\n\
5239 coding-category-iso-8-1, coding-category-iso-8-2,\n\
5240 coding-category-iso-7-else, coding-category-iso-8-else,\n\
5241 coding-category-ccl")
5246 for (i
= CODING_CATEGORY_IDX_ISO_7
; i
<= CODING_CATEGORY_IDX_CCL
; i
++)
5250 val
= XSYMBOL (XVECTOR (Vcoding_category_table
)->contents
[i
])->value
;
5253 if (! coding_system_table
[i
])
5254 coding_system_table
[i
] = ((struct coding_system
*)
5255 xmalloc (sizeof (struct coding_system
)));
5256 setup_coding_system (val
, coding_system_table
[i
]);
5258 else if (coding_system_table
[i
])
5260 xfree (coding_system_table
[i
]);
5261 coding_system_table
[i
] = NULL
;
5268 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal
,
5269 Sset_coding_priority_internal
, 0, 0, 0,
5270 "Update internal database for the current value of `coding-category-list'.\n\
5271 This function is internal use only.")
5277 val
= Vcoding_category_list
;
5279 while (CONSP (val
) && i
< CODING_CATEGORY_IDX_MAX
)
5281 if (! SYMBOLP (XCONS (val
)->car
))
5283 idx
= XFASTINT (Fget (XCONS (val
)->car
, Qcoding_category_index
));
5284 if (idx
>= CODING_CATEGORY_IDX_MAX
)
5286 coding_priorities
[i
++] = (1 << idx
);
5287 val
= XCONS (val
)->cdr
;
5289 /* If coding-category-list is valid and contains all coding
5290 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
5291 the following code saves Emacs from craching. */
5292 while (i
< CODING_CATEGORY_IDX_MAX
)
5293 coding_priorities
[i
++] = CODING_CATEGORY_MASK_RAW_TEXT
;
5301 /*** 9. Post-amble ***/
5306 conversion_buffer
= (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE
);
5314 /* Emacs' internal format specific initialize routine. */
5315 for (i
= 0; i
<= 0x20; i
++)
5316 emacs_code_class
[i
] = EMACS_control_code
;
5317 emacs_code_class
[0x0A] = EMACS_linefeed_code
;
5318 emacs_code_class
[0x0D] = EMACS_carriage_return_code
;
5319 for (i
= 0x21 ; i
< 0x7F; i
++)
5320 emacs_code_class
[i
] = EMACS_ascii_code
;
5321 emacs_code_class
[0x7F] = EMACS_control_code
;
5322 emacs_code_class
[0x80] = EMACS_leading_code_composition
;
5323 for (i
= 0x81; i
< 0xFF; i
++)
5324 emacs_code_class
[i
] = EMACS_invalid_code
;
5325 emacs_code_class
[LEADING_CODE_PRIVATE_11
] = EMACS_leading_code_3
;
5326 emacs_code_class
[LEADING_CODE_PRIVATE_12
] = EMACS_leading_code_3
;
5327 emacs_code_class
[LEADING_CODE_PRIVATE_21
] = EMACS_leading_code_4
;
5328 emacs_code_class
[LEADING_CODE_PRIVATE_22
] = EMACS_leading_code_4
;
5330 /* ISO2022 specific initialize routine. */
5331 for (i
= 0; i
< 0x20; i
++)
5332 iso_code_class
[i
] = ISO_control_code
;
5333 for (i
= 0x21; i
< 0x7F; i
++)
5334 iso_code_class
[i
] = ISO_graphic_plane_0
;
5335 for (i
= 0x80; i
< 0xA0; i
++)
5336 iso_code_class
[i
] = ISO_control_code
;
5337 for (i
= 0xA1; i
< 0xFF; i
++)
5338 iso_code_class
[i
] = ISO_graphic_plane_1
;
5339 iso_code_class
[0x20] = iso_code_class
[0x7F] = ISO_0x20_or_0x7F
;
5340 iso_code_class
[0xA0] = iso_code_class
[0xFF] = ISO_0xA0_or_0xFF
;
5341 iso_code_class
[ISO_CODE_CR
] = ISO_carriage_return
;
5342 iso_code_class
[ISO_CODE_SO
] = ISO_shift_out
;
5343 iso_code_class
[ISO_CODE_SI
] = ISO_shift_in
;
5344 iso_code_class
[ISO_CODE_SS2_7
] = ISO_single_shift_2_7
;
5345 iso_code_class
[ISO_CODE_ESC
] = ISO_escape
;
5346 iso_code_class
[ISO_CODE_SS2
] = ISO_single_shift_2
;
5347 iso_code_class
[ISO_CODE_SS3
] = ISO_single_shift_3
;
5348 iso_code_class
[ISO_CODE_CSI
] = ISO_control_sequence_introducer
;
5350 conversion_buffer_size
= MINIMUM_CONVERSION_BUFFER_SIZE
;
5352 setup_coding_system (Qnil
, &keyboard_coding
);
5353 setup_coding_system (Qnil
, &terminal_coding
);
5354 setup_coding_system (Qnil
, &safe_terminal_coding
);
5355 setup_coding_system (Qnil
, &default_buffer_file_coding
);
5357 bzero (coding_system_table
, sizeof coding_system_table
);
5359 bzero (ascii_skip_code
, sizeof ascii_skip_code
);
5360 for (i
= 0; i
< 128; i
++)
5361 ascii_skip_code
[i
] = 1;
5363 #if defined (MSDOS) || defined (WINDOWSNT)
5364 system_eol_type
= CODING_EOL_CRLF
;
5366 system_eol_type
= CODING_EOL_LF
;
5375 Qtarget_idx
= intern ("target-idx");
5376 staticpro (&Qtarget_idx
);
5378 Qcoding_system_history
= intern ("coding-system-history");
5379 staticpro (&Qcoding_system_history
);
5380 Fset (Qcoding_system_history
, Qnil
);
5382 /* Target FILENAME is the first argument. */
5383 Fput (Qinsert_file_contents
, Qtarget_idx
, make_number (0));
5384 /* Target FILENAME is the third argument. */
5385 Fput (Qwrite_region
, Qtarget_idx
, make_number (2));
5387 Qcall_process
= intern ("call-process");
5388 staticpro (&Qcall_process
);
5389 /* Target PROGRAM is the first argument. */
5390 Fput (Qcall_process
, Qtarget_idx
, make_number (0));
5392 Qcall_process_region
= intern ("call-process-region");
5393 staticpro (&Qcall_process_region
);
5394 /* Target PROGRAM is the third argument. */
5395 Fput (Qcall_process_region
, Qtarget_idx
, make_number (2));
5397 Qstart_process
= intern ("start-process");
5398 staticpro (&Qstart_process
);
5399 /* Target PROGRAM is the third argument. */
5400 Fput (Qstart_process
, Qtarget_idx
, make_number (2));
5402 Qopen_network_stream
= intern ("open-network-stream");
5403 staticpro (&Qopen_network_stream
);
5404 /* Target SERVICE is the fourth argument. */
5405 Fput (Qopen_network_stream
, Qtarget_idx
, make_number (3));
5407 Qcoding_system
= intern ("coding-system");
5408 staticpro (&Qcoding_system
);
5410 Qeol_type
= intern ("eol-type");
5411 staticpro (&Qeol_type
);
5413 Qbuffer_file_coding_system
= intern ("buffer-file-coding-system");
5414 staticpro (&Qbuffer_file_coding_system
);
5416 Qpost_read_conversion
= intern ("post-read-conversion");
5417 staticpro (&Qpost_read_conversion
);
5419 Qpre_write_conversion
= intern ("pre-write-conversion");
5420 staticpro (&Qpre_write_conversion
);
5422 Qno_conversion
= intern ("no-conversion");
5423 staticpro (&Qno_conversion
);
5425 Qundecided
= intern ("undecided");
5426 staticpro (&Qundecided
);
5428 Qcoding_system_p
= intern ("coding-system-p");
5429 staticpro (&Qcoding_system_p
);
5431 Qcoding_system_error
= intern ("coding-system-error");
5432 staticpro (&Qcoding_system_error
);
5434 Fput (Qcoding_system_error
, Qerror_conditions
,
5435 Fcons (Qcoding_system_error
, Fcons (Qerror
, Qnil
)));
5436 Fput (Qcoding_system_error
, Qerror_message
,
5437 build_string ("Invalid coding system"));
5439 Qcoding_category
= intern ("coding-category");
5440 staticpro (&Qcoding_category
);
5441 Qcoding_category_index
= intern ("coding-category-index");
5442 staticpro (&Qcoding_category_index
);
5444 Vcoding_category_table
5445 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX
), Qnil
);
5446 staticpro (&Vcoding_category_table
);
5449 for (i
= 0; i
< CODING_CATEGORY_IDX_MAX
; i
++)
5451 XVECTOR (Vcoding_category_table
)->contents
[i
]
5452 = intern (coding_category_name
[i
]);
5453 Fput (XVECTOR (Vcoding_category_table
)->contents
[i
],
5454 Qcoding_category_index
, make_number (i
));
5458 Qtranslation_table
= intern ("translation-table");
5459 staticpro (&Qtranslation_table
);
5460 Fput (Qtranslation_table
, Qchar_table_extra_slots
, make_number (1));
5462 Qtranslation_table_id
= intern ("translation-table-id");
5463 staticpro (&Qtranslation_table_id
);
5465 Qtranslation_table_for_decode
= intern ("translation-table-for-decode");
5466 staticpro (&Qtranslation_table_for_decode
);
5468 Qtranslation_table_for_encode
= intern ("translation-table-for-encode");
5469 staticpro (&Qtranslation_table_for_encode
);
5471 Qsafe_charsets
= intern ("safe-charsets");
5472 staticpro (&Qsafe_charsets
);
5474 Qvalid_codes
= intern ("valid-codes");
5475 staticpro (&Qvalid_codes
);
5477 Qemacs_mule
= intern ("emacs-mule");
5478 staticpro (&Qemacs_mule
);
5480 Qraw_text
= intern ("raw-text");
5481 staticpro (&Qraw_text
);
5483 defsubr (&Scoding_system_p
);
5484 defsubr (&Sread_coding_system
);
5485 defsubr (&Sread_non_nil_coding_system
);
5486 defsubr (&Scheck_coding_system
);
5487 defsubr (&Sdetect_coding_region
);
5488 defsubr (&Sdetect_coding_string
);
5489 defsubr (&Sdecode_coding_region
);
5490 defsubr (&Sencode_coding_region
);
5491 defsubr (&Sdecode_coding_string
);
5492 defsubr (&Sencode_coding_string
);
5493 defsubr (&Sdecode_sjis_char
);
5494 defsubr (&Sencode_sjis_char
);
5495 defsubr (&Sdecode_big5_char
);
5496 defsubr (&Sencode_big5_char
);
5497 defsubr (&Sset_terminal_coding_system_internal
);
5498 defsubr (&Sset_safe_terminal_coding_system_internal
);
5499 defsubr (&Sterminal_coding_system
);
5500 defsubr (&Sset_keyboard_coding_system_internal
);
5501 defsubr (&Skeyboard_coding_system
);
5502 defsubr (&Sfind_operation_coding_system
);
5503 defsubr (&Supdate_coding_systems_internal
);
5504 defsubr (&Sset_coding_priority_internal
);
5506 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list
,
5507 "List of coding systems.\n\
5509 Do not alter the value of this variable manually. This variable should be\n\
5510 updated by the functions `make-coding-system' and\n\
5511 `define-coding-system-alias'.");
5512 Vcoding_system_list
= Qnil
;
5514 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist
,
5515 "Alist of coding system names.\n\
5516 Each element is one element list of coding system name.\n\
5517 This variable is given to `completing-read' as TABLE argument.\n\
5519 Do not alter the value of this variable manually. This variable should be\n\
5520 updated by the functions `make-coding-system' and\n\
5521 `define-coding-system-alias'.");
5522 Vcoding_system_alist
= Qnil
;
5524 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list
,
5525 "List of coding-categories (symbols) ordered by priority.");
5529 Vcoding_category_list
= Qnil
;
5530 for (i
= CODING_CATEGORY_IDX_MAX
- 1; i
>= 0; i
--)
5531 Vcoding_category_list
5532 = Fcons (XVECTOR (Vcoding_category_table
)->contents
[i
],
5533 Vcoding_category_list
);
5536 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read
,
5537 "Specify the coding system for read operations.\n\
5538 It is useful to bind this variable with `let', but do not set it globally.\n\
5539 If the value is a coding system, it is used for decoding on read operation.\n\
5540 If not, an appropriate element is used from one of the coding system alists:\n\
5541 There are three such tables, `file-coding-system-alist',\n\
5542 `process-coding-system-alist', and `network-coding-system-alist'.");
5543 Vcoding_system_for_read
= Qnil
;
5545 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write
,
5546 "Specify the coding system for write operations.\n\
5547 It is useful to bind this variable with `let', but do not set it globally.\n\
5548 If the value is a coding system, it is used for encoding on write operation.\n\
5549 If not, an appropriate element is used from one of the coding system alists:\n\
5550 There are three such tables, `file-coding-system-alist',\n\
5551 `process-coding-system-alist', and `network-coding-system-alist'.");
5552 Vcoding_system_for_write
= Qnil
;
5554 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used
,
5555 "Coding system used in the latest file or process I/O.");
5556 Vlast_coding_system_used
= Qnil
;
5558 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion
,
5559 "*Non-nil means inhibit code conversion of end-of-line format in all cases.");
5560 inhibit_eol_conversion
= 0;
5562 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system
,
5563 "Non-nil means process buffer inherits coding system of process output.\n\
5564 Bind it to t if the process output is to be treated as if it were a file\n\
5565 read from some filesystem.");
5566 inherit_process_coding_system
= 0;
5568 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist
,
5569 "Alist to decide a coding system to use for a file I/O operation.\n\
5570 The format is ((PATTERN . VAL) ...),\n\
5571 where PATTERN is a regular expression matching a file name,\n\
5572 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5573 If VAL is a coding system, it is used for both decoding and encoding\n\
5574 the file contents.\n\
5575 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5576 and the cdr part is used for encoding.\n\
5577 If VAL is a function symbol, the function must return a coding system\n\
5578 or a cons of coding systems which are used as above.\n\
5580 See also the function `find-operation-coding-system'\n\
5581 and the variable `auto-coding-alist'.");
5582 Vfile_coding_system_alist
= Qnil
;
5584 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist
,
5585 "Alist to decide a coding system to use for a process I/O operation.\n\
5586 The format is ((PATTERN . VAL) ...),\n\
5587 where PATTERN is a regular expression matching a program name,\n\
5588 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5589 If VAL is a coding system, it is used for both decoding what is received\n\
5590 from the program and encoding what is sent to the program.\n\
5591 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5592 and the cdr part is used for encoding.\n\
5593 If VAL is a function symbol, the function must return a coding system\n\
5594 or a cons of coding systems which are used as above.\n\
5596 See also the function `find-operation-coding-system'.");
5597 Vprocess_coding_system_alist
= Qnil
;
5599 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist
,
5600 "Alist to decide a coding system to use for a network I/O operation.\n\
5601 The format is ((PATTERN . VAL) ...),\n\
5602 where PATTERN is a regular expression matching a network service name\n\
5603 or is a port number to connect to,\n\
5604 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5605 If VAL is a coding system, it is used for both decoding what is received\n\
5606 from the network stream and encoding what is sent to the network stream.\n\
5607 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5608 and the cdr part is used for encoding.\n\
5609 If VAL is a function symbol, the function must return a coding system\n\
5610 or a cons of coding systems which are used as above.\n\
5612 See also the function `find-operation-coding-system'.");
5613 Vnetwork_coding_system_alist
= Qnil
;
5615 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix
,
5616 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
5617 eol_mnemonic_unix
= ':';
5619 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos
,
5620 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
5621 eol_mnemonic_dos
= '\\';
5623 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac
,
5624 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
5625 eol_mnemonic_mac
= '/';
5627 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided
,
5628 "Mnemonic character indicating end-of-line format is not yet decided.");
5629 eol_mnemonic_undecided
= ':';
5631 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation
,
5632 "*Non-nil enables character translation while encoding and decoding.");
5633 Venable_character_translation
= Qt
;
5635 DEFVAR_LISP ("standard-translation-table-for-decode",
5636 &Vstandard_translation_table_for_decode
,
5637 "Table for translating characters while decoding.");
5638 Vstandard_translation_table_for_decode
= Qnil
;
5640 DEFVAR_LISP ("standard-translation-table-for-encode",
5641 &Vstandard_translation_table_for_encode
,
5642 "Table for translating characters while encoding.");
5643 Vstandard_translation_table_for_encode
= Qnil
;
5645 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist
,
5646 "Alist of charsets vs revision numbers.\n\
5647 While encoding, if a charset (car part of an element) is found,\n\
5648 designate it with the escape sequence identifying the revision (cdr part of the element).");
5649 Vcharset_revision_alist
= Qnil
;
5651 DEFVAR_LISP ("default-process-coding-system",
5652 &Vdefault_process_coding_system
,
5653 "Cons of coding systems used for process I/O by default.\n\
5654 The car part is used for decoding a process output,\n\
5655 the cdr part is used for encoding a text to be sent to a process.");
5656 Vdefault_process_coding_system
= Qnil
;
5658 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table
,
5659 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
5660 This is a vector of length 256.\n\
5661 If Nth element is non-nil, the existence of code N in a file\n\
5662 \(or output of subprocess) doesn't prevent it from being detected as\n\
5663 a coding system of ISO 2022 variant which has a flag\n\
5664 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
5665 or reading output of a subprocess.\n\
5666 Only the 128th through 159th elements have a meaning.");
5667 Vlatin_extra_code_table
= Fmake_vector (make_number (256), Qnil
);
5669 DEFVAR_LISP ("select-safe-coding-system-function",
5670 &Vselect_safe_coding_system_function
,
5671 "Function to call to select safe coding system for encoding a text.\n\
5673 If set, this function is called to force a user to select a proper\n\
5674 coding system which can encode the text in the case that a default\n\
5675 coding system used in each operation can't encode the text.\n\
5677 The default value is `select-safe-coding-system' (which see).");
5678 Vselect_safe_coding_system_function
= Qnil
;