1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
5 This file is part of GNU Emacs.
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /*** TABLE OF CONTENTS ***
25 2. Emacs' internal format (emacs-mule) handlers
27 4. Shift-JIS and BIG5 handlers
29 6. End-of-line handlers
30 7. C library functions
31 8. Emacs Lisp library functions
36 /*** GENERAL NOTE on CODING SYSTEM ***
38 Coding system is an encoding mechanism of one or more character
39 sets. Here's a list of coding systems which Emacs can handle. When
40 we say "decode", it means converting some other coding system to
41 Emacs' internal format (emacs-mule), and when we say "encode",
42 it means converting the coding system emacs-mule to some other coding system.
45 0. Emacs' internal format (emacs-mule)
47 Emacs itself holds a multi-lingual character in a buffer and a string
48 in a special format. Details are described in section 2.
52 The most famous coding system for multiple character sets. X's
53 Compound Text, various EUCs (Extended Unix Code), and coding
54 systems used in Internet communication such as ISO-2022-JP are
55 all variants of ISO2022. Details are described in section 3.
57 2. SJIS (or Shift-JIS or MS-Kanji-Code)
59 A coding system to encode character sets: ASCII, JISX0201, and
60 JISX0208. Widely used for PC's in Japan. Details are described in section 4.
65 A coding system to encode character sets: ASCII and Big5. Widely
66 used by Chinese (mainly in Taiwan and Hong Kong). Details are
67 described in section 4. In this file, when we write "BIG5"
68 (all uppercase), we mean the coding system, and when we write
69 "Big5" (capitalized), we mean the character set.
73 A coding system for a text containing random 8-bit code. Emacs does
74 no code conversion on such a text except for end-of-line format.
78 If a user wants to read/write a text encoded in a coding system not
79 listed above, he can supply a decoder and an encoder for it in CCL
80 (Code Conversion Language) programs. Emacs executes the CCL program
81 while reading/writing.
83 Emacs represents a coding system by a Lisp symbol that has a property
84 `coding-system'. But, before actually using the coding system, the
85 information about it is set in a structure of type `struct
86 coding_system' for rapid processing. See section 6 for more details.
90 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
92 How the end-of-line of a text is encoded depends on the system. For
93 instance, Unix's format is just one byte of `line-feed' code,
94 whereas DOS's format is two-byte sequence of `carriage-return' and
95 `line-feed' codes. MacOS's format is usually one byte of `carriage-return'.
98 Since text characters encoding and end-of-line encoding are
99 independent, any coding system described above can take
100 any format of end-of-line. So, Emacs has information of format of
101 end-of-line in each coding-system. See section 6 for more details.
105 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
107 These functions check if a text between SRC and SRC_END is encoded
108 in the coding system category XXX. Each returns an integer value in
109 which appropriate flag bits for the category XXX are set. The flag
110 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
111 template of these functions. */
114 detect_coding_emacs_mule (src
, src_end
)
115 unsigned char *src
, *src_end
;
121 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
123 These functions decode SRC_BYTES length text at SOURCE encoded in
124 CODING to Emacs' internal format (emacs-mule). The resulting text
125 goes to a place pointed to by DESTINATION, the length of which
126 should not exceed DST_BYTES. These functions set the information of
127 original and decoded texts in the members produced, produced_char,
128 consumed, and consumed_char of the structure *CODING.
130 The return value is an integer (CODING_FINISH_XXX) indicating how
131 the decoding finished.
133 DST_BYTES zero means that source area and destination area are
134 overlapped, which means that we can produce a decoded text until it
135 reaches the head of the not-yet-decoded source text.
137 Below is a template of these functions. */
139 decode_coding_XXX (coding
, source
, destination
, src_bytes
, dst_bytes
)
140 struct coding_system
*coding
;
141 unsigned char *source
, *destination
;
142 int src_bytes
, dst_bytes
;
148 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
150 These functions encode SRC_BYTES length text at SOURCE of Emacs'
151 internal format (emacs-mule) to CODING. The resulting text goes to
152 a place pointed to by DESTINATION, the length of which should not
153 exceed DST_BYTES. These functions set the information of
154 original and encoded texts in the members produced, produced_char,
155 consumed, and consumed_char of the structure *CODING.
157 The return value is an integer (CODING_FINISH_XXX) indicating how
158 the encoding finished.
160 DST_BYTES zero means that source area and destination area are
161 overlapped, which means that we can produce an encoded text until it
162 reaches the head of the not-yet-encoded source text.
164 Below is a template of these functions. */
166 encode_coding_XXX (coding
, source
, destination
, src_bytes
, dst_bytes
)
167 struct coding_system
*coding
;
168 unsigned char *source
, *destination
;
169 int src_bytes
, dst_bytes
;
175 /*** COMMONLY USED MACROS ***/
177 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
178 THREE_MORE_BYTES safely get one, two, and three bytes from the
179 source text respectively. If there are not enough bytes in the
180 source, they jump to `label_end_of_loop'. The caller should set
181 variables `src' and `src_end' to appropriate areas in advance. */
183 #define ONE_MORE_BYTE(c1) \
188 goto label_end_of_loop; \
191 #define TWO_MORE_BYTES(c1, c2) \
193 if (src + 1 < src_end) \
194 c1 = *src++, c2 = *src++; \
196 goto label_end_of_loop; \
199 #define THREE_MORE_BYTES(c1, c2, c3) \
201 if (src + 2 < src_end) \
202 c1 = *src++, c2 = *src++, c3 = *src++; \
204 goto label_end_of_loop; \
207 /* The following three macros DECODE_CHARACTER_ASCII,
208 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
209 the multi-byte form of a character of each class at the place
210 pointed by `dst'. The caller should set the variable `dst' to
211 point to an appropriate area and the variable `coding' to point to
212 the coding-system of the currently decoding text in advance. */
214 /* Decode one ASCII character C. */
216 #define DECODE_CHARACTER_ASCII(c) \
218 if (COMPOSING_P (coding->composing)) \
220 *dst++ = 0xA0, *dst++ = (c) | 0x80; \
221 coding->composed_chars++; \
222 if (((c) | 0x80) < 0xA0) \
223 coding->fake_multibyte = 1; \
228 coding->produced_char++; \
230 coding->fake_multibyte = 1; \
234 /* Decode one DIMENSION1 character whose charset is CHARSET and whose
235 position-code is C. */
237 #define DECODE_CHARACTER_DIMENSION1(charset, c) \
239 unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset); \
240 if (COMPOSING_P (coding->composing)) \
242 *dst++ = leading_code + 0x20; \
243 coding->composed_chars++; \
247 *dst++ = leading_code; \
248 coding->produced_char++; \
250 if (leading_code = CHARSET_LEADING_CODE_EXT (charset)) \
251 *dst++ = leading_code; \
252 *dst++ = (c) | 0x80; \
253 if (((c) | 0x80) < 0xA0) \
254 coding->fake_multibyte = 1; \
257 /* Decode one DIMENSION2 character whose charset is CHARSET and whose
258 position-codes are C1 and C2. */
260 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2) \
262 DECODE_CHARACTER_DIMENSION1 (charset, c1); \
263 *dst++ = (c2) | 0x80; \
264 if (((c2) | 0x80) < 0xA0) \
265 coding->fake_multibyte = 1; \
269 /*** 1. Preamble ***/
283 #else /* not emacs */
287 #endif /* not emacs */
289 Lisp_Object Qcoding_system
, Qeol_type
;
290 Lisp_Object Qbuffer_file_coding_system
;
291 Lisp_Object Qpost_read_conversion
, Qpre_write_conversion
;
292 Lisp_Object Qno_conversion
, Qundecided
;
293 Lisp_Object Qcoding_system_history
;
294 Lisp_Object Qsafe_charsets
;
295 Lisp_Object Qvalid_codes
;
297 extern Lisp_Object Qinsert_file_contents
, Qwrite_region
;
298 Lisp_Object Qcall_process
, Qcall_process_region
, Qprocess_argument
;
299 Lisp_Object Qstart_process
, Qopen_network_stream
;
300 Lisp_Object Qtarget_idx
;
302 Lisp_Object Vselect_safe_coding_system_function
;
304 /* Mnemonic string for each format of end-of-line. */
305 Lisp_Object eol_mnemonic_unix
, eol_mnemonic_dos
, eol_mnemonic_mac
;
306 /* Mnemonic string to indicate format of end-of-line is not yet decided. */
308 Lisp_Object eol_mnemonic_undecided
;
310 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
311 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
316 Lisp_Object Vcoding_system_list
, Vcoding_system_alist
;
318 Lisp_Object Qcoding_system_p
, Qcoding_system_error
;
320 /* Coding system emacs-mule and raw-text are for converting only
321 end-of-line format. */
322 Lisp_Object Qemacs_mule
, Qraw_text
;
324 /* Coding-systems are handed between Emacs Lisp programs and C internal
325 routines by the following three variables. */
326 /* Coding-system for reading files and receiving data from process. */
327 Lisp_Object Vcoding_system_for_read
;
328 /* Coding-system for writing files and sending data to process. */
329 Lisp_Object Vcoding_system_for_write
;
330 /* Coding-system actually used in the latest I/O. */
331 Lisp_Object Vlast_coding_system_used
;
333 /* A vector of length 256 which contains information about special
334 Latin codes (especially for dealing with Microsoft codes). */
335 Lisp_Object Vlatin_extra_code_table
;
337 /* Flag to inhibit code conversion of end-of-line format. */
338 int inhibit_eol_conversion
;
340 /* Flag to make buffer-file-coding-system inherit from process-coding. */
341 int inherit_process_coding_system
;
343 /* Coding system to be used to encode text for terminal display. */
344 struct coding_system terminal_coding
;
346 /* Coding system to be used to encode text for terminal display when
347 terminal coding system is nil. */
348 struct coding_system safe_terminal_coding
;
350 /* Coding system of what is sent from terminal keyboard. */
351 struct coding_system keyboard_coding
;
353 /* Default coding system to be used to write a file. */
354 struct coding_system default_buffer_file_coding
;
356 Lisp_Object Vfile_coding_system_alist
;
357 Lisp_Object Vprocess_coding_system_alist
;
358 Lisp_Object Vnetwork_coding_system_alist
;
362 Lisp_Object Qcoding_category
, Qcoding_category_index
;
364 /* List of symbols `coding-category-xxx' ordered by priority. */
365 Lisp_Object Vcoding_category_list
;
367 /* Table of coding categories (Lisp symbols). */
368 Lisp_Object Vcoding_category_table
;
370 /* Table of names of symbol for each coding-category. */
371 char *coding_category_name
[CODING_CATEGORY_IDX_MAX
] = {
372 "coding-category-emacs-mule",
373 "coding-category-sjis",
374 "coding-category-iso-7",
375 "coding-category-iso-7-tight",
376 "coding-category-iso-8-1",
377 "coding-category-iso-8-2",
378 "coding-category-iso-7-else",
379 "coding-category-iso-8-else",
380 "coding-category-ccl",
381 "coding-category-big5",
382 "coding-category-raw-text",
383 "coding-category-binary"
386 /* Table of pointers to coding systems corresponding to each coding category. */
388 struct coding_system
*coding_system_table
[CODING_CATEGORY_IDX_MAX
];
390 /* Table of coding category masks. Nth element is a mask for a coding
391 category of which priority is Nth. */
393 int coding_priorities
[CODING_CATEGORY_IDX_MAX
];
395 /* Flag to tell if we look up translation table on character code conversion. */
397 Lisp_Object Venable_character_translation
;
398 /* Standard translation table to look up on decoding (reading). */
399 Lisp_Object Vstandard_translation_table_for_decode
;
400 /* Standard translation table to look up on encoding (writing). */
401 Lisp_Object Vstandard_translation_table_for_encode
;
403 Lisp_Object Qtranslation_table
;
404 Lisp_Object Qtranslation_table_id
;
405 Lisp_Object Qtranslation_table_for_decode
;
406 Lisp_Object Qtranslation_table_for_encode
;
408 /* Alist of charsets vs revision number. */
409 Lisp_Object Vcharset_revision_alist
;
411 /* Default coding systems used for process I/O. */
412 Lisp_Object Vdefault_process_coding_system
;
415 /*** 2. Emacs internal format (emacs-mule) handlers ***/
417 /* Emacs' internal format for encoding multiple character sets is a
418 kind of multi-byte encoding, i.e. characters are encoded by
419 variable-length sequences of one-byte codes. ASCII characters
420 and control characters (e.g. `tab', `newline') are represented by
421 one-byte sequences which are their ASCII codes, in the range 0x00
422 through 0x7F. The other characters are represented by a sequence
423 of `base leading-code', optional `extended leading-code', and one
424 or two `position-code's. The length of the sequence is determined
425 by the base leading-code. Leading-code takes the range 0x80
426 through 0x9F, whereas extended leading-code and position-code take
427 the range 0xA0 through 0xFF. See `charset.h' for more details
428 about leading-code and position-code.
430 There's one exception to this rule. Special leading-code
431 `leading-code-composition' denotes that the following several
432 characters should be composed into one character. Leading-codes of
433 components (except for ASCII) are added 0x20. An ASCII character
434 component is represented by a 2-byte sequence of `0xA0' and
435 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
436 details of composite character. Hence, we can summarize the code range as follows:
439 --- CODE RANGE of Emacs' internal format ---
440 (character set) (range)
442 ELSE (1st byte) 0x80 .. 0x9F
443 (rest bytes) 0xA0 .. 0xFF
444 ---------------------------------------------
448 enum emacs_code_class_type emacs_code_class
[256];
450 /* Go to the next statement only if *SRC is accessible and the code is
451 not less than 0xA0 (i.e. in the range 0xA0..0xFF). */
452 #define CHECK_CODE_RANGE_A0_FF \
454 if (src >= src_end) \
455 goto label_end_of_switch; \
456 else if (*src++ < 0xA0) \
460 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
461 Check if a text is encoded in Emacs' internal format. If it is,
462 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
465 detect_coding_emacs_mule (src
, src_end
)
466 unsigned char *src
, *src_end
;
471 while (src
< src_end
)
483 switch (emacs_code_class
[c
])
485 case EMACS_ascii_code
:
486 case EMACS_linefeed_code
:
489 case EMACS_control_code
:
490 if (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
)
494 case EMACS_invalid_code
:
497 case EMACS_leading_code_composition
: /* c == 0x80 */
499 CHECK_CODE_RANGE_A0_FF
;
504 case EMACS_leading_code_4
:
505 CHECK_CODE_RANGE_A0_FF
;
506 /* fall down to check it two more times ... */
508 case EMACS_leading_code_3
:
509 CHECK_CODE_RANGE_A0_FF
;
510 /* fall down to check it one more time ... */
512 case EMACS_leading_code_2
:
513 CHECK_CODE_RANGE_A0_FF
;
521 return CODING_CATEGORY_MASK_EMACS_MULE
;
525 /*** 3. ISO2022 handlers ***/
527 /* The following note describes the coding system ISO2022 briefly.
528 Since the intention of this note is to help in understanding of
529 the programs in this file, some parts are NOT ACCURATE or OVERLY
530 SIMPLIFIED. For the thorough understanding, please refer to the
531 original document of ISO2022.
533 ISO2022 provides many mechanisms to encode several character sets
534 in 7-bit and 8-bit environment. If one chooses 7-bit environment,
535 all text is encoded by codes of less than 128. This may make the
536 encoded text a little bit longer, but the text gets more stability
537 to pass through several gateways (some of them strip off the MSB).
539 There are two kinds of character set: control character set and
540 graphic character set. The former contains control characters such
541 as `newline' and `escape' to provide control functions (control
542 functions are provided also by escape sequences). The latter
543 contains graphic characters such as 'A' and '-'. Emacs recognizes
544 two control character sets and many graphic character sets.
546 Graphic character sets are classified into one of the following
547 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
548 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
549 bytes (DIMENSION) and the number of characters in one dimension
550 (CHARS) of the set. In addition, each character set is assigned an
551 identification tag (called "final character" and denoted as <F>
552 here after) which is unique in each class. <F> of each character
553 set is decided by ECMA(*) when it is registered in ISO. Code range
554 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
556 Note (*): ECMA = European Computer Manufacturers Association
558 Here are examples of graphic character set [NAME(<F>)]:
559 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
560 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
561 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
562 o DIMENSION2_CHARS96 -- none for the moment
564 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
565 C0 [0x00..0x1F] -- control character plane 0
566 GL [0x20..0x7F] -- graphic character plane 0
567 C1 [0x80..0x9F] -- control character plane 1
568 GR [0xA0..0xFF] -- graphic character plane 1
570 A control character set is directly designated and invoked to C0 or
571 C1 by an escape sequence. The most common case is that ISO646's
572 control character set is designated/invoked to C0 and ISO6429's
573 control character set is designated/invoked to C1, and usually
574 these designations/invocations are omitted in a coded text. With
575 7-bit environment, only C0 can be used, and a control character for
576 C1 is encoded by an appropriate escape sequence to fit in the
577 environment. All control characters for C1 have their own
578 corresponding escape sequences.
580 A graphic character set is at first designated to one of four
581 graphic registers (G0 through G3), then these graphic registers are
582 invoked to GL or GR. These designations and invocations can be
583 done independently. The most common case is that G0 is invoked to
584 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
585 these invocations and designations are omitted in a coded text.
586 With 7-bit environment, only GL can be used.
588 When a graphic character set of CHARS94 is invoked to GL, code 0x20
589 and 0x7F of GL area work as control characters SPACE and DEL
590 respectively, and code 0xA0 and 0xFF of GR area should not be used.
592 There are two ways of invocation: locking-shift and single-shift.
593 With locking-shift, the invocation lasts until the next different
594 invocation, whereas with single-shift, the invocation works only
595 for the following character and doesn't affect locking-shift.
596 Invocations are done by the following control characters or escape sequences.
599 ----------------------------------------------------------------------
600 function control char escape sequence description
601 ----------------------------------------------------------------------
602 SI (shift-in) 0x0F none invoke G0 to GL
603 SO (shift-out) 0x0E none invoke G1 to GL
604 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
605 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
606 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
607 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
608 ----------------------------------------------------------------------
609 The first four are for locking-shift. Control characters for these
610 functions are defined by macros ISO_CODE_XXX in `coding.h'.
612 Designations are done by the following escape sequences.
613 ----------------------------------------------------------------------
614 escape sequence description
615 ----------------------------------------------------------------------
616 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
617 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
618 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
619 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
620 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
621 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
622 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
623 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
624 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
625 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
626 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
627 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
628 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
629 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
630 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
631 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
632 ----------------------------------------------------------------------
634 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
635 of dimension 1, chars 94, and final character <F>, etc.
637 Note (*): Although these designations are not allowed in ISO2022,
638 Emacs accepts them on decoding, and produces them on encoding
639 CHARS96 character set in a coding system which is characterized as
640 7-bit environment, non-locking-shift, and non-single-shift.
642 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
643 '(' can be omitted. We call this "short-form" hereafter.
645 Now you may notice that there are a lot of ways for encoding the
646 same multilingual text in ISO2022. Actually, there exist many
647 coding systems such as Compound Text (used in X's inter client
648 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
649 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
650 localized platforms), and all of these are variants of ISO2022.
652 In addition to the above, Emacs handles two more kinds of escape
653 sequences: ISO6429's direction specification and Emacs' private
654 sequence for specifying character composition.
656 ISO6429's direction specification takes the following format:
657 o CSI ']' -- end of the current direction
658 o CSI '0' ']' -- end of the current direction
659 o CSI '1' ']' -- start of left-to-right text
660 o CSI '2' ']' -- start of right-to-left text
661 The control character CSI (0x9B: control sequence introducer) is
662 abbreviated to the escape sequence ESC '[' in 7-bit environment.
664 Character composition specification takes the following format:
665 o ESC '0' -- start character composition
666 o ESC '1' -- end character composition
667 Since these are not standard escape sequences of any ISO, the use
668 of them for these meaning is restricted to Emacs only. */
670 enum iso_code_class_type iso_code_class
[256];
/* Nonzero if coding_system_table[IDX] is set and that coding system
   accepts CHARSET: either CHARSET is marked in its safe_charsets
   table, or CODING_SPEC_ISO_REQUESTED_DESIGNATION for CHARSET is
   something other than CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION.
   IDX is evaluated more than once, so it must be free of side
   effects.  */
672 #define CHARSET_OK(idx, charset) \
673 (coding_system_table[idx] \
674 && (coding_system_table[idx]->safe_charsets[charset] \
675 || (CODING_SPEC_ISO_REQUESTED_DESIGNATION \
676 (coding_system_table[idx], charset) \
677 != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)))
/* Nonzero if CODING_SPEC_ISO_INITIAL_DESIGNATION for index 1
   (presumably graphic register G1, the one shift-out invokes) of
   coding_system_table[IDX] is non-negative.
   NOTE(review): unlike CHARSET_OK, this does not first test
   coding_system_table[IDX] for a null entry -- confirm that callers
   guarantee the entry is set.  */
679 #define SHIFT_OUT_OK(idx) \
680 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
682 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
683 Check if a text is encoded in ISO2022. If it is, returns an
684 integer in which appropriate flag bits any of:
685 CODING_CATEGORY_MASK_ISO_7
686 CODING_CATEGORY_MASK_ISO_7_TIGHT
687 CODING_CATEGORY_MASK_ISO_8_1
688 CODING_CATEGORY_MASK_ISO_8_2
689 CODING_CATEGORY_MASK_ISO_7_ELSE
690 CODING_CATEGORY_MASK_ISO_8_ELSE
691 are set. If a code which should never appear in ISO2022 is found, returns 0. */
695 detect_coding_iso2022 (src
, src_end
)
696 unsigned char *src
, *src_end
;
698 int mask
= CODING_CATEGORY_MASK_ISO
;
700 int reg
[4], shift_out
= 0, single_shifting
= 0;
701 int c
, c1
, i
, charset
;
703 reg
[0] = CHARSET_ASCII
, reg
[1] = reg
[2] = reg
[3] = -1;
704 while (mask
&& src
< src_end
)
714 if (c
>= '(' && c
<= '/')
716 /* Designation sequence for a charset of dimension 1. */
720 if (c1
< ' ' || c1
>= 0x80
721 || (charset
= iso_charset_table
[0][c
>= ','][c1
]) < 0)
722 /* Invalid designation sequence. Just ignore. */
724 reg
[(c
- '(') % 4] = charset
;
728 /* Designation sequence for a charset of dimension 2. */
732 if (c
>= '@' && c
<= 'B')
733 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
734 reg
[0] = charset
= iso_charset_table
[1][0][c
];
735 else if (c
>= '(' && c
<= '/')
740 if (c1
< ' ' || c1
>= 0x80
741 || (charset
= iso_charset_table
[1][c
>= ','][c1
]) < 0)
742 /* Invalid designation sequence. Just ignore. */
744 reg
[(c
- '(') % 4] = charset
;
747 /* Invalid designation sequence. Just ignore. */
750 else if (c
== 'N' || c
== 'O')
752 /* ESC <Fe> for SS2 or SS3. */
753 mask
&= CODING_CATEGORY_MASK_ISO_7_ELSE
;
756 else if (c
== '0' || c
== '1' || c
== '2')
757 /* ESC <Fp> for start/end composition. Just ignore. */
760 /* Invalid escape sequence. Just ignore. */
763 /* We found a valid designation sequence for CHARSET. */
764 mask
&= ~CODING_CATEGORY_MASK_ISO_8BIT
;
765 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7
, charset
))
766 mask_found
|= CODING_CATEGORY_MASK_ISO_7
;
768 mask
&= ~CODING_CATEGORY_MASK_ISO_7
;
769 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT
, charset
))
770 mask_found
|= CODING_CATEGORY_MASK_ISO_7_TIGHT
;
772 mask
&= ~CODING_CATEGORY_MASK_ISO_7_TIGHT
;
773 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE
, charset
))
774 mask_found
|= CODING_CATEGORY_MASK_ISO_7_ELSE
;
776 mask
&= ~CODING_CATEGORY_MASK_ISO_7_ELSE
;
777 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE
, charset
))
778 mask_found
|= CODING_CATEGORY_MASK_ISO_8_ELSE
;
780 mask
&= ~CODING_CATEGORY_MASK_ISO_8_ELSE
;
787 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE
)
788 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE
)))
790 /* Locking shift out. */
791 mask
&= ~CODING_CATEGORY_MASK_ISO_7BIT
;
792 mask_found
|= CODING_CATEGORY_MASK_ISO_SHIFT
;
800 /* Locking shift in. */
801 mask
&= ~CODING_CATEGORY_MASK_ISO_7BIT
;
802 mask_found
|= CODING_CATEGORY_MASK_ISO_SHIFT
;
811 int newmask
= CODING_CATEGORY_MASK_ISO_8_ELSE
;
813 if (c
!= ISO_CODE_CSI
)
815 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_1
]->flags
816 & CODING_FLAG_ISO_SINGLE_SHIFT
)
817 newmask
|= CODING_CATEGORY_MASK_ISO_8_1
;
818 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_2
]->flags
819 & CODING_FLAG_ISO_SINGLE_SHIFT
)
820 newmask
|= CODING_CATEGORY_MASK_ISO_8_2
;
823 if (VECTORP (Vlatin_extra_code_table
)
824 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
826 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_1
]->flags
827 & CODING_FLAG_ISO_LATIN_EXTRA
)
828 newmask
|= CODING_CATEGORY_MASK_ISO_8_1
;
829 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_2
]->flags
830 & CODING_FLAG_ISO_LATIN_EXTRA
)
831 newmask
|= CODING_CATEGORY_MASK_ISO_8_2
;
834 mask_found
|= newmask
;
847 if (VECTORP (Vlatin_extra_code_table
)
848 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
852 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_1
]->flags
853 & CODING_FLAG_ISO_LATIN_EXTRA
)
854 newmask
|= CODING_CATEGORY_MASK_ISO_8_1
;
855 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_2
]->flags
856 & CODING_FLAG_ISO_LATIN_EXTRA
)
857 newmask
|= CODING_CATEGORY_MASK_ISO_8_2
;
859 mask_found
|= newmask
;
866 unsigned char *src_begin
= src
;
868 mask
&= ~(CODING_CATEGORY_MASK_ISO_7BIT
869 | CODING_CATEGORY_MASK_ISO_7_ELSE
);
870 mask_found
|= CODING_CATEGORY_MASK_ISO_8_1
;
871 /* Check the length of succeeding codes of the range
872 0xA0..0xFF. If the byte length is odd, we exclude
873 CODING_CATEGORY_MASK_ISO_8_2. We can check this only
874 when we are not single shifting. */
875 if (!single_shifting
)
877 while (src
< src_end
&& *src
>= 0xA0)
879 if ((src
- src_begin
- 1) & 1 && src
< src_end
)
880 mask
&= ~CODING_CATEGORY_MASK_ISO_8_2
;
882 mask_found
|= CODING_CATEGORY_MASK_ISO_8_2
;
889 return (mask
& mask_found
);
892 /* Decode a character of which charset is CHARSET and the 1st position
893 code is C1. If dimension of CHARSET is 2, the 2nd position code is
894 fetched from SRC and set to C2. If CHARSET is negative, it means
895 that we are decoding ill formed text, and what we can do is just to treat C1 as an ASCII character. */
898 #define DECODE_ISO_CHARACTER(charset, c1) \
900 int c_alt, charset_alt = (charset); \
901 if (COMPOSING_HEAD_P (coding->composing)) \
903 *dst++ = LEADING_CODE_COMPOSITION; \
904 if (COMPOSING_WITH_RULE_P (coding->composing)) \
905 /* To tell composition rules are embedded. */ \
907 coding->composing += 2; \
909 if (charset_alt >= 0) \
911 if (CHARSET_DIMENSION (charset_alt) == 2) \
913 ONE_MORE_BYTE (c2); \
914 if (iso_code_class[(c2) & 0x7F] != ISO_0x20_or_0x7F \
915 && iso_code_class[(c2) & 0x7F] != ISO_graphic_plane_0) \
918 charset_alt = CHARSET_ASCII; \
921 if (!NILP (translation_table) \
922 && ((c_alt = translate_char (translation_table, \
923 -1, charset_alt, c1, c2)) >= 0)) \
924 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
926 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
927 DECODE_CHARACTER_ASCII (c1); \
928 else if (CHARSET_DIMENSION (charset_alt) == 1) \
929 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
931 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
932 if (COMPOSING_WITH_RULE_P (coding->composing)) \
933 /* To tell a composition rule follows. */ \
934 coding->composing = COMPOSING_WITH_RULE_RULE; \
937 /* Set designation state into CODING. */
938 #define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
942 if (final_char < '0' || final_char >= 128) \
943 goto label_invalid_code; \
944 charset = ISO_CHARSET_TABLE (make_number (dimension), \
945 make_number (chars), \
946 make_number (final_char)); \
948 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
949 || coding->safe_charsets[charset])) \
951 if (coding->spec.iso2022.last_invalid_designation_register == 0 \
953 && charset == CHARSET_ASCII) \
955 /* We should insert this designation sequence as is so \
956 that it is surely written back to a file. */ \
957 coding->spec.iso2022.last_invalid_designation_register = -1; \
958 goto label_invalid_code; \
960 coding->spec.iso2022.last_invalid_designation_register = -1; \
961 if ((coding->mode & CODING_MODE_DIRECTION) \
962 && CHARSET_REVERSE_CHARSET (charset) >= 0) \
963 charset = CHARSET_REVERSE_CHARSET (charset); \
964 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
968 coding->spec.iso2022.last_invalid_designation_register = reg; \
969 goto label_invalid_code; \
973 /* Return 0 if there's a valid composing sequence starting at SRC and
974 ending before SRC_END, else return -1. */
977 check_composing_code (coding
, src
, src_end
)
978 struct coding_system
*coding
;
979 unsigned char *src
, *src_end
;
981 int charset
, c
, c1
, dim
;
/* Walk the bytes between SRC and SRC_END looking for the sequence
   that ends a composition.  */
983 while (src
< src_end
)
/* Test for a byte that is not ESC, or an ESC that is the last
   available byte.  */
988 if (c
!= ISO_CODE_ESC
|| src
>= src_end
)
991 if (c
== '1') /* end of composition */
/* Any other escape sequence here is examined as a designation, which
   needs more bytes after this point and requires CODING to have
   CODING_FLAG_ISO_DESIGNATION set.  The flag test is parenthesized
   because unary `!' binds tighter than `&': written without the
   parentheses, `coding->flags' was negated before being masked, so
   the flag was never really tested.  */
993 if (src
+ 2 >= src_end
994 || !(coding
->flags
& CODING_FLAG_ISO_DESIGNATION
))
/* If the next byte is one of the final characters '@'..'B', the
   intermediate character was omitted (short form), so behave as if
   it were '('; otherwise consume the next byte as the intermediate
   character.  */
999 c
= (*src
>= '@' && *src
<= 'B') ? '(' : *src
++;
1000 if (c
>= '(' && c
<= '/')
/* Test for a final character outside 0x20..0x7F, a charset that is
   not registered in iso_charset_table, a charset not marked in
   CODING's safe_charsets, or a charset with no requested
   designation in CODING.  */
1003 if ((c1
< ' ' || c1
>= 0x80)
1004 || (charset
= iso_charset_table
[dim
][c
>= ','][c1
]) < 0
1005 || ! coding
->safe_charsets
[charset
]
1006 || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
1007 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
))
1014 /* We have not found the sequence "ESC 1". */
1018 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1021 decode_coding_iso2022 (coding
, source
, destination
, src_bytes
, dst_bytes
)
1022 struct coding_system
*coding
;
1023 unsigned char *source
, *destination
;
1024 int src_bytes
, dst_bytes
;
1026 unsigned char *src
= source
;
1027 unsigned char *src_end
= source
+ src_bytes
;
1028 unsigned char *dst
= destination
;
1029 unsigned char *dst_end
= destination
+ dst_bytes
;
1030 /* Since the maximum bytes produced by each loop is 7, we subtract 6
1031 from DST_END to assure that overflow checking is necessary only
1032 at the head of loop. */
1033 unsigned char *adjusted_dst_end
= dst_end
- 6;
1035 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1036 int charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1037 int charset1
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 1);
1038 Lisp_Object translation_table
1039 = coding
->translation_table_for_decode
;
1040 int result
= CODING_FINISH_NORMAL
;
1042 if (!NILP (Venable_character_translation
) && NILP (translation_table
))
1043 translation_table
= Vstandard_translation_table_for_decode
;
1045 coding
->produced_char
= 0;
1046 coding
->fake_multibyte
= 0;
1047 while (src
< src_end
&& (dst_bytes
1048 ? (dst
< adjusted_dst_end
)
1051 /* SRC_BASE remembers the start position in source in each loop.
1052 The loop will be exited when there's not enough source text
1053 to analyze long escape sequence or 2-byte code (within macros
1054 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
1055 to SRC_BASE before exiting. */
1056 unsigned char *src_base
= src
;
1057 int c1
= *src
++, c2
;
1059 switch (iso_code_class
[c1
])
1061 case ISO_0x20_or_0x7F
:
1062 if (!coding
->composing
1063 && (charset0
< 0 || CHARSET_CHARS (charset0
) == 94))
1065 /* This is SPACE or DEL. */
1067 coding
->produced_char
++;
1070 /* This is a graphic character, we fall down ... */
1072 case ISO_graphic_plane_0
:
1073 if (coding
->composing
== COMPOSING_WITH_RULE_RULE
)
1075 /* This is a composition rule. */
1077 coding
->composing
= COMPOSING_WITH_RULE_TAIL
;
1080 DECODE_ISO_CHARACTER (charset0
, c1
);
1083 case ISO_0xA0_or_0xFF
:
1084 if (charset1
< 0 || CHARSET_CHARS (charset1
) == 94
1085 || coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
)
1086 goto label_invalid_code
;
1087 /* This is a graphic character, we fall down ... */
1089 case ISO_graphic_plane_1
:
1090 if (coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
)
1091 goto label_invalid_code
;
1093 DECODE_ISO_CHARACTER (charset1
, c1
);
1096 case ISO_control_code
:
1097 /* All ISO2022 control characters in this class have the
1098 same representation in Emacs internal format. */
1100 && (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
1101 && (coding
->eol_type
== CODING_EOL_CR
1102 || coding
->eol_type
== CODING_EOL_CRLF
))
1104 result
= CODING_FINISH_INCONSISTENT_EOL
;
1105 goto label_end_of_loop_2
;
1108 coding
->produced_char
++;
1110 coding
->fake_multibyte
= 1;
1113 case ISO_carriage_return
:
1114 if (coding
->eol_type
== CODING_EOL_CR
)
1116 else if (coding
->eol_type
== CODING_EOL_CRLF
)
1119 if (c1
== ISO_CODE_LF
)
1123 if (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
1125 result
= CODING_FINISH_INCONSISTENT_EOL
;
1126 goto label_end_of_loop_2
;
1134 coding
->produced_char
++;
1138 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
)
1139 || CODING_SPEC_ISO_DESIGNATION (coding
, 1) < 0)
1140 goto label_invalid_code
;
1141 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 1;
1142 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1146 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
))
1147 goto label_invalid_code
;
1148 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 0;
1149 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1152 case ISO_single_shift_2_7
:
1153 case ISO_single_shift_2
:
1154 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
))
1155 goto label_invalid_code
;
1156 /* SS2 is handled as an escape sequence of ESC 'N' */
1158 goto label_escape_sequence
;
1160 case ISO_single_shift_3
:
1161 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
))
1162 goto label_invalid_code
;
1163 /* SS3 is handled as an escape sequence of ESC 'O' */
1165 goto label_escape_sequence
;
1167 case ISO_control_sequence_introducer
:
1168 /* CSI is handled as an escape sequence of ESC '[' ... */
1170 goto label_escape_sequence
;
1174 label_escape_sequence
:
1175 /* Escape sequences handled by Emacs are invocation,
1176 designation, direction specification, and character
1177 composition specification. */
1180 case '&': /* revision of following character set */
1182 if (!(c1
>= '@' && c1
<= '~'))
1183 goto label_invalid_code
;
1185 if (c1
!= ISO_CODE_ESC
)
1186 goto label_invalid_code
;
1188 goto label_escape_sequence
;
1190 case '$': /* designation of 2-byte character set */
1191 if (! (coding
->flags
& CODING_FLAG_ISO_DESIGNATION
))
1192 goto label_invalid_code
;
1194 if (c1
>= '@' && c1
<= 'B')
1195 { /* designation of JISX0208.1978, GB2312.1980,
1197 DECODE_DESIGNATION (0, 2, 94, c1
);
1199 else if (c1
>= 0x28 && c1
<= 0x2B)
1200 { /* designation of DIMENSION2_CHARS94 character set */
1202 DECODE_DESIGNATION (c1
- 0x28, 2, 94, c2
);
1204 else if (c1
>= 0x2C && c1
<= 0x2F)
1205 { /* designation of DIMENSION2_CHARS96 character set */
1207 DECODE_DESIGNATION (c1
- 0x2C, 2, 96, c2
);
1210 goto label_invalid_code
;
1213 case 'n': /* invocation of locking-shift-2 */
1214 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
)
1215 || CODING_SPEC_ISO_DESIGNATION (coding
, 2) < 0)
1216 goto label_invalid_code
;
1217 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 2;
1218 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1221 case 'o': /* invocation of locking-shift-3 */
1222 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
)
1223 || CODING_SPEC_ISO_DESIGNATION (coding
, 3) < 0)
1224 goto label_invalid_code
;
1225 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 3;
1226 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1229 case 'N': /* invocation of single-shift-2 */
1230 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1231 || CODING_SPEC_ISO_DESIGNATION (coding
, 2) < 0)
1232 goto label_invalid_code
;
1234 charset
= CODING_SPEC_ISO_DESIGNATION (coding
, 2);
1235 DECODE_ISO_CHARACTER (charset
, c1
);
1238 case 'O': /* invocation of single-shift-3 */
1239 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1240 || CODING_SPEC_ISO_DESIGNATION (coding
, 3) < 0)
1241 goto label_invalid_code
;
1243 charset
= CODING_SPEC_ISO_DESIGNATION (coding
, 3);
1244 DECODE_ISO_CHARACTER (charset
, c1
);
1247 case '0': case '2': /* start composing */
1248 /* Before processing composing, we must be sure that all
1249 characters being composed are supported by CODING.
1250 If not, we must give up composing. */
1251 if (check_composing_code (coding
, src
, src_end
) == 0)
1253 /* We are looking at a valid composition sequence. */
1254 coding
->composing
= (c1
== '0'
1255 ? COMPOSING_NO_RULE_HEAD
1256 : COMPOSING_WITH_RULE_HEAD
);
1257 coding
->composed_chars
= 0;
1261 *dst
++ = ISO_CODE_ESC
;
1263 coding
->produced_char
+= 2;
1267 case '1': /* end composing */
1268 if (!coding
->composing
)
1270 *dst
++ = ISO_CODE_ESC
;
1272 coding
->produced_char
+= 2;
1276 if (coding
->composed_chars
> 0)
1278 if (coding
->composed_chars
== 1)
1280 unsigned char *this_char_start
= dst
;
1283 /* Only one character is in the composing
1284 sequence. Make it a normal character. */
1285 while (*--this_char_start
!= LEADING_CODE_COMPOSITION
);
1286 dst
= (this_char_start
1287 + (coding
->composing
== COMPOSING_NO_RULE_TAIL
1292 this_bytes
= BYTES_BY_CHAR_HEAD (*dst
);
1293 while (this_bytes
--) *this_char_start
++ = *dst
++;
1294 dst
= this_char_start
;
1296 coding
->produced_char
++;
1298 coding
->composing
= COMPOSING_NO
;
1301 case '[': /* specification of direction */
1302 if (coding
->flags
& CODING_FLAG_ISO_NO_DIRECTION
)
1303 goto label_invalid_code
;
1304 /* For the moment, nested direction is not supported.
1305 So, `coding->mode & CODING_MODE_DIRECTION' zero means
1306 left-to-right, and nonzero means right-to-left. */
1310 case ']': /* end of the current direction */
1311 coding
->mode
&= ~CODING_MODE_DIRECTION
;
1313 case '0': /* end of the current direction */
1314 case '1': /* start of left-to-right direction */
1317 coding
->mode
&= ~CODING_MODE_DIRECTION
;
1319 goto label_invalid_code
;
1322 case '2': /* start of right-to-left direction */
1325 coding
->mode
|= CODING_MODE_DIRECTION
;
1327 goto label_invalid_code
;
1331 goto label_invalid_code
;
1336 if (! (coding
->flags
& CODING_FLAG_ISO_DESIGNATION
))
1337 goto label_invalid_code
;
1338 if (c1
>= 0x28 && c1
<= 0x2B)
1339 { /* designation of DIMENSION1_CHARS94 character set */
1341 DECODE_DESIGNATION (c1
- 0x28, 1, 94, c2
);
1343 else if (c1
>= 0x2C && c1
<= 0x2F)
1344 { /* designation of DIMENSION1_CHARS96 character set */
1346 DECODE_DESIGNATION (c1
- 0x2C, 1, 96, c2
);
1350 goto label_invalid_code
;
1353 /* We must update these variables now. */
1354 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1355 charset1
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 1);
1359 while (src_base
< src
)
1360 *dst
++ = *src_base
++;
1361 coding
->fake_multibyte
= 1;
1366 result
= CODING_FINISH_INSUFFICIENT_SRC
;
1367 label_end_of_loop_2
:
1374 if (result
== CODING_FINISH_NORMAL
)
1375 result
= CODING_FINISH_INSUFFICIENT_DST
;
1376 else if (result
!= CODING_FINISH_INCONSISTENT_EOL
1377 && coding
->mode
& CODING_MODE_LAST_BLOCK
)
1379 /* This is the last block of the text to be decoded. We had
1380 better just flush out all remaining codes in the text
1381 although they are not valid characters. */
1382 src_bytes
= src_end
- src
;
1383 if (dst_bytes
&& (dst_end
- dst
< src_bytes
))
1384 src_bytes
= dst_end
- dst
;
1385 bcopy (src
, dst
, src_bytes
);
1388 coding
->fake_multibyte
= 1;
1392 coding
->consumed
= coding
->consumed_char
= src
- source
;
1393 coding
->produced
= dst
- destination
;
1397 /* ISO2022 encoding stuff. */
1400 It is not enough to say just "ISO2022" on encoding, we have to
1401 specify more details. In Emacs, each coding system of ISO2022
1402 variant has the following specifications:
1403 1. Initial designation to G0 thru G3.
1404 2. Allows short-form designation?
1405 3. ASCII should be designated to G0 before control characters?
1406 4. ASCII should be designated to G0 at end of line?
1407 5. 7-bit environment or 8-bit environment?
1408 6. Use locking-shift?
1409 7. Use Single-shift?
1410 And the following two are only for Japanese:
1411 8. Use ASCII in place of JIS0201-1976-Roman?
1412 9. Use JISX0208-1983 in place of JISX0208-1978?
1413 These specifications are encoded in `coding->flags' as flag bits
1414 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1418 /* Produce codes (escape sequence) for designating CHARSET to graphic
1419 register REG. If <final-char> of CHARSET is '@', 'A', or 'B' and
1420 the coding system CODING allows, produce designation sequence of
1423 #define ENCODE_DESIGNATION(charset, reg, coding) \
1425 unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \
1426 char *intermediate_char_94 = "()*+"; \
1427 char *intermediate_char_96 = ",-./"; \
1428 int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset); \
1429 if (revision < 255) \
1431 *dst++ = ISO_CODE_ESC; \
1433 *dst++ = '@' + revision; \
1435 *dst++ = ISO_CODE_ESC; \
1436 if (CHARSET_DIMENSION (charset) == 1) \
1438 if (CHARSET_CHARS (charset) == 94) \
1439 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1441 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1446 if (CHARSET_CHARS (charset) == 94) \
1448 if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
1450 || final_char < '@' || final_char > 'B') \
1451 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1454 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1456 *dst++ = final_char; \
1457 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
1460 /* The following two macros produce codes (control character or escape
1461 sequence) for ISO2022 single-shift functions (single-shift-2 and
1464 #define ENCODE_SINGLE_SHIFT_2 \
1466 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1467 *dst++ = ISO_CODE_ESC, *dst++ = 'N'; \
1470 *dst++ = ISO_CODE_SS2; \
1471 coding->fake_multibyte = 1; \
1473 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1476 #define ENCODE_SINGLE_SHIFT_3 \
1478 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1479 *dst++ = ISO_CODE_ESC, *dst++ = 'O'; \
1482 *dst++ = ISO_CODE_SS3; \
1483 coding->fake_multibyte = 1; \
1485 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1488 /* The following four macros produce codes (control character or
1489 escape sequence) for ISO2022 locking-shift functions (shift-in,
1490 shift-out, locking-shift-2, and locking-shift-3). */
1492 #define ENCODE_SHIFT_IN \
1494 *dst++ = ISO_CODE_SI; \
1495 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1498 #define ENCODE_SHIFT_OUT \
1500 *dst++ = ISO_CODE_SO; \
1501 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1504 #define ENCODE_LOCKING_SHIFT_2 \
1506 *dst++ = ISO_CODE_ESC, *dst++ = 'n'; \
1507 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1510 #define ENCODE_LOCKING_SHIFT_3 \
1512 *dst++ = ISO_CODE_ESC, *dst++ = 'o'; \
1513 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1516 /* Produce codes for a DIMENSION1 character whose character set is
1517 CHARSET and whose position-code is C1. Designation and invocation
1518 sequences are also produced in advance if necessary. */
1521 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
1523 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1525 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1526 *dst++ = c1 & 0x7F; \
1528 *dst++ = c1 | 0x80; \
1529 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1532 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1534 *dst++ = c1 & 0x7F; \
1537 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1539 *dst++ = c1 | 0x80; \
1542 else if (coding->flags & CODING_FLAG_ISO_SAFE \
1543 && !coding->safe_charsets[charset]) \
1545 /* We should not encode this character, instead produce one or \
1547 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1548 if (CHARSET_WIDTH (charset) == 2) \
1549 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1553 /* Since CHARSET is not yet invoked to any graphic planes, we \
1554 must invoke it, or, at first, designate it to some graphic \
1555 register. Then repeat the loop to actually produce the \
1557 dst = encode_invocation_designation (charset, coding, dst); \
1560 /* Produce codes for a DIMENSION2 character whose character set is
1561 CHARSET and whose position-codes are C1 and C2. Designation and
1562 invocation codes are also produced in advance if necessary. */
1564 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
1566 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1568 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1569 *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F; \
1571 *dst++ = c1 | 0x80, *dst++ = c2 | 0x80; \
1572 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1575 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1577 *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F; \
1580 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1582 *dst++ = c1 | 0x80, *dst++= c2 | 0x80; \
1585 else if (coding->flags & CODING_FLAG_ISO_SAFE \
1586 && !coding->safe_charsets[charset]) \
1588 /* We should not encode this character, instead produce one or \
1590 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1591 if (CHARSET_WIDTH (charset) == 2) \
1592 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1596 /* Since CHARSET is not yet invoked to any graphic planes, we \
1597 must invoke it, or, at first, designate it to some graphic \
1598 register. Then repeat the loop to actually produce the \
1600 dst = encode_invocation_designation (charset, coding, dst); \
1603 #define ENCODE_ISO_CHARACTER(charset, c1, c2) \
1605 int c_alt, charset_alt; \
1606 if (!NILP (translation_table) \
1607 && ((c_alt = translate_char (translation_table, -1, \
1610 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1612 charset_alt = charset; \
1613 if (CHARSET_DIMENSION (charset_alt) == 1) \
1615 if (charset == CHARSET_ASCII \
1616 && coding->flags & CODING_FLAG_ISO_USE_ROMAN) \
1617 charset_alt = charset_latin_jisx0201; \
1618 ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1); \
1622 if (charset == charset_jisx0208 \
1623 && coding->flags & CODING_FLAG_ISO_USE_OLDJIS) \
1624 charset_alt = charset_jisx0208_1978; \
1625 ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
1627 if (! COMPOSING_P (coding->composing)) \
1628 coding->consumed_char++; \
1631 /* Produce designation and invocation codes at a place pointed by DST
1632 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1636 encode_invocation_designation (charset
, coding
, dst
)
1638 struct coding_system
*coding
;
1641 int reg
; /* graphic register number */
1643 /* At first, check designations. */
1644 for (reg
= 0; reg
< 4; reg
++)
1645 if (charset
== CODING_SPEC_ISO_DESIGNATION (coding
, reg
))
1650 /* CHARSET is not yet designated to any graphic registers. */
1651 /* At first check the requested designation. */
1652 reg
= CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
);
1653 if (reg
== CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
)
1654 /* Since CHARSET requests no special designation, designate it
1655 to graphic register 0. */
1658 ENCODE_DESIGNATION (charset
, reg
, coding
);
1661 if (CODING_SPEC_ISO_INVOCATION (coding
, 0) != reg
1662 && CODING_SPEC_ISO_INVOCATION (coding
, 1) != reg
)
1664 /* Since the graphic register REG is not invoked to any graphic
1665 planes, invoke it to graphic plane 0. */
1668 case 0: /* graphic register 0 */
1672 case 1: /* graphic register 1 */
1676 case 2: /* graphic register 2 */
1677 if (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1678 ENCODE_SINGLE_SHIFT_2
;
1680 ENCODE_LOCKING_SHIFT_2
;
1683 case 3: /* graphic register 3 */
1684 if (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1685 ENCODE_SINGLE_SHIFT_3
;
1687 ENCODE_LOCKING_SHIFT_3
;
/* The following three macros produce codes for indicating composition:
   ESC '0' starts a composition without rule, ESC '2' starts a
   composition with rule, and ESC '1' ends a composition.  Each one
   stores two bytes through `dst' (taken from the expansion context)
   and advances it.  The expansions are parenthesized so that each
   macro stays a single expression wherever it is used.  */
#define ENCODE_COMPOSITION_NO_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
#define ENCODE_COMPOSITION_WITH_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '2')
#define ENCODE_COMPOSITION_END \
  (*dst++ = ISO_CODE_ESC, *dst++ = '1')
/* The following three macros produce codes for indicating direction
   of text.  They read `coding' and store through `dst', both taken
   from the expansion context.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER stores ESC '[' when
   CODING_FLAG_ISO_SEVEN_BITS is set in coding->flags and the single
   byte ISO_CODE_CSI otherwise.  The flag is tested with `&', as at
   every other place that checks it; comparing the whole flag word
   with `==' misses the 7-bit case as soon as any other flag is set.
   It is written as one parenthesized expression because the two
   ENCODE_DIRECTION_* macros chain it with the comma operator.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER			\
  ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)			\
   ? (*dst++ = ISO_CODE_ESC, *dst++ = '[')			\
   : (*dst++ = ISO_CODE_CSI))

/* Introducer followed by '2' ']'.  */
#define ENCODE_DIRECTION_R2L	\
  (ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '2', *dst++ = ']')

/* Introducer followed by '0' ']'.  */
#define ENCODE_DIRECTION_L2R	\
  (ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '0', *dst++ = ']')
1715 /* Produce codes for designation and invocation to reset the graphic
1716 planes and registers to initial state. */
1717 #define ENCODE_RESET_PLANE_AND_REGISTER \
1720 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
1722 for (reg = 0; reg < 4; reg++) \
1723 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0 \
1724 && (CODING_SPEC_ISO_DESIGNATION (coding, reg) \
1725 != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))) \
1726 ENCODE_DESIGNATION \
1727 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1730 /* Produce designation sequences of charsets in the line started from
1731 SRC to a place pointed by *DSTP, and update DSTP.
1733 If the current block ends before any end-of-line, we may fail to
1734 find all the necessary designations. */
1737 encode_designation_at_bol (coding
, table
, src
, src_end
, dstp
)
1738 struct coding_system
*coding
;
1740 unsigned char *src
, *src_end
, **dstp
;
1742 int charset
, c
, found
= 0, reg
;
1743 /* Table of charsets to be designated to each graphic register. */
1745 unsigned char *dst
= *dstp
;
1747 for (reg
= 0; reg
< 4; reg
++)
1750 while (src
< src_end
&& *src
!= '\n' && found
< 4)
1752 int bytes
= BYTES_BY_CHAR_HEAD (*src
);
1755 charset
= CHARSET_AT (src
);
1759 unsigned char c1
, c2
;
1761 SPLIT_STRING(src
, bytes
, charset
, c1
, c2
);
1762 if ((c_alt
= translate_char (table
, -1, charset
, c1
, c2
)) >= 0)
1763 charset
= CHAR_CHARSET (c_alt
);
1766 reg
= CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
);
1767 if (reg
!= CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
&& r
[reg
] < 0)
1778 for (reg
= 0; reg
< 4; reg
++)
1780 && CODING_SPEC_ISO_DESIGNATION (coding
, reg
) != r
[reg
])
1781 ENCODE_DESIGNATION (r
[reg
], reg
, coding
);
1786 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1789 encode_coding_iso2022 (coding
, source
, destination
, src_bytes
, dst_bytes
)
1790 struct coding_system
*coding
;
1791 unsigned char *source
, *destination
;
1792 int src_bytes
, dst_bytes
;
1794 unsigned char *src
= source
;
1795 unsigned char *src_end
= source
+ src_bytes
;
1796 unsigned char *dst
= destination
;
1797 unsigned char *dst_end
= destination
+ dst_bytes
;
1798 /* Since the maximum bytes produced by each loop is 20, we subtract 19
1799 from DST_END to assure overflow checking is necessary only at the
1801 unsigned char *adjusted_dst_end
= dst_end
- 19;
1802 Lisp_Object translation_table
1803 = coding
->translation_table_for_encode
;
1804 int result
= CODING_FINISH_NORMAL
;
1806 if (!NILP (Venable_character_translation
) && NILP (translation_table
))
1807 translation_table
= Vstandard_translation_table_for_encode
;
1809 coding
->consumed_char
= 0;
1810 coding
->fake_multibyte
= 0;
1811 while (src
< src_end
&& (dst_bytes
1812 ? (dst
< adjusted_dst_end
)
1813 : (dst
< src
- 19)))
1815 /* SRC_BASE remembers the start position in source in each loop.
1816 The loop will be exited when there's not enough source text
1817 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1818 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1819 reset to SRC_BASE before exiting. */
1820 unsigned char *src_base
= src
;
1821 int charset
, c1
, c2
, c3
, c4
;
1823 if (coding
->flags
& CODING_FLAG_ISO_DESIGNATE_AT_BOL
1824 && CODING_SPEC_ISO_BOL (coding
))
1826 /* We have to produce designation sequences if any now. */
1827 encode_designation_at_bol (coding
, translation_table
,
1828 src
, src_end
, &dst
);
1829 CODING_SPEC_ISO_BOL (coding
) = 0;
1833 /* If we are seeing a component of a composite character, we are
1834 seeing a leading-code encoded irregularly for composition, or
1835 a composition rule if composing with rule. We must set C1 to
1836 a normal leading-code or an ASCII code. If we are not seeing
1837 a composite character, we must reset composition,
1838 designation, and invocation states. */
1839 if (COMPOSING_P (coding
->composing
))
1843 /* We are not in a composite character any longer. */
1844 coding
->composing
= COMPOSING_NO
;
1845 ENCODE_RESET_PLANE_AND_REGISTER
;
1846 ENCODE_COMPOSITION_END
;
1850 if (coding
->composing
== COMPOSING_WITH_RULE_RULE
)
1853 coding
->composing
= COMPOSING_WITH_RULE_HEAD
;
1856 else if (coding
->composing
== COMPOSING_WITH_RULE_HEAD
)
1857 coding
->composing
= COMPOSING_WITH_RULE_RULE
;
1860 /* This is an ASCII component. */
1865 /* This is a leading-code of non ASCII component. */
1870 /* Now encode one character. C1 is a control character, an
1871 ASCII character, or a leading-code of multi-byte character. */
1872 switch (emacs_code_class
[c1
])
1874 case EMACS_ascii_code
:
1875 ENCODE_ISO_CHARACTER (CHARSET_ASCII
, c1
, /* dummy */ c2
);
1878 case EMACS_control_code
:
1879 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_CNTL
)
1880 ENCODE_RESET_PLANE_AND_REGISTER
;
1882 coding
->consumed_char
++;
1885 case EMACS_carriage_return_code
:
1886 if (! (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
))
1888 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_CNTL
)
1889 ENCODE_RESET_PLANE_AND_REGISTER
;
1891 coding
->consumed_char
++;
1894 /* fall down to treat '\r' as '\n' ... */
1896 case EMACS_linefeed_code
:
1897 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_EOL
)
1898 ENCODE_RESET_PLANE_AND_REGISTER
;
1899 if (coding
->flags
& CODING_FLAG_ISO_INIT_AT_BOL
)
1900 bcopy (coding
->spec
.iso2022
.initial_designation
,
1901 coding
->spec
.iso2022
.current_designation
,
1902 sizeof coding
->spec
.iso2022
.initial_designation
);
1903 if (coding
->eol_type
== CODING_EOL_LF
1904 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
1905 *dst
++ = ISO_CODE_LF
;
1906 else if (coding
->eol_type
== CODING_EOL_CRLF
)
1907 *dst
++ = ISO_CODE_CR
, *dst
++ = ISO_CODE_LF
;
1909 *dst
++ = ISO_CODE_CR
;
1910 CODING_SPEC_ISO_BOL (coding
) = 1;
1911 coding
->consumed_char
++;
1914 case EMACS_leading_code_2
:
1918 /* invalid sequence */
1921 coding
->consumed_char
++;
1924 ENCODE_ISO_CHARACTER (c1
, c2
, /* dummy */ c3
);
1927 case EMACS_leading_code_3
:
1928 TWO_MORE_BYTES (c2
, c3
);
1929 if (c2
< 0xA0 || c3
< 0xA0)
1931 /* invalid sequence */
1934 coding
->consumed_char
++;
1936 else if (c1
< LEADING_CODE_PRIVATE_11
)
1937 ENCODE_ISO_CHARACTER (c1
, c2
, c3
);
1939 ENCODE_ISO_CHARACTER (c2
, c3
, /* dummy */ c4
);
1942 case EMACS_leading_code_4
:
1943 THREE_MORE_BYTES (c2
, c3
, c4
);
1944 if (c2
< 0xA0 || c3
< 0xA0 || c4
< 0xA0)
1946 /* invalid sequence */
1949 coding
->consumed_char
++;
1952 ENCODE_ISO_CHARACTER (c2
, c3
, c4
);
1955 case EMACS_leading_code_composition
:
1959 /* invalid sequence */
1962 coding
->consumed_char
++;
1964 else if (c2
== 0xFF)
1966 ENCODE_RESET_PLANE_AND_REGISTER
;
1967 coding
->composing
= COMPOSING_WITH_RULE_HEAD
;
1968 ENCODE_COMPOSITION_WITH_RULE_START
;
1969 coding
->consumed_char
++;
1973 ENCODE_RESET_PLANE_AND_REGISTER
;
1974 /* Rewind one byte because it is a character code of
1975 composition elements. */
1977 coding
->composing
= COMPOSING_NO_RULE_HEAD
;
1978 ENCODE_COMPOSITION_NO_RULE_START
;
1979 coding
->consumed_char
++;
1983 case EMACS_invalid_code
:
1984 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_CNTL
)
1985 ENCODE_RESET_PLANE_AND_REGISTER
;
1987 coding
->consumed_char
++;
1992 result
= CODING_FINISH_INSUFFICIENT_SRC
;
1997 if (src
< src_end
&& result
== CODING_FINISH_NORMAL
)
1998 result
= CODING_FINISH_INSUFFICIENT_DST
;
2000 /* If this is the last block of the text to be encoded, we must
2001 reset graphic planes and registers to the initial state, and
2002 flush out the carryover if any. */
2003 if (coding
->mode
& CODING_MODE_LAST_BLOCK
)
2005 ENCODE_RESET_PLANE_AND_REGISTER
;
2006 if (COMPOSING_P (coding
->composing
))
2007 ENCODE_COMPOSITION_END
;
2008 if (result
== CODING_FINISH_INSUFFICIENT_SRC
)
2010 while (src
< src_end
&& dst
< dst_end
)
2014 coding
->consumed
= src
- source
;
2015 coding
->produced
= coding
->produced_char
= dst
- destination
;
2020 /*** 4. SJIS and BIG5 handlers ***/
2022 /* Although SJIS and BIG5 are not ISO's coding system, they are used
2023 quite widely. So, for the moment, Emacs supports them in the bare
2024 C code. But, in the future, they may be supported only by CCL. */
2026 /* SJIS is a coding system encoding three character sets: ASCII, right
2027 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
2028 as is. A character of charset katakana-jisx0201 is encoded by
2029 "position-code + 0x80". A character of charset japanese-jisx0208
2030 is encoded in 2-byte but two position-codes are divided and shifted
2031 so that it fits in the range below.
2033 --- CODE RANGE of SJIS ---
2034 (character set) (range)
2036 KATAKANA-JISX0201 0xA0 .. 0xDF
2037 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
2038 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
2039 -------------------------------
2043 /* BIG5 is a coding system encoding two character sets: ASCII and
2044 Big5. An ASCII character is encoded as is. Big5 is a two-byte
2045 character set and is encoded in two-byte.
2047 --- CODE RANGE of BIG5 ---
2048 (character set) (range)
2050 Big5 (1st byte) 0xA1 .. 0xFE
2051 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
2052 --------------------------
2054 Since the number of characters in Big5 is larger than maximum
2055 characters in Emacs' charset (96x96), it can't be handled as one
2056 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2057 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2058 contains frequently used characters and the latter contains less
2059 frequently used characters. */
2061 /* Macros to decode or encode a character of Big5 in BIG5. B1 and B2
2062 are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
2063 C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
2064 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */
/* Number of Big5 characters which have the same code in 1st byte:
   the 2nd byte takes 0xFF - 0xA1 values in the high range plus
   0x7F - 0x40 values in the low range.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Convert the BIG5 byte pair B1, B2 into CHARSET (set to
   charset_big5_1 or charset_big5_2) and position-codes C1, C2.
   The pair is first flattened into a linear index; rows from 0xC9 on
   belong to charset_big5_2 and are rebased to zero, mirroring the
   offset ENCODE_BIG5 adds.  Every parameter is parenthesized so that
   expression arguments (e.g. `x | 0x80') keep their meaning.  B1 and
   B2 are evaluated more than once.
   NOTE(review): the `do', the row test and the `else' were missing
   from the listing and are restored here; confirm against the
   pristine file.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)				\
  do {									\
    unsigned int temp							\
      = (((b1) - 0xA1) * BIG5_SAME_ROW					\
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));				\
    if ((b1) < 0xC9)							\
      (charset) = charset_big5_1;					\
    else								\
      {									\
        (charset) = charset_big5_2;					\
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;				\
      }									\
    (c1) = temp / (0xFF - 0xA1) + 0x21;					\
    (c2) = temp % (0xFF - 0xA1) + 0x21;					\
  } while (0)

/* Convert CHARSET and position-codes C1, C2 into the BIG5 byte pair
   B1, B2 (the inverse of DECODE_BIG5).  C1 and C2 must already have
   the high bit cleared.  Every parameter is parenthesized so that
   expression arguments (e.g. `c & 0x7F') keep their meaning.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)				\
  do {									\
    unsigned int temp = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);	\
    if ((charset) == charset_big5_2)					\
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);				\
    (b1) = temp / BIG5_SAME_ROW + 0xA1;					\
    (b2) = temp % BIG5_SAME_ROW;					\
    /* Indices below 0x3F map to 0x40..0x7E, the rest to 0xA1..0xFE.  */ \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;					\
  } while (0)
2094 #define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
2096 int c_alt, charset_alt = (charset); \
2097 if (!NILP (translation_table) \
2098 && ((c_alt = translate_char (translation_table, \
2099 -1, (charset), c1, c2)) >= 0)) \
2100 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
2101 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
2102 DECODE_CHARACTER_ASCII (c1); \
2103 else if (CHARSET_DIMENSION (charset_alt) == 1) \
2104 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
2106 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
2109 #define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
2111 int c_alt, charset_alt; \
2112 if (!NILP (translation_table) \
2113 && ((c_alt = translate_char (translation_table, -1, \
2116 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
2118 charset_alt = charset; \
2119 if (charset_alt == charset_ascii) \
2121 else if (CHARSET_DIMENSION (charset_alt) == 1) \
2123 if (sjis_p && charset_alt == charset_katakana_jisx0201) \
2127 *dst++ = charset_alt, *dst++ = c1; \
2128 coding->fake_multibyte = 1; \
2133 c1 &= 0x7F, c2 &= 0x7F; \
2134 if (sjis_p && charset_alt == charset_jisx0208) \
2136 unsigned char s1, s2; \
2138 ENCODE_SJIS (c1, c2, s1, s2); \
2139 *dst++ = s1, *dst++ = s2; \
2140 coding->fake_multibyte = 1; \
2143 && (charset_alt == charset_big5_1 \
2144 || charset_alt == charset_big5_2)) \
2146 unsigned char b1, b2; \
2148 ENCODE_BIG5 (charset_alt, c1, c2, b1, b2); \
2149 *dst++ = b1, *dst++ = b2; \
2153 *dst++ = charset_alt, *dst++ = c1, *dst++ = c2; \
2154 coding->fake_multibyte = 1; \
2157 coding->consumed_char++; \
2160 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2161 Check if a text is encoded in SJIS. If it is, return
2162 CODING_CATEGORY_MASK_SJIS, else return 0. */
2165 detect_coding_sjis (src
, src_end
)
2166 unsigned char *src
, *src_end
;
2170 while (src
< src_end
)
2173 if ((c
>= 0x80 && c
< 0xA0) || c
>= 0xE0)
2175 if (src
< src_end
&& *src
++ < 0x40)
2179 return CODING_CATEGORY_MASK_SJIS
;
2182 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2183 Check if a text is encoded in BIG5. If it is, return
2184 CODING_CATEGORY_MASK_BIG5, else return 0. */
2187 detect_coding_big5 (src
, src_end
)
2188 unsigned char *src
, *src_end
;
2192 while (src
< src_end
)
2200 if (c
< 0x40 || (c
>= 0x7F && c
<= 0xA0))
2204 return CODING_CATEGORY_MASK_BIG5
;
2207 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2208 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */
/* Decode SRC_BYTES bytes of SJIS (SJIS_P nonzero) or BIG5 (SJIS_P zero)
   text at SOURCE into DESTINATION, which has room for DST_BYTES bytes.

   CODING supplies the eol type, the mode flags and the decode
   translation table; when character translation is enabled and CODING
   has no table, Vstandard_translation_table_for_decode is used.  On
   exit coding->consumed, coding->consumed_char, coding->produced,
   coding->produced_char and coding->fake_multibyte are updated.

   RESULT takes one of CODING_FINISH_NORMAL,
   CODING_FINISH_INSUFFICIENT_SRC, CODING_FINISH_INSUFFICIENT_DST or
   CODING_FINISH_INCONSISTENT_EOL; presumably it is the return value
   (the return statement is not visible in this listing).

   Bytes that do not form a valid code are copied through unchanged via
   label_invalid_code_1 / label_invalid_code_2.

   NOTE(review): a number of lines of this function are absent from this
   listing.  */
2211 decode_coding_sjis_big5 (coding
, source
, destination
,
2212 src_bytes
, dst_bytes
, sjis_p
)
2213 struct coding_system
*coding
;
2214 unsigned char *source
, *destination
;
2215 int src_bytes
, dst_bytes
;
2218 unsigned char *src
= source
;
2219 unsigned char *src_end
= source
+ src_bytes
;
2220 unsigned char *dst
= destination
;
2221 unsigned char *dst_end
= destination
+ dst_bytes
;
2222 /* Since the maximum bytes produced by each loop is 4, we subtract 3
2223 from DST_END to assure overflow checking is necessary only at the
2225 unsigned char *adjusted_dst_end
= dst_end
- 3;
2226 Lisp_Object translation_table
2227 = coding
->translation_table_for_decode
;
2228 int result
= CODING_FINISH_NORMAL
;
2230 if (!NILP (Venable_character_translation
) && NILP (translation_table
))
2231 translation_table
= Vstandard_translation_table_for_decode
;
2233 coding
->produced_char
= 0;
2234 coding
->fake_multibyte
= 0;
2235 while (src
< src_end
&& (dst_bytes
2236 ? (dst
< adjusted_dst_end
)
2239 /* SRC_BASE remembers the start position in source in each loop.
2240 The loop will be exited when there's not enough source text
2241 to analyze two-byte character (within macro ONE_MORE_BYTE).
2242 In that case, SRC is reset to SRC_BASE before exiting. */
2243 unsigned char *src_base
= src
;
2244 unsigned char c1
= *src
++, c2
, c3
, c4
;
2250 if (coding
->eol_type
== CODING_EOL_CRLF
)
2255 else if (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
2257 result
= CODING_FINISH_INCONSISTENT_EOL
;
2258 goto label_end_of_loop_2
;
2261 /* To process C2 again, SRC is subtracted by 1. */
2264 else if (coding
->eol_type
== CODING_EOL_CR
)
2270 && (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
2271 && (coding
->eol_type
== CODING_EOL_CR
2272 || coding
->eol_type
== CODING_EOL_CRLF
))
2274 result
= CODING_FINISH_INCONSISTENT_EOL
;
2275 goto label_end_of_loop_2
;
2279 coding
->produced_char
++;
2282 DECODE_SJIS_BIG5_CHARACTER (charset_ascii
, c1
, /* dummy */ c2
);
2287 if (c1
< 0xA0 || (c1
>= 0xE0 && c1
< 0xF0))
2289 /* SJIS -> JISX0208 */
2291 if (c2
>= 0x40 && c2
!= 0x7F && c2
<= 0xFC)
2293 DECODE_SJIS (c1
, c2
, c3
, c4
);
2294 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208
, c3
, c4
);
2297 goto label_invalid_code_2
;
2300 /* SJIS -> JISX0201-Kana */
2301 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201
, c1
,
2304 goto label_invalid_code_1
;
2309 if (c1
>= 0xA1 && c1
<= 0xFE)
2312 if ((c2
>= 0x40 && c2
<= 0x7E) || (c2
>= 0xA1 && c2
<= 0xFE))
2316 DECODE_BIG5 (c1
, c2
, charset
, c3
, c4
);
2317 DECODE_SJIS_BIG5_CHARACTER (charset
, c3
, c4
);
2320 goto label_invalid_code_2
;
2323 goto label_invalid_code_1
;
2328 label_invalid_code_1
:
2330 coding
->produced_char
++;
2331 coding
->fake_multibyte
= 1;
2334 label_invalid_code_2
:
2335 *dst
++ = c1
; *dst
++= c2
;
2336 coding
->produced_char
+= 2;
2337 coding
->fake_multibyte
= 1;
2341 result
= CODING_FINISH_INSUFFICIENT_SRC
;
2342 label_end_of_loop_2
:
2349 if (result
== CODING_FINISH_NORMAL
)
2350 result
= CODING_FINISH_INSUFFICIENT_DST
;
2351 else if (result
!= CODING_FINISH_INCONSISTENT_EOL
2352 && coding
->mode
& CODING_MODE_LAST_BLOCK
)
2354 src_bytes
= src_end
- src
;
2355 if (dst_bytes
&& (dst_end
- dst
< src_bytes
))
2356 src_bytes
= dst_end
- dst
;
/* This is the last block: flush the unconsumed source bytes to the
   destination.  bcopy takes (from, to, length), so SRC must be the
   first argument, matching the equivalent flush in decode_eol.  */
2357 bcopy (src
, dst
, src_bytes
);
2360 coding
->fake_multibyte
= 1;
2364 coding
->consumed
= coding
->consumed_char
= src
- source
;
2365 coding
->produced
= dst
- destination
;
2369 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2370 This function can encode `charset_ascii', `charset_katakana_jisx0201',
2371 `charset_jisx0208', `charset_big5_1', and `charset_big5_2'. We are
2372 sure that all these charsets are registered as official charset
2373 (i.e. do not have extended leading-codes). Characters of other
2374 charsets are produced without any encoding. If SJIS_P is 1, encode
2375 SJIS text, else encode BIG5 text. */
/* Encode SRC_BYTES bytes of Emacs-internal text at SOURCE into SJIS
   (SJIS_P nonzero) or BIG5 (SJIS_P zero) at DESTINATION, which has room
   for DST_BYTES bytes.

   CODING supplies the eol type, the mode flags, the composing state and
   the encode translation table; when character translation is enabled
   and CODING has no table, Vstandard_translation_table_for_encode is
   used.  Each source byte is dispatched on emacs_code_class[c1].  On
   exit coding->consumed, coding->consumed_char, coding->produced,
   coding->produced_char and coding->fake_multibyte are updated.

   RESULT takes one of CODING_FINISH_NORMAL,
   CODING_FINISH_INSUFFICIENT_SRC or CODING_FINISH_INSUFFICIENT_DST;
   presumably it is the return value (the return statement is not
   visible in this listing).

   NOTE(review): the comment below reserves one byte of slack on the
   grounds that an iteration produces at most 2 bytes, yet the fallback
   branch of ENCODE_SJIS_BIG5_CHARACTER stores three bytes (charset, c1,
   c2) -- confirm the slack is sufficient.

   NOTE(review): a number of lines of this function are absent from this
   listing.  */
2378 encode_coding_sjis_big5 (coding
, source
, destination
,
2379 src_bytes
, dst_bytes
, sjis_p
)
2380 struct coding_system
*coding
;
2381 unsigned char *source
, *destination
;
2382 int src_bytes
, dst_bytes
;
2385 unsigned char *src
= source
;
2386 unsigned char *src_end
= source
+ src_bytes
;
2387 unsigned char *dst
= destination
;
2388 unsigned char *dst_end
= destination
+ dst_bytes
;
2389 /* Since the maximum bytes produced by each loop is 2, we subtract 1
2390 from DST_END to assure overflow checking is necessary only at the
2392 unsigned char *adjusted_dst_end
= dst_end
- 1;
2393 Lisp_Object translation_table
2394 = coding
->translation_table_for_encode
;
2395 int result
= CODING_FINISH_NORMAL
;
2397 if (!NILP (Venable_character_translation
) && NILP (translation_table
))
2398 translation_table
= Vstandard_translation_table_for_encode
;
2400 coding
->consumed_char
= 0;
2401 coding
->fake_multibyte
= 0;
2402 while (src
< src_end
&& (dst_bytes
2403 ? (dst
< adjusted_dst_end
)
2406 /* SRC_BASE remembers the start position in source in each loop.
2407 The loop will be exited when there's not enough source text
2408 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2409 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
2411 unsigned char *src_base
= src
;
2412 unsigned char c1
= *src
++, c2
, c3
, c4
;
2414 if (coding
->composing
)
2421 else if (c1
>= 0xA0)
2424 coding
->composing
= 0;
2427 switch (emacs_code_class
[c1
])
2429 case EMACS_ascii_code
:
2430 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii
, c1
, /* dummy */ c2
);
2433 case EMACS_control_code
:
2435 coding
->consumed_char
++;
2438 case EMACS_carriage_return_code
:
2439 if (! (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
))
2442 coding
->consumed_char
++;
2445 /* fall through to treat '\r' as '\n' ... */
2447 case EMACS_linefeed_code
:
2448 if (coding
->eol_type
== CODING_EOL_LF
2449 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
2451 else if (coding
->eol_type
== CODING_EOL_CRLF
)
2452 *dst
++ = '\r', *dst
++ = '\n';
2455 coding
->consumed_char
++;
2458 case EMACS_leading_code_2
:
2460 ENCODE_SJIS_BIG5_CHARACTER (c1
, c2
, /* dummy */ c3
);
2463 case EMACS_leading_code_3
:
2464 TWO_MORE_BYTES (c2
, c3
);
2465 ENCODE_SJIS_BIG5_CHARACTER (c1
, c2
, c3
);
2468 case EMACS_leading_code_4
:
2469 THREE_MORE_BYTES (c2
, c3
, c4
);
2470 ENCODE_SJIS_BIG5_CHARACTER (c2
, c3
, c4
);
2473 case EMACS_leading_code_composition
:
2474 coding
->composing
= 1;
2477 default: /* i.e. case EMACS_invalid_code: */
2479 coding
->consumed_char
++;
2484 result
= CODING_FINISH_INSUFFICIENT_SRC
;
2489 if (result
== CODING_FINISH_NORMAL
2491 result
= CODING_FINISH_INSUFFICIENT_DST
;
2492 coding
->consumed
= src
- source
;
2493 coding
->produced
= coding
->produced_char
= dst
- destination
;
2498 /*** 5. CCL handlers ***/
2500 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2501 Check if a text is encoded in a coding system of which
2502 encoder/decoder are written in CCL program. If it is, return
2503 CODING_CATEGORY_MASK_CCL, else return 0. */
/* Check every byte in [SRC, SRC_END) against the valid_codes table of
   the coding system registered under CODING_CATEGORY_IDX_CCL.  Returns
   0 as soon as a byte is not marked valid, CODING_CATEGORY_MASK_CCL
   otherwise.  NOTE(review): some lines (braces, the early return for an
   unassigned category, the pointer advance) are absent from this
   listing.  */
2506 detect_coding_ccl (src
, src_end
)
2507 unsigned char *src
, *src_end
;
2509 unsigned char *valid
;
2511 /* No coding system is assigned to coding-category-ccl. */
2512 if (!coding_system_table
[CODING_CATEGORY_IDX_CCL
])
/* VALID is indexed by byte value; a zero entry marks a byte the CCL
   coding system cannot contain.  */
2515 valid
= coding_system_table
[CODING_CATEGORY_IDX_CCL
]->spec
.ccl
.valid_codes
;
2516 while (src
< src_end
)
2518 if (! valid
[*src
]) return 0;
2521 return CODING_CATEGORY_MASK_CCL
;
2525 /*** 6. End-of-line handlers ***/
2527 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2528 This function is called only when `coding->eol_type' is
2529 CODING_EOL_CRLF or CODING_EOL_CR. */
/* Copy SRC_BYTES bytes from SOURCE to DESTINATION (room for DST_BYTES
   bytes), converting the end-of-line format selected by
   coding->eol_type.

   The CODING_EOL_CRLF case runs a byte loop bounded by
   ADJUSTED_DST_END.  A later case block-copies the text and then
   rewrites every '\r' in the copy to '\n' (presumably the
   CODING_EOL_CR case; its label is not visible in this listing).  The
   default case is a plain copy.

   When CODING_MODE_INHIBIT_INCONSISTENT_EOL is set, an inconsistent
   line ending yields CODING_FINISH_INCONSISTENT_EOL.  If the
   destination is too small the result is
   CODING_FINISH_INSUFFICIENT_DST.  coding->fake_multibyte is set when a
   byte satisfying BASE_LEADING_CODE_P is seen.  On exit
   coding->consumed, coding->consumed_char, coding->produced and
   coding->produced_char are updated.

   Each copy appears as a bcopy / safe_bcopy pair; presumably the choice
   depends on DST_BYTES (the selecting condition is not visible).

   NOTE(review): a number of lines of this function are absent from this
   listing.  */
2532 decode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
)
2533 struct coding_system
*coding
;
2534 unsigned char *source
, *destination
;
2535 int src_bytes
, dst_bytes
;
2537 unsigned char *src
= source
;
2538 unsigned char *src_end
= source
+ src_bytes
;
2539 unsigned char *dst
= destination
;
2540 unsigned char *dst_end
= destination
+ dst_bytes
;
2542 int result
= CODING_FINISH_NORMAL
;
2544 coding
->fake_multibyte
= 0;
2549 switch (coding
->eol_type
)
2551 case CODING_EOL_CRLF
:
2553 /* Since the maximum bytes produced by each loop is 2, we
2554 subtract 1 from DST_END to assure overflow checking is
2555 necessary only at the head of loop. */
2556 unsigned char *adjusted_dst_end
= dst_end
- 1;
2558 while (src
< src_end
&& (dst_bytes
2559 ? (dst
< adjusted_dst_end
)
2562 unsigned char *src_base
= src
;
2572 if (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
2574 result
= CODING_FINISH_INCONSISTENT_EOL
;
2575 goto label_end_of_loop_2
;
2579 if (BASE_LEADING_CODE_P (c
))
2580 coding
->fake_multibyte
= 1;
2584 && (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
))
2586 result
= CODING_FINISH_INCONSISTENT_EOL
;
2587 goto label_end_of_loop_2
;
2592 if (BASE_LEADING_CODE_P (c
))
2593 coding
->fake_multibyte
= 1;
2598 result
= CODING_FINISH_INSUFFICIENT_SRC
;
2599 label_end_of_loop_2
:
2605 if (result
== CODING_FINISH_NORMAL
)
2606 result
= CODING_FINISH_INSUFFICIENT_DST
;
2607 else if (result
!= CODING_FINISH_INCONSISTENT_EOL
2608 && coding
->mode
& CODING_MODE_LAST_BLOCK
)
2610 /* This is the last block of the text to be decoded.
2611 We flush out all remaining codes. */
2612 src_bytes
= src_end
- src
;
2613 if (dst_bytes
&& (dst_end
- dst
< src_bytes
))
2614 src_bytes
= dst_end
- dst
;
2615 bcopy (src
, dst
, src_bytes
);
2624 if (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
2626 while (src
< src_end
)
2628 if ((c
= *src
++) == '\n')
2630 if (BASE_LEADING_CODE_P (c
))
2631 coding
->fake_multibyte
= 1;
2635 src_bytes
= src
- source
;
2636 result
= CODING_FINISH_INCONSISTENT_EOL
;
2639 if (dst_bytes
&& src_bytes
> dst_bytes
)
2641 result
= CODING_FINISH_INSUFFICIENT_DST
;
2642 src_bytes
= dst_bytes
;
2645 bcopy (source
, destination
, src_bytes
);
2647 safe_bcopy (source
, destination
, src_bytes
);
2648 src
= source
+ src_bytes
;
/* Rewrite each '\r' in the copied text to '\n' in place.  */
2649 while (src_bytes
--) if (*dst
++ == '\r') dst
[-1] = '\n';
2652 default: /* i.e. case: CODING_EOL_LF */
2653 if (dst_bytes
&& src_bytes
> dst_bytes
)
2655 result
= CODING_FINISH_INSUFFICIENT_DST
;
2656 src_bytes
= dst_bytes
;
2659 bcopy (source
, destination
, src_bytes
);
2661 safe_bcopy (source
, destination
, src_bytes
);
2664 coding
->fake_multibyte
= 1;
2668 coding
->consumed
= coding
->consumed_char
= src
- source
;
2669 coding
->produced
= coding
->produced_char
= dst
- destination
;
2673 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2674 format of end-of-line according to `coding->eol_type'. If
2675 `coding->mode & CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code
2676 '\r' in source text also means end-of-line. */
/* Copy SRC_BYTES bytes from SOURCE to DESTINATION (room for DST_BYTES
   bytes), writing line endings in the format selected by
   coding->eol_type.

   For CODING_EOL_CRLF a byte loop bounded by ADJUSTED_DST_END writes
   "\r\n" for a matching byte; a '\r' also matches when
   CODING_MODE_SELECTIVE_DISPLAY is set.  Otherwise the text is
   block-copied (truncated to DST_BYTES with
   CODING_FINISH_INSUFFICIENT_DST if it does not fit) and the copy is
   then scanned: one scan runs under CODING_EOL_CR, another under
   CODING_MODE_SELECTIVE_DISPLAY and rewrites '\r' to '\n'.

   coding->fake_multibyte is set when a byte satisfying
   BASE_LEADING_CODE_P is seen.  On exit coding->consumed,
   coding->consumed_char, coding->produced and coding->produced_char
   are updated.

   NOTE(review): a number of lines of this function are absent from this
   listing, so the exact branch structure should be confirmed against
   the full source.  */
2679 encode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
)
2680 struct coding_system
*coding
;
2681 unsigned char *source
, *destination
;
2682 int src_bytes
, dst_bytes
;
2684 unsigned char *src
= source
;
2685 unsigned char *dst
= destination
;
2686 int result
= CODING_FINISH_NORMAL
;
2688 coding
->fake_multibyte
= 0;
2690 if (coding
->eol_type
== CODING_EOL_CRLF
)
2693 unsigned char *src_end
= source
+ src_bytes
;
2694 unsigned char *dst_end
= destination
+ dst_bytes
;
2695 /* Since the maximum bytes produced by each loop is 2, we
2696 subtract 1 from DST_END to assure overflow checking is
2697 necessary only at the head of loop. */
2698 unsigned char *adjusted_dst_end
= dst_end
- 1;
2700 while (src
< src_end
&& (dst_bytes
2701 ? (dst
< adjusted_dst_end
)
2706 || (c
== '\r' && (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
)))
2707 *dst
++ = '\r', *dst
++ = '\n';
2711 if (BASE_LEADING_CODE_P (c
))
2712 coding
->fake_multibyte
= 1;
2716 result
= CODING_FINISH_INSUFFICIENT_DST
;
2722 if (dst_bytes
&& src_bytes
> dst_bytes
)
2724 src_bytes
= dst_bytes
;
2725 result
= CODING_FINISH_INSUFFICIENT_DST
;
2728 bcopy (source
, destination
, src_bytes
);
2730 safe_bcopy (source
, destination
, src_bytes
);
/* From here on DST_BYTES holds the number of bytes actually copied.  */
2731 dst_bytes
= src_bytes
;
2732 if (coding
->eol_type
== CODING_EOL_CR
)
2736 if ((c
= *dst
++) == '\n')
2738 else if (BASE_LEADING_CODE_P (c
))
2739 coding
->fake_multibyte
= 1;
2744 if (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
)
2747 if (*dst
++ == '\r') dst
[-1] = '\n';
2749 coding
->fake_multibyte
= 1;
2751 src
= source
+ dst_bytes
;
2752 dst
= destination
+ dst_bytes
;
2755 coding
->consumed
= coding
->consumed_char
= src
- source
;
2756 coding
->produced
= coding
->produced_char
= dst
- destination
;
2761 /*** 7. C library functions ***/
2763 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2764 has a property `coding-system'. The value of this property is a
2765 vector of length 5 (called the coding-vector). Among elements of
2766 this vector, the first (element[0]) and the fifth (element[4])
2767 carry important information for decoding/encoding. Before
2768 decoding/encoding, this information should be set in fields of a
2769 structure of type `coding_system'.
2771 A value of property `coding-system' can be a symbol of another
2772 subsidiary coding-system. In that case, Emacs gets coding-vector
2775 `element[0]' contains information to be set in `coding->type'. The
2776 value and its meaning is as follows:
2778 0 -- coding_type_emacs_mule
2779 1 -- coding_type_sjis
2780 2 -- coding_type_iso2022
2781 3 -- coding_type_big5
2782 4 -- coding_type_ccl encoder/decoder written in CCL
2783 nil -- coding_type_no_conversion
2784 t -- coding_type_undecided (automatic conversion on decoding,
2785 no-conversion on encoding)
2787 `element[4]' contains information to be set in `coding->flags' and
2788 `coding->spec'. The meaning varies by `coding->type'.
2790 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2791 of length 32 (of which the first 13 sub-elements are used now).
2792 Meanings of these sub-elements are:
2794 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2795 If the value is an integer of valid charset, the charset is
2796 assumed to be designated to graphic register N initially.
2798 If the value is minus, it is a minus value of charset which
2799 reserves graphic register N, which means that the charset is
2800 not designated initially but should be designated to graphic
2801 register N just before encoding a character in that charset.
2803 If the value is nil, graphic register N is never used on
2806 sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2807 Each value takes t or nil. See the section ISO2022 of
2808 `coding.h' for more information.
2810 If `coding->type' is `coding_type_big5', element[4] is t to denote
2811 BIG5-ETen or nil to denote BIG5-HKU.
2813 If `coding->type' takes the other value, element[4] is ignored.
2815 Emacs Lisp's coding system also carries information about format of
2816 end-of-line in a value of property `eol-type'. If the value is
2817 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2818 means CODING_EOL_CR. If it is not integer, it should be a vector
2819 of subsidiary coding systems of which property `eol-type' has one
2824 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2825 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2826 is setup so that no conversion is necessary and return -1, else
/* Fill in CODING from the Lisp coding system symbol CODING_SYSTEM.

   The `coding-system' property of CODING_SYSTEM must be a vector of
   length 5 whose element 3 is a cons (the property list); otherwise
   control goes to label_invalid_coding_system.  The function then sets:
     - coding->eol_type and the matching common_flags from the
       `eol-type' property (skipped when inhibit_eol_conversion is set);
     - coding->type from element 0 of the vector;
     - post_read_conversion, pre_write_conversion, the two translation
       tables, category_idx and safe_charsets from the property list;
     - type-specific state for ISO2022 (flags, invocations, revision
       numbers, initial and requested designations), BIG5 (HKU or ETen
       flag) and CCL (decoder and encoder programs, valid_codes).

   At label_invalid_coding_system CODING is reset to
   coding_type_no_conversion, CODING_CATEGORY_IDX_BINARY, no common
   flags, CODING_EOL_LF and nil conversion functions.

   NOTE(review): many lines of this function (braces, `case' labels,
   several conditions and the return statements) are absent from this
   listing.  */
2830 setup_coding_system (coding_system
, coding
)
2831 Lisp_Object coding_system
;
2832 struct coding_system
*coding
;
2834 Lisp_Object coding_spec
, coding_type
, eol_type
, plist
;
2838 /* Initialize some fields required for all kinds of coding systems. */
2839 coding
->symbol
= coding_system
;
2840 coding
->common_flags
= 0;
2842 coding
->heading_ascii
= -1;
2843 coding
->post_read_conversion
= coding
->pre_write_conversion
= Qnil
;
2844 coding_spec
= Fget (coding_system
, Qcoding_system
);
2845 if (!VECTORP (coding_spec
)
2846 || XVECTOR (coding_spec
)->size
!= 5
2847 || !CONSP (XVECTOR (coding_spec
)->contents
[3]))
2848 goto label_invalid_coding_system
;
/* A vector-valued `eol-type' means the format is still undecided; the
   integers 1 and 2 select CRLF and CR; the remaining visible assignment
   selects LF.  */
2850 eol_type
= inhibit_eol_conversion
? Qnil
: Fget (coding_system
, Qeol_type
);
2851 if (VECTORP (eol_type
))
2853 coding
->eol_type
= CODING_EOL_UNDECIDED
;
2854 coding
->common_flags
= CODING_REQUIRE_DETECTION_MASK
;
2856 else if (XFASTINT (eol_type
) == 1)
2858 coding
->eol_type
= CODING_EOL_CRLF
;
2859 coding
->common_flags
2860 = CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
2862 else if (XFASTINT (eol_type
) == 2)
2864 coding
->eol_type
= CODING_EOL_CR
;
2865 coding
->common_flags
2866 = CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
2869 coding
->eol_type
= CODING_EOL_LF
;
2871 coding_type
= XVECTOR (coding_spec
)->contents
[0];
2872 /* Try short cut. */
2873 if (SYMBOLP (coding_type
))
2875 if (EQ (coding_type
, Qt
))
2877 coding
->type
= coding_type_undecided
;
2878 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
2881 coding
->type
= coding_type_no_conversion
;
2885 /* Initialize remaining fields. */
2886 coding
->composing
= 0;
2887 coding
->composed_chars
= 0;
2889 /* Get values of coding system properties:
2890 `post-read-conversion', `pre-write-conversion',
2891 `translation-table-for-decode', `translation-table-for-encode'. */
2892 plist
= XVECTOR (coding_spec
)->contents
[3];
2893 coding
->post_read_conversion
= Fplist_get (plist
, Qpost_read_conversion
);
2894 coding
->pre_write_conversion
= Fplist_get (plist
, Qpre_write_conversion
);
2895 val
= Fplist_get (plist
, Qtranslation_table_for_decode
);
2897 val
= Fget (val
, Qtranslation_table_for_decode
);
2898 coding
->translation_table_for_decode
= CHAR_TABLE_P (val
) ? val
: Qnil
;
2899 val
= Fplist_get (plist
, Qtranslation_table_for_encode
);
2901 val
= Fget (val
, Qtranslation_table_for_encode
);
2902 coding
->translation_table_for_encode
= CHAR_TABLE_P (val
) ? val
: Qnil
;
2903 val
= Fplist_get (plist
, Qcoding_category
);
2906 val
= Fget (val
, Qcoding_category_index
);
2908 coding
->category_idx
= XINT (val
);
2910 goto label_invalid_coding_system
;
2913 goto label_invalid_coding_system
;
/* safe_charsets is either filled with 1 for every charset or cleared
   and then set only for the charsets named in the `safe-charsets'
   list; the condition choosing between the two is not visible here.  */
2915 val
= Fplist_get (plist
, Qsafe_charsets
);
2918 for (i
= 0; i
<= MAX_CHARSET
; i
++)
2919 coding
->safe_charsets
[i
] = 1;
2923 bzero (coding
->safe_charsets
, MAX_CHARSET
+ 1);
2926 if ((i
= get_charset_id (XCONS (val
)->car
)) >= 0)
2927 coding
->safe_charsets
[i
] = 1;
2928 val
= XCONS (val
)->cdr
;
2932 switch (XFASTINT (coding_type
))
2935 coding
->type
= coding_type_emacs_mule
;
2936 if (!NILP (coding
->post_read_conversion
))
2937 coding
->common_flags
|= CODING_REQUIRE_DECODING_MASK
;
2938 if (!NILP (coding
->pre_write_conversion
))
2939 coding
->common_flags
|= CODING_REQUIRE_ENCODING_MASK
;
2943 coding
->type
= coding_type_sjis
;
2944 coding
->common_flags
2945 |= CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
2949 coding
->type
= coding_type_iso2022
;
2950 coding
->common_flags
2951 |= CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
2953 Lisp_Object val
, temp
;
2955 int i
, charset
, reg_bits
= 0;
2957 val
= XVECTOR (coding_spec
)->contents
[4];
2959 if (!VECTORP (val
) || XVECTOR (val
)->size
!= 32)
2960 goto label_invalid_coding_system
;
2962 flags
= XVECTOR (val
)->contents
;
2964 = ((NILP (flags
[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM
)
2965 | (NILP (flags
[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL
)
2966 | (NILP (flags
[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL
)
2967 | (NILP (flags
[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS
)
2968 | (NILP (flags
[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT
)
2969 | (NILP (flags
[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT
)
2970 | (NILP (flags
[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN
)
2971 | (NILP (flags
[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS
)
2972 | (NILP (flags
[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION
)
2973 | (NILP (flags
[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL
)
2974 | (NILP (flags
[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL
)
2975 | (NILP (flags
[15]) ? 0 : CODING_FLAG_ISO_SAFE
)
2976 | (NILP (flags
[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA
)
2979 /* Invoke graphic register 0 to plane 0. */
2980 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 0;
2981 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2982 CODING_SPEC_ISO_INVOCATION (coding
, 1)
2983 = (coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
? -1 : 1);
2984 /* Not single shifting at first. */
2985 CODING_SPEC_ISO_SINGLE_SHIFTING (coding
) = 0;
2986 /* Beginning of buffer should also be regarded as bol. */
2987 CODING_SPEC_ISO_BOL (coding
) = 1;
/* Every charset starts with revision number 255; entries of
   Vcharset_revision_alist whose value I satisfies I >= 0 and
   I + '@' < 128 then override it.  */
2989 for (charset
= 0; charset
<= MAX_CHARSET
; charset
++)
2990 CODING_SPEC_ISO_REVISION_NUMBER (coding
, charset
) = 255;
2991 val
= Vcharset_revision_alist
;
2994 charset
= get_charset_id (Fcar_safe (XCONS (val
)->car
));
2996 && (temp
= Fcdr_safe (XCONS (val
)->car
), INTEGERP (temp
))
2997 && (i
= XINT (temp
), (i
>= 0 && (i
+ '@') < 128)))
2998 CODING_SPEC_ISO_REVISION_NUMBER (coding
, charset
) = i
;
2999 val
= XCONS (val
)->cdr
;
3002 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3003 FLAGS[REG] can be one of below:
3004 integer CHARSET: CHARSET occupies register I,
3005 t: designate nothing to REG initially, but can be used
3007 list of integer, nil, or t: designate the first
3008 element (if integer) to REG initially, the remaining
3009 elements (if integer) is designated to REG on request,
3010 if an element is t, REG can be used by any charsets,
3011 nil: REG is never used. */
3012 for (charset
= 0; charset
<= MAX_CHARSET
; charset
++)
3013 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
3014 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
;
3015 for (i
= 0; i
< 4; i
++)
3017 if (INTEGERP (flags
[i
])
3018 && (charset
= XINT (flags
[i
]), CHARSET_VALID_P (charset
))
3019 || (charset
= get_charset_id (flags
[i
])) >= 0)
3021 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = charset
;
3022 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
) = i
;
3024 else if (EQ (flags
[i
], Qt
))
3026 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
3028 coding
->flags
|= CODING_FLAG_ISO_DESIGNATION
;
3030 else if (CONSP (flags
[i
]))
3035 coding
->flags
|= CODING_FLAG_ISO_DESIGNATION
;
3036 if (INTEGERP (XCONS (tail
)->car
)
3037 && (charset
= XINT (XCONS (tail
)->car
),
3038 CHARSET_VALID_P (charset
))
3039 || (charset
= get_charset_id (XCONS (tail
)->car
)) >= 0)
3041 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = charset
;
3042 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
) =i
;
3045 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
3046 tail
= XCONS (tail
)->cdr
;
3047 while (CONSP (tail
))
3049 if (INTEGERP (XCONS (tail
)->car
)
3050 && (charset
= XINT (XCONS (tail
)->car
),
3051 CHARSET_VALID_P (charset
))
3052 || (charset
= get_charset_id (XCONS (tail
)->car
)) >= 0)
3053 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
3055 else if (EQ (XCONS (tail
)->car
, Qt
))
3057 tail
= XCONS (tail
)->cdr
;
3061 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
3063 CODING_SPEC_ISO_DESIGNATION (coding
, i
)
3064 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
);
3067 if (reg_bits
&& ! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
))
3069 /* REG 1 can be used only by locking shift in 7-bit env. */
3070 if (coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
)
3072 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
))
3073 /* Without any shifting, only REG 0 and 1 can be used. */
3078 for (charset
= 0; charset
<= MAX_CHARSET
; charset
++)
3080 if (CHARSET_VALID_P (charset
))
3082 /* There exist some default graphic registers to be
3085 /* We had better avoid designating a charset of
3086 CHARS96 to REG 0 as far as possible. */
3087 if (CHARSET_CHARS (charset
) == 96)
3088 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
3090 ? 1 : (reg_bits
& 4 ? 2 : (reg_bits
& 8 ? 3 : 0)));
3092 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
3094 ? 0 : (reg_bits
& 2 ? 1 : (reg_bits
& 4 ? 2 : 3)));
3098 coding
->common_flags
|= CODING_REQUIRE_FLUSHING_MASK
;
3099 coding
->spec
.iso2022
.last_invalid_designation_register
= -1;
3103 coding
->type
= coding_type_big5
;
3104 coding
->common_flags
3105 |= CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
3107 = (NILP (XVECTOR (coding_spec
)->contents
[4])
3108 ? CODING_FLAG_BIG5_HKU
3109 : CODING_FLAG_BIG5_ETEN
);
3113 coding
->type
= coding_type_ccl
;
3114 coding
->common_flags
3115 |= CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
3118 Lisp_Object decoder
, encoder
;
/* Element 4 is used here as a pair of symbols (car for the decoder, cdr
   for the encoder); each is resolved through its `ccl-program-idx'
   property and Vccl_program_table.  */
3120 val
= XVECTOR (coding_spec
)->contents
[4];
3122 && SYMBOLP (XCONS (val
)->car
)
3123 && !NILP (decoder
= Fget (XCONS (val
)->car
, Qccl_program_idx
))
3124 && !NILP (decoder
= Fcdr (Faref (Vccl_program_table
, decoder
)))
3125 && SYMBOLP (XCONS (val
)->cdr
)
3126 && !NILP (encoder
= Fget (XCONS (val
)->cdr
, Qccl_program_idx
))
3127 && !NILP (encoder
= Fcdr (Faref (Vccl_program_table
, encoder
))))
3129 setup_ccl_program (&(coding
->spec
.ccl
.decoder
), decoder
);
3130 setup_ccl_program (&(coding
->spec
.ccl
.encoder
), encoder
);
3133 goto label_invalid_coding_system
;
/* valid_codes is cleared, then set for each integer in 0..255 and for
   each (START . END) pair with 0 <= START <= END < 256 found in the
   `valid-codes' property.  */
3135 bzero (coding
->spec
.ccl
.valid_codes
, 256);
3136 val
= Fplist_get (plist
, Qvalid_codes
);
3141 for (; CONSP (val
); val
= XCONS (val
)->cdr
)
3143 this = XCONS (val
)->car
;
3145 && XINT (this) >= 0 && XINT (this) < 256)
3146 coding
->spec
.ccl
.valid_codes
[XINT (this)] = 1;
3147 else if (CONSP (this)
3148 && INTEGERP (XCONS (this)->car
)
3149 && INTEGERP (XCONS (this)->cdr
))
3151 int start
= XINT (XCONS (this)->car
);
3152 int end
= XINT (XCONS (this)->cdr
);
3154 if (start
>= 0 && start
<= end
&& end
< 256)
3155 while (start
<= end
)
3156 coding
->spec
.ccl
.valid_codes
[start
++] = 1;
3161 coding
->common_flags
|= CODING_REQUIRE_FLUSHING_MASK
;
3165 coding
->type
= coding_type_raw_text
;
3169 goto label_invalid_coding_system
;
/* Failure path: leave CODING set up so that no conversion is done.  */
3173 label_invalid_coding_system
:
3174 coding
->type
= coding_type_no_conversion
;
3175 coding
->category_idx
= CODING_CATEGORY_IDX_BINARY
;
3176 coding
->common_flags
= 0;
3177 coding
->eol_type
= CODING_EOL_LF
;
3178 coding
->pre_write_conversion
= coding
->post_read_conversion
= Qnil
;
3182 /* Setup raw-text or one of its subsidiaries in the structure
3183 coding_system CODING according to the already setup value eol_type
3184 in CODING. CODING should be setup for some coding system in
/* Switch CODING to raw-text unless its type is already
   coding_type_raw_text: the symbol becomes Qraw_text and the type
   coding_type_raw_text.  If the eol type is already decided and the
   `eol-type' property of raw-text is a 3-element vector, the element
   indexed by coding->eol_type is selected (the left-hand side of that
   assignment is not visible in this listing; presumably it is
   coding->symbol).  */
3188 setup_raw_text_coding_system (coding
)
3189 struct coding_system
*coding
;
3191 if (coding
->type
!= coding_type_raw_text
)
3193 coding
->symbol
= Qraw_text
;
3194 coding
->type
= coding_type_raw_text
;
3195 if (coding
->eol_type
!= CODING_EOL_UNDECIDED
)
3197 Lisp_Object subsidiaries
;
3198 subsidiaries
= Fget (Qraw_text
, Qeol_type
);
3200 if (VECTORP (subsidiaries
)
3201 && XVECTOR (subsidiaries
)->size
== 3)
3203 = XVECTOR (subsidiaries
)->contents
[coding
->eol_type
];
3209 /* Emacs has a mechanism to automatically detect a coding system if it
3210 is one of Emacs' internal format, ISO2022, SJIS, or BIG5. But,
3211 it's impossible to distinguish some coding systems accurately
3212 because they use the same range of codes. So, at first, coding
3213 systems are categorized as follows:
3215 o coding-category-emacs-mule
3217 The category for a coding system which has the same code range
3218 as Emacs' internal format. Assigned the coding-system (Lisp
3219 symbol) `emacs-mule' by default.
3221 o coding-category-sjis
3223 The category for a coding system which has the same code range
3224 as SJIS. Assigned the coding-system (Lisp
3225 symbol) `japanese-shift-jis' by default.
3227 o coding-category-iso-7
3229 The category for a coding system which has the same code range
3230 as ISO2022 of 7-bit environment. This doesn't use any locking
3231 shift and single shift functions. This can encode/decode all
3232 charsets. Assigned the coding-system (Lisp symbol)
3233 `iso-2022-7bit' by default.
3235 o coding-category-iso-7-tight
3237 Same as coding-category-iso-7 except that this can
3238 encode/decode only the specified charsets.
3240 o coding-category-iso-8-1
3242 The category for a coding system which has the same code range
3243 as ISO2022 of 8-bit environment and graphic plane 1 used only
3244 for DIMENSION1 charset. This doesn't use any locking shift
3245 and single shift functions. Assigned the coding-system (Lisp
3246 symbol) `iso-latin-1' by default.
3248 o coding-category-iso-8-2
3250 The category for a coding system which has the same code range
3251 as ISO2022 of 8-bit environment and graphic plane 1 used only
3252 for DIMENSION2 charset. This doesn't use any locking shift
3253 and single shift functions. Assigned the coding-system (Lisp
3254 symbol) `japanese-iso-8bit' by default.
3256 o coding-category-iso-7-else
3258 The category for a coding system which has the same code range
3259 as ISO2022 of 7-bit environment but uses locking shift or
3260 single shift functions. Assigned the coding-system (Lisp
3261 symbol) `iso-2022-7bit-lock' by default.
3263 o coding-category-iso-8-else
3265 The category for a coding system which has the same code range
3266 as ISO2022 of 8-bit environment but uses locking shift or
3267 single shift functions. Assigned the coding-system (Lisp
3268 symbol) `iso-2022-8bit-ss2' by default.
3270 o coding-category-big5
3272 The category for a coding system which has the same code range
3273 as BIG5. Assigned the coding-system (Lisp symbol)
3274 `cn-big5' by default.
3276 o coding-category-ccl
3278 The category for a coding system of which encoder/decoder is
3279 written in CCL programs. The default value is nil, i.e., no
3280 coding system is assigned.
3282 o coding-category-binary
3284 The category for a coding system not categorized in any of the
3285 above. Assigned the coding-system (Lisp symbol)
3286 `no-conversion' by default.
3288 Each of them is a Lisp symbol and the value is an actual
3289 `coding-system's (this is also a Lisp symbol) assigned by a user.
3290 What Emacs does actually is to detect a category of coding system.
3291 Then, it uses a `coding-system' assigned to it. If Emacs can't
3292 decide only one possible category, it selects a category of the
3293 highest priority. Priorities of categories are also specified by a
3294 user in a Lisp variable `coding-category-list'.
/* Table indexed by byte value: detect_coding_mask advances over a
   leading byte while its entry is nonzero.  The entries for
   ISO_CODE_SO, ISO_CODE_SI and ISO_CODE_ESC are cleared when detection
   starts and set back to 1 when no valid ISO2022 code follows them.  */
3299 int ascii_skip_code
[256];
3301 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3302 If it detects possible coding systems, return an integer in which
3303 appropriate flag bits are set. Flag bits are defined by macros
3304 CODING_CATEGORY_MASK_XXX in `coding.h'.
3306 How many ASCII characters are at the head is returned as *SKIP. */
3309 detect_coding_mask (source
, src_bytes
, priorities
, skip
)
3310 unsigned char *source
;
3311 int src_bytes
, *priorities
, *skip
;
3313 register unsigned char c
;
3314 unsigned char *src
= source
, *src_end
= source
+ src_bytes
;
3318 /* At first, skip all ASCII characters and control characters except
3319 for three ISO2022 specific control characters. */
3320 ascii_skip_code
[ISO_CODE_SO
] = 0;
3321 ascii_skip_code
[ISO_CODE_SI
] = 0;
3322 ascii_skip_code
[ISO_CODE_ESC
] = 0;
3324 label_loop_detect_coding
:
3325 while (src
< src_end
&& ascii_skip_code
[*src
]) src
++;
3326 *skip
= src
- source
;
3329 /* We found nothing other than ASCII. There's nothing to do. */
3333 /* The text seems to be encoded in some multilingual coding system.
3334 Now, try to find in which coding system the text is encoded. */
3337 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3338 /* C is an ISO2022 specific control code of C0. */
3339 mask
= detect_coding_iso2022 (src
, src_end
);
3342 /* No valid ISO2022 code follows C. Try again. */
3344 if (c
== ISO_CODE_ESC
)
3345 ascii_skip_code
[ISO_CODE_ESC
] = 1;
3347 ascii_skip_code
[ISO_CODE_SO
] = ascii_skip_code
[ISO_CODE_SI
] = 1;
3348 goto label_loop_detect_coding
;
3351 goto label_return_highest_only
;
3359 /* C is the first byte of SJIS character code,
3360 or a leading-code of Emacs' internal format (emacs-mule). */
3361 try = CODING_CATEGORY_MASK_SJIS
| CODING_CATEGORY_MASK_EMACS_MULE
;
3363 /* Or, if C is a special latin extra code,
3364 or is an ISO2022 specific control code of C1 (SS2 or SS3),
3365 or is an ISO2022 control-sequence-introducer (CSI),
3366 we should also consider the possibility of ISO2022 codings. */
3367 if ((VECTORP (Vlatin_extra_code_table
)
3368 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
3369 || (c
== ISO_CODE_SS2
|| c
== ISO_CODE_SS3
)
3370 || (c
== ISO_CODE_CSI
3373 || ((*src
== '0' || *src
== '1' || *src
== '2')
3374 && src
+ 1 < src_end
3375 && src
[1] == ']')))))
3376 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3377 | CODING_CATEGORY_MASK_ISO_8BIT
);
3380 /* C is a character of ISO2022 in graphic plane right,
3381 or a SJIS's 1-byte character code (i.e. JISX0201),
3382 or the first byte of BIG5's 2-byte code. */
3383 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3384 | CODING_CATEGORY_MASK_ISO_8BIT
3385 | CODING_CATEGORY_MASK_SJIS
3386 | CODING_CATEGORY_MASK_BIG5
);
3388 /* Or, we may have to consider the possibility of CCL. */
3389 if (coding_system_table
[CODING_CATEGORY_IDX_CCL
]
3390 && (coding_system_table
[CODING_CATEGORY_IDX_CCL
]
3391 ->spec
.ccl
.valid_codes
)[c
])
3392 try |= CODING_CATEGORY_MASK_CCL
;
3397 for (i
= 0; i
< CODING_CATEGORY_IDX_MAX
; i
++)
3399 if (priorities
[i
] & try & CODING_CATEGORY_MASK_ISO
)
3400 mask
= detect_coding_iso2022 (src
, src_end
);
3401 else if (priorities
[i
] & try & CODING_CATEGORY_MASK_SJIS
)
3402 mask
= detect_coding_sjis (src
, src_end
);
3403 else if (priorities
[i
] & try & CODING_CATEGORY_MASK_BIG5
)
3404 mask
= detect_coding_big5 (src
, src_end
);
3405 else if (priorities
[i
] & try & CODING_CATEGORY_MASK_EMACS_MULE
)
3406 mask
= detect_coding_emacs_mule (src
, src_end
);
3407 else if (priorities
[i
] & try & CODING_CATEGORY_MASK_CCL
)
3408 mask
= detect_coding_ccl (src
, src_end
);
3409 else if (priorities
[i
] & CODING_CATEGORY_MASK_RAW_TEXT
)
3410 mask
= CODING_CATEGORY_MASK_RAW_TEXT
;
3411 else if (priorities
[i
] & CODING_CATEGORY_MASK_BINARY
)
3412 mask
= CODING_CATEGORY_MASK_BINARY
;
3414 goto label_return_highest_only
;
3416 return CODING_CATEGORY_MASK_RAW_TEXT
;
3418 if (try & CODING_CATEGORY_MASK_ISO
)
3419 mask
|= detect_coding_iso2022 (src
, src_end
);
3420 if (try & CODING_CATEGORY_MASK_SJIS
)
3421 mask
|= detect_coding_sjis (src
, src_end
);
3422 if (try & CODING_CATEGORY_MASK_BIG5
)
3423 mask
|= detect_coding_big5 (src
, src_end
);
3424 if (try & CODING_CATEGORY_MASK_EMACS_MULE
)
3425 mask
|= detect_coding_emacs_mule (src
, src_end
);
3426 if (try & CODING_CATEGORY_MASK_CCL
)
3427 mask
|= detect_coding_ccl (src
, src_end
);
3429 return (mask
| CODING_CATEGORY_MASK_RAW_TEXT
| CODING_CATEGORY_MASK_BINARY
);
3431 label_return_highest_only
:
3432 for (i
= 0; i
< CODING_CATEGORY_IDX_MAX
; i
++)
3434 if (mask
& priorities
[i
])
3435 return priorities
[i
];
3437 return CODING_CATEGORY_MASK_RAW_TEXT
;
3440 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3441 The information of the detected coding system is set in CODING. */
3444 detect_coding (coding
, src
, src_bytes
)
3445 struct coding_system
*coding
;
3453 val
= Vcoding_category_list
;
3454 mask
= detect_coding_mask (src
, src_bytes
, coding_priorities
, &skip
);
3455 coding
->heading_ascii
= skip
;
3459 /* We found a single coding system of the highest priority in MASK. */
3461 while (mask
&& ! (mask
& 1)) mask
>>= 1, idx
++;
3463 idx
= CODING_CATEGORY_IDX_RAW_TEXT
;
3465 val
= XSYMBOL (XVECTOR (Vcoding_category_table
)->contents
[idx
])->value
;
3467 if (coding
->eol_type
!= CODING_EOL_UNDECIDED
)
3471 tmp
= Fget (val
, Qeol_type
);
3473 val
= XVECTOR (tmp
)->contents
[coding
->eol_type
];
3475 setup_coding_system (val
, coding
);
3476 /* Set this again because setup_coding_system reset this member. */
3477 coding
->heading_ascii
= skip
;
3480 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
3481 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3482 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3484 How many non-eol characters are at the head is returned as *SKIP. */
3486 #define MAX_EOL_CHECK_COUNT 3
3489 detect_eol_type (source
, src_bytes
, skip
)
3490 unsigned char *source
;
3491 int src_bytes
, *skip
;
3493 unsigned char *src
= source
, *src_end
= src
+ src_bytes
;
3495 int total
= 0; /* How many end-of-lines are found so far. */
3496 int eol_type
= CODING_EOL_UNDECIDED
;
3501 while (src
< src_end
&& total
< MAX_EOL_CHECK_COUNT
)
3504 if (c
== '\n' || c
== '\r')
3507 *skip
= src
- 1 - source
;
3510 this_eol_type
= CODING_EOL_LF
;
3511 else if (src
>= src_end
|| *src
!= '\n')
3512 this_eol_type
= CODING_EOL_CR
;
3514 this_eol_type
= CODING_EOL_CRLF
, src
++;
3516 if (eol_type
== CODING_EOL_UNDECIDED
)
3517 /* This is the first end-of-line. */
3518 eol_type
= this_eol_type
;
3519 else if (eol_type
!= this_eol_type
)
3521 /* The found type is different from what found before. */
3522 eol_type
= CODING_EOL_INCONSISTENT
;
3529 *skip
= src_end
- source
;
3533 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3534 is encoded. If it detects an appropriate format of end-of-line, it
3535 sets the information in *CODING. */
3538 detect_eol (coding
, src
, src_bytes
)
3539 struct coding_system
*coding
;
3545 int eol_type
= detect_eol_type (src
, src_bytes
, &skip
);
3547 if (coding
->heading_ascii
> skip
)
3548 coding
->heading_ascii
= skip
;
3550 skip
= coding
->heading_ascii
;
3552 if (eol_type
== CODING_EOL_UNDECIDED
)
3554 if (eol_type
== CODING_EOL_INCONSISTENT
)
3557 /* This code is suppressed until we find a better way to
3558 distinguish raw text file and binary file. */
3560 /* If we have already detected that the coding is raw-text, the
3561 coding should actually be no-conversion. */
3562 if (coding
->type
== coding_type_raw_text
)
3564 setup_coding_system (Qno_conversion
, coding
);
3567 /* Else, let's decode only text code anyway. */
3569 eol_type
= CODING_EOL_LF
;
3572 val
= Fget (coding
->symbol
, Qeol_type
);
3573 if (VECTORP (val
) && XVECTOR (val
)->size
== 3)
3575 setup_coding_system (XVECTOR (val
)->contents
[eol_type
], coding
);
3576 coding
->heading_ascii
= skip
;
/* Slack added to every conversion buffer size estimate.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which decoding text in CODING may enlarge it, in bytes
   produced per source byte.  CODING is a pointer to a struct
   coding_system; it is evaluated several times.

   NOTE(review): the numeric factors (3, 2, 1 and the default 2) were
   on lines missing from the damaged copy of this file and were
   restored from context; confirm against a pristine copy.  */
#define DECODING_BUFFER_MAG(coding)                                     \
  ((coding)->type == coding_type_iso2022                                \
   ? 3                                                                  \
   : (((coding)->type == coding_type_sjis                               \
       || (coding)->type == coding_type_big5)                           \
      ? 2                                                               \
      : ((coding)->type == coding_type_raw_text                         \
         ? 1                                                            \
         : ((coding)->type == coding_type_ccl                           \
            ? (coding)->spec.ccl.decoder.buf_magnification              \
            : 2))))
3593 /* Return maximum size (bytes) of a buffer enough for decoding
3594 SRC_BYTES of text encoded in CODING. */
3597 decoding_buffer_size (coding
, src_bytes
)
3598 struct coding_system
*coding
;
3601 return (src_bytes
* DECODING_BUFFER_MAG (coding
)
3602 + CONVERSION_BUFFER_EXTRA_ROOM
);
3605 /* Return maximum size (bytes) of a buffer enough for encoding
3606 SRC_BYTES of text to CODING. */
3609 encoding_buffer_size (coding
, src_bytes
)
3610 struct coding_system
*coding
;
3615 if (coding
->type
== coding_type_ccl
)
3616 magnification
= coding
->spec
.ccl
.encoder
.buf_magnification
;
3620 return (src_bytes
* magnification
+ CONVERSION_BUFFER_EXTRA_ROOM
);
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* The shared work buffer handed out by get_conversion_buffer, and
   its current size in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a SIZE bytes of buffer to be used for encoding
   or decoding.  Sufficient memory is allocated automatically: when
   the current buffer is too small, it is replaced by a new one whose
   size is the current size doubled until it reaches SIZE.  The old
   contents are NOT copied to the new buffer.

   The doubling starts from at least MINIMUM_CONVERSION_BUFFER_SIZE,
   so that a current size of zero cannot make the loop spin forever.

   NOTE(review): this comment used to say that NULL is returned when
   we run out of memory; that can only happen if xmalloc itself
   returns NULL -- confirm how xmalloc reports failure.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size * 2;

      if (real_size < MINIMUM_CONVERSION_BUFFER_SIZE)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;
      while (real_size < size) real_size *= 2;
      /* Allocate before freeing, so that the old buffer is not
         released ahead of a successful allocation.  */
      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
3653 ccl_coding_driver (coding
, source
, destination
, src_bytes
, dst_bytes
, encodep
)
3654 struct coding_system
*coding
;
3655 unsigned char *source
, *destination
;
3656 int src_bytes
, dst_bytes
, encodep
;
3658 struct ccl_program
*ccl
3659 = encodep
? &coding
->spec
.ccl
.encoder
: &coding
->spec
.ccl
.decoder
;
3662 ccl
->last_block
= coding
->mode
& CODING_MODE_LAST_BLOCK
;
3664 coding
->produced
= ccl_driver (ccl
, source
, destination
,
3665 src_bytes
, dst_bytes
, &(coding
->consumed
));
3666 coding
->produced_char
3669 : multibyte_chars_in_text (destination
, coding
->produced
));
3670 coding
->consumed_char
3671 = multibyte_chars_in_text (source
, coding
->consumed
);
3673 switch (ccl
->status
)
3675 case CCL_STAT_SUSPEND_BY_SRC
:
3676 result
= CODING_FINISH_INSUFFICIENT_SRC
;
3678 case CCL_STAT_SUSPEND_BY_DST
:
3679 result
= CODING_FINISH_INSUFFICIENT_DST
;
3682 case CCL_STAT_INVALID_CMD
:
3683 result
= CODING_FINISH_INTERRUPT
;
3686 result
= CODING_FINISH_NORMAL
;
3692 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
3693 decoding, it may detect coding system and format of end-of-line if
3694 those are not yet decided. */
3697 decode_coding (coding
, source
, destination
, src_bytes
, dst_bytes
)
3698 struct coding_system
*coding
;
3699 unsigned char *source
, *destination
;
3700 int src_bytes
, dst_bytes
;
3705 && coding
->type
!= coding_type_ccl
3706 && ! (coding
->mode
& CODING_MODE_LAST_BLOCK
3707 && CODING_REQUIRE_FLUSHING (coding
)))
3709 coding
->produced
= coding
->produced_char
= 0;
3710 coding
->consumed
= coding
->consumed_char
= 0;
3711 coding
->fake_multibyte
= 0;
3712 return CODING_FINISH_NORMAL
;
3715 if (coding
->type
== coding_type_undecided
)
3716 detect_coding (coding
, source
, src_bytes
);
3718 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
3719 detect_eol (coding
, source
, src_bytes
);
3721 switch (coding
->type
)
3723 case coding_type_emacs_mule
:
3724 case coding_type_undecided
:
3725 case coding_type_raw_text
:
3726 if (coding
->eol_type
== CODING_EOL_LF
3727 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
3728 goto label_no_conversion
;
3729 result
= decode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
);
3732 case coding_type_sjis
:
3733 result
= decode_coding_sjis_big5 (coding
, source
, destination
,
3734 src_bytes
, dst_bytes
, 1);
3737 case coding_type_iso2022
:
3738 result
= decode_coding_iso2022 (coding
, source
, destination
,
3739 src_bytes
, dst_bytes
);
3742 case coding_type_big5
:
3743 result
= decode_coding_sjis_big5 (coding
, source
, destination
,
3744 src_bytes
, dst_bytes
, 0);
3747 case coding_type_ccl
:
3748 result
= ccl_coding_driver (coding
, source
, destination
,
3749 src_bytes
, dst_bytes
, 0);
3752 default: /* i.e. case coding_type_no_conversion: */
3753 label_no_conversion
:
3754 if (dst_bytes
&& src_bytes
> dst_bytes
)
3756 coding
->produced
= dst_bytes
;
3757 result
= CODING_FINISH_INSUFFICIENT_DST
;
3761 coding
->produced
= src_bytes
;
3762 result
= CODING_FINISH_NORMAL
;
3765 bcopy (source
, destination
, coding
->produced
);
3767 safe_bcopy (source
, destination
, coding
->produced
);
3768 coding
->fake_multibyte
= 1;
3770 = coding
->consumed_char
= coding
->produced_char
= coding
->produced
;
3777 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
3780 encode_coding (coding
, source
, destination
, src_bytes
, dst_bytes
)
3781 struct coding_system
*coding
;
3782 unsigned char *source
, *destination
;
3783 int src_bytes
, dst_bytes
;
3788 && ! (coding
->mode
& CODING_MODE_LAST_BLOCK
3789 && CODING_REQUIRE_FLUSHING (coding
)))
3791 coding
->produced
= coding
->produced_char
= 0;
3792 coding
->consumed
= coding
->consumed_char
= 0;
3793 coding
->fake_multibyte
= 0;
3794 return CODING_FINISH_NORMAL
;
3797 switch (coding
->type
)
3799 case coding_type_emacs_mule
:
3800 case coding_type_undecided
:
3801 case coding_type_raw_text
:
3802 if (coding
->eol_type
== CODING_EOL_LF
3803 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
3804 goto label_no_conversion
;
3805 result
= encode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
);
3808 case coding_type_sjis
:
3809 result
= encode_coding_sjis_big5 (coding
, source
, destination
,
3810 src_bytes
, dst_bytes
, 1);
3813 case coding_type_iso2022
:
3814 result
= encode_coding_iso2022 (coding
, source
, destination
,
3815 src_bytes
, dst_bytes
);
3818 case coding_type_big5
:
3819 result
= encode_coding_sjis_big5 (coding
, source
, destination
,
3820 src_bytes
, dst_bytes
, 0);
3823 case coding_type_ccl
:
3824 result
= ccl_coding_driver (coding
, source
, destination
,
3825 src_bytes
, dst_bytes
, 1);
3828 default: /* i.e. case coding_type_no_conversion: */
3829 label_no_conversion
:
3830 if (dst_bytes
&& src_bytes
> dst_bytes
)
3832 coding
->produced
= dst_bytes
;
3833 result
= CODING_FINISH_INSUFFICIENT_DST
;
3837 coding
->produced
= src_bytes
;
3838 result
= CODING_FINISH_NORMAL
;
3841 bcopy (source
, destination
, coding
->produced
);
3843 safe_bcopy (source
, destination
, coding
->produced
);
3844 if (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
)
3846 unsigned char *p
= destination
, *pend
= p
+ coding
->produced
;
3848 if (*p
++ == '\015') p
[-1] = '\n';
3850 coding
->fake_multibyte
= 1;
3852 = coding
->consumed_char
= coding
->produced_char
= coding
->produced
;
3859 /* Scan text in the region between *BEG and *END (byte positions),
3860 skip characters which we don't have to decode by coding system
3861 CODING at the head and tail, then set *BEG and *END to the region
3862 of the text we actually have to convert. The caller should move
3863 the gap out of the region in advance.
3865 If STR is not NULL, *BEG and *END are indices into STR. */
3868 shrink_decoding_region (beg
, end
, coding
, str
)
3870 struct coding_system
*coding
;
3873 unsigned char *begp_orig
, *begp
, *endp_orig
, *endp
, c
;
3875 Lisp_Object translation_table
;
3877 if (coding
->type
== coding_type_ccl
3878 || coding
->type
== coding_type_undecided
3879 || !NILP (coding
->post_read_conversion
))
3881 /* We can't skip any data. */
3884 else if (coding
->type
== coding_type_no_conversion
)
3886 /* We need no conversion, but don't have to skip any data here.
3887 Decoding routine handles them effectively anyway. */
3891 translation_table
= coding
->translation_table_for_decode
;
3892 if (NILP (translation_table
) && !NILP (Venable_character_translation
))
3893 translation_table
= Vstandard_translation_table_for_decode
;
3894 if (CHAR_TABLE_P (translation_table
))
3897 for (i
= 0; i
< 128; i
++)
3898 if (!NILP (CHAR_TABLE_REF (translation_table
, i
)))
3901 /* Some ASCII character should be tranlsated. We give up
3906 eol_conversion
= (coding
->eol_type
!= CODING_EOL_LF
);
3908 if ((! eol_conversion
) && (coding
->heading_ascii
>= 0))
3909 /* Detection routine has already found how much we can skip at the
3911 *beg
+= coding
->heading_ascii
;
3915 begp_orig
= begp
= str
+ *beg
;
3916 endp_orig
= endp
= str
+ *end
;
3920 begp_orig
= begp
= BYTE_POS_ADDR (*beg
);
3921 endp_orig
= endp
= begp
+ *end
- *beg
;
3924 switch (coding
->type
)
3926 case coding_type_emacs_mule
:
3927 case coding_type_raw_text
:
3930 if (coding
->heading_ascii
< 0)
3931 while (begp
< endp
&& *begp
!= '\r' && *begp
< 0x80) begp
++;
3932 while (begp
< endp
&& endp
[-1] != '\r' && endp
[-1] < 0x80)
3934 /* Do not consider LF as ascii if preceded by CR, since that
3935 confuses eol decoding. */
3936 if (begp
< endp
&& endp
< endp_orig
&& endp
[-1] == '\r' && endp
[0] == '\n')
3943 case coding_type_sjis
:
3944 case coding_type_big5
:
3945 /* We can skip all ASCII characters at the head. */
3946 if (coding
->heading_ascii
< 0)
3949 while (begp
< endp
&& *begp
< 0x80 && *begp
!= '\r') begp
++;
3951 while (begp
< endp
&& *begp
< 0x80) begp
++;
3953 /* We can skip all ASCII characters at the tail except for the
3954 second byte of SJIS or BIG5 code. */
3956 while (begp
< endp
&& endp
[-1] < 0x80 && endp
[-1] != '\r') endp
--;
3958 while (begp
< endp
&& endp
[-1] < 0x80) endp
--;
3959 /* Do not consider LF as ascii if preceded by CR, since that
3960 confuses eol decoding. */
3961 if (begp
< endp
&& endp
< endp_orig
&& endp
[-1] == '\r' && endp
[0] == '\n')
3963 if (begp
< endp
&& endp
< endp_orig
&& endp
[-1] >= 0x80)
3967 default: /* i.e. case coding_type_iso2022: */
3968 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, 0) != CHARSET_ASCII
)
3969 /* We can't skip any data. */
3971 if (coding
->heading_ascii
< 0)
3973 /* We can skip all ASCII characters at the head except for a
3974 few control codes. */
3975 while (begp
< endp
&& (c
= *begp
) < 0x80
3976 && c
!= ISO_CODE_CR
&& c
!= ISO_CODE_SO
3977 && c
!= ISO_CODE_SI
&& c
!= ISO_CODE_ESC
3978 && (!eol_conversion
|| c
!= ISO_CODE_LF
))
3981 switch (coding
->category_idx
)
3983 case CODING_CATEGORY_IDX_ISO_8_1
:
3984 case CODING_CATEGORY_IDX_ISO_8_2
:
3985 /* We can skip all ASCII characters at the tail. */
3987 while (begp
< endp
&& (c
= endp
[-1]) < 0x80 && c
!= '\r') endp
--;
3989 while (begp
< endp
&& endp
[-1] < 0x80) endp
--;
3990 /* Do not consider LF as ascii if preceded by CR, since that
3991 confuses eol decoding. */
3992 if (begp
< endp
&& endp
< endp_orig
&& endp
[-1] == '\r' && endp
[0] == '\n')
3996 case CODING_CATEGORY_IDX_ISO_7
:
3997 case CODING_CATEGORY_IDX_ISO_7_TIGHT
:
3999 /* We can skip all charactes at the tail except for 8-bit
4000 codes and ESC and the following 2-byte at the tail. */
4001 unsigned char *eight_bit
= NULL
;
4005 && (c
= endp
[-1]) != ISO_CODE_ESC
&& c
!= '\r')
4007 if (!eight_bit
&& c
& 0x80) eight_bit
= endp
;
4012 && (c
= endp
[-1]) != ISO_CODE_ESC
)
4014 if (!eight_bit
&& c
& 0x80) eight_bit
= endp
;
4017 /* Do not consider LF as ascii if preceded by CR, since that
4018 confuses eol decoding. */
4019 if (begp
< endp
&& endp
< endp_orig
4020 && endp
[-1] == '\r' && endp
[0] == '\n')
4022 if (begp
< endp
&& endp
[-1] == ISO_CODE_ESC
)
4024 if (endp
+ 1 < endp_orig
&& end
[0] == '(' && end
[1] == 'B')
4025 /* This is an ASCII designation sequence. We can
4026 surely skip the tail. But, if we have
4027 encountered an 8-bit code, skip only the codes
4029 endp
= eight_bit
? eight_bit
: endp
+ 2;
4031 /* Hmmm, we can't skip the tail. */
4039 *beg
+= begp
- begp_orig
;
4040 *end
+= endp
- endp_orig
;
4044 /* Like shrink_decoding_region but for encoding. */
4047 shrink_encoding_region (beg
, end
, coding
, str
)
4049 struct coding_system
*coding
;
4052 unsigned char *begp_orig
, *begp
, *endp_orig
, *endp
;
4054 Lisp_Object translation_table
;
4056 if (coding
->type
== coding_type_ccl
)
4057 /* We can't skip any data. */
4059 else if (coding
->type
== coding_type_no_conversion
)
4061 /* We need no conversion. */
4066 translation_table
= coding
->translation_table_for_encode
;
4067 if (NILP (translation_table
) && !NILP (Venable_character_translation
))
4068 translation_table
= Vstandard_translation_table_for_encode
;
4069 if (CHAR_TABLE_P (translation_table
))
4072 for (i
= 0; i
< 128; i
++)
4073 if (!NILP (CHAR_TABLE_REF (translation_table
, i
)))
4076 /* Some ASCII character should be tranlsated. We give up
4083 begp_orig
= begp
= str
+ *beg
;
4084 endp_orig
= endp
= str
+ *end
;
4088 begp_orig
= begp
= BYTE_POS_ADDR (*beg
);
4089 endp_orig
= endp
= begp
+ *end
- *beg
;
4092 eol_conversion
= (coding
->eol_type
== CODING_EOL_CR
4093 || coding
->eol_type
== CODING_EOL_CRLF
);
4095 /* Here, we don't have to check coding->pre_write_conversion because
4096 the caller is expected to have handled it already. */
4097 switch (coding
->type
)
4099 case coding_type_undecided
:
4100 case coding_type_emacs_mule
:
4101 case coding_type_raw_text
:
4104 while (begp
< endp
&& *begp
!= '\n') begp
++;
4105 while (begp
< endp
&& endp
[-1] != '\n') endp
--;
4111 case coding_type_iso2022
:
4112 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, 0) != CHARSET_ASCII
)
4113 /* We can't skip any data. */
4115 if (coding
->flags
& CODING_FLAG_ISO_DESIGNATE_AT_BOL
)
4117 unsigned char *bol
= begp
;
4118 while (begp
< endp
&& *begp
< 0x80)
4121 if (begp
[-1] == '\n')
4125 goto label_skip_tail
;
4130 /* We can skip all ASCII characters at the head and tail. */
4132 while (begp
< endp
&& *begp
< 0x80 && *begp
!= '\n') begp
++;
4134 while (begp
< endp
&& *begp
< 0x80) begp
++;
4137 while (begp
< endp
&& endp
[-1] < 0x80 && endp
[-1] != '\n') endp
--;
4139 while (begp
< endp
&& *(endp
- 1) < 0x80) endp
--;
4143 *beg
+= begp
- begp_orig
;
4144 *end
+= endp
- endp_orig
;
/* As shrinking conversion region requires some overhead, we don't try
   shrinking if the length of conversion region is less than this
   threshold.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the region *BEG..*END with shrink_encoding_region (if
   ENCODEP is nonzero) or shrink_decoding_region, but only when the
   region is longer than the threshold above.  BEG and END are
   pointers to the positions and are evaluated more than once.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (*(end) - *(beg) > shrink_conversion_region_threshhold)          \
      {                                                                 \
        if (encodep) shrink_encoding_region (beg, end, coding, str);    \
        else shrink_decoding_region (beg, end, coding, str);            \
      }                                                                 \
  } while (0)
4162 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
4163 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4164 coding system CODING, and return the status code of code conversion
4165 (currently, this value has no meaning).
4167 How many characters (and bytes) are converted to how many
4168 characters (and bytes) are recorded in members of the structure
4171 If REPLACE is nonzero, we do various things as if the original text
4172 is deleted and a new text is inserted. See the comments in
4173 replace_range (insdel.c) to know what we are doing. */
4176 code_convert_region (from
, from_byte
, to
, to_byte
, coding
, encodep
, replace
)
4177 int from
, from_byte
, to
, to_byte
, encodep
, replace
;
4178 struct coding_system
*coding
;
4180 int len
= to
- from
, len_byte
= to_byte
- from_byte
;
4181 int require
, inserted
, inserted_byte
;
4182 int head_skip
, tail_skip
, total_skip
;
4183 Lisp_Object saved_coding_symbol
;
4184 int multibyte
= !NILP (current_buffer
->enable_multibyte_characters
);
4186 int fake_multibyte
= 0;
4187 unsigned char *src
, *dst
;
4188 Lisp_Object deletion
;
4189 int orig_point
= PT
, orig_len
= len
;
4193 saved_coding_symbol
= Qnil
;
4195 if (from
< PT
&& PT
< to
)
4197 TEMP_SET_PT_BOTH (from
, from_byte
);
4203 int saved_from
= from
;
4205 prepare_to_modify_buffer (from
, to
, &from
);
4206 if (saved_from
!= from
)
4210 from_byte
= CHAR_TO_BYTE (from
), to_byte
= CHAR_TO_BYTE (to
);
4212 from_byte
= from
, to_byte
= to
;
4213 len_byte
= to_byte
- from_byte
;
4217 if (! encodep
&& CODING_REQUIRE_DETECTION (coding
))
4219 /* We must detect encoding of text and eol format. */
4221 if (from
< GPT
&& to
> GPT
)
4222 move_gap_both (from
, from_byte
);
4223 if (coding
->type
== coding_type_undecided
)
4225 detect_coding (coding
, BYTE_POS_ADDR (from_byte
), len_byte
);
4226 if (coding
->type
== coding_type_undecided
)
4227 /* It seems that the text contains only ASCII, but we
4228 should not left it undecided because the deeper
4229 decoding routine (decode_coding) tries to detect the
4230 encodings again in vain. */
4231 coding
->type
= coding_type_emacs_mule
;
4233 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
4235 saved_coding_symbol
= coding
->symbol
;
4236 detect_eol (coding
, BYTE_POS_ADDR (from_byte
), len_byte
);
4237 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
4238 coding
->eol_type
= CODING_EOL_LF
;
4239 /* We had better recover the original eol format if we
4240 encounter an inconsitent eol format while decoding. */
4241 coding
->mode
|= CODING_MODE_INHIBIT_INCONSISTENT_EOL
;
4245 coding
->consumed_char
= len
, coding
->consumed
= len_byte
;
4248 ? ! CODING_REQUIRE_ENCODING (coding
)
4249 : ! CODING_REQUIRE_DECODING (coding
))
4251 coding
->produced
= len_byte
;
4254 /* See the comment of the member heading_ascii in coding.h. */
4255 && coding
->heading_ascii
< len_byte
)
4257 /* We still may have to combine byte at the head and the
4258 tail of the text in the region. */
4259 if (from
< GPT
&& GPT
< to
)
4260 move_gap_both (to
, to_byte
);
4261 len
= multibyte_chars_in_text (BYTE_POS_ADDR (from_byte
), len_byte
);
4262 adjust_after_insert (from
, from_byte
, to
, to_byte
, len
);
4263 coding
->produced_char
= len
;
4268 adjust_after_insert (from
, from_byte
, to
, to_byte
, len_byte
);
4269 coding
->produced_char
= len_byte
;
4274 /* Now we convert the text. */
4276 /* For encoding, we must process pre-write-conversion in advance. */
4278 && ! NILP (coding
->pre_write_conversion
)
4279 && SYMBOLP (coding
->pre_write_conversion
)
4280 && ! NILP (Ffboundp (coding
->pre_write_conversion
)))
4282 /* The function in pre-write-conversion may put a new text in a
4284 struct buffer
*prev
= current_buffer
;
4287 call2 (coding
->pre_write_conversion
,
4288 make_number (from
), make_number (to
));
4289 if (current_buffer
!= prev
)
4292 new = Fcurrent_buffer ();
4293 set_buffer_internal_1 (prev
);
4294 del_range_2 (from
, from_byte
, to
, to_byte
);
4295 TEMP_SET_PT_BOTH (from
, from_byte
);
4296 insert_from_buffer (XBUFFER (new), 1, len
, 0);
4298 if (orig_point
>= to
)
4299 orig_point
+= len
- orig_len
;
4300 else if (orig_point
> from
)
4304 from_byte
= multibyte
? CHAR_TO_BYTE (from
) : from_byte
;
4305 to_byte
= multibyte
? CHAR_TO_BYTE (to
) : to
;
4306 len_byte
= to_byte
- from_byte
;
4307 TEMP_SET_PT_BOTH (from
, from_byte
);
4312 deletion
= make_buffer_string_both (from
, from_byte
, to
, to_byte
, 1);
4314 /* Try to skip the heading and tailing ASCIIs. */
4316 int from_byte_orig
= from_byte
, to_byte_orig
= to_byte
;
4318 if (from
< GPT
&& GPT
< to
)
4319 move_gap_both (from
, from_byte
);
4320 SHRINK_CONVERSION_REGION (&from_byte
, &to_byte
, coding
, NULL
, encodep
);
4321 if (from_byte
== to_byte
4322 && coding
->type
!= coding_type_ccl
4323 && ! (coding
->mode
& CODING_MODE_LAST_BLOCK
4324 && CODING_REQUIRE_FLUSHING (coding
)))
4326 coding
->produced
= len_byte
;
4327 coding
->produced_char
= multibyte
? len
: len_byte
;
4329 /* We must record and adjust for this new text now. */
4330 adjust_after_insert (from
, from_byte_orig
, to
, to_byte_orig
, len
);
4334 head_skip
= from_byte
- from_byte_orig
;
4335 tail_skip
= to_byte_orig
- to_byte
;
4336 total_skip
= head_skip
+ tail_skip
;
4339 len
-= total_skip
; len_byte
-= total_skip
;
4342 /* The code conversion routine can not preserve text properties for
4343 now. So, we must remove all text properties in the region.
4344 Here, we must suppress all modification hooks. */
4347 int saved_inhibit_modification_hooks
= inhibit_modification_hooks
;
4348 inhibit_modification_hooks
= 1;
4349 Fset_text_properties (make_number (from
), make_number (to
), Qnil
, Qnil
);
4350 inhibit_modification_hooks
= saved_inhibit_modification_hooks
;
4353 /* For converion, we must put the gap before the text in addition to
4354 making the gap larger for efficient decoding. The required gap
4355 size starts from 2000 which is the magic number used in make_gap.
4356 But, after one batch of conversion, it will be incremented if we
4357 find that it is not enough . */
4360 if (GAP_SIZE
< require
)
4361 make_gap (require
- GAP_SIZE
);
4362 move_gap_both (from
, from_byte
);
4364 inserted
= inserted_byte
= 0;
4365 src
= GAP_END_ADDR
, dst
= GPT_ADDR
;
4367 GAP_SIZE
+= len_byte
;
4370 ZV_BYTE
-= len_byte
;
4373 if (GPT
- BEG
< beg_unchanged
)
4374 beg_unchanged
= GPT
- BEG
;
4375 if (Z
- GPT
< end_unchanged
)
4376 end_unchanged
= Z
- GPT
;
4382 /* The buffer memory is changed from:
4383 +--------+converted-text+---------+-------original-text------+---+
4384 |<-from->|<--inserted-->|---------|<-----------len---------->|---|
4385 |<------------------- GAP_SIZE -------------------->| */
4387 result
= encode_coding (coding
, src
, dst
, len_byte
, 0);
4389 result
= decode_coding (coding
, src
, dst
, len_byte
, 0);
4391 +--------+-------converted-text--------+--+---original-text--+---+
4392 |<-from->|<--inserted-->|<--produced-->|--|<-(len-consumed)->|---|
4393 |<------------------- GAP_SIZE -------------------->| */
4394 if (coding
->fake_multibyte
)
4397 if (!encodep
&& !multibyte
)
4398 coding
->produced_char
= coding
->produced
;
4399 inserted
+= coding
->produced_char
;
4400 inserted_byte
+= coding
->produced
;
4401 len_byte
-= coding
->consumed
;
4402 src
+= coding
->consumed
;
4403 dst
+= inserted_byte
;
4405 if (result
== CODING_FINISH_NORMAL
)
4410 if (! encodep
&& result
== CODING_FINISH_INCONSISTENT_EOL
)
4412 unsigned char *pend
= dst
, *p
= pend
- inserted_byte
;
4414 /* Encode LFs back to the original eol format (CR or CRLF). */
4415 if (coding
->eol_type
== CODING_EOL_CR
)
4417 while (p
< pend
) if (*p
++ == '\n') p
[-1] = '\r';
4423 while (p
< pend
) if (*p
++ == '\n') count
++;
4424 if (src
- dst
< count
)
4426 /* We don't have sufficient room for putting LFs
4427 back to CRLF. We must record converted and
4428 not-yet-converted text back to the buffer
4429 content, enlarge the gap, then record them out of
4430 the buffer contents again. */
4431 int add
= len_byte
+ inserted_byte
;
4434 ZV
+= add
; Z
+= add
; ZV_BYTE
+= add
; Z_BYTE
+= add
;
4435 GPT
+= inserted_byte
; GPT_BYTE
+= inserted_byte
;
4436 make_gap (count
- GAP_SIZE
);
4438 ZV
-= add
; Z
-= add
; ZV_BYTE
-= add
; Z_BYTE
-= add
;
4439 GPT
-= inserted_byte
; GPT_BYTE
-= inserted_byte
;
4440 /* Don't forget to update SRC, DST, and PEND. */
4441 src
= GAP_END_ADDR
- len_byte
;
4442 dst
= GPT_ADDR
+ inserted_byte
;
4446 inserted_byte
+= count
;
4447 coding
->produced
+= count
;
4448 p
= dst
= pend
+ count
;
4452 if (*p
== '\n') count
--, *--p
= '\r';
4456 /* Suppress eol-format conversion in the further conversion. */
4457 coding
->eol_type
= CODING_EOL_LF
;
4459 /* Restore the original symbol. */
4460 coding
->symbol
= saved_coding_symbol
;
4466 if (coding
->type
!= coding_type_ccl
4467 || coding
->mode
& CODING_MODE_LAST_BLOCK
)
4469 coding
->mode
|= CODING_MODE_LAST_BLOCK
;
4472 if (result
== CODING_FINISH_INSUFFICIENT_SRC
)
4474 /* The source text ends in invalid codes. Let's just
4475 make them valid buffer contents, and finish conversion. */
4476 inserted
+= len_byte
;
4477 inserted_byte
+= len_byte
;
4483 if (result
== CODING_FINISH_INTERRUPT
)
4485 /* The conversion procedure was interrupted by a user. */
4489 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */
4490 if (coding
->consumed
< 1)
4492 /* It's quite strange to require more memory without
4493 consuming any bytes. Perhaps CCL program bug. */
4499 /* We have just done the first batch of conversion which was
4500 stoped because of insufficient gap. Let's reconsider the
4501 required gap size (i.e. SRT - DST) now.
4503 We have converted ORIG bytes (== coding->consumed) into
4504 NEW bytes (coding->produced). To convert the remaining
4505 LEN bytes, we may need REQUIRE bytes of gap, where:
4506 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
4507 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
4508 Here, we are sure that NEW >= ORIG. */
4509 float ratio
= coding
->produced
- coding
->consumed
;
4510 ratio
/= coding
->consumed
;
4511 require
= len_byte
* ratio
;
4514 if ((src
- dst
) < (require
+ 2000))
4516 /* See the comment above the previous call of make_gap. */
4517 int add
= len_byte
+ inserted_byte
;
4520 ZV
+= add
; Z
+= add
; ZV_BYTE
+= add
; Z_BYTE
+= add
;
4521 GPT
+= inserted_byte
; GPT_BYTE
+= inserted_byte
;
4522 make_gap (require
+ 2000);
4524 ZV
-= add
; Z
-= add
; ZV_BYTE
-= add
; Z_BYTE
-= add
;
4525 GPT
-= inserted_byte
; GPT_BYTE
-= inserted_byte
;
4526 /* Don't forget to update SRC, DST. */
4527 src
= GAP_END_ADDR
- len_byte
;
4528 dst
= GPT_ADDR
+ inserted_byte
;
4531 if (src
- dst
> 0) *dst
= 0; /* Put an anchor. */
4536 || (to
- from
) != (to_byte
- from_byte
)))
4537 inserted
= multibyte_chars_in_text (GPT_ADDR
, inserted_byte
);
4539 /* If we have shrinked the conversion area, adjust it now. */
4543 safe_bcopy (GAP_END_ADDR
, GPT_ADDR
+ inserted_byte
, tail_skip
);
4544 inserted
+= total_skip
; inserted_byte
+= total_skip
;
4545 GAP_SIZE
+= total_skip
;
4546 GPT
-= head_skip
; GPT_BYTE
-= head_skip
;
4547 ZV
-= total_skip
; ZV_BYTE
-= total_skip
;
4548 Z
-= total_skip
; Z_BYTE
-= total_skip
;
4549 from
-= head_skip
; from_byte
-= head_skip
;
4550 to
+= tail_skip
; to_byte
+= tail_skip
;
4554 adjust_after_replace (from
, from_byte
, deletion
, inserted
, inserted_byte
);
4555 inserted
= Z
- prev_Z
;
4557 if (! encodep
&& ! NILP (coding
->post_read_conversion
))
4562 TEMP_SET_PT_BOTH (from
, from_byte
);
4564 val
= call1 (coding
->post_read_conversion
, make_number (inserted
));
4565 CHECK_NUMBER (val
, 0);
4566 inserted
+= Z
- prev_Z
;
4569 if (orig_point
>= from
)
4571 if (orig_point
>= from
+ orig_len
)
4572 orig_point
+= inserted
- orig_len
;
4575 TEMP_SET_PT (orig_point
);
4578 signal_after_change (from
, to
- from
, inserted
);
4581 coding
->consumed
= to_byte
- from_byte
;
4582 coding
->consumed_char
= to
- from
;
4583 coding
->produced
= inserted_byte
;
4584 coding
->produced_char
= inserted
;
/* Encode (ENCODEP nonzero) or decode (ENCODEP zero) the string STR
   according to CODING and return the resulting string.

   If CODING has a pre-write-conversion function (when encoding) or a
   post-read-conversion function (when decoding), STR is inserted into
   a temporary work buffer and converted there by code_convert_region,
   and the buffer contents are returned as a string.

   Otherwise the conversion is done through a conversion buffer.  On
   the early-exit path taken after shrinking the conversion region,
   STR itself is returned when NOCOPY is nonzero and a copy of it
   when NOCOPY is zero.  */
4591 code_convert_string (str
, coding
, encodep
, nocopy
)
4593 struct coding_system
*coding
;
4594 int encodep
, nocopy
;
4598 int from
= 0, to
= XSTRING (str
)->size
;
4599 int to_byte
= STRING_BYTES (XSTRING (str
));
4600 struct gcpro gcpro1
;
4601 Lisp_Object saved_coding_symbol
;
4604 saved_coding_symbol
= Qnil
;
4605 if (encodep
&& !NILP (coding
->pre_write_conversion
)
4606 || !encodep
&& !NILP (coding
->post_read_conversion
))
4608 /* Since we have to call Lisp functions which assume target text
4609 is in a buffer, after setting a temporary buffer, call
4610 code_convert_region. */
4611 int count
= specpdl_ptr
- specpdl
;
4612 struct buffer
*prev
= current_buffer
;
4614 record_unwind_protect (Fset_buffer
, Fcurrent_buffer ());
4615 temp_output_buffer_setup (" *code-converting-work*");
4616 set_buffer_internal (XBUFFER (Vstandard_output
));
4618 insert_from_string (str
, 0, 0, to
, to_byte
, 0);
4621 /* We must insert the contents of STR as is without
4622 unibyte<->multibyte conversion. */
4623 current_buffer
->enable_multibyte_characters
= Qnil
;
4624 insert_from_string (str
, 0, 0, to_byte
, to_byte
, 0);
4625 current_buffer
->enable_multibyte_characters
= Qt
;
4627 code_convert_region (BEGV
, BEGV_BYTE
, ZV
, ZV_BYTE
, coding
, encodep
, 1);
4629 /* We must return the buffer contents as unibyte string. */
4630 current_buffer
->enable_multibyte_characters
= Qnil
;
4631 str
= make_buffer_string (BEGV
, ZV
, 0);
4632 set_buffer_internal (prev
);
4633 return unbind_to (count
, str
);
4636 if (! encodep
&& CODING_REQUIRE_DETECTION (coding
))
4638 /* See the comments in code_convert_region. */
4639 if (coding
->type
== coding_type_undecided
)
4641 detect_coding (coding
, XSTRING (str
)->data
, to_byte
);
4642 if (coding
->type
== coding_type_undecided
)
4643 coding
->type
= coding_type_emacs_mule
;
4645 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
4647 saved_coding_symbol
= coding
->symbol
;
4648 detect_eol (coding
, XSTRING (str
)->data
, to_byte
);
4649 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
4650 coding
->eol_type
= CODING_EOL_LF
;
4651 /* We had better recover the original eol format if we
4652 encounter an inconsistent eol format while decoding. */
4653 coding
->mode
|= CODING_MODE_INHIBIT_INCONSISTENT_EOL
;
4658 ? ! CODING_REQUIRE_ENCODING (coding
)
4659 : ! CODING_REQUIRE_DECODING (coding
))
4663 /* Try to skip the leading and trailing ASCII characters. */
4664 SHRINK_CONVERSION_REGION (&from
, &to_byte
, coding
, XSTRING (str
)->data
,
4668 && coding
->type
!= coding_type_ccl
)
4669 return (nocopy
? str
: Fcopy_sequence (str
));
4672 len
= encoding_buffer_size (coding
, to_byte
- from
);
4674 len
= decoding_buffer_size (coding
, to_byte
- from
);
4675 len
+= from
+ STRING_BYTES (XSTRING (str
)) - to_byte
;
4677 buf
= get_conversion_buffer (len
);
4681 bcopy (XSTRING (str
)->data
, buf
, from
);
4683 ? encode_coding (coding
, XSTRING (str
)->data
+ from
,
4684 buf
+ from
, to_byte
- from
, len
)
4685 : decode_coding (coding
, XSTRING (str
)->data
+ from
,
4686 buf
+ from
, to_byte
- from
, len
));
4687 if (! encodep
&& result
== CODING_FINISH_INCONSISTENT_EOL
)
4689 /* We simply try to decode the whole string again but without
4690 eol-conversion this time. */
4691 coding
->eol_type
= CODING_EOL_LF
;
4692 coding
->symbol
= saved_coding_symbol
;
4693 return code_convert_string (str
, coding
, encodep
, nocopy
);
4696 bcopy (XSTRING (str
)->data
+ to_byte
, buf
+ from
+ coding
->produced
,
4697 STRING_BYTES (XSTRING (str
)) - to_byte
);
4699 len
= from
+ STRING_BYTES (XSTRING (str
)) - to_byte
;
4701 str
= make_unibyte_string (buf
, len
+ coding
->produced
);
4704 int chars
= (coding
->fake_multibyte
4705 ? multibyte_chars_in_text (buf
+ from
, coding
->produced
)
4706 : coding
->produced_char
);
4707 str
= make_multibyte_string (buf
, len
+ chars
, len
+ coding
->produced
);
4715 /*** 8. Emacs Lisp library functions ***/
4717 DEFUN ("coding-system-p", Fcoding_system_p
, Scoding_system_p
, 1, 1, 0,
4718 "Return t if OBJECT is nil or a coding-system.\n\
4719 See the documentation of `make-coding-system' for information\n\
4720 about coding-system objects.")
4728 /* Get coding-spec vector for OBJ. */
4729 obj
= Fget (obj
, Qcoding_system
);
4730 return ((VECTORP (obj
) && XVECTOR (obj
)->size
== 5)
4734 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system
,
4735 Sread_non_nil_coding_system
, 1, 1, 0,
4736 "Read a coding system from the minibuffer, prompting with string PROMPT.")
4743 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
4744 Qt
, Qnil
, Qcoding_system_history
, Qnil
, Qnil
);
4746 while (XSTRING (val
)->size
== 0);
4747 return (Fintern (val
, Qnil
));
4750 DEFUN ("read-coding-system", Fread_coding_system
, Sread_coding_system
, 1, 2, 0,
4751 "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
4752 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
4753 (prompt
, default_coding_system
)
4754 Lisp_Object prompt
, default_coding_system
;
4757 if (SYMBOLP (default_coding_system
))
4758 XSETSTRING (default_coding_system
, XSYMBOL (default_coding_system
)->name
);
4759 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
4760 Qt
, Qnil
, Qcoding_system_history
,
4761 default_coding_system
, Qnil
);
4762 return (XSTRING (val
)->size
== 0 ? Qnil
: Fintern (val
, Qnil
));
4765 DEFUN ("check-coding-system", Fcheck_coding_system
, Scheck_coding_system
,
4767 "Check validity of CODING-SYSTEM.\n\
4768 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
4769 It is valid if it is a symbol with a non-nil `coding-system' property.\n\
4770 The value of property should be a vector of length 5.")
4772 Lisp_Object coding_system
;
4774 CHECK_SYMBOL (coding_system
, 0);
4775 if (!NILP (Fcoding_system_p (coding_system
)))
4776 return coding_system
;
4778 Fsignal (Qcoding_system_error
, Fcons (coding_system
, Qnil
));
/* Detect how the SRC_BYTES bytes starting at SRC are encoded.

   The set of candidate coding categories is computed by
   detect_coding_mask and the end-of-line format by detect_eol_type;
   an inconsistent end-of-line format is treated as undecided.

   The candidates are collected by walking Vcoding_category_list.
   When an end-of-line format was detected, each collected coding
   system is replaced by the entry for that format taken from its
   `eol-type' property.

   Return the list of detected coding systems, or only its first
   element if HIGHEST is nonzero.  */
4782 detect_coding_system (src
, src_bytes
, highest
)
4784 int src_bytes
, highest
;
4786 int coding_mask
, eol_type
;
4787 Lisp_Object val
, tmp
;
4790 coding_mask
= detect_coding_mask (src
, src_bytes
, NULL
, &dummy
);
4791 eol_type
= detect_eol_type (src
, src_bytes
, &dummy
);
4792 if (eol_type
== CODING_EOL_INCONSISTENT
)
4793 eol_type
= CODING_EOL_UNDECIDED
;
4798 if (eol_type
!= CODING_EOL_UNDECIDED
)
4801 val2
= Fget (Qundecided
, Qeol_type
);
4803 val
= XVECTOR (val2
)->contents
[eol_type
];
4805 return (highest
? val
: Fcons (val
, Qnil
));
4808 /* At first, gather possible coding systems in VAL. */
4810 for (tmp
= Vcoding_category_list
; !NILP (tmp
); tmp
= XCONS (tmp
)->cdr
)
4813 = XFASTINT (Fget (XCONS (tmp
)->car
, Qcoding_category_index
));
4814 if (coding_mask
& (1 << idx
))
4816 val
= Fcons (Fsymbol_value (XCONS (tmp
)->car
), val
);
4822 val
= Fnreverse (val
);
4824 /* Then, replace the elements with subsidiary coding systems. */
4825 for (tmp
= val
; !NILP (tmp
); tmp
= XCONS (tmp
)->cdr
)
4827 if (eol_type
!= CODING_EOL_UNDECIDED
4828 && eol_type
!= CODING_EOL_INCONSISTENT
)
4831 eol
= Fget (XCONS (tmp
)->car
, Qeol_type
);
4833 XCONS (tmp
)->car
= XVECTOR (eol
)->contents
[eol_type
];
4836 return (highest
? XCONS (val
)->car
: val
);
4839 DEFUN ("detect-coding-region", Fdetect_coding_region
, Sdetect_coding_region
,
4841 "Detect coding system of the text in the region between START and END.\n\
4842 Return a list of possible coding systems ordered by priority.\n\
4844 If only ASCII characters are found, it returns a list of single element\n\
4845 `undecided' or its subsidiary coding system according to a detected\n\
4846 end-of-line format.\n\
4848 If optional argument HIGHEST is non-nil, return the coding system of\n\
4850 (start
, end
, highest
)
4851 Lisp_Object start
, end
, highest
;
4854 int from_byte
, to_byte
;
4856 CHECK_NUMBER_COERCE_MARKER (start
, 0);
4857 CHECK_NUMBER_COERCE_MARKER (end
, 1);
4859 validate_region (&start
, &end
);
4860 from
= XINT (start
), to
= XINT (end
);
4861 from_byte
= CHAR_TO_BYTE (from
);
4862 to_byte
= CHAR_TO_BYTE (to
);
4864 if (from
< GPT
&& to
>= GPT
)
4865 move_gap_both (to
, to_byte
);
4867 return detect_coding_system (BYTE_POS_ADDR (from_byte
),
4868 to_byte
- from_byte
,
4872 DEFUN ("detect-coding-string", Fdetect_coding_string
, Sdetect_coding_string
,
4874 "Detect coding system of the text in STRING.\n\
4875 Return a list of possible coding systems ordered by priority.\n\
4877 If only ASCII characters are found, it returns a list of single element\n\
4878 `undecided' or its subsidiary coding system according to a detected\n\
4879 end-of-line format.\n\
4881 If optional argument HIGHEST is non-nil, return the coding system of\n\
4884 Lisp_Object string
, highest
;
4886 CHECK_STRING (string
, 0);
4888 return detect_coding_system (XSTRING (string
)->data
,
4889 STRING_BYTES (XSTRING (string
)),
/* Common body of `decode-coding-region' (called with ENCODEP 0) and
   `encode-coding-region' (called with ENCODEP 1).

   START and END must be numbers or markers and are validated as a
   region; CODING_SYSTEM must be a symbol.  If CODING_SYSTEM is nil,
   nothing is converted and the length of the region in characters is
   returned.  Otherwise the region is converted by code_convert_region
   with CODING_MODE_LAST_BLOCK set, Vlast_coding_system_used is set
   from the symbol of the coding structure afterwards, and the number
   of characters produced is returned.

   Signals an error if setup_coding_system rejects CODING_SYSTEM.  */
4894 code_convert_region1 (start
, end
, coding_system
, encodep
)
4895 Lisp_Object start
, end
, coding_system
;
4898 struct coding_system coding
;
4901 CHECK_NUMBER_COERCE_MARKER (start
, 0);
4902 CHECK_NUMBER_COERCE_MARKER (end
, 1);
4903 CHECK_SYMBOL (coding_system
, 2);
4905 validate_region (&start
, &end
);
4906 from
= XFASTINT (start
);
4907 to
= XFASTINT (end
);
4909 if (NILP (coding_system
))
4910 return make_number (to
- from
);
4912 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
4913 error ("Invalid coding system: %s", XSYMBOL (coding_system
)->name
->data
);
4915 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
4916 code_convert_region (from
, CHAR_TO_BYTE (from
), to
, CHAR_TO_BYTE (to
),
4917 &coding
, encodep
, 1);
4918 Vlast_coding_system_used
= coding
.symbol
;
4919 return make_number (coding
.produced_char
);
4922 DEFUN ("decode-coding-region", Fdecode_coding_region
, Sdecode_coding_region
,
4923 3, 3, "r\nzCoding system: ",
4924 "Decode the current region by specified coding system.\n\
4925 When called from a program, takes three arguments:\n\
4926 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
4927 This function sets `last-coding-system-used' to the precise coding system\n\
4928 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
4929 not fully specified.)\n\
4930 It returns the length of the decoded text.")
4931 (start
, end
, coding_system
)
4932 Lisp_Object start
, end
, coding_system
;
4934 return code_convert_region1 (start
, end
, coding_system
, 0);
4937 DEFUN ("encode-coding-region", Fencode_coding_region
, Sencode_coding_region
,
4938 3, 3, "r\nzCoding system: ",
4939 "Encode the current region by specified coding system.\n\
4940 When called from a program, takes three arguments:\n\
4941 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
4942 This function sets `last-coding-system-used' to the precise coding system\n\
4943 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
4944 not fully specified.)\n\
4945 It returns the length of the encoded text.")
4946 (start
, end
, coding_system
)
4947 Lisp_Object start
, end
, coding_system
;
4949 return code_convert_region1 (start
, end
, coding_system
, 1);
/* Common body of `decode-coding-string' (called with ENCODEP 0) and
   `encode-coding-string' (called with ENCODEP 1).

   STRING must be a string and CODING_SYSTEM a symbol.  If
   CODING_SYSTEM is nil, return STRING itself when NOCOPY is non-nil
   and a copy of it otherwise.  Otherwise convert STRING by
   code_convert_string with CODING_MODE_LAST_BLOCK set, record the
   symbol of the coding structure in Vlast_coding_system_used, and
   return the converted string.

   Signals an error if setup_coding_system rejects CODING_SYSTEM.  */
4953 code_convert_string1 (string
, coding_system
, nocopy
, encodep
)
4954 Lisp_Object string
, coding_system
, nocopy
;
4957 struct coding_system coding
;
4959 CHECK_STRING (string
, 0);
4960 CHECK_SYMBOL (coding_system
, 1);
4962 if (NILP (coding_system
))
4963 return (NILP (nocopy
) ? Fcopy_sequence (string
) : string
);
4965 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
4966 error ("Invalid coding system: %s", XSYMBOL (coding_system
)->name
->data
);
4968 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
4969 Vlast_coding_system_used
= coding
.symbol
;
4970 return code_convert_string (string
, &coding
, encodep
, !NILP (nocopy
));
4973 DEFUN ("decode-coding-string", Fdecode_coding_string
, Sdecode_coding_string
,
4975 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
4976 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
4977 if the decoding operation is trivial.\n\
4978 This function sets `last-coding-system-used' to the precise coding system\n\
4979 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
4980 not fully specified.)")
4981 (string
, coding_system
, nocopy
)
4982 Lisp_Object string
, coding_system
, nocopy
;
4984 return code_convert_string1 (string
, coding_system
, nocopy
, 0);
4987 DEFUN ("encode-coding-string", Fencode_coding_string
, Sencode_coding_string
,
4989 "Encode STRING to CODING-SYSTEM, and return the result.\n\
4990 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
4991 if the encoding operation is trivial.\n\
4992 This function sets `last-coding-system-used' to the precise coding system\n\
4993 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
4994 not fully specified.)")
4995 (string
, coding_system
, nocopy
)
4996 Lisp_Object string
, coding_system
, nocopy
;
4998 return code_convert_string1 (string
, coding_system
, nocopy
, 1);
5001 /* Encode or decode STRING according to CODING_SYSTEM.
5002 Do not set Vlast_coding_system_used. */
5005 code_convert_string_norecord (string
, coding_system
, encodep
)
5006 Lisp_Object string
, coding_system
;
5009 struct coding_system coding
;
5011 CHECK_STRING (string
, 0);
5012 CHECK_SYMBOL (coding_system
, 1);
5014 if (NILP (coding_system
))
5017 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
5018 error ("Invalid coding system: %s", XSYMBOL (coding_system
)->name
->data
);
5020 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
/* NOCOPY is declared `int' in code_convert_string, so pass a plain
   nonzero flag instead of the Lisp object Qt.  This still allows
   STRING itself to be returned when no conversion is needed.  */
5021 return code_convert_string (string, &coding, encodep, 1);
5024 DEFUN ("decode-sjis-char", Fdecode_sjis_char
, Sdecode_sjis_char
, 1, 1, 0,
5025 "Decode a Japanese character which has CODE in shift_jis encoding.\n\
5026 Return the corresponding character.")
5030 unsigned char c1
, c2
, s1
, s2
;
5033 CHECK_NUMBER (code
, 0);
5034 s1
= (XFASTINT (code
)) >> 8, s2
= (XFASTINT (code
)) & 0xFF;
5038 XSETFASTINT (val
, s2
);
/* Single-byte katakana (JISX0201) occupies 0xA0..0xDF.  Both bounds
   must hold: with `||' the test was true for every byte, so the
   "Invalid Shift JIS code" error below could never be reached.  */
5039 else if (s2 >= 0xA0 && s2 <= 0xDF)
5041 MAKE_NON_ASCII_CHAR (charset_katakana_jisx0201
, s2
, 0));
5043 error ("Invalid Shift JIS code: %x", XFASTINT (code
));
5047 if ((s1
< 0x80 || s1
> 0x9F && s1
< 0xE0 || s1
> 0xEF)
5048 || (s2
< 0x40 || s2
== 0x7F || s2
> 0xFC))
5049 error ("Invalid Shift JIS code: %x", XFASTINT (code
));
5050 DECODE_SJIS (s1
, s2
, c1
, c2
);
5051 XSETFASTINT (val
, MAKE_NON_ASCII_CHAR (charset_jisx0208
, c1
, c2
));
5056 DEFUN ("encode-sjis-char", Fencode_sjis_char
, Sencode_sjis_char
, 1, 1, 0,
5057 "Encode a Japanese character CHAR to shift_jis encoding.\n\
5058 Return the corresponding code in SJIS.")
5062 int charset
, c1
, c2
, s1
, s2
;
5065 CHECK_NUMBER (ch
, 0);
5066 SPLIT_CHAR (XFASTINT (ch
), charset
, c1
, c2
);
5067 if (charset
== CHARSET_ASCII
)
5071 else if (charset
== charset_jisx0208
5072 && c1
> 0x20 && c1
< 0x7F && c2
> 0x20 && c2
< 0x7F)
5074 ENCODE_SJIS (c1
, c2
, s1
, s2
);
5075 XSETFASTINT (val
, (s1
<< 8) | s2
);
5077 else if (charset
== charset_katakana_jisx0201
5078 && c1
> 0x20 && c2
< 0xE0)
5080 XSETFASTINT (val
, c1
| 0x80);
5083 error ("Can't encode to shift_jis: %d", XFASTINT (ch
));
5087 DEFUN ("decode-big5-char", Fdecode_big5_char
, Sdecode_big5_char
, 1, 1, 0,
5088 "Decode a Big5 character which has CODE in BIG5 coding system.\n\
5089 Return the corresponding character.")
5094 unsigned char b1
, b2
, c1
, c2
;
5097 CHECK_NUMBER (code
, 0);
5098 b1
= (XFASTINT (code
)) >> 8, b2
= (XFASTINT (code
)) & 0xFF;
5102 error ("Invalid BIG5 code: %x", XFASTINT (code
));
5107 if ((b1
< 0xA1 || b1
> 0xFE)
5108 || (b2
< 0x40 || (b2
> 0x7E && b2
< 0xA1) || b2
> 0xFE))
5109 error ("Invalid BIG5 code: %x", XFASTINT (code
));
5110 DECODE_BIG5 (b1
, b2
, charset
, c1
, c2
);
5111 XSETFASTINT (val
, MAKE_NON_ASCII_CHAR (charset
, c1
, c2
));
5116 DEFUN ("encode-big5-char", Fencode_big5_char
, Sencode_big5_char
, 1, 1, 0,
5117 "Encode the Big5 character CHAR to BIG5 coding system.\n\
5118 Return the corresponding character code in Big5.")
5122 int charset
, c1
, c2
, b1
, b2
;
5125 CHECK_NUMBER (ch
, 0);
5126 SPLIT_CHAR (XFASTINT (ch
), charset
, c1
, c2
);
5127 if (charset
== CHARSET_ASCII
)
5131 else if ((charset
== charset_big5_1
5132 && (XFASTINT (ch
) >= 0x250a1 && XFASTINT (ch
) <= 0x271ec))
5133 || (charset
== charset_big5_2
5134 && XFASTINT (ch
) >= 0x290a1 && XFASTINT (ch
) <= 0x2bdb2))
5136 ENCODE_BIG5 (charset
, c1
, c2
, b1
, b2
);
5137 XSETFASTINT (val
, (b1
<< 8) | b2
);
5140 error ("Can't encode to Big5: %d", XFASTINT (ch
));
5144 DEFUN ("set-terminal-coding-system-internal",
5145 Fset_terminal_coding_system_internal
,
5146 Sset_terminal_coding_system_internal
, 1, 1, 0, "")
5148 Lisp_Object coding_system
;
5150 CHECK_SYMBOL (coding_system
, 0);
5151 setup_coding_system (Fcheck_coding_system (coding_system
), &terminal_coding
);
5152 /* We had better not send unsafe characters to terminal. */
5153 terminal_coding
.flags
|= CODING_FLAG_ISO_SAFE
;
5158 DEFUN ("set-safe-terminal-coding-system-internal",
5159 Fset_safe_terminal_coding_system_internal
,
5160 Sset_safe_terminal_coding_system_internal
, 1, 1, 0, "")
5162 Lisp_Object coding_system
;
5164 CHECK_SYMBOL (coding_system
, 0);
5165 setup_coding_system (Fcheck_coding_system (coding_system
),
5166 &safe_terminal_coding
);
5170 DEFUN ("terminal-coding-system",
5171 Fterminal_coding_system
, Sterminal_coding_system
, 0, 0, 0,
5172 "Return coding system specified for terminal output.")
5175 return terminal_coding
.symbol
;
5178 DEFUN ("set-keyboard-coding-system-internal",
5179 Fset_keyboard_coding_system_internal
,
5180 Sset_keyboard_coding_system_internal
, 1, 1, 0, "")
5182 Lisp_Object coding_system
;
5184 CHECK_SYMBOL (coding_system
, 0);
5185 setup_coding_system (Fcheck_coding_system (coding_system
), &keyboard_coding
);
5189 DEFUN ("keyboard-coding-system",
5190 Fkeyboard_coding_system
, Skeyboard_coding_system
, 0, 0, 0,
5191 "Return coding system specified for decoding keyboard input.")
5194 return keyboard_coding
.symbol
;
5198 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system
,
5199 Sfind_operation_coding_system
, 1, MANY
, 0,
5200 "Choose a coding system for an operation based on the target name.\n\
5201 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
5202 DECODING-SYSTEM is the coding system to use for decoding\n\
5203 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
5204 for encoding (in case OPERATION does encoding).\n\
5206 The first argument OPERATION specifies an I/O primitive:\n\
5207 For file I/O, `insert-file-contents' or `write-region'.\n\
5208 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
5209 For network I/O, `open-network-stream'.\n\
5211 The remaining arguments should be the same arguments that were passed\n\
5212 to the primitive. Depending on which primitive, one of those arguments\n\
5213 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
5214 whichever argument specifies the file name is TARGET.\n\
5216 TARGET has a meaning which depends on OPERATION:\n\
5217 For file I/O, TARGET is a file name.\n\
5218 For process I/O, TARGET is a process name.\n\
5219 For network I/O, TARGET is a service name or a port number\n\
5221 This function looks up what specified for TARGET in,\n\
5222 `file-coding-system-alist', `process-coding-system-alist',\n\
5223 or `network-coding-system-alist' depending on OPERATION.\n\
5224 They may specify a coding system, a cons of coding systems,\n\
5225 or a function symbol to call.\n\
5226 In the last case, we call the function with one argument,\n\
5227 which is a list of all the arguments given to this function.")
5232 Lisp_Object operation
, target_idx
, target
, val
;
5233 register Lisp_Object chain
;
5236 error ("Too few arguments");
5237 operation
= args
[0];
/* OPERATION must be a symbol whose `target-idx' property is a
   non-negative integer: the zero-based position of TARGET among the
   arguments that follow OPERATION.  */
5238 if (!SYMBOLP (operation)
5239 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx))
|| XINT (target_idx) < 0)
5240 error ("Invalid first argument");
/* TARGET is read from args[XINT (target_idx) + 1] just below, so that
   index must be strictly less than NARGS.  The previous test used
   `<', which let NARGS == 1 + target_idx through and read one element
   past the end of ARGS.  */
5241 if (nargs <= 1 + XINT (target_idx))
5242 error ("Too few arguments for operation: %s",
5243 XSYMBOL (operation)->name->data);
5244 target
= args
[XINT (target_idx
) + 1];
5245 if (!(STRINGP (target
)
5246 || (EQ (operation
, Qopen_network_stream
) && INTEGERP (target
))))
5247 error ("Invalid %dth argument", XINT (target_idx
) + 1);
5249 chain
= ((EQ (operation
, Qinsert_file_contents
)
5250 || EQ (operation
, Qwrite_region
))
5251 ? Vfile_coding_system_alist
5252 : (EQ (operation
, Qopen_network_stream
)
5253 ? Vnetwork_coding_system_alist
5254 : Vprocess_coding_system_alist
));
5258 for (; CONSP (chain
); chain
= XCONS (chain
)->cdr
)
5261 elt
= XCONS (chain
)->car
;
5264 && ((STRINGP (target
)
5265 && STRINGP (XCONS (elt
)->car
)
5266 && fast_string_match (XCONS (elt
)->car
, target
) >= 0)
5267 || (INTEGERP (target
) && EQ (target
, XCONS (elt
)->car
))))
5269 val
= XCONS (elt
)->cdr
;
5270 /* Here, if VAL is both a valid coding system and a valid
5271 function symbol, we return VAL as a coding system. */
5274 if (! SYMBOLP (val
))
5276 if (! NILP (Fcoding_system_p (val
)))
5277 return Fcons (val
, val
);
5278 if (! NILP (Ffboundp (val
)))
5280 val
= call1 (val
, Flist (nargs
, args
));
5283 if (SYMBOLP (val
) && ! NILP (Fcoding_system_p (val
)))
5284 return Fcons (val
, val
);
5292 DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal
,
5293 Supdate_coding_systems_internal
, 0, 0, 0,
5294 "Update internal database for ISO2022 and CCL based coding systems.\n\
5295 When values of the following coding categories are changed, you must\n\
5296 call this function:\n\
5297 coding-category-iso-7, coding-category-iso-7-tight,\n\
5298 coding-category-iso-8-1, coding-category-iso-8-2,\n\
5299 coding-category-iso-7-else, coding-category-iso-8-else,\n\
5300 coding-category-ccl")
5305 for (i
= CODING_CATEGORY_IDX_ISO_7
; i
<= CODING_CATEGORY_IDX_CCL
; i
++)
5309 val
= XSYMBOL (XVECTOR (Vcoding_category_table
)->contents
[i
])->value
;
5312 if (! coding_system_table
[i
])
5313 coding_system_table
[i
] = ((struct coding_system
*)
5314 xmalloc (sizeof (struct coding_system
)));
5315 setup_coding_system (val
, coding_system_table
[i
]);
5317 else if (coding_system_table
[i
])
5319 xfree (coding_system_table
[i
]);
5320 coding_system_table
[i
] = NULL
;
5327 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal
,
5328 Sset_coding_priority_internal
, 0, 0, 0,
5329 "Update internal database for the current value of `coding-category-list'.\n\
5330 This function is internal use only.")
5336 val
= Vcoding_category_list
;
5338 while (CONSP (val
) && i
< CODING_CATEGORY_IDX_MAX
)
5340 if (! SYMBOLP (XCONS (val
)->car
))
5342 idx
= XFASTINT (Fget (XCONS (val
)->car
, Qcoding_category_index
));
5343 if (idx
>= CODING_CATEGORY_IDX_MAX
)
5345 coding_priorities
[i
++] = (1 << idx
);
5346 val
= XCONS (val
)->cdr
;
5348 /* If coding-category-list is valid and contains all coding
5349 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
5350 the following code saves Emacs from crashing. */
5351 while (i
< CODING_CATEGORY_IDX_MAX
)
5352 coding_priorities
[i
++] = CODING_CATEGORY_MASK_RAW_TEXT
;
5360 /*** 9. Post-amble ***/
5365 conversion_buffer
= (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE
);
5373 /* Emacs' internal format specific initialize routine. */
5374 for (i
= 0; i
<= 0x20; i
++)
5375 emacs_code_class
[i
] = EMACS_control_code
;
5376 emacs_code_class
[0x0A] = EMACS_linefeed_code
;
5377 emacs_code_class
[0x0D] = EMACS_carriage_return_code
;
5378 for (i
= 0x21 ; i
< 0x7F; i
++)
5379 emacs_code_class
[i
] = EMACS_ascii_code
;
5380 emacs_code_class
[0x7F] = EMACS_control_code
;
5381 emacs_code_class
[0x80] = EMACS_leading_code_composition
;
5382 for (i
= 0x81; i
< 0xFF; i
++)
5383 emacs_code_class
[i
] = EMACS_invalid_code
;
5384 emacs_code_class
[LEADING_CODE_PRIVATE_11
] = EMACS_leading_code_3
;
5385 emacs_code_class
[LEADING_CODE_PRIVATE_12
] = EMACS_leading_code_3
;
5386 emacs_code_class
[LEADING_CODE_PRIVATE_21
] = EMACS_leading_code_4
;
5387 emacs_code_class
[LEADING_CODE_PRIVATE_22
] = EMACS_leading_code_4
;
5389 /* ISO2022 specific initialize routine. */
5390 for (i
= 0; i
< 0x20; i
++)
5391 iso_code_class
[i
] = ISO_control_code
;
5392 for (i
= 0x21; i
< 0x7F; i
++)
5393 iso_code_class
[i
] = ISO_graphic_plane_0
;
5394 for (i
= 0x80; i
< 0xA0; i
++)
5395 iso_code_class
[i
] = ISO_control_code
;
5396 for (i
= 0xA1; i
< 0xFF; i
++)
5397 iso_code_class
[i
] = ISO_graphic_plane_1
;
5398 iso_code_class
[0x20] = iso_code_class
[0x7F] = ISO_0x20_or_0x7F
;
5399 iso_code_class
[0xA0] = iso_code_class
[0xFF] = ISO_0xA0_or_0xFF
;
5400 iso_code_class
[ISO_CODE_CR
] = ISO_carriage_return
;
5401 iso_code_class
[ISO_CODE_SO
] = ISO_shift_out
;
5402 iso_code_class
[ISO_CODE_SI
] = ISO_shift_in
;
5403 iso_code_class
[ISO_CODE_SS2_7
] = ISO_single_shift_2_7
;
5404 iso_code_class
[ISO_CODE_ESC
] = ISO_escape
;
5405 iso_code_class
[ISO_CODE_SS2
] = ISO_single_shift_2
;
5406 iso_code_class
[ISO_CODE_SS3
] = ISO_single_shift_3
;
5407 iso_code_class
[ISO_CODE_CSI
] = ISO_control_sequence_introducer
;
5409 conversion_buffer_size
= MINIMUM_CONVERSION_BUFFER_SIZE
;
5411 setup_coding_system (Qnil
, &keyboard_coding
);
5412 setup_coding_system (Qnil
, &terminal_coding
);
5413 setup_coding_system (Qnil
, &safe_terminal_coding
);
5414 setup_coding_system (Qnil
, &default_buffer_file_coding
);
5416 bzero (coding_system_table
, sizeof coding_system_table
);
5418 bzero (ascii_skip_code
, sizeof ascii_skip_code
);
5419 for (i
= 0; i
< 128; i
++)
5420 ascii_skip_code
[i
] = 1;
5422 #if defined (MSDOS) || defined (WINDOWSNT)
5423 system_eol_type
= CODING_EOL_CRLF
;
5425 system_eol_type
= CODING_EOL_LF
;
5434 Qtarget_idx
= intern ("target-idx");
5435 staticpro (&Qtarget_idx
);
5437 Qcoding_system_history
= intern ("coding-system-history");
5438 staticpro (&Qcoding_system_history
);
5439 Fset (Qcoding_system_history
, Qnil
);
5441 /* Target FILENAME is the first argument. */
5442 Fput (Qinsert_file_contents
, Qtarget_idx
, make_number (0));
5443 /* Target FILENAME is the third argument. */
5444 Fput (Qwrite_region
, Qtarget_idx
, make_number (2));
5446 Qcall_process
= intern ("call-process");
5447 staticpro (&Qcall_process
);
5448 /* Target PROGRAM is the first argument. */
5449 Fput (Qcall_process
, Qtarget_idx
, make_number (0));
5451 Qcall_process_region
= intern ("call-process-region");
5452 staticpro (&Qcall_process_region
);
5453 /* Target PROGRAM is the third argument. */
5454 Fput (Qcall_process_region
, Qtarget_idx
, make_number (2));
5456 Qstart_process
= intern ("start-process");
5457 staticpro (&Qstart_process
);
5458 /* Target PROGRAM is the third argument. */
5459 Fput (Qstart_process
, Qtarget_idx
, make_number (2));
5461 Qopen_network_stream
= intern ("open-network-stream");
5462 staticpro (&Qopen_network_stream
);
5463 /* Target SERVICE is the fourth argument. */
5464 Fput (Qopen_network_stream
, Qtarget_idx
, make_number (3));
5466 Qcoding_system
= intern ("coding-system");
5467 staticpro (&Qcoding_system
);
5469 Qeol_type
= intern ("eol-type");
5470 staticpro (&Qeol_type
);
5472 Qbuffer_file_coding_system
= intern ("buffer-file-coding-system");
5473 staticpro (&Qbuffer_file_coding_system
);
5475 Qpost_read_conversion
= intern ("post-read-conversion");
5476 staticpro (&Qpost_read_conversion
);
5478 Qpre_write_conversion
= intern ("pre-write-conversion");
5479 staticpro (&Qpre_write_conversion
);
5481 Qno_conversion
= intern ("no-conversion");
5482 staticpro (&Qno_conversion
);
5484 Qundecided
= intern ("undecided");
5485 staticpro (&Qundecided
);
5487 Qcoding_system_p
= intern ("coding-system-p");
5488 staticpro (&Qcoding_system_p
);
5490 Qcoding_system_error
= intern ("coding-system-error");
5491 staticpro (&Qcoding_system_error
);
5493 Fput (Qcoding_system_error
, Qerror_conditions
,
5494 Fcons (Qcoding_system_error
, Fcons (Qerror
, Qnil
)));
5495 Fput (Qcoding_system_error
, Qerror_message
,
5496 build_string ("Invalid coding system"));
5498 Qcoding_category
= intern ("coding-category");
5499 staticpro (&Qcoding_category
);
5500 Qcoding_category_index
= intern ("coding-category-index");
5501 staticpro (&Qcoding_category_index
);
5503 Vcoding_category_table
5504 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX
), Qnil
);
5505 staticpro (&Vcoding_category_table
);
5508 for (i
= 0; i
< CODING_CATEGORY_IDX_MAX
; i
++)
5510 XVECTOR (Vcoding_category_table
)->contents
[i
]
5511 = intern (coding_category_name
[i
]);
5512 Fput (XVECTOR (Vcoding_category_table
)->contents
[i
],
5513 Qcoding_category_index
, make_number (i
));
5517 Qtranslation_table
= intern ("translation-table");
5518 staticpro (&Qtranslation_table
);
5519 Fput (Qtranslation_table
, Qchar_table_extra_slots
, make_number (1));
5521 Qtranslation_table_id
= intern ("translation-table-id");
5522 staticpro (&Qtranslation_table_id
);
5524 Qtranslation_table_for_decode
= intern ("translation-table-for-decode");
5525 staticpro (&Qtranslation_table_for_decode
);
5527 Qtranslation_table_for_encode
= intern ("translation-table-for-encode");
5528 staticpro (&Qtranslation_table_for_encode
);
5530 Qsafe_charsets
= intern ("safe-charsets");
5531 staticpro (&Qsafe_charsets
);
5533 Qvalid_codes
= intern ("valid-codes");
5534 staticpro (&Qvalid_codes
);
5536 Qemacs_mule
= intern ("emacs-mule");
5537 staticpro (&Qemacs_mule
);
5539 Qraw_text
= intern ("raw-text");
5540 staticpro (&Qraw_text
);
5542 defsubr (&Scoding_system_p
);
5543 defsubr (&Sread_coding_system
);
5544 defsubr (&Sread_non_nil_coding_system
);
5545 defsubr (&Scheck_coding_system
);
5546 defsubr (&Sdetect_coding_region
);
5547 defsubr (&Sdetect_coding_string
);
5548 defsubr (&Sdecode_coding_region
);
5549 defsubr (&Sencode_coding_region
);
5550 defsubr (&Sdecode_coding_string
);
5551 defsubr (&Sencode_coding_string
);
5552 defsubr (&Sdecode_sjis_char
);
5553 defsubr (&Sencode_sjis_char
);
5554 defsubr (&Sdecode_big5_char
);
5555 defsubr (&Sencode_big5_char
);
5556 defsubr (&Sset_terminal_coding_system_internal
);
5557 defsubr (&Sset_safe_terminal_coding_system_internal
);
5558 defsubr (&Sterminal_coding_system
);
5559 defsubr (&Sset_keyboard_coding_system_internal
);
5560 defsubr (&Skeyboard_coding_system
);
5561 defsubr (&Sfind_operation_coding_system
);
5562 defsubr (&Supdate_coding_systems_internal
);
5563 defsubr (&Sset_coding_priority_internal
);
5565 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list
,
5566 "List of coding systems.\n\
5568 Do not alter the value of this variable manually. This variable should be\n\
5569 updated by the functions `make-coding-system' and\n\
5570 `define-coding-system-alias'.");
5571 Vcoding_system_list
= Qnil
;
5573 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist
,
5574 "Alist of coding system names.\n\
5575 Each element is one element list of coding system name.\n\
5576 This variable is given to `completing-read' as TABLE argument.\n\
5578 Do not alter the value of this variable manually. This variable should be\n\
5579 updated by the functions `make-coding-system' and\n\
5580 `define-coding-system-alias'.");
5581 Vcoding_system_alist
= Qnil
;
5583 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list
,
5584 "List of coding-categories (symbols) ordered by priority.");
5588 Vcoding_category_list
= Qnil
;
5589 for (i
= CODING_CATEGORY_IDX_MAX
- 1; i
>= 0; i
--)
5590 Vcoding_category_list
5591 = Fcons (XVECTOR (Vcoding_category_table
)->contents
[i
],
5592 Vcoding_category_list
);
5595 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read
,
5596 "Specify the coding system for read operations.\n\
5597 It is useful to bind this variable with `let', but do not set it globally.\n\
5598 If the value is a coding system, it is used for decoding on read operation.\n\
5599 If not, an appropriate element is used from one of the coding system alists:\n\
5600 There are three such tables, `file-coding-system-alist',\n\
5601 `process-coding-system-alist', and `network-coding-system-alist'.");
5602 Vcoding_system_for_read
= Qnil
;
5604 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write
,
5605 "Specify the coding system for write operations.\n\
5606 It is useful to bind this variable with `let', but do not set it globally.\n\
5607 If the value is a coding system, it is used for encoding on write operation.\n\
5608 If not, an appropriate element is used from one of the coding system alists:\n\
5609 There are three such tables, `file-coding-system-alist',\n\
5610 `process-coding-system-alist', and `network-coding-system-alist'.");
5611 Vcoding_system_for_write
= Qnil
;
5613 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used
,
5614 "Coding system used in the latest file or process I/O.");
5615 Vlast_coding_system_used
= Qnil
;
5617 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion
,
5618 "*Non-nil means always inhibit code conversion of end-of-line format.\n\
5619 See info node `Coding Systems' and info node `Text and Binary' concerning\n\
5621 inhibit_eol_conversion
= 0;
5623 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system
,
5624 "Non-nil means process buffer inherits coding system of process output.\n\
5625 Bind it to t if the process output is to be treated as if it were a file\n\
5626 read from some filesystem.");
5627 inherit_process_coding_system
= 0;
5629 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist
,
5630 "Alist to decide a coding system to use for a file I/O operation.\n\
5631 The format is ((PATTERN . VAL) ...),\n\
5632 where PATTERN is a regular expression matching a file name,\n\
5633 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5634 If VAL is a coding system, it is used for both decoding and encoding\n\
5635 the file contents.\n\
5636 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5637 and the cdr part is used for encoding.\n\
5638 If VAL is a function symbol, the function must return a coding system\n\
5639 or a cons of coding systems which are used as above.\n\
5641 See also the function `find-operation-coding-system'\n\
5642 and the variable `auto-coding-alist'.");
5643 Vfile_coding_system_alist
= Qnil
;
5645 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist
,
5646 "Alist to decide a coding system to use for a process I/O operation.\n\
5647 The format is ((PATTERN . VAL) ...),\n\
5648 where PATTERN is a regular expression matching a program name,\n\
5649 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5650 If VAL is a coding system, it is used for both decoding what is received\n\
5651 from the program and encoding what is sent to the program.\n\
5652 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5653 and the cdr part is used for encoding.\n\
5654 If VAL is a function symbol, the function must return a coding system\n\
5655 or a cons of coding systems which are used as above.\n\
5657 See also the function `find-operation-coding-system'.");
5658 Vprocess_coding_system_alist
= Qnil
;
5660 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist
,
5661 "Alist to decide a coding system to use for a network I/O operation.\n\
5662 The format is ((PATTERN . VAL) ...),\n\
5663 where PATTERN is a regular expression matching a network service name\n\
5664 or is a port number to connect to,\n\
5665 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5666 If VAL is a coding system, it is used for both decoding what is received\n\
5667 from the network stream and encoding what is sent to the network stream.\n\
5668 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5669 and the cdr part is used for encoding.\n\
5670 If VAL is a function symbol, the function must return a coding system\n\
5671 or a cons of coding systems which are used as above.\n\
5673 See also the function `find-operation-coding-system'.");
5674 Vnetwork_coding_system_alist
= Qnil
;
5676 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix
,
5677 "*String displayed in mode line for UNIX-like (LF) end-of-line format.");
5678 eol_mnemonic_unix
= build_string (":");
5680 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos
,
5681 "*String displayed in mode line for DOS-like (CRLF) end-of-line format.");
5682 eol_mnemonic_dos
= build_string ("\\");
5684 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac
,
5685 "*String displayed in mode line for MAC-like (CR) end-of-line format.");
5686 eol_mnemonic_mac
= build_string ("/");
5688 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided
,
5689 "*String displayed in mode line when end-of-line format is not yet determined.");
5690 eol_mnemonic_undecided
= build_string (":");
5692 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation
,
5693 "*Non-nil enables character translation while encoding and decoding.");
5694 Venable_character_translation
= Qt
;
5696 DEFVAR_LISP ("standard-translation-table-for-decode",
5697 &Vstandard_translation_table_for_decode
,
5698 "Table for translating characters while decoding.");
5699 Vstandard_translation_table_for_decode
= Qnil
;
5701 DEFVAR_LISP ("standard-translation-table-for-encode",
5702 &Vstandard_translation_table_for_encode
,
5703 "Table for translating characters while encoding.");
5704 Vstandard_translation_table_for_encode
= Qnil
;
5706 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist
,
5707 "Alist of charsets vs revision numbers.\n\
5708 While encoding, if a charset (car part of an element) is found,\n\
5709 designate it with the escape sequence identifying revision (cdr part of the element).");
5710 Vcharset_revision_alist
= Qnil
;
5712 DEFVAR_LISP ("default-process-coding-system",
5713 &Vdefault_process_coding_system
,
5714 "Cons of coding systems used for process I/O by default.\n\
5715 The car part is used for decoding a process output,\n\
5716 the cdr part is used for encoding a text to be sent to a process.");
5717 Vdefault_process_coding_system
= Qnil
;
5719 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table
,
5720 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
5721 This is a vector of length 256.\n\
5722 If Nth element is non-nil, the existence of code N in a file\n\
5723 \(or output of subprocess) doesn't prevent it from being detected as\n\
5724 a coding system of ISO 2022 variant which has a flag\n\
5725 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
5726 or reading output of a subprocess.\n\
5727 Only 128th through 159th elements have a meaning.");
5728 Vlatin_extra_code_table
= Fmake_vector (make_number (256), Qnil
);
5730 DEFVAR_LISP ("select-safe-coding-system-function",
5731 &Vselect_safe_coding_system_function
,
5732 "Function to call to select safe coding system for encoding a text.\n\
5734 If set, this function is called to force a user to select a proper\n\
5735 coding system which can encode the text in the case that a default\n\
5736 coding system used in each operation can't encode the text.\n\
5738 The default value is `select-safe-coding-system' (which see).");
5739 Vselect_safe_coding_system_function
= Qnil
;