1 /* Coding system handler (conversion, detection, etc.).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
5 This file is part of GNU Emacs.
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /*** TABLE OF CONTENTS ***
24 1. Preamble
25 2. Emacs' internal format (emacs-mule) handlers
26 3. ISO2022 handlers
27 4. Shift-JIS and BIG5 handlers
28 5. End-of-line handlers
29 6. C library functions
30 7. Emacs Lisp library functions
35 /*** GENERAL NOTE on CODING SYSTEM ***
37 Coding system is an encoding mechanism of one or more character
38 sets. Here's a list of coding systems which Emacs can handle. When
39 we say "decode", it means converting some other coding system to
40 Emacs' internal format (emacs-mule), and when we say "encode",
41 it means converting the coding system emacs-mule to some other
42 coding system.
44 0. Emacs' internal format (emacs-mule)
46 Emacs itself holds a multi-lingual character in a buffer and a string
47 in a special format. Details are described in section 2.
49 1. ISO2022
51 The most famous coding system for multiple character sets. X's
52 Compound Text, various EUCs (Extended Unix Code), and coding
53 systems used in Internet communication such as ISO-2022-JP are
54 all variants of ISO2022. Details are described in section 3.
56 2. SJIS (or Shift-JIS or MS-Kanji-Code)
58 A coding system to encode character sets: ASCII, JISX0201, and
59 JISX0208. Widely used for PC's in Japan. Details are described in
60 section 4.
62 3. BIG5
64 A coding system to encode character sets: ASCII and Big5. Widely
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
66 described in section 4. In this file, when we write "BIG5"
67 (all uppercase), we mean the coding system, and when we write
68 "Big5" (capitalized), we mean the character set.
70 4. Raw text
72 A coding system for a text containing random 8-bit code. Emacs does
73 no code conversion on such a text except for end-of-line format.
75 5. Other
77 If a user wants to read/write a text encoded in a coding system not
78 listed above, he can supply a decoder and an encoder for it in CCL
79 (Code Conversion Language) programs. Emacs executes the CCL program
80 while reading/writing.
82 Emacs represents a coding system by a Lisp symbol that has a property
83 `coding-system'. But, before actually using the coding system, the
84 information about it is set in a structure of type `struct
85 coding_system' for rapid processing. See section 6 for more details.
89 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
91 How end-of-line of a text is encoded depends on a system. For
92 instance, Unix's format is just one byte of `line-feed' code,
93 whereas DOS's format is two-byte sequence of `carriage-return' and
94 `line-feed' codes. MacOS's format is usually one byte of
95 `carriage-return'.
97 Since text characters encoding and end-of-line encoding are
98 independent, any coding system described above can take
99 any format of end-of-line. So, Emacs has information of format of
100 end-of-line in each coding-system. See section 6 for more details.
104 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
106 These functions check if a text between SRC and SRC_END is encoded
107 in the coding system category XXX. Each returns an integer value in
108 which appropriate flag bits for the category XXX are set. The flag
109 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
110 template of these functions. */
113 detect_coding_emacs_mule (src
, src_end
)
114 unsigned char *src
, *src_end
;
120 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
122 These functions decode SRC_BYTES length text at SOURCE encoded in
123 CODING to Emacs' internal format (emacs-mule). The resulting text
124 goes to a place pointed to by DESTINATION, the length of which
125 should not exceed DST_BYTES. These functions set the information of
126 original and decoded texts in the members produced, produced_char,
127 consumed, and consumed_char of the structure *CODING.
129 The return value is an integer (CODING_FINISH_XXX) indicating how
130 the decoding finished.
132 DST_BYTES zero means that source area and destination area are
133 overlapped, which means that we can produce a decoded text until it
134 reaches at the head of not-yet-decoded source text.
136 Below is a template of these functions. */
138 decode_coding_XXX (coding
, source
, destination
, src_bytes
, dst_bytes
)
139 struct coding_system
*coding
;
140 unsigned char *source
, *destination
;
141 int src_bytes
, dst_bytes
;
147 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
149 These functions encode SRC_BYTES length text at SOURCE of Emacs'
150 internal format (emacs-mule) to CODING. The resulting text goes to
151 a place pointed to by DESTINATION, the length of which should not
152 exceed DST_BYTES. These functions set the information of
153 original and encoded texts in the members produced, produced_char,
154 consumed, and consumed_char of the structure *CODING.
156 The return value is an integer (CODING_FINISH_XXX) indicating how
157 the encoding finished.
159 DST_BYTES zero means that source area and destination area are
160 overlapped, which means that we can produce an encoded text until it
161 reaches the head of not-yet-encoded source text.
163 Below is a template of these functions. */
165 encode_coding_XXX (coding
, source
, destination
, src_bytes
, dst_bytes
)
166 struct coding_system
*coding
;
167 unsigned char *source
, *destination
;
168 int src_bytes
, dst_bytes
;
174 /*** COMMONLY USED MACROS ***/
176 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
177 THREE_MORE_BYTES safely get one, two, and three bytes from the
178 source text respectively. If there are not enough bytes in the
179 source, they jump to `label_end_of_loop'. The caller should set
180 variables `src' and `src_end' to appropriate areas in advance. */
/* Fetch one byte from the source into C1 and advance `src'.  If the
   source is exhausted, jump to `label_end_of_loop' without touching
   C1.  Wrapped in do/while (0) so that it is usable as one statement,
   e.g. as the unbraced body of an `if'.  */
#define ONE_MORE_BYTE(c1) \
  do { \
    if (src < src_end) \
      c1 = *src++; \
    else \
      goto label_end_of_loop; \
  } while (0)

/* Fetch two bytes into C1 and C2.  This is all-or-nothing: unless both
   bytes are available, nothing is consumed and control jumps to
   `label_end_of_loop'.  */
#define TWO_MORE_BYTES(c1, c2) \
  do { \
    if (src + 1 < src_end) \
      c1 = *src++, c2 = *src++; \
    else \
      goto label_end_of_loop; \
  } while (0)

/* Fetch three bytes into C1, C2 and C3.  This is all-or-nothing:
   unless all three bytes are available, nothing is consumed and
   control jumps to `label_end_of_loop'.  */
#define THREE_MORE_BYTES(c1, c2, c3) \
  do { \
    if (src + 2 < src_end) \
      c1 = *src++, c2 = *src++, c3 = *src++; \
    else \
      goto label_end_of_loop; \
  } while (0)
206 /* The following three macros DECODE_CHARACTER_ASCII,
207 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
208 the multi-byte form of a character of each class at the place
209 pointed by `dst'. The caller should set the variable `dst' to
210 point to an appropriate area and the variable `coding' to point to
211 the coding-system of the currently decoding text in advance. */
/* Decode one ASCII character C.

   While a composition is in progress (COMPOSING_P), C is stored as the
   two-byte component form 0xA0, C | 0x80 and coding->produced_char is
   left alone.  Otherwise C is stored as is and coding->produced_char
   is incremented.

   NOTE(review): the plain `*dst++ = (c)' store follows the pattern of
   DECODE_CHARACTER_DIMENSION1 -- confirm against the upstream file.  */

#define DECODE_CHARACTER_ASCII(c) \
  do { \
    if (COMPOSING_P (coding->composing)) \
      *dst++ = 0xA0, *dst++ = (c) | 0x80; \
    else \
      { \
        *dst++ = (c); \
        coding->produced_char++; \
      } \
  } while (0)
/* Decode one DIMENSION1 character whose charset is CHARSET and whose
   position-code is C.

   The base leading-code of CHARSET is stored first; while composing it
   is stored with 0x20 added and coding->produced_char is not
   incremented.  An extended leading-code, if CHARSET has one (non-zero
   CHARSET_LEADING_CODE_EXT), follows, and finally C with the MSB set.  */

#define DECODE_CHARACTER_DIMENSION1(charset, c) \
  do { \
    unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset); \
    if (COMPOSING_P (coding->composing)) \
      *dst++ = leading_code + 0x20; \
    else \
      { \
        *dst++ = leading_code; \
        coding->produced_char++; \
      } \
    /* The assignment is intentional: reuse LEADING_CODE for the \
       extended leading-code and emit it only when it is non-zero.  */ \
    if ((leading_code = CHARSET_LEADING_CODE_EXT (charset)) != 0) \
      *dst++ = leading_code; \
    *dst++ = (c) | 0x80; \
  } while (0)
/* Decode one DIMENSION2 character whose charset is CHARSET and whose
   position-codes are C1 and C2.  The leading-code(s) and C1 are
   produced by DECODE_CHARACTER_DIMENSION1; C2 is then appended with
   the MSB set.  */

#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2) \
  do { \
    DECODE_CHARACTER_DIMENSION1 (charset, c1); \
    *dst++ = (c2) | 0x80; \
  } while (0)
254 /*** 1. Preamble ***/
268 #else /* not emacs */
272 #endif /* not emacs */
274 Lisp_Object Qcoding_system
, Qeol_type
;
275 Lisp_Object Qbuffer_file_coding_system
;
276 Lisp_Object Qpost_read_conversion
, Qpre_write_conversion
;
277 Lisp_Object Qno_conversion
, Qundecided
;
278 Lisp_Object Qcoding_system_history
;
279 Lisp_Object Qsafe_charsets
;
281 extern Lisp_Object Qinsert_file_contents
, Qwrite_region
;
282 Lisp_Object Qcall_process
, Qcall_process_region
, Qprocess_argument
;
283 Lisp_Object Qstart_process
, Qopen_network_stream
;
284 Lisp_Object Qtarget_idx
;
286 Lisp_Object Vselect_safe_coding_system_function
;
/* Mnemonic character of each format of end-of-line.  */
int eol_mnemonic_unix;
int eol_mnemonic_dos;
int eol_mnemonic_mac;

/* Mnemonic character to indicate that the format of end-of-line is
   not yet determined.  */
int eol_mnemonic_undecided;
294 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
295 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
300 Lisp_Object Vcoding_system_list
, Vcoding_system_alist
;
302 Lisp_Object Qcoding_system_p
, Qcoding_system_error
;
304 /* Coding system emacs-mule and raw-text are for converting only
305 end-of-line format. */
306 Lisp_Object Qemacs_mule
, Qraw_text
;
308 /* Coding-systems are handed between Emacs Lisp programs and C internal
309 routines by the following three variables. */
310 /* Coding-system for reading files and receiving data from process. */
311 Lisp_Object Vcoding_system_for_read
;
312 /* Coding-system for writing files and sending data to process. */
313 Lisp_Object Vcoding_system_for_write
;
314 /* Coding-system actually used in the latest I/O. */
315 Lisp_Object Vlast_coding_system_used
;
317 /* A vector of length 256 which contains information about special
318 Latin codes (espepcially for dealing with Microsoft code). */
319 Lisp_Object Vlatin_extra_code_table
;
321 /* Flag to inhibit code conversion of end-of-line format. */
322 int inhibit_eol_conversion
;
324 /* Coding system to be used to encode text for terminal display. */
325 struct coding_system terminal_coding
;
327 /* Coding system to be used to encode text for terminal display when
328 terminal coding system is nil. */
329 struct coding_system safe_terminal_coding
;
331 /* Coding system of what is sent from terminal keyboard. */
332 struct coding_system keyboard_coding
;
334 Lisp_Object Vfile_coding_system_alist
;
335 Lisp_Object Vprocess_coding_system_alist
;
336 Lisp_Object Vnetwork_coding_system_alist
;
340 Lisp_Object Qcoding_category
, Qcoding_category_index
;
342 /* List of symbols `coding-category-xxx' ordered by priority. */
343 Lisp_Object Vcoding_category_list
;
345 /* Table of coding categories (Lisp symbols). */
346 Lisp_Object Vcoding_category_table
;
348 /* Table of names of symbol for each coding-category. */
349 char *coding_category_name
[CODING_CATEGORY_IDX_MAX
] = {
350 "coding-category-emacs-mule",
351 "coding-category-sjis",
352 "coding-category-iso-7",
353 "coding-category-iso-7-tight",
354 "coding-category-iso-8-1",
355 "coding-category-iso-8-2",
356 "coding-category-iso-7-else",
357 "coding-category-iso-8-else",
358 "coding-category-big5",
359 "coding-category-raw-text",
360 "coding-category-binary"
363 /* Table pointers to coding systems corresponding to each coding
365 struct coding_system
*coding_system_table
[CODING_CATEGORY_IDX_MAX
];
367 /* Flag to tell if we look up unification table on character code
369 Lisp_Object Venable_character_unification
;
370 /* Standard unification table to look up on decoding (reading). */
371 Lisp_Object Vstandard_character_unification_table_for_decode
;
372 /* Standard unification table to look up on encoding (writing). */
373 Lisp_Object Vstandard_character_unification_table_for_encode
;
375 Lisp_Object Qcharacter_unification_table
;
376 Lisp_Object Qcharacter_unification_table_for_decode
;
377 Lisp_Object Qcharacter_unification_table_for_encode
;
379 /* Alist of charsets vs revision number. */
380 Lisp_Object Vcharset_revision_alist
;
382 /* Default coding systems used for process I/O. */
383 Lisp_Object Vdefault_process_coding_system
;
386 /*** 2. Emacs internal format (emacs-mule) handlers ***/
388 /* Emacs' internal format for encoding multiple character sets is a
389 kind of multi-byte encoding, i.e. characters are encoded by
390 variable-length sequences of one-byte codes. ASCII characters
391 and control characters (e.g. `tab', `newline') are represented by
392 one-byte sequences which are their ASCII codes, in the range 0x00
393 through 0x7F. The other characters are represented by a sequence
394 of `base leading-code', optional `extended leading-code', and one
395 or two `position-code's. The length of the sequence is determined
396 by the base leading-code. Leading-code takes the range 0x80
397 through 0x9F, whereas extended leading-code and position-code take
398 the range 0xA0 through 0xFF. See `charset.h' for more details
399 about leading-code and position-code.
401 There's one exception to this rule. Special leading-code
402 `leading-code-composition' denotes that the following several
403 characters should be composed into one character. Leading-codes of
404 components (except for ASCII) are added 0x20. An ASCII character
405 component is represented by a 2-byte sequence of `0xA0' and
406 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
407 details of composite character. Hence, we can summarize the code
408 range as follows:
410 --- CODE RANGE of Emacs' internal format ---
411 (character set) (range)
412 ASCII 0x00 .. 0x7F
413 ELSE (1st byte) 0x80 .. 0x9F
414 (rest bytes) 0xA0 .. 0xFF
415 ---------------------------------------------
417 */
419 enum emacs_code_class_type emacs_code_class
[256];
/* Go to the next statement only if *SRC is accessible and the code is
   0xA0 or greater; SRC is advanced past the examined byte.  If SRC has
   already reached SRC_END, jump to `label_end_of_switch'.
   NOTE(review): a byte below 0xA0 makes the enclosing function return
   0, matching the "else return 0" contract of the detector that uses
   this macro -- confirm against the upstream file.  */
#define CHECK_CODE_RANGE_A0_FF \
  do { \
    if (src >= src_end) \
      goto label_end_of_switch; \
    else if (*src++ < 0xA0) \
      return 0; \
  } while (0)
431 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
432 Check if a text is encoded in Emacs' internal format. If it is,
433 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
436 detect_coding_emacs_mule (src
, src_end
)
437 unsigned char *src
, *src_end
;
442 while (src
< src_end
)
454 switch (emacs_code_class
[c
])
456 case EMACS_ascii_code
:
457 case EMACS_linefeed_code
:
460 case EMACS_control_code
:
461 if (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
)
465 case EMACS_invalid_code
:
468 case EMACS_leading_code_composition
: /* c == 0x80 */
470 CHECK_CODE_RANGE_A0_FF
;
475 case EMACS_leading_code_4
:
476 CHECK_CODE_RANGE_A0_FF
;
477 /* fall down to check it two more times ... */
479 case EMACS_leading_code_3
:
480 CHECK_CODE_RANGE_A0_FF
;
481 /* fall down to check it one more time ... */
483 case EMACS_leading_code_2
:
484 CHECK_CODE_RANGE_A0_FF
;
492 return CODING_CATEGORY_MASK_EMACS_MULE
;
496 /*** 3. ISO2022 handlers ***/
498 /* The following note describes the coding system ISO2022 briefly.
499 Since the intention of this note is to help in understanding of
500 the programs in this file, some parts are NOT ACCURATE or OVERLY
501 SIMPLIFIED. For the thorough understanding, please refer to the
502 original document of ISO2022.
504 ISO2022 provides many mechanisms to encode several character sets
505 in 7-bit and 8-bit environment. If one chooses 7-bit environment,
506 all text is encoded by codes of less than 128. This may make the
507 encoded text a little bit longer, but the text gets more stability
508 to pass through several gateways (some of them strip off the MSB).
510 There are two kinds of character set: control character set and
511 graphic character set. The former contains control characters such
512 as `newline' and `escape' to provide control functions (control
513 functions are provided also by escape sequences). The latter
514 contains graphic characters such as 'A' and '-'. Emacs recognizes
515 two control character sets and many graphic character sets.
517 Graphic character sets are classified into one of the following
518 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
519 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
520 bytes (DIMENSION) and the number of characters in one dimension
521 (CHARS) of the set. In addition, each character set is assigned an
522 identification tag (called "final character" and denoted as <F>
523 here after) which is unique in each class. <F> of each character
524 set is decided by ECMA(*) when it is registered in ISO. Code range
525 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
527 Note (*): ECMA = European Computer Manufacturers Association
529 Here are examples of graphic character set [NAME(<F>)]:
530 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
531 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
532 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
533 o DIMENSION2_CHARS96 -- none for the moment
535 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
536 C0 [0x00..0x1F] -- control character plane 0
537 GL [0x20..0x7F] -- graphic character plane 0
538 C1 [0x80..0x9F] -- control character plane 1
539 GR [0xA0..0xFF] -- graphic character plane 1
541 A control character set is directly designated and invoked to C0 or
542 C1 by an escape sequence. The most common case is that ISO646's
543 control character set is designated/invoked to C0 and ISO6429's
544 control character set is designated/invoked to C1, and usually
545 these designations/invocations are omitted in a coded text. With
546 7-bit environment, only C0 can be used, and a control character for
547 C1 is encoded by an appropriate escape sequence to fit in the
548 environment. Every control character for C1 has a corresponding
549 escape sequence defined.
551 A graphic character set is at first designated to one of four
552 graphic registers (G0 through G3), then these graphic registers are
553 invoked to GL or GR. These designations and invocations can be
554 done independently. The most common case is that G0 is invoked to
555 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
556 these invocations and designations are omitted in a coded text.
557 With 7-bit environment, only GL can be used.
559 When a graphic character set of CHARS94 is invoked to GL, code 0x20
560 and 0x7F of GL area work as control characters SPACE and DEL
561 respectively, and code 0xA0 and 0xFF of GR area should not be used.
563 There are two ways of invocation: locking-shift and single-shift.
564 With locking-shift, the invocation lasts until the next different
565 invocation, whereas with single-shift, the invocation works only
566 for the following character and doesn't affect locking-shift.
567 Invocations are done by the following control characters or escape
568 sequences.
570 ----------------------------------------------------------------------
571 function control char escape sequence description
572 ----------------------------------------------------------------------
573 SI (shift-in) 0x0F none invoke G0 to GL
574 SO (shift-out) 0x0E none invoke G1 to GL
575 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
576 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
577 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
578 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
579 ----------------------------------------------------------------------
580 The first four are for locking-shift. Control characters for these
581 functions are defined by macros ISO_CODE_XXX in `coding.h'.
583 Designations are done by the following escape sequences.
584 ----------------------------------------------------------------------
585 escape sequence description
586 ----------------------------------------------------------------------
587 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
588 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
589 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
590 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
591 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
592 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
593 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
594 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
595 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
596 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
597 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
598 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
599 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
600 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
601 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
602 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
603 ----------------------------------------------------------------------
605 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
606 of dimension 1, chars 94, and final character <F>, and etc.
608 Note (*): Although these designations are not allowed in ISO2022,
609 Emacs accepts them on decoding, and produces them on encoding
610 CHARS96 character set in a coding system which is characterized as
611 7-bit environment, non-locking-shift, and non-single-shift.
613 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
614 '(' can be omitted. We call this "short-form" hereafter.
616 Now you may notice that there are a lot of ways for encoding the
617 same multilingual text in ISO2022. Actually, there exist many
618 coding systems such as Compound Text (used in X's inter-client
619 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
620 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
621 localized platforms), and all of these are variants of ISO2022.
623 In addition to the above, Emacs handles two more kinds of escape
624 sequences: ISO6429's direction specification and Emacs' private
625 sequence for specifying character composition.
627 ISO6429's direction specification takes the following format:
628 o CSI ']' -- end of the current direction
629 o CSI '0' ']' -- end of the current direction
630 o CSI '1' ']' -- start of left-to-right text
631 o CSI '2' ']' -- start of right-to-left text
632 The control character CSI (0x9B: control sequence introducer) is
633 abbreviated to the escape sequence ESC '[' in 7-bit environment.
635 Character composition specification takes the following format:
636 o ESC '0' -- start character composition
637 o ESC '1' -- end character composition
638 Since these are not standard escape sequences of any ISO, the use
639 of them for these meaning is restricted to Emacs only. */
641 enum iso_code_class_type iso_code_class
[256];
/* Non-zero if CHARSET is acceptable to the coding system registered at
   index IDX of coding_system_table: either it is flagged in that
   coding system's safe_charsets, or a designation is requested for
   it.  */
#define CHARSET_OK(idx, charset) \
  ((CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION \
    != CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
                                              charset)) \
   ? 1 \
   : (coding_system_table[idx]->safe_charsets[charset] ? 1 : 0))
/* Non-zero if the coding system registered at index IDX of
   coding_system_table has a non-negative initial designation for
   graphic register 1.  */
#define SHIFT_OUT_OK(idx) \
  (0 <= CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1))
652 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
653 Check if a text is encoded in ISO2022. If it is, returns an
654 integer in which appropriate flag bits any of:
655 CODING_CATEGORY_MASK_ISO_7
656 CODING_CATEGORY_MASK_ISO_7_TIGHT
657 CODING_CATEGORY_MASK_ISO_8_1
658 CODING_CATEGORY_MASK_ISO_8_2
659 CODING_CATEGORY_MASK_ISO_7_ELSE
660 CODING_CATEGORY_MASK_ISO_8_ELSE
661 are set. If a code which should never appear in ISO2022 is found,
662 returns 0. */
665 detect_coding_iso2022 (src
, src_end
)
666 unsigned char *src
, *src_end
;
668 int mask
= CODING_CATEGORY_MASK_ISO
;
670 int reg
[4], shift_out
= 0;
671 int c
, c1
, i
, charset
;
673 reg
[0] = CHARSET_ASCII
, reg
[1] = reg
[2] = reg
[3] = -1;
674 while (mask
&& src
< src_end
)
683 if (c
>= '(' && c
<= '/')
685 /* Designation sequence for a charset of dimension 1. */
689 if (c1
< ' ' || c1
>= 0x80
690 || (charset
= iso_charset_table
[0][c
>= ','][c1
]) < 0)
691 /* Invalid designation sequence. Just ignore. */
693 reg
[(c
- '(') % 4] = charset
;
697 /* Designation sequence for a charset of dimension 2. */
701 if (c
>= '@' && c
<= 'B')
702 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
703 reg
[0] = charset
= iso_charset_table
[1][0][c
];
704 else if (c
>= '(' && c
<= '/')
709 if (c1
< ' ' || c1
>= 0x80
710 || (charset
= iso_charset_table
[1][c
>= ','][c1
]) < 0)
711 /* Invalid designation sequence. Just ignore. */
713 reg
[(c
- '(') % 4] = charset
;
716 /* Invalid designation sequence. Just ignore. */
719 else if (c
== 'N' || c
== 'n')
723 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE
)
724 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE
)))
726 /* Locking shift out. */
727 mask
&= ~CODING_CATEGORY_MASK_ISO_7BIT
;
728 mask_found
|= CODING_CATEGORY_MASK_ISO_SHIFT
;
733 else if (c
== 'O' || c
== 'o')
737 /* Locking shift in. */
738 mask
&= ~CODING_CATEGORY_MASK_ISO_7BIT
;
739 mask_found
|= CODING_CATEGORY_MASK_ISO_SHIFT
;
744 else if (c
== '0' || c
== '1' || c
== '2')
745 /* Start/end composition. Just ignore. */
748 /* Invalid escape sequence. Just ignore. */
751 /* We found a valid designation sequence for CHARSET. */
752 mask
&= ~CODING_CATEGORY_MASK_ISO_8BIT
;
753 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7
, charset
))
754 mask_found
|= CODING_CATEGORY_MASK_ISO_7
;
756 mask
&= ~CODING_CATEGORY_MASK_ISO_7
;
757 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT
, charset
))
758 mask_found
|= CODING_CATEGORY_MASK_ISO_7_TIGHT
;
760 mask
&= ~CODING_CATEGORY_MASK_ISO_7_TIGHT
;
761 if (! CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE
, charset
))
762 mask
&= ~CODING_CATEGORY_MASK_ISO_7_ELSE
;
763 if (! CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE
, charset
))
764 mask
&= ~CODING_CATEGORY_MASK_ISO_8_ELSE
;
770 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE
)
771 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE
)))
773 /* Locking shift out. */
774 mask
&= ~CODING_CATEGORY_MASK_ISO_7BIT
;
775 mask_found
|= CODING_CATEGORY_MASK_ISO_SHIFT
;
782 /* Locking shift in. */
783 mask
&= ~CODING_CATEGORY_MASK_ISO_7BIT
;
784 mask_found
|= CODING_CATEGORY_MASK_ISO_SHIFT
;
792 int newmask
= CODING_CATEGORY_MASK_ISO_8_ELSE
;
794 if (c
!= ISO_CODE_CSI
)
796 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_1
]->flags
797 & CODING_FLAG_ISO_SINGLE_SHIFT
)
798 newmask
|= CODING_CATEGORY_MASK_ISO_8_1
;
799 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_2
]->flags
800 & CODING_FLAG_ISO_SINGLE_SHIFT
)
801 newmask
|= CODING_CATEGORY_MASK_ISO_8_2
;
803 if (VECTORP (Vlatin_extra_code_table
)
804 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
806 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_1
]->flags
807 & CODING_FLAG_ISO_LATIN_EXTRA
)
808 newmask
|= CODING_CATEGORY_MASK_ISO_8_1
;
809 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_2
]->flags
810 & CODING_FLAG_ISO_LATIN_EXTRA
)
811 newmask
|= CODING_CATEGORY_MASK_ISO_8_2
;
814 mask_found
|= newmask
;
823 if (VECTORP (Vlatin_extra_code_table
)
824 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
828 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_1
]->flags
829 & CODING_FLAG_ISO_LATIN_EXTRA
)
830 newmask
|= CODING_CATEGORY_MASK_ISO_8_1
;
831 if (coding_system_table
[CODING_CATEGORY_IDX_ISO_8_2
]->flags
832 & CODING_FLAG_ISO_LATIN_EXTRA
)
833 newmask
|= CODING_CATEGORY_MASK_ISO_8_2
;
835 mask_found
|= newmask
;
842 unsigned char *src_begin
= src
;
844 mask
&= ~(CODING_CATEGORY_MASK_ISO_7BIT
845 | CODING_CATEGORY_MASK_ISO_7_ELSE
);
846 mask_found
|= CODING_CATEGORY_MASK_ISO_8_1
;
847 while (src
< src_end
&& *src
>= 0xA0)
849 if ((src
- src_begin
- 1) & 1 && src
< src_end
)
850 mask
&= ~CODING_CATEGORY_MASK_ISO_8_2
;
852 mask_found
|= CODING_CATEGORY_MASK_ISO_8_2
;
858 return (mask
& mask_found
);
861 /* Decode a character of which charset is CHARSET and the 1st position
862 code is C1. If dimension of CHARSET is 2, the 2nd position code is
863 fetched from SRC and set to C2. If CHARSET is negative, it means
864 that we are decoding ill formed text, and what we can do is just to
867 #define DECODE_ISO_CHARACTER(charset, c1) \
869 int c_alt, charset_alt = (charset); \
870 if (COMPOSING_HEAD_P (coding->composing)) \
872 *dst++ = LEADING_CODE_COMPOSITION; \
873 if (COMPOSING_WITH_RULE_P (coding->composing)) \
874 /* To tell composition rules are embedded. */ \
876 coding->composing += 2; \
878 if ((charset) >= 0) \
880 if (CHARSET_DIMENSION (charset) == 2) \
882 ONE_MORE_BYTE (c2); \
883 if (iso_code_class[(c2) & 0x7F] != ISO_0x20_or_0x7F \
884 && iso_code_class[(c2) & 0x7F] != ISO_graphic_plane_0) \
890 if (!NILP (unification_table) \
891 && ((c_alt = unify_char (unification_table, \
892 -1, (charset), c1, c2)) >= 0)) \
893 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
895 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
896 DECODE_CHARACTER_ASCII (c1); \
897 else if (CHARSET_DIMENSION (charset_alt) == 1) \
898 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
900 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
901 if (COMPOSING_WITH_RULE_P (coding->composing)) \
902 /* To tell a composition rule follows. */ \
903 coding->composing = COMPOSING_WITH_RULE_RULE; \
906 /* Set designation state into CODING. */
907 #define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
909 int charset = ISO_CHARSET_TABLE (make_number (dimension), \
910 make_number (chars), \
911 make_number (final_char)); \
913 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
914 || coding->safe_charsets[charset])) \
916 if (coding->spec.iso2022.last_invalid_designation_register == 0 \
918 && charset == CHARSET_ASCII) \
920 /* We should insert this designation sequence as is so \
921 that it is surely written back to a file. */ \
922 coding->spec.iso2022.last_invalid_designation_register = -1; \
923 goto label_invalid_code; \
925 coding->spec.iso2022.last_invalid_designation_register = -1; \
926 if ((coding->mode & CODING_MODE_DIRECTION) \
927 && CHARSET_REVERSE_CHARSET (charset) >= 0) \
928 charset = CHARSET_REVERSE_CHARSET (charset); \
929 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
933 coding->spec.iso2022.last_invalid_designation_register = reg; \
934 goto label_invalid_code; \
938 /* Check if the current composing sequence contains only valid codes.
939 If the composing sequence doesn't end before SRC_END, return -1.
940 Else, if it contains only valid codes, return 0.
941 Else return the length of the composing sequence. */
943 int check_composing_code (coding
, src
, src_end
)
944 struct coding_system
*coding
;
945 unsigned char *src
, *src_end
;
947 unsigned char *src_start
= src
;
948 int invalid_code_found
= 0;
949 int charset
, c
, c1
, dim
;
951 while (src
< src_end
)
953 if (*src
++ != ISO_CODE_ESC
) continue;
954 if (src
>= src_end
) break;
955 if ((c
= *src
++) == '1') /* end of compsition */
956 return (invalid_code_found
? src
- src_start
: 0);
957 if (src
+ 2 >= src_end
) break;
958 if (!coding
->flags
& CODING_FLAG_ISO_DESIGNATION
)
959 invalid_code_found
= 1;
966 c
= (*src
>= '@' && *src
<= 'B') ? '(' : *src
++;
968 if (c
>= '(' && c
<= '/')
971 if ((c1
< ' ' || c1
>= 0x80)
972 || (charset
= iso_charset_table
[dim
][c
>= ','][c1
]) < 0
973 || ! coding
->safe_charsets
[charset
]
974 || (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
975 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
))
976 invalid_code_found
= 1;
979 invalid_code_found
= 1;
982 return ((coding
->mode
& CODING_MODE_LAST_BLOCK
) ? src_end
- src_start
: -1);
985 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
988 decode_coding_iso2022 (coding
, source
, destination
, src_bytes
, dst_bytes
)
989 struct coding_system
*coding
;
990 unsigned char *source
, *destination
;
991 int src_bytes
, dst_bytes
;
993 unsigned char *src
= source
;
994 unsigned char *src_end
= source
+ src_bytes
;
995 unsigned char *dst
= destination
;
996 unsigned char *dst_end
= destination
+ dst_bytes
;
997 /* Since the maximum bytes produced by each loop is 7, we subtract 6
998 from DST_END to assure that overflow checking is necessary only
999 at the head of loop. */
1000 unsigned char *adjusted_dst_end
= dst_end
- 6;
1002 /* Charsets invoked to graphic plane 0 and 1 respectively. */
1003 int charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1004 int charset1
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 1);
1005 Lisp_Object unification_table
1006 = coding
->character_unification_table_for_decode
;
1007 int result
= CODING_FINISH_NORMAL
;
1009 if (!NILP (Venable_character_unification
) && NILP (unification_table
))
1010 unification_table
= Vstandard_character_unification_table_for_decode
;
1012 coding
->produced_char
= 0;
1013 coding
->fake_multibyte
= 0;
1014 while (src
< src_end
&& (dst_bytes
1015 ? (dst
< adjusted_dst_end
)
1018 /* SRC_BASE remembers the start position in source in each loop.
1019 The loop will be exited when there's not enough source text
1020 to analyze long escape sequence or 2-byte code (within macros
1021 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
1022 to SRC_BASE before exiting. */
1023 unsigned char *src_base
= src
;
1024 int c1
= *src
++, c2
;
1026 switch (iso_code_class
[c1
])
1028 case ISO_0x20_or_0x7F
:
1029 if (!coding
->composing
1030 && (charset0
< 0 || CHARSET_CHARS (charset0
) == 94))
1032 /* This is SPACE or DEL. */
1034 coding
->produced_char
++;
1037 /* This is a graphic character, we fall down ... */
1039 case ISO_graphic_plane_0
:
1040 if (coding
->composing
== COMPOSING_WITH_RULE_RULE
)
1042 /* This is a composition rule. */
1044 coding
->composing
= COMPOSING_WITH_RULE_TAIL
;
1047 DECODE_ISO_CHARACTER (charset0
, c1
);
1050 case ISO_0xA0_or_0xFF
:
1051 if (charset1
< 0 || CHARSET_CHARS (charset1
) == 94
1052 || coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
)
1053 goto label_invalid_code
;
1054 /* This is a graphic character, we fall down ... */
1056 case ISO_graphic_plane_1
:
1057 if (coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
)
1058 goto label_invalid_code
;
1060 DECODE_ISO_CHARACTER (charset1
, c1
);
1063 case ISO_control_code
:
1064 /* All ISO2022 control characters in this class have the
1065 same representation in Emacs internal format. */
1067 && (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
1068 && (coding
->eol_type
== CODING_EOL_CR
1069 || coding
->eol_type
== CODING_EOL_CRLF
))
1071 result
= CODING_FINISH_INCONSISTENT_EOL
;
1072 goto label_end_of_loop_2
;
1075 coding
->produced_char
++;
1078 case ISO_carriage_return
:
1079 if (coding
->eol_type
== CODING_EOL_CR
)
1081 else if (coding
->eol_type
== CODING_EOL_CRLF
)
1084 if (c1
== ISO_CODE_LF
)
1088 if (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
1090 result
= CODING_FINISH_INCONSISTENT_EOL
;
1091 goto label_end_of_loop_2
;
1099 coding
->produced_char
++;
1103 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
)
1104 || CODING_SPEC_ISO_DESIGNATION (coding
, 1) < 0)
1105 goto label_invalid_code
;
1106 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 1;
1107 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1111 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
))
1112 goto label_invalid_code
;
1113 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 0;
1114 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1117 case ISO_single_shift_2_7
:
1118 case ISO_single_shift_2
:
1119 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
))
1120 goto label_invalid_code
;
1121 /* SS2 is handled as an escape sequence of ESC 'N' */
1123 goto label_escape_sequence
;
1125 case ISO_single_shift_3
:
1126 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
))
1127 goto label_invalid_code
;
1128 /* SS3 is handled as an escape sequence of ESC 'O' */
1130 goto label_escape_sequence
;
1132 case ISO_control_sequence_introducer
:
1133 /* CSI is handled as an escape sequence of ESC '[' ... */
1135 goto label_escape_sequence
;
1139 label_escape_sequence
:
1140 /* Escape sequences handled by Emacs are invocation,
1141 designation, direction specification, and character
1142 composition specification. */
1145 case '&': /* revision of following character set */
1147 if (!(c1
>= '@' && c1
<= '~'))
1148 goto label_invalid_code
;
1150 if (c1
!= ISO_CODE_ESC
)
1151 goto label_invalid_code
;
1153 goto label_escape_sequence
;
1155 case '$': /* designation of 2-byte character set */
1156 if (! (coding
->flags
& CODING_FLAG_ISO_DESIGNATION
))
1157 goto label_invalid_code
;
1159 if (c1
>= '@' && c1
<= 'B')
1160 { /* designation of JISX0208.1978, GB2312.1980,
1162 DECODE_DESIGNATION (0, 2, 94, c1
);
1164 else if (c1
>= 0x28 && c1
<= 0x2B)
1165 { /* designation of DIMENSION2_CHARS94 character set */
1167 DECODE_DESIGNATION (c1
- 0x28, 2, 94, c2
);
1169 else if (c1
>= 0x2C && c1
<= 0x2F)
1170 { /* designation of DIMENSION2_CHARS96 character set */
1172 DECODE_DESIGNATION (c1
- 0x2C, 2, 96, c2
);
1175 goto label_invalid_code
;
1178 case 'n': /* invocation of locking-shift-2 */
1179 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
)
1180 || CODING_SPEC_ISO_DESIGNATION (coding
, 2) < 0)
1181 goto label_invalid_code
;
1182 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 2;
1183 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1186 case 'o': /* invocation of locking-shift-3 */
1187 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
)
1188 || CODING_SPEC_ISO_DESIGNATION (coding
, 3) < 0)
1189 goto label_invalid_code
;
1190 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 3;
1191 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1194 case 'N': /* invocation of single-shift-2 */
1195 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1196 || CODING_SPEC_ISO_DESIGNATION (coding
, 2) < 0)
1197 goto label_invalid_code
;
1199 charset
= CODING_SPEC_ISO_DESIGNATION (coding
, 2);
1200 DECODE_ISO_CHARACTER (charset
, c1
);
1203 case 'O': /* invocation of single-shift-3 */
1204 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1205 || CODING_SPEC_ISO_DESIGNATION (coding
, 3) < 0)
1206 goto label_invalid_code
;
1208 charset
= CODING_SPEC_ISO_DESIGNATION (coding
, 3);
1209 DECODE_ISO_CHARACTER (charset
, c1
);
1212 case '0': case '2': /* start composing */
1213 /* Before processing composing, we must be sure that all
1214 characters being composed are supported by CODING.
1215 If not, we must give up composing and insert the
1216 bunch of codes for composing as is without decoding. */
1220 result1
= check_composing_code (coding
, src
, src_end
);
1222 coding
->composing
= (c1
== '0'
1223 ? COMPOSING_NO_RULE_HEAD
1224 : COMPOSING_WITH_RULE_HEAD
);
1225 else if (result1
> 0)
1227 if (result1
+ 2 < (dst_bytes
? dst_end
: src_base
) - dst
)
1229 bcopy (src_base
, dst
, result1
+ 2);
1232 coding
->produced_char
+= result1
+ 2;
1236 result
= CODING_FINISH_INSUFFICIENT_DST
;
1237 goto label_end_of_loop_2
;
1241 goto label_end_of_loop
;
1245 case '1': /* end composing */
1246 coding
->composing
= COMPOSING_NO
;
1247 coding
->produced_char
++;
1250 case '[': /* specification of direction */
1251 if (coding
->flags
& CODING_FLAG_ISO_NO_DIRECTION
)
1252 goto label_invalid_code
;
1253 /* For the moment, nested direction is not supported.
1254 So, `coding->mode & CODING_MODE_DIRECTION' zero means
1255 left-to-right, and nonzero means right-to-left. */
1259 case ']': /* end of the current direction */
1260 coding
->mode
&= ~CODING_MODE_DIRECTION
;
1262 case '0': /* end of the current direction */
1263 case '1': /* start of left-to-right direction */
1266 coding
->mode
&= ~CODING_MODE_DIRECTION
;
1268 goto label_invalid_code
;
1271 case '2': /* start of right-to-left direction */
1274 coding
->mode
|= CODING_MODE_DIRECTION
;
1276 goto label_invalid_code
;
1280 goto label_invalid_code
;
1285 if (! (coding
->flags
& CODING_FLAG_ISO_DESIGNATION
))
1286 goto label_invalid_code
;
1287 if (c1
>= 0x28 && c1
<= 0x2B)
1288 { /* designation of DIMENSION1_CHARS94 character set */
1290 DECODE_DESIGNATION (c1
- 0x28, 1, 94, c2
);
1292 else if (c1
>= 0x2C && c1
<= 0x2F)
1293 { /* designation of DIMENSION1_CHARS96 character set */
1295 DECODE_DESIGNATION (c1
- 0x2C, 1, 96, c2
);
1299 goto label_invalid_code
;
1302 /* We must update these variables now. */
1303 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1304 charset1
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 1);
1308 while (src_base
< src
)
1309 *dst
++ = *src_base
++;
1310 coding
->fake_multibyte
= 1;
1315 result
= CODING_FINISH_INSUFFICIENT_SRC
;
1316 label_end_of_loop_2
:
1323 if (result
== CODING_FINISH_NORMAL
)
1324 result
= CODING_FINISH_INSUFFICIENT_DST
;
1325 else if (result
!= CODING_FINISH_INCONSISTENT_EOL
1326 && coding
->mode
& CODING_MODE_LAST_BLOCK
)
1328 /* This is the last block of the text to be decoded. We had
1329 better just flush out all remaining codes in the text
1330 although they are not valid characters. */
1331 src_bytes
= src_end
- src
;
1332 if (dst_bytes
&& (dst_end
- dst
< src_bytes
))
1333 src_bytes
= dst_end
- dst
;
1334 bcopy (src
, dst
, src_bytes
);
1337 coding
->fake_multibyte
= 1;
1341 coding
->consumed
= coding
->consumed_char
= src
- source
;
1342 coding
->produced
= dst
- destination
;
1346 /* ISO2022 encoding stuff. */
1349 It is not enough to say just "ISO2022" on encoding, we have to
1350 specify more details. In Emacs, each coding system of ISO2022
1351 variant has the following specifications:
1352 1. Initial designation to G0 thru G3.
1353 2. Allows short-form designation?
1354 3. ASCII should be designated to G0 before control characters?
1355 4. ASCII should be designated to G0 at end of line?
1356 5. 7-bit environment or 8-bit environment?
1357 6. Use locking-shift?
1358 7. Use Single-shift?
1359 And the following two are only for Japanese:
1360 8. Use ASCII in place of JIS0201-1976-Roman?
1361 9. Use JISX0208-1983 in place of JISX0208-1978?
1362 These specifications are encoded in `coding->flags' as flag bits
1363 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
1367 /* Produce codes (escape sequence) for designating CHARSET to graphic
1368 register REG. If <final-char> of CHARSET is '@', 'A', or 'B' and
1369 the coding system CODING allows, produce designation sequence of
1372 #define ENCODE_DESIGNATION(charset, reg, coding) \
1374 unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \
1375 char *intermediate_char_94 = "()*+"; \
1376 char *intermediate_char_96 = ",-./"; \
1377 int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset); \
1378 if (revision < 255) \
1380 *dst++ = ISO_CODE_ESC; \
1382 *dst++ = '@' + revision; \
1384 *dst++ = ISO_CODE_ESC; \
1385 if (CHARSET_DIMENSION (charset) == 1) \
1387 if (CHARSET_CHARS (charset) == 94) \
1388 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1390 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1395 if (CHARSET_CHARS (charset) == 94) \
1397 if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
1399 || final_char < '@' || final_char > 'B') \
1400 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1403 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1405 *dst++ = final_char; \
1406 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
1409 /* The following two macros produce codes (control character or escape
1410 sequence) for ISO2022 single-shift functions (single-shift-2 and
1413 #define ENCODE_SINGLE_SHIFT_2 \
1415 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1416 *dst++ = ISO_CODE_ESC, *dst++ = 'N'; \
1419 *dst++ = ISO_CODE_SS2; \
1420 coding->fake_multibyte = 1; \
1422 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1425 #define ENCODE_SINGLE_SHIFT_3 \
1427 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1428 *dst++ = ISO_CODE_ESC, *dst++ = 'O'; \
1431 *dst++ = ISO_CODE_SS3; \
1432 coding->fake_multibyte = 1; \
1434 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
1437 /* The following four macros produce codes (control character or
1438 escape sequence) for ISO2022 locking-shift functions (shift-in,
1439 shift-out, locking-shift-2, and locking-shift-3). */
1441 #define ENCODE_SHIFT_IN \
1443 *dst++ = ISO_CODE_SI; \
1444 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1447 #define ENCODE_SHIFT_OUT \
1449 *dst++ = ISO_CODE_SO; \
1450 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1453 #define ENCODE_LOCKING_SHIFT_2 \
1455 *dst++ = ISO_CODE_ESC, *dst++ = 'n'; \
1456 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1459 #define ENCODE_LOCKING_SHIFT_3 \
1461 *dst++ = ISO_CODE_ESC, *dst++ = 'o'; \
1462 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1465 /* Produce codes for a DIMENSION1 character whose character set is
1466 CHARSET and whose position-code is C1. Designation and invocation
1467 sequences are also produced in advance if necessary. */
1470 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
1472 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1474 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1475 *dst++ = c1 & 0x7F; \
1477 *dst++ = c1 | 0x80; \
1478 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1481 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1483 *dst++ = c1 & 0x7F; \
1486 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1488 *dst++ = c1 | 0x80; \
1491 else if (coding->flags & CODING_FLAG_ISO_SAFE \
1492 && !coding->safe_charsets[charset]) \
1494 /* We should not encode this character, instead produce one or \
1496 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1497 if (CHARSET_WIDTH (charset) == 2) \
1498 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1502 /* Since CHARSET is not yet invoked to any graphic planes, we \
1503 must invoke it, or, at first, designate it to some graphic \
1504 register. Then repeat the loop to actually produce the \
1506 dst = encode_invocation_designation (charset, coding, dst); \
1509 /* Produce codes for a DIMENSION2 character whose character set is
1510 CHARSET and whose position-codes are C1 and C2. Designation and
1511 invocation codes are also produced in advance if necessary. */
1513 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
1515 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1517 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1518 *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F; \
1520 *dst++ = c1 | 0x80, *dst++ = c2 | 0x80; \
1521 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1524 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1526 *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F; \
1529 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1531 *dst++ = c1 | 0x80, *dst++= c2 | 0x80; \
1534 else if (coding->flags & CODING_FLAG_ISO_SAFE \
1535 && !coding->safe_charsets[charset]) \
1537 /* We should not encode this character, instead produce one or \
1539 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1540 if (CHARSET_WIDTH (charset) == 2) \
1541 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1545 /* Since CHARSET is not yet invoked to any graphic planes, we \
1546 must invoke it, or, at first, designate it to some graphic \
1547 register. Then repeat the loop to actually produce the \
1549 dst = encode_invocation_designation (charset, coding, dst); \
1552 #define ENCODE_ISO_CHARACTER(charset, c1, c2) \
1554 int c_alt, charset_alt; \
1555 if (!NILP (unification_table) \
1556 && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1558 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1560 charset_alt = charset; \
1561 if (CHARSET_DIMENSION (charset_alt) == 1) \
1563 if (charset == CHARSET_ASCII \
1564 && coding->flags & CODING_FLAG_ISO_USE_ROMAN) \
1565 charset_alt = charset_latin_jisx0201; \
1566 ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1); \
1570 if (charset == charset_jisx0208 \
1571 && coding->flags & CODING_FLAG_ISO_USE_OLDJIS) \
1572 charset_alt = charset_jisx0208_1978; \
1573 ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
1575 if (! COMPOSING_P (coding->composing)) \
1576 coding->consumed_char++; \
1579 /* Produce designation and invocation codes at a place pointed by DST
1580 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1584 encode_invocation_designation (charset
, coding
, dst
)
1586 struct coding_system
*coding
;
1589 int reg
; /* graphic register number */
1591 /* At first, check designations. */
1592 for (reg
= 0; reg
< 4; reg
++)
1593 if (charset
== CODING_SPEC_ISO_DESIGNATION (coding
, reg
))
1598 /* CHARSET is not yet designated to any graphic registers. */
1599 /* At first check the requested designation. */
1600 reg
= CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
);
1601 if (reg
== CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
)
1602 /* Since CHARSET requests no special designation, designate it
1603 to graphic register 0. */
1606 ENCODE_DESIGNATION (charset
, reg
, coding
);
1609 if (CODING_SPEC_ISO_INVOCATION (coding
, 0) != reg
1610 && CODING_SPEC_ISO_INVOCATION (coding
, 1) != reg
)
1612 /* Since the graphic register REG is not invoked to any graphic
1613 planes, invoke it to graphic plane 0. */
1616 case 0: /* graphic register 0 */
1620 case 1: /* graphic register 1 */
1624 case 2: /* graphic register 2 */
1625 if (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1626 ENCODE_SINGLE_SHIFT_2
;
1628 ENCODE_LOCKING_SHIFT_2
;
1631 case 3: /* graphic register 3 */
1632 if (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1633 ENCODE_SINGLE_SHIFT_3
;
1635 ENCODE_LOCKING_SHIFT_3
;
/* The following three macros produce codes for indicating composition:
   ESC '0' starts a composition without rules, ESC '2' starts a
   composition with rules, and ESC '1' ends a composition.  Each one
   appends two bytes at DST and advances it.  The expansions are
   parenthesized so that each stays a single expression wherever it
   is used.  */
#define ENCODE_COMPOSITION_NO_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
#define ENCODE_COMPOSITION_WITH_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '2')
#define ENCODE_COMPOSITION_END \
  (*dst++ = ISO_CODE_ESC, *dst++ = '1')
/* The following three macros produce codes for indicating direction
   of text.  They append bytes at DST and advance it, and read the
   flags of CODING.  */

/* Produce a control sequence introducer: ESC '[' when CODING is
   restricted to seven bits, else the single byte CSI.  The flag is
   tested with `&' because `coding->flags' carries other flag bits as
   well; comparing the whole word with `==' only matched when no other
   flag was set.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER		\
  do {							\
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)	\
      *dst++ = ISO_CODE_ESC, *dst++ = '[';		\
    else						\
      *dst++ = ISO_CODE_CSI;				\
  } while (0)

/* Start of right-to-left text: CSI '2' ']'.  The introducer is a
   statement, so it cannot be followed by a comma operator; the whole
   sequence is wrapped in its own `do ... while (0)'.  */
#define ENCODE_DIRECTION_R2L			\
  do {						\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;		\
    *dst++ = '2', *dst++ = ']';			\
  } while (0)

/* End of the current direction (back to left-to-right): CSI '0' ']'.  */
#define ENCODE_DIRECTION_L2R			\
  do {						\
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;		\
    *dst++ = '0', *dst++ = ']';			\
  } while (0)
1663 /* Produce codes for designation and invocation to reset the graphic
1664 planes and registers to initial state. */
1665 #define ENCODE_RESET_PLANE_AND_REGISTER \
1668 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
1670 for (reg = 0; reg < 4; reg++) \
1671 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0 \
1672 && (CODING_SPEC_ISO_DESIGNATION (coding, reg) \
1673 != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))) \
1674 ENCODE_DESIGNATION \
1675 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1678 /* Produce designation sequences of charsets in the line started from
1679 SRC to a place pointed by *DSTP, and update DSTP.
1681 If the current block ends before any end-of-line, we may fail to
1682 find all the necessary designations. */
1685 encode_designation_at_bol (coding
, table
, src
, src_end
, dstp
)
1686 struct coding_system
*coding
;
1688 unsigned char *src
, *src_end
, **dstp
;
1690 int charset
, c
, found
= 0, reg
;
1691 /* Table of charsets to be designated to each graphic register. */
1693 unsigned char *dst
= *dstp
;
1695 for (reg
= 0; reg
< 4; reg
++)
1698 while (src
< src_end
&& *src
!= '\n' && found
< 4)
1700 int bytes
= BYTES_BY_CHAR_HEAD (*src
);
1703 charset
= CHARSET_AT (src
);
1707 unsigned char c1
, c2
;
1709 SPLIT_STRING(src
, bytes
, charset
, c1
, c2
);
1710 if ((c_alt
= unify_char (table
, -1, charset
, c1
, c2
)) >= 0)
1711 charset
= CHAR_CHARSET (c_alt
);
1714 reg
= CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
);
1715 if (reg
!= CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
&& r
[reg
] < 0)
1726 for (reg
= 0; reg
< 4; reg
++)
1728 && CODING_SPEC_ISO_DESIGNATION (coding
, reg
) != r
[reg
])
1729 ENCODE_DESIGNATION (r
[reg
], reg
, coding
);
1734 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1737 encode_coding_iso2022 (coding
, source
, destination
, src_bytes
, dst_bytes
)
1738 struct coding_system
*coding
;
1739 unsigned char *source
, *destination
;
1740 int src_bytes
, dst_bytes
;
1742 unsigned char *src
= source
;
1743 unsigned char *src_end
= source
+ src_bytes
;
1744 unsigned char *dst
= destination
;
1745 unsigned char *dst_end
= destination
+ dst_bytes
;
1746 /* Since the maximum bytes produced by each loop is 20, we subtract 19
1747 from DST_END to assure overflow checking is necessary only at the
1749 unsigned char *adjusted_dst_end
= dst_end
- 19;
1750 Lisp_Object unification_table
1751 = coding
->character_unification_table_for_encode
;
1752 int result
= CODING_FINISH_NORMAL
;
1754 if (!NILP (Venable_character_unification
) && NILP (unification_table
))
1755 unification_table
= Vstandard_character_unification_table_for_encode
;
1757 coding
->consumed_char
= 0;
1758 coding
->fake_multibyte
= 0;
1759 while (src
< src_end
&& (dst_bytes
1760 ? (dst
< adjusted_dst_end
)
1761 : (dst
< src
- 19)))
1763 /* SRC_BASE remembers the start position in source in each loop.
1764 The loop will be exited when there's not enough source text
1765 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1766 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1767 reset to SRC_BASE before exiting. */
1768 unsigned char *src_base
= src
;
1769 int charset
, c1
, c2
, c3
, c4
;
1771 if (coding
->flags
& CODING_FLAG_ISO_DESIGNATE_AT_BOL
1772 && CODING_SPEC_ISO_BOL (coding
))
1774 /* We have to produce designation sequences if any now. */
1775 encode_designation_at_bol (coding
, unification_table
,
1776 src
, src_end
, &dst
);
1777 CODING_SPEC_ISO_BOL (coding
) = 0;
1781 /* If we are seeing a component of a composite character, we are
1782 seeing a leading-code encoded irregularly for composition, or
1783 a composition rule if composing with rule. We must set C1 to
1784 a normal leading-code or an ASCII code. If we are not seeing
1785 a composite character, we must reset composition,
1786 designation, and invocation states. */
1787 if (COMPOSING_P (coding
->composing
))
1791 /* We are not in a composite character any longer. */
1792 coding
->composing
= COMPOSING_NO
;
1793 ENCODE_RESET_PLANE_AND_REGISTER
;
1794 ENCODE_COMPOSITION_END
;
1798 if (coding
->composing
== COMPOSING_WITH_RULE_RULE
)
1801 coding
->composing
= COMPOSING_WITH_RULE_HEAD
;
1804 else if (coding
->composing
== COMPOSING_WITH_RULE_HEAD
)
1805 coding
->composing
= COMPOSING_WITH_RULE_RULE
;
1808 /* This is an ASCII component. */
1813 /* This is a leading-code of non ASCII component. */
1818 /* Now encode one character. C1 is a control character, an
1819 ASCII character, or a leading-code of multi-byte character. */
1820 switch (emacs_code_class
[c1
])
1822 case EMACS_ascii_code
:
1823 ENCODE_ISO_CHARACTER (CHARSET_ASCII
, c1
, /* dummy */ c2
);
1826 case EMACS_control_code
:
1827 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_CNTL
)
1828 ENCODE_RESET_PLANE_AND_REGISTER
;
1830 coding
->consumed_char
++;
1833 case EMACS_carriage_return_code
:
1834 if (! (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
))
1836 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_CNTL
)
1837 ENCODE_RESET_PLANE_AND_REGISTER
;
1839 coding
->consumed_char
++;
1842 /* fall down to treat '\r' as '\n' ... */
1844 case EMACS_linefeed_code
:
1845 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_EOL
)
1846 ENCODE_RESET_PLANE_AND_REGISTER
;
1847 if (coding
->flags
& CODING_FLAG_ISO_INIT_AT_BOL
)
1848 bcopy (coding
->spec
.iso2022
.initial_designation
,
1849 coding
->spec
.iso2022
.current_designation
,
1850 sizeof coding
->spec
.iso2022
.initial_designation
);
1851 if (coding
->eol_type
== CODING_EOL_LF
1852 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
1853 *dst
++ = ISO_CODE_LF
;
1854 else if (coding
->eol_type
== CODING_EOL_CRLF
)
1855 *dst
++ = ISO_CODE_CR
, *dst
++ = ISO_CODE_LF
;
1857 *dst
++ = ISO_CODE_CR
;
1858 CODING_SPEC_ISO_BOL (coding
) = 1;
1859 coding
->consumed_char
++;
1862 case EMACS_leading_code_2
:
1866 /* invalid sequence */
1869 coding
->consumed_char
+= 2;
1872 ENCODE_ISO_CHARACTER (c1
, c2
, /* dummy */ c3
);
1875 case EMACS_leading_code_3
:
1876 TWO_MORE_BYTES (c2
, c3
);
1877 if (c2
< 0xA0 || c3
< 0xA0)
1879 /* invalid sequence */
1883 coding
->consumed_char
+= 3;
1885 else if (c1
< LEADING_CODE_PRIVATE_11
)
1886 ENCODE_ISO_CHARACTER (c1
, c2
, c3
);
1888 ENCODE_ISO_CHARACTER (c2
, c3
, /* dummy */ c4
);
1891 case EMACS_leading_code_4
:
1892 THREE_MORE_BYTES (c2
, c3
, c4
);
1893 if (c2
< 0xA0 || c3
< 0xA0 || c4
< 0xA0)
1895 /* invalid sequence */
1900 coding
->consumed_char
+= 4;
1903 ENCODE_ISO_CHARACTER (c2
, c3
, c4
);
1906 case EMACS_leading_code_composition
:
1910 /* invalid sequence */
1913 coding
->consumed_char
+= 2;
1915 else if (c2
== 0xFF)
1917 ENCODE_RESET_PLANE_AND_REGISTER
;
1918 coding
->composing
= COMPOSING_WITH_RULE_HEAD
;
1919 ENCODE_COMPOSITION_WITH_RULE_START
;
1920 coding
->consumed_char
++;
1924 ENCODE_RESET_PLANE_AND_REGISTER
;
1925 /* Rewind one byte because it is a character code of
1926 composition elements. */
1928 coding
->composing
= COMPOSING_NO_RULE_HEAD
;
1929 ENCODE_COMPOSITION_NO_RULE_START
;
1930 coding
->consumed_char
++;
1934 case EMACS_invalid_code
:
1936 coding
->consumed_char
++;
1941 result
= CODING_FINISH_INSUFFICIENT_SRC
;
1948 if (result
== CODING_FINISH_NORMAL
)
1949 result
= CODING_FINISH_INSUFFICIENT_DST
;
1951 /* If this is the last block of the text to be encoded, we
1952 must reset graphic planes and registers to the initial
1953 state, and flush out the carryover if any. */
1954 if (coding
->mode
& CODING_MODE_LAST_BLOCK
)
1955 ENCODE_RESET_PLANE_AND_REGISTER
;
1958 coding
->consumed
= src
- source
;
1959 coding
->produced
= coding
->produced_char
= dst
- destination
;
1964 /*** 4. SJIS and BIG5 handlers ***/
1966 /* Although SJIS and BIG5 are not ISO's coding system, they are used
1967 quite widely. So, for the moment, Emacs supports them in the bare
1968 C code. But, in the future, they may be supported only by CCL. */
1970 /* SJIS is a coding system encoding three character sets: ASCII, right
1971 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
1972 as is. A character of charset katakana-jisx0201 is encoded by
1973 "position-code + 0x80". A character of charset japanese-jisx0208
1974 is encoded in 2 bytes, but the two position-codes are divided and shifted
1975 so that they fit in the range below.
1977 --- CODE RANGE of SJIS ---
1978 (character set) (range)
1980 KATAKANA-JISX0201 0xA0 .. 0xDF
1981 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
1982 (2nd byte) 0x40 .. 0xFF
1983 -------------------------------
1987 /* BIG5 is a coding system encoding two character sets: ASCII and
1988 Big5. An ASCII character is encoded as is. Big5 is a two-byte
1989 character set and is encoded in two-byte.
1991 --- CODE RANGE of BIG5 ---
1992 (character set) (range)
1994 Big5 (1st byte) 0xA1 .. 0xFE
1995 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
1996 --------------------------
1998 Since the number of characters in Big5 is larger than maximum
1999 characters in Emacs' charset (96x96), it can't be handled as one
2000 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
2001 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
2002 contains frequently used characters and the latter contains less
2003 frequently used characters. */
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Every macro argument is parenthesized in the expansions so that
   expressions such as `w >> 8' or `w & 0xFF' can be passed safely.
   B1 and B2 are still evaluated more than once, so they must not
   have side effects.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Set CHARSET, C1 and C2 from the BIG5 bytes B1 and B2.  TEMP is the
   linear index of the character counted from B1 == 0xA1; the second
   byte ranges 0x40..0x7E and 0xA1..0xFE are packed together.  Codes
   whose first byte is 0xC9 or above go to `charset_big5_2', with the
   index rebased to start at zero.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)				\
  do {									\
    unsigned int temp							\
      = (((b1) - 0xA1) * BIG5_SAME_ROW					\
	 + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));				\
    if ((b1) < 0xC9)							\
      (charset) = charset_big5_1;					\
    else								\
      {									\
	(charset) = charset_big5_2;					\
	temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;				\
      }									\
    (c1) = temp / (0xFF - 0xA1) + 0x21;					\
    (c2) = temp % (0xFF - 0xA1) + 0x21;					\
  } while (0)

/* Inverse of DECODE_BIG5: set the BIG5 bytes B1 and B2 from CHARSET
   and the position-codes C1 and C2.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)				\
  do {									\
    unsigned int temp = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);	\
    if ((charset) == charset_big5_2)					\
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);				\
    (b1) = temp / BIG5_SAME_ROW + 0xA1;					\
    (b2) = temp % BIG5_SAME_ROW;					\
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;					\
  } while (0)
2038 #define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
2040 int c_alt, charset_alt = (charset); \
2041 if (!NILP (unification_table) \
2042 && ((c_alt = unify_char (unification_table, \
2043 -1, (charset), c1, c2)) >= 0)) \
2044 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
2045 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
2046 DECODE_CHARACTER_ASCII (c1); \
2047 else if (CHARSET_DIMENSION (charset_alt) == 1) \
2048 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
2050 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
2053 #define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
2055 int c_alt, charset_alt; \
2056 if (!NILP (unification_table) \
2057 && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
2059 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
2061 charset_alt = charset; \
2062 if (charset_alt == charset_ascii) \
2064 else if (CHARSET_DIMENSION (charset_alt) == 1) \
2066 if (sjis_p && charset_alt == charset_katakana_jisx0201) \
2070 *dst++ = charset_alt, *dst++ = c1; \
2071 coding->fake_multibyte = 1; \
2076 c1 &= 0x7F, c2 &= 0x7F; \
2077 if (sjis_p && charset_alt == charset_jisx0208) \
2079 unsigned char s1, s2; \
2081 ENCODE_SJIS (c1, c2, s1, s2); \
2082 *dst++ = s1, *dst++ = s2; \
2083 coding->fake_multibyte = 1; \
2086 && (charset_alt == charset_big5_1 \
2087 || charset_alt == charset_big5_2)) \
2089 unsigned char b1, b2; \
2091 ENCODE_BIG5 (charset_alt, c1, c2, b1, b2); \
2092 *dst++ = b1, *dst++ = b2; \
2096 *dst++ = charset_alt, *dst++ = c1, *dst++ = c2; \
2097 coding->fake_multibyte = 1; \
2100 coding->consumed_char++; \
2103 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2104 Check if a text is encoded in SJIS. If it is, return
2105 CODING_CATEGORY_MASK_SJIS, else return 0. */
2108 detect_coding_sjis (src
, src_end
)
2109 unsigned char *src
, *src_end
;
2113 while (src
< src_end
)
2116 if ((c
>= 0x80 && c
< 0xA0) || c
>= 0xE0)
2118 if (src
< src_end
&& *src
++ < 0x40)
2122 return CODING_CATEGORY_MASK_SJIS
;
2125 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2126 Check if a text is encoded in BIG5. If it is, return
2127 CODING_CATEGORY_MASK_BIG5, else return 0. */
2130 detect_coding_big5 (src
, src_end
)
2131 unsigned char *src
, *src_end
;
2135 while (src
< src_end
)
2143 if (c
< 0x40 || (c
>= 0x7F && c
<= 0xA0))
2147 return CODING_CATEGORY_MASK_BIG5
;
2150 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2151 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */
/* Decode SRC_BYTES bytes at SOURCE into DESTINATION, a buffer of
   DST_BYTES bytes, as SJIS when SJIS_P is 1 and as BIG5 otherwise.
   If CODING has no decode unification table and
   Venable_character_unification is non-nil, the standard table is used.
   RESULT starts as CODING_FINISH_NORMAL and is changed to one of the
   other CODING_FINISH_XXX values assigned below.  On exit
   CODING->consumed and CODING->consumed_char are set to SRC - SOURCE
   and CODING->produced to DST - DESTINATION; CODING->produced_char and
   CODING->fake_multibyte are updated as bytes are emitted.
   NOTE(review): many lines of this function are missing from this
   listing, so only the visible statements are described here.  */
2154 decode_coding_sjis_big5 (coding
, source
, destination
,
2155 src_bytes
, dst_bytes
, sjis_p
)
2156 struct coding_system
*coding
;
2157 unsigned char *source
, *destination
;
2158 int src_bytes
, dst_bytes
;
2161 unsigned char *src
= source
;
2162 unsigned char *src_end
= source
+ src_bytes
;
2163 unsigned char *dst
= destination
;
2164 unsigned char *dst_end
= destination
+ dst_bytes
;
2165 /* Since the maximum bytes produced by each loop is 4, we subtract 3
2166 from DST_END to assure overflow checking is necessary only at the
2168 unsigned char *adjusted_dst_end
= dst_end
- 3;
2169 Lisp_Object unification_table
2170 = coding
->character_unification_table_for_decode
;
2171 int result
= CODING_FINISH_NORMAL
;
2173 if (!NILP (Venable_character_unification
) && NILP (unification_table
))
2174 unification_table
= Vstandard_character_unification_table_for_decode
;
2176 coding
->produced_char
= 0;
2177 coding
->fake_multibyte
= 0;
2178 while (src
< src_end
&& (dst_bytes
2179 ? (dst
< adjusted_dst_end
)
2182 /* SRC_BASE remembers the start position in source in each loop.
2183 The loop will be exited when there's not enough source text
2184 to analyze two-byte character (within macro ONE_MORE_BYTE).
2185 In that case, SRC is reset to SRC_BASE before exiting. */
2186 unsigned char *src_base
= src
;
2187 unsigned char c1
= *src
++, c2
, c3
, c4
;
2193 if (coding
->eol_type
== CODING_EOL_CRLF
)
2198 else if (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
2200 result
= CODING_FINISH_INCONSISTENT_EOL
;
2201 goto label_end_of_loop_2
;
2204 /* To process C2 again, SRC is subtracted by 1. */
2207 else if (coding
->eol_type
== CODING_EOL_CR
)
2213 && (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
2214 && (coding
->eol_type
== CODING_EOL_CR
2215 || coding
->eol_type
== CODING_EOL_CRLF
))
2217 result
= CODING_FINISH_INCONSISTENT_EOL
;
2218 goto label_end_of_loop_2
;
2222 coding
->produced_char
++;
2225 DECODE_SJIS_BIG5_CHARACTER (charset_ascii
, c1
, /* dummy */ c2
);
2228 /* SJIS -> JISX0208 */
2234 DECODE_SJIS (c1
, c2
, c3
, c4
);
2235 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208
, c3
, c4
);
2238 goto label_invalid_code_2
;
2241 goto label_invalid_code_1
;
2245 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
2247 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201
, c1
,
2254 if ((c2
>= 0x40 && c2
<= 0x7E) || (c2
>= 0xA1 && c2
<= 0xFE))
2256 DECODE_BIG5 (c1
, c2
, charset
, c3
, c4
);
2257 DECODE_SJIS_BIG5_CHARACTER (charset
, c3
, c4
);
2260 goto label_invalid_code_2
;
2263 else /* C1 >= 0xE0 */
2265 /* SJIS -> JISX0208, BIG5 -> Big5 */
2271 DECODE_SJIS (c1
, c2
, c3
, c4
);
2272 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208
, c3
, c4
);
2275 goto label_invalid_code_2
;
2282 if ((c2
>= 0x40 && c2
<= 0x7E) || (c2
>= 0xA1 && c2
<= 0xFE))
2284 DECODE_BIG5 (c1
, c2
, charset
, c3
, c4
);
2285 DECODE_SJIS_BIG5_CHARACTER (charset
, c3
, c4
);
2288 goto label_invalid_code_2
;
2293 label_invalid_code_1
:
2295 coding
->produced_char
++;
2296 coding
->fake_multibyte
= 1;
2299 label_invalid_code_2
:
2300 *dst
++ = c1
; *dst
++= c2
;
2301 coding
->produced_char
+= 2;
2302 coding
->fake_multibyte
= 1;
2306 result
= CODING_FINISH_INSUFFICIENT_SRC
;
2307 label_end_of_loop_2
:
2314 if (result
== CODING_FINISH_NORMAL
)
2315 result
= CODING_FINISH_INSUFFICIENT_DST
;
2316 else if (result
!= CODING_FINISH_INCONSISTENT_EOL
2317 && coding
->mode
& CODING_MODE_LAST_BLOCK
)
2319 src_bytes
= src_end
- src
;
2320 if (dst_bytes
&& (dst_end
- dst
< src_bytes
))
2321 src_bytes
= dst_end
- dst
;
/* NOTE(review): bcopy takes its arguments as (from, to, length), as in
   the bcopy (source, destination, src_bytes) calls elsewhere in this
   file.  As written, this call copies from DST into SRC.  The intent
   is presumably to copy the unconverted tail of SRC to DST -- confirm
   and swap the first two arguments.  */
2322 bcopy (dst
, src
, src_bytes
);
2325 coding
->fake_multibyte
= 1;
2329 coding
->consumed
= coding
->consumed_char
= src
- source
;
2330 coding
->produced
= dst
- destination
;
2334 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2335 This function can encode `charset_ascii', `charset_katakana_jisx0201',
2336 `charset_jisx0208', `charset_big5_1', and `charset_big5_2'. We are
2337 sure that all these charsets are registered as official charset
2338 (i.e. do not have extended leading-codes). Characters of other
2339 charsets are produced without any encoding. If SJIS_P is 1, encode
2340 SJIS text, else encode BIG5 text. */
/* Encode SRC_BYTES bytes at SOURCE into DESTINATION, a buffer of
   DST_BYTES bytes.  If CODING has no encode unification table and
   Venable_character_unification is non-nil, the standard table is used.
   Each source character is dispatched on emacs_code_class[C1]: ASCII
   and the multibyte leading codes go through
   ENCODE_SJIS_BIG5_CHARACTER, and a linefeed is written as CR LF when
   CODING->eol_type is CODING_EOL_CRLF.  On exit CODING->consumed is
   SRC - SOURCE, and CODING->produced and CODING->produced_char are both
   DST - DESTINATION; CODING->consumed_char is counted per character.
   NOTE(review): many lines of this function are missing from this
   listing, so only the visible statements are described here.  */
2343 encode_coding_sjis_big5 (coding
, source
, destination
,
2344 src_bytes
, dst_bytes
, sjis_p
)
2345 struct coding_system
*coding
;
2346 unsigned char *source
, *destination
;
2347 int src_bytes
, dst_bytes
;
2350 unsigned char *src
= source
;
2351 unsigned char *src_end
= source
+ src_bytes
;
2352 unsigned char *dst
= destination
;
2353 unsigned char *dst_end
= destination
+ dst_bytes
;
2354 /* Since the maximum bytes produced by each loop is 2, we subtract 1
2355 from DST_END to assure overflow checking is necessary only at the
2357 unsigned char *adjusted_dst_end
= dst_end
- 1;
2358 Lisp_Object unification_table
2359 = coding
->character_unification_table_for_encode
;
2360 int result
= CODING_FINISH_NORMAL
;
2362 if (!NILP (Venable_character_unification
) && NILP (unification_table
))
2363 unification_table
= Vstandard_character_unification_table_for_encode
;
2365 coding
->consumed_char
= 0;
2366 coding
->fake_multibyte
= 0;
2367 while (src
< src_end
&& (dst_bytes
2368 ? (dst
< adjusted_dst_end
)
2371 /* SRC_BASE remembers the start position in source in each loop.
2372 The loop will be exited when there's not enough source text
2373 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2374 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
2376 unsigned char *src_base
= src
;
2377 unsigned char c1
= *src
++, c2
, c3
, c4
;
2379 if (coding
->composing
)
2386 else if (c1
>= 0xA0)
2389 coding
->composing
= 0;
2392 switch (emacs_code_class
[c1
])
2394 case EMACS_ascii_code
:
2395 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii
, c1
, /* dummy */ c2
);
2398 case EMACS_control_code
:
2400 coding
->consumed_char
++;
2403 case EMACS_carriage_return_code
:
2404 if (! (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
))
2407 coding
->consumed_char
++;
2410 /* fall through to treat '\r' as '\n' ... */
2412 case EMACS_linefeed_code
:
2413 if (coding
->eol_type
== CODING_EOL_LF
2414 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
2416 else if (coding
->eol_type
== CODING_EOL_CRLF
)
2417 *dst
++ = '\r', *dst
++ = '\n';
2420 coding
->consumed_char
++;
2423 case EMACS_leading_code_2
:
2425 ENCODE_SJIS_BIG5_CHARACTER (c1
, c2
, /* dummy */ c3
);
2428 case EMACS_leading_code_3
:
2429 TWO_MORE_BYTES (c2
, c3
);
2430 ENCODE_SJIS_BIG5_CHARACTER (c1
, c2
, c3
);
2433 case EMACS_leading_code_4
:
2434 THREE_MORE_BYTES (c2
, c3
, c4
);
2435 ENCODE_SJIS_BIG5_CHARACTER (c2
, c3
, c4
);
2438 case EMACS_leading_code_composition
:
2439 coding
->composing
= 1;
2442 default: /* i.e. case EMACS_invalid_code: */
2444 coding
->consumed_char
++;
2449 result
= CODING_FINISH_INSUFFICIENT_SRC
;
2454 if (result
== CODING_FINISH_NORMAL
2456 result
= CODING_FINISH_INSUFFICIENT_DST
;
2457 coding
->consumed
= src
- source
;
2458 coding
->produced
= coding
->produced_char
= dst
- destination
;
2463 /*** 5. End-of-line handlers ***/
2465 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2466 This function is called only when `coding->eol_type' is
2467 CODING_EOL_CRLF or CODING_EOL_CR. */
/* Convert end-of-line markers while copying SRC_BYTES bytes from SOURCE
   to DESTINATION (DST_BYTES bytes long), selecting the conversion by
   CODING->eol_type.  The CODING_EOL_CRLF case runs a byte loop.  The
   branch after it copies the text in bulk and then rewrites each CR in
   the destination as LF.  The default case is a bulk copy.  A bulk copy
   is truncated to DST_BYTES, with RESULT set to
   CODING_FINISH_INSUFFICIENT_DST, when DST_BYTES is nonzero and smaller
   than SRC_BYTES.  When CODING_MODE_INHIBIT_INCONSISTENT_EOL is set,
   RESULT may become CODING_FINISH_INCONSISTENT_EOL.  On exit
   CODING->consumed and CODING->consumed_char are SRC - SOURCE, and
   CODING->produced and CODING->produced_char are DST - DESTINATION.
   NOTE(review): the case label of the middle branch and several other
   lines are missing from this listing; presumably that branch is
   CODING_EOL_CR -- confirm against the full source.  */
2470 decode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
)
2471 struct coding_system
*coding
;
2472 unsigned char *source
, *destination
;
2473 int src_bytes
, dst_bytes
;
2475 unsigned char *src
= source
;
2476 unsigned char *src_end
= source
+ src_bytes
;
2477 unsigned char *dst
= destination
;
2478 unsigned char *dst_end
= destination
+ dst_bytes
;
2480 int result
= CODING_FINISH_NORMAL
;
2482 coding
->fake_multibyte
= 0;
2487 switch (coding
->eol_type
)
2489 case CODING_EOL_CRLF
:
2491 /* Since the maximum bytes produced by each loop is 2, we
2492 subtract 1 from DST_END to assure overflow checking is
2493 necessary only at the head of loop. */
2494 unsigned char *adjusted_dst_end
= dst_end
- 1;
2496 while (src
< src_end
&& (dst_bytes
2497 ? (dst
< adjusted_dst_end
)
2500 unsigned char *src_base
= src
;
2508 if (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
2510 result
= CODING_FINISH_INCONSISTENT_EOL
;
2511 goto label_end_of_loop_2
;
2514 if (BASE_LEADING_CODE_P (c
))
2515 coding
->fake_multibyte
= 1;
2520 && (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
))
2522 result
= CODING_FINISH_INCONSISTENT_EOL
;
2523 goto label_end_of_loop_2
;
2528 if (BASE_LEADING_CODE_P (c
))
2529 coding
->fake_multibyte
= 1;
2534 result
= CODING_FINISH_INSUFFICIENT_SRC
;
2535 label_end_of_loop_2
:
2539 if (result
== CODING_FINISH_NORMAL
2541 result
= CODING_FINISH_INSUFFICIENT_DST
;
2546 if (coding
->mode
& CODING_MODE_INHIBIT_INCONSISTENT_EOL
)
2548 while (src
< src_end
)
2550 if ((c
= *src
++) == '\n')
2552 if (BASE_LEADING_CODE_P (c
))
2553 coding
->fake_multibyte
= 1;
2557 src_bytes
= src
- source
;
2558 result
= CODING_FINISH_INCONSISTENT_EOL
;
2561 if (dst_bytes
&& src_bytes
> dst_bytes
)
2563 result
= CODING_FINISH_INSUFFICIENT_DST
;
2564 src_bytes
= dst_bytes
;
2567 bcopy (source
, destination
, src_bytes
);
2569 safe_bcopy (source
, destination
, src_bytes
);
2570 src
= source
+ src_bytes
;
2571 while (src_bytes
--) if (*dst
++ == '\r') dst
[-1] = '\n';
2574 default: /* i.e. case: CODING_EOL_LF */
2575 if (dst_bytes
&& src_bytes
> dst_bytes
)
2577 result
= CODING_FINISH_INSUFFICIENT_DST
;
2578 src_bytes
= dst_bytes
;
2581 bcopy (source
, destination
, src_bytes
);
2583 safe_bcopy (source
, destination
, src_bytes
);
2586 coding
->fake_multibyte
= 1;
2590 coding
->consumed
= coding
->consumed_char
= src
- source
;
2591 coding
->produced
= coding
->produced_char
= dst
- destination
;
2595 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
2596 format of end-of-line according to `coding->eol_type'. If
2597 `coding->mode & CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code
2598 '\r' in source text also means end-of-line. */
/* Copy SRC_BYTES bytes from SOURCE to DESTINATION (DST_BYTES bytes
   long) while rewriting end-of-line markers for CODING->eol_type.  In
   the first CODING_EOL_CRLF branch a byte loop emits CR LF, including
   for a CR in the source when CODING_MODE_SELECTIVE_DISPLAY is set.
   The other branch copies in bulk, truncating to DST_BYTES with RESULT
   set to CODING_FINISH_INSUFFICIENT_DST when DST_BYTES is nonzero and
   smaller than SRC_BYTES, and then patches bytes in the destination in
   place.  On exit CODING->consumed and CODING->consumed_char are
   SRC - SOURCE, and CODING->produced and CODING->produced_char are
   DST - DESTINATION.
   NOTE(review): several lines of this function, including the branch
   structure around the bulk copy, are missing from this listing.  */
2601 encode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
)
2602 struct coding_system
*coding
;
2603 unsigned char *source
, *destination
;
2604 int src_bytes
, dst_bytes
;
2606 unsigned char *src
= source
;
2607 unsigned char *dst
= destination
;
2608 int result
= CODING_FINISH_NORMAL
;
2610 coding
->fake_multibyte
= 0;
2612 if (coding
->eol_type
== CODING_EOL_CRLF
)
2615 unsigned char *src_end
= source
+ src_bytes
;
2616 unsigned char *dst_end
= destination
+ dst_bytes
;
2617 /* Since the maximum bytes produced by each loop is 2, we
2618 subtract 1 from DST_END to assure overflow checking is
2619 necessary only at the head of loop. */
2620 unsigned char *adjusted_dst_end
= dst_end
- 1;
2622 while (src
< src_end
&& (dst_bytes
2623 ? (dst
< adjusted_dst_end
)
2628 || (c
== '\r' && (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
)))
2629 *dst
++ = '\r', *dst
++ = '\n';
2633 if (BASE_LEADING_CODE_P (c
))
2634 coding
->fake_multibyte
= 1;
2638 result
= CODING_FINISH_INSUFFICIENT_DST
;
2644 if (dst_bytes
&& src_bytes
> dst_bytes
)
2646 src_bytes
= dst_bytes
;
2647 result
= CODING_FINISH_INSUFFICIENT_DST
;
2650 bcopy (source
, destination
, src_bytes
);
2653 safe_bcopy (source
, destination
, src_bytes
);
2654 dst_bytes
= src_bytes
;
2656 if (coding
->eol_type
== CODING_EOL_CRLF
)
2660 if ((c
= *dst
++) == '\n')
2662 else if (BASE_LEADING_CODE_P (c
))
2663 coding
->fake_multibyte
= 1;
2668 if (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
)
2671 if (*dst
++ == '\r') dst
[-1] = '\n';
2673 coding
->fake_multibyte
= 1;
2675 src
= source
+ dst_bytes
;
2676 dst
= destination
+ dst_bytes
;
2679 coding
->consumed
= coding
->consumed_char
= src
- source
;
2680 coding
->produced
= coding
->produced_char
= dst
- destination
;
2685 /*** 6. C library functions ***/
2687 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2688 has a property `coding-system'. The value of this property is a
2689 vector of length 5 (called the coding-vector). Among the elements of
2690 this vector, the first (element[0]) and the fifth (element[4])
2691 carry important information for decoding/encoding. Before
2692 decoding/encoding, this information should be set in fields of a
2693 structure of type `coding_system'.
2695 A value of property `coding-system' can be a symbol of another
2696 subsidiary coding-system. In that case, Emacs gets coding-vector
2699 `element[0]' contains information to be set in `coding->type'. The
2700 value and its meaning is as follows:
2702 0 -- coding_type_emacs_mule
2703 1 -- coding_type_sjis
2704 2 -- coding_type_iso2022
2705 3 -- coding_type_big5
2706 4 -- coding_type_ccl encoder/decoder written in CCL
2707 nil -- coding_type_no_conversion
2708 t -- coding_type_undecided (automatic conversion on decoding,
2709 no-conversion on encoding)
2711 `element[4]' contains information to be set in `coding->flags' and
2712 `coding->spec'. The meaning varies by `coding->type'.
2714 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2715 of length 32 (of which the first 17 sub-elements are used now).
2716 Meanings of these sub-elements are:
2718 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2719 If the value is an integer of valid charset, the charset is
2720 assumed to be designated to graphic register N initially.
2722 If the value is minus, it is a minus value of charset which
2723 reserves graphic register N, which means that the charset is
2724 not designated initially but should be designated to graphic
2725 register N just before encoding a character in that charset.
2727 If the value is nil, graphic register N is never used on
2730 sub-element[N] where N is 4 through 16: to be set in `coding->flags'
2731 Each value takes t or nil. See the section ISO2022 of
2732 `coding.h' for more information.
2734 If `coding->type' is `coding_type_big5', element[4] is t to denote
2735 BIG5-ETen or nil to denote BIG5-HKU.
2737 If `coding->type' takes the other value, element[4] is ignored.
2739 Emacs Lisp's coding system also carries information about format of
2740 end-of-line in a value of property `eol-type'. If the value is
2741 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2742 means CODING_EOL_CR. If it is not integer, it should be a vector
2743 of subsidiary coding systems of which property `eol-type' has one
2748 /* Extract information for decoding/encoding from CODING_SYSTEM
2749 and set it in CODING. If CODING_SYSTEM is invalid, CODING
2750 is set up so that no conversion is necessary and return -1, else
/* Fill *CODING from the properties of the symbol CODING_SYSTEM.
   The Qcoding_system property must be a vector of 5 elements whose
   element 3 is a cons, which is then read as a property list; otherwise
   control goes to label_invalid_coding_system, which resets CODING to
   coding_type_no_conversion, CODING_CATEGORY_IDX_BINARY and
   CODING_EOL_LF with no pre-write or post-read conversion.
   CODING->eol_type comes from the Qeol_type property unless
   inhibit_eol_conversion is set: a vector means CODING_EOL_UNDECIDED,
   1 means CODING_EOL_CRLF and 2 means CODING_EOL_CR.  Element 0 of the
   vector selects CODING->type, and element 4 carries type-specific
   data: a 32-element flag vector for ISO2022, the BIG5 variant, or a
   cons of two vectors holding the CCL decoder and encoder.
   NOTE(review): many lines of this function (case labels, braces and
   some assignments) are missing from this listing, so only the
   visible statements are described here.  */
2754 setup_coding_system (coding_system
, coding
)
2755 Lisp_Object coding_system
;
2756 struct coding_system
*coding
;
2758 Lisp_Object coding_spec
, coding_type
, eol_type
, plist
;
2762 /* Initialize some fields required for all kinds of coding systems. */
2763 coding
->symbol
= coding_system
;
2764 coding
->common_flags
= 0;
2766 coding
->heading_ascii
= -1;
2767 coding
->post_read_conversion
= coding
->pre_write_conversion
= Qnil
;
2768 coding_spec
= Fget (coding_system
, Qcoding_system
);
2769 if (!VECTORP (coding_spec
)
2770 || XVECTOR (coding_spec
)->size
!= 5
2771 || !CONSP (XVECTOR (coding_spec
)->contents
[3]))
2772 goto label_invalid_coding_system
;
2774 eol_type
= inhibit_eol_conversion
? Qnil
: Fget (coding_system
, Qeol_type
);
2775 if (VECTORP (eol_type
))
2777 coding
->eol_type
= CODING_EOL_UNDECIDED
;
2778 coding
->common_flags
= CODING_REQUIRE_DETECTION_MASK
;
2780 else if (XFASTINT (eol_type
) == 1)
2782 coding
->eol_type
= CODING_EOL_CRLF
;
2783 coding
->common_flags
2784 = CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
2786 else if (XFASTINT (eol_type
) == 2)
2788 coding
->eol_type
= CODING_EOL_CR
;
2789 coding
->common_flags
2790 = CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
2793 coding
->eol_type
= CODING_EOL_LF
;
2795 coding_type
= XVECTOR (coding_spec
)->contents
[0];
2796 /* Try short cut. */
2797 if (SYMBOLP (coding_type
))
2799 if (EQ (coding_type
, Qt
))
2801 coding
->type
= coding_type_undecided
;
2802 coding
->common_flags
|= CODING_REQUIRE_DETECTION_MASK
;
2805 coding
->type
= coding_type_no_conversion
;
2809 /* Initialize remaining fields. */
2810 coding
->composing
= 0;
2811 coding
->character_unification_table_for_decode
= Qnil
;
2812 coding
->character_unification_table_for_encode
= Qnil
;
2814 /* Get values of coding system properties:
2815 `post-read-conversion', `pre-write-conversion',
2816 `character-unification-table-for-decode',
2817 `character-unification-table-for-encode'. */
2818 plist
= XVECTOR (coding_spec
)->contents
[3];
2819 coding
->post_read_conversion
= Fplist_get (plist
, Qpost_read_conversion
);
2820 coding
->pre_write_conversion
= Fplist_get (plist
, Qpre_write_conversion
);
2821 val
= Fplist_get (plist
, Qcharacter_unification_table_for_decode
);
2823 val
= Fget (val
, Qcharacter_unification_table_for_decode
);
2824 coding
->character_unification_table_for_decode
2825 = CHAR_TABLE_P (val
) ? val
: Qnil
;
2826 val
= Fplist_get (plist
, Qcharacter_unification_table_for_encode
);
2828 val
= Fget (val
, Qcharacter_unification_table_for_encode
);
2829 coding
->character_unification_table_for_encode
2830 = CHAR_TABLE_P (val
) ? val
: Qnil
;
2831 val
= Fplist_get (plist
, Qcoding_category
);
2834 val
= Fget (val
, Qcoding_category_index
);
2836 coding
->category_idx
= XINT (val
);
2838 goto label_invalid_coding_system
;
2841 goto label_invalid_coding_system
;
2843 val
= Fplist_get (plist
, Qsafe_charsets
);
2846 for (i
= 0; i
<= MAX_CHARSET
; i
++)
2847 coding
->safe_charsets
[i
] = 1;
2851 bzero (coding
->safe_charsets
, MAX_CHARSET
+ 1);
2854 if ((i
= get_charset_id (XCONS (val
)->car
)) >= 0)
2855 coding
->safe_charsets
[i
] = 1;
2856 val
= XCONS (val
)->cdr
;
2860 switch (XFASTINT (coding_type
))
2863 coding
->type
= coding_type_emacs_mule
;
2864 if (!NILP (coding
->post_read_conversion
))
2865 coding
->common_flags
|= CODING_REQUIRE_DECODING_MASK
;
2866 if (!NILP (coding
->pre_write_conversion
))
2867 coding
->common_flags
|= CODING_REQUIRE_ENCODING_MASK
;
2871 coding
->type
= coding_type_sjis
;
2872 coding
->common_flags
2873 |= CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
2877 coding
->type
= coding_type_iso2022
;
2878 coding
->common_flags
2879 |= CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
2881 Lisp_Object val
, temp
;
2883 int i
, charset
, reg_bits
= 0;
2885 val
= XVECTOR (coding_spec
)->contents
[4];
2887 if (!VECTORP (val
) || XVECTOR (val
)->size
!= 32)
2888 goto label_invalid_coding_system
;
2890 flags
= XVECTOR (val
)->contents
;
2892 = ((NILP (flags
[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM
)
2893 | (NILP (flags
[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL
)
2894 | (NILP (flags
[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL
)
2895 | (NILP (flags
[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS
)
2896 | (NILP (flags
[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT
)
2897 | (NILP (flags
[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT
)
2898 | (NILP (flags
[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN
)
2899 | (NILP (flags
[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS
)
2900 | (NILP (flags
[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION
)
2901 | (NILP (flags
[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL
)
2902 | (NILP (flags
[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL
)
2903 | (NILP (flags
[15]) ? 0 : CODING_FLAG_ISO_SAFE
)
2904 | (NILP (flags
[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA
)
2907 /* Invoke graphic register 0 to plane 0. */
2908 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 0;
2909 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2910 CODING_SPEC_ISO_INVOCATION (coding
, 1)
2911 = (coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
? -1 : 1);
2912 /* Not single shifting at first. */
2913 CODING_SPEC_ISO_SINGLE_SHIFTING (coding
) = 0;
2914 /* Beginning of buffer should also be regarded as bol. */
2915 CODING_SPEC_ISO_BOL (coding
) = 1;
2917 for (charset
= 0; charset
<= MAX_CHARSET
; charset
++)
2918 CODING_SPEC_ISO_REVISION_NUMBER (coding
, charset
) = 255;
2919 val
= Vcharset_revision_alist
;
2922 charset
= get_charset_id (Fcar_safe (XCONS (val
)->car
));
2924 && (temp
= Fcdr_safe (XCONS (val
)->car
), INTEGERP (temp
))
2925 && (i
= XINT (temp
), (i
>= 0 && (i
+ '@') < 128)))
2926 CODING_SPEC_ISO_REVISION_NUMBER (coding
, charset
) = i
;
2927 val
= XCONS (val
)->cdr
;
2930 /* Checks FLAGS[REG] (REG = 0, 1, 2, 3) and decide designations.
2931 FLAGS[REG] can be one of below:
2932 integer CHARSET: CHARSET occupies register REG,
2933 t: designate nothing to REG initially, but can be used
2935 list of integer, nil, or t: designate the first
2936 element (if integer) to REG initially, the remaining
2937 elements (if integer) is designated to REG on request,
2938 if an element is t, REG can be used by any charsets,
2939 nil: REG is never used. */
2940 for (charset
= 0; charset
<= MAX_CHARSET
; charset
++)
2941 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
2942 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
;
2943 for (i
= 0; i
< 4; i
++)
2945 if (INTEGERP (flags
[i
])
2946 && (charset
= XINT (flags
[i
]), CHARSET_VALID_P (charset
))
2947 || (charset
= get_charset_id (flags
[i
])) >= 0)
2949 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = charset
;
2950 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
) = i
;
2952 else if (EQ (flags
[i
], Qt
))
2954 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
2956 coding
->flags
|= CODING_FLAG_ISO_DESIGNATION
;
2958 else if (CONSP (flags
[i
]))
2960 Lisp_Object tail
= flags
[i
];
2962 coding
->flags
|= CODING_FLAG_ISO_DESIGNATION
;
2963 if (INTEGERP (XCONS (tail
)->car
)
2964 && (charset
= XINT (XCONS (tail
)->car
),
2965 CHARSET_VALID_P (charset
))
2966 || (charset
= get_charset_id (XCONS (tail
)->car
)) >= 0)
2968 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = charset
;
2969 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
) =i
;
2972 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
2973 tail
= XCONS (tail
)->cdr
;
2974 while (CONSP (tail
))
2976 if (INTEGERP (XCONS (tail
)->car
)
2977 && (charset
= XINT (XCONS (tail
)->car
),
2978 CHARSET_VALID_P (charset
))
2979 || (charset
= get_charset_id (XCONS (tail
)->car
)) >= 0)
2980 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
2982 else if (EQ (XCONS (tail
)->car
, Qt
))
2984 tail
= XCONS (tail
)->cdr
;
2988 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
2990 CODING_SPEC_ISO_DESIGNATION (coding
, i
)
2991 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
);
2994 if (reg_bits
&& ! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
))
2996 /* REG 1 can be used only by locking shift in 7-bit env. */
2997 if (coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
)
2999 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
))
3000 /* Without any shifting, only REG 0 and 1 can be used. */
3005 for (charset
= 0; charset
<= MAX_CHARSET
; charset
++)
3007 if (CHARSET_VALID_P (charset
))
3009 /* There exist some default graphic registers to be
3012 /* We had better avoid designating a charset of
3013 CHARS96 to REG 0 as far as possible. */
3014 if (CHARSET_CHARS (charset
) == 96)
3015 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
3017 ? 1 : (reg_bits
& 4 ? 2 : (reg_bits
& 8 ? 3 : 0)));
3019 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
3021 ? 0 : (reg_bits
& 2 ? 1 : (reg_bits
& 4 ? 2 : 3)));
3025 coding
->common_flags
|= CODING_REQUIRE_FLUSHING_MASK
;
3026 coding
->spec
.iso2022
.last_invalid_designation_register
= -1;
3030 coding
->type
= coding_type_big5
;
3031 coding
->common_flags
3032 |= CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
3034 = (NILP (XVECTOR (coding_spec
)->contents
[4])
3035 ? CODING_FLAG_BIG5_HKU
3036 : CODING_FLAG_BIG5_ETEN
);
3040 coding
->type
= coding_type_ccl
;
3041 coding
->common_flags
3042 |= CODING_REQUIRE_DECODING_MASK
| CODING_REQUIRE_ENCODING_MASK
;
3044 Lisp_Object val
= XVECTOR (coding_spec
)->contents
[4];
3046 && VECTORP (XCONS (val
)->car
)
3047 && VECTORP (XCONS (val
)->cdr
))
3049 setup_ccl_program (&(coding
->spec
.ccl
.decoder
), XCONS (val
)->car
);
3050 setup_ccl_program (&(coding
->spec
.ccl
.encoder
), XCONS (val
)->cdr
);
3053 goto label_invalid_coding_system
;
3055 coding
->common_flags
|= CODING_REQUIRE_FLUSHING_MASK
;
3059 coding
->type
= coding_type_raw_text
;
3063 goto label_invalid_coding_system
;
3067 label_invalid_coding_system
:
3068 coding
->type
= coding_type_no_conversion
;
3069 coding
->category_idx
= CODING_CATEGORY_IDX_BINARY
;
3070 coding
->common_flags
= 0;
3071 coding
->eol_type
= CODING_EOL_LF
;
3072 coding
->pre_write_conversion
= coding
->post_read_conversion
= Qnil
;
3076 /* Emacs has a mechanism to automatically detect a coding system if it
3077 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
3078 it's impossible to distinguish some coding systems accurately
3079 because they use the same range of codes. So, at first, coding
3080 systems are grouped into the following categories:
3082 o coding-category-emacs-mule
3084 The category for a coding system which has the same code range
3085 as Emacs' internal format. Assigned the coding-system (Lisp
3086 symbol) `emacs-mule' by default.
3088 o coding-category-sjis
3090 The category for a coding system which has the same code range
3091 as SJIS. Assigned the coding-system (Lisp
3092 symbol) `japanese-shift-jis' by default.
3094 o coding-category-iso-7
3096 The category for a coding system which has the same code range
3097 as ISO2022 of 7-bit environment. This doesn't use any locking
3098 shift and single shift functions. This can encode/decode all
3099 charsets. Assigned the coding-system (Lisp symbol)
3100 `iso-2022-7bit' by default.
3102 o coding-category-iso-7-tight
3104 Same as coding-category-iso-7 except that this can
3105 encode/decode only the specified charsets.
3107 o coding-category-iso-8-1
3109 The category for a coding system which has the same code range
3110 as ISO2022 of 8-bit environment and graphic plane 1 used only
3111 for DIMENSION1 charset. This doesn't use any locking shift
3112 and single shift functions. Assigned the coding-system (Lisp
3113 symbol) `iso-latin-1' by default.
3115 o coding-category-iso-8-2
3117 The category for a coding system which has the same code range
3118 as ISO2022 of 8-bit environment and graphic plane 1 used only
3119 for DIMENSION2 charset. This doesn't use any locking shift
3120 and single shift functions. Assigned the coding-system (Lisp
3121 symbol) `japanese-iso-8bit' by default.
3123 o coding-category-iso-7-else
3125 The category for a coding system which has the same code range
3126 as ISO2022 of 7-bit environment but uses locking shift or
3127 single shift functions. Assigned the coding-system (Lisp
3128 symbol) `iso-2022-7bit-lock' by default.
3130 o coding-category-iso-8-else
3132 The category for a coding system which has the same code range
3133 as ISO2022 of 8-bit environment but uses locking shift or
3134 single shift functions. Assigned the coding-system (Lisp
3135 symbol) `iso-2022-8bit-ss2' by default.
3137 o coding-category-big5
3139 The category for a coding system which has the same code range
3140 as BIG5. Assigned the coding-system (Lisp symbol)
3141 `cn-big5' by default.
3143 o coding-category-binary
3145 The category for a coding system not categorized in any of the
3146 above. Assigned the coding-system (Lisp symbol)
3147 `no-conversion' by default.
3149 Each of them is a Lisp symbol whose value is the actual
3150 `coding-system' (also a Lisp symbol) assigned to it by the user.
3151 What Emacs does actually is to detect a category of coding system.
3152 Then, it uses a `coding-system' assigned to it. If Emacs can't
3153 decide only one possible category, it selects a category of the
3154 highest priority. Priorities of categories are also specified by a
3155 user in a Lisp variable `coding-category-list'.
3159 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3160 If it detects possible coding systems, return an integer in which
3161 appropriate flag bits are set. Flag bits are defined by macros
3162 CODING_CATEGORY_MASK_XXX in `coding.h'.
3164 How many ASCII characters are at the head is returned as *SKIP. */
/* Examine SRC_BYTES bytes at SOURCE and return a mask built from
   CODING_CATEGORY_MASK_XXX bits.  MASK starts with the ISO 7-bit and
   ISO shift bits, which make the initial scan stop at ESC and at SI or
   SO respectively.  *SKIP is set to SRC - SOURCE after that scan.
   The candidate set TRY is chosen from the byte C that stopped the
   scan.  On the PRIORITIES path each of the CODING_CATEGORY_IDX_MAX
   entries is ANDed in place with TRY and the matching detector is run;
   at label_return_highest_only the first entry of PRIORITIES that
   intersects MASK is returned.  On the other path the results of all
   detectors selected by TRY are ORed together with
   CODING_CATEGORY_MASK_RAW_TEXT.  CODING_CATEGORY_MASK_RAW_TEXT alone
   is the fallback result.
   NOTE(review): the conditions that choose between these paths, and
   the tests on C that select TRY, are missing from this listing.  */
3167 detect_coding_mask (source
, src_bytes
, priorities
, skip
)
3168 unsigned char *source
;
3169 int src_bytes
, *priorities
, *skip
;
3171 register unsigned char c
;
3172 unsigned char *src
= source
, *src_end
= source
+ src_bytes
;
3173 unsigned int mask
= (CODING_CATEGORY_MASK_ISO_7BIT
3174 | CODING_CATEGORY_MASK_ISO_SHIFT
);
3177 /* At first, skip all ASCII characters and control characters except
3178 for three ISO2022 specific control characters. */
3179 label_loop_detect_coding
:
3180 while (src
< src_end
)
3184 || ((mask
& CODING_CATEGORY_MASK_ISO_7BIT
)
3185 && c
== ISO_CODE_ESC
)
3186 || ((mask
& CODING_CATEGORY_MASK_ISO_SHIFT
)
3187 && (c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
)))
3191 *skip
= src
- source
;
3194 /* We found nothing other than ASCII. There's nothing to do. */
3197 /* The text seems to be encoded in some multilingual coding system.
3198 Now, try to find in which coding system the text is encoded. */
3201 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3202 /* C is an ISO2022 specific control code of C0. */
3203 mask
= detect_coding_iso2022 (src
, src_end
);
3206 /* No valid ISO2022 code follows C. Try again. */
3208 mask
= (c
!= ISO_CODE_ESC
3209 ? CODING_CATEGORY_MASK_ISO_7BIT
3210 : CODING_CATEGORY_MASK_ISO_SHIFT
);
3211 goto label_loop_detect_coding
;
3214 goto label_return_highest_only
;
3222 /* C is the first byte of SJIS character code,
3223 or a leading-code of Emacs' internal format (emacs-mule). */
3224 try = CODING_CATEGORY_MASK_SJIS
| CODING_CATEGORY_MASK_EMACS_MULE
;
3226 /* Or, if C is a special latin extra code,
3227 or is an ISO2022 specific control code of C1 (SS2 or SS3),
3228 or is an ISO2022 control-sequence-introducer (CSI),
3229 we should also consider the possibility of ISO2022 codings. */
3230 if ((VECTORP (Vlatin_extra_code_table
)
3231 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
3232 || (c
== ISO_CODE_SS2
|| c
== ISO_CODE_SS3
)
3233 || (c
== ISO_CODE_CSI
3236 || ((*src
== '0' || *src
== '1' || *src
== '2')
3237 && src
+ 1 < src_end
3238 && src
[1] == ']')))))
3239 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3240 | CODING_CATEGORY_MASK_ISO_8BIT
);
3243 /* C is a character of ISO2022 in graphic plane right,
3244 or a SJIS's 1-byte character code (i.e. JISX0201),
3245 or the first byte of BIG5's 2-byte code. */
3246 try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3247 | CODING_CATEGORY_MASK_ISO_8BIT
3248 | CODING_CATEGORY_MASK_SJIS
3249 | CODING_CATEGORY_MASK_BIG5
);
3254 for (i
= 0; i
< CODING_CATEGORY_IDX_MAX
; i
++)
3256 priorities
[i
] &= try;
3257 if (priorities
[i
] & CODING_CATEGORY_MASK_ISO
)
3258 mask
= detect_coding_iso2022 (src
, src_end
);
3259 else if (priorities
[i
] & CODING_CATEGORY_MASK_SJIS
)
3260 mask
= detect_coding_sjis (src
, src_end
);
3261 else if (priorities
[i
] & CODING_CATEGORY_MASK_BIG5
)
3262 mask
= detect_coding_big5 (src
, src_end
);
3263 else if (priorities
[i
] & CODING_CATEGORY_MASK_EMACS_MULE
)
3264 mask
= detect_coding_emacs_mule (src
, src_end
);
3266 goto label_return_highest_only
;
3268 return CODING_CATEGORY_MASK_RAW_TEXT
;
3270 if (try & CODING_CATEGORY_MASK_ISO
)
3271 mask
|= detect_coding_iso2022 (src
, src_end
);
3272 if (try & CODING_CATEGORY_MASK_SJIS
)
3273 mask
|= detect_coding_sjis (src
, src_end
);
3274 if (try & CODING_CATEGORY_MASK_BIG5
)
3275 mask
|= detect_coding_big5 (src
, src_end
);
3276 if (try & CODING_CATEGORY_MASK_EMACS_MULE
)
3277 mask
|= detect_coding_emacs_mule (src
, src_end
);
3279 return (mask
| CODING_CATEGORY_MASK_RAW_TEXT
);
3281 label_return_highest_only
:
3282 for (i
= 0; i
< CODING_CATEGORY_IDX_MAX
; i
++)
3284 if (mask
& priorities
[i
])
3285 return priorities
[i
];
3287 return CODING_CATEGORY_MASK_RAW_TEXT
;
3290 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3291 The information of the detected coding system is set in CODING. */
/* Build the PRIORITIES array from Vcoding_category_list, storing
   1 << IDX for the Qcoding_category_index property of each symbol and
   padding unused slots with CODING_CATEGORY_MASK_RAW_TEXT.  Then run
   detect_coding_mask on SRC and store the SKIP value it reports in
   CODING->heading_ascii.  The lowest set bit of the returned mask is
   turned into an index into Vcoding_category_table, and the value of
   the symbol found there becomes the coding system.  If
   CODING->eol_type is not CODING_EOL_UNDECIDED, the element of the
   Qeol_type vector at that index is used instead.  Finally
   setup_coding_system is called and CODING->heading_ascii is restored.
   NOTE(review): several lines (local declarations and some branch
   conditions) are missing from this listing.  */
3294 detect_coding (coding
, src
, src_bytes
)
3295 struct coding_system
*coding
;
3301 int priorities
[CODING_CATEGORY_IDX_MAX
];
3302 Lisp_Object val
= Vcoding_category_list
;
3305 while (CONSP (val
) && i
< CODING_CATEGORY_IDX_MAX
)
3307 if (! SYMBOLP (XCONS (val
)->car
))
3309 idx
= XFASTINT (Fget (XCONS (val
)->car
, Qcoding_category_index
));
3310 if (idx
>= CODING_CATEGORY_IDX_MAX
)
3312 priorities
[i
++] = (1 << idx
);
3313 val
= XCONS (val
)->cdr
;
3315 /* If coding-category-list is valid and contains all coding
3316 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not,
3317 the following code saves Emacs from crashing. */
3318 while (i
< CODING_CATEGORY_IDX_MAX
)
3319 priorities
[i
++] = CODING_CATEGORY_MASK_RAW_TEXT
;
3321 mask
= detect_coding_mask (src
, src_bytes
, priorities
, &skip
);
3322 coding
->heading_ascii
= skip
;
3326 /* We found a single coding system of the highest priority in MASK. */
3328 while (mask
&& ! (mask
& 1)) mask
>>= 1, idx
++;
3330 idx
= CODING_CATEGORY_IDX_RAW_TEXT
;
3332 val
= XSYMBOL (XVECTOR (Vcoding_category_table
)->contents
[idx
])->value
;
3334 if (coding
->eol_type
!= CODING_EOL_UNDECIDED
)
3336 Lisp_Object tmp
= Fget (val
, Qeol_type
);
3339 val
= XVECTOR (tmp
)->contents
[coding
->eol_type
];
3341 setup_coding_system (val
, coding
);
3342 /* Set this again because setup_coding_system reset this member. */
3343 coding
->heading_ascii
= skip
;
3346 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
3347 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3348 CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3350 How many non-eol characters are at the head is returned as *SKIP. */
3352 #define MAX_EOL_CHECK_COUNT 3
3355 detect_eol_type (source
, src_bytes
, skip
)
3356 unsigned char *source
;
3357 int src_bytes
, *skip
;
3359 unsigned char *src
= source
, *src_end
= src
+ src_bytes
;
3361 int total
= 0; /* How many end-of-lines are found so far. */
3362 int eol_type
= CODING_EOL_UNDECIDED
;
3367 while (src
< src_end
&& total
< MAX_EOL_CHECK_COUNT
)
3370 if (c
== '\n' || c
== '\r')
3373 *skip
= src
- 1 - source
;
3376 this_eol_type
= CODING_EOL_LF
;
3377 else if (src
>= src_end
|| *src
!= '\n')
3378 this_eol_type
= CODING_EOL_CR
;
3380 this_eol_type
= CODING_EOL_CRLF
, src
++;
3382 if (eol_type
== CODING_EOL_UNDECIDED
)
3383 /* This is the first end-of-line. */
3384 eol_type
= this_eol_type
;
3385 else if (eol_type
!= this_eol_type
)
3387 /* The type found here differs from the one found before. */
3388 eol_type
= CODING_EOL_INCONSISTENT
;
3395 *skip
= src_end
- source
;
3399 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3400 is encoded. If it detects an appropriate format of end-of-line, it
3401 sets the information in *CODING. */
3404 detect_eol (coding
, src
, src_bytes
)
3405 struct coding_system
*coding
;
3411 int eol_type
= detect_eol_type (src
, src_bytes
, &skip
);
3413 if (coding
->heading_ascii
> skip
)
3414 coding
->heading_ascii
= skip
;
3416 skip
= coding
->heading_ascii
;
3418 if (eol_type
== CODING_EOL_UNDECIDED
)
3420 if (eol_type
== CODING_EOL_INCONSISTENT
)
3423 /* This code is suppressed until we find a better way to
3424 distinguish raw text file and binary file. */
3426 /* If we have already detected that the coding is raw-text, the
3427 coding should actually be no-conversion. */
3428 if (coding
->type
== coding_type_raw_text
)
3430 setup_coding_system (Qno_conversion
, coding
);
3433 /* Else, let's decode only text code anyway. */
3435 eol_type
= CODING_EOL_LF
;
3438 val
= Fget (coding
->symbol
, Qeol_type
);
3439 if (VECTORP (val
) && XVECTOR (val
)->size
== 3)
3441 setup_coding_system (XVECTOR (val
)->contents
[eol_type
], coding
);
3442 coding
->heading_ascii
= skip
;
3446 #define CONVERSION_BUFFER_EXTRA_ROOM 256
3448 #define DECODING_BUFFER_MAG(coding) \
3449 (coding->type == coding_type_iso2022 \
3451 : ((coding->type == coding_type_sjis || coding->type == coding_type_big5) \
3453 : (coding->type == coding_type_raw_text \
3455 : (coding->type == coding_type_ccl \
3456 ? coding->spec.ccl.decoder.buf_magnification \
3459 /* Return maximum size (bytes) of a buffer enough for decoding
3460 SRC_BYTES of text encoded in CODING. */
3463 decoding_buffer_size (coding
, src_bytes
)
3464 struct coding_system
*coding
;
3467 return (src_bytes
* DECODING_BUFFER_MAG (coding
)
3468 + CONVERSION_BUFFER_EXTRA_ROOM
);
3471 /* Return maximum size (bytes) of a buffer enough for encoding
3472 SRC_BYTES of text to CODING. */
3475 encoding_buffer_size (coding
, src_bytes
)
3476 struct coding_system
*coding
;
3481 if (coding
->type
== coding_type_ccl
)
3482 magnification
= coding
->spec
.ccl
.encoder
.buf_magnification
;
3486 return (src_bytes
* magnification
+ CONVERSION_BUFFER_EXTRA_ROOM
);
3489 #ifndef MINIMUM_CONVERSION_BUFFER_SIZE
3490 #define MINIMUM_CONVERSION_BUFFER_SIZE 1024
3493 char *conversion_buffer
;
3494 int conversion_buffer_size
;
3496 /* Return a pointer to a buffer of at least SIZE bytes to be used for
3497 encoding or decoding. Sufficient memory is allocated automatically. If we
3498 run out of memory, return NULL. */
3501 get_conversion_buffer (size
)
3504 if (size
> conversion_buffer_size
)
3507 int real_size
= conversion_buffer_size
* 2;
3509 while (real_size
< size
) real_size
*= 2;
3510 buf
= (char *) xmalloc (real_size
);
3511 xfree (conversion_buffer
);
3512 conversion_buffer
= buf
;
3513 conversion_buffer_size
= real_size
;
3515 return conversion_buffer
;
3519 ccl_coding_driver (coding
, source
, destination
, src_bytes
, dst_bytes
, encodep
)
3520 struct coding_system
*coding
;
3521 unsigned char *source
, *destination
;
3522 int src_bytes
, dst_bytes
, encodep
;
3524 struct ccl_program
*ccl
3525 = encodep
? &coding
->spec
.ccl
.encoder
: &coding
->spec
.ccl
.decoder
;
3528 coding
->produced
= ccl_driver (ccl
, source
, destination
,
3529 src_bytes
, dst_bytes
, &(coding
->consumed
));
3532 coding
->produced_char
= coding
->produced
;
3533 coding
->consumed_char
3534 = multibyte_chars_in_text (source
, coding
->consumed
);
3538 coding
->produced_char
3539 = multibyte_chars_in_text (destination
, coding
->produced
);
3540 coding
->consumed_char
= coding
->consumed
;
3542 switch (ccl
->status
)
3544 case CCL_STAT_SUSPEND_BY_SRC
:
3545 result
= CODING_FINISH_INSUFFICIENT_SRC
;
3547 case CCL_STAT_SUSPEND_BY_DST
:
3548 result
= CODING_FINISH_INSUFFICIENT_DST
;
3551 result
= CODING_FINISH_NORMAL
;
3557 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
3558 decoding, it may detect coding system and format of end-of-line if
3559 those are not yet decided. */
3562 decode_coding (coding
, source
, destination
, src_bytes
, dst_bytes
)
3563 struct coding_system
*coding
;
3564 unsigned char *source
, *destination
;
3565 int src_bytes
, dst_bytes
;
3571 coding
->produced
= coding
->produced_char
= 0;
3572 coding
->consumed
= coding
->consumed_char
= 0;
3573 coding
->fake_multibyte
= 0;
3574 return CODING_FINISH_NORMAL
;
3577 if (coding
->type
== coding_type_undecided
)
3578 detect_coding (coding
, source
, src_bytes
);
3580 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
3581 detect_eol (coding
, source
, src_bytes
);
3583 switch (coding
->type
)
3585 case coding_type_emacs_mule
:
3586 case coding_type_undecided
:
3587 case coding_type_raw_text
:
3588 if (coding
->eol_type
== CODING_EOL_LF
3589 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
3590 goto label_no_conversion
;
3591 result
= decode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
);
3594 case coding_type_sjis
:
3595 result
= decode_coding_sjis_big5 (coding
, source
, destination
,
3596 src_bytes
, dst_bytes
, 1);
3599 case coding_type_iso2022
:
3600 result
= decode_coding_iso2022 (coding
, source
, destination
,
3601 src_bytes
, dst_bytes
);
3604 case coding_type_big5
:
3605 result
= decode_coding_sjis_big5 (coding
, source
, destination
,
3606 src_bytes
, dst_bytes
, 0);
3609 case coding_type_ccl
:
3610 result
= ccl_coding_driver (coding
, source
, destination
,
3611 src_bytes
, dst_bytes
, 0);
3614 default: /* i.e. case coding_type_no_conversion: */
3615 label_no_conversion
:
3616 if (dst_bytes
&& src_bytes
> dst_bytes
)
3618 coding
->produced
= dst_bytes
;
3619 result
= CODING_FINISH_INSUFFICIENT_DST
;
3623 coding
->produced
= src_bytes
;
3624 result
= CODING_FINISH_NORMAL
;
3627 bcopy (source
, destination
, coding
->produced
);
3629 safe_bcopy (source
, destination
, coding
->produced
);
3630 coding
->fake_multibyte
= 1;
3632 = coding
->consumed_char
= coding
->produced_char
= coding
->produced
;
3639 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
3642 encode_coding (coding
, source
, destination
, src_bytes
, dst_bytes
)
3643 struct coding_system
*coding
;
3644 unsigned char *source
, *destination
;
3645 int src_bytes
, dst_bytes
;
3651 coding
->produced
= coding
->produced_char
= 0;
3652 coding
->consumed
= coding
->consumed_char
= 0;
3653 coding
->fake_multibyte
= 0;
3654 return CODING_FINISH_NORMAL
;
3657 switch (coding
->type
)
3659 case coding_type_emacs_mule
:
3660 case coding_type_undecided
:
3661 case coding_type_raw_text
:
3662 if (coding
->eol_type
== CODING_EOL_LF
3663 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
3664 goto label_no_conversion
;
3665 result
= encode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
);
3668 case coding_type_sjis
:
3669 result
= encode_coding_sjis_big5 (coding
, source
, destination
,
3670 src_bytes
, dst_bytes
, 1);
3673 case coding_type_iso2022
:
3674 result
= encode_coding_iso2022 (coding
, source
, destination
,
3675 src_bytes
, dst_bytes
);
3678 case coding_type_big5
:
3679 result
= encode_coding_sjis_big5 (coding
, source
, destination
,
3680 src_bytes
, dst_bytes
, 0);
3683 case coding_type_ccl
:
3684 result
= ccl_coding_driver (coding
, source
, destination
,
3685 src_bytes
, dst_bytes
, 1);
3688 default: /* i.e. case coding_type_no_conversion: */
3689 label_no_conversion
:
3690 if (dst_bytes
&& src_bytes
> dst_bytes
)
3692 coding
->produced
= dst_bytes
;
3693 result
= CODING_FINISH_INSUFFICIENT_DST
;
3697 coding
->produced
= src_bytes
;
3698 result
= CODING_FINISH_NORMAL
;
3701 bcopy (source
, destination
, coding
->produced
);
3703 safe_bcopy (source
, destination
, coding
->produced
);
3704 if (coding
->mode
& CODING_MODE_SELECTIVE_DISPLAY
)
3706 unsigned char *p
= destination
, *pend
= p
+ coding
->produced
;
3708 if (*p
++ == '\015') p
[-1] = '\n';
3710 coding
->fake_multibyte
= 1;
3712 = coding
->consumed_char
= coding
->produced_char
= coding
->produced
;
3719 /* Scan text in the region between *BEG and *END (byte positions),
3720 skip characters which we don't have to decode by coding system
3721 CODING at the head and tail, then set *BEG and *END to the region
3722 of the text we actually have to convert. The caller should move
3723 the gap out of the region in advance.
3725 If STR is not NULL, *BEG and *END are indices into STR. */
3728 shrink_decoding_region (beg
, end
, coding
, str
)
3730 struct coding_system
*coding
;
3733 unsigned char *begp_orig
, *begp
, *endp_orig
, *endp
, c
;
3736 if (coding
->type
== coding_type_ccl
3737 || coding
->type
== coding_type_undecided
3738 || !NILP (coding
->post_read_conversion
))
3740 /* We can't skip any data. */
3743 else if (coding
->type
== coding_type_no_conversion
)
3745 /* We need no conversion, but don't have to skip any data here.
3746 Decoding routine handles them effectively anyway. */
3750 if (coding
->heading_ascii
>= 0)
3751 /* Detection routine has already found how much we can skip at the
3753 *beg
+= coding
->heading_ascii
;
3757 begp_orig
= begp
= str
+ *beg
;
3758 endp_orig
= endp
= str
+ *end
;
3762 begp_orig
= begp
= BYTE_POS_ADDR (*beg
);
3763 endp_orig
= endp
= begp
+ *end
- *beg
;
3766 eol_conversion
= (coding
->eol_type
!= CODING_EOL_LF
);
3768 switch (coding
->type
)
3770 case coding_type_emacs_mule
:
3771 case coding_type_raw_text
:
3774 if (coding
->heading_ascii
< 0)
3775 while (begp
< endp
&& *begp
!= '\r' && *begp
< 0x80) begp
++;
3776 while (begp
< endp
&& *(endp
- 1) != '\r' && *(endp
- 1) < 0x80)
3783 case coding_type_sjis
:
3784 case coding_type_big5
:
3785 /* We can skip all ASCII characters at the head. */
3786 if (coding
->heading_ascii
< 0)
3789 while (begp
< endp
&& *begp
< 0x80 && *begp
!= '\r') begp
++;
3791 while (begp
< endp
&& *begp
< 0x80) begp
++;
3793 /* We can skip all ASCII characters at the tail except for the
3794 second byte of SJIS or BIG5 code. */
3796 while (begp
< endp
&& endp
[-1] < 0x80 && endp
[-1] != '\r') endp
--;
3798 while (begp
< endp
&& endp
[-1] < 0x80) endp
--;
3799 if (begp
< endp
&& endp
< endp_orig
&& endp
[-1] >= 0x80)
3803 default: /* i.e. case coding_type_iso2022: */
3804 if (coding
->heading_ascii
< 0)
3806 /* We can skip all ASCII characters at the head except for a
3807 few control codes. */
3808 while (begp
< endp
&& (c
= *begp
) < 0x80
3809 && c
!= ISO_CODE_CR
&& c
!= ISO_CODE_SO
3810 && c
!= ISO_CODE_SI
&& c
!= ISO_CODE_ESC
3811 && (!eol_conversion
|| c
!= ISO_CODE_LF
))
3814 switch (coding
->category_idx
)
3816 case CODING_CATEGORY_IDX_ISO_8_1
:
3817 case CODING_CATEGORY_IDX_ISO_8_2
:
3818 /* We can skip all ASCII characters at the tail. */
3820 while (begp
< endp
&& (c
= endp
[-1]) < 0x80 && c
!= '\r') endp
--;
3822 while (begp
< endp
&& endp
[-1] < 0x80) endp
--;
3825 case CODING_CATEGORY_IDX_ISO_7
:
3826 case CODING_CATEGORY_IDX_ISO_7_TIGHT
:
3827 /* We can skip all charactes at the tail except for ESC and
3828 the following 2-byte at the tail. */
3831 && (c
= endp
[-1]) < 0x80 && c
!= ISO_CODE_ESC
&& c
!= '\r')
3835 && (c
= endp
[-1]) < 0x80 && c
!= ISO_CODE_ESC
)
3837 if (begp
< endp
&& endp
[-1] == ISO_CODE_ESC
)
3839 if (endp
+ 1 < endp_orig
&& end
[0] == '(' && end
[1] == 'B')
3840 /* This is an ASCII designation sequence. We can
3841 surely skip the tail. */
3844 /* Hmmm, we can't skip the tail. */
3849 *beg
+= begp
- begp_orig
;
3850 *end
+= endp
- endp_orig
;
3854 /* Like shrink_decoding_region but for encoding. */
3857 shrink_encoding_region (beg
, end
, coding
, str
)
3859 struct coding_system
*coding
;
3862 unsigned char *begp_orig
, *begp
, *endp_orig
, *endp
;
3865 if (coding
->type
== coding_type_ccl
)
3866 /* We can't skip any data. */
3868 else if (coding
->type
== coding_type_no_conversion
)
3870 /* We need no conversion. */
3877 begp_orig
= begp
= str
+ *beg
;
3878 endp_orig
= endp
= str
+ *end
;
3882 begp_orig
= begp
= BYTE_POS_ADDR (*beg
);
3883 endp_orig
= endp
= begp
+ *end
- *beg
;
3886 eol_conversion
= (coding
->eol_type
== CODING_EOL_CR
3887 || coding
->eol_type
== CODING_EOL_CRLF
);
3889 /* Here, we don't have to check coding->pre_write_conversion because
3890 the caller is expected to have handled it already. */
3891 switch (coding
->type
)
3893 case coding_type_undecided
:
3894 case coding_type_emacs_mule
:
3895 case coding_type_raw_text
:
3898 while (begp
< endp
&& *begp
!= '\n') begp
++;
3899 while (begp
< endp
&& endp
[-1] != '\n') endp
--;
3905 case coding_type_iso2022
:
3906 if (coding
->flags
& CODING_FLAG_ISO_DESIGNATE_AT_BOL
)
3908 unsigned char *bol
= begp
;
3909 while (begp
< endp
&& *begp
< 0x80)
3912 if (begp
[-1] == '\n')
3916 goto label_skip_tail
;
3921 /* We can skip all ASCII characters at the head and tail. */
3923 while (begp
< endp
&& *begp
< 0x80 && *begp
!= '\n') begp
++;
3925 while (begp
< endp
&& *begp
< 0x80) begp
++;
3928 while (begp
< endp
&& endp
[-1] < 0x80 && endp
[-1] != '\n') endp
--;
3930 while (begp
< endp
&& *(endp
- 1) < 0x80) endp
--;
3934 *beg
+= begp
- begp_orig
;
3935 *end
+= endp
- endp_orig
;
3939 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
3940 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
3941 coding system CODING, and return the status code of code conversion
3942 (currently, this value has no meaning).
3944 How many characters (and bytes) are converted to how many
3945 characters (and bytes) are recorded in members of the structure
3948 If REPLACE is nonzero, we do various things as if the original text
3949 is deleted and a new text is inserted. See the comments in
3950 replace_range (insdel.c) to know what we are doing. */
3953 code_convert_region (from
, from_byte
, to
, to_byte
, coding
, encodep
, replace
)
3954 int from
, from_byte
, to
, to_byte
, encodep
, replace
;
3955 struct coding_system
*coding
;
3957 int len
= to
- from
, len_byte
= to_byte
- from_byte
;
3958 int require
, inserted
, inserted_byte
;
3959 int head_skip
, tail_skip
, total_skip
;
3960 Lisp_Object saved_coding_symbol
= Qnil
;
3961 int multibyte
= !NILP (current_buffer
->enable_multibyte_characters
);
3963 int fake_multibyte
= 0;
3964 unsigned char *src
, *dst
;
3965 Lisp_Object deletion
= Qnil
;
3969 int saved_from
= from
;
3971 prepare_to_modify_buffer (from
, to
, &from
);
3972 if (saved_from
!= from
)
3976 from_byte
= CHAR_TO_BYTE (from
), to_byte
= CHAR_TO_BYTE (to
);
3978 from_byte
= from
, to_byte
= to
;
3979 len_byte
= to_byte
- from_byte
;
3983 if (! encodep
&& CODING_REQUIRE_DETECTION (coding
))
3985 /* We must detect encoding of text and eol format. */
3987 if (from
< GPT
&& to
> GPT
)
3988 move_gap_both (from
, from_byte
);
3989 if (coding
->type
== coding_type_undecided
)
3991 detect_coding (coding
, BYTE_POS_ADDR (from_byte
), len_byte
);
3992 if (coding
->type
== coding_type_undecided
)
3993 /* It seems that the text contains only ASCII, but we
3994 should not leave it undecided, because otherwise the deeper
3995 decoding routine (decode_coding) would try to detect the
3996 encoding again in vain. */
3997 coding
->type
= coding_type_emacs_mule
;
3999 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
4001 saved_coding_symbol
= coding
->symbol
;
4002 detect_eol (coding
, BYTE_POS_ADDR (from_byte
), len_byte
);
4003 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
4004 coding
->eol_type
= CODING_EOL_LF
;
4005 /* We had better recover the original eol format if we
4006 encounter an inconsistent eol format while decoding. */
4007 coding
->mode
|= CODING_MODE_INHIBIT_INCONSISTENT_EOL
;
4011 coding
->consumed_char
= len
, coding
->consumed
= len_byte
;
4014 ? ! CODING_REQUIRE_ENCODING (coding
)
4015 : ! CODING_REQUIRE_DECODING (coding
))
4017 coding
->produced
= len_byte
;
4020 /* See the comment of the member heading_ascii in coding.h. */
4021 && coding
->heading_ascii
< len_byte
)
4023 /* We still may have to combine byte at the head and the
4024 tail of the text in the region. */
4025 if (from
< GPT
&& GPT
< to
)
4026 move_gap_both (to
, to_byte
);
4027 len
= multibyte_chars_in_text (BYTE_POS_ADDR (from_byte
), len_byte
);
4028 adjust_after_insert (from
, from_byte
, to
, to_byte
, len
);
4029 coding
->produced_char
= len
;
4034 adjust_after_insert (from
, from_byte
, to
, to_byte
, len_byte
);
4035 coding
->produced_char
= len_byte
;
4040 /* Now we convert the text. */
4042 /* For encoding, we must process pre-write-conversion in advance. */
4044 && ! NILP (coding
->pre_write_conversion
)
4045 && SYMBOLP (coding
->pre_write_conversion
)
4046 && ! NILP (Ffboundp (coding
->pre_write_conversion
)))
4048 /* The function in pre-write-conversion may put a new text in a
4050 struct buffer
*prev
= current_buffer
, *new;
4052 call2 (coding
->pre_write_conversion
, from
, to
);
4053 if (current_buffer
!= prev
)
4056 new = current_buffer
;
4057 set_buffer_internal_1 (prev
);
4058 del_range_2 (from
, from_byte
, to
, to_byte
);
4059 insert_from_buffer (new, BEG
, len
, 0);
4061 to_byte
= multibyte
? CHAR_TO_BYTE (to
) : to
;
4062 len_byte
= to_byte
- from_byte
;
4067 deletion
= make_buffer_string_both (from
, from_byte
, to
, to_byte
, 1);
4069 /* Try to skip the leading and trailing ASCII characters. */
4071 int from_byte_orig
= from_byte
, to_byte_orig
= to_byte
;
4073 if (from
< GPT
&& GPT
< to
)
4074 move_gap_both (from
, from_byte
);
4076 shrink_encoding_region (&from_byte
, &to_byte
, coding
, NULL
);
4078 shrink_decoding_region (&from_byte
, &to_byte
, coding
, NULL
);
4079 if (from_byte
== to_byte
)
4081 coding
->produced
= len_byte
;
4082 coding
->produced_char
= multibyte
? len
: len_byte
;
4084 /* We must record and adjust for this new text now. */
4085 adjust_after_insert (from
, from_byte_orig
, to
, to_byte_orig
, len
);
4089 head_skip
= from_byte
- from_byte_orig
;
4090 tail_skip
= to_byte_orig
- to_byte
;
4091 total_skip
= head_skip
+ tail_skip
;
4094 len
-= total_skip
; len_byte
-= total_skip
;
4097 /* For conversion, we must put the gap before the text in addition to
4098 making the gap larger for efficient decoding. The required gap
4099 size starts from 2000 which is the magic number used in make_gap.
4100 But, after one batch of conversion, it will be incremented if we
4101 find that it is not enough. */
4104 if (GAP_SIZE
< require
)
4105 make_gap (require
- GAP_SIZE
);
4106 move_gap_both (from
, from_byte
);
4108 if (GPT
- BEG
< beg_unchanged
)
4109 beg_unchanged
= GPT
- BEG
;
4110 if (Z
- GPT
< end_unchanged
)
4111 end_unchanged
= Z
- GPT
;
4113 inserted
= inserted_byte
= 0;
4114 src
= GAP_END_ADDR
, dst
= GPT_ADDR
;
4116 GAP_SIZE
+= len_byte
;
4119 ZV_BYTE
-= len_byte
;
4126 /* The buffer memory is changed from:
4127 +--------+converted-text+---------+-------original-text------+---+
4128 |<-from->|<--inserted-->|---------|<-----------len---------->|---|
4129 |<------------------- GAP_SIZE -------------------->| */
4131 result
= encode_coding (coding
, src
, dst
, len_byte
, 0);
4133 result
= decode_coding (coding
, src
, dst
, len_byte
, 0);
4135 +--------+-------converted-text--------+--+---original-text--+---+
4136 |<-from->|<--inserted-->|<--produced-->|--|<-(len-consumed)->|---|
4137 |<------------------- GAP_SIZE -------------------->| */
4138 if (coding
->fake_multibyte
)
4141 if (!encodep
&& !multibyte
)
4142 coding
->produced_char
= coding
->produced
;
4143 inserted
+= coding
->produced_char
;
4144 inserted_byte
+= coding
->produced
;
4145 len_byte
-= coding
->consumed
;
4146 src
+= coding
->consumed
;
4147 dst
+= inserted_byte
;
4149 if (! encodep
&& result
== CODING_FINISH_INCONSISTENT_EOL
)
4151 unsigned char *pend
= dst
, *p
= pend
- inserted_byte
;
4153 /* Encode LFs back to the original eol format (CR or CRLF). */
4154 if (coding
->eol_type
== CODING_EOL_CR
)
4156 while (p
< pend
) if (*p
++ == '\n') p
[-1] = '\r';
4162 while (p
< pend
) if (*p
++ == '\n') count
++;
4163 if (src
- dst
< count
)
4165 /* We don't have sufficient room for putting LFs
4166 back to CRLF. We must record converted and
4167 not-yet-converted text back to the buffer
4168 content, enlarge the gap, then record them out of
4169 the buffer contents again. */
4170 int add
= len_byte
+ inserted_byte
;
4173 ZV
+= add
; Z
+= add
; ZV_BYTE
+= add
; Z_BYTE
+= add
;
4174 GPT
+= inserted_byte
; GPT_BYTE
+= inserted_byte
;
4175 make_gap (count
- GAP_SIZE
);
4177 ZV
-= add
; Z
-= add
; ZV_BYTE
-= add
; Z_BYTE
-= add
;
4178 GPT
-= inserted_byte
; GPT_BYTE
-= inserted_byte
;
4179 /* Don't forget to update SRC, DST, and PEND. */
4180 src
= GAP_END_ADDR
- len_byte
;
4181 dst
= GPT_ADDR
+ inserted_byte
;
4185 inserted_byte
+= count
;
4186 coding
->produced
+= count
;
4187 p
= dst
= pend
+ count
;
4191 if (*p
== '\n') count
--, *--p
= '\r';
4195 /* Suppress eol-format conversion in the further conversion. */
4196 coding
->eol_type
= CODING_EOL_LF
;
4198 /* Restore the original symbol. */
4199 coding
->symbol
= saved_coding_symbol
;
4205 if (result
== CODING_FINISH_INSUFFICIENT_SRC
)
4207 /* The source text ends in invalid codes. Let's just
4208 make them valid buffer contents, and finish conversion. */
4209 inserted
+= len_byte
;
4210 inserted_byte
+= len_byte
;
4218 /* We have just done the first batch of conversion which was
4219 stopped because of insufficient gap. Let's reconsider the
4220 required gap size (i.e. SRC - DST) now.
4222 We have converted ORIG bytes (== coding->consumed) into
4223 NEW bytes (coding->produced). To convert the remaining
4224 LEN bytes, we may need REQUIRE bytes of gap, where:
4225 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
4226 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
4227 Here, we are sure that NEW >= ORIG. */
4228 float ratio
= coding
->produced
- coding
->consumed
;
4229 ratio
/= coding
->consumed
;
4230 require
= len_byte
* ratio
;
4233 if ((src
- dst
) < (require
+ 2000))
4235 /* See the comment above the previous call of make_gap. */
4236 int add
= len_byte
+ inserted_byte
;
4239 ZV
+= add
; Z
+= add
; ZV_BYTE
+= add
; Z_BYTE
+= add
;
4240 GPT
+= inserted_byte
; GPT_BYTE
+= inserted_byte
;
4241 make_gap (require
+ 2000);
4243 ZV
-= add
; Z
-= add
; ZV_BYTE
-= add
; Z_BYTE
-= add
;
4244 GPT
-= inserted_byte
; GPT_BYTE
-= inserted_byte
;
4245 /* Don't forget to update SRC, DST. */
4246 src
= GAP_END_ADDR
- len_byte
;
4247 dst
= GPT_ADDR
+ inserted_byte
;
4250 if (src
- dst
> 0) *dst
= 0; /* Put an anchor. */
4254 || !encodep
&& (to
- from
) != (to_byte
- from_byte
)))
4255 inserted
= multibyte_chars_in_text (GPT_ADDR
, inserted_byte
);
4257 /* If we have shrunk the conversion area, adjust it now. */
4261 safe_bcopy (GAP_END_ADDR
, GPT_ADDR
+ inserted_byte
, tail_skip
);
4262 inserted
+= total_skip
; inserted_byte
+= total_skip
;
4263 GAP_SIZE
+= total_skip
;
4264 GPT
-= head_skip
; GPT_BYTE
-= head_skip
;
4265 ZV
-= total_skip
; ZV_BYTE
-= total_skip
;
4266 Z
-= total_skip
; Z_BYTE
-= total_skip
;
4267 from
-= head_skip
; from_byte
-= head_skip
;
4268 to
+= tail_skip
; to_byte
+= tail_skip
;
4271 adjust_after_replace (from
, from_byte
, deletion
, inserted
, inserted_byte
);
4273 if (! encodep
&& ! NILP (coding
->post_read_conversion
))
4276 int orig_inserted
= inserted
, pos
= PT
;
4279 temp_set_point_both (current_buffer
, from
, from_byte
);
4280 val
= call1 (coding
->post_read_conversion
, make_number (inserted
));
4283 CHECK_NUMBER (val
, 0);
4284 inserted
= XFASTINT (val
);
4286 if (pos
>= from
+ orig_inserted
)
4287 temp_set_point (current_buffer
, pos
+ (inserted
- orig_inserted
));
4290 signal_after_change (from
, to
- from
, inserted
);
4293 coding
->consumed
= to_byte
- from_byte
;
4294 coding
->consumed_char
= to
- from
;
4295 coding
->produced
= inserted_byte
;
4296 coding
->produced_char
= inserted
;
4303 code_convert_string (str
, coding
, encodep
, nocopy
)
4305 struct coding_system
*coding
;
4306 int encodep
, nocopy
;
4310 int from
= 0, to
= XSTRING (str
)->size
;
4311 int to_byte
= STRING_BYTES (XSTRING (str
));
4312 struct gcpro gcpro1
;
4313 Lisp_Object saved_coding_symbol
= Qnil
;
4316 if (encodep
&& !NILP (coding
->pre_write_conversion
)
4317 || !encodep
&& !NILP (coding
->post_read_conversion
))
4319 /* Since we have to call Lisp functions which assume target text
4320 is in a buffer, after setting a temporary buffer, call
4321 code_convert_region. */
4322 int count
= specpdl_ptr
- specpdl
;
4323 struct buffer
*prev
= current_buffer
;
4325 record_unwind_protect (Fset_buffer
, Fcurrent_buffer ());
4326 temp_output_buffer_setup (" *code-converting-work*");
4327 set_buffer_internal (XBUFFER (Vstandard_output
));
4329 insert_from_string (str
, 0, 0, to
, to_byte
, 0);
4332 /* We must insert the contents of STR as is without
4333 unibyte<->multibyte conversion. */
4334 current_buffer
->enable_multibyte_characters
= Qnil
;
4335 insert_from_string (str
, 0, 0, to_byte
, to_byte
, 0);
4336 current_buffer
->enable_multibyte_characters
= Qt
;
4338 code_convert_region (BEGV
, BEGV_BYTE
, ZV
, ZV_BYTE
, coding
, encodep
, 1);
4340 /* We must return the buffer contents as unibyte string. */
4341 current_buffer
->enable_multibyte_characters
= Qnil
;
4342 str
= make_buffer_string (BEGV
, ZV
, 0);
4343 set_buffer_internal (prev
);
4344 return unbind_to (count
, str
);
4347 if (! encodep
&& CODING_REQUIRE_DETECTION (coding
))
4349 /* See the comments in code_convert_region. */
4350 if (coding
->type
== coding_type_undecided
)
4352 detect_coding (coding
, XSTRING (str
)->data
, to_byte
);
4353 if (coding
->type
== coding_type_undecided
)
4354 coding
->type
= coding_type_emacs_mule
;
4356 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
4358 saved_coding_symbol
= coding
->symbol
;
4359 detect_eol (coding
, XSTRING (str
)->data
, to_byte
);
4360 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
4361 coding
->eol_type
= CODING_EOL_LF
;
4362 /* We had better recover the original eol format if we
4363 encounter an inconsistent eol format while decoding. */
4364 coding
->mode
|= CODING_MODE_INHIBIT_INCONSISTENT_EOL
;
4369 ? ! CODING_REQUIRE_ENCODING (coding
)
4370 : ! CODING_REQUIRE_DECODING (coding
))
4374 /* Try to skip the leading and trailing ASCII characters. */
4376 shrink_encoding_region (&from
, &to_byte
, coding
, XSTRING (str
)->data
);
4378 shrink_decoding_region (&from
, &to_byte
, coding
, XSTRING (str
)->data
);
4380 if (from
== to_byte
)
4381 return (nocopy
? str
: Fcopy_sequence (str
));
4384 len
= encoding_buffer_size (coding
, to_byte
- from
);
4386 len
= decoding_buffer_size (coding
, to_byte
- from
);
4387 len
+= from
+ STRING_BYTES (XSTRING (str
)) - to_byte
;
4389 buf
= get_conversion_buffer (len
);
4393 bcopy (XSTRING (str
)->data
, buf
, from
);
4395 ? encode_coding (coding
, XSTRING (str
)->data
+ from
,
4396 buf
+ from
, to_byte
- from
, len
)
4397 : decode_coding (coding
, XSTRING (str
)->data
+ from
,
4398 buf
+ from
, to_byte
- from
, len
));
4399 if (! encodep
&& result
== CODING_FINISH_INCONSISTENT_EOL
)
4401 /* We simply try to decode the whole string again but without
4402 eol-conversion this time. */
4403 coding
->eol_type
= CODING_EOL_LF
;
4404 coding
->symbol
= saved_coding_symbol
;
4405 return code_convert_string (str
, coding
, encodep
, nocopy
);
4408 bcopy (XSTRING (str
)->data
+ to_byte
, buf
+ from
+ coding
->produced
,
4409 STRING_BYTES (XSTRING (str
)) - to_byte
);
4411 len
= from
+ STRING_BYTES (XSTRING (str
)) - to_byte
;
4413 str
= make_unibyte_string (buf
, len
+ coding
->produced
);
4415 str
= make_string_from_bytes (buf
, len
+ coding
->produced_char
,
4416 len
+ coding
->produced
);
4422 /*** 7. Emacs Lisp library functions ***/
4424 DEFUN ("coding-system-p", Fcoding_system_p
, Scoding_system_p
, 1, 1, 0,
4425 "Return t if OBJECT is nil or a coding-system.\n\
4426 See the documentation of `make-coding-system' for information\n\
4427 about coding-system objects.")
4435 /* Get coding-spec vector for OBJ. */
4436 obj
= Fget (obj
, Qcoding_system
);
4437 return ((VECTORP (obj
) && XVECTOR (obj
)->size
== 5)
4441 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system
,
4442 Sread_non_nil_coding_system
, 1, 1, 0,
4443 "Read a coding system from the minibuffer, prompting with string PROMPT.")
4450 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
4451 Qt
, Qnil
, Qcoding_system_history
, Qnil
, Qnil
);
4453 while (XSTRING (val
)->size
== 0);
4454 return (Fintern (val
, Qnil
));
4457 DEFUN ("read-coding-system", Fread_coding_system
, Sread_coding_system
, 1, 2, 0,
4458 "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
4459 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
4460 (prompt
, default_coding_system
)
4461 Lisp_Object prompt
, default_coding_system
;
4464 if (SYMBOLP (default_coding_system
))
4465 XSETSTRING (default_coding_system
, XSYMBOL (default_coding_system
)->name
);
4466 val
= Fcompleting_read (prompt
, Vcoding_system_alist
, Qnil
,
4467 Qt
, Qnil
, Qcoding_system_history
,
4468 default_coding_system
, Qnil
);
4469 return (XSTRING (val
)->size
== 0 ? Qnil
: Fintern (val
, Qnil
));
4472 DEFUN ("check-coding-system", Fcheck_coding_system
, Scheck_coding_system
,
4474 "Check validity of CODING-SYSTEM.\n\
4475 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
4476 It is valid if it is a symbol with a non-nil `coding-system' property.\n\
4477 The value of property should be a vector of length 5.")
4479 Lisp_Object coding_system
;
4481 CHECK_SYMBOL (coding_system
, 0);
4482 if (!NILP (Fcoding_system_p (coding_system
)))
4483 return coding_system
;
4485 Fsignal (Qcoding_system_error
, Fcons (coding_system
, Qnil
));
4489 detect_coding_system (src
, src_bytes
, highest
)
4491 int src_bytes
, highest
;
4493 int coding_mask
, eol_type
;
4494 Lisp_Object val
, tmp
;
4497 coding_mask
= detect_coding_mask (src
, src_bytes
, NULL
, &dummy
);
4498 eol_type
= detect_eol_type (src
, src_bytes
, &dummy
);
4499 if (eol_type
== CODING_EOL_INCONSISTENT
)
4500 eol_type
== CODING_EOL_UNDECIDED
;
4505 if (eol_type
!= CODING_EOL_UNDECIDED
)
4508 val2
= Fget (Qundecided
, Qeol_type
);
4510 val
= XVECTOR (val2
)->contents
[eol_type
];
4515 /* At first, gather possible coding systems in VAL. */
4517 for (tmp
= Vcoding_category_list
; !NILP (tmp
); tmp
= XCONS (tmp
)->cdr
)
4520 = XFASTINT (Fget (XCONS (tmp
)->car
, Qcoding_category_index
));
4521 if (coding_mask
& (1 << idx
))
4523 val
= Fcons (Fsymbol_value (XCONS (tmp
)->car
), val
);
4529 val
= Fnreverse (val
);
4531 /* Then, substitute the elements by subsidiary coding systems. */
4532 for (tmp
= val
; !NILP (tmp
); tmp
= XCONS (tmp
)->cdr
)
4534 if (eol_type
!= CODING_EOL_UNDECIDED
)
4537 eol
= Fget (XCONS (tmp
)->car
, Qeol_type
);
4539 XCONS (tmp
)->car
= XVECTOR (eol
)->contents
[eol_type
];
4542 return (highest
? XCONS (val
)->car
: val
);
4545 DEFUN ("detect-coding-region", Fdetect_coding_region
, Sdetect_coding_region
,
4547 "Detect coding system of the text in the region between START and END.\n\
4548 Return a list of possible coding systems ordered by priority.\n\
4550 If only ASCII characters are found, it returns `undecided'\n\
4551 or its subsidiary coding system according to a detected end-of-line format.\n\
4553 If optional argument HIGHEST is non-nil, return the coding system of\n\
4555 (start
, end
, highest
)
4556 Lisp_Object start
, end
, highest
;
4559 int from_byte
, to_byte
;
4561 CHECK_NUMBER_COERCE_MARKER (start
, 0);
4562 CHECK_NUMBER_COERCE_MARKER (end
, 1);
4564 validate_region (&start
, &end
);
4565 from
= XINT (start
), to
= XINT (end
);
4566 from_byte
= CHAR_TO_BYTE (from
);
4567 to_byte
= CHAR_TO_BYTE (to
);
4569 if (from
< GPT
&& to
>= GPT
)
4570 move_gap_both (to
, to_byte
);
4572 return detect_coding_system (BYTE_POS_ADDR (from_byte
),
4573 to_byte
- from_byte
,
4577 DEFUN ("detect-coding-string", Fdetect_coding_string
, Sdetect_coding_string
,
4579 "Detect coding system of the text in STRING.\n\
4580 Return a list of possible coding systems ordered by priority.\n\
4582 If only ASCII characters are found, it returns `undecided'\n\
4583 or its subsidiary coding system according to a detected end-of-line format.\n\
4585 If optional argument HIGHEST is non-nil, return the coding system of\n\
4588 Lisp_Object string
, highest
;
4590 CHECK_STRING (string
, 0);
4592 return detect_coding_system (XSTRING (string
)->data
,
4593 STRING_BYTES (XSTRING (string
)),
4598 code_convert_region1 (start
, end
, coding_system
, encodep
)
4599 Lisp_Object start
, end
, coding_system
;
4602 struct coding_system coding
;
4605 CHECK_NUMBER_COERCE_MARKER (start
, 0);
4606 CHECK_NUMBER_COERCE_MARKER (end
, 1);
4607 CHECK_SYMBOL (coding_system
, 2);
4609 validate_region (&start
, &end
);
4610 from
= XFASTINT (start
);
4611 to
= XFASTINT (end
);
4613 if (NILP (coding_system
))
4614 return make_number (to
- from
);
4616 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
4617 error ("Invalid coding system: %s", XSYMBOL (coding_system
)->name
->data
);
4619 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
4620 code_convert_region (from
, CHAR_TO_BYTE (from
), to
, CHAR_TO_BYTE (to
),
4621 &coding
, encodep
, 1);
4622 return make_number (coding
.produced_char
);
4625 DEFUN ("decode-coding-region", Fdecode_coding_region
, Sdecode_coding_region
,
4626 3, 3, "r\nzCoding system: ",
4627 "Decode the current region by specified coding system.\n\
4628 When called from a program, takes three arguments:\n\
4629 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
4630 Return length of decoded text.")
4631 (start
, end
, coding_system
)
4632 Lisp_Object start
, end
, coding_system
;
4634 return code_convert_region1 (start
, end
, coding_system
, 0);
4637 DEFUN ("encode-coding-region", Fencode_coding_region
, Sencode_coding_region
,
4638 3, 3, "r\nzCoding system: ",
4639 "Encode the current region by specified coding system.\n\
4640 When called from a program, takes three arguments:\n\
4641 START, END, and CODING-SYSTEM. START and END are buffer positions.\n\
4642 Return length of encoded text.")
4643 (start
, end
, coding_system
)
4644 Lisp_Object start
, end
, coding_system
;
4646 return code_convert_region1 (start
, end
, coding_system
, 1);
4650 code_convert_string1 (string
, coding_system
, nocopy
, encodep
)
4651 Lisp_Object string
, coding_system
, nocopy
;
4654 struct coding_system coding
;
4656 CHECK_STRING (string
, 0);
4657 CHECK_SYMBOL (coding_system
, 1);
4659 if (NILP (coding_system
))
4660 return (NILP (nocopy
) ? Fcopy_sequence (string
) : string
);
4662 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
4663 error ("Invalid coding system: %s", XSYMBOL (coding_system
)->name
->data
);
4665 coding
.mode
|= CODING_MODE_LAST_BLOCK
;
4666 return code_convert_string (string
, &coding
, encodep
, !NILP (nocopy
));
4669 DEFUN ("decode-coding-string", Fdecode_coding_string
, Sdecode_coding_string
,
4671 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
4672 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
4673 if the decoding operation is trivial.")
4674 (string
, coding_system
, nocopy
)
4675 Lisp_Object string
, coding_system
, nocopy
;
4677 return code_convert_string1(string
, coding_system
, nocopy
, 0);
4680 DEFUN ("encode-coding-string", Fencode_coding_string
, Sencode_coding_string
,
4682 "Encode STRING to CODING-SYSTEM, and return the result.\n\
4683 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
4684 if the encoding operation is trivial.")
4685 (string
, coding_system
, nocopy
)
4686 Lisp_Object string
, coding_system
, nocopy
;
4688 return code_convert_string1(string
, coding_system
, nocopy
, 1);
4692 DEFUN ("decode-sjis-char", Fdecode_sjis_char
, Sdecode_sjis_char
, 1, 1, 0,
4693 "Decode a JISX0208 character of shift-jis encoding.\n\
4694 CODE is the character code in SJIS.\n\
4695 Return the corresponding character.")
4699 unsigned char c1
, c2
, s1
, s2
;
4702 CHECK_NUMBER (code
, 0);
4703 s1
= (XFASTINT (code
)) >> 8, s2
= (XFASTINT (code
)) & 0xFF;
4704 DECODE_SJIS (s1
, s2
, c1
, c2
);
4705 XSETFASTINT (val
, MAKE_NON_ASCII_CHAR (charset_jisx0208
, c1
, c2
));
4709 DEFUN ("encode-sjis-char", Fencode_sjis_char
, Sencode_sjis_char
, 1, 1, 0,
4710 "Encode a JISX0208 character CHAR to SJIS coding system.\n\
4711 Return the corresponding character code in SJIS.")
4715 int charset
, c1
, c2
, s1
, s2
;
4718 CHECK_NUMBER (ch
, 0);
4719 SPLIT_CHAR (XFASTINT (ch
), charset
, c1
, c2
);
4720 if (charset
== charset_jisx0208
)
4722 ENCODE_SJIS (c1
, c2
, s1
, s2
);
4723 XSETFASTINT (val
, (s1
<< 8) | s2
);
4726 XSETFASTINT (val
, 0);
4730 DEFUN ("decode-big5-char", Fdecode_big5_char
, Sdecode_big5_char
, 1, 1, 0,
4731 "Decode a Big5 character CODE of BIG5 coding system.\n\
4732 CODE is the character code in BIG5.\n\
4733 Return the corresponding character.")
4738 unsigned char b1
, b2
, c1
, c2
;
4741 CHECK_NUMBER (code
, 0);
4742 b1
= (XFASTINT (code
)) >> 8, b2
= (XFASTINT (code
)) & 0xFF;
4743 DECODE_BIG5 (b1
, b2
, charset
, c1
, c2
);
4744 XSETFASTINT (val
, MAKE_NON_ASCII_CHAR (charset
, c1
, c2
));
4748 DEFUN ("encode-big5-char", Fencode_big5_char
, Sencode_big5_char
, 1, 1, 0,
4749 "Encode the Big5 character CHAR to BIG5 coding system.\n\
4750 Return the corresponding character code in Big5.")
4754 int charset
, c1
, c2
, b1
, b2
;
4757 CHECK_NUMBER (ch
, 0);
4758 SPLIT_CHAR (XFASTINT (ch
), charset
, c1
, c2
);
4759 if (charset
== charset_big5_1
|| charset
== charset_big5_2
)
4761 ENCODE_BIG5 (charset
, c1
, c2
, b1
, b2
);
4762 XSETFASTINT (val
, (b1
<< 8) | b2
);
4765 XSETFASTINT (val
, 0);
4769 DEFUN ("set-terminal-coding-system-internal",
4770 Fset_terminal_coding_system_internal
,
4771 Sset_terminal_coding_system_internal
, 1, 1, 0, "")
4773 Lisp_Object coding_system
;
4775 CHECK_SYMBOL (coding_system
, 0);
4776 setup_coding_system (Fcheck_coding_system (coding_system
), &terminal_coding
);
4777 /* We had better not send unsafe characters to terminal. */
4778 terminal_coding
.flags
|= CODING_FLAG_ISO_SAFE
;
4783 DEFUN ("set-safe-terminal-coding-system-internal",
4784 Fset_safe_terminal_coding_system_internal
,
4785 Sset_safe_terminal_coding_system_internal
, 1, 1, 0, "")
4787 Lisp_Object coding_system
;
4789 CHECK_SYMBOL (coding_system
, 0);
4790 setup_coding_system (Fcheck_coding_system (coding_system
),
4791 &safe_terminal_coding
);
4795 DEFUN ("terminal-coding-system",
4796 Fterminal_coding_system
, Sterminal_coding_system
, 0, 0, 0,
4797 "Return coding system specified for terminal output.")
4800 return terminal_coding
.symbol
;
4803 DEFUN ("set-keyboard-coding-system-internal",
4804 Fset_keyboard_coding_system_internal
,
4805 Sset_keyboard_coding_system_internal
, 1, 1, 0, "")
4807 Lisp_Object coding_system
;
4809 CHECK_SYMBOL (coding_system
, 0);
4810 setup_coding_system (Fcheck_coding_system (coding_system
), &keyboard_coding
);
4814 DEFUN ("keyboard-coding-system",
4815 Fkeyboard_coding_system
, Skeyboard_coding_system
, 0, 0, 0,
4816 "Return coding system specified for decoding keyboard input.")
4819 return keyboard_coding
.symbol
;
4823 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system
,
4824 Sfind_operation_coding_system
, 1, MANY
, 0,
4825 "Choose a coding system for an operation based on the target name.\n\
4826 The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
4827 DECODING-SYSTEM is the coding system to use for decoding\n\
4828 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
4829 for encoding (in case OPERATION does encoding).\n\
4831 The first argument OPERATION specifies an I/O primitive:\n\
4832 For file I/O, `insert-file-contents' or `write-region'.\n\
4833 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
4834 For network I/O, `open-network-stream'.\n\
4836 The remaining arguments should be the same arguments that were passed\n\
4837 to the primitive. Depending on which primitive, one of those arguments\n\
4838 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
4839 whichever argument specifies the file name is TARGET.\n\
4841 TARGET has a meaning which depends on OPERATION:\n\
4842 For file I/O, TARGET is a file name.\n\
4843 For process I/O, TARGET is a process name.\n\
4844 For network I/O, TARGET is a service name or a port number\n\
4846 This function looks up what specified for TARGET in,\n\
4847 `file-coding-system-alist', `process-coding-system-alist',\n\
4848 or `network-coding-system-alist' depending on OPERATION.\n\
4849 They may specify a coding system, a cons of coding systems,\n\
4850 or a function symbol to call.\n\
4851 In the last case, we call the function with one argument,\n\
4852 which is a list of all the arguments given to this function.")
4857 Lisp_Object operation
, target_idx
, target
, val
;
4858 register Lisp_Object chain
;
4861 error ("Too few arguments");
4862 operation
= args
[0];
4863 if (!SYMBOLP (operation
)
4864 || !INTEGERP (target_idx
= Fget (operation
, Qtarget_idx
)))
4865 error ("Invalid first arguement");
4866 if (nargs
< 1 + XINT (target_idx
))
4867 error ("Too few arguments for operation: %s",
4868 XSYMBOL (operation
)->name
->data
);
4869 target
= args
[XINT (target_idx
) + 1];
4870 if (!(STRINGP (target
)
4871 || (EQ (operation
, Qopen_network_stream
) && INTEGERP (target
))))
4872 error ("Invalid %dth argument", XINT (target_idx
) + 1);
4874 chain
= ((EQ (operation
, Qinsert_file_contents
)
4875 || EQ (operation
, Qwrite_region
))
4876 ? Vfile_coding_system_alist
4877 : (EQ (operation
, Qopen_network_stream
)
4878 ? Vnetwork_coding_system_alist
4879 : Vprocess_coding_system_alist
));
4883 for (; CONSP (chain
); chain
= XCONS (chain
)->cdr
)
4886 elt
= XCONS (chain
)->car
;
4889 && ((STRINGP (target
)
4890 && STRINGP (XCONS (elt
)->car
)
4891 && fast_string_match (XCONS (elt
)->car
, target
) >= 0)
4892 || (INTEGERP (target
) && EQ (target
, XCONS (elt
)->car
))))
4894 val
= XCONS (elt
)->cdr
;
4895 /* Here, if VAL is both a valid coding system and a valid
4896 function symbol, we return VAL as a coding system. */
4899 if (! SYMBOLP (val
))
4901 if (! NILP (Fcoding_system_p (val
)))
4902 return Fcons (val
, val
);
4903 if (! NILP (Ffboundp (val
)))
4905 val
= call1 (val
, Flist (nargs
, args
));
4908 if (SYMBOLP (val
) && ! NILP (Fcoding_system_p (val
)))
4909 return Fcons (val
, val
);
4917 DEFUN ("update-iso-coding-systems", Fupdate_iso_coding_systems
,
4918 Supdate_iso_coding_systems
, 0, 0, 0,
4919 "Update internal database for ISO2022 based coding systems.\n\
4920 When values of the following coding categories are changed, you must\n\
4921 call this function:\n\
4922 coding-category-iso-7, coding-category-iso-7-tight,\n\
4923 coding-category-iso-8-1, coding-category-iso-8-2,\n\
4924 coding-category-iso-7-else, coding-category-iso-8-else")
4929 for (i
= CODING_CATEGORY_IDX_ISO_7
; i
<= CODING_CATEGORY_IDX_ISO_8_ELSE
;
4932 if (! coding_system_table
[i
])
4933 coding_system_table
[i
]
4934 = (struct coding_system
*) xmalloc (sizeof (struct coding_system
));
4936 (XSYMBOL (XVECTOR (Vcoding_category_table
)->contents
[i
])->value
,
4937 coding_system_table
[i
]);
4945 /*** 8. Post-amble ***/
4952 /* Emacs' internal format specific initialize routine. */
4953 for (i
= 0; i
<= 0x20; i
++)
4954 emacs_code_class
[i
] = EMACS_control_code
;
4955 emacs_code_class
[0x0A] = EMACS_linefeed_code
;
4956 emacs_code_class
[0x0D] = EMACS_carriage_return_code
;
4957 for (i
= 0x21 ; i
< 0x7F; i
++)
4958 emacs_code_class
[i
] = EMACS_ascii_code
;
4959 emacs_code_class
[0x7F] = EMACS_control_code
;
4960 emacs_code_class
[0x80] = EMACS_leading_code_composition
;
4961 for (i
= 0x81; i
< 0xFF; i
++)
4962 emacs_code_class
[i
] = EMACS_invalid_code
;
4963 emacs_code_class
[LEADING_CODE_PRIVATE_11
] = EMACS_leading_code_3
;
4964 emacs_code_class
[LEADING_CODE_PRIVATE_12
] = EMACS_leading_code_3
;
4965 emacs_code_class
[LEADING_CODE_PRIVATE_21
] = EMACS_leading_code_4
;
4966 emacs_code_class
[LEADING_CODE_PRIVATE_22
] = EMACS_leading_code_4
;
4968 /* ISO2022 specific initialize routine. */
4969 for (i
= 0; i
< 0x20; i
++)
4970 iso_code_class
[i
] = ISO_control_code
;
4971 for (i
= 0x21; i
< 0x7F; i
++)
4972 iso_code_class
[i
] = ISO_graphic_plane_0
;
4973 for (i
= 0x80; i
< 0xA0; i
++)
4974 iso_code_class
[i
] = ISO_control_code
;
4975 for (i
= 0xA1; i
< 0xFF; i
++)
4976 iso_code_class
[i
] = ISO_graphic_plane_1
;
4977 iso_code_class
[0x20] = iso_code_class
[0x7F] = ISO_0x20_or_0x7F
;
4978 iso_code_class
[0xA0] = iso_code_class
[0xFF] = ISO_0xA0_or_0xFF
;
4979 iso_code_class
[ISO_CODE_CR
] = ISO_carriage_return
;
4980 iso_code_class
[ISO_CODE_SO
] = ISO_shift_out
;
4981 iso_code_class
[ISO_CODE_SI
] = ISO_shift_in
;
4982 iso_code_class
[ISO_CODE_SS2_7
] = ISO_single_shift_2_7
;
4983 iso_code_class
[ISO_CODE_ESC
] = ISO_escape
;
4984 iso_code_class
[ISO_CODE_SS2
] = ISO_single_shift_2
;
4985 iso_code_class
[ISO_CODE_SS3
] = ISO_single_shift_3
;
4986 iso_code_class
[ISO_CODE_CSI
] = ISO_control_sequence_introducer
;
4988 conversion_buffer_size
= MINIMUM_CONVERSION_BUFFER_SIZE
;
4989 conversion_buffer
= (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE
);
4991 setup_coding_system (Qnil
, &keyboard_coding
);
4992 setup_coding_system (Qnil
, &terminal_coding
);
4993 setup_coding_system (Qnil
, &safe_terminal_coding
);
4995 bzero (coding_system_table
, sizeof coding_system_table
);
4997 #if defined (MSDOS) || defined (WINDOWSNT)
4998 system_eol_type
= CODING_EOL_CRLF
;
5000 system_eol_type
= CODING_EOL_LF
;
5009 Qtarget_idx
= intern ("target-idx");
5010 staticpro (&Qtarget_idx
);
5012 Qcoding_system_history
= intern ("coding-system-history");
5013 staticpro (&Qcoding_system_history
);
5014 Fset (Qcoding_system_history
, Qnil
);
5016 /* Target FILENAME is the first argument. */
5017 Fput (Qinsert_file_contents
, Qtarget_idx
, make_number (0));
5018 /* Target FILENAME is the third argument. */
5019 Fput (Qwrite_region
, Qtarget_idx
, make_number (2));
5021 Qcall_process
= intern ("call-process");
5022 staticpro (&Qcall_process
);
5023 /* Target PROGRAM is the first argument. */
5024 Fput (Qcall_process
, Qtarget_idx
, make_number (0));
5026 Qcall_process_region
= intern ("call-process-region");
5027 staticpro (&Qcall_process_region
);
5028 /* Target PROGRAM is the third argument. */
5029 Fput (Qcall_process_region
, Qtarget_idx
, make_number (2));
5031 Qstart_process
= intern ("start-process");
5032 staticpro (&Qstart_process
);
5033 /* Target PROGRAM is the third argument. */
5034 Fput (Qstart_process
, Qtarget_idx
, make_number (2));
5036 Qopen_network_stream
= intern ("open-network-stream");
5037 staticpro (&Qopen_network_stream
);
5038 /* Target SERVICE is the fourth argument. */
5039 Fput (Qopen_network_stream
, Qtarget_idx
, make_number (3));
5041 Qcoding_system
= intern ("coding-system");
5042 staticpro (&Qcoding_system
);
5044 Qeol_type
= intern ("eol-type");
5045 staticpro (&Qeol_type
);
5047 Qbuffer_file_coding_system
= intern ("buffer-file-coding-system");
5048 staticpro (&Qbuffer_file_coding_system
);
5050 Qpost_read_conversion
= intern ("post-read-conversion");
5051 staticpro (&Qpost_read_conversion
);
5053 Qpre_write_conversion
= intern ("pre-write-conversion");
5054 staticpro (&Qpre_write_conversion
);
5056 Qno_conversion
= intern ("no-conversion");
5057 staticpro (&Qno_conversion
);
5059 Qundecided
= intern ("undecided");
5060 staticpro (&Qundecided
);
5062 Qcoding_system_p
= intern ("coding-system-p");
5063 staticpro (&Qcoding_system_p
);
5065 Qcoding_system_error
= intern ("coding-system-error");
5066 staticpro (&Qcoding_system_error
);
5068 Fput (Qcoding_system_error
, Qerror_conditions
,
5069 Fcons (Qcoding_system_error
, Fcons (Qerror
, Qnil
)));
5070 Fput (Qcoding_system_error
, Qerror_message
,
5071 build_string ("Invalid coding system"));
5073 Qcoding_category
= intern ("coding-category");
5074 staticpro (&Qcoding_category
);
5075 Qcoding_category_index
= intern ("coding-category-index");
5076 staticpro (&Qcoding_category_index
);
5078 Vcoding_category_table
5079 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX
), Qnil
);
5080 staticpro (&Vcoding_category_table
);
5083 for (i
= 0; i
< CODING_CATEGORY_IDX_MAX
; i
++)
5085 XVECTOR (Vcoding_category_table
)->contents
[i
]
5086 = intern (coding_category_name
[i
]);
5087 Fput (XVECTOR (Vcoding_category_table
)->contents
[i
],
5088 Qcoding_category_index
, make_number (i
));
5092 Qcharacter_unification_table
= intern ("character-unification-table");
5093 staticpro (&Qcharacter_unification_table
);
5094 Fput (Qcharacter_unification_table
, Qchar_table_extra_slots
,
5097 Qcharacter_unification_table_for_decode
5098 = intern ("character-unification-table-for-decode");
5099 staticpro (&Qcharacter_unification_table_for_decode
);
5101 Qcharacter_unification_table_for_encode
5102 = intern ("character-unification-table-for-encode");
5103 staticpro (&Qcharacter_unification_table_for_encode
);
5105 Qsafe_charsets
= intern ("safe-charsets");
5106 staticpro (&Qsafe_charsets
);
5108 Qemacs_mule
= intern ("emacs-mule");
5109 staticpro (&Qemacs_mule
);
5111 Qraw_text
= intern ("raw-text");
5112 staticpro (&Qraw_text
);
5114 defsubr (&Scoding_system_p
);
5115 defsubr (&Sread_coding_system
);
5116 defsubr (&Sread_non_nil_coding_system
);
5117 defsubr (&Scheck_coding_system
);
5118 defsubr (&Sdetect_coding_region
);
5119 defsubr (&Sdetect_coding_string
);
5120 defsubr (&Sdecode_coding_region
);
5121 defsubr (&Sencode_coding_region
);
5122 defsubr (&Sdecode_coding_string
);
5123 defsubr (&Sencode_coding_string
);
5124 defsubr (&Sdecode_sjis_char
);
5125 defsubr (&Sencode_sjis_char
);
5126 defsubr (&Sdecode_big5_char
);
5127 defsubr (&Sencode_big5_char
);
5128 defsubr (&Sset_terminal_coding_system_internal
);
5129 defsubr (&Sset_safe_terminal_coding_system_internal
);
5130 defsubr (&Sterminal_coding_system
);
5131 defsubr (&Sset_keyboard_coding_system_internal
);
5132 defsubr (&Skeyboard_coding_system
);
5133 defsubr (&Sfind_operation_coding_system
);
5134 defsubr (&Supdate_iso_coding_systems
);
5136 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list
,
5137 "List of coding systems.\n\
5139 Do not alter the value of this variable manually. This variable should be\n\
5140 updated by the functions `make-coding-system' and\n\
5141 `define-coding-system-alias'.");
5142 Vcoding_system_list
= Qnil
;
5144 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist
,
5145 "Alist of coding system names.\n\
5146 Each element is one element list of coding system name.\n\
5147 This variable is given to `completing-read' as TABLE argument.\n\
5149 Do not alter the value of this variable manually. This variable should be\n\
5150 updated by the functions `make-coding-system' and\n\
5151 `define-coding-system-alias'.");
5152 Vcoding_system_alist
= Qnil
;
5154 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list
,
5155 "List of coding-categories (symbols) ordered by priority.");
5159 Vcoding_category_list
= Qnil
;
5160 for (i
= CODING_CATEGORY_IDX_MAX
- 1; i
>= 0; i
--)
5161 Vcoding_category_list
5162 = Fcons (XVECTOR (Vcoding_category_table
)->contents
[i
],
5163 Vcoding_category_list
);
5166 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read
,
5167 "Specify the coding system for read operations.\n\
5168 It is useful to bind this variable with `let', but do not set it globally.\n\
5169 If the value is a coding system, it is used for decoding on read operation.\n\
5170 If not, an appropriate element is used from one of the coding system alists:\n\
5171 There are three such tables, `file-coding-system-alist',\n\
5172 `process-coding-system-alist', and `network-coding-system-alist'.");
5173 Vcoding_system_for_read
= Qnil
;
5175 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write
,
5176 "Specify the coding system for write operations.\n\
5177 It is useful to bind this variable with `let', but do not set it globally.\n\
5178 If the value is a coding system, it is used for encoding on write operation.\n\
5179 If not, an appropriate element is used from one of the coding system alists:\n\
5180 There are three such tables, `file-coding-system-alist',\n\
5181 `process-coding-system-alist', and `network-coding-system-alist'.");
5182 Vcoding_system_for_write
= Qnil
;
5184 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used
,
5185 "Coding system used in the latest file or process I/O.");
5186 Vlast_coding_system_used
= Qnil
;
5188 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion
,
5189 "*Non-nil inhibit code conversion of end-of-line format in any cases.");
5190 inhibit_eol_conversion
= 0;
5192 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist
,
5193 "Alist to decide a coding system to use for a file I/O operation.\n\
5194 The format is ((PATTERN . VAL) ...),\n\
5195 where PATTERN is a regular expression matching a file name,\n\
5196 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5197 If VAL is a coding system, it is used for both decoding and encoding\n\
5198 the file contents.\n\
5199 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5200 and the cdr part is used for encoding.\n\
5201 If VAL is a function symbol, the function must return a coding system\n\
5202 or a cons of coding systems which are used as above.\n\
5204 See also the function `find-operation-coding-system'.");
5205 Vfile_coding_system_alist
= Qnil
;
5207 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist
,
5208 "Alist to decide a coding system to use for a process I/O operation.\n\
5209 The format is ((PATTERN . VAL) ...),\n\
5210 where PATTERN is a regular expression matching a program name,\n\
5211 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5212 If VAL is a coding system, it is used for both decoding what received\n\
5213 from the program and encoding what sent to the program.\n\
5214 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5215 and the cdr part is used for encoding.\n\
5216 If VAL is a function symbol, the function must return a coding system\n\
5217 or a cons of coding systems which are used as above.\n\
5219 See also the function `find-operation-coding-system'.");
5220 Vprocess_coding_system_alist
= Qnil
;
5222 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist
,
5223 "Alist to decide a coding system to use for a network I/O operation.\n\
5224 The format is ((PATTERN . VAL) ...),\n\
5225 where PATTERN is a regular expression matching a network service name\n\
5226 or is a port number to connect to,\n\
5227 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
5228 If VAL is a coding system, it is used for both decoding what received\n\
5229 from the network stream and encoding what sent to the network stream.\n\
5230 If VAL is a cons of coding systems, the car part is used for decoding,\n\
5231 and the cdr part is used for encoding.\n\
5232 If VAL is a function symbol, the function must return a coding system\n\
5233 or a cons of coding systems which are used as above.\n\
5235 See also the function `find-operation-coding-system'.");
5236 Vnetwork_coding_system_alist
= Qnil
;
5238 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix
,
5239 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
5240 eol_mnemonic_unix
= ':';
5242 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos
,
5243 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
5244 eol_mnemonic_dos
= '\\';
5246 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac
,
5247 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
5248 eol_mnemonic_mac
= '/';
5250 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided
,
5251 "Mnemonic character indicating end-of-line format is not yet decided.");
5252 eol_mnemonic_undecided
= ':';
5254 DEFVAR_LISP ("enable-character-unification", &Venable_character_unification
,
5255 "Non-nil means ISO 2022 encoder/decoder do character unification.");
5256 Venable_character_unification
= Qt
;
5258 DEFVAR_LISP ("standard-character-unification-table-for-decode",
5259 &Vstandard_character_unification_table_for_decode
,
5260 "Table for unifying characters when reading.");
5261 Vstandard_character_unification_table_for_decode
= Qnil
;
5263 DEFVAR_LISP ("standard-character-unification-table-for-encode",
5264 &Vstandard_character_unification_table_for_encode
,
5265 "Table for unifying characters when writing.");
5266 Vstandard_character_unification_table_for_encode
= Qnil
;
5268 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist
,
5269 "Alist of charsets vs revision numbers.\n\
5270 While encoding, if a charset (car part of an element) is found,\n\
5271 designate it with the escape sequence identifying revision (cdr part of the element).");
5272 Vcharset_revision_alist
= Qnil
;
5274 DEFVAR_LISP ("default-process-coding-system",
5275 &Vdefault_process_coding_system
,
5276 "Cons of coding systems used for process I/O by default.\n\
5277 The car part is used for decoding a process output,\n\
5278 the cdr part is used for encoding a text to be sent to a process.");
5279 Vdefault_process_coding_system
= Qnil
;
5281 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table
,
5282 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
5283 This is a vector of length 256.\n\
5284 If Nth element is non-nil, the existence of code N in a file\n\
5285 \(or output of subprocess) doesn't prevent it to be detected as\n\
5286 a coding system of ISO 2022 variant which has a flag\n\
5287 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
5288 or reading output of a subprocess.\n\
5289 Only 128th through 159th elements has a meaning.");
5290 Vlatin_extra_code_table
= Fmake_vector (make_number (256), Qnil
);
5292 DEFVAR_LISP ("select-safe-coding-system-function",
5293 &Vselect_safe_coding_system_function
,
5294 "Function to call to select safe coding system for encoding a text.\n\
5296 If set, this function is called to force a user to select a proper\n\
5297 coding system which can encode the text in the case that a default\n\
5298 coding system used in each operation can't encode the text.\n\
5300 The default value is `select-safe-coding-system' (which see).");
5301 Vselect_safe_coding_system_function
= Qnil
;