1 /* Coding system handler (conversion, detection, and etc).
2 Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
5 This file is part of GNU Emacs.
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
22 /*** TABLE OF CONTENTS ***
25 2. Emacs' internal format (emacs-mule) handlers
27 4. Shift-JIS and BIG5 handlers
28 5. End-of-line handlers
29 6. C library functions
30 7. Emacs Lisp library functions
35 /*** GENERAL NOTE on CODING SYSTEM ***
37 Coding system is an encoding mechanism of one or more character
38 sets. Here's a list of coding systems which Emacs can handle. When
39 we say "decode", it means converting some other coding system to
40 Emacs' internal format (emacs-mule), and when we say "encode",
41 it means converting the coding system emacs-mule to some other
44 0. Emacs' internal format (emacs-mule)
46 Emacs itself holds a multi-lingual character in a buffer and a string
47 in a special format. Details are described in section 2.
51 The most famous coding system for multiple character sets. X's
52 Compound Text, various EUCs (Extended Unix Code), and coding
53 systems used in Internet communication such as ISO-2022-JP are
54 all variants of ISO2022. Details are described in section 3.
56 2. SJIS (or Shift-JIS or MS-Kanji-Code)
58 A coding system to encode character sets: ASCII, JISX0201, and
59 JISX0208. Widely used for PC's in Japan. Details are described in
64 A coding system to encode character sets: ASCII and Big5. Widely
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
66 described in section 4. In this file, when we write "BIG5"
67 (all uppercase), we mean the coding system, and when we write
68 "Big5" (capitalized), we mean the character set.
72 A coding system for a text containing random 8-bit code. Emacs
73 does no code conversion on such a text except for end-of-line
78 If a user wants to read/write a text encoded in a coding system not
79 listed above, he can supply a decoder and an encoder for it in CCL
80 (Code Conversion Language) programs. Emacs executes the CCL program
81 while reading/writing.
83 Emacs represents a coding-system by a Lisp symbol that has a property
84 `coding-system'. But, before actually using the coding-system, the
85 information about it is set in a structure of type `struct
86 coding_system' for rapid processing. See section 6 for more details.
90 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
92 How end-of-line of a text is encoded depends on a system. For
93 instance, Unix's format is just one byte of `line-feed' code,
94 whereas DOS's format is two-byte sequence of `carriage-return' and
95 `line-feed' codes. MacOS's format is one byte of `carriage-return'.
97 Since text characters encoding and end-of-line encoding are
98 independent, any coding system described above can take
99 any format of end-of-line. So, Emacs has information of format of
100 end-of-line in each coding-system. See section 6 for more details.
104 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
106 These functions check if a text between SRC and SRC_END is encoded
107 in the coding system category XXX. Each returns an integer value in
108 which appropriate flag bits for the category XXX are set. The flag
109 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
110 template of these functions. */
113 detect_coding_emacs_mule (src
, src_end
)
114 unsigned char *src
, *src_end
;
120 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
122 These functions decode SRC_BYTES length text at SOURCE encoded in
123 CODING to Emacs' internal format (emacs-mule). The resulting text
124 goes to a place pointed to by DESTINATION, the length of which should
125 not exceed DST_BYTES. The number of bytes actually processed is
126 returned as *CONSUMED. The return value is the length of the decoded
127 text. Below is a template of these functions. */
129 decode_coding_XXX (coding
, source
, destination
, src_bytes
, dst_bytes
, consumed
)
130 struct coding_system
*coding
;
131 unsigned char *source
, *destination
;
132 int src_bytes
, dst_bytes
;
139 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
141 These functions encode SRC_BYTES length text at SOURCE of Emacs'
142 internal format (emacs-mule) to CODING. The resulting text goes to
143 a place pointed to by DESTINATION, the length of which should not
144 exceed DST_BYTES. The number of bytes actually processed is
145 returned as *CONSUMED. The return value is the length of the
146 encoded text. Below is a template of these functions. */
148 encode_coding_XXX (coding
, source
, destination
, src_bytes
, dst_bytes
, consumed
)
149 struct coding_system
*coding
;
150 unsigned char *source
, *destination
;
151 int src_bytes
, dst_bytes
;
158 /*** COMMONLY USED MACROS ***/
160 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
161 THREE_MORE_BYTES safely get one, two, and three bytes from the
162 source text respectively. If there are not enough bytes in the
163 source, they jump to `label_end_of_loop'. The caller should set
164 variables `src' and `src_end' to appropriate areas in advance. */
166 #define ONE_MORE_BYTE(c1) \
171 goto label_end_of_loop; \
174 #define TWO_MORE_BYTES(c1, c2) \
176 if (src + 1 < src_end) \
177 c1 = *src++, c2 = *src++; \
179 goto label_end_of_loop; \
182 #define THREE_MORE_BYTES(c1, c2, c3) \
184 if (src + 2 < src_end) \
185 c1 = *src++, c2 = *src++, c3 = *src++; \
187 goto label_end_of_loop; \
190 /* The following three macros DECODE_CHARACTER_ASCII,
191 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
192 the multi-byte form of a character of each class at the place
193 pointed by `dst'. The caller should set the variable `dst' to
194 point to an appropriate area and the variable `coding' to point to
195 the coding-system of the currently decoding text in advance. */
197 /* Decode one ASCII character C. */
199 #define DECODE_CHARACTER_ASCII(c) \
201 if (COMPOSING_P (coding->composing)) \
202 *dst++ = 0xA0, *dst++ = (c) | 0x80; \
207 /* Decode one DIMENSION1 character whose charset is CHARSET and whose
208 position-code is C. */
210 #define DECODE_CHARACTER_DIMENSION1(charset, c) \
212 unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset); \
213 if (COMPOSING_P (coding->composing)) \
214 *dst++ = leading_code + 0x20; \
216 *dst++ = leading_code; \
217 if (leading_code = CHARSET_LEADING_CODE_EXT (charset)) \
218 *dst++ = leading_code; \
219 *dst++ = (c) | 0x80; \
222 /* Decode one DIMENSION2 character whose charset is CHARSET and whose
223 position-codes are C1 and C2. */
225 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2) \
227 DECODE_CHARACTER_DIMENSION1 (charset, c1); \
228 *dst++ = (c2) | 0x80; \
232 /*** 1. Preamble ***/
246 #else /* not emacs */
250 #endif /* not emacs */
252 Lisp_Object Qcoding_system
, Qeol_type
;
253 Lisp_Object Qbuffer_file_coding_system
;
254 Lisp_Object Qpost_read_conversion
, Qpre_write_conversion
;
255 Lisp_Object Qno_conversion
, Qundecided
;
256 Lisp_Object Qcoding_system_history
;
258 extern Lisp_Object Qinsert_file_contents
, Qwrite_region
;
259 Lisp_Object Qcall_process
, Qcall_process_region
, Qprocess_argument
;
260 Lisp_Object Qstart_process
, Qopen_network_stream
;
261 Lisp_Object Qtarget_idx
;
263 /* Mnemonic character of each format of end-of-line. */
264 int eol_mnemonic_unix
, eol_mnemonic_dos
, eol_mnemonic_mac
;
265 /* Mnemonic character to indicate format of end-of-line is not yet
267 int eol_mnemonic_undecided
;
269 /* Format of end-of-line decided by system. This is CODING_EOL_LF on
270 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */
275 Lisp_Object Qcoding_system_spec
, Qcoding_system_p
, Qcoding_system_error
;
277 /* Coding system emacs-mule is for converting only end-of-line format. */
278 Lisp_Object Qemacs_mule
;
280 /* Coding-systems are handed between Emacs Lisp programs and C internal
281 routines by the following three variables. */
282 /* Coding-system for reading files and receiving data from process. */
283 Lisp_Object Vcoding_system_for_read
;
284 /* Coding-system for writing files and sending data to process. */
285 Lisp_Object Vcoding_system_for_write
;
286 /* Coding-system actually used in the latest I/O. */
287 Lisp_Object Vlast_coding_system_used
;
289 /* A vector of length 256 which contains information about special
290 Latin codes (especially for dealing with Microsoft code). */
291 Lisp_Object Vlatin_extra_code_table
;
293 /* Flag to inhibit code conversion of end-of-line format. */
294 int inhibit_eol_conversion
;
296 /* Coding system to be used to encode text for terminal display. */
297 struct coding_system terminal_coding
;
299 /* Coding system to be used to encode text for terminal display when
300 terminal coding system is nil. */
301 struct coding_system safe_terminal_coding
;
303 /* Coding system of what is sent from terminal keyboard. */
304 struct coding_system keyboard_coding
;
306 Lisp_Object Vfile_coding_system_alist
;
307 Lisp_Object Vprocess_coding_system_alist
;
308 Lisp_Object Vnetwork_coding_system_alist
;
312 Lisp_Object Qcoding_category_index
;
314 /* List of symbols `coding-category-xxx' ordered by priority. */
315 Lisp_Object Vcoding_category_list
;
317 /* Table of coding-systems currently assigned to each coding-category. */
318 Lisp_Object coding_category_table
[CODING_CATEGORY_IDX_MAX
];
320 /* Table of names of symbol for each coding-category. */
321 char *coding_category_name
[CODING_CATEGORY_IDX_MAX
] = {
322 "coding-category-emacs-mule",
323 "coding-category-sjis",
324 "coding-category-iso-7",
325 "coding-category-iso-8-1",
326 "coding-category-iso-8-2",
327 "coding-category-iso-7-else",
328 "coding-category-iso-8-else",
329 "coding-category-big5",
330 "coding-category-raw-text",
331 "coding-category-binary"
334 /* Flag to tell if we look up unification table on character code
336 Lisp_Object Venable_character_unification
;
337 /* Standard unification table to look up on decoding (reading). */
338 Lisp_Object Vstandard_character_unification_table_for_decode
;
339 /* Standard unification table to look up on encoding (writing). */
340 Lisp_Object Vstandard_character_unification_table_for_encode
;
342 Lisp_Object Qcharacter_unification_table
;
343 Lisp_Object Qcharacter_unification_table_for_decode
;
344 Lisp_Object Qcharacter_unification_table_for_encode
;
346 /* Alist of charsets vs revision number. */
347 Lisp_Object Vcharset_revision_alist
;
349 /* Default coding systems used for process I/O. */
350 Lisp_Object Vdefault_process_coding_system
;
353 /*** 2. Emacs internal format (emacs-mule) handlers ***/
355 /* Emacs' internal format for encoding multiple character sets is a
356 kind of multi-byte encoding, i.e. characters are encoded by
357 variable-length sequences of one-byte codes. ASCII characters
358 and control characters (e.g. `tab', `newline') are represented by
359 one-byte sequences which are their ASCII codes, in the range 0x00
360 through 0x7F. The other characters are represented by a sequence
361 of `base leading-code', optional `extended leading-code', and one
362 or two `position-code's. The length of the sequence is determined
363 by the base leading-code. Leading-code takes the range 0x80
364 through 0x9F, whereas extended leading-code and position-code take
365 the range 0xA0 through 0xFF. See `charset.h' for more details
366 about leading-code and position-code.
368 There's one exception to this rule. Special leading-code
369 `leading-code-composition' denotes that the following several
370 characters should be composed into one character. Leading-codes of
371 components (except for ASCII) have 0x20 added. An ASCII character
372 component is represented by a 2-byte sequence of `0xA0' and
373 `ASCII-code + 0x80'. See also the comments in `charset.h' for the
374 details of composite character. Hence, we can summarize the code
377 --- CODE RANGE of Emacs' internal format ---
378 (character set) (range)
380 ELSE (1st byte) 0x80 .. 0x9F
381 (rest bytes) 0xA0 .. 0xFF
382 ---------------------------------------------
386 enum emacs_code_class_type emacs_code_class
[256];
388 /* Go to the next statement only if *SRC is accessible and the code is
389 greater than or equal to 0xA0. */
390 #define CHECK_CODE_RANGE_A0_FF \
392 if (src >= src_end) \
393 goto label_end_of_switch; \
394 else if (*src++ < 0xA0) \
398 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
399 Check if a text is encoded in Emacs' internal format. If it is,
400 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */
403 detect_coding_emacs_mule (src
, src_end
)
404 unsigned char *src
, *src_end
;
409 while (src
< src_end
)
421 switch (emacs_code_class
[c
])
423 case EMACS_ascii_code
:
424 case EMACS_linefeed_code
:
427 case EMACS_control_code
:
428 if (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
)
432 case EMACS_invalid_code
:
435 case EMACS_leading_code_composition
: /* c == 0x80 */
437 CHECK_CODE_RANGE_A0_FF
;
442 case EMACS_leading_code_4
:
443 CHECK_CODE_RANGE_A0_FF
;
444 /* fall down to check it two more times ... */
446 case EMACS_leading_code_3
:
447 CHECK_CODE_RANGE_A0_FF
;
448 /* fall down to check it one more time ... */
450 case EMACS_leading_code_2
:
451 CHECK_CODE_RANGE_A0_FF
;
459 return CODING_CATEGORY_MASK_EMACS_MULE
;
463 /*** 3. ISO2022 handlers ***/
465 /* The following note describes the coding system ISO2022 briefly.
466 Since the intention of this note is to help in understanding of
467 the programs in this file, some parts are NOT ACCURATE or OVERLY
468 SIMPLIFIED. For the thorough understanding, please refer to the
469 original document of ISO2022.
471 ISO2022 provides many mechanisms to encode several character sets
472 in 7-bit and 8-bit environment. If one chooses 7-bit environment,
473 all text is encoded by codes of less than 128. This may make the
474 encoded text a little bit longer, but the text gets more stability
475 to pass through several gateways (some of them strip off the MSB).
477 There are two kinds of character set: control character set and
478 graphic character set. The former contains control characters such
479 as `newline' and `escape' to provide control functions (control
480 functions are provided also by escape sequences). The latter
481 contains graphic characters such as 'A' and '-'. Emacs recognizes
482 two control character sets and many graphic character sets.
484 Graphic character sets are classified into one of the following
485 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
486 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
487 bytes (DIMENSION) and the number of characters in one dimension
488 (CHARS) of the set. In addition, each character set is assigned an
489 identification tag (called "final character" and denoted as <F>
490 hereafter) which is unique in each class. <F> of each character
491 set is decided by ECMA(*) when it is registered in ISO. Code range
492 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
494 Note (*): ECMA = European Computer Manufacturers Association
496 Here are examples of graphic character set [NAME(<F>)]:
497 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
498 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
499 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
500 o DIMENSION2_CHARS96 -- none for the moment
502 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
503 C0 [0x00..0x1F] -- control character plane 0
504 GL [0x20..0x7F] -- graphic character plane 0
505 C1 [0x80..0x9F] -- control character plane 1
506 GR [0xA0..0xFF] -- graphic character plane 1
508 A control character set is directly designated and invoked to C0 or
509 C1 by an escape sequence. The most common case is that ISO646's
510 control character set is designated/invoked to C0 and ISO6429's
511 control character set is designated/invoked to C1, and usually
512 these designations/invocations are omitted in a coded text. With
513 7-bit environment, only C0 can be used, and a control character for
514 C1 is encoded by an appropriate escape sequence to fit in the
515 environment. All control characters for C1 have
516 corresponding escape sequences defined for them.
518 A graphic character set is at first designated to one of four
519 graphic registers (G0 through G3), then these graphic registers are
520 invoked to GL or GR. These designations and invocations can be
521 done independently. The most common case is that G0 is invoked to
522 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
523 these invocations and designations are omitted in a coded text.
524 With 7-bit environment, only GL can be used.
526 When a graphic character set of CHARS94 is invoked to GL, code 0x20
527 and 0x7F of GL area work as control characters SPACE and DEL
528 respectively, and code 0xA0 and 0xFF of GR area should not be used.
530 There are two ways of invocation: locking-shift and single-shift.
531 With locking-shift, the invocation lasts until the next different
532 invocation, whereas with single-shift, the invocation works only
533 for the following character and doesn't affect locking-shift.
534 Invocations are done by the following control characters or escape
537 ----------------------------------------------------------------------
538 function control char escape sequence description
539 ----------------------------------------------------------------------
540 SI (shift-in) 0x0F none invoke G0 to GL
541 SO (shift-out) 0x0E none invoke G1 to GL
542 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
543 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
544 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
545 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
546 ----------------------------------------------------------------------
547 The first four are for locking-shift. Control characters for these
548 functions are defined by macros ISO_CODE_XXX in `coding.h'.
550 Designations are done by the following escape sequences.
551 ----------------------------------------------------------------------
552 escape sequence description
553 ----------------------------------------------------------------------
554 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
555 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
556 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
557 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
558 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
559 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
560 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
561 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
562 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
563 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
564 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
565 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
566 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
567 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
568 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
569 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
570 ----------------------------------------------------------------------
572 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
573 of dimension 1, chars 94, and final character <F>, and etc.
575 Note (*): Although these designations are not allowed in ISO2022,
576 Emacs accepts them on decoding, and produces them on encoding
577 CHARS96 character set in a coding system which is characterized as
578 7-bit environment, non-locking-shift, and non-single-shift.
580 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
581 '(' can be omitted. We call this "short-form" hereafter.
583 Now you may notice that there are a lot of ways for encoding the
584 same multilingual text in ISO2022. Actually, there exist many
585 coding systems such as Compound Text (used in X's inter client
586 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
587 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
588 localized platforms), and all of these are variants of ISO2022.
590 In addition to the above, Emacs handles two more kinds of escape
591 sequences: ISO6429's direction specification and Emacs' private
592 sequence for specifying character composition.
594 ISO6429's direction specification takes the following format:
595 o CSI ']' -- end of the current direction
596 o CSI '0' ']' -- end of the current direction
597 o CSI '1' ']' -- start of left-to-right text
598 o CSI '2' ']' -- start of right-to-left text
599 The control character CSI (0x9B: control sequence introducer) is
600 abbreviated to the escape sequence ESC '[' in 7-bit environment.
602 Character composition specification takes the following format:
603 o ESC '0' -- start character composition
604 o ESC '1' -- end character composition
605 Since these are not standard escape sequences of any ISO, the use
606 of them for these meanings is restricted to Emacs only. */
608 enum iso_code_class_type iso_code_class
[256];
610 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
611 Check if a text is encoded in ISO2022. If it is, return an
612 integer in which the appropriate flag bits, any of:
613 CODING_CATEGORY_MASK_ISO_7
614 CODING_CATEGORY_MASK_ISO_8_1
615 CODING_CATEGORY_MASK_ISO_8_2
616 CODING_CATEGORY_MASK_ISO_7_ELSE
617 CODING_CATEGORY_MASK_ISO_8_ELSE
618 are set. If a code which should never appear in ISO2022 is found,
622 detect_coding_iso2022 (src
, src_end
)
623 unsigned char *src
, *src_end
;
625 int mask
= (CODING_CATEGORY_MASK_ISO_7
626 | CODING_CATEGORY_MASK_ISO_8_1
627 | CODING_CATEGORY_MASK_ISO_8_2
628 | CODING_CATEGORY_MASK_ISO_7_ELSE
629 | CODING_CATEGORY_MASK_ISO_8_ELSE
631 int g1
= 0; /* 1 iff designating to G1. */
633 struct coding_system coding_iso_8_1
, coding_iso_8_2
;
635 /* Coding systems of these categories may accept latin extra codes. */
637 (XSYMBOL (coding_category_table
[CODING_CATEGORY_IDX_ISO_8_1
])->value
,
640 (XSYMBOL (coding_category_table
[CODING_CATEGORY_IDX_ISO_8_2
])->value
,
643 while (mask
&& src
< src_end
)
652 if ((c
>= '(' && c
<= '/'))
654 /* Designation sequence for a charset of dimension 1. */
658 if (c
< ' ' || c
>= 0x80)
659 /* Invalid designation sequence. */
664 /* Designation sequence for a charset of dimension 2. */
668 if (c
>= '@' && c
<= 'B')
669 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
671 else if (c
>= '(' && c
<= '/')
676 if (c
< ' ' || c
>= 0x80)
677 /* Invalid designation sequence. */
681 /* Invalid designation sequence. */
684 else if (c
== 'N' || c
== 'O' || c
== 'n' || c
== 'o')
686 mask
&= (CODING_CATEGORY_MASK_ISO_7_ELSE
687 | CODING_CATEGORY_MASK_ISO_8_ELSE
);
688 else if (c
== '0' || c
== '1' || c
== '2')
689 /* Start/end composition. */
692 /* Invalid escape sequence. */
697 mask
&= (CODING_CATEGORY_MASK_ISO_7_ELSE
698 | CODING_CATEGORY_MASK_ISO_8_ELSE
);
705 int newmask
= CODING_CATEGORY_MASK_ISO_8_ELSE
;
707 if (VECTORP (Vlatin_extra_code_table
)
708 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
710 if (coding_iso_8_1
.flags
& CODING_FLAG_ISO_LATIN_EXTRA
)
711 newmask
|= CODING_CATEGORY_MASK_ISO_8_1
;
712 if (coding_iso_8_2
.flags
& CODING_FLAG_ISO_LATIN_EXTRA
)
713 newmask
|= CODING_CATEGORY_MASK_ISO_8_2
;
724 if (VECTORP (Vlatin_extra_code_table
)
725 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
729 if (coding_iso_8_1
.flags
& CODING_FLAG_ISO_LATIN_EXTRA
)
730 newmask
|= CODING_CATEGORY_MASK_ISO_8_1
;
731 if (coding_iso_8_2
.flags
& CODING_FLAG_ISO_LATIN_EXTRA
)
732 newmask
|= CODING_CATEGORY_MASK_ISO_8_2
;
740 unsigned char *src_begin
= src
;
742 mask
&= ~(CODING_CATEGORY_MASK_ISO_7
743 | CODING_CATEGORY_MASK_ISO_7_ELSE
);
744 while (src
< src_end
&& *src
>= 0xA0)
746 if ((src
- src_begin
- 1) & 1 && src
< src_end
)
747 mask
&= ~CODING_CATEGORY_MASK_ISO_8_2
;
756 /* Decode a character of which charset is CHARSET and the 1st position
757 code is C1. If dimension of CHARSET is 2, the 2nd position code is
758 fetched from SRC and set to C2. If CHARSET is negative, it means
759 that we are decoding ill formed text, and what we can do is just to
762 #define DECODE_ISO_CHARACTER(charset, c1) \
764 int c_alt, charset_alt = (charset); \
765 if (COMPOSING_HEAD_P (coding->composing)) \
767 *dst++ = LEADING_CODE_COMPOSITION; \
768 if (COMPOSING_WITH_RULE_P (coding->composing)) \
769 /* To tell composition rules are embeded. */ \
771 coding->composing += 2; \
773 if ((charset) >= 0) \
775 if (CHARSET_DIMENSION (charset) == 2) \
776 ONE_MORE_BYTE (c2); \
777 if (!NILP (unification_table) \
778 && ((c_alt = unify_char (unification_table, \
779 -1, (charset), c1, c2)) >= 0)) \
780 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
782 if (charset_alt == CHARSET_ASCII || charset_alt < 0) \
783 DECODE_CHARACTER_ASCII (c1); \
784 else if (CHARSET_DIMENSION (charset_alt) == 1) \
785 DECODE_CHARACTER_DIMENSION1 (charset_alt, c1); \
787 DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2); \
788 if (COMPOSING_WITH_RULE_P (coding->composing)) \
789 /* To tell a composition rule follows. */ \
790 coding->composing = COMPOSING_WITH_RULE_RULE; \
793 /* Set designation state into CODING. */
794 #define DECODE_DESIGNATION(reg, dimension, chars, final_char) \
796 int charset = ISO_CHARSET_TABLE (make_number (dimension), \
797 make_number (chars), \
798 make_number (final_char)); \
801 if (coding->direction == 1 \
802 && CHARSET_REVERSE_CHARSET (charset) >= 0) \
803 charset = CHARSET_REVERSE_CHARSET (charset); \
804 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
808 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
811 decode_coding_iso2022 (coding
, source
, destination
,
812 src_bytes
, dst_bytes
, consumed
)
813 struct coding_system
*coding
;
814 unsigned char *source
, *destination
;
815 int src_bytes
, dst_bytes
;
818 unsigned char *src
= source
;
819 unsigned char *src_end
= source
+ src_bytes
;
820 unsigned char *dst
= destination
;
821 unsigned char *dst_end
= destination
+ dst_bytes
;
822 /* Since the maximum bytes produced by each loop is 7, we subtract 6
823 from DST_END to assure that overflow checking is necessary only
824 at the head of loop. */
825 unsigned char *adjusted_dst_end
= dst_end
- 6;
827 /* Charsets invoked to graphic plane 0 and 1 respectively. */
828 int charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
829 int charset1
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 1);
830 Lisp_Object unification_table
831 = coding
->character_unification_table_for_decode
;
833 if (!NILP (Venable_character_unification
) && NILP (unification_table
))
834 unification_table
= Vstandard_character_unification_table_for_decode
;
836 while (src
< src_end
&& dst
< adjusted_dst_end
)
838 /* SRC_BASE remembers the start position in source in each loop.
839 The loop will be exited when there's not enough source text
840 to analyze long escape sequence or 2-byte code (within macros
841 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
842 to SRC_BASE before exiting. */
843 unsigned char *src_base
= src
;
846 switch (iso_code_class
[c1
])
848 case ISO_0x20_or_0x7F
:
849 if (!coding
->composing
850 && (charset0
< 0 || CHARSET_CHARS (charset0
) == 94))
852 /* This is SPACE or DEL. */
856 /* This is a graphic character, we fall down ... */
858 case ISO_graphic_plane_0
:
859 if (coding
->composing
== COMPOSING_WITH_RULE_RULE
)
861 /* This is a composition rule. */
863 coding
->composing
= COMPOSING_WITH_RULE_TAIL
;
866 DECODE_ISO_CHARACTER (charset0
, c1
);
869 case ISO_0xA0_or_0xFF
:
870 if (charset1
< 0 || CHARSET_CHARS (charset1
) == 94)
876 /* This is a graphic character, we fall down ... */
878 case ISO_graphic_plane_1
:
879 DECODE_ISO_CHARACTER (charset1
, c1
);
882 case ISO_control_code
:
883 /* All ISO2022 control characters in this class have the
884 same representation in Emacs internal format. */
888 case ISO_carriage_return
:
889 if (coding
->eol_type
== CODING_EOL_CR
)
893 else if (coding
->eol_type
== CODING_EOL_CRLF
)
896 if (c1
== ISO_CODE_LF
)
911 if (CODING_SPEC_ISO_DESIGNATION (coding
, 1) < 0)
912 goto label_invalid_escape_sequence
;
913 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 1;
914 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
918 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 0;
919 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
922 case ISO_single_shift_2_7
:
923 case ISO_single_shift_2
:
924 /* SS2 is handled as an escape sequence of ESC 'N' */
926 goto label_escape_sequence
;
928 case ISO_single_shift_3
:
929 /* SS3 is handled as an escape sequence of ESC 'O' */
931 goto label_escape_sequence
;
933 case ISO_control_sequence_introducer
:
934 /* CSI is handled as an escape sequence of ESC '[' ... */
936 goto label_escape_sequence
;
940 label_escape_sequence
:
941 /* Escape sequences handled by Emacs are invocation,
942 designation, direction specification, and character
943 composition specification. */
946 case '&': /* revision of following character set */
948 if (!(c1
>= '@' && c1
<= '~'))
949 goto label_invalid_escape_sequence
;
951 if (c1
!= ISO_CODE_ESC
)
952 goto label_invalid_escape_sequence
;
954 goto label_escape_sequence
;
956 case '$': /* designation of 2-byte character set */
958 if (c1
>= '@' && c1
<= 'B')
959 { /* designation of JISX0208.1978, GB2312.1980,
961 DECODE_DESIGNATION (0, 2, 94, c1
);
963 else if (c1
>= 0x28 && c1
<= 0x2B)
964 { /* designation of DIMENSION2_CHARS94 character set */
966 DECODE_DESIGNATION (c1
- 0x28, 2, 94, c2
);
968 else if (c1
>= 0x2C && c1
<= 0x2F)
969 { /* designation of DIMENSION2_CHARS96 character set */
971 DECODE_DESIGNATION (c1
- 0x2C, 2, 96, c2
);
974 goto label_invalid_escape_sequence
;
977 case 'n': /* invocation of locking-shift-2 */
978 if (CODING_SPEC_ISO_DESIGNATION (coding
, 2) < 0)
979 goto label_invalid_escape_sequence
;
980 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 2;
981 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
984 case 'o': /* invocation of locking-shift-3 */
985 if (CODING_SPEC_ISO_DESIGNATION (coding
, 3) < 0)
986 goto label_invalid_escape_sequence
;
987 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 3;
988 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
991 case 'N': /* invocation of single-shift-2 */
992 if (CODING_SPEC_ISO_DESIGNATION (coding
, 2) < 0)
993 goto label_invalid_escape_sequence
;
995 charset
= CODING_SPEC_ISO_DESIGNATION (coding
, 2);
996 DECODE_ISO_CHARACTER (charset
, c1
);
999 case 'O': /* invocation of single-shift-3 */
1000 if (CODING_SPEC_ISO_DESIGNATION (coding
, 3) < 0)
1001 goto label_invalid_escape_sequence
;
1003 charset
= CODING_SPEC_ISO_DESIGNATION (coding
, 3);
1004 DECODE_ISO_CHARACTER (charset
, c1
);
1007 case '0': /* start composing without embeded rules */
1008 coding
->composing
= COMPOSING_NO_RULE_HEAD
;
1011 case '1': /* end composing */
1012 coding
->composing
= COMPOSING_NO
;
1015 case '2': /* start composing with embeded rules */
1016 coding
->composing
= COMPOSING_WITH_RULE_HEAD
;
1019 case '[': /* specification of direction */
1020 /* For the moment, nested direction is not supported.
1021 So, the value of `coding->direction' is 0 or 1: 0
1022 means left-to-right, 1 means right-to-left. */
1026 case ']': /* end of the current direction */
1027 coding
->direction
= 0;
1029 case '0': /* end of the current direction */
1030 case '1': /* start of left-to-right direction */
1033 coding
->direction
= 0;
1035 goto label_invalid_escape_sequence
;
1038 case '2': /* start of right-to-left direction */
1041 coding
->direction
= 1;
1043 goto label_invalid_escape_sequence
;
1047 goto label_invalid_escape_sequence
;
1052 if (c1
>= 0x28 && c1
<= 0x2B)
1053 { /* designation of DIMENSION1_CHARS94 character set */
1055 DECODE_DESIGNATION (c1
- 0x28, 1, 94, c2
);
1057 else if (c1
>= 0x2C && c1
<= 0x2F)
1058 { /* designation of DIMENSION1_CHARS96 character set */
1060 DECODE_DESIGNATION (c1
- 0x2C, 1, 96, c2
);
1064 goto label_invalid_escape_sequence
;
1067 /* We must update these variables now. */
1068 charset0
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 0);
1069 charset1
= CODING_SPEC_ISO_PLANE_CHARSET (coding
, 1);
1072 label_invalid_escape_sequence
:
1074 int length
= src
- src_base
;
1076 bcopy (src_base
, dst
, length
);
1083 coding
->carryover_size
= src
- src_base
;
1084 bcopy (src_base
, coding
->carryover
, coding
->carryover_size
);
1089 /* If this is the last block of the text to be decoded, we had
1090 better just flush out all remaining codes in the text although
1091 they are not valid characters. */
1092 if (coding
->last_block
)
1094 bcopy (src
, dst
, src_end
- src
);
1095 dst
+= (src_end
- src
);
1098 *consumed
= src
- source
;
1099 return dst
- destination
;
/* ISO2022 encoding stuff.  */

/*
   It is not enough to say just "ISO2022" on encoding; we have to
   specify more details.  In Emacs, each coding-system of ISO2022
   variant has the following specifications:
	1. Initial designation to G0 thru G3.
	2. Allows short-form designation?
	3. ASCII should be designated to G0 before control characters?
	4. ASCII should be designated to G0 at end of line?
	5. 7-bit environment or 8-bit environment?
	6. Use locking-shift?
	7. Use Single-shift?
   And the following two are only for Japanese:
	8. Use ASCII in place of JIS0201-1976-Roman?
	9. Use JISX0208-1983 in place of JISX0208-1978?
   These specifications are encoded in `coding->flags' as flag bits
   defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
   details.
*/
1123 /* Produce codes (escape sequence) for designating CHARSET to graphic
1124 register REG. If <final-char> of CHARSET is '@', 'A', or 'B' and
1125 the coding system CODING allows, produce designation sequence of
1128 #define ENCODE_DESIGNATION(charset, reg, coding) \
1130 unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \
1131 char *intermediate_char_94 = "()*+"; \
1132 char *intermediate_char_96 = ",-./"; \
1134 = Fassq (make_number (charset), Vcharset_revision_alist); \
1135 if (! NILP (temp)) \
1137 *dst++ = ISO_CODE_ESC; \
1139 *dst++ = XINT (XCONS (temp)->cdr) + '@'; \
1141 *dst++ = ISO_CODE_ESC; \
1142 if (CHARSET_DIMENSION (charset) == 1) \
1144 if (CHARSET_CHARS (charset) == 94) \
1145 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1147 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1152 if (CHARSET_CHARS (charset) == 94) \
1154 if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \
1156 || final_char < '@' || final_char > 'B') \
1157 *dst++ = (unsigned char) (intermediate_char_94[reg]); \
1160 *dst++ = (unsigned char) (intermediate_char_96[reg]); \
1162 *dst++ = final_char; \
1163 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \
/* The following two macros produce codes (control character or escape
   sequence) for ISO2022 single-shift functions (single-shift-2 and
   single-shift-3).

   Both expect `coding' (a pointer whose `flags' member holds the
   CODING_FLAG_ISO_XXX bits) and `dst' (the output pointer) to be in
   scope at the point of expansion.  If CODING_FLAG_ISO_SEVEN_BITS is
   set, the two-byte escape sequence (ESC N or ESC O) is written;
   otherwise the single control code ISO_CODE_SS2 or ISO_CODE_SS3 is
   written.  In both cases CODING is then marked as being in the middle
   of a single shift.  Each macro expands to one statement, so it is
   safe as the body of an unbraced `if'.  */

#define ENCODE_SINGLE_SHIFT_2				\
  do {							\
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)	\
      *dst++ = ISO_CODE_ESC, *dst++ = 'N';		\
    else						\
      *dst++ = ISO_CODE_SS2;				\
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;	\
  } while (0)

#define ENCODE_SINGLE_SHIFT_3				\
  do {							\
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)	\
      *dst++ = ISO_CODE_ESC, *dst++ = 'O';		\
    else						\
      *dst++ = ISO_CODE_SS3;				\
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;	\
  } while (0)

/* The following four macros produce codes (control character or
   escape sequence) for ISO2022 locking-shift functions (shift-in,
   shift-out, locking-shift-2, and locking-shift-3).

   Each expects `coding' and `dst' (the output pointer) to be in scope
   at the point of expansion.  Besides writing the code, each records
   in CODING which graphic register (0 through 3) is now invoked to
   graphic plane 0.  Each macro expands to one statement.  */

#define ENCODE_SHIFT_IN				\
  do {						\
    *dst++ = ISO_CODE_SI;			\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;	\
  } while (0)

#define ENCODE_SHIFT_OUT			\
  do {						\
    *dst++ = ISO_CODE_SO;			\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;	\
  } while (0)

#define ENCODE_LOCKING_SHIFT_2			\
  do {						\
    *dst++ = ISO_CODE_ESC, *dst++ = 'n';	\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;	\
  } while (0)

#define ENCODE_LOCKING_SHIFT_3			\
  do {						\
    *dst++ = ISO_CODE_ESC, *dst++ = 'o';	\
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;	\
  } while (0)

1216 /* Produce codes for a DIMENSION1 character whose character set is
1217 CHARSET and whose position-code is C1. Designation and invocation
1218 sequences are also produced in advance if necessary. */
1221 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
1223 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1225 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1226 *dst++ = c1 & 0x7F; \
1228 *dst++ = c1 | 0x80; \
1229 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1232 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1234 *dst++ = c1 & 0x7F; \
1237 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1239 *dst++ = c1 | 0x80; \
1242 else if (coding->flags & CODING_FLAG_ISO_SAFE \
1243 && !CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset]) \
1245 /* We should not encode this character, instead produce one or \
1247 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1248 if (CHARSET_WIDTH (charset) == 2) \
1249 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1253 /* Since CHARSET is not yet invoked to any graphic planes, we \
1254 must invoke it, or, at first, designate it to some graphic \
1255 register. Then repeat the loop to actually produce the \
1257 dst = encode_invocation_designation (charset, coding, dst); \
1260 /* Produce codes for a DIMENSION2 character whose character set is
1261 CHARSET and whose position-codes are C1 and C2. Designation and
1262 invocation codes are also produced in advance if necessary. */
1264 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
1266 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
1268 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
1269 *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F; \
1271 *dst++ = c1 | 0x80, *dst++ = c2 | 0x80; \
1272 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
1275 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
1277 *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F; \
1280 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
1282 *dst++ = c1 | 0x80, *dst++= c2 | 0x80; \
1285 else if (coding->flags & CODING_FLAG_ISO_SAFE \
1286 && !CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset]) \
1288 /* We should not encode this character, instead produce one or \
1290 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1291 if (CHARSET_WIDTH (charset) == 2) \
1292 *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION; \
1296 /* Since CHARSET is not yet invoked to any graphic planes, we \
1297 must invoke it, or, at first, designate it to some graphic \
1298 register. Then repeat the loop to actually produce the \
1300 dst = encode_invocation_designation (charset, coding, dst); \
/* Produce codes for one character of CHARSET whose position-codes are
   C1 and C2 (C2 is a dummy for a one-dimensional charset).

   If `unification_table' (which must be in scope) is non-nil and
   unify_char returns a non-negative character for CHARSET/C1/C2, that
   character is split by SPLIT_CHAR and encoded instead; otherwise
   CHARSET itself is used.  The character is then handed to
   ENCODE_ISO_CHARACTER_DIMENSION1 or ENCODE_ISO_CHARACTER_DIMENSION2
   according to CHARSET_DIMENSION of the resulting charset.

   C1 and C2 are passed to SPLIT_CHAR together with the local
   `charset_alt', so they should be modifiable lvalues.
   NOTE(review): confirm against the definition of SPLIT_CHAR.  */
#define ENCODE_ISO_CHARACTER(charset, c1, c2)				  \
  do {									  \
    int c_alt, charset_alt;						  \
    if (!NILP (unification_table)					  \
	&& ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
	    >= 0))							  \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);				  \
    else								  \
      charset_alt = charset;						  \
    if (CHARSET_DIMENSION (charset_alt) == 1)				  \
      ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1);		  \
    else								  \
      ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2);		  \
  } while (0)

1318 /* Produce designation and invocation codes at a place pointed by DST
1319 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
1323 encode_invocation_designation (charset
, coding
, dst
)
1325 struct coding_system
*coding
;
1328 int reg
; /* graphic register number */
1330 /* At first, check designations. */
1331 for (reg
= 0; reg
< 4; reg
++)
1332 if (charset
== CODING_SPEC_ISO_DESIGNATION (coding
, reg
))
1337 /* CHARSET is not yet designated to any graphic registers. */
1338 /* At first check the requested designation. */
1339 reg
= CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
);
1340 if (reg
== CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
)
1341 /* Since CHARSET requests no special designation, designate it
1342 to graphic register 0. */
1345 ENCODE_DESIGNATION (charset
, reg
, coding
);
1348 if (CODING_SPEC_ISO_INVOCATION (coding
, 0) != reg
1349 && CODING_SPEC_ISO_INVOCATION (coding
, 1) != reg
)
1351 /* Since the graphic register REG is not invoked to any graphic
1352 planes, invoke it to graphic plane 0. */
1355 case 0: /* graphic register 0 */
1359 case 1: /* graphic register 1 */
1363 case 2: /* graphic register 2 */
1364 if (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1365 ENCODE_SINGLE_SHIFT_2
;
1367 ENCODE_LOCKING_SHIFT_2
;
1370 case 3: /* graphic register 3 */
1371 if (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
)
1372 ENCODE_SINGLE_SHIFT_3
;
1374 ENCODE_LOCKING_SHIFT_3
;
/* The following three macros produce codes for indicating composition:
   ESC 0 starts a composition without embedded rules, ESC 2 starts a
   composition with embedded rules, and ESC 1 ends the composition.
   Each expects the output pointer `dst' to be in scope and advances it
   by two bytes.  The expansions are parenthesized so that each macro
   behaves as a single expression wherever it is used.  */
#define ENCODE_COMPOSITION_NO_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
#define ENCODE_COMPOSITION_WITH_RULE_START \
  (*dst++ = ISO_CODE_ESC, *dst++ = '2')
#define ENCODE_COMPOSITION_END \
  (*dst++ = ISO_CODE_ESC, *dst++ = '1')

/* The following three macros produce codes for indicating direction
   of text.  They expect `coding' and the output pointer `dst' to be in
   scope at the point of expansion.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER writes ESC [ when the
   CODING_FLAG_ISO_SEVEN_BITS bit is set in coding->flags, and the
   single code ISO_CODE_CSI otherwise.  The flag is tested as a bit
   (with `&'), as it is by the other encoding macros; comparing the
   whole flag word with `==' would fail whenever any other flag is
   set.

   The introducer is an expression rather than a statement so that the
   two direction macros can append the direction bytes with the comma
   operator.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER		\
  ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)		\
   ? (*dst++ = ISO_CODE_ESC, *dst++ = '[')		\
   : (*dst++ = ISO_CODE_CSI))

#define ENCODE_DIRECTION_R2L	\
  (ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '2', *dst++ = ']')

#define ENCODE_DIRECTION_L2R	\
  (ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '0', *dst++ = ']')

/* Produce codes for designation and invocation to reset the graphic
   planes and registers to initial state.

   Expects `coding' and the output pointer `dst' to be in scope.  If
   graphic plane 0 currently invokes a register other than 0, a
   shift-in is produced first.  Then, for each graphic register 0
   through 3 that has an initial designation (a non-negative value)
   different from its current designation, the initial charset is
   designated again with ENCODE_DESIGNATION.  Expands to one
   statement.  */
#define ENCODE_RESET_PLANE_AND_REGISTER					    \
  do {									    \
    int reg;								    \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)			    \
      ENCODE_SHIFT_IN;							    \
    for (reg = 0; reg < 4; reg++)					    \
      if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0	    \
	  && (CODING_SPEC_ISO_DESIGNATION (coding, reg)			    \
	      != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)))	    \
	ENCODE_DESIGNATION						    \
	  (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
  } while (0)

1417 /* Produce designation sequences of charsets in the line started from
1418 *SRC to a place pointed by DSTP.
1420 If the current block ends before any end-of-line, we may fail to
1421 find all the necessary *designations. */
1422 encode_designation_at_bol (coding
, table
, src
, src_end
, dstp
)
1423 struct coding_system
*coding
;
1425 unsigned char *src
, *src_end
, **dstp
;
1427 int charset
, c
, found
= 0, reg
;
1428 /* Table of charsets to be designated to each graphic register. */
1430 unsigned char *dst
= *dstp
;
1432 for (reg
= 0; reg
< 4; reg
++)
1435 while (src
< src_end
&& *src
!= '\n' && found
< 4)
1437 int bytes
= BYTES_BY_CHAR_HEAD (*src
);
1440 charset
= CHARSET_AT (src
);
1445 SPLIT_STRING(src
, bytes
, charset
, c1
, c2
);
1446 if ((c_alt
= unify_char (table
, -1, charset
, c1
, c2
)) >= 0)
1447 charset
= CHAR_CHARSET (c_alt
);
1450 reg
= CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
);
1451 if (r
[reg
] == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
)
1462 for (reg
= 0; reg
< 4; reg
++)
1464 && CODING_SPEC_ISO_DESIGNATION (coding
, reg
) != r
[reg
])
1465 ENCODE_DESIGNATION (r
[reg
], reg
, coding
);
1470 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
1473 encode_coding_iso2022 (coding
, source
, destination
,
1474 src_bytes
, dst_bytes
, consumed
)
1475 struct coding_system
*coding
;
1476 unsigned char *source
, *destination
;
1477 int src_bytes
, dst_bytes
;
1480 unsigned char *src
= source
;
1481 unsigned char *src_end
= source
+ src_bytes
;
1482 unsigned char *dst
= destination
;
1483 unsigned char *dst_end
= destination
+ dst_bytes
;
1484 /* Since the maximum bytes produced by each loop is 20, we subtract 19
1485 from DST_END to assure overflow checking is necessary only at the
1487 unsigned char *adjusted_dst_end
= dst_end
- 19;
1488 Lisp_Object unification_table
1489 = coding
->character_unification_table_for_encode
;
1491 if (!NILP (Venable_character_unification
) && NILP (unification_table
))
1492 unification_table
= Vstandard_character_unification_table_for_encode
;
1494 while (src
< src_end
&& dst
< adjusted_dst_end
)
1496 /* SRC_BASE remembers the start position in source in each loop.
1497 The loop will be exited when there's not enough source text
1498 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1499 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
1500 reset to SRC_BASE before exiting. */
1501 unsigned char *src_base
= src
;
1502 int charset
, c1
, c2
, c3
, c4
;
1504 if (coding
->flags
& CODING_FLAG_ISO_DESIGNATE_AT_BOL
1505 && CODING_SPEC_ISO_BOL (coding
))
1507 /* We have to produce designation sequences if any now. */
1508 encode_designation_at_bol (coding
, unification_table
,
1509 src
, src_end
, &dst
);
1510 CODING_SPEC_ISO_BOL (coding
) = 0;
1514 /* If we are seeing a component of a composite character, we are
1515 seeing a leading-code specially encoded for composition, or a
1516 composition rule if composing with rule. We must set C1
1517 to a normal leading-code or an ASCII code. If we are not at
1518 a composed character, we must reset the composition state. */
1519 if (COMPOSING_P (coding
->composing
))
1523 /* We are not in a composite character any longer. */
1524 coding
->composing
= COMPOSING_NO
;
1525 ENCODE_COMPOSITION_END
;
1529 if (coding
->composing
== COMPOSING_WITH_RULE_RULE
)
1532 coding
->composing
= COMPOSING_WITH_RULE_HEAD
;
1535 else if (coding
->composing
== COMPOSING_WITH_RULE_HEAD
)
1536 coding
->composing
= COMPOSING_WITH_RULE_RULE
;
1539 /* This is an ASCII component. */
1544 /* This is a leading-code of non ASCII component. */
1549 /* Now encode one character. C1 is a control character, an
1550 ASCII character, or a leading-code of multi-byte character. */
1551 switch (emacs_code_class
[c1
])
1553 case EMACS_ascii_code
:
1554 ENCODE_ISO_CHARACTER (CHARSET_ASCII
, c1
, /* dummy */ c2
);
1557 case EMACS_control_code
:
1558 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_CNTL
)
1559 ENCODE_RESET_PLANE_AND_REGISTER
;
1563 case EMACS_carriage_return_code
:
1564 if (!coding
->selective
)
1566 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_CNTL
)
1567 ENCODE_RESET_PLANE_AND_REGISTER
;
1571 /* fall down to treat '\r' as '\n' ... */
1573 case EMACS_linefeed_code
:
1574 if (coding
->flags
& CODING_FLAG_ISO_RESET_AT_EOL
)
1575 ENCODE_RESET_PLANE_AND_REGISTER
;
1576 if (coding
->flags
& CODING_FLAG_ISO_INIT_AT_BOL
)
1577 bcopy (coding
->spec
.iso2022
.initial_designation
,
1578 coding
->spec
.iso2022
.current_designation
,
1579 sizeof coding
->spec
.iso2022
.initial_designation
);
1580 if (coding
->eol_type
== CODING_EOL_LF
1581 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
1582 *dst
++ = ISO_CODE_LF
;
1583 else if (coding
->eol_type
== CODING_EOL_CRLF
)
1584 *dst
++ = ISO_CODE_CR
, *dst
++ = ISO_CODE_LF
;
1586 *dst
++ = ISO_CODE_CR
;
1587 CODING_SPEC_ISO_BOL (coding
) = 1;
1590 case EMACS_leading_code_2
:
1594 /* invalid sequence */
1599 ENCODE_ISO_CHARACTER (c1
, c2
, /* dummy */ c3
);
1602 case EMACS_leading_code_3
:
1603 TWO_MORE_BYTES (c2
, c3
);
1604 if (c2
< 0xA0 || c3
< 0xA0)
1606 /* invalid sequence */
1611 else if (c1
< LEADING_CODE_PRIVATE_11
)
1612 ENCODE_ISO_CHARACTER (c1
, c2
, c3
);
1614 ENCODE_ISO_CHARACTER (c2
, c3
, /* dummy */ c4
);
1617 case EMACS_leading_code_4
:
1618 THREE_MORE_BYTES (c2
, c3
, c4
);
1619 if (c2
< 0xA0 || c3
< 0xA0 || c4
< 0xA0)
1621 /* invalid sequence */
1628 ENCODE_ISO_CHARACTER (c2
, c3
, c4
);
1631 case EMACS_leading_code_composition
:
1635 /* invalid sequence */
1639 else if (c2
== 0xFF)
1641 coding
->composing
= COMPOSING_WITH_RULE_HEAD
;
1642 ENCODE_COMPOSITION_WITH_RULE_START
;
1646 /* Rewind one byte because it is a character code of
1647 composition elements. */
1649 coding
->composing
= COMPOSING_NO_RULE_HEAD
;
1650 ENCODE_COMPOSITION_NO_RULE_START
;
1654 case EMACS_invalid_code
:
1660 /* We reach here because the source date ends not at character
1662 coding
->carryover_size
= src_end
- src_base
;
1663 bcopy (src_base
, coding
->carryover
, coding
->carryover_size
);
1668 /* If this is the last block of the text to be encoded, we must
1669 reset graphic planes and registers to the initial state. */
1670 if (src
>= src_end
&& coding
->last_block
)
1672 ENCODE_RESET_PLANE_AND_REGISTER
;
1673 if (coding
->carryover_size
> 0
1674 && coding
->carryover_size
< (dst_end
- dst
))
1676 bcopy (coding
->carryover
, dst
, coding
->carryover_size
);
1677 dst
+= coding
->carryover_size
;
1678 coding
->carryover_size
= 0;
1681 *consumed
= src
- source
;
1682 return dst
- destination
;
1686 /*** 4. SJIS and BIG5 handlers ***/
1688 /* Although SJIS and BIG5 are not ISO's coding system, they are used
1689 quite widely. So, for the moment, Emacs supports them in the bare
1690 C code. But, in the future, they may be supported only by CCL. */
/* SJIS is a coding system encoding three character sets: ASCII, right
   half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
   as is.  A character of charset katakana-jisx0201 is encoded by
   "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, but the two position-codes are divided and
   shifted so that they fit in the range below.

   --- CODE RANGE of SJIS ---
   (character set)	(range)
   ASCII		0x00 .. 0x7F
   KATAKANA-JISX0201	0xA0 .. 0xDF
   JISX0208 (1st byte)	0x80 .. 0x9F and 0xE0 .. 0xFF
	    (2nd byte)	0x40 .. 0xFF
   -------------------------------

*/

/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set and is encoded in two bytes.

   --- CODE RANGE of BIG5 ---
   (character set)	(range)
   ASCII		0x00 .. 0x7F
   Big5 (1st byte)	0xA1 .. 0xFE
	(2nd byte)	0x40 .. 0x7E and 0xA1 .. 0xFE
   --------------------------

   Since the number of characters in Big5 is larger than the maximum
   number of characters in an Emacs charset (96x96), it can't be
   handled as one charset.  So, in Emacs, Big5 is divided into two:
   `charset-big5-1' and `charset-big5-2'.  Both are DIMENSION2 and
   CHARS94.  The former contains frequently used characters and the
   latter contains less frequently used characters.  */

/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Decode the BIG5 byte pair B1, B2 into CHARSET and the position-codes
   C1 and C2 (all three are assigned, so they must be lvalues).

   The byte pair is first turned into a linear index, skipping the
   unused second-byte values between the two valid second-byte runs.
   Pairs whose first byte is below 0xC9 go to `charset_big5_1';
   the others go to `charset_big5_2', with the index rebased to the
   start of that charset.  The index is then split into two
   position-codes, each offset by 0x21.

   Macro arguments are parenthesized so that expression arguments are
   evaluated as written.  B1 and B2 are evaluated more than once.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)				\
  do {									\
    unsigned int temp							\
      = (((b1) - 0xA1) * BIG5_SAME_ROW					\
	 + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));				\
    if ((b1) < 0xC9)							\
      (charset) = charset_big5_1;					\
    else								\
      {									\
	(charset) = charset_big5_2;					\
	temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;				\
      }									\
    (c1) = temp / (0xFF - 0xA1) + 0x21;					\
    (c2) = temp % (0xFF - 0xA1) + 0x21;					\
  } while (0)

/* Inverse of DECODE_BIG5: encode CHARSET (`charset_big5_1' or
   `charset_big5_2') with position-codes C1 and C2 into the BIG5 byte
   pair B1, B2 (both are assigned, so they must be lvalues).  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)				\
  do {									\
    unsigned int temp							\
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);			\
    if ((charset) == charset_big5_2)					\
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);				\
    (b1) = temp / BIG5_SAME_ROW + 0xA1;					\
    (b2) = temp % BIG5_SAME_ROW;					\
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;					\
  } while (0)

/* Produce the decoded form of one character of CHARSET whose
   position-codes are C1 and C2 (C2 is a dummy for a one-dimensional
   charset).

   If `unification_table' (which must be in scope) is non-nil and
   unify_char returns a non-negative character for CHARSET/C1/C2, that
   character is split by SPLIT_CHAR and used instead.  The result is
   handed to DECODE_CHARACTER_ASCII when the resulting charset is
   CHARSET_ASCII or negative, and otherwise to
   DECODE_CHARACTER_DIMENSION1 or DECODE_CHARACTER_DIMENSION2 according
   to CHARSET_DIMENSION.

   C1 and C2 are passed to SPLIT_CHAR together with the local
   `charset_alt', so they should be modifiable lvalues.
   NOTE(review): confirm against the definition of SPLIT_CHAR.  */
#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)			\
  do {									\
    int c_alt, charset_alt = (charset);					\
    if (!NILP (unification_table)					\
	&& ((c_alt = unify_char (unification_table,			\
				 -1, (charset), c1, c2)) >= 0))		\
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);				\
    if (charset_alt == CHARSET_ASCII || charset_alt < 0)		\
      DECODE_CHARACTER_ASCII (c1);					\
    else if (CHARSET_DIMENSION (charset_alt) == 1)			\
      DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);			\
    else								\
      DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);		\
  } while (0)

1775 #define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2) \
1777 int c_alt, charset_alt; \
1778 if (!NILP (unification_table) \
1779 && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1781 SPLIT_CHAR (c_alt, charset_alt, c1, c2); \
1783 charset_alt = charset; \
1784 if (charset_alt == charset_ascii) \
1786 else if (CHARSET_DIMENSION (charset_alt) == 1) \
1788 if (sjis_p && charset_alt == charset_katakana_jisx0201) \
1791 *dst++ = charset_alt, *dst++ = c1; \
1795 c1 &= 0x7F, c2 &= 0x7F; \
1796 if (sjis_p && charset_alt == charset_jisx0208) \
1798 unsigned char s1, s2; \
1800 ENCODE_SJIS (c1, c2, s1, s2); \
1801 *dst++ = s1, *dst++ = s2; \
1804 && (charset_alt == charset_big5_1 \
1805 || charset_alt == charset_big5_2)) \
1807 unsigned char b1, b2; \
1809 ENCODE_BIG5 (charset_alt, c1, c2, b1, b2); \
1810 *dst++ = b1, *dst++ = b2; \
1813 *dst++ = charset_alt, *dst++ = c1, *dst++ = c2; \
1817 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1818 Check if a text is encoded in SJIS. If it is, return
1819 CODING_CATEGORY_MASK_SJIS, else return 0. */
1822 detect_coding_sjis (src
, src_end
)
1823 unsigned char *src
, *src_end
;
1827 while (src
< src_end
)
1830 if (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
)
1832 if ((c
>= 0x80 && c
< 0xA0) || c
>= 0xE0)
1834 if (src
< src_end
&& *src
++ < 0x40)
1838 return CODING_CATEGORY_MASK_SJIS
;
1841 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1842 Check if a text is encoded in BIG5. If it is, return
1843 CODING_CATEGORY_MASK_BIG5, else return 0. */
1846 detect_coding_big5 (src
, src_end
)
1847 unsigned char *src
, *src_end
;
1851 while (src
< src_end
)
1854 if (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
)
1861 if (c
< 0x40 || (c
>= 0x7F && c
<= 0xA0))
1865 return CODING_CATEGORY_MASK_BIG5
;
/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
   If SJIS_P is 1, decode SJIS text, else decode BIG5 text.  */

1872 decode_coding_sjis_big5 (coding
, source
, destination
,
1873 src_bytes
, dst_bytes
, consumed
, sjis_p
)
1874 struct coding_system
*coding
;
1875 unsigned char *source
, *destination
;
1876 int src_bytes
, dst_bytes
;
1880 unsigned char *src
= source
;
1881 unsigned char *src_end
= source
+ src_bytes
;
1882 unsigned char *dst
= destination
;
1883 unsigned char *dst_end
= destination
+ dst_bytes
;
1884 /* Since the maximum bytes produced by each loop is 4, we subtract 3
1885 from DST_END to assure overflow checking is necessary only at the
1887 unsigned char *adjusted_dst_end
= dst_end
- 3;
1888 Lisp_Object unification_table
1889 = coding
->character_unification_table_for_decode
;
1891 if (!NILP (Venable_character_unification
) && NILP (unification_table
))
1892 unification_table
= Vstandard_character_unification_table_for_decode
;
1894 while (src
< src_end
&& dst
< adjusted_dst_end
)
1896 /* SRC_BASE remembers the start position in source in each loop.
1897 The loop will be exited when there's not enough source text
1898 to analyze two-byte character (within macro ONE_MORE_BYTE).
1899 In that case, SRC is reset to SRC_BASE before exiting. */
1900 unsigned char *src_base
= src
;
1901 unsigned char c1
= *src
++, c2
, c3
, c4
;
1905 if (coding
->eol_type
== CODING_EOL_CRLF
)
1911 /* To process C2 again, SRC is subtracted by 1. */
1920 DECODE_SJIS_BIG5_CHARACTER (charset_ascii
, c1
, /* dummy */ c2
);
1921 else if (c1
< 0xA0 || c1
>= 0xE0)
1923 /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1927 DECODE_SJIS (c1
, c2
, c3
, c4
);
1928 DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208
, c3
, c4
);
1930 else if (c1
>= 0xE0 && c1
< 0xFF)
1935 DECODE_BIG5 (c1
, c2
, charset
, c3
, c4
);
1936 DECODE_SJIS_BIG5_CHARACTER (charset
, c3
, c4
);
1938 else /* Invalid code */
1943 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1945 DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201
, c1
, /* dummy */ c2
);
1951 DECODE_BIG5 (c1
, c2
, charset
, c3
, c4
);
1952 DECODE_SJIS_BIG5_CHARACTER (charset
, c3
, c4
);
1958 coding
->carryover_size
= src
- src_base
;
1959 bcopy (src_base
, coding
->carryover
, coding
->carryover_size
);
1964 *consumed
= src
- source
;
1965 return dst
- destination
;
/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
   This function can encode `charset_ascii', `charset_katakana_jisx0201',
   `charset_jisx0208', `charset_big5_1', and `charset_big5_2'.  We are
   sure that all these charsets are registered as official charsets
   (i.e. do not have extended leading-codes).  Characters of other
   charsets are produced without any encoding.  If SJIS_P is 1, encode
   SJIS text, else encode BIG5 text.  */

1977 encode_coding_sjis_big5 (coding
, source
, destination
,
1978 src_bytes
, dst_bytes
, consumed
, sjis_p
)
1979 struct coding_system
*coding
;
1980 unsigned char *source
, *destination
;
1981 int src_bytes
, dst_bytes
;
1985 unsigned char *src
= source
;
1986 unsigned char *src_end
= source
+ src_bytes
;
1987 unsigned char *dst
= destination
;
1988 unsigned char *dst_end
= destination
+ dst_bytes
;
1989 /* Since the maximum bytes produced by each loop is 2, we subtract 1
1990 from DST_END to assure overflow checking is necessary only at the
1992 unsigned char *adjusted_dst_end
= dst_end
- 1;
1993 Lisp_Object unification_table
1994 = coding
->character_unification_table_for_encode
;
1996 if (!NILP (Venable_character_unification
) && NILP (unification_table
))
1997 unification_table
= Vstandard_character_unification_table_for_encode
;
1999 while (src
< src_end
&& dst
< adjusted_dst_end
)
2001 /* SRC_BASE remembers the start position in source in each loop.
2002 The loop will be exited when there's not enough source text
2003 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2004 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
2006 unsigned char *src_base
= src
;
2007 unsigned char c1
= *src
++, c2
, c3
, c4
;
2009 if (coding
->composing
)
2016 else if (c1
>= 0xA0)
2019 coding
->composing
= 0;
2022 switch (emacs_code_class
[c1
])
2024 case EMACS_ascii_code
:
2025 ENCODE_SJIS_BIG5_CHARACTER (charset_ascii
, c1
, /* dummy */ c2
);
2028 case EMACS_control_code
:
2032 case EMACS_carriage_return_code
:
2033 if (!coding
->selective
)
2038 /* fall down to treat '\r' as '\n' ... */
2040 case EMACS_linefeed_code
:
2041 if (coding
->eol_type
== CODING_EOL_LF
2042 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
2044 else if (coding
->eol_type
== CODING_EOL_CRLF
)
2045 *dst
++ = '\r', *dst
++ = '\n';
2050 case EMACS_leading_code_2
:
2052 ENCODE_SJIS_BIG5_CHARACTER (c1
, c2
, /* dummy */ c3
);
2055 case EMACS_leading_code_3
:
2056 TWO_MORE_BYTES (c2
, c3
);
2057 ENCODE_SJIS_BIG5_CHARACTER (c1
, c2
, c3
);
2060 case EMACS_leading_code_4
:
2061 THREE_MORE_BYTES (c2
, c3
, c4
);
2062 ENCODE_SJIS_BIG5_CHARACTER (c2
, c3
, c4
);
2065 case EMACS_leading_code_composition
:
2066 coding
->composing
= 1;
2069 default: /* i.e. case EMACS_invalid_code: */
2075 coding
->carryover_size
= src_end
- src_base
;
2076 bcopy (src_base
, coding
->carryover
, coding
->carryover_size
);
2081 *consumed
= src
- source
;
2082 return dst
- destination
;
2086 /*** 5. End-of-line handlers ***/
2088 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2089 This function is called only when `coding->eol_type' is
2090 CODING_EOL_CRLF or CODING_EOL_CR. */
2092 decode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
, consumed
)
2093 struct coding_system
*coding
;
2094 unsigned char *source
, *destination
;
2095 int src_bytes
, dst_bytes
;
2098 unsigned char *src
= source
;
2099 unsigned char *src_end
= source
+ src_bytes
;
2100 unsigned char *dst
= destination
;
2101 unsigned char *dst_end
= destination
+ dst_bytes
;
2104 switch (coding
->eol_type
)
2106 case CODING_EOL_CRLF
:
2108 /* Since the maximum bytes produced by each loop is 2, we
2109 subtract 1 from DST_END to assure overflow checking is
2110 necessary only at the head of loop. */
2111 unsigned char *adjusted_dst_end
= dst_end
- 1;
2113 while (src
< src_end
&& dst
< adjusted_dst_end
)
2115 unsigned char *src_base
= src
;
2116 unsigned char c
= *src
++;
2129 coding
->carryover_size
= src
- src_base
;
2130 bcopy (src_base
, coding
->carryover
, coding
->carryover_size
);
2134 *consumed
= src
- source
;
2135 produced
= dst
- destination
;
2140 produced
= (src_bytes
> dst_bytes
) ? dst_bytes
: src_bytes
;
2141 bcopy (source
, destination
, produced
);
2142 dst_end
= destination
+ produced
;
2143 while (dst
< dst_end
)
2144 if (*dst
++ == '\r') dst
[-1] = '\n';
2145 *consumed
= produced
;
2148 default: /* i.e. case: CODING_EOL_LF */
2149 produced
= (src_bytes
> dst_bytes
) ? dst_bytes
: src_bytes
;
2150 bcopy (source
, destination
, produced
);
2151 *consumed
= produced
;
/* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
   format of end-of-line according to `coding->eol_type'.  If
   `coding->selective' is 1, code '\r' in source text also means
   end-of-line.  */

2163 encode_eol (coding
, source
, destination
, src_bytes
, dst_bytes
, consumed
)
2164 struct coding_system
*coding
;
2165 unsigned char *source
, *destination
;
2166 int src_bytes
, dst_bytes
;
2169 unsigned char *src
= source
;
2170 unsigned char *dst
= destination
;
2176 switch (coding
->eol_type
)
2179 case CODING_EOL_UNDECIDED
:
2180 produced
= (src_bytes
> dst_bytes
) ? dst_bytes
: src_bytes
;
2181 bcopy (source
, destination
, produced
);
2182 if (coding
->selective
)
2186 if (*dst
++ == '\r') dst
[-1] = '\n';
2188 *consumed
= produced
;
2190 case CODING_EOL_CRLF
:
2193 unsigned char *src_end
= source
+ src_bytes
;
2194 unsigned char *dst_end
= destination
+ dst_bytes
;
2195 /* Since the maximum bytes produced by each loop is 2, we
2196 subtract 1 from DST_END to assure overflow checking is
2197 necessary only at the head of loop. */
2198 unsigned char *adjusted_dst_end
= dst_end
- 1;
2200 while (src
< src_end
&& dst
< adjusted_dst_end
)
2203 if (c
== '\n' || (c
== '\r' && coding
->selective
))
2204 *dst
++ = '\r', *dst
++ = '\n';
2208 produced
= dst
- destination
;
2209 *consumed
= src
- source
;
2213 default: /* i.e. case CODING_EOL_CR: */
2214 produced
= (src_bytes
> dst_bytes
) ? dst_bytes
: src_bytes
;
2215 bcopy (source
, destination
, produced
);
2219 if (*dst
++ == '\n') dst
[-1] = '\r';
2221 *consumed
= produced
;
2228 /*** 6. C library functions ***/
2230 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2231 has a property `coding-system'. The value of this property is a
2232 vector of length 5 (called as coding-vector). Among elements of
2233 this vector, the first (element[0]) and the fifth (element[4])
2234 carry important information for decoding/encoding. Before
2235 decoding/encoding, this information should be set in fields of a
2236 structure of type `coding_system'.
2238 A value of property `coding-system' can be a symbol of another
2239 subsidiary coding-system. In that case, Emacs gets coding-vector
2242 `element[0]' contains information to be set in `coding->type'. The
2243 value and its meaning is as follows:
2245 0 -- coding_type_emacs_mule
2246 1 -- coding_type_sjis
2247 2 -- coding_type_iso2022
2248 3 -- coding_type_big5
2249 4 -- coding_type_ccl encoder/decoder written in CCL
2250 nil -- coding_type_no_conversion
2251 t -- coding_type_undecided (automatic conversion on decoding,
2252 no-conversion on encoding)
2254 `element[4]' contains information to be set in `coding->flags' and
2255 `coding->spec'. The meaning varies by `coding->type'.
2257 If `coding->type' is `coding_type_iso2022', element[4] is a vector
2258 of length 32 (of which the first 13 sub-elements are used now).
2259 Meanings of these sub-elements are:
2261 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2262 If the value is an integer of valid charset, the charset is
2263 assumed to be designated to graphic register N initially.
2265 If the value is minus, it is a minus value of charset which
2266 reserves graphic register N, which means that the charset is
2267 not designated initially but should be designated to graphic
2268 register N just before encoding a character in that charset.
2270 If the value is nil, graphic register N is never used on
2273 sub-element[N] where N is 4 through 16: to be set in `coding->flags'
2274 Each value takes t or nil. See the section ISO2022 of
2275 `coding.h' for more information.
2277 If `coding->type' is `coding_type_big5', element[4] is t to denote
2278 BIG5-ETen or nil to denote BIG5-HKU.
2280 If `coding->type' takes the other value, element[4] is ignored.
2282 Emacs Lisp's coding system also carries information about format of
2283 end-of-line in a value of property `eol-type'. If the value is
2284 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2285 means CODING_EOL_CR. If it is not integer, it should be a vector
2286 of subsidiary coding systems of which property `eol-type' has one
2291 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2292 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
2293 is setup so that no conversion is necessary and return -1, else
2297 setup_coding_system (coding_system
, coding
)
2298 Lisp_Object coding_system
;
2299 struct coding_system
*coding
;
2301 Lisp_Object type
, eol_type
;
2303 /* At first, set several fields to default values. */
2304 coding
->require_flushing
= 0;
2305 coding
->last_block
= 0;
2306 coding
->selective
= 0;
2307 coding
->composing
= 0;
2308 coding
->direction
= 0;
2309 coding
->carryover_size
= 0;
2310 coding
->post_read_conversion
= coding
->pre_write_conversion
= Qnil
;
2311 coding
->character_unification_table_for_decode
= Qnil
;
2312 coding
->character_unification_table_for_encode
= Qnil
;
2314 Vlast_coding_system_used
= coding
->symbol
= coding_system
;
2316 /* Get value of property `coding-system' until we get a vector.
2317 While doing that, also get values of properties
2318 `post-read-conversion', `pre-write-conversion',
2319 `character-unification-table-for-decode',
2320 `character-unification-table-for-encode' and `eol-type'. */
2321 while (!NILP (coding_system
) && SYMBOLP (coding_system
))
2323 if (NILP (coding
->post_read_conversion
))
2324 coding
->post_read_conversion
= Fget (coding_system
,
2325 Qpost_read_conversion
);
2326 if (NILP (coding
->pre_write_conversion
))
2327 coding
->pre_write_conversion
= Fget (coding_system
,
2328 Qpre_write_conversion
);
2329 if (!inhibit_eol_conversion
&& NILP (eol_type
))
2330 eol_type
= Fget (coding_system
, Qeol_type
);
2332 if (NILP (coding
->character_unification_table_for_decode
))
2333 coding
->character_unification_table_for_decode
2334 = Fget (coding_system
, Qcharacter_unification_table_for_decode
);
2336 if (NILP (coding
->character_unification_table_for_encode
))
2337 coding
->character_unification_table_for_encode
2338 = Fget (coding_system
, Qcharacter_unification_table_for_encode
);
2340 coding_system
= Fget (coding_system
, Qcoding_system
);
2343 while (!NILP (coding
->character_unification_table_for_decode
)
2344 && SYMBOLP (coding
->character_unification_table_for_decode
))
2345 coding
->character_unification_table_for_decode
2346 = Fget (coding
->character_unification_table_for_decode
,
2347 Qcharacter_unification_table_for_decode
);
2348 if (!NILP (coding
->character_unification_table_for_decode
)
2349 && !CHAR_TABLE_P (coding
->character_unification_table_for_decode
))
2350 coding
->character_unification_table_for_decode
= Qnil
;
2352 while (!NILP (coding
->character_unification_table_for_encode
)
2353 && SYMBOLP (coding
->character_unification_table_for_encode
))
2354 coding
->character_unification_table_for_encode
2355 = Fget (coding
->character_unification_table_for_encode
,
2356 Qcharacter_unification_table_for_encode
);
2357 if (!NILP (coding
->character_unification_table_for_encode
)
2358 && !CHAR_TABLE_P (coding
->character_unification_table_for_encode
))
2359 coding
->character_unification_table_for_encode
= Qnil
;
2361 if (!VECTORP (coding_system
)
2362 || XVECTOR (coding_system
)->size
!= 5)
2363 goto label_invalid_coding_system
;
2365 if (VECTORP (eol_type
))
2366 coding
->eol_type
= CODING_EOL_UNDECIDED
;
2367 else if (XFASTINT (eol_type
) == 1)
2368 coding
->eol_type
= CODING_EOL_CRLF
;
2369 else if (XFASTINT (eol_type
) == 2)
2370 coding
->eol_type
= CODING_EOL_CR
;
2372 coding
->eol_type
= CODING_EOL_LF
;
2374 type
= XVECTOR (coding_system
)->contents
[0];
2375 switch (XFASTINT (type
))
2378 coding
->type
= coding_type_emacs_mule
;
2382 coding
->type
= coding_type_sjis
;
2386 coding
->type
= coding_type_iso2022
;
2390 int i
, charset
, default_reg_bits
= 0;
2392 val
= XVECTOR (coding_system
)->contents
[4];
2394 if (!VECTORP (val
) || XVECTOR (val
)->size
!= 32)
2395 goto label_invalid_coding_system
;
2397 flags
= XVECTOR (val
)->contents
;
2399 = ((NILP (flags
[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM
)
2400 | (NILP (flags
[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL
)
2401 | (NILP (flags
[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL
)
2402 | (NILP (flags
[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS
)
2403 | (NILP (flags
[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT
)
2404 | (NILP (flags
[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT
)
2405 | (NILP (flags
[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN
)
2406 | (NILP (flags
[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS
)
2407 | (NILP (flags
[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION
)
2408 | (NILP (flags
[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL
)
2409 | (NILP (flags
[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL
)
2410 | (NILP (flags
[15]) ? 0 : CODING_FLAG_ISO_SAFE
)
2411 | (NILP (flags
[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA
)
2414 /* Invoke graphic register 0 to plane 0. */
2415 CODING_SPEC_ISO_INVOCATION (coding
, 0) = 0;
2416 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
2417 CODING_SPEC_ISO_INVOCATION (coding
, 1)
2418 = (coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
? -1 : 1);
2419 /* Not single shifting at first. */
2420 CODING_SPEC_ISO_SINGLE_SHIFTING (coding
) = 0;
2421 /* Beginning of buffer should also be regarded as bol. */
2422 CODING_SPEC_ISO_BOL (coding
) = 1;
2424 /* Check FLAGS[REG] (REG = 0, 1, 2, 3) and decide designations.
2425 FLAGS[REG] can be one of below:
2426 integer CHARSET: CHARSET occupies register I,
2427 t: designate nothing to REG initially, but can be used
2429 list of integer, nil, or t: designate the first
2430 element (if integer) to REG initially, the remaining
2431 elements (if integer) is designated to REG on request,
2432 if an element is t, REG can be used by any charset,
2433 nil: REG is never used. */
2434 for (charset
= 0; charset
<= MAX_CHARSET
; charset
++)
2435 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
2436 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
;
2437 bzero (CODING_SPEC_ISO_EXPECTED_CHARSETS (coding
), MAX_CHARSET
+ 1);
2438 for (i
= 0; i
< 4; i
++)
2440 if (INTEGERP (flags
[i
])
2441 && (charset
= XINT (flags
[i
]), CHARSET_VALID_P (charset
))
2442 || (charset
= get_charset_id (flags
[i
])) >= 0)
2444 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = charset
;
2445 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
) = i
;
2446 CODING_SPEC_ISO_EXPECTED_CHARSETS (coding
)[charset
] = 1;
2448 else if (EQ (flags
[i
], Qt
))
2450 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
2451 default_reg_bits
|= 1 << i
;
2453 else if (CONSP (flags
[i
]))
2455 Lisp_Object tail
= flags
[i
];
2457 if (INTEGERP (XCONS (tail
)->car
)
2458 && (charset
= XINT (XCONS (tail
)->car
),
2459 CHARSET_VALID_P (charset
))
2460 || (charset
= get_charset_id (XCONS (tail
)->car
)) >= 0)
2462 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = charset
;
2463 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
) =i
;
2464 CODING_SPEC_ISO_EXPECTED_CHARSETS (coding
)[charset
] = 1;
2467 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
2468 tail
= XCONS (tail
)->cdr
;
2469 while (CONSP (tail
))
2471 if (INTEGERP (XCONS (tail
)->car
)
2472 && (charset
= XINT (XCONS (tail
)->car
),
2473 CHARSET_VALID_P (charset
))
2474 || (charset
= get_charset_id (XCONS (tail
)->car
)) >= 0)
2476 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
2478 CODING_SPEC_ISO_EXPECTED_CHARSETS (coding
)[charset
]
2481 else if (EQ (XCONS (tail
)->car
, Qt
))
2482 default_reg_bits
|= 1 << i
;
2483 tail
= XCONS (tail
)->cdr
;
2487 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
) = -1;
2489 CODING_SPEC_ISO_DESIGNATION (coding
, i
)
2490 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding
, i
);
2493 if (! (coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
))
2495 /* REG 1 can be used only by locking shift in 7-bit env. */
2496 if (coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
)
2497 default_reg_bits
&= ~2;
2498 if (! (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
))
2499 /* Without any shifting, only REG 0 and 1 can be used. */
2500 default_reg_bits
&= 3;
2503 for (charset
= 0; charset
<= MAX_CHARSET
; charset
++)
2504 if (CHARSET_VALID_P (charset
)
2505 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
2506 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
))
2508 /* We have not yet decided where to designate CHARSET. */
2509 int reg_bits
= default_reg_bits
;
2511 if (CHARSET_CHARS (charset
) == 96)
2512 /* A charset of CHARS96 can't be designated to REG 0. */
2516 /* There exist some default graphic register. */
2517 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
2519 ? 0 : (reg_bits
& 2 ? 1 : (reg_bits
& 4 ? 2 : 3)));
2521 /* We anyway have to designate CHARSET to somewhere. */
2522 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding
, charset
)
2523 = (CHARSET_CHARS (charset
) == 94
2525 : ((coding
->flags
& CODING_FLAG_ISO_LOCKING_SHIFT
2526 || ! coding
->flags
& CODING_FLAG_ISO_SEVEN_BITS
)
2528 : (coding
->flags
& CODING_FLAG_ISO_SINGLE_SHIFT
2532 coding
->require_flushing
= 1;
2536 coding
->type
= coding_type_big5
;
2538 = (NILP (XVECTOR (coding_system
)->contents
[4])
2539 ? CODING_FLAG_BIG5_HKU
2540 : CODING_FLAG_BIG5_ETEN
);
2544 coding
->type
= coding_type_ccl
;
2546 Lisp_Object val
= XVECTOR (coding_system
)->contents
[4];
2548 && VECTORP (XCONS (val
)->car
)
2549 && VECTORP (XCONS (val
)->cdr
))
2551 setup_ccl_program (&(coding
->spec
.ccl
.decoder
), XCONS (val
)->car
);
2552 setup_ccl_program (&(coding
->spec
.ccl
.encoder
), XCONS (val
)->cdr
);
2555 goto label_invalid_coding_system
;
2557 coding
->require_flushing
= 1;
2561 coding
->type
= coding_type_raw_text
;
2566 coding
->type
= coding_type_undecided
;
2568 coding
->type
= coding_type_no_conversion
;
2573 label_invalid_coding_system
:
2574 coding
->type
= coding_type_no_conversion
;
2575 coding
->eol_type
= CODING_EOL_LF
;
2576 coding
->symbol
= coding
->pre_write_conversion
= coding
->post_read_conversion
2581 /* Emacs has a mechanism to automatically detect a coding system if it
2582 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
2583 it's impossible to distinguish some coding systems accurately
2584 because they use the same range of codes. So, at first, coding
2585 systems are categorized as follows:
2587 o coding-category-emacs-mule
2589 The category for a coding system which has the same code range
2590 as Emacs' internal format. Assigned the coding-system (Lisp
2591 symbol) `emacs-mule' by default.
2593 o coding-category-sjis
2595 The category for a coding system which has the same code range
2596 as SJIS. Assigned the coding-system (Lisp
2597 symbol) `japanese-shift-jis' by default.
2599 o coding-category-iso-7
2601 The category for a coding system which has the same code range
2602 as ISO2022 of 7-bit environment. This doesn't use any locking
2603 shift and single shift functions. Assigned the coding-system
2604 (Lisp symbol) `iso-2022-7bit' by default.
2606 o coding-category-iso-8-1
2608 The category for a coding system which has the same code range
2609 as ISO2022 of 8-bit environment and graphic plane 1 used only
2610 for DIMENSION1 charset. This doesn't use any locking shift
2611 and single shift functions. Assigned the coding-system (Lisp
2612 symbol) `iso-latin-1' by default.
2614 o coding-category-iso-8-2
2616 The category for a coding system which has the same code range
2617 as ISO2022 of 8-bit environment and graphic plane 1 used only
2618 for DIMENSION2 charset. This doesn't use any locking shift
2619 and single shift functions. Assigned the coding-system (Lisp
2620 symbol) `japanese-iso-8bit' by default.
2622 o coding-category-iso-7-else
2624 The category for a coding system which has the same code range
2625 as ISO2022 of 7-bit environment but uses locking shift or
2626 single shift functions. Assigned the coding-system (Lisp
2627 symbol) `iso-2022-7bit-lock' by default.
2629 o coding-category-iso-8-else
2631 The category for a coding system which has the same code range
2632 as ISO2022 of 8-bit environment but uses locking shift or
2633 single shift functions. Assigned the coding-system (Lisp
2634 symbol) `iso-2022-8bit-ss2' by default.
2636 o coding-category-big5
2638 The category for a coding system which has the same code range
2639 as BIG5. Assigned the coding-system (Lisp symbol)
2640 `cn-big5' by default.
2642 o coding-category-binary
2644 The category for a coding system not categorized in any of the
2645 above. Assigned the coding-system (Lisp symbol)
2646 `no-conversion' by default.
2648 Each of them is a Lisp symbol and the value is an actual
2649 `coding-system' (which is also a Lisp symbol) assigned by a user.
2650 What Emacs does actually is to detect a category of coding system.
2651 Then, it uses a `coding-system' assigned to it. If Emacs can't
2652 decide only one possible category, it selects a category of the
2653 highest priority. Priorities of categories are also specified by a
2654 user in a Lisp variable `coding-category-list'.
2658 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2659 If it detects possible coding systems, return an integer in which
2660 appropriate flag bits are set. Flag bits are defined by macros
2661 CODING_CATEGORY_MASK_XXX in `coding.h'. */
2664 detect_coding_mask (src
, src_bytes
)
2668 register unsigned char c
;
2669 unsigned char *src_end
= src
+ src_bytes
;
2672 /* At first, skip all ASCII characters and control characters except
2673 for three ISO2022 specific control characters. */
2674 label_loop_detect_coding
:
2675 while (src
< src_end
)
2679 || (c
== ISO_CODE_ESC
|| c
== ISO_CODE_SI
|| c
== ISO_CODE_SO
))
2685 /* We found nothing other than ASCII. There's nothing to do. */
2686 return CODING_CATEGORY_MASK_ANY
;
2688 /* The text seems to be encoded in some multilingual coding system.
2689 Now, try to find in which coding system the text is encoded. */
2692 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2693 /* C is an ISO2022 specific control code of C0. */
2694 mask
= detect_coding_iso2022 (src
, src_end
);
2697 /* No valid ISO2022 code follows C. Try again. */
2698 goto label_loop_detect_coding
;
2699 mask
|= CODING_CATEGORY_MASK_RAW_TEXT
;
2703 /* If C is a special latin extra code,
2704 or is an ISO2022 specific control code of C1 (SS2 or SS3),
2705 or is an ISO2022 control-sequence-introducer (CSI),
2706 we should also consider the possibility of ISO2022 codings. */
2707 if ((VECTORP (Vlatin_extra_code_table
)
2708 && !NILP (XVECTOR (Vlatin_extra_code_table
)->contents
[c
]))
2709 || (c
== ISO_CODE_SS2
|| c
== ISO_CODE_SS3
)
2710 || (c
== ISO_CODE_CSI
2713 || (src
+ 1 < src_end
2715 && (*src
== '0' || *src
== '1' || *src
== '2'))))))
2716 mask
= (detect_coding_iso2022 (src
, src_end
)
2717 | detect_coding_sjis (src
, src_end
)
2718 | detect_coding_emacs_mule (src
, src_end
)
2719 | CODING_CATEGORY_MASK_RAW_TEXT
);
2722 /* C is the first byte of SJIS character code,
2723 or a leading-code of Emacs' internal format (emacs-mule). */
2724 mask
= (detect_coding_sjis (src
, src_end
)
2725 | detect_coding_emacs_mule (src
, src_end
)
2726 | CODING_CATEGORY_MASK_RAW_TEXT
);
2729 /* C is a character of ISO2022 in graphic plane right,
2730 or a SJIS's 1-byte character code (i.e. JISX0201),
2731 or the first byte of BIG5's 2-byte code. */
2732 mask
= (detect_coding_iso2022 (src
, src_end
)
2733 | detect_coding_sjis (src
, src_end
)
2734 | detect_coding_big5 (src
, src_end
)
2735 | CODING_CATEGORY_MASK_RAW_TEXT
);
2740 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2741 The information of the detected coding system is set in CODING. */
2744 detect_coding (coding
, src
, src_bytes
)
2745 struct coding_system
*coding
;
2749 int mask
= detect_coding_mask (src
, src_bytes
);
2751 Lisp_Object val
= Vcoding_category_list
;
2753 if (mask
== CODING_CATEGORY_MASK_ANY
)
2754 /* We found nothing other than ASCII. There's nothing to do. */
2757 /* We found some plausible coding systems. Let's use a coding
2758 system of the highest priority. */
2763 idx
= XFASTINT (Fget (XCONS (val
)->car
, Qcoding_category_index
));
2764 if ((idx
< CODING_CATEGORY_IDX_MAX
) && (mask
& (1 << idx
)))
2766 val
= XCONS (val
)->cdr
;
2773 /* For unknown reason, `Vcoding_category_list' contains none of
2774 found categories. Let's use any of them. */
2775 for (idx
= 0; idx
< CODING_CATEGORY_IDX_MAX
; idx
++)
2776 if (mask
& (1 << idx
))
2779 setup_coding_system (XSYMBOL (coding_category_table
[idx
])->value
, coding
);
2782 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2783 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2784 CODING_EOL_CR, CODING_EOL_UNDECIDED, and CODING_EOL_INCONSISTENT. */
2786 #define MAX_EOL_CHECK_COUNT 3
2789 detect_eol_type (src
, src_bytes
)
2793 unsigned char *src_end
= src
+ src_bytes
;
2795 int total
= 0; /* How many end-of-lines are found so far. */
2796 int eol_type
= CODING_EOL_UNDECIDED
;
2799 while (src
< src_end
&& total
< MAX_EOL_CHECK_COUNT
)
2802 if (c
== '\n' || c
== '\r')
2806 this_eol_type
= CODING_EOL_LF
;
2807 else if (src
>= src_end
|| *src
!= '\n')
2808 this_eol_type
= CODING_EOL_CR
;
2810 this_eol_type
= CODING_EOL_CRLF
, src
++;
2812 if (eol_type
== CODING_EOL_UNDECIDED
)
2813 /* This is the first end-of-line. */
2814 eol_type
= this_eol_type
;
2815 else if (eol_type
!= this_eol_type
)
2816 /* The type found differs from what was found before.
2817 Let's notify the caller of this inconsistency. */
2818 return CODING_EOL_INCONSISTENT
;
2825 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2826 is encoded. If it detects an appropriate format of end-of-line, it
2827 sets the information in *CODING. */
2830 detect_eol (coding
, src
, src_bytes
)
2831 struct coding_system
*coding
;
2835 Lisp_Object val
, coding_system
;
2836 int eol_type
= detect_eol_type (src
, src_bytes
);
2838 if (eol_type
== CODING_EOL_UNDECIDED
)
2839 /* We found no end-of-line in the source text. */
2842 if (eol_type
== CODING_EOL_INCONSISTENT
)
2845 /* This code is suppressed until we find a better way to
2846 distinguish raw text file and binary file. */
2848 /* If we have already detected that the coding is raw-text, the
2849 coding should actually be no-conversion. */
2850 if (coding
->type
== coding_type_raw_text
)
2852 setup_coding_system (Qno_conversion
, coding
);
2855 /* Else, let's decode only text code anyway. */
2857 eol_type
= CODING_EOL_LF
;
2860 coding_system
= coding
->symbol
;
2861 while (!NILP (coding_system
)
2862 && NILP (val
= Fget (coding_system
, Qeol_type
)))
2863 coding_system
= Fget (coding_system
, Qcoding_system
);
2864 if (VECTORP (val
) && XVECTOR (val
)->size
== 3)
2865 setup_coding_system (XVECTOR (val
)->contents
[eol_type
], coding
);
2868 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
2869 decoding, it may detect coding system and format of end-of-line if
2870 those are not yet decided. */
2873 decode_coding (coding
, source
, destination
, src_bytes
, dst_bytes
, consumed
)
2874 struct coding_system
*coding
;
2875 unsigned char *source
, *destination
;
2876 int src_bytes
, dst_bytes
;
2887 if (coding
->type
== coding_type_undecided
)
2888 detect_coding (coding
, source
, src_bytes
);
2890 if (coding
->eol_type
== CODING_EOL_UNDECIDED
)
2891 detect_eol (coding
, source
, src_bytes
);
2893 coding
->carryover_size
= 0;
2894 switch (coding
->type
)
2896 case coding_type_no_conversion
:
2897 label_no_conversion
:
2898 produced
= (src_bytes
> dst_bytes
) ? dst_bytes
: src_bytes
;
2899 bcopy (source
, destination
, produced
);
2900 *consumed
= produced
;
2903 case coding_type_emacs_mule
:
2904 case coding_type_undecided
:
2905 case coding_type_raw_text
:
2906 if (coding
->eol_type
== CODING_EOL_LF
2907 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
2908 goto label_no_conversion
;
2909 produced
= decode_eol (coding
, source
, destination
,
2910 src_bytes
, dst_bytes
, consumed
);
2913 case coding_type_sjis
:
2914 produced
= decode_coding_sjis_big5 (coding
, source
, destination
,
2915 src_bytes
, dst_bytes
, consumed
,
2919 case coding_type_iso2022
:
2920 produced
= decode_coding_iso2022 (coding
, source
, destination
,
2921 src_bytes
, dst_bytes
, consumed
);
2924 case coding_type_big5
:
2925 produced
= decode_coding_sjis_big5 (coding
, source
, destination
,
2926 src_bytes
, dst_bytes
, consumed
,
2930 case coding_type_ccl
:
2931 produced
= ccl_driver (&coding
->spec
.ccl
.decoder
, source
, destination
,
2932 src_bytes
, dst_bytes
, consumed
);
2939 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
2942 encode_coding (coding
, source
, destination
, src_bytes
, dst_bytes
, consumed
)
2943 struct coding_system
*coding
;
2944 unsigned char *source
, *destination
;
2945 int src_bytes
, dst_bytes
;
2950 switch (coding
->type
)
2952 case coding_type_no_conversion
:
2953 label_no_conversion
:
2954 produced
= (src_bytes
> dst_bytes
) ? dst_bytes
: src_bytes
;
2957 bcopy (source
, destination
, produced
);
2958 if (coding
->selective
)
2960 unsigned char *p
= destination
, *pend
= destination
+ produced
;
2962 if (*p
++ == '\015') p
[-1] = '\n';
2965 *consumed
= produced
;
2968 case coding_type_emacs_mule
:
2969 case coding_type_undecided
:
2970 case coding_type_raw_text
:
2971 if (coding
->eol_type
== CODING_EOL_LF
2972 || coding
->eol_type
== CODING_EOL_UNDECIDED
)
2973 goto label_no_conversion
;
2974 produced
= encode_eol (coding
, source
, destination
,
2975 src_bytes
, dst_bytes
, consumed
);
2978 case coding_type_sjis
:
2979 produced
= encode_coding_sjis_big5 (coding
, source
, destination
,
2980 src_bytes
, dst_bytes
, consumed
,
2984 case coding_type_iso2022
:
2985 produced
= encode_coding_iso2022 (coding
, source
, destination
,
2986 src_bytes
, dst_bytes
, consumed
);
2989 case coding_type_big5
:
2990 produced
= encode_coding_sjis_big5 (coding
, source
, destination
,
2991 src_bytes
, dst_bytes
, consumed
,
2995 case coding_type_ccl
:
2996 produced
= ccl_driver (&coding
->spec
.ccl
.encoder
, source
, destination
,
2997 src_bytes
, dst_bytes
, consumed
);
3004 #define CONVERSION_BUFFER_EXTRA_ROOM 256
3006 /* Return maximum size (bytes) of a buffer enough for decoding
3007 SRC_BYTES of text encoded in CODING. */
3010 decoding_buffer_size (coding
, src_bytes
)
3011 struct coding_system
*coding
;
3016 if (coding
->type
== coding_type_iso2022
)
3018 else if (coding
->type
== coding_type_ccl
)
3019 magnification
= coding
->spec
.ccl
.decoder
.buf_magnification
;
3023 return (src_bytes
* magnification
+ CONVERSION_BUFFER_EXTRA_ROOM
);
3026 /* Return maximum size (bytes) of a buffer enough for encoding
3027 SRC_BYTES of text to CODING. */
3030 encoding_buffer_size (coding
, src_bytes
)
3031 struct coding_system
*coding
;
3036 if (coding
->type
== coding_type_ccl
)
3037 magnification
= coding
->spec
.ccl
.encoder
.buf_magnification
;
3041 return (src_bytes
* magnification
+ CONVERSION_BUFFER_EXTRA_ROOM
);
3044 #ifndef MINIMUM_CONVERSION_BUFFER_SIZE
3045 #define MINIMUM_CONVERSION_BUFFER_SIZE 1024
3048 char *conversion_buffer
;
3049 int conversion_buffer_size
;
3051 /* Return a pointer to a buffer of at least SIZE bytes to be used for encoding
3052 or decoding. Sufficient memory is allocated automatically. If we
3053 run out of memory, return NULL. */
3056 get_conversion_buffer (size
)
3059 if (size
> conversion_buffer_size
)
3062 int real_size
= conversion_buffer_size
* 2;
3064 while (real_size
< size
) real_size
*= 2;
3065 buf
= (char *) xmalloc (real_size
);
3066 xfree (conversion_buffer
);
3067 conversion_buffer
= buf
;
3068 conversion_buffer_size
= real_size
;
3070 return conversion_buffer
;
3075 /*** 7. Emacs Lisp library functions ***/
3077 DEFUN ("coding-system-spec", Fcoding_system_spec
, Scoding_system_spec
,
3079 "Return coding-spec of CODING-SYSTEM.\n\
3080 If CODING-SYSTEM is not a valid coding-system, return nil.")
3084 while (SYMBOLP (obj
) && !NILP (obj
))
3085 obj
= Fget (obj
, Qcoding_system
);
3086 return ((NILP (obj
) || !VECTORP (obj
) || XVECTOR (obj
)->size
!= 5)
3090 DEFUN ("coding-system-p", Fcoding_system_p
, Scoding_system_p
, 1, 1, 0,
3091 "Return t if OBJECT is nil or a coding-system.\n\
3092 See document of make-coding-system for coding-system object.")
3096 return ((NILP (obj
) || !NILP (Fcoding_system_spec (obj
))) ? Qt
: Qnil
);
3099 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system
,
3100 Sread_non_nil_coding_system
, 1, 1, 0,
3101 "Read a coding system from the minibuffer, prompting with string PROMPT.")
3108 val
= Fcompleting_read (prompt
, Vobarray
, Qcoding_system_spec
,
3109 Qt
, Qnil
, Qnil
, Qnil
, Qnil
);
3111 while (XSTRING (val
)->size
== 0);
3112 return (Fintern (val
, Qnil
));
3115 DEFUN ("read-coding-system", Fread_coding_system
, Sread_coding_system
, 1, 1, 0,
3116 "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
3121 val
= Fcompleting_read (prompt
, Vobarray
, Qcoding_system_p
,
3122 Qt
, Qnil
, Qcoding_system_history
, Qnil
, Qnil
);
3123 return (XSTRING (val
)->size
== 0 ? Qnil
: Fintern (val
, Qnil
));
3126 DEFUN ("check-coding-system", Fcheck_coding_system
, Scheck_coding_system
,
3128 "Check validity of CODING-SYSTEM.\n\
3129 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
3130 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
3131 The value of property should be a vector of length 5.")
3133 Lisp_Object coding_system
;
3135 CHECK_SYMBOL (coding_system
, 0);
3136 if (!NILP (Fcoding_system_p (coding_system
)))
3137 return coding_system
;
3139 Fsignal (Qcoding_system_error
, Fcons (coding_system
, Qnil
));
3142 DEFUN ("detect-coding-region", Fdetect_coding_region
, Sdetect_coding_region
,
3144 "Detect coding system of the text in the region between START and END.\n\
3145 Return a list of possible coding systems ordered by priority.\n\
3146 If only ASCII characters are found, it returns `undecided'\n\
3147 or its subsidiary coding system according to a detected end-of-line format.")
3151 int coding_mask
, eol_type
;
3155 validate_region (&b
, &e
);
3156 beg
= XINT (b
), end
= XINT (e
);
3157 if (beg
< GPT
&& end
>= GPT
) move_gap (end
);
3159 coding_mask
= detect_coding_mask (POS_ADDR (beg
), end
- beg
);
3160 eol_type
= detect_eol_type (POS_ADDR (beg
), end
- beg
);
3162 if (coding_mask
== CODING_CATEGORY_MASK_ANY
)
3165 if (eol_type
!= CODING_EOL_UNDECIDED
3166 && eol_type
!= CODING_EOL_INCONSISTENT
)
3169 val2
= Fget (Qundecided
, Qeol_type
);
3171 val
= XVECTOR (val2
)->contents
[eol_type
];
3178 /* At first, gather possible coding-systems in VAL in a reverse
3181 for (val2
= Vcoding_category_list
;
3183 val2
= XCONS (val2
)->cdr
)
3186 = XFASTINT (Fget (XCONS (val2
)->car
, Qcoding_category_index
));
3187 if (coding_mask
& (1 << idx
))
3190 /* This code is suppressed until we find a better way to
3191 distinguish raw text file and binary file. */
3193 if (idx
== CODING_CATEGORY_IDX_RAW_TEXT
3194 && eol_type
== CODING_EOL_INCONSISTENT
)
3195 val
= Fcons (Qno_conversion
, val
);
3198 val
= Fcons (Fsymbol_value (XCONS (val2
)->car
), val
);
3202 /* Then, change the order of the list, while getting subsidiary
3206 if (eol_type
== CODING_EOL_INCONSISTENT
)
3207 eol_type
== CODING_EOL_UNDECIDED
;
3208 for (; !NILP (val2
); val2
= XCONS (val2
)->cdr
)
3210 if (eol_type
== CODING_EOL_UNDECIDED
)
3211 val
= Fcons (XCONS (val2
)->car
, val
);
3215 val3
= Fget (XCONS (val2
)->car
, Qeol_type
);
3217 val
= Fcons (XVECTOR (val3
)->contents
[eol_type
], val
);
3219 val
= Fcons (XCONS (val2
)->car
, val
);
3227 /* Scan text in the region between *BEGP and *ENDP, skip characters
3228 which we never have to encode to (iff ENCODEP is 1) or decode from
3229 coding system CODING at the head and tail, then set BEGP and ENDP
3230 to the addresses of start and end of the text we actually convert. */
3233 shrink_conversion_area (begp
, endp
, coding
, encodep
)
3234 unsigned char **begp
, **endp
;
3235 struct coding_system
*coding
;
3238 register unsigned char *beg_addr
= *begp
, *end_addr
= *endp
;
3240 if (coding
->eol_type
!= CODING_EOL_LF
3241 && coding
->eol_type
!= CODING_EOL_UNDECIDED
)
3242 /* Since we anyway have to convert end-of-line format, it is not
3243 worth skipping at most 100 bytes or so. */
3246 if (encodep
) /* for encoding */
3248 switch (coding
->type
)
3250 case coding_type_no_conversion
:
3251 case coding_type_emacs_mule
:
3252 case coding_type_undecided
:
3253 case coding_type_raw_text
:
3254 /* We need no conversion. */
3257 case coding_type_ccl
:
3258 /* We can't skip any data. */
3260 case coding_type_iso2022
:
3261 if (coding
->flags
& CODING_FLAG_ISO_DESIGNATE_AT_BOL
)
3263 unsigned char *bol
= beg_addr
;
3264 while (beg_addr
< end_addr
&& *beg_addr
< 0x80)
3267 if (*(beg_addr
- 1) == '\n')
3271 goto label_skip_tail
;
3275 /* We can skip all ASCII characters at the head and tail. */
3276 while (beg_addr
< end_addr
&& *beg_addr
< 0x80) beg_addr
++;
3278 while (beg_addr
< end_addr
&& *(end_addr
- 1) < 0x80) end_addr
--;
3282 else /* for decoding */
3284 switch (coding
->type
)
3286 case coding_type_no_conversion
:
3287 /* We need no conversion. */
3290 case coding_type_emacs_mule
:
3291 case coding_type_raw_text
:
3292 if (coding
->eol_type
== CODING_EOL_LF
)
3294 /* We need no conversion. */
3298 /* We can skip all but carriage-return. */
3299 while (beg_addr
< end_addr
&& *beg_addr
!= '\r') beg_addr
++;
3300 while (beg_addr
< end_addr
&& *(end_addr
- 1) != '\r') end_addr
--;
3302 case coding_type_sjis
:
3303 case coding_type_big5
:
3304 /* We can skip all ASCII characters at the head. */
3305 while (beg_addr
< end_addr
&& *beg_addr
< 0x80) beg_addr
++;
3306 /* We can skip all ASCII characters at the tail except for
3307 the second byte of SJIS or BIG5 code. */
3308 while (beg_addr
< end_addr
&& *(end_addr
- 1) < 0x80) end_addr
--;
3309 if (end_addr
!= *endp
)
3312 case coding_type_ccl
:
3313 /* We can't skip any data. */
3315 default: /* i.e. case coding_type_iso2022: */
3319 /* We can skip all ASCII characters except for a few
3320 control codes at the head. */
3321 while (beg_addr
< end_addr
&& (c
= *beg_addr
) < 0x80
3322 && c
!= ISO_CODE_CR
&& c
!= ISO_CODE_SO
3323 && c
!= ISO_CODE_SI
&& c
!= ISO_CODE_ESC
)
3334 /* Encode to (iff ENCODEP is 1) or decode from coding system CODING the
3335 text between B and E. B and E are buffer positions. */
3338 code_convert_region (b
, e
, coding
, encodep
)
3340 struct coding_system
*coding
;
3343 int beg
, end
, len
, consumed
, produced
;
3345 unsigned char *begp
, *endp
;
3348 validate_region (&b
, &e
);
3349 beg
= XINT (b
), end
= XINT (e
);
3350 if (beg
< GPT
&& end
>= GPT
)
3353 if (encodep
&& !NILP (coding
->pre_write_conversion
))
3355 /* We must call a pre-conversion function which may put a new
3356 text to be converted in a new buffer. */
3357 struct buffer
*old
= current_buffer
, *new;
3360 call2 (coding
->pre_write_conversion
, b
, e
);
3361 if (old
!= current_buffer
)
3363 /* Replace the original text by the text just generated. */
3365 new = current_buffer
;
3366 set_buffer_internal (old
);
3367 del_range (beg
, end
);
3368 insert_from_buffer (new, 1, len
, 0);
3373 /* We may be able to shrink the conversion region. */
3374 begp
= POS_ADDR (beg
); endp
= begp
+ (end
- beg
);
3375 shrink_conversion_area (&begp
, &endp
, coding
, encodep
);
3378 /* We need no conversion. */
3382 beg
+= begp
- POS_ADDR (beg
);
3383 end
= beg
+ (endp
- begp
);
3386 len
= encoding_buffer_size (coding
, end
- beg
);
3388 len
= decoding_buffer_size (coding
, end
- beg
);
3389 buf
= get_conversion_buffer (len
);
3391 coding
->last_block
= 1;
3393 ? encode_coding (coding
, POS_ADDR (beg
), buf
, end
- beg
, len
,
3395 : decode_coding (coding
, POS_ADDR (beg
), buf
, end
- beg
, len
,
3398 len
= produced
+ (beg
- XINT (b
)) + (XINT (e
) - end
);
3401 insert (buf
, produced
);
3402 del_range (PT
, PT
+ end
- beg
);
3404 pos
= PT
+ (pos
- end
);
3410 if (!encodep
&& !NILP (coding
->post_read_conversion
))
3412 /* We must call a post-conversion function which may alter
3413 the text just converted. */
3418 insval
= call1 (coding
->post_read_conversion
, make_number (len
));
3419 CHECK_NUMBER (insval
, 0);
3420 len
= XINT (insval
);
3423 return make_number (len
);
/* Encode (if ENCODEP is nonzero) or decode (otherwise) the Lisp string
   STR with the coding system CODING and return the result.

   When the relevant hook is non-nil (pre_write_conversion for
   encoding, post_read_conversion for decoding), STR is inserted into
   the buffer set up by temp_output_buffer_setup, code_convert_region
   is run over BEGV..ZV there, and the buffer text is returned through
   make_buffer_string.

   Otherwise shrink_conversion_area trims the part that needs no
   conversion.  On the no-conversion path STR itself is returned when
   NOCOPY is non-nil, else a copy made by Fcopy_sequence.  In the
   remaining case the skipped head and tail are copied verbatim around
   the converted middle and a new string is made from the result.

   NOTE(review): the embedded line numbers skip in places, so some
   statements are not visible in this listing (for example the
   declarations of buf and result, the test guarding the
   no-conversion return, and the assignment to produced).  */
3427 code_convert_string (str
, coding
, encodep
, nocopy
)
3428 Lisp_Object str
, nocopy
;
3429 struct coding_system
*coding
;
3432 int len
, consumed
, produced
;
3434 unsigned char *begp
, *endp
;
3435 int head_skip
, tail_skip
;
3436 struct gcpro gcpro1
;
3438 if (encodep
&& !NILP (coding
->pre_write_conversion
)
3439 || !encodep
&& !NILP (coding
->post_read_conversion
))
3441 /* Since we have to call Lisp functions which assume target text
3442 is in a buffer, after setting a temporary buffer, call
3443 code_convert_region. */
3444 int count
= specpdl_ptr
- specpdl
;
3445 int len
= XSTRING (str
)->size
;
3447 struct buffer
*old
= current_buffer
;
/* Register Fset_buffer with the buffer that is current now as an
   unwind action; unbind_to (count, ...) below runs it.  */
3449 record_unwind_protect (Fset_buffer
, Fcurrent_buffer ());
3450 temp_output_buffer_setup (" *code-converting-work*");
3451 set_buffer_internal (XBUFFER (Vstandard_output
));
3452 insert_from_string (str
, 0, len
, 0);
3453 code_convert_region (make_number (BEGV
), make_number (ZV
),
3455 result
= make_buffer_string (BEGV
, ZV
, 0);
3456 set_buffer_internal (old
);
3457 return unbind_to (count
, result
);
3460 /* We may be able to shrink the conversion region. */
3461 begp
= XSTRING (str
)->data
;
3462 endp
= begp
+ XSTRING (str
)->size
;
3463 shrink_conversion_area (&begp
, &endp
, coding
, encodep
);
3466 /* We need no conversion. */
3467 return (NILP (nocopy
) ? Fcopy_sequence (str
) : str
);
/* HEAD_SKIP and TAIL_SKIP are the byte counts trimmed from the front
   and the back of STR by shrink_conversion_area.  */
3469 head_skip
= begp
- XSTRING (str
)->data
;
3470 tail_skip
= XSTRING (str
)->size
- head_skip
- (endp
- begp
);
3475 len
= encoding_buffer_size (coding
, endp
- begp
);
3477 len
= decoding_buffer_size (coding
, endp
- begp
);
/* Ask for room for the converted middle plus the untouched head and
   tail.  */
3478 buf
= get_conversion_buffer (len
+ head_skip
+ tail_skip
);
3480 bcopy (XSTRING (str
)->data
, buf
, head_skip
);
3481 coding
->last_block
= 1;
3483 ? encode_coding (coding
, XSTRING (str
)->data
+ head_skip
,
3484 buf
+ head_skip
, endp
- begp
, len
, &consumed
)
3485 : decode_coding (coding
, XSTRING (str
)->data
+ head_skip
,
3486 buf
+ head_skip
, endp
- begp
, len
, &consumed
));
/* Copy the bytes of STR that follow the converted middle to just
   after the PRODUCED converted bytes.  NOTE(review): the length
   argument of this bcopy call is not visible in this listing.  */
3487 bcopy (XSTRING (str
)->data
+ head_skip
+ (endp
- begp
),
3488 buf
+ head_skip
+ produced
,
3493 return make_string (buf
, head_skip
+ produced
+ tail_skip
);
/* Lisp primitive decode-coding-region.

   B and E are checked with CHECK_NUMBER_COERCE_MARKER and
   CODING_SYSTEM with CHECK_SYMBOL.  For a nil CODING_SYSTEM the value
   is E - B and code_convert_region is not called.  Otherwise the
   coding system is set up (an error is signalled when
   setup_coding_system returns a negative value) and the region is
   converted by code_convert_region with ENCODEP = 0.

   NOTE(review): on the nil path B and E are not put in order, so the
   value looks negative when B is greater than E -- confirm whether
   callers rely on that.  */
3496 DEFUN ("decode-coding-region", Fdecode_coding_region
, Sdecode_coding_region
,
3497 3, 3, "r\nzCoding system: ",
3498 "Decode current region by specified coding system.\n\
3499 When called from a program, takes three arguments:\n\
3500 START, END, and CODING-SYSTEM. START END are buffer positions.\n\
3501 Return length of decoded text.")
3502 (b
, e
, coding_system
)
3503 Lisp_Object b
, e
, coding_system
;
3505 struct coding_system coding
;
3507 CHECK_NUMBER_COERCE_MARKER (b
, 0);
3508 CHECK_NUMBER_COERCE_MARKER (e
, 1);
3509 CHECK_SYMBOL (coding_system
, 2);
3511 if (NILP (coding_system
))
3512 return make_number (XFASTINT (e
) - XFASTINT (b
));
3513 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
3514 error ("Invalid coding-system: %s", XSYMBOL (coding_system
)->name
->data
);
3516 return code_convert_region (b
, e
, &coding
, 0);
/* Lisp primitive encode-coding-region.

   B and E are checked with CHECK_NUMBER_COERCE_MARKER and
   CODING_SYSTEM with CHECK_SYMBOL.  For a nil CODING_SYSTEM the value
   is E - B and code_convert_region is not called.  Otherwise the
   coding system is set up (an error is signalled when
   setup_coding_system returns a negative value) and the region is
   converted by code_convert_region with ENCODEP = 1.

   NOTE(review): on the nil path B and E are not put in order, so the
   value looks negative when B is greater than E -- confirm whether
   callers rely on that.  */
3519 DEFUN ("encode-coding-region", Fencode_coding_region
, Sencode_coding_region
,
3520 3, 3, "r\nzCoding system: ",
3521 "Encode current region by specified coding system.\n\
3522 When called from a program, takes three arguments:\n\
3523 START, END, and CODING-SYSTEM. START END are buffer positions.\n\
3524 Return length of encoded text.")
3525 (b
, e
, coding_system
)
3526 Lisp_Object b
, e
, coding_system
;
3528 struct coding_system coding
;
3530 CHECK_NUMBER_COERCE_MARKER (b
, 0);
3531 CHECK_NUMBER_COERCE_MARKER (e
, 1);
3532 CHECK_SYMBOL (coding_system
, 2);
3534 if (NILP (coding_system
))
3535 return make_number (XFASTINT (e
) - XFASTINT (b
));
3536 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
3537 error ("Invalid coding-system: %s", XSYMBOL (coding_system
)->name
->data
);
3539 return code_convert_region (b
, e
, &coding
, 1);
/* Lisp primitive decode-coding-string.

   STRING is checked with CHECK_STRING and CODING_SYSTEM with
   CHECK_SYMBOL.  For a nil CODING_SYSTEM the value is STRING itself
   when NOCOPY is non-nil, else a copy made by Fcopy_sequence.
   Otherwise the coding system is set up (an error is signalled when
   setup_coding_system returns a negative value) and the work is done
   by code_convert_string with ENCODEP = 0, passing NOCOPY through.

   NOTE(review): the embedded line numbers skip in places; the line
   giving the argument counts and the last line of the docstring are
   not visible in this listing.  */
3542 DEFUN ("decode-coding-string", Fdecode_coding_string
, Sdecode_coding_string
,
3544 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3545 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3547 (string
, coding_system
, nocopy
)
3548 Lisp_Object string
, coding_system
, nocopy
;
3550 struct coding_system coding
;
3552 CHECK_STRING (string
, 0);
3553 CHECK_SYMBOL (coding_system
, 1);
3555 if (NILP (coding_system
))
3556 return (NILP (nocopy
) ? Fcopy_sequence (string
) : string
);
3557 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
3558 error ("Invalid coding-system: %s", XSYMBOL (coding_system
)->name
->data
);
3560 return code_convert_string (string
, &coding
, 0, nocopy
);
/* Lisp primitive encode-coding-string.

   STRING is checked with CHECK_STRING and CODING_SYSTEM with
   CHECK_SYMBOL.  For a nil CODING_SYSTEM the value is STRING itself
   when NOCOPY is non-nil, else a copy made by Fcopy_sequence.
   Otherwise the coding system is set up (an error is signalled when
   setup_coding_system returns a negative value) and the work is done
   by code_convert_string with ENCODEP = 1, passing NOCOPY through.

   NOTE(review): the embedded line numbers skip in places; the line
   giving the argument counts and the last line of the docstring are
   not visible in this listing.  */
3563 DEFUN ("encode-coding-string", Fencode_coding_string
, Sencode_coding_string
,
3565 "Encode STRING to CODING-SYSTEM, and return the result.\n\
3566 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3568 (string
, coding_system
, nocopy
)
3569 Lisp_Object string
, coding_system
, nocopy
;
3571 struct coding_system coding
;
3573 CHECK_STRING (string
, 0);
3574 CHECK_SYMBOL (coding_system
, 1);
3576 if (NILP (coding_system
))
3577 return (NILP (nocopy
) ? Fcopy_sequence (string
) : string
);
3578 if (setup_coding_system (Fcheck_coding_system (coding_system
), &coding
) < 0)
3579 error ("Invalid coding-system: %s", XSYMBOL (coding_system
)->name
->data
);
3581 return code_convert_string (string
, &coding
, 1, nocopy
);
/* Lisp primitive decode-sjis-char.

   CODE is checked with CHECK_NUMBER and split into S1 (CODE >> 8) and
   S2 (CODE & 0xFF).  Both are unsigned char, so S1 keeps only the low
   eight bits of the shifted value.  DECODE_SJIS turns S1, S2 into
   C1, C2, and VAL is set to the charset_jisx0208 character built from
   them by MAKE_NON_ASCII_CHAR.

   NOTE(review): the argument list, the declaration of val and the
   return statement are not visible in this listing.  */
3584 DEFUN ("decode-sjis-char", Fdecode_sjis_char
, Sdecode_sjis_char
, 1, 1, 0,
3585 "Decode a JISX0208 character of shift-jis encoding.\n\
3586 CODE is the character code in SJIS.\n\
3587 Return the corresponding character.")
3591 unsigned char c1
, c2
, s1
, s2
;
3594 CHECK_NUMBER (code
, 0);
3595 s1
= (XFASTINT (code
)) >> 8, s2
= (XFASTINT (code
)) & 0xFF;
3596 DECODE_SJIS (s1
, s2
, c1
, c2
);
3597 XSETFASTINT (val
, MAKE_NON_ASCII_CHAR (charset_jisx0208
, c1
, c2
));
/* Lisp primitive encode-sjis-char.

   CH is checked with CHECK_NUMBER and split by SPLIT_CHAR into
   CHARSET, C1 and C2.  When CHARSET is charset_jisx0208, ENCODE_SJIS
   produces S1 and S2 and VAL is set to (S1 << 8) | S2.  VAL is also
   set to 0 by a later statement, presumably in the else branch.

   NOTE(review): the argument list, the declaration of val, the else
   line and the return statement are not visible in this listing.  */
3601 DEFUN ("encode-sjis-char", Fencode_sjis_char
, Sencode_sjis_char
, 1, 1, 0,
3602 "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3603 Return the corresponding character code in SJIS.")
3607 int charset
, c1
, c2
, s1
, s2
;
3610 CHECK_NUMBER (ch
, 0);
3611 SPLIT_CHAR (XFASTINT (ch
), charset
, c1
, c2
);
3612 if (charset
== charset_jisx0208
)
3614 ENCODE_SJIS (c1
, c2
, s1
, s2
);
3615 XSETFASTINT (val
, (s1
<< 8) | s2
);
3618 XSETFASTINT (val
, 0);
/* Lisp primitive decode-big5-char.

   CODE is checked with CHECK_NUMBER and split into B1 (CODE >> 8) and
   B2 (CODE & 0xFF).  Both are unsigned char, so B1 keeps only the low
   eight bits of the shifted value.  DECODE_BIG5 turns B1, B2 into
   CHARSET, C1 and C2, and VAL is set to the character built from
   them by MAKE_NON_ASCII_CHAR.

   NOTE(review): the argument list, the declarations of charset and
   val, and the return statement are not visible in this listing.  */
3622 DEFUN ("decode-big5-char", Fdecode_big5_char
, Sdecode_big5_char
, 1, 1, 0,
3623 "Decode a Big5 character CODE of BIG5 coding-system.\n\
3624 CODE is the character code in BIG5.\n\
3625 Return the corresponding character.")
3630 unsigned char b1
, b2
, c1
, c2
;
3633 CHECK_NUMBER (code
, 0);
3634 b1
= (XFASTINT (code
)) >> 8, b2
= (XFASTINT (code
)) & 0xFF;
3635 DECODE_BIG5 (b1
, b2
, charset
, c1
, c2
);
3636 XSETFASTINT (val
, MAKE_NON_ASCII_CHAR (charset
, c1
, c2
));
/* Lisp primitive encode-big5-char.

   CH is checked with CHECK_NUMBER and split by SPLIT_CHAR into
   CHARSET, C1 and C2.  When CHARSET is charset_big5_1 or
   charset_big5_2, ENCODE_BIG5 produces B1 and B2 and VAL is set to
   (B1 << 8) | B2.  VAL is also set to 0 by a later statement,
   presumably in the else branch.

   NOTE(review): the argument list, the declaration of val, the else
   line and the return statement are not visible in this listing.  */
3640 DEFUN ("encode-big5-char", Fencode_big5_char
, Sencode_big5_char
, 1, 1, 0,
3641 "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3642 Return the corresponding character code in Big5.")
3646 int charset
, c1
, c2
, b1
, b2
;
3649 CHECK_NUMBER (ch
, 0);
3650 SPLIT_CHAR (XFASTINT (ch
), charset
, c1
, c2
);
3651 if (charset
== charset_big5_1
|| charset
== charset_big5_2
)
3653 ENCODE_BIG5 (charset
, c1
, c2
, b1
, b2
);
3654 XSETFASTINT (val
, (b1
<< 8) | b2
);
3657 XSETFASTINT (val
, 0);
/* Lisp primitive set-terminal-coding-system-internal.

   CODING_SYSTEM is checked with CHECK_SYMBOL, passed through
   Fcheck_coding_system, and used to fill in terminal_coding with
   setup_coding_system.  CODING_FLAG_ISO_SAFE is then turned on in
   terminal_coding.flags.  The value returned by setup_coding_system
   is not examined here.

   NOTE(review): the argument list and the return statement are not
   visible in this listing.  */
3661 DEFUN ("set-terminal-coding-system-internal",
3662 Fset_terminal_coding_system_internal
,
3663 Sset_terminal_coding_system_internal
, 1, 1, 0, "")
3665 Lisp_Object coding_system
;
3667 CHECK_SYMBOL (coding_system
, 0);
3668 setup_coding_system (Fcheck_coding_system (coding_system
), &terminal_coding
);
3669 /* We had better not send unexpected characters to terminal. */
3670 terminal_coding
.flags
|= CODING_FLAG_ISO_SAFE
;
/* Lisp primitive set-safe-terminal-coding-system-internal.

   CODING_SYSTEM is checked with CHECK_SYMBOL, passed through
   Fcheck_coding_system, and used to fill in safe_terminal_coding with
   setup_coding_system.  The value returned by setup_coding_system is
   not examined here.

   NOTE(review): the argument list and the return statement are not
   visible in this listing.  */
3675 DEFUN ("set-safe-terminal-coding-system-internal",
3676 Fset_safe_terminal_coding_system_internal
,
3677 Sset_safe_terminal_coding_system_internal
, 1, 1, 0, "")
3679 Lisp_Object coding_system
;
3681 CHECK_SYMBOL (coding_system
, 0);
3682 setup_coding_system (Fcheck_coding_system (coding_system
),
3683 &safe_terminal_coding
);
/* Lisp primitive terminal-coding-system.  Takes no arguments and
   returns the symbol member of terminal_coding.  */
3687 DEFUN ("terminal-coding-system",
3688 Fterminal_coding_system
, Sterminal_coding_system
, 0, 0, 0,
3689 "Return coding-system of your terminal.")
3692 return terminal_coding
.symbol
;
/* Lisp primitive set-keyboard-coding-system-internal.

   CODING_SYSTEM is checked with CHECK_SYMBOL, passed through
   Fcheck_coding_system, and used to fill in keyboard_coding with
   setup_coding_system.  The value returned by setup_coding_system is
   not examined here.

   NOTE(review): the argument list and the return statement are not
   visible in this listing.  */
3695 DEFUN ("set-keyboard-coding-system-internal",
3696 Fset_keyboard_coding_system_internal
,
3697 Sset_keyboard_coding_system_internal
, 1, 1, 0, "")
3699 Lisp_Object coding_system
;
3701 CHECK_SYMBOL (coding_system
, 0);
3702 setup_coding_system (Fcheck_coding_system (coding_system
), &keyboard_coding
);
/* Lisp primitive keyboard-coding-system.  Takes no arguments and
   returns the symbol member of keyboard_coding.  */
3706 DEFUN ("keyboard-coding-system",
3707 Fkeyboard_coding_system
, Skeyboard_coding_system
, 0, 0, 0,
3708 "Return coding-system of what is sent from terminal keyboard.")
3711 return keyboard_coding
.symbol
;
/* Lisp primitive find-operation-coding-system (1 to MANY arguments,
   received as NARGS and ARGS).

   ARGS[0] is the operation; it must be a symbol whose Qtarget_idx
   property is an integer.  The target is ARGS[target_idx + 1] and
   must be a string, or an integer when the operation is
   open-network-stream.  The alist searched is
   Vfile_coding_system_alist for insert-file-contents and
   write-region, Vnetwork_coding_system_alist for open-network-stream,
   and Vprocess_coding_system_alist for everything else.  An element
   matches when its car is a string that fast_string_match finds in a
   string target, or when its car is EQ to an integer target.  For the
   cdr VAL of a matching element, a coding system yields (VAL . VAL),
   and an fbound symbol is called with the list of all arguments.

   NOTE(review): the embedded line numbers skip in places; the
   argument list, the test guarding the first error call, the
   declaration of elt, part of the match condition and several return
   statements are not visible in this listing.  */
3715 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system
,
3716 Sfind_operation_coding_system
, 1, MANY
, 0,
3717 "Choose a coding system for an operation based on the target name.\n\
3718 The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
3719 DECODING-SYSTEM is the coding system to use for decoding\n\
3720 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
3721 for encoding (in case OPERATION does encoding).\n\
3723 The first argument OPERATION specifies an I/O primitive:\n\
3724 For file I/O, `insert-file-contents' or `write-region'.\n\
3725 For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3726 For network I/O, `open-network-stream'.\n\
3728 The remaining arguments should be the same arguments that were passed\n\
3729 to the primitive. Depending on which primitive, one of those arguments\n\
3730 is selected as the TARGET. For example, if OPERATION does file I/O,\n\
3731 whichever argument specifies the file name is TARGET.\n\
3733 TARGET has a meaning which depends on OPERATION:\n\
3734 For file I/O, TARGET is a file name.\n\
3735 For process I/O, TARGET is a process name.\n\
3736 For network I/O, TARGET is a service name or a port number\n\
3738 This function looks up what specified for TARGET in,\n\
3739 `file-coding-system-alist', `process-coding-system-alist',\n\
3740 or `network-coding-system-alist' depending on OPERATION.\n\
3741 They may specify a coding system, a cons of coding systems,\n\
3742 or a function symbol to call.\n\
3743 In the last case, we call the function with one argument,\n\
3744 which is a list of all the arguments given to this function.")
3749 Lisp_Object operation
, target_idx
, target
, val
;
3750 register Lisp_Object chain
;
3753 error ("Too few arguments");
3754 operation
= args
[0];
/* NOTE(review): the word argument is misspelled in the error message
   below; it is a runtime string and is left unchanged here.  */
3755 if (!SYMBOLP (operation
)
3756 || !INTEGERP (target_idx
= Fget (operation
, Qtarget_idx
)))
3757 error ("Invalid first arguement");
/* NOTE(review): this test lets NARGS == XINT (target_idx) + 1 through,
   yet the assignment to target below reads
   args[XINT (target_idx) + 1], which would then be one element past
   the NARGS arguments.  The comparison probably should be <=;
   confirm against callers.  */
3758 if (nargs
< 1 + XINT (target_idx
))
3759 error ("Too few arguments for operation: %s",
3760 XSYMBOL (operation
)->name
->data
);
3761 target
= args
[XINT (target_idx
) + 1];
3762 if (!(STRINGP (target
)
3763 || (EQ (operation
, Qopen_network_stream
) && INTEGERP (target
))))
3764 error ("Invalid %dth argument", XINT (target_idx
) + 1);
/* Pick the alist to search according to the operation.  */
3766 chain
= ((EQ (operation
, Qinsert_file_contents
)
3767 || EQ (operation
, Qwrite_region
))
3768 ? Vfile_coding_system_alist
3769 : (EQ (operation
, Qopen_network_stream
)
3770 ? Vnetwork_coding_system_alist
3771 : Vprocess_coding_system_alist
));
/* Walk the alist one cons at a time; ELT is the current element.  */
3775 for (; CONSP (chain
); chain
= XCONS (chain
)->cdr
)
3778 elt
= XCONS (chain
)->car
;
3781 && ((STRINGP (target
)
3782 && STRINGP (XCONS (elt
)->car
)
3783 && fast_string_match (XCONS (elt
)->car
, target
) >= 0)
3784 || (INTEGERP (target
) && EQ (target
, XCONS (elt
)->car
))))
3786 val
= XCONS (elt
)->cdr
;
3789 if (! SYMBOLP (val
))
3791 if (! NILP (Fcoding_system_p (val
)))
3792 return Fcons (val
, val
);
3793 if (!NILP (Ffboundp (val
)))
3794 return call1 (val
, Flist (nargs
, args
));
3804 /*** 8. Post-amble ***/
/* One-time initialisation of the byte classification tables, the
   conversion buffer and the default terminal/keyboard coding systems.

   The statements below fill emacs_code_class and iso_code_class,
   allocate conversion_buffer with MINIMUM_CONVERSION_BUFFER_SIZE
   bytes, set up keyboard_coding, terminal_coding and
   safe_terminal_coding from Qnil, and assign system_eol_type.

   NOTE(review): the header of the enclosing function, the
   declaration of i, and the #else / #endif lines of the final
   conditional are not visible in this listing.  */
3810 /* Emacs' internal format specific initialize routine. */
/* Bytes 0x00..0x20 start as control codes; 0x0A and 0x0D are then
   given their own classes.  */
3811 for (i
= 0; i
<= 0x20; i
++)
3812 emacs_code_class
[i
] = EMACS_control_code
;
3813 emacs_code_class
[0x0A] = EMACS_linefeed_code
;
3814 emacs_code_class
[0x0D] = EMACS_carriage_return_code
;
3815 for (i
= 0x21 ; i
< 0x7F; i
++)
3816 emacs_code_class
[i
] = EMACS_ascii_code
;
3817 emacs_code_class
[0x7F] = EMACS_control_code
;
3818 emacs_code_class
[0x80] = EMACS_leading_code_composition
;
/* Bytes 0x81..0xFE default to invalid; the four private leading codes
   are overridden just after.  Index 0xFF is not assigned by the
   statements visible here.  */
3819 for (i
= 0x81; i
< 0xFF; i
++)
3820 emacs_code_class
[i
] = EMACS_invalid_code
;
3821 emacs_code_class
[LEADING_CODE_PRIVATE_11
] = EMACS_leading_code_3
;
3822 emacs_code_class
[LEADING_CODE_PRIVATE_12
] = EMACS_leading_code_3
;
3823 emacs_code_class
[LEADING_CODE_PRIVATE_21
] = EMACS_leading_code_4
;
3824 emacs_code_class
[LEADING_CODE_PRIVATE_22
] = EMACS_leading_code_4
;
3826 /* ISO2022 specific initialize routine. */
/* Ranges first (0x00..0x1F and 0x80..0x9F control, 0x21..0x7E and
   0xA1..0xFE graphic), then the four boundary bytes, then the
   individual control codes, which override the range defaults.  */
3827 for (i
= 0; i
< 0x20; i
++)
3828 iso_code_class
[i
] = ISO_control_code
;
3829 for (i
= 0x21; i
< 0x7F; i
++)
3830 iso_code_class
[i
] = ISO_graphic_plane_0
;
3831 for (i
= 0x80; i
< 0xA0; i
++)
3832 iso_code_class
[i
] = ISO_control_code
;
3833 for (i
= 0xA1; i
< 0xFF; i
++)
3834 iso_code_class
[i
] = ISO_graphic_plane_1
;
3835 iso_code_class
[0x20] = iso_code_class
[0x7F] = ISO_0x20_or_0x7F
;
3836 iso_code_class
[0xA0] = iso_code_class
[0xFF] = ISO_0xA0_or_0xFF
;
3837 iso_code_class
[ISO_CODE_CR
] = ISO_carriage_return
;
3838 iso_code_class
[ISO_CODE_SO
] = ISO_shift_out
;
3839 iso_code_class
[ISO_CODE_SI
] = ISO_shift_in
;
3840 iso_code_class
[ISO_CODE_SS2_7
] = ISO_single_shift_2_7
;
3841 iso_code_class
[ISO_CODE_ESC
] = ISO_escape
;
3842 iso_code_class
[ISO_CODE_SS2
] = ISO_single_shift_2
;
3843 iso_code_class
[ISO_CODE_SS3
] = ISO_single_shift_3
;
3844 iso_code_class
[ISO_CODE_CSI
] = ISO_control_sequence_introducer
;
/* Allocate the shared conversion buffer at its minimum size and
   record that size.  */
3846 conversion_buffer_size
= MINIMUM_CONVERSION_BUFFER_SIZE
;
3847 conversion_buffer
= (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE
);
3849 setup_coding_system (Qnil
, &keyboard_coding
);
3850 setup_coding_system (Qnil
, &terminal_coding
);
3851 setup_coding_system (Qnil
, &safe_terminal_coding
);
/* system_eol_type is CODING_EOL_CRLF when MSDOS or WINDOWSNT is
   defined; the second assignment presumably sits in the #else branch,
   which is not visible here.  */
3853 #if defined (MSDOS) || defined (WINDOWSNT)
3854 system_eol_type
= CODING_EOL_CRLF
;
3856 system_eol_type
= CODING_EOL_LF
;
3864 Qtarget_idx
= intern ("target-idx");
3865 staticpro (&Qtarget_idx
);
3867 Qcoding_system_history
= intern ("coding-system-history");
3868 staticpro (&Qcoding_system_history
);
3869 Fset (Qcoding_system_history
, Qnil
);
3871 /* Target FILENAME is the first argument. */
3872 Fput (Qinsert_file_contents
, Qtarget_idx
, make_number (0));
3873 /* Target FILENAME is the third argument. */
3874 Fput (Qwrite_region
, Qtarget_idx
, make_number (2));
3876 Qcall_process
= intern ("call-process");
3877 staticpro (&Qcall_process
);
3878 /* Target PROGRAM is the first argument. */
3879 Fput (Qcall_process
, Qtarget_idx
, make_number (0));
3881 Qcall_process_region
= intern ("call-process-region");
3882 staticpro (&Qcall_process_region
);
3883 /* Target PROGRAM is the third argument. */
3884 Fput (Qcall_process_region
, Qtarget_idx
, make_number (2));
3886 Qstart_process
= intern ("start-process");
3887 staticpro (&Qstart_process
);
3888 /* Target PROGRAM is the third argument. */
3889 Fput (Qstart_process
, Qtarget_idx
, make_number (2));
3891 Qopen_network_stream
= intern ("open-network-stream");
3892 staticpro (&Qopen_network_stream
);
3893 /* Target SERVICE is the fourth argument. */
3894 Fput (Qopen_network_stream
, Qtarget_idx
, make_number (3));
3896 Qcoding_system
= intern ("coding-system");
3897 staticpro (&Qcoding_system
);
3899 Qeol_type
= intern ("eol-type");
3900 staticpro (&Qeol_type
);
3902 Qbuffer_file_coding_system
= intern ("buffer-file-coding-system");
3903 staticpro (&Qbuffer_file_coding_system
);
3905 Qpost_read_conversion
= intern ("post-read-conversion");
3906 staticpro (&Qpost_read_conversion
);
3908 Qpre_write_conversion
= intern ("pre-write-conversion");
3909 staticpro (&Qpre_write_conversion
);
3911 Qno_conversion
= intern ("no-conversion");
3912 staticpro (&Qno_conversion
);
3914 Qundecided
= intern ("undecided");
3915 staticpro (&Qundecided
);
3917 Qcoding_system_spec
= intern ("coding-system-spec");
3918 staticpro (&Qcoding_system_spec
);
3920 Qcoding_system_p
= intern ("coding-system-p");
3921 staticpro (&Qcoding_system_p
);
3923 Qcoding_system_error
= intern ("coding-system-error");
3924 staticpro (&Qcoding_system_error
);
3926 Fput (Qcoding_system_error
, Qerror_conditions
,
3927 Fcons (Qcoding_system_error
, Fcons (Qerror
, Qnil
)));
3928 Fput (Qcoding_system_error
, Qerror_message
,
3929 build_string ("Invalid coding system"));
3931 Qcoding_category_index
= intern ("coding-category-index");
3932 staticpro (&Qcoding_category_index
);
3936 for (i
= 0; i
< CODING_CATEGORY_IDX_MAX
; i
++)
3938 coding_category_table
[i
] = intern (coding_category_name
[i
]);
3939 staticpro (&coding_category_table
[i
]);
3940 Fput (coding_category_table
[i
], Qcoding_category_index
,
3945 Qcharacter_unification_table
= intern ("character-unification-table");
3946 staticpro (&Qcharacter_unification_table
);
3947 Fput (Qcharacter_unification_table
, Qchar_table_extra_slots
,
3950 Qcharacter_unification_table_for_decode
3951 = intern ("character-unification-table-for-decode");
3952 staticpro (&Qcharacter_unification_table_for_decode
);
3954 Qcharacter_unification_table_for_encode
3955 = intern ("character-unification-table-for-encode");
3956 staticpro (&Qcharacter_unification_table_for_encode
);
3958 Qemacs_mule
= intern ("emacs-mule");
3959 staticpro (&Qemacs_mule
);
3961 defsubr (&Scoding_system_spec
);
3962 defsubr (&Scoding_system_p
);
3963 defsubr (&Sread_coding_system
);
3964 defsubr (&Sread_non_nil_coding_system
);
3965 defsubr (&Scheck_coding_system
);
3966 defsubr (&Sdetect_coding_region
);
3967 defsubr (&Sdecode_coding_region
);
3968 defsubr (&Sencode_coding_region
);
3969 defsubr (&Sdecode_coding_string
);
3970 defsubr (&Sencode_coding_string
);
3971 defsubr (&Sdecode_sjis_char
);
3972 defsubr (&Sencode_sjis_char
);
3973 defsubr (&Sdecode_big5_char
);
3974 defsubr (&Sencode_big5_char
);
3975 defsubr (&Sset_terminal_coding_system_internal
);
3976 defsubr (&Sset_safe_terminal_coding_system_internal
);
3977 defsubr (&Sterminal_coding_system
);
3978 defsubr (&Sset_keyboard_coding_system_internal
);
3979 defsubr (&Skeyboard_coding_system
);
3980 defsubr (&Sfind_operation_coding_system
);
3982 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list
,
3983 "List of coding-categories (symbols) ordered by priority.");
3987 Vcoding_category_list
= Qnil
;
3988 for (i
= CODING_CATEGORY_IDX_MAX
- 1; i
>= 0; i
--)
3989 Vcoding_category_list
3990 = Fcons (coding_category_table
[i
], Vcoding_category_list
);
3993 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read
,
3994 "Specify the coding system for read operations.\n\
3995 It is useful to bind this variable with `let', but do not set it globally.\n\
3996 If the value is a coding system, it is used for decoding on read operation.\n\
3997 If not, an appropriate element is used from one of the coding system alists:\n\
3998 There are three such tables, `file-coding-system-alist',\n\
3999 `process-coding-system-alist', and `network-coding-system-alist'.");
4000 Vcoding_system_for_read
= Qnil
;
4002 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write
,
4003 "Specify the coding system for write operations.\n\
4004 It is useful to bind this variable with `let', but do not set it globally.\n\
4005 If the value is a coding system, it is used for encoding on write operation.\n\
4006 If not, an appropriate element is used from one of the coding system alists:\n\
4007 There are three such tables, `file-coding-system-alist',\n\
4008 `process-coding-system-alist', and `network-coding-system-alist'.");
4009 Vcoding_system_for_write
= Qnil
;
4011 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used
,
4012 "Coding system used in the latest file or process I/O.");
4013 Vlast_coding_system_used
= Qnil
;
4015 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion
,
4016 "*Non-nil inhibit code conversion of end-of-line format in any cases.");
4017 inhibit_eol_conversion
= 0;
4019 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist
,
4020 "Alist to decide a coding system to use for a file I/O operation.\n\
4021 The format is ((PATTERN . VAL) ...),\n\
4022 where PATTERN is a regular expression matching a file name,\n\
4023 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4024 If VAL is a coding system, it is used for both decoding and encoding\n\
4025 the file contents.\n\
4026 If VAL is a cons of coding systems, the car part is used for decoding,\n\
4027 and the cdr part is used for encoding.\n\
4028 If VAL is a function symbol, the function must return a coding system\n\
4029 or a cons of coding systems which are used as above.\n\
4031 See also the function `find-operation-coding-system'.");
4032 Vfile_coding_system_alist
= Qnil
;
4034 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist
,
4035 "Alist to decide a coding system to use for a process I/O operation.\n\
4036 The format is ((PATTERN . VAL) ...),\n\
4037 where PATTERN is a regular expression matching a program name,\n\
4038 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4039 If VAL is a coding system, it is used for both decoding what received\n\
4040 from the program and encoding what sent to the program.\n\
4041 If VAL is a cons of coding systems, the car part is used for decoding,\n\
4042 and the cdr part is used for encoding.\n\
4043 If VAL is a function symbol, the function must return a coding system\n\
4044 or a cons of coding systems which are used as above.\n\
4046 See also the function `find-operation-coding-system'.");
4047 Vprocess_coding_system_alist
= Qnil
;
4049 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist
,
4050 "Alist to decide a coding system to use for a network I/O operation.\n\
4051 The format is ((PATTERN . VAL) ...),\n\
4052 where PATTERN is a regular expression matching a network service name\n\
4053 or is a port number to connect to,\n\
4054 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4055 If VAL is a coding system, it is used for both decoding what received\n\
4056 from the network stream and encoding what sent to the network stream.\n\
4057 If VAL is a cons of coding systems, the car part is used for decoding,\n\
4058 and the cdr part is used for encoding.\n\
4059 If VAL is a function symbol, the function must return a coding system\n\
4060 or a cons of coding systems which are used as above.\n\
4062 See also the function `find-operation-coding-system'.");
4063 Vnetwork_coding_system_alist
= Qnil
;
4065 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix
,
4066 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
4067 eol_mnemonic_unix
= ':';
4069 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos
,
4070 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
4071 eol_mnemonic_dos
= '\\';
4073 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac
,
4074 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
4075 eol_mnemonic_mac
= '/';
4077 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided
,
4078 "Mnemonic character indicating end-of-line format is not yet decided.");
4079 eol_mnemonic_undecided
= ':';
4081 DEFVAR_LISP ("enable-character-unification", &Venable_character_unification
,
4082 "Non-nil means ISO 2022 encoder/decoder do character unification.");
4083 Venable_character_unification
= Qt
;
4085 DEFVAR_LISP ("standard-character-unification-table-for-decode",
4086 &Vstandard_character_unification_table_for_decode
,
4087 "Table for unifying characters when reading.");
4088 Vstandard_character_unification_table_for_decode
= Qnil
;
4090 DEFVAR_LISP ("standard-character-unification-table-for-encode",
4091 &Vstandard_character_unification_table_for_encode
,
4092 "Table for unifying characters when writing.");
4093 Vstandard_character_unification_table_for_encode
= Qnil
;
4095 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist
,
4096 "Alist of charsets vs revision numbers.\n\
4097 While encoding, if a charset (car part of an element) is found,\n\
4098 designate it with the escape sequence identifing revision (cdr part of the element).");
4099 Vcharset_revision_alist
= Qnil
;
4101 DEFVAR_LISP ("default-process-coding-system",
4102 &Vdefault_process_coding_system
,
4103 "Cons of coding systems used for process I/O by default.\n\
4104 The car part is used for decoding a process output,\n\
4105 the cdr part is used for encoding a text to be sent to a process.");
4106 Vdefault_process_coding_system
= Qnil
;
4108 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table
,
4109 "Table of extra Latin codes in the range 128..159 (inclusive).\n\
4110 This is a vector of length 256.\n\
4111 If Nth element is non-nil, the existence of code N in a file\n\
4112 \(or output of subprocess) doesn't prevent it to be detected as\n\
4113 a coding system of ISO 2022 variant which has a flag\n\
4114 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
4115 or reading output of a subprocess.\n\
4116 Only 128th through 159th elements has a meaning.");
4117 Vlatin_extra_code_table
= Fmake_vector (make_number (256), Qnil
);